diff --git a/.devcontainer/devcontainer.json b/.devcontainer/devcontainer.json
new file mode 100644
index 0000000..f0fc857
--- /dev/null
+++ b/.devcontainer/devcontainer.json
@@ -0,0 +1,22 @@
+{
+    "name": "GIS ETL Development",
+    "image": "mcr.microsoft.com/devcontainers/python:3.11-bullseye",
+    "features": {
+        "ghcr.io/devcontainers/features/git:1": {},
+        "ghcr.io/devcontainers/features/github-cli:1": {}
+    },
+    "customizations": {
+        "vscode": {
+            "extensions": [
+                "ms-python.python",
+                "ms-python.flake8",
+                "ms-python.black-formatter",
+                "redhat.vscode-yaml"
+            ]
+        }
+    },
+    "postCreateCommand": "pip install -r requirements.txt",
+    "remoteUser": "vscode",
+    "workspaceMount": "source=${localWorkspaceFolder},target=/workspace,type=bind",
+    "workspaceFolder": "/workspace"
+}
\ No newline at end of file
diff --git a/Dockerfile b/Dockerfile
new file mode 100644
index 0000000..fbcb797
--- /dev/null
+++ b/Dockerfile
@@ -0,0 +1,24 @@
+FROM python:3.11-slim
+
+WORKDIR /app
+
+RUN apt-get update && apt-get install -y \
+    curl \
+    wget \
+    gdal-bin \
+    libgdal-dev \
+    python3-gdal \
+    && rm -rf /var/lib/apt/lists/*
+
+COPY requirements.txt .
+RUN pip install --no-cache-dir -r requirements.txt
+
+COPY src/ ./src/
+COPY scripts/ ./scripts/
+COPY config/ ./config/
+
+RUN chmod +x scripts/*.sh
+
+ENV PYTHONPATH=/app/src
+
+CMD ["python", "src/main.py"]
\ No newline at end of file
diff --git a/README.md b/README.md
index 6471db3..62087c9 100644
--- a/README.md
+++ b/README.md
@@ -1,3 +1,22 @@
 # KNEL-SwEng-Platform-ETL-GIS-Inbox
 
-Code to ingest raw GIS data
\ No newline at end of file
+This repository contains:
+
+- devcontainer.json
+- Dockerfile
+- Bash/Python scripts
+
+These are meant to run in a Docker container, orchestrated by Jenkins, Apache Airflow, or similar tooling, and download data from NASA/USGS for use in planning coverage for wireless internet service providers.
+
+Specifically, it needs to be given a geographic boundary in some format (KMZ, GeoJSON, etc.) and will pull the corresponding SRTM tiles from NASA.
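+
+For example, a typical run might look like the following (a sketch; `boundary.kmz` is a placeholder for whatever boundary file you have):
+
+```bash
+# Run the ETL directly on the host (requires the Python dependencies from requirements.txt)
+./scripts/run_etl.sh -b boundary.kmz -o ./data/output -v
+
+# Or build the image and run the same ETL inside Docker
+# (the bind mount needs an absolute host path, hence "$(pwd)")
+./scripts/docker_run.sh -b "$(pwd)/boundary.kmz"
+```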
\ No newline at end of file
diff --git a/config/logging.conf b/config/logging.conf
new file mode 100644
index 0000000..0f6199f
--- /dev/null
+++ b/config/logging.conf
@@ -0,0 +1,28 @@
+[loggers]
+keys=root
+
+[handlers]
+keys=consoleHandler,fileHandler
+
+[formatters]
+keys=simpleFormatter
+
+[logger_root]
+level=INFO
+handlers=consoleHandler,fileHandler
+
+[handler_consoleHandler]
+class=StreamHandler
+level=INFO
+formatter=simpleFormatter
+args=(sys.stdout,)
+
+[handler_fileHandler]
+class=FileHandler
+level=DEBUG
+formatter=simpleFormatter
+args=('etl.log',)
+
+[formatter_simpleFormatter]
+format=%(asctime)s - %(name)s - %(levelname)s - %(message)s
+datefmt=%Y-%m-%d %H:%M:%S
\ No newline at end of file
diff --git a/requirements.txt b/requirements.txt
new file mode 100644
index 0000000..ffde716
--- /dev/null
+++ b/requirements.txt
@@ -0,0 +1,10 @@
+requests>=2.31.0
+geopandas>=0.14.0
+shapely>=2.0.0
+fiona>=1.9.0
+rasterio>=1.3.0
+pyproj>=3.6.0
+numpy>=1.24.0
+pandas>=2.0.0
+GDAL>=3.6.0
+click>=8.1.0
\ No newline at end of file
diff --git a/scripts/docker_run.sh b/scripts/docker_run.sh
new file mode 100755
index 0000000..0941a79
--- /dev/null
+++ b/scripts/docker_run.sh
@@ -0,0 +1,60 @@
+#!/bin/bash
+
+set -e
+
+IMAGE_NAME="gis-etl-inbox"
+TAG="latest"
+BOUNDARY_FILE=""
+OUTPUT_DIR="$(pwd)/data/output"
+DATA_DIR="$(pwd)/data"
+
+usage() {
+    echo "Usage: $0 -b BOUNDARY_FILE [-o OUTPUT_DIR] [-d DATA_DIR] [-t TAG]"
+    echo "  -b BOUNDARY_FILE  Path to boundary file (KMZ/GeoJSON)"
+    echo "  -o OUTPUT_DIR     Host output directory (default: ./data/output)"
+    echo "  -d DATA_DIR       Host data directory (default: ./data)"
+    echo "  -t TAG            Docker image tag (default: latest)"
+    exit 1
+}
+
+while getopts "b:o:d:t:h" opt; do
+    case $opt in
+        b) BOUNDARY_FILE="$OPTARG" ;;
+        o) OUTPUT_DIR="$OPTARG" ;;
+        d) DATA_DIR="$OPTARG" ;;
+        t) TAG="$OPTARG" ;;
+        h) usage ;;
+        \?) echo "Invalid option -$OPTARG" >&2; usage ;;
+    esac
+done
+
+if [ -z "$BOUNDARY_FILE" ]; then
+    echo "Error: Boundary file is required"
+    usage
+fi
+
+if [ ! -f "$BOUNDARY_FILE" ]; then
+    echo "Error: Boundary file '$BOUNDARY_FILE' not found"
+    exit 1
+fi
+
+mkdir -p "$OUTPUT_DIR"
+mkdir -p "$DATA_DIR/temp"
+
+BOUNDARY_FILENAME=$(basename "$BOUNDARY_FILE")
+
+echo "Building Docker image..."
+docker build -t "${IMAGE_NAME}:${TAG}" .
+
+echo "Running ETL in Docker container..."
+docker run --rm \
+    -v "$BOUNDARY_FILE:/app/data/input/$BOUNDARY_FILENAME:ro" \
+    -v "$OUTPUT_DIR:/app/data/output" \
+    -v "$DATA_DIR/temp:/app/data/temp" \
+    "${IMAGE_NAME}:${TAG}" \
+    python src/main.py \
+    --boundary-file "/app/data/input/$BOUNDARY_FILENAME" \
+    --output-dir "/app/data/output" \
+    --temp-dir "/app/data/temp"
+
+echo "Docker ETL process completed successfully"
\ No newline at end of file
diff --git a/scripts/run_etl.sh b/scripts/run_etl.sh
new file mode 100755
index 0000000..143b34f
--- /dev/null
+++ b/scripts/run_etl.sh
@@ -0,0 +1,58 @@
+#!/bin/bash
+
+set -e
+
+BOUNDARY_FILE=""
+OUTPUT_DIR="./data/output"
+TEMP_DIR="./data/temp"
+VERBOSE=false
+
+usage() {
+    echo "Usage: $0 -b BOUNDARY_FILE [-o OUTPUT_DIR] [-t TEMP_DIR] [-v]"
+    echo "  -b BOUNDARY_FILE  Path to boundary file (KMZ/GeoJSON)"
+    echo "  -o OUTPUT_DIR     Output directory (default: ./data/output)"
+    echo "  -t TEMP_DIR       Temporary directory (default: ./data/temp)"
+    echo "  -v                Verbose output"
+    exit 1
+}
+
+while getopts "b:o:t:vh" opt; do
+    case $opt in
+        b) BOUNDARY_FILE="$OPTARG" ;;
+        o) OUTPUT_DIR="$OPTARG" ;;
+        t) TEMP_DIR="$OPTARG" ;;
+        v) VERBOSE=true ;;
+        h) usage ;;
+        \?) echo "Invalid option -$OPTARG" >&2; usage ;;
echo "Invalid option -$OPTARG" >&2; usage ;; + esac +done + +if [ -z "$BOUNDARY_FILE" ]; then + echo "Error: Boundary file is required" + usage +fi + +if [ ! -f "$BOUNDARY_FILE" ]; then + echo "Error: Boundary file '$BOUNDARY_FILE' not found" + exit 1 +fi + +echo "Starting SRTM ETL process..." +echo "Boundary file: $BOUNDARY_FILE" +echo "Output directory: $OUTPUT_DIR" +echo "Temp directory: $TEMP_DIR" + +if [ "$VERBOSE" = true ]; then + LOG_LEVEL="DEBUG" +else + LOG_LEVEL="INFO" +fi + +export PYTHONPATH="${PYTHONPATH}:$(pwd)/src" + +python3 src/main.py \ + --boundary-file "$BOUNDARY_FILE" \ + --output-dir "$OUTPUT_DIR" \ + --temp-dir "$TEMP_DIR" + +echo "ETL process completed successfully" \ No newline at end of file diff --git a/src/geo_processor.py b/src/geo_processor.py new file mode 100644 index 0000000..d58696b --- /dev/null +++ b/src/geo_processor.py @@ -0,0 +1,70 @@ +import geopandas as gpd +import zipfile +import tempfile +from pathlib import Path +import logging + +logger = logging.getLogger(__name__) + +class GeoProcessor: + def __init__(self, boundary_file): + self.boundary_file = Path(boundary_file) + self.gdf = None + self._load_boundary() + + def _load_boundary(self): + """Load boundary file (KMZ/GeoJSON/Shapefile).""" + try: + if self.boundary_file.suffix.lower() == '.kmz': + self._load_kmz() + elif self.boundary_file.suffix.lower() == '.geojson': + self.gdf = gpd.read_file(self.boundary_file) + elif self.boundary_file.suffix.lower() in ['.shp', '.zip']: + self.gdf = gpd.read_file(self.boundary_file) + else: + raise ValueError(f"Unsupported file format: {self.boundary_file.suffix}") + + # Ensure CRS is WGS84 + if self.gdf.crs != 'EPSG:4326': + self.gdf = self.gdf.to_crs('EPSG:4326') + + logger.info(f"Loaded {len(self.gdf)} features from {self.boundary_file}") + + except Exception as e: + logger.error(f"Failed to load boundary file: {e}") + raise + + def _load_kmz(self): + """Load KMZ file by extracting and reading KML.""" + with tempfile.TemporaryDirectory() as temp_dir: + with zipfile.ZipFile(self.boundary_file, 'r') as zip_ref: + zip_ref.extractall(temp_dir) + + # Find KML file + kml_files = list(Path(temp_dir).glob('*.kml')) + if not kml_files: + raise ValueError("No KML file found in KMZ archive") + + # Load KML + gpd.io.file.fiona.drvsupport.supported_drivers['KML'] = 'rw' + self.gdf = gpd.read_file(kml_files[0], driver='KML') + + def get_bounds(self): + """Get bounding box of all features.""" + if self.gdf is None: + raise ValueError("No boundary data loaded") + + bounds = self.gdf.total_bounds + return { + 'min_lon': bounds[0], + 'min_lat': bounds[1], + 'max_lon': bounds[2], + 'max_lat': bounds[3] + } + + def get_geometry(self): + """Get the combined geometry of all features.""" + if self.gdf is None: + raise ValueError("No boundary data loaded") + + return self.gdf.geometry.unary_union \ No newline at end of file diff --git a/src/main.py b/src/main.py new file mode 100644 index 0000000..c2f7ed2 --- /dev/null +++ b/src/main.py @@ -0,0 +1,40 @@ +#!/usr/bin/env python3 + +import click +import logging +from pathlib import Path +from geo_processor import GeoProcessor +from srtm_downloader import SRTMDownloader + +logging.basicConfig(level=logging.INFO, format='%(asctime)s - %(levelname)s - %(message)s') +logger = logging.getLogger(__name__) + +@click.command() +@click.option('--boundary-file', required=True, help='Path to boundary file (KMZ/GeoJSON)') +@click.option('--output-dir', default='./data/output', help='Output directory for SRTM files') 
+@click.option('--temp-dir', default='./data/temp', help='Temporary directory for processing')
+def main(boundary_file, output_dir, temp_dir):
+    """Download SRTM data for given geographic boundary."""
+    try:
+        logger.info(f"Processing boundary file: {boundary_file}")
+
+        # Create directories
+        Path(output_dir).mkdir(parents=True, exist_ok=True)
+        Path(temp_dir).mkdir(parents=True, exist_ok=True)
+
+        # Process geographic boundary
+        geo_processor = GeoProcessor(boundary_file)
+        bounds = geo_processor.get_bounds()
+
+        # Download SRTM data
+        downloader = SRTMDownloader(output_dir, temp_dir)
+        downloader.download_for_bounds(bounds)
+
+        logger.info("Processing completed successfully")
+
+    except Exception as e:
+        logger.error(f"Error during processing: {e}")
+        raise
+
+if __name__ == '__main__':
+    main()
\ No newline at end of file
diff --git a/src/srtm_downloader.py b/src/srtm_downloader.py
new file mode 100644
index 0000000..ff0f1b3
--- /dev/null
+++ b/src/srtm_downloader.py
@@ -0,0 +1,98 @@
+import requests
+import math
+import logging
+from pathlib import Path
+from urllib.parse import urljoin
+import time
+
+logger = logging.getLogger(__name__)
+
+class SRTMDownloader:
+    def __init__(self, output_dir, temp_dir=None):
+        self.output_dir = Path(output_dir)
+        self.temp_dir = Path(temp_dir) if temp_dir else self.output_dir / 'temp'
+        self.base_url = "https://cloud.sdstate.edu/index.php/s/UjQFkr4y8EGB3JH/download"
+
+        # Create directories
+        self.output_dir.mkdir(parents=True, exist_ok=True)
+        self.temp_dir.mkdir(parents=True, exist_ok=True)
+
+    def _get_srtm_tiles(self, bounds):
+        """Calculate which SRTM tiles are needed for given bounds."""
+        min_lon, min_lat = bounds['min_lon'], bounds['min_lat']
+        max_lon, max_lat = bounds['max_lon'], bounds['max_lat']
+
+        # SRTM tiles are 1 degree x 1 degree
+        tiles = []
+
+        for lon in range(math.floor(min_lon), math.ceil(max_lon)):
+            for lat in range(math.floor(min_lat), math.ceil(max_lat)):
+                # SRTM naming convention: N/S followed by 2-digit lat, E/W followed by 3-digit lon
+                lat_str = f"{'N' if lat >= 0 else 'S'}{abs(lat):02d}"
+                lon_str = f"{'E' if lon >= 0 else 'W'}{abs(lon):03d}"
+                tile_name = f"{lat_str}{lon_str}"
+                tiles.append(tile_name)
+
+        return tiles
+
+    def _download_tile(self, tile_name):
+        """Download a single SRTM tile."""
+        filename = f"{tile_name}.SRTMGL1.hgt.zip"
+        output_path = self.output_dir / filename
+
+        if output_path.exists():
+            logger.info(f"Tile {tile_name} already exists, skipping")
+            return True
+
+        # Try multiple SRTM data sources (note: the USGS LP DAAC URL typically requires NASA Earthdata Login)
+        urls = [
+            f"https://cloud.sdstate.edu/index.php/s/UjQFkr4y8EGB3JH/download?path=%2F&files={filename}",
+            f"https://e4ftl01.cr.usgs.gov/MEASURES/SRTMGL1.003/2000.02.11/{filename}"
+        ]
+
+        for url in urls:
+            try:
+                logger.info(f"Downloading {tile_name} from {url}")
+
+                response = requests.get(url, timeout=300, stream=True)
+                response.raise_for_status()
+
+                with open(output_path, 'wb') as f:
+                    for chunk in response.iter_content(chunk_size=8192):
+                        f.write(chunk)
+
+                logger.info(f"Successfully downloaded {tile_name}")
+                return True
+
+            except requests.RequestException as e:
+                logger.warning(f"Failed to download {tile_name} from {url}: {e}")
+                continue
+
+        logger.error(f"Failed to download tile {tile_name} from all sources")
+        return False
+
+    def download_for_bounds(self, bounds):
+        """Download all SRTM tiles needed for given bounds."""
+        tiles = self._get_srtm_tiles(bounds)
+
+        logger.info(f"Need to download {len(tiles)} SRTM tiles")
+        logger.info(f"Tiles: {tiles}")
+
+        successful = 0
+        failed = 0
+
+        for tile in tiles:
+            if self._download_tile(tile):
+                successful += 1
+            else:
+                failed += 1
+
+            # Be nice to the server
+            time.sleep(1)
+
+        logger.info(f"Download complete: {successful} successful, {failed} failed")
+
+        if failed > 0:
+            raise RuntimeError(f"Failed to download {failed} tiles")
+
+        return successful
\ No newline at end of file