Compare commits
1 Commits
v20250721-
...
main
Author | SHA1 | Date | |
---|---|---|---|
2192c2ae9c |
22
.devcontainer/devcontainer.json
Normal file
22
.devcontainer/devcontainer.json
Normal file
@@ -0,0 +1,22 @@
|
||||
{
|
||||
"name": "GIS ETL Development",
|
||||
"image": "mcr.microsoft.com/devcontainers/python:3.11-bullseye",
|
||||
"features": {
|
||||
"ghcr.io/devcontainers/features/git:1": {},
|
||||
"ghcr.io/devcontainers/features/github-cli:1": {}
|
||||
},
|
||||
"customizations": {
|
||||
"vscode": {
|
||||
"extensions": [
|
||||
"ms-python.python",
|
||||
"ms-python.flake8",
|
||||
"ms-python.black-formatter",
|
||||
"redhat.vscode-yaml"
|
||||
]
|
||||
}
|
||||
},
|
||||
"postCreateCommand": "pip install -r requirements.txt",
|
||||
"remoteUser": "vscode",
|
||||
"workspaceMount": "source=${localWorkspaceFolder},target=/workspace,type=bind",
|
||||
"workspaceFolder": "/workspace"
|
||||
}
|
24
Dockerfile
Normal file
24
Dockerfile
Normal file
@@ -0,0 +1,24 @@
|
||||
FROM python:3.11-slim
|
||||
|
||||
WORKDIR /app
|
||||
|
||||
RUN apt-get update && apt-get install -y \
|
||||
curl \
|
||||
wget \
|
||||
gdal-bin \
|
||||
libgdal-dev \
|
||||
python3-gdal \
|
||||
&& rm -rf /var/lib/apt/lists/*
|
||||
|
||||
COPY requirements.txt .
|
||||
RUN pip install --no-cache-dir -r requirements.txt
|
||||
|
||||
COPY src/ ./src/
|
||||
COPY scripts/ ./scripts/
|
||||
COPY config/ ./config/
|
||||
|
||||
RUN chmod +x scripts/*.sh
|
||||
|
||||
ENV PYTHONPATH=/app/src
|
||||
|
||||
CMD ["python", "src/main.py"]
|
10
README.md
10
README.md
@@ -1,3 +1,11 @@
|
||||
# KNEL-SwEng-Platform-ETL-GIS-Inbox
|
||||
|
||||
Code to ingest raw GIS data
|
||||
The purpose of this repository is to contain:
|
||||
|
||||
devcontainer.json
|
||||
Dockerfile
|
||||
Bash/python scripts
|
||||
|
||||
meant to run in a Docker container, orchestrated by Jenkins, Apache Airflow, or similar tooling, which will download data from NASA/USGS for use in planning coverage for wireless internet service providers.
|
||||
|
||||
Specifically, it will need to be given a geographic boundary in some format (KMZ/GeoJSON, etc.) and pull SRTM files from NASA.
|
28
config/logging.conf
Normal file
28
config/logging.conf
Normal file
@@ -0,0 +1,28 @@
|
||||
[loggers]
|
||||
keys=root
|
||||
|
||||
[handlers]
|
||||
keys=consoleHandler,fileHandler
|
||||
|
||||
[formatters]
|
||||
keys=simpleFormatter
|
||||
|
||||
[logger_root]
|
||||
level=INFO
|
||||
handlers=consoleHandler,fileHandler
|
||||
|
||||
[handler_consoleHandler]
|
||||
class=StreamHandler
|
||||
level=INFO
|
||||
formatter=simpleFormatter
|
||||
args=(sys.stdout,)
|
||||
|
||||
[handler_fileHandler]
|
||||
class=FileHandler
|
||||
level=DEBUG
|
||||
formatter=simpleFormatter
|
||||
args=('etl.log',)
|
||||
|
||||
[formatter_simpleFormatter]
|
||||
format=%(asctime)s - %(name)s - %(levelname)s - %(message)s
|
||||
datefmt=%Y-%m-%d %H:%M:%S
|
10
requirements.txt
Normal file
10
requirements.txt
Normal file
@@ -0,0 +1,10 @@
|
||||
requests>=2.31.0
|
||||
geopandas>=0.14.0
|
||||
shapely>=2.0.0
|
||||
fiona>=1.9.0
|
||||
rasterio>=1.3.0
|
||||
pyproj>=3.6.0
|
||||
numpy>=1.24.0
|
||||
pandas>=2.0.0
|
||||
GDAL>=3.6.0
|
||||
click>=8.1.0
|
60
scripts/docker_run.sh
Executable file
60
scripts/docker_run.sh
Executable file
@@ -0,0 +1,60 @@
|
||||
#!/bin/bash
# Build the gis-etl-inbox image and run the SRTM ETL inside a container,
# bind-mounting the boundary file (read-only) plus the output/temp directories.

set -e

IMAGE_NAME="gis-etl-inbox"
TAG="latest"
BOUNDARY_FILE=""
OUTPUT_DIR="$(pwd)/data/output"
DATA_DIR="$(pwd)/data"

usage() {
    echo "Usage: $0 -b BOUNDARY_FILE [-o OUTPUT_DIR] [-d DATA_DIR] [-t TAG]"
    echo "  -b BOUNDARY_FILE  Path to boundary file (KMZ/GeoJSON)"
    echo "  -o OUTPUT_DIR     Host output directory (default: ./data/output)"
    echo "  -d DATA_DIR       Host data directory (default: ./data)"
    echo "  -t TAG            Docker image tag (default: latest)"
    exit 1
}

while getopts "b:o:d:t:h" opt; do
    case $opt in
        b) BOUNDARY_FILE="$OPTARG" ;;
        o) OUTPUT_DIR="$OPTARG" ;;
        d) DATA_DIR="$OPTARG" ;;
        t) TAG="$OPTARG" ;;
        h) usage ;;
        \?) echo "Invalid option -$OPTARG" >&2; usage ;;
    esac
done

if [ -z "$BOUNDARY_FILE" ]; then
    echo "Error: Boundary file is required"
    usage
fi

if [ ! -f "$BOUNDARY_FILE" ]; then
    echo "Error: Boundary file '$BOUNDARY_FILE' not found"
    exit 1
fi

# Create the temp subdirectory too: it is bind-mounted below, and letting
# Docker create a missing mount source leaves it owned by root on the host.
mkdir -p "$OUTPUT_DIR"
mkdir -p "$DATA_DIR/temp"

# Docker bind mounts require absolute source paths; resolve relative inputs.
BOUNDARY_FILE="$(cd "$(dirname "$BOUNDARY_FILE")" && pwd)/$(basename "$BOUNDARY_FILE")"
OUTPUT_DIR="$(cd "$OUTPUT_DIR" && pwd)"
DATA_DIR="$(cd "$DATA_DIR" && pwd)"

BOUNDARY_FILENAME=$(basename "$BOUNDARY_FILE")

echo "Building Docker image..."
docker build -t "${IMAGE_NAME}:${TAG}" .

echo "Running ETL in Docker container..."
docker run --rm \
    -v "$BOUNDARY_FILE:/app/data/input/$BOUNDARY_FILENAME:ro" \
    -v "$OUTPUT_DIR:/app/data/output" \
    -v "$DATA_DIR/temp:/app/data/temp" \
    "${IMAGE_NAME}:${TAG}" \
    python src/main.py \
    --boundary-file "/app/data/input/$BOUNDARY_FILENAME" \
    --output-dir "/app/data/output" \
    --temp-dir "/app/data/temp"

echo "Docker ETL process completed successfully"
|
58
scripts/run_etl.sh
Executable file
58
scripts/run_etl.sh
Executable file
@@ -0,0 +1,58 @@
|
||||
#!/bin/bash
# Run the SRTM ETL directly on the host (see docker_run.sh for the container path).

set -e

BOUNDARY_FILE=""
OUTPUT_DIR="./data/output"
TEMP_DIR="./data/temp"
VERBOSE=false

usage() {
    echo "Usage: $0 -b BOUNDARY_FILE [-o OUTPUT_DIR] [-t TEMP_DIR] [-v]"
    echo "  -b BOUNDARY_FILE  Path to boundary file (KMZ/GeoJSON)"
    echo "  -o OUTPUT_DIR     Output directory (default: ./data/output)"
    echo "  -t TEMP_DIR       Temporary directory (default: ./data/temp)"
    echo "  -v                Verbose output"
    exit 1
}

while getopts "b:o:t:vh" opt; do
    case $opt in
        b) BOUNDARY_FILE="$OPTARG" ;;
        o) OUTPUT_DIR="$OPTARG" ;;
        t) TEMP_DIR="$OPTARG" ;;
        v) VERBOSE=true ;;
        h) usage ;;
        \?) echo "Invalid option -$OPTARG" >&2; usage ;;
    esac
done

if [ -z "$BOUNDARY_FILE" ]; then
    echo "Error: Boundary file is required"
    usage
fi

if [ ! -f "$BOUNDARY_FILE" ]; then
    echo "Error: Boundary file '$BOUNDARY_FILE' not found"
    exit 1
fi

# Make sure the working directories exist before the ETL writes into them.
mkdir -p "$OUTPUT_DIR" "$TEMP_DIR"

echo "Starting SRTM ETL process..."
echo "Boundary file: $BOUNDARY_FILE"
echo "Output directory: $OUTPUT_DIR"
echo "Temp directory: $TEMP_DIR"

if [ "$VERBOSE" = true ]; then
    LOG_LEVEL="DEBUG"
else
    LOG_LEVEL="INFO"
fi
# Export so the Python process can see it; previously LOG_LEVEL was computed
# but never used. TODO(review): have src/main.py read LOG_LEVEL from the env.
export LOG_LEVEL

export PYTHONPATH="${PYTHONPATH}:$(pwd)/src"

python3 src/main.py \
    --boundary-file "$BOUNDARY_FILE" \
    --output-dir "$OUTPUT_DIR" \
    --temp-dir "$TEMP_DIR"

echo "ETL process completed successfully"
|
70
src/geo_processor.py
Normal file
70
src/geo_processor.py
Normal file
@@ -0,0 +1,70 @@
|
||||
import geopandas as gpd
|
||||
import zipfile
|
||||
import tempfile
|
||||
from pathlib import Path
|
||||
import logging
|
||||
|
||||
logger = logging.getLogger(__name__)
|
||||
|
||||
class GeoProcessor:
    """Loads a vector boundary file and exposes its extent/geometry in WGS84.

    Supported inputs: KMZ (zipped KML), GeoJSON (.geojson/.json), and
    Shapefile (.shp, or zipped as .zip).
    """

    def __init__(self, boundary_file):
        """
        Args:
            boundary_file: Path to the boundary file to load.

        Raises:
            ValueError: If the format is unsupported or a KMZ holds no KML.
        """
        self.boundary_file = Path(boundary_file)
        # GeoDataFrame of the loaded features, normalized to EPSG:4326.
        self.gdf = None
        self._load_boundary()

    def _load_boundary(self):
        """Load boundary file (KMZ/GeoJSON/Shapefile) and normalize CRS to WGS84."""
        suffix = self.boundary_file.suffix.lower()
        try:
            if suffix == '.kmz':
                self._load_kmz()
            elif suffix in ('.geojson', '.json', '.shp', '.zip'):
                self.gdf = gpd.read_file(self.boundary_file)
            else:
                raise ValueError(f"Unsupported file format: {self.boundary_file.suffix}")

            # Ensure CRS is WGS84 (SRTM tiles are indexed by WGS84 lat/lon).
            if self.gdf.crs is None:
                # KML/KMZ is WGS84 by spec; for other formats this is an
                # assumption — undeclared CRSes are treated as already-WGS84.
                logger.warning("Boundary file has no CRS; assuming EPSG:4326")
                self.gdf = self.gdf.set_crs('EPSG:4326')
            elif self.gdf.crs != 'EPSG:4326':
                self.gdf = self.gdf.to_crs('EPSG:4326')

            logger.info(f"Loaded {len(self.gdf)} features from {self.boundary_file}")

        except Exception as e:
            logger.error(f"Failed to load boundary file: {e}")
            raise

    def _load_kmz(self):
        """Load KMZ file by extracting and reading KML."""
        with tempfile.TemporaryDirectory() as temp_dir:
            with zipfile.ZipFile(self.boundary_file, 'r') as zip_ref:
                zip_ref.extractall(temp_dir)

            # Search recursively: some KMZ archives nest the KML in a subfolder.
            kml_files = sorted(Path(temp_dir).rglob('*.kml'))
            if not kml_files:
                raise ValueError("No KML file found in KMZ archive")

            # Enable fiona's KML driver (off by default). geopandas >= 1.0
            # defaults to the pyogrio engine, where this attribute path no
            # longer exists but KML is readable without registration.
            try:
                gpd.io.file.fiona.drvsupport.supported_drivers['KML'] = 'rw'
            except AttributeError:
                pass
            self.gdf = gpd.read_file(kml_files[0], driver='KML')

    def get_bounds(self):
        """Get bounding box of all features.

        Returns:
            Dict with 'min_lon', 'min_lat', 'max_lon', 'max_lat' (WGS84 degrees).
        """
        if self.gdf is None:
            raise ValueError("No boundary data loaded")

        min_lon, min_lat, max_lon, max_lat = self.gdf.total_bounds
        return {
            'min_lon': min_lon,
            'min_lat': min_lat,
            'max_lon': max_lon,
            'max_lat': max_lat
        }

    def get_geometry(self):
        """Get the combined geometry (union) of all features."""
        if self.gdf is None:
            raise ValueError("No boundary data loaded")

        geoms = self.gdf.geometry
        # union_all() supersedes the deprecated unary_union (geopandas >= 1.0);
        # fall back for the geopandas 0.14 floor pinned in requirements.txt.
        return geoms.union_all() if hasattr(geoms, 'union_all') else geoms.unary_union
|
40
src/main.py
Normal file
40
src/main.py
Normal file
@@ -0,0 +1,40 @@
|
||||
#!/usr/bin/env python3
|
||||
|
||||
import click
|
||||
import logging
|
||||
from pathlib import Path
|
||||
from geo_processor import GeoProcessor
|
||||
from srtm_downloader import SRTMDownloader
|
||||
|
||||
logging.basicConfig(level=logging.INFO, format='%(asctime)s - %(levelname)s - %(message)s')
|
||||
logger = logging.getLogger(__name__)
|
||||
|
||||
@click.command()
@click.option('--boundary-file', required=True, help='Path to boundary file (KMZ/GeoJSON)')
@click.option('--output-dir', default='./data/output', help='Output directory for SRTM files')
@click.option('--temp-dir', default='./data/temp', help='Temporary directory for processing')
def main(boundary_file, output_dir, temp_dir):
    """Download SRTM data for given geographic boundary."""
    # NOTE(review): logging is configured via basicConfig at module import;
    # config/logging.conf exists in the repo but is not loaded here — confirm
    # which one is intended to be authoritative.
    try:
        logger.info(f"Processing boundary file: {boundary_file}")

        # Make sure both working directories exist before any stage runs.
        for directory in (output_dir, temp_dir):
            Path(directory).mkdir(parents=True, exist_ok=True)

        # Resolve the boundary to a WGS84 bounding box, then fetch every
        # SRTM tile that box touches.
        bounds = GeoProcessor(boundary_file).get_bounds()
        SRTMDownloader(output_dir, temp_dir).download_for_bounds(bounds)

        logger.info("Processing completed successfully")

    except Exception as e:
        logger.error(f"Error during processing: {e}")
        raise


if __name__ == '__main__':
    main()
|
98
src/srtm_downloader.py
Normal file
98
src/srtm_downloader.py
Normal file
@@ -0,0 +1,98 @@
|
||||
import requests
|
||||
import math
|
||||
import logging
|
||||
from pathlib import Path
|
||||
from urllib.parse import urljoin
|
||||
import time
|
||||
|
||||
logger = logging.getLogger(__name__)
|
||||
|
||||
class SRTMDownloader:
    """Downloads the 1x1-degree SRTM (SRTMGL1) tiles covering a bounding box."""

    def __init__(self, output_dir, temp_dir=None):
        """
        Args:
            output_dir: Directory where downloaded .hgt.zip tiles are stored.
            temp_dir: Scratch directory (defaults to <output_dir>/temp).
        """
        self.output_dir = Path(output_dir)
        self.temp_dir = Path(temp_dir) if temp_dir else self.output_dir / 'temp'
        # Primary mirror; also used as the first entry in _download_tile.
        self.base_url = "https://cloud.sdstate.edu/index.php/s/UjQFkr4y8EGB3JH/download"

        # Create directories
        self.output_dir.mkdir(parents=True, exist_ok=True)
        self.temp_dir.mkdir(parents=True, exist_ok=True)

    def _get_srtm_tiles(self, bounds):
        """Calculate which SRTM tiles are needed for given bounds.

        Args:
            bounds: Dict with 'min_lon', 'min_lat', 'max_lon', 'max_lat' keys
                (WGS84 degrees).

        Returns:
            List of tile names such as 'N40W105' (named by SW corner).
        """
        min_lon, min_lat = bounds['min_lon'], bounds['min_lat']
        max_lon, max_lat = bounds['max_lon'], bounds['max_lat']

        # SRTM tiles are 1 degree x 1 degree. Iterate over the SW corner of
        # every tile the box touches; floor(max) + 1 (instead of ceil(max))
        # guarantees at least one tile even for degenerate bounds that sit
        # exactly on an integer boundary (ceil would yield an empty range).
        tiles = []
        for lon in range(math.floor(min_lon), math.floor(max_lon) + 1):
            for lat in range(math.floor(min_lat), math.floor(max_lat) + 1):
                # SRTM naming convention: N/S + 2-digit lat, E/W + 3-digit lon.
                lat_str = f"{'N' if lat >= 0 else 'S'}{abs(lat):02d}"
                lon_str = f"{'E' if lon >= 0 else 'W'}{abs(lon):03d}"
                tiles.append(f"{lat_str}{lon_str}")

        return tiles

    def _download_tile(self, tile_name):
        """Download a single SRTM tile, trying each known mirror in turn.

        Returns:
            True if the tile is present locally after the call, else False.
        """
        filename = f"{tile_name}.SRTMGL1.hgt.zip"
        output_path = self.output_dir / filename

        if output_path.exists():
            logger.info(f"Tile {tile_name} already exists, skipping")
            return True

        # Try multiple SRTM data sources. The original code had the literal
        # placeholder "(unknown)" here instead of the tile filename.
        urls = [
            f"{self.base_url}?path=%2F&files={filename}",
            # NASA/USGS LP DAAC mirror — may require Earthdata authentication.
            f"https://e4ftl01.cr.usgs.gov/MEASURES/SRTMGL1.003/2000.02.11/{filename}",
        ]

        # Stream into a .part file first so a failed/interrupted download never
        # leaves a truncated file at output_path — which the exists() check
        # above would then skip as "already downloaded" on every later run.
        # Same directory as output_path keeps the final rename atomic.
        partial_path = output_path.with_suffix(output_path.suffix + '.part')

        for url in urls:
            try:
                logger.info(f"Downloading {tile_name} from {url}")

                with requests.get(url, timeout=300, stream=True) as response:
                    response.raise_for_status()
                    with open(partial_path, 'wb') as f:
                        for chunk in response.iter_content(chunk_size=8192):
                            f.write(chunk)

                partial_path.replace(output_path)
                logger.info(f"Successfully downloaded {tile_name}")
                return True

            except requests.RequestException as e:
                logger.warning(f"Failed to download {tile_name} from {url}: {e}")
                partial_path.unlink(missing_ok=True)
                continue

        logger.error(f"Failed to download tile {tile_name} from all sources")
        return False

    def download_for_bounds(self, bounds):
        """Download all SRTM tiles needed for given bounds.

        Returns:
            Number of tiles successfully downloaded (or already present).

        Raises:
            RuntimeError: If any tile could not be fetched from any source.
        """
        tiles = self._get_srtm_tiles(bounds)

        logger.info(f"Need to download {len(tiles)} SRTM tiles")
        logger.info(f"Tiles: {tiles}")

        successful = 0
        failed = 0

        for tile in tiles:
            if self._download_tile(tile):
                successful += 1
            else:
                failed += 1

            # Be nice to the server
            time.sleep(1)

        logger.info(f"Download complete: {successful} successful, {failed} failed")

        if failed > 0:
            raise RuntimeError(f"Failed to download {failed} tiles")

        return successful
|
Reference in New Issue
Block a user