Compare commits

1 Commits

Author SHA1 Message Date
2192c2ae9c Add complete GIS ETL project skeleton for SRTM data processing
- Python modules for geo boundary processing and SRTM downloads
- Docker containerization with GDAL support
- Development environment with devcontainer
- Orchestration scripts for local and containerized execution
- Support for KMZ/GeoJSON boundary files and NASA SRTM data

🤖 Generated with [Claude Code](https://claude.ai/code)

Co-Authored-By: Claude <noreply@anthropic.com>
2025-07-21 12:24:35 -05:00
10 changed files with 419 additions and 1 deletions

View File

@@ -0,0 +1,22 @@
{
"name": "GIS ETL Development",
"image": "mcr.microsoft.com/devcontainers/python:3.11-bullseye",
"features": {
"ghcr.io/devcontainers/features/git:1": {},
"ghcr.io/devcontainers/features/github-cli:1": {}
},
"customizations": {
"vscode": {
"extensions": [
"ms-python.python",
"ms-python.flake8",
"ms-python.black-formatter",
"redhat.vscode-yaml"
]
}
},
"postCreateCommand": "pip install -r requirements.txt",
"remoteUser": "vscode",
"workspaceMount": "source=${localWorkspaceFolder},target=/workspace,type=bind",
"workspaceFolder": "/workspace"
}

24
Dockerfile Normal file
View File

@@ -0,0 +1,24 @@
# Container image for the SRTM ETL: Python 3.11 plus the native GDAL stack
# required by the geospatial Python dependencies (GDAL, rasterio, fiona).
FROM python:3.11-slim
WORKDIR /app
# Native libraries/tools needed at build time (libgdal-dev headers for the
# pip GDAL build) and at runtime (gdal-bin utilities).
# NOTE(review): python3-gdal installs bindings for Debian's system python3,
# not this image's /usr/local python — confirm it is actually needed
# alongside the pip-installed GDAL package in requirements.txt.
RUN apt-get update && apt-get install -y \
curl \
wget \
gdal-bin \
libgdal-dev \
python3-gdal \
&& rm -rf /var/lib/apt/lists/*
# Install Python dependencies first so Docker layer caching survives
# source-only changes.
COPY requirements.txt .
RUN pip install --no-cache-dir -r requirements.txt
COPY src/ ./src/
COPY scripts/ ./scripts/
COPY config/ ./config/
RUN chmod +x scripts/*.sh
# Make src/ importable so main.py can import sibling modules.
ENV PYTHONPATH=/app/src
CMD ["python", "src/main.py"]

View File

@@ -1,3 +1,11 @@
# KNEL-SwEng-Platform-ETL-GIS-Inbox # KNEL-SwEng-Platform-ETL-GIS-Inbox
Code to ingest raw GIS data The purpose of this repository is to contain:
devcontainer.json
Dockerfile
Bash/python scripts
meant to run in a Docker container, orchestrated by Jenkins, Apache Airflow, or similar tooling, which will download data from NASA/USGS for use in planning coverage for wireless internet service providers.
Specifically it will need to be given a geo boundary in some format (kmz/geojson etc) and pull SRTM files from NASA.

28
config/logging.conf Normal file
View File

@@ -0,0 +1,28 @@
# Logging configuration for the SRTM ETL (consumed via logging.config.fileConfig).
[loggers]
keys=root

[handlers]
keys=consoleHandler,fileHandler

[formatters]
keys=simpleFormatter

# The root logger must pass DEBUG records through, otherwise the file
# handler's level=DEBUG is never reached (the logger filters records
# before handlers see them). Each handler applies its own level below.
[logger_root]
level=DEBUG
handlers=consoleHandler,fileHandler

# Console: INFO and above only, to keep terminal output readable.
[handler_consoleHandler]
class=StreamHandler
level=INFO
formatter=simpleFormatter
args=(sys.stdout,)

# File: full DEBUG detail, written to etl.log relative to the working dir.
[handler_fileHandler]
class=FileHandler
level=DEBUG
formatter=simpleFormatter
args=('etl.log',)

[formatter_simpleFormatter]
format=%(asctime)s - %(name)s - %(levelname)s - %(message)s
datefmt=%Y-%m-%d %H:%M:%S

10
requirements.txt Normal file
View File

@@ -0,0 +1,10 @@
requests>=2.31.0
geopandas>=0.14.0
shapely>=2.0.0
fiona>=1.9.0
rasterio>=1.3.0
pyproj>=3.6.0
numpy>=1.24.0
pandas>=2.0.0
GDAL>=3.6.0
click>=8.1.0

60
scripts/docker_run.sh Executable file
View File

@@ -0,0 +1,60 @@
#!/bin/bash
# Build the GIS ETL Docker image and run it against a boundary file.
# The boundary file is mounted read-only; output and temp directories
# are mounted read-write.
set -e

IMAGE_NAME="gis-etl-inbox"
TAG="latest"
BOUNDARY_FILE=""
OUTPUT_DIR="$(pwd)/data/output"
DATA_DIR="$(pwd)/data"

usage() {
    echo "Usage: $0 -b BOUNDARY_FILE [-o OUTPUT_DIR] [-d DATA_DIR] [-t TAG]"
    echo "  -b BOUNDARY_FILE  Path to boundary file (KMZ/GeoJSON)"
    echo "  -o OUTPUT_DIR     Host output directory (default: ./data/output)"
    echo "  -d DATA_DIR       Host data directory (default: ./data)"
    echo "  -t TAG            Docker image tag (default: latest)"
    exit 1
}

while getopts "b:o:d:t:h" opt; do
    case $opt in
        b) BOUNDARY_FILE="$OPTARG" ;;
        o) OUTPUT_DIR="$OPTARG" ;;
        d) DATA_DIR="$OPTARG" ;;
        t) TAG="$OPTARG" ;;
        h) usage ;;
        \?) echo "Invalid option -$OPTARG" >&2; usage ;;
    esac
done

if [ -z "$BOUNDARY_FILE" ]; then
    echo "Error: Boundary file is required"
    usage
fi

if [ ! -f "$BOUNDARY_FILE" ]; then
    echo "Error: Boundary file '$BOUNDARY_FILE' not found"
    exit 1
fi

# Pre-create the mounted directories (including temp) so Docker does not
# create them owned by root on the host.
mkdir -p "$OUTPUT_DIR" "$DATA_DIR/temp"

# docker -v requires absolute host paths; a relative path would be treated
# as a named volume instead of a bind mount. Resolve everything portably
# (no readlink -f, which is missing on macOS).
BOUNDARY_FILE="$(cd "$(dirname "$BOUNDARY_FILE")" && pwd)/$(basename "$BOUNDARY_FILE")"
OUTPUT_DIR="$(cd "$OUTPUT_DIR" && pwd)"
DATA_DIR="$(cd "$DATA_DIR" && pwd)"
BOUNDARY_FILENAME=$(basename "$BOUNDARY_FILE")

echo "Building Docker image..."
docker build -t "${IMAGE_NAME}:${TAG}" .

echo "Running ETL in Docker container..."
docker run --rm \
    -v "$BOUNDARY_FILE:/app/data/input/$BOUNDARY_FILENAME:ro" \
    -v "$OUTPUT_DIR:/app/data/output" \
    -v "$DATA_DIR/temp:/app/data/temp" \
    "${IMAGE_NAME}:${TAG}" \
    python src/main.py \
        --boundary-file "/app/data/input/$BOUNDARY_FILENAME" \
        --output-dir "/app/data/output" \
        --temp-dir "/app/data/temp"

echo "Docker ETL process completed successfully"

58
scripts/run_etl.sh Executable file
View File

@@ -0,0 +1,58 @@
#!/bin/bash
# Run the SRTM ETL directly on the host (outside Docker).
set -e

BOUNDARY_FILE=""
OUTPUT_DIR="./data/output"
TEMP_DIR="./data/temp"
VERBOSE=false

usage() {
    echo "Usage: $0 -b BOUNDARY_FILE [-o OUTPUT_DIR] [-t TEMP_DIR] [-v]"
    echo "  -b BOUNDARY_FILE  Path to boundary file (KMZ/GeoJSON)"
    echo "  -o OUTPUT_DIR     Output directory (default: ./data/output)"
    echo "  -t TEMP_DIR       Temporary directory (default: ./data/temp)"
    echo "  -v                Verbose output"
    exit 1
}

while getopts "b:o:t:vh" opt; do
    case $opt in
        b) BOUNDARY_FILE="$OPTARG" ;;
        o) OUTPUT_DIR="$OPTARG" ;;
        t) TEMP_DIR="$OPTARG" ;;
        v) VERBOSE=true ;;
        h) usage ;;
        \?) echo "Invalid option -$OPTARG" >&2; usage ;;
    esac
done

if [ -z "$BOUNDARY_FILE" ]; then
    echo "Error: Boundary file is required"
    usage
fi

if [ ! -f "$BOUNDARY_FILE" ]; then
    echo "Error: Boundary file '$BOUNDARY_FILE' not found"
    exit 1
fi

echo "Starting SRTM ETL process..."
echo "Boundary file: $BOUNDARY_FILE"
echo "Output directory: $OUTPUT_DIR"
echo "Temp directory: $TEMP_DIR"

# Export the chosen level so the Python process can honor it; previously
# LOG_LEVEL was computed here but never exported or used (dead variable).
if [ "$VERBOSE" = true ]; then
    LOG_LEVEL="DEBUG"
else
    LOG_LEVEL="INFO"
fi
export LOG_LEVEL

# Make src/ importable so main.py can import its sibling modules.
export PYTHONPATH="${PYTHONPATH}:$(pwd)/src"

python3 src/main.py \
    --boundary-file "$BOUNDARY_FILE" \
    --output-dir "$OUTPUT_DIR" \
    --temp-dir "$TEMP_DIR"

echo "ETL process completed successfully"

70
src/geo_processor.py Normal file
View File

@@ -0,0 +1,70 @@
import geopandas as gpd
import zipfile
import tempfile
from pathlib import Path
import logging
logger = logging.getLogger(__name__)
class GeoProcessor:
    """Load a geographic boundary file and expose its bounds/geometry in WGS84.

    Supports KMZ, KML, GeoJSON/JSON, Shapefile, and zipped Shapefile inputs.
    All loaded features are normalized to EPSG:4326 (lon/lat).
    """

    def __init__(self, boundary_file):
        # Path to the source boundary file.
        self.boundary_file = Path(boundary_file)
        # GeoDataFrame of loaded features, reprojected to EPSG:4326.
        self.gdf = None
        self._load_boundary()

    def _load_boundary(self):
        """Load the boundary file and normalize its CRS to WGS84.

        Raises:
            ValueError: if the file extension is not a supported format.
        """
        try:
            suffix = self.boundary_file.suffix.lower()
            if suffix == '.kmz':
                self._load_kmz()
            elif suffix in ('.geojson', '.json', '.kml', '.shp', '.zip'):
                self.gdf = gpd.read_file(self.boundary_file)
            else:
                raise ValueError(f"Unsupported file format: {self.boundary_file.suffix}")
            # Normalize to WGS84 so downstream SRTM tile math is in lon/lat.
            if self.gdf.crs is None:
                # Some sources (notably KML extracted from KMZ) omit CRS
                # metadata; KML is WGS84 by specification, so assume it
                # instead of letting to_crs() fail on a missing CRS.
                logger.warning("Boundary file has no CRS; assuming EPSG:4326")
                self.gdf = self.gdf.set_crs('EPSG:4326')
            elif self.gdf.crs != 'EPSG:4326':
                self.gdf = self.gdf.to_crs('EPSG:4326')
            logger.info(f"Loaded {len(self.gdf)} features from {self.boundary_file}")
        except Exception as e:
            logger.error(f"Failed to load boundary file: {e}")
            raise

    def _load_kmz(self):
        """Extract the KMZ archive and load the first KML file found inside.

        Raises:
            ValueError: if the archive contains no KML file.
        """
        with tempfile.TemporaryDirectory() as temp_dir:
            # NOTE: extractall trusts archive member paths; only feed this
            # trusted KMZ inputs.
            with zipfile.ZipFile(self.boundary_file, 'r') as zip_ref:
                zip_ref.extractall(temp_dir)
            # KMZ archives usually hold doc.kml at the root, but search
            # recursively in case the KML is nested in a subdirectory.
            kml_files = sorted(Path(temp_dir).rglob('*.kml'))
            if not kml_files:
                raise ValueError("No KML file found in KMZ archive")
            try:
                # Older geopandas/fiona builds need the KML driver enabled
                # explicitly; newer backends (pyogrio) lack this attribute
                # and read KML natively.
                gpd.io.file.fiona.drvsupport.supported_drivers['KML'] = 'rw'
            except AttributeError:
                pass
            self.gdf = gpd.read_file(kml_files[0], driver='KML')

    def get_bounds(self):
        """Return the bounding box of all features.

        Returns:
            dict: keys min_lon, min_lat, max_lon, max_lat (EPSG:4326).

        Raises:
            ValueError: if no boundary data has been loaded.
        """
        if self.gdf is None:
            raise ValueError("No boundary data loaded")
        min_lon, min_lat, max_lon, max_lat = self.gdf.total_bounds
        return {
            'min_lon': min_lon,
            'min_lat': min_lat,
            'max_lon': max_lon,
            'max_lat': max_lat
        }

    def get_geometry(self):
        """Return the union of all feature geometries as a single geometry.

        Raises:
            ValueError: if no boundary data has been loaded.
        """
        if self.gdf is None:
            raise ValueError("No boundary data loaded")
        geoms = self.gdf.geometry
        # geopandas >= 1.0 deprecates unary_union in favor of union_all().
        if hasattr(geoms, 'union_all'):
            return geoms.union_all()
        return geoms.unary_union

40
src/main.py Normal file
View File

@@ -0,0 +1,40 @@
#!/usr/bin/env python3
import click
import logging
from pathlib import Path
from geo_processor import GeoProcessor
from srtm_downloader import SRTMDownloader
logging.basicConfig(level=logging.INFO, format='%(asctime)s - %(levelname)s - %(message)s')
logger = logging.getLogger(__name__)
@click.command()
@click.option('--boundary-file', required=True, help='Path to boundary file (KMZ/GeoJSON)')
@click.option('--output-dir', default='./data/output', help='Output directory for SRTM files')
@click.option('--temp-dir', default='./data/temp', help='Temporary directory for processing')
def main(boundary_file, output_dir, temp_dir):
    """Download SRTM data for given geographic boundary."""
    try:
        logger.info(f"Processing boundary file: {boundary_file}")

        # Make sure both working directories exist before any processing.
        for directory in (output_dir, temp_dir):
            Path(directory).mkdir(parents=True, exist_ok=True)

        # Derive the lon/lat bounding box from the boundary file, then
        # fetch every SRTM tile touched by that box.
        bounds = GeoProcessor(boundary_file).get_bounds()
        SRTMDownloader(output_dir, temp_dir).download_for_bounds(bounds)

        logger.info("Processing completed successfully")
    except Exception as e:
        logger.error(f"Error during processing: {e}")
        raise


if __name__ == '__main__':
    main()

98
src/srtm_downloader.py Normal file
View File

@@ -0,0 +1,98 @@
import requests
import math
import logging
from pathlib import Path
from urllib.parse import urljoin
import time
logger = logging.getLogger(__name__)
class SRTMDownloader:
    """Download SRTM 1-arc-second (SRTMGL1) elevation tiles for a bounding box."""

    def __init__(self, output_dir, temp_dir=None):
        """Prepare output/scratch directories for downloads.

        Args:
            output_dir: Directory where completed tile archives are stored.
            temp_dir: Scratch directory for in-progress downloads
                (defaults to <output_dir>/temp).
        """
        self.output_dir = Path(output_dir)
        self.temp_dir = Path(temp_dir) if temp_dir else self.output_dir / 'temp'
        # Mirror of the SRTMGL1 archive used as the primary download source.
        self.base_url = "https://cloud.sdstate.edu/index.php/s/UjQFkr4y8EGB3JH/download"
        # Create directories up front so downloads never fail on a missing path.
        self.output_dir.mkdir(parents=True, exist_ok=True)
        self.temp_dir.mkdir(parents=True, exist_ok=True)

    def _get_srtm_tiles(self, bounds):
        """Return the SRTM tile names covering the given lon/lat bounds.

        SRTM tiles are 1x1 degree and named after their south-west corner
        (e.g. N34W081). The stop is floor(max) + 1 rather than ceil(max) so
        a boundary that sits exactly on a tile edge — or degenerates to a
        point — still selects the covering tile instead of none.

        Args:
            bounds: dict with min_lon/min_lat/max_lon/max_lat keys.

        Returns:
            list[str]: tile names, longitude-major order.
        """
        min_lon, min_lat = bounds['min_lon'], bounds['min_lat']
        max_lon, max_lat = bounds['max_lon'], bounds['max_lat']
        tiles = []
        for lon in range(math.floor(min_lon), math.floor(max_lon) + 1):
            for lat in range(math.floor(min_lat), math.floor(max_lat) + 1):
                # Naming convention: N/S + 2-digit lat, E/W + 3-digit lon.
                lat_str = f"{'N' if lat >= 0 else 'S'}{abs(lat):02d}"
                lon_str = f"{'E' if lon >= 0 else 'W'}{abs(lon):03d}"
                tiles.append(f"{lat_str}{lon_str}")
        return tiles

    def _download_tile(self, tile_name):
        """Download a single SRTM tile archive into the output directory.

        Tries each configured source in order; streams to the temp dir and
        renames into place only on success, so an interrupted download never
        leaves a truncated archive in the output directory.

        Returns:
            bool: True if the tile is now available locally, False otherwise.
        """
        filename = f"{tile_name}.SRTMGL1.hgt.zip"
        output_path = self.output_dir / filename
        if output_path.exists():
            logger.info(f"Tile {tile_name} already exists, skipping")
            return True
        # Multiple SRTM data sources, tried in order. The filename must be
        # interpolated into each URL (previously a placeholder was left here,
        # so every request fetched a nonexistent resource).
        urls = [
            f"{self.base_url}?path=%2F&files={filename}",
            f"https://e4ftl01.cr.usgs.gov/MEASURES/SRTMGL1.003/2000.02.11/{filename}",
        ]
        temp_path = self.temp_dir / filename
        for url in urls:
            try:
                logger.info(f"Downloading {tile_name} from {url}")
                response = requests.get(url, timeout=300, stream=True)
                response.raise_for_status()
                with open(temp_path, 'wb') as f:
                    for chunk in response.iter_content(chunk_size=8192):
                        f.write(chunk)
                # Atomic (same filesystem) move into the output directory.
                temp_path.replace(output_path)
                logger.info(f"Successfully downloaded {tile_name}")
                return True
            except (requests.RequestException, OSError) as e:
                logger.warning(f"Failed to download {tile_name} from {url}: {e}")
                # Drop any partial file before trying the next source.
                temp_path.unlink(missing_ok=True)
                continue
        logger.error(f"Failed to download tile {tile_name} from all sources")
        return False

    def download_for_bounds(self, bounds):
        """Download every SRTM tile needed to cover the given bounds.

        Args:
            bounds: dict with min_lon/min_lat/max_lon/max_lat keys.

        Returns:
            int: number of tiles successfully obtained.

        Raises:
            RuntimeError: if any tile could not be downloaded.
        """
        tiles = self._get_srtm_tiles(bounds)
        logger.info(f"Need to download {len(tiles)} SRTM tiles")
        logger.info(f"Tiles: {tiles}")
        successful = 0
        failed = 0
        for tile in tiles:
            if self._download_tile(tile):
                successful += 1
            else:
                failed += 1
            # Throttle so we do not hammer the data servers.
            time.sleep(1)
        logger.info(f"Download complete: {successful} successful, {failed} failed")
        if failed > 0:
            raise RuntimeError(f"Failed to download {failed} tiles")
        return successful