Compare commits

1 Commits

Author SHA1 Message Date
2192c2ae9c Add complete GIS ETL project skeleton for SRTM data processing
- Python modules for geo boundary processing and SRTM downloads
- Docker containerization with GDAL support
- Development environment with devcontainer
- Orchestration scripts for local and containerized execution
- Support for KMZ/GeoJSON boundary files and NASA SRTM data

🤖 Generated with [Claude Code](https://claude.ai/code)

Co-Authored-By: Claude <noreply@anthropic.com>
2025-07-21 12:24:35 -05:00
10 changed files with 419 additions and 1 deletions

View File

@@ -0,0 +1,22 @@
{
"name": "GIS ETL Development",
"image": "mcr.microsoft.com/devcontainers/python:3.11-bullseye",
"features": {
"ghcr.io/devcontainers/features/git:1": {},
"ghcr.io/devcontainers/features/github-cli:1": {}
},
"customizations": {
"vscode": {
"extensions": [
"ms-python.python",
"ms-python.flake8",
"ms-python.black-formatter",
"redhat.vscode-yaml"
]
}
},
"postCreateCommand": "pip install -r requirements.txt",
"remoteUser": "vscode",
"workspaceMount": "source=${localWorkspaceFolder},target=/workspace,type=bind",
"workspaceFolder": "/workspace"
}

24
Dockerfile Normal file
View File

@@ -0,0 +1,24 @@
# Container image for the SRTM ETL: Python 3.11 plus the native GDAL stack
# required by the geospatial Python dependencies (GDAL, rasterio, fiona).
FROM python:3.11-slim
WORKDIR /app
# Native libraries/tools needed at build time (libgdal-dev headers for the
# pip GDAL build) and at runtime (gdal-bin utilities).
# NOTE(review): python3-gdal installs bindings for Debian's system python3,
# not this image's /usr/local python — confirm it is actually needed
# alongside the pip-installed GDAL package in requirements.txt.
RUN apt-get update && apt-get install -y \
curl \
wget \
gdal-bin \
libgdal-dev \
python3-gdal \
&& rm -rf /var/lib/apt/lists/*
# Install Python dependencies first so Docker layer caching survives
# source-only changes.
COPY requirements.txt .
RUN pip install --no-cache-dir -r requirements.txt
COPY src/ ./src/
COPY scripts/ ./scripts/
COPY config/ ./config/
RUN chmod +x scripts/*.sh
# Make src/ importable so main.py can import sibling modules.
ENV PYTHONPATH=/app/src
CMD ["python", "src/main.py"]

View File

@@ -1,3 +1,11 @@
# KNEL-SwEng-Platform-ETL-GIS-Inbox # KNEL-SwEng-Platform-ETL-GIS-Inbox
Code to ingest raw GIS data The purpose of this repository is to contain:
devcontainer.json
Dockerfile
Bash/python scripts
meant to run in a Docker container, orchestrated by Jenkins, Apache Airflow, or similar tooling, which will download data from NASA/USGS for use in planning coverage for wireless internet service providers.
Specifically it will need to be given a geo boundary in some format (kmz/geojson etc) and pull SRTM files from NASA.

28
config/logging.conf Normal file
View File

@@ -0,0 +1,28 @@
# Logging configuration for the SRTM ETL (consumed via logging.config.fileConfig).
[loggers]
keys=root

[handlers]
keys=consoleHandler,fileHandler

[formatters]
keys=simpleFormatter

# The root logger must pass DEBUG records through, otherwise the file
# handler's level=DEBUG is never reached (the logger filters records
# before handlers see them). Each handler applies its own level below.
[logger_root]
level=DEBUG
handlers=consoleHandler,fileHandler

# Console: INFO and above only, to keep terminal output readable.
[handler_consoleHandler]
class=StreamHandler
level=INFO
formatter=simpleFormatter
args=(sys.stdout,)

# File: full DEBUG detail, written to etl.log relative to the working dir.
[handler_fileHandler]
class=FileHandler
level=DEBUG
formatter=simpleFormatter
args=('etl.log',)

[formatter_simpleFormatter]
format=%(asctime)s - %(name)s - %(levelname)s - %(message)s
datefmt=%Y-%m-%d %H:%M:%S

10
requirements.txt Normal file
View File

@@ -0,0 +1,10 @@
requests>=2.31.0
geopandas>=0.14.0
shapely>=2.0.0
fiona>=1.9.0
rasterio>=1.3.0
pyproj>=3.6.0
numpy>=1.24.0
pandas>=2.0.0
GDAL>=3.6.0
click>=8.1.0

60
scripts/docker_run.sh Executable file
View File

@@ -0,0 +1,60 @@
#!/bin/bash
# Build the GIS ETL Docker image and run it against a boundary file.
# The boundary file is mounted read-only; output and temp directories
# are mounted read-write.
set -e

IMAGE_NAME="gis-etl-inbox"
TAG="latest"
BOUNDARY_FILE=""
OUTPUT_DIR="$(pwd)/data/output"
DATA_DIR="$(pwd)/data"

usage() {
    echo "Usage: $0 -b BOUNDARY_FILE [-o OUTPUT_DIR] [-d DATA_DIR] [-t TAG]"
    echo "  -b BOUNDARY_FILE  Path to boundary file (KMZ/GeoJSON)"
    echo "  -o OUTPUT_DIR     Host output directory (default: ./data/output)"
    echo "  -d DATA_DIR       Host data directory (default: ./data)"
    echo "  -t TAG            Docker image tag (default: latest)"
    exit 1
}

while getopts "b:o:d:t:h" opt; do
    case $opt in
        b) BOUNDARY_FILE="$OPTARG" ;;
        o) OUTPUT_DIR="$OPTARG" ;;
        d) DATA_DIR="$OPTARG" ;;
        t) TAG="$OPTARG" ;;
        h) usage ;;
        \?) echo "Invalid option -$OPTARG" >&2; usage ;;
    esac
done

if [ -z "$BOUNDARY_FILE" ]; then
    echo "Error: Boundary file is required"
    usage
fi

if [ ! -f "$BOUNDARY_FILE" ]; then
    echo "Error: Boundary file '$BOUNDARY_FILE' not found"
    exit 1
fi

# Pre-create the mounted directories (including temp) so Docker does not
# create them owned by root on the host.
mkdir -p "$OUTPUT_DIR" "$DATA_DIR/temp"

# docker -v requires absolute host paths; a relative path would be treated
# as a named volume instead of a bind mount. Resolve everything portably
# (no readlink -f, which is missing on macOS).
BOUNDARY_FILE="$(cd "$(dirname "$BOUNDARY_FILE")" && pwd)/$(basename "$BOUNDARY_FILE")"
OUTPUT_DIR="$(cd "$OUTPUT_DIR" && pwd)"
DATA_DIR="$(cd "$DATA_DIR" && pwd)"
BOUNDARY_FILENAME=$(basename "$BOUNDARY_FILE")

echo "Building Docker image..."
docker build -t "${IMAGE_NAME}:${TAG}" .

echo "Running ETL in Docker container..."
docker run --rm \
    -v "$BOUNDARY_FILE:/app/data/input/$BOUNDARY_FILENAME:ro" \
    -v "$OUTPUT_DIR:/app/data/output" \
    -v "$DATA_DIR/temp:/app/data/temp" \
    "${IMAGE_NAME}:${TAG}" \
    python src/main.py \
        --boundary-file "/app/data/input/$BOUNDARY_FILENAME" \
        --output-dir "/app/data/output" \
        --temp-dir "/app/data/temp"

echo "Docker ETL process completed successfully"

58
scripts/run_etl.sh Executable file
View File

@@ -0,0 +1,58 @@
#!/bin/bash
# Run the SRTM ETL directly on the host (outside Docker).
set -e

BOUNDARY_FILE=""
OUTPUT_DIR="./data/output"
TEMP_DIR="./data/temp"
VERBOSE=false

usage() {
    echo "Usage: $0 -b BOUNDARY_FILE [-o OUTPUT_DIR] [-t TEMP_DIR] [-v]"
    echo "  -b BOUNDARY_FILE  Path to boundary file (KMZ/GeoJSON)"
    echo "  -o OUTPUT_DIR     Output directory (default: ./data/output)"
    echo "  -t TEMP_DIR       Temporary directory (default: ./data/temp)"
    echo "  -v                Verbose output"
    exit 1
}

while getopts "b:o:t:vh" opt; do
    case $opt in
        b) BOUNDARY_FILE="$OPTARG" ;;
        o) OUTPUT_DIR="$OPTARG" ;;
        t) TEMP_DIR="$OPTARG" ;;
        v) VERBOSE=true ;;
        h) usage ;;
        \?) echo "Invalid option -$OPTARG" >&2; usage ;;
    esac
done

if [ -z "$BOUNDARY_FILE" ]; then
    echo "Error: Boundary file is required"
    usage
fi

if [ ! -f "$BOUNDARY_FILE" ]; then
    echo "Error: Boundary file '$BOUNDARY_FILE' not found"
    exit 1
fi

echo "Starting SRTM ETL process..."
echo "Boundary file: $BOUNDARY_FILE"
echo "Output directory: $OUTPUT_DIR"
echo "Temp directory: $TEMP_DIR"

# Export the chosen level so the Python process can honor it; previously
# LOG_LEVEL was computed here but never exported or used (dead variable).
if [ "$VERBOSE" = true ]; then
    LOG_LEVEL="DEBUG"
else
    LOG_LEVEL="INFO"
fi
export LOG_LEVEL

# Make src/ importable so main.py can import its sibling modules.
export PYTHONPATH="${PYTHONPATH}:$(pwd)/src"

python3 src/main.py \
    --boundary-file "$BOUNDARY_FILE" \
    --output-dir "$OUTPUT_DIR" \
    --temp-dir "$TEMP_DIR"

echo "ETL process completed successfully"

70
src/geo_processor.py Normal file
View File

@@ -0,0 +1,70 @@
import geopandas as gpd
import zipfile
import tempfile
from pathlib import Path
import logging
logger = logging.getLogger(__name__)
class GeoProcessor:
    """Load a geographic boundary file and expose its bounds/geometry in WGS84.

    Supports KMZ, KML, GeoJSON/JSON, Shapefile, and zipped Shapefile inputs.
    All loaded features are normalized to EPSG:4326 (lon/lat).
    """

    def __init__(self, boundary_file):
        # Path to the source boundary file.
        self.boundary_file = Path(boundary_file)
        # GeoDataFrame of loaded features, reprojected to EPSG:4326.
        self.gdf = None
        self._load_boundary()

    def _load_boundary(self):
        """Load the boundary file and normalize its CRS to WGS84.

        Raises:
            ValueError: if the file extension is not a supported format.
        """
        try:
            suffix = self.boundary_file.suffix.lower()
            if suffix == '.kmz':
                self._load_kmz()
            elif suffix in ('.geojson', '.json', '.kml', '.shp', '.zip'):
                self.gdf = gpd.read_file(self.boundary_file)
            else:
                raise ValueError(f"Unsupported file format: {self.boundary_file.suffix}")
            # Normalize to WGS84 so downstream SRTM tile math is in lon/lat.
            if self.gdf.crs is None:
                # Some sources (notably KML extracted from KMZ) omit CRS
                # metadata; KML is WGS84 by specification, so assume it
                # instead of letting to_crs() fail on a missing CRS.
                logger.warning("Boundary file has no CRS; assuming EPSG:4326")
                self.gdf = self.gdf.set_crs('EPSG:4326')
            elif self.gdf.crs != 'EPSG:4326':
                self.gdf = self.gdf.to_crs('EPSG:4326')
            logger.info(f"Loaded {len(self.gdf)} features from {self.boundary_file}")
        except Exception as e:
            logger.error(f"Failed to load boundary file: {e}")
            raise

    def _load_kmz(self):
        """Extract the KMZ archive and load the first KML file found inside.

        Raises:
            ValueError: if the archive contains no KML file.
        """
        with tempfile.TemporaryDirectory() as temp_dir:
            # NOTE: extractall trusts archive member paths; only feed this
            # trusted KMZ inputs.
            with zipfile.ZipFile(self.boundary_file, 'r') as zip_ref:
                zip_ref.extractall(temp_dir)
            # KMZ archives usually hold doc.kml at the root, but search
            # recursively in case the KML is nested in a subdirectory.
            kml_files = sorted(Path(temp_dir).rglob('*.kml'))
            if not kml_files:
                raise ValueError("No KML file found in KMZ archive")
            try:
                # Older geopandas/fiona builds need the KML driver enabled
                # explicitly; newer backends (pyogrio) lack this attribute
                # and read KML natively.
                gpd.io.file.fiona.drvsupport.supported_drivers['KML'] = 'rw'
            except AttributeError:
                pass
            self.gdf = gpd.read_file(kml_files[0], driver='KML')

    def get_bounds(self):
        """Return the bounding box of all features.

        Returns:
            dict: keys min_lon, min_lat, max_lon, max_lat (EPSG:4326).

        Raises:
            ValueError: if no boundary data has been loaded.
        """
        if self.gdf is None:
            raise ValueError("No boundary data loaded")
        min_lon, min_lat, max_lon, max_lat = self.gdf.total_bounds
        return {
            'min_lon': min_lon,
            'min_lat': min_lat,
            'max_lon': max_lon,
            'max_lat': max_lat
        }

    def get_geometry(self):
        """Return the union of all feature geometries as a single geometry.

        Raises:
            ValueError: if no boundary data has been loaded.
        """
        if self.gdf is None:
            raise ValueError("No boundary data loaded")
        geoms = self.gdf.geometry
        # geopandas >= 1.0 deprecates unary_union in favor of union_all().
        if hasattr(geoms, 'union_all'):
            return geoms.union_all()
        return geoms.unary_union

40
src/main.py Normal file
View File

@@ -0,0 +1,40 @@
#!/usr/bin/env python3
import click
import logging
from pathlib import Path
from geo_processor import GeoProcessor
from srtm_downloader import SRTMDownloader
logging.basicConfig(level=logging.INFO, format='%(asctime)s - %(levelname)s - %(message)s')
logger = logging.getLogger(__name__)
@click.command()
@click.option('--boundary-file', required=True, help='Path to boundary file (KMZ/GeoJSON)')
@click.option('--output-dir', default='./data/output', help='Output directory for SRTM files')
@click.option('--temp-dir', default='./data/temp', help='Temporary directory for processing')
def main(boundary_file, output_dir, temp_dir):
    """Download SRTM data for given geographic boundary."""
    try:
        logger.info(f"Processing boundary file: {boundary_file}")

        # Make sure both working directories exist before any processing.
        for directory in (output_dir, temp_dir):
            Path(directory).mkdir(parents=True, exist_ok=True)

        # Derive the lon/lat bounding box from the boundary file, then
        # fetch every SRTM tile touched by that box.
        bounds = GeoProcessor(boundary_file).get_bounds()
        SRTMDownloader(output_dir, temp_dir).download_for_bounds(bounds)

        logger.info("Processing completed successfully")
    except Exception as e:
        logger.error(f"Error during processing: {e}")
        raise


if __name__ == '__main__':
    main()

98
src/srtm_downloader.py Normal file
View File

@@ -0,0 +1,98 @@
import requests
import math
import logging
from pathlib import Path
from urllib.parse import urljoin
import time
logger = logging.getLogger(__name__)
class SRTMDownloader:
    """Download SRTM 1-arc-second (SRTMGL1) elevation tiles for a bounding box."""

    def __init__(self, output_dir, temp_dir=None):
        """Prepare output/scratch directories for downloads.

        Args:
            output_dir: Directory where completed tile archives are stored.
            temp_dir: Scratch directory for in-progress downloads
                (defaults to <output_dir>/temp).
        """
        self.output_dir = Path(output_dir)
        self.temp_dir = Path(temp_dir) if temp_dir else self.output_dir / 'temp'
        # Mirror of the SRTMGL1 archive used as the primary download source.
        self.base_url = "https://cloud.sdstate.edu/index.php/s/UjQFkr4y8EGB3JH/download"
        # Create directories up front so downloads never fail on a missing path.
        self.output_dir.mkdir(parents=True, exist_ok=True)
        self.temp_dir.mkdir(parents=True, exist_ok=True)

    def _get_srtm_tiles(self, bounds):
        """Return the SRTM tile names covering the given lon/lat bounds.

        SRTM tiles are 1x1 degree and named after their south-west corner
        (e.g. N34W081). The stop is floor(max) + 1 rather than ceil(max) so
        a boundary that sits exactly on a tile edge — or degenerates to a
        point — still selects the covering tile instead of none.

        Args:
            bounds: dict with min_lon/min_lat/max_lon/max_lat keys.

        Returns:
            list[str]: tile names, longitude-major order.
        """
        min_lon, min_lat = bounds['min_lon'], bounds['min_lat']
        max_lon, max_lat = bounds['max_lon'], bounds['max_lat']
        tiles = []
        for lon in range(math.floor(min_lon), math.floor(max_lon) + 1):
            for lat in range(math.floor(min_lat), math.floor(max_lat) + 1):
                # Naming convention: N/S + 2-digit lat, E/W + 3-digit lon.
                lat_str = f"{'N' if lat >= 0 else 'S'}{abs(lat):02d}"
                lon_str = f"{'E' if lon >= 0 else 'W'}{abs(lon):03d}"
                tiles.append(f"{lat_str}{lon_str}")
        return tiles

    def _download_tile(self, tile_name):
        """Download a single SRTM tile archive into the output directory.

        Tries each configured source in order; streams to the temp dir and
        renames into place only on success, so an interrupted download never
        leaves a truncated archive in the output directory.

        Returns:
            bool: True if the tile is now available locally, False otherwise.
        """
        filename = f"{tile_name}.SRTMGL1.hgt.zip"
        output_path = self.output_dir / filename
        if output_path.exists():
            logger.info(f"Tile {tile_name} already exists, skipping")
            return True
        # Multiple SRTM data sources, tried in order. The filename must be
        # interpolated into each URL (previously a placeholder was left here,
        # so every request fetched a nonexistent resource).
        urls = [
            f"{self.base_url}?path=%2F&files={filename}",
            f"https://e4ftl01.cr.usgs.gov/MEASURES/SRTMGL1.003/2000.02.11/{filename}",
        ]
        temp_path = self.temp_dir / filename
        for url in urls:
            try:
                logger.info(f"Downloading {tile_name} from {url}")
                response = requests.get(url, timeout=300, stream=True)
                response.raise_for_status()
                with open(temp_path, 'wb') as f:
                    for chunk in response.iter_content(chunk_size=8192):
                        f.write(chunk)
                # Atomic (same filesystem) move into the output directory.
                temp_path.replace(output_path)
                logger.info(f"Successfully downloaded {tile_name}")
                return True
            except (requests.RequestException, OSError) as e:
                logger.warning(f"Failed to download {tile_name} from {url}: {e}")
                # Drop any partial file before trying the next source.
                temp_path.unlink(missing_ok=True)
                continue
        logger.error(f"Failed to download tile {tile_name} from all sources")
        return False

    def download_for_bounds(self, bounds):
        """Download every SRTM tile needed to cover the given bounds.

        Args:
            bounds: dict with min_lon/min_lat/max_lon/max_lat keys.

        Returns:
            int: number of tiles successfully obtained.

        Raises:
            RuntimeError: if any tile could not be downloaded.
        """
        tiles = self._get_srtm_tiles(bounds)
        logger.info(f"Need to download {len(tiles)} SRTM tiles")
        logger.info(f"Tiles: {tiles}")
        successful = 0
        failed = 0
        for tile in tiles:
            if self._download_tile(tile):
                successful += 1
            else:
                failed += 1
            # Throttle so we do not hammer the data servers.
            time.sleep(1)
        logger.info(f"Download complete: {successful} successful, {failed} failed")
        if failed > 0:
            raise RuntimeError(f"Failed to download {failed} tiles")
        return successful