Add complete GIS ETL project skeleton for SRTM data processing
- Python modules for geo boundary processing and SRTM downloads - Docker containerization with GDAL support - Development environment with devcontainer - Orchestration scripts for local and containerized execution - Support for KMZ/GeoJSON boundary files and NASA SRTM data 🤖 Generated with [Claude Code](https://claude.ai/code) Co-Authored-By: Claude <noreply@anthropic.com>
This commit is contained in:
22
.devcontainer/devcontainer.json
Normal file
22
.devcontainer/devcontainer.json
Normal file
@@ -0,0 +1,22 @@
|
|||||||
|
{
|
||||||
|
"name": "GIS ETL Development",
|
||||||
|
"image": "mcr.microsoft.com/devcontainers/python:3.11-bullseye",
|
||||||
|
"features": {
|
||||||
|
"ghcr.io/devcontainers/features/git:1": {},
|
||||||
|
"ghcr.io/devcontainers/features/github-cli:1": {}
|
||||||
|
},
|
||||||
|
"customizations": {
|
||||||
|
"vscode": {
|
||||||
|
"extensions": [
|
||||||
|
"ms-python.python",
|
||||||
|
"ms-python.flake8",
|
||||||
|
"ms-python.black-formatter",
|
||||||
|
"redhat.vscode-yaml"
|
||||||
|
]
|
||||||
|
}
|
||||||
|
},
|
||||||
|
"postCreateCommand": "pip install -r requirements.txt",
|
||||||
|
"remoteUser": "vscode",
|
||||||
|
"workspaceMount": "source=${localWorkspaceFolder},target=/workspace,type=bind",
|
||||||
|
"workspaceFolder": "/workspace"
|
||||||
|
}
|
24
Dockerfile
Normal file
24
Dockerfile
Normal file
@@ -0,0 +1,24 @@
|
|||||||
|
# Container image for the SRTM ETL: Python 3.11 plus the system GDAL stack.
FROM python:3.11-slim

WORKDIR /app

# gdal-bin/libgdal-dev provide the native GDAL library and gdal-config.
# build-essential is required because the GDAL Python bindings are published
# as source only and are compiled by pip against libgdal-dev.
RUN apt-get update && apt-get install -y \
    build-essential \
    curl \
    wget \
    gdal-bin \
    libgdal-dev \
    python3-gdal \
    && rm -rf /var/lib/apt/lists/*

COPY requirements.txt .

# Install the bindings that match the system libgdal first. The open-ended
# "GDAL>=..." requirement would otherwise select the newest release, which is
# not guaranteed to build against the distribution's library version; once the
# matching version is present the requirement is already satisfied.
RUN pip install --no-cache-dir "GDAL==$(gdal-config --version)" \
    && pip install --no-cache-dir -r requirements.txt

COPY src/ ./src/
COPY scripts/ ./scripts/
COPY config/ ./config/

RUN chmod +x scripts/*.sh

# Lets src/main.py import its sibling modules regardless of the working dir.
ENV PYTHONPATH=/app/src

# NOTE: main.py requires --boundary-file; callers are expected to override the
# command (scripts/docker_run.sh does) and pass the ETL arguments.
CMD ["python", "src/main.py"]
|
10
README.md
10
README.md
@@ -1,3 +1,11 @@
|
|||||||
# KNEL-SwEng-Platform-ETL-GIS-Inbox
|
# KNEL-SwEng-Platform-ETL-GIS-Inbox
|
||||||
|
|
||||||
Code to ingest raw GIS data
|
The purpose of this repository is to contain:
|
||||||
|
|
||||||
|
devcontainer.json
|
||||||
|
Dockerfile
|
||||||
|
Bash/python scripts
|
||||||
|
|
||||||
|
These are meant to run in a Docker container, orchestrated by Jenkins, Apache Airflow, or similar tooling, and will download data from NASA/USGS for use in planning coverage for wireless internet service providers.
|
||||||
|
|
||||||
|
Specifically it will need to be given a geo boundary in some format (kmz/geojson etc) and pull SRTM files from NASA.
|
28
config/logging.conf
Normal file
28
config/logging.conf
Normal file
@@ -0,0 +1,28 @@
|
|||||||
|
[loggers]
|
||||||
|
keys=root
|
||||||
|
|
||||||
|
[handlers]
|
||||||
|
keys=consoleHandler,fileHandler
|
||||||
|
|
||||||
|
[formatters]
|
||||||
|
keys=simpleFormatter
|
||||||
|
|
||||||
|
[logger_root]
|
||||||
|
level=INFO
|
||||||
|
handlers=consoleHandler,fileHandler
|
||||||
|
|
||||||
|
[handler_consoleHandler]
|
||||||
|
class=StreamHandler
|
||||||
|
level=INFO
|
||||||
|
formatter=simpleFormatter
|
||||||
|
args=(sys.stdout,)
|
||||||
|
|
||||||
|
[handler_fileHandler]
|
||||||
|
class=FileHandler
|
||||||
|
level=DEBUG
|
||||||
|
formatter=simpleFormatter
|
||||||
|
args=('etl.log',)
|
||||||
|
|
||||||
|
[formatter_simpleFormatter]
|
||||||
|
format=%(asctime)s - %(name)s - %(levelname)s - %(message)s
|
||||||
|
datefmt=%Y-%m-%d %H:%M:%S
|
10
requirements.txt
Normal file
10
requirements.txt
Normal file
@@ -0,0 +1,10 @@
|
|||||||
|
requests>=2.31.0
|
||||||
|
geopandas>=0.14.0
|
||||||
|
shapely>=2.0.0
|
||||||
|
fiona>=1.9.0
|
||||||
|
rasterio>=1.3.0
|
||||||
|
pyproj>=3.6.0
|
||||||
|
numpy>=1.24.0
|
||||||
|
pandas>=2.0.0
|
||||||
|
GDAL>=3.6.0
|
||||||
|
click>=8.1.0
|
60
scripts/docker_run.sh
Executable file
60
scripts/docker_run.sh
Executable file
@@ -0,0 +1,60 @@
|
|||||||
|
#!/bin/bash
#
# Build the GIS ETL Docker image and run the SRTM download inside a container.
# The boundary file is mounted read-only; the output and temp directories are
# bind-mounted so downloaded tiles persist on the host.

set -e

IMAGE_NAME="gis-etl-inbox"
TAG="latest"
BOUNDARY_FILE=""
OUTPUT_DIR="$(pwd)/data/output"
DATA_DIR="$(pwd)/data"

# The Dockerfile lives in the repository root, one level above this script, so
# the image can be built no matter where the script is invoked from.
SCRIPT_DIR="$(cd "$(dirname "${BASH_SOURCE[0]}")" && pwd)"
REPO_ROOT="$(dirname "$SCRIPT_DIR")"

# Print usage and exit with the given status (default: 1).
usage() {
    echo "Usage: $0 -b BOUNDARY_FILE [-o OUTPUT_DIR] [-d DATA_DIR] [-t TAG]"
    echo "  -b BOUNDARY_FILE  Path to boundary file (KMZ/GeoJSON)"
    echo "  -o OUTPUT_DIR     Host output directory (default: ./data/output)"
    echo "  -d DATA_DIR       Host data directory (default: ./data)"
    echo "  -t TAG            Docker image tag (default: latest)"
    echo "  -h                Show this help"
    exit "${1:-1}"
}

# Print the absolute path of an existing directory.
abs_dir() {
    (cd "$1" && pwd)
}

# Leading ':' selects silent getopts mode so both error cases are reported
# here, with OPTARG holding the offending option letter.
while getopts ":b:o:d:t:h" opt; do
    case $opt in
        b) BOUNDARY_FILE="$OPTARG" ;;
        o) OUTPUT_DIR="$OPTARG" ;;
        d) DATA_DIR="$OPTARG" ;;
        t) TAG="$OPTARG" ;;
        h) usage 0 ;;
        :) echo "Option -$OPTARG requires an argument" >&2; usage ;;
        \?) echo "Invalid option -$OPTARG" >&2; usage ;;
    esac
done

if [ -z "$BOUNDARY_FILE" ]; then
    echo "Error: Boundary file is required" >&2
    usage
fi

if [ ! -f "$BOUNDARY_FILE" ]; then
    echo "Error: Boundary file '$BOUNDARY_FILE' not found" >&2
    exit 1
fi

# Create every directory that gets bind-mounted; otherwise the Docker daemon
# creates the missing ones itself, owned by root.
mkdir -p "$OUTPUT_DIR"
mkdir -p "$DATA_DIR/temp"

# "docker run -v" only treats absolute paths as bind mounts (a relative value
# is taken as a named volume), so resolve everything the user supplied.
OUTPUT_DIR="$(abs_dir "$OUTPUT_DIR")"
TEMP_DIR="$(abs_dir "$DATA_DIR/temp")"
BOUNDARY_FILENAME=$(basename "$BOUNDARY_FILE")
BOUNDARY_FILE="$(abs_dir "$(dirname "$BOUNDARY_FILE")")/$BOUNDARY_FILENAME"

echo "Building Docker image..."
docker build -t "${IMAGE_NAME}:${TAG}" "$REPO_ROOT"

echo "Running ETL in Docker container..."
docker run --rm \
    -v "$BOUNDARY_FILE:/app/data/input/$BOUNDARY_FILENAME:ro" \
    -v "$OUTPUT_DIR:/app/data/output" \
    -v "$TEMP_DIR:/app/data/temp" \
    "${IMAGE_NAME}:${TAG}" \
    python src/main.py \
    --boundary-file "/app/data/input/$BOUNDARY_FILENAME" \
    --output-dir "/app/data/output" \
    --temp-dir "/app/data/temp"

echo "Docker ETL process completed successfully"
|
58
scripts/run_etl.sh
Executable file
58
scripts/run_etl.sh
Executable file
@@ -0,0 +1,58 @@
|
|||||||
|
#!/bin/bash
#
# Run the SRTM ETL directly on the host (no container): validate the
# arguments, then invoke src/main.py with the chosen directories.

set -e

BOUNDARY_FILE=""
OUTPUT_DIR="./data/output"
TEMP_DIR="./data/temp"
VERBOSE=false

# Locate the repository relative to this script so it can be started from any
# working directory; data paths given by the user stay relative to the caller.
SCRIPT_DIR="$(cd "$(dirname "${BASH_SOURCE[0]}")" && pwd)"
REPO_ROOT="$(dirname "$SCRIPT_DIR")"

# Print usage and exit with the given status (default: 1).
usage() {
    echo "Usage: $0 -b BOUNDARY_FILE [-o OUTPUT_DIR] [-t TEMP_DIR] [-v]"
    echo "  -b BOUNDARY_FILE  Path to boundary file (KMZ/GeoJSON)"
    echo "  -o OUTPUT_DIR     Output directory (default: ./data/output)"
    echo "  -t TEMP_DIR       Temporary directory (default: ./data/temp)"
    echo "  -v                Verbose output"
    echo "  -h                Show this help"
    exit "${1:-1}"
}

# Leading ':' selects silent getopts mode so both error cases are reported
# here, with OPTARG holding the offending option letter.
while getopts ":b:o:t:vh" opt; do
    case $opt in
        b) BOUNDARY_FILE="$OPTARG" ;;
        o) OUTPUT_DIR="$OPTARG" ;;
        t) TEMP_DIR="$OPTARG" ;;
        v) VERBOSE=true ;;
        h) usage 0 ;;
        :) echo "Option -$OPTARG requires an argument" >&2; usage ;;
        \?) echo "Invalid option -$OPTARG" >&2; usage ;;
    esac
done

if [ -z "$BOUNDARY_FILE" ]; then
    echo "Error: Boundary file is required" >&2
    usage
fi

if [ ! -f "$BOUNDARY_FILE" ]; then
    echo "Error: Boundary file '$BOUNDARY_FILE' not found" >&2
    exit 1
fi

if [ "$VERBOSE" = true ]; then
    LOG_LEVEL="DEBUG"
else
    LOG_LEVEL="INFO"
fi
# Exported so the level reaches the Python process through its environment.
# NOTE(review): -v only takes effect if src/main.py honours LOG_LEVEL.
export LOG_LEVEL

echo "Starting SRTM ETL process..."
echo "Boundary file: $BOUNDARY_FILE"
echo "Output directory: $OUTPUT_DIR"
echo "Temp directory: $TEMP_DIR"
echo "Log level: $LOG_LEVEL"

# ${VAR:+...} avoids a leading ':' (an empty search-path entry) when
# PYTHONPATH was previously unset.
export PYTHONPATH="${PYTHONPATH:+${PYTHONPATH}:}${REPO_ROOT}/src"

python3 "$REPO_ROOT/src/main.py" \
    --boundary-file "$BOUNDARY_FILE" \
    --output-dir "$OUTPUT_DIR" \
    --temp-dir "$TEMP_DIR"

echo "ETL process completed successfully"
|
70
src/geo_processor.py
Normal file
70
src/geo_processor.py
Normal file
@@ -0,0 +1,70 @@
|
|||||||
|
import geopandas as gpd
|
||||||
|
import zipfile
|
||||||
|
import tempfile
|
||||||
|
from pathlib import Path
|
||||||
|
import logging
|
||||||
|
|
||||||
|
logger = logging.getLogger(__name__)


class GeoProcessor:
    """Load a geographic boundary file and expose its extent in WGS84.

    Supported inputs are KMZ, KML, GeoJSON, Shapefile and zipped Shapefile.
    After construction ``self.gdf`` holds the features as a GeoDataFrame in
    EPSG:4326.
    """

    def __init__(self, boundary_file):
        """Read ``boundary_file`` immediately.

        Args:
            boundary_file: Path (str or ``Path``) to the boundary file.

        Raises:
            ValueError: If the file extension is not supported or a KMZ
                archive contains no KML document.
            Exception: Any error raised by the underlying reader is logged
                and re-raised unchanged.
        """
        self.boundary_file = Path(boundary_file)
        self.gdf = None
        self._load_boundary()

    def _load_boundary(self):
        """Load the boundary file into ``self.gdf`` and normalise it to WGS84."""
        try:
            suffix = self.boundary_file.suffix.lower()
            if suffix == '.kmz':
                self._load_kmz()
            elif suffix == '.kml':
                self.gdf = self._read_kml(self.boundary_file)
            elif suffix in ('.geojson', '.shp', '.zip'):
                self.gdf = gpd.read_file(self.boundary_file)
            else:
                raise ValueError(f"Unsupported file format: {self.boundary_file.suffix}")

            # Ensure CRS is WGS84. to_crs() refuses data without a CRS, so a
            # missing one is declared rather than transformed.
            if self.gdf.crs is None:
                logger.warning(
                    f"{self.boundary_file} has no CRS information, assuming EPSG:4326"
                )
                self.gdf = self.gdf.set_crs('EPSG:4326')
            elif self.gdf.crs != 'EPSG:4326':
                self.gdf = self.gdf.to_crs('EPSG:4326')

            logger.info(f"Loaded {len(self.gdf)} features from {self.boundary_file}")

        except Exception as e:
            logger.error(f"Failed to load boundary file: {e}")
            raise

    @staticmethod
    def _read_kml(kml_path):
        """Read a KML document with Fiona's KML driver and return a GeoDataFrame."""
        # geopandas imports Fiona lazily, so ``gpd.io.file.fiona`` can still be
        # None at this point; import the module directly to enable the driver.
        import fiona

        fiona.drvsupport.supported_drivers['KML'] = 'rw'
        # The driver was enabled on Fiona, so read with that engine explicitly.
        return gpd.read_file(kml_path, driver='KML', engine='fiona')

    def _load_kmz(self):
        """Load KMZ file by extracting and reading KML.

        Raises:
            ValueError: If the archive contains no ``.kml`` member.
        """
        with tempfile.TemporaryDirectory() as temp_dir:
            with zipfile.ZipFile(self.boundary_file, 'r') as zip_ref:
                zip_ref.extractall(temp_dir)

            # Find KML files anywhere in the archive, case-insensitively, and
            # prefer the shallowest one (the main document normally sits at
            # the archive root).
            kml_files = sorted(
                (p for p in Path(temp_dir).rglob('*')
                 if p.is_file() and p.suffix.lower() == '.kml'),
                key=lambda p: (len(p.parts), p.name),
            )
            if not kml_files:
                raise ValueError("No KML file found in KMZ archive")

            # Must be read before the temporary directory is removed.
            self.gdf = self._read_kml(kml_files[0])

    def get_bounds(self):
        """Get bounding box of all features.

        Returns:
            dict with float values under the keys ``min_lon``, ``min_lat``,
            ``max_lon`` and ``max_lat``.

        Raises:
            ValueError: If no boundary is loaded or it contains no features
                (whose bounds would otherwise be NaN).
        """
        if self.gdf is None or self.gdf.empty:
            raise ValueError("No boundary data loaded")

        min_lon, min_lat, max_lon, max_lat = (float(v) for v in self.gdf.total_bounds)
        return {
            'min_lon': min_lon,
            'min_lat': min_lat,
            'max_lon': max_lon,
            'max_lat': max_lat,
        }

    def get_geometry(self):
        """Get the combined geometry of all features.

        Raises:
            ValueError: If no boundary is loaded.
        """
        if self.gdf is None:
            raise ValueError("No boundary data loaded")

        geometry = self.gdf.geometry
        # Newer geopandas deprecates ``unary_union`` in favour of
        # ``union_all()``; use it when available, fall back otherwise.
        if hasattr(geometry, 'union_all'):
            return geometry.union_all()
        return geometry.unary_union
|
40
src/main.py
Normal file
40
src/main.py
Normal file
@@ -0,0 +1,40 @@
|
|||||||
|
#!/usr/bin/env python3
|
||||||
|
|
||||||
|
import click
|
||||||
|
import logging
|
||||||
|
from pathlib import Path
|
||||||
|
from geo_processor import GeoProcessor
|
||||||
|
from srtm_downloader import SRTMDownloader
|
||||||
|
|
||||||
|
logging.basicConfig(level=logging.INFO, format='%(asctime)s - %(levelname)s - %(message)s')
logger = logging.getLogger(__name__)


@click.command()
@click.option('--boundary-file', required=True, help='Path to boundary file (KMZ/GeoJSON)')
@click.option('--output-dir', default='./data/output', help='Output directory for SRTM files')
@click.option('--temp-dir', default='./data/temp', help='Temporary directory for processing')
@click.option('--log-level', default='INFO', envvar='LOG_LEVEL', show_default=True,
              help='Logging level name, e.g. DEBUG or INFO (also read from LOG_LEVEL)')
def main(boundary_file, output_dir, temp_dir, log_level):
    """Download SRTM data for given geographic boundary."""
    # getLevelName() maps a known level name to its number and returns a
    # string for anything else, so an unexpected LOG_LEVEL value in the
    # environment degrades to a warning instead of aborting the run.
    requested_level = logging.getLevelName(str(log_level).upper())
    if isinstance(requested_level, int):
        logging.getLogger().setLevel(requested_level)
    else:
        logger.warning(f"Unknown log level '{log_level}', keeping INFO")

    try:
        logger.info(f"Processing boundary file: {boundary_file}")

        # Create directories
        Path(output_dir).mkdir(parents=True, exist_ok=True)
        Path(temp_dir).mkdir(parents=True, exist_ok=True)

        # Process geographic boundary
        geo_processor = GeoProcessor(boundary_file)
        bounds = geo_processor.get_bounds()

        # Download SRTM data
        downloader = SRTMDownloader(output_dir, temp_dir)
        downloader.download_for_bounds(bounds)

        logger.info("Processing completed successfully")

    except Exception as e:
        # Log for the orchestrator's log stream, then re-raise so the process
        # still exits with a failure status.
        logger.error(f"Error during processing: {e}")
        raise


if __name__ == '__main__':
    main()
|
98
src/srtm_downloader.py
Normal file
98
src/srtm_downloader.py
Normal file
@@ -0,0 +1,98 @@
|
|||||||
|
import requests
|
||||||
|
import math
|
||||||
|
import logging
|
||||||
|
from pathlib import Path
|
||||||
|
from urllib.parse import urljoin
|
||||||
|
import time
|
||||||
|
|
||||||
|
logger = logging.getLogger(__name__)


class SRTMDownloader:
    """Download the 1x1 degree SRTM tiles that cover a bounding box.

    Tiles are stored in ``output_dir`` as ``<tile>.SRTMGL1.hgt.zip``. A tile
    that is already present there is not downloaded again.
    """

    def __init__(self, output_dir, temp_dir=None):
        """Create the downloader and make sure its directories exist.

        Args:
            output_dir: Directory that receives the downloaded tile archives.
            temp_dir: Scratch directory; defaults to ``<output_dir>/temp``.
        """
        self.output_dir = Path(output_dir)
        self.temp_dir = Path(temp_dir) if temp_dir else self.output_dir / 'temp'
        self.base_url = "https://cloud.sdstate.edu/index.php/s/UjQFkr4y8EGB3JH/download"

        # Create directories
        self.output_dir.mkdir(parents=True, exist_ok=True)
        self.temp_dir.mkdir(parents=True, exist_ok=True)

    @staticmethod
    def _tile_range(low, high):
        """Return the integer tile origins covering the interval [low, high]."""
        first = math.floor(low)
        # ceil(high) equals ``first`` when low == high is a whole number (for
        # example a single point on a tile corner); always keep one tile.
        return range(first, max(math.ceil(high), first + 1))

    def _tile_path(self, tile_name):
        """Return the path under ``output_dir`` where a tile archive is stored."""
        return self.output_dir / f"{tile_name}.SRTMGL1.hgt.zip"

    def _get_srtm_tiles(self, bounds):
        """Calculate which SRTM tiles are needed for given bounds.

        Args:
            bounds: Mapping with ``min_lon``, ``min_lat``, ``max_lon`` and
                ``max_lat`` in decimal degrees.

        Returns:
            List of tile names such as ``N39W106``, ordered by longitude and
            then latitude. Each name refers to the tile's south-west corner.

        Raises:
            ValueError: If any bound is NaN.
        """
        min_lon, min_lat = bounds['min_lon'], bounds['min_lat']
        max_lon, max_lat = bounds['max_lon'], bounds['max_lat']

        if any(math.isnan(value) for value in (min_lon, min_lat, max_lon, max_lat)):
            raise ValueError(f"Bounds contain NaN values: {bounds}")

        # SRTM tiles are 1 degree x 1 degree
        tiles = []
        for lon in self._tile_range(min_lon, max_lon):
            for lat in self._tile_range(min_lat, max_lat):
                # SRTM naming convention: N/S followed by 2-digit lat,
                # E/W followed by 3-digit lon
                lat_str = f"{'N' if lat >= 0 else 'S'}{abs(lat):02d}"
                lon_str = f"{'E' if lon >= 0 else 'W'}{abs(lon):03d}"
                tiles.append(f"{lat_str}{lon_str}")

        return tiles

    def _download_tile(self, tile_name):
        """Download a single SRTM tile.

        Each source is tried in order. The data is streamed to a ``.part``
        file next to the final location and only renamed into place once the
        transfer finished and the content is a ZIP archive, so an interrupted
        or bogus download is never mistaken for a finished tile later on.

        Returns:
            True if the tile is available in ``output_dir`` afterwards,
            False if every source failed.
        """
        import zipfile  # local import: only needed to validate downloads

        output_path = self._tile_path(tile_name)
        filename = output_path.name

        if output_path.exists():
            logger.info(f"Tile {tile_name} already exists, skipping")
            return True

        # Try multiple SRTM data sources
        urls = [
            f"{self.base_url}?path=%2F&files={filename}",
            f"https://e4ftl01.cr.usgs.gov/MEASURES/SRTMGL1.003/2000.02.11/{filename}",
        ]

        # Kept in output_dir (not temp_dir) so the final rename never has to
        # cross a filesystem boundary.
        partial_path = output_path.with_name(filename + '.part')

        for url in urls:
            try:
                logger.info(f"Downloading {tile_name} from {url}")

                # The context manager releases the streamed connection even
                # when the transfer fails part-way.
                with requests.get(url, timeout=300, stream=True) as response:
                    response.raise_for_status()
                    with open(partial_path, 'wb') as f:
                        for chunk in response.iter_content(chunk_size=8192):
                            f.write(chunk)

                # A server may answer 200 with an HTML page (for example a
                # login form) instead of the archive; reject such content.
                if not zipfile.is_zipfile(partial_path):
                    logger.warning(
                        f"Failed to download {tile_name} from {url}: response is not a ZIP archive"
                    )
                    continue

                partial_path.replace(output_path)
                logger.info(f"Successfully downloaded {tile_name}")
                return True

            except requests.RequestException as e:
                logger.warning(f"Failed to download {tile_name} from {url}: {e}")
            finally:
                # No-op after a successful rename; removes leftovers otherwise.
                partial_path.unlink(missing_ok=True)

        logger.error(f"Failed to download tile {tile_name} from all sources")
        return False

    def download_for_bounds(self, bounds):
        """Download all SRTM tiles needed for given bounds.

        Args:
            bounds: Mapping with ``min_lon``, ``min_lat``, ``max_lon`` and
                ``max_lat`` in decimal degrees.

        Returns:
            Number of tiles available after the run (downloaded or cached).

        Raises:
            RuntimeError: If at least one tile could not be downloaded.
        """
        tiles = self._get_srtm_tiles(bounds)

        logger.info(f"Need to download {len(tiles)} SRTM tiles")
        logger.info(f"Tiles: {tiles}")

        successful = 0
        failed = 0
        last_index = len(tiles) - 1

        for index, tile in enumerate(tiles):
            was_cached = self._tile_path(tile).exists()

            if self._download_tile(tile):
                successful += 1
            else:
                failed += 1

            # Be nice to the server: pause only after a real request, and
            # only when another tile follows.
            if not was_cached and index < last_index:
                time.sleep(1)

        logger.info(f"Download complete: {successful} successful, {failed} failed")

        if failed > 0:
            raise RuntimeError(f"Failed to download {failed} tiles")

        return successful
|
Reference in New Issue
Block a user