Compare commits
1 Commits
v20250721-
...
main
Author | SHA1 | Date | |
---|---|---|---|
2192c2ae9c |
22
.devcontainer/devcontainer.json
Normal file
22
.devcontainer/devcontainer.json
Normal file
@@ -0,0 +1,22 @@
|
||||
{
|
||||
"name": "GIS ETL Development",
|
||||
"image": "mcr.microsoft.com/devcontainers/python:3.11-bullseye",
|
||||
"features": {
|
||||
"ghcr.io/devcontainers/features/git:1": {},
|
||||
"ghcr.io/devcontainers/features/github-cli:1": {}
|
||||
},
|
||||
"customizations": {
|
||||
"vscode": {
|
||||
"extensions": [
|
||||
"ms-python.python",
|
||||
"ms-python.flake8",
|
||||
"ms-python.black-formatter",
|
||||
"redhat.vscode-yaml"
|
||||
]
|
||||
}
|
||||
},
|
||||
"postCreateCommand": "pip install -r requirements.txt",
|
||||
"remoteUser": "vscode",
|
||||
"workspaceMount": "source=${localWorkspaceFolder},target=/workspace,type=bind",
|
||||
"workspaceFolder": "/workspace"
|
||||
}
|
24
Dockerfile
Normal file
24
Dockerfile
Normal file
@@ -0,0 +1,24 @@
|
||||
FROM python:3.11-slim
|
||||
|
||||
WORKDIR /app
|
||||
|
||||
RUN apt-get update && apt-get install -y \
|
||||
curl \
|
||||
wget \
|
||||
gdal-bin \
|
||||
libgdal-dev \
|
||||
python3-gdal \
|
||||
&& rm -rf /var/lib/apt/lists/*
|
||||
|
||||
COPY requirements.txt .
|
||||
RUN pip install --no-cache-dir -r requirements.txt
|
||||
|
||||
COPY src/ ./src/
|
||||
COPY scripts/ ./scripts/
|
||||
COPY config/ ./config/
|
||||
|
||||
RUN chmod +x scripts/*.sh
|
||||
|
||||
ENV PYTHONPATH=/app/src
|
||||
|
||||
CMD ["python", "src/main.py"]
|
10
README.md
10
README.md
@@ -1,3 +1,11 @@
|
||||
# KNEL-SwEng-Platform-ETL-GIS-Inbox
|
||||
|
||||
Code to ingest raw GIS data
|
||||
The purpose of this repository is to contain:
|
||||
|
||||
devcontainer.json
|
||||
Dockerfile
|
||||
Bash/python scripts
|
||||
|
||||
meant to run in a Docker container, orchestrated by Jenkins, Apache Airflow, or similar tooling, which will download data from NASA/USGS for use in planning coverage for wireless internet service providers.
|
||||
|
||||
Specifically, it will need to be given a geographic boundary in some format (KMZ/GeoJSON, etc.) and pull SRTM files from NASA.
|
28
config/logging.conf
Normal file
28
config/logging.conf
Normal file
@@ -0,0 +1,28 @@
|
||||
[loggers]
|
||||
keys=root
|
||||
|
||||
[handlers]
|
||||
keys=consoleHandler,fileHandler
|
||||
|
||||
[formatters]
|
||||
keys=simpleFormatter
|
||||
|
||||
[logger_root]
|
||||
level=INFO
|
||||
handlers=consoleHandler,fileHandler
|
||||
|
||||
[handler_consoleHandler]
|
||||
class=StreamHandler
|
||||
level=INFO
|
||||
formatter=simpleFormatter
|
||||
args=(sys.stdout,)
|
||||
|
||||
[handler_fileHandler]
|
||||
class=FileHandler
|
||||
level=DEBUG
|
||||
formatter=simpleFormatter
|
||||
args=('etl.log',)
|
||||
|
||||
[formatter_simpleFormatter]
|
||||
format=%(asctime)s - %(name)s - %(levelname)s - %(message)s
|
||||
datefmt=%Y-%m-%d %H:%M:%S
|
10
requirements.txt
Normal file
10
requirements.txt
Normal file
@@ -0,0 +1,10 @@
|
||||
requests>=2.31.0
|
||||
geopandas>=0.14.0
|
||||
shapely>=2.0.0
|
||||
fiona>=1.9.0
|
||||
rasterio>=1.3.0
|
||||
pyproj>=3.6.0
|
||||
numpy>=1.24.0
|
||||
pandas>=2.0.0
|
||||
GDAL>=3.6.0
|
||||
click>=8.1.0
|
60
scripts/docker_run.sh
Executable file
60
scripts/docker_run.sh
Executable file
@@ -0,0 +1,60 @@
|
||||
#!/bin/bash
# Build the gis-etl-inbox image and run the SRTM ETL inside a container,
# bind-mounting the boundary file (read-only) plus the output/temp directories.

set -e

IMAGE_NAME="gis-etl-inbox"
TAG="latest"
BOUNDARY_FILE=""
OUTPUT_DIR="$(pwd)/data/output"
DATA_DIR="$(pwd)/data"

usage() {
    echo "Usage: $0 -b BOUNDARY_FILE [-o OUTPUT_DIR] [-d DATA_DIR] [-t TAG]"
    echo "  -b BOUNDARY_FILE  Path to boundary file (KMZ/GeoJSON)"
    echo "  -o OUTPUT_DIR     Host output directory (default: ./data/output)"
    echo "  -d DATA_DIR       Host data directory (default: ./data)"
    echo "  -t TAG            Docker image tag (default: latest)"
    exit 1
}

while getopts "b:o:d:t:h" opt; do
    case $opt in
        b) BOUNDARY_FILE="$OPTARG" ;;
        o) OUTPUT_DIR="$OPTARG" ;;
        d) DATA_DIR="$OPTARG" ;;
        t) TAG="$OPTARG" ;;
        h) usage ;;
        \?) echo "Invalid option -$OPTARG" >&2; usage ;;
    esac
done

if [ -z "$BOUNDARY_FILE" ]; then
    echo "Error: Boundary file is required"
    usage
fi

if [ ! -f "$BOUNDARY_FILE" ]; then
    echo "Error: Boundary file '$BOUNDARY_FILE' not found"
    exit 1
fi

# Create the temp subdirectory too: it is bind-mounted below, and letting
# Docker create a missing mount source leaves it owned by root on the host.
mkdir -p "$OUTPUT_DIR"
mkdir -p "$DATA_DIR/temp"

# Docker bind mounts require absolute source paths; resolve relative inputs.
BOUNDARY_FILE="$(cd "$(dirname "$BOUNDARY_FILE")" && pwd)/$(basename "$BOUNDARY_FILE")"
OUTPUT_DIR="$(cd "$OUTPUT_DIR" && pwd)"
DATA_DIR="$(cd "$DATA_DIR" && pwd)"

BOUNDARY_FILENAME=$(basename "$BOUNDARY_FILE")

echo "Building Docker image..."
docker build -t "${IMAGE_NAME}:${TAG}" .

echo "Running ETL in Docker container..."
docker run --rm \
    -v "$BOUNDARY_FILE:/app/data/input/$BOUNDARY_FILENAME:ro" \
    -v "$OUTPUT_DIR:/app/data/output" \
    -v "$DATA_DIR/temp:/app/data/temp" \
    "${IMAGE_NAME}:${TAG}" \
    python src/main.py \
    --boundary-file "/app/data/input/$BOUNDARY_FILENAME" \
    --output-dir "/app/data/output" \
    --temp-dir "/app/data/temp"

echo "Docker ETL process completed successfully"
|
58
scripts/run_etl.sh
Executable file
58
scripts/run_etl.sh
Executable file
@@ -0,0 +1,58 @@
|
||||
#!/bin/bash
# Run the SRTM ETL directly on the host (see docker_run.sh for the container path).

set -e

BOUNDARY_FILE=""
OUTPUT_DIR="./data/output"
TEMP_DIR="./data/temp"
VERBOSE=false

usage() {
    echo "Usage: $0 -b BOUNDARY_FILE [-o OUTPUT_DIR] [-t TEMP_DIR] [-v]"
    echo "  -b BOUNDARY_FILE  Path to boundary file (KMZ/GeoJSON)"
    echo "  -o OUTPUT_DIR     Output directory (default: ./data/output)"
    echo "  -t TEMP_DIR       Temporary directory (default: ./data/temp)"
    echo "  -v                Verbose output"
    exit 1
}

while getopts "b:o:t:vh" opt; do
    case $opt in
        b) BOUNDARY_FILE="$OPTARG" ;;
        o) OUTPUT_DIR="$OPTARG" ;;
        t) TEMP_DIR="$OPTARG" ;;
        v) VERBOSE=true ;;
        h) usage ;;
        \?) echo "Invalid option -$OPTARG" >&2; usage ;;
    esac
done

if [ -z "$BOUNDARY_FILE" ]; then
    echo "Error: Boundary file is required"
    usage
fi

if [ ! -f "$BOUNDARY_FILE" ]; then
    echo "Error: Boundary file '$BOUNDARY_FILE' not found"
    exit 1
fi

# Make sure the working directories exist before the ETL writes into them.
mkdir -p "$OUTPUT_DIR" "$TEMP_DIR"

echo "Starting SRTM ETL process..."
echo "Boundary file: $BOUNDARY_FILE"
echo "Output directory: $OUTPUT_DIR"
echo "Temp directory: $TEMP_DIR"

if [ "$VERBOSE" = true ]; then
    LOG_LEVEL="DEBUG"
else
    LOG_LEVEL="INFO"
fi
# Export so the Python process can see it; previously LOG_LEVEL was computed
# but never used. TODO(review): have src/main.py read LOG_LEVEL from the env.
export LOG_LEVEL

export PYTHONPATH="${PYTHONPATH}:$(pwd)/src"

python3 src/main.py \
    --boundary-file "$BOUNDARY_FILE" \
    --output-dir "$OUTPUT_DIR" \
    --temp-dir "$TEMP_DIR"

echo "ETL process completed successfully"
|
70
src/geo_processor.py
Normal file
70
src/geo_processor.py
Normal file
@@ -0,0 +1,70 @@
|
||||
import geopandas as gpd
|
||||
import zipfile
|
||||
import tempfile
|
||||
from pathlib import Path
|
||||
import logging
|
||||
|
||||
logger = logging.getLogger(__name__)
|
||||
|
||||
class GeoProcessor:
    """Loads a vector boundary file and exposes its extent/geometry in WGS84.

    Supported inputs: KMZ (zipped KML), GeoJSON (.geojson/.json), and
    Shapefile (.shp, or zipped as .zip).
    """

    def __init__(self, boundary_file):
        """
        Args:
            boundary_file: Path to the boundary file to load.

        Raises:
            ValueError: If the format is unsupported or a KMZ holds no KML.
        """
        self.boundary_file = Path(boundary_file)
        # GeoDataFrame of the loaded features, normalized to EPSG:4326.
        self.gdf = None
        self._load_boundary()

    def _load_boundary(self):
        """Load boundary file (KMZ/GeoJSON/Shapefile) and normalize CRS to WGS84."""
        suffix = self.boundary_file.suffix.lower()
        try:
            if suffix == '.kmz':
                self._load_kmz()
            elif suffix in ('.geojson', '.json', '.shp', '.zip'):
                self.gdf = gpd.read_file(self.boundary_file)
            else:
                raise ValueError(f"Unsupported file format: {self.boundary_file.suffix}")

            # Ensure CRS is WGS84 (SRTM tiles are indexed by WGS84 lat/lon).
            if self.gdf.crs is None:
                # KML/KMZ is WGS84 by spec; for other formats this is an
                # assumption — undeclared CRSes are treated as already-WGS84.
                logger.warning("Boundary file has no CRS; assuming EPSG:4326")
                self.gdf = self.gdf.set_crs('EPSG:4326')
            elif self.gdf.crs != 'EPSG:4326':
                self.gdf = self.gdf.to_crs('EPSG:4326')

            logger.info(f"Loaded {len(self.gdf)} features from {self.boundary_file}")

        except Exception as e:
            logger.error(f"Failed to load boundary file: {e}")
            raise

    def _load_kmz(self):
        """Load KMZ file by extracting and reading KML."""
        with tempfile.TemporaryDirectory() as temp_dir:
            with zipfile.ZipFile(self.boundary_file, 'r') as zip_ref:
                zip_ref.extractall(temp_dir)

            # Search recursively: some KMZ archives nest the KML in a subfolder.
            kml_files = sorted(Path(temp_dir).rglob('*.kml'))
            if not kml_files:
                raise ValueError("No KML file found in KMZ archive")

            # Enable fiona's KML driver (off by default). geopandas >= 1.0
            # defaults to the pyogrio engine, where this attribute path no
            # longer exists but KML is readable without registration.
            try:
                gpd.io.file.fiona.drvsupport.supported_drivers['KML'] = 'rw'
            except AttributeError:
                pass
            self.gdf = gpd.read_file(kml_files[0], driver='KML')

    def get_bounds(self):
        """Get bounding box of all features.

        Returns:
            Dict with 'min_lon', 'min_lat', 'max_lon', 'max_lat' (WGS84 degrees).
        """
        if self.gdf is None:
            raise ValueError("No boundary data loaded")

        min_lon, min_lat, max_lon, max_lat = self.gdf.total_bounds
        return {
            'min_lon': min_lon,
            'min_lat': min_lat,
            'max_lon': max_lon,
            'max_lat': max_lat
        }

    def get_geometry(self):
        """Get the combined geometry (union) of all features."""
        if self.gdf is None:
            raise ValueError("No boundary data loaded")

        geoms = self.gdf.geometry
        # union_all() supersedes the deprecated unary_union (geopandas >= 1.0);
        # fall back for the geopandas 0.14 floor pinned in requirements.txt.
        return geoms.union_all() if hasattr(geoms, 'union_all') else geoms.unary_union
|
40
src/main.py
Normal file
40
src/main.py
Normal file
@@ -0,0 +1,40 @@
|
||||
#!/usr/bin/env python3
|
||||
|
||||
import click
|
||||
import logging
|
||||
from pathlib import Path
|
||||
from geo_processor import GeoProcessor
|
||||
from srtm_downloader import SRTMDownloader
|
||||
|
||||
logging.basicConfig(level=logging.INFO, format='%(asctime)s - %(levelname)s - %(message)s')
|
||||
logger = logging.getLogger(__name__)
|
||||
|
||||
@click.command()
@click.option('--boundary-file', required=True, help='Path to boundary file (KMZ/GeoJSON)')
@click.option('--output-dir', default='./data/output', help='Output directory for SRTM files')
@click.option('--temp-dir', default='./data/temp', help='Temporary directory for processing')
def main(boundary_file, output_dir, temp_dir):
    """Download SRTM data for given geographic boundary."""
    # NOTE(review): logging is configured via basicConfig at module import;
    # config/logging.conf exists in the repo but is not loaded here — confirm
    # which one is intended to be authoritative.
    try:
        logger.info(f"Processing boundary file: {boundary_file}")

        # Make sure both working directories exist before any stage runs.
        for directory in (output_dir, temp_dir):
            Path(directory).mkdir(parents=True, exist_ok=True)

        # Resolve the boundary to a WGS84 bounding box, then fetch every
        # SRTM tile that box touches.
        bounds = GeoProcessor(boundary_file).get_bounds()
        SRTMDownloader(output_dir, temp_dir).download_for_bounds(bounds)

        logger.info("Processing completed successfully")

    except Exception as e:
        logger.error(f"Error during processing: {e}")
        raise


if __name__ == '__main__':
    main()
|
98
src/srtm_downloader.py
Normal file
98
src/srtm_downloader.py
Normal file
@@ -0,0 +1,98 @@
|
||||
import requests
|
||||
import math
|
||||
import logging
|
||||
from pathlib import Path
|
||||
from urllib.parse import urljoin
|
||||
import time
|
||||
|
||||
logger = logging.getLogger(__name__)
|
||||
|
||||
class SRTMDownloader:
    """Downloads the 1x1-degree SRTM (SRTMGL1) tiles covering a bounding box."""

    def __init__(self, output_dir, temp_dir=None):
        """
        Args:
            output_dir: Directory where downloaded .hgt.zip tiles are stored.
            temp_dir: Scratch directory (defaults to <output_dir>/temp).
        """
        self.output_dir = Path(output_dir)
        self.temp_dir = Path(temp_dir) if temp_dir else self.output_dir / 'temp'
        # Primary mirror; also used as the first entry in _download_tile.
        self.base_url = "https://cloud.sdstate.edu/index.php/s/UjQFkr4y8EGB3JH/download"

        # Create directories
        self.output_dir.mkdir(parents=True, exist_ok=True)
        self.temp_dir.mkdir(parents=True, exist_ok=True)

    def _get_srtm_tiles(self, bounds):
        """Calculate which SRTM tiles are needed for given bounds.

        Args:
            bounds: Dict with 'min_lon', 'min_lat', 'max_lon', 'max_lat' keys
                (WGS84 degrees).

        Returns:
            List of tile names such as 'N40W105' (named by SW corner).
        """
        min_lon, min_lat = bounds['min_lon'], bounds['min_lat']
        max_lon, max_lat = bounds['max_lon'], bounds['max_lat']

        # SRTM tiles are 1 degree x 1 degree. Iterate over the SW corner of
        # every tile the box touches; floor(max) + 1 (instead of ceil(max))
        # guarantees at least one tile even for degenerate bounds that sit
        # exactly on an integer boundary (ceil would yield an empty range).
        tiles = []
        for lon in range(math.floor(min_lon), math.floor(max_lon) + 1):
            for lat in range(math.floor(min_lat), math.floor(max_lat) + 1):
                # SRTM naming convention: N/S + 2-digit lat, E/W + 3-digit lon.
                lat_str = f"{'N' if lat >= 0 else 'S'}{abs(lat):02d}"
                lon_str = f"{'E' if lon >= 0 else 'W'}{abs(lon):03d}"
                tiles.append(f"{lat_str}{lon_str}")

        return tiles

    def _download_tile(self, tile_name):
        """Download a single SRTM tile, trying each known mirror in turn.

        Returns:
            True if the tile is present locally after the call, else False.
        """
        filename = f"{tile_name}.SRTMGL1.hgt.zip"
        output_path = self.output_dir / filename

        if output_path.exists():
            logger.info(f"Tile {tile_name} already exists, skipping")
            return True

        # Try multiple SRTM data sources. The original code had the literal
        # placeholder "(unknown)" here instead of the tile filename.
        urls = [
            f"{self.base_url}?path=%2F&files={filename}",
            # NASA/USGS LP DAAC mirror — may require Earthdata authentication.
            f"https://e4ftl01.cr.usgs.gov/MEASURES/SRTMGL1.003/2000.02.11/{filename}",
        ]

        # Stream into a .part file first so a failed/interrupted download never
        # leaves a truncated file at output_path — which the exists() check
        # above would then skip as "already downloaded" on every later run.
        # Same directory as output_path keeps the final rename atomic.
        partial_path = output_path.with_suffix(output_path.suffix + '.part')

        for url in urls:
            try:
                logger.info(f"Downloading {tile_name} from {url}")

                with requests.get(url, timeout=300, stream=True) as response:
                    response.raise_for_status()
                    with open(partial_path, 'wb') as f:
                        for chunk in response.iter_content(chunk_size=8192):
                            f.write(chunk)

                partial_path.replace(output_path)
                logger.info(f"Successfully downloaded {tile_name}")
                return True

            except requests.RequestException as e:
                logger.warning(f"Failed to download {tile_name} from {url}: {e}")
                partial_path.unlink(missing_ok=True)
                continue

        logger.error(f"Failed to download tile {tile_name} from all sources")
        return False

    def download_for_bounds(self, bounds):
        """Download all SRTM tiles needed for given bounds.

        Returns:
            Number of tiles successfully downloaded (or already present).

        Raises:
            RuntimeError: If any tile could not be fetched from any source.
        """
        tiles = self._get_srtm_tiles(bounds)

        logger.info(f"Need to download {len(tiles)} SRTM tiles")
        logger.info(f"Tiles: {tiles}")

        successful = 0
        failed = 0

        for tile in tiles:
            if self._download_tile(tile):
                successful += 1
            else:
                failed += 1

            # Be nice to the server
            time.sleep(1)

        logger.info(f"Download complete: {successful} successful, {failed} failed")

        if failed > 0:
            raise RuntimeError(f"Failed to download {failed} tiles")

        return successful
|
Reference in New Issue
Block a user