#!/bin/bash
#
# NeatToPaperless.sh (utilities/NeatToPaperless.sh)
#
# neat.com to paperless-ngx migration script.
#
# Splits every PDF found in ./FromNeat into single-page PDFs:
#   * the first SUMMARY_PAGES pages of each PDF are written to ./ImportToERP
#     as YYYY_summary_NNN.pdf (summary pages for ERP system integration);
#   * the remaining pages are written to ./ImportToPaperless
#     as YYYY_receipt_NNN.pdf (receipts for paperless-ngx).
#
# NNN is numbered per year across ALL PDFs handled in one run (PDFs are
# processed in sorted order), so pages from a second PDF never overwrite
# pages from the first one.
#
# Configuration is read from ./PaperlessImportConfig; an example file is
# created when it is missing. The script is designed to run from within a
# year directory (receipts/YYYY/): the year is taken from the directory
# name, falling back to a 4-digit year embedded in the PDF file name.
#
# Requires: pdfseparate and pdfinfo (poppler-utils).

set -euo pipefail # Exit on error, undefined vars, pipe failures
IFS=$'\n\t'       # Secure Internal Field Separator

# Default Configuration
readonly SOURCE_DIR="./FromNeat"
readonly RECEIPTS_DIR="./ImportToPaperless"
readonly SUMMARY_DIR="./ImportToERP"
readonly CONFIG_FILE="./PaperlessImportConfig"
readonly TEMP_PREFIX="neat_migration"
readonly MAX_SUMMARY_PAGES=10

# Accepted year range. The upper bound follows the system clock so the
# script keeps working in later years; it never drops below 2025.
readonly MIN_YEAR=2011
MAX_YEAR=$(date +%Y 2>/dev/null || true)
if [[ ! "$MAX_YEAR" =~ ^[0-9]{4}$ ]] || (( MAX_YEAR < 2025 )); then
  MAX_YEAR=2025
fi
readonly MAX_YEAR

# Global state
SUMMARY_PAGES=1 # Overridden by the config file
TEMP_DIR=""     # Scratch directory of the PDF currently being split
NEXT_SEQ=0      # Out-parameter of next_sequence()

# Colors for output
readonly RED='\033[0;31m'
readonly GREEN='\033[0;32m'
readonly YELLOW='\033[1;33m'
readonly NC='\033[0m' # No Color

#######################################
# Remove the scratch directory of the PDF being processed, if any.
# Globals: TEMP_DIR (read)
#######################################
cleanup() {
  if [[ -n "${TEMP_DIR:-}" && -d "$TEMP_DIR" ]]; then
    rm -rf -- "$TEMP_DIR"
  fi
}

# The EXIT trap performs the cleanup and keeps the original exit status.
# INT/TERM only convert the signal into a conventional exit code (128 + n);
# the EXIT trap then runs exactly once.
trap cleanup EXIT
trap 'exit 130' INT
trap 'exit 143' TERM

#######################################
# Logging helpers. All diagnostics go to stderr so that functions whose
# stdout is captured (extract_year, get_pdf_pages) stay clean.
# The message is printed with %s so backslashes in file names are literal.
# Arguments: $1 - message
#######################################
print_status() {
  printf '%b[INFO]%b %s\n' "$GREEN" "$NC" "$1" >&2
}

print_warning() {
  printf '%b[WARN]%b %s\n' "$YELLOW" "$NC" "$1" >&2
}

print_error() {
  printf '%b[ERROR]%b %s\n' "$RED" "$NC" "$1" >&2
}

#######################################
# Verify that the poppler command line tools are installed.
# Exits with status 1 when a tool is missing.
#######################################
check_dependencies() {
  local tool joined
  local missing_deps=()

  for tool in pdfseparate pdfinfo; do
    if ! command -v "$tool" >/dev/null 2>&1; then
      missing_deps+=("$tool")
    fi
  done

  if (( ${#missing_deps[@]} > 0 )); then
    # Join explicitly: "${missing_deps[*]}" would use the first IFS
    # character, which is a newline in this script.
    printf -v joined '%s, ' "${missing_deps[@]}"
    print_error "Missing required dependencies: ${joined%, }"
    print_error "Install with: sudo apt-get install poppler-utils"
    exit 1
  fi
}

#######################################
# Ensure the source directory exists and create the output directories.
# Exits with status 1 on failure.
#######################################
validate_directories() {
  if [[ ! -d "$SOURCE_DIR" ]]; then
    print_error "Source directory '$SOURCE_DIR' not found!"
    print_error "Please create it and place your neat PDF files there."
    exit 1
  fi

  if ! mkdir -p "$RECEIPTS_DIR" "$SUMMARY_DIR"; then
    print_error "Failed to create output directories"
    exit 1
  fi
}

#######################################
# Write an example configuration file to CONFIG_FILE.
# Returns: status of the write.
#######################################
create_example_config() {
  cat > "$CONFIG_FILE" << 'EOF'
# PaperlessImportConfig
# Configuration for neat.com to paperless-ngx migration

# Number of summary pages at the beginning of each PDF
# These will be saved to ImportToERP directory
SUMMARY_PAGES=1

# You can add comments with #
# Examples:
# SUMMARY_PAGES=2 # If your neat exports have 2 summary pages
# SUMMARY_PAGES=0 # If there are no summary pages to separate
EOF
}

#######################################
# Read CONFIG_FILE without sourcing it (the file is parsed, never executed).
# Recognised setting: SUMMARY_PAGES=<0..10>, optionally followed by a
# "# comment". When the file is missing an example is created and the
# defaults stay in effect.
# Globals: SUMMARY_PAGES (written)
# Exits with status 1 when the file is unreadable or cannot be created.
#######################################
read_config() {
  local line digits
  # Kept in a variable so the '#' and parentheses need no shell quoting.
  local -r summary_re='^[[:space:]]*SUMMARY_PAGES[[:space:]]*=[[:space:]]*([0-9]+)[[:space:]]*(#.*)?$'

  if [[ ! -f "$CONFIG_FILE" ]]; then
    print_warning "Config file $CONFIG_FILE not found, using defaults (SUMMARY_PAGES=$SUMMARY_PAGES)"
    print_status "Creating example config file..."

    if ! create_example_config; then
      print_error "Failed to create config file"
      exit 1
    fi

    print_status "Created $CONFIG_FILE with default settings. Edit as needed and re-run."
    return 0
  fi

  print_status "Reading configuration from $CONFIG_FILE"

  if [[ ! -r "$CONFIG_FILE" ]]; then
    print_error "Config file $CONFIG_FILE exists but is not readable"
    exit 1
  fi

  # '|| [[ -n "$line" ]]' keeps a final line that lacks a trailing newline.
  while IFS= read -r line || [[ -n "$line" ]]; do
    line="${line%$'\r'}" # Tolerate CRLF line endings

    # Skip empty lines and comments
    if [[ -z "$line" || "$line" =~ ^[[:space:]]*# ]]; then
      continue
    fi

    if [[ "$line" =~ $summary_re ]]; then
      digits="${BASH_REMATCH[1]}"
      # 10# forces base 10 so values such as "08" are not read as octal;
      # the length guard keeps the arithmetic from overflowing.
      if (( ${#digits} <= 9 )) && (( 10#$digits <= MAX_SUMMARY_PAGES )); then
        SUMMARY_PAGES=$((10#$digits))
        print_status "  Set SUMMARY_PAGES=$SUMMARY_PAGES"
      else
        print_warning "  Invalid SUMMARY_PAGES value: $digits (must be 0-10)"
      fi
    elif [[ "$line" =~ ^[[:space:]]*[A-Z_]+ ]]; then
      print_warning "  Unknown config option: $line"
    fi
  done < "$CONFIG_FILE"
}

#######################################
# Determine the year the receipts belong to.
# The name of the current directory is tried first; if it is not a valid
# year, every 4-digit "20xx" run in the optional file name is tried in
# order until one falls inside MIN_YEAR..MAX_YEAR.
# Arguments: $1 - file name (optional)
# Outputs:   the year on stdout
# Returns:   0 when a year was found, 1 otherwise
#######################################
extract_year() {
  local candidate remaining
  local current_dir="${PWD##*/}"
  local -r dir_re='^(20[0-9]{2})$'
  local -r name_re='(20[0-9]{2})'

  if [[ "$current_dir" =~ $dir_re ]]; then
    candidate="${BASH_REMATCH[1]}"
    if (( candidate >= MIN_YEAR && candidate <= MAX_YEAR )); then
      printf '%s\n' "$candidate"
      return 0
    fi
  fi

  remaining="${1:-}"
  while [[ "$remaining" =~ $name_re ]]; do
    candidate="${BASH_REMATCH[1]}"
    if (( candidate >= MIN_YEAR && candidate <= MAX_YEAR )); then
      printf '%s\n' "$candidate"
      return 0
    fi
    # Drop everything up to and including the rejected match and retry.
    remaining="${remaining#*"$candidate"}"
  done

  return 1
}

#######################################
# Create a directory (and parents), reporting failures.
# Arguments: $1 - directory path
# Returns:   0 on success, 1 on failure
#######################################
safe_mkdir() {
  local dir="$1"

  if ! mkdir -p "$dir"; then
    print_error "Failed to create directory: $dir"
    return 1
  fi
}

#######################################
# Print the page count of a PDF as reported by pdfinfo.
# Arguments: $1 - path to the PDF
# Outputs:   page count (positive integer) on stdout
# Returns:   0 on success, 1 when the file is unreadable or the count is
#            missing/invalid
#######################################
get_pdf_pages() {
  local pdf_file="$1"
  local pages

  if [[ ! -f "$pdf_file" || ! -r "$pdf_file" ]]; then
    print_error "PDF file not found or not readable: $pdf_file"
    return 1
  fi

  # awk keeps only the first "Pages:" line and reads the whole stream, so
  # pdfinfo is never killed by SIGPIPE (which pipefail would report).
  if ! pages=$(pdfinfo "$pdf_file" 2>/dev/null \
    | awk '$1 == "Pages:" && !seen { print $2; seen = 1 }'); then
    print_error "Failed to get page information from: $pdf_file"
    return 1
  fi

  if [[ ! "$pages" =~ ^[0-9]+$ ]] || (( 10#$pages == 0 )); then
    print_error "Invalid page count ($pages) for: $pdf_file"
    return 1
  fi

  printf '%s\n' "$((10#$pages))"
}

#######################################
# Advance the run-wide output counter for a page kind and year.
# Must be called directly (not in a subshell) so the counter persists.
# Arguments: $1 - page kind ("summary" or "receipt"), $2 - 4-digit year
# Globals:   SEQ_<kind>_<year> (read/written), NEXT_SEQ (written)
#######################################
next_sequence() {
  local counter_var="SEQ_${1}_${2}"
  local current="${!counter_var:-0}"

  current=$((current + 1))
  printf -v "$counter_var" '%d' "$current"
  NEXT_SEQ="$current"
}

#######################################
# Remove the current scratch directory and forget about it.
# Globals: TEMP_DIR (read/written)
#######################################
discard_temp_dir() {
  cleanup
  TEMP_DIR=""
}

#######################################
# Split one PDF into single pages and file them as summary or receipt pages.
# Arguments: $1 - path to the PDF
# Globals:   SUMMARY_PAGES, RECEIPTS_DIR, SUMMARY_DIR (read);
#            TEMP_DIR, NEXT_SEQ and the SEQ_* counters (written)
# Returns:   0 on success, 1 when the PDF was skipped or failed
#######################################
process_pdf() {
  local pdf_file="${1:-}"
  local filename year total_pages summary_pages receipt_pages
  local page page_file output_file kind
  local summary_count=0 receipt_count=0

  if [[ -z "$pdf_file" ]]; then
    print_error "process_pdf called with empty filename"
    return 1
  fi

  filename="${pdf_file##*/}"

  # Extract year (from current directory first, then filename)
  if ! year=$(extract_year "$filename"); then
    print_warning "Could not extract year from directory or filename $filename, skipping"
    return 1
  fi

  print_status "Processing $filename (Year: $year)"

  if ! total_pages=$(get_pdf_pages "$pdf_file"); then
    return 1
  fi

  print_status "  Total pages: $total_pages"

  # Never report or use more summary pages than the PDF actually has.
  summary_pages=$SUMMARY_PAGES
  if (( summary_pages > total_pages )); then
    print_warning "  SUMMARY_PAGES ($SUMMARY_PAGES) exceeds the page count; all $total_pages pages are treated as summary pages"
    summary_pages=$total_pages
  fi
  receipt_pages=$((total_pages - summary_pages))

  print_status "  Summary pages: $summary_pages, Receipt pages: $receipt_pages"

  # Output directories are directly in current directory (no year subdirs needed)
  if ! safe_mkdir "$RECEIPTS_DIR" || ! safe_mkdir "$SUMMARY_DIR"; then
    return 1
  fi

  # Scratch directory; tracked globally so the EXIT trap can remove it.
  if ! TEMP_DIR=$(mktemp -d "${TMPDIR:-/tmp}/${TEMP_PREFIX}.XXXXXXXXXX"); then
    TEMP_DIR=""
    print_error "Failed to create temporary directory"
    return 1
  fi

  print_status "  Splitting PDF..."
  # stderr is silenced on purpose: the failure itself is reported below.
  if ! pdfseparate "$pdf_file" "$TEMP_DIR/page_%03d.pdf" 2>/dev/null; then
    print_error "Failed to split PDF: $pdf_file"
    discard_temp_dir
    return 1
  fi

  for (( page = 1; page <= total_pages; page++ )); do
    printf -v page_file '%s/page_%03d.pdf' "$TEMP_DIR" "$page"

    if [[ ! -f "$page_file" ]]; then
      print_warning "  Page $page was not produced by pdfseparate, skipping it"
      continue
    fi

    # The first summary_pages pages are summaries, the rest are receipts.
    if (( page <= summary_pages )); then
      kind="summary"
      next_sequence "$kind" "$year"
      printf -v output_file '%s/%s_summary_%03d.pdf' "$SUMMARY_DIR" "$year" "$NEXT_SEQ"
    else
      kind="receipt"
      next_sequence "$kind" "$year"
      printf -v output_file '%s/%s_receipt_%03d.pdf' "$RECEIPTS_DIR" "$year" "$NEXT_SEQ"
    fi

    if ! mv -- "$page_file" "$output_file"; then
      print_error "Failed to move $kind page: $page_file"
      discard_temp_dir
      return 1
    fi

    if [[ "$kind" == "summary" ]]; then
      summary_count=$((summary_count + 1))
    else
      receipt_count=$((receipt_count + 1))
    fi
  done

  discard_temp_dir

  print_status "  ✓ Processed $summary_count summary pages and $receipt_count receipts for $year"

  return 0
}

#######################################
# Print the first lines of a long listing of a directory.
# The listing is captured before it is truncated, so a long listing cannot
# make the pipeline fail (SIGPIPE + pipefail) and trigger the fallback text.
# Arguments: $1 - directory
#######################################
show_listing() {
  local dir="$1"
  local listing

  [[ -d "$dir" ]] || return 0

  if listing=$(ls -la "$dir"/ 2>/dev/null); then
    head -n 10 <<< "$listing"
  else
    echo "  (empty or inaccessible)"
  fi
}

#######################################
# Entry point: read the configuration, validate the environment and
# process every PDF in SOURCE_DIR.
# Returns: 0 when at least one PDF was processed, 1 otherwise
# Note: main is invoked from an 'if', which disables 'set -e' inside it;
# every critical step is therefore checked explicitly.
#######################################
main() {
  local pdf_files=()
  local processed=0 skipped=0
  local pdf_file

  echo "=== neat.com to paperless-ngx Migration Script ==="
  echo

  if ! read_config; then
    print_error "Failed to read configuration"
    exit 1
  fi
  echo

  check_dependencies
  validate_directories

  # Sorted, NUL-delimited list: deterministic output numbering across runs
  # and safe for any file name. -iname also picks up ".PDF".
  while IFS= read -r -d '' pdf_file; do
    pdf_files+=("$pdf_file")
  done < <(find "$SOURCE_DIR" -maxdepth 1 -type f -iname '*.pdf' -print0 2>/dev/null \
    | LC_ALL=C sort -z)

  if (( ${#pdf_files[@]} == 0 )); then
    print_error "No PDF files found in $SOURCE_DIR"
    exit 1
  fi

  print_status "Found ${#pdf_files[@]} PDF files to process"
  echo

  for pdf_file in "${pdf_files[@]}"; do
    if process_pdf "$pdf_file"; then
      processed=$((processed + 1))
    else
      skipped=$((skipped + 1))
    fi
    echo
  done

  echo "=== Processing Complete ==="
  print_status "Processed: $processed PDFs"
  if (( skipped > 0 )); then
    print_warning "Skipped: $skipped PDFs"
  fi
  echo
  print_status "Receipt files directory: $RECEIPTS_DIR"
  print_status "Summary files directory: $SUMMARY_DIR"
  print_status "Summary pages (for ERP): YYYY_summary_*.pdf files"
  print_status "Receipt pages (for paperless-ngx): YYYY_receipt_*.pdf files"

  echo
  echo "Directory structure created:"
  echo "Receipts:"
  show_listing "$RECEIPTS_DIR"
  echo "Summaries:"
  show_listing "$SUMMARY_DIR"

  # Report failure when not a single PDF could be processed.
  if (( processed == 0 )); then
    return 1
  fi
  return 0
}

# Run main function with error handling
if ! main "$@"; then
  print_error "Script execution failed"
  exit 1
fi