#!/bin/bash

# neat.com to paperless-ngx migration script
# Splits yearly PDFs into individual pages organized by year
# Saves summary pages separately for ERP system integration
# Reads configuration from PaperlessImportConfig file
# Designed to run from within each year directory (receipts/YYYY/)

# Strict mode: abort on a failing command, on use of an unset variable, and
# when any stage of a pipeline fails.
set -o errexit -o nounset -o pipefail
# Split words on newline and tab only, never on spaces.
IFS=$'\n\t'

# Default configuration: fixed locations, all relative to the directory the
# script is started from
readonly SOURCE_DIR='./FromNeat' RECEIPTS_DIR='./ImportToPaperless' SUMMARY_DIR='./ImportToERP'
readonly CONFIG_FILE='./PaperlessImportConfig'
readonly TEMP_PREFIX='neat_migration'

# Mutable setting: read_config overwrites it when the config file provides a
# valid SUMMARY_PAGES value
SUMMARY_PAGES=1

# Colors for output. Stored as literal backslash sequences; the print_*
# helpers turn them into real escape characters when printing.
readonly RED='\033[0;31m' GREEN='\033[0;32m' YELLOW='\033[1;33m'
readonly NC='\033[0m' # No Color (reset)

# Cleanup function for temporary files
# Removes the directory named by temp_dir (when that variable is set and the
# directory exists) and then terminates the script.
# Arguments: $1 - optional exit status to terminate with; defaults to the
#                 status of the last command that ran before the handler
cleanup() {
    local exit_code=${1:-$?}

    # Disarm the traps first so the exit below cannot re-enter this handler
    trap - EXIT INT TERM

    if [[ -n "${temp_dir:-}" && -d "$temp_dir" ]]; then
        rm -rf -- "$temp_dir"
    fi
    exit "$exit_code"
}

# Set traps for cleanup. The signal handlers pass an explicit status
# (128 + signal number) so an interrupted run can never exit with status 0.
trap cleanup EXIT
trap 'cleanup 130' INT
trap 'cleanup 143' TERM

# Function to print an informational message
# Arguments: $1 - message text; printed verbatim, so backslashes in file
#                 names are not interpreted as escape sequences
# Outputs:   "[INFO] <message>" on stderr, with the tag wrapped in GREEN/NC.
#            Writing to stderr keeps log lines out of any command
#            substitution that captures a function's stdout.
print_status() {
    # %b expands the escape sequences stored in the color variables only
    printf '%b[INFO]%b %s\n' "$GREEN" "$NC" "${1-}" >&2
}

# Function to print a warning message
# Arguments: $1 - message text; printed verbatim, so backslashes in file
#                 names are not interpreted as escape sequences
# Outputs:   "[WARN] <message>" on stderr, with the tag wrapped in YELLOW/NC
print_warning() {
    # %b expands the escape sequences stored in the color variables only
    printf '%b[WARN]%b %s\n' "$YELLOW" "$NC" "${1-}" >&2
}

# Function to print an error message
# Arguments: $1 - message text; printed verbatim, so backslashes in file
#                 names are not interpreted as escape sequences
# Outputs:   "[ERROR] <message>" on stderr, with the tag wrapped in RED/NC
print_error() {
    # %b expands the escape sequences stored in the color variables only
    printf '%b[ERROR]%b %s\n' "$RED" "$NC" "${1-}" >&2
}

# Function to validate dependencies
# Checks that pdfseparate and pdfinfo can be found on PATH.
# Outputs: on failure, two error lines via print_error (the missing tools on
#          one line, then an install hint)
# Exits:   with status 1 when at least one tool is missing
check_dependencies() {
    local tool missing=""

    for tool in pdfseparate pdfinfo; do
        if ! command -v "$tool" >/dev/null 2>&1; then
            # Build the list with explicit spaces: joining an array with
            # "${arr[*]}" would use the first IFS character, which this
            # script sets to a newline.
            missing+="${missing:+ }$tool"
        fi
    done

    if [[ -n "$missing" ]]; then
        print_error "Missing required dependencies: $missing"
        print_error "Install with: sudo apt-get install poppler-utils"
        exit 1
    fi
}

# Function to validate directory structure
# Requires SOURCE_DIR to exist and creates RECEIPTS_DIR and SUMMARY_DIR when
# they are missing.
# Exits: with status 1 when SOURCE_DIR is absent or the output directories
#        cannot be created
validate_directories() {
    # Guard clause: nothing can be done without the input directory
    [[ -d "$SOURCE_DIR" ]] || {
        print_error "Source directory '$SOURCE_DIR' not found!"
        print_error "Please create it and place your neat PDF files there."
        exit 1
    }

    # Both output directories are created in one call; success ends here
    mkdir -p "$RECEIPTS_DIR" "$SUMMARY_DIR" && return 0

    print_error "Failed to create output directories"
    exit 1
}

# Function to safely read configuration file
# Parses CONFIG_FILE line by line (the file is never sourced or executed) and
# updates the global SUMMARY_PAGES when a valid value (0-10) is found. When
# the file does not exist, an example file is written through
# create_example_config and the current defaults stay in effect.
# Accepted line forms:
#   SUMMARY_PAGES=2
#   SUMMARY_PAGES = 02      # leading zeros are read as decimal
#   SUMMARY_PAGES=2  # trailing comment
# Exits:   with status 1 when the file exists but is unreadable, or when the
#          example file cannot be created
# Returns: 0 otherwise
read_config() {
    local line raw_value
    # key, '=', everything up to an optional trailing '#' comment
    local assign_re='^[[:space:]]*SUMMARY_PAGES[[:space:]]*=([^#]*)(#.*)?$'
    # at most two significant digits after any leading zeros
    local value_re='^0*([0-9]{1,2})$'
    local option_re='^[[:space:]]*[A-Z_]+'
    local comment_re='^[[:space:]]*#'

    if [[ ! -f "$CONFIG_FILE" ]]; then
        print_warning "Config file $CONFIG_FILE not found, using defaults (SUMMARY_PAGES=$SUMMARY_PAGES)"
        print_status "Creating example config file..."

        if ! create_example_config; then
            print_error "Failed to create config file"
            exit 1
        fi

        print_status "Created $CONFIG_FILE with default settings. Edit as needed and re-run."
        return 0
    fi

    print_status "Reading configuration from $CONFIG_FILE"

    # Validate config file is readable
    if [[ ! -r "$CONFIG_FILE" ]]; then
        print_error "Config file $CONFIG_FILE exists but is not readable"
        exit 1
    fi

    # The "|| [[ -n ... ]]" keeps a final line that lacks a trailing newline
    while IFS= read -r line || [[ -n "$line" ]]; do
        # Skip empty lines and comments
        [[ -z "$line" || "$line" =~ $comment_re ]] && continue

        if [[ "$line" =~ $assign_re ]]; then
            raw_value="${BASH_REMATCH[1]}"
            # Trim leading and trailing whitespace (also drops a CR from CRLF files)
            raw_value="${raw_value#"${raw_value%%[![:space:]]*}"}"
            raw_value="${raw_value%"${raw_value##*[![:space:]]}"}"

            # 10# forces decimal so values such as 08 are not parsed as octal
            if [[ "$raw_value" =~ $value_re ]] && (( 10#${BASH_REMATCH[1]} <= 10 )); then
                SUMMARY_PAGES=$(( 10#${BASH_REMATCH[1]} ))
                print_status "  Set SUMMARY_PAGES=$SUMMARY_PAGES"
            else
                print_warning "  Invalid SUMMARY_PAGES value: $raw_value (must be 0-10)"
            fi
        elif [[ "$line" =~ $option_re ]]; then
            print_warning "  Unknown config option: $line"
        fi
    done < "$CONFIG_FILE"

    return 0
}

# Function to create example configuration file
# Writes a commented template with SUMMARY_PAGES=1 to CONFIG_FILE, replacing
# any existing content.
# Returns: 0 on success, non-zero when the file cannot be written
create_example_config() {
    # One array element per output line; empty elements become blank lines
    local -a template=(
        '# PaperlessImportConfig'
        '# Configuration for neat.com to paperless-ngx migration'
        ''
        '# Number of summary pages at the beginning of each PDF'
        '# These will be saved to ImportToERP directory'
        'SUMMARY_PAGES=1'
        ''
        '# You can add comments with # '
        '# Examples:'
        '# SUMMARY_PAGES=2  # If your neat exports have 2 summary pages'
        '# SUMMARY_PAGES=0  # If there are no summary pages to separate'
    )

    printf '%s\n' "${template[@]}" > "$CONFIG_FILE"
}

# Function to extract year from current directory or filename
# The name of the current directory wins when it is exactly a four-digit
# year; otherwise the optional file name is scanned left to right and the
# first four-digit candidate inside the accepted range is used.
# Accepted range: 2011 up to the current calendar year, and never less than
# 2025, so the script keeps working for year directories created after 2025.
# Arguments: $1 - optional file name to fall back on
# Outputs:   the year on stdout
# Returns:   0 when a year was found, 1 otherwise
extract_year() {
    local year max_year current_dir rest
    local dir_re='^(20[0-9][0-9])$'
    local name_re='(20[0-9][0-9])'

    # Upper bound follows the clock; fall back to 2025 if date is unusable
    max_year=$(date +%Y 2>/dev/null) || max_year=2025
    if [[ ! "$max_year" =~ ^[0-9]{4}$ ]] || (( max_year < 2025 )); then
        max_year=2025
    fi

    # First try to get year from current directory name
    current_dir="${PWD##*/}"
    if [[ "$current_dir" =~ $dir_re ]]; then
        year="${BASH_REMATCH[1]}"
        if (( year >= 2011 && year <= max_year )); then
            echo "$year"
            return 0
        fi
    fi

    # Fallback: scan the filename, trying each candidate in turn
    rest="${1:-}"
    while [[ "$rest" =~ $name_re ]]; do
        year="${BASH_REMATCH[1]}"
        if (( year >= 2011 && year <= max_year )); then
            echo "$year"
            return 0
        fi
        # Drop everything through the first "20" so the next pass makes
        # progress without skipping an overlapping candidate
        rest="${rest#*20}"
    done

    return 1
}

# Function to safely create directories
# Arguments: $1 - directory to create (missing parents are created too)
# Returns:   0 when the directory exists afterwards, 1 (after logging an
#            error) when mkdir fails
safe_mkdir() {
    local target="$1"

    mkdir -p "$target" && return 0

    print_error "Failed to create directory: $target"
    return 1
}

# Function to get PDF page count with validation
# Arguments: $1 - path to the PDF file
# Outputs:   the page count on stdout
# Returns:   0 on success; 1 (after logging an error) when the file is
#            missing or unreadable, when pdfinfo reports no "Pages:" field,
#            or when the reported count is not a positive integer
get_pdf_pages() {
    local pdf_file="$1"
    local pages

    # Validate file exists and is readable
    if [[ ! -f "$pdf_file" || ! -r "$pdf_file" ]]; then
        print_error "PDF file not found or not readable: $pdf_file"
        return 1
    fi

    # Take only the line that starts with "Pages:"; other fields (a Title,
    # for instance) may contain the same text. awk reads all of its input
    # instead of exiting early so pdfinfo is never cut off mid-write, and it
    # fails when the field is absent.
    if ! pages=$(pdfinfo "$pdf_file" 2>/dev/null \
        | awk '/^Pages:/ && !found { print $2; found = 1 } END { exit !found }'); then
        print_error "Failed to get page information from: $pdf_file"
        return 1
    fi

    # Validate page count is a positive integer (10# forces decimal)
    if [[ ! "$pages" =~ ^[0-9]+$ ]] || (( 10#$pages == 0 )); then
        print_error "Invalid page count ($pages) for: $pdf_file"
        return 1
    fi

    echo "$pages"
}

# Helper for process_pdf: move one split page into the first unused
# "<dir>/<year>_<kind>_NNN.pdf" slot at or after a starting index.
# Arguments: $1 - page file to move
#            $2 - destination directory
#            $3 - year prefix
#            $4 - kind, "summary" or "receipt"
#            $5 - first index to try
# Outputs:   the index that was used, on stdout
# Returns:   0 on success, 1 when the move failed
_store_split_page() {
    local page_file="$1" dest_dir="$2" year="$3" kind="$4" index="$5"
    local output_file

    while :; do
        printf -v output_file '%s/%s_%s_%03d.pdf' "$dest_dir" "$year" "$kind" "$index"
        [[ -e "$output_file" ]] || break
        index=$((index + 1))
    done

    mv -- "$page_file" "$output_file" || return 1
    printf '%s\n' "$index"
}

# Function to process a single PDF with comprehensive error handling
# Splits the PDF into single pages, stores the first SUMMARY_PAGES pages in
# SUMMARY_DIR as <year>_summary_NNN.pdf and the remaining pages in
# RECEIPTS_DIR as <year>_receipt_NNN.pdf.
# Existing output files are never overwritten: numbering continues at the
# next unused index, so several PDFs of the same year can be processed into
# the same directories. As a consequence, processing the same PDF twice
# stores its pages twice.
# Arguments: $1 - path to the PDF file
# Globals:   SUMMARY_PAGES, SUMMARY_DIR, RECEIPTS_DIR, TEMP_PREFIX (read)
# Returns:   0 on success, 1 when the PDF was skipped or a step failed
process_pdf() {
    local pdf_file="${1:-}"
    local filename year total_pages summary_pages receipt_pages
    local temp_dir
    local summary_count=0 receipt_count=0
    local next_summary=1 next_receipt=1
    local i page_file used_index

    # Validate input
    if [[ -z "$pdf_file" ]]; then
        print_error "process_pdf called with empty filename"
        return 1
    fi

    filename=$(basename "$pdf_file")

    # Extract year (from current directory first, then filename)
    if ! year=$(extract_year "$filename"); then
        print_warning "Could not extract year from directory or filename $filename, skipping"
        return 1
    fi

    print_status "Processing $filename (Year: $year)"

    # Get total pages with error handling
    if ! total_pages=$(get_pdf_pages "$pdf_file"); then
        return 1
    fi

    if (( total_pages <= 0 )); then
        print_warning "  No pages found in PDF"
        return 1
    fi

    print_status "  Total pages: $total_pages"

    # Never count more summary pages than the PDF actually contains
    summary_pages=$SUMMARY_PAGES
    if (( summary_pages > total_pages )); then
        summary_pages=$total_pages
    fi
    receipt_pages=$((total_pages - summary_pages))

    if (( receipt_pages == 0 )); then
        print_warning "  No receipt pages: all $total_pages page(s) fall within SUMMARY_PAGES=$SUMMARY_PAGES"
    fi

    print_status "  Summary pages: $summary_pages, Receipt pages: $receipt_pages"

    # Output directories are directly in current directory (no year subdirs needed)
    if ! safe_mkdir "$RECEIPTS_DIR" || ! safe_mkdir "$SUMMARY_DIR"; then
        return 1
    fi

    # Create secure temporary directory
    if ! temp_dir=$(mktemp -d -t "${TEMP_PREFIX}.XXXXXXXXXX"); then
        print_error "Failed to create temporary directory"
        return 1
    fi

    # Split PDF into individual pages; the tool's own stderr output is
    # discarded and replaced by the single error line below
    print_status "  Splitting PDF..."
    if ! pdfseparate "$pdf_file" "$temp_dir/page_%03d.pdf" 2>/dev/null; then
        print_error "Failed to split PDF: $pdf_file"
        rm -rf -- "$temp_dir"
        return 1
    fi

    # Pages 1..summary_pages are summaries, everything after is a receipt
    for (( i = 1; i <= total_pages; i++ )); do
        # The directory is passed as an argument, never as part of the
        # format string, so a '%' in the temp path cannot break printf
        printf -v page_file '%s/page_%03d.pdf' "$temp_dir" "$i"

        if [[ ! -f "$page_file" ]]; then
            print_warning "  Page $i is missing from the split output, skipping"
            continue
        fi

        if (( i <= summary_pages )); then
            if ! used_index=$(_store_split_page "$page_file" "$SUMMARY_DIR" "$year" summary "$next_summary"); then
                print_error "Failed to move summary page: $page_file"
                rm -rf -- "$temp_dir"
                return 1
            fi
            next_summary=$((used_index + 1))
            summary_count=$((summary_count + 1))
        else
            if ! used_index=$(_store_split_page "$page_file" "$RECEIPTS_DIR" "$year" receipt "$next_receipt"); then
                print_error "Failed to move receipt page: $page_file"
                rm -rf -- "$temp_dir"
                return 1
            fi
            next_receipt=$((used_index + 1))
            receipt_count=$((receipt_count + 1))
        fi
    done

    # Clean up temp directory
    rm -rf -- "$temp_dir"

    print_status "  ✓ Processed $summary_count summary pages and $receipt_count receipts for $year"

    return 0
}

# Main execution function
# Reads the configuration, checks tools and directories, then runs
# process_pdf for every PDF found directly inside SOURCE_DIR and prints a
# summary.
# PDFs are matched case-insensitively (*.pdf, *.PDF, ...) and handled in
# byte-wise sorted order, so the numbering of the output files is
# reproducible from run to run.
# Exits:   with status 1 when the configuration cannot be read or when no PDF
#          is found (the called helpers may also exit on their own errors)
# Returns: 0 otherwise; skipped PDFs are reported but do not change the status
main() {
    local pdf_files=() processed=0 skipped=0
    local file pdf_file

    echo "=== neat.com to paperless-ngx Migration Script ==="
    echo

    # Read configuration with error handling
    if ! read_config; then
        print_error "Failed to read configuration"
        exit 1
    fi
    echo

    # Check dependencies
    check_dependencies

    # Validate and create directories
    validate_directories

    # Collect PDF files NUL-delimited so any file name is handled safely;
    # LC_ALL=C keeps the sort order independent of the user's locale
    while IFS= read -r -d '' file; do
        pdf_files+=("$file")
    done < <(find "$SOURCE_DIR" -maxdepth 1 -type f -iname '*.pdf' -print0 2>/dev/null \
        | LC_ALL=C sort -z)

    if [[ ${#pdf_files[@]} -eq 0 ]]; then
        print_error "No PDF files found in $SOURCE_DIR"
        exit 1
    fi

    print_status "Found ${#pdf_files[@]} PDF files to process"
    echo

    # Process each PDF with error tracking
    for pdf_file in "${pdf_files[@]}"; do
        if process_pdf "$pdf_file"; then
            processed=$((processed + 1))
        else
            skipped=$((skipped + 1))
        fi
        echo
    done

    # Summary with validation
    echo "=== Processing Complete ==="
    print_status "Processed: $processed PDFs"
    if [[ "$skipped" -gt 0 ]]; then
        print_warning "Skipped: $skipped PDFs"
    fi
    echo
    print_status "Receipt files directory: $RECEIPTS_DIR"
    print_status "Summary files directory: $SUMMARY_DIR"
    print_status "Summary pages (for ERP): YYYY_summary_*.pdf files"
    print_status "Receipt pages (for paperless-ngx): YYYY_receipt_*.pdf files"

    # Show directory structure with error handling. The listing is only for
    # the user to look at (first 10 lines); it is never parsed.
    echo
    echo "Directory structure created:"
    echo "Receipts:"
    if [[ -d "$RECEIPTS_DIR" ]]; then
        ls -la "$RECEIPTS_DIR"/ 2>/dev/null | head -10 || echo "  (empty or inaccessible)"
    fi
    echo "Summaries:"
    if [[ -d "$SUMMARY_DIR" ]]; then
        ls -la "$SUMMARY_DIR"/ 2>/dev/null | head -10 || echo "  (empty or inaccessible)"
    fi
}

# Run main function with error handling: pass the script's arguments through
# and report a failing return status.
# Note: because main runs as part of a conditional list, bash does not apply
# "set -e" to the commands inside it; main and its helpers rely on their own
# explicit status checks.
main "$@" || {
    print_error "Script execution failed"
    exit 1
}