#!/bin/bash
#
# neat.com to paperless-ngx migration script
#
# Splits yearly PDFs into individual pages organized by year.
# Saves summary pages separately for ERP system integration.
# Reads configuration from the PaperlessImportConfig file.
# Designed to run from within each year directory (receipts/YYYY/).
#
# Layout (all paths are relative to the current directory):
#   ./FromNeat/              input PDFs
#   ./ImportToPaperless/     output: YYYY_receipt_NNN.pdf
#   ./ImportToERP/           output: YYYY_summary_NNN.pdf
#   ./PaperlessImportConfig  optional settings (SUMMARY_PAGES=<0-10>)
#
# Requires: pdfseparate and pdfinfo (poppler-utils).

set -euo pipefail # Exit on error, undefined vars, pipe failures
IFS=$'\n\t'       # Secure Internal Field Separator

# Default Configuration
readonly SOURCE_DIR="./FromNeat"
readonly RECEIPTS_DIR="./ImportToPaperless"
readonly SUMMARY_DIR="./ImportToERP"
readonly CONFIG_FILE="./PaperlessImportConfig"
readonly TEMP_PREFIX="neat_migration"

# Accepted range for extracted years and for the SUMMARY_PAGES setting.
readonly MIN_YEAR=2011
readonly BASE_MAX_YEAR=2025
readonly MAX_SUMMARY_PAGES=10

# Global variables (will be set by config)
SUMMARY_PAGES=1

# Temporary directory currently in use by process_pdf (empty when none).
# Kept global so the exit trap can always see it.
ACTIVE_TEMP_DIR=""

# Number of pages moved by the most recent move_page_range call.
MOVED_PAGES=0

# Colors for output (real escape characters, printed with printf '%s')
readonly RED=$'\033[0;31m'
readonly GREEN=$'\033[0;32m'
readonly YELLOW=$'\033[1;33m'
readonly NC=$'\033[0m' # No Color

#######################################
# Print the newest year accepted by extract_year: the later of
# BASE_MAX_YEAR and the current calendar year, so exports from years after
# BASE_MAX_YEAR are not rejected.
# Outputs: the year on stdout
#######################################
compute_max_year() {
  local now
  now=$(date +%Y 2>/dev/null) || now=""
  if [[ "$now" =~ ^[1-9][0-9]{3}$ ]] && ((now > BASE_MAX_YEAR)); then
    echo "$now"
  else
    echo "$BASE_MAX_YEAR"
  fi
}

MAX_YEAR=$(compute_max_year)
readonly MAX_YEAR

#######################################
# Remove the temporary directory recorded in ACTIVE_TEMP_DIR, if any.
# Used as the EXIT trap and called directly by process_pdf. It does not
# call exit itself, so the script's exit status is preserved.
# Globals: ACTIVE_TEMP_DIR (read, reset to empty)
#######################################
cleanup() {
  if [[ -n "${ACTIVE_TEMP_DIR:-}" && -d "$ACTIVE_TEMP_DIR" ]]; then
    rm -rf -- "$ACTIVE_TEMP_DIR"
  fi
  ACTIVE_TEMP_DIR=""
}

# Clean up on every exit path. Signals are converted into an exit with the
# conventional 128+signal status; that exit then fires the EXIT trap once.
trap cleanup EXIT
trap 'exit 130' INT
trap 'exit 143' TERM

# Functions to print colored output. All diagnostics go to stderr and the
# message is printed verbatim (backslashes in filenames are not interpreted).
print_status() {
  printf '%s[INFO]%s %s\n' "$GREEN" "$NC" "$1" >&2
}

print_warning() {
  printf '%s[WARN]%s %s\n' "$YELLOW" "$NC" "$1" >&2
}

print_error() {
  printf '%s[ERROR]%s %s\n' "$RED" "$NC" "$1" >&2
}

#######################################
# Verify that pdfseparate and pdfinfo are on PATH; exit 1 otherwise.
#######################################
check_dependencies() {
  local tool
  local missing_deps=()

  for tool in pdfseparate pdfinfo; do
    if ! command -v "$tool" >/dev/null 2>&1; then
      missing_deps+=("$tool")
    fi
  done

  if [[ ${#missing_deps[@]} -gt 0 ]]; then
    # "${arr[*]}" joins with the first IFS character, which is a newline
    # globally; use a space for this message.
    local IFS=' '
    print_error "Missing required dependencies: ${missing_deps[*]}"
    print_error "Install with: sudo apt-get install poppler-utils"
    exit 1
  fi
}

#######################################
# Require SOURCE_DIR to exist and create both output directories.
# Exits 1 on failure.
#######################################
validate_directories() {
  if [[ ! -d "$SOURCE_DIR" ]]; then
    print_error "Source directory '$SOURCE_DIR' not found!"
    print_error "Please create it and place your neat PDF files there."
    exit 1
  fi

  # Create output directories with error checking
  if ! mkdir -p "$RECEIPTS_DIR" "$SUMMARY_DIR"; then
    print_error "Failed to create output directories"
    exit 1
  fi
}

#######################################
# Read CONFIG_FILE line by line (it is parsed, never sourced).
# Recognized setting: SUMMARY_PAGES=<0-10>, optionally followed by a
# trailing "# comment" as shown in the generated example file.
# If the file does not exist, an example file is written and the built-in
# defaults stay in effect.
# Globals: SUMMARY_PAGES (written)
# Exits 1 when the file is unreadable or the example cannot be created.
#######################################
read_config() {
  local line raw_value
  local -r comment_re='^[[:space:]]*#'
  local -r summary_re='^[[:space:]]*SUMMARY_PAGES[[:space:]]*=[[:space:]]*([0-9]+)[[:space:]]*(#.*)?$'
  local -r option_re='^[[:space:]]*[A-Z_]+'

  if [[ ! -f "$CONFIG_FILE" ]]; then
    print_warning "Config file $CONFIG_FILE not found, using defaults (SUMMARY_PAGES=$SUMMARY_PAGES)"
    print_status "Creating example config file..."
    if ! create_example_config; then
      print_error "Failed to create config file"
      exit 1
    fi
    print_status "Created $CONFIG_FILE with default settings. Edit as needed and re-run."
    return 0
  fi

  print_status "Reading configuration from $CONFIG_FILE"

  # Validate config file is readable
  if [[ ! -r "$CONFIG_FILE" ]]; then
    print_error "Config file $CONFIG_FILE exists but is not readable"
    exit 1
  fi

  # "|| [[ -n $line ]]" keeps a final line that lacks a trailing newline.
  while IFS= read -r line || [[ -n "$line" ]]; do
    # Skip empty lines and comments
    if [[ -z "$line" || "$line" =~ $comment_re ]]; then
      continue
    fi

    if [[ "$line" =~ $summary_re ]]; then
      raw_value="${BASH_REMATCH[1]}"
      # The length guard avoids arithmetic overflow on absurd inputs;
      # 10# forces base 10 so "010" means ten, not octal eight.
      if ((${#raw_value} <= 4)) && ((10#$raw_value <= MAX_SUMMARY_PAGES)); then
        SUMMARY_PAGES=$((10#$raw_value))
        print_status "  Set SUMMARY_PAGES=$SUMMARY_PAGES"
      else
        print_warning "  Invalid SUMMARY_PAGES value: $raw_value (must be 0-$MAX_SUMMARY_PAGES)"
      fi
    elif [[ "$line" =~ $option_re ]]; then
      print_warning "  Unknown config option: $line"
    fi
  done <"$CONFIG_FILE"
}

#######################################
# Write an example configuration file with the default settings.
# Returns: status of the write to CONFIG_FILE
#######################################
create_example_config() {
  cat >"$CONFIG_FILE" <<'EOF'
# PaperlessImportConfig
# Configuration for neat.com to paperless-ngx migration

# Number of summary pages at the beginning of each PDF
# These will be saved to ImportToERP directory
SUMMARY_PAGES=1

# You can add comments with #
# Examples:
# SUMMARY_PAGES=2 # If your neat exports have 2 summary pages
# SUMMARY_PAGES=0 # If there are no summary pages to separate
EOF
}

#######################################
# Determine the year for a PDF.
# The name of the current directory wins when it is exactly a year;
# otherwise the first 20[1-2]x sequence in the optional filename is used.
# Arguments: $1 - filename to inspect as a fallback (optional)
# Outputs:   the year on stdout
# Returns:   0 when a year within MIN_YEAR..MAX_YEAR was found, 1 otherwise
#######################################
extract_year() {
  local year
  local current_dir="${PWD##*/}"
  local filename="${1:-}"
  local -r dir_re='^(20[1-2][0-9])$'
  local -r name_re='(20[1-2][0-9])'

  # First try to get year from current directory name
  if [[ "$current_dir" =~ $dir_re ]]; then
    year="${BASH_REMATCH[1]}"
    if ((year >= MIN_YEAR && year <= MAX_YEAR)); then
      echo "$year"
      return 0
    fi
  fi

  # Fallback: try to extract from filename if provided
  if [[ -n "$filename" && "$filename" =~ $name_re ]]; then
    year="${BASH_REMATCH[1]}"
    if ((year >= MIN_YEAR && year <= MAX_YEAR)); then
      echo "$year"
      return 0
    fi
  fi

  return 1
}

#######################################
# Create a directory (and parents), reporting failure.
# Arguments: $1 - directory path
# Returns:   0 on success, 1 on failure
#######################################
safe_mkdir() {
  local dir="$1"
  if ! mkdir -p "$dir"; then
    print_error "Failed to create directory: $dir"
    return 1
  fi
}

#######################################
# Print the page count of a PDF as reported by pdfinfo.
# Arguments: $1 - path to the PDF
# Outputs:   positive integer on stdout
# Returns:   0 on success, 1 if the file is missing/unreadable, pdfinfo
#            fails, or the count is not a positive integer
#######################################
get_pdf_pages() {
  local pdf_file="$1"
  local pages

  # Validate file exists and is readable
  if [[ ! -f "$pdf_file" || ! -r "$pdf_file" ]]; then
    print_error "PDF file not found or not readable: $pdf_file"
    return 1
  fi

  # Only the line that starts with "Pages:" counts. awk reads all input
  # (no early exit) so pdfinfo is never killed by SIGPIPE under pipefail.
  if ! pages=$(pdfinfo "$pdf_file" 2>/dev/null |
    awk '/^Pages:/ && !seen { print $2; seen = 1 }'); then
    print_error "Failed to get page information from: $pdf_file"
    return 1
  fi

  # Validate page count is a positive integer
  if [[ ! "$pages" =~ ^[0-9]+$ ]] || ((10#$pages == 0)); then
    print_error "Invalid page count ($pages) for: $pdf_file"
    return 1
  fi

  echo "$pages"
}

#######################################
# Move split pages FIRST..LAST from a temp directory to DEST_DIR, naming
# them <STEM>_NNN.pdf. Page files that do not exist are skipped.
# Numbering continues across calls within one run (one counter per STEM),
# so several input PDFs that map to the same year do not overwrite each
# other's pages.
# Globals:   MOVED_PAGES (written), NEXT_INDEX_<STEM> (read and written)
# Arguments: $1 temp dir, $2 first page, $3 last page, $4 destination dir,
#            $5 stem (e.g. 2023_receipt), $6 label for error messages
# Returns:   0 on success, 1 if a move fails
#######################################
move_page_range() {
  local temp_dir="$1" first="$2" last="$3"
  local dest_dir="$4" stem="$5" label="$6"
  local counter_var="NEXT_INDEX_${stem}"
  local index="${!counter_var:-0}"
  local page page_file output_file

  MOVED_PAGES=0
  for ((page = first; page <= last; page++)); do
    # Paths are passed as %s arguments so a '%' in them cannot be
    # misread as a format directive.
    printf -v page_file '%s/page_%03d.pdf' "$temp_dir" "$page"
    if [[ ! -f "$page_file" ]]; then
      continue
    fi

    index=$((index + 1))
    printf -v output_file '%s/%s_%03d.pdf' "$dest_dir" "$stem" "$index"
    if ! mv -- "$page_file" "$output_file"; then
      print_error "Failed to move $label page: $page_file"
      return 1
    fi
    # Persist the counter for the next PDF that shares this stem.
    printf -v "$counter_var" '%d' "$index"
    MOVED_PAGES=$((MOVED_PAGES + 1))
  done
}

#######################################
# Split one PDF into single pages; the first SUMMARY_PAGES pages go to
# SUMMARY_DIR, the remaining pages to RECEIPTS_DIR.
# Globals:   SUMMARY_PAGES (read), ACTIVE_TEMP_DIR (written)
# Arguments: $1 - path to the PDF
# Returns:   0 on success, 1 when the PDF was skipped or failed
#######################################
process_pdf() {
  local pdf_file="${1:-}"
  local filename year total_pages receipt_pages summary_pages
  local temp_dir
  local summary_count=0 receipt_count=0

  # Validate input
  if [[ -z "$pdf_file" ]]; then
    print_error "process_pdf called with empty filename"
    return 1
  fi

  filename="${pdf_file##*/}"

  # Extract year (from current directory first, then filename)
  if ! year=$(extract_year "$filename"); then
    print_warning "Could not extract year from directory or filename $filename, skipping"
    return 1
  fi

  print_status "Processing $filename (Year: $year)"

  # get_pdf_pages only succeeds with a positive page count.
  if ! total_pages=$(get_pdf_pages "$pdf_file"); then
    return 1
  fi

  print_status "  Total pages: $total_pages"

  # A PDF shorter than SUMMARY_PAGES consists of summary pages only; clamp
  # so the receipt count never goes negative.
  summary_pages=$SUMMARY_PAGES
  if ((summary_pages > total_pages)); then
    print_warning "  PDF has only $total_pages pages but SUMMARY_PAGES=$SUMMARY_PAGES; treating all pages as summary pages"
    summary_pages=$total_pages
  fi
  receipt_pages=$((total_pages - summary_pages))

  print_status "  Summary pages: $summary_pages, Receipt pages: $receipt_pages"

  # Output directories are directly in current directory (no year subdirs needed)
  if ! safe_mkdir "$RECEIPTS_DIR" || ! safe_mkdir "$SUMMARY_DIR"; then
    return 1
  fi

  # Create secure temporary directory and register it for the exit trap
  if ! temp_dir=$(mktemp -d -t "${TEMP_PREFIX}.XXXXXXXXXX"); then
    print_error "Failed to create temporary directory"
    return 1
  fi
  ACTIVE_TEMP_DIR="$temp_dir"

  # Split PDF into individual pages. pdfseparate's stderr is discarded on
  # purpose; failure is reported through its exit status.
  print_status "  Splitting PDF..."
  if ! pdfseparate "$pdf_file" "$temp_dir/page_%03d.pdf" 2>/dev/null; then
    print_error "Failed to split PDF: $pdf_file"
    cleanup
    return 1
  fi

  # Process summary pages (first summary_pages pages)
  if ! move_page_range "$temp_dir" 1 "$summary_pages" \
    "$SUMMARY_DIR" "${year}_summary" "summary"; then
    cleanup
    return 1
  fi
  summary_count=$MOVED_PAGES

  # Process receipt pages (remaining pages after summary pages)
  if ! move_page_range "$temp_dir" "$((summary_pages + 1))" "$total_pages" \
    "$RECEIPTS_DIR" "${year}_receipt" "receipt"; then
    cleanup
    return 1
  fi
  receipt_count=$MOVED_PAGES

  # Clean up temp directory
  cleanup

  print_status "  ✓ Processed $summary_count summary pages and $receipt_count receipts for $year"
  return 0
}

#######################################
# Print up to ten lines of `ls -la` for a directory.
# The listing is captured before it is truncated, so a long listing cannot
# fail the pipeline under pipefail and trigger the fallback message.
# Arguments: $1 - directory path (nothing is printed if it is not a directory)
#######################################
show_directory_preview() {
  local dir="$1"
  local listing

  if [[ ! -d "$dir" ]]; then
    return 0
  fi

  if listing=$(ls -la "$dir"/ 2>/dev/null); then
    head -n 10 <<<"$listing"
  else
    echo "  (empty or inaccessible)"
  fi
}

#######################################
# Main execution function: read config, check tools and directories, then
# process every PDF directly inside SOURCE_DIR and print a summary.
#######################################
main() {
  local pdf_files=() processed=0 skipped=0
  local file pdf_file

  echo "=== neat.com to paperless-ngx Migration Script ==="
  echo

  # Read configuration with error handling
  if ! read_config; then
    print_error "Failed to read configuration"
    exit 1
  fi
  echo

  # Check dependencies
  check_dependencies

  # Validate and create directories
  validate_directories

  # Collect PDFs (any case of the .pdf extension), NUL-delimited. Sorting
  # makes the output numbering reproducible between runs.
  while IFS= read -r -d '' file; do
    pdf_files+=("$file")
  done < <(find "$SOURCE_DIR" -maxdepth 1 -type f -iname '*.pdf' -print0 \
    2>/dev/null | LC_ALL=C sort -z)

  if [[ ${#pdf_files[@]} -eq 0 ]]; then
    print_error "No PDF files found in $SOURCE_DIR"
    exit 1
  fi

  print_status "Found ${#pdf_files[@]} PDF files to process"
  echo

  # Process each PDF with error tracking
  for pdf_file in "${pdf_files[@]}"; do
    if process_pdf "$pdf_file"; then
      processed=$((processed + 1))
    else
      skipped=$((skipped + 1))
    fi
    echo
  done

  # Summary
  echo "=== Processing Complete ==="
  print_status "Processed: $processed PDFs"
  if [[ "$skipped" -gt 0 ]]; then
    print_warning "Skipped: $skipped PDFs"
  fi

  echo
  print_status "Receipt files directory: $RECEIPTS_DIR"
  print_status "Summary files directory: $SUMMARY_DIR"
  print_status "Summary pages (for ERP): YYYY_summary_*.pdf files"
  print_status "Receipt pages (for paperless-ngx): YYYY_receipt_*.pdf files"

  # Show directory structure
  echo
  echo "Directory structure created:"
  echo "Receipts:"
  show_directory_preview "$RECEIPTS_DIR"
  echo "Summaries:"
  show_directory_preview "$SUMMARY_DIR"
}

# Run main function with error handling. Note: because main runs inside an
# `if` condition, errexit is suspended within it; every critical step above
# checks its own status explicitly.
if ! main "$@"; then
  print_error "Script execution failed"
  exit 1
fi