Moved docs to docs and added a Claude-generated script for the neat.com to paperless migration

This commit is contained in:
2025-08-01 12:01:44 -05:00
parent ada8f2502a
commit 2d62adfaaa
2 changed files with 389 additions and 0 deletions

View File

@@ -0,0 +1,389 @@
#!/bin/bash
# neat.com to paperless-ngx migration script
#
# Splits yearly PDFs into individual pages organized by year.
# Saves summary pages separately for ERP system integration.
# Reads configuration from the PaperlessImportConfig file.
# Designed to run from within each year directory (receipts/YYYY/).

set -euo pipefail # Exit on error, undefined vars, pipe failures
IFS=$'\n\t'       # Word-split on newline/tab only, never on spaces

# Default configuration (all paths are relative to the current directory)
readonly SOURCE_DIR="./FromNeat"
readonly RECEIPTS_DIR="./ImportToPaperless"
readonly SUMMARY_DIR="./ImportToERP"
readonly CONFIG_FILE="./PaperlessImportConfig"
readonly TEMP_PREFIX="neat_migration"

# Global variables (may be overridden by the config file)
SUMMARY_PAGES=1

# Colors for output. All diagnostics go to stderr, so only emit ANSI escape
# sequences when stderr is a terminal and the user has not set NO_COLOR;
# otherwise redirected logs would be littered with raw escape codes.
if [[ -t 2 && -z "${NO_COLOR:-}" ]]; then
  readonly RED='\033[0;31m'
  readonly GREEN='\033[0;32m'
  readonly YELLOW='\033[1;33m'
  readonly NC='\033[0m' # No Color
else
  readonly RED='' GREEN='' YELLOW='' NC=''
fi
# Cleanup function for temporary files
#
# Removes the directory named by the variable temp_dir (if that variable is
# set and names an existing directory), then exits with the status the shell
# was exiting with when the handler was entered.
cleanup() {
  local exit_code=$?
  # Disarm the traps first so the final `exit` cannot re-enter this handler.
  trap - EXIT INT TERM
  if [[ -n "${temp_dir:-}" ]] && [[ -d "$temp_dir" ]]; then
    rm -rf -- "$temp_dir"
  fi
  exit "$exit_code"
}

# Set traps for cleanup. INT/TERM are turned into a normal exit with the
# conventional 128+signal status; that exit then runs cleanup via the EXIT
# trap, so an interrupted run never reports success.
trap cleanup EXIT
trap 'exit 130' INT
trap 'exit 143' TERM
# Function to print colored output
#
# Writes an informational message to stderr, prefixed with a [INFO] tag
# wrapped in the GREEN / NC color sequences.
# Arguments: $1 - message text; printed verbatim, so backslashes in e.g.
#                 file names are not interpreted as escape sequences
print_status() {
  # %b expands the escape sequences stored in the color variables only.
  printf '%b[INFO]%b %s\n' "${GREEN}" "${NC}" "$1" >&2
}
# Writes a warning message to stderr, prefixed with a [WARN] tag wrapped in
# the YELLOW / NC color sequences.
# Arguments: $1 - message text; printed verbatim, so backslashes in e.g.
#                 file names are not interpreted as escape sequences
print_warning() {
  # %b expands the escape sequences stored in the color variables only.
  printf '%b[WARN]%b %s\n' "${YELLOW}" "${NC}" "$1" >&2
}
# Writes an error message to stderr, prefixed with an [ERROR] tag wrapped in
# the RED / NC color sequences.
# Arguments: $1 - message text; printed verbatim, so backslashes in e.g.
#                 file names are not interpreted as escape sequences
print_error() {
  # %b expands the escape sequences stored in the color variables only.
  printf '%b[ERROR]%b %s\n' "${RED}" "${NC}" "$1" >&2
}
# Function to validate dependencies
#
# Verifies that the external tools this script calls (pdfseparate, pdfinfo)
# are available on PATH. Reports every missing tool in one message and exits
# the script with status 1 if any is absent.
check_dependencies() {
  local tool
  local missing_deps=()

  for tool in pdfseparate pdfinfo; do
    command -v "$tool" >/dev/null 2>&1 || missing_deps+=("$tool")
  done

  if [[ ${#missing_deps[@]} -gt 0 ]]; then
    # "${array[*]}" joins on the first character of IFS. The script sets IFS
    # to newline/tab, so force a space here to keep the message on one line.
    local IFS=' '
    print_error "Missing required dependencies: ${missing_deps[*]}"
    print_error "Install with: sudo apt-get install poppler-utils"
    exit 1
  fi
}
# Function to validate directory structure
#
# Requires SOURCE_DIR to exist already, then makes sure RECEIPTS_DIR and
# SUMMARY_DIR exist (creating them when needed). Exits the script with
# status 1 if the source directory is missing or the outputs cannot be made.
validate_directories() {
  [[ -d "$SOURCE_DIR" ]] || {
    print_error "Source directory '$SOURCE_DIR' not found!"
    print_error "Please create it and place your neat PDF files there."
    exit 1
  }

  # Both output directories are created in a single mkdir call.
  mkdir -p "$RECEIPTS_DIR" "$SUMMARY_DIR" && return 0

  print_error "Failed to create output directories"
  exit 1
}
# Function to safely read configuration file
#
# Parses CONFIG_FILE line by line (the file is never sourced or executed).
# The only recognized setting is SUMMARY_PAGES=<0-10>; it may be followed by
# a trailing "# comment". Blank lines and comment lines are skipped; other
# lines that start with an upper-case name produce a warning.
# If CONFIG_FILE does not exist, an example file is created and the current
# default is kept.
# Globals:  CONFIG_FILE (read), SUMMARY_PAGES (written)
# Returns:  0 on success; exits 1 if the file is unreadable or cannot be created
read_config() {
  local line raw_pages
  local -r pages_re='^[[:space:]]*SUMMARY_PAGES[[:space:]]*=[[:space:]]*([0-9]+)[[:space:]]*(#.*)?$'

  if [[ ! -f "$CONFIG_FILE" ]]; then
    print_warning "Config file $CONFIG_FILE not found, using defaults (SUMMARY_PAGES=$SUMMARY_PAGES)"
    print_status "Creating example config file..."
    if ! create_example_config; then
      print_error "Failed to create config file"
      exit 1
    fi
    print_status "Created $CONFIG_FILE with default settings. Edit as needed and re-run."
    return 0
  fi

  print_status "Reading configuration from $CONFIG_FILE"
  # Validate config file is readable
  if [[ ! -r "$CONFIG_FILE" ]]; then
    print_error "Config file $CONFIG_FILE exists but is not readable"
    exit 1
  fi

  # `|| [[ -n "$line" ]]` keeps a final line that lacks a trailing newline.
  while IFS= read -r line || [[ -n "$line" ]]; do
    # Skip empty lines and comments
    [[ -z "$line" || "$line" =~ ^[[:space:]]*# ]] && continue

    if [[ "$line" =~ $pages_re ]]; then
      raw_pages="${BASH_REMATCH[1]}"
      # 10# forces base 10 so "08" is eight rather than an invalid octal
      # literal; the length guard keeps absurdly long digit strings from
      # overflowing shell arithmetic.
      if (( ${#raw_pages} <= 18 )) && (( 10#$raw_pages <= 10 )); then
        SUMMARY_PAGES=$((10#$raw_pages))
        print_status " Set SUMMARY_PAGES=$SUMMARY_PAGES"
      else
        print_warning " Invalid SUMMARY_PAGES value: $raw_pages (must be 0-10)"
      fi
    elif [[ "$line" =~ ^[[:space:]]*[A-Z_]+ ]]; then
      print_warning " Unknown config option: $line"
    fi
  done < "$CONFIG_FILE"

  return 0
}
# Function to create example configuration file
#
# Writes a commented template to CONFIG_FILE, replacing any existing content.
# The only active setting in the template is SUMMARY_PAGES=1.
# Returns: status of the write to CONFIG_FILE
create_example_config() {
  # One array element per line of the generated file; the text is literal.
  local -a template_lines=(
    '# PaperlessImportConfig'
    '# Configuration for neat.com to paperless-ngx migration'
    '# Number of summary pages at the beginning of each PDF'
    '# These will be saved to ImportToERP directory'
    'SUMMARY_PAGES=1'
    '# You can add comments with #'
    '# Examples:'
    '# SUMMARY_PAGES=2 # If your neat exports have 2 summary pages'
    '# SUMMARY_PAGES=0 # If there are no summary pages to separate'
  )
  printf '%s\n' "${template_lines[@]}" > "$CONFIG_FILE"
}
# Function to extract year from current directory or filename
#
# Prints a four-digit year taken from, in order of preference:
#   1. the name of the current working directory, if it is exactly a year;
#   2. the first year-like substring of the optional file name in $1.
# Only years matching 20[1-2][0-9] are recognized, and the value must lie
# between 2011 and the upper bound. The upper bound is NEAT_MAX_YEAR when
# that is set to a four-digit number; otherwise it is the current calendar
# year, but never less than 2025.
# Arguments: $1 - (optional) file name used as a fallback source
# Outputs:   the year on stdout
# Returns:   0 if a year was found, 1 otherwise
extract_year() {
  local -r min_year=2011
  local max_year="${NEAT_MAX_YEAR:-}"
  local current_dir candidate
  local -a candidates=()

  if [[ ! "$max_year" =~ ^[0-9]{4}$ ]]; then
    # No usable override: follow the clock so future year directories keep
    # working without editing the script.
    max_year=$(date +%Y 2>/dev/null) || max_year=2025
    if [[ ! "$max_year" =~ ^[0-9]{4}$ ]] || (( max_year < 2025 )); then
      max_year=2025
    fi
  fi

  # First try to get year from current directory name
  current_dir=$(basename "$PWD")
  if [[ "$current_dir" =~ ^(20[1-2][0-9])$ ]]; then
    candidates+=("${BASH_REMATCH[1]}")
  fi

  # Fallback: try to extract from filename if provided
  if [[ -n "${1:-}" && "$1" =~ (20[1-2][0-9]) ]]; then
    candidates+=("${BASH_REMATCH[1]}")
  fi

  for candidate in "${candidates[@]+"${candidates[@]}"}"; do
    if (( candidate >= min_year && candidate <= max_year )); then
      echo "$candidate"
      return 0
    fi
  done
  return 1
}
# Function to safely create directories
#
# Creates the given directory, including missing parents, and reports a
# failure through print_error.
# Arguments: $1 - directory path to create
# Returns:   0 on success, 1 if mkdir failed
safe_mkdir() {
  local target_path="$1"

  mkdir -p "$target_path" && return 0

  print_error "Failed to create directory: $target_path"
  return 1
}
# Function to get PDF page count with validation
#
# Runs pdfinfo on the given file and prints its page count.
# Arguments: $1 - path to the PDF file
# Outputs:   the page count (a positive integer) on stdout
# Returns:   0 on success; 1 if the file is missing or unreadable, pdfinfo
#            fails, or no valid positive page count is reported
get_pdf_pages() {
  local pdf_file="$1"
  local pages

  # Validate file exists and is readable
  if [[ ! -f "$pdf_file" ]] || [[ ! -r "$pdf_file" ]]; then
    print_error "PDF file not found or not readable: $pdf_file"
    return 1
  fi

  # Match on the first field so that other metadata lines which merely
  # contain "Pages:" (a document title, for instance) are ignored. awk keeps
  # reading to end of input on purpose: leaving early could hand pdfinfo a
  # SIGPIPE, which `set -o pipefail` would turn into a failure.
  if ! pages=$(pdfinfo "$pdf_file" 2>/dev/null \
    | awk '$1 == "Pages:" && !seen++ { print $2 }'); then
    print_error "Failed to get page information from: $pdf_file"
    return 1
  fi

  # Validate page count is a positive integer
  if [[ ! "$pages" =~ ^[0-9]+$ ]] || [[ "$pages" -eq 0 ]]; then
    print_error "Invalid page count ($pages) for: $pdf_file"
    return 1
  fi

  echo "$pages"
}
# Function to process a single PDF with comprehensive error handling
#
# Splits the given PDF into single-page files, then moves the first
# SUMMARY_PAGES pages into SUMMARY_DIR as <year>_summary_NNN.pdf and the
# remaining pages into RECEIPTS_DIR as <year>_receipt_NNN.pdf.
# Existing output files are never overwritten: each page takes the next
# unused sequence number, so several source PDFs that resolve to the same
# year can share the output directories without clobbering each other.
# Globals:   SUMMARY_PAGES, SUMMARY_DIR, RECEIPTS_DIR, TEMP_PREFIX (read)
# Arguments: $1 - path to the PDF to split
# Returns:   0 on success, 1 if the PDF was skipped or any step failed
process_pdf() {
  local pdf_file="$1"
  local filename year total_pages receipt_pages
  # cleanup() looks for a variable with exactly this name; do not rename it.
  local temp_dir
  local summary_count=0 receipt_count=0
  # Last sequence number used in each output directory during this call.
  local summary_seq=0 receipt_seq=0
  local i kind dest_dir seq page_file output_file

  # Validate input
  [[ -n "$pdf_file" ]] || {
    print_error "process_pdf called with empty filename"
    return 1
  }
  filename=$(basename "$pdf_file")

  # Extract year (from current directory first, then filename)
  if ! year=$(extract_year "$filename"); then
    print_warning "Could not extract year from directory or filename $filename, skipping"
    return 1
  fi
  print_status "Processing $filename (Year: $year)"

  # Get total pages with error handling
  if ! total_pages=$(get_pdf_pages "$pdf_file"); then
    return 1
  fi
  print_status " Total pages: $total_pages"
  if [[ "$total_pages" -le 0 ]]; then
    print_warning " No pages found in PDF"
    return 1
  fi

  # Calculate summary and receipt pages; a PDF shorter than SUMMARY_PAGES has
  # no receipts at all rather than a negative number of them.
  receipt_pages=$((total_pages - SUMMARY_PAGES))
  if (( receipt_pages < 0 )); then
    print_warning " PDF has only $total_pages pages but SUMMARY_PAGES=$SUMMARY_PAGES; treating every page as a summary page"
    receipt_pages=0
  fi
  print_status " Summary pages: $SUMMARY_PAGES, Receipt pages: $receipt_pages"

  # Output directories are directly in current directory (no year subdirs needed)
  if ! safe_mkdir "$RECEIPTS_DIR" || ! safe_mkdir "$SUMMARY_DIR"; then
    return 1
  fi

  # Create secure temporary directory
  if ! temp_dir=$(mktemp -d -t "${TEMP_PREFIX}.XXXXXXXXXX"); then
    print_error "Failed to create temporary directory"
    return 1
  fi

  # Split PDF into individual pages with error handling
  print_status " Splitting PDF..."
  if ! pdfseparate "$pdf_file" "$temp_dir/page_%03d.pdf" 2>/dev/null; then
    print_error "Failed to split PDF: $pdf_file"
    rm -rf "$temp_dir"
    return 1
  fi

  # Walk the pages in order: the first SUMMARY_PAGES are summaries, the rest
  # are receipts.
  for (( i = 1; i <= total_pages; i++ )); do
    # Paths are passed as %s arguments so a '%' in a directory name cannot be
    # misread as a printf conversion.
    printf -v page_file '%s/page_%03d.pdf' "$temp_dir" "$i"
    [[ -f "$page_file" ]] || continue

    if (( i <= SUMMARY_PAGES )); then
      kind="summary"
      dest_dir="$SUMMARY_DIR"
      seq=$summary_seq
    else
      kind="receipt"
      dest_dir="$RECEIPTS_DIR"
      seq=$receipt_seq
    fi

    # Advance to the first sequence number not already present on disk.
    while :; do
      seq=$((seq + 1))
      printf -v output_file '%s/%s_%s_%03d.pdf' "$dest_dir" "$year" "$kind" "$seq"
      [[ -e "$output_file" ]] || break
    done

    if ! mv "$page_file" "$output_file"; then
      print_error "Failed to move $kind page: $page_file"
      rm -rf "$temp_dir"
      return 1
    fi

    if [[ "$kind" == "summary" ]]; then
      summary_seq=$seq
      summary_count=$((summary_count + 1))
    else
      receipt_seq=$seq
      receipt_count=$((receipt_count + 1))
    fi
  done

  # Clean up temp directory
  rm -rf "$temp_dir"
  print_status " ✓ Processed $summary_count summary pages and $receipt_count receipts for $year"
  return 0
}
# Main execution function
#
# Reads the configuration, checks dependencies and directories, then runs
# process_pdf on every PDF found directly inside SOURCE_DIR and prints a
# summary. PDFs are matched case-insensitively (*.pdf, *.PDF, ...) and are
# processed in byte-wise sorted path order so that runs are reproducible.
# Exits with status 1 if the configuration cannot be read or no PDF is found.
main() {
  local pdf_files=() processed=0 skipped=0
  local file pdf_file

  echo "=== neat.com to paperless-ngx Migration Script ==="
  echo

  # Read configuration with error handling
  if ! read_config; then
    print_error "Failed to read configuration"
    exit 1
  fi
  echo

  # Check dependencies
  check_dependencies

  # Validate and create directories
  validate_directories

  # Find PDF files. Names are NUL-delimited end to end, so paths containing
  # spaces or newlines survive; find's own order is arbitrary, hence the sort.
  while IFS= read -r -d '' file; do
    pdf_files+=("$file")
  done < <(find "$SOURCE_DIR" -maxdepth 1 -type f -iname "*.pdf" -print0 2>/dev/null \
    | LC_ALL=C sort -z)

  if [[ ${#pdf_files[@]} -eq 0 ]]; then
    print_error "No PDF files found in $SOURCE_DIR"
    exit 1
  fi
  print_status "Found ${#pdf_files[@]} PDF files to process"
  echo

  # Process each PDF with error tracking
  for pdf_file in "${pdf_files[@]}"; do
    if process_pdf "$pdf_file"; then
      processed=$((processed + 1))
    else
      skipped=$((skipped + 1))
    fi
    echo
  done

  # Summary with validation
  echo "=== Processing Complete ==="
  print_status "Processed: $processed PDFs"
  if [[ "$skipped" -gt 0 ]]; then
    print_warning "Skipped: $skipped PDFs"
  fi
  echo
  print_status "Receipt files directory: $RECEIPTS_DIR"
  print_status "Summary files directory: $SUMMARY_DIR"
  print_status "Summary pages (for ERP): YYYY_summary_*.pdf files"
  print_status "Receipt pages (for paperless-ngx): YYYY_receipt_*.pdf files"

  # Show directory structure with error handling (first 10 lines of each)
  echo
  echo "Directory structure created:"
  echo "Receipts:"
  if [[ -d "$RECEIPTS_DIR" ]]; then
    ls -la "$RECEIPTS_DIR"/ 2>/dev/null | head -10 || echo " (empty or inaccessible)"
  fi
  echo "Summaries:"
  if [[ -d "$SUMMARY_DIR" ]]; then
    ls -la "$SUMMARY_DIR"/ 2>/dev/null | head -10 || echo " (empty or inaccessible)"
  fi
}
# Entry point: forward the script's arguments to main and turn a non-zero
# return into an error message plus exit status 1.
main "$@" || {
  print_error "Script execution failed"
  exit 1
}