Moved docs to docs and added a Claude-generated script for neat.com to paperless-ngx migration
This commit is contained in:
389
utilities/NeatToPaperless.sh
Normal file
389
utilities/NeatToPaperless.sh
Normal file
@@ -0,0 +1,389 @@
|
|||||||
|
#!/bin/bash
#
# neat.com to paperless-ngx migration script
#
# Splits each PDF found in ./FromNeat into single-page PDF files whose
# names are prefixed with the year the export belongs to.
#   - The first SUMMARY_PAGES pages go to ./ImportToERP      (YYYY_summary_NNN.pdf)
#   - All remaining pages go to        ./ImportToPaperless   (YYYY_receipt_NNN.pdf)
# Reads configuration from the PaperlessImportConfig file.
# Designed to run from within each year directory (receipts/YYYY/).

set -euo pipefail # Exit on error, undefined vars, pipe failures
IFS=$'\n\t'       # Word-split on newline/tab only, never on spaces

# Default configuration (all paths are relative to the current directory)
readonly SOURCE_DIR="./FromNeat"
readonly RECEIPTS_DIR="./ImportToPaperless"
readonly SUMMARY_DIR="./ImportToERP"
readonly CONFIG_FILE="./PaperlessImportConfig"
readonly TEMP_PREFIX="neat_migration"

# Global variables (may be overridden by the config file)
SUMMARY_PAGES=1

# ANSI colour escape sequences used by the print_* helpers
readonly RED='\033[0;31m'
readonly GREEN='\033[0;32m'
readonly YELLOW='\033[1;33m'
readonly NC='\033[0m' # No Color
|
||||||
|
|
||||||
|
# Cleanup function for temporary files
#
# Removes the directory named by $temp_dir (if that variable is visible and
# points at an existing directory) and then exits with the status that was
# current when the function was entered.
# NOTE(review): nothing in this block sets temp_dir; it relies on whichever
# variable of that name is in scope when the trap fires - confirm against the
# function that creates the temporary directory.
# Globals: temp_dir (read, optional)
cleanup() {
  local exit_code=$?

  # Disarm the traps first so the `exit` below cannot re-enter this handler.
  trap - EXIT INT TERM

  if [[ -n "${temp_dir:-}" && -d "$temp_dir" ]]; then
    rm -rf -- "$temp_dir"
  fi

  exit "$exit_code"
}

# Set trap for cleanup.
# Signals get their own handlers: they turn the signal into a conventional
# 128+N exit status, and the resulting exit runs cleanup exactly once via the
# EXIT trap. Sharing one handler made the script exit with the status of
# whatever command happened to finish last (possibly 0) after Ctrl-C.
trap cleanup EXIT
trap 'exit 130' INT
trap 'exit 143' TERM
|
||||||
|
|
||||||
|
# Function to print colored output
#
# Writes an informational message to stderr, prefixed with a green [INFO] tag.
# Only the colour variables are escape-expanded (%b); the message itself is
# printed verbatim (%s) so backslashes in file names are not interpreted.
# Globals:   GREEN, NC (read)
# Arguments: $1 - message text
print_status() {
  printf '%b[INFO]%b %s\n' "$GREEN" "$NC" "${1:-}" >&2
}
|
||||||
|
|
||||||
|
# Writes a warning message to stderr, prefixed with a yellow [WARN] tag.
# Only the colour variables are escape-expanded (%b); the message itself is
# printed verbatim (%s) so backslashes in file names are not interpreted.
# Globals:   YELLOW, NC (read)
# Arguments: $1 - message text
print_warning() {
  printf '%b[WARN]%b %s\n' "$YELLOW" "$NC" "${1:-}" >&2
}
|
||||||
|
|
||||||
|
# Writes an error message to stderr, prefixed with a red [ERROR] tag.
# Only the colour variables are escape-expanded (%b); the message itself is
# printed verbatim (%s) so backslashes in file names are not interpreted.
# Globals:   RED, NC (read)
# Arguments: $1 - message text
print_error() {
  printf '%b[ERROR]%b %s\n' "$RED" "$NC" "${1:-}" >&2
}
|
||||||
|
|
||||||
|
# Function to validate dependencies
#
# Verifies that the external tools this script calls (pdfseparate, pdfinfo)
# can be found. If any is missing, reports all of them through print_error
# and terminates the script with status 1.
check_dependencies() {
  local -a missing_deps=()
  local tool

  for tool in pdfseparate pdfinfo; do
    if ! command -v "$tool" >/dev/null 2>&1; then
      missing_deps+=("$tool")
    fi
  done

  if (( ${#missing_deps[@]} > 0 )); then
    # "${arr[*]}" joins with the first character of IFS. Pin it to a space
    # here so the list stays on one line even when the caller's IFS starts
    # with a newline.
    local IFS=' '
    print_error "Missing required dependencies: ${missing_deps[*]}"
    print_error "Install with: sudo apt-get install poppler-utils"
    exit 1
  fi
}
|
||||||
|
|
||||||
|
# Function to validate directory structure
#
# Requires SOURCE_DIR to exist already, then creates RECEIPTS_DIR and
# SUMMARY_DIR if necessary. Any failure is reported through print_error and
# terminates the script with status 1.
# Globals: SOURCE_DIR, RECEIPTS_DIR, SUMMARY_DIR (read)
validate_directories() {
  if [[ ! -d "$SOURCE_DIR" ]]; then
    print_error "Source directory '$SOURCE_DIR' not found!"
    print_error "Please create it and place your neat PDF files there."
    exit 1
  fi

  # Create output directories with error checking
  if ! mkdir -p -- "$RECEIPTS_DIR" "$SUMMARY_DIR"; then
    print_error "Failed to create output directories"
    exit 1
  fi
}
|
||||||
|
|
||||||
|
# Function to safely read configuration file
#
# Parses CONFIG_FILE line by line (the file is never sourced or eval'ed).
# The only recognised setting is
#     SUMMARY_PAGES=<integer 0-10>      (optionally followed by "# comment")
# Blank lines and lines starting with '#' are ignored; other lines that start
# with capital letters or underscores produce an "unknown option" warning.
# When CONFIG_FILE does not exist, an example file is generated through
# create_example_config and the current defaults stay in effect.
# Globals: CONFIG_FILE (read), SUMMARY_PAGES (read/written)
# Exits:   1 if the file exists but is unreadable, or cannot be created
read_config() {
  # Regexes live in variables so the shell never tries to parse '#', '(' or
  # ')' inside [[ ... =~ ... ]].
  local -r comment_re='^[[:space:]]*#'
  local -r summary_re='^[[:space:]]*SUMMARY_PAGES[[:space:]]*=[[:space:]]*([0-9]+)[[:space:]]*(#.*)?$'
  local -r option_re='^[[:space:]]*[A-Z_]+'
  local line raw_value value

  if [[ ! -f "$CONFIG_FILE" ]]; then
    print_warning "Config file $CONFIG_FILE not found, using defaults (SUMMARY_PAGES=$SUMMARY_PAGES)"
    print_status "Creating example config file..."

    if ! create_example_config; then
      print_error "Failed to create config file"
      exit 1
    fi

    print_status "Created $CONFIG_FILE with default settings. Edit as needed and re-run."
    return 0
  fi

  print_status "Reading configuration from $CONFIG_FILE"

  # Validate config file is readable
  if [[ ! -r "$CONFIG_FILE" ]]; then
    print_error "Config file $CONFIG_FILE exists but is not readable"
    exit 1
  fi

  # `|| [[ -n "$line" ]]` keeps a final line that has no trailing newline.
  while IFS= read -r line || [[ -n "$line" ]]; do
    # Drop a trailing CR so CRLF files do not leak it into messages.
    line="${line%$'\r'}"

    # Skip empty lines and comments
    if [[ -z "$line" || "$line" =~ $comment_re ]]; then
      continue
    fi

    if [[ "$line" =~ $summary_re ]]; then
      raw_value="${BASH_REMATCH[1]}"

      # Strip leading zeros so "08" is not treated as invalid octal.
      value="${raw_value#"${raw_value%%[!0]*}"}"
      value="${value:-0}"

      # The length check keeps absurdly long digit strings out of arithmetic.
      if (( ${#value} <= 2 && value <= 10 )); then
        SUMMARY_PAGES="$value"
        print_status " Set SUMMARY_PAGES=$SUMMARY_PAGES"
      else
        print_warning " Invalid SUMMARY_PAGES value: $raw_value (must be 0-10)"
      fi
    elif [[ "$line" =~ $option_re ]]; then
      print_warning " Unknown config option: $line"
    fi
  done < "$CONFIG_FILE"

  return 0
}
|
||||||
|
|
||||||
|
# Function to create example configuration file
#
# Writes a commented template to CONFIG_FILE, replacing any existing content.
# The heredoc delimiter is quoted, so the text is written literally with no
# variable or command expansion.
# Globals: CONFIG_FILE (read)
# Returns: exit status of the write
create_example_config() {
  cat > "$CONFIG_FILE" << 'EOF'
# PaperlessImportConfig
# Configuration for neat.com to paperless-ngx migration

# Number of summary pages at the beginning of each PDF
# These will be saved to ImportToERP directory
SUMMARY_PAGES=1

# You can add comments with #
# Examples:
# SUMMARY_PAGES=2 # If your neat exports have 2 summary pages
# SUMMARY_PAGES=0 # If there are no summary pages to separate
EOF
}
|
||||||
|
|
||||||
|
# Function to extract year from current directory or filename
#
# Candidates are tried in this order:
#   1. the name of the current directory, if it is exactly a 4-digit year
#   2. the first 4-digit year found anywhere in $1 (optional file name)
# A candidate is accepted when it lies between 2011 and the later of 2025 and
# the current calendar year. The upper bound follows the clock so that newer
# exports are not rejected once 2025 has passed.
# NOTE: the patterns only recognise 2010-2029, so that is the hard limit.
# Arguments: $1 - file name to inspect as a fallback (optional)
# Outputs:   the accepted year on stdout
# Returns:   0 if a year was found, 1 otherwise
extract_year() {
  local -r min_year=2011
  local max_year=2025
  local this_year candidate
  local dir_year="" file_year=""

  if this_year=$(date +%Y 2>/dev/null) \
    && [[ "$this_year" =~ ^[1-9][0-9]{3}$ ]] \
    && (( this_year > max_year )); then
    max_year="$this_year"
  fi

  # First try to get year from current directory name
  if [[ "${PWD##*/}" =~ ^(20[1-2][0-9])$ ]]; then
    dir_year="${BASH_REMATCH[1]}"
  fi

  # Fallback: try to extract from filename if provided
  if [[ "${1:-}" =~ (20[1-2][0-9]) ]]; then
    file_year="${BASH_REMATCH[1]}"
  fi

  for candidate in "$dir_year" "$file_year"; do
    if [[ -n "$candidate" ]] \
      && (( candidate >= min_year && candidate <= max_year )); then
      printf '%s\n' "$candidate"
      return 0
    fi
  done

  return 1
}
|
||||||
|
|
||||||
|
# Function to safely create directories
#
# Creates the directory and any missing parents; an existing directory is not
# an error. `--` stops option parsing so names beginning with '-' work.
# Arguments: $1 - directory path
# Returns:   0 on success; 1 (after reporting via print_error) on failure
safe_mkdir() {
  local dir="${1:-}"

  if ! mkdir -p -- "$dir"; then
    print_error "Failed to create directory: $dir"
    return 1
  fi
}
|
||||||
|
|
||||||
|
# Function to get PDF page count with validation
#
# Runs pdfinfo on the file and extracts the number from the line that STARTS
# with "Pages:". Anchoring matters: other metadata lines (e.g. a Title that
# contains the text "Pages:") must not be mistaken for the page count.
# Arguments: $1 - path to the PDF file
# Outputs:   the page count (a positive integer) on stdout
# Returns:   0 on success; 1 (after reporting via print_error) if the file is
#            missing/unreadable, pdfinfo fails, or no positive count is found
get_pdf_pages() {
  local pdf_file="${1:-}"
  local info line
  local pages=""

  # Validate file exists and is readable
  if [[ ! -f "$pdf_file" || ! -r "$pdf_file" ]]; then
    print_error "PDF file not found or not readable: $pdf_file"
    return 1
  fi

  # Capture pdfinfo's output on its own so its exit status is checked
  # directly instead of depending on `set -o pipefail`.
  if ! info=$(pdfinfo "$pdf_file" 2>/dev/null); then
    print_error "Failed to get page information from: $pdf_file"
    return 1
  fi

  while IFS= read -r line; do
    if [[ "$line" =~ ^Pages:[[:space:]]+([0-9]+) ]]; then
      pages="${BASH_REMATCH[1]}"
      break
    fi
  done <<< "$info"

  # Validate page count is a positive integer (10# forces decimal)
  if [[ ! "$pages" =~ ^[0-9]+$ ]] || (( 10#$pages == 0 )); then
    print_error "Invalid page count ($pages) for: $pdf_file"
    return 1
  fi

  echo "$pages"
}
|
||||||
|
|
||||||
|
# Function to process a single PDF with comprehensive error handling
#
# Splits the PDF into one file per page inside a temporary directory, then
# moves the first SUMMARY_PAGES pages to SUMMARY_DIR as
# <year>_summary_NNN.pdf and every later page to RECEIPTS_DIR as
# <year>_receipt_NNN.pdf.
#
# NNN is the lowest number that is not already taken in the destination, so
# pages from a second PDF of the same year (or from a re-run) are appended
# after the existing files instead of overwriting them.
#
# Globals:   SUMMARY_PAGES, RECEIPTS_DIR, SUMMARY_DIR, TEMP_PREFIX (read)
# Arguments: $1 - path to the PDF to split
# Returns:   0 on success, 1 if the file was skipped or any step failed
process_pdf() {
  local pdf_file="${1:-}"
  local filename year total_pages receipt_pages
  local temp_dir
  local summary_count=0 receipt_count=0
  # Highest sequence number used so far in each destination during this call
  local summary_slot=0 receipt_slot=0
  local i page_file output_file dest_dir kind slot

  # Validate input
  if [[ -z "$pdf_file" ]]; then
    print_error "process_pdf called with empty filename"
    return 1
  fi

  filename=$(basename -- "$pdf_file")

  # Extract year (from current directory first, then filename)
  if ! year=$(extract_year "$filename"); then
    print_warning "Could not extract year from directory or filename $filename, skipping"
    return 1
  fi

  print_status "Processing $filename (Year: $year)"

  # Get total pages with error handling
  if ! total_pages=$(get_pdf_pages "$pdf_file"); then
    return 1
  fi

  print_status " Total pages: $total_pages"

  if (( total_pages <= 0 )); then
    print_warning " No pages found in PDF"
    return 1
  fi

  # Calculate receipt pages; never report a negative number when the PDF is
  # shorter than the configured number of summary pages.
  receipt_pages=$(( total_pages - SUMMARY_PAGES ))
  if (( receipt_pages < 0 )); then
    receipt_pages=0
  fi

  print_status " Summary pages: $SUMMARY_PAGES, Receipt pages: $receipt_pages"

  # Output directories are directly in current directory (no year subdirs needed)
  if ! safe_mkdir "$RECEIPTS_DIR" || ! safe_mkdir "$SUMMARY_DIR"; then
    return 1
  fi

  # Create secure temporary directory
  if ! temp_dir=$(mktemp -d -t "${TEMP_PREFIX}.XXXXXXXXXX"); then
    print_error "Failed to create temporary directory"
    return 1
  fi

  # Split PDF into individual pages with error handling
  print_status " Splitting PDF..."
  if ! pdfseparate "$pdf_file" "$temp_dir/page_%03d.pdf" 2>/dev/null; then
    print_error "Failed to split PDF: $pdf_file"
    rm -rf -- "$temp_dir"
    return 1
  fi

  # Walk the pages in order: the first SUMMARY_PAGES are summaries, the rest
  # are receipts.
  for (( i = 1; i <= total_pages; i++ )); do
    # Paths are passed as %s arguments, never as part of the format string.
    printf -v page_file '%s/page_%03d.pdf' "$temp_dir" "$i"
    if [[ ! -f "$page_file" ]]; then
      continue
    fi

    if (( i <= SUMMARY_PAGES )); then
      kind="summary"
      dest_dir="$SUMMARY_DIR"
      slot=$summary_slot
    else
      kind="receipt"
      dest_dir="$RECEIPTS_DIR"
      slot=$receipt_slot
    fi

    # Advance to the next unused sequence number in the destination.
    while :; do
      slot=$(( slot + 1 ))
      printf -v output_file '%s/%s_%s_%03d.pdf' "$dest_dir" "$year" "$kind" "$slot"
      if [[ ! -e "$output_file" ]]; then
        break
      fi
    done

    if ! mv -- "$page_file" "$output_file"; then
      print_error "Failed to move $kind page: $page_file"
      rm -rf -- "$temp_dir"
      return 1
    fi

    if [[ "$kind" == "summary" ]]; then
      summary_slot=$slot
      summary_count=$(( summary_count + 1 ))
    else
      receipt_slot=$slot
      receipt_count=$(( receipt_count + 1 ))
    fi
  done

  # Clean up temp directory
  rm -rf -- "$temp_dir"

  print_status " ✓ Processed $summary_count summary pages and $receipt_count receipts for $year"

  return 0
}
|
||||||
|
|
||||||
|
# Prints up to the first 10 lines of `ls -la` for a directory (helper for main).
# The listing is captured before it is truncated, so cutting it short can
# never make `ls` fail with a broken pipe and trigger the fallback message.
# Arguments: $1 - directory to list; silently does nothing if it is missing
_preview_directory() {
  local dir="$1"
  local listing

  if [[ ! -d "$dir" ]]; then
    return 0
  fi

  if listing=$(ls -la "$dir"/ 2>/dev/null); then
    head -n 10 <<< "$listing"
  else
    echo " (empty or inaccessible)"
  fi
}

# Main execution function
#
# Reads the configuration, checks dependencies and directories, then runs
# process_pdf on every PDF directly inside SOURCE_DIR (no recursion, the
# extension is matched case-insensitively) in byte-wise sorted order, so the
# processing order is the same on every run. Finishes with a summary and a
# short listing of both output directories.
# Globals: SOURCE_DIR, RECEIPTS_DIR, SUMMARY_DIR (read)
# Exits:   1 if the configuration cannot be read or no PDF is found
main() {
  local -a pdf_files=()
  local processed=0 skipped=0
  local file pdf_file

  echo "=== neat.com to paperless-ngx Migration Script ==="
  echo

  # Read configuration with error handling
  if ! read_config; then
    print_error "Failed to read configuration"
    exit 1
  fi
  echo

  # Check dependencies
  check_dependencies

  # Validate and create directories
  validate_directories

  # Find PDF files (NUL-delimited so any file name is handled safely)
  while IFS= read -r -d '' file; do
    pdf_files+=("$file")
  done < <(
    find "$SOURCE_DIR" -maxdepth 1 -type f -iname "*.pdf" -print0 2>/dev/null \
      | LC_ALL=C sort -z
  )

  if (( ${#pdf_files[@]} == 0 )); then
    print_error "No PDF files found in $SOURCE_DIR"
    exit 1
  fi

  print_status "Found ${#pdf_files[@]} PDF files to process"
  echo

  # Process each PDF with error tracking; a failure only skips that file
  for pdf_file in "${pdf_files[@]}"; do
    if process_pdf "$pdf_file"; then
      processed=$(( processed + 1 ))
    else
      skipped=$(( skipped + 1 ))
    fi
    echo
  done

  # Summary
  echo "=== Processing Complete ==="
  print_status "Processed: $processed PDFs"
  if (( skipped > 0 )); then
    print_warning "Skipped: $skipped PDFs"
  fi
  echo
  print_status "Receipt files directory: $RECEIPTS_DIR"
  print_status "Summary files directory: $SUMMARY_DIR"
  print_status "Summary pages (for ERP): YYYY_summary_*.pdf files"
  print_status "Receipt pages (for paperless-ngx): YYYY_receipt_*.pdf files"

  # Show directory structure
  echo
  echo "Directory structure created:"
  echo "Receipts:"
  _preview_directory "$RECEIPTS_DIR"
  echo "Summaries:"
  _preview_directory "$SUMMARY_DIR"
}
|
||||||
|
|
||||||
|
# Run main function with error handling
#
# main is deliberately NOT called inside `if`, `!`, `&&` or `||`: bash ignores
# `set -e` for every command executed in such a conditional context, which
# would turn off errexit for the whole of main and everything it calls.
# The ERR trap instead reports a non-zero return from main and exits 1.
trap 'print_error "Script execution failed"; exit 1' ERR
main "$@"
|
Reference in New Issue
Block a user