From d7682ee87ddb7a575e621a64e0864353f20dae4d Mon Sep 17 00:00:00 2001 From: ReachableCEO Date: Wed, 15 Oct 2025 13:39:37 -0500 Subject: [PATCH] feat(output): add wrapper and failed conversion handling --- AGENTS.md | 14 +++ output/AGENTS.md | 18 +++ output/Docker/Dockerfile | 26 +++++ output/Docker/docker-compose.yml | 19 +++ output/Docker/entrypoint.sh | 18 +++ output/Docker/run-output-processor.sh | 28 +++++ output/Docker/watch_and_convert.py | 159 ++++++++++++++++++++++++++ output/ForRelease/failed/.gitkeep | 0 output/ForRelease/outbox/.gitkeep | 1 + output/ForRelease/processed/.gitkeep | 1 + output/README.md | 40 +++++++ 11 files changed, 324 insertions(+) create mode 100644 AGENTS.md create mode 100644 output/AGENTS.md create mode 100644 output/Docker/Dockerfile create mode 100644 output/Docker/docker-compose.yml create mode 100755 output/Docker/entrypoint.sh create mode 100755 output/Docker/run-output-processor.sh create mode 100755 output/Docker/watch_and_convert.py create mode 100644 output/ForRelease/failed/.gitkeep create mode 100644 output/ForRelease/outbox/.gitkeep create mode 100644 output/ForRelease/processed/.gitkeep create mode 100644 output/README.md diff --git a/AGENTS.md b/AGENTS.md new file mode 100644 index 0000000..9171961 --- /dev/null +++ b/AGENTS.md @@ -0,0 +1,14 @@ +# Agent Overview + +This repository splits work into two coordinated areas: + +- `input/` – upstream tooling that prepares role-specific Markdown resumes. +- `output/` – the conversion pipeline that renders those Markdown files into DOCX/PDF deliverables. + +Agents should treat these areas independently so changes can be reasoned about and tested in isolation. + +## Working Guidelines +- Keep shared instructions in this file minimal; place deeper guidance in `input/AGENTS.md` or `output/AGENTS.md` as appropriate. +- When making automated edits, avoid touching both `input/` and `output/` in the same change set unless the work explicitly spans both pipelines. 
+- Resume conversion templates live under `input/templates`. Output services mount them read-only; update templates from the input side and verify with a fresh conversion run. +- Use conventional commits (`type(scope): summary`) to signal which side of the system a change targets, e.g., `feat(output): add failed-processing bucket`. diff --git a/output/AGENTS.md b/output/AGENTS.md new file mode 100644 index 0000000..4e509cc --- /dev/null +++ b/output/AGENTS.md @@ -0,0 +1,18 @@ +# Output Agent Guide + +## Scope +The `output/` tree houses the delivery pipeline that watches for approved Markdown resumes, converts them to DOCX/PDF using Pandoc, and archives the source material. It is intended to run independently from the `input/` authoring workflow. + +## Key Components +- `ForRelease/inbox`: manually populated with a single vetted `.md` resume for conversion. +- `ForRelease/outbox`: timestamped folders containing generated DOCX/PDF pairs ready for sharing. +- `ForRelease/processed`: timestamped archives of Markdown files that converted successfully. +- `ForRelease/failed`: Markdown originals for conversion attempts that Pandoc could not render. +- `Docker/`: container definition, watcher script, and wrapper to run the stack without root-owned outputs. + +## Operational Rules +- Always launch the service with `Docker/run-output-processor.sh` so the container inherits the caller’s UID/GID. +- Before testing, ensure `ForRelease/inbox` is empty; this watcher expects at most one Markdown file at a time. +- Monitor logs via `./run-output-processor.sh logs -f` while converting to confirm the Markdown leaves inbox and the exports appear in outbox. +- If Pandoc fails, the Markdown moves to `ForRelease/failed`; fix the content there, then move it back to `inbox` for another run. +- Only remove history from `outbox/` or `processed/` after you are certain the artifacts are no longer needed. 
diff --git a/output/Docker/Dockerfile b/output/Docker/Dockerfile
new file mode 100644
index 0000000..29036b3
--- /dev/null
+++ b/output/Docker/Dockerfile
@@ -0,0 +1,32 @@
+FROM debian:bookworm
+
+# Build-time only: keep apt non-interactive without persisting the setting into
+# the runtime environment of the container.
+ARG DEBIAN_FRONTEND=noninteractive
+
+ENV PYTHONDONTWRITEBYTECODE=1 \
+    PYTHONUNBUFFERED=1
+
+# Install, clean, and drop the apt lists in one layer so the caches never persist.
+RUN apt-get update \
+    && apt-get install --yes --no-install-recommends \
+        gosu \
+        pandoc \
+        python3 \
+        python3-venv \
+        texlive-full \
+    && apt-get clean \
+    && rm -rf /var/lib/apt/lists/*
+
+WORKDIR /app
+
+COPY watch_and_convert.py entrypoint.sh ./
+
+RUN chmod +x /app/entrypoint.sh /app/watch_and_convert.py
+
+# entrypoint.sh starts as root to chown /data, then drops to PUID:PGID via gosu.
+# These are only defaults; docker-compose.yml overrides them with the caller's IDs.
+ENV PUID=1000 \
+    PGID=1000
+
+ENTRYPOINT ["/app/entrypoint.sh"]
diff --git a/output/Docker/docker-compose.yml b/output/Docker/docker-compose.yml
new file mode 100644
index 0000000..f762ef9
--- /dev/null
+++ b/output/Docker/docker-compose.yml
@@ -0,0 +1,19 @@
+name: ResumeCustomizer-OutputProcessor
+
+services:
+  resumecustomizer-outputprocessor:
+    build:
+      context: .
+      dockerfile: Dockerfile
+    container_name: ResumeCustomizer-OutputProcessor
+    restart: always
+    environment:
+      PUID: "${LOCAL_UID:-1000}"
+      PGID: "${LOCAL_GID:-1000}"
+    volumes:
+      - ../ForRelease/inbox:/data/inbox
+      - ../ForRelease/outbox:/data/outbox
+      - ../ForRelease/processed:/data/processed
+      - ../ForRelease/failed:/data/failed
+      - ../../input/templates:/templates:ro
+      - /etc/localtime:/etc/localtime:ro
diff --git a/output/Docker/entrypoint.sh b/output/Docker/entrypoint.sh
new file mode 100755
index 0000000..b9272eb
--- /dev/null
+++ b/output/Docker/entrypoint.sh
@@ -0,0 +1,18 @@
+#!/usr/bin/env bash
+set -euo pipefail
+
+PUID=${PUID:-1000}
+PGID=${PGID:-1000}
+
+if !
command -v gosu >/dev/null 2>&1; then
+  echo "gosu is required but not installed" >&2
+  exit 1
+fi
+
+if [ -d /data ]; then
+  chown -R "${PUID}:${PGID}" /data
+fi
+
+export HOME=${HOME:-/tmp}
+
+exec gosu "${PUID}:${PGID}" python3 /app/watch_and_convert.py
diff --git a/output/Docker/run-output-processor.sh b/output/Docker/run-output-processor.sh
new file mode 100755
index 0000000..06c8d37
--- /dev/null
+++ b/output/Docker/run-output-processor.sh
@@ -0,0 +1,28 @@
+#!/usr/bin/env bash
+# Wrapper to run docker compose with the caller's UID/GID so generated files stay writable.
+set -euo pipefail
+
+if ! command -v docker >/dev/null 2>&1; then
+  echo "Error: docker is not installed or not on PATH." >&2
+  exit 1
+fi
+
+if docker compose version >/dev/null 2>&1; then
+  COMPOSE_CMD=(docker compose)
+elif command -v docker-compose >/dev/null 2>&1; then
+  COMPOSE_CMD=(docker-compose)
+else
+  echo "Error: docker compose plugin or docker-compose binary is required." >&2
+  exit 1
+fi
+
+CALLER_UID=$(id -u)
+CALLER_GID=$(id -g)
+
+SCRIPT_DIR=$(cd "$(dirname "${BASH_SOURCE[0]}")" && pwd)
+
+# Run docker compose from the Docker directory so it picks up the bundled yaml.
+(
+  cd "${SCRIPT_DIR}"
+  LOCAL_UID="${CALLER_UID}" LOCAL_GID="${CALLER_GID}" "${COMPOSE_CMD[@]}" "$@"
+)
diff --git a/output/Docker/watch_and_convert.py b/output/Docker/watch_and_convert.py
new file mode 100755
index 0000000..82c2b9e
--- /dev/null
+++ b/output/Docker/watch_and_convert.py
@@ -0,0 +1,192 @@
+#!/usr/bin/env python3
+"""
+Monitor the inbox directory for Markdown files and convert them to DOCX/PDF outputs.
+
+The script runs indefinitely inside the container, polling the inbox for new files.
+When a Markdown file is found, pandoc generates DOCX and PDF outputs using the
+reference templates, places the results in a timestamped outbox path, and moves the
+original Markdown file into the processed directory. A file that cannot be converted
+is moved into the failed directory so it is not retried on every poll.
+"""
+
+import logging
+import shutil
+import subprocess
+import time
+from datetime import datetime
+from pathlib import Path
+
+INBOX = Path("/data/inbox")
+OUTBOX = Path("/data/outbox")
+PROCESSED = Path("/data/processed")
+FAILED = Path("/data/failed")
+TEMPLATES = Path("/templates")
+
+DOCX_TEMPLATE = TEMPLATES / "resume-reference.docx"
+TEX_TEMPLATE = TEMPLATES / "resume-template.tex"
+
+POLL_INTERVAL_SECONDS = 5
+
+
+def ensure_environment() -> None:
+    """Verify required files and directories exist before processing starts.
+
+    Raises:
+        FileNotFoundError: if any data directory or pandoc template is missing.
+    """
+    missing = []
+    for path in (INBOX, OUTBOX, PROCESSED, FAILED, DOCX_TEMPLATE, TEX_TEMPLATE):
+        if not path.exists():
+            missing.append(str(path))
+
+    if missing:
+        raise FileNotFoundError(
+            "Required paths are missing inside the container: " + ", ".join(missing)
+        )
+
+
+def run_pandoc(input_md: Path, output_docx: Path, output_pdf: Path) -> None:
+    """Invoke pandoc twice to create DOCX and PDF artifacts.
+
+    Raises:
+        subprocess.CalledProcessError: if either pandoc run exits non-zero.
+    """
+    subprocess.run(
+        [
+            "pandoc",
+            str(input_md),
+            "--from",
+            "gfm",
+            "--to",
+            "docx",
+            "--reference-doc",
+            str(DOCX_TEMPLATE),
+            "--output",
+            str(output_docx),
+        ],
+        check=True,
+    )
+
+    subprocess.run(
+        [
+            "pandoc",
+            str(input_md),
+            "--from",
+            "gfm",
+            "--pdf-engine",
+            "xelatex",
+            "--template",
+            str(TEX_TEMPLATE),
+            "--output",
+            str(output_pdf),
+        ],
+        check=True,
+    )
+
+
+def build_timestamp_dir(base: Path, timestamp: datetime) -> Path:
+    """Create (if needed) and return the timestamped directory under base."""
+    subdir = (
+        base
+        / timestamp.strftime("%Y")
+        / timestamp.strftime("%m")
+        / timestamp.strftime("%d")
+        / timestamp.strftime("%H%M")
+    )
+    subdir.mkdir(parents=True, exist_ok=True)
+    return subdir
+
+
+def _unique_target(directory: Path, filename: str) -> Path:
+    """Return a path for filename under directory that does not exist yet.
+
+    On a name clash a numeric suffix is appended to the stem (name_1.md, name_2.md, ...).
+    """
+    source = Path(filename)
+    candidate = directory / source.name
+    counter = 1
+    while candidate.exists():
+        candidate = directory / f"{source.stem}_{counter}{source.suffix}"
+        counter += 1
+    return candidate
+
+
+def process_markdown(md_file: Path) -> None:
+    """Convert the Markdown file and move it into the processed directory.
+
+    Raises:
+        subprocess.CalledProcessError: if pandoc fails; the source file is left in place
+            for the caller to handle and no partial output is kept.
+    """
+    timestamp = datetime.now().astimezone()
+    out_dir = build_timestamp_dir(OUTBOX, timestamp)
+
+    stem = md_file.stem
+    output_docx = out_dir / f"{stem}.docx"
+    output_pdf = out_dir / f"{stem}.pdf"
+
+    logging.info("Processing %s", md_file.name)
+    try:
+        run_pandoc(md_file, output_docx, output_pdf)
+    except Exception:
+        # The DOCX is written before the PDF is attempted; drop whatever was produced
+        # so the outbox never holds half of a DOCX/PDF pair.
+        for artifact in (output_docx, output_pdf):
+            artifact.unlink(missing_ok=True)
+        raise
+
+    # Create the archive folder only after a successful conversion so failed runs do
+    # not leave empty timestamp directories under processed/.
+    processed_dir = build_timestamp_dir(PROCESSED, timestamp)
+    processed_target = _unique_target(processed_dir, md_file.name)
+
+    shutil.move(str(md_file), processed_target)
+    logging.info("Completed %s -> %s (processed archived at %s)", md_file.name, out_dir, processed_target)
+
+
+def move_to_failed(md_file: Path) -> None:
+    """Move the markdown file into the failed directory to avoid repeated retries."""
+    if not md_file.exists():
+        return
+
+    FAILED.mkdir(parents=True, exist_ok=True)
+    failed_target = _unique_target(FAILED, md_file.name)
+    shutil.move(str(md_file), failed_target)
+    logging.info("Archived %s in failed directory at %s", md_file.name, failed_target)
+
+
+def main() -> None:
+    """Poll the inbox forever, converting each Markdown file that appears."""
+    logging.basicConfig(
+        level=logging.INFO,
+        format="%(asctime)s [%(levelname)s] %(message)s",
+    )
+
+    ensure_environment()
+    logging.info("Resume customizer watcher started")
+
+    while True:
+        md_files = sorted(INBOX.glob("*.md"))
+        if not md_files:
+            time.sleep(POLL_INTERVAL_SECONDS)
+            continue
+
+        for md_file in md_files:
+            try:
+                process_markdown(md_file)
+            except subprocess.CalledProcessError as exc:
+                logging.error("Pandoc failed for %s: %s", md_file.name, exc)
+                move_to_failed(md_file)
+            except Exception as exc:  # noqa: BLE001
+                logging.exception("Unexpected error while processing %s: %s", md_file.name, exc)
+                # Quarantine the file; otherwise it is picked up again on every poll.
+                try:
+                    move_to_failed(md_file)
+                except OSError:
+                    logging.exception("Could not move %s to the failed directory", md_file.name)
+
+        time.sleep(POLL_INTERVAL_SECONDS)
+
+
+if __name__ == "__main__":
+    main()
diff --git a/output/ForRelease/failed/.gitkeep b/output/ForRelease/failed/.gitkeep
new file mode 100644
index 0000000..e69de29
diff --git a/output/ForRelease/outbox/.gitkeep b/output/ForRelease/outbox/.gitkeep
new file mode 100644
index 
0000000..8b13789 --- /dev/null +++ b/output/ForRelease/outbox/.gitkeep @@ -0,0 +1 @@ + diff --git a/output/ForRelease/processed/.gitkeep b/output/ForRelease/processed/.gitkeep new file mode 100644 index 0000000..8b13789 --- /dev/null +++ b/output/ForRelease/processed/.gitkeep @@ -0,0 +1 @@ + diff --git a/output/README.md b/output/README.md new file mode 100644 index 0000000..2f77111 --- /dev/null +++ b/output/README.md @@ -0,0 +1,40 @@ +# Output Pipeline Overview + +This directory contains the post-processing side of ResumeCustomizer. It is responsible for taking job-targeted Markdown resumes produced elsewhere in the system and turning them into printable DOCX/PDF artifacts. + +## Directory Layout +- `ForRelease/inbox`: drop a single `*.md` file here to trigger conversion. +- `ForRelease/outbox/YYYY/MM/DD/HHMM`: conversion results (paired `.docx` and `.pdf`) organized by timestamp so repeated runs never overwrite each other. +- `ForRelease/processed/YYYY/MM/DD/HHMM`: archives of Markdown files that converted successfully. +- `ForRelease/failed`: Markdown files that encountered an error during conversion (contains `.gitkeep` to preserve the directory). +- `Docker/`: container definition, watcher script, and helper wrapper that run the conversion daemon. + +## Running the Output Processor +Use the wrapper so the container writes files with your UID/GID: + +```bash +cd output/Docker +./run-output-processor.sh up -d +``` + +The script detects either the Docker Compose plugin or the legacy `docker-compose` binary and forwards any additional arguments you supply (`down`, `logs`, etc.). + +## What the Watcher Does +1. Polls `ForRelease/inbox` every few seconds for Markdown files. +2. Runs Pandoc using the shared DOCX and LaTeX templates to generate DOCX/PDF. +3. Drops the exports into the timestamped folder under `ForRelease/outbox`. +4. Moves the original Markdown into the matching timestamp folder under `ForRelease/processed`. +5. 
If the Pandoc conversion fails, moves the Markdown into `ForRelease/failed` so it can be reviewed without blocking subsequent runs. + +## Prerequisites +- Docker Engine with either the Compose plugin (`docker compose`) or standalone `docker-compose`. +- Pandoc templates available under `input/templates` relative to the repo root (mounted read-only into the container). + +Stop the service with: + +```bash +cd output/Docker +./run-output-processor.sh down +``` + +Log output is available through `./run-output-processor.sh logs -f`.