feat(input): normalize job descriptions

2025-10-15 16:53:58 -05:00
parent 392daff8cc
commit 599821d25a
6 changed files with 223 additions and 74 deletions
--- a/input/Docker/watch_and_customize.py
+++ b/input/Docker/watch_and_customize.py
@@ -1,12 +1,12 @@
 #!/usr/bin/env python3
 """
-Monitor the customization inbox for job description Markdown files and run the Codex CLI
+Monitor the customization inbox, normalize messy job descriptions, and run the Codex CLI
 to produce tailored resumes.

-The script expects exactly one base resume Markdown file and processes one job file at a
-time. After a successful Codex run, the generated resume is written into a timestamped
-outbox folder and the job description is archived under processed/. Failures move the
-job description into failed/.
+The watcher expects exactly one base resume Markdown file and processes one job file at a
+time. After Codex succeeds, the generated resume is written into a timestamped outbox
+folder using the pattern <company>-<jobtitle>.md, while the original job file is archived
+under processed/. Failures move the job description into failed/.
 """

 from __future__ import annotations
@@ -32,15 +32,22 @@ TEMPLATES_DIR = Path("/templates")
 TEMPLATE_CACHE = Path("/tmp/templates")
 PROMPT_TEMPLATE = TEMPLATES_DIR / "ResumeCustomizerPrompt.md"
 PROMPT_TEMPLATE_EXAMPLE = TEMPLATES_DIR / "ResumeCustomizerPrompt.md.example"
+NORMALIZER_TEMPLATE = TEMPLATES_DIR / "JobDescriptionNormalizerPrompt.md"
+NORMALIZER_TEMPLATE_EXAMPLE = TEMPLATES_DIR / "JobDescriptionNormalizerPrompt.md.example"

 POLL_INTERVAL_SECONDS = int(os.environ.get("POLL_INTERVAL_SECONDS", "5"))
 CODEX_COMMAND_TEMPLATE = os.environ.get(
    "CODEX_COMMAND_TEMPLATE",
    "codex prompt --input {prompt} --output {output} --format markdown",
 )
+CODEX_NORMALIZER_COMMAND_TEMPLATE = os.environ.get(
+    "CODEX_NORMALIZER_COMMAND_TEMPLATE",
+    CODEX_COMMAND_TEMPLATE,
+)
 CODEX_TIMEOUT_SECONDS = int(os.environ.get("CODEX_TIMEOUT_SECONDS", "600"))

 RESOLVED_PROMPT_TEMPLATE: Path | None = None
+RESOLVED_NORMALIZER_TEMPLATE: Path | None = None


 class FatalConfigurationError(RuntimeError):
@@ -48,15 +55,15 @@ class FatalConfigurationError(RuntimeError):


@dataclass(frozen=True)
-class MarkdownInputs:
-    resume: Path
-    job_description: Path
-    prompt_template: Path
+class NormalizedJobDescription:
+    company: str
+    job_title: str
+    description_markdown: str


 def ensure_environment() -> None:
    """Verify required directories and template assets exist."""
-    global RESOLVED_PROMPT_TEMPLATE
+    global RESOLVED_PROMPT_TEMPLATE, RESOLVED_NORMALIZER_TEMPLATE

    missing = [
        str(path)
@@ -76,15 +83,27 @@ def ensure_environment() -> None:
            "Input pipeline is missing required paths: " + ", ".join(missing)
        )

-    RESOLVED_PROMPT_TEMPLATE = resolve_prompt_template(
+    RESOLVED_PROMPT_TEMPLATE = resolve_template(
        PROMPT_TEMPLATE,
        PROMPT_TEMPLATE_EXAMPLE,
        TEMPLATE_CACHE,
+        "Resume customization prompt",
+    )
+    RESOLVED_NORMALIZER_TEMPLATE = resolve_template(
+        NORMALIZER_TEMPLATE,
+        NORMALIZER_TEMPLATE_EXAMPLE,
+        TEMPLATE_CACHE,
+        "Job description normalizer prompt",
    )


-def resolve_prompt_template(primary: Path, example: Path, cache_dir: Path) -> Path:
-    """Return the prompt template path, copying the example if needed."""
+def resolve_template(
+    primary: Path,
+    example: Path,
+    cache_dir: Path,
+    description: str,
+) -> Path:
+    """Return the template path, copying the example if needed."""
    if primary.exists():
        return primary

@@ -95,7 +114,7 @@ def resolve_prompt_template(primary: Path, example: Path, cache_dir: Path) -> Pa
        return cached

    raise FatalConfigurationError(
-        f"Prompt template missing: {primary} (no example found at {example})"
+        f"{description} missing: {primary} (no example found at {example})"
    )


@@ -115,46 +134,26 @@ def ensure_single_resume() -> Path:
    return resumes[0]


-def ensure_single_job(md_files: Sequence[Path]) -> Path | None:
-    """Validate there is at most one job description file."""
-    if not md_files:
+def ensure_single_job(paths: Sequence[Path]) -> Path | None:
+    """Validate there is at most one job description file (any extension)."""
+    visible = [path for path in paths if path.is_file() and not path.name.startswith(".")]
+    if not visible:
        return None

-    if len(md_files) > 1:
-        names = ", ".join(p.name for p in md_files)
+    if len(visible) > 1:
+        names = ", ".join(p.name for p in visible)
        raise FatalConfigurationError(
            f"Multiple job description files detected in inbox: {names} "
            "— expected exactly one."
        )

-    return md_files[0]
+    return visible[0]


-def read_inputs(job_file: Path) -> MarkdownInputs:
-    """Gather and return all markdown inputs required for the prompt."""
-    resume = ensure_single_resume()
-
-    missing = [str(path) for path in (job_file,) if not path.exists()]
-    if missing:
-        raise FatalConfigurationError(
-            "Required files disappeared before processing: " + ", ".join(missing)
-        )
-
-    if RESOLVED_PROMPT_TEMPLATE is None:
-        raise FatalConfigurationError("Prompt template was not resolved during startup.")
-
-    return MarkdownInputs(
-        resume=resume,
-        job_description=job_file,
-        prompt_template=RESOLVED_PROMPT_TEMPLATE,
-    )
-
-
-def build_prompt_text(inputs: MarkdownInputs) -> str:
+def build_prompt_text(resume: Path, job_markdown: str, prompt_template: Path) -> str:
    """Return the combined prompt string fed to the Codex CLI."""
-    resume_text = inputs.resume.read_text(encoding="utf-8").strip()
-    jd_text = inputs.job_description.read_text(encoding="utf-8").strip()
-    instructions_text = inputs.prompt_template.read_text(encoding="utf-8").strip()
+    resume_text = resume.read_text(encoding="utf-8").strip()
+    instructions_text = prompt_template.read_text(encoding="utf-8").strip()

    return (
        "# Resume Customization Request\n\n"
@@ -162,7 +161,7 @@ def build_prompt_text(inputs: MarkdownInputs) -> str:
        f"{instructions_text}\n\n"
        "---\n\n"
        "## Job Description\n"
-        f"{jd_text}\n\n"
+        f"{job_markdown.strip()}\n\n"
        "---\n\n"
        "## Current Resume\n"
        f"{resume_text}\n"
@@ -187,9 +186,18 @@ def sanitize_stem(stem: str) -> str:
    return "".join(ch if ch.isalnum() else "_" for ch in stem) or "resume"


-def run_codex(prompt_path: Path, output_path: Path) -> None:
-    """Execute the Codex CLI using the configured command template."""
-    command_text = CODEX_COMMAND_TEMPLATE.format(
+def slugify(component: str) -> str:
+    """Turn a free-form string into a filesystem-friendly slug."""
+    normalized = "".join(
+        ch.lower() if ch.isalnum() else "-" for ch in component.strip()
+    )
+    parts = [part for part in normalized.split("-") if part]
+    return "-".join(parts)
+
+
+def run_codex(prompt_path: Path, output_path: Path, command_template: str) -> None:
+    """Execute the Codex CLI using the provided command template."""
+    command_text = command_template.format(
        prompt=str(prompt_path),
        output=str(output_path),
    )
@@ -199,7 +207,7 @@ def run_codex(prompt_path: Path, output_path: Path) -> None:
        command = shlex.split(command_text)
    except ValueError as exc:
        raise FatalConfigurationError(
-            f"Unable to parse CODEX_COMMAND_TEMPLATE into arguments: {exc}"
+            f"Unable to parse Codex command template into arguments: {exc}"
        ) from exc

    try:
@@ -222,6 +230,85 @@ def run_codex(prompt_path: Path, output_path: Path) -> None:
        )


+def build_normalizer_prompt(raw_text: str) -> str:
+    """Construct the prompt for normalizing the raw job description."""
+    if RESOLVED_NORMALIZER_TEMPLATE is None:
+        raise FatalConfigurationError("Normalizer template was not resolved during startup.")
+
+    instructions = RESOLVED_NORMALIZER_TEMPLATE.read_text(encoding="utf-8").strip()
+    return (
+        f"{instructions}\n\n"
+        "---\n\n"
+        "## Raw Job Description\n"
+        "```\n"
+        f"{raw_text.strip()}\n"
+        "```\n"
+    )
+
+
+def parse_normalized_output(text: str) -> NormalizedJobDescription:
+    """Parse the Codex-normalized output into structured pieces."""
+    lines = text.splitlines()
+    idx = 0
+
+    def next_non_empty(start: int) -> tuple[int, str]:
+        pos = start
+        while pos < len(lines):
+            content = lines[pos].strip()
+            if content:
+                return pos, content
+            pos += 1
+        raise RuntimeError("Normalized output is missing expected lines.")
+
+    idx, company_line = next_non_empty(idx)
+    if not company_line.lower().startswith("company:"):
+        raise RuntimeError(f"Expected 'Company:' line, found: {company_line!r}")
+    company = company_line[len("company:") :].strip()
+
+    idx, job_title_line = next_non_empty(idx + 1)
+    if not job_title_line.lower().startswith("job title:"):
+        raise RuntimeError(f"Expected 'Job Title:' line, found: {job_title_line!r}")
+    job_title = job_title_line[len("job title:") :].strip()
+
+    idx += 1
+    while idx < len(lines) and lines[idx].strip():
+        idx += 1
+
+    while idx < len(lines) and not lines[idx].strip():
+        idx += 1
+
+    description_lines = lines[idx:]
+    description = "\n".join(description_lines).strip()
+    if not description:
+        raise RuntimeError("Normalized output did not include a job description section.")
+
+    return NormalizedJobDescription(
+        company=company or "Company",
+        job_title=job_title or "Role",
+        description_markdown=description,
+    )
+
+
+def normalize_job_description(job_file: Path) -> NormalizedJobDescription:
+    """Use Codex to clean and extract metadata from the raw job description."""
+    raw_text = job_file.read_text(encoding="utf-8", errors="ignore").strip()
+    if not raw_text:
+        raise RuntimeError(f"Job description file {job_file.name} is empty after trimming.")
+
+    prompt_text = build_normalizer_prompt(raw_text)
+
+    with TemporaryDirectory() as tmp_dir_str:
+        tmp_dir = Path(tmp_dir_str)
+        prompt_path = tmp_dir / "normalize_prompt.md"
+        prompt_path.write_text(prompt_text, encoding="utf-8")
+
+        output_path = tmp_dir / "normalize_output.md"
+        run_codex(prompt_path, output_path, CODEX_NORMALIZER_COMMAND_TEMPLATE)
+        normalized_text = output_path.read_text(encoding="utf-8").strip()
+
+    return parse_normalized_output(normalized_text)
+
+
 def move_with_unique_target(source: Path, destination_dir: Path) -> Path:
    """Move source into destination_dir, avoiding collisions with numeric suffixes."""
    destination_dir.mkdir(parents=True, exist_ok=True)
@@ -240,17 +327,30 @@ def move_with_unique_target(source: Path, destination_dir: Path) -> Path:


 def process_job(job_file: Path) -> None:
-    """Combine inputs, run Codex, and archive outputs."""
+    """Normalize the job description, run Codex, and archive outputs."""
    timestamp = datetime.now().astimezone()
    out_dir = build_timestamp_dir(OUTBOX, timestamp)
    processed_dir = build_timestamp_dir(PROCESSED, timestamp)

-    inputs = read_inputs(job_file)
-    prompt_text = build_prompt_text(inputs)
+    resume_path = ensure_single_resume()
+    normalized = normalize_job_description(job_file)

-    safe_resume_stem = sanitize_stem(inputs.resume.stem)
-    safe_job_stem = sanitize_stem(job_file.stem)
-    output_filename = f"{safe_resume_stem}-for-{safe_job_stem}.md"
+    if RESOLVED_PROMPT_TEMPLATE is None:
+        raise FatalConfigurationError("Prompt template was not resolved during startup.")
+
+    prompt_text = build_prompt_text(
+        resume_path,
+        normalized.description_markdown,
+        RESOLVED_PROMPT_TEMPLATE,
+    )
+
+    safe_company = slugify(normalized.company)
+    safe_title = slugify(normalized.job_title)
+    if safe_company and safe_title:
+        output_stem = f"{safe_company}-{safe_title}"
+    else:
+        output_stem = sanitize_stem(job_file.stem)
+    output_filename = f"{output_stem}.md"

    with TemporaryDirectory() as tmp_dir_str:
        tmp_dir = Path(tmp_dir_str)
@@ -258,21 +358,36 @@ def process_job(job_file: Path) -> None:
        prompt_path.write_text(prompt_text, encoding="utf-8")

        output_path = tmp_dir / "codex_output.md"
-
-        run_codex(prompt_path, output_path)
+        run_codex(prompt_path, output_path, CODEX_COMMAND_TEMPLATE)

        generated_output = out_dir / output_filename
        counter = 1
        while generated_output.exists():
-            generated_output = out_dir / f"{safe_resume_stem}-for-{safe_job_stem}_{counter}.md"
+            generated_output = out_dir / f"{output_stem}_{counter}.md"
            counter += 1

        shutil.move(str(output_path), generated_output)
-        logging.info("Generated customized resume at %s", generated_output)
+        logging.info(
+            "Generated customized resume for %s - %s at %s",
+            normalized.company,
+            normalized.job_title,
+            generated_output,
+        )

-        prompt_archive = out_dir / f"prompt-{safe_job_stem}.md"
+        prompt_archive = out_dir / f"prompt-{generated_output.stem}.md"
        prompt_archive.write_text(prompt_text, encoding="utf-8")

+        normalized_archive = out_dir / f"job-description-{generated_output.stem}.md"
+        normalized_archive.write_text(
+            (
+                f"Company: {normalized.company}\n"
+                f"Job Title: {normalized.job_title}\n\n"
+                "# Job Description\n"
+                f"{normalized.description_markdown}\n"
+            ),
+            encoding="utf-8",
+        )
+
    processed_target = move_with_unique_target(job_file, processed_dir)
    logging.info(
        "Archived job description %s to %s",
@@ -309,10 +424,10 @@ def main() -> None:
    logging.info("Resume customizer watcher started")

    while True:
-        job_files = sorted(INBOX.glob("*.md"))
+        job_paths = sorted(INBOX.iterdir())

        try:
-            job_file = ensure_single_job(job_files)
+            job_file = ensure_single_job(job_paths)
        except FatalConfigurationError as exc:
            logging.error("Fatal configuration error: %s", exc)
            raise SystemExit(2) from exc