feat(input): normalize job descriptions

This commit is contained in:
2025-10-15 16:53:58 -05:00
parent 392daff8cc
commit 599821d25a
6 changed files with 223 additions and 74 deletions

239
input/Docker/watch_and_customize.py Executable file → Normal file
View File

@@ -1,12 +1,12 @@
#!/usr/bin/env python3
"""
Monitor the customization inbox for job description Markdown files and run the Codex CLI
Monitor the customization inbox, normalize messy job descriptions, and run the Codex CLI
to produce tailored resumes.
The script expects exactly one base resume Markdown file and processes one job file at a
time. After a successful Codex run, the generated resume is written into a timestamped
outbox folder and the job description is archived under processed/. Failures move the
job description into failed/.
The watcher expects exactly one base resume Markdown file and processes one job file at a
time. After Codex succeeds, the generated resume is written into a timestamped outbox
folder using the pattern <company>-<jobtitle>.md, while the original job file is archived
under processed/. Failures move the job description into failed/.
"""
from __future__ import annotations
@@ -32,15 +32,22 @@ TEMPLATES_DIR = Path("/templates")
TEMPLATE_CACHE = Path("/tmp/templates")
PROMPT_TEMPLATE = TEMPLATES_DIR / "ResumeCustomizerPrompt.md"
PROMPT_TEMPLATE_EXAMPLE = TEMPLATES_DIR / "ResumeCustomizerPrompt.md.example"
NORMALIZER_TEMPLATE = TEMPLATES_DIR / "JobDescriptionNormalizerPrompt.md"
NORMALIZER_TEMPLATE_EXAMPLE = TEMPLATES_DIR / "JobDescriptionNormalizerPrompt.md.example"
POLL_INTERVAL_SECONDS = int(os.environ.get("POLL_INTERVAL_SECONDS", "5"))
CODEX_COMMAND_TEMPLATE = os.environ.get(
"CODEX_COMMAND_TEMPLATE",
"codex prompt --input {prompt} --output {output} --format markdown",
)
CODEX_NORMALIZER_COMMAND_TEMPLATE = os.environ.get(
"CODEX_NORMALIZER_COMMAND_TEMPLATE",
CODEX_COMMAND_TEMPLATE,
)
CODEX_TIMEOUT_SECONDS = int(os.environ.get("CODEX_TIMEOUT_SECONDS", "600"))
RESOLVED_PROMPT_TEMPLATE: Path | None = None
RESOLVED_NORMALIZER_TEMPLATE: Path | None = None
class FatalConfigurationError(RuntimeError):
@@ -48,15 +55,15 @@ class FatalConfigurationError(RuntimeError):
@dataclass(frozen=True)
class MarkdownInputs:
resume: Path
job_description: Path
prompt_template: Path
class NormalizedJobDescription:
company: str
job_title: str
description_markdown: str
def ensure_environment() -> None:
"""Verify required directories and template assets exist."""
global RESOLVED_PROMPT_TEMPLATE
global RESOLVED_PROMPT_TEMPLATE, RESOLVED_NORMALIZER_TEMPLATE
missing = [
str(path)
@@ -76,15 +83,27 @@ def ensure_environment() -> None:
"Input pipeline is missing required paths: " + ", ".join(missing)
)
RESOLVED_PROMPT_TEMPLATE = resolve_prompt_template(
RESOLVED_PROMPT_TEMPLATE = resolve_template(
PROMPT_TEMPLATE,
PROMPT_TEMPLATE_EXAMPLE,
TEMPLATE_CACHE,
"Resume customization prompt",
)
RESOLVED_NORMALIZER_TEMPLATE = resolve_template(
NORMALIZER_TEMPLATE,
NORMALIZER_TEMPLATE_EXAMPLE,
TEMPLATE_CACHE,
"Job description normalizer prompt",
)
def resolve_prompt_template(primary: Path, example: Path, cache_dir: Path) -> Path:
"""Return the prompt template path, copying the example if needed."""
def resolve_template(
primary: Path,
example: Path,
cache_dir: Path,
description: str,
) -> Path:
"""Return the template path, copying the example if needed."""
if primary.exists():
return primary
@@ -95,7 +114,7 @@ def resolve_prompt_template(primary: Path, example: Path, cache_dir: Path) -> Pa
return cached
raise FatalConfigurationError(
f"Prompt template missing: {primary} (no example found at {example})"
f"{description} missing: {primary} (no example found at {example})"
)
@@ -115,46 +134,26 @@ def ensure_single_resume() -> Path:
return resumes[0]
def ensure_single_job(md_files: Sequence[Path]) -> Path | None:
"""Validate there is at most one job description file."""
if not md_files:
def ensure_single_job(paths: Sequence[Path]) -> Path | None:
"""Validate there is at most one job description file (any extension)."""
visible = [path for path in paths if path.is_file() and not path.name.startswith(".")]
if not visible:
return None
if len(md_files) > 1:
names = ", ".join(p.name for p in md_files)
if len(visible) > 1:
names = ", ".join(p.name for p in visible)
raise FatalConfigurationError(
f"Multiple job description files detected in inbox: {names} "
"— expected exactly one."
)
return md_files[0]
return visible[0]
def read_inputs(job_file: Path) -> MarkdownInputs:
"""Gather and return all markdown inputs required for the prompt."""
resume = ensure_single_resume()
missing = [str(path) for path in (job_file,) if not path.exists()]
if missing:
raise FatalConfigurationError(
"Required files disappeared before processing: " + ", ".join(missing)
)
if RESOLVED_PROMPT_TEMPLATE is None:
raise FatalConfigurationError("Prompt template was not resolved during startup.")
return MarkdownInputs(
resume=resume,
job_description=job_file,
prompt_template=RESOLVED_PROMPT_TEMPLATE,
)
def build_prompt_text(inputs: MarkdownInputs) -> str:
def build_prompt_text(resume: Path, job_markdown: str, prompt_template: Path) -> str:
"""Return the combined prompt string fed to the Codex CLI."""
resume_text = inputs.resume.read_text(encoding="utf-8").strip()
jd_text = inputs.job_description.read_text(encoding="utf-8").strip()
instructions_text = inputs.prompt_template.read_text(encoding="utf-8").strip()
resume_text = resume.read_text(encoding="utf-8").strip()
instructions_text = prompt_template.read_text(encoding="utf-8").strip()
return (
"# Resume Customization Request\n\n"
@@ -162,7 +161,7 @@ def build_prompt_text(inputs: MarkdownInputs) -> str:
f"{instructions_text}\n\n"
"---\n\n"
"## Job Description\n"
f"{jd_text}\n\n"
f"{job_markdown.strip()}\n\n"
"---\n\n"
"## Current Resume\n"
f"{resume_text}\n"
@@ -187,9 +186,18 @@ def sanitize_stem(stem: str) -> str:
return "".join(ch if ch.isalnum() else "_" for ch in stem) or "resume"
def run_codex(prompt_path: Path, output_path: Path) -> None:
"""Execute the Codex CLI using the configured command template."""
command_text = CODEX_COMMAND_TEMPLATE.format(
def slugify(component: str) -> str:
    """Build a lowercase, hyphen-separated slug from arbitrary text.

    Each maximal run of alphanumeric characters becomes one lowercase word;
    every other character acts as a separator. Words are joined with single
    hyphens, so the result never starts or ends with a hyphen and is the empty
    string when the input has no alphanumeric characters.
    """
    words: list[str] = []
    current: list[str] = []
    for ch in component.strip():
        if ch.isalnum():
            current.append(ch.lower())
            continue
        # Separator: close the word in progress, if there is one.
        if current:
            words.append("".join(current))
            current = []
    if current:
        words.append("".join(current))
    return "-".join(words)
def run_codex(prompt_path: Path, output_path: Path, command_template: str) -> None:
"""Execute the Codex CLI using the provided command template."""
command_text = command_template.format(
prompt=str(prompt_path),
output=str(output_path),
)
@@ -199,7 +207,7 @@ def run_codex(prompt_path: Path, output_path: Path) -> None:
command = shlex.split(command_text)
except ValueError as exc:
raise FatalConfigurationError(
f"Unable to parse CODEX_COMMAND_TEMPLATE into arguments: {exc}"
f"Unable to parse Codex command template into arguments: {exc}"
) from exc
try:
@@ -222,6 +230,85 @@ def run_codex(prompt_path: Path, output_path: Path) -> None:
)
def build_normalizer_prompt(raw_text: str) -> str:
    """Construct the prompt for normalizing the raw job description.

    The instructions are read from the resolved normalizer template; the
    trimmed raw text is appended beneath a "Raw Job Description" heading inside
    a fenced code block.

    Args:
        raw_text: Unprocessed job description text.

    Returns:
        The complete prompt string.

    Raises:
        FatalConfigurationError: If RESOLVED_NORMALIZER_TEMPLATE is None.
    """
    if RESOLVED_NORMALIZER_TEMPLATE is None:
        raise FatalConfigurationError("Normalizer template was not resolved during startup.")
    instructions = RESOLVED_NORMALIZER_TEMPLATE.read_text(encoding="utf-8").strip()
    body = raw_text.strip()

    # A fence is closed by any run of backticks at least as long as the opener,
    # so a raw description containing ``` would end the block early. Use a
    # fence one backtick longer than the longest run in the text (minimum 3,
    # which keeps the output unchanged for text without such runs).
    longest_run = current_run = 0
    for ch in body:
        current_run = current_run + 1 if ch == "`" else 0
        longest_run = max(longest_run, current_run)
    fence = "`" * max(3, longest_run + 1)

    return (
        f"{instructions}\n\n"
        "---\n\n"
        "## Raw Job Description\n"
        f"{fence}\n"
        f"{body}\n"
        f"{fence}\n"
    )
def parse_normalized_output(text: str) -> NormalizedJobDescription:
    """Split normalized Codex output into company, job title, and description.

    The first non-blank line must begin with "Company:" and the next non-blank
    line with "Job Title:" (both matched case-insensitively). Any non-blank
    lines directly below the title line are skipped, then any blank lines; the
    remainder is the description.

    Raises:
        RuntimeError: If either header line is missing or malformed, or if no
            description text remains.
    """
    rows = text.splitlines()
    total = len(rows)

    def first_content_from(start: int) -> tuple[int, str]:
        # Locate the first row at or after `start` that has visible content.
        for position, row in enumerate(rows[start:], start):
            stripped = row.strip()
            if stripped:
                return position, stripped
        raise RuntimeError("Normalized output is missing expected lines.")

    company_at, company_row = first_content_from(0)
    if not company_row.lower().startswith("company:"):
        raise RuntimeError(f"Expected 'Company:' line, found: {company_row!r}")

    title_at, title_row = first_content_from(company_at + 1)
    if not title_row.lower().startswith("job title:"):
        raise RuntimeError(f"Expected 'Job Title:' line, found: {title_row!r}")

    company_name = company_row[len("company:") :].strip()
    role_name = title_row[len("job title:") :].strip()

    cursor = title_at + 1
    # Skip whatever non-blank lines sit immediately under the title line...
    while cursor < total and rows[cursor].strip():
        cursor += 1
    # ...and then the blank separator before the description body.
    while cursor < total and not rows[cursor].strip():
        cursor += 1

    body = "\n".join(rows[cursor:]).strip()
    if not body:
        raise RuntimeError("Normalized output did not include a job description section.")

    # Fall back to placeholder names when a header line carried no value.
    return NormalizedJobDescription(
        company=company_name or "Company",
        job_title=role_name or "Role",
        description_markdown=body,
    )
def normalize_job_description(job_file: Path) -> NormalizedJobDescription:
    """Run the Codex normalizer over a raw job file and parse its answer.

    The file is read as UTF-8 with undecodable bytes ignored. The prompt and
    the Codex output live in a temporary directory that is removed once the
    output has been read.

    Raises:
        RuntimeError: If the job file contains nothing but whitespace.
    """
    raw_text = job_file.read_text(encoding="utf-8", errors="ignore").strip()
    if raw_text == "":
        raise RuntimeError(f"Job description file {job_file.name} is empty after trimming.")

    request = build_normalizer_prompt(raw_text)

    with TemporaryDirectory() as scratch:
        workdir = Path(scratch)
        request_file = workdir / "normalize_prompt.md"
        response_file = workdir / "normalize_output.md"

        request_file.write_text(request, encoding="utf-8")
        run_codex(request_file, response_file, CODEX_NORMALIZER_COMMAND_TEMPLATE)
        # Read before the context exits; the directory is deleted afterwards.
        response = response_file.read_text(encoding="utf-8")

    return parse_normalized_output(response.strip())
def move_with_unique_target(source: Path, destination_dir: Path) -> Path:
"""Move source into destination_dir, avoiding collisions with numeric suffixes."""
destination_dir.mkdir(parents=True, exist_ok=True)
@@ -240,17 +327,30 @@ def move_with_unique_target(source: Path, destination_dir: Path) -> Path:
def process_job(job_file: Path) -> None:
"""Combine inputs, run Codex, and archive outputs."""
"""Normalize the job description, run Codex, and archive outputs."""
timestamp = datetime.now().astimezone()
out_dir = build_timestamp_dir(OUTBOX, timestamp)
processed_dir = build_timestamp_dir(PROCESSED, timestamp)
inputs = read_inputs(job_file)
prompt_text = build_prompt_text(inputs)
resume_path = ensure_single_resume()
normalized = normalize_job_description(job_file)
safe_resume_stem = sanitize_stem(inputs.resume.stem)
safe_job_stem = sanitize_stem(job_file.stem)
output_filename = f"{safe_resume_stem}-for-{safe_job_stem}.md"
if RESOLVED_PROMPT_TEMPLATE is None:
raise FatalConfigurationError("Prompt template was not resolved during startup.")
prompt_text = build_prompt_text(
resume_path,
normalized.description_markdown,
RESOLVED_PROMPT_TEMPLATE,
)
safe_company = slugify(normalized.company)
safe_title = slugify(normalized.job_title)
if safe_company and safe_title:
output_stem = f"{safe_company}-{safe_title}"
else:
output_stem = sanitize_stem(job_file.stem)
output_filename = f"{output_stem}.md"
with TemporaryDirectory() as tmp_dir_str:
tmp_dir = Path(tmp_dir_str)
@@ -258,21 +358,36 @@ def process_job(job_file: Path) -> None:
prompt_path.write_text(prompt_text, encoding="utf-8")
output_path = tmp_dir / "codex_output.md"
run_codex(prompt_path, output_path)
run_codex(prompt_path, output_path, CODEX_COMMAND_TEMPLATE)
generated_output = out_dir / output_filename
counter = 1
while generated_output.exists():
generated_output = out_dir / f"{safe_resume_stem}-for-{safe_job_stem}_{counter}.md"
generated_output = out_dir / f"{output_stem}_{counter}.md"
counter += 1
shutil.move(str(output_path), generated_output)
logging.info("Generated customized resume at %s", generated_output)
logging.info(
"Generated customized resume for %s - %s at %s",
normalized.company,
normalized.job_title,
generated_output,
)
prompt_archive = out_dir / f"prompt-{safe_job_stem}.md"
prompt_archive = out_dir / f"prompt-{generated_output.stem}.md"
prompt_archive.write_text(prompt_text, encoding="utf-8")
normalized_archive = out_dir / f"job-description-{generated_output.stem}.md"
normalized_archive.write_text(
(
f"Company: {normalized.company}\n"
f"Job Title: {normalized.job_title}\n\n"
"# Job Description\n"
f"{normalized.description_markdown}\n"
),
encoding="utf-8",
)
processed_target = move_with_unique_target(job_file, processed_dir)
logging.info(
"Archived job description %s to %s",
@@ -309,10 +424,10 @@ def main() -> None:
logging.info("Resume customizer watcher started")
while True:
job_files = sorted(INBOX.glob("*.md"))
job_paths = sorted(INBOX.iterdir())
try:
job_file = ensure_single_job(job_files)
job_file = ensure_single_job(job_paths)
except FatalConfigurationError as exc:
logging.error("Fatal configuration error: %s", exc)
raise SystemExit(2) from exc