feat(input): normalize job descriptions
input/Docker/watch_and_customize.py (239 lines changed)
Executable file → Normal file
@@ -1,12 +1,12 @@
 #!/usr/bin/env python3
 """
-Monitor the customization inbox for job description Markdown files and run the Codex CLI
+Monitor the customization inbox, normalize messy job descriptions, and run the Codex CLI
 to produce tailored resumes.
 
-The script expects exactly one base resume Markdown file and processes one job file at a
-time. After a successful Codex run, the generated resume is written into a timestamped
-outbox folder and the job description is archived under processed/. Failures move the
-job description into failed/.
+The watcher expects exactly one base resume Markdown file and processes one job file at a
+time. After Codex succeeds, the generated resume is written into a timestamped outbox
+folder using the pattern <company>-<jobtitle>.md, while the original job file is archived
+under processed/. Failures move the job description into failed/.
 """
 
 from __future__ import annotations
@@ -32,15 +32,22 @@ TEMPLATES_DIR = Path("/templates")
 TEMPLATE_CACHE = Path("/tmp/templates")
 PROMPT_TEMPLATE = TEMPLATES_DIR / "ResumeCustomizerPrompt.md"
 PROMPT_TEMPLATE_EXAMPLE = TEMPLATES_DIR / "ResumeCustomizerPrompt.md.example"
+NORMALIZER_TEMPLATE = TEMPLATES_DIR / "JobDescriptionNormalizerPrompt.md"
+NORMALIZER_TEMPLATE_EXAMPLE = TEMPLATES_DIR / "JobDescriptionNormalizerPrompt.md.example"
 
 POLL_INTERVAL_SECONDS = int(os.environ.get("POLL_INTERVAL_SECONDS", "5"))
 CODEX_COMMAND_TEMPLATE = os.environ.get(
     "CODEX_COMMAND_TEMPLATE",
     "codex prompt --input {prompt} --output {output} --format markdown",
 )
+CODEX_NORMALIZER_COMMAND_TEMPLATE = os.environ.get(
+    "CODEX_NORMALIZER_COMMAND_TEMPLATE",
+    CODEX_COMMAND_TEMPLATE,
+)
 CODEX_TIMEOUT_SECONDS = int(os.environ.get("CODEX_TIMEOUT_SECONDS", "600"))
 
 RESOLVED_PROMPT_TEMPLATE: Path | None = None
+RESOLVED_NORMALIZER_TEMPLATE: Path | None = None
 
 
 class FatalConfigurationError(RuntimeError):
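Note on configuration: CODEX_NORMALIZER_COMMAND_TEMPLATE falls back to CODEX_COMMAND_TEMPLATE, so a deployment that sets only the resume command drives both passes. A minimal sketch of the fallback, assuming nothing normalizer-specific is set in the environment:

    import os

    resume_cmd = os.environ.get(
        "CODEX_COMMAND_TEMPLATE",
        "codex prompt --input {prompt} --output {output} --format markdown",
    )
    # Same lookup pattern the module uses: reuse the resume command when no override exists.
    normalizer_cmd = os.environ.get("CODEX_NORMALIZER_COMMAND_TEMPLATE", resume_cmd)
    assert normalizer_cmd == resume_cmd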
@@ -48,15 +55,15 @@ class FatalConfigurationError(RuntimeError):
 
 
 @dataclass(frozen=True)
-class MarkdownInputs:
-    resume: Path
-    job_description: Path
-    prompt_template: Path
+class NormalizedJobDescription:
+    company: str
+    job_title: str
+    description_markdown: str
 
 
 def ensure_environment() -> None:
     """Verify required directories and template assets exist."""
-    global RESOLVED_PROMPT_TEMPLATE
+    global RESOLVED_PROMPT_TEMPLATE, RESOLVED_NORMALIZER_TEMPLATE
 
     missing = [
         str(path)
@@ -76,15 +83,27 @@ def ensure_environment() -> None:
             "Input pipeline is missing required paths: " + ", ".join(missing)
         )
 
-    RESOLVED_PROMPT_TEMPLATE = resolve_prompt_template(
+    RESOLVED_PROMPT_TEMPLATE = resolve_template(
         PROMPT_TEMPLATE,
         PROMPT_TEMPLATE_EXAMPLE,
         TEMPLATE_CACHE,
+        "Resume customization prompt",
     )
+    RESOLVED_NORMALIZER_TEMPLATE = resolve_template(
+        NORMALIZER_TEMPLATE,
+        NORMALIZER_TEMPLATE_EXAMPLE,
+        TEMPLATE_CACHE,
+        "Job description normalizer prompt",
+    )
 
 
-def resolve_prompt_template(primary: Path, example: Path, cache_dir: Path) -> Path:
-    """Return the prompt template path, copying the example if needed."""
+def resolve_template(
+    primary: Path,
+    example: Path,
+    cache_dir: Path,
+    description: str,
+) -> Path:
+    """Return the template path, copying the example if needed."""
     if primary.exists():
         return primary
 
@@ -95,7 +114,7 @@ def resolve_prompt_template(primary: Path, example: Path, cache_dir: Path) -> Path:
         return cached
 
     raise FatalConfigurationError(
-        f"Prompt template missing: {primary} (no example found at {example})"
+        f"{description} missing: {primary} (no example found at {example})"
     )
 
 
@@ -115,46 +134,26 @@ def ensure_single_resume() -> Path:
     return resumes[0]
 
 
-def ensure_single_job(md_files: Sequence[Path]) -> Path | None:
-    """Validate there is at most one job description file."""
-    if not md_files:
+def ensure_single_job(paths: Sequence[Path]) -> Path | None:
+    """Validate there is at most one job description file (any extension)."""
+    visible = [path for path in paths if path.is_file() and not path.name.startswith(".")]
+    if not visible:
         return None
 
-    if len(md_files) > 1:
-        names = ", ".join(p.name for p in md_files)
+    if len(visible) > 1:
+        names = ", ".join(p.name for p in visible)
         raise FatalConfigurationError(
             f"Multiple job description files detected in inbox: {names} "
             "— expected exactly one."
         )
 
-    return md_files[0]
+    return visible[0]
 
 
-def read_inputs(job_file: Path) -> MarkdownInputs:
-    """Gather and return all markdown inputs required for the prompt."""
-    resume = ensure_single_resume()
-
-    missing = [str(path) for path in (job_file,) if not path.exists()]
-    if missing:
-        raise FatalConfigurationError(
-            "Required files disappeared before processing: " + ", ".join(missing)
-        )
-
-    if RESOLVED_PROMPT_TEMPLATE is None:
-        raise FatalConfigurationError("Prompt template was not resolved during startup.")
-
-    return MarkdownInputs(
-        resume=resume,
-        job_description=job_file,
-        prompt_template=RESOLVED_PROMPT_TEMPLATE,
-    )
-
-
-def build_prompt_text(inputs: MarkdownInputs) -> str:
+def build_prompt_text(resume: Path, job_markdown: str, prompt_template: Path) -> str:
     """Return the combined prompt string fed to the Codex CLI."""
-    resume_text = inputs.resume.read_text(encoding="utf-8").strip()
-    jd_text = inputs.job_description.read_text(encoding="utf-8").strip()
-    instructions_text = inputs.prompt_template.read_text(encoding="utf-8").strip()
+    resume_text = resume.read_text(encoding="utf-8").strip()
+    instructions_text = prompt_template.read_text(encoding="utf-8").strip()
 
     return (
         "# Resume Customization Request\n\n"
@@ -162,7 +161,7 @@ def build_prompt_text(inputs: MarkdownInputs) -> str:
         f"{instructions_text}\n\n"
         "---\n\n"
         "## Job Description\n"
-        f"{jd_text}\n\n"
+        f"{job_markdown.strip()}\n\n"
         "---\n\n"
         "## Current Resume\n"
         f"{resume_text}\n"
@@ -187,9 +186,18 @@ def sanitize_stem(stem: str) -> str:
     return "".join(ch if ch.isalnum() else "_" for ch in stem) or "resume"
 
 
-def run_codex(prompt_path: Path, output_path: Path) -> None:
-    """Execute the Codex CLI using the configured command template."""
-    command_text = CODEX_COMMAND_TEMPLATE.format(
+def slugify(component: str) -> str:
+    """Turn a free-form string into a filesystem-friendly slug."""
+    normalized = "".join(
+        ch.lower() if ch.isalnum() else "-" for ch in component.strip()
+    )
+    parts = [part for part in normalized.split("-") if part]
+    return "-".join(parts)
+
+
+def run_codex(prompt_path: Path, output_path: Path, command_template: str) -> None:
+    """Execute the Codex CLI using the provided command template."""
+    command_text = command_template.format(
         prompt=str(prompt_path),
         output=str(output_path),
     )
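For a feel of the filenames this produces, a rough sketch of slugify() feeding the <company>-<jobtitle>.md pattern from the docstring; the company and title values are invented, and slugify() is assumed to be in scope from the function above:

    print(slugify("Acme Robotics, Inc."))      # acme-robotics-inc
    print(slugify("Senior Backend Engineer"))  # senior-backend-engineer
    # process_job() would then write acme-robotics-inc-senior-backend-engineer.md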
@@ -199,7 +207,7 @@ def run_codex(prompt_path: Path, output_path: Path) -> None:
         command = shlex.split(command_text)
     except ValueError as exc:
         raise FatalConfigurationError(
-            f"Unable to parse CODEX_COMMAND_TEMPLATE into arguments: {exc}"
+            f"Unable to parse Codex command template into arguments: {exc}"
         ) from exc
 
     try:
@@ -222,6 +230,85 @@ def run_codex(prompt_path: Path, output_path: Path) -> None:
         )
 
 
+def build_normalizer_prompt(raw_text: str) -> str:
+    """Construct the prompt for normalizing the raw job description."""
+    if RESOLVED_NORMALIZER_TEMPLATE is None:
+        raise FatalConfigurationError("Normalizer template was not resolved during startup.")
+
+    instructions = RESOLVED_NORMALIZER_TEMPLATE.read_text(encoding="utf-8").strip()
+    return (
+        f"{instructions}\n\n"
+        "---\n\n"
+        "## Raw Job Description\n"
+        "```\n"
+        f"{raw_text.strip()}\n"
+        "```\n"
+    )
+
+
+def parse_normalized_output(text: str) -> NormalizedJobDescription:
+    """Parse the Codex-normalized output into structured pieces."""
+    lines = text.splitlines()
+    idx = 0
+
+    def next_non_empty(start: int) -> tuple[int, str]:
+        pos = start
+        while pos < len(lines):
+            content = lines[pos].strip()
+            if content:
+                return pos, content
+            pos += 1
+        raise RuntimeError("Normalized output is missing expected lines.")
+
+    idx, company_line = next_non_empty(idx)
+    if not company_line.lower().startswith("company:"):
+        raise RuntimeError(f"Expected 'Company:' line, found: {company_line!r}")
+    company = company_line[len("company:") :].strip()
+
+    idx, job_title_line = next_non_empty(idx + 1)
+    if not job_title_line.lower().startswith("job title:"):
+        raise RuntimeError(f"Expected 'Job Title:' line, found: {job_title_line!r}")
+    job_title = job_title_line[len("job title:") :].strip()
+
+    idx += 1
+    while idx < len(lines) and lines[idx].strip():
+        idx += 1
+
+    while idx < len(lines) and not lines[idx].strip():
+        idx += 1
+
+    description_lines = lines[idx:]
+    description = "\n".join(description_lines).strip()
+    if not description:
+        raise RuntimeError("Normalized output did not include a job description section.")
+
+    return NormalizedJobDescription(
+        company=company or "Company",
+        job_title=job_title or "Role",
+        description_markdown=description,
+    )
+
+
+def normalize_job_description(job_file: Path) -> NormalizedJobDescription:
+    """Use Codex to clean and extract metadata from the raw job description."""
+    raw_text = job_file.read_text(encoding="utf-8", errors="ignore").strip()
+    if not raw_text:
+        raise RuntimeError(f"Job description file {job_file.name} is empty after trimming.")
+
+    prompt_text = build_normalizer_prompt(raw_text)
+
+    with TemporaryDirectory() as tmp_dir_str:
+        tmp_dir = Path(tmp_dir_str)
+        prompt_path = tmp_dir / "normalize_prompt.md"
+        prompt_path.write_text(prompt_text, encoding="utf-8")
+
+        output_path = tmp_dir / "normalize_output.md"
+        run_codex(prompt_path, output_path, CODEX_NORMALIZER_COMMAND_TEMPLATE)
+        normalized_text = output_path.read_text(encoding="utf-8").strip()
+
+    return parse_normalized_output(normalized_text)
+
+
 def move_with_unique_target(source: Path, destination_dir: Path) -> Path:
     """Move source into destination_dir, avoiding collisions with numeric suffixes."""
     destination_dir.mkdir(parents=True, exist_ok=True)
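To make the parsing contract concrete, a small illustration of the normalizer output shape that parse_normalized_output() accepts; the sample text is invented, and the description is simply whatever follows the first blank line after the 'Job Title:' line:

    # Assumes parse_normalized_output() from the function above is in scope.
    sample = (
        "Company: Acme Robotics\n"
        "Job Title: Senior Backend Engineer\n"
        "\n"
        "# Job Description\n"
        "Acme Robotics is hiring a backend engineer to own the ingestion pipeline.\n"
    )
    parsed = parse_normalized_output(sample)
    assert parsed.company == "Acme Robotics"
    assert parsed.job_title == "Senior Backend Engineer"
    assert parsed.description_markdown.startswith("# Job Description")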
@@ -240,17 +327,30 @@ def move_with_unique_target(source: Path, destination_dir: Path) -> Path:
 
 
 def process_job(job_file: Path) -> None:
-    """Combine inputs, run Codex, and archive outputs."""
+    """Normalize the job description, run Codex, and archive outputs."""
     timestamp = datetime.now().astimezone()
     out_dir = build_timestamp_dir(OUTBOX, timestamp)
     processed_dir = build_timestamp_dir(PROCESSED, timestamp)
 
-    inputs = read_inputs(job_file)
-    prompt_text = build_prompt_text(inputs)
+    resume_path = ensure_single_resume()
+    normalized = normalize_job_description(job_file)
 
-    safe_resume_stem = sanitize_stem(inputs.resume.stem)
-    safe_job_stem = sanitize_stem(job_file.stem)
-    output_filename = f"{safe_resume_stem}-for-{safe_job_stem}.md"
+    if RESOLVED_PROMPT_TEMPLATE is None:
+        raise FatalConfigurationError("Prompt template was not resolved during startup.")
+
+    prompt_text = build_prompt_text(
+        resume_path,
+        normalized.description_markdown,
+        RESOLVED_PROMPT_TEMPLATE,
+    )
+
+    safe_company = slugify(normalized.company)
+    safe_title = slugify(normalized.job_title)
+    if safe_company and safe_title:
+        output_stem = f"{safe_company}-{safe_title}"
+    else:
+        output_stem = sanitize_stem(job_file.stem)
+    output_filename = f"{output_stem}.md"
 
     with TemporaryDirectory() as tmp_dir_str:
         tmp_dir = Path(tmp_dir_str)
@@ -258,21 +358,36 @@ def process_job(job_file: Path) -> None:
         prompt_path.write_text(prompt_text, encoding="utf-8")
 
         output_path = tmp_dir / "codex_output.md"
-
-        run_codex(prompt_path, output_path)
+        run_codex(prompt_path, output_path, CODEX_COMMAND_TEMPLATE)
 
         generated_output = out_dir / output_filename
         counter = 1
         while generated_output.exists():
-            generated_output = out_dir / f"{safe_resume_stem}-for-{safe_job_stem}_{counter}.md"
+            generated_output = out_dir / f"{output_stem}_{counter}.md"
            counter += 1
 
         shutil.move(str(output_path), generated_output)
-        logging.info("Generated customized resume at %s", generated_output)
+        logging.info(
+            "Generated customized resume for %s - %s at %s",
+            normalized.company,
+            normalized.job_title,
+            generated_output,
+        )
 
-        prompt_archive = out_dir / f"prompt-{safe_job_stem}.md"
+        prompt_archive = out_dir / f"prompt-{generated_output.stem}.md"
         prompt_archive.write_text(prompt_text, encoding="utf-8")
 
+        normalized_archive = out_dir / f"job-description-{generated_output.stem}.md"
+        normalized_archive.write_text(
+            (
+                f"Company: {normalized.company}\n"
+                f"Job Title: {normalized.job_title}\n\n"
+                "# Job Description\n"
+                f"{normalized.description_markdown}\n"
+            ),
+            encoding="utf-8",
+        )
+
     processed_target = move_with_unique_target(job_file, processed_dir)
     logging.info(
         "Archived job description %s to %s",
@@ -309,10 +424,10 @@ def main() -> None:
     logging.info("Resume customizer watcher started")
 
     while True:
-        job_files = sorted(INBOX.glob("*.md"))
+        job_paths = sorted(INBOX.iterdir())
 
         try:
-            job_file = ensure_single_job(job_files)
+            job_file = ensure_single_job(job_paths)
         except FatalConfigurationError as exc:
             logging.error("Fatal configuration error: %s", exc)
             raise SystemExit(2) from exc