# Source code for luci.bibtools

import json
import re
import subprocess
import tempfile
from pathlib import Path

import typer

# Matches bibtex-tidy's duplicate-removal diagnostic lines, capturing the
# removed entry's key (group 1) and the key of the entry it was merged
# into (group 2).
DUPLICATE_RE = re.compile(
    r"DUPLICATE_ENTRY: Duplicate removed\. Entry (\S+) .* entry (\S+)\."
)

# Emoji regex covering general emoji ranges
# NOTE(review): the \U000024c2-\U0001f251 range is very broad — it also
# covers enclosed alphanumerics and CJK enclosed symbols, not just emoji.
# Confirm that stripping those is intended before narrowing it.
EMOJI_RE = re.compile(
    "[\U0001f600-\U0001f64f"  # emoticons
    "\U0001f300-\U0001f5ff"  # symbols & pictographs
    "\U0001f680-\U0001f6ff"  # transport & map symbols
    "\U0001f1e0-\U0001f1ff"  # flags (iOS)
    "\U00002702-\U000027b0"
    "\U000024c2-\U0001f251"
    "]+",
    flags=re.UNICODE,
)


def strip_emojis(text: str) -> str:
    """Return *text* with every character matched by EMOJI_RE removed."""
    cleaned = EMOJI_RE.sub("", text)
    return cleaned
def merge_bibtex_files(bibfiles: list[Path], merged_path: Path):
    """Merge multiple BibTeX files into a single file (earlier takes precedence).

    Each file's text is appended in order with a trailing newline, emojis are
    stripped from the combined text, and the result is written to *merged_path*
    as UTF-8. One progress line is echoed per input file.
    """
    parts: list[str] = []
    for bibfile in bibfiles:
        typer.echo(f"Merging {bibfile}")
        parts.append(bibfile.read_text(encoding="utf-8"))
    # Build the merged text with a single join instead of repeated string
    # concatenation (which is quadratic in the worst case).
    merged_content = "".join(f"{text}\n" for text in parts)
    # Strip emojis from merged content
    cleaned_content = strip_emojis(merged_content)
    merged_path.write_text(cleaned_content, encoding="utf-8")
    typer.echo(f"Merged and cleaned {len(bibfiles)} files into {merged_path}")
def run_bibtex_tidy_dedupe(input_bib: Path) -> tuple[str, dict[str, str]]:
    """Run bibtex-tidy to deduplicate, returning (deduplicated text, old→new mapping).

    Runs `bibtex-tidy --modify`, so *input_bib* is rewritten in place; the
    deduplicated text is read back from it. The tool's diagnostics are scanned
    for DUPLICATE_ENTRY lines to build the removed-key -> kept-key mapping.

    Raises:
        RuntimeError: if bibtex-tidy exits with a non-zero status.
    """
    cmd = [
        "bibtex-tidy",
        "--duplicates=doi,citation,key",
        "--merge=first",
        "--omit=abstract,note",
        "--remove-empty-fields",
        "--remove-dupe-fields",
        "--escape",
        "--sort-fields",
        "--strip-comments",
        "--modify",
        "--v2",
        str(input_bib),
    ]
    typer.echo(f"Running: {' '.join(cmd)}")
    # List-form argv (shell=False): the file name cannot be shell-injected.
    result = subprocess.run(cmd, capture_output=True, text=True)
    if result.returncode != 0:
        typer.echo("bibtex-tidy failed:")
        typer.echo(result.stdout)
        typer.echo(result.stderr)
        raise RuntimeError("bibtex-tidy failed")

    # Parse duplicate mappings from the tool's diagnostics. Bug fix: the old
    # code scanned only stdout while its comment said stderr — diagnostics can
    # land on either stream depending on the bibtex-tidy version, so scan both.
    key_updates: dict[str, str] = {}
    for line in result.stdout.splitlines() + result.stderr.splitlines():
        if m := DUPLICATE_RE.search(line):
            old_key, new_key = m.groups()
            key_updates[old_key] = new_key

    # Explicit encoding, consistent with the rest of this module.
    return input_bib.read_text(encoding="utf-8"), key_updates
def merge_and_dedupe(
    bibfiles: list[Path],
    output: Path = Path("merged.bib"),
    mapping: Path = Path("duplicate_keys.json"),
):
    """
    Merge multiple BibTeX files and deduplicate using bibtex-tidy.

    Writes the deduplicated BibTeX to --output and a JSON map of removed
    duplicate keys to kept keys to --mapping. Earlier files take precedence.
    Requires `bibtex-tidy` to be installed and on PATH.
    """
    # Create a named temp file and close it immediately; we only need its path.
    tmp = tempfile.NamedTemporaryFile(delete=False, suffix=".bib")
    merged_path = Path(tmp.name)
    tmp.close()
    try:
        merge_bibtex_files(bibfiles, merged_path)
        dedup_text, key_updates = run_bibtex_tidy_dedupe(merged_path)
        output.write_text(dedup_text, encoding="utf-8")
        mapping.write_text(json.dumps(key_updates, indent=2))
        typer.echo(f"Wrote deduplicated bib to {output}")
        typer.echo(f"Wrote deduplication key map to {mapping}")
    finally:
        merged_path.unlink()  # Clean up temporary file
def update_citation(duplicate_keys: Path, files: list[Path]):
    r"""
    Update LaTeX citation keys using a JSON mapping.

    Use the mapping produced by `merge-bibs` (old→new keys) to rewrite
    citation commands (e.g. \cite, \citet) across one or more files.
    Keys mapped to null in the JSON are dropped from the citation entirely.
    Files are read and rewritten in place as UTF-8.
    """
    key_updates = json.loads(duplicate_keys.read_text(encoding="utf-8"))
    # \cite, \citet, \citep, ... (group 1) followed by a braced key list (group 2).
    cite_pattern = re.compile(r"(\\cite\w*)\{([^}]+)\}")

    def replace_cite_keys(match: re.Match) -> str:
        command = match.group(1)
        keys = [k.strip() for k in match.group(2).split(",")]
        # Map each key through the updates; a key explicitly mapped to
        # None/null is removed from the citation.
        updated = [key_updates.get(k, k) for k in keys]
        kept = [k for k in updated if k is not None]
        return f"{command}{{{','.join(kept)}}}"

    for file in files:
        # Bug fix: read/write with explicit UTF-8 (the old code used the
        # platform default encoding, unlike the rest of this module).
        # Substituting on the whole text also rewrites citations whose key
        # list wraps across a line break, which line-by-line processing missed.
        text = file.read_text(encoding="utf-8")
        file.write_text(cite_pattern.sub(replace_cite_keys, text), encoding="utf-8")