import json
import re
import subprocess
import tempfile
from pathlib import Path
import typer
# Matches bibtex-tidy's duplicate-report lines, capturing the removed entry's
# key (group 1) and the key of the entry that was kept (group 2).
DUPLICATE_RE = re.compile(
    r"DUPLICATE_ENTRY: Duplicate removed\. Entry (\S+) .* entry (\S+)\."
)
# Emoji regex covering general emoji ranges.
# NOTE(review): the \U000024c2-\U0001f251 range is very broad — it also spans
# CJK and several other non-emoji scripts, so those characters are stripped
# too. Confirm this is intended before narrowing.
EMOJI_RE = re.compile(
    "[\U0001f600-\U0001f64f"  # emoticons
    "\U0001f300-\U0001f5ff"  # symbols & pictographs
    "\U0001f680-\U0001f6ff"  # transport & map symbols
    "\U0001f1e0-\U0001f1ff"  # flags (iOS)
    "\U00002702-\U000027b0"
    "\U000024c2-\U0001f251"
    "]+",
    flags=re.UNICODE,
)


def strip_emojis(text: str) -> str:
    """Remove emoji characters (per ``EMOJI_RE``) from *text*.

    Args:
        text: Input string, possibly containing emojis.

    Returns:
        The string with all characters matched by ``EMOJI_RE`` removed.
    """
    return EMOJI_RE.sub("", text)
def merge_bibtex_files(bibfiles: list[Path], merged_path: Path):
    """Concatenate BibTeX files into a single emoji-free file.

    Earlier files take precedence during the later deduplication step, so
    the input order matters.

    Args:
        bibfiles: Paths of the BibTeX files to merge, in priority order.
        merged_path: Destination path for the merged, cleaned file.
    """
    parts = []
    for bibfile in bibfiles:
        typer.echo(f"Merging {bibfile}")
        parts.append(bibfile.read_text(encoding="utf-8") + "\n")
    # Strip emojis so they cannot break downstream BibTeX tooling.
    cleaned_content = strip_emojis("".join(parts))
    merged_path.write_text(cleaned_content, encoding="utf-8")
    typer.echo(f"Merged and cleaned {len(bibfiles)} files into {merged_path}")
def run_bibtex_tidy_dedupe(input_bib: Path) -> tuple[str, dict[str, str]]:
    """Deduplicate a BibTeX file in place with ``bibtex-tidy``.

    Args:
        input_bib: Path to the BibTeX file to tidy; modified in place
            (``--modify``).

    Returns:
        A tuple of (deduplicated BibTeX text, mapping of each removed
        duplicate key to the key of the entry that was kept).

    Raises:
        RuntimeError: If ``bibtex-tidy`` exits with a non-zero status.
    """
    cmd = [
        "bibtex-tidy",
        "--duplicates=doi,citation,key",
        "--merge=first",
        "--omit=abstract,note",
        "--remove-empty-fields",
        "--remove-dupe-fields",
        "--escape",
        "--sort-fields",
        "--strip-comments",
        "--modify",
        "--v2",
        str(input_bib),
    ]
    typer.echo(f"Running: {' '.join(cmd)}")
    result = subprocess.run(cmd, capture_output=True, text=True)
    if result.returncode != 0:
        typer.echo("bibtex-tidy failed:")
        typer.echo(result.stdout)
        typer.echo(result.stderr)
        raise RuntimeError("bibtex-tidy failed")
    # Parse duplicate mappings from stdout (bibtex-tidy reports removed
    # duplicates there, one DUPLICATE_ENTRY line per merged entry).
    key_updates: dict[str, str] = {}
    for line in result.stdout.splitlines():
        if m := DUPLICATE_RE.search(line):
            old_key, new_key = m.groups()
            key_updates[old_key] = new_key
    return input_bib.read_text(encoding="utf-8"), key_updates
def merge_and_dedupe(
    bibfiles: list[Path],
    output: Path = Path("merged.bib"),
    mapping: Path = Path("duplicate_keys.json"),
):
    """Merge multiple BibTeX files, deduplicate, and write output and removed key map.

    Earlier files take precedence.

    This function takes a list of BibTeX files, merges them into a single file,
    and then uses `bibtex-tidy` to deduplicate the entries. The deduplicated
    BibTeX file is written to the specified output path. A JSON file containing
    a mapping of the removed duplicate keys to the keys that were kept is also
    generated.

    Args:
        bibfiles: A list of paths to the BibTeX files to merge.
        output: The path to write the merged and deduplicated BibTeX file to.
        mapping: The path to write the JSON file with the duplicate key mappings to.
    """
    # delete=False so the file survives the `with` and bibtex-tidy can reopen
    # it by name (required on platforms where an open temp file is locked).
    with tempfile.NamedTemporaryFile(delete=False, suffix=".bib") as tmp:
        merged_path = Path(tmp.name)
    try:
        merge_bibtex_files(bibfiles, merged_path)
        dedup_text, key_updates = run_bibtex_tidy_dedupe(merged_path)
        output.write_text(dedup_text, encoding="utf-8")
        mapping.write_text(json.dumps(key_updates, indent=2), encoding="utf-8")
        typer.echo(f"Wrote deduplicated bib to {output}")
        typer.echo(f"Wrote deduplication key map to {mapping}")
    finally:
        # Clean up the temporary file even if merging/tidying failed.
        merged_path.unlink(missing_ok=True)
def update_citation(duplicate_keys: Path, files: list[Path]):
    r"""Rewrite ``\cite``-family commands in LaTeX files after deduplication.

    Loads the old-key -> new-key mapping produced by :func:`merge_and_dedupe`
    and replaces every key inside ``\cite{...}`` / ``\citep{...}`` / etc.
    Keys mapped to ``null`` in the JSON are dropped, and keys that collapse
    onto the same surviving entry appear only once.

    Args:
        duplicate_keys: Path to the JSON file mapping removed keys to kept keys.
        files: LaTeX files to rewrite in place.
    """
    key_updates = json.loads(duplicate_keys.read_text(encoding="utf-8"))
    cite_pattern = re.compile(r"(\\cite\w*)\{([^}]+)\}")

    def _replace_cite_keys(match: re.Match) -> str:
        command = match.group(1)
        keys = [k.strip() for k in match.group(2).split(",")]
        # Map each key; drop keys explicitly mapped to None (null in JSON).
        mapped = [key_updates.get(k, k) for k in keys]
        kept = [k for k in mapped if k is not None]
        # Two old keys may map to the same kept key — keep first occurrence only.
        unique = list(dict.fromkeys(kept))
        return f"{command}{{{','.join(unique)}}}"

    for file in files:
        text = file.read_text(encoding="utf-8")
        file.write_text(cite_pattern.sub(_replace_cite_keys, text), encoding="utf-8")