#!/usr/bin/env python3
"""Mechanically repair a fixed list of known typographic slips in a DOCX file.

The script unzips the DOCX into a temporary directory, patches the text of
``word/document.xml`` according to ``REPLACEMENTS``, and zips the result back,
either next to the source (``<stem>.fixed.docx``) or in place.
"""
from __future__ import annotations

import argparse
import shutil
import sys
import tempfile
import unicodedata
import xml.etree.ElementTree as ET
from pathlib import Path
from zipfile import ZIP_DEFLATED, BadZipFile, ZipFile

W_NS = "http://schemas.openxmlformats.org/wordprocessingml/2006/main"
XML_NS = "http://www.w3.org/XML/1998/namespace"
NS = {"w": W_NS}
ET.register_namespace("w", W_NS)

# Known slip -> corrected form. Applied in insertion order on NFC text.
REPLACEMENTS = {
    "coviabilité": "co-viabilité",
    "sacroinstitutionnelle": "sacro-institutionnelle",
    "technologistique": "techno-logistique",
    "scripturonormative": "scripturo-normative",
    "textesrepères": "textes-repères",
    "ellemême": "elle-même",
    "opérateur de d’archicration": "opérateur d’archicration",
    "systèmes plusieurs statuts": "systèmes à plusieurs statuts",
    "celle-ci se donne à voir": "Celle-ci se donne à voir",
    "Pour autant il serait": "Pour autant, il serait",
    "Telles peuvent être le cas de": "Tels peuvent être les cas de",
}
# Deliberately NOT auto-corrected: "la co-viabilité devient ,"
# That case needs a human editorial decision.


def qn(tag: str) -> str:
    """Expand a ``w:``-prefixed tag into ElementTree's ``{uri}local`` form.

    Raises:
        ValueError: if *tag* is not of the form ``w:<local>``.
    """
    prefix, local = tag.split(":")
    if prefix != "w":
        raise ValueError(tag)
    return f"{{{W_NS}}}{local}"


def norm(s: str) -> str:
    """Return *s* in Unicode NFC form; a falsy value yields ``""``."""
    return unicodedata.normalize("NFC", s or "")


def paragraph_text(p: ET.Element) -> str:
    """Concatenate the text of every ``w:t`` descendant of *p*, in document order."""
    return "".join(t.text or "" for t in p.findall(".//w:t", NS))


def replaced_text(s: str) -> str:
    """Return *s* NFC-normalized with every ``REPLACEMENTS`` entry applied."""
    out = norm(s)
    for bad, good in REPLACEMENTS.items():
        out = out.replace(bad, good)
    return out


def rewrite_paragraph_text(p: ET.Element, new_text: str) -> None:
    """Replace the whole content of paragraph *p* by a single run of *new_text*.

    Only the paragraph properties (``w:pPr``) are kept; every other child
    (runs and whatever else the paragraph contained) is removed, so any
    run-level markup is lost. Used as a last resort when the text cannot be
    patched node by node.
    """
    ppr = p.find("w:pPr", NS)
    for child in list(p):
        if ppr is not None and child is ppr:
            continue
        p.remove(child)

    r = ET.Element(qn("w:r"))
    t = ET.SubElement(r, qn("w:t"))
    # Without xml:space="preserve", leading/trailing blanks may be dropped.
    t.set(f"{{{XML_NS}}}space", "preserve")
    t.text = new_text
    p.append(r)


def _patch_text_nodes(p: ET.Element, new_text: str) -> bool:
    """Try to reach *new_text* by fixing each ``w:t`` node of *p* separately.

    Nothing is modified unless the per-node result, once concatenated, is
    exactly *new_text*; this keeps the paragraph structure intact whenever
    every slip lies inside a single text node.

    Returns:
        True if the nodes were patched, False if the caller must fall back
        to a full paragraph rewrite (e.g. a slip straddles two runs).
    """
    nodes = p.findall(".//w:t", NS)
    patched = [replaced_text(t.text or "") for t in nodes]
    if "".join(patched) != new_text:
        return False
    for node, text in zip(nodes, patched):
        if text != (node.text or ""):
            node.text = text
            node.set(f"{{{XML_NS}}}space", "preserve")
    return True


def _register_declared_namespaces(xml_path: Path) -> dict[str, str]:
    """Register the document's own namespace prefixes with ElementTree.

    Without this, every namespace other than ``w`` is re-serialized with a
    generated ``ns0``/``ns1``... prefix.

    Returns:
        Mapping prefix -> URI of the declarations that were registered
        (first declaration of a prefix wins; the default namespace and
        prefixes refused by ``register_namespace`` are skipped).
    """
    declared: dict[str, str] = {}
    for _event, (prefix, uri) in ET.iterparse(xml_path, events=("start-ns",)):
        if not prefix or prefix in declared:
            continue
        try:
            ET.register_namespace(prefix, uri)
        except ValueError:
            # ElementTree reserves prefixes of the form "ns<digits>".
            continue
        declared[prefix] = uri
    return declared


def _redeclare_unused_namespaces(root: ET.Element, declared: dict[str, str]) -> None:
    """Re-add to *root* the declarations ElementTree would otherwise drop.

    ElementTree only emits ``xmlns:*`` for namespaces used by a tag or an
    attribute name. Prefixes that are merely *mentioned* in attribute values
    (such as the list in ``mc:Ignorable``) would end up undeclared, so they
    are written back as plain attributes. Namespaces that are in use are
    left to ElementTree to avoid duplicate declarations.
    """
    used = set()
    for elem in root.iter():
        for name in (elem.tag, *elem.attrib):
            if isinstance(name, str) and name.startswith("{"):
                used.add(name[1:].split("}", 1)[0])
    for prefix, uri in declared.items():
        if uri not in used:
            root.set(f"xmlns:{prefix}", uri)


def process_document_xml(xml_path: Path) -> int:
    """Apply ``REPLACEMENTS`` to every paragraph of *xml_path*, in place.

    A paragraph is touched only when a replacement actually applies (a mere
    NFC/NFD difference is ignored). Text nodes are patched individually when
    possible; otherwise the paragraph is rewritten as a single run.

    Returns:
        The number of paragraphs that were modified.

    Raises:
        xml.etree.ElementTree.ParseError: if the file is not well-formed XML.
    """
    declared = _register_declared_namespaces(xml_path)
    tree = ET.parse(xml_path)
    root = tree.getroot()
    changed = 0

    # findall() materializes the list, so rewriting paragraphs is safe here.
    for p in root.findall(".//w:p", NS):
        old = paragraph_text(p)
        new = replaced_text(old)
        if new == norm(old):
            continue
        if not _patch_text_nodes(p, new):
            rewrite_paragraph_text(p, new)
        changed += 1

    _redeclare_unused_namespaces(root, declared)
    tree.write(xml_path, encoding="utf-8", xml_declaration=True)
    return changed


def repack_docx(tmpdir: Path, out_docx: Path) -> None:
    """Zip every file under *tmpdir* into *out_docx*.

    The archive is first written to ``<out_docx>.tmp`` and then moved over
    the destination, so a failure while zipping does not clobber an existing
    file; the temporary file is removed if anything goes wrong.
    """
    tmp_out = out_docx.with_suffix(out_docx.suffix + ".tmp")
    try:
        with ZipFile(tmp_out, "w", ZIP_DEFLATED) as zf:
            for p in sorted(tmpdir.rglob("*")):
                if p.is_file():
                    zf.write(p, p.relative_to(tmpdir).as_posix())
        shutil.move(str(tmp_out), str(out_docx))
    finally:
        # Only still present when zipping or moving failed.
        if tmp_out.exists():
            tmp_out.unlink()


def main() -> int:
    """Command-line entry point.

    Returns:
        0 on success, 2 when the input is missing, is not a valid ZIP
        archive, lacks ``word/document.xml``, or that part cannot be parsed.
    """
    parser = argparse.ArgumentParser(description="Répare mécaniquement certaines scories DOCX.")
    parser.add_argument("docx", help="Chemin du DOCX")
    parser.add_argument("--in-place", action="store_true", help="Réécrit le DOCX en place")
    args = parser.parse_args()

    src = Path(args.docx)
    if not src.is_file():
        print(f"ECHEC: fichier introuvable: {src}", file=sys.stderr)
        return 2

    out = src if args.in_place else src.with_name(src.stem + ".fixed.docx")

    with tempfile.TemporaryDirectory(prefix="docx-fix-") as td:
        td_path = Path(td)
        try:
            with ZipFile(src) as zf:
                zf.extractall(td_path)
        except BadZipFile:
            print(f"ECHEC: archive DOCX invalide: {src}", file=sys.stderr)
            return 2

        document_xml = td_path / "word" / "document.xml"
        if not document_xml.exists():
            print("ECHEC: word/document.xml absent.", file=sys.stderr)
            return 2

        try:
            changed = process_document_xml(document_xml)
        except ET.ParseError as exc:
            print(f"ECHEC: word/document.xml illisible: {exc}", file=sys.stderr)
            return 2
        repack_docx(td_path, out)

    print(f"OK: DOCX réparé par réécriture paragraphe/XML. Paragraphes modifiés: {changed}")
    return 0


if __name__ == "__main__":
    raise SystemExit(main())