From f86704d67e7f6cdf963c579a02e5f2798d57d452 Mon Sep 17 00:00:00 2001
From: Archicratia
Date: Sat, 28 Mar 2026 23:34:42 +0100
Subject: [PATCH] chore(tooling): add docx source audit and repair helpers

---
 scripts/audit-docx-source.py |  84 ++++++++++++++++++
 scripts/fix-docx-source.py   | 164 +++++++++++++++++++++++++++++++++++
 scripts/refresh-chapter2.sh  |  31 +++++++
 3 files changed, 279 insertions(+)
 create mode 100755 scripts/audit-docx-source.py
 create mode 100755 scripts/fix-docx-source.py
 create mode 100755 scripts/refresh-chapter2.sh

diff --git a/scripts/audit-docx-source.py b/scripts/audit-docx-source.py
new file mode 100755
index 0000000..77833bf
--- /dev/null
+++ b/scripts/audit-docx-source.py
@@ -0,0 +1,84 @@
+#!/usr/bin/env python3
+"""Audit a source DOCX for known forbidden spellings and phrasings.
+
+Exit status: 0 when no forbidden form is found, 1 when at least one is
+present, 2 when the DOCX or its word/document.xml cannot be read.
+"""
+from __future__ import annotations
+
+import argparse
+import sys
+import unicodedata
+import xml.etree.ElementTree as ET
+from zipfile import ZipFile
+
+NS = {"w": "http://schemas.openxmlformats.org/wordprocessingml/2006/main"}
+
+# Literal substrings looked up in the NFC-normalized text of each paragraph.
+FORBIDDEN = [
+    "coviabilité",
+    "sacroinstitutionnelle",
+    "technologistique",
+    "scripturonormative",
+    "textesrepères",
+    "ellemême",
+    "opérateur de d’archicration",
+    "systèmes plusieurs statuts",
+    "celle-ci se donne à voir",
+    "Pour autant il serait",
+    "Telles peuvent être le cas de",
+    "la co-viabilité devient ,",
+]
+
+
+def norm(s: str) -> str:
+    """Return *s* normalized to Unicode NFC; a falsy *s* yields ``""``."""
+    return unicodedata.normalize("NFC", s or "")
+
+
+def main() -> int:
+    """Report every paragraph of the DOCX that contains a forbidden form.
+
+    Returns the process exit status described in the module docstring.
+    """
+    parser = argparse.ArgumentParser(description="Audit simple d’un DOCX source officiel.")
+    parser.add_argument("docx", help="Chemin du fichier .docx")
+    args = parser.parse_args()
+
+    try:
+        with ZipFile(args.docx) as zf:
+            data = zf.read("word/document.xml")
+    except FileNotFoundError:
+        print(f"ECHEC: fichier introuvable: {args.docx}", file=sys.stderr)
+        return 2
+    except KeyError:
+        print("ECHEC: word/document.xml introuvable dans le DOCX.", file=sys.stderr)
+        return 2
+    except Exception as e:
+        print(f"ECHEC: impossible d’ouvrir le DOCX: {e}", file=sys.stderr)
+        return 2
+
+    root = ET.fromstring(data)
+    found = False
+
+    # Paragraph numbers are 1-based and follow document order of w:p elements.
+    for i, p in enumerate(root.findall(".//w:p", NS), start=1):
+        txt = "".join(t.text or "" for t in p.findall(".//w:t", NS))
+        txt_n = norm(txt)
+        hits = [needle for needle in FORBIDDEN if needle in txt_n]
+        if hits:
+            found = True
+            print(f"\n[paragraphe {i}]")
+            print("Hits :", ", ".join(hits))
+            print(txt_n)
+
+    if found:
+        print("\nECHEC: formes interdites encore présentes dans le DOCX.")
+        return 1
+
+    print("OK: aucune forme interdite trouvée dans le DOCX.")
+    return 0
+
+
+if __name__ == "__main__":
+    raise SystemExit(main())
\ No newline at end of file
diff --git a/scripts/fix-docx-source.py b/scripts/fix-docx-source.py
new file mode 100755
index 0000000..8538a0e
--- /dev/null
+++ b/scripts/fix-docx-source.py
@@ -0,0 +1,164 @@
+#!/usr/bin/env python3
+"""Mechanically repair known typos in the word/document.xml of a DOCX."""
+from __future__ import annotations
+
+import argparse
+import shutil
+import sys
+import tempfile
+import unicodedata
+import xml.etree.ElementTree as ET
+from pathlib import Path
+from zipfile import ZIP_DEFLATED, ZipFile
+
+W_NS = "http://schemas.openxmlformats.org/wordprocessingml/2006/main"
+XML_NS = "http://www.w3.org/XML/1998/namespace"
+NS = {"w": W_NS}
+
+# NOTE(review): only the "w" prefix is registered, so ElementTree writes every
+# other namespace with a generated ns0/ns1/... prefix. Confirm that Word still
+# opens the result when the document relies on other prefixes (mc:Ignorable).
+ET.register_namespace("w", W_NS)
+
+
+# Literal substring -> replacement, applied in this order to NFC text.
+REPLACEMENTS = {
+    "coviabilité": "co-viabilité",
+    "sacroinstitutionnelle": "sacro-institutionnelle",
+    "technologistique": "techno-logistique",
+    "scripturonormative": "scripturo-normative",
+    "textesrepères": "textes-repères",
+    "ellemême": "elle-même",
+    "opérateur de d’archicration": "opérateur d’archicration",
+    "systèmes plusieurs statuts": "systèmes à plusieurs statuts",
+    "celle-ci se donne à voir": "Celle-ci se donne à voir",
+    "Pour autant il serait": "Pour autant, il serait",
+    "Telles peuvent être le cas de": "Tels peuvent être les cas de",
+}
+
+# Deliberately NOT auto-corrected: "la co-viabilité devient ,"
+# this case requires a human editorial decision.
+
+
+def qn(tag: str) -> str:
+    """Expand a ``w:``-prefixed tag to its ``{namespace}local`` form.
+
+    Raises ValueError when *tag* is not exactly ``w:<local>``.
+    """
+    prefix, local = tag.split(":")
+    if prefix != "w":
+        raise ValueError(tag)
+    return f"{{{W_NS}}}{local}"
+
+
+def norm(s: str) -> str:
+    """Return *s* normalized to Unicode NFC; a falsy *s* yields ``""``."""
+    return unicodedata.normalize("NFC", s or "")
+
+
+def paragraph_text(p: ET.Element) -> str:
+    """Return the concatenated text of every ``w:t`` descendant of *p*."""
+    return "".join(t.text or "" for t in p.findall(".//w:t", NS))
+
+
+def replaced_text(s: str) -> str:
+    """Return *s* NFC-normalized with every REPLACEMENTS entry applied."""
+    out = norm(s)
+    for bad, good in REPLACEMENTS.items():
+        out = out.replace(bad, good)
+    return out
+
+
+def rewrite_paragraph_text(p: ET.Element, new_text: str) -> None:
+    """Replace the content of *p* with a single run containing *new_text*.
+
+    The direct ``w:pPr`` child is kept; every other child is removed, so the
+    original runs (and whatever formatting they carried) are discarded.
+    """
+    ppr = p.find("w:pPr", NS)
+
+    for child in list(p):
+        if ppr is not None and child is ppr:
+            continue
+        p.remove(child)
+
+    r = ET.Element(qn("w:r"))
+    t = ET.SubElement(r, qn("w:t"))
+    t.set(f"{{{XML_NS}}}space", "preserve")
+    t.text = new_text
+    p.append(r)
+
+
+def process_document_xml(xml_path: Path) -> int:
+    """Apply REPLACEMENTS to every paragraph of *xml_path* and rewrite the file.
+
+    Returns the number of paragraphs that were rewritten.
+    """
+    tree = ET.parse(xml_path)
+    root = tree.getroot()
+
+    changed = 0
+
+    for p in root.findall(".//w:p", NS):
+        # Compare NFC with NFC: a paragraph that differs only by Unicode
+        # normalization must not be flattened into a single run.
+        old = norm(paragraph_text(p))
+        new = replaced_text(old)
+        if new != old:
+            rewrite_paragraph_text(p, new)
+            changed += 1
+
+    tree.write(xml_path, encoding="utf-8", xml_declaration=True)
+    return changed
+
+
+def repack_docx(tmpdir: Path, out_docx: Path) -> None:
+    """Zip every file under *tmpdir* into *out_docx*.
+
+    The archive is written to ``<out_docx>.tmp`` first, then moved into place.
+    """
+    tmp_out = out_docx.with_suffix(out_docx.suffix + ".tmp")
+    with ZipFile(tmp_out, "w", ZIP_DEFLATED) as zf:
+        for p in sorted(tmpdir.rglob("*")):
+            if p.is_file():
+                zf.write(p, p.relative_to(tmpdir))
+    shutil.move(tmp_out, out_docx)
+
+
+def main() -> int:
+    """Repair the DOCX given on the command line and return the exit status.
+
+    Returns 0 on success, 2 when the DOCX or its word/document.xml is missing.
+    Without ``--in-place`` the result is written to ``<stem>.fixed.docx``.
+    """
+    parser = argparse.ArgumentParser(description="Répare mécaniquement certaines scories DOCX.")
+    parser.add_argument("docx", help="Chemin du DOCX")
+    parser.add_argument("--in-place", action="store_true", help="Réécrit le DOCX en place")
+    args = parser.parse_args()
+
+    src = Path(args.docx)
+    if not src.exists():
+        print(f"ECHEC: fichier introuvable: {src}", file=sys.stderr)
+        return 2
+
+    out = src if args.in_place else src.with_name(src.stem + ".fixed.docx")
+
+    with tempfile.TemporaryDirectory(prefix="docx-fix-") as td:
+        td_path = Path(td)
+        with ZipFile(src) as zf:
+            zf.extractall(td_path)
+
+        document_xml = td_path / "word" / "document.xml"
+        if not document_xml.exists():
+            print("ECHEC: word/document.xml absent.", file=sys.stderr)
+            return 2
+
+        changed = process_document_xml(document_xml)
+        repack_docx(td_path, out)
+
+    print(f"OK: DOCX réparé par réécriture paragraphe/XML. Paragraphes modifiés: {changed}")
+    return 0
+
+
+if __name__ == "__main__":
+    raise SystemExit(main())
\ No newline at end of file
diff --git a/scripts/refresh-chapter2.sh b/scripts/refresh-chapter2.sh
new file mode 100755
index 0000000..8b1cff9
--- /dev/null
+++ b/scripts/refresh-chapter2.sh
@@ -0,0 +1,31 @@
+#!/usr/bin/env bash
+# Audit the chapter 2 source DOCX, repair it in place when the audit fails,
+# audit again, then re-import the chapter, build and run the tests.
+set -euo pipefail
+
+DOCX="sources/docx/archicrat-ia/Chapitre_2–Archeogenese_des_regimes_de_co-viabilite-version_officielle.docx"
+MANIFEST="sources/manifest.yml"
+ONLY="archicrat-ia/chapitre-2"
+
+echo "== Audit source avant fix =="
+if ! python3 scripts/audit-docx-source.py "$DOCX"; then
+  echo
+  echo "== Fix source =="
+  python3 scripts/fix-docx-source.py --in-place "$DOCX"
+
+  echo
+  echo "== Audit source après fix =="
+  python3 scripts/audit-docx-source.py "$DOCX"
+fi
+
+echo
+echo "== Réimport =="
+node scripts/import-docx.mjs --manifest "$MANIFEST" --only "$ONLY" --force
+
+echo
+echo "== Build =="
+npm run build
+
+echo
+echo "== Tests =="
+npm test
\ No newline at end of file