From f86704d67e7f6cdf963c579a02e5f2798d57d452 Mon Sep 17 00:00:00 2001
From: Archicratia <area.technik@proton.me>
Date: Sat, 28 Mar 2026 23:34:42 +0100
Subject: [PATCH] chore(tooling): add docx source audit and repair helpers

---
 scripts/audit-docx-source.py |  72 +++++++++++++++++++
 scripts/fix-docx-source.py   | 132 +++++++++++++++++++++++++++++++++++
 scripts/refresh-chapter2.sh  |  29 ++++++++
 3 files changed, 233 insertions(+)
 create mode 100755 scripts/audit-docx-source.py
 create mode 100755 scripts/fix-docx-source.py
 create mode 100755 scripts/refresh-chapter2.sh

diff --git a/scripts/audit-docx-source.py b/scripts/audit-docx-source.py
new file mode 100755
index 0000000..77833bf
--- /dev/null
+++ b/scripts/audit-docx-source.py
@@ -0,0 +1,72 @@
+#!/usr/bin/env python3
+from __future__ import annotations
+
+import argparse
+import sys
+import unicodedata
+import xml.etree.ElementTree as ET
+from zipfile import ZipFile
+
+NS = {"w": "http://schemas.openxmlformats.org/wordprocessingml/2006/main"}  # prefix map for the w:* queries in main()
+
+FORBIDDEN = [  # substrings reported when found in a paragraph's NFC-normalised text
+    "coviabilité",
+    "sacroinstitutionnelle",
+    "technologistique",
+    "scripturonormative",
+    "textesrepères",
+    "ellemême",
+    "opérateur de d’archicration",
+    "systèmes plusieurs statuts",
+    "celle-ci se donne à voir",
+    "Pour autant il serait",
+    "Telles peuvent être le cas de",
+    "la co-viabilité devient ,",  # reported only: scripts/fix-docx-source.py leaves this one to a human
+]
+
+
+def norm(s: str) -> str:
+    return unicodedata.normalize("NFC", s or "")
+
+
+def main() -> int:
+    parser = argparse.ArgumentParser(description="Audit simple d’un DOCX source officiel.")
+    parser.add_argument("docx", help="Chemin du fichier .docx")
+    args = parser.parse_args()
+
+    try:
+        with ZipFile(args.docx) as zf:
+            data = zf.read("word/document.xml")
+    except FileNotFoundError:
+        print(f"ECHEC: fichier introuvable: {args.docx}", file=sys.stderr)
+        return 2
+    except KeyError:
+        print("ECHEC: word/document.xml introuvable dans le DOCX.", file=sys.stderr)
+        return 2
+    except Exception as e:
+        print(f"ECHEC: impossible d’ouvrir le DOCX: {e}", file=sys.stderr)
+        return 2
+
+    root = ET.fromstring(data)
+    found = False
+
+    for i, p in enumerate(root.findall(".//w:p", NS), start=1):
+        txt = "".join(t.text or "" for t in p.findall(".//w:t", NS))
+        txt_n = norm(txt)
+        hits = [needle for needle in FORBIDDEN if needle in txt_n]
+        if hits:
+            found = True
+            print(f"\n[paragraphe {i}]")
+            print("Hits :", ", ".join(hits))
+            print(txt_n)
+
+    if found:
+        print("\nECHEC: formes interdites encore présentes dans le DOCX.")
+        return 1
+
+    print("OK: aucune forme interdite trouvée dans le DOCX.")
+    return 0
+
+
+if __name__ == "__main__":  # script entry point: main()'s return value becomes the exit status
+    raise SystemExit(main())
\ No newline at end of file
diff --git a/scripts/fix-docx-source.py b/scripts/fix-docx-source.py
new file mode 100755
index 0000000..8538a0e
--- /dev/null
+++ b/scripts/fix-docx-source.py
@@ -0,0 +1,132 @@
+#!/usr/bin/env python3
+from __future__ import annotations
+
+import argparse
+import shutil
+import tempfile
+import unicodedata
+import xml.etree.ElementTree as ET
+from pathlib import Path
+from zipfile import ZIP_DEFLATED, ZipFile
+
+W_NS = "http://schemas.openxmlformats.org/wordprocessingml/2006/main"  # namespace URI behind the "w" prefix
+XML_NS = "http://www.w3.org/XML/1998/namespace"  # namespace of the xml:space attribute set on rewritten w:t
+NS = {"w": W_NS}  # prefix map passed to find()/findall()
+
+ET.register_namespace("w", W_NS)  # serialise W_NS as "w:"; NOTE(review): no other prefix is registered here
+
+
+REPLACEMENTS = {  # wrong form -> corrected form, applied in this order by replaced_text()
+    "coviabilité": "co-viabilité",
+    "sacroinstitutionnelle": "sacro-institutionnelle",
+    "technologistique": "techno-logistique",
+    "scripturonormative": "scripturo-normative",
+    "textesrepères": "textes-repères",
+    "ellemême": "elle-même",
+    "opérateur de d’archicration": "opérateur d’archicration",
+    "systèmes plusieurs statuts": "systèmes à plusieurs statuts",
+    "celle-ci se donne à voir": "Celle-ci se donne à voir",
+    "Pour autant il serait": "Pour autant, il serait",
+    "Telles peuvent être le cas de": "Tels peuvent être les cas de",
+}
+
+# Deliberately NOT auto-corrected: "la co-viabilité devient ,"
+# this case needs a human editorial decision.
+
+
+def qn(tag: str) -> str:
+    prefix, local = tag.split(":")
+    if prefix != "w":
+        raise ValueError(tag)
+    return f"{{{W_NS}}}{local}"
+
+
+def norm(s: str) -> str:
+    return unicodedata.normalize("NFC", s or "")
+
+
+def paragraph_text(p: ET.Element) -> str:
+    return "".join(t.text or "" for t in p.findall(".//w:t", NS))
+
+
+def replaced_text(s: str) -> str:
+    out = norm(s)
+    for bad, good in REPLACEMENTS.items():
+        out = out.replace(bad, good)
+    return out
+
+
+def rewrite_paragraph_text(p: ET.Element, new_text: str) -> None:
+    ppr = p.find("w:pPr", NS)
+
+    for child in list(p):
+        if ppr is not None and child is ppr:
+            continue
+        p.remove(child)
+
+    r = ET.Element(qn("w:r"))
+    t = ET.SubElement(r, qn("w:t"))
+    t.set(f"{{{XML_NS}}}space", "preserve")
+    t.text = new_text
+    p.append(r)
+
+
+def process_document_xml(xml_path: Path) -> int:
+    tree = ET.parse(xml_path)
+    root = tree.getroot()
+
+    changed = 0
+
+    for p in root.findall(".//w:p", NS):
+        old = paragraph_text(p)
+        new = replaced_text(old)
+        if new != old:
+            rewrite_paragraph_text(p, new)
+            changed += 1
+
+    tree.write(xml_path, encoding="utf-8", xml_declaration=True)
+    return changed
+
+
+def repack_docx(tmpdir: Path, out_docx: Path) -> None:
+    tmp_out = out_docx.with_suffix(out_docx.suffix + ".tmp")
+    with ZipFile(tmp_out, "w", ZIP_DEFLATED) as zf:
+        for p in sorted(tmpdir.rglob("*")):
+            if p.is_file():
+                zf.write(p, p.relative_to(tmpdir))
+    shutil.move(tmp_out, out_docx)
+
+
+def main() -> int:
+    parser = argparse.ArgumentParser(description="Répare mécaniquement certaines scories DOCX.")
+    parser.add_argument("docx", help="Chemin du DOCX")
+    parser.add_argument("--in-place", action="store_true", help="Réécrit le DOCX en place")
+    args = parser.parse_args()
+
+    src = Path(args.docx)
+    if not src.exists():
+        print(f"ECHEC: fichier introuvable: {src}", file=sys.stderr)
+        return 2
+
+    out = src if args.in_place else src.with_name(src.stem + ".fixed.docx")
+
+    with tempfile.TemporaryDirectory(prefix="docx-fix-") as td:
+        td_path = Path(td)
+        with ZipFile(src) as zf:
+            zf.extractall(td_path)
+
+        document_xml = td_path / "word" / "document.xml"
+        if not document_xml.exists():
+            print("ECHEC: word/document.xml absent.", file=sys.stderr)
+            return 2
+
+        changed = process_document_xml(document_xml)
+        repack_docx(td_path, out)
+
+    print(f"OK: DOCX réparé par réécriture paragraphe/XML. Paragraphes modifiés: {changed}")
+    return 0
+
+
+if __name__ == "__main__":
+    import sys
+    raise SystemExit(main())
\ No newline at end of file
diff --git a/scripts/refresh-chapter2.sh b/scripts/refresh-chapter2.sh
new file mode 100755
index 0000000..8b1cff9
--- /dev/null
+++ b/scripts/refresh-chapter2.sh
@@ -0,0 +1,29 @@
+#!/usr/bin/env bash
+set -euo pipefail
+
+DOCX="sources/docx/archicrat-ia/Chapitre_2–Archeogenese_des_regimes_de_co-viabilite-version_officielle.docx"
+MANIFEST="sources/manifest.yml"
+ONLY="archicrat-ia/chapitre-2"
+
+echo "== Audit source avant fix =="
+if ! python3 scripts/audit-docx-source.py "$DOCX"; then
+  echo
+  echo "== Fix source =="
+  python3 scripts/fix-docx-source.py --in-place "$DOCX"
+
+  echo
+  echo "== Audit source après fix =="
+  python3 scripts/audit-docx-source.py "$DOCX"
+fi
+
+echo
+echo "== Réimport =="
+node scripts/import-docx.mjs --manifest "$MANIFEST" --only "$ONLY" --force
+
+echo
+echo "== Build =="
+npm run build
+
+echo
+echo "== Tests =="
+npm test
\ No newline at end of file