chore(tooling): add docx source audit and repair helpers
All checks were successful
SMOKE / smoke (push) Successful in 7s
CI / build-and-anchors (push) Successful in 48s
CI / build-and-anchors (pull_request) Successful in 47s

This commit is contained in:
2026-03-28 23:34:42 +01:00
parent ec8e29a313
commit f86704d67e
3 changed files with 233 additions and 0 deletions

72
scripts/audit-docx-source.py Executable file
View File

@@ -0,0 +1,72 @@
#!/usr/bin/env python3
from __future__ import annotations
import argparse
import sys
import unicodedata
import xml.etree.ElementTree as ET
from zipfile import ZipFile
# Prefix map handed to ElementTree find/findall; "w" is the WordprocessingML
# main namespace used by word/document.xml.
NS = {"w": "http://schemas.openxmlformats.org/wordprocessingml/2006/main"}

# Substrings the audit reports as forbidden. Each entry is searched with a
# plain, case-sensitive substring test in the text of every paragraph.
FORBIDDEN = [
    "coviabilité",
    "sacroinstitutionnelle",
    "technologistique",
    "scripturonormative",
    "textesrepères",
    "ellemême",
    "opérateur de darchicration",
    "systèmes plusieurs statuts",
    "celle-ci se donne à voir",
    "Pour autant il serait",
    "Telles peuvent être le cas de",
    "la co-viabilité devient ,",
]
def norm(s: str) -> str:
    """Return *s* in Unicode NFC form; a falsy value (``None``, ``""``) gives ``""``."""
    text = s if s else ""
    return unicodedata.normalize("NFC", text)
def main() -> int:
    """Audit a DOCX file for the forbidden forms listed in ``FORBIDDEN``.

    The DOCX path is read from the command line. The text of each ``w:p``
    paragraph of ``word/document.xml`` (concatenation of its ``w:t`` nodes,
    NFC-normalised) is searched for every forbidden form; each paragraph with
    at least one hit is printed with its 1-based index.

    Returns:
        0 when nothing forbidden is found, 1 when at least one forbidden form
        is present, 2 when the DOCX cannot be opened or ``word/document.xml``
        cannot be read or parsed.
    """
    parser = argparse.ArgumentParser(description="Audit simple dun DOCX source officiel.")
    parser.add_argument("docx", help="Chemin du fichier .docx")
    args = parser.parse_args()

    try:
        with ZipFile(args.docx) as zf:
            data = zf.read("word/document.xml")
    except FileNotFoundError:
        print(f"ECHEC: fichier introuvable: {args.docx}", file=sys.stderr)
        return 2
    except KeyError:
        print("ECHEC: word/document.xml introuvable dans le DOCX.", file=sys.stderr)
        return 2
    except Exception as e:
        print(f"ECHEC: impossible douvrir le DOCX: {e}", file=sys.stderr)
        return 2

    # A malformed document part must yield the "cannot audit" code (2), not a
    # traceback whose exit status 1 would be read as "forbidden forms found".
    try:
        root = ET.fromstring(data)
    except ET.ParseError as e:
        print(f"ECHEC: word/document.xml illisible (XML invalide): {e}", file=sys.stderr)
        return 2

    # Normalise the needles the same way as the paragraph text so that the
    # comparison does not depend on how this source file encodes accents.
    needles = [norm(needle) for needle in FORBIDDEN]

    found = False
    for i, p in enumerate(root.findall(".//w:p", NS), start=1):
        txt = "".join(t.text or "" for t in p.findall(".//w:t", NS))
        txt_n = norm(txt)
        hits = [needle for needle in needles if needle in txt_n]
        if hits:
            found = True
            print(f"\n[paragraphe {i}]")
            print("Hits :", ", ".join(hits))
            print(txt_n)

    if found:
        print("\nECHEC: formes interdites encore présentes dans le DOCX.")
        return 1
    print("OK: aucune forme interdite trouvée dans le DOCX.")
    return 0


if __name__ == "__main__":
    raise SystemExit(main())

132
scripts/fix-docx-source.py Executable file
View File

@@ -0,0 +1,132 @@
#!/usr/bin/env python3
from __future__ import annotations
import argparse
import shutil
import tempfile
import unicodedata
import xml.etree.ElementTree as ET
from pathlib import Path
from zipfile import ZIP_DEFLATED, ZipFile
# WordprocessingML main namespace (the "w" prefix).
W_NS = "http://schemas.openxmlformats.org/wordprocessingml/2006/main"
# Reserved XML namespace, used to build the xml:space attribute name.
XML_NS = "http://www.w3.org/XML/1998/namespace"
# Prefix map handed to ElementTree find/findall calls.
NS = {"w": W_NS}
# Have ElementTree serialise W_NS with the "w" prefix rather than a generated one.
ET.register_namespace("w", W_NS)
# Mechanical corrections: each key is a faulty form, each value its replacement.
# Entries are applied in this order with plain substring replacement.
REPLACEMENTS = {
    "coviabilité": "co-viabilité",
    "sacroinstitutionnelle": "sacro-institutionnelle",
    "technologistique": "techno-logistique",
    "scripturonormative": "scripturo-normative",
    "textesrepères": "textes-repères",
    "ellemême": "elle-même",
    "opérateur de darchicration": "opérateur darchicration",
    "systèmes plusieurs statuts": "systèmes à plusieurs statuts",
    "celle-ci se donne à voir": "Celle-ci se donne à voir",
    "Pour autant il serait": "Pour autant, il serait",
    "Telles peuvent être le cas de": "Tels peuvent être les cas de",
}
# Deliberately NOT auto-corrected: "la co-viabilité devient ,"
# that case requires a human editorial decision.
def qn(tag: str) -> str:
    """Expand a ``w:``-prefixed tag into ElementTree's ``{uri}local`` notation.

    Raises:
        ValueError: if the prefix is not ``w``, or if *tag* does not contain
            exactly one colon.
    """
    prefix, local = tag.split(":")
    if prefix == "w":
        return "{" + W_NS + "}" + local
    raise ValueError(tag)
def norm(s: str) -> str:
    """Normalise *s* to Unicode NFC, treating ``None`` or ``""`` as the empty string."""
    if not s:
        s = ""
    return unicodedata.normalize("NFC", s)
def paragraph_text(p: ET.Element) -> str:
    """Return the concatenated text of every ``w:t`` descendant of *p*.

    A ``w:t`` node without text contributes the empty string.
    """
    pieces = []
    for node in p.findall(".//w:t", NS):
        pieces.append(node.text or "")
    return "".join(pieces)
def replaced_text(s: str) -> str:
    """Return *s* NFC-normalised with every ``REPLACEMENTS`` entry applied.

    Replacements are applied in dictionary order, each as a plain substring
    substitution on the result of the previous one.
    """
    out = norm(s)
    for bad, good in REPLACEMENTS.items():
        # The text being searched is NFC; normalise both sides of the table as
        # well so matching does not depend on how this file encodes accents.
        out = out.replace(norm(bad), norm(good))
    return out
def rewrite_paragraph_text(p: ET.Element, new_text: str) -> None:
    """Replace the content of paragraph *p* with one run holding *new_text*.

    Every direct child of *p* other than its ``w:pPr`` element (when present)
    is removed, so the previous runs and whatever they carried are discarded.
    A fresh ``w:r``/``w:t`` pair is appended, with ``xml:space="preserve"`` set
    on the text node.
    """
    keep = p.find("w:pPr", NS)
    # Snapshot the children first: removing while iterating would skip nodes.
    for child in [node for node in p if node is not keep]:
        p.remove(child)

    run = ET.Element(qn("w:r"))
    text_node = ET.SubElement(run, qn("w:t"))
    text_node.set(f"{{{XML_NS}}}space", "preserve")
    text_node.text = new_text
    p.append(run)
def process_document_xml(xml_path: Path) -> int:
    """Apply the mechanical replacements to every paragraph of *xml_path*.

    Each ``w:p`` element whose NFC-normalised text is altered by
    ``replaced_text`` is rewritten through ``rewrite_paragraph_text``. The file
    is written back (UTF-8, with XML declaration) only when at least one
    paragraph changed.

    Returns:
        The number of paragraphs rewritten.
    """
    # ElementTree emits generated prefixes (ns0, ns1, ...) for any namespace it
    # has no registered prefix for. Register the document's own prefixes first
    # so they survive the parse/write round trip.
    for _event, (prefix, uri) in ET.iterparse(xml_path, events=("start-ns",)):
        if prefix == "w":
            # Already registered for W_NS at import time; do not override it.
            continue
        try:
            ET.register_namespace(prefix, uri)
        except ValueError:
            # Reserved "ns<digits>" prefixes are rejected; keep the default.
            pass
    # NOTE(review): ElementTree only re-declares namespaces that are used in
    # element or attribute names. Prefixes referenced solely inside attribute
    # values (e.g. an mc:Ignorable list) may lose their declaration -- confirm
    # that the repaired file still opens in Word.

    tree = ET.parse(xml_path)
    root = tree.getroot()
    changed = 0
    for p in root.findall(".//w:p", NS):
        # Compare like with like: replaced_text() returns NFC, so normalise the
        # original too. Otherwise a paragraph that differs only by Unicode
        # normalisation would be rewritten, and flattened, for no visible gain.
        old = norm(paragraph_text(p))
        new = replaced_text(old)
        if new != old:
            rewrite_paragraph_text(p, new)
            changed += 1

    if changed:
        tree.write(xml_path, encoding="utf-8", xml_declaration=True)
    return changed
def repack_docx(tmpdir: Path, out_docx: Path) -> None:
    """Zip the tree under *tmpdir* into *out_docx*.

    The archive is first written next to the target as ``<name>.tmp`` and then
    moved over *out_docx*. Regular files are added in sorted path order with
    names relative to *tmpdir*, using deflate compression; directories
    themselves are not stored.
    """
    staging = out_docx.with_suffix(out_docx.suffix + ".tmp")
    with ZipFile(staging, "w", ZIP_DEFLATED) as archive:
        members = (entry for entry in sorted(tmpdir.rglob("*")) if entry.is_file())
        for member in members:
            archive.write(member, member.relative_to(tmpdir))
    shutil.move(staging, out_docx)
def main() -> int:
    """Repair a DOCX by applying the mechanical replacements to its paragraphs.

    The DOCX path is read from the command line. The archive is extracted to a
    temporary directory, ``word/document.xml`` is processed, and the tree is
    re-zipped either over the source (``--in-place``) or next to it as
    ``<stem>.fixed.docx``.

    Returns:
        0 on success, 2 when the source file is missing, is not a valid zip
        archive, or does not contain ``word/document.xml``.
    """
    # sys is not among the module-level imports; import it here so the error
    # paths work even when main() is called from an importing module.
    import sys
    from zipfile import BadZipFile

    parser = argparse.ArgumentParser(description="Répare mécaniquement certaines scories DOCX.")
    parser.add_argument("docx", help="Chemin du DOCX")
    parser.add_argument("--in-place", action="store_true", help="Réécrit le DOCX en place")
    args = parser.parse_args()

    src = Path(args.docx)
    if not src.exists():
        print(f"ECHEC: fichier introuvable: {src}", file=sys.stderr)
        return 2

    out = src if args.in_place else src.with_name(src.stem + ".fixed.docx")

    with tempfile.TemporaryDirectory(prefix="docx-fix-") as td:
        td_path = Path(td)
        try:
            with ZipFile(src) as zf:
                zf.extractall(td_path)
        except BadZipFile as e:
            # Report a corrupt archive like the other failures instead of
            # letting the traceback escape.
            print(f"ECHEC: archive DOCX invalide: {e}", file=sys.stderr)
            return 2

        document_xml = td_path / "word" / "document.xml"
        if not document_xml.exists():
            print("ECHEC: word/document.xml absent.", file=sys.stderr)
            return 2

        changed = process_document_xml(document_xml)
        repack_docx(td_path, out)

    print(f"OK: DOCX réparé par réécriture paragraphe/XML. Paragraphes modifiés: {changed}")
    return 0


if __name__ == "__main__":
    raise SystemExit(main())

29
scripts/refresh-chapter2.sh Executable file
View File

@@ -0,0 +1,29 @@
#!/usr/bin/env bash
# Refresh chapter 2: audit the official DOCX source, repair it in place when
# the audit reports forbidden forms, re-audit, then re-import, build and test.
# All paths below are relative to the current working directory.
set -euo pipefail

DOCX="sources/docx/archicrat-ia/Chapitre_2Archeogenese_des_regimes_de_co-viabilite-version_officielle.docx"
MANIFEST="sources/manifest.yml"
ONLY="archicrat-ia/chapitre-2"

echo "== Audit source avant fix =="
# Capture the status without tripping `set -e`: the audit exits 1 when it finds
# forbidden forms and 2 when the DOCX itself cannot be read.
audit_status=0
python3 scripts/audit-docx-source.py "$DOCX" || audit_status=$?

case "$audit_status" in
  0)
    ;;
  1)
    echo
    echo "== Fix source =="
    python3 scripts/fix-docx-source.py --in-place "$DOCX"
    echo
    echo "== Audit source après fix =="
    # A remaining hit (e.g. a form that is not auto-corrected) stops the script here.
    python3 scripts/audit-docx-source.py "$DOCX"
    ;;
  *)
    # The source could not be audited at all: do not attempt an in-place rewrite.
    echo "ECHEC: audit impossible (code $audit_status), correction non tentée." >&2
    exit "$audit_status"
    ;;
esac

echo
echo "== Réimport =="
node scripts/import-docx.mjs --manifest "$MANIFEST" --only "$ONLY" --force
echo
echo "== Build =="
npm run build
echo
echo "== Tests =="
npm test