Files
archicratie-edition/scripts/audit-docx-source.py
Archicratia f86704d67e
All checks were successful
SMOKE / smoke (push) Successful in 7s
CI / build-and-anchors (push) Successful in 48s
CI / build-and-anchors (pull_request) Successful in 47s
chore(tooling): add docx source audit and repair helpers
2026-03-28 23:34:42 +01:00

72 lines
2.0 KiB
Python
Executable File
Raw Blame History

This file contains ambiguous Unicode characters
This file contains Unicode characters that might be confused with other characters. If you think that this is intentional, you can safely ignore this warning. Use the Escape button to reveal them.
#!/usr/bin/env python3
from __future__ import annotations
import argparse
import sys
import unicodedata
import xml.etree.ElementTree as ET
from zipfile import ZipFile
NS = {"w": "http://schemas.openxmlformats.org/wordprocessingml/2006/main"}
FORBIDDEN = [
"coviabilité",
"sacroinstitutionnelle",
"technologistique",
"scripturonormative",
"textesrepères",
"ellemême",
"opérateur de darchicration",
"systèmes plusieurs statuts",
"celle-ci se donne à voir",
"Pour autant il serait",
"Telles peuvent être le cas de",
"la co-viabilité devient ,",
]
def norm(s: str) -> str:
return unicodedata.normalize("NFC", s or "")
def main() -> int:
parser = argparse.ArgumentParser(description="Audit simple dun DOCX source officiel.")
parser.add_argument("docx", help="Chemin du fichier .docx")
args = parser.parse_args()
try:
with ZipFile(args.docx) as zf:
data = zf.read("word/document.xml")
except FileNotFoundError:
print(f"ECHEC: fichier introuvable: {args.docx}", file=sys.stderr)
return 2
except KeyError:
print("ECHEC: word/document.xml introuvable dans le DOCX.", file=sys.stderr)
return 2
except Exception as e:
print(f"ECHEC: impossible douvrir le DOCX: {e}", file=sys.stderr)
return 2
root = ET.fromstring(data)
found = False
for i, p in enumerate(root.findall(".//w:p", NS), start=1):
txt = "".join(t.text or "" for t in p.findall(".//w:t", NS))
txt_n = norm(txt)
hits = [needle for needle in FORBIDDEN if needle in txt_n]
if hits:
found = True
print(f"\n[paragraphe {i}]")
print("Hits :", ", ".join(hits))
print(txt_n)
if found:
print("\nECHEC: formes interdites encore présentes dans le DOCX.")
return 1
print("OK: aucune forme interdite trouvée dans le DOCX.")
return 0
if __name__ == "__main__":
raise SystemExit(main())