chore(tooling): add docx source audit and repair helpers
This commit is contained in:
72
scripts/audit-docx-source.py
Executable file
72
scripts/audit-docx-source.py
Executable file
@@ -0,0 +1,72 @@
|
||||
#!/usr/bin/env python3
|
||||
from __future__ import annotations
|
||||
|
||||
import argparse
|
||||
import sys
|
||||
import unicodedata
|
||||
import xml.etree.ElementTree as ET
|
||||
from zipfile import ZipFile
|
||||
|
||||
NS = {"w": "http://schemas.openxmlformats.org/wordprocessingml/2006/main"}
|
||||
|
||||
FORBIDDEN = [
|
||||
"coviabilité",
|
||||
"sacroinstitutionnelle",
|
||||
"technologistique",
|
||||
"scripturonormative",
|
||||
"textesrepères",
|
||||
"ellemême",
|
||||
"opérateur de d’archicration",
|
||||
"systèmes plusieurs statuts",
|
||||
"celle-ci se donne à voir",
|
||||
"Pour autant il serait",
|
||||
"Telles peuvent être le cas de",
|
||||
"la co-viabilité devient ,",
|
||||
]
|
||||
|
||||
|
||||
def norm(s: str) -> str:
|
||||
return unicodedata.normalize("NFC", s or "")
|
||||
|
||||
|
||||
def main() -> int:
|
||||
parser = argparse.ArgumentParser(description="Audit simple d’un DOCX source officiel.")
|
||||
parser.add_argument("docx", help="Chemin du fichier .docx")
|
||||
args = parser.parse_args()
|
||||
|
||||
try:
|
||||
with ZipFile(args.docx) as zf:
|
||||
data = zf.read("word/document.xml")
|
||||
except FileNotFoundError:
|
||||
print(f"ECHEC: fichier introuvable: {args.docx}", file=sys.stderr)
|
||||
return 2
|
||||
except KeyError:
|
||||
print("ECHEC: word/document.xml introuvable dans le DOCX.", file=sys.stderr)
|
||||
return 2
|
||||
except Exception as e:
|
||||
print(f"ECHEC: impossible d’ouvrir le DOCX: {e}", file=sys.stderr)
|
||||
return 2
|
||||
|
||||
root = ET.fromstring(data)
|
||||
found = False
|
||||
|
||||
for i, p in enumerate(root.findall(".//w:p", NS), start=1):
|
||||
txt = "".join(t.text or "" for t in p.findall(".//w:t", NS))
|
||||
txt_n = norm(txt)
|
||||
hits = [needle for needle in FORBIDDEN if needle in txt_n]
|
||||
if hits:
|
||||
found = True
|
||||
print(f"\n[paragraphe {i}]")
|
||||
print("Hits :", ", ".join(hits))
|
||||
print(txt_n)
|
||||
|
||||
if found:
|
||||
print("\nECHEC: formes interdites encore présentes dans le DOCX.")
|
||||
return 1
|
||||
|
||||
print("OK: aucune forme interdite trouvée dans le DOCX.")
|
||||
return 0
|
||||
|
||||
|
||||
if __name__ == "__main__":
|
||||
raise SystemExit(main())
|
||||
132
scripts/fix-docx-source.py
Executable file
132
scripts/fix-docx-source.py
Executable file
@@ -0,0 +1,132 @@
|
||||
#!/usr/bin/env python3
|
||||
from __future__ import annotations
|
||||
|
||||
import argparse
|
||||
import shutil
|
||||
import tempfile
|
||||
import unicodedata
|
||||
import xml.etree.ElementTree as ET
|
||||
from pathlib import Path
|
||||
from zipfile import ZIP_DEFLATED, ZipFile
|
||||
|
||||
W_NS = "http://schemas.openxmlformats.org/wordprocessingml/2006/main"
|
||||
XML_NS = "http://www.w3.org/XML/1998/namespace"
|
||||
NS = {"w": W_NS}
|
||||
|
||||
ET.register_namespace("w", W_NS)
|
||||
|
||||
|
||||
REPLACEMENTS = {
|
||||
"coviabilité": "co-viabilité",
|
||||
"sacroinstitutionnelle": "sacro-institutionnelle",
|
||||
"technologistique": "techno-logistique",
|
||||
"scripturonormative": "scripturo-normative",
|
||||
"textesrepères": "textes-repères",
|
||||
"ellemême": "elle-même",
|
||||
"opérateur de d’archicration": "opérateur d’archicration",
|
||||
"systèmes plusieurs statuts": "systèmes à plusieurs statuts",
|
||||
"celle-ci se donne à voir": "Celle-ci se donne à voir",
|
||||
"Pour autant il serait": "Pour autant, il serait",
|
||||
"Telles peuvent être le cas de": "Tels peuvent être les cas de",
|
||||
}
|
||||
|
||||
# volontairement NON auto-corrigé : "la co-viabilité devient ,"
|
||||
# ce cas demande une décision éditoriale humaine.
|
||||
|
||||
|
||||
def qn(tag: str) -> str:
|
||||
prefix, local = tag.split(":")
|
||||
if prefix != "w":
|
||||
raise ValueError(tag)
|
||||
return f"{{{W_NS}}}{local}"
|
||||
|
||||
|
||||
def norm(s: str) -> str:
|
||||
return unicodedata.normalize("NFC", s or "")
|
||||
|
||||
|
||||
def paragraph_text(p: ET.Element) -> str:
|
||||
return "".join(t.text or "" for t in p.findall(".//w:t", NS))
|
||||
|
||||
|
||||
def replaced_text(s: str) -> str:
|
||||
out = norm(s)
|
||||
for bad, good in REPLACEMENTS.items():
|
||||
out = out.replace(bad, good)
|
||||
return out
|
||||
|
||||
|
||||
def rewrite_paragraph_text(p: ET.Element, new_text: str) -> None:
|
||||
ppr = p.find("w:pPr", NS)
|
||||
|
||||
for child in list(p):
|
||||
if ppr is not None and child is ppr:
|
||||
continue
|
||||
p.remove(child)
|
||||
|
||||
r = ET.Element(qn("w:r"))
|
||||
t = ET.SubElement(r, qn("w:t"))
|
||||
t.set(f"{{{XML_NS}}}space", "preserve")
|
||||
t.text = new_text
|
||||
p.append(r)
|
||||
|
||||
|
||||
def process_document_xml(xml_path: Path) -> int:
|
||||
tree = ET.parse(xml_path)
|
||||
root = tree.getroot()
|
||||
|
||||
changed = 0
|
||||
|
||||
for p in root.findall(".//w:p", NS):
|
||||
old = paragraph_text(p)
|
||||
new = replaced_text(old)
|
||||
if new != old:
|
||||
rewrite_paragraph_text(p, new)
|
||||
changed += 1
|
||||
|
||||
tree.write(xml_path, encoding="utf-8", xml_declaration=True)
|
||||
return changed
|
||||
|
||||
|
||||
def repack_docx(tmpdir: Path, out_docx: Path) -> None:
|
||||
tmp_out = out_docx.with_suffix(out_docx.suffix + ".tmp")
|
||||
with ZipFile(tmp_out, "w", ZIP_DEFLATED) as zf:
|
||||
for p in sorted(tmpdir.rglob("*")):
|
||||
if p.is_file():
|
||||
zf.write(p, p.relative_to(tmpdir))
|
||||
shutil.move(tmp_out, out_docx)
|
||||
|
||||
|
||||
def main() -> int:
|
||||
parser = argparse.ArgumentParser(description="Répare mécaniquement certaines scories DOCX.")
|
||||
parser.add_argument("docx", help="Chemin du DOCX")
|
||||
parser.add_argument("--in-place", action="store_true", help="Réécrit le DOCX en place")
|
||||
args = parser.parse_args()
|
||||
|
||||
src = Path(args.docx)
|
||||
if not src.exists():
|
||||
print(f"ECHEC: fichier introuvable: {src}", file=sys.stderr)
|
||||
return 2
|
||||
|
||||
out = src if args.in_place else src.with_name(src.stem + ".fixed.docx")
|
||||
|
||||
with tempfile.TemporaryDirectory(prefix="docx-fix-") as td:
|
||||
td_path = Path(td)
|
||||
with ZipFile(src) as zf:
|
||||
zf.extractall(td_path)
|
||||
|
||||
document_xml = td_path / "word" / "document.xml"
|
||||
if not document_xml.exists():
|
||||
print("ECHEC: word/document.xml absent.", file=sys.stderr)
|
||||
return 2
|
||||
|
||||
changed = process_document_xml(document_xml)
|
||||
repack_docx(td_path, out)
|
||||
|
||||
print(f"OK: DOCX réparé par réécriture paragraphe/XML. Paragraphes modifiés: {changed}")
|
||||
return 0
|
||||
|
||||
|
||||
if __name__ == "__main__":
|
||||
import sys
|
||||
raise SystemExit(main())
|
||||
29
scripts/refresh-chapter2.sh
Executable file
29
scripts/refresh-chapter2.sh
Executable file
@@ -0,0 +1,29 @@
|
||||
#!/usr/bin/env bash
|
||||
set -euo pipefail
|
||||
|
||||
DOCX="sources/docx/archicrat-ia/Chapitre_2–Archeogenese_des_regimes_de_co-viabilite-version_officielle.docx"
|
||||
MANIFEST="sources/manifest.yml"
|
||||
ONLY="archicrat-ia/chapitre-2"
|
||||
|
||||
echo "== Audit source avant fix =="
|
||||
if ! python3 scripts/audit-docx-source.py "$DOCX"; then
|
||||
echo
|
||||
echo "== Fix source =="
|
||||
python3 scripts/fix-docx-source.py --in-place "$DOCX"
|
||||
|
||||
echo
|
||||
echo "== Audit source après fix =="
|
||||
python3 scripts/audit-docx-source.py "$DOCX"
|
||||
fi
|
||||
|
||||
echo
|
||||
echo "== Réimport =="
|
||||
node scripts/import-docx.mjs --manifest "$MANIFEST" --only "$ONLY" --force
|
||||
|
||||
echo
|
||||
echo "== Build =="
|
||||
npm run build
|
||||
|
||||
echo
|
||||
echo "== Tests =="
|
||||
npm test
|
||||
Reference in New Issue
Block a user