Files
archicratie-edition/scripts/fix-docx-source.py
Archicratia f86704d67e
All checks were successful
SMOKE / smoke (push) Successful in 7s
CI / build-and-anchors (push) Successful in 48s
CI / build-and-anchors (pull_request) Successful in 47s
chore(tooling): add docx source audit and repair helpers
2026-03-28 23:34:42 +01:00

132 lines
3.7 KiB
Python
Executable File
Raw Permalink Blame History

This file contains ambiguous Unicode characters
This file contains Unicode characters that might be confused with other characters. If you think that this is intentional, you can safely ignore this warning. Use the Escape button to reveal them.
#!/usr/bin/env python3
from __future__ import annotations
import argparse
import shutil
import tempfile
import unicodedata
import xml.etree.ElementTree as ET
from pathlib import Path
from zipfile import ZIP_DEFLATED, ZipFile
# WordprocessingML main namespace; qn() expands "w:"-prefixed tags into it.
W_NS = "http://schemas.openxmlformats.org/wordprocessingml/2006/main"
# Reserved XML namespace, used below for the xml:space attribute.
XML_NS = "http://www.w3.org/XML/1998/namespace"
# Prefix map handed to ElementTree find()/findall() for "w:"-prefixed paths.
NS = {"w": W_NS}
# Make ElementTree serialize W_NS with the "w" prefix instead of a generated one.
ET.register_namespace("w", W_NS)
# Literal substring fixes (faulty form -> corrected form), applied in the order
# listed here. Matching is exact and case-sensitive.
# NOTE(review): some keys may contain invisible or look-alike Unicode
# characters (e.g. soft hyphens, typographic apostrophes) — confirm against the
# raw file before retyping any entry.
REPLACEMENTS = {
    "coviabilité": "co-viabilité",
    "sacroinstitutionnelle": "sacro-institutionnelle",
    "technologistique": "techno-logistique",
    "scripturonormative": "scripturo-normative",
    "textesrepères": "textes-repères",
    "ellemême": "elle-même",
    "opérateur de darchicration": "opérateur darchicration",
    "systèmes plusieurs statuts": "systèmes à plusieurs statuts",
    "celle-ci se donne à voir": "Celle-ci se donne à voir",
    "Pour autant il serait": "Pour autant, il serait",
    "Telles peuvent être le cas de": "Tels peuvent être les cas de",
}
# Deliberately NOT auto-corrected: "la co-viabilité devient ,"
# that case requires a human editorial decision.
def qn(tag: str) -> str:
    """Expand a ``w:local`` tag into ElementTree's ``{namespace}local`` form.

    Only the ``w`` prefix is supported.

    Raises:
        ValueError: if *tag* is not exactly ``prefix:local`` or the prefix is
            not ``w``.
    """
    # Unpacking fails with ValueError unless there is exactly one colon.
    prefix, local = tag.split(":")
    if prefix == "w":
        return "{" + W_NS + "}" + local
    raise ValueError(tag)
def norm(s: str) -> str:
    """Return *s* in Unicode NFC form; a falsy *s* (``None``, ``""``) gives ``""``."""
    if not s:
        return ""
    return unicodedata.normalize("NFC", s)
def paragraph_text(p: ET.Element) -> str:
    """Concatenate the text of every ``w:t`` descendant of *p*, in document order.

    ``w:t`` elements without text contribute an empty string.
    """
    pieces = []
    for text_node in p.findall(".//w:t", NS):
        pieces.append(text_node.text or "")
    return "".join(pieces)
def replaced_text(s: str) -> str:
    """Return *s* NFC-normalized with every REPLACEMENTS entry applied.

    Entries are applied one after another in the table's order, each to the
    result of the previous one.
    """
    text = norm(s)
    for wrong, right in REPLACEMENTS.items():
        if wrong in text:
            text = text.replace(wrong, right)
    return text
def rewrite_paragraph_text(p: ET.Element, new_text: str) -> None:
    """Replace the content of paragraph *p* with a single run holding *new_text*.

    Every direct child of *p* except its ``w:pPr`` (if any) is removed, so the
    original runs and whatever else sat inside the paragraph are dropped. The
    new run carries no run properties. *p* is modified in place.
    """
    keep = p.find("w:pPr", NS)
    # Iterate over a snapshot: children are removed from p while looping.
    for node in tuple(p):
        if node is not keep:
            p.remove(node)

    run = ET.SubElement(p, qn("w:r"))
    # xml:space="preserve" marks leading/trailing whitespace as significant.
    text_node = ET.SubElement(run, qn("w:t"), {f"{{{XML_NS}}}space": "preserve"})
    text_node.text = new_text
def _register_source_namespaces(xml_path: Path) -> None:
    """Register every namespace prefix declared in *xml_path* with ElementTree."""
    for _event, (prefix, uri) in ET.iterparse(str(xml_path), events=("start-ns",)):
        try:
            ET.register_namespace(prefix, uri)
        except ValueError:
            # ElementTree reserves "ns<digits>" prefixes for its own use; let
            # it pick a generated prefix for that URI instead.
            continue


def process_document_xml(xml_path: Path) -> int:
    """Apply the text replacements to each paragraph of a document.xml file.

    For every ``w:p`` element, the concatenated text is passed through
    replaced_text(); when the result differs, the paragraph is rewritten as a
    single run. Because replaced_text() NFC-normalizes first, a paragraph whose
    text only changes under normalization is rewritten too.

    The file is rewritten in place only when at least one paragraph changed,
    so an already-clean document keeps its original bytes.

    Args:
        xml_path: path to the extracted ``word/document.xml``.

    Returns:
        The number of paragraphs that were rewritten.
    """
    # Keep the document's own prefixes on output; without this ElementTree
    # renames every namespace other than "w" to ns0, ns1, ...
    # NOTE(review): ElementTree only declares namespaces actually used by an
    # element or attribute name; prefixes referenced only inside attribute
    # values may still end up undeclared — verify the output opens in Word.
    _register_source_namespaces(xml_path)

    tree = ET.parse(xml_path)
    root = tree.getroot()
    changed = 0
    for p in root.findall(".//w:p", NS):
        old = paragraph_text(p)
        new = replaced_text(old)
        if new != old:
            rewrite_paragraph_text(p, new)
            changed += 1

    if changed:
        tree.write(xml_path, encoding="utf-8", xml_declaration=True)
    return changed
def repack_docx(tmpdir: Path, out_docx: Path) -> None:
    """Zip the extracted package under *tmpdir* into *out_docx*.

    The archive is first written to ``<out_docx>.tmp`` in the same directory
    and then renamed over *out_docx*, so an existing destination is never left
    half-written. If anything fails, the temporary file is removed and the
    exception is re-raised.

    Args:
        tmpdir: root directory of the extracted DOCX contents.
        out_docx: destination file; replaced if it already exists.
    """
    tmp_out = out_docx.with_suffix(out_docx.suffix + ".tmp")
    try:
        with ZipFile(tmp_out, "w", ZIP_DEFLATED) as zf:
            # Sorted so the member order is deterministic across runs.
            for p in sorted(tmpdir.rglob("*")):
                if p.is_file():
                    zf.write(p, p.relative_to(tmpdir).as_posix())
        # Same directory as the destination, so this is a plain rename.
        tmp_out.replace(out_docx)
    except BaseException:
        # Do not leave a partial archive next to the user's document.
        try:
            tmp_out.unlink()
        except FileNotFoundError:
            pass
        raise
def main() -> int:
    """Command-line entry point: repair a DOCX and write the result.

    Reads the DOCX path (and the optional ``--in-place`` flag) from the command
    line, extracts the archive to a temporary directory, runs the paragraph
    replacements on ``word/document.xml`` and repacks the archive. Without
    ``--in-place`` the result is written next to the source as
    ``<stem>.fixed.docx``.

    Returns:
        0 on success; 2 when the input is missing, is not a valid ZIP archive,
        or has no ``word/document.xml``.
    """
    # Imported here so main() also works when this module is imported and
    # called, not only when the file is run as a script.
    import sys
    from zipfile import BadZipFile

    parser = argparse.ArgumentParser(description="Répare mécaniquement certaines scories DOCX.")
    parser.add_argument("docx", help="Chemin du DOCX")
    parser.add_argument("--in-place", action="store_true", help="Réécrit le DOCX en place")
    args = parser.parse_args()

    src = Path(args.docx)
    # is_file() also rejects directories, which cannot be opened as archives.
    if not src.is_file():
        print(f"ECHEC: fichier introuvable: {src}", file=sys.stderr)
        return 2

    out = src if args.in_place else src.with_name(src.stem + ".fixed.docx")

    with tempfile.TemporaryDirectory(prefix="docx-fix-") as td:
        td_path = Path(td)
        try:
            with ZipFile(src) as zf:
                zf.extractall(td_path)
        except BadZipFile:
            print(f"ECHEC: archive DOCX invalide: {src}", file=sys.stderr)
            return 2

        document_xml = td_path / "word" / "document.xml"
        if not document_xml.exists():
            print("ECHEC: word/document.xml absent.", file=sys.stderr)
            return 2

        changed = process_document_xml(document_xml)
        repack_docx(td_path, out)

    print(f"OK: DOCX réparé par réécriture paragraphe/XML. Paragraphes modifiés: {changed}")
    return 0
if __name__ == "__main__":
    # Script entry point: main()'s return value becomes the exit status.
    # The import binds `sys` at module scope for the script run.
    import sys

    sys.exit(main())