chore(tooling): add docx source audit and repair helpers
This commit is contained in:
132
scripts/fix-docx-source.py
Executable file
132
scripts/fix-docx-source.py
Executable file
@@ -0,0 +1,132 @@
|
||||
#!/usr/bin/env python3
|
||||
from __future__ import annotations

import argparse
import shutil
import sys
import tempfile
import unicodedata
import xml.etree.ElementTree as ET
from pathlib import Path
from zipfile import ZIP_DEFLATED, BadZipFile, ZipFile
|
||||
|
||||
# WordprocessingML main namespace; bound to the "w" prefix in this script.
W_NS = "http://schemas.openxmlformats.org/wordprocessingml/2006/main"
# Namespace behind the reserved "xml:" prefix (needed for xml:space).
XML_NS = "http://www.w3.org/XML/1998/namespace"
# Prefix map handed to ElementTree find()/findall() path queries.
NS = dict(w=W_NS)

# Make ElementTree serialize WordprocessingML names with the "w:" prefix
# instead of an auto-generated one.
ET.register_namespace("w", W_NS)
|
||||
|
||||
|
||||
# Literal "bad -> good" substitutions applied to paragraph text.
# Insertion order is the order in which the substitutions are applied.
REPLACEMENTS = dict(
    [
        # Compound words that are missing their hyphen.
        ("coviabilité", "co-viabilité"),
        ("sacroinstitutionnelle", "sacro-institutionnelle"),
        ("technologistique", "techno-logistique"),
        ("scripturonormative", "scripturo-normative"),
        ("textesrepères", "textes-repères"),
        ("ellemême", "elle-même"),
        # Phrase-level corrections.
        ("opérateur de d’archicration", "opérateur d’archicration"),
        ("systèmes plusieurs statuts", "systèmes à plusieurs statuts"),
        ("celle-ci se donne à voir", "Celle-ci se donne à voir"),
        ("Pour autant il serait", "Pour autant, il serait"),
        ("Telles peuvent être le cas de", "Tels peuvent être les cas de"),
    ]
)

# Deliberately NOT auto-corrected: "la co-viabilité devient ,"
# This case requires a human editorial decision.
|
||||
|
||||
|
||||
def qn(tag: str) -> str:
    """Expand a ``w:local`` name into ElementTree's ``{uri}local`` form.

    Only the ``w`` prefix is accepted.

    Raises:
        ValueError: if the prefix is not ``w``, or if *tag* does not
            contain exactly one colon.
    """
    prefix, local = tag.split(":")
    if prefix == "w":
        return "{" + W_NS + "}" + local
    raise ValueError(tag)
|
||||
|
||||
|
||||
def norm(s: str) -> str:
    """Return *s* in Unicode NFC form; a falsy value (``None``, ``""``) gives ``""``."""
    text = s if s else ""
    return unicodedata.normalize("NFC", text)
|
||||
|
||||
|
||||
def paragraph_text(p: ET.Element) -> str:
    """Concatenate the text of every ``w:t`` descendant of *p*.

    Elements are visited in document order; a ``w:t`` without text
    contributes an empty string.
    """
    return "".join(node.text or "" for node in p.iterfind(".//w:t", NS))
|
||||
|
||||
|
||||
def replaced_text(s: str) -> str:
    """Return *s* NFC-normalized with every ``REPLACEMENTS`` entry applied.

    Substitutions are plain substring replacements, applied one after the
    other in the table's insertion order.
    """
    out = norm(s)
    for bad, good in REPLACEMENTS.items():
        # The text was just normalized to NFC, so compare against NFC keys as
        # well: a table entry stored in decomposed form would otherwise never
        # match. Normalizing the replacement keeps the result in NFC.
        out = out.replace(norm(bad), norm(good))
    return out
|
||||
|
||||
|
||||
def rewrite_paragraph_text(p: ET.Element, new_text: str) -> None:
    """Replace the whole content of paragraph *p* with a single text run.

    Every direct child of *p* is removed except its ``w:pPr`` element (when
    present), so existing runs and whatever they carried are discarded. One
    new ``w:r`` / ``w:t`` pair holding *new_text* is then appended.
    """
    kept_props = p.find("w:pPr", NS)

    # Iterate over a snapshot: removing from `p` while iterating it directly
    # would skip children.
    for node in [child for child in p if child is not kept_props]:
        p.remove(node)

    run = ET.SubElement(p, qn("w:r"))
    # xml:space="preserve" keeps leading/trailing blanks of the text.
    text_el = ET.SubElement(run, qn("w:t"), {f"{{{XML_NS}}}space": "preserve"})
    text_el.text = new_text
|
||||
|
||||
|
||||
def _declared_prefixes(xml_path: Path) -> dict[str, str]:
    """Return the ``prefix -> URI`` namespace declarations found in *xml_path*."""
    declared: dict[str, str] = {}
    for _event, (prefix, uri) in ET.iterparse(xml_path, events=("start-ns",)):
        # Keep the first binding if a prefix is declared more than once.
        declared.setdefault(prefix, uri)
    return declared


def _used_namespace_uris(root: ET.Element) -> set[str]:
    """Return the namespace URIs appearing in element or attribute names under *root*."""
    used: set[str] = set()
    for element in root.iter():
        for name in (element.tag, *element.attrib):
            # Tags of comments / processing instructions are not strings.
            if isinstance(name, str) and name.startswith("{"):
                used.add(name[1:].partition("}")[0])
    return used


def _prune_ignorable(root: ET.Element, declared: dict[str, str]) -> None:
    """Drop ``mc:Ignorable`` prefixes that will no longer be declared on write.

    ElementTree only emits ``xmlns`` declarations for namespaces that occur
    in an element or attribute name, while ``mc:Ignorable`` refers to
    prefixes by name inside an attribute *value*. A prefix whose namespace
    is not used anywhere would be left dangling, so it is removed from the
    list (there is nothing from that namespace left to ignore).
    """
    ignorable_attr = "{http://schemas.openxmlformats.org/markup-compatibility/2006}Ignorable"
    used = _used_namespace_uris(root)
    for element in root.iter():
        listed = element.get(ignorable_attr)
        if listed is None:
            continue
        kept = [prefix for prefix in listed.split() if declared.get(prefix) in used]
        if kept:
            element.set(ignorable_attr, " ".join(kept))
        else:
            del element.attrib[ignorable_attr]


def process_document_xml(xml_path: Path) -> int:
    """Apply the text replacements to every paragraph of a ``document.xml``.

    Each ``w:p`` element whose text differs from its repaired version is
    rewritten as a single run. The file is rewritten in place (UTF-8, with
    an XML declaration) only when at least one paragraph changed.

    The namespace prefixes declared in the document are registered with
    ElementTree (a process-wide registry) so they are written back under
    their original names rather than as generated ``ns0``, ``ns1``, ...

    Args:
        xml_path: Path of the extracted ``word/document.xml``.

    Returns:
        The number of paragraphs that were rewritten.
    """
    declared = _declared_prefixes(xml_path)
    for prefix, uri in declared.items():
        if not prefix:
            # Default namespace: leave it to ElementTree's own handling.
            continue
        try:
            ET.register_namespace(prefix, uri)
        except ValueError:
            # Prefixes shaped like "ns<digits>" are reserved by ElementTree.
            continue

    tree = ET.parse(xml_path)
    root = tree.getroot()

    changed = 0
    for p in root.findall(".//w:p", NS):
        old = paragraph_text(p)
        new = replaced_text(old)
        if new != old:
            rewrite_paragraph_text(p, new)
            changed += 1

    if changed:
        _prune_ignorable(root, declared)
        tree.write(xml_path, encoding="utf-8", xml_declaration=True)
    return changed
|
||||
|
||||
|
||||
def repack_docx(tmpdir: Path, out_docx: Path) -> None:
    """Zip the extracted tree under *tmpdir* into *out_docx*.

    The archive is first written next to the destination as
    ``<out_docx>.tmp`` and then moved over *out_docx*, so the destination
    is only replaced once the archive has been written completely. If
    anything fails, the staging file is removed before the exception
    propagates.

    Args:
        tmpdir: Root directory of the extracted DOCX content.
        out_docx: Destination path of the rebuilt DOCX.
    """
    tmp_out = out_docx.with_suffix(out_docx.suffix + ".tmp")
    try:
        with ZipFile(tmp_out, "w", ZIP_DEFLATED) as zf:
            # Sorted traversal gives a deterministic entry order; only files
            # are stored, directory entries are implied by member names.
            for p in sorted(tmpdir.rglob("*")):
                if p.is_file():
                    zf.write(p, p.relative_to(tmpdir))
        shutil.move(tmp_out, out_docx)
    finally:
        # After a successful move the staging file no longer exists; if it
        # is still there, it is a partial archive left by a failure.
        if tmp_out.exists():
            tmp_out.unlink()
|
||||
|
||||
|
||||
def main() -> int:
    """Command-line entry point: repair the text of one DOCX file.

    The archive is extracted to a temporary directory, ``word/document.xml``
    is processed, and the tree is zipped again either over the input
    (``--in-place``) or next to it as ``<stem>.fixed.docx``.

    Returns:
        0 on success; 2 when the input is not an existing file, is not a
        readable zip archive, or does not contain ``word/document.xml``.
    """
    parser = argparse.ArgumentParser(description="Répare mécaniquement certaines scories DOCX.")
    parser.add_argument("docx", help="Chemin du DOCX")
    parser.add_argument("--in-place", action="store_true", help="Réécrit le DOCX en place")
    args = parser.parse_args()

    src = Path(args.docx)
    # is_file() also rejects directories, which cannot be opened as a zip.
    if not src.is_file():
        print(f"ECHEC: fichier introuvable: {src}", file=sys.stderr)
        return 2

    out = src if args.in_place else src.with_name(src.stem + ".fixed.docx")

    with tempfile.TemporaryDirectory(prefix="docx-fix-") as td:
        td_path = Path(td)
        try:
            with ZipFile(src) as zf:
                zf.extractall(td_path)
        except BadZipFile:
            # Report a corrupt / non-zip input like the other failures
            # instead of letting the traceback escape.
            print(f"ECHEC: archive DOCX illisible: {src}", file=sys.stderr)
            return 2

        document_xml = td_path / "word" / "document.xml"
        if not document_xml.exists():
            print("ECHEC: word/document.xml absent.", file=sys.stderr)
            return 2

        changed = process_document_xml(document_xml)
        repack_docx(td_path, out)

    print(f"OK: DOCX réparé par réécriture paragraphe/XML. Paragraphes modifiés: {changed}")
    return 0
|
||||
|
||||
|
||||
# Script entry point: run main() and use its return value as the exit status.
if __name__ == "__main__":
    import sys

    sys.exit(main())
|
||||
Reference in New Issue
Block a user