#!/usr/bin/env python3
from __future__ import annotations

import argparse
import shutil
import sys
import tempfile
import unicodedata
import xml.etree.ElementTree as ET
from pathlib import Path
from zipfile import ZIP_DEFLATED, ZipFile
# WordprocessingML main namespace (the "w:" prefix used in word/document.xml).
W_NS = "http://schemas.openxmlformats.org/wordprocessingml/2006/main"
# Namespace bound to the reserved "xml:" prefix (needed for xml:space).
XML_NS = "http://www.w3.org/XML/1998/namespace"
# Prefix map handed to ElementTree find()/findall() path queries.
NS = {"w": W_NS}

# Make ElementTree serialize W_NS elements with the "w" prefix rather than
# an auto-generated one.
ET.register_namespace("w", W_NS)
# Literal "wrong text" -> "corrected text" substitutions, applied in
# insertion order to the NFC-normalized text of each paragraph.
REPLACEMENTS = {
    "coviabilité": "co-viabilité",
    "sacroinstitutionnelle": "sacro-institutionnelle",
    "technologistique": "techno-logistique",
    "scripturonormative": "scripturo-normative",
    "textesrepères": "textes-repères",
    "ellemême": "elle-même",
    "opérateur de d’archicration": "opérateur d’archicration",
    "systèmes plusieurs statuts": "systèmes à plusieurs statuts",
    "celle-ci se donne à voir": "Celle-ci se donne à voir",
    "Pour autant il serait": "Pour autant, il serait",
    "Telles peuvent être le cas de": "Tels peuvent être les cas de",
}

# Deliberately NOT auto-corrected: "la co-viabilité devient ,"
# that case requires a human editorial decision.
def qn(tag: str) -> str:
    """Expand a ``w:``-prefixed tag into ElementTree's ``{namespace}local`` form.

    Args:
        tag: Qualified name written as ``"w:local"``.

    Returns:
        The tag in Clark notation, bound to ``W_NS``.

    Raises:
        ValueError: If ``tag`` does not contain exactly one ``":"`` or if its
            prefix is anything other than ``"w"``.
    """
    # Tuple unpacking itself raises ValueError unless there is exactly one ":".
    prefix, local = tag.split(":")
    if prefix != "w":
        raise ValueError(tag)
    return f"{{{W_NS}}}{local}"
def norm(s: str) -> str:
    """Return *s* in Unicode NFC form.

    A falsy value (``None`` or ``""``) yields the empty string, so callers
    can pass missing text straight through.
    """
    return unicodedata.normalize("NFC", s or "")
def paragraph_text(p: ET.Element) -> str:
    """Return the concatenated text of every ``w:t`` descendant of *p*.

    Text nodes are joined in document order with no separator; a ``w:t``
    element without text contributes the empty string.
    """
    return "".join(t.text or "" for t in p.findall(".//w:t", NS))
def replaced_text(s: str) -> str:
    """Return *s* NFC-normalized with every ``REPLACEMENTS`` entry applied.

    Substitutions are plain substring replacements performed one after the
    other in the dictionary's insertion order.
    """
    out = norm(s)
    for bad, good in REPLACEMENTS.items():
        out = out.replace(bad, good)
    return out
def rewrite_paragraph_text(p: ET.Element, new_text: str) -> None:
    """Replace the whole content of paragraph *p* with a single text run.

    Every direct child of *p* is removed except its ``w:pPr`` element (if
    present); one ``w:r`` containing one ``w:t`` with *new_text* is then
    appended. The paragraph is modified in place.

    Args:
        p: The ``w:p`` element to rewrite.
        new_text: Text placed in the new run.
    """
    ppr = p.find("w:pPr", NS)

    # Iterate over a copy because children are removed while looping.
    for child in list(p):
        if child is ppr:
            # Keep the paragraph properties; everything else is dropped.
            continue
        p.remove(child)

    r = ET.Element(qn("w:r"))
    t = ET.SubElement(r, qn("w:t"))
    # xml:space="preserve" keeps leading/trailing whitespace in the text.
    t.set(f"{{{XML_NS}}}space", "preserve")
    t.text = new_text
    p.append(r)
def process_document_xml(xml_path: Path) -> int:
    """Apply the text corrections to every paragraph of a document XML file.

    The file is parsed, each ``w:p`` element whose corrected text differs
    from its current text is rewritten as a single run, and the tree is
    written back to *xml_path* as UTF-8 with an XML declaration.

    Because the corrected text is NFC-normalized, a paragraph whose only
    difference is its Unicode normalization form is rewritten and counted.

    NOTE(review): only the "w" prefix is registered with ElementTree, so
    other namespaces in the document are presumably written back with
    generated prefixes (ns0, ns1, ...) — confirm Word opens the result.

    Args:
        xml_path: Path of the XML file to update in place.

    Returns:
        The number of paragraphs that were rewritten.
    """
    tree = ET.parse(xml_path)
    root = tree.getroot()

    changed = 0
    for p in root.findall(".//w:p", NS):
        old = paragraph_text(p)
        new = replaced_text(old)
        if new != old:
            rewrite_paragraph_text(p, new)
            changed += 1

    tree.write(xml_path, encoding="utf-8", xml_declaration=True)
    return changed
def repack_docx(tmpdir: Path, out_docx: Path) -> None:
    """Zip the extracted tree under *tmpdir* into the archive *out_docx*.

    The archive is first written next to the target with an extra ``.tmp``
    suffix and then moved over *out_docx*, so the target is only replaced
    once the new archive has been written completely.

    Args:
        tmpdir: Root directory of the unpacked DOCX content.
        out_docx: Destination path of the DOCX file.
    """
    tmp_out = out_docx.with_suffix(out_docx.suffix + ".tmp")
    with ZipFile(tmp_out, "w", ZIP_DEFLATED) as zf:
        # Sorted traversal gives a deterministic member order.
        for p in sorted(tmpdir.rglob("*")):
            if p.is_file():
                # Store members under their path relative to the tree root.
                zf.write(p, p.relative_to(tmpdir))
    shutil.move(tmp_out, out_docx)
def main() -> int:
    """Command-line entry point: repair a DOCX file and report the result.

    The DOCX is unpacked into a temporary directory, ``word/document.xml``
    is corrected, and the tree is repacked either over the source file
    (``--in-place``) or next to it as ``<stem>.fixed.docx``.

    Returns:
        ``0`` on success, ``2`` when the source file is missing (or is not a
        regular file) or when the archive has no ``word/document.xml``.
    """
    parser = argparse.ArgumentParser(description="Répare mécaniquement certaines scories DOCX.")
    parser.add_argument("docx", help="Chemin du DOCX")
    parser.add_argument("--in-place", action="store_true", help="Réécrit le DOCX en place")
    args = parser.parse_args()

    src = Path(args.docx)
    # is_file() also rejects directories, which ZipFile could not open anyway.
    if not src.is_file():
        print(f"ECHEC: fichier introuvable: {src}", file=sys.stderr)
        return 2

    out = src if args.in_place else src.with_name(src.stem + ".fixed.docx")

    with tempfile.TemporaryDirectory(prefix="docx-fix-") as td:
        td_path = Path(td)
        with ZipFile(src) as zf:
            zf.extractall(td_path)

        document_xml = td_path / "word" / "document.xml"
        if not document_xml.exists():
            print("ECHEC: word/document.xml absent.", file=sys.stderr)
            return 2

        changed = process_document_xml(document_xml)
        # Repack while the temporary directory still exists.
        repack_docx(td_path, out)

    print(f"OK: DOCX réparé par réécriture paragraphe/XML. Paragraphes modifiés: {changed}")
    return 0
# Run the CLI and use main()'s return value as the process exit status.
if __name__ == "__main__":
    raise SystemExit(main())