Files
archicratie-edition/scripts/convert_mdx_to_docx.py
Archicratia 5b427d5602
All checks were successful
SMOKE / smoke (push) Successful in 4s
CI / build-and-anchors (push) Successful in 37s
CI / build-and-anchors (pull_request) Successful in 34s
Synchronise les contenus glossaire et ajoute les scripts de conversion DOCX/MDX
2026-04-23 12:04:31 +02:00

305 lines
9.0 KiB
Python

#!/usr/bin/env python3
import argparse
import re
import shutil
import subprocess
import sys
import tempfile
from pathlib import Path
import zipfile
try:
import yaml
except ImportError:
print("Erreur : PyYAML n'est pas installé. Lance : pip3 install pyyaml")
sys.exit(1)
try:
from docx import Document
except ImportError:
print("Erreur : python-docx n'est pas installé. Lance : pip3 install python-docx")
sys.exit(1)
def split_frontmatter(text: str):
    """Split a document into its YAML frontmatter mapping and its body.

    The frontmatter is the block delimited by a ``---`` line at the very
    start of the text and the next line consisting only of ``---``.

    Args:
        text: Full content of the MDX/Markdown file.

    Returns:
        A ``(metadata, body)`` tuple. ``metadata`` is always a ``dict``; it is
        empty when there is no frontmatter, when the frontmatter is blank,
        unparsable, or is not a YAML mapping. ``body`` is the text following
        the closing fence, or the untouched input when no frontmatter is
        found.
    """
    # The closing fence must sit alone on its own line (MULTILINE ``^``) and
    # may be the last line of the file without a trailing newline. Trailing
    # blanks after either fence are tolerated.
    match = re.match(
        r"\A---[ \t]*\n(.*?)^---[ \t]*(?:\n|\Z)(.*)\Z",
        text,
        flags=re.DOTALL | re.MULTILINE,
    )
    if not match:
        return {}, text

    yaml_block, body = match.groups()

    # A blank frontmatter carries no metadata: nothing to parse.
    if not yaml_block.strip():
        return {}, body

    try:
        metadata = yaml.safe_load(yaml_block)
    except Exception as e:
        # Deliberately broad: besides yaml.YAMLError, value constructors can
        # raise (e.g. ValueError on an impossible date). The conversion is
        # best-effort, so warn and continue without metadata.
        print(f"Avertissement : frontmatter YAML illisible : {e}")
        return {}, body

    if not metadata:
        return {}, body

    # Callers use ``metadata.get(...)``; a list or scalar document would make
    # them crash, so only mappings are accepted.
    if not isinstance(metadata, dict):
        print("Avertissement : frontmatter YAML ignoré (ce n'est pas un dictionnaire)")
        return {}, body

    return metadata, body
def strip_mdx_artifacts(text: str):
    """Remove MDX-only constructs so the result is plain Markdown.

    The cleanup passes run in order:
      1. lines starting with ``import`` / ``export``;
      2. self-closing components (``<Component />``);
      3. block components (``<Component ...>...</Component>``);
      4. lines containing only an empty pair of braces;
      5. runs of three or more newlines, collapsed to a single blank line.

    Only tags whose name starts with an uppercase letter are treated as
    components; lowercase HTML tags are left alone.

    Returns:
        The cleaned text, stripped and terminated by exactly one newline.
    """
    # (pattern, replacement, regex flags), applied sequentially.
    passes = (
        (r"^\s*(import|export)\s+.+?$", "", re.MULTILINE),
        (r"<[A-Z][A-Za-z0-9._-]*\b[^>]*\/>", "", 0),
        (r"<([A-Z][A-Za-z0-9._-]*)\b[^>]*>.*?</\1>", "", re.DOTALL),
        (r"^\s*{\s*}\s*$", "", re.MULTILINE),
        (r"\n{3,}", "\n\n", 0),
    )

    cleaned = text
    for pattern, replacement, flags in passes:
        cleaned = re.sub(pattern, replacement, cleaned, flags=flags)

    return f"{cleaned.strip()}\n"
def inject_h1_from_title(metadata: dict, body: str):
    """Prepend a level-1 heading built from the frontmatter ``title``.

    Args:
        metadata: Frontmatter mapping; only the ``"title"`` key is read.
        body: Markdown body.

    Returns:
        ``body`` unchanged when there is no usable title or when the body
        already starts with a ``# `` heading; otherwise the body (leading
        whitespace removed) preceded by ``# <title>`` and a blank line.
    """
    try:
        title = metadata.get("title", "")
    except AttributeError:
        # Not a mapping (None, list, scalar frontmatter): no title available.
        return body
    if not title:
        return body

    # YAML block scalars may carry newlines or padding; a Markdown ATX heading
    # must fit on one line, so collapse every whitespace run to one space.
    heading = " ".join(str(title).split())
    if not heading:
        return body

    if re.match(r"^\s*#\s+", body):
        return body

    return f"# {heading}\n\n{body.lstrip()}"
def find_style_by_candidates(doc, candidates):
    """Return the first style of ``doc`` matching one of ``candidates``.

    Two passes are made over ``doc.styles``:
      1. exact match of the style's visible ``name`` against a candidate;
      2. fallback on the style's ``style_id``: a style whose id is
         ``BodyText`` or ``Heading1``..``Heading4`` matches when a candidate
         is one of the English/French display names tied to that id.

    Returns:
        The matching style object, or ``None`` when nothing matches.
    """
    # Pass 1: visible name.
    for entry in doc.styles:
        if any(entry.name == wanted for wanted in candidates):
            return entry

    # Pass 2: internal style id -> display names it stands for.
    aliases_by_id = {
        "BodyText": {"Body Text", "Corps de texte"},
        "Heading1": {"Heading 1", "Titre 1"},
        "Heading2": {"Heading 2", "Titre 2"},
        "Heading3": {"Heading 3", "Titre 3"},
        "Heading4": {"Heading 4", "Titre 4"},
    }
    for entry in doc.styles:
        aliases = aliases_by_id.get(getattr(entry, "style_id", ""))
        if aliases is None:
            continue
        if any(wanted in aliases for wanted in candidates):
            return entry

    return None
def strip_leading_paragraph_numbers(text: str):
    """Drop paragraph numbers found at the start of lines.

    A prefix such as ``2. ``, ``11. `` or ``101. `` (optional leading
    whitespace, digits, a dot, then whitespace) is removed together with the
    whitespace before it. Lines whose first non-blank character is ``#``
    (Markdown headings) are kept verbatim.

    Returns:
        The processed lines joined with ``\\n`` and followed by a final
        newline.
    """
    number_prefix = re.compile(r"^\s*\d+\.\s+")

    def clean(row):
        # Markdown headings are never altered.
        if row.lstrip().startswith("#"):
            return row
        return number_prefix.sub("", row)

    return "\n".join(map(clean, text.splitlines())) + "\n"
def normalize_non_heading_paragraphs(docx_path: Path):
    """Restyle every non-heading paragraph of a DOCX as body text.

    The document is opened, each paragraph returned by ``doc.paragraphs``
    that has non-blank text and is not styled Heading 1-4 (English or French
    name, or matching style id) is assigned the "Body Text / Corps de texte"
    style, and the file is saved in place. If that style cannot be found, a
    warning is printed and the file is left untouched.
    """
    document = Document(str(docx_path))

    target_style = find_style_by_candidates(document, ["Body Text", "Corps de texte"])
    if target_style is None:
        print(f"Avertissement : style 'Body Text / Corps de texte' introuvable dans {docx_path.name}")
        return

    protected_names = {
        "Heading 1", "Heading 2", "Heading 3", "Heading 4",
        "Titre 1", "Titre 2", "Titre 3", "Titre 4",
    }
    protected_ids = {"Heading1", "Heading2", "Heading3", "Heading4"}

    def is_heading(style):
        # A paragraph without a style object is never a heading.
        if not style:
            return False
        return (
            style.name in protected_names
            or getattr(style, "style_id", "") in protected_ids
        )

    restyled = 0
    for paragraph in document.paragraphs:
        # Paragraphs with no visible text are skipped.
        if not paragraph.text.strip():
            continue
        if is_heading(paragraph.style):
            continue
        # Everything else becomes body text.
        paragraph.style = target_style
        restyled += 1

    document.save(str(docx_path))
    print(f" ↳ normalisation styles : {restyled} paragraphe(s) mis en 'Body Text / Corps de texte'")
def remove_word_bookmarks(docx_path: Path):
    """Strip Word bookmarks from a DOCX file, in place.

    Bookmarks are what LibreOffice/Word display as grey brackets when
    bookmark display is enabled. Every ``<w:bookmarkStart .../>`` and
    ``<w:bookmarkEnd .../>`` element is removed from the main document,
    footnotes, endnotes and comments parts; all other archive members are
    copied unchanged.

    The archive is rewritten member by member into a sibling
    ``*.cleaned.docx`` file which then replaces the original. This keeps the
    original member order and metadata, avoids extracting the package to
    disk, and guarantees the temporary file is removed if anything fails.

    Args:
        docx_path: Path of the DOCX file to clean.

    Raises:
        zipfile.BadZipFile: If ``docx_path`` is not a valid zip archive.
        UnicodeDecodeError: If a targeted XML part is not valid UTF-8.
    """
    docx_path = Path(docx_path)
    bookmark_tag = re.compile(r"<w:bookmark(?:Start|End)\b[^>]*/>")
    xml_targets = {
        "word/document.xml",
        "word/footnotes.xml",
        "word/endnotes.xml",
        "word/comments.xml",
    }

    tmp_output = docx_path.with_suffix(".cleaned.docx")
    removed = 0
    try:
        with zipfile.ZipFile(docx_path, "r") as zin, zipfile.ZipFile(
            tmp_output, "w", zipfile.ZIP_DEFLATED
        ) as zout:
            for info in zin.infolist():
                data = zin.read(info)
                if info.filename in xml_targets:
                    text, count = bookmark_tag.subn("", data.decode("utf-8"))
                    removed += count
                    data = text.encode("utf-8")
                # Reusing the ZipInfo keeps the member name, order and
                # timestamp; every member is deflated in the output.
                zout.writestr(info, data, compress_type=zipfile.ZIP_DEFLATED)
        tmp_output.replace(docx_path)
    finally:
        # No-op after a successful replace; cleans up after a failure.
        tmp_output.unlink(missing_ok=True)

    print(f" ↳ suppression signets : {removed} balise(s) supprimée(s)")
def convert_one_file(input_path: Path, output_path: Path, reference_doc: Path | None):
    """Convert a single MDX file to DOCX.

    Steps: read the file, split off the frontmatter, strip MDX artifacts and
    leading paragraph numbers, inject an H1 from the frontmatter title, run
    pandoc, then post-process the DOCX (body-text normalisation and bookmark
    removal).

    Args:
        input_path: Source ``.mdx`` file (UTF-8, with or without BOM).
        output_path: Destination ``.docx`` file.
        reference_doc: Optional DOCX template passed to pandoc as
            ``--reference-doc``.

    Raises:
        subprocess.CalledProcessError: If pandoc exits with a non-zero status.
    """
    # "utf-8-sig" transparently drops a leading BOM, which would otherwise
    # hide the opening "---" fence from the frontmatter detection.
    raw = input_path.read_text(encoding="utf-8-sig")

    metadata, body = split_frontmatter(raw)
    body = strip_mdx_artifacts(body)
    body = strip_leading_paragraph_numbers(body)
    body = inject_h1_from_title(metadata, body)

    cmd = ["pandoc", "-f", "markdown", "-o", str(output_path)]
    if reference_doc:
        cmd.extend(["--reference-doc", str(reference_doc)])

    # With no input file argument pandoc reads from stdin, so the prepared
    # Markdown is piped in directly: no temporary file to create or clean up.
    subprocess.run(cmd, input=body, encoding="utf-8", check=True)

    normalize_non_heading_paragraphs(output_path)
    remove_word_bookmarks(output_path)
def main():
    """Command-line entry point: convert every ``.mdx`` file of a directory.

    Parses the arguments, checks the prerequisites (pandoc on PATH, existing
    input directory, existing reference document when given), then converts
    each ``*.mdx`` file found directly in the input directory into a DOCX of
    the same base name inside the output directory.

    Error messages go to stderr. A file that fails to convert is reported and
    the remaining files are still processed; the process exits with status 1
    if a prerequisite is missing, if no ``.mdx`` file is found, or if at
    least one conversion failed.
    """
    parser = argparse.ArgumentParser(
        description="Convertit des fichiers MDX en DOCX en conservant H1/H2/H3/H4 et en forçant le corps en Body Text."
    )
    parser.add_argument("input_dir", help="Dossier contenant les .mdx")
    parser.add_argument(
        "--output-dir",
        default=str(Path.home() / "Desktop" / "archicrat-ia-docx"),
        help="Dossier de sortie DOCX",
    )
    parser.add_argument(
        "--reference-doc",
        default=None,
        help="DOCX modèle Word à utiliser comme reference-doc",
    )
    args = parser.parse_args()

    input_dir = Path(args.input_dir)
    output_dir = Path(args.output_dir)
    reference_doc = Path(args.reference_doc) if args.reference_doc else None

    if not shutil.which("pandoc"):
        print("Erreur : pandoc n'est pas installé. Installe-le avec : brew install pandoc", file=sys.stderr)
        sys.exit(1)

    # is_dir() is already False for a path that does not exist.
    if not input_dir.is_dir():
        print(f"Erreur : dossier introuvable : {input_dir}", file=sys.stderr)
        sys.exit(1)

    if reference_doc and not reference_doc.exists():
        print(f"Erreur : reference-doc introuvable : {reference_doc}", file=sys.stderr)
        sys.exit(1)

    output_dir.mkdir(parents=True, exist_ok=True)

    mdx_files = sorted(input_dir.glob("*.mdx"))
    if not mdx_files:
        print(f"Aucun fichier .mdx trouvé dans : {input_dir}", file=sys.stderr)
        sys.exit(1)

    print(f"Conversion de {len(mdx_files)} fichier(s)...")
    print(f"Entrée : {input_dir}")
    print(f"Sortie : {output_dir}")
    if reference_doc:
        print(f"Modèle : {reference_doc}")
    print()

    failures = []
    for mdx_file in mdx_files:
        docx_name = mdx_file.with_suffix(".docx").name
        out_file = output_dir / docx_name
        print(f"{mdx_file.name} -> {docx_name}")
        try:
            convert_one_file(mdx_file, out_file, reference_doc)
        except (
            subprocess.CalledProcessError,
            OSError,
            UnicodeError,
            zipfile.BadZipFile,
        ) as exc:
            # One bad file must not prevent the others from being converted.
            failures.append(mdx_file.name)
            print(f" ↳ échec : {exc}", file=sys.stderr)

    print()
    if failures:
        print(
            f"❌ {len(failures)} fichier(s) en échec : {', '.join(failures)}",
            file=sys.stderr,
        )
        sys.exit(1)
    print("✅ Conversion terminée.")
# Run the command-line interface only when this file is executed as a script,
# not when it is imported as a module.
if __name__ == "__main__":
    main()