Synchronise les contenus glossaire et ajoute les scripts de conversion DOCX/MDX
This commit is contained in:
304
scripts/convert_mdx_to_docx.py
Normal file
304
scripts/convert_mdx_to_docx.py
Normal file
@@ -0,0 +1,304 @@
|
||||
#!/usr/bin/env python3
"""Convert MDX files to DOCX via pandoc, then normalise Word styles and strip bookmarks."""
import argparse
import re
import shutil
import subprocess
import sys
import tempfile
from pathlib import Path
import zipfile

# PyYAML is required to parse the MDX frontmatter; exit with an install hint if missing.
try:
    import yaml
except ImportError:
    print("Erreur : PyYAML n'est pas installé. Lance : pip3 install pyyaml")
    sys.exit(1)

# python-docx is required for the post-pandoc style-normalisation pass.
try:
    from docx import Document
except ImportError:
    print("Erreur : python-docx n'est pas installé. Lance : pip3 install python-docx")
    sys.exit(1)
def split_frontmatter(text: str):
    """Split a ``---``-fenced YAML frontmatter block from *text*.

    Returns a ``(metadata, body)`` tuple. When no well-formed frontmatter
    is present, metadata is ``{}`` and the body is the input unchanged.
    Unparseable YAML is reported on stdout and treated as empty.
    """
    if not text.startswith("---\n"):
        return {}, text

    parsed = re.match(r"^---\n(.*?)\n---\n(.*)$", text, flags=re.DOTALL)
    if parsed is None:
        return {}, text

    raw_yaml, remainder = parsed.group(1), parsed.group(2)

    metadata = {}
    try:
        metadata = yaml.safe_load(raw_yaml) or {}
    except Exception as e:
        # Broad catch on purpose: any YAML error downgrades to empty metadata.
        print(f"Avertissement : frontmatter YAML illisible : {e}")

    return metadata, remainder
def strip_mdx_artifacts(text: str):
    """Remove MDX-specific syntax, leaving plain Markdown.

    Drops import/export statements, self-closing JSX components,
    paired JSX component blocks, and leftover empty-brace lines, then
    collapses runs of blank lines. Output ends with a single newline.
    """
    # Each rule is (pattern, flags); all matches are deleted, in order.
    deletion_rules = (
        (r"^\s*(import|export)\s+.+?$", re.MULTILINE),   # MDX imports / exports
        (r"<[A-Z][A-Za-z0-9._-]*\b[^>]*\/>", 0),          # self-closing: <Component />
        (r"<([A-Z][A-Za-z0-9._-]*)\b[^>]*>.*?</\1>", re.DOTALL),  # <Component>...</Component>
        (r"^\s*{\s*}\s*$", re.MULTILINE),                 # residual lone-brace lines
    )
    for pattern, flags in deletion_rules:
        text = re.sub(pattern, "", text, flags=flags)

    # Squeeze three-or-more newlines down to a single blank line.
    text = re.sub(r"\n{3,}", "\n\n", text)

    return text.strip() + "\n"
def inject_h1_from_title(metadata: dict, body: str):
    """Prepend ``# <title>`` from the frontmatter when *body* lacks an H1.

    Returns *body* unchanged when there is no title, or when the body
    already opens with a level-1 Markdown heading.
    """
    title = metadata.get("title", "")
    already_has_h1 = bool(re.match(r"^\s*#\s+", body))
    if not title or already_has_h1:
        return body
    return f"# {title}\n\n{body.lstrip()}"
def find_style_by_candidates(doc, candidates):
    """Locate a paragraph style in *doc* by any of the *candidates* names.

    First tries an exact display-name match, then falls back to Word's
    locale-independent internal ``style_id`` (so e.g. "Titre 1" and
    "Heading 1" both resolve to the Heading1 style). Returns ``None``
    when nothing matches.
    """
    # Pass 1: exact display-name match.
    for style in doc.styles:
        if any(style.name == wanted for wanted in candidates):
            return style

    # Pass 2: map each internal style_id to the display names it answers to.
    id_aliases = {
        "BodyText": {"Body Text", "Corps de texte"},
        "Heading1": {"Heading 1", "Titre 1"},
        "Heading2": {"Heading 2", "Titre 2"},
        "Heading3": {"Heading 3", "Titre 3"},
        "Heading4": {"Heading 4", "Titre 4"},
    }
    for style in doc.styles:
        aliases = id_aliases.get(getattr(style, "style_id", ""))
        if aliases and any(wanted in aliases for wanted in candidates):
            return style

    return None
def strip_leading_paragraph_numbers(text: str):
    """Remove leading paragraph numbers from each line of *text*.

    Strips prefixes of the form ``2. ``, ``11. ``, ``101. `` (optionally
    indented) while leaving Markdown headings (#, ##, ...) untouched.
    The result always ends with a single trailing newline.
    """
    number_prefix = re.compile(r"^\s*\d+\.\s+")
    cleaned = []
    for line in text.splitlines():
        if line.lstrip().startswith("#"):
            # Markdown headings are never rewritten.
            cleaned.append(line)
        else:
            cleaned.append(number_prefix.sub("", line))
    return "\n".join(cleaned) + "\n"
def normalize_non_heading_paragraphs(docx_path: Path):
    """Restyle every non-empty, non-heading paragraph as Body Text.

    Heading 1-4 paragraphs (matched by English/French display name or by
    Word's internal style id) are left untouched. The file is saved in
    place. When the document has no Body Text / Corps de texte style, a
    warning is printed and nothing is modified.
    """
    document = Document(str(docx_path))

    body_style = find_style_by_candidates(document, ["Body Text", "Corps de texte"])
    if body_style is None:
        print(f"Avertissement : style 'Body Text / Corps de texte' introuvable dans {docx_path.name}")
        return

    protected_names = {
        "Heading 1", "Heading 2", "Heading 3", "Heading 4",
        "Titre 1", "Titre 2", "Titre 3", "Titre 4",
    }
    protected_ids = {"Heading1", "Heading2", "Heading3", "Heading4"}

    changed = 0
    for paragraph in document.paragraphs:
        if not paragraph.text.strip():
            # Empty paragraphs are left alone.
            continue

        style = paragraph.style
        display_name = style.name if style else ""
        internal_id = getattr(style, "style_id", "") if style else ""
        if display_name in protected_names or internal_id in protected_ids:
            # Keep headings exactly as pandoc produced them.
            continue

        paragraph.style = body_style
        changed += 1

    document.save(str(docx_path))
    print(f" ↳ normalisation styles : {changed} paragraphe(s) mis en 'Body Text / Corps de texte'")
def remove_word_bookmarks(docx_path: Path):
    """Strip Word bookmark markers from a DOCX file, in place.

    Bookmarks (``<w:bookmarkStart/>`` / ``<w:bookmarkEnd/>``) are what
    appear as grey brackets in LibreOffice/Word when bookmark display is
    on. The OOXML archive is unpacked, the relevant XML parts rewritten,
    and the archive rebuilt into a sibling file that replaces the
    original.
    """
    bookmark_patterns = (r"<w:bookmarkStart\b[^>]*/>", r"<w:bookmarkEnd\b[^>]*/>")

    with tempfile.TemporaryDirectory() as workdir:
        workdir = Path(workdir)

        # Unpack the OOXML archive.
        with zipfile.ZipFile(docx_path, "r") as archive:
            archive.extractall(workdir)

        removed = 0
        for part_name in ("document.xml", "footnotes.xml", "endnotes.xml", "comments.xml"):
            xml_part = workdir / "word" / part_name
            if not xml_part.exists():
                continue

            content = xml_part.read_text(encoding="utf-8")
            for pattern in bookmark_patterns:
                content, hits = re.subn(pattern, "", content)
                removed += hits
            xml_part.write_text(content, encoding="utf-8")

        # Repack everything into a sibling file, then swap it over the original.
        rebuilt = docx_path.with_suffix(".cleaned.docx")
        with zipfile.ZipFile(rebuilt, "w", zipfile.ZIP_DEFLATED) as archive:
            for entry in workdir.rglob("*"):
                if entry.is_file():
                    archive.write(entry, entry.relative_to(workdir))

        rebuilt.replace(docx_path)
        print(f" ↳ suppression signets : {removed} balise(s) supprimée(s)")
def convert_one_file(input_path: Path, output_path: Path, reference_doc: Path | None):
    """Convert one MDX file to DOCX via pandoc, then post-process it.

    Pipeline: strip frontmatter and MDX artifacts, drop paragraph
    numbers, inject an H1 from the frontmatter title, run pandoc (with
    an optional reference DOCX template), then normalise styles and
    remove bookmarks in the generated file.
    """
    source = input_path.read_text(encoding="utf-8")
    metadata, body = split_frontmatter(source)
    body = inject_h1_from_title(
        metadata,
        strip_leading_paragraph_numbers(strip_mdx_artifacts(body)),
    )

    # pandoc reads from a real file, so stage the cleaned Markdown on disk.
    with tempfile.NamedTemporaryFile("w", suffix=".md", delete=False, encoding="utf-8") as handle:
        handle.write(body)
        staged_md = Path(handle.name)

    command = ["pandoc", str(staged_md), "-f", "markdown", "-o", str(output_path)]
    if reference_doc:
        command += ["--reference-doc", str(reference_doc)]

    try:
        subprocess.run(command, check=True)
    finally:
        # Best-effort removal of the staging file, even when pandoc fails.
        try:
            staged_md.unlink()
        except FileNotFoundError:
            pass

    normalize_non_heading_paragraphs(output_path)
    remove_word_bookmarks(output_path)
def main():
    """CLI entry point: convert every ``.mdx`` file in a directory to DOCX."""
    parser = argparse.ArgumentParser(
        description="Convertit des fichiers MDX en DOCX en conservant H1/H2/H3/H4 et en forçant le corps en Body Text."
    )
    parser.add_argument("input_dir", help="Dossier contenant les .mdx")
    parser.add_argument(
        "--output-dir",
        default=str(Path.home() / "Desktop" / "archicrat-ia-docx"),
        help="Dossier de sortie DOCX",
    )
    parser.add_argument(
        "--reference-doc",
        default=None,
        help="DOCX modèle Word à utiliser comme reference-doc",
    )
    args = parser.parse_args()

    input_dir = Path(args.input_dir)
    output_dir = Path(args.output_dir)
    reference_doc = Path(args.reference_doc) if args.reference_doc else None

    # Fail fast on missing prerequisites before touching the filesystem.
    if shutil.which("pandoc") is None:
        print("Erreur : pandoc n'est pas installé. Installe-le avec : brew install pandoc")
        sys.exit(1)

    if not (input_dir.exists() and input_dir.is_dir()):
        print(f"Erreur : dossier introuvable : {input_dir}")
        sys.exit(1)

    if reference_doc and not reference_doc.exists():
        print(f"Erreur : reference-doc introuvable : {reference_doc}")
        sys.exit(1)

    output_dir.mkdir(parents=True, exist_ok=True)

    mdx_files = sorted(input_dir.glob("*.mdx"))
    if not mdx_files:
        print(f"Aucun fichier .mdx trouvé dans : {input_dir}")
        sys.exit(1)

    print(f"Conversion de {len(mdx_files)} fichier(s)...")
    print(f"Entrée : {input_dir}")
    print(f"Sortie : {output_dir}")
    if reference_doc:
        print(f"Modèle : {reference_doc}")
    print()

    for mdx_file in mdx_files:
        target = output_dir / mdx_file.with_suffix(".docx").name
        print(f"→ {mdx_file.name} -> {target.name}")
        convert_one_file(mdx_file, target, reference_doc)

    print()
    print("✅ Conversion terminée.")


if __name__ == "__main__":
    main()
Reference in New Issue
Block a user