# archicratie-edition/scripts/convert_mdx_to_docx.py

#!/usr/bin/env python3
import argparse
import re
import shutil
import subprocess
import sys
import tempfile
from pathlib import Path
import zipfile

try:
    import yaml
except ImportError:
    print("Erreur : PyYAML n'est pas installé. Lance : pip3 install pyyaml")
    sys.exit(1)

try:
    from docx import Document
except ImportError:
    print("Erreur : python-docx n'est pas installé. Lance : pip3 install python-docx")
    sys.exit(1)


def split_frontmatter(text: str):
    """Split a leading YAML frontmatter block from the rest of an MDX document.

    The frontmatter must start on the very first line with ``---`` and be
    closed by a line containing only ``---``. The closing delimiter may be
    the last line of the text, with or without a trailing newline.

    Args:
        text: Full content of the MDX file.

    Returns:
        A ``(metadata, body)`` tuple. ``metadata`` is always a dict: it is
        empty when there is no frontmatter, when the YAML cannot be parsed,
        or when the YAML is valid but is not a mapping. ``body`` is the text
        following the closing delimiter, or ``text`` unchanged when no
        frontmatter block was recognised.
    """
    if not text.startswith("---\n"):
        return {}, text

    # The optional trailing group lets a file made only of frontmatter
    # (closing '---' at end of text) be recognised instead of leaking the
    # YAML into the body.
    match = re.match(r"^---\n(.*?)\n---(?:\n(.*))?$", text, flags=re.DOTALL)
    if not match:
        return {}, text

    yaml_block = match.group(1)
    body = match.group(2) or ""

    try:
        metadata = yaml.safe_load(yaml_block) or {}
    except Exception as e:
        # Deliberately broad: an unreadable frontmatter must not abort the
        # conversion, it is reported and ignored.
        print(f"Avertissement : frontmatter YAML illisible : {e}")
        metadata = {}

    # A scalar or a list is valid YAML but callers use ``metadata.get``.
    if not isinstance(metadata, dict):
        print(
            "Avertissement : frontmatter YAML ignoré "
            f"(mapping attendu, reçu {type(metadata).__name__})"
        )
        metadata = {}

    return metadata, body


def strip_mdx_artifacts(text: str):
    """Remove MDX-specific syntax from a document body.

    The removals are applied in a fixed order, then runs of three or more
    newlines are collapsed to two. The result is stripped of surrounding
    whitespace and always ends with exactly one newline.
    """
    # (pattern, regex flags), applied top to bottom.
    removals = (
        # MDX import / export statements
        (r"^\s*(import|export)\s+.+?$", re.MULTILINE),
        # Self-closing components: <Component />
        (r"<[A-Z][A-Za-z0-9._-]*\b[^>]*\/>", 0),
        # Block components: <Component ...>...</Component>
        (r"<([A-Z][A-Za-z0-9._-]*)\b[^>]*>.*?</\1>", re.DOTALL),
        # Leftover lines holding only an empty pair of braces
        (r"^\s*{\s*}\s*$", re.MULTILINE),
    )

    cleaned = text
    for pattern, flags in removals:
        cleaned = re.sub(pattern, "", cleaned, flags=flags)

    # Collapse multiple blank lines
    cleaned = re.sub(r"\n{3,}", "\n\n", cleaned)

    return f"{cleaned.strip()}\n"


def inject_h1_from_title(metadata: dict, body: str):
    """Prepend the frontmatter title as a level-1 heading when needed.

    The body is returned unchanged when ``metadata`` has no truthy
    ``title`` or when the body already starts (after optional whitespace)
    with a ``# `` heading. Otherwise ``# <title>`` and a blank line are
    placed in front of the body, whose leading whitespace is removed.
    """
    heading_text = metadata.get("title", "")
    # Short-circuit: the body is only inspected when a title exists.
    nothing_to_add = not heading_text or re.match(r"^\s*#\s+", body)
    return body if nothing_to_add else f"# {heading_text}\n\n{body.lstrip()}"


def find_style_by_candidates(doc, candidates):
    """Return the first style of ``doc.styles`` matching one of ``candidates``.

    Styles are first matched on their visible ``name``. If none matches,
    they are matched on their internal ``style_id`` for the known
    Body Text and Heading 1-4 styles, using the English and French display
    names as aliases. Returns ``None`` when nothing matches.
    """
    # Pass 1: exact match on the visible name
    for style in doc.styles:
        if any(style.name == wanted for wanted in candidates):
            return style

    # Pass 2: match on the internal Word style id
    names_by_id = {
        "BodyText": {"Body Text", "Corps de texte"},
        "Heading1": {"Heading 1", "Titre 1"},
        "Heading2": {"Heading 2", "Titre 2"},
        "Heading3": {"Heading 3", "Titre 3"},
        "Heading4": {"Heading 4", "Titre 4"},
    }
    for style in doc.styles:
        aliases = names_by_id.get(getattr(style, "style_id", ""))
        if aliases is None:
            continue
        if any(wanted in aliases for wanted in candidates):
            return style

    return None

def strip_leading_paragraph_numbers(text: str):
    """
    Remove paragraph numbers at the start of lines, such as:
    2. Text...
    11. Text...
    101. Text...
    Markdown headings (#, ##, ###) are never modified.
    The lines are re-joined with newlines and a trailing newline is added.
    """
    numbering = re.compile(r"^\s*\d+\.\s+")

    def clean(line: str) -> str:
        # Headings are kept verbatim, even when they contain a number
        if line.lstrip().startswith("#"):
            return line
        return numbering.sub("", line)

    return "\n".join(map(clean, text.splitlines())) + "\n"

def normalize_non_heading_paragraphs(docx_path: Path):
    """
    Restyle every non-heading paragraph as Body Text / Corps de texte.

    Paragraphs styled Heading 1-4 (matched by name or by internal id) and
    paragraphs whose text is empty or whitespace-only are left untouched.
    The document is saved back to ``docx_path``. If the body style cannot
    be found, a warning is printed and the file is not rewritten.
    """
    document = Document(str(docx_path))

    target_style = find_style_by_candidates(document, ["Body Text", "Corps de texte"])
    if target_style is None:
        print(f"Avertissement : style 'Body Text / Corps de texte' introuvable dans {docx_path.name}")
        return

    protected_names = {
        "Heading 1", "Heading 2", "Heading 3", "Heading 4",
        "Titre 1", "Titre 2", "Titre 3", "Titre 4",
    }
    protected_ids = {"Heading1", "Heading2", "Heading3", "Heading4"}

    def is_heading(style) -> bool:
        # A paragraph without a style object is never a heading
        if not style:
            return False
        return style.name in protected_names or getattr(style, "style_id", "") in protected_ids

    restyled = 0
    for paragraph in document.paragraphs:
        if not paragraph.text.strip() or is_heading(paragraph.style):
            continue

        # Everything else becomes Body Text
        paragraph.style = target_style
        restyled += 1

    document.save(str(docx_path))
    print(f"   ↳ normalisation styles : {restyled} paragraphe(s) mis en 'Body Text / Corps de texte'")

def remove_word_bookmarks(docx_path: Path):
    """
    Remove Word bookmarks from a DOCX file, in place.

    Bookmarks are what show up as grey brackets in LibreOffice/Word when
    bookmark display is enabled. Every self-closing ``<w:bookmarkStart/>``
    and ``<w:bookmarkEnd/>`` element is deleted from the document,
    footnotes, endnotes and comments parts; all other archive members are
    copied unchanged.

    The archive is streamed member by member into a sibling
    ``*.cleaned.docx`` file which then replaces the original. This keeps
    the original member order, avoids any newline translation of the XML,
    and never leaves the temporary file behind, even on failure.

    Args:
        docx_path: Path of the DOCX file to clean.
    """
    bookmark_parts = {
        "word/document.xml",
        "word/footnotes.xml",
        "word/endnotes.xml",
        "word/comments.xml",
    }
    bookmark_tag = re.compile(r"<w:bookmark(?:Start|End)\b[^>]*/>")

    removed = 0
    tmp_output = docx_path.with_suffix(".cleaned.docx")

    try:
        with zipfile.ZipFile(docx_path, "r") as zin, \
                zipfile.ZipFile(tmp_output, "w", zipfile.ZIP_DEFLATED) as zout:
            for info in zin.infolist():
                # Only file members are carried over
                if info.is_dir():
                    continue

                data = zin.read(info)
                if info.filename in bookmark_parts:
                    # Work on bytes <-> str explicitly so no platform
                    # newline translation is applied to the XML.
                    text, count = bookmark_tag.subn("", data.decode("utf-8"))
                    removed += count
                    data = text.encode("utf-8")

                zout.writestr(info, data, compress_type=zipfile.ZIP_DEFLATED)

        tmp_output.replace(docx_path)
    finally:
        # No-op after a successful replace; removes a partial file otherwise
        tmp_output.unlink(missing_ok=True)

    print(f"   ↳ suppression signets : {removed} balise(s) supprimée(s)")

def convert_one_file(input_path: Path, output_path: Path, reference_doc: Path | None):
    """Convert one MDX file to DOCX and post-process the result.

    The source is read as UTF-8, its frontmatter is split off, MDX
    artifacts and leading paragraph numbers are removed, and the
    frontmatter title is injected as an H1 when the body has none. The
    resulting Markdown is piped to pandoc on stdin, so no temporary file
    has to be created or cleaned up. The generated DOCX then gets its
    non-heading paragraphs normalised and its bookmarks removed.

    Args:
        input_path: MDX file to convert.
        output_path: Destination DOCX file.
        reference_doc: Optional DOCX template passed to pandoc as
            ``--reference-doc``.

    Raises:
        subprocess.CalledProcessError: If pandoc exits with a non-zero status.
    """
    import os  # local import: only needed for the path-list separator

    raw = input_path.read_text(encoding="utf-8")
    metadata, body = split_frontmatter(raw)
    body = strip_mdx_artifacts(body)
    body = strip_leading_paragraph_numbers(body)
    body = inject_h1_from_title(metadata, body)

    # Relative resources (images, ...) are searched in the working
    # directory first, as before, then next to the source file.
    resource_path = os.pathsep.join([".", str(input_path.parent)])

    # No input file argument: pandoc reads the Markdown from stdin.
    cmd = [
        "pandoc",
        "-f",
        "markdown",
        "-o",
        str(output_path),
        "--resource-path",
        resource_path,
    ]

    if reference_doc:
        cmd.extend(["--reference-doc", str(reference_doc)])

    subprocess.run(cmd, input=body.encode("utf-8"), check=True)

    normalize_non_heading_paragraphs(output_path)
    remove_word_bookmarks(output_path)

def main():
    """Command-line entry point: convert every ``.mdx`` file of a folder to DOCX.

    Checks that pandoc is available, that the input folder exists and that
    the optional reference document exists, then converts each ``*.mdx``
    file found directly in the input folder. Error messages are written to
    stderr. A file that fails to convert is reported and the remaining
    files are still processed; the process exits with status 1 if any
    precondition is not met or if at least one file failed.
    """
    parser = argparse.ArgumentParser(
        description="Convertit des fichiers MDX en DOCX en conservant H1/H2/H3/H4 et en forçant le corps en Body Text."
    )
    parser.add_argument("input_dir", help="Dossier contenant les .mdx")
    parser.add_argument(
        "--output-dir",
        default=str(Path.home() / "Desktop" / "archicrat-ia-docx"),
        help="Dossier de sortie DOCX"
    )
    parser.add_argument(
        "--reference-doc",
        default=None,
        help="DOCX modèle Word à utiliser comme reference-doc"
    )

    args = parser.parse_args()

    input_dir = Path(args.input_dir)
    output_dir = Path(args.output_dir)
    reference_doc = Path(args.reference_doc) if args.reference_doc else None

    def fail(message: str):
        # Fatal precondition failure: report on stderr and stop
        print(message, file=sys.stderr)
        sys.exit(1)

    if not shutil.which("pandoc"):
        fail("Erreur : pandoc n'est pas installé. Installe-le avec : brew install pandoc")

    if not input_dir.exists() or not input_dir.is_dir():
        fail(f"Erreur : dossier introuvable : {input_dir}")

    if reference_doc and not reference_doc.exists():
        fail(f"Erreur : reference-doc introuvable : {reference_doc}")

    output_dir.mkdir(parents=True, exist_ok=True)

    mdx_files = sorted(input_dir.glob("*.mdx"))
    if not mdx_files:
        fail(f"Aucun fichier .mdx trouvé dans : {input_dir}")

    print(f"Conversion de {len(mdx_files)} fichier(s)...")
    print(f"Entrée  : {input_dir}")
    print(f"Sortie  : {output_dir}")
    if reference_doc:
        print(f"Modèle  : {reference_doc}")
    print()

    failed = []
    for mdx_file in mdx_files:
        docx_name = mdx_file.with_suffix(".docx").name
        out_file = output_dir / docx_name
        print(f"→ {mdx_file.name} -> {docx_name}")
        try:
            convert_one_file(mdx_file, out_file, reference_doc)
        except (subprocess.CalledProcessError, OSError, UnicodeError) as exc:
            # One bad file must not abort the whole batch
            failed.append(mdx_file.name)
            print(f"   ✗ échec de la conversion : {exc}", file=sys.stderr)

    print()
    if failed:
        fail(f"Conversion terminée avec {len(failed)} échec(s) : {', '.join(failed)}")

    print("✅ Conversion terminée.")


if __name__ == "__main__":
    main()