# archicratie-edition/scripts/convert_docx_to_mdx.py

#!/usr/bin/env python3
import argparse
import os
import re
import shutil
import subprocess
import sys
import unicodedata
from pathlib import Path

try:
    import yaml
except ImportError:
    print("Erreur : PyYAML n'est pas installé. Lance : pip3 install pyyaml")
    sys.exit(1)


# Edition-wide values; convert_one_file writes them unchanged into the
# "edition", "status" and "version" frontmatter keys of every generated file.
EDITION = "archicrat-ia"
STATUS = "essai_these"
VERSION = "0.1.0"


# Sort rank of each known section slug, in steps of 10 following the reading
# order below. convert_one_file falls back to 999 for slugs not listed here.
ORDER_MAP = {
    slug: 10 * position
    for position, slug in enumerate(
        (
            "prologue",
            "chapitre-1",
            "chapitre-2",
            "chapitre-3",
            "chapitre-4",
            "chapitre-5",
            "conclusion",
        ),
        start=1,
    )
}


# Canonical title of each known section slug. convert_one_file prefers these
# over the first H1 detected in the converted document and over the file stem.
TITLE_MAP = {
    "prologue": "Prologue — Fondation, finalité sociopolitique et historique",
    "chapitre-1": "Chapitre 1 — Fondements épistémologiques et modélisation",
    "chapitre-2": "Chapitre 2 — Archéogenèse des régimes de co-viabilité",
    "chapitre-3": "Chapitre 3 — Philosophies du pouvoir et archicration",
    "chapitre-4": "Chapitre 4 — Histoire archicratique des révolutions industrielles",
    "chapitre-5": "Chapitre 5 — Tensions, co-viabilités et régulations",
    "conclusion": "Conclusion — ArchiCraT-IA",
}


def slugify_name(path: Path) -> str:
    """Derive the output slug from a DOCX file name.

    The file stem is lower-cased, accents are folded to their base letters,
    spaces, underscores and long dashes become hyphens, apostrophes are
    dropped and runs of hyphens are collapsed. If the result mentions one of
    the known sections (prologue, chapitre-1 … chapitre-5, conclusion), the
    canonical section slug is returned; otherwise the cleaned stem is.

    Args:
        path: Path of the source document; only its stem is used.

    Returns:
        A canonical section slug, or the cleaned stem for unknown documents.
    """
    stem = path.stem.lower().strip()

    # Decompose accented characters and drop the combining marks, so that
    # precomposed ("é") and decomposed ("e" + U+0301) spellings of a file
    # name both fold to the same plain letter.
    stem = unicodedata.normalize("NFKD", stem)
    stem = "".join(ch for ch in stem if not unicodedata.combining(ch))

    # Word separators become hyphens; apostrophes are removed outright.
    separators = str.maketrans(
        {
            " ": "-",
            "_": "-",
            "—": "-",
            "–": "-",
            "'": "",
            "’": "",
        }
    )
    stem = stem.translate(separators)
    stem = re.sub(r"-+", "-", stem).strip("-")

    if "prologue" in stem:
        return "prologue"

    # The negative lookahead keeps "chapitre-10" from being mistaken for
    # "chapitre-1" (which would overwrite the real chapter 1 output).
    for number in range(1, 6):
        if re.search(rf"chapitre-{number}(?!\d)", stem):
            return f"chapitre-{number}"

    if "conclusion" in stem:
        return "conclusion"

    return stem


def extract_title_from_markdown(md_text: str) -> str | None:
    for line in md_text.splitlines():
        line = line.strip()
        if not line:
            continue
        if line.startswith("# "):
            return line[2:].strip()
    return None


def remove_first_h1(md_text: str) -> str:
    """Return the Markdown without its first level-1 heading line.

    Only the first line whose stripped form starts with "# " is dropped.
    Lines are re-joined with "\\n" and leading whitespace of the result is
    removed.
    """
    rows = md_text.splitlines()
    heading_index = next(
        (idx for idx, row in enumerate(rows) if row.strip().startswith("# ")),
        None,
    )
    if heading_index is not None:
        del rows[heading_index]
    return "\n".join(rows).lstrip()


def clean_markdown(md_text: str) -> str:
    """Normalise Markdown produced by the DOCX conversion.

    Steps, in order:
      1. convert CRLF / CR line endings to LF;
      2. remove empty internal-link artefacts of the form ``[](#anchor)``;
      3. strip trailing spaces and tabs from every line;
      4. collapse runs of three or more newlines into one blank line.

    Blank-line collapsing runs last on purpose: steps 2 and 3 can turn a
    line that held only an anchor or only whitespace into an empty line, and
    those new empty lines must be collapsed too.

    Args:
        md_text: Markdown text to clean.

    Returns:
        The cleaned text, stripped and terminated by exactly one newline
        (a lone "\\n" for empty input).
    """
    text = md_text.replace("\r\n", "\n").replace("\r", "\n")

    # Drop empty bookmark/internal-link artefacts left by the conversion.
    text = re.sub(r"\[\]\(#.*?\)", "", text)

    # Strip trailing spaces and tabs on every line.
    text = re.sub(r"[ \t]+$", "", text, flags=re.MULTILINE)

    # Collapse consecutive blank lines into a single one.
    text = re.sub(r"\n{3,}", "\n\n", text)

    return text.strip() + "\n"


def compute_level(slug: str) -> int:
    """Return the frontmatter "level" for a section slug.

    Every section (prologue, chapters, conclusion and any other document)
    is currently level 1, so the result does not depend on ``slug``; the
    parameter is kept so callers are unaffected.

    Args:
        slug: Section slug; not used by the current rule.

    Returns:
        Always 1.
    """
    return 1


def convert_one_file(input_docx: Path, output_dir: Path, source_root: Path):
    """Convert one DOCX file into an MDX file with YAML frontmatter.

    Runs pandoc (docx -> gfm+smart), removes the first level-1 heading from
    the body (the title is carried by the frontmatter instead), cleans the
    Markdown and writes ``<output_dir>/<slug>.mdx`` encoded as UTF-8.

    Args:
        input_docx: DOCX file to convert.
        output_dir: Directory receiving the ``.mdx`` file; it is not created
            here.
        source_root: Directory against which ``input_docx`` is made relative
            for the ``source.path`` frontmatter entry. The bare file name is
            used when the file is not located under it.

    Raises:
        subprocess.CalledProcessError: If pandoc exits with a non-zero
            status; its captured stderr is available on the exception.
    """
    slug = slugify_name(input_docx)
    output_mdx = output_dir / f"{slug}.mdx"

    cmd = [
        "pandoc",
        str(input_docx),
        "-f",
        "docx",
        "-t",
        "gfm+smart",
    ]

    # pandoc always emits UTF-8, so decode explicitly rather than with the
    # locale's preferred encoding (which may not be UTF-8).
    result = subprocess.run(
        cmd,
        check=True,
        capture_output=True,
        text=True,
        encoding="utf-8",
    )
    md_text = result.stdout

    detected_title = extract_title_from_markdown(md_text)
    md_body = clean_markdown(remove_first_h1(md_text))

    # Title priority: canonical table, then the document's own H1, then the
    # raw file stem.
    title = TITLE_MAP.get(slug) or detected_title or input_docx.stem
    order = ORDER_MAP.get(slug, 999)
    level = compute_level(slug)

    try:
        relative_source = input_docx.relative_to(source_root)
    except ValueError:
        # The file is not under source_root: record the file name only.
        relative_source = input_docx.name

    frontmatter = {
        "title": title,
        "edition": EDITION,
        "status": STATUS,
        "level": level,
        "version": VERSION,
        "concepts": [],
        "links": [],
        "order": order,
        "summary": "",
        "source": {
            "kind": "docx",
            "path": str(relative_source),
        },
    }

    yaml_block = yaml.safe_dump(
        frontmatter,
        allow_unicode=True,
        sort_keys=False,
        default_flow_style=False,
    ).strip()

    # Keep exactly one blank line between the frontmatter and the body; an
    # empty body is a lone newline and already provides it.
    separator = "" if md_body.startswith("\n") else "\n"
    final_text = f"---\n{yaml_block}\n---\n{separator}{md_body}"
    output_mdx.write_text(final_text, encoding="utf-8")
    print(f"✅ {input_docx.name} -> {output_mdx.name}")


def main():
    """Command-line entry point: convert every DOCX of a folder to MDX.

    Reads the ``input_dir`` and ``output_dir`` positional arguments, checks
    that pandoc is on the PATH and that the source folder exists, creates
    the output folder if needed, then converts each ``*.docx`` file found
    directly in the source folder (no recursion), in sorted order.

    The process exits with status 1 when pandoc is missing, the source
    folder is not a directory, no DOCX file is found, or pandoc fails on a
    file.
    """
    parser = argparse.ArgumentParser(description="Convertit un dossier DOCX en MDX avec frontmatter.")
    parser.add_argument("input_dir", help="Dossier source contenant les DOCX")
    parser.add_argument("output_dir", help="Dossier de sortie pour les MDX")
    args = parser.parse_args()

    input_dir = Path(args.input_dir).expanduser().resolve()
    output_dir = Path(args.output_dir).expanduser().resolve()

    if not shutil.which("pandoc"):
        print("Erreur : pandoc n'est pas installé. Lance : brew install pandoc")
        sys.exit(1)

    if not input_dir.exists() or not input_dir.is_dir():
        print(f"Erreur : dossier source introuvable : {input_dir}")
        sys.exit(1)

    output_dir.mkdir(parents=True, exist_ok=True)

    # Skip the "~$name.docx" owner/lock files Word keeps next to an open
    # document: they match the glob but are not valid DOCX archives.
    docx_files = sorted(
        candidate
        for candidate in input_dir.glob("*.docx")
        if not candidate.name.startswith("~$")
    )
    if not docx_files:
        print(f"Aucun DOCX trouvé dans : {input_dir}")
        sys.exit(1)

    for docx_file in docx_files:
        try:
            convert_one_file(docx_file, output_dir, input_dir)
        except subprocess.CalledProcessError as exc:
            # Show pandoc's own diagnostic instead of a bare traceback.
            print(
                f"Erreur : pandoc a échoué sur {docx_file.name} (code {exc.returncode})",
                file=sys.stderr,
            )
            if exc.stderr:
                print(exc.stderr.strip(), file=sys.stderr)
            sys.exit(1)

    print()
    print("Conversion DOCX -> MDX terminée.")


# Run the conversion only when executed as a script, not when imported.
if __name__ == "__main__":
    main()