Synchronise les contenus glossaire et ajoute les scripts de conversion DOCX/MDX

2026-04-23 12:04:31 +02:00
parent fa46971e76
commit 5b427d5602
34 changed files with 6781 additions and 1084 deletions
--- a/scripts/convert_docx_to_mdx.py
+++ b/scripts/convert_docx_to_mdx.py
@@ -0,0 +1,241 @@
+#!/usr/bin/env python3
+import argparse
+import os
+import re
+import shutil
+import subprocess
+import sys
+import unicodedata
+from pathlib import Path
+
+try:
+    import yaml
+except ImportError:
+    print("Erreur : PyYAML n'est pas installé. Lance : pip3 install pyyaml")
+    sys.exit(1)
+
+
+# Constant frontmatter values written unchanged into every generated MDX file.
+EDITION = "archicrat-ia"
+STATUS = "essai_these"
+VERSION = "0.1.0"
+
+
+# Frontmatter `order` value for each canonical slug returned by slugify_name.
+# Slugs that are not listed here get 999 in convert_one_file.
+ORDER_MAP = {
+    "prologue": 10,
+    "chapitre-1": 20,
+    "chapitre-2": 30,
+    "chapitre-3": 40,
+    "chapitre-4": 50,
+    "chapitre-5": 60,
+    "conclusion": 70,
+}
+
+
+# Frontmatter `title` for each canonical slug. In convert_one_file an entry
+# here takes precedence over the H1 detected in the converted Markdown.
+TITLE_MAP = {
+    "prologue": "Prologue — Fondation, finalité sociopolitique et historique",
+    "chapitre-1": "Chapitre 1 — Fondements épistémologiques et modélisation",
+    "chapitre-2": "Chapitre 2 — Archéogenèse des régimes de co-viabilité",
+    "chapitre-3": "Chapitre 3 — Philosophies du pouvoir et archicration",
+    "chapitre-4": "Chapitre 4 — Histoire archicratique des révolutions industrielles",
+    "chapitre-5": "Chapitre 5 — Tensions, co-viabilités et régulations",
+    "conclusion": "Conclusion — ArchiCraT-IA",
+}
+
+
+def slugify_name(path: Path) -> str:
+    stem = path.stem.lower().strip()
+
+    replacements = {
+        " ": "-",
+        "_": "-",
+        "—": "-",
+        "–": "-",
+        "é": "e",
+        "è": "e",
+        "ê": "e",
+        "ë": "e",
+        "à": "a",
+        "â": "a",
+        "ä": "a",
+        "î": "i",
+        "ï": "i",
+        "ô": "o",
+        "ö": "o",
+        "ù": "u",
+        "û": "u",
+        "ü": "u",
+        "ç": "c",
+        "'": "",
+        "’": "",
+    }
+
+    for old, new in replacements.items():
+        stem = stem.replace(old, new)
+
+    stem = re.sub(r"-+", "-", stem).strip("-")
+
+    # normalisations spécifiques
+    stem = stem.replace("chapitre-1-fondements-epistemologiques-et-modelisation-archicratie-version-officielle-revise", "chapitre-1")
+    stem = stem.replace("chapitre-2", "chapitre-2")
+    stem = stem.replace("chapitre-3", "chapitre-3")
+    stem = stem.replace("chapitre-4", "chapitre-4")
+    stem = stem.replace("chapitre-5", "chapitre-5")
+
+    if "prologue" in stem:
+        return "prologue"
+    if "chapitre-1" in stem:
+        return "chapitre-1"
+    if "chapitre-2" in stem:
+        return "chapitre-2"
+    if "chapitre-3" in stem:
+        return "chapitre-3"
+    if "chapitre-4" in stem:
+        return "chapitre-4"
+    if "chapitre-5" in stem:
+        return "chapitre-5"
+    if "conclusion" in stem:
+        return "conclusion"
+
+    return stem
+
+
+def extract_title_from_markdown(md_text: str) -> str | None:
+    for line in md_text.splitlines():
+        line = line.strip()
+        if not line:
+            continue
+        if line.startswith("# "):
+            return line[2:].strip()
+    return None
+
+
+def remove_first_h1(md_text: str) -> str:
+    lines = md_text.splitlines()
+    out = []
+    removed = False
+
+    for line in lines:
+        if not removed and line.strip().startswith("# "):
+            removed = True
+            continue
+        out.append(line)
+
+    text = "\n".join(out).lstrip()
+    return text
+
+
+def clean_markdown(md_text: str) -> str:
+    text = md_text.replace("\r\n", "\n").replace("\r", "\n")
+
+    # nettoyer espaces multiples
+    text = re.sub(r"\n{3,}", "\n\n", text)
+
+    # supprimer éventuels signets/artefacts de liens internes Pandoc
+    text = re.sub(r"\[\]\(#.*?\)", "", text)
+
+    # convertir astérismes parasites
+    text = re.sub(r"[ \t]+$", "", text, flags=re.MULTILINE)
+
+    return text.strip() + "\n"
+
+
+def compute_level(slug: str) -> int:
+    if slug == "prologue":
+        return 1
+    if slug.startswith("chapitre-"):
+        return 1
+    if slug == "conclusion":
+        return 1
+    return 1
+
+
+def convert_one_file(input_docx: Path, output_dir: Path, source_root: Path):
+    slug = slugify_name(input_docx)
+    output_mdx = output_dir / f"{slug}.mdx"
+
+    cmd = [
+        "pandoc",
+        str(input_docx),
+        "-f",
+        "docx",
+        "-t",
+        "gfm+smart",
+    ]
+
+    result = subprocess.run(cmd, check=True, capture_output=True, text=True)
+    md_text = result.stdout
+
+    detected_title = extract_title_from_markdown(md_text)
+    md_body = remove_first_h1(md_text)
+    md_body = clean_markdown(md_body)
+
+    title = TITLE_MAP.get(slug) or detected_title or input_docx.stem
+    order = ORDER_MAP.get(slug, 999)
+    level = compute_level(slug)
+
+    relative_source = input_docx
+    try:
+        relative_source = input_docx.relative_to(source_root)
+    except ValueError:
+        relative_source = input_docx.name
+
+    frontmatter = {
+        "title": title,
+        "edition": EDITION,
+        "status": STATUS,
+        "level": level,
+        "version": VERSION,
+        "concepts": [],
+        "links": [],
+        "order": order,
+        "summary": "",
+        "source": {
+            "kind": "docx",
+            "path": str(relative_source),
+        },
+    }
+
+    yaml_block = yaml.safe_dump(
+        frontmatter,
+        allow_unicode=True,
+        sort_keys=False,
+        default_flow_style=False,
+    ).strip()
+
+    final_text = f"---\n{yaml_block}\n---\n{md_body if md_body.startswith(chr(10)) else chr(10) + md_body}"
+    output_mdx.write_text(final_text, encoding="utf-8")
+    print(f"✅ {input_docx.name} -> {output_mdx.name}")
+
+
+def main():
+    parser = argparse.ArgumentParser(description="Convertit un dossier DOCX en MDX avec frontmatter.")
+    parser.add_argument("input_dir", help="Dossier source contenant les DOCX")
+    parser.add_argument("output_dir", help="Dossier de sortie pour les MDX")
+    args = parser.parse_args()
+
+    input_dir = Path(args.input_dir).expanduser().resolve()
+    output_dir = Path(args.output_dir).expanduser().resolve()
+
+    if not shutil.which("pandoc"):
+        print("Erreur : pandoc n'est pas installé. Lance : brew install pandoc")
+        sys.exit(1)
+
+    if not input_dir.exists() or not input_dir.is_dir():
+        print(f"Erreur : dossier source introuvable : {input_dir}")
+        sys.exit(1)
+
+    output_dir.mkdir(parents=True, exist_ok=True)
+
+    docx_files = sorted(input_dir.glob("*.docx"))
+    if not docx_files:
+        print(f"Aucun DOCX trouvé dans : {input_dir}")
+        sys.exit(1)
+
+    for docx_file in docx_files:
+        convert_one_file(docx_file, output_dir, input_dir)
+
+    print()
+    print("Conversion DOCX -> MDX terminée.")
+
+
+if __name__ == "__main__":
+    main()