Synchronise les contenus glossaire et ajoute les scripts de conversion DOCX/MDX
This commit is contained in:
241
scripts/convert_docx_to_mdx.py
Executable file
241
scripts/convert_docx_to_mdx.py
Executable file
@@ -0,0 +1,241 @@
|
||||
#!/usr/bin/env python3
|
||||
import argparse
|
||||
import os
|
||||
import re
|
||||
import shutil
|
||||
import subprocess
|
||||
import sys
|
||||
from pathlib import Path
|
||||
|
||||
try:
|
||||
import yaml
|
||||
except ImportError:
|
||||
print("Erreur : PyYAML n'est pas installé. Lance : pip3 install pyyaml")
|
||||
sys.exit(1)
|
||||
|
||||
|
||||
# Frontmatter defaults stamped into every generated MDX file.
EDITION = "archicrat-ia"  # edition identifier
STATUS = "essai_these"    # editorial status of the content
VERSION = "0.1.0"         # content version stamp


# Ordering weights for the `order` frontmatter key; steps of 10 leave
# room to insert new sections later without renumbering existing ones.
# Slugs absent from this map fall back to 999 in convert_one_file.
ORDER_MAP = {
    "prologue": 10,
    "chapitre-1": 20,
    "chapitre-2": 30,
    "chapitre-3": 40,
    "chapitre-4": 50,
    "chapitre-5": 60,
    "conclusion": 70,
}


# Display titles keyed by canonical slug; used as the `title`
# frontmatter value when the slug is recognised, otherwise the title
# detected in the document (or the filename stem) is used instead.
TITLE_MAP = {
    "prologue": "Prologue — Fondation, finalité sociopolitique et historique",
    "chapitre-1": "Chapitre 1 — Fondements épistémologiques et modélisation",
    "chapitre-2": "Chapitre 2 — Archéogenèse des régimes de co-viabilité",
    "chapitre-3": "Chapitre 3 — Philosophies du pouvoir et archicration",
    "chapitre-4": "Chapitre 4 — Histoire archicratique des révolutions industrielles",
    "chapitre-5": "Chapitre 5 — Tensions, co-viabilités et régulations",
    "conclusion": "Conclusion — ArchiCraT-IA",
}
|
||||
|
||||
|
||||
def slugify_name(path: Path) -> str:
    """Derive a canonical chapter slug from a DOCX filename.

    Lowercases the stem, transliterates French accented characters to
    ASCII, collapses separators to single hyphens, then maps the result
    onto one of the known slugs (prologue, chapitre-1..5, conclusion)
    when the cleaned name contains it; otherwise returns the cleaned
    stem as-is.
    """
    stem = path.stem.lower().strip()

    # Single-character transliteration: separators become hyphens,
    # accented letters become plain ASCII, apostrophes are dropped.
    # str.translate does this in one C-level pass.
    table = str.maketrans({
        " ": "-", "_": "-", "—": "-", "–": "-",
        "é": "e", "è": "e", "ê": "e", "ë": "e",
        "à": "a", "â": "a", "ä": "a",
        "î": "i", "ï": "i",
        "ô": "o", "ö": "o",
        "ù": "u", "û": "u", "ü": "u",
        "ç": "c",
        "'": "", "’": "",
    })
    stem = stem.translate(table)

    # Collapse hyphen runs and trim leading/trailing hyphens.
    stem = re.sub(r"-+", "-", stem).strip("-")

    # Map onto a known canonical slug via substring matching.
    # (The previous chain of `stem.replace("chapitre-N", "chapitre-N")`
    # calls and the long chapitre-1 rewrite were dead code: the substring
    # checks below already matched in every case.)
    for slug in ("prologue", "chapitre-1", "chapitre-2", "chapitre-3",
                 "chapitre-4", "chapitre-5", "conclusion"):
        if slug in stem:
            return slug

    return stem
|
||||
|
||||
|
||||
def extract_title_from_markdown(md_text: str) -> str | None:
|
||||
for line in md_text.splitlines():
|
||||
line = line.strip()
|
||||
if not line:
|
||||
continue
|
||||
if line.startswith("# "):
|
||||
return line[2:].strip()
|
||||
return None
|
||||
|
||||
|
||||
def remove_first_h1(md_text: str) -> str:
    """Drop the first level-1 heading line from *md_text*, keeping any
    later H1 lines, and strip leading whitespace from the result."""
    kept = []
    pending = True

    for raw_line in md_text.splitlines():
        if pending and raw_line.strip().startswith("# "):
            pending = False
        else:
            kept.append(raw_line)

    return "\n".join(kept).lstrip()
|
||||
|
||||
|
||||
def clean_markdown(md_text: str) -> str:
    """Normalise Pandoc output: unify line endings, squeeze blank-line
    runs, drop empty internal-link artifacts, trim trailing whitespace,
    and guarantee exactly one trailing newline."""
    normalized = md_text.replace("\r\n", "\n").replace("\r", "\n")

    substitutions = (
        (r"\n{3,}", "\n\n", 0),              # squeeze runs of blank lines
        (r"\[\]\(#.*?\)", "", 0),            # empty Pandoc internal-link anchors
        (r"[ \t]+$", "", re.MULTILINE),      # trailing spaces/tabs per line
    )
    for pattern, replacement, flags in substitutions:
        normalized = re.sub(pattern, replacement, normalized, flags=flags)

    return normalized.strip() + "\n"
|
||||
|
||||
|
||||
def compute_level(slug: str) -> int:
    """Return the hierarchy level for a chapter *slug*.

    Every section of this edition (prologue, chapters, conclusion) sits
    at level 1, so the result is currently constant.  The parameter is
    kept so per-slug levels can be reintroduced without changing any
    caller.
    """
    # The previous if-chain returned 1 on every branch; collapsed the
    # dead code into a single return.
    return 1
|
||||
|
||||
|
||||
def convert_one_file(input_docx: Path, output_dir: Path, source_root: Path):
    """Convert one DOCX to an MDX file with a YAML frontmatter block.

    Runs pandoc (docx -> GitHub-flavoured Markdown with smart typography),
    strips the first H1 (its text serves as the fallback `title`), cleans
    the Markdown body, then writes `<slug>.mdx` into *output_dir*.

    Raises subprocess.CalledProcessError if pandoc fails.
    """
    slug = slugify_name(input_docx)
    output_mdx = output_dir / f"{slug}.mdx"

    cmd = [
        "pandoc",
        str(input_docx),
        "-f",
        "docx",
        "-t",
        "gfm+smart",
    ]

    result = subprocess.run(cmd, check=True, capture_output=True, text=True)
    md_text = result.stdout

    detected_title = extract_title_from_markdown(md_text)
    md_body = remove_first_h1(md_text)
    md_body = clean_markdown(md_body)

    title = TITLE_MAP.get(slug) or detected_title or input_docx.stem
    order = ORDER_MAP.get(slug, 999)
    level = compute_level(slug)

    # Record the source path relative to the scanned root when possible;
    # fall back to the bare filename for files outside that root.
    # (Removed the dead `relative_source = input_docx` pre-assignment:
    # both branches below always rebind the name.)
    try:
        relative_source = input_docx.relative_to(source_root)
    except ValueError:
        relative_source = input_docx.name

    frontmatter = {
        "title": title,
        "edition": EDITION,
        "status": STATUS,
        "level": level,
        "version": VERSION,
        "concepts": [],
        "links": [],
        "order": order,
        "summary": "",
        "source": {
            "kind": "docx",
            "path": str(relative_source),
        },
    }

    yaml_block = yaml.safe_dump(
        frontmatter,
        allow_unicode=True,
        sort_keys=False,
        default_flow_style=False,
    ).strip()

    # Ensure exactly one blank line between the closing frontmatter
    # fence and the body.  clean_markdown() strips leading whitespace,
    # so the body normally needs the newline prepended.  (Replaces the
    # old chr(10) workaround for backslashes in f-string expressions.)
    body = md_body if md_body.startswith("\n") else "\n" + md_body
    final_text = f"---\n{yaml_block}\n---\n{body}"
    output_mdx.write_text(final_text, encoding="utf-8")
    print(f"✅ {input_docx.name} -> {output_mdx.name}")
|
||||
|
||||
|
||||
def main():
    """CLI entry point: convert every *.docx in input_dir to MDX."""
    parser = argparse.ArgumentParser(description="Convertit un dossier DOCX en MDX avec frontmatter.")
    parser.add_argument("input_dir", help="Dossier source contenant les DOCX")
    parser.add_argument("output_dir", help="Dossier de sortie pour les MDX")
    args = parser.parse_args()

    source_dir = Path(args.input_dir).expanduser().resolve()
    target_dir = Path(args.output_dir).expanduser().resolve()

    # Fail fast on missing prerequisites.
    if shutil.which("pandoc") is None:
        print("Erreur : pandoc n'est pas installé. Lance : brew install pandoc")
        sys.exit(1)

    if not (source_dir.exists() and source_dir.is_dir()):
        print(f"Erreur : dossier source introuvable : {source_dir}")
        sys.exit(1)

    target_dir.mkdir(parents=True, exist_ok=True)

    documents = sorted(source_dir.glob("*.docx"))
    if not documents:
        print(f"Aucun DOCX trouvé dans : {source_dir}")
        sys.exit(1)

    for document in documents:
        convert_one_file(document, target_dir, source_dir)

    print()
    print("Conversion DOCX -> MDX terminée.")


if __name__ == "__main__":
    main()
|
||||
304
scripts/convert_mdx_to_docx.py
Normal file
304
scripts/convert_mdx_to_docx.py
Normal file
@@ -0,0 +1,304 @@
|
||||
#!/usr/bin/env python3
|
||||
import argparse
|
||||
import re
|
||||
import shutil
|
||||
import subprocess
|
||||
import sys
|
||||
import tempfile
|
||||
from pathlib import Path
|
||||
import zipfile
|
||||
|
||||
try:
|
||||
import yaml
|
||||
except ImportError:
|
||||
print("Erreur : PyYAML n'est pas installé. Lance : pip3 install pyyaml")
|
||||
sys.exit(1)
|
||||
|
||||
try:
|
||||
from docx import Document
|
||||
except ImportError:
|
||||
print("Erreur : python-docx n'est pas installé. Lance : pip3 install python-docx")
|
||||
sys.exit(1)
|
||||
|
||||
|
||||
def split_frontmatter(text: str):
    """Split *text* into a (metadata dict, body) pair.

    Returns ({}, text) unchanged when no leading `---` YAML fence is
    present or the fence is never closed; returns ({}, body) with a
    printed warning when the YAML block cannot be parsed.
    """
    if not text.startswith("---\n"):
        return {}, text

    parsed = re.match(r"^---\n(.*?)\n---\n(.*)$", text, flags=re.DOTALL)
    if parsed is None:
        return {}, text

    raw_yaml, body = parsed.groups()

    try:
        metadata = yaml.safe_load(raw_yaml) or {}
    except Exception as e:
        print(f"Avertissement : frontmatter YAML illisible : {e}")
        metadata = {}

    return metadata, body
|
||||
|
||||
|
||||
def strip_mdx_artifacts(text: str):
    """Remove MDX-only syntax so the text becomes plain Markdown.

    Drops import/export statements, self-closing and block-level JSX
    components (capitalised tags), leftover empty `{}` lines, squeezes
    blank-line runs, and guarantees a single trailing newline.
    """
    cleanups = (
        # import / export MDX statements
        (r"^\s*(import|export)\s+.+?$", re.MULTILINE),
        # self-closing components: <Component ... />
        (r"<[A-Z][A-Za-z0-9._-]*\b[^>]*\/>", 0),
        # block components: <Component ...>...</Component>
        (r"<([A-Z][A-Za-z0-9._-]*)\b[^>]*>.*?</\1>", re.DOTALL),
        # stray empty braces alone on a line
        (r"^\s*{\s*}\s*$", re.MULTILINE),
        # collapse runs of blank lines
        (r"\n{3,}", 0),
    )
    for pattern, flags in cleanups:
        replacement = "\n\n" if pattern == r"\n{3,}" else ""
        text = re.sub(pattern, replacement, text, flags=flags)

    return text.strip() + "\n"
|
||||
|
||||
|
||||
def inject_h1_from_title(metadata: dict, body: str):
    """Prepend `# <title>` taken from *metadata* unless no title is
    available or the body already opens with a level-1 heading."""
    title = metadata.get("title", "")
    already_has_h1 = bool(re.match(r"^\s*#\s+", body))

    if not title or already_has_h1:
        return body

    return f"# {title}\n\n{body.lstrip()}"
|
||||
|
||||
|
||||
def find_style_by_candidates(doc, candidates):
    """Return the first style in *doc* matching one of *candidates*.

    Matching is attempted first on the visible style name, then on
    Word's internal style_id (BodyText, Heading1-4) via the known
    English/French name equivalences.  Returns None when nothing
    matches.
    """
    wanted = set(candidates)

    # Pass 1: match on the visible style name.
    for style in doc.styles:
        if style.name in wanted:
            return style

    # Pass 2: match on the internal style_id through name aliases.
    id_aliases = {
        "BodyText": {"Body Text", "Corps de texte"},
        "Heading1": {"Heading 1", "Titre 1"},
        "Heading2": {"Heading 2", "Titre 2"},
        "Heading3": {"Heading 3", "Titre 3"},
        "Heading4": {"Heading 4", "Titre 4"},
    }
    for style in doc.styles:
        aliases = id_aliases.get(getattr(style, "style_id", ""))
        if aliases and aliases & wanted:
            return style

    return None
|
||||
|
||||
def strip_leading_paragraph_numbers(text: str):
    """Remove paragraph numbers such as "2. ", "11. ", "101. " from the
    start of each line, leaving Markdown headings (#, ##, ###) intact.
    The result always ends with a newline."""
    number_prefix = re.compile(r"^\s*\d+\.\s+")

    cleaned = [
        line if line.lstrip().startswith("#") else number_prefix.sub("", line)
        for line in text.splitlines()
    ]

    return "\n".join(cleaned) + "\n"
|
||||
|
||||
def normalize_non_heading_paragraphs(docx_path: Path):
    """Rewrite *docx_path* in place so every non-empty, non-heading
    paragraph uses the Body Text / Corps de texte style.

    Heading 1-4 paragraphs (matched by English or French visible name,
    or by internal style_id) are left untouched.  Prints a warning and
    returns without saving when the body style cannot be found.
    """
    document = Document(str(docx_path))

    target_style = find_style_by_candidates(document, ["Body Text", "Corps de texte"])
    if target_style is None:
        print(f"Avertissement : style 'Body Text / Corps de texte' introuvable dans {docx_path.name}")
        return

    # Heading styles recognised by visible name (EN/FR) or internal id.
    protected_names = {
        "Heading 1", "Heading 2", "Heading 3", "Heading 4",
        "Titre 1", "Titre 2", "Titre 3", "Titre 4",
    }
    protected_ids = {"Heading1", "Heading2", "Heading3", "Heading4"}

    restyled = 0

    for paragraph in document.paragraphs:
        if not paragraph.text.strip():
            continue

        style = paragraph.style
        visible_name = style.name if style else ""
        internal_id = getattr(style, "style_id", "") if style else ""

        if visible_name in protected_names or internal_id in protected_ids:
            continue

        # Everything else is forced to Body Text.
        paragraph.style = target_style
        restyled += 1

    document.save(str(docx_path))
    print(f" ↳ normalisation styles : {restyled} paragraphe(s) mis en 'Body Text / Corps de texte'")
|
||||
|
||||
def remove_word_bookmarks(docx_path: Path):
    """Strip Word bookmark markers from the DOCX at *docx_path* in place.

    Bookmarks (w:bookmarkStart / w:bookmarkEnd) appear as grey brackets
    in LibreOffice/Word when bookmark display is enabled.  The DOCX is a
    plain ZIP archive: it is unpacked, the relevant XML parts are
    filtered, and the archive is rebuilt and swapped over the original.
    """
    bookmark_patterns = (r"<w:bookmarkStart\b[^>]*/>", r"<w:bookmarkEnd\b[^>]*/>")
    part_names = ("document.xml", "footnotes.xml", "endnotes.xml", "comments.xml")

    with tempfile.TemporaryDirectory() as scratch_name:
        scratch = Path(scratch_name)

        # Unpack the DOCX archive.
        with zipfile.ZipFile(docx_path, "r") as archive:
            archive.extractall(scratch)

        removed = 0

        for part_name in part_names:
            xml_path = scratch / "word" / part_name
            if not xml_path.exists():
                continue

            content = xml_path.read_text(encoding="utf-8")

            # Drop <w:bookmarkStart .../> then <w:bookmarkEnd .../>.
            for pattern in bookmark_patterns:
                content, count = re.subn(pattern, "", content)
                removed += count

            xml_path.write_text(content, encoding="utf-8")

        # Repack next to the original, then swap it in.
        rebuilt = docx_path.with_suffix(".cleaned.docx")
        with zipfile.ZipFile(rebuilt, "w", zipfile.ZIP_DEFLATED) as archive:
            for entry in scratch.rglob("*"):
                if entry.is_file():
                    archive.write(entry, entry.relative_to(scratch))

        rebuilt.replace(docx_path)

    print(f" ↳ suppression signets : {removed} balise(s) supprimée(s)")
|
||||
|
||||
def convert_one_file(input_path: Path, output_path: Path, reference_doc: Path | None):
    """Convert one MDX file to a DOCX via pandoc.

    Pipeline: split off the YAML frontmatter, strip MDX-only syntax,
    remove leading paragraph numbers, re-inject the frontmatter title as
    an H1, run pandoc on a temporary Markdown file, then post-process
    the resulting DOCX (paragraph-style normalisation and bookmark
    removal).

    Raises subprocess.CalledProcessError if pandoc fails.
    """
    raw = input_path.read_text(encoding="utf-8")
    metadata, body = split_frontmatter(raw)
    body = strip_mdx_artifacts(body)
    body = strip_leading_paragraph_numbers(body)
    body = inject_h1_from_title(metadata, body)

    # Pandoc needs a real file: delete=False so it survives the `with`
    # block; it is removed in the `finally` below.
    with tempfile.NamedTemporaryFile("w", suffix=".md", delete=False, encoding="utf-8") as tmp:
        tmp.write(body)
        tmp_md = Path(tmp.name)

    cmd = [
        "pandoc",
        str(tmp_md),
        "-f",
        "markdown",
        "-o",
        str(output_path),
    ]

    # Optional Word template controlling the output document's styles.
    if reference_doc:
        cmd.extend(["--reference-doc", str(reference_doc)])

    try:
        subprocess.run(cmd, check=True)
    finally:
        # Always clean up the temporary Markdown file, even when pandoc
        # fails; tolerate it already being gone.
        try:
            tmp_md.unlink()
        except FileNotFoundError:
            pass

    normalize_non_heading_paragraphs(output_path)
    remove_word_bookmarks(output_path)
|
||||
|
||||
def main():
    """CLI entry point: convert every *.mdx in input_dir to DOCX."""
    parser = argparse.ArgumentParser(
        description="Convertit des fichiers MDX en DOCX en conservant H1/H2/H3/H4 et en forçant le corps en Body Text."
    )
    parser.add_argument("input_dir", help="Dossier contenant les .mdx")
    parser.add_argument(
        "--output-dir",
        default=str(Path.home() / "Desktop" / "archicrat-ia-docx"),
        help="Dossier de sortie DOCX"
    )
    parser.add_argument(
        "--reference-doc",
        default=None,
        help="DOCX modèle Word à utiliser comme reference-doc"
    )

    args = parser.parse_args()

    source_dir = Path(args.input_dir)
    target_dir = Path(args.output_dir)
    template = Path(args.reference_doc) if args.reference_doc else None

    # Fail fast on missing prerequisites.
    if shutil.which("pandoc") is None:
        print("Erreur : pandoc n'est pas installé. Installe-le avec : brew install pandoc")
        sys.exit(1)

    if not (source_dir.exists() and source_dir.is_dir()):
        print(f"Erreur : dossier introuvable : {source_dir}")
        sys.exit(1)

    if template and not template.exists():
        print(f"Erreur : reference-doc introuvable : {template}")
        sys.exit(1)

    target_dir.mkdir(parents=True, exist_ok=True)

    sources = sorted(source_dir.glob("*.mdx"))
    if not sources:
        print(f"Aucun fichier .mdx trouvé dans : {source_dir}")
        sys.exit(1)

    print(f"Conversion de {len(sources)} fichier(s)...")
    print(f"Entrée : {source_dir}")
    print(f"Sortie : {target_dir}")
    if template:
        print(f"Modèle : {template}")
    print()

    for source in sources:
        target_name = source.with_suffix(".docx").name
        print(f"→ {source.name} -> {target_name}")
        convert_one_file(source, target_dir / target_name, template)

    print()
    print("✅ Conversion terminée.")


if __name__ == "__main__":
    main()
|
||||
Reference in New Issue
Block a user