Files
archicratie-edition/scripts/convert_docx_to_mdx.py
Archicratia 5b427d5602
All checks were successful
SMOKE / smoke (push) Successful in 4s
CI / build-and-anchors (push) Successful in 37s
CI / build-and-anchors (pull_request) Successful in 34s
Synchronise les contenus glossaire et ajoute les scripts de conversion DOCX/MDX
2026-04-23 12:04:31 +02:00

242 lines
6.2 KiB
Python
Executable File
Raw Blame History

This file contains ambiguous Unicode characters
This file contains Unicode characters that might be confused with other characters. If you think that this is intentional, you can safely ignore this warning. Use the Escape button to reveal them.
#!/usr/bin/env python3
import argparse
import os
import re
import shutil
import subprocess
import sys
from pathlib import Path
try:
import yaml
except ImportError:
print("Erreur : PyYAML n'est pas installé. Lance : pip3 install pyyaml")
sys.exit(1)
EDITION = "archicrat-ia"
STATUS = "essai_these"
VERSION = "0.1.0"
ORDER_MAP = {
"prologue": 10,
"chapitre-1": 20,
"chapitre-2": 30,
"chapitre-3": 40,
"chapitre-4": 50,
"chapitre-5": 60,
"conclusion": 70,
}
TITLE_MAP = {
"prologue": "Prologue — Fondation, finalité sociopolitique et historique",
"chapitre-1": "Chapitre 1 — Fondements épistémologiques et modélisation",
"chapitre-2": "Chapitre 2 — Archéogenèse des régimes de co-viabilité",
"chapitre-3": "Chapitre 3 — Philosophies du pouvoir et archicration",
"chapitre-4": "Chapitre 4 — Histoire archicratique des révolutions industrielles",
"chapitre-5": "Chapitre 5 — Tensions, co-viabilités et régulations",
"conclusion": "Conclusion — ArchiCraT-IA",
}
def slugify_name(path: Path) -> str:
stem = path.stem.lower().strip()
replacements = {
" ": "-",
"_": "-",
"": "-",
"": "-",
"é": "e",
"è": "e",
"ê": "e",
"ë": "e",
"à": "a",
"â": "a",
"ä": "a",
"î": "i",
"ï": "i",
"ô": "o",
"ö": "o",
"ù": "u",
"û": "u",
"ü": "u",
"ç": "c",
"'": "",
"": "",
}
for old, new in replacements.items():
stem = stem.replace(old, new)
stem = re.sub(r"-+", "-", stem).strip("-")
# normalisations spécifiques
stem = stem.replace("chapitre-1-fondements-epistemologiques-et-modelisation-archicratie-version-officielle-revise", "chapitre-1")
stem = stem.replace("chapitre-2", "chapitre-2")
stem = stem.replace("chapitre-3", "chapitre-3")
stem = stem.replace("chapitre-4", "chapitre-4")
stem = stem.replace("chapitre-5", "chapitre-5")
if "prologue" in stem:
return "prologue"
if "chapitre-1" in stem:
return "chapitre-1"
if "chapitre-2" in stem:
return "chapitre-2"
if "chapitre-3" in stem:
return "chapitre-3"
if "chapitre-4" in stem:
return "chapitre-4"
if "chapitre-5" in stem:
return "chapitre-5"
if "conclusion" in stem:
return "conclusion"
return stem
def extract_title_from_markdown(md_text: str) -> str | None:
for line in md_text.splitlines():
line = line.strip()
if not line:
continue
if line.startswith("# "):
return line[2:].strip()
return None
def remove_first_h1(md_text: str) -> str:
lines = md_text.splitlines()
out = []
removed = False
for line in lines:
if not removed and line.strip().startswith("# "):
removed = True
continue
out.append(line)
text = "\n".join(out).lstrip()
return text
def clean_markdown(md_text: str) -> str:
text = md_text.replace("\r\n", "\n").replace("\r", "\n")
# nettoyer espaces multiples
text = re.sub(r"\n{3,}", "\n\n", text)
# supprimer éventuels signets/artefacts de liens internes Pandoc
text = re.sub(r"\[\]\(#.*?\)", "", text)
# convertir astérismes parasites
text = re.sub(r"[ \t]+$", "", text, flags=re.MULTILINE)
return text.strip() + "\n"
def compute_level(slug: str) -> int:
if slug == "prologue":
return 1
if slug.startswith("chapitre-"):
return 1
if slug == "conclusion":
return 1
return 1
def convert_one_file(input_docx: Path, output_dir: Path, source_root: Path):
slug = slugify_name(input_docx)
output_mdx = output_dir / f"{slug}.mdx"
cmd = [
"pandoc",
str(input_docx),
"-f",
"docx",
"-t",
"gfm+smart",
]
result = subprocess.run(cmd, check=True, capture_output=True, text=True)
md_text = result.stdout
detected_title = extract_title_from_markdown(md_text)
md_body = remove_first_h1(md_text)
md_body = clean_markdown(md_body)
title = TITLE_MAP.get(slug) or detected_title or input_docx.stem
order = ORDER_MAP.get(slug, 999)
level = compute_level(slug)
relative_source = input_docx
try:
relative_source = input_docx.relative_to(source_root)
except ValueError:
relative_source = input_docx.name
frontmatter = {
"title": title,
"edition": EDITION,
"status": STATUS,
"level": level,
"version": VERSION,
"concepts": [],
"links": [],
"order": order,
"summary": "",
"source": {
"kind": "docx",
"path": str(relative_source),
},
}
yaml_block = yaml.safe_dump(
frontmatter,
allow_unicode=True,
sort_keys=False,
default_flow_style=False,
).strip()
final_text = f"---\n{yaml_block}\n---\n{md_body if md_body.startswith(chr(10)) else chr(10) + md_body}"
output_mdx.write_text(final_text, encoding="utf-8")
print(f"{input_docx.name} -> {output_mdx.name}")
def main():
parser = argparse.ArgumentParser(description="Convertit un dossier DOCX en MDX avec frontmatter.")
parser.add_argument("input_dir", help="Dossier source contenant les DOCX")
parser.add_argument("output_dir", help="Dossier de sortie pour les MDX")
args = parser.parse_args()
input_dir = Path(args.input_dir).expanduser().resolve()
output_dir = Path(args.output_dir).expanduser().resolve()
if not shutil.which("pandoc"):
print("Erreur : pandoc n'est pas installé. Lance : brew install pandoc")
sys.exit(1)
if not input_dir.exists() or not input_dir.is_dir():
print(f"Erreur : dossier source introuvable : {input_dir}")
sys.exit(1)
output_dir.mkdir(parents=True, exist_ok=True)
docx_files = sorted(input_dir.glob("*.docx"))
if not docx_files:
print(f"Aucun DOCX trouvé dans : {input_dir}")
sys.exit(1)
for docx_file in docx_files:
convert_one_file(docx_file, output_dir, input_dir)
print()
print("Conversion DOCX -> MDX terminée.")
if __name__ == "__main__":
main()