242 lines
6.2 KiB
Python
Executable File
242 lines
6.2 KiB
Python
Executable File
#!/usr/bin/env python3
|
||
import argparse
|
||
import os
|
||
import re
|
||
import shutil
|
||
import subprocess
|
||
import sys
|
||
from pathlib import Path
|
||
|
||
try:
|
||
import yaml
|
||
except ImportError:
|
||
print("Erreur : PyYAML n'est pas installé. Lance : pip3 install pyyaml")
|
||
sys.exit(1)
|
||
|
||
|
||
# Fixed metadata values copied into the frontmatter of every generated MDX
# file (the "edition", "status" and "version" keys in convert_one_file).
EDITION = "archicrat-ia"
STATUS = "essai_these"
VERSION = "0.1.0"
|
||
|
||
|
||
# Value written to the "order" frontmatter key for each known slug.
# convert_one_file falls back to 999 for slugs that are not listed here.
ORDER_MAP = {
    "prologue": 10,
    "chapitre-1": 20,
    "chapitre-2": 30,
    "chapitre-3": 40,
    "chapitre-4": 50,
    "chapitre-5": 60,
    "conclusion": 70,
}
|
||
|
||
|
||
# Title written to the frontmatter for each known slug. convert_one_file uses
# these in preference to the first H1 found in the converted Markdown and to
# the DOCX file stem.
TITLE_MAP = {
    "prologue": "Prologue — Fondation, finalité sociopolitique et historique",
    "chapitre-1": "Chapitre 1 — Fondements épistémologiques et modélisation",
    "chapitre-2": "Chapitre 2 — Archéogenèse des régimes de co-viabilité",
    "chapitre-3": "Chapitre 3 — Philosophies du pouvoir et archicration",
    "chapitre-4": "Chapitre 4 — Histoire archicratique des révolutions industrielles",
    "chapitre-5": "Chapitre 5 — Tensions, co-viabilités et régulations",
    "conclusion": "Conclusion — ArchiCraT-IA",
}
|
||
|
||
|
||
def slugify_name(path: Path) -> str:
    """Derive the output slug for a source document from its file name.

    The file stem is lower-cased, accents are removed, spaces, underscores
    and dashes become single hyphens, and apostrophes are dropped. Stems
    that mention a known section are then reduced to its canonical slug:
    ``prologue``, ``chapitre-1`` to ``chapitre-5``, or ``conclusion``.
    Any other stem is returned in its normalised form.

    Args:
        path: Path of the source document; only its stem is used.

    Returns:
        The canonical slug for known sections, otherwise the normalised stem.
    """
    # Imported locally so this function brings its own dependency into scope.
    import unicodedata

    stem = path.stem.lower().strip()

    # Decompose accented letters and drop the combining marks. Unlike a fixed
    # table of precomposed characters, this also handles file names stored in
    # decomposed (NFD) form, where "é" is "e" followed by U+0301.
    decomposed = unicodedata.normalize("NFD", stem)
    stem = "".join(ch for ch in decomposed if unicodedata.category(ch) != "Mn")

    # Separators become hyphens; straight and curly apostrophes are removed.
    punctuation = str.maketrans(
        {
            " ": "-",
            "_": "-",
            "—": "-",
            "–": "-",
            "'": None,
            "’": None,
        }
    )
    stem = stem.translate(punctuation)
    stem = re.sub(r"-+", "-", stem).strip("-")

    if "prologue" in stem:
        return "prologue"

    # The negative lookahead keeps "chapitre-10" from being mistaken for
    # "chapitre-1". Chapters are tested in ascending order, as before.
    for number in range(1, 6):
        if re.search(rf"chapitre-{number}(?!\d)", stem):
            return f"chapitre-{number}"

    if "conclusion" in stem:
        return "conclusion"

    return stem
|
||
|
||
|
||
def extract_title_from_markdown(md_text: str) -> str | None:
|
||
for line in md_text.splitlines():
|
||
line = line.strip()
|
||
if not line:
|
||
continue
|
||
if line.startswith("# "):
|
||
return line[2:].strip()
|
||
return None
|
||
|
||
|
||
def remove_first_h1(md_text: str) -> str:
    """Return ``md_text`` without its first level-1 heading line.

    The first line whose stripped form starts with ``"# "`` is dropped. The
    remaining lines are joined with ``"\\n"`` and leading whitespace is
    removed from the result.

    Args:
        md_text: Markdown text to process.

    Returns:
        The text without its first H1 line and without leading whitespace.
    """
    rows = md_text.splitlines()
    h1_index = next(
        (idx for idx, row in enumerate(rows) if row.strip().startswith("# ")),
        None,
    )
    if h1_index is not None:
        del rows[h1_index]
    return "\n".join(rows).lstrip()
|
||
|
||
|
||
def clean_markdown(md_text: str) -> str:
    """Normalise Markdown text before it is written to an MDX file.

    Steps, in order:
      1. convert CRLF and lone CR line endings to LF;
      2. remove empty-text internal links of the form ``[](#anchor)``;
      3. strip trailing spaces and tabs from every line;
      4. collapse runs of three or more newlines into one blank line.

    Blank lines are collapsed last on purpose: steps 2 and 3 can empty a
    line, and those newly empty lines must be collapsed as well.

    Args:
        md_text: Markdown text to clean.

    Returns:
        The cleaned text, stripped and terminated by exactly one newline.
    """
    text = md_text.replace("\r\n", "\n").replace("\r", "\n")

    # Drop empty-text internal links such as "[](#_Toc123)".
    text = re.sub(r"\[\]\(#.*?\)", "", text)

    # Strip trailing spaces and tabs, including any left by the step above.
    text = re.sub(r"[ \t]+$", "", text, flags=re.MULTILINE)

    # Collapse multiple blank lines into a single one.
    text = re.sub(r"\n{3,}", "\n\n", text)

    return text.strip() + "\n"
|
||
|
||
|
||
def compute_level(slug: str) -> int:
    """Return the value written to the frontmatter ``level`` key for a slug.

    Every slug maps to level 1. The previous implementation tested for
    ``prologue``, ``chapitre-*`` and ``conclusion`` separately, but every
    branch and the fallback returned the same value.

    Args:
        slug: Document slug. Currently unused; kept so the signature is
            unchanged and per-slug levels can be introduced later.

    Returns:
        Always ``1``.
    """
    return 1
|
||
|
||
|
||
def convert_one_file(input_docx: Path, output_dir: Path, source_root: Path) -> None:
    """Convert one DOCX file into an MDX file with YAML frontmatter.

    Pandoc converts the document to GitHub-flavoured Markdown. The first H1
    is removed from the body, the body is cleaned, and the result is written
    to ``<output_dir>/<slug>.mdx`` behind a frontmatter block.

    Args:
        input_docx: DOCX file to convert.
        output_dir: Directory that receives the ``.mdx`` file.
        source_root: Directory used to express the source path relatively in
            the frontmatter; when ``input_docx`` is not located under it,
            only the file name is recorded.

    Raises:
        subprocess.CalledProcessError: If pandoc exits with a non-zero
            status. Pandoc's error output is printed to stderr first.
    """
    slug = slugify_name(input_docx)
    output_mdx = output_dir / f"{slug}.mdx"

    cmd = ["pandoc", str(input_docx), "-f", "docx", "-t", "gfm+smart"]

    try:
        # Pandoc always emits UTF-8, so decode explicitly instead of relying
        # on the locale's preferred encoding.
        result = subprocess.run(
            cmd,
            check=True,
            capture_output=True,
            text=True,
            encoding="utf-8",
        )
    except subprocess.CalledProcessError as exc:
        # The output is captured, so pandoc's diagnostics would otherwise
        # never reach the user.
        if exc.stderr:
            print(exc.stderr.strip(), file=sys.stderr)
        raise

    md_text = result.stdout

    detected_title = extract_title_from_markdown(md_text)
    md_body = clean_markdown(remove_first_h1(md_text))

    title = TITLE_MAP.get(slug) or detected_title or input_docx.stem
    order = ORDER_MAP.get(slug, 999)
    level = compute_level(slug)

    try:
        relative_source = input_docx.relative_to(source_root)
    except ValueError:
        relative_source = input_docx.name

    frontmatter = {
        "title": title,
        "edition": EDITION,
        "status": STATUS,
        "level": level,
        "version": VERSION,
        "concepts": [],
        "links": [],
        "order": order,
        "summary": "",
        "source": {
            "kind": "docx",
            "path": str(relative_source),
        },
    }

    yaml_block = yaml.safe_dump(
        frontmatter,
        allow_unicode=True,
        sort_keys=False,
        default_flow_style=False,
    ).strip()

    # Keep a blank line between the closing "---" and the body.
    separator = "" if md_body.startswith("\n") else "\n"
    final_text = f"---\n{yaml_block}\n---\n{separator}{md_body}"
    output_mdx.write_text(final_text, encoding="utf-8")
    print(f"✅ {input_docx.name} -> {output_mdx.name}")
|
||
|
||
|
||
def main() -> None:
    """Command-line entry point: convert every DOCX in a folder to MDX.

    Takes two positional arguments, the source folder and the output folder.
    Exits with status 1 when pandoc is not on the PATH, when the source
    folder does not exist, or when it contains no DOCX file to convert. The
    output folder is created if needed.
    """
    parser = argparse.ArgumentParser(description="Convertit un dossier DOCX en MDX avec frontmatter.")
    parser.add_argument("input_dir", help="Dossier source contenant les DOCX")
    parser.add_argument("output_dir", help="Dossier de sortie pour les MDX")
    args = parser.parse_args()

    input_dir = Path(args.input_dir).expanduser().resolve()
    output_dir = Path(args.output_dir).expanduser().resolve()

    if not shutil.which("pandoc"):
        print("Erreur : pandoc n'est pas installé. Lance : brew install pandoc")
        sys.exit(1)

    # is_dir() is already False for a path that does not exist.
    if not input_dir.is_dir():
        print(f"Erreur : dossier source introuvable : {input_dir}")
        sys.exit(1)

    output_dir.mkdir(parents=True, exist_ok=True)

    # Word keeps a "~$<name>.docx" owner file next to any open document.
    # It matches the glob but is not a real DOCX, so pandoc would fail on it.
    docx_files = sorted(
        candidate
        for candidate in input_dir.glob("*.docx")
        if not candidate.name.startswith("~$")
    )
    if not docx_files:
        print(f"Aucun DOCX trouvé dans : {input_dir}")
        sys.exit(1)

    for docx_file in docx_files:
        convert_one_file(docx_file, output_dir, input_dir)

    print()
    print("Conversion DOCX -> MDX terminée.")
|
||
|
||
|
||
if __name__ == "__main__":
|
||
main()
|