Synchronise les contenus glossaire et ajoute les scripts de conversion DOCX/MDX
This commit is contained in:
241
scripts/convert_docx_to_mdx.py
Executable file
241
scripts/convert_docx_to_mdx.py
Executable file
@@ -0,0 +1,241 @@
|
||||
#!/usr/bin/env python3
|
||||
import argparse
|
||||
import os
|
||||
import re
|
||||
import shutil
|
||||
import subprocess
|
||||
import sys
|
||||
from pathlib import Path
|
||||
|
||||
try:
|
||||
import yaml
|
||||
except ImportError:
|
||||
print("Erreur : PyYAML n'est pas installé. Lance : pip3 install pyyaml")
|
||||
sys.exit(1)
|
||||
|
||||
|
||||
# Frontmatter defaults stamped into every generated MDX file.
EDITION = "archicrat-ia"  # edition identifier
STATUS = "essai_these"    # editorial status of the content
VERSION = "0.1.0"         # content version stamp


# Ordering weights for the `order` frontmatter key; steps of 10 leave
# room to insert new sections later without renumbering existing ones.
# Slugs absent from this map fall back to 999 in convert_one_file.
ORDER_MAP = {
    "prologue": 10,
    "chapitre-1": 20,
    "chapitre-2": 30,
    "chapitre-3": 40,
    "chapitre-4": 50,
    "chapitre-5": 60,
    "conclusion": 70,
}


# Display titles keyed by canonical slug; used as the `title`
# frontmatter value when the slug is recognised, otherwise the title
# detected in the document (or the filename stem) is used instead.
TITLE_MAP = {
    "prologue": "Prologue — Fondation, finalité sociopolitique et historique",
    "chapitre-1": "Chapitre 1 — Fondements épistémologiques et modélisation",
    "chapitre-2": "Chapitre 2 — Archéogenèse des régimes de co-viabilité",
    "chapitre-3": "Chapitre 3 — Philosophies du pouvoir et archicration",
    "chapitre-4": "Chapitre 4 — Histoire archicratique des révolutions industrielles",
    "chapitre-5": "Chapitre 5 — Tensions, co-viabilités et régulations",
    "conclusion": "Conclusion — ArchiCraT-IA",
}
|
||||
|
||||
|
||||
def slugify_name(path: Path) -> str:
    """Derive a canonical chapter slug from a DOCX filename.

    Lowercases the stem, transliterates French accented characters to
    ASCII, collapses separators to single hyphens, then maps the result
    onto one of the known slugs (prologue, chapitre-1..5, conclusion)
    when the cleaned name contains it; otherwise returns the cleaned
    stem as-is.
    """
    stem = path.stem.lower().strip()

    # Single-character transliteration: separators become hyphens,
    # accented letters become plain ASCII, apostrophes are dropped.
    # str.translate does this in one C-level pass.
    table = str.maketrans({
        " ": "-", "_": "-", "—": "-", "–": "-",
        "é": "e", "è": "e", "ê": "e", "ë": "e",
        "à": "a", "â": "a", "ä": "a",
        "î": "i", "ï": "i",
        "ô": "o", "ö": "o",
        "ù": "u", "û": "u", "ü": "u",
        "ç": "c",
        "'": "", "’": "",
    })
    stem = stem.translate(table)

    # Collapse hyphen runs and trim leading/trailing hyphens.
    stem = re.sub(r"-+", "-", stem).strip("-")

    # Map onto a known canonical slug via substring matching.
    # (The previous chain of `stem.replace("chapitre-N", "chapitre-N")`
    # calls and the long chapitre-1 rewrite were dead code: the substring
    # checks below already matched in every case.)
    for slug in ("prologue", "chapitre-1", "chapitre-2", "chapitre-3",
                 "chapitre-4", "chapitre-5", "conclusion"):
        if slug in stem:
            return slug

    return stem
|
||||
|
||||
|
||||
def extract_title_from_markdown(md_text: str) -> str | None:
|
||||
for line in md_text.splitlines():
|
||||
line = line.strip()
|
||||
if not line:
|
||||
continue
|
||||
if line.startswith("# "):
|
||||
return line[2:].strip()
|
||||
return None
|
||||
|
||||
|
||||
def remove_first_h1(md_text: str) -> str:
    """Drop the first level-1 heading line from *md_text*, keeping any
    later H1 lines, and strip leading whitespace from the result."""
    kept = []
    pending = True

    for raw_line in md_text.splitlines():
        if pending and raw_line.strip().startswith("# "):
            pending = False
        else:
            kept.append(raw_line)

    return "\n".join(kept).lstrip()
|
||||
|
||||
|
||||
def clean_markdown(md_text: str) -> str:
    """Normalise Pandoc output: unify line endings, squeeze blank-line
    runs, drop empty internal-link artifacts, trim trailing whitespace,
    and guarantee exactly one trailing newline."""
    normalized = md_text.replace("\r\n", "\n").replace("\r", "\n")

    substitutions = (
        (r"\n{3,}", "\n\n", 0),              # squeeze runs of blank lines
        (r"\[\]\(#.*?\)", "", 0),            # empty Pandoc internal-link anchors
        (r"[ \t]+$", "", re.MULTILINE),      # trailing spaces/tabs per line
    )
    for pattern, replacement, flags in substitutions:
        normalized = re.sub(pattern, replacement, normalized, flags=flags)

    return normalized.strip() + "\n"
|
||||
|
||||
|
||||
def compute_level(slug: str) -> int:
    """Return the hierarchy level for a chapter *slug*.

    Every section of this edition (prologue, chapters, conclusion) sits
    at level 1, so the result is currently constant.  The parameter is
    kept so per-slug levels can be reintroduced without changing any
    caller.
    """
    # The previous if-chain returned 1 on every branch; collapsed the
    # dead code into a single return.
    return 1
|
||||
|
||||
|
||||
def convert_one_file(input_docx: Path, output_dir: Path, source_root: Path):
    """Convert one DOCX to an MDX file with a YAML frontmatter block.

    Runs pandoc (docx -> GitHub-flavoured Markdown with smart typography),
    strips the first H1 (its text serves as the fallback `title`), cleans
    the Markdown body, then writes `<slug>.mdx` into *output_dir*.

    Raises subprocess.CalledProcessError if pandoc fails.
    """
    slug = slugify_name(input_docx)
    output_mdx = output_dir / f"{slug}.mdx"

    cmd = [
        "pandoc",
        str(input_docx),
        "-f",
        "docx",
        "-t",
        "gfm+smart",
    ]

    result = subprocess.run(cmd, check=True, capture_output=True, text=True)
    md_text = result.stdout

    detected_title = extract_title_from_markdown(md_text)
    md_body = remove_first_h1(md_text)
    md_body = clean_markdown(md_body)

    title = TITLE_MAP.get(slug) or detected_title or input_docx.stem
    order = ORDER_MAP.get(slug, 999)
    level = compute_level(slug)

    # Record the source path relative to the scanned root when possible;
    # fall back to the bare filename for files outside that root.
    # (Removed the dead `relative_source = input_docx` pre-assignment:
    # both branches below always rebind the name.)
    try:
        relative_source = input_docx.relative_to(source_root)
    except ValueError:
        relative_source = input_docx.name

    frontmatter = {
        "title": title,
        "edition": EDITION,
        "status": STATUS,
        "level": level,
        "version": VERSION,
        "concepts": [],
        "links": [],
        "order": order,
        "summary": "",
        "source": {
            "kind": "docx",
            "path": str(relative_source),
        },
    }

    yaml_block = yaml.safe_dump(
        frontmatter,
        allow_unicode=True,
        sort_keys=False,
        default_flow_style=False,
    ).strip()

    # Ensure exactly one blank line between the closing frontmatter
    # fence and the body.  clean_markdown() strips leading whitespace,
    # so the body normally needs the newline prepended.  (Replaces the
    # old chr(10) workaround for backslashes in f-string expressions.)
    body = md_body if md_body.startswith("\n") else "\n" + md_body
    final_text = f"---\n{yaml_block}\n---\n{body}"
    output_mdx.write_text(final_text, encoding="utf-8")
    print(f"✅ {input_docx.name} -> {output_mdx.name}")
|
||||
|
||||
|
||||
def main():
    """CLI entry point: convert every *.docx in input_dir to MDX."""
    parser = argparse.ArgumentParser(description="Convertit un dossier DOCX en MDX avec frontmatter.")
    parser.add_argument("input_dir", help="Dossier source contenant les DOCX")
    parser.add_argument("output_dir", help="Dossier de sortie pour les MDX")
    args = parser.parse_args()

    source_dir = Path(args.input_dir).expanduser().resolve()
    target_dir = Path(args.output_dir).expanduser().resolve()

    # Fail fast on missing prerequisites.
    if shutil.which("pandoc") is None:
        print("Erreur : pandoc n'est pas installé. Lance : brew install pandoc")
        sys.exit(1)

    if not (source_dir.exists() and source_dir.is_dir()):
        print(f"Erreur : dossier source introuvable : {source_dir}")
        sys.exit(1)

    target_dir.mkdir(parents=True, exist_ok=True)

    documents = sorted(source_dir.glob("*.docx"))
    if not documents:
        print(f"Aucun DOCX trouvé dans : {source_dir}")
        sys.exit(1)

    for document in documents:
        convert_one_file(document, target_dir, source_dir)

    print()
    print("Conversion DOCX -> MDX terminée.")


if __name__ == "__main__":
    main()
|
||||
304
scripts/convert_mdx_to_docx.py
Normal file
304
scripts/convert_mdx_to_docx.py
Normal file
@@ -0,0 +1,304 @@
|
||||
#!/usr/bin/env python3
|
||||
import argparse
|
||||
import re
|
||||
import shutil
|
||||
import subprocess
|
||||
import sys
|
||||
import tempfile
|
||||
from pathlib import Path
|
||||
import zipfile
|
||||
|
||||
try:
|
||||
import yaml
|
||||
except ImportError:
|
||||
print("Erreur : PyYAML n'est pas installé. Lance : pip3 install pyyaml")
|
||||
sys.exit(1)
|
||||
|
||||
try:
|
||||
from docx import Document
|
||||
except ImportError:
|
||||
print("Erreur : python-docx n'est pas installé. Lance : pip3 install python-docx")
|
||||
sys.exit(1)
|
||||
|
||||
|
||||
def split_frontmatter(text: str):
    """Split *text* into a (metadata dict, body) pair.

    Returns ({}, text) unchanged when no leading `---` YAML fence is
    present or the fence is never closed; returns ({}, body) with a
    printed warning when the YAML block cannot be parsed.
    """
    if not text.startswith("---\n"):
        return {}, text

    parsed = re.match(r"^---\n(.*?)\n---\n(.*)$", text, flags=re.DOTALL)
    if parsed is None:
        return {}, text

    raw_yaml, body = parsed.groups()

    try:
        metadata = yaml.safe_load(raw_yaml) or {}
    except Exception as e:
        print(f"Avertissement : frontmatter YAML illisible : {e}")
        metadata = {}

    return metadata, body
|
||||
|
||||
|
||||
def strip_mdx_artifacts(text: str):
    """Remove MDX-only syntax so the text becomes plain Markdown.

    Drops import/export statements, self-closing and block-level JSX
    components (capitalised tags), leftover empty `{}` lines, squeezes
    blank-line runs, and guarantees a single trailing newline.
    """
    cleanups = (
        # import / export MDX statements
        (r"^\s*(import|export)\s+.+?$", re.MULTILINE),
        # self-closing components: <Component ... />
        (r"<[A-Z][A-Za-z0-9._-]*\b[^>]*\/>", 0),
        # block components: <Component ...>...</Component>
        (r"<([A-Z][A-Za-z0-9._-]*)\b[^>]*>.*?</\1>", re.DOTALL),
        # stray empty braces alone on a line
        (r"^\s*{\s*}\s*$", re.MULTILINE),
        # collapse runs of blank lines
        (r"\n{3,}", 0),
    )
    for pattern, flags in cleanups:
        replacement = "\n\n" if pattern == r"\n{3,}" else ""
        text = re.sub(pattern, replacement, text, flags=flags)

    return text.strip() + "\n"
|
||||
|
||||
|
||||
def inject_h1_from_title(metadata: dict, body: str):
    """Prepend `# <title>` taken from *metadata* unless no title is
    available or the body already opens with a level-1 heading."""
    title = metadata.get("title", "")
    already_has_h1 = bool(re.match(r"^\s*#\s+", body))

    if not title or already_has_h1:
        return body

    return f"# {title}\n\n{body.lstrip()}"
|
||||
|
||||
|
||||
def find_style_by_candidates(doc, candidates):
    """Return the first style in *doc* matching one of *candidates*.

    Matching is attempted first on the visible style name, then on
    Word's internal style_id (BodyText, Heading1-4) via the known
    English/French name equivalences.  Returns None when nothing
    matches.
    """
    wanted = set(candidates)

    # Pass 1: match on the visible style name.
    for style in doc.styles:
        if style.name in wanted:
            return style

    # Pass 2: match on the internal style_id through name aliases.
    id_aliases = {
        "BodyText": {"Body Text", "Corps de texte"},
        "Heading1": {"Heading 1", "Titre 1"},
        "Heading2": {"Heading 2", "Titre 2"},
        "Heading3": {"Heading 3", "Titre 3"},
        "Heading4": {"Heading 4", "Titre 4"},
    }
    for style in doc.styles:
        aliases = id_aliases.get(getattr(style, "style_id", ""))
        if aliases and aliases & wanted:
            return style

    return None
|
||||
|
||||
def strip_leading_paragraph_numbers(text: str):
    """Remove paragraph numbers such as "2. ", "11. ", "101. " from the
    start of each line, leaving Markdown headings (#, ##, ###) intact.
    The result always ends with a newline."""
    number_prefix = re.compile(r"^\s*\d+\.\s+")

    cleaned = [
        line if line.lstrip().startswith("#") else number_prefix.sub("", line)
        for line in text.splitlines()
    ]

    return "\n".join(cleaned) + "\n"
|
||||
|
||||
def normalize_non_heading_paragraphs(docx_path: Path):
    """Rewrite *docx_path* in place so every non-empty, non-heading
    paragraph uses the Body Text / Corps de texte style.

    Heading 1-4 paragraphs (matched by English or French visible name,
    or by internal style_id) are left untouched.  Prints a warning and
    returns without saving when the body style cannot be found.
    """
    document = Document(str(docx_path))

    target_style = find_style_by_candidates(document, ["Body Text", "Corps de texte"])
    if target_style is None:
        print(f"Avertissement : style 'Body Text / Corps de texte' introuvable dans {docx_path.name}")
        return

    # Heading styles recognised by visible name (EN/FR) or internal id.
    protected_names = {
        "Heading 1", "Heading 2", "Heading 3", "Heading 4",
        "Titre 1", "Titre 2", "Titre 3", "Titre 4",
    }
    protected_ids = {"Heading1", "Heading2", "Heading3", "Heading4"}

    restyled = 0

    for paragraph in document.paragraphs:
        if not paragraph.text.strip():
            continue

        style = paragraph.style
        visible_name = style.name if style else ""
        internal_id = getattr(style, "style_id", "") if style else ""

        if visible_name in protected_names or internal_id in protected_ids:
            continue

        # Everything else is forced to Body Text.
        paragraph.style = target_style
        restyled += 1

    document.save(str(docx_path))
    print(f" ↳ normalisation styles : {restyled} paragraphe(s) mis en 'Body Text / Corps de texte'")
|
||||
|
||||
def remove_word_bookmarks(docx_path: Path):
    """Strip Word bookmark markers from the DOCX at *docx_path* in place.

    Bookmarks (w:bookmarkStart / w:bookmarkEnd) appear as grey brackets
    in LibreOffice/Word when bookmark display is enabled.  The DOCX is a
    plain ZIP archive: it is unpacked, the relevant XML parts are
    filtered, and the archive is rebuilt and swapped over the original.
    """
    bookmark_patterns = (r"<w:bookmarkStart\b[^>]*/>", r"<w:bookmarkEnd\b[^>]*/>")
    part_names = ("document.xml", "footnotes.xml", "endnotes.xml", "comments.xml")

    with tempfile.TemporaryDirectory() as scratch_name:
        scratch = Path(scratch_name)

        # Unpack the DOCX archive.
        with zipfile.ZipFile(docx_path, "r") as archive:
            archive.extractall(scratch)

        removed = 0

        for part_name in part_names:
            xml_path = scratch / "word" / part_name
            if not xml_path.exists():
                continue

            content = xml_path.read_text(encoding="utf-8")

            # Drop <w:bookmarkStart .../> then <w:bookmarkEnd .../>.
            for pattern in bookmark_patterns:
                content, count = re.subn(pattern, "", content)
                removed += count

            xml_path.write_text(content, encoding="utf-8")

        # Repack next to the original, then swap it in.
        rebuilt = docx_path.with_suffix(".cleaned.docx")
        with zipfile.ZipFile(rebuilt, "w", zipfile.ZIP_DEFLATED) as archive:
            for entry in scratch.rglob("*"):
                if entry.is_file():
                    archive.write(entry, entry.relative_to(scratch))

        rebuilt.replace(docx_path)

    print(f" ↳ suppression signets : {removed} balise(s) supprimée(s)")
|
||||
|
||||
def convert_one_file(input_path: Path, output_path: Path, reference_doc: Path | None):
    """Convert one MDX file to a DOCX via pandoc.

    Pipeline: split off the YAML frontmatter, strip MDX-only syntax,
    remove leading paragraph numbers, re-inject the frontmatter title as
    an H1, run pandoc on a temporary Markdown file, then post-process
    the resulting DOCX (paragraph-style normalisation and bookmark
    removal).

    Raises subprocess.CalledProcessError if pandoc fails.
    """
    raw = input_path.read_text(encoding="utf-8")
    metadata, body = split_frontmatter(raw)
    body = strip_mdx_artifacts(body)
    body = strip_leading_paragraph_numbers(body)
    body = inject_h1_from_title(metadata, body)

    # Pandoc needs a real file: delete=False so it survives the `with`
    # block; it is removed in the `finally` below.
    with tempfile.NamedTemporaryFile("w", suffix=".md", delete=False, encoding="utf-8") as tmp:
        tmp.write(body)
        tmp_md = Path(tmp.name)

    cmd = [
        "pandoc",
        str(tmp_md),
        "-f",
        "markdown",
        "-o",
        str(output_path),
    ]

    # Optional Word template controlling the output document's styles.
    if reference_doc:
        cmd.extend(["--reference-doc", str(reference_doc)])

    try:
        subprocess.run(cmd, check=True)
    finally:
        # Always clean up the temporary Markdown file, even when pandoc
        # fails; tolerate it already being gone.
        try:
            tmp_md.unlink()
        except FileNotFoundError:
            pass

    normalize_non_heading_paragraphs(output_path)
    remove_word_bookmarks(output_path)
|
||||
|
||||
def main():
    """CLI entry point: convert every *.mdx in input_dir to DOCX."""
    parser = argparse.ArgumentParser(
        description="Convertit des fichiers MDX en DOCX en conservant H1/H2/H3/H4 et en forçant le corps en Body Text."
    )
    parser.add_argument("input_dir", help="Dossier contenant les .mdx")
    parser.add_argument(
        "--output-dir",
        default=str(Path.home() / "Desktop" / "archicrat-ia-docx"),
        help="Dossier de sortie DOCX"
    )
    parser.add_argument(
        "--reference-doc",
        default=None,
        help="DOCX modèle Word à utiliser comme reference-doc"
    )

    args = parser.parse_args()

    source_dir = Path(args.input_dir)
    target_dir = Path(args.output_dir)
    template = Path(args.reference_doc) if args.reference_doc else None

    # Fail fast on missing prerequisites.
    if shutil.which("pandoc") is None:
        print("Erreur : pandoc n'est pas installé. Installe-le avec : brew install pandoc")
        sys.exit(1)

    if not (source_dir.exists() and source_dir.is_dir()):
        print(f"Erreur : dossier introuvable : {source_dir}")
        sys.exit(1)

    if template and not template.exists():
        print(f"Erreur : reference-doc introuvable : {template}")
        sys.exit(1)

    target_dir.mkdir(parents=True, exist_ok=True)

    sources = sorted(source_dir.glob("*.mdx"))
    if not sources:
        print(f"Aucun fichier .mdx trouvé dans : {source_dir}")
        sys.exit(1)

    print(f"Conversion de {len(sources)} fichier(s)...")
    print(f"Entrée : {source_dir}")
    print(f"Sortie : {target_dir}")
    if template:
        print(f"Modèle : {template}")
    print()

    for source in sources:
        target_name = source.with_suffix(".docx").name
        print(f"→ {source.name} -> {target_name}")
        convert_one_file(source, target_dir / target_name, template)

    print()
    print("✅ Conversion terminée.")


if __name__ == "__main__":
    main()
|
||||
Reference in New Issue
Block a user