#!/usr/bin/env python3 import argparse import re import shutil import subprocess import sys import tempfile from pathlib import Path import zipfile try: import yaml except ImportError: print("Erreur : PyYAML n'est pas installé. Lance : pip3 install pyyaml") sys.exit(1) try: from docx import Document except ImportError: print("Erreur : python-docx n'est pas installé. Lance : pip3 install python-docx") sys.exit(1) def split_frontmatter(text: str): if not text.startswith("---\n"): return {}, text match = re.match(r"^---\n(.*?)\n---\n(.*)$", text, flags=re.DOTALL) if not match: return {}, text yaml_block = match.group(1) body = match.group(2) try: metadata = yaml.safe_load(yaml_block) or {} except Exception as e: print(f"Avertissement : frontmatter YAML illisible : {e}") metadata = {} return metadata, body def strip_mdx_artifacts(text: str): # imports / exports MDX text = re.sub(r"^\s*(import|export)\s+.+?$", "", text, flags=re.MULTILINE) # composants autofermants : text = re.sub(r"<[A-Z][A-Za-z0-9._-]*\b[^>]*\/>", "", text) # composants bloc : ... text = re.sub( r"<([A-Z][A-Za-z0-9._-]*)\b[^>]*>.*?", "", text, flags=re.DOTALL, ) # accolades seules résiduelles sur ligne text = re.sub(r"^\s*{\s*}\s*$", "", text, flags=re.MULTILINE) # lignes vides multiples text = re.sub(r"\n{3,}", "\n\n", text) return text.strip() + "\n" def inject_h1_from_title(metadata: dict, body: str): title = metadata.get("title", "") if not title: return body if re.match(r"^\s*#\s+", body): return body return f"# {title}\n\n{body.lstrip()}" def find_style_by_candidates(doc, candidates): # Cherche d'abord par nom visible for style in doc.styles: for candidate in candidates: if style.name == candidate: return style # Puis par style_id Word interne for style in doc.styles: style_id = getattr(style, "style_id", "") if style_id in {"BodyText", "Heading1", "Heading2", "Heading3", "Heading4"}: for candidate in candidates: if candidate in {"Body Text", "Corps de texte"} and style_id == "BodyText": return style if candidate in {"Heading 1", "Titre 1"} and style_id == "Heading1": return style if candidate in {"Heading 2", "Titre 2"} and style_id == "Heading2": return style if candidate in {"Heading 3", "Titre 3"} and style_id == "Heading3": return style if candidate in {"Heading 4", "Titre 4"} and style_id == "Heading4": return style return None def strip_leading_paragraph_numbers(text: str): """ Supprime les numéros de paragraphe du type : 2. Texte... 11. Texte... 101. Texte... sans toucher aux titres Markdown (#, ##, ###). """ fixed_lines = [] for line in text.splitlines(): stripped = line.lstrip() # Ne jamais toucher aux titres Markdown if stripped.startswith("#"): fixed_lines.append(line) continue # Supprime un numéro de paragraphe en début de ligne line = re.sub(r"^\s*\d+\.\s+", "", line) fixed_lines.append(line) return "\n".join(fixed_lines) + "\n" def normalize_non_heading_paragraphs(docx_path: Path): """ Force tous les paragraphes non-titres en Body Text / Corps de texte. On laisse intacts les Heading 1-4. """ doc = Document(str(docx_path)) body_style = find_style_by_candidates(doc, ["Body Text", "Corps de texte"]) if body_style is None: print(f"Avertissement : style 'Body Text / Corps de texte' introuvable dans {docx_path.name}") return heading_names = { "Heading 1", "Heading 2", "Heading 3", "Heading 4", "Titre 1", "Titre 2", "Titre 3", "Titre 4", } heading_ids = {"Heading1", "Heading2", "Heading3", "Heading4"} changed = 0 for para in doc.paragraphs: text = para.text.strip() if not text: continue current_style = para.style current_name = current_style.name if current_style else "" current_id = getattr(current_style, "style_id", "") if current_style else "" if current_name in heading_names or current_id in heading_ids: continue # Tout le reste passe en Body Text para.style = body_style changed += 1 doc.save(str(docx_path)) print(f" ↳ normalisation styles : {changed} paragraphe(s) mis en 'Body Text / Corps de texte'") def remove_word_bookmarks(docx_path: Path): """ Supprime les bookmarks Word (signets) du DOCX. Ce sont eux qui apparaissent comme crochets gris dans LibreOffice/Word quand l'affichage des signets est activé. """ with tempfile.TemporaryDirectory() as tmpdir: tmpdir = Path(tmpdir) # Dézipper le docx with zipfile.ZipFile(docx_path, "r") as zin: zin.extractall(tmpdir) xml_targets = [ tmpdir / "word" / "document.xml", tmpdir / "word" / "footnotes.xml", tmpdir / "word" / "endnotes.xml", tmpdir / "word" / "comments.xml", ] removed = 0 for xml_file in xml_targets: if not xml_file.exists(): continue text = xml_file.read_text(encoding="utf-8") # enlever et text, c1 = re.subn(r"]*/>", "", text) text, c2 = re.subn(r"]*/>", "", text) removed += c1 + c2 xml_file.write_text(text, encoding="utf-8") # Rezipper tmp_output = docx_path.with_suffix(".cleaned.docx") with zipfile.ZipFile(tmp_output, "w", zipfile.ZIP_DEFLATED) as zout: for file in tmpdir.rglob("*"): if file.is_file(): zout.write(file, file.relative_to(tmpdir)) tmp_output.replace(docx_path) print(f" ↳ suppression signets : {removed} balise(s) supprimée(s)") def convert_one_file(input_path: Path, output_path: Path, reference_doc: Path | None): raw = input_path.read_text(encoding="utf-8") metadata, body = split_frontmatter(raw) body = strip_mdx_artifacts(body) body = strip_leading_paragraph_numbers(body) body = inject_h1_from_title(metadata, body) with tempfile.NamedTemporaryFile("w", suffix=".md", delete=False, encoding="utf-8") as tmp: tmp.write(body) tmp_md = Path(tmp.name) cmd = [ "pandoc", str(tmp_md), "-f", "markdown", "-o", str(output_path), ] if reference_doc: cmd.extend(["--reference-doc", str(reference_doc)]) try: subprocess.run(cmd, check=True) finally: try: tmp_md.unlink() except FileNotFoundError: pass normalize_non_heading_paragraphs(output_path) remove_word_bookmarks(output_path) def main(): parser = argparse.ArgumentParser( description="Convertit des fichiers MDX en DOCX en conservant H1/H2/H3/H4 et en forçant le corps en Body Text." ) parser.add_argument("input_dir", help="Dossier contenant les .mdx") parser.add_argument( "--output-dir", default=str(Path.home() / "Desktop" / "archicrat-ia-docx"), help="Dossier de sortie DOCX" ) parser.add_argument( "--reference-doc", default=None, help="DOCX modèle Word à utiliser comme reference-doc" ) args = parser.parse_args() input_dir = Path(args.input_dir) output_dir = Path(args.output_dir) reference_doc = Path(args.reference_doc) if args.reference_doc else None if not shutil.which("pandoc"): print("Erreur : pandoc n'est pas installé. Installe-le avec : brew install pandoc") sys.exit(1) if not input_dir.exists() or not input_dir.is_dir(): print(f"Erreur : dossier introuvable : {input_dir}") sys.exit(1) if reference_doc and not reference_doc.exists(): print(f"Erreur : reference-doc introuvable : {reference_doc}") sys.exit(1) output_dir.mkdir(parents=True, exist_ok=True) mdx_files = sorted(input_dir.glob("*.mdx")) if not mdx_files: print(f"Aucun fichier .mdx trouvé dans : {input_dir}") sys.exit(1) print(f"Conversion de {len(mdx_files)} fichier(s)...") print(f"Entrée : {input_dir}") print(f"Sortie : {output_dir}") if reference_doc: print(f"Modèle : {reference_doc}") print() for mdx_file in mdx_files: docx_name = mdx_file.with_suffix(".docx").name out_file = output_dir / docx_name print(f"→ {mdx_file.name} -> {docx_name}") convert_one_file(mdx_file, out_file, reference_doc) print() print("✅ Conversion terminée.") if __name__ == "__main__": main()