// scripts/import-docx.mjs
// Import DOCX -> MDX (manifest-driven), preferring pandoc if available.
// Usage examples:
//   node scripts/import-docx.mjs --manifest sources/manifest.yml --only archicrat-ia/prologue,archicrat-ia/chapitre-1
//   node scripts/import-docx.mjs --manifest sources/manifest.yml --all
//
// Output: src/content/<collection>/<slug>.mdx
// Assets (images): public/imported/<collection>/<slug>/...

import fs from "node:fs/promises";
import path from "node:path";
import { spawnSync } from "node:child_process";
import process from "node:process";
import { parse as parseYAML } from "yaml";
import mammoth from "mammoth";

// spawnSync captures stdout in memory and defaults to a 1 MiB cap; a long
// chapter converted to Markdown can exceed that, so raise the limit.
const PANDOC_MAX_BUFFER = 256 * 1024 * 1024;

// Frontmatter defaults applied per destination collection.
// NOTE(review): the original comment stated that "archicrat-ia" shares
// edition/status with "archicratie" (no frontmatter migration), but the table
// gives it its own values — confirm which is intended.
const SCHEMA_DEFAULTS_BY_COLLECTION = {
  archicratie: { edition: "archicratie", status: "modele_sociopolitique", level: 1 },
  "archicrat-ia": { edition: "archicrat-ia", status: "essai_these", level: 1 },
  "cas-ia": { edition: "cas-ia", status: "application", level: 1 },
  traite: { edition: "traite", status: "ontodynamique", level: 1 },
  glossaire: { edition: "glossaire", status: "lexique", level: 1 },
  atlas: { edition: "atlas", status: "atlas", level: 1 },
};

/**
 * Parse command-line flags.
 *
 * @param {string[]} argv - Full argv array; parsing starts at index 2.
 * @returns {{manifest: string, only: string[]|null, all: boolean, dryRun: boolean, force: boolean}}
 * @throws {Error} When `--manifest` has no value, or when neither `--only` nor `--all` is given.
 */
function parseArgs(argv) {
  const out = { manifest: "sources/manifest.yml", only: null, all: false, dryRun: false, force: false };

  for (let i = 2; i < argv.length; i++) {
    const a = argv[i];
    if (a === "--manifest") {
      const value = argv[++i];
      // Without this guard a trailing `--manifest` stores undefined and the
      // failure only surfaces later as an opaque path.resolve() TypeError.
      if (!value || value.startsWith("--")) {
        throw new Error("--manifest requires a file path");
      }
      out.manifest = value;
    } else if (a === "--only") {
      out.only = (argv[++i] || "").split(",").map((s) => s.trim()).filter(Boolean);
    } else if (a === "--all") {
      out.all = true;
    } else if (a === "--dry-run") {
      out.dryRun = true;
    } else if (a === "--force") {
      out.force = true;
    } else if (a === "--help" || a === "-h") {
      console.log(`DOCX importer
  --manifest <path>    (default: sources/manifest.yml)
  --only <slug,slug>   import only those slugs
  --all                import all entries in manifest
  --dry-run            show what would be done, write nothing
  --force              overwrite existing mdx files
`);
      process.exit(0);
    }
  }

  if (!out.all && (!out.only || out.only.length === 0)) {
    throw new Error("Missing --only or --all. Example: --only archicrat-ia/prologue,archicrat-ia/chapitre-1");
  }
  return out;
}

/**
 * Load the YAML manifest and normalize each entry.
 *
 * Accepted shapes: a root array, `{ items: [...] }` or `{ docs: [...] }`.
 * Accepted key aliases: source|src|path, title|titre, order|ordre.
 *
 * @param {string} filePath - Path of the manifest file.
 * @returns {Promise<Array<{source: string, collection: string, slug: string, title: string, order: *}>>}
 * @throws {Error} When the manifest is empty or an entry lacks source, collection or slug.
 */
async function readManifest(filePath) {
  const raw = await fs.readFile(filePath, "utf8");
  const data = parseYAML(raw);

  const items = Array.isArray(data) ? data : (data?.items || data?.docs || []);
  if (!Array.isArray(items) || items.length === 0) {
    throw new Error(`Manifest has no items: ${filePath}`);
  }

  // Normalize keys
  return items.map((it, idx) => {
    // `it?.` so a null/scalar entry reports the "missing fields" error below
    // instead of a bare TypeError.
    const source = it?.source || it?.src || it?.path;
    const collection = it?.collection;
    const slug = it?.slug;
    const title = it?.title || it?.titre || slug;
    const order = it?.order ?? it?.ordre ?? idx;
    if (!source || !collection || !slug) {
      throw new Error(`Manifest item missing fields (need source+collection+slug). Item: ${JSON.stringify(it)}`);
    }
    return { source, collection, slug, title, order };
  });
}

/**
 * Report whether a `pandoc` executable can be run.
 *
 * @returns {boolean} True when `pandoc --version` exits with status 0.
 */
function havePandoc() {
  const r = spawnSync("pandoc", ["--version"], { stdio: "ignore" });
  return r.status === 0;
}

/**
 * Convert a DOCX file to GitHub-flavored Markdown with pandoc.
 *
 * @param {string} docxPath - Path of the DOCX file.
 * @param {string} assetsOutDir - Directory passed to `--extract-media`.
 * @returns {string} Pandoc's stdout (the Markdown), or "" when empty.
 * @throws {Error} When pandoc cannot be spawned, overflows the output buffer, or exits non-zero.
 */
function runPandoc(docxPath, assetsOutDir) {
  // We output GitHub-flavored markdown (good enough for MDX).
  const args = [
    docxPath,
    "-f", "docx",
    "-t", "gfm",
    "--wrap=none",
    "--extract-media", assetsOutDir,
  ];
  const r = spawnSync("pandoc", args, { encoding: "utf8", maxBuffer: PANDOC_MAX_BUFFER });

  // r.error is set for spawn failures and buffer overflow; in those cases
  // status is null and stderr is usually empty, so surface the real cause.
  if (r.error) {
    throw new Error(`pandoc failed for ${docxPath}\n${r.error.message}`);
  }
  if (r.status !== 0) {
    throw new Error(`pandoc failed for ${docxPath}\n${r.stderr || ""}`);
  }
  return r.stdout || "";
}

/**
 * Derive a file extension from an image MIME type.
 *
 * Keeps only [a-z0-9] so the resulting file name stays matchable by the
 * `image-NN.ext` pattern used in rewriteLocalImageLinks
 * (e.g. "image/x-emf" -> "emf", "image/svg+xml" -> "svg").
 *
 * @param {string|undefined} contentType - MIME type reported for the image.
 * @returns {string} Extension without the dot; "png" when none can be derived.
 */
function imageExtension(contentType) {
  const subtype = String(contentType || "").split("/")[1] || "";
  const ext = subtype
    .toLowerCase()
    .replace(/^x-/, "")
    .replace(/[+;].*$/, "")
    .replace(/[^a-z0-9]/g, "");
  return ext || "png";
}

/**
 * Convert a DOCX file to HTML with mammoth, writing embedded images to disk.
 *
 * Images are written as image-01.<ext>, image-02.<ext>, ... and referenced by
 * bare file name; the caller is expected to add the public URL prefix.
 *
 * @param {string} docxPath - Path of the DOCX file.
 * @param {string} assetsOutDirWebRoot - Directory where images are written (created if missing).
 * @returns {Promise<string>} The generated HTML, or "" when mammoth returns no value.
 */
async function runMammoth(docxPath, assetsOutDirWebRoot) {
  const assetsDiskDir = path.resolve(assetsOutDirWebRoot);
  await fs.mkdir(assetsDiskDir, { recursive: true });

  let imgCount = 0;
  const result = await mammoth.convertToHtml(
    { path: docxPath },
    {
      convertImage: mammoth.images.imgElement(async (image) => {
        imgCount++;
        const ext = imageExtension(image.contentType);
        const fileName = `image-${String(imgCount).padStart(2, "0")}.${ext}`;
        const buf = await image.read();
        await fs.writeFile(path.join(assetsDiskDir, fileName), buf);
        // Relative src only; the public prefix is applied by the caller.
        return { src: fileName };
      }),
    }
  );

  return result.value || "";
}

/**
 * Escape a value for use inside a double-quoted YAML scalar.
 *
 * Backslashes are escaped first so the escapes added afterwards are not
 * themselves doubled.
 *
 * @param {*} s - Value to escape (coerced with String()).
 * @returns {string} Escaped text, without the surrounding quotes.
 */
function escapeFrontmatterString(s) {
  return String(s)
    .replace(/\\/g, "\\\\")
    .replace(/"/g, '\\"')
    .replace(/\r\n|\r|\n/g, "\\n");
}

/**
 * Drop a leading heading that merely repeats the document title.
 *
 * Handles a leading Markdown "# Title" line and a leading <h1>Title</h1>
 * element; the heading is kept when its text differs from the title.
 *
 * @param {string} markdownOrHtml - Converted document body.
 * @param {string} title - Title that will be written to the frontmatter.
 * @returns {string} Body without the duplicated heading.
 */
function stripDuplicateTitle(markdownOrHtml, title) {
  const t = String(title || "").trim();
  if (!t) return markdownOrHtml;

  // Remove leading "# Title"
  const md = markdownOrHtml.replace(/^\s*#\s+([^\n]+)\n+/, (m, h1) => {
    return h1.trim() === t ? "" : m;
  });

  // Remove leading <h1>Title</h1>
  return md.replace(/^\s*<h1[^>]*>(.*?)<\/h1>\s*/i, (m, h1) => {
    const plain = String(h1).replace(/<[^>]+>/g, "").trim();
    return plain === t ? "" : m;
  });
}

/**
 * Remove a Word-generated table of contents near the start of the document.
 *
 * Heuristic: within the first 4000 characters, find a "Sommaire" or
 * "Table des matières" line (optionally a Markdown heading) and cut from
 * there up to the next Markdown heading. When no later heading exists,
 * everything from the TOC line onwards is dropped.
 *
 * @param {string} markdownOrHtml - Converted document body.
 * @returns {string} Body without the TOC block; the input unchanged when no TOC line is found.
 */
function stripWordToc(markdownOrHtml) {
  const text = String(markdownOrHtml || "");
  const head = text.slice(0, 4000);

  const m = head.match(/(^|\n)\s{0,3}(#{1,6}\s*)?(Sommaire|Table des matières)\s*\n/i);
  if (!m) return markdownOrHtml;

  const startIdx = m.index ?? 0;
  const rest = text.slice(startIdx);

  // Skip the first character so the search cannot start on the newline that
  // opens the TOC line itself.
  const nextHeading = rest.slice(1).match(/\n#{1,6}\s+[^\n]+\n/);
  if (!nextHeading) return (text.slice(0, startIdx)).trim() + "\n";

  const endIdx = startIdx + 1 + (nextHeading.index ?? 0);
  return (text.slice(0, startIdx) + text.slice(endIdx)).trim() + "\n";
}

/**
 * Point local image references at their public URL.
 *
 * Rewrites Markdown links starting with "media/" and HTML
 * src="image-NN.ext" attributes. When `diskDir` is given, references to
 * "<diskDir>/media/..." are rewritten too: pandoc adjusts image references to
 * point at the --extract-media directory, which is an absolute path here.
 * NOTE(review): paths that pandoc percent-encodes or wraps (e.g. containing
 * spaces) are not handled — verify against real output.
 *
 * @param {string} text - Converted document body.
 * @param {string} publicPrefix - URL prefix under which the assets are served.
 * @param {string|null} [diskDir=null] - On-disk directory given to pandoc's --extract-media.
 * @returns {string} Body with rewritten image references.
 */
function rewriteLocalImageLinks(text, publicPrefix, diskDir = null) {
  let out = text;

  if (diskDir) {
    const rawDir = String(diskDir).replace(/[\\/]+$/, "");
    const posixDir = rawDir.replace(/\\/g, "/");
    for (const dir of new Set([rawDir, posixDir])) {
      out = out.split(`${dir}/media/`).join(`${publicPrefix}/media/`);
    }
  }

  // Markdown image links: ](media/...
  out = out.replace(/\]\(\s*media\//g, `](${publicPrefix}/media/`);

  // HTML img src="image-xx.ext" (mammoth fallback)
  out = out.replace(/src="(image-\d+\.[a-z0-9]+)"/gi, `src="${publicPrefix}/$1"`);

  return out;
}

/**
 * Remove HTML comments (<!-- ... -->), including multi-line ones.
 *
 * @param {string} text - Converted document body.
 * @returns {string} Body without HTML comments.
 */
function stripHtmlComments(text) {
  return String(text || "").replace(/<!--[\s\S]*?-->/g, "");
}

/**
 * Report whether a path is accessible.
 *
 * @param {string} p - Path to probe.
 * @returns {Promise<boolean>} True when fs.access succeeds, false on any failure.
 */
async function exists(p) {
  try {
    await fs.access(p);
    return true;
  } catch {
    return false;
  }
}

/**
 * Compatibility shim for manifest entries:
 * - old: collection="archicratie" + slug="archicrat-ia/chapitre-3"
 * - new: collection="archicrat-ia" + slug="chapitre-3"
 *
 * Goal: such entries are always written to src/content/archicrat-ia/<slug>.mdx
 *
 * @param {string} collection - Collection from the manifest.
 * @param {string} slug - Slug from the manifest.
 * @returns {{outCollection: string, outSlug: string}} Trimmed destination, with leading/trailing slashes removed from the slug.
 */
function normalizeDest(collection, slug) {
  let outCollection = String(collection || "").trim();
  let outSlug = String(slug || "").trim().replace(/^\/+|\/+$/g, "");

  if (outCollection === "archicratie" && outSlug.startsWith("archicrat-ia/")) {
    outCollection = "archicrat-ia";
    outSlug = outSlug.replace(/^archicrat-ia\//, "");
  }

  return { outCollection, outSlug };
}

/**
 * CLI entry point: select manifest entries, convert each DOCX and write the
 * MDX file with generated frontmatter.
 *
 * @returns {Promise<void>}
 * @throws {Error} On unknown --only slugs, a missing source file, or an existing output without --force.
 */
async function main() {
  const args = parseArgs(process.argv);
  const manifestPath = path.resolve(args.manifest);

  const items = await readManifest(manifestPath);

  let selected = items;
  if (!args.all) {
    // Compare slug sets rather than counts: duplicates in --only or in the
    // manifest must not be reported as "not found".
    const wanted = new Set(args.only);
    selected = items.filter((it) => wanted.has(it.slug));
    const found = new Set(selected.map((it) => it.slug));
    const missing = [...wanted].filter((slug) => !found.has(slug));
    if (missing.length > 0) {
      throw new Error(`Some --only slugs not found in manifest: ${missing.join(", ")}`);
    }
  }

  const pandocOk = havePandoc();
  console.log(`Manifest: ${manifestPath}`);
  console.log(`Pandoc: ${pandocOk ? "YES (preferred)" : "NO (fallback to mammoth/html)"}`);
  console.log(`Items: ${selected.length}`);

  for (const it of selected) {
    const docxPath = path.resolve(it.source);
    const { outCollection, outSlug } = normalizeDest(it.collection, it.slug);

    const outFile = path.resolve("src/content", outCollection, `${outSlug}.mdx`);
    const outDir = path.dirname(outFile);

    const assetsPublicDir = path.posix.join("/imported", outCollection, outSlug);
    const assetsDiskDir = path.resolve("public", "imported", outCollection, outSlug);

    if (!(await exists(docxPath))) {
      throw new Error(`Missing source docx: ${docxPath}`);
    }
    if ((await exists(outFile)) && !args.force) {
      throw new Error(`Refusing to overwrite existing: ${outFile} (use --force)`);
    }

    console.log(`\n→ ${it.slug}`);
    console.log(`  source: ${it.source}`);
    console.log(`  out: ${path.relative(process.cwd(), outFile)}`);

    if (args.dryRun) continue;

    await fs.mkdir(outDir, { recursive: true });
    await fs.mkdir(assetsDiskDir, { recursive: true });

    let body = "";
    if (pandocOk) {
      // pandoc extract-media wants a directory; it will create media/ inside.
      body = runPandoc(docxPath, assetsDiskDir);
      body = stripDuplicateTitle(body, it.title);
      body = stripWordToc(body);
      body = stripHtmlComments(body);
      body = rewriteLocalImageLinks(body, assetsPublicDir, assetsDiskDir);
    } else {
      let html = await runMammoth(docxPath, assetsDiskDir);
      html = stripDuplicateTitle(html, it.title);
      html = stripWordToc(html);
      html = stripHtmlComments(html);
      html = rewriteLocalImageLinks(html, assetsPublicDir);
      body = html.trim() ? html : "<p>(Import vide)</p>";
    }

    const defaultVersion = process.env.PUBLIC_RELEASE || "0.1.0";
    const defaults = SCHEMA_DEFAULTS_BY_COLLECTION[outCollection] || {
      edition: outCollection,
      status: "draft",
      level: 1,
    };

    const fm = [
      "---",
      `title: "${escapeFrontmatterString(it.title)}"`,
      `edition: "${escapeFrontmatterString(defaults.edition)}"`,
      `status: "${escapeFrontmatterString(defaults.status)}"`,
      `level: ${Number(defaults.level)}`,
      `version: "${escapeFrontmatterString(defaultVersion)}"`,
      `concepts: []`,
      `links: []`,
      `order: ${Number(it.order)}`,
      `summary: ""`,
      `source:`,
      `  kind: docx`,
      `  path: "${escapeFrontmatterString(it.source)}"`,
      "---",
      "",
    ].join("\n");

    const mdx = fm + body.trim() + "\n";
    await fs.writeFile(outFile, mdx, "utf8");
  }

  console.log("\nDone.");
}

main().catch((e) => {
  console.error("\nERROR:", e?.message || e);
  process.exit(1);
});