Seed from NAS prod snapshot 20260130-190531
This commit is contained in:
285
scripts/import-docx.mjs
Normal file
285
scripts/import-docx.mjs
Normal file
@@ -0,0 +1,285 @@
|
||||
// scripts/import-docx.mjs
|
||||
// Import DOCX -> MDX (manifest-driven), preferring pandoc if available.
|
||||
// Usage examples:
|
||||
// node scripts/import-docx.mjs --manifest sources/manifest.yml --only archicrat-ia/prologue,archicrat-ia/chapitre-1
|
||||
// node scripts/import-docx.mjs --manifest sources/manifest.yml --all
|
||||
//
|
||||
// Output: src/content/<collection>/<slug>.mdx
|
||||
// Assets (images): public/imported/<collection>/<slug>/...
|
||||
|
||||
import fs from "node:fs/promises";
|
||||
import path from "node:path";
|
||||
import { spawnSync } from "node:child_process";
|
||||
import process from "node:process";
|
||||
|
||||
import { parse as parseYAML } from "yaml";
|
||||
import mammoth from "mammoth";
|
||||
|
||||
function parseArgs(argv) {
  // Parse CLI flags for the importer.
  // Returns { manifest, only, all, dryRun, force }:
  //   manifest - path to the YAML manifest (default "sources/manifest.yml")
  //   only     - array of slugs to import, or null when --all is used
  //   all      - import every manifest entry
  //   dryRun   - report planned work, write nothing
  //   force    - overwrite existing .mdx output files
  // Throws on an unknown flag, a flag missing its value, or when neither
  // --only nor --all was given.
  const out = { manifest: "sources/manifest.yml", only: null, all: false, dryRun: false, force: false };

  // Consume the value following a flag, failing loudly instead of silently
  // storing `undefined` (the previous behavior when the value was omitted).
  const takeValue = (flag, i) => {
    const v = argv[i];
    if (v === undefined || v.startsWith("--")) {
      throw new Error(`Missing value for ${flag}`);
    }
    return v;
  };

  for (let i = 2; i < argv.length; i++) {
    const a = argv[i];
    if (a === "--manifest") out.manifest = takeValue(a, ++i);
    else if (a === "--only") out.only = takeValue(a, ++i).split(",").map(s => s.trim()).filter(Boolean);
    else if (a === "--all") out.all = true;
    else if (a === "--dry-run") out.dryRun = true;
    else if (a === "--force") out.force = true;
    else if (a === "--help" || a === "-h") {
      console.log(`DOCX importer
  --manifest <path>   (default: sources/manifest.yml)
  --only <slug,slug>  import only those slugs
  --all               import all entries in manifest
  --dry-run           show what would be done, write nothing
  --force             overwrite existing mdx files
`);
      process.exit(0);
    } else {
      // Unknown flags used to be silently ignored, which hid typos such as
      // --dryrun; fail fast instead.
      throw new Error(`Unknown argument: ${a}. Use --help for usage.`);
    }
  }
  if (!out.all && (!out.only || out.only.length === 0)) {
    throw new Error("Missing --only or --all. Example: --only archicrat-ia/prologue,archicrat-ia/chapitre-1");
  }
  return out;
}
|
||||
|
||||
async function readManifest(filePath) {
  // Load and normalize the YAML manifest describing the DOCX sources.
  // Accepts either a root-level array or an object carrying an `items`
  // (or legacy `docs`) array. Each entry is normalized to
  // { source, collection, slug, title, order }.
  const raw = await fs.readFile(filePath, "utf8");
  const parsed = parseYAML(raw);

  let entries;
  if (Array.isArray(parsed)) {
    entries = parsed;
  } else {
    entries = parsed?.items || parsed?.docs || [];
  }
  if (!Array.isArray(entries) || entries.length === 0) {
    throw new Error(`Manifest has no items: ${filePath}`);
  }

  return entries.map((entry, position) => {
    // Tolerate alternate key spellings (src/path, French titre/ordre).
    const source = entry.source || entry.src || entry.path;
    const { collection, slug } = entry;
    const title = entry.title || entry.titre || slug;
    const order = entry.order ?? entry.ordre ?? position;

    if (!source || !collection || !slug) {
      throw new Error(`Manifest item missing fields (need source+collection+slug). Item: ${JSON.stringify(entry)}`);
    }

    return { source, collection, slug, title, order };
  });
}
|
||||
|
||||
function havePandoc() {
  // True when a `pandoc` binary is on PATH and exits 0 for --version.
  const probe = spawnSync("pandoc", ["--version"], { stdio: "ignore" });
  return probe.status === 0;
}
|
||||
|
||||
function runPandoc(docxPath, assetsOutDir) {
  // Convert one DOCX to GitHub-flavored markdown via pandoc (good enough
  // for MDX). Embedded media is extracted into assetsOutDir, where pandoc
  // creates its own media/ subfolder. Throws when pandoc exits non-zero.
  const pandocArgs = [
    docxPath,
    "-f", "docx",
    "-t", "gfm",
    "--wrap=none",
    "--extract-media", assetsOutDir,
  ];
  const result = spawnSync("pandoc", pandocArgs, { encoding: "utf8" });
  if (result.status !== 0) {
    throw new Error(`pandoc failed for ${docxPath}\n${result.stderr || ""}`);
  }
  return result.stdout || "";
}
|
||||
|
||||
async function runMammoth(docxPath, assetsOutDirWebRoot) {
  // Fallback converter when pandoc is unavailable: mammoth produces HTML
  // that is embedded directly in the MDX body. Embedded images are written
  // into the given directory (expected to live under public/) as
  // image-01.ext, image-02.ext, ... so they can be served as-is.
  const diskDir = path.resolve(assetsOutDirWebRoot);
  await fs.mkdir(diskDir, { recursive: true });

  let imageIndex = 0;
  const convertImage = mammoth.images.imgElement(async (image) => {
    imageIndex += 1;
    // Derive the file extension from the MIME type; default to png.
    const ext = image.contentType?.split("/")?.[1] || "png";
    const fileName = `image-${String(imageIndex).padStart(2, "0")}.${ext}`;
    await fs.writeFile(path.join(diskDir, fileName), await image.read());
    // Emit a bare relative src; the caller rewrites it with the public
    // URL prefix afterwards.
    return { src: fileName };
  });

  const result = await mammoth.convertToHtml({ path: docxPath }, { convertImage });
  return result.value || "";
}
|
||||
|
||||
function escapeFrontmatterString(s) {
  // Escape a value for use inside a double-quoted YAML scalar.
  // Backslashes must be escaped before quotes, otherwise a Windows-style
  // path such as C:\docs would produce an invalid YAML escape sequence
  // (previously only `"` was handled). Newlines are folded to \n so a
  // multi-line title cannot break out of the frontmatter block.
  return String(s)
    .replace(/\\/g, "\\\\")
    .replace(/"/g, '\\"')
    .replace(/\r?\n/g, "\\n");
}
|
||||
|
||||
function stripDuplicateTitle(markdownOrHtml, title) {
  // Drop a leading H1 (markdown "# Title" or HTML <h1>Title</h1>) when it
  // exactly duplicates the frontmatter title; the site layout renders the
  // title itself, so keeping it would show it twice.
  const wanted = String(title || "").trim();
  if (!wanted) return markdownOrHtml;

  // Markdown form: remove only when the heading text matches exactly.
  const withoutMd = markdownOrHtml.replace(/^\s*#\s+([^\n]+)\n+/, (whole, heading) =>
    heading.trim() === wanted ? "" : whole
  );

  // HTML form: compare after stripping any inline tags inside the <h1>.
  return withoutMd.replace(/^\s*<h1[^>]*>(.*?)<\/h1>\s*/i, (whole, inner) => {
    const plain = String(inner).replace(/<[^>]+>/g, "").trim();
    return plain === wanted ? "" : whole;
  });
}
|
||||
|
||||
function stripWordToc(markdownOrHtml) {
  // Remove a Word-generated table of contents near the top of the document.
  // Heuristic: within the first ~4000 chars, find a "Sommaire" /
  // "Table des matières" line (optionally a markdown heading) and drop
  // everything from it up to the next markdown heading. When no later
  // heading exists, everything from the TOC marker onward is dropped.
  const text = String(markdownOrHtml || "");
  const tocMatch = text
    .slice(0, 4000)
    .match(/(^|\n)\s{0,3}(#{1,6}\s*)?(Sommaire|Table des matières)\s*\n/i);
  if (!tocMatch) return markdownOrHtml;

  const tocStart = tocMatch.index ?? 0;
  // Look for the next heading strictly after the TOC marker — the slice(1)
  // skips the leading newline that may be part of the match itself.
  const afterToc = text.slice(tocStart).slice(1);
  const nextHeading = afterToc.match(/\n#{1,6}\s+[^\n]+\n/);
  if (!nextHeading) {
    return text.slice(0, tocStart).trim() + "\n";
  }

  // +1 compensates for the slice(1) above, mapping the match position back
  // into `text` coordinates.
  const tocEnd = tocStart + 1 + (nextHeading.index ?? 0);
  return (text.slice(0, tocStart) + text.slice(tocEnd)).trim() + "\n";
}
|
||||
|
||||
function rewriteLocalImageLinks(text, publicPrefix) {
  // Point image references at the public assets folder:
  //   - pandoc emits markdown links into its extracted "media/" folder,
  //     e.g. ](media/image1.png)
  //   - the mammoth fallback emits <img src="image-NN.ext">
  const withMarkdownLinks = text.replace(
    /\]\(\s*media\//g,
    `](${publicPrefix}/media/`
  );
  return withMarkdownLinks.replace(
    /src="(image-\d+\.[a-z0-9]+)"/gi,
    `src="${publicPrefix}/$1"`
  );
}
|
||||
|
||||
function stripHtmlComments(text) {
  // Drop every <!-- ... --> block; [\s\S] lets a comment span newlines and
  // the lazy quantifier keeps separate comments from being merged.
  const commentPattern = /<!--[\s\S]*?-->/g;
  return String(text || "").replace(commentPattern, "");
}
|
||||
|
||||
async function exists(p) {
  // fs.access resolves when the path is reachable and rejects otherwise;
  // translate that outcome into a boolean.
  return fs.access(p).then(
    () => true,
    () => false
  );
}
|
||||
|
||||
async function main() {
  // Orchestrates the import: parse CLI args, load and filter the manifest,
  // then convert each selected DOCX into an MDX file with generated
  // frontmatter. Extracted images land under public/imported/.
  const args = parseArgs(process.argv);
  const manifestPath = path.resolve(args.manifest);

  const items = await readManifest(manifestPath);
  const selected = args.all ? items : items.filter(it => args.only.includes(it.slug));

  // Fail early if any requested --only slug is absent from the manifest.
  if (!args.all && selected.length !== args.only.length) {
    const found = new Set(selected.map(s => s.slug));
    const missing = args.only.filter(s => !found.has(s));
    throw new Error(`Some --only slugs not found in manifest: ${missing.join(", ")}`);
  }

  // Probe once; every item in this run uses the same converter.
  const pandocOk = havePandoc();

  console.log(`Manifest: ${manifestPath}`);
  console.log(`Pandoc: ${pandocOk ? "YES (preferred)" : "NO (fallback to mammoth/html)"}`);
  console.log(`Items: ${selected.length}`);

  for (const it of selected) {
    const docxPath = path.resolve(it.source);
    // MDX output goes into the content tree; images go under public/ so
    // the site serves them at /imported/<collection>/<slug>/.
    const outFile = path.resolve("src/content", it.collection, `${it.slug}.mdx`);
    const outDir = path.dirname(outFile);

    // posix join for the URL path, platform-aware resolve for the disk path.
    const assetsPublicDir = path.posix.join("/imported", it.collection, it.slug);
    const assetsDiskDir = path.resolve("public", "imported", it.collection, it.slug);

    if (!(await exists(docxPath))) {
      throw new Error(`Missing source docx: ${docxPath}`);
    }
    // Never clobber an existing page unless --force was given.
    if ((await exists(outFile)) && !args.force) {
      throw new Error(`Refusing to overwrite existing: ${outFile} (use --force)`);
    }

    console.log(`\n→ ${it.slug}`);
    console.log(` source: ${it.source}`);
    console.log(` out: ${path.relative(process.cwd(), outFile)}`);

    // Dry run: report the plan above, then skip all filesystem writes.
    if (args.dryRun) continue;

    await fs.mkdir(outDir, { recursive: true });
    await fs.mkdir(assetsDiskDir, { recursive: true });

    let body = "";
    if (pandocOk) {
      // pandoc extract-media wants a directory; it will create media/ inside.
      // NOTE(review): assetsDiskDir is absolute, so pandoc may emit absolute
      // image paths that rewriteLocalImageLinks's "](media/" pattern would
      // not match — verify against actual pandoc output.
      body = runPandoc(docxPath, assetsDiskDir);
      body = stripDuplicateTitle(body, it.title);
      body = stripWordToc(body);
      body = stripHtmlComments(body);
      body = rewriteLocalImageLinks(body, assetsPublicDir);
    } else {
      // Fallback path: mammoth yields HTML, which MDX can embed directly.
      let html = await runMammoth(docxPath, assetsDiskDir);
      html = stripDuplicateTitle(html, it.title);
      html = stripWordToc(html);
      html = stripHtmlComments(html);
      html = rewriteLocalImageLinks(html, assetsPublicDir);
      // Guard against an entirely empty conversion result.
      body = html.trim() ? html : "<p>(Import vide)</p>";
    }

    // Frontmatter version comes from the build environment when available.
    const defaultVersion = process.env.PUBLIC_RELEASE || "0.1.0";

    // Per-collection frontmatter defaults; unknown collections fall back
    // to a generic draft entry below.
    const schemaDefaultsByCollection = {
      archicratie: { edition: "archicratie", status: "modele_sociopolitique", level: 1 },
      ia: { edition: "ia", status: "cas_pratique", level: 1 },
      traite: { edition: "traite", status: "ontodynamique", level: 1 },
      glossaire: { edition: "glossaire", status: "lexique", level: 1 },
      atlas: { edition: "atlas", status: "atlas", level: 1 },
    };

    const defaults = schemaDefaultsByCollection[it.collection] || { edition: it.collection, status: "draft", level: 1 };

    // Assemble the YAML frontmatter line by line (double-quoted scalars,
    // escaped via escapeFrontmatterString).
    const fm = [
      "---",
      `title: "${escapeFrontmatterString(it.title)}"`,
      `edition: "${escapeFrontmatterString(defaults.edition)}"`,
      `status: "${escapeFrontmatterString(defaults.status)}"`,
      `level: ${Number(defaults.level)}`,
      `version: "${escapeFrontmatterString(defaultVersion)}"`,
      `concepts: []`,
      `links: []`,
      `order: ${Number(it.order)}`,
      `summary: ""`,
      `source:`,
      ` kind: docx`,
      ` path: "${escapeFrontmatterString(it.source)}"`,
      "---",
      "",
    ].join("\n");

    const mdx = fm + body.trim() + "\n";
    await fs.writeFile(outFile, mdx, "utf8");
  }

  console.log("\nDone.");
}
|
||||
|
||||
main().catch((err) => {
  // Surface a readable message and exit non-zero so shell/CI callers fail.
  console.error("\nERROR:", err?.message || err);
  process.exit(1);
});
|
||||
Reference in New Issue
Block a user