// scripts/import-docx.mjs
// Import DOCX -> MDX (manifest-driven), preferring pandoc if available.
// Usage examples:
//   node scripts/import-docx.mjs --manifest sources/manifest.yml --only archicrat-ia/prologue,archicrat-ia/chapitre-1
//   node scripts/import-docx.mjs --manifest sources/manifest.yml --all
//
// Output: src/content/<collection>/<slug>.mdx
// Assets (images): public/imported/<collection>/<slug>/...
|
|
|
|
import fs from "node:fs/promises";
|
|
import path from "node:path";
|
|
import { spawnSync } from "node:child_process";
|
|
import process from "node:process";
|
|
|
|
import { parse as parseYAML } from "yaml";
|
|
import mammoth from "mammoth";
|
|
|
|
/**
 * Parse the importer's command-line flags.
 *
 * Recognised flags: --manifest <path>, --only <slug,slug>, --all, --dry-run,
 * --force, --help / -h. Any other argument is ignored.
 *
 * @param {string[]} argv - Full argument vector (parsing starts at index 2).
 * @returns {{manifest: string, only: string[]|null, all: boolean, dryRun: boolean, force: boolean}}
 * @throws {Error} If a flag that needs a value has none, or if neither
 *   --only nor --all was given.
 */
function parseArgs(argv) {
  const out = { manifest: "sources/manifest.yml", only: null, all: false, dryRun: false, force: false };

  // A value-taking flag must be followed by an actual value; without this
  // check `--manifest` at the end of the line yields `undefined`, and
  // `--manifest --all` silently swallows the next flag as the path.
  const takeValue = (flag, index) => {
    const value = argv[index];
    if (value === undefined || String(value).startsWith("--")) {
      throw new Error(`Missing value for ${flag}`);
    }
    return value;
  };

  for (let i = 2; i < argv.length; i++) {
    const a = argv[i];
    if (a === "--manifest") {
      out.manifest = takeValue(a, ++i);
    } else if (a === "--only") {
      out.only = takeValue(a, ++i).split(",").map((s) => s.trim()).filter(Boolean);
    } else if (a === "--all") {
      out.all = true;
    } else if (a === "--dry-run") {
      out.dryRun = true;
    } else if (a === "--force") {
      out.force = true;
    } else if (a === "--help" || a === "-h") {
      console.log(`DOCX importer
--manifest <path> (default: sources/manifest.yml)
--only <slug,slug> import only those slugs
--all import all entries in manifest
--dry-run show what would be done, write nothing
--force overwrite existing mdx files
`);
      process.exit(0);
    }
  }

  if (!out.all && (!out.only || out.only.length === 0)) {
    throw new Error("Missing --only or --all. Example: --only archicrat-ia/prologue,archicrat-ia/chapitre-1");
  }
  return out;
}
|
|
|
|
/**
 * Load the YAML manifest and normalise its entries.
 *
 * Accepted shapes: a root array, `{ items: [...] }` or `{ docs: [...] }`.
 * Per entry, `source` may also be spelled `src` or `path`, `title` may be
 * `titre` (falling back to the slug), and `order` may be `ordre` (falling
 * back to the entry's index in the list).
 *
 * @param {string} filePath - Path of the manifest file.
 * @returns {Promise<Array<{source: string, collection: string, slug: string, title: string, order: *}>>}
 * @throws {Error} If the manifest has no entries, or an entry is not a
 *   mapping or lacks source, collection or slug.
 */
async function readManifest(filePath) {
  const raw = await fs.readFile(filePath, "utf8");
  const data = parseYAML(raw);

  const items = Array.isArray(data) ? data : (data?.items || data?.docs || []);
  if (!Array.isArray(items) || items.length === 0) {
    throw new Error(`Manifest has no items: ${filePath}`);
  }

  return items.map((it, idx) => {
    // An empty list entry parses to null and a bare scalar to a string or
    // number; treat both as "missing fields" rather than crashing on
    // property access.
    const entry = it !== null && typeof it === "object" ? it : {};

    const source = entry.source || entry.src || entry.path;
    const collection = entry.collection;
    const slug = entry.slug;
    const title = entry.title || entry.titre || slug;
    const order = entry.order ?? entry.ordre ?? idx;

    if (!source || !collection || !slug) {
      throw new Error(`Manifest item missing fields (need source+collection+slug). Item: ${JSON.stringify(it)}`);
    }

    return { source, collection, slug, title, order };
  });
}
|
|
|
|
/**
 * Report whether `pandoc --version` can be run and exits with status 0.
 *
 * @returns {boolean} True only when the probe exits with status 0.
 */
function havePandoc() {
  // Output is discarded; only the exit status matters.
  const { status } = spawnSync("pandoc", ["--version"], { stdio: "ignore" });
  return status === 0;
}
|
|
|
|
/**
 * Convert a DOCX file to GitHub-flavored markdown with pandoc.
 *
 * @param {string} docxPath - Path of the source .docx file.
 * @param {string} assetsOutDir - Directory passed to `--extract-media`.
 * @returns {string} The markdown pandoc wrote to stdout ("" if none).
 * @throws {Error} If pandoc cannot be started, is killed, or exits non-zero.
 */
function runPandoc(docxPath, assetsOutDir) {
  const args = [
    docxPath,
    "-f", "docx",
    "-t", "gfm",
    "--wrap=none",
    "--extract-media", assetsOutDir,
  ];

  const r = spawnSync("pandoc", args, {
    encoding: "utf8",
    // spawnSync's default maxBuffer is 1 MiB; a long chapter exceeds that,
    // which kills pandoc and truncates its output.
    maxBuffer: 256 * 1024 * 1024,
  });

  if (r.error || r.status !== 0) {
    // r.error covers spawn-level failures (ENOENT, ENOBUFS) where stderr is empty.
    const detail = r.stderr || r.error?.message || "";
    throw new Error(`pandoc failed for ${docxPath}\n${detail}`);
  }
  return r.stdout || "";
}
|
|
|
|
/**
 * Convert a DOCX file to HTML with mammoth, saving embedded images to disk.
 *
 * Each image is written to the assets directory as `image-NN.<ext>` and the
 * returned HTML references it by that bare file name; the caller is expected
 * to prefix the public path afterwards.
 *
 * @param {string} docxPath - Path of the source .docx file.
 * @param {string} assetsOutDirWebRoot - Directory the images are written to.
 * @returns {Promise<string>} The generated HTML ("" if mammoth returns none).
 */
async function runMammoth(docxPath, assetsOutDirWebRoot) {
  const assetsDiskDir = path.resolve(assetsOutDirWebRoot);
  await fs.mkdir(assetsDiskDir, { recursive: true });

  // Derive a purely alphanumeric extension from the MIME type so the file
  // name stays within the `image-<digits>.<alnum>` shape:
  // "image/svg+xml" -> "svg", "image/x-emf" -> "emf", missing -> "png".
  const extensionFor = (contentType) => {
    const subtype = String(contentType || "").split(";")[0].split("/")[1] || "";
    const cleaned = subtype
      .trim()
      .toLowerCase()
      .replace(/^x-/, "")
      .split("+")[0]
      .replace(/[^a-z0-9]/g, "");
    return cleaned || "png";
  };

  let imgCount = 0;
  const result = await mammoth.convertToHtml(
    { path: docxPath },
    {
      convertImage: mammoth.images.imgElement(async (image) => {
        // Number and name the file before the first await so that
        // overlapping callbacks cannot receive the same index.
        imgCount++;
        const fileName = `image-${String(imgCount).padStart(2, "0")}.${extensionFor(image.contentType)}`;
        const buf = await image.read();
        await fs.writeFile(path.join(assetsDiskDir, fileName), buf);
        // Bare file name; the public prefix is added by the caller.
        return { src: fileName };
      }),
    }
  );

  return result.value || "";
}
|
|
|
|
/**
 * Escape a value for use inside a YAML double-quoted scalar.
 *
 * @param {*} s - Value to embed; it is converted with String().
 * @returns {string} Text safe to place between double quotes in frontmatter.
 */
function escapeFrontmatterString(s) {
  return String(s)
    // Backslash first: it is the escape character in double-quoted YAML, so
    // a raw "\d" from a Windows path would be an invalid escape sequence.
    .replace(/\\/g, "\\\\")
    .replace(/"/g, '\\"')
    // Keep the scalar on one line so a stray line break cannot corrupt the block.
    .replace(/\r/g, "\\r")
    .replace(/\n/g, "\\n");
}
|
|
|
|
/**
 * Remove a leading heading that merely repeats the document title.
 *
 * Handles a markdown `# Title` line and an HTML `<h1>Title</h1>` element at
 * the very start of the text. The heading is removed only when its text,
 * once trimmed, equals the trimmed title exactly.
 *
 * @param {string} markdownOrHtml - Converted document body.
 * @param {string} title - Title already present in the frontmatter.
 * @returns {string} The body without the duplicated heading.
 */
function stripDuplicateTitle(markdownOrHtml, title) {
  const t = String(title || "").trim();
  if (!t) return markdownOrHtml;

  // HTML output escapes characters such as "&"; decode the basic entities so
  // a title like "A & B" still matches. "&amp;" is decoded last so that
  // "&amp;lt;" does not get decoded twice.
  const decodeBasicEntities = (s) =>
    s
      .replace(/&lt;/g, "<")
      .replace(/&gt;/g, ">")
      .replace(/&quot;/g, '"')
      .replace(/&#39;|&apos;/g, "'")
      .replace(/&amp;/g, "&");

  // Leading "# Title"; also matches when the heading is the whole document
  // and therefore has no trailing newline.
  const md = markdownOrHtml.replace(/^\s*#\s+([^\n]+)(?:\n+|$)/, (m, h1) => {
    return h1.trim() === t ? "" : m;
  });

  // Leading <h1>Title</h1>; inner tags are ignored for the comparison.
  return md.replace(/^\s*<h1[^>]*>(.*?)<\/h1>\s*/i, (m, h1) => {
    const plain = decodeBasicEntities(String(h1).replace(/<[^>]+>/g, "")).trim();
    return plain === t ? "" : m;
  });
}
|
|
|
|
/**
 * Remove a Word-generated table of contents near the start of the document.
 *
 * Heuristic: within the first 4000 characters, look for a line that consists
 * of "Sommaire" or "Table des matières" (optionally written as a markdown
 * heading), then cut from that line up to the next markdown heading. If no
 * later heading exists, everything from the TOC line onward is dropped.
 *
 * @param {string} markdownOrHtml - Converted document body.
 * @returns {string} The input unchanged when no TOC line is found; otherwise
 *   the text without the TOC block, trimmed and ending with one newline.
 */
function stripWordToc(markdownOrHtml) {
  const text = String(markdownOrHtml || "");
  const head = text.slice(0, 4000);

  // Anchor on the start of the TOC line itself. Leading whitespace is limited
  // to spaces and tabs so the match cannot start on a preceding blank line,
  // which would make the "next heading" search below find the TOC heading.
  const tocLine = /^[ \t]{0,3}(?:#{1,6}[ \t]*)?(?:Sommaire|Table des matières)[ \t]*\r?\n/im;
  const m = tocLine.exec(head);
  if (!m) return markdownOrHtml;

  const startIdx = m.index;
  const afterTocLineIdx = startIdx + m[0].length;

  // Look for the next markdown heading strictly after the TOC line.
  const nextHeading = /^#{1,6}[ \t]+\S/m.exec(text.slice(afterTocLineIdx));

  const kept = nextHeading
    ? text.slice(0, startIdx) + text.slice(afterTocLineIdx + nextHeading.index)
    : text.slice(0, startIdx);

  return kept.trim() + "\n";
}
|
|
|
|
/**
 * Point locally extracted image references at their public URL.
 *
 * Rewrites three shapes:
 *  - markdown targets such as `![](media/image1.png)`,
 *  - HTML `<img src="media/image1.png">`,
 *  - HTML `<img src="image-01.png">` (file names produced by the mammoth path).
 *
 * The pandoc manual states that `--extract-media=DIR` adjusts image
 * references to point at the extracted files, so a `media/` path may arrive
 * prefixed with the extraction directory. Such a leading path is dropped when
 * it ends with `publicPrefix`.
 * NOTE(review): this assumes the extraction directory ends with the public
 * prefix using forward slashes — confirm for Windows paths.
 *
 * @param {string} text - Converted document body.
 * @param {string} publicPrefix - Public URL directory, e.g. "/imported/<collection>/<slug>".
 * @returns {string} The body with rewritten image references.
 */
function rewriteLocalImageLinks(text, publicPrefix) {
  const prefix = String(publicPrefix);
  const escapedPrefix = prefix.replace(/[.*+?^${}()|[\]\\]/g, "\\$&");

  // Optional leading path ending in the public prefix. Skipped when the
  // prefix is empty, otherwise any ".../media/" URL would match.
  const leadingDir = prefix ? `(?:[^\\s)"']*?${escapedPrefix}/)?` : "";

  // Replacer functions are used so "$" in the prefix is never read as a
  // replacement pattern.
  let out = String(text);

  // Markdown targets: ](media/...
  out = out.replace(new RegExp(`\\]\\(\\s*${leadingDir}media/`, "g"), () => `](${prefix}/media/`);

  // HTML img src="media/..."
  out = out.replace(new RegExp(`src="${leadingDir}media/`, "gi"), () => `src="${prefix}/media/`);

  // HTML img src="image-xx.ext"
  out = out.replace(/src="(image-\d+\.[a-z0-9]+)"/gi, (_m, file) => `src="${prefix}/${file}"`);

  return out;
}
|
|
|
|
/**
 * Drop every `<!-- ... -->` comment from the text, including multi-line ones.
 *
 * @param {string} text - Converted document body; falsy input is treated as "".
 * @returns {string} The text without HTML comments.
 */
function stripHtmlComments(text) {
  const input = String(text || "");
  const commentPattern = /<!--[\s\S]*?-->/g;
  return input.replace(commentPattern, "");
}
|
|
|
|
/**
 * Check whether a path can be accessed.
 *
 * @param {string} p - Path to probe.
 * @returns {Promise<boolean>} True if `fs.access` succeeds, false if it fails for any reason.
 */
async function exists(p) {
  let reachable = true;
  try {
    await fs.access(p);
  } catch {
    reachable = false;
  }
  return reachable;
}
|
|
|
|
/**
 * Normalise the destination collection and slug of a manifest entry.
 *
 * Compatibility:
 *  - legacy:  collection="archicratie"  + slug="archicrat-ia/chapitre-3"
 *  - current: collection="archicrat-ia" + slug="chapitre-3"
 *
 * Both forms resolve to collection "archicrat-ia" with the prefix-free slug.
 * Every other pair is returned as given, with the collection trimmed and the
 * slug trimmed and stripped of leading/trailing slashes.
 *
 * @param {string} collection - Collection name from the manifest.
 * @param {string} slug - Slug from the manifest.
 * @returns {{outCollection: string, outSlug: string}}
 */
function normalizeDest(collection, slug) {
  const legacyPrefix = "archicrat-ia/";
  const cleanCollection = String(collection || "").trim();
  const cleanSlug = String(slug || "").trim().replace(/^\/+|\/+$/g, "");

  const isLegacyEntry = cleanCollection === "archicratie" && cleanSlug.startsWith(legacyPrefix);
  if (!isLegacyEntry) {
    return { outCollection: cleanCollection, outSlug: cleanSlug };
  }

  return { outCollection: "archicrat-ia", outSlug: cleanSlug.slice(legacyPrefix.length) };
}
|
|
|
|
/**
 * Entry point: import the selected manifest entries from DOCX to MDX.
 *
 * For each selected entry it resolves the destination paths, refuses to
 * overwrite an existing .mdx file unless --force is given, converts the
 * document (pandoc when available, mammoth otherwise), cleans the result and
 * writes the .mdx file with generated frontmatter. With --dry-run only the
 * planned sources and outputs are printed.
 *
 * @returns {Promise<void>}
 * @throws {Error} If a requested slug is not in the manifest, a source file
 *   is missing, or an output exists and --force was not given.
 */
async function main() {
  const args = parseArgs(process.argv);
  const manifestPath = path.resolve(args.manifest);

  const items = await readManifest(manifestPath);

  const wanted = new Set(args.only ?? []);
  const selected = args.all ? items : items.filter((it) => wanted.has(it.slug));

  if (!args.all) {
    // Compute the missing slugs directly. Comparing list lengths reported a
    // bogus "not found" error (with an empty list) whenever a slug was
    // repeated on the command line or in the manifest.
    const found = new Set(selected.map((s) => s.slug));
    const missing = [...wanted].filter((s) => !found.has(s));
    if (missing.length > 0) {
      throw new Error(`Some --only slugs not found in manifest: ${missing.join(", ")}`);
    }
  }

  const pandocOk = havePandoc();

  console.log(`Manifest: ${manifestPath}`);
  console.log(`Pandoc: ${pandocOk ? "YES (preferred)" : "NO (fallback to mammoth/html)"}`);
  console.log(`Items: ${selected.length}`);

  // Loop-invariant frontmatter defaults.
  const defaultVersion = process.env.PUBLIC_RELEASE || "0.1.0";

  // Default edition/status/level per collection.
  const schemaDefaultsByCollection = {
    archicratie: { edition: "archicratie", status: "modele_sociopolitique", level: 1 },
    "archicrat-ia": { edition: "archicrat-ia", status: "essai_these", level: 1 },
    "cas-ia": { edition: "cas-ia", status: "application", level: 1 },
    traite: { edition: "traite", status: "ontodynamique", level: 1 },
    glossaire: { edition: "glossaire", status: "lexique", level: 1 },
    atlas: { edition: "atlas", status: "atlas", level: 1 },
  };

  for (const it of selected) {
    const docxPath = path.resolve(it.source);

    const { outCollection, outSlug } = normalizeDest(it.collection, it.slug);

    const outFile = path.resolve("src/content", outCollection, `${outSlug}.mdx`);
    const outDir = path.dirname(outFile);

    // Same location twice: as a public URL and as a directory on disk.
    const assetsPublicDir = path.posix.join("/imported", outCollection, outSlug);
    const assetsDiskDir = path.resolve("public", "imported", outCollection, outSlug);

    if (!(await exists(docxPath))) {
      throw new Error(`Missing source docx: ${docxPath}`);
    }
    if ((await exists(outFile)) && !args.force) {
      throw new Error(`Refusing to overwrite existing: ${outFile} (use --force)`);
    }

    console.log(`\n→ ${it.slug}`);
    console.log(` source: ${it.source}`);
    console.log(` out: ${path.relative(process.cwd(), outFile)}`);

    if (args.dryRun) continue;

    await fs.mkdir(outDir, { recursive: true });
    await fs.mkdir(assetsDiskDir, { recursive: true });

    let body = "";
    if (pandocOk) {
      // The assets directory is handed to pandoc's --extract-media option.
      body = runPandoc(docxPath, assetsDiskDir);
      body = stripDuplicateTitle(body, it.title);
      body = stripWordToc(body);
      body = stripHtmlComments(body);
      body = rewriteLocalImageLinks(body, assetsPublicDir);
    } else {
      let html = await runMammoth(docxPath, assetsDiskDir);
      html = stripDuplicateTitle(html, it.title);
      html = stripWordToc(html);
      html = stripHtmlComments(html);
      html = rewriteLocalImageLinks(html, assetsPublicDir);
      // Placeholder paragraph when the conversion produced nothing.
      body = html.trim() ? html : "<p>(Import vide)</p>";
    }

    // Legacy compatibility: a manifest entry with collection="archicratie"
    // and slug="archicrat-ia/..." is written under src/content/archicrat-ia/
    // but keeps the "archicratie" edition/status defaults.
    const isLegacyEntry =
      String(it.collection || "").trim() === "archicratie" &&
      String(it.slug || "").trim().startsWith("archicrat-ia/");
    const defaultsKey = isLegacyEntry ? "archicratie" : outCollection;

    // Collections without an entry in the table fall back to a draft status.
    const defaults =
      schemaDefaultsByCollection[defaultsKey] || {
        edition: defaultsKey,
        status: "draft",
        level: 1,
      };

    const fm = [
      "---",
      `title: "${escapeFrontmatterString(it.title)}"`,
      `edition: "${escapeFrontmatterString(defaults.edition)}"`,
      `status: "${escapeFrontmatterString(defaults.status)}"`,
      `level: ${Number(defaults.level)}`,
      `version: "${escapeFrontmatterString(defaultVersion)}"`,
      `concepts: []`,
      `links: []`,
      `order: ${Number(it.order)}`,
      `summary: ""`,
      `source:`,
      ` kind: docx`,
      ` path: "${escapeFrontmatterString(it.source)}"`,
      "---",
      "",
    ].join("\n");

    const mdx = fm + body.trim() + "\n";
    await fs.writeFile(outFile, mdx, "utf8");
  }

  console.log("\nDone.");
}
|
|
|
|
// Run the importer; on any failure print the error message and exit with status 1.
main().catch((err) => {
  const detail = err?.message || err;
  console.error("\nERROR:", detail);
  process.exit(1);
});