#!/usr/bin/env node
// scripts/build-annotations-index.mjs
// Builds dist/annotations-index.json from src/annotations/**/*.yml
// Supported layouts:
//   - monolith : src/annotations/<page>.yml
//   - shard    : src/annotations/<page>/<paraId>.yml   (paraId = p-<n>-...)
// Invariants:
//   - doc.schema === 1 (documents with any other schema are ignored)
//   - doc.page (when present) equals the page key inferred from the path
//   - shard: doc.paras must contain EXACTLY the key paraId (otherwise fail)
//
// Entries are deep-merged non-destructively (media/refs/comments are
// de-duplicated) and the output ordering is deterministic.

import fs from "node:fs/promises";
import path from "node:path";
import YAML from "yaml";

const ROOT = process.cwd();
const ANNO_ROOT = path.join(ROOT, "src", "annotations");
const DIST_DIR = path.join(ROOT, "dist");
const OUT = path.join(DIST_DIR, "annotations-index.json");

/**
 * Throws an Error carrying `msg` when `cond` is falsy.
 * @param {unknown} cond
 * @param {string} msg
 */
function assert(cond, msg) {
  if (!cond) throw new Error(msg);
}

/** @returns {boolean} true for non-null, non-array objects. */
function isObj(x) {
  return !!x && typeof x === "object" && !Array.isArray(x);
}

/** @returns {boolean} true for arrays. */
function isArr(x) {
  return Array.isArray(x);
}

/**
 * Normalizes a path-like value: backslashes become "/", leading and trailing
 * slashes are stripped. Nullish/falsy input yields "".
 * @param {unknown} s
 * @returns {string}
 */
function normPath(s) {
  return String(s || "")
    .replace(/\\/g, "/")
    .replace(/^\/+|\/+$/g, "");
}

/**
 * Extracts the numeric part of a paragraph id of the form "p-<n>-...".
 * @param {unknown} pid
 * @returns {number} the number, or +Infinity when the id does not match.
 */
function paraNum(pid) {
  const m = String(pid).match(/^p-(\d+)-/i);
  return m ? Number(m[1]) : Number.POSITIVE_INFINITY;
}

/**
 * Total order on paragraph ids: by numeric part first (ids that do not match
 * "p-<n>-" have rank +Infinity and therefore come last), then by string.
 *
 * Comparing numerically only when BOTH ids are numeric and lexically
 * otherwise is not transitive (e.g. "p-10-a" < "p-15" < "p-2-a" < "p-10-a"),
 * which makes the result of Array.prototype.sort unspecified.
 * @param {string} a
 * @param {string} b
 * @returns {number}
 */
function compareParaIds(a, b) {
  const ia = paraNum(a);
  const ib = paraNum(b);
  if (ia !== ib) return ia < ib ? -1 : 1;
  return String(a).localeCompare(String(b));
}

/**
 * Sorts `arr` in place by each item's `ts` (parsed with Date.parse; missing or
 * unparsable timestamps count as 0). Ties are broken by comparing the items'
 * JSON serializations so the order does not depend on input order.
 * Non-array input is ignored.
 * @param {unknown} arr
 */
function stableSortByTs(arr) {
  if (!Array.isArray(arr)) return;
  arr.sort((a, b) => {
    const ta = Date.parse(a?.ts || "") || 0;
    const tb = Date.parse(b?.ts || "") || 0;
    if (ta !== tb) return ta - tb;
    return JSON.stringify(a).localeCompare(JSON.stringify(b));
  });
}

/** De-duplication key for a media item: its `src`. */
function keyMedia(x) {
  return String(x?.src || "");
}

/** De-duplication key for a reference: url, label, kind and citation joined by "||". */
function keyRef(x) {
  return `${x?.url || ""}||${x?.label || ""}||${x?.kind || ""}||${x?.citation || ""}`;
}

/** De-duplication key for an editorial comment: its trimmed `text`. */
function keyComment(x) {
  return String(x?.text || "").trim();
}

/**
 * Returns a new array holding every item of `dst` followed by the items of
 * `src` whose key is not already present. Items of `src` with an empty key are
 * skipped. Neither input is mutated; non-array inputs count as empty.
 * @param {unknown} dst
 * @param {unknown} src
 * @param {(item: any) => string} keyFn
 * @returns {any[]}
 */
function uniqUnion(dst, src, keyFn) {
  const out = isArr(dst) ? [...dst] : [];
  const seen = new Set(out.map((x) => keyFn(x)));
  for (const it of isArr(src) ? src : []) {
    const k = keyFn(it);
    if (!k) continue;
    if (!seen.has(k)) {
      seen.add(k);
      out.push(it);
    }
  }
  return out;
}

/**
 * Merges `src` into `dst` in place without overwriting existing data:
 *   - media / refs / comments_editorial arrays: union, de-duplicated by key
 *   - nested objects: merged recursively
 *   - other arrays: union, de-duplicated by JSON serialization
 *   - scalars: written only when `dst` has no value (missing, null or "")
 * Does nothing unless both arguments are plain objects.
 * @param {object} dst
 * @param {object} src
 */
function deepMergeEntry(dst, src) {
  if (!isObj(dst) || !isObj(src)) return;

  for (const [k, v] of Object.entries(src)) {
    // A "__proto__" key coming from YAML would make dst[k] resolve to
    // Object.prototype and merge attacker/typo-controlled data into it.
    if (k === "__proto__") continue;

    // Only own properties count as "already present"; inherited names such as
    // "constructor" or "toString" must not shadow real data.
    const cur = Object.hasOwn(dst, k) ? dst[k] : undefined;

    if (k === "media" && isArr(v)) {
      dst.media = uniqUnion(cur, v, keyMedia);
      continue;
    }
    if (k === "refs" && isArr(v)) {
      dst.refs = uniqUnion(cur, v, keyRef);
      continue;
    }
    if (k === "comments_editorial" && isArr(v)) {
      dst.comments_editorial = uniqUnion(cur, v, keyComment);
      continue;
    }

    if (isObj(v)) {
      if (!isObj(cur)) dst[k] = {};
      deepMergeEntry(dst[k], v);
      continue;
    }

    if (isArr(v)) {
      const merged = isArr(cur) ? [...cur] : [];
      const seen = new Set(merged.map((x) => JSON.stringify(x)));
      for (const it of v) {
        const s = JSON.stringify(it);
        if (!seen.has(s)) {
          seen.add(s);
          merged.push(it);
        }
      }
      dst[k] = merged;
      continue;
    }

    // scalar: set only if missing/empty
    if (cur == null || cur === "") dst[k] = v;
  }
}

/**
 * Recursively lists the .yml / .yaml files under `dir`.
 * @param {string} dir
 * @returns {Promise<string[]>} file paths (each is `dir` joined with the relative location)
 */
async function walk(dir) {
  const out = [];
  const ents = await fs.readdir(dir, { withFileTypes: true });
  for (const e of ents) {
    const p = path.join(dir, e.name);
    if (e.isDirectory()) out.push(...(await walk(p)));
    else if (e.isFile() && /\.ya?ml$/i.test(e.name)) out.push(p);
  }
  return out;
}

/**
 * Infers what a file is expected to contain from its extension-less path
 * relative to the annotations root.
 *
 * A file is a shard when its basename looks like a paragraph id ("p-<n>-...")
 * AND it lives inside a page directory; its page key is then the directory
 * path. Anything else is a monolith whose page key is the whole relative path.
 * (Requiring a parent directory prevents a top-level "p-1-x.yml" from being
 * indexed under an empty page key.)
 * @param {string} relNoExt
 * @returns {{ isShard: boolean, pageKey: string, paraId: string | null }}
 */
function inferExpectedFromRel(relNoExt) {
  const parts = relNoExt.split("/").filter(Boolean);
  const last = parts.at(-1) || "";
  const isShard = parts.length > 1 && /^p-\d+-/i.test(last);
  const pageKey = isShard ? parts.slice(0, -1).join("/") : relNoExt;
  const paraId = isShard ? last : null;
  return { isShard, pageKey, paraId };
}

/**
 * Checks a parsed annotation document against the invariants listed at the top
 * of this file and fills in `doc.page` when it is absent (mutates `doc`).
 * @param {any} doc parsed YAML document
 * @param {string} relFile file label used as prefix in error messages
 * @param {string} expectedPageKey page key inferred from the file path
 * @param {string | null} expectedParaId paragraph id for shard files, else null
 * @returns {object} the same `doc`
 * @throws {Error} when an invariant is violated
 */
function validateAndNormalizeDoc(doc, relFile, expectedPageKey, expectedParaId) {
  assert(isObj(doc), `${relFile}: doc must be an object`);
  assert(doc.schema === 1, `${relFile}: schema must be 1`);
  assert(isObj(doc.paras), `${relFile}: missing object key "paras"`);

  const gotPage = doc.page != null ? normPath(doc.page) : "";
  const expPage = normPath(expectedPageKey);

  if (gotPage) {
    assert(
      gotPage === expPage,
      `${relFile}: page mismatch (page="${doc.page}" vs path="${expectedPageKey}")`
    );
  } else {
    doc.page = expPage;
  }

  if (expectedParaId) {
    // shard invariant: exactly one key, the one named by the file
    const keys = Object.keys(doc.paras);
    assert(
      keys.includes(expectedParaId),
      `${relFile}: shard mismatch: must contain paras["${expectedParaId}"]`
    );
    assert(
      keys.length === 1 && keys[0] === expectedParaId,
      `${relFile}: shard invariant violated: shard file must contain ONLY paras["${expectedParaId}"] (got: ${keys.join(", ")})`
    );
  }

  return doc;
}

/**
 * Merges every paragraph entry of a validated document into the page
 * accumulator and re-sorts the timestamped lists of each touched paragraph.
 * Works for both layouts: a validated shard holds exactly one paragraph.
 * @param {{ paras: Record<string, object> }} pg page accumulator (mutated)
 * @param {Record<string, unknown>} paras `doc.paras` of the document
 */
function mergeParasIntoPage(pg, paras) {
  for (const [pid, entry] of Object.entries(paras || {})) {
    if (!isObj(pg.paras[pid])) pg.paras[pid] = {};
    const target = pg.paras[pid];
    if (isObj(entry)) deepMergeEntry(target, entry);

    stableSortByTs(target.media);
    stableSortByTs(target.refs);
    stableSortByTs(target.comments_editorial);
  }
}

/**
 * Entry point: reads every annotation file, merges them per page and writes
 * the index to dist/annotations-index.json. Per-file failures are collected;
 * if there is at least one, nothing is written and the function throws.
 * @returns {Promise<void>}
 */
async function main() {
  // Null-prototype dictionaries: page keys and paragraph ids come from file
  // names / YAML keys and must never collide with Object.prototype members.
  const pages = Object.create(null);
  const errors = [];

  await fs.mkdir(DIST_DIR, { recursive: true });

  // readdir order is platform-dependent and scalar merging is "first wins",
  // so process files in a fixed order to keep the output reproducible.
  const files = (await walk(ANNO_ROOT))
    .map((fp) => ({ fp, rel: normPath(path.relative(ANNO_ROOT, fp)) })) // e.g. archicrat-ia/chapitre-4/p-11-...
    .sort((a, b) => (a.rel < b.rel ? -1 : a.rel > b.rel ? 1 : 0));

  for (const { fp, rel } of files) {
    const relNoExt = rel.replace(/\.ya?ml$/i, ""); // no ext
    const { isShard, pageKey, paraId } = inferExpectedFromRel(relNoExt);

    try {
      const raw = await fs.readFile(fp, "utf8");
      const doc = YAML.parse(raw) || {};

      // ignore non schema:1
      if (!isObj(doc) || doc.schema !== 1) continue;

      validateAndNormalizeDoc(
        doc,
        `src/annotations/${rel}`,
        pageKey,
        isShard ? paraId : null
      );

      const pg = (pages[pageKey] ??= { paras: Object.create(null) });
      mergeParasIntoPage(pg, doc.paras);
    } catch (e) {
      errors.push({ file: `src/annotations/${rel}`, error: String(e?.message || e) });
    }
  }

  // sort paras per page
  for (const pg of Object.values(pages)) {
    const sorted = Object.create(null);
    for (const k of Object.keys(pg.paras || {}).sort(compareParaIds)) {
      sorted[k] = pg.paras[k];
    }
    pg.paras = sorted;
  }

  const out = {
    // NOTE(review): the diff this file was reconstructed from hides two
    // unchanged lines at this position (between `const out = {` and `pages`).
    // They are NOT reproduced here — restore them from the working tree
    // before merging.
    pages,
    stats: {
      pages: Object.keys(pages).length,
      paras: Object.values(pages).reduce((n, p) => n + Object.keys(p.paras || {}).length, 0),
      errors: errors.length,
    },
    errors,
  };

  // CI behaviour: if ANY error => fail build
  if (errors.length) {
    // Surface every collected error, not just the one carried by the exception.
    for (const { file, error } of errors) {
      console.error(`annotations-index: ${file}: ${error}`);
    }
    throw new Error(`${errors[0].file}: ${errors[0].error}`);
  }

  await fs.writeFile(OUT, JSON.stringify(out), "utf8");
  console.log(`✅ annotations-index: pages=${out.stats.pages} paras=${out.stats.paras} -> dist/annotations-index.json`);
}

main().catch((e) => {
  console.error(`FAIL: build-annotations-index crashed: ${e?.stack || e?.message || e}`);
  process.exit(1);
});