anno: build-annotations-index supports shard annotations
This commit is contained in:
@@ -1,28 +1,106 @@
|
|||||||
|
#!/usr/bin/env node
|
||||||
// scripts/build-annotations-index.mjs
|
// scripts/build-annotations-index.mjs
|
||||||
|
// Construit dist/annotations-index.json à partir de src/annotations/**/*.yml
|
||||||
|
// Supporte:
|
||||||
|
// - monolith : src/annotations/<pageKey>.yml
|
||||||
|
// - shard : src/annotations/<pageKey>/<paraId>.yml (paraId = p-<n>-...)
|
||||||
|
// Invariants:
|
||||||
|
// - doc.schema === 1
|
||||||
|
// - doc.page (si présent) == pageKey déduit du chemin
|
||||||
|
// - shard: doc.paras doit contenir EXACTEMENT la clé paraId (sinon fail)
|
||||||
|
//
|
||||||
|
// Deep-merge non destructif (media/refs/comments dédupliqués), tri stable.
|
||||||
|
|
||||||
import fs from "node:fs/promises";
|
import fs from "node:fs/promises";
|
||||||
import path from "node:path";
|
import path from "node:path";
|
||||||
import YAML from "yaml";
|
import YAML from "yaml";
|
||||||
|
|
||||||
function parseArgs(argv) {
|
const ROOT = process.cwd();
|
||||||
const out = {
|
const ANNO_ROOT = path.join(ROOT, "src", "annotations");
|
||||||
inDir: "src/annotations",
|
const DIST_DIR = path.join(ROOT, "dist");
|
||||||
outFile: "dist/annotations-index.json",
|
const OUT = path.join(DIST_DIR, "annotations-index.json");
|
||||||
};
|
|
||||||
|
|
||||||
for (let i = 0; i < argv.length; i++) {
|
function assert(cond, msg) {
|
||||||
const a = argv[i];
|
if (!cond) throw new Error(msg);
|
||||||
|
}
|
||||||
|
|
||||||
if (a === "--in" && argv[i + 1]) out.inDir = argv[++i];
|
/**
 * Tells whether a value is a non-null, non-array object.
 *
 * @param {unknown} x - Value to test.
 * @returns {boolean} `true` when `x` is truthy, has `typeof` "object" and is not an array.
 */
function isObj(x) {
  return !!x && typeof x === "object" && !Array.isArray(x);
}
|
||||||
|
/**
 * Tells whether a value is an array.
 *
 * @param {unknown} x - Value to test.
 * @returns {boolean} Result of `Array.isArray(x)`.
 */
function isArr(x) {
  return Array.isArray(x);
}
|
||||||
|
|
||||||
if (a === "--out" && argv[i + 1]) out.outFile = argv[++i];
|
/**
 * Normalizes a path-like value to forward slashes without leading or
 * trailing slashes.
 *
 * @param {unknown} s - Path-like value; any falsy value is treated as "".
 * @returns {string} The value as a string, with every backslash replaced by
 *   "/" and all leading/trailing "/" characters removed. Inner repeated
 *   slashes are left as they are.
 */
function normPath(s) {
  return String(s || "")
    .replace(/\\/g, "/")
    .replace(/^\/+|\/+$/g, "");
}
|
||||||
|
|
||||||
|
/**
 * Extracts the numeric part of a paragraph id shaped like `p-<n>-...`.
 *
 * @param {unknown} pid - Paragraph id; it is converted to a string first.
 * @returns {number} The number between the `p-` prefix (matched
 *   case-insensitively) and the following `-`, or
 *   `Number.POSITIVE_INFINITY` when the id does not have that shape.
 */
function paraNum(pid) {
  const m = String(pid).match(/^p-(\d+)-/i);
  return m ? Number(m[1]) : Number.POSITIVE_INFINITY;
}
|
||||||
|
|
||||||
|
/**
 * Sorts an array of entries in place by their `ts` timestamp, oldest first.
 *
 * Entries whose `ts` is missing or cannot be parsed by `Date.parse` are
 * treated as timestamp 0. Entries with equal timestamps are ordered by their
 * JSON serialization using a plain code-unit comparison, so the resulting
 * order does not depend on the locale of the machine running the build.
 *
 * @param {unknown} arr - Array to sort; any non-array value is ignored.
 * @returns {void}
 */
function stableSortByTs(arr) {
  if (!Array.isArray(arr)) return;

  const tsOf = (item) => Date.parse(item?.ts || "") || 0;
  // Ordinal comparison: locale-independent, unlike String#localeCompare.
  const ordinal = (x, y) => {
    if (x < y) return -1;
    if (x > y) return 1;
    return 0;
  };

  arr.sort((a, b) => {
    const ta = tsOf(a);
    const tb = tsOf(b);
    if (ta !== tb) return ta - tb;
    return ordinal(JSON.stringify(a), JSON.stringify(b));
  });
}
|
||||||
|
|
||||||
|
/**
 * Deduplication key of a media entry: its `src` as a string.
 *
 * @param {unknown} x - Media entry (may be null/undefined).
 * @returns {string} `String(x.src)` when `src` is truthy, otherwise "".
 */
function keyMedia(x) {
  const source = x?.src;
  if (!source) return "";
  return String(source);
}
|
||||||
|
/**
 * Deduplication key of a reference entry, built from its `url`, `label`,
 * `kind` and `citation` fields joined with "||".
 *
 * Falsy fields contribute an empty segment, so the key is never the empty
 * string (an entry with no fields yields "||||||").
 *
 * @param {unknown} x - Reference entry (may be null/undefined).
 * @returns {string} The composite key.
 */
function keyRef(x) {
  return `${x?.url || ""}||${x?.label || ""}||${x?.kind || ""}||${x?.citation || ""}`;
}
|
||||||
|
/**
 * Deduplication key of an editorial comment: its trimmed `text`.
 *
 * @param {unknown} x - Comment entry (may be null/undefined).
 * @returns {string} `String(x.text).trim()` when `text` is truthy, otherwise "".
 */
function keyComment(x) {
  const body = x?.text;
  if (!body) return "";
  return String(body).trim();
}
|
||||||
|
|
||||||
|
/**
 * Returns the union of two lists, deduplicated by a caller-supplied key.
 *
 * The result starts as a shallow copy of `dst` (all of its items are kept,
 * in order). Each item of `src` is then appended unless its key has already
 * been seen. Neither input array is mutated.
 *
 * @param {unknown} dst - Existing items; a non-array is treated as empty.
 * @param {unknown} src - Items to add; a non-array is treated as empty.
 * @param {(item: unknown) => string} keyFn - Computes the dedup key of an item.
 * @returns {unknown[]} A new array holding the merged items.
 */
function uniqUnion(dst, src, keyFn) {
  const out = Array.isArray(dst) ? [...dst] : [];
  const seen = new Set(out.map((x) => keyFn(x)));

  for (const it of Array.isArray(src) ? src : []) {
    const k = keyFn(it);
    // Incoming items with an empty key cannot be identified and are skipped.
    if (!k) continue;
    if (!seen.has(k)) {
      seen.add(k);
      out.push(it);
    }
  }

  return out;
}
|
||||||
|
|
||||||
async function exists(p) {
|
/**
 * Non-destructively merges the annotation entry `src` into `dst`, in place.
 *
 * Merge rules, applied per own enumerable key of `src`:
 * - `media`, `refs`, `comments_editorial` (when arrays): union with the
 *   existing list, deduplicated with `keyMedia`, `keyRef` and `keyComment`.
 * - plain objects: merged recursively (a non-object value already in `dst`
 *   is replaced by a fresh object first).
 * - other arrays: union deduplicated by JSON serialization.
 * - everything else: written only when `dst` has no own value for the key,
 *   or that value is null/undefined/"".
 *
 * An own `__proto__` key in `src` is ignored. Parsed documents can carry one
 * (`JSON.parse` creates it as an own property), and merging it would recurse
 * into, or reassign, the prototype of `dst`.
 *
 * @param {object} dst - Entry that receives the data; mutated.
 * @param {object} src - Entry to merge in; not mutated.
 * @returns {void} Nothing; the call is a no-op unless both are plain objects.
 */
function deepMergeEntry(dst, src) {
  if (!isObj(dst) || !isObj(src)) return;

  // Own-property test: `k in dst` would also match inherited members such as
  // `toString`, which made same-named scalar keys impossible to store.
  const hasOwn = (obj, key) => Object.prototype.hasOwnProperty.call(obj, key);

  for (const [k, v] of Object.entries(src)) {
    if (k === "__proto__") continue; // prototype-pollution guard

    if (k === "media" && isArr(v)) { dst.media = uniqUnion(dst.media, v, keyMedia); continue; }
    if (k === "refs" && isArr(v)) { dst.refs = uniqUnion(dst.refs, v, keyRef); continue; }
    if (k === "comments_editorial" && isArr(v)) { dst.comments_editorial = uniqUnion(dst.comments_editorial, v, keyComment); continue; }

    if (isObj(v)) {
      if (!hasOwn(dst, k) || !isObj(dst[k])) dst[k] = {};
      deepMergeEntry(dst[k], v);
      continue;
    }

    if (isArr(v)) {
      const cur = isArr(dst[k]) ? dst[k] : [];
      const seen = new Set(cur.map((x) => JSON.stringify(x)));
      const out = [...cur];
      for (const it of v) {
        const s = JSON.stringify(it);
        if (!seen.has(s)) { seen.add(s); out.push(it); }
      }
      dst[k] = out;
      continue;
    }

    // scalar: set only if missing/empty
    const current = hasOwn(dst, k) ? dst[k] : undefined;
    if (current == null || current === "") dst[k] = v;
  }
}
|
||||||
|
|
||||||
async function walk(dir) {
|
async function walk(dir) {
|
||||||
@@ -30,111 +108,119 @@ async function walk(dir) {
|
|||||||
const ents = await fs.readdir(dir, { withFileTypes: true });
|
const ents = await fs.readdir(dir, { withFileTypes: true });
|
||||||
for (const e of ents) {
|
for (const e of ents) {
|
||||||
const p = path.join(dir, e.name);
|
const p = path.join(dir, e.name);
|
||||||
if (e.isDirectory()) out.push(...(await walk(p)));
|
if (e.isDirectory()) out.push(...await walk(p));
|
||||||
else out.push(p);
|
else if (e.isFile() && /\.ya?ml$/i.test(e.name)) out.push(p);
|
||||||
}
|
}
|
||||||
return out;
|
return out;
|
||||||
}
|
}
|
||||||
|
|
||||||
function inferPageKeyFromFile(inDirAbs, fileAbs) {
|
/**
 * Derives, from an annotation file path (relative to the annotations root,
 * "/"-separated, extension removed), which page it belongs to and whether it
 * is a per-paragraph shard.
 *
 * A file is a shard when its last segment looks like a paragraph id
 * (`p-<n>-...`) AND it sits inside at least one directory. The directory
 * path is then the page key and the last segment is the paragraph id. Any
 * other file is a monolith whose page key is the whole relative path. In
 * particular, a root-level file named like a paragraph id is a monolith, so
 * the page key is never empty for it.
 *
 * @param {string} relNoExt - Relative path without extension.
 * @returns {{ isShard: boolean, pageKey: string, paraId: (string|null) }}
 *   `paraId` is null for monolith files.
 */
function inferExpectedFromRel(relNoExt) {
  const parts = relNoExt.split("/").filter(Boolean);
  const last = parts.at(-1) || "";
  // A shard needs a parent page directory; without one the page key would be "".
  const isShard = parts.length > 1 && /^p-\d+-/i.test(last);
  const pageKey = isShard ? parts.slice(0, -1).join("/") : relNoExt;
  const paraId = isShard ? last : null;
  return { isShard, pageKey, paraId };
}
|
||||||
|
|
||||||
function assert(cond, msg) {
|
function validateAndNormalizeDoc(doc, relFile, expectedPageKey, expectedParaId) {
|
||||||
if (!cond) throw new Error(msg);
|
assert(isObj(doc), `${relFile}: doc must be an object`);
|
||||||
}
|
assert(doc.schema === 1, `${relFile}: schema must be 1`);
|
||||||
|
assert(isObj(doc.paras), `${relFile}: missing object key "paras"`);
|
||||||
|
|
||||||
function isPlainObject(x) {
|
const gotPage = doc.page != null ? normPath(doc.page) : "";
|
||||||
return !!x && typeof x === "object" && !Array.isArray(x);
|
const expPage = normPath(expectedPageKey);
|
||||||
}
|
|
||||||
|
|
||||||
function normalizePageKey(s) {
|
if (gotPage) {
|
||||||
// pas de / en tête/fin
|
|
||||||
return String(s || "").replace(/^\/+/, "").replace(/\/+$/, "");
|
|
||||||
}
|
|
||||||
|
|
||||||
function validateAndNormalizeDoc(doc, pageKey, fileRel) {
|
|
||||||
assert(isPlainObject(doc), `${fileRel}: document must be an object`);
|
|
||||||
assert(doc.schema === 1, `${fileRel}: schema must be 1`);
|
|
||||||
if (doc.page != null) {
|
|
||||||
assert(
|
assert(
|
||||||
normalizePageKey(doc.page) === pageKey,
|
gotPage === expPage,
|
||||||
`${fileRel}: page mismatch (page="${doc.page}" vs path="${pageKey}")`
|
`${relFile}: page mismatch (page="${doc.page}" vs path="${expectedPageKey}")`
|
||||||
|
);
|
||||||
|
} else {
|
||||||
|
doc.page = expPage;
|
||||||
|
}
|
||||||
|
|
||||||
|
if (expectedParaId) {
|
||||||
|
// invariant shard : exactement 1 clé, celle du filename
|
||||||
|
const keys = Object.keys(doc.paras || {}).map(String);
|
||||||
|
assert(
|
||||||
|
keys.includes(expectedParaId),
|
||||||
|
`${relFile}: shard mismatch: must contain paras["${expectedParaId}"]`
|
||||||
|
);
|
||||||
|
assert(
|
||||||
|
keys.length === 1 && keys[0] === expectedParaId,
|
||||||
|
`${relFile}: shard invariant violated: shard file must contain ONLY paras["${expectedParaId}"] (got: ${keys.join(", ")})`
|
||||||
);
|
);
|
||||||
}
|
}
|
||||||
assert(isPlainObject(doc.paras), `${fileRel}: missing object key "paras"`);
|
|
||||||
|
|
||||||
const parasOut = Object.create(null);
|
return doc;
|
||||||
|
|
||||||
for (const [paraId, entry] of Object.entries(doc.paras)) {
|
|
||||||
assert(/^p-\d+-/i.test(paraId), `${fileRel}: invalid para id "${paraId}"`);
|
|
||||||
|
|
||||||
// entry peut être vide, mais doit être un objet si présent
|
|
||||||
assert(entry == null || isPlainObject(entry), `${fileRel}: paras.${paraId} must be an object`);
|
|
||||||
|
|
||||||
const e = entry ? { ...entry } : {};
|
|
||||||
|
|
||||||
// Sanity checks (non destructifs : on n’écrase pas, on vérifie juste les types)
|
|
||||||
if (e.refs != null) assert(Array.isArray(e.refs), `${fileRel}: paras.${paraId}.refs must be an array`);
|
|
||||||
if (e.authors != null) assert(Array.isArray(e.authors), `${fileRel}: paras.${paraId}.authors must be an array`);
|
|
||||||
if (e.quotes != null) assert(Array.isArray(e.quotes), `${fileRel}: paras.${paraId}.quotes must be an array`);
|
|
||||||
if (e.media != null) assert(Array.isArray(e.media), `${fileRel}: paras.${paraId}.media must be an array`);
|
|
||||||
if (e.comments_editorial != null) assert(Array.isArray(e.comments_editorial), `${fileRel}: paras.${paraId}.comments_editorial must be an array`);
|
|
||||||
|
|
||||||
parasOut[paraId] = e;
|
|
||||||
}
|
|
||||||
|
|
||||||
return parasOut;
|
|
||||||
}
|
|
||||||
|
|
||||||
async function readDoc(fileAbs) {
|
|
||||||
const raw = await fs.readFile(fileAbs, "utf8");
|
|
||||||
if (/\.json$/i.test(fileAbs)) return JSON.parse(raw);
|
|
||||||
return YAML.parse(raw);
|
|
||||||
}
|
}
|
||||||
|
|
||||||
async function main() {
|
async function main() {
|
||||||
const { inDir, outFile } = parseArgs(process.argv.slice(2));
|
const pages = {};
|
||||||
const CWD = process.cwd();
|
const errors = [];
|
||||||
|
|
||||||
const inDirAbs = path.isAbsolute(inDir) ? inDir : path.join(CWD, inDir);
|
await fs.mkdir(DIST_DIR, { recursive: true });
|
||||||
const outAbs = path.isAbsolute(outFile) ? outFile : path.join(CWD, outFile);
|
|
||||||
|
|
||||||
// antifragile
|
const files = await walk(ANNO_ROOT);
|
||||||
if (!(await exists(inDirAbs))) {
|
|
||||||
console.log(`ℹ️ annotations-index: skip (input missing): ${inDir}`);
|
|
||||||
process.exit(0);
|
|
||||||
}
|
|
||||||
|
|
||||||
const files = (await walk(inDirAbs)).filter((p) => /\.(ya?ml|json)$/i.test(p));
|
for (const fp of files) {
|
||||||
if (!files.length) {
|
const rel = normPath(path.relative(ANNO_ROOT, fp)); // e.g. archicrat-ia/chapitre-4/p-11-...
|
||||||
console.log(`ℹ️ annotations-index: skip (no .yml/.yaml/.json found in): ${inDir}`);
|
const relNoExt = rel.replace(/\.ya?ml$/i, ""); // no ext
|
||||||
process.exit(0);
|
const { isShard, pageKey, paraId } = inferExpectedFromRel(relNoExt);
|
||||||
}
|
|
||||||
|
|
||||||
const pages = Object.create(null);
|
|
||||||
let paraCount = 0;
|
|
||||||
|
|
||||||
for (const f of files) {
|
|
||||||
const fileRel = path.relative(CWD, f).replace(/\\/g, "/");
|
|
||||||
const pageKey = normalizePageKey(inferPageKeyFromFile(inDirAbs, f));
|
|
||||||
assert(pageKey, `${fileRel}: cannot infer page key`);
|
|
||||||
|
|
||||||
let doc;
|
|
||||||
try {
|
try {
|
||||||
doc = await readDoc(f);
|
const raw = await fs.readFile(fp, "utf8");
|
||||||
|
const doc = YAML.parse(raw) || {};
|
||||||
|
|
||||||
|
// ignore non schema:1
|
||||||
|
if (!isObj(doc) || doc.schema !== 1) continue;
|
||||||
|
|
||||||
|
validateAndNormalizeDoc(
|
||||||
|
doc,
|
||||||
|
`src/annotations/${rel}`,
|
||||||
|
pageKey,
|
||||||
|
isShard ? paraId : null
|
||||||
|
);
|
||||||
|
|
||||||
|
const pg = (pages[pageKey] ??= { paras: {} });
|
||||||
|
|
||||||
|
if (isShard) {
|
||||||
|
const entry = doc.paras[paraId];
|
||||||
|
if (!isObj(pg.paras[paraId])) pg.paras[paraId] = {};
|
||||||
|
if (isObj(entry)) deepMergeEntry(pg.paras[paraId], entry);
|
||||||
|
|
||||||
|
stableSortByTs(pg.paras[paraId].media);
|
||||||
|
stableSortByTs(pg.paras[paraId].refs);
|
||||||
|
stableSortByTs(pg.paras[paraId].comments_editorial);
|
||||||
|
} else {
|
||||||
|
for (const [pid, entry] of Object.entries(doc.paras || {})) {
|
||||||
|
const p = String(pid);
|
||||||
|
if (!isObj(pg.paras[p])) pg.paras[p] = {};
|
||||||
|
if (isObj(entry)) deepMergeEntry(pg.paras[p], entry);
|
||||||
|
|
||||||
|
stableSortByTs(pg.paras[p].media);
|
||||||
|
stableSortByTs(pg.paras[p].refs);
|
||||||
|
stableSortByTs(pg.paras[p].comments_editorial);
|
||||||
|
}
|
||||||
|
}
|
||||||
} catch (e) {
|
} catch (e) {
|
||||||
throw new Error(`${fileRel}: parse failed: ${String(e?.message ?? e)}`);
|
errors.push({ file: `src/annotations/${rel}`, error: String(e?.message || e) });
|
||||||
}
|
}
|
||||||
|
}
|
||||||
|
|
||||||
const paras = validateAndNormalizeDoc(doc, pageKey, fileRel);
|
// sort paras per page
|
||||||
|
for (const [pageKey, pg] of Object.entries(pages)) {
|
||||||
// 1 fichier = 1 page (canon)
|
const keys = Object.keys(pg.paras || {});
|
||||||
assert(!pages[pageKey], `${fileRel}: duplicate page "${pageKey}" (only one file per page)`);
|
keys.sort((a, b) => {
|
||||||
pages[pageKey] = { paras };
|
const ia = paraNum(a);
|
||||||
paraCount += Object.keys(paras).length;
|
const ib = paraNum(b);
|
||||||
|
if (Number.isFinite(ia) && Number.isFinite(ib) && ia !== ib) return ia - ib;
|
||||||
|
return String(a).localeCompare(String(b));
|
||||||
|
});
|
||||||
|
const next = {};
|
||||||
|
for (const k of keys) next[k] = pg.paras[k];
|
||||||
|
pg.paras = next;
|
||||||
}
|
}
|
||||||
|
|
||||||
const out = {
|
const out = {
|
||||||
@@ -143,17 +229,22 @@ async function main() {
|
|||||||
pages,
|
pages,
|
||||||
stats: {
|
stats: {
|
||||||
pages: Object.keys(pages).length,
|
pages: Object.keys(pages).length,
|
||||||
paras: paraCount,
|
paras: Object.values(pages).reduce((n, p) => n + Object.keys(p.paras || {}).length, 0),
|
||||||
|
errors: errors.length,
|
||||||
},
|
},
|
||||||
|
errors,
|
||||||
};
|
};
|
||||||
|
|
||||||
await fs.mkdir(path.dirname(outAbs), { recursive: true });
|
// CI behaviour: if ANY error => fail build
|
||||||
await fs.writeFile(outAbs, JSON.stringify(out), "utf8");
|
if (errors.length) {
|
||||||
|
throw new Error(`${errors[0].file}: ${errors[0].error}`);
|
||||||
|
}
|
||||||
|
|
||||||
console.log(`✅ annotations-index: pages=${out.stats.pages} paras=${out.stats.paras} -> ${path.relative(CWD, outAbs)}`);
|
await fs.writeFile(OUT, JSON.stringify(out), "utf8");
|
||||||
|
console.log(`✅ annotations-index: pages=${out.stats.pages} paras=${out.stats.paras} -> dist/annotations-index.json`);
|
||||||
}
|
}
|
||||||
|
|
||||||
main().catch((e) => {
|
main().catch((e) => {
|
||||||
console.error("FAIL: build-annotations-index crashed:", e);
|
console.error(`FAIL: build-annotations-index crashed: ${e?.stack || e?.message || e}`);
|
||||||
process.exit(1);
|
process.exit(1);
|
||||||
});
|
});
|
||||||
Reference in New Issue
Block a user