Files
crawler_js/graph-utils.js
2026-03-30 01:32:43 +01:00

417 lines
9.5 KiB
JavaScript

// graph-utils.js
//@7
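/*
Consumes : flows.json
         : selectors.json
         : status.json (optional; treated as empty when it cannot be read)
Produces : graph object consumed by graph-data.js
Node labels are short, friendly paths (URL pathname, or "<path>#ui:<friendly>"
for UI/section nodes) rather than full URLs.
*/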
// Module-level side effect: prints a trace line when this module is evaluated.
console.log("Loading: graph-utils.js");
import fs from "fs";
/** Damping factor used by the PageRank power iteration. */
const PAGE_RANK_DAMPING = 0.85;
/** Fixed number of PageRank power-iteration rounds. */
const PAGE_RANK_ITERATIONS = 20;
/** Cycle detection stops as soon as more than this many cycles were recorded. */
const MAX_CYCLES = 50;

/**
 * Canonical node id for a URL: trimmed, trailing slashes removed, lower-cased.
 * Falsy input yields the empty string.
 */
function normalizeUrl(url) {
  return String(url || "")
    .trim()
    .replace(/\/+$/, "")
    .toLowerCase();
}

/**
 * Pathname of an absolute URL, or the input unchanged when it cannot be
 * parsed as one (relative paths, synthetic "<path>#ui:<name>" ids, ...).
 */
function shortPath(url) {
  try {
    return new URL(url).pathname || "/";
  } catch {
    return url;
  }
}

/**
 * Cluster of a path: its first path segment, or "root" when there is none.
 * Any "#fragment" is ignored so that synthetic UI/section nodes fall into the
 * same cluster as the page they belong to, and "/" maps to "root" instead of
 * the empty string.
 */
function clusterName(pathLike) {
  const [withoutFragment] = String(pathLike).split("#");
  const firstSegment = withoutFragment.split("/")[1];
  return firstSegment || "root";
}

/**
 * Best-effort JSON reader: returns the parsed object, or {} when the file is
 * missing, unreadable, malformed, or does not contain an object. Anything
 * other than a missing file is reported with a warning instead of being
 * swallowed silently.
 */
function readOptionalJson(filePath) {
  let parsed;
  try {
    parsed = JSON.parse(fs.readFileSync(filePath, "utf8"));
  } catch (err) {
    if (!err || err.code !== "ENOENT") {
      console.warn(
        `graph-utils: ignoring unreadable ${filePath}: ${err && err.message}`
      );
    }
    return {};
  }
  return parsed !== null && typeof parsed === "object" ? parsed : {};
}

/**
 * Runs a fixed number of PageRank iterations and returns one score per entry
 * of `nodeIds` (same order). Rank held by nodes without outgoing edges is not
 * redistributed, so the scores need not sum to 1.
 */
function computePageRank(nodeIds, edges) {
  const count = nodeIds.length;
  const position = new Map(nodeIds.map((id, i) => [id, i]));
  const incoming = Array.from({ length: count }, () => []);
  const outDegree = new Array(count).fill(0);

  for (const { from, to } of edges) {
    const a = position.get(from);
    const b = position.get(to);
    if (a == null || b == null) continue;
    outDegree[a]++;
    incoming[b].push(a);
  }

  let rank = new Array(count).fill(1 / count);
  for (let round = 0; round < PAGE_RANK_ITERATIONS; round++) {
    const next = new Array(count).fill((1 - PAGE_RANK_DAMPING) / count);
    for (let i = 0; i < count; i++) {
      // Every source listed in incoming[i] has at least one outgoing edge,
      // so outDegree[j] is never zero here.
      for (const j of incoming[i]) {
        next[i] += PAGE_RANK_DAMPING * (rank[j] / outDegree[j]);
      }
    }
    rank = next;
  }
  return rank;
}

/**
 * Depth-first cycle detection over `adjacency` (Map of id -> successor ids).
 * Each back edge to a node on the current DFS path records the path slice
 * from that node onwards. Uses an explicit stack rather than recursion so
 * long link chains cannot overflow the call stack. Stops once more than
 * `limit` cycles were collected.
 */
function findCycles(nodeIds, adjacency, limit) {
  const visited = new Set();
  const onPath = new Set();
  const cycles = [];

  for (const start of nodeIds) {
    if (visited.has(start)) continue;

    const path = [start];
    const frames = [{ node: start, cursor: 0 }];
    visited.add(start);
    onPath.add(start);

    while (frames.length > 0) {
      const frame = frames[frames.length - 1];
      const successors = adjacency.get(frame.node);

      if (frame.cursor >= successors.length) {
        // All successors handled: leave this node.
        frames.pop();
        path.pop();
        onPath.delete(frame.node);
        continue;
      }

      const child = successors[frame.cursor++];
      if (onPath.has(child)) {
        cycles.push(path.slice(path.indexOf(child)));
        if (cycles.length > limit) return cycles;
      } else if (!visited.has(child)) {
        visited.add(child);
        onPath.add(child);
        path.push(child);
        frames.push({ node: child, cursor: 0 });
      }
    }
  }
  return cycles;
}

/**
 * Builds the site graph from the crawler's JSON outputs.
 *
 * Nodes come from flow steps, selector pages, link targets, form actions and
 * synthetic "<path>#ui:<friendly>" / "<path>#section:<friendly>" entries;
 * edges connect consecutive flow steps and each page to its targets.
 *
 * @param {object} [options]
 * @param {string} [options.flowsPath="./flows.json"] file with a `flows` array of URL arrays
 * @param {string} [options.selectorsPath="./selectors.json"] file mapping page URL -> { links, forms, ui, sections }
 * @param {string} [options.statusPath="./status.json"] optional file mapping URL -> { status, finalUrl, soft404 }
 * @returns {{
 *   nodes: Map<string, {fullUrl: string, label: string, cluster: string, types: Set<string>}>,
 *   edges: Array<{from: string, to: string, type: string}>,
 *   deadEnds: string[],
 *   pageRank: Object<string, number>,
 *   cycles: string[][],
 *   rankThreshold: number,
 *   sitemap: Array<{url: string, cluster: string, types: string[]}>,
 *   brokenLinks: Array<{url: string, status: number, finalUrl: string}>
 * }}
 * @throws if flows.json or selectors.json cannot be read or parsed
 */
export function buildGraph({
  flowsPath = "./flows.json",
  selectorsPath = "./selectors.json",
  statusPath = "./status.json",
} = {}) {
  const flows = JSON.parse(fs.readFileSync(flowsPath, "utf8")).flows || [];
  const selectors = JSON.parse(fs.readFileSync(selectorsPath, "utf8"));
  const status = readOptionalJson(statusPath);

  const nodes = new Map();
  const edges = [];

  // Registers a node (once per normalized id) and tags it with `type`.
  function addNode(fullUrl, type = null) {
    const id = normalizeUrl(fullUrl);
    if (!id) return null;
    if (!nodes.has(id)) {
      const label = shortPath(fullUrl);
      nodes.set(id, {
        fullUrl,
        label,
        cluster: clusterName(label),
        types: new Set(),
      });
    }
    if (type) nodes.get(id).types.add(type);
    return id;
  }

  // Records a directed edge; missing endpoints and self-loops are dropped.
  function addEdge(a, b, type) {
    if (!a || !b || a === b) return;
    edges.push({ from: a, to: b, type });
  }

  // flows (navigation): consecutive steps of a flow are linked.
  for (const flow of flows) {
    if (!Array.isArray(flow)) continue;
    const steps = flow.map((step) => addNode(step, "flow")).filter(Boolean);
    for (let i = 0; i < steps.length - 1; i++) {
      addEdge(steps[i], steps[i + 1], "flow");
    }
  }

  // selectors (links/forms/UI/sections)
  for (const page of Object.keys(selectors)) {
    const pageId = addNode(page, "page");
    const entry = selectors[page] || {}; // tolerate null entries
    const pagePath = shortPath(page);

    for (const link of entry.links || []) {
      addEdge(pageId, addNode(link.to, "link"), "link");
    }
    for (const form of entry.forms || []) {
      addEdge(pageId, addNode(form.action, "form"), "form");
    }
    for (const ui of entry.ui || []) {
      addEdge(pageId, addNode(`${pagePath}#ui:${ui.friendly}`, "ui"), "ui");
    }
    for (const section of entry.sections || []) {
      addEdge(
        pageId,
        addNode(`${pagePath}#section:${section.friendly}`, "section"),
        "section"
      );
    }
  }

  const nodeIds = [...nodes.keys()];
  const nodeCount = nodeIds.length;

  // Adjacency list; every edge endpoint is a registered node id.
  const adjacency = new Map(nodeIds.map((id) => [id, []]));
  for (const edge of edges) adjacency.get(edge.from).push(edge.to);

  // Dead ends: nodes without any outgoing edge.
  const deadEnds = nodeIds.filter((id) => adjacency.get(id).length === 0);

  const rank = computePageRank(nodeIds, edges);
  const pageRank = {};
  for (let i = 0; i < nodeCount; i++) {
    pageRank[nodeIds[i]] = rank[i];
  }
  // Score found at the 10% position of the descending ranking (0 if absent).
  const rankThreshold =
    [...rank].sort((a, b) => b - a)[Math.floor(nodeCount * 0.1)] || 0;

  const cycles = findCycles(nodeIds, adjacency, MAX_CYCLES);

  const sitemap = [...nodes.values()]
    .map((n) => ({
      url: n.fullUrl,
      cluster: n.cluster,
      types: [...n.types],
    }))
    .sort((a, b) => a.url.localeCompare(b.url));

  // Broken: HTTP status >= 400, status 0, or flagged as soft 404.
  const brokenLinks = Object.entries(status)
    .filter(([, s]) =>
      Boolean(s) &&
      (s.status >= 400 || s.status === 0 || s.soft404 === true)
    )
    .map(([url, s]) => ({
      url: normalizeUrl(url),
      status: s.status,
      finalUrl: normalizeUrl(s.finalUrl),
    }));

  return {
    nodes,
    edges,
    deadEnds,
    pageRank,
    cycles,
    rankThreshold,
    sitemap,
    brokenLinks,
  };
}
// graph-utils.js
//@7
/*
Consumes : flows.json
: selectors.json
: status.json
Produces : graph object consumed by graph-data.js
console.log("Loading: graph-utils.js");
import fs from "fs";
export function buildGraph() {
const flows = JSON.parse(fs.readFileSync("./flows.json", "utf8")).flows || [];
const selectors = JSON.parse(fs.readFileSync("./selectors.json", "utf8"));
let status = {};
try {
status = JSON.parse(fs.readFileSync("./status.json", "utf8"));
} catch {}
const nodes = new Map();
const edges = [];
const normalize = url =>
String(url || "")
.trim()
.replace(/\/+$/, "")
.toLowerCase();
const lastSegment = url => {
const parts = url.split("/");
return parts[parts.length - 1] || url;
};
const clusterName = url => {
const parts = url.split("/");
return parts.length > 1 ? parts[1] : "root";
};
function addNode(fullUrl, type = null) {
const id = normalize(fullUrl);
if (!id) return null;
if (!nodes.has(id)) {
nodes.set(id, {
fullUrl,
label: lastSegment(id),
cluster: clusterName(id),
types: new Set()
});
}
if (type) nodes.get(id).types.add(type);
return id;
}
function addEdge(a, b, type) {
if (!a || !b || a === b) return;
edges.push({ from: a, to: b, type });
}
// flows (navigation)
for (const flow of flows) {
const steps = flow.map(s => addNode(s, "flow")).filter(Boolean);
for (let i = 0; i < steps.length - 1; i++) {
addEdge(steps[i], steps[i + 1], "flow");
}
}
// selectors (links/forms/UI/sections)
for (const page of Object.keys(selectors)) {
const pageId = addNode(page, "page");
const entry = selectors[page];
for (const link of entry.links || []) {
const to = addNode(link.to, "link");
addEdge(pageId, to, "link");
}
for (const form of entry.forms || []) {
const to = addNode(form.action, "form");
addEdge(pageId, to, "form");
}
for (const ui of entry.ui || []) {
const uiId = addNode(`${page}#ui:${ui.id}`, "ui");
addEdge(pageId, uiId, "ui");
}
for (const section of entry.sections || []) {
const secId = addNode(`${page}#section:${section.id}`, "section");
addEdge(pageId, secId, "section");
}
}
const outgoing = new Map();
for (const e of edges) {
outgoing.set(e.from, (outgoing.get(e.from) || 0) + 1);
}
const deadEnds = [...nodes.keys()].filter(n => !outgoing.has(n));
const nodeIds = [...nodes.keys()];
const N = nodeIds.length;
const index = new Map(nodeIds.map((id, i) => [id, i]));
const incoming = Array.from({ length: N }, () => []);
const outdeg = new Array(N).fill(0);
for (const e of edges) {
const a = index.get(e.from);
const b = index.get(e.to);
if (a == null || b == null) continue;
outdeg[a]++;
incoming[b].push(a);
}
const d = 0.85;
let rank = new Array(N).fill(1 / N);
for (let iter = 0; iter < 20; iter++) {
const next = new Array(N).fill((1 - d) / N);
for (let i = 0; i < N; i++) {
for (const j of incoming[i]) {
if (outdeg[j] > 0) next[i] += d * (rank[j] / outdeg[j]);
}
}
rank = next;
}
const pageRank = {};
for (let i = 0; i < N; i++) {
pageRank[nodeIds[i]] = rank[i];
}
const rankThreshold =
[...rank].sort((a, b) => b - a)[Math.floor(N * 0.1)] || 0;
const adj = new Map();
for (const id of nodeIds) adj.set(id, []);
for (const e of edges) adj.get(e.from).push(e.to);
const visited = new Set();
const stack = new Set();
const cycles = [];
function dfs(node, path) {
if (stack.has(node)) {
const idx = path.indexOf(node);
if (idx !== -1) cycles.push(path.slice(idx));
return;
}
if (visited.has(node)) return;
visited.add(node);
stack.add(node);
path.push(node);
for (const next of adj.get(node)) {
dfs(next, path);
if (cycles.length > 50) break;
}
path.pop();
stack.delete(node);
}
for (const id of nodeIds) {
if (!visited.has(id)) dfs(id, []);
if (cycles.length > 50) break;
}
const sitemap = [...nodes.values()]
.map(n => ({
url: n.fullUrl,
cluster: n.cluster,
types: [...n.types]
}))
.sort((a, b) => a.url.localeCompare(b.url));
const brokenLinks = Object.entries(status)
.filter(([url, s]) =>
s.status >= 400 ||
s.status === 0 ||
s.soft404 === true
)
.map(([url, s]) => ({
url: normalize(url),
status: s.status,
finalUrl: normalize(s.finalUrl)
}));
return {
nodes,
edges,
deadEnds,
pageRank,
cycles,
rankThreshold,
sitemap,
brokenLinks
};
}
*/