Files
crawler_js/crawler-flow.js
2026-03-30 01:32:43 +01:00

477 lines
12 KiB
JavaScript

// crawler-flow.js
//@2
/*
Writes to : flows.json
: status.json
Consumed by : graph-utils.js
Short friendly labels enabled
*/
console.log("Loading: crawler-flow.js");
import fs from "fs";
import { chromium } from "playwright";
import { normalizeUrl } from "./utils/normalizeUrl.js";
import { UrlPatternMatcher } from "./utils/urlPatternMatcher.js";
// Crawl configuration: entry URL, recursion limit, login rules, and
// include/exclude URL patterns for the scope matcher.
const config = JSON.parse(fs.readFileSync("./login-config.json", "utf8"));
const {
startURL,
maxDepth,
loginConfig,
includePatterns = [],
excludePatterns = [],
matcherOptions = {}
} = config;
// Per-URL links/forms — presumably produced by an earlier crawler stage; TODO confirm.
const selectors = JSON.parse(fs.readFileSync("./selectors.json", "utf8"));
// Shared crawl state: URLs already processed, recorded navigation paths
// (written to flows.json), and per-URL status info (written to status.json).
const visited = new Set();
const flows = [];
const statusMap = {};
// Decides which URLs are in scope for the crawl.
const matcher = new UrlPatternMatcher(includePatterns, excludePatterns, matcherOptions);
// Keywords whose presence marks an element as a session-ending control.
const LOGOUT_KEYWORDS = ["logout", "signout", "logoff", "sign-out", "log-out"];

/**
 * Report whether a clickable-element descriptor looks like a logout
 * control, by scanning its text, href, and onclick for logout keywords.
 * @param {{text?: string, href?: string, onclick?: string}} info
 * @returns {boolean} true if the element should never be clicked
 */
function isLogoutElement(info) {
  const haystacks = [info.text, info.href, info.onclick].map(value =>
    (value || "").toLowerCase()
  );
  return haystacks.some(h => LOGOUT_KEYWORDS.some(keyword => h.includes(keyword)));
}
/**
 * Build a short slug-style label for a UI element, used as a node name in
 * the flow graph. Preference order: visible text, data-tab/modal/accordion
 * hints, ARIA role, and finally a "<tag>-<index>" fallback.
 * @param {string} text element inner text
 * @param {string|null} role ARIA role attribute
 * @param {object} dataset element data-* attributes
 * @param {string} tag lowercase tag name
 * @param {number} index element position in the clickable list
 * @returns {string} a non-empty slug label
 */
function friendlyLabel(text, role, dataset, tag, index) {
  const slug = value =>
    String(value || "")
      .trim()
      .replace(/\s+/g, "-")
      .replace(/[^a-zA-Z0-9-_]/g, "")
      .toLowerCase();

  const textSlug = slug(text);
  if (text && textSlug) return textSlug;

  const hints = [
    ["tab", dataset?.tab],
    ["modal", dataset?.modal],
    ["accordion", dataset?.accordion]
  ];
  for (const [prefix, value] of hints) {
    if (value) return `${prefix}-${slug(value)}`;
  }

  const roleSlug = slug(role);
  if (role && roleSlug) return roleSlug;

  return `${tag}-${index}`;
}
/**
 * Collect descriptors for every potentially clickable element on the page:
 * text, href/onclick attributes, tag, ARIA role, data-* attributes, a
 * friendly label, and an absolute XPath so the element can be re-located
 * and clicked later by discoverByClicking().
 *
 * Fix: the XPath is now built from `localName` (lowercase) instead of
 * `tagName` (uppercase for HTML elements). XPath node tests are
 * case-sensitive and match the element's lowercase local name in HTML
 * documents, so XPaths built from uppercase tagName resolve to nothing
 * when evaluated later — silently disabling click-discovery.
 *
 * @param {import('playwright').Page} page
 * @returns {Promise<Array<object>>} one descriptor per matched element
 */
async function getClickableElements(page) {
  const clickables = await page.$$eval(
    `
button,
[onclick],
a[onclick]:not([href]),
img[onclick],
img[role='button'],
img[tabindex],
div[onclick],
span[onclick],
[role='button'],
[data-toggle],
[data-tab],
[data-accordion]
`,
    els =>
      els.map((el, index) => ({
        text: el.innerText || "",
        href: el.getAttribute("href"),
        onclick: el.getAttribute("onclick"),
        tag: el.tagName.toLowerCase(),
        role: el.getAttribute("role"),
        dataset: { ...el.dataset },
        index,
        friendly: null, // filled in below; friendlyLabel can't run inside $$eval
        xpath: (() => {
          // Absolute XPath of the form /html[1]/body[1]/div[2]/...
          let path = "";
          let current = el;
          while (current && current.nodeType === 1) {
            // 1-based position among same-tag preceding siblings.
            let idx = 1;
            let sibling = current.previousElementSibling;
            while (sibling) {
              if (sibling.tagName === current.tagName) idx++;
              sibling = sibling.previousElementSibling;
            }
            // localName is lowercase for HTML elements and is what XPath
            // node tests actually match against.
            path = `/${current.localName}[${idx}]` + path;
            current = current.parentElement;
          }
          return path;
        })()
      }))
  );
  // Labels are computed in Node (not in the browser) so friendlyLabel
  // doesn't need to be serialized into the page context.
  for (const el of clickables)
    el.friendly = friendlyLabel(el.text, el.role, el.dataset, el.tag, el.index);
  return clickables;
}
/**
 * Scroll the page from top to bottom in a handful of steps so lazy-loaded
 * content and on-scroll UI has a chance to appear before elements are
 * enumerated.
 * @param {import('playwright').Page} page
 */
async function scrollToReveal(page) {
  await page.evaluate(async () => {
    const height =
      document.body.scrollHeight || document.documentElement.scrollHeight || 0;
    // Roughly five steps, but never smaller than 200px per step.
    const stride = Math.max(200, Math.floor(height / 5));
    let offset = 0;
    while (offset < height) {
      window.scrollTo(0, offset);
      // Brief pause so scroll-triggered loading can fire.
      await new Promise(resolve => setTimeout(resolve, 150));
      offset += stride;
    }
    window.scrollTo(0, height);
  });
}
/**
 * Click each interactive element on the page and crawl any anchors that
 * appear afterwards (tabs, accordions, modals that inject new links).
 * Logout-looking controls are skipped so the session survives. Failures
 * on individual elements are intentionally ignored (best-effort discovery).
 * @param {import('playwright').Page} page
 * @param {string} url current page URL (unused here, kept for symmetry)
 * @param {number} depth current crawl depth
 * @param {string[]} path navigation path leading to this page
 */
async function discoverByClicking(page, url, depth, path) {
  await scrollToReveal(page);
  for (const info of await getClickableElements(page)) {
    // Never click controls that would end the authenticated session.
    if (isLogoutElement(info)) continue;
    try {
      // Snapshot the anchors present before the click...
      const known = new Set(
        await page.$$eval("a[href]", anchors => anchors.map(a => a.href))
      );
      // ...click the element by re-resolving its recorded XPath in-page...
      await page.evaluate(xp => {
        const node = document.evaluate(
          xp,
          document,
          null,
          XPathResult.FIRST_ORDERED_NODE_TYPE,
          null
        ).singleNodeValue;
        if (node) node.click();
      }, info.xpath);
      await page.waitForTimeout(400);
      // ...and crawl any anchors that were not there before.
      const current = await page.$$eval("a[href]", anchors =>
        anchors.map(a => a.href)
      );
      for (const href of current) {
        if (known.has(href)) continue;
        const normalized = normalizeUrl(href);
        if (normalized) await crawl(page, normalized, depth + 1, path);
      }
    } catch {
      // Best-effort: a failed click or a mid-loop navigation is not fatal.
    }
  }
}
/**
 * If the current URL matches a configured login rule, fill that rule's
 * form fields, submit, and wait for the post-login navigation to settle.
 * Multiple matching rules are applied in configuration order.
 * @param {import('playwright').Page} page
 * @param {string} url URL of the page just loaded
 */
async function performLoginIfNeeded(page, url) {
  for (const rule of loginConfig.logins) {
    if (!url.includes(rule.match)) continue;
    // Fill every configured field by its input name.
    for (const [name, value] of Object.entries(rule.fields)) {
      const sel = `input[name="${name}"]`;
      await page.waitForSelector(sel);
      await page.fill(sel, value);
    }
    // Submit via the configured control's name attribute.
    const submitSel = `[name="${rule.submit}"], button[name="${rule.submit}"]`;
    await page.waitForSelector(submitSel);
    await page.click(submitSel);
    await page.waitForLoadState("networkidle");
  }
}
/**
 * Depth-first crawl of `rawUrl`: record its HTTP status, log in if a rule
 * matches, discover click-revealed links, then recurse into on-page links
 * and any selectors.json links/forms for this URL.
 *
 * Fixes: (1) bail out when normalizeUrl rejects the URL instead of passing
 * a falsy value into the matcher/visited set; (2) a failed page.goto (DNS
 * error, timeout, ...) is recorded in statusMap instead of aborting the
 * whole crawl.
 *
 * @param {import('playwright').Page} page shared browser tab
 * @param {string} rawUrl candidate URL, normalized before use
 * @param {number} depth current recursion depth (capped by maxDepth)
 * @param {string[]} path URLs visited on the way here (becomes a flow)
 */
async function crawl(page, rawUrl, depth, path) {
  const url = normalizeUrl(rawUrl);
  if (!url) return; // unparsable / out-of-scope per normalizeUrl
  if (depth > maxDepth) return;
  if (!matcher.allow(url)) return;
  if (visited.has(url)) return;
  visited.add(url);

  let response = null;
  try {
    response = await page.goto(url, { waitUntil: "networkidle" });
  } catch (err) {
    // One unreachable URL should not kill the crawl; record and move on.
    statusMap[url] = {
      status: 0,
      finalUrl: url,
      soft404: false,
      error: String(err?.message ?? err)
    };
    return;
  }
  const status = response?.status() || 0;
  const finalUrl = response?.url() || url;
  const title = await page.title();
  // Heuristic "soft 404": server answered but the page titles itself 404.
  const soft404 = title.includes("404");
  statusMap[url] = { status, finalUrl, soft404 };

  await performLoginIfNeeded(page, url);

  const newPath = [...path, url];
  flows.push(newPath);

  await discoverByClicking(page, url, depth, newPath);

  // Follow every absolute link currently present on the page.
  const links = await page.$$eval("a[href]", as =>
    as.map(a => a.href).filter(h => h.startsWith("http"))
  );
  const normalizedLinks = links.map(h => normalizeUrl(h)).filter(Boolean);
  for (const link of normalizedLinks) {
    await crawl(page, link, depth + 1, newPath);
  }

  // Also follow links/forms recorded for this URL in selectors.json, so
  // statically-known targets are reached even if not visible right now.
  const entry = selectors[url];
  if (entry) {
    for (const link of entry.links || []) {
      const target = normalizeUrl(link.to);
      if (target) await crawl(page, target, depth + 1, newPath);
    }
    for (const form of entry.forms || []) {
      const target = normalizeUrl(form.action);
      if (target) await crawl(page, target, depth + 1, newPath);
    }
  }
}
// Entry point: crawl from startURL, then persist the discovered flows and
// per-URL status info for graph-utils.js.
//
// Fixes: try/finally guarantees the browser is closed and whatever was
// gathered is still written out even if the crawl throws part-way; the
// IIFE's rejection is handled instead of left as an unhandled promise.
(async () => {
  const browser = await chromium.launch({ headless: false });
  try {
    const page = await browser.newPage();
    await crawl(page, startURL, 0, []);
  } finally {
    fs.writeFileSync("flows.json", JSON.stringify({ flows }, null, 2));
    fs.writeFileSync("status.json", JSON.stringify(statusMap, null, 2));
    await browser.close();
  }
})().catch(err => {
  console.error("crawler-flow failed:", err);
  process.exitCode = 1;
});
// crawler-flow.js
//@2
/*
Writes to : flows.json
: status.json
Consumed by : graph-utils.js
console.log("Loading: crawler-flow.js");
import fs from "fs";
import { chromium } from "playwright";
import { normalizeUrl } from "./utils/normalizeUrl.js";
import { UrlPatternMatcher } from "./utils/urlPatternMatcher.js";
const config = JSON.parse(fs.readFileSync("./login-config.json", "utf8"));
const {
startURL,
maxDepth,
loginConfig,
includePatterns = [],
excludePatterns = [],
matcherOptions = {}
} = config;
const selectors = JSON.parse(fs.readFileSync("./selectors.json", "utf8"));
const visited = new Set();
const flows = [];
const statusMap = {};
const matcher = new UrlPatternMatcher(includePatterns, excludePatterns, matcherOptions);
const LOGOUT_KEYWORDS = ["logout", "signout", "logoff", "sign-out", "log-out"];
function isLogoutElement(info) {
const text = (info.text || "").toLowerCase();
const href = (info.href || "").toLowerCase();
const onclick = (info.onclick || "").toLowerCase();
return LOGOUT_KEYWORDS.some(k =>
text.includes(k) || href.includes(k) || onclick.includes(k)
);
}
async function getClickableElements(page) {
return await page.$$eval(
`
button,
[onclick],
a[onclick]:not([href]),
img[onclick],
img[role='button'],
img[tabindex],
div[onclick],
span[onclick],
[role='button'],
[data-toggle],
[data-tab],
[data-accordion]
`,
els =>
els.map(el => ({
text: el.innerText || "",
href: el.getAttribute("href"),
onclick: el.getAttribute("onclick"),
xpath: (() => {
let path = "";
let current = el;
while (current && current.nodeType === 1) {
let index = 1;
let sibling = current.previousElementSibling;
while (sibling) {
if (sibling.tagName === current.tagName) index++;
sibling = sibling.previousElementSibling;
}
path = `/${current.tagName}[${index}]` + path;
current = current.parentElement;
}
return path;
})()
}))
);
}
async function scrollToReveal(page) {
await page.evaluate(async () => {
const total =
document.body.scrollHeight || document.documentElement.scrollHeight || 0;
const step = Math.max(200, Math.floor(total / 5));
for (let y = 0; y < total; y += step) {
window.scrollTo(0, y);
await new Promise(r => setTimeout(r, 150));
}
window.scrollTo(0, total);
});
}
async function discoverByClicking(page, url, depth, path) {
await scrollToReveal(page);
const clickables = await getClickableElements(page);
for (const info of clickables) {
if (isLogoutElement(info)) continue;
try {
const beforeLinks = new Set(
await page.$$eval("a[href]", as => as.map(a => a.href))
);
await page.evaluate(xpath => {
const getNode = xp => {
const result = document.evaluate(
xp,
document,
null,
XPathResult.FIRST_ORDERED_NODE_TYPE,
null
);
return result.singleNodeValue;
};
const el = getNode(xpath);
if (el) el.click();
}, info.xpath);
await page.waitForTimeout(400);
const afterLinks = await page.$$eval("a[href]", as =>
as.map(a => a.href)
);
for (const link of afterLinks) {
if (!beforeLinks.has(link)) {
const normalized = normalizeUrl(link);
if (normalized) {
await crawl(page, normalized, depth + 1, path);
}
}
}
} catch {}
}
}
async function performLoginIfNeeded(page, url) {
for (const rule of loginConfig.logins) {
if (url.includes(rule.match)) {
for (const [name, value] of Object.entries(rule.fields)) {
const sel = `input[name="${name}"]`;
await page.waitForSelector(sel);
await page.fill(sel, value);
}
const submitSel = `[name="${rule.submit}"], button[name="${rule.submit}"]`;
await page.waitForSelector(submitSel);
await page.click(submitSel);
await page.waitForLoadState("networkidle");
}
}
}
async function crawl(page, rawUrl, depth, path) {
const url = normalizeUrl(rawUrl);
if (depth > maxDepth) return;
if (!matcher.allow(url)) return;
if (visited.has(url)) return;
visited.add(url);
const response = await page.goto(url, { waitUntil: "networkidle" });
const status = response?.status() || 0;
const finalUrl = response?.url() || url;
const title = await page.title();
const soft404 = title.includes("404");
statusMap[url] = { status, finalUrl, soft404 };
await performLoginIfNeeded(page, url);
const newPath = [...path, url];
flows.push(newPath);
await discoverByClicking(page, url, depth, newPath);
const links = await page.$$eval("a[href]", as =>
as.map(a => a.href).filter(h => h.startsWith("http"))
);
const normalizedLinks = links
.map(h => normalizeUrl(h))
.filter(h => h);
for (const link of normalizedLinks) {
await crawl(page, link, depth + 1, newPath);
}
if (selectors[url]) {
const entry = selectors[url];
for (const link of entry.links || []) {
const target = normalizeUrl(link.to);
if (target) await crawl(page, target, depth + 1, newPath);
}
for (const form of entry.forms || []) {
const target = normalizeUrl(form.action);
if (target) await crawl(page, target, depth + 1, newPath);
}
}
}
(async () => {
const browser = await chromium.launch({ headless: false });
const page = await browser.newPage();
await crawl(page, startURL, 0, []);
fs.writeFileSync("flows.json", JSON.stringify({ flows }, null, 2));
fs.writeFileSync("status.json", JSON.stringify(statusMap, null, 2));
await browser.close();
})();
*/