// crawler-flow.js — 477 lines, 12 KiB, JavaScript (scrape metadata)
// crawler-flow.js
//@2
/*
Writes to   : flows.json
            : status.json
Consumed by : graph-utils.js
Short friendly labels enabled
*/
|
|
|
|
console.log("Loading: crawler-flow.js");

import fs from "fs";
import { chromium } from "playwright";
import { normalizeUrl } from "./utils/normalizeUrl.js";
import { UrlPatternMatcher } from "./utils/urlPatternMatcher.js";

// Crawl configuration: start URL, depth limit, login rules, and URL
// include/exclude patterns — all read once from login-config.json at startup.
const config = JSON.parse(fs.readFileSync("./login-config.json", "utf8"));
const {
  startURL,
  maxDepth,
  loginConfig,
  includePatterns = [],
  excludePatterns = [],
  matcherOptions = {}
} = config;

// Extra per-URL navigation hints consumed in crawl().
// Schema (inferred from usage below — confirm against the file):
//   { "<url>": { links: [{ to }], forms: [{ action }] } }
const selectors = JSON.parse(fs.readFileSync("./selectors.json", "utf8"));

// Shared crawl state:
const visited = new Set();  // normalized URLs already crawled (dedupe)
const flows = [];           // navigation paths (arrays of URLs) -> flows.json
const statusMap = {};       // per-URL HTTP status / soft-404 info -> status.json

// Decides which URLs the crawler is allowed to visit.
const matcher = new UrlPatternMatcher(includePatterns, excludePatterns, matcherOptions);
|
|
|
|
// Substrings identifying logout/sign-out controls; clicking one of these
// would end the authenticated session, so the crawler must skip them.
const LOGOUT_KEYWORDS = ["logout", "signout", "logoff", "sign-out", "log-out"];

/**
 * True when a clickable-element descriptor looks like a logout control.
 * Checks the element's visible text, href, and inline onclick handler
 * (case-insensitively) against LOGOUT_KEYWORDS.
 *
 * @param {{text?: string, href?: string, onclick?: string}} info
 * @returns {boolean}
 */
function isLogoutElement(info) {
  const haystacks = [info.text, info.href, info.onclick].map(field =>
    String(field || "").toLowerCase()
  );
  return LOGOUT_KEYWORDS.some(keyword =>
    haystacks.some(field => field.includes(keyword))
  );
}
|
|
|
|
// Generate short friendly labels for UI and section nodes
|
|
/**
 * Builds a short, URL-safe label for a clickable element, preferring (in
 * order): visible text, data-tab / data-modal / data-accordion attributes,
 * ARIA role, and finally a tag-index fallback so every element gets a name.
 *
 * @param {string} text - Element's visible inner text.
 * @param {string|null} role - ARIA role attribute, if any.
 * @param {Object} dataset - Element's data-* attributes.
 * @param {string} tag - Lowercase tag name.
 * @param {number} index - Position of the element in the scanned list.
 * @returns {string}
 */
function friendlyLabel(text, role, dataset, tag, index) {
  // Slugify: trim, collapse whitespace runs to "-", drop anything that is
  // not alphanumeric / "-" / "_", then lowercase.
  const slug = value =>
    String(value || "")
      .trim()
      .replace(/\s+/g, "-")
      .replace(/[^a-zA-Z0-9-_]/g, "")
      .toLowerCase();

  const fromText = text ? slug(text) : "";
  if (fromText) return fromText;

  const prefixedSources = [
    ["tab", dataset?.tab],
    ["modal", dataset?.modal],
    ["accordion", dataset?.accordion]
  ];
  for (const [prefix, value] of prefixedSources) {
    if (value) return `${prefix}-${slug(value)}`;
  }

  const fromRole = role ? slug(role) : "";
  if (fromRole) return fromRole;

  return `${tag}-${index}`;
}
|
|
|
|
/**
 * Collects descriptors for every clickable element on the page: buttons,
 * elements with inline onclick handlers, role='button' elements, and
 * data-toggle / data-tab / data-accordion triggers.
 *
 * Each descriptor carries the element's text/href/onclick (for logout
 * filtering), a friendly label, and an absolute XPath so the element can be
 * re-located and clicked later, even after the DOM has been re-queried.
 *
 * @param {import('playwright').Page} page
 * @returns {Promise<Array<Object>>} clickable-element descriptors
 */
async function getClickableElements(page) {
  const clickables = await page.$$eval(
    `
      button,
      [onclick],
      a[onclick]:not([href]),
      img[onclick],
      img[role='button'],
      img[tabindex],
      div[onclick],
      span[onclick],
      [role='button'],
      [data-toggle],
      [data-tab],
      [data-accordion]
    `,
    els =>
      els.map((el, index) => ({
        text: el.innerText || "",
        href: el.getAttribute("href"),
        onclick: el.getAttribute("onclick"),
        tag: el.tagName.toLowerCase(),
        role: el.getAttribute("role"),
        dataset: { ...el.dataset },
        index,
        friendly: null, // filled in below; friendlyLabel can't run inside $$eval
        xpath: (() => {
          // Build an absolute, 1-indexed XPath for this element.
          // BUG FIX: tag names must be lowercased — XPath name tests are
          // case-sensitive and HTML elements have lowercase local names, so
          // paths built from the uppercase tagName (e.g. "/DIV[1]") never
          // matched in document.evaluate(), and no discovery click fired.
          let path = "";
          let current = el;
          while (current && current.nodeType === 1) {
            let idx = 1;
            let sibling = current.previousElementSibling;
            while (sibling) {
              if (sibling.tagName === current.tagName) idx++;
              sibling = sibling.previousElementSibling;
            }
            path = `/${current.tagName.toLowerCase()}[${idx}]` + path;
            current = current.parentElement;
          }
          return path;
        })()
      }))
  );

  // Label each element in Node context (outside the browser sandbox).
  for (const el of clickables)
    el.friendly = friendlyLabel(el.text, el.role, el.dataset, el.tag, el.index);

  return clickables;
}
|
|
|
|
/**
 * Scrolls the page from top to bottom in roughly five increments (minimum
 * 200px each, 150ms apart) so lazy-loaded content and scroll-triggered
 * elements have a chance to render before the DOM is scanned.
 *
 * @param {import('playwright').Page} page
 */
async function scrollToReveal(page) {
  await page.evaluate(async () => {
    const pageHeight =
      document.body.scrollHeight || document.documentElement.scrollHeight || 0;
    const increment = Math.max(200, Math.floor(pageHeight / 5));
    let offset = 0;
    while (offset < pageHeight) {
      window.scrollTo(0, offset);
      await new Promise(resolve => setTimeout(resolve, 150));
      offset += increment;
    }
    // Land exactly at the bottom regardless of step rounding.
    window.scrollTo(0, pageHeight);
  });
}
|
|
|
|
/**
 * Discovers hidden navigation by clicking every clickable element on the
 * page and crawling any <a href> links that appear as a result (tabs,
 * accordions, modals, JS-injected menus).
 *
 * Logout-looking controls are skipped so the session stays authenticated.
 *
 * @param {import('playwright').Page} page
 * @param {string} url - Normalized URL of the page being explored.
 * @param {number} depth - Current crawl depth.
 * @param {string[]} path - Navigation path that led to this page.
 */
async function discoverByClicking(page, url, depth, path) {
  await scrollToReveal(page);

  const clickables = await getClickableElements(page);

  for (const info of clickables) {
    if (isLogoutElement(info)) continue;

    try {
      // BUG FIX: recursive crawl() calls below navigate the page away.
      // Make sure we are back on this page before clicking the next
      // element, otherwise the stored XPaths target the wrong document.
      if (page.url() !== url) {
        await page.goto(url, { waitUntil: "networkidle" });
      }

      const beforeLinks = new Set(
        await page.$$eval("a[href]", as => as.map(a => a.href))
      );

      // Click by XPath rather than via a stored element handle: handles go
      // stale after navigation; re-resolving by path survives re-renders.
      await page.evaluate(xpath => {
        const result = document.evaluate(
          xpath,
          document,
          null,
          XPathResult.FIRST_ORDERED_NODE_TYPE,
          null
        );
        const el = result.singleNodeValue;
        if (el) el.click();
      }, info.xpath);

      // Give click handlers a moment to inject new content.
      await page.waitForTimeout(400);

      const afterLinks = await page.$$eval("a[href]", as =>
        as.map(a => a.href)
      );

      // Any link absent before the click is newly revealed navigation.
      for (const link of afterLinks) {
        if (!beforeLinks.has(link)) {
          const normalized = normalizeUrl(link);
          if (normalized) {
            await crawl(page, normalized, depth + 1, path);
          }
        }
      }
    } catch (err) {
      // Best-effort: one failing element (detached node, navigation race)
      // shouldn't abort discovery — but don't swallow the error silently.
      console.warn(`discoverByClicking: skipped element on ${url}: ${err.message}`);
    }
  }
}
|
|
|
|
/**
 * Fills and submits a login form when the current URL matches one of the
 * configured login rules.
 *
 * Each rule in loginConfig.logins is expected to have:
 *   - match:  substring the URL must contain to trigger this rule
 *   - fields: { inputName: value } pairs to fill into input[name=...]
 *   - submit: the name attribute of the submit control to click
 *
 * @param {import('playwright').Page} page
 * @param {string} url - URL of the currently loaded page.
 */
async function performLoginIfNeeded(page, url) {
  // Robustness fix: tolerate a config with no login section at all
  // (previously this threw on loginConfig.logins being undefined).
  for (const rule of loginConfig?.logins ?? []) {
    if (!url.includes(rule.match)) continue;

    for (const [name, value] of Object.entries(rule.fields)) {
      const sel = `input[name="${name}"]`;
      await page.waitForSelector(sel);
      await page.fill(sel, value);
    }

    const submitSel = `[name="${rule.submit}"], button[name="${rule.submit}"]`;
    await page.waitForSelector(submitSel);
    await page.click(submitSel);

    // Wait for the post-login redirect / XHRs to settle before crawling on.
    await page.waitForLoadState("networkidle");
  }
}
|
|
|
|
/**
 * Recursively crawls `rawUrl` up to maxDepth:
 *   1. Normalizes and filters the URL (pattern matcher + visited set).
 *   2. Loads the page and records its HTTP status + soft-404 heuristic.
 *   3. Logs in if the URL matches a configured login rule.
 *   4. Appends the navigation path to `flows`.
 *   5. Discovers links by clicking, <a href> scanning, and selectors.json.
 *
 * @param {import('playwright').Page} page
 * @param {string} rawUrl - Candidate URL (not yet normalized).
 * @param {number} depth - Current recursion depth (0 = start URL).
 * @param {string[]} path - URLs visited to reach this page.
 */
async function crawl(page, rawUrl, depth, path) {
  const url = normalizeUrl(rawUrl);

  if (depth > maxDepth) return;
  if (!matcher.allow(url)) return;
  if (visited.has(url)) return;

  visited.add(url);

  // BUG FIX: page.goto() throws on network errors (DNS failure, refused
  // connection, timeout). Previously one bad link aborted the entire crawl;
  // now the failure is recorded in statusMap and the crawl continues.
  let response;
  try {
    response = await page.goto(url, { waitUntil: "networkidle" });
  } catch (err) {
    statusMap[url] = { status: 0, finalUrl: url, soft404: false, error: err.message };
    return;
  }

  const status = response?.status() || 0;
  const finalUrl = response?.url() || url;
  const title = await page.title();
  // Heuristic: many apps serve a styled "404" page with HTTP 200.
  const soft404 = title.includes("404");

  statusMap[url] = { status, finalUrl, soft404 };

  await performLoginIfNeeded(page, url);

  const newPath = [...path, url];
  flows.push(newPath);

  await discoverByClicking(page, url, depth, newPath);

  // discoverByClicking may have navigated away via recursive crawls; make
  // sure the <a href> scan below runs against THIS page, not the last one.
  if (page.url() !== url) {
    try {
      await page.goto(url, { waitUntil: "networkidle" });
    } catch {
      // Status already recorded above; scan whatever is currently loaded.
    }
  }

  // Standard <a href> discovery (absolute http(s) links only).
  const links = await page.$$eval("a[href]", as =>
    as.map(a => a.href).filter(h => h.startsWith("http"))
  );

  const normalizedLinks = links
    .map(h => normalizeUrl(h))
    .filter(h => h);

  for (const link of normalizedLinks) {
    await crawl(page, link, depth + 1, newPath);
  }

  // Extra edges supplied by selectors.json: explicit links and form actions.
  if (selectors[url]) {
    const entry = selectors[url];

    for (const link of entry.links || []) {
      const target = normalizeUrl(link.to);
      if (target) await crawl(page, target, depth + 1, newPath);
    }

    for (const form of entry.forms || []) {
      const target = normalizeUrl(form.action);
      if (target) await crawl(page, target, depth + 1, newPath);
    }
  }
}
|
|
|
|
// Entry point: launch a visible browser, crawl from startURL, and persist
// the collected flows and per-URL statuses.
(async () => {
  // headless: false — presumably so the crawl can be watched/debugged live;
  // NOTE(review): confirm this isn't meant to be headless in CI.
  const browser = await chromium.launch({ headless: false });
  const page = await browser.newPage();

  try {
    await crawl(page, startURL, 0, []);
  } catch (err) {
    // Still persist whatever was collected before the failure.
    console.error("Crawl aborted:", err);
    process.exitCode = 1;
  } finally {
    // BUG FIX: previously any error during the crawl left the browser
    // process running and produced no output files at all.
    fs.writeFileSync("flows.json", JSON.stringify({ flows }, null, 2));
    fs.writeFileSync("status.json", JSON.stringify(statusMap, null, 2));
    await browser.close();
  }
})();
|
|
|
|
|
|
// NOTE(review): removed a commented-out duplicate of an earlier revision of
// this file (scrape/paste artifact). The live implementation above is the
// single source of truth; recover old revisions from version control.
|