Files
crawler_js/selectors-crawler.js
2026-03-30 01:32:43 +01:00

559 lines
14 KiB
JavaScript

// selectors-crawler.js
//@3
/*
Writes to : selectors.json
Consumed by : graph-utils.js
: test-generator.js
Short friendly labels enabled
*/
console.log("Loading: selectors-crawler.js");
import fs from "fs";
import { chromium } from "playwright";
import { normalizeUrl } from "./utils/normalizeUrl.js";
import { UrlPatternMatcher } from "./utils/urlPatternMatcher.js";
// Crawler settings are read synchronously, once, at module load.
// The relative path is resolved against the process working directory.
const config = JSON.parse(fs.readFileSync("./login-config.json", "utf8"));
// startURL        - URL handed to crawl() at depth 0.
// maxDepth        - crawl() stops when its depth argument is greater than this value.
// loginConfig     - performLoginIfNeeded() iterates loginConfig.logins.
// includePatterns / excludePatterns / matcherOptions - forwarded to UrlPatternMatcher;
//                   each falls back to an empty value when absent from the config.
const {
startURL,
maxDepth,
loginConfig,
includePatterns = [],
excludePatterns = [],
matcherOptions = {}
} = config;
// Normalized URLs that crawl() has already started processing.
const visited = new Set();
// Crawl result keyed by normalized URL: { forms, links, ui, sections }; serialized to selectors.json.
const selectors = {};
// crawl() only visits URLs for which matcher.allow(url) is truthy.
const matcher = new UrlPatternMatcher(includePatterns, excludePatterns, matcherOptions);
// Substrings that mark an element as a "log out" control when found anywhere
// in its text, href or onclick attribute (compared case-insensitively).
const LOGOUT_KEYWORDS = ["logout", "signout", "logoff", "sign-out", "log-out"];
// The same words written separately ("Log out", "Sign  Out", "log_out", "sign-off").
// Word boundaries keep unrelated text such as "catalog outlet" from matching.
const LOGOUT_PHRASE_PATTERN = /\b(?:log|sign)[\s_-]+(?:out|off)\b/;

/**
 * Decide whether a clickable element would end the session, so the crawler
 * can skip it instead of clicking it.
 *
 * @param {{text?: string|null, href?: string|null, onclick?: string|null}} info
 *   Element descriptor; missing or null fields are treated as empty strings.
 * @returns {boolean} true when the text, href or onclick looks like a log-out action.
 */
function isLogoutElement(info) {
  const haystacks = [info?.text, info?.href, info?.onclick].map(value =>
    String(value ?? "").toLowerCase()
  );
  return haystacks.some(
    haystack =>
      LOGOUT_KEYWORDS.some(keyword => haystack.includes(keyword)) ||
      LOGOUT_PHRASE_PATTERN.test(haystack)
  );
}
/**
 * Generate a short, identifier-safe label for a crawled element.
 *
 * Candidates are tried in order: visible text, then the data-tab /
 * data-modal / data-accordion value (prefixed with its kind), then the ARIA
 * role, and finally `${tag}-${index}`.
 *
 * A candidate is only used when its cleaned form contains at least one letter
 * or digit. Without that check a button whose text is "-" would be labelled
 * "-" (while "+" already fell back to `${tag}-${index}`), and a data-tab value
 * made only of punctuation would produce the dangling label "tab-".
 *
 * @param {string} text - Visible text of the element.
 * @param {string|null} role - Value of the role attribute, if any.
 * @param {Object|undefined} dataset - Copy of the element's data-* attributes.
 * @param {string} tag - Lower-cased tag name, used for the fallback label.
 * @param {number|string} index - Identifier appended to the fallback label.
 * @returns {string} The chosen label.
 */
function friendlyLabel(text, role, dataset, tag, index) {
  // Trim, turn whitespace runs into "-", drop everything except
  // letters, digits, "-" and "_", then lower-case.
  const clean = s =>
    String(s || "")
      .trim()
      .replace(/\s+/g, "-")
      .replace(/[^a-zA-Z0-9-_]/g, "")
      .toLowerCase();

  // Cleaned value, or "" when nothing alphanumeric survived cleaning.
  const slug = s => {
    const cleaned = clean(s);
    return /[a-z0-9]/.test(cleaned) ? cleaned : "";
  };

  const textSlug = slug(text);
  if (textSlug) return textSlug;

  for (const kind of ["tab", "modal", "accordion"]) {
    const valueSlug = slug(dataset?.[kind]);
    if (valueSlug) return `${kind}-${valueSlug}`;
  }

  const roleSlug = slug(role);
  if (roleSlug) return roleSlug;

  return `${tag}-${index}`;
}
/**
 * Collect the interactive elements of the current page.
 *
 * Each descriptor holds the element's position in the match list (`id`), its
 * lower-cased tag, inner text, role / onclick / href attributes, a copy of its
 * data-* attributes, and a `friendly` label computed by friendlyLabel().
 *
 * @param {object} page - Page object exposing `$$eval(selector, fn)`.
 * @returns {Promise<Array<object>>} One descriptor per matched element.
 */
async function getUIElements(page) {
  // Runs inside the page: reduce each DOM node to plain serializable data.
  const describeNodes = nodes =>
    nodes.map((node, position) => {
      const attr = name => node.getAttribute(name);
      return {
        id: position,
        tag: node.tagName.toLowerCase(),
        text: node.innerText || "",
        role: attr("role") || null,
        onclick: attr("onclick"),
        href: attr("href"),
        dataset: { ...node.dataset }
      };
    });

  const found = await page.$$eval(
    `
button,
[onclick],
a[onclick]:not([href]),
img[onclick],
img[role='button'],
img[tabindex],
div[onclick],
span[onclick],
[role='button'],
[data-toggle],
[data-tab],
[data-accordion]
`,
    describeNodes
  );

  found.forEach(entry => {
    entry.friendly = friendlyLabel(
      entry.text,
      entry.role,
      entry.dataset,
      entry.tag,
      entry.id
    );
  });
  return found;
}
/**
 * Collect the structural regions (sections, dialogs, tab panels, modals, ...)
 * of the current page.
 *
 * Each descriptor holds the element's DOM id (or its position in the match
 * list when the id is empty), its lower-cased tag, the first 200 characters of
 * its inner text, its role attribute, a copy of its data-* attributes, and a
 * `friendly` label computed by friendlyLabel().
 *
 * @param {object} page - Page object exposing `$$eval(selector, fn)`.
 * @returns {Promise<Array<object>>} One descriptor per matched element.
 */
async function getSections(page) {
  // Runs inside the page: reduce each DOM node to plain serializable data.
  const describeNodes = nodes =>
    nodes.map((node, position) => {
      const fullText = node.innerText || "";
      return {
        id: node.id || position,
        tag: node.tagName.toLowerCase(),
        text: fullText.slice(0, 200),
        role: node.getAttribute("role") || null,
        dataset: { ...node.dataset }
      };
    });

  const found = await page.$$eval(
    `
section,
[role='dialog'],
[role='tabpanel'],
[role='tablist'],
.modal,
[data-modal],
[data-section],
[data-tab-panel],
[data-accordion-panel]
`,
    describeNodes
  );

  found.forEach(entry => {
    entry.friendly = friendlyLabel(
      entry.text,
      entry.role,
      entry.dataset,
      entry.tag,
      entry.id
    );
  });
  return found;
}
/**
 * Scroll the page from top to bottom in steps, pausing 150 ms after each step,
 * then jump to the full scroll height.
 *
 * The step is one fifth of the scroll height, but never less than 200 px.
 *
 * @param {object} page - Page object exposing `evaluate(fn)`.
 * @returns {Promise<void>}
 */
async function scrollToReveal(page) {
  // Executed in the page context, so it must not capture outer variables.
  const sweepDownPage = async () => {
    const pageHeight =
      document.body.scrollHeight || document.documentElement.scrollHeight || 0;
    const stride = Math.max(200, Math.floor(pageHeight / 5));
    const pause = () => new Promise(resume => setTimeout(resume, 150));

    let offset = 0;
    while (offset < pageHeight) {
      window.scrollTo(0, offset);
      await pause();
      offset += stride;
    }
    window.scrollTo(0, pageHeight);
  };

  await page.evaluate(sweepDownPage);
}
/**
 * Click every interactive element on the page and crawl any link that only
 * appears after the click (menus, tabs, accordions, ...).
 *
 * Elements recognised by isLogoutElement() are never clicked.
 *
 * The same `page` is shared with crawl(), so a click or a recursive crawl can
 * leave it on a different document. When the page URL has changed after an
 * iteration, the page is navigated back to `url` so the next XPath is resolved
 * against the document it was computed from. Failures are logged and do not
 * abort the crawl.
 *
 * @param {object} page - Playwright page currently showing `url`.
 * @param {string} url - URL of the page being explored; used to restore the page.
 * @param {number} depth - Depth of `url`; discovered links are crawled at depth + 1.
 * @returns {Promise<void>}
 */
async function discoverByClicking(page, url, depth) {
  await scrollToReveal(page);

  const clickables = await page.$$eval(
    `
button,
[onclick],
a[onclick]:not([href]),
img[onclick],
img[role='button'],
img[tabindex],
div[onclick],
span[onclick],
[role='button'],
[data-toggle],
[data-tab],
[data-accordion]
`,
    els =>
      els.map(el => ({
        text: el.innerText || "",
        href: el.getAttribute("href"),
        onclick: el.getAttribute("onclick"),
        // Absolute positional XPath (/TAG[n]/TAG[n]/...) used to find the
        // element again later, since DOM handles are not kept across calls.
        xpath: (() => {
          let path = "";
          let current = el;
          while (current && current.nodeType === 1) {
            let idx = 1;
            let sibling = current.previousElementSibling;
            while (sibling) {
              if (sibling.tagName === current.tagName) idx++;
              sibling = sibling.previousElementSibling;
            }
            path = `/${current.tagName}[${idx}]` + path;
            current = current.parentElement;
          }
          return path;
        })()
      }))
  );

  const collectLinks = () =>
    page.$$eval("a[href]", as => as.map(a => a.href));

  // URL the page reports while it shows the document the XPaths belong to.
  let baseUrl = page.url();

  for (const info of clickables) {
    if (isLogoutElement(info)) continue;

    try {
      const beforeLinks = new Set(await collectLinks());

      await page.evaluate(xpath => {
        const result = document.evaluate(
          xpath,
          document,
          null,
          XPathResult.FIRST_ORDERED_NODE_TYPE,
          null
        );
        const el = result.singleNodeValue;
        if (el) el.click();
      }, info.xpath);

      // Give click handlers time to render newly revealed content.
      await page.waitForTimeout(400);

      const afterLinks = await collectLinks();
      for (const link of afterLinks) {
        if (beforeLinks.has(link)) continue;
        const normalized = normalizeUrl(link);
        if (normalized) await crawl(page, normalized, depth + 1);
      }
    } catch (err) {
      console.warn(
        `discoverByClicking: clicking ${info.xpath} on ${url} failed: ${err?.message ?? err}`
      );
    }

    // The click or the recursive crawl moved the shared page: go back before
    // touching the next element.
    if (page.url() === baseUrl) continue;
    try {
      await page.goto(url, { waitUntil: "networkidle" });
      baseUrl = page.url();
    } catch (err) {
      console.warn(
        `discoverByClicking: could not return to ${url}: ${err?.message ?? err}`
      );
      // The remaining XPaths would be evaluated on an unrelated page.
      return;
    }
  }
}
/**
 * Fill in and submit the login form for every configured rule whose `match`
 * string occurs in `url`.
 *
 * For a matching rule, each entry of `rule.fields` is typed into
 * `input[name="<key>"]`, the control named `rule.submit` is clicked, and the
 * function waits for the network to become idle.
 *
 * A configuration without `loginConfig` / `loginConfig.logins`, or a rule
 * without `fields`, is treated as "nothing to do" rather than throwing.
 *
 * @param {object} page - Playwright page already showing `url`.
 * @param {string} url - URL of the page just loaded.
 * @returns {Promise<void>}
 */
async function performLoginIfNeeded(page, url) {
  const rules = loginConfig?.logins ?? [];
  for (const rule of rules) {
    if (!url.includes(rule.match)) continue;

    for (const [name, value] of Object.entries(rule.fields ?? {})) {
      const sel = `input[name="${name}"]`;
      await page.waitForSelector(sel);
      await page.fill(sel, value);
    }

    const submitSel = `[name="${rule.submit}"], button[name="${rule.submit}"]`;
    await page.waitForSelector(submitSel);
    await page.click(submitSel);
    await page.waitForLoadState("networkidle");
  }
}
/**
 * Visit one URL, record its forms, links, UI elements and sections in
 * `selectors`, then recurse into links revealed by clicking and into the
 * page's own links.
 *
 * The URL is skipped when it cannot be normalized, when `depth` exceeds
 * `maxDepth`, when the matcher rejects it, or when it was already visited.
 *
 * @param {object} page - Playwright page reused for every navigation.
 * @param {string} rawUrl - URL to visit; normalized before use.
 * @param {number} depth - Distance from the start URL.
 * @returns {Promise<void>}
 */
async function crawl(page, rawUrl, depth) {
  const url = normalizeUrl(rawUrl);
  // normalizeUrl() can yield a falsy value (callers filter on it); without
  // this guard such a value would be marked visited and passed to page.goto().
  if (!url) return;
  if (depth > maxDepth) return;
  if (!matcher.allow(url)) return;
  if (visited.has(url)) return;
  visited.add(url);

  await page.goto(url, { waitUntil: "networkidle" });
  await performLoginIfNeeded(page, url);

  const forms = await page.$$eval("form", forms =>
    forms.map(f => ({
      action: f.action,
      inputs: Array.from(f.querySelectorAll("input, textarea, select")).map(i => ({
        name: i.name,
        tag: i.tagName.toLowerCase(),
        type: i.type || null
      }))
    }))
  );

  // Absolute hrefs only; mailto:, javascript:, tel: and similar are dropped.
  const links = await page.$$eval("a[href]", as =>
    as.map(a => a.href).filter(h => h.startsWith("http"))
  );

  const ui = await getUIElements(page);
  const sections = await getSections(page);

  selectors[url] = {
    forms,
    links: links.map(href => ({ to: href })),
    ui,
    sections
  };

  // Links were captured above, so the page may be left elsewhere from here on.
  await discoverByClicking(page, url, depth);

  const normalizedLinks = links
    .map(h => normalizeUrl(h))
    .filter(h => h);
  for (const link of normalizedLinks) {
    await crawl(page, link, depth + 1);
  }
}
// Entry point: launch a visible Chromium window, crawl from startURL and write
// the collected data to selectors.json.
(async () => {
  const browser = await chromium.launch({ headless: false });
  try {
    const page = await browser.newPage();
    await crawl(page, startURL, 0);
    // Written only after a complete crawl, so a failed run does not replace an
    // existing selectors.json with partial data.
    fs.writeFileSync("selectors.json", JSON.stringify(selectors, null, 2));
  } finally {
    // Always release the browser, even when the crawl throws.
    await browser.close();
  }
})().catch(err => {
  console.error("selectors-crawler.js failed:", err);
  process.exitCode = 1;
});
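// selectors-crawler.js (earlier revision, without friendly labels)
// Superseded by the active code above; retained below as a commented-out reference copy.
//@3
/*
Writes to : selectors.json
Consumed by : graph-utils.js
: test-generator.js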
console.log("Loading: selectors-crawler.js");
import fs from "fs";
import { chromium } from "playwright";
import { normalizeUrl } from "./utils/normalizeUrl.js";
import { UrlPatternMatcher } from "./utils/urlPatternMatcher.js";
const config = JSON.parse(fs.readFileSync("./login-config.json", "utf8"));
const {
startURL,
maxDepth,
loginConfig,
includePatterns = [],
excludePatterns = [],
matcherOptions = {}
} = config;
const visited = new Set();
const selectors = {};
const matcher = new UrlPatternMatcher(includePatterns, excludePatterns, matcherOptions);
const LOGOUT_KEYWORDS = ["logout", "signout", "logoff", "sign-out", "log-out"];
function isLogoutElement(info) {
const text = (info.text || "").toLowerCase();
const href = (info.href || "").toLowerCase();
const onclick = (info.onclick || "").toLowerCase();
return LOGOUT_KEYWORDS.some(k =>
text.includes(k) || href.includes(k) || onclick.includes(k)
);
}
async function getUIElements(page) {
return await page.$$eval(
`
button,
[onclick],
a[onclick]:not([href]),
img[onclick],
img[role='button'],
img[tabindex],
div[onclick],
span[onclick],
[role='button'],
[data-toggle],
[data-tab],
[data-accordion]
`,
els =>
els.map((el, index) => ({
id: `ui-${index}`,
tag: el.tagName.toLowerCase(),
text: el.innerText || "",
role: el.getAttribute("role") || null,
onclick: el.getAttribute("onclick"),
href: el.getAttribute("href"),
dataset: { ...el.dataset }
}))
);
}
async function getSections(page) {
return await page.$$eval(
`
section,
[role='dialog'],
[role='tabpanel'],
[role='tablist'],
.modal,
[data-modal],
[data-section],
[data-tab-panel],
[data-accordion-panel]
`,
els =>
els.map((el, index) => ({
id: el.id || `section-${index}`,
tag: el.tagName.toLowerCase(),
text: (el.innerText || "").slice(0, 200),
role: el.getAttribute("role") || null,
dataset: { ...el.dataset }
}))
);
}
async function scrollToReveal(page) {
await page.evaluate(async () => {
const total =
document.body.scrollHeight || document.documentElement.scrollHeight || 0;
const step = Math.max(200, Math.floor(total / 5));
for (let y = 0; y < total; y += step) {
window.scrollTo(0, y);
await new Promise(r => setTimeout(r, 150));
}
window.scrollTo(0, total);
});
}
async function discoverByClicking(page, url, depth) {
await scrollToReveal(page);
const clickables = await page.$$eval(
`
button,
[onclick],
a[onclick]:not([href]),
img[onclick],
img[role='button'],
img[tabindex],
div[onclick],
span[onclick],
[role='button'],
[data-toggle],
[data-tab],
[data-accordion]
`,
els =>
els.map(el => ({
text: el.innerText || "",
href: el.getAttribute("href"),
onclick: el.getAttribute("onclick"),
xpath: (() => {
let path = "";
let current = el;
while (current && current.nodeType === 1) {
let index = 1;
let sibling = current.previousElementSibling;
while (sibling) {
if (sibling.tagName === current.tagName) index++;
sibling = sibling.previousElementSibling;
}
path = `/${current.tagName}[${index}]` + path;
current = current.parentElement;
}
return path;
})()
}))
);
for (const info of clickables) {
if (isLogoutElement(info)) continue;
try {
const beforeLinks = new Set(
await page.$$eval("a[href]", as => as.map(a => a.href))
);
await page.evaluate(xpath => {
const getNode = xp => {
const result = document.evaluate(
xp,
document,
null,
XPathResult.FIRST_ORDERED_NODE_TYPE,
null
);
return result.singleNodeValue;
};
const el = getNode(xpath);
if (el) el.click();
}, info.xpath);
await page.waitForTimeout(400);
const afterLinks = await page.$$eval("a[href]", as =>
as.map(a => a.href)
);
for (const link of afterLinks) {
if (!beforeLinks.has(link)) {
const normalized = normalizeUrl(link);
if (normalized) await crawl(page, normalized, depth + 1);
}
}
} catch {}
}
}
async function performLoginIfNeeded(page, url) {
for (const rule of loginConfig.logins) {
if (url.includes(rule.match)) {
for (const [name, value] of Object.entries(rule.fields)) {
const sel = `input[name="${name}"]`;
await page.waitForSelector(sel);
await page.fill(sel, value);
}
const submitSel = `[name="${rule.submit}"], button[name="${rule.submit}"]`;
await page.waitForSelector(submitSel);
await page.click(submitSel);
await page.waitForLoadState("networkidle");
}
}
}
async function crawl(page, rawUrl, depth) {
const url = normalizeUrl(rawUrl);
if (depth > maxDepth) return;
if (!matcher.allow(url)) return;
if (visited.has(url)) return;
visited.add(url);
await page.goto(url, { waitUntil: "networkidle" });
await performLoginIfNeeded(page, url);
const forms = await page.$$eval("form", forms =>
forms.map(f => ({
action: f.action,
inputs: Array.from(f.querySelectorAll("input, textarea, select")).map(i => ({
name: i.name,
tag: i.tagName.toLowerCase(),
type: i.type || null
}))
}))
);
const links = await page.$$eval("a[href]", as =>
as.map(a => a.href).filter(h => h.startsWith("http"))
);
const ui = await getUIElements(page);
const sections = await getSections(page);
selectors[url] = {
forms,
links: links.map(href => ({ to: href })),
ui,
sections
};
await discoverByClicking(page, url, depth);
const normalizedLinks = links
.map(h => normalizeUrl(h))
.filter(h => h);
for (const link of normalizedLinks) {
await crawl(page, link, depth + 1);
}
}
(async () => {
const browser = await chromium.launch({ headless: false });
const page = await browser.newPage();
await crawl(page, startURL, 0);
fs.writeFileSync("selectors.json", JSON.stringify(selectors, null, 2));
await browser.close();
})();
*/