// 559 lines · 14 KiB · JavaScript
// selectors-crawler.js
//@3
/*
  Writes to   : selectors.json
  Consumed by : graph-utils.js
              : test-generator.js
  Short friendly labels enabled
*/
|
|
|
|
console.log("Loading: selectors-crawler.js");
|
|
|
|
import fs from "fs";
|
|
import { chromium } from "playwright";
|
|
import { normalizeUrl } from "./utils/normalizeUrl.js";
|
|
import { UrlPatternMatcher } from "./utils/urlPatternMatcher.js";
|
|
|
|
// Crawl settings are read once, at module load, from ./login-config.json.
const config = JSON.parse(fs.readFileSync("./login-config.json", "utf8"));
const {
  startURL,
  // An absent maxDepth has always meant "no depth limit" (every
  // `depth > undefined` comparison is false); the default makes that explicit.
  maxDepth = Infinity,
  // Login rules are optional: a config without them means "never log in".
  loginConfig = { logins: [] },
  includePatterns = [],
  excludePatterns = [],
  matcherOptions = {}
} = config;

// Fail fast with a readable message instead of an obscure navigation error later.
if (typeof startURL !== "string" || startURL.trim() === "") {
  throw new Error('login-config.json must define a non-empty "startURL" string');
}

// Normalized URLs that have already been crawled.
const visited = new Set();
// Crawl output, keyed by normalized URL; serialized to selectors.json at the end.
const selectors = {};

// Decides which URLs the crawler is allowed to visit.
const matcher = new UrlPatternMatcher(includePatterns, excludePatterns, matcherOptions);
|
|
|
|
// Substrings that mark an element as a logout control wherever they appear.
const LOGOUT_KEYWORDS = ["logout", "signout", "logoff", "sign-out", "log-out"];

// Separated spellings ("Log out", "Sign  Off", "log_out", "log%20out", "log+out").
// Word boundaries keep phrases such as "Blog outreach" from matching.
const LOGOUT_PHRASE = /\b(?:log|sign)(?:[\s_+]|%20)+(?:out|off)\b/;

/**
 * Tells whether clicking an element would most likely end the session.
 *
 * @param {{text?: string|null, href?: string|null, onclick?: string|null}} info
 *   Element descriptor; missing or null fields are treated as empty strings.
 * @returns {boolean} true when the visible text, href or inline onclick handler
 *   contains a logout keyword or a separated logout phrase (case-insensitive).
 */
function isLogoutElement(info) {
  const haystacks = [info.text, info.href, info.onclick].map(value =>
    (value || "").toLowerCase()
  );

  return haystacks.some(
    value =>
      LOGOUT_KEYWORDS.some(keyword => value.includes(keyword)) ||
      LOGOUT_PHRASE.test(value)
  );
}
|
|
|
|
/**
 * Generate a short friendly label for a crawled element.
 *
 * Candidates are tried in order and the first one that is non-empty after
 * cleaning wins:
 *   1. the element text,
 *   2. `tab-…`, `modal-…` or `accordion-…` built from the matching data-* value,
 *   3. the ARIA role,
 *   4. the fallback `${tag}-${index}`.
 *
 * Cleaning trims the value, turns whitespace runs into "-", drops every
 * character that is not a letter, digit, "-" or "_", and lower-cases the rest.
 *
 * @param {string|null|undefined} text    Visible text of the element.
 * @param {string|null|undefined} role    Value of the role attribute.
 * @param {object|null|undefined} dataset Copy of the element's data-* values.
 * @param {string} tag                    Lower-case tag name (fallback prefix).
 * @param {number|string} index           Identifier used as the fallback suffix.
 * @returns {string} The label.
 */
function friendlyLabel(text, role, dataset, tag, index) {
  const clean = s =>
    String(s || "")
      .trim()
      .replace(/\s+/g, "-")
      .replace(/[^a-zA-Z0-9_-]/g, "")
      .toLowerCase();

  const fromText = clean(text);
  if (fromText) return fromText;

  // A data-* value that cleans down to nothing (e.g. "***") must not produce a
  // dangling "tab-" label; fall through to the next candidate instead.
  for (const key of ["tab", "modal", "accordion"]) {
    const suffix = clean(dataset?.[key]);
    if (suffix) return `${key}-${suffix}`;
  }

  const fromRole = clean(role);
  if (fromRole) return fromRole;

  return `${tag}-${index}`;
}
|
|
|
|
/**
 * Describes every interactive-looking element on the current page.
 *
 * Each descriptor carries the element's position in the match list (`id`),
 * tag, text, role, inline onclick source, href, a copy of its data-* values
 * and a `friendly` label computed by friendlyLabel().
 *
 * @param {object} page Playwright page to inspect.
 * @returns {Promise<object[]>} One descriptor per matched element.
 */
async function getUIElements(page) {
  const interactiveSelector = `
    button,
    [onclick],
    a[onclick]:not([href]),
    img[onclick],
    img[role='button'],
    img[tabindex],
    div[onclick],
    span[onclick],
    [role='button'],
    [data-toggle],
    [data-tab],
    [data-accordion]
  `;

  // Runs inside the browser, so it must not reference anything from this module.
  const describeAll = nodes =>
    nodes.map((node, position) => ({
      id: position,
      tag: node.tagName.toLowerCase(),
      text: node.innerText || "",
      role: node.getAttribute("role") || null,
      onclick: node.getAttribute("onclick"),
      href: node.getAttribute("href"),
      dataset: { ...node.dataset }
    }));

  const found = await page.$$eval(interactiveSelector, describeAll);

  found.forEach(entry => {
    entry.friendly = friendlyLabel(
      entry.text,
      entry.role,
      entry.dataset,
      entry.tag,
      entry.id
    );
  });

  return found;
}
|
|
|
|
/**
 * Describes the structural regions of the current page: sections, dialogs,
 * tab lists and panels, modals and accordion panels.
 *
 * Each descriptor carries the element's own id attribute (or, when that is
 * empty, its position in the match list), tag, the first 200 characters of
 * its text, role, a copy of its data-* values and a `friendly` label computed
 * by friendlyLabel().
 *
 * @param {object} page Playwright page to inspect.
 * @returns {Promise<object[]>} One descriptor per matched region.
 */
async function getSections(page) {
  const regionSelector = `
    section,
    [role='dialog'],
    [role='tabpanel'],
    [role='tablist'],
    .modal,
    [data-modal],
    [data-section],
    [data-tab-panel],
    [data-accordion-panel]
  `;

  // Runs inside the browser, so it must not reference anything from this module.
  const describeAll = nodes =>
    nodes.map((node, position) => ({
      id: node.id || position,
      tag: node.tagName.toLowerCase(),
      text: (node.innerText || "").slice(0, 200),
      role: node.getAttribute("role") || null,
      dataset: { ...node.dataset }
    }));

  const regions = await page.$$eval(regionSelector, describeAll);

  regions.forEach(region => {
    region.friendly = friendlyLabel(
      region.text,
      region.role,
      region.dataset,
      region.tag,
      region.id
    );
  });

  return regions;
}
|
|
|
|
/**
 * Scrolls the page from top to bottom in steps, pausing 150 ms after each
 * step, then jumps to the very bottom.
 *
 * The step is one fifth of the page height but never less than 200 px. The
 * height is measured once, before scrolling starts.
 *
 * @param {object} page Playwright page to scroll.
 * @returns {Promise<void>}
 */
async function scrollToReveal(page) {
  // Runs inside the browser, so it must not reference anything from this module.
  const sweepPage = async () => {
    const pause = ms => new Promise(resolve => setTimeout(resolve, ms));

    const pageHeight =
      document.body.scrollHeight || document.documentElement.scrollHeight || 0;
    const stride = Math.max(200, Math.floor(pageHeight / 5));

    let offset = 0;
    while (offset < pageHeight) {
      window.scrollTo(0, offset);
      await pause(150);
      offset += stride;
    }

    window.scrollTo(0, pageHeight);
  };

  await page.evaluate(sweepPage);
}
|
|
|
|
/**
 * Clicks every non-logout interactive element on the current page and crawls
 * any links that only became available as a result.
 *
 * Clicking and crawling are two separate phases. crawl() navigates the shared
 * page, so recursing in the middle of the click loop would cause the remaining
 * XPaths (computed for this page) to be evaluated against a different
 * document. Newly discovered URLs are therefore queued and crawled only after
 * every element has been tried. If a click itself navigates away, the landing
 * URL is queued and the page is brought back before the next click.
 *
 * Failures on a single element or a single queued URL are logged and skipped
 * so one bad element cannot end the crawl.
 *
 * @param {object} page  Playwright page, already showing the page to explore.
 * @param {string} url   Normalized URL of that page (used in log messages).
 * @param {number} depth Depth of that page; discovered URLs use depth + 1.
 * @returns {Promise<void>}
 */
async function discoverByClicking(page, url, depth) {
  await scrollToReveal(page);

  const clickables = await page.$$eval(
    `
      button,
      [onclick],
      a[onclick]:not([href]),
      img[onclick],
      img[role='button'],
      img[tabindex],
      div[onclick],
      span[onclick],
      [role='button'],
      [data-toggle],
      [data-tab],
      [data-accordion]
    `,
    els =>
      els.map((el, index) => ({
        text: el.innerText || "",
        href: el.getAttribute("href"),
        onclick: el.getAttribute("onclick"),
        tag: el.tagName.toLowerCase(),
        role: el.getAttribute("role"),
        dataset: { ...el.dataset },
        index,
        // Absolute positional XPath (/HTML[1]/BODY[1]/DIV[2]/…), used to find
        // the same element again when it is clicked later.
        xpath: (() => {
          let path = "";
          let current = el;
          while (current && current.nodeType === 1) {
            let idx = 1;
            let sibling = current.previousElementSibling;
            while (sibling) {
              if (sibling.tagName === current.tagName) idx++;
              sibling = sibling.previousElementSibling;
            }
            path = `/${current.tagName}[${idx}]` + path;
            current = current.parentElement;
          }
          return path;
        })()
      }))
  );

  const originUrl = page.url();
  const withoutHash = address => address.split("#")[0];
  const discovered = new Set();

  // When a click has moved the page to another document, remember where it
  // landed and reload the page being explored. Hash-only changes (tabs,
  // anchors) stay within the same document and are left alone.
  const returnToOrigin = async () => {
    const current = page.url();
    if (withoutHash(current) === withoutHash(originUrl)) return false;
    const landed = normalizeUrl(current);
    if (landed) discovered.add(landed);
    await page.goto(originUrl, { waitUntil: "networkidle" });
    return true;
  };

  for (const info of clickables) {
    if (isLogoutElement(info)) continue;

    try {
      const beforeLinks = new Set(
        await page.$$eval("a[href]", as => as.map(a => a.href))
      );

      await page.evaluate(xpath => {
        const result = document.evaluate(
          xpath,
          document,
          null,
          XPathResult.FIRST_ORDERED_NODE_TYPE,
          null
        );
        const el = result.singleNodeValue;
        if (el) el.click();
      }, info.xpath);

      // Give click handlers a moment to update the DOM.
      await page.waitForTimeout(400);

      if (await returnToOrigin()) continue;

      const afterLinks = await page.$$eval("a[href]", as =>
        as.map(a => a.href)
      );

      for (const link of afterLinks) {
        if (beforeLinks.has(link)) continue;
        const normalized = normalizeUrl(link);
        if (normalized) discovered.add(normalized);
      }
    } catch (err) {
      console.warn(
        `discoverByClicking: skipped ${info.xpath} on ${url}: ${err?.message ?? err}`
      );
      // A click that navigates can fail the in-page call itself; make sure the
      // next element is still tried on the right page.
      try {
        await returnToOrigin();
      } catch (restoreErr) {
        console.warn(
          `discoverByClicking: could not return to ${originUrl}: ${restoreErr?.message ?? restoreErr}`
        );
      }
    }
  }

  for (const link of discovered) {
    try {
      await crawl(page, link, depth + 1);
    } catch (err) {
      console.warn(
        `discoverByClicking: crawl of ${link} failed: ${err?.message ?? err}`
      );
    }
  }
}
|
|
|
|
/**
 * Fills in and submits a login form for every configured rule whose `match`
 * string occurs in the given URL.
 *
 * For a matching rule, each entry of `rule.fields` (input name → value) is
 * typed into `input[name="…"]`, the control named `rule.submit` is clicked,
 * and the function waits for the network to go idle.
 *
 * A configuration without `loginConfig`, without `logins`, or a rule without
 * `fields` is treated as "nothing to fill in" rather than raising a TypeError.
 *
 * @param {object} page Playwright page, already navigated to `url`.
 * @param {string} url  URL the page was navigated to.
 * @returns {Promise<void>}
 */
async function performLoginIfNeeded(page, url) {
  const rules = loginConfig?.logins ?? [];

  for (const rule of rules) {
    if (!url.includes(rule.match)) continue;

    for (const [name, value] of Object.entries(rule.fields ?? {})) {
      const sel = `input[name="${name}"]`;
      await page.waitForSelector(sel);
      await page.fill(sel, value);
    }

    const submitSel = `[name="${rule.submit}"], button[name="${rule.submit}"]`;
    await page.waitForSelector(submitSel);
    await page.click(submitSel);

    await page.waitForLoadState("networkidle");
  }
}
|
|
|
|
/**
 * Visits one URL, records what is on the page, then recurses into everything
 * reachable from it.
 *
 * The URL is skipped when it cannot be normalized, lies deeper than
 * `maxDepth`, is rejected by the pattern matcher, or was already visited.
 * Otherwise the page is loaded, a login is performed if a rule matches, and
 * the page's forms, http(s) links, UI elements and sections are stored in
 * `selectors[url]`. Links revealed by clicking are explored next, followed by
 * the ordinary links.
 *
 * A page that fails to load is logged and skipped so one dead link does not
 * abort the whole crawl.
 *
 * @param {object} page   Shared Playwright page; it is navigated by this call.
 * @param {string} rawUrl URL to visit, not necessarily normalized.
 * @param {number} depth  Distance from the start URL.
 * @returns {Promise<void>}
 */
async function crawl(page, rawUrl, depth) {
  const url = normalizeUrl(rawUrl);

  // Other call sites treat a falsy result as "not crawlable"; do the same here.
  if (!url) return;
  if (depth > maxDepth) return;
  if (!matcher.allow(url)) return;
  if (visited.has(url)) return;

  // Marked before loading so a URL that fails is not retried.
  visited.add(url);

  try {
    await page.goto(url, { waitUntil: "networkidle" });
  } catch (err) {
    console.warn(`crawl: could not load ${url}: ${err?.message ?? err}`);
    return;
  }
  await performLoginIfNeeded(page, url);

  const forms = await page.$$eval("form", forms =>
    forms.map(f => ({
      action: f.action,
      inputs: Array.from(f.querySelectorAll("input, textarea, select")).map(i => ({
        name: i.name,
        tag: i.tagName.toLowerCase(),
        type: i.type || null
      }))
    }))
  );

  const links = await page.$$eval("a[href]", as =>
    as.map(a => a.href).filter(h => h.startsWith("http"))
  );

  const ui = await getUIElements(page);
  const sections = await getSections(page);

  selectors[url] = {
    forms,
    links: links.map(href => ({ to: href })),
    ui,
    sections
  };

  await discoverByClicking(page, url, depth);

  const normalizedLinks = links
    .map(h => normalizeUrl(h))
    .filter(h => h);

  for (const link of normalizedLinks) {
    await crawl(page, link, depth + 1);
  }
}
|
|
|
|
// Entry point: crawl from the start URL and write the result to selectors.json.
(async () => {
  const browser = await chromium.launch({ headless: false });

  try {
    const page = await browser.newPage();

    await crawl(page, startURL, 0);

    // Written only after a complete crawl, so a failed run leaves any
    // previous selectors.json untouched.
    fs.writeFileSync("selectors.json", JSON.stringify(selectors, null, 2));
  } finally {
    // Always release the browser, even when the crawl throws.
    await browser.close();
  }
})().catch(err => {
  // Report the failure and exit non-zero instead of leaving a floating rejection.
  console.error("selectors-crawler.js failed:", err);
  process.exitCode = 1;
});
|
|
|
|
|
|
// selectors-crawler.js
|
|
//@3
|
|
/*
|
|
Writes to : selectors.json
|
|
Consumed by : graph-utils.js
|
|
: test-generator.js
|
|
|
|
console.log("Loading: selectors-crawler.js");
|
|
|
|
import fs from "fs";
|
|
import { chromium } from "playwright";
|
|
import { normalizeUrl } from "./utils/normalizeUrl.js";
|
|
import { UrlPatternMatcher } from "./utils/urlPatternMatcher.js";
|
|
|
|
const config = JSON.parse(fs.readFileSync("./login-config.json", "utf8"));
|
|
const {
|
|
startURL,
|
|
maxDepth,
|
|
loginConfig,
|
|
includePatterns = [],
|
|
excludePatterns = [],
|
|
matcherOptions = {}
|
|
} = config;
|
|
|
|
const visited = new Set();
|
|
const selectors = {};
|
|
|
|
const matcher = new UrlPatternMatcher(includePatterns, excludePatterns, matcherOptions);
|
|
|
|
const LOGOUT_KEYWORDS = ["logout", "signout", "logoff", "sign-out", "log-out"];
|
|
|
|
function isLogoutElement(info) {
|
|
const text = (info.text || "").toLowerCase();
|
|
const href = (info.href || "").toLowerCase();
|
|
const onclick = (info.onclick || "").toLowerCase();
|
|
return LOGOUT_KEYWORDS.some(k =>
|
|
text.includes(k) || href.includes(k) || onclick.includes(k)
|
|
);
|
|
}
|
|
|
|
async function getUIElements(page) {
|
|
return await page.$$eval(
|
|
`
|
|
button,
|
|
[onclick],
|
|
a[onclick]:not([href]),
|
|
img[onclick],
|
|
img[role='button'],
|
|
img[tabindex],
|
|
div[onclick],
|
|
span[onclick],
|
|
[role='button'],
|
|
[data-toggle],
|
|
[data-tab],
|
|
[data-accordion]
|
|
`,
|
|
els =>
|
|
els.map((el, index) => ({
|
|
id: `ui-${index}`,
|
|
tag: el.tagName.toLowerCase(),
|
|
text: el.innerText || "",
|
|
role: el.getAttribute("role") || null,
|
|
onclick: el.getAttribute("onclick"),
|
|
href: el.getAttribute("href"),
|
|
dataset: { ...el.dataset }
|
|
}))
|
|
);
|
|
}
|
|
|
|
async function getSections(page) {
|
|
return await page.$$eval(
|
|
`
|
|
section,
|
|
[role='dialog'],
|
|
[role='tabpanel'],
|
|
[role='tablist'],
|
|
.modal,
|
|
[data-modal],
|
|
[data-section],
|
|
[data-tab-panel],
|
|
[data-accordion-panel]
|
|
`,
|
|
els =>
|
|
els.map((el, index) => ({
|
|
id: el.id || `section-${index}`,
|
|
tag: el.tagName.toLowerCase(),
|
|
text: (el.innerText || "").slice(0, 200),
|
|
role: el.getAttribute("role") || null,
|
|
dataset: { ...el.dataset }
|
|
}))
|
|
);
|
|
}
|
|
|
|
async function scrollToReveal(page) {
|
|
await page.evaluate(async () => {
|
|
const total =
|
|
document.body.scrollHeight || document.documentElement.scrollHeight || 0;
|
|
const step = Math.max(200, Math.floor(total / 5));
|
|
for (let y = 0; y < total; y += step) {
|
|
window.scrollTo(0, y);
|
|
await new Promise(r => setTimeout(r, 150));
|
|
}
|
|
window.scrollTo(0, total);
|
|
});
|
|
}
|
|
|
|
async function discoverByClicking(page, url, depth) {
|
|
await scrollToReveal(page);
|
|
|
|
const clickables = await page.$$eval(
|
|
`
|
|
button,
|
|
[onclick],
|
|
a[onclick]:not([href]),
|
|
img[onclick],
|
|
img[role='button'],
|
|
img[tabindex],
|
|
div[onclick],
|
|
span[onclick],
|
|
[role='button'],
|
|
[data-toggle],
|
|
[data-tab],
|
|
[data-accordion]
|
|
`,
|
|
els =>
|
|
els.map(el => ({
|
|
text: el.innerText || "",
|
|
href: el.getAttribute("href"),
|
|
onclick: el.getAttribute("onclick"),
|
|
xpath: (() => {
|
|
let path = "";
|
|
let current = el;
|
|
while (current && current.nodeType === 1) {
|
|
let index = 1;
|
|
let sibling = current.previousElementSibling;
|
|
while (sibling) {
|
|
if (sibling.tagName === current.tagName) index++;
|
|
sibling = sibling.previousElementSibling;
|
|
}
|
|
path = `/${current.tagName}[${index}]` + path;
|
|
current = current.parentElement;
|
|
}
|
|
return path;
|
|
})()
|
|
}))
|
|
);
|
|
|
|
for (const info of clickables) {
|
|
if (isLogoutElement(info)) continue;
|
|
|
|
try {
|
|
const beforeLinks = new Set(
|
|
await page.$$eval("a[href]", as => as.map(a => a.href))
|
|
);
|
|
|
|
await page.evaluate(xpath => {
|
|
const getNode = xp => {
|
|
const result = document.evaluate(
|
|
xp,
|
|
document,
|
|
null,
|
|
XPathResult.FIRST_ORDERED_NODE_TYPE,
|
|
null
|
|
);
|
|
return result.singleNodeValue;
|
|
};
|
|
const el = getNode(xpath);
|
|
if (el) el.click();
|
|
}, info.xpath);
|
|
|
|
await page.waitForTimeout(400);
|
|
|
|
const afterLinks = await page.$$eval("a[href]", as =>
|
|
as.map(a => a.href)
|
|
);
|
|
|
|
for (const link of afterLinks) {
|
|
if (!beforeLinks.has(link)) {
|
|
const normalized = normalizeUrl(link);
|
|
if (normalized) await crawl(page, normalized, depth + 1);
|
|
}
|
|
}
|
|
} catch {}
|
|
}
|
|
}
|
|
|
|
async function performLoginIfNeeded(page, url) {
|
|
for (const rule of loginConfig.logins) {
|
|
if (url.includes(rule.match)) {
|
|
for (const [name, value] of Object.entries(rule.fields)) {
|
|
const sel = `input[name="${name}"]`;
|
|
await page.waitForSelector(sel);
|
|
await page.fill(sel, value);
|
|
}
|
|
|
|
const submitSel = `[name="${rule.submit}"], button[name="${rule.submit}"]`;
|
|
await page.waitForSelector(submitSel);
|
|
await page.click(submitSel);
|
|
|
|
await page.waitForLoadState("networkidle");
|
|
}
|
|
}
|
|
}
|
|
|
|
async function crawl(page, rawUrl, depth) {
|
|
const url = normalizeUrl(rawUrl);
|
|
|
|
if (depth > maxDepth) return;
|
|
if (!matcher.allow(url)) return;
|
|
if (visited.has(url)) return;
|
|
|
|
visited.add(url);
|
|
|
|
await page.goto(url, { waitUntil: "networkidle" });
|
|
await performLoginIfNeeded(page, url);
|
|
|
|
const forms = await page.$$eval("form", forms =>
|
|
forms.map(f => ({
|
|
action: f.action,
|
|
inputs: Array.from(f.querySelectorAll("input, textarea, select")).map(i => ({
|
|
name: i.name,
|
|
tag: i.tagName.toLowerCase(),
|
|
type: i.type || null
|
|
}))
|
|
}))
|
|
);
|
|
|
|
const links = await page.$$eval("a[href]", as =>
|
|
as.map(a => a.href).filter(h => h.startsWith("http"))
|
|
);
|
|
|
|
const ui = await getUIElements(page);
|
|
const sections = await getSections(page);
|
|
|
|
selectors[url] = {
|
|
forms,
|
|
links: links.map(href => ({ to: href })),
|
|
ui,
|
|
sections
|
|
};
|
|
|
|
await discoverByClicking(page, url, depth);
|
|
|
|
const normalizedLinks = links
|
|
.map(h => normalizeUrl(h))
|
|
.filter(h => h);
|
|
|
|
for (const link of normalizedLinks) {
|
|
await crawl(page, link, depth + 1);
|
|
}
|
|
}
|
|
|
|
(async () => {
|
|
const browser = await chromium.launch({ headless: false });
|
|
const page = await browser.newPage();
|
|
|
|
await crawl(page, startURL, 0);
|
|
|
|
fs.writeFileSync("selectors.json", JSON.stringify(selectors, null, 2));
|
|
await browser.close();
|
|
})();
|
|
*/
|