Add --fast option to NPM driver

main
Elbert Alias 1 year ago
parent 6fb1baedde
commit 4582061e8d

@ -13,6 +13,7 @@ const aliases = {
a: 'userAgent',
b: 'batchSize',
d: 'debug',
f: 'fast',
t: 'delay',
h: 'help',
H: 'header',
@ -74,6 +75,7 @@ Examples:
Options:
-b, --batch-size=... Process links in batches
-d, --debug Output debug messages
-f, --fast Prioritise speed over accuracy
-t, --delay=ms Wait for ms milliseconds between requests
-h, --help This text
-H, --header Extra header to send with requests

@ -283,7 +283,11 @@ function analyzeDom(dom, technologies = Wappalyzer.technologies) {
}
function get(url, options = {}) {
const timeout = options.timeout || 10000
const timeout =
options.timeout ||
(this.options.fast
? this.Math.min(this.options.maxWait, 3000)
: this.options.maxWait)
if (['http:', 'https:'].includes(url.protocol)) {
const { get } = url.protocol === 'http:' ? http : https
@ -314,7 +318,7 @@ function get(url, options = {}) {
}
)
.setTimeout(timeout, () =>
reject(new Error(`Timeout (${url.href}, ${timeout}ms)`))
reject(new Error(`Timeout (${url}, ${timeout}ms)`))
)
.on('error', (error) => reject(new Error(error.message)))
)
@ -345,6 +349,7 @@ class Driver {
}
this.options.debug = Boolean(+this.options.debug)
this.options.fast = Boolean(+this.options.fast)
this.options.recursive = Boolean(+this.options.recursive)
this.options.probe =
String(this.options.probe || '').toLowerCase() === 'basic'
@ -369,7 +374,7 @@ class Driver {
}
async init() {
for (let attempt = 1; attempt <= 3; attempt++) {
for (let attempt = 1; attempt <= 2; attempt++) {
this.log(`Launching browser (attempt ${attempt})...`)
try {
@ -385,7 +390,9 @@ class Driver {
acceptInsecureCerts: true,
args: chromiumArgs,
executablePath: CHROMIUM_BIN,
timeout: 5000,
timeout: this.options.fast
? Math.min(this.options.maxWait, 10000)
: this.options.maxWait,
})
}
@ -393,28 +400,20 @@ class Driver {
} catch (error) {
this.log(error)
if (attempt >= 3) {
if (attempt >= 2) {
throw new Error(error.message || error.toString())
}
}
}
this.browser.on('disconnected', async () => {
this.log('Browser disconnected')
this.browser.on('disconnected', () => {
this.browser = undefined
if (!this.destroyed) {
try {
await this.init()
} catch (error) {
this.log(error)
}
}
this.log('Browser disconnected')
})
}
async destroy() {
this.destroyed = true
if (this.browser) {
try {
await sleep(1)
@ -507,8 +506,6 @@ class Site {
this.cache = {}
this.probed = false
this.destroyed = false
}
log(message, source = 'driver', type = 'log') {
@ -544,7 +541,9 @@ class Site {
promise,
fallback,
errorMessage = 'Operation took too long to complete',
maxWait = Math.min(this.options.maxWait, 3000)
maxWait = this.options.fast
? Math.min(this.options.maxWait, 2000)
: this.options.maxWait
) {
let timeout = null
@ -579,10 +578,6 @@ class Site {
}
async goto(url) {
if (this.destroyed) {
return
}
// Return when the URL is a duplicate or maxUrls has been reached
if (this.analyzedUrls[url.href]) {
return []
@ -640,14 +635,18 @@ class Site {
) {
request.abort('blockedbyclient')
} else {
const headers = {
...request.headers(),
...this.options.headers,
}
await this.emit('request', { page, request })
request.continue({ headers })
if (Object.keys(this.options.headers).length) {
const headers = {
...request.headers(),
...this.options.headers,
}
request.continue({ headers })
} else {
request.continue()
}
}
} catch (error) {
error.message += ` (${url})`
@ -657,7 +656,7 @@ class Site {
})
page.on('response', async (response) => {
if (this.destroyed || !page || page.__closed || page.isClosed()) {
if (!page || page.__closed || page.isClosed()) {
return
}
@ -745,7 +744,7 @@ class Site {
}
if (!this.options.noScripts) {
await sleep(1000)
await sleep(this.options.fast ? 1000 : 3000)
}
// page.on('console', (message) => this.log(message.text()))
@ -810,151 +809,170 @@ class Site {
let dom = []
if (html) {
// Links
links = !this.options.recursive
? []
: await this.promiseTimeout(
await Promise.all([
(async () => {
// Links
links = !this.options.recursive
? []
: await this.promiseTimeout(
(
await this.promiseTimeout(
page.evaluateHandle(() =>
Array.from(document.getElementsByTagName('a')).map(
({
hash,
hostname,
href,
pathname,
protocol,
rel,
}) => ({
hash,
hostname,
href,
pathname,
protocol,
rel,
})
)
),
{ jsonValue: () => [] },
'Timeout (links)'
)
).jsonValue(),
[],
'Timeout (links)'
)
})(),
(async () => {
// Text
text = await this.promiseTimeout(
(
await this.promiseTimeout(
page.evaluateHandle(() =>
Array.from(document.getElementsByTagName('a')).map(
({ hash, hostname, href, pathname, protocol, rel }) => ({
hash,
hostname,
href,
pathname,
protocol,
rel,
})
)
page.evaluateHandle(
() =>
// eslint-disable-next-line unicorn/prefer-text-content
document.body && document.body.innerText
),
{ jsonValue: () => [] },
'Timeout (links)'
{ jsonValue: () => '' },
'Timeout (text)'
)
).jsonValue(),
[],
'Timeout (links)'
)
// Text
text = await this.promiseTimeout(
(
await this.promiseTimeout(
page.evaluateHandle(
() =>
// eslint-disable-next-line unicorn/prefer-text-content
document.body && document.body.innerText
),
{ jsonValue: () => '' },
'',
'Timeout (text)'
)
).jsonValue(),
'',
'Timeout (text)'
)
// CSS
css = await this.promiseTimeout(
(
await this.promiseTimeout(
page.evaluateHandle((maxRows) => {
const css = []
})(),
(async () => {
// CSS
css = await this.promiseTimeout(
(
await this.promiseTimeout(
page.evaluateHandle((maxRows) => {
const css = []
try {
if (!document.styleSheets.length) {
return ''
}
try {
if (!document.styleSheets.length) {
return ''
}
for (const sheet of Array.from(document.styleSheets)) {
for (const rules of Array.from(sheet.cssRules)) {
css.push(rules.cssText)
for (const sheet of Array.from(document.styleSheets)) {
for (const rules of Array.from(sheet.cssRules)) {
css.push(rules.cssText)
if (css.length >= maxRows) {
break
if (css.length >= maxRows) {
break
}
}
}
} catch (error) {
return ''
}
}
} catch (error) {
return ''
}
return css.join('\n')
}, this.options.htmlMaxRows),
{ jsonValue: () => '' },
return css.join('\n')
}, this.options.htmlMaxRows),
{ jsonValue: () => '' },
'Timeout (css)'
)
).jsonValue(),
'',
'Timeout (css)'
)
).jsonValue(),
'',
'Timeout (css)'
)
})(),
(async () => {
// Script tags
;[scriptSrc, scripts] = await this.promiseTimeout(
(
await this.promiseTimeout(
page.evaluateHandle(() => {
const nodes = Array.from(
document.getElementsByTagName('script')
)
// Script tags
;[scriptSrc, scripts] = await this.promiseTimeout(
(
await this.promiseTimeout(
page.evaluateHandle(() => {
const nodes = Array.from(
document.getElementsByTagName('script')
return [
nodes
.filter(
({ src }) =>
src && !src.startsWith('data:text/javascript;')
)
.map(({ src }) => src),
nodes
.map((node) => node.textContent)
.filter((script) => script),
]
}),
{ jsonValue: () => [] },
'Timeout (scripts)'
)
return [
nodes
.filter(
({ src }) =>
src && !src.startsWith('data:text/javascript;')
)
.map(({ src }) => src),
nodes
.map((node) => node.textContent)
.filter((script) => script),
]
}),
{ jsonValue: () => [] },
).jsonValue(),
[],
'Timeout (scripts)'
)
).jsonValue(),
[],
'Timeout (scripts)'
)
// Meta tags
meta = await this.promiseTimeout(
(
await this.promiseTimeout(
page.evaluateHandle(() =>
Array.from(document.querySelectorAll('meta')).reduce(
(metas, meta) => {
const key =
meta.getAttribute('name') || meta.getAttribute('property')
if (key) {
metas[key.toLowerCase()] = metas[key.toLowerCase()] || []
metas[key.toLowerCase()].push(
meta.getAttribute('content')
)
}
return metas
},
{}
})(),
(async () => {
// Meta tags
meta = await this.promiseTimeout(
(
await this.promiseTimeout(
page.evaluateHandle(() =>
Array.from(document.querySelectorAll('meta')).reduce(
(metas, meta) => {
const key =
meta.getAttribute('name') ||
meta.getAttribute('property')
if (key) {
metas[key.toLowerCase()] =
metas[key.toLowerCase()] || []
metas[key.toLowerCase()].push(
meta.getAttribute('content')
)
}
return metas
},
{}
)
),
{ jsonValue: () => [] },
'Timeout (meta)'
)
),
{ jsonValue: () => [] },
).jsonValue(),
[],
'Timeout (meta)'
)
).jsonValue(),
[],
'Timeout (meta)'
)
// JavaScript
js = this.options.noScripts
? []
: await this.promiseTimeout(getJs(page), [], 'Timeout (js)')
// DOM
dom = await this.promiseTimeout(getDom(page), [], 'Timeout (dom)')
})(),
(async () => {
// JavaScript
js = this.options.noScripts
? []
: await this.promiseTimeout(getJs(page), [], 'Timeout (js)')
})(),
(async () => {
// DOM
dom = await this.promiseTimeout(getDom(page), [], 'Timeout (dom)')
})(),
])
}
this.cache[url.href] = {
@ -1037,7 +1055,9 @@ class Site {
}
if (error.message.includes('net::ERR_NAME_NOT_RESOLVED')) {
const newError = new Error(`Hostname could not be resolved (${url})`)
const newError = new Error(
`Hostname could not be resolved (${url.hostname})`
)
newError.code = 'WAPPALYZER_DNS_ERROR'
@ -1253,7 +1273,9 @@ class Site {
}),
[],
'Timeout (dns)',
Math.min(this.options.maxWait, 15000)
this.options.fast
? Math.min(this.options.maxWait, 15000)
: this.options.maxWait
)
}
@ -1452,8 +1474,6 @@ class Site {
})
)
this.destroyed = true
this.log('Site closed')
}
}

@ -13,7 +13,7 @@
"software"
],
"homepage": "https://www.wappalyzer.com/",
"version": "6.10.65",
"version": "6.10.66",
"author": "Wappalyzer",
"license": "GPL-3.0",
"repository": {

@ -4,7 +4,7 @@
"author": "Wappalyzer",
"homepage_url": "https://www.wappalyzer.com/",
"description": "Identify web technologies",
"version": "6.10.65",
"version": "6.10.66",
"default_locale": "en",
"manifest_version": 2,
"icons": {

@ -4,7 +4,7 @@
"author": "Wappalyzer",
"homepage_url": "https://www.wappalyzer.com/",
"description": "Identify web technologies",
"version": "6.10.65",
"version": "6.10.66",
"default_locale": "en",
"manifest_version": 3,
"icons": {

@ -13,7 +13,7 @@
"software"
],
"homepage": "https://www.wappalyzer.com/",
"version": "6.10.65",
"version": "6.10.66",
"author": "Wappalyzer",
"license": "GPL-3.0",
"repository": {

@ -389,20 +389,6 @@
"scriptSrc": "cdn\\.blog\\.st-hatena\\.com/",
"website": "https://hatenablog.com"
},
"Header Bidding Ai": {
"cats": [
36
],
"description": "Header Bidding Ai is a provider of an automated and managed header bidding solution. Header bidding cutting-edge technique where publishers offer their ad inventory to many ad exchanges.",
"icon": "Header Bidding Ai.svg",
"scriptSrc": "\\.headerbidding\\.ai/",
"saas": true,
"pricing": [
"poa",
"recurring"
],
"website": "https://headerbidding.ai"
},
"HeadJS": {
"cats": [
59
@ -415,6 +401,20 @@
"scriptSrc": "head\\.(?:core|load)(?:\\.min)?\\.js",
"website": "https://headjs.com"
},
"Header Bidding Ai": {
"cats": [
36
],
"description": "Header Bidding Ai is a provider of an automated and managed header bidding solution. Header bidding cutting-edge technique where publishers offer their ad inventory to many ad exchanges.",
"icon": "Header Bidding Ai.svg",
"pricing": [
"poa",
"recurring"
],
"saas": true,
"scriptSrc": "\\.headerbidding\\.ai/",
"website": "https://headerbidding.ai"
},
"Headless UI": {
"cats": [
66

@ -1955,22 +1955,6 @@
"scriptSrc": "widget\\.sezzle\\.(?:in|com)",
"website": "https://sezzle.com/"
},
"shadcn/ui": {
"cats": [
66
],
"css": [
"--destructive-foreground"
],
"description": "shadcn/ui is a component system built with Radix UI and Tailwind CSS.",
"icon": "shadcn-ui.svg",
"oss": true,
"implies": [
"Radix UI",
"Tailwind CSS"
],
"website": "https://ui.shadcn.com"
},
"Shaka Player": {
"cats": [
14
@ -6720,11 +6704,11 @@
],
"description": "Summernote is an open-source JavaScript library that offers a feature-rich WYSIWYG editor for web applications, allowing users to create and edit formatted content in a familiar word processor-like interface.",
"icon": "Summernote.svg",
"oss": true,
"scriptSrc": [
"/(?:S|s)ummernote(?:\\.min)?\\.js",
"/summernote(?:@|-)([\\d\\.]+)/\\;version:\\1"
],
"oss": true,
"website": "https://summernote.org"
},
"Sumo": {
@ -7310,6 +7294,22 @@
"scriptSrc": "scrollreveal(?:\\.min)(?:\\.js)",
"website": "https://scrollrevealjs.org"
},
"shadcn/ui": {
"cats": [
66
],
"css": [
"--destructive-foreground"
],
"description": "shadcn/ui is a component system built with Radix UI and Tailwind CSS.",
"icon": "shadcn-ui.svg",
"implies": [
"Radix UI",
"Tailwind CSS"
],
"oss": true,
"website": "https://ui.shadcn.com"
},
"shine.js": {
"cats": [
25