Add --fast option to NPM driver

main
Elbert Alias 1 year ago
parent 6fb1baedde
commit 4582061e8d

@ -13,6 +13,7 @@ const aliases = {
a: 'userAgent', a: 'userAgent',
b: 'batchSize', b: 'batchSize',
d: 'debug', d: 'debug',
f: 'fast',
t: 'delay', t: 'delay',
h: 'help', h: 'help',
H: 'header', H: 'header',
@ -74,6 +75,7 @@ Examples:
Options: Options:
-b, --batch-size=... Process links in batches -b, --batch-size=... Process links in batches
-d, --debug Output debug messages -d, --debug Output debug messages
-f, --fast Prioritise speed over accuracy
-t, --delay=ms Wait for ms milliseconds between requests -t, --delay=ms Wait for ms milliseconds between requests
-h, --help This text -h, --help This text
-H, --header Extra header to send with requests -H, --header Extra header to send with requests

@ -283,7 +283,11 @@ function analyzeDom(dom, technologies = Wappalyzer.technologies) {
} }
function get(url, options = {}) { function get(url, options = {}) {
const timeout = options.timeout || 10000 const timeout =
options.timeout ||
(this.options.fast
? this.Math.min(this.options.maxWait, 3000)
: this.options.maxWait)
if (['http:', 'https:'].includes(url.protocol)) { if (['http:', 'https:'].includes(url.protocol)) {
const { get } = url.protocol === 'http:' ? http : https const { get } = url.protocol === 'http:' ? http : https
@ -314,7 +318,7 @@ function get(url, options = {}) {
} }
) )
.setTimeout(timeout, () => .setTimeout(timeout, () =>
reject(new Error(`Timeout (${url.href}, ${timeout}ms)`)) reject(new Error(`Timeout (${url}, ${timeout}ms)`))
) )
.on('error', (error) => reject(new Error(error.message))) .on('error', (error) => reject(new Error(error.message)))
) )
@ -345,6 +349,7 @@ class Driver {
} }
this.options.debug = Boolean(+this.options.debug) this.options.debug = Boolean(+this.options.debug)
this.options.fast = Boolean(+this.options.fast)
this.options.recursive = Boolean(+this.options.recursive) this.options.recursive = Boolean(+this.options.recursive)
this.options.probe = this.options.probe =
String(this.options.probe || '').toLowerCase() === 'basic' String(this.options.probe || '').toLowerCase() === 'basic'
@ -369,7 +374,7 @@ class Driver {
} }
async init() { async init() {
for (let attempt = 1; attempt <= 3; attempt++) { for (let attempt = 1; attempt <= 2; attempt++) {
this.log(`Launching browser (attempt ${attempt})...`) this.log(`Launching browser (attempt ${attempt})...`)
try { try {
@ -385,7 +390,9 @@ class Driver {
acceptInsecureCerts: true, acceptInsecureCerts: true,
args: chromiumArgs, args: chromiumArgs,
executablePath: CHROMIUM_BIN, executablePath: CHROMIUM_BIN,
timeout: 5000, timeout: this.options.fast
? Math.min(this.options.maxWait, 10000)
: this.options.maxWait,
}) })
} }
@ -393,28 +400,20 @@ class Driver {
} catch (error) { } catch (error) {
this.log(error) this.log(error)
if (attempt >= 3) { if (attempt >= 2) {
throw new Error(error.message || error.toString()) throw new Error(error.message || error.toString())
} }
} }
} }
this.browser.on('disconnected', async () => { this.browser.on('disconnected', () => {
this.log('Browser disconnected') this.browser = undefined
if (!this.destroyed) { this.log('Browser disconnected')
try {
await this.init()
} catch (error) {
this.log(error)
}
}
}) })
} }
async destroy() { async destroy() {
this.destroyed = true
if (this.browser) { if (this.browser) {
try { try {
await sleep(1) await sleep(1)
@ -507,8 +506,6 @@ class Site {
this.cache = {} this.cache = {}
this.probed = false this.probed = false
this.destroyed = false
} }
log(message, source = 'driver', type = 'log') { log(message, source = 'driver', type = 'log') {
@ -544,7 +541,9 @@ class Site {
promise, promise,
fallback, fallback,
errorMessage = 'Operation took too long to complete', errorMessage = 'Operation took too long to complete',
maxWait = Math.min(this.options.maxWait, 3000) maxWait = this.options.fast
? Math.min(this.options.maxWait, 2000)
: this.options.maxWait
) { ) {
let timeout = null let timeout = null
@ -579,10 +578,6 @@ class Site {
} }
async goto(url) { async goto(url) {
if (this.destroyed) {
return
}
// Return when the URL is a duplicate or maxUrls has been reached // Return when the URL is a duplicate or maxUrls has been reached
if (this.analyzedUrls[url.href]) { if (this.analyzedUrls[url.href]) {
return [] return []
@ -640,14 +635,18 @@ class Site {
) { ) {
request.abort('blockedbyclient') request.abort('blockedbyclient')
} else { } else {
const headers = {
...request.headers(),
...this.options.headers,
}
await this.emit('request', { page, request }) await this.emit('request', { page, request })
request.continue({ headers }) if (Object.keys(this.options.headers).length) {
const headers = {
...request.headers(),
...this.options.headers,
}
request.continue({ headers })
} else {
request.continue()
}
} }
} catch (error) { } catch (error) {
error.message += ` (${url})` error.message += ` (${url})`
@ -657,7 +656,7 @@ class Site {
}) })
page.on('response', async (response) => { page.on('response', async (response) => {
if (this.destroyed || !page || page.__closed || page.isClosed()) { if (!page || page.__closed || page.isClosed()) {
return return
} }
@ -745,7 +744,7 @@ class Site {
} }
if (!this.options.noScripts) { if (!this.options.noScripts) {
await sleep(1000) await sleep(this.options.fast ? 1000 : 3000)
} }
// page.on('console', (message) => this.log(message.text())) // page.on('console', (message) => this.log(message.text()))
@ -810,151 +809,170 @@ class Site {
let dom = [] let dom = []
if (html) { if (html) {
// Links await Promise.all([
links = !this.options.recursive (async () => {
? [] // Links
: await this.promiseTimeout( links = !this.options.recursive
? []
: await this.promiseTimeout(
(
await this.promiseTimeout(
page.evaluateHandle(() =>
Array.from(document.getElementsByTagName('a')).map(
({
hash,
hostname,
href,
pathname,
protocol,
rel,
}) => ({
hash,
hostname,
href,
pathname,
protocol,
rel,
})
)
),
{ jsonValue: () => [] },
'Timeout (links)'
)
).jsonValue(),
[],
'Timeout (links)'
)
})(),
(async () => {
// Text
text = await this.promiseTimeout(
( (
await this.promiseTimeout( await this.promiseTimeout(
page.evaluateHandle(() => page.evaluateHandle(
Array.from(document.getElementsByTagName('a')).map( () =>
({ hash, hostname, href, pathname, protocol, rel }) => ({ // eslint-disable-next-line unicorn/prefer-text-content
hash, document.body && document.body.innerText
hostname,
href,
pathname,
protocol,
rel,
})
)
), ),
{ jsonValue: () => [] }, { jsonValue: () => '' },
'Timeout (links)' 'Timeout (text)'
) )
).jsonValue(), ).jsonValue(),
[], '',
'Timeout (links)'
)
// Text
text = await this.promiseTimeout(
(
await this.promiseTimeout(
page.evaluateHandle(
() =>
// eslint-disable-next-line unicorn/prefer-text-content
document.body && document.body.innerText
),
{ jsonValue: () => '' },
'Timeout (text)' 'Timeout (text)'
) )
).jsonValue(), })(),
'', (async () => {
'Timeout (text)' // CSS
) css = await this.promiseTimeout(
(
// CSS await this.promiseTimeout(
css = await this.promiseTimeout( page.evaluateHandle((maxRows) => {
( const css = []
await this.promiseTimeout(
page.evaluateHandle((maxRows) => {
const css = []
try { try {
if (!document.styleSheets.length) { if (!document.styleSheets.length) {
return '' return ''
} }
for (const sheet of Array.from(document.styleSheets)) { for (const sheet of Array.from(document.styleSheets)) {
for (const rules of Array.from(sheet.cssRules)) { for (const rules of Array.from(sheet.cssRules)) {
css.push(rules.cssText) css.push(rules.cssText)
if (css.length >= maxRows) { if (css.length >= maxRows) {
break break
}
}
} }
} catch (error) {
return ''
} }
}
} catch (error) {
return ''
}
return css.join('\n') return css.join('\n')
}, this.options.htmlMaxRows), }, this.options.htmlMaxRows),
{ jsonValue: () => '' }, { jsonValue: () => '' },
'Timeout (css)'
)
).jsonValue(),
'',
'Timeout (css)' 'Timeout (css)'
) )
).jsonValue(), })(),
'', (async () => {
'Timeout (css)' // Script tags
) ;[scriptSrc, scripts] = await this.promiseTimeout(
(
await this.promiseTimeout(
page.evaluateHandle(() => {
const nodes = Array.from(
document.getElementsByTagName('script')
)
// Script tags return [
;[scriptSrc, scripts] = await this.promiseTimeout( nodes
( .filter(
await this.promiseTimeout( ({ src }) =>
page.evaluateHandle(() => { src && !src.startsWith('data:text/javascript;')
const nodes = Array.from( )
document.getElementsByTagName('script') .map(({ src }) => src),
nodes
.map((node) => node.textContent)
.filter((script) => script),
]
}),
{ jsonValue: () => [] },
'Timeout (scripts)'
) )
).jsonValue(),
return [ [],
nodes
.filter(
({ src }) =>
src && !src.startsWith('data:text/javascript;')
)
.map(({ src }) => src),
nodes
.map((node) => node.textContent)
.filter((script) => script),
]
}),
{ jsonValue: () => [] },
'Timeout (scripts)' 'Timeout (scripts)'
) )
).jsonValue(), })(),
[], (async () => {
'Timeout (scripts)' // Meta tags
) meta = await this.promiseTimeout(
(
// Meta tags await this.promiseTimeout(
meta = await this.promiseTimeout( page.evaluateHandle(() =>
( Array.from(document.querySelectorAll('meta')).reduce(
await this.promiseTimeout( (metas, meta) => {
page.evaluateHandle(() => const key =
Array.from(document.querySelectorAll('meta')).reduce( meta.getAttribute('name') ||
(metas, meta) => { meta.getAttribute('property')
const key =
meta.getAttribute('name') || meta.getAttribute('property') if (key) {
metas[key.toLowerCase()] =
if (key) { metas[key.toLowerCase()] || []
metas[key.toLowerCase()] = metas[key.toLowerCase()] || []
metas[key.toLowerCase()].push(
metas[key.toLowerCase()].push( meta.getAttribute('content')
meta.getAttribute('content') )
) }
}
return metas
return metas },
}, {}
{} )
),
{ jsonValue: () => [] },
'Timeout (meta)'
) )
), ).jsonValue(),
{ jsonValue: () => [] }, [],
'Timeout (meta)' 'Timeout (meta)'
) )
).jsonValue(), })(),
[], (async () => {
'Timeout (meta)' // JavaScript
) js = this.options.noScripts
? []
// JavaScript : await this.promiseTimeout(getJs(page), [], 'Timeout (js)')
js = this.options.noScripts })(),
? [] (async () => {
: await this.promiseTimeout(getJs(page), [], 'Timeout (js)') // DOM
dom = await this.promiseTimeout(getDom(page), [], 'Timeout (dom)')
// DOM })(),
dom = await this.promiseTimeout(getDom(page), [], 'Timeout (dom)') ])
} }
this.cache[url.href] = { this.cache[url.href] = {
@ -1037,7 +1055,9 @@ class Site {
} }
if (error.message.includes('net::ERR_NAME_NOT_RESOLVED')) { if (error.message.includes('net::ERR_NAME_NOT_RESOLVED')) {
const newError = new Error(`Hostname could not be resolved (${url})`) const newError = new Error(
`Hostname could not be resolved (${url.hostname})`
)
newError.code = 'WAPPALYZER_DNS_ERROR' newError.code = 'WAPPALYZER_DNS_ERROR'
@ -1253,7 +1273,9 @@ class Site {
}), }),
[], [],
'Timeout (dns)', 'Timeout (dns)',
Math.min(this.options.maxWait, 15000) this.options.fast
? Math.min(this.options.maxWait, 15000)
: this.options.maxWait
) )
} }
@ -1452,8 +1474,6 @@ class Site {
}) })
) )
this.destroyed = true
this.log('Site closed') this.log('Site closed')
} }
} }

@ -13,7 +13,7 @@
"software" "software"
], ],
"homepage": "https://www.wappalyzer.com/", "homepage": "https://www.wappalyzer.com/",
"version": "6.10.65", "version": "6.10.66",
"author": "Wappalyzer", "author": "Wappalyzer",
"license": "GPL-3.0", "license": "GPL-3.0",
"repository": { "repository": {

@ -4,7 +4,7 @@
"author": "Wappalyzer", "author": "Wappalyzer",
"homepage_url": "https://www.wappalyzer.com/", "homepage_url": "https://www.wappalyzer.com/",
"description": "Identify web technologies", "description": "Identify web technologies",
"version": "6.10.65", "version": "6.10.66",
"default_locale": "en", "default_locale": "en",
"manifest_version": 2, "manifest_version": 2,
"icons": { "icons": {

@ -4,7 +4,7 @@
"author": "Wappalyzer", "author": "Wappalyzer",
"homepage_url": "https://www.wappalyzer.com/", "homepage_url": "https://www.wappalyzer.com/",
"description": "Identify web technologies", "description": "Identify web technologies",
"version": "6.10.65", "version": "6.10.66",
"default_locale": "en", "default_locale": "en",
"manifest_version": 3, "manifest_version": 3,
"icons": { "icons": {

@ -13,7 +13,7 @@
"software" "software"
], ],
"homepage": "https://www.wappalyzer.com/", "homepage": "https://www.wappalyzer.com/",
"version": "6.10.65", "version": "6.10.66",
"author": "Wappalyzer", "author": "Wappalyzer",
"license": "GPL-3.0", "license": "GPL-3.0",
"repository": { "repository": {

@ -389,20 +389,6 @@
"scriptSrc": "cdn\\.blog\\.st-hatena\\.com/", "scriptSrc": "cdn\\.blog\\.st-hatena\\.com/",
"website": "https://hatenablog.com" "website": "https://hatenablog.com"
}, },
"Header Bidding Ai": {
"cats": [
36
],
"description": "Header Bidding Ai is a provider of an automated and managed header bidding solution. Header bidding is a cutting-edge technique where publishers offer their ad inventory to many ad exchanges.",
"icon": "Header Bidding Ai.svg",
"scriptSrc": "\\.headerbidding\\.ai/",
"saas": true,
"pricing": [
"poa",
"recurring"
],
"website": "https://headerbidding.ai"
},
"HeadJS": { "HeadJS": {
"cats": [ "cats": [
59 59
@ -415,6 +401,20 @@
"scriptSrc": "head\\.(?:core|load)(?:\\.min)?\\.js", "scriptSrc": "head\\.(?:core|load)(?:\\.min)?\\.js",
"website": "https://headjs.com" "website": "https://headjs.com"
}, },
"Header Bidding Ai": {
"cats": [
36
],
"description": "Header Bidding Ai is a provider of an automated and managed header bidding solution. Header bidding is a cutting-edge technique where publishers offer their ad inventory to many ad exchanges.",
"icon": "Header Bidding Ai.svg",
"pricing": [
"poa",
"recurring"
],
"saas": true,
"scriptSrc": "\\.headerbidding\\.ai/",
"website": "https://headerbidding.ai"
},
"Headless UI": { "Headless UI": {
"cats": [ "cats": [
66 66

@ -1955,22 +1955,6 @@
"scriptSrc": "widget\\.sezzle\\.(?:in|com)", "scriptSrc": "widget\\.sezzle\\.(?:in|com)",
"website": "https://sezzle.com/" "website": "https://sezzle.com/"
}, },
"shadcn/ui": {
"cats": [
66
],
"css": [
"--destructive-foreground"
],
"description": "shadcn/ui is a component system built with Radix UI and Tailwind CSS.",
"icon": "shadcn-ui.svg",
"oss": true,
"implies": [
"Radix UI",
"Tailwind CSS"
],
"website": "https://ui.shadcn.com"
},
"Shaka Player": { "Shaka Player": {
"cats": [ "cats": [
14 14
@ -6720,11 +6704,11 @@
], ],
"description": "Summernote is an open-source JavaScript library that offers a feature-rich WYSIWYG editor for web applications, allowing users to create and edit formatted content in a familiar word processor-like interface.", "description": "Summernote is an open-source JavaScript library that offers a feature-rich WYSIWYG editor for web applications, allowing users to create and edit formatted content in a familiar word processor-like interface.",
"icon": "Summernote.svg", "icon": "Summernote.svg",
"oss": true,
"scriptSrc": [ "scriptSrc": [
"/(?:S|s)ummernote(?:\\.min)?\\.js", "/(?:S|s)ummernote(?:\\.min)?\\.js",
"/summernote(?:@|-)([\\d\\.]+)/\\;version:\\1" "/summernote(?:@|-)([\\d\\.]+)/\\;version:\\1"
], ],
"oss": true,
"website": "https://summernote.org" "website": "https://summernote.org"
}, },
"Sumo": { "Sumo": {
@ -7310,6 +7294,22 @@
"scriptSrc": "scrollreveal(?:\\.min)(?:\\.js)", "scriptSrc": "scrollreveal(?:\\.min)(?:\\.js)",
"website": "https://scrollrevealjs.org" "website": "https://scrollrevealjs.org"
}, },
"shadcn/ui": {
"cats": [
66
],
"css": [
"--destructive-foreground"
],
"description": "shadcn/ui is a component system built with Radix UI and Tailwind CSS.",
"icon": "shadcn-ui.svg",
"implies": [
"Radix UI",
"Tailwind CSS"
],
"oss": true,
"website": "https://ui.shadcn.com"
},
"shine.js": { "shine.js": {
"cats": [ "cats": [
25 25