Add --fast option to NPM driver

main
Elbert Alias 1 year ago
parent 6fb1baedde
commit 4582061e8d

@ -13,6 +13,7 @@ const aliases = {
a: 'userAgent', a: 'userAgent',
b: 'batchSize', b: 'batchSize',
d: 'debug', d: 'debug',
f: 'fast',
t: 'delay', t: 'delay',
h: 'help', h: 'help',
H: 'header', H: 'header',
@ -74,6 +75,7 @@ Examples:
Options: Options:
-b, --batch-size=... Process links in batches -b, --batch-size=... Process links in batches
-d, --debug Output debug messages -d, --debug Output debug messages
-f, --fast Prioritise speed over accuracy
-t, --delay=ms Wait for ms milliseconds between requests -t, --delay=ms Wait for ms milliseconds between requests
-h, --help This text -h, --help This text
-H, --header Extra header to send with requests -H, --header Extra header to send with requests

@ -283,7 +283,11 @@ function analyzeDom(dom, technologies = Wappalyzer.technologies) {
} }
function get(url, options = {}) { function get(url, options = {}) {
const timeout = options.timeout || 10000 const timeout =
options.timeout ||
(this.options.fast
? this.Math.min(this.options.maxWait, 3000)
: this.options.maxWait)
if (['http:', 'https:'].includes(url.protocol)) { if (['http:', 'https:'].includes(url.protocol)) {
const { get } = url.protocol === 'http:' ? http : https const { get } = url.protocol === 'http:' ? http : https
@ -314,7 +318,7 @@ function get(url, options = {}) {
} }
) )
.setTimeout(timeout, () => .setTimeout(timeout, () =>
reject(new Error(`Timeout (${url.href}, ${timeout}ms)`)) reject(new Error(`Timeout (${url}, ${timeout}ms)`))
) )
.on('error', (error) => reject(new Error(error.message))) .on('error', (error) => reject(new Error(error.message)))
) )
@ -345,6 +349,7 @@ class Driver {
} }
this.options.debug = Boolean(+this.options.debug) this.options.debug = Boolean(+this.options.debug)
this.options.fast = Boolean(+this.options.fast)
this.options.recursive = Boolean(+this.options.recursive) this.options.recursive = Boolean(+this.options.recursive)
this.options.probe = this.options.probe =
String(this.options.probe || '').toLowerCase() === 'basic' String(this.options.probe || '').toLowerCase() === 'basic'
@ -369,7 +374,7 @@ class Driver {
} }
async init() { async init() {
for (let attempt = 1; attempt <= 3; attempt++) { for (let attempt = 1; attempt <= 2; attempt++) {
this.log(`Launching browser (attempt ${attempt})...`) this.log(`Launching browser (attempt ${attempt})...`)
try { try {
@ -385,7 +390,9 @@ class Driver {
acceptInsecureCerts: true, acceptInsecureCerts: true,
args: chromiumArgs, args: chromiumArgs,
executablePath: CHROMIUM_BIN, executablePath: CHROMIUM_BIN,
timeout: 5000, timeout: this.options.fast
? Math.min(this.options.maxWait, 10000)
: this.options.maxWait,
}) })
} }
@ -393,28 +400,20 @@ class Driver {
} catch (error) { } catch (error) {
this.log(error) this.log(error)
if (attempt >= 3) { if (attempt >= 2) {
throw new Error(error.message || error.toString()) throw new Error(error.message || error.toString())
} }
} }
} }
this.browser.on('disconnected', async () => { this.browser.on('disconnected', () => {
this.log('Browser disconnected') this.browser = undefined
if (!this.destroyed) { this.log('Browser disconnected')
try {
await this.init()
} catch (error) {
this.log(error)
}
}
}) })
} }
async destroy() { async destroy() {
this.destroyed = true
if (this.browser) { if (this.browser) {
try { try {
await sleep(1) await sleep(1)
@ -507,8 +506,6 @@ class Site {
this.cache = {} this.cache = {}
this.probed = false this.probed = false
this.destroyed = false
} }
log(message, source = 'driver', type = 'log') { log(message, source = 'driver', type = 'log') {
@ -544,7 +541,9 @@ class Site {
promise, promise,
fallback, fallback,
errorMessage = 'Operation took too long to complete', errorMessage = 'Operation took too long to complete',
maxWait = Math.min(this.options.maxWait, 3000) maxWait = this.options.fast
? Math.min(this.options.maxWait, 2000)
: this.options.maxWait
) { ) {
let timeout = null let timeout = null
@ -579,10 +578,6 @@ class Site {
} }
async goto(url) { async goto(url) {
if (this.destroyed) {
return
}
// Return when the URL is a duplicate or maxUrls has been reached // Return when the URL is a duplicate or maxUrls has been reached
if (this.analyzedUrls[url.href]) { if (this.analyzedUrls[url.href]) {
return [] return []
@ -640,14 +635,18 @@ class Site {
) { ) {
request.abort('blockedbyclient') request.abort('blockedbyclient')
} else { } else {
await this.emit('request', { page, request })
if (Object.keys(this.options.headers).length) {
const headers = { const headers = {
...request.headers(), ...request.headers(),
...this.options.headers, ...this.options.headers,
} }
await this.emit('request', { page, request })
request.continue({ headers }) request.continue({ headers })
} else {
request.continue()
}
} }
} catch (error) { } catch (error) {
error.message += ` (${url})` error.message += ` (${url})`
@ -657,7 +656,7 @@ class Site {
}) })
page.on('response', async (response) => { page.on('response', async (response) => {
if (this.destroyed || !page || page.__closed || page.isClosed()) { if (!page || page.__closed || page.isClosed()) {
return return
} }
@ -745,7 +744,7 @@ class Site {
} }
if (!this.options.noScripts) { if (!this.options.noScripts) {
await sleep(1000) await sleep(this.options.fast ? 1000 : 3000)
} }
// page.on('console', (message) => this.log(message.text())) // page.on('console', (message) => this.log(message.text()))
@ -810,6 +809,8 @@ class Site {
let dom = [] let dom = []
if (html) { if (html) {
await Promise.all([
(async () => {
// Links // Links
links = !this.options.recursive links = !this.options.recursive
? [] ? []
@ -818,7 +819,14 @@ class Site {
await this.promiseTimeout( await this.promiseTimeout(
page.evaluateHandle(() => page.evaluateHandle(() =>
Array.from(document.getElementsByTagName('a')).map( Array.from(document.getElementsByTagName('a')).map(
({ hash, hostname, href, pathname, protocol, rel }) => ({ ({
hash,
hostname,
href,
pathname,
protocol,
rel,
}) => ({
hash, hash,
hostname, hostname,
href, href,
@ -835,7 +843,8 @@ class Site {
[], [],
'Timeout (links)' 'Timeout (links)'
) )
})(),
(async () => {
// Text // Text
text = await this.promiseTimeout( text = await this.promiseTimeout(
( (
@ -852,7 +861,8 @@ class Site {
'', '',
'Timeout (text)' 'Timeout (text)'
) )
})(),
(async () => {
// CSS // CSS
css = await this.promiseTimeout( css = await this.promiseTimeout(
( (
@ -887,7 +897,8 @@ class Site {
'', '',
'Timeout (css)' 'Timeout (css)'
) )
})(),
(async () => {
// Script tags // Script tags
;[scriptSrc, scripts] = await this.promiseTimeout( ;[scriptSrc, scripts] = await this.promiseTimeout(
( (
@ -916,7 +927,8 @@ class Site {
[], [],
'Timeout (scripts)' 'Timeout (scripts)'
) )
})(),
(async () => {
// Meta tags // Meta tags
meta = await this.promiseTimeout( meta = await this.promiseTimeout(
( (
@ -925,10 +937,12 @@ class Site {
Array.from(document.querySelectorAll('meta')).reduce( Array.from(document.querySelectorAll('meta')).reduce(
(metas, meta) => { (metas, meta) => {
const key = const key =
meta.getAttribute('name') || meta.getAttribute('property') meta.getAttribute('name') ||
meta.getAttribute('property')
if (key) { if (key) {
metas[key.toLowerCase()] = metas[key.toLowerCase()] || [] metas[key.toLowerCase()] =
metas[key.toLowerCase()] || []
metas[key.toLowerCase()].push( metas[key.toLowerCase()].push(
meta.getAttribute('content') meta.getAttribute('content')
@ -947,14 +961,18 @@ class Site {
[], [],
'Timeout (meta)' 'Timeout (meta)'
) )
})(),
(async () => {
// JavaScript // JavaScript
js = this.options.noScripts js = this.options.noScripts
? [] ? []
: await this.promiseTimeout(getJs(page), [], 'Timeout (js)') : await this.promiseTimeout(getJs(page), [], 'Timeout (js)')
})(),
(async () => {
// DOM // DOM
dom = await this.promiseTimeout(getDom(page), [], 'Timeout (dom)') dom = await this.promiseTimeout(getDom(page), [], 'Timeout (dom)')
})(),
])
} }
this.cache[url.href] = { this.cache[url.href] = {
@ -1037,7 +1055,9 @@ class Site {
} }
if (error.message.includes('net::ERR_NAME_NOT_RESOLVED')) { if (error.message.includes('net::ERR_NAME_NOT_RESOLVED')) {
const newError = new Error(`Hostname could not be resolved (${url})`) const newError = new Error(
`Hostname could not be resolved (${url.hostname})`
)
newError.code = 'WAPPALYZER_DNS_ERROR' newError.code = 'WAPPALYZER_DNS_ERROR'
@ -1253,7 +1273,9 @@ class Site {
}), }),
[], [],
'Timeout (dns)', 'Timeout (dns)',
Math.min(this.options.maxWait, 15000) this.options.fast
? Math.min(this.options.maxWait, 15000)
: this.options.maxWait
) )
} }
@ -1452,8 +1474,6 @@ class Site {
}) })
) )
this.destroyed = true
this.log('Site closed') this.log('Site closed')
} }
} }

@ -13,7 +13,7 @@
"software" "software"
], ],
"homepage": "https://www.wappalyzer.com/", "homepage": "https://www.wappalyzer.com/",
"version": "6.10.65", "version": "6.10.66",
"author": "Wappalyzer", "author": "Wappalyzer",
"license": "GPL-3.0", "license": "GPL-3.0",
"repository": { "repository": {

@ -4,7 +4,7 @@
"author": "Wappalyzer", "author": "Wappalyzer",
"homepage_url": "https://www.wappalyzer.com/", "homepage_url": "https://www.wappalyzer.com/",
"description": "Identify web technologies", "description": "Identify web technologies",
"version": "6.10.65", "version": "6.10.66",
"default_locale": "en", "default_locale": "en",
"manifest_version": 2, "manifest_version": 2,
"icons": { "icons": {

@ -4,7 +4,7 @@
"author": "Wappalyzer", "author": "Wappalyzer",
"homepage_url": "https://www.wappalyzer.com/", "homepage_url": "https://www.wappalyzer.com/",
"description": "Identify web technologies", "description": "Identify web technologies",
"version": "6.10.65", "version": "6.10.66",
"default_locale": "en", "default_locale": "en",
"manifest_version": 3, "manifest_version": 3,
"icons": { "icons": {

@ -13,7 +13,7 @@
"software" "software"
], ],
"homepage": "https://www.wappalyzer.com/", "homepage": "https://www.wappalyzer.com/",
"version": "6.10.65", "version": "6.10.66",
"author": "Wappalyzer", "author": "Wappalyzer",
"license": "GPL-3.0", "license": "GPL-3.0",
"repository": { "repository": {

@ -389,20 +389,6 @@
"scriptSrc": "cdn\\.blog\\.st-hatena\\.com/", "scriptSrc": "cdn\\.blog\\.st-hatena\\.com/",
"website": "https://hatenablog.com" "website": "https://hatenablog.com"
}, },
"Header Bidding Ai": {
"cats": [
36
],
"description": "Header Bidding Ai is a provider of an automated and managed header bidding solution. Header bidding cutting-edge technique where publishers offer their ad inventory to many ad exchanges.",
"icon": "Header Bidding Ai.svg",
"scriptSrc": "\\.headerbidding\\.ai/",
"saas": true,
"pricing": [
"poa",
"recurring"
],
"website": "https://headerbidding.ai"
},
"HeadJS": { "HeadJS": {
"cats": [ "cats": [
59 59
@ -415,6 +401,20 @@
"scriptSrc": "head\\.(?:core|load)(?:\\.min)?\\.js", "scriptSrc": "head\\.(?:core|load)(?:\\.min)?\\.js",
"website": "https://headjs.com" "website": "https://headjs.com"
}, },
"Header Bidding Ai": {
"cats": [
36
],
"description": "Header Bidding Ai is a provider of an automated and managed header bidding solution. Header bidding is a cutting-edge technique where publishers offer their ad inventory to many ad exchanges.",
"icon": "Header Bidding Ai.svg",
"pricing": [
"poa",
"recurring"
],
"saas": true,
"scriptSrc": "\\.headerbidding\\.ai/",
"website": "https://headerbidding.ai"
},
"Headless UI": { "Headless UI": {
"cats": [ "cats": [
66 66

@ -1955,22 +1955,6 @@
"scriptSrc": "widget\\.sezzle\\.(?:in|com)", "scriptSrc": "widget\\.sezzle\\.(?:in|com)",
"website": "https://sezzle.com/" "website": "https://sezzle.com/"
}, },
"shadcn/ui": {
"cats": [
66
],
"css": [
"--destructive-foreground"
],
"description": "shadcn/ui is a component system built with Radix UI and Tailwind CSS.",
"icon": "shadcn-ui.svg",
"oss": true,
"implies": [
"Radix UI",
"Tailwind CSS"
],
"website": "https://ui.shadcn.com"
},
"Shaka Player": { "Shaka Player": {
"cats": [ "cats": [
14 14
@ -6720,11 +6704,11 @@
], ],
"description": "Summernote is an open-source JavaScript library that offers a feature-rich WYSIWYG editor for web applications, allowing users to create and edit formatted content in a familiar word processor-like interface.", "description": "Summernote is an open-source JavaScript library that offers a feature-rich WYSIWYG editor for web applications, allowing users to create and edit formatted content in a familiar word processor-like interface.",
"icon": "Summernote.svg", "icon": "Summernote.svg",
"oss": true,
"scriptSrc": [ "scriptSrc": [
"/(?:S|s)ummernote(?:\\.min)?\\.js", "/(?:S|s)ummernote(?:\\.min)?\\.js",
"/summernote(?:@|-)([\\d\\.]+)/\\;version:\\1" "/summernote(?:@|-)([\\d\\.]+)/\\;version:\\1"
], ],
"oss": true,
"website": "https://summernote.org" "website": "https://summernote.org"
}, },
"Sumo": { "Sumo": {
@ -7310,6 +7294,22 @@
"scriptSrc": "scrollreveal(?:\\.min)(?:\\.js)", "scriptSrc": "scrollreveal(?:\\.min)(?:\\.js)",
"website": "https://scrollrevealjs.org" "website": "https://scrollrevealjs.org"
}, },
"shadcn/ui": {
"cats": [
66
],
"css": [
"--destructive-foreground"
],
"description": "shadcn/ui is a component system built with Radix UI and Tailwind CSS.",
"icon": "shadcn-ui.svg",
"implies": [
"Radix UI",
"Tailwind CSS"
],
"oss": true,
"website": "https://ui.shadcn.com"
},
"shine.js": { "shine.js": {
"cats": [ "cats": [
25 25