Build 5.4.3, NPM performance improvements, removed requestTimeout option

main
Elbert Alias 7 years ago
parent 3f42859e49
commit 77574d548c

@ -27,14 +27,13 @@ node index.js [url] [options]
### Options
```
--debug=0|1 Output debug messages.
--delay=ms Wait for ms milliseconds between requests.
--max-depth=num Don't analyze pages more than num levels deep.
--max-urls=num Exit when num URLs have been analyzed.
--max-wait=ms Wait no more than ms milliseconds for page resources to load.
--recursive=0|1 Follow links on pages (crawler).
--request-timeout=ms Wait no more than ms milliseconds for the page to load.
--user-agent=str Set the user agent string.
--debug=0|1 Output debug messages.
--delay=ms Wait for ms milliseconds between requests.
--max-depth=num Don't analyze pages more than num levels deep.
--max-urls=num Exit when num URLs have been analyzed.
--max-wait=ms Wait no more than ms milliseconds for page resources to load.
--recursive=0|1 Follow links on pages (crawler).
--user-agent=str Set the user agent string.
```
@ -46,9 +45,8 @@ const options = {
delay: 500,
maxDepth: 3,
maxUrls: 10,
maxWait: 1000,
maxWait: 5000,
recursive: true,
requestTimeout: 3000,
userAgent: 'Wappalyzer',
};

@ -17,9 +17,8 @@ class Driver {
delay: 500,
maxDepth: 3,
maxUrls: 10,
maxWait: 1000,
maxWait: 5000,
recursive: false,
requestTimeout: 3000,
userAgent: 'Mozilla/5.0 (compatible; Wappalyzer)',
}, options || {});
@ -29,7 +28,6 @@ class Driver {
this.options.maxUrls = parseInt(this.options.maxUrls, 10);
this.options.maxWait = parseInt(this.options.maxWait, 10);
this.options.recursive = Boolean(this.options.recursive);
this.options.requestTimeout = parseInt(this.options.requestTimeout, 10);
this.origPageUrl = url.parse(pageUrl);
this.analyzedPageUrls = [];
@ -61,8 +59,6 @@ class Driver {
}
displayApps(detected, meta) {
this.timer('displayApps');
this.meta = meta;
Object.keys(detected).forEach(appName => {
@ -92,95 +88,112 @@ class Driver {
}
fetch(pageUrl, index, depth) {
return new Promise(resolve => {
// Return when the URL is a duplicate or maxUrls has been reached
if ( this.analyzedPageUrls.indexOf(pageUrl.href) !== -1 || this.analyzedPageUrls.length >= this.options.maxUrls ) {
return resolve();
}
// Return when the URL is a duplicate or maxUrls has been reached
if ( this.analyzedPageUrls.indexOf(pageUrl.href) !== -1 || this.analyzedPageUrls.length >= this.options.maxUrls ) {
return Promise.resolve();
}
this.timer('fetch url: ' + pageUrl.href + '; depth: ' + depth + '; delay: ' + ( this.options.delay * index ) + 'ms');
this.analyzedPageUrls.push(pageUrl.href);
this.analyzedPageUrls.push(pageUrl.href);
const timerScope = {
last: new Date().getTime()
};
const browser = new Browser({
silent: true,
userAgent: this.options.userAgent,
waitDuration: this.options.maxWait,
});
this.timer('fetch; url: ' + pageUrl.href + '; depth: ' + depth + '; delay: ' + ( this.options.delay * index ) + 'ms', timerScope);
this.sleep(this.options.delay * index)
.then(() => {
this.timer('browser.visit start url: ' + pageUrl.href);
return new Promise(resolve => this.sleep(this.options.delay * index).then(() => this.visit(pageUrl, timerScope, resolve)));
}
browser.visit(pageUrl.href, this.options.requestTimeout, error => {
this.timer('browser.visit end url: ' + pageUrl.href);
visit(pageUrl, timerScope, resolve) {
const browser = new Browser({
silent: true,
userAgent: this.options.userAgent,
waitDuration: this.options.maxWait,
});
pageUrl.canonical = pageUrl.protocol + '//' + pageUrl.host + pageUrl.pathname;
this.timer('browser.visit start; url: ' + pageUrl.href, timerScope);
// Validate response
if ( !browser.resources['0'] || !browser.resources['0'].response ) {
this.wappalyzer.log('No response from server', 'browser', 'error');
browser.visit(pageUrl.href, () => {
this.timer('browser.visit end; url: ' + pageUrl.href, timerScope);
return resolve();
}
if ( !this.responseOk(browser, pageUrl) ) {
return resolve();
}
const headers = this.getHeaders(browser);
const headers = this.getHeaders(browser);
const html = this.getHtml(browser);
const scripts = this.getScripts(browser);
const js = this.getJs(browser);
// Validate content type
const contentType = headers.hasOwnProperty('content-type') ? headers['content-type'].shift() : null;
this.wappalyzer.analyze(pageUrl, {
headers,
html,
scripts,
js
});
if ( !contentType || !/\btext\/html\b/.test(contentType) ) {
this.wappalyzer.log('Skipping ' + pageUrl.href + ' of content type ' + contentType, 'driver');
const links = Array.from(browser.document.getElementsByTagName('a'))
.filter(link => link.hostname === this.origPageUrl.hostname)
.filter(link => extensions.test(link.pathname))
.map(link => { link.hash = ''; return url.parse(link.href) });
this.analyzedPageUrls.splice(this.analyzedPageUrls.indexOf(pageUrl.href), 1);
return resolve(links);
});
}
return resolve();
}
responseOk(browser, pageUrl) {
// Validate response
const resource = browser.resources.length ? browser.resources.filter(resource => resource.response).shift() : null;
// Validate document element
if ( !browser.document || !browser.document.documentElement ) {
this.wappalyzer.log('No HTML document at ' + pageUrl.href, 'driver', 'error');
if ( !resource ) {
this.wappalyzer.log('No response from server; url: ' + pageUrl.href, 'driver', 'error');
return resolve();
}
return false;
}
const html = this.getHtml(browser);
const scripts = this.getScripts(browser);
if ( resource.response.status !== 200 ) {
this.wappalyzer.log('Response was not OK; status: ' + resource.response.status + ' ' + resource.response.statusText + '; url: ' + pageUrl.href, 'driver', 'error');
const links = Array.from(browser.document.getElementsByTagName('a'))
.filter(link => link.hostname === this.origPageUrl.hostname)
.filter(link => extensions.test(link.pathname))
.map(link => { link.hash = ''; return url.parse(link.href) });
return false;
}
browser.wait(this.options.maxWait, () => {
this.timer('browser.wait end url: ' + pageUrl.href);
const headers = this.getHeaders(browser);
const js = this.getJs(browser);
// Validate content type
const contentType = headers.hasOwnProperty('content-type') ? headers['content-type'].shift() : null;
this.wappalyzer.analyze(pageUrl, {
headers,
html,
scripts,
js
});
if ( !contentType || !/\btext\/html\b/.test(contentType) ) {
this.wappalyzer.log('Skipping; url: ' + pageUrl.href + '; content type: ' + contentType, 'driver');
return resolve(links);
});
});
});
});
this.analyzedPageUrls.splice(this.analyzedPageUrls.indexOf(pageUrl.href), 1);
return false;
}
// Validate document
if ( !browser.document || !browser.document.documentElement ) {
this.wappalyzer.log('No HTML document; url: ' + pageUrl.href, 'driver', 'error');
return false;
}
return true;
}
getHeaders(browser) {
const headers = {};
browser.resources['0'].response.headers._headers.forEach(header => {
if ( !headers[header[0]] ){
headers[header[0]] = [];
}
const resource = browser.resources.length ? browser.resources.filter(resource => resource.response).shift() : null;
headers[header[0]].push(header[1]);
});
if ( resource ) {
resource.response.headers._headers.forEach(header => {
if ( !headers[header[0]] ){
headers[header[0]] = [];
}
headers[header[0]].push(header[1]);
});
}
return headers;
}
@ -244,6 +257,8 @@ class Driver {
}
crawl(pageUrl, index = 1, depth = 1) {
pageUrl.canonical = pageUrl.protocol + '//' + pageUrl.host + pageUrl.pathname;
return new Promise(resolve => {
this.fetch(pageUrl, index, depth)
.then(links => {
@ -267,14 +282,14 @@ class Driver {
return ms ? new Promise(resolve => setTimeout(resolve, ms)) : Promise.resolve();
}
timer(step) {
timer(message, scope) {
const time = new Date().getTime();
const sinceStart = ( Math.round(( time - this.time.start ) / 10) / 100) + 's';
const sinceLast = ( Math.round(( time - this.time.last ) / 10) / 100) + 's';
const sinceLast = ( Math.round(( time - scope.last ) / 10) / 100) + 's';
this.wappalyzer.log('[' + step + '] Time lapsed: ' + sinceLast + ' / ' + sinceStart, 'driver');
this.wappalyzer.log('[timer] ' + message + '; lapsed: ' + sinceLast + ' / ' + sinceStart, 'driver');
this.time.last = time;
scope.last = time;
}
};

@ -2,7 +2,7 @@
"name": "wappalyzer",
"description": "Uncovers the technologies used on websites",
"homepage": "https://github.com/AliasIO/Wappalyzer",
"version": "5.4.2",
"version": "5.4.3",
"author": "Elbert Alias",
"license": "GPL-3.0",
"repository": {

@ -4,7 +4,7 @@
"author": "Elbert Alias",
"homepage_url": "https://www.wappalyzer.com",
"description": "Identify web technologies",
"version": "5.4.2",
"version": "5.4.3",
"default_locale": "en",
"manifest_version": 2,
"icons": {

Loading…
Cancel
Save