Improved error handling in NPM driver with puppeteer

main
Elbert Alias 6 years ago
parent 5a855a7cfe
commit 4a4e3c1ced

@ -118,7 +118,7 @@ class PuppeteerBrowser extends Browser {
this.html = await page.content(); this.html = await page.content();
} catch (error) { } catch (error) {
throw new Error(error.message); throw new Error(error.toString());
} }
await page.close(); await page.close();

@ -173,13 +173,13 @@ class Driver {
}); });
} }
fetch(pageUrl, index, depth) { async fetch(pageUrl, index, depth) {
// Return when the URL is a duplicate or maxUrls has been reached // Return when the URL is a duplicate or maxUrls has been reached
if ( if (
this.analyzedPageUrls[pageUrl.href] this.analyzedPageUrls[pageUrl.href]
|| this.analyzedPageUrls.length >= this.options.maxUrls || this.analyzedPageUrls.length >= this.options.maxUrls
) { ) {
return Promise.resolve(); return;
} }
this.analyzedPageUrls[pageUrl.href] = { this.analyzedPageUrls[pageUrl.href] = {
@ -192,21 +192,29 @@ class Driver {
this.timer(`fetch; url: ${pageUrl.href}; depth: ${depth}; delay: ${this.options.delay * index}ms`, timerScope); this.timer(`fetch; url: ${pageUrl.href}; depth: ${depth}; delay: ${this.options.delay * index}ms`, timerScope);
return new Promise(async (resolve, reject) => { await sleep(this.options.delay * index);
await sleep(this.options.delay * index);
this.visit(pageUrl, timerScope, resolve, reject); try {
}); await this.visit(pageUrl, timerScope);
} catch (error) {
throw new Error(error.message);
}
} }
async visit(pageUrl, timerScope, resolve, reject) { async visit(pageUrl, timerScope) {
const browser = new this.Browser(this.options); const browser = new this.Browser(this.options);
browser.log = (message, type) => this.wappalyzer.log(message, 'browser', type); browser.log = (message, type) => this.wappalyzer.log(message, 'browser', type);
this.timer(`visit start; url: ${pageUrl.href}`, timerScope); this.timer(`visit start; url: ${pageUrl.href}`, timerScope);
await browser.visit(pageUrl.href); try {
await browser.visit(pageUrl.href);
} catch (error) {
this.wappalyzer.log(error.message, 'browser', 'error');
throw new Error('RESPONSE_NOT_OK');
}
this.timer(`visit end; url: ${pageUrl.href}`, timerScope); this.timer(`visit end; url: ${pageUrl.href}`, timerScope);
@ -214,11 +222,11 @@ class Driver {
// Validate response // Validate response
if (!browser.statusCode) { if (!browser.statusCode) {
return reject(new Error('NO_RESPONSE')); throw new Error('NO_RESPONSE');
} }
if (browser.statusCode !== 200) { if (browser.statusCode !== 200) {
return reject(new Error('RESPONSE_NOT_OK')); throw new Error('RESPONSE_NOT_OK');
} }
if (!browser.contentType || !/\btext\/html\b/.test(browser.contentType)) { if (!browser.contentType || !/\btext\/html\b/.test(browser.contentType)) {
@ -262,55 +270,49 @@ class Driver {
this.emit('visit', { browser, pageUrl }); this.emit('visit', { browser, pageUrl });
return resolve(reducedLinks); return reducedLinks;
} }
crawl(pageUrl, index = 1, depth = 1) { async crawl(pageUrl, index = 1, depth = 1) {
pageUrl.canonical = `${pageUrl.protocol}//${pageUrl.host}${pageUrl.pathname}`; pageUrl.canonical = `${pageUrl.protocol}//${pageUrl.host}${pageUrl.pathname}`;
return new Promise(async (resolve) => { let links;
let links;
try { try {
links = await this.fetch(pageUrl, index, depth); links = await this.fetch(pageUrl, index, depth);
} catch (error) { } catch (error) {
const type = error.message && errorTypes[error.message] ? error.message : 'UNKNOWN_ERROR'; const type = error.message && errorTypes[error.message] ? error.message : 'UNKNOWN_ERROR';
const message = error.message && errorTypes[error.message] ? errorTypes[error.message] : 'Unknown error'; const message = error.message && errorTypes[error.message] ? errorTypes[error.message] : 'Unknown error';
this.analyzedPageUrls[pageUrl.href].error = { this.analyzedPageUrls[pageUrl.href].error = {
type, type,
message, message,
}; };
this.wappalyzer.log(`${message}; url: ${pageUrl.href}`, 'driver', 'error'); this.wappalyzer.log(`${message}; url: ${pageUrl.href}`, 'driver', 'error');
} }
if (links && this.options.recursive && depth < this.options.maxDepth) { if (links && this.options.recursive && depth < this.options.maxDepth) {
await this.chunk(links.slice(0, this.options.maxUrls), depth + 1); await this.chunk(links.slice(0, this.options.maxUrls), depth + 1);
} }
return resolve({ return {
urls: this.analyzedPageUrls, urls: this.analyzedPageUrls,
applications: this.apps, applications: this.apps,
meta: this.meta, meta: this.meta,
}); };
});
} }
chunk(links, depth, chunk = 0) { async chunk(links, depth, chunk = 0) {
if (links.length === 0) { if (links.length === 0) {
return Promise.resolve(); return;
} }
const chunked = links.splice(0, this.options.chunkSize); const chunked = links.splice(0, this.options.chunkSize);
return new Promise(async (resolve) => { await Promise.all(chunked.map((link, index) => this.crawl(link, index, depth)));
await Promise.all(chunked.map((link, index) => this.crawl(link, index, depth)));
await this.chunk(links, depth, chunk + 1);
resolve(); await this.chunk(links, depth, chunk + 1);
});
} }
timer(message, scope) { timer(message, scope) {

@ -2,7 +2,7 @@
"name": "wappalyzer", "name": "wappalyzer",
"description": "Uncovers the technologies used on websites", "description": "Uncovers the technologies used on websites",
"homepage": "https://github.com/AliasIO/Wappalyzer", "homepage": "https://github.com/AliasIO/Wappalyzer",
"version": "5.9.1", "version": "5.9.4",
"author": "Elbert Alias", "author": "Elbert Alias",
"license": "GPL-3.0", "license": "GPL-3.0",
"repository": { "repository": {

@ -4,7 +4,7 @@
"author": "Elbert Alias", "author": "Elbert Alias",
"homepage_url": "https://www.wappalyzer.com", "homepage_url": "https://www.wappalyzer.com",
"description": "Identify web technologies", "description": "Identify web technologies",
"version": "5.9.1", "version": "5.9.3",
"default_locale": "en", "default_locale": "en",
"manifest_version": 2, "manifest_version": 2,
"icons": { "icons": {