From 4a4e3c1cedadcca60c39c77c15591a0b0cdfdfdd Mon Sep 17 00:00:00 2001 From: Elbert Alias <77259+AliasIO@users.noreply.github.com> Date: Wed, 8 Jan 2020 10:28:56 +1100 Subject: [PATCH] Improved error handling in NPM driver with puppeteer --- src/drivers/npm/browsers/puppeteer.js | 2 +- src/drivers/npm/driver.js | 86 +++++++++++++------------- src/drivers/npm/package.json | 2 +- src/drivers/webextension/manifest.json | 2 +- 4 files changed, 47 insertions(+), 45 deletions(-) diff --git a/src/drivers/npm/browsers/puppeteer.js b/src/drivers/npm/browsers/puppeteer.js index 770d91e54..fba073d49 100644 --- a/src/drivers/npm/browsers/puppeteer.js +++ b/src/drivers/npm/browsers/puppeteer.js @@ -118,7 +118,7 @@ class PuppeteerBrowser extends Browser { this.html = await page.content(); } catch (error) { - throw new Error(error.message); + throw new Error(error.toString()); } await page.close(); diff --git a/src/drivers/npm/driver.js b/src/drivers/npm/driver.js index b75b6cad9..29b82de68 100644 --- a/src/drivers/npm/driver.js +++ b/src/drivers/npm/driver.js @@ -173,13 +173,13 @@ class Driver { }); } - fetch(pageUrl, index, depth) { + async fetch(pageUrl, index, depth) { // Return when the URL is a duplicate or maxUrls has been reached if ( this.analyzedPageUrls[pageUrl.href] || this.analyzedPageUrls.length >= this.options.maxUrls ) { - return Promise.resolve(); + return; } this.analyzedPageUrls[pageUrl.href] = { @@ -192,21 +192,29 @@ class Driver { this.timer(`fetch; url: ${pageUrl.href}; depth: ${depth}; delay: ${this.options.delay * index}ms`, timerScope); - return new Promise(async (resolve, reject) => { - await sleep(this.options.delay * index); + await sleep(this.options.delay * index); - this.visit(pageUrl, timerScope, resolve, reject); - }); + try { + await this.visit(pageUrl, timerScope); + } catch (error) { + throw new Error(error.message); + } } - async visit(pageUrl, timerScope, resolve, reject) { + async visit(pageUrl, timerScope) { const browser = new this.Browser(this.options); browser.log = (message, type) => this.wappalyzer.log(message, 'browser', type); this.timer(`visit start; url: ${pageUrl.href}`, timerScope); - await browser.visit(pageUrl.href); + try { + await browser.visit(pageUrl.href); + } catch (error) { + this.wappalyzer.log(error.message, 'browser', 'error'); + + throw new Error('RESPONSE_NOT_OK'); + } this.timer(`visit end; url: ${pageUrl.href}`, timerScope); @@ -214,11 +222,11 @@ class Driver { // Validate response if (!browser.statusCode) { - return reject(new Error('NO_RESPONSE')); + throw new Error('NO_RESPONSE'); } if (browser.statusCode !== 200) { - return reject(new Error('RESPONSE_NOT_OK')); + throw new Error('RESPONSE_NOT_OK'); } if (!browser.contentType || !/\btext\/html\b/.test(browser.contentType)) { @@ -262,55 +270,49 @@ class Driver { this.emit('visit', { browser, pageUrl }); - return resolve(reducedLinks); + return reducedLinks; } - crawl(pageUrl, index = 1, depth = 1) { + async crawl(pageUrl, index = 1, depth = 1) { pageUrl.canonical = `${pageUrl.protocol}//${pageUrl.host}${pageUrl.pathname}`; - return new Promise(async (resolve) => { - let links; + let links; - try { - links = await this.fetch(pageUrl, index, depth); - } catch (error) { - const type = error.message && errorTypes[error.message] ? error.message : 'UNKNOWN_ERROR'; - const message = error.message && errorTypes[error.message] ? errorTypes[error.message] : 'Unknown error'; + try { + links = await this.fetch(pageUrl, index, depth); + } catch (error) { + const type = error.message && errorTypes[error.message] ? error.message : 'UNKNOWN_ERROR'; + const message = error.message && errorTypes[error.message] ? errorTypes[error.message] : 'Unknown error'; - this.analyzedPageUrls[pageUrl.href].error = { - type, - message, - }; + this.analyzedPageUrls[pageUrl.href].error = { + type, + message, + }; - this.wappalyzer.log(`${message}; url: ${pageUrl.href}`, 'driver', 'error'); - } + this.wappalyzer.log(`${message}; url: ${pageUrl.href}`, 'driver', 'error'); + } - if (links && this.options.recursive && depth < this.options.maxDepth) { - await this.chunk(links.slice(0, this.options.maxUrls), depth + 1); - } + if (links && this.options.recursive && depth < this.options.maxDepth) { + await this.chunk(links.slice(0, this.options.maxUrls), depth + 1); + } - return resolve({ - urls: this.analyzedPageUrls, - applications: this.apps, - meta: this.meta, - }); - }); + return { + urls: this.analyzedPageUrls, + applications: this.apps, + meta: this.meta, + }; } - chunk(links, depth, chunk = 0) { + async chunk(links, depth, chunk = 0) { if (links.length === 0) { - return Promise.resolve(); + return; } const chunked = links.splice(0, this.options.chunkSize); - return new Promise(async (resolve) => { - await Promise.all(chunked.map((link, index) => this.crawl(link, index, depth))); - - await this.chunk(links, depth, chunk + 1); + await Promise.all(chunked.map((link, index) => this.crawl(link, index, depth))); - resolve(); - }); + await this.chunk(links, depth, chunk + 1); } timer(message, scope) { diff --git a/src/drivers/npm/package.json b/src/drivers/npm/package.json index c3b388a75..371ee27da 100644 --- a/src/drivers/npm/package.json +++ b/src/drivers/npm/package.json @@ -2,7 +2,7 @@ "name": "wappalyzer", "description": "Uncovers the technologies used on websites", "homepage": "https://github.com/AliasIO/Wappalyzer", - "version": "5.9.1", + "version": "5.9.4", "author": "Elbert Alias", "license": "GPL-3.0", "repository": { diff --git a/src/drivers/webextension/manifest.json b/src/drivers/webextension/manifest.json index 5aaee78e3..ba422fd0f 100644 --- a/src/drivers/webextension/manifest.json +++ b/src/drivers/webextension/manifest.json @@ -4,7 +4,7 @@ "author": "Elbert Alias", "homepage_url": "https://www.wappalyzer.com", "description": "Identify web technologies", - "version": "5.9.1", + "version": "5.9.3", "default_locale": "en", "manifest_version": 2, "icons": {