From 3cb35624ee74a865920f866f230b9bad24a23019 Mon Sep 17 00:00:00 2001 From: Elbert Alias Date: Fri, 29 Dec 2017 10:49:52 +1100 Subject: [PATCH] NPM driver: skip non-HTML pages, return analyzed URLs --- src/drivers/npm/driver.js | 33 ++++++++++++++++++++++----------- 1 file changed, 22 insertions(+), 11 deletions(-) diff --git a/src/drivers/npm/driver.js b/src/drivers/npm/driver.js index 7d1d1cea9..ac13a8b41 100644 --- a/src/drivers/npm/driver.js +++ b/src/drivers/npm/driver.js @@ -8,6 +8,8 @@ const Browser = require('zombie'); const json = JSON.parse(fs.readFileSync(__dirname + '/apps.json')); +const extensions = /^([^.]+$|\.(asp|aspx|cgi|htm|html|jsp|php)$)/; + class Driver { constructor(pageUrl, options) { this.options = Object.assign({}, { @@ -90,17 +92,15 @@ class Driver { } fetch(pageUrl, index, depth) { - this.timer('fetch'); - return new Promise(resolve => { // Return when the URL is a duplicate or maxUrls has been reached if ( this.analyzedPageUrls.indexOf(pageUrl.href) !== -1 || this.analyzedPageUrls.length >= this.options.maxUrls ) { return resolve(); } - this.analyzedPageUrls.push(pageUrl.href); + this.timer('fetch url: ' + pageUrl.pathname + '; depth: ' + depth + '; delay: ' + ( this.options.delay * index ) + 'ms'); - this.wappalyzer.log('depth: ' + depth + '; delay: ' + ( this.options.delay * index ) + 'ms; url: ' + pageUrl.href, 'driver'); + this.analyzedPageUrls.push(pageUrl.href); const browser = new Browser({ silent: true, @@ -110,10 +110,12 @@ class Driver { this.sleep(this.options.delay * index) .then(() => { - this.timer('browser.visit start'); + this.timer('browser.visit start url: ' + pageUrl.pathname); browser.visit(pageUrl.href, this.options.requestTimeout, error => { - this.timer('browser.visit end'); + this.timer('browser.visit end url: ' + pageUrl.pathname); + + pageUrl.canonical = pageUrl.protocol + '//' + pageUrl.host + pageUrl.pathname; if ( !browser.resources['0'] || !browser.resources['0'].response ) { this.wappalyzer.log('No response from server', 'browser', 'error'); @@ -122,9 +124,20 @@ class Driver { } browser.wait(this.options.maxWait, () => { - this.timer('browser.wait end'); + this.timer('browser.wait end url: ' + pageUrl.pathname); const headers = this.getHeaders(browser); + + const contentType = headers.hasOwnProperty('content-type') ? headers['content-type'].shift() : null; + + if ( !contentType || !/\btext\/html\b/.test(contentType) ) { + this.wappalyzer.log('Skipping ' + pageUrl.pathname + ' of content type ' + contentType, 'driver'); + + this.analyzedPageUrls.splice(this.analyzedPageUrls.indexOf(pageUrl.href), 1); + + return resolve(); + } + const html = this.getHtml(browser); const scripts = this.getScripts(browser); const js = this.getJs(browser); @@ -210,14 +223,13 @@ class Driver { } crawl(pageUrl, index = 1, depth = 1) { - this.timer('crawl'); - return new Promise(resolve => { this.fetch(pageUrl, index, depth) .then(links => { if ( links && Boolean(this.options.recursive) && depth < this.options.maxDepth ) { links = Array.from(links) .filter(link => link.hostname === this.origPageUrl.hostname) + .filter(link => extensions.test(link.pathname)) .map(link => { link.hash = ''; return link }); return Promise.all(links.map((link, index) => this.crawl(link, index + 1, depth + 1))); @@ -226,9 +238,8 @@ class Driver { } }) .then(() => { - this.timer('done'); - resolve({ + urls: this.analyzedPageUrls, applications: this.apps, meta: this.meta });