From 2cb284594fc2e7c939ad00be5740f71c684e0a70 Mon Sep 17 00:00:00 2001 From: Elbert Alias <77259+AliasIO@users.noreply.github.com> Date: Sat, 8 Dec 2018 15:32:58 +1100 Subject: [PATCH] Add the ability to use a different headless browser in NPM driver --- src/apps.json | 2 +- src/drivers/npm/browser.js | 20 ++ src/drivers/npm/browsers/zombie copy.js | 114 +++++++++++ src/drivers/npm/browsers/zombie.js | 119 ++++++++++++ src/drivers/npm/driver.js | 248 ++++++++---------------- src/drivers/npm/index.js | 7 +- src/drivers/npm/npm-shrinkwrap.json | 2 +- src/drivers/npm/package.json | 2 +- src/drivers/webextension/manifest.json | 2 +- src/wappalyzer.js | 85 ++++---- 10 files changed, 379 insertions(+), 222 deletions(-) create mode 100644 src/drivers/npm/browser.js create mode 100644 src/drivers/npm/browsers/zombie copy.js create mode 100644 src/drivers/npm/browsers/zombie.js diff --git a/src/apps.json b/src/apps.json index d5fc00f67..269daf989 100644 --- a/src/apps.json +++ b/src/apps.json @@ -2695,7 +2695,7 @@ "X-Drupal-Cache": "", "X-Generator": "^Drupal(?:\\s([\\d.]+))?\\;version:\\1" }, - "html": "<(?:link|style)[^>]+sites/(?:default|all)/(?:themes|modules)/", + "html": "<(?:link|style)[^>]+\"/sites/(?:default|all)/(?:themes|modules)/", "icon": "Drupal.svg", "implies": "PHP", "js": { diff --git a/src/drivers/npm/browser.js b/src/drivers/npm/browser.js new file mode 100644 index 000000000..532a75aa3 --- /dev/null +++ b/src/drivers/npm/browser.js @@ -0,0 +1,20 @@ +class Browser { + constructor(options) { + this.options = options; + + this.window = null; + this.document = null; + this.statusCode = null; + this.contentType = null; + this.headers = null; + this.statusCode = null; + this.contentType = null; + this.html = null; + this.js = null; + this.links = null; + this.scripts = null; + this.cookies = null; + } +} + +module.exports = Browser; diff --git a/src/drivers/npm/browsers/zombie copy.js b/src/drivers/npm/browsers/zombie copy.js new file mode 100644 index 000000000..d33d9cb92 --- /dev/null +++ b/src/drivers/npm/browsers/zombie copy.js @@ -0,0 +1,114 @@ +const Zombie = require('zombie'); + +class Browser { + constructor(options) { + this.options = options; + + this.browser = new Zombie({ + proxy: options.proxy, + silent: true, + strictSSL: false, + userAgent: options.userAgent, + waitDuration: options.maxWait, + }); + + this.statusCode = null; + this.contentType = null; + this.headers = null; + this.statusCode = null; + this.contentType = null; + this.html = null; + this.scripts = null; + this.cookies = null; + + this.window = this.browser.window; + this.document = this.browser.document; + + this.browser.on('authenticate', (auth) => { + auth.username = this.options.username; + auth.password = this.options.password; + }); + } + + visit(url) { + return new Promise((resolve) => { + this.browser.visit(url, () => { + const resource = this.browser.resources.length + ? this.browser.resources.filter(_resource => _resource.response).shift() : null; + + this.headers = this.getHeaders(); + this.statusCode = resource ? resource.response.status : 0; + this.contentType = this.headers['content-type'] ? this.headers['content-type'].shift() : null; + this.html = this.getHtml(); + this.scripts = this.getScripts(); + this.cookies = this.getCookies(); + + resolve(); + }); + }); + } + + getHeaders() { + const headers = {}; + + const resource = this.browser.resources.length + ? this.browser.resources.filter(_resource => _resource.response).shift() : null; + + if (resource) { + // eslint-disable-next-line no-underscore-dangle + resource.response.headers._headers.forEach((header) => { + if (!headers[header[0]]) { + headers[header[0]] = []; + } + + headers[header[0]].push(header[1]); + }); + } + + return headers; + } + + getHtml() { + let html = ''; + + if (this.browser.document && this.browser.document.documentElement) { + try { + html = this.browser.html(); + } catch (error) { + this.log(error.message, 'error'); + } + } + + return html; + } + + getScripts() { + if (!this.browser.document || !this.browser.document.scripts) { + return []; + } + + const scripts = Array.prototype.slice + .apply(this.browser.document.scripts) + .filter(script => script.src) + .map(script => script.src); + + return scripts; + } + + getCookies() { + const cookies = []; + + if (this.browser.cookies) { + this.browser.cookies.forEach(cookie => cookies.push({ + name: cookie.key, + value: cookie.value, + domain: cookie.domain, + path: cookie.path, + })); + } + + return cookies; + } +} + +export default Browser; diff --git a/src/drivers/npm/browsers/zombie.js b/src/drivers/npm/browsers/zombie.js new file mode 100644 index 000000000..0e091d697 --- /dev/null +++ b/src/drivers/npm/browsers/zombie.js @@ -0,0 +1,119 @@ +const Zombie = require('zombie'); +const Browser = require('../browser'); + +class ZombieBrowser extends Browser { + constructor(options) { + super(options); + + this.browser = new Zombie({ + proxy: options.proxy, + silent: true, + strictSSL: false, + userAgent: options.userAgent, + waitDuration: options.maxWait, + }); + + this.browser.on('authenticate', (auth) => { + auth.username = this.options.username; + auth.password = this.options.password; + }); + } + + visit(url) { + return new Promise((resolve) => { + this.browser.visit(url, () => { + const resource = this.browser.resources.length + ? this.browser.resources.filter(_resource => _resource.response).shift() : null; + + this.headers = this.getHeaders(); + this.statusCode = resource ? resource.response.status : 0; + this.contentType = this.headers['content-type'] ? this.headers['content-type'].shift() : null; + this.html = this.getHtml(); + this.js = this.getJs(); + this.links = this.getLinks(); + this.scripts = this.getScripts(); + this.cookies = this.getCookies(); + + resolve(); + }); + }); + } + + getHeaders() { + const headers = {}; + + const resource = this.browser.resources.length + ? this.browser.resources.filter(_resource => _resource.response).shift() : null; + + if (resource) { + // eslint-disable-next-line no-underscore-dangle + resource.response.headers._headers.forEach((header) => { + if (!headers[header[0]]) { + headers[header[0]] = []; + } + + headers[header[0]].push(header[1]); + }); + } + + return headers; + } + + getHtml() { + let html = ''; + + if (this.browser.document && this.browser.document.documentElement) { + try { + html = this.browser.html(); + } catch (error) { + this.log(error.message, 'error'); + } + } + + return html; + } + + getScripts() { + let scripts = []; + + if (this.browser.document && this.browser.document.scripts) { + scripts = Array.prototype.slice + .apply(this.browser.document.scripts) + .filter(script => script.src) + .map(script => script.src); + } + + return scripts; + } + + getJs() { + return this.browser.window; + } + + getLinks() { + let links = []; + + if (this.browser.document) { + links = Array.from(this.browser.document.getElementsByTagName('a')); + } + + return links; + } + + getCookies() { + const cookies = []; + + if (this.browser.cookies) { + this.browser.cookies.forEach(cookie => cookies.push({ + name: cookie.key, + value: cookie.value, + domain: cookie.domain, + path: cookie.path, + })); + } + + return cookies; + } +} + +module.exports = ZombieBrowser; diff --git a/src/drivers/npm/driver.js b/src/drivers/npm/driver.js index 5c343c22c..cf4bfce61 100644 --- a/src/drivers/npm/driver.js +++ b/src/drivers/npm/driver.js @@ -1,7 +1,6 @@ const url = require('url'); const fs = require('fs'); const path = require('path'); -const Browser = require('zombie'); const Wappalyzer = require('./wappalyzer'); const json = JSON.parse(fs.readFileSync(path.resolve(`${__dirname}/apps.json`))); @@ -18,56 +17,8 @@ function sleep(ms) { return ms ? new Promise(resolve => setTimeout(resolve, ms)) : Promise.resolve(); } -function getHeaders(browser) { - const headers = {}; - - const resource = browser.resources.length - ? browser.resources.filter(_resource => _resource.response).shift() : null; - - if (resource) { - // eslint-disable-next-line no-underscore-dangle - resource.response.headers._headers.forEach((header) => { - if (!headers[header[0]]) { - headers[header[0]] = []; - } - - headers[header[0]].push(header[1]); - }); - } - - return headers; -} - -function getScripts(browser) { - if (!browser.document || !browser.document.scripts) { - return []; - } - - const scripts = Array.prototype.slice - .apply(browser.document.scripts) - .filter(script => script.src) - .map(script => script.src); - - return scripts; -} - -function getCookies(browser) { - const cookies = []; - - if (browser.cookies) { - browser.cookies.forEach(cookie => cookies.push({ - name: cookie.key, - value: cookie.value, - domain: cookie.domain, - path: cookie.path, - })); - } - - return cookies; -} - class Driver { - constructor(pageUrl, options) { + constructor(Browser, pageUrl, options) { this.options = Object.assign({}, { password: '', proxy: null, @@ -98,6 +49,8 @@ class Driver { this.apps = []; this.meta = {}; + this.Browser = Browser; + this.wappalyzer = new Wappalyzer(); this.wappalyzer.apps = json.apps; @@ -175,119 +128,70 @@ class Driver { this.timer(`fetch; url: ${pageUrl.href}; depth: ${depth}; delay: ${this.options.delay * index}ms`, timerScope); - return new Promise((resolve, reject) => { - sleep(this.options.delay * index) - .then(() => this.visit(pageUrl, timerScope, resolve, reject)); - }); - } - - visit(pageUrl, timerScope, resolve, reject) { - const browser = new Browser({ - proxy: this.options.proxy, - silent: true, - strictSSL: false, - userAgent: this.options.userAgent, - waitDuration: this.options.maxWait, - }); + return new Promise(async (resolve, reject) => { + await sleep(this.options.delay * index); - browser.on('authenticate', (auth) => { - auth.username = this.options.username; - auth.password = this.options.password; + this.visit(pageUrl, timerScope, resolve, reject); }); + } - this.timer(`browser.visit start; url: ${pageUrl.href}`, timerScope); + async visit(pageUrl, timerScope, resolve, reject) { + const browser = new this.Browser(this.options); - browser.visit(pageUrl.href, () => { - this.timer(`browser.visit end; url: ${pageUrl.href}`, timerScope); + browser.log = (message, type) => this.wappalyzer.log(message, 'browser', type); - try { - if (!this.checkResponse(browser, pageUrl)) { - resolve(); + this.timer(`visit start; url: ${pageUrl.href}`, timerScope); - return; - } - } catch (error) { - reject(error); + await browser.visit(pageUrl.href); - return; - } + this.timer(`visit end; url: ${pageUrl.href}`, timerScope); - const headers = getHeaders(browser); - const html = this.getHtml(browser); - const scripts = getScripts(browser); - const js = this.getJs(browser); - const cookies = getCookies(browser); - - this.wappalyzer.analyze(pageUrl, { - headers, - html, - scripts, - js, - cookies, - }) - .then(() => { - const links = Array.prototype.reduce.call( - browser.document.getElementsByTagName('a'), (results, link) => { - if (link.protocol.match(/https?:/) && link.hostname === this.origPageUrl.hostname && extensions.test(link.pathname)) { - link.hash = ''; - - results.push(url.parse(link.href)); - } - - return results; - }, [], - ); - - return resolve(links); - }); - }); - } + this.analyzedPageUrls[pageUrl.href].status = browser.statusCode; - checkResponse(browser, pageUrl) { // Validate response - const resource = browser.resources.length - ? browser.resources.filter(_resource => _resource.response).shift() : null; + if (!browser.statusCode) { + reject(new Error('NO_RESPONSE')); + } - if (!resource) { - throw new Error('NO_RESPONSE'); + if (browser.statusCode !== 200) { + reject(new Error('RESPONSE_NOT_OK')); } - this.analyzedPageUrls[pageUrl.href].status = resource.response.status; + if (!browser.contentType || !/\btext\/html\b/.test(browser.contentType)) { + this.wappalyzer.log(`Skipping; url: ${pageUrl.href}; content type: ${browser.contentType}`, 'driver'); - if (resource.response.status !== 200) { - throw new Error('RESPONSE_NOT_OK'); + delete this.analyzedPageUrls[pageUrl.href]; } - const headers = getHeaders(browser); + const { cookies, headers, scripts } = browser; - // Validate content type - const contentType = headers['content-type'] ? headers['content-type'].shift() : null; + const html = this.processHtml(browser.html); + const js = this.processJs(browser.js); - if (!contentType || !/\btext\/html\b/.test(contentType)) { - this.wappalyzer.log(`Skipping; url: ${pageUrl.href}; content type: ${contentType}`, 'driver'); + await this.wappalyzer.analyze(pageUrl, { + cookies, + headers, + html, + js, + scripts, + }); - delete this.analyzedPageUrls[pageUrl.href]; + const reducedLinks = Array.prototype.reduce.call( + browser.links, (results, link) => { + if (link.protocol.match(/https?:/) && link.hostname === this.origPageUrl.hostname && extensions.test(link.pathname)) { + link.hash = ''; - return false; - } + results.push(url.parse(link.href)); + } - // Validate document - if (!browser.document || !browser.document.documentElement) { - throw new Error('NO_HTML_DOCUMENT'); - } + return results; + }, [], + ); - return true; + return resolve(reducedLinks); } - getHtml(browser) { - let html = ''; - - try { - html = browser.html(); - } catch (error) { - this.wappalyzer.log(error.message, 'browser', 'error'); - } - + processHtml(html) { if (this.options.htmlMaxCols || this.options.htmlMaxRows) { const chunks = []; const maxCols = this.options.htmlMaxCols; @@ -308,7 +212,7 @@ class Driver { return html; } - getJs(browser) { + processJs(window) { const patterns = this.wappalyzer.jsPatterns; const js = {}; @@ -323,7 +227,7 @@ class Driver { let value = properties .reduce((parent, property) => (parent && parent[property] - ? parent[property] : null), browser.window); + ? parent[property] : null), window); value = typeof value === 'string' || typeof value === 'number' ? value : !!value; @@ -340,32 +244,32 @@ class Driver { crawl(pageUrl, index = 1, depth = 1) { pageUrl.canonical = `${pageUrl.protocol}//${pageUrl.host}${pageUrl.pathname}`; - return new Promise((resolve) => { - this.fetch(pageUrl, index, depth) - .catch((error) => { - const type = error.message && errorTypes[error.message] ? error.message : 'UNKNOWN_ERROR'; - const message = error.message && errorTypes[error.message] ? errorTypes[error.message] : 'Unknown error'; - - this.analyzedPageUrls[pageUrl.href].error = { - type, - message, - }; - - this.wappalyzer.log(`${message}; url: ${pageUrl.href}`, 'driver', 'error'); - }) - .then((links) => { - if (links && this.options.recursive && depth < this.options.maxDepth) { - return this.chunk(links.slice(0, this.options.maxUrls), depth + 1); - } - return Promise.resolve(); - }) - .then(() => { - resolve({ - urls: this.analyzedPageUrls, - applications: this.apps, - meta: this.meta, - }); - }); + return new Promise(async (resolve) => { + let links; + + try { + links = await this.fetch(pageUrl, index, depth); + } catch (error) { + const type = error.message && errorTypes[error.message] ? error.message : 'UNKNOWN_ERROR'; + const message = error.message && errorTypes[error.message] ? errorTypes[error.message] : 'Unknown error'; + + this.analyzedPageUrls[pageUrl.href].error = { + type, + message, + }; + + this.wappalyzer.log(`${message}; url: ${pageUrl.href}`, 'driver', 'error'); + } + + if (links && this.options.recursive && depth < this.options.maxDepth) { + await this.chunk(links.slice(0, this.options.maxUrls), depth + 1); + } + + return resolve({ + urls: this.analyzedPageUrls, + applications: this.apps, + meta: this.meta, + }); }); } @@ -376,10 +280,12 @@ class Driver { const chunked = links.splice(0, this.options.chunkSize); - return new Promise((resolve) => { - Promise.all(chunked.map((link, index) => this.crawl(link, index, depth))) - .then(() => this.chunk(links, depth, chunk + 1)) - .then(() => resolve()); + return new Promise(async (resolve) => { + await Promise.all(chunked.map((link, index) => this.crawl(link, index, depth))); + + await this.chunk(links, depth, chunk + 1); + + resolve(); }); } diff --git a/src/drivers/npm/index.js b/src/drivers/npm/index.js index cdff2ba3d..9436159ed 100755 --- a/src/drivers/npm/index.js +++ b/src/drivers/npm/index.js @@ -1,6 +1,6 @@ #!/usr/bin/env node - +const Browser = require('./browsers/zombie'); const Wappalyzer = require('./driver'); const args = process.argv.slice(2); @@ -30,7 +30,10 @@ do { } } while (arg); -const wappalyzer = new Wappalyzer(url, options); +const wappalyzer = new Wappalyzer(Browser, url, options); + +// Optionally define a custom log function +// wappalyzer.log = (message, source, type) => console.log(message); wappalyzer.analyze() .then((json) => { diff --git a/src/drivers/npm/npm-shrinkwrap.json b/src/drivers/npm/npm-shrinkwrap.json index e82437f27..19f89d6b8 100644 --- a/src/drivers/npm/npm-shrinkwrap.json +++ b/src/drivers/npm/npm-shrinkwrap.json @@ -1,6 +1,6 @@ { "name": "wappalyzer", - "version": "5.5.6", + "version": "5.5.7", "lockfileVersion": 1, "requires": true, "dependencies": { diff --git a/src/drivers/npm/package.json b/src/drivers/npm/package.json index d6d3e197a..f89284174 100644 --- a/src/drivers/npm/package.json +++ b/src/drivers/npm/package.json @@ -2,7 +2,7 @@ "name": "wappalyzer", "description": "Uncovers the technologies used on websites", "homepage": "https://github.com/AliasIO/Wappalyzer", - "version": "5.5.7", + "version": "5.6.0", "author": "Elbert Alias", "license": "GPL-3.0", "repository": { diff --git a/src/drivers/webextension/manifest.json b/src/drivers/webextension/manifest.json index 3e3021605..ef7dc4df8 100644 --- a/src/drivers/webextension/manifest.json +++ b/src/drivers/webextension/manifest.json @@ -4,7 +4,7 @@ "author": "Elbert Alias", "homepage_url": "https://www.wappalyzer.com", "description": "Identify web technologies", - "version": "5.5.7", + "version": "5.6.0", "default_locale": "en", "manifest_version": 2, "icons": { diff --git a/src/wappalyzer.js b/src/wappalyzer.js index 05e9121ba..a50ddb913 100644 --- a/src/wappalyzer.js +++ b/src/wappalyzer.js @@ -160,7 +160,7 @@ class Wappalyzer { this.detected[url.canonical] = {}; } - const metas = []; + const metaTags = []; // Additional information let language = null; @@ -170,23 +170,22 @@ class Wappalyzer { html = ''; } - const matches = data.html.match(/]*[: ]lang="([a-z]{2}((-|_)[A-Z]{2})?)"/i); + let matches = data.html.match(new RegExp(']*[: ]lang="([a-z]{2}((-|_)[A-Z]{2})?)"', 'i')); language = matches && matches.length ? matches[1] : null; - // grab metas + // Meta tags const regex = /]+>/ig; - let metaMatches; + do { - metaMatches = regex.exec(html); + matches = regex.exec(html); - if (!metaMatches) { + if (!matches) { break; } - const [match] = metaMatches; - metas.push(match); - } while (metaMatches); + metaTags.push(matches[0]); + } while (matches); } Object.keys(this.apps).forEach((appName) => { @@ -200,7 +199,7 @@ class Wappalyzer { if (html) { promises.push(this.analyzeHtml(app, html)); - promises.push(this.analyzeMeta(app, metas)); + promises.push(this.analyzeMeta(app, metaTags)); } if (scripts) { @@ -224,33 +223,32 @@ class Wappalyzer { }); } - return new Promise((resolve) => { - Promise.all(promises) - .then(() => { - Object.keys(apps).forEach((appName) => { - const app = apps[appName]; + return new Promise(async (resolve) => { + await Promise.all(promises); - if (!app.detected || !app.getConfidence()) { - delete apps[app.name]; - } - }); + Object.keys(apps).forEach((appName) => { + const app = apps[appName]; - resolveExcludes(apps, this.detected[url]); - this.resolveImplies(apps, url.canonical); + if (!app.detected || !app.getConfidence()) { + delete apps[app.name]; + } + }); - this.cacheDetectedApps(apps, url.canonical); - this.trackDetectedApps(apps, url, language); + resolveExcludes(apps, this.detected[url]); + this.resolveImplies(apps, url.canonical); - this.log(`Processing ${Object.keys(data).join(', ')} took ${((new Date() - startTime) / 1000).toFixed(2)}s (${url.hostname})`, 'core'); + this.cacheDetectedApps(apps, url.canonical); + this.trackDetectedApps(apps, url, language); - if (Object.keys(apps).length) { - this.log(`Identified ${Object.keys(apps).join(', ')} (${url.hostname})`, 'core'); - } + this.log(`Processing ${Object.keys(data).join(', ')} took ${((new Date() - startTime) / 1000).toFixed(2)}s (${url.hostname})`, 'core'); - this.driver.displayApps(this.detected[url.canonical], { language }, context); + if (Object.keys(apps).length) { + this.log(`Identified ${Object.keys(apps).join(', ')} (${url.hostname})`, 'core'); + } - return resolve(); - }); + this.driver.displayApps(this.detected[url.canonical], { language }, context); + + return resolve(); }); } @@ -265,23 +263,20 @@ class Wappalyzer { * */ robotsTxtAllows(url) { - return new Promise((resolve, reject) => { + return new Promise(async (resolve, reject) => { const parsed = this.parseUrl(url); if (parsed.protocol !== 'http:' && parsed.protocol !== 'https:') { - reject(); - - return; + return reject(); } - this.driver.getRobotsTxt(parsed.host, parsed.protocol === 'https:') - .then((robotsTxt) => { - if (robotsTxt.some(disallowedPath => parsed.pathname.indexOf(disallowedPath) === 0)) { - return reject(); - } + const robotsTxt = await this.driver.getRobotsTxt(parsed.host, parsed.protocol === 'https:'); + + if (robotsTxt.some(disallowedPath => parsed.pathname.indexOf(disallowedPath) === 0)) { + return reject(); + } - return resolve(); - }, () => resolve()); + return resolve(); }); } @@ -376,10 +371,10 @@ class Wappalyzer { try { attrs.regex = new RegExp(attr.replace('/', '\\/'), 'i'); // Escape slashes in regular expression - } catch (e) { + } catch (error) { attrs.regex = new RegExp(); - this.log(`${e}: ${attr}`, 'error', 'core'); + this.log(`${error.message}: ${attr}`, 'error', 'core'); } } }); @@ -572,7 +567,7 @@ class Wappalyzer { /** * Analyze meta tag */ - analyzeMeta(app, metas) { + analyzeMeta(app, metaTags) { const patterns = this.parsePatterns(app.props.meta); const promises = []; @@ -580,7 +575,7 @@ class Wappalyzer { return Promise.resolve(); } - metas.forEach((match) => { + metaTags.forEach((match) => { Object.keys(patterns).forEach((meta) => { const r = new RegExp(`(?:name|property)=["']${meta}["']`, 'i');