From 75cd40094c0c1211d36f289f5e52e46399c19667 Mon Sep 17 00:00:00 2001 From: Elbert Alias Date: Sat, 22 Jul 2017 17:55:13 +1000 Subject: [PATCH] Respect robots.txt --- src/drivers/webextension/js/driver.js | 68 ++++++++++--- src/wappalyzer.js | 134 ++++++++++++++++++++------ 2 files changed, 159 insertions(+), 43 deletions(-) diff --git a/src/drivers/webextension/js/driver.js b/src/drivers/webextension/js/driver.js index 6bbe373fe..4568e30b6 100644 --- a/src/drivers/webextension/js/driver.js +++ b/src/drivers/webextension/js/driver.js @@ -1,6 +1,7 @@ /** * WebExtension driver */ + setOption('robotsTxtCache', {}); var tabCache = {}; var headersCache = {}; @@ -61,7 +62,7 @@ function post(url, body) { body }) .then(response => { - wappalyzer.log('POST ' + url + ': ', 'driver'); + wappalyzer.log('POST ' + url + ': ' + response.status, 'driver'); }) .catch(error => { wappalyzer.log('POST ' + url + ': ' + error, 'driver', 'error'); @@ -130,9 +131,9 @@ browser.webRequest.onCompleted.addListener(request => { var responseHeaders = {}; if ( request.responseHeaders ) { - var uri = request.url.replace(/#.*$/, ''); // Remove hash + var url = wappalyzer.parseUrl(request.url); - request.responseHeaders.forEach(header => { + request.responseHeaders.forEach(function(header) { responseHeaders[header.name.toLowerCase()] = header.value || '' + header.binaryValue; }); @@ -141,12 +142,12 @@ browser.webRequest.onCompleted.addListener(request => { } if ( /text\/html/.test(responseHeaders['content-type']) ) { - if ( headersCache[uri] === undefined ) { - headersCache[uri] = {}; + if ( headersCache[url.canonical] === undefined ) { + headersCache[url.canonical] = {}; } Object.keys(responseHeaders).forEach(header => { - headersCache[uri][header] = responseHeaders[header]; + headersCache[url.canonical][header] = responseHeaders[header]; }); } } @@ -167,15 +168,15 @@ browser.webRequest.onCompleted.addListener(request => { break; case 'analyze': - var a = document.createElement('a'); + var url = wappalyzer.parseUrl(sender.tab.url); - a.href = sender.tab.url.replace(/#.*$/, ''); - - if ( headersCache[a.href] !== undefined ) { - message.subject.headers = headersCache[a.href]; + if ( headersCache[url.canonical] !== undefined ) { + message.subject.headers = headersCache[url.canonical]; } - wappalyzer.analyze(a.hostname, a.href, message.subject, { tab: sender.tab }); + wappalyzer.analyze(url.hostname, url.canonical, message.subject, { + tab: sender.tab + }); break; case 'ad_log': @@ -202,7 +203,7 @@ browser.webRequest.onCompleted.addListener(request => { */ wappalyzer.driver.log = (message, source, type) => { console.log('[wappalyzer ' + type + ']', '[' + source + ']', message); -} +}; /** * Display apps @@ -255,7 +256,44 @@ wappalyzer.driver.displayApps = (detected, context) => { } }); } -} +}; + +/** + * Fetch and cache robots.txt for host + */ +wappalyzer.driver.getRobotsTxt = (host, secure = false) => { + return new Promise((resolve, reject) => { + getOption('robotsTxtCache') + .then(robotsTxtCache => { + robotsTxtCache = robotsTxtCache || {}; + + if ( host in robotsTxtCache ) { + resolve(robotsTxtCache[host]); + } else { + var url = 'http' + ( secure ? 's' : '' ) + '://' + host + '/robots.txt'; + + fetch('http' + ( secure ? 's' : '' ) + '://' + host + '/robots.txt') + .then(response => { + if ( !response.ok ) { + throw 'GET ' + response.url + ' was not ok'; + } + + return response.text(); + }) + .then(robotsTxt => { + robotsTxtCache[host] = wappalyzer.parseRobotsTxt(robotsTxt); + + setOption('robotsTxtCache', robotsTxtCache); + + resolve(robotsTxtCache[host]); + + var hostname = host.replace(/:[0-9]+$/, '') + }) + .catch(reject); + } + }); + }); +}; /** * Anonymously track detected applications for research purposes @@ -268,4 +306,4 @@ wappalyzer.driver.ping = (ping, adCache) => { post('https://ad.wappalyzer.com/log/wp/', adCache); } }); -} +}; diff --git a/src/wappalyzer.js b/src/wappalyzer.js index 1e6ee8aa9..213854c0b 100644 --- a/src/wappalyzer.js +++ b/src/wappalyzer.js @@ -37,8 +37,6 @@ wappalyzer.log = (message, source, type) => { }; wappalyzer.analyze = (hostname, url, data, context) => { - wappalyzer.log('Function call: analyze()', 'core'); - var apps = {}; // Remove hash from URL @@ -74,6 +72,10 @@ wappalyzer.analyze = (hostname, url, data, context) => { if ( data.env ) { analyzeEnv(app, data.env); } + + if ( data.robotsTxt ) { + analyzeRobotsTxt(app, data.robotsTxt); + } }) Object.keys(apps).forEach(appName => { @@ -104,6 +106,65 @@ wappalyzer.cacheDetectedAds = ad => { adCache.push(ad); } +/** + * + */ +wappalyzer.robotsTxtAllows = url => { + return new Promise((resolve, reject) => { + var parsed = wappalyzer.parseUrl(url); + + wappalyzer.driver.getRobotsTxt(parsed.host, parsed.protocol === 'https:') + .then(robotsTxt => { + robotsTxt.forEach(disallow => { + if ( parsed.pathname.search(disallow) === 0 ) { + reject(); + } + }); + + resolve(); + }); + }); +}; + +/** + * Parse a URL + */ +wappalyzer.parseUrl = url => { + var a = document.createElement('a'); + + a.href = url; + + a.canonical = a.protocol + '//' + a.host + a.pathname; + + return a; +} + +/** + * + */ +wappalyzer.parseRobotsTxt = robotsTxt => { + var userAgent; + var disallow = []; + + robotsTxt.split('\n').forEach(line => { + var matches = /^User-agent:\s*(.+)$/i.exec(line); + + if ( matches ) { + userAgent = matches[1].toLowerCase(); + } else { + if ( userAgent === '*' || userAgent === 'wappalyzer' ) { + matches = /^Disallow:\s*(.+)$/i.exec(line); + + if ( matches ) { + disallow.push(matches[1]); + } + } + } + }); + + return disallow; +} + /** * Enclose string in array */ @@ -156,7 +217,7 @@ function parsePatterns(patterns) { } // Convert back to array if the original pattern list was an array (or string) - if ( parsed.hasOwnProperty('main') ) { + if ( 'main' in parsed ) { parsed = parsed.main; } @@ -206,7 +267,7 @@ function resolveImplies(apps, url) { return; } - if ( !apps.hasOwnProperty(implied.string) ) { + if ( !( implied.string in apps ) ) { apps[implied.string] = detected[url] && detected[url][implied.string] ? detected[url][implied.string] : new Application(implied.string, true); checkImplies = true; @@ -226,8 +287,6 @@ function resolveImplies(apps, url) { * Cache detected applications */ function cacheDetectedApps(apps, url) { - wappalyzer.log('Function call: cacheDetectedApps()', 'core'); - Object.keys(apps).forEach(appName => { var app = apps[appName]; @@ -244,35 +303,39 @@ function cacheDetectedApps(apps, url) { * Track detected applications */ function trackDetectedApps(apps, url, hostname, html) { - wappalyzer.log('Function call: trackDetectedApps()', 'core'); - Object.keys(apps).forEach(appName => { var app = apps[appName]; - if ( detected[url][appName].getConfidence() >= 100 && validation.hostname.test(hostname) && !validation.hostnameBlacklist.test(url) ) { - if ( !hostnameCache.hasOwnProperty(hostname) ) { - hostnameCache[hostname] = { - applications: {}, - meta: {} - }; - } - - if ( !hostnameCache[hostname].applications.hasOwnProperty(appName) ) { - hostnameCache[hostname].applications[appName] = { - hits: 0 - }; - } - - hostnameCache[hostname].applications[appName].hits ++; - - if ( apps[appName].version ) { - hostnameCache[hostname].applications[appName].version = app.version; + if ( detected[url][appName].getConfidence() >= 100 ) { + if ( validation.hostname.test(hostname) && !validation.hostnameBlacklist.test(url) ) { + wappalyzer.robotsTxtAllows(url) + .then(() => { + if ( !( hostname in hostnameCache ) ) { + hostnameCache[hostname] = { + applications: {}, + meta: {} + }; + } + + if ( !( appName in hostnameCache[hostname].applications ) ) { + hostnameCache[hostname].applications[appName] = { + hits: 0 + }; + } + + hostnameCache[hostname].applications[appName].hits ++; + + if ( apps[appName].version ) { + hostnameCache[hostname].applications[appName].version = app.version; + } + }) + .catch(() => console.log('Disallowed in robots.txt: ' + url)) } } }); // Additional information - if ( hostnameCache.hasOwnProperty(hostname) ) { + if ( hostname in hostnameCache ) { var match = html.match(/]*[: ]lang="([a-z]{2}((-|_)[A-Z]{2})?)"/i); if ( match && match.length ) { @@ -373,7 +436,7 @@ function analyzeHeaders(app, headers) { patterns[header].forEach(pattern => { header = header.toLowerCase(); - if ( headers.hasOwnProperty(header) && pattern.regex.test(headers[header]) ) { + if ( header in headers && pattern.regex.test(headers[header]) ) { addDetected(app, pattern, 'headers', headers[header], header); } }); @@ -398,6 +461,21 @@ function analyzeEnv(app, envs) { } } +/** + * Analyze robots.txt + */ +function analyzeRobotsTxt(app, robotsTxt) { + var patterns = parsePatterns(app.props.robotsTxt); + + if ( patterns.length ) { + patterns.forEach(pattern => { + if ( pattern.regex.test(robotsTxt) ) { + addDetected(app, pattern, 'robotsTxt', robotsTxt); + } + }); + } +} + /** * Mark application as detected, set confidence and version */