Respect robots.txt

main
Elbert Alias 7 years ago
parent a00319a3a8
commit 75cd40094c

@ -1,6 +1,7 @@
/** /**
* WebExtension driver * WebExtension driver
*/ */
setOption('robotsTxtCache', {});
var tabCache = {}; var tabCache = {};
var headersCache = {}; var headersCache = {};
@ -61,7 +62,7 @@ function post(url, body) {
body body
}) })
.then(response => { .then(response => {
wappalyzer.log('POST ' + url + ': ', 'driver'); wappalyzer.log('POST ' + url + ': ' + response.status, 'driver');
}) })
.catch(error => { .catch(error => {
wappalyzer.log('POST ' + url + ': ' + error, 'driver', 'error'); wappalyzer.log('POST ' + url + ': ' + error, 'driver', 'error');
@ -130,9 +131,9 @@ browser.webRequest.onCompleted.addListener(request => {
var responseHeaders = {}; var responseHeaders = {};
if ( request.responseHeaders ) { if ( request.responseHeaders ) {
var uri = request.url.replace(/#.*$/, ''); // Remove hash var url = wappalyzer.parseUrl(request.url);
request.responseHeaders.forEach(header => { request.responseHeaders.forEach(function(header) {
responseHeaders[header.name.toLowerCase()] = header.value || '' + header.binaryValue; responseHeaders[header.name.toLowerCase()] = header.value || '' + header.binaryValue;
}); });
@ -141,12 +142,12 @@ browser.webRequest.onCompleted.addListener(request => {
} }
if ( /text\/html/.test(responseHeaders['content-type']) ) { if ( /text\/html/.test(responseHeaders['content-type']) ) {
if ( headersCache[uri] === undefined ) { if ( headersCache[url.canonical] === undefined ) {
headersCache[uri] = {}; headersCache[url.canonical] = {};
} }
Object.keys(responseHeaders).forEach(header => { Object.keys(responseHeaders).forEach(header => {
headersCache[uri][header] = responseHeaders[header]; headersCache[url.canonical][header] = responseHeaders[header];
}); });
} }
} }
@ -167,15 +168,15 @@ browser.webRequest.onCompleted.addListener(request => {
break; break;
case 'analyze': case 'analyze':
var a = document.createElement('a'); var url = wappalyzer.parseUrl(sender.tab.url);
a.href = sender.tab.url.replace(/#.*$/, ''); if ( headersCache[url.canonical] !== undefined ) {
message.subject.headers = headersCache[url.canonical];
if ( headersCache[a.href] !== undefined ) {
message.subject.headers = headersCache[a.href];
} }
wappalyzer.analyze(a.hostname, a.href, message.subject, { tab: sender.tab }); wappalyzer.analyze(url.hostname, url.canonical, message.subject, {
tab: sender.tab
});
break; break;
case 'ad_log': case 'ad_log':
@ -202,7 +203,7 @@ browser.webRequest.onCompleted.addListener(request => {
*/ */
wappalyzer.driver.log = (message, source, type) => { wappalyzer.driver.log = (message, source, type) => {
console.log('[wappalyzer ' + type + ']', '[' + source + ']', message); console.log('[wappalyzer ' + type + ']', '[' + source + ']', message);
} };
/** /**
* Display apps * Display apps
@ -255,7 +256,44 @@ wappalyzer.driver.displayApps = (detected, context) => {
} }
}); });
} }
} };
/**
 * Fetch and cache robots.txt for host
 *
 * Parsed rules are kept in the 'robotsTxtCache' option, keyed by host, so
 * robots.txt is only downloaded when the host has no cached entry.
 *
 * @param {string} host - Host (optionally including port) to query
 * @param {boolean} [secure=false] - Use https instead of http
 * @returns {Promise} Resolves with the value wappalyzer.parseRobotsTxt()
 *   produced for the host; rejects if the cache cannot be read, the request
 *   fails, or the response status is not ok
 */
wappalyzer.driver.getRobotsTxt = (host, secure = false) => {
  return new Promise((resolve, reject) => {
    getOption('robotsTxtCache')
      .then(robotsTxtCache => {
        robotsTxtCache = robotsTxtCache || {};

        // Own-property check so a host such as "constructor" cannot hit Object.prototype
        if ( Object.prototype.hasOwnProperty.call(robotsTxtCache, host) ) {
          return robotsTxtCache[host];
        }

        var url = 'http' + ( secure ? 's' : '' ) + '://' + host + '/robots.txt';

        return fetch(url)
          .then(response => {
            if ( !response.ok ) {
              throw new Error('GET ' + response.url + ' was not ok');
            }

            return response.text();
          })
          .then(robotsTxt => {
            robotsTxtCache[host] = wappalyzer.parseRobotsTxt(robotsTxt);

            setOption('robotsTxtCache', robotsTxtCache);

            return robotsTxtCache[host];
          });
      })
      .then(resolve)
      // Also covers a failing getOption(), which previously left the promise pending
      .catch(reject);
  });
};
/** /**
* Anonymously track detected applications for research purposes * Anonymously track detected applications for research purposes
@ -268,4 +306,4 @@ wappalyzer.driver.ping = (ping, adCache) => {
post('https://ad.wappalyzer.com/log/wp/', adCache); post('https://ad.wappalyzer.com/log/wp/', adCache);
} }
}); });
} };

@ -37,8 +37,6 @@ wappalyzer.log = (message, source, type) => {
}; };
wappalyzer.analyze = (hostname, url, data, context) => { wappalyzer.analyze = (hostname, url, data, context) => {
wappalyzer.log('Function call: analyze()', 'core');
var apps = {}; var apps = {};
// Remove hash from URL // Remove hash from URL
@ -74,6 +72,10 @@ wappalyzer.analyze = (hostname, url, data, context) => {
if ( data.env ) { if ( data.env ) {
analyzeEnv(app, data.env); analyzeEnv(app, data.env);
} }
if ( data.robotsTxt ) {
analyzeRobotsTxt(app, data.robotsTxt);
}
}) })
Object.keys(apps).forEach(appName => { Object.keys(apps).forEach(appName => {
@ -104,6 +106,65 @@ wappalyzer.cacheDetectedAds = ad => {
adCache.push(ad); adCache.push(ad);
} }
/**
 * Check whether the robots.txt of a URL's host permits access to that URL
 *
 * Rules are matched as robots.txt path patterns, not as regular expressions:
 * a rule matches when the path (plus query string) starts with it, '*'
 * matches any sequence of characters and a trailing '$' anchors the end.
 *
 * @param {string} url - URL to check
 * @returns {Promise} Resolves (without a value) when no rule matches;
 *   rejects when a rule matches or when robots.txt could not be obtained
 */
wappalyzer.robotsTxtAllows = url => {
  // Translate a robots.txt path rule into an anchored regular expression
  var toRegExp = rule => {
    var anchored = rule.slice(-1) === '$';

    var body = ( anchored ? rule.slice(0, -1) : rule )
      .replace(/[.+?^${}()|[\]\\]/g, '\\$&') // Escape everything except '*'
      .replace(/\*/g, '.*');

    return new RegExp('^' + body + ( anchored ? '$' : '' ));
  };

  return new Promise((resolve, reject) => {
    var parsed = wappalyzer.parseUrl(url);
    var target = parsed.pathname + ( parsed.search || '' );

    wappalyzer.driver.getRobotsTxt(parsed.host, parsed.protocol === 'https:')
      .then(robotsTxt => {
        var disallowed = ( robotsTxt || [] ).some(rule => {
          rule = typeof rule === 'string' ? rule.trim() : '';

          // An empty rule disallows nothing
          return rule !== '' && toRegExp(rule).test(target);
        });

        if ( disallowed ) {
          reject(new Error('Disallowed in robots.txt: ' + url));
        } else {
          resolve();
        }
      })
      // Settle instead of hanging when robots.txt is unavailable
      .catch(reject);
  });
};
/**
 * Parse a URL
 *
 * Assigns the URL to a detached anchor element and returns that element with
 * an extra `canonical` property made of its protocol, host and pathname.
 *
 * @param {string} url - URL to parse
 * @returns {Object} The anchor element, extended with `canonical`
 */
wappalyzer.parseUrl = url => {
  var anchor = document.createElement('a');

  anchor.href = url;
  anchor.canonical = `${anchor.protocol}//${anchor.host}${anchor.pathname}`;

  return anchor;
};
/**
 * Parse robots.txt and collect the Disallow rules that apply to us
 *
 * Only groups addressed to the '*' or 'wappalyzer' user agents are
 * considered. Consecutive User-agent lines form a single group. Comments
 * ('#' to end of line), surrounding whitespace and any of the \n, \r\n or \r
 * line endings are tolerated.
 *
 * @param {string} robotsTxt - Raw contents of robots.txt
 * @returns {string[]} Disallowed path rules, in file order
 */
wappalyzer.parseRobotsTxt = robotsTxt => {
  var disallow = [];

  var applies = false; // Current group is addressed to us
  var readingAgents = false; // Previous directive was a User-agent line

  String(robotsTxt || '').split(/\r\n|\r|\n/).forEach(rawLine => {
    var line = rawLine.replace(/#.*$/, '').trim();

    if ( !line ) {
      return;
    }

    var matches = /^User-agent\s*:\s*(.*)$/i.exec(line);

    if ( matches ) {
      var userAgent = matches[1].toLowerCase();
      var relevant = userAgent === '*' || userAgent === 'wappalyzer';

      // A User-agent line directly after another one extends the same group
      applies = readingAgents ? applies || relevant : relevant;
      readingAgents = true;

      return;
    }

    readingAgents = false;

    if ( applies ) {
      // An empty "Disallow:" allows everything, so it yields no rule
      matches = /^Disallow\s*:\s*(.+)$/i.exec(line);

      if ( matches ) {
        disallow.push(matches[1]);
      }
    }
  });

  return disallow;
};
/** /**
* Enclose string in array * Enclose string in array
*/ */
@ -156,7 +217,7 @@ function parsePatterns(patterns) {
} }
// Convert back to array if the original pattern list was an array (or string) // Convert back to array if the original pattern list was an array (or string)
if ( parsed.hasOwnProperty('main') ) { if ( 'main' in parsed ) {
parsed = parsed.main; parsed = parsed.main;
} }
@ -206,7 +267,7 @@ function resolveImplies(apps, url) {
return; return;
} }
if ( !apps.hasOwnProperty(implied.string) ) { if ( !( implied.string in apps ) ) {
apps[implied.string] = detected[url] && detected[url][implied.string] ? detected[url][implied.string] : new Application(implied.string, true); apps[implied.string] = detected[url] && detected[url][implied.string] ? detected[url][implied.string] : new Application(implied.string, true);
checkImplies = true; checkImplies = true;
@ -226,8 +287,6 @@ function resolveImplies(apps, url) {
* Cache detected applications * Cache detected applications
*/ */
function cacheDetectedApps(apps, url) { function cacheDetectedApps(apps, url) {
wappalyzer.log('Function call: cacheDetectedApps()', 'core');
Object.keys(apps).forEach(appName => { Object.keys(apps).forEach(appName => {
var app = apps[appName]; var app = apps[appName];
@ -244,35 +303,39 @@ function cacheDetectedApps(apps, url) {
* Track detected applications * Track detected applications
*/ */
function trackDetectedApps(apps, url, hostname, html) { function trackDetectedApps(apps, url, hostname, html) {
wappalyzer.log('Function call: trackDetectedApps()', 'core');
Object.keys(apps).forEach(appName => { Object.keys(apps).forEach(appName => {
var app = apps[appName]; var app = apps[appName];
if ( detected[url][appName].getConfidence() >= 100 && validation.hostname.test(hostname) && !validation.hostnameBlacklist.test(url) ) { if ( detected[url][appName].getConfidence() >= 100 ) {
if ( !hostnameCache.hasOwnProperty(hostname) ) { if ( validation.hostname.test(hostname) && !validation.hostnameBlacklist.test(url) ) {
hostnameCache[hostname] = { wappalyzer.robotsTxtAllows(url)
applications: {}, .then(() => {
meta: {} if ( !( hostname in hostnameCache ) ) {
}; hostnameCache[hostname] = {
} applications: {},
meta: {}
if ( !hostnameCache[hostname].applications.hasOwnProperty(appName) ) { };
hostnameCache[hostname].applications[appName] = { }
hits: 0
}; if ( !( appName in hostnameCache[hostname].applications ) ) {
} hostnameCache[hostname].applications[appName] = {
hits: 0
hostnameCache[hostname].applications[appName].hits ++; };
}
if ( apps[appName].version ) {
hostnameCache[hostname].applications[appName].version = app.version; hostnameCache[hostname].applications[appName].hits ++;
if ( apps[appName].version ) {
hostnameCache[hostname].applications[appName].version = app.version;
}
})
.catch(() => console.log('Disallowed in robots.txt: ' + url))
} }
} }
}); });
// Additional information // Additional information
if ( hostnameCache.hasOwnProperty(hostname) ) { if ( hostname in hostnameCache ) {
var match = html.match(/<html[^>]*[: ]lang="([a-z]{2}((-|_)[A-Z]{2})?)"/i); var match = html.match(/<html[^>]*[: ]lang="([a-z]{2}((-|_)[A-Z]{2})?)"/i);
if ( match && match.length ) { if ( match && match.length ) {
@ -373,7 +436,7 @@ function analyzeHeaders(app, headers) {
patterns[header].forEach(pattern => { patterns[header].forEach(pattern => {
header = header.toLowerCase(); header = header.toLowerCase();
if ( headers.hasOwnProperty(header) && pattern.regex.test(headers[header]) ) { if ( header in headers && pattern.regex.test(headers[header]) ) {
addDetected(app, pattern, 'headers', headers[header], header); addDetected(app, pattern, 'headers', headers[header], header);
} }
}); });
@ -398,6 +461,21 @@ function analyzeEnv(app, envs) {
} }
} }
/**
 * Analyze robots.txt
 *
 * Tests every robotsTxt pattern of the application against the given text
 * and records each match through addDetected().
 *
 * @param {Object} app - Application whose props.robotsTxt patterns are tested
 * @param {string} robotsTxt - Text the patterns are tested against
 */
function analyzeRobotsTxt(app, robotsTxt) {
  var candidates = parsePatterns(app.props.robotsTxt);

  // Nothing to test
  if ( !candidates.length ) {
    return;
  }

  for ( var candidate of candidates ) {
    if ( candidate.regex.test(robotsTxt) ) {
      addDetected(app, candidate, 'robotsTxt', robotsTxt);
    }
  }
}
/** /**
* Mark application as detected, set confidence and version * Mark application as detected, set confidence and version
*/ */

Loading…
Cancel
Save