Respect robots.txt

main
Elbert Alias 8 years ago
parent a00319a3a8
commit 75cd40094c

@ -1,6 +1,7 @@
/**
* WebExtension driver
*/
setOption('robotsTxtCache', {});
var tabCache = {};
var headersCache = {};
@ -61,7 +62,7 @@ function post(url, body) {
body
})
.then(response => {
wappalyzer.log('POST ' + url + ': ', 'driver');
wappalyzer.log('POST ' + url + ': ' + response.status, 'driver');
})
.catch(error => {
wappalyzer.log('POST ' + url + ': ' + error, 'driver', 'error');
@ -130,9 +131,9 @@ browser.webRequest.onCompleted.addListener(request => {
var responseHeaders = {};
if ( request.responseHeaders ) {
var uri = request.url.replace(/#.*$/, ''); // Remove hash
var url = wappalyzer.parseUrl(request.url);
request.responseHeaders.forEach(header => {
request.responseHeaders.forEach(function(header) {
responseHeaders[header.name.toLowerCase()] = header.value || '' + header.binaryValue;
});
@ -141,12 +142,12 @@ browser.webRequest.onCompleted.addListener(request => {
}
if ( /text\/html/.test(responseHeaders['content-type']) ) {
if ( headersCache[uri] === undefined ) {
headersCache[uri] = {};
if ( headersCache[url.canonical] === undefined ) {
headersCache[url.canonical] = {};
}
Object.keys(responseHeaders).forEach(header => {
headersCache[uri][header] = responseHeaders[header];
headersCache[url.canonical][header] = responseHeaders[header];
});
}
}
@ -167,15 +168,15 @@ browser.webRequest.onCompleted.addListener(request => {
break;
case 'analyze':
var a = document.createElement('a');
var url = wappalyzer.parseUrl(sender.tab.url);
a.href = sender.tab.url.replace(/#.*$/, '');
if ( headersCache[a.href] !== undefined ) {
message.subject.headers = headersCache[a.href];
if ( headersCache[url.canonical] !== undefined ) {
message.subject.headers = headersCache[url.canonical];
}
wappalyzer.analyze(a.hostname, a.href, message.subject, { tab: sender.tab });
wappalyzer.analyze(url.hostname, url.canonical, message.subject, {
tab: sender.tab
});
break;
case 'ad_log':
@ -202,7 +203,7 @@ browser.webRequest.onCompleted.addListener(request => {
*/
wappalyzer.driver.log = (message, source, type) => {
console.log('[wappalyzer ' + type + ']', '[' + source + ']', message);
}
};
/**
* Display apps
@ -255,7 +256,44 @@ wappalyzer.driver.displayApps = (detected, context) => {
}
});
}
}
};
/**
 * Fetch and cache the parsed robots.txt rules for a host
 *
 * The 'robotsTxtCache' option is consulted first. On a cache miss the
 * host's /robots.txt is downloaded, parsed with wappalyzer.parseRobotsTxt()
 * and the result is written back to the option before being returned.
 *
 * @param {string} host - Host (as used in a URL authority) to query
 * @param {boolean} [secure=false] - Request over https instead of http
 * @returns {Promise} Resolves with the parsed rules for the host. Rejects
 *   with an Error when the option lookup, the request or the parsing fails,
 *   so callers are never left with a promise that does not settle.
 */
wappalyzer.driver.getRobotsTxt = (host, secure = false) => {
  // Promise.resolve() guarantees a native promise whatever getOption() returns
  return Promise.resolve(getOption('robotsTxtCache'))
    .then(cached => {
      const robotsTxtCache = cached || {};

      // Own-property check so inherited keys are never mistaken for hosts
      if ( Object.prototype.hasOwnProperty.call(robotsTxtCache, host) ) {
        return robotsTxtCache[host];
      }

      const url = 'http' + ( secure ? 's' : '' ) + '://' + host + '/robots.txt';

      return fetch(url)
        .then(response => {
          if ( !response.ok ) {
            throw new Error('GET ' + response.url + ' was not ok');
          }

          return response.text();
        })
        .then(robotsTxt => {
          robotsTxtCache[host] = wappalyzer.parseRobotsTxt(robotsTxt);

          setOption('robotsTxtCache', robotsTxtCache);

          return robotsTxtCache[host];
        });
    });
};
/**
* Anonymously track detected applications for research purposes
@ -268,4 +306,4 @@ wappalyzer.driver.ping = (ping, adCache) => {
post('https://ad.wappalyzer.com/log/wp/', adCache);
}
});
}
};

@ -37,8 +37,6 @@ wappalyzer.log = (message, source, type) => {
};
wappalyzer.analyze = (hostname, url, data, context) => {
wappalyzer.log('Function call: analyze()', 'core');
var apps = {};
// Remove hash from URL
@ -74,6 +72,10 @@ wappalyzer.analyze = (hostname, url, data, context) => {
if ( data.env ) {
analyzeEnv(app, data.env);
}
if ( data.robotsTxt ) {
analyzeRobotsTxt(app, data.robotsTxt);
}
})
Object.keys(apps).forEach(appName => {
@ -104,6 +106,65 @@ wappalyzer.cacheDetectedAds = ad => {
adCache.push(ad);
}
/**
 * Check whether the robots.txt rules of a URL's host permit its path
 *
 * The rules are obtained through wappalyzer.driver.getRobotsTxt(). Each rule
 * is compared to the URL's pathname as a literal prefix; wildcard characters
 * in a rule are not interpreted.
 *
 * @param {string} url - URL to check
 * @returns {Promise} Resolves (with no value) when no rule matches. Rejects
 *   with an Error when a rule matches, or with the underlying error when the
 *   rules could not be obtained.
 */
wappalyzer.robotsTxtAllows = url => {
  return new Promise((resolve, reject) => {
    const parsed = wappalyzer.parseUrl(url);

    wappalyzer.driver.getRobotsTxt(parsed.host, parsed.protocol === 'https:')
      .then(robotsTxt => {
        // Literal prefix comparison: String.prototype.search() would compile
        // the rule as a regular expression and misread '.', '?', '*', '('
        const disallowed = robotsTxt.some(path => parsed.pathname.startsWith(path));

        if ( disallowed ) {
          reject(new Error('Disallowed in robots.txt: ' + url));

          return;
        }

        resolve();
      })
      // Without this a failed lookup would leave the promise pending forever
      .catch(reject);
  });
};
/**
 * Parse a URL using an anchor element
 *
 * @param {string} url - URL to parse
 * @returns {Object} The anchor element created for the URL, with an extra
 *   `canonical` property made of its protocol, host and pathname
 */
wappalyzer.parseUrl = url => {
  // Assigning href lets the anchor expose the URL's individual components
  const anchor = Object.assign(document.createElement('a'), { href: url });

  anchor.canonical = `${anchor.protocol}//${anchor.host}${anchor.pathname}`;

  return anchor;
};
/**
 * Extract the Disallow paths that apply to this crawler from a robots.txt
 *
 * Only groups addressed to the user agents '*' or 'wappalyzer' (case
 * insensitive) are considered; the paths of all such groups are merged.
 * Consecutive User-agent lines form a single group sharing the rules that
 * follow them. Comments ('#' to end of line), surrounding whitespace and
 * any of the LF / CRLF / CR line endings are tolerated. Disallow lines with
 * an empty value are ignored.
 *
 * @param {string} robotsTxt - Raw contents of a robots.txt file
 * @returns {string[]} Disallowed path prefixes, in file order
 */
wappalyzer.parseRobotsTxt = robotsTxt => {
  const disallow = [];

  let agents = [];
  let groupHasRules = false;

  ( robotsTxt || '' ).split(/\r\n|\r|\n/).forEach(rawLine => {
    // Drop comments and padding so the anchored patterns below can match
    const line = rawLine.replace(/#.*$/, '').trim();

    const agentMatch = /^User-agent\s*:\s*(.+)$/i.exec(line);

    if ( agentMatch ) {
      // A User-agent line following rules opens a new group
      if ( groupHasRules ) {
        agents = [];
        groupHasRules = false;
      }

      agents.push(agentMatch[1].toLowerCase());

      return;
    }

    const ruleMatch = /^(Allow|Disallow)\s*:\s*(.*)$/i.exec(line);

    if ( !ruleMatch ) {
      return;
    }

    groupHasRules = true;

    const applies = agents.indexOf('*') !== -1 || agents.indexOf('wappalyzer') !== -1;

    if ( applies && ruleMatch[1].toLowerCase() === 'disallow' && ruleMatch[2] ) {
      disallow.push(ruleMatch[2]);
    }
  });

  return disallow;
};
/**
* Enclose string in array
*/
@ -156,7 +217,7 @@ function parsePatterns(patterns) {
}
// Convert back to array if the original pattern list was an array (or string)
if ( parsed.hasOwnProperty('main') ) {
if ( 'main' in parsed ) {
parsed = parsed.main;
}
@ -206,7 +267,7 @@ function resolveImplies(apps, url) {
return;
}
if ( !apps.hasOwnProperty(implied.string) ) {
if ( !( implied.string in apps ) ) {
apps[implied.string] = detected[url] && detected[url][implied.string] ? detected[url][implied.string] : new Application(implied.string, true);
checkImplies = true;
@ -226,8 +287,6 @@ function resolveImplies(apps, url) {
* Cache detected applications
*/
function cacheDetectedApps(apps, url) {
wappalyzer.log('Function call: cacheDetectedApps()', 'core');
Object.keys(apps).forEach(appName => {
var app = apps[appName];
@ -244,35 +303,39 @@ function cacheDetectedApps(apps, url) {
* Track detected applications
*/
function trackDetectedApps(apps, url, hostname, html) {
wappalyzer.log('Function call: trackDetectedApps()', 'core');
Object.keys(apps).forEach(appName => {
var app = apps[appName];
if ( detected[url][appName].getConfidence() >= 100 && validation.hostname.test(hostname) && !validation.hostnameBlacklist.test(url) ) {
if ( !hostnameCache.hasOwnProperty(hostname) ) {
hostnameCache[hostname] = {
applications: {},
meta: {}
};
}
if ( !hostnameCache[hostname].applications.hasOwnProperty(appName) ) {
hostnameCache[hostname].applications[appName] = {
hits: 0
};
}
hostnameCache[hostname].applications[appName].hits ++;
if ( apps[appName].version ) {
hostnameCache[hostname].applications[appName].version = app.version;
if ( detected[url][appName].getConfidence() >= 100 ) {
if ( validation.hostname.test(hostname) && !validation.hostnameBlacklist.test(url) ) {
wappalyzer.robotsTxtAllows(url)
.then(() => {
if ( !( hostname in hostnameCache ) ) {
hostnameCache[hostname] = {
applications: {},
meta: {}
};
}
if ( !( appName in hostnameCache[hostname].applications ) ) {
hostnameCache[hostname].applications[appName] = {
hits: 0
};
}
hostnameCache[hostname].applications[appName].hits ++;
if ( apps[appName].version ) {
hostnameCache[hostname].applications[appName].version = app.version;
}
})
.catch(() => console.log('Disallowed in robots.txt: ' + url))
}
}
});
// Additional information
if ( hostnameCache.hasOwnProperty(hostname) ) {
if ( hostname in hostnameCache ) {
var match = html.match(/<html[^>]*[: ]lang="([a-z]{2}((-|_)[A-Z]{2})?)"/i);
if ( match && match.length ) {
@ -373,7 +436,7 @@ function analyzeHeaders(app, headers) {
patterns[header].forEach(pattern => {
header = header.toLowerCase();
if ( headers.hasOwnProperty(header) && pattern.regex.test(headers[header]) ) {
if ( header in headers && pattern.regex.test(headers[header]) ) {
addDetected(app, pattern, 'headers', headers[header], header);
}
});
@ -398,6 +461,21 @@ function analyzeEnv(app, envs) {
}
}
/**
 * Match an application's robotsTxt patterns against robots.txt data
 *
 * Every pattern whose regex matches is reported through addDetected() with
 * the type 'robotsTxt' and the robots.txt data as the matched value.
 *
 * @param {Object} app - Application whose props.robotsTxt patterns are tested
 * @param {*} robotsTxt - robots.txt data handed to each pattern's regex
 */
function analyzeRobotsTxt(app, robotsTxt) {
  const candidates = parsePatterns(app.props.robotsTxt);

  // Nothing to test when the application declares no robotsTxt patterns
  if ( !candidates.length ) {
    return;
  }

  for ( const candidate of candidates ) {
    if ( !candidate.regex.test(robotsTxt) ) {
      continue;
    }

    addDetected(app, candidate, 'robotsTxt', robotsTxt);
  }
}
/**
* Mark application as detected, set confidence and version
*/