Add robots.txt queuing

main
Elbert Alias 7 years ago
parent 24c460f86f
commit f35c7ff3e7

@ -2,7 +2,7 @@
"name": "wappalyzer", "name": "wappalyzer",
"description": "Uncovers the technologies used on websites", "description": "Uncovers the technologies used on websites",
"homepage": "https://github.com/AliasIO/Wappalyzer", "homepage": "https://github.com/AliasIO/Wappalyzer",
"version": "5.4.7", "version": "5.4.8",
"author": "Elbert Alias", "author": "Elbert Alias",
"license": "GPL-3.0", "license": "GPL-3.0",
"repository": { "repository": {

@ -11,6 +11,7 @@ var tabCache = {};
var headersCache = {}; var headersCache = {};
var categoryOrder = []; var categoryOrder = [];
var options = {}; var options = {};
var robotsTxtQueue = {};
browser.tabs.onRemoved.addListener(tabId => { browser.tabs.onRemoved.addListener(tabId => {
tabCache[tabId] = null; tabCache[tabId] = null;
@ -287,7 +288,13 @@ wappalyzer.driver.displayApps = (detected, meta, context) => {
* Fetch and cache robots.txt for host * Fetch and cache robots.txt for host
*/ */
wappalyzer.driver.getRobotsTxt = (host, secure = false) => { wappalyzer.driver.getRobotsTxt = (host, secure = false) => {
return new Promise((resolve, reject) => { if ( robotsTxtQueue.hasOwnProperty(host) ) {
wappalyzer.log('robotTxt fetch already in queue');
return robotsTxtQueue[host];
}
robotsTxtQueue[host] = new Promise((resolve, reject) => {
getOption('tracking', true) getOption('tracking', true)
.then(tracking => { .then(tracking => {
if ( !tracking ) { if ( !tracking ) {
@ -299,34 +306,31 @@ wappalyzer.driver.getRobotsTxt = (host, secure = false) => {
robotsTxtCache = robotsTxtCache || {}; robotsTxtCache = robotsTxtCache || {};
if ( host in robotsTxtCache ) { if ( host in robotsTxtCache ) {
resolve(robotsTxtCache[host]); return resolve(robotsTxtCache[host]);
} else {
const url = 'http' + ( secure ? 's' : '' ) + '://' + host + '/robots.txt';
fetch('http' + ( secure ? 's' : '' ) + '://' + host + '/robots.txt')
.then(response => {
if ( !response.ok ) {
if ( response.status === 404 ) {
return '';
} else {
throw 'GET ' + response.url + ' was not ok';
}
}
return response.text();
})
.then(robotsTxt => {
robotsTxtCache[host] = wappalyzer.parseRobotsTxt(robotsTxt);
setOption('robotsTxtCache', robotsTxtCache);
resolve(robotsTxtCache[host]);
})
.catch(reject);
} }
const timeout = setTimeout(() => resolve([]), 3000);
fetch('http' + ( secure ? 's' : '' ) + '://' + host + '/robots.txt', { redirect: 'follow' })
.then(response => {
clearTimeout(timeout);
return response.ok ? response.text() : '';
})
.then(robotsTxt => {
robotsTxtCache[host] = wappalyzer.parseRobotsTxt(robotsTxt);
setOption('robotsTxtCache', robotsTxtCache);
resolve(robotsTxtCache[host]);
})
.catch(err => resolve([]));
}); });
}); });
}); })
.finally(() => delete robotsTxtQueue[host]);
return robotsTxtQueue[host];
}; };
/** /**

@ -890,7 +890,7 @@ var exports = {};
var _pageTags; var _pageTags;
var INIT_MS_BW_SEARCHES = 2000; var INIT_MS_BW_SEARCHES = 2000;
var PAGE_TAG_RE = new RegExp('gpt|oascentral'); var PAGE_TAG_RE = new RegExp('gpt|oascentral');
var POST_MSG_ID = '1511804838-25881-9878-26947-14879'; var POST_MSG_ID = '1519242200-10756-12873-1462-13403';
var AD_SERVER_RE = new RegExp('^(google_ads_iframe|oas_frame|atwAdFrame)'); var AD_SERVER_RE = new RegExp('^(google_ads_iframe|oas_frame|atwAdFrame)');
function getPageTags(doc) { function getPageTags(doc) {

@ -4,7 +4,7 @@
"author": "Elbert Alias", "author": "Elbert Alias",
"homepage_url": "https://www.wappalyzer.com", "homepage_url": "https://www.wappalyzer.com",
"description": "Identify web technologies", "description": "Identify web technologies",
"version": "5.4.7", "version": "5.4.8",
"default_locale": "en", "default_locale": "en",
"manifest_version": 2, "manifest_version": 2,
"icons": { "icons": {

@ -19,7 +19,6 @@ class Wappalyzer {
this.categories = {}; this.categories = {};
this.driver = {}; this.driver = {};
this.jsPatterns = {}; this.jsPatterns = {};
this.detected = {}; this.detected = {};
this.hostnameCache = {}; this.hostnameCache = {};
this.adCache = []; this.adCache = [];
@ -139,12 +138,12 @@ class Wappalyzer {
this.driver.getRobotsTxt(parsed.host, parsed.protocol === 'https:') this.driver.getRobotsTxt(parsed.host, parsed.protocol === 'https:')
.then(robotsTxt => { .then(robotsTxt => {
if (robotsTxt.some(disallowedPath => parsed.pathname.indexOf(disallowedPath) === 0)) { if ( robotsTxt.some(disallowedPath => parsed.pathname.indexOf(disallowedPath) === 0) ) {
return reject(); return reject();
} else {
return resolve();
} }
});
return resolve();
}, () => resolve());
}); });
}; };

Loading…
Cancel
Save