From f35c7ff3e721c445e66f61b74b046ba506024cb4 Mon Sep 17 00:00:00 2001 From: Elbert Alias <77259+AliasIO@users.noreply.github.com> Date: Fri, 23 Feb 2018 14:34:45 +1100 Subject: [PATCH] Add robots.txt queuing --- src/drivers/npm/package.json | 2 +- src/drivers/webextension/js/driver.js | 56 ++++++++++++++------------ src/drivers/webextension/js/iframe.js | 2 +- src/drivers/webextension/manifest.json | 2 +- src/wappalyzer.js | 9 ++--- 5 files changed, 37 insertions(+), 34 deletions(-) diff --git a/src/drivers/npm/package.json b/src/drivers/npm/package.json index f6b5cf37c..473839445 100644 --- a/src/drivers/npm/package.json +++ b/src/drivers/npm/package.json @@ -2,7 +2,7 @@ "name": "wappalyzer", "description": "Uncovers the technologies used on websites", "homepage": "https://github.com/AliasIO/Wappalyzer", - "version": "5.4.7", + "version": "5.4.8", "author": "Elbert Alias", "license": "GPL-3.0", "repository": { diff --git a/src/drivers/webextension/js/driver.js b/src/drivers/webextension/js/driver.js index 198d67074..c7d550537 100644 --- a/src/drivers/webextension/js/driver.js +++ b/src/drivers/webextension/js/driver.js @@ -11,6 +11,7 @@ var tabCache = {}; var headersCache = {}; var categoryOrder = []; var options = {}; +var robotsTxtQueue = {}; browser.tabs.onRemoved.addListener(tabId => { tabCache[tabId] = null; @@ -287,7 +288,13 @@ wappalyzer.driver.displayApps = (detected, meta, context) => { * Fetch and cache robots.txt for host */ wappalyzer.driver.getRobotsTxt = (host, secure = false) => { - return new Promise((resolve, reject) => { + if ( robotsTxtQueue.hasOwnProperty(host) ) { + wappalyzer.log('robotTxt fetch already in queue'); + + return robotsTxtQueue[host]; + } + + robotsTxtQueue[host] = new Promise((resolve, reject) => { getOption('tracking', true) .then(tracking => { if ( !tracking ) { @@ -299,34 +306,31 @@ wappalyzer.driver.getRobotsTxt = (host, secure = false) => { robotsTxtCache = robotsTxtCache || {}; if ( host in robotsTxtCache ) { - resolve(robotsTxtCache[host]); - } else { - const url = 'http' + ( secure ? 's' : '' ) + '://' + host + '/robots.txt'; - - fetch('http' + ( secure ? 's' : '' ) + '://' + host + '/robots.txt') - .then(response => { - if ( !response.ok ) { - if ( response.status === 404 ) { - return ''; - } else { - throw 'GET ' + response.url + ' was not ok'; - } - } - - return response.text(); - }) - .then(robotsTxt => { - robotsTxtCache[host] = wappalyzer.parseRobotsTxt(robotsTxt); - - setOption('robotsTxtCache', robotsTxtCache); - - resolve(robotsTxtCache[host]); - }) - .catch(reject); + return resolve(robotsTxtCache[host]); } + + const timeout = setTimeout(() => resolve([]), 3000); + + fetch('http' + ( secure ? 's' : '' ) + '://' + host + '/robots.txt', { redirect: 'follow' }) + .then(response => { + clearTimeout(timeout); + + return response.ok ? response.text() : ''; + }) + .then(robotsTxt => { + robotsTxtCache[host] = wappalyzer.parseRobotsTxt(robotsTxt); + + setOption('robotsTxtCache', robotsTxtCache); + + resolve(robotsTxtCache[host]); + }) + .catch(err => resolve([])); }); }); - }); + }) + .finally(() => delete robotsTxtQueue[host]); + + return robotsTxtQueue[host]; }; /** diff --git a/src/drivers/webextension/js/iframe.js b/src/drivers/webextension/js/iframe.js index 2370a3eda..883ebea71 100644 --- a/src/drivers/webextension/js/iframe.js +++ b/src/drivers/webextension/js/iframe.js @@ -890,7 +890,7 @@ var exports = {}; var _pageTags; var INIT_MS_BW_SEARCHES = 2000; var PAGE_TAG_RE = new RegExp('gpt|oascentral'); - var POST_MSG_ID = '1511804838-25881-9878-26947-14879'; + var POST_MSG_ID = '1519242200-10756-12873-1462-13403'; var AD_SERVER_RE = new RegExp('^(google_ads_iframe|oas_frame|atwAdFrame)'); function getPageTags(doc) { diff --git a/src/drivers/webextension/manifest.json b/src/drivers/webextension/manifest.json index de6e15489..6accf3520 100644 --- a/src/drivers/webextension/manifest.json +++ b/src/drivers/webextension/manifest.json @@ -4,7 +4,7 @@ "author": "Elbert Alias", "homepage_url": "https://www.wappalyzer.com", "description": "Identify web technologies", - "version": "5.4.7", + "version": "5.4.8", "default_locale": "en", "manifest_version": 2, "icons": { diff --git a/src/wappalyzer.js b/src/wappalyzer.js index d2937d212..d95f90154 100644 --- a/src/wappalyzer.js +++ b/src/wappalyzer.js @@ -19,7 +19,6 @@ class Wappalyzer { this.categories = {}; this.driver = {}; this.jsPatterns = {}; - this.detected = {}; this.hostnameCache = {}; this.adCache = []; @@ -139,12 +138,12 @@ class Wappalyzer { this.driver.getRobotsTxt(parsed.host, parsed.protocol === 'https:') .then(robotsTxt => { - if (robotsTxt.some(disallowedPath => parsed.pathname.indexOf(disallowedPath) === 0)) { + if ( robotsTxt.some(disallowedPath => parsed.pathname.indexOf(disallowedPath) === 0) ) { return reject(); - } else { - return resolve(); } - }); + + return resolve(); + }, () => resolve()); }); };