Improve parsing of minified HTML

main
Elbert Alias 6 years ago
parent 21ff078ec7
commit a2a18e12b5

@ -2205,7 +2205,7 @@
"criteo_pubtag": "", "criteo_pubtag": "",
"criteo_q": "" "criteo_q": ""
}, },
"script":[ "script": [
"//(?:cas\\.criteo\\.com|(?:[^/]\\.)?criteo\\.net)/", "//(?:cas\\.criteo\\.com|(?:[^/]\\.)?criteo\\.net)/",
"//static.criteo.net/js/ld/ld.js" "//static.criteo.net/js/ld/ld.js"
], ],
@ -3710,7 +3710,7 @@
"_gat": "" "_gat": ""
}, },
"icon": "Google Analytics.svg", "icon": "Google Analytics.svg",
"html": "<amp-analytics [^>]*type=[\"']googleanalytics[\"']", "html": "<amp-analytics [^>]*type=[\"']googleanalytics[\"']",
"js": { "js": {
"GoogleAnalyticsObject": "", "GoogleAnalyticsObject": "",
"gaGlobal": "" "gaGlobal": ""
@ -6767,7 +6767,7 @@
], ],
"headers": { "headers": {
"Server": "nginx(?:/([\\d.]+))?\\;version:\\1", "Server": "nginx(?:/([\\d.]+))?\\;version:\\1",
"X-Fastcgi-Cache": "" "X-Fastcgi-Cache": ""
}, },
"icon": "Nginx.svg", "icon": "Nginx.svg",
"website": "http://nginx.org/en" "website": "http://nginx.org/en"
@ -9704,7 +9704,7 @@
"script": [ "script": [
"^//tags\\.tiqcdn\\.com/", "^//tags\\.tiqcdn\\.com/",
"/tealium/utag\\.js$" "/tealium/utag\\.js$"
], ],
"website": "http://tealium.com" "website": "http://tealium.com"
}, },
"TeamCity": { "TeamCity": {
@ -11234,7 +11234,7 @@
"cats": [ "cats": [
42 42
], ],
"html":"<!-- (?:End )?Yahoo! Tag Manager -->", "html": "<!-- (?:End )?Yahoo! Tag Manager -->",
"script": "b\\.yjtag\\.jp/iframe", "script": "b\\.yjtag\\.jp/iframe",
"icon": "yahoo.png", "icon": "yahoo.png",
"website": "https://tagmanager.yahoo.co.jp/" "website": "https://tagmanager.yahoo.co.jp/"

@ -1,5 +1,3 @@
const url = require('url'); const url = require('url');
const fs = require('fs'); const fs = require('fs');
const path = require('path'); const path = require('path');
@ -77,7 +75,7 @@ class Driver {
chunkSize: 5, chunkSize: 5,
debug: false, debug: false,
delay: 500, delay: 500,
htmlMaxCols: 2000, htmlMaxCols: 200,
htmlMaxRows: 3000, htmlMaxRows: 3000,
maxDepth: 3, maxDepth: 3,
maxUrls: 10, maxUrls: 10,
@ -214,11 +212,14 @@ class Driver {
} }
const headers = getHeaders(browser); const headers = getHeaders(browser);
const html = this.getHtml(browser); const html = this.getHtml(browser)
;//.replace(new RegExp(`(.{${this.options.htmlMaxCols},}[^>]*>)<`, 'g'), (match, p1) => `${p1}\n<`);
const scripts = getScripts(browser); const scripts = getScripts(browser);
const js = this.getJs(browser); const js = this.getJs(browser);
const cookies = getCookies(browser); const cookies = getCookies(browser);
// console.log({ html, foo: html.split('\n').length });
this.wappalyzer.analyze(pageUrl, { this.wappalyzer.analyze(pageUrl, {
headers, headers,
html, html,

@ -1,6 +1,6 @@
{ {
"name": "wappalyzer", "name": "wappalyzer",
"version": "5.5.3", "version": "5.5.4",
"lockfileVersion": 1, "lockfileVersion": 1,
"requires": true, "requires": true,
"dependencies": { "dependencies": {

@ -2,7 +2,7 @@
"name": "wappalyzer", "name": "wappalyzer",
"description": "Uncovers the technologies used on websites", "description": "Uncovers the technologies used on websites",
"homepage": "https://github.com/AliasIO/Wappalyzer", "homepage": "https://github.com/AliasIO/Wappalyzer",
"version": "5.5.4", "version": "5.5.5",
"author": "Elbert Alias", "author": "Elbert Alias",
"license": "GPL-3.0", "license": "GPL-3.0",
"repository": { "repository": {

@ -1,12 +1,25 @@
/** global: browser */ /** global: browser */
/** global: XMLSerializer */ /** global: XMLSerializer */
/* global browser, chrome */
/* eslint-env browser */
/**
 * Send a message through the extension runtime, tagged as coming from
 * this content script.
 *
 * @param {string} id - Message identifier (e.g. 'init', 'analyze').
 * @param {*} subject - Payload sent alongside the identifier.
 * @param {Function} [callback] - Invoked with the response; defaults to a no-op.
 */
function sendMessage(id, subject, callback) {
  // Referencing an undeclared `chrome` global throws a ReferenceError before
  // `||` can fall back, so probe with `typeof` and only then pick the namespace.
  const api = typeof chrome !== 'undefined' && chrome ? chrome : browser;

  api.runtime.sendMessage({
    id,
    subject,
    source: 'content.js',
  }, callback || (() => {}));
}
if (typeof browser !== 'undefined' && typeof document.body !== 'undefined') { if (typeof browser !== 'undefined' && typeof document.body !== 'undefined') {
try { try {
sendMessage('init', {}); sendMessage('init', {});
// HTML // HTML
let html = new XMLSerializer().serializeToString(document).split('\n'); let html = new XMLSerializer().serializeToString(document)
.replace(new RegExp('(.{1000,}[^>]*>)<', 'g'), (match, p1) => `${p1}\n<`)
.split('\n');
html = html html = html
.slice(0, 1000).concat(html.slice(html.length - 1000)) .slice(0, 1000).concat(html.slice(html.length - 1000))
@ -31,14 +44,14 @@ if (typeof browser !== 'undefined' && typeof document.body !== 'undefined') {
return; return;
} }
removeEventListener('message', onMessage); window.removeEventListener('message', onMessage);
sendMessage('analyze', { js: event.data.js }); sendMessage('analyze', { js: event.data.js });
script.remove(); script.remove();
}; };
addEventListener('message', onMessage); window.addEventListener('message', onMessage);
sendMessage('get_js_patterns', {}, (response) => { sendMessage('get_js_patterns', {}, (response) => {
if (response) { if (response) {
@ -58,14 +71,6 @@ if (typeof browser !== 'undefined' && typeof document.body !== 'undefined') {
} }
} }
/**
 * Send a message through the extension runtime, tagged as coming from
 * this content script.
 *
 * @param {string} id - Message identifier (e.g. 'init', 'analyze').
 * @param {*} subject - Payload sent alongside the identifier.
 * @param {Function} [callback] - Invoked with the response; defaults to a no-op.
 */
function sendMessage(id, subject, callback) {
  // Referencing an undeclared `chrome` global throws a ReferenceError before
  // `||` can fall back, so probe with `typeof` and only then pick the namespace.
  const api = typeof chrome !== 'undefined' && chrome ? chrome : browser;

  api.runtime.sendMessage({
    id,
    subject,
    source: 'content.js',
  }, callback || (() => {}));
}
// https://stackoverflow.com/a/44774834 // https://stackoverflow.com/a/44774834
// https://developer.mozilla.org/en-US/Add-ons/WebExtensions/API/tabs/executeScript#Return_value // https://developer.mozilla.org/en-US/Add-ons/WebExtensions/API/tabs/executeScript#Return_value
undefined; undefined; // eslint-disable-line no-unused-expressions

@ -81,7 +81,7 @@
} }
function getFrame(getFrameDetails, callback) { function getFrame(getFrameDetails, callback) {
if (typeof chrome !== 'undefined') { if (isChrome()) {
chrome.webNavigation.getFrame(getFrameDetails, callback); chrome.webNavigation.getFrame(getFrameDetails, callback);
} else if (typeof browser !== 'undefined') { } else if (typeof browser !== 'undefined') {
const gettingFrame = browser.webNavigation.getFrame(getFrameDetails); const gettingFrame = browser.webNavigation.getFrame(getFrameDetails);

@ -4,7 +4,7 @@
"author": "Elbert Alias", "author": "Elbert Alias",
"homepage_url": "https://www.wappalyzer.com", "homepage_url": "https://www.wappalyzer.com",
"description": "Identify web technologies", "description": "Identify web technologies",
"version": "5.5.4", "version": "5.5.5",
"default_locale": "en", "default_locale": "en",
"manifest_version": 2, "manifest_version": 2,
"icons": { "icons": {