Improve parsing of minified HTML

main
Elbert Alias 6 years ago
parent 21ff078ec7
commit a2a18e12b5

@ -2205,7 +2205,7 @@
"criteo_pubtag": "",
"criteo_q": ""
},
"script":[
"script": [
"//(?:cas\\.criteo\\.com|(?:[^/]\\.)?criteo\\.net)/",
"//static.criteo.net/js/ld/ld.js"
],
@ -3710,7 +3710,7 @@
"_gat": ""
},
"icon": "Google Analytics.svg",
"html": "<amp-analytics [^>]*type=[\"']googleanalytics[\"']",
"html": "<amp-analytics [^>]*type=[\"']googleanalytics[\"']",
"js": {
"GoogleAnalyticsObject": "",
"gaGlobal": ""
@ -6767,7 +6767,7 @@
],
"headers": {
"Server": "nginx(?:/([\\d.]+))?\\;version:\\1",
"X-Fastcgi-Cache": ""
"X-Fastcgi-Cache": ""
},
"icon": "Nginx.svg",
"website": "http://nginx.org/en"
@ -9704,7 +9704,7 @@
"script": [
"^//tags\\.tiqcdn\\.com/",
"/tealium/utag\\.js$"
],
],
"website": "http://tealium.com"
},
"TeamCity": {
@ -11234,7 +11234,7 @@
"cats": [
42
],
"html":"<!-- (?:End )?Yahoo! Tag Manager -->",
"html": "<!-- (?:End )?Yahoo! Tag Manager -->",
"script": "b\\.yjtag\\.jp/iframe",
"icon": "yahoo.png",
"website": "https://tagmanager.yahoo.co.jp/"

@ -1,5 +1,3 @@
const url = require('url');
const fs = require('fs');
const path = require('path');
@ -77,7 +75,7 @@ class Driver {
chunkSize: 5,
debug: false,
delay: 500,
htmlMaxCols: 2000,
htmlMaxCols: 200,
htmlMaxRows: 3000,
maxDepth: 3,
maxUrls: 10,
@ -214,11 +212,14 @@ class Driver {
}
const headers = getHeaders(browser);
const html = this.getHtml(browser);
const html = this.getHtml(browser)
;//.replace(new RegExp(`(.{${this.options.htmlMaxCols},}[^>]*>)<`, 'g'), (match, p1) => `${p1}\n<`);
const scripts = getScripts(browser);
const js = this.getJs(browser);
const cookies = getCookies(browser);
// console.log({ html, foo: html.split('\n').length });
this.wappalyzer.analyze(pageUrl, {
headers,
html,

@ -1,6 +1,6 @@
{
"name": "wappalyzer",
"version": "5.5.3",
"version": "5.5.4",
"lockfileVersion": 1,
"requires": true,
"dependencies": {

@ -2,7 +2,7 @@
"name": "wappalyzer",
"description": "Uncovers the technologies used on websites",
"homepage": "https://github.com/AliasIO/Wappalyzer",
"version": "5.5.4",
"version": "5.5.5",
"author": "Elbert Alias",
"license": "GPL-3.0",
"repository": {

@ -1,12 +1,25 @@
/** global: browser */
/** global: XMLSerializer */
/* global browser, chrome */
/* eslint-env browser */
/**
 * Send a message through the WebExtension runtime messaging API.
 *
 * The message is an object of the form `{ id, subject, source: 'content.js' }`.
 *
 * @param {string} id - Message identifier (e.g. 'init', 'analyze', 'get_js_patterns').
 * @param {*} subject - Payload forwarded under the `subject` key.
 * @param {Function} [callback] - Passed to `runtime.sendMessage` as the response
 *   handler; a no-op function is used when omitted.
 */
function sendMessage(id, subject, callback) {
  // A bare `chrome || browser` throws a ReferenceError when `chrome` is not
  // declared at all, so the fallback would never be reached. `typeof` is safe
  // on undeclared identifiers.
  const api = typeof chrome !== 'undefined' && chrome ? chrome : browser;

  api.runtime.sendMessage({
    id,
    subject,
    source: 'content.js',
  }, callback || (() => {}));
}
if (typeof browser !== 'undefined' && typeof document.body !== 'undefined') {
try {
sendMessage('init', {});
// HTML
let html = new XMLSerializer().serializeToString(document).split('\n');
let html = new XMLSerializer().serializeToString(document)
.replace(new RegExp('(.{1000,}[^>]*>)<', 'g'), (match, p1) => `${p1}\n<`)
.split('\n');
html = html
.slice(0, 1000).concat(html.slice(html.length - 1000))
@ -31,14 +44,14 @@ if (typeof browser !== 'undefined' && typeof document.body !== 'undefined') {
return;
}
removeEventListener('message', onMessage);
window.removeEventListener('message', onMessage);
sendMessage('analyze', { js: event.data.js });
script.remove();
};
addEventListener('message', onMessage);
window.addEventListener('message', onMessage);
sendMessage('get_js_patterns', {}, (response) => {
if (response) {
@ -58,14 +71,6 @@ if (typeof browser !== 'undefined' && typeof document.body !== 'undefined') {
}
}
/**
 * Post `{ id, subject, source: 'content.js' }` via `runtime.sendMessage`.
 *
 * @param {string} id - Message identifier.
 * @param {*} subject - Payload sent under the `subject` key.
 * @param {Function} [callback] - Response handler; replaced by a no-op when omitted.
 */
function sendMessage(id, subject, callback) {
  // Guard with `typeof`: referencing an undeclared `chrome` directly would
  // throw a ReferenceError before `browser` could be tried.
  const runtimeHost = (typeof chrome !== 'undefined' && chrome) || browser;
  const onResponse = callback || (() => {});

  runtimeHost.runtime.sendMessage({
    id,
    subject,
    source: 'content.js',
  }, onResponse);
}
// https://stackoverflow.com/a/44774834
// https://developer.mozilla.org/en-US/Add-ons/WebExtensions/API/tabs/executeScript#Return_value
undefined;
undefined; // eslint-disable-line no-unused-expressions

@ -81,7 +81,7 @@
}
function getFrame(getFrameDetails, callback) {
if (typeof chrome !== 'undefined') {
if (isChrome()) {
chrome.webNavigation.getFrame(getFrameDetails, callback);
} else if (typeof browser !== 'undefined') {
const gettingFrame = browser.webNavigation.getFrame(getFrameDetails);

@ -4,7 +4,7 @@
"author": "Elbert Alias",
"homepage_url": "https://www.wappalyzer.com",
"description": "Identify web technologies",
"version": "5.5.4",
"version": "5.5.5",
"default_locale": "en",
"manifest_version": 2,
"icons": {