Add crawler functionality in NPM driver

main
Elbert Alias 8 years ago
parent 43911f62c5
commit 211c1d659c

@ -31,7 +31,11 @@ $ node index.js https://www.wappalyzer.com
const options = { const options = {
userAgent: 'Wappalyzer', userAgent: 'Wappalyzer',
maxWait: 3000, maxWait: 3000,
debug: false debug: false,
recursive: true,
maxDepth: 3,
maxUrls: 10,
delay: 500,
}; };
const wappalyzer = require('wappalyzer')(options); const wappalyzer = require('wappalyzer')(options);

@ -3,45 +3,44 @@
const driver = options => { const driver = options => {
const Wappalyzer = require('./wappalyzer'); const Wappalyzer = require('./wappalyzer');
const request = require('request'); const request = require('request');
const url = require('url');
const fs = require('fs'); const fs = require('fs');
const Browser = require('zombie'); const Browser = require('zombie');
const json = JSON.parse(fs.readFileSync(__dirname + '/apps.json')); const json = JSON.parse(fs.readFileSync(__dirname + '/apps.json'));
return { return {
analyze: url => { analyze: pageUrl => {
const origPageUrl = url.parse(pageUrl);
const analyzedPageUrls = [];
const apps = [];
const wappalyzer = new Wappalyzer(); const wappalyzer = new Wappalyzer();
wappalyzer.apps = json.apps; wappalyzer.apps = json.apps;
wappalyzer.categories = json.categories; wappalyzer.categories = json.categories;
return new Promise((resolve, reject) => { wappalyzer.driver.log = (message, source, type) => {
wappalyzer.driver.log = (message, source, type) => { if ( Boolean(options.debug) ) {
if ( type === 'error' ) { console.log('[wappalyzer ' + type + ']', '[' + source + ']', message);
return reject(message); }
} };
if ( Boolean(options.debug) ) {
console.log('[wappalyzer ' + type + ']', '[' + source + ']', message);
}
};
wappalyzer.driver.displayApps = detected => {
var apps = [];
Object.keys(detected).forEach(appName => { wappalyzer.driver.displayApps = detected => {
const app = detected[appName]; Object.keys(detected).forEach(appName => {
const app = detected[appName];
var categories = []; var categories = [];
app.props.cats.forEach(id => { app.props.cats.forEach(id => {
var category = {}; var category = {};
category[id] = wappalyzer.categories[id].name; category[id] = wappalyzer.categories[id].name;
categories.push(category) categories.push(category)
}); });
if ( !apps.some(detectedApp => detectedApp.name === app.name) ) {
apps.push({ apps.push({
name: app.name, name: app.name,
confidence: app.confidenceTotal.toString(), confidence: app.confidenceTotal.toString(),
@ -50,52 +49,96 @@ const driver = options => {
website: app.props.website, website: app.props.website,
categories categories
}); });
}); }
});
};
resolve(apps); const browser = new Browser({
}; userAgent: options.userAgent,
waitDuration: options.maxWait + 'ms',
});
const browser = new Browser({ const sleep = ms => new Promise(resolve => setTimeout(resolve, ms));
userAgent: options.userAgent
});
browser.visit(url, error => { const fetch = (pageUrl, index, depth) => {
if ( !browser.resources['0'] || !browser.resources['0'].response ) { return new Promise(async (resolve, reject) => {
return wappalyzer.log('No response from server', 'driver', 'error'); // Return when the URL is a duplicate or maxUrls has been reached
if ( analyzedPageUrls.indexOf(pageUrl.href) !== -1 || analyzedPageUrls.length >= options.maxUrls ) {
return resolve();
} }
browser.wait(options.maxWait) analyzedPageUrls.push(pageUrl.href);
.catch(error => wappalyzer.log(error.message, 'browser'))
.finally(() => {
wappalyzer.driver.document = browser.document;
const headers = {}; wappalyzer.log('depth: ' + depth + '; delay: ' + ( options.delay * index ) + 'ms; url: ' + pageUrl.href, 'driver');
browser.resources['0'].response.headers._headers.forEach(header => { // Be nice
if ( !headers[header[0]] ){ if ( options.delay ) {
headers[header[0]] = []; await sleep(options.delay * index);
} }
headers[header[0]].push(header[1]);
}); browser.visit(pageUrl.href, error => {
if ( !browser.resources['0'] || !browser.resources['0'].response ) {
wappalyzer.log('No response from server', 'browser', 'error');
const vars = Object.getOwnPropertyNames(browser.window); return resolve();
const html = browser.html(); }
const scripts = Array.prototype.slice
.apply(browser.document.scripts)
.filter(s => s.src)
.map(s => s.src);
const hostname = wappalyzer.parseUrl(url).hostname; browser.wait()
.catch(error => wappalyzer.log(error.message, 'browser'))
.finally(() => {
wappalyzer.driver.document = browser.document;
wappalyzer.analyze(hostname, url, { const headers = {};
headers,
html, browser.resources['0'].response.headers._headers.forEach(header => {
env: vars, if ( !headers[header[0]] ){
scripts headers[header[0]] = [];
}
headers[header[0]].push(header[1]);
});
const vars = Object.getOwnPropertyNames(browser.window);
const html = browser.html();
const scripts = Array.prototype.slice
.apply(browser.document.scripts)
.filter(s => s.src)
.map(s => s.src);
wappalyzer.analyze(pageUrl.hostname, pageUrl.href, {
headers,
html,
env: vars,
scripts
});
resolve(browser);
}); });
}); });
}); });
}); };
const crawl = async (pageUrl, index, depth) => {
try {
const browser = await fetch(pageUrl, index, depth);
if ( options.recursive && depth < options.maxDepth && browser ) {
const links = Array.from(browser.body.getElementsByTagName('a')).filter(link => link.hostname === origPageUrl.hostname);
await Promise.all(links.map(async (link, index) => {
link.hash = '';
return crawl(link, index, depth + 1);
}));
}
return Promise.resolve(apps);
} catch (error) {
return Promise.reject(error);
}
};
return crawl(origPageUrl, 1, 1);
} }
}; };
}; };

@ -1,9 +1,13 @@
'use strict'; 'use strict';
const options = { const options = {
userAgent: null, userAgent: 'Mozilla/5.0 (compatible; Wappalyzer)',
maxWait: 3000, maxWait: 3000,
debug: false debug: true,
recursive: true,
maxDepth: 3,
maxUrls: 10,
delay: 500,
}; };
const args = process.argv.slice(2); const args = process.argv.slice(2);