Add crawler functionality in NPM driver

main
Elbert Alias 8 years ago
parent 43911f62c5
commit 211c1d659c

@@ -31,7 +31,11 @@ $ node index.js https://www.wappalyzer.com
const options = {
userAgent: 'Wappalyzer',
maxWait: 3000,
debug: false
debug: false,
recursive: true,
maxDepth: 3,
maxUrls: 10,
delay: 500,
};
const wappalyzer = require('wappalyzer')(options);

@@ -3,32 +3,30 @@
const driver = options => {
const Wappalyzer = require('./wappalyzer');
const request = require('request');
const url = require('url');
const fs = require('fs');
const Browser = require('zombie');
const json = JSON.parse(fs.readFileSync(__dirname + '/apps.json'));
return {
analyze: url => {
analyze: pageUrl => {
const origPageUrl = url.parse(pageUrl);
const analyzedPageUrls = [];
const apps = [];
const wappalyzer = new Wappalyzer();
wappalyzer.apps = json.apps;
wappalyzer.categories = json.categories;
return new Promise((resolve, reject) => {
wappalyzer.driver.log = (message, source, type) => {
if ( type === 'error' ) {
return reject(message);
}
if ( Boolean(options.debug) ) {
console.log('[wappalyzer ' + type + ']', '[' + source + ']', message);
}
};
wappalyzer.driver.displayApps = detected => {
var apps = [];
Object.keys(detected).forEach(appName => {
const app = detected[appName];
@@ -42,6 +40,7 @@ const driver = options => {
categories.push(category)
});
if ( !apps.some(detectedApp => detectedApp.name === app.name) ) {
apps.push({
name: app.name,
confidence: app.confidenceTotal.toString(),
@@ -50,21 +49,41 @@ const driver = options => {
website: app.props.website,
categories
});
}
});
resolve(apps);
};
const browser = new Browser({
userAgent: options.userAgent
userAgent: options.userAgent,
waitDuration: options.maxWait + 'ms',
});
browser.visit(url, error => {
const sleep = ms => new Promise(resolve => setTimeout(resolve, ms));
const fetch = (pageUrl, index, depth) => {
return new Promise(async (resolve, reject) => {
// Return when the URL is a duplicate or maxUrls has been reached
if ( analyzedPageUrls.indexOf(pageUrl.href) !== -1 || analyzedPageUrls.length >= options.maxUrls ) {
return resolve();
}
analyzedPageUrls.push(pageUrl.href);
wappalyzer.log('depth: ' + depth + '; delay: ' + ( options.delay * index ) + 'ms; url: ' + pageUrl.href, 'driver');
// Be nice
if ( options.delay ) {
await sleep(options.delay * index);
}
browser.visit(pageUrl.href, error => {
if ( !browser.resources['0'] || !browser.resources['0'].response ) {
return wappalyzer.log('No response from server', 'driver', 'error');
wappalyzer.log('No response from server', 'browser', 'error');
return resolve();
}
browser.wait(options.maxWait)
browser.wait()
.catch(error => wappalyzer.log(error.message, 'browser'))
.finally(() => {
wappalyzer.driver.document = browser.document;
@@ -75,6 +94,7 @@ const driver = options => {
if ( !headers[header[0]] ){
headers[header[0]] = [];
}
headers[header[0]].push(header[1]);
});
@@ -85,17 +105,40 @@ const driver = options => {
.filter(s => s.src)
.map(s => s.src);
const hostname = wappalyzer.parseUrl(url).hostname;
wappalyzer.analyze(hostname, url, {
wappalyzer.analyze(pageUrl.hostname, pageUrl.href, {
headers,
html,
env: vars,
scripts
});
resolve(browser);
});
});
});
};
const crawl = async (pageUrl, index, depth) => {
try {
const browser = await fetch(pageUrl, index, depth);
if ( options.recursive && depth < options.maxDepth && browser ) {
const links = Array.from(browser.body.getElementsByTagName('a')).filter(link => link.hostname === origPageUrl.hostname);
await Promise.all(links.map(async (link, index) => {
link.hash = '';
return crawl(link, index, depth + 1);
}));
}
return Promise.resolve(apps);
} catch (error) {
return Promise.reject(error);
}
};
return crawl(origPageUrl, 1, 1);
}
};
};

@@ -1,9 +1,13 @@
'use strict';
const options = {
userAgent: null,
userAgent: 'Mozilla/5.0 (compatible; Wappalyzer)',
maxWait: 3000,
debug: false
debug: true,
recursive: true,
maxDepth: 3,
maxUrls: 10,
delay: 500,
};
const args = process.argv.slice(2);