NPM driver: add batch processing, protocol check

main
Elbert Alias 7 years ago
parent 49914dd959
commit cb754545df

@ -27,10 +27,11 @@ node index.js [url] [options]
### Options
```
--chunk-size=num Process links in chunks.
--debug=0|1 Output debug messages.
--delay=ms Wait for ms milliseconds between requests.
--max-depth=num Don't analyze pages more than num levels deep.
--max-urls=num Exit when num URLs have been analyzed.
--max-depth=num Don't analyse pages more than num levels deep.
--max-urls=num Exit when num URLs have been analysed.
--max-wait=ms Wait no more than ms milliseconds for page resources to load.
--recursive=0|1 Follow links on pages (crawler).
--user-agent=str Set the user agent string.

@ -13,6 +13,7 @@ const extensions = /^([^.]+$|\.(asp|aspx|cgi|htm|html|jsp|php)$)/;
class Driver {
constructor(pageUrl, options) {
this.options = Object.assign({}, {
chunkSize: 5,
debug: false,
delay: 500,
maxDepth: 3,
@ -133,6 +134,7 @@ class Driver {
});
const links = Array.from(browser.document.getElementsByTagName('a'))
.filter(link => link.protocol === 'http:' || link.protocol === 'https:')
.filter(link => link.hostname === this.origPageUrl.hostname)
.filter(link => extensions.test(link.pathname))
.map(link => { link.hash = ''; return url.parse(link.href) });
@ -256,7 +258,7 @@ class Driver {
return js;
}
crawl(pageUrl, index = 1, depth = 1) {
crawl(pageUrl, index, depth = 1) {
pageUrl.canonical = pageUrl.protocol + '//' + pageUrl.host + pageUrl.pathname;
return new Promise(resolve => {
@ -264,7 +266,7 @@ class Driver {
.catch(() => {})
.then(links => {
if ( links && Boolean(this.options.recursive) && depth < this.options.maxDepth ) {
return Promise.all(links.map((link, index) => this.crawl(link, index + 1, depth + 1)));
return this.chunk(links.slice(0, this.options.maxUrls), depth + 1);
} else {
return Promise.resolve();
}
@ -279,6 +281,20 @@ class Driver {
});
}
chunk(links, depth, chunk = 0) {
if ( links.length === 0 ) {
return Promise.resolve();
}
const chunked = links.splice(0, this.options.chunkSize);
return new Promise(resolve => {
Promise.all(chunked.map((link, index) => this.crawl(link, index, depth)))
.then(() => this.chunk(links, depth, chunk + 1))
.then(() => resolve());
});
}
sleep(ms) {
return ms ? new Promise(resolve => setTimeout(resolve, ms)) : Promise.resolve();
}

@ -28,6 +28,12 @@ while ( arg = args.shift() ) {
const wappalyzer = new Wappalyzer(url, options);
// Watchdog: 10 seconds (10000 ms) after this timer is scheduled, print
// 'force quit' and terminate the process with exit status 1.
// NOTE(review): no clearTimeout is visible in this excerpt — confirm that
// runs finishing in time are not affected, and that writing 'force quit' via
// console.log (stdout) does not interfere with the JSON written to stdout.
setTimeout(() => {
console.log('force quit');
process.exit(1);
}, 10000);
wappalyzer.analyze()
.then(json => {
process.stdout.write(JSON.stringify(json) + '\n')

@ -2,7 +2,7 @@
"name": "wappalyzer",
"description": "Uncovers the technologies used on websites",
"homepage": "https://github.com/AliasIO/Wappalyzer",
"version": "5.4.3",
"version": "5.4.4",
"author": "Elbert Alias",
"license": "GPL-3.0",
"repository": {

@ -70,7 +70,7 @@ function appsToDomTemplate(response) {
'a', {
class: 'detected__app',
target: '_blank',
href: 'https://www.wappalyzer.com/applications/' + slugify(appName)
href: 'https://www.wappalyzer.com/technologies/' + slugify(appName)
}, [
'img', {
class: 'detected__app-icon',

@ -4,7 +4,7 @@
"author": "Elbert Alias",
"homepage_url": "https://www.wappalyzer.com",
"description": "Identify web technologies",
"version": "5.4.3",
"version": "5.4.4",
"default_locale": "en",
"manifest_version": 2,
"icons": {

Loading…
Cancel
Save