Add probe option, remove languagedetect dependency, emit additional events

main
Elbert Alias 4 years ago
parent 0f3066a1f1
commit fbf7ce3a68

@ -97,6 +97,19 @@
} }
] ]
}, },
"robots": {
"oneOf": [
{
"type": "array",
"items": {
"$ref": "#/definitions/non-empty-non-blank-string"
}
},
{
"$ref": "#/definitions/non-empty-non-blank-string"
}
]
},
"excludes": { "excludes": {
"oneOf": [ "oneOf": [
{ {

@ -31,6 +31,7 @@ wappalyzer <url> [options]
-m, --max-urls=... Exit when num URLs have been analysed -m, --max-urls=... Exit when num URLs have been analysed
-w, --max-wait=... Wait no more than ms milliseconds for page resources to load -w, --max-wait=... Wait no more than ms milliseconds for page resources to load
-P, --pretty Pretty-print JSON output -P, --pretty Pretty-print JSON output
-p, --probe Perform a deeper scan by requesting common files
-r, --recursive Follow links on pages (crawler) -r, --recursive Follow links on pages (crawler)
-a, --user-agent=... Set the user agent string -a, --user-agent=... Set the user agent string
``` ```
@ -59,6 +60,7 @@ const options = {
maxUrls: 10, maxUrls: 10,
maxWait: 5000, maxWait: 5000,
recursive: true, recursive: true,
probe: true,
userAgent: 'Wappalyzer', userAgent: 'Wappalyzer',
htmlMaxCols: 2000, htmlMaxCols: 2000,
htmlMaxRows: 2000, htmlMaxRows: 2000,
@ -117,3 +119,16 @@ const urls = ['https://www.wappalyzer.com', 'https://www.example.com']
await wappalyzer.destroy() await wappalyzer.destroy()
})() })()
``` ```
### Events
Listen to events with `site.on(eventName, callback)`. Use the `page` parameter to access the Puppeteer page instance ([reference](https://github.com/puppeteer/puppeteer/blob/main/docs/api.md#class-page)).
| Event | Parameters | Description |
|-------------|--------------------------------|------------------------------------------|
| `log` | `message`, `source` | Debug messages |
| `error` | `message`, `source` | Error messages |
| `request` | `page`, `request` | Emitted at the start of a request |
| `response` | `page`, `request` | Emitted upon receiving a server response |
| `goto` | `page`, `url`, `html`, `cookies`, `scripts`, `meta`, `js`, `language`, `links` | Emitted after a page has been analysed |
| `analyze` | `urls`, `technologies`, `meta` | Emitted when the site has been analysed |

@ -1,7 +1,8 @@
const { URL } = require('url') const { URL } = require('url')
const fs = require('fs') const fs = require('fs')
const path = require('path') const path = require('path')
const LanguageDetect = require('languagedetect') const http = require('http')
const https = require('https')
const Wappalyzer = require('./wappalyzer') const Wappalyzer = require('./wappalyzer')
const { const {
@ -34,10 +35,6 @@ if (AWS_LAMBDA_FUNCTION_NAME) {
puppeteer = require('puppeteer') puppeteer = require('puppeteer')
} }
const languageDetect = new LanguageDetect()
languageDetect.setLanguageType('iso2')
const extensions = /^([^.]+$|\.(asp|aspx|cgi|htm|html|jsp|php)$)/ const extensions = /^([^.]+$|\.(asp|aspx|cgi|htm|html|jsp|php)$)/
const { apps: technologies, categories } = JSON.parse( const { apps: technologies, categories } = JSON.parse(
@ -64,6 +61,32 @@ function analyzeJs(js) {
) )
} }
/**
 * Fetch a URL over HTTP(S) and resolve with the response body as a string.
 *
 * @param {URL} url - Parsed URL; only the `http:` and `https:` protocols are supported.
 * @returns {Promise<string>} Resolves with the UTF-8 decoded body once the response ends.
 *   Rejects when the status code is 400 or above, or when the request or the
 *   response stream emits an error.
 * @throws {Error} Synchronously, when the protocol is neither `http:` nor `https:`.
 */
function get(url) {
  if (!['http:', 'https:'].includes(url.protocol)) {
    throw new Error(`Invalid protocol: ${url.protocol}`)
  }

  // Renamed on destructuring so it does not shadow this function
  const { get: request } = url.protocol === 'http:' ? http : https

  return new Promise((resolve, reject) => {
    request(url.href, (response) => {
      if (response.statusCode >= 400) {
        // Discard the unread body so the underlying socket is released
        response.resume()

        return reject(
          new Error(`${response.statusCode} ${response.statusMessage}`)
        )
      }

      response.setEncoding('utf8')

      let body = ''

      response.on('data', (data) => (body += data))
      response.on('error', (error) => reject(new Error(error.message)))
      response.on('end', () => resolve(body))
    })
      // Connection-level failures (DNS, refused, reset) are emitted on the
      // request, not the response; without a listener they would be thrown as
      // an unhandled 'error' event instead of rejecting the promise
      .on('error', (error) => reject(new Error(error.message)))
  })
}
class Driver { class Driver {
constructor(options = {}) { constructor(options = {}) {
this.options = { this.options = {
@ -74,16 +97,16 @@ class Driver {
htmlMaxRows: 3000, htmlMaxRows: 3000,
maxDepth: 3, maxDepth: 3,
maxUrls: 10, maxUrls: 10,
maxWait: 5000, maxWait: 30000,
recursive: false, recursive: false,
probe: false,
...options ...options
} }
this.options.debug = Boolean(+this.options.debug) this.options.debug = Boolean(+this.options.debug)
this.options.recursive = Boolean(+this.options.recursive) this.options.recursive = Boolean(+this.options.recursive)
this.options.delay = this.options.recursive this.options.probe = Boolean(+this.options.probe)
? parseInt(this.options.delay, 10) this.options.delay = parseInt(this.options.delay, 10)
: 0
this.options.maxDepth = parseInt(this.options.maxDepth, 10) this.options.maxDepth = parseInt(this.options.maxDepth, 10)
this.options.maxUrls = parseInt(this.options.maxUrls, 10) this.options.maxUrls = parseInt(this.options.maxUrls, 10)
this.options.maxWait = parseInt(this.options.maxWait, 10) this.options.maxWait = parseInt(this.options.maxWait, 10)
@ -161,7 +184,6 @@ class Site {
this.analyzedUrls = {} this.analyzedUrls = {}
this.detections = [] this.detections = []
this.language = ''
this.listeners = {} this.listeners = {}
@ -191,7 +213,9 @@ class Site {
emit(event, params) { emit(event, params) {
if (this.listeners[event]) { if (this.listeners[event]) {
this.listeners[event].forEach((listener) => listener(params)) return Promise.all(
this.listeners[event].map((listener) => listener(params))
)
} }
} }
@ -230,15 +254,13 @@ class Site {
await page.setRequestInterception(true) await page.setRequestInterception(true)
page.on('console', (msg) => console.log('PAGE LOG:', msg._text))
page.on('dialog', (dialog) => dialog.dismiss()) page.on('dialog', (dialog) => dialog.dismiss())
page.on('error', (error) => this.error(error)) page.on('error', (error) => this.error(error))
let responseReceived = false let responseReceived = false
page.on('request', (request) => { page.on('request', async (request) => {
try { try {
if ( if (
(responseReceived && request.isNavigationRequest()) || (responseReceived && request.isNavigationRequest()) ||
@ -252,6 +274,8 @@ class Site {
...this.options.headers ...this.options.headers
} }
await this.emit('request', { page, request })
request.continue({ headers }) request.continue({ headers })
} }
} catch (error) { } catch (error) {
@ -259,7 +283,7 @@ class Site {
} }
}) })
page.on('response', (response) => { page.on('response', async (response) => {
try { try {
if (response.url() === url.href) { if (response.url() === url.href) {
this.analyzedUrls[url.href] = { this.analyzedUrls[url.href] = {
@ -288,6 +312,8 @@ class Site {
responseReceived = true responseReceived = true
this.onDetect(analyze({ headers })) this.onDetect(analyze({ headers }))
await this.emit('response', { page, response })
} }
} }
} catch (error) { } catch (error) {
@ -440,34 +466,6 @@ class Site {
throw new Error('No response from server') throw new Error('No response from server')
} }
if (!this.language) {
this.language = await Promise.race([
this.timeout(),
(
await page.evaluateHandle(
() =>
document.documentElement.getAttribute('lang') ||
document.documentElement.getAttribute('xml:lang')
)
).jsonValue()
])
}
if (!this.language) {
try {
const [attrs] = languageDetect.detect(
html.replace(/<\/?[^>]+(>|$)/gs, ' '),
1
)
if (attrs) {
;[this.language] = attrs
}
} catch (error) {
this.error(error)
}
}
this.onDetect(analyzeJs(js)) this.onDetect(analyzeJs(js))
this.onDetect( this.onDetect(
@ -503,12 +501,21 @@ class Site {
[] []
) )
await this.emit('goto', {
page,
url,
html,
cookies,
scripts,
meta,
js,
links: reducedLinks
})
await page.close() await page.close()
this.log('Page closed') this.log('Page closed')
this.emit('goto', url)
return reducedLinks return reducedLinks
} catch (error) { } catch (error) {
this.error(error) this.error(error)
@ -517,7 +524,13 @@ class Site {
async analyze(url = this.originalUrl, index = 1, depth = 1) { async analyze(url = this.originalUrl, index = 1, depth = 1) {
try { try {
if (this.recursive) {
await sleep(this.options.delay * index) await sleep(this.options.delay * index)
}
if (this.options.probe) {
await this.probe(url)
}
const links = await this.goto(url) const links = await this.goto(url)
@ -533,7 +546,7 @@ class Site {
this.error(error) this.error(error)
} }
return { const results = {
urls: this.analyzedUrls, urls: this.analyzedUrls,
technologies: resolve(this.detections).map( technologies: resolve(this.detections).map(
({ ({
@ -559,9 +572,32 @@ class Site {
name name
})) }))
}) })
), )
meta: { }
language: this.language
await this.emit('analyze', results)
return results
}
async probe(url) {
const files = {
robots: '/robots.txt'
}
for (const file of Object.keys(files)) {
const path = files[file]
try {
await sleep(this.options.delay)
const body = await get(new URL(path, url.href))
this.log(`get ${path}: ok`)
this.onDetect(analyze({ [file]: body }))
} catch (error) {
this.error(`get ${path}: ${error.message || error}`)
} }
} }
} }
@ -580,7 +616,7 @@ class Site {
await this.batch(links, depth, batch + 1) await this.batch(links, depth, batch + 1)
} }
onDetect(detections = [], language) { onDetect(detections = []) {
this.detections = this.detections.concat(detections) this.detections = this.detections.concat(detections)
this.detections.filter( this.detections.filter(

@ -35,7 +35,6 @@
"wappalyzer": "./cli.js" "wappalyzer": "./cli.js"
}, },
"dependencies": { "dependencies": {
"languagedetect": "^2.0.0",
"puppeteer": "^2.0.0" "puppeteer": "^2.0.0"
} }
} }

@ -180,7 +180,7 @@ const Wappalyzer = {
* Initialize analyzation. * Initialize analyzation.
* @param {*} param0 * @param {*} param0
*/ */
analyze({ url, html, meta, headers, cookies, scripts }) { analyze({ url, html, robots, meta, headers, cookies, scripts }) {
const oo = Wappalyzer.analyzeOneToOne const oo = Wappalyzer.analyzeOneToOne
const om = Wappalyzer.analyzeOneToMany const om = Wappalyzer.analyzeOneToMany
const mm = Wappalyzer.analyzeManyToMany const mm = Wappalyzer.analyzeManyToMany
@ -193,6 +193,7 @@ const Wappalyzer = {
flatten([ flatten([
oo(technology, 'url', url), oo(technology, 'url', url),
oo(technology, 'html', html), oo(technology, 'html', html),
oo(technology, 'robots', robots),
om(technology, 'scripts', scripts), om(technology, 'scripts', scripts),
mm(technology, 'cookies', cookies), mm(technology, 'cookies', cookies),
mm(technology, 'meta', meta), mm(technology, 'meta', meta),
@ -219,6 +220,7 @@ const Wappalyzer = {
cats, cats,
url, url,
html, html,
robots,
meta, meta,
headers, headers,
cookies, cookies,
@ -239,6 +241,7 @@ const Wappalyzer = {
headers: transform(headers), headers: transform(headers),
cookies: transform(cookies), cookies: transform(cookies),
html: transform(html), html: transform(html),
robots: transform(robots),
meta: transform(meta), meta: transform(meta),
scripts: transform(script), scripts: transform(script),
js: transform(js, true), js: transform(js, true),