Add probe option, remove languagedetect dependency, emit additional events

main
Elbert Alias 5 years ago
parent 0f3066a1f1
commit fbf7ce3a68

@ -97,6 +97,19 @@
}
]
},
"robots": {
"oneOf": [
{
"type": "array",
"items": {
"$ref": "#/definitions/non-empty-non-blank-string"
}
},
{
"$ref": "#/definitions/non-empty-non-blank-string"
}
]
},
"excludes": {
"oneOf": [
{

@ -31,6 +31,7 @@ wappalyzer <url> [options]
-m, --max-urls=... Exit when num URLs have been analysed
-w, --max-wait=... Wait no more than ms milliseconds for page resources to load
-P, --pretty Pretty-print JSON output
-p, --probe Perform a deeper scan by requesting common files
-r, --recursive Follow links on pages (crawler)
-a, --user-agent=... Set the user agent string
```
@ -59,6 +60,7 @@ const options = {
maxUrls: 10,
maxWait: 5000,
recursive: true,
probe: true,
userAgent: 'Wappalyzer',
htmlMaxCols: 2000,
htmlMaxRows: 2000,
@ -117,3 +119,16 @@ const urls = ['https://www.wappalyzer.com', 'https://www.example.com']
await wappalyzer.destroy()
})()
```
### Events
Listen to events with `site.on(eventName, callback)`. Use the `page` parameter to access the Puppeteer page instance ([reference](https://github.com/puppeteer/puppeteer/blob/main/docs/api.md#class-page)).
| Event | Parameters | Description |
|-------------|--------------------------------|------------------------------------------|
| `log` | `message`, `source` | Debug messages |
| `error` | `message`, `source` | Error messages |
| `request` | `page`, `request` | Emitted at the start of a request |
| `response` | `page`, `response` | Emitted upon receiving a server response |
| `goto` | `page`, `url`, `html`, `cookies`, `scripts`, `meta`, `js`, `language`, `links` | Emitted after a page has been analysed |
| `analyze` | `urls`, `technologies`, `meta` | Emitted when the site has been analysed |

@ -1,7 +1,8 @@
const { URL } = require('url')
const fs = require('fs')
const path = require('path')
const LanguageDetect = require('languagedetect')
const http = require('http')
const https = require('https')
const Wappalyzer = require('./wappalyzer')
const {
@ -34,10 +35,6 @@ if (AWS_LAMBDA_FUNCTION_NAME) {
puppeteer = require('puppeteer')
}
const languageDetect = new LanguageDetect()
languageDetect.setLanguageType('iso2')
const extensions = /^([^.]+$|\.(asp|aspx|cgi|htm|html|jsp|php)$)/
const { apps: technologies, categories } = JSON.parse(
@ -64,6 +61,32 @@ function analyzeJs(js) {
)
}
/**
 * Fetch a URL over HTTP(S) and resolve with the response body decoded as UTF-8.
 *
 * @param {URL} url - Parsed URL; only the `http:` and `https:` protocols are supported.
 * @returns {Promise<string>} Resolves with the complete response body. Rejects
 *   when the status code is 400 or higher, when the request itself fails
 *   (e.g. the connection cannot be established) or when the response stream
 *   emits an error.
 * @throws {Error} Synchronously, when the protocol is neither `http:` nor `https:`.
 */
function get(url) {
  if (!['http:', 'https:'].includes(url.protocol)) {
    throw new Error(`Invalid protocol: ${url.protocol}`)
  }

  const { get: request } = url.protocol === 'http:' ? http : https

  return new Promise((resolve, reject) => {
    request(url.href, (response) => {
      if (response.statusCode >= 400) {
        // Discard the unread body so the underlying socket is released
        response.resume()

        return reject(
          new Error(`${response.statusCode} ${response.statusMessage}`)
        )
      }

      response.setEncoding('utf8')

      let body = ''

      response.on('data', (data) => (body += data))
      response.on('error', (error) => reject(new Error(error.message)))
      response.on('end', () => resolve(body))
    }).on('error', (error) =>
      // Without a listener, a request-level failure is an unhandled 'error'
      // event that takes down the process instead of rejecting the promise
      reject(new Error(error.message))
    )
  })
}
class Driver {
constructor(options = {}) {
this.options = {
@ -74,16 +97,16 @@ class Driver {
htmlMaxRows: 3000,
maxDepth: 3,
maxUrls: 10,
maxWait: 5000,
maxWait: 30000,
recursive: false,
probe: false,
...options
}
this.options.debug = Boolean(+this.options.debug)
this.options.recursive = Boolean(+this.options.recursive)
this.options.delay = this.options.recursive
? parseInt(this.options.delay, 10)
: 0
this.options.probe = Boolean(+this.options.probe)
this.options.delay = parseInt(this.options.delay, 10)
this.options.maxDepth = parseInt(this.options.maxDepth, 10)
this.options.maxUrls = parseInt(this.options.maxUrls, 10)
this.options.maxWait = parseInt(this.options.maxWait, 10)
@ -161,7 +184,6 @@ class Site {
this.analyzedUrls = {}
this.detections = []
this.language = ''
this.listeners = {}
@ -191,7 +213,9 @@ class Site {
emit(event, params) {
if (this.listeners[event]) {
this.listeners[event].forEach((listener) => listener(params))
return Promise.all(
this.listeners[event].map((listener) => listener(params))
)
}
}
@ -230,15 +254,13 @@ class Site {
await page.setRequestInterception(true)
page.on('console', (msg) => console.log('PAGE LOG:', msg._text))
page.on('dialog', (dialog) => dialog.dismiss())
page.on('error', (error) => this.error(error))
let responseReceived = false
page.on('request', (request) => {
page.on('request', async (request) => {
try {
if (
(responseReceived && request.isNavigationRequest()) ||
@ -252,6 +274,8 @@ class Site {
...this.options.headers
}
await this.emit('request', { page, request })
request.continue({ headers })
}
} catch (error) {
@ -259,7 +283,7 @@ class Site {
}
})
page.on('response', (response) => {
page.on('response', async (response) => {
try {
if (response.url() === url.href) {
this.analyzedUrls[url.href] = {
@ -288,6 +312,8 @@ class Site {
responseReceived = true
this.onDetect(analyze({ headers }))
await this.emit('response', { page, response })
}
}
} catch (error) {
@ -440,34 +466,6 @@ class Site {
throw new Error('No response from server')
}
if (!this.language) {
this.language = await Promise.race([
this.timeout(),
(
await page.evaluateHandle(
() =>
document.documentElement.getAttribute('lang') ||
document.documentElement.getAttribute('xml:lang')
)
).jsonValue()
])
}
if (!this.language) {
try {
const [attrs] = languageDetect.detect(
html.replace(/<\/?[^>]+(>|$)/gs, ' '),
1
)
if (attrs) {
;[this.language] = attrs
}
} catch (error) {
this.error(error)
}
}
this.onDetect(analyzeJs(js))
this.onDetect(
@ -503,12 +501,21 @@ class Site {
[]
)
await this.emit('goto', {
page,
url,
html,
cookies,
scripts,
meta,
js,
links: reducedLinks
})
await page.close()
this.log('Page closed')
this.emit('goto', url)
return reducedLinks
} catch (error) {
this.error(error)
@ -517,7 +524,13 @@ class Site {
async analyze(url = this.originalUrl, index = 1, depth = 1) {
try {
await sleep(this.options.delay * index)
if (this.recursive) {
await sleep(this.options.delay * index)
}
if (this.options.probe) {
await this.probe(url)
}
const links = await this.goto(url)
@ -533,7 +546,7 @@ class Site {
this.error(error)
}
return {
const results = {
urls: this.analyzedUrls,
technologies: resolve(this.detections).map(
({
@ -559,9 +572,32 @@ class Site {
name
}))
})
),
meta: {
language: this.language
)
}
await this.emit('analyze', results)
return results
}
async probe(url) {
const files = {
robots: '/robots.txt'
}
for (const file of Object.keys(files)) {
const path = files[file]
try {
await sleep(this.options.delay)
const body = await get(new URL(path, url.href))
this.log(`get ${path}: ok`)
this.onDetect(analyze({ [file]: body }))
} catch (error) {
this.error(`get ${path}: ${error.message || error}`)
}
}
}
@ -580,7 +616,7 @@ class Site {
await this.batch(links, depth, batch + 1)
}
onDetect(detections = [], language) {
onDetect(detections = []) {
this.detections = this.detections.concat(detections)
this.detections.filter(

@ -35,7 +35,6 @@
"wappalyzer": "./cli.js"
},
"dependencies": {
"languagedetect": "^2.0.0",
"puppeteer": "^2.0.0"
}
}
}

@ -180,7 +180,7 @@ const Wappalyzer = {
* Initialize analyzation.
* @param {*} param0
*/
analyze({ url, html, meta, headers, cookies, scripts }) {
analyze({ url, html, robots, meta, headers, cookies, scripts }) {
const oo = Wappalyzer.analyzeOneToOne
const om = Wappalyzer.analyzeOneToMany
const mm = Wappalyzer.analyzeManyToMany
@ -193,6 +193,7 @@ const Wappalyzer = {
flatten([
oo(technology, 'url', url),
oo(technology, 'html', html),
oo(technology, 'robots', robots),
om(technology, 'scripts', scripts),
mm(technology, 'cookies', cookies),
mm(technology, 'meta', meta),
@ -219,6 +220,7 @@ const Wappalyzer = {
cats,
url,
html,
robots,
meta,
headers,
cookies,
@ -239,6 +241,7 @@ const Wappalyzer = {
headers: transform(headers),
cookies: transform(cookies),
html: transform(html),
robots: transform(robots),
meta: transform(meta),
scripts: transform(script),
js: transform(js, true),