From 15c2ca8ca39ce6f49626fa0c61305b781f46737a Mon Sep 17 00:00:00 2001 From: Elbert Alias <77259+AliasIO@users.noreply.github.com> Date: Wed, 28 Dec 2022 10:28:16 +1100 Subject: [PATCH] Implement extended probe feature --- README.md | 11 ++++ src/drivers/npm/cli.js | 36 ++++++------- src/drivers/npm/driver.js | 46 ++++++++++++++--- src/technologies/m.json | 4 +- src/technologies/s.json | 7 ++- src/wappalyzer.js | 106 ++++++++++++++++++-------------------- 6 files changed, 125 insertions(+), 85 deletions(-) diff --git a/README.md b/README.md index 6204affb3..177419a94 100644 --- a/README.md +++ b/README.md @@ -100,6 +100,9 @@ Patterns (regular expressions) are kept in [`src/technologies/`](https://github. "meta": { "generator": "(?:Example|Another Example)" }, + "probe": { + "/path": "" + }, "scriptSrc": "example-([0-9.]+)\\.js\\;confidence:50\\;version:\\1", "scripts": "function webpackJsonpCallback\\(data\\) {", "url": "example\\.com", @@ -367,6 +370,14 @@ Plus any of: "\\.example-class" + + probe + Object + + Request a URL to test for its existence or match text content (NPM driver only). + + { "/path": "Example text" } + robots String | Array diff --git a/src/drivers/npm/cli.js b/src/drivers/npm/cli.js index bda0ad5ed..36dc896bb 100755 --- a/src/drivers/npm/cli.js +++ b/src/drivers/npm/cli.js @@ -72,24 +72,24 @@ Examples: docker wappalyzer/cli https://www.example.com --pretty Options: - -b, --batch-size=... Process links in batches - -d, --debug Output debug messages - -t, --delay=ms Wait for ms milliseconds between requests - -h, --help This text - -H, --header Extra header to send with requests - --html-max-cols=... Limit the number of HTML characters per line processed - --html-max-rows=... Limit the number of HTML lines processed - -D, --max-depth=... Don't analyse pages more than num levels deep - -m, --max-urls=... Exit when num URLs have been analysed - -w, --max-wait=... 
Wait no more than ms milliseconds for page resources to load - -p, --probe Perform a deeper scan by performing additional requests and inspecting DNS records - -P, --pretty Pretty-print JSON output - --proxy=... Proxy URL, e.g. 'http://user:pass@proxy:8080' - -r, --recursive Follow links on pages (crawler) - -a, --user-agent=... Set the user agent string - -n, --no-scripts Disabled JavaScript on web pages - -N, --no-redirect Disable cross-domain redirects - -e, --extended Output additional information + -b, --batch-size=... Process links in batches + -d, --debug Output debug messages + -t, --delay=ms Wait for ms milliseconds between requests + -h, --help This text + -H, --header Extra header to send with requests + --html-max-cols=... Limit the number of HTML characters per line processed + --html-max-rows=... Limit the number of HTML lines processed + -D, --max-depth=... Don't analyse pages more than num levels deep + -m, --max-urls=... Exit when num URLs have been analysed + -w, --max-wait=... Wait no more than ms milliseconds for page resources to load + -p, --probe=[basic|full] Perform a deeper scan by performing additional requests and inspecting DNS records + -P, --pretty Pretty-print JSON output + --proxy=... Proxy URL, e.g. 'http://user:pass@proxy:8080' + -r, --recursive Follow links on pages (crawler) + -a, --user-agent=... Set the user agent string + -n, --no-scripts Disabled JavaScript on web pages + -N, --no-redirect Disable cross-domain redirects + -e, --extended Output additional information `) process.exit(options.help ? 
0 : 1) } diff --git a/src/drivers/npm/driver.js b/src/drivers/npm/driver.js index 5f0e73602..bd84ef2a7 100644 --- a/src/drivers/npm/driver.js +++ b/src/drivers/npm/driver.js @@ -289,7 +289,7 @@ function get(url, options = {}) { }, }, (response) => { - if (response.statusCode >= 400) { + if (response.statusCode >= 300) { return reject( new Error(`${response.statusCode} ${response.statusMessage}`) ) @@ -337,7 +337,12 @@ class Driver { this.options.debug = Boolean(+this.options.debug) this.options.recursive = Boolean(+this.options.recursive) - this.options.probe = Boolean(+this.options.probe) + this.options.probe = + String(this.options.probe || '').toLowerCase() === 'basic' + ? 'basic' + : String(this.options.probe || '').toLowerCase() === 'full' + ? 'full' + : Boolean(+this.options.probe) && 'full' this.options.delay = parseInt(this.options.delay, 10) this.options.maxDepth = parseInt(this.options.maxDepth, 10) this.options.maxUrls = parseInt(this.options.maxUrls, 10) @@ -1153,8 +1158,25 @@ class Site { } async probe(url) { - const files = { - robots: '/robots.txt', + const paths = [ + { + type: 'robots', + path: '/robots.txt', + }, + ] + + if (this.options.probe === 'full') { + Wappalyzer.technologies + .filter(({ probe }) => Object.keys(probe).length) + .forEach((technology) => { + paths.push( + ...Object.keys(technology.probe).map((path) => ({ + type: 'probe', + path, + technology, + })) + ) + }) } // DNS @@ -1180,9 +1202,7 @@ class Site { await Promise.allSettled([ // Static files - ...Object.keys(files).map(async (file, index) => { - const path = files[file] - + ...paths.map(async ({ type, path, technology }, index) => { try { await sleep(this.options.delay * index) @@ -1193,7 +1213,17 @@ class Site { this.log(`Probe ok (${path})`) - await this.onDetect(url, analyze({ [file]: body.slice(0, 100000) })) + const text = body.slice(0, 100000) + + await this.onDetect( + url, + analyze( + { + [type]: path ? 
{ [path]: [text] } : text, + }, + technology && [technology] + ) + ) } catch (error) { this.error(`Probe failed (${path}): ${error.message || error}`) } diff --git a/src/technologies/m.json b/src/technologies/m.json index 17c801dff..003b4082b 100644 --- a/src/technologies/m.json +++ b/src/technologies/m.json @@ -303,7 +303,7 @@ }, "oss": true, "probe": { - "/magento_version": "Magento/([0-9.]+)\\;version:\\1" + "/magento_version": "" }, "scriptSrc": [ "js/mage", @@ -3162,4 +3162,4 @@ ], "website": "https://code.google.com/p/modwsgi" } -} \ No newline at end of file +} diff --git a/src/technologies/s.json b/src/technologies/s.json index 020ebbb04..206244a8f 100644 --- a/src/technologies/s.json +++ b/src/technologies/s.json @@ -3470,13 +3470,16 @@ "img[src^='/-/media/']", "img[src*='/~/media/.+\\.ashx']" ], + "probe": { + "/layouts/System/VisitorIdentification.aspx": "" + }, "icon": "Sitecore.svg", "pricing": [ "poa", "recurring", "high" ], - "requires": "Microsoft ASP.NET", + "implies": "Microsoft ASP.NET", "saas": true, "website": "https://www.sitecore.com/" }, @@ -6698,4 +6701,4 @@ }, "website": "https://styled-components.com" } -} \ No newline at end of file +} diff --git a/src/wappalyzer.js b/src/wappalyzer.js index 68350b086..7c168ea64 100644 --- a/src/wappalyzer.js +++ b/src/wappalyzer.js @@ -298,19 +298,20 @@ const Wappalyzer = { const mm = Wappalyzer.analyzeManyToMany const relations = { - url: oo, - xhr: oo, - html: oo, - text: oo, - scripts: oo, - css: oo, - robots: oo, certIssuer: oo, - scriptSrc: om, cookies: mm, - meta: mm, - headers: mm, + css: oo, dns: mm, + headers: mm, + html: oo, + meta: mm, + probe: mm, + robots: oo, + scriptSrc: om, + scripts: oo, + text: oo, + url: oo, + xhr: oo, } try { @@ -344,82 +345,77 @@ const Wappalyzer = { Wappalyzer.technologies = Object.keys(data).reduce((technologies, name) => { const { cats, + certIssuer, + cookies, + cpe, + css, description, - url, - xhr, + dns, dom, - html, - text, - scripts, - css, - robots, - 
meta, + excludes, headers, - dns, - certIssuer, - cookies, - scriptSrc, - js, + html, + icon, implies, - excludes, + js, + meta, + pricing, + probe, requires, requiresCategory, - icon, + robots, + scriptSrc, + scripts, + text, + url, website, - pricing, - cpe, + xhr, } = data[name] technologies.push({ - name, - description: description || null, categories: cats || [], - slug: Wappalyzer.slugify(name), - url: transform(url), - xhr: transform(xhr), - headers: transform(headers), - dns: transform(dns), + certIssuer: transform(certIssuer), cookies: transform(cookies), + cpe: cpe || null, + css: transform(css), + description: description || null, + dns: transform(dns), dom: transform( typeof dom === 'string' || Array.isArray(dom) ? toArray(dom).reduce( - (dom, selector) => ({ - ...dom, - [selector]: { exists: '' }, - }), + (dom, selector) => ({ ...dom, [selector]: { exists: '' } }), {} ) : dom, true, false ), + excludes: transform(excludes).map(({ value }) => ({ name: value })), + headers: transform(headers), html: transform(html), - text: transform(text), - scripts: transform(scripts), - css: transform(css), - certIssuer: transform(certIssuer), - robots: transform(robots), - meta: transform(meta), - scriptSrc: transform(scriptSrc), - js: transform(js, true), + icon: icon || 'default.svg', implies: transform(implies).map(({ value, confidence, version }) => ({ name: value, confidence, version, })), - excludes: transform(excludes).map(({ value }) => ({ - name: value, - })), - requires: transform(requires).map(({ value }) => ({ - name: value, - })), + js: transform(js, true), + meta: transform(meta), + name, + pricing: pricing || [], + probe: transform(probe, true), + requires: transform(requires).map(({ value }) => ({ name: value })), requiresCategory: transform(requiresCategory).map(({ value }) => ({ id: value, })), - icon: icon || 'default.svg', + robots: transform(robots), + scriptSrc: transform(scriptSrc), + scripts: transform(scripts), + slug: 
Wappalyzer.slugify(name), + text: transform(text), + url: transform(url), website: website || null, - pricing: pricing || [], - cpe: cpe || null, + xhr: transform(xhr), }) return technologies