Merge pull request #597 from getsitecontrol/master

couple of apps detection fixes
main
Elbert Alias 11 years ago
commit 4df58d1de0

@ -0,0 +1,2 @@
#! /usr/bin/python
# -*- coding: utf-8 -*-

@ -0,0 +1,156 @@
#! /usr/bin/python
# -*- coding: utf-8 -*-
from wappalyzer import Wappalyzer
# Live-site fixtures consumed by test() below.
# Each entry has two keys:
#   'url'  - the page handed to Wappalyzer.analyze()
#   'apps' - the application names test() expects to find in the result
# NOTE(review): these are third-party sites; presumably the expected stacks
# drift over time, so a NOT FOUND may mean the site changed - verify before
# treating it as a detection regression.
TESTS = [
{
'url': 'http://www.hardgraft.com',
'apps': ['jQuery', 'Shopify', 'Nginx']
},
{
'url': 'http://its.bplaced.net',
'apps': ['WordPress', 'jQuery', 'Apache']
},
{
'url': 'http://www.bodybuilding.com/',
'apps': ['jQuery', 'Optimizely', 'SiteCatalyst', 'Apache Tomcat']
},
{
'url': 'http://guidedhelp21.weebly.com/',
'apps': ['Weebly', 'Apache', 'Quantcast', 'Google Analytics', 'jQuery']
},
{
'url': 'http://www.bancadelparque.com/',
'apps': ['Wix', 'Twitter Bootstrap']
},
{
'url': 'http://joomla.ru/',
'apps': ['Joomla', 'jQuery', 'MooTools', 'Yandex.Metrika', 'LiteSpeed']
},
{
'url': 'http://demoshop21.e-stile.ru/',
'apps': ['SiteEdit', 'PHP']
},
{
'url': 'http://umbraco.com',
'apps': ['Umbraco', 'IIS', 'Microsoft ASP.NET']
},
{
'url': 'http://johnsciacca.webs.com/',
'apps': ['Webs', 'RequireJS', 'Site Meter', 'Modernizr']
},
{
'url': 'http://www.1c-bitrix.ru/',
'apps': ['1C-Bitrix', 'Yandex.Metrika']
},
{
'url': 'http://amirocms.com',
'apps': ['Amiro.CMS']
},
{
'url': 'http://dle-news.ru',
'apps': ['DataLife Engine', 'CloudFlare']
},
{
'url': 'http://dotnetnuke.com',
'apps': ['DotNetNuke', 'Microsoft ASP.NET']
},
{
'url': 'http://www.schooldude.com',
'apps': ['DotNetNuke', 'Microsoft ASP.NET']
},
{
'url': 'http://www.sportsdirect.com/',
'apps': ['DotNetNuke', 'Microsoft ASP.NET']
},
{
'url': 'http://drupal.org',
'apps': ['Drupal', 'Varnish']
},
{
'url': 'http://www.komodocms.com/',
'apps': ['Komodo CMS']
},
{
'url': 'http://livestreetcms.com/',
'apps': ['LiveStreet CMS']
},
{
'url': 'http://modxcms.com/',
'apps': ['MODx']
},
{
'url': 'http://modx.ru/',
'apps': ['MODx']
},
{
'url': 'http://revo.modx.ru/',
'apps': ['MODx']
},
{
'url': 'http://www.punchbrand.com',
'apps': ['CS Cart']
},
{
'url': 'http://demo.cs-cart.com/',
'apps': ['CS Cart']
},
{
'url': 'https://livedemo.installatron.com/1404307206magento/',
'apps': ['Magento']
},
{
'url': 'http://livedemo.installatron.com/1404300689prestashop/',
'apps': ['Prestashop']
},
{
'url': 'http://demo.opencart.com/',
'apps': ['OpenCart']
},
{
'url': 'https://livedemo.installatron.com/1404307206oscommerce/',
'apps': ['osCommerce']
},
{
'url': 'http://www.ubercartdemo.com/',
'apps': ['Ubercart']
},
{
'url': 'http://demostore.x-cart.com/',
'apps': ['X-Cart']
},
{
'url': 'https://livedemo.installatron.com/1404307206zencart/',
'apps': ['Zen Cart']
},
{
'url': 'http://oreonfray83.wordpress.com',
'apps': ['WordPress.Com']
},
{
'url': 'http://www.try-phpbb.com/30x/',
'apps': ['phpBB']
},
]
def test():
    """Check every site in TESTS against the live web and print a report.

    For each site the page is analyzed and each expected application is
    reported as ``ok`` (with its confidence) or ``NOT FOUND``.  The whole
    run stops at the first expected application that is not detected.
    Applications that were detected but not expected are listed under
    ``UNEXPECTED APPS``.
    """
    detector = Wappalyzer(datafile_path='../../share/apps.json')
    for entry in TESTS:
        site_url = entry['url']
        print('testing %s ...' % site_url)
        detected = detector.analyze(site_url)
        for expected in entry['apps']:
            # pop() so that whatever remains afterwards is the unexpected set.
            match = detected.pop(expected, None)
            if not match:
                print('\t%s\t- NOT FOUND' % (expected))
                return
            print('\t%s\t- ok\tconfidence=%d' % (expected, match.get_confidence()))
        if detected:
            print('\tUNEXPECTED APPS:')
            for extra_name, extra in detected.iteritems():
                print('\t\t%s\t- ok\tconfidence=%d' % (extra_name, extra.get_confidence()))


if __name__ == '__main__':
    test()

@ -0,0 +1,131 @@
import re
import unittest
import wappalyzer
class FakeUrlopenResponse(object):
    """Canned stand-in for a urlopen response.

    Provides the three things the tests need from a response: a ``url``
    attribute, ``read()`` for the body and ``info().dict`` for the headers.
    """

    def __init__(self, url, html, headers):
        self.url, self.html, self.headers = url, html, headers

    def read(self):
        """Return the canned page body."""
        return self.html

    def info(self):
        """Return an object whose ``dict`` attribute is the canned headers."""
        response = self

        class _Info:
            # Resolved on each access, so it always reflects the current
            # ``headers`` of the response it was created from.
            dict = property(lambda _info: response.headers)

        return _Info()
class WappalyzerCustomTestCase(unittest.TestCase):
    """Offline tests that drive Wappalyzer with in-memory rules and canned responses."""

    def setUp(self):
        # A rule-less instance is all the pattern-parsing tests need.
        empty_rules = {'categories': [], 'apps': []}
        self.wappalyzer = wappalyzer.Wappalyzer(empty_rules)

    def get_wappalyzer(self, categories, apps):
        # Build a detector from the given category and app definitions.
        rules = {'categories': categories, 'apps': apps}
        return wappalyzer.Wappalyzer(rules)

    def _construct_response(self, url=None, headers=None, html=None):
        # Any part that is omitted (or falsy) becomes an empty value.
        if not url:
            url = ''
        if not headers:
            headers = {}
        if not html:
            html = ''
        return FakeUrlopenResponse(url=url, headers=headers, html=html)

    def test_parse_simple(self):
        patterns = self.wappalyzer.parse_patterns('control/userimage\\.html')
        self.assertEqual(1, len(patterns))
        self.assertTrue(hasattr(patterns[0].regex, 'search'))

    def test_parse_confidence_version(self):
        patterns = self.wappalyzer.parse_patterns('control/userimage\\.html\\;version:1\\;confidence:80')
        self.assertEqual(1, len(patterns))
        only = patterns[0]
        self.assertEqual('1', only.version)
        self.assertEqual(80, only.confidence)

    def test_by_url(self):
        apps = {
            'test1': {'url': 'mysite\d.com'},
            'test2': {'url': 'hissite\d.com'},
            'test3': {'url': ['my', 'his']},
        }
        detector = self.get_wappalyzer({}, apps)
        response = self._construct_response(url='http://mysite2.com')
        detected = detector.analyze(response=response)
        self.assertIn('test1', detected)
        self.assertIn('test3', detected)

    def test_by_html_with_confidence(self):
        apps = {
            'test1': {'html': 'body\d\\;confidence:70'},
            'test2': {'html': 'body\w'},
        }
        detector = self.get_wappalyzer({}, apps)
        response = self._construct_response(html='body123')
        detected = detector.analyze(response=response)
        self.assertIn('test1', detected)
        self.assertEqual(70, detected['test1'].get_confidence())

    def test_by_headers(self):
        apps = {
            'test1': {
                "headers": {"Server": "debut\\/?([\\d\\.]+)?\\;version:\\1"},
            },
        }
        detector = self.get_wappalyzer({}, apps)
        # Once without and once with a version suffix in the header value.
        for server in ('debut', 'debut/12'):
            response = self._construct_response(headers={"Server": server})
            detected = detector.analyze(response=response)
            self.assertIn('test1', detected)

    def test_by_meta(self):
        apps = {
            'test1': {
                "meta": {"generator": "uCore PHP Framework"},
            },
            'test2': {
                "meta": {"generator2": "0"},
            },
        }
        detector = self.get_wappalyzer({}, apps)
        # name= first, property= first, then content= before name=.
        pages = (
            "<html><meta name='generator' content='uCore PHP Framework'>",
            "<html><meta property='generator' content='uCore PHP Framework'>",
            "<html><meta content='uCore PHP Framework' name='generator'>",
        )
        for page in pages:
            response = self._construct_response(html=page)
            detected = detector.analyze(response=response)
            self.assertIn('test1', detected)
            self.assertNotIn('test2', detected)

    def test_by_scripts(self):
        apps = {
            'jquery': {
                "script": ["jquery(?:\\-|\\.)([\\d.]*\\d)[^/]*\\.js\\;version:\\1",
                           "/([\\d.]+)/jquery(\\.min)?\\.js\\;version:\\1", "jquery.*\\.js"],
            },
        }
        detector = self.get_wappalyzer({}, apps)
        response = self._construct_response(
            html='<html><script src="//ajax.googleapis.com/ajax/libs/jquery/1.8.2/jquery.min.js"></script>')
        detected = detector.analyze(response=response)
        self.assertIn('jquery', detected)


if __name__ == '__main__':
    unittest.main()

@ -0,0 +1,135 @@
#!/usr/bin/env python
# -*- coding: utf-8 -*-
import os
import re
import sys
import urllib
from urlparse import urlparse
try:
import json
except ImportError:
import simplejson as json
class Application(object):
    """Detection state collected for a single application definition."""

    def __init__(self, app):
        self.app = app
        # Maps "<type> [<key> ]<pattern source>" to that pattern's confidence.
        self.confidence = {}
        self.detected = False

    def set_detected(self, pattern, type, value, key=None):
        """Flag the application as detected and record the pattern's confidence.

        ``pattern`` must expose ``str`` and ``confidence``; ``type`` is the
        detection type and ``key`` an optional header/meta name.  ``value``
        (the text that matched) is accepted but not used yet.
        """
        self.detected = True
        label = [type]
        if key:
            label.append(key)
        label.append(pattern.str)
        # The same pattern matching again overwrites its entry instead of
        # adding to the total a second time.
        self.confidence[' '.join(label)] = pattern.confidence
        # todo: detect version

    def get_confidence(self):
        """Return the summed confidence of all recorded matches, capped at 100."""
        return min(100, sum(self.confidence.values()))
class Wappalyzer(object):
    """Detect web applications from a page's URL, HTML, scripts, meta tags and headers.

    Detection rules are a mapping with ``categories`` and ``apps`` keys,
    either passed in directly or loaded from a JSON file.
    """

    # Patterns may be byte or unicode strings on Python 2; fall back to
    # ``str`` on interpreters that have no ``basestring``.
    try:
        _STRING_TYPES = (basestring,)
    except NameError:
        _STRING_TYPES = (str,)

    # Compiled once instead of on every analyze() call.
    _SCRIPT_SRC_RE = re.compile(r'<script[^>]+src=(?:"|\')([^"\']+)', re.I | re.M)
    _META_NAME_FIRST_RE = re.compile(
        r'<meta\s+(?:name|property)=["\']([^"\']+)["\'].+?content=["\']([^"\']+)["\']',
        re.I | re.M)
    _META_CONTENT_FIRST_RE = re.compile(
        r'<meta\s+content=["\']([^"\']+)["\'].+?(?:name|property)=["\']([^"\']+)["\']',
        re.I | re.M)

    # Rule keys that carry no pattern this class evaluates ('env' included,
    # since analyze() always records it as None).
    _IGNORED_TYPES = ('website', 'excludes', 'cats', 'implies', 'env')

    class Pattern:
        """One compiled, case-insensitive detection regex plus its options."""

        def __init__(self, str):
            self.str = str
            self.regex = re.compile(str, re.I)
            self.confidence = 100

    def __init__(self, data=None, datafile_path=None):
        """Use ``data`` as the rule set, or load it from ``datafile_path``.

        When ``data`` is empty or missing the rules come from
        ``load_data(datafile_path)``.
        """
        data = data or self.load_data(datafile_path)
        self.categories = data['categories']
        self.apps = data['apps']

    def load_data(self, datafile_path=None):
        """Return the parsed JSON rule file.

        Without a path, ``apps.json`` next to this module is read.
        """
        if not datafile_path:
            datafile_path = os.path.join(os.path.dirname(__file__), 'apps.json')
        with open(datafile_path) as f:
            return json.load(f)

    def analyze(self, url=None, response=None):
        """Return ``{app name: Application}`` for every application detected.

        Pass either ``url`` (fetched with ``urllib.urlopen``) or an already
        opened ``response`` exposing ``url``, ``read()`` and ``info().dict``.

        Raises:
            ValueError: if neither ``url`` nor ``response`` is given.
        """
        if not response and not url:
            raise ValueError('either url or response is required')
        opened_here = not response
        if opened_here:
            response = urllib.urlopen(url)
        try:
            data = self._collect_page_data(response)
        finally:
            # Only close what this method opened; a caller-supplied response
            # stays under the caller's control.
            if opened_here:
                response.close()

        detected_apps = {}
        for app_name, app in self.apps.items():
            application = Application(app)
            for detection_type, patterns in app.items():
                try:
                    self._apply_detection(application, detection_type, patterns, data)
                except Exception:
                    # Best effort: one broken rule must not stop the others,
                    # but KeyboardInterrupt/SystemExit still propagate.
                    print('error while detecting by %s application %s' % (detection_type, app))
            if application.detected:
                detected_apps[app_name] = application
        return detected_apps

    def _collect_page_data(self, response):
        """Extract the URL, body, script sources, meta tags and headers of a response."""
        html = response.read()
        meta_pairs = self._META_NAME_FIRST_RE.findall(html)
        # The second regex captures (content, name); flip it to (name, content).
        meta_pairs += [(name, content)
                       for content, name in self._META_CONTENT_FIRST_RE.findall(html)]
        return {
            'url': response.url.split('#')[0],
            'html': html,
            'script': self._SCRIPT_SRC_RE.findall(html),
            'meta': dict((name.lower(), content) for name, content in meta_pairs),
            'headers': dict((name.lower(), value)
                            for name, value in response.info().dict.items()),
            'env': None,
        }

    def _apply_detection(self, application, detection_type, patterns, data):
        """Run one rule entry of an application against the collected page data."""
        if detection_type in self._IGNORED_TYPES:
            return
        if detection_type in ('url', 'html'):
            text = data[detection_type]
            for pattern in self.parse_patterns(patterns):
                if pattern.regex.search(text):
                    application.set_detected(pattern, detection_type, text)
        elif detection_type in ('meta', 'headers'):
            values = data[detection_type]
            for name, raw_patterns in patterns.items():
                # Parse before looking at the value so a malformed rule is
                # reported even when the page lacks that header/meta tag.
                parsed = self.parse_patterns(raw_patterns)
                value = values.get(name.lower())
                if not value:
                    continue
                for pattern in parsed:
                    if pattern.regex.search(value):
                        application.set_detected(pattern, detection_type, value, name)
        elif detection_type == 'script':
            scripts = data[detection_type]
            if scripts:
                # Parse and compile once, not once per <script> tag.
                parsed = self.parse_patterns(patterns)
                for script in scripts:
                    for pattern in parsed:
                        if pattern.regex.search(script):
                            application.set_detected(pattern, detection_type, script)
        else:
            raise NotImplementedError('unsupported detection type: %r' % (detection_type,))

    def parse_patterns(self, patterns):
        """Turn a pattern string, or a list of them, into ``Pattern`` objects.

        A pattern is a regex optionally followed by ``\\;name:value`` options;
        each option is stored as an attribute on the ``Pattern`` and
        ``confidence`` is converted to ``float``.

        Raises:
            ValueError: if ``patterns`` is neither a string nor a list, or an
                option has no ``:`` separator.
        """
        if isinstance(patterns, self._STRING_TYPES):
            patterns = [patterns]
        elif not isinstance(patterns, list):
            raise ValueError('patterns must be a string or a list of strings')
        parsed = []
        for pattern in patterns:
            parts = pattern.split('\\;')
            result = Wappalyzer.Pattern(parts[0])
            for part in parts[1:]:
                name, separator, value = part.partition(':')
                if not separator:
                    raise ValueError('malformed pattern option %r (expected name:value)' % (part,))
                if name == 'confidence':
                    value = float(value)
                setattr(result, name, value)
            parsed.append(result)
        return parsed
if __name__ == '__main__':
    # Command-line entry point: analyze the URL given as the first argument.
    # The URL must go to analyze(); the constructor's first positional
    # parameter is the rule data, so passing the URL there cannot work.
    if len(sys.argv) < 2:
        print('Usage: python %s <url>' % sys.argv[0])
    else:
        w = Wappalyzer()
        print(w.analyze(sys.argv[1]))

@ -25,3 +25,5 @@ ln -f share/js/wappalyzer.js drivers/php/js
ln -f share/apps.json drivers/python
ln -f share/js/wappalyzer.js drivers/python/js
ln -f share/apps.json drivers/python_raw

@ -628,7 +628,7 @@
"website": "www.cs-cart.com",
"cats": [ 6 ],
"env": "^fn_compare_strings$",
"html": "&nbsp;Powered by (?:<a href=[^>]+cs-cart\\.com|CS-Cart)",
"html": ["&nbsp;Powered by (?:<a href=[^>]+cs-cart\\.com|CS-Cart)", "(?:\\$|jQuery)\\.runCart\\('\\w'\\)"],
"implies": "PHP"
},
"CubeCart": {
@ -743,8 +743,6 @@
"Django CMS": {
"website": "django-cms.org",
"cats": [ 1 ],
"script": "media/cms/js/csrf\\.js",
"headers": { "Set-Cookie": "django[^;]=" },
"implies": "Django"
},
"Dojo": {
@ -777,7 +775,8 @@
"website": "dotnetnuke.com",
"cats": [ 1 ],
"meta": { "generator": "DotNetNuke" },
"headers": { "X-Compressed-By": "DotNetNuke", "Set-Cookie": "DotNetNukeAnonymous=" },
"script":["/js/dnncore\\.js"],
"headers": { "DNNOutputCache":".+", "X-Compressed-By": "DotNetNuke", "Set-Cookie": "DotNetNukeAnonymous=" },
"html": "<!-- by DotNetNuke Corporation",
"env": "^DotNetNuke$",
"implies": "Microsoft ASP.NET"
@ -1104,9 +1103,7 @@
"website": "google.com/adsense",
"cats": [ 36 ],
"env": [ "^google_ad_", "^__google_ad_", "^Goog_AdSense_" ],
"script": [ "googlesyndication\\.com/pagead/show_ads\\.js", "ad\\.ca\\.doubleclick\\.net", "2mdn\\.net" ],
"env": "^google_ad_",
"script": [ "googlesyndication\\.com/pagead/show_ads\\.js", "ad\\.ca\\.doubleclick.net" ]
"script": [ "googlesyndication\\.com/pagead/show_ads\\.js", "ad\\.ca\\.doubleclick\\.net", "2mdn\\.net" ]
},
"Google App Engine": {
"website": "code.google.com/appengine",
@ -1678,6 +1675,7 @@
"LiveStreet CMS": {
"website": "livestreetcms.com",
"cats": [ 1 ],
"html":["var LIVESTREET_SECURITY_KEY"],
"headers": { "X-Powered-By": "LiveStreet CMS" }
},
"Lockerz Share": {
@ -1851,7 +1849,7 @@
"MODx": {
"website": "modxcms.com",
"cats": [ 1 ],
"html": "(?:<a[^>]+>Powered by MODx</a>|<(?:link|script)[^>]+assets/snippets/)",
"html": ["<a[^>]+>Powered by MODx</a>", "<(?:link|script)[^>]+assets/(?:templates|components|snippets)/\\;confidence:80"],
"env": "^MODX_MEDIA_PATH$",
"headers": { "X-Powered-By": "^MODx", "Set-Cookie": "SN4[a-f\\d]{12}" },
"implies": "PHP"
@ -2431,7 +2429,7 @@
"Pure CSS": {
"website": "purecss.io",
"cats": [18],
"html": "<link[^>]+(?:([\\d.])+/)?pure(?:-min)?\\.css\\;version=\\1"
"html": "<link[^>]+(?:([\\d.])+/)?pure(?:-min)?\\.css\\;version:\\1"
},
"Python": {
"website": "python.org",
@ -2646,7 +2644,7 @@
"Shopify": {
"website": "shopify.com",
"cats": [ 6 ],
"html": "<link[^>]+=cdn\\.shopify\\.com",
"html": "<link[^>]+=['\"]//cdn\\.shopify\\.com",
"env": "^Shopify$"
},
"Shopware": {
@ -3178,6 +3176,11 @@
"cats": [ 1 ],
"meta": { "generator": "WEB\\|Publisher" }
},
"Webs": {
"website": "webs.com",
"cats": [ 1 ],
"headers": {"Server":"Webs.com/?([\\d\\.]+)?\\;version:\\1"}
},
"Websale": {
"website": "websale.de",
"cats": [ 6 ],
@ -3208,7 +3211,7 @@
"Weebly": {
"website": "www.weebly.com",
"cats": [ 1 ],
"html": "<[^>]+class=\"weebly"
"script":"cdn\\d+\\.editmysite\\.com"
},
"WikkaWiki": {
"website": "wikkawiki.org",
@ -3230,7 +3233,7 @@
"Wix": {
"website": "wix.com",
"cats": [ 1 ],
"script": "static\\.wix\\.com",
"script": "static\\.wixstatic\\.com",
"headers": { "X-Wix-Dispatcher-Cache-Hit": ".+", "Set-Cookie": "Domain=\\.wix\\.com" },
"env": "^wix(?:Events|Data|Errors)"
},
@ -3261,6 +3264,12 @@
"env": "^wp_username$",
"implies": "PHP"
},
"WordPress.Com": {
"website": "wordpress.com",
"cats": [ 1, 11 ],
"html": "<link[^>]+s\\d+\\.wp\\.com",
"implies": "WordPress"
},
"WordPress Super Cache": {
"website": "ocaoimh.ie/wp-super-cache/",
"cats": [ 23 ],