diff --git a/Dockerfile b/Dockerfile
new file mode 100644
index 0000000..43bc347
--- /dev/null
+++ b/Dockerfile
@@ -0,0 +1,23 @@
+# docker pull jesseosiecki/httpscreenshot
+
+FROM ubuntu:20.04
+
+MAINTAINER Jesse Osiecki
+
+RUN mkdir -p /etc/httpscreenshot
+WORKDIR /etc/httpscreenshot
+
+COPY . /etc/httpscreenshot/
+
+RUN apt-get update
+RUN apt-get install -y wget libfontconfig
+
+RUN ./install-dependencies.sh
+
+RUN chmod +x httpscreenshot.py
+RUN ln -s /etc/httpscreenshot/httpscreenshot.py /usr/bin/httpscreenshot
+
+RUN mkdir -p /etc/httpscreenshot/images
+WORKDIR /etc/httpscreenshot/images
+
+ENTRYPOINT ["httpscreenshot"]
diff --git a/README.md b/README.md
index 7862e7f..2d87285 100644
--- a/README.md
+++ b/README.md
@@ -1,48 +1,71 @@
 # httpscreenshot
-##Installation on Ubuntu:
+### Installation via Docker
-apt-get install python-requests python-m2crypto phantomjs
+`docker pull jesseosiecki/httpscreenshot`
+`docker run jesseosiecki/httpscreenshot`
-If you run into: 'module' object has no attribute 'PhantomJS'
-then pip install selenium (or pip install --upgrade selenium)
+### Installation on Ubuntu
+#### Via Script
-##Readme and Use cases:
+Run the `install-dependencies.sh` script as root.
+
+This script has been tested on Ubuntu 20.04 as *root* (sudo).
+
+#### Manually
+
+    apt-get install swig swig2.0 libssl-dev python-dev python-pip
+    pip install -r requirements.txt
+
+If you run into: 'module' object has no attribute 'PhantomJS', then `pip install selenium` (or `pip install --upgrade selenium`).
+
+
+If you are installing on Kali Linux, PhantomJS might not be in the repositories; you can download it from https://bitbucket.org/ariya/phantomjs/downloads/phantomjs-1.9.8-linux-x86_64.tar.bz2 and symlink it into `/usr/bin` like so:
+
+    sudo ln -s /path/to/phantomjs /usr/bin/phantomjs
+
+## README and Use Cases
 
 HTTPScreenshot is a tool for grabbing screenshots and HTML of large numbers of websites. The goal is for it to be both thorough and fast which can sometimes oppose each other.
 
 Before getting into documentation - this is what I USUALLY use for options if I want to screenshot a bunch of sites:
 
-    ./httpscreenshot.py -i \ -p -w 40 -a -vH
+    ./httpscreenshot.py -i \ -p -w 40 -a -vH
 
 Notice there are a ton of worker threads (40). This can be problematic, I make up for failures that could have been a result of too many threads with a second run:
 
-    ./httpscreenshot.py -i \ -p -w 5 -a -vH
+    ./httpscreenshot.py -i \ -p -w 5 -a -vH
 
 YMMV
 
 The options are as follows:
 
-    -h, --help            show this help message and exit
-    -l LIST, --list LIST  List of input URLs
-    -i INPUT, --input INPUT
-                        nmap gnmap output file
-    -p, --headless        Run in headless mode (using phantomjs)
-    -w WORKERS, --workers WORKERS
-                        number of threads
-    -t TIMEOUT, --timeout TIMEOUT
+      -h, --help            show this help message and exit
+      -l LIST, --list LIST  List of input URLs
+      -i INPUT, --input INPUT
+                            nmap gnmap output file
+      -p, --headless        Run in headless mode (using phantomjs)
+      -w WORKERS, --workers WORKERS
+                            number of threads
+      -t TIMEOUT, --timeout TIMEOUT
                             time to wait for pageload before killing the browser
-    -v, --verbose         turn on verbose debugging
+      -v, --verbose         turn on verbose debugging
       -a, --autodetect      Automatically detect if listening services are HTTP or
-                            HTTPS. Ignores NMAP service detction and URL schemes. 
-    -vH, --vhosts         Attempt to scrape hostnames from SSL certificates and
-                        add these to the URL queue
-    -dB DNS_BRUTE, --dns_brute DNS_BRUTE
-                        Specify a DNS subdomain wordlist for bruteforcing on
-                        wildcard SSL certs
-    -r RETRIES, --retries RETRIES
-                        Number of retries if a URL fails or timesout
+                            HTTPS. Ignores NMAP service detection and URL schemes.
+      -vH, --vhosts         Attempt to scrape hostnames from SSL certificates and
+                            add these to the URL queue
+      -dB DNS_BRUTE, --dns_brute DNS_BRUTE
+                            Specify a DNS subdomain wordlist for bruteforcing on
+                            wildcard SSL certs
+      -r RETRIES, --retries RETRIES
+                            Number of retries if a URL fails or times out
+      -tG, --trygui         Try to fetch the page with Firefox when headless fails
+      -sF, --smartfetch     Enables smart fetching to reduce network traffic, also
+                            increases speed if certain conditions are met.
+      -pX PROXY, --proxy PROXY
+                            SOCKS5 Proxy in host:port format
+
 Some of the above options have non-obvious use-cases, so the following provides some more detail:
@@ -50,7 +73,7 @@ Some of the above options have non-obvious use-cases, so the following provides
 -i, --input -> Takes a gnmap file as input. This includes masscan gnmap output.
--p, --headless -> I find myself using this option more and more. By default the script "drives" FireFox. As the number of threads increases this becomes really ugly - 20,30 FireFox windows open at once. This options uses "phantomjs" which doesn't have a GUI but will still do a decent job parsing javascript.
+-p, --headless -> I find myself using this option more and more. By default the script "drives" Firefox. As the number of threads increases this becomes really ugly - 20 or 30 Firefox windows open at once. This option uses a headless browser, which doesn't have a GUI but will still do a decent job parsing javascript.
 -w, --workers -> The number of threads to use. Increase for more speed. The list of input URL's is automatically shuffled to avoid hammering at IP addresses that are close to each other when possible. If you add too many threads, you might start seeing timeouts in responses - adjust for your network and machine.
@@ -60,23 +83,25 @@ Some of the above options have non-obvious use-cases, so the following provides
 -a, --autodetect -> Without this option enabled, HTTPScreenshot will behave as follows:
-	If a LIST of urls is specified as input, sites with scheme "http://" are treated as non-ssl and sites with scheme "https://" are treated as ssl-enabled
+> If a LIST of urls is specified as input, sites with scheme "http://" are treated as non-ssl and sites with scheme "https://" are treated as ssl-enabled
-	For GNMAP input the script will scrape input and try to use any SSL detection performed by nmap. Unfortunately this is unreliable, nmap doesn't always like to tell you that something is SSL enabled. Further, masscan doesn't do any version or service detection.
+> For GNMAP input the script will scrape input and try to use any SSL detection performed by nmap. Unfortunately this is unreliable; nmap doesn't always like to tell you that something is SSL enabled. Further, masscan doesn't do any version or service detection.
-	The -a or --autodetect option throws away all SSL hints from the input file and tries to detect on its own.
+> The -a or --autodetect option throws away all SSL hints from the input file and tries to detect on its own.
 -vH, --vhosts -> Often when visiting websites by their IP address (e.g: https://192.168.1.30), we will receive a different page than expected or an error.
This is because the site is expecting a certain "virtual host" or hostname instead of the IP address, sometimes a single HTTP server will respond with many different pages for different hostnames.
-	For plaintext "http" websites, we can use reverse DNS, BING reverse IP search etc... to try and find the hostnames associated with an IP address. This is not currently a feature in HTTPScreenshot, but may be implemented later.
+> For plaintext "http" websites, we can use reverse DNS, BING reverse IP search etc... to try and find the hostnames associated with an IP address. This is not currently a feature in HTTPScreenshot, but may be implemented later.
-	For SSL enabled "https" sites, this can be a little easier. The SSL certificate will provide us with a hint at the domain name in the CN field. In the "subject alt names" field of the certificate, when it exists, we may get a whole list of other domain names potentially associated with this IP. Often these are in the form "*.google.com" (wildcard certificate) but sometimes will be linked to a single hostname only like "www.google.com"
+> For SSL enabled "https" sites, this can be a little easier. The SSL certificate will provide us with a hint at the domain name in the CN field. In the "subject alt names" field of the certificate, when it exists, we may get a whole list of other domain names potentially associated with this IP. Often these are in the form "\*.google.com" (wildcard certificate) but sometimes will be linked to a single hostname only, like "www.google.com".
-	the -vH or --vhosts flag will, for each SSL enabled website extract the hostnames from the CN and subject alt names field, and add them to the list of URL's to be screenshotted. For wildcard certificates, the "*." part of the name is dropped.
+> The -vH or --vhosts flag will, for each SSL enabled website, extract the hostnames from the CN and subject alt names fields, and add them to the list of URLs to be screenshotted. For wildcard certificates, the "\*." part of the name is dropped.
--dB, --dns_brute -> Must use with -vH for it to make sense. This flag specifies a file containing a list of potential subdomains. For any wildcard certificate e.g: "*.google.com", HTTPScreenshot will try to bruteforce valid subdomains and add them to the list of URLs to be screenshotted.
+-dB, --dns_brute -> Must be used with -vH for it to make sense. This flag specifies a file containing a list of potential subdomains. For any wildcard certificate, e.g. "\*.google.com", HTTPScreenshot will try to bruteforce valid subdomains and add them to the list of URLs to be screenshotted.
--r, --retries -> Sometimes FireFox or ghostscript timeout when fetching a page. This could be due to a number of factors, sometimes you just have too many threads going, a network hiccup, etc. This specifies the number of times to "retry" a given host when it fails.
+-r, --retries -> Sometimes Firefox or the headless browser times out when fetching a page. This could be due to a number of factors: sometimes you just have too many threads going, a network hiccup, etc. This specifies the number of times to "retry" a given host when it fails.
+-tG, --trygui -> Upon failure to fetch with the headless browser, the script will pop open Firefox and try again.
+-sF, --smartfetch -> Enables smart fetching to reduce network traffic: if the pre-fetched page body is identical (by MD5 hash) to one that has already been captured, the existing screenshot and HTML are copied instead of re-rendering the page. This also increases speed when many hosts serve the same content.
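+
+### Example: scanning through a SOCKS5 proxy
+
+The new `-pX`/`--proxy` flag takes a `host:port` value and is applied both to the pre-fetch HTTP requests and to the Firefox profile (see the argparse and `setupBrowserProfile` changes in `httpscreenshot.py` below). An illustrative example - the list file and proxy address are placeholders:
+
+    ./httpscreenshot.py -l urls.txt -p -w 10 -a -pX 127.0.0.1:9050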
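+
+### Example: running the Docker image
+
+Screenshots and HTML are written to the container's working directory (`/etc/httpscreenshot/images` in the Dockerfile), so bind-mounting a host directory there is an easy way to keep the output. A minimal sketch - `urls.txt` is a placeholder name, and `-b Chrome` is chosen because `install-dependencies.sh` installs Google Chrome and the image does not install Firefox:
+
+    docker run -v "$(pwd)":/etc/httpscreenshot/images jesseosiecki/httpscreenshot -l urls.txt -p -w 10 -a -b Chrome
+
+Anything after the image name is passed straight to `httpscreenshot` because of the `ENTRYPOINT`.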
diff --git a/httpscreenshot.py b/httpscreenshot.py old mode 100755 new mode 100644 index bb09f09..6794d8d --- a/httpscreenshot.py +++ b/httpscreenshot.py @@ -1,38 +1,49 @@ -#!/usr/bin/python +#!/usr/bin/python3 -''' -Installation on Ubuntu: -apt-get install python-requests python-m2crypto phantomjs -If you run into: 'module' object has no attribute 'PhantomJS' -then pip install selenium (or pip install --upgrade selenium) -''' - -from selenium import webdriver -from urlparse import urlparse -import multiprocessing -import Queue import argparse -import sys -import traceback +import hashlib +import multiprocessing import os.path -import requests -import ssl -import M2Crypto +import queue import re -from random import shuffle -import time -import Image -import ImageDraw -import ImageFont +import shutil import signal +import ssl +import sys +import time +import traceback +from importlib import reload +from random import shuffle +from urllib.parse import urlparse +from collections import defaultdict + +import warnings + +warnings.filterwarnings("ignore") + +import M2Crypto +from PIL import Image, ImageDraw, ImageFont +from pyvirtualdisplay import Display +from selenium import webdriver +from selenium.webdriver.chrome.options import Options +from selenium.webdriver.chrome.service import Service +from selenium.webdriver.common.desired_capabilities import DesiredCapabilities +from webdriver_manager.chrome import ChromeDriverManager + +try: + from urllib.parse import quote +except Exception: + from urllib.parse import quote + +try: + import requesocks as requests +except Exception: + import requests reload(sys) -sys.setdefaultencoding("utf8") def timeoutFn(func, args=(), kwargs={}, timeout_duration=1, default=None): - import signal - class TimeoutError(Exception): pass @@ -53,412 +64,619 @@ def handler(signum, frame): def addUrlsForService(host, urlList, servicesList, scheme): - if(servicesList == None or servicesList == []): - return - for service in servicesList: - state = service.findPreviousSibling("state") - if(state != None and state != [] and state['state'] == 'open'): - urlList.append(scheme+host+':'+str(service.parent['portid'])) + if servicesList is None or servicesList == []: + return + for service in servicesList: + state = service.findPreviousSibling("state") + if state is not None and state != [] and state["state"] == "open": + urlList.append(scheme + host + ":" + str(service.parent["portid"])) def detectFileType(inFile): - #Check to see if file is of type gnmap - firstLine = inFile.readline() - secondLine = inFile.readline() - thirdLine = inFile.readline() - - #Be polite and reset the file pointer - inFile.seek(0) - - if ((firstLine.find('nmap') != -1 or firstLine.find('Masscan') != -1) and thirdLine.find('Host:') != -1): - #Looks like a gnmap file - this wont be true for other nmap output types - #Check to see if -sV flag was used, if not, warn - if(firstLine.find('-sV') != -1 or firstLine.find('-A') != -1): - return 'gnmap' - else: - print("Nmap version detection not used! 
Discovery module may miss some hosts!") - return 'gnmap' - else: - return None + # Check to see if file is of type gnmap + firstLine = inFile.readline() + secondLine = inFile.readline() + thirdLine = inFile.readline() + + # Be polite and reset the file pointer + inFile.seek(0) + + if (firstLine.find("nmap") != -1 or firstLine.find("Masscan") != -1) and thirdLine.find("Host:") != -1: + # Looks like a gnmap file - this wont be true for other nmap output types + # Check to see if -sV flag was used, if not, warn + if firstLine.find("-sV") != -1 or firstLine.find("-A") != -1: + return "gnmap" + else: + print( + "Nmap version detection not used! Discovery module may miss some hosts!" + ) + return "gnmap" + elif (firstLine.find("xml version") != + -1) and (secondLine.find("DOCTYPE nmaprun") != -1 + or secondLine.find("masscan") != -1): + return "xml" + else: + return None + + +def parsexml(inFile): + import xml.etree.ElementTree as ET + + tree = ET.parse(inFile) + root = tree.getroot() + + targets = defaultdict(list) + + for host in root.findall("host"): + ip = host.find('address').get('addr') + + for port in host.find('ports').findall("port"): + if port.find("state").get("state") == "open": + targets[ip].append(port.get("portid")) + + return targets def parseGnmap(inFile, autodetect): - ''' - Parse a gnmap file into a dictionary. The dictionary key is the ip address or hostname. - Each key item is a list of ports and whether or not that port is https/ssl. For example: - >>> targets - {'127.0.0.1': [[443, True], [8080, False]]} - ''' - targets = {} - for hostLine in inFile: - currentTarget = [] - #Pull out the IP address (or hostnames) and HTTP service ports - fields = hostLine.split(' ') - ip = fields[1] #not going to regex match this with ip address b/c could be a hostname - for item in fields: - #Make sure we have an open port with an http type service on it - if (item.find('http') != -1 or autodetect) and re.findall('\d+/open',item): - port = None - https = False - ''' - nmap has a bunch of ways to list HTTP like services, for example: - 8089/open/tcp//ssl|http - 8000/closed/tcp//http-alt/// - 8008/closed/tcp//http/// - 8080/closed/tcp//http-proxy// - 443/open/tcp//ssl|https?/// - 8089/open/tcp//ssl|http - Since we want to detect them all, let's just match on the word http - and make special cases for things containing https and ssl when we - construct the URLs. 
- ''' - port = item.split('/')[0] - - if item.find('https') != -1 or item.find('ssl') != -1: - https = True - #Add the current service item to the currentTarget list for this host - currentTarget.append([port,https]) - - if(len(currentTarget) > 0): - targets[ip] = currentTarget - return targets - - -def setupBrowserProfile(headless): - browser = None - while(browser is None): - try: - if(not headless): - fp = webdriver.FirefoxProfile() - fp.set_preference("webdriver.accept.untrusted.certs",True) - fp.set_preference("security.enable_java", False) - fp.set_preference("webdriver.load.strategy", "fast"); - browser = webdriver.Firefox(fp) - else: - browser = webdriver.PhantomJS(service_args=['--ignore-ssl-errors=true','--ssl-protocol=tlsv1'], executable_path="phantomjs") - except Exception as e: - print e - time.sleep(1) - continue - - return browser + hostRe = re.compile('Host:\s*[^\s]+') + servicesRe = re.compile('Ports:\s*.*') + + targets = defaultdict(list) + + for hostLine in inFile: + if hostLine.strip() == "": + break + # Pull out the IP address (or hostnames) and HTTP service ports + + ipHostRes = hostRe.search(hostLine) + + if ipHostRes is None: + continue + + ipHost = ipHostRes.group() + ip = ipHost.split(':')[1].strip() + + try: + services = servicesRe.search(hostLine).group().split() + except: + continue + + for item in services: + # Make sure we have an open port with an http type service on it + if re.findall("\d+/open", item): + port = None + https = False + """ + nmap has a bunch of ways to list HTTP like services, for example: + 8089/open/tcp//ssl|http + 8000/closed/tcp//http-alt/// + 8008/closed/tcp//http/// + 8080/closed/tcp//http-proxy// + 443/open/tcp//ssl|https?/// + 8089/open/tcp//ssl|http + Since we want to detect them all, let's just match on the word http + and make special cases for things containing https and ssl when we + construct the URLs. 
+ """ + port = item.split("/")[0] + targets[ip].append(port) + + return targets + + +def setupBrowserProfile(headless, proxy, browserType): + browser = None + if (proxy is not None): + service_args = ['--ignore-ssl-errors=true', '--ssl-protocol=any', '--proxy=' + proxy, '--proxy-type=socks5'] + else: + service_args = ['--ignore-ssl-errors=true', '--ssl-protocol=any'] + + while (browser is None): + try: + if (browserType == 'Chrome' or browserType == 'Chromium'): + service = Service(ChromeDriverManager(log_level=0).install()) + coptions = Options() + if headless: + coptions.add_argument("--headless") + coptions.add_argument("--no-sandbox") + coptions.add_argument("--window-size=1024x768") + coptions.add_argument("--ignore-certificate-errors") + coptions.add_argument("--ssl-version-min=tls1") + + browser = webdriver.Chrome(service=service, options=coptions) + else: + capabilities = DesiredCapabilities.FIREFOX + capabilities['acceptSslCerts'] = True + fp = webdriver.FirefoxProfile() + fp.set_preference("webdriver.accept.untrusted.certs", True) + fp.set_preference("security.enable_java", False) + fp.set_preference("webdriver.load.strategy", "fast"); + if (proxy is not None): + proxyItems = proxy.split(":") + fp.set_preference("network.proxy.socks", proxyItems[0]) + fp.set_preference("network.proxy.socks_port", int(proxyItems[1])) + fp.set_preference("network.proxy.type", 1) + + fireFoxOptions = webdriver.FirefoxOptions() + if headless: + fireFoxOptions.headless = True + + browser = webdriver.Firefox(firefox_profile=fp, + capabilities=capabilities, + options=fireFoxOptions) + browser.set_window_size(1024, 768) + + except Exception as e: + print(e) + time.sleep(1) + continue + return browser def writeImage(text, filename, fontsize=40, width=1024, height=200): - image = Image.new("RGBA", (width,height), (255,255,255)) - draw = ImageDraw.Draw(image) - font = ImageFont.truetype(os.path.dirname(os.path.realpath(__file__))+"/LiberationSerif-BoldItalic.ttf", fontsize) - draw.text((10, 0), text, (0,0,0), font=font) - image.save(filename) - - -def worker(urlQueue,tout,debug,headless,doProfile,vhosts,subs,extraHosts,tryGUIOnFail): - if(debug): - print '[*] Starting worker' - - browser = None - try: - browser = setupBrowserProfile(headless) - - except: - print "[-] Oh no! Couldn't create the browser, Selenium blew up" - exc_type, exc_value, exc_traceback = sys.exc_info() - lines = traceback.format_exception(exc_type, exc_value, exc_traceback) - print ''.join('!! ' + line for line in lines) - return - - while True: - #Try to get a URL from the Queue - try: - curUrl = urlQueue.get(timeout=tout) - print '[+] '+str(urlQueue.qsize())+' URLs remaining' - screenshotName = urlparse(curUrl[0]).netloc.replace(":", "-") - if(debug): - print '[+] Got URL: '+curUrl[0] - if(os.path.exists(screenshotName+".png")): - if(debug): - print "[-] Screenshot already exists, skipping" - continue - - except Queue.Empty: - if(debug): - print'[-] URL queue is empty, quitting.' 
- browser.quit() - return - - try: - if(doProfile): - [resp,curUrl] = autodetectRequest(curUrl, timeout=tout, vhosts=vhosts, urlQueue=urlQueue, subs=subs, extraHosts=extraHosts) - else: - resp = doGet(curUrl, verify=False, timeout=tout, vhosts=vhosts, urlQueue=urlQueue, subs=subs, extraHosts=extraHosts) - if(resp is not None and resp.status_code == 401): - print curUrl[0]+" Requires HTTP Basic Auth" - f = open(screenshotName+".html",'w') - f.write(resp.headers.get('www-authenticate','NONE')) - f.write('Basic Auth') - f.close() - writeImage(resp.headers.get('www-authenticate','NO WWW-AUTHENTICATE HEADER'),screenshotName+".png") - continue - elif(resp is not None): - browser.set_window_size(1024, 768) - browser.set_page_load_timeout((tout)) - old_url = browser.current_url - browser.get(curUrl[0].strip()) - if(browser.current_url == old_url): - print "[-] Error fetching in browser but successfully fetched with Requests: "+curUrl[0] - if(headless): - if(debug): - print "[+] Trying with sslv3 instead of TLS - known phantomjs bug: "+curUrl[0] - browser2 = webdriver.PhantomJS(service_args=['--ignore-ssl-errors=true'], executable_path="phantomjs") - old_url = browser2.current_url - browser2.get(curUrl[0].strip()) - if(browser2.current_url == old_url): - if(debug): - print "[-] Didn't work with SSLv3 either..."+curUrl[0] - browser2.close() - else: - print '[+] Saving: '+screenshotName - html_source = browser2.page_source - f = open(screenshotName+".html",'w') - f.write(html_source) - f.close() - browser2.save_screenshot(screenshotName+".png") - browser2.close() - continue - - if(tryGUIOnFail and headless): - print "[+] Attempting to fetch with FireFox: "+curUrl[0] - browser2 = setupBrowserProfile(False) - old_url = browser2.current_url - browser2.get(curUrl[0].strip()) - if(browser2.current_url == old_url): - print "[-] Error fetching in GUI browser as well..."+curUrl[0] - browser2.close() - continue - else: - print '[+] Saving: '+screenshotName - html_source = browser2.page_source - f = open(screenshotName+".html",'w') - f.write(html_source) - f.close() - browser2.save_screenshot(screenshotName+".png") - browser2.close() - continue - else: - continue - - print '[+] Saving: '+screenshotName - html_source = browser.page_source - f = open(screenshotName+".html",'w') - f.write(html_source) - f.close() - browser.save_screenshot(screenshotName+".png") - except Exception as e: - print e - print '[-] Something bad happened with URL: '+curUrl[0] - if(curUrl[2] > 0): - curUrl[2] = curUrl[2] - 1; - urlQueue.put(curUrl) - if(debug): - exc_type, exc_value, exc_traceback = sys.exc_info() - lines = traceback.format_exception(exc_type, exc_value, exc_traceback) - print ''.join('!! 
' + line for line in lines) - browser.quit() - browser = setupBrowserProfile(headless) - continue + image = Image.new("RGBA", (width, height), (255, 255, 255)) + draw = ImageDraw.Draw(image) + if os.path.exists( + "/usr/share/httpscreenshot/LiberationSerif-BoldItalic.ttf"): + font_path = "/usr/share/httpscreenshot/LiberationSerif-BoldItalic.ttf" + else: + font_path = (os.path.dirname(os.path.realpath(__file__)) + + "/LiberationSerif-BoldItalic.ttf") + font = ImageFont.truetype(font_path, fontsize) + draw.text((10, 0), text, (0, 0, 0), font=font) + image.save(filename) + + +def worker( + urlQueue, + tout, + debug, + headless, + doProfile, + vhosts, + subs, + extraHosts, + tryGUIOnFail, + smartFetch, + proxy, + browserType +): + if debug: + print("[*] Starting worker") + + browser = None + display = None + try: + if tryGUIOnFail or not headless: + display = Display(visible=0, size=(800, 600)) + display.start() + + browser = setupBrowserProfile(headless, proxy, browserType) + + except Exception: + print("[-] Oh no! Couldn't create the browser, Selenium blew up") + exc_type, exc_value, exc_traceback = sys.exc_info() + lines = traceback.format_exception(exc_type, exc_value, exc_traceback) + print("".join("!! " + line for line in lines)) + browser.quit() + display.stop() + return + + while True: + # Try to get a URL from the Queue + if urlQueue.qsize() > 0: + try: + curUrl = urlQueue.get(timeout=tout) + except queue.Empty: + continue + print("[+] " + str(urlQueue.qsize()) + " URLs remaining") + screenshotName = quote(curUrl, safe="") + if debug: + print("[+] Got URL: " + curUrl) + print("[+] screenshotName: " + screenshotName) + if os.path.exists(screenshotName + ".png"): + if debug: + print("[-] Screenshot already exists, skipping") + continue + else: + if debug: + print("[-] URL queue is empty, quitting.") + browser.quit() + return + + try: + if doProfile: + [resp, curUrl] = autodetectRequest( + curUrl, + timeout=tout, + vhosts=vhosts, + urlQueue=urlQueue, + subs=subs, + extraHosts=extraHosts, + proxy=proxy, + ) + else: + resp = doGet( + curUrl, + verify=False, + timeout=tout, + vhosts=vhosts, + urlQueue=urlQueue, + subs=subs, + extraHosts=extraHosts, + proxy=proxy, + ) + if resp is not None and resp.status_code == 401: + print(curUrl + " Requires HTTP Basic Auth") + f = open(screenshotName + ".html", "w") + f.write(resp.headers.get("www-authenticate", "NONE")) + f.write("Basic Auth") + f.close() + writeImage( + resp.headers.get("www-authenticate", + "NO WWW-AUTHENTICATE HEADER"), + screenshotName + ".png", + ) + continue + + elif resp is not None: + if resp.text is not None: + resp_hash = hashlib.md5( + resp.text.encode('utf-8')).hexdigest() + else: + resp_hash = None + + if smartFetch and resp_hash is not None and resp_hash in hash_basket: + # We have this exact same page already, copy it instead of grabbing it again + print( + "[+] Pre-fetch matches previously imaged service, no need to do it again!" 
+ ) + shutil.copy2(hash_basket[resp_hash] + ".html", + screenshotName + ".html") + shutil.copy2(hash_basket[resp_hash] + ".png", + screenshotName + ".png") + else: + if smartFetch: + hash_basket[resp_hash] = screenshotName + + browser.set_page_load_timeout((tout)) + old_url = browser.current_url + browser.get(curUrl.strip()) + if browser.current_url == old_url: + print( + "[-] Error fetching in browser but successfully fetched with Requests: " + + curUrl) + if tryGUIOnFail and headless: + display = Display(visible=0, size=(1024, 768)) + display.start() + print("[+] Attempting to fetch with FireFox: " + + curUrl) + browser2 = setupBrowserProfile(False, proxy, "Firefox") + old_url = browser2.current_url + try: + browser2.get(curUrl.strip()) + if browser2.current_url == old_url: + print( + "[-] Error fetching in GUI browser as well..." + + curUrl) + browser2.quit() + continue + else: + print("[+] Saving: " + screenshotName) + html_source = browser2.page_source + f = open(screenshotName + ".html", "w") + f.write(html_source) + f.close() + browser2.save_screenshot(screenshotName + + ".png") + browser2.quit() + continue + except Exception: + browser2.quit() + display.stop() + print( + "[-] Error fetching in GUI browser as well..." + + curUrl) + + else: + continue + + print("[+] Saving: " + screenshotName) + html_source = browser.page_source + f = open(screenshotName + ".html", "w") + f.write(html_source) + f.close() + browser.save_screenshot(screenshotName + ".png") + + except Exception as e: + if debug: + exc_type, exc_value, exc_traceback = sys.exc_info() + lines = traceback.format_exception(exc_type, exc_value, + exc_traceback) + print("".join("!! " + line for line in lines)) + browser.quit() + browser = setupBrowserProfile(headless, proxy, "Firefox") + continue + browser.quit() + display.stop() def doGet(*args, **kwargs): - url = args[0] - doVhosts = kwargs['vhosts'] - urlQueue = kwargs['urlQueue'] - subs = kwargs['subs'] - extraHosts = kwargs['extraHosts'] - del kwargs['extraHosts'] - del kwargs['urlQueue'] - del kwargs['vhosts'] - del kwargs['subs'] - - kwargs['allow_redirects']=False - - resp = requests.get(url[0],**kwargs) - - - #If we have an https URL and we are configured to scrape hosts from the cert... 
- if(url[0].find('https') != -1 and url[1] == True): - #Pull hostnames from cert, add as additional URLs and flag as not to pull certs - host = urlparse(url[0]).hostname - port = urlparse(url[0]).port - if(port is None): - port = 443 - cert = ssl.get_server_certificate((host,port),ssl_version=ssl.PROTOCOL_SSLv23) - x509 = M2Crypto.X509.load_cert_string(cert) - subjText = x509.get_subject().as_text() - names = re.findall("CN=([^\s]+)",subjText) - - try: - altNames = x509.get_ext('subjectAltName').get_value() - names.extend(re.findall("DNS:([^,]*)",altNames)) - except: - pass - - for name in names: - if(name.find('*.') != -1): - for sub in subs: - try: - sub = sub.strip() - hostname = name.replace('*.',sub+'.') - if(hostname not in extraHosts): - extraHosts[hostname] = 1 - address = socket.gethostbyname(hostname) - urlQueue.put(['https://'+hostname+':'+str(port),False,url[2]]) - print '[+] Discovered subdomain '+address - except: - pass - name = name.replace('*.','') - if(name not in extraHosts): - extraHosts[name] = 1 - urlQueue.put(['https://'+name+':'+str(port),False,url[2]]) - print '[+] Added host '+name - - else: - if (name not in extraHosts): - extraHosts[name] = 1 - urlQueue.put(['https://'+name+':'+str(port),False,url[2]]) - print '[+] Added host '+name - - - return resp - - else: - return resp - - -def autodetectRequest(url, timeout, vhosts=False, urlQueue=None, subs=None, extraHosts=None): - '''Takes a URL, ignores the scheme. Detect if the host/port is actually an HTTP or HTTPS - server''' - resp = None - host = urlparse(url[0]).hostname - port = urlparse(url[0]).port - - if(port is None): - if('https' in url[0]): - port = 443 - else: - port = 80 - - try: - #cert = ssl.get_server_certificate((host,port)) - - cert = timeoutFn(ssl.get_server_certificate,kwargs={'addr':(host,port),'ssl_version':ssl.PROTOCOL_SSLv23},timeout_duration=3) - - if(cert is not None): - if('https' not in url[0]): - url[0] = url[0].replace('http','https') - #print 'Got cert, changing to HTTPS '+url[0] - - else: - url[0] = url[0].replace('https','http') - #print 'Changing to HTTP '+url[0] - - - except Exception as e: - url[0] = url[0].replace('https','http') - #print 'Changing to HTTP '+url[0] - try: - resp = doGet(url,verify=False, timeout=timeout, vhosts=vhosts, urlQueue=urlQueue, subs=subs, extraHosts=extraHosts) - except Exception as e: - print 'HTTP GET Error: '+str(e) - print url[0] - - return [resp,url] + url = args[0] + doVhosts = kwargs.pop("vhosts", None) + urlQueue = kwargs.pop("urlQueue", None) + subs = kwargs.pop("subs", None) + extraHosts = kwargs.pop("extraHosts", None) + proxy = kwargs.pop("proxy", None) + + kwargs["allow_redirects"] = False + session = requests.session() + if proxy is not None: + session.proxies = { + "http": "socks5://" + proxy, + "https": "socks5://" + proxy + } + resp = session.get(url, **kwargs) + + # If we have an https URL and we are configured to scrape hosts from the cert... 
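+    # The server certificate is fetched with ssl.get_server_certificate() and parsed
+    # with M2Crypto; every CN and subjectAltName DNS entry that is not already in
+    # extraHosts is pushed back onto urlQueue as a new https:// URL so that it gets
+    # screenshotted as well.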
+ if url.find("https") != -1 and doVhosts is True: + # Pull hostnames from cert, add as additional URLs and flag as not to pull certs + host = urlparse(url).hostname + port = urlparse(url).port + if port is None: + port = 443 + names = [] + try: + cert = ssl.get_server_certificate((host, port)) + x509 = M2Crypto.X509.load_cert_string(cert) + subjText = x509.get_subject().as_text() + names = re.findall("CN=([^\s]+)", subjText) + altNames = x509.get_ext("subjectAltName").get_value() + names.extend(re.findall("DNS:([^,]*)", altNames)) + except Exception as e: + print(e) + + for name in names: + if name not in extraHosts: + extraHosts[name] = 1 + urlQueue.put(f"https://{name}:{port}") + print(f"[+] Added host https://{name}:{port}") + return resp + else: + return resp + + +def autodetectRequest(url, + timeout, + vhosts=False, + urlQueue=None, + subs=None, + extraHosts=None, + proxy=None): + """Takes a URL, ignores the scheme. Detect if the host/port is actually an HTTP or HTTPS + server""" + resp = None + host = urlparse(url).hostname + port = urlparse(url).port + + try: + # cert = ssl.get_server_certificate((host,port)) + + cert = timeoutFn( + ssl.get_server_certificate, + kwargs={"addr": (host, port)}, + timeout_duration=3, + ) + + if cert is not None: + if "https" not in url: + url = url.replace("http://", "https://") + else: + url = url.replace("https://", "http://") + + except Exception: + url = url.replace("https://", "http://") + try: + resp = doGet( + url, + verify=False, + timeout=timeout, + vhosts=vhosts, + urlQueue=urlQueue, + subs=subs, + extraHosts=extraHosts, + proxy=proxy, + ) + except Exception as e: + print("HTTP GET Error: " + str(e)) + print(url) + + return [resp, url] def sslError(e): - if('the handshake operation timed out' in str(e) or 'unknown protocol' in str(e) or 'Connection reset by peer' in str(e) or 'EOF occurred in violation of protocol' in str(e)): - return True - else: - return False - - -if __name__ == '__main__': - parser = argparse.ArgumentParser() - - parser.add_argument("-l","--list",help='List of input URLs') - parser.add_argument("-i","--input",help='nmap gnmap output file') - parser.add_argument("-p","--headless",action='store_true',default=False,help='Run in headless mode (using phantomjs)') - parser.add_argument("-w","--workers",default=1,type=int,help='number of threads') - parser.add_argument("-t","--timeout",type=int,default=10,help='time to wait for pageload before killing the browser') - parser.add_argument("-v","--verbose",action='store_true',default=False,help='turn on verbose debugging') - parser.add_argument("-a","--autodetect",action='store_true',default=False,help='Automatically detect if listening services are HTTP or HTTPS. 
Ignores NMAP service detction and URL schemes.') - parser.add_argument("-vH","--vhosts",action='store_true',default=False,help='Attempt to scrape hostnames from SSL certificates and add these to the URL queue') - parser.add_argument("-dB","--dns_brute",help='Specify a DNS subdomain wordlist for bruteforcing on wildcard SSL certs') - parser.add_argument("-r","--retries",type=int,default=0,help='Number of retries if a URL fails or timesout') - parser.add_argument("-tG","--trygui",action='store_true',default=False,help='Try to fetch the page with FireFox when headless fails') - - args = parser.parse_args() - - if(len(sys.argv) < 2): - parser.print_help() - sys.exit(0) - - if(args.input is not None): - inFile = open(args.input,'r') - if(detectFileType(inFile) == 'gnmap'): - hosts = parseGnmap(inFile,args.autodetect) - urls = [] - for host,ports in hosts.items(): - for port in ports: - url='' - if port[1] == True: - url = ['https://'+host+':'+port[0],args.vhosts,args.retries] - else: - url = ['http://'+host+':'+port[0],args.vhosts,args.retries] - urls.append(url) - - - else: - print 'Invalid input file - must be Nmap GNMAP' - - elif (args.list is not None): - f = open(args.list,'r') - lst = f.readlines() - urls = [] - for url in lst: - urls.append([url.strip(),args.vhosts,args.retries]) - else: - print "No input specified" - sys.exit(0) - - - #shuffle the url list - shuffle(urls) - #read in the subdomain bruteforce list if specificed - subs = [] - if(args.dns_brute != None): - subs = open(args.dns_brute,'r').readlines() - #Fire up the workers - urlQueue = multiprocessing.Queue() - manager = multiprocessing.Manager() - hostsDict = manager.dict() - workers = [] - - for i in range(args.workers): - p = multiprocessing.Process(target=worker, - args=(urlQueue,args.timeout,args.verbose,args.headless,args.autodetect, args.vhosts,subs,hostsDict,args.trygui)) - - - workers.append(p) - p.start() - - for url in urls: - urlQueue.put(url) - - for p in workers: - try: - p.join() - except KeyboardInterrupt: - print "[-] Ctrl-C received! Sending kill to threads..." - kill_received = True - for p in workers: - p.terminate() + if ("the handshake operation timed out" in str(e) + or "unknown protocol" in str(e) + or "Connection reset by peer" in str(e) + or "EOF occurred in violation of protocol" in str(e)): + return True + else: + return False + + +def signal_handler(signal, frame): + print("[-] Ctrl-C received! Killing Thread(s)...") + os._exit(0) + + +signal.signal(signal.SIGINT, signal_handler) + +if __name__ == "__main__": + parser = argparse.ArgumentParser() + + parser.add_argument("-l", "--list", help="List of input URLs") + parser.add_argument("-i", "--input", help="nmap gnmap/xml output file") + parser.add_argument( + "-p", + "--headless", + action="store_true", + default=False, + help="Run in headless mode (using phantomjs)", + ) + parser.add_argument( + "-b", + "--browsertype", + default="Firefox", + help="Choose webdriver {Firefox, Chrome}" + ) + parser.add_argument("-w", + "--workers", + default=1, + type=int, + help="number of threads") + parser.add_argument( + "-t", + "--timeout", + type=int, + default=10, + help="time to wait for pageload before killing the browser", + ) + parser.add_argument( + "-v", + "--verbose", + action="store_true", + default=False, + help="turn on verbose debugging", + ) + parser.add_argument( + "-a", + "--autodetect", + action="store_true", + default=False, + help= + "Automatically detect if listening services are HTTP or HTTPS. 
Ignores NMAP service detction and URL schemes.", + ) + parser.add_argument( + "-vH", + "--vhosts", + action="store_true", + default=False, + help= + "Attempt to scrape hostnames from SSL certificates and add these to the URL queue", + ) + parser.add_argument( + "-dB", + "--dns_brute", + help= + "Specify a DNS subdomain wordlist for bruteforcing on wildcard SSL certs", + ) + parser.add_argument( + "-uL", + "--uri_list", + help="Specify a list of URIs to fetch in addition to the root", + ) + parser.add_argument( + "-r", + "--retries", + type=int, + default=0, + help="Number of retries if a URL fails or timesout", + ) + parser.add_argument( + "-tG", + "--trygui", + action="store_true", + default=False, + help="Try to fetch the page with FireFox when headless fails", + ) + parser.add_argument( + "-sF", + "--smartfetch", + action="store_true", + default=False, + help= + "Enables smart fetching to reduce network traffic, also increases speed if certain conditions are met.", + ) + parser.add_argument("-pX", + "--proxy", + default=None, + help="SOCKS5 Proxy in host:port format") + + args = parser.parse_args() + + if len(sys.argv) < 2: + parser.print_help() + sys.exit(0) + + # read in the URI list if specificed + uris = [""] + if args.uri_list is not None: + uris = open(args.uri_list, "r").readlines() + uris.append("") + + if args.input is not None: + inFile = open(args.input, "r") + if detectFileType(inFile) == "gnmap": + hosts = parseGnmap(inFile, args.autodetect) + elif detectFileType(inFile) == "xml": + hosts = parsexml(inFile) + else: + print("Invalid input file - must be Nmap GNMAP or Nmap XML") + + urls = [] + + for host in hosts: + for port in hosts[host]: + urls.append(f"http://{host}:{port}") + + elif args.list is not None: + f = open(args.list, "r") + lst = f.readlines() + urls = [] + for url in lst: + urls.append(url.strip()) + else: + print("No input specified") + sys.exit(0) + + # shuffle the url list + shuffle(urls) + + # read in the subdomain bruteforce list if specificed + subs = [] + if args.dns_brute is not None: + subs = open(args.dns_brute, "r").readlines() + + # Fire up the workers + urlQueue = multiprocessing.Queue() + manager = multiprocessing.Manager() + hostsDict = manager.dict() + workers = [] + hash_basket = {} + + for i in range(args.workers): + p = multiprocessing.Process( + target=worker, + args=( + urlQueue, + args.timeout, + args.verbose, + args.headless, + args.autodetect, + args.vhosts, + subs, + hostsDict, + args.trygui, + args.smartfetch, + args.proxy, + args.browsertype + ), + ) + workers.append(p) + p.start() + + for url in urls: + urlQueue.put(url) + + for p in workers: + p.join() diff --git a/install-dependencies.sh b/install-dependencies.sh new file mode 100755 index 0000000..1d504cd --- /dev/null +++ b/install-dependencies.sh @@ -0,0 +1,17 @@ +# Installation Script - tested on a fresh install of Ubuntu 20.04.3 LTS as root (sudo) + +# Show all commands being run +#set -x + +# Error out if one fails +set -e + +# Pull packages from apt +apt install -y python3-pip build-essential libssl-dev swig python3-dev + +# Install Google Chrome +wget -O /tmp/google-chrome-stable_current_amd64.deb https://dl.google.com/linux/direct/google-chrome-stable_current_amd64.deb +apt install -y /tmp/google-chrome-stable_current_amd64.deb + +# Install required python packages +pip3 install -r requirements.txt diff --git a/masshttp.sh b/masshttp.sh new file mode 100755 index 0000000..254e9f0 --- /dev/null +++ b/masshttp.sh @@ -0,0 +1,10 @@ +#!/bin/sh + +/root/masscan/bin/masscan 
-p80,443 -iL networks.txt -oG http.gnmap --rate 100000 +mkdir httpscreenshots +cd httpscreenshots +python ~/tools/httpscreenshot.py -i ../http.gnmap -p -t 30 -w 50 -a -vH -r 1 +python ~/tools/httpscreenshot.py -i ../http.gnmap -p -t 10 -w 10 -a -vH +cd .. +python screenshotClustering/cluster.py -d httpscreenshots/ + diff --git a/phantomjs b/phantomjs deleted file mode 100755 index 7969e3d..0000000 Binary files a/phantomjs and /dev/null differ diff --git a/requirements.txt b/requirements.txt new file mode 100644 index 0000000..4a8609e --- /dev/null +++ b/requirements.txt @@ -0,0 +1,12 @@ +m2crypto +requests +selenium +beautifulsoup4 +pillow +PySocks +python-libnmap +pyvirtualdisplay +reload +webdriver_manager +lxml +urllib3<2.0.0 diff --git a/screenshotClustering/cluster.py b/screenshotClustering/cluster.py index 32b5b1d..13a651f 100755 --- a/screenshotClustering/cluster.py +++ b/screenshotClustering/cluster.py @@ -8,14 +8,19 @@ import time from bs4 import BeautifulSoup +try: + from urllib.parse import quote,unquote +except: + from urllib import quote,unquote + def addAttrToBag(attrName,url,link,wordBags,soup): for tag in soup.findAll('',{attrName:True}): if(isinstance(tag[attrName],str) or isinstance(tag[attrName],unicode)): - tagStr = tag[attrName].encode('utf-8').strip() + tagStr = tag[attrName].encode('ISO-8859-1').strip() elif(isinstance(tag[attrName],list)): - tagStr = tag[attrName][0].encode('utf-8').strip() + tagStr = tag[attrName][0].encode('ISO-8859-1').strip() else: - print '[-] Strange tag type detected - '+str(type(tag[attrName])) + print('[-] Strange tag type detected - '+str(type(tag[attrName]))) tagStr = 'XXXXXXXXX' if(tagStr != ''): @@ -46,9 +51,9 @@ def createWordBags(htmlList): wordBags={} for f in htmlList: - htmlContent = open(f,'r').read() + htmlContent = open(f,'r', encoding='ISO-8859-1').read() wordBags[f]={} - soup = BeautifulSoup(htmlContent) + soup = BeautifulSoup(htmlContent, 'html.parser') addAttrToBag('name',f,False,wordBags,soup) addAttrToBag('href',f,True,wordBags,soup) addAttrToBag('src',f,True,wordBags,soup) @@ -56,7 +61,6 @@ def createWordBags(htmlList): addAttrToBag('class',f,False,wordBags,soup) addTagToBag('title',f,False,wordBags,soup) addTagToBag('h1',f,False,wordBags,soup) - return wordBags def getNumWords(wordBag): @@ -73,11 +77,11 @@ def computeScore(wordBag1,wordBag2,debug=0): if(len(wordBag1) == 0 and len(wordBag2) == 0): if debug: - print 'Both have no words - return true' + print('Both have no words - return true') return 1 elif (len(wordBag1) == 0 or len(wordBag2) == 0): if debug: - print 'One has no words - return false' + print('One has no words - return false') return 0 for word in wordBag1.keys(): @@ -86,17 +90,17 @@ def computeScore(wordBag1,wordBag2,debug=0): score = (float(commonWords)/float(wordBag1Length)*(float(commonWords)/float(wordBag2Length))) if debug: - print "Common Words: "+str(commonWords) - print "WordBag1 Length: "+str(wordBag1Length) - print "WordBag2 Length: "+str(wordBag2Length) - print score + print("Common Words: "+str(commonWords)) + print("WordBag1 Length: "+str(wordBag1Length)) + print("WordBag2 Length: "+str(wordBag2Length)) + print(score) return score def createClusters(wordBags,threshold): clusterData = {} i = 0 - siteList = wordBags.keys() + siteList = list(wordBags.keys()) for i in range(0,len(siteList)): clusterData[siteList[i]] = [threshold, i] @@ -109,7 +113,6 @@ def createClusters(wordBags,threshold): if (clusterData[siteList[j]][0] <= threshold and score > clusterData[siteList[j]][0]): 
clusterData[siteList[j]][1] = i clusterData[siteList[j]][0] = score - return clusterData def getScopeHtml(scopeFile): @@ -121,56 +124,84 @@ def getScopeHtml(scopeFile): scopeText = scopeText + line+'
' return scopeText +def getPageTitle(htmlFile): + """Simple function to yank page title from html""" + with open(htmlFile, 'r', encoding='ISO-8859-1') as f: + soup = BeautifulSoup(f, "lxml") + try: + return soup.title.string.encode('ascii', 'ignore') + except AttributeError: + return "No Page Title Found" + def renderClusterHtml(clust,width,height,scopeFile=None): - html = '' - scopeHtml = getScopeHtml(scopeFile) - header = ''' - - Web Application Catalog - -

Web Application Catalog

- ''' - if(scopeHtml is not None): - header = header+scopeHtml - header = header + ''' - - -

Catalog:

- ''' - html = html+'' - - for cluster,siteList in clust.iteritems(): - html=html+'' - html=html+'' - for site in siteList: - html=html+'' - html=html+'' - html=html+'
'+site[site.rfind('/')+1:site.rfind('-')]+':'+site[site.rfind('-')+1:site.rfind('.')]+'
' - footer = '' - - return [header,html,footer] + html = '' + scopeHtml = getScopeHtml(scopeFile) + header = ''' + + Web Application Catalog + + ''' + if(scopeHtml is not None): + header = header+scopeHtml + header = header + ''' + + +
+

Web Application Catalog:

+
+ ''' + for cluster, siteList in clust.items(): + try: + title = getPageTitle(siteList[0]).decode("ISO-8859-1") + except (UnicodeDecodeError, AttributeError): + title = getPageTitle(siteList[0]) + html = html + """ + + + + + + + + """ + screenshotName = quote(siteList[0][0:-4], safe='./') + html = html + '
+ """ + title + """
' + for site in siteList: + screenshotName = quote(site[0:-5], safe='./') + if site != siteList[-1]: + html = html + f"" + else: + html = html + f"
" + + + footer = '' + return [header,html,footer] + + + def printJS(): js = """ function popUp(e,src) { - x = e.clientX; - y = e.clientY; - - var img = document.createElement("img"); - img.src = src; - img.setAttribute("class","popUp"); - img.setAttribute("style","position:fixed;left:"+(x+15)+";top:"+0+";background-color:white"); - //img.setAttribute("onmouseout","clearPopup(event)") - // This next line will just add it to the tag - document.body.appendChild(img); + x = e.clientX; + y = e.clientY; + + var img = document.createElement("img"); + img.src = src; + img.setAttribute("class","popUp"); + img.setAttribute("style","position:fixed;left:"+(x+15)+";top:"+0+";background-color:white"); + //img.setAttribute("onmouseout","clearPopup(event)") + // This next line will just add it to the tag + document.body.appendChild(img); } function clearPopup() { - var popUps = document.getElementsByClassName('popUp'); - while(popUps[0]) { - popUps[0].parentNode.removeChild(popUps[0]); - } + var popUps = document.getElementsByClassName('popUp'); + while(popUps[0]) { + popUps[0].parentNode.removeChild(popUps[0]); + } } """ @@ -178,12 +209,164 @@ def printJS(): f.write(js) f.close() +def printCSS(): + css = """ + @import url(http://fonts.googleapis.com/css?family=Roboto:400,500,700,300,100); + + body { + background-color: #3e94ec; + font-family: "Roboto", helvetica, arial, sans-serif; + font-size: 16px; + font-weight: 400; + text-rendering: optimizeLegibility; + } + + div.table-title { + display: block; + margin: auto; + max-width: 600px; + padding:5px; + width: 100%; + } + + .table-title h3 { + color: #fafafa; + font-size: 30px; + font-weight: 400; + font-style:normal; + font-family: "Roboto", helvetica, arial, sans-serif; + text-shadow: -1px -1px 1px rgba(0, 0, 0, 0.1); + text-transform:uppercase; + } + + + /*** Table Styles **/ + + .table-fill { + background: white; + border-radius:3px; + border-collapse: collapse; + height: 320px; + margin: auto; + margin-bottom: 50px; + max-width: 600px; + padding:5px; + width: 100%; + box-shadow: 0 5px 10px rgba(0, 0, 0, 0.1); + animation: float 5s infinite; + } + + th { + color:#D5DDE5;; + background:#1b1e24; + border-bottom:4px solid #9ea7af; + border-right: 1px solid #343a45; + font-size:23px; + font-weight: 100; + padding:24px; + text-align:left; + text-shadow: 0 1px 1px rgba(0, 0, 0, 0.1); + vertical-align:middle; + } + + th:first-child { + border-top-left-radius:3px; + } + + th:last-child { + border-top-right-radius:3px; + border-right:none; + } + + tr { + border-top: 1px solid #C1C3D1; + border-bottom-: 1px solid #C1C3D1; + color:#666B85; + font-size:16px; + font-weight:normal; + text-shadow: 0 1px 1px rgba(256, 256, 256, 0.1); + } + + tr:hover td { + background:#4E5066; + color:#FFFFFF; + border-top: 1px solid #22262e; + border-bottom: 1px solid #22262e; + } + + tr:first-child { + border-top:none; + } + + tr:last-child { + border-bottom:none; + } + + tr:nth-child(odd) td { + background:#EBEBEB; + } + + tr:nth-child(odd):hover td { + background:#4E5066; + } + + tr:last-child td:first-child { + border-bottom-left-radius:3px; + } + + tr:last-child td:last-child { + border-bottom-right-radius:3px; + } + + td { + background:#FFFFFF; + padding:20px; + text-align:left; + vertical-align:middle; + font-weight:300; + font-size:18px; + text-shadow: -1px -1px 1px rgba(0, 0, 0, 0.1); + border-right: 1px solid #C1C3D1; + } + + td:last-child { + border-right: 0px; + } + + th.text-left { + text-align: left; + } + + th.text-center { + text-align: center; + } + + th.text-right 
{ + text-align: right; + } + + td.text-left { + text-align: left; + } + + td.text-center { + text-align: center; + } + + td.text-right { + text-align: right; + } + """ + f = open('style.css','w') + f.write(css) + f.close() + def doCluster(htmlList): siteWordBags = createWordBags(htmlList) clusterData = createClusters(siteWordBags,0.6) clusterDict = {} - for site,data in clusterData.iteritems(): + for site,data in clusterData.items(): if data[1] in clusterDict: clusterDict[data[1]].append(site) else: @@ -261,7 +444,7 @@ def doDiff(htmlList,diffList): htmlRegex = re.compile('.*html.*') for fileName in os.listdir(path): if(htmlRegex.match(fileName)): - htmlList.append(path+fileName) + htmlList.append(path+fileName) n = len(htmlList) @@ -274,7 +457,7 @@ def doDiff(htmlList,diffList): diffList = [] for fileName in os.listdir(args.diff): if(htmlRegex.match(fileName)): - diffList.append(args.diff+fileName) + diffList.append(args.diff+fileName) lists = doDiff(htmlList,diffList) @@ -308,4 +491,5 @@ def doDiff(htmlList,diffList): f = open(args.output,'w') f.write(html) printJS() + printCSS() diff --git a/screenshotClustering/style.css b/screenshotClustering/style.css new file mode 100644 index 0000000..0cb288d --- /dev/null +++ b/screenshotClustering/style.css @@ -0,0 +1,147 @@ + + @import url(http://fonts.googleapis.com/css?family=Roboto:400,500,700,300,100); + + body { + background-color: #3e94ec; + font-family: "Roboto", helvetica, arial, sans-serif; + font-size: 16px; + font-weight: 400; + text-rendering: optimizeLegibility; + } + + div.table-title { + display: block; + margin: auto; + max-width: 600px; + padding:5px; + width: 100%; + } + + .table-title h3 { + color: #fafafa; + font-size: 30px; + font-weight: 400; + font-style:normal; + font-family: "Roboto", helvetica, arial, sans-serif; + text-shadow: -1px -1px 1px rgba(0, 0, 0, 0.1); + text-transform:uppercase; + } + + + /*** Table Styles **/ + + .table-fill { + background: white; + border-radius:3px; + border-collapse: collapse; + height: 320px; + margin: auto; + margin-bottom: 50px; + max-width: 600px; + padding:5px; + width: 100%; + box-shadow: 0 5px 10px rgba(0, 0, 0, 0.1); + animation: float 5s infinite; + } + + th { + color:#D5DDE5;; + background:#1b1e24; + border-bottom:4px solid #9ea7af; + border-right: 1px solid #343a45; + font-size:23px; + font-weight: 100; + padding:24px; + text-align:left; + text-shadow: 0 1px 1px rgba(0, 0, 0, 0.1); + vertical-align:middle; + } + + th:first-child { + border-top-left-radius:3px; + } + + th:last-child { + border-top-right-radius:3px; + border-right:none; + } + + tr { + border-top: 1px solid #C1C3D1; + border-bottom-: 1px solid #C1C3D1; + color:#666B85; + font-size:16px; + font-weight:normal; + text-shadow: 0 1px 1px rgba(256, 256, 256, 0.1); + } + + tr:hover td { + background:#4E5066; + color:#FFFFFF; + border-top: 1px solid #22262e; + border-bottom: 1px solid #22262e; + } + + tr:first-child { + border-top:none; + } + + tr:last-child { + border-bottom:none; + } + + tr:nth-child(odd) td { + background:#EBEBEB; + } + + tr:nth-child(odd):hover td { + background:#4E5066; + } + + tr:last-child td:first-child { + border-bottom-left-radius:3px; + } + + tr:last-child td:last-child { + border-bottom-right-radius:3px; + } + + td { + background:#FFFFFF; + padding:20px; + text-align:left; + vertical-align:middle; + font-weight:300; + font-size:18px; + text-shadow: -1px -1px 1px rgba(0, 0, 0, 0.1); + border-right: 1px solid #C1C3D1; + } + + td:last-child { + border-right: 0px; + } + + th.text-left { + 
text-align: left; + } + + th.text-center { + text-align: center; + } + + th.text-right { + text-align: right; + } + + td.text-left { + text-align: left; + } + + td.text-center { + text-align: center; + } + + td.text-right { + text-align: right; + } + \ No newline at end of file diff --git a/setup.cfg b/setup.cfg new file mode 100644 index 0000000..2bcd70e --- /dev/null +++ b/setup.cfg @@ -0,0 +1,2 @@ +[flake8] +max-line-length = 88
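
For reference, the page-similarity score computed by computeScore() in screenshotClustering/cluster.py is the product of the two word bags' overlap ratios, and doCluster() groups pages whose score exceeds the 0.6 threshold. A small self-contained sketch of that formula (assuming getNumWords() simply counts the words in a bag; its body is not shown in this diff):

    def score(bag1, bag2):
        # mirrors computeScore(): two empty pages are identical, one empty page never matches
        if not bag1 and not bag2:
            return 1.0
        if not bag1 or not bag2:
            return 0.0
        common = sum(1 for word in bag1 if word in bag2)
        return (common / len(bag1)) * (common / len(bag2))

    login_a = {"username": 1, "password": 1, "login.css": 1, "submit": 1}
    login_b = {"username": 1, "password": 1, "login.css": 1, "forgot": 1}
    print(score(login_a, login_b))  # 0.5625 -> just under the 0.6 clustering cut-off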