diff --git a/Dockerfile b/Dockerfile index cbd71ed..43bc347 100644 --- a/Dockerfile +++ b/Dockerfile @@ -1,8 +1,8 @@ -# docker pull andmyhacks/httpscreenshot +# docker pull jesseosiecki/httpscreenshot -FROM ubuntu:latest +FROM ubuntu:20.04 -MAINTAINER Keith Hoodlet +MAINTAINER Jesse Osiecki RUN mkdir -p /etc/httpscreenshot WORKDIR /etc/httpscreenshot @@ -10,7 +10,7 @@ WORKDIR /etc/httpscreenshot COPY . /etc/httpscreenshot/ RUN apt-get update -RUN apt-get install -y wget libfontconfig vim +RUN apt-get install -y wget libfontconfig RUN ./install-dependencies.sh @@ -19,3 +19,5 @@ RUN ln -s /etc/httpscreenshot/httpscreenshot.py /usr/bin/httpscreenshot RUN mkdir -p /etc/httpscreenshot/images WORKDIR /etc/httpscreenshot/images + +ENTRYPOINT ["httpscreenshot"] diff --git a/README.md b/README.md index eaae0dd..2d87285 100644 --- a/README.md +++ b/README.md @@ -1,12 +1,17 @@ # httpscreenshot +### Installation via Docker + +`docker pull jesseosiecki/httpscreenshot` +`docker run jesseosiecki/httpscreenshot` + ### Installation on Ubuntu #### Via Script Run `install-dependencies.sh` script as root. -This script has been tested on Ubuntu 14.04. +This script has been tested on Ubuntu 20.04 as *root* (sudo). ### Manually diff --git a/httpscreenshot.py b/httpscreenshot.py index 539c5a4..6794d8d 100644 --- a/httpscreenshot.py +++ b/httpscreenshot.py @@ -1,55 +1,49 @@ -#!/usr/bin/python +#!/usr/bin/python3 -''' -Installation on Ubuntu: -apt-get install python-requests python-m2crypto phantomjs -If you run into: 'module' object has no attribute 'PhantomJS' -then pip install selenium (or pip install --upgrade selenium) -''' - -from selenium import webdriver -from urlparse import urlparse -from random import shuffle -from PIL import Image -from PIL import ImageDraw -from PIL import ImageFont -from libnmap.parser import NmapParser -import multiprocessing -import Queue import argparse -import sys -import traceback +import hashlib +import multiprocessing import os.path -import ssl -import M2Crypto +import queue import re -import time -import signal import shutil -import hashlib -from pyvirtualdisplay import Display -from selenium.webdriver.common.desired_capabilities import DesiredCapabilities +import signal +import ssl +import sys +import time +import traceback +from importlib import reload +from random import shuffle +from urllib.parse import urlparse +from collections import defaultdict + +import warnings +warnings.filterwarnings("ignore") +import M2Crypto +from PIL import Image, ImageDraw, ImageFont +from pyvirtualdisplay import Display +from selenium import webdriver +from selenium.webdriver.chrome.options import Options +from selenium.webdriver.chrome.service import Service +from selenium.webdriver.common.desired_capabilities import DesiredCapabilities +from webdriver_manager.chrome import ChromeDriverManager try: from urllib.parse import quote -except: - from urllib import quote +except Exception: + from urllib.parse import quote try: - import requesocks as requests -except: - print "requesocks library not found - proxy support will not be available" - import requests + import requesocks as requests +except Exception: + import requests reload(sys) -sys.setdefaultencoding("utf8") def timeoutFn(func, args=(), kwargs={}, timeout_duration=1, default=None): - import signal - class TimeoutError(Exception): pass @@ -70,515 +64,619 @@ def handler(signum, frame): def addUrlsForService(host, urlList, servicesList, scheme): - if(servicesList == None or servicesList == []): - return - for service in servicesList: - 
state = service.findPreviousSibling("state") - if(state != None and state != [] and state['state'] == 'open'): - urlList.append(scheme+host+':'+str(service.parent['portid'])) + if servicesList is None or servicesList == []: + return + for service in servicesList: + state = service.findPreviousSibling("state") + if state is not None and state != [] and state["state"] == "open": + urlList.append(scheme + host + ":" + str(service.parent["portid"])) def detectFileType(inFile): - #Check to see if file is of type gnmap - firstLine = inFile.readline() - secondLine = inFile.readline() - thirdLine = inFile.readline() - - #Be polite and reset the file pointer - inFile.seek(0) - - if ((firstLine.find('nmap') != -1 or firstLine.find('Masscan') != -1) and thirdLine.find('Host:') != -1): - #Looks like a gnmap file - this wont be true for other nmap output types - #Check to see if -sV flag was used, if not, warn - if(firstLine.find('-sV') != -1 or firstLine.find('-A') != -1): - return 'gnmap' - else: - print("Nmap version detection not used! Discovery module may miss some hosts!") - return 'gnmap' - elif ((firstLine.find('xml version') != -1) and secondLine.find('DOCTYPE nmaprun') != -1): - return 'xml' - else: - return None + # Check to see if file is of type gnmap + firstLine = inFile.readline() + secondLine = inFile.readline() + thirdLine = inFile.readline() + + # Be polite and reset the file pointer + inFile.seek(0) + + if (firstLine.find("nmap") != -1 or firstLine.find("Masscan") != -1) and thirdLine.find("Host:") != -1: + # Looks like a gnmap file - this wont be true for other nmap output types + # Check to see if -sV flag was used, if not, warn + if firstLine.find("-sV") != -1 or firstLine.find("-A") != -1: + return "gnmap" + else: + print( + "Nmap version detection not used! Discovery module may miss some hosts!" + ) + return "gnmap" + elif (firstLine.find("xml version") != + -1) and (secondLine.find("DOCTYPE nmaprun") != -1 + or secondLine.find("masscan") != -1): + return "xml" + else: + return None + def parsexml(inFile): - targets = {} - infile = NmapParser.parse_fromfile(args.input) - for host in infile.hosts: - if host.services: - currentTarget = [] - for s in host.services: - if s.state != 'closed' and 'http' in s.service: - ip = host.address - port = str(s.port) - https = False - if 'https' in s.service or 'ssl' in s.service: - https = True - - currentTarget.append([port,https]) - - if(len(currentTarget) > 0): - targets[ip] = currentTarget - - return targets - print "Parsing is complete, continue on..." + import xml.etree.ElementTree as ET + + tree = ET.parse(inFile) + root = tree.getroot() + + targets = defaultdict(list) + + for host in root.findall("host"): + ip = host.find('address').get('addr') + + for port in host.find('ports').findall("port"): + if port.find("state").get("state") == "open": + targets[ip].append(port.get("portid")) + + return targets + def parseGnmap(inFile, autodetect): - ''' - Parse a gnmap file into a dictionary. The dictionary key is the ip address or hostname. - Each key item is a list of ports and whether or not that port is https/ssl. 
For example: - >>> targets - {'127.0.0.1': [[443, True], [8080, False]]} - ''' - targets = {} - for hostLine in inFile: - if hostLine.strip() == '': - break - currentTarget = [] - #Pull out the IP address (or hostnames) and HTTP service ports - fields = hostLine.split(' ') - ip = fields[1] #not going to regex match this with ip address b/c could be a hostname - for item in fields: - #Make sure we have an open port with an http type service on it - if (item.find('http') != -1 or autodetect) and re.findall('\d+/open',item): - port = None - https = False - ''' - nmap has a bunch of ways to list HTTP like services, for example: - 8089/open/tcp//ssl|http - 8000/closed/tcp//http-alt/// - 8008/closed/tcp//http/// - 8080/closed/tcp//http-proxy// - 443/open/tcp//ssl|https?/// - 8089/open/tcp//ssl|http - Since we want to detect them all, let's just match on the word http - and make special cases for things containing https and ssl when we - construct the URLs. - ''' - port = item.split('/')[0] - - if item.find('https') != -1 or item.find('ssl') != -1: - https = True - #Add the current service item to the currentTarget list for this host - currentTarget.append([port,https]) - - if(len(currentTarget) > 0): - targets[ip] = currentTarget - return targets - - -def setupBrowserProfile(headless,proxy): - browser = None - if(proxy is not None): - service_args=['--ignore-ssl-errors=true','--ssl-protocol=any','--proxy='+proxy,'--proxy-type=socks5'] - else: - service_args=['--ignore-ssl-errors=true','--ssl-protocol=any'] - - while(browser is None): - try: - if(not headless): - capabilities = DesiredCapabilities.FIREFOX - capabilities['acceptSslCerts'] = True - fp = webdriver.FirefoxProfile() - fp.set_preference("webdriver.accept.untrusted.certs",True) - fp.set_preference("security.enable_java", False) - fp.set_preference("webdriver.load.strategy", "fast"); - if(proxy is not None): - proxyItems = proxy.split(":") - fp.set_preference("network.proxy.socks",proxyItems[0]) - fp.set_preference("network.proxy.socks_port",int(proxyItems[1])) - fp.set_preference("network.proxy.type",1) - browser = webdriver.Firefox(firefox_profile=fp,capabilities=capabilities) - else: - browser = webdriver.PhantomJS(service_args=service_args, executable_path="phantomjs") - browser.set_window_size(1024, 768) - - except Exception as e: - print e - time.sleep(1) - continue - return browser + hostRe = re.compile('Host:\s*[^\s]+') + servicesRe = re.compile('Ports:\s*.*') + + targets = defaultdict(list) + + for hostLine in inFile: + if hostLine.strip() == "": + break + # Pull out the IP address (or hostnames) and HTTP service ports + + ipHostRes = hostRe.search(hostLine) + + if ipHostRes is None: + continue + + ipHost = ipHostRes.group() + ip = ipHost.split(':')[1].strip() + + try: + services = servicesRe.search(hostLine).group().split() + except: + continue + + for item in services: + # Make sure we have an open port with an http type service on it + if re.findall("\d+/open", item): + port = None + https = False + """ + nmap has a bunch of ways to list HTTP like services, for example: + 8089/open/tcp//ssl|http + 8000/closed/tcp//http-alt/// + 8008/closed/tcp//http/// + 8080/closed/tcp//http-proxy// + 443/open/tcp//ssl|https?/// + 8089/open/tcp//ssl|http + Since we want to detect them all, let's just match on the word http + and make special cases for things containing https and ssl when we + construct the URLs. 
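                As a concrete illustration (hypothetical scan line, not taken from the repo),
                a gnmap entry of the form
                    Host: 10.0.0.5 ()  Ports: 443/open/tcp//ssl|https?///, 8080/open/tcp//http-proxy///
                parses to targets['10.0.0.5'] == ['443', '8080']: each item matching
                \d+/open contributes item.split("/")[0] to targets[ip].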
+ """ + port = item.split("/")[0] + targets[ip].append(port) + + return targets + + +def setupBrowserProfile(headless, proxy, browserType): + browser = None + if (proxy is not None): + service_args = ['--ignore-ssl-errors=true', '--ssl-protocol=any', '--proxy=' + proxy, '--proxy-type=socks5'] + else: + service_args = ['--ignore-ssl-errors=true', '--ssl-protocol=any'] + + while (browser is None): + try: + if (browserType == 'Chrome' or browserType == 'Chromium'): + service = Service(ChromeDriverManager(log_level=0).install()) + coptions = Options() + if headless: + coptions.add_argument("--headless") + coptions.add_argument("--no-sandbox") + coptions.add_argument("--window-size=1024x768") + coptions.add_argument("--ignore-certificate-errors") + coptions.add_argument("--ssl-version-min=tls1") + + browser = webdriver.Chrome(service=service, options=coptions) + else: + capabilities = DesiredCapabilities.FIREFOX + capabilities['acceptSslCerts'] = True + fp = webdriver.FirefoxProfile() + fp.set_preference("webdriver.accept.untrusted.certs", True) + fp.set_preference("security.enable_java", False) + fp.set_preference("webdriver.load.strategy", "fast"); + if (proxy is not None): + proxyItems = proxy.split(":") + fp.set_preference("network.proxy.socks", proxyItems[0]) + fp.set_preference("network.proxy.socks_port", int(proxyItems[1])) + fp.set_preference("network.proxy.type", 1) + + fireFoxOptions = webdriver.FirefoxOptions() + if headless: + fireFoxOptions.headless = True + + browser = webdriver.Firefox(firefox_profile=fp, + capabilities=capabilities, + options=fireFoxOptions) + browser.set_window_size(1024, 768) + + except Exception as e: + print(e) + time.sleep(1) + continue + return browser def writeImage(text, filename, fontsize=40, width=1024, height=200): - image = Image.new("RGBA", (width,height), (255,255,255)) - draw = ImageDraw.Draw(image) - if (os.path.exists("/usr/share/httpscreenshot/LiberationSerif-BoldItalic.ttf")): - font_path = "/usr/share/httpscreenshot/LiberationSerif-BoldItalic.ttf" + image = Image.new("RGBA", (width, height), (255, 255, 255)) + draw = ImageDraw.Draw(image) + if os.path.exists( + "/usr/share/httpscreenshot/LiberationSerif-BoldItalic.ttf"): + font_path = "/usr/share/httpscreenshot/LiberationSerif-BoldItalic.ttf" + else: + font_path = (os.path.dirname(os.path.realpath(__file__)) + + "/LiberationSerif-BoldItalic.ttf") + font = ImageFont.truetype(font_path, fontsize) + draw.text((10, 0), text, (0, 0, 0), font=font) + image.save(filename) + + +def worker( + urlQueue, + tout, + debug, + headless, + doProfile, + vhosts, + subs, + extraHosts, + tryGUIOnFail, + smartFetch, + proxy, + browserType +): + if debug: + print("[*] Starting worker") + + browser = None + display = None + try: + if tryGUIOnFail or not headless: + display = Display(visible=0, size=(800, 600)) + display.start() + + browser = setupBrowserProfile(headless, proxy, browserType) + + except Exception: + print("[-] Oh no! Couldn't create the browser, Selenium blew up") + exc_type, exc_value, exc_traceback = sys.exc_info() + lines = traceback.format_exception(exc_type, exc_value, exc_traceback) + print("".join("!! 
" + line for line in lines)) + browser.quit() + display.stop() + return + + while True: + # Try to get a URL from the Queue + if urlQueue.qsize() > 0: + try: + curUrl = urlQueue.get(timeout=tout) + except queue.Empty: + continue + print("[+] " + str(urlQueue.qsize()) + " URLs remaining") + screenshotName = quote(curUrl, safe="") + if debug: + print("[+] Got URL: " + curUrl) + print("[+] screenshotName: " + screenshotName) + if os.path.exists(screenshotName + ".png"): + if debug: + print("[-] Screenshot already exists, skipping") + continue else: - font_path = os.path.dirname(os.path.realpath(__file__))+"/LiberationSerif-BoldItalic.ttf" - font = ImageFont.truetype(font_path, fontsize) - draw.text((10, 0), text, (0,0,0), font=font) - image.save(filename) - - -def worker(urlQueue, tout, debug, headless, doProfile, vhosts, subs, extraHosts, tryGUIOnFail, smartFetch,proxy): - if(debug): - print '[*] Starting worker' - - browser = None - display = None - try: - if(tryGUIOnFail or not headless): - display = Display(visible=0, size=(800, 600)) - display.start() - - browser = setupBrowserProfile(headless,proxy) - - except: - print "[-] Oh no! Couldn't create the browser, Selenium blew up" - exc_type, exc_value, exc_traceback = sys.exc_info() - lines = traceback.format_exception(exc_type, exc_value, exc_traceback) - print ''.join('!! ' + line for line in lines) - browser.quit() - display.stop() - return - - while True: - #Try to get a URL from the Queue - if urlQueue.qsize() > 0: - try: - curUrl = urlQueue.get(timeout=tout) - except Queue.Empty: - continue - print '[+] '+str(urlQueue.qsize())+' URLs remaining' - screenshotName = quote(curUrl[0], safe='') - if(debug): - print '[+] Got URL: '+curUrl[0] - print '[+] screenshotName: '+screenshotName - if(os.path.exists(screenshotName+".png")): - if(debug): - print "[-] Screenshot already exists, skipping" - continue - else: - if(debug): - print'[-] URL queue is empty, quitting.' - browser.quit() - return - - try: - if(doProfile): - [resp,curUrl] = autodetectRequest(curUrl, timeout=tout, vhosts=vhosts, urlQueue=urlQueue, subs=subs, extraHosts=extraHosts,proxy=proxy) - else: - resp = doGet(curUrl, verify=False, timeout=tout, vhosts=vhosts, urlQueue=urlQueue, subs=subs, extraHosts=extraHosts,proxy=proxy) - if(resp is not None and resp.status_code == 401): - print curUrl[0]+" Requires HTTP Basic Auth" - f = open(screenshotName+".html",'w') - f.write(resp.headers.get('www-authenticate','NONE')) - f.write('Basic Auth') - f.close() - writeImage(resp.headers.get('www-authenticate','NO WWW-AUTHENTICATE HEADER'),screenshotName+".png") - continue - - elif(resp is not None): - if(resp.text is not None): - resp_hash = hashlib.md5(resp.text).hexdigest() - else: - resp_hash = None - - if smartFetch and resp_hash is not None and resp_hash in hash_basket: - #We have this exact same page already, copy it instead of grabbing it again - print "[+] Pre-fetch matches previously imaged service, no need to do it again!" 
- shutil.copy2(hash_basket[resp_hash]+".html",screenshotName+".html") - shutil.copy2(hash_basket[resp_hash]+".png",screenshotName+".png") - else: - if smartFetch: - hash_basket[resp_hash] = screenshotName - - - #browser.set_window_size(1024, 768) - browser.set_page_load_timeout((tout)) - old_url = browser.current_url - browser.get(curUrl[0].strip()) - if(browser.current_url == old_url): - print "[-] Error fetching in browser but successfully fetched with Requests: "+curUrl[0] - if(headless): - browser2 = None - if(debug): - print "[+] Trying with sslv3 instead of TLS - known phantomjs bug: "+curUrl[0] - if(proxy is not None): - browser2 = webdriver.PhantomJS(service_args=['--ignore-ssl-errors=true','--proxy='+proxy,'--proxy-type=socks5'], executable_path="phantomjs") - else: - browser2 = webdriver.PhantomJS(service_args=['--ignore-ssl-errors=true'], executable_path="phantomjs") - #print "Launched browser2: "+str(browser2.service.process.pid) - - old_url = browser2.current_url - try: - browser2.get(curUrl[0].strip()) - if(browser2.current_url == old_url): - if(debug): - print "[-] Didn't work with SSLv3 either..."+curUrl[0] - browser2.quit() - else: - print '[+] Saving: '+screenshotName - html_source = browser2.page_source - f = open(screenshotName+".html",'w') - f.write(html_source) - f.close() - browser2.save_screenshot(screenshotName+".png") - browser2.quit() - continue - except: - browser2.quit() - print "[-] Didn't work with SSLv3 either - exception..."+curUrl[0] - - if(tryGUIOnFail and headless): - display = Display(visible=0, size=(1024, 768)) - display.start() - print "[+] Attempting to fetch with FireFox: "+curUrl[0] - browser2 = setupBrowserProfile(False,proxy) - old_url = browser2.current_url - try: - browser2.get(curUrl[0].strip()) - if(browser2.current_url == old_url): - print "[-] Error fetching in GUI browser as well..."+curUrl[0] - browser2.quit() - continue - else: - print '[+] Saving: '+screenshotName - html_source = browser2.page_source - f = open(screenshotName+".html",'w') - f.write(html_source) - f.close() - browser2.save_screenshot(screenshotName+".png") - browser2.quit() - continue - except: - browser2.quit() - display.stop() - print "[-] Error fetching in GUI browser as well..."+curUrl[0] - - else: - continue - - print '[+] Saving: '+screenshotName - html_source = browser.page_source - f = open(screenshotName+".html",'w') - f.write(html_source) - f.close() - browser.save_screenshot(screenshotName+".png") - - except Exception as e: - print e - print '[-] Something bad happened with URL: '+curUrl[0] - if(curUrl[2] > 0): - curUrl[2] = curUrl[2] - 1; - urlQueue.put(curUrl) - if(debug): - exc_type, exc_value, exc_traceback = sys.exc_info() - lines = traceback.format_exception(exc_type, exc_value, exc_traceback) - print ''.join('!! 
' + line for line in lines) - browser.quit() - browser = setupBrowserProfile(headless,proxy) - continue - browser.quit() - display.stop() + if debug: + print("[-] URL queue is empty, quitting.") + browser.quit() + return + + try: + if doProfile: + [resp, curUrl] = autodetectRequest( + curUrl, + timeout=tout, + vhosts=vhosts, + urlQueue=urlQueue, + subs=subs, + extraHosts=extraHosts, + proxy=proxy, + ) + else: + resp = doGet( + curUrl, + verify=False, + timeout=tout, + vhosts=vhosts, + urlQueue=urlQueue, + subs=subs, + extraHosts=extraHosts, + proxy=proxy, + ) + if resp is not None and resp.status_code == 401: + print(curUrl + " Requires HTTP Basic Auth") + f = open(screenshotName + ".html", "w") + f.write(resp.headers.get("www-authenticate", "NONE")) + f.write("Basic Auth") + f.close() + writeImage( + resp.headers.get("www-authenticate", + "NO WWW-AUTHENTICATE HEADER"), + screenshotName + ".png", + ) + continue + + elif resp is not None: + if resp.text is not None: + resp_hash = hashlib.md5( + resp.text.encode('utf-8')).hexdigest() + else: + resp_hash = None + + if smartFetch and resp_hash is not None and resp_hash in hash_basket: + # We have this exact same page already, copy it instead of grabbing it again + print( + "[+] Pre-fetch matches previously imaged service, no need to do it again!" + ) + shutil.copy2(hash_basket[resp_hash] + ".html", + screenshotName + ".html") + shutil.copy2(hash_basket[resp_hash] + ".png", + screenshotName + ".png") + else: + if smartFetch: + hash_basket[resp_hash] = screenshotName + + browser.set_page_load_timeout((tout)) + old_url = browser.current_url + browser.get(curUrl.strip()) + if browser.current_url == old_url: + print( + "[-] Error fetching in browser but successfully fetched with Requests: " + + curUrl) + if tryGUIOnFail and headless: + display = Display(visible=0, size=(1024, 768)) + display.start() + print("[+] Attempting to fetch with FireFox: " + + curUrl) + browser2 = setupBrowserProfile(False, proxy, "Firefox") + old_url = browser2.current_url + try: + browser2.get(curUrl.strip()) + if browser2.current_url == old_url: + print( + "[-] Error fetching in GUI browser as well..." + + curUrl) + browser2.quit() + continue + else: + print("[+] Saving: " + screenshotName) + html_source = browser2.page_source + f = open(screenshotName + ".html", "w") + f.write(html_source) + f.close() + browser2.save_screenshot(screenshotName + + ".png") + browser2.quit() + continue + except Exception: + browser2.quit() + display.stop() + print( + "[-] Error fetching in GUI browser as well..." + + curUrl) + + else: + continue + + print("[+] Saving: " + screenshotName) + html_source = browser.page_source + f = open(screenshotName + ".html", "w") + f.write(html_source) + f.close() + browser.save_screenshot(screenshotName + ".png") + + except Exception as e: + if debug: + exc_type, exc_value, exc_traceback = sys.exc_info() + lines = traceback.format_exception(exc_type, exc_value, + exc_traceback) + print("".join("!! 
" + line for line in lines)) + browser.quit() + browser = setupBrowserProfile(headless, proxy, "Firefox") + continue + browser.quit() + display.stop() + def doGet(*args, **kwargs): - url = args[0] - doVhosts = kwargs.pop('vhosts' ,None) - urlQueue = kwargs.pop('urlQueue' ,None) - subs = kwargs.pop('subs' ,None) - extraHosts = kwargs.pop('extraHosts',None) - proxy = kwargs.pop('proxy',None) - - kwargs['allow_redirects'] = False - session = requests.session() - if(proxy is not None): - session.proxies={'http':'socks5://'+proxy,'https':'socks5://'+proxy} - resp = session.get(url[0],**kwargs) - - #If we have an https URL and we are configured to scrape hosts from the cert... - if(url[0].find('https') != -1 and url[1] == True): - #Pull hostnames from cert, add as additional URLs and flag as not to pull certs - host = urlparse(url[0]).hostname - port = urlparse(url[0]).port - if(port is None): - port = 443 - names = [] - try: - cert = ssl.get_server_certificate((host,port),ssl_version=ssl.PROTOCOL_SSLv23) - x509 = M2Crypto.X509.load_cert_string(cert.decode('string_escape')) - subjText = x509.get_subject().as_text() - names = re.findall("CN=([^\s]+)",subjText) - altNames = x509.get_ext('subjectAltName').get_value() - names.extend(re.findall("DNS:([^,]*)",altNames)) - except: - pass - - for name in names: - if(name.find('*.') != -1): - for sub in subs: - try: - sub = sub.strip() - hostname = name.replace('*.',sub+'.') - if(hostname not in extraHosts): - extraHosts[hostname] = 1 - address = socket.gethostbyname(hostname) - urlQueue.put(['https://'+hostname+':'+str(port),False,url[2]]) - print '[+] Discovered subdomain '+address - except: - pass - name = name.replace('*.','') - if(name not in extraHosts): - extraHosts[name] = 1 - urlQueue.put(['https://'+name+':'+str(port),False,url[2]]) - print '[+] Added host '+name - else: - if (name not in extraHosts): - extraHosts[name] = 1 - urlQueue.put(['https://'+name+':'+str(port),False,url[2]]) - print '[+] Added host '+name - return resp - else: - return resp - - -def autodetectRequest(url, timeout, vhosts=False, urlQueue=None, subs=None, extraHosts=None,proxy=None): - '''Takes a URL, ignores the scheme. 
Detect if the host/port is actually an HTTP or HTTPS - server''' - resp = None - host = urlparse(url[0]).hostname - port = urlparse(url[0]).port - - if(port is None): - if('https' in url[0]): - port = 443 - else: - port = 80 - - try: - #cert = ssl.get_server_certificate((host,port)) - - cert = timeoutFn(ssl.get_server_certificate,kwargs={'addr':(host,port),'ssl_version':ssl.PROTOCOL_SSLv23},timeout_duration=3) - - if(cert is not None): - if('https' not in url[0]): - url[0] = url[0].replace('http','https') - #print 'Got cert, changing to HTTPS '+url[0] - - else: - url[0] = url[0].replace('https','http') - #print 'Changing to HTTP '+url[0] - - - except Exception as e: - url[0] = url[0].replace('https','http') - #print 'Changing to HTTP '+url[0] - try: - resp = doGet(url,verify=False, timeout=timeout, vhosts=vhosts, urlQueue=urlQueue, subs=subs, extraHosts=extraHosts, proxy=proxy) - except Exception as e: - print 'HTTP GET Error: '+str(e) - print url[0] - - return [resp,url] + url = args[0] + doVhosts = kwargs.pop("vhosts", None) + urlQueue = kwargs.pop("urlQueue", None) + subs = kwargs.pop("subs", None) + extraHosts = kwargs.pop("extraHosts", None) + proxy = kwargs.pop("proxy", None) + + kwargs["allow_redirects"] = False + session = requests.session() + if proxy is not None: + session.proxies = { + "http": "socks5://" + proxy, + "https": "socks5://" + proxy + } + resp = session.get(url, **kwargs) + + # If we have an https URL and we are configured to scrape hosts from the cert... + if url.find("https") != -1 and doVhosts is True: + # Pull hostnames from cert, add as additional URLs and flag as not to pull certs + host = urlparse(url).hostname + port = urlparse(url).port + if port is None: + port = 443 + names = [] + try: + cert = ssl.get_server_certificate((host, port)) + x509 = M2Crypto.X509.load_cert_string(cert) + subjText = x509.get_subject().as_text() + names = re.findall("CN=([^\s]+)", subjText) + altNames = x509.get_ext("subjectAltName").get_value() + names.extend(re.findall("DNS:([^,]*)", altNames)) + except Exception as e: + print(e) + + for name in names: + if name not in extraHosts: + extraHosts[name] = 1 + urlQueue.put(f"https://{name}:{port}") + print(f"[+] Added host https://{name}:{port}") + return resp + else: + return resp + + +def autodetectRequest(url, + timeout, + vhosts=False, + urlQueue=None, + subs=None, + extraHosts=None, + proxy=None): + """Takes a URL, ignores the scheme. 
Detect if the host/port is actually an HTTP or HTTPS + server""" + resp = None + host = urlparse(url).hostname + port = urlparse(url).port + + try: + # cert = ssl.get_server_certificate((host,port)) + + cert = timeoutFn( + ssl.get_server_certificate, + kwargs={"addr": (host, port)}, + timeout_duration=3, + ) + + if cert is not None: + if "https" not in url: + url = url.replace("http://", "https://") + else: + url = url.replace("https://", "http://") + + except Exception: + url = url.replace("https://", "http://") + try: + resp = doGet( + url, + verify=False, + timeout=timeout, + vhosts=vhosts, + urlQueue=urlQueue, + subs=subs, + extraHosts=extraHosts, + proxy=proxy, + ) + except Exception as e: + print("HTTP GET Error: " + str(e)) + print(url) + + return [resp, url] def sslError(e): - if('the handshake operation timed out' in str(e) or 'unknown protocol' in str(e) or 'Connection reset by peer' in str(e) or 'EOF occurred in violation of protocol' in str(e)): - return True - else: - return False + if ("the handshake operation timed out" in str(e) + or "unknown protocol" in str(e) + or "Connection reset by peer" in str(e) + or "EOF occurred in violation of protocol" in str(e)): + return True + else: + return False + def signal_handler(signal, frame): - print "[-] Ctrl-C received! Killing Thread(s)..." - os._exit(0) + print("[-] Ctrl-C received! Killing Thread(s)...") + os._exit(0) + + signal.signal(signal.SIGINT, signal_handler) -if __name__ == '__main__': - parser = argparse.ArgumentParser() - - parser.add_argument("-l","--list",help='List of input URLs') - parser.add_argument("-i","--input",help='nmap gnmap/xml output file') - parser.add_argument("-p","--headless",action='store_true',default=False,help='Run in headless mode (using phantomjs)') - parser.add_argument("-w","--workers",default=1,type=int,help='number of threads') - parser.add_argument("-t","--timeout",type=int,default=10,help='time to wait for pageload before killing the browser') - parser.add_argument("-v","--verbose",action='store_true',default=False,help='turn on verbose debugging') - parser.add_argument("-a","--autodetect",action='store_true',default=False,help='Automatically detect if listening services are HTTP or HTTPS. 
Ignores NMAP service detction and URL schemes.') - parser.add_argument("-vH","--vhosts",action='store_true',default=False,help='Attempt to scrape hostnames from SSL certificates and add these to the URL queue') - parser.add_argument("-dB","--dns_brute",help='Specify a DNS subdomain wordlist for bruteforcing on wildcard SSL certs') - parser.add_argument("-uL","--uri_list",help='Specify a list of URIs to fetch in addition to the root') - parser.add_argument("-r","--retries",type=int,default=0,help='Number of retries if a URL fails or timesout') - parser.add_argument("-tG","--trygui",action='store_true',default=False,help='Try to fetch the page with FireFox when headless fails') - parser.add_argument("-sF","--smartfetch",action='store_true',default=False,help='Enables smart fetching to reduce network traffic, also increases speed if certain conditions are met.') - parser.add_argument("-pX","--proxy",default=None,help='SOCKS5 Proxy in host:port format') - - - args = parser.parse_args() - - if(len(sys.argv) < 2): - parser.print_help() - sys.exit(0) - - - #read in the URI list if specificed - uris = [''] - if(args.uri_list != None): - uris = open(args.uri_list,'r').readlines() - uris.append('') - - if(args.input is not None): - inFile = open(args.input,'rU') - if(detectFileType(inFile) == 'gnmap'): - hosts = parseGnmap(inFile,args.autodetect) - urls = [] - for host,ports in hosts.items(): - for port in ports: - for uri in uris: - url = '' - if port[1] == True: - url = ['https://'+host+':'+port[0]+uri.strip(),args.vhosts,args.retries] - else: - url = ['http://'+host+':'+port[0]+uri.strip(),args.vhosts,args.retries] - urls.append(url) - elif(detectFileType(inFile) == 'xml'): - hosts = parsexml(inFile) - urls = [] - for host,ports in hosts.items(): - for port in ports: - for uri in uris: - url = '' - if port[1] == True: - url = ['https://'+host+':'+port[0]+uri.strip(),args.vhosts,args.retries] - else: - url = ['http://'+host+':'+port[0]+uri.strip(),args.vhosts,args.retries] - urls.append(url) - else: - print 'Invalid input file - must be Nmap GNMAP or Nmap XML' - - elif (args.list is not None): - f = open(args.list,'r') - lst = f.readlines() - urls = [] - for url in lst: - urls.append([url.strip(),args.vhosts,args.retries]) - else: - print "No input specified" - sys.exit(0) - - - #shuffle the url list - shuffle(urls) - - #read in the subdomain bruteforce list if specificed - subs = [] - if(args.dns_brute != None): - subs = open(args.dns_brute,'r').readlines() - - #Fire up the workers - urlQueue = multiprocessing.Queue() - manager = multiprocessing.Manager() - hostsDict = manager.dict() - workers = [] - hash_basket = {} - - for i in range(args.workers): - p = multiprocessing.Process(target=worker, args=(urlQueue, args.timeout, args.verbose, args.headless, args.autodetect, args.vhosts, subs, hostsDict, args.trygui, args.smartfetch,args.proxy)) - workers.append(p) - p.start() - - for url in urls: - urlQueue.put(url) - - for p in workers: - p.join() - +if __name__ == "__main__": + parser = argparse.ArgumentParser() + + parser.add_argument("-l", "--list", help="List of input URLs") + parser.add_argument("-i", "--input", help="nmap gnmap/xml output file") + parser.add_argument( + "-p", + "--headless", + action="store_true", + default=False, + help="Run in headless mode (using phantomjs)", + ) + parser.add_argument( + "-b", + "--browsertype", + default="Firefox", + help="Choose webdriver {Firefox, Chrome}" + ) + parser.add_argument("-w", + "--workers", + default=1, + type=int, + help="number of threads") 
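    # Illustrative only: combining the flags defined in this block, a run against a
    # hypothetical Nmap -sV grepable-output file might look like
    #   python3 httpscreenshot.py -i scan.gnmap -p -b Chrome -w 4 -t 15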
+ parser.add_argument( + "-t", + "--timeout", + type=int, + default=10, + help="time to wait for pageload before killing the browser", + ) + parser.add_argument( + "-v", + "--verbose", + action="store_true", + default=False, + help="turn on verbose debugging", + ) + parser.add_argument( + "-a", + "--autodetect", + action="store_true", + default=False, + help= + "Automatically detect if listening services are HTTP or HTTPS. Ignores NMAP service detction and URL schemes.", + ) + parser.add_argument( + "-vH", + "--vhosts", + action="store_true", + default=False, + help= + "Attempt to scrape hostnames from SSL certificates and add these to the URL queue", + ) + parser.add_argument( + "-dB", + "--dns_brute", + help= + "Specify a DNS subdomain wordlist for bruteforcing on wildcard SSL certs", + ) + parser.add_argument( + "-uL", + "--uri_list", + help="Specify a list of URIs to fetch in addition to the root", + ) + parser.add_argument( + "-r", + "--retries", + type=int, + default=0, + help="Number of retries if a URL fails or timesout", + ) + parser.add_argument( + "-tG", + "--trygui", + action="store_true", + default=False, + help="Try to fetch the page with FireFox when headless fails", + ) + parser.add_argument( + "-sF", + "--smartfetch", + action="store_true", + default=False, + help= + "Enables smart fetching to reduce network traffic, also increases speed if certain conditions are met.", + ) + parser.add_argument("-pX", + "--proxy", + default=None, + help="SOCKS5 Proxy in host:port format") + + args = parser.parse_args() + + if len(sys.argv) < 2: + parser.print_help() + sys.exit(0) + + # read in the URI list if specificed + uris = [""] + if args.uri_list is not None: + uris = open(args.uri_list, "r").readlines() + uris.append("") + + if args.input is not None: + inFile = open(args.input, "r") + if detectFileType(inFile) == "gnmap": + hosts = parseGnmap(inFile, args.autodetect) + elif detectFileType(inFile) == "xml": + hosts = parsexml(inFile) + else: + print("Invalid input file - must be Nmap GNMAP or Nmap XML") + + urls = [] + + for host in hosts: + for port in hosts[host]: + urls.append(f"http://{host}:{port}") + + elif args.list is not None: + f = open(args.list, "r") + lst = f.readlines() + urls = [] + for url in lst: + urls.append(url.strip()) + else: + print("No input specified") + sys.exit(0) + + # shuffle the url list + shuffle(urls) + + # read in the subdomain bruteforce list if specificed + subs = [] + if args.dns_brute is not None: + subs = open(args.dns_brute, "r").readlines() + + # Fire up the workers + urlQueue = multiprocessing.Queue() + manager = multiprocessing.Manager() + hostsDict = manager.dict() + workers = [] + hash_basket = {} + + for i in range(args.workers): + p = multiprocessing.Process( + target=worker, + args=( + urlQueue, + args.timeout, + args.verbose, + args.headless, + args.autodetect, + args.vhosts, + subs, + hostsDict, + args.trygui, + args.smartfetch, + args.proxy, + args.browsertype + ), + ) + workers.append(p) + p.start() + + for url in urls: + urlQueue.put(url) + + for p in workers: + p.join() diff --git a/install-dependencies.sh b/install-dependencies.sh index c2aa19c..1d504cd 100755 --- a/install-dependencies.sh +++ b/install-dependencies.sh @@ -1,4 +1,4 @@ -# Installation Script - tested on an ubuntu/trusty64 vagrant box +# Installation Script - tested on a fresh install of Ubuntu 20.04.3 LTS as root (sudo) # Show all commands being run #set -x @@ -6,40 +6,12 @@ # Error out if one fails set -e -apt-get install -y swig swig3.0 libssl-dev python-dev 
libjpeg-dev xvfb +# Pull packages from apt +apt install -y python3-pip build-essential libssl-dev swig python3-dev -# Newer version in PyPI -#apt-get install -y python-requests +# Install Google Chrome +wget -O /tmp/google-chrome-stable_current_amd64.deb https://dl.google.com/linux/direct/google-chrome-stable_current_amd64.deb +apt install -y /tmp/google-chrome-stable_current_amd64.deb -# Newer version in PyPI -#apt-get install -y python-m2crypto - -# Installing pillow from PIP for the latest -#apt-get install -y python-pil - -# Install pip and install pytnon requirements through it -apt-get install -y python-pip -pip install -r requirements.txt - -# This binary is distributed with the code base, version is -# more recent then the one in the ubuntu repo (1.9.1 vs 1.9.0) -#apt-get install -y phantomjs - -# Grab the latest of phantomjs it directly from the source -wget https://bitbucket.org/ariya/phantomjs/downloads/phantomjs-2.1.1-linux-x86_64.tar.bz2 - -phantom_md5sum=`md5sum phantomjs-2.1.1-linux-x86_64.tar.bz2 | cut -d' ' -f1` -checksum="1c947d57fce2f21ce0b43fe2ed7cd361" - -if [ "$phantom_md5sum" != "$checksum" ] -then - echo "phantomjs checksum mismatch" - exit 254 -fi - -tar xvf phantomjs-2.1.1-linux-x86_64.tar.bz2 -mv phantomjs-2.1.1-linux-x86_64/bin/phantomjs /usr/bin/phantomjs - -wget https://github.com/mozilla/geckodriver/releases/download/v0.11.1/geckodriver-v0.11.1-linux64.tar.gz -tar xzvf geckodriver-v0.11.1-linux64.tar.gz -mv geckodriver /usr/bin/geckodriver +# Install required python packages +pip3 install -r requirements.txt diff --git a/requirements.txt b/requirements.txt index f3f17e1..4a8609e 100644 --- a/requirements.txt +++ b/requirements.txt @@ -3,6 +3,10 @@ requests selenium beautifulsoup4 pillow -requesocks +PySocks python-libnmap pyvirtualdisplay +reload +webdriver_manager +lxml +urllib3<2.0.0 diff --git a/screenshotClustering/cluster.py b/screenshotClustering/cluster.py index 41e9b95..13a651f 100755 --- a/screenshotClustering/cluster.py +++ b/screenshotClustering/cluster.py @@ -16,11 +16,11 @@ def addAttrToBag(attrName,url,link,wordBags,soup): for tag in soup.findAll('',{attrName:True}): if(isinstance(tag[attrName],str) or isinstance(tag[attrName],unicode)): - tagStr = tag[attrName].encode('utf-8').strip() + tagStr = tag[attrName].encode('ISO-8859-1').strip() elif(isinstance(tag[attrName],list)): - tagStr = tag[attrName][0].encode('utf-8').strip() + tagStr = tag[attrName][0].encode('ISO-8859-1').strip() else: - print '[-] Strange tag type detected - '+str(type(tag[attrName])) + print('[-] Strange tag type detected - '+str(type(tag[attrName]))) tagStr = 'XXXXXXXXX' if(tagStr != ''): @@ -51,7 +51,7 @@ def createWordBags(htmlList): wordBags={} for f in htmlList: - htmlContent = open(f,'r').read() + htmlContent = open(f,'r', encoding='ISO-8859-1').read() wordBags[f]={} soup = BeautifulSoup(htmlContent, 'html.parser') addAttrToBag('name',f,False,wordBags,soup) @@ -77,11 +77,11 @@ def computeScore(wordBag1,wordBag2,debug=0): if(len(wordBag1) == 0 and len(wordBag2) == 0): if debug: - print 'Both have no words - return true' + print('Both have no words - return true') return 1 elif (len(wordBag1) == 0 or len(wordBag2) == 0): if debug: - print 'One has no words - return false' + print('One has no words - return false') return 0 for word in wordBag1.keys(): @@ -90,17 +90,17 @@ def computeScore(wordBag1,wordBag2,debug=0): score = (float(commonWords)/float(wordBag1Length)*(float(commonWords)/float(wordBag2Length))) if debug: - print "Common Words: "+str(commonWords) - print 
"WordBag1 Length: "+str(wordBag1Length) - print "WordBag2 Length: "+str(wordBag2Length) - print score + print("Common Words: "+str(commonWords)) + print("WordBag1 Length: "+str(wordBag1Length)) + print("WordBag2 Length: "+str(wordBag2Length)) + print(score) return score def createClusters(wordBags,threshold): clusterData = {} i = 0 - siteList = wordBags.keys() + siteList = list(wordBags.keys()) for i in range(0,len(siteList)): clusterData[siteList[i]] = [threshold, i] @@ -126,7 +126,7 @@ def getScopeHtml(scopeFile): def getPageTitle(htmlFile): """Simple function to yank page title from html""" - with open(htmlFile, 'r') as f: + with open(htmlFile, 'r', encoding='ISO-8859-1') as f: soup = BeautifulSoup(f, "lxml") try: return soup.title.string.encode('ascii', 'ignore') @@ -151,12 +151,16 @@ def renderClusterHtml(clust,width,height,scopeFile=None): ''' for cluster, siteList in clust.items(): + try: + title = getPageTitle(siteList[0]).decode("ISO-8859-1") + except (UnicodeDecodeError, AttributeError): + title = getPageTitle(siteList[0]) html = html + """ + """ + title + """ @@ -166,9 +170,9 @@ def renderClusterHtml(clust,width,height,scopeFile=None): for site in siteList: screenshotName = quote(site[0:-5], safe='./') if site != siteList[-1]: - html = html + '
'+unquote(unquote(screenshotName[2:]).decode("utf-8")).decode("utf-8")+'
' + html = html + f"
{unquote(unquote(screenshotName[2:]))}
" else: - html = html + '
'+unquote(unquote(screenshotName[2:]).decode("utf-8")).decode("utf-8")+'
- """ + getPageTitle(siteList[0]) + """
' + html = html + f"
{unquote(unquote(screenshotName[2:]))}
" footer = '' @@ -362,7 +366,7 @@ def doCluster(htmlList): clusterData = createClusters(siteWordBags,0.6) clusterDict = {} - for site,data in clusterData.iteritems(): + for site,data in clusterData.items(): if data[1] in clusterDict: clusterDict[data[1]].append(site) else: diff --git a/setup.cfg b/setup.cfg new file mode 100644 index 0000000..2bcd70e --- /dev/null +++ b/setup.cfg @@ -0,0 +1,2 @@ +[flake8] +max-line-length = 88