
Commit ad8c39f

Complete redo

1 parent c462210

File tree

8 files changed (+345, -231 lines)

.gitignore

Lines changed: 2 additions & 1 deletion

@@ -1,2 +1,3 @@
-*.pyc
+*.pyc
 .DS_Store
+*.db

PyCrawler.py

Lines changed: 61 additions & 218 deletions
@@ -1,230 +1,73 @@
-#!/usr/bin/python
-import sys
-import re
-import urllib2
-import urlparse
-import threading
-import sqlite3 as sqlite
-import robotparser
-# Try to import psyco for JIT compilation
+from query import CrawlerDb
+from content_processor import ContentProcessor
+from settings import VERBOSE
+import sys, urlparse, urllib2
 
+# ===== Init stuff =====
 
-"""
-The program should take arguments
-1) database file name
-2) start url
-3) crawl depth
-4) domains to limit to, regex (optional)
-5) verbose (optional)
-Start out by checking to see if the args are there and
-set them to their variables
-"""
-if len(sys.argv) < 4:
-	sys.exit("Not enough arguments!")
-else:
-	dbname = sys.argv[1]
-	starturl = sys.argv[2]
-	crawldepth = int(sys.argv[3])
-	if len(sys.argv) >= 5:
-		domains = sys.argv[4]
-		if len(sys.argv) == 6:
-			if (sys.argv[5].upper() == "TRUE"):
-				verbose = True
-			else:
-				verbose = False
-	else:
-		domains = False
-		verbose = False
-# urlparse the start url
-surlparsed = urlparse.urlparse(starturl)
+# db init
+cdb = CrawlerDb()
+cdb.connect()
 
-# Connect to the db and create the tables if they don't already exist
-connection = sqlite.connect(dbname)
-cursor = connection.cursor()
-# crawl_index: holds all the information of the urls that have been crawled
-cursor.execute('CREATE TABLE IF NOT EXISTS crawl_index (crawlid INTEGER, parentid INTEGER, url VARCHAR(256), title VARCHAR(256), keywords VARCHAR(256), status INTEGER )')
-# queue: this should be obvious
-cursor.execute('CREATE TABLE IF NOT EXISTS queue (id INTEGER PRIMARY KEY, parent INTEGER, depth INTEGER, url VARCHAR(256))')
-# status: Contains a record of when crawling was started and stopped.
-# Mostly in place for a future application to watch the crawl interactively.
-cursor.execute('CREATE TABLE IF NOT EXISTS status ( s INTEGER, t TEXT )')
-connection.commit()
+# content processor init
+processor = ContentProcessor(None, None, None)
 
-# Compile keyword and link regex expressions
-keywordregex = re.compile('<meta\sname=["\']keywords["\']\scontent=["\'](.*?)["\']\s/>')
-linkregex = re.compile('<a\s(?:.*?\s)*?href=[\'"](.*?)[\'"].*?>')
-if domains:
-	domainregex = re.compile(domains)
-else:
-	domainregex = False
-crawled = []
+if len(sys.argv) < 2:
+	print "Error: No start url was passed"
+	sys.exit()
 
-# set crawling status and stick starting url into the queue
-cursor.execute("INSERT INTO status VALUES ((?), (?))", (1, "datetime('now')"))
-cursor.execute("INSERT INTO queue VALUES ((?), (?), (?), (?))", (None, 0, 0, starturl))
-connection.commit()
+l = sys.argv[1:]
 
+cdb.enqueue(l)
 
-# insert starting url into queue
+def crawl():
+	print "starting..."
+	queue_empty = False
+	while True:
+		url = cdb.dequeue()
+		print url
+		if cdb.checkCrawled(url):
+			continue
+		if url is False:
+			queue_empty = True
 
-class threader ( threading.Thread ):
-
-	# Parser for robots.txt that helps determine if we are allowed to fetch a url
-	rp = robotparser.RobotFileParser()
-
-	"""
-	run()
-	Args:
-		none
-	the run() method contains the main loop of the program. Each iteration takes the url
-	at the top of the queue and starts the crawl of it.
-	"""
-	def run(self):
-		while 1:
-			try:
-				# Get the first item from the queue
-				cursor.execute("SELECT * FROM queue LIMIT 1")
-				crawling = cursor.fetchone()
-				# Remove the item from the queue
-				cursor.execute("DELETE FROM queue WHERE id = (?)", (crawling[0], ))
-				connection.commit()
-				if verbose:
-					print crawling[3]
-			except KeyError:
-				raise StopIteration
-			except:
-				pass
-
-			# if theres nothing in the que, then set the status to done and exit
-			if crawling == None:
-				cursor.execute("INSERT INTO status VALUES ((?), datetime('now'))", (0,))
-				connection.commit()
-				sys.exit("Done!")
-			# Crawl the link
-			self.crawl(crawling)
-
-	"""
-	crawl()
-	Args:
-		crawling: this should be a url
-
-	crawl() opens the page at the "crawling" url, parses it and puts it into the database.
-	It looks for the page title, keywords, and links.
-	"""
-	def crawl(self, crawling):
-		# crawler id
-		cid = crawling[0]
-		# parent id. 0 if start url
-		pid = crawling[1]
-		# current depth
-		curdepth = crawling[2]
-		# crawling urL
-		curl = crawling[3]
-		if domainregex and not domainregex.search(curl):
-			return
-		# Split the link into its sections
-		url = urlparse.urlparse(curl)
-
+		# Get HTTPConnection
+		#connection = httplib.HTTPConnection(parsed_url.netloc)
+		# Make the request
+		#connection.request("GET", parsed_url.path)
+		# Get response
+		#response = connection.getresponse()
+		#data = response.read()
+		status = 0
+		request = None
 		try:
-			# Have our robot parser grab the robots.txt file and read it
-			self.rp.set_url('http://' + url[1] + '/robots.txt')
-			self.rp.read()
-
-			# If we're not allowed to open a url, return the function to skip it
-			if not self.rp.can_fetch('PyCrawler', curl):
-				if verbose:
-					print curl + " not allowed by robots.txt"
-				return
-		except:
-			pass
-
-		try:
-			# Add the link to the already crawled list
-			crawled.append(curl)
-		except MemoryError:
-			# If the crawled array is too big, deleted it and start over
-			del crawled[:]
-		try:
-			# Create a Request object
-			request = urllib2.Request(curl)
-			# Add user-agent header to the request
-			request.add_header("User-Agent", "PyCrawler")
-			# Build the url opener, open the link and read it into msg
-			opener = urllib2.build_opener()
-			f = opener.open(request)
-			msg = f.read()
-			# put meta data in info
-			info = f.info()
-
-
+			request = urllib2.urlopen(str(url))
 		except urllib2.URLError, e:
-			# If it doesn't load, skip this url
-			#print e.code
-			try:
-				cursor.execute("INSERT INTO crawl_index VALUES( (?), (?), (?), (?), (?), (?) )", (cid, pid, curl, '', '', e.code))
-				connection.commit
-			except:
-				pass
+			print e.reason
+		except urllib2.HTTPError, e:
+			status = e.code
+		if status == 0:
+			status = 200
+		data = request.read()
 
-			return
-
-		# Find what's between the title tags
-		startPos = msg.find('<title>')
-		if startPos != -1:
-			endPos = msg.find('</title>', startPos+7)
-			if endPos != -1:
-				title = msg[startPos+7:endPos]
-
-		# Start keywords list with whats in the keywords meta tag if there is one
-		keywordlist = keywordregex.findall(msg)
-		if len(keywordlist) > 0:
-			keywordlist = keywordlist[0]
-		else:
-			keywordlist = ""
-
-
-
-		# Get the links
-		links = linkregex.findall(msg)
-		# queue up the links
-		self.queue_links(url, links, cid, curdepth)
+		if VERBOSE:
+			print "Got %s status from %s" % (status, url)
+		processor.setInfo(str(url), status, data)
+		add_queue = processor.process()
+		l = len(add_queue)
+		print "Found %i links" % l
+		if l > 0:
+			if queue_empty == True:
+				queue_empty = False
+			cdb.enqueue(add_queue)
+		cdb.addPage(processor.getDataDict())
+		processor.reset()
+		if queue_empty:
+			break
 
-		try:
-			# Put now crawled link into the db
-			cursor.execute("INSERT INTO crawl_index VALUES( (?), (?), (?), (?), (?), (?) )", (cid, pid, curl, title, keywordlist, 200))
-			connection.commit()
-		except:
-			pass
-
-
-	def queue_links(self, url, links, cid, curdepth):
-		if curdepth < crawldepth:
-			# Read the links and inser them into the queue
-			for link in links:
-				cursor.execute("SELECT url FROM queue WHERE url=?", [link])
-				for row in cursor:
-					if row[0].decode('utf-8') == url:
-						continue
-				if link.startswith('/'):
-					link = 'http://' + url[1] + link
-				elif link.startswith('#'):
-					continue
-				elif not link.startswith('http'):
-					link = urlparse.urljoin(url.geturl(),link)
-
-				if link.decode('utf-8') not in crawled:
-					try:
-						cursor.execute("INSERT INTO queue VALUES ( (?), (?), (?), (?) )", (None, cid, curdepth+1, link))
-						connection.commit()
-					except:
-						continue
-				else:
-					pass
-if __name__ == '__main__':
-	try:
-		import psyco
-		psyco.full()
-	except ImportError:
-		print "Continuing without psyco JIT compilation!"
-	# Run main loop
-	threader().run()
+	print "finishing..."
+	cdb.close()
+	print "done! goodbye!"
+
+if __name__ == "__main__":
+	crawl()
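
Note: the rewritten script depends on three modules that are not shown in this excerpt: query.CrawlerDb, content_processor.ContentProcessor, and the VERBOSE flag in settings. The sketch below is only an inference from the calls the new PyCrawler.py makes (connect, enqueue, dequeue, checkCrawled, addPage, close, setInfo, process, getDataDict, reset); it is a hypothetical in-memory stand-in for illustration, not the actual modules added in this commit.

# Hypothetical stand-ins for query.CrawlerDb and content_processor.ContentProcessor.
# They only mirror the methods the new PyCrawler.py calls; the real modules in this
# commit (presumably backed by the *.db file now listed in .gitignore) will differ.
import re

class CrawlerDb(object):
    def __init__(self):
        self.queue, self.crawled = [], set()
    def connect(self):
        pass  # the real class presumably opens the database here
    def enqueue(self, urls):
        # accepts a list of urls, as in cdb.enqueue(sys.argv[1:])
        self.queue.extend(u for u in urls if u not in self.crawled)
    def dequeue(self):
        # the caller expects False (not None) when the queue is empty
        return self.queue.pop(0) if self.queue else False
    def checkCrawled(self, url):
        return url in self.crawled
    def addPage(self, data):
        # data is the dict returned by ContentProcessor.getDataDict()
        self.crawled.add(data.get('url'))
    def close(self):
        pass

class ContentProcessor(object):
    def __init__(self, url, status, text):
        self.setInfo(url, status, text)
    def setInfo(self, url, status, text):
        self.url, self.status, self.text = url, status, text
    def process(self):
        # return the links to enqueue; a real processor would parse self.text properly
        return re.findall(r'href=[\'"](http[^\'"]+)[\'"]', self.text or '')
    def getDataDict(self):
        return {'url': self.url, 'status': self.status}
    def reset(self):
        self.setInfo(None, None, None)

Since every argument after the script name is enqueued as a seed URL (l = sys.argv[1:]), the new entry point would be started with something like: python PyCrawler.py http://example.com (example.com is a placeholder).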

README

Lines changed: 0 additions & 12 deletions
This file was deleted.

__init__.py

Whitespace-only changes.
