print "Continuing without psyco JIT compilation!"

"""
- The program should take 3 arguments
+ The program should take arguments
1) database file name
2) start url
3) crawl depth
Start out by checking to see if the args are there and
set them to their variables
"""
- if len(sys.argv) < 4:
+ if len(sys.argv) < 5:
    sys.exit("Not enough arguments!")
else:
    dbname = sys.argv[1]
    starturl = sys.argv[2]
    crawldepth = int(sys.argv[3])

+ # urlparse the start url
+ surlparsed = urlparse.urlparse(starturl)

# Connect to the db and create the tables if they don't already exist
connection = sqlite.connect(dbname)
cursor.execute('CREATE TABLE IF NOT EXISTS status ( s INTEGER, t TEXT )')
connection.commit()

38- """
39- # Check for a start point
40- if len(sys.argv) < 2:
41- print "No starting point! Checking existing queue"
42- cursor.execute("SELECT * FROM queue LIMIT 1")
43- c = cursor.fetchone()
44- if c == None:
45- sys.exit("ERROR: No start point! Exiting")
46- else:
47- try:
48- if sys.argv[1]:
49- cursor.execute("INSERT INTO queue VALUES ( (?) )", (sys.argv[1], ))
50- connection.commit()
51- except:
52- pass
53- """
54-
# Compile keyword and link regex expressions
keywordregex = re.compile('<meta\sname=["\']keywords["\']\scontent=["\'](.*?)["\']\s/>')
linkregex = re.compile('<a\s*href=[\'|"](.*?)[\'"].*?>')
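
Two things worth noting from this hunk: urlparse.urlparse returns a 6-tuple (scheme, netloc, path, params, query, fragment), which is what indexing like url[1] in queue_links below relies on, and the new len(sys.argv) < 5 check implies a fourth argument consumed somewhere outside this excerpt. The link regex captures the raw href value, relative or absolute. A quick illustration (Python 2; the sample URL and HTML are invented):

    import re
    import urlparse

    print urlparse.urlparse('http://example.com/dir/page?q=1')
    # ParseResult(scheme='http', netloc='example.com', path='/dir/page',
    #             params='', query='q=1', fragment='')

    linkregex = re.compile('<a\s*href=[\'|"](.*?)[\'"].*?>')
    print linkregex.findall('<a href="/about">About</a> <a href="http://example.com/x">x</a>')
    # ['/about', 'http://example.com/x']
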
@@ -112,11 +97,12 @@ def crawl(self, crawling):
    # Load the link
    response = urllib2.urlopen(curl)
except:
-     # If it doesn't load, kill the function
+     # If it doesn't load, skip this url
    return
# Read response
msg = response.read()

+ # Find what's between the title tags
startPos = msg.find('<title>')
if startPos != -1:
    endPos = msg.find('</title>', startPos + 7)
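
The title extraction is plain string searching: startPos + 7 skips past the 7 characters of '<title>' itself. Presumably the lines elided from this hunk slice the title out of msg; a minimal sketch of that logic (sample markup invented):

    msg = '<html><head><title>My Page</title></head></html>'
    startPos = msg.find('<title>')
    if startPos != -1:
        # search for the close tag only after the 7-character '<title>'
        endPos = msg.find('</title>', startPos + 7)
        if endPos != -1:
            print msg[startPos + 7:endPos]  # My Page
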
@@ -142,13 +128,14 @@ def crawl(self, crawling):
def queue_links(self, url, links, cid, curdepth):
    if curdepth < crawldepth:
        # Read the links and insert them into the queue
-         for link in (links.pop(0) for _ in xrange(len(links))):
+         for link in links:
            if link.startswith('/'):
                link = 'http://' + url[1] + link
            elif link.startswith('#'):
                link = 'http://' + url[1] + url[2] + link
            elif not link.startswith('http'):
                link = 'http://' + url[1] + '/' + link
+
            if link.decode('utf-8') not in crawled:
                try:
                    cursor.execute("INSERT INTO queue VALUES ( (?), (?), (?), (?) )", (None, cid, curdepth + 1, link))
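
The loop change is worth a note: the old form, for link in (links.pop(0) for _ in xrange(len(links))), yields the same sequence of links but empties the caller's list as a side effect, and each pop(0) shifts every remaining element, making the pass quadratic; for link in links is linear and non-destructive. (url here is evidently a urlparse-style tuple, so url[1] is the host and url[2] the path used to absolutize relative and fragment links.) A small Python 2 demonstration:

    links = ['a', 'b', 'c']
    print [links.pop(0) for _ in xrange(len(links))]  # ['a', 'b', 'c']
    print links  # [] -- the input list was emptied as a side effect

    links = ['a', 'b', 'c']
    print [link for link in links]  # ['a', 'b', 'c']
    print links  # ['a', 'b', 'c'] -- unchanged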