@@ -4,6 +4,7 @@
 import urlparse
 import threading
 import sqlite3 as sqlite
+import robotparser
 # Try to import psyco for JIT compilation
 try:
     import psyco
@@ -16,6 +17,7 @@
 1) database file name
 2) start url
 3) crawl depth
+4) verbose (optional)
 Start out by checking to see if the args are there and
 setting them to their variables
 """
@@ -26,7 +28,7 @@
 starturl = sys.argv[2]
 crawldepth = int(sys.argv[3])
 if len(sys.argv) == 5:
-    if (sys.argv[4].uppercase == "TRUE"):
+    if (sys.argv[4].upper() == "TRUE"):
         verbose = True
     else:
         verbose = False
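
For reference, a typical invocation matching the argument list above would be: python PyCrawler.py crawler.db http://example.com 3 true (the script and database names here are illustrative); the verbose flag is case-insensitive thanks to the .upper() fix.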
@@ -61,6 +63,10 @@
 # insert starting url into queue
 
 class threader(threading.Thread):
+
+    # Parser for robots.txt that helps determine if we are allowed to fetch a url
+    rp = robotparser.RobotFileParser()
+
     """
     run()
     Args:
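
Since rp is assigned at class level, every threader instance shares a single RobotFileParser. A minimal standalone sketch of the stdlib calls the new code relies on (the example.com urls are illustrative):

    import robotparser

    rp = robotparser.RobotFileParser()
    rp.set_url('http://example.com/robots.txt')
    rp.read()
    # can_fetch answers whether the named user-agent may request the url
    print rp.can_fetch('PyCrawler', 'http://example.com/some/page.html')
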
@@ -78,7 +84,7 @@ def run(self):
                 cursor.execute("DELETE FROM queue WHERE id = (?)", (crawling[0], ))
                 connection.commit()
                 if verbose:
-                    print crawling
+                    print crawling[3]
             except KeyError:
                 raise StopIteration
             except:
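
crawling is the whole row fetched from the queue table, and index 3 holds the url (curl = crawling[3] in the next hunk), so verbose mode now prints just the url rather than the full row tuple.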
@@ -111,6 +117,20 @@ def crawl(self, crawling):
         curl = crawling[3]
         # Split the link into its sections
         url = urlparse.urlparse(curl)
+
+        try:
+            # Have our robot parser grab the robots.txt file and read it
+            self.rp.set_url('http://' + url[1] + '/robots.txt')
+            self.rp.read()
+
+            # If we're not allowed to open a url, return so we skip it
+            if not self.rp.can_fetch('PyCrawler', curl):
+                if verbose:
+                    print curl + " not allowed by robots.txt"
+                return
+        except:
+            pass
+
         try:
             # Add the link to the already crawled list
             crawled.append(curl)
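
Note that the block above re-fetches robots.txt for every url, and the bare except silently allows the crawl when the file cannot be read. A hedged sketch of caching one parser per host; the robots_cache dict and allowed() helper are hypothetical, not part of this commit:

    # Hypothetical helper, not in this commit: cache one parser per host
    robots_cache = {}

    def allowed(netloc, url):
        if netloc not in robots_cache:
            rp = robotparser.RobotFileParser()
            rp.set_url('http://' + netloc + '/robots.txt')
            try:
                rp.read()
            except:
                # An unread parser permits everything, matching the code above
                pass
            robots_cache[netloc] = rp
        return robots_cache[netloc].can_fetch('PyCrawler', url)
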
@@ -122,15 +142,13 @@ def crawl(self, crawling):
             request = urllib2.Request(curl)
             # Add user-agent header to the request
             request.add_header("User-Agent", "PyCrawler")
-            # Build the url opener, open the link and read it into response
+            # Build the url opener, open the link and read it into msg
             opener = urllib2.build_opener()
-            response = opener.open(request).read()
+            msg = opener.open(request).read()
 
         except:
             # If it doesn't load, skip this url
             return
-        # Read response
-        msg = response.read()
 
         # Find what's between the title tags
         startPos = msg.find('<title>')
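
startPos only marks the opening tag; a minimal sketch of how the title can be sliced out from there (endPos and title are assumptions, since the hunk cuts off the surrounding code):

    startPos = msg.find('<title>')
    if startPos != -1:
        # Skip the 7 characters of '<title>' itself, stop at the closing tag
        endPos = msg.find('</title>', startPos + 7)
        if endPos != -1:
            title = msg[startPos + 7:endPos]
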
@@ -161,10 +179,14 @@ def queue_links(self, url, links, cid, curdepth):
         if curdepth < crawldepth:
             # Read the links and insert them into the queue
             for link in links:
+                # Skip any link that is already waiting in the queue
+                cursor.execute("SELECT url FROM queue WHERE url=?", [link])
+                if cursor.fetchone() is not None:
+                    continue
                 if link.startswith('/'):
                     link = 'http://' + url[1] + link
                 elif link.startswith('#'):
-                    link = 'http://' + url[1] + url[2] + link
+                    continue
                 elif not link.startswith('http'):
                     link = 'http://' + url[1] + '/' + link
 
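
The branches above make root-relative links absolute, skip in-page fragment links, and prefix everything else with the host. As a point of comparison, urlparse.urljoin (urlparse is already imported at the top of the file) resolves the same cases against a base url:

    base = 'http://example.com/a/b.html'
    print urlparse.urljoin(base, '/c.html')            # http://example.com/c.html
    print urlparse.urljoin(base, 'c.html')             # http://example.com/a/c.html
    print urlparse.urljoin(base, 'http://other.com/')  # http://other.com/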