Commit 3017da5

Added robots.txt compliance and some small fixes.

1 parent f24055b commit 3017da5

1 file changed: PyCrawler.py (29 additions, 7 deletions)
@@ -4,6 +4,7 @@
 import urlparse
 import threading
 import sqlite3 as sqlite
+import robotparser
 # Try to import psyco for JIT compilation
 try:
     import psyco
@@ -16,6 +17,7 @@
 1) database file name
 2) start url
 3) crawl depth
+4) verbose (optional)
 Start out by checking to see if the args are there and
 set them to their variables
 """
@@ -26,7 +28,7 @@
 starturl = sys.argv[2]
 crawldepth = int(sys.argv[3])
 if len(sys.argv) == 5:
-    if (sys.argv[4].uppercase == "TRUE"):
+    if (sys.argv[4].upper() == "TRUE"):
         verbose = True
     else:
         verbose = False
@@ -61,6 +63,10 @@
 # insert starting url into queue

 class threader ( threading.Thread ):
+
+    # Parser for robots.txt that helps determine if we are allowed to fetch a url
+    rp = robotparser.RobotFileParser()
+
     """
     run()
     Args:
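Because rp is defined in the class body rather than inside __init__, every threader instance shares one RobotFileParser object. A minimal sketch of that Python behavior; the Worker class below is purely illustrative and not part of the commit:

import robotparser

class Worker(object):
    # Class attribute: created once and shared by every Worker instance
    rp = robotparser.RobotFileParser()

a = Worker()
b = Worker()
print a.rp is b.rp  # True -- both instances see the same parser object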
@@ -78,7 +84,7 @@ def run(self):
             cursor.execute("DELETE FROM queue WHERE id = (?)", (crawling[0], ))
             connection.commit()
             if verbose:
-                print crawling
+                print crawling[3]
         except KeyError:
             raise StopIteration
         except:
@@ -111,6 +117,20 @@ def crawl(self, crawling):
         curl = crawling[3]
         # Split the link into its sections
         url = urlparse.urlparse(curl)
+
+        try:
+            # Have our robot parser grab the robots.txt file and read it
+            self.rp.set_url('http://' + url[1] + '/robots.txt')
+            self.rp.read()
+
+            # If we're not allowed to open a url, return the function to skip it
+            if not self.rp.can_fetch('PyCrawler', curl):
+                if verbose:
+                    print curl + " not allowed by robots.txt"
+                return
+        except:
+            pass
+
         try:
             # Add the link to the already crawled list
             crawled.append(curl)
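The check added above follows the usual robotparser flow: point the parser at the site's /robots.txt, read it, then ask can_fetch() with the crawler's user-agent string before requesting a page. A minimal standalone sketch of that flow; the helper name and the example URL are illustrative and not part of the commit:

import robotparser
import urlparse

def allowed(agent, page_url):
    parts = urlparse.urlparse(page_url)
    rp = robotparser.RobotFileParser()
    # robots.txt always lives at the root of the host
    rp.set_url(parts[0] + '://' + parts[1] + '/robots.txt')
    rp.read()
    # can_fetch() returns True when the rules permit this agent to fetch the URL
    return rp.can_fetch(agent, page_url)

print allowed('PyCrawler', 'http://example.com/some/page.html')

Note that the bare except: pass in the diff means a missing or unreadable robots.txt never blocks the crawl; the URL is simply fetched as before.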
@@ -122,15 +142,13 @@
             request = urllib2.Request(curl)
             # Add user-agent header to the request
             request.add_header("User-Agent", "PyCrawler")
-            # Build the url opener, open the link and read it into response
+            # Build the url opener, open the link and read it into msg
             opener = urllib2.build_opener()
-            response = opener.open(request).read()
+            msg = opener.open(request).read()

         except:
             # If it doesn't load, skip this url
             return
-        # Read response
-        msg = response.read()

         # Find what's between the title tags
         startPos = msg.find('<title>')
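The two-line fix above removes a double read: opener.open(request).read() already returns the page body as a string, so the old follow-up call response.read() was invoking read() on a plain string. A minimal sketch of the corrected pattern, reading the body exactly once; the URL is only an example:

import urllib2

request = urllib2.Request('http://example.com/')
# Identify the crawler to the server
request.add_header("User-Agent", "PyCrawler")
opener = urllib2.build_opener()
# open() returns a file-like object; read() consumes the body, so call it once
msg = opener.open(request).read()
print len(msg)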
@@ -161,10 +179,14 @@ def queue_links(self, url, links, cid, curdepth):
         if curdepth < crawldepth:
             # Read the links and inser them into the queue
             for link in links:
+                cursor.execute("SELECT url FROM queue WHERE url=?", [link])
+                for row in cursor:
+                    if row[0].decode('utf-8') == url:
+                        continue
                 if link.startswith('/'):
                     link = 'http://' + url[1] + link
                 elif link.startswith('#'):
-                    link = 'http://' + url[1] + url[2] + link
+                    continue
                 elif not link.startswith('http'):
                     link = 'http://' + url[1] + '/' + link
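The new SELECT appears intended to skip links that are already in the queue. A hedged sketch of that general pattern with sqlite3; the simplified schema and the enqueue helper below are illustrative, not the commit's exact logic:

import sqlite3 as sqlite

connection = sqlite.connect(':memory:')
cursor = connection.cursor()
cursor.execute("CREATE TABLE queue (id INTEGER PRIMARY KEY, url TEXT)")

def enqueue(link):
    # Skip the insert when the link is already present in the queue
    cursor.execute("SELECT url FROM queue WHERE url=?", [link])
    if cursor.fetchone() is None:
        cursor.execute("INSERT INTO queue (url) VALUES (?)", [link])
        connection.commit()

enqueue('http://example.com/a.html')
enqueue('http://example.com/a.html')  # second call inserts nothing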
