
Commit ad8c39f

Complete redo

1 parent c462210

File tree

8 files changed (+345, -231 lines)

.gitignore

Lines changed: 2 additions & 1 deletion

@@ -1,2 +1,3 @@
-*.pyc
+*.pyc
 .DS_Store
+*.db

PyCrawler.py

Lines changed: 61 additions & 218 deletions
@@ -1,230 +1,73 @@
-#!/usr/bin/python
-import sys
-import re
-import urllib2
-import urlparse
-import threading
-import sqlite3 as sqlite
-import robotparser
-# Try to import psyco for JIT compilation
+from query import CrawlerDb
+from content_processor import ContentProcessor
+from settings import VERBOSE
+import sys, urlparse, urllib2
 
+# ===== Init stuff =====
 
-"""
-The program should take arguments
-1) database file name
-2) start url
-3) crawl depth
-4) domains to limit to, regex (optional)
-5) verbose (optional)
-Start out by checking to see if the args are there and
-set them to their variables
-"""
-if len(sys.argv) < 4:
-	sys.exit("Not enough arguments!")
-else:
-	dbname = sys.argv[1]
-	starturl = sys.argv[2]
-	crawldepth = int(sys.argv[3])
-	if len(sys.argv) >= 5:
-		domains = sys.argv[4]
-		if len(sys.argv) == 6:
-			if (sys.argv[5].upper() == "TRUE"):
-				verbose = True
-			else:
-				verbose = False
-	else:
-		domains = False
-		verbose = False
-# urlparse the start url
-surlparsed = urlparse.urlparse(starturl)
+# db init
+cdb = CrawlerDb()
+cdb.connect()
 
-# Connect to the db and create the tables if they don't already exist
-connection = sqlite.connect(dbname)
-cursor = connection.cursor()
-# crawl_index: holds all the information of the urls that have been crawled
-cursor.execute('CREATE TABLE IF NOT EXISTS crawl_index (crawlid INTEGER, parentid INTEGER, url VARCHAR(256), title VARCHAR(256), keywords VARCHAR(256), status INTEGER )')
-# queue: this should be obvious
-cursor.execute('CREATE TABLE IF NOT EXISTS queue (id INTEGER PRIMARY KEY, parent INTEGER, depth INTEGER, url VARCHAR(256))')
-# status: Contains a record of when crawling was started and stopped.
-# Mostly in place for a future application to watch the crawl interactively.
-cursor.execute('CREATE TABLE IF NOT EXISTS status ( s INTEGER, t TEXT )')
-connection.commit()
+# content processor init
+processor = ContentProcessor(None, None, None)
 
-# Compile keyword and link regex expressions
-keywordregex = re.compile('<meta\sname=["\']keywords["\']\scontent=["\'](.*?)["\']\s/>')
-linkregex = re.compile('<a\s(?:.*?\s)*?href=[\'"](.*?)[\'"].*?>')
-if domains:
-	domainregex = re.compile(domains)
-else:
-	domainregex = False
-crawled = []
+if len(sys.argv) < 2:
+	print "Error: No start url was passed"
+	sys.exit()
 
-# set crawling status and stick starting url into the queue
-cursor.execute("INSERT INTO status VALUES ((?), (?))", (1, "datetime('now')"))
-cursor.execute("INSERT INTO queue VALUES ((?), (?), (?), (?))", (None, 0, 0, starturl))
-connection.commit()
+l = sys.argv[1:]
 
+cdb.enqueue(l)
 
-# insert starting url into queue
+def crawl():
+	print "starting..."
+	queue_empty = False
+	while True:
+		url = cdb.dequeue()
+		print url
+		if cdb.checkCrawled(url):
+			continue
+		if url is False:
+			queue_empty = True
 
-class threader ( threading.Thread ):
-
-	# Parser for robots.txt that helps determine if we are allowed to fetch a url
-	rp = robotparser.RobotFileParser()
-
-	"""
-	run()
-	Args:
-		none
-	the run() method contains the main loop of the program. Each iteration takes the url
-	at the top of the queue and starts the crawl of it.
-	"""
-	def run(self):
-		while 1:
-			try:
-				# Get the first item from the queue
-				cursor.execute("SELECT * FROM queue LIMIT 1")
-				crawling = cursor.fetchone()
-				# Remove the item from the queue
-				cursor.execute("DELETE FROM queue WHERE id = (?)", (crawling[0], ))
-				connection.commit()
-				if verbose:
-					print crawling[3]
-			except KeyError:
-				raise StopIteration
-			except:
-				pass
-
-			# if theres nothing in the que, then set the status to done and exit
-			if crawling == None:
-				cursor.execute("INSERT INTO status VALUES ((?), datetime('now'))", (0,))
-				connection.commit()
-				sys.exit("Done!")
-			# Crawl the link
-			self.crawl(crawling)
-
-	"""
-	crawl()
-	Args:
-		crawling: this should be a url
-
-	crawl() opens the page at the "crawling" url, parses it and puts it into the database.
-	It looks for the page title, keywords, and links.
-	"""
-	def crawl(self, crawling):
-		# crawler id
-		cid = crawling[0]
-		# parent id. 0 if start url
-		pid = crawling[1]
-		# current depth
-		curdepth = crawling[2]
-		# crawling urL
-		curl = crawling[3]
-		if domainregex and not domainregex.search(curl):
-			return
-		# Split the link into its sections
-		url = urlparse.urlparse(curl)
-
+		# Get HTTPConnection
+		#connection = httplib.HTTPConnection(parsed_url.netloc)
+		# Make the request
+		#connection.request("GET", parsed_url.path)
+		# Get response
+		#response = connection.getresponse()
+		#data = response.read()
+		status = 0
+		request = None
 		try:
-			# Have our robot parser grab the robots.txt file and read it
-			self.rp.set_url('http://' + url[1] + '/robots.txt')
-			self.rp.read()
-
-			# If we're not allowed to open a url, return the function to skip it
-			if not self.rp.can_fetch('PyCrawler', curl):
-				if verbose:
-					print curl + " not allowed by robots.txt"
-				return
-		except:
-			pass
-
-		try:
-			# Add the link to the already crawled list
-			crawled.append(curl)
-		except MemoryError:
-			# If the crawled array is too big, deleted it and start over
-			del crawled[:]
-		try:
-			# Create a Request object
-			request = urllib2.Request(curl)
-			# Add user-agent header to the request
-			request.add_header("User-Agent", "PyCrawler")
-			# Build the url opener, open the link and read it into msg
-			opener = urllib2.build_opener()
-			f = opener.open(request)
-			msg = f.read()
-			# put meta data in info
-			info = f.info()
-
-
+			request = urllib2.urlopen(str(url))
 		except urllib2.URLError, e:
-			# If it doesn't load, skip this url
-			#print e.code
-			try:
-				cursor.execute("INSERT INTO crawl_index VALUES( (?), (?), (?), (?), (?), (?) )", (cid, pid, curl, '', '', e.code))
-				connection.commit
-			except:
-				pass
+			print e.reason
+		except urllib2.HTTPError, e:
+			status = e.code
+		if status == 0:
+			status = 200
+		data = request.read()
 
-			return
-
-		# Find what's between the title tags
-		startPos = msg.find('<title>')
-		if startPos != -1:
-			endPos = msg.find('</title>', startPos+7)
-			if endPos != -1:
-				title = msg[startPos+7:endPos]
-
-		# Start keywords list with whats in the keywords meta tag if there is one
-		keywordlist = keywordregex.findall(msg)
-		if len(keywordlist) > 0:
-			keywordlist = keywordlist[0]
-		else:
-			keywordlist = ""
-
-
-
-		# Get the links
-		links = linkregex.findall(msg)
-		# queue up the links
-		self.queue_links(url, links, cid, curdepth)
+		if VERBOSE:
+			print "Got %s status from %s" % (status, url)
+		processor.setInfo(str(url), status, data)
+		add_queue = processor.process()
+		l = len(add_queue)
+		print "Found %i links" % l
+		if l > 0:
+			if queue_empty == True:
+				queue_empty = False
+			cdb.enqueue(add_queue)
+		cdb.addPage(processor.getDataDict())
+		processor.reset()
+		if queue_empty:
+			break
 
-		try:
-			# Put now crawled link into the db
-			cursor.execute("INSERT INTO crawl_index VALUES( (?), (?), (?), (?), (?), (?) )", (cid, pid, curl, title, keywordlist, 200))
-			connection.commit()
-		except:
-			pass
-
-
-	def queue_links(self, url, links, cid, curdepth):
-		if curdepth < crawldepth:
-			# Read the links and inser them into the queue
-			for link in links:
-				cursor.execute("SELECT url FROM queue WHERE url=?", [link])
-				for row in cursor:
-					if row[0].decode('utf-8') == url:
-						continue
-				if link.startswith('/'):
-					link = 'http://' + url[1] + link
-				elif link.startswith('#'):
-					continue
-				elif not link.startswith('http'):
-					link = urlparse.urljoin(url.geturl(),link)
-
-				if link.decode('utf-8') not in crawled:
-					try:
-						cursor.execute("INSERT INTO queue VALUES ( (?), (?), (?), (?) )", (None, cid, curdepth+1, link))
-						connection.commit()
-					except:
-						continue
-				else:
-					pass
-if __name__ == '__main__':
-	try:
-		import psyco
-		psyco.full()
-	except ImportError:
-		print "Continuing without psyco JIT compilation!"
-	# Run main loop
-	threader().run()
+	print "finishing..."
+	cdb.close()
+	print "done! goodbye!"
+
+if __name__ == "__main__":
+	crawl()
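
Note: the rewritten script depends on three modules that are not shown in this excerpt: query.CrawlerDb, content_processor.ContentProcessor, and the VERBOSE flag in settings. The sketch below is only an inference from the calls the new PyCrawler.py makes (connect, enqueue, dequeue, checkCrawled, addPage, close, setInfo, process, getDataDict, reset); it is a hypothetical in-memory stand-in for illustration, not the actual modules added in this commit.

# Hypothetical stand-ins for query.CrawlerDb and content_processor.ContentProcessor.
# They only mirror the methods the new PyCrawler.py calls; the real modules in this
# commit (presumably backed by the *.db file now listed in .gitignore) will differ.
import re

class CrawlerDb(object):
    def __init__(self):
        self.queue, self.crawled = [], set()
    def connect(self):
        pass  # the real class presumably opens the database here
    def enqueue(self, urls):
        # accepts a list of urls, as in cdb.enqueue(sys.argv[1:])
        self.queue.extend(u for u in urls if u not in self.crawled)
    def dequeue(self):
        # the caller expects False (not None) when the queue is empty
        return self.queue.pop(0) if self.queue else False
    def checkCrawled(self, url):
        return url in self.crawled
    def addPage(self, data):
        # data is the dict returned by ContentProcessor.getDataDict()
        self.crawled.add(data.get('url'))
    def close(self):
        pass

class ContentProcessor(object):
    def __init__(self, url, status, text):
        self.setInfo(url, status, text)
    def setInfo(self, url, status, text):
        self.url, self.status, self.text = url, status, text
    def process(self):
        # return the links to enqueue; a real processor would parse self.text properly
        return re.findall(r'href=[\'"](http[^\'"]+)[\'"]', self.text or '')
    def getDataDict(self):
        return {'url': self.url, 'status': self.status}
    def reset(self):
        self.setInfo(None, None, None)

Since every argument after the script name is enqueued as a seed URL (l = sys.argv[1:]), the new entry point would be started with something like: python PyCrawler.py http://example.com (example.com is a placeholder).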

README

Lines changed: 0 additions & 12 deletions
This file was deleted.

__init__.py

Whitespace-only changes.
