Skip to content

Commit ad55cd8

Browse files
committed
Work on colors, Added option to rotate sqlite database files
2 parents 4425b4c + dacd344 commit ad55cd8

File tree

6 files changed

+70
-35
lines changed

6 files changed

+70
-35
lines changed

PyCrawler.db.1

220 KB
Binary file not shown.

PyCrawler.py

Lines changed: 44 additions & 13 deletions
Original file line numberDiff line numberDiff line change
@@ -1,7 +1,7 @@
 from query import CrawlerDb
 from content_processor import ContentProcessor
-from settings import VERBOSE, COLOR_ERROR, COLOR_SUCCESS
-import sys, urlparse, urllib2
+from settings import VERBOSE, USE_COLORS, DATABASE_ENGINE, DATABASE_NAME, SQLITE_ROTATE_DATABASE_ON_STARTUP
+import sys, urlparse, urllib2, shutil, glob, robotparser
 import cPrinter
 
 # ===== Init stuff =====
@@ -14,36 +14,58 @@
 processor = ContentProcessor(None, None, None)
 
 # get cprinter
-printer = cPrinter.Printer(COLOR_SUCCESS, COLOR_ERROR)
+printer = cPrinter.Printer(USE_COLORS)
+
+# robot parser init
+robot = robotparser.RobotFileParser()
 
 if len(sys.argv) < 2:
-	printer.p("Error: No start url was passed", printer.error)
+	printer.p("Error: No start url was passed", printer.other)
 	sys.exit()
 
 l = sys.argv[1:]
 
 cdb.enqueue(l)
 
 def crawl():
-	printer.p("starting...", printer.success)
-	queue_empty = False
+	printer.p("starting...", printer.other)
 	while True:
 		url = cdb.dequeue()
+		u = urlparse.urlparse(url)
+		robot.set_url('http://' + u[1] + "/robots.txt")
+		if not robot.can_fetch('PyCrawler', url):
+			printer.p("Url disallowed by robots.txt: %s " % url, printer.other)
+			continue
+		if not url.startswith('http'):
+			printer.p("Unfollowable link found at %s " % url, printer.other)
+			continue
+
 		if cdb.checkCrawled(url):
 			continue
 		if url is False:
-			queue_empty = True
+			break
 		status = 0
 		request = None
 		try:
 			request = urllib2.urlopen(str(url))
 		except urllib2.URLError, e:
 			printer.p(e.reason, printer.error)
+			printer.p("Exception at url: %s" % url, printer.error)
+
+			continue
 		except urllib2.HTTPError, e:
 			status = e.code
 		if status == 0:
 			status = 200
 		data = request.read()
+		processor.setInfo(str(url), status, data)
+		ret = processor.process()
+		if status != 200:
+			continue
+		add_queue = []
+		for q in ret:
+			if not cdb.checkCrawled(q):
+				add_queue.append(q)
 
 		processor.setInfo(str(url), status, data)
 		add_queue = processor.process()
@@ -52,17 +74,26 @@ def crawl():
 		printer.p("Got %s status from %s" % (status, url), printer.success)
 		printer.p("Found %i links" % l, printer.success)
 		if l > 0:
-			if queue_empty == True:
-				queue_empty = False
 			cdb.enqueue(add_queue)
 		cdb.addPage(processor.getDataDict())
 		processor.reset()
-		if queue_empty:
-			break
 
-	printer.p("finishing...", printer.success)
+	printer.p("finishing...", printer.other)
 	cdb.close()
 	printer.p("done! goodbye!", printer.success)
 
 if __name__ == "__main__":
-	crawl()
+	if DATABASE_ENGINE == "sqlite" and SQLITE_ROTATE_DATABASE_ON_STARTUP:
+		dbs = glob.glob("*.db*")
+		index = 1;
+		while ("%s.db.%s" % (DATABASE_NAME, index) in dbs):
+			index += 1
+		shutil.copy2(dbs[len(dbs)-1], "%s.db.%s" % (DATABASE_NAME, index))
+	try:
+		crawl()
+	except KeyboardInterrupt:
+		printer.p("Stopping", printer.error)
+		sys.exit()
+	except Exception, e:
+		printer.p("EXCEPTION: %s " % e, printer.error)

cPrinter.py

Lines changed: 16 additions & 14 deletions
Original file line numberDiff line numberDiff line change
@@ -2,10 +2,13 @@
 
 class Printer():
 
-	def __init__(self, COLOR_SUCCESS, COLOR_ERROR):
+	def __init__(self, USE_COLORS):
 		# Define our types
 		self.success = 0;
 		self.error = 1;
+		self.other = 2;
+
+		self.USE_COLORS = USE_COLORS
 
 		# Initialize environment
 		curses.setupterm()
@@ -16,19 +19,18 @@ def __init__(self, USE_COLORS):
 		#Get the normal attribute
 		self.COLOR_NORMAL = curses.tigetstr('sgr0')
 
-		# Initialize custom colors to the first two slots
-		curses.initscr()
-		curses.start_color()
-		curses.init_color(0, COLOR_SUCCESS[0], COLOR_SUCCESS[1], COLOR_SUCCESS[2])
-		curses.init_color(1, COLOR_ERROR[0], COLOR_ERROR[1], COLOR_ERROR[2])
-		curses.endwin()
-
 		# Get + Save the color sequences
-		self.COLOR_SUCCESS = curses.tparm(self.fcap, 0)
-		self.COLOR_ERROR = curses.tparm(self.fcap, 1)
+		self.COLOR_SUCCESS = curses.tparm(self.fcap, curses.COLOR_GREEN)
+		self.COLOR_ERROR = curses.tparm(self.fcap, curses.COLOR_RED)
+		self.COLOR_OTHER = curses.tparm(self.fcap, curses.COLOR_YELLOW)
 
 	def p(self, text, type):
-		if type == self.success:
-			print "%s%s%s" % (self.COLOR_SUCCESS, text, self.COLOR_NORMAL)
-		elif type == self.error:
-			print "%s%s%s" % (self.COLOR_SUCCESS, text, self.COLOR_NORMAL)
+		if self.USE_COLORS:
+			if type == self.success:
+				print "%s[*] %s%s" % (self.COLOR_SUCCESS, text, self.COLOR_NORMAL)
+			elif type == self.error:
+				print "%s[!] %s%s" % (self.COLOR_ERROR, text, self.COLOR_NORMAL)
+			elif type == self.other:
+				print "%s[.] %s%s" % (self.COLOR_OTHER, text, self.COLOR_NORMAL)
+		else:
+			print text

content_processor.py

Lines changed: 4 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -20,6 +20,7 @@ def rankKeywords(text):
 def stripPunctuation(text):
 	pattern = re.compile(r'[^\w\s]')
 	return pattern.sub('', text)
+
 class ContentProcessor:
 
 	def __init__(self, url, status, text):
@@ -114,4 +115,7 @@ def process(self):
 		return queue
 
 	def getDataDict(self):
+		for k, v in self.keywords.items():
+			if v < 3:
+				del self.keywords[k]
 		return {"address":self.url, "title":self.title, "status":self.status, "size":self.size, "keywords":self.keywords}

query.py

Lines changed: 3 additions & 3 deletions
Original file line numberDiff line numberDiff line change
@@ -56,7 +56,7 @@ def enqueue(self, urls):
 			return False
 		if len(urls) == 0:
 			return True
-		args = [{'address':u} for u in urls]
+		args = [{'address':unicode(u)} for u in urls]
 		result = self.connection.execute(self.queue_table.insert(), args)
 		if result:
 			return True
@@ -81,7 +81,7 @@ def dequeue(self):
 			return False
 
 	def checkCrawled(self, url):
-		s = select([self.crawl_table]).where(self.crawl_table.c.address == url)
+		s = select([self.crawl_table]).where(self.crawl_table.c.address == unicode(url))
 		result = self.connection.execute(s)
 		if len(result.fetchall()) > 0:
 			result.close()
@@ -100,7 +100,7 @@ def addPage(self, data):
 		if not self.connected:
 			return False
 		# Add the page to the crawl table
-		result = self.connection.execute(self.crawl_table.insert().values(address=data['address'], http_status=data['status'], title=data['title'], size=data['size']))
+		result = self.connection.execute(self.crawl_table.insert().values(address=unicode(data['address']), http_status=data['status'], title=unicode(data['title']), size=data['size']))
 		if not result:
 			return False
 		# generate list of argument dictionaries for the insert many statement

settings.py

Lines changed: 3 additions & 5 deletions
Original file line numberDiff line numberDiff line change
@@ -7,10 +7,8 @@
 DATABASE_USER = ""  # Not used with sqlite
 DATABASE_PASS = ""  # Not used with sqlite
 
-VERBOSE = True
+SQLITE_ROTATE_DATABASE_ON_STARTUP = True  # Rotate the database to a new one on startup
 
-# These values are for the text output colors.
-# List values are 0-255 RGB values, respectively.
+VERBOSE = True
 
-COLOR_SUCCESS = [0, 255, 0]  # Success Color (Green)
-COLOR_ERROR = [255, 0, 0]  # Error Color (Red)
+USE_COLORS = True  # Whether or not colors should be used when printing text

0 commit comments

Comments
(0)