Skip to content

Commit 8169ee5

Browse files
author
Ethan Blackburn
committed
Revert "fixed content_processor"
This reverts commit 8ea6f69.
1 parent 8ea6f69 commit 8169ee5

File tree

2 files changed

+8
-21
lines changed

2 files changed

+8
-21
lines changed

‎PyCrawler.py‎

Lines changed: 1 addition & 1 deletion
Original file line number | Diff line number | Diff line change
@@ -92,4 +92,4 @@ def crawl():
9292
exceptException, e:
9393
logger.error("EXCEPTION: %s "%e)
9494
traceback.print_exc()
95-
95+

‎content_processor.py‎

Lines changed: 7 additions & 20 deletions
Original file line number | Diff line number | Diff line change
@@ -1,5 +1,5 @@
11
frommultiprocessingimportPool
2-
importre, sys, logging, string
2+
importre, sys, logging
33

44
fromready_queueimportready_queue
55

@@ -9,21 +9,13 @@ def rankKeywords(text):
99
invalid_keywords= ['', ' ', "i", "a", "an", "and", "the", "for", "be", "to", "or", "too", "also"]
1010
ranks={}
1111
text=text.split(' ')
12-
exclude=set(string.punctuation)
1312
fortintext:
14-
#remove punctuation if attached to word
15-
temp=t
16-
t=''
17-
foriinrange(len(temp)):
18-
if(temp[i] notinexclude):
19-
t+=temp[i]
20-
t=t.strip()
2113
iftininvalid_keywords:
2214
continue
2315
ifnotranks.has_key(t):
2416
ranks[t] =1
2517
else:
26-
ranks[t] +=1
18+
ranks[t] +=1
2719
returnranks
2820

2921
defstripPunctuation(text):
@@ -91,18 +83,13 @@ def processBody(self):
9183
offset=0
9284
i=0
9385
l= []
94-
cont=True
95-
whilecont:
96-
#this divides the text into sets of 500 words
97-
#set j to the index of the last letter of the 500th word
86+
whileTrue:
9887
j=self.findnth(self.text[i:],' ',500)
99-
#if only 500 words or less are left
88+
offset+=j
10089
ifj==-1:
101-
cont=False
102-
#Should append a string that contains 500 words for each loop(except the last loop) to l
103-
#last loop should append a string with 500 words or less to l
104-
l.append(self.text[i:i+j])
105-
i+=j+1
90+
break
91+
l.append(self.text[i:j])
92+
i=offset+j+1
10693
logger.debug("processing with %i threads"%len(l))
10794
try:
10895
iflen(l) ==0:

0 commit comments

Comments
(0)