Skip to content

Commit 62d2dbc

Browse files
committed
Merge pull request theanti9#12 from EthanBlackburn/master
fixed content processor
2 parents 2d186f4 + 3d0dcc3 commit 62d2dbc

File tree

1 file changed

+21
-8
lines changed

1 file changed

+21
-8
lines changed

‎content_processor.py‎

Lines changed: 21 additions & 8 deletions
Original file line numberDiff line numberDiff line change
@@ -1,5 +1,5 @@
11
frommultiprocessingimportPool
2-
importre, sys, logging
2+
importre, sys, logging, string
33

44
fromready_queueimportready_queue
55

@@ -9,13 +9,21 @@ def rankKeywords(text):
99
invalid_keywords= ['', ' ', "i", "a", "an", "and", "the", "for", "be", "to", "or", "too", "also"]
1010
ranks={}
1111
text=text.split(' ')
12+
exclude=set(string.punctuation)
1213
fortintext:
14+
#remove punctuation if attached to word
15+
temp=t
16+
t=''
17+
foriinrange(len(temp)):
18+
if(temp[i] notinexclude):
19+
t+=temp[i]
20+
t=t.strip()
1321
iftininvalid_keywords:
1422
continue
1523
ifnotranks.has_key(t):
1624
ranks[t] =1
1725
else:
18-
ranks[t] +=1
26+
ranks[t] +=1
1927
returnranks
2028

2129
defstripPunctuation(text):
@@ -83,13 +91,18 @@ def processBody(self):
8391
offset=0
8492
i=0
8593
l= []
86-
whileTrue:
94+
cont=True
95+
whilecont:
96+
#this divides the text into sets of 500 words
97+
#set j to the index of the last letter of the 500th word
8798
j=self.findnth(self.text[i:],' ',500)
88-
offset+=j
99+
#if only 500 words or less are left
89100
ifj==-1:
90-
break
91-
l.append(self.text[i:j])
92-
i=offset+j+1
101+
cont=False
102+
#Should append a string that contains 500 words for each loop(except the last loop) to l
103+
#last loop should append a string with 500 words or less to l
104+
l.append(self.text[i:i+j])
105+
i+=j+1
93106
logger.debug("processing with %i threads"%len(l))
94107
try:
95108
iflen(l) ==0:
@@ -136,4 +149,4 @@ def getDataDict(self):
136149
fork,vinself.keywords.items():
137150
ifv<3:
138151
delself.keywords[k]
139-
return{"address":self.url, "title":self.title, "status":self.status, "size":self.size, "keywords":self.keywords}
152+
return{"address":self.url, "title":self.title, "status":self.status, "size":self.size, "keywords":self.keywords}

0 commit comments

Comments
(0)