11from multiprocessing import Pool
2- import re , sys , logging
2+ import re , sys , logging , string
33
44from ready_queue import ready_queue
55
@@ -9,13 +9,21 @@ def rankKeywords(text):
99invalid_keywords = ['' , ' ' , "i" , "a" , "an" , "and" , "the" , "for" , "be" , "to" , "or" , "too" , "also" ]
1010ranks = {}
1111text = text .split (' ' )
12+ exclude = set (string .punctuation )
1213for t in text :
14+ #remove punctuation if attached to word
15+ temp = t
16+ t = ''
17+ for i in range (len (temp )):
18+ if (temp [i ] not in exclude ):
19+ t += temp [i ]
20+ t = t .strip ()
1321if t in invalid_keywords :
1422continue
1523if not ranks .has_key (t ):
1624ranks [t ] = 1
1725else :
18- ranks [t ] += 1
26+ ranks [t ] += 1
1927return ranks
2028
2129def stripPunctuation (text ):
@@ -83,13 +91,18 @@ def processBody(self):
8391offset = 0
8492i = 0
8593l = []
86- while True :
94+ cont = True
95+ while cont :
96+ #this divides the text into sets of 500 words
97+ #set j to the index of the last letter of the 500th word
8798j = self .findnth (self .text [i :],' ' ,500 )
88- offset += j
99+ #if 500 words or fewer are left
89100if j == - 1 :
90- break
91- l .append (self .text [i :j ])
92- i = offset + j + 1
101+ cont = False
102+ #each iteration except the last should append a string of 500 words to l
103+ #the last iteration should append a string of 500 words or fewer to l
104+ l .append (self .text [i :i + j ])
105+ i += j + 1
93106logger .debug ("processing with %i threads" % len (l ))
94107try :
95108if len (l ) == 0 :
@@ -136,4 +149,4 @@ def getDataDict(self):
136149for k ,v in self .keywords .items ():
137150if v < 3 :
138151del self .keywords [k ]
139- return {"address" :self .url , "title" :self .title , "status" :self .status , "size" :self .size , "keywords" :self .keywords }
152+ return {"address" :self .url , "title" :self .title , "status" :self .status , "size" :self .size , "keywords" :self .keywords }
0 commit comments