1+ # coding=utf-8
12# This code is supporting material for the book
23# Building Machine Learning Systems with Python
34# by Willi Richert and Luis Pedro Coelho
@@ -26,17 +27,18 @@ class StemmedTfidfVectorizer(TfidfVectorizer):
2627
2728def build_analyzer (self ):
2829analyzer = super (TfidfVectorizer , self ).build_analyzer ()
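30+ # The code below looks questionable: should stemming happen before analysis? (As written, the analyzer runs first and each token it yields is then stemmed.)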
2931return lambda doc : (english_stemmer .stem (w ) for w in analyzer (doc ))
3032
3133vectorizer = StemmedTfidfVectorizer (min_df = 10 , max_df = 0.5 ,
3234stop_words = 'english' , decode_error = 'ignore'
3335 )
34- vectorized = vectorizer .fit_transform (train_data .data )
36+ vectorized = vectorizer .fit_transform (train_data .data )# TF-IDF values
3537
3638post_group = zip (train_data .data , train_data .target )
3739# Create a list of tuples that can be sorted by
3840# the length of the posts
39- all = [(len (post [0 ]), post [0 ], train_data .target_names [post [1 ]])
41+ all = [(len (post [0 ]), post [0 ], train_data .target_names [post [1 ]])# post[1] is a number; convert it to the target name
4042for post in post_group ]
4143graphics = sorted ([post for post in all if post [2 ] == 'comp.graphics' ])
4244print (graphics [5 ])
@@ -46,13 +48,13 @@ def build_analyzer(self):
4648# \n\n==============================================================================\n',
4749# 'comp.graphics')
4850
49- noise_post = graphics [5 ][1 ]
51+ noise_post = graphics [5 ][1 ]# get the content -Jim
5052
51- analyzer = vectorizer .build_analyzer ()
53+ analyzer = vectorizer .build_analyzer ()# a second analyzer; does it still include the stemming step?
5254print (list (analyzer (noise_post )))
53-
55+ # vectorizer.get_feature_names returns all of the 4,000+ terms
5456useful = set (analyzer (noise_post )).intersection (vectorizer .get_feature_names ())
55- print (sorted (useful ))
57+ print (sorted (useful ))# nearly half of the words were removed
5658# ['ac', 'birmingham', 'host', 'kingdom', 'nntp', 'sorri', 'test', 'uk', 'unit', 'univers']
5759
5860for term in sorted (useful ):
0 commit comments