Adding comment.

dashjim · dashjim · commit 806829d7a1e8 · 2017-07-02T21:14:37.000+08:00
diff --git a/ch03/noise_analysis.py b/ch03/noise_analysis.py
@@ -1,3 +1,4 @@
+# coding=utf-8
 # This code is supporting material for the book
 # Building Machine Learning Systems with Python
 # by Willi Richert and Luis Pedro Coelho
@@ -26,17 +27,18 @@ class StemmedTfidfVectorizer(TfidfVectorizer):
 
 def build_analyzer(self):
 analyzer = super(TfidfVectorizer, self).build_analyzer()
+ # 下面代码似乎有问题？应该先Stem，再analyze
 return lambda doc: (english_stemmer.stem(w) for w in analyzer(doc))
 
 vectorizer = StemmedTfidfVectorizer(min_df=10, max_df=0.5,
 stop_words='english', decode_error='ignore'
 )
-vectorized = vectorizer.fit_transform(train_data.data)
+vectorized = vectorizer.fit_transform(train_data.data) # TF-IDF 数字
 
 post_group = zip(train_data.data, train_data.target)
 # Create a list of tuples that can be sorted by
 # the length of the posts
-all = [(len(post[0]), post[0], train_data.target_names[post[1]])
+all = [(len(post[0]), post[0], train_data.target_names[post[1]]) # post[1] 是数字，转换成名字
 for post in post_group]
 graphics = sorted([post for post in all if post[2] == 'comp.graphics'])
 print(graphics[5])
@@ -46,13 +48,13 @@ def build_analyzer(self):
 # \n\n==============================================================================\n',
 # 'comp.graphics')
 
-noise_post = graphics[5][1]
+noise_post = graphics[5][1] # get the content -Jim
 
-analyzer = vectorizer.build_analyzer()
+analyzer = vectorizer.build_analyzer() # 第二个analyzer，确保还有stem功能？
 print(list(analyzer(noise_post)))
-
+# vectorizer.get_feature_names 返回全部4千多个词
 useful = set(analyzer(noise_post)).intersection(vectorizer.get_feature_names())
-print(sorted(useful))
+print(sorted(useful)) # 去掉了近一半的词
 # ['ac', 'birmingham', 'host', 'kingdom', 'nntp', 'sorri', 'test', 'uk', 'unit', 'univers']
 
 for term in sorted(useful):
diff --git a/ch03/rel_post_01.py b/ch03/rel_post_01.py
@@ -1,3 +1,4 @@
+# coding=utf-8
 # This code is supporting material for the book
 # Building Machine Learning Systems with Python
 # by Willi Richert and Luis Pedro Coelho
@@ -51,25 +52,26 @@ def build_analyzer(self):
 print("#samples: %d, #features: %d" % (num_samples, num_features))
 
 new_post_vec = vectorizer.transform([new_post])
-print(new_post_vec, type(new_post_vec))
+print(new_post_vec, type(new_post_vec)) # 稀疏表示
 print(new_post_vec.toarray())
 print(vectorizer.get_feature_names())
 
-
+# 不能处理重复的句子
 def dist_raw(v1, v2):
 delta = v1 - v2
 return sp.linalg.norm(delta.toarray())
 
-
+# 处理重复的句子 (什么鬼？两种方法结果一样？)
 def dist_norm(v1, v2):
 v1_normalized = v1 / sp.linalg.norm(v1.toarray())
 v2_normalized = v2 / sp.linalg.norm(v2.toarray())
 
 delta = v1_normalized - v2_normalized
-
+ # norm()向量的长度
 return sp.linalg.norm(delta.toarray())
 
 dist = dist_norm
+# dist = dist_raw (什么鬼？两种方法结果一样？)
 
 best_dist = sys.maxsize
 best_i = None
diff --git a/ch03/rel_post_20news.py b/ch03/rel_post_20news.py
@@ -1,3 +1,4 @@
+# coding=utf-8
 # This code is supporting material for the book
 # Building Machine Learning Systems with Python
 # by Willi Richert and Luis Pedro Coelho
@@ -56,15 +57,18 @@ def build_analyzer(self):
 )
 
 vectorized = vectorizer.fit_transform(train_data.data)
+# shape 是一个tuple
 num_samples, num_features = vectorized.shape
 print("#samples: %d, #features: %d" % (num_samples, num_features))
 # samples: 3529, #features: 4712
 
 from sklearn.cluster import KMeans
-
+# setting verbose=1 makes KMeans print some information -jim
 km = KMeans(n_clusters=num_clusters, n_init=1, verbose=1, random_state=3)
+# 传给模型的是一个二维数组，一共有3529行，每一行代表一个post，每一行的宽度是4712，其中的数字是TF-IDF的得分
 clustered = km.fit(vectorized)
 
+# 模型结果的解读...
 print("km.labels_=%s" % km.labels_)
 # km.labels_=[ 6 34 22 ..., 2 21 26]
 

-Original file line number
+Diff line change
@@ @@ -1,3 +1,4 @@ @@
 +# coding=utf-8
 # This code is supporting material for the book
 # Building Machine Learning Systems with Python
 # by Willi Richert and Luis Pedro Coelho
 def build_analyzer(self):
 analyzer = super(TfidfVectorizer, self).build_analyzer()
 +# 下面代码似乎有问题？应该先Stem，再analyze
 return lambda doc: (english_stemmer.stem(w) for w in analyzer(doc))
 vectorizer=StemmedTfidfVectorizer(min_df=10, max_df=0.5,
 stop_words='english', decode_error='ignore'
+ )
 -vectorized=vectorizer.fit_transform(train_data.data)
 +vectorized=vectorizer.fit_transform(train_data.data)# TF-IDF 数字
 post_group=zip(train_data.data, train_data.target)
 # Create a list of tuples that can be sorted by
 # the length of the posts
 -all= [(len(post[0]), post[0], train_data.target_names[post[1]])
 +all= [(len(post[0]), post[0], train_data.target_names[post[1]])# post[1] 是数字，转换成名字
 for post in post_group]
 graphics = sorted([post for post in all if post[2] == 'comp.graphics'])
 print(graphics[5])
 # \n\n==============================================================================\n',
 # 'comp.graphics')
 -noise_post=graphics[5][1]
 +noise_post=graphics[5][1]# get the content -Jim
 -analyzer=vectorizer.build_analyzer()
 +analyzer=vectorizer.build_analyzer()# 第二个analyzer，确保还有stem功能？
 print(list(analyzer(noise_post)))
+-
 +# vectorizer.get_feature_names 返回全部4千多个词
 useful=set(analyzer(noise_post)).intersection(vectorizer.get_feature_names())
 -print(sorted(useful))
 +print(sorted(useful))# 去掉了近一半的词
 # ['ac', 'birmingham', 'host', 'kingdom', 'nntp', 'sorri', 'test', 'uk', 'unit', 'univers']
 for term in sorted(useful):