Skip to content

Commit 806829d

Browse files
committed
Adding comment.
1 parent 396432a commit 806829d

File tree

3 files changed

+19
-11
lines changed

3 files changed

+19
-11
lines changed

‎ch03/noise_analysis.py‎

Lines changed: 8 additions & 6 deletions
Original file line numberDiff line numberDiff line change
@@ -1,3 +1,4 @@
1+
# coding=utf-8
12
# This code is supporting material for the book
23
# Building Machine Learning Systems with Python
34
# by Willi Richert and Luis Pedro Coelho
@@ -26,17 +27,18 @@ class StemmedTfidfVectorizer(TfidfVectorizer):
2627

2728
def build_analyzer(self):
2829
analyzer = super(TfidfVectorizer, self).build_analyzer()
30+
# 下面代码似乎有问题?应该先Stem,再analyze
2931
return lambda doc: (english_stemmer.stem(w) for w in analyzer(doc))
3032

3133
vectorizer = StemmedTfidfVectorizer(min_df=10, max_df=0.5,
3234
stop_words='english', decode_error='ignore'
3335
)
34-
vectorized = vectorizer.fit_transform(train_data.data)
36+
vectorized = vectorizer.fit_transform(train_data.data)  # TF-IDF 数字
3537

3638
post_group = zip(train_data.data, train_data.target)
3739
# Create a list of tuples that can be sorted by
3840
# the length of the posts
39-
all = [(len(post[0]), post[0], train_data.target_names[post[1]])
41+
all = [(len(post[0]), post[0], train_data.target_names[post[1]])  # post[1] 是数字,转换成名字
4042
for post in post_group]
4143
graphics = sorted([post for post in all if post[2] == 'comp.graphics'])
4244
print(graphics[5])
@@ -46,13 +48,13 @@ def build_analyzer(self):
4648
# \n\n==============================================================================\n',
4749
# 'comp.graphics')
4850

49-
noise_post = graphics[5][1]
51+
noise_post = graphics[5][1]  # get the content -Jim
5052

51-
analyzer = vectorizer.build_analyzer()
53+
analyzer = vectorizer.build_analyzer()  # 第二个analyzer,确保还有stem功能?
5254
print(list(analyzer(noise_post)))
53-
55+
# vectorizer.get_feature_names 返回全部4千多个词
5456
useful = set(analyzer(noise_post)).intersection(vectorizer.get_feature_names())
55-
print(sorted(useful))
57+
print(sorted(useful))  # 去掉了近一半的词
5658
# ['ac', 'birmingham', 'host', 'kingdom', 'nntp', 'sorri', 'test', 'uk', 'unit', 'univers']
5759

5860
for term in sorted(useful):

‎ch03/rel_post_01.py‎

Lines changed: 6 additions & 4 deletions
Original file line numberDiff line numberDiff line change
@@ -1,3 +1,4 @@
1+
# coding=utf-8
12
# This code is supporting material for the book
23
# Building Machine Learning Systems with Python
34
# by Willi Richert and Luis Pedro Coelho
@@ -51,25 +52,26 @@ def build_analyzer(self):
5152
print("#samples: %d, #features: %d" % (num_samples, num_features))
5253

5354
new_post_vec = vectorizer.transform([new_post])
54-
print(new_post_vec, type(new_post_vec))
55+
print(new_post_vec, type(new_post_vec))  # 稀疏表示
5556
print(new_post_vec.toarray())
5657
print(vectorizer.get_feature_names())
5758

58-
59+
# 不能处理重复的句子
5960
def dist_raw(v1, v2):
6061
delta = v1 - v2
6162
return sp.linalg.norm(delta.toarray())
6263

63-
64+
# 处理重复的句子 (什么鬼?两种方法结果一样?)
6465
def dist_norm(v1, v2):
6566
v1_normalized = v1 / sp.linalg.norm(v1.toarray())
6667
v2_normalized = v2 / sp.linalg.norm(v2.toarray())
6768

6869
delta = v1_normalized - v2_normalized
69-
70+
# norm()向量的长度
7071
return sp.linalg.norm(delta.toarray())
7172

7273
dist = dist_norm
74+
# dist = dist_raw (什么鬼?两种方法结果一样?)
7375

7476
best_dist=sys.maxsize
7577
best_i=None

‎ch03/rel_post_20news.py‎

Lines changed: 5 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -1,3 +1,4 @@
1+
# coding=utf-8
12
# This code is supporting material for the book
23
# Building Machine Learning Systems with Python
34
# by Willi Richert and Luis Pedro Coelho
@@ -56,15 +57,18 @@ def build_analyzer(self):
5657
)
5758

5859
vectorized = vectorizer.fit_transform(train_data.data)
60+
# shape 是一个tuple
5961
num_samples, num_features = vectorized.shape
6062
print("#samples: %d, #features: %d" % (num_samples, num_features))
6163
# samples: 3529, #features: 4712
6264

6365
from sklearn.cluster import KMeans
64-
66+
# make it verbose will print some information -jim
6567
km = KMeans(n_clusters=num_clusters, n_init=1, verbose=1, random_state=3)
68+
# 传给模型的是一个二维数组,一共有3529行,每一行代表一个post,每一行的宽度是4712,其中的数字是TF-IDF的得分
6669
clustered = km.fit(vectorized)
6770

71+
# 模型结果的解读...
6872
print("km.labels_=%s" % km.labels_)
6973
# km.labels_=[ 6 34 22 ..., 2 21 26]
7074

0 commit comments

Comments
(0)