Skip to content

Commit 396432a

Browse files
committed
more Chinese comments on chapter 2
1 parent 75f4fa6 commit 396432a

File tree

2 files changed

+31
-16
lines changed

2 files changed

+31
-16
lines changed

‎.gitignore‎

Lines changed: 1 addition & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -7,3 +7,4 @@ ch03/data/379/raw/
77
ch03/data/379/
88
ch03/charts/
99
playGround/user-basket/data/raw_user.csv
10+
ch01/charts/

‎ch02/chapter.py‎

Lines changed: 30 additions & 16 deletions
Original file line numberDiff line numberDiff line change
@@ -1,3 +1,4 @@
1+
# coding=utf-8
12
# This code is supporting material for the book
23
# Building Machine Learning Systems with Python
34
# by Willi Richert and Luis Pedro Coelho
@@ -29,11 +30,14 @@
2930
elif t==2:
3031
c='b'
3132
marker='x'
33+
# target数组中=0的是一类,作为features的下标,来选出同类的,再把列选出来。
34+
# feature一共有四种(四列)【0,1,2,3】,一共可以组合出6种图,这里只显示一种
3235
plt.scatter(features[target==t, 0],
33-
features[target==t, 1],
36+
features[target==t, 2],
3437
marker=marker,
3538
c=c)
3639
# We use NumPy fancy indexing to get an array of strings:
40+
# 可以理解为新的数组中的元素是不断的用每一个下标(target)选出来的。
3741
labels=target_names[target]
3842

3943
# The petal length is the feature at position 2
@@ -49,23 +53,26 @@
4953

5054
print('Minimum of others:{0}.'.format(min_non_setosa))
5155

56+
# setosa 是最好区分的,可以直接选出来,后面都是要区分另外两种
5257
# ~ is the boolean negation operator
5358
features=features[~is_setosa]
5459
labels=labels[~is_setosa]
5560
# Build a new target variable, is_virginica
61+
# 这是后面用来检测预测效果用的。
5662
is_virginica= (labels=='virginica')
5763

5864
# Initialize best_acc to impossibly low value
5965
best_acc=-1.0
60-
for fi in range(features.shape[1]):
66+
for fi in range(features.shape[1]):  # features 是一个N * 4的二维数组
6167
# We are going to test all possible thresholds
6268
thresh=features[:,fi]
63-
for t in thresh:
69+
for t in thresh:  # 分别取第一个feature类别(一列数据)中的每一个数据。
6470

6571
# Get the vector for feature `fi`
6672
feature_i=features[:, fi]
6773
# apply threshold `t`
6874
pred= (feature_i>t)
75+
# 预测和真实值一致则为True也就是1,其他为零,然后求平均数
6976
acc= (pred==is_virginica).mean()
7077
rev_acc= (pred==~is_virginica).mean()
7178
if rev_acc>acc:
@@ -81,13 +88,18 @@
8188
best_reverse=reverse
8289

8390
print(best_fi, best_t, best_reverse, best_acc)
84-
85-
def is_virginica_test(fi, t, reverse, example):
86-
'Apply threshold model to a new example'
87-
test=example[fi] >t
88-
if reverse:
89-
test=not test
90-
return test
91+
# result:
92+
# (3, 1.6000000000000001, False, 0.93999999999999995)
93+
94+
# The following code has error and thus disable for now - Jim
95+
# def is_virginica_test(fi, t, reverse, example):
96+
# 'Apply threshold model to a new example'
97+
# test = example[fi] > t
98+
# if reverse:
99+
# test = not test
100+
# return test
101+
102+
# 交叉验证:1 - 极端情况是每次取出来一个作为测试数据,其他的作为训练数据。
91103
from threshold import fit_model, predict
92104

93105
# ning accuracy was 96.0%.
@@ -96,9 +108,9 @@ def is_virginica_test(fi, t, reverse, example):
96108

97109
for ei in range(len(features)):
98110
# select all but the one at position `ei`:
99-
training=np.ones(len(features), bool)
100-
training[ei] =False
101-
testing=~training
111+
training=np.ones(len(features), bool)# 全部为True
112+
training[ei] =False# 一个为false
113+
testing=~training# 全部为false, 一个为True
102114
model=fit_model(features[training], is_virginica[training])
103115
predictions=predict(model, features[testing])
104116
correct+=np.sum(predictions==is_virginica[testing])
@@ -124,17 +136,18 @@ def is_virginica_test(fi, t, reverse, example):
124136
features, labels=load_dataset('seeds')
125137

126138

127-
139+
# 交叉验证:2 - k fold
128140
from sklearn.neighbors import KNeighborsClassifier
129141
classifier=KNeighborsClassifier(n_neighbors=1)
130142
from sklearn.cross_validation import KFold
131143

132144
kf=KFold(len(features), n_folds=5, shuffle=True)
133145
means= []
146+
# kf只是下标,还是要事先准备好训练数据和标记。
134147
for training,testing in kf:
135148
# We learn a model for this fold with `fit` and then apply it to the
136149
# testing data with `predict`:
137-
classifier.fit(features[training], labels[training])
150+
classifier.fit(features[training], labels[training])  # features包含的多列数据
138151
prediction=classifier.predict(features[testing])
139152

140153
# np.mean on an array of booleans returns fraction
@@ -143,7 +156,7 @@ def is_virginica_test(fi, t, reverse, example):
143156
means.append(curmean)
144157
print('Mean accuracy:{:.1%}'.format(np.mean(means)))
145158

146-
159+
# 同样是KNN,对比参数标准化后的效果
147160
from sklearn.pipeline import Pipeline
148161
from sklearn.preprocessing import StandardScaler
149162

@@ -161,4 +174,5 @@ def is_virginica_test(fi, t, reverse, example):
161174
# of correct decisions for this fold:
162175
curmean=np.mean(prediction==labels[testing])
163176
means.append(curmean)
177+
# 尼玛,每次结果都不一样!什么鬼?
164178
print('Mean accuracy:{:.1%}'.format(np.mean(means)))

0 commit comments

Comments
(0)