1+ # coding=utf-8
12# This code is supporting material for the book
23# Building Machine Learning Systems with Python
34# by Willi Richert and Luis Pedro Coelho
2930elif t == 2 :
3031c = 'b'
3132marker = 'x'
33+ # Rows where target == t pick out the samples of one class; the second
33+ # index picks which feature column to plot.
34+ # There are four features (columns 0-3), so six scatter pairings are possible; only one is shown here.
3235plt .scatter (features [target == t , 0 ],
33- features [target == t , 1 ],
36+ features [target == t , 2 ],
3437marker = marker ,
3538c = c )
3639# We use NumPy fancy indexing to get an array of strings:
40+ # Each element of the result is target_names indexed by the corresponding entry of `target`.
3741labels = target_names [target ]
3842
3943# The petal length is the feature at position 2
4953
5054print ('Minimum of others:{0}.' .format (min_non_setosa ))
5155
56+ # Setosa is the easiest species to separate and was handled above;
56+ # from here on we only need to distinguish the remaining two species.
5257# ~ is the boolean negation operator
5358features = features [~ is_setosa ]
5459labels = labels [~ is_setosa ]
5560# Build a new target variable, is_virginica
61+ # Used later as the ground truth to evaluate the threshold predictions.
5662is_virginica = (labels == 'virginica' )
5763
# Initialize best_acc to an impossibly low value so the first real
# accuracy always replaces it.
best_acc = -1.0

# Exhaustive search for the best single-feature threshold classifier.
# `features` is an N x 4 array; each column is one measurement.
for fi in range(features.shape[1]):
    # Every observed value of feature `fi` is a candidate threshold.
    thresh = features[:, fi]
    for t in thresh:
        # Predict virginica when the feature exceeds threshold `t`.
        # (`thresh` already holds column `fi`; no need to re-slice it.)
        pred = (thresh > t)
        # True == 1 and False == 0, so the mean of the element-wise
        # comparison is the fraction of correct predictions (accuracy).
        acc = (pred == is_virginica).mean()
        # Also score the reversed rule (feature <= t means virginica).
        rev_acc = (pred == ~is_virginica).mean()
        if rev_acc > acc:
            reverse = True
            acc = rev_acc
        else:
            reverse = False
        # Remember the best (feature, threshold, direction) seen so far.
        if acc > best_acc:
            best_acc = acc
            best_fi = fi
            best_t = t
            best_reverse = reverse
# result (on the iris data): (3, 1.6, False, 0.94)
print(best_fi, best_t, best_reverse, best_acc)
84-
85- def is_virginica_test (fi , t , reverse , example ):
86- 'Apply threshold model to a new example'
87- test = example [fi ] > t
88- if reverse :
89- test = not test
90- return test
91+ # result:
92+ # (3, 1.6000000000000001, False, 0.93999999999999995)
93+
94+ # The following code has an error and is therefore disabled for now - Jim
95+ # def is_virginica_test(fi, t, reverse, example):
96+ # 'Apply threshold model to a new example'
97+ # test = example[fi] > t
98+ # if reverse:
99+ # test = not test
100+ # return test
101+
102+ # Cross-validation, approach 1: the extreme case — hold out a single example as the test set and train on all the others.
91103from threshold import fit_model , predict
92104
93105# ning accuracy was 96.0%.
@@ -96,9 +108,9 @@ def is_virginica_test(fi, t, reverse, example):
96108
# Leave-one-out cross-validation: each example is held out in turn;
# the model is fit on all the others and tested on the held-out one.
for held_out in range(len(features)):
    # Boolean masks: `testing` is True only at `held_out`,
    # `training` is True everywhere else.
    testing = np.zeros(len(features), bool)
    testing[held_out] = True
    training = ~testing
    fold_model = fit_model(features[training], is_virginica[training])
    fold_preds = predict(fold_model, features[testing])
    correct += (fold_preds == is_virginica[testing]).sum()
@@ -124,17 +136,18 @@ def is_virginica_test(fi, t, reverse, example):
# Switch datasets: the remaining examples use the seeds data instead of iris.
124136features , labels = load_dataset ('seeds' )
125137
126138
127-
139+ # Cross-validation, approach 2: k-fold.
128140from sklearn .neighbors import KNeighborsClassifier
129141classifier = KNeighborsClassifier (n_neighbors = 1 )
130142from sklearn .cross_validation import KFold
131143
132144kf = KFold (len (features ), n_folds = 5 , shuffle = True )
133145means = []
146+ # kf yields only index arrays; the data and labels themselves still come from `features`/`labels`.
134147for training ,testing in kf :
135148# We learn a model for this fold with `fit` and then apply it to the
136149# testing data with `predict`:
137- classifier .fit (features [training ], labels [training ])
150+ classifier .fit (features [training ], labels [training ])# `features` holds all the feature columns
138151prediction = classifier .predict (features [testing ])
139152
140153# np.mean on an array of booleans returns fraction
@@ -143,7 +156,7 @@ def is_virginica_test(fi, t, reverse, example):
143156means .append (curmean )
144157print ('Mean accuracy:{:.1%}' .format (np .mean (means )))
145158
146-
159+ # The same KNN classifier again, this time comparing accuracy after standardizing the features.
147160from sklearn .pipeline import Pipeline
148161from sklearn .preprocessing import StandardScaler
149162
@@ -161,4 +174,5 @@ def is_virginica_test(fi, t, reverse, example):
161174# of correct decisions for this fold:
162175curmean = np .mean (prediction == labels [testing ])
163176means .append (curmean )
177+ # NOTE(review): the reported accuracy varies between runs because KFold(shuffle=True)
177+ # is not seeded — pass a fixed random_state for reproducible results.
164178print ('Mean accuracy:{:.1%}' .format (np .mean (means )))
0 commit comments