1+ # coding=utf-8
12# This code is supporting material for the book
23# Building Machine Learning Systems with Python
34# by Willi Richert and Luis Pedro Coelho
2930elif t == 2 :
3031c = 'b'
3132marker = 'x'
33+ # Rows where target == t pick out the samples of one class; the second
33+ # index picks which feature column to plot.
34+ # There are four features (columns 0-3), so six scatter pairings are possible; only one is shown here.
3235plt .scatter (features [target == t , 0 ],
33- features [target == t , 1 ],
36+ features [target == t , 2 ],
3437marker = marker ,
3538c = c )
3639# We use NumPy fancy indexing to get an array of strings:
40+ # Each element of the result is target_names indexed by the corresponding entry of `target`.
3741labels = target_names [target ]
3842
3943# The petal length is the feature at position 2
4953
5054print ('Minimum of others:{0}.' .format (min_non_setosa ))
5155
56+ # Setosa is the easiest species to separate and was handled above;
56+ # from here on we only need to distinguish the remaining two species.
5257# ~ is the boolean negation operator
5358features = features [~ is_setosa ]
5459labels = labels [~ is_setosa ]
5560# Build a new target variable, is_virginica
61+ # Used later as the ground truth to evaluate the threshold predictions.
5662is_virginica = (labels == 'virginica' )
5763
# Initialize best_acc to an impossibly low value so the first real
# accuracy always replaces it.
best_acc = -1.0

# Exhaustive search for the best single-feature threshold classifier.
# `features` is an N x 4 array; each column is one measurement.
for fi in range(features.shape[1]):
    # Every observed value of feature `fi` is a candidate threshold.
    thresh = features[:, fi]
    for t in thresh:
        # Predict virginica when the feature exceeds threshold `t`.
        # (`thresh` already holds column `fi`; no need to re-slice it.)
        pred = (thresh > t)
        # True == 1 and False == 0, so the mean of the element-wise
        # comparison is the fraction of correct predictions (accuracy).
        acc = (pred == is_virginica).mean()
        # Also score the reversed rule (feature <= t means virginica).
        rev_acc = (pred == ~is_virginica).mean()
        if rev_acc > acc:
            reverse = True
            acc = rev_acc
        else:
            reverse = False
        # Remember the best (feature, threshold, direction) seen so far.
        if acc > best_acc:
            best_acc = acc
            best_fi = fi
            best_t = t
            best_reverse = reverse
# result (on the iris data): (3, 1.6, False, 0.94)
print(best_fi, best_t, best_reverse, best_acc)
84-
85- def is_virginica_test (fi , t , reverse , example ):
86- 'Apply threshold model to a new example'
87- test = example [fi ] > t
88- if reverse :
89- test = not test
90- return test
91+ # result:
92+ # (3, 1.6000000000000001, False, 0.93999999999999995)
93+
94+ # The following code has an error and is therefore disabled for now - Jim
95+ # def is_virginica_test(fi, t, reverse, example):
96+ # 'Apply threshold model to a new example'
97+ # test = example[fi] > t
98+ # if reverse:
99+ # test = not test
100+ # return test
101+
102+ # Cross-validation, approach 1: the extreme case — hold out a single example as the test set and train on all the others.
91103from threshold import fit_model , predict
92104
93105# ning accuracy was 96.0%.
@@ -96,9 +108,9 @@ def is_virginica_test(fi, t, reverse, example):
96108
# Leave-one-out cross-validation: each example is held out in turn;
# the model is fit on all the others and tested on the held-out one.
for held_out in range(len(features)):
    # Boolean masks: `testing` is True only at `held_out`,
    # `training` is True everywhere else.
    testing = np.zeros(len(features), bool)
    testing[held_out] = True
    training = ~testing
    fold_model = fit_model(features[training], is_virginica[training])
    fold_preds = predict(fold_model, features[testing])
    correct += (fold_preds == is_virginica[testing]).sum()
@@ -124,17 +136,18 @@ def is_virginica_test(fi, t, reverse, example):
# Switch datasets: the remaining examples use the seeds data instead of iris.
124136features , labels = load_dataset ('seeds' )
125137
126138
127-
139+ # Cross-validation, approach 2: k-fold.
128140from sklearn .neighbors import KNeighborsClassifier
129141classifier = KNeighborsClassifier (n_neighbors = 1 )
130142from sklearn .cross_validation import KFold
131143
132144kf = KFold (len (features ), n_folds = 5 , shuffle = True )
133145means = []
146+ # kf yields only index arrays; the data and labels themselves still come from `features`/`labels`.
134147for training ,testing in kf :
135148# We learn a model for this fold with `fit` and then apply it to the
136149# testing data with `predict`:
137- classifier .fit (features [training ], labels [training ])
150+ classifier .fit (features [training ], labels [training ])# `features` holds all the feature columns
138151prediction = classifier .predict (features [testing ])
139152
140153# np.mean on an array of booleans returns fraction
@@ -143,7 +156,7 @@ def is_virginica_test(fi, t, reverse, example):
143156means .append (curmean )
144157print ('Mean accuracy:{:.1%}' .format (np .mean (means )))
145158
146-
159+ # The same KNN classifier again, this time comparing accuracy after standardizing the features.
147160from sklearn .pipeline import Pipeline
148161from sklearn .preprocessing import StandardScaler
149162
@@ -161,4 +174,5 @@ def is_virginica_test(fi, t, reverse, example):
161174# of correct decisions for this fold:
162175curmean = np .mean (prediction == labels [testing ])
163176means .append (curmean )
177+ # NOTE(review): the reported accuracy varies between runs because KFold(shuffle=True)
177+ # is not seeded — pass a fixed random_state for reproducible results.
164178print ('Mean accuracy:{:.1%}' .format (np .mean (means )))
0 commit comments