# This code is supporting material for the book
# Building Machine Learning Systems with Python
# by Willi Richert and Luis Pedro Coelho
# published by PACKT Publishing
#
# It is made available under the MIT License

from matplotlib import pyplot as plt
import numpy as np

# We load the data with load_iris from sklearn
from sklearn.datasets import load_iris
data = load_iris()

# load_iris returns an object with several fields
features = data.data
feature_names = data.feature_names
target = data.target
target_names = data.target_names
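
# Quick sanity check (added illustration, not in the original script): the
# iris data is 150 examples by 4 features, with three class names.
print(features.shape)       # (150, 4)
print(list(target_names))   # ['setosa', 'versicolor', 'virginica']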

for t in range(3):
    if t == 0:
        c = 'r'
        marker = '>'
    elif t == 1:
        c = 'g'
        marker = 'o'
    elif t == 2:
        c = 'b'
        marker = 'x'
    plt.scatter(features[target == t, 0],
                features[target == t, 1],
                marker=marker,
                c=c)
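
# Added for illustration: label the axes with the two plotted features and
# render the figure (the original snippet leaves the display step implicit).
plt.xlabel(feature_names[0])
plt.ylabel(feature_names[1])
plt.show()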

# We use NumPy fancy indexing to get an array of strings:
labels = target_names[target]

# The petal length is the feature at position 2
plength = features[:, 2]

# Build an array of booleans:
is_setosa = (labels == 'setosa')

# This is the important step:
max_setosa = plength[is_setosa].max()
min_non_setosa = plength[~is_setosa].min()
print('Maximum of setosa: {0}.'.format(max_setosa))

print('Minimum of others: {0}.'.format(min_non_setosa))
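
# The numbers above show the classes separate cleanly on petal length:
# setosa tops out at 1.9 cm while every other flower measures at least
# 3.0 cm. An added illustration (2.0 is an arbitrary cutoff in that gap):
def is_setosa_test(example):
    'True if a single example looks like Iris Setosa by petal length'
    return example[2] < 2.0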

# ~ is the boolean negation operator
features = features[~is_setosa]
labels = labels[~is_setosa]
# Build a new target variable, is_virginica
is_virginica = (labels == 'virginica')

# Initialize best_acc to impossibly low value
best_acc = -1.0
for fi in range(features.shape[1]):
    # We are going to test all possible thresholds
    thresh = features[:, fi]
    for t in thresh:

        # Get the vector for feature `fi`
        feature_i = features[:, fi]
        # apply threshold `t`
        pred = (feature_i > t)
        acc = (pred == is_virginica).mean()
        rev_acc = (pred == ~is_virginica).mean()
        if rev_acc > acc:
            reverse = True
            acc = rev_acc
        else:
            reverse = False

        if acc > best_acc:
            best_acc = acc
            best_fi = fi
            best_t = t
            best_reverse = reverse

print(best_fi, best_t, best_reverse, best_acc)

def is_virginica_test(fi, t, reverse, example):
    'Apply threshold model to a new example'
    test = example[fi] > t
    if reverse:
        test = not test
    return test
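
# Example use of the model found above (added illustration): classify the
# first non-setosa example with the learned feature, threshold, and sense.
print(is_virginica_test(best_fi, best_t, best_reverse, features[0]))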

# fit_model() and predict() come from the book's companion `threshold`
# module, which packages the brute-force search above:
try:
    from threshold import fit_model, predict
except ImportError:
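    # Fallback sketch if the companion module is not on the path. This
    # mirrors the exhaustive search above; it is an assumption about the
    # module's behavior, not necessarily the book's exact code.
    def fit_model(features, labels):
        'Learn a (feature, threshold, reverse) model by brute force'
        best_acc = -1.0
        for fi in range(features.shape[1]):
            for t in features[:, fi]:
                pred = (features[:, fi] > t)
                acc = (pred == labels).mean()
                rev_acc = (pred != labels).mean()
                reverse = rev_acc > acc
                if max(acc, rev_acc) > best_acc:
                    best_acc = max(acc, rev_acc)
                    model = (fi, t, reverse)
        return model

    def predict(model, features):
        'Apply a threshold model to an array of examples'
        fi, t, reverse = model
        pred = features[:, fi] > t
        return ~pred if reverse else pred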

# The training accuracy was 96.0%.
# The testing accuracy was 90.0% (N = 50).
correct = 0.0

for ei in range(len(features)):
    # select all but the one at position `ei`:
    training = np.ones(len(features), bool)
    training[ei] = False
    testing = ~training
    model = fit_model(features[training], is_virginica[training])
    predictions = predict(model, features[testing])
    correct += np.sum(predictions == is_virginica[testing])
acc = correct / float(len(features))
print('Accuracy: {0:.1%}'.format(acc))


###########################################
############## SEEDS DATASET ##############
###########################################

# load_dataset() comes from the book's companion `load` module:
try:
    from load import load_dataset
except ImportError:
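    # Fallback sketch if the companion module is not on the path. It assumes
    # the UCI seeds data sits in a tab-separated file with the class name in
    # the last column; the './data/seeds.tsv' layout is an assumption, not
    # necessarily the book's exact code.
    def load_dataset(dataset_name):
        'Load a tab-separated dataset with string labels in the last column'
        data, labels = [], []
        with open('./data/{0}.tsv'.format(dataset_name)) as ifile:
            for line in ifile:
                tokens = line.strip().split('\t')
                data.append([float(tk) for tk in tokens[:-1]])
                labels.append(tokens[-1])
        return np.array(data), np.array(labels)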

feature_names = [
    'area',
    'perimeter',
    'compactness',
    'length of kernel',
    'width of kernel',
    'asymmetry coefficient',
    'length of kernel groove',
]
features, labels = load_dataset('seeds')
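
# Sanity check (added illustration): the UCI seeds data has 210 examples,
# 7 features, and three wheat varieties.
print(features.shape)
print(set(labels))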

from sklearn.neighbors import KNeighborsClassifier
classifier = KNeighborsClassifier(n_neighbors=1)

# sklearn.cross_validation was removed from scikit-learn; the replacement
# lives in sklearn.model_selection, where KFold takes n_splits and yields
# index pairs through its split() method:
from sklearn.model_selection import KFold

kf = KFold(n_splits=5, shuffle=True)
means = []
for training, testing in kf.split(features):
    # We learn a model for this fold with `fit` and then apply it to the
    # testing data with `predict`:
    classifier.fit(features[training], labels[training])
    prediction = classifier.predict(features[testing])

    # np.mean on an array of booleans returns fraction
    # of correct decisions for this fold:
    curmean = np.mean(prediction == labels[testing])
    means.append(curmean)
print('Mean accuracy: {:.1%}'.format(np.mean(means)))
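
# The fold loop above can be written more compactly with cross_val_score,
# which runs the same fit/predict cycle per fold (added illustration,
# reusing the same `kf` splitter):
from sklearn.model_selection import cross_val_score
scores = cross_val_score(classifier, features, labels, cv=kf)
print('Mean accuracy: {:.1%}'.format(scores.mean()))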


from sklearn.pipeline import Pipeline
from sklearn.preprocessing import StandardScaler

classifier = KNeighborsClassifier(n_neighbors=1)
classifier = Pipeline([('norm', StandardScaler()), ('knn', classifier)])
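
# What the 'norm' step buys us (added illustration): StandardScaler rescales
# each feature to zero mean and unit variance, so no single feature
# dominates the distance computation inside KNN.
scaled = StandardScaler().fit_transform(features)
print(scaled.mean(axis=0).round(6))
print(scaled.std(axis=0).round(6))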

means = []
for training, testing in kf.split(features):
    # We learn a model for this fold with `fit` and then apply it to the
    # testing data with `predict`:
    classifier.fit(features[training], labels[training])
    prediction = classifier.predict(features[testing])

    # np.mean on an array of booleans returns fraction
    # of correct decisions for this fold:
    curmean = np.mean(prediction == labels[testing])
    means.append(curmean)
print('Mean accuracy: {:.1%}'.format(np.mean(means)))