
Commit 4205208

ENH Single file with the code as in book
1 parent 1d8fd23 commit 4205208

File tree: 2 files changed, 170 additions and 0 deletions


ch02/README.rst

Lines changed: 3 additions & 0 deletions
@@ -6,6 +6,9 @@ Support code for *Chapter 2: Learning How to Classify with Real-world
 Examples*. The directory data contains the seeds dataset, originally downloaded
 from https://archive.ics.uci.edu/ml/datasets/seeds
 
+chapter.py
+    The code as printed in the book.
+
 figure1.py
     Figure 1 in the book: all 2-by-2 scatter plots
ch02/chapter.py

Lines changed: 167 additions & 0 deletions
@@ -0,0 +1,167 @@
# This code is supporting material for the book
# Building Machine Learning Systems with Python
# by Willi Richert and Luis Pedro Coelho
# published by PACKT Publishing
#
# It is made available under the MIT License


from matplotlib import pyplot as plt
import numpy as np

# We load the data with load_iris from sklearn
from sklearn.datasets import load_iris
data = load_iris()

# load_iris returns an object with several fields
features = data.data
feature_names = data.feature_names
target = data.target
target_names = data.target_names

for t in range(3):
    if t == 0:
        c = 'r'
        marker = '>'
    elif t == 1:
        c = 'g'
        marker = 'o'
    elif t == 2:
        c = 'b'
        marker = 'x'
    plt.scatter(features[target == t, 0],
                features[target == t, 1],
                marker=marker,
                c=c)
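# (The scatter plot above is drawn on the current matplotlib figure; in a
# non-interactive session an explicit plt.show() would be needed to
# display it.)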
# We use NumPy fancy indexing to get an array of strings:
labels = target_names[target]

# The petal length is the feature at position 2
plength = features[:, 2]

# Build an array of booleans:
is_setosa = (labels == 'setosa')

# This is the important step:
max_setosa = plength[is_setosa].max()
min_non_setosa = plength[~is_setosa].min()
print('Maximum of setosa: {0}.'.format(max_setosa))

print('Minimum of others: {0}.'.format(min_non_setosa))

# ~ is the boolean negation operator
features = features[~is_setosa]
labels = labels[~is_setosa]
# Build a new target variable, is_virginica
is_virginica = (labels == 'virginica')

# Initialize best_acc to an impossibly low value
best_acc = -1.0
for fi in range(features.shape[1]):
    # We are going to test all possible thresholds
    thresh = features[:, fi]
    for t in thresh:

        # Get the vector for feature `fi`
        feature_i = features[:, fi]
        # apply threshold `t`
        pred = (feature_i > t)
        acc = (pred == is_virginica).mean()
        rev_acc = (pred == ~is_virginica).mean()
        if rev_acc > acc:
            reverse = True
            acc = rev_acc
        else:
            reverse = False

        if acc > best_acc:
            best_acc = acc
            best_fi = fi
            best_t = t
            best_reverse = reverse

print(best_fi, best_t, best_reverse, best_acc)


def is_virginica_test(fi, t, reverse, example):
    'Apply threshold model to a new example'
    test = example[fi] > t
    if reverse:
        test = not test
    return test

from threshold import fit_model, predict

# Training accuracy was 96.0%.
# Testing accuracy was 90.0% (N = 50).
correct = 0.0

for ei in range(len(features)):
    # select all but the one at position `ei`:
    training = np.ones(len(features), bool)
    training[ei] = False
    testing = ~training
    model = fit_model(features[training], is_virginica[training])
    predictions = predict(model, features[testing])
    correct += np.sum(predictions == is_virginica[testing])
acc = correct / float(len(features))
print('Accuracy: {0:.1%}'.format(acc))
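# NOTE: `threshold.py` is not included in this commit; the import above
# assumes it exists alongside this file. A minimal sketch of the
# `fit_model`/`predict` interface the loop needs, mirroring the exhaustive
# search earlier in this file (the names and the (fi, t, reverse) model
# format are illustrative assumptions, not the module's confirmed API):
#
#   def fit_model(features, labels):
#       '''Exhaustive threshold search; returns (fi, t, reverse).'''
#       best_acc = -1.0
#       for fi in range(features.shape[1]):
#           feature_i = features[:, fi]
#           for t in feature_i:
#               pred = (feature_i > t)
#               acc = (pred == labels).mean()
#               rev_acc = (pred == ~labels).mean()
#               reverse = (rev_acc > acc)
#               acc = max(acc, rev_acc)
#               if acc > best_acc:
#                   best_acc, best_model = acc, (fi, t, reverse)
#       return best_model
#
#   def predict(model, features):
#       '''Apply a threshold model; returns a boolean array of predictions.'''
#       fi, t, reverse = model
#       pred = (features[:, fi] > t)
#       return ~pred if reverse else pred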


###########################################
############## SEEDS DATASET ##############
###########################################

from load import load_dataset

feature_names = [
    'area',
    'perimeter',
    'compactness',
    'length of kernel',
    'width of kernel',
    'asymmetry coefficient',
    'length of kernel groove',
]
features, labels = load_dataset('seeds')
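# NOTE: `load.py` is likewise not part of this commit. A minimal sketch of
# what `load_dataset` would need to do, assuming the seeds data is stored as
# tab-separated values with the class label in the last column (the file
# name `data/seeds.tsv` and the format are assumptions):
#
#   def load_dataset(dataset_name):
#       data, labels = [], []
#       with open('data/{0}.tsv'.format(dataset_name)) as ifile:
#           for line in ifile:
#               tokens = line.strip().split('\t')
#               data.append([float(tk) for tk in tokens[:-1]])
#               labels.append(tokens[-1])
#       return np.array(data), np.array(labels)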

from sklearn.neighbors import KNeighborsClassifier
classifier = KNeighborsClassifier(n_neighbors=1)
from sklearn.cross_validation import KFold

kf = KFold(len(features), n_folds=5, shuffle=True)
means = []
for training, testing in kf:
    # We learn a model for this fold with `fit` and then apply it to the
    # testing data with `predict`:
    classifier.fit(features[training], labels[training])
    prediction = classifier.predict(features[testing])

    # np.mean on an array of booleans returns fraction
    # of correct decisions for this fold:
    curmean = np.mean(prediction == labels[testing])
    means.append(curmean)
print('Mean accuracy: {:.1%}'.format(np.mean(means)))


from sklearn.pipeline import Pipeline
from sklearn.preprocessing import StandardScaler

classifier = KNeighborsClassifier(n_neighbors=1)
classifier = Pipeline([('norm', StandardScaler()), ('knn', classifier)])


means = []
for training, testing in kf:
    # We learn a model for this fold with `fit` and then apply it to the
    # testing data with `predict`:
    classifier.fit(features[training], labels[training])
    prediction = classifier.predict(features[testing])

    # np.mean on an array of booleans returns fraction
    # of correct decisions for this fold:
    curmean = np.mean(prediction == labels[testing])
    means.append(curmean)
print('Mean accuracy: {:.1%}'.format(np.mean(means)))
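
Note: the cross-validation code above uses sklearn.cross_validation, which was
the current module name when this commit was made; scikit-learn 0.20 later
removed it. A minimal sketch of the equivalent folds under the newer
sklearn.model_selection API (assuming scikit-learn >= 0.18; variable names
reuse those from the listing):

from sklearn.model_selection import KFold, cross_val_score

kf = KFold(n_splits=5, shuffle=True)
means = []
for training, testing in kf.split(features):
    classifier.fit(features[training], labels[training])
    prediction = classifier.predict(features[testing])
    means.append(np.mean(prediction == labels[testing]))
print('Mean accuracy: {:.1%}'.format(np.mean(means)))

# Or in one call; the default scoring for a classifier is accuracy:
scores = cross_val_score(classifier, features, labels, cv=kf)
print('Mean accuracy: {:.1%}'.format(scores.mean()))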
