Skip to content

Commit afcdc0b

Browse files
committed
ENH Add chapter.py files
These contain the code as typed in the text
1 parent bb9b8de commit afcdc0b

File tree

6 files changed

+494
-0
lines changed

6 files changed

+494
-0
lines changed

‎ch08/chapter.py‎

Lines changed: 208 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,208 @@
1+
importnumpyasnp# NOT IN BOOK
2+
frommatplotlibimportpyplotasplt# NOT IN BOOK
3+
4+
def load(path='data/ml-100k/u.data'):
    """Load a ratings file into a dense array indexed by (user, movie).

    Parameters
    ----------
    path : str, optional
        Whitespace-separated ratings file readable by ``np.loadtxt``.
        The first three columns are used as (user id, movie id, rating);
        ids are 1-based in the file.

    Returns
    -------
    numpy.ndarray
        Float array in which entry ``[u, m]`` is the rating stored for
        0-based user ``u`` and movie ``m``; positions with no rating are 0.
    """
    import numpy as np
    from scipy import sparse

    data = np.loadtxt(path)
    # loadtxt returns floats; sparse constructors need integer index arrays.
    ij = data[:, :2].astype(int)
    ij -= 1  # original data is in 1-based system
    values = data[:, 2]
    rows, cols = ij.T
    reviews = sparse.csc_matrix((values, (rows, cols))).astype(float)
    return reviews.toarray()
14+
reviews = load()
# Row (U) and column (M) indices of every non-zero entry of `reviews`.
U, M = np.where(reviews)

import random
# Randomly pick one tenth of the non-zero entries to hold out for testing.
test_idxs = np.array(random.sample(range(len(U)), len(U) // 10))

# Training data: a copy of the ratings with the held-out entries zeroed.
train = reviews.copy()
train[U[test_idxs], M[test_idxs]] = 0

# Testing data: zeros everywhere except the held-out entries.
test = np.zeros_like(reviews)
test[U[test_idxs], M[test_idxs]] = reviews[U[test_idxs], M[test_idxs]]
24+
25+
class NormalizePositive(object):
    """Standardize the strictly positive entries of a 2-D array.

    Entries that are not greater than zero are treated as missing: they are
    excluded from the mean/std estimates and are left at zero by
    ``transform``.

    Parameters
    ----------
    axis : int, optional
        With ``axis=0`` (default) statistics are computed per column.
        With ``axis=1`` the input is transposed first, so statistics are
        computed per row.
    """

    def __init__(self, axis=0):
        self.axis = axis

    def fit(self, features, y=None):
        """Estimate ``self.mean`` and ``self.std`` from ``features``.

        ``y`` is accepted and ignored. Returns ``self``.
        """
        if self.axis == 1:
            features = features.T
        # count features that are greater than zero in axis 0:
        binary = (features > 0)

        count0 = binary.sum(axis=0)

        # to avoid division by zero, set zero counts to one:
        count0[count0 == 0] = 1.

        # the sum over all entries divided by the number of positive ones:
        self.mean = features.sum(axis=0) / count0

        # only consider differences where binary is True:
        diff = (features - self.mean) * binary
        diff **= 2
        # regularize the estimate of std by adding 0.1
        self.std = np.sqrt(0.1 + diff.sum(axis=0) / count0)
        return self

    def transform(self, features):
        """Return a normalized copy of ``features``.

        Positive entries become ``(x - mean) / std``; all other entries
        are set to zero.
        """
        if self.axis == 1:
            features = features.T
        binary = (features > 0)
        # the subtraction allocates a new array, so the input is untouched
        features = features - self.mean
        features /= self.std
        features *= binary
        if self.axis == 1:
            features = features.T
        return features

    def inverse_transform(self, features, copy=True):
        """Undo ``transform``: multiply by ``std`` and add ``mean``.

        The inverse is applied to every entry. When ``copy`` is false the
        operation is performed in place on ``features``.
        """
        if copy:
            features = features.copy()
        if self.axis == 1:
            features = features.T
        features *= self.std
        features += self.mean
        if self.axis == 1:
            features = features.T
        return features

    def fit_transform(self, features, y=None):
        """Fit on ``features`` and return its normalized version.

        ``y`` is accepted for signature parity with ``fit`` and ignored.
        """
        return self.fit(features).transform(features)
75+
76+
77+
norm = NormalizePositive(axis=1)
# Remember which entries of the training matrix are positive *before*
# normalization replaces the raw values.
binary = (train > 0)
train = norm.fit_transform(train)
# plot just 200x200 area for space reasons
plt.imshow(binary[:200, :200], interpolation='nearest')
82+
83+
from scipy.spatial import distance
# compute all pair-wise distances:
dists = distance.pdist(binary, 'correlation')
# Convert to square form, so that dists[i,j]
# is distance between binary[i] and binary[j]:
dists = distance.squareform(dists)
# For each row, the indices of all rows sorted by increasing distance.
neighbors = dists.argsort(axis=1)

# We are going to fill this matrix with results
filled = train.copy()
for u in range(filled.shape[0]):
    # n_u is neighbors of user; position 0 is skipped
    # (presumably the user itself, at distance zero).
    n_u = neighbors[u, 1:]
    for m in range(filled.shape[1]):
        # get relevant reviews in order!
        revs = [train[neigh, m]
                for neigh in n_u
                if binary[neigh, m]]
        if revs:
            # take half of the reviews plus one into consideration:
            n = len(revs) // 2 + 1
            filled[u, m] = np.mean(revs[:n])

predicted = norm.inverse_transform(filled)
from sklearn import metrics
# Score only on the held-out (non-zero) test entries.
r2 = metrics.r2_score(test[test > 0], predicted[test > 0])
print('R2 score (binary neighbors): {:.1%}'.format(r2))
114+
115+
reviews = reviews.T
# use same code as before
# NOTE(review): `predicted` and `test` are not recomputed from the
# transposed matrix at this point, so as written this prints the same
# score as the previous step. The neighbor computation presumably has to
# be repeated on the transposed data first - confirm against the text.
r2 = metrics.r2_score(test[test > 0], predicted[test > 0])
print('R2 score (binary movie neighbors): {:.1%}'.format(r2))
119+
120+
121+
from sklearn.linear_model import ElasticNetCV  # NOT IN BOOK

reg = ElasticNetCV(alphas=[
    0.0125, 0.025, 0.05, .125, .25, .5, 1., 2., 4.])
filled = train.copy()
# iterate over all users:
for u in range(train.shape[0]):
    # All other users' rows serve as the features for user u.
    curtrain = np.delete(train, u, axis=0)
    # Columns (movies) for which user u has a positive training entry.
    bu = binary[u]
    # One sample per movie rated by u; target is u's own rating.
    reg.fit(curtrain[:, bu].T, train[u, bu])
    # Predict the remaining movies from the other users' rows.
    filled[u, ~bu] = reg.predict(curtrain[:, ~bu].T)
predicted = norm.inverse_transform(filled)
r2 = metrics.r2_score(test[test > 0], predicted[test > 0])
print('R2 score (user regression): {:.1%}'.format(r2))
135+
136+
137+
# SHOPPING BASKET ANALYSIS
# This is the slow version of the code, which will take a long time to
# complete.


from collections import defaultdict
from itertools import chain

# File is downloaded as a compressed file
import gzip
# file format is a line per transaction
# of the form '12 34 342 5...'
# The context manager closes the compressed file once it has been read.
with gzip.open('data/retail.dat.gz') as ifile:
    dataset = [[int(tok) for tok in line.strip().split()]
               for line in ifile]
dataset = [set(d) for d in dataset]
# count how often each product was purchased:
counts = defaultdict(int)
for elem in chain.from_iterable(dataset):
    counts[elem] += 1

minsupport = 80
# Products bought in at least `minsupport` transactions.
valid = {k for k, v in counts.items() if v >= minsupport}
itemsets = [frozenset([v]) for v in valid]
freqsets = []
# Grow the itemsets by one element per round, for at most 16 rounds.
for i in range(16):
    nextsets = []
    tested = set()
    for it in itemsets:
        for v in valid:
            if v not in it:
                # Create a new candidate set by adding v to it
                c = (it | frozenset([v]))
                # check If we have tested it already
                if c in tested:
                    continue
                tested.add(c)

                # Count support by looping over dataset
                # This step is slow.
                # Check `apriori.py` for a better implementation.
                support_c = sum(1 for d in dataset if d.issuperset(c))
                # Same inclusive threshold as used for single items above.
                if support_c >= minsupport:
                    nextsets.append(c)
    freqsets.extend(nextsets)
    itemsets = nextsets
    # Stop early once no candidate reached the minimum support.
    if not itemsets:
        break
print("Finished!")
185+
186+
187+
minlift = 5.0
nr_transactions = float(len(dataset))
for itemset in freqsets:
    # Try every element of the itemset as the consequent of a rule
    # whose antecedent is the rest of the itemset.
    for item in itemset:
        consequent = frozenset([item])
        antecedent = itemset - consequent
        # base: number of transactions containing the consequent item
        base = 0.0
        # acount: antecedent count
        acount = 0.0

        # ccount: number of transactions containing the whole itemset
        # (antecedent and consequent together)
        ccount = 0.0
        for d in dataset:
            if item in d:
                base += 1
            if d.issuperset(itemset):
                ccount += 1
            if d.issuperset(antecedent):
                acount += 1
        # Turn the consequent count into a fraction of all transactions.
        base /= nr_transactions
        p_y_given_x = ccount / acount
        lift = p_y_given_x / base
        if lift > minlift:
            print('Rule {0} -> {1} has lift {2}'
                  .format(antecedent, consequent, lift))

‎ch10/README.rst‎

Lines changed: 2 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -17,6 +17,8 @@ Running ``download.sh`` will retrieve the other dataset into a directory
1717
Scripts
1818
-------
1919

20+
chapter.py
21+
Code as written in the book.
2022
thresholded_figure.py
2123
Computes the thresholded figures, including after Gaussian blurring
2224
lena-ring.py

0 commit comments

Comments
(0)