charlessoft
diff --git a/‎ch08/chapter.py‎
Lines changed: 208 additions & 0 deletions b/‎ch08/chapter.py‎
Lines changed: 208 additions & 0 deletions
diff --git a/‎ch10/README.rst‎
Lines changed: 2 additions & 0 deletions b/‎ch10/README.rst‎
Lines changed: 2 additions & 0 deletions
@@ -0,0 +1,208 @@
+importnumpyasnp# NOT IN BOOK
+frommatplotlibimportpyplotasplt# NOT IN BOOK
+
def load(path='data/ml-100k/u.data'):
    """Load a ratings file as a dense 2-D array.

    Parameters
    ----------
    path : str, optional
        Whitespace-separated numeric file readable by ``np.loadtxt``. The
        first two columns are 1-based row and column ids (user and movie in
        this chapter) and the third column is the rating. Any further
        columns are ignored. Defaults to the MovieLens 100k location used
        by the chapter.

    Returns
    -------
    reviews : ndarray of float
        ``reviews[i, j]`` holds the rating stored for row id ``i + 1`` and
        column id ``j + 1``; positions without a rating are zero.
    """
    import numpy as np
    from scipy import sparse

    data = np.loadtxt(path)
    # loadtxt yields floats; sparse index arrays must be integers.
    # Subtract one because the original data is in a 1-based system.
    ij = data[:, :2].astype(int) - 1
    values = data[:, 2]
    reviews = sparse.csc_matrix((values, (ij[:, 0], ij[:, 1]))).astype(float)
    return reviews.toarray()
# Dense ratings matrix; a zero entry means "no rating".
reviews = load()
# Row/column coordinates of every non-zero rating.
U, M = np.where(reviews)
import random

# Hold out one tenth of the known ratings, sampled without replacement.
n_held_out = len(U) // 10
test_idxs = np.array(random.sample(range(len(U)), n_held_out))
held_u = U[test_idxs]
held_m = M[test_idxs]

# The test matrix contains only the held-out ratings ...
test = np.zeros_like(reviews)
test[held_u, held_m] = reviews[held_u, held_m]

# ... and the training matrix has exactly those ratings blanked out.
train = reviews.copy()
train[held_u, held_m] = 0
+
class NormalizePositive(object):
    """Standardize a 2-D array using only its positive entries.

    Entries that are not greater than zero are treated as missing: they are
    excluded from the counts used for the mean and standard deviation, and
    they are zero in the transformed output. The mean is computed as the
    full sum divided by the number of positive entries, so non-positive
    entries are expected to be exactly zero.

    Parameters
    ----------
    axis : int, optional
        With ``axis=0`` (default) one mean/std is estimated per column.
        With ``axis=1`` the input is transposed first, so one mean/std is
        estimated per row.

    Attributes
    ----------
    mean, std : ndarray
        Set by ``fit``.
    """

    def __init__(self, axis=0):
        self.axis = axis

    def fit(self, features, y=None):
        """Estimate ``mean`` and ``std`` from the positive entries.

        ``y`` is accepted and ignored. Returns ``self``.
        """
        if self.axis == 1:
            features = features.T
        # count features that are greater than zero in axis 0:
        binary = (features > 0)
        count0 = binary.sum(axis=0)

        # to avoid division by zero, set zero counts to one:
        count0[count0 == 0] = 1

        self.mean = features.sum(axis=0) / count0

        # only consider differences where binary is True:
        diff = (features - self.mean) * binary
        diff **= 2
        # regularize the estimate of std by adding 0.1
        self.std = np.sqrt(0.1 + diff.sum(axis=0) / count0)
        return self

    def transform(self, features):
        """Return a standardized copy; non-positive entries become zero.

        The input array is not modified.
        """
        if self.axis == 1:
            features = features.T
        binary = (features > 0)
        # The subtraction allocates a new array, so the in-place
        # operations below never touch the caller's data.
        features = features - self.mean
        features /= self.std
        features *= binary
        if self.axis == 1:
            features = features.T
        return features

    def inverse_transform(self, features, copy=True):
        """Map standardized values back to the original scale.

        Every entry is rescaled, so an entry that is zero in the input
        comes back as the fitted mean. With ``copy=False`` the input array
        is modified in place.
        """
        if copy:
            features = features.copy()
        if self.axis == 1:
            features = features.T
        features *= self.std
        features += self.mean
        if self.axis == 1:
            features = features.T
        return features

    def fit_transform(self, features, y=None):
        """Fit on ``features`` and return its transformed copy.

        ``y`` is accepted and ignored, mirroring ``fit``.
        """
        return self.fit(features, y).transform(features)
+
+
norm = NormalizePositive(axis=1)
binary = (train > 0)
# Keep the un-normalized training ratings: the movie-based pass below has
# to normalize them along the other axis.
raw_train = train
train = norm.fit_transform(raw_train)
# plot just 200x200 area for space reasons
plt.imshow(binary[:200, :200], interpolation='nearest')

from scipy.spatial import distance


def _fill_from_neighbors(train, binary):
    """Predict every entry from the closest rows that have that entry.

    Rows are compared through the correlation distance between their rows
    of ``binary`` (which entries exist, not their values). For each row
    ``u`` and column ``m`` the other rows that have a value in column ``m``
    are taken in order of increasing distance to ``u``; the closest half of
    them, plus one, are averaged. Entries for which no other row has a
    value keep the value from ``train``.

    Parameters
    ----------
    train : ndarray
        Normalized ratings.
    binary : ndarray of bool
        Same shape as ``train``; True where a rating exists.

    Returns
    -------
    filled : ndarray
        Copy of ``train`` with the predictions written in.
    """
    # compute all pair-wise distances and convert to square form, so that
    # dists[i, j] is the distance between binary[i] and binary[j]:
    dists = distance.squareform(distance.pdist(binary, 'correlation'))
    neighbors = dists.argsort(axis=1)

    filled = train.copy()
    for u in range(filled.shape[0]):
        # all rows ordered by distance to u; position 0 is skipped as u
        # itself (it has distance zero to itself)
        n_u = neighbors[u, 1:]
        # Hoisted out of the inner loop: the neighbor ordering does not
        # depend on the column.
        ordered_values = train[n_u]
        ordered_seen = binary[n_u]
        for m in range(filled.shape[1]):
            # relevant reviews, still in neighbor order
            revs = ordered_values[ordered_seen[:, m], m]
            if len(revs):
                # take half of the reviews plus one into consideration:
                n = len(revs) // 2 + 1
                filled[u, m] = revs[:n].mean()
    return filled


filled = _fill_from_neighbors(train, binary)
predicted = norm.inverse_transform(filled)
from sklearn import metrics
r2 = metrics.r2_score(test[test > 0], predicted[test > 0])
print('R2 score (binary neighbors):{:.1%}'.format(r2))

# Movie-based neighbors: run the same procedure on the transposed training
# matrix, then transpose the prediction back so it lines up with `test`.
# `reviews` itself is left untransposed so it stays aligned with
# `train` and `test`.
movie_norm = NormalizePositive(axis=1)
movie_train = movie_norm.fit_transform(raw_train.T)
movie_filled = _fill_from_neighbors(movie_train, raw_train.T > 0)
predicted = movie_norm.inverse_transform(movie_filled).T
r2 = metrics.r2_score(test[test > 0], predicted[test > 0])
print('R2 score (binary movie neighbors):{:.1%}'.format(r2))
+
+
from sklearn.linear_model import ElasticNetCV  # NOT IN BOOK

# Candidate regularization strengths for the cross-validated elastic net.
candidate_alphas = [0.0125, 0.025, 0.05, .125, .25, .5, 1., 2., 4.]
reg = ElasticNetCV(alphas=candidate_alphas)

filled = train.copy()
# One regression per user: the other users' rows act as the features and
# each movie is one sample.
for u in range(train.shape[0]):
    rated = binary[u]
    unrated = ~rated
    others = np.delete(train, u, axis=0)
    # Fit on the movies this user rated ...
    reg.fit(others[:, rated].T, train[u, rated])
    # ... and predict the movies this user did not rate.
    filled[u, unrated] = reg.predict(others[:, unrated].T)

predicted = norm.inverse_transform(filled)
held_out = test > 0
r2 = metrics.r2_score(test[held_out], predicted[held_out])
print('R2 score (user regression):{:.1%}'.format(r2))
+
+
+# SHOPPING BASKET ANALYSIS
+# This is the slow version of the code, which will take a long time to
+# complete.
+
+
from collections import defaultdict
from itertools import chain

# File is downloaded as a compressed file
import gzip

# File format is a line per transaction of the form '12 34 342 5...'.
# Each transaction is stored as a set of product ids. The `with` block
# closes the file once it has been read.
with gzip.open('data/retail.dat.gz') as retail:
    dataset = [set(int(tok) for tok in line.strip().split())
               for line in retail]

# count how often each product was purchased (at most once per transaction,
# because every transaction is a set):
counts = defaultdict(int)
for elem in chain.from_iterable(dataset):
    counts[elem] += 1

# An item or itemset is frequent when it appears in at least this many
# transactions.
minsupport = 80
valid = {k for k, v in counts.items() if v >= minsupport}
itemsets = [frozenset([v]) for v in valid]
# Frequent itemsets with two or more elements, accumulated level by level.
freqsets = []
# Each pass grows the surviving itemsets by one element; stop after 16
# passes or as soon as a pass yields nothing.
for i in range(16):
    nextsets = []
    tested = set()
    for it in itemsets:
        for v in valid:
            if v in it:
                continue
            # Create a new candidate set by adding v to it
            c = it | frozenset([v])
            # check if we have tested it already
            if c in tested:
                continue
            tested.add(c)

            # Count support by looping over dataset
            # This step is slow.
            # Check `apriori.py` for a better implementation.
            support_c = sum(1 for d in dataset if d.issuperset(c))
            # Same inclusive threshold as used for single items above.
            if support_c >= minsupport:
                nextsets.append(c)
    freqsets.extend(nextsets)
    itemsets = nextsets
    if not itemsets:
        break
print("Finished!")
+
+
# Only rules whose lift exceeds this value are reported.
minlift = 5.0
nr_transactions = float(len(dataset))
for itemset in freqsets:
    # Try each member of the itemset as the consequent of a rule
    # "rest of the itemset -> member".
    for item in itemset:
        consequent = frozenset([item])
        antecedent = itemset - consequent

        # base:   transactions containing the consequent item
        # acount: transactions containing the whole antecedent
        # ccount: transactions containing the complete itemset
        base = acount = ccount = 0.0
        for basket in dataset:
            has_item = item in basket
            if has_item:
                base += 1
            if antecedent <= basket:
                acount += 1
                # antecedent plus item is the complete itemset
                if has_item:
                    ccount += 1

        # Fraction of all transactions that contain the consequent.
        base = base / nr_transactions
        # Fraction of antecedent transactions that contain the consequent.
        p_y_given_x = ccount / acount
        lift = p_y_given_x / base
        if lift > minlift:
            print('Rule{0} ->{1} has lift{2}'
                  .format(antecedent, consequent, lift))
@@ -17,6 +17,8 @@ Running ``download.sh`` will retrieve the other dataset into a directory
 Scripts
 -------
 
+chapter.py
+ Code as written in the book.
 thresholded_figure.py
  Computes the thresholded figures, including after Gaussian blurring
 lena-ring.py