import numpy as np  # NOT IN BOOK
from matplotlib import pyplot as plt  # NOT IN BOOK


def load():
    import numpy as np
    from scipy import sparse

    data = np.loadtxt('data/ml-100k/u.data')
    ij = data[:, :2].astype(int)  # sparse index arrays must be integers
    ij -= 1  # original data is in 1-based system
    values = data[:, 2]
    reviews = sparse.csc_matrix((values, ij.T)).astype(float)
    return reviews.toarray()


reviews = load()
U, M = np.where(reviews)
import random
test_idxs = np.array(random.sample(range(len(U)), len(U) // 10))

train = reviews.copy()
train[U[test_idxs], M[test_idxs]] = 0

test = np.zeros_like(reviews)
test[U[test_idxs], M[test_idxs]] = reviews[U[test_idxs], M[test_idxs]]
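
# NOT IN BOOK: the split above holds out 10% of the known ratings as a
# test set (zeroing them out in `train`). random.sample is unseeded, so
# the split, and therefore the R2 scores printed below, will vary
# between runs; calling random.seed() first would make them reproducible.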

class NormalizePositive(object):

    def __init__(self, axis=0):
        self.axis = axis

    def fit(self, features, y=None):
        if self.axis == 1:
            features = features.T
        # count features that are greater than zero in axis 0:
        binary = (features > 0)

        count0 = binary.sum(axis=0)

        # to avoid division by zero, set zero counts to one:
        count0[count0 == 0] = 1.

        # computing the mean is easy:
        self.mean = features.sum(axis=0) / count0

        # only consider differences where binary is True:
        diff = (features - self.mean) * binary
        diff **= 2
        # regularize the estimate of std by adding 0.1:
        self.std = np.sqrt(0.1 + diff.sum(axis=0) / count0)
        return self

    def transform(self, features):
        if self.axis == 1:
            features = features.T
        binary = (features > 0)
        features = features - self.mean
        features /= self.std
        features *= binary
        if self.axis == 1:
            features = features.T
        return features

    def inverse_transform(self, features, copy=True):
        if copy:
            features = features.copy()
        if self.axis == 1:
            features = features.T
        features *= self.std
        features += self.mean
        if self.axis == 1:
            features = features.T
        return features

    def fit_transform(self, features):
        return self.fit(features).transform(features)
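

# NOT IN BOOK: a quick sanity check of the class above, on a tiny
# hand-made matrix (an assumption for illustration, not data from the
# book). transform() z-scores the positive entries and
# inverse_transform() maps them back, so the round trip should recover
# the nonzero ratings exactly:
_example = np.array([[5., 3., 0.],
                     [0., 4., 2.]])
_norm = NormalizePositive(axis=1)
_round_trip = _norm.inverse_transform(_norm.fit_transform(_example))
assert np.allclose(_round_trip[_example > 0], _example[_example > 0])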

norm = NormalizePositive(axis=1)
binary = (train > 0)
train = norm.fit_transform(train)
# plot just 200x200 area for space reasons
plt.imshow(binary[:200, :200], interpolation='nearest')

from scipy.spatial import distance

# compute all pair-wise distances:
dists = distance.pdist(binary, 'correlation')
# Convert to square form, so that dists[i, j]
# is the distance between binary[i] and binary[j]:
dists = distance.squareform(dists)
neighbors = dists.argsort(axis=1)
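
# NOT IN BOOK: the diagonal of `dists` is zero (every user is at
# distance 0 from itself), so neighbors[u, 0] is, barring ties, u
# itself; that is why the loop below skips it via neighbors[u, 1:].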

# We are going to fill this matrix with results
filled = train.copy()
for u in range(filled.shape[0]):
    # n_u is the array of u's neighbors, nearest first
    n_u = neighbors[u, 1:]
    for m in range(filled.shape[1]):
        # get relevant reviews in order!
        revs = [train[neigh, m]
                for neigh in n_u
                if binary[neigh, m]]
        if len(revs):
            # n is the number of reviews for this movie
            n = len(revs)
            # take half of the reviews plus one into consideration:
            n //= 2
            n += 1
            revs = revs[:n]
            filled[u, m] = np.mean(revs)

predicted = norm.inverse_transform(filled)
from sklearn import metrics
r2 = metrics.r2_score(test[test > 0], predicted[test > 0])
print('R2 score (binary neighbors): {:.1%}'.format(r2))

reviews = reviews.T
# use same code as before; note that the neighbor pipeline above must be
# rerun on the transposed matrix (movies as rows) for this score to
# differ from the user-based one:
r2 = metrics.r2_score(test[test > 0], predicted[test > 0])
print('R2 score (binary movie neighbors): {:.1%}'.format(r2))


from sklearn.linear_model import ElasticNetCV  # NOT IN BOOK

reg = ElasticNetCV(alphas=[
    0.0125, 0.025, 0.05, .125, .25, .5, 1., 2., 4.])
filled = train.copy()
# iterate over all users:
for u in range(train.shape[0]):
    curtrain = np.delete(train, u, axis=0)
    bu = binary[u]
    reg.fit(curtrain[:, bu].T, train[u, bu])
    filled[u, ~bu] = reg.predict(curtrain[:, ~bu].T)
predicted = norm.inverse_transform(filled)
r2 = metrics.r2_score(test[test > 0], predicted[test > 0])
print('R2 score (user regression): {:.1%}'.format(r2))
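
# NOT IN BOOK: note on the loop above. For each user u, the features are
# the normalized ratings that all *other* users gave to the movies u has
# rated (curtrain[:, bu].T has one row per such movie), and the targets
# are u's own ratings; the fitted model is then used to predict u's
# ratings for the movies u has not rated.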


# SHOPPING BASKET ANALYSIS
# This is the slow version of the code, which will take a long time to
# complete.

from collections import defaultdict
from itertools import chain

# File is downloaded as a compressed file
import gzip
# file format is a line per transaction
# of the form '12 34 342 5...'
dataset = [[int(tok) for tok in line.strip().split()]
           for line in gzip.open('data/retail.dat.gz')]
dataset = [set(d) for d in dataset]
# count how often each product was purchased:
counts = defaultdict(int)
for elem in chain(*dataset):
    counts[elem] += 1

minsupport = 80
valid = set(k for k, v in counts.items() if (v >= minsupport))
itemsets = [frozenset([v]) for v in valid]
freqsets = []
for i in range(16):
    nextsets = []
    tested = set()
    for it in itemsets:
        for v in valid:
            if v not in it:
                # Create a new candidate set by adding v to it
                c = (it | frozenset([v]))
                # check if we have tested it already:
                if c in tested:
                    continue
                tested.add(c)

                # Count support by looping over the dataset.
                # This step is slow; check `apriori.py` for a better
                # implementation.
                support_c = sum(1 for d in dataset if d.issuperset(c))
                if support_c > minsupport:
                    nextsets.append(c)
    freqsets.extend(nextsets)
    itemsets = nextsets
    if not len(itemsets):
        break
print("Finished!")
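
# NOT IN BOOK: a sketch of one way the support count above could be made
# faster (an assumption about a possible optimization, not the actual
# implementation in `apriori.py`). Store, for each frequent item, the
# set of indices of the transactions containing it; the support of a
# candidate is then the size of the intersection of its items' index
# sets, which avoids rescanning every transaction for every candidate.
from functools import reduce

tidlists = {v: set(i for i, d in enumerate(dataset) if v in d)
            for v in valid}


def fast_support(c):
    return len(reduce(set.intersection,
                      (tidlists[v] for v in c)))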


minlift = 5.0
nr_transactions = float(len(dataset))
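# NOT IN BOOK: for reference, the loop below estimates
#   lift(antecedent -> consequent) = P(consequent | antecedent) / P(consequent)
# with P(consequent | antecedent) ~ ccount / acount and, after the
# division by nr_transactions, P(consequent) ~ base.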
for itemset in freqsets:
    for item in itemset:
        consequent = frozenset([item])
        antecedent = itemset - consequent
        base = 0.0
        # acount: number of transactions containing the antecedent
        acount = 0.0

        # ccount: number of transactions containing the full itemset
        # (antecedent and consequent together)
        ccount = 0.0
        for d in dataset:
            if item in d:
                base += 1
            if d.issuperset(itemset):
                ccount += 1
            if d.issuperset(antecedent):
                acount += 1
        base /= nr_transactions
        p_y_given_x = ccount / acount
        lift = p_y_given_x / base
        if lift > minlift:
            print('Rule {0} -> {1} has lift {2}'
                  .format(antecedent, consequent, lift))