Skip to content

Commit 7f1428c

Browse files
committed
Improved data/chart handling; added toy data
1 parent 06c2a57 commit 7f1428c

File tree

10 files changed

+57
-8
lines changed

10 files changed

+57
-8
lines changed

‎ch03/README.md‎

Lines changed: 7 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,7 @@
1+
Chapter 3 - Clustering - Finding Related Posts
2+
==============================================
3+
4+
For this chapter you will need the '20news' dataset from
5+
http://mlcomp.org/datasets/379. To get the data you will need to
6+
register, but it is totally free. Once you are logged in, you will
7+
see a ZIP download link.

‎ch03/data/toy/01.txt‎

Lines changed: 1 addition & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1 @@
1+
This is a toy post about machine learning. Actually, it contains not much interesting stuff.

‎ch03/data/toy/02.txt‎

Lines changed: 1 addition & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1 @@
1+
Imaging databases provide storage capabilities.

‎ch03/data/toy/03.txt‎

Lines changed: 1 addition & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1 @@
1+
Most imaging databases safe images permanently.

‎ch03/data/toy/04.txt‎

Lines changed: 1 addition & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1 @@
1+
Imaging databases store data.

‎ch03/data/toy/05.txt‎

Lines changed: 1 addition & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1 @@
1+
Imaging databases store data. Imaging databases store data. Imaging databases store data.

‎ch03/plot_kmeans_example.py‎

Lines changed: 6 additions & 4 deletions
Original file line numberDiff line numberDiff line change
@@ -15,6 +15,8 @@
1515
from matplotlib import pylab
1616
from sklearn.cluster import KMeans
1717

18+
from utils import DATA_DIR, CHART_DIR
19+
1820
seed=2
1921
sp.random.seed(seed) # to reproduce the data later on
2022

@@ -55,7 +57,7 @@ def plot_clustering(x, y, title, mx=None, ymax=None, xmin=None, km=None):
5557

5658
i=1
5759
plot_clustering(x, y, "Vectors")
58-
pylab.savefig(os.path.join("..", "1400_03_0%i.png"%i))
60+
pylab.savefig(os.path.join(CHART_DIR, "1400_03_0%i.png"%i))
5961
pylab.clf()
6062

6163
i+=1
@@ -80,7 +82,7 @@ def plot_clustering(x, y, title, mx=None, ymax=None, xmin=None, km=None):
8082
c1a, c1b, c1c=km.cluster_centers_
8183
pylab.scatter(km.cluster_centers_[:, 0], km.cluster_centers_[:, 1],
8284
marker='x', linewidth=2, s=100, color='black')
83-
pylab.savefig(os.path.join("..", "1400_03_0%i.png"%i))
85+
pylab.savefig(os.path.join(CHART_DIR, "1400_03_0%i.png"%i))
8486
pylab.clf()
8587

8688
i+=1
@@ -110,7 +112,7 @@ def plot_clustering(x, y, title, mx=None, ymax=None, xmin=None, km=None):
110112
pylab.gca().add_patch(
111113
pylab.Arrow(c1c[0], c1c[1], c2c[0] -c1c[0], c2c[1] -c1c[1], width=0.1))
112114

113-
pylab.savefig(os.path.join("..", "1400_03_0%i.png"%i))
115+
pylab.savefig(os.path.join(CHART_DIR, "1400_03_0%i.png"%i))
114116
pylab.clf()
115117

116118
i+=1
@@ -131,7 +133,7 @@ def plot_clustering(x, y, title, mx=None, ymax=None, xmin=None, km=None):
131133

132134
pylab.scatter(km.cluster_centers_[:, 0], km.cluster_centers_[:, 1],
133135
marker='x', linewidth=2, s=100, color='black')
134-
pylab.savefig(os.path.join("..", "1400_03_0%i.png"%i))
136+
pylab.savefig(os.path.join(CHART_DIR, "1400_03_0%i.png"%i))
135137
pylab.clf()
136138

137139
i+=1

‎ch03/rel_post_01.py‎

Lines changed: 4 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -12,8 +12,10 @@
1212

1313
fromsklearn.feature_extraction.textimportCountVectorizer
1414

15-
DIR=r"../data/toy"
16-
posts = [open(os.path.join(DIR, f)).read() for f in os.listdir(DIR)]
15+
from utils import DATA_DIR
16+
17+
TOY_DIR=os.path.join(DATA_DIR, "toy")
18+
posts = [open(os.path.join(TOY_DIR, f)).read() for f in os.listdir(TOY_DIR)]
1719

1820
new_post="imaging databases"
1921

‎ch03/rel_post_mlcomp_01.py‎

Lines changed: 18 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -5,22 +5,31 @@
55
#
66
# It is made available under the MIT License
77

8+
import os
9+
import sys
810
import sklearn.datasets
911
import scipy as sp
1012

13+
from utils import DATA_DIR
14+
15+
if not os.path.exists(DATA_DIR):
16+
print("""\
17+
It seems that you have not yet downloaded the MLCOMP data set.
18+
Please do so and place it into %s."""%DATA_DIR)
19+
sys.exit(1)
20+
1121
new_post= \
1222
"""Disk drive problems. Hi, I have a problem with my hard disk.
1323
After 1 year it is working only sporadically now.
1424
I tried to format it, but now it doesn't boot any more.
1525
Any ideas? Thanks.
1626
"""
1727

18-
MLCOMP_DIR=r"P:\Dropbox\pymlbook\data"
1928
groups= [
2029
'comp.graphics', 'comp.os.ms-windows.misc', 'comp.sys.ibm.pc.hardware',
2130
'comp.sys.mac.hardware', 'comp.windows.x', 'sci.space']
2231
dataset=sklearn.datasets.load_mlcomp("20news-18828", "train",
23-
mlcomp_root=MLCOMP_DIR,
32+
mlcomp_root=DATA_DIR,
2433
categories=groups)
2534
print("Number of posts:", len(dataset.filenames))
2635

@@ -82,6 +91,13 @@ def build_analyzer(self):
8291
show_at_2=similar[len(similar) /2]
8392
show_at_3=similar[-1]
8493

94+
print("=== #1 ===")
8595
print(show_at_1)
96+
print()
97+
98+
print("=== #2 ===")
8699
print(show_at_2)
100+
print()
101+
102+
print("=== #3 ===")
87103
print(show_at_3)

‎ch03/utils.py‎

Lines changed: 17 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,17 @@
1+
# This code is supporting material for the book
2+
# Building Machine Learning Systems with Python
3+
# by Willi Richert and Luis Pedro Coelho
4+
# published by PACKT Publishing
5+
#
6+
# It is made available under the MIT License
7+
8+
import os
9+
import sys
10+
11+
DATA_DIR=os.path.join(
12+
os.path.dirname(os.path.realpath(__file__)), "data")
13+
14+
if not os.path.exists(DATA_DIR):
15+
print("Uh, we were expecting a data directory, which contains the toy data")
16+
sys.exit(1)
17+

0 commit comments

Comments
(0)