import numpy as np
from sklearn import datasets
from sklearn import decomposition
from sklearn import cluster

# Calculate the distance between two vectors
def euclidean_distance(vec_1, vec_2):
    distance = 0
    for i in range(len(vec_1)):
        distance += pow((vec_1[i] - vec_2[i]), 2)

    return np.sqrt(distance)
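
# This hand-rolled distance matches NumPy's built-in Euclidean norm. A minimal,
# optional sanity check (illustrative only, not part of the original script):
assert np.isclose(euclidean_distance([0.0, 3.0], [4.0, 0.0]),
                  np.linalg.norm(np.array([0.0, 3.0]) - np.array([4.0, 0.0])))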

class KMeans():
    def __init__(self, k=2, max_iterations=500):
        self.k = k
        self.max_iterations = max_iterations

    # Initialize the centroids as k randomly chosen samples from X
    def _init_random_centroids(self, X):
        n_samples, n_features = np.shape(X)
        centroids = np.zeros((self.k, n_features))
        for i in range(self.k):
            centroid = X[np.random.choice(range(n_samples))]
            centroids[i] = centroid
        return centroids

    # Return the index of the closest centroid to the sample
    def _closest_centroid(self, sample, centroids):
        closest_i = None
        closest_distance = float("inf")
        for i, centroid in enumerate(centroids):
            distance = euclidean_distance(sample, centroid)
            if distance < closest_distance:
                closest_i = i
                closest_distance = distance
        return closest_i

    # Assign the samples to the closest centroids to create clusters
    def _create_clusters(self, centroids, X):
        clusters = [[] for _ in range(self.k)]
        for sample_i, sample in enumerate(X):
            centroid_i = self._closest_centroid(sample, centroids)
            clusters[centroid_i].append(sample_i)
        return clusters

    # Calculate new centroids as the means of the samples in each cluster
    # (assumes every cluster is non-empty; an empty cluster would yield a NaN centroid)
    def _calculate_centroids(self, clusters, X):
        n_features = np.shape(X)[1]
        centroids = np.zeros((self.k, n_features))
        for i, cluster_samples in enumerate(clusters):
            centroid = np.mean(X[cluster_samples], axis=0)
            centroids[i] = centroid
        return centroids

    # Classify samples as the index of their clusters
    def _get_cluster_labels(self, clusters, X):
        # One prediction for each sample
        y_pred = np.zeros(np.shape(X)[0])
        for cluster_i, cluster_samples in enumerate(clusters):
            for sample_i in cluster_samples:
                y_pred[sample_i] = cluster_i
        return y_pred

    # Do K-Means clustering and return cluster indices
    def predict(self, X):
        # Initialize centroids
        centroids = self._init_random_centroids(X)

        # Iterate until convergence or for max iterations
        for _ in range(self.max_iterations):
            # Assign samples to closest centroids (create clusters)
            clusters = self._create_clusters(centroids, X)

            prev_centroids = centroids
            # Calculate new centroids from the clusters
            centroids = self._calculate_centroids(clusters, X)

            # If no centroids have changed => convergence
            diff = centroids - prev_centroids
            if not diff.any():
                break

        return self._get_cluster_labels(clusters, X)


# Get the training data
# Import the Iris flower dataset
iris = datasets.load_iris()
train_data = np.array(iris.data)
train_labels = np.array(iris.target)
num_features = train_data.shape[1]

# Apply PCA to the data to reduce its dimensionality
pca = decomposition.PCA(n_components=2)
pca.fit(train_data)
train_data = pca.transform(train_data)
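
# Optional: report how much of the original variance the two principal components
# retain, using scikit-learn's explained_variance_ratio_ attribute
# (an illustrative addition, not part of the original script).
print("PCA explained variance ratio:", pca.explained_variance_ratio_)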
100+
101+ # *********************************************
102+ # Apply K-Means Clustering MANUALLY
103+ # *********************************************
104+ # Create the K-Means Clustering Object
105+ unique_labels = np .unique (train_labels )
106+ num_classes = len (unique_labels )
107+ clf = KMeans (k = num_classes , max_iterations = 3000 )
108+
109+ predicted_labels = clf .predict (train_data )
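
# For comparison, the sklearn.cluster module imported above ships its own KMeans.
# A minimal sketch of the library version (an illustrative addition, not part of
# the original script); its cluster indices are likewise assigned arbitrarily:
sk_kmeans = cluster.KMeans(n_clusters=num_classes, n_init=10)
sk_predicted_labels = sk_kmeans.fit_predict(train_data)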


# Compute the training accuracy by directly comparing cluster indices to the
# true labels (note: k-means assigns cluster indices arbitrarily, so this naive
# comparison is only meaningful when the indices happen to line up with the labels)
accuracy = 0
for index in range(len(train_labels)):
    current_label = train_labels[index]
    predicted_label = predicted_labels[index]

    if current_label == predicted_label:
        accuracy += 1

accuracy /= len(train_labels)

# Print the results
print("Classification Accuracy = ", accuracy)
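
# Because the cluster indices above are arbitrary, a fairer score maps each cluster
# to the majority true label it contains before comparing. A minimal sketch
# (an illustrative addition, not part of the original script):
mapped_labels = np.zeros_like(predicted_labels)
for cluster_index in np.unique(predicted_labels):
    members = predicted_labels == cluster_index
    mapped_labels[members] = np.bincount(train_labels[members]).argmax()

mapped_accuracy = np.mean(mapped_labels == train_labels)
print("Accuracy after majority-label mapping = ", mapped_accuracy)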