
Commit bdfd947

K-Means and KNN
1 parent 21189a3 commit bdfd947


4 files changed: +258 -1 lines changed


README.md

Lines changed: 2 additions & 0 deletions
@@ -17,6 +17,8 @@ The included programs are:
 5. Support Vector Machine Classification**
 6. Neural Network Classification**
 7. Neural Network Regression**
+8. K-Means Clustering*
+9. K-Nearest-Neighbor*

 ## Requirements
 1. Python 3.5

k_nearest_neighbor.py

Lines changed: 122 additions & 0 deletions
@@ -0,0 +1,122 @@
import numpy as np
from sklearn import datasets
from sklearn import decomposition
import random


def normalize_data(data):
    num_elements = len(data)
    total = [0] * data.shape[1]
    for sample in data:
        total = total + sample
    mean_features = np.divide(total, num_elements)

    total = [0] * data.shape[1]
    for sample in data:
        total = total + np.square(sample - mean_features)

    # The standard deviation is the square root of the mean squared deviation
    std_features = np.sqrt(np.divide(total, num_elements))

    for index, sample in enumerate(data):
        data[index] = np.divide((sample - mean_features), std_features)

    return data


# Calculate the Euclidean distance between two vectors
def euclidean_distance(vec_1, vec_2):
    distance = 0
    for i in range(len(vec_1)):
        distance += pow((vec_1[i] - vec_2[i]), 2)

    return np.sqrt(distance)


# Split the data into train and test sets
def train_test_split(X, y, test_size=0.2):
    # Randomly shuffle the data
    combined = list(zip(X, y))
    random.shuffle(combined)
    X[:], y[:] = zip(*combined)

    # Split the training data from the test data in the ratio specified by test_size
    split_i = len(y) - int(len(y) // (1 / test_size))
    x_train, x_test = X[:split_i], X[split_i:]
    y_train, y_test = y[:split_i], y[split_i:]

    return x_train, x_test, y_train, y_test


class KNN():
    def __init__(self, k=5):
        self.k = k

    # Do a majority vote among the neighbors
    def _majority_vote(self, neighbors, classes):
        max_count = 0
        most_common = None
        # Count class occurrences among the neighbors
        for c in np.unique(classes):
            # Count the number of neighbors with class c
            count = len(neighbors[neighbors[:, 1] == c])
            if count > max_count:
                max_count = count
                most_common = c
        return most_common

    def predict(self, X_test, X_train, y_train):
        classes = np.unique(y_train)
        y_pred = []
        # Determine the class of each test sample
        for test_sample in X_test:
            neighbors = []

            # Calculate the distance from each observed sample to the sample we wish to predict
            for j, observed_sample in enumerate(X_train):
                distance = euclidean_distance(test_sample, observed_sample)
                label = y_train[j]

                # Add neighbor information
                neighbors.append([distance, label])
            neighbors = np.array(neighbors)

            # Sort the observed samples from lowest to highest distance and select the first k
            k_nearest_neighbors = neighbors[neighbors[:, 0].argsort()][:self.k]

            # Do a majority vote among the k neighbors and set the prediction
            # to the class receiving the most votes
            label = self._majority_vote(k_nearest_neighbors, classes)
            y_pred.append(label)
        return np.array(y_pred)


# Get the training data
# Import the Iris flower dataset
iris = datasets.load_iris()
train_data = np.array(iris.data)
train_labels = np.array(iris.target)
num_features = train_data.shape[1]

# Normalize the training data
train_data = normalize_data(train_data)

# Apply PCA to the data to reduce its dimensionality
pca = decomposition.PCA(n_components=2)
pca.fit(train_data)
train_data = pca.transform(train_data)


X_train, X_test, y_train, y_test = train_test_split(train_data, train_labels, test_size=0.5)

clf = KNN(k=3)
predicted_labels = clf.predict(X_test, X_train, y_train)

# Compute the test accuracy
accuracy = 0
for index in range(len(y_test)):
    current_label = y_test[index]
    predicted_label = predicted_labels[index]

    if current_label == predicted_label:
        accuracy += 1

accuracy /= len(y_test)

# Print the classification accuracy
print("Classification Accuracy = ", accuracy)

kmeans.py

Lines changed: 125 additions & 0 deletions
@@ -0,0 +1,125 @@
import numpy as np
from sklearn import datasets
from sklearn import decomposition
from sklearn import cluster


# Calculate the Euclidean distance between two vectors
def euclidean_distance(vec_1, vec_2):
    distance = 0
    for i in range(len(vec_1)):
        distance += pow((vec_1[i] - vec_2[i]), 2)

    return np.sqrt(distance)


class KMeans():
    def __init__(self, k=2, max_iterations=500):
        self.k = k
        self.max_iterations = max_iterations

    # Initialize the centroids as k randomly chosen samples
    def _init_random_centroids(self, X):
        n_samples, n_features = np.shape(X)
        centroids = np.zeros((self.k, n_features))
        for i in range(self.k):
            centroid = X[np.random.choice(range(n_samples))]
            centroids[i] = centroid
        return centroids

    # Return the index of the closest centroid to the sample
    def _closest_centroid(self, sample, centroids):
        closest_i = None
        closest_distance = float("inf")
        for i, centroid in enumerate(centroids):
            distance = euclidean_distance(sample, centroid)
            if distance < closest_distance:
                closest_i = i
                closest_distance = distance
        return closest_i

    # Assign each sample to its closest centroid to create the clusters
    def _create_clusters(self, centroids, X):
        clusters = [[] for _ in range(self.k)]
        for sample_i, sample in enumerate(X):
            centroid_i = self._closest_centroid(sample, centroids)
            clusters[centroid_i].append(sample_i)
        return clusters

    # Calculate new centroids as the means of the samples in each cluster
    def _calculate_centroids(self, clusters, X):
        n_features = np.shape(X)[1]
        centroids = np.zeros((self.k, n_features))
        for i, members in enumerate(clusters):
            centroid = np.mean(X[members], axis=0)
            centroids[i] = centroid
        return centroids

    # Label each sample with the index of its cluster
    def _get_cluster_labels(self, clusters, X):
        # One prediction for each sample
        y_pred = np.zeros(np.shape(X)[0])
        for cluster_i, members in enumerate(clusters):
            for sample_i in members:
                y_pred[sample_i] = cluster_i
        return y_pred

    # Do K-Means clustering and return the cluster indices
    def predict(self, X):
        # Initialize the centroids
        centroids = self._init_random_centroids(X)

        # Iterate until convergence or until max_iterations is reached
        for _ in range(self.max_iterations):
            # Assign samples to the closest centroids (create clusters)
            clusters = self._create_clusters(centroids, X)

            prev_centroids = centroids
            # Calculate new centroids from the clusters
            centroids = self._calculate_centroids(clusters, X)

            # If no centroid has changed => convergence
            diff = centroids - prev_centroids
            if not diff.any():
                break

        return self._get_cluster_labels(clusters, X)


# Get the training data
# Import the Iris flower dataset
iris = datasets.load_iris()
train_data = np.array(iris.data)
train_labels = np.array(iris.target)
num_features = train_data.shape[1]

# Apply PCA to the data to reduce its dimensionality
pca = decomposition.PCA(n_components=2)
pca.fit(train_data)
train_data = pca.transform(train_data)

# *********************************************
# Apply K-Means Clustering MANUALLY
# *********************************************
# Create the K-Means clustering object
unique_labels = np.unique(train_labels)
num_classes = len(unique_labels)
clf = KMeans(k=num_classes, max_iterations=3000)

# Cluster the data using K-Means
predicted_labels = clf.predict(train_data)


# Compute the training accuracy
accuracy = 0
for index in range(len(train_labels)):
    current_label = train_labels[index]
    predicted_label = predicted_labels[index]

    if current_label == predicted_label:
        accuracy += 1

accuracy /= len(train_labels)

# Print the classification accuracy
print("Classification Accuracy = ", accuracy)

pca_logistic_regression.py

Lines changed: 9 additions & 1 deletion
@@ -28,13 +28,21 @@ def normalize_data(data):
 def sigmoid(val):
     return np.divide(1, (1 + np.exp(-1 * val)))

-def pca(data, exp_var_percentage=95):
+def compute_cov_mat(data):
     # Compute the mean of the data
     mean_vec = np.mean(data, axis=0)

     # Compute the covariance matrix
     cov_mat = (data - mean_vec).T.dot((data - mean_vec)) / (data.shape[0] - 1)

+    return cov_mat
+
+
+def pca(data, exp_var_percentage=95):
+
+    # Compute the covariance matrix
+    cov_mat = compute_cov_mat(data)
+
     # Compute the eigenvalues and eigenvectors of the covariance matrix
     eig_vals, eig_vecs = np.linalg.eig(cov_mat)
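
Since the covariance computation now lives in its own helper, it can be spot-checked against NumPy's built-in routine. A minimal sketch (not part of the commit), assuming compute_cov_mat is in scope and data is a samples-by-features array:

import numpy as np

# Hypothetical spot-check: both use the unbiased (n - 1) normalization,
# and np.cov treats columns as variables when rowvar=False.
data = np.random.rand(10, 4)
assert np.allclose(compute_cov_mat(data), np.cov(data, rowvar=False))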
