
Commit bdfd947

K-Means and KNN
1 parent 21189a3 commit bdfd947


4 files changed: +258 -1 lines changed


README.md

Lines changed: 2 additions & 0 deletions
@@ -17,6 +17,8 @@ The included programs are:
 5. Support Vector Machine Classification**
 6. Neural Network Classification**
 7. Neural Network Regression**
+8. K-Means Clustering*
+9. K-Nearest-Neighbor*

 ## Requirements
 1. Python 3.5

k_nearest_neighbor.py

Lines changed: 122 additions & 0 deletions
@@ -0,0 +1,122 @@
import numpy as np
from sklearn import datasets
from sklearn import decomposition
import random


def normalize_data(data):
    num_elements = len(data)
    total = [0] * data.shape[1]
    for sample in data:
        total = total + sample
    mean_features = np.divide(total, num_elements)

    total = [0] * data.shape[1]
    for sample in data:
        total = total + np.square(sample - mean_features)

    # The standard deviation is the square root of the mean squared deviation
    std_features = np.sqrt(np.divide(total, num_elements))

    for index, sample in enumerate(data):
        data[index] = np.divide((sample - mean_features), std_features)

    return data


# Calculate the Euclidean distance between two vectors
def euclidean_distance(vec_1, vec_2):
    distance = 0
    for i in range(len(vec_1)):
        distance += pow((vec_1[i] - vec_2[i]), 2)

    return np.sqrt(distance)


# Split the data into train and test sets
def train_test_split(X, y, test_size=0.2):
    # Randomly shuffle the data
    combined = list(zip(X, y))
    random.shuffle(combined)
    X[:], y[:] = zip(*combined)

    # Split the training data from the test data in the ratio specified by test_size
    split_i = len(y) - int(len(y) // (1 / test_size))
    x_train, x_test = X[:split_i], X[split_i:]
    y_train, y_test = y[:split_i], y[split_i:]

    return x_train, x_test, y_train, y_test


class KNN():
    def __init__(self, k=5):
        self.k = k

    # Do a majority vote among the neighbors
    def _majority_vote(self, neighbors, classes):
        max_count = 0
        most_common = None
        # Count class occurrences among the neighbors
        for c in np.unique(classes):
            # Count the number of neighbors with class c
            count = len(neighbors[neighbors[:, 1] == c])
            if count > max_count:
                max_count = count
                most_common = c
        return most_common

    def predict(self, X_test, X_train, y_train):
        classes = np.unique(y_train)
        y_pred = []
        # Determine the class of each test sample
        for test_sample in X_test:
            neighbors = []

            # Calculate the distance from each observed sample to the sample we wish to predict
            for j, observed_sample in enumerate(X_train):
                distance = euclidean_distance(test_sample, observed_sample)
                label = y_train[j]

                # Add neighbor information
                neighbors.append([distance, label])
            neighbors = np.array(neighbors)

            # Sort the observed samples from lowest to highest distance and select the first k
            k_nearest_neighbors = neighbors[neighbors[:, 0].argsort()][:self.k]

            # Do a majority vote among the k neighbors and set the prediction
            # to the class receiving the most votes
            label = self._majority_vote(k_nearest_neighbors, classes)
            y_pred.append(label)
        return np.array(y_pred)


# Get the training data
# Import the Iris flower dataset
iris = datasets.load_iris()
train_data = np.array(iris.data)
train_labels = np.array(iris.target)
num_features = train_data.shape[1]

# Normalize the training data
train_data = normalize_data(train_data)

# Apply PCA to the data to reduce its dimensionality
pca = decomposition.PCA(n_components=2)
pca.fit(train_data)
train_data = pca.transform(train_data)


X_train, X_test, y_train, y_test = train_test_split(train_data, train_labels, test_size=0.5)

clf = KNN(k=3)
predicted_labels = clf.predict(X_test, X_train, y_train)

# Compute the test accuracy
accuracy = 0
for index in range(len(y_test)):
    current_label = y_test[index]
    predicted_label = predicted_labels[index]

    if current_label == predicted_label:
        accuracy += 1

accuracy /= len(y_test)

# Print the classification accuracy
print("Classification Accuracy = ", accuracy)

kmeans.py

Lines changed: 125 additions & 0 deletions
@@ -0,0 +1,125 @@
import numpy as np
from sklearn import datasets
from sklearn import decomposition
from sklearn import cluster


# Calculate the Euclidean distance between two vectors
def euclidean_distance(vec_1, vec_2):
    distance = 0
    for i in range(len(vec_1)):
        distance += pow((vec_1[i] - vec_2[i]), 2)

    return np.sqrt(distance)


class KMeans():
    def __init__(self, k=2, max_iterations=500):
        self.k = k
        self.max_iterations = max_iterations

    # Initialize the centroids as k randomly chosen samples
    def _init_random_centroids(self, X):
        n_samples, n_features = np.shape(X)
        centroids = np.zeros((self.k, n_features))
        for i in range(self.k):
            centroid = X[np.random.choice(range(n_samples))]
            centroids[i] = centroid
        return centroids

    # Return the index of the closest centroid to the sample
    def _closest_centroid(self, sample, centroids):
        closest_i = None
        closest_distance = float("inf")
        for i, centroid in enumerate(centroids):
            distance = euclidean_distance(sample, centroid)
            if distance < closest_distance:
                closest_i = i
                closest_distance = distance
        return closest_i

    # Assign each sample to its closest centroid to create the clusters
    def _create_clusters(self, centroids, X):
        clusters = [[] for _ in range(self.k)]
        for sample_i, sample in enumerate(X):
            centroid_i = self._closest_centroid(sample, centroids)
            clusters[centroid_i].append(sample_i)
        return clusters

    # Calculate new centroids as the means of the samples in each cluster
    def _calculate_centroids(self, clusters, X):
        n_features = np.shape(X)[1]
        centroids = np.zeros((self.k, n_features))
        for i, members in enumerate(clusters):
            centroid = np.mean(X[members], axis=0)
            centroids[i] = centroid
        return centroids

    # Label each sample with the index of its cluster
    def _get_cluster_labels(self, clusters, X):
        # One prediction for each sample
        y_pred = np.zeros(np.shape(X)[0])
        for cluster_i, members in enumerate(clusters):
            for sample_i in members:
                y_pred[sample_i] = cluster_i
        return y_pred

    # Do K-Means clustering and return the cluster indices
    def predict(self, X):
        # Initialize the centroids
        centroids = self._init_random_centroids(X)

        # Iterate until convergence or until max_iterations is reached
        for _ in range(self.max_iterations):
            # Assign samples to the closest centroids (create clusters)
            clusters = self._create_clusters(centroids, X)

            prev_centroids = centroids
            # Calculate new centroids from the clusters
            centroids = self._calculate_centroids(clusters, X)

            # If no centroid has changed => convergence
            diff = centroids - prev_centroids
            if not diff.any():
                break

        return self._get_cluster_labels(clusters, X)


# Get the training data
# Import the Iris flower dataset
iris = datasets.load_iris()
train_data = np.array(iris.data)
train_labels = np.array(iris.target)
num_features = train_data.shape[1]

# Apply PCA to the data to reduce its dimensionality
pca = decomposition.PCA(n_components=2)
pca.fit(train_data)
train_data = pca.transform(train_data)

# *********************************************
# Apply K-Means Clustering MANUALLY
# *********************************************
# Create the K-Means clustering object
unique_labels = np.unique(train_labels)
num_classes = len(unique_labels)
clf = KMeans(k=num_classes, max_iterations=3000)

# Cluster the data using K-Means
predicted_labels = clf.predict(train_data)


# Compute the training accuracy
accuracy = 0
for index in range(len(train_labels)):
    current_label = train_labels[index]
    predicted_label = predicted_labels[index]

    if current_label == predicted_label:
        accuracy += 1

accuracy /= len(train_labels)

# Print the classification accuracy
print("Classification Accuracy = ", accuracy)

pca_logistic_regression.py

Lines changed: 9 additions & 1 deletion
@@ -28,13 +28,21 @@ def normalize_data(data):
 def sigmoid(val):
     return np.divide(1, (1 + np.exp(-1 * val)))

-def pca(data, exp_var_percentage=95):
+def compute_cov_mat(data):
     # Compute the mean of the data
     mean_vec = np.mean(data, axis=0)

     # Compute the covariance matrix
     cov_mat = (data - mean_vec).T.dot((data - mean_vec)) / (data.shape[0] - 1)

+    return cov_mat
+
+
+def pca(data, exp_var_percentage=95):
+
+    # Compute the covariance matrix
+    cov_mat = compute_cov_mat(data)
+
     # Compute the eigenvalues and eigenvectors of the covariance matrix
     eig_vals, eig_vecs = np.linalg.eig(cov_mat)
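
Since the covariance computation now lives in its own helper, it can be spot-checked against NumPy's built-in routine. A minimal sketch (not part of the commit), assuming compute_cov_mat is in scope and data is a samples-by-features array:

import numpy as np

# Hypothetical spot-check: both use the unbiased (n - 1) normalization,
# and np.cov treats columns as variables when rowvar=False.
data = np.random.rand(10, 4)
assert np.allclose(compute_cov_mat(data), np.cov(data, rowvar=False))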
