diff --git a/.gitignore b/.gitignore new file mode 100644 index 0000000..485dee6 --- /dev/null +++ b/.gitignore @@ -0,0 +1 @@ +.idea diff --git a/LinearRegression/LinearRegression.py b/LinearRegression/LinearRegression.py index 3c1ae46..c506670 100644 --- a/LinearRegression/LinearRegression.py +++ b/LinearRegression/LinearRegression.py @@ -1,115 +1,121 @@ -#-*- coding: utf-8 -*- +# -*- coding: utf-8 -*- import numpy as np from matplotlib import pyplot as plt from matplotlib.font_manager import FontProperties -font = FontProperties(fname=r"c:\windows\fonts\simsun.ttc", size=14) # 解决windows环境下画图汉字乱码问题 - - -def linearRegression(alpha=0.01,num_iters=400): - print u"加载数据...\n" - - data = loadtxtAndcsv_data("data.txt",",",np.float64) #读取数据 - X = data[:,0:-1] # X对应0到倒数第2列 - y = data[:,-1] # y对应最后一列 - m = len(y) # 总的数据条数 - col = data.shape[1] # data的列数 - - X,mu,sigma = featureNormaliza(X) # 归一化 - plot_X1_X2(X) # 画图看一下归一化效果 - - X = np.hstack((np.ones((m,1)),X)) # 在X前加一列1 - - print u"\n执行梯度下降算法....\n" - - theta = np.zeros((col,1)) - y = y.reshape(-1,1) #将行向量转化为列 - theta,J_history = gradientDescent(X, y, theta, alpha, num_iters) - + +font = FontProperties(fname=r"c:\windows\fonts\simsun.ttc", size=14) # 解决windows环境下画图汉字乱码问题 + + +def linearRegression(alpha=0.01, num_iters=400): + print(u"加载数据...") + + data = loadtxtAndcsv_data("data.txt", ",", np.float64) # 读取数据 + X = data[:, 0:-1] # X对应0到倒数第2列 + y = data[:, -1] # y对应最后一列 + m = len(y) # 总的数据条数 + col = data.shape[1] # data的列数 + + X, mu, sigma = featureNormaliza(X) # 归一化 + plot_X1_X2(X) # 画图看一下归一化效果 + + X = np.hstack((np.ones((m, 1)), X)) # 在X前加一列1 + + print(u"\n执行梯度下降算法....") + + theta = np.zeros((col, 1)) + y = y.reshape(-1, 1) # 将行向量转化为列 + theta, J_history = gradientDescent(X, y, theta, alpha, num_iters) + plotJ(J_history, num_iters) - - return mu,sigma,theta #返回均值mu,标准差sigma,和学习的结果theta - - + + return mu, sigma, theta # 返回均值mu,标准差sigma,和学习的结果theta + + # 加载txt和csv文件 -def loadtxtAndcsv_data(fileName,split,dataType): - return np.loadtxt(fileName,delimiter=split,dtype=dataType) +def loadtxtAndcsv_data(fileName, split, dataType): + return np.loadtxt(fileName, delimiter=split, dtype=dataType) + # 加载npy文件 def loadnpy_data(fileName): return np.load(fileName) + # 归一化feature def featureNormaliza(X): - X_norm = np.array(X) #将X转化为numpy数组对象,才可以进行矩阵的运算 - #定义所需变量 - mu = np.zeros((1,X.shape[1])) - sigma = np.zeros((1,X.shape[1])) - - mu = np.mean(X_norm,0) # 求每一列的平均值(0指定为列,1代表行) - sigma = np.std(X_norm,0) # 求每一列的标准差 - for i in range(X.shape[1]): # 遍历列 - X_norm[:,i] = (X_norm[:,i]-mu[i])/sigma[i] # 归一化 - - return X_norm,mu,sigma + X_norm = np.array(X) # 将X转化为numpy数组对象,才可以进行矩阵的运算 + # 定义所需变量 + mu = np.zeros((1, X.shape[1])) + sigma = np.zeros((1, X.shape[1])) + + mu = np.mean(X_norm, 0) # 求每一列的平均值(0指定为列,1代表行) + sigma = np.std(X_norm, 0) # 求每一列的标准差 + for i in range(X.shape[1]): # 遍历列 + X_norm[:, i] = (X_norm[:, i] - mu[i]) / sigma[i] # 归一化 + + return X_norm, mu, sigma + # 画二维图 def plot_X1_X2(X): - plt.scatter(X[:,0],X[:,1]) + plt.scatter(X[:, 0], X[:, 1]) plt.show() # 梯度下降算法 -def gradientDescent(X,y,theta,alpha,num_iters): - m = len(y) +def gradientDescent(X, y, theta, alpha, num_iters): + m = len(y) n = len(theta) - - temp = np.matrix(np.zeros((n,num_iters))) # 暂存每次迭代计算的theta,转化为矩阵形式 - - - J_history = np.zeros((num_iters,1)) #记录每次迭代计算的代价值 - + + temp = np.matrix(np.zeros((n, num_iters))) # 暂存每次迭代计算的theta,转化为矩阵形式 + + J_history = np.zeros((num_iters, 1)) # 记录每次迭代计算的代价值 + for i in range(num_iters): # 遍历迭代次数 - h = np.dot(X,theta) # 计算内积,matrix可以直接乘 - temp[:,i] = theta - ((alpha/m)*(np.dot(np.transpose(X),h-y))) #梯度的计算 - theta = temp[:,i] - J_history[i] = computerCost(X,y,theta) #调用计算代价函数 - print '.', - return theta,J_history + h = np.dot(X, theta) # 计算内积,matrix可以直接乘 + temp[:, i] = theta - ((alpha / m) * (np.dot(np.transpose(X), h - y))) # 梯度的计算 + theta = temp[:, i] + J_history[i] = computerCost(X, y, theta) # 调用计算代价函数 + print('.', end='') + return theta, J_history + # 计算代价函数 -def computerCost(X,y,theta): +def computerCost(X, y, theta): m = len(y) - J = 0 - - J = (np.transpose(X*theta-y))*(X*theta-y)/(2*m) #计算代价J + J = (np.transpose(X * theta - y)) * (X * theta - y) / (2 * m) # 计算代价J return J + # 画每次迭代代价的变化图 -def plotJ(J_history,num_iters): - x = np.arange(1,num_iters+1) - plt.plot(x,J_history) - plt.xlabel(u"迭代次数",fontproperties=font) # 注意指定字体,要不然出现乱码问题 - plt.ylabel(u"代价值",fontproperties=font) - plt.title(u"代价随迭代次数的变化",fontproperties=font) +def plotJ(J_history, num_iters): + x = np.arange(1, num_iters + 1) + plt.plot(x, J_history) + plt.xlabel(u"迭代次数", fontproperties=font) # 注意指定字体,要不然出现乱码问题 + plt.ylabel(u"代价值", fontproperties=font) + plt.title(u"代价随迭代次数的变化", fontproperties=font) plt.show() + # 测试linearRegression函数 def testLinearRegression(): - mu,sigma,theta = linearRegression(0.01,400) - print u"\n计算的theta值为:\n",theta - print u"\n预测结果为:%f"%predict(mu, sigma, theta) - + mu, sigma, theta = linearRegression(0.01, 400) + print(u"\n计算的theta值为:") + print(theta) + print(u"\n预测结果为:%f" % predict(mu, sigma, theta)) + + # 测试学习效果(预测) -def predict(mu,sigma,theta): +def predict(mu, sigma, theta): result = 0 # 注意归一化 - predict = np.array([1650,3]) - norm_predict = (predict-mu)/sigma - final_predict = np.hstack((np.ones((1)),norm_predict)) - - result = np.dot(final_predict,theta) # 预测结果 + predict = np.array([1650, 3]) + norm_predict = (predict - mu) / sigma + final_predict = np.hstack((np.ones((1)), norm_predict)) + + result = np.dot(final_predict, theta) # 预测结果 return result - - + + if __name__ == "__main__": - testLinearRegression() \ No newline at end of file + testLinearRegression() diff --git a/LinearRegression/LinearRegression_scikit-learn.py b/LinearRegression/LinearRegression_scikit-learn.py index 172a7af..02bce38 100644 --- a/LinearRegression/LinearRegression_scikit-learn.py +++ b/LinearRegression/LinearRegression_scikit-learn.py @@ -1,41 +1,41 @@ -#-*- coding: utf-8 -*- +# -*- coding: utf-8 -*- import numpy as np from sklearn import linear_model -from sklearn.preprocessing import StandardScaler #引入归一化的包 +from sklearn.preprocessing import StandardScaler # 引入归一化的包 + def linearRegression(): - print u"加载数据...\n" - data = loadtxtAndcsv_data("data.txt",",",np.float64) #读取数据 - X = np.array(data[:,0:-1],dtype=np.float64) # X对应0到倒数第2列 - y = np.array(data[:,-1],dtype=np.float64) # y对应最后一列 - + print(u"加载数据...") + data = loadtxtAndcsv_data("data.txt", ",", np.float64) # 读取数据 + X = np.array(data[:, 0:-1], dtype=np.float64) # X对应0到倒数第2列 + y = np.array(data[:, -1], dtype=np.float64) # y对应最后一列 + # 归一化操作 - scaler = StandardScaler() + scaler = StandardScaler() scaler.fit(X) x_train = scaler.transform(X) - x_test = scaler.transform(np.array([1650,3])) - + x_test = scaler.transform(np.array([[1650, 3]])) + # 线性模型拟合 model = linear_model.LinearRegression() model.fit(x_train, y) - - #预测结果 + + # 预测结果 result = model.predict(x_test) - print model.coef_ # Coefficient of the features 决策函数中的特征系数 - print model.intercept_ # 又名bias偏置,若设置为False,则为0 - print result # 预测结果 + print(model.coef_) # Coefficient of the features 决策函数中的特征系数 + print(model.intercept_) # 又名bias偏置,若设置为False,则为0 + print(result) # 预测结果 # 加载txt和csv文件 -def loadtxtAndcsv_data(fileName,split,dataType): - return np.loadtxt(fileName,delimiter=split,dtype=dataType) +def loadtxtAndcsv_data(fileName, split, dataType): + return np.loadtxt(fileName, delimiter=split, dtype=dataType) + # 加载npy文件 def loadnpy_data(fileName): return np.load(fileName) - - if __name__ == "__main__": - linearRegression() \ No newline at end of file + linearRegression() diff --git a/LogisticRegression/LogisticRegression.py b/LogisticRegression/LogisticRegression.py index b203b33..5a1b3ab 100644 --- a/LogisticRegression/LogisticRegression.py +++ b/LogisticRegression/LogisticRegression.py @@ -1,141 +1,148 @@ -#-*- coding: utf-8 -*- +# -*- coding: utf-8 -*- import numpy as np import matplotlib.pyplot as plt from scipy import optimize from matplotlib.font_manager import FontProperties -font = FontProperties(fname=r"c:\windows\fonts\simsun.ttc", size=14) # 解决windows环境下画图汉字乱码问题 + +font = FontProperties(fname=r"c:\windows\fonts\simsun.ttc", size=14) # 解决windows环境下画图汉字乱码问题 def LogisticRegression(): - data = loadtxtAndcsv_data("data2.txt", ",", np.float64) - X = data[:,0:-1] - y = data[:,-1] - - plot_data(X,y) # 作图 - - X = mapFeature(X[:,0],X[:,1]) #映射为多项式 - initial_theta = np.zeros((X.shape[1],1))#初始化theta - initial_lambda = 0.1 #初始化正则化系数,一般取0.01,0.1,1..... - - J = costFunction(initial_theta,X,y,initial_lambda) #计算一下给定初始化的theta和lambda求出的代价J - - print J #输出一下计算的值,应该为0.693147 - #result = optimize.fmin(costFunction, initial_theta, args=(X,y,initial_lambda)) #直接使用最小化的方法,效果不好 + data = loadtxtAndcsv_data("data2.txt", ",", np.float64) + X = data[:, 0:-1] + y = data[:, -1] + + plot_data(X, y) # 作图 + + X = mapFeature(X[:, 0], X[:, 1]) # 映射为多项式 + initial_theta = np.zeros((X.shape[1], 1)) # 初始化theta + initial_lambda = 0.1 # 初始化正则化系数,一般取0.01,0.1,1..... + + J = costFunction(initial_theta, X, y, initial_lambda) # 计算一下给定初始化的theta和lambda求出的代价J + + print(J) # 输出一下计算的值,应该为0.693147 + # result = optimize.fmin(costFunction, initial_theta, args=(X,y,initial_lambda)) #直接使用最小化的方法,效果不好 '''调用scipy中的优化算法fmin_bfgs(拟牛顿法Broyden-Fletcher-Goldfarb-Shanno) - costFunction是自己实现的一个求代价的函数, - initial_theta表示初始化的值, - fprime指定costFunction的梯度 - args是其余测参数,以元组的形式传入,最后会将最小化costFunction的theta返回 ''' - result = optimize.fmin_bfgs(costFunction, initial_theta, fprime=gradient, args=(X,y,initial_lambda)) - p = predict(X, result) #预测 - print u'在训练集上的准确度为%f%%'%np.mean(np.float64(p==y)*100) # 与真实值比较,p==y返回True,转化为float - - X = data[:,0:-1] - y = data[:,-1] - plotDecisionBoundary(result,X,y) #画决策边界 - - + result = optimize.fmin_bfgs(costFunction, initial_theta, fprime=gradient, args=(X, y, initial_lambda)) + p = predict(X, result) # 预测 + print(u'在训练集上的准确度为%f%%' % np.mean(np.float64(p == y) * 100)) # 与真实值比较,p==y返回True,转化为float + + X = data[:, 0:-1] + y = data[:, -1] + plotDecisionBoundary(result, X, y) # 画决策边界 + # 加载txt和csv文件 -def loadtxtAndcsv_data(fileName,split,dataType): - return np.loadtxt(fileName,delimiter=split,dtype=dataType) +def loadtxtAndcsv_data(fileName, split, dataType): + return np.loadtxt(fileName, delimiter=split, dtype=dataType) + # 加载npy文件 def loadnpy_data(fileName): return np.load(fileName) + # 显示二维图形 -def plot_data(X,y): - pos = np.where(y==1) #找到y==1的坐标位置 - neg = np.where(y==0) #找到y==0的坐标位置 - #作图 - plt.figure(figsize=(15,12)) - plt.plot(X[pos,0],X[pos,1],'ro') # red o - plt.plot(X[neg,0],X[neg,1],'bo') # blue o - plt.title(u"两个类别散点图",fontproperties=font) +def plot_data(X, y): + pos = np.where(y == 1) # 找到y==1的坐标位置 + neg = np.where(y == 0) # 找到y==0的坐标位置 + # 作图 + plt.figure(figsize=(15, 12)) + plt.plot(X[pos, 0], X[pos, 1], 'ro') # red o + plt.plot(X[neg, 0], X[neg, 1], 'bo') # blue o + plt.title(u"两个类别散点图", fontproperties=font) plt.show() -# 映射为多项式 -def mapFeature(X1,X2): - degree = 2; # 映射的最高次方 - out = np.ones((X1.shape[0],1)) # 映射后的结果数组(取代X) + +# 映射为多项式 +def mapFeature(X1, X2): + degree = 2; # 映射的最高次方 + out = np.ones((X1.shape[0], 1)) # 映射后的结果数组(取代X) ''' - 这里以degree=2为例,映射为1,x1,x2,x1^2,x1,x2,x2^2 + 这里以degree=2为例,映射为1,x1,x2,x1^2,x1*x2,x2^2 ''' - for i in np.arange(1,degree+1): - for j in range(i+1): - temp = X1**(i-j)*(X2**j) #矩阵直接乘相当于matlab中的点乘.* - out = np.hstack((out, temp.reshape(-1,1))) + for i in np.arange(1, degree + 1): + for j in range(i + 1): + temp = X1 ** (i - j) * (X2 ** j) # 矩阵直接乘相当于matlab中的点乘.* + out = np.hstack((out, temp.reshape(-1, 1))) return out + # 代价函数 -def costFunction(initial_theta,X,y,inital_lambda): +def costFunction(initial_theta, X, y, inital_lambda): m = len(y) J = 0 - - h = sigmoid(np.dot(X,initial_theta)) # 计算h(z) - theta1 = initial_theta.copy() # 因为正则化j=1从1开始,不包含0,所以复制一份,前theta(0)值为0 - theta1[0] = 0 - - temp = np.dot(np.transpose(theta1),theta1) - J = (-np.dot(np.transpose(y),np.log(h))-np.dot(np.transpose(1-y),np.log(1-h))+temp*inital_lambda/2)/m # 正则化的代价方程 + + h = sigmoid(np.dot(X, initial_theta)) # 计算h(z) + theta1 = initial_theta.copy() # 因为正则化j=1从1开始,不包含0,所以复制一份,前theta(0)值为0 + theta1[0] = 0 + + temp = np.dot(np.transpose(theta1), theta1) + J = (-np.dot(np.transpose(y), np.log(h)) - np.dot(np.transpose(1 - y), np.log(1 - h)) + temp * inital_lambda / 2) / m # 正则化的代价方程 return J + # 计算梯度 -def gradient(initial_theta,X,y,inital_lambda): +def gradient(initial_theta, X, y, inital_lambda): m = len(y) grad = np.zeros((initial_theta.shape[0])) - - h = sigmoid(np.dot(X,initial_theta))# 计算h(z) + + h = sigmoid(np.dot(X, initial_theta)) # 计算h(z) theta1 = initial_theta.copy() theta1[0] = 0 - grad = np.dot(np.transpose(X),h-y)/m+inital_lambda/m*theta1 #正则化的梯度 + grad = np.dot(np.transpose(X), h - y) / m + inital_lambda / m * theta1 # 正则化的梯度 return grad -# S型函数 + +# S型函数 def sigmoid(z): - h = np.zeros((len(z),1)) # 初始化,与z的长度一置 - - h = 1.0/(1.0+np.exp(-z)) + h = np.zeros((len(z), 1)) # 初始化,与z的长度一置 + + h = 1.0 / (1.0 + np.exp(-z)) return h -#画决策边界 -def plotDecisionBoundary(theta,X,y): - pos = np.where(y==1) #找到y==1的坐标位置 - neg = np.where(y==0) #找到y==0的坐标位置 - #作图 - plt.figure(figsize=(15,12)) - plt.plot(X[pos,0],X[pos,1],'ro') # red o - plt.plot(X[neg,0],X[neg,1],'bo') # blue o - plt.title(u"决策边界",fontproperties=font) - - #u = np.linspace(30,100,100) - #v = np.linspace(30,100,100) - - u = np.linspace(-1,1.5,50) #根据具体的数据,这里需要调整 - v = np.linspace(-1,1.5,50) - - z = np.zeros((len(u),len(v))) +# 画决策边界 +def plotDecisionBoundary(theta, X, y): + pos = np.where(y == 1) # 找到y==1的坐标位置 + neg = np.where(y == 0) # 找到y==0的坐标位置 + # 作图 + plt.figure(figsize=(15, 12)) + plt.plot(X[pos, 0], X[pos, 1], 'ro') # red o + plt.plot(X[neg, 0], X[neg, 1], 'bo') # blue o + plt.title(u"决策边界", fontproperties=font) + + # u = np.linspace(30,100,100) + # v = np.linspace(30,100,100) + + u = np.linspace(-1, 1.5, 50) # 根据具体的数据,这里需要调整 + v = np.linspace(-1, 1.5, 50) + + z = np.zeros((len(u), len(v))) for i in range(len(u)): for j in range(len(v)): - z[i,j] = np.dot(mapFeature(u[i].reshape(1,-1),v[j].reshape(1,-1)),theta) # 计算对应的值,需要map - + z[i, j] = np.dot(mapFeature(u[i].reshape(1, -1), v[j].reshape(1, -1)), theta) # 计算对应的值,需要map + z = np.transpose(z) - plt.contour(u,v,z,[0,0.01],linewidth=2.0) # 画等高线,范围在[0,0.01],即近似为决策边界 - #plt.legend() + plt.contour(u, v, z, [0, 0.01], linewidth=2.0) # 画等高线,范围在[0,0.01],即近似为决策边界 + # plt.legend() plt.show() + # 预测 -def predict(X,theta): +def predict(X, theta): m = X.shape[0] - p = np.zeros((m,1)) - p = sigmoid(np.dot(X,theta)) # 预测的结果,是个概率值 - + p = np.zeros((m, 1)) + p = sigmoid(np.dot(X, theta)) # 预测的结果,是个概率值 + for i in range(m): - if p[i] > 0.5: #概率大于0.5预测为1,否则预测为0 + if p[i] > 0.5: # 概率大于0.5预测为1,否则预测为0 p[i] = 1 else: p[i] = 0 @@ -149,5 +156,3 @@ def testLogisticRegression(): if __name__ == "__main__": testLogisticRegression() - - diff --git a/LogisticRegression/LogisticRegression_OneVsAll.py b/LogisticRegression/LogisticRegression_OneVsAll.py index 9a2e950..31dd7d6 100644 --- a/LogisticRegression/LogisticRegression_OneVsAll.py +++ b/LogisticRegression/LogisticRegression_OneVsAll.py @@ -1,38 +1,41 @@ -#-*- coding: utf-8 -*- +# -*- coding: utf-8 -*- import numpy as np import matplotlib.pyplot as plt import scipy.io as spio from scipy import optimize from matplotlib.font_manager import FontProperties -font = FontProperties(fname=r"c:\windows\fonts\simsun.ttc", size=14) # 解决windows环境下画图汉字乱码问题 + +font = FontProperties(fname=r"c:\windows\fonts\simsun.ttc", size=14) # 解决windows环境下画图汉字乱码问题 def logisticRegression_OneVsAll(): - data = loadmat_data("data_digits.mat") - X = data['X'] # 获取X数据,每一行对应一个数字20x20px + data = loadmat_data("data_digits.mat") + X = data['X'] # 获取X数据,每一行对应一个数字20x20px y = data['y'] - m,n = X.shape - num_labels = 10 # 数字个数,0-9 - + m, n = X.shape + num_labels = 10 # 数字个数,0-9 + ## 随机显示几行数据 - rand_indices = [t for t in [np.random.randint(x-x, m) for x in range(100)]] # 生成100个0-m的随机数 - display_data(X[rand_indices,:]) # 显示100个数字 - - Lambda = 0.1 # 正则化系数 - #y = y.reshape(-1,1) + rand_indices = [t for t in [np.random.randint(x - x, m) for x in range(100)]] # 生成100个0-m的随机数 + display_data(X[rand_indices, :]) # 显示100个数字 + + Lambda = 0.1 # 正则化系数 + # y = y.reshape(-1,1) all_theta = oneVsAll(X, y, num_labels, Lambda) # 计算所有的theta - - p = predict_oneVsAll(all_theta,X) # 预测 + + p = predict_oneVsAll(all_theta, X) # 预测 # 将预测结果和真实结果保存到文件中 - #res = np.hstack((p,y.reshape(-1,1))) - #np.savetxt("predict.csv", res, delimiter=',') - - print u"预测准确度为:%f%%"%np.mean(np.float64(p == y.reshape(-1,1))*100) - + # res = np.hstack((p,y.reshape(-1,1))) + # np.savetxt("predict.csv", res, delimiter=',') + + print(u"预测准确度为:%f%%" % np.mean(np.float64(p == y.reshape(-1, 1)) * 100)) + + # 加载mat文件 def loadmat_data(fileName): return spio.loadmat(fileName) - + + # 显示100个数字 def display_data(imgData): sum = 0 @@ -43,92 +46,99 @@ def display_data(imgData): - 显示即可 ''' pad = 1 - display_array = -np.ones((pad+10*(20+pad),pad+10*(20+pad))) + display_array = -np.ones((pad + 10 * (20 + pad), pad + 10 * (20 + pad))) for i in range(10): for j in range(10): - display_array[pad+i*(20+pad):pad+i*(20+pad)+20,pad+j*(20+pad):pad+j*(20+pad)+20] = (imgData[sum,:].reshape(20,20,order="F")) # order=F指定以列优先,在matlab中是这样的,python中需要指定,默认以行 + display_array[pad + i * (20 + pad):pad + i * (20 + pad) + 20, pad + j * (20 + pad):pad + j * (20 + pad) + 20] = ( + imgData[sum, :].reshape(20, 20, order="F")) # order=F指定以列优先,在matlab中是这样的,python中需要指定,默认以行 sum += 1 - - plt.imshow(display_array,cmap='gray') #显示灰度图像 + + plt.imshow(display_array, cmap='gray') # 显示灰度图像 plt.axis('off') plt.show() -# 求每个分类的theta,最后返回所有的all_theta -def oneVsAll(X,y,num_labels,Lambda): + +# 求每个分类的theta,最后返回所有的all_theta +def oneVsAll(X, y, num_labels, Lambda): # 初始化变量 - m,n = X.shape - all_theta = np.zeros((n+1,num_labels)) # 每一列对应相应分类的theta,共10列 - X = np.hstack((np.ones((m,1)),X)) # X前补上一列1的偏置bias - class_y = np.zeros((m,num_labels)) # 数据的y对应0-9,需要映射为0/1的关系 - initial_theta = np.zeros((n+1,1)) # 初始化一个分类的theta - + m, n = X.shape + all_theta = np.zeros((n + 1, num_labels)) # 每一列对应相应分类的theta,共10列 + X = np.hstack((np.ones((m, 1)), X)) # X前补上一列1的偏置bias + class_y = np.zeros((m, num_labels)) # 数据的y对应0-9,需要映射为0/1的关系 + initial_theta = np.zeros((n + 1, 1)) # 初始化一个分类的theta + # 映射y for i in range(num_labels): - class_y[:,i] = np.int32(y==i).reshape(1,-1) # 注意reshape(1,-1)才可以赋值 - - #np.savetxt("class_y.csv", class_y[0:600,:], delimiter=',') - + class_y[:, i] = np.int32(y == i).reshape(1, -1) # 注意reshape(1,-1)才可以赋值 + + # np.savetxt("class_y.csv", class_y[0:600,:], delimiter=',') + '''遍历每个分类,计算对应的theta值''' for i in range(num_labels): - #optimize.fmin_cg - result = optimize.fmin_bfgs(costFunction, initial_theta, fprime=gradient, args=(X,class_y[:,i],Lambda)) # 调用梯度下降的优化方法 - all_theta[:,i] = result.reshape(1,-1) # 放入all_theta中 - - all_theta = np.transpose(all_theta) + # optimize.fmin_cg + result = optimize.fmin_bfgs(costFunction, initial_theta, fprime=gradient, + args=(X, class_y[:, i], Lambda)) # 调用梯度下降的优化方法 + all_theta[:, i] = result.reshape(1, -1) # 放入all_theta中 + + all_theta = np.transpose(all_theta) return all_theta + # 代价函数 -def costFunction(initial_theta,X,y,inital_lambda): +def costFunction(initial_theta, X, y, inital_lambda): m = len(y) J = 0 - - h = sigmoid(np.dot(X,initial_theta)) # 计算h(z) - theta1 = initial_theta.copy() # 因为正则化j=1从1开始,不包含0,所以复制一份,前theta(0)值为0 - theta1[0] = 0 - - temp = np.dot(np.transpose(theta1),theta1) - J = (-np.dot(np.transpose(y),np.log(h))-np.dot(np.transpose(1-y),np.log(1-h))+temp*inital_lambda/2)/m # 正则化的代价方程 + + h = sigmoid(np.dot(X, initial_theta)) # 计算h(z) + theta1 = initial_theta.copy() # 因为正则化j=1从1开始,不包含0,所以复制一份,前theta(0)值为0 + theta1[0] = 0 + + temp = np.dot(np.transpose(theta1), theta1) + J = (-np.dot(np.transpose(y), np.log(h)) - np.dot(np.transpose(1 - y), np.log(1 - h)) + temp * inital_lambda / 2) / m # 正则化的代价方程 return J + # 计算梯度 -def gradient(initial_theta,X,y,inital_lambda): +def gradient(initial_theta, X, y, inital_lambda): m = len(y) grad = np.zeros((initial_theta.shape[0])) - - h = sigmoid(np.dot(X,initial_theta)) # 计算h(z) + + h = sigmoid(np.dot(X, initial_theta)) # 计算h(z) theta1 = initial_theta.copy() theta1[0] = 0 - grad = np.dot(np.transpose(X),h-y)/m+inital_lambda/m*theta1 #正则化的梯度 - return grad - + grad = np.dot(np.transpose(X), h - y) / m + inital_lambda / m * theta1 # 正则化的梯度 + return grad + + # S型函数 def sigmoid(z): - h = np.zeros((len(z),1)) # 初始化,与z的长度一致 - - h = 1.0/(1.0+np.exp(-z)) + h = np.zeros((len(z), 1)) # 初始化,与z的长度一致 + + h = 1.0 / (1.0 + np.exp(-z)) return h + # 预测 -def predict_oneVsAll(all_theta,X): +def predict_oneVsAll(all_theta, X): m = X.shape[0] num_labels = all_theta.shape[0] - p = np.zeros((m,1)) - X = np.hstack((np.ones((m,1)),X)) #在X最前面加一列1 - - h = sigmoid(np.dot(X,np.transpose(all_theta))) #预测 + p = np.zeros((m, 1)) + X = np.hstack((np.ones((m, 1)), X)) # 在X最前面加一列1 + + h = sigmoid(np.dot(X, np.transpose(all_theta))) # 预测 ''' 返回h中每一行最大值所在的列号 - np.max(h, axis=1)返回h中每一行的最大值(是某个数字的最大概率) - 最后where找到的最大概率所在的列号(列号即是对应的数字) ''' - p = np.array(np.where(h[0,:] == np.max(h, axis=1)[0])) + p = np.array(np.where(h[0, :] == np.max(h, axis=1)[0])) for i in np.arange(1, m): - t = np.array(np.where(h[i,:] == np.max(h, axis=1)[i])) - p = np.vstack((p,t)) + t = np.array(np.where(h[i, :] == np.max(h, axis=1)[i])) + p = np.vstack((p, t)) return p - - + + if __name__ == "__main__": - logisticRegression_OneVsAll() \ No newline at end of file + logisticRegression_OneVsAll() diff --git a/LogisticRegression/LogisticRegression_OneVsAll_scikit-learn.py b/LogisticRegression/LogisticRegression_OneVsAll_scikit-learn.py index 5d9374b..668048f 100644 --- a/LogisticRegression/LogisticRegression_OneVsAll_scikit-learn.py +++ b/LogisticRegression/LogisticRegression_OneVsAll_scikit-learn.py @@ -1,27 +1,27 @@ -#-*- coding: utf-8 -*- +# -*- coding: utf-8 -*- from scipy import io as spio import numpy as np -from sklearn import svm from sklearn.linear_model import LogisticRegression - def logisticRegression_oneVsAll(): - data = loadmat_data("data_digits.mat") - X = data['X'] # 获取X数据,每一行对应一个数字20x20px - y = data['y'] # 这里读取mat文件y的shape=(5000, 1) - y = np.ravel(y) # 调用sklearn需要转化成一维的(5000,) - + data = loadmat_data("data_digits.mat") + X = data['X'] # 获取X数据,每一行对应一个数字20x20px + y = data['y'] # 这里读取mat文件y的shape=(5000, 1) + y = np.ravel(y) # 调用sklearn需要转化成一维的(5000,) + model = LogisticRegression() - model.fit(X, y) # 拟合 - - predict = model.predict(X) #预测 - - print u"预测准确度为:%f%%"%np.mean(np.float64(predict == y)*100) + model.fit(X, y) # 拟合 + + predict = model.predict(X) # 预测 + + print(u"预测准确度为:%f%%" % np.mean(np.float64(predict == y) * 100)) + # 加载mat文件 def loadmat_data(fileName): return spio.loadmat(fileName) + if __name__ == "__main__": - logisticRegression_oneVsAll() \ No newline at end of file + logisticRegression_oneVsAll() diff --git a/LogisticRegression/LogisticRegression_scikit-learn.py b/LogisticRegression/LogisticRegression_scikit-learn.py index 469c941..2df9043 100644 --- a/LogisticRegression/LogisticRegression_scikit-learn.py +++ b/LogisticRegression/LogisticRegression_scikit-learn.py @@ -3,41 +3,43 @@ from sklearn.cross_validation import train_test_split import numpy as np + def logisticRegression(): - data = loadtxtAndcsv_data("data1.txt", ",", np.float64) - X = data[:,0:-1] - y = data[:,-1] - - # ΪѵͲԼ - x_train,x_test,y_train,y_test = train_test_split(X,y,test_size=0.2) - - # һ + data = loadtxtAndcsv_data("data1.txt", ",", np.float64) + X = data[:, 0:-1] + y = data[:, -1] + + # 划分为训练集和测试集 + x_train, x_test, y_train, y_test = train_test_split(X, y, test_size=0.2) + + # 归一化 scaler = StandardScaler() scaler.fit(x_train) x_train = scaler.fit_transform(x_train) x_test = scaler.fit_transform(x_test) - - #߼ع + + # 逻辑回归 model = LogisticRegression() - model.fit(x_train,y_train) - - # Ԥ + model.fit(x_train, y_train) + + # 预测 predict = model.predict(x_test) right = sum(predict == y_test) - - predict = np.hstack((predict.reshape(-1,1),y_test.reshape(-1,1))) # Ԥֵʵֵһ飬ù۲ - print predict - print ('Լ׼ȷʣ%f%%'%(right*100.0/predict.shape[0])) #ڲԼϵ׼ȷ -# txtcsvļ -def loadtxtAndcsv_data(fileName,split,dataType): - return np.loadtxt(fileName,delimiter=split,dtype=dataType) + predict = np.hstack((predict.reshape(-1, 1), y_test.reshape(-1, 1))) # 将预测值和真实值放在一块,好观察 + print(predict) + print('测试集准确率:%f%%' % (right * 100.0 / predict.shape[0])) # 计算在测试集上的准确度 -# npyļ + +# 加载txt和csv文件 +def loadtxtAndcsv_data(fileName, split, dataType): + return np.loadtxt(fileName, delimiter=split, dtype=dataType) + + +# 加载npy文件 def loadnpy_data(fileName): return np.load(fileName) - if __name__ == "__main__": - logisticRegression() \ No newline at end of file + logisticRegression() diff --git a/readme.md b/readme.md index b9127d2..ff7102f 100644 --- a/readme.md +++ b/readme.md @@ -265,10 +265,10 @@ def sigmoid(z): ``` # 映射为多项式 def mapFeature(X1,X2): - degree = 3; # 映射的最高次方 + degree = 2; # 映射的最高次方 out = np.ones((X1.shape[0],1)) # 映射后的结果数组(取代X) ''' - 这里以degree=2为例,映射为1,x1,x2,x1^2,x1,x2,x2^2 + 这里以degree=2为例,映射为1,x1,x2,x1^2,x1*x2,x2^2 ''' for i in np.arange(1,degree+1): for j in range(i+1):