""" ============================================================== 基于 Kernel LDA + KNN 的人脸识别 使用 Kernel Discriminant Analysis 做特征降维 使用 K-Nearest-Neighbor 做分类 数据: 人脸图像来自于 Olivetti faces data-set from AT&T (classification) 数据集包含 40 个人的人脸图像, 每个人都有 10 张图像 我们只使用其中标签(label/target)为 0 和 1 的前 2 个人的图像 算法: 需要自己实现基于 RBF Kernel 的 Kernel Discriminant Analysis 用于处理两个类别的数据的特征降维 代码的框架已经给出, 需要学生自己补充 KernelDiscriminantAnalysis 的 fit() 和 transform() 函数的内容 ============================================================== """ # License: BSD 3 clause import numpy as np import matplotlib.pyplot as plt from sklearn import datasets from sklearn.model_selection import train_test_split from sklearn.datasets import fetch_olivetti_faces from sklearn.neighbors import KNeighborsClassifier from sklearn.pipeline import make_pipeline from sklearn.preprocessing import StandardScaler print(__doc__) ################################################ """ Scikit-learn-compatible Kernel Discriminant Analysis. """ import numpy as np from scipy import linalg from sklearn.base import BaseEstimator, ClassifierMixin, TransformerMixin from sklearn.preprocessing import OneHotEncoder from sklearn.utils.validation import check_array, check_is_fitted, check_X_y class RBFKernel(object): """ a gaussian kernel k(x, y) = exp( - gamma * || x - y ||^2) """ def __init__(self, gamma: float) -> None: self._gamma = gamma @property def gamma(self) -> float: return self._gamma @gamma.setter def gamma(self, gamma: float) -> None: self._gamma = gamma def _rbf(self, x: np.array, y: np.array) -> float: return np.exp(- self._gamma * np.sum((x - y) ** 2)) def __call__(self, x: np.array, y: np.array) -> float: return self._rbf(x, y) class KernelDiscriminantAnalysis(BaseEstimator, ClassifierMixin, TransformerMixin): """Kernel Discriminant Analysis. Parameters ---------- n_components: integer. The dimension after transform. gamma: float. Parameter to RBF Kernel lmb: float (>= 0.0), default=0.001. Regularization parameter """ def __init__(self, n_components, gamma, lmb=0.001): self.n_components = n_components self.gamma = gamma self.lmb = lmb self.X = None # 用于存放输入的训练数据的 X self.K = None # 用于存放训练数据 X 产生的 Kernel Matrix self.M = None # 用于存放 Kernel LDA 最优化公式中的 M self.N = None # 用于存放 Kernel LDA 最优化公式中的 N self.EigenVectors = None # 用于存放 Kernel LDA 最优化公式中的 M 对应的广义特征向量, 每一列为一个特征向量, 按照对应特征值大小排序 self._rbf = RBFKernel(gamma) def fit(self, X: np.array, y: np.array) -> None: """Fit KDA model. Parameters ---------- X: numpy array of shape [n_samples, n_features] Training set. y: numpy array of shape [n_samples] Target values. Only works for 2 classes with label/target 0 and 1. Returns ------- self """ self.X = X classes = list(set(y)) assert len(classes) == 2, "only works for 2 classes" X_cls_1 = X[y == classes[0], :] X_cls_2 = X[y == classes[1], :] # M, M_1 & M_2 M_1 = self._M_cls(X, X_cls_1) M_2 = self._M_cls(X, X_cls_2) self.M = np.matmul((M_2 - M_1), np.transpose(M_2 - M_1)) # N, K_1 & K_2 num_samples = X.shape[0] num_samples_cls_1 = X_cls_1.shape[0] num_samples_cls_2 = X_cls_2.shape[0] K_1 = self._K_cls(X, X_cls_1) K_2 = self._K_cls(X, X_cls_2) self.N = np.matmul( np.matmul( K_1, (np.identity(num_samples_cls_1) - 1 / num_samples_cls_1 * np.ones(shape=(num_samples_cls_1, num_samples_cls_1))) ), np.transpose(K_1) ) self.N += np.matmul( np.matmul( K_2, (np.identity(num_samples_cls_2) - 1 / num_samples_cls_2 * np.ones(shape=(num_samples_cls_2, num_samples_cls_2))) ), np.transpose(K_2) ) self.N += self.lmb + np.identity(num_samples) self.K = [K_1, K_2] _, vecs = linalg.eig(self.M, self.N) self.EigenVectors = vecs[:, : self.n_components] def transform(self, X_test: np.array) -> np.array: """Transform data with the trained KernelLDA model. Parameters ---------- X_test: numpy array of shape [n_samples, n_features] The input data. Returns ------- y_pred: array-like, shape (n_samples, n_components) Transformations for X. """ num_samples_test = X_test.shape[0] num_samples = self.X.shape[0] y_pred = np.zeros(shape=(num_samples_test, self.n_components)) for i in range(num_samples_test): # kernel x = X_test[i, :] x_rbf = np.zeros(shape=(num_samples, )) for k in range(num_samples): x_rbf[k] = self._rbf(self.X[k, :], x) y_pred[i, :] = np.matmul(x_rbf, self.EigenVectors) return y_pred def _M_cls(self, X: np.array, X_cls: np.array) -> np.array: """ """ num_samples = X.shape[0] num_samples_cls = X_cls.shape[0] # M_cls M_cls = np.zeros(shape=(num_samples, 1)) for j in range(num_samples): for k in range(num_samples_cls): M_cls[j] += self._rbf(X[j, :], X_cls[k, :]) M_cls /= num_samples_cls return M_cls def _K_cls(self, X: np.array, X_cls: np.array) -> np.array: num_samples = X.shape[0] num_samples_cls = X_cls.shape[0] K_cls = np.zeros(shape=(num_samples, num_samples_cls)) for n in range(num_samples): for m in range(num_samples_cls): K_cls[n, m] = self._rbf(X[n, :], X_cls[m, :]) return K_cls ################################################ # 指定 KNN 中最近邻的个数 (k 的值) n_neighbors = 3 # 设置随机数种子让实验可以复现 random_state = 0 # 现在人脸数据集 faces = fetch_olivetti_faces(download_if_missing=False) targets = faces.target # show sample images images = faces.images[targets < 2] # save images features = faces.data # features targets = faces.target # targets fig = plt.figure() # create a new figure window for i in range(20): # display 20 images # subplot : 4 rows and 5 columns img_grid = fig.add_subplot(4, 5, i+1) # plot features as image img_grid.imshow(images[i], cmap='gray') plt.show() # Prepare data, 只限于处理类别 0 和 1 的人脸 X, y = faces.data[targets < 2], faces.target[targets < 2] # Split into train/test X_train, X_test, y_train, y_test = \ train_test_split(X, y, test_size=0.5, stratify=y, random_state=random_state) # Reduce dimension to 2 with KernelDiscriminantAnalysis # can adjust the value of 'gamma' as needed. kda = make_pipeline(StandardScaler(), KernelDiscriminantAnalysis(n_components=2, gamma = 0.000005)) # Use a nearest neighbor classifier to evaluate the methods knn = KNeighborsClassifier(n_neighbors=n_neighbors) # Fit the method's model kda.fit(X_train, y_train) # Fit a nearest neighbor classifier on the embedded training set knn.fit(kda.transform(X_train), y_train) # Compute the nearest neighbor accuracy on the embedded test set acc_knn = knn.score(kda.transform(X_test), y_test) # Embed the data set in 2 dimensions using the fitted model X_embedded = kda.transform(X) plt.figure() # plt.subplot(1, 3, i + 1, aspect=1) # Plot the projected points and show the evaluation score plt.scatter(X_embedded[:, 0], X_embedded[:, 1], c=y, s=30, cmap='Set1') plt.title("{}, KNN (k={})\nTest accuracy = {:.2f}".format('kda', n_neighbors, acc_knn)) plt.show()