算法小白的第一次尝试---KPCA(核主成分分析)降维【实例对比分析PCA、LDA和KPCA】

--------------------------------------------------------------------------
笔者追求算法实现,不喜欢大篇幅叙述原理,有关KPCA理论推荐查看该篇博客
https://blog.csdn.net/zjuPeco/article/details/77510981
	
PCA降维欢迎前往笔者上一篇博客:
https://blog.csdn.net/Java_Man_China/article/details/89331554

LDA降维欢迎前往笔者上一篇博客:
https://blog.csdn.net/Java_Man_China/article/details/89504514
--------------------------------------------------------------------------
import breeze.linalg.{DenseMatrix, DenseVector, eig}
import org.apache.log4j.{Level, Logger}
import org.apache.spark.ml.feature.{LabeledPoint, StandardScaler, VectorAssembler}
import org.apache.spark.ml.linalg.Vectors
import org.apache.spark.sql.types.{DoubleType, StructField, StructType}
import org.apache.spark.sql.{DataFrame, Row, SparkSession}
import scala.collection.mutable.ArrayBuffer

/** Kernel PCA (KPCA) demo built on Spark ML and Breeze.
  *
  * PCA and LDA are linear dimensionality-reduction techniques; for data that is not
  * linearly separable a kernel function is needed. This object shows how to apply
  * KPCA with an RBF kernel to such data.
  *
  * Data source named by the original author: http://archive.ics.uci.edu/ml/datasets/Wine
  * @author XiaoTangBao
  * @date 2019/4/29 14:04
  * @version 1.0
  */
object KPCA {

  /** Input file used when no path is given on the command line. */
  private val DefaultInputPath = "G:\\mldata\\kpca_test.txt"

  /**
    * Reads a comma-separated file whose lines are `label,x1,x2`, runs KPCA on the two
    * feature columns and prints the first component of every sample, a separator line,
    * then the second component of every sample.
    *
    * @param args optional; `args(0)` overrides the default input path
    */
  def main(args: Array[String]): Unit = {
    // Silence Spark's own logging below ERROR.
    Logger.getLogger("org.apache.spark").setLevel(Level.ERROR)
    val inputPath = Option(args).flatMap(_.headOption).getOrElse(DefaultInputPath)
    val spark = SparkSession.builder().master("local[4]").appName("KPCA").getOrCreate()
    try {
      val data = spark.sparkContext.textFile(inputPath)
        .map(line => line.split(','))
        .map(fields => fields.map(_.toDouble))
        .map(values => Row(values(0), values(1), values(2)))

      // Schema of the raw rows and the columns that make up the feature vector.
      val schema = StructType(List(
        StructField("label", DoubleType, nullable = true),
        StructField("x1", DoubleType, nullable = true),
        StructField("x2", DoubleType, nullable = true)))
      val featuresArray = Array("x1", "x2")
      val df = spark.createDataFrame(data, schema)

      // Assemble the feature columns into a single vector column.
      val va = new VectorAssembler().setInputCols(featuresArray).setOutputCol("features")
      val ndf = va.transform(df).select("label", "features")

      // RBF kernel parameter.
      val gama = 15.0
      // Number of dimensions kept after the reduction.
      val dim = 2
      val n1 = run(ndf, dim, gama)

      (0 until n1.cols).foreach(i => println(n1(0, i)))
      println("**************")
      (0 until n1.cols).foreach(i => println(n1(1, i)))
    } finally {
      // Release the local Spark resources even when the job fails.
      spark.stop()
    }
  }

  /**
    * Reduces the dimensionality of the data with RBF-kernel PCA.
    *
    * The "features" column is standardized, the RBF kernel matrix of all samples is
    * built and centered, and the eigenvectors belonging to the `k` largest eigenvalues
    * of the centered kernel matrix are returned.
    *
    * NOTE(review): the rows returned are the eigenvectors exactly as produced by
    * Breeze's `eig`; they are not rescaled by the square root of their eigenvalues and
    * their sign is whatever the solver yields. Libraries that apply such a scaling
    * (scikit-learn's KernelPCA appears to) will therefore differ in scale and possibly
    * sign - confirm before comparing numbers across implementations.
    *
    * @param df   input data; must contain a vector column named "features", one row per sample
    * @param k    number of components to keep; must be positive
    * @param gama the single parameter of the RBF kernel
    * @return a matrix with one row per kept component (at most `k`, ordered by
    *         descending eigenvalue) and one column per sample
    */
  def run(df: DataFrame, k: Int, gama: Double): DenseMatrix[Double] = {
    import org.apache.spark.ml.linalg.{Vector => MLVector}

    require(k > 0, s"k must be positive, got $k")

    // Standardize to zero mean / unit standard deviation per feature.
    val scaled = new StandardScaler().setInputCol("features").setOutputCol("Scaledfeatures")
      .setWithMean(true).setWithStd(true).fit(df).transform(df)

    // Read the vectors directly instead of parsing their string representation.
    val samples: Array[Array[Double]] =
      scaled.select("Scaledfeatures").collect().map(row => row.getAs[MLVector](0).toArray)
    require(samples.nonEmpty, "input DataFrame contains no rows")

    val featureCount = samples(0).length
    val sampleCount = samples.length

    // Breeze matrices are column-major, so every column holds one sample.
    val dataMatrix = new DenseMatrix[Double](featureCount, sampleCount, samples.flatten)

    // Kernel matrix of all sample pairs; it is symmetric, so fill both halves at once.
    val kernel = DenseMatrix.zeros[Double](sampleCount, sampleCount)
    for (i <- 0 until sampleCount) {
      val vi = dataMatrix(::, i)
      for (j <- i until sampleCount) {
        val kij = rbf(vi, dataMatrix(::, j), gama)
        kernel(i, j) = kij
        kernel(j, i) = kij
      }
    }

    // Center the kernel matrix: K - 1n*K - K*1n + 1n*K*1n, where 1n has every entry 1/N.
    val oneN = DenseMatrix.fill(sampleCount, sampleCount)(1.0 / sampleCount)
    val centered = kernel - oneN * kernel - kernel * oneN + oneN * kernel * oneN

    // Decompose once; `eigenvalues` holds the real parts reported by the solver.
    // The original author notes that the returned eigenvectors are already normalized.
    val decomposition = eig(centered)
    val eigValues = decomposition.eigenvalues
    val eigVectors = decomposition.eigenvectors

    // Indices of the k largest eigenvalues, largest first.
    val top = (0 until eigValues.length).sortBy(i => eigValues(i)).reverse.take(k)

    // Row i of the result is the eigenvector (a column of eigVectors) of the i-th largest eigenvalue.
    DenseMatrix.tabulate(top.length, sampleCount)((i, j) => eigVectors(j, top(i)))
  }

  /**
    * RBF (Gaussian) kernel: exp(-gama * ||v1 - v2||^2).
    *
    * @param v1   first vector
    * @param v2   second vector, same length as `v1`
    * @param gama kernel parameter
    * @return the kernel value for the pair
    */
  def rbf(v1: DenseVector[Double], v2: DenseVector[Double], gama: Double): Double = {
    val diff = v1 - v2
    val squaredDistance = diff dot diff
    math.exp((-1.0) * gama * squaredDistance)
  }
}

采用KPCA,分别对葡萄酒数据进行了降维处理,结果如下图所示:
算法小白的第一次尝试---KPCA(核主成分分析)降维【实例对比分析PCA、LDA和KPCA】
算法小白的第一次尝试---KPCA(核主成分分析)降维【实例对比分析PCA、LDA和KPCA】
与Python调库结果对比,发现两者基本一致
算法小白的第一次尝试---KPCA(核主成分分析)降维【实例对比分析PCA、LDA和KPCA】

同时为了对比线性降维的效果,采用PCA和LDA分别对数据进行了降维,结果如下图所示:
算法小白的第一次尝试---KPCA(核主成分分析)降维【实例对比分析PCA、LDA和KPCA】
算法小白的第一次尝试---KPCA(核主成分分析)降维【实例对比分析PCA、LDA和KPCA】
算法小白的第一次尝试---KPCA(核主成分分析)降维【实例对比分析PCA、LDA和KPCA】
算法小白的第一次尝试---KPCA(核主成分分析)降维【实例对比分析PCA、LDA和KPCA】
实验结果表明:对于非线性可分数据,PCA和LDA降维效果不理想,而KPCA对于非线性数据,降维效果明显。