import org.apache.spark.SparkContext
import org.apache.spark.ml.feature.VectorAssembler
import org.apache.spark.mllib.linalg.Vector
import org.apache.spark.mllib.linalg.distributed.RowMatrix

// Spark 1.x style: SQLContext over the ambient SparkContext `sc`,
// CSV loading via the external spark-csv package.
val sqlContext = new org.apache.spark.sql.SQLContext(sc)
import sqlContext.implicits._

// Load the crime data set, inferring column types from the header row.
val crimes = sqlContext.read
  .format("com.databricks.spark.csv")
  .option("delimiter", ",")
  .option("header", "true")
  .option("inferSchema", "true")
  .load("./data/UScrime2-colsLotsOfNAremoved.csv")

// MLlib PCA requires a RowMatrix, not a DataFrame: assemble all columns
// into a single feature vector per row, then pull the vectors out as an RDD.
// NOTE(review): assumes every column in the CSV is numeric — TODO confirm.
val assembler = new VectorAssembler()
  .setInputCols(crimes.columns)
  .setOutputCol("features")
val featuresDF = assembler.transform(crimes).select("features")

val rddOfRows = featuresDF.rdd
// getAs[Vector](0) is the typed accessor for column 0 of each Row.
val rddOfVectors = rddOfRows.map(_.getAs[Vector](0))
val mat = new RowMatrix(rddOfVectors)

// Compute the top 10 principal components.
val pcs = mat.computePrincipalComponents(10)
0 Comments
Leave a Reply.
Archives
October 2016
Categories
All
|