import org.apache.spark.SparkContext
import org.apache.spark.ml.feature.{OneHotEncoder, IndexToString, StringIndexer}
import org.apache.spark.mllib.linalg.{DenseVector, SparseVector}
import org.apache.spark.sql.SQLContext
import org.apache.spark.sql.functions._

// Spark-shell script demonstrating categorical-feature encoding with
// StringIndexer, IndexToString, and OneHotEncoder.
// Assumes `sc` is the shell-provided SparkContext.
val sqlContext = new SQLContext(sc)

val df = sqlContext.createDataFrame(Seq(
  (0, "US"), (1, "IND"), (2, "AUS"), (3, "IND"), (4, "US")
)).toDF("id", "nationality")

// StringIndexer maps each distinct label to a double index.
// The most frequent label gets 0.0, the next most frequent 1.0, and so on.
val indexer = new StringIndexer()
  .setInputCol("nationality")
  .setOutputCol("index")
val indexerModel = indexer.fit(df)
val indexedDF = indexerModel.transform(df)

// IndexToString reverses the mapping: index -> original label.
// Setting the labels explicitly from the fitted model avoids depending on
// ML attribute metadata surviving the selectExpr alias below.
val converter = new IndexToString()
  .setInputCol("predictedIndex")
  .setOutputCol("predictedNationality")
  .setLabels(indexerModel.labels)
val predictions = indexedDF.selectExpr("index as predictedIndex")
converter.transform(predictions).show()

// OneHotEncoder emits one sparse vector per row. It requires a NUMERIC
// category-index input column -- feeding it the raw string column
// "nationality" fails at runtime. Encode the "index" column produced by
// StringIndexer instead.
// By default the last category is dropped (to avoid collinearity);
// call .setDropLast(false) to keep it.
val oneHotEncoder = new OneHotEncoder()
  .setInputCol("index")
  .setOutputCol("nVector")
val encodedDF = oneHotEncoder.transform(indexedDF)

// To inspect the generated vectors, either densify them into a new column...
val toDenseUDF = udf[DenseVector, SparseVector](_.toDense)
encodedDF.withColumn("denseVector", toDenseUDF(encodedDF("nVector"))).show()

// ...or iterate row by row and print the dense form of each vector.
encodedDF foreach { row =>
  val denseVector = row.getAs[SparseVector]("nVector").toDense
  println(s"${row(0)} ${row(1)} $denseVector")
}
0 Comments
Leave a Reply.
Archives
October 2016
Categories
All
|