import org.apache.spark.SparkContext
import org.apache.spark.ml.feature.{OneHotEncoder, IndexToString, StringIndexer}
import org.apache.spark.mllib.linalg.{DenseVector, SparseVector}
import org.apache.spark.sql.SQLContext
import org.apache.spark.sql.functions._

// Spark-shell script demonstrating categorical-feature encoding with
// StringIndexer, IndexToString, and OneHotEncoder.
// Assumes `sc` is the shell-provided SparkContext.
val sqlContext = new SQLContext(sc)

val df = sqlContext.createDataFrame(Seq(
  (0, "US"), (1, "IND"), (2, "AUS"), (3, "IND"), (4, "US")
)).toDF("id", "nationality")

// StringIndexer maps each distinct label to a double index.
// The most frequent label gets 0.0, the next most frequent 1.0, and so on.
val indexer = new StringIndexer()
  .setInputCol("nationality")
  .setOutputCol("index")
val indexerModel = indexer.fit(df)
val indexedDF = indexerModel.transform(df)

// IndexToString reverses the mapping: index -> original label.
// Setting the labels explicitly from the fitted model avoids depending on
// ML attribute metadata surviving the selectExpr alias below.
val converter = new IndexToString()
  .setInputCol("predictedIndex")
  .setOutputCol("predictedNationality")
  .setLabels(indexerModel.labels)
val predictions = indexedDF.selectExpr("index as predictedIndex")
converter.transform(predictions).show()

// OneHotEncoder emits one sparse vector per row. It requires a NUMERIC
// category-index input column -- feeding it the raw string column
// "nationality" fails at runtime. Encode the "index" column produced by
// StringIndexer instead.
// By default the last category is dropped (to avoid collinearity);
// call .setDropLast(false) to keep it.
val oneHotEncoder = new OneHotEncoder()
  .setInputCol("index")
  .setOutputCol("nVector")
val encodedDF = oneHotEncoder.transform(indexedDF)

// To inspect the generated vectors, either densify them into a new column...
val toDenseUDF = udf[DenseVector, SparseVector](_.toDense)
encodedDF.withColumn("denseVector", toDenseUDF(encodedDF("nVector"))).show()

// ...or iterate row by row and print the dense form of each vector.
encodedDF foreach { row =>
  val denseVector = row.getAs[SparseVector]("nVector").toDense
  println(s"${row(0)} ${row(1)} $denseVector")
}
0 Comments
Leave a Reply.
Archives
October 2016
Categories
All
|