Mahalanobis Distance
import breeze.linalg._
import org.apache.spark.SparkContext
import org.apache.spark.ml.feature.{StandardScaler, VectorAssembler}
import org.apache.spark.mllib.linalg.{Vectors, Vector}
import org.apache.spark.mllib.stat.Statistics
import org.apache.spark.sql.SQLContext
import org.apache.spark.sql.functions._

// Outlier removal with the Mahalanobis distance.
//
// Steps: build a small synthetic data set, append one obvious outlier, standardize the
// features, invert their correlation matrix, score every row, and drop the highest-scoring rows.
//
// NOTE: `sc` is not defined in this script; it is expected to be supplied by the environment
// (e.g. spark-shell). Statements are kept on single lines so they can be pasted line by line.

val sqlContext = new SQLContext(sc)

// Ten rows (ids 0..9) with one uniform and two normal random columns; seeds are fixed.
val df = sqlContext.range(0, 10).select("id").withColumn("uniform", rand(10L)).withColumn("normal1", randn(10L)).withColumn("normal2", randn(11L))

// Pack the three numeric columns into a single vector column named "features".
val assembler = new VectorAssembler().setInputCols(Array("uniform", "normal1", "normal2")).setOutputCol("features")
val df1 = assembler.transform(df)

// Add outlier info: one extra row (id 10) far away from the generated data.
// The id is a Long literal so it has the same type as the "id" column produced by `range`.
val outlierRow = sqlContext.createDataFrame(Seq((10L, Vectors.dense(5.0, 5.0, 5.0))))
val df2 = df1.select("id", "features").unionAll(outlierRow)

// Standardize df2: this is an important step in calculating the Mahalanobis distance.
// When normalized to zero mean and unit standard deviation, the correlation matrix
// is equal to the covariance matrix.
val standardScaler = new StandardScaler().setInputCol("features").setOutputCol("scaledFeat").setWithMean(true).setWithStd(true)
val scalerModel = standardScaler.fit(df2.select("id", "features"))
val df2Scaled = scalerModel.transform(df2).select("id", "scaledFeat")

// Compute the inverse covariance matrix.
// The matrix size is taken from `corr` itself rather than hard-coded, so the script keeps
// working if the list of feature columns changes.
val rddScaledFeat = df2Scaled.select("scaledFeat").rdd.map(_.getAs[Vector](0))
val corr = Statistics.corr(rddScaledFeat)
val invCovariance = inv(new breeze.linalg.DenseMatrix(corr.numRows, corr.numCols, corr.toArray))

// Mahalanobis distance of a standardized vector x: sqrt(x^T * invCovariance * x).
// Note: DenseVector is from the breeze.linalg library.
val mahalanobis = udf[Double, Vector] { v =>
  val x = DenseVector(v.toArray)
  val squared: Double = x.t * invCovariance * x
  // Clamp at zero so floating-point rounding can never feed a negative value to sqrt.
  scala.math.sqrt(scala.math.max(0.0, squared))
}
val df2Mahalanobis = df2Scaled.withColumn("mahalanobis", mahalanobis(df2Scaled("scaledFeat")))

// Remove outliers, let's say the top 2 by distance.
// `limit` keeps only the needed rows before `collect`, instead of pulling every id to the driver.
val numOutliers = 2
val ranked = df2Mahalanobis.select("id", "mahalanobis").sort(df2Mahalanobis("mahalanobis").desc)
val idOutliers = ranked.select("id").limit(numOutliers).collect().map(_.getLong(0))

df2.filter(s"id not in (${idOutliers.mkString(",")})").show()
0 Comments
Leave a Reply. |
Archives
October 2016
Categories
All
|