import org.apache.spark.mllib.regression.LabeledPoint
import org.apache.spark.mllib.stat.test.ChiSqTestResult
import org.apache.spark.{SparkConf, SparkContext}
import org.apache.spark.mllib.linalg._
import org.apache.spark.mllib.stat.{KernelDensity, Statistics, MultivariateStatisticalSummary}
import org.apache.spark.rdd.RDD
import org.apache.spark.mllib.random.RandomRDDs._

// chi-squared goodness-of-fit test of observed frequencies against a uniform distribution
val testVector = Vectors.dense(0.3, 0.1, 0.2, 0.1, 0.9, 0.05)
val goodnessOfFitResult: ChiSqTestResult = Statistics.chiSqTest(testVector)
println(s"goodnessOfFitResult p-value: ${goodnessOfFitResult.pValue} nullHypothesis: ${goodnessOfFitResult.nullHypothesis}")

// chi-squared test for independence on a contingency matrix (values are column-major)
val testMatrix = Matrices.dense(numRows = 3, numCols = 2, Array(2.0, 4.0, 1.0, 5.0, 7.0, 3.0))
val independenceResult: ChiSqTestResult = Statistics.chiSqTest(testMatrix)
println(s"independenceResult p-value: ${independenceResult.pValue} nullHypothesis: ${independenceResult.nullHypothesis}")

// chi-squared independence test of each feature against the label in an RDD[LabeledPoint];
// returns one result per feature
val testLabeledPoint = sc.parallelize(Array(
  LabeledPoint(0, Vectors.dense(0.5, 0.4)),
  LabeledPoint(1, Vectors.dense(0.3, 0.1)),
  LabeledPoint(0, Vectors.dense(0.2, 0.8))
))
val independenceResult1: Array[ChiSqTestResult] = Statistics.chiSqTest(testLabeledPoint)
independenceResult1 foreach { r =>
  println(s"independenceResult1 p-value: ${r.pValue} nullHypothesis: ${r.nullHypothesis}")
}

// Kolmogorov-Smirnov test for equality of distribution: sample against N(0, 1)
val testNormal: RDD[Double] = normalRDD(sc, size = 1000L, numPartitions = 1, seed = 90L)
val testNormalResult = Statistics.kolmogorovSmirnovTest(testNormal, "norm", 0, 1)
println(s"testNormalResult p-value: ${testNormalResult.pValue} nullHypothesis: ${testNormalResult.nullHypothesis}")

// kernel density estimate over the same sample, evaluated on a grid of points
val kd = new KernelDensity().setSample(testNormal).setBandwidth(0.1)
val densities = kd.estimate((-2.0 to 2.0 by 0.5).toArray)
println(s"Kernel densities: ${densities.mkString(", ")}")
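kolmogorovSmirnovTest also has an overload that takes a user-supplied CDF instead of the "norm" shorthand. A minimal sketch, assuming Apache Commons Math (commons-math3, which Spark already depends on) is on the classpath:

import org.apache.commons.math3.distribution.NormalDistribution

// same test as above, but with an explicit CDF function
val stdNormal = new NormalDistribution(0.0, 1.0)
val customCdfResult = Statistics.kolmogorovSmirnovTest(testNormal, (v: Double) => stdNormal.cumulativeProbability(v))
println(s"customCdfResult p-value: ${customCdfResult.pValue}")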
import org.apache.spark.mllib.linalg.distributed.IndexedRow

// randomSplit: divide an RDD into train/test/validation sets by weight
val data = sc.parallelize(1 to 10000000)
val Array(train, test, validation) = data.randomSplit(Array(0.5, 0.25, 0.25), seed = 90L)
println(s"splits train ${train.count()} test ${test.count()} validate ${validation.count()}")

// stratified sampling with sampleByKey
val rows: RDD[IndexedRow] = sc.parallelize(Array(
  IndexedRow(0L, Vectors.dense(1.0, 6.0, 2.0)),
  IndexedRow(1L, Vectors.dense(3.0, 1.0, 3.0)),
  IndexedRow(1L, Vectors.dense(4.0, 2.0, 1.0))
))
// sampling fraction per key: keep every record with label/index 0L (1.0)
// and roughly half of the records with label/index 1L (0.5)
val fractions = Map(0L -> 1.0, 1L -> 0.5)
val sample = rows
  .map { case IndexedRow(i, v) => (i, v) }
  .sampleByKey(withReplacement = false, fractions = fractions, seed = 90L)
sample.collect().foreach(println)
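sampleByKey draws each stratum in a single pass, so the realized sample sizes only approximate the requested fractions. When the per-key counts need to match the fractions closely, sampleByKeyExact can be used instead, at the cost of extra passes over the data. A minimal sketch reusing rows and fractions from above:

// sampleByKeyExact aims for exactly ceil(numItems * fraction) records per key,
// which may require scanning the RDD more than once
val exactSample = rows
  .map { case IndexedRow(i, v) => (i, v) }
  .sampleByKeyExact(withReplacement = false, fractions = fractions, seed = 90L)
exactSample.collect().foreach(println)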
import org.apache.spark.{SparkConf, SparkContext}
import org.apache.spark.mllib.linalg._
import org.apache.spark.mllib.stat.{MultivariateStatisticalSummary, Statistics}
import org.apache.spark.rdd.RDD
import org.apache.spark.mllib.random.RandomRDDs._

val observations: RDD[Vector] = sc.parallelize(Array(
  Vectors.dense(1.0, 2.0, 3.0),
  Vectors.dense(4.0, 3.0, 7.0),
  Vectors.dense(5.0, 6.0, 4.0)
))
val x: RDD[Double] = sc.parallelize(Array(1.0, 2.0, 3.0))
val y: RDD[Double] = sc.parallelize(Array(3.0, 2.0, 1.0))

// column-wise summary statistics over the observation vectors
val stats: MultivariateStatisticalSummary = Statistics.colStats(observations)
println(stats.mean)
println(stats.count)

// Pearson correlation between two RDD[Double] series
val corr: Double = Statistics.corr(x, y, "pearson")
println(corr)

// pairwise Pearson correlation matrix over the columns of an RDD[Vector]
val corrMatrix: Matrix = Statistics.corr(observations, "pearson")
println(corrMatrix)

// random data generation: Poisson-distributed doubles and normal vectors
val rand1: RDD[Double] = poissonRDD(sc, mean = 1.0, size = 1000000L, numPartitions = 10)
println(s"Rand1 mean ${rand1.mean()} variance ${rand1.variance()}")

val rand2: RDD[Vector] = normalVectorRDD(sc, numRows = 1000L, numCols = 10, numPartitions = 10)
val rand2Stats = Statistics.colStats(rand2)
println(s"Rand2 mean ${rand2Stats.mean} variance ${rand2Stats.variance}")
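The same corr API also supports Spearman's rank correlation by passing "spearman" as the method name. A minimal sketch reusing x, y, and observations from above:

// Spearman rank correlation; picks up monotone but non-linear relationships
// that Pearson would understate
val spearman: Double = Statistics.corr(x, y, "spearman")
println(spearman)

val spearmanMatrix: Matrix = Statistics.corr(observations, "spearman")
println(spearmanMatrix)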
Chained filter and map operations on a Scala collection can be compressed into a single pass; otherwise each step materializes an intermediate collection, so we end up with more garbage and more time spent building the final result. This can be accomplished with the collect method, which takes a partial function (not to be confused with the Spark RDD action of the same name).
Below is the example:

val x = List((1, 2), (2, 3), (3, 4), (4, 5), (5, 6))
x.filter(_._1 % 2 == 0).map(_._2)

// the chaining above can be replaced by a single collect
x.collect { case (a, b) if a % 2 == 0 => b }
View: a view applies its transformations lazily, as one composed function, instead of materializing a series of intermediate collections.
// the two maps are fused into one traversal; nothing is computed
// until the view is forced
val shifted = (1 to 1000000).view map (_ + 5) map (_ * 2)
shifted.force // materializes the final collection in a single pass
Using lazy evaluation on collections doesn't always guarantee better performance. Lazy evaluation requires the creation of an additional closure for each transformation, and if creating those closures costs more than building the intermediate collections, the lazy version will run slower. Typically, for small collections the strict version runs faster than the lazy one.
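A rough way to check which side of this trade-off a given pipeline falls on is to time both versions. A minimal sketch with an ad hoc time helper (single-shot JVM timings are only indicative because of warm-up and GC; use a proper harness such as JMH for real measurements):

// naive single-shot timer
def time[A](label: String)(body: => A): A = {
  val start = System.nanoTime()
  val result = body
  println(s"$label: ${(System.nanoTime() - start) / 1e6} ms")
  result
}

val n = 1000
time("strict") { (1 to n).map(_ + 5).map(_ * 2) }
time("lazy") { ((1 to n).view map (_ + 5) map (_ * 2)).force }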