Evaluators compute metrics from predictions. The available evaluators are:
// Spark ML evaluator walkthrough (spark-shell style script; `sc` is the SparkContext
// supplied by the shell). Demonstrates BinaryClassificationEvaluator,
// MulticlassClassificationEvaluator, RegressionEvaluator and the
// BinaryLogisticRegressionSummary training summary.

// Imported explicitly so the script does not depend on SQLContext already being in scope.
import org.apache.spark.sql.SQLContext

val sqlContext = new SQLContext(sc)
import sqlContext.implicits._

import org.apache.spark.mllib.util.MLUtils
import org.apache.spark.ml.classification.LogisticRegression
import org.apache.spark.ml.classification.RandomForestClassifier
import org.apache.spark.ml.evaluation.BinaryClassificationEvaluator
import org.apache.spark.ml.feature.{IndexToString, VectorIndexer, StringIndexer}
import org.apache.spark.ml.Pipeline
import org.apache.spark.ml.classification.BinaryLogisticRegressionSummary
import org.apache.spark.ml.evaluation.{RegressionEvaluator, MulticlassClassificationEvaluator}
import org.apache.spark.ml.regression.RandomForestRegressor

// Load the LibSVM sample as a DataFrame and split it 70/30 into train and test sets.
val data = MLUtils.loadLibSVMFile(sc, "./data/sample_libsvm_data.txt").toDF()
val Array(train, test) = data.randomSplit(Array(0.7, 0.3))

// ---------------------------------------------------------------------------
// BinaryClassificationEvaluator
// ---------------------------------------------------------------------------
val logr = new LogisticRegression()
  .setMaxIter(10)
  .setRegParam(0.3)
  .setElasticNetParam(0.8)
val modelLog = logr.fit(train)
println(s"Weights: ${modelLog.coefficients} \n Intercept: ${modelLog.intercept}")

// Get predictions on test data
val predictionsLog = modelLog.transform(test)

// Define evaluator
val evaluatorBinary = new BinaryClassificationEvaluator()
  .setLabelCol("label")
  .setRawPredictionCol("rawPrediction")
  .setMetricName("areaUnderROC")

// Run evaluation. The area under the ROC curve lies between 0.0 and 1.0; 0.5 corresponds
// to random guessing and larger values indicate a better fit.
val roc = evaluatorBinary.evaluate(predictionsLog)

// ---------------------------------------------------------------------------
// MulticlassClassificationEvaluator
// ---------------------------------------------------------------------------
// Both indexers are fitted on the full data set so that every label / category value
// seen in either split has an index.
val labelIndexer = new StringIndexer()
  .setInputCol("label")
  .setOutputCol("indexedLabel")
  .fit(data)
val vectorIndexer = new VectorIndexer()
  .setInputCol("features")
  .setOutputCol("indexedFeatures")
  .setMaxCategories(4)
  .fit(data)
// Maps the predicted index back to the original label value.
val indexToString = new IndexToString()
  .setInputCol("prediction")
  .setOutputCol("predictedLabel")
  .setLabels(labelIndexer.labels)
val classifierRF = new RandomForestClassifier()
  .setLabelCol("indexedLabel")
  .setFeaturesCol("indexedFeatures")
  .setNumTrees(3)
val pipelineRF = new Pipeline()
  .setStages(Array(labelIndexer, vectorIndexer, classifierRF, indexToString))
val modelRF = pipelineRF.fit(train)

// Get predictions on test data
val predictionsRF = modelRF.transform(test)

// Define MulticlassClassificationEvaluator.
// The classifier is trained on "indexedLabel", so "prediction" is in index space and must
// be compared against "indexedLabel" (not the raw "label"): StringIndexer assigns indices
// by label frequency, so an index need not equal the raw label value.
val evaluatorMultiClass = new MulticlassClassificationEvaluator()
  .setLabelCol("indexedLabel")
  .setPredictionCol("prediction")
  .setMetricName("precision")
val accuracy = evaluatorMultiClass.evaluate(predictionsRF)
println(s"Test Error: ${1 - accuracy}")

// ---------------------------------------------------------------------------
// RegressionEvaluator
// ---------------------------------------------------------------------------
// The regressor reads the VectorIndexer output as its features and writes to the default
// "prediction" column, which is the column the evaluator below reads.
val regressorRF = new RandomForestRegressor()
  .setLabelCol("label")
  .setFeaturesCol("indexedFeatures")
val pipelineRFR = new Pipeline().setStages(Array(vectorIndexer, regressorRF))
val modelRFR = pipelineRFR.fit(train)

// Get predictions on test data
val predictionRFR = modelRFR.transform(test)

// Define RegressionEvaluator
val evaluatorReg = new RegressionEvaluator()
  .setLabelCol("label")
  .setPredictionCol("prediction")
  .setMetricName("rmse")
val rmse = evaluatorReg.evaluate(predictionRFR)
println(s"RMSE: $rmse")

// ---------------------------------------------------------------------------
// BinaryLogisticRegressionSummary
// ---------------------------------------------------------------------------
// The generic training summary is cast to its binary variant to reach the binary-only
// metrics (ROC area, F-measure by threshold).
val summaryLog = modelLog.summary.asInstanceOf[BinaryLogisticRegressionSummary]
println(s"areaUnderCurve: ${summaryLog.areaUnderROC}")

// Pick the threshold that maximises the F-measure.
val fMeasure = summaryLog.fMeasureByThreshold
val maxFMeasure = fMeasure.agg("F-Measure" -> "Max").head().getDouble(0)
val bestThreshold = fMeasure
  .where($"F-Measure" === maxFMeasure)
  .select("threshold")
  .head()
  .getDouble(0)
println(s"MaxFMeasure: $maxFMeasure & bestThreshold: $bestThreshold")
0 Comments
Leave a Reply. |
Archives
October 2016
Categories
All
|