Dataset: Record Linkage Comparison Patterns Data Set
Source: UCI Machine Learning Repository Description: The dataset was curated from a record linkage study that was performed at a German hospital in 2010. It contains millions of pairs of patient records that were matched according to several different criteria, like first name, last name, date of birth and address. Each match was assigned a numerical value from 0 to 1 indicating how similar the records are, and then the data was hand-labeled to identify which record pairs matched. //Importing data from a folder and creating an RDD scala> val rawData = sc.textFile("/linkage/dataset") rawData: org.apache.spark.rdd.RDD[String] = /linkage/dataset MapPartitionsRDD[5] at textFile at <console>:21 // Some basic operations on RDD //Getting the first row in data scala> rawData.first res2: String = "id_1","id_2","cmp_fname_c1","cmp_fname_c2","cmp_lname_c1","cmp_lname_c2","cmp_sex","cmp_bd","cmp_bm","cmp_by","cmp_plz","is_match" //Counting the number of tuples in the data scala> rawData.count() res9: Long = 5749144 //To fetch all raw data (this pulls every row to the driver, so avoid it on large datasets) scala> rawData.collect() //To save the RDD in persistent storage like HDFS scala> rawData.saveAsTextFile("hdfs://linkage/dataset") //Getting top 5 rows of data scala> val head = rawData.take(5) head: Array[String] = Array("id_1","id_2","cmp_fname_c1","cmp_fname_c2","cmp_lname_c1","cmp_lname_c2","cmp_sex","cmp_bd","cmp_bm","cmp_by","cmp_plz","is_match", 37291,53113,0.833333333333333,?,1,?,1,1,1,1,0,TRUE, 39086,47614,1,?,1,?,1,1,1,1,1,TRUE, 70031,70237,1,?,1,?,1,1,1,1,1,TRUE, 84795,97439,1,?,1,?,1,1,1,1,1,TRUE) //basic operations on top 5 rows scala> head.length res3: Int = 5 scala> head foreach println "id_1","id_2","cmp_fname_c1","cmp_fname_c2","cmp_lname_c1","cmp_lname_c2","cmp_sex","cmp_bd","cmp_bm","cmp_by","cmp_plz","is_match" 37291,53113,0.833333333333333,?,1,?,1,1,1,1,0,TRUE 39086,47614,1,?,1,?,1,1,1,1,1,TRUE 70031,70237,1,?,1,?,1,1,1,1,1,TRUE 84795,97439,1,?,1,?,1,1,1,1,1,TRUE //filter headers from head data. 
//For that, let's create a function to check if the tuple is a header or not. //In our example, the header contains the "id_1" attribute. scala> def isHeader(tuple: String) = tuple.contains("id_1") isHeader: (tuple: String)Boolean scala> head filter isHeader res3: Array[String] = Array("id_1","id_2","cmp_fname_c1","cmp_fname_c2","cmp_lname_c1","cmp_lname_c2","cmp_sex","cmp_bd","cmp_bm","cmp_by","cmp_plz","is_match") //Get the data without header scala> val dataWithoutHeader = head filterNot isHeader dataWithoutHeader: Array[String] = Array(37291,53113,0.833333333333333,?,1,?,1,1,1,1,0,TRUE, 39086,47614,1,?,1,?,1,1,1,1,1,TRUE, 70031,70237,1,?,1,?,1,1,1,1,1,TRUE, 84795,97439,1,?,1,?,1,1,1,1,1,TRUE) //The first 2 attributes of the tuple are patient IDs, which are Int, //while the last attribute is a Boolean telling if the patients' //records matched or not. The middle 9 attributes are raw scores //on which the conclusion is made. //Let's create a case class to accept every tuple scala> case class Entry(id1: Int, id2:Int, scores: Seq[Double], matched: Boolean) defined class Entry //Let's define a parsing function that accepts a tuple and converts //it to an Entry object. Some of the score attributes have "?", which //will be replaced with NaN. 
scala> def parse (tuple: String) = { | val attributes = tuple.split(",") | val id1 = attributes(0).toInt | val id2 = attributes(1).toInt | val matched = attributes(11).toBoolean | val scores = attributes slice(2,11) map { x => if("?".equals(x)) Double.NaN else x.toDouble } toSeq | Entry(id1, id2, scores, matched) | } parse: (tuple: String)Entry scala> val entries: Seq[Entry] = dataWithoutHeader map parse toSeq entries: Seq[Entry] = WrappedArray(Entry(37291,53113,WrappedArray(0.833333333333333, NaN, 1.0, NaN, 1.0, 1.0, 1.0, 1.0, 0.0),true), Entry(39086,47614,WrappedArray(1.0, NaN, 1.0, NaN, 1.0, 1.0, 1.0, 1.0, 1.0),true), Entry(70031,70237,WrappedArray(1.0, NaN, 1.0, NaN, 1.0, 1.0, 1.0, 1.0, 1.0),true), Entry(84795,97439,WrappedArray(1.0, NaN, 1.0, NaN, 1.0, 1.0, 1.0, 1.0, 1.0),true))
0 Comments
Leave a Reply. |
Archives
April 2016
Categories |