Basic Operations

7/16/2015

Dataset: Record Linkage Comparison Patterns Data Set
Source: UCI Machine Learning Repository
Description: The dataset was curated from a record linkage study performed at a German hospital in 2010. It contains millions of pairs of patient records that were compared according to several different criteria, such as first name, last name, date of birth and address. Each comparison was assigned a numerical value from 0 to 1 indicating how similar the two fields are, and the data was then hand-labeled to identify which record pairs matched.

//Importing data from a folder and creating an RDD
scala> val rawData = sc.textFile("/linkage/dataset")
rawData: org.apache.spark.rdd.RDD[String] = /linkage/dataset MapPartitionsRDD[5] at textFile at <console>:21

// Some basic operations on RDD
//Getting the first row in data
scala> rawData.first
res2: String = "id_1","id_2","cmp_fname_c1","cmp_fname_c2","cmp_lname_c1","cmp_lname_c2","cmp_sex","cmp_bd","cmp_bm","cmp_by","cmp_plz","is_match"

//Counting the number of tuples in the data
scala> rawData.count()
res9: Long = 5749144

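//To fetch all raw data to the driver as an Array
//(caution: collect() materializes every row - about 5.7 million here - in driver memory)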
scala> rawData.collect()

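//To save the RDD to persistent storage such as HDFS
//NOTE: in an hdfs:// URI the part right after "//" is the NameNode authority, so
//"hdfs://linkage/dataset" means path /dataset on a host named "linkage". To write under
///linkage on the default NameNode use "hdfs:///linkage/<new-dir>" instead; the output
//directory must not already exist.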
scala> rawData.saveAsTextFile("hdfs://linkage/dataset")


//Getting top 5 rows of data
scala> val head = rawData.take(5)
head: Array[String] = Array("id_1","id_2","cmp_fname_c1","cmp_fname_c2","cmp_lname_c1","cmp_lname_c2","cmp_sex","cmp_bd","cmp_bm","cmp_by","cmp_plz","is_match", 37291,53113,0.833333333333333,?,1,?,1,1,1,1,0,TRUE, 39086,47614,1,?,1,?,1,1,1,1,1,TRUE, 70031,70237,1,?,1,?,1,1,1,1,1,TRUE, 84795,97439,1,?,1,?,1,1,1,1,1,TRUE)

//basic operations on top 5 rows
scala> head.length
res3: Int = 5

scala> head foreach println
"id_1","id_2","cmp_fname_c1","cmp_fname_c2","cmp_lname_c1","cmp_lname_c2","cmp_sex","cmp_bd","cmp_bm","cmp_by","cmp_plz","is_match"
37291,53113,0.833333333333333,?,1,?,1,1,1,1,0,TRUE
39086,47614,1,?,1,?,1,1,1,1,1,TRUE
70031,70237,1,?,1,?,1,1,1,1,1,TRUE
84795,97439,1,?,1,?,1,1,1,1,1,TRUE

//Filter the header out of the head data.
//For that, let's create a function that checks whether a tuple is the header or not.
//In our example the header row contains the "id_1" attribute name, so a substring check is enough.
scala> def isHeader(tuple: String) = tuple.contains("id_1")
isHeader: (tuple: String)Boolean

scala> head filter isHeader 
res3: Array[String] = Array("id_1","id_2","cmp_fname_c1","cmp_fname_c2","cmp_lname_c1","cmp_lname_c2","cmp_sex","cmp_bd","cmp_bm","cmp_by","cmp_plz","is_match")

//Get the data without header
scala> val dataWithoutHeader = head filterNot isHeader
dataWithoutHeader: Array[String] = Array(37291,53113,0.833333333333333,?,1,?,1,1,1,1,0,TRUE, 39086,47614,1,?,1,?,1,1,1,1,1,TRUE, 70031,70237,1,?,1,?,1,1,1,1,1,TRUE, 84795,97439,1,?,1,?,1,1,1,1,1,TRUE)

//The first 2 attributes of the tuple are the patient IDs, which are Ints,
//while the last attribute is a Boolean telling whether the patient records matched or not.
//The middle 9 attributes are the raw scores on which that conclusion is made.
//Let's create a case class to hold every tuple.
scala> case class Entry(id1: Int, id2:Int, scores: Seq[Double], matched: Boolean)
defined class Entry

//Let's define a parsing function that accepts a tuple (one comma-separated line)
//and converts it to an Entry object: fields 0 and 1 become the Int ids, fields 2 to 10
//become the scores and field 11 becomes the matched flag.
//Some of the score attributes are "?", which will be replaced with NaN.
//The header row must be filtered out first, because its quoted column names are not numbers.
scala> def parse (tuple: String) = {
     | val attributes = tuple.split(",")
     | val id1 = attributes(0).toInt
     | val id2 = attributes(1).toInt
     | val matched = attributes(11).toBoolean
     | val scores = attributes slice(2,11) map { x => if("?".equals(x)) Double.NaN else x.toDouble } toSeq
     | Entry(id1, id2, scores, matched)
     | }

parse: (tuple: String)Entry

scala> val entries: Seq[Entry] = dataWithoutHeader map parse toSeq
entries: Seq[Entry] = WrappedArray(Entry(37291,53113,WrappedArray(0.833333333333333, NaN, 1.0, NaN, 1.0, 1.0, 1.0, 1.0, 0.0),true), Entry(39086,47614,WrappedArray(1.0, NaN, 1.0, NaN, 1.0, 1.0, 1.0, 1.0, 1.0),true), Entry(70031,70237,WrappedArray(1.0, NaN, 1.0, NaN, 1.0, 1.0, 1.0, 1.0, 1.0),true), Entry(84795,97439,WrappedArray(1.0, NaN, 1.0, NaN, 1.0, 1.0, 1.0, 1.0, 1.0),true))

0 Comments

Basic Operations

Leave a Reply.

Archives

Categories