Blog Archives

Basic Data Analysis R

11/1/2015
> fileLocation <- "./dataset/Consumer_Complaints.csv"
> 
> #reading csv file
> data <- read.csv(fileLocation, header = TRUE)
> 
> #To get dimensions of the data
> dimData <- dim(data)
> print(dimData)
[1] 471871     16
> 
> #get number of rows
> rows <- nrow(data)
> print(rows)
[1] 471871
>
>#To get column names 
>colnames(data)
 [1] "Date.received"                "Product"                      "Sub.product"                 
 [4] "Issue"                        "Sub.issue"                    "Consumer.complaint.narrative"
 [7] "Company.public.response"      "Company"                      "State"                       
[10] "ZIP.code"                     "Submitted.via"                "Date.sent.to.company"        
[13] "Company.response.to.consumer" "Timely.response."             "Consumer.disputed."          
[16] "Complaint.ID" 
>
>#To get rowNames             
> rownames(data)
    [1] "1"     "2"     "3"     "4"     "5"     "6"     "7"     "8"     "9"     "10"    "11"    "12"    "13"   
   [14] "14"    "15"    "16"    "17"    "18"    "19"    "20"    "21"    "22"    "23"    "24"    "25"    "26"   
   [27] "27"    "28"    "29"    "30"    "31"    "32"    "33"    "34"    "35"    "36"    "37"    "38"    "39"   
   [40] "40"    "41"    "42" ......
>
>#To view summary of data. Output is not shown.
>summary <- summary(data)
>#print(summary)
>
>#To view a quick snapshot of the data structure (str() prints directly and returns NULL, so struct is NULL)
>struct <- str(data)
>#print(struct)
>
> #Inspect data using HEAD
> headData <- head(data)
> #print(headData)
>
> #To see the last few rows of your data, use tail()
> tailData <- tail(data)
> #print(tailData)
>
> #get headers in table
> headers <- names(data)
> print(headers)
 [1] "Date.received"                "Product"                      "Sub.product"                 
 [4] "Issue"                        "Sub.issue"                    "Consumer.complaint.narrative"
 [7] "Company.public.response"      "Company"                      "State"                       
[10] "ZIP.code"                     "Submitted.via"                "Date.sent.to.company"        
[13] "Company.response.to.consumer" "Timely.response."             "Consumer.disputed."          
[16] "Complaint.ID"                
> 
> #check if string is in headers (check is TRUE when the string is NOT found, FALSE when it is present)
> testString <- "Product"
> check <- is.na(match(testString, headers))
> print(check)
[1] FALSE
> 
> #pick a column 'Product' from the data
> productData <- data["Product"]
> print(head(productData))
           Product
1  Debt collection
2 Credit reporting
3  Money transfers
4  Debt collection
5  Debt collection
6 Credit reporting
> 
> 
> #subset data to have only first 3 columns
> #R indexes from 1, not 0. So your first column is at [1] and not [0]
> subsetData <- data[, 1:3]
> #print(subsetData)
> 
> #subset data to columns that aren't contiguous
> #or we can use the select() function from the dplyr package to achieve the same
> subsetData1 <- data[, c(1,3,5,7)]
> #print(subsetData1)
> 
> #Filter rows where product is Mortgage
> #or we can use the filter() function from the dplyr package to achieve the same
> subsetData2 <- data[data$Product == "Mortgage", ]
> #print(subsetData2)
> 
> #slicing data using subset(dataset, row filters, col selections)
> #Also, we can use the %>% pipe operator (formerly %.% in dplyr) to create an execution pipeline
> subsetData3 <- subset(data, Product == "Mortgage", 1:3)
> #print(subsetData3)
> 
> #check for NA values in productData
> chkProductData <- complete.cases(productData)
> summary(chkProductData)
   Mode    TRUE    NA's 
logical  471871       0 
> 
> #get only nonNA values
> productDataGood <- productData[chkProductData,]
> 
> #to get count of issues for every product kind
> countMatrix <- table(productDataGood)
> print(countMatrix)
productDataGood
Bank account or service           Consumer loan             Credit card        Credit reporting 
                  53460                   16880                   57374                   73234 
        Debt collection         Money transfers                Mortgage Other financial service 
                  83117                    2995                  166237                     352 
            Payday loan            Prepaid card            Student loan 
                   3190                    1157                   13875 
> 
> #Normalize table data (convert the per-state counts to proportions of the total)
> cm_state <- table(data["State"])
> total <- sum(cm_state)
> for(state in names(cm_state)){
+     cm_state[state] = cm_state[state] / total
+ }
> print(cm_state)
>
> #default mean function returns NA when there is any NA in the numerical
> #data. So use na.rm = TRUE to tell function to exclude NAs
> mean(data$numeric_column, na.rm = TRUE)
>
> 
> #computing combinations
> choose(5,2)
[1] 10
> 
> #how many ways to combine fruits into group of 2
> fruits <- c("apple", "mango", "banana", "grapes", "plum")
> combn(fruits, 2)
     [,1]    [,2]     [,3]     [,4]    [,5]     [,6]     [,7]    [,8]     [,9]     [,10]   
[1,] "apple" "apple"  "apple"  "apple" "mango"  "mango"  "mango" "banana" "banana" "grapes"
[2,] "mango" "banana" "grapes" "plum"  "banana" "grapes" "plum"  "grapes" "plum"   "plum"  
> 
>#To inspect the data in depth, use the describe function from the psych package.
>#This gives statistics like MAD, skew and kurtosis along with mean, median and standard deviation
>install.packages("psych")
>library(psych)
>describe(data)
0 Comments
Basic Data Analysis R

Archives

Categories