# Exploring the Consumer Complaints dataset with base R.
#
# This was originally captured as an interactive console session. The prompts
# have been removed so the file can be sourced, and the output recorded in
# that session is kept in `#>` comments below the call that produced it.

fileLocation <- "./dataset/Consumer_Complaints.csv"

# Read the CSV file.
data <- read.csv(fileLocation, header = TRUE)

# Get the dimensions of the data.
dimData <- dim(data)
print(dimData)
#> [1] 471871     16

# Get the number of rows.
rows <- nrow(data)
print(rows)
#> [1] 471871

# Get the column names.
print(colnames(data))
#>  [1] "Date.received"                "Product"                      "Sub.product"
#>  [4] "Issue"                        "Sub.issue"                    "Consumer.complaint.narrative"
#>  [7] "Company.public.response"      "Company"                      "State"
#> [10] "ZIP.code"                     "Submitted.via"                "Date.sent.to.company"
#> [13] "Company.response.to.consumer" "Timely.response."             "Consumer.disputed."
#> [16] "Complaint.ID"

# Get the row names. Only the first 42 are printed: the original session
# printed every row name and its recorded output was cut off after "42".
print(head(rownames(data), 42))
#>  [1] "1"  "2"  "3"  "4"  "5"  "6"  "7"  "8"  "9"  "10" "11" "12" "13"
#> [14] "14" "15" "16" "17" "18" "19" "20" "21" "22" "23" "24" "25" "26"
#> [27] "27" "28" "29" "30" "31" "32" "33" "34" "35" "36" "37" "38" "39"
#> [40] "40" "41" "42"

# View a summary of the data. Output is not shown.
# Stored as `dataSummary` so the base `summary()` generic is not masked.
dataSummary <- summary(data)
# print(dataSummary)

# View a quick snapshot of the data structure.
# str() only prints and returns NULL, so its text is captured to keep it
# available in `struct`, then echoed.
struct <- capture.output(str(data))
writeLines(struct)

# Inspect the first few rows with head().
headData <- head(data)
# print(headData)

# To see the last few rows of your data, use tail().
tailData <- tail(data)
# print(tailData)

# Get the headers of the table.
headers <- names(data)
print(headers)
#>  [1] "Date.received"                "Product"                      "Sub.product"
#>  [4] "Issue"                        "Sub.issue"                    "Consumer.complaint.narrative"
#>  [7] "Company.public.response"      "Company"                      "State"
#> [10] "ZIP.code"                     "Submitted.via"                "Date.sent.to.company"
#> [13] "Company.response.to.consumer" "Timely.response."             "Consumer.disputed."
#> [16] "Complaint.ID"

# Check whether a string is one of the headers.
# The original session computed is.na(match(testString, headers)), which is
# TRUE when the header is *absent*, and therefore printed FALSE here.
testString <- "Product"
check <- testString %in% headers
print(check)
#> [1] TRUE

# Pick the column "Product" from the data (single-bracket indexing keeps a
# one-column data frame).
productData <- data["Product"]
print(head(productData))
#>            Product
#> 1  Debt collection
#> 2 Credit reporting
#> 3  Money transfers
#> 4  Debt collection
#> 5  Debt collection
#> 6 Credit reporting

# Subset the data to only the first 3 columns.
# R indexes from 1, not 0, so the first column is at [1] and not [0].
subsetData <- data[, 1:3]
# print(subsetData)

# Subset the data to columns that aren't contiguous.
# dplyr's select() achieves the same.
subsetData1 <- data[, c(1, 3, 5, 7)]
# print(subsetData1)

# Filter rows where the product is Mortgage.
# dplyr's filter() achieves the same.
subsetData2 <- data[data$Product == "Mortgage", ]
# print(subsetData2)

# Slice the data using subset(dataset, row filter, column selection).
# A dplyr/magrittr pipeline (%>%, formerly %.%) can express the same steps.
subsetData3 <- subset(data, Product == "Mortgage", 1:3)
# print(subsetData3)

# Check for NA values in productData.
chkProductData <- complete.cases(productData)
print(summary(chkProductData))
#>    Mode    TRUE    NA's
#> logical  471871       0

# Keep only the non-NA values (a one-column data frame drops to a vector).
productDataGood <- productData[chkProductData, ]

# Count the complaints for every kind of product.
countMatrix <- table(productDataGood)
print(countMatrix)
#> productDataGood
#> Bank account or service           Consumer loan             Credit card        Credit reporting
#>                   53460                   16880                   57374                   73234
#>         Debt collection         Money transfers                Mortgage Other financial service
#>                   83117                    2995                  166237                     352
#>             Payday loan            Prepaid card            Student loan
#>                    3190                    1157                   13875

# Normalize the table data so the state counts become proportions.
# Dividing the whole table at once replaces a per-name loop; besides being
# shorter, it also covers a blank ("") state level, which a character
# subscript would never match.
cm_state <- table(data["State"])
total <- sum(cm_state)
cm_state <- cm_state / total
print(cm_state)

# The default mean() returns NA when there is any NA in the numeric data,
# so use na.rm = TRUE to tell the function to exclude NAs.
# NOTE: `numeric_column` is a placeholder; none of the 16 columns listed
# above has that name, so substitute a real numeric column before running.
mean(data$numeric_column, na.rm = TRUE)

# Computing combinations.
print(choose(5, 2))
#> [1] 10

# How many ways are there to combine the fruits into groups of 2?
fruits <- c("apple", "mango", "banana", "grapes", "plum")
print(combn(fruits, 2))
#>      [,1]    [,2]     [,3]     [,4]    [,5]     [,6]     [,7]    [,8]     [,9]     [,10]
#> [1,] "apple" "apple"  "apple"  "apple" "mango"  "mango"  "mango" "banana" "banana" "grapes"
#> [2,] "mango" "banana" "grapes" "plum"  "banana" "grapes" "plum"  "grapes" "plum"   "plum"

# To inspect the data in depth, use the describe() function from the psych
# package. It reports MAD, skew and kurtosis along with mean, median and
# standard deviation.
# Install psych only when it is missing, rather than on every run.
if (!requireNamespace("psych", quietly = TRUE)) {
  install.packages("psych")
}
library(psych)
print(describe(data))
0 Comments
|
Archives
May 2016
Categories |