Data cleaning

Common Tasks

  • Data class
  • Creating dataframes
  • Updating column names
  • Combining columns
  • Transposing, merging, long/wide
  • Rownames/column names
  • Concatenating
  • Creating lists
  • Subsetting (indices, logic, which)
  • Dealing with NA values w/ logic
  • Strings
  • Functions
  • lapply

Basic background

  • dataframe$column
  • read.csv(“data.csv”)
  • write.csv(object, “object.csv”)
  • head()
  • str()
  • colnames()
  • summary()
  • unique()
  • class()
  • levels()
  • nrow(), length()
  • dataframe[row, column]

Packages

install.packages(c("lubridate",
                   "tidyr", 
                   "tibble"))
library(lubridate)
library(tidyr)
library(tibble)

Structure

as.Date, as.String, as.Numeric, as.Factor use class() to find type

class(sales$event_dt)
[1] "factor"
sales$event_dt <- as.Date(sales$event_dt)
class(sales$event_dt)
[1] "Date"

Splitting columns

sales$event_dt[1:3]
[1] "2015-09-12" "2009-09-04" "2006-04-21"
strsplit(as.character(sales$event_dt[1:3]), "-")
[[1]]
[1] "2015" "09"   "12"  

[[2]]
[1] "2009" "09"   "04"  

[[3]]
[1] "2006" "04"   "21"  

Creating new dataframes

new.df <- data.frame(matrix(nrow=5, ncol=3))

Updating column names

colnames(new.df) <- c("column.1", "column.2", "column.3")
new.df$column.1 <- c(1,2,3,4,5)
new.df$column.2 <- c("a", "b", "c", "d", "e")
new.df
  column.1 column.2 column.3
1        1        a       NA
2        2        b       NA
3        3        c       NA
4        4        d       NA
5        5        e       NA

Renaming singular column

colnames(new.df[colnames(new.df) == "column.2"]) 
        <- "whatever"

Renaming data points

PSI$variable[PSI$variable=="so2_twenty_four"] <- "so2_24"

Combining columns

new.df$column.3 <- paste(new.df$column.1,
                new.df$column.2, sep="")
new.df$column.3
[1] "1a" "2b" "3c" "4d" "5e"

Long/wide

long <- gather(new.df, key, value)
long[1:3,]
       key value
1 column.1     1
2 column.1     2
3 column.1     3
long$ID <- rep(seq(1:5),3)
wide <- spread(long, key, value)
wide[1:3,2:4]
  column.1 column.2 column.3
1        1        a       1a
2        2        b       2b
3        3        c       3c

Transposing

t() returns a nested list. A dataframe must be specified if you want a dataframe.

new.df.t <- as.data.frame(t(new.df))
new.df.t
         V1 V2 V3 V4 V5
column.1  1  2  3  4  5
column.2  a  b  c  d  e
column.3 1a 2b 3c 4d 5e

Rownames/colnames

require(tibble)
rownames_to_column(new.df.t)
   rowname V1 V2 V3 V4 V5
1 column.1  1  2  3  4  5
2 column.2  a  b  c  d  e
3 column.3 1a 2b 3c 4d 5e

Concatenating dataframes

new.df.2 <- new.df
rbind(new.df, new.df.2)
   column.1 column.2 column.3
1         1        a       1a
2         2        b       2b
3         3        c       3c
4         4        d       4d
5         5        e       5e
6         1        a       1a
7         2        b       2b
8         3        c       3c
9         4        d       4d
10        5        e       5e

Creating new lists

mylist <- rep(NA, nrow(sales)/500)

mylist
 [1] NA NA NA NA NA NA NA NA NA NA

Subsetting by indices

R makes use of the [row,column] notation

sales[5:10, c(3,9)]
         primary_act_id major_cat_name
5  91c03a34b562436efa3c           MISC
6  ac4b847b3fde66f2117e           MISC
7  a14232befff04be1e2f3           MISC
8  91c03a34b562436efa3c           MISC
9  0efaba7ce3f0d7466b42           MISC
10 f6425a3223e73ea6de5a       CONCERTS

Subsetting by >=, ==, <=, !=

R accepts logical statements within a [row, column] subsetting argument

sales[sales$age_yr>70 & !is.na(sales$age_yr),
      c(34,14,15)]
     age_yr tickets_purchased_qty trans_face_val_amt
11       80                     1                 20
428      82                     1                 20
1220     80                     1                 20
1314     76                     1                 30
1701     72                     2                 30
3830     72                     1                 22
4188     90                     1                 15
4328     94                     1                  9
4975     78                     1                 20

Subsetting by %in%

sales[sales$venue_state %in% c("RHODE ISLAND", 
                               "MANITOBA"), c(18,26)]
       event_dt  venue_state
732  2015-08-08 RHODE ISLAND
1860 2015-09-17     MANITOBA
2046 2016-02-19     MANITOBA
2177 2015-12-16 RHODE ISLAND
2832 2015-09-17     MANITOBA
3304 2015-09-17     MANITOBA
4064 2016-01-27     MANITOBA
4119 2015-09-17     MANITOBA
4308 2015-11-16     MANITOBA
4935 2016-01-15     MANITOBA

Subsetting by which

If you have 500 columns, you may not know which column index to subset by in the previous example. Here the age column is extracted using which.

sales[sales$age_yr>70 & !is.na(sales$age_yr),
      c(which(colnames(sales) %in% 
                c("age_yr","tickets_purchased_qty")))]
     tickets_purchased_qty age_yr
11                       1     80
428                      1     82
1220                     1     80
1314                     1     76
1701                     2     72
3830                     1     72
4188                     1     90
4328                     1     94
4975                     1     78

Which.max, which.min

which.max and which.min are useful for removing known outliers.

sales[-c(which.max(sales$income_amt)),]

Merging

merged <- merge(x1, x2, by.x="column.x", by.y="column.y")

Reclassifying

levels(cut(sales$trans_face_val_amt, 5))
[1] "(-0.52,305]"         "(305,609]"           "(609,913]"          
[4] "(913,1.22e+03]"      "(1.22e+03,1.52e+03]"

Removing NA values

new.df[2,3] <- NA
new.df[1,2] <- NA
new.df[1,3] <- NA
new.df
  column.1 column.2 column.3
1        1     <NA>     <NA>
2        2        b     <NA>
3        3        c       3c
4        4        d       4d
5        5        e       5e

Na.omit

na.omit(new.df)
  column.1 column.2 column.3
3        3        c       3c
4        4        d       4d
5        5        e       5e

Column NAs

new.df[, colSums(is.na(new.df)) <= 1]
  column.1 column.2
1        1     <NA>
2        2        b
3        3        c
4        4        d
5        5        e

Row NAs

new.df[rowSums(is.na(new.df)) <= 1,]
  column.1 column.2 column.3
2        2        b     <NA>
3        3        c       3c
4        4        d       4d
5        5        e       5e

Converting date/time

lubridate package

year()
month()
day()
week()
as_date()
as_datetime()
time_length()

Lubridate examples

library(lubridate)
dates <- as.Date(sales$event_dt[1:3])
dates
[1] "2015-09-12" "2009-09-04" "2006-04-21"
year(dates)
[1] 2015 2009 2006
month(dates)
[1] 9 9 4
week(dates)
[1] 37 36 16
day(dates)
[1] 12  4 21
print(wday(dates, label=TRUE), max.levels=0)
[1] Sat Fri Fri

More examples

floor_date(dates, "month")
[1] "2015-09-01" "2009-09-01" "2006-04-01"
ceiling_date(dates, "season")
[1] "2015-12-01" "2009-12-01" "2006-06-01"
date1 <- "2009-08-03 12:01:59"
as.Date(date1) # uh-oh
[1] "2009-08-03"
as_datetime(date1)
[1] "2009-08-03 12:01:59 UTC"

Strings

grepl returns a logical TRUE/FALSE

files <- files[grepl('name', files) == TRUE]
gsub("hello","goodbye",files)

Creating functions

myfunction <- function(x) {
  z <- x + 1
  return(z)
}

myfunction(3)
[1] 4

Function to calculate percentage of NA in columns

calc.na <- function(x, data) {
    calc <- sum(is.na(data[[x]])/nrow(data))
    return(unlist(calc))
}

calc.na("onsale_dt", sales)
[1] 0.0202

What about the amount of every column?

Option 1: For loop

Problem 1: results duplicated Problem 2: returns list indices as column names, which cannot be iterated over or used in the future.

results <- rep(NA, length(colnames(sales)))

for (i in colnames(sales)) {
  results[i] <- calc.na(i, sales)
}

results[60]
tickets_purchased_qty 
                    0 

Better for loop

results <- rep(NA, length(colnames(sales)))

for (i in c(1:(length(colnames(sales))))) {
  results[i] <- calc.na(colnames(sales)[i], sales)
}

results
 [1] 0.0000 0.0000 0.0000 0.0000 0.0000 0.0000 0.0000 0.6828 0.0000 0.0000
[11] 0.0000 0.0000 0.0000 0.0000 0.0000 0.0000 0.0000 0.0000 0.5784 0.0202
[21] 0.0008 0.0000 0.0848 0.0000 0.0000 0.0000 0.0000 0.4842 0.0000 0.0000
[31] 0.0000 0.0000 0.9646 0.9664 0.9692 0.9722 0.9776 0.9828 0.9648 0.9686
[41] 0.9740 0.9724 0.9842 0.9846 0.9874 0.9354

Option 2: lapply

unlist(lapply(colnames(sales), calc.na, sales))
 [1] 0.0000 0.0000 0.0000 0.0000 0.0000 0.0000 0.0000 0.6828 0.0000 0.0000
[11] 0.0000 0.0000 0.0000 0.0000 0.0000 0.0000 0.0000 0.0000 0.5784 0.0202
[21] 0.0008 0.0000 0.0848 0.0000 0.0000 0.0000 0.0000 0.4842 0.0000 0.0000
[31] 0.0000 0.0000 0.9646 0.9664 0.9692 0.9722 0.9776 0.9828 0.9648 0.9686
[41] 0.9740 0.9724 0.9842 0.9846 0.9874 0.9354

Function - example 2

toMatch <- c("x", "y", "z", "...")
sentences <- c("sentence 1", "sentence 2", "...")

subset_sentences <- function(Match, sentences){
    sentences[grep(Match,sentences)]
}

subsetted <- lapply(toMatch, subset_sentences, sentences)

[[]] operator

adjust_conc <- function(data, col, amt, orig) {
  update <- (data[[col]] * sd(orig$Concentration)) + 
    mean(orig$Concentration)
  update <- update * amt
  update <- (update - mean(orig$Concentration))/
    sd(orig$Concentration)
return(update)
}

Example

Example of creating list of dates to loop over

dates <- seq(ymd_hms('2018-03-08 00:00:00'), 
             ymd_hms('2018-03-12 23:00:00'), 
             by="1 hour")

dates <- as.character(dates)

for (i in seq_along(dates)) {
  dates[i] <- gsub(" ", "T", dates[i])
}

dates[1:5]
[1] "2018-03-08T00:00:00" "2018-03-08T01:00:00" "2018-03-08T02:00:00"
[4] "2018-03-08T03:00:00" "2018-03-08T04:00:00"

Read in RDS

Data is saved as an RDS because it is a recursively nested list

require(lubridate)
require(tidyr)
weather <- readRDS("data/scraped_data.rds")
class(weather)
[1] "matrix"

Cleaning

weather <- unlist(rbind(weather[lapply(weather, 
                  length)>0]), 
                  recursive=FALSE)
weather.times <- weather[seq_along(weather) %% 2 > 0]
weather.times <- unlist(weather.times)

weather.readings <- weather[seq_along(weather) %% 2 == 0]
for (i in c(1:length(weather.readings))) {
  weather.readings[[i]][[1]][3] <- i
}

weather.readings <- do.call("rbind", weather.readings)
weather.readings <- do.call("rbind", weather.readings)

Continued

weather.readings <- spread(weather.readings, 
                           station_id, value)
weather.times.df <- as.data.frame(seq
                    (1, length(weather.times)))
weather.times.df$date <- weather.times
colnames(weather.times.df)[1] <- "V3"
weather.readings <- merge(weather.readings, 
                          weather.times.df, by="V3")
colnames(weather.readings)[19] <- "date"

Join metadata

weather.metadata <- read.csv("data/weather.metadata.csv")
weather.melted <- gather(weather.readings, key, 
                         value, -date, -V3)

weather.joined <- merge(weather.melted, 
                        weather.metadata, 
                        by.x="key", by.y="id")

weather.joined$date <- unlist(weather.joined$date)
weather.joined$day <- as_date(weather.joined$date)

Done!

weather.joined[1:5,c(10,4,8,9)]
         day value location.latitude location.longitude
1 2018-01-04  29.3            1.4172           103.7485
2 2018-01-02    NA            1.4172           103.7485
3 2018-01-03  25.7            1.4172           103.7485
4 2018-01-03  25.5            1.4172           103.7485
5 2018-01-04  29.3            1.4172           103.7485