Data Consistency

Aim of the Unit

This Unit aims to handle missing values and outliers

Missing values

Delete rows containing missing values

Missing values appear as NA. Note that NA is not a string, but a reserved logical constant that marks a missing value. You have several options for dealing with NA values.

Function Description
na.omit() Row-wise deletion of missing values
na.fail() Returns the object only if no missing values are present; otherwise signals an error
na.action=na.omit Common option in many functions, like linear model.
is.na Test for NA values and gives TRUE when NA occurs.

An example of the use of na.omit is described below. For a given dataset:

# Make group factor and a response variable with 2 NA:
x <- sample(1:3, 10, replace = TRUE)# Randomly drawn "1", "2" and "3"  
y <- runif(10)                      # Randomly uniform population
y[sample(10, 2)] = NA               # Randomly place 2 NA
# Create a dataframe
dat <- data.frame(Class = x, A = y, B = sample(y))  
round(dat, digit=3)
##    Class     A     B
## 1      1 0.694 0.242
## 2      1    NA 0.694
## 3      3 0.738    NA
## 4      2 0.980 0.865
## 5      3 0.956 0.980
## 6      1    NA 0.106
## 7      2 0.242 0.956
## 8      1 0.898 0.898
## 9      2 0.106 0.738
## 10     3 0.865    NA
# Delete rows with missing values
round(na.omit(dat), digit=3)
##   Class     A     B
## 1     1 0.694 0.242
## 4     2 0.980 0.865
## 5     3 0.956 0.980
## 7     2 0.242 0.956
## 8     1 0.898 0.898
## 9     2 0.106 0.738

Replace NA with Column Mean

Given a dataset with some NA:

# make a group variable and a response with 2 NA:
group = sample(1:3, 10, replace = TRUE)  
y = runif(10)              # Make 10 data from a uniform population
y[sample(10, 2)] = NA      # Place 2 NA's randomly
df = data.frame(Class = group, A = y)  # Create the dataframe
round(df, digit=3)
##    Class     A
## 1      1 0.072
## 2      1 0.815
## 3      2    NA
## 4      3 0.044
## 5      1 0.618
## 6      2 0.889
## 7      2 0.018
## 8      3 0.528
## 9      3    NA
## 10     1 0.045

Replace NA’s with the column mean:

df <- sapply(df, function(x) replace(x, is.na(x), mean(x, na.rm = TRUE)))
round(df, digit=3)
##       Class     A
##  [1,]     1 0.072
##  [2,]     1 0.815
##  [3,]     2 0.379
##  [4,]     3 0.044
##  [5,]     1 0.618
##  [6,]     2 0.889
##  [7,]     2 0.018
##  [8,]     3 0.528
##  [9,]     3 0.379
## [10,]     1 0.045

Replace NA with row mean

Given a dataset with some NA:

# Make a group factor and a response; the data frame ends up with 2 NA:
group = as.factor(sample(1:3, 10, replace = TRUE))
y = runif(10)              # Make 10 data from a uniform population
y[sample(10, 1)] = NA      # Place 1 NA randomly in y
# B is a shuffled, rescaled copy of y, so it carries the same single NA
# (possibly in a different row than in A)
df = data.frame(Class = group, A = y, B = sample(y)*0.1)  # Create the dataframe
df
##    Class           A            B
## 1      1 0.406139004 0.0226243353
## 2      2          NA 0.0424522116
## 3      3 0.005966132 0.0819394313
## 4      1 0.819394313 0.0005966132
## 5      3 0.424522116           NA
## 6      1 0.902260807 0.0902260807
## 7      2 0.417519874 0.0093316971
## 8      2 0.196729045 0.0417519874
## 9      1 0.093316971 0.0196729045
## 10     1 0.226243353 0.0406139004

Replace NA’s with the row mean:

# Make the (row, column) coordinates where NA is located:
k <- which(is.na(df), arr.ind=TRUE)
# Substitute the NA cells with the row mean: the means are computed for
# every row over all columns but the first one (df[-1] drops Class), then
# k[,1] picks the means of the rows that hold an NA:
df[k] <- apply(df[-1], 1, function(x) mean(x, na.rm=TRUE))[k[,1]]

Replace NA by Group Means

Given a dataset with some NA:

# make a group variable and a response with 2 NA:
group = sample(1:3, 10, replace = TRUE)  
y = runif(10)              # Make 10 data from a uniform population
y[sample(10, 2)] = NA      # Place 2 NA's randomly
df = data.frame(Class = group, A = y)  # Create the dataframe
df
##    Class          A
## 1      2 0.11693989
## 2      3 0.86271803
## 3      1         NA
## 4      1 0.05045190
## 5      1 0.88332948
## 6      3 0.84929724
## 7      1 0.81243776
## 8      1 0.06677673
## 9      3 0.06318002
## 10     3         NA

Replace NA’s with group means:

df$A <- with(df, ave(x = A, Class,
                 FUN = function(x) replace(x, is.na(x), mean(x, na.rm = TRUE))))
round(df, digit=3)
##    Class     A
## 1      2 0.117
## 2      3 0.863
## 3      1 0.453
## 4      1 0.050
## 5      1 0.883
## 6      3 0.849
## 7      1 0.812
## 8      1 0.067
## 9      3 0.063
## 10     3 0.592

Replacement of NA’s with column means

# Make data.frame with 2 NA in each response column:
group = sample(1:3, 10,        # Make 10 data randomly drawn
               replace = TRUE) # from 1, 2 and 3  
param = runif(10)              # Make 10 data from a uniform population
param[sample(10, 2)] = NA      # Place 2 NA's randomly
# B is a shuffled copy of param, so it holds the same 2 NA's in other rows
df = data.frame(Class = group, A = param, B = sample(param))  # Create dataframe
round(df, digit=3)
##    Class     A     B
## 1      3 0.925 0.820
## 2      1 0.141 0.363
## 3      2 0.201 0.005
## 4      1    NA 0.201
## 5      1    NA    NA
## 6      2 0.005 0.392
## 7      3 0.820    NA
## 8      3 0.392 0.917
## 9      2 0.917 0.925
## 10     3 0.363 0.141

Then, apply to each column the replace() function:

sapply(df, FUN = function(x) replace(x, is.na(x), mean(x, na.rm = TRUE)))
##       Class           A           B
##  [1,]     3 0.924633779 0.819683624
##  [2,]     1 0.140972989 0.362995645
##  [3,]     2 0.200528346 0.004579756
##  [4,]     1 0.470234591 0.200528346
##  [5,]     1 0.470234591 0.470234591
##  [6,]     2 0.004579756 0.391510678
##  [7,]     3 0.819683624 0.470234591
##  [8,]     3 0.391510678 0.916971910
##  [9,]     2 0.916971910 0.924633779
## [10,]     3 0.362995645 0.140972989

Columnwise replacement of NA’s with column means grouped by class

# Make data.frame with 2 NA in each response column:
group <- sample(1:3, 10,        # Make 10 data randomly drawn
               replace = TRUE) # from 1, 2 and 3  
param <- runif(10)              # Make 10 data from a uniform population
param[sample(10, 2)] = NA      # Place 2 NA's randomly
# Create dataframe (B is a shuffled copy of param, so it also holds 2 NA's)
df <- data.frame(Class = group, A = param, B = sample(param))  
round(df, digit=3)
##    Class     A     B
## 1      3 0.179    NA
## 2      2 0.125 0.179
## 3      2 0.295    NA
## 4      3 0.972 0.125
## 5      3 0.919 0.295
## 6      3 0.033 0.972
## 7      2    NA 0.812
## 8      2 0.812 0.033
## 9      3 0.964 0.919
## 10     1    NA 0.964

The approach here is, first, to split the dataframe into a list with one element for each column:

df.list <- lapply(df[,2:3], c)
df.list
## $A
##  [1] 0.17887657 0.12534759 0.29506692 0.97190419 0.91926574 0.03277886
##  [7]         NA 0.81185843 0.96388622         NA
## 
## $B
##  [1]         NA 0.17887657         NA 0.12534759 0.29506692 0.97190419
##  [7] 0.81185843 0.03277886 0.91926574 0.96388622

Now, we can apply to each list (by tapply()) the function replace(x, is.na(x), FUN):

# For every column in df.list, split its values by df$Class (tapply) and,
# within each class, replace the NA's with the mean of the non-missing
# values of that class. A class whose values are all NA gets NaN, because
# the mean of an empty set is NaN.
df.tapply <- lapply(df.list, function(x) tapply(x, df$Class, function(x) 
                                               replace(x, is.na(x), 
                                               mean(x, na.rm = TRUE))))

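Finally, we can combine the results in a matrix. Note that the rows are now ordered by class (the row names are the class followed by the position within the class) and that a class containing only NA’s yields NaN: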

sapply(df.tapply,unlist)
##             A          B
## 1         NaN 0.96388622
## 21 0.12534759 0.17887657
## 22 0.29506692 0.34117129
## 23 0.41075765 0.81185843
## 24 0.81185843 0.03277886
## 31 0.17887657 0.57789611
## 32 0.97190419 0.12534759
## 33 0.91926574 0.29506692
## 34 0.03277886 0.97190419
## 35 0.96388622 0.91926574

Outlier removal

Let’s build some data with at least one outlier:

dat <- rnorm(n = 10, mean = 10, sd = 1)
dat.out <- data.frame(A = replace(dat, 2, 14)); dat.out
##            A
## 1  11.306102
## 2  14.000000
## 3   9.928638
## 4   9.596242
## 5   9.243622
## 6   9.377714
## 7   8.705887
## 8  10.118461
## 9   8.218649
## 10 11.098094

Remove outliers

Outliers can be defined as values that fall outside a specific threshold. Such a threshold can be any arbitrary value, like 3 times the standard deviation, or 1.5 or 3 times the interquartile range (IQR) beyond the quartiles. The examples below use 1.5 times the IQR.

For instance, let’s build some data:

X <- c(3,4,3,2,14,5)

To test the presence of outliers in a vector, determine the difference between the first and third quartiles. This is given by:

low = quantile(X)[2]
upp = quantile(X)[4]
iqr = upp - low

Now, we can test if there is any value that is higher than the third quartile plus 1.5 times the iqr. The following script gives the outlier:

X[which(X > (iqr * 1.5) + upp)]
## [1] 14

The same procedure can be transferred into a function:

# Define a function to find outliers with the 1.5 * IQR rule.
#
# Arguments:
#   data    data.frame (or matrix) holding the values to test
#   id.col  index or name of the column to test
# Value:
#   integer vector with the positions (rows) of the values lying more than
#   1.5 * IQR above the third quartile or below the first quartile;
#   integer(0) when there is none. Missing values are ignored.
FindOutliers <- function(data, id.col) {
  values <- data[, id.col]
  # Compute both quartiles in a single call; na.rm = TRUE keeps quantile()
  # from stopping with an error when the column contains NA
  quartiles <- quantile(values, probs = c(0.25, 0.75), na.rm = TRUE,
                        names = FALSE)
  lowerq <- quartiles[1]
  upperq <- quartiles[2]
  iqr <- upperq - lowerq # Or use IQR(values, na.rm = TRUE)
  # Thresholds beyond which a value is considered an outlier
  threshold.upper <- (iqr * 1.5) + upperq
  threshold.lower <- lowerq - (iqr * 1.5)
  # which() drops the NA's produced by comparing missing values; being the
  # last expression, the result is returned visibly
  which(values > threshold.upper | values < threshold.lower)
}

Let’s apply such function to our dataset:

# use the function to identify outliers
out <- FindOutliers(dat.out, 1); out
## [1] 2

Remove the outliers from the original dataset:

dat.out[-out,1]
## [1] 11.306102  9.928638  9.596242  9.243622  9.377714  8.705887 10.118461
## [8]  8.218649 11.098094

Replace outliers columnwise

What follows is a function by Klodian Dhana that I found in this post: https://www.r-bloggers.com/identify-describe-plot-and-remove-the-outliers-from-the-dataset/

# Identify, describe, plot and remove the outliers of one variable.
#
# Arguments:
#   dt   data.frame holding the variable
#   var  unquoted name of the column of dt to check
# Details:
#   Outliers are the values reported by boxplot.stats()$out. A 2 x 2 panel
#   with boxplots and histograms, with and without the outliers, is drawn
#   and a summary is printed with cat(), one item per line.
# Value (invisible):
#   dt with the outliers of var replaced by NA. As a side effect, the object
#   passed as dt is overwritten in the global environment.
outlierKD <- function(dt, var) {
     # Evaluate the unquoted column name inside the data frame
     var_name <- eval(substitute(var), eval(dt))
     na1 <- sum(is.na(var_name))
     m1 <- mean(var_name, na.rm = TRUE)
     # Set up the plotting panel and restore the previous graphical
     # parameters when the function exits
     old_par <- par(mfrow = c(2, 2), oma = c(0, 0, 3, 0))
     on.exit(par(old_par), add = TRUE)
     boxplot(var_name, main = "With outliers")
     hist(var_name, main = "With outliers", xlab = NA, ylab = NA)
     outlier <- boxplot.stats(var_name)$out
     mo <- mean(outlier)
     # Replace every value flagged as an outlier with NA
     var_name <- ifelse(var_name %in% outlier, NA, var_name)
     boxplot(var_name, main = "Without outliers")
     hist(var_name, main = "Without outliers", xlab = NA, ylab = NA)
     title("Outlier Check", outer = TRUE)
     # The NA's added by the replacement above are the outliers
     na2 <- sum(is.na(var_name))
     cat("Outliers identified:", na2 - na1, "\n")
     cat("Propotion (%) of outliers:", round((na2 - na1) / sum(!is.na(var_name))*100, 1), "\n")
     cat("Mean of the outliers:", round(mo, 2), "\n")
     m2 <- mean(var_name, na.rm = TRUE)
     cat("Mean without removing outliers:", round(m1, 2), "\n")
     cat("Mean if we remove outliers:", round(m2, 2), "\n")
     # The interactive prompt is disabled: the answer is fixed to "yes"
     response <- "yes"
#readline(prompt="Do you want to remove outliers and to replace with NA? [yes/no]: ")
     if (response == "y" || response == "yes") {
          dt[as.character(substitute(var))] <- invisible(var_name)
          # Overwrite the caller's object (looked up by its name in the
          # call) in the global environment
          assign(as.character(as.list(match.call())$dt), dt, envir = .GlobalEnv)
          cat("Outliers successfully removed", "\n")
          return(invisible(dt))
     } else {
          cat("Nothing changed", "\n")
          return(invisible(var_name))
     }
}

The function can be run by passing the dataset and the (unquoted) name of the variable to test. Let’s build some data with at least one outlier:

y <- rnorm(n = 10, mean = 10, sd = 1)
O <- data.frame(A = replace(y, 2, 14)); O
##            A
## 1  10.401193
## 2  14.000000
## 3   9.658929
## 4  11.126003
## 5   9.624690
## 6  11.123231
## 7  10.521865
## 8   8.996620
## 9  10.577148
## 10  8.097707

Now, apply the function:

outlierKD(O, A); O

## Outliers identified: 1 nPropotion (%) of outliers: 11.1 nMean of the outliers: 14 nMean without removing outliers: 10.41 nMean if we remove outliers: 10.01 nOutliers successfully removed n
##            A
## 1  10.401193
## 2         NA
## 3   9.658929
## 4  11.126003
## 5   9.624690
## 6  11.123231
## 7  10.521865
## 8   8.996620
## 9  10.577148
## 10  8.097707