# Aim of the Unit

This Unit aims to handle missing values and outliers

# Missing values

## Delete rows containing missing values

Missing values appear as NA. Note that NA is not a string but a reserved constant that marks a missing value. There are several options for dealing with NA values:

| Function | Description |
|---|---|
| `na.omit()` | Row-wise deletion of missing values |
| `na.fail()` | Returns the object only if it contains no missing values; otherwise signals an error |
| `na.action = na.omit` | Common option in many functions, such as the linear model `lm()` |
| `is.na()` | Tests for NA values and gives TRUE wherever an NA occurs |

An example of the use of na.omit() is described below. For a given dataset:

# Build a grouping variable and a response containing 2 NA:
x <- sample(1:3, 10, replace = TRUE)  # 10 draws from the classes 1, 2 and 3
y <- runif(10)                        # 10 draws from a uniform distribution
y[sample(10, 2)] <- NA                # Overwrite 2 random positions with NA
# Combine into a data frame; B is a shuffled copy of A (its NAs included)
dat <- data.frame(Class = x, A = y, B = sample(y))
round(dat, digits = 3)
##    Class     A     B
## 1      1 0.694 0.242
## 2      1    NA 0.694
## 3      3 0.738    NA
## 4      2 0.980 0.865
## 5      3 0.956 0.980
## 6      1    NA 0.106
## 7      2 0.242 0.956
## 8      1 0.898 0.898
## 9      2 0.106 0.738
## 10     3 0.865    NA
# Drop every row that contains at least one NA, then round for display
round(na.omit(dat), digits = 3)
##   Class     A     B
## 1     1 0.694 0.242
## 4     2 0.980 0.865
## 5     3 0.956 0.980
## 7     2 0.242 0.956
## 8     1 0.898 0.898
## 9     2 0.106 0.738

## Replace NA with Column Mean

Given a dataset with some NA:

# Build a grouping variable and a response containing 2 NA:
group <- sample(1:3, 10, replace = TRUE)
y <- runif(10)               # 10 draws from a uniform distribution
y[sample(10, 2)] <- NA       # Overwrite 2 random positions with NA
df <- data.frame(Class = group, A = y)  # Assemble the data frame
round(df, digits = 3)
##    Class     A
## 1      1 0.072
## 2      1 0.815
## 3      2    NA
## 4      3 0.044
## 5      1 0.618
## 6      2 0.889
## 7      2 0.018
## 8      3 0.528
## 9      3    NA
## 10     1 0.045

Replace NA's with the column mean:

# Replace the NAs of every column (Class included) with that column's mean.
# Assigning into `df[]` keeps `df` a data frame; `sapply()` would collapse
# the result into a plain numeric matrix.
df[] <- lapply(df, function(x) replace(x, is.na(x), mean(x, na.rm = TRUE)))
round(df, digits = 3)
##    Class     A
## 1      1 0.072
## 2      1 0.815
## 3      2 0.379
## 4      3 0.044
## 5      1 0.618
## 6      2 0.889
## 7      2 0.018
## 8      3 0.528
## 9      3 0.379
## 10     1 0.045

## Replace NA with row mean

Given a dataset with some NA:

# Build a grouping factor and two response columns, each containing 1 NA:
group <- as.factor(sample(1:3, 10, replace = TRUE))
y <- runif(10)               # 10 draws from a uniform distribution
y[sample(10, 1)] <- NA       # Overwrite 1 random position with NA
# B is a shuffled, rescaled copy of A, so it carries its own single NA
df <- data.frame(Class = group, A = y, B = sample(y) * 0.1)
df
##    Class           A            B
## 1      1 0.406139004 0.0226243353
## 2      2          NA 0.0424522116
## 3      3 0.005966132 0.0819394313
## 4      1 0.819394313 0.0005966132
## 5      3 0.424522116           NA
## 6      1 0.902260807 0.0902260807
## 7      2 0.417519874 0.0093316971
## 8      2 0.196729045 0.0417519874
## 9      1 0.093316971 0.0196729045
## 10     1 0.226243353 0.0406139004

Replace NA's with the row mean:

# Locate the NA cells as a (row, col) index matrix:
k <- which(is.na(df), arr.ind = TRUE)
# Fill each NA cell with the mean of the remaining values in its row.
# The first column (Class) is left out of the mean; rowMeans() avoids
# calling apply() on a data frame.
df[k] <- rowMeans(df[-1], na.rm = TRUE)[k[, 1]]

## Replace NA by Group Means

Given a dataset with some NA:

# Build a grouping variable and a response containing 2 NA:
group <- sample(1:3, 10, replace = TRUE)
y <- runif(10)               # 10 draws from a uniform distribution
y[sample(10, 2)] <- NA       # Overwrite 2 random positions with NA
df <- data.frame(Class = group, A = y)  # Assemble the data frame
df
##    Class          A
## 1      2 0.11693989
## 2      3 0.86271803
## 3      1         NA
## 4      1 0.05045190
## 5      1 0.88332948
## 6      3 0.84929724
## 7      1 0.81243776
## 8      1 0.06677673
## 9      3 0.06318002
## 10     3         NA

Replace NA's with group means:

df$A <- with(df, ave(x = A, Class, FUN = function(x) replace(x, is.na(x), mean(x, na.rm = TRUE))))
round(df, digit=3)
##    Class     A
## 1      2 0.117
## 2      3 0.863
## 3      1 0.453
## 4      1 0.050
## 5      1 0.883
## 6      3 0.849
## 7      1 0.812
## 8      1 0.067
## 9      3 0.063
## 10     3 0.592

## Replacement of NA's with column means

# Make data.frame with 2 NA:
group = sample(1:3, 10,         # Make 10 data randomly drawn
               replace = TRUE)  # from 1, 2 and 3
param = runif(10)               # Make 10 data from a uniform population
param[sample(10, 2)] = NA       # Place 2 NA's randomly
df = data.frame(Class = group, A = param, B = sample(param)) # Create dataframe
round(df, digit=3)
##    Class     A     B
## 1      3 0.925 0.820
## 2      1 0.141 0.363
## 3      2 0.201 0.005
## 4      1    NA 0.201
## 5      1    NA    NA
## 6      2 0.005 0.392
## 7      3 0.820    NA
## 8      3 0.392 0.917
## 9      2 0.917 0.925
## 10     3 0.363 0.141

Then, apply the replace() function to each column:

sapply(df, FUN = function(x) replace(x, is.na(x), mean(x, na.rm = TRUE)))
##       Class           A           B
##  [1,]     3 0.924633779 0.819683624
##  [2,]     1 0.140972989 0.362995645
##  [3,]     2 0.200528346 0.004579756
##  [4,]     1 0.470234591 0.200528346
##  [5,]     1 0.470234591 0.470234591
##  [6,]     2 0.004579756 0.391510678
##  [7,]     3 0.819683624 0.470234591
##  [8,]     3 0.391510678 0.916971910
##  [9,]     2 0.916971910 0.924633779
## [10,]     3 0.362995645 0.140972989

## Columnwise replacement of NA's with column means grouped by class

# Make data.frame with 2 NA:
group <- sample(1:3, 10,         # Make 10 data randomly drawn
                replace = TRUE)  # from 1, 2 and 3
param <- runif(10)               # Make 10 data from a uniform population
param[sample(10, 2)] = NA        # Place 2 NA's randomly
# Create dataframe
df <- data.frame(Class = group, A = param, B = sample(param))
round(df, digit=3)
##    Class     A     B
## 1      3 0.179    NA
## 2      2 0.125 0.179
## 3      2 0.295    NA
## 4      3 0.972 0.125
## 5      3 0.919 0.295
## 6      3 0.033 0.972
## 7      2    NA 0.812
## 8      2 0.812 0.033
## 9      3 0.964 0.919
## 10     1    NA 0.964

The approach here is, first, to split the dataframe into a list with one element per column:

df.list <- lapply(df[,2:3], c)
df.list
## $A
##  [1] 0.17887657 0.12534759 0.29506692 0.97190419 0.91926574 0.03277886
##  [7]         NA 0.81185843 0.96388622         NA
##
## $B
##  [1]         NA 0.17887657         NA 0.12534759 0.29506692 0.97190419
##  [7] 0.81185843 0.03277886 0.91926574 0.96388622

Now, we can apply to each list element, class by class (via tapply()), the function replace(x, is.na(x), FUN):

df.tapply <- lapply(df.list, function(x) tapply(x, df$Class, function(x)
  replace(x, is.na(x),
          mean(x, na.rm = TRUE))))

Finally, we can combine the results in a data.frame:

# Flatten each column's per-class pieces and bind them column-wise.
# The rows come out grouped by class (row names are built from the class
# label), not in the original row order, and a class whose values are all
# NA yields NaN because its mean is taken over no values.
sapply(df.tapply,unlist)
##             A          B
## 1         NaN 0.96388622
## 21 0.12534759 0.17887657
## 22 0.29506692 0.34117129
## 23 0.41075765 0.81185843
## 24 0.81185843 0.03277886
## 31 0.17887657 0.57789611
## 32 0.97190419 0.12534759
## 33 0.91926574 0.29506692
## 34 0.03277886 0.97190419
## 35 0.96388622 0.91926574

# Outlier removal

Let's build some data with at least one outlier:

# Draw 10 values from N(10, 1), then overwrite the 2nd one with 14 so that
# the data contain an obvious outlier
dat <- rnorm(n = 10, mean = 10, sd = 1)
dat.out <- data.frame(A = replace(dat, 2, 14))
dat.out
##            A
## 1  11.306102
## 2  14.000000
## 3   9.928638
## 4   9.596242
## 5   9.243622
## 6   9.377714
## 7   8.705887
## 8  10.118461
## 9   8.218649
## 10 11.098094

## Remove outliers

Outliers can be defined as values that fall outside a specific threshold. Such a threshold can be chosen arbitrarily, for example 3 times the standard deviation away from the mean, or 1.5 or 3 times the interquartile range (IQR) beyond the quartiles.

For instance, let's build some data:

# Small example vector; 14 is far away from the other values
X <- c(3, 4, 3, 2, 14, 5)

To test the presence of outliers in a vector, determine the difference between the first and third quartiles. This is given by:

# First and third quartiles of X and the distance between them (the IQR)
quartiles <- quantile(X, probs = c(0.25, 0.75))
low <- quartiles[1]
upp <- quartiles[2]
iqr <- upp - low

Now, we can test whether any value lies more than 1.5 times the IQR above the third quartile. The following script gives the outlier:

# Values lying more than 1.5 * IQR above the third quartile
X[which(X > upp + (1.5 * iqr))]
## [1] 14

The same procedure can be transferred into a function:

# Find the rows of `data` whose value in column `id.col` is an outlier.
#
# A value counts as an outlier when it lies more than `coef` times the
# interquartile range (IQR) below the first quartile or above the third.
#
# Args:
#   data:   a data frame or matrix.
#   id.col: index or name of the numeric column to test.
#   coef:   IQR multiplier that sets the fences. The default of 1.5 keeps
#           the original behaviour.
#
# Returns:
#   Integer vector with the row positions of the outliers (integer(0) when
#   there are none). Missing values are ignored and never reported.
FindOutliers <- function(data, id.col, coef = 1.5) {
  values <- data[, id.col]
  # na.rm = TRUE: quantile() stops with an error on NA input otherwise
  quartiles <- quantile(values, probs = c(0.25, 0.75), na.rm = TRUE,
                        names = FALSE)
  lowerq <- quartiles[1]
  upperq <- quartiles[2]
  iqr <- upperq - lowerq
  threshold.upper <- upperq + coef * iqr
  threshold.lower <- lowerq - coef * iqr
  # Returned visibly; which() drops the NA comparisons on its own
  which(values > threshold.upper | values < threshold.lower)
}

Let's apply this function to our dataset:

# Row positions of the outliers in the first column of dat.out
out <- FindOutliers(dat.out, 1)
out
## [1] 2

Remove the outliers from the original dataset:

# Keep the rows that were not flagged. Indexing by the complement instead of
# `-out` matters when no outlier is found: `-integer(0)` selects nothing, so
# `dat.out[-out, 1]` would return an empty vector rather than all the data.
dat.out[setdiff(seq_len(nrow(dat.out)), out), 1]
## [1] 11.306102  9.928638  9.596242  9.243622  9.377714  8.705887 10.118461
## [8]  8.218649 11.098094

## Replace outliers columnwise

What follows is a function written by Klodian Dhana, which I found in this post: https://www.r-bloggers.com/identify-describe-plot-and-remove-the-outliers-from-the-dataset/

# Identify the outliers of one column with the boxplot rule, plot the data
# with and without them, print a short summary and replace them with NA.
#
# Args:
#   dt:  a data frame.
#   var: unquoted name of the column of `dt` to check.
#
# Returns (invisibly):
#   `dt` with the outliers of `var` set to NA. As a side effect, the object
#   that was passed as `dt` is overwritten in the global environment.
outlierKD <- function(dt, var) {
  var_name <- eval(substitute(var), eval(dt))
  na1 <- sum(is.na(var_name))
  m1 <- mean(var_name, na.rm = TRUE)

  # 2 x 2 plotting grid; the caller's graphics settings are restored on exit
  old_par <- par(mfrow = c(2, 2), oma = c(0, 0, 3, 0))
  on.exit(par(old_par), add = TRUE)
  boxplot(var_name, main = "With outliers")
  hist(var_name, main = "With outliers", xlab = NA, ylab = NA)

  # Outliers are the points boxplot.stats() places beyond the whiskers
  outlier <- boxplot.stats(var_name)$out
  mo <- mean(outlier)
  var_name <- ifelse(var_name %in% outlier, NA, var_name)
  boxplot(var_name, main = "Without outliers")
  hist(var_name, main = "Without outliers", xlab = NA, ylab = NA)
  title("Outlier Check", outer = TRUE)

  # Summary; each line ends with a real newline ("\n", not the letter "n")
  na2 <- sum(is.na(var_name))
  cat("Outliers identified:", na2 - na1, "\n")
  # Note: the denominator is the number of values left after removal
  cat("Propotion (%) of outliers:",
      round((na2 - na1) / sum(!is.na(var_name)) * 100, 1), "\n")
  cat("Mean of the outliers:", round(mo, 2), "\n")
  m2 <- mean(var_name, na.rm = TRUE)
  cat("Mean without removing outliers:", round(m1, 2), "\n")
  cat("Mean if we remove outliers:", round(m2, 2), "\n")

  # The interactive prompt is disabled; removal is always confirmed
  response <- "yes" # readline(prompt="Do you want to remove outliers and to replace with NA? [yes/no]: ")
  if (response == "y" || response == "yes") {
    dt[as.character(substitute(var))] <- var_name
    # Overwrite the caller's object (looked up by name) in the global env
    assign(as.character(as.list(match.call())$dt), dt, envir = .GlobalEnv)
    cat("Outliers successfully removed", "\n")
    return(invisible(dt))
  } else {
    cat("Nothing changed", "\n")
    return(invisible(var_name))
  }
}

The function is run by passing it the dataset and the variable to test. Let's build some data with at least one outlier:

# Draw 10 values from N(10, 1), then overwrite the 2nd one with 14 so that
# the data contain an obvious outlier
y <- rnorm(n = 10, mean = 10, sd = 1)
O <- data.frame(A = replace(y, 2, 14))
O
##            A
## 1  10.401193
## 2  14.000000
## 3   9.658929
## 4  11.126003
## 5   9.624690
## 6  11.123231
## 7  10.521865
## 8   8.996620
## 9  10.577148
## 10  8.097707

Now, apply the function:

# Run the check on column A of O, then print O
outlierKD(O, A)
O

## Outliers identified: 1 nPropotion (%) of outliers: 11.1 nMean of the outliers: 14 nMean without removing outliers: 10.41 nMean if we remove outliers: 10.01 nOutliers successfully removed n
##            A
## 1  10.401193
## 2         NA
## 3   9.658929
## 4  11.126003
## 5   9.624690
## 6  11.123231
## 7  10.521865
## 8   8.996620
## 9  10.577148
## 10  8.097707