This unit shows how to handle missing values and outliers.
Missing values appear as NA. Note that NA is not a string, but a special symbol (a reserved constant). You have several options for dealing with NA values.
Function | Description |
---|---|
na.omit() | Row-wise deletion of missing values |
na.fail() | Return the object only if it contains no missing values; otherwise signal an error |
na.action=na.omit | Common option in many functions, such as linear models |
is.na() | Test for NA values; gives TRUE wherever an NA occurs |
An example of the use of na.omit is described below. For a given dataset:
# Build a 10-row example: an integer class label plus a numeric response
# in which two randomly chosen entries are set to NA.
x <- sample(x = 1:3, size = 10, replace = TRUE) # labels drawn from 1, 2 and 3
y <- runif(n = 10) # ten draws from a uniform distribution
y[sample(10, 2)] <- NA # blank out two random positions
# Assemble the data frame; B is a shuffled copy of A, so it also holds 2 NA
dat <- data.frame(Class = x, A = y, B = sample(y))
round(dat, digits = 3)
## Class A B
## 1 1 0.694 0.242
## 2 1 NA 0.694
## 3 3 0.738 NA
## 4 2 0.980 0.865
## 5 3 0.956 0.980
## 6 1 NA 0.106
## 7 2 0.242 0.956
## 8 1 0.898 0.898
## 9 2 0.106 0.738
## 10 3 0.865 NA
# Row-wise deletion: na.omit() keeps only the rows of dat without any NA
round(x = na.omit(object = dat), digits = 3)
## Class A B
## 1 1 0.694 0.242
## 4 2 0.980 0.865
## 5 3 0.956 0.980
## 7 2 0.242 0.956
## 8 1 0.898 0.898
## 9 2 0.106 0.738
Given a dataset with some NA:
# Example data: a class label (1, 2 or 3) and a response with two NA
group <- sample(x = 1:3, size = 10, replace = TRUE)
y <- runif(n = 10) # ten draws from a uniform distribution
y[sample(10, 2)] <- NA # set two random positions to NA
df <- data.frame(Class = group, A = y) # combine both into a data frame
round(df, digits = 3)
## Class A
## 1 1 0.072
## 2 1 0.815
## 3 2 NA
## 4 3 0.044
## 5 1 0.618
## 6 2 0.889
## 7 2 0.018
## 8 3 0.528
## 9 3 NA
## 10 1 0.045
Replace NA’s with the column mean:
# Column-wise mean imputation: in every column each NA is replaced by the
# mean of that column's non-missing values. vapply() fixes the result type
# (one numeric column of nrow(df) values per input column), so df becomes a
# numeric matrix here rather than staying a data frame.
df <- vapply(
  df,
  function(column) replace(column, is.na(column), mean(column, na.rm = TRUE)),
  FUN.VALUE = numeric(nrow(df))
)
round(df, digits = 3)
## Class A
## [1,] 1 0.072
## [2,] 1 0.815
## [3,] 2 0.379
## [4,] 3 0.044
## [5,] 1 0.618
## [6,] 2 0.889
## [7,] 2 0.018
## [8,] 3 0.528
## [9,] 3 0.379
## [10,] 1 0.045
Given a dataset with some NA:
# Example data: a factor class label and a response holding one NA
group <- as.factor(sample(x = 1:3, size = 10, replace = TRUE))
y <- runif(n = 10) # ten draws from a uniform distribution
y[sample(10, 1)] <- NA # set one random position to NA
# B is a shuffled copy of A scaled by 0.1, so it carries one NA as well
df <- data.frame(Class = group, A = y, B = sample(y) * 0.1)
df
## Class A B
## 1 1 0.406139004 0.0226243353
## 2 2 NA 0.0424522116
## 3 3 0.005966132 0.0819394313
## 4 1 0.819394313 0.0005966132
## 5 3 0.424522116 NA
## 6 1 0.902260807 0.0902260807
## 7 2 0.417519874 0.0093316971
## 8 2 0.196729045 0.0417519874
## 9 1 0.093316971 0.0196729045
## 10 1 0.226243353 0.0406139004
Replace NA’s with the row mean:
# Locate every missing cell as a (row, column) pair:
k <- which(is.na(df), arr.ind = TRUE)
# Row means over the numeric columns only (the first column, Class, is left
# out), ignoring NA. rowMeans() replaces apply(..., 1, mean) on the data frame.
# Each missing cell receives the mean of the row it belongs to.
df[k] <- rowMeans(df[-1], na.rm = TRUE)[k[, 1]]
Given a dataset with some NA:
# Example data: a class label (1, 2 or 3) and a response with two NA
group <- sample(x = 1:3, size = 10, replace = TRUE)
y <- runif(n = 10) # ten draws from a uniform distribution
y[sample(10, 2)] <- NA # set two random positions to NA
df <- data.frame(Class = group, A = y) # combine both into a data frame
df
## Class A
## 1 2 0.11693989
## 2 3 0.86271803
## 3 1 NA
## 4 1 0.05045190
## 5 1 0.88332948
## 6 3 0.84929724
## 7 1 0.81243776
## 8 1 0.06677673
## 9 3 0.06318002
## 10 3 NA
Replace NA’s with group means:
# Group-wise mean imputation: ave() applies the function within each Class,
# so every NA in A is replaced by the mean of the non-missing values of its
# own class.
df$A <- ave(
  df$A, df$Class,
  FUN = function(vals) {
    vals[is.na(vals)] <- mean(vals, na.rm = TRUE)
    vals
  }
)
round(df, digits = 3)
## Class A
## 1 2 0.117
## 2 3 0.863
## 3 1 0.453
## 4 1 0.050
## 5 1 0.883
## 6 3 0.849
## 7 1 0.812
## 8 1 0.067
## 9 3 0.063
## 10 3 0.592
# Example data frame whose two numeric columns each contain 2 NA
group <- sample(x = 1:3, size = 10, replace = TRUE) # labels drawn from 1, 2, 3
param <- runif(n = 10) # ten draws from a uniform distribution
param[sample(10, 2)] <- NA # set two random positions to NA
# B is a shuffled copy of A
df <- data.frame(Class = group, A = param, B = sample(param))
round(df, digits = 3)
## Class A B
## 1 3 0.925 0.820
## 2 1 0.141 0.363
## 3 2 0.201 0.005
## 4 1 NA 0.201
## 5 1 NA NA
## 6 2 0.005 0.392
## 7 3 0.820 NA
## 8 3 0.392 0.917
## 9 2 0.917 0.925
## 10 3 0.363 0.141
Then, apply the replace() function to each column:
# Replace the NA of every column by that column's mean. vapply() fixes the
# result type: a numeric matrix with one row per observation.
vapply(
  df,
  FUN = function(column) {
    replace(column, is.na(column), mean(column, na.rm = TRUE))
  },
  FUN.VALUE = numeric(nrow(df))
)
## Class A B
## [1,] 3 0.924633779 0.819683624
## [2,] 1 0.140972989 0.362995645
## [3,] 2 0.200528346 0.004579756
## [4,] 1 0.470234591 0.200528346
## [5,] 1 0.470234591 0.470234591
## [6,] 2 0.004579756 0.391510678
## [7,] 3 0.819683624 0.470234591
## [8,] 3 0.391510678 0.916971910
## [9,] 2 0.916971910 0.924633779
## [10,] 3 0.362995645 0.140972989
# Example data frame whose two numeric columns each contain 2 NA
group <- sample(x = 1:3, size = 10, replace = TRUE) # labels drawn from 1, 2, 3
param <- runif(n = 10) # ten draws from a uniform distribution
param[sample(10, 2)] <- NA # set two random positions to NA
# Combine into a data frame; B is a shuffled copy of A
df <- data.frame(Class = group, A = param, B = sample(param))
round(df, digits = 3)
## Class A B
## 1 3 0.179 NA
## 2 2 0.125 0.179
## 3 2 0.295 NA
## 4 3 0.972 0.125
## 5 3 0.919 0.295
## 6 3 0.033 0.972
## 7 2 NA 0.812
## 8 2 0.812 0.033
## 9 3 0.964 0.919
## 10 1 NA 0.964
The approach here is, first, to split the data frame into a list with one element per column:
# Turn the two numeric columns (2 and 3, i.e. A and B) into a named list of
# plain vectors
df.list <- as.list(df[, 2:3])
df.list
## $A
## [1] 0.17887657 0.12534759 0.29506692 0.97190419 0.91926574 0.03277886
## [7] NA 0.81185843 0.96388622 NA
##
## $B
## [1] NA 0.17887657 NA 0.12534759 0.29506692 0.97190419
## [7] 0.81185843 0.03277886 0.91926574 0.96388622
Now, we can apply the function replace(x, is.na(x), FUN) to each list element, group by group (via tapply()):
# For every column in df.list, split its values by df$Class and, within each
# class, replace the NA entries by the mean of that class's non-missing values
df.tapply <- lapply(df.list, function(column) {
  tapply(column, df$Class, function(vals) {
    replace(vals, is.na(vals), mean(vals, na.rm = TRUE))
  })
})
Finally, we can combine the results into a single matrix (note that its rows are ordered by group, not in the original row order):
# Flatten the per-class results of each column and bind them side by side
do.call(cbind, lapply(df.tapply, unlist))
## A B
## 1 NaN 0.96388622
## 21 0.12534759 0.17887657
## 22 0.29506692 0.34117129
## 23 0.41075765 0.81185843
## 24 0.81185843 0.03277886
## 31 0.17887657 0.57789611
## 32 0.97190419 0.12534759
## 33 0.91926574 0.29506692
## 34 0.03277886 0.97190419
## 35 0.96388622 0.91926574
Let’s build some data with at least one outlier:
# Ten draws from a normal distribution (mean 10, sd 1); the second value is
# then overwritten with 14 so the sample contains an obvious outlier
dat <- rnorm(10, mean = 10, sd = 1)
dat.out <- data.frame(A = replace(dat, 2, 14))
dat.out
## A
## 1 11.306102
## 2 14.000000
## 3 9.928638
## 4 9.596242
## 5 9.243622
## 6 9.377714
## 7 8.705887
## 8 10.118461
## 9 8.218649
## 10 11.098094
Outliers can be defined as values that fall outside a specific threshold. Such a threshold can be any arbitrary value, such as 3 times the standard deviation, or 1.5 or 3 times the interquartile range (IQR) beyond the quartiles.
For instance, let’s build some data:
# Small sample in which 14 is clearly far from the other values
X <- c(3, 4, 3, 2, 14, 5)
To test the presence of outliers in a vector, determine the difference between the first and third quartiles. This is given by:
# First and third quartiles of X and their difference (the IQR)
low <- quantile(X, probs = 0.25)
upp <- quantile(X, probs = 0.75)
iqr <- upp - low
Now, we can test whether any value lies more than 1.5 times iqr above the upper quartile. The following script gives the outlier:
# Values of X lying above the upper fence (third quartile + 1.5 * IQR)
X[which(X > upp + 1.5 * iqr)]
## [1] 14
The same procedure can be transferred into a function:
# Find the positions of the outliers in one column using the IQR fence rule.
#
# Arguments:
#   data   - data frame (or matrix) holding the values to screen.
#   id.col - index or name of the column to test.
#   k      - multiplier applied to the IQR to build the fences. The default
#            1.5 reproduces the original rule; a larger value such as 3
#            flags only more extreme values.
#
# Returns:
#   Integer vector with the row positions whose value lies below
#   Q1 - k * IQR or above Q3 + k * IQR (integer(0) when there are none).
#   Missing values are ignored when the quartiles are computed and are never
#   reported as outliers.
FindOutliers <- function(data, id.col, k = 1.5) {
  values <- data[, id.col]
  # A single quantile() call yields both quartiles; na.rm = TRUE keeps the
  # function from failing on columns that contain NA.
  quartiles <- quantile(values, probs = c(0.25, 0.75), na.rm = TRUE,
                        names = FALSE)
  lowerq <- quartiles[1]
  upperq <- quartiles[2]
  iqr <- upperq - lowerq # same as IQR(values, na.rm = TRUE)
  threshold.upper <- upperq + k * iqr
  threshold.lower <- lowerq - k * iqr
  # which() drops NA comparisons, so missing values are not flagged
  which(values > threshold.upper | values < threshold.lower)
}
Let’s apply such function to our dataset:
# Row positions of the outliers found in the first column of dat.out
out <- FindOutliers(data = dat.out, id.col = 1)
out
## [1] 2
Remove the outliers from the original dataset:
# Select the rows to keep explicitly: negative indexing with an empty `out`
# (no outliers found) would return an empty vector instead of all values.
dat.out[setdiff(seq_len(nrow(dat.out)), out), 1]
## [1] 11.306102 9.928638 9.596242 9.243622 9.377714 8.705887 10.118461
## [8] 8.218649 11.098094
What follows is a function written by Klodian Dhana, which I found in this post: https://www.r-bloggers.com/identify-describe-plot-and-remove-the-outliers-from-the-dataset/
# Identify, describe, plot and optionally remove the outliers of one variable.
#
# Arguments:
#   dt       - data frame containing the variable.
#   var      - unquoted name of the column of dt to screen.
#   response - "y" or "yes" (default) replaces the outliers with NA in dt;
#              any other value leaves dt untouched.
#
# Outliers are the values reported by boxplot.stats(). A 2 x 2 panel shows a
# boxplot and a histogram with and without them, and a short summary is
# printed. The reported proportion is relative to the number of non-missing
# values that remain after the outliers are set to NA.
#
# Returns (invisibly):
#   - with response "y"/"yes": dt with the outliers of var replaced by NA.
#     As a side effect, the object passed as dt is also overwritten under the
#     same name in the global environment.
#   - otherwise: the vector of var with its outliers replaced by NA.
outlierKD <- function(dt, var, response = "yes") {
  var_label <- as.character(substitute(var))
  var_name <- eval(substitute(var), eval(dt))
  na1 <- sum(is.na(var_name))
  m1 <- mean(var_name, na.rm = TRUE)

  # Switch to a 2 x 2 layout and restore the previous settings on exit
  old_par <- par(mfrow = c(2, 2), oma = c(0, 0, 3, 0))
  on.exit(par(old_par), add = TRUE)

  boxplot(var_name, main = "With outliers")
  hist(var_name, main = "With outliers", xlab = NA, ylab = NA)
  outlier <- boxplot.stats(var_name)$out
  mo <- mean(outlier)
  var_name <- ifelse(var_name %in% outlier, NA, var_name)
  boxplot(var_name, main = "Without outliers")
  hist(var_name, main = "Without outliers", xlab = NA, ylab = NA)
  title("Outlier Check", outer = TRUE)

  na2 <- sum(is.na(var_name))
  m2 <- mean(var_name, na.rm = TRUE)
  # Each message ends with a real newline ("\n"), not a literal "n"
  cat("Outliers identified:", na2 - na1, "\n")
  cat("Propotion (%) of outliers:",
      round((na2 - na1) / sum(!is.na(var_name)) * 100, 1), "\n")
  cat("Mean of the outliers:", round(mo, 2), "\n")
  cat("Mean without removing outliers:", round(m1, 2), "\n")
  cat("Mean if we remove outliers:", round(m2, 2), "\n")

  if (response %in% c("y", "yes")) {
    dt[var_label] <- var_name
    # Overwrite the caller's object in the global environment
    assign(as.character(as.list(match.call())$dt), dt, envir = .GlobalEnv)
    cat("Outliers successfully removed", "\n")
    return(invisible(dt))
  }
  cat("Nothing changed", "\n")
  invisible(var_name)
}
The function can be run by typing the dataset and the variable to test. Let’s build some data with at least one outlier:
# Ten draws from a normal distribution (mean 10, sd 1); the second value is
# then overwritten with 14 so the sample contains an obvious outlier
y <- rnorm(10, mean = 10, sd = 1)
O <- data.frame(A = replace(y, 2, 14))
O
## A
## 1 10.401193
## 2 14.000000
## 3 9.658929
## 4 11.126003
## 5 9.624690
## 6 11.123231
## 7 10.521865
## 8 8.996620
## 9 10.577148
## 10 8.097707
Now, apply the function:
# Screen column A of O; outlierKD() overwrites O in the global environment,
# so printing O afterwards shows the outlier replaced by NA
outlierKD(dt = O, var = A)
O
## Outliers identified: 1 nPropotion (%) of outliers: 11.1 nMean of the outliers: 14 nMean without removing outliers: 10.41 nMean if we remove outliers: 10.01 nOutliers successfully removed n
## A
## 1 10.401193
## 2 NA
## 3 9.658929
## 4 11.126003
## 5 9.624690
## 6 11.123231
## 7 10.521865
## 8 8.996620
## 9 10.577148
## 10 8.097707