Preprocessing

# Preprocessing data

## Subsetting data

Subsetting can be also obtained by the "subset()" function:

  subset(iris, Species == "virginica" & Sepal.Length > 7.65)

##     Sepal.Length Sepal.Width Petal.Length Petal.Width   Species
## 118          7.7         3.8          6.7         2.2 virginica
## 119          7.7         2.6          6.9         2.3 virginica
## 123          7.7         2.8          6.7         2.0 virginica
## 132          7.9         3.8          6.4         2.0 virginica
## 136          7.7         3.0          6.1         2.3 virginica


### Aggregating data

The function "aggregate" computes summary statistics by a factor or variable:

    aggregate(iris[,1:4], list(iris$Species), mean)  ## Group.1 Sepal.Length Sepal.Width Petal.Length Petal.Width ## 1 setosa 5.006 3.428 1.462 0.246 ## 2 versicolor 5.936 2.770 4.260 1.326 ## 3 virginica 6.588 2.974 5.552 2.026  ### Reshaping data Datasets can be designde in two main forms: long or wide. Long designs are used to minimize the number of columns. In table with long design, the variables are in a same column and the resulting value is in a column behind. Consider like in this example:  VAR <- rep(c("A", "B", "C", "D"), each = 2) OUT <- rnorm(mean = 10, sd = 1, n = 8) data.frame(VAR, OUT)  ## VAR OUT ## 1 A 10.922711 ## 2 A 9.663043 ## 3 B 9.947090 ## 4 B 8.840834 ## 5 C 9.397669 ## 6 C 8.944838 ## 7 D 11.565463 ## 8 D 7.879598  Wide designs are used to minimize the number of rows. In table with wide design, the variables are in independent columns where are listed their resulting values. Consider this example:  A <- rnorm(mean = 10, sd = 1, n = 2) B <- rnorm(mean = 10, sd = 1, n = 2) C <- rnorm(mean = 10, sd = 1, n = 2) D <- rnorm(mean = 10, sd = 1, n = 2) data.frame(A, B, C, D)  ## A B C D ## 1 8.826595 10.47677 10.40452 9.545675 ## 2 9.055044 10.24750 11.72397 8.330696  It is often desirable to move from one design to an other. This can be done with the package "reshape":  library(reshape) DAT <- data.frame(A, B, C, D) # This create a dataframe with wide format. melt(DAT) # This convert the dataframe in long format.  ## Using as id variables  ## variable value ## 1 A 8.826595 ## 2 A 9.055044 ## 3 B 10.476765 ## 4 B 10.247503 ## 5 C 10.404515 ## 6 C 11.723973 ## 7 D 9.545675 ## 8 D 8.330696  To reconvert the dataset from the long format to the wide format:  DAT.long <- melt(DAT) # This create a dataframe with a long format.  ## Using as id variables   cast(DAT.long, formula = variable + value ~ .) # This convert it back in the wide format  ## variable value (all) ## 1 A 8.826595 8.826595 ## 2 A 9.055044 9.055044 ## 3 B 10.247503 10.247503 ## 4 B 10.476765 10.476765 ## 5 C 10.404515 10.404515 ## 6 C 11.723973 11.723973 ## 7 D 8.330696 8.330696 ## 8 D 9.545675 9.545675   #Air quality example airquality[1:3, ]  ## Ozone Solar.R Wind Temp Month Day ## 1 41 190 7.4 67 5 1 ## 2 36 118 8.0 72 5 2 ## 3 12 149 12.6 74 5 3   names(airquality) <- tolower(names(airquality)) aqm <- melt(airquality, id=c("month", "day"), na.rm=TRUE) aqm[1:3, ]  ## month day variable value ## 1 5 1 ozone 41 ## 2 5 2 ozone 36 ## 3 5 3 ozone 12   cast(aqm, month + day ~ variable, margins = FALSE)[1:3, ]  ## month day ozone solar.r wind temp ## 1 5 1 41 190 7.4 67 ## 2 5 2 36 118 8.0 72 ## 3 5 3 12 149 12.6 74   cast(aqm, day ~ month ~ variable)  ## , , variable = ozone ## ## month ## day 5 6 7 8 9 ## 1 41 NA 135 39 96 ## 2 36 NA 49 9 78 ## 3 12 NA 32 16 73 ## 4 18 NA NA 78 91 ## 5 NA NA 64 35 47 ## 6 28 NA 40 66 32 ## 7 23 29 77 122 20 ## 8 19 NA 97 89 23 ## 9 8 71 97 110 21 ## 10 NA 39 85 NA 24 ## 11 7 NA NA NA 44 ## 12 16 NA 10 44 21 ## 13 11 23 27 28 28 ## 14 14 NA NA 65 9 ## 15 18 NA 7 NA 13 ## 16 14 21 48 22 46 ## 17 34 37 35 59 18 ## 18 6 20 61 23 13 ## 19 30 12 79 31 24 ## 20 11 13 63 44 16 ## 21 1 NA 16 21 13 ## 22 11 NA NA 9 23 ## 23 4 NA NA NA 36 ## 24 32 NA 80 45 7 ## 25 NA NA 108 168 14 ## 26 NA NA 20 73 30 ## 27 NA NA 52 NA NA ## 28 23 NA 82 76 14 ## 29 45 NA 50 118 18 ## 30 115 NA 64 84 20 ## 31 37 NA 59 85 NA ## ## , , variable = solar.r ## ## month ## day 5 6 7 8 9 ## 1 190 286 269 83 167 ## 2 118 287 248 24 197 ## 3 149 242 236 77 183 ## 4 313 186 101 NA 189 ## 5 NA 220 175 NA 95 ## 6 NA 264 314 NA 92 ## 7 299 127 276 255 252 ## 8 99 273 267 229 220 ## 9 19 291 272 207 230 ## 10 194 323 175 222 259 ## 11 NA 259 139 137 236 ## 12 256 250 264 192 259 ## 13 290 148 175 273 238 ## 14 274 332 291 157 24 ## 15 65 322 48 64 112 ## 16 334 191 260 71 237 ## 17 307 284 274 51 224 ## 18 78 37 285 115 27 ## 19 322 120 187 244 238 ## 20 44 137 220 190 201 ## 21 8 150 7 259 238 ## 22 320 59 258 36 14 ## 23 25 91 295 255 139 ## 24 92 250 294 212 49 ## 25 66 135 223 238 20 ## 26 266 127 81 215 193 ## 27 NA 47 82 153 145 ## 28 13 98 213 203 191 ## 29 252 31 275 225 131 ## 30 223 138 253 237 223 ## 31 279 NA 254 188 NA ## ## , , variable = wind ## ## month ## day 5 6 7 8 9 ## 1 7.4 8.6 4.1 6.9 6.9 ## 2 8.0 9.7 9.2 13.8 5.1 ## 3 12.6 16.1 9.2 7.4 2.8 ## 4 11.5 9.2 10.9 6.9 4.6 ## 5 14.3 8.6 4.6 7.4 7.4 ## 6 14.9 14.3 10.9 4.6 15.5 ## 7 8.6 9.7 5.1 4.0 10.9 ## 8 13.8 6.9 6.3 10.3 10.3 ## 9 20.1 13.8 5.7 8.0 10.9 ## 10 8.6 11.5 7.4 8.6 9.7 ## 11 6.9 10.9 8.6 11.5 14.9 ## 12 9.7 9.2 14.3 11.5 15.5 ## 13 9.2 8.0 14.9 11.5 6.3 ## 14 10.9 13.8 14.9 9.7 10.9 ## 15 13.2 11.5 14.3 11.5 11.5 ## 16 11.5 14.9 6.9 10.3 6.9 ## 17 12.0 20.7 10.3 6.3 13.8 ## 18 18.4 9.2 6.3 7.4 10.3 ## 19 11.5 11.5 5.1 10.9 10.3 ## 20 9.7 10.3 11.5 10.3 8.0 ## 21 9.7 6.3 6.9 15.5 12.6 ## 22 16.6 1.7 9.7 14.3 9.2 ## 23 9.7 4.6 11.5 12.6 10.3 ## 24 12.0 6.3 8.6 9.7 10.3 ## 25 16.6 8.0 8.0 3.4 16.6 ## 26 14.9 8.0 8.6 8.0 6.9 ## 27 8.0 10.3 12.0 5.7 13.2 ## 28 12.0 11.5 7.4 9.7 14.3 ## 29 14.9 14.9 7.4 2.3 8.0 ## 30 5.7 8.0 7.4 6.3 11.5 ## 31 7.4 NA 9.2 6.3 NA ## ## , , variable = temp ## ## month ## day 5 6 7 8 9 ## 1 67 78 84 81 91 ## 2 72 74 85 81 92 ## 3 74 67 81 82 93 ## 4 62 84 84 86 93 ## 5 56 85 83 85 87 ## 6 66 79 83 87 84 ## 7 65 82 88 89 80 ## 8 59 87 92 90 78 ## 9 61 90 92 90 75 ## 10 69 87 89 92 73 ## 11 74 93 82 86 81 ## 12 69 92 73 86 76 ## 13 66 82 81 82 77 ## 14 68 80 91 80 71 ## 15 58 79 80 79 71 ## 16 64 77 81 77 78 ## 17 66 72 82 79 67 ## 18 57 65 84 76 76 ## 19 68 73 87 78 68 ## 20 62 76 85 78 82 ## 21 59 77 74 77 64 ## 22 73 76 81 72 71 ## 23 61 76 82 75 81 ## 24 61 76 86 79 69 ## 25 57 75 85 81 63 ## 26 58 78 82 86 70 ## 27 57 73 86 88 77 ## 28 67 80 88 97 75 ## 29 81 77 86 94 76 ## 30 79 83 83 96 68 ## 31 76 NA 81 94 NA   cast(aqm, month ~ variable, mean)[1:3, ]  ## month ozone solar.r wind temp ## 1 5 23.61538 181.2963 11.622581 65.54839 ## 2 6 29.44444 190.1667 10.266667 79.10000 ## 3 7 59.11538 216.4839 8.941935 83.90323   cast(aqm, month ~ . | variable, mean)  ##$ozone
##   month    (all)
## 1     5 23.61538
## 2     6 29.44444
## 3     7 59.11538
## 4     8 59.96154
## 5     9 31.44828
##
## $solar.r ## month (all) ## 1 5 181.2963 ## 2 6 190.1667 ## 3 7 216.4839 ## 4 8 171.8571 ## 5 9 167.4333 ## ##$wind
##   month     (all)
## 1     5 11.622581
## 2     6 10.266667
## 3     7  8.941935
## 4     8  8.793548
## 5     9 10.180000
##
## \$temp
##   month    (all)
## 1     5 65.54839
## 2     6 79.10000
## 3     7 83.90323
## 4     8 83.96774
## 5     9 76.90000

    cast(aqm, month ~ variable, mean, margins=c("grand_row", "grand_col"))

##   month    ozone  solar.r      wind     temp    (all)
## 1     5 23.61538 181.2963 11.622581 65.54839 68.70696
## 2     6 29.44444 190.1667 10.266667 79.10000 87.38384
## 3     7 59.11538 216.4839  8.941935 83.90323 93.49748
## 4     8 59.96154 171.8571  8.793548 83.96774 79.71207
## 5     9 31.44828 167.4333 10.180000 76.90000 71.82689
## 6 (all) 42.12931 185.9315  9.957516 77.88235 80.05722

    cast(aqm, day ~ month, mean, subset=variable=="ozone")

##    day   5   6   7   8   9
## 1    1  41 NaN 135  39  96
## 2    2  36 NaN  49   9  78
## 3    3  12 NaN  32  16  73
## 4    4  18 NaN NaN  78  91
## 5    5 NaN NaN  64  35  47
## 6    6  28 NaN  40  66  32
## 7    7  23  29  77 122  20
## 8    8  19 NaN  97  89  23
## 9    9   8  71  97 110  21
## 10  10 NaN  39  85 NaN  24
## 11  11   7 NaN NaN NaN  44
## 12  12  16 NaN  10  44  21
## 13  13  11  23  27  28  28
## 14  14  14 NaN NaN  65   9
## 15  15  18 NaN   7 NaN  13
## 16  16  14  21  48  22  46
## 17  17  34  37  35  59  18
## 18  18   6  20  61  23  13
## 19  19  30  12  79  31  24
## 20  20  11  13  63  44  16
## 21  21   1 NaN  16  21  13
## 22  22  11 NaN NaN   9  23
## 23  23   4 NaN NaN NaN  36
## 24  24  32 NaN  80  45   7
## 25  25 NaN NaN 108 168  14
## 26  26 NaN NaN  20  73  30
## 27  27 NaN NaN  52 NaN NaN
## 28  28  23 NaN  82  76  14
## 29  29  45 NaN  50 118  18
## 30  30 115 NaN  64  84  20
## 31  31  37 NaN  59  85 NaN

    cast(aqm, month ~ variable, range)

##   month ozone_X1 ozone_X2 solar.r_X1 solar.r_X2 wind_X1 wind_X2 temp_X1
## 1     5        1      115          8        334     5.7    20.1      56
## 2     6       12       71         31        332     1.7    20.7      65
## 3     7        7      135          7        314     4.1    14.9      73
## 4     8        9      168         24        273     2.3    15.5      72
## 5     9        7       96         14        259     2.8    16.6      63
##   temp_X2
## 1      81
## 2      93
## 3      92
## 4      97
## 5      93

    cast(aqm, month ~ variable + result_variable, range)

##   month ozone_X1 ozone_X2 solar.r_X1 solar.r_X2 wind_X1 wind_X2 temp_X1
## 1     5        1      115          8        334     5.7    20.1      56
## 2     6       12       71         31        332     1.7    20.7      65
## 3     7        7      135          7        314     4.1    14.9      73
## 4     8        9      168         24        273     2.3    15.5      72
## 5     9        7       96         14        259     2.8    16.6      63
##   temp_X2
## 1      81
## 2      93
## 3      92
## 4      97
## 5      93

    cast(aqm, variable ~ month ~ result_variable,range)

## , , result_variable = X1
##
##          month
## variable     5    6    7    8    9
##   ozone    1.0 12.0  7.0  9.0  7.0
##   solar.r  8.0 31.0  7.0 24.0 14.0
##   wind     5.7  1.7  4.1  2.3  2.8
##   temp    56.0 65.0 73.0 72.0 63.0
##
## , , result_variable = X2
##
##          month
## variable      5     6     7     8     9
##   ozone   115.0  71.0 135.0 168.0  96.0
##   solar.r 334.0 332.0 314.0 273.0 259.0
##   wind     20.1  20.7  14.9  15.5  16.6
##   temp     81.0  93.0  92.0  97.0  93.0