Skip to content

Latest commit

 

History

History
3008 lines (2115 loc) · 76.7 KB

c05_Data_Engineering_and_Data_Shaping.md

File metadata and controls

3008 lines (2115 loc) · 76.7 KB
output
github_document

00070_informalexample_5.1_of_section_5.1.1.R

# informalexample 5.1 of section 5.1.1 
# (informalexample 5.1 of section 5.1.1)  : Data engineering and data shaping : Data selection : Sub-setting rows and columns 

library("ggplot2") 	# Note: 1 

summary(iris) 	# Note: 2 
##   Sepal.Length    Sepal.Width     Petal.Length    Petal.Width   
##  Min.   :4.300   Min.   :2.000   Min.   :1.000   Min.   :0.100  
##  1st Qu.:5.100   1st Qu.:2.800   1st Qu.:1.600   1st Qu.:0.300  
##  Median :5.800   Median :3.000   Median :4.350   Median :1.300  
##  Mean   :5.843   Mean   :3.057   Mean   :3.758   Mean   :1.199  
##  3rd Qu.:6.400   3rd Qu.:3.300   3rd Qu.:5.100   3rd Qu.:1.800  
##  Max.   :7.900   Max.   :4.400   Max.   :6.900   Max.   :2.500  
##        Species  
##  setosa    :50  
##  versicolor:50  
##  virginica :50  
##                 
##                 
## 
##   Sepal.Length    Sepal.Width     Petal.Length    Petal.Width   
##  Min.   :4.300   Min.   :2.000   Min.   :1.000   Min.   :0.100  
##  1st Qu.:5.100   1st Qu.:2.800   1st Qu.:1.600   1st Qu.:0.300  
##  Median :5.800   Median :3.000   Median :4.350   Median :1.300  
##  Mean   :5.843   Mean   :3.057   Mean   :3.758   Mean   :1.199  
##  3rd Qu.:6.400   3rd Qu.:3.300   3rd Qu.:5.100   3rd Qu.:1.800  
##  Max.   :7.900   Max.   :4.400   Max.   :6.900   Max.   :2.500  
##
##        Species  
##  setosa    :50  
##  versicolor:50  
##  virginica :50

# Note 1: 
#   Attach the ggplot2 package for later plotting. 

# Note 2: 
#   Take a look at the built-in iris data. 

00071_informalexample_5.2_of_section_5.1.1.R

# informalexample 5.2 of section 5.1.1 
# (informalexample 5.2 of section 5.1.1)  : Data engineering and data shaping : Data selection : Sub-setting rows and columns 

head(iris)
##   Sepal.Length Sepal.Width Petal.Length Petal.Width Species
## 1          5.1         3.5          1.4         0.2  setosa
## 2          4.9         3.0          1.4         0.2  setosa
## 3          4.7         3.2          1.3         0.2  setosa
## 4          4.6         3.1          1.5         0.2  setosa
## 5          5.0         3.6          1.4         0.2  setosa
## 6          5.4         3.9          1.7         0.4  setosa
##   Sepal.Length Sepal.Width Petal.Length Petal.Width Species
## 1          5.1         3.5          1.4         0.2  setosa
## 2          4.9         3.0          1.4         0.2  setosa
## 3          4.7         3.2          1.3         0.2  setosa
## 4          4.6         3.1          1.5         0.2  setosa
## 5          5.0         3.6          1.4         0.2  setosa
## 6          5.4         3.9          1.7         0.4  setosa

ggplot(iris, 
       aes(x = Petal.Length, y = Petal.Width, 
           shape = Species, color = Species)) + 
  geom_point(size =2 ) + 
  ggtitle("Petal dimensions by iris species: all measurements")

plot of chunk 00071_informalexample_5.2_of_section_5.1.1.R

00072_informalexample_5.3_of_section_5.1.1.R

# informalexample 5.3 of section 5.1.1 
# (informalexample 5.3 of section 5.1.1)  : Data engineering and data shaping : Data selection : Sub-setting rows and columns 

columns_we_want <- c("Petal.Length", "Petal.Width", "Species")
rows_we_want <- iris$Petal.Length > 2

# before
head(iris)
##   Sepal.Length Sepal.Width Petal.Length Petal.Width Species
## 1          5.1         3.5          1.4         0.2  setosa
## 2          4.9         3.0          1.4         0.2  setosa
## 3          4.7         3.2          1.3         0.2  setosa
## 4          4.6         3.1          1.5         0.2  setosa
## 5          5.0         3.6          1.4         0.2  setosa
## 6          5.4         3.9          1.7         0.4  setosa
##   Sepal.Length Sepal.Width Petal.Length Petal.Width Species
## 1          5.1         3.5          1.4         0.2  setosa
## 2          4.9         3.0          1.4         0.2  setosa
## 3          4.7         3.2          1.3         0.2  setosa
## 4          4.6         3.1          1.5         0.2  setosa
## 5          5.0         3.6          1.4         0.2  setosa
## 6          5.4         3.9          1.7         0.4  setosa

iris_base <- iris[rows_we_want, columns_we_want, drop = FALSE]

# after
head(iris_base)
##    Petal.Length Petal.Width    Species
## 51          4.7         1.4 versicolor
## 52          4.5         1.5 versicolor
## 53          4.9         1.5 versicolor
## 54          4.0         1.3 versicolor
## 55          4.6         1.5 versicolor
## 56          4.5         1.3 versicolor
##    Petal.Length Petal.Width    Species
## 51          4.7         1.4 versicolor
## 52          4.5         1.5 versicolor
## 53          4.9         1.5 versicolor
## 54          4.0         1.3 versicolor
## 55          4.6         1.5 versicolor
## 56          4.5         1.3 versicolor

00073_informalexample_5.4_of_section_5.1.1.R

# informalexample 5.4 of section 5.1.1 
# (informalexample 5.4 of section 5.1.1)  : Data engineering and data shaping : Data selection : Sub-setting rows and columns 

library("data.table")

iris_data.table <- as.data.table(iris)                                	# Note: 1 

columns_we_want <- c("Petal.Length", "Petal.Width", "Species")
rows_we_want <- iris_data.table$Petal.Length > 2

iris_data.table <- iris_data.table[rows_we_want , ..columns_we_want]  	# Note: 2 

head(iris_data.table)
##    Petal.Length Petal.Width    Species
## 1:          4.7         1.4 versicolor
## 2:          4.5         1.5 versicolor
## 3:          4.9         1.5 versicolor
## 4:          4.0         1.3 versicolor
## 5:          4.6         1.5 versicolor
## 6:          4.5         1.3 versicolor
##    Petal.Length Petal.Width    Species
## 1:          4.7         1.4 versicolor
## 2:          4.5         1.5 versicolor
## 3:          4.9         1.5 versicolor
## 4:          4.0         1.3 versicolor
## 5:          4.6         1.5 versicolor
## 6:          4.5         1.3 versicolor

# Note 1: 
#   Convert to data.table class to get data.table semantics. 

# Note 2: 
#   The “..” notation tells data.table that “columns_we_want” isn’t itself the name of a column 
#   but a variable referring to names of columns. 

00074_informalexample_5.5_of_section_5.1.1.R

# informalexample 5.5 of section 5.1.1 
# (informalexample 5.5 of section 5.1.1)  : Data engineering and data shaping : Data selection : Sub-setting rows and columns 

library("data.table")

df <- data.frame(x = 1:2, y = 3:4)          	# Note: 1 

df[, x]                                     	# Note: 2 
## Error in `[.data.frame`(df, , x): object 'x' not found
## Error in `[.data.frame`(df, , x) : object 'x' not found

x <- "y"                                	# Note: 3 
dt <- data.table(df)

dt[, x]                                     	# Note: 4 
## [1] 1 2
## [1] 1 2

dt[, ..x]                                   	# Note: 5 
##    y
## 1: 3
## 2: 4
##    y
## 1: 3
## 2: 4

# Note 1: 
#   Example data.frame. 

# Note 2: 
#   Notice writing df[, x] instead of df[, "x"] is an error (assuming x is not bound to a value in 
#   our environment). 

# Note 3: 
#   Set up data.table example. 

# Note 4: 
#   Notice this returns the column x much like d$x would. 

# Note 5: 
#   This uses data.table’s “look up” idiom to get a data.table of columns referred to by the 
#   variable x. 

00075_informalexample_5.6_of_section_5.1.1.R

# informalexample 5.6 of section 5.1.1 
# (informalexample 5.6 of section 5.1.1)  : Data engineering and data shaping : Data selection : Sub-setting rows and columns 

library("dplyr")
## 
## Attaching package: 'dplyr'
## The following objects are masked from 'package:data.table':
## 
##     between, first, last
## The following objects are masked from 'package:stats':
## 
##     filter, lag
## The following objects are masked from 'package:base':
## 
##     intersect, setdiff, setequal, union
iris_dplyr <- iris %>% 
  select(.,
         Petal.Length, Petal.Width, Species) %>%
  filter(.,
         Petal.Length > 2)

head(iris_dplyr)
##   Petal.Length Petal.Width    Species
## 1          4.7         1.4 versicolor
## 2          4.5         1.5 versicolor
## 3          4.9         1.5 versicolor
## 4          4.0         1.3 versicolor
## 5          4.6         1.5 versicolor
## 6          4.5         1.3 versicolor
##   Petal.Length Petal.Width    Species
## 1          4.7         1.4 versicolor
## 2          4.5         1.5 versicolor
## 3          4.9         1.5 versicolor
## 4          4.0         1.3 versicolor
## 5          4.6         1.5 versicolor
## 6          4.5         1.3 versicolor

00076_informalexample_5.7_of_section_5.1.2.R

# informalexample 5.7 of section 5.1.2 
# (informalexample 5.7 of section 5.1.2)  : Data engineering and data shaping : Data selection : Removing records with incomplete data 

library("ggplot2")
data(msleep)                         	# Note: 1 

str(msleep)
## Classes 'tbl_df', 'tbl' and 'data.frame':	83 obs. of  11 variables:
##  $ name        : chr  "Cheetah" "Owl monkey" "Mountain beaver" "Greater short-tailed shrew" ...
##  $ genus       : chr  "Acinonyx" "Aotus" "Aplodontia" "Blarina" ...
##  $ vore        : chr  "carni" "omni" "herbi" "omni" ...
##  $ order       : chr  "Carnivora" "Primates" "Rodentia" "Soricomorpha" ...
##  $ conservation: chr  "lc" NA "nt" "lc" ...
##  $ sleep_total : num  12.1 17 14.4 14.9 4 14.4 8.7 7 10.1 3 ...
##  $ sleep_rem   : num  NA 1.8 2.4 2.3 0.7 2.2 1.4 NA 2.9 NA ...
##  $ sleep_cycle : num  NA NA NA 0.133 0.667 ...
##  $ awake       : num  11.9 7 9.6 9.1 20 9.6 15.3 17 13.9 21 ...
##  $ brainwt     : num  NA 0.0155 NA 0.00029 0.423 NA NA NA 0.07 0.0982 ...
##  $ bodywt      : num  50 0.48 1.35 0.019 600 ...
## Classes 'tbl_df', 'tbl' and 'data.frame':    83 obs. of  11 variables:
##  $ name        : chr  "Cheetah" "Owl monkey" "Mountain beaver" "Greater short-tailed shrew" ...
##  $ genus       : chr  "Acinonyx" "Aotus" "Aplodontia" "Blarina" ...
##  $ vore        : chr  "carni" "omni" "herbi" "omni" ...
##  $ order       : chr  "Carnivora" "Primates" "Rodentia" "Soricomorpha" ...
##  $ conservation: chr  "lc" NA "nt" "lc" ...
##  $ sleep_total : num  12.1 17 14.4 14.9 4 14.4 8.7 7 10.1 3 ...
##  $ sleep_rem   : num  NA 1.8 2.4 2.3 0.7 2.2 1.4 NA 2.9 NA ...
##  $ sleep_cycle : num  NA NA NA 0.133 0.667 ...
##  $ awake       : num  11.9 7 9.6 9.1 20 9.6 15.3 17 13.9 21 ...
##  $ brainwt     : num  NA 0.0155 NA 0.00029 0.423 NA NA NA 0.07 0.0982 ...
##  $ bodywt      : num  50 0.48 1.35 0.019 600 ...

# Note 1: 
#   Copy the msleep from the ggplot2 package into our workspace. 

00077_informalexample_5.8_of_section_5.1.2.R

# informalexample 5.8 of section 5.1.2 
# (informalexample 5.8 of section 5.1.2)  : Data engineering and data shaping : Data selection : Removing records with incomplete data 

summary(msleep)
##      name              genus               vore          
##  Length:83          Length:83          Length:83         
##  Class :character   Class :character   Class :character  
##  Mode  :character   Mode  :character   Mode  :character  
##                                                          
##                                                          
##                                                          
##                                                          
##     order           conservation        sleep_total      sleep_rem    
##  Length:83          Length:83          Min.   : 1.90   Min.   :0.100  
##  Class :character   Class :character   1st Qu.: 7.85   1st Qu.:0.900  
##  Mode  :character   Mode  :character   Median :10.10   Median :1.500  
##                                        Mean   :10.43   Mean   :1.875  
##                                        3rd Qu.:13.75   3rd Qu.:2.400  
##                                        Max.   :19.90   Max.   :6.600  
##                                                        NA's   :22     
##   sleep_cycle         awake          brainwt            bodywt        
##  Min.   :0.1167   Min.   : 4.10   Min.   :0.00014   Min.   :   0.005  
##  1st Qu.:0.1833   1st Qu.:10.25   1st Qu.:0.00290   1st Qu.:   0.174  
##  Median :0.3333   Median :13.90   Median :0.01240   Median :   1.670  
##  Mean   :0.4396   Mean   :13.57   Mean   :0.28158   Mean   : 166.136  
##  3rd Qu.:0.5792   3rd Qu.:16.15   3rd Qu.:0.12550   3rd Qu.:  41.750  
##  Max.   :1.5000   Max.   :22.10   Max.   :5.71200   Max.   :6654.000  
##  NA's   :51                       NA's   :27
##      name              genus               vore          
##  Length:83          Length:83          Length:83         
##  Class :character   Class :character   Class :character  
##  Mode  :character   Mode  :character   Mode  :character  
##                                                          
##                                                          
##                                                          
##                                                          
##     order           conservation        sleep_total      sleep_rem    
##  Length:83          Length:83          Min.   : 1.90   Min.   :0.100  
##  Class :character   Class :character   1st Qu.: 7.85   1st Qu.:0.900  
##  Mode  :character   Mode  :character   Median :10.10   Median :1.500  
##                                        Mean   :10.43   Mean   :1.875  
##                                        3rd Qu.:13.75   3rd Qu.:2.400  
##                                        Max.   :19.90   Max.   :6.600  
##                                                        NA's   :22     
##   sleep_cycle         awake          brainwt            bodywt        
##  Min.   :0.1167   Min.   : 4.10   Min.   :0.00014   Min.   :   0.005  
##  1st Qu.:0.1833   1st Qu.:10.25   1st Qu.:0.00290   1st Qu.:   0.174  
##  Median :0.3333   Median :13.90   Median :0.01240   Median :   1.670  
##  Mean   :0.4396   Mean   :13.57   Mean   :0.28158   Mean   : 166.136  
##  3rd Qu.:0.5792   3rd Qu.:16.15   3rd Qu.:0.12550   3rd Qu.:  41.750  
##  Max.   :1.5000   Max.   :22.10   Max.   :5.71200   Max.   :6654.000  
##  NA's   :51                       NA's   :27

00078_informalexample_5.9_of_section_5.1.2.R

# informalexample 5.9 of section 5.1.2 
# (informalexample 5.9 of section 5.1.2)  : Data engineering and data shaping : Data selection : Removing records with incomplete data 

clean_base_1 <- msleep[complete.cases(msleep), , drop = FALSE]

summary(clean_base_1)
##      name              genus               vore          
##  Length:20          Length:20          Length:20         
##  Class :character   Class :character   Class :character  
##  Mode  :character   Mode  :character   Mode  :character  
##                                                          
##                                                          
##                                                          
##     order           conservation        sleep_total       sleep_rem    
##  Length:20          Length:20          Min.   : 2.900   Min.   :0.600  
##  Class :character   Class :character   1st Qu.: 8.925   1st Qu.:1.300  
##  Mode  :character   Mode  :character   Median :11.300   Median :2.350  
##                                        Mean   :11.225   Mean   :2.275  
##                                        3rd Qu.:13.925   3rd Qu.:3.125  
##                                        Max.   :19.700   Max.   :4.900  
##   sleep_cycle         awake          brainwt            bodywt        
##  Min.   :0.1167   Min.   : 4.30   Min.   :0.00014   Min.   :  0.0050  
##  1st Qu.:0.1792   1st Qu.:10.07   1st Qu.:0.00115   1st Qu.:  0.0945  
##  Median :0.2500   Median :12.70   Median :0.00590   Median :  0.7490  
##  Mean   :0.3458   Mean   :12.78   Mean   :0.07882   Mean   : 72.1177  
##  3rd Qu.:0.4167   3rd Qu.:15.07   3rd Qu.:0.03670   3rd Qu.:  6.1250  
##  Max.   :1.0000   Max.   :21.10   Max.   :0.65500   Max.   :600.0000
##      name              genus               vore          
##  Length:20          Length:20          Length:20         
##  Class :character   Class :character   Class :character  
##  Mode  :character   Mode  :character   Mode  :character  
##                                                          
##                                                          
##                                                          
##     order           conservation        sleep_total       sleep_rem    
##  Length:20          Length:20          Min.   : 2.900   Min.   :0.600  
##  Class :character   Class :character   1st Qu.: 8.925   1st Qu.:1.300  
##  Mode  :character   Mode  :character   Median :11.300   Median :2.350  
##                                        Mean   :11.225   Mean   :2.275  
##                                        3rd Qu.:13.925   3rd Qu.:3.125  
##                                        Max.   :19.700   Max.   :4.900  
##   sleep_cycle         awake          brainwt            bodywt        
##  Min.   :0.1167   Min.   : 4.30   Min.   :0.00014   Min.   :  0.0050  
##  1st Qu.:0.1792   1st Qu.:10.07   1st Qu.:0.00115   1st Qu.:  0.0945  
##  Median :0.2500   Median :12.70   Median :0.00590   Median :  0.7490  
##  Mean   :0.3458   Mean   :12.78   Mean   :0.07882   Mean   : 72.1177  
##  3rd Qu.:0.4167   3rd Qu.:15.07   3rd Qu.:0.03670   3rd Qu.:  6.1250  
##  Max.   :1.0000   Max.   :21.10   Max.   :0.65500   Max.   :600.0000

nrow(clean_base_1)
## [1] 20
## [1] 20

clean_base_2 = na.omit(msleep)
nrow(clean_base_2)
## [1] 20
## [1] 20

00079_informalexample_5.10_of_section_5.1.2.R

# informalexample 5.10 of section 5.1.2 
# (informalexample 5.10 of section 5.1.2)  : Data engineering and data shaping : Data selection : Removing records with incomplete data 

library("data.table")

msleep_data.table <- as.data.table(msleep)

clean_data.table = msleep_data.table[complete.cases(msleep_data.table), ]

nrow(clean_data.table)
## [1] 20
## [1] 20

00080_informalexample_5.11_of_section_5.1.2.R

# informalexample 5.11 of section 5.1.2 
# (informalexample 5.11 of section 5.1.2)  : Data engineering and data shaping : Data selection : Removing records with incomplete data 

library("dplyr")

clean_dplyr <- msleep %>% 
  filter(., complete.cases(.))

nrow(clean_dplyr)
## [1] 20
## [1] 20

00081_informalexample_5.12_of_section_5.1.3.R

# informalexample 5.12 of section 5.1.3 
# (informalexample 5.12 of section 5.1.3)  : Data engineering and data shaping : Data selection : Ordering rows 

purchases <- wrapr::build_frame(                 	# Note: 1 
   "day", "hour", "n_purchase" |
   1    , 9     , 5            |
   2    , 9     , 3            |
   2    , 11    , 5            |
   1    , 13    , 1            |
   2    , 13    , 3            |
   1    , 14    , 1            )

# Note 1: 
#   Use wrapr::build_frame to type data in directly in legible column order. 

00082_informalexample_5.13_of_section_5.1.3.R

# informalexample 5.13 of section 5.1.3 
# (informalexample 5.13 of section 5.1.3)  : Data engineering and data shaping : Data selection : Ordering rows 

order_index <- with(purchases, order(day, hour))                        	# Note: 1 
  
purchases_ordered <- purchases[order_index, , drop = FALSE]
purchases_ordered$running_total <- cumsum(purchases_ordered$n_purchase) 	# Note: 2 

purchases_ordered
##   day hour n_purchase running_total
## 1   1    9          5             5
## 4   1   13          1             6
## 6   1   14          1             7
## 2   2    9          3            10
## 3   2   11          5            15
## 5   2   13          3            18
##   day hour n_purchase running_total
## 1   1    9          5             5
## 4   1   13          1             6
## 6   1   14          1             7
## 2   2    9          3            10
## 3   2   11          5            15
## 5   2   13          3            18

# Note 1: 
#   with() executes the code in its second argument as if the columns of the first argument were 
#   variables. This lets us write “x” instead of “order$x”. 

# Note 2: 
#   Compute the running sum. 

00083_informalexample_5.14_of_section_5.1.3.R

# informalexample 5.14 of section 5.1.3 
# (informalexample 5.14 of section 5.1.3)  : Data engineering and data shaping : Data selection : Ordering rows 

library("data.table")

DT_purchases <- as.data.table(purchases)

order_cols <- c("day", "hour")            	# Note: 1 
setorderv(DT_purchases, order_cols)

DT_purchases[ , running_total := cumsum(n_purchase)]

# print(DT_purchases)

# Note 1: 
#   Re-order data 

00084_informalexample_5.15_of_section_5.1.3.R

# informalexample 5.15 of section 5.1.3 
# (informalexample 5.15 of section 5.1.3)  : Data engineering and data shaping : Data selection : Ordering rows 

library("dplyr")

res <- purchases %>%
  arrange(., day, hour) %>%
  mutate(., running_total = cumsum(n_purchase))
  
# print(res)

00085_informalexample_5.16_of_section_5.1.3.R

# informalexample 5.16 of section 5.1.3 
# (informalexample 5.16 of section 5.1.3)  : Data engineering and data shaping : Data selection : Ordering rows 

order_index <- with(purchases, order(day, hour))                	# Note: 1 
purchases_ordered <- purchases[order_index, , drop = FALSE]

data_list <- split(purchases_ordered, purchases_ordered$day)    	# Note: 2 

data_list <- lapply(                                            	# Note: 3 
  data_list,
  function(di) {
    di$running_total <- cumsum(di$n_purchase)
    di
  })

purchases_ordered <- do.call(base::rbind, data_list)            	# Note: 4 
rownames(purchases_ordered) <- NULL                             	# Note: 5 

purchases_ordered
##   day hour n_purchase running_total
## 1   1    9          5             5
## 2   1   13          1             6
## 3   1   14          1             7
## 4   2    9          3             3
## 5   2   11          5             8
## 6   2   13          3            11
##   day hour n_purchase running_total
## 1   1    9          5             5
## 2   1   13          1             6
## 3   1   14          1             7
## 4   2    9          3             3
## 5   2   11          5             8
## 6   2   13          3            11

# Note 1: 
#   First: sort the data. 

# Note 2: 
#   Now split data into a list of groups. 

# Note 3: 
#   Apply the cumsum to each group. 

# Note 4: 
#   Put the results back to together into a single data.frame. 

# Note 5: 
#   R often keeps annotations in the rownames(). In this case it is storing the original row 
#   numbers of the pieces we are assembling. This can confuse users when printing, so 
#   it is good practice to remove these annotations as we do here. 

00086_informalexample_5.17_of_section_5.1.3.R

# informalexample 5.17 of section 5.1.3 
# (informalexample 5.17 of section 5.1.3)  : Data engineering and data shaping : Data selection : Ordering rows 

library("data.table")

# new copy for result solution
DT_purchases <- as.data.table(purchases)[order(day, hour), 
             .(hour = hour,
               n_purchase = n_purchase, 
               running_total = cumsum(n_purchase)),
             by = "day"]                      	# Note: 1 
# print(DT_purchases)                             	# Note: 2 

# in-place solution
DT_purchases <- as.data.table(purchases)
order_cols <- c("day", "hour")
setorderv(DT_purchases, order_cols)
DT_purchases[ , running_total := cumsum(n_purchase), by = day]
# print(DT_purchases)                              	# Note: 3 

# don't reorder the actual data variation!
DT_purchases <- as.data.table(purchases)
DT_purchases[order(day, hour), 
             `:=`(hour = hour,
               n_purchase = n_purchase, 
               running_total = cumsum(n_purchase)),
             by = "day"]
# print(DT_purchases)                               	# Note: 4

# Note 1: 
#   Adding the “by” keyword converts the calculation into a per-group calculation.. 

# Note 2: 
#   First solution: result is a second copy of the data .(=) notation. Only columns used in the 
#   calculation (such as “day”) and those explicitly assigned to are in the 
#   result. 

# Note 3: 
#   Second solution: result is computed in-place by ordering the table before the grouped 
#   calculation. 

# Note 4: 
#   Third solution: result is in same order as the original table, but the cumulative some is 
#   computed as if we sorted the table, computed the grouped 
#   running sum, and then returned the table to the original order. 

00087_informalexample_5.18_of_section_5.1.3.R

# informalexample 5.18 of section 5.1.3 
# (informalexample 5.18 of section 5.1.3)  : Data engineering and data shaping : Data selection : Ordering rows 

library("dplyr")

res <- purchases %>%
  arrange(., day, hour) %>%
  group_by(., day) %>%
  mutate(., running_total = cumsum(n_purchase)) %>%
  ungroup(.)

# print(res)

00088_informalexample_5.19_of_section_5.2.1.R

# informalexample 5.19 of section 5.2.1 
# (informalexample 5.19 of section 5.2.1)  : Data engineering and data shaping : Basic data transforms : Add new columns 

library("datasets")
library("ggplot2")

summary(airquality)
##      Ozone           Solar.R           Wind             Temp      
##  Min.   :  1.00   Min.   :  7.0   Min.   : 1.700   Min.   :56.00  
##  1st Qu.: 18.00   1st Qu.:115.8   1st Qu.: 7.400   1st Qu.:72.00  
##  Median : 31.50   Median :205.0   Median : 9.700   Median :79.00  
##  Mean   : 42.13   Mean   :185.9   Mean   : 9.958   Mean   :77.88  
##  3rd Qu.: 63.25   3rd Qu.:258.8   3rd Qu.:11.500   3rd Qu.:85.00  
##  Max.   :168.00   Max.   :334.0   Max.   :20.700   Max.   :97.00  
##  NA's   :37       NA's   :7                                       
##      Month            Day      
##  Min.   :5.000   Min.   : 1.0  
##  1st Qu.:6.000   1st Qu.: 8.0  
##  Median :7.000   Median :16.0  
##  Mean   :6.993   Mean   :15.8  
##  3rd Qu.:8.000   3rd Qu.:23.0  
##  Max.   :9.000   Max.   :31.0  
## 
##      Ozone           Solar.R           Wind             Temp      
##  Min.   :  1.00   Min.   :  7.0   Min.   : 1.700   Min.   :56.00  
##  1st Qu.: 18.00   1st Qu.:115.8   1st Qu.: 7.400   1st Qu.:72.00  
##  Median : 31.50   Median :205.0   Median : 9.700   Median :79.00  
##  Mean   : 42.13   Mean   :185.9   Mean   : 9.958   Mean   :77.88  
##  3rd Qu.: 63.25   3rd Qu.:258.8   3rd Qu.:11.500   3rd Qu.:85.00  
##  Max.   :168.00   Max.   :334.0   Max.   :20.700   Max.   :97.00  
##  NA's   :37       NA's   :7                                       
##      Month            Day      
##  Min.   :5.000   Min.   : 1.0  
##  1st Qu.:6.000   1st Qu.: 8.0  
##  Median :7.000   Median :16.0  
##  Mean   :6.993   Mean   :15.8  
##  3rd Qu.:8.000   3rd Qu.:23.0  
##  Max.   :9.000   Max.   :31.0  
##

00089_informalexample_5.20_of_section_5.2.1.R

# informalexample 5.20 of section 5.2.1 
# (informalexample 5.20 of section 5.2.1)  : Data engineering and data shaping : Basic data transforms : Add new columns 

library("lubridate")
## 
## Attaching package: 'lubridate'
## The following objects are masked from 'package:data.table':
## 
##     hour, isoweek, mday, minute, month, quarter, second, wday,
##     week, yday, year
## The following object is masked from 'package:base':
## 
##     date
library("ggplot2")

# create a function to make the date string.
datestr = function(day, month, year) {
  paste(day, month, year, sep="-")
}

00090_informalexample_5.21_of_section_5.2.1.R

# informalexample 5.21 of section 5.2.1 
# (informalexample 5.21 of section 5.2.1)  : Data engineering and data shaping : Basic data transforms : Add new columns 

airquality_with_date <- airquality                              	# Note: 1 

airquality_with_date$date <- with(airquality_with_date,         	# Note: 2 
                                  dmy(datestr(Day, Month, 1973)))

airquality_with_date <- airquality_with_date[,                 	# Note: 3 
                                             c("Ozone", "date"),
                                             drop = FALSE]

head(airquality_with_date)                                      	# Note: 4 
##   Ozone       date
## 1    41 1973-05-01
## 2    36 1973-05-02
## 3    12 1973-05-03
## 4    18 1973-05-04
## 5    NA 1973-05-05
## 6    28 1973-05-06
##   Ozone       date
## 1    41 1973-05-01
## 2    36 1973-05-02
## 3    12 1973-05-03
## 4    18 1973-05-04
## 5    NA 1973-05-05
## 6    28 1973-05-06        

ggplot(airquality_with_date, aes(x = date, y = Ozone)) +        	# Note: 5 
  geom_point() + 
  geom_line() + 
  xlab("Date") +
  ggtitle("New York ozone readings, May 1 - Sept 30, 1973")
## Warning: Removed 37 rows containing missing values (geom_point).

plot of chunk 00090_informalexample_5.21_of_section_5.2.1.R

# Note 1: 
#   Build a copy of the data. 

# Note 2: 
#   Add the date column, with “with()” to refer to columns without needing the table name. 

# Note 3: 
#   Limit down to columns of interest. 

# Note 4: 
#   Show the results. 

# Note 5: 
#   Plot the results. 

00091_informalexample_5.22_of_section_5.2.1.R

# informalexample 5.22 of section 5.2.1 
# (informalexample 5.22 of section 5.2.1)  : Data engineering and data shaping : Basic data transforms : Add new columns 

library("wrapr")                                 	# Note: 1 
## 
## Attaching package: 'wrapr'
## The following object is masked from 'package:dplyr':
## 
##     coalesce
## The following object is masked from 'package:data.table':
## 
##     :=
airquality %.>%                                    	# Note: 2 
  transform(., date = dmy(datestr(Day, Month, 1973))) %.>%
  subset(., !is.na(Ozone), select =  c("Ozone", "date")) %.>%
  head(.)
##   Ozone       date
## 1    41 1973-05-01
## 2    36 1973-05-02
## 3    12 1973-05-03
## 4    18 1973-05-04
## 6    28 1973-05-06
## 7    23 1973-05-07
##   Ozone       date
## 1    41 1973-05-01
## 2    36 1973-05-02
## 3    12 1973-05-03
## 4    18 1973-05-04
## 6    28 1973-05-06
## 7    23 1973-05-07

# Note 1: 
#   Attach the wrapr package to define the wrapr dot arrow pipe: %.>%. The dot arrow pipe is 
#   another R pipe and is described in the R Journal at 
#   https://journal.r-project.org/archive/2018/RJ-2018-042/index.html 

# Note 2: 
#   Run all the steps as before using transform() and subset(), adding an extra step of filtering 
#   down to rows that do not have missing Ozone values. 

00092_informalexample_5.23_of_section_5.2.1.R

# informalexample 5.23 of section 5.2.1 
# (informalexample 5.23 of section 5.2.1)  : Data engineering and data shaping : Basic data transforms : Add new columns 

library("data.table")

DT_airquality <- 
  as.data.table(airquality)[                        	# Note: 1 
    , date := dmy(datestr(Day, Month, 1973)) ][     	# Note: 2 
      , c("Ozone", "date")]                 	# Note: 3 

head(DT_airquality)
##    Ozone       date
## 1:    41 1973-05-01
## 2:    36 1973-05-02
## 3:    12 1973-05-03
## 4:    18 1973-05-04
## 5:    NA 1973-05-05
## 6:    28 1973-05-06
##    Ozone       date
## 1:    41 1973-05-01
## 2:    36 1973-05-02
## 3:    12 1973-05-03
## 4:    18 1973-05-04
## 5:    NA 1973-05-05
## 6:    28 1973-05-06

# Note 1: 
#   Build a data.table copy of the data. 

# Note 2: 
#   Add the date column. 

# Note 3: 
#   Limit down to columns of interest. 

00093_informalexample_5.24_of_section_5.2.1.R

# informalexample 5.24 of section 5.2.1 
# (informalexample 5.24 of section 5.2.1)  : Data engineering and data shaping : Basic data transforms : Add new columns 

library("dplyr")

airquality_with_date2 <- airquality %>%
  mutate(., date = dmy(datestr(Day, Month, 1973))) %>%
  select(., Ozone, date)

head(airquality_with_date2)
##   Ozone       date
## 1    41 1973-05-01
## 2    36 1973-05-02
## 3    12 1973-05-03
## 4    18 1973-05-04
## 5    NA 1973-05-05
## 6    28 1973-05-06
##   Ozone       date
## 1    41 1973-05-01
## 2    36 1973-05-02
## 3    12 1973-05-03
## 4    18 1973-05-04
## 5    NA 1973-05-05
## 6    28 1973-05-06

00094_informalexample_5.25_of_section_5.2.1.R

# informalexample 5.25 of section 5.2.1 
# (informalexample 5.25 of section 5.2.1)  : Data engineering and data shaping : Basic data transforms : Add new columns 

library("zoo")
## 
## Attaching package: 'zoo'
## The following objects are masked from 'package:base':
## 
##     as.Date, as.Date.numeric
airquality_corrected <- airquality_with_date
airquality_corrected$OzoneCorrected <-
  na.locf(airquality_corrected$Ozone, na.rm = FALSE)

summary(airquality_corrected)
##      Ozone             date            OzoneCorrected  
##  Min.   :  1.00   Min.   :1973-05-01   Min.   :  1.00  
##  1st Qu.: 18.00   1st Qu.:1973-06-08   1st Qu.: 16.00  
##  Median : 31.50   Median :1973-07-16   Median : 30.00  
##  Mean   : 42.13   Mean   :1973-07-16   Mean   : 39.78  
##  3rd Qu.: 63.25   3rd Qu.:1973-08-23   3rd Qu.: 52.00  
##  Max.   :168.00   Max.   :1973-09-30   Max.   :168.00  
##  NA's   :37
##      Ozone             date            OzoneCorrected  
##  Min.   :  1.00   Min.   :1973-05-01   Min.   :  1.00  
##  1st Qu.: 18.00   1st Qu.:1973-06-08   1st Qu.: 16.00  
##  Median : 31.50   Median :1973-07-16   Median : 30.00  
##  Mean   : 42.13   Mean   :1973-07-16   Mean   : 39.78  
##  3rd Qu.: 63.25   3rd Qu.:1973-08-23   3rd Qu.: 52.00  
##  Max.   :168.00   Max.   :1973-09-30   Max.   :168.00  
##  NA's   :37

ggplot(airquality_corrected, aes(x = date, y = Ozone)) + 
  geom_point(aes(y=Ozone)) + 
  geom_line(aes(y=OzoneCorrected)) + 
  ggtitle("New York ozone readings, May 1 - Sept 30, 1973",
          subtitle = "(corrected)") +
  xlab("Date")
## Warning: Removed 37 rows containing missing values (geom_point).

plot of chunk 00094_informalexample_5.25_of_section_5.2.1.R

00095_informalexample_5.26_of_section_5.2.1.R

# informalexample 5.26 of section 5.2.1 
# (informalexample 5.26 of section 5.2.1)  : Data engineering and data shaping : Basic data transforms : Add new columns 

library("data.table")
library("zoo")

DT_airquality[, OzoneCorrected := na.locf(Ozone, na.rm=FALSE)]

summary(DT_airquality)
##      Ozone             date            OzoneCorrected  
##  Min.   :  1.00   Min.   :1973-05-01   Min.   :  1.00  
##  1st Qu.: 18.00   1st Qu.:1973-06-08   1st Qu.: 16.00  
##  Median : 31.50   Median :1973-07-16   Median : 30.00  
##  Mean   : 42.13   Mean   :1973-07-16   Mean   : 39.78  
##  3rd Qu.: 63.25   3rd Qu.:1973-08-23   3rd Qu.: 52.00  
##  Max.   :168.00   Max.   :1973-09-30   Max.   :168.00  
##  NA's   :37
##      Ozone             date            OzoneCorrected  
##  Min.   :  1.00   Min.   :1973-05-01   Min.   :  1.00  
##  1st Qu.: 18.00   1st Qu.:1973-06-08   1st Qu.: 16.00  
##  Median : 31.50   Median :1973-07-16   Median : 30.00  
##  Mean   : 42.13   Mean   :1973-07-16   Mean   : 39.78  
##  3rd Qu.: 63.25   3rd Qu.:1973-08-23   3rd Qu.: 52.00  
##  Max.   :168.00   Max.   :1973-09-30   Max.   :168.00  
##  NA's   :37

00096_informalexample_5.27_of_section_5.2.1.R

# informalexample 5.27 of section 5.2.1 
# (informalexample 5.27 of section 5.2.1)  : Data engineering and data shaping : Basic data transforms : Add new columns 

library("dplyr")
library("zoo")

airquality_with_date %>% 
  mutate(.,
         OzoneCorrected = na.locf(Ozone, na.rm = FALSE)) %>% 
  summary(.)
##      Ozone             date            OzoneCorrected  
##  Min.   :  1.00   Min.   :1973-05-01   Min.   :  1.00  
##  1st Qu.: 18.00   1st Qu.:1973-06-08   1st Qu.: 16.00  
##  Median : 31.50   Median :1973-07-16   Median : 30.00  
##  Mean   : 42.13   Mean   :1973-07-16   Mean   : 39.78  
##  3rd Qu.: 63.25   3rd Qu.:1973-08-23   3rd Qu.: 52.00  
##  Max.   :168.00   Max.   :1973-09-30   Max.   :168.00  
##  NA's   :37
##      Ozone             date            OzoneCorrected  
##  Min.   :  1.00   Min.   :1973-05-01   Min.   :  1.00  
##  1st Qu.: 18.00   1st Qu.:1973-06-08   1st Qu.: 16.00  
##  Median : 31.50   Median :1973-07-16   Median : 30.00  
##  Mean   : 42.13   Mean   :1973-07-16   Mean   : 39.78  
##  3rd Qu.: 63.25   3rd Qu.:1973-08-23   3rd Qu.: 52.00  
##  Max.   :168.00   Max.   :1973-09-30   Max.   :168.00  
##  NA's   :37

00097_informalexample_5.28_of_section_5.2.2.R

# informalexample 5.28 of section 5.2.2 
# (informalexample 5.28 of section 5.2.2)  : Data engineering and data shaping : Basic data transforms : Other simple operations 

d <- data.frame(x = 1:2, y = 3:4)
print(d)
##   x y
## 1 1 3
## 2 2 4
#>   x y
#> 1 1 3
#> 2 2 4

colnames(d) <- c("BIGX", "BIGY")
print(d)
##   BIGX BIGY
## 1    1    3
## 2    2    4
#>   BIGX BIGY
#> 1    1    3
#> 2    2    4

d$BIGX <- NULL
print(d)
##   BIGY
## 1    3
## 2    4
#>   BIGY
#> 1    3
#> 2    4

00098_informalexample_5.29_of_section_5.3.1.R

# informalexample 5.29 of section 5.3.1 
# (informalexample 5.29 of section 5.3.1)  : Data engineering and data shaping : Aggregating transforms : Combining many rows into summary rows 

library("datasets")
library("ggplot2")

head(iris)
##   Sepal.Length Sepal.Width Petal.Length Petal.Width Species
## 1          5.1         3.5          1.4         0.2  setosa
## 2          4.9         3.0          1.4         0.2  setosa
## 3          4.7         3.2          1.3         0.2  setosa
## 4          4.6         3.1          1.5         0.2  setosa
## 5          5.0         3.6          1.4         0.2  setosa
## 6          5.4         3.9          1.7         0.4  setosa
##   Sepal.Length Sepal.Width Petal.Length Petal.Width Species
## 1          5.1         3.5          1.4         0.2  setosa
## 2          4.9         3.0          1.4         0.2  setosa
## 3          4.7         3.2          1.3         0.2  setosa
## 4          4.6         3.1          1.5         0.2  setosa
## 5          5.0         3.6          1.4         0.2  setosa
## 6          5.4         3.9          1.7         0.4  setosa

00099_informalexample_5.30_of_section_5.3.1.R

# informalexample 5.30 of section 5.3.1 
# (informalexample 5.30 of section 5.3.1)  : Data engineering and data shaping : Aggregating transforms : Combining many rows into summary rows 

iris_summary <- aggregate(
  cbind(Petal.Length, Petal.Width) ~ Species, 
  data = iris, 
  FUN = mean)

print(iris_summary)
##      Species Petal.Length Petal.Width
## 1     setosa        1.462       0.246
## 2 versicolor        4.260       1.326
## 3  virginica        5.552       2.026
#      Species Petal.Length Petal.Width
# 1     setosa        1.462       0.246
# 2 versicolor        4.260       1.326
# 3  virginica        5.552       2.026

library(ggplot2)
ggplot(mapping = aes(x = Petal.Length, y = Petal.Width, 
                     shape = Species, color = Species)) + 
  geom_point(data = iris, # raw data
             alpha = 0.5) + 
  geom_point(data = iris_summary, # per-group summaries
             size = 5) +
  ggtitle("Average Petal dimensions by iris species\n(with raw data for reference)")

plot of chunk 00099_informalexample_5.30_of_section_5.3.1.R

00100_informalexample_5.31_of_section_5.3.1.R

# informalexample 5.31 of section 5.3.1 
# (informalexample 5.31 of section 5.3.1)  : Data engineering and data shaping : Aggregating transforms : Combining many rows into summary rows 

library("data.table")

iris_data.table <- as.data.table(iris)
iris_data.table <- iris_data.table[, 
                                   .(Petal.Length = mean(Petal.Length),
                                     Petal.Width = mean(Petal.Width)), 
                                   by = .(Species)]

# print(iris_data.table)

00101_informalexample_5.32_of_section_5.3.1.R

# informalexample 5.32 of section 5.3.1 
# (informalexample 5.32 of section 5.3.1)  : Data engineering and data shaping : Aggregating transforms : Combining many rows into summary rows 

library("dplyr")

iris_summary <- iris %>% group_by(., Species) %>% 
  summarize(.,
            Petal.Length = mean(Petal.Length),
            Petal.Width = mean(Petal.Width)) %>%
  ungroup(.) 

# print(iris_summary)

00102_informalexample_5.33_of_section_5.3.1.R

# informalexample 5.33 of section 5.3.1 
# (informalexample 5.33 of section 5.3.1)  : Data engineering and data shaping : Aggregating transforms : Combining many rows into summary rows 

iris_copy <- iris
iris_copy$mean_Petal.Length <- ave(iris$Petal.Length, iris$Species, FUN = mean)
iris_copy$mean_Petal.Width <- ave(iris$Petal.Width, iris$Species, FUN = mean)

# head(iris_copy)
# tail(iris_copy)

00103_informalexample_5.34_of_section_5.3.1.R

# informalexample 5.34 of section 5.3.1 
# (informalexample 5.34 of section 5.3.1)  : Data engineering and data shaping : Aggregating transforms : Combining many rows into summary rows 

library("data.table")

iris_data.table <- as.data.table(iris)

iris_data.table[ , 
                 `:=`(mean_Petal.Length = mean(Petal.Length),
                      mean_Petal.Width = mean(Petal.Width)), 
                 by = "Species"]

# print(iris_data.table)

00104_informalexample_5.35_of_section_5.3.1.R

# informalexample 5.35 of section 5.3.1 
# (informalexample 5.35 of section 5.3.1)  : Data engineering and data shaping : Aggregating transforms : Combining many rows into summary rows 

library("dplyr")

iris_dplyr <- iris %>% 
  group_by(., Species) %>% 
  mutate(.,
         mean_Petal.Length = mean(Petal.Length),
         mean_Petal.Width = mean(Petal.Width)) %>%
  ungroup(.)

# head(iris_dplyr)

00105_informalexample_5.36_of_section_5.4.1.R

# informalexample 5.36 of section 5.4.1 
# (informalexample 5.36 of section 5.4.1)  : Data engineering and data shaping : Multi-table data transforms : Combining two or more ordered data frames quickly 

productTable <- wrapr::build_frame(
   "productID", "price" |
   "p1"       , 9.99    |
   "p2"       , 16.29   |
   "p3"       , 19.99   |
   "p4"       , 5.49    |
   "p5"       , 24.49   )


salesTable <- wrapr::build_frame(
   "productID", "sold_store", "sold_online" |
   "p1"       , 6           , 64            |
   "p2"       , 31          , 1             |
   "p3"       , 30          , 23            |
   "p4"       , 31          , 67            |
   "p5"       , 43          , 51            )

productTable2 <- wrapr::build_frame(
   "productID", "price" |
   "n1"       , 25.49   |
   "n2"       , 33.99   |
   "n3"       , 17.99   )

productTable$productID <- factor(productTable$productID)
productTable2$productID <- factor(productTable2$productID)

00106_informalexample_5.37_of_section_5.4.1.R

# informalexample 5.37 of section 5.4.1 
# (informalexample 5.37 of section 5.4.1)  : Data engineering and data shaping : Multi-table data transforms : Combining two or more ordered data frames quickly 

rbind_base = rbind(productTable, 
                   productTable2)

00107_informalexample_5.38_of_section_5.4.1.R

# informalexample 5.38 of section 5.4.1 
# (informalexample 5.38 of section 5.4.1)  : Data engineering and data shaping : Multi-table data transforms : Combining two or more ordered data frames quickly 

str(rbind_base)
## 'data.frame':	8 obs. of  2 variables:
##  $ productID: Factor w/ 8 levels "p1","p2","p3",..: 1 2 3 4 5 6 7 8
##  $ price    : num  9.99 16.29 19.99 5.49 24.49 ...
## 'data.frame':    8 obs. of  2 variables:
##  $ productID: Factor w/ 8 levels "p1","p2","p3",..: 1 2 3 4 5 6 7 8
##  $ price    : num  9.99 16.29 19.99 5.49 24.49 ...

00108_informalexample_5.39_of_section_5.4.1.R

# informalexample 5.39 of section 5.4.1 
# (informalexample 5.39 of section 5.4.1)  : Data engineering and data shaping : Multi-table data transforms : Combining two or more ordered data frames quickly 

library("data.table")

rbindlist(list(productTable, 
               productTable2))
##    productID price
## 1:        p1  9.99
## 2:        p2 16.29
## 3:        p3 19.99
## 4:        p4  5.49
## 5:        p5 24.49
## 6:        n1 25.49
## 7:        n2 33.99
## 8:        n3 17.99
##    productID price
## 1:        p1  9.99
## 2:        p2 16.29
## 3:        p3 19.99
## 4:        p4  5.49
## 5:        p5 24.49
## 6:        n1 25.49
## 7:        n2 33.99
## 8:        n3 17.99

00109_informalexample_5.40_of_section_5.4.1.R

# informalexample 5.40 of section 5.4.1 
# (informalexample 5.40 of section 5.4.1)  : Data engineering and data shaping : Multi-table data transforms : Combining two or more ordered data frames quickly 

library("dplyr")

bind_rows(list(productTable, 
               productTable2))
## Warning in bind_rows_(x, .id): Unequal factor levels: coercing to character
## Warning in bind_rows_(x, .id): binding character and factor vector,
## coercing into character vector

## Warning in bind_rows_(x, .id): binding character and factor vector,
## coercing into character vector
##   productID price
## 1        p1  9.99
## 2        p2 16.29
## 3        p3 19.99
## 4        p4  5.49
## 5        p5 24.49
## 6        n1 25.49
## 7        n2 33.99
## 8        n3 17.99
## Warning in bind_rows_(x, .id): Unequal factor levels: coercing to character

## Warning in bind_rows_(x, .id): binding character and factor vector,
## coercing into character vector

## Warning in bind_rows_(x, .id): binding character and factor vector,
## coercing into character vector

##   productID price
## 1        p1  9.99
## 2        p2 16.29
## 3        p3 19.99
## 4        p4  5.49
## 5        p5 24.49
## 6        n1 25.49
## 7        n2 33.99
## 8        n3 17.99

00110_informalexample_5.41_of_section_5.4.1.R

# informalexample 5.41 of section 5.4.1 
# (informalexample 5.41 of section 5.4.1)  : Data engineering and data shaping : Multi-table data transforms : Combining two or more ordered data frames quickly 

# add an extra column telling us which table
# each row comes from
productTable_marked <- productTable
productTable_marked$table <- "productTable"
productTable2_marked <- productTable2
productTable2_marked$table <- "productTable2"

# combine the tables
rbind_base <- rbind(productTable_marked, 
                    productTable2_marked)
rbind_base
##   productID price         table
## 1        p1  9.99  productTable
## 2        p2 16.29  productTable
## 3        p3 19.99  productTable
## 4        p4  5.49  productTable
## 5        p5 24.49  productTable
## 6        n1 25.49 productTable2
## 7        n2 33.99 productTable2
## 8        n3 17.99 productTable2
##   productID price         table
## 1        p1  9.99  productTable
## 2        p2 16.29  productTable
## 3        p3 19.99  productTable
## 4        p4  5.49  productTable
## 5        p5 24.49  productTable
## 6        n1 25.49 productTable2
## 7        n2 33.99 productTable2
## 8        n3 17.99 productTable2

# split them apart
tables <- split(rbind_base, rbind_base$table)
tables
## $productTable
##   productID price        table
## 1        p1  9.99 productTable
## 2        p2 16.29 productTable
## 3        p3 19.99 productTable
## 4        p4  5.49 productTable
## 5        p5 24.49 productTable
## 
## $productTable2
##   productID price         table
## 6        n1 25.49 productTable2
## 7        n2 33.99 productTable2
## 8        n3 17.99 productTable2
## $productTable
##   productID price        table
## 1        p1  9.99 productTable
## 2        p2 16.29 productTable
## 3        p3 19.99 productTable
## 4        p4  5.49 productTable
## 5        p5 24.49 productTable
## 
## $productTable2
##   productID price         table
## 6        n1 25.49 productTable2
## 7        n2 33.99 productTable2
## 8        n3 17.99 productTable2

00111_informalexample_5.42_of_section_5.4.1.R

# informalexample 5.42 of section 5.4.1 
# (informalexample 5.42 of section 5.4.1)  : Data engineering and data shaping : Multi-table data transforms : Combining two or more ordered data frames quickly 

library("data.table")

# convert to data.table
dt <- as.data.table(rbind_base)

# arbitrary user defined function
f <- function(.BY, .SD) {
  max(.SD$price)
}

# apply the function to each group
# and collect results
dt[ , max_price := f(.BY, .SD), by = table]

print(dt)
##    productID price         table max_price
## 1:        p1  9.99  productTable     24.49
## 2:        p2 16.29  productTable     24.49
## 3:        p3 19.99  productTable     24.49
## 4:        p4  5.49  productTable     24.49
## 5:        p5 24.49  productTable     24.49
## 6:        n1 25.49 productTable2     33.99
## 7:        n2 33.99 productTable2     33.99
## 8:        n3 17.99 productTable2     33.99
##    productID price         table max_price
## 1:        p1  9.99  productTable     24.49
## 2:        p2 16.29  productTable     24.49
## 3:        p3 19.99  productTable     24.49
## 4:        p4  5.49  productTable     24.49
## 5:        p5 24.49  productTable     24.49
## 6:        n1 25.49 productTable2     33.99
## 7:        n2 33.99 productTable2     33.99
## 8:        n3 17.99 productTable2     33.99

00112_informalexample_5.43_of_section_5.4.1.R

# informalexample 5.43 of section 5.4.1 
# (informalexample 5.43 of section 5.4.1)  : Data engineering and data shaping : Multi-table data transforms : Combining two or more ordered data frames quickly 

library("data.table")

dt <- as.data.table(rbind_base)
grouping_column <- "table"
dt[ , max_price := max(price), by = eval(grouping_column)]

print(dt)
##    productID price         table max_price
## 1:        p1  9.99  productTable     24.49
## 2:        p2 16.29  productTable     24.49
## 3:        p3 19.99  productTable     24.49
## 4:        p4  5.49  productTable     24.49
## 5:        p5 24.49  productTable     24.49
## 6:        n1 25.49 productTable2     33.99
## 7:        n2 33.99 productTable2     33.99
## 8:        n3 17.99 productTable2     33.99
##    productID price         table max_price
## 1:        p1  9.99  productTable     24.49
## 2:        p2 16.29  productTable     24.49
## 3:        p3 19.99  productTable     24.49
## 4:        p4  5.49  productTable     24.49
## 5:        p5 24.49  productTable     24.49
## 6:        n1 25.49 productTable2     33.99
## 7:        n2 33.99 productTable2     33.99
## 8:        n3 17.99 productTable2     33.99

00113_informalexample_5.44_of_section_5.4.1.R

# informalexample 5.44 of section 5.4.1 
# (informalexample 5.44 of section 5.4.1)  : Data engineering and data shaping : Multi-table data transforms : Combining two or more ordered data frames quickly 

rbind_base %>%
  group_by(., table) %>%
  mutate(., max_price = max(price)) %>%
  ungroup(.)
## # A tibble: 8 x 4
##   productID price table         max_price
##   <fct>     <dbl> <chr>             <dbl>
## 1 p1         9.99 productTable       24.5
## 2 p2        16.3  productTable       24.5
## 3 p3        20.0  productTable       24.5
## 4 p4         5.49 productTable       24.5
## 5 p5        24.5  productTable       24.5
## 6 n1        25.5  productTable2      34.0
## 7 n2        34.0  productTable2      34.0
## 8 n3        18.0  productTable2      34.0
## # A tibble: 8 x 4
##   productID price table         max_price
##   <fct>     <dbl> <chr>             <dbl>
## 1 p1         9.99 productTable       24.5
## 2 p2        16.3  productTable       24.5
## 3 p3        20.0  productTable       24.5
## 4 p4         5.49 productTable       24.5
## 5 p5        24.5  productTable       24.5
## 6 n1        25.5  productTable2      34.0
## 7 n2        34.0  productTable2      34.0
## 8 n3        18.0  productTable2      34.0

00114_informalexample_5.45_of_section_5.4.1.R

# informalexample 5.45 of section 5.4.1 
# (informalexample 5.45 of section 5.4.1)  : Data engineering and data shaping : Multi-table data transforms : Combining two or more ordered data frames quickly 

cbind(productTable, salesTable[, -1])
##   productID price sold_store sold_online
## 1        p1  9.99          6          64
## 2        p2 16.29         31           1
## 3        p3 19.99         30          23
## 4        p4  5.49         31          67
## 5        p5 24.49         43          51
##   productID price sold_store sold_online
## 1        p1  9.99          6          64
## 2        p2 16.29         31           1
## 3        p3 19.99         30          23
## 4        p4  5.49         31          67
## 5        p5 24.49         43          51

00115_informalexample_5.46_of_section_5.4.1.R

# informalexample 5.46 of section 5.4.1 
# (informalexample 5.46 of section 5.4.1)  : Data engineering and data shaping : Multi-table data transforms : Combining two or more ordered data frames quickly 

library("data.table")

cbind(as.data.table(productTable), 
      as.data.table(salesTable[, -1]))
##    productID price sold_store sold_online
## 1:        p1  9.99          6          64
## 2:        p2 16.29         31           1
## 3:        p3 19.99         30          23
## 4:        p4  5.49         31          67
## 5:        p5 24.49         43          51
##    productID price sold_store sold_online
## 1:        p1  9.99          6          64
## 2:        p2 16.29         31           1
## 3:        p3 19.99         30          23
## 4:        p4  5.49         31          67
## 5:        p5 24.49         43          51

00116_informalexample_5.47_of_section_5.4.1.R

# informalexample 5.47 of section 5.4.1 
# (informalexample 5.47 of section 5.4.1)  : Data engineering and data shaping : Multi-table data transforms : Combining two or more ordered data frames quickly 

library("dplyr")

# list of data frames calling convention
dplyr::bind_cols(list(productTable, salesTable[, -1]))
##   productID price sold_store sold_online
## 1        p1  9.99          6          64
## 2        p2 16.29         31           1
## 3        p3 19.99         30          23
## 4        p4  5.49         31          67
## 5        p5 24.49         43          51
##   productID price sold_store sold_online
## 1        p1  9.99          6          64
## 2        p2 16.29         31           1
## 3        p3 19.99         30          23
## 4        p4  5.49         31          67
## 5        p5 24.49         43          51

00117_informalexample_5.48_of_section_5.4.2.R

# informalexample 5.48 of section 5.4.2 
# (informalexample 5.48 of section 5.4.2)  : Data engineering and data shaping : Multi-table data transforms : Principled methods to combine data from multiple tables 

productTable <- wrapr::build_frame(
   "productID", "price" |
   "p1"       , 9.99    |
   "p3"       , 19.99   |
   "p4"       , 5.49    |
   "p5"       , 24.49   )

salesTable <- wrapr::build_frame(
   "productID", "unitsSold" |
   "p1"       , 10          |
   "p2"       , 43          |
   "p3"       , 55          |
   "p4"       , 8           )

00118_informalexample_5.49_of_section_5.4.2.R

# informalexample 5.49 of section 5.4.2 
# (informalexample 5.49 of section 5.4.2)  : Data engineering and data shaping : Multi-table data transforms : Principled methods to combine data from multiple tables 

merge(productTable, salesTable, by = "productID", all.x = TRUE)
##   productID price unitsSold
## 1        p1  9.99        10
## 2        p3 19.99        55
## 3        p4  5.49         8
## 4        p5 24.49        NA
##   productID price unitsSold
## 1        p1  9.99        10
## 2        p3 19.99        55
## 3        p4  5.49         8
## 4        p5 24.49        NA

00119_informalexample_5.50_of_section_5.4.2.R

# informalexample 5.50 of section 5.4.2 
# (informalexample 5.50 of section 5.4.2)  : Data engineering and data shaping : Multi-table data transforms : Principled methods to combine data from multiple tables 

library("data.table")

productTable_data.table <- as.data.table(productTable)
salesTable_data.table <- as.data.table(salesTable)

# index notation for join
# idea is rows are produced for each row inside the []
salesTable_data.table[productTable_data.table, on = "productID"]
##    productID unitsSold price
## 1:        p1        10  9.99
## 2:        p3        55 19.99
## 3:        p4         8  5.49
## 4:        p5        NA 24.49
##    productID unitsSold price
## 1:        p1        10  9.99
## 2:        p3        55 19.99
## 3:        p4         8  5.49
## 4:        p5        NA 24.49

# data.table also overrides merge()
merge(productTable, salesTable, by = "productID", all.x = TRUE)
##   productID price unitsSold
## 1        p1  9.99        10
## 2        p3 19.99        55
## 3        p4  5.49         8
## 4        p5 24.49        NA
##   productID price unitsSold
## 1        p1  9.99        10
## 2        p3 19.99        55
## 3        p4  5.49         8
## 4        p5 24.49        NA

00120_informalexample_5.51_of_section_5.4.2.R

# informalexample 5.51 of section 5.4.2 
# (informalexample 5.51 of section 5.4.2)  : Data engineering and data shaping : Multi-table data transforms : Principled methods to combine data from multiple tables 

library("data.table")

joined_table <- productTable
joined_table$unitsSold <- salesTable$unitsSold[match(joined_table$productID, 
                                                    salesTable$productID)]
print(joined_table)
##   productID price unitsSold
## 1        p1  9.99        10
## 2        p3 19.99        55
## 3        p4  5.49         8
## 4        p5 24.49        NA
##   productID price unitsSold
## 1        p1  9.99        10
## 2        p3 19.99        55
## 3        p4  5.49         8
## 4        p5 24.49        NA

00121_informalexample_5.52_of_section_5.4.2.R

# informalexample 5.52 of section 5.4.2 
# (informalexample 5.52 of section 5.4.2)  : Data engineering and data shaping : Multi-table data transforms : Principled methods to combine data from multiple tables 

library("dplyr")

left_join(productTable, salesTable, by = "productID")
##   productID price unitsSold
## 1        p1  9.99        10
## 2        p3 19.99        55
## 3        p4  5.49         8
## 4        p5 24.49        NA
##   productID price unitsSold
## 1        p1  9.99        10
## 2        p3 19.99        55
## 3        p4  5.49         8
## 4        p5 24.49        NA

00122_informalexample_5.53_of_section_5.4.2.R

# informalexample 5.53 of section 5.4.2 
# (informalexample 5.53 of section 5.4.2)  : Data engineering and data shaping : Multi-table data transforms : Principled methods to combine data from multiple tables 

merge(productTable, salesTable, by = "productID")
##   productID price unitsSold
## 1        p1  9.99        10
## 2        p3 19.99        55
## 3        p4  5.49         8
##   productID price unitsSold
## 1        p1  9.99        10
## 2        p3 19.99        55
## 3        p4  5.49         8

00123_informalexample_5.54_of_section_5.4.2.R

# informalexample 5.54 of section 5.4.2 
# (informalexample 5.54 of section 5.4.2)  : Data engineering and data shaping : Multi-table data transforms : Principled methods to combine data from multiple tables 

library("data.table")

productTable_data.table <- as.data.table(productTable)
salesTable_data.table <- as.data.table(salesTable)

merge(productTable, salesTable, by = "productID")
##   productID price unitsSold
## 1        p1  9.99        10
## 2        p3 19.99        55
## 3        p4  5.49         8
##   productID price unitsSold
## 1        p1  9.99        10
## 2        p3 19.99        55
## 3        p4  5.49         8

00124_informalexample_5.55_of_section_5.4.2.R

# informalexample 5.55 of section 5.4.2 
# (informalexample 5.55 of section 5.4.2)  : Data engineering and data shaping : Multi-table data transforms : Principled methods to combine data from multiple tables 

library("dplyr")

inner_join(productTable, salesTable, by = "productID")
##   productID price unitsSold
## 1        p1  9.99        10
## 2        p3 19.99        55
## 3        p4  5.49         8
##   productID price unitsSold
## 1        p1  9.99        10
## 2        p3 19.99        55
## 3        p4  5.49         8

00125_informalexample_5.56_of_section_5.4.2.R

# informalexample 5.56 of section 5.4.2 
# (informalexample 5.56 of section 5.4.2)  : Data engineering and data shaping : Multi-table data transforms : Principled methods to combine data from multiple tables 

# note that merge orders the result by key column by default
# use sort=FALSE to skip the sorting
merge(productTable, salesTable, by = "productID", all=TRUE)
##   productID price unitsSold
## 1        p1  9.99        10
## 2        p2    NA        43
## 3        p3 19.99        55
## 4        p4  5.49         8
## 5        p5 24.49        NA
##   productID price unitsSold
## 1        p1  9.99        10
## 2        p2    NA        43
## 3        p3 19.99        55
## 4        p4  5.49         8
## 5        p5 24.49        NA

00126_informalexample_5.57_of_section_5.4.2.R

# informalexample 5.57 of section 5.4.2 
# (informalexample 5.57 of section 5.4.2)  : Data engineering and data shaping : Multi-table data transforms : Principled methods to combine data from multiple tables 

library("data.table")

productTable_data.table <- as.data.table(productTable)
salesTable_data.table <- as.data.table(salesTable)

merge(productTable_data.table, salesTable_data.table, 
      by = "productID", all = TRUE)
##    productID price unitsSold
## 1:        p1  9.99        10
## 2:        p2    NA        43
## 3:        p3 19.99        55
## 4:        p4  5.49         8
## 5:        p5 24.49        NA
##    productID price unitsSold
## 1:        p1  9.99        10
## 2:        p2    NA        43
## 3:        p3 19.99        55
## 4:        p4  5.49         8
## 5:        p5 24.49        NA

00127_informalexample_5.58_of_section_5.4.2.R

# informalexample 5.58 of section 5.4.2 
# (informalexample 5.58 of section 5.4.2)  : Data engineering and data shaping : Multi-table data transforms : Principled methods to combine data from multiple tables 

library("dplyr")

full_join(productTable, salesTable, by = "productID")
##   productID price unitsSold
## 1        p1  9.99        10
## 2        p3 19.99        55
## 3        p4  5.49         8
## 4        p5 24.49        NA
## 5        p2    NA        43
##   productID price unitsSold
## 1        p1  9.99        10
## 2        p3 19.99        55
## 3        p4  5.49         8
## 4        p5 24.49        NA
## 5        p2    NA        43

00128_informalexample_5.59_of_section_5.4.2.R

# informalexample 5.59 of section 5.4.2 
# (informalexample 5.59 of section 5.4.2)  : Data engineering and data shaping : Multi-table data transforms : Principled methods to combine data from multiple tables 

library("data.table")

quotes <- data.table(
  bid = c(5, 5, 7, 8),
  ask = c(6, 6, 8, 10),
  bid_quantity = c(100, 100, 100, 100),
  ask_quantity = c(100, 100, 100, 100),
  when = as.POSIXct(strptime(
    c("2018-10-18 1:03:17", 
      "2018-10-18 2:12:23", 
      "2018-10-18 2:15:00", 
      "2018-10-18 2:17:51"), 
    "%Y-%m-%d %H:%M:%S")))

print(quotes)
##    bid ask bid_quantity ask_quantity                when
## 1:   5   6          100          100 2018-10-18 01:03:17
## 2:   5   6          100          100 2018-10-18 02:12:23
## 3:   7   8          100          100 2018-10-18 02:15:00
## 4:   8  10          100          100 2018-10-18 02:17:51
##    bid ask bid_quantity ask_quantity                when
## 1:   5   6          100          100 2018-10-18 01:03:17
## 2:   5   6          100          100 2018-10-18 02:12:23
## 3:   7   8          100          100 2018-10-18 02:15:00
## 4:   8  10          100          100 2018-10-18 02:17:51

00129_informalexample_5.60_of_section_5.4.2.R

# informalexample 5.60 of section 5.4.2 
# (informalexample 5.60 of section 5.4.2)  : Data engineering and data shaping : Multi-table data transforms : Principled methods to combine data from multiple tables 

trades <- data.table(
  trade_id = c(32525, 32526),
  price = c(5.5, 9),
  quantity = c(100, 200),
  when = as.POSIXct(strptime(
    c("2018-10-18 2:13:42", 
      "2018-10-18 2:19:20"), 
    "%Y-%m-%d %H:%M:%S")))

print(trades)
##    trade_id price quantity                when
## 1:    32525   5.5      100 2018-10-18 02:13:42
## 2:    32526   9.0      200 2018-10-18 02:19:20
##    trade_id price quantity                when
## 1:    32525   5.5      100 2018-10-18 02:13:42
## 2:    32526   9.0      200 2018-10-18 02:19:20

00130_informalexample_5.61_of_section_5.4.2.R

# informalexample 5.61 of section 5.4.2 
# (informalexample 5.61 of section 5.4.2)  : Data engineering and data shaping : Multi-table data transforms : Principled methods to combine data from multiple tables 

quotes[, quote_time := when]
trades[ , trade_time := when ]
quotes[ trades, on = "when", roll = TRUE ][
  , .(quote_time, bid, price, ask, trade_id, trade_time) ]
##             quote_time bid price ask trade_id          trade_time
## 1: 2018-10-18 02:12:23   5   5.5   6    32525 2018-10-18 02:13:42
## 2: 2018-10-18 02:17:51   8   9.0  10    32526 2018-10-18 02:19:20
##             quote_time bid price ask trade_id          trade_time
## 1: 2018-10-18 02:12:23   5   5.5   6    32525 2018-10-18 02:13:42
## 2: 2018-10-18 02:17:51   8   9.0  10    32526 2018-10-18 02:19:20

00131_informalexample_5.62_of_section_5.5.1.R

# informalexample 5.62 of section 5.5.1 
# (informalexample 5.62 of section 5.5.1)  : Data engineering and data shaping : Reshaping transforms : Moving data from wide to tall form 

library("datasets")
library("xts")
## Registered S3 method overwritten by 'xts':
##   method     from
##   as.zoo.xts zoo
## 
## Attaching package: 'xts'
## The following objects are masked from 'package:dplyr':
## 
##     first, last
## The following objects are masked from 'package:data.table':
## 
##     first, last
# move the date index into a column
dates <- index(as.xts(time(Seatbelts)))
Seatbelts <- data.frame(Seatbelts) 
Seatbelts$date <- dates
  
# restrict down to 1982 and 1983
Seatbelts <- Seatbelts[ (Seatbelts$date >= as.yearmon("Jan 1982")) &
                          (Seatbelts$date <= as.yearmon("Dec 1983")),
                           , drop = FALSE]
Seatbelts$date <- as.Date(Seatbelts$date)
# mark if the seatbelt law was in effect
Seatbelts$law <- ifelse(Seatbelts$law==1, "new law", "pre-law")
# limit down to the columns we want
Seatbelts <- Seatbelts[, c("date", "DriversKilled", "front", "rear", "law")]

head(Seatbelts)
##           date DriversKilled front rear     law
## 157 1982-01-01           115   595  238 pre-law
## 158 1982-02-01           104   673  285 pre-law
## 159 1982-03-01           131   660  324 pre-law
## 160 1982-04-01           108   676  346 pre-law
## 161 1982-05-01           103   755  410 pre-law
## 162 1982-06-01           115   815  411 pre-law
##           date DriversKilled front rear     law
## 157 1982-01-01           115   595  238 pre-law
## 158 1982-02-01           104   673  285 pre-law
## 159 1982-03-01           131   660  324 pre-law
## 160 1982-04-01           108   676  346 pre-law
## 161 1982-05-01           103   755  410 pre-law
## 162 1982-06-01           115   815  411 pre-law

00132_informalexample_5.63_of_section_5.5.1.R

# informalexample 5.63 of section 5.5.1 
# (informalexample 5.63 of section 5.5.1)  : Data engineering and data shaping : Reshaping transforms : Moving data from wide to tall form 

# let's give an example of the kind of graph we have in mind, using just driver deaths
library("ggplot2")

ggplot(Seatbelts, 
       aes(x = date, y = DriversKilled, color = law, shape = law)) + 
  geom_point() + 
  geom_smooth(se=FALSE) + 
  ggtitle("UK car driver deaths by month")
## `geom_smooth()` using method = 'loess' and formula 'y ~ x'

plot of chunk 00132_informalexample_5.63_of_section_5.5.1.R

00133_informalexample_5.64_of_section_5.5.1.R

# informalexample 5.64 of section 5.5.1 
# (informalexample 5.64 of section 5.5.1)  : Data engineering and data shaping : Reshaping transforms : Moving data from wide to tall form 

library("data.table")

seatbelts_long2 <- 
  melt.data.table(as.data.table(Seatbelts),
                  id.vars = NULL,
                  measure.vars = c("DriversKilled", "front", "rear"),
                  variable.name = "victim_type", 
                  value.name = "nvictims")

00134_informalexample_5.65_of_section_5.5.1.R

# informalexample 5.65 of section 5.5.1 
# (informalexample 5.65 of section 5.5.1)  : Data engineering and data shaping : Reshaping transforms : Moving data from wide to tall form 

library("cdata")

seatbelts_long3 <- unpivot_to_blocks(
  Seatbelts, 
  nameForNewKeyColumn = "victim_type", 
  nameForNewValueColumn = "nvictims", 
  columnsToTakeFrom = c("DriversKilled", "front", "rear"))

00135_informalexample_5.66_of_section_5.5.1.R

# informalexample 5.66 of section 5.5.1 
# (informalexample 5.66 of section 5.5.1)  : Data engineering and data shaping : Reshaping transforms : Moving data from wide to tall form 

library("tidyr")

seatbelts_long1 <- gather(
  Seatbelts, 
  key = victim_type, 
  value = nvictims, 
  DriversKilled, front, rear)

head(seatbelts_long1)
##         date     law   victim_type nvictims
## 1 1982-01-01 pre-law DriversKilled      115
## 2 1982-02-01 pre-law DriversKilled      104
## 3 1982-03-01 pre-law DriversKilled      131
## 4 1982-04-01 pre-law DriversKilled      108
## 5 1982-05-01 pre-law DriversKilled      103
## 6 1982-06-01 pre-law DriversKilled      115
##         date     law   victim_type nvictims
## 1 1982-01-01 pre-law DriversKilled      115
## 2 1982-02-01 pre-law DriversKilled      104
## 3 1982-03-01 pre-law DriversKilled      131
## 4 1982-04-01 pre-law DriversKilled      108
## 5 1982-05-01 pre-law DriversKilled      103
## 6 1982-06-01 pre-law DriversKilled      115

ggplot(seatbelts_long1, 
       aes(x = date, y = nvictims, color = law, shape = law)) + 
  geom_point() + 
  geom_smooth(se=FALSE) + 
  facet_wrap(~victim_type, ncol=1, scale="free_y") +  
  ggtitle("UK auto fatalities by month and seating position")
## `geom_smooth()` using method = 'loess' and formula 'y ~ x'

plot of chunk 00135_informalexample_5.66_of_section_5.5.1.R

00136_informalexample_5.67_of_section_5.5.2.R

# informalexample 5.67 of section 5.5.2 
# (informalexample 5.67 of section 5.5.2)  : Data engineering and data shaping : Reshaping transforms : Moving data from tall to wide form 

library("datasets")
library("data.table")
library("ggplot2")

ChickWeight <- data.frame(ChickWeight) # get rid of attributes
ChickWeight$Diet <- NULL # remove the diet label
# pad names with zeros
padz <- function(x, n=max(nchar(x))) gsub(" ", "0", formatC(x, width=n)) 
# append "Chick" to the chick ids
ChickWeight$Chick <- paste0("Chick", padz(as.character(ChickWeight$Chick)))

head(ChickWeight)
##   weight Time   Chick
## 1     42    0 Chick01
## 2     51    2 Chick01
## 3     59    4 Chick01
## 4     64    6 Chick01
## 5     76    8 Chick01
## 6     93   10 Chick01
##   weight Time   Chick
## 1     42    0 Chick01
## 2     51    2 Chick01
## 3     59    4 Chick01
## 4     64    6 Chick01
## 5     76    8 Chick01
## 6     93   10 Chick01

00137_informalexample_5.68_of_section_5.5.2.R

# informalexample 5.68 of section 5.5.2 
# (informalexample 5.68 of section 5.5.2)  : Data engineering and data shaping : Reshaping transforms : Moving data from tall to wide form 

# aggregate count and mean weight by time
ChickSummary <- as.data.table(ChickWeight)
ChickSummary <- ChickSummary[, 
             .(count = .N, 
               weight = mean(weight),
               q1_weight = quantile(weight, probs = 0.25),
               q2_weight = quantile(weight, probs = 0.75)), 
             by = Time]
head(ChickSummary)
##    Time count    weight q1_weight q2_weight
## 1:    0    50  41.06000        41        42
## 2:    2    50  49.22000        48        51
## 3:    4    49  59.95918        57        63
## 4:    6    49  74.30612        68        80
## 5:    8    49  91.24490        83       102
## 6:   10    49 107.83673        93       124
##    Time count    weight q1_weight q2_weight
## 1:    0    50  41.06000        41        42
## 2:    2    50  49.22000        48        51
## 3:    4    49  59.95918        57        63
## 4:    6    49  74.30612        68        80
## 5:    8    49  91.24490        83       102
## 6:   10    49 107.83673        93       124

00138_informalexample_5.69_of_section_5.5.2.R

# informalexample 5.69 of section 5.5.2 
# (informalexample 5.69 of section 5.5.2)  : Data engineering and data shaping : Reshaping transforms : Moving data from tall to wide form 

library("ggplot2")

ChickSummary <- cdata::unpivot_to_blocks(                              	# Note: 1 
  ChickSummary,
  nameForNewKeyColumn = "measurement",
  nameForNewValueColumn = "value",
  columnsToTakeFrom = c("count", "weight"))
## Warning in data.table::melt.data.table(data = data, id.vars =
## columnsToCopy, : 'measure.vars' [count, weight] are not all of the same
## type. By order of hierarchy, the molten data value column will be of type
## 'double'. All measure variables not of type 'double' will be coerced too.
## Check DETAILS in ?melt.data.table for more on coercion.
ChickSummary$q1_weight[ChickSummary$measurement=="count"] <- NA    	# Note: 2 
ChickSummary$q2_weight[ChickSummary$measurement=="count"] <- NA
CW <- ChickWeight
CW$measurement <- "weight"

ggplot(ChickSummary, aes(x = Time, y = value, color = measurement)) +   	# Note: 3 
  geom_line(data = CW, aes(x = Time, y = weight, group = Chick),
            color="LightGray") +
  geom_line(size=2) + 
  geom_ribbon(aes(ymin = q1_weight, ymax = q2_weight), 
              alpha = 0.3, colour = NA) +
  facet_wrap(~measurement, ncol=1, scales = "free_y") +
  theme(legend.position = "none") +
  ylab(NULL) +
  ggtitle("Chick Weight and Count Measurements by Time",
          subtitle = "25% through 75% quartiles of weight shown shaded around mean")

plot of chunk 00138_informalexample_5.69_of_section_5.5.2.R

# Note 1: 
#   Unpivot into tall form for plotting. 

# Note 2: 
#   Make sure we have the exact set of columns needed for plotting. 

# Note 3: 
#   Make the plot. 

00139_informalexample_5.70_of_section_5.5.2.R

# informalexample 5.70 of section 5.5.2 
# (informalexample 5.70 of section 5.5.2)  : Data engineering and data shaping : Reshaping transforms : Moving data from tall to wide form 

library("data.table")

ChickWeight_wide2 <- dcast.data.table(
  as.data.table(ChickWeight), 
  Chick ~ Time,
  value.var = "weight")

00140_informalexample_5.71_of_section_5.5.2.R

# informalexample 5.71 of section 5.5.2 
# (informalexample 5.71 of section 5.5.2)  : Data engineering and data shaping : Reshaping transforms : Moving data from tall to wide form 

library("cdata")

ChickWeight_wide3 <- pivot_to_rowrecs(
  ChickWeight, 
  columnToTakeKeysFrom = "Time", 
  columnToTakeValuesFrom = "weight",
  rowKeyColumns = "Chick")

00141_informalexample_5.72_of_section_5.5.2.R

# informalexample 5.72 of section 5.5.2 
# (informalexample 5.72 of section 5.5.2)  : Data engineering and data shaping : Reshaping transforms : Moving data from tall to wide form 

library("tidyr")

ChickWeight_wide1 <- spread(ChickWeight, 
                            key = Time, 
                            value = weight)

head(ChickWeight_wide1)
##     Chick  0  2  4  6  8  10  12  14  16  18  20  21
## 1 Chick01 42 51 59 64 76  93 106 125 149 171 199 205
## 2 Chick02 40 49 58 72 84 103 122 138 162 187 209 215
## 3 Chick03 43 39 55 67 84  99 115 138 163 187 198 202
## 4 Chick04 42 49 56 67 74  87 102 108 136 154 160 157
## 5 Chick05 41 42 48 60 79 106 141 164 197 199 220 223
## 6 Chick06 41 49 59 74 97 124 141 148 155 160 160 157
##     Chick  0  2  4  6  8  10  12  14  16  18  20  21
## 1 Chick01 42 51 59 64 76  93 106 125 149 171 199 205
## 2 Chick02 40 49 58 72 84 103 122 138 162 187 209 215
## 3 Chick03 43 39 55 67 84  99 115 138 163 187 198 202
## 4 Chick04 42 49 56 67 74  87 102 108 136 154 160 157
## 5 Chick05 41 42 48 60 79 106 141 164 197 199 220 223
## 6 Chick06 41 49 59 74 97 124 141 148 155 160 160 157