/
Assignment02.R
123 lines (94 loc) · 5.38 KB
/
Assignment02.R
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
#Author: Shane Seheult
## ---- libraries --------------------------------------------------------------
library(tidyverse)
library(ggplot2)
library(ggpubr)
library(cowplot)
library(gganimate)
## ---- Import Data ------------------------------------------------------------
# Location of the raw data file, relative to the working directory.
teat_project_csv <- "Teat_Preferance_QMEE_Project.csv"
dd <- read_csv(file = teat_project_csv)
## BMB:: why do we have two identical sex columns? Should clean this ...
# Check that the two sex columns hold the same values in every row.
with(dd, all(Sex...2 == Sex...6)) ## TRUE
# Show the imported tibble so the column names and types can be inspected.
print(dd)
## ---- Do Birthdays differ, on average, between Groups? -----------------------
# One data frame per group, selected on the `Group` column.
Wild_Caught.df <- dd %>% filter(Group == "Wild-Caught")
Captive.df <- dd %>% filter(Group == "Captive")
## BMB: in general you should avoid splitting this way if you can.
# Two-sample t-test on the Julian dates of the two split data frames
# (wild-caught as x, captive as y).
t.test(
  x = Wild_Caught.df$'Julian Date',
  y = Captive.df$'Julian Date'
)
## BMB: this is better (also good to rename vars to eliminate spaces!)
# The same comparison through the formula interface, with no manual splitting.
t.test(`Julian Date` ~ Group, data = dd)
# The results suggest that there is a significant difference
# (t(26.9) = 3.37, p = 0.02, 95% CI [0.70, 2.90]) between the birthdays of pups
# born to wild-caught females and pups born to females in captivity.
# NOTE(review): t = 3.37 on ~27 df corresponds to a two-sided p of about 0.002,
# not 0.02 -- confirm the reported p-value against the printed output.
# The mean birthday for pups born to wild-caught females is 168.74 (Julian day)
# and the mean birthday for pups born to captive females is 166.94 (Julian day).
## ---- Plot the Data & look for anomalies ------------------------------------
## BMB: don't repeat all this theme() stuff. Create your own theme object and add it to the plot.
# Box plot of birth date (Julian day) by group.
# - Columns are named bare inside aes() so they are looked up in `data`
#   rather than pulled from the global `dd` with `$`.
# - `na.rm` is a layer argument, not an aesthetic, so it is passed to
#   geom_boxplot(), where it drops missing values without a warning.
# - All theme settings are collected into a single theme() call.
fig01 <- ggplot(data = dd, aes(x = Group, y = `Julian Date`)) +
  ## see https://clauswilke.com/dataviz/avoid-line-drawings.html
  geom_boxplot(fill = "gray", na.rm = TRUE) +
  labs(x = "Group", y = "Julian Date") +
  theme(
    panel.grid.major = element_blank(),
    panel.grid.minor = element_blank(),
    panel.background = element_blank(),
    panel.border = element_blank(),
    # ggplot2 >= 3.4.0 prefers `linewidth` for lines; `size` is kept so the
    # script also runs on older ggplot2 versions.
    axis.line = element_line(color = "black", size = 0.4),
    axis.text.x = element_text(size = 12),
    axis.text.y = element_text(size = 12),
    axis.title.x = element_text(size = 12),
    axis.title.y = element_text(size = 12)
  )
fig01
## The box plots show no outliers (i.e. there are no data points beyond the
## whiskers of the box plots). However, the Julian dates for the captive group
## tend to be skewed toward lower values (as evidenced by the position of the
## bold median line within the box).
## ---- Looking for Normality in Data ------------------------------------------
# Normal Q-Q plot of the raw Julian dates, pooled over both groups.
qqnorm(dd$`Julian Date`, main = "Julian Date")
qqline(dd$`Julian Date`)
## BMB: don't do this (for reasons we will explain). It's irrelevant
## whether the *marginal* distribution of your data is Normal or not
## And in the residuals
# Fit Julian date as a function of group. The formula uses bare column names
# with `data = dd` so coefficient names are not prefixed with `dd$` and the
# fitted model can be used with `newdata`.
lin.mod <- lm(`Julian Date` ~ Group, data = dd)
# Normal Q-Q plot of the model residuals.
qqnorm(residuals(lin.mod))
qqline(residuals(lin.mod))
## BMB: this is more relevant
## Histogram of Data
# Histogram of Julian date over all pups, using 20 bins.
# - The column is named bare inside aes() instead of via `dd$`.
# - geom_histogram() plots counts per bin, so the y axis is labelled "Count"
#   (it was previously labelled "Density").
# - The legend settings were dropped: no aesthetic is mapped to a legend.
# NOTE(review): the fixed y limits of 0-20 will drop any bar taller than 20;
# widen them if the bin count is reduced.
fig02 <- ggplot(data = dd, aes(x = `Julian Date`)) +
  geom_histogram(bins = 20) +
  labs(x = "Day", y = "Count") +
  scale_y_continuous(limits = c(0, 20), expand = c(0, 0)) +
  theme(
    panel.grid.major = element_blank(),
    panel.grid.minor = element_blank(),
    panel.background = element_blank(),
    panel.border = element_blank(),
    axis.line = element_line(color = "black", size = 0.4)
  )
fig02
## BMB: histograms aren't great for discrete data (unless you make the
## bins wider)
# The data appear to be normally distributed, as seen in the qqnorm plot (i.e. the
# data points do not drastically deviate from the qqline) and in the histogram
# (fig02), although the data do seem to be slightly right skewed.
## BMB: don't do it!
# Shapiro-Wilk normality test on the raw Julian dates of the full data set
# (i.e. the marginal distribution, not the residuals of the group model).
shapiro.test(dd$`Julian Date`)
# The results of the Shapiro-Wilk normality test suggest that the data are not
# normally distributed (W = 0.950, p = 0.015).
## ---- Trying Log Transformed data --------------------------------------------
# Add the natural log of the Julian date as a new column. The column is named
# bare inside mutate() so it is taken from the piped data rather than from the
# global `dd`.
dd.02 <- dd %>% mutate(log_JulianDate = log(`Julian Date`))
# Normal Q-Q plot of the log-transformed dates.
qqnorm(dd.02$log_JulianDate, main = "log Julian Date")
qqline(dd.02$log_JulianDate)
# Histogram of the log-transformed dates.
# - `bins = 30` spells out the geom_histogram() default that was previously
#   used implicitly.
# - The axes are labelled for what is plotted: log values on x, counts on y.
# - The legend settings were dropped: no aesthetic is mapped to a legend.
fig03 <- ggplot(data = dd.02, aes(x = log_JulianDate)) +
  geom_histogram(bins = 30) +
  labs(x = "log(Julian Date)", y = "Count") +
  scale_y_continuous(limits = c(0, 20), expand = c(0, 0)) +
  theme(
    panel.grid.major = element_blank(),
    panel.grid.minor = element_blank(),
    panel.background = element_blank(),
    panel.border = element_blank(),
    axis.line = element_line(color = "black", size = 0.4)
  )
fig03
# Shapiro-Wilk normality test on the log-transformed dates.
shapiro.test(dd.02$`log_JulianDate`)
# The log transformed data is not normal either, likely because I am using Julian
# Date as the dependent variable.
## ---- Do the proportions of Same Sex and Different Sex pup pairs differ? -----
# TRUE where a pup's sex equals its sibling's sex, FALSE where they differ,
# NA where either value is missing.
Matches <- dd$Sex...2 == dd$`Sibling Sex`
summary(Matches)
# From here, we can see that the total number of observations is 60 (37 + 19 + 4),
# which is consistent with using summary(dd) to see that Bat.ID has a length of 60.
# We can then calculate the proportion of Same-Sibling Matches (i.e. TRUE) and the
# proportion of Different-Sibling Matches (i.e. FALSE).
## BMB: you should avoid hard-coding numbers
# Counts are derived from `Matches` rather than typed in by hand.
n_total <- length(Matches)             # all rows, including those with NA
n_same <- sum(Matches, na.rm = TRUE)   # same-sex pairs (TRUE)
n_diff <- sum(!Matches, na.rm = TRUE)  # different-sex pairs (FALSE)
prop_SameSib <- n_same / n_total
prop_DiffSib <- n_diff / n_total
with(dd, table(`Sibling Sex` == Sex...2))
# We can also test if the two proportions differ from one another:
prop.test(c(n_same, n_diff), c(n_total, n_total))
# The results suggest that there is a higher proportion of Different Sex Siblings
# compared to Same Sex Siblings
# (X^2 = 9.68, df = 1, p = 0.002, 95% CI [-0.487, -0.113]).
# NOTE(review): the two proportions come from the same set of pairs and are
# complementary, so prop.test() above treats one sample as two independent
# samples, and its denominators include the rows with missing values. The call
# is kept so the reported numbers stay reproducible. A one-sample test of
# whether the same-sex proportion among non-missing pairs differs from 0.5 is
# the more appropriate comparison:
binom.test(n_same, n_same + n_diff, p = 0.5)