full_analyses.Rmd

---
title: "Full CP analyses"
author: "NK & AC"
date: "2/23/2021"
output: html_document
---

```{r setup, include=FALSE}
# Attach the packages and set the global options used by the later chunks.
# NOTE(review): the packages attached with plain library() below are not
# version-pinned by groundhog (see the TODO list further down).
library(tidyverse) # I wasn't able to run this package from groundhog in Windows
library(car) # I wasn't able to run this package from groundhog in Windows
 
# Plotting libraries (also attached with plain library())
library(ggpubr) 
library(ggiraphExtra)
library(ggridges)
library(ggpol)
library(scales)

# The packages listed in `pkgs` are requested through groundhog for the date
# stored in `groundhog.day`
library(groundhog)
groundhog.day="2021-02-23"
pkgs=c('tidyverse','ggplot2', 'gvlma','car','RCurl','plyr','readxl','knitr',
       'rmarkdown')
groundhog.library(pkgs, groundhog.day)


# Seed for random number generation
set.seed(22)

# Write the session info of the latest knit to a text file
capture.output(sessionInfo(),file="lastknit_session_info.txt")

# Chunk option default: echo = TRUE
# (opts_chunk is presumably knitr's, attached via groundhog.library above)
opts_chunk$set(echo = TRUE)
```

## TODO

- increase reproducibility by using only groundhog
- clean up the double child Id column name
- address the children added from OSF by their child IDs instead of by row numbers

## Read in data

```{r read-in, eval=F}

# 1. Compute the canonical proportion (CP) for every data set

# CP from the .eaf annotations
  # corpora: French Solomon Tsimane
raw_eaf_data <- read_excel("RawData/Raw_data_full.xlsx")

# Mark the rows whose Subtier mentions "vcm"; all other rows stay NA
raw_eaf_data$Subtier_type <- NA
raw_eaf_data$Subtier_type[grepl("vcm", raw_eaf_data$Subtier)] <- "vcm"

# Derive the child ID from the file name by stripping the recording suffixes.
# The patterns are applied one after the other, in this order.
clean_ids <- raw_eaf_data$Filename
for (suffix_pattern in c(
  "_._period.*", "_period.*", "random.*", "periodic.*", "_LD.*"
)) {
  clean_ids <- gsub(suffix_pattern, "", clean_ids)
}
raw_eaf_data$clean_ChildID <- clean_ids

# Child ID x annotation type, restricted to vcm rows on the CHI tier
is_chi_vcm <- raw_eaf_data$Subtier_type == "vcm" & raw_eaf_data$Tier == "CHI"
vcm_tab <- table(
  raw_eaf_data$clean_ChildID[is_chi_vcm],
  raw_eaf_data$Type[is_chi_vcm]
)

# Per child: C / (C + N)
n_type_c <- vcm_tab[, "C"]
vcm_tab_chi_eaf <- n_type_c / (n_type_c + vcm_tab[, "N"])

# CP from the Cychosz annotation file
  # corpora: Casillas-Yeli  Cychosz Seedlings    Tsimane    Warlaumont
raw_cychosz_data <- read_csv("./Data/meta_answers_global_crossling_9-20.csv")

# Combine language and child_ID into one identifier
# (presumably child_ID alone is not unique across languages -- TODO confirm)
raw_cychosz_data$unique_child_ID <- paste(
  raw_cychosz_data$language,
  raw_cychosz_data$child_ID
)

# Child x answer counts, then Canonical / (Canonical + Non-canonical) per child
vcm_tab <- table(raw_cychosz_data$unique_child_ID, raw_cychosz_data$Answer)
n_canonical <- vcm_tab[, "Canonical"]
n_noncanonical <- vcm_tab[, "Non-canonical"]
vcm_tab_chi_cychosz <- n_canonical / (n_canonical + n_noncanonical)

# CP for the zooniverse data is not generated here yet (TODO); the per-child
# spreadsheets are read instead
  # corpora: Yélî & Tsimane

other_data <- read_excel("Data/CR_by_child-updated_21_01.xlsx")
other_data_tsi <- read_excel("Data/CR_by_child_tsi.xlsx")

# 2. Combine the CP values with the background information.
# For each corpus we use the best and largest source available:
# - Yélî: zooniverse (other_data)              DONE
# - Tsimane: zooniverse (other_data_tsi)       DONE
# - French & Solomon: .eaf                     DONE
# - remaining corpora: cychosz                 TODO
#
# The CPs are merged into other_data because other_data holds the child's age.

# Start from the Yélî rows
is_yeli <- other_data$corpus %in% c("Yélî")
best_data <- other_data[is_yeli, ]

# Append the Tsimane children; for these two sources CP is taken from CR
best_data2 <- rbind.fill(best_data, other_data_tsi)
best_data2$CP <- best_data2$CR

# Next, add French & Solomon.
# First turn the named CP vector computed from the .eaf data into a data frame
# with one row per child. CP is stored as a number directly instead of going
# through cbind() (which coerces everything to character) and back.
vcm_tab_chi_eaf2 <- data.frame(
  CP = as.numeric(vcm_tab_chi_eaf),
  ChildID = names(vcm_tab_chi_eaf), # named ChildID for consistency
  stringsAsFactors = FALSE
)

# Background information for the French & Solomon children
other_data_fs <- subset(other_data, corpus %in% c("French", "Solomon"))
# Drop the spreadsheet's CR column so that the merge below brings in the CP
# calculated in R
other_data_fs$CR <- NULL
# Strip the recording suffixes from the IDs (same patterns, same order, as for
# the .eaf file names) so that both tables share the same key
for (suffix_pattern in c(
  "_._period.*", "_period.*", "random.*", "periodic.*", "_LD.*"
)) {
  other_data_fs$ChildID <- gsub(suffix_pattern, "", other_data_fs$ChildID)
}

# Remove the Tsimane children from the .eaf CPs.
# A logical mask is used instead of `-grep(...)`: when nothing matches,
# `-integer(0)` selects zero rows and would silently empty the table.
is_tsi <- grepl("tsi", vcm_tab_chi_eaf2$ChildID)
vcm_tab_chi_eaf2 <- vcm_tab_chi_eaf2[!is_tsi, ]

# Full outer join: children present in only one of the two tables are kept
other_data_fs <- merge(
  other_data_fs, vcm_tab_chi_eaf2,
  by = "ChildID", all = TRUE
)

# Fill in the background information for the children uploaded from OSF.
# The values are assigned here, right after the merge, so they do not get mixed
# up if the data set grows later.
# The rows are addressed by position in the merged table.
# TODO instead of using line numbers, use the child's IDs
osf_rows <- c(29, 31)
# Assigning to a row past the end would silently append a new, mostly empty row
stopifnot(nrow(other_data_fs) >= max(osf_rows))

other_data_fs[osf_rows, c("Language", "corpus")] <- "French"
# Counts and ages are assigned as numbers: assigning them as strings would turn
# numeric columns into character
other_data_fs[osf_rows, "C_count"] <- 21
other_data_fs[osf_rows, "V_count"] <- 17
other_data_fs[osf_rows, "Syllable complexity"] <- "High"
other_data_fs[29, "Age in months"] <- 31.4
other_data_fs[31, "Age in months"] <- 3

# Make sure the age column is numeric whatever type it had on import
other_data_fs$`Age in months` <- as.numeric(other_data_fs$`Age in months`)

# Append French & Solomon to the combined data set
best_data3 <- rbind.fill(best_data2, other_data_fs)

# Add cychosz's data: combine the CP values with the background information.
# The named CP vector becomes a data frame with one row per child; CP is kept
# numeric (as for the .eaf data) so the CP column of the combined table does
# not turn into text.
cychosz_cp <- data.frame(
  CP = as.numeric(vcm_tab_chi_cychosz),
  ChildID = names(vcm_tab_chi_cychosz), # named ChildID for consistency
  stringsAsFactors = FALSE
)

# Remove the Tsimane children (their IDs still carry the language prefix here).
# A logical mask is used instead of `-grep(...)`: when nothing matches,
# `-integer(0)` selects zero rows and would silently empty the table.
cychosz_cp <- cychosz_cp[!grepl("Tsi", cychosz_cp$ChildID), ]

# Strip ASCII letters, then spaces, "+" and "/" from the IDs
cychosz_cp$ChildID <- gsub("[A-Za-z]", "", cychosz_cp$ChildID)
cychosz_cp$ChildID <- gsub("[ +/]", "", cychosz_cp$ChildID)
vcm_tab_chi_cychosz <- cychosz_cp

# Background information for the corpora covered by cychosz's data, with the
# ASCII letters stripped from the IDs as well so the keys can match
other_data_fs <- subset(
  other_data,
  corpus %in% c("Warlaumont", "Seedlings", "Cychosz", "Tseltal")
)
other_data_fs$ChildID <- gsub("[A-Za-z]", "", other_data_fs$ChildID)

# Inner join: only children present in both tables are kept
other_data_fs <- merge(other_data_fs, vcm_tab_chi_cychosz, by = "ChildID")

best_data_final <- rbind.fill(best_data3, other_data_fs)


## CLEANING
# Syllable complexity as a factor with levels in Low / Moderate / High order,
# and language as a factor
best_data_final$syl_comp <- factor(
  best_data_final$`Syllable complexity`,
  levels = c("Low", "Moderate", "High")
)
best_data_final$lang <- as.factor(best_data_final$Language)

# Age: take `Age in months` where present, otherwise fall back to `Age`
age_merged <- coalesce(best_data_final$`Age in months`, best_data_final$Age)
best_data_final$age <- as.numeric(age_merged)

# Keep the children aged 40 months or less; rows without an age are dropped by
# filter() as well. (Original note: this leaves 85 observations.)
best_data_final <- filter(best_data_final, age <= 40)

#table(best_data_final$ChildID)[order(table(best_data_final$ChildID))]

# Export only the columns needed by the analyses
keep_cols <- c("age", "lang", "corpus", "syl_comp", "C_count", "V_count", "CP")
best_data_clean <- best_data_final[, keep_cols]
write.csv(best_data_clean, "Data/best_data_clean.csv", row.names = FALSE)

```

## Adding Phonetic properties

```{r add-phonetic-properties}

# Per-child data written by the (not evaluated) read-in chunk
best_data_clean <- read.csv("Data/best_data_clean.csv")

# DATASET WITH PHONETIC PROPERTIES ---------------------------------
# Language-level phonetic data. Only the project-relative path is used: the
# former extra read from an absolute path on one machine failed everywhere else
# and its result was overwritten by this read anyway.
phon_data <- read_excel("Data/LAAC_Internship2020_Languages_upd.xlsx")

# MERGING TWO DATASETS TOGETHER
# Keep only the columns needed from the Languages file
phon_data <- phon_data %>%
  dplyr::select(Language, Maddieson_C_inv, Maddieson_VQ_Inv, Maddieson_sylcomp)
phon_data$lang <- as.factor(phon_data$Language)
phon_data <- dplyr::distinct(phon_data) # TO GET RID OF DUPLICATES
phon_data$Language <- NULL

# Left join: every child is kept, languages without phonetic data get NA
mydata_sub <- merge(best_data_clean, phon_data, by = "lang", all.x = TRUE)
#mydata_sub<-subset(mydata_sub, !is.na(CP)) #NA in CP are excluded

# Recode the Maddieson classifications as factors with explicit level order.
# NOTE(review): in Mad_C "Large" is listed before "Moderately large" -- confirm
# this is the intended order.
mydata_sub$Mad_syl_comp <- factor(
  mydata_sub$Maddieson_sylcomp,
  levels = c("Low", "Moderate", "High")
)
mydata_sub$Mad_C <- factor(
  mydata_sub$Maddieson_C_inv,
  levels = c("Moderately Small", "Average", "Large", "Moderately large")
)
mydata_sub$Mad_VQ <- factor(
  mydata_sub$Maddieson_VQ_Inv,
  levels = c("Average", "Large")
)

# Drop the source columns that the factors above replace
mydata_sub$syl_comp <- NULL
mydata_sub$Maddieson_sylcomp <- NULL
mydata_sub$Maddieson_C_inv <- NULL
mydata_sub$Maddieson_VQ_Inv <- NULL
dim(mydata_sub)
table(mydata_sub$lang)

summary(mydata_sub)

# Children without a consonant-inventory class are excluded
# (original note: -> 80 children)
mydata_sub <- subset(mydata_sub, !is.na(Mad_C))
dim(mydata_sub)

# Fix column types. Going through as.character() keeps the values intact even
# if read.csv() imported a column as a factor (as.numeric() on a factor returns
# the level codes, not the values).
mydata_sub$C_count <- as.numeric(as.character(mydata_sub$C_count))
mydata_sub$V_count <- as.numeric(as.character(mydata_sub$V_count))
mydata_sub$corpus <- as.factor(mydata_sub$corpus)

write.csv(mydata_sub, "Data/best_data_clean_wLang.csv", row.names = FALSE)
```

## Graphs
```{r graphs}
#read.csv("Data/best_data_clean_wLang.csv")->mydata_sub

# (Re)declare the level order of the three classification factors
factor_levels <- list(
  Mad_syl_comp = c("Low", "Moderate", "High"),
  Mad_C = c("Moderately Small", "Average", "Large", "Moderately large"),
  Mad_VQ = c("Average", "Large")
)
for (col in names(factor_levels)) {
  mydata_sub[[col]] <- factor(mydata_sub[[col]], levels = factor_levels[[col]])
}

# Overview of the data going into the plots
summary(mydata_sub)
dim(mydata_sub)

# Histogram of the canonical proportion
hist(mydata_sub$CP, main = "CP", xlab = "CP")

# Plots

# Ridge-line density plots. In each plot the fill gradient follows the variable
# on the x axis, so the legend is named after that variable. The two CP plots
# used to carry an "Age" legend although the gradient shows CP, and the titles
# now say what is actually plotted.
library("ggridges")
ridge_colours <- c("#0D0887FF", "#CC4678FF", "#F0F921FF")

# Age distribution within each syllable-complexity class
ggplot(mydata_sub, aes(x = age, y = Mad_syl_comp)) +
  # after_stat(x) replaces the deprecated `..x..` notation
  geom_density_ridges_gradient(
    aes(fill = after_stat(x)),
    scale = 3, size = 0.3
  ) +
  scale_fill_gradientn(colours = ridge_colours, name = "Age") +
  labs(
    title = "Distribution of Age by Syllable Complexity",
    x = "Age (months)",
    y = "Syllable Complexity"
  )

# CP distribution within each consonant-inventory class
ggplot(mydata_sub, aes(x = CP, y = Mad_C)) +
  geom_density_ridges_gradient(
    aes(fill = after_stat(x)),
    scale = 3, size = 0.3
  ) +
  scale_fill_gradientn(colours = ridge_colours, name = "CP") +
  labs(
    title = "Distribution of CP by Consonants",
    x = "Canonical Proportion",
    y = "Consonants"
  )

# CP distribution within each vowel-inventory class
ggplot(mydata_sub, aes(x = CP, y = Mad_VQ)) +
  geom_density_ridges_gradient(
    aes(fill = after_stat(x)),
    scale = 3, size = 0.3
  ) +
  scale_fill_gradientn(colours = ridge_colours, name = "CP") +
  labs(
    title = "Distribution of CP by Vowels",
    x = "Canonical Proportion",
    y = "Vowels"
  )


# Scatter plots with CP on the x axis and age on the y axis, faceted by two
# classification factors, each panel with a loess smoother on top.
loess_layer <- function() {
  stat_smooth(method = "loess", span = 0.9)
}

# Panels: vowel inventory x syllable complexity
ggscatter(
  mydata_sub,
  x = "CP", y = "age",
  facet.by = c("Mad_VQ", "Mad_syl_comp"),
  short.panel.labs = FALSE
) +
  loess_layer()

# Panels: consonant inventory x syllable complexity
ggscatter(
  mydata_sub,
  x = "CP", y = "age",
  facet.by = c("Mad_C", "Mad_syl_comp"),
  short.panel.labs = FALSE
) +
  loess_layer()


# Look shared by the grouped scatter plots below: black-and-white theme without
# grid lines. Returned as a list so it can be added to a plot with `+`.
theme_bw_nogrid <- function() {
  list(
    theme_bw(),
    theme(
      panel.grid.major = element_blank(),
      panel.grid.minor = element_blank()
    )
  )
}

# CP against age, one colour and one regression line per language
ggplot(mydata_sub, aes(x = age, y = CP, color = lang)) +
  geom_point() +
  geom_smooth(method = lm, se = FALSE) +
  labs(
    x = "Age (months)",
    y = "CP",
    colour = "Languages",
    shape = "Languages"
  )

# CP against age, grouped by consonant-inventory class
ggplot(mydata_sub, aes(x = age, y = CP, color = Mad_C, shape = Mad_C)) +
  geom_point(size = 2.5) +
  # regression lines; loess alternative: geom_smooth(span = 0.8)
  geom_smooth(method = lm, se = FALSE) +
  theme_bw_nogrid() +
  # manual scales currently switched off:
  # scale_color_manual(values=c("#E69F00", "#56B4E9", "#009E73"))+
  # scale_shape_manual(values=c(4, 16, 0))+
  labs(
    title = "Distribution of CP wrt. Age (up to 40 months)",
    x = "Age (months)",
    y = "CP",
    colour = "Consonants",
    shape = "Consonants"
  )

# CP against age, grouped by vowel-inventory class
ggplot(mydata_sub, aes(x = age, y = CP, color = Mad_VQ, shape = Mad_VQ)) +
  geom_point(size = 2.5) +
  # regression lines; loess alternative: geom_smooth(span = 0.8)
  geom_smooth(method = lm, se = FALSE) +
  theme_bw_nogrid() +
  scale_color_manual(values = c("#E69F00", "#56B4E9", "#009E73")) +
  scale_shape_manual(values = c(4, 16, 0)) +
  labs(
    title = "Distribution of CP wrt. Age (up to 40 months)",
    x = "Age (months)",
    y = "CP",
    colour = "Vowels",
    shape = "Vowels"
  )

# 1- xy plot of CP as a function of age (x axis), with syllable complexity
#    shown through colour and point shape
syl_comp_plot <- ggplot(
  mydata_sub,
  aes(x = age, y = CP, color = Mad_syl_comp, shape = Mad_syl_comp)
)

syl_comp_plot +
  geom_point(size = 2.5) +
  # one regression line per class; loess alternative: geom_smooth(span = 0.8)
  geom_smooth(method = lm, se = FALSE) +
  scale_color_manual(values = c("#E69F00", "#56B4E9", "#009E73")) +
  scale_shape_manual(values = c(4, 16, 0)) +
  theme_bw() +
  theme(
    panel.grid.major = element_blank(),
    panel.grid.minor = element_blank()
  ) +
  labs(
    title = "Distribution of CP wrt. Age (up to 40 months)",
    x = "Age (months)",
    y = "CP",
    colour = "Syllable Complexity",
    shape = "Syllable Complexity"
  )

# 2- xy plot of CP as a function of phonological complexity

# Phon_count (consonant count + vowel count) is otherwise only created in the
# later "SRCD results" chunk, so without this line the plot fails with
# "object 'Phon_count' not found" when the document is knit top to bottom.
mydata_sub$Phon_count <- mydata_sub$C_count + mydata_sub$V_count

ggplot(mydata_sub, aes(x = Phon_count, y = CP, color = Mad_syl_comp)) +
  labs(
    title = "Distribution of CP wrt. Phonological complexity",
    x = "Phonological complexity",
    y = "CP",
    colour = "Syllable Complexity",
    shape = "Syllable Complexity"
  ) +
  theme_bw() +
  theme(
    panel.grid.major = element_blank(),
    panel.grid.minor = element_blank()
  ) +
  # manual scales currently switched off:
  # scale_color_manual(values=c("#E69F00", "#56B4E9", "#009E73"))+
  # scale_shape_manual(values=c(4, 16, 0))+
  geom_point(size = 2.5) +
  # one regression line per class; loess alternative: geom_smooth(span = 0.8)
  geom_smooth(method = lm, se = FALSE)


```


## Mean CP and Standard Deviation (box-plots included)

```{r mean-sd}

"Average CP"; mean(mydata_sub$CP) ; "Standard Deviation" ; sd(mydata_sub$CP)

# Consonants: mean and standard deviation of CP per consonant-inventory class.
# na.rm = TRUE so that a child without a CP value does not turn the whole
# summary into NA.
cp_cons_small <- mydata_sub$CP[mydata_sub$Mad_C == "Moderately Small"]
cp_cons_avg <- mydata_sub$CP[mydata_sub$Mad_C == "Average"]

"Average CP for Consonants: Moderately Small"
mean(cp_cons_small, na.rm = TRUE)
"Standard Deviation"
sd(cp_cons_small, na.rm = TRUE)

"Average CP for Consonants: Average"
mean(cp_cons_avg, na.rm = TRUE)
"Standard Deviation"
sd(cp_cons_avg, na.rm = TRUE)

boxplot(
  CP ~ Mad_C,
  data = mydata_sub,
  main = "Distribution of CP by Consonants", xlab = "Consonants", ylab = "CP"
)

library(ggpol)
library(scales)
# Mad_C is used directly on the x axis. Re-wrapping it in
# factor(Mad_C, labels = <all four level names>) adds nothing when every level
# is present and stops with "invalid 'labels'" as soon as one level has no
# children, because factor() drops unused levels before applying the labels.
ggplot(mydata_sub, aes(x = Mad_C, y = CP, fill = Mad_C)) +
  geom_boxjitter(
    color = "black", jitter.color = "darkgrey", errorbar.draw = TRUE
  ) +
  scale_y_continuous() +
  labs(
    title = "Distribution of CP by Consonants",
    x = "",
    y = ""
  ) +
  theme_minimal() +
  theme(legend.position = "none")


# Vowels: mean and standard deviation of CP per vowel-inventory class.
# na.rm = TRUE so that a child without a CP value does not turn the whole
# summary into NA.
cp_vowel_avg <- mydata_sub$CP[mydata_sub$Mad_VQ == "Average"]
cp_vowel_large <- mydata_sub$CP[mydata_sub$Mad_VQ == "Large"]

"Average CP for Vowels: Average"
mean(cp_vowel_avg, na.rm = TRUE)
"Standard Deviation"
sd(cp_vowel_avg, na.rm = TRUE)

# This label used to read "Consonants: Large" although the values are for the
# Large vowel class
"Average CP for Vowels: Large"
mean(cp_vowel_large, na.rm = TRUE)
"Standard Deviation"
sd(cp_vowel_large, na.rm = TRUE)

boxplot(
  CP ~ Mad_VQ,
  data = mydata_sub,
  main = "Distribution of CP by Vowels", xlab = "Vowels", ylab = "CP"
)

# Mad_VQ is used directly on the x axis: re-wrapping it in
# factor(Mad_VQ, labels = ...) fails as soon as one of the levels is unused
ggplot(mydata_sub, aes(x = Mad_VQ, y = CP, fill = Mad_VQ)) +
  geom_boxjitter(
    color = "black", jitter.color = "darkgrey", errorbar.draw = TRUE
  ) +
  scale_y_continuous() +
  labs(
    title = "Distribution of CP by Vowels",
    x = "",
    y = ""
  ) +
  theme_minimal() +
  theme(legend.position = "none")


```

## Analysis

```{r analysis}

# Small vs. Average vs. Mod. Large Consonants ---------------------------------

# One data subset per consonant-inventory class; rows where the comparison is
# NA are left out
rows_with_mad_c <- function(level) {
  mydata_sub[which(mydata_sub$Mad_C == level), ]
}
mydata_sub_S <- rows_with_mad_c("Moderately Small")
mydata_sub_A <- rows_with_mad_c("Average")
mydata_sub_L <- rows_with_mad_c("Large")

#summary(mydata_sub)

# Pearson correlation between age and CP within each class
# (the trailing numbers are the estimates noted by the original authors)
"Moderately Small Mad_C"
cor.test(mydata_sub_S$age, mydata_sub_S$CP) # 0.87546
"Average Mad_C"
cor.test(mydata_sub_A$age, mydata_sub_A$CP) # 0.6519982
"Large Mad_C"
cor.test(mydata_sub_L$age, mydata_sub_L$CP) # 0.6306421

# Q-Q plots of age and of CP for the Average class
ggqqplot(mydata_sub_A$age, ylab = "Age") +
  labs(title = "Pearson's correlations for Average Consonants by Age")
ggqqplot(mydata_sub_A$CP, ylab = "CP") +
  labs(title = "Pearson's correlations for Average Consonants by CP")

# Check assumptions: diagnostic plots of the CP ~ age regression fitted
# separately within each consonant-inventory class.
# The models are written as `CP ~ age` with `data =` instead of `df$CP ~ df$age`
# so that the coefficient is simply called `age` and the models work with
# predict() on new data.

# Moderately Small Mad_C
lm_Small <- lm(CP ~ age, data = mydata_sub_S)
par(mfrow = c(2, 2)) # the four diagnostic plots share one 2 x 2 page
plot(lm_Small, main = "Moderately Small Mad_C")

# Average Mad_C
lm_Avg <- lm(CP ~ age, data = mydata_sub_A)
par(mfrow = c(2, 2))
plot(lm_Avg, main = "Average Mad_C")

# Large Mad_C
# The title used to say "Moderately large", but mydata_sub_L holds the rows
# with Mad_C == "Large".
lm_large <- lm(CP ~ age, data = mydata_sub_L)
par(mfrow = c(2, 2))
plot(lm_large, main = "Large Mad_C")

# Pairwise comparisons between consonant-inventory classes.
# Each comparison names the two classes it contains. Excluding a single class
# with `!=` would let any child from the fourth Mad_C level ("Moderately
# large") slip into a comparison that is meant to have two classes only.
# For each pair: diagnostic plots of CP ~ age on the pooled pair, then an
# ANOVA table for the model with the age x class interaction.

# Small vs. Average
SvA <- subset(mydata_sub, Mad_C %in% c("Moderately Small", "Average"))
lm_SvA <- lm(CP ~ age, data = SvA)
par(mfrow = c(2, 2))
plot(lm_SvA, main = "Small vs. Average")
"Small vs. Average"
anova(lm(CP ~ age * Mad_C, data = SvA))

# Average vs. Large
AvL <- subset(mydata_sub, Mad_C %in% c("Average", "Large"))
lm_AvL <- lm(CP ~ age, data = AvL)
par(mfrow = c(2, 2))
plot(lm_AvL, main = "Average vs. Large")
"Average vs. Large"
anova(lm(CP ~ age * Mad_C, data = AvL))

# Small vs. Large
SvL <- subset(mydata_sub, Mad_C %in% c("Moderately Small", "Large"))
lm_SvL <- lm(CP ~ age, data = SvL)
par(mfrow = c(2, 2))
plot(lm_SvL, main = "Small vs. Large") # looks quite bad
"Small vs. Large"
anova(lm(CP ~ age * Mad_C, data = SvL))


# Average vs. Large Vowels -----------------------------------------

# One data subset per vowel-inventory class
mydata_sub_Av <- subset(mydata_sub, Mad_VQ == "Average")
mydata_sub_Lr <- subset(mydata_sub, Mad_VQ == "Large")

#summary(mydata_sub)

# Pearson correlation between age and CP within each class
# (the trailing numbers are the estimates noted by the original authors;
# a stray trailing comma was removed from the first call)
"Average Mad_VQ"
cor.test(mydata_sub_Av$age, mydata_sub_Av$CP) # 0.5963248
"Large Mad_VQ"
cor.test(mydata_sub_Lr$age, mydata_sub_Lr$CP) # 0.7459597

# Check assumptions: diagnostic plots of CP ~ age within each class.
# Models use `data =` rather than `df$col` terms so that the coefficient is
# called `age` and the models work with predict().

# Average Mad_VQ
lm_Average <- lm(CP ~ age, data = mydata_sub_Av)
par(mfrow = c(2, 2))
plot(lm_Average, main = "Average Mad_VQ")

# Large Mad_VQ
lm_lrg <- lm(CP ~ age, data = mydata_sub_Lr)
par(mfrow = c(2, 2))
plot(lm_lrg, main = "Large Mad_VQ") # looks quite bad

# Average vs. Large: uses every row of mydata_sub (the former subset() call had
# no condition and therefore kept everything)
AvvLr <- mydata_sub
lm_AvvLr <- lm(CP ~ age, data = AvvLr)
par(mfrow = c(2, 2))
plot(lm_AvvLr, main = "Average vs. Large")
"Average vs. Large"
anova(lm(CP ~ age * Mad_VQ, data = AvvLr))


# Some Models -----------------------------------------

# Additive model: CP explained by age and the two inventory classifications
lm_data <- lm(CP ~ age + Mad_C + Mad_VQ, data = mydata_sub)
summary(lm_data)
# Same predictors with age as the outcome. `data = mydata_sub` was missing, so
# the call failed with "object 'age' not found" because the columns are not
# variables in the workspace.
summary(lm(age ~ CP + Mad_VQ + Mad_C, data = mydata_sub))

ggPredict(lm_data, se = TRUE, interactive = TRUE)

# CP by age, slope allowed to differ between consonant-inventory classes
lm_data_C <- lm(formula = CP ~ age * Mad_C, data = mydata_sub)
summary(lm_data_C)
ggPredict(lm_data_C, se = TRUE, interactive = TRUE)
par(mfrow = c(2, 2)) # the four diagnostic plots share one 2 x 2 page
plot(lm_data_C)
gvlma(lm_data_C) # Heteroscedasticity not satisfied

# CP by age, slope allowed to differ between vowel-inventory classes
lm_data_V <- lm(formula = CP ~ age * Mad_VQ, data = mydata_sub)
summary(lm_data_V)
ggPredict(lm_data_V, se = TRUE, interactive = TRUE)
par(mfrow = c(2, 2))
plot(lm_data_V)
gvlma(lm_data_V) # passes checks

# CP by age, slope allowed to differ between syllable-complexity classes
lm_data_SC <- lm(formula = CP ~ age * Mad_syl_comp, data = mydata_sub)
summary(lm_data_SC)
ggPredict(lm_data_SC, se = TRUE, interactive = TRUE)
par(mfrow = c(2, 2))
plot(lm_data_SC)
gvlma(lm_data_SC) # passes checks

# Type III tests for the syllable-complexity model
Anova(lm_data_SC, type = "III")

```

## SRCD results

```{r}

# Correlations between age and the consonant / vowel counts
cor(mydata_sub[, c("age", "C_count", "V_count")])

# Phoneme count = consonant count + vowel count
mydata_sub$Phon_count <- mydata_sub$C_count + mydata_sub$V_count

cor(mydata_sub[, c("age", "Phon_count")])

# Distribution of the phoneme count on a log10 scale, which is also stored as
# a column for the model below
#hist(mydata_sub$Phon_count)
hist(log(mydata_sub$Phon_count,10))
mydata_sub$Phon_count_l10 <- log(mydata_sub$Phon_count, base = 10)

# Log phoneme count per syllable-complexity class
ggplot(
  mydata_sub,
  aes(x = factor(Mad_syl_comp), y = Phon_count_l10, fill = Mad_syl_comp)
) +
  geom_boxjitter(
    color = "black",
    jitter.color = "darkgrey",
    errorbar.draw = TRUE
  ) +
  scale_y_continuous() +
  labs(
    title = "Distribution of Phon_count_l10 by Mad_syl_comp",
    x = "",
    y = ""
  ) +
  theme_minimal() +
  theme(legend.position = "none")

# Mixed model: CP by age, log phoneme count and syllable complexity, with a
# random intercept per language
library(lme4)
lm_data <- lmer(
  CP ~ age + Phon_count_l10 + Mad_syl_comp + (1 | lang),
  data = mydata_sub
)
plot(lm_data)
summary(lm_data)
```

Homework:

- why are the estimates for `Mad_syl_compModerate` and `Mad_syl_compHigh` negative?
- why are they significant?
- what does this mean for our predictions?

```{r}
# Mixed model of CP on age and syllable complexity, with a random intercept
# per language
lm_data <- lmer(
  CP ~ age + Mad_syl_comp + (1 | lang),
  data = mydata_sub
)
plot(lm_data)
summary(lm_data)

```

Homework:

- what is the difference between this model and the previous one?
- are `age`, `Mad_syl_compModerate` and `Mad_syl_compHigh` significant?
- why?
- what do their estimates mean?
- what does this mean for our predictions?