# Jack Ross Survival Exercise - Logistic Regression (Titanic Data)
# Load the Titanic training data and keep only the modelling columns.
# NOTE(review): the working directory is hard-coded to a machine-specific
# path; read.csv() below resolves 'train.csv' relative to it.
setwd("C:\\YYYYYY\\AMMA 2017\\Data")
getwd()
df.titanic <- read.csv('train.csv')
View(df.titanic)
str(df.titanic)
head(df.titanic)
# Select the predictors and the response by name. The names must be passed
# as ONE character vector: df["a", "b", "c", ...] with separate arguments is
# an error in `[.data.frame` and would abort the script when sourced.
titanic_final <- df.titanic[
  c("Pclass", "Sex", "Age", "SibSp", "Parch", "Survived")
]
View(titanic_final)
str(titanic_final)
summary(titanic_final)
nrow(titanic_final)
# Replace missing ages with the mean of the observed ages.
titanic_final$Age[is.na(titanic_final$Age)] <-
  mean(titanic_final$Age, na.rm = TRUE)
# Random train/test partition.
# set.seed() fixes the random number generator state, so runif() draws the
# same values on every run and the split is reproducible.
set.seed(1234)
# One uniform draw in [0, 1] per row decides which partition the row joins.
titanic_final$rand <- runif(nrow(titanic_final))
# Rows with a draw at or below 0.7 go to training, the rest to test.
titanic_train <- subset(titanic_final, rand <= 0.7)
titanic_test <- subset(titanic_final, rand > 0.7)
# Report the partition sizes.
nrow(titanic_train)
nrow(titanic_test)
# Histogram of passenger age (after the missing values were filled in).
hist(titanic_final$Age)
# Frequency tables: first for Age, then for the response (Survived) in the
# full data and in each partition, to compare the survival rate across them.
# NOTE(review): CrossTable() is not base R - presumably gmodels::CrossTable;
# confirm that the package is loaded before this point.
CrossTable(titanic_final$Age)
CrossTable(titanic_final$Survived)
CrossTable(titanic_test$Survived)
CrossTable(titanic_train$Survived)
# Exploratory model fits.
# NOTE(review): vif() is not base R - presumably car::vif; confirm that the
# package is loaded before this point.

# LM: ordinary least squares on the 0/1 response, kept as a baseline.
titanic_linear <- lm(
  formula = Survived ~ Pclass + Sex + Age + SibSp + Parch,
  data = titanic_final
)
vif(titanic_linear)
summary(titanic_linear)

# GLM on the full data.
# family = binomial is what makes this a logistic regression; without it
# glm() defaults to the gaussian family and simply reproduces the lm() fit.
titanic_generalizedlinear <- glm(
  formula = Survived ~ Pclass + Sex + Age + SibSp + Parch,
  family = binomial,
  data = titanic_final
)
vif(titanic_generalizedlinear)
summary(titanic_generalizedlinear)

# GLM on Train (overwrites the full-data fit above).
titanic_generalizedlinear <- glm(
  formula = Survived ~ Pclass + Sex + Age + SibSp + Parch,
  family = binomial,
  data = titanic_train
)
vif(titanic_generalizedlinear)
summary(titanic_generalizedlinear)

# After removing insignificant variables, i.e. Parch (full data).
titanic_generalizedlinear_1 <- glm(
  formula = Survived ~ Pclass + Sex + Age + SibSp,
  family = binomial,
  data = titanic_final
)
vif(titanic_generalizedlinear_1)
summary(titanic_generalizedlinear_1)

# After removing insignificant variables, i.e. Parch, on Train data
# (overwrites the full-data fit above).
titanic_generalizedlinear_1 <- glm(
  formula = Survived ~ Pclass + Sex + Age + SibSp,
  family = binomial,
  data = titanic_train
)
vif(titanic_generalizedlinear_1)
summary(titanic_generalizedlinear_1)
## Final model: SibSp removed as well, fitted on the Train data.
## The original note marked this step as optional practice, but every
## prediction below is made with this model.
## family = binomial makes it a logistic regression; glm() defaults to the
## gaussian family, whose "response" predictions are not probabilities.
titanic_generalizedlinear_1 <- glm(
  formula = Survived ~ Pclass + Sex + Age,
  family = binomial,
  data = titanic_train
)
vif(titanic_generalizedlinear_1)
summary(titanic_generalizedlinear_1)

# Accuracy, precision, recall and F1 of a thresholded binary prediction.
#   actual:    observed outcome; entries equal to 1 count as positive.
#   prob:      predicted score per observation, same length as `actual`.
#   threshold: scores at or above this value are predicted positive.
# Returns a list with elements accuracy, precision, recall and f1, all as
# fractions. F1 is the harmonic mean of precision and recall.
classification_metrics <- function(actual, prob, threshold = 0.5) {
  predicted_pos <- prob >= threshold
  actual_pos <- actual == 1
  tp <- sum(predicted_pos & actual_pos)
  fp <- sum(predicted_pos & !actual_pos)
  fn <- sum(!predicted_pos & actual_pos)
  tn <- sum(!predicted_pos & !actual_pos)
  precision <- tp / (tp + fp)
  recall <- tp / (tp + fn)
  list(
    accuracy = (tp + tn) / (tp + tn + fp + fn),
    precision = precision,
    recall = recall,
    f1 = 2 * precision * recall / (precision + recall)
  )
}

# Training probabilities and ROC.
# NOTE(review): roc() and auc() are not base R - presumably from pROC;
# confirm that the package is loaded before this point.
titanic_train$prob <- predict(titanic_generalizedlinear_1, type = "response")
q <- roc(Survived ~ prob, data = titanic_train)
plot(q)
auc(q)
# Confusion matrix for Train data (rows: prediction, columns: actual).
titanic_train$Survivedpred <- ifelse(
  titanic_train$prob >= 0.5, "pred_yes", "pred_no"
)
table(titanic_train$Survivedpred, titanic_train$Survived)
# Accuracy, Precision, Recall and F1 for Train, computed from the data
# instead of being typed in from one particular run's confusion matrix.
train_metrics <- classification_metrics(
  titanic_train$Survived, titanic_train$prob
)
accuracy_train <- train_metrics$accuracy
precision_train <- train_metrics$precision
recall_train <- train_metrics$recall
F1_train <- train_metrics$f1
summary(titanic_generalizedlinear_1)

# Test probabilities and ROC: score the held-out rows with the model
# fitted on Train, and build the ROC curve from the Test data itself.
titanic_test$prob <- predict(
  titanic_generalizedlinear_1,
  newdata = titanic_test,
  type = "response"
)
q <- roc(Survived ~ prob, data = titanic_test)
plot(q)
auc(q)
# Confusion matrix for Test data (rows: prediction, columns: actual).
titanic_test$Survivedpred <- ifelse(
  titanic_test$prob >= 0.5, "pred_yes", "pred_no"
)
table(titanic_test$Survivedpred, titanic_test$Survived)
# Accuracy, Precision, Recall and F1 for Test.
test_metrics <- classification_metrics(
  titanic_test$Survived, titanic_test$prob
)
accuracy_test <- test_metrics$accuracy
precision_test <- test_metrics$precision
recall_test <- test_metrics$recall
F1_test <- test_metrics$f1
View(titanic_test)
# Score the passengers in JackRose.csv with the model fitted above.
# The file is read relative to the current working directory.
df.titanic_final_jackrose <- read.csv('JackRose.csv')
# Predicted score for each row, then the same 0.5 cut-off used for the
# Train and Test predictions.
df.titanic_final_jackrose$pred <- predict(
  titanic_generalizedlinear_1,
  newdata = df.titanic_final_jackrose,
  type = "response"
)
df.titanic_final_jackrose$Survivedpred <- ifelse(
  df.titanic_final_jackrose$pred >= 0.5, "pred_yes", "pred_no"
)
# Tabulate the predictions for these passengers (previously this line
# re-tabulated the Test set by mistake).
# NOTE(review): if JackRose.csv carries a Survived column, cross-tabulate
# against it to get a real confusion matrix - not visible from this script.
table(df.titanic_final_jackrose$Survivedpred)
View(df.titanic_final_jackrose)