# titanic LogReg.py — 189 lines (155 loc) · 6.47 KB
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
"""
Created on Mon Jan 08 17:15:39 2018
@author: swaraj
"""
'''Use logistic regression to predict whether a passenger survived the
Titanic disaster, using the Titanic dataset from Kaggle:
https://www.kaggle.com/c/titanic/
'''
'''Importing the libraries'''
import numpy as np
import matplotlib.pyplot as plt
import pandas as pd
'''Importing the dataset'''
# Prefer the local copy (it is the one the script ends up using) and fall back
# to the Kaggle kernel layout, so the script runs in either environment
# instead of requiring both sets of files to exist.
try:
    train_data = pd.read_csv('C:/Users/swara/Desktop/DS/kaggle/titanic/train.csv')
    test_data = pd.read_csv('C:/Users/swara/Desktop/DS/kaggle/titanic/test.csv')
except FileNotFoundError:
    train_data = pd.read_csv("../input/train.csv")
    test_data = pd.read_csv("../input/test.csv")
'''Take a look at data'''
# info() prints its summary itself and returns None; wrapping it in print()
# only adds a stray "None" line to the output.
train_data.info()
print(train_data.head())
'''
PassengerId 891 non-null int64
Survived 891 non-null int64
Pclass 891 non-null int64
Name 891 non-null object
Sex 891 non-null object
Age 714 non-null float64
SibSp 891 non-null int64
Parch 891 non-null int64
Ticket 891 non-null object
Fare 891 non-null float64
Cabin 204 non-null object
Embarked 889 non-null object
dtypes: float64(2), int64(5), object(5)
PassengerId Survived Pclass \
0 1 0 3
1 2 1 1
2 3 1 3
3 4 1 1
4 5 0 3
Name Sex Age SibSp \
0 Braund, Mr. Owen Harris male 22.0 1
1 Cumings, Mrs. John Bradley (Florence Briggs Th... female 38.0 1
2 Heikkinen, Miss. Laina female 26.0 0
3 Futrelle, Mrs. Jacques Heath (Lily May Peel) female 35.0 1
4 Allen, Mr. William Henry male 35.0 0
Parch Ticket Fare Cabin Embarked
0 0 A/5 21171 7.2500 NaN S
1 0 PC 17599 71.2833 C85 C
2 0 STON/O2. 3101282 7.9250 NaN S
3 0 113803 53.1000 C123 S
4 0 373450 8.0500 NaN S
After looking at the data we can see that the columns PassengerId
and Name wouldn't be of much use for our model.
We also see that much of the Cabin data is missing.
We will try to find out more about that.
'''
# Number of missing values per column.
train_data.isnull().sum()
'''
PassengerId 0
Survived 0
Pclass 0
Name 0
Sex 0
Age 177
SibSp 0
Parch 0
Ticket 0
Fare 0
Cabin 687
Embarked 2
'''
# Non-null PassengerId entries; that column has no gaps, so this is the row count.
train_data['PassengerId'].count()
''' 891
We can see that Cabin has many missing values and imputing this feature
won't be good for our model. So we'll drop this column.
On the other hand we can impute Age and Embarked columns.
Next we'll see how to impute these columns
'''
train_data["Embarked"].value_counts()
'''
S 644
C 168
Q 77
So we can just impute the null values by S as it has the largest occurrence
'''
# Assign the filled column back instead of calling fillna(inplace=True) on the
# selected column: that chained in-place form operates on a temporary object
# under pandas Copy-on-Write and would leave train_data unchanged.
train_data["Embarked"] = train_data["Embarked"].fillna("S")
'''For Age, we find out the mean of traveller ages in corresponding Class
and use that to fill in the Null values'''
# numeric_only=True restricts the aggregation to the numeric columns; without
# it, pandas >= 2.0 raises a TypeError on the text columns (Name, Sex, ...).
train_data.groupby("Pclass").mean(numeric_only=True)
'''
PassengerId Survived Age SibSp Parch Fare
Pclass
1 461.597222 0.629630 38.233441 0.416667 0.356481 84.154687
2 445.956522 0.472826 29.877630 0.402174 0.380435 20.662183
3 439.154786 0.242363 25.140620 0.615071 0.393075 13.675550
'''
def fill_age(data):
    """Return the passenger's age, imputing a class-based default when missing.

    Parameters
    ----------
    data : pandas.Series or sequence
        Row holding the age at position 0 and the passenger class at
        position 1 (for example a row of ``df[['Age', 'Pclass']]``).

    Returns
    -------
    The original age when it is present; otherwise 38 for class 1, 29 for
    class 2 and 25 for any other class (the per-class mean ages of the
    training data, rounded down).
    """
    # Use .iloc for pandas rows: integer keys on a label-indexed Series
    # (data[0]) rely on a deprecated positional fallback. Plain sequences
    # are still indexed directly.
    values = data.iloc if hasattr(data, "iloc") else data
    age, pclass = values[0], values[1]
    if not pd.isnull(age):
        return age
    if pclass == 1:
        return 38
    if pclass == 2:
        return 29
    return 25
# Replace missing ages row by row, in the training and the test frame alike.
train_data['Age'] = train_data.loc[:, ['Age', 'Pclass']].apply(fill_age, axis='columns')
test_data['Age'] = test_data.loc[:, ['Age', 'Pclass']].apply(fill_age, axis='columns')
'''Encoding categorical variables
We will encode variables like Sex, Embarked so that our model can use them while making decisions'''
# One-hot encode each categorical column, dropping the first level of each.
sex = pd.get_dummies(train_data['Sex'], drop_first=True)
embark = pd.get_dummies(train_data['Embarked'], drop_first=True)
'''On some research I found that the variables SibSp and Parch denote if the traveller
was alone or with family, so we'll make use of this information by encoding the two variables into one'''
# isAlone is 1 when SibSp + Parch is 0 and 0 otherwise; the sum is used
# directly instead of going through a temporary column.
train_data['isAlone'] = np.where(train_data['SibSp'] + train_data['Parch'] > 0, 0, 1)
'''Drop the variables which are not needed'''
train_data = train_data.drop(
    columns=['PassengerId', 'Cabin', 'Sex', 'Embarked', 'Name', 'Ticket', 'SibSp', 'Parch']
)
'''Need to include the encoded categorical variable'''
train_data = pd.concat([train_data, sex, embark], axis='columns')
'''Perform the same process for test data'''
test_data.isnull().sum()
''' Drop Cabin and impute Fare'''
# Fill every missing Fare with 8 instead of writing to the hard-coded row
# label 152, so the imputation still works if the rows are ordered differently.
test_data['Fare'] = test_data['Fare'].fillna(8)
# isAlone is 1 when SibSp + Parch is 0 and 0 otherwise.
test_data['isAlone'] = np.where(test_data['SibSp'] + test_data['Parch'] > 0, 0, 1)
# One-hot encode the categorical columns, dropping the first level of each.
sex = pd.get_dummies(test_data['Sex'], drop_first=True)
embark = pd.get_dummies(test_data['Embarked'], drop_first=True)
# PassengerId is not dropped here (the training frame drops it).
test_data = test_data.drop(
    columns=['Sex', 'Cabin', 'Embarked', 'Name', 'Ticket', 'SibSp', 'Parch']
)
test_data = pd.concat([test_data, sex, embark], axis=1)
'''Dividing the train data and label'''
# Pull the target column out first, then remove it from the feature frame.
train_label = train_data.loc[:, 'Survived']
train_data = train_data.drop(columns=['Survived'])
'''Feature Scaling
from sklearn.preprocessing import StandardScaler
sc = StandardScaler()
train_data = sc.fit_transform(train_data)
test_data1 = sc.transform(test_data.drop(['PassengerId'],axis=1))'''
'''Splitting the training dataset into the Training set and Test set'''
from sklearn.model_selection import train_test_split
# Hold out 20% of the labelled rows for evaluation; random_state fixes the shuffle.
X_train, X_test, y_train, y_test = train_test_split(
    train_data,
    train_label,
    test_size=0.2,
    random_state=0,
)
'''We will use Logistic Regression for this problem'''
'''Importing the scikit-learn model and metric'''
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import confusion_matrix
# fit() returns the estimator itself, so construction and training are chained.
lm = LogisticRegression().fit(X_train, y_train)
y_pred = lm.predict(X_test)
# Confusion matrix of the held-out labels against the model's predictions.
cm = confusion_matrix(y_true=y_test, y_pred=y_pred)
'''We get accuracy of 78.21% for this model'''