-
Notifications
You must be signed in to change notification settings - Fork 0
/
spaceship_etl.py
98 lines (69 loc) · 4.08 KB
/
spaceship_etl.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
from sklearn.impute import SimpleImputer, KNNImputer
def col_splitter(df):
#PassengerId A is unique Id for each passenger.
#Each Id takes the form gggg_pp where gggg indicates a group the passenger is travelling with and pp is their number within the group.
df['GroupId'] = [int(i.split('_')[0]) for i in df['PassengerId']]
#The cabin number takes the form deck/num/side, where side can be either P for Port or S for Starboard
df[["Deck", "Cabin_num", "Side"]] = df["Cabin"].str.split("/", expand=True)
return df
def bool_transformer(df):
df['VIP'] = df['VIP'].astype(int)
df['CryoSleep'] = df['CryoSleep'].astype(int)
return df
def custom_imputer(df):
# create an instance of the SimpleImputer class
imputer = SimpleImputer(strategy='most_frequent')
# find the group ID of passengers whose 'HomePlanet' or 'Destination' value is missing
group_list = df.loc[(df['Destination'].isna()) | (df['HomePlanet'].isna())]['GroupId'].to_list()
# we know for sure passengers within a group have the same HomePlanet and Destination, so we will apply imputer within groups.
for i in group_list:
temp_df = df.loc[df.GroupId == i].copy()
idx = temp_df.index
group_size = temp_df.shape[0]
# check whether whole data within the group is missing or not
if group_size > 1 and temp_df['HomePlanet'].isna().sum() < group_size and temp_df['Destination'].isna().sum() < group_size:
df.loc[idx,['HomePlanet', 'Destination']] = imputer.fit_transform(temp_df[['HomePlanet', 'Destination']])
# as we saw the Deck passengers use to get on spaceship depend on the starting point of their travel,
# for example there is no Deck 'E', 'F', 'G' in Europa.
# we will impute the missing data of Deck column within each HomePlanet seperately.
HomePlanets = ['Europa', 'Earth', 'Mars']
for planet in HomePlanets:
temp_df = df.loc[df.HomePlanet == planet].copy()
idx = temp_df.index
df.loc[idx,'Deck'] = imputer.fit_transform(temp_df[['Deck']])
# take the same aproach for imputing Deck columns based on Destinations.
Destinations = ['TRAPPIST-1e', 'PSO J318.5-22', '55 Cancri e']
for Destination in Destinations:
temp_df = df.loc[df.Destination == Destination].copy()
idx = temp_df.index
df.loc[idx,'Deck'] = imputer.fit_transform(temp_df[['Deck']])
# Now we don't have any missing Deck Value, So we can impute impute all the remaining HomePlanet and Destination of
# the passengers who travel alone or whole data of the group was missing.
Decks = ['B', 'F', 'A', 'G', 'E', 'D', 'C', 'T']
for Deck in Decks:
temp_df = df.loc[df.Deck == Deck].copy()
idx = temp_df.index
df.loc[idx,['HomePlanet', 'Destination']] = imputer.fit_transform(temp_df[['HomePlanet', 'Destination']])
# We will impute all numerical and boolean values with KNN Imputer because it is likely the passengers who are neighbours
# have the same behaviour spending or age
KNNimputer = KNNImputer(n_neighbors=2, weights="uniform")
features_num = ['Age', 'RoomService', 'FoodCourt', 'ShoppingMall', 'Spa', 'VRDeck', 'Cabin_num', 'CryoSleep', 'VIP']
df[features_num] = KNNimputer.fit_transform(df[features_num])
return df
def cleaner(df):
df = col_splitter(df)
df = custom_imputer(df)
df.Side = df.Side.fillna('P')
df = bool_transformer(df)
return df
def path_maker(df):
# Define a lambda function to show the travel path
path_maker = lambda x, y: x + "-" + y
# Use the map function to apply the add function to the 'Value1' and 'Value2' columns and create a new column 'Path'
df['Path'] = df[['HomePlanet','Destination']].apply(lambda x: path_maker(*x), axis=1)
return df
def avg_spent(df):
#df['Avg_Spent'] = df[['RoomService', 'Spa', 'VRDeck', 'FoodCourt', 'ShoppingMall']].sum(axis=1)/5
df['Avg_In_Spent'] = df[['RoomService', 'Spa', 'VRDeck']].sum(axis=1)/3
df['Avg_Out_Spent'] = df[['FoodCourt', 'ShoppingMall']].sum(axis=1)/2
return df