/
absenteeism-data-cleaning.py
116 lines (94 loc) · 3.92 KB
/
absenteeism-data-cleaning.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
import pandas as pd
# Import csv data
df = pd.read_csv("Absenteeism-data.csv", delimiter=",")
# Create backup
raw_csv_data = df.copy()
# Delete ID column as it is not needed for analysis
df = df.drop(["ID"], axis = 1)
# Get descriptive statistics of columns
print(df.describe())
# Explore values within "Reason for Abscence" column
print(df["Reason for Absence"].min())
print(df["Reason for Absence"].max())
print(df["Reason for Absence"].unique())
print(len(df["Reason for Absence"].unique()))
print(sorted(df["Reason for Absence"].unique()))
# Set categorical nominal values for "Reason for Absence" to dummy values
# where 1 = presence of categorical event
# where 0 = absence of categorical event
reason_columns = pd.get_dummies(df["Reason for Absence"], dtype = int)
# Check that their are no missing values for "Reason for Absence"
reason_columns["check"] = reason_columns.sum(axis = 1)
print(reason_columns["check"].unique())
# Remove check column
reason_columns = reason_columns.drop(["check"], axis = 1)
# Drop reason column 0, that represents records with 0 hours of
# absenteeism as analysis is not concerned with records without absenteeism
reason_columns = reason_columns.drop([0], axis = 1)
# To avoid multicollinearity the "Reason for Abscence" column will be
# replaced with the grouping / classification of reason_columns columns
df = df.drop(["Reason for Absence"], axis = 1)
# Group absence reasons
reason_type1 = reason_columns.loc[:, 1:14].max(axis=1)
reason_type2 = reason_columns.loc[:, 15:17].max(axis=1)
reason_type3 = reason_columns.loc[:, 18:21].max(axis=1)
reason_type4 = reason_columns.loc[:, 22:].max(axis=1)
# Concatenate absence reason group columns to data frame
df = pd.concat([df, reason_type1, reason_type2, reason_type3, reason_type4],
axis = 1)
# Rename columns with names of 0, 1, 2, and 3 as Reason_1 ... Reason_4
# Get comma seperated list of column names that can be pasted into column_names
print([df.columns.values])
column_names = ["Date", "Transportation Expense", "Distance to Work", "Age",
"Daily Work Load Average", "Body Mass Index", "Education",
"Children", "Pets", "Absenteeism Time in Hours", "Reason_1",
"Reason_2", "Reason_3", "Reason_4"]
# Assigned revised column_names list to data frame
df.columns = column_names
# Check column names
print(df.head())
# Reorder columns
column_names_reordered = ["Reason_1", "Reason_2", "Reason_3", "Reason_4",
"Date", "Transportation Expense", "Distance to Work",
"Age", "Daily Work Load Average", "Body Mass Index",
"Education", "Children", "Pets",
"Absenteeism Time in Hours"]
df = df[column_names_reordered]
# Check column names
print(df.head())
# Create backup
df_reason_mod = df.copy()
# Check data type
print(type(df["Date"][0]))
# Change string to timestamp
df["Date"] = pd.to_datetime(df["Date"], format = "%d/%m/%Y")
# Check date data type is datetime
print(df.info())
# Extract month from each record and store in months_list
months_list = []
for i in range(df.shape[0]):
months_list.append(df["Date"][i].month)
# Update data frame with "Month" column
df["Month"] = months_list
# Check months
print(df.head(20))
# Extract day of week and update data frame with "Day of Week" column
df["Day of Week"] = df["Date"].apply(lambda d: d.weekday())
# Check day of week
print(df["Day of Week"].iloc[:5])
# Reorder "Month" and "Day of Week" columns
col = df.pop("Month")
df.insert(4, col.name, col)
col = df.pop("Day of Week")
df.insert(5, col.name, col)
# Check "Month" and "Day of Week"
print(df.head(20))
# Remove Date column
df = df.drop(["Date"], axis=1)
# Group education as binary variable 0 - high school, 1 - college
df["Education"].map({1:0, 2:1, 3:1, 4:1})
# Check "Education" values
df["Education"].unique()
df["Education"].value_counts()
# Save data frame to .csv file
df.to_csv("absenteeism-cleaned.csv", index=False)