/
DVillalobos-WorkSample.Rmd
2082 lines (1301 loc) · 70.9 KB
/
DVillalobos-WorkSample.Rmd
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
258
259
260
261
262
263
264
265
266
267
268
269
270
271
272
273
274
275
276
277
278
279
280
281
282
283
284
285
286
287
288
289
290
291
292
293
294
295
296
297
298
299
300
301
302
303
304
305
306
307
308
309
310
311
312
313
314
315
316
317
318
319
320
321
322
323
324
325
326
327
328
329
330
331
332
333
334
335
336
337
338
339
340
341
342
343
344
345
346
347
348
349
350
351
352
353
354
355
356
357
358
359
360
361
362
363
364
365
366
367
368
369
370
371
372
373
374
375
376
377
378
379
380
381
382
383
384
385
386
387
388
389
390
391
392
393
394
395
396
397
398
399
400
401
402
403
404
405
406
407
408
409
410
411
412
413
414
415
416
417
418
419
420
421
422
423
424
425
426
427
428
429
430
431
432
433
434
435
436
437
438
439
440
441
442
443
444
445
446
447
448
449
450
451
452
453
454
455
456
457
458
459
460
461
462
463
464
465
466
467
468
469
470
471
472
473
474
475
476
477
478
479
480
481
482
483
484
485
486
487
488
489
490
491
492
493
494
495
496
497
498
499
500
501
502
503
504
505
506
507
508
509
510
511
512
513
514
515
516
517
518
519
520
521
522
523
524
525
526
527
528
529
530
531
532
533
534
535
536
537
538
539
540
541
542
543
544
545
546
547
548
549
550
551
552
553
554
555
556
557
558
559
560
561
562
563
564
565
566
567
568
569
570
571
572
573
574
575
576
577
578
579
580
581
582
583
584
585
586
587
588
589
590
591
592
593
594
595
596
597
598
599
600
601
602
603
604
605
606
607
608
609
610
611
612
613
614
615
616
617
618
619
620
621
622
623
624
625
626
627
628
629
630
631
632
633
634
635
636
637
638
639
640
641
642
643
644
645
646
647
648
649
650
651
652
653
654
655
656
657
658
659
660
661
662
663
664
665
666
667
668
669
670
671
672
673
674
675
676
677
678
679
680
681
682
683
684
685
686
687
688
689
690
691
692
693
694
695
696
697
698
699
700
701
702
703
704
705
706
707
708
709
710
711
712
713
714
715
716
717
718
719
720
721
722
723
724
725
726
727
728
729
730
731
732
733
734
735
736
737
738
739
740
741
742
743
744
745
746
747
748
749
750
751
752
753
754
755
756
757
758
759
760
761
762
763
764
765
766
767
768
769
770
771
772
773
774
775
776
777
778
779
780
781
782
783
784
785
786
787
788
789
790
791
792
793
794
795
796
797
798
799
800
801
802
803
804
805
806
807
808
809
810
811
812
813
814
815
816
817
818
819
820
821
822
823
824
825
826
827
828
829
830
831
832
833
834
835
836
837
838
839
840
841
842
843
844
845
846
847
848
849
850
851
852
853
854
855
856
857
858
859
860
861
862
863
864
865
866
867
868
869
870
871
872
873
874
875
876
877
878
879
880
881
882
883
884
885
886
887
888
889
890
891
892
893
894
895
896
897
898
899
900
901
902
903
904
905
906
907
908
909
910
911
912
913
914
915
916
917
918
919
920
921
922
923
924
925
926
927
928
929
930
931
932
933
934
935
936
937
938
939
940
941
942
943
944
945
946
947
948
949
950
951
952
953
954
955
956
957
958
959
960
961
962
963
964
965
966
967
968
969
970
971
972
973
974
975
976
977
978
979
980
981
982
983
984
985
986
987
988
989
990
991
992
993
994
995
996
997
998
999
1000
---
title: "Program Data Analyst: Energy Management follow up"
#subtitle: "DATA"
header-includes: # allows you to add in your own Latex packages
- \usepackage{titling}
- \pretitle{\begin{center}\LARGE\includegraphics[width=6in]{CUNYSPS-WorkHard.jpg}\\[\bigskipamount]}
- \posttitle{\end{center}}
- \usepackage{float} #use the 'float' package
- \floatplacement{figure}{H} #make every figure with caption = h
author: "Duubar Villalobos Jimenez -- mydvtech@gmail.com"
date: "May 27, 2019"
output:
prettydoc::html_pretty:
theme: leonids
highlight: github
toc: yes
df_print: paged
html_document:
df_print: paged
code_folding: hide
pdf_document:
highlight: tango
toc: true
toc_depth: 4
number_sections: false
df_print: kable
fig_width: 7
fig_height: 6
fig_caption: true
#template: quarterly-report.tex
#includes:
# in_header: preamble.tex
# before_body: doc-prefix.tex
# after_body: doc-suffix.tex
#citation_package: natbib
#keep_tex: true # To create .tex files
geometry: margin=1in
fontfamily: mathpazo
fontsize: 11pt
#spacing: double
bibliography: bibliography.bib
#biblio-style: "apalike"
link-citations: yes
---
```{r, echo=FALSE, warning=FALSE, error=FALSE, cache=FALSE, results='hide', message=FALSE}
knitr::opts_chunk$set(echo = TRUE, fig.pos = 'h')
```
```{r, echo=FALSE, warning=FALSE, message=FALSE}
# Library definitions
library(parallel) # for parallel computing
library(tidyr) # for data manipulation
library(dplyr) # for data manipulation
library(lubridate) # for dates
library(zoo) # for dates (yearmon)
library(fpp2) # for datasets and forecasting libraries
library(imputeTS) # to replace NAs in a ts
library(naniar) # for missing-data visualizations
library(kableExtra) # for table formatting
library(DT) # to create data tables
```
\newpage
```{r fig1, echo=FALSE, out.width='100%', fig.pos = 'h'}
knitr::include_graphics('/home/mydvtech/TESTGitXYZ/CUNY/Energy/un-sdgs.jpg')
```
# Requirements
A work sample report should be written, reflect your individual work effort, and illustrate your capability as a data analyst. Please include a brief summary that identifies the project goals, methodology, data sample, tools, etc. You are requested to submit the document in PDF format to us no later than noon on Thursday May 30th.
# Summary
This work sample will be created using a tool called R. @R is a language and environment for statistical computing and graphics that is rich for statistical and data analysis and for sharing results in various forms.
This sample, will encompass a total of two different projects, one involving time series; the other involving a more methodical approach to a given data set.
\newpage
# Work Samples
## Example 1
### Overview
Example 1 consists of a simple data set of residential power usage for January 1998 until December 2013. The data is given in a single file. The variable *"KWH"* is power consumption in Kilowatt hours, the rest is straight forward.
#### Objective
The objective is to model the data and to perform a monthly forecast for 2014.
```{r, echo=FALSE}
# Read file
power.data <- readxl::read_excel('/home/mydvtech/TESTGitXYZ/CUNY/Energy/data/ResidentialCustomerForecastLoad-624.xlsx')
```
### Procedure
**First**, let's have a small idea of how the data look like:
```{r, echo=FALSE}
kable(head(power.data)) %>%
kable_styling("striped", full_width = F)
```
From above, we notice 3 columns as follows:
**CaseSequence**: Indicate the Sequence of the readings.
**YYYY-MMM **: Indicate the date of the reading.
**KWH**: Indicate the value of the reading in KWH.
**Second**, let's have a description of the data:
```{r, echo=FALSE}
summary(power.data)
```
From above, there seems to be a missing value as reported in the summary table under NA's.
**Third**, I would like to have a visualization of the missing data since there's an indication of `NA`. For this purpose, I will make use of the function `vis_miss()` from the library `naniar`.
```{r, echo=FALSE, fig.height = 5, fig.width = 9, fig.align = "center"}
vis_miss(power.data)
```
Let's have a better understanding of the missing data.
```{r, echo=FALSE}
kable(power.data[!complete.cases(power.data),]) %>%
kable_styling("striped", full_width = F)
```
Currently, we are not sure why there's a missing value for the month of September of 2008. At this current point in time I am not sure if I should just remove the missing value or replace it with a more meaningful reading perhaps the mean value for all months representing September. I will come back to this issue as I go further.
**Fourth:** Let's create a time series object.
```{r, echo=FALSE}
# Parse the "YYYY-MMM" strings into zoo yearmon dates (not strictly
# required for the ts, but useful for sorting and inspection).
power.data$DATE <- as.yearmon(power.data$`YYYY-MMM`, "%Y-%b")
# Sort chronologically by the parsed DATE column.
power.data <- power.data[order(power.data$DATE),]
# Create a monthly ts object starting January 1998.
start_date <- c(1998, 1)
power.ts <- ts(data = power.data[, c("KWH")],
start = start_date,
frequency = 12)
```
Let's have a better understanding of the time series.
```{r, echo=FALSE}
power.ts
```
From the above table, it is evident that we need to replace the NA with a more "meaningful" value, it is not recommended eliminate such value; my approach will be to calculate the mean of all readings for all years for the month of September and replace the NA with such value.
```{r, echo=FALSE}
# Calculate the mean for each month, it removes NAs
power.mean <- tapply(power.ts, cycle(power.ts), mean, na.rm=TRUE)
# Replacing missing NA with mean
power.ts <- na.replace(power.ts, fill = power.mean[9] )
```
Time series after replacement of missing data with the mean for the respective month, in this case it was for Sep, 2008; it got replaced for `r format(power.mean[9], scientific=FALSE)`.
```{r, echo=FALSE}
power.ts
```
Let's visualize our data.
```{r, echo=FALSE,fig.height = 5, fig.width = 9, fig.align = "center"}
ggseasonplot(power.ts, polar=TRUE) +
ylab("KWH") +
ggtitle("Polar seasonal plot: Monthly KWH readings.")
```
```{r, echo=FALSE, warning=FALSE, fig.height = 5, fig.width = 9, fig.align = "center"}
# Plot time series
autoplot(power.ts, facet=TRUE) +
xlab("Date") +
ylab("KWH") +
ggtitle("Monthly KWH readings")
```
In this particular case, I am not sure why there is a very low reading for July, 2010; it is currently showing an unusually low value (corresponding to some large values in the remainder time series). One possibility could be a power outage during the summer; this seems plausible. I did some research, but since there is no reference to the geographical area for the data set, I could not confirm it. Assuming this to be the cause, I will consider it an accurate reading and will not change that value.
Also, the data are clearly non-stationary, as the series wanders up and down for some periods. Consequently, we will take a first difference of the data. The difference data are shown below.
Let's have a visualization of the differences.
```{r, echo=FALSE}
power.ts %>% diff() %>% ggtsdisplay(main="")
```
In the above plot, we notice some auto correlations in the lag, the PACF suggest a AR(3) model. So an initial candidate model is **ARIMA(3,1,0)**.
**Training/test:** In this section, I will split the given data into Train/Test data. This will be used in order to determine the accuracy of the model.
```{r,echo=TRUE}
power.train <- window(power.ts, end=c(2012,12))
power.test <- window(power.ts, start=c(2013,1))
```
**ARIMA** Let's find an arima model.
**Regular fit.** No transformation, the reason why, is because there seems to be no evidence of changing variance.
```{r,echo=TRUE}
power.fit.manual.arima <- Arima(power.train, c(3,1,0))
power.fit.auto.arima <- auto.arima(power.train, seasonal=FALSE, stepwise=FALSE, approximation=FALSE)
```
Let's see the results:
**Manual Arima fit**
```{r, echo=FALSE}
summary(power.fit.manual.arima)
```
**Auto Arima fit**
```{r, echo=FALSE}
summary(power.fit.auto.arima)
```
If we compare both models, we notice how the RMSE value is by far a better value in the Auto Arima model, also, another indication is the AICc value, in this case the Auto Arima model has a better value compared to our manually selected model. Hence, I will pick the Automated Arima model.
**Accuracy** Let's find how accurate the models are:
**Manual Arima(3,1,0)**
```{r, echo=FALSE,}
# Calculating forecasts
power.forecast.manual.arima <- forecast(power.fit.manual.arima, h=12)
# Calculating accuracy
power.accuracy.manual.arima <- accuracy(power.forecast.manual.arima, power.test)
```
Let's visualize the manually selected Arima(3,1,0) model forecast results:
```{r,echo=FALSE}
kable(data.frame(power.forecast.manual.arima)) %>%
kable_styling("striped", full_width = F)
```
Let's take a look at the accuracy table and let's focus on the RMSE results for the manually selected Arima model. In this particular case, the test set results are not very promising.
```{r, echo=FALSE}
kable(data.frame(power.accuracy.manual.arima)) %>%
kable_styling("striped", full_width = F)
```
Let's have a visualization of the manually selected Arima model forecasts.
```{r, echo=FALSE, warning=FALSE, fig.height = 5, fig.width = 9, fig.align = "center"}
power.forecast.manual.arima %>% autoplot(include=80)
```
In effect, the curve seems not to follow the pattern of the data.
**Automated Arima(3,1,1)**
```{r, echo=FALSE,}
# Calculating forecasts
power.forecast.auto.arima <- forecast(power.fit.auto.arima, h=12)
# Calculating accuracy
power.accuracy.auto.arima <- accuracy(power.forecast.auto.arima, power.test)
```
Let's visualize the automatically selected Arima(3,1,1) with drift model forecast results:
```{r,echo=FALSE}
kable(data.frame(power.forecast.auto.arima)) %>%
kable_styling("striped", full_width = F)
```
Let's take a look at the accuracy table and let's focus on the RMSE results for the automatically selected Arima model. In this particular case, the test set results are not very promising. Now, comparing the RMSE values to our manually selected model, there's an improvement but still, it seems that the forecasts are not very accurate with this model.
```{r, echo=FALSE}
kable(data.frame(power.accuracy.auto.arima)) %>%
kable_styling("striped", full_width = F)
```
Let's have a visualization of the automatically selected Arima model forecasts.
```{r, echo=FALSE, warning=FALSE, fig.height = 5, fig.width = 9, fig.align = "center"}
power.forecast.auto.arima %>% autoplot(include=80)
```
Let's compare side by side the test forecasts, compared to our test data.
```{r, echo=FALSE, warning=FALSE, fig.height = 5, fig.width = 9, fig.align = "center"}
# Overlay the point forecasts of the manual and automatic ARIMA fits on
# the full observed KWH series for a side-by-side visual comparison.
fmanual.power.forecast <- power.forecast.manual.arima$mean
fauto.power.forecast <- power.forecast.auto.arima$mean
autoplot(power.ts) +
  autolayer(fmanual.power.forecast, series="Manual") +
  autolayer(fauto.power.forecast, series="Auto") +
  xlab("Year") +
  ylab("KWH") + # was "Turnover" -- copy-paste label; the series is KWH consumption
  ggtitle("KWH Forecast comparison.") +
  guides(colour=guide_legend(title="Forecast"))
```
In effect, the forecasts are not very accurate and perhaps another model should be selected.
**STL model:** Based on the previous results, I will focus on the STL model.
STL is a versatile and robust method for decomposing time series. STL is an acronym for “Seasonal and Trend decomposition using Loess”.
STL has several advantages over the classical, SEATS and X11 decomposition methods:
- Unlike SEATS and X11, STL will handle any type of seasonality, not only monthly and quarterly data.
- The seasonal component is allowed to change over time, and the rate of change can be controlled by the user.
- The smoothness of the trend-cycle can also be controlled by the user.
- It can be robust to outliers (i.e., the user can specify a robust decomposition), so that occasional unusual observations will not affect the estimates of the trend-cycle and seasonal components. They will, however, affect the remainder component.
Let's visualize the STL decomposition.
```{r, echo=FALSE, warning=FALSE, fig.height = 5, fig.width = 9, fig.align = "center"}
power.fit.stl <- stl(power.train[,1], t.window=13, s.window="periodic", robust=TRUE)
autoplot(power.fit.stl) +
xlab("Date") +
ylab("KWH") +
ggtitle("STL decomposition for the KWH consumption")
```
Let's forecast with the **naive** and **snaive** method.
```{r, echo=TRUE}
# Calculating forecasts for naive and snaive
power.fit.naive <- forecast(power.fit.stl, method="naive", h =12)
power.fit.snaive <- snaive(power.train[,1], h =12)
# Calculating accuracy
power.accuracy.naive <- accuracy(power.fit.naive, power.test)
power.accuracy.snaive <- accuracy(power.fit.snaive, power.test)
```
Let's see the respective accuracy results for both models.
**Naive** Forecast accuracy results.
```{r, echo=FALSE}
kable(data.frame(power.accuracy.naive)) %>%
kable_styling("striped", full_width = F)
```
**SNaive** Forecast accuracy results.
```{r, echo=FALSE}
kable(data.frame(power.accuracy.snaive)) %>%
kable_styling("striped", full_width = F)
```
In this particular case, the **snaive** method offers a much better RMSE value. Making this model the most accurate of them all.
Let's visualize the results.
```{r, echo=FALSE,fig.height = 5, fig.width = 9, fig.align = "center"}
autoplot(power.ts) +
autolayer(power.fit.naive, series="STL Naive.", PI=FALSE) +
autolayer(power.fit.snaive, series="Seasonal Naive.", PI=FALSE) +
xlab("Date") +
ylab("KWH") +
ggtitle("Comparing KWH forecast consumption")
```
**Forecasting 2014** Employing snaive model.
```{r, echo=FALSE}
power.fit.snaive.2014 <- snaive(power.ts[,1], h =12)
```
**Forecast results**
```{r, echo=FALSE}
kable(data.frame(power.fit.snaive.2014)) %>%
kable_styling("striped", full_width = F)
```
**Forecast visualization**
```{r, echo=FALSE,fig.height = 5, fig.width = 9, fig.align = "center"}
# Plot the 2014 seasonal-naive forecast with its prediction intervals.
autoplot(power.fit.snaive.2014) +
  xlab("Year") +
  ylab("KWH") + # was "Turnover" -- copy-paste label; the series is KWH consumption
  ggtitle("2014 KWH Forecast.") +
  guides(colour=guide_legend(title="Forecast"))
```
### Conclusion
From the above analysis, we can conclude that a good prediction model is the STL decomposition employing the snaive method, due to the similar pattern between the testing data and the predicted future values.
\newpage
## Example 2
```{r, echo=FALSE, warning=FALSE, error=FALSE, cache=FALSE, results='hide', message=FALSE}
library(dplyr)
library(car)
library(stringr)
library(corrplot)
library(PerformanceAnalytics)
library(caret)
library(pROC)
```
### Overview
Example 2 consists of the following: to explore, analyze and model a data set containing approximately $8000$ records, each representing a customer at an auto insurance company. Each record has two response variables. The first response variable, **TARGET_FLAG**, is a $1$ or a $0$. A "$1$" means that the person was in a car crash. A zero means that the person was not in a car crash. The second response variable is **TARGET_AMT**. This value is zero if the person did not crash their car. But if they did crash their car, this number will be a value greater than zero.
#### Objective
The objective is to build multiple linear regression and binary logistic regression models on the training data to predict the probability that a person will crash their car and also the amount of money it will cost if the person does crash their car. I can only use the variables given (or variables that I derive from the variables provided).
### Procedure
In this example, I will make use of a version and collaboration tool called github, alongside R. GitHub, a subsidiary of Microsoft, is an American web-based hosting service for version control using Git. It is mostly used for computer code. It offers all of the distributed version control and source code management (SCM) functionality of Git as well as adding its own features.
It provides access control and several collaboration features such as bug tracking, feature requests, task management, and wikis for every project.
#### Dataset description
Let's start by taking a look at the data and it's respective dictionary.
```{r, echo=FALSE}
git_user <- 'dvillalobos'
git_user <- paste('https://raw.githubusercontent.com/',git_user,sep = "")
git_dir <- '/MSDS/master/621/Homeworks/assignment-04/data/'
data.desc <- read.csv(paste(git_user, git_dir, "hmwrk4-vardesc.csv", sep = ""))
data.desc$VARIABLE_NAME <- str_trim(data.desc$VARIABLE_NAME)
data.desc$DEFINITION <- str_trim(data.desc$DEFINITION)
data.desc$THEORETICAL_EFFECT <- str_trim(data.desc$THEORETICAL_EFFECT)
```
##### Variable definitions
The below list represent the definitions for each given variable.
```{r, echo=FALSE}
data.desc[,c("VARIABLE_NAME", "DEFINITION")]
```
##### Theoretical effect of variables
The below list represent the theoretical effects for each given variable.
```{r, echo=FALSE}
data.desc[,c("VARIABLE_NAME", "THEORETICAL_EFFECT")]
```
#### Data Exploration
Let's take a look at the hidden layers and some composition of the data set.
##### Data acquisition
For reproducibility purposes, I have included the original data sets in my Git Hub account, I will read it as a data frame from that location.
```{r, echo=FALSE}
# Read a CSV hosted on GitHub raw content. The full URL is the simple
# concatenation of the user prefix, the repository directory, and the
# file name. Strings are imported as factors, which the later
# factor-handling steps rely on.
get_data <- function(git_user, git_dir, file){
  read.csv(paste0(git_user, git_dir, file), stringsAsFactors = TRUE)
}
```
```{r}
data.train <- get_data(git_user, git_dir, 'insurance_training_data.csv')
data.eval <- get_data(git_user, git_dir, 'insurance-evaluation-data.csv')
```
##### General exploration
The below process will help us obtain insights from our given data.
**Dimensions**
Let's see the dimensions of our training data set.
```{r, echo=FALSE}
dimensions <- dim(data.train)
dimensions <- data.frame('Records' = dimensions[1],
'Variables' = dimensions[2])
dimensions
```
From the above table, we can see how the training data set has a total of `r dimensions$Records[1]` different records and `r dimensions$Variables[1]` variables including **INDEX, TARGET_FLAG** and **TARGET_AMT**. These variables do not represent much of the initial insights since they correspond to our response variables.
For simplicity reasons, I will discard the **INDEX** column.
```{r, echo=TRUE}
remove_cols <- names(data.train) %in% c('INDEX')
data.train <- data.train[!remove_cols]
```
**Structure**
The below structure is currently present in the data, for simplicity reasons, I have previously loaded and treated this data set as a data frame in which all the variables with decimals are numeric.
```{r, echo=FALSE}
# Taken from https://gist.github.com/jbryer/4a0a5ab9fe7e1cf3be0e
# strtable provides the information str.data.frame does but returns the
# results as a data.frame, giving much more flexibility for controlling
# how the output is formatted. It returns a data.frame with four columns:
# variable, class, levels, and examples.
#
# Arguments:
#   df            - the data.frame to describe (required).
#   n             - maximum number of example values shown per variable.
#   width         - maximum character width of the examples string.
#   n.levels      - maximum number of factor levels shown (defaults to n).
#   width.levels  - maximum width of the levels string (defaults to width).
#   factor.values - function used to render factor values as characters.
strtable <- function(df, n=4, width=60,
                     n.levels=n, width.levels=width,
                     factor.values=as.character) {
  stopifnot(is.data.frame(df))
  # Pre-allocate the result: one row per column of df.
  tab <- data.frame(variable=names(df),
                    class=rep(as.character(NA), ncol(df)),
                    levels=rep(as.character(NA), ncol(df)),
                    examples=rep(as.character(NA), ncol(df)),
                    stringsAsFactors=FALSE)
  # Collapse up to n values of col into one comma-separated string no
  # wider than width characters; numeric values are shown bare, all other
  # types quoted. Appends ', ...' when col has more than n elements.
  # NOTE(review): if even the first value exceeds width, result stays NA
  # and the output reads "NA, ..." -- inherited from the original gist.
  collapse.values <- function(col, n, width) {
    result <- NA
    for(j in 1:min(n, length(col))) {
      el <- ifelse(is.numeric(col),
                   paste0(col[1:j], collapse=', '),
                   paste0('"', col[1:j], '"', collapse=', '))
      if(nchar(el) <= width) {
        result <- el
      } else {
        break # adding one more value would exceed width; keep previous result
      }
    }
    if(length(col) > n) {
      return(paste0(result, ', ...'))
    } else {
      return(result)
    }
  }
  # Fill one row per variable; factors also report level count and labels.
  for(i in seq_along(df)) {
    if(is.factor(df[,i])) {
      tab[i,]$class <- paste0('Factor w/ ', nlevels(df[,i]), ' levels')
      tab[i,]$levels <- collapse.values(levels(df[,i]), n=n.levels, width=width.levels)
      tab[i,]$examples <- collapse.values(factor.values(df[,i]), n=n, width=width)
    } else {
      tab[i,]$class <- class(df[,i])[1]
      tab[i,]$examples <- collapse.values(df[,i], n=n, width=width)
    }
  }
  # Tag with an S3 class so print.strtable can customize printing.
  class(tab) <- c('strtable', 'data.frame')
  return(tab)
}
#' Prints the results of \code{\link{strtable}}.
#' @param x result of code \code{\link{strtable}}.
#' @param ... other parameters passed to \code{\link{print.data.frame}}.
#' @export
print.strtable <- function(x, ...) {
  # Dispatch to the inherited data.frame method, suppressing row names.
  NextMethod(x, row.names=FALSE, ...)
}
```
```{r, echo=FALSE}
#str(data.train)
str.data.train <- strtable(data.train)[,c(1:3)]
str.data.train
```
From the above table, we can notice how we need to take care of certain strings that are seen as factors but in reality represent numbers and should not be treated as factors. This will be addressed in more detail as we advance.
##### Summaries
Let's find some summary statistics about our given data, for that; I will get a little bit more insights for all the columns including the **TARGET_FLAG** and **TARGET_AMT** variables.
```{r, echo=FALSE, warning=FALSE, message=FALSE}
# Build a transposed data frame of summary() statistics for every column
# of df: one row per variable with columns Min / 1st Qu / Median / Mean /
# 3rd Qu / Max, plus an `Other` column (e.g. NA counts, extra factor
# levels) when summary() produced a seventh field.
get_df_summary <- function(df){
  # unclass(summary(df)) is a character matrix of "Label : value" strings.
  df.summary <- data.frame(unclass(summary(df)),
                           check.names = FALSE,
                           row.names = NULL,
                           stringsAsFactors = FALSE)
  # Transpose so variables become rows and statistics become columns.
  df.summary <- data.frame(t(df.summary))
  # A 7th column appears when summary() emitted extra lines such as
  # "NA's : n" or "(Other) : n" for factors.
  if ( length(colnames(df.summary)) > 6 ){
    colnames(df.summary) <- c('Min', '1st Qu', 'Median', 'Mean', '3rd Qu', 'Max', 'Other')
    df.summary$Other <- as.character(df.summary$Other)
  } else {
    colnames(df.summary) <- c('Min', '1st Qu', 'Median', 'Mean', '3rd Qu', 'Max')
  }
  # Strip the "Label :" prefixes to recover plain numbers.
  # NOTE(review): the unescaped '.' in patterns like 'Min. :' is a regex
  # wildcard, and each pattern assumes a single space before ':' -- entries
  # whose padding differs become NA via as.numeric. Confirm against the
  # actual summary() output width for this data.
  df.summary$Min <- as.numeric(gsub('Min. :', '', df.summary$Min))
  df.summary$`1st Qu` <- as.numeric(gsub('1st Qu.:', '', df.summary$`1st Qu`))
  df.summary$Median <- as.numeric(gsub('Median :', '', df.summary$Median))
  df.summary$Mean <- as.numeric(gsub('Mean :', '', df.summary$Mean))
  df.summary$`3rd Qu` <- as.numeric(gsub('3rd Qu.:', '', df.summary$`3rd Qu`))
  df.summary$Max <- as.numeric(gsub('Max. :', '', df.summary$Max))
  # Blank out NAs for display; note this coerces affected columns to
  # character, which matters to downstream users of this table.
  df.summary[is.na(df.summary)] <- ""
  row.names(df.summary) <- str_trim(row.names(df.summary))
  return(df.summary)
}
```
**Combined Summary**
In this section, we will explore the combined results as introductory insights.
```{r, echo=FALSE, message=FALSE, warning=FALSE}
# Fixed chunk option typo "ech0" -> "echo": knitr silently ignores
# unknown options, so the code was echoed despite the author's intent.
data.train.summary <- get_df_summary(data.train)
data.train.summary
```
Please note that this is for introductory insights and should not be considered as complete results.
**TARGET_FLAG Summaries**
In the mean time I will split the data into two data-sets depending on the **TARGET_FLAG**. Let's see some summaries for each group.
```{r, echo=TRUE}
TARGET_FLAG_0 <- data.train[data.train$TARGET_FLAG == 0,]
TARGET_FLAG_1 <- data.train[data.train$TARGET_FLAG == 1,]
```
**Number of records by group**
The below table shows how many records each group has.
```{r, echo=FALSE}
dim_0 <-dim(TARGET_FLAG_0)[1]
dim_1 <-dim(TARGET_FLAG_1)[1]
dim_t <-dim(data.train)[1]
dim_0.p <- round(dim_0[1] * 100 / dim_t[1],2)
dim_1.p <- round(dim_1[1] * 100 / dim_t[1],2)
dim_df <- t(data.frame("TARGET_FLAG_0" = c(dim_0, dim_0.p),
"TARGET_FLAG_1" = c(dim_1, dim_1.p),
"TOTAL" = c(dim_t, dim_0.p+dim_1.p)))
colnames(dim_df) <- c("Records", "Percentage")
data.frame(dim_df)
```
**TARGET_FLAG = 0**
Let's have a better look at the individualized summaries by having TARGET_FLAG = 0.
```{r, echo=FALSE, message=FALSE, warning=FALSE}
# Fixed chunk option typo "ech0" -> "echo" so the code is hidden as intended.
data.train.summary_0 <- get_df_summary(TARGET_FLAG_0)
data.train.summary_0
```
**TARGET_FLAG = 1**
Let's have a better look at the individualized summaries by having TARGET_FLAG = 1.
```{r, echo=FALSE, message=FALSE, warning=FALSE}
# Fixed chunk option typo "ech0" -> "echo" so the code is hidden as intended.
data.train.summary_1 <- get_df_summary(TARGET_FLAG_1)
data.train.summary_1
```
From the above reports, we can notice how we need to "transform" our data in order to make it more workable.
In order to do so, I will start to prep the data. Graphical visualizations and correlations will be provided later on since we need to address a few things in order to make our data more workable.
#### Findings
From the above tables, it is interesting to note the following:
- The training data-set shows the presence of missing values or **NAs** in some columns; this can be seen in the **Other** column. This will be addressed as we prepare our data down the road.
- **"(Other)"** means that there are factor values that could not be grouped accordingly.
- Interesting to see that **CAR_AGE** shows a minimum value of -3. This needs to be investigated since it seems that it's not accurate.
- The Maximum value for **TARGET_AMT** seems to be very far away from the mean and the median value. This needs to be evaluated and find out if this is accurate.
#### Data Preparation
In this section, I will prepare our given data-set. For that I will need to address a few things, like factors and missing data.
##### Data conversion
In this section, I will describe the conversion of the data that is required in order to have a more manageable understanding of it.
**FACTOR to NUMERIC**
This section explains the conversions of currency values in which the system interpreted as factors when in reality these should have been treated as numeric type.
The variables that need to be converted are: **INCOME, HOME_VAL, BLUEBOOK** and **OLDCLAIM**.
```{r, echo=FALSE}
# Convert currency columns that read.csv imported as factors into numeric.
# The affected columns (INCOME, HOME_VAL, BLUEBOOK, OLDCLAIM) hold strings
# like "$67,349"; dollar signs and thousands separators are stripped before
# numeric conversion. Empty entries become NA (with a coercion warning),
# exactly as in the original column-by-column implementation, which this
# replaces with a single loop to remove the four-fold copy-paste.
factor_to_numeric <- function(df){
  money_cols <- c("INCOME", "HOME_VAL", "BLUEBOOK", "OLDCLAIM")
  for (col in money_cols) {
    # as.character first so factor level codes are not converted directly
    df[[col]] <- as.numeric(gsub('[$,]', '', as.character(df[[col]])))
  }
  return(df)
}
```
```{r, echo=FALSE}
data.train <- factor_to_numeric(data.train)
```
**FACTOR to "Dummy"**
In this section, I will transform the remaining factor variables into various binary "Dummy" variables with values 1 for "Yes" and 0 for "No".
The variables that need to be converted are: **PARENT1, MSTATUS, SEX, EDUCATION, JOB, CAR_USE, CAR_TYPE, RED_CAR, REVOKED** and **URBANICITY**.
Please note that there will be a new set of variables summarizing the above data as follows:
- **IS_SINGLE_PARENT** will represent if **PARENT1** = "Yes" with a value of $1$; $0$ otherwise.
- **IS_MARRIED** will represent if **MSTATUS** = "Yes"" with a value of $1$; $0$ otherwise.
- **IS_FEMALE** will represent if **SEX** = "z_F" with a value of $1$; $0$ otherwise.
- **EDUCATION** will represent diverse **EDUCATION** levels, "<High School" is the default and this column was not included.
- **JOB** will represent diverse **JOB** levels, "Blank" is the default and this column was not included.
- **IS_CAR_PRIVATE_USE** will represent if **CAR_USE** = "Private" with a value of $1$; $0$ otherwise.
- **CAR_TYPE** will represent diverse **CAR_TYPE** levels, "Minivan" is the default and this column was not included.
- **IS_CAR_RED** will represent if **RED_CAR** = "Yes" with a value of $1$; $0$ otherwise.
- **IS_LIC_REVOKED** will represent if **REVOKED** = "Yes" with a value of $1$; $0$ otherwise.
- **IS_URBAN** will represent if **URBANICITY** = "Highly Urban/ Urban" with a value of $1$; $0$ otherwise.
```{r, echo=FALSE}
# Transform the remaining factor variables into binary "dummy" (0/1)
# columns via model.matrix, then drop the original factor columns.
# Two-level factors collapse to a single IS_* flag; multi-level factors
# (EDUCATION, JOB, CAR_TYPE) keep one indicator column per non-default
# level. Column names like SEXz_F come from model.matrix pasting the
# variable name to each level label.
factor_to_dummy <- function(df){
  # model.matrix(~ VAR - 1) yields one 0/1 indicator column per level;
  # the "- 1" removes the intercept so every level gets a column.
  PARENT1 <- data.frame(model.matrix( ~ PARENT1 - 1, data=df ))
  MSTATUS <- data.frame(model.matrix( ~ MSTATUS - 1, data=df ))
  SEX <- data.frame(model.matrix( ~ SEX - 1, data=df ))
  EDUCATION <- data.frame(model.matrix( ~ EDUCATION - 1, data=df ))
  JOB <- data.frame(model.matrix( ~ JOB - 1, data=df ))
  CAR_USE <- data.frame(model.matrix( ~ CAR_USE - 1, data=df ))
  CAR_TYPE <- data.frame(model.matrix( ~ CAR_TYPE - 1, data=df ))
  RED_CAR <- data.frame(model.matrix( ~ RED_CAR - 1, data=df ))
  REVOKED <- data.frame(model.matrix( ~ REVOKED - 1, data=df ))
  URBANICITY <- data.frame(model.matrix( ~ URBANICITY - 1, data=df ))
  # Two-level factors: keep only the "yes"/female/private indicator.
  # NOTE(review): RED_CAR uses RED_CARyes (lower-case level) while the
  # others use ...Yes -- presumably the raw data's level labels differ;
  # confirm against the data dictionary.
  df <- cbind(df, "IS_SINGLE_PARENT" = PARENT1$PARENT1Yes)
  df <- cbind(df, "IS_MARRIED" = MSTATUS$MSTATUSYes)
  df <- cbind(df, "IS_FEMALE" = SEX$SEXz_F)
  # NOTE(review): the positional slices below ([2:5], [2:9], [2:6]) rely
  # on the factor level ordering putting the default level first -- TODO
  # confirm the level order is stable across train/eval sets.
  df <- cbind(df, EDUCATION[2:5]) # <High School is the default
  df <- cbind(df, JOB[2:9]) # Other ("Blank") is the default
  df <- cbind(df, "IS_CAR_PRIVATE_USE" = CAR_USE$CAR_USEPrivate)
  df <- cbind(df, CAR_TYPE[2:6] ) # CAR_TYPEMinivan is the default
  df <- cbind(df, "IS_CAR_RED" = RED_CAR$RED_CARyes)
  df <- cbind(df, "IS_LIC_REVOKED" = REVOKED$REVOKEDYes)
  df <- cbind(df, "IS_URBAN" = URBANICITY$URBANICITYHighly.Urban..Urban)
  # Drop the original factor columns now that the dummies exist.
  remove_cols <- names(df) %in% c('PARENT1', 'MSTATUS', 'SEX',
                                  'EDUCATION', 'JOB', 'CAR_USE',
                                  'CAR_TYPE', 'RED_CAR', 'REVOKED',
                                  'URBANICITY')
  df <- df[!remove_cols]
  return(df)
}
```
```{r, echo=FALSE}
data.train <- factor_to_dummy(data.train)
```
Let's see our resulting table.
```{r, echo=FALSE}
data.frame("Column_Names" = colnames(data.train))
```
##### NAs prep
First, let's see how our values are represented after the above transformations.
```{r, echo=FALSE}
data.train.summary <- get_df_summary(data.train)
data.train.summary
```
In order to work the missing values, I will proceed as follows.
**Proportion findings**
Let's calculate the proportion of missing values in order to determine the best approach for these variables.
```{r, echo=FALSE}
# Percentage of missing (NA) values for each variable known to contain
# NAs in this data set (AGE, YOJ, INCOME, HOME_VAL, CAR_AGE).
# Returns a one-column data frame named "% Total missing" with one row
# per variable, values rounded to two decimals -- same shape, row names
# and column name as the original five-fold copy-paste implementation,
# which this replaces with a single vapply.
get_missing_NA_p <- function(df){
  na_cols <- c("AGE", "YOJ", "INCOME", "HOME_VAL", "CAR_AGE")
  pct_na <- vapply(na_cols,
                   function(col) round(sum(is.na(df[[col]])) / nrow(df) * 100, 2),
                   numeric(1))
  # vapply's names become the data frame's row names.
  df.p <- data.frame(pct_na)
  colnames(df.p) <- c("% Total missing")
  return(df.p)
}
missing_NA <- get_missing_NA_p(data.train)
```
The below list display the combined missing percentage values for each variable.
```{r, echo=FALSE}
missing_NA
```
**NAs by TARGET_FLAG group**
For this, let's see how many records each group has.
```{r, echo=FALSE}
TARGET_FLAG_0 <- data.train[data.train$TARGET_FLAG == 0,]
TARGET_FLAG_1 <- data.train[data.train$TARGET_FLAG == 1,]
missing_NA$`% TARGET_FLAG = 0` <- get_missing_NA_p(TARGET_FLAG_0)[,1]
missing_NA$`% TARGET_FLAG = 1` <- get_missing_NA_p(TARGET_FLAG_1)[,1]
missing_NA
```
Since those values are considered low percentages compared to our data set; I will replace the missing NA values with randomly selected values in between the respective Min and Max value with the exception of CAR_AGE since it shows a negative value, hence I will select 0 to be the minimum value for that particular variable.
```{r, echo=FALSE}
# Fill missing NA values in AGE, YOJ, INCOME, HOME_VAL and CAR_AGE by
# drawing uniformly at random (with replacement) from the integer range
# between each variable's observed Min and Max, as recorded in the
# df_summary table built by get_df_summary. CAR_AGE uses 0 as its lower
# bound because the data contain a spurious negative value.
# set.seed(123) makes the imputation reproducible.
# NOTE(review): df_summary's Min/Max entries may have been coerced to
# character by the earlier `df.summary[is.na(df.summary)] <- ""` step;
# the `:` operator re-coerces them to numeric, but this is fragile --
# confirm the summary table's column types before reuse.
fill_missing_na <- function(df, df_summary){
  set.seed(123)
  # One random draw per missing entry for each variable.
  rand_values_AGE <- sample(df_summary["AGE","Min"]:df_summary["AGE","Max"],
                            size=sum(is.na(df$AGE)),
                            replace = TRUE)
  rand_values_YOJ <- sample(df_summary["YOJ","Min"]:df_summary["YOJ","Max"],
                            size=sum(is.na(df$YOJ)),
                            replace = TRUE)
  rand_values_INCOME <- sample(df_summary["INCOME","Min"]:df_summary["INCOME","Max"],
                               size=sum(is.na(df$INCOME)),
                               replace = TRUE)
  rand_values_HOME_VAL <- sample(df_summary["HOME_VAL","Min"]:df_summary["HOME_VAL","Max"],
                                 size=sum(is.na(df$HOME_VAL)),
                                 replace = TRUE)
  # Lower bound forced to 0 for CAR_AGE (data set has a bogus -3 record).
  rand_values_CAR_AGE <- sample(0:df_summary["CAR_AGE","Max"],
                                size=sum(is.na(df$CAR_AGE)),
                                replace = TRUE)
  # Overwrite only the NA positions, leaving observed values untouched.
  df$AGE[is.na(df$AGE)] <- rand_values_AGE
  df$YOJ[is.na(df$YOJ)] <- rand_values_YOJ
  df$INCOME[is.na(df$INCOME)] <- rand_values_INCOME
  df$HOME_VAL[is.na(df$HOME_VAL)] <- rand_values_HOME_VAL
  df$CAR_AGE[is.na(df$CAR_AGE)] <- rand_values_CAR_AGE
  return(df)
}
```
```{r, echo=FALSE}
data.train <- fill_missing_na(data.train, data.train.summary)
```
##### New Structure
In order to visualize our new structure, I will put together the new set of variables with the transformations. Let's see our structure once again, but this time after the transformation of the data.
```{r, echo=FALSE}
#str(data.train)
str.data.train <- strtable(data.train)[,c(1:3)]
str.data.train
```
##### CAR_AGE investigation
Let's find out why CAR_AGE has a minimum value of $-3$ which seems to be incorrect, for this I will select the records for **CAR_AGE < 0**, with the goal of identifying more possible unrealistic values.
```{r, echo=FALSE}
min_CAR_AGE <- data.frame(t(data.train[data.train$CAR_AGE < 0,]))
colnames(min_CAR_AGE) <- c("Values")
min_CAR_AGE
```
From the above results, there seems to be no apparent reason as to why this value was entered. A possible reason could be that the person who typed the record, entered a wrong number; it could be probably 3 or 0 or any other value. In order to keep data integrity, I will remove that record from our data set.
```{r, echo=FALSE}
data.train <- data.train[data.train$CAR_AGE >= 0,]
```