data_explore.Rmd

---
title: "RChallenge German credit dataset"
output:
  html_document:
    toc: true
    toc_float: true
    fig_caption: true
    self_contained: true
    code_folding: hide
---

```{r}
# Global chunk defaults: code is shown, source and output are collapsed into
# one block, and messages/warnings are kept out of the rendered report.
knitr::opts_chunk$set(
  collapse = TRUE,
  echo = TRUE,
  message = FALSE,
  warning = FALSE
)
```


```{r echo =  FALSE}
set.seed(123)
library(xgboost)
library(caret)
library(MLmetrics)
#library(Metrics)
library(knitrBootstrap)
library(rchallenge)
library(DT)
library(dplyr)
library(ggplot2)
library(reshape2)
library(cowplot)
library(patchwork)
library(rstatix)
library(smotefamily)
library(MLeval)
# Data set under study, as exported by the rchallenge package.
rawdata <- rchallenge::german
# Fixed colour palettes reused by the plots in this report.
cols_sex <- c("F" = "lightseagreen", "M" = "violetred4")
cols_risk <- c("good" = "darkseagreen3", "bad" = "firebrick2")
library(treeshap)
```

# General data exploration 

Raw, unedited data looks messy:

```{r echo = FALSE}
dim(rawdata)
# Derive a coarse gender column from the combined status/sex variable.
# The level "female : non-single or male : single" mixes both sexes, so it is
# left missing. NA_character_ keeps every case_when() branch the same type;
# a bare logical NA next to character branches is rejected by dplyr < 1.1.0.
rawdata <- rawdata %>%
  mutate(gender = case_when(
    personal_status_sex %in% c("male : divorced/separated", "male : married/widowed") ~ "M",
    personal_status_sex == "female : non-single or male : single" ~ NA_character_,
    personal_status_sex == "female : single" ~ "F"
  ))

DT::datatable(rawdata)
```

## Response variable 

- (Should the data be split before EDA or after? Performing associations between response variable and predictors could result in "data leak".)   
- The response variable is imbalanced with the minority class being the one that is "more important" since we aim at identifying bad risk individuals.  
- By common definitions this imbalance is still mild. I therefore decide not to downsample for now, but to make sure an adequate evaluation metric (not accuracy) is used later on.   
```{r echo = FALSE}
# Number of rows per response class and their share of all rows.
rawdata %>%
  group_by(credit_risk) %>%
  summarise(n = n()) %>%
  mutate(Perc = paste0(100 * n / sum(n), "%"))
```

## Imbalances among predictor variables

Skewed distributions or categorical variables with underrepresented categories?

```{r fig.height = 9}
#mutating numerical columns to numeric  
# Store the three continuous predictors as numeric.
rawdata <- rawdata %>%
  mutate(across(c(age, amount, duration), as.numeric))
# Long table of the numeric predictors: one row per observation and variable,
# with display labels used as facet titles.
numeric_labels <- c(age = "Age", duration = "Duration", amount = "Loan amount")
df <- rawdata %>%
  mutate(id = as.factor(seq_len(nrow(.)))) %>%
  select(id, credit_risk, where(is.numeric)) %>%
  #	mutate(Age_logged = log2(age),
  #	       Duration = log2(duration),
  #	       Loan_amount = log2(amount)) %>%
  reshape2::melt(id.vars = c("id", "credit_risk")) %>%
  mutate(variable = unname(numeric_labels[as.character(variable)]))
# Density-scaled histogram with a kernel density overlay, one facet per
# numeric predictor. after_stat(density) replaces the `..density..` notation,
# which ggplot2 deprecated in 3.4.0.
p1 <- ggplot(df, aes(x = value)) +
  geom_histogram(
    aes(y = after_stat(density)),
    bins = 40,
    fill = "transparent",
    col = "black"
  ) +
  geom_density() +
  facet_wrap(~variable, ncol = 1, scales = "free") +
  cowplot::theme_cowplot()
p1
# Violin and box plots of each numeric predictor, split and coloured by
# credit risk; ggpubr::stat_compare_means() annotates each facet with a
# comparison of the two risk groups.
p2 <- ggplot(df, aes(x = credit_risk, y = value, col = credit_risk)) +
  geom_violin() +
  geom_boxplot(width = .2) +
  scale_color_manual(values = cols_risk) +
  ggpubr::stat_compare_means() +
  facet_wrap(~variable, ncol = 1, scales = "free") +
  cowplot::theme_cowplot()
p2
 # Back to wide format (one column per numeric predictor), then a Spearman
 # correlation heat map. Changes to the original: TRUE instead of T, argument
 # names spelled out instead of relying on partial matching, and `linewidth`
 # instead of `size` for the separator lines (`size` for lines is deprecated
 # since ggplot2 3.4.0).
 df_wide <- df %>%
   tidyr::pivot_wider(
     id_cols = c("id", "credit_risk"),
     names_from = "variable",
     values_from = "value"
   )
 p3 <- df_wide %>%
   select(where(is.numeric)) %>%
   cor(use = "pairwise.complete.obs", method = "spearman") %>%
   ggcorrplot::ggcorrplot(hc.order = TRUE, outline.color = "white", lab = TRUE) +
   theme(
     # margin order: top, right, bottom, left
     axis.text.x = element_text(margin = margin(-2, 0, 0, 0)),
     axis.text.y = element_text(margin = margin(0, -2, 0, 0))
   ) +
   # white lines drawn on the tile borders to separate the cells
   geom_vline(xintercept = 1:3 - 0.5, colour = "white", linewidth = 2) +
   geom_hline(yintercept = 1:3 - 0.5, colour = "white", linewidth = 2)

 p3

#ADD ggpairs
# Can numerical variables alone separate good from bad?
# PCA on the log-transformed, centred and scaled numeric predictors.
pca <- df_wide %>%
  select(where(is.numeric)) %>%
  log() %>%
  prcomp(center = TRUE, scale. = TRUE)
# Loadings of the variables, drawn as arrows in the score plot.
pca_loadings <- data.frame(Variables = rownames(pca$rotation), pca$rotation)
# Scores per observation; the row names serve as the join key to df_wide.
pca_scores <- as.data.frame(pca$x)
pca_scores <- mutate(pca_scores, id = rownames(pca_scores))
p4 <- pca_scores %>%
  left_join(df_wide, by = "id") %>%
  ggplot(aes(x = PC1, y = PC2, col = credit_risk)) +
  geom_point() +
  scale_color_manual(values = cols_risk) +
  stat_ellipse() +
  geom_segment(
    data = pca_loadings,
    aes(x = 0, y = 0, xend = PC1, yend = PC2),
    arrow = arrow(length = unit(1, "picas")),
    color = "black"
  ) +
  annotate(
    "text",
    x = pca_loadings$PC1,
    y = pca_loadings$PC2,
    label = pca_loadings$Variables
  ) +
  theme_cowplot()


# 2 x 2 overview of the four numeric-variable figures.
(p1 + p2) / (p3 + p4)
```
- Loan amount and duration both load mainly on the first PC, which explains most of the variance; age loads mainly on the second PC
- Log transform to alleviate skewness AFTER the data split
- Identification of outliers after the data split; replot the distributions?
- Corrplot for categorical variables

```{r cat_gender,  fig.width = 15, fig.height = 12}
# Level frequencies of every non-numeric variable by gender. After tally()
# the table is still grouped by (variable, gender), so `perc` is the share of
# a level within that variable and gender.
p1 <- rawdata %>%
  select(-where(is.numeric)) %>%
  mutate(id = seq_len(nrow(.))) %>%
  reshape2::melt(id.vars = c("gender", "id")) %>%
  group_by(variable, gender, value) %>%
  tally() %>%
  mutate(perc = 100 * (n / sum(n))) %>%
  ggplot(aes(x = value, y = perc, fill = gender)) +
  geom_bar(stat = "identity", position = "dodge") +
  coord_flip() +
  scale_fill_manual(values = cols_sex) +
  facet_wrap(~variable, ncol = 3, scales = "free") +
  theme_cowplot()
```

```{r cat_risk, fig.width = 15, fig.height = 12}
# Level frequencies of every non-numeric variable by credit risk. After
# tally() the table is still grouped by (variable, credit_risk), so `perc` is
# the share of a level within that variable and risk class.
# Fix: a leftover `%>% head` ended the pipeline after tally(), so the
# following mutate() was called without data and the plot was never built.
p2 <- rawdata %>%
  select(-where(is.numeric)) %>%
  mutate(id = seq_len(nrow(.))) %>%
  reshape2::melt(id.vars = c("credit_risk", "id")) %>%
  group_by(variable, credit_risk, value) %>%
  tally() %>%
  mutate(perc = 100 * (n / sum(n))) %>%
  ggplot(aes(x = value, y = perc, fill = credit_risk)) +
  geom_bar(stat = "identity", position = "dodge") +
  coord_flip() +
  scale_fill_manual(values = cols_risk) +
  facet_wrap(~variable, scales = "free", ncol = 3) +
  theme_cowplot()
```

```{r}
# Show the plot objects currently stored in p1 and p2 next to each other.
p1 + p2
```


NEED TO THINK ABOUT WHAT PERCENTAGE TO SHOW IN THE PLOT:
how many bad/good in one level or how much percentage a level makes up of the category

Note: for now I am only looking at associations with credit risk, but what about correlations between predictors?

# Encoding variables 


No missing values, although in truth there is missing information (e.g. people without checking or savings account). But no imputation needed right now.

```{r }
# TRUE when at least one cell of rawdata is NA.
any(is.na(rawdata))
```

Identify categorical variables and print levels to get an overview

```{r results = "asis"}
# Exploring factor variables: one frequency table per non-numeric column.
# lapply() is used instead of apply(): apply() first coerces the data frame
# to a character matrix (losing the factor level order) and returns a matrix
# rather than a list whenever all tables happen to have the same length.
columninfo <- rawdata %>%
  select(!where(is.numeric)) %>%
  lapply(function(col) table(col))

#columninfo
```

Notes and thoughts while encoding variables. Some decisions are probably not optimal but time is limited.  

- Encode categorical variables which are ordinal with integers
- Introduce dummy variables for true categorical variables
- Encoding the response variable such that the positive class (1) is assigned to "bad" risk: 
	-->more intuitive because we want to avoid missing bad risk cases (FN), rather than missing good risk cases.  
- The encoding could be done in a much simpler way, but this way I go through the variables one by one and get familiar with them  
- For some variables it is difficult for me to know whether they could actually be ordinal, like housing or "other installment plans", so I will keep them categorical. And of course the point is to identify possibly new insights, for example that people who have life insurance turn out to be a lower risk than people with real estate  
- I wanted to split personal status and gender because I am interested in whether one of them comes up higher in the variable importance analysis later --> this is not actually possible, because one level of this variable mixes both sexes  
- I assume relatively equal distances between the levels of the ordinal categorical variables, although this could be argued about. 
- Dummy variables are useful for linear models etc. (avoiding collinear variables) but do not matter much for tree-based algorithms; they can also pose a problem when done before the data split: if the test set does not contain a level, you end up with an extra variable 

```{r encoding}
# Nominal variables that are expanded into dummy columns.
categorical <- c("job", "housing", "other_installment_plans", "property", "other_debtors", "personal_status_sex", "purpose", "credit_history")
M1 <- paste("~ 0 +", paste(categorical, collapse = "+"))
# Build the dummy design matrix once and reuse it for the column names and
# for the encoded table (it was previously computed twice).
cat_dummies <- model.matrix(as.formula(M1), rawdata)
cat_names <- colnames(cat_dummies)

# Binary and ordinal variables are mapped to integers; levels that match no
# branch become NA. NA_real_ keeps all case_when() branches numeric; a bare
# logical NA next to numeric branches is rejected by dplyr < 1.1.0.
edited <- rawdata %>%
  mutate(
    # positive class (1) is "bad" risk
    credit_risk = ifelse(credit_risk == "bad", 1, 0),
    foreign_worker = ifelse(foreign_worker == "yes", 1, 0),
    telephone = ifelse(telephone == "no", 0, 1),
    people_liable = ifelse(people_liable == "0 to 2", 1, 2),
    number_credits = case_when(
      number_credits == "1" ~ 1,
      number_credits == "2-3" ~ 2,
      number_credits == "4-5" ~ 3,
      number_credits == ">= 6" ~ 4,
      TRUE ~ NA_real_
    ),
    present_residence = case_when(
      present_residence == "< 1 yr" ~ 1,
      present_residence == "1 <= ... < 4 yrs" ~ 2,
      present_residence == "4 <= ... < 7 yrs" ~ 3,
      present_residence == ">= 7 yrs" ~ 4,
      TRUE ~ NA_real_
    ),
    installment_rate = case_when(
      installment_rate == "< 20" ~ 1,
      installment_rate == "20 <= ... < 25" ~ 2,
      installment_rate == "25 <= ... < 35" ~ 3,
      installment_rate == ">= 35" ~ 4,
      TRUE ~ NA_real_
    ),
    employment_duration = case_when(
      employment_duration == "unemployed" ~ 0,
      employment_duration == "< 1 yr" ~ 1,
      employment_duration == "1 <= ... < 4 yrs" ~ 2,
      employment_duration == "4 <= ... < 7 yrs" ~ 3,
      employment_duration == ">= 7 yrs" ~ 4,
      TRUE ~ NA_real_
    ),
    # This is tricky because "unknown" and "no savings account" are not
    # necessarily the same. First intuition is to rate it as the lowest
    # category, but it could also be NA; the problem is that it concerns
    # about 600 values. Another option is to leave this variable categorical
    # even though it is clearly ordinal.
    # The same applies to status (of checking account).
    savings = case_when(
      savings == "unknown/no savings account" ~ 0,
      savings == "... <  100 DM" ~ 1,
      savings == "100 <= ... <  500 DM" ~ 2,
      savings == "500 <= ... < 1000 DM" ~ 3,
      savings == "... >= 1000 DM" ~ 4,
      TRUE ~ NA_real_
    ),
    # Treating debt (< 0 DM) and no checking account as the same level is
    # probably not a good idea.
    status = case_when(
      status == "... < 0 DM" | status == "no checking account" ~ 0,
      status == "0<= ... < 200 DM" ~ 1,
      status == "... >= 200 DM / salary for at least 1 year" ~ 2
    )
  ) %>%
  select(-all_of(categorical)) %>%
  dplyr::bind_cols(as.data.frame(cat_dummies))
```


```{r, include=FALSE}
# Output hook adding an optional `max_height` chunk option: when set, the
# chunk output is wrapped in a fixed-width, vertically scrollable <pre> box.
# Chunks without `max_height` are passed to knitr's own output hook, which is
# captured here before being replaced. Previously the raw text was returned
# in that case, so ordinary output lost its default formatting.
default_output_hook <- knitr::knit_hooks$get("output")
knitr::knit_hooks$set(output = function(x, options) {
  if (is.null(options$max_height)) {
    return(default_output_hook(x, options))
  }
  paste0(
    '<pre style = "max-height:', options$max_height,
    '; float: left; width: 700px; overflow-y: auto;">', x, "</pre>"
  )
})
```

Now we have `r ncol(edited)-1` variables: 

```{r echo = FALSE, max_height = "100px"}
# Column names of the encoded table.
edited %>%
  colnames() %>%
  print()
```

## Data split

- random split or stratified?
- down sampling or not?
- Consider transformations or scaling of numerical variables (depends on distribution and algorithm to be used on the data). Not doing anything for now due to time reasons (and the need to read up on it)

- TODO: improve this and think about downsampling good
```{r datasplit}
# log2-transform the continuous predictors, drop the helper gender column and
# add a factor version of the response with levels "good" then "bad".
x <- edited %>%
  mutate(across(all_of(c("age", "duration", "amount")), log2)) %>%
  select(-gender) %>%
  mutate(credit_risk_fac = factor(ifelse(credit_risk == 1, "bad", "good"), levels = c("good", "bad")))
# Unique row identifier, also set as row name.
x$id <- paste0("A", rownames(x))
rownames(x) <- x$id
# Random 70/30 split; the test set is every row whose id was not sampled.
train_rows <- dplyr::sample_frac(x, .7)
test_rows <- anti_join(x, train_rows, by = "id")
# Labels are looked up by id so they line up with the predictor rows.
train_y <- x[train_rows$id, "credit_risk_fac"]
test_y <- x[test_rows$id, "credit_risk_fac"]
# Predictor tables without response and identifier columns.
train_x <- train_rows %>% select(-credit_risk, -credit_risk_fac, -id)
test_x <- test_rows %>% select(-credit_risk, -credit_risk_fac, -id)
```

Look at the ordinal variables

```{r}
# Integer-encoded predictors in the training set: for every variable and risk
# class, the proportion of observations falling on each level (the table is
# still grouped by variable and credit_risk after tally()).
df <- train_x %>%
  select(-all_of(cat_names), -age, -duration, -amount) %>%
  bind_cols("credit_risk" = train_y) %>%
  mutate(id = rownames(.)) %>%
  reshape2::melt(id.vars = c("id", "credit_risk")) %>%
  group_by(variable, credit_risk, value) %>%
  tally() %>%
  mutate(PercOfRiskGroup = n / sum(n))
p1 <- ggplot(df, aes(x = as.factor(value), y = PercOfRiskGroup, fill = credit_risk)) +
  geom_bar(position = "dodge", stat = "identity") +
  labs(x = "Level", y = "% credit_risk group per level") +
  scale_fill_manual(values = cols_risk) +
  facet_wrap(~variable, ncol = 3, scales = "free") +
  cowplot::theme_cowplot()
p1
```


# Xgboost

## Tune with grid search

Set up grid search, tune, evaluate

**TODO:** for now the F1 treats both classes equally; I need to find out where I can adjust beta, or use caret's option for class weights.

```{r eval = F}
# grid search setup 
# Full factorial grid over the xgbTree tuning parameters.
xgb_grid_1 <- expand.grid(
  # number of boosting rounds
  nrounds = 5,
  # step size shrinkage: shrinks the feature weights after each boosting step
  # to make the boosting process more conservative (prevents overfitting)
  eta = c(0.01, 0.001, 0.0001, 0.00001),
  # tree depth
  max_depth = c(2, 4, 6, 8, 10, 12, 16),
  # regularisation parameter: "the higher the simpler"
  gamma = c(0.1, 0.4, 1, 2, 4),
  # minimum sum of instance weight
  min_child_weight = 1,
  # subsampling ratio of columns for each tree
  colsample_bytree = c(1, 0.4, 0.8, 0.6, 0.7),
  #colsample_bylevel = c(0.3, 0.5, 0.8),
  # subsampling ratio of rows
  subsample = c(0.4, 0.6, 0.3)
)
#
#

# Custom caret summary function; its element names can be used as `metric`
# when tuning.
#
# data:  data frame with the observed (`obs`) and predicted (`pred`) classes,
#        plus class probabilities as required by the caret summaries
# lev:   class levels; lev[2] is used as the positive class and is expected
#        to be "bad" (train_y has levels "good", "bad")
# model: model name, passed on to the caret summaries
#
# Returns a named numeric vector:
#   Fbeta                     F-beta for lev[2] with beta = 2, emphasising
#                             recall: false negatives (predicting good when
#                             it is bad risk) matter more than false positives
#   twoClSum.*                twoClassSummary() entries
#   AUC, Precision, Recall, F prSummary() entries under their own names; both
#                             caret summaries score lev[1], not lev[2]
#   PrecisionBad, RecallBad   precision and recall with lev[2] as the
#                             relevant class
#
# Fix: the vector used `prSum <- prs` inside c(), which assigned a local
# variable instead of naming the element. The element names are unchanged.
fb <- function(data, lev = NULL, model = NULL) {
  fbeta_val <- FBeta_Score(y_true = data$obs, y_pred = data$pred, positive = lev[2], beta = 2)
  tcs <- twoClassSummary(data, lev, model)
  prs <- prSummary(data, lev, model)
  bad_class <- c(
    PrecisionBad = caret::precision(data = data$pred, reference = data$obs, relevant = lev[2]),
    RecallBad = caret::recall(data = data$pred, reference = data$obs, relevant = lev[2])
  )
  c(Fbeta = fbeta_val, twoClSum = tcs, prs, bad_class)
}

# set up cross validation
# 5-fold cross-validation scored with the custom summary function fb.
xgb_trcontrol_1 <- trainControl(
  method = "cv",
  number = 5,
  # fb needs class probabilities (AUC-based entries)
  classProbs = TRUE,
  summaryFunction = fb,
  #summaryFunction = twoClassSummary,
  # keep the resampled metrics of all candidate models
  returnResamp = "all",
  savePredictions = "final",
  returnData = FALSE,
  allowParallel = TRUE,
  verboseIter = TRUE
)

#no sampling
#train
# One grid-search run on the training data without the features listed in
# `failed_feat`, optimising Fbeta; only the resampling control differs
# between the runs below.
fit_xgb_tree <- function(control) {
  #x = as.matrix(train_x),
  train(
    x = as.matrix(select(train_x, -all_of(failed_feat))),
    y = train_y,
    method = "xgbTree",
    metric = "Fbeta",
    tuneGrid = xgb_grid_1,
    trControl = control
  )
}

# no sampling
xgb_train_1 <- fit_xgb_tree(xgb_trcontrol_1)
# down sample majority class
xgb_trcontrol_1$sampling <- "down"
xgb_train_down <- fit_xgb_tree(xgb_trcontrol_1)
# upsampling minority class
xgb_trcontrol_1$sampling <- "up"
xgb_train_up <- fit_xgb_tree(xgb_trcontrol_1)
# hybrid smote
xgb_trcontrol_1$sampling <- "smote"
xgb_train_smote <- fit_xgb_tree(xgb_trcontrol_1)
#
# Persist the four tuned models, keyed by output file name.
models_to_save <- list(
  "xgb1_beta2_selectedF.rds" = xgb_train_1,
  "xgbup_beta2_selectedF.rds" = xgb_train_up,
  "xgbdown_beta2_selectedF.rds" = xgb_train_down,
  "xgbsmote_beta2_selectedF.rds" = xgb_train_smote
)
for (rds_path in names(models_to_save)) {
  saveRDS(models_to_save[[rds_path]], rds_path)
}


#Saved current model with beta 2, but first reload the beta 1.5 and doucment performance and feature importance
# do the same for beta 2 models
# rerun with reduced features 
```

Results of first try: all features (beta = 2)

```{r}
# Reload the stored all-feature models (beta = 2), one per sampling strategy.
xgb_train_1 <- readRDS(file = "./firstModelsAllFeat/xgb1_beta2.rds")
xgb_train_up <- readRDS(file = "./firstModelsAllFeat/xgbup_beta2.rds")
xgb_train_down <- readRDS(file = "./firstModelsAllFeat/xgbdown_beta2.rds")
xgb_train_smote <- readRDS(file = "./firstModelsAllFeat/xgbsmote_beta2.rds")


# Tuning profiles of the four models (metric used for tuning vs. parameters).
plot(xgb_train_1)
plot(xgb_train_up)
plot(xgb_train_down)
plot(xgb_train_smote)
```

Argh... the two-class summary did not calculate precision and recall with "bad" as the positive class, so they are recomputed below from the cross-validated confusion matrices.

```{r}
# Summarise a list of caret train objects, one row per model.
#
# trainObj_lst: list of train objects whose `results` table contains the
#               columns Fbeta and FbetaSD.
#
# Returns a data frame with Fbeta (beta = 2), Precision and Recall for the
# class "bad", computed from the resampled confusion matrix of each model,
# and FbetaSD taken from the `results` row with the highest Fbeta.
getMyDf <- function(trainObj_lst) {
  summarise_fit <- function(fit) {
    cv_table <- caret::confusionMatrix(fit)$table
    top_row <- head(arrange(fit$results, desc(Fbeta)), 1)
    data.frame(
      Fbeta = caret::F_meas(cv_table, relevant = "bad", beta = 2),
      Precision = caret::precision(cv_table, relevant = "bad"),
      Recall = caret::recall(cv_table, relevant = "bad"),
      FbetaSD = top_row[["FbetaSD"]]
    )
  }
  do.call(rbind, lapply(trainObj_lst, summarise_fit))
}
# Cross-validated metrics of the four models, labelled by sampling strategy
# (labels follow the order of the list).
df1 <- list(xgb_train_1, xgb_train_up, xgb_train_down, xgb_train_smote) %>%
  getMyDf() %>%
  mutate(Sampling = c("None", "Up", "Down", "Smote"))
#Fbeta of baseline model
#Predict test and evaluate fbeta
# Random-guess baseline scored on the held-out rows of every CV fold.
#
# folds:    list of row indices into `truth`, one element per fold
# truth:    factor of observed classes with levels "good" and "bad"
# prob_bad: probability of guessing "bad" for a single observation
# label:    value written to the Sampling column
#
# Returns a long data frame (variable, value, Sampling) holding the mean
# Fbeta (beta = 2), Recall and Precision for class "bad" across folds and the
# standard deviation of Fbeta.
random_baseline <- function(folds, truth, prob_bad, label) {
  per_fold <- lapply(folds, function(idx) {
    observed <- truth[idx]
    guess <- factor(
      sample(
        x = c("good", "bad"),
        size = length(observed),
        replace = TRUE,
        prob = c(1 - prob_bad, prob_bad)
      ),
      levels = c("good", "bad")
    )
    tab <- caret::confusionMatrix(guess, observed, positive = "bad")$table
    data.frame(
      Fbeta = caret::F_meas(tab, relevant = "bad", beta = 2),
      Precision = caret::precision(tab, relevant = "bad"),
      Recall = caret::recall(tab, relevant = "bad")
    )
  })
  scores <- dplyr::bind_rows(per_fold)
  data.frame(
    Fbeta = mean(scores$Fbeta),
    Recall = mean(scores$Recall),
    Precision = mean(scores$Precision),
    FbetaSD = sd(scores$Fbeta)
  ) %>%
    reshape2::melt(measure.vars = c("Fbeta", "Recall", "Precision", "FbetaSD")) %>%
    mutate(Sampling = label)
}

cv_folds <- xgb_train_1$control$indexOut
# Guesses follow the class ratio of the data (about 70% good / 30% bad), like
# the random baseline used on the test set. The original drew "bad" with
# probability .7, which inflates the recall of this baseline.
base_f_imb <- random_baseline(cv_folds, train_y, prob_bad = 0.3, label = "BL_imbalanced")
# Coin-flip baseline. The comparison plot binds `base_f_balanced`, which was
# never defined before.
base_f_balanced <- random_baseline(cv_folds, train_y, prob_bad = 0.5, label = "BL_balanced")

# Model metrics and the two random baselines in one long table, drawn as one
# point per sampling strategy and one facet per metric.
metrics_long <- df1 %>%
  reshape2::melt(id.vars = "Sampling") %>%
  bind_rows(base_f_imb) %>%
  bind_rows(base_f_balanced)
ggplot(metrics_long, aes(x = Sampling, y = value, col = Sampling)) +
  geom_point(size = 5) +
  viridis::scale_color_viridis(discrete = TRUE) +
  facet_wrap(~variable, scales = "free", ncol = 2) +
  panel_border() +
  background_grid() +
  #	theme_minimal_grid() +
  theme(axis.text.x = element_text(angle = 45)) +
  theme(text = element_text(size = 20))
#xgb_train_1$results %>% arrange(desc(Fbeta)) %>% 
#	head(1) %>% 
#	select(Fbeta, FbetaSD, Precision, Recall) %>%
#	mutate(Sampling = "None") -> df1
#xgb_train_1$results %>% arrange(desc(Fbeta)) %>% 
#	head(1) %>% 
#	select(Fbeta, FbetaSD, Precision, Recall) %>%
#	mutate(Sampling = "None") -> df1


#Time is limited now Im only looking at one model
# SHAP values (including interactions) of the down-sampled model on the
# training data.
unified <- treeshap::xgboost.unify(xgb_train_down$finalModel, train_x)
treeshap1 <- treeshap(unified, train_x, interactions = TRUE, verbose = 0)
# Signed mean SHAP value per feature.
shap_means <- apply(treeshap1$shaps, 2, mean)
df_vars <- data.frame(Feature = names(shap_means), mean_shap = shap_means)
# Features ordered by increasing absolute mean SHAP value; the order becomes
# the factor level order and hence the bar order.
order_f <- df_vars %>%
  group_by(Feature) %>%
  summarise(mean_imp = mean(mean_shap)) %>%
  arrange(abs(mean_imp))
df_vars <- df_vars %>%
  mutate(Feature = factor(Feature, levels = order_f$Feature))

# Bar length is the absolute mean SHAP value; the fill encodes its sign.
ggplot(df_vars, aes(x = abs(mean_shap), y = Feature, fill = as.factor(sign(mean_shap)))) +
  geom_bar(stat = "identity", position = "dodge") +
  theme_cowplot()


# SHAP dependence of the predicted risk on (log2) duration.
plot_feature_dependence(treeshap1, "duration")
# better to pull out shap values and observations and plot myself with ggplot
# The dummy column is turned into a factor so the interaction plot can use a
# discrete colour scale.
treeshap1$observations <- mutate(
  treeshap1$observations,
  `propertyreal estate` = factor(`propertyreal estate`)
)
plot_interaction(treeshap1, "age", "propertyreal estate") +
  viridis::scale_color_viridis(discrete = TRUE)


# predict on test
# Predicted classes for the test set. type = "raw" returns a factor of class
# labels; the original type = "prob" returns a data frame of class
# probabilities, which confusionMatrix() cannot compare against the observed
# classes.
pred <- predict(xgb_train_down, newdata = as.matrix(test_x), type = "raw")
tab <- caret::confusionMatrix(pred, test_y, positive = "bad")
tab
# Test-set precision, recall and F-beta (beta = 2) for the class "bad".
caret::precision(tab$table, relevant = "bad")
caret::recall(tab$table, relevant = "bad")
caret::F_meas(tab$table, relevant = "bad", beta = 2)
# Random-guess baseline on the test set: classes are drawn with probability
# .7 for "good" and .3 for "bad".
# Previously observed F-beta: 0.346
true <- test_y
random_labels <- sample(
  x = c("good", "bad"),
  size = length(true),
  replace = TRUE,
  prob = c(.7, .3)
)
pred <- factor(random_labels, levels = c("good", "bad"))
tab <- caret::confusionMatrix(pred, true, positive = "bad")
caret::precision(tab$table, relevant = "bad")
caret::recall(tab$table, relevant = "bad")
caret::F_meas(tab$table, relevant = "bad", beta = 2)
```

todo:

- remove features that score low
- plot boxplot for fbeta for x each model and y each cv also include random sampling fbeta
- remove zero features
- rerun all