# MLSP 2014 Schizophrenia Classification Challenge (Kaggle)
# 2nd place solution
# Author: Alexander V. Lebedev
# I. PREPARATORY STEP:
# Ia. Load the libraries:
library(caret)
library(e1071)
library(kernlab)
library(doMC)
library(foreach)
library(randomForest)  # needed in the master session for combine() used below
# Ib. Read the data:
trFC <- read.csv('/YOUR-PATH/Kaggle/SCH/Train/train_FNC.csv')
trSBM <- read.csv('/YOUR-PATH/Kaggle/SCH/Train/train_SBM.csv')
tr <- merge(trFC, trSBM, by='Id')
tstFC <- read.csv('/YOUR-PATH/Kaggle/SCH/Test/test_FNC.csv')
tstSBM <- read.csv('/YOUR-PATH/Kaggle/SCH/Test/test_SBM.csv')
tst <- merge(tstFC, tstSBM, by='Id')
y <- read.csv('/YOUR-PATH/Kaggle/SCH/Train/train_labels.csv')
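# (Optional sanity check -- a hedged assumption based on the standard competition
# files: 86 training subjects and 410 features (378 FNC + 32 SBM) plus Id,
# i.e. 411 columns, matching the 412-column indexing used below once the
# random feature is added. Adjust if your copy of the data differs.)
stopifnot(nrow(tr) == 86, ncol(tr) == 411, ncol(tst) == 411)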
# II. ANALYSIS
# IIa. "Feature trimming"
# Registering 6 cores to speed up my computations:
registerDoMC(cores=6)
# Converting the y-label vector into a factor with syntactically valid level names:
y <- as.factor(paste('X.', y[,2], sep = ''))
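# (Quick check -- with the standard train_labels.csv the levels should be
# "X.0" and "X.1"; caret requires valid R names here because classProbs = TRUE
# is used further down:)
print(levels(y))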
# This step is important (I introduce a random vector into my feature set):
all <- cbind(tr, rnorm(nrow(tr)))
colnames(all)[412] <- 'rand'
# Now I train Random Forest with this (full) feature set:
rf.mod <- foreach(ntree = rep(2500, 6), .combine = combine, .multicombine = TRUE,
                  .packages = 'randomForest') %dopar% {
  randomForest(all[, 2:412], y, ntree = ntree)
}
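# (If no parallel backend is available, a hedged sequential equivalent is a
# single call with the same total number of trees, e.g.
#   rf.mod <- randomForest(all[, 2:412], y, ntree = 15000)
# -- slower, but it yields a comparable importance ranking.)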
# Now, I am looking at the feature importances:
imp <- as.data.frame(rf.mod$importance[order(rf.mod$importance),])
# Features whose importance falls below that of the "dummy" feature (the random vector) can likely be ignored:
imp <- subset(imp, imp>imp['rand',])
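# Quick inspection (optional): how many features survive the cut? The count
# varies run to run, since both the forest and the dummy feature are random.
print(nrow(imp))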
# Saving the data in one rda-file for further analyses:
save('all', 'y', 'tst', 'imp', file = '/YOUR-PATH/Kaggle/SCH/Train/AllData.rda')
# Now, I reduce my feature set:
dat <- all[,rownames(imp)]
# IIb. Training the final model:
# I usually start from SVM and then proceed with ensemble methods
# (in this competition, the use of boosted trees did not result in superior performance and I stopped)
# I would have tried other algorithms and more sophisticated feature selection approaches
# (like SCAD-SVM and recursive feature elimination), but I was not impressed with my intermediate results and gave up =)
# So, here is the model that provided me 2nd position:
# First, I estimate "sigma" (inverse width parameter for the RBF-SVM)
# Of note, sometimes I use a subset of my data, but here I used the whole training set due to its very limited size:
sigDist <- sigest(y ~ as.matrix(dat), data=dat, frac = 1)
# Creating a tune grid for the subsequent C-parameter selection:
svmTuneGrid <- data.frame(.sigma = sigDist[1], .C = 2^(-20:100))
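# (Note: sigest() returns three sigma estimates -- roughly the 0.1 quantile,
# median and 0.9 quantile of a distance-based heuristic -- so sigDist[1] fixes
# sigma at the lowest of the three, while train() below sweeps C over the
# 121 grid values.)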
# And... training the final RBF-SVM model with leave-one-out cross-validation
# (86 folds on 86 training subjects, i.e. one fold per subject):
# (Yes. It's as simple as that!)
svmFit <- train(dat, y,
                method = "svmRadial",
                preProc = c("center", "scale"),
                tuneGrid = svmTuneGrid,
                trControl = trainControl(method = "cv", number = 86, classProbs = TRUE))
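# Optional: inspect the selected cost parameter and the cross-validated
# accuracy (standard caret fields):
print(svmFit$bestTune)
print(max(svmFit$results$Accuracy))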
# III. FINAL STEP
# Making predictions
ttst <- tst[,rownames(imp)]
predTst <- predict(svmFit, ttst, type = 'prob')
predTst <- predTst[, 2]  # keep the probability of the second level ('X.1')
# Formatting submission:
pred <- cbind(as.integer(tst$Id), as.numeric(predTst))
colnames(pred) <- c('Id', 'Probability')
# Writing:
write.table(pred, file = '/YOUR-PATH/Kaggle/SCH/submissions/submission_rbfSVM_RFtrimmed.csv', sep=',', quote=F, row.names=F, fileEncoding = 'UTF-16LE')
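# (Optional read-back check of the submission file; note it must be read with
# the same encoding it was written with:)
chk <- read.csv('/YOUR-PATH/Kaggle/SCH/submissions/submission_rbfSVM_RFtrimmed.csv',
                fileEncoding = 'UTF-16LE')
str(chk)  # expect two columns: Id and Probability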