-
Notifications
You must be signed in to change notification settings - Fork 1
/
building.txt
105 lines (79 loc) · 4.17 KB
/
building.txt
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
## ----prelims-------------------------------------------------------------
# Project directory on the author's machine. Nothing below reads it: the CSV
# files are opened by bare file name, i.e. relative to the working directory.
root_dir <- "/Users/tom/Documents/Programming/Restaurant kaggle"

# library() stops immediately when a package is missing; require() would only
# warn and return FALSE, deferring the failure to the first caret call.
# foreign, doSNOW and parallel are attached but never called in this script,
# and no parallel backend is registered.
library(foreign)
library(caret)
library(caretEnsemble)
# library(h2o)
library(doSNOW)
library(parallel)
## ------------------------------------------------------------------------
# Convert "month/day/year" date values to whole seconds since 1970-01-01.
# The models only need a numeric feature, so the exact scale is irrelevant.
to_epoch_seconds <- function(dates) {
  as.integer(as.POSIXct(as.Date(dates, format = "%m/%d/%Y"), tz = "GMT"))
}

# stringsAsFactors = TRUE is spelled out because the level manipulation below
# needs Type to be a factor, and read.csv() has returned plain character
# columns by default since R 4.0.
train <- read.csv("train.csv", stringsAsFactors = TRUE)
test <- read.csv("test.csv", stringsAsFactors = TRUE)

train$Open.Date <- to_epoch_seconds(train$Open.Date)
test$Open.Date <- to_epoch_seconds(test$Open.Date)

# Register "MB" as a level of Type in the training data. union() keeps the
# level set free of duplicates if "MB" is already present.
# NOTE(review): presumably "MB" occurs in test.csv but not in train.csv --
# confirm against the data.
levels(train$Type) <- union(levels(train$Type), "MB")

# Two random partitions stratified on revenue. `partition` is not used by any
# later step in this script.
partition <- createDataPartition(train$revenue, times = 2)
## ----factors-------------------------------------------------------------
library(psych)

# Columns whose name contains a capital "P" are the inputs to the factor
# analysis. The names are taken from train and reused for test so both sets
# are scored on exactly the same variables.
p_cols <- grep("P", names(train), value = TRUE)

# Six-factor solution with varimax rotation, fitted on the training rows only.
x <- fa(train[, p_cols], nfactors = 6, rotate = "varimax")

# Reduced training frame: the non-"P" columns selected by the pattern plus the
# factor scores. Not used by the modelling code below.
trainfa <- cbind(
  train[grepl(".*(rev|Open|Type|City|Group).*", names(train))],
  x$scores
)

# Score the test rows with the solution fitted on train. Running a second,
# independent fa() on test would produce factors with their own loadings,
# order and sign, so the score columns would not mean the same thing as the
# ones the models are trained on. old.data standardises the test rows with the
# training rows' statistics.
test_scores <- predict(x, data = test[, p_cols], old.data = train[, p_cols])
colnames(test_scores) <- colnames(x$scores)

test <- cbind(test, test_scores)
train <- cbind(train, x$scores)
# Author's note: adding the factor scores as features did not help much. An
# oblique rotation was tried as well, with little to no improvement.
## ------------------------------------------------------------------------
# 70-fold cross-validation, shared by every base model.
# The original call also passed repeats = 100, but caret only honours
# `repeats` for method = "repeatedcv"; with "cv" it has no effect, so it is
# dropped here without changing the resampling that is actually performed.
# Author's note: this many folds is expected to overfit, especially with gbm,
# but it has produced a decent leaderboard score so far.
tc <- trainControl(method = "cv", number = 70)

# One tuning specification per base learner.
tl <- list(
  # NOTE(review): recent caret releases also expect an n.minobsinnode column
  # in a gbm tuning grid -- confirm against the installed version.
  gbm = caretModelSpec(
    method = "gbm",
    tuneGrid = expand.grid(
      interaction.depth = 3:5,
      n.trees = c(500, 1000, 1500),
      shrinkage = c(0.01, 0.1, 0.001)
    )
  ),
  dnn = caretModelSpec(
    method = "dnn",
    tuneGrid = expand.grid(
      layer1 = 3,
      layer2 = 5,
      layer3 = 2,
      hidden_dropout = c(0.1, 0.2, 0.3),
      visible_dropout = c(0.1, 0.2, 0.3)
    )
  ),
  # NOTE(review): caret documents glmboost's `prune` as "yes"/"no"; a logical
  # TRUE may not enable pruning -- confirm before relying on it.
  glmboost = caretModelSpec(
    method = "glmboost",
    tuneGrid = expand.grid(prune = TRUE, mstop = c(100, 200, 300))
  )
)

# Columns 1 and 3 are dropped by position before fitting.
# NOTE(review): presumably the row id and the city name -- confirm.
model_list <- caretList(
  revenue ~ .,
  data = train[, -c(1, 3)],
  tuneList = tl,
  trControl = tc
  # , methodList = c("brnn")
)

# Correlation between the base models' resampled performance; low correlation
# is what makes a model worth adding to the ensemble.
modelCor(resamples(model_list))
# Author's notes on model diversity. Methods tried and rejected:
#   ANFIS  - high RMSE
#   bsTree - high RMSE, correlates with gbm
#   rpart, rf - both correlate strongly with gbm but are slower
## ------------------------------------------------------------------------
# Baseline: caretEnsemble's default combination of the base models.
greedy_ensemble <- caretEnsemble(model_list)
summary(greedy_ensemble)
# Author's note: the greedy ensemble gives a poor RMSE.

# Stack the base models with a boosted GLM as the meta-learner.
# The original trainControl() call also passed repeats = 10, which caret
# ignores for method = "cv"; it is dropped without changing the resampling.
# NOTE(review): caret documents glmboost's `prune` as "yes"/"no"; a logical
# TRUE may not enable pruning -- confirm.
glmensemble <- caretStack(
  model_list,
  method = "glmboost",
  tuneGrid = expand.grid(mstop = c(25), prune = TRUE),
  # Alternative grid from an earlier experiment with a gbm meta-learner:
  # tuneGrid = expand.grid(interaction.depth = c(3:5),
  #                        n.trees = c(500, 1000, 1500), shrinkage = .001),
  trControl = trainControl(method = "cv", number = 110)
)
# Author's notes: this many folds overfits, but the low RMSE (even with high
# variance) has paid off on the leaderboard. The stack was also run with plain
# glm and rf as meta-learners; glmboost gave the lowest RMSE.
glmensemble
## ------------------------------------------------------------------------
# Fill the sample submission with the stacked model's predictions for the
# test set and write the result to submit.csv.
# `submission` replaces the original local name `sample`, which shadowed
# base::sample().
submission <- read.csv("sampleSubmission.csv")
submit <- predict(glmensemble, test)
submission$Prediction <- submit

# write.csv() always writes the header row and ignores col.names (with a
# warning), so the argument is not passed.
write.csv(submission, file = "submit.csv", row.names = FALSE)
##-----------------------
# Suggest additional caret models that are as different as possible from one
# already in use, based on caret's published table of model tags.
tag <- read.csv("http://topepo.github.io/caret/tag_data.csv", row.names = 1)
tag <- as.matrix(tag)

## Select only models for regression
regModels <- tag[tag[, "Regression"] == 1, ]
candidates <- seq_len(nrow(regModels))

## Seed the analysis with the boosted GLM used above. The original pattern
## "(gbmboost)" is not a caret model code, so it matched nothing and left
## `start` empty.
start <- grep("(glmboost)", rownames(regModels), fixed = TRUE)
if (length(start) != 1L) {
  stop(
    "expected exactly one regression model tagged '(glmboost)', found ",
    length(start),
    call. = FALSE
  )
}
pool <- candidates[candidates != start]

## Select 4 more models by maximizing the Jaccard
## dissimilarity between sets of models
nextMods <- maxDissim(
  regModels[start, , drop = FALSE],
  regModels[pool, ],
  method = "Jaccard",
  n = 4
)

# maxDissim() returns row positions within regModels[pool, ], so they are
# mapped back through `pool` before indexing regModels; using them directly
# would be off by one for every model listed after the seed.
rownames(regModels)[c(start, pool[nextMods])]
# The printed names are candidate models whose tags overlap least with the
# seed, i.e. models less likely to correlate with the ones already used.