prelims

root_dir<-"/Users/tom/Documents/Programming/Restaurant kaggle"
require(foreign)
require(caret)
require(caretEnsemble)
#require(h2o)
require(doSNOW)
require(parallel)
require(knitr)
opts_chunk$set(eval=F,echo=T,purl=F)

Get the requisite training data and convert Open.Date into epoch time, i.e. seconds since Jan 1, 1970. It's just a number, so the models don't care what scale it's on.

```r
train <- read.csv("train.csv")
test <- read.csv("test.csv")
# Open.Date comes in as m/d/Y text; convert to seconds since 1970-01-01
train$Open.Date <- as.integer(
  as.POSIXct(as.Date(train$Open.Date, format = "%m/%d/%Y"), tz = "GMT"))
test$Open.Date <- as.integer(
  as.POSIXct(as.Date(test$Open.Date, format = "%m/%d/%Y"), tz = "GMT"))
# the test set contains a restaurant Type ("MB") that never appears in train,
# so add it to the training factor levels up front
levels(train$Type) <- c(levels(train$Type), "MB")
partition <- createDataPartition(train$revenue, 2)  # two resampling index sets (not used below)
```
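
As a quick check that the level fix did its job, a sketch like this (assuming Type was read in as a factor, the pre-R-4.0 read.csv default) should come back empty:

```r
# Sketch: any test Type levels still missing from the training factor?
setdiff(levels(test$Type), levels(train$Type))   # expect character(0) after adding "MB"
```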

I tried to mess with the P variables as latent factors, but it didn't pan out. Then again, I know shit about factor analysis. I ran this with oblimax rotations too; still little to no improvement.

require("psych")
x<-fa(train[,grepl(names(train),pattern="P")],nfactors = 6,rotate = "varimax")
trainfa<-cbind(
  train[grepl(names(train),pattern=".*(rev|Open|Type|City|Group).*")],x$scores)
testfa<-fa(
  test[,grepl(names(test),pattern="P")],nfactors = 6,rotate = "varimax")
test<-cbind(test[,],testfa$scores)
train<-cbind(train,x$scores)
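
A sketch of the alternative, assuming psych's predict method for fa objects behaves as documented: it projects new data onto the loadings estimated on train, so the test scores stay in the same factor space as the training scores.

```r
# Sketch (hedged): score the test P variables with the training-set loadings
# instead of refitting fa() on test. predict.psych is assumed to accept the
# new data plus the original data the model was fit on.
pvars_train <- grepl(names(train), pattern = "P")
pvars_test  <- grepl(names(test),  pattern = "P")
test_scores <- predict(x, data = test[, pvars_test], old.data = train[, pvars_train])
# test <- cbind(test, test_scores)   # would replace the cbind with testfa$scores above
```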

setup for parallel processing

```r
cl <- makeCluster(8)
registerDoSNOW(cl)
```
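
A slightly more portable variant, just a sketch; detectCores() comes from the parallel package loaded above, and the cluster should be released once training is done:

```r
# Sketch: size the cluster to the machine rather than hard-coding 8 workers
cl <- makeCluster(max(1, detectCores() - 1))
registerDoSNOW(cl)
# ... fit the models ...
stopCluster(cl)   # release the workers when finished
```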

I am abundantly aware that using this many folds will overfit the data, especially with gbm, but it has yielded a decent score so far.

```r
# note: with method = 'cv' the repeats argument is ignored;
# it only applies to method = 'repeatedcv'
tc <- trainControl(method = 'cv', number = 70, repeats = 100)
```
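
If repeated cross-validation was the intent, here is a sketch of a control that caret will actually repeat; the fold and repeat counts are placeholders:

```r
# Sketch: repeated CV with a more conventional fold count
tc <- trainControl(method = 'repeatedcv', number = 10, repeats = 10)
```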

I've been looking at this to try to get more diversity in the algorithms I use. Methods tried: ANFIS (high RMSE), bstTree (high RMSE, correlates with gbm), rpart and rf (both correlate strongly with gbm but are slower).

```r
# base learners: gbm, a small deepnet (dnn), and glmboost; cubist is added via methodList below
tl <- list(
  gbm = caretModelSpec(
    method = 'gbm',
    tuneGrid = expand.grid(interaction.depth = c(3:5),
      n.trees = seq(500, 5000, 500), shrinkage = c(0.01, .1, .001))),
  dnn = caretModelSpec(method = 'dnn',
    tuneGrid = expand.grid(layer1 = c(3), layer2 = c(5), layer3 = c(2),
      hidden_dropout = c(.1, .2, .3), visible_dropout = c(.1, .2, .3))),
  glmboost = caretModelSpec(
    method = 'glmboost', tuneGrid = expand.grid(prune = T, mstop = c(100, 200, 300)))
)
model_list <- caretList(
  # hard-coded column indices: Open.Date, City.Group, a subset of the P variables, revenue
  revenue ~ ., data = train[, c(2, 4, 6:19, 24:29, 33, 34, 43)],
  tuneList = tl,
  trControl = tc,
  methodList = c('cubist')
)
```
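
The positional indices above are fragile once columns are added or reordered. A sketch of the same selection by name; the index-to-name mapping is my reading of the TFI columns, so double-check it against names(train):

```r
# Sketch: select the modeling columns by name instead of position.
# Assumed mapping: 2 = Open.Date, 4 = City.Group, 6:19 = P1-P14,
# 24:29 = P19-P24, 33:34 = P28-P29, 43 = revenue.
model_cols <- c("Open.Date", "City.Group",
                paste0("P", c(1:14, 19:24, 28, 29)), "revenue")
# model_list <- caretList(revenue ~ ., data = train[, model_cols], ...)
```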

```r
# pairwise correlation of the models' resampled performance estimates;
# lower correlation suggests more to gain from ensembling
modelCor(resamples(model_list))
```

Running a greedy ensemble like this produces shitty RMSE again (way overfitting here, I know), but it has been good for the score. Low RMSE, even with high variance, seems to pay off a bit here. I've run the stacked model using glmboost, straight glm, and rf; glmboost has performed the best (i.e., lowest RMSE).

```r
greedy_ensemble <- caretEnsemble(model_list)
summary(greedy_ensemble)
```

```r
# despite the name, this stack uses gbm as the meta-learner;
# as above, repeats is ignored when method = 'cv'
glmensemble <- caretStack(
  model_list,
  method = 'gbm',
  tuneGrid = expand.grid(
    interaction.depth = c(3:5), n.trees = seq(500, 5000, 500), shrinkage = .001),
  trControl = trainControl(method = 'cv', number = 110, repeats = 10)
)
glmensemble
```
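
For reference, a sketch of the glmboost stack mentioned above; the mstop grid and fold count here are assumptions rather than the settings actually used:

```r
# Sketch: stack the base learners with glmboost instead of gbm
glmboost_ensemble <- caretStack(
  model_list,
  method = 'glmboost',
  tuneGrid = expand.grid(mstop = c(100, 200, 300), prune = T),
  trControl = trainControl(method = 'cv', number = 10)
)
```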

Pull in the sample submission and write the predictions to a file called submit.csv.

```r
sample <- read.csv("sampleSubmission.csv")
submit <- predict(glmensemble, test)
sample$Prediction <- submit
# write.csv() always writes the header row, so there is no col.names argument to pass
write.csv(sample, file = "submit.csv", row.names = F)
```
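
A quick sanity check before uploading, just a sketch; it assumes the sample file has one row per test restaurant:

```r
# Sketch: the predictions should line up one-to-one with the sample rows
# and contain no missing values
stopifnot(length(submit) == nrow(sample), !any(is.na(submit)))
```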

This can be used to see what other models are worth trying, i.e. ones whose tags are maximally dissimilar from gbm's and so (hopefully) won't correlate with it.

```r
tag <- read.csv("http://topepo.github.io/caret/tag_data.csv", row.names = 1)
tag <- as.matrix(tag)

## Select only models for regression
regModels <- tag[tag[, "Regression"] == 1, ]

all <- 1:nrow(regModels)
## Seed the analysis with the gbm model
start <- grep("(gbm)", rownames(regModels), fixed = TRUE)
pool <- all[all != start]

## Select 4 models by maximizing the Jaccard
## dissimilarity between their sets of tags
nextMods <- maxDissim(regModels[start, , drop = FALSE],
                      regModels[pool, ],
                      method = "Jaccard",
                      n = 4)

rownames(regModels)[c(start, nextMods)]
```