# building.txt

## ----prelims-------------------------------------------------------------
# Project directory on the original author's machine. It is not referenced
# anywhere else in the visible script; the data files below are read relative
# to the current working directory.
root_dir <- "/Users/tom/Documents/Programming/Restaurant kaggle"

# Attach dependencies with library() so that a missing package stops the script
# right here, instead of require() returning FALSE and the failure surfacing
# later as a "could not find function" error.
library(foreign)
library(caret)
library(caretEnsemble)
# library(h2o)
library(doSNOW)
library(parallel)

## ------------------------------------------------------------------------
# Read the competition data. stringsAsFactors = TRUE is set explicitly because
# R >= 4.0 reads strings as character by default, and the factor-level handling
# of Type below needs real factors.
train <- read.csv("train.csv", stringsAsFactors = TRUE)
test <- read.csv("test.csv", stringsAsFactors = TRUE)

# Convert an m/d/Y date column to integer seconds since 1970-01-01 (epoch time).
# as.character() makes the conversion work for both factor and character input.
open_date_to_epoch <- function(x) {
  as.integer(as.POSIXct(as.Date(as.character(x), format = "%m/%d/%Y"), tz = "GMT"))
}
train$Open.Date <- open_date_to_epoch(train$Open.Date)
test$Open.Date <- open_date_to_epoch(test$Open.Date)

# Give Type the same level set in both data frames, so that a restaurant type
# that occurs only in the test file (the original script hard-coded "MB") does
# not make prediction fail with "factor has new levels".
type_levels <- union(levels(train$Type), levels(test$Type))
train$Type <- factor(train$Type, levels = type_levels)
test$Type <- factor(test$Type, levels = type_levels)

# Two resampling partitions of the training rows, balanced on revenue.
# NOTE(review): `partition` is not used anywhere else in the visible script.
partition <- createDataPartition(train$revenue, times = 2)

#' Load the training and test data and convert Open.Date into epoch time, i.e. seconds since Jan 1 1970. The models only need it as a number, so the exact origin does not matter.


## ----factors-------------------------------------------------------------
library(psych)

# Columns whose name contains "P" (same pattern as before), resolved once on
# the training data and then addressed by name in both data frames.
p_names <- names(train)[grepl(names(train), pattern = "P")]

# Six-factor varimax solution fitted on the training rows only.
x <- fa(train[, p_names], nfactors = 6, rotate = "varimax")

# Reduced training frame: the revenue/date/type/city columns plus factor scores.
# NOTE(review): `trainfa` is not used anywhere else in the visible script.
trainfa <- cbind(train[grepl(names(train), pattern = ".*(rev|Open|Type|City|Group).*")], x$scores)

# Score the test rows on the *training* factor solution. The original fitted a
# second, independent fa() on the test set, so its factors were not the same
# variables as the training ones (different loadings, order and sign) even
# though the column names matched. old.data standardises the test columns with
# the training means and standard deviations.
test_scores <- predict(x, data = test[, p_names], old.data = train[, p_names])
colnames(test_scores) <- colnames(x$scores)

test <- cbind(test, test_scores)
train <- cbind(train, x$scores)

#' I tried using factor-analysis scores as features, but it didn't pan out. Then again, I know very little about factor analysis. I ran this with oblimax rotations too, still with little to no improvement.


## ------------------------------------------------------------------------
# 70-fold cross-validation shared by all base learners. The original call also
# passed repeats = 100, which caret ignores (with a warning) for method = "cv",
# so it is dropped here without changing the resampling. savePredictions =
# "final" keeps the out-of-fold predictions that the ensemble steps rely on.
tc <- trainControl(method = "cv", number = 70, savePredictions = "final")
#' I am abundantly aware that using this many folds will overfit the data, especially when using gbm, but it has yielded a decent score so far.


# Tuning specifications for the three base learners (gbm, dnn, glmboost).
# NOTE(review): caret's glmboost appears to expect `prune` as "yes"/"no";
# confirm that prune = TRUE has the intended effect before relying on it.
tl <- list(
  gbm = caretModelSpec(
    method = "gbm",
    tuneGrid = expand.grid(
      interaction.depth = 3:5,
      n.trees = c(500, 1000, 1500),
      shrinkage = c(0.01, 0.1, 0.001)
    )
  ),
  dnn = caretModelSpec(
    method = "dnn",
    tuneGrid = expand.grid(
      layer1 = 3,
      layer2 = 5,
      layer3 = 2,
      hidden_dropout = c(0.1, 0.2, 0.3),
      visible_dropout = c(0.1, 0.2, 0.3)
    )
  ),
  glmboost = caretModelSpec(
    method = "glmboost",
    tuneGrid = expand.grid(prune = TRUE, mstop = c(100, 200, 300))
  )
)

# Fit every base learner on the same formula and resampling scheme.
# Columns 1 and 3 of `train` are dropped by position before modelling
# (presumably the row id and the city name -- verify against train.csv).
model_list <- caretList(
  revenue ~ .,
  data = train[, -c(1, 3)],
  tuneList = tl,
  trControl = tc
  # , methodList = c("brnn")
)

# Correlation between the base learners' resampled performance.
modelCor(resamples(model_list))
#' I've been looking at this correlation matrix to try to get more diversity in the algorithms I use.


#' Methods tried: ANFIS (high RMSE), bstTree (high RMSE, correlates with gbm), rpart and rf (both correlate strongly with gbm but are slower).

## ------------------------------------------------------------------------
# Greedy linear blend of the base learners, kept for comparison.
greedy_ensemble <- caretEnsemble(model_list)
summary(greedy_ensemble)
#' Running a greedy ensemble like this produces a poor RMSE.

# Stack the base learners with a glmboost meta-model. The original
# trainControl() call also passed repeats = 10, which caret ignores (with a
# warning) for method = "cv"; it is dropped here without changing the
# resampling.
glmensemble <- caretStack(
  model_list,
  method = "glmboost",
  tuneGrid = expand.grid(mstop = 25, prune = TRUE),
  # tuneGrid = expand.grid(interaction.depth = c(3:5), n.trees = c(500, 1000, 1500), shrinkage = .001),
  trControl = trainControl(method = "cv", number = 110)
)
#' Again, this is heavily overfitting, I know, but it has been good for the score. A low RMSE, even with high variance, seems to pay off a bit here.
#' I've run the stacked model using glmboost, plain glm and rf. glmboost has performed the best (i.e. lowest RMSE).

glmensemble

## ------------------------------------------------------------------------
# Fill the sample submission's Prediction column with the stacked model's
# predictions for the test set and write it out as submit.csv.
sample <- read.csv("sampleSubmission.csv")
submit <- predict(glmensemble, test)
sample$Prediction <- submit
# write.csv() always writes the header row: the original col.names = F was
# ignored with a warning, so it is dropped here.
write.csv(sample, file = "submit.csv", row.names = FALSE)

#' Pull in the sample submission and write the predictions to a file called submit.csv.


##-----------------------
# Model-by-tag indicator matrix published with the caret documentation.
tag <- read.csv("http://topepo.github.io/caret/tag_data.csv", row.names = 1)
tag <- as.matrix(tag)

## Select only models for regression
regModels <- tag[tag[, "Regression"] == 1, ]

## Seed the analysis with the glmboost model. The original pattern,
## "(gbmboost)", matches no caret model, which left `start` empty and gave
## maxDissim() an empty seed set.
start <- grep("(glmboost)", rownames(regModels), fixed = TRUE)
if (length(start) != 1L) {
  stop("Expected exactly one regression model matching \"(glmboost)\", found ",
       length(start), call. = FALSE)
}
# Every other regression model is a candidate.
pool <- setdiff(seq_len(nrow(regModels)), start)

## Select 4 more models by maximizing the Jaccard
## dissimilarity between sets of models
nextMods <- maxDissim(regModels[start, , drop = FALSE],
                      regModels[pool, ],
                      method = "Jaccard",
                      n = 4)

rownames(regModels)[c(start, nextMods)]
#' This can be used to see which other models are dissimilar to the seed model (and so are less likely to correlate with it) and are worth trying in the ensemble.