# My-Code_Housing_Price

# ---- Package setup ----
# Install only the packages that are missing, so that re-running the script
# does not download every dependency again.
required_packages <- c(
  "tidyr", "corrplot", "reshape2", "caret", "AppliedPredictiveModeling",
  "stargazer", "dplyr", "plyr", "stringr", "lazyeval", "hexbin", "devtools"
)
missing_packages <- setdiff(required_packages, rownames(installed.packages()))
if (length(missing_packages) > 0) {
  install.packages(missing_packages)
}

# install_github() comes from devtools, which is not attached yet at this
# point, so it is called through the namespace. Current devtools takes one
# "user/repo" string rather than separate repo and user arguments.
devtools::install_github("mrdwab/mrdwabmisc")

# plyr is attached before dplyr so that dplyr's verbs are not masked by plyr.
library(plyr)
library(devtools)
library(hexbin)
library(caret)
# NOTE(review): library(sunflowerplot) was dropped. sunflowerplot() is a
# function in the graphics package; there appears to be no package of that
# name to attach -- confirm.
library(lazyeval)
library(stringr)
library(tidyr)
library(corrplot)
library(reshape2)
library(AppliedPredictiveModeling)
library(stargazer)
library(dplyr)

#how to import a file : 
library(readr)
BD5TRAIN2_jo2 <- read_csv("/Users/Keisan/Documents/CPLN590/MIDTERM/DATA/BD5TRAIN2_jo2.csv")
View(BD5TRAIN2_jo2)  


head(BD5TRAIN)

Violations2 <- Violations %>% separate(Location, c("Lat", "Long"))
Violations2 <- Violations %>% separate(Violations, Location, c("Lat", "Long"), sep = " ", remove = FALSE, convert = TRUE, extra = "warn", fill = "warn",)
Violations2016 <- read.csv("C:/Users/Keisan/Documents/CPLN590/Violations2016.csv", header = TRUE, row.names = NULL)
gsub( "POINT (", "", as.character(Violations$Location) n)
Violations2 <- gsub("C([0-7]+)_.*", "\\1", Violations$Location)
Property <- read.csv("C:/Users/Keisan/DOcuments/CPLN590/MIDTERM/DATA/Propertyfile.csv", header = TRUE, row.names = NULL)
View(Property)
View(Property)
View(Violations)
Violations <- read.csv("C:/Users/Keisan/Downloads/LI_Violations2016.csv", header = TRUE, row.names = NULL)
stringsAsFactors= TRUE,
stringsAsFactors= TRUE,
strip.white = TRUE)
BD5TRAIN2_jo2 <- read.table("C:/Users/Keisan/DOcuments/CPLN590/MIDTERM/DATA/BD5TRAIN2_jo2.csv", header = TRUE, fill = TRUE)
separate(Violations, Location, c("Lat11", "Long11"), sep = " ")
Violationstest2 <- gsub("POINT ", "", Violations$Location)
vlocsplit= gsub("POINT", "", Location)
V4 <- gsub("\\(|\\)", "", V3)
gsub("\\[|\\]", "", n)
View(V4)
V5 <- separate(V4, x, c("Lat11","Long11"), sep = " ")
V5 <- merge(Violations[Violations$year=="2014",], test[test$year=="2015",], by = "p1", all = TRUE)
Violations2 <- Violations + as.data.frame(V4)
dataFrame$newColumn <- dataFrame$oldColumn1 + dataFrame$oldColumn2
LocDataFrame <- (V4 c("Location")))
V4df <- data.frame(as.list(V4)) 
V4DF <- data.frame(lapply(V4, type.convert), stringsAsFactors=FALSE)
V4DF2 <- data.frame(keyName=names(V4), value=V4, row.names=NULL)
require(reshape2)
Violations$Locations2 <- rownames(V4) 
melt(V4)
Violationstest <- Violations
read.table(text=gsub("^[^(]+\(|\)", "", Violations$Location),
           header=FALSE, col.names = c("lat", "lon"), stringsAsFactors=FALSE)

# ---- Split "POINT (a b)" strings into two numeric coordinates ----

# Extract the two numbers from point strings such as
# "POINT (-7423249 453982)".
#
# @param wkt Character or factor vector of point strings.
# @return A data.frame with numeric columns `lat` (first number) and `lon`
#   (second number), one row per element of `wkt`; NA where the string does
#   not match. Signs and decimal parts are kept.
# NOTE(review): the lat/lon naming follows the original script; WKT points
# are conventionally written x (longitude) first -- confirm the order.
parse_point_coords <- function(wkt) {
  wkt <- as.character(wkt)
  number <- "-?[0-9]+\\.?[0-9]*"
  pattern <- paste0(
    "^[^(]*\\(\\s*(", number, ")\\s+(", number, ")\\s*\\).*$"
  )
  matched <- grepl(pattern, wkt)
  first <- ifelse(matched, sub(pattern, "\\1", wkt), NA_character_)
  second <- ifelse(matched, sub(pattern, "\\2", wkt), NA_character_)
  data.frame(lat = as.numeric(first), lon = as.numeric(second))
}

Violations <- data.frame(Location = "POINT (-7423249 453982)")
# Every row is parsed (the original only looked at the first row), and Lon is
# taken from the second number instead of being re-derived from Lat.
violation_coords <- parse_point_coords(Violations$Location)
Violations$Lat <- violation_coords$lat
Violations$Lon <- violation_coords$lon


ViolationsTest <- data.frame(
  Location = c("POINT (-7423249 453982)",
               "POINT (-7412345 453321)"),
  stringsAsFactors = FALSE
)


# Alternative: strip everything up to "(" and the closing ")", then let
# read.table() split the remaining "a b" text into two columns.
read.table(text = gsub("^[^(]+\\(|\\)", "", Violations$Location),
           header = FALSE, col.names = c("lat", "lon"),
           stringsAsFactors = FALSE)
read.table(text = gsub("^[^(]+\\(|\\)", "", ViolationsTest$Location),
           header = FALSE, col.names = c("lat", "lon"),
           stringsAsFactors = FALSE)

# From here on ViolationsTest is a plain character vector.
ViolationsTest <- c("POINT (-7423249 453982)",
                    "POINT (-7412345 453321)")
test_coords <- parse_point_coords(ViolationsTest)
lat <- test_coords$lat
lon <- test_coords$lon
lat
Villations <- Violations
View(Villations)
V2k16 %>% separate(V2k16$Location, c("Lat", "Long"), " ")


library(stringr)
str_split_fixed(before$type, "_and_", 2)
within(V2k16, FOO<-data.frame(do.call('rbind', strsplit(as.character(FOO), '|', fixed=TRUE))))

separate(V2k16, Location, (Lat + Long), sep = " ") 
separate(V2k16,Location, c('Lat', 'Long'), sep = " ")
separate(V2k16, Location, c('Lat', 'Long'))
VVV <- strsplit(V2k16$Location, " ")
separate(V2k16, Location, c('Lat', 'Long'), sep = " ", remove = TRUE, convert = FALSE, extra = "warn", fill = "warn",)
V2k16x <- with(V2k16, data.frame(attr = VVV))
V2k16x <- cbind(V2k16x, data.frame(t(sapply(out, `[`))))
names(after)[2:3] <- paste("type", 1:2, sep = "_")


BD5TRAIN2_jo2 <- read.table("C:/Users/Keisan/Documents/CPLN590/MIDTERM/DATA/BD5TRAIN2_jo2.csv",header = TRUE , sep = ",")
Near_Retail <- read.table("C:/Users/Keisan/Documents/CPLN590/MIDTERM/DATA/Near_AllRetail.txt",
                          header = TRUE)
Near_Retail <- read.table("C:/Users/Keisan/Documents/CPLN590/MIDTERM/DATA/Near_AllRetail.txt", header = TRUE)
Near_CommCorr <- read.table("C:/Users/Keisan/Documents/CPLN590/MIDTERM/DATA/Near_CommCorr.txt", header = TRUE , sep = ",")
Near_CrimeInc <- read.table("C:/Users/Keisan/Documents/CPLN590/MIDTERM/DATA/Near_CrimeInc.txt", header = TRUE , sep = ",")
Near_HighwyExit <- read.table("C:/Users/Keisan/Documents/CPLN590/MIDTERM/DATA/Near_HighwyExit.txt", header = TRUE , sep = ",")
Near_SeptaStops <- read.table("C:/Users/Keisan/Documents/CPLN590/MIDTERM/DATA/Near_SeptaStops.txt", header = TRUE , sep = ",")
Near_TrafficData <- read.table("C:/Users/Keisan/Documents/CPLN590/MIDTERM/DATA/Near_TrafficData.txt", header = TRUE , sep = ",")
Near_Vacant <- read.table("C:/Users/Keisan/Documents/CPLN590/MIDTERM/DATA/Near_Vacant.txt", header = TRUE , sep = ",")
Near_Violations <- read.table("C:/Users/Keisan/Documents/CPLN590/MIDTERM/DATA/Near_Violations.txt", header = TRUE , sep = ",")
View (Near_HighwyExit)
View (Near_SeptaStops)
View (Near_TrafficData)
View (Near_Vacant)
View (Near_Violations)
View(Near_Retail)
View(CommCorr)

View(PropertyR)
PropertyR <- read.table("C:/Users/Keisan/Documents/CPLN590/PropertyR.csv", header = TRUE , sep = ",")
# Drop the first column of Near_Vacant. drop = FALSE keeps a data frame even
# if only one column remains.
Near_Vacant <- Near_Vacant[, -1, drop = FALSE]
head(Near_Vacant)
# The original line used `>` (a comparison) where an assignment was meant.
# OBJECTID is removed by name, and only if it is still present after the
# positional drop above.
Near_Vacant <- Near_Vacant[, setdiff(names(Near_Vacant), "OBJECTID"),
                           drop = FALSE]
# Full outer join of violations and vacancies. `all = TRUE` is an argument of
# merge(); the original had it inside c(), which made it part of by.y.
BigData1 <- merge(Near_Violations, Near_Vacant,
                  by.x = "OBJECTID", by.y = "IN_FID", all = TRUE)
head(BigData1)
merged.BIGDATA1.all <- BigData1
# edits titles of columns for clarity as dataframe gets bigger

colnames(Near_Vacant) <- paste(colnames(Near_Vacant), "Vac", sep = ".")
colnames(Near_Violations) <- paste(colnames(Near_Violations), "LIViol", sep = ".")
colnames(Near_Retail) <- paste(colnames(Near_Retail), "Retail", sep = ".")
colnames(Near_SeptaStops) <- paste(colnames(Near_SeptaStops), "SeptST", sep = ".")
colnames(Near_TrafficData) <- paste(colnames(Near_TrafficData), "TrafDat", sep = ".")
colnames(Near_HighwyExit) <- paste(colnames(Near_HighwyExit), "HwyEx", sep = ".")
colnames(Near_CommCorr) <- paste(colnames(Near_CommCorr), "ComCor", sep = ".")
#print names to check
colnames(Near_Vacant)
colnames(Near_Violations)
colnames(Near_Retail)
colnames(Near_SeptaStops)
colnames(Near_TrafficData)
colnames(Near_HighwyExit)
colnames(Near_CommCorr)

#substitute in case you made a mistake
names(Near_Vacant) <- gsub("_Vac","",names(Near_Vacant)) 
head(Near_Vacant)
head(Near_Violations)

#merge 2 dataframes at a time to make one big dataset and keep all records
BigData1 <- merge(x = Near_Vacant, y = Near_CommCorr, by.x = "IN_FID.Vac", by.y = "OBJECTID.ComCor", all = TRUE)

#Keep replacing the 'y' variable and 'by.y' variable names each time
BigData1 <- merge(x = BigData1, y = Near_Violations, by.x = "IN_FID.Vac", by.y = "OBJECTID.LIViol", all = TRUE)
BigData1 <- merge(x = BigData1, y = Near_SeptaStops, by.x = "IN_FID.Vac", by.y = "OBJECTID.SeptST", all = TRUE)
BigData1 <- merge(x = BigData1, y = Near_HighwyExit, by.x = "IN_FID.Vac", by.y = "IN_FID.HwyEx", all = TRUE)

#check the number of columns after each merge. should grow
ncol(BigData1)
colnames(BigData1)

#format BigData1 for only the near_dist fields
Independent_variables <- BigData1 #reassigns dataset for manipulaiton
Near_dist_col <- BigData1 %>% select(starts_with("Near_DIST"))
ncol(Near_dist_col)
head(Near_dist_col)
View(Near_dist_col)
Prop_Ind_Var <- PropertyR %>% select(Zoning,Taxable_Bu,Taxable_La,Depth,Garage_Space,Total_Liva)
#add a new column to each data frame
# The copy has to be made first: the original added column 13 to
# Train85spread and then overwrote the whole object with Train85cut.
Train85spread <- Train85cut
Train85spread[, 13] <- NA
colnames(Train85spread)[13] <- "FID"
# Fill the new column with sequential row ids. seq_len() also works for an
# empty data frame, where 1:nrow() would yield c(1, 0).
Train85spread$FID <- seq_len(nrow(Train85spread))
Near_dist_col$FID.Main2 <- seq_len(nrow(Near_dist_col))

head(Train85spread)
#drop column by number
BD5TRAIN <- BD5TRAIN[,-18]


BigData2 <- merge(x = Prop_Ind_Var, y = Neardist_col, by.x = "FID.Main", by.y = "FID.Main3", all = TRUE)

#add dependent variable to dataframe
DepVar <- PropertyR %>% select(Sale_Pri_1,Field1,Sale_Price,Sale_Year,Lat1,Long1)
DepVar <- PropertyR %>% select(Sale_Pri_1,Field1,Sale_Price,Sale_Year,Lat1,Long1)

#output this as a csv for backup

# write.csv() fixes append, sep, dec, col.names and qmethod itself and only
# emits a warning when they are supplied, so they are omitted here. The file
# that is written is unchanged.
write.csv(BD5TRAIN,
          file = "C:/Users/Keisan/Documents/CPLN590/bd5TRAIN_RESULTS.csv",
          quote = TRUE, eol = "\n", na = "NA", row.names = TRUE,
          fileEncoding = "")

######################################################################################
#----------------------------------------Creating Training Dataset-------------------------

#in case you need to reimport the test set
# read.csv2() expects ";" separators and "," decimals. This file is read as
# comma-separated elsewhere in the script, so read.csv() is the matching
# reader; read.csv2() would return a single column.
BD5TEST2 <- read.csv("C:/Users/Keisan/Documents/CPLN590/MIDTERM/DATA/BD5TEST.csv", header = TRUE)
#make a copy of course ;)
BD3 <- BigData3

#This dataset has the dependent variable and "most of the independent variables"
TrainingSet1 <- BigData3
BigData3 [,1]
colnames(BigData1 [,1])
#gives column number with name
which( colnames(BigData3)=="random_name" )
#gives column name with number
colnames( BigData3 )[1]
colnames(QTrain175)
TraingingSet1 <- BigData3[,-1] #removes one column from dataframe
QTrain175 <- QTrain175[,-1] #removes column by number
TrainingSet1 <- TrainingSet1[, -c(2:4)] #deletes a series of columns

TrainingSet1 <- TrainingSet1[,-1]
BD5TRAIN[,-1]
View(TrainingSet1)
colnames(TrainingSet1)

training3 <- training[, c(2, 12,10)] # include three other columns
############################################################
#----------------------------------Summary Stats on Dataset--------------------------

stargazer(TrainingSet1, type ="text", title = "Summary Statistics")
stargazer(both, type = "html")
#####################################################################
#Now were going to create a correlation matrix using the corrplot package
TrainingSet1Cpy <- TrainingSet1

TrainingSet1Cpy[is.na(TrainingSet1Cpy)] <- 0
cor(TrainingSet1Cpy, use="pairwise.complete.obs")
TS1C <- cor(TrainingSet1Cpy)
M[is.na(M)] <- 0
TH

head(trainprop3)
TP <- cor(trainprop3)
#returns correlation for each variable to all the others. Here is a pretty correlation matrix plot.
#check the help for corrplot. There are a bunch of different methods for creating these graphics
corrplot(TH, method = "number")

####################################
#----------------------------Quick Training Dataset---------------------
#You just downsized the Dataset so that every value matches up. should've done thisi before.
#nevertheless you now have a lot less data to work with so time to re-import the dataset and 
#re do the training|testing partition

# Quick_Train.csv is written by write.csv() (comma-separated) further down,
# so it has to be read back with read.csv(), not read.csv2().
Quick_Train <- read.csv("C:/Users/Keisan/Documents/CPLN590/Quick_Train.csv", header = TRUE)
# Complete cases only. This is how the script derives Quick_Train666 later,
# but the split below needs it here.
Quick_Train666 <- na.omit(Quick_Train)
cutoff <- round(0.7 * nrow(Quick_Train666))
Train666 <- Quick_Train666

# First 70% of the rows for training, the remainder for testing. A logical
# mask keeps the split correct even when cutoff is 0.
in_training <- seq_len(nrow(Train666)) <= cutoff
Train666_70cut <- Train666[in_training, ]
Test666_30cut <- Train666[!in_training, ]

View(Train666_70cut)
View(Test666_30cut)

# Need to make your dataframes numeric
QuickT_Set <- Quick_train_set  #make a copy for safety
colnames(QuickT_Set)  #trying to figure out which columns are not numeric

# Check every column that is actually present, instead of typing each name
# by hand. A mistyped name such as NEAR_DIST.SeptaST (the suffix added earlier
# in this script is "SeptST") makes `$` return NULL, and is.numeric(NULL) is
# FALSE, which looks like a non-numeric column.
numeric_columns <- vapply(QuickT_Set, is.numeric, logical(1))
numeric_columns
# Columns that still need converting before a regression can use them
names(numeric_columns)[!numeric_columns]


transform(TrainingSet2, NEAR_DIST.SeptaST = as.numeric(NEAR_DIST.SeptaST))
#Remove all $ in these columns "Taxable_Bu" & "Taxable_La"
# As a regular expression "$" is the end-of-string anchor and removes
# nothing; fixed = TRUE matches the literal dollar sign.
# NOTE(review): the call targets Depth while the comment names the Taxable_*
# columns, and the result is only printed, not stored -- confirm the intent.
gsub("$", "", QTrain175$Depth, fixed = TRUE)
head(BD3$Taxable_La)

#what mode
mode(TrainingSet2$NEAR_DIST.SeptaST)
#####

# The droids you are looking for   aka. the variables you wanna do regression on
#"Depth"             "Garage_Spa"        "Total_Liva"        "NEAR_DIST.Vac"    
#[5] "NEAR_DIST.ComCor"  "NEAR_DIST.LIViol"  "NEAR_DIST.Retail"  "NEAR_DIST.SeptST" 
#[9] "NEAR_DIST.TrafDat" "NEAR_DIST.HwyEx"   "Sale_Pri_1"        "Tax_Build"        
#[13] "Tax_Land"


#But one better! - Output this as backup just in case.
# write.csv() ignores append, sep, dec, col.names and qmethod (with a
# warning), so only the arguments it honours are passed.
write.csv(Quick_Train,
          file = "C:/Users/Keisan/Documents/CPLN590/Quick_Train.csv",
          quote = TRUE, eol = "\n", na = "NA", row.names = TRUE,
          fileEncoding = "")
write.csv(Quick_Test,
          file = "C:/Users/Keisan/Documents/CPLN590/Quick_Test.csv",
          quote = TRUE, eol = "\n", na = "NA", row.names = TRUE,
          fileEncoding = "")
stargazer(QTrain70, type="text", title = "Summary Statistics")
View(QTrain70)
str(QTrain70)
head(QTrain70)
#Assign factors as integers, one distance column at a time
# Going through as.character() means that, for a factor column, the labels
# are parsed rather than the internal level codes.
for (dist_col in c("NEAR_DIST.Retail", "NEAR_DIST.ComCor", "NEAR_DIST.SeptST",
                   "NEAR_DIST.Vac", "NEAR_DIST.LIViol", "NEAR_DIST.TrafDat",
                   "NEAR_DIST.HwyEx")) {
  # exact = FALSE mirrors the partial name matching `$` applies when reading
  Train85cut[[dist_col]] <-
    as.integer(as.character(Train85cut[[dist_col, exact = FALSE]]))
}
rm(dist_col)
#format summary statistics output
stargazer(OLS_1, type="text", title = "Summary Statistics")  #basic
# The stray "+" in front of iqr was a console continuation prompt pasted into
# the script; with it the call does not parse.
stargazer(OLS_1, type = "text", nobs = FALSE, mean.sd = TRUE, median = TRUE,
          iqr = TRUE)   #more information in charts

stargazer(QTrain70, type = "text", style = "qje") #style difference 


stargazer(OLS_1, type = "text", 
          title            = "Summary Statistics",
          covariate.labels = c("Garage Space Dimension", "Total Livable Space", "Dist.to Vacancy",
                               "Nearest Commercial Corridor", "Nearest LI Violation", "Nearest Retail", 
                               "Nearest Septa Stop", "Nearby Traffic Density", "Nearest Highway Exit", "Sale Price", "Building Tax","Land Tax"),
          dep.var.caption  = "Dependent Variable",
          dep.var.labels   = "Sale Price")  #rename independent labels

stargazer(QTrain70, type = "text", 
          dep.var.labels.include = FALSE,
          model.numbers          = FALSE) #not interested in this

##########################################################################
#running a OLS Regression on the dataset 
# List the available column names before writing the regression formula.
# (A stray "notepad" token after the call made this line a syntax error.)
colnames(QTrain70)
##########################################   Runs a regression ###############################!!!!!!!!!!!!!!!!!!
N_hood <- lm(Sale_Pri_1 ~ Garage_Spa + Total_Liva + NEAR_DIST_Vac + NEAR_DIST_ComCor + NEAR_DIST_LIViol + NEAR_DIST_Retail + NEAR_DIST_SeptST + NEAR_DIST_TrafDat + NEAR_DIST_HwyEx + Tax_Build + Tax_Land, data = BD6_Neighb_Jo)
### make dataset smaller

cutoff = round(0.7*nrow(BD5))
QTrain175 <- QTrain35

BD5TRAIN <- BD5[1:cutoff,]
BD5TEST <- BD5[-(1:cutoff),]
dim(BD5TRAIN)


#regression ran after you subsetted the data twice. Hooray! you have some data to plot visualizations
#based on! this is the main point.
#Now just output that file for safety reasons...
# Only the arguments write.csv() honours are passed; append, sep, dec,
# col.names and qmethod are ignored by it with a warning.
write.csv(BD5TRAIN,
          file = "C:/Users/Keisan/Documents/CPLN590/MIDTERM/DATA/BD5TRAIN.csv",
          quote = TRUE, eol = "\n", na = "NA", row.names = TRUE,
          fileEncoding = "")
# Author's note: an earlier attempt failed because the object had been saved
# as a character and could not be coerced into a data.frame.

View(OLS_1) #can't view it saves as a function
#summary  of OLS_1

summary(OLS_1)


library(corrplot)
cor(Train85cut) #produces correlation coefficients in list 
TR85cut <- cor(Train85cut)
#Now to visualize
corrplot.mixed(TR85cut, order ="AOE",method ="color",addCoef.col="grey") #not working :(
#if column has null values in them
QMat[is.na(Qmat)] <- 0
#pairwise obs
QTMat <- QtestMatrix
cor(QtestMatrix, use="pairwise.complete.obs")
corrplot(BD5TRAIN, method = "number")


#subsetted data with. Now to go through previous steps with reformatting and subsetting
Quick_Train666 <- na.omit(Quick_Train)
Train666 <- Quick_Train666

Train666_70cut <- Train666[1:cutoff,]
is.numeric(Train85cut$NEAR_DIST.LIViol) 


#rmse, rse, etc for training set

#now the mean residual or error from the regression 'reg' aka the 'mean absolute error' aka the MAE
mean(abs(Train85cut$Sale_Pri_1))

#calculate the MAPE - or mean absolute percent error aka MAPE
mean(abs(Train85cut$Sale_Pri_1))


#plot variation in regression output

x <- rnorm(1000)
y <- rnorm(1000)
bin<-hexbin(x, y, xbins=50) 
plot(bin, main="Hexagonal Binning")

hexbin(Train85cut)

#stylize the corrplot: 
col1 <- colorRampPalette(c("#7F0000","red","#FF7F00","yellow","white", 
                           "cyan", "#007FFF", "blue","#00007F"))
col2 <- colorRampPalette(c("#67001F", "#B2182B", "#D6604D", "#F4A582", "#FDDBC7",
                           "#FFFFFF", "#D1E5F0", "#92C5DE", "#4393C3", "#2166AC", "#053061"))  
col3 <- colorRampPalette(c("red", "white", "blue")) 
col4 <- colorRampPalette(c("#7F0000","red","#FF7F00","yellow","#7FFF7F", 
                           "cyan", "#007FFF", "blue","#00007F"))   
wb <- c("white","black")


corrplot(Train85cut, method="square", order = "FPC", cl.pos="n") #corrplot of Train85cut OLS output, using squares, First Class Principle, no color ramp legend,
head(Train85cut)


#this works for corrplot 
corrplot(cor(Train85cut), method="square", order = "FPC", cl.pos="n", p.mat = res1[[1]], sig.level = 0.02, insig = c("p-value"))

pAdj <- p.adjust(c(res1[[1]]), method = "BH")


###select specific columns and drop them 
# General pattern for dropping columns by name: subset(df, select = -c(z, u))
# Fixes: `>` was typed where `<-` was meant, the data frame itself was listed
# among the columns to drop, and the column name now uses the NEAR_DIST.*
# spelling used for these columns elsewhere in the script.
Train85cut2 <- subset(Train85cut, select = -c(NEAR_DIST.TrafDat))

############## CORRELATION OUTPUT with significance levels indicated by an "X"

#' Pairwise correlation tests for every pair of columns
#'
#' Runs cor.test() on each pair of columns of `mat` and collects the results
#' in symmetric matrices.
#'
#' @param mat Numeric matrix or data frame; coerced with as.matrix().
#' @param conf.level Confidence level passed on to cor.test().
#' @return A list of three n-by-n matrices (n = number of columns), in this
#'   order: p-values (diagonal 0), lower confidence bounds and upper
#'   confidence bounds (both with diagonal 1). The elements are also named
#'   `p`, `lowCI` and `uppCI`, so res[[1]] and res$p both work.
cor.mtest <- function(mat, conf.level = 0.95) {
  mat <- as.matrix(mat)
  n <- ncol(mat)
  p.mat <- lowCI.mat <- uppCI.mat <- matrix(NA_real_, n, n)
  diag(p.mat) <- 0
  diag(lowCI.mat) <- diag(uppCI.mat) <- 1
  # seq_len() leaves the loop empty for a single column; 1:(n - 1) would
  # count down through 1, 0 and index a column that does not exist.
  for (i in seq_len(max(n - 1L, 0L))) {
    for (j in seq(i + 1L, n)) {
      tmp <- cor.test(mat[, i], mat[, j], conf.level = conf.level)
      p.mat[i, j] <- p.mat[j, i] <- tmp$p.value
      # cor.test() returns no confidence interval when there are too few
      # observations; leave NA in that case instead of failing.
      if (!is.null(tmp$conf.int)) {
        lowCI.mat[i, j] <- lowCI.mat[j, i] <- tmp$conf.int[1]
        uppCI.mat[i, j] <- uppCI.mat[j, i] <- tmp$conf.int[2]
      }
    }
  }
  list(p = p.mat, lowCI = lowCI.mat, uppCI = uppCI.mat)
}

# Correlation matrix to plot. It has to come from the same data (mtcars) as
# the p-value matrices below, so that p.mat lines up with it cell by cell.
M <- cor(mtcars)
res1 <- cor.mtest(mtcars, 0.95)
res2 <- cor.mtest(mtcars, 0.99)
## mark the insignificant cells according to the significance level
corrplot(M, p.mat = res1[[1]], sig.level = 0.2)


######################## RMSE
summary(Train85cut)
lm(formula = Sale_Pri_1 ~ Garage_Spa + NEAR_DIST.LIViol + NEAR_DIST.Retail, data = Train85cut)
# Cross-validate the model CVT on trainhouse5.
# NOTE(review): CVlm() is presumably DAAG::CVlm; no library() call for it is
# visible in this script -- confirm and attach the package.
CV <- CVlm(data = trainhouse5, form.lm = CVT, m = 8)
exists("fit")

# m sets the number of folds (8 here)
# The cross-validation mean square is an attribute of the CVlm() result (CV),
# not of the model that was passed in (CVT).
mse <- attr(CV, "ms")
rmse <- sqrt(mse)    #Obtaining RMSE for model 1
rmse
#Now were going to see how generalizable some of these ind. variables are across both Starbucks and Dunkin Donuts.
#The following lines of code do the same thing as above to create the 'training2' data frame but this time using
#only Starbucks and Dunkin Donuts locations.
#Create a new data frame of just starbucks and DD locations
StarbucksOrDunkin <- biz[ which(biz$CONAME == "DUNKIN' DONUTS" | biz$CONAME == "STARBUCKS" ), ]
#create a dummy variablefor DD
StarbucksOrDunkin$isDunkin = ifelse(StarbucksOrDunkin$CONAME == "DUNKIN' DONUTS" ,1,0)
#create a new data frame of just the important variables.
SD2 <- StarbucksOrDunkin[,22:40] #create a new data frame of most of the ind. variables
SD2 <- SD2[,-5] #removes one column from that
SD3 <- StarbucksOrDunkin[, c(12,10)] # install.packages("tidyr")
# ---- Package setup ----
# Install only what is missing. car is included because it is attached below.
required_packages <- c(
  "tidyr", "corrplot", "reshape2", "caret", "AppliedPredictiveModeling",
  "stargazer", "dplyr", "stringr", "lazyeval", "hexbin", "ggplot2", "car"
)
missing_packages <- setdiff(required_packages, rownames(installed.packages()))
if (length(missing_packages) > 0) {
  install.packages(missing_packages)
}


library(hexbin)
library(car)
# NOTE(review): library(sunflowerplot) was dropped. sunflowerplot() is a
# function in the graphics package; there appears to be no package of that
# name to attach -- confirm.
library(lazyeval)
library(stringr)
library(tidyr)
library(corrplot)
library(reshape2)
library(caret)
library(AppliedPredictiveModeling)
library(stargazer)
library(dplyr)

# The same file is read as comma-separated with a header row further down,
# so read.csv() with header = TRUE is the matching reader. read.csv2()
# expects ";" separators, and header = FALSE would turn the column names
# into the first data row.
BD5TEST2 <- read.csv("C:/Users/Keisan/Documents/CPLN590/MIDTERM/DATA/BD5TEST.csv",
                     header = TRUE)

Violations2 <- Violations %>% separate(Location, c("Lat", "Long"))
Violations2 <- Violations %>% separate(Violations, Location, c("Lat", "Long"), sep = " ", remove = FALSE, convert = TRUE, extra = "warn", fill = "warn",)
Violations2016 <- read.csv("C:/Users/Keisan/Documents/CPLN590/Violations2016.csv", header = TRUE)
gsub( "POINT (", "", as.character(Violations$Location) n)
Violations2 <- gsub("C([0-7]+)_.*", "\\1", Violations$Location)
BD5TEST2 <- read.csv("C:/Users/Keisan/DOcuments/CPLN590/MIDTERM/DATA/BD5TEST.csv", header = TRUE, row.names = )
View(Property)
View(Property)
View(Violations)
Violations <- read.csv("C:/Users/Keisan/Downloads/LI_Violations2016.csv", header = TRUE, row.names = NULL)
stringsAsFactors= TRUE,
stringsAsFactors= TRUE,
strip.white = TRUE)
Property <- read.table("C:/Users/Keisan/DOcuments/CPLN590/MIDTERM/DATA/Propertyfile.csv", header = TRUE, fill = TRUE)
separate(Violations, Location, c("Lat11", "Long11"), sep = " ")
Violationstest2 <- gsub("POINT ", "", Violations$Location)
vlocsplit= gsub("POINT", "", Location)
V4 <- gsub("\\(|\\)", "", V3)
gsub("\\[|\\]", "", n)
View(V4)
V5 <- separate(V4, x, c("Lat11","Long11"), sep = " ")
V5 <- merge(Violations[Violations$year=="2014",], test[test$year=="2015",], by = "p1", all = TRUE)
Violations2 <- Violations + as.data.frame(V4)
dataFrame$newColumn <- dataFrame$oldColumn1 + dataFrame$oldColumn2
LocDataFrame <- (V4 c("Location")))
V4df <- data.frame(as.list(V4)) 
V4DF <- data.frame(lapply(V4, type.convert), stringsAsFactors=FALSE)
V4DF2 <- data.frame(keyName=names(V4), value=V4, row.names=NULL)
require(reshape2)
Violations$Locations2 <- rownames(V4) 
melt(V4)
Violationstest <- Violations
read.table(text=gsub("^[^(]+\(|\)", "", Violations$Location),
           header=FALSE, col.names = c("lat", "lon"), stringsAsFactors=FALSE)

# ---- Split "POINT (a b)" strings into two numeric coordinates ----

# Extract the two numbers from point strings such as
# "POINT (-7423249 453982)".
#
# @param wkt Character or factor vector of point strings.
# @return A data.frame with numeric columns `lat` (first number) and `lon`
#   (second number), one row per element of `wkt`; NA where the string does
#   not match. Signs and decimal parts are kept.
# NOTE(review): the lat/lon naming follows the original script; WKT points
# are conventionally written x (longitude) first -- confirm the order.
parse_point_coords <- function(wkt) {
  wkt <- as.character(wkt)
  number <- "-?[0-9]+\\.?[0-9]*"
  pattern <- paste0(
    "^[^(]*\\(\\s*(", number, ")\\s+(", number, ")\\s*\\).*$"
  )
  matched <- grepl(pattern, wkt)
  first <- ifelse(matched, sub(pattern, "\\1", wkt), NA_character_)
  second <- ifelse(matched, sub(pattern, "\\2", wkt), NA_character_)
  data.frame(lat = as.numeric(first), lon = as.numeric(second))
}

Violations <- data.frame(Location = "POINT (-7423249 453982)")
# Every row is parsed (the original only looked at the first row), and Lon is
# taken from the second number instead of being re-derived from Lat.
violation_coords <- parse_point_coords(Violations$Location)
Violations$Lat <- violation_coords$lat
Violations$Lon <- violation_coords$lon


ViolationsTest <- data.frame(
  Location = c("POINT (-7423249 453982)",
               "POINT (-7412345 453321)"),
  stringsAsFactors = FALSE
)


# Alternative: strip everything up to "(" and the closing ")", then let
# read.table() split the remaining "a b" text into two columns.
read.table(text = gsub("^[^(]+\\(|\\)", "", Violations$Location),
           header = FALSE, col.names = c("lat", "lon"),
           stringsAsFactors = FALSE)
read.table(text = gsub("^[^(]+\\(|\\)", "", ViolationsTest$Location),
           header = FALSE, col.names = c("lat", "lon"),
           stringsAsFactors = FALSE)

# From here on ViolationsTest is a plain character vector.
ViolationsTest <- c("POINT (-7423249 453982)",
                    "POINT (-7412345 453321)")
test_coords <- parse_point_coords(ViolationsTest)
lat <- test_coords$lat
lon <- test_coords$lon
lat
Villations <- Violations
View(Villations)
V2k16 %>% separate(V2k16$Location, c("Lat", "Long"), " ")


library(stringr)
str_split_fixed(before$type, "_and_", 2)
within(V2k16, FOO<-data.frame(do.call('rbind', strsplit(as.character(FOO), '|', fixed=TRUE))))

separate(V2k16, Location, (Lat + Long), sep = " ") 
separate(V2k16,Location, c('Lat', 'Long'), sep = " ")
separate(V2k16, Location, c('Lat', 'Long'))
VVV <- strsplit(V2k16$Location, " ")
separate(V2k16, Location, c('Lat', 'Long'), sep = " ", remove = TRUE, convert = FALSE, extra = "warn", fill = "warn",)
V2k16x <- with(V2k16, data.frame(attr = VVV))
V2k16x <- cbind(V2k16x, data.frame(t(sapply(out, `[`))))
names(after)[2:3] <- paste("type", 1:2, sep = "_")


BD5TEST2 <- read.table("C:/Users/Keisan/Documents/CPLN590/MIDTERM/DATA/BD5TEST.csv",header = TRUE , sep = ",")
Near_Retail <- read.table("C:/Users/Keisan/Documents/CPLN590/MIDTERM/DATA/Near_AllRetail.txt",
                          header = TRUE)
Near_Retail <- read.table("C:/Users/Keisan/Documents/CPLN590/MIDTERM/DATA/Near_AllRetail.txt", header = TRUE)
Near_CommCorr <- read.table("C:/Users/Keisan/Documents/CPLN590/MIDTERM/DATA/Near_CommCorr.txt", header = TRUE , sep = ",")
Near_CrimeInc <- read.table("C:/Users/Keisan/Documents/CPLN590/MIDTERM/DATA/Near_CrimeInc.txt", header = TRUE , sep = ",")
Near_HighwyExit <- read.table("C:/Users/Keisan/Documents/CPLN590/MIDTERM/DATA/Near_HighwyExit.txt", header = TRUE , sep = ",")
Near_SeptaStops <- read.table("C:/Users/Keisan/Documents/CPLN590/MIDTERM/DATA/Near_SeptaStops.txt", header = TRUE , sep = ",")
Near_TrafficData <- read.table("C:/Users/Keisan/Documents/CPLN590/MIDTERM/DATA/Near_TrafficData.txt", header = TRUE , sep = ",")
Near_Vacant <- read.table("C:/Users/Keisan/Documents/CPLN590/MIDTERM/DATA/Near_Vacant.txt", header = TRUE , sep = ",")
Near_Violations <- read.table("C:/Users/Keisan/Documents/CPLN590/MIDTERM/DATA/Near_Violations.txt", header = TRUE , sep = ",")
View (Near_HighwyExit)
View (Near_SeptaStops)
View (Near_TrafficData)
View (Near_Vacant)
View (Near_Violations)
View(Near_Retail)
View(CommCorr)

View(PropertyR)
PropertyR <- read.table("C:/Users/Keisan/Documents/CPLN590/PropertyR.csv", header = TRUE , sep = ",")
# Drop the first column of Near_Vacant. drop = FALSE keeps a data frame even
# if only one column remains.
Near_Vacant <- Near_Vacant[, -1, drop = FALSE]
head(Near_Vacant)
# The original line used `>` (a comparison) where an assignment was meant.
# OBJECTID is removed by name, and only if it is still present after the
# positional drop above.
Near_Vacant <- Near_Vacant[, setdiff(names(Near_Vacant), "OBJECTID"),
                           drop = FALSE]
# Full outer join of violations and vacancies. `all = TRUE` is an argument of
# merge(); the original had it inside c(), which made it part of by.y.
BigData1 <- merge(Near_Violations, Near_Vacant,
                  by.x = "OBJECTID", by.y = "IN_FID", all = TRUE)
head(BigData1)
merged.BIGDATA1.all <- BigData1
# edits titles of columns for clarity as dataframe gets bigger

colnames(Near_Vacant) <- paste(colnames(Near_Vacant), "Vac", sep = ".")
colnames(Near_Violations) <- paste(colnames(Near_Violations), "LIViol", sep = ".")
colnames(Near_Retail) <- paste(colnames(Near_Retail), "Retail", sep = ".")
colnames(Near_SeptaStops) <- paste(colnames(Near_SeptaStops), "SeptST", sep = ".")
colnames(Near_TrafficData) <- paste(colnames(Near_TrafficData), "TrafDat", sep = ".")
colnames(Near_HighwyExit) <- paste(colnames(Near_HighwyExit), "HwyEx", sep = ".")
colnames(Near_CommCorr) <- paste(colnames(Near_CommCorr), "ComCor", sep = ".")
#print names to check
colnames(Near_Vacant)
colnames(Near_Violations)
colnames(Near_Retail)
colnames(Near_SeptaStops)
colnames(Near_TrafficData)
colnames(Near_HighwyExit)
colnames(Near_CommCorr)

#substitute in case you made a mistake
names(Near_Vacant) <- gsub("_Vac","",names(Near_Vacant)) 
head(Near_Vacant)
head(Near_Violations)

#merge 2 dataframes at a time to make one big dataset and keep all records
BigData1 <- merge(x = Near_Vacant, y = Near_CommCorr, by.x = "IN_FID.Vac", by.y = "OBJECTID.ComCor", all = TRUE)

#Keep replacing the 'y' variable and 'by.y' variable names each time
BigData1 <- merge(x = BigData1, y = Near_Violations, by.x = "IN_FID.Vac", by.y = "OBJECTID.LIViol", all = TRUE)
BigData1 <- merge(x = BigData1, y = Near_SeptaStops, by.x = "IN_FID.Vac", by.y = "OBJECTID.SeptST", all = TRUE)
BigData1 <- merge(x = BigData1, y = Near_HighwyExit, by.x = "IN_FID.Vac", by.y = "IN_FID.HwyEx", all = TRUE)

#check the number of columns after each merge. should grow
ncol(BigData1)
colnames(BigData1)

#format BigData1 for only the near_dist fields
Independent_variables <- BigData1 #reassigns dataset for manipulaiton
Near_dist_col <- BigData1 %>% select(starts_with("Near_DIST"))
ncol(Near_dist_col)
head(Near_dist_col)
View(Near_dist_col)
Prop_Ind_Var <- PropertyR %>% select(Zoning,Taxable_Bu,Taxable_La,Depth,Garage_Space,Total_Liva)
#add a new column to each data frame
# The copy has to be made first: the original added column 13 to
# Train85spread and then overwrote the whole object with Train85cut.
Train85spread <- Train85cut
Train85spread[, 13] <- NA
colnames(Train85spread)[13] <- "FID"
# Fill the new column with sequential row ids. seq_len() also works for an
# empty data frame, where 1:nrow() would yield c(1, 0).
Train85spread$FID <- seq_len(nrow(Train85spread))
Near_dist_col$FID.Main2 <- seq_len(nrow(Near_dist_col))

head(Train85spread)
#drop column by number
Neardist_col <- Near_dist_col[,-8]
BigData2 <- merge(x = Prop_Ind_Var, y = Neardist_col, by.x = "FID.Main", by.y = "FID.Main3", all = TRUE)

#add dependent variable to dataframe
DepVar <- PropertyR %>% select(Sale_Pri_1,Field1,Sale_Price,Sale_Year,Lat1,Long1)
DepVar <- PropertyR %>% select(Sale_Pri_1,Field1,Sale_Price,Sale_Year,Lat1,Long1)

#output this as a csv for backup

# write.csv() fixes append, sep, dec, col.names and qmethod itself and only
# emits a warning when they are supplied, so they are omitted here. The file
# that is written is unchanged.
write.csv(BigData3,
          file = "C:/Users/Keisan/Documents/CPLN590/BigData3.csv",
          quote = TRUE, eol = "\n", na = "NA", row.names = TRUE,
          fileEncoding = "")

######################################################################################
#----------------------------------------Creating Training Dataset-------------------------

#in case you need to reimport BigData3
# The leading ">" was a console prompt pasted into the script (a syntax
# error). BigData3.csv is written with write.csv() above, i.e. it is
# comma-separated, so read.csv() is used rather than read.csv2().
BigData3 <- read.csv("C:/Users/Keisan/Documents/CPLN590/BigData3.csv", header = TRUE)
#make a copy of course ;)
BD3 <- BigData3
ncol(BD3)
colnames(BD3)
#This dataset has the dependent variable and "most of the independent variables"
TrainingSet1 <- BigData3
BigData3 [,1]
colnames(BigData1 [,1])
#gives column number with name
which( colnames(BigData3)=="random_name" )
#gives column name with number
colnames( BigData3 )[1]
colnames(QTrain175)
TraingingSet1 <- BigData3[,-1] #removes one column from dataframe
QTrain175 <- QTrain175[,-1] #removes column by number
TrainingSet1 <- TrainingSet1[, -c(2:4)] #deletes a series of columns

TrainingSet1 <- TrainingSet1[,-1]

View(TrainingSet1)
colnames(TrainingSet1)
colnames(BD3)
na.omit(BD3)
BD3_2 <-BD3
training3 <- training[, c(2, 12,10)] # include three other columns
############################################################
#----------------------------------Summary Stats on Dataset--------------------------

stargazer(TrainingSet1, type ="text", title = "Summary Statistics")
stargazer(both, type = "html")
#####################################################################
#Now were going to create a correlation matrix using the corrplot package
TrainingSet1Cpy <- TrainingSet1

BD5TRAIN[is.na(BD5TRAIN)] <- 0
cor(BD5TRAIN, use="pairwise.complete.obs")
CorBD5train <- cor(BD5TRAIN)
M[is.na(M)] <- 0
TH
na.omit(Quick_Train) #removes na rows.
head(trainprop3)
TP <- cor(trainprop3)
#returns correlation for each variable to all the others. Here is a pretty correlation matrix plot.
#check the help for corrplot. There are a bunch of different methods for creating these graphics
corrplot(CorBD5train, method = "square")

####################################
#----------------------------Quick Training Dataset---------------------
#You just downsized the Dataset so that every value matches up. should've done thisi before.
#nevertheless you now have a lot less data to work with so time to re-import the dataset and 
#re do the training|testing partition
table(is.na(Quick_Train)) 
# Quick_Train.csv is written by write.csv() in this script (comma-separated);
# read.csv2() expects ";" separators and would return a single column.
Quick_Train <- read.csv("C:/Users/Keisan/Documents/CPLN590/Quick_Train.csv", header = TRUE)
cutoff = round(0.7*nrow(DupBD5))
Train666 <- Quick_Train666


#Let's see if any of this predictive variation is spatial in nature. To do this, were going to have to map these 
#predictions in ArcGIS.
#First create a dataframe of the predicted values and the FID which we can join back to the shapefile in ArcGIS
predForArcGIS <- cbind(trainhouse$X, CVT$fitted.values)

summary(BD5ols)

#created test and training sets 
BD5train <- DupBD5[1:cutoff,]
BD5Testpool <- DupBD5[-(1:cutoff),]

# Run OLS on Training set

Rittenhouse_ols  <- lm(Sale_Pri_1 ~ Garage_Spa + Total_Liva + NEAR_DIST_Vac + NEAR_DIST_ComCor + NEAR_DIST_LIViol + NEAR_DIST_Retail + NEAR_DIST_SeptST + NEAR_DIST_TrafDat + NEAR_DIST_HwyEx + Tax_Build + Tax_Land, data = Rittenhouse, na.action=na.omit)

##################################RMSE & MAE for residuals in BD5ols  #############################


# Calculate error
# `actual` and `predicted` must exist before `error` is computed, and `actual`
# has to be the observed response the model was fitted on, not its residuals.
actual <- model.response(model.frame(BD5ols))
predicted <- predict(BD5ols)
error <- actual - predicted

# Root Mean Squared Error of a vector of errors (e.g. observed - predicted).
# NA values propagate: any NA in `error` makes the result NA.
rmse <- function(error) {
  squared <- error^2
  sqrt(mean(squared))
}

# Fitted values and the observed response, both taken from the model so the two
# vectors cover exactly the rows used in the fit.
y_hat <- fitted.values(BD5ols)
observed <- model.response(model.frame(BD5ols))

# RMSE: compare fitted values with the observed response. Comparing them with
# predict(BD5ols) is always zero because those are the same numbers.
RMSE <- sqrt(mean((observed - y_hat)^2))

# Mean absolute error (MAE)
dim(BD5TRAIN)
# Template signature for reference (not runnable as-is): mae(sim, obs)

MAE <- sum(abs(observed - y_hat)) / length(y_hat)

# Reference example: here the response is logged, so its fitted values would
# need exp() to get back to the original scale.
data(cars)
reg <- lm(log(dist) ~ log(speed), data = cars)
# BD5ols models Sale_Pri_1 on its original scale, so no exp() is applied. The
# observed values come from the model frame so both vectors cover the same rows.
# NOTE(review): MAPE() is not defined in this file; presumably MLmetrics::MAPE.
MAPE(y_pred = fitted(BD5ols), y_true = model.response(model.frame(BD5ols)))


#lets take a closer look at the model error.
#first what is the mean SALES_VOL
mean(BD5TRAIN$Sale_Pri_1)


#calculate the difference and percent change between observed and predicted
BD5TRAIN$diff <- regPredStarbucksValues$obs - regPredStarbucksValues$pred
regPredStarbucksValues$diffP <- (regPredStarbucksValues$obs - regPredStarbucksValues$pred) / regPredStarbucksValues$obs


mean(abs(BD5TRAIN$Sale_Pri_1))


View(Train666_70cut)
View(Test666_30cut)

# Need to make your dataframes numeric
QuickT_Set <- Quick_train_set  #make a copy for safety
colnames(QuickT_Set)  #trying to figure out which columns are not numeric

is.numeric(QuickT_Set$Zoning) #f
is.numeric(QuickT_Set$Taxable_La)#f
is.numeric(QuickT_Set$Depth) #f
is.numeric(QuickT_Set$Garage_Spa) #t
is.numeric(QuickT_Set$Total_Liva) #T
is.numeric(QuickT_Set$NEAR_DIST.Vac) #T
is.numeric(QuickT_Set$NEAR_DIST.ComCor) #t
is.numeric(QuickT_Set$NEAR_DIST.SeptaST) #F
is.numeric(QuickT_Set$NEAR_DIST.TrafDat) #T
is.numeric(QuickT_Set$NEAR_DIST.HwyEx) #T
is.numeric(QuickT_Set$Sale_Pri_1) #T


transform(TrainingSet2, NEAR_DIST.SeptaST = as.numeric(NEAR_DIST.SeptaST))
#Remove all $ in these columns "Taxable_Bu" & "Taxable_La"
gsub("$", "", QTrain175$Depth)
head(BD3$Taxable_La)

#what mode
mode(TrainingSet2$NEAR_DIST.SeptaST)
#####

# The droids you are looking for   aka. the variables you wanna do regression on
#"Depth"             "Garage_Spa"        "Total_Liva"        "NEAR_DIST.Vac"    
#[5] "NEAR_DIST.ComCor"  "NEAR_DIST.LIViol"  "NEAR_DIST.Retail"  "NEAR_DIST.SeptST" 
#[9] "NEAR_DIST.TrafDat" "NEAR_DIST.HwyEx"   "Sale_Pri_1"        "Tax_Build"        
#[13] "Tax_Land"


#But one better! - Output this as backup just in case.
# write.csv() fixes append, col.names, sep, dec and qmethod itself; attempts to
# set them are ignored with a warning, so only the honoured arguments are passed.
write.csv(Quick_Train, file = "C:/Users/Keisan/Documents/CPLN590/Quick_Train.csv",
          quote = TRUE, eol = "\n", na = "NA", row.names = TRUE,
          fileEncoding = "")
write.csv(Quick_Test, file = "C:/Users/Keisan/Documents/CPLN590/Quick_Test.csv",
          quote = TRUE, eol = "\n", na = "NA", row.names = TRUE,
          fileEncoding = "")
stargazer(BD5ols, type="text", title = "Summary Statistics")
View(QTrain70)
str(QTrain70)
head(QTrain70)
#Assign factors as integers
Train85cut$NEAR_DIST.Retail <- as.integer(as.character(Train85cut$NEAR_DIST.Retail))
#Do it for all factoral columns 
Train85cut$NEAR_DIST.ComCor <- as.integer(as.character(Train85cut$NEAR_DIST.ComCor))
Train85cut$NEAR_DIST.SeptST <- as.integer(as.character(Train85cut$NEAR_DIST.SeptST))
Train85cut$NEAR_DIST.Vac <- as.integer(as.character(Train85cut$NEAR_DIST.Vac))
Train85cut$NEAR_DIST.LIViol <- as.integer(as.character(Train85cut$NEAR_DIST.LIViol))
Train85cut$NEAR_DIST.TrafDat <- as.integer(as.character(Train85cut$NEAR_DIST.TrafDat))
Train85cut$NEAR_DIST.HwyEx <- as.integer(as.character(Train85cut$NEAR_DIST.HwyEx))
#format summary statistics output
stargazer(OLS_1, type = "text", title = "Summary Statistics")  #basic
# The leading "+" before iqr was a console continuation prompt pasted into the
# script; it made the argument list unparseable.
# NOTE(review): nobs/mean.sd/median/iqr look like summary-statistics options
# meant for a data frame rather than a model - confirm the intended input.
stargazer(OLS_1, type = "text", nobs = FALSE, mean.sd = TRUE, median = TRUE,
          iqr = TRUE)   #more information in charts

stargazer(QTrain70, type = "text", style = "qje") #style difference 


stargazer(OLS_1, type = "text", 
          title            = "Summary Statistics",
          covariate.labels = c("Garage Space Dimension", "Total Livable Space", "Dist.to Vacancy",
                               "Nearest Commercial Corridor", "Nearest LI Violation", "Nearest Retail", 
                               "Nearest Septa Stop", "Nearby Traffic Density", "Nearest Highway Exit", "Sale Price", "Building Tax","Land Tax"),
          dep.var.caption  = "Dependent Variable",
          dep.var.labels   = "Sale Price")  #rename independent labels

stargazer(QTrain70, type = "text", 
          dep.var.labels.include = FALSE,
          model.numbers          = FALSE) #not interested in this

##########################################################################
# Running an OLS regression on the dataset: list the available columns first.
# (A stray word after the call made this line a syntax error.)
colnames(QTrain70)
##########################################   Runs a regression ###############################!!!!!!!!!!!!!!!!!!
OLS_1  <- lm(Sale_Pri_1 ~ Garage_Spa + Total_Liva + NEAR_DIST.Vac + NEAR_DIST.ComCor + NEAR_DIST.LIViol + NEAR_DIST.Retail + NEAR_DIST.SeptST + NEAR_DIST.TrafDat + NEAR_DIST.HwyEx + Tax_Build + Tax_Land, data = Train85cut)
### make dataset smaller
#####  tHIS IS LIFE ######################
BD5ols  <- lm(Sale_Pri_1 ~ Garage_Spa + Total_Liva + NEAR_DIST_Vac + NEAR_DIST_ComCor + NEAR_DIST_LIViol + NEAR_DIST_Retail + NEAR_DIST_SeptST + NEAR_DIST_TrafDat + NEAR_DIST_HwyEx + Tax_Build + Tax_Land, data = BD5TRAIN)

Rittenhouse_ols  <- lm(Sale_Pri_1 ~ Garage_Spa + Total_Liva + NEAR_DIST_Vac + NEAR_DIST_ComCor + NEAR_DIST_LIViol + NEAR_DIST_Retail + NEAR_DIST_SeptST + NEAR_DIST_TrafDat + NEAR_DIST_HwyEx + Tax_Build + Tax_Land, data = Rittenhouse)

# Split Train175cut in half: first half -> Train85cut, remainder -> Train.
cutoff <- round(0.5 * nrow(Train175cut))
QTrain175 <- QTrain35

first_half <- seq_len(cutoff)
Train85cut <- Train175cut[first_half, ]
# The remainder must come from the same data frame that was split; taking it
# from QTrain would pair halves of two different datasets. setdiff() keeps the
# selection correct even when first_half is empty.
Train <- Train175cut[setdiff(seq_len(nrow(Train175cut)), first_half), ]
dim(Train85cut)

#regression ran after you subsetted the data twice. Hooray! you have some data to plot visualizations
#based on! this is the main point.
#Now just output that file for safety reasons...
# An lm object cannot be coerced to a data.frame, so export its coefficient
# table (one row per term) instead. The call must also start on the same line
# as its opening parenthesis, otherwise R treats `write.csv` as a complete
# expression and the following "(" as a syntax error.
write.csv(as.data.frame(coef(summary(OLS_1))),
          file = "C:/Users/Keisan/Documents/CPLN590/OLS_1.csv",
          quote = TRUE, eol = "\n", na = "NA", row.names = TRUE,
          fileEncoding = "")

View(OLS_1) #can't view it saves as a function
#summary  of OLS_1

summary(BD5ols)


library(corrplot)
CorBD4 <- cor(BD4)

cor(Train85cut) #produces correlation coefficients in list 
TR85cut <- cor(Train85cut)
#Now to visualize
corrplot.mixed(TR85cut, order ="AOE",method ="color",addCoef.col="grey") #not working :(
# If the matrix has missing values in it, replace them with 0.
# R names are case-sensitive: the mask must be built from the same object that
# is being assigned into (QMat), not from a differently-cased name.
QMat[is.na(QMat)] <- 0
#pairwise obs
QTMat <- QtestMatrix
cor(QtestMatrix, use="pairwise.complete.obs")
corrplot(TR85cut, method = "number")


# Drop incomplete rows, then redo the reformatting/subsetting steps on the result.
Quick_Train666 <- na.omit(Quick_Train)
Train666 <- Quick_Train666

# 70/30 split sized from Train666 itself. Reusing the global `cutoff` would
# apply a row count computed from whichever data frame set it last.
cutoff666 <- round(0.7 * nrow(Train666))
train_rows <- seq_len(cutoff666)
Train666_70cut <- Train666[train_rows, ]
Test666_30cut <- Train666[setdiff(seq_len(nrow(Train666)), train_rows), ]
is.numeric(Train85cut$NEAR_DIST.LIViol)


#rmse, rse, etc for training set

# Mean absolute error (MAE) of OLS_1: the average size of its residuals.
# (mean(abs(Sale_Pri_1)) is the average sale price, not an error measure.)
mean(abs(residuals(OLS_1)))

# Mean absolute percent error (MAPE): each residual relative to the observed
# price it belongs to, averaged. The observed prices come from the model frame
# so they line up with the residuals.
mean(abs(residuals(OLS_1) / model.response(model.frame(OLS_1))))


#plot variation in regression output

x <- rnorm(1000)
y <- rnorm(1000)
bin<-hexbin(x, y, xbins=50) 
plot(bin, main="Hexagonal Binning")

hexbin(Train85cut)

#stylize the corrplot: 
col1 <- colorRampPalette(c("#7F0000","red","#FF7F00","yellow","white", 
                           "cyan", "#007FFF", "blue","#00007F"))
col2 <- colorRampPalette(c("#67001F", "#B2182B", "#D6604D", "#F4A582", "#FDDBC7",
                           "#FFFFFF", "#D1E5F0", "#92C5DE", "#4393C3", "#2166AC", "#053061"))  
col3 <- colorRampPalette(c("red", "white", "blue")) 
col4 <- colorRampPalette(c("#7F0000","red","#FF7F00","yellow","#7FFF7F", 
                           "cyan", "#007FFF", "blue","#00007F"))   
wb <- c("white","black")
# Open the correlation matrix in the data viewer. The base viewer is View()
# (capital V); R names are case-sensitive.
View(CorBD4)

corrplot(CorBD4, method="square", order = "FPC", cl.pos="n") #corrplot of Train85cut OLS output, using squares, First Class Principle, no color ramp legend,
head(Train85cut)


# Correlation plot of Train85cut with significance marking.
# The p-value matrix must be computed from the same data that is plotted;
# res1 holds p-values for mtcars and does not describe Train85cut.
# Requires cor.mtest() to be defined/loaded before this runs.
pvals85 <- cor.mtest(Train85cut)[[1]]
corrplot(cor(Train85cut), method="square", order = "FPC", cl.pos="n", p.mat = pvals85, sig.level = 0.02, insig = c("p-value"))

# Benjamini-Hochberg adjustment of those p-values for multiple comparisons.
pAdj <- p.adjust(c(pvals85), method = "BH")


###select specific columns and drop them
# General pattern: subset(df, select = -c(z, u))
# Assign with `<-` (">" is a comparison), list only column names inside c(),
# and spell the column exactly as it appears in the data (NEAR_DIST.TrafDat).
Train85cut2 <- subset(Train85cut, select = -c(NEAR_DIST.TrafDat))
############## CORRELATION OUTPUT with significance levels indicated by an "X"

# Pairwise correlation tests for every pair of columns in `mat`.
#
# Args:
#   mat:        matrix or data frame; coerced with as.matrix(), one variable
#               per column.
#   conf.level: confidence level passed to cor.test() for the intervals.
#
# Returns:
#   A list of three n x n matrices (n = number of columns), in this order so
#   positional access (res[[1]], res[[2]], res[[3]]) keeps working:
#     p.mat     - p-values, diagonal set to 0
#     lowCI.mat - lower confidence bounds, diagonal set to 1
#     uppCI.mat - upper confidence bounds, diagonal set to 1
cor.mtest <- function(mat, conf.level = 0.95) {
  mat <- as.matrix(mat)
  n <- ncol(mat)
  p.mat <- lowCI.mat <- uppCI.mat <- matrix(NA, n, n)
  diag(p.mat) <- 0
  diag(lowCI.mat) <- diag(uppCI.mat) <- 1
  # seq_len() leaves the loop empty when there are fewer than two columns;
  # 1:(n - 1) would count down (1, 0) and index a column that does not exist.
  for (i in seq_len(max(n - 1, 0))) {
    for (j in (i + 1):n) {
      tmp <- cor.test(mat[, i], mat[, j], conf.level = conf.level)
      # The matrices are symmetric, so fill both triangles at once.
      p.mat[i, j] <- p.mat[j, i] <- tmp$p.value
      lowCI.mat[i, j] <- lowCI.mat[j, i] <- tmp$conf.int[1]
      uppCI.mat[i, j] <- uppCI.mat[j, i] <- tmp$conf.int[2]
    }
  }
  list(p.mat = p.mat, lowCI.mat = lowCI.mat, uppCI.mat = uppCI.mat)
}

# Worked example on mtcars: the plotted correlation matrix and the p-value
# matrices must describe the same data, so M is computed here from mtcars.
M <- cor(mtcars)
res1 <- cor.mtest(mtcars, 0.95)
res2 <- cor.mtest(mtcars, 0.99)
## mark correlations that are insignificant at the chosen significance level
corrplot(M, p.mat = res1[[1]], sig.level = 0.2)

hist(QTrain175$Sale_Pri_1)
summary(OLS_1)
summary(OLS_1)
######################## RMSE
summary(Train85cut)
lm(formula = Sale_Pri_1 ~ Garage_Spa + NEAR_DIST.LIViol + NEAR_DIST.Retail, data = Train85cut)
# m sets the number of folds (8 here).
CV <- CVlm(data = trainhouse5, CVT, m = 8)
exists("fit")

# Read the cross-validated mean square from the object CVlm() returned (CV),
# not from CVT, which was only an input to the call.
# NOTE(review): relies on CVlm() attaching an "ms" attribute - confirm against
# the installed DAAG version.
mse <- attr(CV, "ms")
# NOTE(review): this numeric `rmse` replaces any rmse() function defined in the
# same environment until that function is defined again.
rmse <- sqrt(mse)    #Obtaining RMSE for model 1
rmse
#Now were going to see how generalizable some of these ind. variables are across both Starbucks and Dunkin Donuts.
#The following lines of code do the same thing as above to create the 'training2' data frame but this time using
#only Starbucks and Dunkin Donuts locations.
#Create a new data frame of just starbucks and DD locations
StarbucksOrDunkin <- biz[ which(biz$CONAME == "DUNKIN' DONUTS" | biz$CONAME == "STARBUCKS" ), ]
#create a dummy variablefor DD
StarbucksOrDunkin$isDunkin = ifelse(StarbucksOrDunkin$CONAME == "DUNKIN' DONUTS" ,1,0)
#create a new data frame of just the important variables.
SD2 <- StarbucksOrDunkin[,22:40] #create a new data frame of most of the ind. variables
SD2 <- SD2[,-5] #removes one column from that
SD3 <- StarbucksOrDunkin[, c(12,10)] # include to other columns
SD4 <- cbind(SD2, SD3) #merge together the pertiant variables
# Columns of interest: Garage_Spa Total_Liva NEAR_DIST.Vac NEAR_DIST.ComCor NEAR_DIST.LIViol NEAR_DIST.Retail NEAR_DIST.SeptST NEAR_DIST.TrafDat NEAR_DIST.HwyEx Sale_Pri_1 Tax_Build Tax_Land


#///////////////////////////////////// 5/26/2017////////////////////////////////////////////////////////////
# Copied from Ken's Dunkin Assignment. Going to redo using my own data. K folding

# PART 4 - Cross validation time.------------------------------------------------------------------
  
  #were going to use the original training set
  
  #Perform 10-fold cross validation
  #begin by specifying the type of resampling (cross validation)
  View(QTrain175)
# Resampling scheme: 10-fold cross validation.
CV_Sampletest <- trainControl(method = "cv", number = 10)
str(QTrain175)
# Run the cross validation. The '.' on the right-hand side of the formula means
# "every other column of the data frame", so the independent variables do not
# have to be written out one by one.
# trControl must be the control object created above (fitControl was never
# defined), and the stray "+mymodel" argument is dropped. Rows with NA values
# are removed via na.action.
mymodel <- train(Sale_Pri_1 ~ ., data = Sample_Test, method = "lm",
                 trControl = CV_Sampletest, na.action = na.omit)
#^ remove NA values
#removes a whole column :/  >   Qtr1 <- Qtr1[, colSums(is.na(Qtr1)) == 0]
#This is the Cross validation test code
# Build the formula "Sale_Pri_1 ~ <every other column of Sample_Test>".
# The response has to be named explicitly; the placeholder "y" produced a
# formula whose left-hand side does not refer to the modelled column and left
# Sale_Pri_1 among the predictors.
n <- names(Sample_Test)
f <- as.formula(paste("Sale_Pri_1 ~", paste(n[!n %in% "Sale_Pri_1"], collapse = " + ")))
f

# Fit a plain linear model with caret on the complete rows of Sample_Test.
# na.omit() belongs around the data; wrapped around the whole argument list it
# received `data` and `method` itself and train() was left with a bare formula.
library(caret)
mymodel <- matrix(rnorm(100), ncol = 10)
f <- c(paste("V", 1:5, sep = ""), "Sale_Pri_1")
predictors <- train(Sale_Pri_1 ~ ., data = na.omit(Sample_Test), method = "lm")

#why did you get this warning?
summary(linmodfit)
trainhouse5 <- trainhouse5[,-14] #removes one column from that data frame

#now run it again.

### Trying CV Again#######################

#Randomly shuffle the data
ShuffleData <- BD5TRAIN[sample(nrow(BD5TRAIN)), ]

#Create 10 equally sized folds
# Fold labels are assigned over the shuffled rows. Building them on a different,
# unshuffled data frame left ShuffleData unused and the folds in file order.
folds <- cut(seq_len(nrow(ShuffleData)), breaks = 10, labels = FALSE)

#Perform 10 fold cross validation
for (i in 1:10) {
  # Rows labelled i form the test partition; all other rows form the training one.
  testIndexes <- which(folds == i)
  TesteData <- ShuffleData[testIndexes, ]
  TrainneData <- ShuffleData[-testIndexes, ]
  #Use the test and train data partitions however you desire...
}
#########################################////   RMSE,  MAE,  R^2 ////##########################################

#predicting based on OLS
predict(OLS_1) #worked
#also worked- 
# Predict from OLS_1 for every row of QTrain175. na.pass keeps rows with missing
# predictors (their prediction is NA). The remaining arguments that used to be
# listed here were the documented default signature pasted verbatim, including
# pred.var = res.var/weights, which names variables that do not exist here.
predicted <- predict(OLS_1, QTrain175, na.action = na.pass)

# Root Mean Squared Error of a vector of errors (e.g. observed - predicted).
# NA values propagate: any NA in `error` makes the result NA.
rmse <- function(error) {
  squared <- error^2
  sqrt(mean(squared))
}

# Mean Absolute Error of a vector of errors (e.g. observed - predicted).
# NA values propagate: any NA in `error` makes the result NA.
mae <- function(error) {
  magnitudes <- abs(error)
  mean(magnitudes)
}

# Example data
actual <- c(4, 6, 9, 10, 4, 6, 4, 7, 8, 7)
predicted <- c(5, 6, 8, 10, 4, 8, 4, 9, 8, 9)

# Calculate error
# Element-wise difference of the two example vectors above; subtracting the
# predictions from an unrelated data frame does not give their error.
error <- actual - predicted

# Example of invocation of functions
rmse(error)
mae(error)

# Example in a linear model
## Annette Dobson (1990) "An Introduction to Generalized Linear Models".
## Page 9: Plant Weight Data.
ctl <- c(4.17,5.58,5.18,6.11,4.50,4.61,5.17,4.53,5.33,5.14)
trt <- c(4.81,4.17,4.41,3.59,5.87,3.83,6.03,4.89,4.32,4.69)
# Two-level factor, 10 observations per level, in the same order as
# c(ctl, trt) so each weight is paired with its own group.
group <- gl(2, 10, 20, labels = c("Ctl", "Trt"))
weight <- c(ctl, trt)
lm.D9 <- lm(weight ~ group)
rmse(lm.D9$residuals) # root mean squared error


####still trying to predict :(

#this is OLS_1 model <- lm(Coupon ~ Total, data=df)
new.df <- data.frame(Total=c(79037022, 83100656, 104299800))
predict(OLS_1, TrainneData)
names(QTest$Sale_Pri_1)
names(Quick_Test)


#plot the residuals against the predicted and observed
xyplot(residuals(model) ~ predict(model), type = c("p", "g"), xlab = "Predicted", ylab = "Residuals", col.line = "red")
# not working >  
xyplot(resid(OLS_1) ~ QTrain175$Sale_Pri_1, type = c("p", "g"), xlab = "Observed", ylab = "Residuals", col.line = "red")
#Step 2: obtain predicted and residual values
#Next, we want to get predicted and residual values to add supplementary information to this graph. We can do this as follows:
dupQTrain175 <- QTrain175  
dupQTrain175$predicted <- predict(OLS_1)   # Save the predicted values
dupQTrain175$residuals <- residuals(OLS_1) # Save the residual values
dupQTrain175
# Quick look at the actual, predicted, and residual values
library(dplyr)
dupQTrain175 %>% select(Sale_Pri_1, predicted, residuals) %>% head()                
## Plotting the residuals of OLS_1 against the observed values
OLS_2 <- OLS_1
OLS_1.lm <- lm(eruptions ~ waiting, data = faithful)
OLS_2.res <- resid(OLS_1)
# x comes from the model frame so it has one value per residual; a column of
# another data frame may have a different number of rows. The "+" that used to
# precede ylab was a pasted console continuation prompt.
plot(model.response(model.frame(OLS_1)), OLS_2.res,
     ylab = "Residuals", xlab = "Observations")
abline(0, 0)                  # the horizon
## cross validation method###################################
#Randomly shuffle the data
yourData<-yourData[sample(nrow(yourData)),]

#Create 10 equally size folds
folds <- cut(seq(1,nrow(yourData)),breaks=10,labels=FALSE)

#Perform 10 fold cross validation
for(i in 1:10){
  #Segement your data by fold using the which() function 
  testIndexes <- which(folds==i,arr.ind=TRUE)
  testData <- yourData[testIndexes, ]
  trainData <- yourData[-testIndexes, ]
  #Use the test and train data partitions however you desire...
}

#More cross validation stuff
train_control <- trainControl(method = "cv", number = 10)
# train the model
# Sale_Pri_1 is the numeric response of the lm() fits in this file, so this is
# a regression problem. Naive Bayes ("nb") and its fL/usekernel tuning grid are
# for classification, so a linear model is used and the grid is dropped.
model <- train(Sale_Pri_1 ~ ., data = Sample_Test, trControl = train_control, method = "lm")
# summarize results
print(model)

# Self-contained toy example: 9 random predictors plus a random response.
# train() with a formula needs a data frame, the matrix() call needs its
# closing parenthesis, and 10 columns need exactly 10 names.
library(caret)
df1 <- as.data.frame(matrix(rnorm(100), ncol = 10))
colnames(df1) <- c(paste0("V", 1:9), "Sale_Pri_1")
linmodfit <- train(Sale_Pri_1 ~ ., data = df1, method = "lm")
print(df1)
                              #output file to csv-   
                              write.csv(QTrain175, file = "C:/Users/Keisan/Documents/CPLN590/MIDTERM/DATA/QTrain175.csv")
                              write.csv(QTrain175, file = "C:/Users/Keisan/Documents/CPLN590/MIDTERM/DATA/QTrain175.csv")
                              install.packages("broom")
                              library(broom)
                              #Call
                              OLS_1
                              
                              tidy_OLS_1 <- tidy(OLS_1)
                              tidy_OLS_1
                              
                              # summary(OLS_1) is a list and cannot be coerced to a data frame;
                              # export its coefficient table (one row per model term) instead.
                              write.csv2(as.data.frame(coef(summary(OLS_1))), file="OLS_1.csv")
                              
                              
                              #------------------------------------------------------------------------------------------------------------------
                              #  PART 5 - Out of sample prediction 2 - Starbucks------------------------------------------------------------------
                              
                              #Recall that reg is the regression on just the FULL SET of training data. 
                              #Now were going to use that to predict for the 17 candidate starbucks locations
                              #Then of course, because we actually have the SALES_VOL figures for those starbucks, we will calculate some
                              #error
                              
                              #subject these predictions on to the testing data frame w/ the 17 candidate locations
                              regPredStarbucks <- predict(reg, testing)
                              #now create a new data frame where one column is the dependent variable 'SALES_VOL' from testing 
                              #(Which is column 10)
                              regPredStarbucksValues <- data.frame(obs = testing[,10], pred = regPredStarbucks)
                              head(regPredStarbucksValues)
                              #Finally regress these predicted values from reg2 on to the SALES_VOL from the random2Test data frame
                              defaultSummary(regPredStarbucksValues)
                              #calculate the difference and percent change between observed and predicted
                              regPredStarbucksValues$diff <- regPredStarbucksValues$obs - regPredStarbucksValues$pred
                              regPredStarbucksValues$diffP <- (regPredStarbucksValues$obs - regPredStarbucksValues$pred) / regPredStarbucksValues$obs
                              #now the mean residual or error from the regression 'reg' aka the 'mean absolute error' aka the MAE
                              mean(abs(regPredStarbucksValues$diff))
                              #calculate the MAPE - or mean absolute percent error aka MAPE
                              mean(abs(regPredStarbucksValues$diffP))
                              #update this data frame to include the FID field
                              regPredStarbucksValues <- cbind(testing$FID, regPredStarbucksValues)
                              #change columns names and output for arcgis
                              colnames(regPredStarbucksValues) <- c("FID", "observed", "predicted", "diff", "diffPct")
                              write.csv(regPredStarbucksValues, file = "C:/Users/kennethsteif/regPredStarbucksValues.csv")
                              # include two other columns
                              SD4 <- cbind(SD2, SD3) #merge together the pertiant variables
                              # Columns of interest: Garage_Spa Total_Liva NEAR_DIST.Vac NEAR_DIST.ComCor NEAR_DIST.LIViol NEAR_DIST.Retail NEAR_DIST.SeptST NEAR_DIST.TrafDat NEAR_DIST.HwyEx Sale_Pri_1 Tax_Build Tax_Land
                              
                              #creates a dataset of just these columns 
                              
                              BD5 <- BD4[ , c("Zoning","Taxable_Bu", "Taxable_La", "Sale_Price", "Sale_Year")]
                              
                              
                              BD4[ , -which(names(BD4) %in% c("Zoning","Taxable_Bu", "Taxable_La", "Sale_Price", "Sale_Year"))]
                              # Column names (pasted console output):
                              #  [1] "FID_Main"          "Zoning"            "Taxable_Bu"        "Taxable_La"        "Depth"             "Garage_Spa"
                              #  [7] "Total_Liva"        "NEAR_DIST_Vac"     "NEAR_DIST_ComCor"  "NEAR_DIST_LIViol"  "NEAR_DIST_Retail"  "NEAR_DIST_SeptST"
                              # [13] "NEAR_DIST_TrafDat" "NEAR_DIST_HwyEx"   "Sale_Pri_1"        "Sale_Price"        "Sale_Year"         "Tax_Build"
                              # [19] "Tax_Land"          "Lat1"              "Long1"
                              
                              
                              # OLS of sale price on the property and distance variables.
                              # The data frame has to be passed as `data =`; written as the last
                              # "+" term it became a predictor in the formula. NEAR_DIST_Vac uses
                              # an underscore like the other NEAR_DIST_* terms in this formula.
                              # Rows with missing values are dropped (na.omit).
                              BD5Trainols_1 <- lm(Sale_Pri_1 ~ Garage_Spa + Total_Liva + NEAR_DIST_Vac +
                                                    NEAR_DIST_ComCor + NEAR_DIST_LIViol + NEAR_DIST_Retail +
                                                    NEAR_DIST_SeptST + NEAR_DIST_TrafDat + NEAR_DIST_HwyEx,
                                                  data = BD5TRAIN, na.action = na.omit)
                              
                              head(BD5TRAIN)
                              
                              
                              #######################################################################   Super Important!!!    #########
                              # these don't match for some reason
                              
                              # Template kept for reference (pasted console lines, not runnable as-is):
                              #   fitO <- lm(Y ~ x, na.action = na.omit)     # fit with na.omit
                              #   fitE <- lm(Y ~ x, na.action = na.exclude)  # fit with na.exclude
                              #   residuals(fitE)                            # use extractor function -> = N
                              # resid() returns a plain vector, so its size is checked with
                              # length(); dim() of a vector is NULL.
                              length(resid(BD5Trainols))                  # use extractor function
                              summary(BD5Trainols)
                              
                              
                              # Store residuals and fitted values of BD5ols side by side on the
                              # same data frame, so both columns describe the same rows.
                              BD5TRAIN$resid <- resid(BD5ols)
                              BD5TRAIN$fitted <- fitted(BD5ols)
                              
                              #RMSE   WITH na values
                              # mean(..., na.rm = TRUE) divides by the number of non-missing squared
                              # errors. Dividing the NA-stripped sum by nrow(BD5TRAIN) also counts
                              # the dropped rows and understates the RMSE when values are missing.
                              sqrt(mean((BD5TRAIN$Sale_Pri_1 - BD5TRAIN$predict)^2, na.rm = TRUE))
                              
                              # MEAN WITH na values
                              TSC <- BD5TRAIN$Sale_Pri_1
                              PRF <- BD5TRAIN$predict
                              mean(TSC, na.rm = TRUE)


                              #MAE WITH na values
                              # library() stops with an error if the package is missing; require()
                              # only returns FALSE and lets the script fail later.
                              library(hydroGOF)
                              # Usage template for reference: mae(sim, obs, na.rm = TRUE, ...)
                              mae(PRF, TSC, na.rm = TRUE)
                              #"mae": mean absolute error, which is calculated as sum(|t_i - p_i|)/N,
                              #where t's are the true values and p's are the predictions,
                              #while N is supposed to be the size of both vectors.
                              # Same quantity by hand: absolute differences, averaged over the
                              # non-missing pairs (a signed sum lets errors cancel out).
                              mean(abs(TSC - PRF), na.rm = TRUE)
                              
                              #MAPE WITH na values
                              # Mean absolute percent error: each error relative to its own sale
                              # price, averaged over non-missing rows. mean(abs(Error)) alone is
                              # the mean absolute error, not a percentage.
                              # NOTE(review): assumes Error is in the same units as Sale_Pri_1 - confirm.
                              mean(abs(BD5TRAIN$Error / BD5TRAIN$Sale_Pri_1), na.rm = TRUE)
                              
                              # Choose a single house: one row drawn at random from BD5TRAIN.
                              SINGLE <- BD5TRAIN[sample(nrow(BD5TRAIN), 1), ]
                              # R names are case-sensitive: the viewer is View() and the object
                              # created above is SINGLE.
                              View(SINGLE)
                              head(SINGLE)
                              
                              #choose random sample 
                              dim(BD6)
                              Sample_Test <- BD5TRAIN[sample(nrow(BD5TRAIN), 2519), ]
                              head(SINGLE)
                              # Install devtools only when it is missing, then install the GitHub
                              # package using the "owner/repo" form that install_github() expects.
                              if (!requireNamespace("devtools", quietly = TRUE)) {
                                install.packages("devtools")
                              }
                              library(devtools)
                              install_github("mrdwab/mrdwabmisc")
                              
                              ## nto working >  can't find sratified function
                              #stratified(BD6_Neighb_Jo, group="NAME", size=., select=list(NAME = "RITTENHOUSE"))
                              
                              Rittenhouse <- BD6_Neighb_Jo[ sample( which(BD6_Neighb_Jo$NAME=='RITTENHOUSE'), round(1.0*length(which(BD6_Neighb_Jo$NAME=='RITTENHOUSE')))), ]
                              
                              
                              #lists unique items in a list
                              unique(unlist(BD6_Neighb_Jo$NAME))
                              unique(unlist(BD6_Neighb_Jo$NAME))
                              
                              # Count the number of rows for each unique NAME.
                              # The data frame is BD6_Neighb_Jo (case-sensitive), and the count is
                              # taken over NAME itself rather than a column from another example.
                              ddply(BD6_Neighb_Jo, ~NAME, summarise, number_of_rows = length(NAME))
                              
                              Total_nhoods <- length(unique(BD6_Neighb_Jo$NAME))
                              
                              
                              #new dataframe with rows as column names from another dataframe
                              
                              colnames(BD5TRAIN) <- neighborhoodMAPE[1, ]
                              neighborhoodMAPE[1,119]
                              
                              
                              # Use the first row as column names, then drop that row.
                              # The key here is to unlist the row first.
                              colnames(BD6_Neighb_Jo) <- as.character(unlist(BD6_Neighb_Jo[1, ]))
                              BD6_Neighb_Jo <- BD6_Neighb_Jo[-1, ]
                              
                              #mpty dataframe with 5 column names
                              
                              Total_hoods <- as.data.frame(setNames(replicate(5,numeric(0), simplify = F), letters[1:5]))
                              head(Total_hoods)
                              unique(BD6_Neighb_Jo$NAME) <- Total_hoods[1, ]
                              Total_hoods[1,] <- unique(BD6_Neighb_Jo)
                              
                              colnames(X)[2] <- "superduper" #changing column names
                              
                              tapply(X = BD6_Neighb_Jo$Error/df$HomePr, INDEX = df$Town, mean)
                              
                              
                              #merge errors with neighborhoods
                              
                              BD6_Trainset <- data.frame(
                              
                              )
                              BD6Train_Full <- merge(BD6_NJo, BD5TRAIN2, na.exclude = TRUE, by=c("Lat1", "Long1"))
                              
                              merge(x=BD6_NJo, y=BD5TRAIN2, by.x=c("Lat1","Long1"), by.y=c("Lat1","Long1"))
                              
                              #predicting values for 3 test sets.
                              predict(single_ols)
                              predict(ten_ols)
                              predict(Rittenhouse_ols)
                              
                              #saving predictions
                              N_hood10$predict <- predict(ten_ols)
                              SINGLE$predict <- predict(single_ols)
                              Rittenhouse$predict <- predict(Rittenhouse_ols)
                              
                              
                              # Save the residual values
                              
                              SINGLE$residuals <- residuals(single_ols)
                              N_hood10$residuals <- residuals(ten_ols)
                              Rittenhouse$residuals <- residuals(Rittenhouse_ols)
                              
                              
                              #save fitted values
                              
                              SINGLE$fitted_Value  <- fitted.values(single_ols)
                              N_hood10$fitted_Value <- fitted.values(ten_ols)
                              Rittenhouse$fitted_Value <- fitted.values(Rittenhouse_ols)
                              head(SINGLE)
                              
                              #write these files for arcgis.
# Export the sample sets and the MAPE table as CSV.
# write.csv() fixes append, col.names, sep, dec and qmethod itself; attempts to
# set them are ignored with a warning, so only the honoured arguments are passed.
write.csv(Sample_Test, file = "C:/Users/Keisan/Documents/CPLN590/MIDTERM/DATA/Sample_Test.csv",
          quote = TRUE, eol = "\n", na = "NA", row.names = TRUE,
          fileEncoding = "")

write.csv(N_hood10, file = "C:/Users/Keisan/Documents/CPLN590//MIDTERM/DATA/Random_Sample10.csv",
          quote = TRUE, eol = "\n", na = "NA", row.names = TRUE,
          fileEncoding = "")

write.csv(Rittenhouse, file = "C:/Users/Keisan/Documents/CPLN590/MIDTERM/DATA/Rittenhouse.csv",
          quote = TRUE, eol = "\n", na = "NA", row.names = TRUE,
          fileEncoding = "")
# to export to get the neighborhood names. easier than trying to fig out in r

write.csv(The_MAPE, file = "C:/Users/Keisan/Documents/CPLN590/MIDTERM/DATA/The_Mape.csv",
          quote = TRUE, eol = "\n", na = "NA", row.names = TRUE,
          fileEncoding = "")
                              
                              
                              #MAPE By Town

                              library(dplyr); library(tibble)

                              # Working copy of the joined training data; assigned before anything
                              # below reads from it.
                              BDTR2 <- BD5TRAIN2_jo2

                              actual <- BDTR2$Sale_Pri_1
                              forecasted <- BDTR2$predict

                              # Mean absolute percentage error of `forecasted` against `actual`,
                              # in percent. Pairs with an NA are ignored; an `actual` of 0 gives Inf.
                              mape <- function(actual, forecasted) {
                                mean(abs((actual - forecasted) / actual), na.rm = TRUE) * 100
                              }

                              # One row per NAME: mean sale price, mean error, and the MAPE of that
                              # group's own predictions. MAPE is computed from the individual rows;
                              # feeding it the group means (and the mean error as the "forecast")
                              # does not yield a percentage error.
                              The_MAPE <- tibble(
                                Sale_Price = BDTR2$Sale_Pri_1,
                                Predicted = BDTR2$predict,
                                Error = BDTR2$Error,
                                NAME = as.character(BDTR2$NAME)
                              ) %>%
                                group_by(NAME) %>%
                                summarise(means_pr = mean(Sale_Price),
                                          means_err = mean(Error),
                                          Mape = mape(Sale_Price, Predicted))
                              
                              
                              ## another mape grouping option
                              ## both work :)
                              tapply(X = BD5TRAIN2_jo2$Error/BD5TRAIN2_jo2$Sale_Pri_1, INDEX = BD5TRAIN2_jo2$NAME, mean)
                              
                              #choose random sample
                              # Check the size of the frame that is actually sampled from.
                              dim(BD5TRAIN)
                              # Fixed seed so the same rows are drawn on every run.
                              set.seed(2519)
                              # Never ask for more rows than exist; sample() without replacement would
                              # stop with an error.
                              sample_size <- min(2519, nrow(BD5TRAIN))
                              Sample_Test <- BD5TRAIN[sample(nrow(BD5TRAIN), sample_size), ]
                              dim(Sample_Test)
                              
                              ################################################## CARET Cross Validation  ###############################
                              
                              # library() stops with an error when a package is missing; require() only
                              # returns FALSE and lets the script fail later in a less obvious place.
                              library(RCurl)
                              library(prettyR)

                              #save a workspace
                              save.image("~/CPLN590/MIDTERM/DATA/Rworkspace_Steif_6-13-2017.RData")

                              ####

                              # Response as a plain vector. [[ ]] returns a vector for both data frames
                              # and tibbles, which createDataPartition() needs.
                              classes <- Sample_Test[["Sale_Pri_1"]]

                              # Predictor columns of the price model. The names are selected directly.
                              # The earlier negative match() removed exactly these columns, and the
                              # `na.action = na.omit` inside c() put a function into the name vector,
                              # which broke the column lookup.
                              predictor_names <- c("Garage_Spa", "NEAR_DIST_TrafDat", "NEAR_DIST_HwyEx",
                                                   "Total_Liva", "NEAR_DIST_Vac", "NEAR_DIST_ComCor",
                                                   "NEAR_DIST_LIViol", "NEAR_DIST_Retail",
                                                   "NEAR_DIST_SeptST", "Tax_Land", "Tax_Build")
                              predictors <- Sample_Test[, predictor_names, drop = FALSE]

                              # Row indices of an 80% training partition drawn on the response.
                              train_set <- createDataPartition(classes, p = 0.8, list = FALSE)
                              str(train_set)
                              #######################
                              
                              ######################
                              ###################33
                              ####################################3
                              
                              
                              ##############################################################################################################
                              
                              
                              # original example from Digg Data website (Takashi J. OZAKI, Ph. D.) 
                              # http://diggdata.in/post/58333540883/k-fold-cross-validation-in-r
                              
                              
                              library(plyr)
                              # install.packages() takes the package name as a string; only install
                              # when the package is not available yet.
                              if (!requireNamespace("randomForest", quietly = TRUE)) {
                                install.packages("randomForest")
                              }
                              library(randomForest)

                              # k-fold cross validation of a linear model for Sale_Pri_1 on Sample_Test,
                              # adapted from the iris example referenced above.

                              k <- 5 #Folds

                              # Random fold id (1..k) for every row. It is kept in its own vector so
                              # that no extra column is added to Sample_Test.
                              fold_id <- sample(seq_len(k), nrow(Sample_Test), replace = TRUE)

                              # Nothing can be fitted if the data holds only missing values.
                              if (all(is.na(Sample_Test))) {
                                stop("Sample_Test contains only missing values")
                              }

                              # One data frame of predicted/actual values per fold, bound once at the
                              # end instead of growing a data frame inside the loop.
                              fold_results <- vector("list", k)

                              #Creating a progress bar to know the status of CV
                              # "text" is a progress bar type plyr provides.
                              progress.bar <- create_progress_bar("text")
                              progress.bar$init(k)

                              for (i in seq_len(k)) {
                                # rows outside fold i train the model, rows in fold i test it
                                trainingset <- Sample_Test[fold_id != i, , drop = FALSE]
                                testset <- Sample_Test[fold_id == i, , drop = FALSE]

                                # The response is named in the formula so that `.` expands to the other
                                # columns only. lm() has no `ntree` argument.
                                # NOTE(review): categorical columns whose levels occur only in the test
                                # fold make predict() fail; restrict the formula if that happens.
                                mymodel <- lm(Sale_Pri_1 ~ ., data = trainingset)

                                fold_results[[i]] <- data.frame(
                                  Predicted = as.numeric(predict(mymodel, newdata = testset)),
                                  Actual = testset[["Sale_Pri_1"]]
                                )

                                progress.bar$step()
                              }

                              # predictions and actual sale prices of all folds
                              result <- do.call(rbind, fold_results)
                              result$Difference <- abs(result$Actual - result$Predicted)

                              # As an example use Mean Absolute Error as Evaluation
                              summary(result$Difference)
                              
                              library(boot)
                              k    <- 10

                              # cv.glm() refits the model on row subsets of `data`, so `data` must be
                              # exactly the rows the model was fitted on. Incomplete rows are dropped
                              # once here, which gives the same fit as na.action = na.omit.
                              cv_vars <- c("Sale_Pri_1", "Garage_Spa", "Total_Liva", "NEAR_DIST_Vac",
                                           "NEAR_DIST_ComCor", "NEAR_DIST_LIViol", "NEAR_DIST_Retail",
                                           "NEAR_DIST_SeptST", "NEAR_DIST_TrafDat", "NEAR_DIST_HwyEx",
                                           "Tax_Build", "Tax_Land")
                              cv_data <- na.omit(Sample_Test[, cv_vars])

                              # Gaussian GLM with identity link, i.e. the OLS price model.
                              GLM_ST_ols <- glm(Sale_Pri_1 ~ Garage_Spa + Total_Liva + NEAR_DIST_Vac +
                                                  NEAR_DIST_ComCor + NEAR_DIST_LIViol + NEAR_DIST_Retail +
                                                  NEAR_DIST_SeptST + NEAR_DIST_TrafDat + NEAR_DIST_HwyEx +
                                                  Tax_Build + Tax_Land,
                                                data = cv_data,
                                                family = gaussian(link = "identity"))

                              ## template the call above was adapted from:
                              ##glmFit <- glm(Y ~ X1 + X2 + X3, data=dfRegr,
                              ##              family=gaussian(link="identity"))

                              # k-fold cross validation of the model fitted above, on the data it was
                              # fitted on. delta holds the raw and the adjusted CV prediction error.
                              kfCV <- cv.glm(data = cv_data, glmfit = GLM_ST_ols, K = k)
                              kfCV$delta
                              summary(kfCV)
library(plyr)
library(randomForest)

# k-fold cross validation of a linear model for Sale_Pri_1, run on a copy of
# Sample_Test that carries a fold id column.
data <- Sample_Test

k <- 5 #Folds

# sample from 1 to k, nrow times (the number of observations in the data)
data$id <- sample(seq_len(k), nrow(data), replace = TRUE)

# One data frame of predicted/actual values per fold, bound once after the
# loop instead of growing a data frame on every iteration.
fold_results <- vector("list", k)

#Creating a progress bar to know the status of CV
# "text" is a progress bar type plyr provides.
progress.bar <- create_progress_bar("text")
progress.bar$init(k)

for (i in seq_len(k)) {
  # rows with another fold id train the model, rows with id i test it
  trainingset <- data[data$id != i, , drop = FALSE]
  testset <- data[data$id == i, , drop = FALSE]

  # The response is named in the formula so that `.` does not pull it in as
  # a predictor; the fold id is excluded as well. lm() has no `ntree`
  # argument.
  # NOTE(review): categorical columns whose levels occur only in the test
  # fold make predict() fail; restrict the formula if that happens.
  mymodel <- lm(Sale_Pri_1 ~ . - id, data = trainingset)

  # predict() needs the full test rows, not a single column
  fold_results[[i]] <- data.frame(
    Predicted = as.numeric(predict(mymodel, newdata = testset)),
    Actual = testset[["Sale_Pri_1"]]
  )

  progress.bar$step()
}

# Inspect the training rows of the last fold (interactive viewer).
View(trainingset)

# predictions and actual sale prices of all folds
result <- do.call(rbind, fold_results)
result$Difference <- abs(result$Actual - result$Predicted)

# As an example use Mean Absolute Error as Evaluation
summary(result$Difference)

# Residual diagnostics for `model`, the caret fit created by the train()
# calls in this file. xyplot() is the lattice plotting function.
#
# Residuals can be negative, so log(residuals(model)) is NaN for those points
# (R reports "NaNs produced"). The log plots are kept, but the raw-residual
# plot is the one that shows every observation.
xyplot(log(residuals(model)) ~ log(predict(model)), type = c("p", "g"),
       xlab = "Predicted", ylab = "Residuals", col.line = "red")
xyplot(residuals(model) ~ predict(model), type = c("p", "g"),
       xlab = "Predicted", ylab = "Residuals", col.line = "red")
hist(residuals(model))
hist(predict(model))
# hist(predict(log(model))) fails with "non-numeric argument to mathematical
# function": log() has to be applied to the predictions, not to the model.
hist(log(predict(model)))
# Residuals and predictions have to come from the same model.
xyplot(log(residuals(model)) ~ predict(model), type = c("p", "g"),
       xlab = "Predicted", ylab = "Residuals", col.line = "red")

# ---- Console log of an earlier cv.glm run (record only, not executed) ----
# The runnable version is the library(boot) / cv.glm section above; pasted
# console text is not valid R, so it is kept here as comments.
#
# > k    <- 10
# > GLM_ST_ols  <- glm(Sale_Pri_1 ~ Garage_Spa + Total_Liva + NEAR_DIST_Vac + NEAR_DIST_ComCor +
# +                               NEAR_DIST_LIViol + NEAR_DIST_Retail + NEAR_DIST_SeptST +
# +                               NEAR_DIST_TrafDat + NEAR_DIST_HwyEx + Tax_Build + Tax_Land, data = Sample_Test,
# +                               na.action=na.omit, family=gaussian(link="identity"))
# > kfCV <- cv.glm(data=BD88, glmfit=glmfit, K=k)
# Error: could not find function "cv.glm"      (boot was not attached yet)
# > library(boot)
#
# Attaching package: ‘boot’
#
# The following object is masked from ‘package:lattice’:
#
#     melanoma
#
# > kfCV <- cv.glm(data=BD88, glmfit=glmfit, K=k)
# > kfCV$delta
# [1] 38859501102 38856463003
# > summary(kfCV)
#       Length Class  Mode
# call    4    -none- call
# K       1    -none- numeric
# delta   2    -none- numeric
# seed  626    -none- numeric
# caret has to be attached, and the control objects defined, before train()
# is called; the statements are ordered accordingly.
library(caret)

# 10-fold cross validation; keep the hold-out predictions of every fold.
train_control <- trainControl(method = "cv", number = 10, savePredictions = TRUE)

# Tuning grid for naive Bayes (method = "nb"). Kept for reference only: "nb"
# is a classification method and Sale_Pri_1 is a numeric price, so
#   train(Sale_Pri_1~., data=Sample_Test, trControl=train_control, method="nb", tuneGrid=grid)
# stops with an error.
grid <- expand.grid(.fL = c(0), .usekernel = c(FALSE))

# Regression tree for the sale price, evaluated with the CV scheme above.
model <- train(Sale_Pri_1 ~ ., data = Sample_Test,
               trControl = train_control, method = "rpart")

# Manual 10-fold split: consecutive blocks of rows get fold ids 1..10.
# `folds` has to exist before the loop that reads it.
folds <- cut(seq_len(nrow(Sample_Test)), breaks = 10, labels = FALSE)

for (i in seq_len(10)) {
  #Segment your data by fold using the which() function
  testIndexes <- which(folds == i, arr.ind = TRUE)
  TesteData <- Sample_Test[testIndexes, ]
  TrainneData <- Sample_Test[-testIndexes, ]
  #Use the test and train data partitions however you desire...
}

# Predictor columns of the price model. The names are selected directly. The
# earlier negative match() removed exactly these columns, and the
# `na.action = na.omit` inside c() put a function into the name vector, which
# broke the column lookup.
predictor_names <- c("Garage_Spa", "NEAR_DIST_TrafDat", "NEAR_DIST_HwyEx",
                     "Total_Liva", "NEAR_DIST_Vac", "NEAR_DIST_ComCor",
                     "NEAR_DIST_LIViol", "NEAR_DIST_Retail",
                     "NEAR_DIST_SeptST", "Tax_Land", "Tax_Build")
predictors <- Sample_Test[, predictor_names, drop = FALSE]


# Regression trees on the full training data with 10-fold cross validation.
# This section was a pasted console session; the commands are kept as code
# and the console messages are summarised in comments.
library(caret)
library("e1071")

train_control <- trainControl(method = "cv", number = 10)

model <- train(Sale_Pri_1 ~ ., data = BD5TRAIN2,
               trControl = train_control, method = "rpart")

# caret warned here: "There were missing values in resampled performance
# measures."
model2 <- train(Sale_Pri_1 ~ ., data = BD5TRAIN2,
                trControl = train_control, method = "rpart")

# Training-set predictions and fitted values of model2. na.exclude() is meant
# for data, not for a fitted model, so it is not wrapped around model2.
BD5TRAIN2$Model2_Predict <- predict(model2)
BD5TRAIN2$Model2_Fitted_Values <- fitted(model2)

# Sample Data
predicted <- BD5TRAIN2$Model2_Predict
reference <- BD5TRAIN2$Sale_Pri_1

# confusionMatrix() compares class labels. Sale_Pri_1 is a continuous price,
# which is why the earlier calls stopped with "The data must contain some
# levels that overlap the reference" (they also read the prediction column
# from BD5TRAIN instead of BD5TRAIN2, and stored the result under the name
# of the function). For a numeric response postResample() reports RMSE,
# R-squared and MAE of predicted against observed values.
model2_performance <- postResample(pred = predicted, obs = reference)
model2_performance


library(cvTools)

## repeated cross-validation of the least-squares model BD5Trainols2
# 10 folds, 50 replications, scored with the root trimmed mean squared
# prediction error (10% trimming). K and R are handed to cvLm() so the folds
# are sized from the model's own data; the earlier cvFolds(nrow(coleman), ...)
# call sized them from the example dataset of the cvTools documentation.
# NOTE(review): assumes BD5Trainols2 is an lm fit - confirm where it is created.
cvFitLm <- cvLm(BD5Trainols2, cost = rtmspe, K = 10, R = 50, trim = 0.1)

# plot the cross-validation results of the LS model
bwplot(cvFitLm)

# Not run:
#   bwplot(cvFitLmrob)   - no MM regression was cross-validated in this script,
#                          so cvFitLmrob does not exist
#   bwplot(BD5Trainols2) - bwplot() has no method for a fitted lm object