#Ken Steif
#MUSA 507/CPLN 590
#R BUSINESS SITE SUITABILITY FORECASTING RUN-THROUGH - PREDICTING STARBUCKS REVENUE
#Let's see if we can predict the sales volume of coffee shops given the spatial and aspatial qualities of
#coffee shops in PA
#Start by importing the data into R from the provided shapefile. Call the data frame 'biz'. Variable descriptions can
#be found in the shapefile metadata.
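#Here is a minimal import sketch, assuming the shapefile is called 'biz.shp' and sits in your working
#directory - both the layer name and the path are assumptions, so substitute your own:
library(rgdal)
biz <- readOGR(dsn = getwd(), layer = "biz") #read the shapefile as a SpatialPointsDataFrame
biz <- biz@data #keep just the attribute table as a regular data frame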
#install and load the appropriate packages.
library(corrplot)
library(reshape2)
library(caret)
library(AppliedPredictiveModeling)
library(stargazer)
# PART 1 - Explanatory analysis ----------------------------------------------------------------------------
#Create two dataframes. The first is what we use to "train" the model and the second will be used to "test" the model.
#The test set will be those Starbucks locations I have previously designated using the variable 'newStrbck'
training <- biz[ which(biz$newStrbck == 0), ]
testing <- biz[ which(biz$newStrbck == 1), ]
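#A quick sanity check on the split - a small sketch assuming 'newStrbck' is coded 0/1 as described above:
table(biz$newStrbck) #counts of existing locations (0) vs. candidate Starbucks locations (1)
nrow(training) + nrow(testing) == nrow(biz) #every row should land in exactly one of the two sets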
#For the first part we're going to work exclusively with the training dataset.
#Let's clean up the dataset and create one that is just the variables we're interested in. We're going to do this using
#'column indices' to subset different columns of the training data frame. There are many ways to subset data in R -
#this is one way. A column index is just the column number.
#For example:
training[,2] #the second column
head(training[,2])
#the function 'head' returns only the first few rows (six by default). Next we're going to return the tenth
#THROUGH twelfth columns.
head(training[, 10:12])
#this line of code returns the first few rows of the 10th AND 12th columns
head(training[, c(10,12)])
#now we're going to subset to create a training dataset of just the dependent and independent variables
training2 <- training[, 22:40] #create a new data frame of most of the ind. variables
training2 <- training2[,-5] #removes one column from that
training3 <- training[, c(2, 12, 10)] #include three other columns
training4 <- cbind(training2, training3) #merge the pertinent variables together
#rename these data frames appropriately and remove the excess data frames that we no longer need
training2 <- training4
remove(training3, training4)
#you should notice the dependent variable 'SALES_VOL' is the last variable of training2
head(training2)
#Now let's output a table of summary statistics. Change 'type' to "html" to copy/paste the table into Excel
stargazer(training2, type = "text", title = "Summary Statistics")
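#A quick example of the html variant - the output file name here is just an assumption for illustration:
stargazer(training2, type = "html", title = "Summary Statistics", out = "summaryStats.html")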
#Now we're going to create a correlation matrix and plot it using the corrplot package
M <- cor(training2, use = "pairwise.complete.obs")
M[is.na(M)] <- 0 #set any correlations that could not be computed to 0
M
#returns correlation for each variable to all the others. Here is a pretty correlation matrix plot.
#check the help for corrplot. There are a bunch of different methods for creating these graphics
corrplot(M, method = "number")
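#One alternative style from the corrplot help, as an illustration - colored squares on the upper triangle only:
corrplot(M, method = "color", type = "upper")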
install.packages("DAAG")
library(DAAG)
#fit a regression of sales volume on all the other variables in training2 and cross-validate it with CVlm
CVT <- lm(SALES_VOL ~ ., data = training2)
CV <- CVlm(data = training2, form.lm = CVT, m = 5) #m=5 sets it to 5 folds
#CVlm stores the cross-validated mean squared error in the "ms" attribute of its output
mse <- attr(CV, "ms")
rmse <- sqrt(mse) #Obtaining RMSE for model 1
rmse
#Now we're going to see how generalizable some of these ind. variables are across both Starbucks and Dunkin Donuts.
#The following lines of code do the same thing as above to create the 'training2' data frame but this time using
#only Starbucks and Dunkin Donuts locations.
#Create a new data frame of just starbucks and DD locations
StarbucksOrDunkin <- biz[ which(biz$CONAME == "DUNKIN' DONUTS" | biz$CONAME == "STARBUCKS" ), ]
#create a dummy variable for DD
StarbucksOrDunkin$isDunkin = ifelse(StarbucksOrDunkin$CONAME == "DUNKIN' DONUTS" ,1,0)
#create a new data frame of just the important variables.
SD2 <- StarbucksOrDunkin[,22:40] #create a new data frame of most of the ind. variables
SD2 <- SD2[,-5] #removes one column from that
SD3 <- StarbucksOrDunkin[, c(12,10)] #include two other columns
SD4 <- cbind(SD2, SD3, isDunkin = StarbucksOrDunkin$isDunkin) #merge the pertinent variables together, keeping the dummy
#The melt function reformats the data in "long" form which we can use for the below plot
SDMelt <- melt(SD4, id.vars = c("isDunkin"))
#get rid of some extraneous data frames.
remove(SD2,SD3, SD4, StarbucksOrDunkin)
#Lastly we're going to plot these differences using boxplots and ggplot's 'facet' feature which can create many
#sub-plots
ggplot(data = SDMelt, aes(as.factor(isDunkin), value)) +
  geom_boxplot(aes(fill = as.factor(isDunkin))) + facet_wrap(~variable, scales = "free", ncol = 6) +
  scale_fill_discrete(name = "1 = Dunkin Donuts") +
ggtitle("Variable distributions across Starbucks and Dunkin' Donuts")
#-----------------------------------------------------------------------------------------------------------
# PART 2 - In Sample prediction ----------------------------------------------------------------------------
#Finally, let's build a regression using the training set. This is what we call an 'in-sample' regression
reg <- lm(SALES_VOL ~ distHwy + CoffeeDist + DistShop + popDens + POP + HHs + Families + Homes + Med_Inc + Med_Rent +
Med_Value + Pct_White + Pct_le_5yr + Avg_HHSze + Pct_Col2 + Pct_BlPov + distEmpC + NUMBER_EMP,
data = training2)
summary(reg)
#Let's see what happens if we plot the predicted values from this regression against the actual sales values
#this is a measure of how good our predictions are relative to the observed values.
#If the predictions were perfect, the points would align along the red line
xyplot(training2$SALES_VOL ~ reg$fitted.values, type = c("p", "r"), xlab = "Predicted", ylab = "Observed", col.line = "red")
#Let's see if any of this predictive variation is spatial in nature. To do this, we're going to have to map these
#predictions in ArcGIS.
#First create a data frame of the predicted values and the FID, which we can join back to the shapefile in ArcGIS
predForArcGIS <- cbind(training$FID, reg$fitted.values)
colnames(predForArcGIS) <- c("FID", "Predicted")
#now write it out to the C: drive. Make sure you specify your own folder and watch the direction of the slashes
write.csv(predForArcGIS, file = "C:/Users/Keisan/Documents/CPLN590/predForArcGIS.csv")
#let's see what happens if we include controls for Dunkin Donuts
#add the same dummy variable to the training data: 1 = Dunkin' Donuts, 0 = everything else
training2$isDunkin <- ifelse(training$CONAME == "DUNKIN' DONUTS", 1, 0)
reg2 <- lm(SALES_VOL ~ distHwy + CoffeeDist + DistShop + popDens + POP + HHs + Families + Homes + Med_Inc + Med_Rent +
Med_Value + Pct_White + Pct_le_5yr + Avg_HHSze + Pct_Col2 + Pct_BlPov + distEmpC + NUMBER_EMP + isDunkin,
data = training2)
summary(reg2)
#now let's plot the predicted against the observed again with the new model that controls for the dunkin/starbucks
#difference, and then look at the residuals
xyplot(training2$SALES_VOL ~ predict(reg2), type = c("p", "r"), xlab = "Predicted", ylab = "Observed", col.line = "red")
xyplot(resid(reg2) ~ predict(reg2), type = c("p", "r"), xlab = "Predicted", ylab = "Residuals", col.line = "red")
xyplot(resid(reg2) ~ training2$SALES_VOL, type = c("p", "r"), xlab = "Observed", ylab = "Residuals", col.line = "red")
#-----------------------------------------------------------------------------------------------------------
# PART 3 - Out of sample prediction 1 --------------------------------------------------------------------------
#If you're trying to make predictions and all you have are observations from past businesses, how do you assess
#how well your model predicts? Let's talk about 'out of sample prediction'. This is really the crux of predictive
#modeling.
#To begin, we're going to see how well the regression predicts if we split the training set into two different data
#frames. We're going to take a 75% random sample as a training set, leaving the remaining data as the test set.
Subtrain <- createDataPartition(y = training2$SALES_VOL, p = .75, list = FALSE)
rand1Train <- training2[ Subtrain,] #the new training set
rand2Test <- training2[-Subtrain,] #the new test set
nrow(rand1Train) #how many rows are in our new training set
#now run the regression on the rand1Train data frame
reg3 <- lm(SALES_VOL ~ distHwy + CoffeeDist + DistShop + popDens + POP + HHs + Families + Homes + Med_Inc + Med_Rent +
             Med_Value + Pct_White + Pct_le_5yr + Avg_HHSze + Pct_Col2 + Pct_BlPov + distEmpC + NUMBER_EMP + isDunkin,
           data = rand1Train)
summary(reg3)
stargazer(reg3, reg2, type = "html") #compare the new model to the earlier in-sample model side by side
#apply the model to the rand2Test test set data
reg3Pred <- predict(reg3, rand2Test)
#notice that the length of this vector = the number of SALES_VOL observations in rand2Test - one prediction for each
#SALES_VOL observation.
length(reg3Pred)
length(reg3Pred) == length(rand2Test$SALES_VOL)
#in this case we actually do have observed SALES_VOL values for these coffee shop locations,
#so let's plot predicted against observed.
plot(reg3Pred, rand2Test$SALES_VOL) #still looks pretty good
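#as a small illustration, add the 45-degree line where predicted equals observed - points near it are good predictions
abline(0, 1, col = "red")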
#now create a new data frame where one column is the observed dependent variable 'SALES_VOL' from rand2Test
#and a second column is the predicted values from our new regression (reg3)
reg3PredValues <- data.frame(obs = rand2Test$SALES_VOL, pred = reg3Pred)
#Finally, use caret's defaultSummary to get RMSE and R-squared for these out-of-sample predictions
defaultSummary(reg3PredValues)
#let's take a closer look at the model error.
#first, what is the mean SALES_VOL?
mean(training2$SALES_VOL)
#now the mean absolute residual from the regression, aka the 'mean absolute error'
mean(abs(reg3$residuals))
#manually calculate RMSE.
sqrt(mean(reg3$residuals^2))
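#for comparison, a sketch of the same two error measures computed on the held-out rand2Test predictions -
#out-of-sample error is usually somewhat larger than the in-sample error above
mean(abs(rand2Test$SALES_VOL - reg3Pred)) #out-of-sample MAE
sqrt(mean((rand2Test$SALES_VOL - reg3Pred)^2)) #out-of-sample RMSE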
#------------------------------------------------------------------------------------------------------------------
# PART 4 - Cross validation time.------------------------------------------------------------------
#we're going to use the original training set (training2)
#Perform 10-fold cross validation
#begin by specifying the type of resampling (cross validation)
fitControl <- trainControl(method = "cv", number = 10)
#run the cross validation - note that here with the use of the '.' we are using all the variables in that dataframe
#In other words, we are not writing the ind. variables out one by one.
lmfit5 <- train(SALES_VOL ~ ., data = training2, method = "lm", trControl = fitControl)
lmfit5
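#a variation for illustration: repeated 10-fold cross validation, here repeated 5 times
#(the object names and the number of repeats are just assumptions - pick what your data can afford)
fitControl2 <- trainControl(method = "repeatedcv", number = 10, repeats = 5)
lmfit6 <- train(SALES_VOL ~ ., data = training2, method = "lm", trControl = fitControl2)
lmfit6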
#lists everything in your environment
ls()
#removes everything from the environment - don't actually run this in the middle of the analysis
#rm(list = ls())
#why did you get this warning?
summary(lmfit5)
training2 <- training2[,-14] #removes one column from that data frame
#now run it again.
lmfit5 <- train(SALES_VOL ~ ., data = training2, method = "lm", trControl = fitControl)
#plot the residuals against the predicted and the observed
xyplot(resid(lmfit5) ~ predict(lmfit5), type = c("p", "g"), xlab = "Predicted", ylab = "Residuals", col.line = "red")
xyplot(resid(lmfit5) ~ training2$SALES_VOL, type = c("p", "g"), xlab = "Observed", ylab = "Residuals", col.line = "red")
#------------------------------------------------------------------------------------------------------------------
# PART 5 - Out of sample prediction 2 - Starbucks------------------------------------------------------------------
#Recall that reg is the regression on just the FULL SET of training data.
#Now we're going to use that to predict for the 17 candidate starbucks locations
#Then of course, because we actually have the SALES_VOL figures for those starbucks, we will calculate some
#error
#apply the model to the testing data frame w/ the 17 candidate locations
regPredStarbucks <- predict(reg, testing)
#now create a new data frame where one column is the dependent variable 'SALES_VOL' from testing
#(Which is column 10)
regPredStarbucksValues <- data.frame(obs = testing[,10], pred = regPredStarbucks)
head(regPredStarbucksValues)
#Finally, use defaultSummary to get RMSE and R-squared for these predictions against the observed SALES_VOL in testing
defaultSummary(regPredStarbucksValues)
#calculate the difference and percent change between observed and predicted
regPredStarbucksValues$diff <- regPredStarbucksValues$obs - regPredStarbucksValues$pred
regPredStarbucksValues$diffP <- (regPredStarbucksValues$obs - regPredStarbucksValues$pred) / regPredStarbucksValues$obs
#now the mean residual or error from the regression 'reg' aka the 'mean absolute error' aka the MAE
mean(abs(regPredStarbucksValues$diff))
#calculate the MAPE, or mean absolute percent error
mean(abs(regPredStarbucksValues$diffP))
#update this data frame to include the FID field
regPredStarbucksValues <- cbind(testing$FID, regPredStarbucksValues)
#change columns names and output for arcgis
colnames(regPredStarbucksValues) <- c("FID", "observed", "predicted", "diff", "diffPct")
write.csv(regPredStarbucksValues, file = "C:/Users/kennethsteif/regPredStarbucksValues.csv")