/
classify.R
executable file
·89 lines (79 loc) · 3.22 KB
/
classify.R
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
##########################################################
###Load Packages###
library(xgboost)
##########################################################
###Read in the Data###
setwd(path)
df<-read.csv('data.csv')
###Replace the flag with NA###
for(i in 1:ncol(df)){
df[df[,i]==9999999.000,i] = NA
}
#Remove the columns where more than 30% is missing###
checks = apply(apply(df,2,is.na),2,mean) < 0.3
df= df[,checks]
##########################################################
###K-Fold Cross Validation to find best Hyperparameters###
#Randomly Shuffle the Data#
set.seed(666)
df<-df[sample(nrow(df)),]
xdata = data.matrix(df[,c(2,3)])
ydata = as.factor(as.numeric(df[,1]))
#Create 10 equally size folds#
folds <- cut(seq(1,nrow(df)),breaks=10,labels=FALSE)
#Perform 10 fold cross validation#
nrounds = c(100,150,200,250,300)
maxdepths = c(1,2,3,4)
etas = c(0.1,0.3,0.5,0.7,0.9)
search_grid=expand.grid(nrounds,maxdepths,etas)
all_error_rate = numeric(nrow(search_grid))
sd_rate = numeric(nrow(search_grid))
for(j in 1:nrow(search_grid)){
nround = search_grid[j,1]
maxdepth = search_grid[j,2]
eta = search_grid[j,3]
error_rate = numeric(10)
for(i in 1:10){
testIndexes <- which(folds==i,arr.ind=TRUE)
xtrain = xdata[-testIndexes,]
ytrain = ydata[-testIndexes]
xtest = xdata[testIndexes,]
ytest = ydata[testIndexes]
model<-xgboost(data=xtrain,label = ytrain,nrounds = nround,max_depth =maxdepth,eta=eta,objective="multi:softmax",num_class=10,verbose=0)
modelpred<-predict(model,xtest)
error_rate[i]<-mean(modelpred==ytest)
mean(modelpred==ytest)
}
all_error_rate[j]<-mean(error_rate)
sd_rate[j]<-sd(error_rate)
print(j/nrow(search_grid))
}
CV_Results<-data.frame(all_error_rate,sd_rate)
#Output and save the CV Results#
write.csv(CV_Results,'CV_Results.csv')
rm(list=ls())
#Select the parameters corresponding to the minimum in the error rate#
best=t(search_grid[which.max(CV_Results$all_error_rate),])
#Create data matrix and scale#
set.seed(666)
df<-df[sample(nrow(df)),]
xdata = data.matrix(df[,c(3,4,5,6,7,8,9,10,11)])
means<-sapply(df,mean,na.rm=TRUE)
means= means[c(3,4,5,6,7,8,9,10,11)]
sds<-sapply(df,sd,na.rm=TRUE)
sds= sds[c(3,4,5,6,7,8,9,10,11)]
xdata = scale(xdata)
ydata = as.factor(as.numeric(df[,2]))
###Fit the data using an XGBoost, a gradient boosted decision tree algorithm.
model<-xgboost(data=xdata,label = ydata,nrounds = best[1,] ,max_depth =best[2,],eta=best[3,],objective="multi:softmax",num_class=10,verbose=1)
model_prob<-xgboost(data=xdata,label = ydata,nrounds = best[1,] ,max_depth =best[2,],eta=best[3,],objective="multi:softprob",num_class=10,verbose=1)
modelpred<-predict(model,xdata)
#Calculate the Accuracy#
mean(modelpred==ydata)
#Create a Confusion Matrix for the Model Predictions#
ydata_confusion=revalue(ydata, c('1'='AGN','2'='ATNF','3'='ATNF_BIN','4'='CV','5'='HMXB','6'='LMXB','7'='STAR','8'='WR','9'='YSO'))
modelpred_confusion=revalue(as.factor(modelpred), c('1'='AGN','2'='ATNF','3'='ATNF_BIN','4'='CV','5'='HMXB','6'='LMXB','7'='STAR','8'='WR','9'='YSO'))
table(modelpred_confusion,ydata_confusion,dnn=list('Prediction','Data'))
#Calculate the most import features in X-Ray Object Classification#
importance <- xgb.importance(feature_names = colnames(xdata), model = model)
importance