-
Notifications
You must be signed in to change notification settings - Fork 0
/
PCA.R
234 lines (193 loc) · 10.7 KB
/
PCA.R
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
#Mount the libraries we need
library(tidyverse)
library(ggcorrplot)
library(factoextra)
library(plyr)
# Data set 1:SIMD Data ######
## Import the First Data set ========
SIMD <- read_csv("data/simd2020_withinds_toTest.csv")
#Let's Look at it
#Some data are weird e.g. Percentages has a % symbol and Null value are encoded as *
## Data Wrangling and Cleaning ========
#Let's Clean But first subset for only the columns we are interested in
SIMDSubset<-SIMD[,c(3,18,20,29,32,33,38,39,49)]# selecting by position
#Step 1 Remove % and * and re-encode as Continuous values
SIMDCleaned <- mutate_if(SIMDSubset, #change if is character
is.character,
str_replace_all,
pattern = "[%*]", #What I am searching
replacement = "")#What I am replacing with
#Now that we have removed all the weird symbols we can transform the columns containing numeric values into proper numeric ones
SIMDCleaned <- SIMDCleaned %>%
mutate_at(c(2:9), as.double)
## Getting set for PCA (Remove null values and subset by numeric) =====
#Step 1 Remove Null Values
# Check for Columns with Null Values
colSums(is.na(SIMDCleaned))
#Council_area, income_rate, employment_rate, overcrowded_rate, nocentralheating_rate, are ok
#not_participating, University, and Broadband are ok-ish we can just remove those Attendance is around 2% so we should discuss but for now let's just remove all null values
SIMDNoNull <-na.omit(SIMDCleaned)#This will blankly remove all row when you have at least one null value (nb do it after you already drop all the columns you do not want to use or you will drop rows you should not)
#Check again
colSums(is.na(SIMDNoNull))
#University is the only positive one so we actually may want to flip it so we get % of people that did not go to University rather than the percentage that went
SIMDNoNull$University <-(100-SIMDNoNull$University)
#Since we want to look at by authority/constituency lets
#Group by constituency
Costituencies <- SIMDNoNull%>%
dplyr::group_by(Council_area)%>%
dplyr::summarise(income_rate=mean(income_rate),
employment_rate=mean(employment_rate),
Attendance=mean(Attendance),
not_participating=mean(not_participating),
University=mean(University),
overcrowded_rate=mean(overcrowded_rate),
nocentralheating_rate=mean(nocentralheating_rate),
broadband=mean(broadband))
#Subset only continuous variables
SIMDNumerical <- Costituencies[,2:9]#selecting all rows and only columns 2-9
#Let's check them
head(SIMDNumerical)
#now we normalise. Scale them on the average of the variable so we look at deviation from the mean. standardize the numerical data by subtracting the mean and dividing by the standard deviation
SIMDNormalized <- scale(SIMDNumerical)
head(SIMDNormalized)
#Plot Matrix to check correlation between variables
SIMDCorrMatrix <- cor(SIMDNormalized)
ggcorrplot(SIMDCorrMatrix)
SIMDpca <- princomp(SIMDCorrMatrix)#Perform PCA on the correlation matrix
summary(SIMDpca)#let see how much each PC count for
SIMDpca$loadings[, 1:2]#Lets look at loadings (impact of each variables, our ingredients)
## Visualise the results ====
fviz_eig(SIMDpca, addlabels = TRUE)#Let's check the scree plot (of the eigen values) for the % of PC1 and PC2
# Graph of the variables
fviz_pca_var(SIMDpca, col.var = "black")#check impact of each variables
#First, all the variables that are grouped together are positively correlated to each other
#Then, the higher the distance between the variable and the origin, the better represented that variable is.
#Finally, variables that are negatively correlated are displayed to the opposite sides of the biplot’s origin.
fviz_cos2(SIMDpca, choice = "var", axes = 1:2)# Calculate how much each values impacted in PCA lower values means not very well represented
fviz_pca_var(SIMDpca, col.var = "cos2",
gradient.cols = c("black", "orange", "green"),
repel = TRUE)#See it in relation to their correlation
#Compute the PCS in a different way to check if we can see patterns against factor variables
pcsSIMD <- prcomp(SIMDNormalized)#do PCA on our normalised data. This time because we want to see each eigenvalue rather thant the total loadings we do it on the normalised data rather than the correlation matrix
print(SIMDpca)
#let's add the PC1 and PC2 info to the data set
pc1_2 <- as.data.frame(cbind(PC1=pcsSIMD$x[,1], PC2=pcsSIMD$x[,2]))
pcSIMDData <-cbind(pc1_2, Costituencies)
#Finally let's plot it
ggplot(pcSIMDData, aes(x=PC1, y=PC2, color=Council_area, label=Council_area)) +
geom_point(size=8, alpha=0.5)+
theme_bw()+
labs(title = "Council")+geom_text(size=3)+
theme(legend.position = "none")+
geom_hline(yintercept=0)+
geom_vline(xintercept=0)
#last check does it cluster better than by using two of the variable alone
# Let see for example income_rate and employment_rate
ggplot(pcSIMDData, aes(x=income_rate, y=not_participating, color=Council_area, label=Council_area)) +
geom_point(size=8, alpha=0.5)+
theme_bw()+
labs(title = "Council")+geom_text(size=3)+
theme(legend.position = "none")+
geom_hline(yintercept=0)+
geom_vline(xintercept=0)
#Now play around with this last graph and see if the partial pattern we saw is visible across only two variables
# Data set 2: Authority data ######
## Import the Second Data set =======
CostOfLiving <- read_csv("data/AuthorityData.csv",locale = locale(encoding = "WINDOWS-1252"))
## Data Wrangling and Cleaning =========
#Let's Look at it
#There are a lot of empty rows and the first one is actually the total across Scotland so let's remove the first row and all rows past 33
CostOfLiving<- CostOfLiving[2:33,]#All column but only row 2:33
#Some data are weird e.g. Percentages has a % symbol and there are £ symbols in there
#Remove % and £ and re-encode as Continuous values
CostOfLivingCleaned <- mutate_if(CostOfLiving,
is.character,
str_replace_all,
pattern = "[%£]",
replacement = "")
#Some columns have , rather than . as decimal point
CostOfLivingCleaned <- mutate_if(CostOfLivingCleaned,
is.character,
str_replace_all,
pattern = "[,]",
replacement = ".")
#Now re-encode numerical columns as such
CostOfLivingCleaned <- CostOfLivingCleaned %>%
mutate_at(c(4:23), as.double)
#We want to look at Change rather than general number so we want to look at increase % But first let's see where the null values are
colSums(is.na(CostOfLivingCleaned))
#Since we have few records lets only use what we have all data from so
#Food insecurity
#House prices
#Welfare applications
#Homeless applications
#Rent
#Because some of these will be connected to population we want to look at % increase/decrease rather than the values themselves
#So the first step is to generate 6 new columns that will calculate this % of increase
Evolution <-CostOfLivingCleaned%>%
mutate(FoodInsecurity=round(((`food_insecurity2018-2022`-`food_insecurity2017-2021`)/`food_insecurity2017-2021`)*100,1),
HousePrices=round(((house_price_jul_22-house_price_jul_21)/house_price_jul_21)*100,1),
WelfareApp=round(((`welfare_applications2021-2022`-`welfare_applications2020-2021`)/`welfare_applications2020-2021`)*100,1),
Rent=round(((average_rent_2022-average_rent_2021)/average_rent_2021)*100,1),
Homeless=round(((`homelessness_applications 2021-2022`-`homelessness_applications2020-21`)/`homelessness_applications2020-21`)*100,1))%>%
select(authority,location,region,FoodInsecurity,HousePrices,WelfareApp,Rent,Homeless)#Select only the textual columns and the newly created ones
## Getting Set for PCA =====
#Check again
colSums(is.na(Evolution))
#Extract the numerical variables
numericalEvolution<- Evolution[,4:8]
head(numericalEvolution)
#Now we normalise. Scale them on the average of the variable so we look at deviation from the mean. standardize the numerical data by subtracting the mean and dividing by the standard deviation
EvolutionNormalised <- scale(numericalEvolution)
head(EvolutionNormalised)
#Plot Matrix to check correlation between variables
EveolCorrMatrix<- cor(EvolutionNormalised)
ggcorrplot(EveolCorrMatrix)
EvolPCA <- princomp(EveolCorrMatrix)
EvolPCA$loadings[, 1:2]
## Visualise the results ====
fviz_eig(EvolPCA, addlabels = TRUE)#Let's check the scree plot (of the eigen values) for the % of PC1 and PC2 not ideally but hopefully still significant
# Graph of the variables
fviz_pca_var(EvolPCA, col.var = "black")#check impact of each variables so we can see if some of them are not very well portrayed in our analysis
#First, all the variables that are grouped together are positively correlated to each other
#Then, the higher the distance between the variable and the origin, the better represented that variable is.
#Finally, variables that are negatively correlated are displayed to the opposite sides of the biplot’s origin.
fviz_cos2(EvolPCA, choice = "var", axes = 1:2)# Calculate how much each values impacted in PCA lower values means not very well represented
fviz_pca_var(EvolPCA, col.var = "cos2",
gradient.cols = c("black", "orange", "green"),
repel = TRUE)
#Compute the PCS again but this time on the single records rather than on the correlation matrix
EvolPCS <- prcomp(EvolutionNormalised)
print(EvolPCS)
#let's add the PC1 and PC2 info to the dataset
pc1_2 <- as.data.frame(cbind(PC1=EvolPCS$x[,1], PC2=EvolPCS$x[,2]))
EvolPCSData <-cbind(pc1_2, Evolution)
#Finally let's plot it
ggplot(EvolPCSData, aes(x=PC1, y=PC2, color=location, label=authority)) +
geom_point(size=8, alpha=0.5)+
theme_bw()+
labs(title = "Authority")+
geom_text(size=3)+
geom_hline(yintercept = 0)+
geom_vline(xintercept=0)
#Better see the clusters
find_hull <- function(EvolPCSData) EvolPCSData[chull(EvolPCSData$PC1, EvolPCSData$PC2), ]
hulls <- ddply(EvolPCSData,"location", find_hull)
ggplot(EvolPCSData, aes(x=PC1, y=PC2, color=location,label=authority)) +
geom_point(size=6, alpha=0.5)+
theme_bw()+
labs(title = "Try")+
geom_polygon(data=hulls, alpha=.2, aes(fill=location))+
geom_text(size=3)+
geom_hline(yintercept = 0)+
geom_vline(xintercept=0)
#Exercise try to do the same but using region as categorical value
#Does single variables explain more #last check does it cluster better than by using two of the variable alone
# Let see for example Rent and Welfare Applications
ggplot(EvolPCSData, aes(x=Rent, y=WelfareApp, color=location, label=authority)) +
geom_point(size=8, alpha=0.5)+
theme_bw()+
labs(title = "Authority")+
geom_text(size=3)+
geom_hline(yintercept=0)+
geom_vline(xintercept=0)