Load required packages

title

author

date

output

Clustering and Factor Analysis - Microvan Research

Nammn Joshii

14/10/2020

pdf_document	html_document
default	default

knitr::opts_chunk$set(echo = TRUE)

Load required packages

library(ggplot2)
library(gmodels)
library(REdaS)
library(haven)

Reading the dataset

mv <- read_dta("/Users/naman/Desktop/microvan.dta")

mv[1] <-NULL

cor <- cor(mv[,colnames(mv)[2:31]])

Creating Histograms for frequency distribution

i = 1
par(mfrow = c(5, 6), mar = c(2,2,2,2))
for (col in mv[2:38]) {
  hist(col, main = colnames(mv)[i],col=rainbow(i))
  i = i + 1
}

Fitting the model with 30 variables

model <- lm(mvliking ~ ., data=mv[1:31])
summary(model)

Factor Analysis

Pre-factor diagnostics

# Bartlett test of sphericity
bart_spher(mv[,colnames(mv)[2:31]])

# Kaiser-Meyer-Olkin Measure of sampling adequacy
KMOS(mv[,colnames(mv)[2:31]])

Determine the number of factors

# Create a table of results for ease of interpretation
ev <- eigen(cor(mv[,colnames(mv)[2:31]]))$values
e <- data.frame(Eigenvalue = ev, PropOfVar = ev / length(ev), CumPropOfVar = cumsum(ev / length(ev)))
round(e, 4)

Draw a scree plot

p <- ggplot()
p <- p + geom_line(aes(x = 1:length(ev), y = ev))
p <- p + geom_point(aes(x = 1:length(ev), y = ev))
p <- p + geom_hline(yintercept = 1, colour = "red")
p <- p + labs(x = "Number", y = "Eigenvalues", title = "Scree Plot of Eigenvalues")
p <- p + scale_x_continuous(breaks = 1:length(ev), minor_breaks = NULL)
p <- p + theme_bw()
p

Extract solution

Select number of factors

n <- length(which(ev > 1))

Extract and rotate principal components

library(psych)
pc <- principal(mv[,colnames(mv)[2:31]], nfactors = n, rotate="varimax")

Create a factor loadings table; Sort based on uniqueness

fl <- cbind.data.frame(pc$loadings[,], Uniqueness = pc$uniquenesses)
round(fl[order(pc$uniquenesses),], 4)

# View(fl[fl$RC1 > 0.5 | fl$RC1 < -0.5,])
# View(fl[fl$RC2 > 0.5 | fl$RC2 < -0.5,])
# View(fl[fl$RC3 > 0.5 | fl$RC3 < -0.5,])
# View(fl[fl$RC4 > 0.5 | fl$RC4 < -0.5,])
# View(fl[fl$RC5 > 0.5 | fl$RC5 < -0.5,])

Segmenting based on factor scores

Print factor scores

factor_scores <- as.data.frame(pc$scores)

final_data <- cbind.data.frame(mv[1],pc$scores)

Fitting the model with 5 factors

new_model <- lm(mvliking ~ RC1 + RC2 + RC3 + RC4 + RC5, data=final_data)

summary(new_model)

Fitting the model with 3 factors

reduced_model <- lm(mvliking ~ RC1 + RC2  + RC4, data=final_data)

summary(reduced_model)

Comparing the two models

anova(new_model,reduced_model)

Conduct k-means clustering on the factor scores

Remove factors which are insignificant

factor_scores$RC3 <- NULL
factor_scores$RC5 <- NULL

Hierarchical clustering to see number of clusters using dendrogram

d <- dist(pc$scores)
h <- hclust(d, method = "ward.D2")

View dendogram

plot(h, xlab = "Respondent")

First, standardize the input variables (z-scores)

z <- scale(factor_scores, center = TRUE, scale = TRUE)

Since the k-means algorithm starts with a random set of centers, setting the seed helps ensure the results are reproducible

set.seed(1)

Apply K-means clustering with the selected numbers of centers (e.g. 3)

k <- kmeans(z, centers = 3)
k

Display cluster sizes

k$size

Cluster means

For ease of interpretation, convert standardized values for factor scores

sapply(c("RC1", "RC2","RC4"), function(n) k$centers[, n]*sd(pc$scores[,n]) + mean(pc$scores[,n]))

Crosstab analysis on cluster segements versus likeness

mv <- cbind(mv, cluster = k$cluster)
mv$mvliking_group <- ifelse(mv$mvliking >= 0 & mv$mvliking <= 3, 'Low Liking ',
                            ifelse(mv$mvliking >=4 & mv$mvliking <=6, 'Medium Liking',
                                   ifelse(mv$mvliking >=7, 'High Liking', 'something else')))

CrossTable(x = mv$cluster, y = mv$mvliking_group, expected = TRUE, prop.r = FALSE, prop.c = FALSE, prop.t = FALSE)

Adding groups across segment descriptors


mv$age_group <- ifelse(mv$age >= 19 & mv$age <= 35, '19-35',
                            ifelse(mv$age >= 36 & mv$age <= 50, '36-50',
                                      ifelse( mv$age > 50, '50+','something else')))

mv$income_group <- ifelse(mv$income >= 15 & mv$income <= 50, '0-50k',
                       ifelse(mv$income >= 51 & mv$income <= 100, '51-100k',
                              ifelse(mv$income >= 101 & mv$income <= 150, '101-150k',
                                ifelse( mv$income > 150, '150+k','something else'))))

mv$miles_group <- ifelse(mv$miles >= 0 & mv$miles <= 14, '< 14k',
                       ifelse(mv$miles >= 15 & mv$miles <= 18, '15-19k',
                              ifelse(mv$miles >= 19 & mv$miles <= 22, '19-22k',
                                ifelse( mv$miles > 22, '22+','something else'))))

mv$educ_group <- ifelse(mv$educ >= 1 & mv$educ <= 2, 'Less Formal Education',
                       ifelse(mv$educ == 3, 'Undergraduate',
                              ifelse( mv$educ == 4, 'Graduate','something else')))

Crosstab analysis for identifying segment descriptors

For Age Group

CrossTable(x = mv$cluster, y = mv$age_group, expected = TRUE, prop.r = FALSE, prop.c = FALSE, prop.t = FALSE)

For Income Group

CrossTable(x = mv$cluster, y = mv$income_group, expected = TRUE, prop.r = FALSE, prop.c = FALSE, prop.t = FALSE)

For Miles Driven Group

CrossTable(x = mv$cluster, y = mv$miles_group, expected = TRUE, prop.r = FALSE, prop.c = FALSE, prop.t = FALSE)

For Gender

CrossTable(x = mv$cluster, y = mv$female, expected = TRUE, prop.r = FALSE, prop.c = FALSE, prop.t = FALSE)

For Recycle

CrossTable(x = mv$cluster, y = mv$recycle, expected = TRUE, prop.r = FALSE, prop.c = FALSE, prop.t = FALSE)

For Number of Kids

CrossTable(x = mv$cluster, y = mv$numkids, expected = TRUE, prop.r = FALSE, prop.c = FALSE, prop.t = FALSE)

For Education Group

CrossTable(x = mv$cluster, y = mv$educ_group, expected = TRUE, prop.r = FALSE, prop.c = FALSE, prop.t = FALSE)

Name		Name	Last commit message	Last commit date
Latest commit History 4 Commits
Clustering and Factor Analysis .Rmd		Clustering and Factor Analysis .Rmd
Insights - Gross Pointe Associates And The “Microvan” Analysis.pdf		Insights - Gross Pointe Associates And The “Microvan” Analysis.pdf
README.md		README.md
microvan.dta		microvan.dta

nammnjoshii/Clustering_and_Factor_Analysis-Gross-Pointe-Associates-And-The-Microvan-

Folders and files

Latest commit

History

Repository files navigation

Load required packages

Reading the dataset

Creating Histograms for frequency distribution

Fitting the model with 30 variables

Factor Analysis

Pre-factor diagnostics

Determine the number of factors

Draw a scree plot

Extract solution

Select number of factors

Extract and rotate principal components

Create a factor loadings table; Sort based on uniqueness

Segmenting based on factor scores

Print factor scores

Fitting the model with 5 factors

Fitting the model with 3 factors

Comparing the two models

Conduct k-means clustering on the factor scores

Remove factors which are insignificant

Hierarchical clustering to see number of clusters using dendrogram

View dendogram

First, standardize the input variables (z-scores)

Since the k-means algorithm starts with a random set of centers, setting the seed helps ensure the results are reproducible

Apply K-means clustering with the selected numbers of centers (e.g. 3)

Display cluster sizes

Cluster means

For ease of interpretation, convert standardized values for factor scores

Crosstab analysis on cluster segements versus likeness

Adding groups across segment descriptors

Crosstab analysis for identifying segment descriptors

For Age Group

For Income Group

For Miles Driven Group

For Gender

For Recycle

For Number of Kids

For Education Group

About

Topics

Resources

Stars

Watchers

Forks