Stack Overflow survey analysis

Look at evidence for formal training in CS/programming among those who use R.

Uses the 2018 Stack Overflow Annual Developer Survey.

Original code from Julia Silge, data scientist at Stack Overflow. Modified by Jenny Bryan.


Load packages.

Make sure we have the data.

## Location of the survey CSV on disk.
## NOTE(review): the arguments to here() are missing in this copy of the file,
## and several closing parentheses/braces below are missing too; restore them
## from the original script before running.
survey_path <- here(
## Only download and unzip when the CSV is not already present.
if (!file.exists(survey_path)) {
  ## download_zip() is an unexported usethis helper (hence `:::`). It picks the
  ## file name from a response header (the original note says
  ## Content-Description; presumably Content-Disposition -- confirm).
  dl <- usethis:::download_zip(
    ## NOTE(review): the download URL is empty here; fill it in.
    url = "",
    destdir = here("stackoverflow-survey")
  ## Directory the archive is extracted into (arguments missing in this copy).
  target <- here(
  utils::unzip(dl, exdir = target)

Load the data.

## Make the minimal theme, with Source Sans Pro as the base font, the default
## for every plot, then read the survey CSV found at survey_path.
theme_set(
  theme_minimal(base_family = "Source Sans Pro")
)
survey2018 <- read_csv(file = survey_path)
What kinds of majors do R users have?

## Build one row per respondent with a logical flag for whether "R" appears
## among the languages they report working with, plus their undergrad major.
## NOTE(review): the filter() condition was truncated in this copy of the
## file. Dropping respondents with a missing UndergradMajor is the
## reconstruction consistent with the tables further down (they contain no
## NA major row) -- confirm against the original script.
users_majors <- survey2018 %>%
  select(Respondent, LanguageWorkedWith, UndergradMajor) %>%
  filter(!is.na(UndergradMajor)) %>%
  ## LanguageWorkedWith is split on ";" into one row per language so that the
  ## %in% test below is an exact match rather than a substring match.
  mutate(LanguageWorkedWith = str_split(LanguageWorkedWith, pattern = ";")) %>%
  unnest(LanguageWorkedWith) %>%
  group_by(Respondent) %>%
  ## Collapse back to one row per respondent; the major is constant within a
  ## respondent, so first() just picks it up.
  summarize(UsesR = "R" %in% LanguageWorkedWith,
            UndergradMajor = first(UndergradMajor))

## Tally respondents by R usage and major, relabel the logical flag as
## "useR" / "Other", then widen to one row per major with one count column
## per group. Combinations that never occur are filled with 0.
counts_major <- users_majors %>%
  count(UsesR, UndergradMajor) %>%
  mutate(
    UsesR = if_else(UsesR, "useR", "Other")
  ) %>%
  spread(key = UsesR, value = n, fill = 0)

## Convert the per-group counts into add-one-smoothed proportions, then
## compute the log2 ratio of the R-user proportion to the non-R-user
## proportion for each major. Positive values mean the major is relatively
## more common among R users.
logratio_major <- counts_major %>%
  ## A plain function replaces the deprecated funs() wrapper; the numeric
  ## columns (Other, useR) are still transformed in place.
  mutate_if(is.numeric, function(x) (x + 1) / sum(x + 1)) %>%
  mutate(logratio = log2(useR / Other)) %>%
  arrange(desc(logratio)) %>%
  mutate(
    ## Order the majors by log ratio so the bars plot in sorted order.
    UndergradMajor = reorder(UndergradMajor, logratio),
    ## Label which group each major leans towards, and order the levels so
    ## the group with the higher log ratios comes first.
    Direction = factor(if_else(logratio > 0, "useRs", "Other")),
    Direction = forcats::fct_reorder(Direction, logratio, .desc = TRUE)
  )
UndergradMajor Other useR
A business discipline (ex. accounting, finance, marketing) 1750 171
A health science (ex. nursing, pharmacy, radiology) 217 29
A humanities discipline (ex. literature, history, philosophy) 1487 103
A natural science (ex. biology, chemistry, physics) 2561 489
A social science (ex. anthropology, psychology, political science) 1122 255
Another engineering discipline (ex. civil, electrical, mechanical) 6575 370
Computer science, computer engineering, or software engineering 48340 1996
Fine arts or performing arts (ex. graphic design, music, studio art) 1105 30
I never declared a major 677 16
Information systems, information technology, or system administration 6307 200
Mathematics or statistics 2236 582
Web development or web design 2397 21
UndergradMajor Other useR logratio Direction
Mathematics or statistics 0.0299120 0.1364062 2.1891119 useRs
A social science (ex. anthropology, psychology, political science) 0.0150162 0.0598971 1.9959672 useRs
A natural science (ex. biology, chemistry, physics) 0.0342577 0.1146467 1.7426926 useRs
A health science (ex. nursing, pharmacy, radiology) 0.0029150 0.0070192 1.2678157 useRs
A business discipline (ex. accounting, finance, marketing) 0.0234135 0.0402433 0.7814108 useRs
A humanities discipline (ex. literature, history, philosophy) 0.0198968 0.0243332 0.2903903 useRs
Another engineering discipline (ex. civil, electrical, mechanical) 0.0879309 0.0868039 -0.0186098 Other
Computer science, computer engineering, or software engineering 0.6463910 0.4672438 -0.4682317 Other
Information systems, information technology, or system administration 0.0843473 0.0470285 -0.8428058 Other
Fine arts or performing arts (ex. graphic design, music, studio art) 0.0147889 0.0072532 -1.0278300 Other
I never declared a major 0.0090659 0.0039775 -1.1885692 Other
Web development or web design 0.0320648 0.0051474 -2.6390749 Other
## Base plot: one bar per undergrad major showing its log2 ratio, filled by
## the group the major leans towards, with the axes flipped so the major
## names read horizontally.
## NOTE(review): group_by() does not appear to influence any layer here;
## confirm before removing it.
p <- logratio_major %>%
  group_by(Direction) %>%
  ggplot(mapping = aes(x = UndergradMajor, y = logratio, fill = Direction)) +
  geom_col(alpha = 0.9) +
  coord_flip() +
  ## The plotted values are log2 ratios, so the integer breaks from -2 to 2
  ## are labelled as fold changes.
  scale_y_continuous(
    breaks = -2:2,
    labels = c("0.25x", "0.5x", "Same", "2x", "4x")
  )

## Julia's original: the base plot with a title, subtitle, and axis/legend
## labels.
p +
  labs(
    y = "Relatively more from R users",
    x = NULL,
    fill = "More likely from...",
    subtitle = "R users are less likely to have formal programming training",
    title = "What kinds of undergrad majors do R users have?"
  )

## For use in Keynote: the base plot with shorter labels and a source
## caption, the legend moved to the top, and larger legend-title and y-axis
## text.
p +
  labs(
    y = "Relative prevalence",
    x = NULL,
    fill = "Major is more common among",
    caption = "Julia Silge & Jenny Bryan\nSource: 2018 Stack Overflow Annual Developer Survey"
  ) +
  theme(
    legend.position = "top",
    legend.title = element_text(size = rel(1.4)),
    axis.text.y = element_text(size = rel(1.3))
  )