-
Notifications
You must be signed in to change notification settings - Fork 0
/
cleaning_dataset.R
104 lines (95 loc) · 2.69 KB
/
cleaning_dataset.R
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
library(tidyverse)
library(janitor)
library(knitr)
# Read the raw languages export and normalise its column names.
languages_raw <- read_csv(
  file = "languages_dataset.csv",
  # Several columns are very sparse, so guess_max is raised to let readr use
  # everything when guessing column types.
  guess_max = 4303
) |>
  clean_names() |>
  # A handful of languages carry a malformed semantic_scholar value. Coercing
  # to integer turns those into NA, which is acceptable here.
  mutate(across(semantic_scholar, as.integer))
# Almost every column is nearly empty. For each column, work out the share of
# rows that hold a non-missing value and keep the names of the columns whose
# share is above 10%.
good_lang_cols <- local({
  non_empty <- vapply(languages_raw, \(col) sum(!is.na(col)), integer(1))
  coverage <- non_empty / nrow(languages_raw)
  # which() drops undefined comparisons, just like filter() would.
  names(coverage)[which(coverage > 0.1)]
})
# Build the cleaned table: keep only the well-covered columns, then move the
# most useful ones to the front.
languages <- languages_raw |>
  # good_lang_cols is a character vector that lives outside the data frame, so
  # select by name with all_of() instead of splicing it in with `!!!`.
  select(all_of(good_lang_cols)) |>
  # NOTE(review): every removal step below is commented out, so the columns
  # they name are not dropped here. Confirm that this is intended.
  # This column references a site that doesn't want to be used in projects.
  # select(-hopl) |>
  # A couple columns are only relevant in the context of the mixed table with
  # non-languages included.
  # select(-number_of_repos, -rank, -paper_count) |>
  # Some columns are internal metadata that is no longer true with this subset.
  # select(-fact_count, -example_count) |>
  # I looked at R specifically, and the "country" column was inaccurate. Let's
  # not confuse people with known bad data.
  # select(-country) |>
  # Organize the columns. Anything not listed explicitly follows the named
  # columns via everything().
  select(
    pldb_id,
    title,
    description,
    type,
    appeared,
    creators,
    website,
    starts_with("domain_name"),
    reference,
    isbndb,
    book_count,
    semantic_scholar,
    language_rank,
    starts_with("github_"),
    starts_with("wikipedia"),
    starts_with("features_"),
    line_comment_token,
    everything()
  )
# Save the cleaned table as a CSV file.
languages |>
  write_csv(file = "languages_dataset_cleaning.csv")
# The online column dictionary supplies the descriptions for the data
# dictionary in the post.
dictionary_url <- "https://pldb.com/columns.csv"
dictionary <- dictionary_url |>
  read_csv() |>
  clean_names() |>
  # The dataset's column names were cleaned, so clean the dictionary's column
  # names the same way to make them comparable.
  mutate(column = make_clean_names(column)) |>
  # Only the column name and its description are needed.
  select(column, description) |>
  # Drop entries for columns that are not part of the cleaned dataset.
  filter(column %in% names(languages))
# Arrange dictionary to match the order of colnames(languages). A column with
# no dictionary entry produces an all-NA row, which keeps the rows aligned
# one-to-one with the columns of `languages`.
dictionary <- dictionary[match(colnames(languages), dictionary$column), ]

# Render the data dictionary (variable, storage type, description) as a table.
dictionary |>
  mutate(
    # Take the variable names from `languages` itself rather than from the
    # dictionary, so a column missing from the online dictionary is still
    # labelled (with an NA description) instead of being shown as NA.
    variable = colnames(languages),
    class = map_chr(languages, typeof)
  ) |>
  select(variable, class, description) |>
  knitr::kable()

# Preview the first rows of the cleaned dataset.
head(languages)