/
ANLY510-90-DataCleaning_initial.R
94 lines (67 loc) · 2.49 KB
/
ANLY510-90-DataCleaning_initial.R
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
## Team mbr
# Qi Zhang
# Wen He
# Chen Wu
# Yimin Wang
###Updates & Package Installation
install.packages("installr")
require(installr)
updateR()
install.packages("stringr")
library(stringr)
install.packages("zoo")
library(zoo)
library(xlsx)
###Task1
#Done by Ellen
#Populate the missing values in the Area variable with an appropriate values
#(Birmingham, Coventry, Dudley, Sandwell, Solihull, Walsall or Wolverhampton)
clean <- read.csv("dirty_data.csv",header = TRUE)
clean$Area <- sub("^$", NA, clean$Area)
clean$Area <- na.locf(clean$Area)
###Task2
#Done by Chen Wu
#Remove special characters from Street 1 and Street 2
clean$Street <- str_replace_all(clean$Street,"[^[:alnum:]]","")
clean$Street <- str_replace_all(clean$Street,"[^a-zA-Z0-9]"," ")
clean$Street.2 <- str_replace_all(clean$Street.2,"[^[:alnum:]]"," ")
#Remove padding (the white space before and after the text) from Street 1 and Street 2.
clean$Street <- trimws(clean$Street)
clean$Street.2 <- trimws(clean$Street.2)
#Make sure the first letters of street names are capitalized
clean$Street <- tolower(clean$Street)
clean$Street.2 <- tolower(clean$Street.2)
simpleCap <- function(x)
{s <- strsplit(x, " ")[[1]]
paste(toupper(substring(s, 1,1)),
substring(s, 2),
sep="", collapse=" ")}
clean$Street <- sapply(clean$Street,simpleCap)
clean$Street.2 <- sapply(clean$Street.2,simpleCap)
#Make sure the street denominations are following the same standard.
#For example, all streets are indicated as "str.", avenues as "ave.", etc.
clean$Street <- str_replace_all(clean$Street,"Rd","Rd.")
clean$Street <- str_replace_all(clean$Street,"Road","Rd.")
clean$Street.2 <- str_replace_all(clean$Street.2,"Rd","Rd.")
clean$Street.2 <- str_replace_all(clean$Street.2,"Road","Rd.")
clean$Street <- str_replace_all(clean$Street,"Street","St.")
clean$Street.2 <- str_replace_all(clean$Street.2,"Street","St.")
clean$Street <- str_replace_all(clean$Street,"Lane","Ln.")
clean$Street.2 <- str_replace_all(clean$Street.2,"Lane","Ln.")
###Task3
#If the value in Street 2 duplicates the value in Street 1, remove the value in Street 2
#Done by Qi Zhang
#If the value in Street 2 duplicates the value in Street 1, remove the value in Street 2
i <- 1
while(i < NROW(clean))
{
if(clean$Street[i] == clean$Street.2[i])
{
clean$Street.2[i] = ""
}
i <- i+1
}
#Remove the "Strange HTML column"
clean$Strange.HTML <- NULL
#Export data to Desktop
write.xlsx(clean, "Dirty_Data_After_Clean.xlsx")