forked from embruna/Editorial-Board-Geography
/
Country.Codes.R
55 lines (36 loc) · 2.45 KB
/
Country.Codes.R
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
Country.Codes <- function(DATASET) {
# DATA CLEANUP
#Remove (trim) the leading and trailing white spaces (not can do with one command as per: http://stackoverflow.com/questions/2261079/how-to-trim-leading-and-trailing-whitespace-in-r)
trim.trailing <- function (x) sub("\\s+$", "", x)
DATASET$COUNTRY<-trim.trailing(DATASET$COUNTRY)
trim.leading <- function (x) sub("^\\s+", "", x)
DATASET$COUNTRY<-trim.leading(DATASET$COUNTRY)
# remove any double spaces
DATASET$COUNTRY<-gsub(" ", " ", DATASET$COUNTRY, fixed=TRUE)
# USING COUNTRY NAMES AND Package "countrycode" TO ADD A COLUMN WITH THE 3 DIGIT CODE
library(countrycode)
# ELIMINATE ANY ROWS WITH BLANKS IN THE
DATASET<-subset(DATASET, COUNTRY!="Unknown")
# Chnage countries as needed, either because country names have changed or to reflect political organization
DATASET$COUNTRY[DATASET$COUNTRY == "Scotland"] <- "UK" #With apologies to Scots everywhere
DATASET$COUNTRY[DATASET$COUNTRY == "SCOTLAND"] <- "UK" #With apologies to Scots everywhere
DATASET$COUNTRY[DATASET$COUNTRY == "Wales"] <- "UK"
DATASET$COUNTRY[DATASET$COUNTRY == "England"] <- "UK"
DATASET$COUNTRY[DATASET$COUNTRY == "German Democratic Republic"] <- "Germany" #removing old names
DATASET$COUNTRY[DATASET$COUNTRY == "US"] <- "USA" #in case any snuck in
DATASET$COUNTRY[DATASET$COUNTRY == "Yugoslavia"] <- "Croatia" #Authors from Pretinac
DATASET$COUNTRY[DATASET$COUNTRY == "French Guiana"] <- "France"
#step3: change all the country names to the codes used in mapping
#Add a column with the 3 letter country codes to be consistent with the other datasets
#Maptools uses the ISO 3166 three letter codes: https://en.wikipedia.org/wiki/ISO_3166-1_alpha-3
#The packahge countrycode will take your column of country names and convert them to ISO3166-3 Codes
#I began by checking the values of COUNTRY to see if there are any mistakes. To do so I just created a vector
#called CODECHECK
DATASET$geo.code<-countrycode(DATASET$COUNTRY, "country.name", "iso3c", warn = TRUE)
#By setting "warn=TRUE" it will tell you which ones it couldn't convert. Because of spelling mistakes, etc.
DATASET$geo.code<-as.factor(DATASET$geo.code)
#Deleting rows without country
# DATASET <- DATASET[!is.na(DATASET$geo.code),]
#You can correct these as follows in the dataframe with all the data, then add a new column to the dataframe with the country codes
return(DATASET)
}