/
gender_0308.R
227 lines (198 loc) · 10.1 KB
/
gender_0308.R
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
library(ggplot2)
library(lubridate)
library(reshape2)
Sys.setlocale("LC_TIME", "usa")
setwd("C:/Users/Craig/Desktop/UAC")
source('UAC_load.R')
all_uac$age <- as.numeric(all_uac$age)
all_uac$date <- ymd(all_uac$date)
all_uac <- unique(all_uac)
# Clean up locations
geo_unique <- read.csv('allnames-0302.csv',stringsAsFactors=FALSE)
geo_unique <- geo_unique[!is.na(geo_unique$lat),]
geo_unique <- geo_unique[geo_unique$lat != 0,]
geo_unique[geo_unique$country=='GUATEMALA' & !is.na(geo_unique$country),'country'] <- 'Guatemala'
geo_unique[geo_unique$country=='EL SALVADOR' & !is.na(geo_unique$country),'country'] <- 'El Salvador'
names(geo_unique) <- c('lat','lon','country','admin1','address','type','locstr','str2','count')
geo_unique[geo_unique$admin1 == 'San Marcos Department','admin1'] <- 'San Marcos'
# I can infer genders using this API:
# https://market.mashape.com/namsor/gendre-infer-gender-from-world-names
# Pricing is:
# $0/month for 1000, with $0.01 per additional ($385.98)
# $9.99/month for 5000, with $0.005 per additional ($182.98)
# $45/month for 25,000, with $0.002 per additional ($74.20)
# $99/month for 100,000, with $0.001 per additional ($99)
# So if I *had* to purchase this, I could do it for about $75.
# More than I want to pay out of pocket.
fnames=sapply(all_uac$name, function(x) strsplit(x,', ')[[1]][2])
first_names <- unlist(strsplit(fnames,' '))
t <- as.data.frame(table(first_names))
t <- t[order(t$Freq,decreasing=TRUE),]
(nrow(t) - 1000)*0.01 # $70.75 to do all unique first names
###############################################################################
# Try some lists from internet. This isn't the most sophisticated approach,
# but let's see how it does.
###############################################################################
# List of the most 5000 most common male and female names in Spain, from
# http://www.ine.es/tnombres/formGeneralresult.do?vista=4
girl_es <- read.delim('female_names_es.txt',stringsAsFactors=FALSE)
boy_es <- read.delim('male_names_es.txt',stringsAsFactors=FALSE)
girl_es$percent <- 100 * girl_es$Total / sum(girl_es$Total)
boy_es$percent <- 100 * boy_es$Total / sum(boy_es$Total)
girl_es$Nombre <- tolower(girl_es$Nombre)
boy_es$Nombre <- tolower(boy_es$Nombre)
# Most common names from 1990 US Census
# http://www2.census.gov/topics/genealogy/1990surnames/dist.female.first
girl_us <- read.table('census-dist-female-first.txt',stringsAsFactors=FALSE)
boy_us <- read.table('census-dist-male-first.txt',stringsAsFactors=FALSE)
names(girl_us) <- c('name','percent','cum_percent','rank')
names(boy_us) <- c('name','percent','cum_percent','rank')
girl_us$name <- tolower(girl_us$name)
boy_us$name <- tolower(boy_us$name)
names <- data.frame(fullname=all_uac$name,given=tolower(fnames),
male_match=0,female_match=0,stringsAsFactors=FALSE)
# Match on Spanish names first
names$female_match <- 0
for (i in 1:nrow(girl_es)) {
p1<- paste('(^| )',girl_es[i,'Nombre'],'( |$)',sep='')
m1 <- grep(p1,names$given,perl=TRUE)
names[m1,'female_match'] <- names[m1,'female_match'] + girl_es[i,'percent']
}
names$male_match <- 0
for (i in 1:nrow(boy_es)) {
p1<- paste('(^| )',boy_es[i,'Nombre'],'( |$)',sep='')
m1 <- grep(p1,names$given,perl=TRUE)
names[m1,'male_match'] <- names[m1,'male_match'] + boy_es[i,'percent']
}
names$female <- NA
names[names$male_match > names$female_match,'female'] <- 0
names[names$male_match < names$female_match,'female'] <- 1
nrow(names[!is.na(names$female),])/nrow(names) # placed 89.4% based on Spanish names
head(names[is.na(names$female),],100)
# In cases where this didn't work, try the US names
names$female_match_us <- 0
for (i in 1:nrow(girl_us)) {
p1<- paste('(^| )',girl_us[i,'name'],'( |$)',sep='')
m1 <- grep(p1,names$given,perl=TRUE)
names[m1,'female_match_us'] <- names[m1,'female_match_us'] + girl_us[i,'percent']
}
names$male_match_us <- 0
for (i in 1:nrow(boy_us)) {
p1<- paste('(^| )',boy_us[i,'name'],'( |$)',sep='')
m1 <- grep(p1,names$given,perl=TRUE)
names[m1,'male_match_us'] <- names[m1,'male_match_us'] + boy_us[i,'percent']
}
names[is.na(names$female) & names$female_match_us > names$male_match_us,'female'] <- 1
names[is.na(names$female) & names$female_match_us < names$male_match_us,'female'] <- 0
nrow(names[!is.na(names$female),])/nrow(names) # up to 91.2% with US names included
# I can guess based on whether the final letter is a or o.
names$final_letter <- NA
names[grep('a($| )',names$given),'final_letter'] <- 'a'
names[grep('o($| )',names$given),'final_letter'] <- 'o'
names[is.na(names$female) & !is.na(names$final_letter) &
names$final_letter == 'a','female'] <- 1
names[is.na(names$female) & !is.na(names$final_letter) &
names$final_letter == 'o','female'] <- 0
nrow(names[!is.na(names$female),])/nrow(names) # up to 94% with final letters
# Let's call that good enough -- it's as good as my geocoding at this point.
###############################################################################
# Now let's see if there's anything useful in there. Summary stats first.
###############################################################################
all_uac$female <- names$female
binom.test(sum(all_uac$female,na.rm=TRUE),sum(!is.na(all_uac$female)))
# 32.8 - 33.7% female
t.test(all_uac[!is.na(all_uac$female) & all_uac$female==1,'age'],
all_uac[!is.na(all_uac$female) & all_uac$female==0,'age'])
# Girls are significantly younger -- 13.9 years vs 14.8
###############################################################################
# Age distribution
###############################################################################
plotme <- all_uac[,c('age','female')]
plotme <- plotme[!is.na(plotme$age) & !is.na(plotme$female),]
plotme[plotme$female==1,'gender'] <- 'Female'
plotme[plotme$female==0,'gender'] <- 'Male'
ggplot(plotme,aes(x=age,group=gender,fill=gender)) +
geom_density(alpha=0.2,adjust=1.4) +
theme_classic() +
labs(x='Age (years)',y='Density') +
theme(text=element_text(size=20),
axis.line=element_blank(),axis.ticks=element_blank())
###############################################################################
# Time trend in female fraction?
###############################################################################
uac_date <- all_uac[,c('locstr','date','female')]
uac_date$date <- ymd(uac_date$date)
uac_date <- uac_date[!is.na(uac_date$date),]
uac_date$month <- month(uac_date$date)
uac_date$year <- year(uac_date$date)
uac_date <- uac_date[order(uac_date$date),]
uac_date$monthnum <- (uac_date$year - uac_date$year[1])*12 + (uac_date$month - uac_date$month[1])
m <- ddply(uac_date,'monthnum',summarize,mean=mean(female,na.rm=TRUE),
conf95_lo=binom.test(sum(female,na.rm=TRUE),sum(!is.na(female)))$conf.int[1],
conf95_hi=binom.test(sum(female,na.rm=TRUE),sum(!is.na(female)))$conf.int[2])
m$date <- uac_date$date[1] + months(m$monthnum)
ggplot(m,aes(x=date,y=mean,ymin=conf95_lo,ymax=conf95_hi)) +
geom_line(size=3,color='aquamarine4') +
geom_errorbar(width=100,color='darkgray') +
theme_classic() +
labs(x='',y='Female fraction') +
theme(text=element_text(size=20),
axis.line=element_blank(),axis.ticks=element_blank())
# Fraction of girls stays close to 1/3 and decreases with time but increases
# significantly from Apr-May 2015. What happened then that made people more
# willing to send girls?
###############################################################################
# Geography of girl migrants?
###############################################################################
tmp <- uac_date[,c('locstr','date','female')]
j <- join(tmp,geo_unique,by='locstr')
j <- j[!is.na(j$female),]
j <- j[!is.na(j$admin1),]
adm1 <- unique(geo_unique[,c('country','admin1')])
sig <- ldply(1:nrow(adm1), function(i) {
country <- adm1[i,'country']
admin1 <- adm1[i,'admin1']
avg <- mean(j[j$country==country & j$admin1==admin1,'female'])
signif <- 0
tryCatch({
bt_in <- binom.test(sum(j[j$country==country & j$admin1==admin1,'female']),
sum(j$country==country & j$admin1==admin1))
bt_out <- binom.test(sum(j[j$country!=country | j$admin1!=admin1,'female']),
sum(j$country!=country | j$admin1!=admin1))
# non-overapping intervals?
signif <- as.numeric(bt_in$conf.int[1] > bt_out$conf.int[2])
}, error=function(e) {})
data.frame(country=country,admin1=admin1,avg=avg,sig=signif)
})
source('NT-isocodes.R')
sig$admin1 <- as.character(sig$admin1)
sig[sig$admin1 == 'San Marcos Department','admin1'] <- 'San Marcos'
writeme <- join(sig,shp_clean,by=c('country','admin1'))
writeme[is.na(writeme$adm1_code),]
write.csv(writeme,'adm1-gender-0309.csv',row.names=FALSE)
###############################################################################
# Girls more likely to travel in groups?
###############################################################################
match2 <- data.frame(last_name=sapply(all_uac$name, function(x) strsplit(x,', ')[[1]][1]),
city=all_uac$city,date=as.character(all_uac$date),
sector=all_uac$sector)
match2$last_city_date_sector <- paste(match2$last_name,match2$city,match2$date,match2$sector,sep='_')
t <- as.data.frame(table(match2$last_city_date_sector))
t <- t[t$Freq>1,]
t <- t[order(t$Freq,decreasing=TRUE),]
groups <- colsplit(t$Var1,'_',c('lastname','city','date','sector'))
groups$size <- t$Freq
groups <- na.omit(groups)
groups <- groups[groups$city != '',]
tmp <- all_uac
tmp$lastname <- match2$last_name
j <- join(tmp,groups,by=c('lastname','city','date','sector'))
group_members <- j[!is.na(j$size),] # 4428 total kids in these groups
binom.test(sum(all_uac$female,na.rm=TRUE),
sum(!is.na(all_uac$female)))
# Overall fraction is 32.75 - 33.72%
binom.test(sum(group_members$female,na.rm=TRUE),
sum(!is.na(group_members$female)))
# Fraction in groups is 46.53 - 49.59% --> *much* higher
sum(group_members$female,na.rm=TRUE)/sum(all_uac$female,na.rm=TRUE) # 16.4%
nrow(group_members)/nrow(all_uac) # 11.4%