/
bechdel.Rmd
216 lines (191 loc) · 7.54 KB
/
bechdel.Rmd
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
```{r setup,include=FALSE}
# set the knitr options ... for everyone!
# if you unset this, then vignette build bonks. oh, joy.
#opts_knit$set(progress=TRUE)
opts_knit$set(eval.after='fig.cap')
# for a package vignette, you do want to echo.
# opts_chunk$set(echo=FALSE,warning=FALSE,message=FALSE)
opts_chunk$set(warning=FALSE,message=FALSE)
#opts_chunk$set(results="asis")
opts_chunk$set(cache=FALSE,cache.path="cache/")
opts_chunk$set(fig.path="figure/bechdel_",dev=c("png"))
opts_chunk$set(fig.width=9,fig.height=6,dpi=100,out.width='900px',out.height='600px')
# doing this means that png files are made of figures;
# the savings is small, and it looks like shit:
#opts_chunk$set(fig.path="figure/",dev=c("png","pdf","cairo_ps"))
#opts_chunk$set(fig.width=4,fig.height=4)
# for figures? this is sweave-specific?
#opts_knit$set(eps=TRUE)
# this would be for figures:
#opts_chunk$set(out.width='.8\\textwidth')
# for text wrapping:
options(width=96,digits=2)
opts_chunk$set(size="small")
opts_chunk$set(tidy=TRUE,tidy.opts=list(width.cutoff=50,keep.blank.line=TRUE))
library(ggplot2)
```
# Bechdel test data
Here I use `dplyr` to gather data on films and gender from the IMDb mirror.
I drop movies within the `Documentary` genre, and only include those
which list English as one language.
```{r bechdel_setup,eval=TRUE,tidy=FALSE}
library(RMySQL)
library(dplyr)
library(knitr)
dbcon <- src_mysql(host='0.0.0.0',user='moe',password='movies4me',dbname='IMDB',port=23306)
capt <- dbGetQuery(dbcon$con,'SET NAMES utf8')
# genre information
movie_genres <- tbl(dbcon,'movie_info') %>%
inner_join(tbl(dbcon,'info_type') %>%
filter(info %regexp% 'genres') %>%
select(info_type_id),
by='info_type_id')
# get documentary movies;
doccos <- movie_genres %>%
filter(info %regexp% 'Documentary') %>%
select(movie_id)
# language information
movie_languages <- tbl(dbcon,'movie_info') %>%
inner_join(tbl(dbcon,'info_type') %>%
filter(info %regexp% 'languages') %>%
select(info_type_id),
by='info_type_id')
# get movies with English
unnerstandit <- movie_languages %>%
filter(info %regexp% 'English') %>%
select(movie_id)
# movies which are not documentaries, have some English, filtered by production year
movies <- tbl(dbcon,'title') %>%
select(-imdb_index,-ttid,-md5sum) %>%
anti_join(doccos %>% distinct(movie_id),by='movie_id') %>%
inner_join(unnerstandit %>% distinct(movie_id),by='movie_id') %>%
filter(production_year >= 1965,production_year <= 2015) %>%
collect(n=Inf)
```
Now I load information about people, selecting relationships where the person
is an actor or actress, director, writer, or producer of a film. I join the
gender and age information to the 'is in' relationship, and take sums
and weighted means, where the weighted means use a downweighting depending
on the `nr_order`.
```{r bechdel_isin,eval=TRUE,tidy=FALSE}
# change this to change downweighting.
# 3 = person #1 is twice as important as person #4
# 10 = person #1 is twice as important as person #11
ORDER_DOWNWEIGHTING <- 3.0
# acts/directs/writes/produces relation
# convert 'actress' and 'actor' to 'acts in' so that
# nr_order makes sense.
# mariadb = awesome, BTW
raw_in <- tbl(dbcon,'cast_info') %>%
inner_join(tbl(dbcon,'role_type') %>%
filter(role %regexp% 'actor|actress|producer|writer|director'),
by='role_id') %>%
select(person_id,movie_id,nr_order,role) %>%
collect(n=Inf) %>%
inner_join(movies %>% distinct(movie_id),by='movie_id') %>%
mutate(role=as.factor(gsub('actor|actress','actsin',role)))
#mutate(role=regexp_replace(role,'actor|actress','actsin'))
# then coalesce nr_order to the maximal value.
max_order <- raw_in %>%
group_by(movie_id,role) %>%
summarize(max_ord=pmin(1e4,max(coalesce(nr_order,0L),na.rm=TRUE))) %>%
ungroup()
#summarize(max_ord=max(coalesce(nr_order,0))) %>%
# then coalesce nr_order to the maximal value.
is_in <- raw_in %>%
inner_join(max_order,by=c('movie_id','role')) %>%
mutate(nr_order=as.numeric(nr_order)) %>%
mutate(nr_order=coalesce(nr_order,max_ord)) %>%
mutate(weight=2^(-nr_order/ORDER_DOWNWEIGHTING)) %>%
select(-nr_order,-max_ord)
#mutate(nr_order=coalesce(nr_order,max_ord)) %>%
# get person data
person_data <- tbl(dbcon,'name') %>%
select(person_id,name,gender,dob) %>%
filter(!is.na(dob)) %>%
filter(gender %regexp% 'm|f') %>%
mutate(yob=year(dob)) %>%
mutate(ismale=(gender=='m')) %>%
filter(yob >= 1875) %>%
collect(n=Inf)
# merge person data and is-in data
merge_in <- is_in %>%
inner_join(person_data %>% select(person_id,yob,ismale),by='person_id') %>%
inner_join(movies %>% distinct(movie_id,production_year),by='movie_id') %>%
mutate(age=pmax(0,pmin(100,production_year - yob))) %>%
select(-person_id,-production_year,-yob) %>%
group_by(role,movie_id) %>%
summarize(count=n(),
count_male=sum(ismale),wsum_male=sum(ismale * weight),
sum_age=sum(age),wsum_age=sum(age * weight),
wsum=sum(weight)) %>%
ungroup() %>%
mutate(mean_male=count_male / count,
wmean_male=wsum_male / wsum,
mean_age=sum_age / count,
wmean_age=wsum_age / wsum,
count_female=count-count_male) %>%
select(-wsum_male,-sum_age,-wsum_age,-wsum) %>%
mutate(status=as.factor(ifelse(count_female==0,'all male',ifelse(count_male==0,'all female','mixed'))))
```
Now I get information about films: IMDb ratings (votes), and domestic gross box office.
```{r bechdel_film_data,eval=TRUE,tidy=FALSE}
# votes for all movies, filtered by having enough votes
vote_info <- tbl(dbcon,'movie_votes') %>%
select(movie_id,votes,vote_mean,vote_sd) %>%
filter(votes >= 20) %>%
collect(n=Inf) %>%
inner_join(movies %>% distinct(movie_id),by='movie_id')
# US gross box office, in dollars. this is a view in the db
gross <- tbl(dbcon,'movie_US_gross') %>%
collect(n=Inf) %>%
rename(theat_gross_dollars=gross_dollars) %>%
rename(theat_last_date=last_report_date) %>%
inner_join(movies %>% distinct(movie_id),by='movie_id')
```
Now put them all together:
```{r bechdel_joinem,eval=TRUE,tidy=FALSE}
movie_data <- movies %>%
inner_join(vote_info,by='movie_id') %>%
inner_join(gross,by='movie_id') %>%
inner_join(merge_in,by='movie_id')
```
Now take a look:
```{r bechdel_show,eval=TRUE,tidy=FALSE}
movie_data %>% head(n=20) %>% kable()
```
## Bechdel test data
You can get this via their [API](http://bechdeltest.com/api/v1/doc#getMoviesByTitle). Get
rid of awful JSON with (awful) `jq`:
```{r bechdel_get,engine='bash',comment='bash',cache=TRUE,eval=FALSE}
curl -o bechdel.json "http://bechdeltest.com/api/v1/getMoviesByTitle"
echo -e "id\timdbid\tyear\ttitle\trating" > bechdel.tsv
jq15 -r -s '.[] | .[] | [.id, .imdbid, .year, .title, .rating] | @tsv' bechdel.json >> bechdel.tsv
```
Join them together and save:
```{r bechdel_join,eval=TRUE,tidy=FALSE}
upstream <- readr::read_tsv('bechdel.tsv')
# fix json problems
upstream <- upstream %>%
rename(bechdel_id=id) %>%
mutate(title=gsub('&','&',title)) %>%
mutate(title=gsub(''',"'",title))
# string matching. fun!
library(stringdist)
twoway_key <- movie_data %>%
select(movie_id,title,production_year) %>%
distinct(movie_id,.keep_all=TRUE) %>%
mutate(ttl=phonetic(title)) %>%
inner_join(upstream %>% mutate(ttl=phonetic(title)),by=c('ttl'='ttl','production_year'='year')) %>%
mutate(titled=stringdist(title.x,title.y,method='jw')) %>%
filter(titled < 0.1) %>%
arrange(titled) %>%
distinct(movie_id,.keep_all=TRUE) %>%
arrange(desc(titled)) %>%
select(movie_id,bechdel_id)
joined_data <- movie_data %>% inner_join(twoway_key,by='movie_id') %>%
inner_join(upstream %>% rename(bechdel_title=title,bechdel_test=rating),by='bechdel_id')
# write it so you all can have it.
library(readr)
readr::write_csv(joined_data,path='bechdel_data.csv')
```