/
unbiasedciter.py
273 lines (217 loc) · 11 KB
/
unbiasedciter.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
258
259
260
261
262
263
264
265
266
267
268
269
270
271
272
"""
Hi, this script checks the citations in your bibliography for race and gender biases.
Please contact Max Bertolero at mbertolero@me.com with any questions!
All you need to do is make sure you pass the correct path to your bib file (-bibfile) and to where you store this repo (-homedir),
then type something like:
%run unbiasedciter -authors 'Maxwell Bertolero Danielle Bassett' -bibfile '/Users/maxwell/Documents/GitHub/data/example.bib' -gender_key 'key from gender-api.com'
"""
import json
import os
import pickle
from urllib.parse import quote
from urllib.request import urlopen

import matplotlib.gridspec as gridspec
import matplotlib.pylab as plt
import numpy as np
import pandas as pd
import tqdm
"""
some uncommon libs
"""
# ethnicolr and pybtex may not be installed. If an import fails, install the
# package with pip and retry the import once.
try:
    from ethnicolr import pred_fl_reg_name
except ImportError:
    os.system('pip install ethnicolr')
    # Re-import the predictor this script actually calls; the fallback used
    # to import other ethnicolr helpers and left pred_fl_reg_name undefined.
    from ethnicolr import pred_fl_reg_name
try:
    from pybtex.database import parse_file
except ImportError:
    os.system('pip install pybtex')
    from pybtex.database import parse_file
import seaborn as sns
import argparse
# Command-line interface. Every option is a single-dash flag whose value is
# bound to a module-level variable of the same name below.
parser = argparse.ArgumentParser()
# path of the .bib file to analyse
parser.add_argument('-bibfile',action='store',dest='bibfile',default='/data/example.bib')
# directory of this repository; data files are read from / written to <homedir>/data
parser.add_argument('-homedir',action='store',dest='homedir',default='//Users/maxwell/Documents/GitHub/unbiasedciter/')
parser.add_argument('-method',action='store',dest='method',default='florida')
# "First Last First Last" of the citing paper's first and last author; the
# script cannot run without it, so fail here with a usage message instead of
# crashing later on None.split
parser.add_argument('-authors',action='store',dest='authors',required=True)
# tag used in the names of the output files
parser.add_argument('-bibname',action='store',dest='bibname',default='mydata')
parser.add_argument('-font',action='store',dest='font',default='Palatino') # hey, we all have our favorite
# API key for gender-api.com
parser.add_argument('-gender_key',action='store',dest='gender_key',default=None)
r = parser.parse_args()

# Bind the parsed options explicitly rather than through locals().update(),
# so each module-level name is visible to readers and tooling.
bibfile = r.bibfile
homedir = r.homedir
method = r.method
authors = r.authors
bibname = r.bibname
font = r.font
gender_key = r.gender_key

# From here on `bibfile` is the parsed bibliography, not the path.
bibfile = parse_file(bibfile)
def gender_base():
    """
    for unknown gender, fill with base rates
    you will never / can't run this (that file is too big to share)

    Reads ``<homedir>/data/NewArticleData2019.csv`` and, for every value of
    its ``PY`` column, computes from the ``AG`` column (first character =
    first author, second character = last author; 'M', 'W' or 'U') the
    fraction of men and women among the non-'U' first authors and among the
    non-'U' last authors.

    The result maps each year to ``[fa_m, fa_w, la_m, la_w]``; the rates of
    the last year in the table are reused for 2020. The mapping is pickled to
    ``<homedir>/data/gender_base.pkl`` and also returned.

    Raises ZeroDivisionError if a year has only 'U' first or last authors.
    """
    main_df = pd.read_csv('%s/data/NewArticleData2019.csv'%(homedir),header=0)
    base_rates = {}
    rates = None
    for year in np.unique(main_df.PY.values):
        ydf = main_df[main_df.PY==year].AG
        fa = np.array([x[0] for x in ydf.values])
        la = np.array([x[1] for x in ydf.values])
        fa_known = len(fa[fa!='U'])
        la_known = len(la[la!='U'])
        fa_m = len(fa[fa=='M'])/ fa_known
        fa_w = len(fa[fa=='W'])/ fa_known
        # last-author rates must be selected with the last-author labels
        # (they used to be masked with the first-author labels)
        la_m = len(la[la=='M'])/ la_known
        la_w = len(la[la=='W'])/ la_known
        rates = [fa_m,fa_w,la_m,la_w]
        base_rates[year] = rates
    if rates is not None:
        # no data for 2020: reuse the rates of the last year in the table
        base_rates[2020] = rates
    with open(homedir + '/data/gender_base' + '.pkl', 'wb') as f:
        pickle.dump(base_rates, f, pickle.HIGHEST_PROTOCOL)
    return base_rates
# Load the pre-computed per-year gender base rates. This rebinds the name
# `gender_base` from the function above to the loaded object, which the
# reference loop indexes by year.
# NOTE(security): pickle.load can execute arbitrary code; only load the
# gender_base.pkl that ships with this repository.
with open(homedir + '/data/gender_base' + '.pkl', 'rb') as f:
    gender_base = pickle.load(f)

# -authors is "First Last First Last" (first author, then last author).
# split() without an argument tolerates repeated spaces, which split(' ')
# turned into empty name tokens.
if authors is None:
    raise ValueError("please pass -authors 'First Last First Last' (first author, then last author)")
authors = authors.split()
if len(authors) < 4:
    raise ValueError("-authors needs four names, 'First Last First Last', got: %s" % (' '.join(authors)))
if len(authors) > 4:
    # only the first four tokens are used below; extra tokens (e.g. middle
    # names) shift them, so self-citations may not be recognised
    print ("warning: -authors has more than four names; only the first four are used")
print ('first author is %s %s '%(authors[0],authors[1]))
print ('last author is %s %s '%(authors[2],authors[3]))
print ("we don't count these, but check the predictions file to ensure your names did not slip through!")

# accumulator for the per-reference 8x8 (first author x last author) matrices
citation_matrix = np.zeros((8,8))
matrix_idxs = {'white_m':0,'api_m':1,'hispanic_m':2,'black_m':3,'white_f':4,'api_f':5,'hispanic_f':6,'black_f':7}
# index groups; not read anywhere else in this script
asian = [0,1,2]
black = [3,4]
white = [5,6,7,8,9,11,12]
hispanic = [10]

print ('looping through your references, predicting gender and race')
# one row per predicted author, later written to predictions_<bibname>.csv
columns=['Reference Key','Author','Gender','W','A']
paper_df = pd.DataFrame(columns=columns)
gender = []  # per reference: [mm, wm, mw, ww] gender-pair probabilities
race = []    # per reference: [ww, aw, wa, aa] race-pair probabilities
idx = 0
def _first_and_last_name(person):
    """Return (first first-name token, first last-name token) of a bib author."""
    last = person.last_names[0]
    try:
        first = person.first_names[0]
    except IndexError:
        first = last  # for people like Plato
    return first, last


def _same_person(fname, lname, other_fname, other_lname):
    """Compare two first/last name pairs, ignoring case and outer whitespace."""
    return (fname.lower().strip() == other_fname.lower().strip()
            and lname.lower().strip() == other_lname.lower().strip())


def _is_self_citation(fname, lname):
    """True when the name is the first or the last author given via -authors."""
    return (_same_person(fname, lname, authors[0], authors[1])
            or _same_person(fname, lname, authors[2], authors[3]))


def _to_ascii(text):
    """Drop every non-ASCII character from text."""
    return text.encode("ascii", errors="ignore").decode()


def _predict_race(fname, lname):
    """Return [white, asian, hispanic, black] values for a name from ethnicolr."""
    name_df = pd.DataFrame([{'lname': lname,'fname':fname}],columns=['fname','lname'])
    # NOTE(review): assumes columns 4, 8, 12, 16 of the prediction row are the
    # asian, hispanic, black and white probabilities -- confirm against the
    # installed ethnicolr version.
    p_asian,p_hispanic,p_black,p_white = pred_fl_reg_name(name_df,'lname','fname').values[0][[4, 8, 12, 16]]
    return [p_white,p_asian,p_hispanic,p_black]


# first name -> decoded gender-api.com reply, so each name is queried once
_gender_cache = {}


def _lookup_gender(fname):
    """Return the decoded gender-api.com reply for a first name."""
    if fname not in _gender_cache:
        if gender_key is None:
            raise ValueError("please pass -gender_key 'key from gender-api.com'")
        # quote() keeps names with spaces or reserved characters from breaking the URL
        url = "https://gender-api.com/get?key=" + gender_key + "&name=%s" %(quote(fname))
        with urlopen(url) as response:
            decoded = response.read().decode('utf-8')
        reply = json.loads(decoded)
        if 'gender' not in reply or 'accuracy' not in reply:
            raise RuntimeError('unexpected answer from gender-api.com for %r: %s' % (fname, decoded))
        _gender_cache[fname] = reply
    return _gender_cache[fname]


def _gender_probabilities(reply, base_rates):
    """Return [man, woman] weights: API accuracy if known, else the base rates."""
    if reply['gender'] == 'female':
        return [0,reply['accuracy']/100.]
    if reply['gender'] == 'male':
        return [reply['accuracy']/100.,0]
    # 'unknown' (or any other answer): fall back to the base rates instead of
    # silently reusing the previous reference's values
    return base_rates


# Rows for the predictions table are collected in a list and turned into a
# DataFrame once; DataFrame.append no longer exists in pandas 2.
paper_rows = []
for paper in tqdm.tqdm(bibfile.entries,total=len(bibfile.entries)):
    entry = bibfile.entries[paper]
    if 'author' not in entry.persons.keys():
        continue #some editorials have no authors

    # a missing (or non-numeric) year is treated as 2020
    year = 2020
    if 'year' in entry.fields.keys():
        try:
            year = int(entry.fields['year'])
        except ValueError:
            pass
    # years without base rates use the 1995 rates
    if year not in gender_base.keys():
        gb = gender_base[1995]
    else:
        gb = gender_base[year]

    fa_fname, fa_lname = _first_and_last_name(entry.persons['author'][0])
    la_fname, la_lname = _first_and_last_name(entry.persons['author'][-1])

    # skip references whose first or last author is one of the citing authors
    if _is_self_citation(fa_fname, fa_lname) or _is_self_citation(la_fname, la_lname):
        continue

    fa_fname = _to_ascii(fa_fname)
    fa_lname = _to_ascii(fa_lname)
    la_fname = _to_ascii(la_fname)
    la_lname = _to_ascii(la_lname)

    fa_race = _predict_race(fa_fname, fa_lname)
    la_race = _predict_race(la_fname, la_lname)

    fa_gender = _lookup_gender(fa_fname)
    la_gender = _lookup_gender(la_fname)
    # gb is [fa_m, fa_w, la_m, la_w]
    fa_g = _gender_probabilities(fa_gender, gb[:2])
    la_g = _gender_probabilities(la_gender, gb[2:])

    # one predictions row per author: key, name, gender call, P(white), P(other)
    paper_rows.append([paper,'%s,%s'%(fa_fname,fa_lname),'%s,%s'%(fa_gender['gender'],fa_gender['accuracy']),fa_race[0],np.sum(fa_race[1:])])
    paper_rows.append([paper,'%s,%s'%(la_fname,la_lname),'%s,%s'%(la_gender['gender'],la_gender['accuracy']),la_race[0],np.sum(la_race[1:])])

    # first/last author gender pairs, normalised to sum to 1
    mm = fa_g[0]*la_g[0]
    wm = fa_g[1]*la_g[0]
    mw = fa_g[0]*la_g[1]
    ww = fa_g[1]*la_g[1]
    mm,wm,mw,ww = np.array([mm,wm,mw,ww])/np.sum([mm,wm,mw,ww])
    gender.append([mm,wm,mw,ww])

    # first/last author race pairs: white vs. sum of the other three categories
    fa_other = np.sum(fa_race[1:])
    la_other = np.sum(la_race[1:])
    race.append([fa_race[0] * la_race[0], fa_other * la_race[0], fa_race[0] * la_other, fa_other * la_other])

    # 8-vector per author (gender x race), then first x last outer product
    fa_vector = np.outer(fa_g,fa_race).flatten()
    la_vector = np.outer(la_g,la_race).flatten()
    citation_matrix = citation_matrix + np.outer(fa_vector,la_vector)
    idx = idx + 1

paper_df = pd.DataFrame(paper_rows,columns=columns)
# Average the per-reference pair probabilities into percentages and fill them
# into the citation diversity statement.
if len(gender) == 0:
    # np.mean of an empty list is nan and cannot be unpacked below
    raise ValueError('no references were left to analyse (no authors, or only self-citations)')
# gender pairs: man/man, woman/man, man/woman, woman/woman (first/last author)
mm,wm,mw,ww = np.mean(gender,axis=0)*100
# race pairs: white/white, of color/white, white/of color, of color/of color
WW,aw,wa,aa = np.mean(race,axis=0)*100
# The two-letter tokens (ww, MW, WM, MM, AA, WA, AW, WW) are placeholders that
# are replaced with the percentages below.
statement = "Recent work in several fields of science has identified a bias in citation practices such that papers from women and other minority scholars \
are under-cited relative to the number of such papers in the field (1-5). Here we sought to proactively consider choosing references that reflect the \
diversity of the field in thought, form of contribution, gender, race, ethnicity, and other factors. First, we obtained the predicted gender of the first \
and last author of each reference by using databases that store the probability of a first name being carried by a woman (5, 6). By this measure \
(and excluding self-citations to the first and last authors of our current paper), our references contain ww% woman(first)/woman(last), \
MW% man/woman, WM% woman/man, and MM% man/man. This method is limited in that a) names, pronouns, and social media profiles used to construct the \
databases may not, in every case, be indicative of gender identity and b) it cannot account for intersex, non-binary, or transgender people. \
Second, we obtained predicted racial/ethnic category of the first and last author of each reference by databases that store the probability of a \
first and last name being carried by an author of color (7,8). By this measure (and excluding self-citations), our references contain AA% author of \
color (first)/author of color(last), WA% white author/author of color, AW% author of color/white author, and WW% white author/white author. This method \
is limited in that a) names and Florida Voter Data to make the predictions may not be indicative of racial/ethnic identity, and b) \
it cannot account for Indigenous and mixed-race authors, or those who may face differential biases due to the ambiguous racialization or ethnicization of their names. \
We look forward to future work that could help us to better understand how to support equitable practices in science."
# replacement order is kept as it was: gender placeholders first, then race
for placeholder, percentage in (('MM',mm),('WM',wm),('MW',mw),('ww',ww),
                                ('WW',WW),('AW',aw),('WA',wa),('AA',aa)):
    statement = statement.replace(placeholder,str(np.around(percentage,2)))
print (statement)
# Figure with two heatmaps (first author on the rows, last author on the
# columns): the share of citations per gender/race pair on the left, and the
# percentage over/under-citation relative to the expected matrix on the right.
# The figure and the per-author predictions are then written to <homedir>/data.
cmap = sns.diverging_palette(220, 10, as_cmap=True)
names = ['white_m','api_m','hispanic_m','black_m','white_w','api_w','hispanic_w','black_w']
plt.close()
sns.set(style='white',font=font)
fig, axes = plt.subplots(ncols=2,nrows=1,figsize=(7.5,4))
axes = axes.flatten()
share_ax = axes[0]
overunder_ax = axes[1]

# left panel: observed share of citations, in percent
plt.sca(share_ax)
observed_percent = np.around((citation_matrix/citation_matrix.sum())*100,2)
heat = sns.heatmap(observed_percent,annot=True,ax=share_ax,annot_kws={"size": 8},cmap=cmap,vmax=1,vmin=0)
share_ax.set_ylabel('first author',labelpad=0)
heat.set_yticklabels(names,rotation=0)
share_ax.set_xlabel('last author',labelpad=1)
heat.set_xticklabels(names,rotation=90)
heat.set_title('percentage of citations')

# both matrices are normalised to sum to 1 before they are compared
citation_matrix_sum = citation_matrix / np.sum(citation_matrix)
expected = np.load('%s/data/expected_matrix_florida.npy'%(homedir))
expected = expected/np.sum(expected)
relative_difference = (citation_matrix_sum - expected) / expected
percent_overunder = np.ceil(relative_difference*100)

# right panel: percent over/under-citation, colour scale clipped at +/-50
plt.sca(overunder_ax)
heat = sns.heatmap(np.around(percent_overunder,2),annot=True,ax=overunder_ax,fmt='g',annot_kws={"size": 8},vmax=50,vmin=-50,cmap=cmap)
overunder_ax.set_ylabel('',labelpad=0)
heat.set_yticklabels('')
overunder_ax.set_xlabel('last author',labelpad=1)
heat.set_xticklabels(names,rotation=90)
heat.set_title('percentage over/under-citations')
plt.tight_layout()

print('Saving to %s/data/race_gender_citations_%s.pdf'%(homedir,bibname))
plt.savefig('%s/data/race_gender_citations_%s.pdf'%(homedir,bibname))
print('Saving to %s/data/predictions_%s.csv'%(homedir,bibname))
paper_df.to_csv('%s/data/predictions_%s.csv'%(homedir,bibname))
print("Finished!")