-
Notifications
You must be signed in to change notification settings - Fork 0
/
country.py
415 lines (342 loc) · 13.4 KB
/
country.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
258
259
260
261
262
263
264
265
266
267
268
269
270
271
272
273
274
275
276
277
278
279
280
281
282
283
284
285
286
287
288
289
290
291
292
293
294
295
296
297
298
299
300
301
302
303
304
305
306
307
308
309
310
311
312
313
314
315
316
317
318
319
320
321
322
323
324
325
326
327
328
329
330
331
332
333
334
335
336
337
338
339
340
341
342
343
344
345
346
347
348
349
350
351
352
353
354
355
356
357
358
359
360
361
362
363
364
365
366
367
368
369
370
371
372
373
374
375
376
377
378
379
380
381
382
383
384
385
386
387
388
389
390
391
392
393
394
395
396
397
398
399
400
401
402
403
404
405
406
407
408
409
410
411
412
413
414
415
from pathlib import Path
from typing import Any, Iterator, NamedTuple
import numpy as np
import pandas as pd
import geopandas as geo # type: ignore
from .frame_logger import FrameLogger, silent_logger
# Calendar years covered by the analysis, as strings matching the CSV column
# labels in the per-country reports file.
YEAR_LABELS = tuple(str(year) for year in range(2019, 2024))

# Expected worldwide report totals per year; used to validate ingested data.
REPORT_TOTALS = {
    '2019': 16_987_361,
    '2020': 21_751_085,
    '2021': 29_397_681,
    '2022': 32_059_029,
    '2023': 36_210_368,
}

# Natural Earth entities that ship without an ISO Alpha-3 code ("-99");
# read_geometries() assigns codes to exactly these five.
# (Set literal instead of set([...]) — same contents, idiomatic form.)
_PROBLEMATIC_GEOMETRIES = {'France', 'Kosovo', 'N. Cyprus', 'Norway', 'Somaliland'}
def read_reports(path: str | Path) -> pd.DataFrame:
    """
    Read the per-country CSAM report counts from the CSV file at *path*.

    Returns a dataframe indexed by (iso3, year) with an integer `reports`
    column and a `reports_pct` column giving each country's share of that
    year's worldwide total.

    Raises:
        AssertionError: if row counts, null counts, yearly totals, or
            percentage sums deviate from the expected values.
    """
    # Read data
    reports = pd.read_csv(path, thousands=',').drop(columns='country')
    # Validate data
    actual = reports.shape[0]
    if actual != 252:
        # Message fixed: the check expects 252 rows, not 250.
        raise AssertionError(f'{actual:,d} instead of 252 rows in "{path}"')
    actual = reports['iso3'].isna().sum()
    if actual != 1:
        raise AssertionError(
            f'{actual:,d} instead of 1 row without ISO Alpha-3 code in "{path}"'
        )
    for year, expected_nulls in zip(YEAR_LABELS, (10, 7, 7, 5, 8)):
        actual_nulls = reports[year].isna().sum()
        if actual_nulls != expected_nulls:
            raise AssertionError(
                f'{year}: {actual_nulls:,d} instead of {expected_nulls:,d} countries '
                'with no reports'
            )
    # Clean up and reorganize data
    reports = (
        reports
        # NCMEC includes a line for reports without country in each disclosure
        # but adds them to USA's tally for analysis. We do the same.
        .assign(iso3=lambda df: df['iso3'].fillna('USA'))
        .fillna(0)
        .melt(
            id_vars=['iso3'],
            value_vars=YEAR_LABELS,
            var_name='year',
            value_name='reports',
        )
        .astype({'iso3': 'category', 'year': 'category', 'reports': 'int'})
        .groupby(['iso3', 'year'])
        .sum()
    )
    # 252 rows - 1 extra row formerly without ISO3 then USA - 1 extra row GUF
    actual = reports.index.get_level_values('iso3').nunique()
    if actual != 250:
        raise AssertionError(
            f'{actual:,d} instead of 250 ISO Alpha-3 codes with reports'
        )
    # Sum up reports and add column with percentage fractions.
    total_yearly_reports = reports.groupby(level='year')['reports'].sum()
    reports['reports_pct'] = reports['reports'] / total_yearly_reports * 100
    # Validate totals against the published yearly numbers.
    for year, expected_total in REPORT_TOTALS.items():
        if year not in total_yearly_reports.index:
            continue
        actual_total = total_yearly_reports[year]
        if actual_total != expected_total:
            raise AssertionError(
                # Message fixed: a space was missing before 'total reports'.
                f'{year}: {actual_total:,d} instead of {expected_total:,d} '
                'total reports'
            )
    # Validate that the percentage fractions sum to 100 for every year.
    actual_pct = reports.groupby(level='year')['reports_pct'].sum()
    expected_pct = pd.Series([100.0] * len(YEAR_LABELS), index=YEAR_LABELS)
    if not actual_pct.equals(expected_pct):
        raise AssertionError(f'{actual_pct} instead of {expected_pct} report fractions')
    return reports
def read_populations(path: str | Path) -> pd.DataFrame:
    """
    Read per-country population counts from the pipe-separated file at *path*.

    Returns a dataframe indexed by (iso3, year) with a `population` column
    and a `population_pct` column giving each country's share of that year's
    total.

    Raises:
        AssertionError: if row or country counts, or the percentage sums,
            deviate from the expected values.
    """
    raw = pd.read_csv(
        path,
        sep="|",
        usecols=['Iso3', 'VariantId', 'TimeLabel', 'SexId', 'Value'],
        dtype={'Iso3': 'category', 'TimeLabel': 'category', 'Value': 'int'},
    ).rename(columns={'Iso3': 'iso3', 'TimeLabel': 'year', 'Value': 'population'})
    # Keep only rows with VariantId 4 and SexId 3, then drop those columns.
    selected = raw[(raw['VariantId'] == 4) & (raw['SexId'] == 3)]
    populations = (
        selected.drop(columns=['VariantId', 'SexId']).set_index(['iso3', 'year'])
    )
    row_no = populations.shape[0]
    if row_no != 1_180:
        raise AssertionError(
            f'{row_no:,d} instead of 1,180 rows with population counts'
        )
    country_no = populations.index.get_level_values('iso3').nunique()
    if country_no != 236:
        raise AssertionError(
            f'{country_no:,d} instead of 236 countries with population counts'
        )
    # Compute total population per year and add column with percentage fraction.
    yearly_totals = populations.groupby(level='year')['population'].sum()
    populations['population_pct'] = populations['population'] / yearly_totals * 100
    actual_pct = populations.groupby(level='year')['population_pct'].sum()
    expected_pct = pd.Series([100.0] * len(YEAR_LABELS), index=YEAR_LABELS)
    if not actual_pct.equals(expected_pct):
        raise AssertionError(
            f'{actual_pct} instead of {expected_pct} population fractions'
        )
    return populations
def read_online(path: str | Path) -> pd.DataFrame:
online = (
pd.
read_csv(
path,
usecols=['iso3', 'year', 'online'],
dtype={'iso3': 'category', 'year': 'category', 'online': 'int'},
)
.set_index(['iso3', 'year'])
)
total_yearly_online = online.groupby(level='year')['online'].sum()
online['online_pct'] = online['online'] / total_yearly_online * 100
if len(online) != 6_406:
raise AssertionError(
f'{len(online)} instead of 6,406 rows for online populations'
)
return online
def read_countries(path: str | Path) -> pd.DataFrame:
countries = pd.read_csv(path, index_col='iso3', dtype='category', na_filter=False)
if countries.at['NAM', 'iso2'] != 'NA':
raise AssertionError("Namibia's ISO2 code (NA) is *not* null")
return countries
def read_regions(path: str | Path) -> pd.DataFrame:
regions = (
(rs := pd.read_csv(path))
.merge(
rs,
how='left',
left_on='superregion',
right_on='region',
suffixes=('', '2'),
)
.assign(continent=lambda df: df['superregion2'].fillna(df['superregion']))
.drop(columns=['superregion2', 'region2'])
.astype('category')
)
return regions
def read_arab_league(path: str | Path) -> pd.DataFrame:
arab_league = pd.read_csv(path).drop(columns=['country']).astype('category')
assert arab_league.shape[0] == 22
return arab_league
def read_geometries(path: str | Path) -> pd.DataFrame:
    """
    Read country geometries from the Natural Earth shapefile at *path* and
    return them indexed by ISO Alpha-3 code, with one (dissolved) geometry
    per code.

    Natural Earth marks five entities with the placeholder code "-99"; this
    function assigns real codes, folding North Cyprus into Cyprus and
    Somaliland into Somalia since neither is internationally recognized.
    """
    # https://github.com/geopandas/geopandas/blob/04c2dee547777d9e87f9df4c85cc372a03b29f93/geopandas/datasets/naturalearth_creation.py#L87
    geometries = (
        geo.read_file(path)
        .rename(columns={'ISO_A3': 'iso3', 'NAME': 'name'})
        .loc[:, ['iso3', 'name', 'geometry']]
    )
    # Check that countries with missing ISO Alpha-3 codes are the expected
    # five before patching them up.
    missing_iso3 = set(geometries.query('iso3 == "-99"')['name'].to_list())
    if missing_iso3 != _PROBLEMATIC_GEOMETRIES:
        raise AssertionError(
            f'geometries for {missing_iso3} instead of {_PROBLEMATIC_GEOMETRIES} '
            'without ISO Alpha-3 code'
        )
    # Assign codes by country name; CYP and SOM each absorb an extra entity.
    iso3_fixes = {
        'France': 'FRA',
        'Kosovo': 'XKX',
        'N. Cyprus': 'CYP',
        'Norway': 'NOR',
        'Somaliland': 'SOM',
    }
    for name, code in iso3_fixes.items():
        geometries.loc[geometries['name'] == name, 'iso3'] = code
    geometries = (
        geometries.drop(columns=['name']).astype({'iso3': 'category'}).set_index('iso3')
    )
    # There are two fewer countries thanks to N. Cyprus and Somaliland.
    row_count = geometries.shape[0]
    if row_count != 177:
        raise AssertionError(f'{row_count:,d} instead of 177 geometries')
    country_count = geometries.index.nunique()
    if country_count != 175:
        raise AssertionError(
            f'{country_count:,d} instead of 175 countries with geometries'
        )
    # Merge rows sharing an ISO code into a single geometry per code.
    geometries = geometries.dissolve(by='iso3')
    row_count = geometries.shape[0]
    if row_count != 175:
        raise AssertionError(
            f'{row_count:,d} instead of 175 geometries after dissolution'
        )
    return geometries
def without_populations(
    reports: pd.DataFrame, populations: pd.DataFrame, countries: pd.DataFrame
) -> tuple[pd.DataFrame, pd.DataFrame]:
    """
    Determine the countries that have reports but no population counts.

    Returns a pair of dataframes: the countries without population counts
    (indexed by iso3, carrying their names from *countries*) and those
    countries' summed reports per year (indexed by year).

    Raises:
        AssertionError: if the yearly report sums deviate from the expected
            values.
    """
    countries_without = (
        reports.index
        .get_level_values('iso3')
        .difference(populations.index.get_level_values('iso3'))
        .to_frame()
        .drop(columns='iso3')
        .merge(countries, how='left', on='iso3')
        .drop(columns='iso2')
    )
    reports_without = (
        reports[reports.index.get_level_values('iso3').isin(countries_without.index)]
        .groupby('year')
        .sum()
    )
    expected = [28, 97, 243, 117, 58]
    actual = reports_without['reports']
    if not np.array_equal(actual, expected):
        raise AssertionError(
            # Fixed message spacing: the previous f-string pieces each began
            # with a space, producing doubled spaces in the output.
            'reports for countries without population '
            f'{",".join(str(n) for n in actual)} does not match expected '
            f'{",".join(str(n) for n in expected)}'
        )
    return countries_without, reports_without
def merge_reports_per_country(
    reports: pd.DataFrame,
    populations: pd.DataFrame,
    online: pd.DataFrame,
    countries: pd.DataFrame,
    regions: pd.DataFrame,
    arab_league: pd.DataFrame,
) -> pd.DataFrame:
    """
    Merge the individual tables into one dataframe indexed by (iso3, year),
    adding a `reports_per_capita` column and a boolean `arab_league`
    membership flag.

    Only (iso3, year) pairs present in both *reports* and *populations*
    survive (inner join); the remaining tables are attached via left joins.

    Raises:
        AssertionError: if the merged table's row count differs from the
            population table's.
    """
    df = (
        reports.merge(populations, how='inner', left_index=True, right_index=True)
        # index is (iso3, year)
        .merge(online, how='left', on=['iso3', 'year'])
        # index still is (iso3, year)
        .reset_index(level='year')
        # index is iso3
        .merge(countries, how='left', on='iso3')
        .reset_index()
        # index is row number
        .merge(regions, how='left', on='region')
        .astype({'region': 'category'})  # Restore category lost on merge
    )
    # Insert at position 4 so the derived metric sits next to the report
    # columns.
    df.insert(4, 'reports_per_capita', df['reports'] / df['population'])
    df['arab_league'] = df['iso3'].isin(arab_league['iso3'])
    df = df.set_index(['iso3', 'year'])
    # The inner join with populations bounds the expected row count.
    expected_rows = populations.shape[0]
    actual_rows = df.shape[0]
    if actual_rows != expected_rows:
        raise AssertionError(
            f'{actual_rows:,d} instead of {expected_rows:,d} rows in merged table'
        )
    return df
def summarize_arab_league(frame: pd.DataFrame) -> pd.DataFrame:
    """
    Append one synthetic row per year summing reports and populations over
    the Arab League's member states, using U+262A (star and crescent) as the
    pseudo ISO code.
    """

    def _summary_row(year: str) -> dict:
        # One aggregate row covering all member states for the given year.
        members = frame.query(f'year == "{year}" and arab_league == True')
        total_reports = members['reports'].sum()
        total_population = members['population'].sum()
        return {
            'iso3': '\u262A',
            'year': year,
            'reports': total_reports,
            'reports_pct': members['reports_pct'].sum(),
            'reports_per_capita': total_reports / total_population,
            'population': total_population,
            'population_pct': members['population_pct'].sum(),
            'country': 'Arab League',
            'iso2': '\u262A',
            'region': None,
            'superregion': None,
            'continent': None,
            'arab_league': False,  # Only True for members, not a synthetic summary
        }

    addendum = pd.DataFrame(
        [_summary_row(year) for year in YEAR_LABELS]
    ).set_index(['iso3', 'year'])
    return pd.concat([frame, addendum])
class ReportsPerCountry(NamedTuple):
    """
    The result of ingesting the per-country report data: the fully merged
    table plus each constituent table.
    """

    # Merged table indexed by (iso3, year), including the Arab League
    # summary rows appended by summarize_arab_league().
    reports_per_capita: pd.DataFrame
    # Yearly report counts per country (read_reports output).
    reports: pd.DataFrame
    # Yearly population counts per country (read_populations output).
    populations: pd.DataFrame
    # Yearly internet-user counts per country (read_online output).
    online: pd.DataFrame
    # Country metadata indexed by ISO Alpha-3 code (read_countries output).
    countries: pd.DataFrame
    # Region table with derived continent column (read_regions output).
    regions: pd.DataFrame
    # Arab League membership table (read_arab_league output).
    arab_league: pd.DataFrame
    # Country geometries, or None when load_geometries was not requested.
    geometries: None | pd.DataFrame
def ingest_reports_per_country(
    path: str | Path,
    *,
    logger: None | FrameLogger = None,
    load_geometries: bool = False,
) -> ReportsPerCountry:
    """
    Read all tables from the directory at *path*, merge them, and return the
    merged table together with its constituent tables.

    Each table is passed to *logger* right after it is read (the silent
    logger is used by default). Geometries are read only when
    *load_geometries* is True; otherwise the geometries field is None.
    """
    directory = Path(path)
    log = silent_logger if logger is None else logger

    reports = read_reports(directory / 'csam-reports-per-country.csv')
    log(reports, caption='reports')
    populations = read_populations(directory / 'populations.csv')
    log(populations, caption='populations')
    online = read_online(directory / 'internet-users.csv')
    log(online, caption='online')
    countries = read_countries(directory / 'countries.csv')
    log(countries, caption='countries')
    regions = read_regions(directory / 'regions.csv')
    log(regions, caption='regions')
    arab_league = read_arab_league(directory / 'arab-league.csv')
    log(arab_league, caption='arab_league')

    geometries = None
    if load_geometries:
        geometries = read_geometries(
            directory / 'naturalearth/ne_110m_admin_0_countries.shp'
        )
        log(geometries, caption='geometries')

    reports_per_capita = merge_reports_per_country(
        reports, populations, online, countries, regions, arab_league
    )
    log(reports_per_capita, caption='reports per capita')
    reports_per_capita = summarize_arab_league(reports_per_capita)

    return ReportsPerCountry(
        reports_per_capita,
        reports,
        populations,
        online,
        countries,
        regions,
        arab_league,
        geometries,
    )
def reports_per_capita_country_year(
    reports_per_country: ReportsPerCountry,
) -> Iterator[Any]:
    """
    Create an iterator over the year and reports per capita per country, with
    the entries sorted in descending order by reports per capita. The index
    of each yielded dataframe is a country's rank (starting at 1).
    """
    ranked = reports_per_country.reports_per_capita.drop(
        columns=['region', 'superregion', 'continent']
    ).sort_values('reports_per_capita', ascending=False)
    for year, group in ranked.groupby('year'):
        table = group.reset_index()
        # Replace the zero-based row numbers with one-based ranks.
        table.index = pd.RangeIndex(1, len(table) + 1, name='rank')
        yield year, table