Skip to content

Commit

Permalink
Merge pull request #15 from JannisHoch/dev
Browse files Browse the repository at this point in the history
aligned runner.py with notebook
  • Loading branch information
JannisHoch committed Jun 18, 2020
2 parents 57843f2 + 350e3b2 commit c35568a
Show file tree
Hide file tree
Showing 10 changed files with 435 additions and 720 deletions.
2 changes: 1 addition & 1 deletion .vscode/settings.json
Original file line number Diff line number Diff line change
@@ -1,4 +1,4 @@
{
"python.pythonPath": "C:\\Users\\hoch0001\\AppData\\Local\\Continuum\\anaconda3\\python.exe",
"python.pythonPath": "C:\\Users\\hoch0001\\AppData\\Local\\Continuum\\anaconda3\\envs\\conflict_model\\python.exe",
"restructuredtext.confPath": "${workspaceFolder}\\docs"
}
6 changes: 3 additions & 3 deletions conflict_model/__init__.py
Original file line number Diff line number Diff line change
Expand Up @@ -2,9 +2,9 @@

from . import selection
from . import utils
from . import analysis
from . import env_vars_nc
from . import get_boolean_conflict
from . import get_var_from_nc

__author__ = """Jannis M. Hoch"""
__email__ = 'j.m.hoch@uu.nl'
__version__ = '0.0.1-beta'
__version__ = '0.0.1'
64 changes: 0 additions & 64 deletions conflict_model/analysis.py

This file was deleted.

83 changes: 0 additions & 83 deletions conflict_model/env_vars_nc.py

This file was deleted.

49 changes: 49 additions & 0 deletions conflict_model/get_boolean_conflict.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,49 @@
import geopandas as gpd
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import os

def conflict_in_year_bool(conflict_gdf, extent_gdf, config, sim_year):
    """Creates a list with boolean information whether a conflict took place in a polygon or not.

    Conflicts of the given year are spatially joined with the polygons; every polygon
    whose 'watprovID' shows up in the join result is flagged with 1, all others with 0.

    Args:
        conflict_gdf (geodataframe): geo-dataframe containing georeferenced information of conflict (tested with PRIO/UCDP data). Must provide a 'year' column.
        extent_gdf (geodataframe): geo-dataframe containing one or more polygons with geometry information for which values are extracted. Must provide a 'watprovID' column.
        config (config): parsed configuration settings of run (not used by this function, kept for a uniform call signature).
        sim_year (int): year for which data is extracted.

    Returns:
        list: list containing 0/1 per polygon depending on conflict occurrence, in the row order of extent_gdf and with the same length.
    """

    print('determining whether a conflict took place or not')

    # select the entries which occurred in this year
    conflicts_in_year = conflict_gdf.loc[conflict_gdf.year == sim_year]

    if conflicts_in_year.empty:
        # no conflict recorded for this year: nothing to join, no polygon is flagged
        list_out = [0] * len(extent_gdf)
    else:
        # join conflicts with polygons, creating the sub-set of polygons/regions with conflict
        data_merged = gpd.sjoin(conflicts_in_year, extent_gdf)

        # only the identity of the regions matters here, not the number of fatalities
        conflict_ids = data_merged['watprovID'].dropna().unique()

        # vectorised membership test, one 0/1 entry per polygon
        list_out = extent_gdf['watprovID'].isin(conflict_ids).astype(int).tolist()

    print('...DONE' + os.linesep)

    return list_out
130 changes: 130 additions & 0 deletions conflict_model/get_var_from_nc.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,130 @@
import xarray as xr
import rasterio as rio
import pandas as pd
import geopandas as gpd
import rasterstats as rstats
import numpy as np
import matplotlib.pyplot as plt
import os, sys

def nc_with_integer_timestamp(extent_gdf, config, var_name, sim_year, stat_func='mean'):
    """Extracts a statistical value from a netCDF-file for each polygon in extent_gdf for a given year.

    The path to the netCDF-file is read from the config-file (section 'env_vars', key var_name,
    relative to 'input_dir' in section 'general').
    By default, the mean value of all cells within a polygon is computed.
    The resulting list does not contain additional meta-information about the files or polygons
    and is mostly intended for data-driven approaches such as machine learning.

    NOTE:
        The var_name must be identical to the key in the config-file.

    NOTE:
        This function is specifically written for netCDF-files where the time variable contains integer (year-)values, e.g. 1995, 1996, ...

    NOTE:
        Works only with nc-files with annual data.

    Args:
        extent_gdf (geodataframe): geo-dataframe containing one or more polygons with geometry information for which values are extracted
        config (config): parsed configuration settings of run
        var_name (str): name of variable in nc-file, must also be the same under which path to nc-file is specified in cfg-file
        sim_year (int): year for which data is extracted
        stat_func (str, optional): Statistical function to be applied, choose from available options in rasterstats package. Defaults to 'mean'.

    Raises:
        ValueError: raised if the extracted variable at a time step does not contain data

    Returns:
        list: list containing statistical value per polygon, i.e. with same length as extent_gdf
    """
    # get path to netCDF-file.
    nc_fo = os.path.join(config.get('general', 'input_dir'),
                         config.get('env_vars', var_name))

    # report the statistic that is actually applied (identical output for the default 'mean')
    print('calculating {0} {1} per aggregation unit from file {2} for year {3}'.format(stat_func, var_name, nc_fo, sim_year))

    # open nc-file with xarray and pull the values of the specified year into memory;
    # the context manager makes sure the file handle is released again
    with xr.open_dataset(nc_fo) as nc_ds:
        nc_arr_vals = nc_ds[var_name].sel(time=sim_year).values

    if nc_arr_vals.size == 0:
        raise ValueError('no data was found for this year in the nc-file {}, check if all is correct'.format(nc_fo))

    # open nc-file with rasterio only to get the affine information, then close it again
    with rio.open(nc_fo) as nc_raster:
        affine = nc_raster.transform

    # compute the statistic for every polygon, keeping the row order of extent_gdf
    list_out = [
        rstats.zonal_stats(geom, nc_arr_vals, affine=affine, stats=stat_func)[0][stat_func]
        for geom in extent_gdf.geometry
    ]

    print('...DONE' + os.linesep)

    return list_out

def nc_with_continous_regular_timestamp(extent_gdf, config, var_name, sim_year, stat_func='mean'):
    """Extracts a statistical value from a netCDF-file for each polygon in extent_gdf for a given year.

    The path to the netCDF-file is read from the config-file (section 'env_vars', key var_name,
    relative to 'input_dir' in section 'general'). The time values of the file are converted
    to calendar years and the timestep matching sim_year is selected.
    By default, the mean value of all cells within a polygon is computed.
    The resulting list does not contain additional meta-information about the files or polygons
    and is mostly intended for data-driven approaches such as machine learning.

    NOTE:
        The var_name must be identical to the key in the config-file.

    NOTE:
        Works only with nc-files with annual data.

    Args:
        extent_gdf (geodataframe): geo-dataframe containing one or more polygons with geometry information for which values are extracted
        config (config): parsed configuration settings of run
        var_name (str): name of variable in nc-file, must also be the same under which path to nc-file is specified in cfg-file
        sim_year (int): year for which data is extracted
        stat_func (str, optional): Statistical function to be applied, choose from available options in rasterstats package. Defaults to 'mean'.

    Raises:
        ValueError: raised if specified year cannot be found in years in nc-file
        ValueError: raised if more than one timestep in the nc-file falls into the specified year
        ValueError: raised if the extracted variable at a time step does not contain data

    Returns:
        list: list containing statistical value per polygon, i.e. with same length as extent_gdf
    """
    # get path to netCDF-file.
    nc_fo = os.path.join(config.get('general', 'input_dir'),
                         config.get('env_vars', var_name))

    # report the statistic that is actually applied (identical output for the default 'mean')
    print('calculating {0} {1} per aggregation unit from file {2} for year {3}'.format(stat_func, var_name, nc_fo, sim_year))

    # open nc-file with xarray as dataset; the context manager releases the file handle
    with xr.open_dataset(nc_fo) as nc_ds:
        # get xarray data-array for specified variable
        nc_var = nc_ds[var_name]

        # get years contained in nc-file as integer array to be compatible with sim_year
        years = pd.to_datetime(nc_ds.time.values).year.to_numpy()

        # positions of all timesteps which fall into sim_year
        matches = np.where(years == sim_year)[0]
        if matches.size == 0:
            raise ValueError('the simulation year {0} can not be found in file {1}'.format(sim_year, nc_fo))
        if matches.size > 1:
            # only annual data is supported, so an ambiguous match is reported explicitly
            raise ValueError('the simulation year {0} matches {1} timesteps in file {2}, only annual data is supported'.format(sim_year, matches.size, nc_fo))

        # get values from data-array for specified year based on index
        sim_year_idx = int(matches[0])
        nc_arr_vals = nc_var.isel(time=sim_year_idx).values

    if nc_arr_vals.size == 0:
        raise ValueError('no data was found for this year in the nc-file {}, check if all is correct'.format(nc_fo))

    # open nc-file with rasterio only to get the affine information, then close it again
    with rio.open(nc_fo) as nc_raster:
        affine = nc_raster.transform

    # compute the statistic for every polygon, keeping the row order of extent_gdf
    list_out = [
        rstats.zonal_stats(geom, nc_arr_vals, affine=affine, stats=stat_func)[0][stat_func]
        for geom in extent_gdf.geometry
    ]

    print('...DONE' + os.linesep)

    return list_out
7 changes: 4 additions & 3 deletions data/run_setting.cfg
Original file line number Diff line number Diff line change
Expand Up @@ -4,7 +4,7 @@ output_dir=C:\Users\hoch0001\Documents\_code\conflict_model\data\OUT

[settings]
y_start=2000
y_end=2011
y_end=2015

[extent]
shp=waterProvinces/waterProvinces_Africa.shp
Expand All @@ -21,5 +21,6 @@ zones=BWh,BSh
code2class=KoeppenGeiger/classification_codes.txt

[env_vars]
GDP_PPP=GDP_HDI/GDP_per_capita_PPP_1990_2015_Africa.nc
evaporation=PCRGLOBWB/totalEvap/totalEvaporation_monthTot_output_2000_2015_Africa_yearmean.nc
# the variable name (key) used here must be identical to the variable name in the nc-file
GDP_per_capita_PPP=GDP_HDI/GDP_per_capita_PPP_1990_2015_Africa.nc
total_evaporation=PCRGLOBWB/totalEvap/totalEvaporation_monthTot_output_2000_2015_Africa_yearmean.nc

0 comments on commit c35568a

Please sign in to comment.