Skip to content

Commit

Permalink
Merge pull request #78 from JannisHoch/dev
Browse files Browse the repository at this point in the history
version 0.0.3
  • Loading branch information
JannisHoch committed Sep 4, 2020
2 parents 10d4beb + 4a75350 commit 0b176f9
Show file tree
Hide file tree
Showing 29 changed files with 1,843 additions and 3,871 deletions.
5 changes: 4 additions & 1 deletion .gitignore
Original file line number Diff line number Diff line change
Expand Up @@ -129,4 +129,7 @@ dmypy.json
.pyre/

#output folders
OUT*/
OUT*/

#latest UCDP file too large for commit
*ged201*
13 changes: 9 additions & 4 deletions conflict_model/__init__.py
Original file line number Diff line number Diff line change
Expand Up @@ -2,10 +2,15 @@

from . import selection
from . import utils
from . import get_boolean_conflict
from . import get_var_from_nc
from . import conflict
from . import variables
from . import machine_learning
from . import data
from . import pipeline
from . import evaluation
from . import models
from . import plots

# Package metadata.
# NOTE(review): __author__ and __version__ are each assigned twice below; this
# looks like the removed and the added line of a diff shown together - confirm
# which one belongs in the file. As written, the later assignment is the one
# that remains in effect.
__author__ = """Jannis M. Hoch"""
__author__ = """Jannis M. Hoch, Niko Wanders, Sophie de Bruin"""
__email__ = 'j.m.hoch@uu.nl'
__version__ = '0.0.2'
__version__ = '0.0.3'
150 changes: 150 additions & 0 deletions conflict_model/conflict.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,150 @@
import geopandas as gpd
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import os, sys

def conflict_in_year_bool(conflict_gdf, extent_gdf, config, sim_year):
    """Creates a list with boolean information whether a conflict took place in a polygon or not.

    The conflict entries of ``sim_year`` are spatially joined with the polygons
    in ``extent_gdf``. A polygon gets value 1 if its identifier appears in the
    joined sub-set, and 0 otherwise. The identifier column is 'watprovID' if
    that column exists, otherwise 'name'.

    Args:
        conflict_gdf (geodataframe): geo-dataframe containing georeferenced information of conflict (tested with PRIO/UCDP data). The columns 'year' and 'best' are read.
        extent_gdf (geodataframe): geo-dataframe containing one or more polygons with geometry information for which values are extracted.
        config (config): parsed configuration settings of run. Not used by this function; kept for interface compatibility.
        sim_year (int): year for which data is extracted.

    Raises:
        AssertionError: raised if the length of output list does not match length of input geo-dataframe.
        KeyError: raised if neither 'watprovID' nor 'name' is available as identifier column.

    Returns:
        list: list containing 0/1 per polygon depending on conflict occurence.
    """

    # select the entries which occured in this year
    temp_sel_year = conflict_gdf.loc[conflict_gdf.year == sim_year]

    # merge the dataframes with polygons and conflict information, creating a sub-set of polygons/regions
    data_merged = gpd.sjoin(temp_sel_year, extent_gdf)

    # pick the identifier column explicitly instead of relying on a bare except,
    # which would also have hidden errors unrelated to a missing column
    merged_id_column = 'watprovID' if 'watprovID' in data_merged.columns else 'name'

    # determine the aggregated amount of fatalities in one region (e.g. water province)
    fatalities_per_watProv = (
        data_merged['best']
        .groupby(data_merged[merged_id_column])
        .sum()
        .to_frame()
        .rename(columns={"best": 'total_fatalities'})
    )

    # identifiers of all regions with at least one conflict entry; looked up once, not per polygon
    conflict_IDs = fatalities_per_watProv.index.values

    # loop through all regions and check if exists in sub-set
    # if so, this means that there was conflict and thus assign value 1
    extent_id_column = 'watprovID' if 'watprovID' in extent_gdf.columns else 'name'
    list_out = [
        1 if extent_gdf.iloc[i][extent_id_column] in conflict_IDs else 0
        for i in range(len(extent_gdf))
    ]

    if not len(extent_gdf) == len(list_out):
        raise AssertionError('the dataframe with polygons has a lenght {0} while the lenght of the resulting list is {1}'.format(len(extent_gdf), len(list_out)))

    return list_out

def get_poly_ID(extent_gdf):
    """Extracts the identifier of each polygon from geodataframe and saves in list.

    The identifier is read from column 'name' if that column exists, otherwise
    from column 'watprovID'.

    Args:
        extent_gdf (geodataframe): geo-dataframe containing one or more polygons.

    Raises:
        AssertionError: raised if the length of output list does not match length of input geo-dataframe.
        KeyError: raised if the geo-dataframe has rows but neither a 'name' nor a 'watprovID' column.

    Returns:
        list: list containing one identifier per polygon, in row order.
    """

    # decide once which column holds the identifier; an explicit check replaces
    # the former bare except, which silently swallowed every kind of error
    id_column = 'name' if 'name' in extent_gdf.columns else 'watprovID'

    # collect the identifier of each polygon, row by row
    list_ID = [extent_gdf.iloc[i][id_column] for i in range(len(extent_gdf))]

    # in the end, the same number of polygons should be in geodataframe and list
    if not len(extent_gdf) == len(list_ID):
        raise AssertionError('the dataframe with polygons has a lenght {0} while the lenght of the resulting list is {1}'.format(len(extent_gdf), len(list_ID)))

    return list_ID

def get_poly_geometry(extent_gdf):
    """Collects the entry of the 'geometry' column of every row in a list.

    Args:
        extent_gdf (geodataframe): geo-dataframe whose rows are read one by one.

    Raises:
        AssertionError: raised if the number of collected entries differs from the number of rows.

    Returns:
        list: the 'geometry' entries in row order.
    """

    print('listing the geometry of all geographical units')

    n_polygons = len(extent_gdf)

    # one positional row lookup per polygon, keeping only its geometry entry
    list_geometry = [extent_gdf.iloc[row_idx]['geometry'] for row_idx in range(n_polygons)]

    # sanity check: one geometry per polygon
    if len(list_geometry) != n_polygons:
        raise AssertionError('the dataframe with polygons has a lenght {0} while the lenght of the resulting list is {1}'.format(len(extent_gdf), len(list_geometry)))

    return list_geometry

def split_conflict_geom_data(X):
    """Splits a 2D array column-wise into first column, second column, and the rest.

    Args:
        X (array): 2D array that is sliced along its second axis.

    Returns:
        tuple: ``(X[:, 0], X[:, 1], X[:, 2:])``, named ID, geometry and data by this function.
    """

    id_column = X[:, 0]
    geometry_column = X[:, 1]
    remaining_columns = X[:, 2:]

    return id_column, geometry_column, remaining_columns

def get_pred_conflict_geometry(X_test_ID, X_test_geom, y_test, y_pred):
    """Combines IDs, geometries, observed and predicted values in one dataframe.

    Args:
        X_test_ID (array): values stored in column 'ID'.
        X_test_geom (array): values stored in column 'geometry'.
        y_test (array): values stored in column 'y_test'.
        y_pred (array): values stored in column 'y_pred'.

    Returns:
        dataframe: columns 'ID', 'geometry', 'y_test', 'y_pred', plus 'overall_hit' which is 1 where 'y_test' equals 'y_pred' and 0 elsewhere.
    """

    column_names = ['ID', 'geometry', 'y_test', 'y_pred']
    stacked = np.column_stack((X_test_ID, X_test_geom, y_test, y_pred))
    df = pd.DataFrame(stacked, columns=column_names)

    #TODO: think through properly whether a dedicated 'conflict_hit' column
    # (1 only where both y_test and y_pred equal 1) should be added as well

    prediction_matches = df['y_test'] == df['y_pred']
    df['overall_hit'] = np.where(prediction_matches, 1, 0)

    return df
124 changes: 124 additions & 0 deletions conflict_model/data.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,124 @@
from conflict_model import conflict, variables
import numpy as np
import xarray as xr
import pandas as pd
import os, sys


def initiate_XY_data(config):
    """Initiates a dictionary with one empty pandas Series per column of the XY data.

    The keys are, in this order: 'poly_ID', 'poly_geometry', one key per option
    in section 'env_vars' of the configuration, and finally 'conflict'.

    Args:
        config (config): parsed configuration settings of run. Section 'env_vars' and option 'verbose' of section 'general' are read.

    Returns:
        dict: dictionary mapping each key to an empty pandas Series.
    """

    # the dtype is stated explicitly: a bare pd.Series() triggers a DeprecationWarning
    # in pandas 1.x and its default dtype depends on the pandas version
    XY = {
        'poly_ID': pd.Series(dtype=object),
        'poly_geometry': pd.Series(dtype=object),
    }

    # one float series per environmental variable listed in the config-file
    for var_name, _ in config.items('env_vars'):
        XY[str(var_name)] = pd.Series(dtype=float)

    # conflict is added last so that it becomes the last column of the data
    XY['conflict'] = pd.Series(dtype=int)

    if config.getboolean('general', 'verbose'):
        print('{}'.format(XY) + os.linesep)

    return XY

def _append_values(series, values):
    """Returns ``series`` extended by ``values``, with a fresh consecutive index."""
    # pd.concat replaces Series.append, which was removed in pandas 2.0
    return pd.concat([series, pd.Series(values)], ignore_index=True)


def fill_XY(XY, config, conflict_gdf, extent_active_polys_gdf):
    """Fills the series of the XY-dictionary with data for each simulation year.

    For every year in ``np.arange(y_start, y_end)`` and every key of ``XY``,
    a list with one batch of values is obtained and appended to the series
    stored under that key:

    - 'conflict': values from conflict.conflict_in_year_bool
    - 'poly_ID': values from conflict.get_poly_ID
    - 'poly_geometry': values from conflict.get_poly_geometry
    - any other key: treated as an option of section 'env_vars' pointing to a
      nc-file; values are read with a function from the variables module,
      chosen by the dtype of the file's time variable.

    NOTE(review): np.arange excludes y_end, so the last year read is y_end - 1 -
    confirm that this is intended.

    Args:
        XY (dict): dictionary with one pandas Series per key; updated in place.
        config (config): parsed configuration settings of run.
        conflict_gdf (geodataframe): geo-dataframe passed on to conflict.conflict_in_year_bool.
        extent_active_polys_gdf (geodataframe): geo-dataframe with polygons passed on to all extraction functions.

    Raises:
        Warning: raised if the time variable of a nc-file is neither float32/float64 nor datetime64[ns].

    Returns:
        array: the filled dictionary converted to a dataframe and then to a numpy array.
    """

    verbose = config.getboolean('general', 'verbose')
    y_start = config.getint('settings', 'y_start')
    y_end = config.getint('settings', 'y_end')

    if verbose: print('reading data for period from', str(y_start), 'to', str(y_end) + os.linesep)

    # go through all simulation years as specified in config-file
    for sim_year in np.arange(y_start, y_end, 1):

        if verbose: print(os.linesep + 'entering year {}'.format(sim_year) + os.linesep)

        # go through all keys in dictionary
        for key, value in XY.items():

            if key == 'conflict':
                data_list = conflict.conflict_in_year_bool(conflict_gdf, extent_active_polys_gdf, config, sim_year)

            elif key == 'poly_ID':
                data_list = conflict.get_poly_ID(extent_active_polys_gdf)

            elif key == 'poly_geometry':
                data_list = conflict.get_poly_geometry(extent_active_polys_gdf)

            else:
                nc_path = os.path.join(config.get('general', 'input_dir'), config.get('env_vars', key))

                # the file is only opened to inspect the dtype of its time variable;
                # the context manager makes sure it is closed again
                with xr.open_dataset(nc_path) as nc_ds:
                    time_dtype = nc_ds.time.dtype

                if (time_dtype == np.float32) or (time_dtype == np.float64):
                    data_list = variables.nc_with_float_timestamp(extent_active_polys_gdf, config, key, sim_year)

                elif time_dtype == 'datetime64[ns]':
                    data_list = variables.nc_with_continous_datetime_timestamp(extent_active_polys_gdf, config, key, sim_year)

                else:
                    # the original code formatted an undefined name here, which raised a
                    # NameError instead of this Warning
                    raise Warning('this nc-file does have a different dtype for the time variable than currently supported: {}'.format(nc_path))

            # only the value of an existing key is replaced, so iterating over XY stays valid
            XY[key] = _append_values(value, data_list)

    if verbose: print('...reading data DONE' + os.linesep)

    return pd.DataFrame.from_dict(XY).to_numpy()

def split_XY_data(XY, config):
    """Drops rows with missing values and separates the XY-array into X and Y.

    Args:
        XY (array): 2D array whose last column holds the conflict data and whose other columns hold the variable values.
        config (config): parsed configuration settings of run. Only option 'verbose' of section 'general' is read.

    Returns:
        tuple: ``(X, Y)`` where X contains all columns but the last one, and Y the last column cast to int; both only for rows without missing values.
    """

    verbose = config.getboolean('general', 'verbose')

    XY = pd.DataFrame(XY)
    if verbose: print('number of data points including missing values:', len(XY))

    XY = XY.dropna()
    if verbose: print('number of data points excluding missing values:', len(XY))

    XY = XY.to_numpy()
    X = XY[:, :-1] # since conflict is the last column, we know that all previous columns must be variable values
    Y = XY[:, -1]
    Y = Y.astype(int)

    if verbose:
        n_conflict = int(np.count_nonzero(Y))
        # if no row survived dropna() there is nothing to divide by; report 0 percent instead of crashing
        fraction_Y_1 = 100*n_conflict/len(Y) if len(Y) > 0 else 0.0
        print('from this, {0} points are equal to 1, i.e. represent conflict occurence. This is a fraction of {1} percent.'.format(n_conflict, round(fraction_Y_1, 2)))

    return X, Y

0 comments on commit 0b176f9

Please sign in to comment.