Merge pull request #78 from JannisHoch/dev

version 0.0.3
JannisHoch · Sep 4, 2020 · 0b176f9 · 0b176f9
2 parents 10d4beb + 4a75350
commit 0b176f9
Show file tree

Hide file tree

Showing 29 changed files with 1,843 additions and 3,871 deletions.
diff --git a/.gitignore b/.gitignore
@@ -129,4 +129,7 @@ dmypy.json
 .pyre/
 
 #output folders
-OUT*/
+OUT*/
+
+#latest UCDP file too large for commit
+*ged201*
diff --git a/conflict_model/__init__.py b/conflict_model/__init__.py
@@ -2,10 +2,15 @@
 
 from . import selection
 from . import utils
-from . import get_boolean_conflict
-from . import get_var_from_nc
+from . import conflict
+from . import variables
 from . import machine_learning
+from . import data
+from . import pipeline
+from . import evaluation
+from . import models
+from . import plots
 
-__author__ = """Jannis M. Hoch"""
+__author__ = """Jannis M. Hoch, Niko Wanders, Sophie de Bruin"""
 __email__ = 'j.m.hoch@uu.nl'
-__version__ = '0.0.2'
+__version__ = '0.0.3'
diff --git a/conflict_model/conflict.py b/conflict_model/conflict.py
@@ -0,0 +1,150 @@
+import geopandas as gpd
+import pandas as pd
+import numpy as np
+import matplotlib.pyplot as plt
+import os, sys
+
+def conflict_in_year_bool(conflict_gdf, extent_gdf, config, sim_year):
+    """Flags for each polygon whether at least one conflict was recorded in it in a given year.
+
+    The conflict records of ``sim_year`` are spatially joined with the polygons. Each polygon
+    whose identifier occurs in the joined records gets a 1, every other polygon a 0.
+    The identifier column is 'watprovID' if present, otherwise 'name'.
+
+    Args:
+        conflict_gdf (geodataframe): geo-dataframe containing georeferenced information of conflict (tested with PRIO/UCDP data). The columns 'year' and 'best' are read.
+        extent_gdf (geodataframe): geo-dataframe containing one or more polygons with geometry information for which values are extracted
+        config (config): parsed configuration settings of run (not used inside this function)
+        sim_year (int): year for which data is extracted
+
+    Raises:
+        KeyError: raised if neither a 'watprovID' nor a 'name' column is available
+        AssertionError: raised if the length of output list does not match length of input geo-dataframe
+
+    Returns:
+        list: list containing 0/1 per polygon depending on conflict occurrence, in the row order of extent_gdf
+    """
+
+    # select the entries which occurred in this year
+    conflicts_in_year = conflict_gdf.loc[conflict_gdf.year == sim_year]
+
+    # merge the dataframes with polygons and conflict information, creating a sub-set of polygons/regions
+    data_merged = gpd.sjoin(conflicts_in_year, extent_gdf)
+
+    # pick the identifier column explicitly instead of relying on a bare except,
+    # which would also hide unrelated errors
+    merged_id_column = 'watprovID' if 'watprovID' in data_merged.columns else 'name'
+
+    # determine the aggregated amount of fatalities in one region (e.g. water province)
+    fatalities_per_watProv = (
+        data_merged['best']
+        .groupby(data_merged[merged_id_column])
+        .sum()
+        .to_frame()
+        .rename(columns={"best": 'total_fatalities'})
+    )
+
+    # a polygon whose identifier is part of the aggregated sub-set had conflict -> 1, otherwise 0
+    extent_id_column = 'watprovID' if 'watprovID' in extent_gdf.columns else 'name'
+    had_conflict = extent_gdf[extent_id_column].isin(fatalities_per_watProv.index)
+    list_out = had_conflict.astype(int).tolist()
+
+    if not len(extent_gdf) == len(list_out):
+        raise AssertionError('the dataframe with polygons has a lenght {0} while the lenght of the resulting list is {1}'.format(len(extent_gdf), len(list_out)))
+
+    return list_out
+
+def get_poly_ID(extent_gdf):
+    """Extracts the identifier of each polygon from geodataframe and saves in list.
+
+    The identifier is taken from the 'name' column if it exists, otherwise from 'watprovID'.
+
+    Args:
+        extent_gdf (geodataframe): geo-dataframe containing one or more polygons
+
+    Raises:
+        KeyError: raised if neither a 'name' nor a 'watprovID' column is available
+        AssertionError: raised if the length of output list does not match length of input geo-dataframe
+
+    Returns:
+        list: identifier of each polygon, in the row order of extent_gdf
+    """
+
+    # choose the identifier column once, rather than via a bare except per row
+    id_column = 'name' if 'name' in extent_gdf.columns else 'watprovID'
+
+    list_ID = extent_gdf[id_column].tolist()
+
+    # in the end, the same number of polygons should be in geodataframe and list
+    if not len(extent_gdf) == len(list_ID):
+        raise AssertionError('the dataframe with polygons has a lenght {0} while the lenght of the resulting list is {1}'.format(len(extent_gdf), len(list_ID)))
+
+    return list_ID
+
+def get_poly_geometry(extent_gdf):
+    """Extracts geometry information for each polygon from geodataframe and saves in list.
+
+    Args:
+        extent_gdf (geodataframe): geo-dataframe containing one or more polygons; its 'geometry' column is read
+
+    Raises:
+        KeyError: raised if extent_gdf has no 'geometry' column
+        AssertionError: raised if the length of output list does not match length of input geo-dataframe
+
+    Returns:
+        list: geometry of each polygon, in the row order of extent_gdf
+    """
+
+    print('listing the geometry of all geographical units')
+
+    # read the whole column at once instead of building one row object per polygon
+    list_geometry = extent_gdf['geometry'].tolist()
+
+    # in the end, the same number of polygons should be in geodataframe and list
+    if not len(extent_gdf) == len(list_geometry):
+        raise AssertionError('the dataframe with polygons has a lenght {0} while the lenght of the resulting list is {1}'.format(len(extent_gdf), len(list_geometry)))
+
+    return list_geometry
+
+def split_conflict_geom_data(X):
+    """Splits a 2D array column-wise into its first column, second column, and all remaining columns.
+
+    Args:
+        X (array): 2D array; judging by the names used here, column 0 holds the polygon ID and column 1 the polygon geometry (verify against caller)
+
+    Returns:
+        tuple: (first column, second column, array with all columns from the third onwards)
+    """
+
+    first_column, second_column = X[:, 0], X[:, 1]
+    remaining_columns = X[:, 2:]
+
+    return first_column, second_column, remaining_columns
+
+def get_pred_conflict_geometry(X_test_ID, X_test_geom, y_test, y_pred):
+    """Combines IDs, geometries, observed and predicted values in one dataframe and marks matching predictions.
+
+    Args:
+        X_test_ID (array): first column of the resulting dataframe ('ID')
+        X_test_geom (array): second column of the resulting dataframe ('geometry')
+        y_test (array): third column of the resulting dataframe ('y_test')
+        y_pred (array): fourth column of the resulting dataframe ('y_pred')
+
+    Returns:
+        dataframe: columns 'ID', 'geometry', 'y_test', 'y_pred' plus 'overall_hit', which is 1 where y_test equals y_pred and 0 elsewhere
+    """
+
+    column_names = ['ID', 'geometry', 'y_test', 'y_pred']
+    stacked = np.column_stack((X_test_ID, X_test_geom, y_test, y_pred))
+    df = pd.DataFrame(stacked, columns=column_names)
+
+    #TODO: think this through properly
+    # df['conflict_hit'] = np.where((df['y_test'] == 1) & (df['y_pred'] ==1), 1, np.nan)
+
+    prediction_matches = df['y_test'] == df['y_pred']
+    df['overall_hit'] = np.where(prediction_matches, 1, 0)
+
+    return df
diff --git a/conflict_model/data.py b/conflict_model/data.py
@@ -0,0 +1,124 @@
+from conflict_model import conflict, variables
+import numpy as np
+import xarray as xr
+import pandas as pd
+import os, sys
+
+
+def initiate_XY_data(config):
+    """Creates the dictionary in which all model data is collected, with one empty series per entry.
+
+    The keys are, in this order: 'poly_ID', 'poly_geometry', one key per option in the
+    'env_vars' section of the config, and finally 'conflict'.
+
+    Args:
+        config (config): parsed configuration settings of run; the 'env_vars' section and the option 'verbose' of section 'general' are read
+
+    Returns:
+        dict: dictionary with an empty pandas Series for each key
+    """
+
+    XY = {}
+    # explicit object dtype: a dtype-less empty Series raises a DeprecationWarning in
+    # older pandas versions and its default dtype differs between pandas versions
+    XY['poly_ID'] = pd.Series(dtype=object)
+    XY['poly_geometry'] = pd.Series(dtype=object)
+    for var_name, _ in config.items('env_vars'):
+        XY[str(var_name)] = pd.Series(dtype=float)
+    # 'conflict' is added last; split_XY_data treats the last column as target variable
+    XY['conflict'] = pd.Series(dtype=int)
+
+    if config.getboolean('general', 'verbose'):
+        print('{}'.format(XY) + os.linesep)
+
+    return XY
+
+def fill_XY(XY, config, conflict_gdf, extent_active_polys_gdf):
+    """Fills the series in the XY-dictionary with data for each simulation year and polygon.
+
+    For every year and every key in XY, one value per polygon is appended to the series of that key:
+    conflict occurrence for 'conflict', the polygon identifier for 'poly_ID', the polygon geometry
+    for 'poly_geometry', and for all other keys the values returned by the functions of the
+    variables module for the nc-file specified in section 'env_vars' of the config.
+
+    Args:
+        XY (dict): dictionary with one pandas Series per key; it is updated in place
+        config (config): parsed configuration settings of run
+        conflict_gdf (geodataframe): geo-dataframe containing the conflict information
+        extent_active_polys_gdf (geodataframe): geo-dataframe containing the polygons for which data is extracted
+
+    Raises:
+        Warning: raised if the time variable of a nc-file is neither of dtype float32/float64 nor datetime64[ns]
+
+    Returns:
+        array: content of XY converted to a numpy array with one column per key
+    """
+
+    verbose = config.getboolean('general', 'verbose')
+    y_start = config.getint('settings', 'y_start')
+    y_end = config.getint('settings', 'y_end')
+
+    if verbose:
+        print('reading data for period from', str(y_start), 'to', str(y_end) + os.linesep)
+
+    # go through all simulation years as specified in config-file
+    # note: np.arange excludes the end value, so y_end itself is not processed
+    for sim_year in np.arange(y_start, y_end, 1):
+
+        if verbose:
+            print(os.linesep + 'entering year {}'.format(sim_year) + os.linesep)
+
+        # go through all keys in dictionary; iterate over a copy of the keys since XY is updated in the loop
+        for key in list(XY):
+
+            if key == 'conflict':
+                data_list = conflict.conflict_in_year_bool(conflict_gdf, extent_active_polys_gdf, config, sim_year)
+
+            elif key == 'poly_ID':
+                data_list = conflict.get_poly_ID(extent_active_polys_gdf)
+
+            elif key == 'poly_geometry':
+                data_list = conflict.get_poly_geometry(extent_active_polys_gdf)
+
+            else:
+                nc_path = os.path.join(config.get('general', 'input_dir'), config.get('env_vars', key))
+
+                # only the dtype of the time variable is needed here, so close the file right away
+                with xr.open_dataset(nc_path) as nc_ds:
+                    time_dtype = nc_ds.time.dtype
+
+                if (time_dtype == np.float32) or (time_dtype == np.float64):
+                    data_list = variables.nc_with_float_timestamp(extent_active_polys_gdf, config, key, sim_year)
+
+                elif time_dtype == 'datetime64[ns]':
+                    data_list = variables.nc_with_continous_datetime_timestamp(extent_active_polys_gdf, config, key, sim_year)
+
+                else:
+                    raise Warning('this nc-file does have a different dtype for the time variable than currently supported: {}'.format(nc_path))
+
+            # pd.concat replaces Series.append, which is no longer available in pandas 2.0
+            XY[key] = pd.concat([XY[key], pd.Series(data_list)], ignore_index=True)
+
+    if verbose:
+        print('...reading data DONE' + os.linesep)
+
+    return pd.DataFrame.from_dict(XY).to_numpy()
+
+def split_XY_data(XY, config):
+    """Drops all rows with missing values and separates the array into variable values and target values.
+
+    Args:
+        XY (array): 2D array whose last column contains the conflict data and whose other columns contain all remaining data
+        config (config): parsed configuration settings of run; only the option 'verbose' of section 'general' is read
+
+    Returns:
+        tuple: array X with all but the last column, and array Y with the last column converted to int
+    """
+
+    verbose = config.getboolean('general', 'verbose')
+
+    XY = pd.DataFrame(XY)
+    if verbose:
+        print('number of data points including missing values:', len(XY))
+
+    XY = XY.dropna()
+    if verbose:
+        print('number of data points excluding missing values:', len(XY))
+
+    XY = XY.to_numpy()
+    X = XY[:, :-1] # since conflict is the last column, we know that all previous columns must be variable values
+    Y = XY[:, -1]
+    Y = Y.astype(int)
+
+    if verbose:
+        n_conflict = int(np.count_nonzero(Y))
+        # guard against division by zero if no data point is left after dropping missing values
+        fraction_Y_1 = 100 * n_conflict / len(Y) if len(Y) > 0 else 0.0
+        print('from this, {0} points are equal to 1, i.e. represent conflict occurence. This is a fraction of {1} percent.'.format(n_conflict, round(fraction_Y_1, 2)))
+
+    return X, Y