Merge pull request #4 from caglorithm/feature/get_run
Feature/get run
caglorithm committed May 27, 2020
2 parents 75e5e0d + b499c3f commit d20bc25
Showing 3 changed files with 118 additions and 45 deletions.
30 changes: 24 additions & 6 deletions README.md
@@ -75,17 +75,35 @@ Running the exploration is parallelized and handled by `ray`. You can also use
```python
ex.run()
>> 100%|██████████| 441/441 [426.57it/s]
ex.load_results()
```
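For context, a minimal end-to-end setup might look like the sketch below. The constructor call and the `distance` function are assumptions modeled on the `Exploration` signature in `mopet/mopet.py` further down, not a verbatim copy of the example above:

```python
import numpy as np
from mopet import mopet

def distance(params):
    # Toy exploration function: return one scalar result per parameter set.
    return {"distance": np.sqrt(params["x"] ** 2 + params["y"] ** 2)}

# 21 x 21 = 441 parameter combinations, matching the 441 runs shown above.
explore_params = {"x": np.linspace(-2, 2, 21), "y": np.linspace(-2, 2, 21)}

ex = mopet.Exploration(distance, explore_params)
```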

An overview of the runs and their parameters is given as a `pandas` DataFrame, available as `ex.df`. Here we load the result, which is simply a `float`, directly into the DataFrame. However, if the result was a timeseries (a `numpy.ndarray`), we could process it at this stage and extract some scalar value, for example the amplitude of the data or the dominant frequency. Using some fancy pivoting, we can create a 2D matrix with the results as entries.
After your exploration has finished, you will find a file `exploration.h5` in your current directory with all the runs, their parameters and their outputs, neatly organized. If you open this file (with [HDFView](https://www.hdfgroup.org/downloads/hdfview/) for example), you'll see something like this:

<p align="center">
<img alt="Build" src="resources/hdf_file.jpg">
</p>
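If you'd rather inspect the file programmatically, a short sketch using `pytables` (the library mopet itself uses for HDF access, as seen in `mopet/mopet.py` below) might be:

```python
import tables

# Print the node tree of the exploration file: one group per exploration,
# with a "runs" subgroup holding one group per run (run_0, run_1, ...).
with tables.open_file("exploration.h5", mode="r") as f:
    print(f)
```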



## Loading exploration results

You can load the exploration results using

```python
ex.load_results(all=True)
```

Note that using `all=True` will load all results into memory. Please make sure that you have enough free memory for this. If not, avoid `all=True` and load individual results using their `run_id` (an integer that counts up by one per run):

```python
ex.get_run(run_id=0)
```
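Because `get_run()` keeps the HDF file open between calls to speed up sequential reads (see its docstring in the diff below), close the file yourself when you are done. A small sketch:

```python
# Fetch a few runs one by one without pulling the whole exploration into RAM.
# `range(10)` is only illustrative; use whatever run ids your exploration has.
results = {run_id: ex.get_run(run_id=run_id) for run_id in range(10)}

# get_run() leaves the HDF file open for reading; close it manually.
ex.close_hdf()
```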

After using `ex.load_results()`, an overview of all runs and their parameters is given as a `pandas` DataFrame, available as `ex.df`. Using `ex.load_results()` with the default parameters will automatically aggregate all scalar results into this table, like `distance` in our example above, which is a float.
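For example, continuing the sketch from above (the column names are assumptions based on the hypothetical `distance` exploration):

```python
ex.load_results()
print(ex.df.head())  # one row per run: the parameters `x` and `y` plus the scalar `distance`
```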

Using some fancy pivoting, we can create a 2D matrix with the results as entries:

```python
ex.df["result"] = None
for r in ex.df.index:
ex.df.loc[r, "result"] = ex.results[r]['result']

pivoted = ex.df.pivot_table(values='result', index = 'y', columns='x', aggfunc='first')
```
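To turn the pivoted matrix into a figure, a short `matplotlib` sketch (not part of mopet) could look like this:

```python
import matplotlib.pyplot as plt

# Draw the result matrix as a heatmap with the parameter values on the axes.
plt.imshow(
    pivoted.values.astype(float),
    origin="lower",
    extent=[pivoted.columns.min(), pivoted.columns.max(),
            pivoted.index.min(), pivoted.index.max()],
)
plt.colorbar(label="result")
plt.xlabel("x")
plt.ylabel("y")
plt.show()
```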
<p align="center">
131 changes: 93 additions & 38 deletions mopet/mopet.py
@@ -15,6 +15,10 @@ class Exploration:

    RUN_PREFIX = "run_"

    ##############################################
    ## USER FUNCTIONS
    ##############################################

    def __init__(
        self,
        function,
@@ -70,6 +74,8 @@ def __init__(

    def run(self):
        """Start parameter exploration.

        TODO: Pass kwargs in run() to the exploration function
        """
        # Initialize ray
        self._init_ray()
@@ -142,6 +148,91 @@ def run(self):

        self._shutdown_ray()

    def load_results(
        self, filename=None, exploration_name=None, aggregate=True, all=False
    ):
        """Load results from previous explorations. This function
        will open an HDF file and look for an exploration. It will
        create a pandas `DataFrame` object (accessible through the
        attribute `.df`) with a list of all runs and their parameters.
        You can load the exploration results using the following parameters:

        - If `aggregate==True`, all scalar results (such as `float`
          or `int`) from the exploration will be added to the DataFrame.
        - If `all==True`, then all results, including arrays and other
          types, will be loaded into the attribute `.results`. This can take
          up a lot of RAM since all results will be kept in memory. Only
          use this option if you know that you have enough free memory.
          Otherwise, you might want to skip this and load individual
          results using the method `.get_run()`.

        :param filename: Filename of the HDF file, uses the default or previously used filename if not given, defaults to None
        :type filename: str, optional
        :param exploration_name: Name of the exploration, same as the group name of the exploration in the HDF file, defaults to None
        :type exploration_name: str, optional
        :param aggregate: Aggregate scalar results into the results DataFrame, defaults to True
        :type aggregate: bool, optional
        :param all: Load all results into a dictionary available as the attribute `.results`. Can use a lot of RAM, defaults to False
        :type all: bool, optional
        """
        if exploration_name is None:
            exploration_name = self.exploration_name
        else:
            self.exploration_name = exploration_name

        self._open_hdf(filename=filename)
        self._load_all_results(exploration_name, all=all)
        self._create_df()
        if aggregate:
            self._aggregate_results(exploration_name)
        self.close_hdf()

    def get_run(self, run_id=None, run_name=None, filename=None, exploration_name=None):
        """Get a single result from a previous exploration. This function
        will load a single result from the HDF file. Use this function
        if you want to avoid loading all results into memory, which is
        what `.load_results(all=True)` does.

        Note: This function will open the HDF file for reading but will not
        close it afterwards! This is to speed up many sequential loads, but
        it also means that you have to close the HDF file yourself. You can
        do this by using `.close_hdf()`.

        :param run_id: Unique id of the run. Has to be given if run_name is not given, defaults to None
        :type run_id: int, optional
        :param run_name: The name of the run. Has to be given if run_id is not given, defaults to None
        :type run_name: str, optional
        :param filename: Filename of the HDF file with previous exploration results. Previously used filename will be used if not given, defaults to None
        :type filename: str, optional
        :param exploration_name: Name of the exploration to load data from. Previously used exploration_name will be used if not given, defaults to None
        :type exploration_name: str, optional
        :return: Results of the run
        :rtype: dict
        """
        # Get the result by id, or if not given, by run_name (hdf_run).
        assert (
            run_id is not None or run_name is not None
        ), "Either use `run_id` or `run_name`."

        if exploration_name is None:
            exploration_name = self.exploration_name
        else:
            self.exploration_name = exploration_name

        if run_id is not None:
            run_name = self.RUN_PREFIX + str(run_id)

        if not self._hdf_open_for_reading:
            self._open_hdf(filename)

        run_results_group = self.h5file.get_node("/" + self.exploration_name, "runs")[
            run_name
        ]

        result = self._read_group_as_dict(run_results_group)
        return result

    def _cartesian_product_dict(self, input_dict):
        """Returns the cartesian product of the exploration parameters.
@@ -296,21 +387,6 @@ def _store_result_in_dictionary(self, result_id, result_dict):
    ##############################################
    ## READ DATA
    ##############################################

    def load_results(
        self, filename=None, exploration_name=None, aggregate=True, all=False
    ):
        if exploration_name is None:
            exploration_name = self.exploration_name
        else:
            self.exploration_name = exploration_name

        self._open_hdf(filename=filename)
        self._load_all_results(exploration_name, all=all)
        self._create_df()
        if aggregate:
            self._aggregate_results(exploration_name)
        self.close_hdf()

    def _create_df(self):
        logging.info("Creating new results DataFrame")
        self.explore_params = self._read_explore_params()
@@ -333,6 +409,8 @@ def _open_hdf(self, filename=None):
logging.info(f"{self.hdf_filename} opened for reading.")

    def close_hdf(self):
        """Close a previously opened HDF file."""
        self.h5file.close()
        self._hdf_open_for_reading = False
        logging.info(f"{self.hdf_filename} closed.")
@@ -382,29 +460,6 @@ def _load_all_results(self, exploration_name=None, all=True):

        logging.debug(f"{len(self.results)} results loaded to memory.")

    def get_run(self, run_id=None, run_name=None, exploration_name=None):
        # get result by id, or if not given, by run_name (hdf_run)
        assert (
            run_id is not None or run_name is not None
        ), "Either use `run_id` or `run_name`."

        if exploration_name is None:
            exploration_name = self.exploration_name
        else:
            self.exploration_name = exploration_name

        if run_id is not None:
            run_name = self.RUN_PREFIX + str(run_id)

        if not self._hdf_open_for_reading:
            self._open_hdf()

        run_results_group = self.h5file.get_node("/" + self.exploration_name, "runs")[
            run_name
        ]

        result = self._read_group_as_dict(run_results_group)
        return result

    def _read_group_as_dict(self, group):
        return_dict = {}
        # iterate through all arrays in that group
2 changes: 1 addition & 1 deletion setup.py
@@ -11,7 +11,7 @@

setuptools.setup(
    name="mopet",
    version="0.1.1",
    version="0.1.2",
    description="The mildly ominous parameter exploration toolkit",
    long_description=long_description,
    long_description_content_type="text/markdown",
