Add tracking to data file type column names 2. #553

Open · wants to merge 8 commits into base: issue-511

Changes from 3 commits
90 changes: 52 additions & 38 deletions isatools/isatab/dump/write.py
@@ -16,6 +16,7 @@
 )
 from isatools.isatab.defaults import log
 from isatools.isatab.graph import _all_end_to_end_paths, _longest_path_and_attrs
+from isatools.model.utils import _build_paths_and_indexes
 from isatools.isatab.utils import (
     get_comment_column,
     get_pv_columns,
@@ -256,24 +257,21 @@ def flatten(current_list):

         columns = []

-        # start_nodes, end_nodes = _get_start_end_nodes(a_graph)
-        paths = _all_end_to_end_paths(
-            a_graph, [x for x in a_graph.nodes()
-                      if isinstance(a_graph.indexes[x], Sample)])
+        paths, indexes = _build_paths_and_indexes(assay_obj.process_sequence)
         if len(paths) == 0:
             log.info("No paths found, skipping writing assay file")
             continue
-        if _longest_path_and_attrs(paths, a_graph.indexes) is None:
+        if _longest_path_and_attrs(paths, indexes) is None:
             raise IOError(
                 "Could not find any valid end-to-end paths in assay graph")

         protocol_in_path_count = 0
-        for node_index in _longest_path_and_attrs(paths, a_graph.indexes):
-            node = a_graph.indexes[node_index]
+        output_label_in_path_counts = {}
+        name_label_in_path_counts = {}
+        for node_index in _longest_path_and_attrs(paths, indexes):
+            node = indexes[node_index]
             if isinstance(node, Sample):
                 olabel = "Sample Name"
-                # olabel = "Sample Name.{}".format(sample_in_path_count)
-                # sample_in_path_count += 1
                 columns.append(olabel)
                 columns += flatten(
                     map(lambda x: get_comment_column(olabel, x),
@@ -302,7 +300,12 @@ def flatten(current_list):
                     protocol_types_dict
                 )
                 if oname_label is not None:
-                    columns.append(oname_label)
+                    if oname_label not in name_label_in_path_counts:
+                        name_label_in_path_counts[oname_label] = 0
+                    new_oname_label = oname_label + "." + str(name_label_in_path_counts[oname_label])
+
+                    columns.append(new_oname_label)
+                    name_label_in_path_counts[oname_label] += 1
                 elif node.executes_protocol.protocol_type.term.lower() \
                         in protocol_types_dict["nucleic acid hybridization"][SYNONYMS]:
                     columns.extend(
@@ -312,12 +315,6 @@
                         map(lambda x: get_comment_column(olabel, x),
                             node.comments))

-                for output in [x for x in node.outputs if isinstance(x, DataFile)]:
-                    if output.label not in columns:
-                        columns.append(output.label)
-                        columns += flatten(
-                            map(lambda x: get_comment_column(output.label, x),
-                                output.comments))
             elif isinstance(node, Material):
                 olabel = node.type
                 columns.append(olabel)
@@ -329,7 +326,18 @@
                         node.comments))

             elif isinstance(node, DataFile):
-                pass # handled in process
+                # pass # handled in process
+
+                output_label = node.label
+                if output_label not in output_label_in_path_counts:
+                    output_label_in_path_counts[output_label] = 0
+                new_output_label = output_label + "." + str(output_label_in_path_counts[output_label])
+
+                columns.append(new_output_label)
+                output_label_in_path_counts[output_label] += 1
+                columns += flatten(
+                    map(lambda x: get_comment_column(new_output_label, x),
+                        node.comments))

         omap = get_object_column_map(columns, columns)

@@ -344,8 +352,10 @@ def pbar(x):
                     df_dict[k].extend([""])

             protocol_in_path_count = 0
+            output_label_in_path_counts = {}
+            name_label_in_path_counts = {}
             for node_index in path_:
-                node = a_graph.indexes[node_index]
+                node = indexes[node_index]
                 if isinstance(node, Process):
                     olabel = "Protocol REF.{}".format(protocol_in_path_count)
                     protocol_in_path_count += 1
@@ -356,8 +366,12 @@ def pbar(x):
                         protocol_types_dict
                     )
                     if oname_label is not None:
-                        df_dict[oname_label][-1] = node.name
-
+                        if oname_label not in name_label_in_path_counts:
+                            name_label_in_path_counts[oname_label] = 0
+                        new_oname_label = oname_label + "." + str(name_label_in_path_counts[oname_label])
+
+                        df_dict[new_oname_label][-1] = node.name
+                        name_label_in_path_counts[oname_label] += 1
                     elif node.executes_protocol.protocol_type.term.lower() in \

[Review comment, Member]: see comment above, same logic

                             protocol_types_dict["nucleic acid hybridization"][SYNONYMS]:
                         df_dict["Hybridization Assay Name"][-1] = \
@@ -375,23 +389,8 @@ def pbar(x):
                             colabel = "{0}.Comment[{1}]".format(olabel, co.name)
                             df_dict[colabel][-1] = co.value

-                    for output in [x for x in node.outputs if isinstance(x, DataFile)]:
-                        output_by_type = []
-                        delim = ";"
-                        olabel = output.label
-                        if output.label not in columns:
-                            columns.append(output.label)
-                        output_by_type.append(output.filename)
-                        df_dict[olabel][-1] = delim.join(map(str, output_by_type))
-
-                        for co in output.comments:
-                            colabel = "{0}.Comment[{1}]".format(olabel, co.name)
-                            df_dict[colabel][-1] = co.value
-
                 elif isinstance(node, Sample):
                     olabel = "Sample Name"
-                    # olabel = "Sample Name.{}".format(sample_in_path_count)
-                    # sample_in_path_count += 1
                     df_dict[olabel][-1] = node.name
                     for co in node.comments:
                         colabel = "{0}.Comment[{1}]".format(
@@ -418,7 +417,19 @@
                             df_dict[colabel][-1] = co.value

                 elif isinstance(node, DataFile):
-                    pass # handled in process
+                    # pass # handled in process
+
+                    output_label = node.label
+                    if output_label not in output_label_in_path_counts:
+                        output_label_in_path_counts[output_label] = 0
+                    new_output_label = output_label + "." + str(output_label_in_path_counts[output_label])
+                    df_dict[new_output_label][-1] = node.filename
+                    output_label_in_path_counts[output_label] += 1
+
+                    for co in node.comments:
+                        colabel = "{0}.Comment[{1}]".format(
+                            new_output_label, co.name)
+                        df_dict[colabel][-1] = co.value

         DF = DataFrame(columns=columns)
         DF = DF.from_dict(data=df_dict)
@@ -466,6 +477,11 @@ def pbar(x):
                 columns[i] = "Protocol REF"
             elif "." in col:
                 columns[i] = col[:col.rindex(".")]
+            else:
+                for output_label in output_label_in_path_counts:
+                    if output_label in col:
+                        columns[i] = output_label
+                        break

         log.debug("Rendered {} paths".format(len(DF.index)))
         if len(DF.index) > 1:
@@ -505,8 +521,6 @@ def write_value_columns(df_dict, label, x):
        elif x.unit.term_source.name:
            df_dict[label + ".Unit.Term Source REF"][-1] = x.unit.term_source.name

-           # df_dict[label + ".Unit.Term Source REF"][-1] = \
-           #     x.unit.term_source.name if x.unit.term_source else ""
        df_dict[label + ".Unit.Term Accession Number"][-1] = \
            x.unit.term_accession
    else:
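A note for reviewers on the overall shape of this file's change: repeated output labels (data files and assay names) now get a per-path occurrence counter appended while the DataFrame is assembled ("Raw Data File.0", "Raw Data File.1", ...), and the counter is stripped again when the final header is written. A minimal, self-contained sketch of that round trip; the helper names here are illustrative, not part of the isatools API:

```python
def dedupe_labels(labels):
    """Append '.N' per occurrence so repeated column labels stay unique."""
    counts = {}
    deduped = []
    for label in labels:
        n = counts.get(label, 0)
        deduped.append("{0}.{1}".format(label, n))
        counts[label] = n + 1
    return deduped

def restore_labels(deduped):
    """Strip the '.N' tracking suffix before writing the ISA-Tab header."""
    return [label[:label.rindex(".")] if "." in label else label
            for label in deduped]

columns = dedupe_labels(["Raw Data File", "Raw Data File", "Derived Data File"])
# ['Raw Data File.0', 'Raw Data File.1', 'Derived Data File.0']
assert restore_labels(columns) == [
    "Raw Data File", "Raw Data File", "Derived Data File"]
```

The strip step mirrors the `col[:col.rindex(".")]` rename in the hunk above.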
13 changes: 8 additions & 5 deletions isatools/isatab/load/ProcessSequenceFactory.py
@@ -1,3 +1,5 @@
+import re
+
 from isatools.isatab.utils import process_keygen, find_lt, find_gt, pairwise, get_object_column_map, get_value
 from isatools.isatab.defaults import (
     log,
@@ -146,11 +148,12 @@ def create_from_df(self, DF):
            except KeyError:
                pass

-        for data_col in [x for x in DF.columns if x.endswith(" File")]:
+        for data_col in [x for x in DF.columns if " File" in x]:
+            label = re.match(r'(.* File)', data_col).group(0)
             filenames = [x for x in DF[data_col].drop_duplicates() if x != '']
-            data.update(dict(map(lambda x: (':'.join([data_col, x]), DataFile(filename=x, label=data_col)), filenames)))
+            data.update(dict(map(lambda x: (':'.join([data_col, x]), DataFile(filename=x, label=label)), filenames)))

-        node_cols = [i for i, c in enumerate(DF.columns) if c in _LABELS_MATERIAL_NODES + _LABELS_DATA_NODES]
+        node_cols = [i for i, c in enumerate(DF.columns) if c in _LABELS_MATERIAL_NODES + _LABELS_DATA_NODES or ' File' in c]
         proc_cols = [i for i, c in enumerate(DF.columns) if c.startswith("Protocol REF")]

         try:
@@ -167,7 +170,7 @@ def get_node_by_label_and_key(labl, this_key):
                n = samples[lk]
            elif labl in ('Extract Name', 'Labeled Extract Name'):
                n = other_material[lk]
-           elif labl.endswith(' File'):
+           elif ' File' in labl:
                n = data[lk]
            return n

@@ -260,7 +263,7 @@ def get_node_by_label_and_key(labl, this_key):
                        fv_set.add(fv)
                    material.factor_values = list(fv_set)

-            elif object_label in _LABELS_DATA_NODES:
+            elif object_label in _LABELS_DATA_NODES or ' File' in object_label:

[Review comment, Member]: object_label="foo File" would cause an issue.

[Reply, Author]:

1. I don't think "foo File" would validate.
2. There are other instances of patterns like `' File' in` or `endswith('File')` that were already present, where "foo File" would cause the same issue; I just made things consistent.
3. When this change was originally made, not every File column was in _LABELS_DATA_NODES.
4. Having a list of specific acceptable file names is pretty fragile anyway; I would have generalized to columns ending in " File" a while ago, or to a pattern like what "Protocol REF" does.

[Reply, Author]: After discussing in the meeting, I will remove this and make the code always just look in _LABELS_DATA_NODES.

                for _, object_series in DF[column_group].drop_duplicates().iterrows():
                    try:
                        data_file = get_node_by_label_and_key(object_label, str(object_series[object_label]))
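To make the trade-off debated above concrete: a standalone comparison of the three matching strategies in play (substring, suffix, fixed whitelist). The column names are invented, and `KNOWN_FILE_COLUMNS` is a stand-in for `_LABELS_DATA_NODES`; note that pandas renames duplicated headers to "Raw Data File.1" style names, which is presumably why a plain `endswith(" File")` no longer matches every file column:

```python
KNOWN_FILE_COLUMNS = {"Raw Data File", "Derived Data File", "Array Data File"}

candidates = ["Raw Data File", "Raw Data File.1", "foo File", "File Comment"]

substring = [c for c in candidates if " File" in c]
suffix = [c for c in candidates if c.endswith(" File")]
whitelist = [c for c in candidates if c.split(".")[0] in KNOWN_FILE_COLUMNS]

print(substring)  # ['Raw Data File', 'Raw Data File.1', 'foo File']
print(suffix)     # ['Raw Data File', 'foo File']
print(whitelist)  # ['Raw Data File', 'Raw Data File.1']
```

The substring match is the only one of the three that keeps the pandas-mangled duplicate without needing a whitelist, at the cost of also accepting "foo File", which is exactly the objection raised in this thread.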
2 changes: 1 addition & 1 deletion isatools/isatab/utils.py
@@ -496,7 +496,7 @@ def get_object_column_map(isatab_header, df_columns):
    """
    labels = _LABELS_MATERIAL_NODES + _LABELS_DATA_NODES
    if set(isatab_header) == set(df_columns):
-        object_index = [i for i, x in enumerate(df_columns) if x in labels or 'Protocol REF' in x]
+        object_index = [i for i, x in enumerate(df_columns) if x in labels or 'Protocol REF' in x or ' File' in x]
    else:
        object_index = [i for i, x in enumerate(isatab_header) if x in labels + ['Protocol REF']]

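For context on the predicate being extended here: `get_object_column_map` marks the columns where a new object's group starts and slices the header at those boundaries. A simplified sketch of that idea under the new predicate; the pairing step is illustrative, not a copy of the real implementation:

```python
def object_starts(columns, labels):
    """Indexes where a node or Protocol REF column opens a new group."""
    return [i for i, c in enumerate(columns)
            if c in labels or "Protocol REF" in c or " File" in c]

header = ["Sample Name", "Protocol REF", "Extract Name",
          "Raw Data File.1", "Comment[checksum]"]
starts = object_starts(header, {"Sample Name", "Extract Name"})

# Slice between consecutive boundaries: attribute columns such as
# Comment[...] stay attached to the object column that precedes them.
groups = [header[a:b] for a, b in zip(starts, starts[1:] + [len(header)])]
print(groups)
# [['Sample Name'], ['Protocol REF'], ['Extract Name'],
#  ['Raw Data File.1', 'Comment[checksum]']]
```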
150 changes: 150 additions & 0 deletions isatools/model/utils.py
@@ -1,3 +1,5 @@
+import itertools
+
 import networkx as nx
 from hashlib import md5, sha1, sha256, blake2b
 import os
@@ -18,6 +20,154 @@ def find(predictor, iterable):
return None, it


def _build_paths_and_indexes(process_sequence=None):
"""Returns the paths from source/sample to end points and a mapping of sequence_identifier to object."""

def _compute_combinations(identifier_list, identifiers_to_objects):
[Review comment, Member]: @ptth222 please refactor to avoid nested function and revisit the nested for loops before @terazus can review.

io_types = {}
for identifier in identifier_list:
io_object = identifiers_to_objects[identifier]
if isinstance(io_object, DataFile):
label = io_object.label
if label not in io_types:
io_types[label] = [identifier]
else:
io_types[label].append(identifier)
else:
if "Material" not in io_types:
io_types["Material"] = [identifier]
else:
io_types["Material"].append(identifier)
combinations = [item for item in list(itertools.product(*[values for values in io_types.values()])) if item]
return combinations

## Determining paths depends on processes having next_process and prev_process
## links, so add them, based on shared inputs and outputs, if they aren't there.
inputs_to_process = {id(p_input):{"process":process, "input":p_input} for process in process_sequence for p_input in process.inputs}
outputs_to_process = {id(output):{"process":process, "output":output} for process in process_sequence for output in process.outputs}
for output, output_dict in outputs_to_process.items():
if output in inputs_to_process:
if not inputs_to_process[output]["process"].prev_process:
inputs_to_process[output]["process"].prev_process = output_dict["process"]
if not output_dict["process"].next_process:
output_dict["process"].next_process = inputs_to_process[output]["process"]

paths = []
identifiers_to_objects = {}
all_inputs = set()
all_outputs = set()
for process in process_sequence:

identifiers_to_objects[process.sequence_identifier] = process
for output in process.outputs:
identifiers_to_objects[output.sequence_identifier] = output
all_outputs.add(output.sequence_identifier)
for input_ in process.inputs:
identifiers_to_objects[input_.sequence_identifier] = input_
all_inputs.add(input_.sequence_identifier)


original_process = process

right_processes = []
while next_process := process.next_process:
right_processes.append(next_process.sequence_identifier)
process = next_process

left_processes = []
process = original_process
while prev_process := process.prev_process:
left_processes.append(prev_process.sequence_identifier)
process = prev_process
left_processes = list(reversed(left_processes))

paths.append(left_processes + [original_process.sequence_identifier] + right_processes)


unique_paths = [list(x) for x in set(tuple(x) for x in paths)]
paths = unique_paths
dead_end_outputs = all_outputs - all_inputs

## Add paths based on inputs and outputs.
str_path_to_path = {}
was_path_modified = {}
paths_seen = []
paths_seen_twice = []
while True:
new_paths = []
paths_seen_changed = False
for path in paths:
str_path = str(path)
str_path_to_path[str_path] = path
if path not in paths_seen:
paths_seen.append(path)
paths_seen_changed = True
else:
paths_seen_twice.append(path)
continue
path_len = len(path)
path_modified = False
for i, identifier in enumerate(path):
node = identifiers_to_objects[identifier]

if i == 0 and isinstance(node, Process):
identifier_list = [input_.sequence_identifier for input_ in node.inputs]
combinations = _compute_combinations(identifier_list, identifiers_to_objects)
for combo in combinations:
new_path = list(combo) + path
path_modified = True
if new_path not in new_paths:
new_paths.append(new_path)
continue

if i == path_len - 1 and isinstance(node, Process):
identifier_list = [output.sequence_identifier for output in node.outputs]
combinations = _compute_combinations(identifier_list, identifiers_to_objects)
for combo in combinations:
new_path = path + list(combo)
path_modified = True
if new_path not in new_paths:
new_paths.append(new_path)
continue

if i + 1 < path_len and isinstance(identifiers_to_objects[path[i+1]], Process) and i > 0 and isinstance(node, Process):
output_sequence_identifiers = {output.sequence_identifier for output in node.outputs}
input_sequence_identifiers = {input_.sequence_identifier for input_ in identifiers_to_objects[path[i+1]].inputs}
identifier_intersection = output_sequence_identifiers.intersection(input_sequence_identifiers)

combinations = _compute_combinations(identifier_intersection, identifiers_to_objects)
for combo in combinations:
new_path = path[0:i+1] + list(combo) + path[i+1:]
path_modified = True
if new_path not in new_paths:
new_paths.append(new_path)

## Add outputs that aren't later used as inputs.
for output in output_sequence_identifiers.intersection(dead_end_outputs):
new_path = path[:i+1] + [output]
path_modified = True
if new_path not in new_paths:
new_paths.append(new_path)
continue
## This is supposed to catch different length paths.
if not path_modified and path not in new_paths:
new_paths.append(path)

if str_path in was_path_modified:
if path_modified:
was_path_modified[str_path] = path_modified
else:
was_path_modified[str_path] = path_modified
if not paths_seen_changed:
break
paths = new_paths


paths = [str_path_to_path[path] for path, was_modified in was_path_modified.items() if not was_modified]

return paths, identifiers_to_objects


def _build_assay_graph(process_sequence=None):
""":obj:`networkx.DiGraph` Returns a directed graph object based on a
given ISA process sequence."""
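As background to the refactor request on `_compute_combinations` above: the helper buckets a process's input or output identifiers by their DataFile label, pools every non-file node under "Material", and then takes the Cartesian product across buckets, so each expanded path carries exactly one node per label. A toy sketch of that grouping-and-product step, with plain strings standing in for sequence identifiers and labels:

```python
import itertools

# Hypothetical identifier-to-label mapping; the real code looks the label
# up on DataFile objects and uses "Material" for everything else.
identifiers_to_labels = {
    "raw1": "Raw Data File",
    "raw2": "Raw Data File",
    "derived1": "Derived Data File",
    "extract1": "Material",
}

io_types = {}
for identifier, label in identifiers_to_labels.items():
    io_types.setdefault(label, []).append(identifier)

# One identifier per label bucket in every combination.
combinations = [c for c in itertools.product(*io_types.values()) if c]
print(combinations)
# [('raw1', 'derived1', 'extract1'), ('raw2', 'derived1', 'extract1')]
```

Replacing the `if label not in io_types` / `else` blocks in the diff with `setdefault`, as here, is one way to flatten the branching the reviewer flagged.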