Add tracking to data file type column names 2. #553

Open · wants to merge 8 commits into base: issue-511

Changes from 3 commits
90 changes: 52 additions & 38 deletions isatools/isatab/dump/write.py
@@ -16,6 +16,7 @@
 )
 from isatools.isatab.defaults import log
 from isatools.isatab.graph import _all_end_to_end_paths, _longest_path_and_attrs
+from isatools.model.utils import _build_paths_and_indexes
 from isatools.isatab.utils import (
     get_comment_column,
     get_pv_columns,
@@ -256,24 +257,21 @@ def flatten(current_list):

         columns = []

-        # start_nodes, end_nodes = _get_start_end_nodes(a_graph)
-        paths = _all_end_to_end_paths(
-            a_graph, [x for x in a_graph.nodes()
-                      if isinstance(a_graph.indexes[x], Sample)])
+        paths, indexes = _build_paths_and_indexes(assay_obj.process_sequence)
         if len(paths) == 0:
             log.info("No paths found, skipping writing assay file")
             continue
-        if _longest_path_and_attrs(paths, a_graph.indexes) is None:
+        if _longest_path_and_attrs(paths, indexes) is None:
             raise IOError(
                 "Could not find any valid end-to-end paths in assay graph")

         protocol_in_path_count = 0
-        for node_index in _longest_path_and_attrs(paths, a_graph.indexes):
-            node = a_graph.indexes[node_index]
+        output_label_in_path_counts = {}
+        name_label_in_path_counts = {}
+        for node_index in _longest_path_and_attrs(paths, indexes):
+            node = indexes[node_index]
             if isinstance(node, Sample):
                 olabel = "Sample Name"
-                # olabel = "Sample Name.{}".format(sample_in_path_count)
-                # sample_in_path_count += 1
                 columns.append(olabel)
                 columns += flatten(
                     map(lambda x: get_comment_column(olabel, x),
@@ -302,7 +300,12 @@ def flatten(current_list):
                     protocol_types_dict
                 )
                 if oname_label is not None:
-                    columns.append(oname_label)
+                    if oname_label not in name_label_in_path_counts:
+                        name_label_in_path_counts[oname_label] = 0
+                    new_oname_label = oname_label + "." + str(name_label_in_path_counts[oname_label])
+
+                    columns.append(new_oname_label)
+                    name_label_in_path_counts[oname_label] += 1
                 elif node.executes_protocol.protocol_type.term.lower() \
                         in protocol_types_dict["nucleic acid hybridization"][SYNONYMS]:
                     columns.extend(
@@ -312,12 +315,6 @@
                         map(lambda x: get_comment_column(olabel, x),
                             node.comments))

-                for output in [x for x in node.outputs if isinstance(x, DataFile)]:
-                    if output.label not in columns:
-                        columns.append(output.label)
-                        columns += flatten(
-                            map(lambda x: get_comment_column(output.label, x),
-                                output.comments))
             elif isinstance(node, Material):
                 olabel = node.type
                 columns.append(olabel)
@@ -329,7 +326,18 @@
                         node.comments))

             elif isinstance(node, DataFile):
-                pass # handled in process
+                # pass # handled in process
+
+                output_label = node.label
+                if output_label not in output_label_in_path_counts:
+                    output_label_in_path_counts[output_label] = 0
+                new_output_label = output_label + "." + str(output_label_in_path_counts[output_label])
+
+                columns.append(new_output_label)
+                output_label_in_path_counts[output_label] += 1
+                columns += flatten(
+                    map(lambda x: get_comment_column(new_output_label, x),
+                        node.comments))

         omap = get_object_column_map(columns, columns)

@@ -344,8 +352,10 @@ def pbar(x):
                     df_dict[k].extend([""])

             protocol_in_path_count = 0
+            output_label_in_path_counts = {}
+            name_label_in_path_counts = {}
             for node_index in path_:
-                node = a_graph.indexes[node_index]
+                node = indexes[node_index]
                 if isinstance(node, Process):
                     olabel = "Protocol REF.{}".format(protocol_in_path_count)
                     protocol_in_path_count += 1
@@ -356,8 +366,12 @@ def pbar(x):
                         protocol_types_dict
                     )
                     if oname_label is not None:
-                        df_dict[oname_label][-1] = node.name
-
+                        if oname_label not in name_label_in_path_counts:
+                            name_label_in_path_counts[oname_label] = 0
+                        new_oname_label = oname_label + "." + str(name_label_in_path_counts[oname_label])
+
+                        df_dict[new_oname_label][-1] = node.name
+                        name_label_in_path_counts[oname_label] += 1
                     elif node.executes_protocol.protocol_type.term.lower() in \

[Review comment, Member]: see comment above, same logic

                             protocol_types_dict["nucleic acid hybridization"][SYNONYMS]:
                         df_dict["Hybridization Assay Name"][-1] = \
@@ -375,23 +389,8 @@ def pbar(x):
                             colabel = "{0}.Comment[{1}]".format(olabel, co.name)
                             df_dict[colabel][-1] = co.value

-                    for output in [x for x in node.outputs if isinstance(x, DataFile)]:
-                        output_by_type = []
-                        delim = ";"
-                        olabel = output.label
-                        if output.label not in columns:
-                            columns.append(output.label)
-                        output_by_type.append(output.filename)
-                        df_dict[olabel][-1] = delim.join(map(str, output_by_type))
-
-                        for co in output.comments:
-                            colabel = "{0}.Comment[{1}]".format(olabel, co.name)
-                            df_dict[colabel][-1] = co.value
-
                 elif isinstance(node, Sample):
                     olabel = "Sample Name"
-                    # olabel = "Sample Name.{}".format(sample_in_path_count)
-                    # sample_in_path_count += 1
                     df_dict[olabel][-1] = node.name
                     for co in node.comments:
                         colabel = "{0}.Comment[{1}]".format(
@@ -418,7 +417,19 @@
                             df_dict[colabel][-1] = co.value

                 elif isinstance(node, DataFile):
-                    pass # handled in process
+                    # pass # handled in process
+
+                    output_label = node.label
+                    if output_label not in output_label_in_path_counts:
+                        output_label_in_path_counts[output_label] = 0
+                    new_output_label = output_label + "." + str(output_label_in_path_counts[output_label])
+                    df_dict[new_output_label][-1] = node.filename
+                    output_label_in_path_counts[output_label] += 1
+
+                    for co in node.comments:
+                        colabel = "{0}.Comment[{1}]".format(
+                            new_output_label, co.name)
+                        df_dict[colabel][-1] = co.value

         DF = DataFrame(columns=columns)
         DF = DF.from_dict(data=df_dict)
@@ -466,6 +477,11 @@ def pbar(x):
                 columns[i] = "Protocol REF"
             elif "." in col:
                 columns[i] = col[:col.rindex(".")]
+            else:
+                for output_label in output_label_in_path_counts:
+                    if output_label in col:
+                        columns[i] = output_label
+                        break

         log.debug("Rendered {} paths".format(len(DF.index)))
         if len(DF.index) > 1:
@@ -505,8 +521,6 @@ def write_value_columns(df_dict, label, x):
        elif x.unit.term_source.name:
            df_dict[label + ".Unit.Term Source REF"][-1] = x.unit.term_source.name

-           # df_dict[label + ".Unit.Term Source REF"][-1] = \
-           #     x.unit.term_source.name if x.unit.term_source else ""
        df_dict[label + ".Unit.Term Accession Number"][-1] = \
            x.unit.term_accession
    else:
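A note for reviewers on the overall shape of this file's change: repeated output labels (data files and assay names) now get a per-path occurrence counter appended while the DataFrame is assembled ("Raw Data File.0", "Raw Data File.1", ...), and the counter is stripped again when the final header is written. A minimal, self-contained sketch of that round trip; the helper names here are illustrative, not part of the isatools API:

```python
def dedupe_labels(labels):
    """Append '.N' per occurrence so repeated column labels stay unique."""
    counts = {}
    deduped = []
    for label in labels:
        n = counts.get(label, 0)
        deduped.append("{0}.{1}".format(label, n))
        counts[label] = n + 1
    return deduped

def restore_labels(deduped):
    """Strip the '.N' tracking suffix before writing the ISA-Tab header."""
    return [label[:label.rindex(".")] if "." in label else label
            for label in deduped]

columns = dedupe_labels(["Raw Data File", "Raw Data File", "Derived Data File"])
# ['Raw Data File.0', 'Raw Data File.1', 'Derived Data File.0']
assert restore_labels(columns) == [
    "Raw Data File", "Raw Data File", "Derived Data File"]
```

The strip step mirrors the `col[:col.rindex(".")]` rename in the hunk above.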
13 changes: 8 additions & 5 deletions isatools/isatab/load/ProcessSequenceFactory.py
@@ -1,3 +1,5 @@
+import re
+
 from isatools.isatab.utils import process_keygen, find_lt, find_gt, pairwise, get_object_column_map, get_value
 from isatools.isatab.defaults import (
     log,
@@ -146,11 +148,12 @@ def create_from_df(self, DF):
            except KeyError:
                pass

-        for data_col in [x for x in DF.columns if x.endswith(" File")]:
+        for data_col in [x for x in DF.columns if " File" in x]:
+            label = re.match(r'(.* File)', data_col).group(0)
             filenames = [x for x in DF[data_col].drop_duplicates() if x != '']
-            data.update(dict(map(lambda x: (':'.join([data_col, x]), DataFile(filename=x, label=data_col)), filenames)))
+            data.update(dict(map(lambda x: (':'.join([data_col, x]), DataFile(filename=x, label=label)), filenames)))

-        node_cols = [i for i, c in enumerate(DF.columns) if c in _LABELS_MATERIAL_NODES + _LABELS_DATA_NODES]
+        node_cols = [i for i, c in enumerate(DF.columns) if c in _LABELS_MATERIAL_NODES + _LABELS_DATA_NODES or ' File' in c]
         proc_cols = [i for i, c in enumerate(DF.columns) if c.startswith("Protocol REF")]

         try:
@@ -167,7 +170,7 @@ def get_node_by_label_and_key(labl, this_key):
                n = samples[lk]
            elif labl in ('Extract Name', 'Labeled Extract Name'):
                n = other_material[lk]
-           elif labl.endswith(' File'):
+           elif ' File' in labl:
                n = data[lk]
            return n

@@ -260,7 +263,7 @@ def get_node_by_label_and_key(labl, this_key):
                        fv_set.add(fv)
                    material.factor_values = list(fv_set)

-            elif object_label in _LABELS_DATA_NODES:
+            elif object_label in _LABELS_DATA_NODES or ' File' in object_label:

[Review comment, Member]: object_label="foo File" would cause an issue.

[Reply, Author]:

1. I don't think "foo File" would validate.
2. There are other instances of patterns like `' File' in` or `endswith('File')` that were already present, where "foo File" would cause the same issue; I just made things consistent.
3. When this change was originally made, not every File column was in _LABELS_DATA_NODES.
4. Having a list of specific acceptable file names is pretty fragile anyway; I would have generalized to columns ending in " File" a while ago, or to a pattern like what "Protocol REF" does.

[Reply, Author]: After discussing in the meeting, I will remove this and make the code always just look in _LABELS_DATA_NODES.

                for _, object_series in DF[column_group].drop_duplicates().iterrows():
                    try:
                        data_file = get_node_by_label_and_key(object_label, str(object_series[object_label]))
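To make the trade-off debated above concrete: a standalone comparison of the three matching strategies in play (substring, suffix, fixed whitelist). The column names are invented, and `KNOWN_FILE_COLUMNS` is a stand-in for `_LABELS_DATA_NODES`; note that pandas renames duplicated headers to "Raw Data File.1" style names, which is presumably why a plain `endswith(" File")` no longer matches every file column:

```python
KNOWN_FILE_COLUMNS = {"Raw Data File", "Derived Data File", "Array Data File"}

candidates = ["Raw Data File", "Raw Data File.1", "foo File", "File Comment"]

substring = [c for c in candidates if " File" in c]
suffix = [c for c in candidates if c.endswith(" File")]
whitelist = [c for c in candidates if c.split(".")[0] in KNOWN_FILE_COLUMNS]

print(substring)  # ['Raw Data File', 'Raw Data File.1', 'foo File']
print(suffix)     # ['Raw Data File', 'foo File']
print(whitelist)  # ['Raw Data File', 'Raw Data File.1']
```

The substring match is the only one of the three that keeps the pandas-mangled duplicate without needing a whitelist, at the cost of also accepting "foo File", which is exactly the objection raised in this thread.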
2 changes: 1 addition & 1 deletion isatools/isatab/utils.py
@@ -496,7 +496,7 @@ def get_object_column_map(isatab_header, df_columns):
    """
    labels = _LABELS_MATERIAL_NODES + _LABELS_DATA_NODES
    if set(isatab_header) == set(df_columns):
-        object_index = [i for i, x in enumerate(df_columns) if x in labels or 'Protocol REF' in x]
+        object_index = [i for i, x in enumerate(df_columns) if x in labels or 'Protocol REF' in x or ' File' in x]
    else:
        object_index = [i for i, x in enumerate(isatab_header) if x in labels + ['Protocol REF']]

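For context on the predicate being extended here: `get_object_column_map` marks the columns where a new object's group starts and slices the header at those boundaries. A simplified sketch of that idea under the new predicate; the pairing step is illustrative, not a copy of the real implementation:

```python
def object_starts(columns, labels):
    """Indexes where a node or Protocol REF column opens a new group."""
    return [i for i, c in enumerate(columns)
            if c in labels or "Protocol REF" in c or " File" in c]

header = ["Sample Name", "Protocol REF", "Extract Name",
          "Raw Data File.1", "Comment[checksum]"]
starts = object_starts(header, {"Sample Name", "Extract Name"})

# Slice between consecutive boundaries: attribute columns such as
# Comment[...] stay attached to the object column that precedes them.
groups = [header[a:b] for a, b in zip(starts, starts[1:] + [len(header)])]
print(groups)
# [['Sample Name'], ['Protocol REF'], ['Extract Name'],
#  ['Raw Data File.1', 'Comment[checksum]']]
```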
150 changes: 150 additions & 0 deletions isatools/model/utils.py
@@ -1,3 +1,5 @@
+import itertools
+
 import networkx as nx
 from hashlib import md5, sha1, sha256, blake2b
 import os
@@ -18,6 +20,154 @@ def find(predictor, iterable):
return None, it


def _build_paths_and_indexes(process_sequence=None):
"""Returns the paths from source/sample to end points and a mapping of sequence_identifier to object."""

def _compute_combinations(identifier_list, identifiers_to_objects):
[Review comment, Member]: @ptth222 please refactor to avoid nested function and revisit the nested for loops before @terazus can review.

io_types = {}
for identifier in identifier_list:
io_object = identifiers_to_objects[identifier]
if isinstance(io_object, DataFile):
label = io_object.label
if label not in io_types:
io_types[label] = [identifier]
else:
io_types[label].append(identifier)
else:
if "Material" not in io_types:
io_types["Material"] = [identifier]
else:
io_types["Material"].append(identifier)
combinations = [item for item in list(itertools.product(*[values for values in io_types.values()])) if item]
return combinations

## Determining paths depends on processes having next_process and prev_process
## links, so add them, based on shared inputs and outputs, if they aren't there.
inputs_to_process = {id(p_input):{"process":process, "input":p_input} for process in process_sequence for p_input in process.inputs}
outputs_to_process = {id(output):{"process":process, "output":output} for process in process_sequence for output in process.outputs}
for output, output_dict in outputs_to_process.items():
if output in inputs_to_process:
if not inputs_to_process[output]["process"].prev_process:
inputs_to_process[output]["process"].prev_process = output_dict["process"]
if not output_dict["process"].next_process:
output_dict["process"].next_process = inputs_to_process[output]["process"]

paths = []
identifiers_to_objects = {}
all_inputs = set()
all_outputs = set()
for process in process_sequence:

identifiers_to_objects[process.sequence_identifier] = process
for output in process.outputs:
identifiers_to_objects[output.sequence_identifier] = output
all_outputs.add(output.sequence_identifier)
for input_ in process.inputs:
identifiers_to_objects[input_.sequence_identifier] = input_
all_inputs.add(input_.sequence_identifier)


original_process = process

right_processes = []
while next_process := process.next_process:
right_processes.append(next_process.sequence_identifier)
process = next_process

left_processes = []
process = original_process
while prev_process := process.prev_process:
left_processes.append(prev_process.sequence_identifier)
process = prev_process
left_processes = list(reversed(left_processes))

paths.append(left_processes + [original_process.sequence_identifier] + right_processes)


unique_paths = [list(x) for x in set(tuple(x) for x in paths)]
paths = unique_paths
dead_end_outputs = all_outputs - all_inputs

## Add paths based on inputs and outputs.
str_path_to_path = {}
was_path_modified = {}
paths_seen = []
paths_seen_twice = []
while True:
new_paths = []
paths_seen_changed = False
for path in paths:
str_path = str(path)
str_path_to_path[str_path] = path
if path not in paths_seen:
paths_seen.append(path)
paths_seen_changed = True
else:
paths_seen_twice.append(path)
continue
path_len = len(path)
path_modified = False
for i, identifier in enumerate(path):
node = identifiers_to_objects[identifier]

if i == 0 and isinstance(node, Process):
identifier_list = [input_.sequence_identifier for input_ in node.inputs]
combinations = _compute_combinations(identifier_list, identifiers_to_objects)
for combo in combinations:
new_path = list(combo) + path
path_modified = True
if new_path not in new_paths:
new_paths.append(new_path)
continue

if i == path_len - 1 and isinstance(node, Process):
identifier_list = [output.sequence_identifier for output in node.outputs]
combinations = _compute_combinations(identifier_list, identifiers_to_objects)
for combo in combinations:
new_path = path + list(combo)
path_modified = True
if new_path not in new_paths:
new_paths.append(new_path)
continue

if i + 1 < path_len and isinstance(identifiers_to_objects[path[i+1]], Process) and i > 0 and isinstance(node, Process):
output_sequence_identifiers = {output.sequence_identifier for output in node.outputs}
input_sequence_identifiers = {input_.sequence_identifier for input_ in identifiers_to_objects[path[i+1]].inputs}
identifier_intersection = output_sequence_identifiers.intersection(input_sequence_identifiers)

combinations = _compute_combinations(identifier_intersection, identifiers_to_objects)
for combo in combinations:
new_path = path[0:i+1] + list(combo) + path[i+1:]
path_modified = True
if new_path not in new_paths:
new_paths.append(new_path)

## Add outputs that aren't later used as inputs.
for output in output_sequence_identifiers.intersection(dead_end_outputs):
new_path = path[:i+1] + [output]
path_modified = True
if new_path not in new_paths:
new_paths.append(new_path)
continue
## This is supposed to catch different length paths.
if not path_modified and path not in new_paths:
new_paths.append(path)

if str_path in was_path_modified:
if path_modified:
was_path_modified[str_path] = path_modified
else:
was_path_modified[str_path] = path_modified
if not paths_seen_changed:
break
paths = new_paths


paths = [str_path_to_path[path] for path, was_modified in was_path_modified.items() if not was_modified]

return paths, identifiers_to_objects


def _build_assay_graph(process_sequence=None):
""":obj:`networkx.DiGraph` Returns a directed graph object based on a
given ISA process sequence."""
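As background to the refactor request on `_compute_combinations` above: the helper buckets a process's input or output identifiers by their DataFile label, pools every non-file node under "Material", and then takes the Cartesian product across buckets, so each expanded path carries exactly one node per label. A toy sketch of that grouping-and-product step, with plain strings standing in for sequence identifiers and labels:

```python
import itertools

# Hypothetical identifier-to-label mapping; the real code looks the label
# up on DataFile objects and uses "Material" for everything else.
identifiers_to_labels = {
    "raw1": "Raw Data File",
    "raw2": "Raw Data File",
    "derived1": "Derived Data File",
    "extract1": "Material",
}

io_types = {}
for identifier, label in identifiers_to_labels.items():
    io_types.setdefault(label, []).append(identifier)

# One identifier per label bucket in every combination.
combinations = [c for c in itertools.product(*io_types.values()) if c]
print(combinations)
# [('raw1', 'derived1', 'extract1'), ('raw2', 'derived1', 'extract1')]
```

Replacing the `if label not in io_types` / `else` blocks in the diff with `setdefault`, as here, is one way to flatten the branching the reviewer flagged.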