
state.topological_sort is incorrect, while sdfg.utils.dfs_topological_sort is correct #1560

Open
ThrudPrimrose opened this issue Apr 26, 2024 · 1 comment


@ThrudPrimrose

Describe the bug
When I call:
dace.sdfg.utils.dfs_topological_sort(state)
I get a correct topological ordering of the nodes within a state, but calling:
state.topological_sort()
results in an ordering that is not correct.

To Reproduce
Steps to reproduce the behavior:
I have a minimal program that reproduces the issue. In the example, an access node that has an edge to a map node is listed after the map when sorted with state.topological_sort.

Expected behavior
When I print the nodes (together with their outgoing edges) in the order returned by state.topological_sort(), the output is as follows:

...
dataA  |  [dataA:None  -(gpu_dataA[0:N] <- [0:N])->  gpu_dataA:None]
kernelAddBToAOnGPUNoWCR_42[i=0:N]  |  [kernelAddBToAOnGPUNoWCR_42[i=0:N]:OUT_dataA  -(gpu_dataA[0:N])->  gpu_dataA:None]
gpu_dataA  |  [gpu_dataA:None  -(gpu_dataA[0:N])->  kernelAddBToAOnGPUNoWCR_42[i=0:N]:IN_dataA]
gpu_dataA  |  [gpu_dataA:None  -(dataA[0:N] <- [0:N])->  dataA:None]
dataA  |  []

Expected would be (the access node gpu_dataA, which has an outgoing edge to the kernel, needs to appear before the kernel):

gpu_dataA  |  [gpu_dataA:None  -(gpu_dataA[0:N])->  kernelAddBToAOnGPUNoWCR_42[i=0:N]:IN_dataA]
gpu_dataB  |  [gpu_dataB:None  -(gpu_dataB[0:N])->  kernelAddBToAOnGPUNoWCR_42[i=0:N]:IN_dataB]
kernelAddBToAOnGPUNoWCR_42[i=0:N]  |  [kernelAddBToAOnGPUNoWCR_42[i=0:N]:OUT_dataB  -(gpu_dataB[i])->  _Mult_:__in2, kernelAddBToAOnGPUNoWCR_42[i=0:N]:OUT_dataA  -(gpu_dataA[i])->  _Add_:__in2]
_Mult_  |  [_Mult_:__out  -(__tmp10[0])->  __tmp10:None]
__tmp10  |  [__tmp10:None  -(__tmp10[0])->  _Add_:__in1]
_Add_  |  [_Add_:__out  -(__tmp11[0])->  __tmp11:None]
__tmp11  |  [__tmp11:None  -(__tmp11[0])->  assign_43_8:__inp]
assign_43_8  |  [assign_43_8:__out  -(gpu_dataA[i])->  kernelAddBToAOnGPUNoWCR_42[i=0:N]:IN_dataA]
kernelAddBToAOnGPUNoWCR_42[i=0:N]  |  [kernelAddBToAOnGPUNoWCR_42[i=0:N]:OUT_dataA  -(gpu_dataA[0:N])->  gpu_dataA:None]
gpu_dataA  |  [gpu_dataA:None  -(dataA[0:N] <- [0:N])->  dataA:None]
dataA  |  []

I can't attach the generated example program, but here is the full code; it prints the nodes and saves the SDFG as ex.sdfg:

import dace
import numpy as np
import cupy as cp
from dace.dtypes import StorageType, ScheduleType
from dace.sdfg.analysis import cfg

size = int(1e4)
N = dace.symbol('N')

@dace.program
def kernelInitAOnGPU(dataA : dace.float32[N] @ StorageType.GPU_Global):
    for i in dace.map[0:N] @ ScheduleType.GPU_Device:
        dataA[i] = 2.0

@dace.program
def kernelInitBOnGPU(dataB : dace.float32[N] @ StorageType.GPU_Global):
    for i in dace.map[0:N] @ ScheduleType.GPU_Device:
        dataB[i] = 0.65

@dace.program
def copyToGPU(data : dace.float32[N] @ StorageType.Default):
    # Does not compile if: gpu_data : dace.float32[N] @ StorageType.GPU_Global = cp.array(data)
    gpu_data : dace.float32[N] @ StorageType.GPU_Global = cp.zeros((N,), dace.float32)
    gpu_data[0:N] = data[0:N]
    return gpu_data


@dace.program
def kernelOverwriteBAddBToAOnGPUNoWCR(
    dataA : dace.float32[N] @ StorageType.GPU_Global,
    dataB : dace.float32[N] @ StorageType.GPU_Global
    ):
    for i in dace.map[0:N] @ ScheduleType.GPU_Device:
        dataB[i] = 3.0
        dataA[i] = 0.5 * dataB[i] + dataA[i]

@dace.program
def kernelAddBToAOnGPUNoWCR(
    dataA : dace.float32[N] @ StorageType.GPU_Global,
    dataB : dace.float32[N] @ StorageType.GPU_Global
    ):
    for i in dace.map[0:N] @ ScheduleType.GPU_Device:
        dataA[i] = 0.5 * dataB[i] + dataA[i]

@dace.program
def kernelAddBToAOnCPU(
    dataA : dace.float32[N] @ StorageType.Default,
    dataB : dace.float32[N] @ StorageType.Default
    ):
    for i in dace.map[0:N] @ ScheduleType.CPU_Multicore:
        dataA[i] += 0.5 * dataB[i]

@dace.program
def kernelLifetimesOnGPUNoWCR(dataA : dace.float32[N] @ StorageType.Default,
                       dataB : dace.float32[N] @ StorageType.Default):
    gpu_dataA = copyToGPU(data = dataA)
    gpu_dataB = copyToGPU(data = dataB)
    kernelInitAOnGPU(dataA = gpu_dataA)
    kernelInitBOnGPU(dataB = gpu_dataB)
    kernelOverwriteBAddBToAOnGPUNoWCR(dataA = gpu_dataA, dataB = gpu_dataB)
    dataA[0:N] = gpu_dataA[0:N]
    dataB[0:N] = gpu_dataB[0:N]
    kernelAddBToAOnCPU(dataA = dataA, dataB = dataB)
    gpu_dataA[0:N] = dataA[0:N]
    gpu_dataB[0:N] = dataB[0:N]
    kernelAddBToAOnGPUNoWCR(dataA = gpu_dataA, dataB = gpu_dataB)
    dataA[0:N] = gpu_dataA[0:N]

def testLifetimesOnGPUNoWCR():
    dataA = np.zeros(size, dtype=np.float32)
    dataB = np.zeros(size, dtype=np.float32)
    reference = np.full(size, 6.5, dtype=np.float32)
    sdfg = kernelLifetimesOnGPUNoWCR.to_sdfg(dataA=dataA, dataB=dataB, N = size)
    sdfg.save("ex.sdfg")
    state_order = list(cfg.stateorder_topological_sort(sdfg))

    for state in state_order:
        nodes_with_0_incoming = list()
        for nd in state.nodes():
            if state.in_degree(nd) == 0:
                nodes_with_0_incoming.append(nd)

        if len(nodes_with_0_incoming) == 1:
            topologically_sorted_nodes1 = list(dace.sdfg.utils.dfs_topological_sort(state))
            topologically_sorted_nodes2 = list(state.topological_sort(nodes_with_0_incoming[0]))
            def same_element_representation(list1, list2):
                # Check if the lists have the same length
                if len(list1) != len(list2):
                    return False

                # Convert each element to string and compare
                for elem1, elem2 in zip(list1, list2):
                    if str(elem1) != str(elem2):
                        return False

                return True
            if not same_element_representation(topologically_sorted_nodes1, topologically_sorted_nodes2):
                print(f"For state {state}:")
                print("DaCe SDFG Utils DFS Topological Sort:")
                for n in topologically_sorted_nodes1:
                    print(n, " | ", state.out_edges(n))
                print("===")
                print("State's Internal Topological Sort:")
                for n in topologically_sorted_nodes2:
                    print(n, " | ", state.out_edges(n))
                print("Not equal")
                print("***")

    sdfg(dataA = dataA, dataB = dataB, N=size)
    assert(np.allclose(dataA, reference))

if __name__ == '__main__':
    testLifetimesOnGPUNoWCR()

Desktop (please complete the following information):

  • I am on Ubuntu 23.10
  • Output of git log: commit 78759b5 (HEAD -> master, tag: v0.16rc1)
@BenWeber42 (Contributor)

Yes, this seems to be a bug in Graph.topological_sort of our own graph layer:

dace/dace/sdfg/graph.py

Lines 367 to 390 in ee5a6df

def topological_sort(self, source: NodeT = None) -> Sequence[NodeT]:
    """Returns nodes in topological order iff the graph contains exactly
       one node with no incoming edges."""
    if source is not None:
        sources = [source]
    else:
        sources = self.source_nodes()
        if len(sources) == 0:
            sources = [self.nodes()[0]]
            #raise RuntimeError("No source nodes found")
        if len(sources) > 1:
            sources = [self.nodes()[0]]
            #raise RuntimeError("Multiple source nodes found")
    seen = OrderedDict()  # No OrderedSet in Python
    queue = deque(sources)
    while len(queue) > 0:
        node = queue.popleft()
        seen[node] = None
        for e in self.out_edges(node):
            succ = e.dst
            if succ not in seen:
                seen[succ] = None
                queue.append(succ)
    return seen.keys()

This algorithm can fail for a graph like:

 A
 | \
 |   \
 v    v 
 B <- C

The only correct topological order is A, C, B, whereas the current algorithm can also produce the incorrect order A, B, C (ignoring the dependency between C and B).
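
To make the failure concrete, here is a minimal, self-contained sketch (plain Python, no DaCe) that mimics the quoted traversal on the diamond graph above; the edge list and the successors helper are made up for illustration:

from collections import OrderedDict, deque

# Diamond graph from above: A -> B, A -> C, C -> B
edges = [('A', 'B'), ('A', 'C'), ('C', 'B')]

def successors(node):
    return [dst for src, dst in edges if src == node]

def bfs_like_order(source):
    # Same structure as Graph.topological_sort: a plain BFS that marks
    # nodes as seen when they are first reached, without checking whether
    # all of their predecessors have already been emitted.
    seen = OrderedDict()
    queue = deque([source])
    while queue:
        node = queue.popleft()
        seen[node] = None
        for succ in successors(node):
            if succ not in seen:
                seen[succ] = None
                queue.append(succ)
    return list(seen.keys())

print(bfs_like_order('A'))  # prints ['A', 'B', 'C'], which violates the C -> B edge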

My language server could find only three references to the broken Graph.topological_sort function, and all three seem to be for SDFG/ControlFlowRegion.

I would suggest completely removing Graph.topological_sort and replacing all occurrences with the working utils.dfs_topological_sort.
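
If one wanted to keep a BFS-style implementation rather than delegating to utils.dfs_topological_sort, a correct alternative would be Kahn's algorithm, which only emits a node once all of its predecessors have been emitted. The sketch below is not the actual DaCe implementation; the function is hypothetical and only assumes the nodes()/in_degree()/out_edges() interface visible in the quoted code:

from collections import deque

def kahn_topological_sort(graph, sources=None):
    # Count incoming edges per node; a node becomes ready only when all of
    # its predecessors have been emitted, so edges such as C -> B are respected.
    indegree = {n: graph.in_degree(n) for n in graph.nodes()}
    queue = deque(sources if sources else [n for n, d in indegree.items() if d == 0])
    order = []
    while queue:
        node = queue.popleft()
        order.append(node)
        for e in graph.out_edges(node):
            indegree[e.dst] -= 1
            if indegree[e.dst] == 0:
                queue.append(e.dst)
    if len(order) != len(indegree):
        raise ValueError('Graph contains a cycle; no topological order exists')
    return order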

However... it makes no sense to use topological_sort on an SDFG/ControlFlowRegion in the first place, since they can be cyclic. Why are we trying to get a topological order for a potentially cyclic graph?! Probably we are looking for a different kind of ordering 🤔
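
For context, even a simple loop already makes the state graph cyclic, so no topological order of the states exists at all. A rough sketch using the public DaCe API (the state names, the symbol i, and the loop condition are made up for illustration):

import dace

# A two-state loop: the back edge from 'body' to 'guard' makes the state
# graph cyclic, so topological_sort is ill-defined on this graph.
sdfg = dace.SDFG('cyclic_example')
sdfg.add_symbol('i', dace.int32)
guard = sdfg.add_state('guard', is_start_state=True)
body = sdfg.add_state('body')
after = sdfg.add_state('after')
sdfg.add_edge(guard, body, dace.InterstateEdge(condition='i < 10'))
sdfg.add_edge(body, guard, dace.InterstateEdge(assignments={'i': 'i + 1'}))
sdfg.add_edge(guard, after, dace.InterstateEdge(condition='i >= 10'))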

The references that my language server found:

def determine_allocation_lifetime(self, top_sdfg: SDFG):
    """
    Determines where (at which scope/state/SDFG) each data descriptor
    will be allocated/deallocated.

    :param top_sdfg: The top-level SDFG to determine for.
    """
    # Gather shared transients, free symbols, and first/last appearance
    shared_transients = {}
    fsyms = {}
    reachability = StateReachability().apply_pass(top_sdfg, {})
    access_instances: Dict[int, Dict[str, List[Tuple[SDFGState, nodes.AccessNode]]]] = {}
    for sdfg in top_sdfg.all_sdfgs_recursive():
        shared_transients[sdfg.cfg_id] = sdfg.shared_transients(check_toplevel=False, include_nested_data=True)
        fsyms[sdfg.cfg_id] = self.symbols_and_constants(sdfg)

        #############################################
        # Look for all states in which a scope-allocated array is used in
        instances: Dict[str, List[Tuple[SDFGState, nodes.AccessNode]]] = collections.defaultdict(list)
        array_names = sdfg.arrays.keys(
        )  #set(k for k, v in sdfg.arrays.items() if v.lifetime == dtypes.AllocationLifetime.Scope)
        # Iterate topologically to get state-order
        for state in sdfg.topological_sort():
            for node in state.data_nodes():
                if node.data not in array_names:
                    continue
                instances[node.data].append((state, node))
            # Look in the surrounding edges for usage
            edge_fsyms: Set[str] = set()
            for e in sdfg.all_edges(state):
                edge_fsyms |= e.data.free_symbols
            for edge_array in edge_fsyms & array_names:
                instances[edge_array].append((state, nodes.AccessNode(edge_array)))
        #############################################

        access_instances[sdfg.cfg_id] = instances

dace/dace/sdfg/state.py

Lines 2548 to 2605 in ee5a6df

def _used_symbols_internal(self,
                           all_symbols: bool,
                           defined_syms: Optional[Set] = None,
                           free_syms: Optional[Set] = None,
                           used_before_assignment: Optional[Set] = None,
                           keep_defined_in_mapping: bool = False) -> Tuple[Set[str], Set[str], Set[str]]:
    defined_syms = set() if defined_syms is None else defined_syms
    free_syms = set() if free_syms is None else free_syms
    used_before_assignment = set() if used_before_assignment is None else used_before_assignment

    try:
        ordered_blocks = self.topological_sort(self.start_block)
    except ValueError:  # Failsafe (e.g., for invalid or empty SDFGs)
        ordered_blocks = self.nodes()

    for block in ordered_blocks:
        state_symbols = set()
        if isinstance(block, ControlFlowRegion):
            b_free_syms, b_defined_syms, b_used_before_syms = block._used_symbols_internal(all_symbols)
            free_syms |= b_free_syms
            defined_syms |= b_defined_syms
            used_before_assignment |= b_used_before_syms
            state_symbols = b_free_syms
        else:
            state_symbols = block.used_symbols(all_symbols)
            free_syms |= state_symbols

        # Add free inter-state symbols
        for e in self.out_edges(block):
            # NOTE: First we get the true InterstateEdge free symbols, then we compute the newly defined symbols by
            # subracting the (true) free symbols from the edge's assignment keys. This way we can correctly
            # compute the symbols that are used before being assigned.
            efsyms = e.data.used_symbols(all_symbols)
            # collect symbols representing data containers
            dsyms = {sym for sym in efsyms if sym in self.arrays}
            for d in dsyms:
                efsyms |= {str(sym) for sym in self.arrays[d].used_symbols(all_symbols)}
            defined_syms |= set(e.data.assignments.keys()) - (efsyms | state_symbols)
            used_before_assignment.update(efsyms - defined_syms)
            free_syms |= efsyms

    # Remove symbols that were used before they were assigned.
    defined_syms -= used_before_assignment

    if isinstance(self, dace.SDFG):
        # Remove from defined symbols those that are in the symbol mapping
        if self.parent_nsdfg_node is not None and keep_defined_in_mapping:
            defined_syms -= set(self.parent_nsdfg_node.symbol_mapping.keys())
        # Add the set of SDFG symbol parameters
        # If all_symbols is False, those symbols would only be added in the case of non-Python tasklets
        if all_symbols:
            free_syms |= set(self.symbols.keys())

    # Subtract symbols defined in inter-state edges and constants from the list of free symbols.
    free_syms -= defined_syms

    return free_syms, defined_syms, used_before_assignment

def collect_constants(self,
                      sdfg: SDFG,
                      initial_symbols: Optional[Dict[str, Any]] = None) -> Dict[SDFGState, Dict[str, Any]]:
    """
    Finds all constants and constant-assigned symbols in the SDFG for each state.

    :param sdfg: The SDFG to traverse.
    :param initial_symbols: If not None, sets values of initial symbols.
    :return: A dictionary mapping an SDFG state to a mapping of constants and their corresponding values.
    """
    arrays: Set[str] = set(sdfg.arrays.keys() | sdfg.constants_prop.keys())
    result: Dict[SDFGState, Dict[str, Any]] = {}

    # Add nested data to arrays
    def _add_nested_datanames(name: str, desc: data.Structure):
        for k, v in desc.members.items():
            if isinstance(v, data.Structure):
                _add_nested_datanames(f'{name}.{k}', v)
            elif isinstance(v, data.ContainerArray):
                # TODO: How are we handling this?
                pass
            arrays.add(f'{name}.{k}')

    for name, desc in sdfg.arrays.items():
        if isinstance(desc, data.Structure):
            _add_nested_datanames(name, desc)

    # Process:
    # * Collect constants in topologically ordered states
    # * If unvisited state has one incoming edge - propagate symbols forward and edge assignments
    # * If unvisited state has more than one incoming edge, consider all paths (use reverse DFS on unvisited paths)
    # * If value is ambiguous (not the same), set value to UNKNOWN
    start_state = sdfg.start_state
    if initial_symbols:
        result[start_state] = {}
        result[start_state].update(initial_symbols)

    # Traverse SDFG topologically
    for state in optional_progressbar(sdfg.topological_sort(start_state), 'Collecting constants',
                                      sdfg.number_of_nodes(), self.progress):
        # NOTE: We must always check the start-state regardless if there are initial symbols. This is necessary
        # when the start-state is a scope's guard instead of a special initialization state, i.e., when the start-
        # state has incoming edges that may involve the initial symbols. See also:
        # `tests.passes.constant_propagation_test.test_for_with_external_init_nested_start_with_guard``
        if state in result and state is not start_state:
            continue

        # Get predecessors
        in_edges = sdfg.in_edges(state)
        if len(in_edges) == 1:  # Special case, propagate as-is
            if state not in result:  # Condition evaluates to False when state is the start-state
                result[state] = {}

            # First the prior state
            if in_edges[0].src in result:  # Condition evaluates to False when state is the start-state
                self._propagate(result[state], result[in_edges[0].src])
            # Then assignments on the incoming edge
            self._propagate(result[state], self._data_independent_assignments(in_edges[0].data, arrays))
            continue

        # More than one incoming edge: may require reversed traversal
        assignments = {}
        for edge in in_edges:
            # If source was already visited, use its propagated constants
            constants: Dict[str, Any] = {}
            if edge.src in result:
                constants.update(result[edge.src])
            else:  # Otherwise, reverse DFS to find constants until a visited state
                constants = self._constants_from_unvisited_state(sdfg, edge.src, arrays, result)

            # Update constants with incoming edge
            self._propagate(constants, self._data_independent_assignments(edge.data, arrays))

            for aname, aval in constants.items():
                # If something was assigned more than once (to a different value), it's not a constant
                if aname in assignments and aval != assignments[aname]:
                    assignments[aname] = _UnknownValue
                else:
                    assignments[aname] = aval

        if state not in result:  # Condition may evaluate to False when state is the start-state
            result[state] = {}
        self._propagate(result[state], assignments)

    return result
