# stage2.py

#!/usr/bin/env python3
# -*- coding: utf-8 -*-
"""Stage2 code generator: internal API to public API.

stage2 takes in the generated code from stage1, and possibly additional
user-defined Fortran interfaces (see ``add_intfs()``). These are treated
on an equal footing with the stage1 code.

stage2 analyzes the dependencies between the stage1 functions, and writes
wrappers, where all bound symbols (quantities defined by any of the stage1
functions) are automatically computed, by calling other stage1 functions
(recursing where necessary).

Each function generated by stage2 takes in values only for the free symbols
(quantities *not* defined by any of the stage1 functions) encountered anywhere
in its call tree. This makes e.g. ∂²ϕ/∂Bx² "see" the dependencies on e.g.
u0, I4, and εxx.

"Free symbol" is here meant in a mathematical sense; in the programming sense,
these "free symbols" appear in the formal parameter list of the function being
generated, so they are bound names.

Created on Tue Oct 24 14:07:45 2017

@author: Juha Jeronen <juha.jeronen@tut.fi>
"""

import re
import os

from iterutil import uniqify
from util import fold_fortran_code, TextMultiBuffer
import fian  # Fortran interface analyzer, for reading stage1 code

_outfileheader = \
"""!******************************************************************************
!*              Code generated with mgs-galfenol-codegen stage2               *
!*                                                                            *
!* See https://github.com/TUTElectromechanics/mm-codegen for more information *
!*                                                                            *
!*                 This file is part of 'elmer-mgs-galfenol'                  *
!******************************************************************************
"""

class CodeGenerator:
    """Generate stage2 code (public API) for the stage1 code (internal functions)."""

    # no constructor, this is OOFP with just static and class methods.

    @staticmethod
    def analyze_args(fname, args, lookup, recurse):
        """Split args of stage1 function ``fname`` into bound and free sets.

        Any arg that exists as a key in ``lookup`` is considered to be bound
        to that function.

        All other arguments are considered free.

        Parameters:
            fname: str
                Name of the stage1 function being analyzed.

            args: tuple of str
                Names of formal parameters of fname. Each arg is, generally:
                  a) the name of another stage1 function, or
                  b) a free argument (anything not defined by a stage1 function).
                See ``fian.analyze_interface()``.

            lookup: dict(str -> list(str))
                Function name (for all functions) to list of its inargs.

            recurse: bool
                If True, recurse into ``args`` with the help of ``lookup``.
                If False, analyze ``args`` themselves only.

        Returns:
            tuple (bound, free), where each item is:
              set of ``(level, arg, fname)`` tuples (argument records, argrecs):
                where
                  ``level`` (int) is the recursion depth where ``arg`` was
                  seen. A ``level`` of 0 means top level. A level ``i > 0``
                  means "needed as a dependency from level ``i-1``".

                  ``arg`` (str) is the argument name.

                  ``fname`` (str) is the name of the function whose argument
                  ``arg`` is. Can be used to retrieve metadata (such as dtype
                  and dimspec) from the results of the interface analyzer.

                The same arg may appear at multiple levels; in this case,
                each level has its own instance in the results. If you need
                just the names, see ``strip_argrecs()``.
        """
        def analyze(fname, args, level):
            bound = set()
            free  = set()
            for arg in args:
                if arg in lookup:  # if we know a stage1 function of this name
                    bound.add((level, arg, fname))
                    if recurse:
                        b, f = analyze(arg, lookup[arg], level+1)
                        bound.update(b)
                        free.update(f)
                else:
                    free.add((level, arg, fname))
            return (bound, free)
        return analyze(fname, args, level=0)

    @staticmethod
    def strip_argrecs(argrecs):
        """Reduce argument records to a tuple of argument names.

        Parameters:
            argrecs: iterable of ``(level, arg, fname)`` tuples
                As in the bound/free sets output by ``analyze_args()``.

        Returns:
            tuple of str
                The ``arg`` field of each record, in iteration order, passed
                through ``uniqify()`` (presumably dropping repeated names;
                see ``iterutil`` for its exact semantics).
        """
        names = []
        for _level, argname, _fname in argrecs:
            names.append(argname)
        return tuple(uniqify(names))

    @classmethod
    def validate_bound_args(cls, bound, lookup):
        """Validate bound args.

        If validation passes, it proves that the dependencies (as declared
        by the formal parameter names) between the functions in the stage1
        Fortran interface are NOT:
          - recursive (calling itself or any parent on its call stack)
          - mutually recursive (a calling b and b calling a; detected even if
                                the calls are in different call chains)

        Parameters:
            bound: set of ``(level, arg, fname)`` (int, str, str) tuples
                as output by ``analyze_args()`` with ``recurse=True``.
                (Note: supply the bound set only!)

            lookup: dict(str -> list(str))
                Function name (for all functions) to list of its inargs.

        Returns:
            None.

        Raises:
            ValueError
                If a top-level name is not a key of ``lookup`` (i.e. is free),
                or if recursion or mutual recursion is detected.
        """
        # Check top level; we should be given only bound args.
        fnames = cls.strip_argrecs(bound)  # the **args** are the fnames to validate.
        invalid_args = [fname for fname in fnames if fname not in lookup]
        if invalid_args:
            raise ValueError("Got free top-level fname(s) {invalid}; validator checks only bound args".format(invalid=invalid_args))

        # Sets of callers of each bound var, for mutual recursion detection.
        #
        # The callers of "func" are the content of the call stack just before
        # we push "func" itself onto the stack. This includes "implicit" callers,
        # in the sense that f in f(g(h(x))) implicitly calls h, because g does.
        #
        # The sets of callers are built globally across all call chains;
        # the set of callers of "func" is updated with any new callers
        # of "func" encountered in any call chain.
        callers_of = {}  # str: set

        # Validate each chain individually. As a side effect, build callers_of.
        recursive_calls = []  # for error reporting
        def process(toplevel_fname, fname, callstack):
            # Track *each chain of calls* independently, since the same
            # dependency may legitimately be reached along several chains.
            if fname not in lookup:  # free arg at a deeper level; nothing to check
                return
            if fname in callstack:  # recursive call, not allowed
                recursive_calls.append((toplevel_fname, fname, callstack))
                # Stop here: descending further would walk the same cycle
                # again and again until the interpreter's recursion limit
                # is hit, masking the ValueError we want to raise below.
                return
            callers_of.setdefault(fname, set()).update(callstack)
            new_callstack = callstack + [fname]
            for dependency in lookup[fname]:
                process(toplevel_fname, dependency, new_callstack)
        for toplevel_fname in fnames:
            process(toplevel_fname, toplevel_fname, [])
        if recursive_calls:
            raise ValueError("recursion detected; (top-level fname, target, callstack) info follows: {invalid}".format(invalid=", ".join(str(item) for item in recursive_calls)))

        # Detect mutual recursion between different call chains.
        # a = the thing being called; b = its callers
        mr = [(a, b) for a in callers_of for b in callers_of[a] if a in callers_of.get(b, ())]
        if mr:
            raise ValueError("mutual recursion (possibly implicit) detected, function pair(s): {invalid}".format(invalid=mr))

    # When writing stage2 code, arguments of stage1 functions may be:
    #
    #  - Free arguments, for which no stage1 function exists.
    #      - Must be supplied by end user; add to arg list of stage2 wrapper.
    #      - May appear deeper in the call tree; propagate also those to the
    #        top-level wrapper.
    #  - Bound arguments, obtainable as the return value of a stage1 function.
    #      - Call the stage1 function in the wrapper body, then use the result.
    #        Resolve any dependencies recursively.
    #      - Processing in descending order of call tree depth, of the deepest
    #        instance of each bound arg, we always have its dependencies, since:
    #          1) No recursion or mutual recursion in the call tree
    #          2) The leaves of the call tree depend only on free args (at most)
    #      - Optimization: in a single call, we may re-use the stage1 results
    #        as many times as we need, because stage1 consists of pure functions.
    #
    @classmethod
    def write_stage2_object(cls, objtype, stage1_oname, stage1_args, metas, lookup, outbuf):
        """Emit the public-API wrapper for one stage1 function or subroutine.

        The wrapper is named ``<stage1_oname>_public``. Its formal parameters
        are the free args found anywhere in the call tree of the stage1
        object. Every bound arg is evaluated once into a local variable named
        ``<boundarg>_`` by calling the stage1 function of the same name, and
        finally the wrapped stage1 object itself is called.

        The header and the argument declarations go to both the ".f90" and
        the ".h" buffers; the body goes to ".f90" only, and the
        ``interface`` / ``end interface`` lines go to ".h" only.

        Parameters:
            objtype: str, one of:
                "function", "subroutine"
            stage1_oname: str
                Name of the stage1 function/subroutine.
            stage1_args: list(str)
                All arguments of the stage1 function/subroutine, in original order.
            metas: dict(str -> metarec)
                Lookup table of metadata records of all objects.
                key = function name, value = corresponding metarec.
                A metarec is indexed by argument name (or by the function's
                own name, for the return value) and yields a
                ``(dtype, intent, dimspec)`` triple.
            lookup: dict(str -> list(str))
                Lookup table of intent(in) args of all functions.
                (Functions only, as we do not allow subroutines as dependencies.)
            outbuf: util.TextMultiBuffer
                Where to write the output. Keyed under ".f90" and ".h".

        Returns:
            None. Mutates outbuf instead!

        Raises:
            ValueError
                From ``validate_bound_args()``, if the declared dependencies
                cannot be handled.
            RuntimeError
                If, after binding, a call would use a symbol that is neither
                a local variable nor a free arg.
        """
        impl = ".f90"
        intf = ".h"
        both = (impl, intf)

        def rettype_of(fname):
            # The return value's metadata is stored in the function's own
            # metarec, under the function's own name.
            dtype, _, _ = metas[fname][fname]
            return dtype

        def by_depth(argrec):
            # Deepest level first; ties broken by name.
            level, argname, _ = argrec
            return (-level, argname)

        def by_intent(argrec):
            # Intent first, then name. As strings, "in" sorts before "inout"
            # and "out", which is the order we want.
            _, argname, owner = argrec
            _, intent, _ = metas[owner][argname]
            return (intent, argname)

        # Analyze all args (not just intent(in)): intent(out) args of
        # subroutines are free too, and must be known as such for the
        # post-binding check.
        bound_set, free_set = cls.analyze_args(stage1_oname, stage1_args,
                                               lookup, recurse=True)

        # Make sure the declared dependencies between bound args are usable.
        cls.validate_bound_args(bound_set, lookup)

        # For each free arg, remember one function that takes it as an
        # argument, so that its metadata can be looked up.
        #
        # DANGER: slight oversimplification:
        #   All free args sharing a name are assumed to mean the same thing,
        #   so any function taking that arg will do as the metadata source.
        arg_to_metasrc = {}
        for _, arg, owner in free_set:
            arg_to_metasrc[arg] = owner

        # Free args: by intent, then name. Bound args: deepest first, so that
        # dependencies get evaluated before their users. strip_argrecs()
        # takes care of args that appear at several levels.
        freevars = cls.strip_argrecs(sorted(free_set, key=by_intent))
        boundvars = cls.strip_argrecs(sorted(bound_set, key=by_depth))

        # output: function header
        if objtype == "function":
            return_decl = "{rettype} ".format(rettype=rettype_of(stage1_oname))
        else:
            return_decl = ""

        stage2_oname = "{name}_public".format(name=stage1_oname)
        header = "{return_decl}{objtype} {name}(".format(return_decl=return_decl,
                                                         objtype=objtype,
                                                         name=stage2_oname)
        outbuf.append(both, "\n")
        outbuf.append(intf, "interface\n")
        outbuf.append(both, header)
        outbuf.append(both, ", ".join(freevars))
        outbuf.append(both, ")\n")
        outbuf.append(both, "use types\n")

        # output: declarations of the wrapper's formal parameters (free args only)
        outbuf.append(both, "implicit none\n")
        for fvar in freevars:
            dtype, intent, dimspec = metas[arg_to_metasrc[fvar]][fvar]
            if dimspec is None:
                declaration = "{dtype}, intent({intent}) :: {argname}\n".format(dtype=dtype,
                                                                                intent=intent,
                                                                                argname=fvar)
            else:
                declaration = "{dtype}, intent({intent}), dimension({dimspec}) :: {argname}\n".format(dtype=dtype,
                                                                                                      intent=intent,
                                                                                                      dimspec=dimspec,
                                                                                                      argname=fvar)
            outbuf.append(both, declaration)

        # Local variables holding the values of bound args. Filled in
        # one by one in the loop over boundvars below.
        bound_to_local = {}

        def bind_to_locals(names):
            # Replace each bound name by its local variable; keep the rest.
            resolved = []
            for name in names:
                if name in boundvars:
                    resolved.append(bound_to_local[name])
                else:
                    resolved.append(name)
            # Sanity check: only local variables and free args may remain.
            known_locals = bound_to_local.values()
            undefined = [sym for sym in resolved
                         if not (sym in known_locals or sym in freevars)]
            if undefined:
                raise RuntimeError("post-binding check: undefined symbol(s) {invalid}, neither in localvars nor in freevars".format(invalid=undefined))
            return resolved

        # The declarations of the local variables must be written before the
        # statements that assign them, but the names are generated together
        # with those statements; so collect the statements first and write
        # them out afterwards.
        assignments = []
        for bvar in boundvars:  # deepest first
            lvar = "{boundvar}_".format(boundvar=bvar)
            # The argument order comes from lookup[], since Fortran args are
            # positional. Thanks to the deepest-first ordering, everything
            # this call needs is either free or already has a local variable.
            #
            # TODO later: if no function name matches an input arg,
            # we could check if there is a subroutine that provides
            # it as one of its output args, and call it.
            call_args = ", ".join(bind_to_locals(lookup[bvar]))
            assignments.append("{localvar} = {boundvar}({args})\n".format(localvar=lvar,
                                                                          boundvar=bvar,
                                                                          args=call_args))
            bound_to_local[bvar] = lvar  # available to the calls that follow
        if boundvars:
            assignments.append("\n")
        lvar_code = "".join(assignments)

        # output: declare the local variables, in the same order as boundvars
        for bvar in boundvars:
            declaration = "{rettype} {localvar}\n".format(rettype=rettype_of(bvar),
                                                          localvar=bound_to_local[bvar])
            outbuf.append(impl, declaration)

        # output: evaluate the local variables
        outbuf.append(impl, "\n")
        outbuf.append(impl, lvar_code)

        # output: call the wrapped stage1 object
        final_args = ", ".join(bind_to_locals(stage1_args))
        if objtype == "function":
            final_call = "{retname} = {stage1_name}({args})\n".format(retname=stage2_oname,
                                                                      stage1_name=stage1_oname,
                                                                      args=final_args)
        else:  # objtype == "subroutine"
            final_call = "call {stage1_name}({args})\n".format(stage1_name=stage1_oname,
                                                               args=final_args)
        outbuf.append(impl, final_call)

        outbuf.append(impl, "\n")
        outbuf.append(both, "end {objtype}\n".format(objtype=objtype))
        outbuf.append(intf, "end interface\n")

    @classmethod
    def run(cls, s1code):
        """Generate the stage2 code (i.e. the public API) based on stage1 code.

        Only the interface (".h") entries of ``s1code`` are used. For each of
        them, a wrapper is generated for every function and then for every
        subroutine found by ``fian.analyze_interface()``.

        Progress messages are printed to stdout.

        Parameters:
            s1code: [(label, filename, content), ...]
                Stage1 code, in the output format of stage1.CodeGenerator.run().

                If you need additional user-defined interfaces, use add_intfs()
                on s1code before calling run().

        Returns:
            list of tuples, stage2 code. Each item has the format:
                (label, output_filename, content)
            where ``output_filename`` is "mgs_<label>" followed by the
            buffer key (e.g. ".f90", ".h").
        """
        interfaces = intfs_only(s1code)

        results = []
        for i, (label, input_filename, content) in enumerate(interfaces, start=1):
            outer = "({iteration:d}/{total:d})".format(iteration=i, total=len(interfaces))
            print("stage2: {outer_progress} {label} model: generating public API based on '{file}'".format(outer_progress=outer,
                                                                                                           label=label,
                                                                                                           file=input_filename))

            data_funcs, data_subroutines = fian.analyze_interface(content)

            # Only functions may appear as dependencies, so the bound-args
            # lookup table comes from the functions alone.
            func_objs, lookup, func_metas = data_funcs
            sub_objs, _, sub_metas = data_subroutines

            # Name -> parameter metadata, for functions and subroutines alike.
            # Subroutines need the function metadata too, to process their
            # dependencies.
            metas = func_metas.copy()
            metas.update(sub_metas)

            # Implementation and interface text are mostly identical, so
            # both are collected into one multi-buffer under separate keys.
            outbuf = TextMultiBuffer()

            # Functions first, then subroutines.
            for objtype, objs in (("function", func_objs),
                                  ("subroutine", sub_objs)):
                for j, (stage1_oname, stage1_args) in enumerate(objs, start=1):
                    inner = "({iteration:d}/{total:d})".format(iteration=j, total=len(objs))
                    header = "{outer_progress} {inner_progress}".format(outer_progress=outer,
                                                                        inner_progress=inner)
                    print("stage2: {header} {label} model: public API for {objtype} {name}".format(header=header,
                                                                                                   label=label,
                                                                                                   objtype=objtype,
                                                                                                   name=stage1_oname))

                    # mutates outbuf!
                    cls.write_stage2_object(objtype, stage1_oname, stage1_args,
                                            metas, lookup, outbuf)

            # One output file per buffer key.
            basename = "mgs_{label}".format(label=label)
            for ext in sorted(outbuf.keys()):
                outfile_name = "{basename}{file_ext}".format(basename=basename, file_ext=ext)
                text = _outfileheader + fold_fortran_code(outbuf[ext])
                results.append((label, outfile_name, text))

        return results

##############################################################################
# Main program (stage2)
##############################################################################

def intfs_only(s1code):
    """Filter s1code down to its interface entries.

    Parameters:
        s1code: [(label, filename, content), ...]

    Returns:
        list of (label, filename, content) tuples, in the original order,
        containing only the entries whose filename ends in ".h".
    """
    kept = []
    for label, filename, content in s1code:
        if filename.endswith(".h"):
            kept.append((label, filename, content))
    return kept

def load_stage1_intfs(path):
    """Load interfaces of stage1 generated code.

    Parameters:
        path: str
            Filesystem path to read data from. Relative or absolute.
            No final pathsep. Example: "." for the current directory.

            Will be scanned (non-recursively) for regular files whose name is
            exactly of the form "mgs_*_impl.h", where the wildcard gives the
            model label.

    Returns:
        [(label,filename,content), ...]
          sorted by filename, where
            label: str
                Label from the model. Deduced from the filename.
            filename: str
                Basename of the file (no path).
            content: str
                File content as one string (containing linefeeds).

    Raises:
        ValueError
            If no matching file is found in ``path``.
    """
    # Match against the basename only, and require the whole name to match.
    # This keeps the platform's path separator out of the regex, and rejects
    # near-misses such as "mgs_x_impl.h.bak" or "foo_mgs_x_impl.hpp".
    pattern = re.compile(r"mgs_(.*)_impl\.h")

    found = []  # (label, basename, full path)
    for basename in sorted(os.listdir(path)):  # sorted: reproducible output order
        fullname = os.path.join(path, basename)
        if not os.path.isfile(fullname):
            continue
        match = pattern.fullmatch(basename)
        if match is None:
            continue
        found.append((match.group(1), basename, fullname))

    if not found:
        raise ValueError("No stage1 files found; please generate them first by running stage1.py.")

    loaded = []
    for label, basename, fullname in found:
        with open(fullname, "rt", encoding="utf-8") as f:
            loaded.append((label, basename, f.read()))
    return loaded

def add_intfs(s1code, path, basenames):
    """Append user-defined stage1 interfaces to stage1 code.

    Parameters:
        s1code: [(label,filename,content), ...]
            As output by ``load_stage1_intfs()`` or ``stage1.CodeGenerator.run()``.

        path: str
            Filesystem path to read data from. Relative or absolute.
            No final pathsep. Example: "." for the current directory.

        basenames: list(str)
            Basenames of files containing additional user-defined interfaces.

            Each name is first passed through ``str.format(label=...)``, so
            the tag "{label}" is replaced by the label of the s1code entry
            being processed. The content of each file found is appended,
            as-is, to the content of that entry, so it gets handled on equal
            footing with any stage1 generated code.

            Files that do not exist are skipped (so it's ok for a file to
            exist for only some of the models).

    Returns:
        [(label,filename,content), ...]
            one item per item of s1code, in the same order, where each
            content has been extended with the additional interfaces.

    A message is printed to stdout for every file read and every file skipped.
    """
    updated = []
    for label, fname, text in s1code:
        for template in basenames:
            basename = template.format(label=label)
            fullname = os.path.join(path, basename)
            try:
                with open(fullname, "rt", encoding="utf-8") as file:
                    print("stage2: {label} model: reading additional interface '{file}'".format(label=label, file=basename))
                    extra = file.read()
            except FileNotFoundError:
                print("stage2: {label} model: no match for '{file}', ignoring".format(label=label, file=basename))
            else:
                text += extra
        updated.append((label, fname, text))
    return updated