pyodide/pyodide-build/pyodide_build/_f2c_fixes.py

import re
import subprocess
from collections.abc import Iterable, Iterator
from pathlib import Path
from textwrap import dedent  # for doctests


def prepare_doctest(x: str) -> list[str]:
    return dedent(x).strip().splitlines(True)


def fix_f2c_input(f2c_input_path: str) -> None:
    """
    OpenBLAS has been manually modified to remove useless arguments generated by
    f2c. But the mismatches between the f2c ABI and the human-curated sensible
    ABI in OpenBLAS cause us great pain.

    This stuff applies to actual source files, but scipy also has multiple
    templating engines for Fortran, so these changes have to be applied
    immediately prior to f2c'ing a .f file to ensure that they also work
    correctly on templated files.

    Fortran seems to be mostly case insensitive. The templated files in
    particular can include weird mixtures of lower and upper case.

    Mostly the issues are related to 'character' types. Most LAPACK functions
    that take string arguments use them as enums and only care about the first
    character of the string. f2c generates a 'length' argument to indicate how
    long the string is, but OpenBLAS leaves these length arguments out because
    the strings are assumed to have length 1.

    So the goal is to cause f2c to generate no length argument. We can achieve
    this by replacing the string with the ascii code of the first character
    e.g.,:

        f('UPPER') --> f(85)

    Coming from C this surprises me a bit. I would expect `f(85)` to cause a
    segfault or something when f tries to find its string at memory address 85.

    f("UPPER") gets f2c'd to:

        f("UPPER", 5)

    But f2c compiles f(85) to the C code:

        static integer c__85 = 85;
        f(&c__85);

    This is perfect. Not sure why it does this, but it's very convenient for us.

    chla_transtype is a special case. The OpenBLAS version of chla_transtype takes
    a return argument, whereas f2c thinks it should return the value.

    """
    f2c_input = Path(f2c_input_path)
    with open(f2c_input) as f:
        lines = f.readlines()
    new_lines = []
    lines = char1_args_to_int(lines)

    for line in lines:
        line = fix_string_args(line)

        if f2c_input_path.endswith("_flapack-f2pywrappers.f"):
            line = line.replace("character cmach", "integer cmach")
            line = line.replace("character norm", "integer norm")
        if "id_dist" in str(f2c_input):
            line = line.replace("character*1 jobz", "integer jobz")
            if "jobz =" in line:
                line = re.sub("'(.)'", lambda r: str(ord(r.group(1))), line)

        if f2c_input.name in [
            "_lapack_subroutine_wrappers.f",
            "_blas_subroutine_wrappers.f",
        ]:
            line = line.replace("character", "integer")
            line = line.replace("ret = chla_transtype(", "call chla_transtype(ret, 1,")

        # f2c has no support for variable sized arrays, so we replace them with
        # dummy fixed sized arrays and then put the formulas back in in
        # fix_f2c_output. Luckily, variable sized arrays are scarce in the scipy
        # code base.
        if "PROPACK" in str(f2c_input):
            line = line.replace("ylocal(n)", "ylocal(123001)")
            line = line.replace("character*1", "integer")

        if f2c_input.name == "mvndst.f":
            line = re.sub(r"(infin|stdev|nlower|nupper)\(d\)", r"\1(123001)", line)
            line = line.replace("rho(d*(d-1)/2)", "rho(123002)")

        new_lines.append(line)

    with open(f2c_input_path, "w") as f:
        f.writelines(new_lines)


def fix_string_args(line: str) -> str:
    """
    Replace all single character strings in (the first line of) "call"
    statements with their ascci codes.
    """
    if (
        not re.search("call", line, re.I)
        and "SIGNST" not in line
        and "TRANST" not in line
    ):
        return line
    else:
        return re.sub("'[A-Za-z0-9]'", lambda y: str(ord(y.group(0)[1])), line)


def char1_to_int(x: str) -> str:
    """
    Replace multicharacter strings with the ascii code of their first character.

    >>> char1_to_int("CALL sTRSV( 'UPPER', 'NOTRANS', 'NONUNIT', J, H, LDH, Y, 1 )")
    'CALL sTRSV( 85, 78, 78, J, H, LDH, Y, 1 )'
    """
    return re.sub("'(.)[A-Za-z -]*'", lambda r: str(ord(r.group(1))), x)


def char1_args_to_int(lines: list[str]) -> list[str]:
    """
    Replace strings with the ascii code of their first character if they are
    arguments to one of a long list of hard coded LAPACK functions (see
    fncstems). This handles multiline function calls.

    >>> print(char1_args_to_int(["CALL sTRSV( 'UPPER', 'NOTRANS', 'NONUNIT', J, H, LDH, Y, 1 )"]))
    ['CALL sTRSV( 85, 78, 78, J, H, LDH, Y, 1 )']

    >>> print("".join(char1_args_to_int(prepare_doctest('''
    ...               call cvout (logfil, nconv, workl(ihbds), ndigit,
    ...     &            '_neupd: Last row of the eigenvector matrix for T')
    ...     call ctrmm('Right'   , 'Upper'      , 'No transpose',
    ...     &                  'Non-unit', n            , nconv         ,
    ...     &                  one       , workl(invsub), ldq           ,
    ...     &                  z         , ldz)
    ... '''))))
    call cvout (logfil, nconv, workl(ihbds), ndigit,
    &            '_neupd: Last row of the eigenvector matrix for T')
    call ctrmm(82   , 85      , 78,
    &                  78, n            , nconv         ,
    &                  one       , workl(invsub), ldq           ,
    &                  z         , ldz)
    """
    fncstems = [
        "gemm",
        "ggbak",
        "gghrd",
        "lacpy",
        "lamch",
        "lanhs",
        "lanst",
        "larf",
        "lascl",
        "laset",
        "lasr",
        "ormqr",
        "orm2r",
        "steqr",
        "stevr",
        "trevc",
        "trmm",
        "trsen",
        "trsv",
        "unm2r",
        "unmqr",
    ]
    fncnames = []
    for c in "cdsz":
        for stem in fncstems:
            fncnames.append(c + stem)
    fncnames += ["lsame"]

    funcs_pattern = "|".join(fncnames)
    new_lines = []
    replace = False
    for line in lines:
        if re.search(funcs_pattern, line, re.IGNORECASE):
            replace = True
        if replace:
            line = char1_to_int(line)
        if not re.search(r",\s*$", line):
            replace = False
        new_lines.append(line)
    return new_lines


def fix_f2c_output(f2c_output_path: str) -> str | None:
    """
    This function is called on the name of each C output file. It fixes up the C
    output in various ways to compensate for the lack of f2c support for Fortran
    90 and Fortran 95.
    """
    f2c_output = Path(f2c_output_path)

    with open(f2c_output) as f:
        lines = f.readlines()
    if "id_dist" in f2c_output_path:
        # Fix implicit casts in id_dist.
        lines = fix_inconsistent_decls(lines)
    if "odepack" in f2c_output_path or f2c_output.name == "mvndst.c":
        # Mark all but one declaration of each struct as extern.
        if f2c_output.name == "blkdta000.c":
            # extern marking in blkdata000.c doesn't work properly so we let it
            # define the one copy of the structs. It doesn't talk about lsa001
            # at all though, so we need to add a definition of it.
            lines.append(
                """
                struct {    doublereal rownd2, pdest, pdlast, ratio, cm1[12], cm2[5], pdnorm;
                    integer iownd2[3], icount, irflag, jtyp, mused, mxordn, mxords;
                } lsa001_;
                """
            )
        else:
            add_externs_to_structs(lines)

    if f2c_output.name == "_lapack_subroutine_wrappers.c":
        lines = [
            line.replace("integer chla_transtype__", "void chla_transtype__")
            for line in lines
        ]

    # Substitute back the dummy fixed array sizes. We also have to remove the
    # "static" storage specifier since variable sized arrays can't have static
    # storage.
    if f2c_output.name == "mvndst.c":
        lines = fix_inconsistent_decls(lines)

        def fix_line(line: str) -> str:
            if "12300" in line:
                return (
                    line.replace("static", "")
                    .replace("123001", "(*d__)")
                    .replace("123002", "(*d__)*((*d__)-1)/2")
                )
            return line

        lines = list(map(fix_line, lines))

    if "PROPACK" in str(f2c_output):

        def fix_line(line: str) -> str:
            if f2c_output.name != "cgemm_ovwr.c":
                line = line.replace("struct", "extern struct")
            if "12300" in line:
                return line.replace("static", "").replace("123001", "(*n)")
            return line

        lines = list(map(fix_line, lines))
        if f2c_output.name.endswith("lansvd.c"):
            lines.append(
                """
                #include <time.h>

                int second_(real *t) {
                    *t = clock()/1000;
                    return 0;
                }
                """
            )

    # In numpy 1.24 f2py changed its treatment of character argument. In
    # particular it does not generate a ftnlen parameter for each
    # character parameter but f2c still generates it. The following code
    # removes unneeded ftnlen parameters from the f2ced signature. The
    # problematic subroutines and parameters are the ones with a type character
    # in scipy/sparse/linalg/_eigen/arpack/arpack.pyf.src
    if "eupd.c" in str(f2c_output):
        # put signature on a single line to make replacement more
        # straightforward
        regrouped_lines = regroup_lines(lines)
        lines = [
            re.sub(r",?\s*ftnlen\s*(howmny_len|bmat_len)", "", line)
            for line in regrouped_lines
        ]

    # Fix signature of c_abs to match the OpenBLAS one
    if "REVCOM.c" in str(f2c_output):
        lines = [line.replace("double c_abs(", "float c_abs(") for line in lines]

    with open(f2c_output, "w") as f:
        f.writelines(lines)

    return None


def add_externs_to_structs(lines: list[str]) -> None:
    """
    The fortran "common" keyword is supposed to share variables between a bunch
    of files. f2c doesn't handle this correctly (it isn't possible for it to
    handle it correctly because it only looks one file at a time).

    We mark all the structs as externs and then (separately) add one non extern
    version to each file.
    >>> lines = prepare_doctest('''
    ...     struct {    doublereal rls[218];
    ...         integer ils[39];
    ...     } ls0001_;
    ...     struct {    doublereal rlsa[22];
    ...         integer ilsa[9];
    ...     } lsa001_;
    ...     struct {    integer ieh[2];
    ...     } eh0001_;
    ... ''')
    >>> add_externs_to_structs(lines)
    >>> print("".join(lines))
    extern struct {    doublereal rls[218];
        integer ils[39];
    } ls0001_;
    extern struct {    doublereal rlsa[22];
        integer ilsa[9];
    } lsa001_;
    extern struct {    integer ieh[2];
    } eh0001_;
    """
    for idx, line in enumerate(lines):
        if line.startswith("struct"):
            lines[idx] = "extern " + lines[idx]


def regroup_lines(lines: Iterable[str]) -> Iterator[str]:
    """
    Make sure that functions and declarations have their argument list only on
    one line.

    >>> print("".join(regroup_lines(prepare_doctest('''
    ...     /* Subroutine */ int clanhfwrp_(real *ret, char *norm, char *transr, char *
    ...     	uplo, integer *n, complex *a, real *work, ftnlen norm_len, ftnlen
    ...     	transr_len, ftnlen uplo_len)
    ...     {
    ...        static doublereal psum[52];
    ...        extern /* Subroutine */ int dqelg_(integer *, doublereal *, doublereal *,
    ...            doublereal *, doublereal *, integer *);
    ... '''))))
    /* Subroutine */ int clanhfwrp_(real *ret, char *norm, char *transr, char * uplo, integer *n, complex *a, real *work, ftnlen norm_len, ftnlen transr_len, ftnlen uplo_len){
       static doublereal psum[52];
       extern /* Subroutine */ int dqelg_(integer *, doublereal *, doublereal *, doublereal *, doublereal *, integer *);

    """
    line_iter = iter(lines)
    for line in line_iter:
        if "/* Subroutine */" not in line:
            yield line
            continue

        is_definition = line.startswith("/* Subroutine */")
        stop = ")" if is_definition else ";"
        if stop in line:
            yield line
            continue

        sub_lines = [line.rstrip()]
        for line in line_iter:
            sub_lines.append(line.strip())
            if stop in line:
                break
        joined_line = " ".join(sub_lines)
        if is_definition:
            yield joined_line
        else:
            yield from (x + ";" for x in joined_line.split(";")[:-1])


def fix_inconsistent_decls(lines: list[str]) -> list[str]:
    """
    Fortran functions in id_dist use implicit casting of function args which f2c
    doesn't support.

    The fortran equivalent of the following code:

        double f(double x){
            return x + 5;
        }
        double g(int x){
            return f(x);
        }

    gets f2c'd to:

        double f(double x){
            return x + 5;
        }
        double g(int x){
            double f(int);
            return f(x);
        }

    which fails to compile because the declaration of f type clashes with the
    definition. Gather up all the definitions in each file and then gathers the
    declarations and fixes them if necessary so that the declaration matches the
    definition.

    >>> print("".join(fix_inconsistent_decls(prepare_doctest('''
    ...    /* Subroutine */ double f(double x){
    ...        return x + 5;
    ...    }
    ...    /* Subroutine */ double g(int x){
    ...        extern /* Subroutine */ double f(int);
    ...        return f(x);
    ...    }
    ... '''))))
    /* Subroutine */ double f(double x){
        return x + 5;
    }
    /* Subroutine */ double g(int x){
        extern /* Subroutine */ double f(double);
        return f(x);
    }
    """
    func_types = {}
    lines = list(regroup_lines(lines))
    for line in lines:
        if not line.startswith("/* Subroutine */"):
            continue
        [func_name, types] = get_subroutine_decl(line)
        func_types[func_name] = types

    for idx, line in enumerate(lines):
        if "extern /* Subroutine */" not in line:
            continue
        decls = line.split(")")[:-1]
        for decl in decls:
            [func_name, types] = get_subroutine_decl(decl)
            if func_name not in func_types or types == func_types[func_name]:
                continue
            types = func_types[func_name]
            l = list(line.partition(func_name + "("))
            l[2:] = list(l[2].partition(")"))
            l[2] = ", ".join(types)
            line = "".join(l)
        lines[idx] = line
    return lines


def get_subroutine_decl(sub: str) -> tuple[str, list[str]]:
    """
    >>> get_subroutine_decl(
    ...     "extern /* Subroutine */ int dqelg_(integer *, doublereal *, doublereal *, doublereal *, doublereal *, integer *);"
    ... )
    ('dqelg_', ['integer *', 'doublereal *', 'doublereal *', 'doublereal *', 'doublereal *', 'integer *'])
    """
    func_name = sub.partition("(")[0].rpartition(" ")[2]
    args_str = sub.partition("(")[2].partition(")")[0]
    args = args_str.split(",")
    types = []
    for arg in args:
        arg = arg.strip()
        if "*" in arg:
            type = "".join(arg.partition("*")[:-1])
        else:
            type = arg.partition(" ")[0]
        types.append(type.strip())
    return (func_name, types)


def scipy_fix_cfile(path: str) -> None:
    """
    Replace void return types with int return types in various generated .c and
    .h files. We can't achieve this with a simple patch because these files are
    not in the sdist, they are generated as part of the build.
    """
    source_path = Path(path)
    text = source_path.read_text()
    text = text.replace("extern void F_WRAPPEDFUNC", "extern int F_WRAPPEDFUNC")
    text = text.replace("extern void F_FUNC", "extern int F_FUNC")
    text = text.replace("void (*f2py_func)", "int (*f2py_func)")
    text = text.replace("static void cb_", "static int cb_")
    text = text.replace("typedef void(*cb_", "typedef int(*cb_")
    text = text.replace("void(*)", "int(*)")
    text = text.replace("static void f2py_setup_", "static int f2py_setup_")

    if path.endswith("_flapackmodule.c"):
        text = text.replace(",size_t", "")
        text = re.sub(r",slen\([a-z]*\)\)", ")", text)

    if path.endswith("stats/statlib/spearman.c"):
        # in scipy/stats/statlib/swilk.f ALNORM is called with a double, and in
        # scipy/stats/statlib/spearman.f with a real this generates
        # inconsistent signature. Let's use double in both, I don't think this
        # code path will work (but at least it will compile) since it needs
        # "ALNORM = algorithm AS66", which I don't think we have with the f2c
        # route
        text = text.replace("extern real alnorm_", "extern doublereal alnorm_")

    source_path.write_text(text)

    for lib in ["lapack", "blas"]:
        if path.endswith(f"cython_{lib}.c"):
            header_name = f"_{lib}_subroutines.h"
            header_dir = Path(path).parent
            header_path = find_header(header_dir, header_name)

            header_text = header_path.read_text()
            header_text = header_text.replace("void F_FUNC", "int F_FUNC")
            header_path.write_text(header_text)


def find_header(source_dir: Path, header_name: str) -> Path:
    """
    Find the header file that corresponds to a source file.
    """
    while not (header_path := source_dir / header_name).exists():
        # meson copies the source files into a subdirectory of the build
        source_dir = source_dir.parent
        if source_dir == Path("/"):
            raise RuntimeError(f"Could not find header file {header_name}")

    return header_path


def scipy_fixes(args: list[str]) -> None:
    for arg in args:
        if arg.endswith(".c"):
            scipy_fix_cfile(arg)


def replay_f2c(args: list[str], dryrun: bool = False) -> list[str] | None:
    """Apply f2c to compilation arguments

    Parameters
    ----------
    args
       input compiler arguments
    dryrun
       if False run f2c on detected fortran files

    Returns
    -------
    new_args
       output compiler arguments


    Examples
    --------

    >>> replay_f2c(['gfortran', 'test.f'], dryrun=True)
    ['gcc', 'test.c']
    """

    new_args = ["gcc"]
    found_source = False
    for arg in args[1:]:
        if arg.endswith(".f") or arg.endswith(".F"):
            filepath = Path(arg).resolve()
            if not dryrun:
                fix_f2c_input(arg)
                if arg.endswith(".F"):
                    # .F files apparently expect to be run through the C
                    # preprocessor (they have #ifdef's in them)
                    # Use gfortran frontend, as gcc frontend might not be
                    # present on osx
                    # The file-system might be not case-sensitive,
                    # so take care to handle this by renaming.
                    # For preprocessing and further operation the
                    # expected file-name and extension needs to be preserved.
                    subprocess.check_call(
                        [
                            "gfortran",
                            "-E",
                            "-C",
                            "-P",
                            filepath,
                            "-o",
                            filepath.with_suffix(".f77"),
                        ]
                    )
                    filepath = filepath.with_suffix(".f77")
                # -R flag is important, it means that Fortran functions that
                # return real e.g. sdot will be transformed into C functions
                # that return float. For historic reasons, by default f2c
                # transform them into functions that return a double. Using -R
                # allows to match what OpenBLAS has done when they f2ced their
                # Fortran files, see
                # https://github.com/xianyi/OpenBLAS/pull/3539#issuecomment-1493897254
                # for more details
                with (
                    open(filepath) as input_pipe,
                    open(filepath.with_suffix(".c"), "w") as output_pipe,
                ):
                    subprocess.check_call(
                        ["f2c", "-R"],
                        stdin=input_pipe,
                        stdout=output_pipe,
                        cwd=filepath.parent,
                    )
                fix_f2c_output(arg[:-2] + ".c")
            new_args.append(arg[:-2] + ".c")
            found_source = True
        else:
            new_args.append(arg)

    new_args_str = " ".join(args)
    if ".so" in new_args_str and "libgfortran.so" not in new_args_str:
        found_source = True

    if not found_source:
        return None
    return new_args