From e95f4c63b0db430fe3580de419a77d9fd5b9683f Mon Sep 17 00:00:00 2001
From: Ben Olmstead
Date: Thu, 19 Dec 2024 21:07:34 +0000
Subject: [PATCH 1/3] Handle very large `.emb` files.

This change switches parse tree handling to use iteration (with an
explicit stack) instead of recursion, which:

*   Allows large (>~1000 entity) `.emb` files to be formatted.
*   Allows very large (>~16k entity) `.emb` files to be compiled.

The difference in sizes in the previous code was due to `module_ir.py`
hackily increasing the recursion limit: while this more or less worked,
it was a little dangerous (it ran the risk of blowing out the C stack,
depending on platform) and only increased the limit.  This change
removes the limit entirely (at least, up to the available memory on the
system).
---
 compiler/front_end/BUILD              |   2 +
 compiler/front_end/format_emb.py      |  24 ++--
 compiler/front_end/format_emb_test.py |  16 ++-
 compiler/front_end/module_ir.py       | 151 ++++++++++++--------------
 compiler/front_end/module_ir_test.py  |  13 +++
 compiler/util/BUILD                   |   8 ++
 compiler/util/parser_util.py          | 138 +++++++++++++++++++++++
 7 files changed, 259 insertions(+), 93 deletions(-)
 create mode 100644 compiler/util/parser_util.py

diff --git a/compiler/front_end/BUILD b/compiler/front_end/BUILD
index efcec3e..5128a77 100644
--- a/compiler/front_end/BUILD
+++ b/compiler/front_end/BUILD
@@ -73,6 +73,7 @@ py_library(
         "//compiler/util:ir_data",
         "//compiler/util:name_conversion",
         "//compiler/util:parser_types",
+        "//compiler/util:parser_util",
     ],
 )
 
@@ -436,6 +437,7 @@ py_library(
         ":module_ir",
         ":tokenizer",
         "//compiler/util:parser_types",
+        "//compiler/util:parser_util",
     ],
 )
 
diff --git a/compiler/front_end/format_emb.py b/compiler/front_end/format_emb.py
index fc7bf94..f0b0137 100644
--- a/compiler/front_end/format_emb.py
+++ b/compiler/front_end/format_emb.py
@@ -26,6 +26,7 @@
 from compiler.front_end import module_ir
 from compiler.front_end import tokenizer
 from compiler.util import parser_types
+from compiler.util import parser_util
 
 
 class Config(collections.namedtuple("Config", ["indent_width", "show_line_types"])):
@@ -67,18 +68,17 @@ def format_emboss_parse_tree(parse_tree, config, used_productions=None):
     Returns:
       A string of the reformatted source text.
     """
-    if hasattr(parse_tree, "children"):
-        parsed_children = [
-            format_emboss_parse_tree(child, config, used_productions)
-            for child in parse_tree.children
-        ]
-        args = parsed_children + [config]
-        if used_productions is not None:
-            used_productions.add(parse_tree.production)
-        return _formatters[parse_tree.production](*args)
-    else:
-        assert isinstance(parse_tree, parser_types.Token), str(parse_tree)
-        return parse_tree.text
+    formatters = {}
+    for production, handler in _formatters.items():
+        # An extra layer of indirection is required here so that the resulting
+        # lambda does not capture the local variable `handler`.
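+        # (Python closures capture variables, not values: without this extra
+        # wrapper, every formatter would end up calling the `handler` bound in
+        # the final loop iteration.)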
+        def wrapped_handler(handler):
+            return lambda _, *args: handler(*(args + (config,)))
+
+        formatters[production] = wrapped_handler(handler)
+    return parser_util.transform_parse_tree(
+        parse_tree, lambda n: n.text, formatters, used_productions
+    )
 
 
 def sanity_check_format_result(formatted_text, original_text):
diff --git a/compiler/front_end/format_emb_test.py b/compiler/front_end/format_emb_test.py
index 3d5331e..42d8a26 100644
--- a/compiler/front_end/format_emb_test.py
+++ b/compiler/front_end/format_emb_test.py
@@ -92,7 +92,21 @@ def test_eol_missing(self):
 
 
 class FormatEmbTest(unittest.TestCase):
-    pass
+
+    def test_very_long_emb(self):
+        """Checks that very long inputs do not hit the Python recursion limit."""
+        emb = ["enum Test:\n"]
+        # Enough entities to blow through the default recursion limit and the
+        # bumped limit that was previously in place.
+        for i in range(max(sys.getrecursionlimit(), 16 * 1024) * 2):
+            emb.append(f"  VALUE_{i} = {i}\n")
+        parsed_unformatted = parser.parse_module(
+            tokenizer.tokenize("".join(emb), "long.emb")[0]
+        )
+        formatted_text = format_emb.format_emboss_parse_tree(
+            parsed_unformatted.parse_tree,
+            format_emb.Config(indent_width=2),
+        )
 
 
 def _make_golden_file_tests():
diff --git a/compiler/front_end/module_ir.py b/compiler/front_end/module_ir.py
index 145bba3..633c1b8 100644
--- a/compiler/front_end/module_ir.py
+++ b/compiler/front_end/module_ir.py
@@ -29,6 +29,7 @@
 from compiler.util import ir_data_utils
 from compiler.util import name_conversion
 from compiler.util import parser_types
+from compiler.util import parser_util
 
 
 # Intermediate types; should not be found in the final IR.
@@ -82,88 +83,78 @@ def __init__(self, field, subtypes=None):
 def build_ir(parse_tree, used_productions=None):
     r"""Builds a module-level intermediate representation from a valid parse tree.
 
-  The parse tree is precisely dictated by the exact productions in the grammar
-  used by the parser, with no semantic information. _really_build_ir transforms
-  this "raw" form into a stable, cooked representation, thereby isolating
-  subsequent steps from the exact details of the grammar.
-
-  (Probably incomplete) list of transformations:
-
-  * ParseResult and Token nodes are replaced with Module, Attribute, Struct,
-    Type, etc. objects.
-
-  * Purely syntactic tokens ('"["', '"struct"', etc.) are discarded.
-
-  * Repeated elements are transformed from tree form to list form:
-
-        a*
-       / \
-      b   a*
+    The parse tree is precisely dictated by the exact productions in the grammar
+    used by the parser, with no semantic information.  build_ir transforms
+    this "raw" form into a stable, cooked representation, thereby isolating
+    subsequent steps from the exact details of the grammar.
+
+    (Probably incomplete) list of transformations:
+
+    * ParseResult and Token nodes are replaced with Module, Attribute, Struct,
+      Type, etc. objects.
+
+    * Purely syntactic tokens ('"["', '"struct"', etc.) are discarded.
+
+    * Repeated elements are transformed from tree form to list form:
+
+          a*
          / \
-        c   a*
+        b   a*
           / \
-          d   a*
+          c   a*
+             / \
+            d   a*
-
-  (where b, c, and d are nodes of type "a") becomes [b, c, d].
-
-  * The values of numeric constants (Number, etc. tokens) are parsed.
-
-  * Different classes of names (snake_names, CamelNames, ShoutyNames) are
-    folded into a single "Name" type, since they are guaranteed to appear in
-    the correct places in the parse tree.
-
-
-  Arguments:
-    parse_tree: A parse tree.  Each leaf node should be a parser_types.Token
-      object, and each non-leaf node should have a 'symbol' attribute specifying
-      which grammar symbol it represents, and a 'children' attribute containing
-      a list of child nodes.  This is the format returned by the parsers
-      produced by the lr1 module, when run against tokens from the tokenizer
-      module.
-    used_productions: If specified, used_productions.add() will be called with
-      each production actually used in parsing.  This can be useful when
-      developing the grammar and writing tests; in particular, it can be used to
-      figure out which productions are *not* used when parsing a particular
-      file.
-
-  Returns:
-    A module-level intermediate representation (module IR) for an Emboss module
-    (source file).  This IR will not have symbols resolved; that must be done on
-    a forest of module IRs so that names from other modules can be resolved.
-  """
-
-    # TODO(b/140259131): Refactor _really_build_ir to be less recursive/use an
-    # explicit stack.
-    old_recursion_limit = sys.getrecursionlimit()
-    sys.setrecursionlimit(16 * 1024)  # ~8000 top-level entities in one module.
-    try:
-        result = _really_build_ir(parse_tree, used_productions)
-    finally:
-        sys.setrecursionlimit(old_recursion_limit)
-    return result
-
-
-def _really_build_ir(parse_tree, used_productions):
-    """Real implementation of build_ir()."""
-    if used_productions is None:
-        used_productions = set()
-    if hasattr(parse_tree, "children"):
-        parsed_children = [
-            _really_build_ir(child, used_productions) for child in parse_tree.children
-        ]
-        used_productions.add(parse_tree.production)
-        result = _handlers[parse_tree.production](*parsed_children)
-        if parse_tree.source_location:
-            if isinstance(result, tuple):
-                result = result._replace(source_location=parse_tree.source_location)
-            else:
-                result.source_location = parse_tree.source_location
-        return result
-    else:
-        # For leaf nodes, the temporary "IR" is just the token.  Higher-level rules
-        # will translate it to a real IR.
-        assert isinstance(parse_tree, parser_types.Token), str(parse_tree)
-        return parse_tree
+          c   a*
+             / \
+            d   a*
+
+    (where b, c, and d are nodes of type "a") becomes [b, c, d].
+
+    * The values of numeric constants (Number, etc. tokens) are parsed.
+
+    * Different classes of names (snake_names, CamelNames, ShoutyNames) are
+      folded into a single "Name" type, since they are guaranteed to appear in
+      the correct places in the parse tree.
+
+
+    Arguments:
+        parse_tree: A parse tree.  Each leaf node should be a parser_types.Token
+            object, and each non-leaf node should have a 'symbol' attribute
+            specifying which grammar symbol it represents, and a 'children'
+            attribute containing a list of child nodes.  This is the format
+            returned by the parsers produced by the lr1 module, when run
+            against tokens from the tokenizer module.
+        used_productions: If specified, used_productions.add() will be called
+            with each production actually used in parsing.  This can be useful
+            when developing the grammar and writing tests; in particular, it
+            can be used to figure out which productions are *not* used when
+            parsing a particular file.
+
+    Returns:
+        A module-level intermediate representation (module IR) for an Emboss
+        module (source file).  This IR will not have symbols resolved,
+        constraints checked, fields synthesized, etc.; it will only be a
+        representation of the syntactic elements of the source.
+ """ + handlers = {} + for production, handler in _handlers.items(): + # An extra layer of indirection is required here so that the resulting + # lambda does not capture the local variable `handler`. + def wrapped_handler(handler): + def wrapped_handler(node, *args): + module_node = handler(*args) + if node.source_location: + if isinstance(module_node, tuple): + module_node = module_node._replace( + source_location=node.source_location + ) + else: + module_node.source_location = node.source_location + return module_node + + return wrapped_handler + + handlers[production] = wrapped_handler(handler) + return parser_util.transform_parse_tree( + parse_tree, lambda n: n, handlers, used_productions + ) # Map of productions to their handlers. diff --git a/compiler/front_end/module_ir_test.py b/compiler/front_end/module_ir_test.py index 1931bab..fa8db9c 100644 --- a/compiler/front_end/module_ir_test.py +++ b/compiler/front_end/module_ir_test.py @@ -18,6 +18,7 @@ import collections import pkgutil +import sys import unittest from compiler.front_end import module_ir @@ -4057,6 +4058,18 @@ def test_double_negative_non_compilation(self): parse_result = parser.parse_module(tokenizer.tokenize(example, "")[0]) self.assertFalse(parse_result.error) + def test_long_input(self): + """Checks that very long inputs do not hit the Python recursion limit.""" + emb = ["enum Test:\n"] + # Enough entities to blow through the default recursion limit and the + # bumped limit that was previously in place. + for i in range(max(sys.getrecursionlimit(), 16 * 1024) * 2): + emb.append(f" VALUE_{i} = {i}\n") + parse_result = parser.parse_module( + tokenizer.tokenize("".join(emb), "long.emb")[0] + ) + module_ir.build_ir(parse_result.parse_tree) + def _make_superset_tests(): diff --git a/compiler/util/BUILD b/compiler/util/BUILD index b0a29a3..4a444fd 100644 --- a/compiler/util/BUILD +++ b/compiler/util/BUILD @@ -154,6 +154,14 @@ py_test( ], ) +py_library( + name = "parser_util", + srcs = ["parser_util.py"], + deps = [ + ":parser_types", + ], +) + py_library( name = "error", srcs = [ diff --git a/compiler/util/parser_util.py b/compiler/util/parser_util.py new file mode 100644 index 0000000..c95f70a --- /dev/null +++ b/compiler/util/parser_util.py @@ -0,0 +1,138 @@ +# Copyright 2024 Google LLC +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# https://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +"""Utilities for working with parse trees.""" + +from compiler.util import parser_types + + +def transform_parse_tree( + parse_tree, token_handler, production_handlers, used_productions=None +): + """Walks the provided parse_tree, calling handlers for each node. + + This function uses provided handlers to transform a parse tree into a new + structure. From the bottom up, calls a handler on each node, passing the + node itself and the results of calling handlers on each child of the node + (if any). + + This function avoids recursion, so it is suitable for very deep parse + trees. + + Arguments: + parse_tree: the tree to process + token_handler: the handler to use for Token nodes. 
+        production_handlers: a map from productions to handlers for those
+            productions.
+        used_productions: an optional set; all encountered productions will be
+            added to used_productions.
+
+    Returns:
+        The result of the production_handler for the top-level parse_tree node.
+    """
+    # The stack of entries to process.  Each entry is in one of two states,
+    # depending on the 3rd element (children_completed):
+    #
+    # If False, the node's children have not been added, and the action is to
+    # push the same node with children_completed=True, then push an entry for
+    # each of the node's children with children_completed=False.
+    #
+    # If True, the node's children have been added and (by the time the entry
+    # is back on top of the stack) all of them have been processed.  In this
+    # case, the action is to call the appropriate handler (token_handler() or
+    # production_handlers[node.production]()) with the node and its transformed
+    # children, then store the result in the node's parent's
+    # transformed_children list.
+    #
+    # As an example, if we have:
+    #
+    #      A
+    #     / \
+    #    B   C
+    #       / \
+    #      D   E
+    #
+    # Then the steps are:
+    #
+    # Initialize:
+    #  1. Push (A, children_completed=False)
+    #
+    # Start handling A:
+    #  2. Pop (A, children_completed=False)
+    #  3. Push (A, children_completed=True)
+    #  4. Push (B, children_completed=False)
+    #  5. Push (C, children_completed=False)
+    #
+    # Start handling C:
+    #  6. Pop (C, children_completed=False)
+    #  7. Push (C, children_completed=True)
+    #  8. Push (D, children_completed=False)
+    #  9. Push (E, children_completed=False)
+    #
+    # Start handling E:
+    # 10. Pop (E, children_completed=False)
+    # 11. Push (E, children_completed=True)
+    #
+    # Finish handling E:
+    # 12. Pop (E, children_completed=True)
+    # 13. Insert token_handler(E) into C.transformed_children
+    #
+    # Start handling D:
+    # 14. Pop (D, children_completed=False)
+    # 15. Push (D, children_completed=True)
+    #
+    # Finish handling D:
+    # 16. Pop (D, children_completed=True)
+    # 17. Insert token_handler(D) into C.transformed_children
+    #
+    # Finish handling C:
+    # 18. Pop (C, children_completed=True)
+    # 19. Insert production_handlers[C.production](C, *C.transformed_children)
+    #     into A.transformed_children
+    #
+    # Start handling B:
+    # 20. Pop (B, children_completed=False)
+    # 21. Push (B, children_completed=True)
+    #
+    # Finish handling B:
+    # 22. Pop (B, children_completed=True)
+    # 23. Insert token_handler(B) into A.transformed_children
+    #
+    # Finish handling A:
+    # 24. Pop (A, children_completed=True)
+    # 25. Return production_handlers[A.production](A, *A.transformed_children)
+    #
+    # It takes quite a few steps to handle even a small tree!
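+    #
+    # Each stack entry is a 4-tuple: (node, the parent's transformed_children
+    # list that this node's result should be inserted into (None for the root
+    # node), children_completed, this node's own transformed_children list).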
+    stack = [(parse_tree, None, False, None)]
+    while True:
+        node, parent, children_completed, transformed_children = stack.pop()
+        if not children_completed:
+            parent_entry = []
+            stack.append((node, parent, True, parent_entry))
+            if hasattr(node, "children"):
+                for child in node.children:
+                    stack.append((child, parent_entry, False, None))
+                if used_productions is not None:
+                    used_productions.add(node.production)
+        else:
+            if isinstance(node, parser_types.Token):
+                transformed_node = token_handler(node)
+            else:
+                transformed_node = production_handlers[node.production](
+                    *([node] + transformed_children)
+                )
+            if parent is None:
+                return transformed_node
+            else:
+                parent.insert(0, transformed_node)

From e158cdca66e331c7c15cf2c1d369b966b8b05e4c Mon Sep 17 00:00:00 2001
From: Ben Olmstead
Date: Thu, 19 Dec 2024 21:58:57 +0000
Subject: [PATCH 2/3] Output test logs on failure.

---
 .github/workflows/verify-pull-request.yml | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/.github/workflows/verify-pull-request.yml b/.github/workflows/verify-pull-request.yml
index ccaedad..9c57be7 100644
--- a/.github/workflows/verify-pull-request.yml
+++ b/.github/workflows/verify-pull-request.yml
@@ -17,7 +17,7 @@ jobs:
           disk-cache: "verify-pr:run-bazel-tests:${{ matrix.cpp-compiler }}"
           repository-cache: true
       - run: echo "CC=${{ matrix.cpp-compiler }}" >> $GITHUB_ENV
-      - run: bazel test ${{ matrix.options }} ...
+      - run: bazel test --test_output=errors ${{ matrix.options }} ...
   check-formatting:
     name: "Check Python formatting"
     runs-on: ubuntu-latest

From 57d5ce197b0485d249aec4886e5083f2dfd86be1 Mon Sep 17 00:00:00 2001
From: Ben Olmstead
Date: Thu, 19 Dec 2024 23:04:41 +0000
Subject: [PATCH 3/3] Backport test fix.

---
 compiler/back_end/cpp/testcode/auto_array_size_test.cc | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/compiler/back_end/cpp/testcode/auto_array_size_test.cc b/compiler/back_end/cpp/testcode/auto_array_size_test.cc
index eaae8ec..e3456e2 100644
--- a/compiler/back_end/cpp/testcode/auto_array_size_test.cc
+++ b/compiler/back_end/cpp/testcode/auto_array_size_test.cc
@@ -306,7 +306,7 @@ TEST(AutoSizeView, CanCopyFrom) {
   auto source = MakeAlignedAutoSizeView(
       kAutoSize, sizeof kAutoSize);
-  ::std::array buf = {0};
+  alignas(8) ::std::array buf = {0};
   auto dest = MakeAlignedAutoSizeView(buf.data(), buf.size());
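
A usage illustration (an editorial sketch, not part of the patches above): the
new `parser_util.transform_parse_tree` can drive any bottom-up fold over a
parse tree without recursion.  The `_AnyProduction` helper and node-counting
handlers here are hypothetical, shown only to illustrate the calling
convention: the token handler receives each leaf token, and each production
handler receives the node followed by its already-transformed children.

    from compiler.front_end import parser, tokenizer
    from compiler.util import parser_util

    class _AnyProduction(dict):
        # Hypothetical helper: route every production to one generic handler.
        def __missing__(self, key):
            return lambda node, *children: 1 + sum(children)

    tokens, errors = tokenizer.tokenize("enum Test:\n  VALUE = 1\n", "demo.emb")
    tree = parser.parse_module(tokens).parse_tree
    # Counts every node in the tree, leaves included, without recursing.
    node_count = parser_util.transform_parse_tree(
        tree,
        lambda token: 1,  # each leaf token counts as one node
        _AnyProduction(),
    )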