From e95f4c63b0db430fe3580de419a77d9fd5b9683f Mon Sep 17 00:00:00 2001
From: Ben Olmstead
Date: Thu, 19 Dec 2024 21:07:34 +0000
Subject: [PATCH 1/3] Handle very large `.emb` files.

This change switches parse tree handling to use iteration (with an
explicit stack) instead of recursion, which:

*   Allows large (>~1000 entity) `.emb` files to be formatted.
*   Allows very large (>~16k entity) `.emb` files to be compiled.

The difference in sizes in the previous code was due to `module_ir.py`
hackily increasing the recursion limit: while this more or less worked,
it was a little dangerous (it ran the risk of blowing out the C stack,
depending on platform) and only increased the limit.  This change
removes the limit entirely (at least, up to the available memory on the
system).
---
 compiler/front_end/BUILD              |   2 +
 compiler/front_end/format_emb.py      |  24 ++--
 compiler/front_end/format_emb_test.py |  16 ++-
 compiler/front_end/module_ir.py       | 151 ++++++++++++--------------
 compiler/front_end/module_ir_test.py  |  13 +++
 compiler/util/BUILD                   |   8 ++
 compiler/util/parser_util.py          | 138 +++++++++++++++++++++++
 7 files changed, 259 insertions(+), 93 deletions(-)
 create mode 100644 compiler/util/parser_util.py

diff --git a/compiler/front_end/BUILD b/compiler/front_end/BUILD
index efcec3e..5128a77 100644
--- a/compiler/front_end/BUILD
+++ b/compiler/front_end/BUILD
@@ -73,6 +73,7 @@ py_library(
         "//compiler/util:ir_data",
         "//compiler/util:name_conversion",
         "//compiler/util:parser_types",
+        "//compiler/util:parser_util",
     ],
 )
 
@@ -436,6 +437,7 @@ py_library(
         ":module_ir",
         ":tokenizer",
         "//compiler/util:parser_types",
+        "//compiler/util:parser_util",
     ],
 )
 
diff --git a/compiler/front_end/format_emb.py b/compiler/front_end/format_emb.py
index fc7bf94..f0b0137 100644
--- a/compiler/front_end/format_emb.py
+++ b/compiler/front_end/format_emb.py
@@ -26,6 +26,7 @@
 from compiler.front_end import module_ir
 from compiler.front_end import tokenizer
 from compiler.util import parser_types
+from compiler.util import parser_util
 
 
 class Config(collections.namedtuple("Config", ["indent_width", "show_line_types"])):
@@ -67,18 +68,17 @@ def format_emboss_parse_tree(parse_tree, config, used_productions=None):
     Returns:
       A string of the reformatted source text.
     """
-    if hasattr(parse_tree, "children"):
-        parsed_children = [
-            format_emboss_parse_tree(child, config, used_productions)
-            for child in parse_tree.children
-        ]
-        args = parsed_children + [config]
-        if used_productions is not None:
-            used_productions.add(parse_tree.production)
-        return _formatters[parse_tree.production](*args)
-    else:
-        assert isinstance(parse_tree, parser_types.Token), str(parse_tree)
-        return parse_tree.text
+    formatters = {}
+    for production, handler in _formatters.items():
+        # An extra layer of indirection is required here so that the resulting
+        # lambda does not capture the local variable `handler`.
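+        # (Python closures capture variables, not values: without this extra
+        # wrapper, every formatter would end up calling the `handler` bound in
+        # the final loop iteration.)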
+        def wrapped_handler(handler):
+            return lambda _, *args: handler(*(args + (config,)))
+
+        formatters[production] = wrapped_handler(handler)
+    return parser_util.transform_parse_tree(
+        parse_tree, lambda n: n.text, formatters, used_productions
+    )
 
 
 def sanity_check_format_result(formatted_text, original_text):
diff --git a/compiler/front_end/format_emb_test.py b/compiler/front_end/format_emb_test.py
index 3d5331e..42d8a26 100644
--- a/compiler/front_end/format_emb_test.py
+++ b/compiler/front_end/format_emb_test.py
@@ -92,7 +92,21 @@ def test_eol_missing(self):
 
 
 class FormatEmbTest(unittest.TestCase):
-    pass
+
+    def test_very_long_emb(self):
+        """Checks that very long inputs do not hit the Python recursion limit."""
+        emb = ["enum Test:\n"]
+        # Enough entities to blow through the default recursion limit and the
+        # bumped limit that was previously in place.
+        for i in range(max(sys.getrecursionlimit(), 16 * 1024) * 2):
+            emb.append(f"  VALUE_{i} = {i}\n")
+        parsed_unformatted = parser.parse_module(
+            tokenizer.tokenize("".join(emb), "long.emb")[0]
+        )
+        formatted_text = format_emb.format_emboss_parse_tree(
+            parsed_unformatted.parse_tree,
+            format_emb.Config(indent_width=2),
+        )
 
 
 def _make_golden_file_tests():
diff --git a/compiler/front_end/module_ir.py b/compiler/front_end/module_ir.py
index 145bba3..633c1b8 100644
--- a/compiler/front_end/module_ir.py
+++ b/compiler/front_end/module_ir.py
@@ -29,6 +29,7 @@
 from compiler.util import ir_data_utils
 from compiler.util import name_conversion
 from compiler.util import parser_types
+from compiler.util import parser_util
 
 
 # Intermediate types; should not be found in the final IR.
@@ -82,88 +83,78 @@ def __init__(self, field, subtypes=None):
 def build_ir(parse_tree, used_productions=None):
     r"""Builds a module-level intermediate representation from a valid parse tree.
 
-  The parse tree is precisely dictated by the exact productions in the grammar
-  used by the parser, with no semantic information. _really_build_ir transforms
-  this "raw" form into a stable, cooked representation, thereby isolating
-  subsequent steps from the exact details of the grammar.
-
-  (Probably incomplete) list of transformations:
-
-  * ParseResult and Token nodes are replaced with Module, Attribute, Struct,
-    Type, etc. objects.
-
-  * Purely syntactic tokens ('"["', '"struct"', etc.) are discarded.
-
-  * Repeated elements are transformed from tree form to list form:
-
-        a*
-       / \
-      b   a*
+    The parse tree is precisely dictated by the exact productions in the grammar
+    used by the parser, with no semantic information.  build_ir transforms
+    this "raw" form into a stable, cooked representation, thereby isolating
+    subsequent steps from the exact details of the grammar.
+
+    (Probably incomplete) list of transformations:
+
+    * ParseResult and Token nodes are replaced with Module, Attribute, Struct,
+      Type, etc. objects.
+
+    * Purely syntactic tokens ('"["', '"struct"', etc.) are discarded.
+
+    * Repeated elements are transformed from tree form to list form:
+
+          a*
          / \
-        c   a*
+        b   a*
           / \
-          d   a*
+          c   a*
+             / \
+            d   a*
-
-  (where b, c, and d are nodes of type "a") becomes [b, c, d].
-
-  * The values of numeric constants (Number, etc. tokens) are parsed.
-
-  * Different classes of names (snake_names, CamelNames, ShoutyNames) are
-    folded into a single "Name" type, since they are guaranteed to appear in
-    the correct places in the parse tree.
-
-
-  Arguments:
-    parse_tree: A parse tree.  Each leaf node should be a parser_types.Token
-      object, and each non-leaf node should have a 'symbol' attribute specifying
-      which grammar symbol it represents, and a 'children' attribute containing
-      a list of child nodes.  This is the format returned by the parsers
-      produced by the lr1 module, when run against tokens from the tokenizer
-      module.
-    used_productions: If specified, used_productions.add() will be called with
-      each production actually used in parsing.  This can be useful when
-      developing the grammar and writing tests; in particular, it can be used to
-      figure out which productions are *not* used when parsing a particular
-      file.
-
-  Returns:
-    A module-level intermediate representation (module IR) for an Emboss module
-    (source file).  This IR will not have symbols resolved; that must be done on
-    a forest of module IRs so that names from other modules can be resolved.
-  """
-
-    # TODO(b/140259131): Refactor _really_build_ir to be less recursive/use an
-    # explicit stack.
-    old_recursion_limit = sys.getrecursionlimit()
-    sys.setrecursionlimit(16 * 1024)  # ~8000 top-level entities in one module.
-    try:
-        result = _really_build_ir(parse_tree, used_productions)
-    finally:
-        sys.setrecursionlimit(old_recursion_limit)
-    return result
-
-
-def _really_build_ir(parse_tree, used_productions):
-    """Real implementation of build_ir()."""
-    if used_productions is None:
-        used_productions = set()
-    if hasattr(parse_tree, "children"):
-        parsed_children = [
-            _really_build_ir(child, used_productions) for child in parse_tree.children
-        ]
-        used_productions.add(parse_tree.production)
-        result = _handlers[parse_tree.production](*parsed_children)
-        if parse_tree.source_location:
-            if isinstance(result, tuple):
-                result = result._replace(source_location=parse_tree.source_location)
-            else:
-                result.source_location = parse_tree.source_location
-        return result
-    else:
-        # For leaf nodes, the temporary "IR" is just the token.  Higher-level rules
-        # will translate it to a real IR.
-        assert isinstance(parse_tree, parser_types.Token), str(parse_tree)
-        return parse_tree
+          c   a*
+             / \
+            d   a*
+
+    (where b, c, and d are nodes of type "a") becomes [b, c, d].
+
+    * The values of numeric constants (Number, etc. tokens) are parsed.
+
+    * Different classes of names (snake_names, CamelNames, ShoutyNames) are
+      folded into a single "Name" type, since they are guaranteed to appear in
+      the correct places in the parse tree.
+
+
+    Arguments:
+        parse_tree: A parse tree.  Each leaf node should be a parser_types.Token
+            object, and each non-leaf node should have a 'symbol' attribute
+            specifying which grammar symbol it represents, and a 'children'
+            attribute containing a list of child nodes.  This is the format
+            returned by the parsers produced by the lr1 module, when run
+            against tokens from the tokenizer module.
+        used_productions: If specified, used_productions.add() will be called
+            with each production actually used in parsing.  This can be useful
+            when developing the grammar and writing tests; in particular, it
+            can be used to figure out which productions are *not* used when
+            parsing a particular file.
+
+    Returns:
+        A module-level intermediate representation (module IR) for an Emboss
+        module (source file).  This IR will not have symbols resolved,
+        constraints checked, fields synthesized, etc.; it will only be a
+        representation of the syntactic elements of the source.
+ """ + handlers = {} + for production, handler in _handlers.items(): + # An extra layer of indirection is required here so that the resulting + # lambda does not capture the local variable `handler`. + def wrapped_handler(handler): + def wrapped_handler(node, *args): + module_node = handler(*args) + if node.source_location: + if isinstance(module_node, tuple): + module_node = module_node._replace( + source_location=node.source_location + ) + else: + module_node.source_location = node.source_location + return module_node + + return wrapped_handler + + handlers[production] = wrapped_handler(handler) + return parser_util.transform_parse_tree( + parse_tree, lambda n: n, handlers, used_productions + ) # Map of productions to their handlers. diff --git a/compiler/front_end/module_ir_test.py b/compiler/front_end/module_ir_test.py index 1931bab..fa8db9c 100644 --- a/compiler/front_end/module_ir_test.py +++ b/compiler/front_end/module_ir_test.py @@ -18,6 +18,7 @@ import collections import pkgutil +import sys import unittest from compiler.front_end import module_ir @@ -4057,6 +4058,18 @@ def test_double_negative_non_compilation(self): parse_result = parser.parse_module(tokenizer.tokenize(example, "")[0]) self.assertFalse(parse_result.error) + def test_long_input(self): + """Checks that very long inputs do not hit the Python recursion limit.""" + emb = ["enum Test:\n"] + # Enough entities to blow through the default recursion limit and the + # bumped limit that was previously in place. + for i in range(max(sys.getrecursionlimit(), 16 * 1024) * 2): + emb.append(f" VALUE_{i} = {i}\n") + parse_result = parser.parse_module( + tokenizer.tokenize("".join(emb), "long.emb")[0] + ) + module_ir.build_ir(parse_result.parse_tree) + def _make_superset_tests(): diff --git a/compiler/util/BUILD b/compiler/util/BUILD index b0a29a3..4a444fd 100644 --- a/compiler/util/BUILD +++ b/compiler/util/BUILD @@ -154,6 +154,14 @@ py_test( ], ) +py_library( + name = "parser_util", + srcs = ["parser_util.py"], + deps = [ + ":parser_types", + ], +) + py_library( name = "error", srcs = [ diff --git a/compiler/util/parser_util.py b/compiler/util/parser_util.py new file mode 100644 index 0000000..c95f70a --- /dev/null +++ b/compiler/util/parser_util.py @@ -0,0 +1,138 @@ +# Copyright 2024 Google LLC +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# https://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +"""Utilities for working with parse trees.""" + +from compiler.util import parser_types + + +def transform_parse_tree( + parse_tree, token_handler, production_handlers, used_productions=None +): + """Walks the provided parse_tree, calling handlers for each node. + + This function uses provided handlers to transform a parse tree into a new + structure. From the bottom up, calls a handler on each node, passing the + node itself and the results of calling handlers on each child of the node + (if any). + + This function avoids recursion, so it is suitable for very deep parse + trees. + + Arguments: + parse_tree: the tree to process + token_handler: the handler to use for Token nodes. 
+        production_handlers: a map from productions to handlers for those
+            productions.
+        used_productions: an optional set; all encountered productions will be
+            added to used_productions.
+
+    Returns:
+        The result of the production_handler for the top-level parse_tree node.
+    """
+    # The stack of entries to process.  Each entry is in one of two states,
+    # depending on the 3rd element (children_completed):
+    #
+    # If False, the node's children have not been added, and the action is to
+    # push the same node with children_completed=True, then push an entry for
+    # each of the node's children with children_completed=False.
+    #
+    # If True, the node's children have been added and (by the time the entry
+    # is back on top of the stack) all of them have been processed.  In this
+    # case, the action is to call the appropriate handler (token_handler() or
+    # production_handlers[node.production]()) with the node and its transformed
+    # children, then store the result in the node's parent's
+    # transformed_children list.
+    #
+    # As an example, if we have:
+    #
+    #      A
+    #     / \
+    #    B   C
+    #       / \
+    #      D   E
+    #
+    # Then the steps are:
+    #
+    # Initialize:
+    #  1. Push (A, children_completed=False)
+    #
+    # Start handling A:
+    #  2. Pop (A, children_completed=False)
+    #  3. Push (A, children_completed=True)
+    #  4. Push (B, children_completed=False)
+    #  5. Push (C, children_completed=False)
+    #
+    # Start handling C:
+    #  6. Pop (C, children_completed=False)
+    #  7. Push (C, children_completed=True)
+    #  8. Push (D, children_completed=False)
+    #  9. Push (E, children_completed=False)
+    #
+    # Start handling E:
+    # 10. Pop (E, children_completed=False)
+    # 11. Push (E, children_completed=True)
+    #
+    # Finish handling E:
+    # 12. Pop (E, children_completed=True)
+    # 13. Insert token_handler(E) into C.transformed_children
+    #
+    # Start handling D:
+    # 14. Pop (D, children_completed=False)
+    # 15. Push (D, children_completed=True)
+    #
+    # Finish handling D:
+    # 16. Pop (D, children_completed=True)
+    # 17. Insert token_handler(D) into C.transformed_children
+    #
+    # Finish handling C:
+    # 18. Pop (C, children_completed=True)
+    # 19. Insert production_handlers[C.production](C, *C.transformed_children)
+    #     into A.transformed_children
+    #
+    # Start handling B:
+    # 20. Pop (B, children_completed=False)
+    # 21. Push (B, children_completed=True)
+    #
+    # Finish handling B:
+    # 22. Pop (B, children_completed=True)
+    # 23. Insert token_handler(B) into A.transformed_children
+    #
+    # Finish handling A:
+    # 24. Pop (A, children_completed=True)
+    # 25. Return production_handlers[A.production](A, *A.transformed_children)
+    #
+    # It takes quite a few steps to handle even a small tree!
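+    #
+    # Each stack entry is a 4-tuple: (node, the parent's transformed_children
+    # list that this node's result should be inserted into (None for the root
+    # node), children_completed, this node's own transformed_children list).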
+    stack = [(parse_tree, None, False, None)]
+    while True:
+        node, parent, children_completed, transformed_children = stack.pop()
+        if not children_completed:
+            parent_entry = []
+            stack.append((node, parent, True, parent_entry))
+            if hasattr(node, "children"):
+                for child in node.children:
+                    stack.append((child, parent_entry, False, None))
+                if used_productions is not None:
+                    used_productions.add(node.production)
+        else:
+            if isinstance(node, parser_types.Token):
+                transformed_node = token_handler(node)
+            else:
+                transformed_node = production_handlers[node.production](
+                    *([node] + transformed_children)
+                )
+            if parent is None:
+                return transformed_node
+            else:
+                parent.insert(0, transformed_node)

From e158cdca66e331c7c15cf2c1d369b966b8b05e4c Mon Sep 17 00:00:00 2001
From: Ben Olmstead
Date: Thu, 19 Dec 2024 21:58:57 +0000
Subject: [PATCH 2/3] Output test logs on failure.

---
 .github/workflows/verify-pull-request.yml | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/.github/workflows/verify-pull-request.yml b/.github/workflows/verify-pull-request.yml
index ccaedad..9c57be7 100644
--- a/.github/workflows/verify-pull-request.yml
+++ b/.github/workflows/verify-pull-request.yml
@@ -17,7 +17,7 @@ jobs:
           disk-cache: "verify-pr:run-bazel-tests:${{ matrix.cpp-compiler }}"
           repository-cache: true
       - run: echo "CC=${{ matrix.cpp-compiler }}" >> $GITHUB_ENV
-      - run: bazel test ${{ matrix.options }} ...
+      - run: bazel test --test_output=errors ${{ matrix.options }} ...
   check-formatting:
     name: "Check Python formatting"
     runs-on: ubuntu-latest

From 57d5ce197b0485d249aec4886e5083f2dfd86be1 Mon Sep 17 00:00:00 2001
From: Ben Olmstead
Date: Thu, 19 Dec 2024 23:04:41 +0000
Subject: [PATCH 3/3] Backport test fix.

---
 compiler/back_end/cpp/testcode/auto_array_size_test.cc | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/compiler/back_end/cpp/testcode/auto_array_size_test.cc b/compiler/back_end/cpp/testcode/auto_array_size_test.cc
index eaae8ec..e3456e2 100644
--- a/compiler/back_end/cpp/testcode/auto_array_size_test.cc
+++ b/compiler/back_end/cpp/testcode/auto_array_size_test.cc
@@ -306,7 +306,7 @@ TEST(AutoSizeView, CanCopyFrom) {
   auto source = MakeAlignedAutoSizeView(
       kAutoSize, sizeof kAutoSize);
-  ::std::array buf = {0};
+  alignas(8) ::std::array buf = {0};
   auto dest = MakeAlignedAutoSizeView(buf.data(), buf.size());
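
A usage illustration (an editorial sketch, not part of the patches above): the
new `parser_util.transform_parse_tree` can drive any bottom-up fold over a
parse tree without recursion.  The `_AnyProduction` helper and node-counting
handlers here are hypothetical, shown only to illustrate the calling
convention: the token handler receives each leaf token, and each production
handler receives the node followed by its already-transformed children.

    from compiler.front_end import parser, tokenizer
    from compiler.util import parser_util

    class _AnyProduction(dict):
        # Hypothetical helper: route every production to one generic handler.
        def __missing__(self, key):
            return lambda node, *children: 1 + sum(children)

    tokens, errors = tokenizer.tokenize("enum Test:\n  VALUE = 1\n", "demo.emb")
    tree = parser.parse_module(tokens).parse_tree
    # Counts every node in the tree, leaves included, without recursing.
    node_count = parser_util.transform_parse_tree(
        tree,
        lambda token: 1,  # each leaf token counts as one node
        _AnyProduction(),
    )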