Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Handle very large .emb files. #215

Merged
merged 4 commits into from
Dec 21, 2024
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
2 changes: 2 additions & 0 deletions compiler/front_end/BUILD
Original file line number Diff line number Diff line change
Expand Up @@ -73,6 +73,7 @@ py_library(
"//compiler/util:ir_data",
"//compiler/util:name_conversion",
"//compiler/util:parser_types",
"//compiler/util:parser_util",
],
)

Expand Down Expand Up @@ -436,6 +437,7 @@ py_library(
":module_ir",
":tokenizer",
"//compiler/util:parser_types",
"//compiler/util:parser_util",
],
)

Expand Down
24 changes: 12 additions & 12 deletions compiler/front_end/format_emb.py
Original file line number Diff line number Diff line change
Expand Up @@ -26,6 +26,7 @@
from compiler.front_end import module_ir
from compiler.front_end import tokenizer
from compiler.util import parser_types
from compiler.util import parser_util


class Config(collections.namedtuple("Config", ["indent_width", "show_line_types"])):
Expand Down Expand Up @@ -67,18 +68,17 @@ def format_emboss_parse_tree(parse_tree, config, used_productions=None):
Returns:
A string of the reformatted source text.
"""
if hasattr(parse_tree, "children"):
parsed_children = [
format_emboss_parse_tree(child, config, used_productions)
for child in parse_tree.children
]
args = parsed_children + [config]
if used_productions is not None:
used_productions.add(parse_tree.production)
return _formatters[parse_tree.production](*args)
else:
assert isinstance(parse_tree, parser_types.Token), str(parse_tree)
return parse_tree.text
formatters = {}
for production, handler in _formatters.items():
# An extra layer of indirection is required here so that the resulting
# lambda does not capture the local variable `handler`.
def wrapped_handler(handler):
return lambda _, *args: handler(*(args + (config,)))

formatters[production] = wrapped_handler(handler)
return parser_util.transform_parse_tree(
parse_tree, lambda n: n.text, formatters, used_productions
)


def sanity_check_format_result(formatted_text, original_text):
Expand Down
16 changes: 15 additions & 1 deletion compiler/front_end/format_emb_test.py
Original file line number Diff line number Diff line change
Expand Up @@ -92,7 +92,21 @@ def test_eol_missing(self):


class FormatEmbTest(unittest.TestCase):
pass

def test_very_long_emb(self):
    """Checks that very long inputs do not hit the Python recursion limit.

    Regression test: formatting previously recursed once per parse-tree
    node, so a module with more entities than the recursion limit crashed.
    """
    emb = ["enum Test:\n"]
    # Enough entities to blow through the default recursion limit and the
    # bumped limit that was previously in place.
    for i in range(max(sys.getrecursionlimit(), 16 * 1024) * 2):
        emb.append(f" VALUE_{i} = {i}\n")
    parsed_unformatted = parser.parse_module(
        tokenizer.tokenize("".join(emb), "long.emb")[0]
    )
    formatted_text = format_emb.format_emboss_parse_tree(
        parsed_unformatted.parse_tree,
        format_emb.Config(indent_width=2),
    )
    # The interesting property is that formatting completed at all, but
    # also sanity-check that it produced a non-empty result (this also
    # removes the unused-variable lint on `formatted_text`).
    self.assertTrue(formatted_text)


def _make_golden_file_tests():
Expand Down
151 changes: 71 additions & 80 deletions compiler/front_end/module_ir.py
Original file line number Diff line number Diff line change
Expand Up @@ -29,6 +29,7 @@
from compiler.util import ir_data_utils
from compiler.util import name_conversion
from compiler.util import parser_types
from compiler.util import parser_util


# Intermediate types; should not be found in the final IR.
Expand Down Expand Up @@ -82,88 +83,78 @@ def __init__(self, field, subtypes=None):
def build_ir(parse_tree, used_productions=None):
r"""Builds a module-level intermediate representation from a valid parse tree.

The parse tree is precisely dictated by the exact productions in the grammar
used by the parser, with no semantic information. _really_build_ir transforms
this "raw" form into a stable, cooked representation, thereby isolating
subsequent steps from the exact details of the grammar.

(Probably incomplete) list of transformations:

* ParseResult and Token nodes are replaced with Module, Attribute, Struct,
Type, etc. objects.

* Purely syntactic tokens ('"["', '"struct"', etc.) are discarded.

* Repeated elements are transformed from tree form to list form:

a*
/ \
b a*
The parse tree is precisely dictated by the exact productions in the grammar
used by the parser, with no semantic information. _really_build_ir
transforms this "raw" form into a stable, cooked representation, thereby
isolating subsequent steps from the exact details of the grammar.

(Probably incomplete) list of transformations:

* ParseResult and Token nodes are replaced with Module, Attribute, Struct,
Type, etc. objects.

* Purely syntactic tokens ('"["', '"struct"', etc.) are discarded.

* Repeated elements are transformed from tree form to list form:

a*
/ \
c a*
b a*
/ \
d a*

(where b, c, and d are nodes of type "a") becomes [b, c, d].

* The values of numeric constants (Number, etc. tokens) are parsed.

* Different classes of names (snake_names, CamelNames, ShoutyNames) are
folded into a single "Name" type, since they are guaranteed to appear in
the correct places in the parse tree.


Arguments:
parse_tree: A parse tree. Each leaf node should be a parser_types.Token
object, and each non-leaf node should have a 'symbol' attribute specifying
which grammar symbol it represents, and a 'children' attribute containing
a list of child nodes. This is the format returned by the parsers
produced by the lr1 module, when run against tokens from the tokenizer
module.
used_productions: If specified, used_productions.add() will be called with
each production actually used in parsing. This can be useful when
developing the grammar and writing tests; in particular, it can be used to
figure out which productions are *not* used when parsing a particular
file.

Returns:
A module-level intermediate representation (module IR) for an Emboss module
(source file). This IR will not have symbols resolved; that must be done on
a forest of module IRs so that names from other modules can be resolved.
"""

# TODO(b/140259131): Refactor _really_build_ir to be less recursive/use an
# explicit stack.
old_recursion_limit = sys.getrecursionlimit()
sys.setrecursionlimit(16 * 1024) # ~8000 top-level entities in one module.
try:
result = _really_build_ir(parse_tree, used_productions)
finally:
sys.setrecursionlimit(old_recursion_limit)
return result


def _really_build_ir(parse_tree, used_productions):
"""Real implementation of build_ir()."""
if used_productions is None:
used_productions = set()
if hasattr(parse_tree, "children"):
parsed_children = [
_really_build_ir(child, used_productions) for child in parse_tree.children
]
used_productions.add(parse_tree.production)
result = _handlers[parse_tree.production](*parsed_children)
if parse_tree.source_location:
if isinstance(result, tuple):
result = result._replace(source_location=parse_tree.source_location)
else:
result.source_location = parse_tree.source_location
return result
else:
# For leaf nodes, the temporary "IR" is just the token. Higher-level rules
# will translate it to a real IR.
assert isinstance(parse_tree, parser_types.Token), str(parse_tree)
return parse_tree
c a*
/ \
d a*

(where b, c, and d are nodes of type "a") becomes [b, c, d].

* The values of numeric constants (Number, etc. tokens) are parsed.

* Different classes of names (snake_names, CamelNames, ShoutyNames) are
folded into a single "Name" type, since they are guaranteed to appear in
the correct places in the parse tree.


Arguments:
parse_tree: A parse tree. Each leaf node should be a parser_types.Token
object, and each non-leaf node should have a 'symbol' attribute
specifying which grammar symbol it represents, and a 'children'
attribute containing a list of child nodes. This is the format
returned by the parsers produced by the lr1 module, when run
against tokens from the tokenizer module.
used_productions: If specified, used_productions.add() will be called
with each production actually used in parsing. This can be useful
when developing the grammar and writing tests; in particular, it
can be used to figure out which productions are *not* used when
parsing a particular file.

Returns:
A module-level intermediate representation (module IR) for an Emboss
module (source file). This IR will not have symbols resolved,
constraints checked, fields synthesized, etc.; it will only be a
representation of the syntactic elements of the source.
"""
handlers = {}
for production, handler in _handlers.items():
# An extra layer of indirection is required here so that the resulting
# lambda does not capture the local variable `handler`.
def wrapped_handler(handler):
def wrapped_handler(node, *args):
module_node = handler(*args)
if node.source_location:
if isinstance(module_node, tuple):
module_node = module_node._replace(
source_location=node.source_location
)
else:
module_node.source_location = node.source_location
return module_node

return wrapped_handler

handlers[production] = wrapped_handler(handler)
return parser_util.transform_parse_tree(
parse_tree, lambda n: n, handlers, used_productions
)


# Map of productions to their handlers.
Expand Down
13 changes: 13 additions & 0 deletions compiler/front_end/module_ir_test.py
Original file line number Diff line number Diff line change
Expand Up @@ -18,6 +18,7 @@

import collections
import pkgutil
import sys
import unittest

from compiler.front_end import module_ir
Expand Down Expand Up @@ -4057,6 +4058,18 @@ def test_double_negative_non_compilation(self):
parse_result = parser.parse_module(tokenizer.tokenize(example, "")[0])
self.assertFalse(parse_result.error)

def test_long_input(self):
    """Checks that very long inputs do not hit the Python recursion limit.

    Regression test: build_ir previously recursed once per parse-tree
    node, so a module with more entities than the recursion limit crashed.
    """
    emb = ["enum Test:\n"]
    # Enough entities to blow through the default recursion limit and the
    # bumped limit that was previously in place.
    for i in range(max(sys.getrecursionlimit(), 16 * 1024) * 2):
        emb.append(f" VALUE_{i} = {i}\n")
    parse_result = parser.parse_module(
        tokenizer.tokenize("".join(emb), "long.emb")[0]
    )
    # Fail here with a clear message rather than with a confusing error
    # inside build_ir if the parse itself did not succeed (matches the
    # pattern used by the other tests in this file).
    self.assertFalse(parse_result.error)
    module_ir.build_ir(parse_result.parse_tree)


def _make_superset_tests():

Expand Down
8 changes: 8 additions & 0 deletions compiler/util/BUILD
Original file line number Diff line number Diff line change
Expand Up @@ -154,6 +154,14 @@ py_test(
],
)

# Parse-tree transformation helpers (parser_util.transform_parse_tree),
# shared by the front-end formatter and IR builder.
py_library(
    name = "parser_util",
    srcs = ["parser_util.py"],
    deps = [
        ":parser_types",
    ],
)

py_library(
name = "error",
srcs = [
Expand Down
Loading
Loading