semantic_parsing_with_constrained_lm_changes.patch

diff --git a/src/semantic_parsing_with_constrained_lm/README.md b/src/semantic_parsing_with_constrained_lm/README.md
new file mode 100644
index 0000000..0afb506
--- /dev/null
+++ b/src/semantic_parsing_with_constrained_lm/README.md
@@ -0,0 +1,8 @@
+# Semantic Parsing with Constrained Language Models
+
+This is a modified version of [microsoft](https://github.com/microsoft)/**[semantic_parsing_with_constrained_lm](https://github.com/microsoft/semantic_parsing_with_constrained_lm)**, [MIT license](https://github.com/microsoft/semantic_parsing_with_constrained_lm/blob/main/LICENSE).
+
+Modifications:
+
+- Implemented `semantic_parsing_with_constrained_lm.finetune.platypus`, see https://github.com/microsoft/semantic_parsing_with_constrained_lm/issues/13
+- Supported SMCalFlow V1 and Simplified syntax.
diff --git a/src/semantic_parsing_with_constrained_lm/configs/benchclamp_config.py b/src/semantic_parsing_with_constrained_lm/configs/benchclamp_config.py
index c81ac78..bd238ee 100644
--- a/src/semantic_parsing_with_constrained_lm/configs/benchclamp_config.py
+++ b/src/semantic_parsing_with_constrained_lm/configs/benchclamp_config.py
@@ -61,7 +61,7 @@ HUGGINGFACE_MODEL_DIR = (
     else Path("huggingface_models/")
 )
 TRAINED_MODEL_DIR = (
-    Path("/mnt/my_output/trained_models/") if RUN_ON_AML else Path("trained_models/")
+    Path("/mnt/my_output/trained_models/") if RUN_ON_AML else Path("data/models/trained_models/")
 )
 LOG_DIR = Path("/mnt/my_output/logs/") if RUN_ON_AML else Path("logs/")
 VERSION = "1.0"
@@ -131,6 +131,8 @@ BATCH_SIZE_PER_DEVICE_OVERRIDES: Dict[str, int] = {
     for lr in ["0.0001"]
     for split_id in ["low_0", "low_1", "low_2", "medium_0", "all"]
 }
+BATCH_SIZE_PER_DEVICE_OVERRIDES['t5-large-lm-adapt_calflow_last_user_all_0.0001'] = 4
+BATCH_SIZE_PER_DEVICE_OVERRIDES['bart-large_calflow_last_user_all_0.0001'] = 4
 
 
 def create_train_exp(
@@ -192,10 +194,16 @@ def create_eval_exp(
     data_config: ClampDataConfig,
     problem_type: Literal["constrained", "unconstrained-beam", "unconstrained-greedy"],
     is_dev: bool,
+    load_data=True,
+    max_steps_slope_intercept=None,
+    **kwargs,
 ) -> Experiment:
     model, tokenizer, _ = model_config.setup_model()
     data_config.tokenizer = tokenizer
-    train_data, dev_data, test_data = data_config.setup_data()
+    if load_data:
+        train_data, dev_data, test_data = data_config.setup_data()
+    else:
+        train_data, dev_data, test_data = [], [], []
     is_encoder_decoder = not isinstance(model_config, GPT2ModelConfig)
 
     lm: Seq2SeqModel
@@ -278,17 +286,20 @@ def create_eval_exp(
 
         else:
             # Everything other than Overnight in BenchClamp
-            train_length_pairs = []
-            for datum in train_data:
-                num_input_tokens = len(tokenizer.tokenize(datum.natural))
-                num_output_tokens = len(tokenizer.tokenize(datum.canonical)) + 1
-                train_length_pairs.append((num_input_tokens, num_output_tokens))
-
-            print("Computing max steps regression model parameters ...")
-            max_steps_intercept, max_steps_slope = compute_and_print_fit(
-                train_length_pairs, 10, 1
-            )
-            print("Done")
+            if train_data:
+                train_length_pairs = []
+                for datum in train_data:
+                    num_input_tokens = len(tokenizer.tokenize(datum.natural))
+                    num_output_tokens = len(tokenizer.tokenize(datum.canonical)) + 1
+                    train_length_pairs.append((num_input_tokens, num_output_tokens))
+
+                print("Computing max steps regression model parameters ...")
+                max_steps_intercept, max_steps_slope = compute_and_print_fit(
+                    train_length_pairs, 10, 1
+                )
+                print("Done")
+            else:
+                max_steps_slope, max_steps_intercept = max_steps_slope_intercept or (205.0, 10.0)
             partial_parse_builder = create_partial_parse_builder(
                 constrained, data_config, tokenizer
             )
@@ -313,6 +324,7 @@ def create_eval_exp(
             }
             if data_config.dataset_name in [
                 BenchClampDataset.CalFlowV2.value,
+                BenchClampDataset.CalFlowSimplified.value,
                 BenchClampDataset.TreeDST.value,
             ]:
                 metrics["lispress_match"] = TopKLispressMatch(beam_size)
@@ -336,12 +348,13 @@ def create_eval_exp(
                 test_data=eval_data,
                 client=lm,
                 log_dir=LOG_DIR / VERSION,
+                data_config=data_config,
             )
 
     raise ValueError("Could not create eval experiment with inputs")
 
 
-def create_exps_dict() -> Tuple[
+def create_exps_dict(**kwargs) -> Tuple[
     Dict[str, Callable[[], TrainExperiment]], Dict[str, Callable[[], Experiment]]
 ]:
     train_exps_dict: Dict[str, Callable[[], TrainExperiment]] = {}
@@ -404,7 +417,17 @@ def create_exps_dict() -> Tuple[
                         "unconstrained-greedy",
                         is_dev=True,
                     )
+                eval_exps_dict[eval_exp_name] = functools.partial(
+                    create_eval_exp,
+                    f"{trained_model_id}_test_eval",
+                    eval_model_config,
+                    data_config,
+                    "unconstrained-greedy",
+                    is_dev=False,
+                    **kwargs
+                )
 
+            dev_complete = True
             if dev_complete and len(dev_results) > 0:
                 print(f"All dev expts complete. Results gathered.\n{dev_results}")
                 best_trained_model_info = max(
@@ -431,6 +454,7 @@ def create_exps_dict() -> Tuple[
                             data_config,
                             constrained,  # type: ignore
                             is_dev=False,
+                            **kwargs
                         )
 
     return train_exps_dict, eval_exps_dict
@@ -442,7 +466,7 @@ def build_config(
 ) -> Dict[str, Callable[[], Union[TrainExperiment, Experiment]]]:
     sys.setrecursionlimit(50000)
     expts: Dict[str, Callable[[], Union[TrainExperiment, Experiment]]] = {}
-    train_expts, eval_expts = create_exps_dict()
+    train_expts, eval_expts = create_exps_dict(**kwargs)
     expts.update(train_expts)
     expts.update(eval_expts)
     return expts
diff --git a/src/semantic_parsing_with_constrained_lm/configs/lib/benchclamp.py b/src/semantic_parsing_with_constrained_lm/configs/lib/benchclamp.py
index 9858570..f12963b 100644
--- a/src/semantic_parsing_with_constrained_lm/configs/lib/benchclamp.py
+++ b/src/semantic_parsing_with_constrained_lm/configs/lib/benchclamp.py
@@ -74,6 +74,7 @@ def create_partial_parse_builder(
 
         elif data_config.dataset_name in (
             BenchClampDataset.CalFlowV2.value,
+            BenchClampDataset.CalFlowSimplified.value,
             BenchClampDataset.TreeDST.value,
         ):
             partial_parse_builder = create_partial_parse_builder_lispress_v2(
diff --git a/src/semantic_parsing_with_constrained_lm/domains/benchclamp_data_setup.py b/src/semantic_parsing_with_constrained_lm/domains/benchclamp_data_setup.py
index f054a80..1c4c692 100644
--- a/src/semantic_parsing_with_constrained_lm/domains/benchclamp_data_setup.py
+++ b/src/semantic_parsing_with_constrained_lm/domains/benchclamp_data_setup.py
@@ -98,6 +98,9 @@ class BenchClampDatasetConfig(ClampDataConfig):
             test_data_file = f"{BENCH_CLAMP_PROCESSED_DATA_DIR_AZURE}/{self.dataset_name}/{domain_str}test_all.jsonl"
         else:
             test_data_file = f"{BENCH_CLAMP_PROCESSED_DATA_DIR_AZURE}/{self.dataset_name}/{domain_str}test.jsonl"
+        train_data_file = self.to_local(train_data_file)
+        dev_data_file = self.to_local(dev_data_file)
+        test_data_file = self.to_local(test_data_file)
         with BlobFile(str(train_data_file)) as bf:
             print(f"Reading {train_data_file}")
             train_data = data_from_textio(bf)
@@ -115,9 +118,13 @@ class BenchClampDatasetConfig(ClampDataConfig):
             self.modify_data_with_sequence_creator(test_data),
         )
 
+    def to_local(self, url:str):
+        suffix = url.split('benchclamp')[-1]
+        return 'data/benchclamp' + suffix
 
 class BenchClampDataset(str, Enum):
     CalFlowV2 = "CalFlowV2"
+    CalFlowSimplified = "CalFlowSimplified"
     TreeDST = "TreeDST"
     Overnight = "Overnight"
     MTOP = "MTOP"
@@ -157,6 +164,21 @@ BENCHCLAMP_DATA_CONFIGS: List[ClampDataConfig] = (
             ("last_user", LastUserAgentUtterance(), BENCHCLAMP_SPLIT_NAMES),
         ]
         for split_name in split_names
+    ] +
+    [
+        BenchClampDatasetConfig(
+            data_id=f"calflowsimplified_{input_sequence_creator_name}_{split_name}",
+            split_name=split_name,
+            domain=None,
+            dataset_name=BenchClampDataset.CalFlowSimplified.value,
+            input_sequence_creator=input_sequence_creator,
+        )
+        for input_sequence_creator_name, input_sequence_creator, split_names in [
+            ("no_context", IdentitySequenceCreator(), BENCHCLAMP_SPLIT_NAMES),
+            ("last_agent", LastAgentUtterance(), BENCHCLAMP_SPLIT_NAMES),
+            ("last_user", LastUserAgentUtterance(), BENCHCLAMP_SPLIT_NAMES),
+        ]
+        for split_name in split_names
     ]
     + [
         BenchClampDatasetConfig(
diff --git a/src/semantic_parsing_with_constrained_lm/domains/lispress_v2/create_benchclamp_data.py b/src/semantic_parsing_with_constrained_lm/domains/lispress_v2/create_benchclamp_data.py
index cc30e44..f60b6c8 100644
--- a/src/semantic_parsing_with_constrained_lm/domains/lispress_v2/create_benchclamp_data.py
+++ b/src/semantic_parsing_with_constrained_lm/domains/lispress_v2/create_benchclamp_data.py
@@ -5,7 +5,7 @@ import itertools
 import json
 import sys
 from pathlib import Path
-from typing import Optional, Set
+from typing import Optional, Set, List
 
 import jsons
 from blobfile import BlobFile
@@ -30,7 +30,36 @@ from semantic_parsing_with_constrained_lm.paths import (
     BENCH_CLAMP_RAW_DATA_DIR,
 )
 from semantic_parsing_with_constrained_lm.tokenization import GPT2ClampTokenizer
-from semantic_parsing_with_constrained_lm.finetune.calflow import calflow_to_datum_format
+# from semantic_parsing_with_constrained_lm.finetune.platypus import calflow_to_datum_format
+
+
+def calflow_to_datum_format(path: str) -> List[FullDatum]:
+    dataset = load_jsonl_file(path, None)
+    converted = []
+
+    for item in dataset:
+        last_user_utterance = None
+        last_agent_utterance = None
+        last_plan = None
+        for i, turn in enumerate(item['turns']):
+            utterance = turn['user_utterance']['original_text']
+            plan = turn['lispress']
+            datum = FullDatum(
+                dialogue_id=item['dialogue_id'],
+                turn_part_index=i,
+                natural=utterance,
+                canonical=plan,
+                agent_context=json.dumps({
+                    'plan': last_plan,
+                    'user_utterance': last_user_utterance,
+                    'agent_utterance': last_agent_utterance,
+                })
+            )
+            last_user_utterance = utterance
+            last_agent_utterance = turn['agent_utterance']['original_text']
+            last_plan = plan
+            converted.append(datum)
+    return converted
 
 
 def extract_and_write_grammar(
@@ -115,6 +144,7 @@ def write_data_and_grammar(
         test_benchclamp_data,
         datum_output_dir,
     )
+    return
 
     extract_and_write_grammar(
         train_dataflow_dialogues_jsonl,
@@ -192,27 +222,27 @@ def create_grammar_from_train_split():
 def main():
     write_data_and_grammar(
         train_dataflow_dialogues_jsonl=BENCH_CLAMP_RAW_DATA_DIR
-        / f"{BenchClampDataset.CalFlowV2}/train.dataflow_dialogues.jsonl",
+        / f"{BenchClampDataset.CalFlowSimplified}/train.dataflow_dialogues.jsonl",
         dev_dataflow_dialogues_jsonl=BENCH_CLAMP_RAW_DATA_DIR
-        / f"{BenchClampDataset.CalFlowV2}/valid.dataflow_dialogues.jsonl",
+        / f"{BenchClampDataset.CalFlowSimplified}/valid.dataflow_dialogues.jsonl",
         test_dataflow_dialogues_jsonl=None,
         datum_output_dir=BENCH_CLAMP_PROCESSED_DATA_DIR
-        / f"{BenchClampDataset.CalFlowV2}/",
-        grammar_output_dir=BENCH_CLAMP_GRAMMAR_DATA_DIR
-        / f"{BenchClampDataset.CalFlowV2}/",
-    )
-    write_data_and_grammar(
-        train_dataflow_dialogues_jsonl=BENCH_CLAMP_RAW_DATA_DIR
-        / f"{BenchClampDataset.TreeDST}/train_dst.dataflow_dialogues.jsonl",
-        dev_dataflow_dialogues_jsonl=BENCH_CLAMP_RAW_DATA_DIR
-        / f"{BenchClampDataset.TreeDST}/dev_dst.dataflow_dialogues.jsonl",
-        test_dataflow_dialogues_jsonl=BENCH_CLAMP_RAW_DATA_DIR
-        / f"{BenchClampDataset.TreeDST}/test_dst.dataflow_dialogues.jsonl",
-        datum_output_dir=BENCH_CLAMP_PROCESSED_DATA_DIR
-        / f"{BenchClampDataset.TreeDST}/",
+        / f"{BenchClampDataset.CalFlowSimplified}/",
         grammar_output_dir=BENCH_CLAMP_GRAMMAR_DATA_DIR
-        / f"{BenchClampDataset.TreeDST}/",
+        / f"{BenchClampDataset.CalFlowSimplified}/",
     )
+    # write_data_and_grammar(
+    #     train_dataflow_dialogues_jsonl=BENCH_CLAMP_RAW_DATA_DIR
+    #     / f"{BenchClampDataset.TreeDST}/train_dst.dataflow_dialogues.jsonl",
+    #     dev_dataflow_dialogues_jsonl=BENCH_CLAMP_RAW_DATA_DIR
+    #     / f"{BenchClampDataset.TreeDST}/dev_dst.dataflow_dialogues.jsonl",
+    #     test_dataflow_dialogues_jsonl=BENCH_CLAMP_RAW_DATA_DIR
+    #     / f"{BenchClampDataset.TreeDST}/test_dst.dataflow_dialogues.jsonl",
+    #     datum_output_dir=BENCH_CLAMP_PROCESSED_DATA_DIR
+    #     / f"{BenchClampDataset.TreeDST}/",
+    #     grammar_output_dir=BENCH_CLAMP_GRAMMAR_DATA_DIR
+    #     / f"{BenchClampDataset.TreeDST}/",
+    # )
 
 
 if __name__ == "__main__":
diff --git a/src/semantic_parsing_with_constrained_lm/finetune/download_huggingface_lms.py b/src/semantic_parsing_with_constrained_lm/finetune/download_huggingface_lms.py
index 0ca73d8..5f47ce9 100644
--- a/src/semantic_parsing_with_constrained_lm/finetune/download_huggingface_lms.py
+++ b/src/semantic_parsing_with_constrained_lm/finetune/download_huggingface_lms.py
@@ -23,11 +23,11 @@ def save_model_and_tokenizer(model, tokenizer, save_dir: Path) -> None:
 def main():
     # T5
     for model_id, huggingface_model_id in [
-        ("t5-small-lm-adapt", "google/t5-small-lm-adapt"),
-        ("t5-base-lm-adapt", "google/t5-base-lm-adapt"),
-        ("t5-large-lm-adapt", "google/t5-large-lm-adapt"),
-        ("t5-xl-lm-adapt", "google/t5-xl-lm-adapt"),
-        ("t5-xxl-lm-adapt", "google/t5-xxl-lm-adapt"),
+        # ("t5-small-lm-adapt", "google/t5-small-lm-adapt"),
+        # ("t5-base-lm-adapt", "google/t5-base-lm-adapt"),
+        # ("t5-large-lm-adapt", "google/t5-large-lm-adapt"),
+        # ("t5-xl-lm-adapt", "google/t5-xl-lm-adapt"),
+        # ("t5-xxl-lm-adapt", "google/t5-xxl-lm-adapt"),
     ]:
         print(f"Downloading {model_id} ...")
         model = T5ForConditionalGeneration.from_pretrained(huggingface_model_id)
@@ -37,20 +37,20 @@ def main():
         )
 
     # CodeT5
-    for model_id, huggingface_model_id in [
-        ("codet5-base", "Salesforce/codet5-base"),
-        ("codet5-base-multi-sum", "Salesforce/codet5-base-multi-sum"),
-    ]:
-        print(f"Downloading {model_id} ...")
-        model = T5ForConditionalGeneration.from_pretrained(huggingface_model_id)
-        tokenizer = RobertaTokenizer.from_pretrained(huggingface_model_id)
-        save_model_and_tokenizer(
-            model, tokenizer, CLAMP_PRETRAINED_MODEL_DIR / model_id
-        )
-
+    # for model_id, huggingface_model_id in [
+    #     ("codet5-base", "Salesforce/codet5-base"),
+    #     ("codet5-base-multi-sum", "Salesforce/codet5-base-multi-sum"),
+    # ]:
+    #     print(f"Downloading {model_id} ...")
+    #     model = T5ForConditionalGeneration.from_pretrained(huggingface_model_id)
+    #     tokenizer = RobertaTokenizer.from_pretrained(huggingface_model_id)
+    #     save_model_and_tokenizer(
+    #         model, tokenizer, CLAMP_PRETRAINED_MODEL_DIR / model_id
+    #     )
+    #
     # Bart
     for model_id, huggingface_model_id in [
-        ("bart-base", "facebook/bart-base"),
+        # ("bart-base", "facebook/bart-base"),
         ("bart-large", "facebook/bart-large"),
     ]:
         print(f"Downloading {model_id} ...")
diff --git a/src/semantic_parsing_with_constrained_lm/run_exp.py b/src/semantic_parsing_with_constrained_lm/run_exp.py
index c365b66..f9bff13 100644
--- a/src/semantic_parsing_with_constrained_lm/run_exp.py
+++ b/src/semantic_parsing_with_constrained_lm/run_exp.py
@@ -28,7 +28,7 @@ from typing import (
     Tuple,
     TypeVar,
     Union,
-    cast,
+    cast, Any,
 )
 
 import jsons
@@ -56,6 +56,7 @@ class Experiment(Generic[FullDatumSub]):
     metrics: Mapping[str, Metric[Sequence[str], FullDatumSub]]
     log_dir: Optional[Path] = None
     loggers: Optional[List[Logger[Sequence[ModelResult], FullDatumSub]]] = None
+    data_config:Optional[Any] = None
 
 
 class EvalSplit(str, Enum):
@@ -75,10 +76,10 @@ class EvalSplit(str, Enum):
 
 
 def filter_exp_dict(
-    # Using Iterable[Tuple[str, E]] is deprecated
-    exps: Union[Iterable[Tuple[str, E]], Dict[str, Callable[[], E]]],
-    exp_names: Optional[List[str]],
-    exp_name_pattern: Optional[List[str]],
+        # Using Iterable[Tuple[str, E]] is deprecated
+        exps: Union[Iterable[Tuple[str, E]], Dict[str, Callable[[], E]]],
+        exp_names: Optional[List[str]],
+        exp_name_pattern: Optional[List[str]],
 ) -> Dict[str, Callable[[], E]]:
     if isinstance(exps, dict):
         exps_dict = exps
@@ -122,15 +123,15 @@ def filter_exp_dict(
 
 
 async def run(
-    exp_name: str,
-    exp: Experiment,
-    log_dir: Optional[pathlib.Path] = None,
-    debug: bool = False,
-    ids: Optional[List[str]] = None,
-    rerun: bool = False,
-    num_eval_examples: Optional[int] = None,
-    rank: int = 0,
-    world_size: int = 1,
+        exp_name: str,
+        exp: Experiment,
+        log_dir: Optional[pathlib.Path] = None,
+        debug: bool = False,
+        ids: Optional[List[str]] = None,
+        rerun: bool = False,
+        num_eval_examples: Optional[int] = None,
+        rank: int = 0,
+        world_size: int = 1,
 ) -> None:
     if log_dir is None:
         if exp.log_dir is None:
@@ -166,10 +167,10 @@ async def run(
 
     print(f"Total test examples: {len(test_data)}")
     test_data = test_data[
-        (rank * len(test_data))
-        // world_size : ((rank + 1) * len(test_data))
-        // world_size
-    ]
+                (rank * len(test_data))
+                // world_size: ((rank + 1) * len(test_data))
+                               // world_size
+                ]
     if num_eval_examples is not None:
         test_data = test_data[:num_eval_examples]
 
@@ -185,7 +186,7 @@ async def run(
                 [json.loads(line) for line in open(past_model_outputs_path, "r")],
             )
         )
-    if candidate_past_model_outputs:
+    if candidate_past_model_outputs and not rerun:
         past_model_outputs_path, past_model_outputs_to_copy = max(
             candidate_past_model_outputs, key=lambda t: len(t[1])
         )
@@ -197,7 +198,7 @@ async def run(
         past_model_outputs_to_copy = []
 
     with logger.intercept_output(
-        exp_log_dir / f"stdout.{now}", exp_log_dir / f"stderr.{now}"
+            exp_log_dir / f"stdout.{now}", exp_log_dir / f"stderr.{now}"
     ), open(
         exp_log_dir / f"model_outputs.{now}.jsonl", "w"
     ) as model_outputs_f, ExitStack() as logger_cm:
@@ -208,13 +209,13 @@ async def run(
             for metric in exp.metrics.values():
                 metric.reset()
             for test_datum, past_model_output in zip(
-                test_data, past_model_outputs_to_copy
+                    test_data, past_model_outputs_to_copy
             ):
                 current_test_index += 1
                 assert test_datum.dialogue_id == past_model_output["test_datum_id"]
                 assert (
-                    test_datum.turn_part_index
-                    == past_model_output["test_datum_turn_part_index"]
+                        test_datum.turn_part_index
+                        == past_model_output["test_datum_turn_part_index"]
                 )
                 for metric in exp.metrics.values():
                     metric.update(past_model_output["outputs"], test_datum)
@@ -226,10 +227,10 @@ async def run(
 
             async with exp.client:
                 async for kbest, test_datum in limits.map_async_limited(
-                    exp.model.predict,
-                    test_data[len(past_model_outputs_to_copy) :],
-                    max_concurrency=1,
-                    wrap_exception=not debug,
+                        exp.model.predict,
+                        test_data[len(past_model_outputs_to_copy):],
+                        max_concurrency=1,
+                        wrap_exception=not debug,
                 ):
                     beam_search_text = [beam.text for beam in kbest]
 
@@ -291,8 +292,8 @@ async def run(
                 with open(results_path, "w") as results_f:
                     json.dump(all_metric_results, results_f)
         except (  # pylint: disable=try-except-raise
-            KeyboardInterrupt,
-            bdb.BdbQuit,
+                KeyboardInterrupt,
+                bdb.BdbQuit,
         ):
             # If we get Ctrl-C then we want to stop the entire program,
             # instead of just skipping this one experiment.
@@ -319,21 +320,21 @@ async def run(
 
 
 def main(
-    config_name: str = typer.Option(...),
-    log_dir: Optional[pathlib.Path] = typer.Option(None),
-    debug: bool = typer.Option(False),
-    exp_names: Optional[List[str]] = typer.Option(
-        None
-    ),  # pylint: disable=unused-argument
-    exp_name_pattern: Optional[List[str]] = typer.Option(None),
-    ids: Optional[List[str]] = typer.Option(None),
-    rerun: bool = typer.Option(False),
-    num_eval_examples: Optional[int] = typer.Option(None),
-    model: ClientType = typer.Option(ClientType.GPT2),
-    rank: int = typer.Option(0),
-    world_size: int = typer.Option(1),
-    results_dir: str = typer.Option("results"),
-    eval_split: EvalSplit = typer.Option(EvalSplit.DevSubset),
+        config_name: str = typer.Option(...),
+        log_dir: Optional[pathlib.Path] = typer.Option(None),
+        debug: bool = typer.Option(False),
+        exp_names: Optional[List[str]] = typer.Option(
+            None
+        ),  # pylint: disable=unused-argument
+        exp_name_pattern: Optional[List[str]] = typer.Option(None),
+        ids: Optional[List[str]] = typer.Option(None),
+        rerun: bool = typer.Option(False),
+        num_eval_examples: Optional[int] = typer.Option(None),
+        model: ClientType = typer.Option(ClientType.GPT2),
+        rank: int = typer.Option(0),
+        world_size: int = typer.Option(1),
+        results_dir: str = typer.Option("results"),
+        eval_split: EvalSplit = typer.Option(EvalSplit.DevSubset),
 ):
     async def inner():
         nonlocal exp_names
@@ -350,6 +351,10 @@ def main(
         exps = config_mod.build_config(log_dir, **kwargs)
         filtered_exp_dict = filter_exp_dict(exps, exp_names, exp_name_pattern)
         for exp_name in filtered_exp_dict:
+            # TODO: block training exp
+            if 'eval' not in exp_name:
+                continue
+
             try:
                 exp = filtered_exp_dict[exp_name]()
             except TrainedModelNotFoundError:
diff --git a/src/semantic_parsing_with_constrained_lm/scfg/generate.py b/src/semantic_parsing_with_constrained_lm/scfg/generate.py
index 74c430c..06212f4 100644
--- a/src/semantic_parsing_with_constrained_lm/scfg/generate.py
+++ b/src/semantic_parsing_with_constrained_lm/scfg/generate.py
@@ -228,7 +228,7 @@ def generate_from_grammar_and_nonterminal(
 ) -> Iterator[GeneratedNonterminalNode]:
     """
     Generate from a grammar, given a nonterminal.
-    If `randomize`, samples without replacement. Otherwise, enumerates.
+    If `randomize`, tiny_samples without replacement. Otherwise, enumerates.
     TODO: Do we ever use the enumeration part? Or the w/o replacement part?
     """
     alias_to_iterator: Dict[Alias, Iterator[Tuple[GeneratedNode, ...]]] = {}