Commit aa8592a (2 parents: f242347 + 6af7d01)
Author: kyobrien
Commit message: Minor refactor

37 files changed: +1263 −618 lines

.pre-commit-config.yaml (+1 −1)

@@ -24,4 +24,4 @@ repos:
     hooks:
       - id: codespell
         # The promptsource templates spuriously get flagged without this
-        args: ["--skip=*.yaml"]
+        args: ["-L fpr", "--skip=*.yaml"]

.vscode/launch.json (+2 −2)

@@ -9,11 +9,11 @@
       "type": "python",
       "request": "launch",
       "module": "elk",
-      "args": ["elicit", "RWKV", "imdb", "--max_examples=5"],
+      "args": ["elicit", "rwkv", "imdb", "--max_examples=5"],
       "env": {
         "CUDA_VISIBLE_DEVICES": "0",
       },
       "justMyCode": true
     }
   ]
-}
+}

README.md (+6 −0)

@@ -32,6 +32,12 @@ The following command will evaluate the probe from the run naughty-northcutt on
 elk eval naughty-northcutt microsoft/deberta-v2-xxlarge-mnli imdb
 ```
 
+The following command runs `elicit` on the Cartesian product of the listed models and datasets, storing the results in a dedicated folder ELK_DIR/sweeps/<memorable_name>. The `--add_pooled` flag adds an extra dataset that pools all of the listed datasets together.
+
+```bash
+elk sweep --models gpt2-{medium,large,xl} --datasets imdb amazon_polarity --add_pooled
+```
+
 ## Caching
 
 The hidden states resulting from `elk elicit` are cached as a HuggingFace dataset to avoid having to recompute them every time we want to train a probe. The cache is stored in the same place as all other HuggingFace datasets, which is usually `~/.cache/huggingface/datasets`.
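Since the cache follows standard HuggingFace conventions, the cached extractions can be inspected like any other datasets cache. A minimal sketch, assuming the default cache path (the `HF_DATASETS_CACHE` environment variable may override it):

```python
# Hypothetical sketch: list the entries in the default HuggingFace datasets
# cache, where the hidden states computed by `elk elicit` end up.
from pathlib import Path

cache_dir = Path.home() / ".cache" / "huggingface" / "datasets"
if cache_dir.exists():
    for entry in sorted(cache_dir.iterdir()):
        print(entry.name)
```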

elk/__main__.py (+3 −2)

@@ -5,21 +5,22 @@
 from simple_parsing import ArgumentParser
 
 from elk.evaluation.evaluate import Eval
+from elk.training.sweep import Sweep
 from elk.training.train import Elicit
 
 
 @dataclass
 class Command:
     """Some top-level command"""
 
-    command: Elicit | Eval
+    command: Elicit | Eval | Sweep
 
     def execute(self):
         return self.command.execute()
 
 
 def run():
-    parser = ArgumentParser(add_help=False, add_config_path_arg=True)
+    parser = ArgumentParser(add_help=False)
     parser.add_arguments(Command, dest="run")
     args = parser.parse_args()
     run: Command = args.run
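The union type on the `command` field is what lets simple_parsing dispatch between subcommands. A minimal, self-contained sketch of the same pattern, with a hypothetical `Greet` command standing in for `Elicit | Eval | Sweep`:

```python
# Sketch of the subcommand pattern from elk/__main__.py above.
# `Greet` is a hypothetical stand-in for Elicit / Eval / Sweep.
from dataclasses import dataclass

from simple_parsing import ArgumentParser


@dataclass
class Greet:
    """Print a greeting."""

    name: str = "world"

    def execute(self):
        print(f"Hello, {self.name}!")


@dataclass
class Command:
    """Some top-level command"""

    command: Greet  # elk uses a union here: Elicit | Eval | Sweep

    def execute(self):
        return self.command.execute()


if __name__ == "__main__":
    parser = ArgumentParser(add_help=False)
    parser.add_arguments(Command, dest="run")
    args = parser.parse_args()
    args.run.execute()
```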

elk/debug_logging.py (+3 −3)

@@ -27,17 +27,17 @@ def save_debug_log(datasets: list[DatasetDict], out_dir: Path) -> None:
         )
 
         train_split, val_split = select_train_val_splits(ds)
-        text_inputs = ds[val_split][0]["text_inputs"]
+        text_questions = ds[val_split][0]["text_questions"]
         template_ids = ds[val_split][0]["variant_ids"]
         label = ds[val_split][0]["label"]
 
         # log the train size and val size
         logging.info(f"Train size: {len(ds[train_split])}")
         logging.info(f"Val size: {len(ds[val_split])}")
 
-        templates_text = f"{len(text_inputs)} templates used:\n"
+        templates_text = f"{len(text_questions)} templates used:\n"
         trailing_whitespace = False
-        for (text0, text1), id in zip(text_inputs, template_ids):
+        for (text0, text1), id in zip(text_questions, template_ids):
             templates_text += (
                 f'***---TEMPLATE "{id}"---***\n'
                 f"{'false' if label else 'true'}:\n"

elk/evaluation/evaluate.py (+14 −20)

@@ -9,9 +9,9 @@
 
 from ..extraction.extraction import Extract
 from ..files import elk_reporter_dir
+from ..metrics import evaluate_preds
 from ..run import Run
 from ..training import Reporter
-from ..training.supervised import evaluate_supervised
 from ..utils import select_usable_devices
 
 
@@ -43,6 +43,8 @@ class Eval(Serializable):
     out_dir: Path | None = None
     skip_supervised: bool = False
 
+    disable_cache: bool = field(default=False, to_dict=False)
+
     def execute(self):
         transfer_dir = elk_reporter_dir() / self.source / "transfer_eval"
 
@@ -69,34 +71,26 @@ def evaluate_reporter(
         reporter.eval()
 
         row_buf = []
-        for ds_name, (val_x0, val_x1, val_gt, _) in val_output.items():
-            val_result = reporter.score(
-                val_gt,
-                val_x0,
-                val_x1,
-            )
-
-            stats_row = pd.Series(
-                {
-                    "dataset": ds_name,
-                    "layer": layer,
-                    **val_result._asdict(),
-                }
-            )
+        for ds_name, (val_h, val_gt, _) in val_output.items():
+            val_result = evaluate_preds(val_gt, reporter(val_h))
+
+            stats_row = {
+                "dataset": ds_name,
+                "layer": layer,
+                **val_result.to_dict(),
+            }
 
             lr_dir = experiment_dir / "lr_models"
             if not self.cfg.skip_supervised and lr_dir.exists():
                 with open(lr_dir / f"layer_{layer}.pt", "rb") as f:
                     lr_model = torch.load(f, map_location=device).eval()
 
-                lr_auroc, lr_acc = evaluate_supervised(lr_model, val_x0, val_x1, val_gt)
-
-                stats_row["lr_auroc"] = lr_auroc
-                stats_row["lr_acc"] = lr_acc
+                lr_result = evaluate_preds(val_gt, lr_model(val_h))
+                stats_row.update(lr_result.to_dict(prefix="lr_"))
 
             row_buf.append(stats_row)
 
-        return pd.DataFrame(row_buf)
+        return pd.DataFrame.from_records(row_buf)
 
     def evaluate(self):
         """Evaluate the reporter on all layers."""

elk/extraction/balanced_sampler.py (+33 −23)

@@ -1,4 +1,5 @@
 from collections import deque
+from dataclasses import dataclass, field
 from itertools import cycle
 from random import Random
 from typing import Iterable, Iterator, Optional
@@ -11,39 +12,48 @@
 from ..utils.typing import assert_type
 
 
+@dataclass
 class BalancedSampler(TorchIterableDataset):
     """
-    Approximately balances a binary classification dataset in a streaming fashion.
-
-    Args:
-        dataset (IterableDataset): The HuggingFace IterableDataset to balance.
-        label_col (Optional[str], optional): The name of the column containing the
-            binary label. If not provided, the label column will be inferred from
-            the dataset features. Defaults to None.
-        buffer_size (int, optional): The total buffer size to use for balancing the
-            dataset. This value should be divisible by 2, as it will be equally
-            divided between the two binary label values (0 and 1). Defaults to 1000.
+    A sampler that approximately balances a multi-class classification dataset in a
+    streaming fashion.
+
+    Attributes:
+        data: The input dataset to balance.
+        num_classes: The total number of classes expected in the data.
+        buffer_size: The total buffer size to use for balancing the dataset. Each class
+            will have its own buffer with this size.
     """
 
-    def __init__(self, data: Iterable[dict], buffer_size: int = 1000):
-        self.data = data
+    data: Iterable[dict]
+    num_classes: int
+    buffer_size: int = 1000
+    buffers: dict[int, deque[dict]] = field(default_factory=dict, init=False)
+    label_col: str = "label"
 
-        self.neg_buffer = deque(maxlen=buffer_size)
-        self.pos_buffer = deque(maxlen=buffer_size)
+    def __post_init__(self):
+        # Initialize empty buffers
+        self.buffers = {
+            label: deque(maxlen=self.buffer_size) for label in range(self.num_classes)
+        }
 
     def __iter__(self):
         for sample in self.data:
-            label = sample["label"]
+            label = sample[self.label_col]
 
-            # Add the sample to the appropriate buffer
-            if label == 0:
-                self.neg_buffer.append(sample)
-            else:
-                self.pos_buffer.append(sample)
+            # This whole class is a no-op if the label is not an integer
+            if not isinstance(label, int):
+                yield sample
+                continue
+
+            # Add the sample to the buffer for its class label
+            self.buffers[label].append(sample)
 
-            while self.neg_buffer and self.pos_buffer:
-                yield self.neg_buffer.popleft()
-                yield self.pos_buffer.popleft()
+            # Check if all buffers have at least one sample
+            while all(len(buffer) > 0 for buffer in self.buffers.values()):
+                # Yield one sample from each buffer in a round-robin fashion
+                for buf in self.buffers.values():
+                    yield buf.popleft()
 
 
 class FewShotSampler: