
Commit dd69da6

Transformer Pre-Training: Add early stop + fix eval set 👷‍♀️ (#409)
1 parent a038bab commit dd69da6

File tree

4 files changed, +48 -48 lines changed

src/otc/models/fttransformer.py (+4 -3)
@@ -614,9 +614,10 @@ def forward(
         if self.dropout is not None:
             attention_probs = self.dropout(attention_probs)

-        self.save_attn(attention_probs)
-        if attention_probs.requires_grad:
-            attention_probs.register_hook(self.save_attn_gradients)
+        # comment out for training
+        # self.save_attn(attention_probs)
+        # if attention_probs.requires_grad:
+        #     attention_probs.register_hook(self.save_attn_gradients)

         x = attention_probs @ self._reshape(v)
         x = (
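
The lines commented out above implement the common pattern of caching attention probabilities and registering a backward hook to capture their gradients for later attention-map analysis; disabling them during training avoids holding these tensors for every batch. Below is a minimal, self-contained sketch of that pattern; the class is hypothetical and only the method names (save_attn, save_attn_gradients) are taken from the diff.

```python
from typing import Optional

import torch
from torch import nn


class AttentionProbe(nn.Module):
    """Hypothetical module illustrating the save_attn / save_attn_gradients pattern."""

    def __init__(self) -> None:
        super().__init__()
        self.attn: Optional[torch.Tensor] = None
        self.attn_gradients: Optional[torch.Tensor] = None

    def save_attn(self, attention_probs: torch.Tensor) -> None:
        # cache the attention probabilities for later inspection
        self.attn = attention_probs

    def save_attn_gradients(self, grad: torch.Tensor) -> None:
        # called by autograd through the registered hook during backward()
        self.attn_gradients = grad

    def forward(self, attention_probs: torch.Tensor, v: torch.Tensor) -> torch.Tensor:
        # exactly these two steps are commented out for training in the diff above
        self.save_attn(attention_probs)
        if attention_probs.requires_grad:
            attention_probs.register_hook(self.save_attn_gradients)
        return attention_probs @ v
```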

src/otc/models/objective.py (+3 -0)
@@ -157,6 +157,7 @@ def __init__(

         self._clf: BaseEstimator
         self._callbacks = CallbackContainer([SaveCallback(), PrintCallback()])
+        self._pretrain = pretrain

         super().__init__(x_train, y_train, x_val, y_val, name, pretrain)

@@ -235,6 +236,7 @@ def __call__(self, trial: optuna.Trial) -> float:
             "feature_tokenizer": FeatureTokenizer(**feature_tokenizer_kwargs),  # type: ignore # noqa: E501
             "cat_features": self._cat_features,
             "cat_cardinalities": self._cat_cardinalities,
+            "d_token": d_token,
         }

         optim_params = {"lr": lr, "weight_decay": weight_decay}
@@ -245,6 +247,7 @@ def __call__(self, trial: optuna.Trial) -> float:
             optim_params=optim_params,
             dl_params=dl_params,
             callbacks=self._callbacks,  # type: ignore # noqa: E501
+            pretrain=self._pretrain,
         )

         self._clf.fit(
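
For context, the objective now stores the pretrain flag in __init__ and forwards it, together with the suggested d_token, to the classifier. A stripped-down, hypothetical sketch of that flow; the suggest range and the placeholder return value are illustrative, not taken from the study.

```python
import optuna


class ObjectiveSketch:
    """Hypothetical reduction of the objective to the two values added in this commit."""

    def __init__(self, pretrain: bool) -> None:
        # stored once so that every trial can hand it to the classifier
        self._pretrain = pretrain

    def __call__(self, trial: optuna.Trial) -> float:
        # illustrative search space; the real one lives in objective.py
        d_token = trial.suggest_int("d_token", 64, 256, step=64)
        module_params = {"d_token": d_token}
        # the real objective constructs the classifier with module_params and
        # pretrain=self._pretrain here and returns the fitted model's validation score
        return float(d_token)  # placeholder score for the sketch
```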

src/otc/models/train_model.py (+1 -22)
@@ -75,12 +75,6 @@
 @click.option(
     "--pretrain/--no-pretrain", default=False, help="Flag to activate pretraining."
 )
-@click.option(
-    "--sample",
-    type=click.FloatRange(0, 1),
-    default=1,
-    help="Sampling factor applied to train and validation set.",
-)
 def main(
     trials: int,
     seed: int,
@@ -89,7 +83,6 @@ def main(
     id: str,
     dataset: str,
     pretrain: bool,
-    sample: float,
 ) -> None:
     """
     Start study.
@@ -102,7 +95,6 @@ def main(
         id (str): id of study.
         dataset (str): name of data set.
         pretrain (bool): whether to pretrain model.
-        sample (float): sampling factor.
     """
     logger = logging.getLogger(__name__)
     warnings.filterwarnings("ignore", category=ExperimentalWarning)
@@ -171,19 +163,6 @@ def main(
     y_val = x_val["buy_sell"]
     x_val.drop(columns=["buy_sell"], inplace=True)

-    if sample < 1.0:
-        # sample down train data
-        x_train = x_train.sample(frac=sample, random_state=set_seed(seed)).reset_index(
-            drop=True
-        )
-        y_train = y_train.iloc[x_train.index]
-
-        # sample down validation data
-        x_val = x_val.sample(frac=sample, random_state=set_seed(seed)).reset_index(
-            drop=True
-        )
-        y_val = y_val.iloc[x_val.index]
-
     # pretrain training activated
     has_label = (y_train != 0).all()
     if pretrain and has_label:
@@ -251,7 +230,7 @@ def main(
             "dataset": dataset,
             "seed": seed,
             "pretrain": pretrain,
-            "sample": sample,
+            "sample": 1.0,
         }
     )

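With the --sample option removed, the study is launched with the remaining flags only. A hypothetical invocation via click's test runner follows; every option name except --pretrain/--no-pretrain is inferred from the main() signature rather than shown in this diff, and the id/dataset values are placeholders.

```python
from click.testing import CliRunner

from otc.models.train_model import main

runner = CliRunner()
result = runner.invoke(
    main,
    # note: no --sample anymore; the logged config now records "sample": 1.0 instead
    ["--trials", "10", "--seed", "42", "--id", "demo-study", "--dataset", "example", "--pretrain"],
)
print(result.exit_code, result.output)
```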

src/otc/models/transformer_classifier.py (+40 -23)
@@ -15,8 +15,7 @@
 import pandas as pd
 import torch
 from sklearn.base import BaseEstimator, ClassifierMixin
-from sklearn.utils.multiclass import check_classification_targets
-from sklearn.utils.validation import check_array, check_is_fitted, check_X_y
+from sklearn.utils.validation import check_array, check_is_fitted
 from torch import nn, optim

 from otc.data.dataloader import TabDataLoader
@@ -38,7 +37,7 @@ class TransformerClassifier(BaseEstimator, ClassifierMixin):
     """

     epochs_pretrain = 20
-    epochs_finetune = 1
+    epochs_finetune = 20

     def __init__(
         self,
@@ -246,6 +245,7 @@ def fit( # noqa: C901

         if self.pretrain:

+            print("start pre-training...")
             mask = y == 0

             # isolate unlabelled
@@ -260,10 +260,14 @@ def fit( # noqa: C901
             train_loader_pretrain = self.array_to_dataloader_pretrain(
                 X_unlabelled, y_unlabelled
             )
-            val_loader_pretrain = self.array_to_dataloader_pretrain(
-                X_unlabelled, y_unlabelled
+
+            # use in-sample instead of validation set, if None is provided
+            X_val, y_val = (
+                eval_set if eval_set is not None else (X_unlabelled, y_unlabelled)
             )

+            val_loader_pretrain = self.array_to_dataloader_pretrain(X_val, y_val)
+
             # free up memory
             del X_unlabelled, y_unlabelled
             gc.collect()
@@ -314,24 +318,30 @@ def fit( # noqa: C901
                 optimizer=optimizer, warmup=warmup_steps, max_iters=max_steps
             )

-            criterion = nn.BCEWithLogitsLoss()
+            # keep track of val loss and do early stopping
+            early_stopping = EarlyStopping(patience=10)
+
+            # mean bce with logits loss
+            criterion = nn.BCEWithLogitsLoss(reduction="mean")

             step = 0
+            best_accuracy = -1.0
+
             for epoch in range(self.epochs_pretrain):

                 # perform training
                 loss_in_epoch_train = 0

                 batch = 0

-                for x_cat, x_cont, masks in train_loader_pretrain:
+                for x_cat, x_cont, mask in train_loader_pretrain:

                     self.clf.train()
                     optimizer.zero_grad()

                     with torch.autocast(device_type="cuda", dtype=torch.float16):
                         logits = self.clf(x_cat, x_cont)
-                        train_loss = criterion(logits, masks.float())  # type: ignore[union-attr] # noqa: E501
+                        train_loss = criterion(logits, mask.float())  # type: ignore[union-attr] # noqa: E501

                     scaler.scale(train_loss).backward()
                     scaler.step(optimizer)
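
EarlyStopping(patience=10) is introduced here, but its implementation is not part of this diff. Below is a minimal sketch of what such a patience-based helper usually looks like, with an assumed interface that tracks the validation loss and reports when training should stop; the actual class in the repository may differ.

```python
class EarlyStopping:
    """Minimal sketch of a patience-based early-stopping helper (assumed interface)."""

    def __init__(self, patience: int = 10, min_delta: float = 0.0) -> None:
        self.patience = patience
        self.min_delta = min_delta
        self.best_loss = float("inf")
        self.counter = 0
        self.early_stop = False

    def __call__(self, val_loss: float) -> bool:
        if val_loss < self.best_loss - self.min_delta:
            # improvement: remember the new best loss and reset the counter
            self.best_loss = val_loss
            self.counter = 0
        else:
            # no improvement: stop once `patience` epochs have passed without one
            self.counter += 1
            if self.counter >= self.patience:
                self.early_stop = True
        return self.early_stop
```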
@@ -353,31 +363,45 @@ def fit( # noqa: C901
                 correct = 0

                 with torch.no_grad():
-                    for x_cat, x_cont, masks in val_loader_pretrain:
+                    for x_cat, x_cont, mask in val_loader_pretrain:

                         # for my implementation
                         logits = self.clf(x_cat, x_cont)
-                        val_loss = criterion(logits, masks.float())  # type: ignore[union-attr] # noqa: E501
-
+                        val_loss = criterion(logits, mask.float())  # type: ignore[union-attr] # noqa: E501
                         loss_in_epoch_val += val_loss.item()

+                        # accuracy
+                        # adapted from here, but over columns + rows https://github.com/puhsu/tabular-dl-pretrain-objectives/blob/3f503d197867c341b4133efcafd3243eb5bb93de/bin/mask.py#L440 # noqa: E501
+                        hard_predictions = torch.zeros_like(logits, dtype=torch.long)
+                        hard_predictions[logits > 0] = 1
+                        # sum columns and rows
+                        correct += (hard_predictions.bool() == mask).sum()
+
                         batch += 1

                 # loss average over all batches
                 train_loss_all = loss_in_epoch_train / len(train_loader_pretrain)
                 val_loss_all = loss_in_epoch_val / len(val_loader_pretrain)
+                # correct / (rows * columns)
+                val_accuracy = correct / (X_val.shape[0] * X_val.shape[1])
+
+                print(f"train loss: {train_loss}")
+                print(f"val loss: {val_loss}")
+                print(f"val accuracy: {val_accuracy}")

                 self._stats_pretrain_epoch.append(
                     {
                         "train_loss": train_loss_all,
                         "val_loss": val_loss_all,
+                        "val_accuracy": val_accuracy,
                         "step": step,
                         "epoch": epoch,
                     }
                 )

-                print(f"train loss: {train_loss}")
-                print(f"val loss: {val_loss}")
+                if best_accuracy < val_accuracy:
+                    self._checkpoint_write()
+                    best_accuracy = val_accuracy

                 # https://discuss.huggingface.co/t/clear-gpu-memory-of-transformers-pipeline/18310/2
                 del train_loader_pretrain, val_loader_pretrain
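
The validation accuracy above thresholds the logits at zero (equivalent to sigmoid(logit) > 0.5), compares the hard predictions against the binary mask cell by cell, and normalises by rows times columns. Pulled out into a standalone helper, the same computation could look like this sketch; tensor shapes are purely illustrative.

```python
import torch


def mask_accuracy(logits: torch.Tensor, mask: torch.Tensor) -> float:
    """Share of correctly reconstructed mask cells, over rows and columns."""
    # logit > 0  <=>  sigmoid(logit) > 0.5, so threshold at zero
    hard_predictions = logits > 0
    # count matches against the boolean mask, cell by cell
    correct = (hard_predictions == mask.bool()).sum()
    # normalise by the total number of cells (rows * columns)
    return (correct / mask.numel()).item()


# illustrative shapes only: 8 samples, 5 features
logits = torch.randn(8, 5)
mask = torch.randint(0, 2, (8, 5))
print(f"val accuracy: {mask_accuracy(logits, mask):.3f}")
```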
@@ -389,17 +413,10 @@ def fit( # noqa: C901
         self.clf.to(self.dl_params["device"])

         # start finetuning beneath
-        check_classification_targets(y)
-        X, y = check_X_y(X, y, multi_output=False, accept_sparse=False)
+        print("start finetuning...")

         # use in-sample instead of validation set, if None is provided
-        if eval_set:
-            X_val, y_val = eval_set
-            X_val, y_val = check_X_y(
-                X_val, y_val, multi_output=False, accept_sparse=False
-            )
-        else:
-            X_val, y_val = X, y
+        X_val, y_val = eval_set if eval_set is not None else (X, y)

         # save for accuracy calculation
         len_x_val = len(X_val)
@@ -529,8 +546,8 @@ def fit( # noqa: C901
                         )
                         loss_in_epoch_val += val_loss.item()

-                        # print(f"[{epoch}-{val_batch}] val loss: {val_loss}")
                         val_batch += 1
+
                 # loss average over all batches
                 train_loss_all = loss_in_epoch_train / len(train_loader_finetune)
                 val_loss_all = loss_in_epoch_val / len(val_loader_finetune)
