Commit 3193e2b

update ft script for multi-label and reformat with black
1 parent 6bf38db commit 3193e2b

4 files changed: +73 -29 lines changed

evaluate/auto_evaluate.py  (+6 -2)

@@ -23,6 +23,7 @@
 import pandas as pd
 from scipy.stats import pearsonr

+
 def main():

     # Load Run Parameters
@@ -34,13 +35,15 @@ def main():
     logger = logging.getLogger(__name__)

     # Load N-Gram Feature Weights from Logisitic Regression
-    with open(PARAMS['lr_sorted_pairs'], "rb") as f:
+    with open(PARAMS["lr_sorted_pairs"], "rb") as f:
         sorted_pairs = pickle.load(f)

     # Load Results from Explainability Experiments
     all_info_dfs = []
     exp_data_dfs = []
-    for i, results_dir in enumerate([PARAMS['rnd_results'], PARAMS['soc_results'], PARAMS['msp_results']]):
+    for i, results_dir in enumerate(
+        [PARAMS["rnd_results"], PARAMS["soc_results"], PARAMS["msp_results"]]
+    ):
         for j, file in enumerate(glob.glob(os.path.abspath(results_dir + "*"))):
             if "all_info" in file:

@@ -104,6 +107,7 @@ def main():
            f"Algo: {algo} correlation with logistic regression coefficients: {round(corr, 5)} (p={round(p, 5)})."
        )

+
 if __name__ == "__main__":

     main()
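
Note on the hunk above: the script compares each explainability method's n-gram attribution scores against the logistic-regression coefficients with scipy.stats.pearsonr. A minimal sketch of that comparison (not from this commit); the aligned arrays lr_coefs and attr_scores are illustrative placeholders:

import numpy as np
from scipy.stats import pearsonr

# Illustrative placeholders: one LR coefficient and one attribution score
# per shared n-gram feature, in the same order.
lr_coefs = np.array([0.8, -1.2, 0.1, 2.3])
attr_scores = np.array([0.5, -0.9, 0.0, 1.7])

corr, p = pearsonr(lr_coefs, attr_scores)
print(f"correlation with logistic regression coefficients: {round(corr, 5)} (p={round(p, 5)})")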

explain/explain_with_rnd.py  (+2 -2)

@@ -60,8 +60,8 @@ def main():
         dataset = load_from_disk(PARAMS["data"])
     else:
         dataset = load_dataset(PARAMS["data"])
-    tokenizer = AutoTokenizer.from_pretrained(PARAMS['tokenizer'])
-    model = AutoModelForSequenceClassification.from_pretrained(PARAMS['model'])
+    tokenizer = AutoTokenizer.from_pretrained(PARAMS["tokenizer"])
+    model = AutoModelForSequenceClassification.from_pretrained(PARAMS["model"])

     # Tokenize Test Data
     def tokenize_function(batch):
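
For context (not part of this commit), tokenize_function in these scripts is a thin batched wrapper around the tokenizer used with datasets.map. A minimal sketch under that assumption; the checkpoint name and padding settings below are illustrative, not taken from the repo:

from transformers import AutoTokenizer

tokenizer = AutoTokenizer.from_pretrained("distilbert-base-uncased")  # illustrative checkpoint

def tokenize_function(batch):
    # batch["text"] is a list of strings when map() is called with batched=True
    return tokenizer(batch["text"], padding="max_length", truncation=True, max_length=512)

# Typical usage: dataset = dataset.map(tokenize_function, batched=True)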

models/ft.py  (+42 -11)

@@ -99,22 +99,53 @@ def tokenize_function(batch):
         )

     # Define transformation to tokenize data in batches
-    dataset = dataset.map(
-        tokenize_function,
-        batched=True,
-        remove_columns=["text"],
-        batch_size=PARAMS["per_device_train_batch_size"],
-    ).with_format("torch")
+    dataset["train"] = (
+        dataset["train"]
+        .map(
+            tokenize_function,
+            batched=True,
+            remove_columns=["text"],
+            batch_size=PARAMS["per_device_train_batch_size"],
+        )
+        .with_format("torch")
+    )
+    dataset["val"] = (
+        dataset["val"]
+        .map(
+            tokenize_function,
+            batched=True,
+            remove_columns=["text"],
+            batch_size=PARAMS["per_device_eval_batch_size"],
+        )
+        .with_format("torch")
+    )
+    dataset["test"] = (
+        dataset["test"]
+        .map(
+            tokenize_function,
+            batched=True,
+            remove_columns=["text"],
+            batch_size=PARAMS["per_device_test_batch_size"],
+        )
+        .with_format("torch")
+    )
+
+    # Define problem type
+    # PyTorch expects multi-hot labels where each element is a float
+    # For multi-label classification with binary_cross_entropy_with_logits loss
+    if PARAMS["class_strategy"] == "binary":
+        problem_type = "single_label_classification"
+    elif PARAMS["class_strategy"] == "multi_label":
+        problem_type = "multi_label_classification"

     # Create sequence classifier from pretrained model
     model = AutoModelForSequenceClassification.from_pretrained(
-        PARAMS["lm_path"], num_labels=PARAMS["num_labels"], return_dict=True
+        PARAMS["lm_path"],
+        num_labels=PARAMS["num_labels"],
+        return_dict=True,
+        problem_type=problem_type,
     )

-    # Optionally formulate problem as binary classification with one label
-    if PARAMS["class_strategy"] == "binary":
-        model.problem_type = "single_label_classification"
-
     # Define early stopping callback
     early_stopping = EarlyStoppingCallback(
         early_stopping_patience=PARAMS["early_stopping_patience"]
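
The problem_type logic added above is the core of the multi-label update: when problem_type="multi_label_classification" is passed to the model, Transformers computes the loss with binary_cross_entropy_with_logits, which expects multi-hot label vectors of floats. A minimal sketch of that label format (not from this commit); the checkpoint and toy labels are illustrative:

import torch
from transformers import AutoModelForSequenceClassification, AutoTokenizer

checkpoint = "distilbert-base-uncased"  # illustrative; the script uses PARAMS["lm_path"]
tokenizer = AutoTokenizer.from_pretrained(checkpoint)
model = AutoModelForSequenceClassification.from_pretrained(
    checkpoint,
    num_labels=4,
    problem_type="multi_label_classification",
)

# Multi-hot float labels: each example may activate several classes at once.
labels = torch.tensor([[1.0, 0.0, 1.0, 0.0],
                       [0.0, 1.0, 0.0, 0.0]])

inputs = tokenizer(["note one", "note two"], return_tensors="pt", padding=True)
outputs = model(**inputs, labels=labels)  # loss computed with BCEWithLogitsLoss
print(outputs.loss)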

models/lr.py  (+23 -14)

@@ -38,6 +38,7 @@
 from transformers import AutoTokenizer
 from utils import make_lr_model_and_target_multi_class

+
 def main():

     # Load Run Parameters
@@ -49,10 +50,10 @@ def main():
     logger = logging.getLogger(__name__)

     # Define output path and output data name
-    output_path = f"./lr_outputs_{PARAMS['data']}/"  # will be deleted if it already exists
-    output_data_name = (
-        f"ngram_range_{PARAMS['n_gram_range_min']}_{PARAMS['n_gram_range_max']}_features_coefs_count_vec_no_reg"
+    output_path = (
+        f"./lr_outputs_{PARAMS['data']}/"  # will be deleted if it already exists
     )
+    output_data_name = f"ngram_range_{PARAMS['n_gram_range_min']}_{PARAMS['n_gram_range_max']}_features_coefs_count_vec_no_reg"

     # Create Directory to Save Results
     # This script is for demo purposes and **will delete** the `output_path` directory if it exists on each new run.
@@ -75,10 +76,12 @@ def main():
     def tokenize_function(sample):

         return tokenizer(
-            sample["text"], padding="do_not_pad", truncation=True, max_length=PARAMS['max_seq_len']
+            sample["text"],
+            padding="do_not_pad",
+            truncation=True,
+            max_length=PARAMS["max_seq_len"],
         )

-
     # Define transformation to tokenize data in batches
     dataset = dataset.map(
         tokenize_function,
@@ -122,7 +125,7 @@ def fit_pipeline(
         strip_accents="unicode",
         analyzer="word",
         token_pattern=r"\w{1,}",
-        ngram_range=(PARAMS['n_gram_range_min'], PARAMS['n_gram_range_max']),
+        ngram_range=(PARAMS["n_gram_range_min"], PARAMS["n_gram_range_max"]),
         min_df=0.00001,
         max_df=0.2,
     )
@@ -181,15 +184,18 @@ def fit_pipeline(

         return estimator

-
     # Fit vanilla logistic regression
     # Regularization will screw up coefficient interpretation
     clf = LogisticRegression(
-        solver="sag", fit_intercept=True, max_iter=5000, class_weight=None, penalty="none"
+        solver="sag",
+        fit_intercept=True,
+        max_iter=5000,
+        class_weight=None,
+        penalty="none",
     )

     # The MIMIC50 dataset is multi-label
-    if PARAMS['data'] == "mimic50":
+    if PARAMS["data"] == "mimic50":
         clf, y_train = make_lr_model_and_target_multi_class(
             clf, y_train, class_strategy="multi_label", n_jobs=10
         )
@@ -215,15 +221,17 @@ def compute_bootstrap_metrics(labels, preds, n_bootstrap=1000, avg="micro"):

         return np.mean(accs), np.std(accs), np.mean(f1s), np.std(f1s)

-
     # Compute Predicted Class Labels
-    if PARAMS['data'] == "mimic50":
+    if PARAMS["data"] == "mimic50":
         scores = best_estimator.predict_proba(X_test)
-        preds = np.array((scores >= PARAMS['threshold']), dtype=int)
+        preds = np.array((scores >= PARAMS["threshold"]), dtype=int)
         avg = "micro"
     else:
         preds = np.array(
-            [int(x > PARAMS['threshold']) for x in best_estimator.predict_proba(X_test)[:, 1]]
+            [
+                int(x > PARAMS["threshold"])
+                for x in best_estimator.predict_proba(X_test)[:, 1]
+            ]
         )
         avg = "binary"

@@ -242,7 +250,7 @@ def compute_bootstrap_metrics(labels, preds, n_bootstrap=1000, avg="micro"):
         f.write(f1)

     # Extract Final Word Vectorizer and Model
-    if PARAMS['data'] != "mimic50":
+    if PARAMS["data"] != "mimic50":
         word_vectorizer, lr_model = best_estimator[0], best_estimator[1]
     else:
         word_vectorizer, lr_model = best_estimator[0], best_estimator[1]
@@ -273,6 +281,7 @@ def compute_bootstrap_metrics(labels, preds, n_bootstrap=1000, avg="micro"):
     logger.info("Top positive predictors:")
     logger.info(sorted_pairs[-100:])

+
 if __name__ == "__main__":

     main()
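
For the mimic50 (multi-label) branch above, predict_proba returns an (n_samples, n_classes) score matrix and predicted labels come from thresholding it. A minimal sketch of that step, assuming a OneVsRestClassifier wrapper (the repo's make_lr_model_and_target_multi_class helper may construct the model differently), with toy data and an illustrative 0.5 threshold:

import numpy as np
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import f1_score
from sklearn.multiclass import OneVsRestClassifier

rng = np.random.default_rng(0)
X = rng.random((200, 20))              # toy features
y = rng.integers(0, 2, size=(200, 3))  # toy multi-hot targets, 3 classes

clf = OneVsRestClassifier(LogisticRegression(max_iter=5000), n_jobs=10)
clf.fit(X, y)

scores = clf.predict_proba(X)               # shape: (n_samples, n_classes)
preds = np.array(scores >= 0.5, dtype=int)  # threshold analogous to PARAMS["threshold"]
print(f1_score(y, preds, average="micro"))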
