Commit 3193e2b

update ft script for multi-label and reformat with black
1 parent 6bf38db commit 3193e2b

4 files changed: +73 -29 lines changed

evaluate/auto_evaluate.py  (+6 -2)

@@ -23,6 +23,7 @@
 import pandas as pd
 from scipy.stats import pearsonr

+
 def main():

     # Load Run Parameters
@@ -34,13 +35,15 @@ def main():
     logger = logging.getLogger(__name__)

     # Load N-Gram Feature Weights from Logisitic Regression
-    with open(PARAMS['lr_sorted_pairs'], "rb") as f:
+    with open(PARAMS["lr_sorted_pairs"], "rb") as f:
         sorted_pairs = pickle.load(f)

     # Load Results from Explainability Experiments
     all_info_dfs = []
     exp_data_dfs = []
-    for i, results_dir in enumerate([PARAMS['rnd_results'], PARAMS['soc_results'], PARAMS['msp_results']]):
+    for i, results_dir in enumerate(
+        [PARAMS["rnd_results"], PARAMS["soc_results"], PARAMS["msp_results"]]
+    ):
         for j, file in enumerate(glob.glob(os.path.abspath(results_dir + "*"))):
             if "all_info" in file:

@@ -104,6 +107,7 @@ def main():
            f"Algo: {algo} correlation with logistic regression coefficients: {round(corr, 5)} (p={round(p, 5)})."
        )

+
 if __name__ == "__main__":

     main()
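
Note on the hunk above: the script compares each explainability method's n-gram attribution scores against the logistic-regression coefficients with scipy.stats.pearsonr. A minimal sketch of that comparison (not from this commit); the aligned arrays lr_coefs and attr_scores are illustrative placeholders:

import numpy as np
from scipy.stats import pearsonr

# Illustrative placeholders: one LR coefficient and one attribution score
# per shared n-gram feature, in the same order.
lr_coefs = np.array([0.8, -1.2, 0.1, 2.3])
attr_scores = np.array([0.5, -0.9, 0.0, 1.7])

corr, p = pearsonr(lr_coefs, attr_scores)
print(f"correlation with logistic regression coefficients: {round(corr, 5)} (p={round(p, 5)})")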

explain/explain_with_rnd.py  (+2 -2)

@@ -60,8 +60,8 @@ def main():
         dataset = load_from_disk(PARAMS["data"])
     else:
         dataset = load_dataset(PARAMS["data"])
-    tokenizer = AutoTokenizer.from_pretrained(PARAMS['tokenizer'])
-    model = AutoModelForSequenceClassification.from_pretrained(PARAMS['model'])
+    tokenizer = AutoTokenizer.from_pretrained(PARAMS["tokenizer"])
+    model = AutoModelForSequenceClassification.from_pretrained(PARAMS["model"])

     # Tokenize Test Data
     def tokenize_function(batch):
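
For context (not part of this commit), tokenize_function in these scripts is a thin batched wrapper around the tokenizer used with datasets.map. A minimal sketch under that assumption; the checkpoint name and padding settings below are illustrative, not taken from the repo:

from transformers import AutoTokenizer

tokenizer = AutoTokenizer.from_pretrained("distilbert-base-uncased")  # illustrative checkpoint

def tokenize_function(batch):
    # batch["text"] is a list of strings when map() is called with batched=True
    return tokenizer(batch["text"], padding="max_length", truncation=True, max_length=512)

# Typical usage: dataset = dataset.map(tokenize_function, batched=True)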

models/ft.py  (+42 -11)

@@ -99,22 +99,53 @@ def tokenize_function(batch):
         )

     # Define transformation to tokenize data in batches
-    dataset = dataset.map(
-        tokenize_function,
-        batched=True,
-        remove_columns=["text"],
-        batch_size=PARAMS["per_device_train_batch_size"],
-    ).with_format("torch")
+    dataset["train"] = (
+        dataset["train"]
+        .map(
+            tokenize_function,
+            batched=True,
+            remove_columns=["text"],
+            batch_size=PARAMS["per_device_train_batch_size"],
+        )
+        .with_format("torch")
+    )
+    dataset["val"] = (
+        dataset["val"]
+        .map(
+            tokenize_function,
+            batched=True,
+            remove_columns=["text"],
+            batch_size=PARAMS["per_device_eval_batch_size"],
+        )
+        .with_format("torch")
+    )
+    dataset["test"] = (
+        dataset["test"]
+        .map(
+            tokenize_function,
+            batched=True,
+            remove_columns=["text"],
+            batch_size=PARAMS["per_device_test_batch_size"],
+        )
+        .with_format("torch")
+    )
+
+    # Define problem type
+    # PyTorch expects multi-hot labels where each element is a float
+    # For multi-label classification with binary_cross_entropy_with_logits loss
+    if PARAMS["class_strategy"] == "binary":
+        problem_type = "single_label_classification"
+    elif PARAMS["class_strategy"] == "multi_label":
+        problem_type = "multi_label_classification"

     # Create sequence classifier from pretrained model
     model = AutoModelForSequenceClassification.from_pretrained(
-        PARAMS["lm_path"], num_labels=PARAMS["num_labels"], return_dict=True
+        PARAMS["lm_path"],
+        num_labels=PARAMS["num_labels"],
+        return_dict=True,
+        problem_type=problem_type,
     )

-    # Optionally formulate problem as binary classification with one label
-    if PARAMS["class_strategy"] == "binary":
-        model.problem_type = "single_label_classification"
-
     # Define early stopping callback
     early_stopping = EarlyStoppingCallback(
         early_stopping_patience=PARAMS["early_stopping_patience"]
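
The problem_type logic added above is the core of the multi-label update: when problem_type="multi_label_classification" is passed to the model, Transformers computes the loss with binary_cross_entropy_with_logits, which expects multi-hot label vectors of floats. A minimal sketch of that label format (not from this commit); the checkpoint and toy labels are illustrative:

import torch
from transformers import AutoModelForSequenceClassification, AutoTokenizer

checkpoint = "distilbert-base-uncased"  # illustrative; the script uses PARAMS["lm_path"]
tokenizer = AutoTokenizer.from_pretrained(checkpoint)
model = AutoModelForSequenceClassification.from_pretrained(
    checkpoint,
    num_labels=4,
    problem_type="multi_label_classification",
)

# Multi-hot float labels: each example may activate several classes at once.
labels = torch.tensor([[1.0, 0.0, 1.0, 0.0],
                       [0.0, 1.0, 0.0, 0.0]])

inputs = tokenizer(["note one", "note two"], return_tensors="pt", padding=True)
outputs = model(**inputs, labels=labels)  # loss computed with BCEWithLogitsLoss
print(outputs.loss)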

models/lr.py  (+23 -14)

@@ -38,6 +38,7 @@
 from transformers import AutoTokenizer
 from utils import make_lr_model_and_target_multi_class

+
 def main():

     # Load Run Parameters
@@ -49,10 +50,10 @@ def main():
     logger = logging.getLogger(__name__)

     # Define output path and output data name
-    output_path = f"./lr_outputs_{PARAMS['data']}/"  # will be deleted if it already exists
-    output_data_name = (
-        f"ngram_range_{PARAMS['n_gram_range_min']}_{PARAMS['n_gram_range_max']}_features_coefs_count_vec_no_reg"
+    output_path = (
+        f"./lr_outputs_{PARAMS['data']}/"  # will be deleted if it already exists
     )
+    output_data_name = f"ngram_range_{PARAMS['n_gram_range_min']}_{PARAMS['n_gram_range_max']}_features_coefs_count_vec_no_reg"

     # Create Directory to Save Results
     # This script is for demo purposes and **will delete** the `output_path` directory if it exists on each new run.
@@ -75,10 +76,12 @@ def main():
     def tokenize_function(sample):

         return tokenizer(
-            sample["text"], padding="do_not_pad", truncation=True, max_length=PARAMS['max_seq_len']
+            sample["text"],
+            padding="do_not_pad",
+            truncation=True,
+            max_length=PARAMS["max_seq_len"],
         )

-
     # Define transformation to tokenize data in batches
     dataset = dataset.map(
         tokenize_function,
@@ -122,7 +125,7 @@ def fit_pipeline(
         strip_accents="unicode",
         analyzer="word",
         token_pattern=r"\w{1,}",
-        ngram_range=(PARAMS['n_gram_range_min'], PARAMS['n_gram_range_max']),
+        ngram_range=(PARAMS["n_gram_range_min"], PARAMS["n_gram_range_max"]),
         min_df=0.00001,
         max_df=0.2,
     )
@@ -181,15 +184,18 @@ def fit_pipeline(

         return estimator

-
     # Fit vanilla logistic regression
     # Regularization will screw up coefficient interpretation
     clf = LogisticRegression(
-        solver="sag", fit_intercept=True, max_iter=5000, class_weight=None, penalty="none"
+        solver="sag",
+        fit_intercept=True,
+        max_iter=5000,
+        class_weight=None,
+        penalty="none",
     )

     # The MIMIC50 dataset is multi-label
-    if PARAMS['data'] == "mimic50":
+    if PARAMS["data"] == "mimic50":
         clf, y_train = make_lr_model_and_target_multi_class(
             clf, y_train, class_strategy="multi_label", n_jobs=10
         )
@@ -215,15 +221,17 @@ def compute_bootstrap_metrics(labels, preds, n_bootstrap=1000, avg="micro"):

         return np.mean(accs), np.std(accs), np.mean(f1s), np.std(f1s)

-
     # Compute Predicted Class Labels
-    if PARAMS['data'] == "mimic50":
+    if PARAMS["data"] == "mimic50":
         scores = best_estimator.predict_proba(X_test)
-        preds = np.array((scores >= PARAMS['threshold']), dtype=int)
+        preds = np.array((scores >= PARAMS["threshold"]), dtype=int)
         avg = "micro"
     else:
         preds = np.array(
-            [int(x > PARAMS['threshold']) for x in best_estimator.predict_proba(X_test)[:, 1]]
+            [
+                int(x > PARAMS["threshold"])
+                for x in best_estimator.predict_proba(X_test)[:, 1]
+            ]
         )
         avg = "binary"

@@ -242,7 +250,7 @@ def compute_bootstrap_metrics(labels, preds, n_bootstrap=1000, avg="micro"):
         f.write(f1)

     # Extract Final Word Vectorizer and Model
-    if PARAMS['data'] != "mimic50":
+    if PARAMS["data"] != "mimic50":
         word_vectorizer, lr_model = best_estimator[0], best_estimator[1]
     else:
         word_vectorizer, lr_model = best_estimator[0], best_estimator[1]
@@ -273,6 +281,7 @@ def compute_bootstrap_metrics(labels, preds, n_bootstrap=1000, avg="micro"):
     logger.info("Top positive predictors:")
     logger.info(sorted_pairs[-100:])

+
 if __name__ == "__main__":

     main()
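
For the mimic50 (multi-label) branch above, predict_proba returns an (n_samples, n_classes) score matrix and predicted labels come from thresholding it. A minimal sketch of that step, assuming a OneVsRestClassifier wrapper (the repo's make_lr_model_and_target_multi_class helper may construct the model differently), with toy data and an illustrative 0.5 threshold:

import numpy as np
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import f1_score
from sklearn.multiclass import OneVsRestClassifier

rng = np.random.default_rng(0)
X = rng.random((200, 20))              # toy features
y = rng.integers(0, 2, size=(200, 3))  # toy multi-hot targets, 3 classes

clf = OneVsRestClassifier(LogisticRegression(max_iter=5000), n_jobs=10)
clf.fit(X, y)

scores = clf.predict_proba(X)               # shape: (n_samples, n_classes)
preds = np.array(scores >= 0.5, dtype=int)  # threshold analogous to PARAMS["threshold"]
print(f1_score(y, preds, average="micro"))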
