from transformers import AutoTokenizer
from utils import make_lr_model_and_target_multi_class

+
def main():

    # Load Run Parameters
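The `PARAMS` dict referenced throughout the hunks below is loaded here but not shown in the diff; as a point of reference, a hypothetical set of the keys the changed lines actually use might look like this:

```python
# Hypothetical run parameters; the real values and loading mechanism are not shown in this diff.
PARAMS = {
    "data": "mimic50",        # dataset name, also used in the output path
    "n_gram_range_min": 1,    # lower bound of the word n-gram range
    "n_gram_range_max": 2,    # upper bound of the word n-gram range
    "max_seq_len": 512,       # truncation length for the tokenizer
    "threshold": 0.5,         # decision threshold applied to predicted probabilities
}
```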
@@ -49,10 +50,10 @@ def main():
    logger = logging.getLogger(__name__)

    # Define output path and output data name
-    output_path = f"./lr_outputs_{PARAMS['data']}/"  # will be deleted if it already exists
-    output_data_name = (
-        f"ngram_range_{PARAMS['n_gram_range_min']}_{PARAMS['n_gram_range_max']}_features_coefs_count_vec_no_reg"
+    output_path = (
+        f"./lr_outputs_{PARAMS['data']}/"  # will be deleted if it already exists
    )
+    output_data_name = f"ngram_range_{PARAMS['n_gram_range_min']}_{PARAMS['n_gram_range_max']}_features_coefs_count_vec_no_reg"

    # Create Directory to Save Results
    # This script is for demo purposes and **will delete** the `output_path` directory if it exists on each new run.
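The comment above warns that `output_path` is wiped on every run. The directory-creation code itself is outside this hunk; a minimal sketch of that delete-and-recreate behaviour, assuming only the standard library, would be:

```python
import os
import shutil

# Remove any previous results, then recreate an empty output directory.
# This mirrors the behaviour described in the comment above; the actual
# implementation in the script is not shown in this hunk.
if os.path.exists(output_path):
    shutil.rmtree(output_path)
os.makedirs(output_path)
```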
@@ -75,10 +76,12 @@ def main():
    def tokenize_function(sample):

        return tokenizer(
-            sample["text"], padding="do_not_pad", truncation=True, max_length=PARAMS['max_seq_len']
+            sample["text"],
+            padding="do_not_pad",
+            truncation=True,
+            max_length=PARAMS["max_seq_len"],
        )

-
    # Define transformation to tokenize data in batches
    dataset = dataset.map(
        tokenize_function,
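For context, a self-contained sketch of this tokenisation step, assuming a BERT-style checkpoint and a dataset with a `text` column (neither is visible in the hunk), and `batched=True` to match the "in batches" comment:

```python
from datasets import load_dataset
from transformers import AutoTokenizer

# Assumptions for illustration: checkpoint name, dataset, and max_length.
tokenizer = AutoTokenizer.from_pretrained("bert-base-uncased")
dataset = load_dataset("imdb", split="train[:1%]")

def tokenize_function(sample):
    # padding="do_not_pad" leaves sequences unpadded; truncation caps them at max_length.
    return tokenizer(
        sample["text"],
        padding="do_not_pad",
        truncation=True,
        max_length=512,
    )

# batched=True passes a dict of lists to tokenize_function, which is what the call above relies on.
dataset = dataset.map(tokenize_function, batched=True)
```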
@@ -122,7 +125,7 @@ def fit_pipeline(
        strip_accents="unicode",
        analyzer="word",
        token_pattern=r"\w{1,}",
-        ngram_range=(PARAMS['n_gram_range_min'], PARAMS['n_gram_range_max']),
+        ngram_range=(PARAMS["n_gram_range_min"], PARAMS["n_gram_range_max"]),
        min_df=0.00001,
        max_df=0.2,
    )
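The constructor these keyword arguments belong to is outside the hunk; going by the `count_vec` tag in `output_data_name`, a `CountVectorizer` with the same settings would look roughly like this (the same arguments also work for `TfidfVectorizer`):

```python
from sklearn.feature_extraction.text import CountVectorizer

# Word-level n-gram vectorizer using the settings from the hunk above.
# CountVectorizer is an assumption based on the "count_vec" output name.
word_vectorizer = CountVectorizer(
    strip_accents="unicode",
    analyzer="word",
    token_pattern=r"\w{1,}",  # one or more word characters per token
    ngram_range=(PARAMS["n_gram_range_min"], PARAMS["n_gram_range_max"]),
    min_df=0.00001,           # drop extremely rare n-grams
    max_df=0.2,               # drop n-grams appearing in more than 20% of documents
)
```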
@@ -181,15 +184,18 @@ def fit_pipeline(

        return estimator

-
    # Fit vanilla logistic regression
    # Regularization will screw up coefficient interpretation
    clf = LogisticRegression(
-        solver="sag", fit_intercept=True, max_iter=5000, class_weight=None, penalty="none"
+        solver="sag",
+        fit_intercept=True,
+        max_iter=5000,
+        class_weight=None,
+        penalty="none",
    )

    # The MIMIC50 dataset is multi-label
-    if PARAMS['data'] == "mimic50":
+    if PARAMS["data"] == "mimic50":
        clf, y_train = make_lr_model_and_target_multi_class(
            clf, y_train, class_strategy="multi_label", n_jobs=10
        )
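`make_lr_model_and_target_multi_class` lives in `utils` and is not part of this diff; for the multi-label MIMIC50 branch it presumably wraps the classifier one-vs-rest and binarises the targets. A hypothetical sketch of that behaviour, for illustration only:

```python
from sklearn.multiclass import OneVsRestClassifier
from sklearn.preprocessing import MultiLabelBinarizer

def make_lr_model_and_target_multi_class(clf, y_train, class_strategy="multi_label", n_jobs=10):
    # Hypothetical reimplementation; the real helper is defined in utils.
    if class_strategy == "multi_label":
        # One independent logistic regression per label, fitted in parallel.
        clf = OneVsRestClassifier(clf, n_jobs=n_jobs)
        # Turn lists of label ids into a binary indicator matrix.
        y_train = MultiLabelBinarizer().fit_transform(y_train)
    return clf, y_train
```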
@@ -215,15 +221,17 @@ def compute_bootstrap_metrics(labels, preds, n_bootstrap=1000, avg="micro"):

        return np.mean(accs), np.std(accs), np.mean(f1s), np.std(f1s)

-
    # Compute Predicted Class Labels
-    if PARAMS['data'] == "mimic50":
+    if PARAMS["data"] == "mimic50":
        scores = best_estimator.predict_proba(X_test)
-        preds = np.array((scores >= PARAMS['threshold']), dtype=int)
+        preds = np.array((scores >= PARAMS["threshold"]), dtype=int)
        avg = "micro"
    else:
        preds = np.array(
-            [int(x > PARAMS['threshold']) for x in best_estimator.predict_proba(X_test)[:, 1]]
+            [
+                int(x > PARAMS["threshold"])
+                for x in best_estimator.predict_proba(X_test)[:, 1]
+            ]
        )
        avg = "binary"

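Only the return statement of `compute_bootstrap_metrics` appears in this hunk; a sketch of a body consistent with it, assuming a standard bootstrap over the (labels, preds) pairs:

```python
import numpy as np
from sklearn.metrics import accuracy_score, f1_score

def compute_bootstrap_metrics(labels, preds, n_bootstrap=1000, avg="micro"):
    # Sketch only: the resampling details are assumptions, since the hunk
    # shows just the final return statement.
    labels, preds = np.asarray(labels), np.asarray(preds)
    rng = np.random.default_rng(0)
    accs, f1s = [], []
    for _ in range(n_bootstrap):
        # Resample example indices with replacement and re-score each replicate.
        idx = rng.integers(0, len(labels), size=len(labels))
        accs.append(accuracy_score(labels[idx], preds[idx]))
        f1s.append(f1_score(labels[idx], preds[idx], average=avg))
    return np.mean(accs), np.std(accs), np.mean(f1s), np.std(f1s)
```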
@@ -242,7 +250,7 @@ def compute_bootstrap_metrics(labels, preds, n_bootstrap=1000, avg="micro"):
        f.write(f1)

    # Extract Final Word Vectorizer and Model
-    if PARAMS['data'] != "mimic50":
+    if PARAMS["data"] != "mimic50":
        word_vectorizer, lr_model = best_estimator[0], best_estimator[1]
    else:
        word_vectorizer, lr_model = best_estimator[0], best_estimator[1]
@@ -273,6 +281,7 @@ def compute_bootstrap_metrics(labels, preds, n_bootstrap=1000, avg="micro"):
    logger.info("Top positive predictors:")
    logger.info(sorted_pairs[-100:])

+
if __name__ == "__main__":

    main()
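The construction of `sorted_pairs` is outside the hunks shown; for the binary case it plausibly pairs each vocabulary term with its unregularised coefficient and sorts by weight, e.g.:

```python
# Sketch for the binary case; the actual construction is not in this diff.
# For the multi-label model the coefficients would be per-label rather than a single vector.
feature_names = word_vectorizer.get_feature_names_out()
coefs = lr_model.coef_.ravel()
sorted_pairs = sorted(zip(coefs, feature_names))  # ascending: most negative first
# The last 100 entries are then the strongest positive predictors, as logged above.
```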