-
Notifications
You must be signed in to change notification settings - Fork 1
/
Copy pathdvc.lock
319 lines (319 loc) · 10.1 KB
/
dvc.lock
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
258
259
260
261
262
263
264
265
266
267
268
269
270
271
272
273
274
275
276
277
278
279
280
281
282
283
284
285
286
287
288
289
290
291
292
293
294
295
296
297
298
299
300
301
302
303
304
305
306
307
308
309
310
311
312
313
314
315
316
317
318
319
schema: '2.0'
stages:
clean_data:
cmd: python src\data\clean_data.py
deps:
- path: data\training
md5: 7a96cfa2ff8c169761a068447405c17e.dir
size: 5926077
nfiles: 3
- path: src\data\clean_data.py
md5: 446fefd52294d288399ab790a42b6388
size: 2633
outs:
- path: data\prepared
md5: 15682c488e4b458bdc8b68fb97c82852.dir
size: 591595
nfiles: 1
generate_training_features:
cmd: python -W ignore src\data\generate_training_features.py
deps:
- path: data\prepared
md5: 15682c488e4b458bdc8b68fb97c82852.dir
size: 591595
nfiles: 1
- path: src\data\generate_training_features.py
md5: 3bce09ebf41b12d3b25c3edbebdb3c71
size: 7590
- path: src\models\feature_eng
md5: bd374ef549941e43b16e8b2da60001d4.dir
size: 45257
nfiles: 9
params:
params.yaml:
featurize:
binary_encoder:
columns:
- RESOURCE
- ROLE_ROLLUP_1
- ROLE_ROLLUP_2
- ROLE_DEPTNAME
- ROLE_FAMILY_DESC
- ROLE_FAMILY
- ROLE_CODE
combine_feat:
targetcol: ACTION
ignore_columns:
- ROLE_TITLE
- MGR_ID
- RESOURCE
resource_catagory_encode:
column_to_consider:
- ROLE_DEPTNAME_ROLE_FAMILY
concat_result_to_input: true
ktarget_enc:
n_fold: 15
random_seed: 2023
targetcol: ACTION
columns: []
concat_result_to_input: true
random_catagory_encode:
random_seed: 2023
targetcol: ACTION
random_cnt: 5
columns:
- RESOURCE
- ROLE_ROLLUP_1
- ROLE_ROLLUP_2
- ROLE_DEPTNAME
- ROLE_FAMILY_DESC
- ROLE_FAMILY
- ROLE_CODE
fequency_encode:
min_group_size: 2
n_fold: 5
random_seed: 2023
targetcol: ACTION
log_transform: true
concat_result_to_input: true
columns:
- ROLE_DEPTNAME
- ROLE_ROLLUP_1_ROLE_DEPTNAME
- ROLE_ROLLUP_2_ROLE_DEPTNAME
- ROLE_ROLLUP_2_ROLE_CODE
- ROLE_DEPTNAME_ROLE_FAMILY_DESC
- ROLE_DEPTNAME_ROLE_FAMILY
- ROLE_DEPTNAME_ROLE_CODE
tfidf:
random_seed: 2023
pair_columns:
- RESOURCE
permute_columns:
- ROLE_ROLLUP_1
- ROLE_ROLLUP_2
- ROLE_FAMILY
- ROLE_CODE
- ROLE_ROLLUP_2_ROLE_CODE
targetcol: ACTION
combine_columns_required: false
concat_result_to_input: true
dim_reduction:
ROLE_ROLLUP_1: 27
ROLE_ROLLUP_2: 21
ROLE_FAMILY: 15
ROLE_CODE: 60
ROLE_ROLLUP_2_ROLE_CODE: 90
output:
folder: feature\tfidf
filename: tfidf.parquet
count_vector:
random_seed: 2023
targetcol: ACTION
combine_columns_required: false
concat_result_to_input: true
columns:
- RESOURCE
- ROLE_ROLLUP_1
- ROLE_ROLLUP_2
- ROLE_DEPTNAME
- ROLE_FAMILY_DESC
- ROLE_FAMILY
- ROLE_CODE
permute_columns:
- RESOURCE_ROLE_ROLLUP_1
- ROLE_ROLLUP_1_ROLE_ROLLUP_2
- ROLE_ROLLUP_1_ROLE_FAMILY_DESC
- ROLE_ROLLUP_1_ROLE_FAMILY
- ROLE_ROLLUP_1_ROLE_CODE
- ROLE_FAMILY_DESC_ROLE_FAMILY
- ROLE_FAMILY_ROLE_ROLLUP_2
- ROLE_ROLLUP_2_ROLE_FAMILY_DESC
- ROLE_FAMILY_DESC_ROLE_CODE
- ROLE_ROLLUP_1_RESOURCE
- ROLE_ROLLUP_2_ROLE_ROLLUP_1
- ROLE_FAMILY_DESC_ROLE_ROLLUP_1
- ROLE_FAMILY_ROLE_ROLLUP_1
- ROLE_CODE_ROLE_ROLLUP_1
- ROLE_FAMILY_ROLE_FAMILY_DESC
- ROLE_ROLLUP_2_ROLE_FAMILY
- ROLE_FAMILY_DESC_ROLE_ROLLUP_2
- ROLE_CODE_ROLE_FAMILY_DESC
- ROLE_ROLLUP_1_ROLE_DEPTNAME
- ROLE_DEPTNAME_ROLE_ROLLUP_1
dim_reduction:
RESOURCE_ROLE_ROLLUP_1: 1
ROLE_ROLLUP_1_ROLE_ROLLUP_2: 1
ROLE_ROLLUP_1_ROLE_FAMILY_DESC: 1
ROLE_ROLLUP_1_ROLE_FAMILY: 1
ROLE_ROLLUP_1_ROLE_CODE: 1
ROLE_FAMILY_DESC_ROLE_FAMILY: 1
ROLE_FAMILY_ROLE_ROLLUP_2: 3
ROLE_ROLLUP_2_ROLE_FAMILY_DESC: 4
ROLE_FAMILY_DESC_ROLE_CODE: 5
ROLE_ROLLUP_1_RESOURCE: 1
ROLE_ROLLUP_2_ROLE_ROLLUP_1: 1
ROLE_FAMILY_DESC_ROLE_ROLLUP_1: 1
ROLE_FAMILY_ROLE_ROLLUP_1: 1
ROLE_CODE_ROLE_ROLLUP_1: 1
ROLE_FAMILY_ROLE_FAMILY_DESC: 1
ROLE_ROLLUP_2_ROLE_FAMILY: 3
ROLE_FAMILY_DESC_ROLE_ROLLUP_2: 4
ROLE_CODE_ROLE_FAMILY_DESC: 5
ROLE_ROLLUP_1_ROLE_DEPTNAME: 5
ROLE_DEPTNAME_ROLE_ROLLUP_1: 7
output:
folder: feature\cntvector
filename: count_vectorizer.parquet
model.bagging_decision_tree.pipeline_type:
KFoldTE: false
frequency_encoding: true
KFold_frequency_encoding: false
tfidf_vectorizer_encoding: false
count_vectorizer_encoding: false
random_catagory_encode: false
resource_catagory_encode: true
binary_encode: false
model.decision_tree.pipeline_type:
KFoldTE: false
frequency_encoding: false
KFold_frequency_encoding: false
tfidf_vectorizer_encoding: false
count_vectorizer_encoding: true
random_catagory_encode: false
resource_catagory_encode: true
binary_encode: false
model.extra_decision_tree.pipeline_type:
KFoldTE: true
frequency_encoding: false
KFold_frequency_encoding: false
tfidf_vectorizer_encoding: true
count_vectorizer_encoding: false
random_catagory_encode: false
resource_catagory_encode: false
binary_encode: false
model.logistic_reg.pipeline_type:
KFoldTE: false
frequency_encoding: false
KFold_frequency_encoding: false
tfidf_vectorizer_encoding: false
count_vectorizer_encoding: false
random_catagory_encode: false
resource_catagory_encode: false
binary_encode: true
model.model_type: logistic_reg
model.random_forest.pipeline_type:
KFoldTE: false
frequency_encoding: true
KFold_frequency_encoding: false
tfidf_vectorizer_encoding: true
count_vectorizer_encoding: false
random_catagory_encode: false
resource_catagory_encode: false
binary_encode: false
model.xgboost.pipeline_type:
KFoldTE: false
frequency_encoding: true
KFold_frequency_encoding: false
tfidf_vectorizer_encoding: false
count_vectorizer_encoding: true
random_catagory_encode: false
resource_catagory_encode: true
binary_encode: false
train_test_split:
test_size: 0.1
random_seed: 7899
cv: 3
train_data: feature\train_data.parquet
outs:
- path: data\feature\
md5: 83a8e2f3fec8358e256fe5aa94fa7043.dir
size: 481071
nfiles: 1
train_model:
cmd: python -W ignore src\data\train_model.py
deps:
- path: data\feature\
md5: 83a8e2f3fec8358e256fe5aa94fa7043.dir
size: 481071
nfiles: 1
- path: src\data\train_model.py
md5: e1b9ed673e1583069ce9b5d161c65931
size: 11482
params:
params.yaml:
model.bagging_decision_tree.hyper_params:
base_estimator:
random_state: 1907
max_depth: 21
splitter: best
ccp_alpha: 2.4e-05
bagging:
random_seed: 1907
n_estimators: 19
model.decision_tree.hyper_params:
random_state: 1907
max_depth: 21
splitter: best
ccp_alpha: 2.4e-05
model.extra_decision_tree.hyper_params:
random_state: 56
n_estimators: 55
max_depth: 4
bootstrap: true
max_samples: 0.75
max_features: 0.7
min_samples_leaf: 0.5
class_weight: balanced
model.logistic_reg.hyper_params:
random_state: 2023
max_iter: 370
penalty: l2
solver: liblinear
fit_intercept: true
C: 0.01
class_weight: balanced
n_jobs: -1
model.model_type: logistic_reg
model.random_forest.hyper_params:
random_state: 42
n_estimators: 220
max_depth: 5
bootstrap: true
max_samples: 0.5
max_features: 0.5
min_samples_leaf: 0.005
class_weight: balanced
model.xgboost.hyper_params:
n_estimators: 300
reg_lambda: 1.1800000000000002
max_depth: 4
learning_rate: 0.07
random_state: 2045
colsample_bytree: 0.685
outs:
- path: data\model\metrics\metrics.json
md5: 3dc13ddeb43be83e4b69ac401c6fdb9c
size: 76
evaluate_model:
cmd: python src\data\eval.py
deps:
- path: data\model
md5: e16941fb70e9c3401de9df9dc9870e9f.dir
size: 32977
nfiles: 1
- path: src\data\eval.py
md5: 64779cb95c7bf90ff0f45abd4722d6a2
size: 12080
outs:
- path: data\eval\metrics\metrics.json
md5: bd822b5620ed49d00a7ee86bb046b3ba
size: 80
- path: data\eval\plots\confusion_matrix.png
md5: 8ccc83fbfab075ed404888d66c50f761
size: 16259
- path: data\eval\plots\pr_rc_curve.png
md5: 0ee74bacf57f7daa655da842106347d9
size: 23723
- path: data\eval\plots\roc_curve.png
md5: e5d3dd354e83c2731d920315c6a3023d
size: 25948