Skip to content

Commit 93bb892

Browse files
committed
removed another source of individual importances not summing up to one
1 parent e9e58e1 commit 93bb892

File tree

2 files changed

+26
-32
lines changed

2 files changed

+26
-32
lines changed

fANOVA demo.ipynb

+8-21
Original file line numberDiff line numberDiff line change
@@ -9,9 +9,9 @@
99
"outputs": [],
1010
"source": [
1111
"import sys\n",
12-
"sys.path.append('/ihome/sfalkner/repositories/github/random_forest_run/build')\n",
13-
"sys.path.append('/ihome/sfalkner/repositories/github/ConfigSpace')\n",
14-
"sys.path.append('/ihome/sfalkner/repositories/github/fanova')\n",
12+
"sys.path.append('/home/sfalkner/repositories/github/random_forest_run/build')\n",
13+
"sys.path.append('/home/sfalkner/repositories/github/ConfigSpace')\n",
14+
"sys.path.append('/home/sfalkner/repositories/github/fanova')\n",
1515
"\n",
1616
"import numpy as np\n",
1717
"import matplotlib.pyplot as plt\n",
@@ -30,8 +30,8 @@
3030
"outputs": [],
3131
"source": [
3232
"# load data\n",
33-
"X_full = np.loadtxt('/ihome/sfalkner/repositories/github/random_forest_run/test_data_sets/online_lda_features.csv', delimiter=',')\n",
34-
"y_full = np.loadtxt('/ihome/sfalkner/repositories/github/random_forest_run/test_data_sets/online_lda_responses.csv', delimiter=',')\n",
33+
"X_full = np.loadtxt('/home/sfalkner/repositories/github/random_forest_run/test_data_sets/online_lda_features.csv', delimiter=',')\n",
34+
"y_full = np.loadtxt('/home/sfalkner/repositories/github/random_forest_run/test_data_sets/online_lda_responses.csv', delimiter=',')\n",
3535
"\n",
3636
"\n",
3737
"#n_samples = X_full.shape[0]//2\n",
@@ -52,17 +52,6 @@
5252
"f = fanova.fANOVA(X,y, n_trees=32,bootstrapping=True)"
5353
]
5454
},
55-
{
56-
"cell_type": "code",
57-
"execution_count": null,
58-
"metadata": {
59-
"collapsed": false
60-
},
61-
"outputs": [],
62-
"source": [
63-
"f.quantify_importance(0)"
64-
]
65-
},
6655
{
6756
"cell_type": "markdown",
6857
"metadata": {
@@ -188,7 +177,7 @@
188177
"source": [
189178
"f.set_cutoffs((-np.inf, np.inf)) #just reset the cutoffs\n",
190179
"print(f.trees_total_variance)\n",
191-
"f.set_cutoffs((0,2000))\n",
180+
"#f.set_cutoffs((0,2000))\n",
192181
"print(f.trees_total_variance)\n",
193182
"\n",
194183
"importance_dict = f.quantify_importance([0,1,2]) "
@@ -245,9 +234,7 @@
245234
"\n",
246235
"data=np.hstack([X_full,y_full[:,None]])\n",
247236
"np.savetxt('/tmp/tmp_data.csv', data, delimiter=',')\n",
248-
"f = FanovaFromCSV(\"/tmp/tmp_data.csv\")\n",
249-
"\n",
250-
"# fails in IPython, but might work in a script -> TODO!"
237+
"f = FanovaFromCSV(\"/tmp/tmp_data.csv\")\n"
251238
]
252239
},
253240
{
@@ -276,7 +263,7 @@
276263
"name": "python",
277264
"nbconvert_exporter": "python",
278265
"pygments_lexer": "ipython3",
279-
"version": "3.6.0"
266+
"version": "3.4.5"
280267
}
281268
},
282269
"nbformat": 4,

fanova/fanova.py

+18-11
Original file line numberDiff line numberDiff line change
@@ -89,7 +89,6 @@ def __init__(self, X, Y, config_space=None,
8989
else:
9090
pcs[i] = (hp.lower, hp.upper)
9191

92-
print(pcs)
9392
# set forest options
9493
forest = reg.fanova_forest()
9594
forest.options.num_trees = n_trees
@@ -108,7 +107,7 @@ def __init__(self, X, Y, config_space=None,
108107
rng = reg.default_random_engine()
109108
else:
110109
rng = reg.default_random_engine(seed)
111-
data = reg.data_container(X.shape[1])
110+
data = reg.default_data_container(X.shape[1])
112111

113112
for i, (mn,mx) in enumerate(pcs):
114113
if(np.isnan(mx)):
@@ -142,8 +141,16 @@ def __init__(self, X, Y, config_space=None,
142141
midpoints = []
143142
for i, split_vals in enumerate(tree_split_values):
144143
if np.isnan(pcs[i][1]): # categorical parameter
145-
midpoints.append(split_vals)
146-
sizes.append( np.ones(len(split_vals)))
144+
# check if the tree actually splits on this parameter
145+
if len(split_vals) > 0:
146+
midpoints.append(split_vals)
147+
sizes.append( np.ones(len(split_vals)))
148+
# if not, simply append 0 as the value with the number
149+
# of categories as the size, that way this parameter will
150+
# get 0 importance from this tree.
151+
else:
152+
midpoints.append((0,))
153+
sizes.append((pcs[i][0],))
147154
else:
148155
# add bounds to split values
149156
sv = np.array([pcs[i][0]] + list(split_vals) + [pcs[i][1]])
@@ -240,7 +247,7 @@ def __compute_marginals(self, dimensions):
240247
for i, (m, s) in enumerate(zip(prod_midpoints, prod_sizes)):
241248
sample[list(dimensions)] = list(m)
242249
ls = self.the_forest.marginal_prediction_stat_of_tree(tree_idx, sample.tolist())
243-
print(sample, ls.mean())
250+
#print(sample, ls.mean())
244251
if not np.isnan(ls.mean()):
245252
stat.push( ls.mean(), np.prod(np.array(s)) * ls.sum_of_weights())
246253

@@ -272,12 +279,12 @@ def quantify_importance(self, dimensions):
272279
for k in range(1, len(dimensions)+1):
273280
for sub_dims in it.combinations(dimensions, k):
274281
importance_dict[sub_dims] = {}
275-
fractions_total = [self.V_U_total[sub_dims][t]/self.trees_total_variance[t] for t in range(self.n_trees)]
276-
fractions_individual = [self.V_U_individual[sub_dims][t]/self.trees_total_variance[t] for t in range(self.n_trees)]
277-
# TODO: clean NANs here and catch zero variance in a tree!
278-
279-
importance_dict[sub_dims]['individual importance'] = np.mean(fractions_individual)
280-
importance_dict[sub_dims]['total importance'] = np.mean(fractions_total)
282+
fractions_total = np.array([self.V_U_total[sub_dims][t]/self.trees_total_variance[t] for t in range(self.n_trees)])
283+
fractions_individual = np.array([self.V_U_individual[sub_dims][t]/self.trees_total_variance[t] for t in range(self.n_trees)])
284+
# clean NaNs here to catch zero variance in a tree
285+
indices = np.logical_and(~np.isnan(fractions_individual), ~np.isnan(fractions_total))
286+
importance_dict[sub_dims]['individual importance'] = np.mean(fractions_individual[indices])
287+
importance_dict[sub_dims]['total importance'] = np.mean(fractions_total[indices])
281288

282289
return(importance_dict)
283290

0 commit comments

Comments
 (0)