@@ -17,21 +17,6 @@ use rayon::iter::IntoParallelRefIterator;
use rayon::iter::ParallelIterator;
use std::collections::BTreeMap;

-/// number of kmeans centroids.
-/// this determines the granularity of the abstraction space.
-///
-/// - CPU: O(N^2) for kmeans initialization
-/// - CPU: O(N) for kmeans clustering
-/// - RAM: O(N^2) for learned metric
-/// - RAM: O(N) for learned centroids
-const N_KMEANS_CENTROIDS: usize = 256;
-
-/// number of kmeans iterations.
-/// this controls the precision of the abstraction space.
-///
-/// - CPU: O(N) for kmeans clustering
-const N_KMEANS_ITERATION: usize = 64;
-
/// Hierarchical K Means Learner.
/// this is decomposed into the necessary data structures
/// for kmeans clustering to occur for a given `Street`.
@@ -61,6 +46,35 @@ pub struct Layer {
}

impl Layer {
+    /// number of kmeans centroids.
+    /// this determines the granularity of the abstraction space.
+    ///
+    /// - CPU: O(N^2) for kmeans initialization
+    /// - CPU: O(N) for kmeans clustering
+    /// - RAM: O(N^2) for learned metric
+    /// - RAM: O(N) for learned centroids
+    const fn k(street: Street) -> usize {
+        match street {
+            Street::Pref => 169,
+            Street::Flop => 8,
+            Street::Turn => 8,
+            Street::Rive => unreachable!(),
+        }
+    }
+
+    /// number of kmeans iterations.
+    /// this controls the precision of the abstraction space.
+    ///
+    /// - CPU: O(N) for kmeans clustering
+    const fn t(street: Street) -> usize {
+        match street {
+            Street::Pref => 0,
+            Street::Flop => 128,
+            Street::Turn => 32,
+            Street::Rive => unreachable!(),
+        }
+    }
+
    /// start with the River layer. everything is empty because we
    /// can generate `Abstractor` and `SmallSpace` from "scratch".
    /// - `lookup`: lazy equity calculation of river observations
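As a quick reference for the per-street granularity that the new `k`/`t` tables encode: preflop keeps all 169 isomorphic hole-card classes and skips clustering entirely, while flop and turn are compressed to 8 centroids trained for 128 and 32 iterations respectively. A minimal sanity sketch (hypothetical test code, not part of this commit; it assumes the test module can see these private associated functions):

#[test]
fn street_granularity() {
    // values copied from the match arms in the hunk above
    assert_eq!(Layer::k(Street::Pref), 169); // one centroid per isomorphic preflop hand
    assert_eq!(Layer::t(Street::Pref), 0); // so no kmeans iterations are needed
    assert_eq!(Layer::k(Street::Flop), 8);
    assert_eq!(Layer::t(Street::Flop), 128);
    assert_eq!(Layer::k(Street::Turn), 8);
    assert_eq!(Layer::t(Street::Turn), 32);
}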
@@ -95,8 +109,8 @@ impl Layer {
    }
    /// save the current layer's `Metric` and `Abstractor` to disk
    pub fn save(self) -> Self {
-        self.metric.save(format!("{}", self.street.next())); // outer layer generates this purely (metric over projections)
-        self.lookup.save(format!("{}", self.street)); // while inner layer generates this (clusters)
+        self.metric.save(self.street.next()); // outer layer generates this purely (metric over projections)
+        self.lookup.save(self.street); // while inner layer generates this (clusters)
        self
    }

@@ -115,7 +129,7 @@ impl Layer {
    ///
    /// we symmetrize the distance by averaging the EMDs in both directions.
    /// the distance isn't symmetric in the first place only because our heuristic algo is not fully accurate
-    pub fn inner_metric(&self) -> Metric {
+    fn inner_metric(&self) -> Metric {
        log::info!(
            "{:<32}{:<32}",
            "computing metric",
@@ -170,13 +184,13 @@ impl Layer {
        log::info!(
            "{:<32}{:<32}",
            "declaring abstractions",
-            format!("{} {} clusters", self.street, N_KMEANS_CENTROIDS)
+            format!("{} {} clusters", self.street, Self::k(self.street))
        );
        let ref mut rng = rand::thread_rng();
-        let progress = Self::progress(N_KMEANS_CENTROIDS);
+        let progress = Self::progress(Self::k(self.street));
        self.kmeans.expand(self.sample_uniform(rng));
        progress.inc(1);
-        while self.kmeans.0.len() < N_KMEANS_CENTROIDS {
+        while self.kmeans.0.len() < Self::k(self.street) {
            self.kmeans.expand(self.sample_outlier(rng));
            progress.inc(1);
        }
@@ -189,17 +203,16 @@ impl Layer {
        log::info!(
            "{:<32}{:<32}",
            "clustering observations",
-            format!("{} {} iterations", self.street, N_KMEANS_ITERATION)
+            format!("{} {} iterations", self.street, Self::t(self.street))
        );
-        let progress = Self::progress(N_KMEANS_ITERATION);
-        for _ in 0..N_KMEANS_ITERATION {
+        let progress = Self::progress(Self::t(self.street));
+        for _ in 0..Self::t(self.street) {
            let neighbors = self
                .points
                .0
                .par_iter()
                .map(|(_, h)| self.nearest_neighbor(h))
                .collect::<Vec<(Abstraction, f32)>>();
-            self.kmeans.clear();
            self.assign_nearest_neighbor(neighbors);
            self.assign_orphans_randomly();
            progress.inc(1);
@@ -211,36 +224,33 @@ impl Layer {
    /// by computing the EMD distance between the `Observation`'s `Histogram` and each `Centroid`'s `Histogram`
    /// and returning the `Abstraction` of the nearest `Centroid`
    fn assign_nearest_neighbor(&mut self, neighbors: Vec<(Abstraction, f32)>) {
+        self.kmeans.clear();
        let mut loss = 0.;
-        for ((observation, histogram), (abstraction, distance)) in
-            std::iter::zip(self.points.0.iter_mut(), neighbors.iter())
-        {
-            loss += distance * distance;
-            self.lookup.assign(abstraction, observation);
-            self.kmeans.absorb(abstraction, histogram);
+        for ((obs, hist), (abs, dist)) in self.points.0.iter_mut().zip(neighbors.iter()) {
+            loss += dist * dist;
+            self.lookup.assign(abs, obs);
+            self.kmeans.absorb(abs, hist);
        }
-        log::debug!("LOSS {:>12.8}", loss / self.points.0.len() as f32);
+        let loss = loss / self.points.0.len() as f32;
+        log::trace!("LOSS {:>12.8}", loss);
    }
    /// centroid drift may make it such that some centroids are empty
    /// so we reinitialize empty centroids with random Observations if necessary
    fn assign_orphans_randomly(&mut self) {
        for ref a in self.kmeans.orphans() {
-            log::warn!(
-                "{:<32}{:<32}",
-                "reassigning empty centroid",
-                format!("0x{}", a)
-            );
            let ref mut rng = rand::thread_rng();
            let ref sample = self.sample_uniform(rng);
            self.kmeans.absorb(a, sample);
+            log::debug!(
+                "{:<32}{:<32}",
+                "reassigned empty centroid",
+                format!("0x{}", a)
+            );
        }
    }

    /// the first Centroid is uniformly random across all `Observation` `Histogram`s
-    fn sample_uniform<R>(&self, rng: &mut R) -> Histogram
-    where
-        R: Rng,
-    {
+    fn sample_uniform<R: Rng>(&self, rng: &mut R) -> Histogram {
        self.points
            .0
            .values()
@@ -251,10 +261,7 @@ impl Layer {
    /// each next Centroid is selected with probability proportional to
    /// the squared distance to the nearest neighboring Centroid.
    /// faster convergence, i guess. on the shoulders of giants
-    fn sample_outlier<R>(&self, rng: &mut R) -> Histogram
-    where
-        R: Rng,
-    {
+    fn sample_outlier<R: Rng>(&self, rng: &mut R) -> Histogram {
        let weights = self
            .points
            .0
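The hunk above cuts off before the body of `sample_outlier`, so here is a hedged, self-contained sketch of the kmeans++-style draw that the doc comment describes: the next seed is chosen with probability proportional to its squared distance to the nearest existing centroid. It uses plain f64 points and rand's `WeightedIndex` purely for illustration; the real code works over `Histogram`s and EMD.

use rand::distributions::WeightedIndex;
use rand::prelude::*;

// illustrative only: 1-D points instead of Histograms, |p - c| instead of EMD
fn sample_outlier_sketch<R: Rng>(points: &[f64], centroids: &[f64], rng: &mut R) -> f64 {
    let weights = points
        .iter()
        .map(|p| {
            centroids
                .iter()
                .map(|c| (p - c).abs()) // distance to each existing centroid
                .fold(f64::MAX, f64::min) // keep the nearest one
        })
        .map(|d| d * d) // kmeans++ weight: squared distance to nearest centroid
        .collect::<Vec<f64>>();
    let choice = WeightedIndex::new(&weights).expect("finite nonnegative weights");
    points[choice.sample(rng)]
}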