
Commit

adds plots
micheleriva committed Nov 26, 2024
1 parent f046524 commit 93e6b8b
Showing 7 changed files with 181 additions and 20 deletions.
4 changes: 3 additions & 1 deletion .gitignore
@@ -1,2 +1,4 @@
.idea
target
target
benchmark_results.csv
benchmark_results.png
1 change: 1 addition & 0 deletions Cargo.lock

Some generated files are not rendered by default.

1 change: 1 addition & 0 deletions Cargo.toml
@@ -21,6 +21,7 @@ rayon = "1.10.0"
log = "0.4.22"
thiserror = "2.0.3"
env_logger = "0.11.5"
serde = { version = "1.0.215", features = ["derive"] }

[dev-dependencies]
criterion = { version = "0.5", features = ["html_reports"] }
14 changes: 14 additions & 0 deletions Makefile
@@ -0,0 +1,14 @@
.PHONY: quality_check clean

RUST_LOG := info
PLOTS_DIR := plots
BENCHMARK_RESULTS := benchmark_results.png

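# quality_check runs the Rust benchmark binary (which writes benchmark_results.csv
# in the repo root), renders the figure with plots/main.py, and moves the PNG up
# next to the CSV; `make clean` removes both generated files.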
quality_check:
RUST_LOG=$(RUST_LOG) cargo run --release --bin quality_check
cd $(PLOTS_DIR) && python3 main.py
mv $(PLOTS_DIR)/$(BENCHMARK_RESULTS) ./$(BENCHMARK_RESULTS)

clean:
rm -f $(BENCHMARK_RESULTS)
rm -f benchmark_results.csv
52 changes: 52 additions & 0 deletions plots/main.py
@@ -0,0 +1,52 @@
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns

# Read the benchmark results
df = pd.read_csv('../benchmark_results.csv')
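# The relative path assumes the script is launched from plots/ (the Makefile runs
# `cd plots && python3 main.py`), so the CSV written in the repo root sits one level up.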

# Set up the plotting style
plt.style.use('seaborn-v0_8-darkgrid')
sns.set_palette("husl")

# Create a figure with multiple subplots
fig, ((ax1, ax2), (ax3, ax4)) = plt.subplots(2, 2, figsize=(15, 12))

# Plot 1: Timing metrics
ax1.plot(df['n_samples'], df['fit_time_ms'], marker='o', label='Fit Time')
ax1.plot(df['n_samples'], df['compression_time_ms'], marker='o', label='Compression Time')
ax1.set_xlabel('Number of Samples')
ax1.set_ylabel('Time (ms)')
ax1.set_title('Processing Time vs Dataset Size')
ax1.legend()
ax1.set_xscale('log')
ax1.set_yscale('log')

# Plot 2: Quality metrics
ax2.plot(df['n_samples'], df['reconstruction_error'], marker='o', label='Reconstruction Error')
ax2.plot(df['n_samples'], df['recall'], marker='o', label='Recall@10')
ax2.set_xlabel('Number of Samples')
ax2.set_ylabel('Score')
ax2.set_title('Quality Metrics vs Dataset Size')
ax2.legend()
ax2.set_xscale('log')

# Plot 3: Memory reduction
ax3.plot(df['n_samples'], (1 - df['memory_reduction_ratio']) * 100, marker='o')
ax3.set_xlabel('Number of Samples')
ax3.set_ylabel('Memory Reduction (%)')
ax3.set_title('Memory Reduction vs Dataset Size')
ax3.set_xscale('log')

# Plot 4: Time per sample
df['time_per_sample'] = (df['compression_time_ms']) / df['n_samples']
ax4.plot(df['n_samples'], df['time_per_sample'], marker='o')
ax4.set_xlabel('Number of Samples')
ax4.set_ylabel('Compression Time per Sample (ms)')
ax4.set_title('Scaling Efficiency')
ax4.set_xscale('log')
ax4.set_yscale('log')

plt.tight_layout()
plt.savefig('benchmark_results.png', dpi=300, bbox_inches='tight')
plt.close()
3 changes: 3 additions & 0 deletions plots/requirements.txt
@@ -0,0 +1,3 @@
pandas
matplotlib
seaborn
126 changes: 107 additions & 19 deletions src/bin/quality_check.rs
@@ -3,9 +3,60 @@ use log::info;
use ndarray::{s, Array2, ArrayView1, Axis};
use ndarray_rand::RandomExt;
use rand_distr::Uniform;
use serde::Serialize;
use std::fs::File;
use std::io::Write;
use std::time::Instant;
use vector_quantizer::pq::PQ;

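// One benchmark row per sample size; the fields mirror the columns written to
// benchmark_results.csv in main().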
#[derive(Serialize)]
struct BenchmarkResult {
n_samples: usize,
n_dims: usize,
fit_time_ms: f64,
compression_time_ms: f64,
reconstruction_error: f32,
recall: f32,
memory_reduction_ratio: f32,
}

fn run_benchmark(
n_samples: usize,
n_dims: usize,
m: usize,
ks: u32,
iterations: usize,
) -> Result<BenchmarkResult> {
let original_data = Array2::<f32>::random((n_samples, n_dims), Uniform::new(0.0, 1.0));

let mut pq = PQ::try_new(m, ks)?;

let fit_start = Instant::now();
pq.fit(&original_data, iterations)?;
let fit_time = fit_start.elapsed().as_secs_f64() * 1000.0;

let compress_start = Instant::now();
let compressed_data = pq.compress(&original_data)?;
let compression_time = compress_start.elapsed().as_secs_f64() * 1000.0;

let reconstruction_error = calculate_reconstruction_error(&original_data, &compressed_data);
let recall = calculate_recall(&original_data, &compressed_data, 10)?;

let original_size = n_samples * n_dims * size_of::<f32>();
let compressed_size = n_samples * m; // Each subspace uses 1 byte
let memory_reduction_ratio = compressed_size as f32 / original_size as f32;
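// Illustrative arithmetic (not measured output): with n_samples = 1_000, n_dims = 128,
// and m = 16, original_size = 1_000 * 128 * 4 = 512_000 bytes and compressed_size =
// 1_000 * 16 = 16_000 bytes, giving a ratio of 0.03125, i.e. roughly a 96.9% reduction
// (the codebook itself is not counted here).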

Ok(BenchmarkResult {
n_samples,
n_dims,
fit_time_ms: fit_time,
compression_time_ms: compression_time,
reconstruction_error,
recall,
memory_reduction_ratio,
})
}

fn euclidean_distance(a: &ArrayView1<f32>, b: &ArrayView1<f32>) -> f32 {
a.iter()
.zip(b.iter())
@@ -30,20 +81,38 @@ fn calculate_reconstruction_error(original: &Array2<f32>, reconstructed: &Array2

fn calculate_recall(original: &Array2<f32>, compressed: &Array2<f32>, k: usize) -> Result<f32> {
let n_samples = original.len_of(Axis(0));

let max_eval_samples = 1000;
let eval_samples = if n_samples > max_eval_samples {
max_eval_samples
} else {
n_samples
};

let mut total_recall = 0.0;
let step = n_samples / eval_samples;
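// Evaluate recall on at most max_eval_samples queries (every `step`-th row) rather
// than on every vector, so the check stays tractable at the larger sample sizes.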

for i in 0..n_samples {
for i in (0..n_samples).step_by(step) {
let query = original.slice(s![i, ..]);

let mut true_neighbors: Vec<(usize, f32)> = (0..n_samples)
let search_window = if n_samples > 10000 { 5000 } else { n_samples };

let start_idx = if i > search_window / 2 {
i - search_window / 2
} else {
0
};
let end_idx = (i + search_window / 2).min(n_samples);
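// Ground-truth neighbors are searched only within a window of up to `search_window`
// rows around the query (the full dataset when n_samples <= 10_000), so the reported
// recall is an approximation for the larger runs.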

let mut true_neighbors: Vec<(usize, f32)> = (start_idx..end_idx)
.filter(|&j| j != i)
.map(|j| (j, euclidean_distance(&query, &original.slice(s![j, ..]))))
.collect();
true_neighbors.sort_by(|a, b| a.1.partial_cmp(&b.1).unwrap());
let true_neighbors: Vec<usize> =
true_neighbors.iter().take(k).map(|&(idx, _)| idx).collect();

let mut approx_neighbors: Vec<(usize, f32)> = (0..n_samples)
let mut approx_neighbors: Vec<(usize, f32)> = (start_idx..end_idx)
.filter(|&j| j != i)
.map(|j| {
(
@@ -67,35 +136,54 @@ fn calculate_recall(original: &Array2<f32>, compressed: &Array2<f32>, k: usize)
total_recall += intersection / k as f32;
}

Ok(total_recall / n_samples as f32)
Ok(total_recall / (n_samples / step) as f32)
}

fn main() -> Result<()> {
env_logger::init();

let n_samples = 1000;
let sample_sizes = vec![1000, 5000, 10000, 50000, 100000];
let n_dims = 128;
let original_data = Array2::<f32>::random((n_samples, n_dims), Uniform::new(0.0, 1.0));

let m = 16;
let ks = 256;
let iterations = 10;

let mut pq = PQ::try_new(m, ks)?;

let fit_start = Instant::now();
pq.fit(&original_data, iterations)?;
println!("Fit completed in {:?}", fit_start.elapsed());
let mut results = Vec::new();

let encode_start = Instant::now();
let compressed_data = pq.compress(&original_data)?;
println!("Compression completed in {:?}", encode_start.elapsed());
for n_samples in sample_sizes {
info!("Running benchmark with {} samples...", n_samples);
let result = run_benchmark(n_samples, n_dims, m, ks, iterations)?;
results.push(result);
}

let reconstruction_error = calculate_reconstruction_error(&original_data, &compressed_data);
println!("Reconstruction Error: {:.4}", reconstruction_error);
let mut file = File::create("benchmark_results.csv")?;
writeln!(file, "n_samples,n_dims,fit_time_ms,compression_time_ms,reconstruction_error,recall,memory_reduction_ratio")?;

for result in &results {
writeln!(
file,
"{},{},{},{},{},{},{}",
result.n_samples,
result.n_dims,
result.fit_time_ms,
result.compression_time_ms,
result.reconstruction_error,
result.recall,
result.memory_reduction_ratio
)?;
}

let recall = calculate_recall(&original_data, &compressed_data, 10)?;
println!("Recall@10: {:.4}", recall);
for result in &results {
info!("\nResults for {} samples:", result.n_samples);
info!("Fit time: {:.2}ms", result.fit_time_ms);
info!("Compression time: {:.2}ms", result.compression_time_ms);
info!("Reconstruction Error: {:.4}", result.reconstruction_error);
info!("Recall@10: {:.4}", result.recall);
info!(
"Memory reduction: {:.2}%",
(1.0 - result.memory_reduction_ratio) * 100.0
);
}

Ok(())
}
