Skip to content

Commit

Permalink
First rough benchmarking implementation
Browse files Browse the repository at this point in the history
  • Loading branch information
bastikr committed Oct 11, 2024
1 parent 68bfbf7 commit dfbf0ac
Show file tree
Hide file tree
Showing 6 changed files with 509 additions and 57 deletions.
51 changes: 51 additions & 0 deletions docs/benchmarks.md
Original file line number Diff line number Diff line change
@@ -0,0 +1,51 @@
# Benchmarks

What are interesting questions we would like to answer?

## Build example program
Measure:
* Source code size:
* Header
* Source
* Total
* Binary size:
* libassets
* libcrl
* executable
* Time to build:
* Each step separately? (compile crl, compile assets, compile main, linking separate?)

Depends on:
* Build status:
* Clean
* Nothing changed
* One file added
* All files changed
* cmake/ninja
* testsets
* features

## Build benchmark program
* all features enabled

Measure:
* Time to run different get functions

Depends on:
* testsets (file size should not be relevant)
* search file:
* exists/does not exist
* order in sorted file list

## Testset characteristics
Testset characteristics:
* 1kb - 10MB
* 1 file - 10'000 files
* What about path length/directory depth?

## Features
* str comparison
* directory tree based
* simple hashing
* gperf perfect hashing
* static access
6 changes: 4 additions & 2 deletions tests/CMakeLists.txt
Original file line number Diff line number Diff line change
Expand Up @@ -13,7 +13,9 @@ FetchContent_Declare(

FetchContent_Declare(
googlebenchmark
URL https://github.com/google/benchmark/archive/refs/tags/v1.9.0.zip)
GIT_REPOSITORY https://github.com/google/benchmark.git
GIT_TAG v1.9.0
)

# For Windows: Prevent overriding the parent project's compiler/linker settings
set(gtest_force_shared_crt ON CACHE BOOL "" FORCE)
Expand All @@ -37,7 +39,7 @@ add_executable(crl_benchmark
"src/bench.cpp"
)

add_resource_library(assets "${CMAKE_SOURCE_DIR}/../example/assets/" GPERF)
add_resource_library(assets "${CMAKE_SOURCE_DIR}/../tools/build/testsets_num10_depth2/" GPERF)

target_link_libraries(crl_benchmark
PRIVATE
Expand Down
99 changes: 44 additions & 55 deletions tests/src/bench.cpp
Original file line number Diff line number Diff line change
@@ -1,81 +1,70 @@
#include <benchmark/benchmark.h>

#include <assets.h>
#include <cstdio>
#include <string>
#include <string_view>

// Benchmark: assets::get_file() lookup for a path that exists in the
// embedded asset set.
static void bench_get_file_found(benchmark::State &state) {
  for (auto _ : state) {
    auto r = assets::get_file("logs/source/t.txt");
    // assert() compiles to nothing under NDEBUG (the usual Release
    // configuration for benchmarks), which would let the compiler discard
    // the whole lookup. DoNotOptimize forces the result to be materialized
    // so the benchmark measures the lookup even in optimized builds.
    assert(r.has_value());
    benchmark::DoNotOptimize(r);
  }
}
static std::string g_PATH;
static std::array<std::string_view, 4> g_PATHV;
static bool g_EXISTS;

static void bench_get_file_not_found(benchmark::State &state) {
static void bench_get_file(benchmark::State &state) {
for (auto _ : state) {
auto r = assets::get_file("logs/source/t.txd");
assert(!r.has_value());
auto r = assets::get_file(g_PATH);
assert(r.has_value() == g_EXISTS);
}
}

static void bench_get_filev_found(benchmark::State &state) {
std::array<std::string_view, 4> b{"logs", "/", "source/t", ".txt"};
static void bench_get_filev(benchmark::State &state) {
for (auto _ : state) {
auto r = assets::get_filev(b);
assert(r.has_value());
auto r = assets::get_filev(g_PATHV);
assert(r.has_value() == g_EXISTS);
}
}

static void bench_get_filev_not_found(benchmark::State &state) {
std::array<std::string_view, 4> b{"logs", "/", "source/t", ".txd"};
static void bench_get_file_ph(benchmark::State &state) {
for (auto _ : state) {
auto r = assets::get_filev(b);
assert(!r.has_value());
auto r = assets::get_file_ph(g_PATH);
assert(r.has_value() == g_EXISTS);
}
}

static void bench_get_file_static(benchmark::State &state) {
static void bench_get_file_strcmp(benchmark::State &state) {
for (auto _ : state) {
auto r = assets::get<"logs/source/t.txt">();
assert(r.size() > 1);
auto r = assets::get_file_strcmp(g_PATH);
assert(r.has_value() == g_EXISTS);
}
}

// Benchmark: perfect-hash lookup (get_file_ph) for a path that exists.
static void bench_get_file_ph_found(benchmark::State &state) {
  for (auto _ : state) {
    auto r = assets::get_file_ph("logs/source/t.txt");
    // assert() is compiled out under NDEBUG, so without DoNotOptimize the
    // optimizer could dead-code-eliminate the lookup being measured.
    assert(r.has_value());
    benchmark::DoNotOptimize(r);
  }
}
BENCHMARK(bench_get_file);
BENCHMARK(bench_get_filev);
BENCHMARK(bench_get_file_ph);
BENCHMARK(bench_get_file_strcmp);

// Benchmark: perfect-hash lookup (get_file_ph) for a path that does NOT
// exist (".txd" instead of ".txt") — measures the miss path.
static void bench_get_file_ph_not_found(benchmark::State &state) {
  for (auto _ : state) {
    auto r = assets::get_file_ph("logs/source/t.txd");
    // assert() is compiled out under NDEBUG, so without DoNotOptimize the
    // optimizer could dead-code-eliminate the lookup being measured.
    assert(!r.has_value());
    benchmark::DoNotOptimize(r);
  }
}
// Print the expected command-line arguments for this benchmark driver.
void usage() {
  printf("Usage: <path> <exists> [gperf args...]\n");
}

static void bench_get_file_strcmp_found(benchmark::State &state) {
for (auto _ : state) {
auto r = assets::get_file_strcmp("logs/source/t.txt");
assert(r.has_value());
int main(int argc, char **argv) {
if (argc != 3) {
usage();
return 1;
}
}

static void bench_get_file_strcmp_not_found(benchmark::State &state) {
for (auto _ : state) {
auto r = assets::get_file_strcmp("logs/source/t.txd");
assert(!r.has_value());
std::string exists{argv[2]};
if (exists == "true") {
g_EXISTS = true;
} else if (exists == "false") {
g_EXISTS = false;
} else {
usage();
return 1;
}
}

BENCHMARK(bench_get_file_static);
g_PATH = argv[1];
int l = g_PATHV.size() / 4;
std::get<0>(g_PATHV) = std::string_view(g_PATH).substr(0, l);
std::get<1>(g_PATHV) = std::string_view(g_PATH).substr(l, l);
std::get<2>(g_PATHV) = std::string_view(g_PATH).substr(l + l, l);
std::get<3>(g_PATHV) = std::string_view(g_PATH).substr(l + l + l);

BENCHMARK(bench_get_file_found);
BENCHMARK(bench_get_file_not_found);
BENCHMARK(bench_get_filev_found);
BENCHMARK(bench_get_filev_not_found);
BENCHMARK(bench_get_file_ph_found);
BENCHMARK(bench_get_file_ph_not_found);
BENCHMARK(bench_get_file_strcmp_found);
BENCHMARK(bench_get_file_strcmp_not_found);

BENCHMARK_MAIN();
benchmark::Initialize(&argc, argv);
benchmark::RunSpecifiedBenchmarks();
}
1 change: 1 addition & 0 deletions tools/.gitignore
Original file line number Diff line number Diff line change
@@ -0,0 +1 @@
build
138 changes: 138 additions & 0 deletions tools/generate_testset.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,138 @@
#!/usr/bin/env python3

import os
import argparse
import random
import string
import math
import sys

def parse_size(size_str):
    """Parse a size string into a number of bytes.

    Accepts plain integers ('10') as before, and additionally the KB/MB/GB
    suffixes that main()'s error message already promises ('4KB', '2MB',
    case-insensitive). Plain-integer inputs behave exactly as they used to,
    so existing callers are unaffected.

    Raises ValueError for anything unrecognized (handled in main()).
    """
    text = size_str.strip().upper()
    multipliers = {'KB': 1024, 'MB': 1024 ** 2, 'GB': 1024 ** 3}
    for suffix, factor in multipliers.items():
        if text.endswith(suffix):
            return int(text[:-len(suffix)].strip()) * factor
    return int(text)

def random_string(min_length=5, max_length=15):
    """Return a random alphanumeric name of length in [min_length, max_length].

    Draws the length with random.randint, then samples characters with
    random.choices, so the module-level RNG is consumed exactly as before.
    """
    alphabet = string.ascii_letters + string.digits
    name_length = random.randint(min_length, max_length)
    return ''.join(random.choices(alphabet, k=name_length))

def create_file(path, size):
    """Create (or truncate) the file at `path` and fill it with `size` random bytes."""
    with open(path, 'wb') as handle:
        payload = os.urandom(size)
        handle.write(payload)

def distribute_files_balanced(num_files, files_per_dir):
    """Split num_files as evenly as possible over ceil(num_files/files_per_dir) dirs.

    Bug fix: the original computed
        remaining = num_files - files_per_dir * num_dirs
    which is never positive (num_dirs is a ceiling), so the top-up loop never
    ran and the returned list summed to num_dirs * files_per_dir instead of
    num_files — e.g. (25, 10) yielded [10, 10, 10] (sum 30). Now the result
    always sums to num_files with every entry <= files_per_dir.

    Returns a list with one file count per directory ([] when num_files == 0).
    """
    num_dirs = math.ceil(num_files / files_per_dir)
    if num_dirs == 0:
        return []
    # Even split: the first `extra` directories carry one additional file.
    base, extra = divmod(num_files, num_dirs)
    return [base + 1] * extra + [base] * (num_dirs - extra)

def distribute_files_unbalanced(num_files, files_per_dir):
    """Randomly spread num_files over up to ceil(num_files/files_per_dir) dirs.

    First pass: each directory takes a random 1..files_per_dir files while any
    remain. Second pass: leftover files are sprinkled onto random directories
    that still have capacity. The module-level RNG is consumed in exactly the
    same order as before (one randint per directory, then one per top-up try).

    Returns a list of per-directory file counts summing to num_files.
    """
    dir_budget = math.ceil(num_files / files_per_dir)
    counts = []
    left = num_files
    slot = 0
    while slot < dir_budget and left > 0:
        take = random.randint(1, min(files_per_dir, left))
        counts.append(take)
        left -= take
        slot += 1
    # Top up: cannot spin forever — if every directory were full, the total
    # would already cover num_files and `left` would be <= 0.
    while left > 0:
        pick = random.randint(0, len(counts) - 1)
        if counts[pick] < files_per_dir:
            counts[pick] += 1
            left -= 1
    return counts

def adjust_name_lengths(target_path_length, current_depth, max_depth, current_path_length, path_parts):
    """Heuristically pick a max name length that keeps the path near the target.

    With directories still to create, the remaining character budget is shared
    evenly among those directories plus the final file name. At the deepest
    level the whole budget goes to the file name, minus ~10 reserved chars.
    Never returns less than 5. (`path_parts` is unused; kept for interface
    compatibility. The result is approximate by design.)
    """
    budget = target_path_length - current_path_length
    dirs_left = max_depth - current_depth
    if dirs_left > 0:
        # Share the budget across pending directory names and the file name.
        return max(5, budget // (dirs_left + 1))
    # No directories left: spend the budget on the file, keeping ~10 in reserve.
    return max(5, budget - 10)

def main():
    """Command-line entry point: generate a random test dataset on disk.

    Parses CLI options, computes a per-directory file distribution (balanced
    or unbalanced), then for each directory builds a random path of depth up
    to --dir-depth (with name lengths steered toward --path-length) and fills
    it with random binary files of --content-size bytes each.
    """
    parser = argparse.ArgumentParser(description="Generate a test dataset with configurable directory and file characteristics.")
    parser.add_argument('--output-dir', type=str, required=True, help='Root directory to generate the dataset.')
    parser.add_argument('--num-files', type=int, required=True, help='Total number of files to generate.')
    parser.add_argument('--files-per-dir', type=int, default=10, help='Number of files per directory.')
    parser.add_argument('--distribution', type=str, choices=['balanced', 'unbalanced'], default='balanced', help='Distribution of files across directories.')
    parser.add_argument('--content-size', type=str, default='10', help='Size of each file\'s content in bytes.')
    parser.add_argument('--dir-depth', type=int, default=3, help='Maximum depth of the directory tree.')
    parser.add_argument('--path-length', type=int, default=100, help='Desired average path length in characters.')

    args = parser.parse_args()

    output_dir = args.output_dir
    num_files = args.num_files
    files_per_dir = args.files_per_dir
    distribution = args.distribution
    # Bail out with a usage hint on a malformed --content-size value.
    try:
        content_size = parse_size(args.content_size)
    except ValueError:
        print("Invalid content size format. Use numbers with optional suffixes like KB, MB.")
        sys.exit(1)
    dir_depth = args.dir_depth
    path_length = args.path_length

    # Create the root output directory
    os.makedirs(output_dir, exist_ok=True)

    # Determine file distribution
    if distribution == 'balanced':
        files_distribution = distribute_files_balanced(num_files, files_per_dir)
    else:
        files_distribution = distribute_files_unbalanced(num_files, files_per_dir)

    total_dirs = len(files_distribution)
    print(f"Generating {num_files} files across {total_dirs} directories with a maximum depth of {dir_depth}.")

    # Generate directories and files
    for dir_idx, num_files_in_dir in enumerate(files_distribution):
        # Generate directory path: descend one random-named level at a time,
        # tracking the absolute path length against the --path-length target.
        current_path = output_dir
        current_depth = 0
        current_path_length = len(os.path.abspath(current_path))
        path_parts = []
        while current_depth < dir_depth:
            # Adjust name lengths to approach target path length
            file_name_length = adjust_name_lengths(path_length, current_depth, dir_depth, current_path_length, path_parts)
            dir_name = random_string(min_length=3, max_length=file_name_length)
            path_parts.append(dir_name)
            current_path = os.path.join(current_path, dir_name)
            current_path_length += len(dir_name) + 1  # +1 for the os.sep
            current_depth += 1
            # Break early if adding more directories would exceed path length
            if current_path_length >= path_length - 50:  # Reserve some space for file name
                break
        os.makedirs(current_path, exist_ok=True)

        # Generate files in the directory
        for file_idx in range(num_files_in_dir):
            # Generate file name
            file_name_length = max(5, min(15, path_length // 10))  # Heuristic for file name length
            file_name = f"file_{random_string(min_length=3, max_length=file_name_length)}.bin"
            file_path = os.path.join(current_path, file_name)
            # Ensure the path length does not exceed the target
            if len(os.path.abspath(file_path)) > path_length:
                # Adjust by shortening the file name
                # NOTE(review): random_string only takes `new_length` as an upper
                # bound, so the regenerated path may still exceed the target —
                # this stays a best-effort heuristic.
                excess = len(os.path.abspath(file_path)) - path_length
                if excess > 0 and len(file_name) > excess + 4:  # 4 for 'file_' and '.bin'
                    new_length = len(file_name) - excess
                    file_name = f"file_{random_string(min_length=3, max_length=new_length)}.bin"
                    file_path = os.path.join(current_path, file_name)
            # Create the file
            create_file(file_path, content_size)

    print(f"Dataset generation complete. Files are located in '{output_dir}'.")

if __name__ == "__main__":
    main()
Loading

0 comments on commit dfbf0ac

Please sign in to comment.