Skip to content

Commit

Permalink
First rough benchmarking implementation
Browse files Browse the repository at this point in the history
  • Loading branch information
bastikr committed Oct 11, 2024
1 parent 68bfbf7 commit dfbf0ac
Show file tree
Hide file tree
Showing 6 changed files with 509 additions and 57 deletions.
51 changes: 51 additions & 0 deletions docs/benchmarks.md
Original file line number Diff line number Diff line change
@@ -0,0 +1,51 @@
# Benchmarks

What are interesting questions we would like to answer?

## Build example program
Measure:
* Source code size:
* Header
* Source
* Total
* Binary size:
* libassets
* libcrl
* executable
* Time to build:
* Each step separately? (compile crl, compile assets, compile main, linking separate?)

Depends on:
* Build status:
* Clean
* Nothing changed
* One file added
* All files changed
* cmake/ninja
* testsets
* features

## Build benchmark program
* all features enabled

Measure:
* Time to run different get functions

Depends on:
* testsets (file size should not be relevant)
* search file:
* exists/does not exist
* order in sorted file list

## Testset characteristics
Testset characteristics:
* 1kb - 10MB
* 1 file - 10'000 files
* What about path length/directory depth?

## Features
* str comparison
* directory tree based
* simple hashing
* gperf perfect hashing
* static access
6 changes: 4 additions & 2 deletions tests/CMakeLists.txt
Original file line number Diff line number Diff line change
Expand Up @@ -13,7 +13,9 @@ FetchContent_Declare(

FetchContent_Declare(
googlebenchmark
URL https://github.com/google/benchmark/archive/refs/tags/v1.9.0.zip)
GIT_REPOSITORY https://github.com/google/benchmark.git
GIT_TAG v1.9.0
)

# For Windows: Prevent overriding the parent project's compiler/linker settings
set(gtest_force_shared_crt ON CACHE BOOL "" FORCE)
Expand All @@ -37,7 +39,7 @@ add_executable(crl_benchmark
"src/bench.cpp"
)

add_resource_library(assets "${CMAKE_SOURCE_DIR}/../example/assets/" GPERF)
add_resource_library(assets "${CMAKE_SOURCE_DIR}/../tools/build/testsets_num10_depth2/" GPERF)

target_link_libraries(crl_benchmark
PRIVATE
Expand Down
99 changes: 44 additions & 55 deletions tests/src/bench.cpp
Original file line number Diff line number Diff line change
@@ -1,81 +1,70 @@
#include <benchmark/benchmark.h>

#include <assets.h>
#include <cstdio>
#include <string>
#include <string_view>

// Benchmark: assets::get_file() lookup for a path that exists in the
// embedded asset set.
static void bench_get_file_found(benchmark::State &state) {
  for (auto _ : state) {
    auto r = assets::get_file("logs/source/t.txt");
    // assert() compiles to nothing under NDEBUG (the usual Release
    // configuration for benchmarks), which would let the compiler discard
    // the whole lookup. DoNotOptimize forces the result to be materialized
    // so the benchmark measures the lookup even in optimized builds.
    assert(r.has_value());
    benchmark::DoNotOptimize(r);
  }
}
static std::string g_PATH;
static std::array<std::string_view, 4> g_PATHV;
static bool g_EXISTS;

static void bench_get_file_not_found(benchmark::State &state) {
static void bench_get_file(benchmark::State &state) {
for (auto _ : state) {
auto r = assets::get_file("logs/source/t.txd");
assert(!r.has_value());
auto r = assets::get_file(g_PATH);
assert(r.has_value() == g_EXISTS);
}
}

static void bench_get_filev_found(benchmark::State &state) {
std::array<std::string_view, 4> b{"logs", "/", "source/t", ".txt"};
static void bench_get_filev(benchmark::State &state) {
for (auto _ : state) {
auto r = assets::get_filev(b);
assert(r.has_value());
auto r = assets::get_filev(g_PATHV);
assert(r.has_value() == g_EXISTS);
}
}

static void bench_get_filev_not_found(benchmark::State &state) {
std::array<std::string_view, 4> b{"logs", "/", "source/t", ".txd"};
static void bench_get_file_ph(benchmark::State &state) {
for (auto _ : state) {
auto r = assets::get_filev(b);
assert(!r.has_value());
auto r = assets::get_file_ph(g_PATH);
assert(r.has_value() == g_EXISTS);
}
}

static void bench_get_file_static(benchmark::State &state) {
static void bench_get_file_strcmp(benchmark::State &state) {
for (auto _ : state) {
auto r = assets::get<"logs/source/t.txt">();
assert(r.size() > 1);
auto r = assets::get_file_strcmp(g_PATH);
assert(r.has_value() == g_EXISTS);
}
}

// Benchmark: perfect-hash lookup (get_file_ph) for a path that exists.
static void bench_get_file_ph_found(benchmark::State &state) {
  for (auto _ : state) {
    auto r = assets::get_file_ph("logs/source/t.txt");
    // assert() is compiled out under NDEBUG, so without DoNotOptimize the
    // optimizer could dead-code-eliminate the lookup being measured.
    assert(r.has_value());
    benchmark::DoNotOptimize(r);
  }
}
BENCHMARK(bench_get_file);
BENCHMARK(bench_get_filev);
BENCHMARK(bench_get_file_ph);
BENCHMARK(bench_get_file_strcmp);

// Benchmark: perfect-hash lookup (get_file_ph) for a path that does NOT
// exist (".txd" instead of ".txt") — measures the miss path.
static void bench_get_file_ph_not_found(benchmark::State &state) {
  for (auto _ : state) {
    auto r = assets::get_file_ph("logs/source/t.txd");
    // assert() is compiled out under NDEBUG, so without DoNotOptimize the
    // optimizer could dead-code-eliminate the lookup being measured.
    assert(!r.has_value());
    benchmark::DoNotOptimize(r);
  }
}
// Print the expected command-line arguments for this benchmark driver.
void usage() {
  printf("Usage: <path> <exists> [gperf args...]\n");
}

static void bench_get_file_strcmp_found(benchmark::State &state) {
for (auto _ : state) {
auto r = assets::get_file_strcmp("logs/source/t.txt");
assert(r.has_value());
int main(int argc, char **argv) {
if (argc != 3) {
usage();
return 1;
}
}

static void bench_get_file_strcmp_not_found(benchmark::State &state) {
for (auto _ : state) {
auto r = assets::get_file_strcmp("logs/source/t.txd");
assert(!r.has_value());
std::string exists{argv[2]};
if (exists == "true") {
g_EXISTS = true;
} else if (exists == "false") {
g_EXISTS = false;
} else {
usage();
return 1;
}
}

BENCHMARK(bench_get_file_static);
g_PATH = argv[1];
int l = g_PATHV.size() / 4;
std::get<0>(g_PATHV) = std::string_view(g_PATH).substr(0, l);
std::get<1>(g_PATHV) = std::string_view(g_PATH).substr(l, l);
std::get<2>(g_PATHV) = std::string_view(g_PATH).substr(l + l, l);
std::get<3>(g_PATHV) = std::string_view(g_PATH).substr(l + l + l);

BENCHMARK(bench_get_file_found);
BENCHMARK(bench_get_file_not_found);
BENCHMARK(bench_get_filev_found);
BENCHMARK(bench_get_filev_not_found);
BENCHMARK(bench_get_file_ph_found);
BENCHMARK(bench_get_file_ph_not_found);
BENCHMARK(bench_get_file_strcmp_found);
BENCHMARK(bench_get_file_strcmp_not_found);

BENCHMARK_MAIN();
benchmark::Initialize(&argc, argv);
benchmark::RunSpecifiedBenchmarks();
}
1 change: 1 addition & 0 deletions tools/.gitignore
Original file line number Diff line number Diff line change
@@ -0,0 +1 @@
build
138 changes: 138 additions & 0 deletions tools/generate_testset.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,138 @@
#!/usr/bin/env python3

import os
import argparse
import random
import string
import math
import sys

def parse_size(size_str):
    """Parse a size string into a number of bytes.

    Accepts plain integers ('10') as before, and additionally the KB/MB/GB
    suffixes that main()'s error message already promises ('4KB', '2MB',
    case-insensitive). Plain-integer inputs behave exactly as they used to,
    so existing callers are unaffected.

    Raises ValueError for anything unrecognized (handled in main()).
    """
    text = size_str.strip().upper()
    multipliers = {'KB': 1024, 'MB': 1024 ** 2, 'GB': 1024 ** 3}
    for suffix, factor in multipliers.items():
        if text.endswith(suffix):
            return int(text[:-len(suffix)].strip()) * factor
    return int(text)

def random_string(min_length=5, max_length=15):
    """Return a random alphanumeric name of length in [min_length, max_length].

    Draws the length with random.randint, then samples characters with
    random.choices, so the module-level RNG is consumed exactly as before.
    """
    alphabet = string.ascii_letters + string.digits
    name_length = random.randint(min_length, max_length)
    return ''.join(random.choices(alphabet, k=name_length))

def create_file(path, size):
    """Create (or truncate) the file at `path` and fill it with `size` random bytes."""
    with open(path, 'wb') as handle:
        payload = os.urandom(size)
        handle.write(payload)

def distribute_files_balanced(num_files, files_per_dir):
    """Split num_files as evenly as possible over ceil(num_files/files_per_dir) dirs.

    Bug fix: the original computed
        remaining = num_files - files_per_dir * num_dirs
    which is never positive (num_dirs is a ceiling), so the top-up loop never
    ran and the returned list summed to num_dirs * files_per_dir instead of
    num_files — e.g. (25, 10) yielded [10, 10, 10] (sum 30). Now the result
    always sums to num_files with every entry <= files_per_dir.

    Returns a list with one file count per directory ([] when num_files == 0).
    """
    num_dirs = math.ceil(num_files / files_per_dir)
    if num_dirs == 0:
        return []
    # Even split: the first `extra` directories carry one additional file.
    base, extra = divmod(num_files, num_dirs)
    return [base + 1] * extra + [base] * (num_dirs - extra)

def distribute_files_unbalanced(num_files, files_per_dir):
    """Randomly spread num_files over up to ceil(num_files/files_per_dir) dirs.

    First pass: each directory takes a random 1..files_per_dir files while any
    remain. Second pass: leftover files are sprinkled onto random directories
    that still have capacity. The module-level RNG is consumed in exactly the
    same order as before (one randint per directory, then one per top-up try).

    Returns a list of per-directory file counts summing to num_files.
    """
    dir_budget = math.ceil(num_files / files_per_dir)
    counts = []
    left = num_files
    slot = 0
    while slot < dir_budget and left > 0:
        take = random.randint(1, min(files_per_dir, left))
        counts.append(take)
        left -= take
        slot += 1
    # Top up: cannot spin forever — if every directory were full, the total
    # would already cover num_files and `left` would be <= 0.
    while left > 0:
        pick = random.randint(0, len(counts) - 1)
        if counts[pick] < files_per_dir:
            counts[pick] += 1
            left -= 1
    return counts

def adjust_name_lengths(target_path_length, current_depth, max_depth, current_path_length, path_parts):
    """Heuristically pick a max name length that keeps the path near the target.

    With directories still to create, the remaining character budget is shared
    evenly among those directories plus the final file name. At the deepest
    level the whole budget goes to the file name, minus ~10 reserved chars.
    Never returns less than 5. (`path_parts` is unused; kept for interface
    compatibility. The result is approximate by design.)
    """
    budget = target_path_length - current_path_length
    dirs_left = max_depth - current_depth
    if dirs_left > 0:
        # Share the budget across pending directory names and the file name.
        return max(5, budget // (dirs_left + 1))
    # No directories left: spend the budget on the file, keeping ~10 in reserve.
    return max(5, budget - 10)

def main():
    """Command-line entry point: generate a random test dataset on disk.

    Parses CLI options, computes a per-directory file distribution (balanced
    or unbalanced), then for each directory builds a random path of depth up
    to --dir-depth (with name lengths steered toward --path-length) and fills
    it with random binary files of --content-size bytes each.
    """
    parser = argparse.ArgumentParser(description="Generate a test dataset with configurable directory and file characteristics.")
    parser.add_argument('--output-dir', type=str, required=True, help='Root directory to generate the dataset.')
    parser.add_argument('--num-files', type=int, required=True, help='Total number of files to generate.')
    parser.add_argument('--files-per-dir', type=int, default=10, help='Number of files per directory.')
    parser.add_argument('--distribution', type=str, choices=['balanced', 'unbalanced'], default='balanced', help='Distribution of files across directories.')
    parser.add_argument('--content-size', type=str, default='10', help='Size of each file\'s content in bytes.')
    parser.add_argument('--dir-depth', type=int, default=3, help='Maximum depth of the directory tree.')
    parser.add_argument('--path-length', type=int, default=100, help='Desired average path length in characters.')

    args = parser.parse_args()

    output_dir = args.output_dir
    num_files = args.num_files
    files_per_dir = args.files_per_dir
    distribution = args.distribution
    # Bail out with a usage hint on a malformed --content-size value.
    try:
        content_size = parse_size(args.content_size)
    except ValueError:
        print("Invalid content size format. Use numbers with optional suffixes like KB, MB.")
        sys.exit(1)
    dir_depth = args.dir_depth
    path_length = args.path_length

    # Create the root output directory
    os.makedirs(output_dir, exist_ok=True)

    # Determine file distribution
    if distribution == 'balanced':
        files_distribution = distribute_files_balanced(num_files, files_per_dir)
    else:
        files_distribution = distribute_files_unbalanced(num_files, files_per_dir)

    total_dirs = len(files_distribution)
    print(f"Generating {num_files} files across {total_dirs} directories with a maximum depth of {dir_depth}.")

    # Generate directories and files
    for dir_idx, num_files_in_dir in enumerate(files_distribution):
        # Generate directory path: descend one random-named level at a time,
        # tracking the absolute path length against the --path-length target.
        current_path = output_dir
        current_depth = 0
        current_path_length = len(os.path.abspath(current_path))
        path_parts = []
        while current_depth < dir_depth:
            # Adjust name lengths to approach target path length
            file_name_length = adjust_name_lengths(path_length, current_depth, dir_depth, current_path_length, path_parts)
            dir_name = random_string(min_length=3, max_length=file_name_length)
            path_parts.append(dir_name)
            current_path = os.path.join(current_path, dir_name)
            current_path_length += len(dir_name) + 1  # +1 for the os.sep
            current_depth += 1
            # Break early if adding more directories would exceed path length
            if current_path_length >= path_length - 50:  # Reserve some space for file name
                break
        os.makedirs(current_path, exist_ok=True)

        # Generate files in the directory
        for file_idx in range(num_files_in_dir):
            # Generate file name
            file_name_length = max(5, min(15, path_length // 10))  # Heuristic for file name length
            file_name = f"file_{random_string(min_length=3, max_length=file_name_length)}.bin"
            file_path = os.path.join(current_path, file_name)
            # Ensure the path length does not exceed the target
            if len(os.path.abspath(file_path)) > path_length:
                # Adjust by shortening the file name
                # NOTE(review): random_string only takes `new_length` as an upper
                # bound, so the regenerated path may still exceed the target —
                # this stays a best-effort heuristic.
                excess = len(os.path.abspath(file_path)) - path_length
                if excess > 0 and len(file_name) > excess + 4:  # 4 for 'file_' and '.bin'
                    new_length = len(file_name) - excess
                    file_name = f"file_{random_string(min_length=3, max_length=new_length)}.bin"
                    file_path = os.path.join(current_path, file_name)
            # Create the file
            create_file(file_path, content_size)

    print(f"Dataset generation complete. Files are located in '{output_dir}'.")

if __name__ == "__main__":
    main()
Loading

0 comments on commit dfbf0ac

Please sign in to comment.