Bulk Storage Benchmark

This example populates a storage with a large number of random terminal chunks, then queries a few by key to verify correctness. It demonstrates how to batch-create items and measure throughput.

Note that the associative index is also built during this process, so near-neighbor search is available as soon as all writes have completed successfully.

Motivated readers can extend this script to benchmark various producers or selectors.

Script

#!/usr/bin/env python3
"""Populate local storage with random terminal chunks and verify retrieval."""

import argparse
import shutil
import tempfile
import time
from kongming import hv, memory


def main():
    """Run the bulk storage benchmark from the command line.

    Parses the CLI options, creates the selected storage backend, writes
    ``--count`` terminal chunks keyed ``"0"`` .. ``str(count - 1)`` in the
    given domain, reports the write throughput, and finally reads back
    roughly 100 evenly spaced keys to verify their identifiers.

    Raises:
        SystemExit: With status 1 when at least one spot-checked chunk does
            not match its expected identifier, or (via ``parser.error``)
            with status 2 when ``--count`` is negative.
    """
    parser = argparse.ArgumentParser(description="Bulk storage benchmark")
    parser.add_argument(
        "-n", "--count", type=int, default=10_000,
        help="Number of terminal chunks to create (default: 10000)",
    )
    parser.add_argument(
        "--model", type=int, default=hv.MODEL_1M_10BIT,
        help="HV model (default: MODEL_1M_10BIT)",
    )
    parser.add_argument(
        "--domain", type=str, default="bench",
        help="Domain name for all chunks (default: bench)",
    )
    parser.add_argument(
        "--backend", type=str,
        choices=["inmemory", "embedded"],
        default="inmemory",
        help="Storage backend (default: inmemory)",
    )
    parser.add_argument(
        "--path", type=str, default=None,
        help="Disk path for embedded backend (default: temp directory)",
    )
    args = parser.parse_args()
    if args.count < 0:
        parser.error("--count must be non-negative")

    # Both names are bound before the try block so the cleanup in ``finally``
    # is safe even when creating the backend itself fails.
    tmpdir = None
    storage = None
    try:
        # --- Create storage ---
        if args.backend == "embedded":
            if args.path:
                path = args.path
            else:
                # Only a directory we created ourselves is removed afterwards;
                # a user-supplied --path is left in place.
                tmpdir = tempfile.mkdtemp()
                path = f"{tmpdir}/bench_store"
            storage = memory.Embedded(args.model, path)
            print(f"Backend: Embedded (path={path})")
        else:
            storage = memory.InMemory(args.model)
            print("Backend: InMemory (BTreeMap, pure in-memory)")

        # --- Write phase ---
        print(f"Writing {args.count:,} terminal chunks …")
        t0 = time.perf_counter()
        for i in range(args.count):
            storage.mem_set(memory.new_terminal(args.domain, str(i)))

        elapsed = time.perf_counter() - t0
        # Guard against a zero interval (e.g. -n 0 on a coarse clock).
        rate = args.count / elapsed if elapsed > 0 else float("inf")
        print(f"  done in {elapsed:.2f}s  ({rate:,.0f} chunks/s)")
        print(f"  item_count = {storage.item_count():,}")

        # --- Read phase: spot-check a few items ---
        # The step is clamped to 1: for counts below 100 the integer division
        # yields 0, which range() rejects with ValueError.
        step = max(1, args.count // 100)
        spot_checks = range(0, args.count, step)
        print(f"Spot-checking keys: {spot_checks}")
        mismatches = 0
        for idx in spot_checks:
            expected = hv.Sparkle.from_word(args.model, args.domain, str(idx))
            chunk = storage.get(args.domain, str(idx))
            if not hv.equal(chunk.id, expected):
                mismatches += 1
                print(f"mismatch at key {idx}: {chunk}")

        if mismatches:
            # Report failure honestly and signal it through the exit status.
            print(f"{mismatches} of {len(spot_checks)} checks FAILED.")
            raise SystemExit(1)
        print("All checks passed.")
    finally:
        # --- Cleanup ---
        if tmpdir:
            # Drop our reference first; presumably this lets the embedded
            # backend release its files before the directory is deleted.
            storage = None
            shutil.rmtree(tmpdir)


# Run the benchmark only when this file is executed as a script, not when
# it is imported as a module.
if __name__ == "__main__":
    main()

Usage

# Default: 10K chunks, in-memory storage substrate.
python bulk_storage.py

# Embedded (disk-backed storage substrate).
python bulk_storage.py --backend embedded

# Embedded with a specific path (tip: use a tmpfs mount for near-in-memory speed)
python bulk_storage.py --backend embedded --path /dev/shm/my_bench

# Custom count
python bulk_storage.py -n 100000

# Different model: pass the numeric model ID (e.g. 1 selects MODEL_64K_8BIT).
python bulk_storage.py -n 10000 --model 1

Keyboard shortcuts

Kongming HV Documentation

Bulk Storage Benchmark

Script

Usage