diff --git a/cmake/Modules/SourceFiles.cmake b/cmake/Modules/SourceFiles.cmake
index 6081c2d2e45..a953668d183 100644
--- a/cmake/Modules/SourceFiles.cmake
+++ b/cmake/Modules/SourceFiles.cmake
@@ -147,6 +147,10 @@ set(VALKEY_CLI_SRCS
     ${CMAKE_SOURCE_DIR}/src/strl.c
     ${CMAKE_SOURCE_DIR}/src/cli_commands.c)
 
+# Benchmark dataset module - used by benchmark binary and unit tests
+set(BENCHMARK_DATASET_SRCS
+    ${CMAKE_SOURCE_DIR}/src/valkey-benchmark-dataset.c)
+
 # valkey-benchmark
 set(VALKEY_BENCHMARK_SRCS
     ${CMAKE_SOURCE_DIR}/src/ae.c
@@ -155,6 +159,7 @@ set(VALKEY_BENCHMARK_SRCS
     ${CMAKE_SOURCE_DIR}/src/sha256.c
     ${CMAKE_SOURCE_DIR}/src/util.c
     ${CMAKE_SOURCE_DIR}/src/valkey-benchmark.c
+    ${BENCHMARK_DATASET_SRCS}
     ${CMAKE_SOURCE_DIR}/src/adlist.c
     ${CMAKE_SOURCE_DIR}/src/dict.c
     ${CMAKE_SOURCE_DIR}/src/zmalloc.c
diff --git a/docs/benchmark.md b/docs/benchmark.md
new file mode 100644
index 00000000000..be43cde71bb
--- /dev/null
+++ b/docs/benchmark.md
@@ -0,0 +1,276 @@
+# Valkey Benchmark
+
+Benchmark utility for measuring Valkey server performance.
+
+```bash
+valkey-benchmark [OPTIONS] [--] [COMMAND ARGS...]
+```
+
+## Connection Options
+
+| Option | Description |
+|--------|-------------|
+| `-h <hostname>` | Server hostname (default: 127.0.0.1) |
+| `-p <port>` | Server port (default: 6379) |
+| `-s <socket>` | Server socket (overrides host and port) |
+| `-u <uri>` | Server URI: `valkey://user:password@host:port/dbnum` |
+| `-a <password>` | Password for Valkey Auth |
+| `--user <username>` | Used to send ACL style 'AUTH username pass'. Needs `-a` |
+| `--dbnum <db>` | SELECT the specified db number (default: 0) |
+| `-3` | Start session in RESP3 protocol mode |
+
+## Performance Options
+
+| Option | Description |
+|--------|-------------|
+| `-c <clients>` | Number of parallel connections (default: 50) |
+| `-n <requests>` | Total number of requests (default: 100000) |
+| `--duration <seconds>` | Run the benchmark for the specified number of seconds (mutually exclusive with `-n`) |
+| `--warmup <seconds>` | Run a warmup for the specified number of seconds before recording data |
+| `-d <size>` | Data size of SET/GET value in bytes (default: 3) |
+| `-P <numreq>` | Pipeline `<numreq>` requests (default: 1, no pipeline) |
+| `-k <boolean>` | Keep alive: 1=keep alive, 0=reconnect (default: 1) |
+| `--threads <num>` | Enable multi-thread mode |
+| `--rps <limit>` | Limit requests per second (default: 0, no limit) |
+
+## Test Selection
+
+| Option | Description |
+|--------|-------------|
+| `-t <tests>` | Comma-separated list of tests to run |
+| `-l` | Loop mode: run tests forever |
+| `-I` | Idle mode: open N idle connections and wait |
+
+Available tests: `ping`, `ping_inline`, `ping_mbulk`, `set`, `get`, `incr`, `lpush`, `rpush`, `lpop`, `rpop`, `sadd`, `hset`, `spop`, `zadd`, `zpopmin`, `lrange`, `lrange_100`, `lrange_300`, `lrange_500`, `lrange_600`, `mset`, `mget`, `xadd`, `function_load`, `fcall`
+
+## Output Options
+
+| Option | Description |
+|--------|-------------|
+| `-q` | Quiet mode: show only query/sec values |
+| `--csv` | Output in CSV format |
+| `--precision` | Number of decimal places in latency output (default: 0) |
+
+## Cluster Options
+
+| Option | Description |
+|--------|-------------|
+| `--cluster` | Enable cluster mode |
+| `--rfr <mode>` | Read from replicas: `no`/`yes`/`all` (default: `no`) |
+
+## Randomization Options
+
+| Option | Description |
+|--------|-------------|
+| `-r <keyspacelen>` | Use random keys in range [0, keyspacelen-1] |
+| `--sequential` | Use sequential numbers instead of random |
+| `--seed <num>` | Set random number generator seed |
+
+## Dataset Support
+
+| Option | Description |
+|--------|-------------|
+| `--dataset <file>` | Dataset file for field placeholder replacement |
+| `--maxdocs <count>` | Maximum number of documents to load from dataset (default: unlimited) |
+| `--xml-root-element <name>` | Root element name for XML dataset parsing (required for XML files) |
+
+### File Formats
+
+**CSV**
+```csv
+term,category
+anarchism,politics
+democracy,politics
+```
+Header row required, comma-delimited; field names become `__field:name__` placeholders.
+
+**TSV**
+Tab-delimited with header row.
+
+**XML**
+```xml
+<page>
+  <title>Anarchism</title>
+  <id>12</id>
+  <revision>
+    <timestamp>1317806107</timestamp>
+    <text>Article content...</text>
+  </revision>
+</page>
+```
+Requires the `--xml-root-element` parameter. The root element choice affects which fields are discovered; deeper elements include nested content.
+
+### Dataset Behavior
+
+- One row per command
+- Sequential iteration with wraparound
+- Thread-safe atomic record selection
+- Duplicate XML field names: first occurrence wins
+
+### Usage
+
+```bash
+# CSV dataset
+valkey-benchmark --dataset terms.csv \
+    -n 50000 FT.SEARCH myindex "__field:term__"
+
+# Wikipedia XML
+valkey-benchmark --dataset wiki.xml --xml-root-element page \
+    -n 10000 HSET "doc:__rand_int__" title "__field:title__" body "__field:text__"
+```
+
+**Memory:** The entire dataset is loaded into memory, so large datasets may require GB-scale RAM.
+
+## Additional Options
+
+| Option | Description |
+|--------|-------------|
+| `--enable-tracking` | Send CLIENT TRACKING on |
+| `--num-functions <num>` | Functions in Lua lib (default: 10) |
+| `--num-keys-in-fcall <num>` | Keys for FCALL (default: 1) |
+| `--seed <num>` | RNG seed |
+| `-x` | Read last arg from STDIN |
+| `--mptcp` | Enable MPTCP |
+| `--help` | Show help |
+| `--version` | Show version |
+
+## Placeholder System
+
+### Random Placeholders
+
+| Placeholder | Behavior |
+|-------------|----------|
+| `__rand_int__` | Different random value per occurrence |
+| `__rand_1st__` | Same random value for all occurrences in command |
+| `__rand_2nd__` | Same random value for all occurrences in command |
+| ... | ... |
+| `__rand_9th__` | Same random value for all occurrences in command |
+
+Random values are 12-digit zero-padded numbers in range [0, keyspacelen-1].
+
+### Data Placeholders
+
+| Placeholder | Description |
+|-------------|-------------|
+| `__data__` | Random data of size specified by `-d` option |
+
+### Cluster Placeholders
+
+| Placeholder | Description |
+|-------------|-------------|
+| `{tag}` | Cluster slot hashtag for proper key distribution |
+
+Required in cluster mode to ensure commands route to the correct nodes.
+
+## Command Sequences
+
+Commands can be chained using semicolon separators:
+
+```bash
+valkey-benchmark -- multi ';' set key:__rand_int__ __data__ ';' incr counter ';' exec
+```
+
+### Repetition Syntax
+
+Prefix a command with a number to repeat it:
+
+```bash
+valkey-benchmark -- 5 set key:__rand_int__ value ';' get key:__rand_int__
+```
+
+This executes 5 SET commands followed by 1 GET command per pipeline iteration.
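+
+For example, with `-r 1000` the sequence above could expand to a per-iteration
+command stream like the following (key suffixes are illustrative 12-digit
+zero-padded random values):
+
+```bash
+SET key:000000000017 value
+SET key:000000000352 value
+SET key:000000000891 value
+SET key:000000000004 value
+SET key:000000000266 value
+GET key:000000000730
+```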
+
+## Examples
+
+### Basic Benchmarking
+
+```bash
+# Default benchmark suite
+valkey-benchmark
+
+# Specific tests
+valkey-benchmark -t ping,set,get -n 100000
+
+# Custom data size
+valkey-benchmark -t set -d 1024 -n 50000
+```
+
+### Random Key Distribution
+
+```bash
+# Random keys in range [0, 999999]
+valkey-benchmark -t set,get -r 1000000 -n 100000
+
+# Sequential keys
+valkey-benchmark -t set --sequential -r 1000000 -n 100000
+```
+
+### Dataset-Driven Benchmarking
+
+```bash
+# CSV dataset
+valkey-benchmark --dataset terms.csv \
+    -n 50000 FT.SEARCH myindex "__field:term__"
+
+# Wikipedia XML dataset (page-level)
+valkey-benchmark --dataset wiki_sample.xml --xml-root-element page \
+    -n 10000 HSET "doc:__rand_int__" title "__field:title__" content "__field:text__" id "__field:id__"
+
+# Wikipedia XML dataset (revision-level)
+valkey-benchmark --dataset wiki_sample.xml --xml-root-element revision \
+    -n 10000 HSET "doc:__rand_int__" content "__field:text__" timestamp "__field:timestamp__"
+
+# Multiple field usage
+valkey-benchmark --dataset products.csv \
+    -- HSET product:__field:id__ name "__field:name__" price __field:price__
+```
+
+### Cluster Benchmarking
+
+```bash
+# Cluster mode with proper key distribution
+valkey-benchmark --cluster -t set,get \
+    -- SET key:{tag}:__rand_int__ __data__
+
+# Read from replicas
+valkey-benchmark --cluster --rfr yes -t get \
+    -- GET key:{tag}:__rand_int__
+```
+
+### Pipelining
+
+```bash
+# Pipeline 10 requests
+valkey-benchmark -P 10 -t set -n 100000
+
+# Pipeline with datasets
+valkey-benchmark --dataset terms.csv -P 5 \
+    -n 50000 FT.SEARCH index "__field:term__"
+```
+
+### Complex Command Sequences
+
+```bash
+# Transaction benchmark
+valkey-benchmark -r 100000 -n 10000 \
+    -- multi ';' set key:__rand_int__ __data__ ';' \
+    incr counter:__rand_int__ ';' exec
+
+# Mixed operations with repetition
+valkey-benchmark -r 100000 \
+    -- 3 set key:__rand_int__ __data__ ';' \
+    2 get key:__rand_int__ ';' \
+    del key:__rand_int__
+```
+
+### Rate Limiting
+
+```bash
+# Limit to 1000 requests/second
+valkey-benchmark --rps 1000 -t set -n 50000
+
+# Dataset with rate limiting
+valkey-benchmark --dataset search_terms.csv --rps 500 \
+    -n 10000 FT.SEARCH index "__field:term__"
+```
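+
+### Combining Features
+
+The options above compose freely. As an illustrative (not prescriptive) sketch,
+a dataset-driven cluster benchmark with pipelining and rate limiting, reusing
+the hypothetical terms.csv dataset from earlier:
+
+```bash
+valkey-benchmark --cluster --dataset terms.csv -P 10 --rps 2000 \
+    -n 100000 -- SET "key:{tag}:__rand_int__" "__field:term__"
+```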
diff --git a/src/Makefile b/src/Makefile
index a0fe81f5b0c..b96dc89f1d4 100644
--- a/src/Makefile
+++ b/src/Makefile
@@ -31,7 +31,7 @@ endif
 ifneq ($(OPTIMIZATION),-O0)
     OPTIMIZATION+=-fno-omit-frame-pointer
 endif
-DEPENDENCY_TARGETS=libvalkey linenoise hdr_histogram fpconv
+DEPENDENCY_TARGETS=libvalkey linenoise lua hdr_histogram fpconv
 NODEPS:=clean distclean
 
 # Default settings
@@ -608,6 +608,7 @@ ENGINE_BENCHMARK_OBJ = \
 	strl.o \
 	util.o \
 	valkey-benchmark.o \
+	valkey-benchmark-dataset.o \
 	zmalloc.o
 ENGINE_CHECK_RDB_NAME=$(ENGINE_NAME)-check-rdb$(PROG_SUFFIX)
 ENGINE_CHECK_AOF_NAME=$(ENGINE_NAME)-check-aof$(PROG_SUFFIX)
@@ -688,8 +689,8 @@ $(ENGINE_LIB_NAME): $(ENGINE_SERVER_OBJ)
 	$(SERVER_AR) rcs $@ $^
 
 # valkey-unit-tests
-$(ENGINE_UNIT_TESTS): $(ENGINE_TEST_OBJ) $(ENGINE_LIB_NAME)
-	$(SERVER_LD) -o $@ $^ ../deps/libvalkey/lib/libvalkey.a ../deps/hdr_histogram/libhdrhistogram.a ../deps/fpconv/libfpconv.a $(FINAL_LIBS)
+$(ENGINE_UNIT_TESTS): $(ENGINE_TEST_OBJ) $(ENGINE_LIB_NAME) valkey-benchmark-dataset.o
+	$(SERVER_LD) -o $@ $^ ../deps/libvalkey/lib/libvalkey.a ../deps/lua/src/liblua.a ../deps/hdr_histogram/libhdrhistogram.a ../deps/fpconv/libfpconv.a $(FINAL_LIBS)
 
 # valkey-sentinel
 $(ENGINE_SENTINEL_NAME): $(SERVER_NAME)
diff --git a/src/unit/CMakeLists.txt b/src/unit/CMakeLists.txt
index 189c1cb1d9a..314da00e19b 100644
--- a/src/unit/CMakeLists.txt
+++ b/src/unit/CMakeLists.txt
@@ -23,7 +23,7 @@ add_library(valkeylib STATIC ${VALKEY_SERVER_SRCS})
 target_compile_options(valkeylib PRIVATE "${COMPILE_FLAGS}")
 target_compile_definitions(valkeylib PRIVATE "${COMPILE_DEFINITIONS}")
 
-add_executable(valkey-unit-tests ${UNIT_TEST_SRCS})
+add_executable(valkey-unit-tests ${UNIT_TEST_SRCS} ${BENCHMARK_DATASET_SRCS})
 target_compile_options(valkey-unit-tests PRIVATE "${COMPILE_FLAGS}")
 target_compile_definitions(valkey-unit-tests PRIVATE "${COMPILE_DEFINITIONS}")
 add_dependencies(valkey-unit-tests generate_test_files_h)
diff --git a/src/unit/test_dataset.c b/src/unit/test_dataset.c
new file mode 100644
index 00000000000..4893250ed04
--- /dev/null
+++ b/src/unit/test_dataset.c
@@ -0,0 +1,201 @@
+#include <stdio.h>
+#include <stdlib.h>
+#include <string.h>
+#include <unistd.h>
+#include "../valkey-benchmark-dataset.h"
+#include "../zmalloc.h"
+#include "../sds.h"
+#include "test_help.h"
+
+#define UNUSED(x) (void)(x)
+
+static char *create_test_file(const char *suffix, const char *content) {
+    static char filename[512];
+    snprintf(filename, sizeof(filename), "/tmp/test_dataset_%d_%s", getpid(), suffix);
+    FILE *f = fopen(filename, "w");
+    if (!f) return NULL;
+    if (content) fputs(content, f);
+    fclose(f);
+    return filename;
+}
+
+static void cleanup_test_file(const char *filename) {
+    if (filename) unlink(filename);
+}
+
+int test_field_mapping_optimization(int argc, char **argv, int flags) {
+    UNUSED(argc);
+    UNUSED(argv);
+    UNUSED(flags);
+
+    const char *csv = "a,b,c,d,e,f,g,h\n1,2,3,4,5,6,7,8\n";
+    char *file = create_test_file("mapping.csv", csv);
+    TEST_ASSERT(file != NULL);
+
+    sds arg1 = sdsnew("HSET");
+    sds arg2 = sdsnew("key");
+    sds arg3 = sdsnew("b");
+    sds arg4 = sdsnew("__field:b__");
+    sds arg5 = sdsnew("f");
+    sds arg6 = sdsnew("__field:f__");
+    sds args[] = {arg1, arg2, arg3, arg4, arg5, arg6};
+    dataset *ds = datasetInit(file, NULL, 0, 1, args, 6);
+
+    TEST_ASSERT_MESSAGE("Dataset initialized", ds != NULL);
+    TEST_ASSERT_MESSAGE("Total fields discovered", ds->field_count == 8);
+    TEST_ASSERT_MESSAGE("Only used fields loaded", ds->used_field_count == 2);
+    TEST_ASSERT_MESSAGE("Correct values loaded", !strcmp(ds->records[0].fields[0], "2"));
+    TEST_ASSERT_MESSAGE("Correct values loaded", !strcmp(ds->records[0].fields[1], "6"));
+
+    datasetFree(ds);
+    sdsfree(arg1);
+    sdsfree(arg2);
+    sdsfree(arg3);
+    sdsfree(arg4);
+    sdsfree(arg5);
+    sdsfree(arg6);
+    cleanup_test_file(file);
+    return 0;
+}
+
+int test_csv_quoted_fields(int argc, char **argv, int flags) {
+    UNUSED(argc);
+    UNUSED(argv);
+    UNUSED(flags);
+
+    const char *csv = "id,title,desc\n"
+                      "1,\"Book, Part 1\",\"Quote: \"\"Hello\"\"\"\n";
+    char *file = create_test_file("quoted.csv", csv);
+    TEST_ASSERT(file != NULL);
+
+    sds arg1 = sdsnew("SET");
+    sds arg2 = sdsnew("__field:title__");
+    sds args[] = {arg1, arg2};
+    dataset *ds = datasetInit(file, NULL, 0, 1, args, 2);
+
+    TEST_ASSERT_MESSAGE("Dataset initialized", ds != NULL);
+    TEST_ASSERT_MESSAGE("Quoted comma preserved", !strcmp(ds->records[0].fields[0], "Book, Part 1"));
+
+    datasetFree(ds);
+    sdsfree(arg1);
+    sdsfree(arg2);
+    cleanup_test_file(file);
+    return 0;
+}
+
+int test_field_count_correctness(int argc, char **argv, int flags) {
+    UNUSED(argc);
+    UNUSED(argv);
+    UNUSED(flags);
+
+    const char *csv = "id,title,author\n1,Book,Alice\n";
+    char *file = create_test_file("fields.csv", csv);
+    TEST_ASSERT(file != NULL);
+
+    sds arg1 = sdsnew("GET");
+    sds arg2 = sdsnew("__field:title__");
+    sds args[] = {arg1, arg2};
+    dataset *ds = datasetInit(file, NULL, 0, 1, args, 2);
+
+    TEST_ASSERT_MESSAGE("Dataset initialized", ds != NULL);
+    TEST_ASSERT_MESSAGE("Correct field count", ds->field_count == 3);
+    TEST_ASSERT_MESSAGE("Correct record count", datasetGetRecordCount(ds) == 1);
+
+    datasetFree(ds);
+    sdsfree(arg1);
+    sdsfree(arg2);
+    cleanup_test_file(file);
+    return 0;
+}
+
+int test_excessive_field_name_length(int argc, char **argv, int flags) {
+    UNUSED(argc);
+    UNUSED(argv);
+    UNUSED(flags);
+
+    /* Create a 513-character field name (over MAX_FIELD_NAME_LEN limit) */
+    char excessive_field[514];
+    memset(excessive_field, 'b', 513);
+    excessive_field[513] = '\0';
+
+    /* Build XML with the excessive field name */
+    sds xml = sdsempty();
+    xml = sdscat(xml, "<doc>\n");
+    xml = sdscat(xml, "  <title>ValidValue</title>\n");
+    xml = sdscat(xml, "  <");
+    xml = sdscat(xml, excessive_field);
+    xml = sdscat(xml, ">ExcessiveValue</");
+    xml = sdscat(xml, excessive_field);
+    xml = sdscat(xml, ">\n");
+    xml = sdscat(xml, "</doc>\n");
+
+    char *file = create_test_file("excessive_field.xml", xml);
+    sdsfree(xml);
+    TEST_ASSERT(file != NULL);
+
+    /* Initialize dataset - only the valid field should be loaded */
+    dataset *ds = datasetInit(file, "doc", 1, 0, NULL, 0);
+
+    TEST_ASSERT_MESSAGE("Dataset initialized", ds != NULL);
+    TEST_ASSERT_MESSAGE("Only valid field loaded (513-char field rejected)", ds->field_count == 1);
+    TEST_ASSERT_MESSAGE("Valid field is 'title'", !strcmp(ds->field_names[0], "title"));
+
+    datasetFree(ds);
+    cleanup_test_file(file);
+    return 0;
+}
+
+int test_max_dataset_fields(int argc, char **argv, int flags) {
+    UNUSED(argc);
+    UNUSED(argv);
+    UNUSED(flags);
+
+    /* Build XML with exactly 1000 fields (MAX_DATASET_FIELDS limit) */
+    sds xml = sdsempty();
+    xml = sdscat(xml, "<doc>\n");
+    for (int i = 0; i < 1000; i++) {
+        xml = sdscatprintf(xml, "  <field%d>value%d</field%d>\n", i, i, i);
+    }
+    xml = sdscat(xml, "</doc>\n");
+
+    char *file = create_test_file("max_fields.xml", xml);
+    sdsfree(xml);
+    TEST_ASSERT(file != NULL);
+
+    dataset *ds = datasetInit(file, "doc", 1, 0, NULL, 0);
+
+    TEST_ASSERT_MESSAGE("Dataset initialized", ds != NULL);
+    TEST_ASSERT_MESSAGE("1000 fields accepted (at MAX_DATASET_FIELDS limit)", ds->field_count == 1000);
+    TEST_ASSERT_MESSAGE("First field correct", !strcmp(ds->field_names[0], "field0"));
+    TEST_ASSERT_MESSAGE("Last field correct", !strcmp(ds->field_names[999], "field999"));
+
+    datasetFree(ds);
+    cleanup_test_file(file);
+    return 0;
+}
+
+int test_excessive_dataset_fields(int argc, char **argv, int flags) {
+    UNUSED(argc);
+    UNUSED(argv);
+    UNUSED(flags);
+
+    /* Build XML with 1001 fields (over MAX_DATASET_FIELDS limit) */
+    sds xml = sdsempty();
+    xml = sdscat(xml, "<doc>\n");
+    for (int i = 0; i < 1001; i++) {
+        xml = sdscatprintf(xml, "  <field%d>value%d</field%d>\n", i, i, i);
+    }
+    xml = sdscat(xml, "</doc>\n");
+
+    char *file = create_test_file("excessive_fields.xml", xml);
+    sdsfree(xml);
+    TEST_ASSERT(file != NULL);
+
+    /* Initialize dataset - should fail when exceeding MAX_DATASET_FIELDS */
+    dataset *ds = datasetInit(file, "doc", 1, 0, NULL, 0);
+
+    TEST_ASSERT_MESSAGE("Dataset initialization fails (1001 fields exceeds limit)", ds == NULL);
+
+    cleanup_test_file(file);
+    return 0;
+}
diff --git a/src/unit/test_files.h b/src/unit/test_files.h
index 30042dbf4e3..029f81ce480 100644
--- a/src/unit/test_files.h
+++ b/src/unit/test_files.h
@@ -9,6 +9,12 @@ typedef struct unitTest {
 int test_popcount(int argc, char **argv, int flags);
 int test_crc64(int argc, char **argv, int flags);
 int test_crc64combine(int argc, char **argv, int flags);
+int test_field_mapping_optimization(int argc, char **argv, int flags);
+int test_csv_quoted_fields(int argc, char **argv, int flags);
+int test_field_count_correctness(int argc, char **argv, int flags);
+int test_excessive_field_name_length(int argc, char **argv, int flags);
+int test_max_dataset_fields(int argc, char **argv, int flags);
+int test_excessive_dataset_fields(int argc, char **argv, int flags);
 int test_dictCreate(int argc, char **argv, int flags);
 int test_dictAdd16Keys(int argc, char **argv, int flags);
 int test_dictDisableResize(int argc, char **argv, int flags);
@@ -287,6 +293,7 @@ int test_zmallocAllocZeroByteAndFree(int argc, char **argv, int flags);
 unitTest __test_bitops_c[] = {{"test_popcount", test_popcount}, {NULL, NULL}};
 unitTest __test_crc64_c[] = {{"test_crc64", test_crc64}, {NULL, NULL}};
 unitTest __test_crc64combine_c[] = {{"test_crc64combine", test_crc64combine}, {NULL, NULL}};
+unitTest __test_dataset_c[] = {{"test_field_mapping_optimization", test_field_mapping_optimization}, {"test_csv_quoted_fields", test_csv_quoted_fields}, {"test_field_count_correctness", test_field_count_correctness}, {"test_excessive_field_name_length", test_excessive_field_name_length}, {"test_max_dataset_fields", test_max_dataset_fields}, {"test_excessive_dataset_fields", test_excessive_dataset_fields}, {NULL, NULL}};
 unitTest __test_dict_c[] = {{"test_dictCreate", test_dictCreate}, {"test_dictAdd16Keys", test_dictAdd16Keys}, {"test_dictDisableResize", test_dictDisableResize}, {"test_dictAddOneKeyTriggerResize", test_dictAddOneKeyTriggerResize}, {"test_dictDeleteKeys", test_dictDeleteKeys}, {"test_dictDeleteOneKeyTriggerResize", test_dictDeleteOneKeyTriggerResize}, {"test_dictEmptyDirAdd128Keys", test_dictEmptyDirAdd128Keys}, {"test_dictDisableResizeReduceTo3", test_dictDisableResizeReduceTo3}, {"test_dictDeleteOneKeyTriggerResizeAgain", test_dictDeleteOneKeyTriggerResizeAgain}, {"test_dictBenchmark", test_dictBenchmark}, {NULL, NULL}};
 unitTest __test_endianconv_c[] = {{"test_endianconv", test_endianconv}, {NULL, NULL}};
 unitTest __test_entry_c[] = {{"test_entryCreate", test_entryCreate}, {"test_entryUpdate", test_entryUpdate}, {"test_entryHasexpiry_entrySetExpiry", test_entryHasexpiry_entrySetExpiry}, {"test_entryIsExpired", test_entryIsExpired}, {"test_entryMemUsage_entrySetExpiry_entryUpdate", test_entryMemUsage_entrySetExpiry_entryUpdate}, {"test_entryStringRef", test_entryStringRef}, {NULL, NULL}};
@@ -318,6 +325,7 @@ struct unitTestSuite {
     {"test_bitops.c", __test_bitops_c},
     {"test_crc64.c", __test_crc64_c},
     {"test_crc64combine.c", __test_crc64combine_c},
+    {"test_dataset.c", __test_dataset_c},
     {"test_dict.c", __test_dict_c},
     {"test_endianconv.c", __test_endianconv_c},
     {"test_entry.c", __test_entry_c},
diff --git a/src/valkey-benchmark-dataset.c b/src/valkey-benchmark-dataset.c
new file mode 100644
index 00000000000..0c58dad02ad
--- /dev/null
+++ b/src/valkey-benchmark-dataset.c
@@ -0,0 +1,782 @@
+/* Dataset support for valkey-benchmark
+ *
+ * Copyright Valkey Contributors.
+ * All rights reserved.
+ * SPDX-License-Identifier: BSD-3-Clause
+ */
+
+#include "fmacros.h"
+
+#include "valkey-benchmark-dataset.h"
+#include "zmalloc.h"
+#include <stdio.h>
+#include <stdlib.h>
+#include <string.h>
+#include <stdatomic.h>
+#include <valkey/valkey.h>
+
+/* Internal constants */
+#define PLACEHOLDER_COUNT 10
+#define PLACEHOLDER_LEN 12
+#define XML_TAG_OVERHEAD 16 /* Space for XML tag syntax ("<", ">", "</%s>") and the null terminator */
+
+static const char *PLACEHOLDERS[PLACEHOLDER_COUNT] = {
+    "__rand_int__", "__rand_1st__", "__rand_2nd__", "__rand_3rd__", "__rand_4th__",
+    "__rand_5th__", "__rand_6th__", "__rand_7th__", "__rand_8th__", "__rand_9th__"};
+
+/* Forward declarations */
+static bool datasetBuildFieldMap(dataset *ds, sds *template_argv, int template_argc);
+static sds getFieldValue(const char *row, int column_index, char delimiter);
+static sds getXmlFieldValue(const char *xml_doc, const char *field_name);
+static sds formatBytes(size_t bytes);
+static bool csvDiscoverFields(dataset *ds);
+static bool scanXmlFieldsFromFile(dataset *ds, const char *xml_root_element);
+static bool scanXmlFields(const char *doc_start, const char *doc_end, dataset *ds, const char *start_root_tag, const char *end_root_tag);
+static bool loadXmlDataset(dataset *ds, const char *xml_root_element);
+static bool csvLoadDocuments(dataset *ds);
+static bool shouldStopLoading(dataset *ds);
+static int findFieldIndex(dataset *ds, const char *field_name, size_t field_name_len);
+static const char *extractDatasetFieldValue(dataset *ds, int field_idx, int record_index);
+static sds replaceOccurrence(sds processed_arg, const char *pos, const char *replacement);
+static sds processFieldsInArg(dataset *ds, sds arg, int record_index);
+static sds processRandPlaceholdersForDataSet(sds cmd, _Atomic uint64_t *seq_key, int replace_placeholders, int keyspacelen, int sequential_replacement);
+
+dataset *datasetInit(const char *filename, const char *xml_root_element, int max_documents, int has_field_placeholders, sds *template_argv, int template_argc) {
+    if (!filename) return NULL;
+
+    dataset *ds = zcalloc(sizeof(dataset));
+    if (!ds) return NULL;
+
+    ds->filename = filename;
+    ds->xml_root_element = xml_root_element;
+    ds->max_documents = max_documents;
+
+    /* Validate XML parameters */
+    if (strstr(filename, ".xml") && !xml_root_element) {
+        fprintf(stderr, "Error: XML dataset requires --xml-root-element parameter\n");
+        zfree(ds);
+        return NULL;
+    }
+
+    /* Detect format */
+    if (strstr(filename, ".csv")) {
+        ds->format = DATASET_FORMAT_CSV;
+        ds->delimiter = ',';
+    } else if (strstr(filename, ".tsv")) {
+        ds->format = DATASET_FORMAT_TSV;
+        ds->delimiter = '\t';
+    } else if (strstr(filename, ".xml")) {
+        ds->format = DATASET_FORMAT_XML;
+        ds->delimiter = 0;
+    } else {
+        ds->format = DATASET_FORMAT_CSV;
+        ds->delimiter = ',';
+    }
+
+    /* Discover fields */
+    if (ds->format == DATASET_FORMAT_XML) {
+        if (!scanXmlFieldsFromFile(ds, xml_root_element)) goto error;
+    } else {
+        if (!csvDiscoverFields(ds)) goto error;
+    }
+
+    /* Build field map if needed (BEFORE loading) */
+    if (has_field_placeholders && template_argv && template_argc > 0) {
+        if (!datasetBuildFieldMap(ds, template_argv, template_argc)) goto error;
+    } else {
+        ds->used_field_count = ds->field_count;
+    }
+
+    /* Load data with correct field count */
+    if (ds->format == DATASET_FORMAT_XML) {
+        if (!loadXmlDataset(ds, xml_root_element)) goto error;
+    } else {
+        if (!csvLoadDocuments(ds)) goto error;
+    }
+
+    return ds;
+
+error:
+    datasetFree(ds);
+    return NULL;
+}
+
+void datasetFree(dataset *ds) {
+    if (!ds) return;
+
+    if (ds->field_names) {
+        /* Unified memory management: all formats use zmalloc + individual sdsnew */
+        for (int i = 0; i < ds->field_count; i++) {
+            sdsfree(ds->field_names[i]);
+        }
+        zfree(ds->field_names);
+    }
+
+    if (ds->field_map) {
+        zfree(ds->field_map);
+    }
+
+    if (ds->records) {
+        for (size_t i = 0; i < ds->record_count; i++) {
+            if (ds->records[i].fields) {
+                for (int j = 0; j < ds->used_field_count; j++) {
+                    sdsfree(ds->records[i].fields[j]);
+                }
+                zfree(ds->records[i].fields);
+            }
+        }
+        zfree(ds->records);
+    }
+
+    zfree(ds);
+}
+
+bool datasetBuildFieldMap(dataset *ds, sds *template_argv, int template_argc) {
+    if (!ds) return false;
+
+    ds->field_map = zmalloc(ds->field_count * sizeof(int));
+    ds->used_field_count = 0;
+
+    for (int i = 0; i < ds->field_count; i++) {
+        ds->field_map[i] = -1;
+    }
+
+    for (int arg_idx = 0; arg_idx < template_argc; arg_idx++) {
+        const char *arg = template_argv[arg_idx];
+        const char *field_pos = strstr(arg, FIELD_PREFIX);
+
+        while (field_pos) {
+            const char *field_start = field_pos + FIELD_PREFIX_LEN;
+            const char *field_end = strstr(field_start, FIELD_SUFFIX);
+            if (!field_end) break;
+
+            size_t field_name_len = field_end - field_start;
+            sds field_name = sdsnewlen(field_start, field_name_len);
+
+            int field_idx = -1;
+            for (int k = 0; k < ds->field_count; k++) {
+                if (!strcmp(field_name, ds->field_names[k])) {
+                    field_idx = k;
+                    break;
+                }
+            }
+
+            if (field_idx == -1) {
+                fprintf(stderr, "Error: Field placeholder '__field:%s__' not found in dataset fields\n", field_name);
+                fprintf(stderr, "Available fields: ");
+                for (int j = 0; j < ds->field_count; j++) {
+                    fprintf(stderr, "%s%s", ds->field_names[j], (j < ds->field_count - 1) ? ", " : "\n");
+                }
+                sdsfree(field_name);
+                return false;
+            }
+
+            if (ds->field_map[field_idx] == -1) {
+                ds->field_map[field_idx] = ds->used_field_count++;
+            }
+
+            sdsfree(field_name);
+            field_pos = strstr(field_end + FIELD_SUFFIX_LEN, FIELD_PREFIX);
+        }
+    }
+
+    return true;
+}
+
+bool datasetLoad(dataset *ds, const char *xml_root_element) {
+    if (!ds) return false;
+    (void)xml_root_element; /* Use stored value in ds */
+
+    if (ds->format == DATASET_FORMAT_XML) {
+        return loadXmlDataset(ds, ds->xml_root_element);
+    } else {
+        return csvLoadDocuments(ds);
+    }
+}
+
+size_t datasetGetRecordCount(dataset *ds) {
+    return ds ? ds->record_count : 0;
+}
+
+void datasetReportMemory(dataset *ds) {
+    if (!ds) return;
+
+    size_t total_memory = 0;
+    for (size_t i = 0; i < ds->record_count; i++) {
+        for (int j = 0; j < ds->used_field_count; j++) {
+            total_memory += sdslen(ds->records[i].fields[j]);
+        }
+    }
+    sds size_str = formatBytes(total_memory);
+    printf("Dataset: %zu documents (%s)\n", ds->record_count, size_str);
+    sdsfree(size_str);
+}
+
+sds datasetGenerateCommand(dataset *ds, int record_index, sds *template_argv, int template_argc, _Atomic uint64_t *seq_key, int replace_placeholders, int keyspacelen, int sequential_replacement) {
+    if (!ds || !template_argv) return NULL;
+
+    sds *processed_argv = zmalloc(template_argc * sizeof(sds));
+    for (int i = 0; i < template_argc; i++) {
+        processed_argv[i] = processFieldsInArg(ds, sdsdup(template_argv[i]), record_index);
+    }
+
+    char *cmd;
+    int len = valkeyFormatCommandArgv(&cmd, template_argc, (const char **)processed_argv, NULL);
+    sds result = sdsnewlen(cmd, len);
+    free(cmd);
+
+    result = processRandPlaceholdersForDataSet(result, seq_key, replace_placeholders,
+                                               keyspacelen, sequential_replacement);
+
+    for (int i = 0; i < template_argc; i++) {
+        sdsfree(processed_argv[i]);
+    }
+    zfree(processed_argv);
+
+    return result;
+}
+
+static sds formatBytes(size_t bytes) {
+    if (bytes < 1024) {
+        return sdscatprintf(sdsempty(), "%zu bytes", bytes);
+    } else if (bytes < 1024 * 1024) {
+        return sdscatprintf(sdsempty(), "%.2f KB", bytes / 1024.0);
+    } else if (bytes < 1024 * 1024 * 1024) {
+        return sdscatprintf(sdsempty(), "%.2f MB", bytes / (1024.0 * 1024.0));
+    } else {
+        return sdscatprintf(sdsempty(), "%.2f GB", bytes / (1024.0 * 1024.0 * 1024.0));
+    }
+}
+
+static bool shouldStopLoading(dataset *ds) {
+    if (ds->max_documents > 0 && (int)ds->record_count >= ds->max_documents) {
+        return true;
+    }
+    return false;
+}
+
+static sds getFieldValue(const char *row, int column_index, char delimiter) {
+    int current_col = 0;
+    const char *start = row;
+    const char *p = row;
+    int in_quotes = 0;
+
+    while (*p) {
+        if (*p == '"') {
+            in_quotes = !in_quotes;
+        } else if (*p == delimiter && !in_quotes) {
+            if (current_col == column_index) {
+                size_t len = p - start;
+                if (len > 0 && start[0] == '"' && p[-1] == '"') {
+                    start++;
+                    len -= 2;
+                }
+                return sdsnewlen(start, len);
+            }
+            current_col++;
+            start = p + 1;
+        }
+        p++;
+    }
+
+    if (current_col == column_index) {
+        size_t len = p - start;
+        if (len > 0 && start[0] == '"' && p[-1] == '"') {
+            start++;
+            len -= 2;
+        }
+        return sdsnewlen(start, len);
+    }
+
+    return sdsempty();
+}
+
+static sds getXmlFieldValue(const char *xml_doc, const char *field_name) {
+    size_t field_len = strlen(field_name);
+    if (field_len > MAX_FIELD_NAME_LEN) {
+        return sdsempty();
+    }
+
+    char start_tag_prefix[MAX_FIELD_NAME_LEN + XML_TAG_OVERHEAD], end_tag[MAX_FIELD_NAME_LEN + XML_TAG_OVERHEAD];
+    snprintf(start_tag_prefix, sizeof(start_tag_prefix), "<%s", field_name);
+    snprintf(end_tag, sizeof(end_tag), "</%s>", field_name);
+
+    const char *tag_start = strstr(xml_doc, start_tag_prefix);
+    if (!tag_start) return sdsempty();
+
+    const char *tag_end = strchr(tag_start, '>');
+    if (!tag_end) return sdsempty();
+
+    if (tag_end > tag_start && tag_end[-1] == '/') {
+        return sdsempty();
+    }
+
+    const char *content_start = tag_end + 1;
+    const char *closing_tag = strstr(content_start, end_tag);
+    if (!closing_tag) return sdsempty();
+
+    size_t content_len = closing_tag - content_start;
+    return sdsnewlen(content_start, content_len);
+}
+
+static bool csvDiscoverFields(dataset *ds) {
+    FILE *fp = fopen(ds->filename, "r");
+    if (!fp) {
+        fprintf(stderr, "Cannot open dataset file: %s\n", ds->filename);
+        return false;
+    }
+
+    char *line = NULL;
+    size_t len = 0;
+    if (getline(&line, &len, fp) == -1) {
+        fprintf(stderr, "Cannot read header from dataset file\n");
+        free(line);
+        fclose(fp);
+        return false;
+    }
+
+    len = strlen(line);
+    if (len > 0 && line[len - 1] == '\n') line[len - 1] = '\0';
+    if (len > 1 && line[len - 2] == '\r') line[len - 2] = '\0';
+
+    int count;
+    char delim_str[2] = {ds->delimiter, '\0'};
+    sds *temp = sdssplitlen(line, strlen(line), delim_str, 1, &count);
+
+    ds->field_names = zmalloc(count * sizeof(sds));
+    for (int i = 0; i < count; i++) {
+        ds->field_names[i] = sdsdup(temp[i]);
+    }
+    sdsfreesplitres(temp, count);
+    ds->field_count = count;
+
+    free(line);
+    fclose(fp);
+    return true;
+}
+
+static bool scanXmlFields(const char *doc_start, const char *doc_end, dataset *ds, const char *start_root_tag, const char *end_root_tag) {
+    char field_names[MAX_DATASET_FIELDS][MAX_FIELD_NAME_LEN + 1];
+    int field_count = 0;
+    int root_start_tag_len = strlen(start_root_tag);
+    int root_end_tag_len = strlen(end_root_tag);
+
+    const char *current_pos = doc_start;
+    while ((current_pos = strchr(current_pos, '<')) != NULL && current_pos < doc_end) {
+        if (current_pos[1] == '/' || current_pos[1] == '!' ||
+            !strncmp(current_pos, start_root_tag, root_start_tag_len) ||
+            !strncmp(current_pos, end_root_tag, root_end_tag_len)) {
+            current_pos++;
+            continue;
+        }
+
+        const char *tag_end = strchr(current_pos, '>');
+        if (!tag_end || tag_end >= doc_end) break;
+
+        const char *field_start = current_pos + 1;
+        const char *field_name_end = field_start;
+
+        while (field_name_end < tag_end && *field_name_end != ' ' && *field_name_end != '\t') {
+            field_name_end++;
+        }
+
+        size_t field_name_len = field_name_end - field_start;
+
+        if (field_name_len == 0 || field_name_len > MAX_FIELD_NAME_LEN) {
+            current_pos = tag_end + 1;
+            continue;
+        }
+
+        int is_duplicate = 0;
+        for (int i = 0; i < field_count; i++) {
+            if (strlen(field_names[i]) == field_name_len &&
+                !memcmp(field_names[i], field_start, field_name_len)) {
+                is_duplicate = 1;
+                break;
+            }
+        }
+
+        if (!is_duplicate) {
+            if (field_count >= MAX_DATASET_FIELDS) {
+                fprintf(stderr, "Error: Dataset contains more than %d fields (limit exceeded)\n", MAX_DATASET_FIELDS);
+                return false;
+            }
+            memcpy(field_names[field_count], field_start, field_name_len);
+            field_names[field_count][field_name_len] = '\0';
+            field_count++;
+        }
+
+        current_pos = tag_end + 1;
+    }
+
+    if (field_count == 0) return false;
+
+    ds->field_names = zmalloc(field_count * sizeof(sds));
+    for (int i = 0; i < field_count; i++) {
+        ds->field_names[i] = sdsnew(field_names[i]);
+    }
+    ds->field_count = field_count;
+
+    return true;
+}
+
+static bool scanXmlFieldsFromFile(dataset *ds, const char *xml_root_element) {
+    FILE *fp = fopen(ds->filename, "r");
+    if (!fp) return false;
+
+    char start_tag_prefix[MAX_FIELD_NAME_LEN + XML_TAG_OVERHEAD], start_tag[MAX_FIELD_NAME_LEN + XML_TAG_OVERHEAD], end_tag[MAX_FIELD_NAME_LEN + XML_TAG_OVERHEAD];
+    snprintf(start_tag_prefix, sizeof(start_tag_prefix), "<%s", xml_root_element);
+    snprintf(start_tag, sizeof(start_tag), "<%s>", xml_root_element);
+    snprintf(end_tag, sizeof(end_tag), "</%s>", xml_root_element);
+
+    char buffer[1024];
+    sds current_doc = sdsempty();
+
+    while (fgets(buffer, sizeof(buffer), fp)) {
+        current_doc = sdscat(current_doc, buffer);
+
+        const char *tag_prefix_pos = strstr(current_doc, start_tag_prefix);
+        if (!tag_prefix_pos) continue;
+
+        const char *tag_end_pos = strchr(tag_prefix_pos, '>');
+        if (!tag_end_pos) continue;
+
+        const char *doc_start = tag_prefix_pos;
+        const char *doc_end = strstr(tag_end_pos, end_tag);
+        if (!doc_end) continue;
+
+        doc_end += strlen(end_tag);
+
+        bool result = scanXmlFields(doc_start, doc_end, ds, start_tag, end_tag);
+        sdsfree(current_doc);
+        fclose(fp);
+        return result;
+    }
+
+    sdsfree(current_doc);
+    fclose(fp);
+    return false;
+}
+
+static bool loadXmlDataset(dataset *ds, const char *xml_root_element) {
+    FILE *fp = fopen(ds->filename, "r");
+    if (!fp) return false;
+
+    char start_tag_prefix[MAX_FIELD_NAME_LEN + XML_TAG_OVERHEAD], start_tag[MAX_FIELD_NAME_LEN + XML_TAG_OVERHEAD], end_tag[MAX_FIELD_NAME_LEN + XML_TAG_OVERHEAD];
+    size_t end_tag_len;
+    snprintf(start_tag_prefix, sizeof(start_tag_prefix), "<%s", xml_root_element);
+    snprintf(start_tag, sizeof(start_tag), "<%s>", xml_root_element);
+    snprintf(end_tag, sizeof(end_tag), "</%s>", xml_root_element);
+    end_tag_len = strlen(end_tag);
+
+    size_t buffer_capacity = 4 * 1024 * 1024;
+    char *buffer = zmalloc(buffer_capacity);
+    size_t buffer_used = 0;
+    bool fields_discovered = false;
+    size_t capacity = 1000;
+
+    ds->records = zmalloc(sizeof(datasetRecord) * capacity);
+
+    printf("Loading XML dataset from %s...\n", ds->filename);
+
+    if (ds->field_names && ds->field_count > 0) {
+        fields_discovered = true;
+        printf("Using %d fields: ", ds->field_count);
+        for (int i = 0; i < ds->field_count; i++) {
+            printf("%s%s", ds->field_names[i], (i < ds->field_count - 1) ? ", " : "\n");
+        }
+    }
+
+    while (!shouldStopLoading(ds)) {
+        size_t space_available = buffer_capacity - buffer_used;
+
+        if (space_available == 0) {
+            size_t new_capacity = buffer_capacity * 2;
+            buffer = zrealloc(buffer, new_capacity);
+            buffer_capacity = new_capacity;
+            space_available = buffer_capacity - buffer_used;
+        }
+
+        size_t bytes_read = fread(buffer + buffer_used, 1, space_available, fp);
+        buffer_used += bytes_read;
+
+        if (buffer_used == 0) break;
+
+        size_t scan_pos = 0;
+        while (scan_pos < buffer_used) {
+            const char *doc_start = NULL;
+            const char *tag_open_end = NULL;
+
+            for (size_t i = scan_pos; i < buffer_used; i++) {
+                if (buffer[i] == '<' && i + 1 < buffer_used &&
+                    strncmp(buffer + i, start_tag_prefix, strlen(start_tag_prefix)) == 0) {
+                    doc_start = buffer + i;
+
+                    for (size_t j = i + 1; j < buffer_used; j++) {
+                        if (buffer[j] == '>') {
+                            tag_open_end = buffer + j;
+                            break;
+                        }
+                    }
+                    break;
+                }
+            }
+
+            if (!doc_start || !tag_open_end) break;
+
+            const char *doc_end = NULL;
+            for (size_t i = (tag_open_end - buffer); i + end_tag_len <= buffer_used; i++) {
+                if (strncmp(buffer + i, end_tag, end_tag_len) == 0) {
+                    doc_end = buffer + i + end_tag_len;
+                    break;
+                }
+            }
+
+            if (!doc_end) break;
+
+            size_t doc_len = doc_end - doc_start;
+
+            if (!fields_discovered) {
+                if (!scanXmlFields(doc_start, doc_end, ds, start_tag, end_tag)) {
+                    fprintf(stderr, "No XML fields discovered\n");
+                    zfree(buffer);
+                    fclose(fp);
+                    return false;
+                }
+                fields_discovered = true;
+
+                printf("Discovered %d fields: ", ds->field_count);
+                for (int i = 0; i < ds->field_count; i++) {
+                    printf("%s%s", ds->field_names[i], (i < ds->field_count - 1) ? ", " : "\n");
+                }
+            }
+
+            if (ds->record_count >= capacity) {
+                capacity *= 2;
+                ds->records = zrealloc(ds->records, sizeof(datasetRecord) * capacity);
+            }
+
+            datasetRecord *record = &ds->records[ds->record_count];
+            record->fields = zmalloc(sizeof(sds) * ds->used_field_count);
+
+            sds doc_str = sdsnewlen(doc_start, doc_len);
+            if (ds->field_map) {
+                /* With field mapping - only load used fields */
+                for (int i = 0; i < ds->field_count; i++) {
+                    if (ds->field_map[i] >= 0) {
+                        record->fields[ds->field_map[i]] = getXmlFieldValue(doc_str, ds->field_names[i]);
+                    }
+                }
+            } else {
+                /* No field mapping - load all fields directly */
+                for (int i = 0; i < ds->field_count; i++) {
+                    record->fields[i] = getXmlFieldValue(doc_str, ds->field_names[i]);
+                }
+            }
+            sdsfree(doc_str);
+
+            ds->record_count++;
+
+            if (ds->record_count % 1000 == 0) {
+                printf("\rLoaded %zu documents...", ds->record_count);
+                fflush(stdout);
+            }
+
+            scan_pos = doc_end - buffer;
+        }
+
+        if (scan_pos > 0 && scan_pos < buffer_used) {
+            size_t remaining = buffer_used - scan_pos;
+            memmove(buffer, buffer + scan_pos, remaining);
+            buffer_used = remaining;
+        } else if (scan_pos == buffer_used) {
+            buffer_used = 0;
+        }
+
+        if (bytes_read == 0 && (buffer_used == 0 || scan_pos == 0)) {
+            break;
+        }
+    }
+
+    printf("\rLoaded %zu documents%*s\n", ds->record_count, 20, "");
+
+    zfree(buffer);
+    fclose(fp);
+    return true;
+}
+
+static bool csvLoadDocuments(dataset *ds) {
+    FILE *fp = fopen(ds->filename, "r");
+    if (!fp) return false;
+
+    char *line = NULL;
+    size_t len = 0;
+    if (getline(&line, &len, fp) == -1) {
+        fprintf(stderr, "Cannot read header from dataset file\n");
+        free(line);
+        fclose(fp);
+        return false;
+    }
+
+    size_t capacity = 1000;
+    ds->records = zmalloc(sizeof(datasetRecord) * capacity);
+
+    const char *format_name = (ds->format == DATASET_FORMAT_CSV) ? "csv"
+                              : (ds->format == DATASET_FORMAT_TSV) ? "tsv"
+                                                                   : "xml";
+    (void)format_name; /* Suppress output in unit tests */
+
+    int *load_indices = NULL;
+    int load_count = 0;
+    if (ds->field_map) {
+        load_indices = zmalloc(ds->used_field_count * sizeof(int));
+        for (int i = 0; i < ds->field_count; i++) {
+            if (ds->field_map[i] >= 0) {
+                load_indices[load_count++] = i;
+            }
+        }
+    }
+
+    while (getline(&line, &len, fp) != -1 && !shouldStopLoading(ds)) {
+        if (line[0] == '\0' || line[0] == '\n') continue;
+
+        size_t line_len = strlen(line);
+        if (line_len > 0 && line[line_len - 1] == '\n') line[line_len - 1] = '\0';
+        if (line_len > 1 && line[line_len - 2] == '\r') line[line_len - 2] = '\0';
+
+        if (ds->record_count >= capacity) {
+            capacity *= 2;
+            ds->records = zrealloc(ds->records, sizeof(datasetRecord) * capacity);
+        }
+
+        datasetRecord *record = &ds->records[ds->record_count];
+        record->fields = zmalloc(sizeof(sds) * ds->used_field_count);
+
+        if (ds->field_map) {
+            for (int j = 0; j < load_count; j++) {
+                int orig_idx = load_indices[j];
+                int mapped_idx = ds->field_map[orig_idx];
+                record->fields[mapped_idx] = getFieldValue(line, orig_idx, ds->delimiter);
+            }
+        } else {
+            for (int i = 0; i < ds->field_count; i++) {
+                record->fields[i] = getFieldValue(line, i, ds->delimiter);
+            }
+        }
+
+        ds->record_count++;
+    }
+
+    if (load_indices) zfree(load_indices);
+    free(line);
+    fclose(fp);
+    return true;
+}
+
+static int findFieldIndex(dataset *ds, const char *field_name, size_t field_name_len) {
+    for (int k = 0; k < ds->field_count; k++) {
+        if (strlen(ds->field_names[k]) == field_name_len &&
+            !memcmp(ds->field_names[k], field_name, field_name_len)) {
+            return ds->field_map ? ds->field_map[k] : k;
+        }
+    }
+    return -1;
+}
+
+static const char *extractDatasetFieldValue(dataset *ds, int field_idx, int record_index) {
+    return ds->records[record_index].fields[field_idx];
+}
+
+static sds replaceOccurrence(sds processed_arg, const char *pos, const char *replacement) {
+    size_t offset = pos - processed_arg;
+    size_t replacement_len = strlen(replacement);
+    size_t total_len = offset + replacement_len + (sdslen(processed_arg) - offset - PLACEHOLDER_LEN);
+
+    sds result = sdsnewlen(NULL, total_len);
+    char *p = result;
+
+    memcpy(p, processed_arg, offset);
+    p += offset;
+
+    memcpy(p, replacement, replacement_len);
+    p += replacement_len;
+
+    const char *after_start = pos + PLACEHOLDER_LEN;
+    size_t after_len = sdslen(processed_arg) - offset - PLACEHOLDER_LEN;
+    memcpy(p, after_start, after_len);
+
+    sdsfree(processed_arg);
+    return result;
+}
+
+static sds processFieldsInArg(dataset *ds, sds arg, int record_index) {
+    if (!strstr(arg, FIELD_PREFIX)) return arg;
+
+    /* Loop through all field placeholders in the argument */
+    while (strstr(arg, FIELD_PREFIX)) {
+        const char *field_pos = strstr(arg, FIELD_PREFIX);
+        const char *field_start = field_pos + FIELD_PREFIX_LEN;
+        const char *field_end = strstr(field_start, FIELD_SUFFIX);
+        if (!field_end) break;
+
+        size_t field_name_len = field_end - field_start;
+        int field_idx = findFieldIndex(ds, field_start, field_name_len);
+        if (field_idx == -1) break;
+
+        const char *field_value = extractDatasetFieldValue(ds, field_idx, record_index);
+        size_t before_len = field_pos - arg;
+        const char *after_start = field_end + FIELD_SUFFIX_LEN;
+
+        sds result = sdsnewlen(arg, before_len);
+        result = sdscat(result, field_value);
+        result = sdscat(result, after_start);
+
+        sdsfree(arg);
+        arg = result;
+    }
+
+    return arg;
+}
+
+static sds processRandPlaceholdersForDataSet(sds cmd, _Atomic uint64_t *seq_key, int replace_placeholders, int keyspacelen, int sequential_replacement) {
+    if (!replace_placeholders || keyspacelen == 0) return cmd;
+
+    for (int ph = 0; ph < PLACEHOLDER_COUNT; ph++) {
+        if (!strstr(cmd, PLACEHOLDERS[ph])) continue;
+
+        uint64_t shared_key = 0;
+        int generate_shared_key = (ph != 0);
+
+        if (generate_shared_key) {
+            if (sequential_replacement) {
+                shared_key = atomic_fetch_add_explicit(&seq_key[ph], 1, memory_order_relaxed);
+            } else {
+                shared_key = (uint64_t)random();
+            }
+            shared_key %= keyspacelen;
+        }
+
+        size_t search_offset = 0;
+        char *pos;
+        while ((pos = strstr(cmd + search_offset, PLACEHOLDERS[ph])) != NULL) {
+            uint64_t key = generate_shared_key ? shared_key : 0;
+
+            if (!generate_shared_key) {
+                if (sequential_replacement) {
+                    key = atomic_fetch_add_explicit(&seq_key[ph], 1, memory_order_relaxed);
+                } else {
+                    key = (uint64_t)random();
+                }
+                key %= keyspacelen;
+            }
+
+            char key_str[24];
+            snprintf(key_str, sizeof(key_str), "%012llu", (unsigned long long)key);
+
+            size_t offset = pos - cmd;
+            cmd = replaceOccurrence(cmd, pos, key_str);
+            search_offset = offset + PLACEHOLDER_LEN;
+        }
+    }
+
+    return cmd;
+}
diff --git a/src/valkey-benchmark-dataset.h b/src/valkey-benchmark-dataset.h
new file mode 100644
index 00000000000..86c9925cbd4
--- /dev/null
+++ b/src/valkey-benchmark-dataset.h
@@ -0,0 +1,65 @@
+/* Dataset support for valkey-benchmark
+ *
+ * Copyright Valkey Contributors.
+ * All rights reserved.
+ * SPDX-License-Identifier: BSD-3-Clause
+ */
+
+#ifndef VALKEY_BENCHMARK_DATASET_H
+#define VALKEY_BENCHMARK_DATASET_H
+
+#include "sds.h"
+#include <stdatomic.h>
+#include <stdbool.h>
+#include <stdint.h>
+
+/* Dataset constants */
+#define MAX_DATASET_FIELDS 1000
+#define MAX_FIELD_NAME_LEN 512
+#define FIELD_PREFIX "__field:"
+#define FIELD_PREFIX_LEN 8
+#define FIELD_SUFFIX "__"
+#define FIELD_SUFFIX_LEN 2
+
+/* Dataset format types */
+typedef enum datasetFormat {
+    DATASET_FORMAT_CSV = 0,
+    DATASET_FORMAT_TSV,
+    DATASET_FORMAT_XML
+} datasetFormat;
+
+/* Dataset structures */
+typedef struct datasetRecord {
+    sds *fields;
+} datasetRecord;
+
+typedef struct dataset {
+    datasetFormat format;
+    char delimiter;
+    sds *field_names;
+    int field_count;
+    int *field_map;
+    int used_field_count;
+    datasetRecord *records;
+    size_t record_count;
+    const char *filename;
+    const char *xml_root_element;
+    int max_documents;
+} dataset;
+
+/* Initialize dataset from file - returns NULL on error */
+dataset *datasetInit(const char *filename, const char *xml_root_element, int max_documents, int has_field_placeholders, sds *template_argv, int template_argc);
+
+/* Free dataset and all memory */
+void datasetFree(dataset *ds);
+
+/* Get number of records */
+size_t datasetGetRecordCount(dataset *ds);
+
+/* Report memory usage */
+void datasetReportMemory(dataset *ds);
+
+/* Generate complete command for given record index (caller must sdsfree) */
+sds datasetGenerateCommand(dataset *ds, int record_index, sds *template_argv, int template_argc, _Atomic uint64_t *seq_key, int replace_placeholders, int keyspacelen, int sequential_replacement);
+
+#endif /* VALKEY_BENCHMARK_DATASET_H */
diff --git a/src/valkey-benchmark.c b/src/valkey-benchmark.c
index a0738414a08..87b59ab303e 100644
--- a/src/valkey-benchmark.c
+++ b/src/valkey-benchmark.c
@@ -61,6 +61,7 @@
 #include "hdr_histogram.h"
 #include "cli_common.h"
 #include "mt19937-64.h"
+#include "valkey-benchmark-dataset.h"
 
 #define UNUSED(V) ((void)V)
 #define RANDPTR_INITIAL_SIZE 8
@@ -160,6 +161,15 @@ static struct config {
     atomic_uint_fast64_t last_time_ns;
     uint64_t time_per_token;
     uint64_t time_per_burst;
+    /* Dataset support */
+    sds dataset_file;
+    int max_documents;        /* Maximum documents to load from dataset */
+    sds xml_root_element;     /* XML root element name */
+    dataset *current_dataset; /* Current loaded dataset */
+    /* Command template for dataset mode */
+    int template_argc;
+    sds *template_argv;
+    int has_field_placeholders;
 } config;
 
 /* Locations of the placeholders __rand_int__, __rand_1st__,
@@ -171,6 +181,9 @@ static struct placeholders {
     size_t *index_data; /* allocation holding all index data */
 } placeholders;
 
+/* Sequence keys for dataset command generation */
+static _Atomic uint64_t dataset_seq_key[PLACEHOLDER_COUNT] = {0};
+
 typedef struct _client {
     valkeyContext *context;
     sds obuf;
@@ -237,6 +250,7 @@ static int fetchClusterSlotsConfiguration(client c);
 static void updateClusterSlotsConfiguration(void);
 static long long showThroughput(struct aeEventLoop *eventLoop, long long id, void *clientData);
 int runFuzzerClients(const char *host, int port, int max_commands, int parallel_clients, int cluster_mode, int num_keys, cliSSLconfig *ssl_config, const char *log_level, int fuzz_flags);
+static int parseCommandTemplate(int argc, char **argv);
 
 /* Dict callbacks */
 static uint64_t dictSdsHash(const void *key);
@@ -866,8 +880,28 @@ static void writeHandler(aeEventLoop *el, int fd, void *privdata, int mask) {
         return;
     }
 
-    /* Really initialize: replace keys and set start time. */
-    if (config.replace_placeholders) replacePlaceholders(c->obuf + c->prefixlen, config.pipeline);
+    /* Dataset field access mode - completely independent command generation */
+    if (config.has_field_placeholders && config.current_dataset && config.current_dataset->record_count > 0) {
+        static _Atomic uint64_t record_counter = 0;
+
+        /* Generate complete pipeline commands for dataset placeholders */
+        sdssetlen(c->obuf, c->prefixlen);
+        for (int p = 0; p < config.pipeline; p++) {
+            uint64_t record_index = atomic_fetch_add_explicit(&record_counter, 1, memory_order_relaxed) % config.current_dataset->record_count;
+            sds complete_cmd = datasetGenerateCommand(config.current_dataset, record_index,
+                                                      config.template_argv, config.template_argc,
+                                                      dataset_seq_key, config.replace_placeholders,
+                                                      config.keyspacelen, config.sequential_replacement);
+            c->obuf = sdscatlen(c->obuf, complete_cmd, sdslen(complete_cmd));
+            sdsfree(complete_cmd);
+        }
+    } else {
+        /* Standard mode */
+        if (config.replace_placeholders) {
+            replacePlaceholders(c->obuf + c->prefixlen, config.pipeline);
+        }
+    }
+
     if (config.cluster_mode && c->staglen > 0) setClusterKeyHashTag(c);
     c->slots_last_update = atomic_load_explicit(&config.slots_last_update, memory_order_relaxed);
     c->start = ustime();
@@ -1614,6 +1648,55 @@ static void updateClusterSlotsConfiguration(void) {
     pthread_mutex_unlock(&config.is_updating_slots_mutex);
 }
 
+/* Free dataset memory */
+static void cleanupDataset(void) {
+    if (config.current_dataset) {
+        datasetFree(config.current_dataset);
+        config.current_dataset = NULL;
+    }
+}
+
+/* Add RESP command to sequence with repeat count */
+static void addRespCommandToSequence(sds *sds_args, size_t *argvlen, int start, int end, int repeat, sds *cmd_seq, int *seq_len) {
+    char *cmd = NULL;
+    int len = valkeyFormatCommandArgv(&cmd, end - start, (const char **)sds_args + start, argvlen + start);
+    for (int j = 0; j < repeat; j++) {
+        *cmd_seq = sdscatlen(*cmd_seq, cmd, len);
+    }
+    *seq_len += repeat;
+    free(cmd);
+}
+
+/* Parse and set up the command template for dataset field validation */
+static int parseCommandTemplate(int argc, char **argv) {
+    sds *sds_args = getSdsArrayFromArgv(argc, argv, 0);
+    if (!sds_args) {
+        fprintf(stderr, "Invalid quoted string\n");
+        return 0;
+    }
+
+    /* Detect field placeholders */
+    config.has_field_placeholders = 0;
+    for (int i = 0; i < argc; i++) {
+        if (strstr(sds_args[i], FIELD_PREFIX)) {
+            config.has_field_placeholders = 1;
+            break;
+        }
+    }
+
+    if (config.has_field_placeholders) {
+        config.template_argc = argc;
+        config.template_argv = zmalloc(argc * sizeof(sds));
+        for (int i = 0; i < argc; i++) {
+            config.template_argv[i] = sdsdup(sds_args[i]);
+        }
+    }
+
+    sdsfreesplitres(sds_args, argc);
+    return 1;
+}
+
+
 /* Generate random data for the benchmark. See #7196. */
 static void genBenchmarkRandomData(char *data, int count) {
     static uint32_t state = 1234;
@@ -1787,6 +1870,16 @@ int parseOptions(int argc, char **argv) {
             config.num_functions = atoi(argv[++i]);
         } else if (!strcmp(argv[i], "--num-keys-in-fcall")) {
             config.num_keys_in_fcall = atoi(argv[++i]);
+        } else if (!strcmp(argv[i], "--dataset")) {
+            if (lastarg) goto invalid;
+            config.dataset_file = sdsnew(argv[++i]);
+        } else if (!strcmp(argv[i], "--maxdocs")) {
+            if (lastarg) goto invalid;
+            config.max_documents = atoi(argv[++i]);
+            if (config.max_documents <= 0) config.max_documents = -1;
+        } else if (!strcmp(argv[i], "--xml-root-element")) {
+            if (lastarg) goto invalid;
+            config.xml_root_element = sdsnew(argv[++i]);
         } else if (!strcmp(argv[i], "--help")) {
             exit_status = 0;
             goto usage;
@@ -1917,6 +2010,8 @@ int parseOptions(int argc, char **argv) {
 " __rand_1st__       Like __rand_int__ but multiple occurrences will have the same\n"
 "                    value. __rand_2nd__ through __rand_9th__ are also available.\n"
 " __data__           Replaced with data of the size specified by the -d option.\n"
+" __field:name__     Replaced with data from the specified field/column in the\n"
+"                    dataset. Requires --dataset option.\n"
 " {tag}              Replaced with a tag that routes the command to each node in\n"
 "                    a cluster. Include this in key names when running in cluster\n"
 "                    mode.\n"
@@ -2002,7 +2097,13 @@ int parseOptions(int argc, char **argv) {
 "                    loaded when running the 'function_load' test. (default 10).\n"
 " --num-keys-in-fcall <num>\n"
 "                    Sets the number of keys passed to FCALL command when running\n"
-"                    the 'fcall' test. (default 1)\n",
+"                    the 'fcall' test. (default 1)\n"
+" --dataset <file>   Path to CSV/TSV/XML dataset file for field placeholder replacement.\n"
+"                    All fields auto-detected with natural content lengths.\n"
+" --maxdocs <count>  Maximum number of documents to load from dataset file.\n"
+"                    Default: unlimited.\n"
+" --xml-root-element <element>\n"
+"                    Root element name for XML dataset parsing. Required for XML files.\n",
     tls_usage,
     rdma_usage,
 " --mptcp            Enable an MPTCP connection.\n"
@@ -2220,12 +2321,44 @@ int main(int argc, char **argv) {
     config.num_functions = 10;
     config.num_keys_in_fcall = 1;
     config.resp3 = 0;
+    config.dataset_file = NULL;
+    config.max_documents = -1; /* -1 = unlimited */
+    config.xml_root_element = NULL;
+    config.current_dataset = NULL;
+    config.template_argc = 0;
+    config.template_argv = NULL;
+    config.has_field_placeholders = 0;
 
     resetPlaceholders();
 
     i = parseOptions(argc, argv);
     argc -= i;
     argv += i;
 
+    /* Setup dataset if specified */
+    if (config.dataset_file) {
+        if (argc == 0) {
+            fprintf(stderr, "Error: Dataset mode requires a command with field placeholders\n");
+            fprintf(stderr, "Example: SET doc:__rand_int__ \"__field:content__\"\n");
+            exit(1);
+        }
+
+        /* Parse command template and set up field placeholder detection */
+        if (!parseCommandTemplate(argc, argv)) {
+            exit(1);
+        }
+
+        /* Initialize dataset - a single call does everything atomically */
+        config.current_dataset = datasetInit(config.dataset_file, config.xml_root_element,
+                                             config.max_documents,
+                                             config.has_field_placeholders,
+                                             config.template_argv, config.template_argc);
+        if (!config.current_dataset) {
+            fprintf(stderr, "Failed to initialize dataset\n");
+            exit(1);
+        }
+
+        datasetReportMemory(config.current_dataset);
+    }
 
     /* Set default for requests if not specified */
     if (config.requests < 0) config.requests = 100000;
@@ -2384,18 +2517,15 @@ int main(int argc, char **argv) {
         } else if (i == argc || strcmp(";", sds_args[i]) == 0) {
             cmd = NULL;
             if (i == start) continue;
-            /* End of command. RESP-encode and append to sequence. */
-            len = valkeyFormatCommandArgv(&cmd, i - start,
-                                          (const char **)sds_args + start,
-                                          argvlen + start);
-            for (int j = 0; j < repeat; j++) {
-                cmd_seq = sdscatlen(cmd_seq, cmd, len);
-            }
-            seq_len += repeat;
-            free(cmd);
+
+            addRespCommandToSequence(sds_args, argvlen, start, i, repeat, &cmd_seq, &seq_len);
             start = i + 1;
             repeat = 1;
         } else if (strstr(sds_args[i], "__data__")) {
+            if (config.current_dataset) {
+                fprintf(stderr, "Error: __data__ placeholders cannot be used with --dataset option\n");
+                exit(1);
+            }
             /* Replace data placeholders with data of length given by -d. */
             int num_parts;
             sds *parts = sdssplitlen(sds_args[i], sdslen(sds_args[i]),
@@ -2414,6 +2544,7 @@ int main(int argc, char **argv) {
             sds_args[i] = newarg;
             argvlen[i] = sdslen(sds_args[i]);
         }
+        /* NOTE: Field placeholder processing is handled above in the command-level loop to ensure row consistency */
     }
     len = sdslen(cmd_seq); /* adjust the datasize to the parsed command */
@@ -2634,6 +2765,15 @@ int main(int argc, char **argv) {
     freeCliConnInfo(config.conn_info);
     if (config.server_config != NULL) freeServerConfig(config.server_config);
     resetPlaceholders();
+    cleanupDataset();
+
+    /* Clean up command template */
+    if (config.template_argv) {
+        for (int i = 0; i < config.template_argc; i++) {
+            sdsfree(config.template_argv[i]);
+        }
+        zfree(config.template_argv);
+    }
 
     return 0;
 }
diff --git a/tests/integration/valkey-benchmark.tcl b/tests/integration/valkey-benchmark.tcl
index 0cf2906c85e..5525dbfc19f 100644
--- a/tests/integration/valkey-benchmark.tcl
+++ b/tests/integration/valkey-benchmark.tcl
@@ -199,6 +199,246 @@ tags {"benchmark network external:skip logreqres:skip"} {
         assert {$different_count > 0}
     }
 
+    test {benchmark: dataset CSV with field placeholders} {
+        # Create test CSV dataset
+        set csv_data "title,content,author\nTest Title 1,Test Content 1,Author 1\nTest Title 2,Test Content 2,Author 2"
+        set csv_file [tmpfile "dataset.csv"]
+        set fd [open $csv_file w]
+        puts $fd $csv_data
+        close $fd
+
+        set cmd [valkeybenchmark $master_host $master_port "--dataset $csv_file -n 4 -r 10 -- HSET doc:__rand_int__ title \"__field:title__\" content \"__field:content__\""]
+        common_bench_setup $cmd
+        assert_match {*calls=4,*} [cmdstat hset]
+
+        # Verify field data was inserted correctly
+        set keys [r keys "doc:*"]
+        assert {[llength $keys] > 0}
+        set sample_key [lindex $keys 0]
+        set title [r hget $sample_key title]
+        set content [r hget $sample_key content]
+        assert {$title eq "Test Title 1" || $title eq "Test Title 2"}
+        assert {$content eq "Test Content 1" || $content eq "Test Content 2"}
+
+        file delete $csv_file
+    }
+
+    test {benchmark: dataset XML with field placeholders} {
+        # Create test XML dataset matching Wikipedia structure
+        set xml_data "
+<doc>
+<title>XML Title 1</title>
+<abstract>XML Abstract 1</abstract>
+<url>http://example1.com</url>
+<links>
+<sublink>
+<anchor>test1</anchor>
+<link>http://test1.com</link>
+</sublink>
+</links>
+</doc>
+<doc>
+<title>XML Title 2</title>
+<abstract>XML Abstract 2</abstract>
+<url>http://example2.com</url>
+<links>
+<sublink>
+<anchor>test2</anchor>
+<link>http://test2.com</link>
+</sublink>
+</links>
+</doc>"
+        set xml_file [tmpfile "dataset.xml"]
+        set fd [open $xml_file w]
+        puts $fd $xml_data
+        close $fd
+
+        set cmd [valkeybenchmark $master_host $master_port "--dataset $xml_file --xml-root-element doc -n 4 -r 10 -- HSET xml_doc:__rand_int__ title \"__field:title__\" abstract \"__field:abstract__\""]
+        common_bench_setup $cmd
+        assert_match {*calls=4,*} [cmdstat hset]
+
+        # Verify XML field data was inserted correctly
+        set keys [r keys "xml_doc:*"]
+        assert {[llength $keys] > 0}
+        set sample_key [lindex $keys 0]
+        set title [r hget $sample_key title]
+        set abstract [r hget $sample_key abstract]
+        assert {$title eq "XML Title 1" || $title eq "XML Title 2"}
+        assert {$abstract eq "XML Abstract 1" || $abstract eq "XML Abstract 2"}
+
+        file delete $xml_file
+    }
+
+    test {benchmark: dataset with maxdocs limit} {
+        # Create test dataset with multiple rows
+        set csv_data "name,value\nitem1,value1\nitem2,value2\nitem3,value3\nitem4,value4"
+        set csv_file [tmpfile "dataset.csv"]
+        set fd [open $csv_file w]
+        puts $fd $csv_data
+        close $fd
+
+        set cmd [valkeybenchmark $master_host $master_port "--dataset $csv_file --maxdocs 2 -n 4 -r 10 -- SET item:__rand_int__ \"__field:value__\""]
+        common_bench_setup $cmd
+        assert_match {*calls=4,*} [cmdstat set]
+
+        # Should only use first 2 documents due to maxdocs limit
+        set keys [r keys "item:*"]
+        assert {[llength $keys] > 0}
+
+        # Verify ALL keys only contain values from first 2 documents
+        set unique_values {}
+        foreach key $keys {
+            set value [r get $key]
+            assert {$value eq "value1" || $value eq "value2"}
+            if {[lsearch $unique_values $value] == -1} {
+                lappend unique_values $value
+            }
+        }
+
+        file delete $csv_file
+    }
+
+    test {benchmark: dataset error handling - invalid field} {
+        set csv_data "name,value\nitem1,value1"
+        set csv_file [tmpfile "dataset.csv"]
+        set fd [open $csv_file w]
+        puts $fd $csv_data
+        close $fd
+
+        set cmd [valkeybenchmark $master_host $master_port "--dataset $csv_file -n 1 -- SET item:__rand_int__ \"__field:invalid_field__\""]
+
+        # Should fail with invalid field error
+        if {[catch { exec {*}$cmd } error]} {
+            assert_match "*not found in dataset fields*" $error
+        } else {
+            fail "Expected error for invalid field placeholder"
+        }
+
+        file delete $csv_file
+    }
+
+    test {benchmark: dataset TSV with field placeholders} {
+        # Create test TSV dataset (tab-separated values)
+        set tsv_data "name\tvalue\tcount\nitem1\tvalue1\t100\nitem2\tvalue2\t200"
+        set tsv_file [tmpfile "dataset.tsv"]
+        set fd [open $tsv_file w]
+        puts $fd $tsv_data
+        close $fd
+
+        set cmd [valkeybenchmark $master_host $master_port "--dataset $tsv_file -n 4 -r 10 -- HSET tsv_doc:__rand_int__ name \"__field:name__\" value \"__field:value__\" count __field:count__"]
+        common_bench_setup $cmd
+        assert_match {*calls=4,*} [cmdstat hset]
+
+        # Verify TSV field data was inserted correctly
+        set keys [r keys "tsv_doc:*"]
+        assert {[llength $keys] > 0}
+        set sample_key [lindex $keys 0]
+        set name [r hget $sample_key name]
+        set value [r hget $sample_key value]
+        set count [r hget $sample_key count]
+        assert {$name eq "item1" || $name eq "item2"}
+        assert {$value eq "value1" || $value eq "value2"}
+        assert {$count eq "100" || $count eq "200"}
+
+        file delete $tsv_file
+    }
+
+    test {benchmark: XML dataset missing root element error} {
+        # Create test XML dataset
+        set xml_data "<doc><title>XML Title 1</title><abstract>XML Abstract 1</abstract></doc>"
+        set xml_file [tmpfile "dataset.xml"]
+        set fd [open $xml_file w]
+        puts $fd $xml_data
+        close $fd
+
+        # Should fail without --xml-root-element parameter
+        set cmd [valkeybenchmark $master_host $master_port "--dataset $xml_file -n 1 -- SET xml:__rand_int__ \"__field:title__\""]
+
+        if {[catch { exec {*}$cmd } error]} {
+            assert_match "*XML dataset requires --xml-root-element parameter*" $error
+        } else {
+            fail "Expected error for XML dataset without --xml-root-element"
+        }
+
+        file delete $xml_file
+    }
+
+    test {benchmark: dataset with maxdocs larger than available documents} {
+        # Create test dataset with only 2 rows but request maxdocs=5
+        set csv_data "name,value\nitem1,value1\nitem2,value2"
+        set csv_file [tmpfile "dataset.csv"]
+        set fd [open $csv_file w]
+        puts $fd $csv_data
+        close $fd
+
+        set cmd [valkeybenchmark $master_host $master_port "--dataset $csv_file --maxdocs 5 -n 4 -r 10 -- SET item:__rand_int__ \"__field:value__\""]
+        common_bench_setup $cmd
+        assert_match {*calls=4,*} [cmdstat set]
+
+        # Should gracefully use all available documents (2), cycling through them
+        set keys [r keys "item:*"]
+        assert {[llength $keys] > 0}
+
+        # All values should still be only from available documents
+        foreach key $keys {
+            set value [r get $key]
+            assert {$value eq "value1" || $value eq "value2"}
+        }
+
+        file delete $csv_file
+    }
+
+    test {benchmark: mixed placeholders - dataset fields and rand placeholders} {
+        # Test combining __field:name__ with __rand_int__ placeholders
+        set csv_data "category,description\nuser,User Management\norder,Order Processing"
+        set csv_file [tmpfile "dataset.csv"]
+        set fd [open $csv_file w]
+        puts $fd $csv_data
+        close $fd
+
+        set cmd [valkeybenchmark $master_host $master_port "--dataset $csv_file -n 6 -r 100 -- HSET mixed:__rand_int__ category \"__field:category__\" desc \"__field:description__\" score __rand_1st__"]
+        common_bench_setup $cmd
+        assert_match {*calls=6,*} [cmdstat hset]
+
+        # Verify both field and random placeholders work together
+        set keys [r keys "mixed:*"]
+        assert {[llength $keys] > 0}
+        set sample_key [lindex $keys 0]
+        set category [r hget $sample_key category]
+        set desc [r hget $sample_key desc]
+        set score [r hget $sample_key score]
+
+        # Field placeholders should contain dataset values
+        assert {$category eq "user" || $category eq "order"}
+        assert {$desc eq "User Management" || $desc eq "Order Processing"}
+
+        # Random placeholder should be a 12-digit number
+        assert {[string length $score] == 12}
+        assert {[string is digit $score]}
+
+        file delete $csv_file
+    }
+
+    test {benchmark: dataset mode requires field placeholders} {
+        set csv_data "name,value\nitem1,value1\nitem2,value2"
+        set csv_file [tmpfile "dataset.csv"]
+        set fd [open $csv_file w]
+        puts $fd $csv_data
+        close $fd
+
+        # Dataset mode should require field placeholders in the command
+        set cmd [valkeybenchmark $master_host $master_port "--dataset $csv_file -n 10 -r 10 -t set"]
+
+        # Should fail with error about missing field placeholders
+        if {[catch { exec {*}$cmd } error]} {
+            assert_match "*Dataset mode requires a command with field placeholders*" $error
+        } else {
+            fail "Expected error for dataset mode without field placeholders"
+        }
+
+        file delete $csv_file
+    }
+
     test {benchmark: sequential zadd results in expected number of keys} {
         set cmd [valkeybenchmark $master_host $master_port "-r 50 -n 50 --sequential -t zadd"]
         common_bench_setup $cmd