diff --git a/.docker/Dockerfile.lance b/.docker/Dockerfile.lance index ca2590b..1930307 100644 --- a/.docker/Dockerfile.lance +++ b/.docker/Dockerfile.lance @@ -1,5 +1,6 @@ # Lance Docker image extending base with Rust and Lance FFI for TPC-H benchmarks -FROM ghcr.io/tsafin/tpch-cpp-base:latest +ARG BASE_IMAGE=ghcr.io/tsafin/tpch-cpp-base:latest +FROM ${BASE_IMAGE} LABEL org.opencontainers.image.source="https://github.com/tsafin/tpch-cpp" LABEL org.opencontainers.image.description="TPC-H C++ Lance Build Environment with Arrow/Parquet/Lance" diff --git a/.docker/Dockerfile.orc b/.docker/Dockerfile.orc index 24317ab..8a6b98c 100644 --- a/.docker/Dockerfile.orc +++ b/.docker/Dockerfile.orc @@ -1,5 +1,6 @@ # ORC Docker image extending base with ORC support for TPC-H benchmarks -FROM ghcr.io/tsafin/tpch-cpp-base:latest +ARG BASE_IMAGE=ghcr.io/tsafin/tpch-cpp-base:latest +FROM ${BASE_IMAGE} LABEL org.opencontainers.image.source="https://github.com/tsafin/tpch-cpp" LABEL org.opencontainers.image.description="TPC-H C++ ORC Build Environment with Arrow/Parquet/ORC" diff --git a/.github/workflows/ci.yml b/.github/workflows/ci.yml index 163f1a6..efe900c 100644 --- a/.github/workflows/ci.yml +++ b/.github/workflows/ci.yml @@ -117,9 +117,11 @@ jobs: -DCMAKE_PREFIX_PATH=${{ matrix.deps_path }} \ -DTPCH_ENABLE_ORC=${{ matrix.enable_orc }} \ -DTPCH_ENABLE_LANCE=${{ matrix.enable_lance }} \ + -DTPCH_ENABLE_NATIVE_OPTIMIZATIONS=OFF \ -DTPCH_ENABLE_ASYNC_IO=ON \ -DTPCH_ENABLE_ASAN=OFF \ - -DTPCH_BUILD_TESTS=${{ matrix.enable_tests }} + -DTPCH_BUILD_TESTS=${{ matrix.enable_tests }} \ + -DTPCDS_ENABLE=ON - name: Build project run: cmake --build build -j$(nproc) @@ -127,6 +129,7 @@ jobs: - name: Verify executable and tests run: | test -f build/tpch_benchmark && echo "✓ tpch_benchmark created" + test -f build/tpcds_benchmark && echo "✓ tpcds_benchmark created" test -f build/tests/buffer_lifetime_manager_test && echo "✓ buffer_lifetime_manager_test created" || true test -f 
build/tests/dbgen_batch_iterator_test && echo "✓ dbgen_batch_iterator_test created" || true if [ "${{ matrix.enable_lance }}" = "ON" ]; then @@ -176,12 +179,13 @@ jobs: name: tpch-benchmark-${{ matrix.config }} path: | build/tpch_benchmark + build/tpcds_benchmark build/tests/*_test retention-days: 1 if-no-files-found: error - benchmark-suite: - name: Benchmark Suite + tpch-benchmark-suite: + name: TPC-H Benchmark Suite runs-on: ubuntu-22.04 needs: [resolve-images, build-matrix] timeout-minutes: 20 @@ -330,13 +334,13 @@ jobs: if: always() uses: actions/upload-artifact@v4 with: - name: benchmark-logs-suite-${{ matrix.format }}-${{ matrix.table }} + name: tpch-benchmark-logs-suite-${{ matrix.format }}-${{ matrix.table }} path: benchmark-results/${{ matrix.format }}_${{ matrix.table }}_baseline.log retention-days: 30 if-no-files-found: ignore - optimization-benchmarks: - name: Optimization Benchmarks (${{ matrix.format }}-${{ matrix.mode }}) + tpch-optimization-benchmarks: + name: TPC-H Optimization Benchmarks (${{ matrix.format }}-${{ matrix.mode }}) runs-on: ubuntu-22.04 needs: [resolve-images, build-matrix] timeout-minutes: 20 @@ -533,15 +537,229 @@ jobs: if: always() uses: actions/upload-artifact@v4 with: - name: benchmark-logs-optimization-${{ matrix.format }}-${{ matrix.mode }}-${{ matrix.table }} + name: tpch-benchmark-logs-optimization-${{ matrix.format }}-${{ matrix.mode }}-${{ matrix.table }} path: benchmark-results/${{ matrix.format }}_${{ matrix.table }}_${{ matrix.mode }}.log retention-days: 30 if-no-files-found: ignore + tpcds-benchmark-suite: + name: TPC-DS Benchmark Suite + runs-on: ubuntu-22.04 + needs: [resolve-images, build-matrix] + timeout-minutes: 20 + container: + image: ${{ matrix.build == 'base' && needs.resolve-images.outputs.base_image || matrix.build == 'orc' && needs.resolve-images.outputs.orc_image || needs.resolve-images.outputs.lance_image }} + options: --user root + credentials: + username: ${{ github.actor }} + password: ${{ 
secrets.GITHUB_TOKEN }} + + strategy: + fail-fast: false + matrix: + include: + # CSV format + - format: csv + table: store_returns + build: base + - format: csv + table: store_sales + build: base + - format: csv + table: customer + build: base + - format: csv + table: item + build: base + # Parquet format + - format: parquet + table: store_returns + build: base + - format: parquet + table: store_sales + build: base + - format: parquet + table: customer + build: base + - format: parquet + table: item + build: base + # ORC format + - format: orc + table: store_returns + build: orc + - format: orc + table: store_sales + build: orc + # Lance format + - format: lance + table: store_returns + build: lance + - format: lance + table: store_sales + build: lance + + steps: + - name: Checkout code + uses: actions/checkout@v4 + with: + fetch-depth: 1 + + - name: Download build artifact + uses: actions/download-artifact@v4 + with: + name: tpch-benchmark-${{ matrix.build }} + path: . + + - name: Setup benchmark executable + run: | + chmod +x tpcds_benchmark + mkdir -p benchmark-results + export LD_LIBRARY_PATH=/opt/dependencies/lib:$LD_LIBRARY_PATH + echo "LD_LIBRARY_PATH=$LD_LIBRARY_PATH" >> $GITHUB_ENV + + - name: Run format coverage benchmark + run: | + if ! timeout 600 ./tpcds_benchmark \ + --scale-factor 1 \ + --format ${{ matrix.format }} \ + --table ${{ matrix.table }} \ + --output-dir benchmark-results/ \ + 2>&1 | grep -v "^DEBUG:" | tee "benchmark-results/tpcds_${{ matrix.format }}_${{ matrix.table }}_baseline.log"; then + echo "ERROR: Benchmark failed with exit code $?" 
+ exit 1 + fi + + if grep -q "dumped core" "benchmark-results/tpcds_${{ matrix.format }}_${{ matrix.table }}_baseline.log"; then + echo "ERROR: Benchmark crashed with core dump" + exit 1 + fi + + if grep -qi "unknown format\|unsupported format\|not supported" "benchmark-results/tpcds_${{ matrix.format }}_${{ matrix.table }}_baseline.log"; then + echo "ERROR: Format ${{ matrix.format }} not supported by this build" + exit 1 + fi + + - name: Upload benchmark logs + if: always() + uses: actions/upload-artifact@v4 + with: + name: tpcds-benchmark-logs-suite-${{ matrix.format }}-${{ matrix.table }} + path: benchmark-results/tpcds_${{ matrix.format }}_${{ matrix.table }}_baseline.log + retention-days: 30 + if-no-files-found: ignore + + tpcds-optimization-benchmarks: + name: TPC-DS Optimization Benchmarks (${{ matrix.format }}-${{ matrix.mode }}) + runs-on: ubuntu-22.04 + needs: [resolve-images, build-matrix] + timeout-minutes: 20 + container: + image: ${{ matrix.image == 'base' && needs.resolve-images.outputs.base_image || needs.resolve-images.outputs.lance_image }} + options: --user root + credentials: + username: ${{ github.actor }} + password: ${{ secrets.GITHUB_TOKEN }} + + strategy: + fail-fast: false + matrix: + include: + # Parquet benchmarks + - format: parquet + mode: baseline + table: store_returns + image: base + - format: parquet + mode: baseline + table: store_sales + image: base + - format: parquet + mode: zero-copy + table: store_returns + image: base + - format: parquet + mode: zero-copy + table: store_sales + image: base + # Lance benchmarks + - format: lance + mode: baseline + table: store_returns + image: lance + - format: lance + mode: baseline + table: store_sales + image: lance + - format: lance + mode: zero-copy + table: store_returns + image: lance + - format: lance + mode: zero-copy + table: store_sales + image: lance + + steps: + - name: Checkout code + uses: actions/checkout@v4 + with: + fetch-depth: 1 + + - name: Download build artifact + uses: 
actions/download-artifact@v4 + with: + name: tpch-benchmark-${{ matrix.image }} + path: . + + - name: Setup benchmark executable + run: | + chmod +x tpcds_benchmark + mkdir -p benchmark-results + export LD_LIBRARY_PATH=/opt/dependencies/lib:$LD_LIBRARY_PATH + echo "LD_LIBRARY_PATH=$LD_LIBRARY_PATH" >> $GITHUB_ENV + + - name: Run optimization benchmark + run: | + MODE_FLAGS="" + if [ "${{ matrix.mode }}" = "zero-copy" ]; then + MODE_FLAGS="--zero-copy" + fi + + if ! timeout 600 ./tpcds_benchmark \ + --scale-factor 1 \ + --format ${{ matrix.format }} \ + --table ${{ matrix.table }} \ + --output-dir benchmark-results/ \ + $MODE_FLAGS \ + 2>&1 | grep -v "^DEBUG:" | tee "benchmark-results/tpcds_${{ matrix.format }}_${{ matrix.table }}_${{ matrix.mode }}.log"; then + echo "ERROR: Benchmark failed with exit code $?" + exit 1 + fi + + if grep -q "dumped core" "benchmark-results/tpcds_${{ matrix.format }}_${{ matrix.table }}_${{ matrix.mode }}.log"; then + echo "ERROR: Benchmark crashed with core dump" + exit 1 + fi + + if grep -qi "unknown format\|unsupported format\|not supported" "benchmark-results/tpcds_${{ matrix.format }}_${{ matrix.table }}_${{ matrix.mode }}.log"; then + echo "ERROR: Format ${{ matrix.format }} not supported by this build" + exit 1 + fi + + - name: Upload benchmark logs + if: always() + uses: actions/upload-artifact@v4 + with: + name: tpcds-benchmark-logs-optimization-${{ matrix.format }}-${{ matrix.mode }}-${{ matrix.table }} + path: benchmark-results/tpcds_${{ matrix.format }}_${{ matrix.table }}_${{ matrix.mode }}.log + retention-days: 30 + if-no-files-found: ignore + results-aggregation: name: Aggregate Results runs-on: ubuntu-22.04 - needs: [benchmark-suite, optimization-benchmarks] + needs: [tpch-benchmark-suite, tpch-optimization-benchmarks, tpcds-benchmark-suite, tpcds-optimization-benchmarks] if: always() steps: diff --git a/.github/workflows/docker-images.yml b/.github/workflows/docker-images.yml index 1ac3967..282ef7e 100644 --- 
a/.github/workflows/docker-images.yml +++ b/.github/workflows/docker-images.yml @@ -161,6 +161,16 @@ jobs: submodules: recursive fetch-depth: 1 + - name: Resolve base image + id: base-image + run: | + BRANCH_TAG=$(echo "${GITHUB_REF_NAME}" | tr '/' '-' | tr '[:upper:]' '[:lower:]') + if [ "${{ needs.build-base.result }}" = "success" ]; then + echo "image=${{ env.IMAGE_PREFIX }}-base:${BRANCH_TAG}" >> "$GITHUB_OUTPUT" + else + echo "image=${{ env.IMAGE_PREFIX }}-base:latest" >> "$GITHUB_OUTPUT" + fi + - name: Set up Docker Buildx uses: docker/setup-buildx-action@v3 @@ -187,6 +197,8 @@ jobs: context: . file: .docker/Dockerfile.orc push: true + build-args: | + BASE_IMAGE=${{ steps.base-image.outputs.image }} tags: ${{ steps.meta.outputs.tags }} labels: ${{ steps.meta.outputs.labels }} cache-from: | @@ -219,6 +231,16 @@ jobs: submodules: recursive fetch-depth: 1 + - name: Resolve base image + id: base-image + run: | + BRANCH_TAG=$(echo "${GITHUB_REF_NAME}" | tr '/' '-' | tr '[:upper:]' '[:lower:]') + if [ "${{ needs.build-base.result }}" = "success" ]; then + echo "image=${{ env.IMAGE_PREFIX }}-base:${BRANCH_TAG}" >> "$GITHUB_OUTPUT" + else + echo "image=${{ env.IMAGE_PREFIX }}-base:latest" >> "$GITHUB_OUTPUT" + fi + - name: Set up Docker Buildx uses: docker/setup-buildx-action@v3 @@ -245,6 +267,8 @@ jobs: context: . 
file: .docker/Dockerfile.lance push: true + build-args: | + BASE_IMAGE=${{ steps.base-image.outputs.image }} tags: ${{ steps.meta.outputs.tags }} labels: ${{ steps.meta.outputs.labels }} cache-from: | diff --git a/.gitmodules b/.gitmodules index 252bf94..cb7cd9c 100644 --- a/.gitmodules +++ b/.gitmodules @@ -16,3 +16,6 @@ [submodule "third_party/lance"] path = third_party/lance url = https://github.com/tsafin/lance.git +[submodule "third_party/tpcds"] + path = third_party/tpcds + url = https://github.com/tsafin/tpchds-tools.git diff --git a/CMakeLists.txt b/CMakeLists.txt index 668e799..9c02e30 100644 --- a/CMakeLists.txt +++ b/CMakeLists.txt @@ -26,8 +26,11 @@ option(TPCH_ENABLE_ORC "Enable ORC file format support" OFF) option(TPCH_ENABLE_PAIMON "Enable Apache Paimon table format support" OFF) option(TPCH_ENABLE_ICEBERG "Enable Apache Iceberg table format support" OFF) option(TPCH_ENABLE_LANCE "Enable Lance columnar format support (requires Rust)" OFF) +option(TPCH_USE_PREBUILT_LANCE_FFI "Use pre-compiled Lance FFI library when available" ON) option(TPCH_ENABLE_PERF_COUNTERS "Enable performance counters instrumentation" OFF) option(TPCH_ENABLE_MOLD "Enable mold linker if available (incompatible with GTest in this project)" ON) +option(TPCDS_ENABLE "Enable TPC-DS data generation (tpcds_benchmark executable)" OFF) +option(TPCH_ENABLE_NATIVE_OPTIMIZATIONS "Enable host-specific CPU optimizations such as -march=native" ON) # Compiler configuration include(cmake/CompilerWarnings.cmake) @@ -36,7 +39,11 @@ include(cmake/CompilerWarnings.cmake) if(CMAKE_BUILD_TYPE STREQUAL "Release" OR CMAKE_BUILD_TYPE STREQUAL "RelWithDebInfo") # Enable aggressive optimizations with SIMD support if(CMAKE_CXX_COMPILER_ID MATCHES "GNU|Clang") - add_compile_options(-O3 -march=native) + add_compile_options(-O3) + + if(TPCH_ENABLE_NATIVE_OPTIMIZATIONS) + add_compile_options(-march=native) + endif() # Enable auto-vectorization and report optimizations add_compile_options( @@ -44,22 +51,26 @@ 
if(CMAKE_BUILD_TYPE STREQUAL "Release" OR CMAKE_BUILD_TYPE STREQUAL "RelWithDebI -fopt-info-vec-optimized # Report successful vectorizations ) - # Check for AVX2 support (preferred) - include(CheckCXXCompilerFlag) - check_cxx_compiler_flag("-mavx2" COMPILER_SUPPORTS_AVX2) + if(TPCH_ENABLE_NATIVE_OPTIMIZATIONS) + # Check for AVX2 support (preferred) + include(CheckCXXCompilerFlag) + check_cxx_compiler_flag("-mavx2" COMPILER_SUPPORTS_AVX2) - if(COMPILER_SUPPORTS_AVX2) - message(STATUS "Enabling AVX2 SIMD optimizations") - add_compile_options(-mavx2 -mfma) - else() - # Fallback to SSE4.2 (required for SIMD string utils) - check_cxx_compiler_flag("-msse4.2" COMPILER_SUPPORTS_SSE42) - if(COMPILER_SUPPORTS_SSE42) - message(STATUS "Enabling SSE4.2 SIMD optimizations") - add_compile_options(-msse4.2) + if(COMPILER_SUPPORTS_AVX2) + message(STATUS "Enabling AVX2 SIMD optimizations") + add_compile_options(-mavx2 -mfma) else() - message(WARNING "No SIMD support detected - performance will be degraded") + # Fallback to SSE4.2 (required for SIMD string utils) + check_cxx_compiler_flag("-msse4.2" COMPILER_SUPPORTS_SSE42) + if(COMPILER_SUPPORTS_SSE42) + message(STATUS "Enabling SSE4.2 SIMD optimizations") + add_compile_options(-msse4.2) + else() + message(WARNING "No SIMD support detected - performance will be degraded") + endif() endif() + else() + message(STATUS "Host-specific CPU optimizations disabled (TPCH_ENABLE_NATIVE_OPTIMIZATIONS=OFF)") endif() endif() elseif(CMAKE_BUILD_TYPE STREQUAL "Debug") @@ -258,6 +269,11 @@ set_property(DIRECTORY APPEND PROPERTY CMAKE_CONFIGURE_DEPENDS add_subdirectory(third_party/dbgen EXCLUDE_FROM_ALL) include_directories(${DBGEN_INCLUDE_DIRS}) +# TPC-DS dsdgen objects (built only when TPCDS_ENABLE=ON) +if(TPCDS_ENABLE) + add_subdirectory(third_party/dsdgen EXCLUDE_FROM_ALL) +endif() + # Copy TPC-H distribution file to build directory # Required by dbgen for loading nations, regions, and other lookup tables configure_file( @@ -272,8 +288,19 @@ 
if(TPCH_ENABLE_LANCE AND Lance_FOUND) set(LANCE_FFI_DIR "${CMAKE_CURRENT_SOURCE_DIR}/third_party/lance-ffi") set(LANCE_FFI_LIB_FINAL "${CMAKE_BINARY_DIR}/liblance_ffi.a") - # Check if pre-compiled library exists (e.g. in Docker CI image) - find_library(LANCE_FFI_PREBUILT lance_ffi PATHS "${CMAKE_PREFIX_PATH}/lib" NO_DEFAULT_PATH) + set(LANCE_FFI_PREBUILT "") + if(TPCH_USE_PREBUILT_LANCE_FFI) + # In CI containers we know the exact archive name and location pattern. + # Prefer an explicit path probe over find_library() so a semicolon-based + # CMAKE_PREFIX_PATH cannot cause us to miss the prebuilt archive and + # fall back to a full Rust rebuild. + foreach(_prefix IN LISTS CMAKE_PREFIX_PATH) + if(EXISTS "${_prefix}/lib/liblance_ffi.a") + set(LANCE_FFI_PREBUILT "${_prefix}/lib/liblance_ffi.a") + break() + endif() + endforeach() + endif() if(LANCE_FFI_PREBUILT) message(STATUS "Using pre-compiled Lance FFI library: ${LANCE_FFI_PREBUILT}") @@ -287,7 +314,11 @@ if(TPCH_ENABLE_LANCE AND Lance_FOUND) ) add_custom_target(lance_ffi ALL DEPENDS "${LANCE_FFI_LIB_FINAL}") else() - message(STATUS "Pre-compiled Lance FFI not found, building from source with Rust cargo") + if(TPCH_USE_PREBUILT_LANCE_FFI) + message(STATUS "Pre-compiled Lance FFI not found, building from source with Rust cargo") + else() + message(STATUS "Building Lance FFI library from source with Rust cargo") + endif() set(LANCE_FFI_BUILD_DIR "${CMAKE_BINARY_DIR}/rust") # Determine output library path based on platform and Rust target @@ -559,6 +590,46 @@ if(TPCH_ENABLE_PERF_COUNTERS) target_compile_definitions(tpch_benchmark PRIVATE TPCH_ENABLE_PERF_COUNTERS) endif() +# TPC-DS benchmark executable +if(TPCDS_ENABLE) + add_executable(tpcds_benchmark + src/tpcds_main.cpp + src/dsdgen/dsdgen_wrapper.cpp + src/dsdgen/dsdgen_converter.cpp + ${DSDGEN_OBJECTS} + ) + target_link_libraries(tpcds_benchmark PRIVATE tpch_core) + target_include_directories(tpcds_benchmark PRIVATE ${DSDGEN_INCLUDE_DIRS}) + # dsdgen upstream has 
several globals defined in multiple source files + # (pCurrentFile in driver.c and grammar_support.c, etc.). Allow duplicates + # at link time — the old GCC linker accepted these by default via -fcommon. + target_link_options(tpcds_benchmark PRIVATE -Wl,--allow-multiple-definition) + # dsdgen headers require LINUX=1 and TPCDS=1 to define ds_key_t and enable + # 64-bit support (config.h/#ifdef LINUX). Also needed by dsdgen_wrapper.cpp + # and dsdgen_converter.cpp which include dsdgen C headers. + target_compile_definitions(tpcds_benchmark PRIVATE TPCDS_ENABLE LINUX=1 TPCDS=1 EMBEDDED_DSDGEN=1) + if(TPCH_ENABLE_ORC) + target_compile_definitions(tpcds_benchmark PRIVATE TPCH_ENABLE_ORC) + endif() + if(TPCH_ENABLE_PAIMON) + target_compile_definitions(tpcds_benchmark PRIVATE TPCH_ENABLE_PAIMON) + endif() + if(TPCH_ENABLE_ICEBERG) + target_compile_definitions(tpcds_benchmark PRIVATE TPCH_ENABLE_ICEBERG) + endif() + if(TPCH_ENABLE_LANCE) + target_compile_definitions(tpcds_benchmark PRIVATE TPCH_ENABLE_LANCE) + endif() + if(TPCH_ENABLE_ASYNC_IO AND Uring_FOUND) + target_compile_definitions(tpcds_benchmark PRIVATE TPCH_ENABLE_ASYNC_IO) + endif() + if(TPCH_ENABLE_PERF_COUNTERS) + target_compile_definitions(tpcds_benchmark PRIVATE TPCH_ENABLE_PERF_COUNTERS) + endif() + message(STATUS "TPC-DS support enabled: tpcds_benchmark target added") + install(TARGETS tpcds_benchmark RUNTIME DESTINATION bin) +endif() + # Examples if(TPCH_BUILD_EXAMPLES) add_subdirectory(examples) diff --git a/benchmark-results/LANCE_STREAMING_ZERO_COPY_INVESTIGATION_SF5_STORE_SALES.md b/benchmark-results/LANCE_STREAMING_ZERO_COPY_INVESTIGATION_SF5_STORE_SALES.md new file mode 100644 index 0000000..eaa354e --- /dev/null +++ b/benchmark-results/LANCE_STREAMING_ZERO_COPY_INVESTIGATION_SF5_STORE_SALES.md @@ -0,0 +1,260 @@ +# Lance Streaming Zero-Copy Investigation (SF=5, store_sales) + +Date: 2026-03-09 +Scope: `tpcds_benchmark --format lance --table store_sales --scale-factor 5 --zero-copy` + +## Goal + 
+Investigate extra memory usage and copy overhead in Lance streaming path, with focus on Rust/Tokio overhead and true zero-copy delivery of Arrow batches. + +## Hypotheses + +1. Per-row C++ builder path causes avoidable copies before Rust. +2. Rust stream handoff may add extra copies/queue overhead. +3. Tokio runtime configuration may contribute to memory overhead. +4. Optional schema rewrite in stream path may add avoidable work/copies. + +## Implemented Experiments + +1. `store_sales` direct Arrow column-buffer batching (C++ side) instead of builder append path. +2. Rust-side scatter/gather stream handoff with chunked queue: + - `--lance-sg-batches` + - `--lance-sg-queue-chunks` +3. Rust-side memory stage logging (`--lance-mem-profile`, `--lance-mem-every`). +4. Perf profiling from `~/CLAUDE.md` workflow: + - `perf record --no-buildid -e cpu-clock:u -g -F 99 ...` + +## Key Findings + +1. C++ direct-buffer batching reduced main-thread copy share, but did not remove Tokio-side copy hotspot. +2. Across runs, top copy hotspot remained: + - `tokio-runtime-w libc.so.6 __memmove_avx_unaligned_erms` +3. Scatter/gather changed throughput/stall behavior, but did not consistently reduce Tokio memmove share. +4. Disabling stream schema rewrite (`--lance-no-schema-rewrite`) was not useful for the target copy hotspot. + +## Scatter/Gather Matrix (3 runs each, queue=8) + +Median results from `/tmp/sg_matrix_q8_runs.tsv`: + +| sg-batches | median elapsed | median rate | median stalls | median stall ms | median Tokio memmove | +|---:|---:|---:|---:|---:|---:| +| 1 | 48.54s | 296,639 rows/s | 13 | 44,014.0 | 12.47% | +| 2 | 38.52s | 373,833 rows/s | 13 | 34,079.4 | 12.16% | +| 4 | 48.96s | 294,146 rows/s | 14 | 40,837.7 | 12.53% | +| 8 | 62.08s | 231,948 rows/s | 13 | 51,909.3 | 10.24% | + +Notes: +- Best median throughput in this sample was `sg=2`. +- `sg=8` lowered memmove percentage but had worst median runtime. 
+- Run-to-run variance is high, so medians are required for decisions.
+
+## Cleanup Decisions From This Investigation
+
+Removed as non-useful in recent experiments:
+
+1. `--lance-no-schema-rewrite` path and related FFI/config plumbing.
+2. `--lance-tokio-current-thread` path and related FFI/config plumbing.
+
+Kept:
+
+1. Scatter/gather experiment controls (`--lance-sg-batches`, `--lance-sg-queue-chunks`).
+2. Rust memory profile controls (`--lance-mem-profile`, `--lance-mem-every`).
+3. `store_sales` direct Arrow column-buffer generation path.
+
+## Current Conclusion
+
+The dominant copy hotspot is still inside Rust/Lance processing path (Tokio worker), not in the C++ row-builder layer.
+Scatter/gather is useful as a throughput/stall tuning lever, but not a direct fix for Tokio memmove overhead.
+
+## SF=5 Re-evaluation Across 3 Largest TPC-DS Tables
+
+Tables:
+
+1. `store_sales` (`14,400,052` rows)
+2. `catalog_sales` (`7,199,490` rows)
+3. `web_sales` (`3,599,503` rows)
+
+Command shape:
+
+- `./tpcds_benchmark --format lance --scale-factor 5 --max-rows 0 --zero-copy --zero-copy-mode <mode>`
+
+Initial sweep (`/tmp/tpcds_lance_sf5_modes.txt`):
+
+| table | sync (time, RSS) | async (time, RSS) | auto (time, RSS) |
+|---|---|---|---|
+| store_sales | 18.92s, 101,476 KB | 21.09s, 876,036 KB | 18.61s, 101,732 KB |
+| catalog_sales | 41.80s, 110,636 KB | 10.03s, 1,099,008 KB | 8.08s, 111,252 KB |
+| web_sales | 50.91s, 110,244 KB | 3.78s, 1,068,776 KB | 3.95s, 110,052 KB |
+
+Run-order check showed strong outliers in sync mode for `catalog_sales` and `web_sales`.
+When rerun with flipped order (`auto` then `sync`) on `web_sales`, results were close:
+
+1. `auto`: 4.25s, 109,240 KB
+2. `sync`: 4.37s, 109,428 KB
+
+Conclusion from stable runs:
+
+1. `sync` and `auto` are similar for single-table generation.
+2. `async` consistently increases peak RSS by about `8x-10x`.
+3. Throughput differences are workload/noise-sensitive; memory delta is robust.
+ +## Agreed Next Plan + +1. Add `--zero-copy-mode sync|async|auto`. +2. Make `auto` choose sync for single-table generation. +3. Implement synchronous bounded streaming path for Lance: + - preserve memory capping goal of `--zero-copy` + - avoid Tokio background task/queue overhead for single table +4. Keep async path for cases where overlap can help (for example, multi-table parallel generation). +5. Generalize and clean current `store_sales`-specific column-buffer path into a table-agnostic columnar batching framework. + +## Implementation Status + +Implemented in code: + +1. `--zero-copy-mode auto|sync|async` option. +2. Lance single-table default behavior: + - `auto` selects synchronous bounded mode + - `async` keeps Tokio background streaming mode +3. New bounded buffered flush configuration in Rust FFI (for sync mode memory capping). +4. Removed `store_sales`-specific column-buffer hack path and switched `store_sales` back to generic generation flow for consistency. +5. `tpcds_benchmark` default `--zero-copy-mode` changed to `sync` for single-table Lance generation. +6. Added explicit copy telemetry at close: + - C++ side: `Lance Copy Profile: mode= cxx_to_rust_bytes=...` and async queue peak MB + - Rust side: `Lance FFI copy: reader_batches/rows/input_bytes/rewrap_bytes + SG queue bytes/chunks/peak` + +Still pending: + +1. Table-agnostic generalized columnar batching framework (clean replacement for specialized experiments). + +## Perf Profiling (SF=5, Lance, zero-copy) + +Using `~/CLAUDE.md` workflow: + +- `sudo perf record --no-buildid -e cpu-clock:u -g -F 99 -o /tmp/perf_*.data -- ./tpcds_benchmark ...` +- `sudo perf report --stdio --no-children ...` + +Top-40 `tokio-runtime-w` share in report: + +| table | sync | async | +|---|---:|---:| +| store_sales | 0.00% | 11.87% | +| catalog_sales | 0.00% | 12.50% | +| web_sales | 0.40% | 12.88% | + +Recurring async-specific hotspots: + +1. `tokio-runtime-w libc.so.6 __memmove_avx_unaligned_erms` +2. 
`tokio-runtime-w ...run_count::count_runs` +3. `tokio-runtime-w ...Iterator::fold` + +This confirms meaningful CPU work migration into Tokio worker threads in async mode, together with much higher RSS. + +## Async Memory-Tuning Experiment (Requested Follow-up) + +Target: + +- `store_sales`, SF=5, Lance, `--zero-copy --zero-copy-mode async` + +Matrix (`/tmp/tpcds_async_tuning_store_sales_sf5.log`): + +| config | key params | elapsed | rate | max RSS | +|---|---|---:|---:|---:| +| baseline | `queue=4, sg=1, sgq=4, blocking=8` | 32.21s | 447,177 r/s | 864,288 KB | +| q1_sg1 | `queue=1, sg=1, sgq=1, blocking=8` | 28.00s | 517,007 r/s | 895,780 KB | +| q1_sg2 | `queue=1, sg=2, sgq=1, blocking=8` | 28.52s | 505,492 r/s | 896,136 KB | +| q2_sg2 | `queue=2, sg=2, sgq=2, blocking=8` | 33.15s | 434,802 r/s | 895,072 KB | +| q1_sg4 | `queue=1, sg=4, sgq=1, blocking=8` | 27.91s | 516,731 r/s | 890,796 KB | +| q1_sg1_b2 | `queue=1, sg=1, sgq=1, blocking=2` | 66.09s | 217,918 r/s | 864,548 KB | + +Reference sync run: + +- `--zero-copy --zero-copy-mode sync`: `20.30s`, `709,727 r/s`, `102,308 KB` RSS + +Outcome: + +1. Async queue/chunk tuning changed throughput and queue behavior, but did **not** bring async RSS close to sync. +2. Async RSS stayed in a narrow high band (`~864–896 MB`) despite aggressive queue reduction. +3. Lowering Tokio blocking threads to 2 did not reduce RSS materially, but severely hurt performance. +4. Best async throughput in this sample (`q1_sg4`) is still significantly slower and much higher memory than sync reference. + +Updated recommendation: + +For single-table TPC-DS generation, keep synchronous zero-copy as the default path; treat async as experimental/optional. + +## Async RSS Floor Isolation (Next Step) + +Goal: + +Identify whether the async RSS floor is caused by C++ queue buffering or Rust/Lance-side processing. 
+ +Method: + +`store_sales`, SF=5, `--format lance --zero-copy` with Rust memory profiling: + +- `--lance-mem-profile --lance-mem-every 100` + +Runs (`/tmp/tpcds_mem_isolation_store_sales_sf5.log`): + +1. `sync_profile`: `--zero-copy-mode sync` +2. `async_profile_default`: `--zero-copy-mode async --lance-stream-queue 4 --lance-sg-batches 1 --lance-sg-queue-chunks 4` +3. `async_profile_lowq`: `--zero-copy-mode async --lance-stream-queue 1 --lance-sg-batches 1 --lance-sg-queue-chunks 1` + +Results: + +| case | elapsed | rate | max RSS | C++ queue peak | Rust reader max RSS | Rust RSS after execute | +|---|---:|---:|---:|---:|---:|---:| +| sync_profile | 22.95s | 628,064 r/s | 103,152 KB | n/a | n/a | n/a | +| async_profile_default | 21.14s | 681,483 r/s | 869,800 KB | 5.625 MB | 855,976 KB | 817,292 KB | +| async_profile_lowq | 21.09s | 683,142 r/s | 850,112 KB | 1.406 MB | 822,592 KB | 791,656 KB | + +Interpretation: + +1. Shrinking C++ queue memory by `~4.2 MB` changed total RSS only by `~19.7 MB`. +2. Async run RSS is dominated by Rust/Lance-side memory during stream execution (`reader_next` stage reaching `~823–856 MB`). +3. `reader_input_bytes == reader_rewrap_bytes` in both async runs, confirming schema rewrap itself is not duplicating payload size. +4. SG queue bytes were zero in this test (`sg=1`), so scatter/gather queue buffering is not the source here. + +Conclusion: + +The async memory floor is primarily inside Lance async stream execution (Tokio worker + Lance encode/accumulation), not in the C++ producer queue. Queue-depth tuning alone cannot close the gap to sync memory. + +## Deeper Lance-Side Live-Memory Instrumentation + +Implemented additional Rust-side estimator: + +- `internal_live_est_kb = rss_kb - current_batch_bytes - sg_queue_current_bytes` +- tracked peak as `max_internal_live_est_bytes` + +This instrumentation is emitted in: + +1. per-batch memory logs (`reader_next` / `sg_reader_next`) +2. final copy summary (`Lance FFI copy: ... 
max_internal_live_est_bytes=...`) + +Sample run (SF=5, `store_sales`, async low queue): + +- command: `--zero-copy --zero-copy-mode async --lance-stream-queue 1 --lance-sg-batches 1 --lance-sg-queue-chunks 1 --lance-mem-profile` +- `MAX_RSS_KB=869324` +- `max_internal_live_est_bytes=868282368` (~`828 MB`) +- C++ queue peak remained only `1.40625 MB` + +Key point: + +Even after subtracting producer queue and current batch payload, estimated Lance-internal live memory still rises to ~`828 MB`, reinforcing that the dominant async memory overhead is inside Lance async execution/encoding lifecycle. + +## Post-Implementation Sanity Check (SF=5 store_sales) + +`--format lance --table store_sales --scale-factor 5 --max-rows 0 --zero-copy` + +1. `--zero-copy-mode sync` + - elapsed: `22.07s` + - rate: `652,552 rows/s` + - max RSS: `102,524 KB` +2. `--zero-copy-mode async` + - elapsed: `33.87s` + - rate: `425,172 rows/s` + - max RSS: `851,160 KB` + +Result in this run: + +Synchronous bounded zero-copy mode is both faster and significantly lower memory than async mode for single-table `store_sales` SF=5. 
diff --git a/benchmark-results/STORE_SALES_LANCE_STREAMING_LARGE_FRAGMENT_EXPERIMENT.md b/benchmark-results/STORE_SALES_LANCE_STREAMING_LARGE_FRAGMENT_EXPERIMENT.md new file mode 100644 index 0000000..e0b1b5f --- /dev/null +++ b/benchmark-results/STORE_SALES_LANCE_STREAMING_LARGE_FRAGMENT_EXPERIMENT.md @@ -0,0 +1,71 @@ +# Store Sales Lance Zero-Copy Large Fragment Experiment + +Date: 2026-03-10 +Command: `./tpcds_benchmark --format lance --table store_sales --scale-factor 20 --max-rows 0 --zero-copy --output-dir /tmp` +Mode: sync zero-copy (`--zero-copy-mode sync` default) + +## Baseline before changes +Hardcoded sync flush config in `src/tpcds_main.cpp`: +- `8` batches +- `65,536` rows + +Observed result: +- elapsed: `316.92s` +- throughput: `181,745 rows/s` +- max RSS: `~108 MB` +- output files: `879 data + 879 manifests + 879 transactions` + +## Experiment A: larger fragment / transaction +Changed sync flush config to: +- `128` batches +- `1,048,576` rows + +Observed result: +- elapsed: `210.77s` +- `TIME_SEC=212.26` +- throughput: `273,274 rows/s` +- `MAX_RSS_KB=602084` +- output files: `55 data + 55 manifests + 55 transactions` + +Delta vs baseline: +- throughput: about `+50%` +- file / manifest / transaction count: about `-16x` +- RSS: about `+494 MB` + +Interpretation: +- The original sync path was over-fragmenting badly. +- Larger transactions help a lot. +- The cost is higher bounded memory, but still well below machine capacity. + +## Experiment B: too large +Changed sync flush config to: +- `256` batches +- `2,097,152` rows + +Observed result: +- elapsed: `617.79s` +- `TIME_SEC=623.82` +- throughput: `93,234 rows/s` +- `MAX_RSS_KB=866080` +- output files: `28 data + 28 manifests + 28 transactions` + +Interpretation: +- Reducing transaction count further did not help. +- This setting likely pushes too much buffered data into a worse writeback / stall regime. +- Bigger transactions are not monotonic wins. 
+
+## Conclusion
+For this machine and workload, a moderate increase in transaction / fragment size is the winning direction:
+- `128` / `1,048,576` looks much better than `8` / `65,536`
+- `256` / `2,097,152` is too large
+
+This strongly supports the earlier diagnosis:
+- the main scaling problem is Lance fragment / commit granularity in sync zero-copy mode
+- not a new row-generation CPU hotspot
+
+## Recommended default
+Keep sync zero-copy bounded, but use:
+- `128` batches
+- `1,048,576` rows
+
+Then re-evaluate SF=10/SF=20/SF=100 projections with that setting.
diff --git a/benchmark-results/STORE_SALES_LANCE_STREAMING_RERUN_MATRIX_128x1M.md b/benchmark-results/STORE_SALES_LANCE_STREAMING_RERUN_MATRIX_128x1M.md
new file mode 100644
index 0000000..5cdda86
--- /dev/null
+++ b/benchmark-results/STORE_SALES_LANCE_STREAMING_RERUN_MATRIX_128x1M.md
@@ -0,0 +1,82 @@
+# Store Sales Lance Zero-Copy Rerun Matrix with 128-batch / 1,048,576-row Flush
+
+Date: 2026-03-10
+Command: `./tpcds_benchmark --format lance --table store_sales --scale-factor <SF> --max-rows 0 --zero-copy --output-dir /tmp`
+Mode: sync zero-copy
+Flush setting: `128` batches / `1,048,576` rows
+
+## Rerun results
+
+| SF | elapsed | TIME_SEC | throughput rows/s | MAX_RSS_KB | data files | manifests | txns |
+|---:|---:|---:|---:|---:|---:|---:|---:|
+| 5 | 20.58s | 23.48 | 699,707 | 541,632 | 14 | 14 | 14 |
+| 10 | 50.30s | 50.32 | 572,530 | 571,844 | 28 | 28 | 28 |
+| 20 | 180.06s | 180.08 | 319,894 | 603,268 | 55 | 55 | 55 |
+
+## Previous tiny-fragment baseline
+
+| SF | elapsed | throughput rows/s | MAX_RSS_KB | data/manifests/txns |
+|---:|---:|---:|---:|---:|
+| 5 | 21.87s | 658,385 | 101,260 | about 220 each expected by shape |
+| 10 | 201.33s | 143,053 | 104,732 | about 440 each expected by shape |
+| 20 | 316.92s | 181,745 | 108,100 | 879 each |
+
+## Comparison
+
+Improvements with larger fragments:
+- SF=5: `699,707 / 658,385 = 1.06x`
+- SF=10: `572,530 / 143,053 = 4.00x`
+- SF=20: `319,894 / 181,745 = 1.76x`
+
+Memory tradeoff:
+- RSS rose from about `100 MB` to about `540-603 MB`
+- still well within the machine memory limit
+
+Scaling slope with new setting:
+- SF=5 -> SF=10 throughput drop: `699,707 -> 572,530` (`0.82x`)
+- SF=10 -> SF=20 throughput drop: `572,530 -> 319,894` (`0.56x`)
+
+Conclusion:
+- The catastrophic collapse at SF=10 was largely caused by tiny fragments / transactions.
+- The new setting removes most of that pathologically bad behavior.
+- There is still a noticeable decline by SF=20, so fragment sizing was not the only factor.
+- But it is now plausible to continue profiling at this setting; the previous one was clearly misleadingly bad.
+
+## SF=20 Reprofile at 128-batch / 1,048,576-row Flush
+
+### perf record (`cpu-clock:u`)
+Run result:
+- `elapsed=153.23s`
+- `375,896 rows/s`
+
+Top user-space symbols:
+- `11.12%` `__memmove_avx_unaligned_erms`
+- `9.37%` `tpcds::append_store_sales_to_builders`
+- `9.34%` `decimal_t_op`
+- `8.93%` `genrand_decimal`
+- `7.21%` `arrow::NumericBuilder::Append`
+- `6.14%` `genrand_integer`
+- `6.11%` `arrow::NumericBuilder::Append`
+- `6.01%` `getTableFromColumn`
+- `4.91%` `genrand_key`
+- `3.36%` `set_pricing`
+- `2.61%` `lance_encoding::...run_count::count_runs`
+
+Interpretation:
+- After fixing fragment size, the user-space profile is still dominated by row generation, Arrow append, and memcpy.
+- Lance encoding work is visible but not dominant in user CPU samples.
+
+### perf stat
+Run result:
+- `elapsed=146.03s`
+- `394,427 rows/s`
+- `task-clock=40.82s`
+- `CPUs utilized=0.276`
+- `context-switches=25,138`
+- `cpu-migrations=771`
+- `page-faults=2,661,633`
+
+Interpretation:
+- The fragment fix improved CPU utilization versus the old tiny-fragment path (`~0.276` vs `~0.168`).
+- But the run is still mostly outside user CPU.
+- Remaining slowdown is still dominated by stall / wait / writeback effects, not a new hot loop in row generation.
diff --git a/benchmark-results/STORE_SALES_LANCE_STREAMING_SCALE_PROFILE.md b/benchmark-results/STORE_SALES_LANCE_STREAMING_SCALE_PROFILE.md new file mode 100644 index 0000000..1c4babc --- /dev/null +++ b/benchmark-results/STORE_SALES_LANCE_STREAMING_SCALE_PROFILE.md @@ -0,0 +1,75 @@ +# Store Sales Lance Zero-Copy Scaling Profile (SF=5/10/20) + +Date: 2026-03-09 +Mode: `tpcds_benchmark --format lance --table store_sales --zero-copy` +Observed default mode: `sync` + +## Measured runs + +### perf record (user CPU hotspots) + +| SF | rows | elapsed (run log) | rate rows/s | max rss kb | +|---:|---:|---:|---:|---:| +| 5 | 14,400,052 | 18.17s | 792,469 | 101,260 | +| 10 | 28,800,991 | 66.36s | 434,016 | 104,732 | +| 20 | 57,598,932 | 250.53s | 229,912 | 108,100 | + +Top user-space symbols stayed broadly stable: +- `__memmove_avx_unaligned_erms`: ~9.4% to 10.5% +- `tpcds::append_store_sales_to_builders`: ~7.4% to 10.0% +- `genrand_decimal`, `decimal_t_op`, `arrow::NumericBuilder<...>::Append`: similar ordering at all scales +- No new dominant user-space hotspot appears at SF=10 or SF=20 + +### perf stat (task-clock vs elapsed) + +| SF | elapsed (run log) | task-clock:u | CPUs utilized | page faults | +|---:|---:|---:|---:|---:| +| 5 | 21.87s | 10.83s | 0.495 | 950,875 | +| 10 | 201.33s | 25.56s | 0.127 | 1,943,553 | +| 20 | 316.92s | 53.24s | 0.168 | 4,113,395 | + +Interpretation: +- SF=5 already spends about half of wall time outside user CPU. +- At SF=10 and SF=20 the benchmark spends most wall time stalled or sleeping, not executing user-space compute. +- This is why `perf record -e cpu-clock:u` does not show a new hot function: the slowdown is dominated by non-user-CPU time. 
+ +## Output shape at SF=20 + +Latest SF=20 output under `/tmp/store_sales.lance`: +- Total size: `9.8G` +- Total files: `2637` +- `_versions` manifests: `879` +- `_transactions`: `879` +- `data` files: `879` +- Average data file size: `11.25 MB` +- Estimated rows per data file: `57,598,932 / 879 = 65,528 rows` + +This is the key scaling signal: +- The writer is producing about one data file / one transaction / one manifest per ~65K rows. +- As scale increases, the benchmark performs hundreds more write/commit cycles. +- The CPU hotspot mix does not change, but wall time does, which is consistent with append/commit/writeback overhead rather than row generation cost. + +## Likely root cause + +The throughput collapse is most likely caused by the Lance write path committing too frequently with small fragments: +- many small data files +- many manifest updates +- many transaction files +- increasing filesystem writeback / metadata overhead + +This matches all observed evidence: +- stable user-space hotspots +- flat RSS +- low CPU utilization at larger SF +- high file / manifest / transaction counts at SF=20 + +## What this means + +The main problem is not Arrow batch construction or zero-copy import itself. +The main problem is fragment / commit granularity on the Lance side. + +## Recommended next experiments + +1. Force much larger flush / fragment sizes for sync zero-copy and re-measure SF=20. +2. Measure file count and throughput together to verify the slope improves when data files drop from ~879 to something much smaller. +3. If needed, bypass append-per-chunk behavior and write a single longer stream / transaction for sync mode too. diff --git a/cmake/gen_dist_cache.py b/cmake/gen_dist_cache.py new file mode 100644 index 0000000..b228c22 --- /dev/null +++ b/cmake/gen_dist_cache.py @@ -0,0 +1,322 @@ +#!/usr/bin/env python3 +""" +gen_dist_cache.py - Generate pre-parsed distribution cache C arrays from tpcds.idx. 
+Usage: gen_dist_cache.py <tpcds.idx> <output.c>
+ + +def strtodec_py(s: str): + """ + Replicate dsdgen's strtodec() logic in Python. + Returns (flags, precision, scale, number) matching decimal_t. + + strtodec() sets: + - flags = 0 + - if no decimal point: scale=len(int_str), number=int(s), precision=0 + - else: scale=len(int_part), number=int_part*10^frac_len+int_frac, precision=len(frac) + Then if s starts with '-' and number > 0: number *= -1 + """ + flags = 0 + s = s.strip() + dot = s.find('.') + if dot == -1: + scale = len(s) + number = int(s) if s and s not in ('-', '+') else 0 + precision = 0 + else: + int_part = s[:dot] + frac_part = s[dot+1:] + scale = len(int_part) + base = int(int_part) if int_part and int_part not in ('-', '+') else 0 + frac_val = int(frac_part) if frac_part else 0 + precision = len(frac_part) + number = base + for _ in range(precision): + number *= 10 + number += frac_val + # sign correction: if string starts with '-' but number ended up positive + if s.startswith('-') and number > 0: + number = -number + return (flags, precision, scale, number) + + +def parse_tpcds_idx(filepath: str): + """ + Parse tpcds.idx and return a list of distribution dicts: + { + 'name': str, + 'offset': int, + 'str_space': int, + 'length': int, + 'w_width': int, + 'v_width': int, + 'name_space': int, + 'type_vector': [int, ...], # v_width entries + 'value_sets': [[int,...], ...], # v_width x length offsets into strings + 'strings': bytes, # str_space bytes + } + """ + with open(filepath, 'rb') as f: + data = f.read() + + file_size = len(data) + offset = 0 + + # Read entry_count from the start of the file + entry_count, = struct.unpack_from('>i', data, 0) + + # Index table is at the end + idx_table_offset = file_size - entry_count * IDX_SIZE + + dists = [] + for i in range(entry_count): + base = idx_table_offset + i * IDX_SIZE + name_raw = data[base:base + D_NAME_LEN] + name = name_raw.split(b'\x00')[0].decode('ascii', errors='replace') + (index, d_offset, str_space, length, w_width, v_width, name_space) = 
\ + struct.unpack_from('>7i', data, base + D_NAME_LEN) + + # Parse distribution data at d_offset + pos = d_offset + + # type_vector + type_vector = list(struct.unpack_from('>' + 'i' * v_width, data, pos)) + pos += v_width * 4 + + # weight_sets (skip — not needed for value cache) + pos += w_width * length * 4 + + # value_sets: v_width x length offsets into strings[] + value_sets = [] + for v in range(v_width): + row = list(struct.unpack_from('>' + 'i' * length, data, pos)) + value_sets.append(row) + pos += length * 4 + + # names (skip for now) + pos += name_space + + # strings + strings = data[pos:pos + str_space] + + dists.append({ + 'name': name, + 'offset': d_offset, + 'str_space': str_space, + 'length': length, + 'w_width': w_width, + 'v_width': v_width, + 'name_space': name_space, + 'type_vector': type_vector, + 'value_sets': value_sets, + 'strings': strings, + }) + + return dists + + +def get_string(strings: bytes, offset: int) -> str: + """Extract a null-terminated string from the strings pool.""" + end = strings.index(b'\x00', offset) if b'\x00' in strings[offset:] else len(strings) + return strings[offset:end].decode('ascii', errors='replace') + + +def generate(input_path: str, output_path: str) -> None: + dists = parse_tpcds_idx(input_path) + + lines = [] + lines.append("/* Auto-generated by cmake/gen_dist_cache.py -- do not edit */") + lines.append("/* Pre-parsed TPC-DS distribution cache: eliminates per-row") + lines.append(" atoi/strtodec overhead in dsdgen's dist_op() hot path. 
*/") + lines.append("") + lines.append("#ifdef EMBEDDED_DSDGEN") + lines.append("") + lines.append("#include ") + lines.append("#include /* strcmp */") + lines.append("#include \"decimal.h\"") + lines.append("") + + # Emit per-distribution per-vset arrays + int_entries = [] # (dist_name, vset_idx, c_array_name) + dec_entries = [] # (dist_name, vset_idx, c_array_name) + + for d in dists: + cname = safe_c_ident(d['name']) + length = d['length'] + strings = d['strings'] + + for vi, typ in enumerate(d['type_vector']): + offsets = d['value_sets'][vi] + + if typ == TKN_INT: + arr_name = "tpcds_int_{}_v{}".format(cname, vi) + vals = [] + for j in range(length): + s = get_string(strings, offsets[j]) + try: + vals.append(int(s)) + except ValueError: + vals.append(0) + # emit array + lines.append("static const int {}[{}] = {{".format(arr_name, length)) + # 16 values per row + for chunk_start in range(0, length, 16): + chunk = vals[chunk_start:chunk_start+16] + comma = "," if chunk_start + 16 < length else "" + lines.append(" {}{}".format(", ".join(str(v) for v in chunk), comma)) + lines.append("};") + lines.append("") + int_entries.append((d['name'], vi, arr_name)) + + elif typ == TKN_DECIMAL: + arr_name = "tpcds_dec_{}_v{}".format(cname, vi) + vals = [] + for j in range(length): + s = get_string(strings, offsets[j]) + try: + fl, prec, sc, num = strtodec_py(s) + except Exception: + fl, prec, sc, num = 0, 0, 0, 0 + vals.append((fl, prec, sc, num)) + lines.append("static const decimal_t {}[{}] = {{".format(arr_name, length)) + for j, (fl, prec, sc, num) in enumerate(vals): + comma = "," if j < length - 1 else "" + lines.append(" {{{}, {}, {}, {}LL}}{}".format(fl, prec, sc, num, comma)) + lines.append("};") + lines.append("") + dec_entries.append((d['name'], vi, arr_name)) + + # Emit lookup tables + lines.append("/* --- int cache lookup table --- */") + lines.append("typedef struct { const char *name; int vset; const int *vals; } tpcds_int_entry_t;") + lines.append("static 
const tpcds_int_entry_t tpcds_int_table[] = {") + for (dname, vi, arr) in int_entries: + lines.append(' {{"{}", {}, {}}},'.format(dname, vi, arr)) + lines.append(" {NULL, 0, NULL}") + lines.append("};") + lines.append("") + + lines.append("/* --- decimal cache lookup table --- */") + lines.append("typedef struct { const char *name; int vset; const decimal_t *vals; } tpcds_dec_entry_t;") + lines.append("static const tpcds_dec_entry_t tpcds_dec_table[] = {") + for (dname, vi, arr) in dec_entries: + lines.append(' {{"{}", {}, {}}},'.format(dname, vi, arr)) + lines.append(" {NULL, 0, NULL}") + lines.append("};") + lines.append("") + + # Emit lookup functions + lines.append("const int *tpcds_lookup_int_cache(const char *dist_name, int vset);") + lines.append("const decimal_t *tpcds_lookup_dec_cache(const char *dist_name, int vset);") + lines.append("") + lines.append("const int *tpcds_lookup_int_cache(const char *dist_name, int vset) {") + lines.append(" const tpcds_int_entry_t *e = tpcds_int_table;") + lines.append(" for (; e->name != NULL; ++e)") + lines.append(" if (e->vset == vset && strcmp(e->name, dist_name) == 0)") + lines.append(" return e->vals;") + lines.append(" return NULL;") + lines.append("}") + lines.append("") + lines.append("const decimal_t *tpcds_lookup_dec_cache(const char *dist_name, int vset) {") + lines.append(" const tpcds_dec_entry_t *e = tpcds_dec_table;") + lines.append(" for (; e->name != NULL; ++e)") + lines.append(" if (e->vset == vset && strcmp(e->name, dist_name) == 0)") + lines.append(" return e->vals;") + lines.append(" return NULL;") + lines.append("}") + lines.append("") + lines.append("#endif /* EMBEDDED_DSDGEN */") + lines.append("") + + out_dir = os.path.dirname(output_path) + if out_dir: + os.makedirs(out_dir, exist_ok=True) + + with open(output_path, 'w') as f: + f.write('\n'.join(lines) + '\n') + + n_int = len(int_entries) + n_dec = len(dec_entries) + n_dists = len(dists) + print("Parsed {} distributions, {} int arrays, {} 
decimal arrays -> {}".format( + n_dists, n_int, n_dec, os.path.basename(output_path))) + + +def main() -> None: + if len(sys.argv) != 3: + print("Usage: gen_dist_cache.py ", file=sys.stderr) + sys.exit(1) + + input_path = sys.argv[1] + output_path = sys.argv[2] + + if not os.path.exists(input_path): + print("Error: input file not found: {}".format(input_path), file=sys.stderr) + sys.exit(1) + + generate(input_path, output_path) + + +if __name__ == "__main__": + main() diff --git a/cmake/gen_dsts.py b/cmake/gen_dsts.py new file mode 100644 index 0000000..3d744d0 --- /dev/null +++ b/cmake/gen_dsts.py @@ -0,0 +1,86 @@ +#!/usr/bin/env python3 +""" +gen_dsts.py - Embed the TPC-DS binary distribution index (tpcds.idx) into a + C source file as a static byte array. + +Usage: gen_dsts.py + +Background +---------- +TPC-DS's dsdgen reads distribution data from a compiled binary file (tpcds.idx) +produced by the 'distcomp' tool. At runtime, dist.c opens this file via: + + fopen(get_str("DISTRIBUTIONS"), "rb") + +To avoid shipping tpcds.idx as a separate runtime file, we embed its bytes here +as a C uint8_t array. DSDGenWrapper writes the embedded bytes to a tmpfile on +first use and points the DISTRIBUTIONS param at that tmpfile. + +This mirrors the approach used by cmake/gen_dists.py for TPC-H's dists.dss. 
+""" + +import sys +import os + + +def embed_binary(input_path: str, output_path: str) -> None: + with open(input_path, "rb") as f: + data = f.read() + + size = len(data) + filename = os.path.basename(input_path) + + lines = [] + lines.append( + "/* Auto-generated from {} by cmake/gen_dsts.py -- do not edit */".format(filename) + ) + lines.append("") + lines.append("#include ") + lines.append("#include ") + lines.append("") + lines.append("/* Embedded binary content of {} ({} bytes) */".format(filename, size)) + lines.append("const uint8_t tpcds_idx_data[] = {") + + # 16 bytes per row for readability + for i in range(0, size, 16): + chunk = data[i : i + 16] + hex_vals = ", ".join("0x{:02x}".format(b) for b in chunk) + comma = "," if i + 16 < size else "" + lines.append(" {}{}".format(hex_vals, comma)) + + lines.append("};") + lines.append("") + lines.append( + "const size_t tpcds_idx_size = {};".format(size) + ) + lines.append("") + + out_dir = os.path.dirname(output_path) + if out_dir: + os.makedirs(out_dir, exist_ok=True) + + with open(output_path, "w") as f: + f.write("\n".join(lines) + "\n") + + print( + "Embedded {} ({} bytes) -> {}".format(filename, size, os.path.basename(output_path)) + ) + + +def main() -> None: + if len(sys.argv) != 3: + print("Usage: gen_dsts.py ", file=sys.stderr) + sys.exit(1) + + input_path = sys.argv[1] + output_path = sys.argv[2] + + if not os.path.exists(input_path): + print("Error: input file not found: {}".format(input_path), file=sys.stderr) + sys.exit(1) + + embed_binary(input_path, output_path) + + +if __name__ == "__main__": + main() diff --git a/include/tpch/dsdgen_col_idx.hpp b/include/tpch/dsdgen_col_idx.hpp new file mode 100644 index 0000000..9acb287 --- /dev/null +++ b/include/tpch/dsdgen_col_idx.hpp @@ -0,0 +1,534 @@ +// AUTO-GENERATED by scripts/gen_col_indices.py — DO NOT EDIT MANUALLY +// Regenerate: python3 scripts/gen_col_indices.py +// Source of truth: src/dsdgen/dsdgen_wrapper.cpp (get_schema() switch) +// +// 
Provides zero-overhead named column indices for BuilderMap (vector) access. +// Usage: builders[col::store_returns::sr_returned_date_sk] +#pragma once +#include + +namespace tpcds { +namespace col { + +// call_center (31 columns) +namespace call_center { + constexpr std::size_t cc_call_center_sk = 0; + constexpr std::size_t cc_call_center_id = 1; + constexpr std::size_t cc_rec_start_date_sk = 2; + constexpr std::size_t cc_rec_end_date_sk = 3; + constexpr std::size_t cc_closed_date_sk = 4; + constexpr std::size_t cc_open_date_sk = 5; + constexpr std::size_t cc_name = 6; + constexpr std::size_t cc_class = 7; + constexpr std::size_t cc_employees = 8; + constexpr std::size_t cc_sq_ft = 9; + constexpr std::size_t cc_hours = 10; + constexpr std::size_t cc_manager = 11; + constexpr std::size_t cc_mkt_id = 12; + constexpr std::size_t cc_mkt_class = 13; + constexpr std::size_t cc_mkt_desc = 14; + constexpr std::size_t cc_market_manager = 15; + constexpr std::size_t cc_division = 16; + constexpr std::size_t cc_division_name = 17; + constexpr std::size_t cc_company = 18; + constexpr std::size_t cc_company_name = 19; + constexpr std::size_t cc_street_number = 20; + constexpr std::size_t cc_street_name = 21; + constexpr std::size_t cc_street_type = 22; + constexpr std::size_t cc_suite_number = 23; + constexpr std::size_t cc_city = 24; + constexpr std::size_t cc_county = 25; + constexpr std::size_t cc_state = 26; + constexpr std::size_t cc_zip = 27; + constexpr std::size_t cc_country = 28; + constexpr std::size_t cc_gmt_offset = 29; + constexpr std::size_t cc_tax_percentage = 30; +} + +// catalog_page (9 columns) +namespace catalog_page { + constexpr std::size_t cp_catalog_page_sk = 0; + constexpr std::size_t cp_catalog_page_id = 1; + constexpr std::size_t cp_start_date_sk = 2; + constexpr std::size_t cp_end_date_sk = 3; + constexpr std::size_t cp_department = 4; + constexpr std::size_t cp_catalog_number = 5; + constexpr std::size_t cp_catalog_page_number = 6; + constexpr 
std::size_t cp_description = 7; + constexpr std::size_t cp_type = 8; +} + +// catalog_returns (27 columns) +namespace catalog_returns { + constexpr std::size_t cr_returned_date_sk = 0; + constexpr std::size_t cr_returned_time_sk = 1; + constexpr std::size_t cr_item_sk = 2; + constexpr std::size_t cr_refunded_customer_sk = 3; + constexpr std::size_t cr_refunded_cdemo_sk = 4; + constexpr std::size_t cr_refunded_hdemo_sk = 5; + constexpr std::size_t cr_refunded_addr_sk = 6; + constexpr std::size_t cr_returning_customer_sk = 7; + constexpr std::size_t cr_returning_cdemo_sk = 8; + constexpr std::size_t cr_returning_hdemo_sk = 9; + constexpr std::size_t cr_returning_addr_sk = 10; + constexpr std::size_t cr_call_center_sk = 11; + constexpr std::size_t cr_catalog_page_sk = 12; + constexpr std::size_t cr_ship_mode_sk = 13; + constexpr std::size_t cr_warehouse_sk = 14; + constexpr std::size_t cr_reason_sk = 15; + constexpr std::size_t cr_order_number = 16; + constexpr std::size_t cr_quantity = 17; + constexpr std::size_t cr_net_paid = 18; + constexpr std::size_t cr_ext_tax = 19; + constexpr std::size_t cr_net_paid_inc_tax = 20; + constexpr std::size_t cr_fee = 21; + constexpr std::size_t cr_ext_ship_cost = 22; + constexpr std::size_t cr_refunded_cash = 23; + constexpr std::size_t cr_reversed_charge = 24; + constexpr std::size_t cr_store_credit = 25; + constexpr std::size_t cr_net_loss = 26; +} + +// catalog_sales (34 columns) +namespace catalog_sales { + constexpr std::size_t cs_sold_date_sk = 0; + constexpr std::size_t cs_sold_time_sk = 1; + constexpr std::size_t cs_ship_date_sk = 2; + constexpr std::size_t cs_bill_customer_sk = 3; + constexpr std::size_t cs_bill_cdemo_sk = 4; + constexpr std::size_t cs_bill_hdemo_sk = 5; + constexpr std::size_t cs_bill_addr_sk = 6; + constexpr std::size_t cs_ship_customer_sk = 7; + constexpr std::size_t cs_ship_cdemo_sk = 8; + constexpr std::size_t cs_ship_hdemo_sk = 9; + constexpr std::size_t cs_ship_addr_sk = 10; + constexpr std::size_t 
cs_call_center_sk = 11; + constexpr std::size_t cs_catalog_page_sk = 12; + constexpr std::size_t cs_ship_mode_sk = 13; + constexpr std::size_t cs_warehouse_sk = 14; + constexpr std::size_t cs_item_sk = 15; + constexpr std::size_t cs_promo_sk = 16; + constexpr std::size_t cs_order_number = 17; + constexpr std::size_t cs_quantity = 18; + constexpr std::size_t cs_wholesale_cost = 19; + constexpr std::size_t cs_list_price = 20; + constexpr std::size_t cs_sales_price = 21; + constexpr std::size_t cs_ext_discount_amt = 22; + constexpr std::size_t cs_ext_sales_price = 23; + constexpr std::size_t cs_ext_wholesale_cost = 24; + constexpr std::size_t cs_ext_list_price = 25; + constexpr std::size_t cs_ext_tax = 26; + constexpr std::size_t cs_coupon_amt = 27; + constexpr std::size_t cs_ext_ship_cost = 28; + constexpr std::size_t cs_net_paid = 29; + constexpr std::size_t cs_net_paid_inc_tax = 30; + constexpr std::size_t cs_net_paid_inc_ship = 31; + constexpr std::size_t cs_net_paid_inc_ship_tax = 32; + constexpr std::size_t cs_net_profit = 33; +} + +// customer (18 columns) +namespace customer { + constexpr std::size_t c_customer_sk = 0; + constexpr std::size_t c_customer_id = 1; + constexpr std::size_t c_current_cdemo_sk = 2; + constexpr std::size_t c_current_hdemo_sk = 3; + constexpr std::size_t c_current_addr_sk = 4; + constexpr std::size_t c_first_shipto_date_id = 5; + constexpr std::size_t c_first_sales_date_id = 6; + constexpr std::size_t c_salutation = 7; + constexpr std::size_t c_first_name = 8; + constexpr std::size_t c_last_name = 9; + constexpr std::size_t c_preferred_cust_flag = 10; + constexpr std::size_t c_birth_day = 11; + constexpr std::size_t c_birth_month = 12; + constexpr std::size_t c_birth_year = 13; + constexpr std::size_t c_birth_country = 14; + constexpr std::size_t c_login = 15; + constexpr std::size_t c_email_address = 16; + constexpr std::size_t c_last_review_date = 17; +} + +// customer_address (13 columns) +namespace customer_address { + constexpr 
std::size_t ca_address_sk = 0; + constexpr std::size_t ca_address_id = 1; + constexpr std::size_t ca_street_number = 2; + constexpr std::size_t ca_street_name = 3; + constexpr std::size_t ca_street_type = 4; + constexpr std::size_t ca_suite_number = 5; + constexpr std::size_t ca_city = 6; + constexpr std::size_t ca_county = 7; + constexpr std::size_t ca_state = 8; + constexpr std::size_t ca_zip = 9; + constexpr std::size_t ca_country = 10; + constexpr std::size_t ca_gmt_offset = 11; + constexpr std::size_t ca_location_type = 12; +} + +// customer_demographics (9 columns) +namespace customer_demographics { + constexpr std::size_t cd_demo_sk = 0; + constexpr std::size_t cd_gender = 1; + constexpr std::size_t cd_marital_status = 2; + constexpr std::size_t cd_education_status = 3; + constexpr std::size_t cd_purchase_estimate = 4; + constexpr std::size_t cd_credit_rating = 5; + constexpr std::size_t cd_dep_count = 6; + constexpr std::size_t cd_dep_employed_count = 7; + constexpr std::size_t cd_dep_college_count = 8; +} + +// date_dim (26 columns) +namespace date_dim { + constexpr std::size_t d_date_sk = 0; + constexpr std::size_t d_date_id = 1; + constexpr std::size_t d_month_seq = 2; + constexpr std::size_t d_week_seq = 3; + constexpr std::size_t d_quarter_seq = 4; + constexpr std::size_t d_year = 5; + constexpr std::size_t d_dow = 6; + constexpr std::size_t d_moy = 7; + constexpr std::size_t d_dom = 8; + constexpr std::size_t d_qoy = 9; + constexpr std::size_t d_fy_year = 10; + constexpr std::size_t d_fy_quarter_seq = 11; + constexpr std::size_t d_fy_week_seq = 12; + constexpr std::size_t d_day_name = 13; + constexpr std::size_t d_holiday = 14; + constexpr std::size_t d_weekend = 15; + constexpr std::size_t d_following_holiday = 16; + constexpr std::size_t d_first_dom = 17; + constexpr std::size_t d_last_dom = 18; + constexpr std::size_t d_same_day_ly = 19; + constexpr std::size_t d_same_day_lq = 20; + constexpr std::size_t d_current_day = 21; + constexpr std::size_t 
d_current_week = 22; + constexpr std::size_t d_current_month = 23; + constexpr std::size_t d_current_quarter = 24; + constexpr std::size_t d_current_year = 25; +} + +// household_demographics (5 columns) +namespace household_demographics { + constexpr std::size_t hd_demo_sk = 0; + constexpr std::size_t hd_income_band_sk = 1; + constexpr std::size_t hd_buy_potential = 2; + constexpr std::size_t hd_dep_count = 3; + constexpr std::size_t hd_vehicle_count = 4; +} + +// income_band (3 columns) +namespace income_band { + constexpr std::size_t ib_income_band_id = 0; + constexpr std::size_t ib_lower_bound = 1; + constexpr std::size_t ib_upper_bound = 2; +} + +// inventory (4 columns) +namespace inventory { + constexpr std::size_t inv_date_sk = 0; + constexpr std::size_t inv_item_sk = 1; + constexpr std::size_t inv_warehouse_sk = 2; + constexpr std::size_t inv_quantity_on_hand = 3; +} + +// item (23 columns) +namespace item { + constexpr std::size_t i_item_sk = 0; + constexpr std::size_t i_item_id = 1; + constexpr std::size_t i_rec_start_date_id = 2; + constexpr std::size_t i_rec_end_date_id = 3; + constexpr std::size_t i_item_desc = 4; + constexpr std::size_t i_current_price = 5; + constexpr std::size_t i_wholesale_cost = 6; + constexpr std::size_t i_brand_id = 7; + constexpr std::size_t i_brand = 8; + constexpr std::size_t i_class_id = 9; + constexpr std::size_t i_class = 10; + constexpr std::size_t i_category_id = 11; + constexpr std::size_t i_category = 12; + constexpr std::size_t i_manufact_id = 13; + constexpr std::size_t i_manufact = 14; + constexpr std::size_t i_size = 15; + constexpr std::size_t i_formulation = 16; + constexpr std::size_t i_color = 17; + constexpr std::size_t i_units = 18; + constexpr std::size_t i_container = 19; + constexpr std::size_t i_manager_id = 20; + constexpr std::size_t i_product_name = 21; + constexpr std::size_t i_promo_sk = 22; +} + +// promotion (19 columns) +namespace promotion { + constexpr std::size_t p_promo_sk = 0; + constexpr 
std::size_t p_promo_id = 1; + constexpr std::size_t p_start_date_sk = 2; + constexpr std::size_t p_end_date_sk = 3; + constexpr std::size_t p_item_sk = 4; + constexpr std::size_t p_cost = 5; + constexpr std::size_t p_response_target = 6; + constexpr std::size_t p_promo_name = 7; + constexpr std::size_t p_channel_dmail = 8; + constexpr std::size_t p_channel_email = 9; + constexpr std::size_t p_channel_catalog = 10; + constexpr std::size_t p_channel_tv = 11; + constexpr std::size_t p_channel_radio = 12; + constexpr std::size_t p_channel_press = 13; + constexpr std::size_t p_channel_event = 14; + constexpr std::size_t p_channel_demo = 15; + constexpr std::size_t p_channel_details = 16; + constexpr std::size_t p_purpose = 17; + constexpr std::size_t p_discount_active = 18; +} + +// reason (3 columns) +namespace reason { + constexpr std::size_t r_reason_sk = 0; + constexpr std::size_t r_reason_id = 1; + constexpr std::size_t r_reason_desc = 2; +} + +// ship_mode (6 columns) +namespace ship_mode { + constexpr std::size_t sm_ship_mode_sk = 0; + constexpr std::size_t sm_ship_mode_id = 1; + constexpr std::size_t sm_type = 2; + constexpr std::size_t sm_code = 3; + constexpr std::size_t sm_carrier = 4; + constexpr std::size_t sm_contract = 5; +} + +// store (29 columns) +namespace store { + constexpr std::size_t s_store_sk = 0; + constexpr std::size_t s_store_id = 1; + constexpr std::size_t s_rec_start_date = 2; + constexpr std::size_t s_rec_end_date = 3; + constexpr std::size_t s_closed_date_sk = 4; + constexpr std::size_t s_store_name = 5; + constexpr std::size_t s_number_employees = 6; + constexpr std::size_t s_floor_space = 7; + constexpr std::size_t s_hours = 8; + constexpr std::size_t s_manager = 9; + constexpr std::size_t s_market_id = 10; + constexpr std::size_t s_geography_class = 11; + constexpr std::size_t s_market_desc = 12; + constexpr std::size_t s_market_manager = 13; + constexpr std::size_t s_division_id = 14; + constexpr std::size_t s_division_name = 15; + 
constexpr std::size_t s_company_id = 16; + constexpr std::size_t s_company_name = 17; + constexpr std::size_t s_street_number = 18; + constexpr std::size_t s_street_name = 19; + constexpr std::size_t s_street_type = 20; + constexpr std::size_t s_suite_number = 21; + constexpr std::size_t s_city = 22; + constexpr std::size_t s_county = 23; + constexpr std::size_t s_state = 24; + constexpr std::size_t s_zip = 25; + constexpr std::size_t s_country = 26; + constexpr std::size_t s_gmt_offset = 27; + constexpr std::size_t s_tax_percentage = 28; +} + +// store_returns (20 columns) +namespace store_returns { + constexpr std::size_t sr_returned_date_sk = 0; + constexpr std::size_t sr_returned_time_sk = 1; + constexpr std::size_t sr_item_sk = 2; + constexpr std::size_t sr_customer_sk = 3; + constexpr std::size_t sr_cdemo_sk = 4; + constexpr std::size_t sr_hdemo_sk = 5; + constexpr std::size_t sr_addr_sk = 6; + constexpr std::size_t sr_store_sk = 7; + constexpr std::size_t sr_reason_sk = 8; + constexpr std::size_t sr_ticket_number = 9; + constexpr std::size_t sr_quantity = 10; + constexpr std::size_t sr_net_paid = 11; + constexpr std::size_t sr_ext_tax = 12; + constexpr std::size_t sr_net_paid_inc_tax = 13; + constexpr std::size_t sr_fee = 14; + constexpr std::size_t sr_ext_ship_cost = 15; + constexpr std::size_t sr_refunded_cash = 16; + constexpr std::size_t sr_reversed_charge = 17; + constexpr std::size_t sr_store_credit = 18; + constexpr std::size_t sr_net_loss = 19; +} + +// store_sales (23 columns) +namespace store_sales { + constexpr std::size_t ss_sold_date_sk = 0; + constexpr std::size_t ss_sold_time_sk = 1; + constexpr std::size_t ss_item_sk = 2; + constexpr std::size_t ss_customer_sk = 3; + constexpr std::size_t ss_cdemo_sk = 4; + constexpr std::size_t ss_hdemo_sk = 5; + constexpr std::size_t ss_addr_sk = 6; + constexpr std::size_t ss_store_sk = 7; + constexpr std::size_t ss_promo_sk = 8; + constexpr std::size_t ss_ticket_number = 9; + constexpr std::size_t 
ss_quantity = 10; + constexpr std::size_t ss_wholesale_cost = 11; + constexpr std::size_t ss_list_price = 12; + constexpr std::size_t ss_sales_price = 13; + constexpr std::size_t ss_ext_discount_amt = 14; + constexpr std::size_t ss_ext_sales_price = 15; + constexpr std::size_t ss_ext_wholesale_cost = 16; + constexpr std::size_t ss_ext_list_price = 17; + constexpr std::size_t ss_ext_tax = 18; + constexpr std::size_t ss_coupon_amt = 19; + constexpr std::size_t ss_net_paid = 20; + constexpr std::size_t ss_net_paid_inc_tax = 21; + constexpr std::size_t ss_net_profit = 22; +} + +// time_dim (10 columns) +namespace time_dim { + constexpr std::size_t t_time_sk = 0; + constexpr std::size_t t_time_id = 1; + constexpr std::size_t t_time = 2; + constexpr std::size_t t_hour = 3; + constexpr std::size_t t_minute = 4; + constexpr std::size_t t_second = 5; + constexpr std::size_t t_am_pm = 6; + constexpr std::size_t t_shift = 7; + constexpr std::size_t t_sub_shift = 8; + constexpr std::size_t t_meal_time = 9; +} + +// warehouse (14 columns) +namespace warehouse { + constexpr std::size_t w_warehouse_sk = 0; + constexpr std::size_t w_warehouse_id = 1; + constexpr std::size_t w_warehouse_name = 2; + constexpr std::size_t w_warehouse_sq_ft = 3; + constexpr std::size_t w_street_number = 4; + constexpr std::size_t w_street_name = 5; + constexpr std::size_t w_street_type = 6; + constexpr std::size_t w_suite_number = 7; + constexpr std::size_t w_city = 8; + constexpr std::size_t w_county = 9; + constexpr std::size_t w_state = 10; + constexpr std::size_t w_zip = 11; + constexpr std::size_t w_country = 12; + constexpr std::size_t w_gmt_offset = 13; +} + +// web_page (14 columns) +namespace web_page { + constexpr std::size_t wp_web_page_sk = 0; + constexpr std::size_t wp_web_page_id = 1; + constexpr std::size_t wp_rec_start_date_sk = 2; + constexpr std::size_t wp_rec_end_date_sk = 3; + constexpr std::size_t wp_creation_date_sk = 4; + constexpr std::size_t wp_access_date_sk = 5; + constexpr 
std::size_t wp_autogen_flag = 6; + constexpr std::size_t wp_customer_sk = 7; + constexpr std::size_t wp_url = 8; + constexpr std::size_t wp_type = 9; + constexpr std::size_t wp_char_count = 10; + constexpr std::size_t wp_link_count = 11; + constexpr std::size_t wp_image_count = 12; + constexpr std::size_t wp_max_ad_count = 13; +} + +// web_returns (24 columns) +namespace web_returns { + constexpr std::size_t wr_returned_date_sk = 0; + constexpr std::size_t wr_returned_time_sk = 1; + constexpr std::size_t wr_item_sk = 2; + constexpr std::size_t wr_refunded_customer_sk = 3; + constexpr std::size_t wr_refunded_cdemo_sk = 4; + constexpr std::size_t wr_refunded_hdemo_sk = 5; + constexpr std::size_t wr_refunded_addr_sk = 6; + constexpr std::size_t wr_returning_customer_sk = 7; + constexpr std::size_t wr_returning_cdemo_sk = 8; + constexpr std::size_t wr_returning_hdemo_sk = 9; + constexpr std::size_t wr_returning_addr_sk = 10; + constexpr std::size_t wr_web_page_sk = 11; + constexpr std::size_t wr_reason_sk = 12; + constexpr std::size_t wr_order_number = 13; + constexpr std::size_t wr_quantity = 14; + constexpr std::size_t wr_net_paid = 15; + constexpr std::size_t wr_ext_tax = 16; + constexpr std::size_t wr_net_paid_inc_tax = 17; + constexpr std::size_t wr_fee = 18; + constexpr std::size_t wr_ext_ship_cost = 19; + constexpr std::size_t wr_refunded_cash = 20; + constexpr std::size_t wr_reversed_charge = 21; + constexpr std::size_t wr_store_credit = 22; + constexpr std::size_t wr_net_loss = 23; +} + +// web_sales (34 columns) +namespace web_sales { + constexpr std::size_t ws_sold_date_sk = 0; + constexpr std::size_t ws_sold_time_sk = 1; + constexpr std::size_t ws_ship_date_sk = 2; + constexpr std::size_t ws_item_sk = 3; + constexpr std::size_t ws_bill_customer_sk = 4; + constexpr std::size_t ws_bill_cdemo_sk = 5; + constexpr std::size_t ws_bill_hdemo_sk = 6; + constexpr std::size_t ws_bill_addr_sk = 7; + constexpr std::size_t ws_ship_customer_sk = 8; + constexpr 
std::size_t ws_ship_cdemo_sk = 9; + constexpr std::size_t ws_ship_hdemo_sk = 10; + constexpr std::size_t ws_ship_addr_sk = 11; + constexpr std::size_t ws_web_page_sk = 12; + constexpr std::size_t ws_web_site_sk = 13; + constexpr std::size_t ws_ship_mode_sk = 14; + constexpr std::size_t ws_warehouse_sk = 15; + constexpr std::size_t ws_promo_sk = 16; + constexpr std::size_t ws_order_number = 17; + constexpr std::size_t ws_quantity = 18; + constexpr std::size_t ws_wholesale_cost = 19; + constexpr std::size_t ws_list_price = 20; + constexpr std::size_t ws_sales_price = 21; + constexpr std::size_t ws_ext_discount_amt = 22; + constexpr std::size_t ws_ext_sales_price = 23; + constexpr std::size_t ws_ext_wholesale_cost = 24; + constexpr std::size_t ws_ext_list_price = 25; + constexpr std::size_t ws_ext_tax = 26; + constexpr std::size_t ws_coupon_amt = 27; + constexpr std::size_t ws_ext_ship_cost = 28; + constexpr std::size_t ws_net_paid = 29; + constexpr std::size_t ws_net_paid_inc_tax = 30; + constexpr std::size_t ws_net_paid_inc_ship = 31; + constexpr std::size_t ws_net_paid_inc_ship_tax = 32; + constexpr std::size_t ws_net_profit = 33; +} + +// web_site (26 columns) +namespace web_site { + constexpr std::size_t web_site_sk = 0; + constexpr std::size_t web_site_id = 1; + constexpr std::size_t web_rec_start_date_sk = 2; + constexpr std::size_t web_rec_end_date_sk = 3; + constexpr std::size_t web_name = 4; + constexpr std::size_t web_open_date_sk = 5; + constexpr std::size_t web_close_date_sk = 6; + constexpr std::size_t web_class = 7; + constexpr std::size_t web_manager = 8; + constexpr std::size_t web_mkt_id = 9; + constexpr std::size_t web_mkt_class = 10; + constexpr std::size_t web_mkt_desc = 11; + constexpr std::size_t web_market_manager = 12; + constexpr std::size_t web_company_id = 13; + constexpr std::size_t web_company_name = 14; + constexpr std::size_t web_street_number = 15; + constexpr std::size_t web_street_name = 16; + constexpr std::size_t web_street_type = 
17; + constexpr std::size_t web_suite_number = 18; + constexpr std::size_t web_city = 19; + constexpr std::size_t web_county = 20; + constexpr std::size_t web_state = 21; + constexpr std::size_t web_zip = 22; + constexpr std::size_t web_country = 23; + constexpr std::size_t web_gmt_offset = 24; + constexpr std::size_t web_tax_percentage = 25; +} + +} // namespace col +} // namespace tpcds diff --git a/include/tpch/dsdgen_converter.hpp b/include/tpch/dsdgen_converter.hpp new file mode 100644 index 0000000..b65080a --- /dev/null +++ b/include/tpch/dsdgen_converter.hpp @@ -0,0 +1,202 @@ +#pragma once + +#include +#include +#include +#include + +namespace tpcds { + +using BuilderMap = std::vector>; + +/** + * Convert dsdgen C struct rows to Arrow array builders. + * + * Each function casts void* to the appropriate dsdgen struct, extracts + * fields, and appends to the matching Arrow builders. + */ + +/** + * Append a store_sales row (W_STORE_SALES_TBL*) to Arrow builders. + * Schema matches DSDGenWrapper::get_schema(TableType::StoreSales). + */ +void append_store_sales_to_builders( + const void* row, + BuilderMap& builders); + +/** + * Append an inventory row (W_INVENTORY_TBL*) to Arrow builders. + * Schema matches DSDGenWrapper::get_schema(TableType::Inventory). + */ +void append_inventory_to_builders( + const void* row, + BuilderMap& builders); + +/** + * Append a catalog_sales row (W_CATALOG_SALES_TBL*) to Arrow builders. + */ +void append_catalog_sales_to_builders( + const void* row, + BuilderMap& builders); + +/** + * Append a web_sales row (W_WEB_SALES_TBL*) to Arrow builders. + */ +void append_web_sales_to_builders( + const void* row, + BuilderMap& builders); + +/** + * Append a customer row (W_CUSTOMER_TBL*) to Arrow builders. + */ +void append_customer_to_builders( + const void* row, + BuilderMap& builders); + +/** + * Append an item row (W_ITEM_TBL*) to Arrow builders. 
+ */ +void append_item_to_builders( + const void* row, + BuilderMap& builders); + +/** + * Append a date_dim row (W_DATE_TBL*) to Arrow builders. + */ +void append_date_dim_to_builders( + const void* row, + BuilderMap& builders); + +/** + * Append a store_returns row (W_STORE_RETURNS_TBL*) to Arrow builders. + */ +void append_store_returns_to_builders( + const void* row, + BuilderMap& builders); + +/** + * Append a catalog_returns row (W_CATALOG_RETURNS_TBL*) to Arrow builders. + */ +void append_catalog_returns_to_builders( + const void* row, + BuilderMap& builders); + +/** + * Append a web_returns row (W_WEB_RETURNS_TBL*) to Arrow builders. + */ +void append_web_returns_to_builders( + const void* row, + BuilderMap& builders); + +/** + * Append a call_center row (CALL_CENTER_TBL*) to Arrow builders. + */ +void append_call_center_to_builders( + const void* row, + BuilderMap& builders); + +/** + * Append a catalog_page row (CATALOG_PAGE_TBL*) to Arrow builders. + */ +void append_catalog_page_to_builders( + const void* row, + BuilderMap& builders); + +/** + * Append a web_page row (W_WEB_PAGE_TBL*) to Arrow builders. + */ +void append_web_page_to_builders( + const void* row, + BuilderMap& builders); + +/** + * Append a web_site row (W_WEB_SITE_TBL*) to Arrow builders. + */ +void append_web_site_to_builders( + const void* row, + BuilderMap& builders); + +/** + * Append a warehouse row (W_WAREHOUSE_TBL*) to Arrow builders. + */ +void append_warehouse_to_builders( + const void* row, + BuilderMap& builders); + +/** + * Append a ship_mode row (W_SHIP_MODE_TBL*) to Arrow builders. + */ +void append_ship_mode_to_builders( + const void* row, + BuilderMap& builders); + +/** + * Append a household_demographics row (W_HOUSEHOLD_DEMOGRAPHICS_TBL*) to Arrow builders. + */ +void append_household_demographics_to_builders( + const void* row, + BuilderMap& builders); + +/** + * Append a customer_demographics row (W_CUSTOMER_DEMOGRAPHICS_TBL*) to Arrow builders. 
+ */ +void append_customer_demographics_to_builders( + const void* row, + BuilderMap& builders); + +/** + * Append a customer_address row (W_CUSTOMER_ADDRESS_TBL*) to Arrow builders. + */ +void append_customer_address_to_builders( + const void* row, + BuilderMap& builders); + +/** + * Append an income_band row (W_INCOME_BAND_TBL*) to Arrow builders. + */ +void append_income_band_to_builders( + const void* row, + BuilderMap& builders); + +/** + * Append a reason row (W_REASON_TBL*) to Arrow builders. + */ +void append_reason_to_builders( + const void* row, + BuilderMap& builders); + +/** + * Append a time_dim row (W_TIME_TBL*) to Arrow builders. + */ +void append_time_dim_to_builders( + const void* row, + BuilderMap& builders); + +/** + * Append a promotion row (W_PROMOTION_TBL*) to Arrow builders. + */ +void append_promotion_to_builders( + const void* row, + BuilderMap& builders); + +/** + * Append a store row (W_STORE_TBL*) to Arrow builders. + */ +void append_store_to_builders( + const void* row, + BuilderMap& builders); + +/** + * Generic dispatcher by table name. + */ +void append_dsdgen_row_to_builders( + const std::string& table_name, + const void* row, + BuilderMap& builders); + +/** + * Returns static dictionary Arrow array for dict8-encoded columns, or nullptr. + */ +std::shared_ptr get_dict_for_field(const std::string& field_name); + +} // namespace tpcds diff --git a/include/tpch/dsdgen_wrapper.hpp b/include/tpch/dsdgen_wrapper.hpp new file mode 100644 index 0000000..c04086b --- /dev/null +++ b/include/tpch/dsdgen_wrapper.hpp @@ -0,0 +1,242 @@ +#pragma once + +#include +#include +#include +#include + +#include + +namespace tpcds { + +/** + * TPC-DS table identifiers for the 24 standard W_ (warehouse) tables. + * + * The integer values are pinned to the generated `tables.h` constants and + * verified via static_asserts in the implementation TU. 
+ */ +enum class TableType { + CallCenter = 0, + CatalogPage = 1, + CatalogReturns = 2, + CatalogSales = 3, + Customer = 4, + CustomerAddress = 5, + CustomerDemographics = 6, + DateDim = 7, + HouseholdDemographics = 8, + IncomeBand = 9, + Inventory = 10, + Item = 11, + Promotion = 12, + Reason = 13, + ShipMode = 14, + Store = 15, + StoreReturns = 16, + StoreSales = 17, + TimeDim = 18, + Warehouse = 19, + WebPage = 20, + WebReturns = 21, + WebSales = 22, + WebSite = 23, + Count_ +}; + +/** + * C++ wrapper around the TPC-DS dsdgen reference implementation. + * + * Initializes dsdgen global state (embedded distribution data, scale factor, + * RNG seeds) and provides per-table generation methods with callback API. + * + * THREAD-SAFETY: NOT thread-safe. dsdgen uses global mutable state. + * Use one DSDGenWrapper per process, generate tables sequentially. + */ +class DSDGenWrapper { +public: + /** + * Construct wrapper for the given scale factor. + * @param scale_factor TPC-DS scale factor (1 = ~1GB baseline) + * @param verbose Print verbose diagnostic messages + */ + explicit DSDGenWrapper(long scale_factor, bool verbose = false); + ~DSDGenWrapper(); + + DSDGenWrapper(const DSDGenWrapper&) = delete; + DSDGenWrapper& operator=(const DSDGenWrapper&) = delete; + + /** + * Generate store_sales rows. + * Calls callback once per row with a const W_STORE_SALES_TBL*. + * @param callback Invoked for each generated row. + * @param max_rows Limit; -1 or 0 means generate all rows. + */ + void generate_store_sales( + std::function callback, + long max_rows = -1); + + /** + * Generate inventory rows. + * Calls callback once per row with a const W_INVENTORY_TBL*. + */ + void generate_inventory( + std::function callback, + long max_rows = -1); + + /** + * Generate catalog_sales rows (master-detail via callback). + * Calls callback once per line item with a const W_CATALOG_SALES_TBL*. 
+ */ + void generate_catalog_sales( + std::function callback, + long max_rows = -1); + + /** + * Generate web_sales rows (master-detail via callback). + * Calls callback once per line item with a const W_WEB_SALES_TBL*. + */ + void generate_web_sales( + std::function callback, + long max_rows = -1); + + /** + * Generate customer rows. + * Calls callback once per row with a const W_CUSTOMER_TBL*. + */ + void generate_customer( + std::function callback, + long max_rows = -1); + + /** + * Generate item rows. + * Calls callback once per row with a const W_ITEM_TBL*. + */ + void generate_item( + std::function callback, + long max_rows = -1); + + /** + * Generate date_dim rows. + * Calls callback once per row with a const W_DATE_TBL*. + */ + void generate_date_dim( + std::function callback, + long max_rows = -1); + + /** + * Generate store_returns rows. + * Calls callback once per row with a const W_STORE_RETURNS_TBL*. + */ + void generate_store_returns( + std::function callback, + long max_rows = -1); + + /** + * Generate catalog_returns rows. + * Calls callback once per row with a const W_CATALOG_RETURNS_TBL*. + */ + void generate_catalog_returns( + std::function callback, + long max_rows = -1); + + /** + * Generate web_returns rows. + * Calls callback once per row with a const W_WEB_RETURNS_TBL*. 
+ */ + void generate_web_returns( + std::function callback, + long max_rows = -1); + + // ----------------------------------------------------------------------- + // Phase 5 dimension table generators + // ----------------------------------------------------------------------- + + void generate_call_center( + std::function callback, + long max_rows = -1); + + void generate_catalog_page( + std::function callback, + long max_rows = -1); + + void generate_web_page( + std::function callback, + long max_rows = -1); + + void generate_web_site( + std::function callback, + long max_rows = -1); + + void generate_warehouse( + std::function callback, + long max_rows = -1); + + void generate_ship_mode( + std::function callback, + long max_rows = -1); + + void generate_household_demographics( + std::function callback, + long max_rows = -1); + + void generate_customer_demographics( + std::function callback, + long max_rows = -1); + + void generate_customer_address( + std::function callback, + long max_rows = -1); + + void generate_income_band( + std::function callback, + long max_rows = -1); + + void generate_reason( + std::function callback, + long max_rows = -1); + + void generate_time_dim( + std::function callback, + long max_rows = -1); + + void generate_promotion( + std::function callback, + long max_rows = -1); + + void generate_store( + std::function callback, + long max_rows = -1); + + long scale_factor() const { return scale_factor_; } + + /** + * Return the Arrow schema for a table type. + */ + static std::shared_ptr get_schema(TableType table, double scale_factor = 1.0); + + /** + * Return expected row count for a table at the given scale factor. + * Uses dsdgen's get_rowcount() after initialization. + */ + long get_row_count(TableType table) const; + + /** + * Return the dsdgen integer table ID for a TableType. + */ + static int table_id(TableType table); + + /** + * Return the canonical lower-case table name string. 
+ */ + static std::string table_name(TableType table); + +private: + long scale_factor_; + bool verbose_; + bool initialized_; + std::string tmp_dist_path_; // path to temporary tpcds.idx file + + void init_dsdgen(); +}; + +} // namespace tpcds diff --git a/include/tpch/lance_ffi.h b/include/tpch/lance_ffi.h index df21af1..5e535a0 100644 --- a/include/tpch/lance_ffi.h +++ b/include/tpch/lance_ffi.h @@ -76,6 +76,60 @@ int lance_writer_set_write_params( long long max_bytes_per_file, int skip_auto_cleanup); +/** + * Configure Tokio runtime settings for Lance streaming mode. + * Must be called before lance_writer_start_stream(). + * + * @param writer Pointer to LanceWriter from lance_writer_create() + * @param max_blocking_threads 0 = keep current, >0 = cap Tokio blocking pool size + * @return 0 on success, non-zero on failure + */ +int lance_writer_set_runtime_config( + LanceWriter* writer, + int max_blocking_threads); + +/** + * Configure runtime memory profiling for Lance streaming mode. + * Must be called before lance_writer_start_stream(). + * + * @param writer Pointer to LanceWriter from lance_writer_create() + * @param enable_mem_profile 1 to enable stage/batch RSS logging, 0 to disable + * @param report_every_batches Log every N batches when enabled (0 keeps default) + * @return 0 on success, non-zero on failure + */ +int lance_writer_set_profile_config( + LanceWriter* writer, + int enable_mem_profile, + int report_every_batches); + +/** + * Configure scatter/gather stream mode. + * Must be called before lance_writer_start_stream(). + * + * @param writer Pointer to LanceWriter from lance_writer_create() + * @param batches_per_chunk 1 = disabled, >1 enables chunked queue handoff + * @param queue_chunks Bounded queue capacity in chunks + * @return 0 on success, non-zero on failure + */ +int lance_writer_set_scatter_gather_config( + LanceWriter* writer, + int batches_per_chunk, + int queue_chunks); + +/** + * Configure buffered backend flush thresholds. 
+ * Must be called before writes begin. + * + * @param writer Pointer to LanceWriter from lance_writer_create() + * @param batch_threshold Flush when buffered batch count reaches this value (>0) + * @param row_threshold Flush when buffered row count reaches this value (>0) + * @return 0 on success, non-zero on failure + */ +int lance_writer_set_buffered_flush_config( + LanceWriter* writer, + int batch_threshold, + int row_threshold); + /** * Enable or disable the io_uring write path for this writer. * Must be called before writing the first batch. diff --git a/include/tpch/lance_writer.hpp b/include/tpch/lance_writer.hpp index 2a62bc5..6c52b34 100644 --- a/include/tpch/lance_writer.hpp +++ b/include/tpch/lance_writer.hpp @@ -100,6 +100,57 @@ class LanceWriter : public WriterInterface { */ void set_stream_queue_depth(size_t depth) { stream_queue_depth_ = depth; } + /** + * Configure Tokio runtime settings used by Rust streaming writer. + * + * @param max_blocking_threads Cap for Tokio blocking thread pool (0 keeps default) + */ + void set_runtime_config(int max_blocking_threads) { + if (max_blocking_threads > 0) { + stream_max_blocking_threads_ = max_blocking_threads; + } + } + + /** + * Enable/disable Rust-side memory profiling logs for streaming mode. + * + * @param enabled Emit stage and per-batch RSS logs from Lance FFI + * @param report_every_batches Emit per-batch log every N batches + */ + void set_profile_config(bool enabled, size_t report_every_batches) { + stream_mem_profile_enabled_ = enabled; + if (report_every_batches > 0) { + stream_mem_profile_every_batches_ = report_every_batches; + } + } + + /** + * Configure Rust-side scatter/gather chunked stream handoff. 
+ * + * @param batches_per_chunk 1 disables, >1 enables chunking + * @param queue_chunks Bounded queue size in chunks + */ + void set_scatter_gather_config(size_t batches_per_chunk, size_t queue_chunks) { + if (batches_per_chunk > 0) { + stream_scatter_gather_batches_ = batches_per_chunk; + } + if (queue_chunks > 0) { + stream_scatter_gather_queue_chunks_ = queue_chunks; + } + } + + /** + * Configure bounded buffering thresholds for synchronous Lance writes. + */ + void set_buffered_flush_config(size_t batch_threshold, size_t row_threshold) { + if (batch_threshold > 0) { + buffered_flush_batch_threshold_ = batch_threshold; + } + if (row_threshold > 0) { + buffered_flush_row_threshold_ = row_threshold; + } + } + /** * Enable io_uring write path (Linux only, requires io-uring feature compiled in). * Must be called before the first batch is written. @@ -136,7 +187,15 @@ class LanceWriter : public WriterInterface { bool use_io_uring_ = false; #endif - size_t stream_queue_depth_ = 16; + // Winning async default from SF=5 tuning: minimal queueing. + size_t stream_queue_depth_ = 1; + int stream_max_blocking_threads_ = 8; + bool stream_mem_profile_enabled_ = false; + size_t stream_mem_profile_every_batches_ = 100; + size_t stream_scatter_gather_batches_ = 1; + size_t stream_scatter_gather_queue_chunks_ = 1; + size_t buffered_flush_batch_threshold_ = 200; + size_t buffered_flush_row_threshold_ = 1'000'000; std::shared_ptr stream_state_; std::shared_ptr stream_reader_; diff --git a/include/tpch/parquet_writer.hpp b/include/tpch/parquet_writer.hpp index cda5a1e..2e81641 100644 --- a/include/tpch/parquet_writer.hpp +++ b/include/tpch/parquet_writer.hpp @@ -90,6 +90,13 @@ class ParquetWriter : public WriterInterface { */ void enable_streaming_write(bool use_threads = true); + /** + * Set compression codec for Parquet output. + * Must be called before the first write_batch(). 
+ * Supported: "snappy" (default), "zstd", "none" + */ + void set_compression(const std::string& codec); + private: std::string filepath_; std::shared_ptr first_batch_; @@ -108,6 +115,7 @@ class ParquetWriter : public WriterInterface { bool streaming_mode_ = false; bool use_threads_ = true; std::unique_ptr parquet_file_writer_; + std::string compression_codec_ = "snappy"; // snappy, zstd, none // Initialize the Parquet FileWriter for streaming mode void init_file_writer(); diff --git a/scripts/gen_col_indices.py b/scripts/gen_col_indices.py new file mode 100644 index 0000000..969fe20 --- /dev/null +++ b/scripts/gen_col_indices.py @@ -0,0 +1,153 @@ +#!/usr/bin/env python3 +""" +Generate include/tpch/dsdgen_col_idx.hpp — constexpr column indices per table. +Then rewrite dsdgen_converter.cpp to use col::NAME instead of builders[N]. +Also fix append_addr_fields callers to use named base constants. +""" + +import re, sys + +WRAPPER = "/home/tsafin/src/tpch-cpp/src/dsdgen/dsdgen_wrapper.cpp" +CONVERTER = "/home/tsafin/src/tpch-cpp/src/dsdgen/dsdgen_converter.cpp" +HEADER = "/home/tsafin/src/tpch-cpp/include/tpch/dsdgen_col_idx.hpp" + +# --------------------------------------------------------------------------- +# 1. 
Parse schemas from dsdgen_wrapper.cpp +# --------------------------------------------------------------------------- +wrapper = open(WRAPPER).read() + +enum_to_table = {} +for m in re.finditer(r'case\s+TableType::(\w+):\s*return\s*"([^"]+)"', wrapper): + enum_to_table[m.group(1)] = m.group(2) + +case_pat = re.compile( + r'case\s+TableType::(\w+):\s*\n\s*return\s+arrow::schema\(\{(.*?)\}\);', re.DOTALL) +field_pat = re.compile(r'arrow::field\(\s*"([^"]+)"') + +table_columns = {} +for m in case_pat.finditer(wrapper): + enum_name = m.group(1) + if enum_name not in enum_to_table: + continue + tname = enum_to_table[enum_name] + table_columns[tname] = field_pat.findall(m.group(2)) + +# --------------------------------------------------------------------------- +# 2. Generate dsdgen_col_idx.hpp +# --------------------------------------------------------------------------- +lines = [ + "// AUTO-GENERATED by scripts/gen_col_indices.py — DO NOT EDIT MANUALLY", + "// Source of truth: src/dsdgen/dsdgen_wrapper.cpp (get_schema() switch)", + "//", + "// Provides zero-overhead named column indices for BuilderMap (vector) access.", + "// Usage: builders[col::store_returns::sr_returned_date_sk]", + "#pragma once", + "#include ", + "", + "namespace tpcds {", + "namespace col {", +] + +for tname in sorted(table_columns): + cols = table_columns[tname] + lines.append("") + lines.append(f"// {tname} ({len(cols)} columns)") + lines.append(f"namespace {tname} {{") + for i, col in enumerate(cols): + lines.append(f" constexpr std::size_t {col} = {i};") + lines.append("}") + +lines += ["", "} // namespace col", "} // namespace tpcds", ""] + +open(HEADER, 'w').write('\n'.join(lines)) +print(f"Wrote {HEADER} ({len(table_columns)} tables)") + +# --------------------------------------------------------------------------- +# 3. Rewrite dsdgen_converter.cpp: builders[N] → builders[col::TABLE::NAME] +# using the same find_func_end approach as before. 
+# --------------------------------------------------------------------------- +conv = open(CONVERTER).read() + +# Build reverse map: for each table, index→col_name +table_index_to_col = {t: {i: c for i, c in enumerate(cols)} + for t, cols in table_columns.items()} + +# Locate functions +func_pat = re.compile( + r'(void\s+append_(\w+)_to_builders\s*\([^)]*\)\s*\{)', re.DOTALL) + +def find_func_end(src, pos): + depth = 1 + i = pos + while i < len(src): + if src[i] == '{': + depth += 1 + elif src[i] == '}': + depth -= 1 + if depth == 0: + return i + 1 + i += 1 + return len(src) + +func_positions = [(m.start(), m.group(2), m.end()) for m in func_pat.finditer(conv)] + +idx_pat = re.compile(r'\bbuilders\[(\d+)\]') + +new_conv = conv +offset = 0 + +for start, raw_table, body_start in func_positions: + if raw_table not in table_index_to_col: + continue + idx_map = table_index_to_col[raw_table] + + adj_start = start + offset + adj_body = body_start + offset + adj_end = find_func_end(new_conv, adj_body) + func_body = new_conv[adj_start:adj_end] + + def replace_idx(m, idx_map=idx_map, tname=raw_table): + n = int(m.group(1)) + if n in idx_map: + return f"builders[col::{tname}::{idx_map[n]}]" + return m.group(0) # leave unchanged (e.g. addr base+N offsets) + + new_body = idx_pat.sub(replace_idx, func_body) + if new_body != func_body: + new_conv = new_conv[:adj_start] + new_body + new_conv[adj_end:] + offset += len(new_body) - len(func_body) + +# Fix append_addr_fields: base+N offsets (0..9) inside the function body itself +# Those are NOT in an append_ function, so the loop above skipped them. +# Replace builders[base + N] patterns — these are fine as-is (readable with named base). +# Also fix the append_addr_fields callers: col::TABLE::street_number as the base arg. 
+addr_callers = { + 'append_addr_fields(r->cc_address, 20, builders)': + 'append_addr_fields(r->cc_address, col::call_center::cc_street_number, builders)', + 'append_addr_fields(r->web_address, 15, builders)': + 'append_addr_fields(r->web_address, col::web_site::web_street_number, builders)', + 'append_addr_fields(r->w_address, 4, builders)': + 'append_addr_fields(r->w_address, col::warehouse::w_street_number, builders)', + 'append_addr_fields(r->ca_address, 2, builders)': + 'append_addr_fields(r->ca_address, col::customer_address::ca_street_number, builders)', + 'append_addr_fields(r->address, 18, builders)': + 'append_addr_fields(r->address, col::store::s_street_number, builders)', +} +for old, new in addr_callers.items(): + new_conv = new_conv.replace(old, new) + +# Add include of the new header near the top of dsdgen_converter.cpp +if '#include "tpch/dsdgen_col_idx.hpp"' not in new_conv: + new_conv = new_conv.replace( + '#include "tpch/dsdgen_converter.hpp"', + '#include "tpch/dsdgen_converter.hpp"\n#include "tpch/dsdgen_col_idx.hpp"' + ) + +# Count remaining numeric indices (should be only base+N inside append_addr_fields) +remaining = idx_pat.findall(new_conv) +named_count = new_conv.count('col::') +print(f"Named references inserted: {named_count}") +print(f"Remaining numeric indices (expected: base+0..9 in append_addr_fields): {len(remaining)}") + +open(CONVERTER, 'w').write(new_conv) +print(f"Wrote {CONVERTER}") diff --git a/src/dsdgen/dsdgen_converter.cpp b/src/dsdgen/dsdgen_converter.cpp new file mode 100644 index 0000000..3083b42 --- /dev/null +++ b/src/dsdgen/dsdgen_converter.cpp @@ -0,0 +1,1474 @@ +/** + * dsdgen_converter.cpp — Convert dsdgen C structs to Arrow array builders. + * + * Uses dec_to_double() to convert decimal_t (scaled integer) fields to double. + * ds_key_t (= int64_t on Linux) is mapped to arrow::int64(). 
+ */ + +#include "tpch/dsdgen_converter.hpp" +#include "tpch/dsdgen_col_idx.hpp" + +#include +#include + +extern "C" { +#include "tpcds_dsdgen.h" +} + +namespace tpcds { + +// --------------------------------------------------------------------------- +// Helper: decimal_t → double +// +// In this dsdgen tree, decimal_t stores the number of fractional digits in +// `precision` (matching dectostr()/dectof() in decimal.c), while `scale` +// carries related metadata used by arithmetic helpers. +// Example: "12.34" → precision=2, number=1234. +// Conversion: (double)number / 10^precision. +// +// NOTE: dectoflt() in decimal.c is buggy (divides by 10^(precision-1) and +// mutates the struct). We implement the correct formula here. +// --------------------------------------------------------------------------- + +static inline double dec_to_double(const decimal_t* d) { + if (d->precision == 0) return static_cast(d->number); + double result = static_cast(d->number); + for (int i = 0; i < d->precision; ++i) { + result /= 10.0; + } + return result; +} + +// --------------------------------------------------------------------------- +// dict8 encoding helpers — O(1) or O(N) encode for known distributions +// --------------------------------------------------------------------------- +namespace { + +static inline int8_t encode_cd_gender(const char* s) { return s[0]=='M'?0:1; } + +static inline int8_t encode_cd_marital_status(const char* s) { + switch(s[0]) { case 'M':return 0; case 'S':return 1; case 'D':return 2; + case 'W':return 3; default:return 4; } +} + +static inline int8_t encode_cd_education_status(const char* s) { + switch(s[0]) { case 'P':return 0; case 'S':return 1; case 'C':return 2; + case '2':return 3; case '4':return 4; case 'A':return 5; default:return 6; } +} + +static inline int8_t encode_cd_credit_rating(const char* s) { + switch(s[0]) { case 'G':return 0; case 'L':return 1; case 'H':return 2; default:return 3; } +} + +static inline int8_t 
encode_c_salutation(const char* s) { + if(s[0]=='M') { if(s[1]=='r') return s[2]=='.'?0:1; return s[1]=='s'?2:3; } + return s[0]=='S'?4:5; +} + +static inline int8_t encode_ca_location_type(const char* s) { + switch(s[0]) { case 's':return 0; case 'c':return 1; default:return 2; } +} + +static inline int8_t encode_ca_street_type(const char* s) { + static const char* types[] = { + "Street","ST","Avenue","Ave","Boulevard","Blvd","Road","RD", + "Parkway","Pkwy","Way","Wy","Drive","Dr.","Circle","Cir.","Lane","Ln","Court","Ct." + }; + for (int i = 0; i < 20; i++) if (strcmp(s, types[i]) == 0) return (int8_t)i; + return 0; +} + +static inline int8_t encode_cc_class(const char* s) { + switch(s[0]) { case 's':return 0; case 'm':return 1; default:return 2; } +} + +static inline int8_t encode_cc_hours(const char* s) { + return s[5]=='4'?0:(s[5]=='1'?1:2); +} + +static inline int8_t encode_cc_name(const char* s) { + static const char* names[] = { + "New England","NY Metro","Mid Atlantic","Southeastern","North Midwest", + "Central Midwest","South Midwest","Pacific Northwest", + "California","Southwest","Hawaii/Alaska","Other" + }; + for (int i = 0; i < 12; i++) if (strcmp(s, names[i]) == 0) return (int8_t)i; + return 0; +} + +static inline int8_t encode_cp_type(const char* s) { + switch(s[0]) { case 'b':return 0; case 'q':return 1; default:return 2; } +} + +static inline int8_t encode_wp_type(const char* s) { + switch(s[0]) { case 'a':return 3; case 'f':return 4; case 'p':return 5; case 'd':return 6; + case 'w':return 2; case 'o':return 1; default:return 0; } +} + +static inline int8_t encode_sm_type(const char* s) { + switch(s[0]) { case 'R':return 0; case 'E':return 1; case 'N':return 2; + case 'O':return 3; case 'T':return 4; default:return 5; } +} + +static inline int8_t encode_sm_code(const char* s) { + switch(s[0]) { case 'A':return 0; case 'B':return 3; case 'H':return 4; + case 'M':return 5; case 'C':return 6; + default: return s[1]=='U'?1:2; } +} + +static inline 
int8_t encode_sm_carrier(const char* s) { + static const char* carriers[] = { + "UPS","FEDEX","AIRBORNE","USPS","DHL","TBS","ZHOU","ZOUROS","MSC","LATVIAN", + "ALLIANCE","ORIENTAL","BARIAN","BOXBUNDLES","GREAT EASTERN","DIAMOND", + "RUPEKSA","GERMA","HARMSTORF","PRIVATECARRIER" + }; + for (int i = 0; i < 20; i++) if (strcmp(s, carriers[i]) == 0) return (int8_t)i; + return 0; +} + +static inline int8_t encode_t_am_pm(const char* s) { return s[0]=='A'?0:1; } + +static inline int8_t encode_t_shift(const char* s) { + switch(s[0]) { case 'f':return 0; case 's':return 1; default:return 2; } +} + +static inline int8_t encode_t_sub_shift(const char* s) { + switch(s[0]) { case 'm':return 0; case 'a':return 1; case 'e':return 2; default:return 3; } +} + +static inline int8_t encode_t_meal_time(const char* s) { + if(!s || !s[0]) return 0; + switch(s[0]) { case 'b':return 1; case 'l':return 2; default:return 3; } +} + +static inline int8_t encode_hd_buy_potential(const char* s) { + if (!s || !s[0]) return 5; + switch (s[0]) { + case '>': return 0; // ">10000" + case '0': return 1; // "0-500" + case '5': return s[1]=='0'?2:3; // "501-1000" vs "5001-10000" + case '1': return 4; // "1001-5000" + default: return 5; // "unknown" + } +} + +static inline int8_t encode_d_day_name(const char* s) { + if(s[0]=='S') return s[1]=='u'?0:6; + switch(s[0]) { case 'M':return 1; case 'F':return 5; + case 'T': return s[1]=='u'?2:4; default:return 3; } +} + +static inline int8_t encode_i_category(const char* s) { + switch(s[0]) { + case 'W':return 0; case 'C':return 2; case 'J':return 5; + case 'H':return 6; case 'B':return 8; case 'E':return 9; + case 'S': return s[1]=='h'?3:7; + case 'M': return s[1]=='e'?1:4; + default:return 0; + } +} + +static inline int8_t encode_i_size(const char* s) { + switch(s[0]) { case 'p':return 0; case 's':return 1; case 'm':return 2; + case 'l':return 3; case 'e':return 4; + case 'N':return 6; default:return 5; } +} + +static inline int8_t encode_i_color(const 
char* s) { + static const char* colors[] = { + "almond","antique","aquamarine","azure","beige","bisque","black","blanched", + "blue","blush","brown","burlywood","burnished","chartreuse","chiffon","chocolate", + "coral","cornflower","cornsilk","cream","cyan","dark","deep","dim","dodger", + "drab","firebrick","floral","forest","frosted","gainsboro","ghost","goldenrod", + "green","grey","honeydew","hot","indian","ivory","khaki","lace","lavender", + "lawn","lemon","light","lime","linen","magenta","maroon","medium","metallic", + "midnight","mint","misty","moccasin","navajo","navy","olive","orange","orchid", + "pale","papaya","peach","peru","pink","plum","powder","puff","purple","red", + "rose","rosy","royal","saddle","salmon","sandy","seashell","sienna","sky", + "slate","smoke","snow","spring","steel","tan","thistle","tomato","turquoise", + "violet","wheat","white","yellow" + }; + for (int i = 0; i < 92; i++) if (strcmp(s, colors[i]) == 0) return (int8_t)i; + return 0; +} + +static inline int8_t encode_i_units(const char* s) { + static const char* units[] = { + "Unknown","Each","Dozen","Case","Pallet","Gross","Carton","Box","Bunch", + "Bundle","Oz","Lb","Ton","Ounce","Pound","Tsp","Tbl","Cup","Dram","Gram","N/A" + }; + for (int i = 0; i < 21; i++) if (strcmp(s, units[i]) == 0) return (int8_t)i; + return 0; +} + +static inline int8_t encode_state(const char* s) { + static const char* states[] = { + "AK","AL","AR","AZ","CA","CO","CT","DC","DE","FL","GA","HI","IA","ID", + "IL","IN","KS","KY","LA","MA","MD","ME","MI","MN","MO","MS","MT","NC", + "ND","NE","NH","NJ","NM","NV","NY","OH","OK","OR","PA","RI","SC","SD", + "TN","TX","UT","VA","VT","WA","WI","WV","WY" + }; + for (int i = 0; i < 52; i++) if (strcmp(s, states[i]) == 0) return (int8_t)i; + return 0; +} + +} // anonymous namespace + +// --------------------------------------------------------------------------- +// Static dictionary arrays and getter +// 
--------------------------------------------------------------------------- + +std::shared_ptr get_dict_for_field(const std::string& name) { + auto make = [](std::initializer_list vals) { + arrow::StringBuilder b; + for (auto v : vals) (void)b.Append(v, strlen(v)); + return *b.Finish(); + }; + + static auto gender = make({"M","F"}); + static auto marital = make({"M","S","D","W","U"}); + static auto education = make({"Primary","Secondary","College","2 yr Degree","4 yr Degree","Advanced Degree","Unknown"}); + static auto credit = make({"Good","Low Risk","High Risk","Unknown"}); + static auto salutation = make({"Mr.","Mrs.","Ms.","Miss","Sir","Dr."}); + static auto am_pm = make({"AM","PM"}); + static auto shift = make({"first","second","third"}); + static auto sub_shift = make({"morning","afternoon","evening","night"}); + static auto meal_time = make({"","breakfast","lunch","dinner"}); + static auto day_name = make({"Sunday","Monday","Tuesday","Wednesday","Thursday","Friday","Saturday"}); + static auto category = make({"Women","Men","Children","Shoes","Music","Jewelry","Home","Sports","Books","Electronics"}); + static auto item_size = make({"petite","small","medium","large","extra large","economy","N/A"}); + static auto cp_type_d = make({"bi-annual","quarterly","monthly"}); + static auto wp_type_d = make({"general","order","welcome","ad","feedback","protected","dynamic"}); + static auto sm_type_d = make({"REGULAR","EXPRESS","NEXT DAY","OVERNIGHT","TWO DAY","LIBRARY"}); + static auto sm_code_d = make({"AIR","SURFACE","SEA","BIKE","HAND CARRY","MESSENGER","COURIER"}); + static auto sm_carrier_d = make({"UPS","FEDEX","AIRBORNE","USPS","DHL","TBS","ZHOU","ZOUROS","MSC","LATVIAN","ALLIANCE","ORIENTAL","BARIAN","BOXBUNDLES","GREAT EASTERN","DIAMOND","RUPEKSA","GERMA","HARMSTORF","PRIVATECARRIER"}); + static auto loc_type = make({"single family","condo","apartment"}); + static auto cc_class_d = make({"small","medium","large"}); + static auto cc_hours_d = 
make({"8AM-4PM","8AM-12AM","8AM-8AM"}); + static auto cc_name_d = make({"New England","NY Metro","Mid Atlantic","Southeastern","North Midwest","Central Midwest","South Midwest","Pacific Northwest","California","Southwest","Hawaii/Alaska","Other"}); + static auto street_type_d = make({"Street","ST","Avenue","Ave","Boulevard","Blvd","Road","RD","Parkway","Pkwy","Way","Wy","Drive","Dr.","Circle","Cir.","Lane","Ln","Court","Ct."}); + static auto buy_potential = make({">10000","0-500","501-1000","5001-10000","1001-5000","unknown"}); + static auto one_unknown = make({"Unknown"}); + static auto one_dept = make({"DEPARTMENT"}); + static auto one_us = make({"United States"}); + static auto states = make({"AK","AL","AR","AZ","CA","CO","CT","DC","DE","FL","GA","HI","IA","ID","IL","IN","KS","KY","LA","MA","MD","ME","MI","MN","MO","MS","MT","NC","ND","NE","NH","NJ","NM","NV","NY","OH","OK","OR","PA","RI","SC","SD","TN","TX","UT","VA","VT","WA","WI","WV","WY"}); + static auto colors = make({"almond","antique","aquamarine","azure","beige","bisque","black","blanched","blue","blush","brown","burlywood","burnished","chartreuse","chiffon","chocolate","coral","cornflower","cornsilk","cream","cyan","dark","deep","dim","dodger","drab","firebrick","floral","forest","frosted","gainsboro","ghost","goldenrod","green","grey","honeydew","hot","indian","ivory","khaki","lace","lavender","lawn","lemon","light","lime","linen","magenta","maroon","medium","metallic","midnight","mint","misty","moccasin","navajo","navy","olive","orange","orchid","pale","papaya","peach","peru","pink","plum","powder","puff","purple","red","rose","rosy","royal","saddle","salmon","sandy","seashell","sienna","sky","slate","smoke","snow","spring","steel","tan","thistle","tomato","turquoise","violet","wheat","white","yellow"}); + static auto units = make({"Unknown","Each","Dozen","Case","Pallet","Gross","Carton","Box","Bunch","Bundle","Oz","Lb","Ton","Ounce","Pound","Tsp","Tbl","Cup","Dram","Gram","N/A"}); + + static const 
std::unordered_map> registry = { + {"cd_gender", gender}, + {"cd_marital_status", marital}, + {"cd_education_status", education}, + {"cd_credit_rating", credit}, + {"c_salutation", salutation}, + {"t_am_pm", am_pm}, + {"t_shift", shift}, + {"t_sub_shift", sub_shift}, + {"t_meal_time", meal_time}, + {"d_day_name", day_name}, + {"i_category", category}, + {"i_size", item_size}, + {"i_container", one_unknown}, + {"i_color", colors}, + {"i_units", units}, + {"cp_department", one_dept}, + {"cp_type", cp_type_d}, + {"wp_type", wp_type_d}, + {"web_class", one_unknown}, + {"web_country", one_us}, + {"web_state", states}, + {"web_street_type", street_type_d}, + {"w_country", one_us}, + {"w_state", states}, + {"w_street_type", street_type_d}, + {"s_hours", cc_hours_d}, + {"s_geography_class", one_unknown}, + {"s_division_name", one_unknown}, + {"s_company_name", one_unknown}, + {"s_country", one_us}, + {"s_state", states}, + {"s_street_type", street_type_d}, + {"sm_type", sm_type_d}, + {"sm_code", sm_code_d}, + {"sm_carrier", sm_carrier_d}, + {"cc_class", cc_class_d}, + {"cc_hours", cc_hours_d}, + {"cc_name", cc_name_d}, + {"cc_country", one_us}, + {"cc_state", states}, + {"cc_street_type", street_type_d}, + {"ca_location_type", loc_type}, + {"ca_country", one_us}, + {"ca_state", states}, + {"ca_street_type", street_type_d}, + {"p_purpose", one_unknown}, + {"hd_buy_potential", buy_potential}, + }; + + auto it = registry.find(name); + return it != registry.end() ? 
it->second : nullptr; +} + +// --------------------------------------------------------------------------- +// store_sales +// --------------------------------------------------------------------------- + +void append_store_sales_to_builders( + const void* row, + tpcds::BuilderMap& builders) +{ + auto* r = static_cast(row); + + // Surrogate keys (int64) + static_cast(builders[col::store_sales::ss_sold_date_sk].get()) + ->Append(static_cast(r->ss_sold_date_sk)); + static_cast(builders[col::store_sales::ss_sold_time_sk].get()) + ->Append(static_cast(r->ss_sold_time_sk)); + static_cast(builders[col::store_sales::ss_item_sk].get()) + ->Append(static_cast(r->ss_sold_item_sk)); + static_cast(builders[col::store_sales::ss_customer_sk].get()) + ->Append(static_cast(r->ss_sold_customer_sk)); + static_cast(builders[col::store_sales::ss_cdemo_sk].get()) + ->Append(static_cast(r->ss_sold_cdemo_sk)); + static_cast(builders[col::store_sales::ss_hdemo_sk].get()) + ->Append(static_cast(r->ss_sold_hdemo_sk)); + static_cast(builders[col::store_sales::ss_addr_sk].get()) + ->Append(static_cast(r->ss_sold_addr_sk)); + static_cast(builders[col::store_sales::ss_store_sk].get()) + ->Append(static_cast(r->ss_sold_store_sk)); + static_cast(builders[col::store_sales::ss_promo_sk].get()) + ->Append(static_cast(r->ss_sold_promo_sk)); + static_cast(builders[col::store_sales::ss_ticket_number].get()) + ->Append(static_cast(r->ss_ticket_number)); + + // Quantity (int) + static_cast(builders[col::store_sales::ss_quantity].get()) + ->Append(static_cast(r->ss_pricing.quantity)); + + // Decimal pricing fields → double + const ds_pricing_t* p = &r->ss_pricing; + + static_cast(builders[col::store_sales::ss_wholesale_cost].get()) + ->Append(dec_to_double(&p->wholesale_cost)); + static_cast(builders[col::store_sales::ss_list_price].get()) + ->Append(dec_to_double(&p->list_price)); + static_cast(builders[col::store_sales::ss_sales_price].get()) + ->Append(dec_to_double(&p->sales_price)); + 
static_cast(builders[col::store_sales::ss_ext_discount_amt].get()) + ->Append(dec_to_double(&p->ext_discount_amt)); + static_cast(builders[col::store_sales::ss_ext_sales_price].get()) + ->Append(dec_to_double(&p->ext_sales_price)); + static_cast(builders[col::store_sales::ss_ext_wholesale_cost].get()) + ->Append(dec_to_double(&p->ext_wholesale_cost)); + static_cast(builders[col::store_sales::ss_ext_list_price].get()) + ->Append(dec_to_double(&p->ext_list_price)); + static_cast(builders[col::store_sales::ss_ext_tax].get()) + ->Append(dec_to_double(&p->ext_tax)); + static_cast(builders[col::store_sales::ss_coupon_amt].get()) + ->Append(dec_to_double(&p->coupon_amt)); + static_cast(builders[col::store_sales::ss_net_paid].get()) + ->Append(dec_to_double(&p->net_paid)); + static_cast(builders[col::store_sales::ss_net_paid_inc_tax].get()) + ->Append(dec_to_double(&p->net_paid_inc_tax)); + static_cast(builders[col::store_sales::ss_net_profit].get()) + ->Append(dec_to_double(&p->net_profit)); +} + +// --------------------------------------------------------------------------- +// inventory +// --------------------------------------------------------------------------- + +void append_inventory_to_builders( + const void* row, + tpcds::BuilderMap& builders) +{ + auto* r = static_cast(row); + + static_cast(builders[col::inventory::inv_date_sk].get()) + ->Append(static_cast(r->inv_date_sk)); + static_cast(builders[col::inventory::inv_item_sk].get()) + ->Append(static_cast(r->inv_item_sk)); + static_cast(builders[col::inventory::inv_warehouse_sk].get()) + ->Append(static_cast(r->inv_warehouse_sk)); + static_cast(builders[col::inventory::inv_quantity_on_hand].get()) + ->Append(static_cast(r->inv_quantity_on_hand)); +} + +// --------------------------------------------------------------------------- +// catalog_sales +// --------------------------------------------------------------------------- + +void append_catalog_sales_to_builders( + const void* row, + tpcds::BuilderMap& 
builders) +{ + auto* r = static_cast(row); + + static_cast(builders[col::catalog_sales::cs_sold_date_sk].get()) + ->Append(static_cast(r->cs_sold_date_sk)); + static_cast(builders[col::catalog_sales::cs_sold_time_sk].get()) + ->Append(static_cast(r->cs_sold_time_sk)); + static_cast(builders[col::catalog_sales::cs_ship_date_sk].get()) + ->Append(static_cast(r->cs_ship_date_sk)); + static_cast(builders[col::catalog_sales::cs_bill_customer_sk].get()) + ->Append(static_cast(r->cs_bill_customer_sk)); + static_cast(builders[col::catalog_sales::cs_bill_cdemo_sk].get()) + ->Append(static_cast(r->cs_bill_cdemo_sk)); + static_cast(builders[col::catalog_sales::cs_bill_hdemo_sk].get()) + ->Append(static_cast(r->cs_bill_hdemo_sk)); + static_cast(builders[col::catalog_sales::cs_bill_addr_sk].get()) + ->Append(static_cast(r->cs_bill_addr_sk)); + static_cast(builders[col::catalog_sales::cs_ship_customer_sk].get()) + ->Append(static_cast(r->cs_ship_customer_sk)); + static_cast(builders[col::catalog_sales::cs_ship_cdemo_sk].get()) + ->Append(static_cast(r->cs_ship_cdemo_sk)); + static_cast(builders[col::catalog_sales::cs_ship_hdemo_sk].get()) + ->Append(static_cast(r->cs_ship_hdemo_sk)); + static_cast(builders[col::catalog_sales::cs_ship_addr_sk].get()) + ->Append(static_cast(r->cs_ship_addr_sk)); + static_cast(builders[col::catalog_sales::cs_call_center_sk].get()) + ->Append(static_cast(r->cs_call_center_sk)); + static_cast(builders[col::catalog_sales::cs_catalog_page_sk].get()) + ->Append(static_cast(r->cs_catalog_page_sk)); + static_cast(builders[col::catalog_sales::cs_ship_mode_sk].get()) + ->Append(static_cast(r->cs_ship_mode_sk)); + static_cast(builders[col::catalog_sales::cs_warehouse_sk].get()) + ->Append(static_cast(r->cs_warehouse_sk)); + static_cast(builders[col::catalog_sales::cs_item_sk].get()) + ->Append(static_cast(r->cs_sold_item_sk)); + static_cast(builders[col::catalog_sales::cs_promo_sk].get()) + ->Append(static_cast(r->cs_promo_sk)); + 
static_cast(builders[col::catalog_sales::cs_order_number].get()) + ->Append(static_cast(r->cs_order_number)); + + const ds_pricing_t* p = &r->cs_pricing; + static_cast(builders[col::catalog_sales::cs_quantity].get()) + ->Append(static_cast(p->quantity)); + static_cast(builders[col::catalog_sales::cs_wholesale_cost].get()) + ->Append(dec_to_double(&p->wholesale_cost)); + static_cast(builders[col::catalog_sales::cs_list_price].get()) + ->Append(dec_to_double(&p->list_price)); + static_cast(builders[col::catalog_sales::cs_sales_price].get()) + ->Append(dec_to_double(&p->sales_price)); + static_cast(builders[col::catalog_sales::cs_ext_discount_amt].get()) + ->Append(dec_to_double(&p->ext_discount_amt)); + static_cast(builders[col::catalog_sales::cs_ext_sales_price].get()) + ->Append(dec_to_double(&p->ext_sales_price)); + static_cast(builders[col::catalog_sales::cs_ext_wholesale_cost].get()) + ->Append(dec_to_double(&p->ext_wholesale_cost)); + static_cast(builders[col::catalog_sales::cs_ext_list_price].get()) + ->Append(dec_to_double(&p->ext_list_price)); + static_cast(builders[col::catalog_sales::cs_ext_tax].get()) + ->Append(dec_to_double(&p->ext_tax)); + static_cast(builders[col::catalog_sales::cs_coupon_amt].get()) + ->Append(dec_to_double(&p->coupon_amt)); + static_cast(builders[col::catalog_sales::cs_ext_ship_cost].get()) + ->Append(dec_to_double(&p->ext_ship_cost)); + static_cast(builders[col::catalog_sales::cs_net_paid].get()) + ->Append(dec_to_double(&p->net_paid)); + static_cast(builders[col::catalog_sales::cs_net_paid_inc_tax].get()) + ->Append(dec_to_double(&p->net_paid_inc_tax)); + static_cast(builders[col::catalog_sales::cs_net_paid_inc_ship].get()) + ->Append(dec_to_double(&p->net_paid_inc_ship)); + static_cast(builders[col::catalog_sales::cs_net_paid_inc_ship_tax].get()) + ->Append(dec_to_double(&p->net_paid_inc_ship_tax)); + static_cast(builders[col::catalog_sales::cs_net_profit].get()) + ->Append(dec_to_double(&p->net_profit)); +} + +// 
--------------------------------------------------------------------------- +// web_sales +// --------------------------------------------------------------------------- + +void append_web_sales_to_builders( + const void* row, + tpcds::BuilderMap& builders) +{ + auto* r = static_cast(row); + + static_cast(builders[col::web_sales::ws_sold_date_sk].get()) + ->Append(static_cast(r->ws_sold_date_sk)); + static_cast(builders[col::web_sales::ws_sold_time_sk].get()) + ->Append(static_cast(r->ws_sold_time_sk)); + static_cast(builders[col::web_sales::ws_ship_date_sk].get()) + ->Append(static_cast(r->ws_ship_date_sk)); + static_cast(builders[col::web_sales::ws_item_sk].get()) + ->Append(static_cast(r->ws_item_sk)); + static_cast(builders[col::web_sales::ws_bill_customer_sk].get()) + ->Append(static_cast(r->ws_bill_customer_sk)); + static_cast(builders[col::web_sales::ws_bill_cdemo_sk].get()) + ->Append(static_cast(r->ws_bill_cdemo_sk)); + static_cast(builders[col::web_sales::ws_bill_hdemo_sk].get()) + ->Append(static_cast(r->ws_bill_hdemo_sk)); + static_cast(builders[col::web_sales::ws_bill_addr_sk].get()) + ->Append(static_cast(r->ws_bill_addr_sk)); + static_cast(builders[col::web_sales::ws_ship_customer_sk].get()) + ->Append(static_cast(r->ws_ship_customer_sk)); + static_cast(builders[col::web_sales::ws_ship_cdemo_sk].get()) + ->Append(static_cast(r->ws_ship_cdemo_sk)); + static_cast(builders[col::web_sales::ws_ship_hdemo_sk].get()) + ->Append(static_cast(r->ws_ship_hdemo_sk)); + static_cast(builders[col::web_sales::ws_ship_addr_sk].get()) + ->Append(static_cast(r->ws_ship_addr_sk)); + static_cast(builders[col::web_sales::ws_web_page_sk].get()) + ->Append(static_cast(r->ws_web_page_sk)); + static_cast(builders[col::web_sales::ws_web_site_sk].get()) + ->Append(static_cast(r->ws_web_site_sk)); + static_cast(builders[col::web_sales::ws_ship_mode_sk].get()) + ->Append(static_cast(r->ws_ship_mode_sk)); + static_cast(builders[col::web_sales::ws_warehouse_sk].get()) + 
->Append(static_cast(r->ws_warehouse_sk)); + static_cast(builders[col::web_sales::ws_promo_sk].get()) + ->Append(static_cast(r->ws_promo_sk)); + static_cast(builders[col::web_sales::ws_order_number].get()) + ->Append(static_cast(r->ws_order_number)); + + const ds_pricing_t* p = &r->ws_pricing; + static_cast(builders[col::web_sales::ws_quantity].get()) + ->Append(static_cast(p->quantity)); + static_cast(builders[col::web_sales::ws_wholesale_cost].get()) + ->Append(dec_to_double(&p->wholesale_cost)); + static_cast(builders[col::web_sales::ws_list_price].get()) + ->Append(dec_to_double(&p->list_price)); + static_cast(builders[col::web_sales::ws_sales_price].get()) + ->Append(dec_to_double(&p->sales_price)); + static_cast(builders[col::web_sales::ws_ext_discount_amt].get()) + ->Append(dec_to_double(&p->ext_discount_amt)); + static_cast(builders[col::web_sales::ws_ext_sales_price].get()) + ->Append(dec_to_double(&p->ext_sales_price)); + static_cast(builders[col::web_sales::ws_ext_wholesale_cost].get()) + ->Append(dec_to_double(&p->ext_wholesale_cost)); + static_cast(builders[col::web_sales::ws_ext_list_price].get()) + ->Append(dec_to_double(&p->ext_list_price)); + static_cast(builders[col::web_sales::ws_ext_tax].get()) + ->Append(dec_to_double(&p->ext_tax)); + static_cast(builders[col::web_sales::ws_coupon_amt].get()) + ->Append(dec_to_double(&p->coupon_amt)); + static_cast(builders[col::web_sales::ws_ext_ship_cost].get()) + ->Append(dec_to_double(&p->ext_ship_cost)); + static_cast(builders[col::web_sales::ws_net_paid].get()) + ->Append(dec_to_double(&p->net_paid)); + static_cast(builders[col::web_sales::ws_net_paid_inc_tax].get()) + ->Append(dec_to_double(&p->net_paid_inc_tax)); + static_cast(builders[col::web_sales::ws_net_paid_inc_ship].get()) + ->Append(dec_to_double(&p->net_paid_inc_ship)); + static_cast(builders[col::web_sales::ws_net_paid_inc_ship_tax].get()) + ->Append(dec_to_double(&p->net_paid_inc_ship_tax)); + 
static_cast(builders[col::web_sales::ws_net_profit].get()) + ->Append(dec_to_double(&p->net_profit)); +} + +// --------------------------------------------------------------------------- +// customer +// --------------------------------------------------------------------------- + +void append_customer_to_builders( + const void* row, + tpcds::BuilderMap& builders) +{ + auto* r = static_cast(row); + + static_cast(builders[col::customer::c_customer_sk].get()) + ->Append(static_cast(r->c_customer_sk)); + static_cast(builders[col::customer::c_customer_id].get()) + ->Append(r->c_customer_id); + static_cast(builders[col::customer::c_current_cdemo_sk].get()) + ->Append(static_cast(r->c_current_cdemo_sk)); + static_cast(builders[col::customer::c_current_hdemo_sk].get()) + ->Append(static_cast(r->c_current_hdemo_sk)); + static_cast(builders[col::customer::c_current_addr_sk].get()) + ->Append(static_cast(r->c_current_addr_sk)); + static_cast(builders[col::customer::c_first_shipto_date_id].get()) + ->Append(static_cast(r->c_first_shipto_date_id)); + static_cast(builders[col::customer::c_first_sales_date_id].get()) + ->Append(static_cast(r->c_first_sales_date_id)); + static_cast(builders[col::customer::c_salutation].get()) + ->Append(encode_c_salutation(r->c_salutation ? r->c_salutation : "")); + static_cast(builders[col::customer::c_first_name].get()) + ->Append(r->c_first_name ? r->c_first_name : ""); + static_cast(builders[col::customer::c_last_name].get()) + ->Append(r->c_last_name ? 
r->c_last_name : ""); + static_cast(builders[col::customer::c_preferred_cust_flag].get()) + ->Append(static_cast(r->c_preferred_cust_flag)); + static_cast(builders[col::customer::c_birth_day].get()) + ->Append(static_cast(r->c_birth_day)); + static_cast(builders[col::customer::c_birth_month].get()) + ->Append(static_cast(r->c_birth_month)); + static_cast(builders[col::customer::c_birth_year].get()) + ->Append(static_cast(r->c_birth_year)); + static_cast(builders[col::customer::c_birth_country].get()) + ->Append(r->c_birth_country ? r->c_birth_country : ""); + static_cast(builders[col::customer::c_login].get()) + ->Append(r->c_login); + static_cast(builders[col::customer::c_email_address].get()) + ->Append(r->c_email_address); + static_cast(builders[col::customer::c_last_review_date].get()) + ->Append(static_cast(r->c_last_review_date)); +} + +// --------------------------------------------------------------------------- +// item +// --------------------------------------------------------------------------- + +void append_item_to_builders( + const void* row, + tpcds::BuilderMap& builders) +{ + auto* r = static_cast(row); + + static_cast(builders[col::item::i_item_sk].get()) + ->Append(static_cast(r->i_item_sk)); + static_cast(builders[col::item::i_item_id].get()) + ->Append(r->i_item_id); + static_cast(builders[col::item::i_rec_start_date_id].get()) + ->Append(static_cast(r->i_rec_start_date_id)); + static_cast(builders[col::item::i_rec_end_date_id].get()) + ->Append(static_cast(r->i_rec_end_date_id)); + static_cast(builders[col::item::i_item_desc].get()) + ->Append(r->i_item_desc); + static_cast(builders[col::item::i_current_price].get()) + ->Append(dec_to_double(&r->i_current_price)); + static_cast(builders[col::item::i_wholesale_cost].get()) + ->Append(dec_to_double(&r->i_wholesale_cost)); + static_cast(builders[col::item::i_brand_id].get()) + ->Append(static_cast(r->i_brand_id)); + static_cast(builders[col::item::i_brand].get()) + ->Append(r->i_brand); + 
static_cast(builders[col::item::i_class_id].get()) + ->Append(static_cast(r->i_class_id)); + static_cast(builders[col::item::i_class].get()) + ->Append(r->i_class ? r->i_class : ""); + static_cast(builders[col::item::i_category_id].get()) + ->Append(static_cast(r->i_category_id)); + static_cast(builders[col::item::i_category].get()) + ->Append(encode_i_category(r->i_category ? r->i_category : "")); + static_cast(builders[col::item::i_manufact_id].get()) + ->Append(static_cast(r->i_manufact_id)); + static_cast(builders[col::item::i_manufact].get()) + ->Append(r->i_manufact); + static_cast(builders[col::item::i_size].get()) + ->Append(encode_i_size(r->i_size ? r->i_size : "")); + static_cast(builders[col::item::i_formulation].get()) + ->Append(r->i_formulation); + static_cast(builders[col::item::i_color].get()) + ->Append(encode_i_color(r->i_color ? r->i_color : "")); + static_cast(builders[col::item::i_units].get()) + ->Append(encode_i_units(r->i_units ? r->i_units : "")); + static_cast(builders[col::item::i_container].get()) + ->Append(0); // always "Unknown" + static_cast(builders[col::item::i_manager_id].get()) + ->Append(static_cast(r->i_manager_id)); + static_cast(builders[col::item::i_product_name].get()) + ->Append(r->i_product_name); + static_cast(builders[col::item::i_promo_sk].get()) + ->Append(static_cast(r->i_promo_sk)); +} + +// --------------------------------------------------------------------------- +// date_dim +// --------------------------------------------------------------------------- + +void append_date_dim_to_builders( + const void* row, + tpcds::BuilderMap& builders) +{ + auto* r = static_cast(row); + + static_cast(builders[col::date_dim::d_date_sk].get()) + ->Append(static_cast(r->d_date_sk)); + static_cast(builders[col::date_dim::d_date_id].get()) + ->Append(r->d_date_id); + static_cast(builders[col::date_dim::d_month_seq].get()) + ->Append(static_cast(r->d_month_seq)); + static_cast(builders[col::date_dim::d_week_seq].get()) + 
->Append(static_cast(r->d_week_seq)); + static_cast(builders[col::date_dim::d_quarter_seq].get()) + ->Append(static_cast(r->d_quarter_seq)); + static_cast(builders[col::date_dim::d_year].get()) + ->Append(static_cast(r->d_year)); + static_cast(builders[col::date_dim::d_dow].get()) + ->Append(static_cast(r->d_dow)); + static_cast(builders[col::date_dim::d_moy].get()) + ->Append(static_cast(r->d_moy)); + static_cast(builders[col::date_dim::d_dom].get()) + ->Append(static_cast(r->d_dom)); + static_cast(builders[col::date_dim::d_qoy].get()) + ->Append(static_cast(r->d_qoy)); + static_cast(builders[col::date_dim::d_fy_year].get()) + ->Append(static_cast(r->d_fy_year)); + static_cast(builders[col::date_dim::d_fy_quarter_seq].get()) + ->Append(static_cast(r->d_fy_quarter_seq)); + static_cast(builders[col::date_dim::d_fy_week_seq].get()) + ->Append(static_cast(r->d_fy_week_seq)); + static_cast(builders[col::date_dim::d_day_name].get()) + ->Append(encode_d_day_name(r->d_day_name ? r->d_day_name : "")); + static_cast(builders[col::date_dim::d_holiday].get()) + ->Append(static_cast(r->d_holiday)); + static_cast(builders[col::date_dim::d_weekend].get()) + ->Append(static_cast(r->d_weekend)); + static_cast(builders[col::date_dim::d_following_holiday].get()) + ->Append(static_cast(r->d_following_holiday)); + static_cast(builders[col::date_dim::d_first_dom].get()) + ->Append(static_cast(r->d_first_dom)); + static_cast(builders[col::date_dim::d_last_dom].get()) + ->Append(static_cast(r->d_last_dom)); + static_cast(builders[col::date_dim::d_same_day_ly].get()) + ->Append(static_cast(r->d_same_day_ly)); + static_cast(builders[col::date_dim::d_same_day_lq].get()) + ->Append(static_cast(r->d_same_day_lq)); + static_cast(builders[col::date_dim::d_current_day].get()) + ->Append(static_cast(r->d_current_day)); + static_cast(builders[col::date_dim::d_current_week].get()) + ->Append(static_cast(r->d_current_week)); + static_cast(builders[col::date_dim::d_current_month].get()) + 
->Append(static_cast(r->d_current_month)); + static_cast(builders[col::date_dim::d_current_quarter].get()) + ->Append(static_cast(r->d_current_quarter)); + static_cast(builders[col::date_dim::d_current_year].get()) + ->Append(static_cast(r->d_current_year)); +} + +// --------------------------------------------------------------------------- +// store_returns +// --------------------------------------------------------------------------- + +void append_store_returns_to_builders( + const void* row, + tpcds::BuilderMap& builders) +{ + auto* r = static_cast(row); + + static_cast(builders[col::store_returns::sr_returned_date_sk].get()) + ->Append(static_cast(r->sr_returned_date_sk)); + static_cast(builders[col::store_returns::sr_returned_time_sk].get()) + ->Append(static_cast(r->sr_returned_time_sk)); + static_cast(builders[col::store_returns::sr_item_sk].get()) + ->Append(static_cast(r->sr_item_sk)); + static_cast(builders[col::store_returns::sr_customer_sk].get()) + ->Append(static_cast(r->sr_customer_sk)); + static_cast(builders[col::store_returns::sr_cdemo_sk].get()) + ->Append(static_cast(r->sr_cdemo_sk)); + static_cast(builders[col::store_returns::sr_hdemo_sk].get()) + ->Append(static_cast(r->sr_hdemo_sk)); + static_cast(builders[col::store_returns::sr_addr_sk].get()) + ->Append(static_cast(r->sr_addr_sk)); + static_cast(builders[col::store_returns::sr_store_sk].get()) + ->Append(static_cast(r->sr_store_sk)); + static_cast(builders[col::store_returns::sr_reason_sk].get()) + ->Append(static_cast(r->sr_reason_sk)); + static_cast(builders[col::store_returns::sr_ticket_number].get()) + ->Append(static_cast(r->sr_ticket_number)); + + const ds_pricing_t* p = &r->sr_pricing; + static_cast(builders[col::store_returns::sr_quantity].get()) + ->Append(static_cast(p->quantity)); + static_cast(builders[col::store_returns::sr_net_paid].get()) + ->Append(dec_to_double(&p->net_paid)); + static_cast(builders[col::store_returns::sr_ext_tax].get()) + 
->Append(dec_to_double(&p->ext_tax)); + static_cast(builders[col::store_returns::sr_net_paid_inc_tax].get()) + ->Append(dec_to_double(&p->net_paid_inc_tax)); + static_cast(builders[col::store_returns::sr_fee].get()) + ->Append(dec_to_double(&p->fee)); + static_cast(builders[col::store_returns::sr_ext_ship_cost].get()) + ->Append(dec_to_double(&p->ext_ship_cost)); + static_cast(builders[col::store_returns::sr_refunded_cash].get()) + ->Append(dec_to_double(&p->refunded_cash)); + static_cast(builders[col::store_returns::sr_reversed_charge].get()) + ->Append(dec_to_double(&p->reversed_charge)); + static_cast(builders[col::store_returns::sr_store_credit].get()) + ->Append(dec_to_double(&p->store_credit)); + static_cast(builders[col::store_returns::sr_net_loss].get()) + ->Append(dec_to_double(&p->net_loss)); +} + +// --------------------------------------------------------------------------- +// catalog_returns +// --------------------------------------------------------------------------- + +void append_catalog_returns_to_builders( + const void* row, + tpcds::BuilderMap& builders) +{ + auto* r = static_cast(row); + + static_cast(builders[col::catalog_returns::cr_returned_date_sk].get()) + ->Append(static_cast(r->cr_returned_date_sk)); + static_cast(builders[col::catalog_returns::cr_returned_time_sk].get()) + ->Append(static_cast(r->cr_returned_time_sk)); + static_cast(builders[col::catalog_returns::cr_item_sk].get()) + ->Append(static_cast(r->cr_item_sk)); + static_cast(builders[col::catalog_returns::cr_refunded_customer_sk].get()) + ->Append(static_cast(r->cr_refunded_customer_sk)); + static_cast(builders[col::catalog_returns::cr_refunded_cdemo_sk].get()) + ->Append(static_cast(r->cr_refunded_cdemo_sk)); + static_cast(builders[col::catalog_returns::cr_refunded_hdemo_sk].get()) + ->Append(static_cast(r->cr_refunded_hdemo_sk)); + static_cast(builders[col::catalog_returns::cr_refunded_addr_sk].get()) + ->Append(static_cast(r->cr_refunded_addr_sk)); + 
static_cast(builders[col::catalog_returns::cr_returning_customer_sk].get()) + ->Append(static_cast(r->cr_returning_customer_sk)); + static_cast(builders[col::catalog_returns::cr_returning_cdemo_sk].get()) + ->Append(static_cast(r->cr_returning_cdemo_sk)); + static_cast(builders[col::catalog_returns::cr_returning_hdemo_sk].get()) + ->Append(static_cast(r->cr_returning_hdemo_sk)); + static_cast(builders[col::catalog_returns::cr_returning_addr_sk].get()) + ->Append(static_cast(r->cr_returning_addr_sk)); + static_cast(builders[col::catalog_returns::cr_call_center_sk].get()) + ->Append(static_cast(r->cr_call_center_sk)); + static_cast(builders[col::catalog_returns::cr_catalog_page_sk].get()) + ->Append(static_cast(r->cr_catalog_page_sk)); + static_cast(builders[col::catalog_returns::cr_ship_mode_sk].get()) + ->Append(static_cast(r->cr_ship_mode_sk)); + static_cast(builders[col::catalog_returns::cr_warehouse_sk].get()) + ->Append(static_cast(r->cr_warehouse_sk)); + static_cast(builders[col::catalog_returns::cr_reason_sk].get()) + ->Append(static_cast(r->cr_reason_sk)); + static_cast(builders[col::catalog_returns::cr_order_number].get()) + ->Append(static_cast(r->cr_order_number)); + + const ds_pricing_t* p = &r->cr_pricing; + static_cast(builders[col::catalog_returns::cr_quantity].get()) + ->Append(static_cast(p->quantity)); + static_cast(builders[col::catalog_returns::cr_net_paid].get()) + ->Append(dec_to_double(&p->net_paid)); + static_cast(builders[col::catalog_returns::cr_ext_tax].get()) + ->Append(dec_to_double(&p->ext_tax)); + static_cast(builders[col::catalog_returns::cr_net_paid_inc_tax].get()) + ->Append(dec_to_double(&p->net_paid_inc_tax)); + static_cast(builders[col::catalog_returns::cr_fee].get()) + ->Append(dec_to_double(&p->fee)); + static_cast(builders[col::catalog_returns::cr_ext_ship_cost].get()) + ->Append(dec_to_double(&p->ext_ship_cost)); + static_cast(builders[col::catalog_returns::cr_refunded_cash].get()) + 
->Append(dec_to_double(&p->refunded_cash)); + static_cast(builders[col::catalog_returns::cr_reversed_charge].get()) + ->Append(dec_to_double(&p->reversed_charge)); + static_cast(builders[col::catalog_returns::cr_store_credit].get()) + ->Append(dec_to_double(&p->store_credit)); + static_cast(builders[col::catalog_returns::cr_net_loss].get()) + ->Append(dec_to_double(&p->net_loss)); +} + +// --------------------------------------------------------------------------- +// web_returns +// --------------------------------------------------------------------------- + +void append_web_returns_to_builders( + const void* row, + tpcds::BuilderMap& builders) +{ + auto* r = static_cast(row); + + static_cast(builders[col::web_returns::wr_returned_date_sk].get()) + ->Append(static_cast(r->wr_returned_date_sk)); + static_cast(builders[col::web_returns::wr_returned_time_sk].get()) + ->Append(static_cast(r->wr_returned_time_sk)); + static_cast(builders[col::web_returns::wr_item_sk].get()) + ->Append(static_cast(r->wr_item_sk)); + static_cast(builders[col::web_returns::wr_refunded_customer_sk].get()) + ->Append(static_cast(r->wr_refunded_customer_sk)); + static_cast(builders[col::web_returns::wr_refunded_cdemo_sk].get()) + ->Append(static_cast(r->wr_refunded_cdemo_sk)); + static_cast(builders[col::web_returns::wr_refunded_hdemo_sk].get()) + ->Append(static_cast(r->wr_refunded_hdemo_sk)); + static_cast(builders[col::web_returns::wr_refunded_addr_sk].get()) + ->Append(static_cast(r->wr_refunded_addr_sk)); + static_cast(builders[col::web_returns::wr_returning_customer_sk].get()) + ->Append(static_cast(r->wr_returning_customer_sk)); + static_cast(builders[col::web_returns::wr_returning_cdemo_sk].get()) + ->Append(static_cast(r->wr_returning_cdemo_sk)); + static_cast(builders[col::web_returns::wr_returning_hdemo_sk].get()) + ->Append(static_cast(r->wr_returning_hdemo_sk)); + static_cast(builders[col::web_returns::wr_returning_addr_sk].get()) + 
->Append(static_cast(r->wr_returning_addr_sk)); + static_cast(builders[col::web_returns::wr_web_page_sk].get()) + ->Append(static_cast(r->wr_web_page_sk)); + static_cast(builders[col::web_returns::wr_reason_sk].get()) + ->Append(static_cast(r->wr_reason_sk)); + static_cast(builders[col::web_returns::wr_order_number].get()) + ->Append(static_cast(r->wr_order_number)); + + const ds_pricing_t* p = &r->wr_pricing; + static_cast(builders[col::web_returns::wr_quantity].get()) + ->Append(static_cast(p->quantity)); + static_cast(builders[col::web_returns::wr_net_paid].get()) + ->Append(dec_to_double(&p->net_paid)); + static_cast(builders[col::web_returns::wr_ext_tax].get()) + ->Append(dec_to_double(&p->ext_tax)); + static_cast(builders[col::web_returns::wr_net_paid_inc_tax].get()) + ->Append(dec_to_double(&p->net_paid_inc_tax)); + static_cast(builders[col::web_returns::wr_fee].get()) + ->Append(dec_to_double(&p->fee)); + static_cast(builders[col::web_returns::wr_ext_ship_cost].get()) + ->Append(dec_to_double(&p->ext_ship_cost)); + static_cast(builders[col::web_returns::wr_refunded_cash].get()) + ->Append(dec_to_double(&p->refunded_cash)); + static_cast(builders[col::web_returns::wr_reversed_charge].get()) + ->Append(dec_to_double(&p->reversed_charge)); + static_cast(builders[col::web_returns::wr_store_credit].get()) + ->Append(dec_to_double(&p->store_credit)); + static_cast(builders[col::web_returns::wr_net_loss].get()) + ->Append(dec_to_double(&p->net_loss)); +} + +// --------------------------------------------------------------------------- +// Helper: append ds_addr_t fields with given column-name prefix +// --------------------------------------------------------------------------- +// +// prefix_street_number, prefix_street_name, prefix_street_type, +// prefix_suite_number, prefix_city, prefix_county, prefix_state, +// prefix_zip (as string), prefix_country, prefix_gmt_offset +// +// base = col::TABLE::PREFIX_street_number (caller supplies named constant) +// Fixed 
layout relative to base: +// +0 street_number, +1 street_name, +2 street_type, +3 suite_number, +// +4 city, +5 county, +6 state, +7 zip, +// +8 country, +9 gmt_offset +static void append_addr_fields( + const ds_addr_t& addr, + std::size_t base, + tpcds::BuilderMap& builders) +{ + static_cast(builders[base + 0].get()) + ->Append(addr.street_num); + static_cast(builders[base + 1].get()) + ->Append(addr.street_name1 ? addr.street_name1 : ""); + static_cast(builders[base + 2].get()) + ->Append(encode_ca_street_type(addr.street_type ? addr.street_type : "")); + static_cast(builders[base + 3].get()) + ->Append(addr.suite_num); + static_cast(builders[base + 4].get()) + ->Append(addr.city ? addr.city : ""); + static_cast(builders[base + 5].get()) + ->Append(addr.county ? addr.county : ""); + static_cast(builders[base + 6].get()) + ->Append(encode_state(addr.state ? addr.state : "")); + char zip_buf[12]; + std::snprintf(zip_buf, sizeof(zip_buf), "%05d", addr.zip); + static_cast(builders[base + 7].get()) + ->Append(zip_buf); + static_cast(builders[base + 8].get()) + ->Append(0); // always "United States" + static_cast(builders[base + 9].get()) + ->Append(static_cast(addr.gmt_offset)); +} + +// --------------------------------------------------------------------------- +// call_center +// --------------------------------------------------------------------------- + +void append_call_center_to_builders( + const void* row, + tpcds::BuilderMap& builders) +{ + auto* r = static_cast(row); + + static_cast(builders[col::call_center::cc_call_center_sk].get()) + ->Append(static_cast(r->cc_call_center_sk)); + static_cast(builders[col::call_center::cc_call_center_id].get()) + ->Append(r->cc_call_center_id); + static_cast(builders[col::call_center::cc_rec_start_date_sk].get()) + ->Append(static_cast(r->cc_rec_start_date_id)); + static_cast(builders[col::call_center::cc_rec_end_date_sk].get()) + ->Append(static_cast(r->cc_rec_end_date_id)); + 
static_cast(builders[col::call_center::cc_closed_date_sk].get()) + ->Append(static_cast(r->cc_closed_date_id)); + static_cast(builders[col::call_center::cc_open_date_sk].get()) + ->Append(static_cast(r->cc_open_date_id)); + static_cast(builders[col::call_center::cc_name].get()) + ->Append(encode_cc_name(r->cc_name ? r->cc_name : "")); + static_cast(builders[col::call_center::cc_class].get()) + ->Append(encode_cc_class(r->cc_class ? r->cc_class : "")); + static_cast(builders[col::call_center::cc_employees].get()) + ->Append(static_cast(r->cc_employees)); + static_cast(builders[col::call_center::cc_sq_ft].get()) + ->Append(static_cast(r->cc_sq_ft)); + static_cast(builders[col::call_center::cc_hours].get()) + ->Append(encode_cc_hours(r->cc_hours ? r->cc_hours : "")); + static_cast(builders[col::call_center::cc_manager].get()) + ->Append(r->cc_manager); + static_cast(builders[col::call_center::cc_mkt_id].get()) + ->Append(static_cast(r->cc_market_id)); + static_cast(builders[col::call_center::cc_mkt_class].get()) + ->Append(r->cc_market_class); + static_cast(builders[col::call_center::cc_mkt_desc].get()) + ->Append(r->cc_market_desc); + static_cast(builders[col::call_center::cc_market_manager].get()) + ->Append(r->cc_market_manager); + static_cast(builders[col::call_center::cc_division].get()) + ->Append(static_cast(r->cc_division_id)); + static_cast(builders[col::call_center::cc_division_name].get()) + ->Append(r->cc_division_name); + static_cast(builders[col::call_center::cc_company].get()) + ->Append(static_cast(r->cc_company)); + static_cast(builders[col::call_center::cc_company_name].get()) + ->Append(r->cc_company_name); + append_addr_fields(r->cc_address, col::call_center::cc_street_number, builders); + static_cast(builders[col::call_center::cc_tax_percentage].get()) + ->Append(dec_to_double(&r->cc_tax_percentage)); +} + +// --------------------------------------------------------------------------- +// catalog_page +// 
--------------------------------------------------------------------------- + +void append_catalog_page_to_builders( + const void* row, + tpcds::BuilderMap& builders) +{ + auto* r = static_cast(row); + + static_cast(builders[col::catalog_page::cp_catalog_page_sk].get()) + ->Append(static_cast(r->cp_catalog_page_sk)); + static_cast(builders[col::catalog_page::cp_catalog_page_id].get()) + ->Append(r->cp_catalog_page_id); + static_cast(builders[col::catalog_page::cp_start_date_sk].get()) + ->Append(static_cast(r->cp_start_date_id)); + static_cast(builders[col::catalog_page::cp_end_date_sk].get()) + ->Append(static_cast(r->cp_end_date_id)); + static_cast(builders[col::catalog_page::cp_department].get()) + ->Append(0); // always "DEPARTMENT" + static_cast(builders[col::catalog_page::cp_catalog_number].get()) + ->Append(static_cast(r->cp_catalog_number)); + static_cast(builders[col::catalog_page::cp_catalog_page_number].get()) + ->Append(static_cast(r->cp_catalog_page_number)); + static_cast(builders[col::catalog_page::cp_description].get()) + ->Append(r->cp_description); + static_cast(builders[col::catalog_page::cp_type].get()) + ->Append(encode_cp_type(r->cp_type ? 
r->cp_type : "")); +} + +// --------------------------------------------------------------------------- +// web_page +// --------------------------------------------------------------------------- + +void append_web_page_to_builders( + const void* row, + tpcds::BuilderMap& builders) +{ + auto* r = static_cast(row); + + static_cast(builders[col::web_page::wp_web_page_sk].get()) + ->Append(static_cast(r->wp_page_sk)); + static_cast(builders[col::web_page::wp_web_page_id].get()) + ->Append(r->wp_page_id); + static_cast(builders[col::web_page::wp_rec_start_date_sk].get()) + ->Append(static_cast(r->wp_rec_start_date_id)); + static_cast(builders[col::web_page::wp_rec_end_date_sk].get()) + ->Append(static_cast(r->wp_rec_end_date_id)); + static_cast(builders[col::web_page::wp_creation_date_sk].get()) + ->Append(static_cast(r->wp_creation_date_sk)); + static_cast(builders[col::web_page::wp_access_date_sk].get()) + ->Append(static_cast(r->wp_access_date_sk)); + static_cast(builders[col::web_page::wp_autogen_flag].get()) + ->Append(static_cast(r->wp_autogen_flag)); + static_cast(builders[col::web_page::wp_customer_sk].get()) + ->Append(static_cast(r->wp_customer_sk)); + static_cast(builders[col::web_page::wp_url].get()) + ->Append(r->wp_url); + static_cast(builders[col::web_page::wp_type].get()) + ->Append(encode_wp_type(r->wp_type ? 
r->wp_type : "")); + static_cast(builders[col::web_page::wp_char_count].get()) + ->Append(static_cast(r->wp_char_count)); + static_cast(builders[col::web_page::wp_link_count].get()) + ->Append(static_cast(r->wp_link_count)); + static_cast(builders[col::web_page::wp_image_count].get()) + ->Append(static_cast(r->wp_image_count)); + static_cast(builders[col::web_page::wp_max_ad_count].get()) + ->Append(static_cast(r->wp_max_ad_count)); +} + +// --------------------------------------------------------------------------- +// web_site +// --------------------------------------------------------------------------- + +void append_web_site_to_builders( + const void* row, + tpcds::BuilderMap& builders) +{ + auto* r = static_cast(row); + + static_cast(builders[col::web_site::web_site_sk].get()) + ->Append(static_cast(r->web_site_sk)); + static_cast(builders[col::web_site::web_site_id].get()) + ->Append(r->web_site_id); + static_cast(builders[col::web_site::web_rec_start_date_sk].get()) + ->Append(static_cast(r->web_rec_start_date_id)); + static_cast(builders[col::web_site::web_rec_end_date_sk].get()) + ->Append(static_cast(r->web_rec_end_date_id)); + static_cast(builders[col::web_site::web_name].get()) + ->Append(r->web_name); + static_cast(builders[col::web_site::web_open_date_sk].get()) + ->Append(static_cast(r->web_open_date)); + static_cast(builders[col::web_site::web_close_date_sk].get()) + ->Append(static_cast(r->web_close_date)); + static_cast(builders[col::web_site::web_class].get()) + ->Append(0); // always "Unknown" + static_cast(builders[col::web_site::web_manager].get()) + ->Append(r->web_manager); + static_cast(builders[col::web_site::web_mkt_id].get()) + ->Append(static_cast(r->web_market_id)); + static_cast(builders[col::web_site::web_mkt_class].get()) + ->Append(r->web_market_class); + static_cast(builders[col::web_site::web_mkt_desc].get()) + ->Append(r->web_market_desc); + static_cast(builders[col::web_site::web_market_manager].get()) + 
->Append(r->web_market_manager); + static_cast(builders[col::web_site::web_company_id].get()) + ->Append(static_cast(r->web_company_id)); + static_cast(builders[col::web_site::web_company_name].get()) + ->Append(r->web_company_name); + append_addr_fields(r->web_address, col::web_site::web_street_number, builders); + static_cast(builders[col::web_site::web_tax_percentage].get()) + ->Append(dec_to_double(&r->web_tax_percentage)); +} + +// --------------------------------------------------------------------------- +// warehouse +// --------------------------------------------------------------------------- + +void append_warehouse_to_builders( + const void* row, + tpcds::BuilderMap& builders) +{ + auto* r = static_cast(row); + + static_cast(builders[col::warehouse::w_warehouse_sk].get()) + ->Append(static_cast(r->w_warehouse_sk)); + static_cast(builders[col::warehouse::w_warehouse_id].get()) + ->Append(r->w_warehouse_id); + static_cast(builders[col::warehouse::w_warehouse_name].get()) + ->Append(r->w_warehouse_name); + static_cast(builders[col::warehouse::w_warehouse_sq_ft].get()) + ->Append(static_cast(r->w_warehouse_sq_ft)); + append_addr_fields(r->w_address, col::warehouse::w_street_number, builders); +} + +// --------------------------------------------------------------------------- +// ship_mode +// --------------------------------------------------------------------------- + +void append_ship_mode_to_builders( + const void* row, + tpcds::BuilderMap& builders) +{ + auto* r = static_cast(row); + + static_cast(builders[col::ship_mode::sm_ship_mode_sk].get()) + ->Append(static_cast(r->sm_ship_mode_sk)); + static_cast(builders[col::ship_mode::sm_ship_mode_id].get()) + ->Append(r->sm_ship_mode_id); + static_cast(builders[col::ship_mode::sm_type].get()) + ->Append(encode_sm_type(r->sm_type ? r->sm_type : "")); + static_cast(builders[col::ship_mode::sm_code].get()) + ->Append(encode_sm_code(r->sm_code ? 
r->sm_code : "")); + static_cast(builders[col::ship_mode::sm_carrier].get()) + ->Append(encode_sm_carrier(r->sm_carrier ? r->sm_carrier : "")); + static_cast(builders[col::ship_mode::sm_contract].get()) + ->Append(r->sm_contract); +} + +// --------------------------------------------------------------------------- +// household_demographics +// --------------------------------------------------------------------------- + +void append_household_demographics_to_builders( + const void* row, + tpcds::BuilderMap& builders) +{ + auto* r = static_cast(row); + + static_cast(builders[col::household_demographics::hd_demo_sk].get()) + ->Append(static_cast(r->hd_demo_sk)); + static_cast(builders[col::household_demographics::hd_income_band_sk].get()) + ->Append(static_cast(r->hd_income_band_id)); + static_cast(builders[col::household_demographics::hd_buy_potential].get()) + ->Append(encode_hd_buy_potential(r->hd_buy_potential)); + static_cast(builders[col::household_demographics::hd_dep_count].get()) + ->Append(static_cast(r->hd_dep_count)); + static_cast(builders[col::household_demographics::hd_vehicle_count].get()) + ->Append(static_cast(r->hd_vehicle_count)); +} + +// --------------------------------------------------------------------------- +// customer_demographics +// --------------------------------------------------------------------------- + +void append_customer_demographics_to_builders( + const void* row, + tpcds::BuilderMap& builders) +{ + auto* r = static_cast(row); + + static_cast(builders[col::customer_demographics::cd_demo_sk].get()) + ->Append(static_cast(r->cd_demo_sk)); + static_cast(builders[col::customer_demographics::cd_gender].get()) + ->Append(encode_cd_gender(r->cd_gender ? r->cd_gender : "")); + static_cast(builders[col::customer_demographics::cd_marital_status].get()) + ->Append(encode_cd_marital_status(r->cd_marital_status ? 
r->cd_marital_status : "")); + static_cast(builders[col::customer_demographics::cd_education_status].get()) + ->Append(encode_cd_education_status(r->cd_education_status ? r->cd_education_status : "")); + static_cast(builders[col::customer_demographics::cd_purchase_estimate].get()) + ->Append(static_cast(r->cd_purchase_estimate)); + static_cast(builders[col::customer_demographics::cd_credit_rating].get()) + ->Append(encode_cd_credit_rating(r->cd_credit_rating ? r->cd_credit_rating : "")); + static_cast(builders[col::customer_demographics::cd_dep_count].get()) + ->Append(static_cast(r->cd_dep_count)); + static_cast(builders[col::customer_demographics::cd_dep_employed_count].get()) + ->Append(static_cast(r->cd_dep_employed_count)); + static_cast(builders[col::customer_demographics::cd_dep_college_count].get()) + ->Append(static_cast(r->cd_dep_college_count)); +} + +// --------------------------------------------------------------------------- +// customer_address +// --------------------------------------------------------------------------- + +void append_customer_address_to_builders( + const void* row, + tpcds::BuilderMap& builders) +{ + auto* r = static_cast(row); + + static_cast(builders[col::customer_address::ca_address_sk].get()) + ->Append(static_cast(r->ca_addr_sk)); + static_cast(builders[col::customer_address::ca_address_id].get()) + ->Append(r->ca_addr_id); + append_addr_fields(r->ca_address, col::customer_address::ca_street_number, builders); + static_cast(builders[col::customer_address::ca_location_type].get()) + ->Append(encode_ca_location_type(r->ca_location_type ? 
r->ca_location_type : "")); +} + +// --------------------------------------------------------------------------- +// income_band +// --------------------------------------------------------------------------- + +void append_income_band_to_builders( + const void* row, + tpcds::BuilderMap& builders) +{ + auto* r = static_cast(row); + + static_cast(builders[col::income_band::ib_income_band_id].get()) + ->Append(static_cast(r->ib_income_band_id)); + static_cast(builders[col::income_band::ib_lower_bound].get()) + ->Append(static_cast(r->ib_lower_bound)); + static_cast(builders[col::income_band::ib_upper_bound].get()) + ->Append(static_cast(r->ib_upper_bound)); +} + +// --------------------------------------------------------------------------- +// reason +// --------------------------------------------------------------------------- + +void append_reason_to_builders( + const void* row, + tpcds::BuilderMap& builders) +{ + auto* r = static_cast(row); + + static_cast(builders[col::reason::r_reason_sk].get()) + ->Append(static_cast(r->r_reason_sk)); + static_cast(builders[col::reason::r_reason_id].get()) + ->Append(r->r_reason_id); + static_cast(builders[col::reason::r_reason_desc].get()) + ->Append(r->r_reason_description ? 
r->r_reason_description : ""); +} + +// --------------------------------------------------------------------------- +// time_dim +// --------------------------------------------------------------------------- + +void append_time_dim_to_builders( + const void* row, + tpcds::BuilderMap& builders) +{ + auto* r = static_cast(row); + + static_cast(builders[col::time_dim::t_time_sk].get()) + ->Append(static_cast(r->t_time_sk)); + static_cast(builders[col::time_dim::t_time_id].get()) + ->Append(r->t_time_id); + static_cast(builders[col::time_dim::t_time].get()) + ->Append(static_cast(r->t_time)); + static_cast(builders[col::time_dim::t_hour].get()) + ->Append(static_cast(r->t_hour)); + static_cast(builders[col::time_dim::t_minute].get()) + ->Append(static_cast(r->t_minute)); + static_cast(builders[col::time_dim::t_second].get()) + ->Append(static_cast(r->t_second)); + static_cast(builders[col::time_dim::t_am_pm].get()) + ->Append(encode_t_am_pm(r->t_am_pm ? r->t_am_pm : "")); + static_cast(builders[col::time_dim::t_shift].get()) + ->Append(encode_t_shift(r->t_shift ? r->t_shift : "")); + static_cast(builders[col::time_dim::t_sub_shift].get()) + ->Append(encode_t_sub_shift(r->t_sub_shift ? r->t_sub_shift : "")); + static_cast(builders[col::time_dim::t_meal_time].get()) + ->Append(encode_t_meal_time(r->t_meal_time ? 
r->t_meal_time : "")); +} + +// --------------------------------------------------------------------------- +// promotion +// --------------------------------------------------------------------------- + +void append_promotion_to_builders( + const void* row, + tpcds::BuilderMap& builders) +{ + auto* r = static_cast(row); + + static_cast(builders[col::promotion::p_promo_sk].get()) + ->Append(static_cast(r->p_promo_sk)); + static_cast(builders[col::promotion::p_promo_id].get()) + ->Append(r->p_promo_id); + static_cast(builders[col::promotion::p_start_date_sk].get()) + ->Append(static_cast(r->p_start_date_id)); + static_cast(builders[col::promotion::p_end_date_sk].get()) + ->Append(static_cast(r->p_end_date_id)); + static_cast(builders[col::promotion::p_item_sk].get()) + ->Append(static_cast(r->p_item_sk)); + static_cast(builders[col::promotion::p_cost].get()) + ->Append(dec_to_double(&r->p_cost)); + static_cast(builders[col::promotion::p_response_target].get()) + ->Append(static_cast(r->p_response_target)); + static_cast(builders[col::promotion::p_promo_name].get()) + ->Append(r->p_promo_name); + static_cast(builders[col::promotion::p_channel_dmail].get()) + ->Append(static_cast(r->p_channel_dmail)); + static_cast(builders[col::promotion::p_channel_email].get()) + ->Append(static_cast(r->p_channel_email)); + static_cast(builders[col::promotion::p_channel_catalog].get()) + ->Append(static_cast(r->p_channel_catalog)); + static_cast(builders[col::promotion::p_channel_tv].get()) + ->Append(static_cast(r->p_channel_tv)); + static_cast(builders[col::promotion::p_channel_radio].get()) + ->Append(static_cast(r->p_channel_radio)); + static_cast(builders[col::promotion::p_channel_press].get()) + ->Append(static_cast(r->p_channel_press)); + static_cast(builders[col::promotion::p_channel_event].get()) + ->Append(static_cast(r->p_channel_event)); + static_cast(builders[col::promotion::p_channel_demo].get()) + ->Append(static_cast(r->p_channel_demo)); + 
static_cast(builders[col::promotion::p_channel_details].get()) + ->Append(r->p_channel_details); + static_cast(builders[col::promotion::p_purpose].get()) + ->Append(0); // always "Unknown" + static_cast(builders[col::promotion::p_discount_active].get()) + ->Append(static_cast(r->p_discount_active)); +} + +// --------------------------------------------------------------------------- +// store +// --------------------------------------------------------------------------- + +void append_store_to_builders( + const void* row, + tpcds::BuilderMap& builders) +{ + auto* r = static_cast(row); + + static_cast(builders[col::store::s_store_sk].get()) + ->Append(static_cast(r->store_sk)); + static_cast(builders[col::store::s_store_id].get()) + ->Append(r->store_id); + static_cast(builders[col::store::s_rec_start_date].get()) + ->Append(static_cast(r->rec_start_date_id)); + static_cast(builders[col::store::s_rec_end_date].get()) + ->Append(static_cast(r->rec_end_date_id)); + static_cast(builders[col::store::s_closed_date_sk].get()) + ->Append(static_cast(r->closed_date_id)); + static_cast(builders[col::store::s_store_name].get()) + ->Append(r->store_name); + static_cast(builders[col::store::s_number_employees].get()) + ->Append(static_cast(r->employees)); + static_cast(builders[col::store::s_floor_space].get()) + ->Append(static_cast(r->floor_space)); + static_cast(builders[col::store::s_hours].get()) + ->Append(encode_cc_hours(r->hours ? 
r->hours : "")); + static_cast(builders[col::store::s_manager].get()) + ->Append(r->store_manager); + static_cast(builders[col::store::s_market_id].get()) + ->Append(static_cast(r->market_id)); + static_cast(builders[col::store::s_geography_class].get()) + ->Append(0); // always "Unknown" + static_cast(builders[col::store::s_market_desc].get()) + ->Append(r->market_desc); + static_cast(builders[col::store::s_market_manager].get()) + ->Append(r->market_manager); + static_cast(builders[col::store::s_division_id].get()) + ->Append(static_cast(r->division_id)); + static_cast(builders[col::store::s_division_name].get()) + ->Append(0); // always "Unknown" + static_cast(builders[col::store::s_company_id].get()) + ->Append(static_cast(r->company_id)); + static_cast(builders[col::store::s_company_name].get()) + ->Append(0); // always "Unknown" + append_addr_fields(r->address, col::store::s_street_number, builders); + static_cast(builders[col::store::s_tax_percentage].get()) + ->Append(dec_to_double(&r->dTaxPercentage)); +} + +// --------------------------------------------------------------------------- +// Generic dispatcher +// --------------------------------------------------------------------------- + +void append_dsdgen_row_to_builders( + const std::string& tbl_name, + const void* row, + tpcds::BuilderMap& builders) +{ + if (tbl_name == "store_sales") { + append_store_sales_to_builders(row, builders); + } else if (tbl_name == "inventory") { + append_inventory_to_builders(row, builders); + } else if (tbl_name == "catalog_sales") { + append_catalog_sales_to_builders(row, builders); + } else if (tbl_name == "web_sales") { + append_web_sales_to_builders(row, builders); + } else if (tbl_name == "customer") { + append_customer_to_builders(row, builders); + } else if (tbl_name == "item") { + append_item_to_builders(row, builders); + } else if (tbl_name == "date_dim") { + append_date_dim_to_builders(row, builders); + } else if (tbl_name == "store_returns") { + 
append_store_returns_to_builders(row, builders); + } else if (tbl_name == "catalog_returns") { + append_catalog_returns_to_builders(row, builders); + } else if (tbl_name == "web_returns") { + append_web_returns_to_builders(row, builders); + } else if (tbl_name == "call_center") { + append_call_center_to_builders(row, builders); + } else if (tbl_name == "catalog_page") { + append_catalog_page_to_builders(row, builders); + } else if (tbl_name == "web_page") { + append_web_page_to_builders(row, builders); + } else if (tbl_name == "web_site") { + append_web_site_to_builders(row, builders); + } else if (tbl_name == "warehouse") { + append_warehouse_to_builders(row, builders); + } else if (tbl_name == "ship_mode") { + append_ship_mode_to_builders(row, builders); + } else if (tbl_name == "household_demographics") { + append_household_demographics_to_builders(row, builders); + } else if (tbl_name == "customer_demographics") { + append_customer_demographics_to_builders(row, builders); + } else if (tbl_name == "customer_address") { + append_customer_address_to_builders(row, builders); + } else if (tbl_name == "income_band") { + append_income_band_to_builders(row, builders); + } else if (tbl_name == "reason") { + append_reason_to_builders(row, builders); + } else if (tbl_name == "time_dim") { + append_time_dim_to_builders(row, builders); + } else if (tbl_name == "promotion") { + append_promotion_to_builders(row, builders); + } else if (tbl_name == "store") { + append_store_to_builders(row, builders); + } else { + throw std::invalid_argument("append_dsdgen_row_to_builders: unknown table: " + tbl_name); + } +} + +} // namespace tpcds diff --git a/src/dsdgen/dsdgen_wrapper.cpp b/src/dsdgen/dsdgen_wrapper.cpp new file mode 100644 index 0000000..9024e90 --- /dev/null +++ b/src/dsdgen/dsdgen_wrapper.cpp @@ -0,0 +1,1249 @@ +/** + * dsdgen_wrapper.cpp — C++ wrapper around TPC-DS dsdgen + * + * Initialises dsdgen global state using the embedded tpcds.idx binary + * (compiled into 
dsts_generated.c) and provides per-table generation methods. + */ + +#include "tpch/dsdgen_wrapper.hpp" + +#include +#include +#include +#include +#include +#include +#include + +// dsdgen C types and functions — single wrapper header +extern "C" { +#include "tpcds_dsdgen.h" +} + +// Embedded distribution data (compiled from tpcds.idx by cmake/gen_dsts.py) +extern "C" { + extern const uint8_t tpcds_idx_data[]; + extern const size_t tpcds_idx_size; +} + +namespace tpcds { + +// --------------------------------------------------------------------------- +// Static helpers +// --------------------------------------------------------------------------- + +static_assert(static_cast(TableType::CallCenter) == TPCDS_CALL_CENTER); +static_assert(static_cast(TableType::CatalogPage) == TPCDS_CATALOG_PAGE); +static_assert(static_cast(TableType::CatalogReturns) == TPCDS_CATALOG_RETURNS); +static_assert(static_cast(TableType::CatalogSales) == TPCDS_CATALOG_SALES); +static_assert(static_cast(TableType::Customer) == TPCDS_CUSTOMER); +static_assert(static_cast(TableType::CustomerAddress) == TPCDS_CUSTOMER_ADDRESS); +static_assert(static_cast(TableType::CustomerDemographics) == TPCDS_CUSTOMER_DEMOGRAPHICS); +static_assert(static_cast(TableType::DateDim) == TPCDS_DATE); +static_assert(static_cast(TableType::HouseholdDemographics) == TPCDS_HOUSEHOLD_DEMOGRAPHICS); +static_assert(static_cast(TableType::IncomeBand) == TPCDS_INCOME_BAND); +static_assert(static_cast(TableType::Inventory) == TPCDS_INVENTORY); +static_assert(static_cast(TableType::Item) == TPCDS_ITEM); +static_assert(static_cast(TableType::Promotion) == TPCDS_PROMOTION); +static_assert(static_cast(TableType::Reason) == TPCDS_REASON); +static_assert(static_cast(TableType::ShipMode) == TPCDS_SHIP_MODE); +static_assert(static_cast(TableType::Store) == TPCDS_STORE); +static_assert(static_cast(TableType::StoreReturns) == TPCDS_STORE_RETURNS); +static_assert(static_cast(TableType::StoreSales) == TPCDS_STORE_SALES); 
+static_assert(static_cast(TableType::TimeDim) == TPCDS_TIME); +static_assert(static_cast(TableType::Warehouse) == TPCDS_WAREHOUSE); +static_assert(static_cast(TableType::WebPage) == TPCDS_WEB_PAGE); +static_assert(static_cast(TableType::WebReturns) == TPCDS_WEB_RETURNS); +static_assert(static_cast(TableType::WebSales) == TPCDS_WEB_SALES); +static_assert(static_cast(TableType::WebSite) == TPCDS_WEB_SITE); + +int DSDGenWrapper::table_id(TableType t) { + return static_cast(t); +} + +std::string DSDGenWrapper::table_name(TableType t) { + switch (t) { + case TableType::CallCenter: return "call_center"; + case TableType::CatalogPage: return "catalog_page"; + case TableType::CatalogReturns: return "catalog_returns"; + case TableType::CatalogSales: return "catalog_sales"; + case TableType::Customer: return "customer"; + case TableType::CustomerAddress: return "customer_address"; + case TableType::CustomerDemographics: return "customer_demographics"; + case TableType::DateDim: return "date_dim"; + case TableType::HouseholdDemographics: return "household_demographics"; + case TableType::IncomeBand: return "income_band"; + case TableType::Inventory: return "inventory"; + case TableType::Item: return "item"; + case TableType::Promotion: return "promotion"; + case TableType::Reason: return "reason"; + case TableType::ShipMode: return "ship_mode"; + case TableType::Store: return "store"; + case TableType::StoreReturns: return "store_returns"; + case TableType::StoreSales: return "store_sales"; + case TableType::TimeDim: return "time_dim"; + case TableType::Warehouse: return "warehouse"; + case TableType::WebPage: return "web_page"; + case TableType::WebReturns: return "web_returns"; + case TableType::WebSales: return "web_sales"; + case TableType::WebSite: return "web_site"; + default: return "unknown"; + } +} + +// --------------------------------------------------------------------------- +// Arrow schemas +// 
--------------------------------------------------------------------------- + +// Helper: create an Arrow field with a pre-computed cardinality hint for Lance. +// Mirrors tpch_field() in dbgen_wrapper.cpp. Only use for utf8 fields with known +// bounded cardinality derived from TPC-DS spec or empirical measurements. +static std::shared_ptr tpcds_field( + const std::string& name, + std::shared_ptr type, + int64_t known_cardinality = -1) +{ + if (known_cardinality > 0) { + auto meta = arrow::key_value_metadata( + std::vector{"lance.cardinality"}, + std::vector{std::to_string(known_cardinality)}); + return arrow::field(name, type, /*nullable=*/true, meta); + } + return arrow::field(name, type); +} + +std::shared_ptr DSDGenWrapper::get_schema(TableType t, double scale_factor) { + auto dict8 = arrow::dictionary(arrow::int8(), arrow::utf8()); + + // TPC-DS row counts per TPC-DS v3 spec. + int64_t sf = static_cast(std::ceil(scale_factor)); + int64_t sf_sqrt = static_cast(std::ceil(std::sqrt(scale_factor))); + // Fixed-cardinality dimension tables + constexpr int64_t DATE_DIM_ROWS = 73'049; + constexpr int64_t TIME_DIM_ROWS = 86'400; + constexpr int64_t SHIP_MODE_ROWS = 20; + constexpr int64_t REASON_ROWS = 55; + // SF-scaled dimension tables + int64_t customer = 100'000LL * sf; + int64_t cust_addr = 50'000LL * sf; + int64_t item = 18'000LL * sf_sqrt; + int64_t store = 12LL * sf; + int64_t call_center = 6LL * sf; + int64_t catalog_page = 11'718LL * sf_sqrt; + int64_t web_page = 60LL * sf_sqrt; + int64_t web_site = 30LL * sf_sqrt; + int64_t warehouse = 5LL * sf; + int64_t promotion = 300LL * sf_sqrt; + switch (t) { + case TableType::StoreSales: + return arrow::schema({ + arrow::field("ss_sold_date_sk", arrow::int64()), + arrow::field("ss_sold_time_sk", arrow::int64()), + arrow::field("ss_item_sk", arrow::int64()), + arrow::field("ss_customer_sk", arrow::int64()), + arrow::field("ss_cdemo_sk", arrow::int64()), + arrow::field("ss_hdemo_sk", arrow::int64()), + 
arrow::field("ss_addr_sk", arrow::int64()), + arrow::field("ss_store_sk", arrow::int64()), + arrow::field("ss_promo_sk", arrow::int64()), + arrow::field("ss_ticket_number", arrow::int64()), + arrow::field("ss_quantity", arrow::int32()), + arrow::field("ss_wholesale_cost", arrow::float64()), + arrow::field("ss_list_price", arrow::float64()), + arrow::field("ss_sales_price", arrow::float64()), + arrow::field("ss_ext_discount_amt", arrow::float64()), + arrow::field("ss_ext_sales_price", arrow::float64()), + arrow::field("ss_ext_wholesale_cost", arrow::float64()), + arrow::field("ss_ext_list_price", arrow::float64()), + arrow::field("ss_ext_tax", arrow::float64()), + arrow::field("ss_coupon_amt", arrow::float64()), + arrow::field("ss_net_paid", arrow::float64()), + arrow::field("ss_net_paid_inc_tax", arrow::float64()), + arrow::field("ss_net_profit", arrow::float64()), + }); + + case TableType::Inventory: + return arrow::schema({ + arrow::field("inv_date_sk", arrow::int64()), + arrow::field("inv_item_sk", arrow::int64()), + arrow::field("inv_warehouse_sk", arrow::int64()), + arrow::field("inv_quantity_on_hand", arrow::int32()), + }); + + case TableType::CatalogSales: + return arrow::schema({ + arrow::field("cs_sold_date_sk", arrow::int64()), + arrow::field("cs_sold_time_sk", arrow::int64()), + arrow::field("cs_ship_date_sk", arrow::int64()), + arrow::field("cs_bill_customer_sk", arrow::int64()), + arrow::field("cs_bill_cdemo_sk", arrow::int64()), + arrow::field("cs_bill_hdemo_sk", arrow::int64()), + arrow::field("cs_bill_addr_sk", arrow::int64()), + arrow::field("cs_ship_customer_sk", arrow::int64()), + arrow::field("cs_ship_cdemo_sk", arrow::int64()), + arrow::field("cs_ship_hdemo_sk", arrow::int64()), + arrow::field("cs_ship_addr_sk", arrow::int64()), + arrow::field("cs_call_center_sk", arrow::int64()), + arrow::field("cs_catalog_page_sk", arrow::int64()), + arrow::field("cs_ship_mode_sk", arrow::int64()), + arrow::field("cs_warehouse_sk", arrow::int64()), + 
arrow::field("cs_item_sk", arrow::int64()), + arrow::field("cs_promo_sk", arrow::int64()), + arrow::field("cs_order_number", arrow::int64()), + arrow::field("cs_quantity", arrow::int32()), + arrow::field("cs_wholesale_cost", arrow::float64()), + arrow::field("cs_list_price", arrow::float64()), + arrow::field("cs_sales_price", arrow::float64()), + arrow::field("cs_ext_discount_amt", arrow::float64()), + arrow::field("cs_ext_sales_price", arrow::float64()), + arrow::field("cs_ext_wholesale_cost", arrow::float64()), + arrow::field("cs_ext_list_price", arrow::float64()), + arrow::field("cs_ext_tax", arrow::float64()), + arrow::field("cs_coupon_amt", arrow::float64()), + arrow::field("cs_ext_ship_cost", arrow::float64()), + arrow::field("cs_net_paid", arrow::float64()), + arrow::field("cs_net_paid_inc_tax", arrow::float64()), + arrow::field("cs_net_paid_inc_ship", arrow::float64()), + arrow::field("cs_net_paid_inc_ship_tax", arrow::float64()), + arrow::field("cs_net_profit", arrow::float64()), + }); + + case TableType::WebSales: + return arrow::schema({ + arrow::field("ws_sold_date_sk", arrow::int64()), + arrow::field("ws_sold_time_sk", arrow::int64()), + arrow::field("ws_ship_date_sk", arrow::int64()), + arrow::field("ws_item_sk", arrow::int64()), + arrow::field("ws_bill_customer_sk", arrow::int64()), + arrow::field("ws_bill_cdemo_sk", arrow::int64()), + arrow::field("ws_bill_hdemo_sk", arrow::int64()), + arrow::field("ws_bill_addr_sk", arrow::int64()), + arrow::field("ws_ship_customer_sk", arrow::int64()), + arrow::field("ws_ship_cdemo_sk", arrow::int64()), + arrow::field("ws_ship_hdemo_sk", arrow::int64()), + arrow::field("ws_ship_addr_sk", arrow::int64()), + arrow::field("ws_web_page_sk", arrow::int64()), + arrow::field("ws_web_site_sk", arrow::int64()), + arrow::field("ws_ship_mode_sk", arrow::int64()), + arrow::field("ws_warehouse_sk", arrow::int64()), + arrow::field("ws_promo_sk", arrow::int64()), + arrow::field("ws_order_number", arrow::int64()), + 
arrow::field("ws_quantity", arrow::int32()), + arrow::field("ws_wholesale_cost", arrow::float64()), + arrow::field("ws_list_price", arrow::float64()), + arrow::field("ws_sales_price", arrow::float64()), + arrow::field("ws_ext_discount_amt", arrow::float64()), + arrow::field("ws_ext_sales_price", arrow::float64()), + arrow::field("ws_ext_wholesale_cost", arrow::float64()), + arrow::field("ws_ext_list_price", arrow::float64()), + arrow::field("ws_ext_tax", arrow::float64()), + arrow::field("ws_coupon_amt", arrow::float64()), + arrow::field("ws_ext_ship_cost", arrow::float64()), + arrow::field("ws_net_paid", arrow::float64()), + arrow::field("ws_net_paid_inc_tax", arrow::float64()), + arrow::field("ws_net_paid_inc_ship", arrow::float64()), + arrow::field("ws_net_paid_inc_ship_tax", arrow::float64()), + arrow::field("ws_net_profit", arrow::float64()), + }); + + case TableType::Customer: + return arrow::schema({ + arrow::field("c_customer_sk", arrow::int64()), + tpcds_field("c_customer_id", arrow::utf8(), customer), + arrow::field("c_current_cdemo_sk", arrow::int64()), + arrow::field("c_current_hdemo_sk", arrow::int64()), + arrow::field("c_current_addr_sk", arrow::int64()), + arrow::field("c_first_shipto_date_id", arrow::int32()), + arrow::field("c_first_sales_date_id", arrow::int32()), + arrow::field("c_salutation", dict8), + tpcds_field("c_first_name", arrow::utf8(), 2000), + tpcds_field("c_last_name", arrow::utf8(), 5000), + arrow::field("c_preferred_cust_flag", arrow::int32()), + arrow::field("c_birth_day", arrow::int32()), + arrow::field("c_birth_month", arrow::int32()), + arrow::field("c_birth_year", arrow::int32()), + tpcds_field("c_birth_country", arrow::utf8(), 200), + tpcds_field("c_login", arrow::utf8(), customer), + tpcds_field("c_email_address", arrow::utf8(), customer), + arrow::field("c_last_review_date", arrow::int32()), + }); + + case TableType::Item: + return arrow::schema({ + arrow::field("i_item_sk", arrow::int64()), + tpcds_field("i_item_id", 
arrow::utf8(), item), + arrow::field("i_rec_start_date_id", arrow::int64()), + arrow::field("i_rec_end_date_id", arrow::int64()), + tpcds_field("i_item_desc", arrow::utf8(), item), + arrow::field("i_current_price", arrow::float64()), + arrow::field("i_wholesale_cost", arrow::float64()), + arrow::field("i_brand_id", arrow::int64()), + tpcds_field("i_brand", arrow::utf8(), 1000), + arrow::field("i_class_id", arrow::int64()), + tpcds_field("i_class", arrow::utf8(), 100), + arrow::field("i_category_id", arrow::int64()), + arrow::field("i_category", dict8), + arrow::field("i_manufact_id", arrow::int64()), + tpcds_field("i_manufact", arrow::utf8(), 1000), + arrow::field("i_size", dict8), + tpcds_field("i_formulation", arrow::utf8(), item), + arrow::field("i_color", dict8), + arrow::field("i_units", dict8), + arrow::field("i_container", dict8), + arrow::field("i_manager_id", arrow::int64()), + tpcds_field("i_product_name", arrow::utf8(), item), + arrow::field("i_promo_sk", arrow::int64()), + }); + + case TableType::DateDim: + return arrow::schema({ + arrow::field("d_date_sk", arrow::int64()), + tpcds_field("d_date_id", arrow::utf8(), DATE_DIM_ROWS), + arrow::field("d_month_seq", arrow::int32()), + arrow::field("d_week_seq", arrow::int32()), + arrow::field("d_quarter_seq", arrow::int32()), + arrow::field("d_year", arrow::int32()), + arrow::field("d_dow", arrow::int32()), + arrow::field("d_moy", arrow::int32()), + arrow::field("d_dom", arrow::int32()), + arrow::field("d_qoy", arrow::int32()), + arrow::field("d_fy_year", arrow::int32()), + arrow::field("d_fy_quarter_seq", arrow::int32()), + arrow::field("d_fy_week_seq", arrow::int32()), + arrow::field("d_day_name", dict8), + arrow::field("d_holiday", arrow::int32()), + arrow::field("d_weekend", arrow::int32()), + arrow::field("d_following_holiday", arrow::int32()), + arrow::field("d_first_dom", arrow::int32()), + arrow::field("d_last_dom", arrow::int32()), + arrow::field("d_same_day_ly", arrow::int32()), + 
arrow::field("d_same_day_lq", arrow::int32()), + arrow::field("d_current_day", arrow::int32()), + arrow::field("d_current_week", arrow::int32()), + arrow::field("d_current_month", arrow::int32()), + arrow::field("d_current_quarter", arrow::int32()), + arrow::field("d_current_year", arrow::int32()), + }); + + case TableType::StoreReturns: + return arrow::schema({ + arrow::field("sr_returned_date_sk", arrow::int64()), + arrow::field("sr_returned_time_sk", arrow::int64()), + arrow::field("sr_item_sk", arrow::int64()), + arrow::field("sr_customer_sk", arrow::int64()), + arrow::field("sr_cdemo_sk", arrow::int64()), + arrow::field("sr_hdemo_sk", arrow::int64()), + arrow::field("sr_addr_sk", arrow::int64()), + arrow::field("sr_store_sk", arrow::int64()), + arrow::field("sr_reason_sk", arrow::int64()), + arrow::field("sr_ticket_number", arrow::int64()), + arrow::field("sr_quantity", arrow::int32()), + arrow::field("sr_net_paid", arrow::float64()), + arrow::field("sr_ext_tax", arrow::float64()), + arrow::field("sr_net_paid_inc_tax", arrow::float64()), + arrow::field("sr_fee", arrow::float64()), + arrow::field("sr_ext_ship_cost", arrow::float64()), + arrow::field("sr_refunded_cash", arrow::float64()), + arrow::field("sr_reversed_charge", arrow::float64()), + arrow::field("sr_store_credit", arrow::float64()), + arrow::field("sr_net_loss", arrow::float64()), + }); + + case TableType::CatalogReturns: + return arrow::schema({ + arrow::field("cr_returned_date_sk", arrow::int64()), + arrow::field("cr_returned_time_sk", arrow::int64()), + arrow::field("cr_item_sk", arrow::int64()), + arrow::field("cr_refunded_customer_sk", arrow::int64()), + arrow::field("cr_refunded_cdemo_sk", arrow::int64()), + arrow::field("cr_refunded_hdemo_sk", arrow::int64()), + arrow::field("cr_refunded_addr_sk", arrow::int64()), + arrow::field("cr_returning_customer_sk", arrow::int64()), + arrow::field("cr_returning_cdemo_sk", arrow::int64()), + arrow::field("cr_returning_hdemo_sk", arrow::int64()), + 
arrow::field("cr_returning_addr_sk", arrow::int64()), + arrow::field("cr_call_center_sk", arrow::int64()), + arrow::field("cr_catalog_page_sk", arrow::int64()), + arrow::field("cr_ship_mode_sk", arrow::int64()), + arrow::field("cr_warehouse_sk", arrow::int64()), + arrow::field("cr_reason_sk", arrow::int64()), + arrow::field("cr_order_number", arrow::int64()), + arrow::field("cr_quantity", arrow::int32()), + arrow::field("cr_net_paid", arrow::float64()), + arrow::field("cr_ext_tax", arrow::float64()), + arrow::field("cr_net_paid_inc_tax", arrow::float64()), + arrow::field("cr_fee", arrow::float64()), + arrow::field("cr_ext_ship_cost", arrow::float64()), + arrow::field("cr_refunded_cash", arrow::float64()), + arrow::field("cr_reversed_charge", arrow::float64()), + arrow::field("cr_store_credit", arrow::float64()), + arrow::field("cr_net_loss", arrow::float64()), + }); + + case TableType::WebReturns: + return arrow::schema({ + arrow::field("wr_returned_date_sk", arrow::int64()), + arrow::field("wr_returned_time_sk", arrow::int64()), + arrow::field("wr_item_sk", arrow::int64()), + arrow::field("wr_refunded_customer_sk", arrow::int64()), + arrow::field("wr_refunded_cdemo_sk", arrow::int64()), + arrow::field("wr_refunded_hdemo_sk", arrow::int64()), + arrow::field("wr_refunded_addr_sk", arrow::int64()), + arrow::field("wr_returning_customer_sk", arrow::int64()), + arrow::field("wr_returning_cdemo_sk", arrow::int64()), + arrow::field("wr_returning_hdemo_sk", arrow::int64()), + arrow::field("wr_returning_addr_sk", arrow::int64()), + arrow::field("wr_web_page_sk", arrow::int64()), + arrow::field("wr_reason_sk", arrow::int64()), + arrow::field("wr_order_number", arrow::int64()), + arrow::field("wr_quantity", arrow::int32()), + arrow::field("wr_net_paid", arrow::float64()), + arrow::field("wr_ext_tax", arrow::float64()), + arrow::field("wr_net_paid_inc_tax", arrow::float64()), + arrow::field("wr_fee", arrow::float64()), + arrow::field("wr_ext_ship_cost", arrow::float64()), + 
arrow::field("wr_refunded_cash", arrow::float64()), + arrow::field("wr_reversed_charge", arrow::float64()), + arrow::field("wr_store_credit", arrow::float64()), + arrow::field("wr_net_loss", arrow::float64()), + }); + + case TableType::CallCenter: + return arrow::schema({ + arrow::field("cc_call_center_sk", arrow::int64()), + tpcds_field("cc_call_center_id", arrow::utf8(), call_center), + arrow::field("cc_rec_start_date_sk", arrow::int64()), + arrow::field("cc_rec_end_date_sk", arrow::int64()), + arrow::field("cc_closed_date_sk", arrow::int64()), + arrow::field("cc_open_date_sk", arrow::int64()), + arrow::field("cc_name", dict8), + arrow::field("cc_class", dict8), + arrow::field("cc_employees", arrow::int32()), + arrow::field("cc_sq_ft", arrow::int32()), + arrow::field("cc_hours", dict8), + tpcds_field("cc_manager", arrow::utf8(), call_center), + arrow::field("cc_mkt_id", arrow::int32()), + tpcds_field("cc_mkt_class", arrow::utf8(), call_center), + tpcds_field("cc_mkt_desc", arrow::utf8(), call_center), + tpcds_field("cc_market_manager", arrow::utf8(), call_center), + arrow::field("cc_division", arrow::int32()), + tpcds_field("cc_division_name", arrow::utf8(), call_center), + arrow::field("cc_company", arrow::int32()), + tpcds_field("cc_company_name", arrow::utf8(), call_center), + arrow::field("cc_street_number", arrow::int32()), + tpcds_field("cc_street_name", arrow::utf8(), call_center), + arrow::field("cc_street_type", dict8), + tpcds_field("cc_suite_number", arrow::utf8(), call_center), + tpcds_field("cc_city", arrow::utf8(), call_center), + tpcds_field("cc_county", arrow::utf8(), call_center), + arrow::field("cc_state", dict8), + tpcds_field("cc_zip", arrow::utf8(), call_center), + arrow::field("cc_country", dict8), + arrow::field("cc_gmt_offset", arrow::float64()), + arrow::field("cc_tax_percentage", arrow::float64()), + }); + + case TableType::CatalogPage: + return arrow::schema({ + arrow::field("cp_catalog_page_sk", arrow::int64()), + 
tpcds_field("cp_catalog_page_id", arrow::utf8(), catalog_page), + arrow::field("cp_start_date_sk", arrow::int64()), + arrow::field("cp_end_date_sk", arrow::int64()), + arrow::field("cp_department", dict8), + arrow::field("cp_catalog_number", arrow::int32()), + arrow::field("cp_catalog_page_number", arrow::int32()), + tpcds_field("cp_description", arrow::utf8(), catalog_page), + arrow::field("cp_type", dict8), + }); + + case TableType::WebPage: + return arrow::schema({ + arrow::field("wp_web_page_sk", arrow::int64()), + tpcds_field("wp_web_page_id", arrow::utf8(), web_page), + arrow::field("wp_rec_start_date_sk", arrow::int64()), + arrow::field("wp_rec_end_date_sk", arrow::int64()), + arrow::field("wp_creation_date_sk", arrow::int64()), + arrow::field("wp_access_date_sk", arrow::int64()), + arrow::field("wp_autogen_flag", arrow::int32()), + arrow::field("wp_customer_sk", arrow::int64()), + tpcds_field("wp_url", arrow::utf8(), web_page), + arrow::field("wp_type", dict8), + arrow::field("wp_char_count", arrow::int32()), + arrow::field("wp_link_count", arrow::int32()), + arrow::field("wp_image_count", arrow::int32()), + arrow::field("wp_max_ad_count", arrow::int32()), + }); + + case TableType::WebSite: + return arrow::schema({ + arrow::field("web_site_sk", arrow::int64()), + tpcds_field("web_site_id", arrow::utf8(), web_site), + arrow::field("web_rec_start_date_sk", arrow::int64()), + arrow::field("web_rec_end_date_sk", arrow::int64()), + tpcds_field("web_name", arrow::utf8(), web_site), + arrow::field("web_open_date_sk", arrow::int64()), + arrow::field("web_close_date_sk", arrow::int64()), + arrow::field("web_class", dict8), + tpcds_field("web_manager", arrow::utf8(), web_site), + arrow::field("web_mkt_id", arrow::int32()), + tpcds_field("web_mkt_class", arrow::utf8(), web_site), + tpcds_field("web_mkt_desc", arrow::utf8(), web_site), + tpcds_field("web_market_manager", arrow::utf8(), web_site), + arrow::field("web_company_id", arrow::int32()), + 
tpcds_field("web_company_name", arrow::utf8(), web_site), + arrow::field("web_street_number", arrow::int32()), + tpcds_field("web_street_name", arrow::utf8(), web_site), + arrow::field("web_street_type", dict8), + tpcds_field("web_suite_number", arrow::utf8(), web_site), + tpcds_field("web_city", arrow::utf8(), web_site), + tpcds_field("web_county", arrow::utf8(), web_site), + arrow::field("web_state", dict8), + tpcds_field("web_zip", arrow::utf8(), web_site), + arrow::field("web_country", dict8), + arrow::field("web_gmt_offset", arrow::float64()), + arrow::field("web_tax_percentage", arrow::float64()), + }); + + case TableType::Warehouse: + return arrow::schema({ + arrow::field("w_warehouse_sk", arrow::int64()), + tpcds_field("w_warehouse_id", arrow::utf8(), warehouse), + tpcds_field("w_warehouse_name", arrow::utf8(), warehouse), + arrow::field("w_warehouse_sq_ft", arrow::int32()), + arrow::field("w_street_number", arrow::int32()), + tpcds_field("w_street_name", arrow::utf8(), warehouse), + arrow::field("w_street_type", dict8), + tpcds_field("w_suite_number", arrow::utf8(), warehouse), + tpcds_field("w_city", arrow::utf8(), warehouse), + tpcds_field("w_county", arrow::utf8(), warehouse), + arrow::field("w_state", dict8), + tpcds_field("w_zip", arrow::utf8(), warehouse), + arrow::field("w_country", dict8), + arrow::field("w_gmt_offset", arrow::float64()), + }); + + case TableType::ShipMode: + return arrow::schema({ + arrow::field("sm_ship_mode_sk", arrow::int64()), + tpcds_field("sm_ship_mode_id", arrow::utf8(), SHIP_MODE_ROWS), + arrow::field("sm_type", dict8), + arrow::field("sm_code", dict8), + arrow::field("sm_carrier", dict8), + tpcds_field("sm_contract", arrow::utf8(), SHIP_MODE_ROWS), + }); + + case TableType::HouseholdDemographics: + return arrow::schema({ + arrow::field("hd_demo_sk", arrow::int64()), + arrow::field("hd_income_band_sk", arrow::int64()), + arrow::field("hd_buy_potential", dict8), + arrow::field("hd_dep_count", arrow::int32()), + 
arrow::field("hd_vehicle_count", arrow::int32()), + }); + + case TableType::CustomerDemographics: + return arrow::schema({ + arrow::field("cd_demo_sk", arrow::int64()), + arrow::field("cd_gender", dict8), + arrow::field("cd_marital_status", dict8), + arrow::field("cd_education_status", dict8), + arrow::field("cd_purchase_estimate", arrow::int32()), + arrow::field("cd_credit_rating", dict8), + arrow::field("cd_dep_count", arrow::int32()), + arrow::field("cd_dep_employed_count", arrow::int32()), + arrow::field("cd_dep_college_count", arrow::int32()), + }); + + case TableType::CustomerAddress: + return arrow::schema({ + arrow::field("ca_address_sk", arrow::int64()), + tpcds_field("ca_address_id", arrow::utf8(), cust_addr), + arrow::field("ca_street_number", arrow::int32()), + tpcds_field("ca_street_name", arrow::utf8(), 20000), + arrow::field("ca_street_type", dict8), + tpcds_field("ca_suite_number", arrow::utf8(), cust_addr), + tpcds_field("ca_city", arrow::utf8(), 1000), + tpcds_field("ca_county", arrow::utf8(), 1800), + arrow::field("ca_state", dict8), + tpcds_field("ca_zip", arrow::utf8(), 10000), + arrow::field("ca_country", dict8), + arrow::field("ca_gmt_offset", arrow::float64()), + arrow::field("ca_location_type", dict8), + }); + + case TableType::IncomeBand: + return arrow::schema({ + arrow::field("ib_income_band_id", arrow::int32()), + arrow::field("ib_lower_bound", arrow::int32()), + arrow::field("ib_upper_bound", arrow::int32()), + }); + + case TableType::Reason: + return arrow::schema({ + arrow::field("r_reason_sk", arrow::int64()), + tpcds_field("r_reason_id", arrow::utf8(), REASON_ROWS), + tpcds_field("r_reason_desc", arrow::utf8(), REASON_ROWS), + }); + + case TableType::TimeDim: + return arrow::schema({ + arrow::field("t_time_sk", arrow::int64()), + tpcds_field("t_time_id", arrow::utf8(), TIME_DIM_ROWS), + arrow::field("t_time", arrow::int32()), + arrow::field("t_hour", arrow::int32()), + arrow::field("t_minute", arrow::int32()), + 
arrow::field("t_second", arrow::int32()), + arrow::field("t_am_pm", dict8), + arrow::field("t_shift", dict8), + arrow::field("t_sub_shift", dict8), + arrow::field("t_meal_time", dict8), + }); + + case TableType::Promotion: + return arrow::schema({ + arrow::field("p_promo_sk", arrow::int64()), + tpcds_field("p_promo_id", arrow::utf8(), promotion), + arrow::field("p_start_date_sk", arrow::int64()), + arrow::field("p_end_date_sk", arrow::int64()), + arrow::field("p_item_sk", arrow::int64()), + arrow::field("p_cost", arrow::float64()), + arrow::field("p_response_target", arrow::int32()), + tpcds_field("p_promo_name", arrow::utf8(), promotion), + arrow::field("p_channel_dmail", arrow::int32()), + arrow::field("p_channel_email", arrow::int32()), + arrow::field("p_channel_catalog", arrow::int32()), + arrow::field("p_channel_tv", arrow::int32()), + arrow::field("p_channel_radio", arrow::int32()), + arrow::field("p_channel_press", arrow::int32()), + arrow::field("p_channel_event", arrow::int32()), + arrow::field("p_channel_demo", arrow::int32()), + tpcds_field("p_channel_details", arrow::utf8(), promotion), + arrow::field("p_purpose", dict8), + arrow::field("p_discount_active", arrow::int32()), + }); + + case TableType::Store: + return arrow::schema({ + arrow::field("s_store_sk", arrow::int64()), + tpcds_field("s_store_id", arrow::utf8(), store), + arrow::field("s_rec_start_date", arrow::int64()), + arrow::field("s_rec_end_date", arrow::int64()), + arrow::field("s_closed_date_sk", arrow::int64()), + tpcds_field("s_store_name", arrow::utf8(), store), + arrow::field("s_number_employees", arrow::int32()), + arrow::field("s_floor_space", arrow::int32()), + arrow::field("s_hours", dict8), + tpcds_field("s_manager", arrow::utf8(), store), + arrow::field("s_market_id", arrow::int32()), + arrow::field("s_geography_class", dict8), + tpcds_field("s_market_desc", arrow::utf8(), store), + tpcds_field("s_market_manager", arrow::utf8(), store), + arrow::field("s_division_id", 
arrow::int64()), + arrow::field("s_division_name", dict8), + arrow::field("s_company_id", arrow::int64()), + arrow::field("s_company_name", dict8), + arrow::field("s_street_number", arrow::int32()), + tpcds_field("s_street_name", arrow::utf8(), store), + arrow::field("s_street_type", dict8), + tpcds_field("s_suite_number", arrow::utf8(), store), + tpcds_field("s_city", arrow::utf8(), store), + tpcds_field("s_county", arrow::utf8(), store), + arrow::field("s_state", dict8), + tpcds_field("s_zip", arrow::utf8(), store), + arrow::field("s_country", dict8), + arrow::field("s_gmt_offset", arrow::float64()), + arrow::field("s_tax_percentage", arrow::float64()), + }); + + default: + throw std::invalid_argument( + "DSDGenWrapper::get_schema: schema not yet implemented for table " + + table_name(t)); + } +} + +// --------------------------------------------------------------------------- +// Constructor / destructor +// --------------------------------------------------------------------------- + +DSDGenWrapper::DSDGenWrapper(long scale_factor, bool verbose) + : scale_factor_(scale_factor), verbose_(verbose), initialized_(false) { + if (scale_factor <= 0) { + throw std::invalid_argument("scale_factor must be positive"); + } +} + +DSDGenWrapper::~DSDGenWrapper() { + if (!tmp_dist_path_.empty()) { + ::unlink(tmp_dist_path_.c_str()); + } +} + +// --------------------------------------------------------------------------- +// Initialization +// --------------------------------------------------------------------------- + +void DSDGenWrapper::init_dsdgen() { + if (initialized_) return; + + // 1. Write embedded tpcds.idx to a temp file (dsdgen opens it by path). 
+ char tmp_tmpl[] = "/tmp/tpcds_idx_XXXXXX"; + int fd = ::mkstemp(tmp_tmpl); + if (fd < 0) { + throw std::runtime_error("DSDGenWrapper: mkstemp failed for tpcds.idx"); + } + const uint8_t* data = tpcds_idx_data; + size_t remaining = tpcds_idx_size; + while (remaining > 0) { + ssize_t written = ::write(fd, data, remaining); + if (written <= 0) { + ::close(fd); + ::unlink(tmp_tmpl); + throw std::runtime_error("DSDGenWrapper: write to tmp tpcds.idx failed"); + } + data += written; + remaining -= static_cast(written); + } + ::close(fd); + tmp_dist_path_ = tmp_tmpl; + + // 2. Initialise dsdgen parameter table and override relevant params. + init_params(); + + // 3. Point DISTRIBUTIONS at the temp file we just wrote. + set_str(const_cast("DISTRIBUTIONS"), + const_cast(tmp_dist_path_.c_str())); + + // 4. Set scale factor. + char scale_buf[32]; + std::snprintf(scale_buf, sizeof(scale_buf), "%ld", scale_factor_); + set_int(const_cast("SCALE"), scale_buf); + + // 5. Seed the RNG (must happen after init_params so streams are set up). + init_rand(); + + initialized_ = true; + + if (verbose_) { + std::fprintf(stderr, + "DSDGenWrapper: initialized (SF=%ld, dist=%s)\n", + scale_factor_, tmp_dist_path_.c_str()); + } +} + +// --------------------------------------------------------------------------- +// get_row_count +// --------------------------------------------------------------------------- + +long DSDGenWrapper::get_row_count(TableType t) const { + // get_rowcount() reads the global scale factor set in init_dsdgen(). + // const_cast is safe: we only call this after initialization. 
+ const_cast(this)->init_dsdgen(); + return static_cast(get_rowcount(table_id(t))); +} + +// --------------------------------------------------------------------------- +// generate_store_sales +// --------------------------------------------------------------------------- +// +// store_sales is a master-detail table: each call to mk_w_store_sales(NULL, i) +// generates one "ticket" (master) with 8-16 line items (details). Each detail +// row is emitted via the callback g_w_store_sales_callback, which is the only +// way to capture the fully-populated rows (including pricing fields that live +// in the global g_w_store_sales, not in the caller-supplied struct). +// +// get_rowcount(STORE_SALES) returns the number of TICKETS (master rows). +// The total number of line-item rows emitted will be higher (8-16×). +// --------------------------------------------------------------------------- + +// C-linkage trampolines for master-detail tables +namespace { +template +struct CallbackState { + std::function* cb; + long max_rows; + long emitted; + std::exception_ptr error; +}; + +template +static void callback_trampoline_impl(const Row* row, void* ctx) { + auto* c = static_cast*>(ctx); + if (c->error != nullptr || (c->max_rows > 0 && c->emitted >= c->max_rows)) { + return; + } + try { + (*c->cb)(static_cast(row)); + ++c->emitted; + } catch (...) 
{ + c->error = std::current_exception(); + } +} + +extern "C" void store_sales_trampoline( + const struct W_STORE_SALES_TBL* row, void* ctx) +{ + callback_trampoline_impl(row, ctx); +} + +extern "C" void catalog_sales_trampoline( + const struct W_CATALOG_SALES_TBL* row, void* ctx) +{ + callback_trampoline_impl(row, ctx); +} + +extern "C" void web_sales_trampoline( + const struct W_WEB_SALES_TBL* row, void* ctx) +{ + callback_trampoline_impl(row, ctx); +} + +template +struct CallbackGuard { + void (**slot)(const Row*, void*); + void** ctx_slot; + + ~CallbackGuard() { + *slot = nullptr; + *ctx_slot = nullptr; + } +}; + +template +using MasterDetailCallbackSlot = void (*)(const Row*, void*); + +template +static void run_master_detail_generation( + std::function callback, + long max_rows, + ds_key_t n_tickets, + const char* table_name, + bool verbose, + MasterDetailCallbackSlot* callback_slot, + void** callback_ctx_slot, + MasterDetailCallbackSlot trampoline, + int (*mk_row)(void*, ds_key_t)) +{ + CallbackState ctx{&callback, max_rows, 0L, nullptr}; + *callback_slot = trampoline; + *callback_ctx_slot = &ctx; + CallbackGuard guard{callback_slot, callback_ctx_slot}; + + if (verbose) { + std::fprintf(stderr, + "DSDGenWrapper: generating %s from %lld tickets\n", + table_name, + static_cast(n_tickets)); + } + + for (ds_key_t i = 1; i <= n_tickets; ++i) { + if (ctx.error != nullptr || (max_rows > 0 && ctx.emitted >= max_rows)) { + break; + } + mk_row(nullptr, i); + } + if (ctx.error != nullptr) { + std::rethrow_exception(ctx.error); + } + + if (verbose) { + std::fprintf(stderr, + "DSDGenWrapper: emitted %ld %s rows\n", ctx.emitted, table_name); + } +} +} // anonymous namespace + +void DSDGenWrapper::generate_store_sales( + std::function callback, + long max_rows) +{ + init_dsdgen(); + run_master_detail_generation( + std::move(callback), + max_rows, + get_rowcount(TPCDS_STORE_SALES), + "store_sales", + verbose_, + &g_w_store_sales_callback, + &g_w_store_sales_callback_ctx, + 
store_sales_trampoline, + mk_w_store_sales); +} + +// --------------------------------------------------------------------------- +// generate_inventory +// --------------------------------------------------------------------------- + +void DSDGenWrapper::generate_inventory( + std::function callback, + long max_rows) +{ + init_dsdgen(); + + ds_key_t total = get_rowcount(TPCDS_INVENTORY); + if (max_rows > 0 && static_cast(max_rows) < total) { + total = static_cast(max_rows); + } + + if (verbose_) { + std::fprintf(stderr, + "DSDGenWrapper: generating %lld inventory rows\n", + static_cast(total)); + } + + W_INVENTORY_TBL row; + for (ds_key_t i = 1; i <= total; ++i) { + mk_w_inventory(&row, i); + callback(&row); + } +} + +// --------------------------------------------------------------------------- +// generate_catalog_sales +// --------------------------------------------------------------------------- + +void DSDGenWrapper::generate_catalog_sales( + std::function callback, + long max_rows) +{ + init_dsdgen(); + run_master_detail_generation( + std::move(callback), + max_rows, + get_rowcount(TPCDS_CATALOG_SALES), + "catalog_sales", + verbose_, + &g_w_catalog_sales_callback, + &g_w_catalog_sales_callback_ctx, + catalog_sales_trampoline, + mk_w_catalog_sales); +} + +// --------------------------------------------------------------------------- +// generate_web_sales +// --------------------------------------------------------------------------- + +void DSDGenWrapper::generate_web_sales( + std::function callback, + long max_rows) +{ + init_dsdgen(); + run_master_detail_generation( + std::move(callback), + max_rows, + get_rowcount(TPCDS_WEB_SALES), + "web_sales", + verbose_, + &g_w_web_sales_callback, + &g_w_web_sales_callback_ctx, + web_sales_trampoline, + mk_w_web_sales); +} + +// --------------------------------------------------------------------------- +// generate_customer +// --------------------------------------------------------------------------- + +void 
DSDGenWrapper::generate_customer( + std::function callback, + long max_rows) +{ + init_dsdgen(); + + ds_key_t total = get_rowcount(TPCDS_CUSTOMER); + if (max_rows > 0 && static_cast(max_rows) < total) { + total = static_cast(max_rows); + } + + if (verbose_) { + std::fprintf(stderr, + "DSDGenWrapper: generating %lld customer rows\n", + static_cast(total)); + } + + W_CUSTOMER_TBL row; + for (ds_key_t i = 1; i <= total; ++i) { + mk_w_customer(&row, i); + callback(&row); + } +} + +// --------------------------------------------------------------------------- +// generate_item +// --------------------------------------------------------------------------- + +void DSDGenWrapper::generate_item( + std::function callback, + long max_rows) +{ + init_dsdgen(); + + ds_key_t total = get_rowcount(TPCDS_ITEM); + if (max_rows > 0 && static_cast(max_rows) < total) { + total = static_cast(max_rows); + } + + if (verbose_) { + std::fprintf(stderr, + "DSDGenWrapper: generating %lld item rows\n", + static_cast(total)); + } + + W_ITEM_TBL row; + for (ds_key_t i = 1; i <= total; ++i) { + mk_w_item(&row, i); + callback(&row); + } +} + +// --------------------------------------------------------------------------- +// generate_date_dim +// --------------------------------------------------------------------------- + +void DSDGenWrapper::generate_date_dim( + std::function callback, + long max_rows) +{ + init_dsdgen(); + + ds_key_t total = get_rowcount(TPCDS_DATE); + if (max_rows > 0 && static_cast(max_rows) < total) { + total = static_cast(max_rows); + } + + if (verbose_) { + std::fprintf(stderr, + "DSDGenWrapper: generating %lld date_dim rows\n", + static_cast(total)); + } + + W_DATE_TBL row; + for (ds_key_t i = 1; i <= total; ++i) { + mk_w_date(&row, i); + callback(&row); + } +} + +// --------------------------------------------------------------------------- +// generate_store_returns +// --------------------------------------------------------------------------- +// +// store_returns is 
generated as a side effect of store_sales: each sales row +// has a SR_RETURN_PCT (10%) chance of producing a return. The returns table +// has no standalone row count (get_rowcount returns -1). +// +// We drive generation through the store_sales ticket loop: for each ticket +// index we call mk_w_store_sales to populate g_w_store_sales, then call +// mk_w_store_returns to produce the corresponding return row. This gives +// correct referential integrity (the return references the just-generated +// sale). The 10% probability is NOT applied here — every sale generates a +// return row — which is intentional for benchmarking (avoids random skipping). +// --------------------------------------------------------------------------- + +void DSDGenWrapper::generate_store_returns( + std::function callback, + long max_rows) +{ + init_dsdgen(); + + // Use store_sales ticket count as the driver (returns have no own rowcount). + ds_key_t n_tickets = get_rowcount(TPCDS_STORE_SALES); + if (max_rows > 0 && static_cast(max_rows) < n_tickets) { + n_tickets = static_cast(max_rows); + } + + if (verbose_) { + std::fprintf(stderr, + "DSDGenWrapper: generating store_returns from %lld sales tickets\n", + static_cast(n_tickets)); + } + + // Use a no-op callback to suppress sales output while still populating g_w_store_sales. + g_w_store_sales_callback = [](const struct W_STORE_SALES_TBL*, void*) {}; + g_w_store_sales_callback_ctx = nullptr; + CallbackGuard guard{ + &g_w_store_sales_callback, + &g_w_store_sales_callback_ctx, + }; + + W_STORE_RETURNS_TBL row; + long emitted = 0; + for (ds_key_t i = 1; i <= n_tickets; ++i) { + // Populate g_w_store_sales so mk_w_store_returns has valid sale context. + // The no-op callback suppresses stdout printing. 
+ mk_w_store_sales(nullptr, i); + mk_w_store_returns(&row, i); + callback(&row); + ++emitted; + if (max_rows > 0 && emitted >= max_rows) break; + } + if (verbose_) { + std::fprintf(stderr, + "DSDGenWrapper: emitted %ld store_returns rows\n", emitted); + } +} + +// --------------------------------------------------------------------------- +// generate_catalog_returns +// --------------------------------------------------------------------------- +// +// Same approach as generate_store_returns but driven by catalog_sales tickets. +// --------------------------------------------------------------------------- + +void DSDGenWrapper::generate_catalog_returns( + std::function callback, + long max_rows) +{ + init_dsdgen(); + + ds_key_t n_tickets = get_rowcount(TPCDS_CATALOG_SALES); + if (max_rows > 0 && static_cast(max_rows) < n_tickets) { + n_tickets = static_cast(max_rows); + } + + if (verbose_) { + std::fprintf(stderr, + "DSDGenWrapper: generating catalog_returns from %lld sales tickets\n", + static_cast(n_tickets)); + } + + // Use a no-op callback to suppress sales output while still populating g_w_catalog_sales. + g_w_catalog_sales_callback = [](const struct W_CATALOG_SALES_TBL*, void*) {}; + g_w_catalog_sales_callback_ctx = nullptr; + CallbackGuard guard{ + &g_w_catalog_sales_callback, + &g_w_catalog_sales_callback_ctx, + }; + + W_CATALOG_RETURNS_TBL row; + long emitted = 0; + for (ds_key_t i = 1; i <= n_tickets; ++i) { + // Populate g_w_catalog_sales so mk_w_catalog_returns has valid sale context. 
+ mk_w_catalog_sales(nullptr, i); + mk_w_catalog_returns(&row, i); + callback(&row); + ++emitted; + if (max_rows > 0 && emitted >= max_rows) break; + } + if (verbose_) { + std::fprintf(stderr, + "DSDGenWrapper: emitted %ld catalog_returns rows\n", emitted); + } +} + +// --------------------------------------------------------------------------- +// generate_web_returns +// --------------------------------------------------------------------------- +// +// Same approach as generate_store_returns but driven by web_sales tickets. +// --------------------------------------------------------------------------- + +void DSDGenWrapper::generate_web_returns( + std::function callback, + long max_rows) +{ + init_dsdgen(); + + ds_key_t n_tickets = get_rowcount(TPCDS_WEB_SALES); + if (max_rows > 0 && static_cast(max_rows) < n_tickets) { + n_tickets = static_cast(max_rows); + } + + if (verbose_) { + std::fprintf(stderr, + "DSDGenWrapper: generating web_returns from %lld sales tickets\n", + static_cast(n_tickets)); + } + + // Use a no-op callback to suppress sales output while still populating g_w_web_sales. + g_w_web_sales_callback = [](const struct W_WEB_SALES_TBL*, void*) {}; + g_w_web_sales_callback_ctx = nullptr; + CallbackGuard guard{ + &g_w_web_sales_callback, + &g_w_web_sales_callback_ctx, + }; + + W_WEB_RETURNS_TBL row; + long emitted = 0; + for (ds_key_t i = 1; i <= n_tickets; ++i) { + // Populate g_w_web_sales so mk_w_web_returns has valid sale context. 
+ mk_w_web_sales(nullptr, i); + mk_w_web_returns(&row, i); + callback(&row); + ++emitted; + if (max_rows > 0 && emitted >= max_rows) break; + } + if (verbose_) { + std::fprintf(stderr, + "DSDGenWrapper: emitted %ld web_returns rows\n", emitted); + } +} + +// --------------------------------------------------------------------------- +// Phase 5 dimension table generators (simple direct-struct pattern) +// --------------------------------------------------------------------------- + +#define TPCDS_SIMPLE_GENERATE(funcname, TBL_TYPE, TPCDS_CONST, mk_func, log_name) \ +void DSDGenWrapper::funcname( \ + std::function callback, \ + long max_rows) \ +{ \ + init_dsdgen(); \ + ds_key_t total = get_rowcount(TPCDS_CONST); \ + if (max_rows > 0 && static_cast(max_rows) < total) \ + total = static_cast(max_rows); \ + if (verbose_) { \ + std::fprintf(stderr, \ + "DSDGenWrapper: generating %lld " log_name " rows\n", \ + static_cast(total)); \ + } \ + TBL_TYPE row; \ + for (ds_key_t i = 1; i <= total; ++i) { \ + mk_func(&row, i); \ + callback(&row); \ + } \ +} + +TPCDS_SIMPLE_GENERATE(generate_call_center, struct CALL_CENTER_TBL, + TPCDS_CALL_CENTER, mk_w_call_center, "call_center") + +TPCDS_SIMPLE_GENERATE(generate_catalog_page, struct CATALOG_PAGE_TBL, + TPCDS_CATALOG_PAGE, mk_w_catalog_page, "catalog_page") + +TPCDS_SIMPLE_GENERATE(generate_web_page, struct W_WEB_PAGE_TBL, + TPCDS_WEB_PAGE, mk_w_web_page, "web_page") + +TPCDS_SIMPLE_GENERATE(generate_web_site, struct W_WEB_SITE_TBL, + TPCDS_WEB_SITE, mk_w_web_site, "web_site") + +TPCDS_SIMPLE_GENERATE(generate_warehouse, struct W_WAREHOUSE_TBL, + TPCDS_WAREHOUSE, mk_w_warehouse, "warehouse") + +TPCDS_SIMPLE_GENERATE(generate_ship_mode, struct W_SHIP_MODE_TBL, + TPCDS_SHIP_MODE, mk_w_ship_mode, "ship_mode") + +TPCDS_SIMPLE_GENERATE(generate_household_demographics, struct W_HOUSEHOLD_DEMOGRAPHICS_TBL, + TPCDS_HOUSEHOLD_DEMOGRAPHICS, mk_w_household_demographics, "household_demographics") + 
+TPCDS_SIMPLE_GENERATE(generate_customer_demographics, struct W_CUSTOMER_DEMOGRAPHICS_TBL, + TPCDS_CUSTOMER_DEMOGRAPHICS, mk_w_customer_demographics, "customer_demographics") + +TPCDS_SIMPLE_GENERATE(generate_customer_address, struct W_CUSTOMER_ADDRESS_TBL, + TPCDS_CUSTOMER_ADDRESS, mk_w_customer_address, "customer_address") + +TPCDS_SIMPLE_GENERATE(generate_income_band, struct W_INCOME_BAND_TBL, + TPCDS_INCOME_BAND, mk_w_income_band, "income_band") + +TPCDS_SIMPLE_GENERATE(generate_reason, struct W_REASON_TBL, + TPCDS_REASON, mk_w_reason, "reason") + +TPCDS_SIMPLE_GENERATE(generate_time_dim, struct W_TIME_TBL, + TPCDS_TIME, mk_w_time, "time_dim") + +TPCDS_SIMPLE_GENERATE(generate_promotion, struct W_PROMOTION_TBL, + TPCDS_PROMOTION, mk_w_promotion, "promotion") + +TPCDS_SIMPLE_GENERATE(generate_store, struct W_STORE_TBL, + TPCDS_STORE, mk_w_store, "store") + +} // namespace tpcds diff --git a/src/tpcds_main.cpp b/src/tpcds_main.cpp new file mode 100644 index 0000000..ad5daf0 --- /dev/null +++ b/src/tpcds_main.cpp @@ -0,0 +1,555 @@ +/** + * tpcds_main.cpp — TPC-DS data generator benchmark executable + * + * Generates TPC-DS benchmark data in multiple formats (Parquet, CSV, ORC, + * Lance, Paimon, Iceberg) using the official TPC-DS dsdgen generator. 
+ * + * CLI mirrors tpch_benchmark: + * ./tpcds_benchmark --format parquet --table store_sales --scale-factor 1 + * ./tpcds_benchmark --format parquet --table inventory --scale-factor 5 + */ + +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include + +#include +#include + +#include "tpch/writer_interface.hpp" +#include "tpch/csv_writer.hpp" +#include "tpch/parquet_writer.hpp" +#include "tpch/dsdgen_wrapper.hpp" +#include "tpch/dsdgen_converter.hpp" + +#ifdef TPCH_ENABLE_ORC +#include "tpch/orc_writer.hpp" +#endif +#ifdef TPCH_ENABLE_PAIMON +#include "tpch/paimon_writer.hpp" +#endif +#ifdef TPCH_ENABLE_ICEBERG +#include "tpch/iceberg_writer.hpp" +#endif +#ifdef TPCH_ENABLE_LANCE +#include "tpch/lance_writer.hpp" +#endif + +namespace { + +struct Options { + long scale_factor = 1; + std::string format = "parquet"; + std::string output_dir = "/tmp"; + long max_rows = 1000; + std::string table = "store_sales"; + std::string compression = "snappy"; // snappy, lz4, zstd, none + bool verbose = false; + bool zero_copy = false; // streaming mode: O(batch) memory instead of O(total) + std::string zero_copy_mode = "sync"; // sync, auto, async (lance-specific selection) +}; + +void print_usage(const char* prog) { + fprintf(stderr, + "Usage: %s [OPTIONS]\n" + "\n" + "Options:\n" + " --format Output format: parquet, csv" +#ifdef TPCH_ENABLE_ORC + ", orc" +#endif +#ifdef TPCH_ENABLE_PAIMON + ", paimon" +#endif +#ifdef TPCH_ENABLE_ICEBERG + ", iceberg" +#endif +#ifdef TPCH_ENABLE_LANCE + ", lance" +#endif + " (default: parquet)\n" + " --table TPC-DS table name (default: store_sales)\n" + " --scale-factor Scale factor (default: 1)\n" + " --output-dir Output directory (default: /tmp)\n" + " --max-rows Max rows to generate (0=all, default: 1000)\n" + " --compression Parquet compression: snappy (default), zstd, none\n" + " --zero-copy Streaming mode: flush each batch immediately (O(batch) RAM)\n" + " --zero-copy-mode Zero-copy mode 
for Lance: sync, auto, async (default: sync)\n" +#ifdef TPCH_ENABLE_LANCE +#endif + " --verbose Verbose output\n" + " --help Show this help\n" + "\n" + "TPC-DS tables (implemented):\n" + " Fact: store_sales, inventory, catalog_sales, web_sales,\n" + " store_returns, catalog_returns, web_returns\n" + " Dimension: customer, item, date_dim,\n" + " call_center, catalog_page, web_page, web_site,\n" + " warehouse, ship_mode, household_demographics,\n" + " customer_demographics, customer_address, income_band,\n" + " reason, time_dim, promotion, store\n", + prog); +} + +Options parse_args(int argc, char* argv[]) { + Options opts; + + enum { + OPT_COMPRESSION = 1000, + OPT_ZERO_COPY, + OPT_ZERO_COPY_MODE + }; + static struct option long_opts[] = { + {"format", required_argument, nullptr, 'f'}, + {"table", required_argument, nullptr, 't'}, + {"scale-factor", required_argument, nullptr, 's'}, + {"output-dir", required_argument, nullptr, 'o'}, + {"max-rows", required_argument, nullptr, 'm'}, + {"compression", required_argument, nullptr, OPT_COMPRESSION}, + {"zero-copy", no_argument, nullptr, OPT_ZERO_COPY}, + {"zero-copy-mode", required_argument, nullptr, OPT_ZERO_COPY_MODE}, + {"verbose", no_argument, nullptr, 'v'}, + {"help", no_argument, nullptr, 'h'}, + {nullptr, 0, nullptr, 0} + }; + + int c; + while ((c = getopt_long(argc, argv, "f:t:s:o:m:vzh", long_opts, nullptr)) != -1) { + switch (c) { + case 'f': opts.format = optarg; break; + case 't': opts.table = optarg; break; + case 's': opts.scale_factor = std::stol(optarg); break; + case 'o': opts.output_dir = optarg; break; + case 'm': opts.max_rows = std::stol(optarg); break; + case OPT_COMPRESSION: opts.compression = optarg; break; + case OPT_ZERO_COPY: opts.zero_copy = true; break; + case OPT_ZERO_COPY_MODE: opts.zero_copy_mode = optarg; break; + case 'z': opts.zero_copy = true; break; + case 'v': opts.verbose = true; break; + case 'h': print_usage(argv[0]); exit(0); + default: print_usage(argv[0]); exit(1); + } + } + 
return opts; +} + +std::string normalize_zero_copy_mode(std::string mode) { + for (char& c : mode) { + c = static_cast(std::tolower(static_cast(c))); + } + return mode; +} + +// Create writer for the given format and output path. +// When zero_copy=true, enables streaming write mode: each batch is flushed +// immediately to disk, capping RAM usage at O(batch_size) instead of O(total_rows). +std::unique_ptr create_writer( + const std::string& format, + const std::string& filepath, + const std::string& compression, + bool zero_copy = false, + bool lance_async_streaming = false) +{ + if (format == "csv") { + return std::make_unique(filepath); + } else if (format == "parquet") { + auto w = std::make_unique(filepath); + w->set_compression(compression); + if (zero_copy) { + w->enable_streaming_write(); + } + return w; + } +#ifdef TPCH_ENABLE_ORC + else if (format == "orc") { + return std::make_unique(filepath); + } +#endif +#ifdef TPCH_ENABLE_PAIMON + else if (format == "paimon") { + return std::make_unique(filepath); + } +#endif +#ifdef TPCH_ENABLE_ICEBERG + else if (format == "iceberg") { + return std::make_unique(filepath); + } +#endif +#ifdef TPCH_ENABLE_LANCE + else if (format == "lance") { + auto w = std::make_unique(filepath); + if (zero_copy && lance_async_streaming) { + w->enable_streaming_write(true); + } + return w; + } +#endif + throw std::invalid_argument("Unknown format: " + format); +} + +// Build Arrow array builders from schema (int32, int64, float64, string) +tpcds::BuilderMap +create_builders(std::shared_ptr schema, int64_t capacity) +{ + tpcds::BuilderMap builders; + builders.reserve(static_cast(schema->num_fields())); + + for (const auto& field : schema->fields()) { + switch (field->type()->id()) { + case arrow::Type::INT64: { + auto b = std::make_shared(); + (void)b->Reserve(capacity); + builders.push_back(b); + break; + } + case arrow::Type::INT32: { + auto b = std::make_shared(); + (void)b->Reserve(capacity); + builders.push_back(b); + break; + } 
+ case arrow::Type::DOUBLE: { + auto b = std::make_shared(); + (void)b->Reserve(capacity); + builders.push_back(b); + break; + } + case arrow::Type::STRING: { + auto b = std::make_shared(); + (void)b->Reserve(capacity); + (void)b->ReserveData(capacity * 32); + builders.push_back(b); + break; + } + case arrow::Type::DICTIONARY: { + auto b = std::make_shared(); + (void)b->Reserve(capacity); + builders.push_back(b); + break; + } + default: + throw std::runtime_error( + "Unsupported Arrow type: " + field->type()->ToString()); + } + } + return builders; +} + +// Finish builders → RecordBatch, then reset +std::shared_ptr +finish_batch( + std::shared_ptr schema, + tpcds::BuilderMap& builders, + size_t num_rows) +{ + std::vector> arrays; + arrays.reserve(schema->num_fields()); + for (int i = 0; i < schema->num_fields(); ++i) { + const auto& field = schema->field(i); + std::shared_ptr array; + arrow::Status finish_status = + builders[static_cast(i)]->Finish(&array); + if (!finish_status.ok()) { + throw std::runtime_error( + "Failed to finish Arrow builder for field '" + + field->name() + "': " + finish_status.ToString()); + } + // Convert Int8 indices to DictionaryArray for DICTIONARY fields + if (field->type()->id() == arrow::Type::DICTIONARY) { + auto dict = tpcds::get_dict_for_field(field->name()); + if (dict) { + auto dict_result = + arrow::DictionaryArray::FromArrays(field->type(), array, dict); + if (!dict_result.ok()) { + throw std::runtime_error( + "Failed to build dictionary array for field '" + + field->name() + "': " + dict_result.status().ToString()); + } + array = dict_result.ValueOrDie(); + } + } + arrays.push_back(array); + } + return arrow::RecordBatch::Make(schema, static_cast(num_rows), arrays); +} + +void reset_builders(tpcds::BuilderMap& builders) { + for (auto& b : builders) { b->Reset(); } +} + +// --------------------------------------------------------------------------- +// main generation loop (row-by-row callback → batched Arrow writes) +// 
--------------------------------------------------------------------------- + +template +size_t run_generation( + const Options& opts, + std::shared_ptr schema, + std::unique_ptr& writer, + GenerateFn generate_fn) +{ + // 8192 = Lance max_rows_per_group default — aligns C++ batches to Lance row-group + // boundaries so the streaming encoder never sees split/leftover rows at group edges. + // This also benefits Parquet (common row-group granularity) and ORC stripe alignment. + const size_t batch_size = 8192; + size_t rows_in_batch = 0; + size_t total_rows = 0; + + auto builders = create_builders(schema, static_cast(batch_size)); + + auto callback = [&](const void* row) { + tpcds::append_dsdgen_row_to_builders(opts.table, row, builders); + ++rows_in_batch; + ++total_rows; + + if (rows_in_batch >= batch_size) { + writer->write_batch(finish_batch(schema, builders, rows_in_batch)); + reset_builders(builders); + rows_in_batch = 0; + + if (opts.verbose && (total_rows % 100000 == 0)) { + fprintf(stderr, " Generated %zu rows...\n", total_rows); + } + } + }; + + generate_fn(callback); + + // Flush final partial batch + if (rows_in_batch > 0) { + writer->write_batch(finish_batch(schema, builders, rows_in_batch)); + } + + return total_rows; +} + +// Map table name → TableType enum +tpcds::TableType parse_table(const std::string& name) { + if (name == "store_sales") return tpcds::TableType::StoreSales; + if (name == "inventory") return tpcds::TableType::Inventory; + if (name == "catalog_sales") return tpcds::TableType::CatalogSales; + if (name == "web_sales") return tpcds::TableType::WebSales; + if (name == "customer") return tpcds::TableType::Customer; + if (name == "item") return tpcds::TableType::Item; + if (name == "date_dim") return tpcds::TableType::DateDim; + if (name == "store_returns") return tpcds::TableType::StoreReturns; + if (name == "catalog_returns") return tpcds::TableType::CatalogReturns; + if (name == "web_returns") return tpcds::TableType::WebReturns; + if 
(name == "call_center") return tpcds::TableType::CallCenter; + if (name == "catalog_page") return tpcds::TableType::CatalogPage; + if (name == "web_page") return tpcds::TableType::WebPage; + if (name == "web_site") return tpcds::TableType::WebSite; + if (name == "warehouse") return tpcds::TableType::Warehouse; + if (name == "ship_mode") return tpcds::TableType::ShipMode; + if (name == "household_demographics") return tpcds::TableType::HouseholdDemographics; + if (name == "customer_demographics") return tpcds::TableType::CustomerDemographics; + if (name == "customer_address") return tpcds::TableType::CustomerAddress; + if (name == "income_band") return tpcds::TableType::IncomeBand; + if (name == "reason") return tpcds::TableType::Reason; + if (name == "time_dim") return tpcds::TableType::TimeDim; + if (name == "promotion") return tpcds::TableType::Promotion; + if (name == "store") return tpcds::TableType::Store; + throw std::invalid_argument("Table '" + name + "' not found. Use --help for list."); +} + +// Extension for a given format +std::string file_extension(const std::string& fmt) { + if (fmt == "parquet") return ".parquet"; + if (fmt == "csv") return ".csv"; + if (fmt == "orc") return ".orc"; + if (fmt == "paimon") return ".paimon"; + if (fmt == "iceberg") return ".iceberg"; + if (fmt == "lance") return ".lance"; + return "." 
+ fmt; +} + +} // namespace + +int main(int argc, char* argv[]) { + if (argc < 2) { + print_usage(argv[0]); + return 1; + } + + Options opts; + try { + opts = parse_args(argc, argv); + } catch (const std::exception& e) { + fprintf(stderr, "Error parsing arguments: %s\n", e.what()); + return 1; + } + if (opts.scale_factor <= 0) { + fprintf(stderr, "tpcds_benchmark: --scale-factor must be > 0\n"); + return 1; + } + opts.zero_copy_mode = normalize_zero_copy_mode(opts.zero_copy_mode); + if (opts.zero_copy_mode != "auto" && opts.zero_copy_mode != "sync" && opts.zero_copy_mode != "async") { + fprintf(stderr, "tpcds_benchmark: --zero-copy-mode must be one of: auto, sync, async\n"); + return 1; + } + + // Resolve table + tpcds::TableType table_type; + try { + table_type = parse_table(opts.table); + } catch (const std::invalid_argument& e) { + fprintf(stderr, "tpcds_benchmark: %s\n", e.what()); + return 1; + } + + // Build output path + std::string filepath = opts.output_dir + "/" + opts.table + file_extension(opts.format); + + // single-table tpcds_benchmark: synchronous bounded path is default. + bool lance_async_streaming = + (opts.format == "lance" && opts.zero_copy && opts.zero_copy_mode == "async"); + + if (opts.verbose) { + fprintf(stderr, + "tpcds_benchmark: table=%s format=%s SF=%ld max_rows=%ld zero_copy=%s mode=%s\n" + " output: %s\n", + opts.table.c_str(), opts.format.c_str(), + opts.scale_factor, opts.max_rows, + opts.zero_copy ? 
"yes" : "no", + opts.zero_copy_mode.c_str(), + filepath.c_str()); + } + + // Create writer + std::unique_ptr writer; + try { + writer = create_writer( + opts.format, + filepath, + opts.compression, + opts.zero_copy, + lance_async_streaming); + } catch (const std::exception& e) { + fprintf(stderr, "tpcds_benchmark: failed to create writer: %s\n", e.what()); + return 1; + } + +#ifdef TPCH_ENABLE_LANCE + if (opts.format == "lance") { + if (auto* lw = dynamic_cast(writer.get())) { + if (opts.zero_copy && !lance_async_streaming) { + // Keep sync zero-copy bounded, but avoid tiny ~65K-row fragments that + // amplify Lance append/commit overhead at higher scale factors. + lw->set_buffered_flush_config(128, 1'048'576); + } + } + } +#endif + + // Get Arrow schema + auto schema = tpcds::DSDGenWrapper::get_schema(table_type, opts.scale_factor); + + // Build dsdgen wrapper + tpcds::DSDGenWrapper dsdgen(opts.scale_factor, opts.verbose); + + auto t_start = std::chrono::steady_clock::now(); + + // Generate + size_t actual_rows = 0; + try { + if (table_type == tpcds::TableType::StoreSales) { + actual_rows = run_generation(opts, schema, writer, + [&](auto cb) { dsdgen.generate_store_sales(cb, opts.max_rows); }); + } else if (table_type == tpcds::TableType::Inventory) { + actual_rows = run_generation(opts, schema, writer, + [&](auto cb) { dsdgen.generate_inventory(cb, opts.max_rows); }); + } else if (table_type == tpcds::TableType::CatalogSales) { + actual_rows = run_generation(opts, schema, writer, + [&](auto cb) { dsdgen.generate_catalog_sales(cb, opts.max_rows); }); + } else if (table_type == tpcds::TableType::WebSales) { + actual_rows = run_generation(opts, schema, writer, + [&](auto cb) { dsdgen.generate_web_sales(cb, opts.max_rows); }); + } else if (table_type == tpcds::TableType::Customer) { + actual_rows = run_generation(opts, schema, writer, + [&](auto cb) { dsdgen.generate_customer(cb, opts.max_rows); }); + } else if (table_type == tpcds::TableType::Item) { + actual_rows = 
run_generation(opts, schema, writer, + [&](auto cb) { dsdgen.generate_item(cb, opts.max_rows); }); + } else if (table_type == tpcds::TableType::DateDim) { + actual_rows = run_generation(opts, schema, writer, + [&](auto cb) { dsdgen.generate_date_dim(cb, opts.max_rows); }); + } else if (table_type == tpcds::TableType::StoreReturns) { + actual_rows = run_generation(opts, schema, writer, + [&](auto cb) { dsdgen.generate_store_returns(cb, opts.max_rows); }); + } else if (table_type == tpcds::TableType::CatalogReturns) { + actual_rows = run_generation(opts, schema, writer, + [&](auto cb) { dsdgen.generate_catalog_returns(cb, opts.max_rows); }); + } else if (table_type == tpcds::TableType::WebReturns) { + actual_rows = run_generation(opts, schema, writer, + [&](auto cb) { dsdgen.generate_web_returns(cb, opts.max_rows); }); + } else if (table_type == tpcds::TableType::CallCenter) { + actual_rows = run_generation(opts, schema, writer, + [&](auto cb) { dsdgen.generate_call_center(cb, opts.max_rows); }); + } else if (table_type == tpcds::TableType::CatalogPage) { + actual_rows = run_generation(opts, schema, writer, + [&](auto cb) { dsdgen.generate_catalog_page(cb, opts.max_rows); }); + } else if (table_type == tpcds::TableType::WebPage) { + actual_rows = run_generation(opts, schema, writer, + [&](auto cb) { dsdgen.generate_web_page(cb, opts.max_rows); }); + } else if (table_type == tpcds::TableType::WebSite) { + actual_rows = run_generation(opts, schema, writer, + [&](auto cb) { dsdgen.generate_web_site(cb, opts.max_rows); }); + } else if (table_type == tpcds::TableType::Warehouse) { + actual_rows = run_generation(opts, schema, writer, + [&](auto cb) { dsdgen.generate_warehouse(cb, opts.max_rows); }); + } else if (table_type == tpcds::TableType::ShipMode) { + actual_rows = run_generation(opts, schema, writer, + [&](auto cb) { dsdgen.generate_ship_mode(cb, opts.max_rows); }); + } else if (table_type == tpcds::TableType::HouseholdDemographics) { + actual_rows = 
run_generation(opts, schema, writer, + [&](auto cb) { dsdgen.generate_household_demographics(cb, opts.max_rows); }); + } else if (table_type == tpcds::TableType::CustomerDemographics) { + actual_rows = run_generation(opts, schema, writer, + [&](auto cb) { dsdgen.generate_customer_demographics(cb, opts.max_rows); }); + } else if (table_type == tpcds::TableType::CustomerAddress) { + actual_rows = run_generation(opts, schema, writer, + [&](auto cb) { dsdgen.generate_customer_address(cb, opts.max_rows); }); + } else if (table_type == tpcds::TableType::IncomeBand) { + actual_rows = run_generation(opts, schema, writer, + [&](auto cb) { dsdgen.generate_income_band(cb, opts.max_rows); }); + } else if (table_type == tpcds::TableType::Reason) { + actual_rows = run_generation(opts, schema, writer, + [&](auto cb) { dsdgen.generate_reason(cb, opts.max_rows); }); + } else if (table_type == tpcds::TableType::TimeDim) { + actual_rows = run_generation(opts, schema, writer, + [&](auto cb) { dsdgen.generate_time_dim(cb, opts.max_rows); }); + } else if (table_type == tpcds::TableType::Promotion) { + actual_rows = run_generation(opts, schema, writer, + [&](auto cb) { dsdgen.generate_promotion(cb, opts.max_rows); }); + } else if (table_type == tpcds::TableType::Store) { + actual_rows = run_generation(opts, schema, writer, + [&](auto cb) { dsdgen.generate_store(cb, opts.max_rows); }); + } + } catch (const std::exception& e) { + fprintf(stderr, "tpcds_benchmark: generation error: %s\n", e.what()); + return 1; + } + + writer->close(); + + auto t_end = std::chrono::steady_clock::now(); + double elapsed = std::chrono::duration(t_end - t_start).count(); + + // Report: use actual emitted row count (avoids -1 for tables with no standalone rowcount) + long actual = static_cast(actual_rows); + + printf("tpcds_benchmark: %s SF=%ld rows=%ld elapsed=%.2fs rate=%.0f rows/s\n", + opts.table.c_str(), opts.scale_factor, actual, + elapsed, (elapsed > 0) ? 
actual / elapsed : 0.0); + printf(" output: %s\n", filepath.c_str()); + + return 0; +} diff --git a/src/writers/lance_writer.cpp b/src/writers/lance_writer.cpp index d0c994f..76a3883 100644 --- a/src/writers/lance_writer.cpp +++ b/src/writers/lance_writer.cpp @@ -272,6 +272,37 @@ void LanceWriter::initialize_lance_dataset( } } + int runtime_result = lance_writer_set_runtime_config( + reinterpret_cast<::LanceWriter*>(rust_writer_), + stream_max_blocking_threads_); + if (runtime_result != 0) { + throw std::runtime_error("Failed to configure Lance runtime parameters"); + } + + int profile_result = lance_writer_set_profile_config( + reinterpret_cast<::LanceWriter*>(rust_writer_), + stream_mem_profile_enabled_ ? 1 : 0, + static_cast(stream_mem_profile_every_batches_)); + if (profile_result != 0) { + throw std::runtime_error("Failed to configure Lance profile parameters"); + } + + int sg_cfg_result = lance_writer_set_scatter_gather_config( + reinterpret_cast<::LanceWriter*>(rust_writer_), + static_cast(stream_scatter_gather_batches_), + static_cast(stream_scatter_gather_queue_chunks_)); + if (sg_cfg_result != 0) { + throw std::runtime_error("Failed to configure Lance scatter/gather parameters"); + } + + int buffered_cfg_result = lance_writer_set_buffered_flush_config( + reinterpret_cast<::LanceWriter*>(rust_writer_), + static_cast(buffered_flush_batch_threshold_), + static_cast(buffered_flush_row_threshold_)); + if (buffered_cfg_result != 0) { + throw std::runtime_error("Failed to configure Lance buffered flush parameters"); + } + if (streaming_enabled_) { auto state = std::make_shared(stream_queue_depth_); auto reader = std::make_shared(schema_, state); @@ -495,6 +526,13 @@ void LanceWriter::close() { double stall_ms = static_cast(stats.first) / 1e6; std::cout << "Lance: Stream stalls " << stats.second << " times, " << stall_ms << " ms total\n"; + double peak_mb = static_cast(stream_state_->peak_bytes()) / (1024.0 * 1024.0); + std::cout << "Lance Copy Profile: 
mode=async" + << " cxx_to_rust_bytes=" << total_byte_count_ + << " cxx_queue_peak_mb=" << peak_mb << "\n"; + } else { + std::cout << "Lance Copy Profile: mode=sync" + << " cxx_to_rust_bytes=" << total_byte_count_ << "\n"; } std::cout << "Lance dataset finalized: " << dataset_path_ << "\n" diff --git a/src/writers/orc_writer.cpp b/src/writers/orc_writer.cpp index 9701490..e1829b0 100644 --- a/src/writers/orc_writer.cpp +++ b/src/writers/orc_writer.cpp @@ -84,10 +84,11 @@ void copy_array_to_orc_column( } } } else if (array->type()->id() == arrow::Type::INT32) { + // ORC uses LongVectorBatch for all integer types (tinyint/smallint/int/bigint) auto int_array = std::static_pointer_cast(array); - auto* long_col = dynamic_cast(col_batch); + auto* long_col = dynamic_cast(col_batch); if (!long_col) { - throw std::runtime_error("Failed to cast ORC column to IntVectorBatch"); + throw std::runtime_error("Failed to cast ORC column to LongVectorBatch (int32)"); } for (size_t i = 0; i < size; ++i) { if (int_array->IsNull(static_cast(i))) { diff --git a/src/writers/parquet_writer.cpp b/src/writers/parquet_writer.cpp index 0e9b5ab..a8ec4a8 100644 --- a/src/writers/parquet_writer.cpp +++ b/src/writers/parquet_writer.cpp @@ -157,6 +157,41 @@ void ParquetWriter::write_managed_batch(const ManagedRecordBatch& managed_batch) } } +// Build WriterProperties with chosen compression. +// Disables Parquet's auto-dict for numeric types (int64, int32, float64): +// those are high-cardinality columns (foreign keys, prices) where the +// Parquet DictEncoder hashtable is pure overhead. Arrow DictionaryArray +// columns (dict8 string fields) are unaffected — Parquet identifies them +// by column path, not Arrow type. 
+static parquet::Compression::type parse_compression(const std::string& codec) +{ + if (codec == "snappy") return parquet::Compression::SNAPPY; + if (codec == "zstd") return parquet::Compression::ZSTD; + if (codec == "none" || codec == "uncompressed") return parquet::Compression::UNCOMPRESSED; + throw std::invalid_argument("Unknown compression codec: " + codec + + " (supported: snappy, zstd, none)"); +} + +static std::shared_ptr +make_writer_props(const arrow::Schema& schema, const std::string& codec) +{ + auto builder = parquet::WriterProperties::Builder(); + builder.compression(parse_compression(codec)); + for (const auto& field : schema.fields()) { + auto tid = field->type()->id(); + if (tid == arrow::Type::INT64 || tid == arrow::Type::INT32 || + tid == arrow::Type::DOUBLE || tid == arrow::Type::FLOAT) { + builder.disable_dictionary(field->name()); + } + } + return builder.build(); +} + +void ParquetWriter::set_compression(const std::string& codec) +{ + compression_codec_ = codec; +} + void ParquetWriter::init_file_writer() { if (parquet_file_writer_) { return; // Already initialized @@ -167,9 +202,7 @@ void ParquetWriter::init_file_writer() { } // Configure Parquet writer properties - auto writer_props = parquet::WriterProperties::Builder() - .compression(parquet::Compression::SNAPPY) - ->build(); + auto writer_props = make_writer_props(*first_batch_->schema(), compression_codec_); auto arrow_props = parquet::ArrowWriterProperties::Builder() .set_use_threads(use_threads_) @@ -236,9 +269,7 @@ void ParquetWriter::close() { TPCH_SCOPED_TIMER("parquet_encode_batches"); // Configure Parquet writer properties - auto writer_props = parquet::WriterProperties::Builder() - .compression(parquet::Compression::SNAPPY) - ->build(); + auto writer_props = make_writer_props(*first_batch_->schema(), compression_codec_); auto arrow_props = parquet::ArrowWriterProperties::Builder() .set_use_threads(use_threads_) @@ -326,9 +357,7 @@ void ParquetWriter::close() { 
TPCH_SCOPED_TIMER("parquet_encode_sync"); // Configure Parquet writer properties - auto writer_props = parquet::WriterProperties::Builder() - .compression(parquet::Compression::SNAPPY) - ->build(); + auto writer_props = make_writer_props(*first_batch_->schema(), compression_codec_); auto arrow_props = parquet::ArrowWriterProperties::Builder() .set_use_threads(use_threads_) diff --git a/third_party/dsdgen/CMakeLists.txt b/third_party/dsdgen/CMakeLists.txt new file mode 100644 index 0000000..db10c4c --- /dev/null +++ b/third_party/dsdgen/CMakeLists.txt @@ -0,0 +1,356 @@ +# TPC-DS dsdgen integration +# Compiles dsdgen C sources into dsdgen_objs object library. +# Also builds the distcomp tool and uses it to generate: +# - tpcds.idx : binary distribution data (needed at runtime) +# - tpcds.idx.h : compile-time header with distribution constants + +set(DSDGEN_SOURCE_DIR "${CMAKE_CURRENT_SOURCE_DIR}/../tpcds/tools") +if(NOT EXISTS "${DSDGEN_SOURCE_DIR}") + message(FATAL_ERROR + "dsdgen source not found at ${DSDGEN_SOURCE_DIR}. 
" + "Initialize the submodule: git submodule update --init third_party/tpcds") +endif() + +# --------------------------------------------------------------------------- +# Step 1a: Build mkheader — generates tables.h, streams.h, columns.h +# --------------------------------------------------------------------------- +add_executable(mkheader EXCLUDE_FROM_ALL + ${DSDGEN_SOURCE_DIR}/mkheader.c + ${DSDGEN_SOURCE_DIR}/porting.c +) +target_compile_definitions(mkheader PRIVATE LINUX=1) +target_include_directories(mkheader PRIVATE "${DSDGEN_SOURCE_DIR}") +set_target_properties(mkheader PROPERTIES + C_STANDARD 99 + C_EXTENSIONS OFF + RUNTIME_OUTPUT_DIRECTORY "${CMAKE_CURRENT_BINARY_DIR}" +) +target_compile_options(mkheader PRIVATE + -Wno-implicit-function-declaration + -Wno-unused-result +) + +# Run mkheader to generate tables.h, streams.h, columns.h +# mkheader reads column_list.txt from CWD and writes headers to CWD +set(TPCDS_TABLES_H "${CMAKE_CURRENT_BINARY_DIR}/tables.h") +set(TPCDS_COLUMNS_H "${CMAKE_CURRENT_BINARY_DIR}/columns.h") +set(TPCDS_STREAMS_H "${CMAKE_CURRENT_BINARY_DIR}/streams.h") + +add_custom_command( + OUTPUT "${TPCDS_TABLES_H}" "${TPCDS_COLUMNS_H}" "${TPCDS_STREAMS_H}" + COMMAND mkheader "${DSDGEN_SOURCE_DIR}/column_list.txt" + WORKING_DIRECTORY "${CMAKE_CURRENT_BINARY_DIR}" + DEPENDS mkheader "${DSDGEN_SOURCE_DIR}/column_list.txt" + COMMENT "Generating TPC-DS headers (tables.h, streams.h, columns.h)" +) + +add_custom_target(tpcds_headers_gen + DEPENDS "${TPCDS_TABLES_H}" "${TPCDS_COLUMNS_H}" "${TPCDS_STREAMS_H}") + +# --------------------------------------------------------------------------- +# Step 1b: Build distcomp — the distribution compiler tool +# --------------------------------------------------------------------------- +set(DISTCOMP_SOURCES + ${DSDGEN_SOURCE_DIR}/dcgram.c + ${DSDGEN_SOURCE_DIR}/dcomp.c + ${DSDGEN_SOURCE_DIR}/grammar.c + ${DSDGEN_SOURCE_DIR}/error_msg.c + ${DSDGEN_SOURCE_DIR}/StringBuffer.c + ${DSDGEN_SOURCE_DIR}/r_params.c + 
${DSDGEN_SOURCE_DIR}/porting.c +) + +add_executable(distcomp EXCLUDE_FROM_ALL ${DISTCOMP_SOURCES}) +target_compile_definitions(distcomp PRIVATE LINUX=1 TPCDS=1) +# Note: DECLARER is defined internally in dcomp.c (#define DECLARER at line 37), +# so we do NOT pass it as an external flag — that would cause double-definition. +# distcomp includes tdefs.h → tables.h, which is generated by mkheader +target_include_directories(distcomp PRIVATE + "${DSDGEN_SOURCE_DIR}" + "${CMAKE_CURRENT_BINARY_DIR}" +) +set_target_properties(distcomp PROPERTIES + C_STANDARD 99 + C_EXTENSIONS OFF + RUNTIME_OUTPUT_DIRECTORY "${CMAKE_CURRENT_BINARY_DIR}" +) + +# distcomp needs tables.h/columns.h/streams.h before it can compile +add_dependencies(distcomp tpcds_headers_gen) + +# Suppress warnings in third-party code. +# -fcommon: dcomp.h defines dcomp_tokens as a non-static global in a header; +# the strict mold linker rejects duplicate definitions unless -fcommon is set. +target_compile_options(distcomp PRIVATE + -fcommon + -Wno-implicit-function-declaration + -Wno-unused-result + -Wno-format + -Wno-deprecated-declarations +) +# dcomp.h defines dcomp_tokens as an initialized global array in a header file, +# causing duplicate definition errors when included by both dcomp.c and dcgram.c. +# This is a bug in the upstream dsdgen source; work around it at link time. 
+target_link_options(distcomp PRIVATE -Wl,--allow-multiple-definition) + +# --------------------------------------------------------------------------- +# Step 2: Run distcomp to generate tpcds.idx + tpcds.idx.h +# --------------------------------------------------------------------------- +set(TPCDS_IDX "${CMAKE_CURRENT_BINARY_DIR}/tpcds.idx") +set(TPCDS_IDX_H "${CMAKE_CURRENT_BINARY_DIR}/tpcds.idx.h") + +# All .dst source files distcomp reads +set(DST_FILES + ${DSDGEN_SOURCE_DIR}/tpcds.dst + ${DSDGEN_SOURCE_DIR}/calendar.dst + ${DSDGEN_SOURCE_DIR}/cities.dst + ${DSDGEN_SOURCE_DIR}/english.dst + ${DSDGEN_SOURCE_DIR}/fips.dst + ${DSDGEN_SOURCE_DIR}/items.dst + ${DSDGEN_SOURCE_DIR}/names.dst + ${DSDGEN_SOURCE_DIR}/scaling.dst + ${DSDGEN_SOURCE_DIR}/streets.dst +) + +add_custom_command( + OUTPUT "${TPCDS_IDX}" "${TPCDS_IDX_H}" + COMMAND distcomp + -i "${DSDGEN_SOURCE_DIR}/tpcds.dst" + -o "${TPCDS_IDX}" + # distcomp opens all .dst files relative to CWD, so run from source dir. + # When -h is not given, it auto-generates .h next to the -o file, + # i.e. tpcds.idx.h ends up in ${CMAKE_CURRENT_BINARY_DIR} alongside tpcds.idx. 
+ WORKING_DIRECTORY "${DSDGEN_SOURCE_DIR}" + DEPENDS distcomp ${DST_FILES} + COMMENT "Generating TPC-DS distribution index (tpcds.idx + tpcds.idx.h)" +) + +add_custom_target(tpcds_idx_gen DEPENDS "${TPCDS_IDX}" "${TPCDS_IDX_H}") + +# --------------------------------------------------------------------------- +# Step 2b: Embed tpcds.idx as a C byte array (dsts_generated.c) +# --------------------------------------------------------------------------- +set(DSTS_GENERATED_C "${CMAKE_CURRENT_BINARY_DIR}/dsts_generated.c") + +add_custom_command( + OUTPUT "${DSTS_GENERATED_C}" + COMMAND "${Python3_EXECUTABLE}" "${CMAKE_SOURCE_DIR}/cmake/gen_dsts.py" + "${TPCDS_IDX}" + "${DSTS_GENERATED_C}" + DEPENDS tpcds_idx_gen "${CMAKE_SOURCE_DIR}/cmake/gen_dsts.py" + COMMENT "Embedding tpcds.idx as C array (dsts_generated.c)" +) + +add_custom_target(tpcds_dsts_embedded DEPENDS "${DSTS_GENERATED_C}") + +# --------------------------------------------------------------------------- +# Step 2c: Pre-parse distribution values at build time (dist_cache_generated.c) +# +# Generates static const int[] / decimal_t[] arrays for all TKN_INT and +# TKN_DECIMAL value sets. dist.c's load_dist() points int_cache[]/dec_cache[] +# at these read-only arrays (no malloc, no atoi/strtodec) when EMBEDDED_DSDGEN. 
+# --------------------------------------------------------------------------- +set(DIST_CACHE_C "${CMAKE_CURRENT_BINARY_DIR}/dist_cache_generated.c") + +add_custom_command( + OUTPUT "${DIST_CACHE_C}" + COMMAND "${Python3_EXECUTABLE}" "${CMAKE_SOURCE_DIR}/cmake/gen_dist_cache.py" + "${TPCDS_IDX}" + "${DIST_CACHE_C}" + DEPENDS tpcds_idx_gen "${CMAKE_SOURCE_DIR}/cmake/gen_dist_cache.py" + COMMENT "Pre-parsing TPC-DS distribution values (dist_cache_generated.c)" +) + +add_custom_target(tpcds_dist_cache_gen DEPENDS "${DIST_CACHE_C}") + +# --------------------------------------------------------------------------- +# Step 3: Core dsdgen sources +# --------------------------------------------------------------------------- + +# COMMON_SRC (from makefile) minus driver.c (has main()), print.c (we replace it) +set(DSDGEN_COMMON_SOURCES + ${DSDGEN_SOURCE_DIR}/address.c + ${DSDGEN_SOURCE_DIR}/build_support.c + ${DSDGEN_SOURCE_DIR}/date.c + ${DSDGEN_SOURCE_DIR}/decimal.c + ${DSDGEN_SOURCE_DIR}/dist.c + ${DSDGEN_SOURCE_DIR}/error_msg.c + # expr.c and grammar_support.c are qgen-only (not in DBGEN_OBJ in makefile) + # They pull in flex scanner symbols not needed for data generation. 
+ ${DSDGEN_SOURCE_DIR}/genrand.c + ${DSDGEN_SOURCE_DIR}/join.c + ${DSDGEN_SOURCE_DIR}/list.c + ${DSDGEN_SOURCE_DIR}/load.c + ${DSDGEN_SOURCE_DIR}/misc.c + ${DSDGEN_SOURCE_DIR}/nulls.c + ${DSDGEN_SOURCE_DIR}/parallel.c + ${DSDGEN_SOURCE_DIR}/permute.c + ${DSDGEN_SOURCE_DIR}/pricing.c + ${DSDGEN_SOURCE_DIR}/r_params.c + ${DSDGEN_SOURCE_DIR}/StringBuffer.c + ${DSDGEN_SOURCE_DIR}/tdef_functions.c + ${DSDGEN_SOURCE_DIR}/tdefs.c + ${DSDGEN_SOURCE_DIR}/text.c + ${DSDGEN_SOURCE_DIR}/scd.c + ${DSDGEN_SOURCE_DIR}/scaling.c + ${DSDGEN_SOURCE_DIR}/release.c + ${DSDGEN_SOURCE_DIR}/sparse.c + ${DSDGEN_SOURCE_DIR}/validate.c + ${DSDGEN_SOURCE_DIR}/porting.c +) + +# S_SRC: store/update-mode table generators (referenced by tdef_functions.c) +set(DSDGEN_S_SOURCES + ${DSDGEN_SOURCE_DIR}/s_brand.c + ${DSDGEN_SOURCE_DIR}/s_customer_address.c + ${DSDGEN_SOURCE_DIR}/s_call_center.c + ${DSDGEN_SOURCE_DIR}/s_catalog.c + ${DSDGEN_SOURCE_DIR}/s_catalog_order.c + ${DSDGEN_SOURCE_DIR}/s_catalog_order_lineitem.c + ${DSDGEN_SOURCE_DIR}/s_catalog_page.c + ${DSDGEN_SOURCE_DIR}/s_catalog_promotional_item.c + ${DSDGEN_SOURCE_DIR}/s_catalog_returns.c + ${DSDGEN_SOURCE_DIR}/s_category.c + ${DSDGEN_SOURCE_DIR}/s_class.c + ${DSDGEN_SOURCE_DIR}/s_company.c + ${DSDGEN_SOURCE_DIR}/s_customer.c + ${DSDGEN_SOURCE_DIR}/s_division.c + ${DSDGEN_SOURCE_DIR}/s_inventory.c + ${DSDGEN_SOURCE_DIR}/s_item.c + ${DSDGEN_SOURCE_DIR}/s_manager.c + ${DSDGEN_SOURCE_DIR}/s_manufacturer.c + ${DSDGEN_SOURCE_DIR}/s_market.c + ${DSDGEN_SOURCE_DIR}/s_pline.c + ${DSDGEN_SOURCE_DIR}/s_product.c + ${DSDGEN_SOURCE_DIR}/s_promotion.c + ${DSDGEN_SOURCE_DIR}/s_purchase.c + ${DSDGEN_SOURCE_DIR}/s_reason.c + ${DSDGEN_SOURCE_DIR}/s_store.c + ${DSDGEN_SOURCE_DIR}/s_store_promotional_item.c + ${DSDGEN_SOURCE_DIR}/s_store_returns.c + ${DSDGEN_SOURCE_DIR}/s_subcategory.c + ${DSDGEN_SOURCE_DIR}/s_subclass.c + ${DSDGEN_SOURCE_DIR}/s_warehouse.c + ${DSDGEN_SOURCE_DIR}/s_web_order.c + ${DSDGEN_SOURCE_DIR}/s_web_order_lineitem.c + 
${DSDGEN_SOURCE_DIR}/s_web_page.c + ${DSDGEN_SOURCE_DIR}/s_web_promotinal_item.c + ${DSDGEN_SOURCE_DIR}/s_web_returns.c + ${DSDGEN_SOURCE_DIR}/s_web_site.c + ${DSDGEN_SOURCE_DIR}/s_zip_to_gmt.c +) + +# W_SRC: warehouse (TPC-DS standard schema) table generators +set(DSDGEN_W_SOURCES + ${DSDGEN_SOURCE_DIR}/w_call_center.c + ${DSDGEN_SOURCE_DIR}/w_catalog_page.c + ${DSDGEN_SOURCE_DIR}/w_catalog_returns.c + ${DSDGEN_SOURCE_DIR}/w_catalog_sales.c + ${DSDGEN_SOURCE_DIR}/w_customer_address.c + ${DSDGEN_SOURCE_DIR}/w_customer.c + ${DSDGEN_SOURCE_DIR}/w_customer_demographics.c + ${DSDGEN_SOURCE_DIR}/w_datetbl.c + ${DSDGEN_SOURCE_DIR}/w_household_demographics.c + ${DSDGEN_SOURCE_DIR}/w_income_band.c + ${DSDGEN_SOURCE_DIR}/w_inventory.c + ${DSDGEN_SOURCE_DIR}/w_item.c + ${DSDGEN_SOURCE_DIR}/w_promotion.c + ${DSDGEN_SOURCE_DIR}/w_reason.c + ${DSDGEN_SOURCE_DIR}/w_ship_mode.c + ${DSDGEN_SOURCE_DIR}/w_store.c + ${DSDGEN_SOURCE_DIR}/w_store_returns.c + ${DSDGEN_SOURCE_DIR}/w_store_sales.c + ${DSDGEN_SOURCE_DIR}/w_timetbl.c + ${DSDGEN_SOURCE_DIR}/w_warehouse.c + ${DSDGEN_SOURCE_DIR}/w_web_page.c + ${DSDGEN_SOURCE_DIR}/w_web_returns.c + ${DSDGEN_SOURCE_DIR}/w_web_sales.c + ${DSDGEN_SOURCE_DIR}/w_web_site.c + ${DSDGEN_SOURCE_DIR}/dbgen_version.c +) + +# driver.c provides gen_tbl() and other generation helpers we need, +# but also contains main(). We rename main() at compile time to avoid +# a symbol conflict with our tpcds_benchmark executable. 
+set(DSDGEN_DRIVER_SOURCE + ${DSDGEN_SOURCE_DIR}/driver.c + ${DSDGEN_SOURCE_DIR}/print.c +) + +# Our stubs: intercept file I/O for distribution data, provide dsdgen init API +set(DSDGEN_STUB_SOURCE + "${CMAKE_CURRENT_SOURCE_DIR}/dsdgen_stubs.c" +) + +# --------------------------------------------------------------------------- +# Step 4: Build the object library +# --------------------------------------------------------------------------- +add_library(dsdgen_objs OBJECT + ${DSDGEN_COMMON_SOURCES} + ${DSDGEN_S_SOURCES} + ${DSDGEN_W_SOURCES} + ${DSDGEN_DRIVER_SOURCE} + ${DSDGEN_STUB_SOURCE} + "${DSTS_GENERATED_C}" + "${DIST_CACHE_C}" +) + +# Must wait for all generated headers + embedded dist data + cache before compiling +add_dependencies(dsdgen_objs tpcds_headers_gen tpcds_idx_gen tpcds_dsts_embedded tpcds_dist_cache_gen) + +# Rename main() in driver.c so it doesn't conflict with tpcds_benchmark's main() +set_source_files_properties( + "${DSDGEN_SOURCE_DIR}/driver.c" + PROPERTIES COMPILE_DEFINITIONS "main=dsdgen_driver_main_" +) + +target_compile_definitions(dsdgen_objs PRIVATE + LINUX=1 + TPCDS=1 + EMBEDDED_DSDGEN=1 + # Note: DECLARER is defined internally at the top of driver.c only. + # Do NOT set it globally: it causes params.h to initialize the option array + # in every TU, requiring SetScaleIndex/etc. to be declared before params.h. 
+) + +# Include both the original source dir (for all the *.h files) and the binary +# dir where the generated tpcds.idx.h lands +target_include_directories(dsdgen_objs PUBLIC + "${DSDGEN_SOURCE_DIR}" + "${CMAKE_CURRENT_BINARY_DIR}" + "${CMAKE_CURRENT_SOURCE_DIR}" +) + +set_target_properties(dsdgen_objs PROPERTIES + C_STANDARD 99 + C_EXTENSIONS OFF +) + +# Suppress warnings in third-party generated/legacy C code +target_compile_options(dsdgen_objs PRIVATE + -Wno-implicit-function-declaration + -Wno-unused-result + -Wno-format + -Wno-deprecated-declarations + -Wno-misleading-indentation + -Wno-unused-variable + -Wno-sign-compare + -Wno-implicit-fallthrough + -Wno-conversion + -Wno-error +) + +# Expose objects and include paths to parent CMakeLists.txt +set(DSDGEN_OBJECTS $ PARENT_SCOPE) +set(DSDGEN_INCLUDE_DIRS + "${DSDGEN_SOURCE_DIR}" + "${CMAKE_CURRENT_BINARY_DIR}" + "${CMAKE_CURRENT_SOURCE_DIR}" + PARENT_SCOPE) +set(TPCDS_IDX_FILE "${TPCDS_IDX}" PARENT_SCOPE) + +message(STATUS "dsdgen configured:") +message(STATUS " Source directory: ${DSDGEN_SOURCE_DIR}") +message(STATUS " Binary directory: ${CMAKE_CURRENT_BINARY_DIR}") +message(STATUS " tpcds.idx will be generated at: ${TPCDS_IDX}") +message(STATUS " tpcds.idx.h will be generated at: ${TPCDS_IDX_H}") diff --git a/third_party/dsdgen/dsdgen_stubs.c b/third_party/dsdgen/dsdgen_stubs.c new file mode 100644 index 0000000..3f4d6ae --- /dev/null +++ b/third_party/dsdgen/dsdgen_stubs.c @@ -0,0 +1,10 @@ +/* + * dsdgen_stubs.c + * + * Placeholder for any stub implementations needed when embedding dsdgen + * as a library. Mirrors src/dbgen/dbgen_stubs.c for TPC-H dbgen. + * + * Currently empty: all required symbols are provided by the dsdgen sources + * themselves. The main() collision in driver.c is handled at compile time + * via -Dmain=dsdgen_driver_main_ in CMakeLists.txt. 
+ */ diff --git a/third_party/dsdgen/tpcds_dsdgen.h b/third_party/dsdgen/tpcds_dsdgen.h new file mode 100644 index 0000000..adbde73 --- /dev/null +++ b/third_party/dsdgen/tpcds_dsdgen.h @@ -0,0 +1,125 @@ +/** + * tpcds_dsdgen.h — C++-safe entry point for TPC-DS dsdgen + * + * Instead of manually re-declaring struct definitions (which can silently + * diverge from the real tpcds sources), this header directly includes the + * canonical tpcds w_*.h table headers. All struct layouts (CALL_CENTER_TBL, + * ds_addr_t, etc.) therefore always match the generator implementation. + * + * Include this inside an extern "C" { } block from C++ translation units: + * + * extern "C" { + * #include "tpcds_dsdgen.h" + * } + * + * The dsdgen_objs CMake target exposes the tpcds/tools/ source directory and + * the build-time generated header directory (columns.h, tables.h, streams.h) + * as PUBLIC include paths, so all includes resolve correctly. + */ + +#ifndef TPCDS_DSDGEN_H +#define TPCDS_DSDGEN_H + +#include +#include + +/* ------------------------------------------------------------------------- + * Core tpcds types: ds_key_t, decimal_t, ds_pricing_t, ds_addr_t + * decimal.h pulls in config.h → porting.h (ds_key_t) + mathops.h. + * pricing.h pulls in decimal.h. + * address.h pulls in constants.h. + * ------------------------------------------------------------------------- */ +#include "decimal.h" +#include "pricing.h" +#include "address.h" + +/* ------------------------------------------------------------------------- + * All 24 W_ table struct definitions — use the canonical tpcds sources. + * Each w_*.h provides the struct definition and mk_w_* / pr_w_* / ld_w_* + * function declarations. 
+ * ------------------------------------------------------------------------- */ +#include "w_store_sales.h" +#include "w_inventory.h" +#include "w_catalog_sales.h" +#include "w_web_sales.h" +#include "w_customer.h" +#include "w_item.h" +#include "w_datetbl.h" +#include "w_store_returns.h" +#include "w_catalog_returns.h" +#include "w_web_returns.h" +#include "w_call_center.h" +#include "w_catalog_page.h" +#include "w_web_page.h" +#include "w_web_site.h" +#include "w_warehouse.h" +#include "w_ship_mode.h" +#include "w_household_demographics.h" +#include "w_customer_demographics.h" +#include "w_customer_address.h" +#include "w_income_band.h" +#include "w_reason.h" +#include "w_timetbl.h" +#include "w_promotion.h" +#include "w_store.h" + +/* ------------------------------------------------------------------------- + * Utility headers: table ID constants, scaling, params, RNG init. + * tables.h is build-generated (columns.h, streams.h likewise). + * ------------------------------------------------------------------------- */ +#include "tables.h" /* CALL_CENTER=0, STORE_SALES=17, WAREHOUSE=19, … */ +#include "scaling.h" /* get_rowcount(), getIDCount() */ +#include "r_params.h" /* set_str(), set_int(), init_params() */ +#include "genrand.h" /* init_rand() */ + +/* ------------------------------------------------------------------------- + * TPCDS_* aliases — thin wrappers around the native tables.h constants. + * C++ code must use PascalCase TableType enum values (not these macros) + * to avoid collision with the ALL_CAPS macros defined in tables.h. + * These aliases exist only for internal use within C-linkage code that + * calls get_rowcount() with a table ID. 
+ * ------------------------------------------------------------------------- */ +#define TPCDS_CALL_CENTER CALL_CENTER +#define TPCDS_CATALOG_PAGE CATALOG_PAGE +#define TPCDS_CATALOG_RETURNS CATALOG_RETURNS +#define TPCDS_CATALOG_SALES CATALOG_SALES +#define TPCDS_CUSTOMER CUSTOMER +#define TPCDS_CUSTOMER_ADDRESS CUSTOMER_ADDRESS +#define TPCDS_CUSTOMER_DEMOGRAPHICS CUSTOMER_DEMOGRAPHICS +#define TPCDS_DATE DATE +#define TPCDS_HOUSEHOLD_DEMOGRAPHICS HOUSEHOLD_DEMOGRAPHICS +#define TPCDS_INCOME_BAND INCOME_BAND +#define TPCDS_INVENTORY INVENTORY +#define TPCDS_ITEM ITEM +#define TPCDS_PROMOTION PROMOTION +#define TPCDS_REASON REASON +#define TPCDS_SHIP_MODE SHIP_MODE +#define TPCDS_STORE STORE +#define TPCDS_STORE_RETURNS STORE_RETURNS +#define TPCDS_STORE_SALES STORE_SALES +#define TPCDS_TIME TIME +#define TPCDS_WAREHOUSE WAREHOUSE +#define TPCDS_WEB_PAGE WEB_PAGE +#define TPCDS_WEB_RETURNS WEB_RETURNS +#define TPCDS_WEB_SALES WEB_SALES +#define TPCDS_WEB_SITE WEB_SITE + +/* ------------------------------------------------------------------------- + * Embedded-mode callbacks — our additions to the tpcds C sources, compiled + * in when EMBEDDED_DSDGEN is defined. The callbacks replace pr_w_*() file + * output with in-process row delivery for the master-detail tables. 
+ * ------------------------------------------------------------------------- */ +#ifdef EMBEDDED_DSDGEN +extern void (*g_w_store_sales_callback)(const struct W_STORE_SALES_TBL *row, void *ctx); +extern void *g_w_store_sales_callback_ctx; +extern void (*g_w_catalog_sales_callback)(const struct W_CATALOG_SALES_TBL *row, void *ctx); +extern void *g_w_catalog_sales_callback_ctx; +extern void (*g_w_web_sales_callback)(const struct W_WEB_SALES_TBL *row, void *ctx); +extern void *g_w_web_sales_callback_ctx; +#endif /* EMBEDDED_DSDGEN */ + +/* Embedded distribution data (from dsts_generated.c) */ +extern const uint8_t tpcds_idx_data[]; +extern const size_t tpcds_idx_size; + +#endif /* TPCDS_DSDGEN_H */ diff --git a/third_party/lance-ffi/src/lib.rs b/third_party/lance-ffi/src/lib.rs index a52c4e8..292c9b8 100644 --- a/third_party/lance-ffi/src/lib.rs +++ b/third_party/lance-ffi/src/lib.rs @@ -4,7 +4,11 @@ use std::ffi::CStr; use std::os::raw::{c_char, c_int, c_void}; use std::panic::{catch_unwind, AssertUnwindSafe}; +use std::sync::mpsc::{sync_channel, Receiver}; use std::sync::Arc; +use std::sync::atomic::{AtomicU64, Ordering}; +use std::thread; +use std::time::Instant; use arrow::ffi::{FFI_ArrowSchema, FFI_ArrowArray}; use arrow::ffi_stream::{FFI_ArrowArrayStream, ArrowArrayStreamReader}; @@ -72,8 +76,11 @@ pub struct LanceWriterHandle { row_count: usize, closed: bool, runtime: Runtime, + use_streaming: bool, backend: WriterBackend, write_params: WriteParamsConfig, + runtime_config: RuntimeConfig, + profile_config: ProfileConfig, } const FLUSH_BATCH_THRESHOLD: usize = 200; @@ -86,6 +93,141 @@ struct WriteParamsConfig { max_bytes_per_file: usize, skip_auto_cleanup: bool, use_io_uring: bool, + scatter_gather_batches: usize, + scatter_gather_queue_chunks: usize, + buffered_flush_batch_threshold: usize, + buffered_flush_row_threshold: usize, +} + +#[derive(Debug, Clone, Copy)] +struct RuntimeConfig { + /// Cap Tokio blocking pool size to avoid large stack reservations. 
+ max_blocking_threads: usize, +} + +#[derive(Debug, Clone, Copy)] +struct ProfileConfig { + enable_mem_profile: bool, + report_every_batches: usize, +} + +impl Default for ProfileConfig { + fn default() -> Self { + Self { + enable_mem_profile: false, + report_every_batches: 100, + } + } +} + +impl Default for RuntimeConfig { + fn default() -> Self { + Self { + max_blocking_threads: 8, + } + } +} + +fn current_rss_kb() -> Option { + let status = std::fs::read_to_string("/proc/self/status").ok()?; + for line in status.lines() { + if !line.starts_with("VmRSS:") { + continue; + } + let value = line.split_whitespace().nth(1)?; + return value.parse::().ok(); + } + None +} + +fn estimate_batch_bytes(batch: &RecordBatch) -> u64 { + let mut total: u64 = 0; + for col in batch.columns() { + let data = col.to_data(); + for buf in data.buffers() { + total = total.saturating_add(buf.len() as u64); + } + } + total +} + +#[derive(Default)] +struct StreamCopyStats { + reader_batches: AtomicU64, + reader_rows: AtomicU64, + reader_input_bytes: AtomicU64, + reader_rewrap_bytes: AtomicU64, + sg_queue_current_bytes: AtomicU64, + sg_queue_peak_bytes: AtomicU64, + sg_queue_enqueued_bytes: AtomicU64, + sg_queue_chunks: AtomicU64, +} + +impl StreamCopyStats { + fn note_reader_batch(&self, rows: usize, input_bytes: u64, rewrap_bytes: u64) { + self.reader_batches.fetch_add(1, Ordering::Relaxed); + self.reader_rows.fetch_add(rows as u64, Ordering::Relaxed); + self.reader_input_bytes.fetch_add(input_bytes, Ordering::Relaxed); + self.reader_rewrap_bytes.fetch_add(rewrap_bytes, Ordering::Relaxed); + } + + fn note_sg_chunk_enqueued(&self, chunk_bytes: u64) { + self.sg_queue_enqueued_bytes.fetch_add(chunk_bytes, Ordering::Relaxed); + self.sg_queue_chunks.fetch_add(1, Ordering::Relaxed); + let cur = self + .sg_queue_current_bytes + .fetch_add(chunk_bytes, Ordering::Relaxed) + .saturating_add(chunk_bytes); + let mut peak = self.sg_queue_peak_bytes.load(Ordering::Relaxed); + while cur > peak { + 
match self.sg_queue_peak_bytes.compare_exchange_weak( + peak, + cur, + Ordering::Relaxed, + Ordering::Relaxed, + ) { + Ok(_) => break, + Err(next) => peak = next, + } + } + } + + fn note_sg_chunk_dequeued(&self, chunk_bytes: u64) { + let _ = self.sg_queue_current_bytes.fetch_update( + Ordering::Relaxed, + Ordering::Relaxed, + |cur| Some(cur.saturating_sub(chunk_bytes)), + ); + } + + fn log_summary(&self) { + eprintln!( + "Lance FFI copy: reader_batches={} reader_rows={} reader_input_bytes={} reader_rewrap_bytes={} sg_queue_enqueued_bytes={} sg_queue_chunks={} sg_queue_peak_bytes={} sg_queue_current_bytes={}", + self.reader_batches.load(Ordering::Relaxed), + self.reader_rows.load(Ordering::Relaxed), + self.reader_input_bytes.load(Ordering::Relaxed), + self.reader_rewrap_bytes.load(Ordering::Relaxed), + self.sg_queue_enqueued_bytes.load(Ordering::Relaxed), + self.sg_queue_chunks.load(Ordering::Relaxed), + self.sg_queue_peak_bytes.load(Ordering::Relaxed), + self.sg_queue_current_bytes.load(Ordering::Relaxed), + ); + } +} + +fn log_mem_stage(profile: ProfileConfig, stage: &str, elapsed: Option) { + if !profile.enable_mem_profile { + return; + } + let rss = current_rss_kb().unwrap_or(0); + if let Some(sec) = elapsed { + eprintln!( + "Lance FFI mem: stage={} rss_kb={} elapsed_s={:.6}", + stage, rss, sec + ); + } else { + eprintln!("Lance FFI mem: stage={} rss_kb={}", stage, rss); + } } impl Default for WriteParamsConfig { @@ -96,31 +238,36 @@ impl Default for WriteParamsConfig { max_bytes_per_file: 0, skip_auto_cleanup: false, use_io_uring: false, + scatter_gather_batches: 1, + scatter_gather_queue_chunks: 4, + buffered_flush_batch_threshold: FLUSH_BATCH_THRESHOLD, + buffered_flush_row_threshold: FLUSH_ROW_THRESHOLD, } } } impl LanceWriterHandle { - fn new(uri: String, use_streaming: bool) -> Result { - // Buffered path: all work happens synchronously inside block_on() calls. - // A single-threaded executor is sufficient and avoids thread pool overhead. 
- // - // Streaming path: exactly one background task runs the Lance consumer. - // More than 1 worker thread adds unnecessary context-switch overhead and - // cross-core cache coherency cost without any parallelism benefit. - let runtime = if use_streaming { - tokio::runtime::Builder::new_multi_thread() - .worker_threads(1) - .enable_all() - .build() - .map_err(|e| format!("Failed to create Tokio runtime: {}", e))? + fn build_runtime(use_streaming: bool, runtime_config: RuntimeConfig) -> Result { + let max_blocking_threads = runtime_config.max_blocking_threads.max(1); + let mut builder = if use_streaming { + let mut b = tokio::runtime::Builder::new_multi_thread(); + b.worker_threads(1); + b } else { tokio::runtime::Builder::new_current_thread() - .enable_all() - .build() - .map_err(|e| format!("Failed to create Tokio runtime: {}", e))? }; + builder + .max_blocking_threads(max_blocking_threads) + .enable_all() + .build() + .map_err(|e| format!("Failed to create Tokio runtime: {}", e)) + } + + fn new(uri: String, use_streaming: bool) -> Result { + let runtime_config = RuntimeConfig::default(); + let runtime = Self::build_runtime(use_streaming, runtime_config)?; + let backend = if use_streaming { WriterBackend::Streaming { task: None, // Initialized when stream is provided @@ -142,11 +289,47 @@ impl LanceWriterHandle { row_count: 0, closed: false, runtime, + use_streaming, backend, write_params: WriteParamsConfig::default(), + runtime_config, + profile_config: ProfileConfig::default(), }) } + fn set_runtime_config(&mut self, max_blocking_threads: usize) -> Result<(), String> { + let WriterBackend::Streaming { task } = &self.backend else { + return Ok(()); + }; + if task.is_some() { + return Err("Cannot change runtime config after streaming task has started".to_string()); + } + + if max_blocking_threads > 0 { + self.runtime_config.max_blocking_threads = max_blocking_threads; + } + self.runtime = Self::build_runtime(self.use_streaming, self.runtime_config)?; + Ok(()) + 
} + + fn set_profile_config(&mut self, enable_mem_profile: bool, report_every_batches: usize) -> Result<(), String> { + let WriterBackend::Streaming { task } = &self.backend else { + self.profile_config.enable_mem_profile = enable_mem_profile; + if report_every_batches > 0 { + self.profile_config.report_every_batches = report_every_batches; + } + return Ok(()); + }; + if task.is_some() { + return Err("Cannot change profile config after streaming task has started".to_string()); + } + self.profile_config.enable_mem_profile = enable_mem_profile; + if report_every_batches > 0 { + self.profile_config.report_every_batches = report_every_batches; + } + Ok(()) + } + fn import_ffi_batch(arrow_array_ptr: *mut FFI_ArrowArray, arrow_schema_ptr: *mut FFI_ArrowSchema) -> Result { unsafe { // TAKING OWNERSHIP: We convert raw pointers to unsafe FFI structs. @@ -224,19 +407,56 @@ impl LanceWriterHandle { unsafe { libc::free(stream_ptr as *mut c_void) }; let reader = result.map_err(|e| format!("Failed to import ArrowArrayStream: {}", e))?; - let compressed_schema = Arc::new(apply_compression_metadata(reader.schema().as_ref())); - let compression_reader = CompressionReader::new(reader, compressed_schema); - let source: Box = Box::new(compression_reader); + let profile = self.profile_config; + let copy_stats = Arc::new(StreamCopyStats::default()); + let source: Box = if config.scatter_gather_batches > 1 { + Box::new(ScatterGatherReader::spawn( + reader, + profile, + config.scatter_gather_batches, + config.scatter_gather_queue_chunks, + copy_stats.clone(), + )?) 
+ } else { + let compressed_schema = Arc::new(apply_compression_metadata(reader.schema().as_ref())); + let compression_reader = + CompressionReader::new(reader, compressed_schema, profile, copy_stats.clone()); + Box::new(compression_reader) + }; let uri_clone = self.uri.clone(); let write_params = build_write_params_from(config, WriteMode::Overwrite); + let copy_stats_for_task = copy_stats.clone(); eprintln!("Lance FFI: Starting streaming background task with Arrow C Stream..."); + eprintln!( + "Lance FFI: Tokio runtime mode=multi-thread(1 worker), max_blocking_threads={}", + self.runtime_config.max_blocking_threads + ); + if self.profile_config.enable_mem_profile { + eprintln!( + "Lance FFI mem: enabled=1 report_every_batches={}", + self.profile_config.report_every_batches + ); + } + eprintln!( + "Lance FFI: scatter/gather batches_per_chunk={}, queue_chunks={}", + config.scatter_gather_batches, + config.scatter_gather_queue_chunks + ); let task_handle = self.runtime.spawn(async move { + log_mem_stage(profile, "stream_task_start", None); + let stream_begin = Instant::now(); + log_mem_stage(profile, "before_execute_uncommitted_stream", None); let transaction = lance::dataset::InsertBuilder::new(&uri_clone) .with_params(&write_params) .execute_uncommitted_stream(source) .await?; + log_mem_stage( + profile, + "after_execute_uncommitted_stream", + Some(stream_begin.elapsed().as_secs_f64()), + ); let mut commit_builder = CommitBuilder::new(&uri_clone) .use_stable_row_ids(write_params.enable_stable_row_ids) @@ -256,7 +476,16 @@ impl LanceWriterHandle { commit_builder = commit_builder.with_session(session); } - commit_builder.execute(transaction).await.map(|_| ()) + let commit_begin = Instant::now(); + log_mem_stage(profile, "before_commit_execute", None); + let result = commit_builder.execute(transaction).await.map(|_| ()); + log_mem_stage( + profile, + "after_commit_execute", + Some(commit_begin.elapsed().as_secs_f64()), + ); + copy_stats_for_task.log_summary(); + 
result }); *task = Some(task_handle); @@ -298,11 +527,25 @@ fn build_write_params_from(config: WriteParamsConfig, mode: WriteMode) -> WriteP struct CompressionReader { inner: ArrowArrayStreamReader, schema: Arc, + profile: ProfileConfig, + batch_count: usize, + copy_stats: Arc, } impl CompressionReader { - fn new(inner: ArrowArrayStreamReader, schema: Arc) -> Self { - Self { inner, schema } + fn new( + inner: ArrowArrayStreamReader, + schema: Arc, + profile: ProfileConfig, + copy_stats: Arc, + ) -> Self { + Self { + inner, + schema, + profile, + batch_count: 0, + copy_stats, + } } } @@ -318,12 +561,181 @@ impl Iterator for CompressionReader { fn next(&mut self) -> Option { self.inner.next().map(|res| { res.and_then(|batch| { - RecordBatch::try_new(self.schema.clone(), batch.columns().to_vec()) + self.batch_count += 1; + let input_bytes = estimate_batch_bytes(&batch); + if self.profile.enable_mem_profile + && (self.batch_count <= 3 + || self.batch_count % self.profile.report_every_batches == 0) + { + let rss = current_rss_kb().unwrap_or(0); + eprintln!( + "Lance FFI mem: stage=reader_next batch={} rows={} rss_kb={}", + self.batch_count, + batch.num_rows(), + rss + ); + } + let out = RecordBatch::try_new(self.schema.clone(), batch.columns().to_vec())?; + let rewrap_bytes = estimate_batch_bytes(&out); + self.copy_stats + .note_reader_batch(out.num_rows(), input_bytes, rewrap_bytes); + Ok(out) }) }) } } +enum ScatterGatherMsg { + Chunk { batches: Vec, chunk_bytes: u64 }, + End, + Err(String), +} + +struct ScatterGatherReader { + schema: Arc, + rx: Receiver, + current_chunk: Vec, + chunk_idx: usize, + copy_stats: Arc, +} + +impl ScatterGatherReader { + fn spawn( + mut inner: ArrowArrayStreamReader, + profile: ProfileConfig, + batches_per_chunk: usize, + queue_chunks: usize, + copy_stats: Arc, + ) -> Result { + let schema = Arc::new(apply_compression_metadata(inner.schema().as_ref())); + let (tx, rx) = sync_channel::(queue_chunks.max(1)); + let out_schema = 
schema.clone(); + let chunk_size = batches_per_chunk.max(1); + let stats = copy_stats.clone(); + + thread::spawn(move || { + let mut chunk = Vec::with_capacity(chunk_size); + let mut chunk_bytes: u64 = 0; + let mut seen_batches: usize = 0; + loop { + let next = inner.next(); + match next { + Some(Ok(batch)) => { + let input_bytes = estimate_batch_bytes(&batch); + let out_batch = match RecordBatch::try_new(out_schema.clone(), batch.columns().to_vec()) { + Ok(b) => b, + Err(e) => { + let _ = tx.send(ScatterGatherMsg::Err(format!( + "Scatter/gather schema rewrite failed: {}", + e + ))); + return; + } + }; + let out_bytes = estimate_batch_bytes(&out_batch); + stats.note_reader_batch(out_batch.num_rows(), input_bytes, out_bytes); + seen_batches += 1; + if profile.enable_mem_profile + && (seen_batches <= 3 + || seen_batches % profile.report_every_batches == 0) + { + let rss = current_rss_kb().unwrap_or(0); + eprintln!( + "Lance FFI mem: stage=sg_reader_next batch={} rows={} rss_kb={}", + seen_batches, + out_batch.num_rows(), + rss + ); + } + chunk_bytes = chunk_bytes.saturating_add(out_bytes); + chunk.push(out_batch); + if chunk.len() >= chunk_size { + let send_bytes = chunk_bytes; + if tx + .send(ScatterGatherMsg::Chunk { + batches: std::mem::take(&mut chunk), + chunk_bytes: send_bytes, + }) + .is_err() + { + return; + } + stats.note_sg_chunk_enqueued(send_bytes); + chunk_bytes = 0; + } + } + Some(Err(e)) => { + let _ = tx.send(ScatterGatherMsg::Err(format!("Scatter/gather reader error: {}", e))); + return; + } + None => { + if !chunk.is_empty() { + let send_bytes = chunk_bytes; + if tx + .send(ScatterGatherMsg::Chunk { + batches: chunk, + chunk_bytes: send_bytes, + }) + .is_ok() + { + stats.note_sg_chunk_enqueued(send_bytes); + } + } + let _ = tx.send(ScatterGatherMsg::End); + return; + } + } + } + }); + + Ok(Self { + schema, + rx, + current_chunk: Vec::new(), + chunk_idx: 0, + copy_stats, + }) + } +} + +impl RecordBatchReader for ScatterGatherReader { + fn 
schema(&self) -> Arc { + self.schema.clone() + } +} + +impl Iterator for ScatterGatherReader { + type Item = std::result::Result; + + fn next(&mut self) -> Option { + if self.chunk_idx < self.current_chunk.len() { + let out = self.current_chunk[self.chunk_idx].clone(); + self.chunk_idx += 1; + return Some(Ok(out)); + } + self.current_chunk.clear(); + self.chunk_idx = 0; + + match self.rx.recv() { + Ok(ScatterGatherMsg::Chunk { batches, chunk_bytes }) => { + self.copy_stats.note_sg_chunk_dequeued(chunk_bytes); + self.current_chunk = batches; + if self.current_chunk.is_empty() { + return self.next(); + } + let out = self.current_chunk[0].clone(); + self.chunk_idx = 1; + Some(Ok(out)) + } + Ok(ScatterGatherMsg::End) => None, + Ok(ScatterGatherMsg::Err(msg)) => Some(Err(ArrowError::ExternalError(Box::new( + std::io::Error::new(std::io::ErrorKind::Other, msg), + )))), + Err(_) => None, + } + } +} + // C Interface Exports #[no_mangle] @@ -375,7 +787,9 @@ pub extern "C" fn lance_writer_write_batch(writer_ptr: *mut LanceWriterHandle, a WriterBackend::Buffered { batches, pending_row_count, .. } => { *pending_row_count += record_batch.num_rows(); batches.push(record_batch); - if batches.len() >= FLUSH_BATCH_THRESHOLD || *pending_row_count >= FLUSH_ROW_THRESHOLD { + let flush_batch_threshold = writer.write_params.buffered_flush_batch_threshold.max(1); + let flush_row_threshold = writer.write_params.buffered_flush_row_threshold.max(1); + if batches.len() >= flush_batch_threshold || *pending_row_count >= flush_row_threshold { if let Err(e) = writer.flush_batches() { eprintln!("Flush Error: {}", e); return 5; } } }, @@ -447,6 +861,125 @@ pub extern "C" fn lance_writer_set_write_params( })).unwrap_or(3) } +/// Configure scatter/gather stream mode. +/// +/// batches_per_chunk: +/// 1 = disabled (default) +/// >1 = producer groups this many RecordBatches per queue chunk +/// +/// queue_chunks: +/// Number of chunk slots in the bounded producer/consumer queue. 
+#[no_mangle] +pub extern "C" fn lance_writer_set_scatter_gather_config( + writer_ptr: *mut LanceWriterHandle, + batches_per_chunk: c_int, + queue_chunks: c_int, +) -> c_int { + catch_unwind(AssertUnwindSafe(|| { + if writer_ptr.is_null() { return 1; } + let writer = unsafe { &mut *writer_ptr }; + if writer.closed { return 2; } + if let WriterBackend::Streaming { task } = &writer.backend { + if task.is_some() { + eprintln!("Scatter/Gather Config Error: cannot change after stream start"); + return 5; + } + } + if batches_per_chunk > 0 { + writer.write_params.scatter_gather_batches = batches_per_chunk as usize; + } + if queue_chunks > 0 { + writer.write_params.scatter_gather_queue_chunks = queue_chunks as usize; + } + 0 + })).unwrap_or(3) +} + +/// Configure flush thresholds for buffered backend. +#[no_mangle] +pub extern "C" fn lance_writer_set_buffered_flush_config( + writer_ptr: *mut LanceWriterHandle, + batch_threshold: c_int, + row_threshold: c_int, +) -> c_int { + catch_unwind(AssertUnwindSafe(|| { + if writer_ptr.is_null() { return 1; } + let writer = unsafe { &mut *writer_ptr }; + if writer.closed { return 2; } + if let WriterBackend::Streaming { task } = &writer.backend { + if task.is_some() { + eprintln!("Buffered Flush Config Error: cannot change after stream start"); + return 5; + } + } + if batch_threshold > 0 { + writer.write_params.buffered_flush_batch_threshold = batch_threshold as usize; + } + if row_threshold > 0 { + writer.write_params.buffered_flush_row_threshold = row_threshold as usize; + } + 0 + })).unwrap_or(3) +} + +/// Configure Tokio runtime for streaming mode. +/// +/// max_blocking_threads: +/// 0 = keep current value +/// >0 = set blocking pool cap +/// +/// Must be called before lance_writer_start_stream(). 
+#[no_mangle] +pub extern "C" fn lance_writer_set_runtime_config( + writer_ptr: *mut LanceWriterHandle, + max_blocking_threads: c_int, +) -> c_int { + catch_unwind(AssertUnwindSafe(|| { + if writer_ptr.is_null() { return 1; } + let writer = unsafe { &mut *writer_ptr }; + if writer.closed { return 2; } + + let max_threads = if max_blocking_threads > 0 { + max_blocking_threads as usize + } else { + 0 + }; + match writer.set_runtime_config(max_threads) { + Ok(_) => 0, + Err(e) => { + eprintln!("Runtime Config Error: {}", e); + 5 + } + } + })).unwrap_or(3) +} + +#[no_mangle] +pub extern "C" fn lance_writer_set_profile_config( + writer_ptr: *mut LanceWriterHandle, + enable_mem_profile: c_int, + report_every_batches: c_int, +) -> c_int { + catch_unwind(AssertUnwindSafe(|| { + if writer_ptr.is_null() { return 1; } + let writer = unsafe { &mut *writer_ptr }; + if writer.closed { return 2; } + + let every = if report_every_batches > 0 { + report_every_batches as usize + } else { + 0 + }; + match writer.set_profile_config(enable_mem_profile != 0, every) { + Ok(_) => 0, + Err(e) => { + eprintln!("Profile Config Error: {}", e); + 5 + } + } + })).unwrap_or(3) +} + /// Enable or disable io_uring write path for this writer. /// Must be called before the first batch is written. /// Returns 0 on success, 1 if writer_ptr is null, 2 if already closed. diff --git a/third_party/tpcds b/third_party/tpcds new file mode 160000 index 0000000..abaa79c --- /dev/null +++ b/third_party/tpcds @@ -0,0 +1 @@ +Subproject commit abaa79c7dea56a3e5ff409900aac90bf20a7b224