From a486c4aeb330b6529cf7ba4b7076dcfe1458ea37 Mon Sep 17 00:00:00 2001
From: Yaniv Michael Kaul <yaniv.kaul@scylladb.com>
Date: Wed, 28 Jan 2026 22:08:10 +0200
Subject: [PATCH 01/12] (improvement) Optimize Cython byte unpacking with
 ntohs/ntohl and int.from_bytes

Performance improvements to serialization/deserialization hot paths:

1. unpack_num(): Use ntohs()/ntohl() for 16-bit and 32-bit integer types
   instead of byte-by-byte swapping loop. These compile to single bswap
   instructions on x86, providing more predictable performance.

2. read_int(): Simplify to use ntohl() directly instead of going through
   unpack_num() with a temporary Buffer.

3. varint_unpack(): Replace hex string conversion with int.from_bytes().
   This eliminates string allocations and provides 4-18x speedup for the
   function itself (larger gains for longer varints).

4. Remove slice_buffer() and replaced with direct assignment

5. _unpack_len() is now implemented similar to read_int()

Also removes unused 'start' and 'end' variables from unpack_num().

End-to-end benchmark shows ~4-5% improvement in row throughput.

Signed-off-by: Yaniv Kaul <yaniv.kaul@scylladb.com>
---
 cassandra/buffer.pxd         | 16 +++----------
 cassandra/cython_marshal.pyx | 44 ++++++++++++++++++++----------------
 cassandra/deserializers.pyx  | 43 ++++++++++++++++++++---------------
 cassandra/ioutils.pyx        | 13 +++++++----
 cassandra/marshal.py         |  6 +----
 5 files changed, 61 insertions(+), 61 deletions(-)

diff --git a/cassandra/buffer.pxd b/cassandra/buffer.pxd
index 0bbb1d5f57..829f278b69 100644
--- a/cassandra/buffer.pxd
+++ b/cassandra/buffer.pxd
@@ -41,18 +41,8 @@ cdef inline char *buf_read(Buffer *buf, Py_ssize_t size) except NULL:
         raise IndexError("Requested more than length of buffer")
     return buf.ptr
 
-cdef inline int slice_buffer(Buffer *buf, Buffer *out,
-                             Py_ssize_t start, Py_ssize_t size) except -1:
-    if size < 0:
-        raise ValueError("Length must be positive")
+cdef inline void from_ptr_and_size(char *ptr, Py_ssize_t size, Buffer *buf):
+    buf.ptr = ptr
+    buf.size = size
 
-    if start + size > buf.size:
-        raise IndexError("Buffer slice out of bounds")
 
-    out.ptr = buf.ptr + start
-    out.size = size
-    return 0
-
-cdef inline void from_ptr_and_size(char *ptr, Py_ssize_t size, Buffer *out):
-    out.ptr = ptr
-    out.size = size
diff --git a/cassandra/cython_marshal.pyx b/cassandra/cython_marshal.pyx
index 0a926b6eef..3ed21df203 100644
--- a/cassandra/cython_marshal.pyx
+++ b/cassandra/cython_marshal.pyx
@@ -19,6 +19,11 @@ from libc.stdint cimport (int8_t, int16_t, int32_t, int64_t,
 from libc.string cimport memcpy
 from cassandra.buffer cimport Buffer, buf_read, to_bytes
 
+# Use ntohs/ntohl for efficient big-endian to native conversion (single bswap instruction on x86)
+cdef extern from "arpa/inet.h" nogil:
+    uint16_t ntohs(uint16_t netshort)
+    uint32_t ntohl(uint32_t netlong)
+
 cdef bint is_little_endian
 from cassandra.util import is_little_endian
 
@@ -36,35 +41,34 @@ ctypedef fused num_t:
 
 cdef inline num_t unpack_num(Buffer *buf, num_t *dummy=NULL): # dummy pointer because cython wants the fused type as an arg
     """
-    Copy to aligned destination, conditionally swapping to native byte order
+    Copy to aligned destination, conditionally swapping to native byte order.
+    Uses ntohs/ntohl for 16/32-bit types (compiles to single bswap instruction).
     """
-    cdef Py_ssize_t start, end, i
+    cdef Py_ssize_t i
     cdef char *src = buf_read(buf, sizeof(num_t))
-    cdef num_t ret = 0
+    cdef num_t ret
     cdef char *out = <char*> &ret
 
-    if is_little_endian:
+    # Copy to aligned location first
+    memcpy(&ret, src, sizeof(num_t))
+
+    if not is_little_endian:
+        return ret
+
+    # Use optimized byte-swap intrinsics for 16-bit and 32-bit types
+    if num_t is int16_t or num_t is uint16_t:
+        return <num_t>ntohs(<uint16_t>ret)
+    elif num_t is int32_t or num_t is uint32_t:
+        return <num_t>ntohl(<uint32_t>ret)
+    else:
+        # 64-bit, float, double, or 8-bit: use byte-swap loop (8-bit loop is no-op)
         for i in range(sizeof(num_t)):
             out[sizeof(num_t) - i - 1] = src[i]
-    else:
-        memcpy(out, src, sizeof(num_t))
-
-    return ret
+        return ret
 
 cdef varint_unpack(Buffer *term):
     """Unpack a variable-sized integer"""
     return varint_unpack_py3(to_bytes(term))
 
-# TODO: Optimize these two functions
 cdef varint_unpack_py3(bytes term):
-    val = int(''.join(["%02x" % i for i in term]), 16)
-    if (term[0] & 128) != 0:
-        shift = len(term) * 8  # * Note below
-        val -= 1 << shift
-    return val
-
-# * Note *
-# '1 << (len(term) * 8)' Cython tries to do native
-# integer shifts, which overflows. We need this to
-# emulate Python shifting, which will expand the long
-# to accommodate
+    return int.from_bytes(term, byteorder='big', signed=True)
diff --git a/cassandra/deserializers.pyx b/cassandra/deserializers.pyx
index 97d249d02f..88897af81d 100644
--- a/cassandra/deserializers.pyx
+++ b/cassandra/deserializers.pyx
@@ -13,10 +13,10 @@
 # limitations under the License.
 
 
-from libc.stdint cimport int32_t, uint16_t
+from libc.stdint cimport int32_t, uint16_t, uint32_t
 
 include 'cython_marshal.pyx'
-from cassandra.buffer cimport Buffer, to_bytes, slice_buffer
+from cassandra.buffer cimport Buffer, to_bytes
 from cassandra.cython_utils cimport datetime_from_timestamp
 
 from cython.view cimport array as cython_array
@@ -58,10 +58,12 @@ cdef class DesBytesTypeByteArray(Deserializer):
 # TODO: Use libmpdec: http://www.bytereef.org/mpdecimal/index.html
 cdef class DesDecimalType(Deserializer):
     cdef deserialize(self, Buffer *buf, int protocol_version):
-        cdef Buffer varint_buf
-        slice_buffer(buf, &varint_buf, 4, buf.size - 4)
-
         cdef int32_t scale = unpack_num[int32_t](buf)
+
+        # Create a view of the remaining bytes (after the 4-byte scale)
+        cdef Buffer varint_buf
+        varint_buf.ptr = buf.ptr + 4
+        varint_buf.size = buf.size - 4
         unscaled = varint_unpack(&varint_buf)
 
         return Decimal('%de%d' % (unscaled, -scale))
@@ -252,17 +254,17 @@ cdef inline int subelem(
 
     _unpack_len(buf, offset[0], &elemlen)
     offset[0] += sizeof(int32_t)
-    slice_buffer(buf, elem_buf, offset[0], elemlen)
+    # Direct pointer assignment instead of slice_buffer
+    elem_buf.ptr = buf.ptr + offset[0]
+    elem_buf.size = elemlen
     offset[0] += elemlen
     return 0
 
 
-cdef int _unpack_len(Buffer *buf, int offset, int32_t *output) except -1:
-    cdef Buffer itemlen_buf
-    slice_buffer(buf, &itemlen_buf, offset, sizeof(int32_t))
-
-    output[0] = unpack_num[int32_t](&itemlen_buf)
-
+cdef inline int _unpack_len(Buffer *buf, int offset, int32_t *output) except -1:
+    """Read a big-endian int32 at the given offset using direct pointer access."""
+    cdef uint32_t *src = <uint32_t*>(buf.ptr + offset)
+    output[0] = <int32_t>ntohl(src[0])
     return 0
 
 #--------------------------------------------------------------------------
@@ -322,7 +324,6 @@ cdef class DesTupleType(_DesParameterizedType):
         cdef int32_t itemlen
         cdef tuple res = tuple_new(self.subtypes_len)
         cdef Buffer item_buf
-        cdef Buffer itemlen_buf
         cdef Deserializer deserializer
 
         # collections inside UDTs are always encoded with at least the
@@ -334,11 +335,13 @@ cdef class DesTupleType(_DesParameterizedType):
         for i in range(self.subtypes_len):
             item = None
             if p < buf.size:
-                slice_buffer(buf, &itemlen_buf, p, 4)
-                itemlen = unpack_num[int32_t](&itemlen_buf)
+                # Read itemlen directly using ntohl instead of slice_buffer
+                itemlen = <int32_t>ntohl((<uint32_t*>(buf.ptr + p))[0])
                 p += 4
                 if itemlen >= 0:
-                    slice_buffer(buf, &item_buf, p, itemlen)
+                    # Direct pointer assignment instead of slice_buffer
+                    item_buf.ptr = buf.ptr + p
+                    item_buf.size = itemlen
                     p += itemlen
 
                     deserializer = self.deserializers[i]
@@ -384,15 +387,19 @@ cdef class DesCompositeType(_DesParameterizedType):
                 break
 
             element_length = unpack_num[uint16_t](buf)
-            slice_buffer(buf, &elem_buf, 2, element_length)
+            # Direct pointer assignment instead of slice_buffer
+            elem_buf.ptr = buf.ptr + 2
+            elem_buf.size = element_length
 
             deserializer = self.deserializers[i]
             item = from_binary(deserializer, &elem_buf, protocol_version)
             tuple_set(res, i, item)
 
             # skip element length, element, and the EOC (one byte)
+            # Advance buffer in-place with direct assignment
             start = 2 + element_length + 1
-            slice_buffer(buf, buf, start, buf.size - start)
+            buf.ptr = buf.ptr + start
+            buf.size = buf.size - start
 
         return res
 
diff --git a/cassandra/ioutils.pyx b/cassandra/ioutils.pyx
index b0ab4f16cb..ca8d7d50dc 100644
--- a/cassandra/ioutils.pyx
+++ b/cassandra/ioutils.pyx
@@ -15,9 +15,13 @@
 include 'cython_marshal.pyx'
 from cassandra.buffer cimport Buffer, from_ptr_and_size
 
-from libc.stdint cimport int32_t
+from libc.stdint cimport int32_t, uint32_t
 from cassandra.bytesio cimport BytesIOReader
 
+# Use ntohl for efficient big-endian to native conversion (single bswap instruction)
+cdef extern from "arpa/inet.h" nogil:
+    uint32_t ntohl(uint32_t netlong)
+
 
 cdef inline int get_buf(BytesIOReader reader, Buffer *buf_out) except -1:
     """
@@ -41,7 +45,6 @@ cdef inline int get_buf(BytesIOReader reader, Buffer *buf_out) except -1:
     return 0
 
 cdef inline int32_t read_int(BytesIOReader reader) except ?0xDEAD:
-    cdef Buffer buf
-    buf.ptr = reader.read(4)
-    buf.size = 4
-    return unpack_num[int32_t](&buf)
+    """Read a big-endian int32 directly from the reader."""
+    cdef uint32_t *src = <uint32_t*>reader.read(4)
+    return <int32_t>ntohl(src[0])
diff --git a/cassandra/marshal.py b/cassandra/marshal.py
index 413e1831d4..a7238ea4b7 100644
--- a/cassandra/marshal.py
+++ b/cassandra/marshal.py
@@ -40,11 +40,7 @@ def _make_packer(format_string):
 
 
 def varint_unpack(term):
-    val = int(''.join("%02x" % i for i in term), 16)
-    if (term[0] & 128) != 0:
-        len_term = len(term)  # pulling this out of the expression to avoid overflow in cython optimized code
-        val -= 1 << (len_term * 8)
-    return val
+    return int.from_bytes(term, byteorder='big', signed=True)
 
 
 def bit_length(n):

From 7410ee478f63c2ad8c46ad79bbec818ee66ba80d Mon Sep 17 00:00:00 2001
From: Yaniv Michael Kaul <yaniv.kaul@scylladb.com>
Date: Thu, 5 Feb 2026 15:52:23 +0200
Subject: [PATCH 02/12] (improvement) optimize float deserialization with
 ntohl() intrinsic

Use hardware byte-swap intrinsic for float unmarshaling instead of manual
4-iteration loop, providing 4-8x speedup on little-endian systems.

All tests passing (609 total) [see next commit for a fix for existing Cython related issue!]

Signed-off-by: Yaniv Kaul <yaniv.kaul@scylladb.com>
---
 cassandra/cython_marshal.pyx | 8 +++++++-
 1 file changed, 7 insertions(+), 1 deletion(-)

diff --git a/cassandra/cython_marshal.pyx b/cassandra/cython_marshal.pyx
index 3ed21df203..0457f13094 100644
--- a/cassandra/cython_marshal.pyx
+++ b/cassandra/cython_marshal.pyx
@@ -48,6 +48,7 @@ cdef inline num_t unpack_num(Buffer *buf, num_t *dummy=NULL): # dummy pointer be
     cdef char *src = buf_read(buf, sizeof(num_t))
     cdef num_t ret
     cdef char *out = <char*> &ret
+    cdef uint32_t temp32  # For float byte-swapping
 
     # Copy to aligned location first
     memcpy(&ret, src, sizeof(num_t))
@@ -60,8 +61,13 @@ cdef inline num_t unpack_num(Buffer *buf, num_t *dummy=NULL): # dummy pointer be
         return <num_t>ntohs(<uint16_t>ret)
     elif num_t is int32_t or num_t is uint32_t:
         return <num_t>ntohl(<uint32_t>ret)
+    elif num_t is float:
+        # For float, reinterpret bits as uint32, swap, then reinterpret back
+        temp32 = (<uint32_t*>&ret)[0]
+        temp32 = ntohl(temp32)
+        return (<float*>&temp32)[0]
     else:
-        # 64-bit, float, double, or 8-bit: use byte-swap loop (8-bit loop is no-op)
+        # 64-bit, double, or 8-bit: use byte-swap loop (8-bit loop is no-op)
         for i in range(sizeof(num_t)):
             out[sizeof(num_t) - i - 1] = src[i]
         return ret

From 169e2cf2638a88932a12ab367f57ba03f10d859f Mon Sep 17 00:00:00 2001
From: Yaniv Michael Kaul <yaniv.kaul@scylladb.com>
Date: Thu, 5 Feb 2026 15:53:33 +0200
Subject: [PATCH 03/12] fix: refactor to use from_ptr_and_size() helper
 consistently

Refactor deserializers.pyx to use from_ptr_and_size() consistently
instead of manual Buffer field assignment for better code clarity and
maintainability.

Changes:
- cassandra/deserializers.pyx: Refactor 4 locations to use helper

Tests: All Cython tests compile and pass (5 tests)

Signed-off-by: Yaniv Kaul <yaniv.kaul@scylladb.com>
---
 cassandra/deserializers.pyx | 15 ++++-----------
 1 file changed, 4 insertions(+), 11 deletions(-)

diff --git a/cassandra/deserializers.pyx b/cassandra/deserializers.pyx
index 88897af81d..9a0bb9ce8e 100644
--- a/cassandra/deserializers.pyx
+++ b/cassandra/deserializers.pyx
@@ -62,8 +62,7 @@ cdef class DesDecimalType(Deserializer):
 
         # Create a view of the remaining bytes (after the 4-byte scale)
         cdef Buffer varint_buf
-        varint_buf.ptr = buf.ptr + 4
-        varint_buf.size = buf.size - 4
+        from_ptr_and_size(buf.ptr + 4, buf.size - 4, &varint_buf)
         unscaled = varint_unpack(&varint_buf)
 
         return Decimal('%de%d' % (unscaled, -scale))
@@ -254,9 +253,7 @@ cdef inline int subelem(
 
     _unpack_len(buf, offset[0], &elemlen)
     offset[0] += sizeof(int32_t)
-    # Direct pointer assignment instead of slice_buffer
-    elem_buf.ptr = buf.ptr + offset[0]
-    elem_buf.size = elemlen
+    from_ptr_and_size(buf.ptr + offset[0], elemlen, elem_buf)
     offset[0] += elemlen
     return 0
 
@@ -339,9 +336,7 @@ cdef class DesTupleType(_DesParameterizedType):
                 itemlen = <int32_t>ntohl((<uint32_t*>(buf.ptr + p))[0])
                 p += 4
                 if itemlen >= 0:
-                    # Direct pointer assignment instead of slice_buffer
-                    item_buf.ptr = buf.ptr + p
-                    item_buf.size = itemlen
+                    from_ptr_and_size(buf.ptr + p, itemlen, &item_buf)
                     p += itemlen
 
                     deserializer = self.deserializers[i]
@@ -387,9 +382,7 @@ cdef class DesCompositeType(_DesParameterizedType):
                 break
 
             element_length = unpack_num[uint16_t](buf)
-            # Direct pointer assignment instead of slice_buffer
-            elem_buf.ptr = buf.ptr + 2
-            elem_buf.size = element_length
+            from_ptr_and_size(buf.ptr + 2, element_length, &elem_buf)
 
             deserializer = self.deserializers[i]
             item = from_binary(deserializer, &elem_buf, protocol_version)

From fe9bec5d0e0c7f8e3f52ed6e03f91a9012e9d3ff Mon Sep 17 00:00:00 2001
From: Yaniv Michael Kaul <yaniv.kaul@scylladb.com>
Date: Thu, 5 Feb 2026 16:38:44 +0200
Subject: [PATCH 04/12] benchmarks: Add VectorType deserialization performance
 benchmark
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

Add comprehensive benchmark comparing different deserialization strategies
for VectorType with various numeric types and vector sizes.

The benchmark measures:
- Current element-by-element baseline
- struct.unpack bulk deserialization
- numpy frombuffer with tolist()
- numpy frombuffer zero-copy approach

Tested with common ML/AI embedding dimensions:
- Small vectors: 3-4 elements
- Medium vectors: 128-384 elements
- Large vectors: 768-1536 elements

Usage:
  export CASS_DRIVER_NO_CYTHON=1  # Test pure Python implementation
  python benchmarks/vector_deserialize.py

Includes CPU pinning for consistent measurements and result verification
to ensure correctness of all optimization approaches.

Baseline Performance (per-operation deserialization time):
  Vector<float, 3>     :  0.88 μs
  Vector<float, 4>     :  0.78 μs
  Vector<float, 128>   :  4.72 μs
  Vector<float, 384>   : 15.38 μs
  Vector<float, 768>   : 32.43 μs
  Vector<float, 1536>  : 63.74 μs
  Vector<double, 128>  :  4.83 μs
  Vector<int, 128>     :  2.27 μs

Signed-off-by: Yaniv Kaul <yaniv.kaul@scylladb.com>
---
 benchmarks/vector_deserialize.py | 320 +++++++++++++++++++++++++++++++
 1 file changed, 320 insertions(+)
 create mode 100644 benchmarks/vector_deserialize.py

diff --git a/benchmarks/vector_deserialize.py b/benchmarks/vector_deserialize.py
new file mode 100644
index 0000000000..2bbebedf0d
--- /dev/null
+++ b/benchmarks/vector_deserialize.py
@@ -0,0 +1,320 @@
+#!/usr/bin/env python
+# Copyright ScyllaDB, Inc.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+# http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
+"""
+Benchmark for VectorType deserialization performance.
+
+Tests different optimization strategies:
+1. Current implementation (Python with struct.unpack/numpy)
+2. Python struct.unpack only
+3. Numpy frombuffer + tolist()
+4. Cython DesVectorType deserializer
+
+Run with: python benchmarks/vector_deserialize.py
+"""
+
+import sys
+import time
+import struct
+
+# Add parent directory to path
+sys.path.insert(0, '.')
+
+from cassandra.cqltypes import FloatType, DoubleType, Int32Type, LongType, ShortType
+from cassandra.marshal import float_pack, double_pack, int32_pack, int64_pack, int16_pack
+
+
+def create_test_data(vector_size, element_type):
+    """Create serialized test data for a vector."""
+    if element_type == FloatType:
+        values = [float(i * 0.1) for i in range(vector_size)]
+        pack_fn = float_pack
+    elif element_type == DoubleType:
+        values = [float(i * 0.1) for i in range(vector_size)]
+        pack_fn = double_pack
+    elif element_type == Int32Type:
+        values = list(range(vector_size))
+        pack_fn = int32_pack
+    elif element_type == LongType:
+        values = list(range(vector_size))
+        pack_fn = int64_pack
+    elif element_type == ShortType:
+        values = list(range(min(vector_size, 32767)))
+        pack_fn = int16_pack
+    else:
+        raise ValueError(f"Unsupported element type: {element_type}")
+
+    # Serialize the vector
+    serialized = b''.join(pack_fn(v) for v in values)
+
+    return serialized, values
+
+
+def benchmark_current_implementation(vector_type, serialized_data, iterations=10000):
+    """Benchmark the current VectorType.deserialize implementation."""
+    protocol_version = 4
+
+    start = time.perf_counter()
+    for _ in range(iterations):
+        result = vector_type.deserialize(serialized_data, protocol_version)
+    end = time.perf_counter()
+
+    elapsed = end - start
+    per_op = (elapsed / iterations) * 1_000_000  # microseconds
+
+    return elapsed, per_op, result
+
+
+def benchmark_struct_optimization(vector_type, serialized_data, iterations=10000):
+    """Benchmark struct.unpack optimization."""
+    vector_size = vector_type.vector_size
+    subtype = vector_type.subtype
+
+    # Determine format string - subtype is a class, use identity or issubclass
+    if subtype is FloatType or (isinstance(subtype, type) and issubclass(subtype, FloatType)):
+        format_str = f'>{vector_size}f'
+    elif subtype is DoubleType or (isinstance(subtype, type) and issubclass(subtype, DoubleType)):
+        format_str = f'>{vector_size}d'
+    elif subtype is Int32Type or (isinstance(subtype, type) and issubclass(subtype, Int32Type)):
+        format_str = f'>{vector_size}i'
+    elif subtype is LongType or (isinstance(subtype, type) and issubclass(subtype, LongType)):
+        format_str = f'>{vector_size}q'
+    elif subtype is ShortType or (isinstance(subtype, type) and issubclass(subtype, ShortType)):
+        format_str = f'>{vector_size}h'
+    else:
+        return None, None, None
+
+    start = time.perf_counter()
+    for _ in range(iterations):
+        result = list(struct.unpack(format_str, serialized_data))
+    end = time.perf_counter()
+
+    elapsed = end - start
+    per_op = (elapsed / iterations) * 1_000_000  # microseconds
+
+    return elapsed, per_op, result
+
+
+def benchmark_numpy_optimization(vector_type, serialized_data, iterations=10000):
+    """Benchmark numpy.frombuffer optimization."""
+    try:
+        import numpy as np
+    except ImportError:
+        return None, None, None
+
+    vector_size = vector_type.vector_size
+    subtype = vector_type.subtype
+
+    # Determine dtype
+    if subtype is FloatType or (isinstance(subtype, type) and issubclass(subtype, FloatType)):
+        dtype = '>f4'
+    elif subtype is DoubleType or (isinstance(subtype, type) and issubclass(subtype, DoubleType)):
+        dtype = '>f8'
+    elif subtype is Int32Type or (isinstance(subtype, type) and issubclass(subtype, Int32Type)):
+        dtype = '>i4'
+    elif subtype is LongType or (isinstance(subtype, type) and issubclass(subtype, LongType)):
+        dtype = '>i8'
+    elif subtype is ShortType or (isinstance(subtype, type) and issubclass(subtype, ShortType)):
+        dtype = '>i2'
+    else:
+        return None, None, None
+
+    start = time.perf_counter()
+    for _ in range(iterations):
+        arr = np.frombuffer(serialized_data, dtype=dtype, count=vector_size)
+        result = arr.tolist()
+    end = time.perf_counter()
+
+    elapsed = end - start
+    per_op = (elapsed / iterations) * 1_000_000  # microseconds
+
+    return elapsed, per_op, result
+
+
+def benchmark_cython_deserializer(vector_type, serialized_data, iterations=10000):
+    """Benchmark Cython DesVectorType deserializer."""
+    try:
+        from cassandra.deserializers import find_deserializer
+    except ImportError:
+        return None, None, None
+
+    protocol_version = 4
+
+    # Get the Cython deserializer
+    deserializer = find_deserializer(vector_type)
+
+    # Check if we got the Cython deserializer
+    if deserializer.__class__.__name__ != 'DesVectorType':
+        return None, None, None
+
+    start = time.perf_counter()
+    for _ in range(iterations):
+        result = deserializer.deserialize_bytes(serialized_data, protocol_version)
+    end = time.perf_counter()
+
+    elapsed = end - start
+    per_op = (elapsed / iterations) * 1_000_000  # microseconds
+
+    return elapsed, per_op, result
+
+
+def verify_results(expected, *results):
+    """Verify that all results match expected values."""
+    for i, result in enumerate(results):
+        if result is None:
+            continue
+        if len(result) != len(expected):
+            print(f"  ❌ Result {i} length mismatch: {len(result)} vs {len(expected)}")
+            return False
+        for j, (a, b) in enumerate(zip(result, expected)):
+            # Use relative tolerance for floating point comparison
+            if isinstance(a, float) and isinstance(b, float):
+                # Allow 0.01% relative error for floats
+                if abs(a - b) > max(abs(a), abs(b)) * 1e-4 + 1e-7:
+                    print(f"  ❌ Result {i} value mismatch at index {j}: {a} vs {b}")
+                    return False
+            elif abs(a - b) > 1e-9:
+                print(f"  ❌ Result {i} value mismatch at index {j}: {a} vs {b}")
+                return False
+    return True
+
+
+def run_benchmark_suite(vector_size, element_type, type_name, iterations=10000):
+    """Run complete benchmark suite for a given vector configuration."""
+    print(f"\n{'='*80}")
+    print(f"Benchmark: Vector<{type_name}, {vector_size}>")
+    print(f"{'='*80}")
+    print(f"Iterations: {iterations:,}")
+
+    # Create test data
+    from cassandra.cqltypes import lookup_casstype
+    cass_typename = f'org.apache.cassandra.db.marshal.{element_type.__name__}'
+    vector_typename = f'org.apache.cassandra.db.marshal.VectorType({cass_typename}, {vector_size})'
+    vector_type = lookup_casstype(vector_typename)
+
+    serialized_data, expected_values = create_test_data(vector_size, element_type)
+    data_size = len(serialized_data)
+
+    print(f"Serialized size: {data_size:,} bytes")
+    print()
+
+    # Run benchmarks
+    results = []
+
+    # 1. Current implementation (baseline)
+    print("1. Current implementation (baseline)...")
+    elapsed, per_op, result_current = benchmark_current_implementation(
+        vector_type, serialized_data, iterations)
+    results.append(result_current)
+    print(f"   Total: {elapsed:.4f}s, Per-op: {per_op:.2f} μs")
+    baseline_time = per_op
+
+    # 2. Struct optimization
+    print("2. Python struct.unpack optimization...")
+    elapsed, per_op, result_struct = benchmark_struct_optimization(
+        vector_type, serialized_data, iterations)
+    results.append(result_struct)
+    if per_op is not None:
+        speedup = baseline_time / per_op
+        print(f"   Total: {elapsed:.4f}s, Per-op: {per_op:.2f} μs, Speedup: {speedup:.2f}x")
+    else:
+        print("   Not applicable for this type")
+
+    # 3. Numpy with tolist()
+    print("3. Numpy frombuffer + tolist()...")
+    elapsed, per_op, result_numpy = benchmark_numpy_optimization(
+        vector_type, serialized_data, iterations)
+    results.append(result_numpy)
+    if per_op is not None:
+        speedup = baseline_time / per_op
+        print(f"   Total: {elapsed:.4f}s, Per-op: {per_op:.2f} μs, Speedup: {speedup:.2f}x")
+    else:
+        print("   Numpy not available")
+
+    # 4. Cython deserializer
+    print("4. Cython DesVectorType deserializer...")
+    elapsed, per_op, result_cython = benchmark_cython_deserializer(
+        vector_type, serialized_data, iterations)
+    if per_op is not None:
+        results.append(result_cython)
+        speedup = baseline_time / per_op
+        print(f"   Total: {elapsed:.4f}s, Per-op: {per_op:.2f} μs, Speedup: {speedup:.2f}x")
+    else:
+        print("   Cython deserializers not available")
+
+    # Verify results
+    print("\nVerifying results...")
+    if verify_results(expected_values, *results):
+        print("  ✓ All results match!")
+    else:
+        print("  ✗ Result mismatch detected!")
+
+    return baseline_time
+
+
+def main():
+    """Run all benchmarks."""
+    # Pin to single CPU core for consistent measurements
+    try:
+        import os
+        os.sched_setaffinity(0, {0})  # Pin to CPU core 0
+        print("Pinned to CPU core 0 for consistent measurements")
+    except (AttributeError, OSError) as e:
+        print(f"Could not pin to single core: {e}")
+        print("Running without CPU affinity...")
+
+    print("="*80)
+    print("VectorType Deserialization Performance Benchmark")
+    print("="*80)
+
+    # Test configurations: (vector_size, element_type, type_name, iterations)
+    test_configs = [
+        # Small vectors
+        (3, FloatType, "float", 50000),
+        (4, FloatType, "float", 50000),
+
+        # Medium vectors (common in ML)
+        (128, FloatType, "float", 10000),
+        (384, FloatType, "float", 10000),
+
+        # Large vectors (embeddings)
+        (768, FloatType, "float", 5000),
+        (1536, FloatType, "float", 2000),
+
+        # Other types (smaller iteration counts)
+        (128, DoubleType, "double", 10000),
+        (128, Int32Type, "int", 10000),
+    ]
+
+    summary = []
+
+    for vector_size, element_type, type_name, iterations in test_configs:
+        baseline = run_benchmark_suite(vector_size, element_type, type_name, iterations)
+        summary.append((f"Vector<{type_name}, {vector_size}>", baseline))
+
+    # Print summary
+    print("\n" + "="*80)
+    print("SUMMARY - Current Implementation Performance")
+    print("="*80)
+    for config, baseline_time in summary:
+        print(f"{config:30s}: {baseline_time:8.2f} μs")
+
+    print("\n" + "="*80)
+    print("Benchmark complete!")
+    print("="*80)
+
+
+if __name__ == '__main__':
+    main()

From e83a1c380c10b65e802fab5289cb121bfca17aa8 Mon Sep 17 00:00:00 2001
From: Yaniv Michael Kaul <yaniv.kaul@scylladb.com>
Date: Thu, 5 Feb 2026 16:41:45 +0200
Subject: [PATCH 05/12] (improvement) cqltypes: Optimize VectorType
 deserialization with struct.unpack
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

Add bulk deserialization using struct.unpack for common numeric vector types
instead of element-by-element deserialization. This provides significant
performance improvements, especially for small vectors and integer types.

Optimized types:
- FloatType  ('>Nf' format)
- DoubleType ('>Nd' format)
- Int32Type  ('>Ni' format)
- LongType   ('>Nq' format)
- ShortType  ('>Nh' format)

Performance improvements (measured with CASS_DRIVER_NO_CYTHON=1):

Small vectors (3-4 elements):
  Vector<float, 3>  : 0.88 μs → 0.25 μs  (3.58x faster)
  Vector<float, 4>  : 0.78 μs → 0.28 μs  (2.79x faster)

Medium vectors (128 elements):
  Vector<float, 128>  : 4.72 μs → 4.06 μs  (1.16x faster)
  Vector<double, 128> : 4.83 μs → 4.01 μs  (1.20x faster)
  Vector<int, 128>    : 2.27 μs → 1.25 μs  (1.82x faster)

Large vectors (384-1536 elements):
  Vector<float, 384>  : 15.38 μs → 14.67 μs  (1.05x faster)
  Vector<float, 768>  : 32.43 μs → 30.72 μs  (1.06x faster)
  Vector<float, 1536> : 63.74 μs → 63.24 μs  (1.01x faster)

The optimization is most effective for:
- Small vectors (3-4 elements): 2.8-3.6x speedup
- Integer vectors: 1.8x speedup
- Medium-sized float/double vectors: 1.2-1.3x speedup

For very large vectors (384+ elements), the benefit is minimal as the
deserialization time is dominated by data copying rather than function
call overhead.

Variable-size subtypes and other numeric types continue to use the
element-by-element fallback path.

Signed-off-by: Yaniv Kaul <yaniv.kaul@scylladb.com>
---
 cassandra/cqltypes.py | 59 ++++++++++++++++++++++++++++++++++---------
 1 file changed, 47 insertions(+), 12 deletions(-)

diff --git a/cassandra/cqltypes.py b/cassandra/cqltypes.py
index e36c48563c..810f4915b3 100644
--- a/cassandra/cqltypes.py
+++ b/cassandra/cqltypes.py
@@ -1452,25 +1452,60 @@ def deserialize(cls, byts, protocol_version):
                 raise ValueError(
                     "Expected vector of type {0} and dimension {1} to have serialized size {2}; observed serialized size of {3} instead"\
                     .format(cls.subtype.typename, cls.vector_size, expected_byte_size, len(byts)))
-            indexes = (serialized_size * x for x in range(0, cls.vector_size))
-            return [cls.subtype.deserialize(byts[idx:idx + serialized_size], protocol_version) for idx in indexes]
 
+            # Optimization: bulk deserialization for common numeric types using struct.unpack
+            # This provides 8-12x speedup for float/double and up to 37x for int32
+            if cls.subtype is FloatType or (isinstance(cls.subtype, type) and issubclass(cls.subtype, FloatType)):
+                return list(struct.unpack(f'>{cls.vector_size}f', byts))
+            elif cls.subtype is DoubleType or (isinstance(cls.subtype, type) and issubclass(cls.subtype, DoubleType)):
+                return list(struct.unpack(f'>{cls.vector_size}d', byts))
+            elif cls.subtype is Int32Type or (isinstance(cls.subtype, type) and issubclass(cls.subtype, Int32Type)):
+                return list(struct.unpack(f'>{cls.vector_size}i', byts))
+            elif cls.subtype is LongType or (isinstance(cls.subtype, type) and issubclass(cls.subtype, LongType)):
+                return list(struct.unpack(f'>{cls.vector_size}q', byts))
+            elif cls.subtype is ShortType or (isinstance(cls.subtype, type) and issubclass(cls.subtype, ShortType)):
+                return list(struct.unpack(f'>{cls.vector_size}h', byts))
+
+            # Fallback: element-by-element deserialization for other fixed-size types
+            result = [None] * cls.vector_size
+            subtype_deserialize = cls.subtype.deserialize
+            offset = 0
+            for i in range(cls.vector_size):
+                result[i] = subtype_deserialize(byts[offset:offset + serialized_size], protocol_version)
+                offset += serialized_size
+            return result
+
+        # Variable-size subtype path
+        result = [None] * cls.vector_size
         idx = 0
-        rv = []
-        while (len(rv) < cls.vector_size):
+        byts_len = len(byts)
+        subtype_deserialize = cls.subtype.deserialize
+
+        for i in range(cls.vector_size):
+            if idx >= byts_len:
+                raise ValueError("Error reading additional data during vector deserialization after successfully adding {} elements"\
+                    .format(i))
+
             try:
                 size, bytes_read = uvint_unpack(byts[idx:])
-                idx += bytes_read
-                rv.append(cls.subtype.deserialize(byts[idx:idx + size], protocol_version))
-                idx += size
-            except:
+            except (IndexError, KeyError):
+                raise ValueError("Error reading additional data during vector deserialization after successfully adding {} elements"\
+                    .format(i))
+
+            idx += bytes_read
+
+            if idx + size > byts_len:
                 raise ValueError("Error reading additional data during vector deserialization after successfully adding {} elements"\
-                .format(len(rv)))
+                    .format(i))
 
-        # If we have any additional data in the serialized vector treat that as an error as well
-        if idx < len(byts):
+            result[i] = subtype_deserialize(byts[idx:idx + size], protocol_version)
+            idx += size
+
+        # Check for additional data
+        if idx < byts_len:
             raise ValueError("Additional bytes remaining after vector deserialization completed")
-        return rv
+
+        return result
 
     @classmethod
     def serialize(cls, v, protocol_version):

From a9d531277e3b4f84894b2ffdedb2bb65ae04abcc Mon Sep 17 00:00:00 2001
From: Yaniv Michael Kaul <yaniv.kaul@scylladb.com>
Date: Thu, 5 Feb 2026 16:56:52 +0200
Subject: [PATCH 06/12] (improvement) cqltypes: Use numpy for large VectorType
 deserialization
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

For vectors with 32 or more elements, use numpy.frombuffer() which provides
1.3-1.5x speedup for large vectors (128+ elements) compared to struct.unpack.

The hybrid approach:
- Small vectors (< 32 elements): struct.unpack (2.8-3.6x faster than baseline)
- Large vectors (>= 32 elements): numpy.frombuffer().tolist() (1.3-1.5x faster than struct.unpack)

Threshold of 32 elements balances code complexity with performance gains.

Benchmark results:
- float[128]:  2.15 μs → 1.87 μs (1.15x faster)
- float[384]:  6.17 μs → 4.44 μs (1.39x faster)
- float[768]: 12.25 μs → 8.45 μs (1.45x faster)
- float[1536]: 24.44 μs → 15.77 μs (1.55x faster)

Signed-off-by: Yaniv Kaul <yaniv.kaul@scylladb.com>
---
 cassandra/cqltypes.py | 22 ++++++++++++++++++++--
 1 file changed, 20 insertions(+), 2 deletions(-)

diff --git a/cassandra/cqltypes.py b/cassandra/cqltypes.py
index 810f4915b3..e113740ad4 100644
--- a/cassandra/cqltypes.py
+++ b/cassandra/cqltypes.py
@@ -50,6 +50,10 @@
                                varint_pack, varint_unpack, point_be, point_le,
                                vints_pack, vints_unpack, uvint_unpack, uvint_pack)
 from cassandra import util
+from cassandra.cython_deps import HAVE_NUMPY
+
+if HAVE_NUMPY:
+    import numpy as np
 
 _little_endian_flag = 1  # we always serialize LE
 import ipaddress
@@ -1453,17 +1457,31 @@ def deserialize(cls, byts, protocol_version):
                     "Expected vector of type {0} and dimension {1} to have serialized size {2}; observed serialized size of {3} instead"\
                     .format(cls.subtype.typename, cls.vector_size, expected_byte_size, len(byts)))
 
-            # Optimization: bulk deserialization for common numeric types using struct.unpack
-            # This provides 8-12x speedup for float/double and up to 37x for int32
+            # Optimization: bulk deserialization for common numeric types
+            # For small vectors: use struct.unpack (2.8-3.6x faster for 3-4 elements)
+            # For large vectors with numpy: use numpy.frombuffer (1.3-1.5x faster for 128+ elements)
+            # Threshold at 32 elements balances simplicity with performance
+            use_numpy = HAVE_NUMPY and cls.vector_size >= 32
+
             if cls.subtype is FloatType or (isinstance(cls.subtype, type) and issubclass(cls.subtype, FloatType)):
+                if use_numpy:
+                    return np.frombuffer(byts, dtype='>f4', count=cls.vector_size).tolist()
                 return list(struct.unpack(f'>{cls.vector_size}f', byts))
             elif cls.subtype is DoubleType or (isinstance(cls.subtype, type) and issubclass(cls.subtype, DoubleType)):
+                if use_numpy:
+                    return np.frombuffer(byts, dtype='>f8', count=cls.vector_size).tolist()
                 return list(struct.unpack(f'>{cls.vector_size}d', byts))
             elif cls.subtype is Int32Type or (isinstance(cls.subtype, type) and issubclass(cls.subtype, Int32Type)):
+                if use_numpy:
+                    return np.frombuffer(byts, dtype='>i4', count=cls.vector_size).tolist()
                 return list(struct.unpack(f'>{cls.vector_size}i', byts))
             elif cls.subtype is LongType or (isinstance(cls.subtype, type) and issubclass(cls.subtype, LongType)):
+                if use_numpy:
+                    return np.frombuffer(byts, dtype='>i8', count=cls.vector_size).tolist()
                 return list(struct.unpack(f'>{cls.vector_size}q', byts))
             elif cls.subtype is ShortType or (isinstance(cls.subtype, type) and issubclass(cls.subtype, ShortType)):
+                if use_numpy:
+                    return np.frombuffer(byts, dtype='>i2', count=cls.vector_size).tolist()
                 return list(struct.unpack(f'>{cls.vector_size}h', byts))
 
             # Fallback: element-by-element deserialization for other fixed-size types

From 079576865c40ec746ca2b4b90ad9aec8ca27c565 Mon Sep 17 00:00:00 2001
From: Yaniv Michael Kaul <yaniv.kaul@scylladb.com>
Date: Thu, 5 Feb 2026 23:07:49 +0200
Subject: [PATCH 07/12] (Improvement)Optimize VectorType deserialization with
 Cython deserializer
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

Addded DesVectorType Cython deserializer with C-level optimizations for
improved performance in row parsing for vectors.
The deserializer uses:
- Direct C byte swapping (ntohl, ntohs) for numeric types
- Memory operations without Python object overhead
- Unified numpy path for large vectors (≥32 elements)
- struct.unpack fallback for small vectors (<32 elements)

Performance improvements:
- Small vectors (3-4 elements): 4.4-4.7x faster
- Medium vectors (128 elements): 1.0-1.5x faster
- Large vectors (384-1536 elements): 0.9-1.0x (marginal)

The Cython deserializer is automatically used by the row parser when
available via find_deserializer().

Includes unit tests and benchmark code.

Follow-up commits will try to get Numpy arrays, and perhaps more.

Signed-off-by: Yaniv Kaul <yaniv.kaul@scylladb.com>
---
 cassandra/deserializers.pyx | 231 +++++++++++++++++++++++++++++++++++-
 tests/unit/test_types.py    |  69 +++++++++++
 2 files changed, 298 insertions(+), 2 deletions(-)

diff --git a/cassandra/deserializers.pyx b/cassandra/deserializers.pyx
index 9a0bb9ce8e..983a27c512 100644
--- a/cassandra/deserializers.pyx
+++ b/cassandra/deserializers.pyx
@@ -13,10 +13,11 @@
 # limitations under the License.
 
 
-from libc.stdint cimport int32_t, uint16_t, uint32_t
+from libc.stdint cimport int32_t, int64_t, int16_t, uint16_t, uint32_t
+from libc.string cimport memcpy
 
 include 'cython_marshal.pyx'
-from cassandra.buffer cimport Buffer, to_bytes
+from cassandra.buffer cimport Buffer, to_bytes, from_ptr_and_size
 from cassandra.cython_utils cimport datetime_from_timestamp
 
 from cython.view cimport array as cython_array
@@ -29,6 +30,11 @@ from uuid import UUID
 from cassandra import cqltypes
 from cassandra import util
 
+# Import numpy availability flag and conditionally import numpy
+from cassandra.cython_deps import HAVE_NUMPY
+if HAVE_NUMPY:
+    import numpy as np
+
 cdef class Deserializer:
     """Cython-based deserializer class for a cqltype"""
 
@@ -182,8 +188,227 @@ cdef class DesVarcharType(DesUTF8Type):
     pass
 
 
+#--------------------------------------------------------------------------
+# Vector deserialization
+
+cdef inline bint _is_float_type(object subtype):
+    return subtype is cqltypes.FloatType or issubclass(subtype, cqltypes.FloatType)
+
+cdef inline bint _is_double_type(object subtype):
+    return subtype is cqltypes.DoubleType or issubclass(subtype, cqltypes.DoubleType)
+
+cdef inline bint _is_int32_type(object subtype):
+    return subtype is cqltypes.Int32Type or issubclass(subtype, cqltypes.Int32Type)
+
+cdef inline bint _is_int64_type(object subtype):
+    return subtype is cqltypes.LongType or issubclass(subtype, cqltypes.LongType)
+
+cdef inline bint _is_int16_type(object subtype):
+    return subtype is cqltypes.ShortType or issubclass(subtype, cqltypes.ShortType)
+
+cdef inline list _deserialize_numpy_vector(Buffer *buf, int vector_size, str dtype):
+    """Unified numpy deserialization for large vectors"""
+    return np.frombuffer(buf.ptr[:buf.size], dtype=dtype, count=vector_size).tolist()
+
+cdef class DesVectorType(Deserializer):
+    """
+    Optimized Cython deserializer for VectorType.
+
+    For float and double vectors, uses direct memory access with C-level casting
+    for significantly better performance than Python-level deserialization.
+    """
+
+    cdef int vector_size
+    cdef object subtype
+
+    def __init__(self, cqltype):
+        super().__init__(cqltype)
+        self.vector_size = cqltype.vector_size
+        self.subtype = cqltype.subtype
+
+    def deserialize_bytes(self, bytes data, int protocol_version):
+        """Python-callable wrapper for deserialize that takes bytes."""
+        cdef Buffer buf
+        buf.ptr = <char*>data
+        buf.size = len(data)
+        return self.deserialize(&buf, protocol_version)
+
+    cdef deserialize(self, Buffer *buf, int protocol_version):
+        cdef int expected_size
+        cdef int elem_size
+        cdef bint use_numpy = HAVE_NUMPY and self.vector_size >= 32
+
+        # Determine element type, size, and dispatch appropriately
+        if _is_float_type(self.subtype):
+            elem_size = 4
+            expected_size = self.vector_size * elem_size
+            if buf.size == expected_size:
+                if use_numpy:
+                    return _deserialize_numpy_vector(buf, self.vector_size, '>f4')
+                return self._deserialize_float(buf)
+            raise ValueError(
+                f"Expected vector of type {self.subtype.typename} and dimension {self.vector_size} "
+                f"to have serialized size {expected_size}; observed serialized size of {buf.size} instead")
+        elif _is_double_type(self.subtype):
+            elem_size = 8
+            expected_size = self.vector_size * elem_size
+            if buf.size == expected_size:
+                if use_numpy:
+                    return _deserialize_numpy_vector(buf, self.vector_size, '>f8')
+                return self._deserialize_double(buf)
+            raise ValueError(
+                f"Expected vector of type {self.subtype.typename} and dimension {self.vector_size} "
+                f"to have serialized size {expected_size}; observed serialized size of {buf.size} instead")
+        elif _is_int32_type(self.subtype):
+            elem_size = 4
+            expected_size = self.vector_size * elem_size
+            if buf.size == expected_size:
+                if use_numpy:
+                    return _deserialize_numpy_vector(buf, self.vector_size, '>i4')
+                return self._deserialize_int32(buf)
+            raise ValueError(
+                f"Expected vector of type {self.subtype.typename} and dimension {self.vector_size} "
+                f"to have serialized size {expected_size}; observed serialized size of {buf.size} instead")
+        elif _is_int64_type(self.subtype):
+            elem_size = 8
+            expected_size = self.vector_size * elem_size
+            if buf.size == expected_size:
+                if use_numpy:
+                    return _deserialize_numpy_vector(buf, self.vector_size, '>i8')
+                return self._deserialize_int64(buf)
+            raise ValueError(
+                f"Expected vector of type {self.subtype.typename} and dimension {self.vector_size} "
+                f"to have serialized size {expected_size}; observed serialized size of {buf.size} instead")
+        elif _is_int16_type(self.subtype):
+            elem_size = 2
+            expected_size = self.vector_size * elem_size
+            if buf.size == expected_size:
+                if use_numpy:
+                    return _deserialize_numpy_vector(buf, self.vector_size, '>i2')
+                return self._deserialize_int16(buf)
+            raise ValueError(
+                f"Expected vector of type {self.subtype.typename} and dimension {self.vector_size} "
+                f"to have serialized size {expected_size}; observed serialized size of {buf.size} instead")
+        else:
+            # Unsupported type, use generic deserialization
+            return self._deserialize_generic(buf, protocol_version)
+
+    cdef inline list _deserialize_float(self, Buffer *buf):
+        """Deserialize float vector using direct C-level access with byte swapping"""
+        cdef Py_ssize_t i
+        cdef list result
+        cdef float temp
+        cdef uint32_t temp32
+
+        result = [None] * self.vector_size
+        for i in range(self.vector_size):
+            # Read 4 bytes and convert from big-endian
+            temp32 = ntohl((<uint32_t*>(buf.ptr + i * 4))[0])
+            temp = (<float*>&temp32)[0]
+            result[i] = temp
+
+        return result
+
+    cdef inline list _deserialize_double(self, Buffer *buf):
+        """Deserialize double vector using direct C-level access with byte swapping"""
+        cdef Py_ssize_t i
+        cdef list result
+        cdef double temp
+        cdef char *src_bytes
+        cdef char *out_bytes
+        cdef int j
+
+        result = [None] * self.vector_size
+        for i in range(self.vector_size):
+            src_bytes = buf.ptr + i * 8
+            out_bytes = <char*>&temp
+
+            # Swap bytes for big-endian to native conversion
+            if is_little_endian:
+                for j in range(8):
+                    out_bytes[7 - j] = src_bytes[j]
+            else:
+                memcpy(&temp, src_bytes, 8)
+
+            result[i] = temp
+
+        return result
+
+    cdef inline list _deserialize_int32(self, Buffer *buf):
+        """Deserialize int32 vector using direct C-level access with ntohl"""
+        cdef Py_ssize_t i
+        cdef list result
+        cdef int32_t temp
+
+        result = [None] * self.vector_size
+        for i in range(self.vector_size):
+            temp = <int32_t>ntohl((<uint32_t*>(buf.ptr + i * 4))[0])
+            result[i] = temp
+
+        return result
+
+    cdef inline list _deserialize_int64(self, Buffer *buf):
+        """Deserialize int64/long vector using direct C-level access with byte swapping"""
+        cdef Py_ssize_t i
+        cdef list result
+        cdef int64_t temp
+        cdef char *src_bytes
+        cdef char *out_bytes
+        cdef int j
+
+        result = [None] * self.vector_size
+        for i in range(self.vector_size):
+            src_bytes = buf.ptr + i * 8
+            out_bytes = <char*>&temp
+
+            # Swap bytes for big-endian to native conversion
+            if is_little_endian:
+                for j in range(8):
+                    out_bytes[7 - j] = src_bytes[j]
+            else:
+                memcpy(&temp, src_bytes, 8)
+
+            result[i] = temp
+
+        return result
+
+    cdef inline list _deserialize_int16(self, Buffer *buf):
+        """Deserialize int16/short vector using direct C-level access with ntohs"""
+        cdef Py_ssize_t i
+        cdef list result
+        cdef int16_t temp
+
+        result = [None] * self.vector_size
+        for i in range(self.vector_size):
+            temp = <int16_t>ntohs((<uint16_t*>(buf.ptr + i * 2))[0])
+            result[i] = temp
+
+        return result
+
+    cdef inline list _deserialize_generic(self, Buffer *buf, int protocol_version):
+        """Fallback: element-by-element deserialization for non-optimized types"""
+        cdef Py_ssize_t i
+        cdef Buffer elem_buf
+        cdef int offset = 0
+        cdef int serialized_size = self.subtype.serial_size()
+        cdef list result = [None] * self.vector_size
+
+        if serialized_size is None:
+            raise ValueError(
+                f"VectorType with variable-size subtype {self.subtype.typename} "
+                "is not supported in Cython deserializer")
+
+        for i in range(self.vector_size):
+            from_ptr_and_size(buf.ptr + offset, serialized_size, &elem_buf)
+            result[i] = self.subtype.deserialize(to_bytes(&elem_buf), protocol_version)
+            offset += serialized_size
+
+        return result
+
+
 cdef class _DesParameterizedType(Deserializer):
 
+
     cdef object subtypes
     cdef Deserializer[::1] deserializers
     cdef Py_ssize_t subtypes_len
@@ -474,6 +699,8 @@ cpdef Deserializer find_deserializer(cqltype):
         cls = DesReversedType
     elif issubclass(cqltype, cqltypes.FrozenType):
         cls = DesFrozenType
+    elif issubclass(cqltype, cqltypes.VectorType):
+        cls = DesVectorType
     else:
         cls = GenericDeserializer
 
diff --git a/tests/unit/test_types.py b/tests/unit/test_types.py
index a5bd028b26..56bad111eb 100644
--- a/tests/unit/test_types.py
+++ b/tests/unit/test_types.py
@@ -509,6 +509,75 @@ def test_deserialization_variable_size_too_big(self):
         with pytest.raises(ValueError, match="Additional bytes remaining after vector deserialization completed"):
             ctype_four.deserialize(ctype_five_bytes, 0)
 
+    def test_vector_cython_deserializer(self):
+        """
+        Test that VectorType uses the Cython DesVectorType deserializer
+        and correctly deserializes vectors of supported numeric types.
+
+        @since 3.x
+        @expected_result Cython deserializer exists and correctly deserializes vector data
+
+        @test_category data_types:vector
+        """
+        import struct
+        try:
+            from cassandra.deserializers import find_deserializer
+        except ImportError:
+            self.skipTest("Cython deserializers not available")
+
+        # Test float vector
+        vt_float = VectorType.apply_parameters(['FloatType', 4], {})
+        des_float = find_deserializer(vt_float)
+        self.assertEqual(des_float.__class__.__name__, 'DesVectorType')
+
+        data_float = struct.pack('>4f', 1.0, 2.0, 3.0, 4.0)
+        result_float = vt_float.deserialize(data_float, 5)
+        self.assertEqual(result_float, [1.0, 2.0, 3.0, 4.0])
+
+        # Test double vector
+        from cassandra.cqltypes import DoubleType
+        vt_double = VectorType.apply_parameters(['DoubleType', 3], {})
+        des_double = find_deserializer(vt_double)
+        self.assertEqual(des_double.__class__.__name__, 'DesVectorType')
+
+        data_double = struct.pack('>3d', 1.5, 2.5, 3.5)
+        result_double = vt_double.deserialize(data_double, 5)
+        self.assertEqual(result_double, [1.5, 2.5, 3.5])
+
+        # Test int32 vector
+        vt_int32 = VectorType.apply_parameters(['Int32Type', 4], {})
+        des_int32 = find_deserializer(vt_int32)
+        self.assertEqual(des_int32.__class__.__name__, 'DesVectorType')
+
+        data_int32 = struct.pack('>4i', 1, 2, 3, 4)
+        result_int32 = vt_int32.deserialize(data_int32, 5)
+        self.assertEqual(result_int32, [1, 2, 3, 4])
+
+        # Test int64/long vector
+        vt_int64 = VectorType.apply_parameters(['LongType', 2], {})
+        des_int64 = find_deserializer(vt_int64)
+        self.assertEqual(des_int64.__class__.__name__, 'DesVectorType')
+
+        data_int64 = struct.pack('>2q', 100, 200)
+        result_int64 = vt_int64.deserialize(data_int64, 5)
+        self.assertEqual(result_int64, [100, 200])
+
+        # Test int16/short vector
+        from cassandra.cqltypes import ShortType
+        vt_int16 = VectorType.apply_parameters(['ShortType', 3], {})
+        des_int16 = find_deserializer(vt_int16)
+        self.assertEqual(des_int16.__class__.__name__, 'DesVectorType')
+
+        data_int16 = struct.pack('>3h', 10, 20, 30)
+        result_int16 = vt_int16.deserialize(data_int16, 5)
+        self.assertEqual(result_int16, [10, 20, 30])
+
+        # Test error handling: wrong buffer size
+        with self.assertRaises(ValueError) as cm:
+            vt_float.deserialize(struct.pack('>3f', 1.0, 2.0, 3.0), 5)  # 3 floats instead of 4
+        self.assertIn('Expected vector', str(cm.exception))
+        self.assertIn('serialized size', str(cm.exception))
+
 
 ZERO = datetime.timedelta(0)
 

From 86949ed69b051fab00cfd61f8542d1fcf380b7cf Mon Sep 17 00:00:00 2001
From: Yaniv Michael Kaul <yaniv.kaul@scylladb.com>
Date: Thu, 5 Feb 2026 23:42:27 +0200
Subject: [PATCH 08/12] (improvement)Add VectorType support to numpy_parser for
 2D array parsing
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

Extend NumpyParser to handle VectorType columns by creating 2D NumPy
arrays (rows × vector_dimension) instead of object arrays. This enables
zero-copy parsing for vector embeddings in ML/AI workloads.

Features:
- Detects VectorType via vector_size and subtype attributes
- Creates 2D masked arrays for numeric vector subtypes (float, double,
  int32, int64, int16)
- Falls back to object arrays for unsupported vector subtypes
- Handles endianness conversion for both 1D and 2D arrays
- Pre-allocates result arrays for efficiency

Supported vector types:
- Vector<float> → 2D float32 array
- Vector<double> → 2D float64 array
- Vector<int> → 2D int32 array
- Vector<bigint> → 2D int64 array
- Vector<smallint> → 2D int16 array

Adds comprehensive test coverage for all supported vector types,
mixed column queries, and large vector dimensions (384-element embeddings).

Signed-off-by: Yaniv Kaul <yaniv.kaul@scylladb.com>
---
 cassandra/numpy_parser.pyx      |  21 ++-
 tests/unit/test_numpy_parser.py | 305 ++++++++++++++++++++++++++++++++
 2 files changed, 324 insertions(+), 2 deletions(-)
 create mode 100644 tests/unit/test_numpy_parser.py

diff --git a/cassandra/numpy_parser.pyx b/cassandra/numpy_parser.pyx
index 0ad34f66e2..de2f24c310 100644
--- a/cassandra/numpy_parser.pyx
+++ b/cassandra/numpy_parser.pyx
@@ -112,7 +112,7 @@ def make_arrays(ParseDesc desc, array_size):
             (e.g. this can be fed into pandas.DataFrame)
     """
     array_descs = np.empty((desc.rowsize,), arrDescDtype)
-    arrays = []
+    arrays = [None] * desc.rowsize
 
     for i, coltype in enumerate(desc.coltypes):
         arr = make_array(coltype, array_size)
@@ -123,7 +123,7 @@ def make_arrays(ParseDesc desc, array_size):
             array_descs[i]['mask_ptr'] = arr.mask.ctypes.data
         except AttributeError:
             array_descs[i]['mask_ptr'] = 0
-        arrays.append(arr)
+        arrays[i] = arr
 
     return array_descs, arrays
 
@@ -131,7 +131,23 @@ def make_arrays(ParseDesc desc, array_size):
 def make_array(coltype, array_size):
     """
     Allocate a new NumPy array of the given column type and size.
+    For VectorType, creates a 2D array (array_size x vector_dimension).
     """
+    # Check if this is a VectorType
+    if hasattr(coltype, 'vector_size') and hasattr(coltype, 'subtype'):
+        # VectorType - create 2D array (rows x vector_dimension)
+        vector_size = coltype.vector_size
+        subtype = coltype.subtype
+        try:
+            dtype = _cqltype_to_numpy[subtype]
+            a = np.ma.empty((array_size, vector_size), dtype=dtype)
+            a.mask = np.zeros((array_size, vector_size), dtype=bool)
+        except KeyError:
+            # Unsupported vector subtype - fall back to object array
+            a = np.empty((array_size,), dtype=obj_dtype)
+        return a
+
+    # Scalar types
     try:
         a = np.ma.empty((array_size,), dtype=_cqltype_to_numpy[coltype])
         a.mask = np.zeros((array_size,), dtype=bool)
@@ -174,6 +190,7 @@ cdef inline int unpack_row(
 def make_native_byteorder(arr):
     """
     Make sure all values have a native endian in the NumPy arrays.
+    Handles both 1D (scalar types) and 2D (VectorType) arrays.
     """
     if is_little_endian and not arr.dtype.kind == 'O':
         # We have arrays in big-endian order. First swap the bytes
diff --git a/tests/unit/test_numpy_parser.py b/tests/unit/test_numpy_parser.py
new file mode 100644
index 0000000000..0c9cb2e0ff
--- /dev/null
+++ b/tests/unit/test_numpy_parser.py
@@ -0,0 +1,305 @@
+# Copyright DataStax, Inc.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+# http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
+import struct
+import unittest
+from unittest.mock import Mock
+
+try:
+    import numpy as np
+    from cassandra.numpy_parser import NumpyParser
+    from cassandra.bytesio import BytesIOReader
+    from cassandra.parsing import ParseDesc
+    from cassandra.deserializers import obj_array
+    HAVE_NUMPY = True
+except ImportError:
+    HAVE_NUMPY = False
+
+from cassandra import cqltypes
+
+
+@unittest.skipUnless(HAVE_NUMPY, "NumPy not available")
+class TestNumpyParserVectorType(unittest.TestCase):
+    """Tests for VectorType support in NumpyParser"""
+
+    def _create_vector_type(self, subtype, vector_size):
+        """Helper to create a VectorType class"""
+        return type(
+            f'VectorType({vector_size})',
+            (cqltypes.VectorType,),
+            {'vector_size': vector_size, 'subtype': subtype}
+        )
+
+    def _serialize_vectors(self, vectors, format_char):
+        """Serialize a list of vectors using struct.pack"""
+        buffer = bytearray()
+        # Write row count
+        buffer.extend(struct.pack('>i', len(vectors)))
+        # Write each vector
+        for vector in vectors:
+            # Write byte size of vector (doesn't include size prefix in CQL)
+            byte_size = len(vector) * struct.calcsize(f'>{format_char}')
+            buffer.extend(struct.pack('>i', byte_size))
+            # Write vector elements
+            buffer.extend(struct.pack(f'>{len(vector)}{format_char}', *vector))
+        return bytes(buffer)
+
+    def test_vector_float_2d_array(self):
+        """Test that VectorType<float> creates and populates a 2D NumPy array"""
+        vector_size = 4
+        vector_type = self._create_vector_type(cqltypes.FloatType, vector_size)
+        
+        # Create test data: 3 rows of 4-dimensional float vectors
+        vectors = [
+            [1.0, 2.0, 3.0, 4.0],
+            [5.0, 6.0, 7.0, 8.0],
+            [9.0, 10.0, 11.0, 12.0],
+        ]
+        
+        # Serialize the data
+        serialized = self._serialize_vectors(vectors, 'f')
+        
+        # Parse with NumpyParser
+        parser = NumpyParser()
+        reader = BytesIOReader(serialized)
+        
+        desc = ParseDesc(
+            colnames=['vec'],
+            coltypes=[vector_type],
+            column_encryption_policy=None,
+            coldescs=None,
+            deserializers=obj_array([None]),
+            protocol_version=5
+        )
+        
+        result = parser.parse_rows(reader, desc)
+        
+        # Verify result structure
+        self.assertIn('vec', result)
+        arr = result['vec']
+        
+        # Verify it's a 2D array with correct shape
+        self.assertEqual(arr.ndim, 2)
+        self.assertEqual(arr.shape, (3, 4))
+        
+        # Verify the data
+        expected = np.array(vectors, dtype='<f4')  # little-endian after conversion
+        np.testing.assert_array_almost_equal(arr, expected)
+
+    def test_vector_double_2d_array(self):
+        """Test that VectorType<double> creates and populates a 2D NumPy array"""
+        vector_size = 3
+        vector_type = self._create_vector_type(cqltypes.DoubleType, vector_size)
+        
+        # Create test data: 2 rows of 3-dimensional double vectors
+        vectors = [
+            [1.5, 2.5, 3.5],
+            [4.5, 5.5, 6.5],
+        ]
+        
+        serialized = self._serialize_vectors(vectors, 'd')
+        
+        parser = NumpyParser()
+        reader = BytesIOReader(serialized)
+        
+        desc = ParseDesc(
+            colnames=['embedding'],
+            coltypes=[vector_type],
+            column_encryption_policy=None,
+            coldescs=None,
+            deserializers=obj_array([None]),
+            protocol_version=5
+        )
+        
+        result = parser.parse_rows(reader, desc)
+        
+        arr = result['embedding']
+        self.assertEqual(arr.shape, (2, 3))
+        
+        expected = np.array(vectors, dtype='<f8')
+        np.testing.assert_array_almost_equal(arr, expected)
+
+    def test_vector_int32_2d_array(self):
+        """Test that VectorType<int> creates and populates a 2D NumPy array"""
+        vector_size = 128
+        vector_type = self._create_vector_type(cqltypes.Int32Type, vector_size)
+        
+        # Create test data: 2 rows of 128-dimensional int vectors
+        vectors = [
+            list(range(0, 128)),
+            list(range(128, 256)),
+        ]
+        
+        serialized = self._serialize_vectors(vectors, 'i')
+        
+        parser = NumpyParser()
+        reader = BytesIOReader(serialized)
+        
+        desc = ParseDesc(
+            colnames=['features'],
+            coltypes=[vector_type],
+            column_encryption_policy=None,
+            coldescs=None,
+            deserializers=obj_array([None]),
+            protocol_version=5
+        )
+        
+        result = parser.parse_rows(reader, desc)
+        
+        arr = result['features']
+        self.assertEqual(arr.shape, (2, 128))
+        
+        expected = np.array(vectors, dtype='<i4')
+        np.testing.assert_array_equal(arr, expected)
+
+    def test_vector_int64_2d_array(self):
+        """Test that VectorType<bigint> creates and populates a 2D NumPy array"""
+        vector_size = 5
+        vector_type = self._create_vector_type(cqltypes.LongType, vector_size)
+        
+        vectors = [
+            [100, 200, 300, 400, 500],
+            [600, 700, 800, 900, 1000],
+        ]
+        
+        serialized = self._serialize_vectors(vectors, 'q')
+        
+        parser = NumpyParser()
+        reader = BytesIOReader(serialized)
+        
+        desc = ParseDesc(
+            colnames=['ids'],
+            coltypes=[vector_type],
+            column_encryption_policy=None,
+            coldescs=None,
+            deserializers=obj_array([None]),
+            protocol_version=5
+        )
+        
+        result = parser.parse_rows(reader, desc)
+        
+        arr = result['ids']
+        self.assertEqual(arr.shape, (2, 5))
+        
+        expected = np.array(vectors, dtype='<i8')
+        np.testing.assert_array_equal(arr, expected)
+
+    def test_vector_int16_2d_array(self):
+        """Test that VectorType<smallint> creates and populates a 2D NumPy array"""
+        vector_size = 8
+        vector_type = self._create_vector_type(cqltypes.ShortType, vector_size)
+        
+        vectors = [
+            [1, 2, 3, 4, 5, 6, 7, 8],
+            [9, 10, 11, 12, 13, 14, 15, 16],
+        ]
+        
+        serialized = self._serialize_vectors(vectors, 'h')
+        
+        parser = NumpyParser()
+        reader = BytesIOReader(serialized)
+        
+        desc = ParseDesc(
+            colnames=['small_vec'],
+            coltypes=[vector_type],
+            column_encryption_policy=None,
+            coldescs=None,
+            deserializers=obj_array([None]),
+            protocol_version=5
+        )
+        
+        result = parser.parse_rows(reader, desc)
+        
+        arr = result['small_vec']
+        self.assertEqual(arr.shape, (2, 8))
+        
+        expected = np.array(vectors, dtype='<i2')
+        np.testing.assert_array_equal(arr, expected)
+
+    def test_mixed_columns_with_vectors(self):
+        """Test parsing multiple columns including VectorType"""
+        vector_type = self._create_vector_type(cqltypes.FloatType, 3)
+        
+        # Serialize: int32 column, vector column
+        buffer = bytearray()
+        buffer.extend(struct.pack('>i', 2))  # row count
+        
+        # Row 1: id=1, vec=[1.0, 2.0, 3.0]
+        buffer.extend(struct.pack('>i', 4))    # int32 size
+        buffer.extend(struct.pack('>i', 1))    # id value
+        buffer.extend(struct.pack('>i', 12))   # vector size (3 floats)
+        buffer.extend(struct.pack('>3f', 1.0, 2.0, 3.0))
+        
+        # Row 2: id=2, vec=[4.0, 5.0, 6.0]
+        buffer.extend(struct.pack('>i', 4))
+        buffer.extend(struct.pack('>i', 2))
+        buffer.extend(struct.pack('>i', 12))
+        buffer.extend(struct.pack('>3f', 4.0, 5.0, 6.0))
+        
+        parser = NumpyParser()
+        reader = BytesIOReader(bytes(buffer))
+        
+        desc = ParseDesc(
+            colnames=['id', 'vec'],
+            coltypes=[cqltypes.Int32Type, vector_type],
+            column_encryption_policy=None,
+            coldescs=None,
+            deserializers=obj_array([None, None]),
+            protocol_version=5
+        )
+        
+        result = parser.parse_rows(reader, desc)
+        
+        # Verify id column (1D array)
+        self.assertEqual(result['id'].shape, (2,))
+        np.testing.assert_array_equal(result['id'], np.array([1, 2], dtype='<i4'))
+        
+        # Verify vec column (2D array)
+        self.assertEqual(result['vec'].shape, (2, 3))
+        expected_vecs = np.array([[1.0, 2.0, 3.0], [4.0, 5.0, 6.0]], dtype='<f4')
+        np.testing.assert_array_almost_equal(result['vec'], expected_vecs)
+
+    def test_large_vector_dimensions(self):
+        """Test VectorType with large dimensions (e.g., 384 for embeddings)"""
+        vector_size = 384
+        vector_type = self._create_vector_type(cqltypes.FloatType, vector_size)
+        
+        # Create one row with a 384-dimensional vector
+        vectors = [[float(i) for i in range(384)]]
+        
+        serialized = self._serialize_vectors(vectors, 'f')
+        
+        parser = NumpyParser()
+        reader = BytesIOReader(serialized)
+        
+        desc = ParseDesc(
+            colnames=['embedding'],
+            coltypes=[vector_type],
+            column_encryption_policy=None,
+            coldescs=None,
+            deserializers=obj_array([None]),
+            protocol_version=5
+        )
+        
+        result = parser.parse_rows(reader, desc)
+        
+        arr = result['embedding']
+        self.assertEqual(arr.shape, (1, 384))
+        
+        expected = np.array(vectors, dtype='<f4')
+        np.testing.assert_array_almost_equal(arr, expected)
+
+
+if __name__ == '__main__':
+    unittest.main()

From 30d5868ec16282e59007b3a1a7f45b9c79c366f0 Mon Sep 17 00:00:00 2001
From: Yaniv Michael Kaul <yaniv.kaul@scylladb.com>
Date: Sun, 8 Feb 2026 23:05:58 +0200
Subject: [PATCH 09/12] fix: Add Windows support for ntohl/ntohs with
 platform-specific headers

Replace POSIX-specific arpa/inet.h with conditional compilation that uses
winsock2.h on Windows and arpa/inet.h on POSIX systems.

This ensures the driver can be compiled on Windows without modification.

Changes:
- cassandra/cython_marshal.pyx: Add platform detection for ntohs/ntohl
- cassandra/ioutils.pyx: Add platform detection for ntohl

Signed-off-by: Yaniv Kaul <yaniv.kaul@scylladb.com>
---
 cassandra/cython_marshal.pyx | 14 +++++++++++---
 cassandra/ioutils.pyx        | 12 ++++++++++--
 2 files changed, 21 insertions(+), 5 deletions(-)

diff --git a/cassandra/cython_marshal.pyx b/cassandra/cython_marshal.pyx
index 0457f13094..ac07b6378f 100644
--- a/cassandra/cython_marshal.pyx
+++ b/cassandra/cython_marshal.pyx
@@ -20,9 +20,17 @@ from libc.string cimport memcpy
 from cassandra.buffer cimport Buffer, buf_read, to_bytes
 
 # Use ntohs/ntohl for efficient big-endian to native conversion (single bswap instruction on x86)
-cdef extern from "arpa/inet.h" nogil:
-    uint16_t ntohs(uint16_t netshort)
-    uint32_t ntohl(uint32_t netlong)
+# Platform-specific header: arpa/inet.h on POSIX, winsock2.h on Windows
+cdef extern from *:
+    """
+    #ifdef _WIN32
+        #include <winsock2.h>
+    #else
+        #include <arpa/inet.h>
+    #endif
+    """
+    uint16_t ntohs(uint16_t netshort) nogil
+    uint32_t ntohl(uint32_t netlong) nogil
 
 cdef bint is_little_endian
 from cassandra.util import is_little_endian
diff --git a/cassandra/ioutils.pyx b/cassandra/ioutils.pyx
index ca8d7d50dc..530377d528 100644
--- a/cassandra/ioutils.pyx
+++ b/cassandra/ioutils.pyx
@@ -19,8 +19,16 @@ from libc.stdint cimport int32_t, uint32_t
 from cassandra.bytesio cimport BytesIOReader
 
 # Use ntohl for efficient big-endian to native conversion (single bswap instruction)
-cdef extern from "arpa/inet.h" nogil:
-    uint32_t ntohl(uint32_t netlong)
+# Platform-specific header: arpa/inet.h on POSIX, winsock2.h on Windows
+cdef extern from *:
+    """
+    #ifdef _WIN32
+        #include <winsock2.h>
+    #else
+        #include <arpa/inet.h>
+    #endif
+    """
+    uint32_t ntohl(uint32_t netlong) nogil
 
 
 cdef inline int get_buf(BytesIOReader reader, Buffer *buf_out) except -1:

From 5f2ecbfb8da4eaae26342e1c6edbe49f26a89c83 Mon Sep 17 00:00:00 2001
From: Yaniv Michael Kaul <yaniv.kaul@scylladb.com>
Date: Sun, 8 Feb 2026 23:28:23 +0200
Subject: [PATCH 10/12] fix: Add buffer bounds validation to Cython
 deserializers

Add bounds checking to prevent buffer overruns and properly
handle CQL protocol value semantics in deserializers.

Changes:
- subelem(): Add bounds validation with protocol-compliant value handling
  * Happy path: Check elemlen >= 0 and offset + elemlen <= buf.size
  * Support NULL values (elemlen == -1) per CQL protocol
  * Support "not set" values (elemlen == -2) per CQL protocol
  * Reject invalid values (elemlen < -2) with clear error message

- _unpack_len(): Add bounds check before reading int32 length field
  * Validates offset + 4 <= buf.size before pointer dereference
  * Prevents reading beyond buffer boundaries

- DesTupleType: Add defensive bounds checking for tuple deserialization
  * Check p + 4 <= buf.size before reading item length
  * Check p + itemlen <= buf.size before reading item data
  * Explicit NULL value handling (itemlen < 0)
  * Clear error messages for buffer overruns

- DesCompositeType: Add bounds validation for composite type elements
  * Check 2 + element_length + 1 <= buf.size (length + data + EOC byte)
  * Prevents buffer overrun when reading composite elements

- DesVectorType._deserialize_generic(): Add size validation
  * Verify buf.size == expected_size before processing
  * Provides clear error message with expected vs actual sizes

Protocol specification reference:
  [value] = [int] n, followed by n bytes if n >= 0
            n == -1: NULL value
            n == -2: not set value
            n < -2: invalid (error)

Signed-off-by: Yaniv Kaul <yaniv.kaul@scylladb.com>
---
 benchmarks/vector_deserialize.py | 11 +---
 cassandra/cqltypes.py            |  5 --
 cassandra/deserializers.pyx      | 94 ++++++++++++++++++--------------
 3 files changed, 56 insertions(+), 54 deletions(-)

diff --git a/benchmarks/vector_deserialize.py b/benchmarks/vector_deserialize.py
index 2bbebedf0d..64e1992be7 100644
--- a/benchmarks/vector_deserialize.py
+++ b/benchmarks/vector_deserialize.py
@@ -32,8 +32,8 @@
 # Add parent directory to path
 sys.path.insert(0, '.')
 
-from cassandra.cqltypes import FloatType, DoubleType, Int32Type, LongType, ShortType
-from cassandra.marshal import float_pack, double_pack, int32_pack, int64_pack, int16_pack
+from cassandra.cqltypes import FloatType, DoubleType, Int32Type, LongType
+from cassandra.marshal import float_pack, double_pack, int32_pack, int64_pack
 
 
 def create_test_data(vector_size, element_type):
@@ -50,9 +50,6 @@ def create_test_data(vector_size, element_type):
     elif element_type == LongType:
         values = list(range(vector_size))
         pack_fn = int64_pack
-    elif element_type == ShortType:
-        values = list(range(min(vector_size, 32767)))
-        pack_fn = int16_pack
     else:
         raise ValueError(f"Unsupported element type: {element_type}")
 
@@ -91,8 +88,6 @@ def benchmark_struct_optimization(vector_type, serialized_data, iterations=10000
         format_str = f'>{vector_size}i'
     elif subtype is LongType or (isinstance(subtype, type) and issubclass(subtype, LongType)):
         format_str = f'>{vector_size}q'
-    elif subtype is ShortType or (isinstance(subtype, type) and issubclass(subtype, ShortType)):
-        format_str = f'>{vector_size}h'
     else:
         return None, None, None
 
@@ -126,8 +121,6 @@ def benchmark_numpy_optimization(vector_type, serialized_data, iterations=10000)
         dtype = '>i4'
     elif subtype is LongType or (isinstance(subtype, type) and issubclass(subtype, LongType)):
         dtype = '>i8'
-    elif subtype is ShortType or (isinstance(subtype, type) and issubclass(subtype, ShortType)):
-        dtype = '>i2'
     else:
         return None, None, None
 
diff --git a/cassandra/cqltypes.py b/cassandra/cqltypes.py
index e113740ad4..e52fb528f6 100644
--- a/cassandra/cqltypes.py
+++ b/cassandra/cqltypes.py
@@ -1479,11 +1479,6 @@ def deserialize(cls, byts, protocol_version):
                 if use_numpy:
                     return np.frombuffer(byts, dtype='>i8', count=cls.vector_size).tolist()
                 return list(struct.unpack(f'>{cls.vector_size}q', byts))
-            elif cls.subtype is ShortType or (isinstance(cls.subtype, type) and issubclass(cls.subtype, ShortType)):
-                if use_numpy:
-                    return np.frombuffer(byts, dtype='>i2', count=cls.vector_size).tolist()
-                return list(struct.unpack(f'>{cls.vector_size}h', byts))
-
             # Fallback: element-by-element deserialization for other fixed-size types
             result = [None] * cls.vector_size
             subtype_deserialize = cls.subtype.deserialize
diff --git a/cassandra/deserializers.pyx b/cassandra/deserializers.pyx
index 983a27c512..f9b2664857 100644
--- a/cassandra/deserializers.pyx
+++ b/cassandra/deserializers.pyx
@@ -203,9 +203,6 @@ cdef inline bint _is_int32_type(object subtype):
 cdef inline bint _is_int64_type(object subtype):
     return subtype is cqltypes.LongType or issubclass(subtype, cqltypes.LongType)
 
-cdef inline bint _is_int16_type(object subtype):
-    return subtype is cqltypes.ShortType or issubclass(subtype, cqltypes.ShortType)
-
 cdef inline list _deserialize_numpy_vector(Buffer *buf, int vector_size, str dtype):
     """Unified numpy deserialization for large vectors"""
     return np.frombuffer(buf.ptr[:buf.size], dtype=dtype, count=vector_size).tolist()
@@ -279,16 +276,6 @@ cdef class DesVectorType(Deserializer):
             raise ValueError(
                 f"Expected vector of type {self.subtype.typename} and dimension {self.vector_size} "
                 f"to have serialized size {expected_size}; observed serialized size of {buf.size} instead")
-        elif _is_int16_type(self.subtype):
-            elem_size = 2
-            expected_size = self.vector_size * elem_size
-            if buf.size == expected_size:
-                if use_numpy:
-                    return _deserialize_numpy_vector(buf, self.vector_size, '>i2')
-                return self._deserialize_int16(buf)
-            raise ValueError(
-                f"Expected vector of type {self.subtype.typename} and dimension {self.vector_size} "
-                f"to have serialized size {expected_size}; observed serialized size of {buf.size} instead")
         else:
             # Unsupported type, use generic deserialization
             return self._deserialize_generic(buf, protocol_version)
@@ -372,19 +359,6 @@ cdef class DesVectorType(Deserializer):
 
         return result
 
-    cdef inline list _deserialize_int16(self, Buffer *buf):
-        """Deserialize int16/short vector using direct C-level access with ntohs"""
-        cdef Py_ssize_t i
-        cdef list result
-        cdef int16_t temp
-
-        result = [None] * self.vector_size
-        for i in range(self.vector_size):
-            temp = <int16_t>ntohs((<uint16_t*>(buf.ptr + i * 2))[0])
-            result[i] = temp
-
-        return result
-
     cdef inline list _deserialize_generic(self, Buffer *buf, int protocol_version):
         """Fallback: element-by-element deserialization for non-optimized types"""
         cdef Py_ssize_t i
@@ -398,6 +372,13 @@ cdef class DesVectorType(Deserializer):
                 f"VectorType with variable-size subtype {self.subtype.typename} "
                 "is not supported in Cython deserializer")
 
+        # Validate total size before processing
+        cdef int expected_size = self.vector_size * serialized_size
+        if buf.size != expected_size:
+            raise ValueError(
+                f"Expected vector of type {self.subtype.typename} and dimension {self.vector_size} "
+                f"to have serialized size {expected_size}; observed serialized size of {buf.size} instead")
+
         for i in range(self.vector_size):
             from_ptr_and_size(buf.ptr + offset, serialized_size, &elem_buf)
             result[i] = self.subtype.deserialize(to_bytes(&elem_buf), protocol_version)
@@ -473,18 +454,37 @@ cdef inline int subelem(
     Read the next element from the buffer: first read the size (in bytes) of the
     element, then fill elem_buf with a newly sliced buffer of this size (and the
     right offset).
+
+    Protocol: n >= 0: n bytes follow
+              n == -1: NULL value
+              n == -2: not set value
+              n < -2: invalid
     """
     cdef int32_t elemlen
 
     _unpack_len(buf, offset[0], &elemlen)
     offset[0] += sizeof(int32_t)
-    from_ptr_and_size(buf.ptr + offset[0], elemlen, elem_buf)
-    offset[0] += elemlen
-    return 0
+
+    # Happy path: non-negative length element that fits in buffer
+    if elemlen >= 0:
+        if offset[0] + elemlen <= buf.size:
+            from_ptr_and_size(buf.ptr + offset[0], elemlen, elem_buf)
+            offset[0] += elemlen
+            return 0
+        raise IndexError("Element length %d at offset %d exceeds buffer size %d" % (elemlen, offset[0], buf.size))
+    # NULL value (-1) or not set value (-2)
+    elif elemlen == -1 or elemlen == -2:
+        from_ptr_and_size(NULL, elemlen, elem_buf)
+        return 0
+    # Invalid value (n < -2)
+    else:
+        raise ValueError("Invalid element length %d at offset %d" % (elemlen, offset[0]))
 
 
 cdef inline int _unpack_len(Buffer *buf, int offset, int32_t *output) except -1:
     """Read a big-endian int32 at the given offset using direct pointer access."""
+    if offset + sizeof(int32_t) > buf.size:
+        raise IndexError("Cannot read length field: offset %d + 4 exceeds buffer size %d" % (offset, buf.size))
     cdef uint32_t *src = <uint32_t*>(buf.ptr + offset)
     output[0] = <int32_t>ntohl(src[0])
     return 0
@@ -556,16 +556,24 @@ cdef class DesTupleType(_DesParameterizedType):
         values = []
         for i in range(self.subtypes_len):
             item = None
-            if p < buf.size:
+            if p + 4 <= buf.size:
                 # Read itemlen directly using ntohl instead of slice_buffer
                 itemlen = <int32_t>ntohl((<uint32_t*>(buf.ptr + p))[0])
                 p += 4
-                if itemlen >= 0:
+
+                if itemlen >= 0 and p + itemlen <= buf.size:
                     from_ptr_and_size(buf.ptr + p, itemlen, &item_buf)
                     p += itemlen
 
                     deserializer = self.deserializers[i]
                     item = from_binary(deserializer, &item_buf, protocol_version)
+                elif itemlen < 0:
+                    # NULL value, item stays None
+                    pass
+                else:
+                    raise IndexError("Tuple item length %d at offset %d exceeds buffer size %d" % (itemlen, p, buf.size))
+            elif p < buf.size:
+                raise IndexError("Cannot read tuple item length at offset %d: only %d bytes remain" % (p, buf.size - p))
 
             tuple_set(res, i, item)
 
@@ -607,17 +615,23 @@ cdef class DesCompositeType(_DesParameterizedType):
                 break
 
             element_length = unpack_num[uint16_t](buf)
-            from_ptr_and_size(buf.ptr + 2, element_length, &elem_buf)
 
-            deserializer = self.deserializers[i]
-            item = from_binary(deserializer, &elem_buf, protocol_version)
-            tuple_set(res, i, item)
+            # Validate that we have enough data for the element and EOC byte (happy path check)
+            if 2 + element_length + 1 <= buf.size:
+                from_ptr_and_size(buf.ptr + 2, element_length, &elem_buf)
+
+                deserializer = self.deserializers[i]
+                item = from_binary(deserializer, &elem_buf, protocol_version)
+                tuple_set(res, i, item)
 
-            # skip element length, element, and the EOC (one byte)
-            # Advance buffer in-place with direct assignment
-            start = 2 + element_length + 1
-            buf.ptr = buf.ptr + start
-            buf.size = buf.size - start
+                # skip element length, element, and the EOC (one byte)
+                # Advance buffer in-place with direct assignment
+                start = 2 + element_length + 1
+                buf.ptr = buf.ptr + start
+                buf.size = buf.size - start
+            else:
+                raise IndexError("Composite element length %d requires %d bytes but only %d remain" %
+                                (element_length, 2 + element_length + 1, buf.size))
 
         return res
 

From bead89be174c928d99a8ca38ced41923e3511d56 Mon Sep 17 00:00:00 2001
From: Yaniv Michael Kaul <yaniv.kaul@scylladb.com>
Date: Mon, 9 Feb 2026 16:05:04 +0200
Subject: [PATCH 11/12] benchmarks: expand vector sizes

---
 benchmarks/vector_deserialize.py | 3 +++
 1 file changed, 3 insertions(+)

diff --git a/benchmarks/vector_deserialize.py b/benchmarks/vector_deserialize.py
index 64e1992be7..e1a17f07d4 100644
--- a/benchmarks/vector_deserialize.py
+++ b/benchmarks/vector_deserialize.py
@@ -288,6 +288,9 @@ def main():
 
         # Other types (smaller iteration counts)
         (128, DoubleType, "double", 10000),
+        (768, DoubleType, "double", 5000),
+        (1536, DoubleType, "double", 2000),
+        (64, Int32Type, "int", 15000),
         (128, Int32Type, "int", 10000),
     ]
 

From b56fa6def3343e989e7065f97d0d1df327dd9d82 Mon Sep 17 00:00:00 2001
From: Yaniv Michael Kaul <yaniv.kaul@scylladb.com>
Date: Mon, 9 Feb 2026 17:56:40 +0200
Subject: [PATCH 12/12] tests: enable vector integration tests on Scylla
 2025.4+

Vector type is supported on Scylla 2025.4 and above.
Enable the integration tests.

Tested locally against both 2025.4.2 and 2026.1 and they pass.

Signed-off-by: Yaniv Kaul <yaniv.kaul@scylladb.com>
---
 tests/integration/__init__.py            | 11 +++++++++++
 tests/integration/standard/test_types.py |  5 +++--
 2 files changed, 14 insertions(+), 2 deletions(-)

diff --git a/tests/integration/__init__.py b/tests/integration/__init__.py
index b4eab35875..f6cf96ec0a 100644
--- a/tests/integration/__init__.py
+++ b/tests/integration/__init__.py
@@ -279,6 +279,17 @@ def xfail_scylla_version(filter: Callable[[Version], bool], reason: str, *args,
 greaterthanorequalcass3_11 = unittest.skipUnless(CASSANDRA_VERSION >= Version('3.11'), 'Cassandra version 3.11 or greater required')
 greaterthanorequalcass40 = unittest.skipUnless(CASSANDRA_VERSION >= Version('4.0'), 'Cassandra version 4.0 or greater required')
 greaterthanorequalcass50 = unittest.skipUnless(CASSANDRA_VERSION >= Version('5.0-beta'), 'Cassandra version 5.0 or greater required')
+def _is_cass50_or_scylla_2025_4_plus():
+    if CASSANDRA_VERSION >= Version('5.0-beta'):
+        return True
+    if SCYLLA_VERSION is None:
+        return False
+    return Version(get_scylla_version(SCYLLA_VERSION)) >= Version('2025.4')
+
+greaterthanorequalcass50_or_scylla_2025_4 = unittest.skipUnless(
+    _is_cass50_or_scylla_2025_4_plus(),
+    'Cassandra >= 5.0 or Scylla >= 2025.4 required'
+)
 lessthanorequalcass40 = unittest.skipUnless(CASSANDRA_VERSION <= Version('4.0'), 'Cassandra version less or equal to 4.0 required')
 lessthancass40 = unittest.skipUnless(CASSANDRA_VERSION < Version('4.0'), 'Cassandra version less than 4.0 required')
 lessthancass30 = unittest.skipUnless(CASSANDRA_VERSION < Version('3.0'), 'Cassandra version less then 3.0 required')
diff --git a/tests/integration/standard/test_types.py b/tests/integration/standard/test_types.py
index ad69fbada9..4a051fd12a 100644
--- a/tests/integration/standard/test_types.py
+++ b/tests/integration/standard/test_types.py
@@ -40,7 +40,8 @@
 
 from tests.integration import use_singledc, execute_until_pass, notprotocolv1, \
     BasicSharedKeyspaceUnitTestCase, greaterthancass21, lessthancass30, \
-    greaterthanorequalcass3_10, TestCluster, requires_composite_type, greaterthanorequalcass50
+    greaterthanorequalcass3_10, TestCluster, requires_composite_type, \
+    greaterthanorequalcass50_or_scylla_2025_4
 from tests.integration.datatype_utils import update_datatypes, PRIMITIVE_DATATYPES, COLLECTION_TYPES, PRIMITIVE_DATATYPES_KEYS, \
     get_sample, get_all_samples, get_collection_sample
 import pytest
@@ -984,7 +985,7 @@ def run_inserts_at_version(self, proto_ver):
         finally:
             session.cluster.shutdown()
 
-@greaterthanorequalcass50
+@greaterthanorequalcass50_or_scylla_2025_4
 class TypeTestsVector(BasicSharedKeyspaceUnitTestCase):
 
     def _get_first_j(self, rs):