From ba24e9df6bd72224779d1a45ceb1fb1660ef9ca2 Mon Sep 17 00:00:00 2001 From: Frederic Branczyk Date: Wed, 14 Jan 2026 14:44:20 +0100 Subject: [PATCH 1/9] arrow-row: Add ListView support --- arrow-row/src/lib.rs | 233 +++++++++++++++++++++++++++++++++++++++++- arrow-row/src/list.rs | 169 +++++++++++++++++++++++++++++- 2 files changed, 396 insertions(+), 6 deletions(-) diff --git a/arrow-row/src/lib.rs b/arrow-row/src/lib.rs index 1da0439ee9b5..4e8e87e6679e 100644 --- a/arrow-row/src/lib.rs +++ b/arrow-row/src/lib.rs @@ -179,6 +179,39 @@ use crate::variable::{decode_binary, decode_string}; use arrow_array::types::{Int16Type, Int32Type, Int64Type}; mod fixed; + +/// Computes the minimum offset and maximum end (offset + size) for a ListView array. +/// Returns (min_offset, max_end) which can be used to slice the values array. +fn compute_list_view_bounds(array: &GenericListViewArray) -> (usize, usize) { + if array.is_empty() { + return (0, 0); + } + + let offsets = array.value_offsets(); + let sizes = array.value_sizes(); + + let mut min_offset = usize::MAX; + let mut max_end = 0usize; + + for i in 0..array.len() { + let offset = offsets[i].as_usize(); + let size = sizes[i].as_usize(); + let end = offset + size; + + if size > 0 { + min_offset = min_offset.min(offset); + max_end = max_end.max(end); + } + } + + if min_offset == usize::MAX { + // All lists are empty + (0, 0) + } else { + (min_offset, max_end) + } +} + mod list; mod run; mod variable; @@ -535,7 +568,10 @@ impl Codec { Ok(Self::RunEndEncoded(converter)) } d if !d.is_nested() => Ok(Self::Stateless), - DataType::List(f) | DataType::LargeList(f) => { + DataType::List(f) + | DataType::LargeList(f) + | DataType::ListView(f) + | DataType::LargeListView(f) => { // The encoded contents will be inverted if descending is set to true // As such we set `descending` to false and negate nulls first if it // it set to true @@ -646,6 +682,20 @@ impl Codec { .values() .slice(first_offset, last_offset - first_offset) } + 
DataType::ListView(_) => { + let list_view_array = array.as_list_view::(); + let (min_offset, max_end) = compute_list_view_bounds(list_view_array); + list_view_array + .values() + .slice(min_offset, max_end - min_offset) + } + DataType::LargeListView(_) => { + let list_view_array = array.as_list_view::(); + let (min_offset, max_end) = compute_list_view_bounds(list_view_array); + list_view_array + .values() + .slice(min_offset, max_end - min_offset) + } DataType::FixedSizeList(_, _) => { as_fixed_size_list_array(array).values().clone() } @@ -783,9 +833,11 @@ impl RowConverter { fn supports_datatype(d: &DataType) -> bool { match d { _ if !d.is_nested() => true, - DataType::List(f) | DataType::LargeList(f) | DataType::FixedSizeList(f, _) => { - Self::supports_datatype(f.data_type()) - } + DataType::List(f) + | DataType::LargeList(f) + | DataType::ListView(f) + | DataType::LargeListView(f) + | DataType::FixedSizeList(f, _) => Self::supports_datatype(f.data_type()), DataType::Struct(f) => f.iter().all(|x| Self::supports_datatype(x.data_type())), DataType::RunEndEncoded(_, values) => Self::supports_datatype(values.data_type()), DataType::Union(fs, _mode) => fs @@ -1603,6 +1655,26 @@ fn row_lengths(cols: &[ArrayRef], encoders: &[Encoder]) -> LengthTracker { DataType::LargeList(_) => { list::compute_lengths(tracker.materialized(), rows, as_large_list_array(array)) } + DataType::ListView(_) => { + let list_view = array.as_list_view::(); + let (min_offset, _) = compute_list_view_bounds(list_view); + list::compute_lengths_list_view( + tracker.materialized(), + rows, + list_view, + min_offset, + ) + } + DataType::LargeListView(_) => { + let list_view = array.as_list_view::(); + let (min_offset, _) = compute_list_view_bounds(list_view); + list::compute_lengths_list_view( + tracker.materialized(), + rows, + list_view, + min_offset, + ) + } DataType::FixedSizeList(_, _) => compute_lengths_fixed_size_list( &mut tracker, rows, @@ -1796,6 +1868,16 @@ fn encode_column( 
DataType::LargeList(_) => { list::encode(data, offsets, rows, opts, as_large_list_array(column)) } + DataType::ListView(_) => { + let list_view = column.as_list_view::(); + let (min_offset, _) = compute_list_view_bounds(list_view); + list::encode_list_view(data, offsets, rows, opts, list_view, min_offset) + } + DataType::LargeListView(_) => { + let list_view = column.as_list_view::(); + let (min_offset, _) = compute_list_view_bounds(list_view); + list::encode_list_view(data, offsets, rows, opts, list_view, min_offset) + } DataType::FixedSizeList(_, _) => { encode_fixed_size_list(data, offsets, rows, opts, as_fixed_size_list_array(column)) } @@ -1945,6 +2027,12 @@ unsafe fn decode_column( DataType::LargeList(_) => { Arc::new(unsafe { list::decode::(converter, rows, field, validate_utf8) }?) } + DataType::ListView(_) => Arc::new(unsafe { + list::decode_list_view::(converter, rows, field, validate_utf8) + }?), + DataType::LargeListView(_) => Arc::new(unsafe { + list::decode_list_view::(converter, rows, field, validate_utf8) + }?), DataType::FixedSizeList(_, value_length) => Arc::new(unsafe { list::decode_fixed_size_list( converter, @@ -3132,6 +3220,143 @@ mod tests { test_nested_list::(); } + fn test_single_list_view() { + let mut builder = GenericListViewBuilder::::new(Int32Builder::new()); + builder.values().append_value(32); + builder.values().append_value(52); + builder.values().append_value(32); + builder.append(true); + builder.values().append_value(32); + builder.values().append_value(52); + builder.values().append_value(12); + builder.append(true); + builder.values().append_value(32); + builder.values().append_value(52); + builder.append(true); + builder.values().append_value(32); // MASKED + builder.values().append_value(52); // MASKED + builder.append(false); + builder.values().append_value(32); + builder.values().append_null(); + builder.append(true); + builder.append(true); + builder.values().append_value(17); // MASKED + builder.values().append_null(); // 
MASKED + builder.append(false); + + let list = Arc::new(builder.finish()) as ArrayRef; + let d = list.data_type().clone(); + + let converter = RowConverter::new(vec![SortField::new(d.clone())]).unwrap(); + + let rows = converter.convert_columns(&[Arc::clone(&list)]).unwrap(); + assert!(rows.row(0) > rows.row(1)); // [32, 52, 32] > [32, 52, 12] + assert!(rows.row(2) < rows.row(1)); // [32, 52] < [32, 52, 12] + assert!(rows.row(3) < rows.row(2)); // null < [32, 52] + assert!(rows.row(4) < rows.row(2)); // [32, null] < [32, 52] + assert!(rows.row(5) < rows.row(2)); // [] < [32, 52] + assert!(rows.row(3) < rows.row(5)); // null < [] + assert_eq!(rows.row(3), rows.row(6)); // null = null (different masked values) + + let back = converter.convert_rows(&rows).unwrap(); + assert_eq!(back.len(), 1); + back[0].to_data().validate_full().unwrap(); + + // Verify the content matches (ListView may have different physical layout but same logical content) + let back_list_view = back[0] + .as_any() + .downcast_ref::>() + .unwrap(); + let orig_list_view = list + .as_any() + .downcast_ref::>() + .unwrap(); + + assert_eq!(back_list_view.len(), orig_list_view.len()); + for i in 0..back_list_view.len() { + assert_eq!(back_list_view.is_valid(i), orig_list_view.is_valid(i)); + if back_list_view.is_valid(i) { + assert_eq!(&back_list_view.value(i), &orig_list_view.value(i)); + } + } + + let options = SortOptions::default().asc().with_nulls_first(false); + let field = SortField::new_with_options(d.clone(), options); + let converter = RowConverter::new(vec![field]).unwrap(); + let rows = converter.convert_columns(&[Arc::clone(&list)]).unwrap(); + + assert!(rows.row(0) > rows.row(1)); // [32, 52, 32] > [32, 52, 12] + assert!(rows.row(2) < rows.row(1)); // [32, 52] < [32, 52, 12] + assert!(rows.row(3) > rows.row(2)); // null > [32, 52] + assert!(rows.row(4) > rows.row(2)); // [32, null] > [32, 52] + assert!(rows.row(5) < rows.row(2)); // [] < [32, 52] + assert!(rows.row(3) > rows.row(5)); // 
null > [] + assert_eq!(rows.row(3), rows.row(6)); // null = null (different masked values) + + let back = converter.convert_rows(&rows).unwrap(); + assert_eq!(back.len(), 1); + back[0].to_data().validate_full().unwrap(); + + let options = SortOptions::default().desc().with_nulls_first(false); + let field = SortField::new_with_options(d.clone(), options); + let converter = RowConverter::new(vec![field]).unwrap(); + let rows = converter.convert_columns(&[Arc::clone(&list)]).unwrap(); + + assert!(rows.row(0) < rows.row(1)); // [32, 52, 32] < [32, 52, 12] + assert!(rows.row(2) > rows.row(1)); // [32, 52] > [32, 52, 12] + assert!(rows.row(3) > rows.row(2)); // null > [32, 52] + assert!(rows.row(4) > rows.row(2)); // [32, null] > [32, 52] + assert!(rows.row(5) > rows.row(2)); // [] > [32, 52] + assert!(rows.row(3) > rows.row(5)); // null > [] + assert_eq!(rows.row(3), rows.row(6)); // null = null (different masked values) + + let back = converter.convert_rows(&rows).unwrap(); + assert_eq!(back.len(), 1); + back[0].to_data().validate_full().unwrap(); + + let options = SortOptions::default().desc().with_nulls_first(true); + let field = SortField::new_with_options(d, options); + let converter = RowConverter::new(vec![field]).unwrap(); + let rows = converter.convert_columns(&[Arc::clone(&list)]).unwrap(); + + assert!(rows.row(0) < rows.row(1)); // [32, 52, 32] < [32, 52, 12] + assert!(rows.row(2) > rows.row(1)); // [32, 52] > [32, 52, 12] + assert!(rows.row(3) < rows.row(2)); // null < [32, 52] + assert!(rows.row(4) < rows.row(2)); // [32, null] < [32, 52] + assert!(rows.row(5) > rows.row(2)); // [] > [32, 52] + assert!(rows.row(3) < rows.row(5)); // null < [] + assert_eq!(rows.row(3), rows.row(6)); // null = null (different masked values) + + let back = converter.convert_rows(&rows).unwrap(); + assert_eq!(back.len(), 1); + back[0].to_data().validate_full().unwrap(); + + let sliced_list = list.slice(1, 5); + let rows_on_sliced_list = converter + 
.convert_columns(&[Arc::clone(&sliced_list)]) + .unwrap(); + + assert!(rows_on_sliced_list.row(1) > rows_on_sliced_list.row(0)); // [32, 52] > [32, 52, 12] + assert!(rows_on_sliced_list.row(2) < rows_on_sliced_list.row(1)); // null < [32, 52] + assert!(rows_on_sliced_list.row(3) < rows_on_sliced_list.row(1)); // [32, null] < [32, 52] + assert!(rows_on_sliced_list.row(4) > rows_on_sliced_list.row(1)); // [] > [32, 52] + assert!(rows_on_sliced_list.row(2) < rows_on_sliced_list.row(4)); // null < [] + + let back = converter.convert_rows(&rows_on_sliced_list).unwrap(); + assert_eq!(back.len(), 1); + back[0].to_data().validate_full().unwrap(); + } + + #[test] + fn test_list_view() { + test_single_list_view::(); + } + + #[test] + fn test_large_list_view() { + test_single_list_view::(); + } + #[test] fn test_fixed_size_list() { let mut builder = FixedSizeListBuilder::new(Int32Builder::new(), 3); diff --git a/arrow-row/src/list.rs b/arrow-row/src/list.rs index 6e552b0a93b9..faefa7f0b1ea 100644 --- a/arrow-row/src/list.rs +++ b/arrow-row/src/list.rs @@ -16,8 +16,13 @@ // under the License. 
use crate::{LengthTracker, RowConverter, Rows, SortField, fixed, null_sentinel}; -use arrow_array::{Array, FixedSizeListArray, GenericListArray, OffsetSizeTrait, new_null_array}; -use arrow_buffer::{ArrowNativeType, Buffer, MutableBuffer}; +use arrow_array::{ + Array, FixedSizeListArray, GenericListArray, GenericListViewArray, OffsetSizeTrait, + new_null_array, +}; +use arrow_buffer::{ + ArrowNativeType, BooleanBuffer, Buffer, MutableBuffer, NullBuffer, ScalarBuffer, +}; use arrow_data::ArrayDataBuilder; use arrow_schema::{ArrowError, DataType, SortOptions}; use std::{ops::Range, sync::Arc}; @@ -323,3 +328,163 @@ pub unsafe fn decode_fixed_size_list( builder.build_unchecked() })) } + +/// Computes the encoded lengths for a `GenericListViewArray` +/// +/// `rows` should contain the encoded child elements +pub fn compute_lengths_list_view( + lengths: &mut [usize], + rows: &Rows, + array: &GenericListViewArray, + shift: usize, +) { + let offsets = array.value_offsets(); + let sizes = array.value_sizes(); + + lengths.iter_mut().enumerate().for_each(|(idx, length)| { + let start = offsets[idx].as_usize() - shift; + let size = sizes[idx].as_usize(); + let range = array.is_valid(idx).then_some(start..start + size); + *length += encoded_len(rows, range); + }); +} + +/// Encodes the provided `GenericListViewArray` to `out` with the provided `SortOptions` +/// +/// `rows` should contain the encoded child elements +pub fn encode_list_view( + data: &mut [u8], + out_offsets: &mut [usize], + rows: &Rows, + opts: SortOptions, + array: &GenericListViewArray, + shift: usize, +) { + let offsets = array.value_offsets(); + let sizes = array.value_sizes(); + + out_offsets + .iter_mut() + .skip(1) + .enumerate() + .for_each(|(idx, offset)| { + let start = offsets[idx].as_usize() - shift; + let size = sizes[idx].as_usize(); + let range = array.is_valid(idx).then_some(start..start + size); + let out = &mut data[*offset..]; + *offset += encode_one(out, rows, range, opts) + }); +} + +/// 
Decodes a `GenericListViewArray` from `rows` with the provided `options` +/// +/// # Safety +/// +/// `rows` must contain valid data for the provided `converter` +pub unsafe fn decode_list_view( + converter: &RowConverter, + rows: &mut [&[u8]], + field: &SortField, + validate_utf8: bool, +) -> Result, ArrowError> { + let opts = field.options; + + let mut values_bytes = 0; + + let mut child_count = 0usize; + let mut list_sizes: Vec = Vec::with_capacity(rows.len()); + + // First pass: count children and compute sizes + for row in rows.iter_mut() { + let mut row_offset = 0; + let mut list_size = 0usize; + loop { + let decoded = super::variable::decode_blocks(&row[row_offset..], opts, |x| { + values_bytes += x.len(); + }); + if decoded <= 1 { + list_sizes.push(O::usize_as(list_size)); + break; + } + row_offset += decoded; + child_count += 1; + list_size += 1; + } + } + O::from_usize(child_count).expect("overflow"); + + let mut null_count = 0; + let nulls = MutableBuffer::collect_bool(rows.len(), |x| { + let valid = rows[x][0] != null_sentinel(opts); + null_count += !valid as usize; + valid + }); + + let mut values_offsets_vec = Vec::with_capacity(child_count); + let mut values_bytes = Vec::with_capacity(values_bytes); + for row in rows.iter_mut() { + let mut row_offset = 0; + loop { + let decoded = super::variable::decode_blocks(&row[row_offset..], opts, |x| { + values_bytes.extend_from_slice(x) + }); + row_offset += decoded; + if decoded <= 1 { + break; + } + values_offsets_vec.push(values_bytes.len()); + } + *row = &row[row_offset..]; + } + + if opts.descending { + values_bytes.iter_mut().for_each(|o| *o = !*o); + } + + let mut last_value_offset = 0; + let mut child_rows: Vec<_> = values_offsets_vec + .into_iter() + .map(|offset| { + let v = &values_bytes[last_value_offset..offset]; + last_value_offset = offset; + v + }) + .collect(); + + let child = unsafe { converter.convert_raw(&mut child_rows, validate_utf8) }?; + assert_eq!(child.len(), 1); + + let child_data = 
child[0].to_data(); + + // Technically ListViews don't have to have offsets follow each other precisely, but can be + // reused. However, because we cannot preserve that sharing within the row format, this is the + // best we can do. + let mut list_offsets: Vec = Vec::with_capacity(rows.len()); + let mut current_offset = O::usize_as(0); + for size in &list_sizes { + list_offsets.push(current_offset); + current_offset += *size; + } + + // Since RowConverter flattens certain data types (i.e. Dictionary), + // we need to use updated data type instead of original field + let corrected_inner_field = match &field.data_type { + DataType::ListView(inner_field) | DataType::LargeListView(inner_field) => Arc::new( + inner_field + .as_ref() + .clone() + .with_data_type(child_data.data_type().clone()), + ), + _ => unreachable!(), + }; + + let null_buffer = NullBuffer::new(BooleanBuffer::new(nulls.into(), 0, rows.len())); + + GenericListViewArray::try_new( + corrected_inner_field, + ScalarBuffer::from(list_offsets), + ScalarBuffer::from(list_sizes), + child[0].clone(), + Some(null_buffer).filter(|n| n.null_count() > 0), + ) +} From a9564b73479b200df71898570deb82049629df1a Mon Sep 17 00:00:00 2001 From: Frederic Branczyk Date: Thu, 15 Jan 2026 15:28:36 +0100 Subject: [PATCH 2/9] arrow-row: Refactor to deduplicate code for encoded list length --- arrow-row/src/list.rs | 43 ++++++++++++++++++++++++------------------- 1 file changed, 24 insertions(+), 19 deletions(-) diff --git a/arrow-row/src/list.rs b/arrow-row/src/list.rs index faefa7f0b1ea..e6c000b3e842 100644 --- a/arrow-row/src/list.rs +++ b/arrow-row/src/list.rs @@ -32,29 +32,17 @@ pub fn compute_lengths( rows: &Rows, array: &GenericListArray, ) { - let offsets = array.value_offsets().windows(2); - let mut rows_length_iter = rows.lengths(); + let shift = array.value_offsets()[0].as_usize(); lengths .iter_mut() - .zip(offsets) + .zip(array.value_offsets().windows(2)) .enumerate() .for_each(|(idx, (length, offsets))| { - let 
len = offsets[1].as_usize() - offsets[0].as_usize(); - if array.is_valid(idx) { - *length += 1 + rows_length_iter - .by_ref() - .take(len) - .map(Some) - .map(super::variable::padded_length) - .sum::() - } else { - // Advance rows iterator by len - if len > 0 { - rows_length_iter.nth(len - 1); - } - *length += 1; - } + let start = offsets[0].as_usize() - shift; + let end = offsets[1].as_usize() - shift; + let range = array.is_valid(idx).then_some(start..end); + *length += list_element_encoded_len(rows, range); }); } @@ -329,6 +317,23 @@ pub unsafe fn decode_fixed_size_list( })) } +/// Computes the encoded length for a single list element given its child rows. +/// +/// This is used by list types (List, LargeList, ListView, LargeListView) to determine +/// the encoded length of a list element. For null elements, returns 1 (null sentinel only). +/// For valid elements, returns 1 + the sum of padded lengths for each child row. +#[inline] +fn list_element_encoded_len(rows: &Rows, range: Option>) -> usize { + match range { + None => 1, + Some(range) => { + 1 + range + .map(|i| super::variable::padded_length(Some(rows.row(i).as_ref().len()))) + .sum::() + } + } +} + /// Computes the encoded lengths for a `GenericListViewArray` /// /// `rows` should contain the encoded child elements @@ -345,7 +350,7 @@ pub fn compute_lengths_list_view( let start = offsets[idx].as_usize() - shift; let size = sizes[idx].as_usize(); let range = array.is_valid(idx).then_some(start..start + size); - *length += encoded_len(rows, range); + *length += list_element_encoded_len(rows, range); }); } From d36eaa9cb89f0845cf8d1bae7f361ef6db2e92e6 Mon Sep 17 00:00:00 2001 From: Frederic Branczyk Date: Mon, 19 Jan 2026 10:26:49 +0100 Subject: [PATCH 3/9] arrow-row: Move bounds check function further down in the file --- arrow-row/src/lib.rs | 65 ++++++++++++++++++++++---------------------- 1 file changed, 32 insertions(+), 33 deletions(-) diff --git a/arrow-row/src/lib.rs b/arrow-row/src/lib.rs index 
4e8e87e6679e..c930a5d48603 100644 --- a/arrow-row/src/lib.rs +++ b/arrow-row/src/lib.rs @@ -179,39 +179,6 @@ use crate::variable::{decode_binary, decode_string}; use arrow_array::types::{Int16Type, Int32Type, Int64Type}; mod fixed; - -/// Computes the minimum offset and maximum end (offset + size) for a ListView array. -/// Returns (min_offset, max_end) which can be used to slice the values array. -fn compute_list_view_bounds(array: &GenericListViewArray) -> (usize, usize) { - if array.is_empty() { - return (0, 0); - } - - let offsets = array.value_offsets(); - let sizes = array.value_sizes(); - - let mut min_offset = usize::MAX; - let mut max_end = 0usize; - - for i in 0..array.len() { - let offset = offsets[i].as_usize(); - let size = sizes[i].as_usize(); - let end = offset + size; - - if size > 0 { - min_offset = min_offset.min(offset); - max_end = max_end.max(end); - } - } - - if min_offset == usize::MAX { - // All lists are empty - (0, 0) - } else { - (min_offset, max_end) - } -} - mod list; mod run; mod variable; @@ -539,6 +506,38 @@ enum Codec { Union(Vec, Vec), } +/// Computes the minimum offset and maximum end (offset + size) for a ListView array. +/// Returns (min_offset, max_end) which can be used to slice the values array. 
+fn compute_list_view_bounds(array: &GenericListViewArray) -> (usize, usize) { + if array.is_empty() { + return (0, 0); + } + + let offsets = array.value_offsets(); + let sizes = array.value_sizes(); + + let mut min_offset = usize::MAX; + let mut max_end = 0usize; + + for i in 0..array.len() { + let offset = offsets[i].as_usize(); + let size = sizes[i].as_usize(); + let end = offset + size; + + if size > 0 { + min_offset = min_offset.min(offset); + max_end = max_end.max(end); + } + } + + if min_offset == usize::MAX { + // All lists are empty + (0, 0) + } else { + (min_offset, max_end) + } +} + impl Codec { fn new(sort_field: &SortField) -> Result { match &sort_field.data_type { From 80c24deaedbf6a65f17447655848f1978d10a8b2 Mon Sep 17 00:00:00 2001 From: Frederic Branczyk Date: Tue, 20 Jan 2026 11:28:11 +0100 Subject: [PATCH 4/9] arrow-row: Use unchecked new since the null count is already retrieved --- arrow-row/src/list.rs | 5 ++++- 1 file changed, 4 insertions(+), 1 deletion(-) diff --git a/arrow-row/src/list.rs b/arrow-row/src/list.rs index e6c000b3e842..ffd754f986d2 100644 --- a/arrow-row/src/list.rs +++ b/arrow-row/src/list.rs @@ -483,7 +483,10 @@ pub unsafe fn decode_list_view( _ => unreachable!(), }; - let null_buffer = NullBuffer::new(BooleanBuffer::new(nulls.into(), 0, rows.len())); + // SAFETY: null_count was computed correctly when building the nulls buffer above + let null_buffer = unsafe { + NullBuffer::new_unchecked(BooleanBuffer::new(nulls.into(), 0, rows.len()), null_count) + }; GenericListViewArray::try_new( corrected_inner_field, From 144182659e4f44220c5751fe30ee21a35e58ee37 Mon Sep 17 00:00:00 2001 From: Frederic Branczyk Date: Tue, 20 Jan 2026 11:29:08 +0100 Subject: [PATCH 5/9] arrow-row: Return list bounds early if full range is already found --- arrow-row/src/lib.rs | 8 ++++++++ 1 file changed, 8 insertions(+) diff --git a/arrow-row/src/lib.rs b/arrow-row/src/lib.rs index c930a5d48603..d0b0622ad2b2 100644 --- a/arrow-row/src/lib.rs +++ 
b/arrow-row/src/lib.rs @@ -515,6 +515,7 @@ fn compute_list_view_bounds(array: &GenericListViewArray) let offsets = array.value_offsets(); let sizes = array.value_sizes(); + let values_len = array.values().len(); let mut min_offset = usize::MAX; let mut max_end = 0usize; @@ -528,6 +529,13 @@ fn compute_list_view_bounds(array: &GenericListViewArray) min_offset = min_offset.min(offset); max_end = max_end.max(end); } + + // Early exit if we've found the full range of the values array. This is possible with + // ListViews since offsets and sizes are arbitrary and the full range can be covered early + // in the iteration contrary to regular Lists. + if min_offset == 0 && max_end == values_len { + break; + } } if min_offset == usize::MAX { From 5bf5fc5c314d24ee0fc7bce639f7a7c1b37253a3 Mon Sep 17 00:00:00 2001 From: Frederic Branczyk Date: Tue, 20 Jan 2026 14:30:11 +0100 Subject: [PATCH 6/9] arrow-ord: Add ability to compare ListView --- arrow-ord/src/ord.rs | 38 ++++++++++++++++++++++++++++++++++++++ 1 file changed, 38 insertions(+) diff --git a/arrow-ord/src/ord.rs b/arrow-ord/src/ord.rs index c09fff807ac9..8c8edc1aeb72 100644 --- a/arrow-ord/src/ord.rs +++ b/arrow-ord/src/ord.rs @@ -233,6 +233,42 @@ fn compare_fixed_list( Ok(f) } +fn compare_list_view( + left: &dyn Array, + right: &dyn Array, + opts: SortOptions, +) -> Result { + let left = left.as_list_view::(); + let right = right.as_list_view::(); + + let c_opts = child_opts(opts); + let cmp = make_comparator(left.values().as_ref(), right.values().as_ref(), c_opts)?; + + let l_offsets = left.offsets().clone(); + let l_sizes = left.sizes().clone(); + let r_offsets = right.offsets().clone(); + let r_sizes = right.sizes().clone(); + + let f = compare(left, right, opts, move |i, j| { + let l_start = l_offsets[i].as_usize(); + let l_len = l_sizes[i].as_usize(); + let l_end = l_start + l_len; + + let r_start = r_offsets[j].as_usize(); + let r_len = r_sizes[j].as_usize(); + let r_end = r_start + r_len; + + for (i, j) in 
(l_start..l_end).zip(r_start..r_end) { + match cmp(i, j) { + Ordering::Equal => continue, + r => return r, + } + } + l_len.cmp(&r_len) + }); + Ok(f) +} + fn compare_map( left: &dyn Array, right: &dyn Array, @@ -470,6 +506,8 @@ pub fn make_comparator( }, (List(_), List(_)) => compare_list::(left, right, opts), (LargeList(_), LargeList(_)) => compare_list::(left, right, opts), + (ListView(_), ListView(_)) => compare_list_view::(left, right, opts), + (LargeListView(_), LargeListView(_)) => compare_list_view::(left, right, opts), (FixedSizeList(_, _), FixedSizeList(_, _)) => compare_fixed_list(left, right, opts), (Struct(_), Struct(_)) => compare_struct(left, right, opts), (Dictionary(l_key, _), Dictionary(r_key, _)) => { From b2d8491f9b41e1c26d18c4e12f1b7fc462ec6443 Mon Sep 17 00:00:00 2001 From: Frederic Branczyk Date: Tue, 20 Jan 2026 14:35:46 +0100 Subject: [PATCH 7/9] arrow-row: Add various cases with offset and size orders for ListView --- arrow-row/src/lib.rs | 105 +++++++++++++++++++++++++++++++++++++++++++ 1 file changed, 105 insertions(+) diff --git a/arrow-row/src/lib.rs b/arrow-row/src/lib.rs index d0b0622ad2b2..7cc612338363 100644 --- a/arrow-row/src/lib.rs +++ b/arrow-row/src/lib.rs @@ -3364,6 +3364,111 @@ mod tests { test_single_list_view::(); } + fn test_list_view_with_shared_values() { + // Create a values array: [1, 2, 3, 4, 5, 6, 7, 8] + let values = Int32Array::from(vec![1, 2, 3, 4, 5, 6, 7, 8]); + let field = Arc::new(Field::new_list_field(DataType::Int32, true)); + + // Create a ListView where: + // - Row 0: offset=0, size=3 -> [1, 2, 3] + // - Row 1: offset=0, size=3 -> [1, 2, 3] (same offset+size as row 0) + // - Row 2: offset=5, size=2 -> [6, 7] (non-monotonic offset) + // - Row 3: offset=2, size=2 -> [3, 4] (offset goes back) + // - Row 4: offset=1, size=4 -> [2, 3, 4, 5] (subset of values that contains row 3's range) + // - Row 5: offset=2, size=1 -> [3] (subset of row 3 and row 4) + let offsets = ScalarBuffer::::from(vec![ + 
O::from_usize(0).unwrap(), + O::from_usize(0).unwrap(), + O::from_usize(5).unwrap(), + O::from_usize(2).unwrap(), + O::from_usize(1).unwrap(), + O::from_usize(2).unwrap(), + ]); + let sizes = ScalarBuffer::::from(vec![ + O::from_usize(3).unwrap(), + O::from_usize(3).unwrap(), + O::from_usize(2).unwrap(), + O::from_usize(2).unwrap(), + O::from_usize(4).unwrap(), + O::from_usize(1).unwrap(), + ]); + + let list_view: GenericListViewArray = + GenericListViewArray::try_new(field, offsets, sizes, Arc::new(values), None).unwrap(); + + let d = list_view.data_type().clone(); + let list = Arc::new(list_view) as ArrayRef; + + let converter = RowConverter::new(vec![SortField::new(d.clone())]).unwrap(); + let rows = converter.convert_columns(&[Arc::clone(&list)]).unwrap(); + + // Row 0 and Row 1 have the same content [1, 2, 3], so they should be equal + assert_eq!(rows.row(0), rows.row(1)); + + // [1, 2, 3] < [6, 7] (comparing first elements: 1 < 6) + assert!(rows.row(0) < rows.row(2)); + + // [3, 4] > [1, 2, 3] (comparing first elements: 3 > 1) + assert!(rows.row(3) > rows.row(0)); + + // [2, 3, 4, 5] > [1, 2, 3] (comparing first elements: 2 > 1) + assert!(rows.row(4) > rows.row(0)); + + // [3] < [3, 4] (same prefix but shorter) + assert!(rows.row(5) < rows.row(3)); + + // [3] > [2, 3, 4, 5] (comparing first elements: 3 > 2) + assert!(rows.row(5) > rows.row(4)); + + // Round-trip conversion + let back = converter.convert_rows(&rows).unwrap(); + assert_eq!(back.len(), 1); + back[0].to_data().validate_full().unwrap(); + + // Verify logical content matches + let back_list_view = back[0] + .as_any() + .downcast_ref::>() + .unwrap(); + let orig_list_view = list + .as_any() + .downcast_ref::>() + .unwrap(); + + assert_eq!(back_list_view.len(), orig_list_view.len()); + for i in 0..back_list_view.len() { + assert_eq!(back_list_view.is_valid(i), orig_list_view.is_valid(i)); + if back_list_view.is_valid(i) { + assert_eq!(&back_list_view.value(i), &orig_list_view.value(i)); + } + } + + 
// Test with descending order + let options = SortOptions::default().desc(); + let field = SortField::new_with_options(d, options); + let converter = RowConverter::new(vec![field]).unwrap(); + let rows = converter.convert_columns(&[Arc::clone(&list)]).unwrap(); + + // In descending order, comparisons are reversed + assert_eq!(rows.row(0), rows.row(1)); // Equal rows stay equal + assert!(rows.row(0) > rows.row(2)); // [1, 2, 3] > [6, 7] in desc + assert!(rows.row(3) < rows.row(0)); // [3, 4] < [1, 2, 3] in desc + + let back = converter.convert_rows(&rows).unwrap(); + assert_eq!(back.len(), 1); + back[0].to_data().validate_full().unwrap(); + } + + #[test] + fn test_list_view_shared_values() { + test_list_view_with_shared_values::(); + } + + #[test] + fn test_large_list_view_shared_values() { + test_list_view_with_shared_values::(); + } + #[test] fn test_fixed_size_list() { let mut builder = FixedSizeListBuilder::new(Int32Builder::new(), 3); From 57a8d3ff47a9ae4439675cdbb8797d58afe8a227 Mon Sep 17 00:00:00 2001 From: Frederic Branczyk Date: Tue, 20 Jan 2026 14:36:56 +0100 Subject: [PATCH 8/9] arrow-row: Add ListView to fuzz tests and fix failure --- arrow-row/src/lib.rs | 50 ++++++++++++++++++++++++++++++++++++++++++- arrow-row/src/list.rs | 24 +++++++++++++++++---- 2 files changed, 69 insertions(+), 5 deletions(-) diff --git a/arrow-row/src/lib.rs b/arrow-row/src/lib.rs index 7cc612338363..4718bba5f841 100644 --- a/arrow-row/src/lib.rs +++ b/arrow-row/src/lib.rs @@ -3927,9 +3927,42 @@ mod tests { ListArray::new(field, offsets, values, Some(nulls)) } + fn generate_list_view(len: usize, valid_percent: f64, values: F) -> ListViewArray + where + F: FnOnce(usize) -> ArrayRef, + { + let mut rng = rng(); + // Generate sizes first, then create a values array large enough + let sizes: Vec = (0..len).map(|_| rng.random_range(0..10)).collect(); + let values_len: usize = sizes.iter().map(|s| *s as usize).sum::().max(1); + let values = values(values_len); + + // Generate offsets 
that can overlap, be non-monotonic, or share ranges + let offsets: Vec = sizes + .iter() + .map(|&size| { + if size == 0 { + 0 + } else { + rng.random_range(0..=(values_len as i32 - size)) + } + }) + .collect(); + + let nulls = NullBuffer::from_iter((0..len).map(|_| rng.random_bool(valid_percent))); + let field = Arc::new(Field::new_list_field(values.data_type().clone(), true)); + ListViewArray::new( + field, + ScalarBuffer::from(offsets), + ScalarBuffer::from(sizes), + values, + Some(nulls), + ) + } + fn generate_column(len: usize) -> ArrayRef { let mut rng = rng(); - match rng.random_range(0..18) { + match rng.random_range(0..22) { 0 => Arc::new(generate_primitive_array::(len, 0.8)), 1 => Arc::new(generate_primitive_array::(len, 0.8)), 2 => Arc::new(generate_primitive_array::(len, 0.8)), @@ -3972,6 +4005,21 @@ mod tests { }) .slice(500, len), ), + 18 => Arc::new(generate_list_view(len, 0.8, |values_len| { + Arc::new(generate_primitive_array::(values_len, 0.8)) + })), + 19 => Arc::new(generate_list_view(len, 0.8, |values_len| { + Arc::new(generate_strings::(values_len, 0.8)) + })), + 20 => Arc::new(generate_list_view(len, 0.8, |values_len| { + Arc::new(generate_struct(values_len, 0.8)) + })), + 21 => Arc::new( + generate_list_view(len + 1000, 0.8, |values_len| { + Arc::new(generate_primitive_array::(values_len, 0.8)) + }) + .slice(500, len), + ), _ => unreachable!(), } } diff --git a/arrow-row/src/list.rs b/arrow-row/src/list.rs index ffd754f986d2..843101673f05 100644 --- a/arrow-row/src/list.rs +++ b/arrow-row/src/list.rs @@ -347,9 +347,17 @@ pub fn compute_lengths_list_view( let sizes = array.value_sizes(); lengths.iter_mut().enumerate().for_each(|(idx, length)| { - let start = offsets[idx].as_usize() - shift; let size = sizes[idx].as_usize(); - let range = array.is_valid(idx).then_some(start..start + size); + let range = array.is_valid(idx).then(|| { + // For empty lists (size=0), offset may be arbitrary and could underflow when shifted. 
+ // Use 0 as start since the range is empty anyway. + let start = if size > 0 { + offsets[idx].as_usize() - shift + } else { + 0 + }; + start..start + size + }); *length += list_element_encoded_len(rows, range); }); } @@ -373,9 +381,17 @@ pub fn encode_list_view( .skip(1) .enumerate() .for_each(|(idx, offset)| { - let start = offsets[idx].as_usize() - shift; let size = sizes[idx].as_usize(); - let range = array.is_valid(idx).then_some(start..start + size); + let range = array.is_valid(idx).then(|| { + // For empty lists (size=0), offset may be arbitrary and could underflow when shifted. + // Use 0 as start since the range is empty anyway. + let start = if size > 0 { + offsets[idx].as_usize() - shift + } else { + 0 + }; + start..start + size + }); let out = &mut data[*offset..]; *offset += encode_one(out, rows, range, opts) }); From 3f9da4de1d5458af152548da64244584bd97b454 Mon Sep 17 00:00:00 2001 From: Frederic Branczyk Date: Tue, 20 Jan 2026 14:51:42 +0100 Subject: [PATCH 9/9] arrow-row: Add tests with nested ListView --- arrow-row/src/lib.rs | 155 +++++++++++++++++++++++++++++++++++++++++++ 1 file changed, 155 insertions(+) diff --git a/arrow-row/src/lib.rs b/arrow-row/src/lib.rs index 4718bba5f841..b949b150f019 100644 --- a/arrow-row/src/lib.rs +++ b/arrow-row/src/lib.rs @@ -3354,14 +3354,169 @@ mod tests { back[0].to_data().validate_full().unwrap(); } + fn test_nested_list_view() { + let mut builder = GenericListViewBuilder::::new(GenericListViewBuilder::::new( + Int32Builder::new(), + )); + + // Row 0: [[1, 2], [1, null]] + builder.values().values().append_value(1); + builder.values().values().append_value(2); + builder.values().append(true); + builder.values().values().append_value(1); + builder.values().values().append_null(); + builder.values().append(true); + builder.append(true); + + // Row 1: [[1, null], [1, null]] + builder.values().values().append_value(1); + builder.values().values().append_null(); + builder.values().append(true); + 
builder.values().values().append_value(1); + builder.values().values().append_null(); + builder.values().append(true); + builder.append(true); + + // Row 2: [[1, null], null] + builder.values().values().append_value(1); + builder.values().values().append_null(); + builder.values().append(true); + builder.values().append(false); + builder.append(true); + + // Row 3: null + builder.append(false); + + // Row 4: [[1, 2]] + builder.values().values().append_value(1); + builder.values().values().append_value(2); + builder.values().append(true); + builder.append(true); + + let list = Arc::new(builder.finish()) as ArrayRef; + let d = list.data_type().clone(); + + // [ + // [[1, 2], [1, null]], + // [[1, null], [1, null]], + // [[1, null], null], + // null, + // [[1, 2]] + // ] + let options = SortOptions::default().asc().with_nulls_first(true); + let field = SortField::new_with_options(d.clone(), options); + let converter = RowConverter::new(vec![field]).unwrap(); + let rows = converter.convert_columns(&[Arc::clone(&list)]).unwrap(); + + assert!(rows.row(0) > rows.row(1)); + assert!(rows.row(1) > rows.row(2)); + assert!(rows.row(2) > rows.row(3)); + assert!(rows.row(4) < rows.row(0)); + assert!(rows.row(4) > rows.row(1)); + + let back = converter.convert_rows(&rows).unwrap(); + assert_eq!(back.len(), 1); + back[0].to_data().validate_full().unwrap(); + + // Verify the content matches (ListView may have different physical layout but same logical content) + let back_list_view = back[0] + .as_any() + .downcast_ref::>() + .unwrap(); + let orig_list_view = list + .as_any() + .downcast_ref::>() + .unwrap(); + + assert_eq!(back_list_view.len(), orig_list_view.len()); + for i in 0..back_list_view.len() { + assert_eq!(back_list_view.is_valid(i), orig_list_view.is_valid(i)); + if back_list_view.is_valid(i) { + assert_eq!(&back_list_view.value(i), &orig_list_view.value(i)); + } + } + + let options = SortOptions::default().desc().with_nulls_first(true); + let field = 
SortField::new_with_options(d.clone(), options); + let converter = RowConverter::new(vec![field]).unwrap(); + let rows = converter.convert_columns(&[Arc::clone(&list)]).unwrap(); + + assert!(rows.row(0) > rows.row(1)); + assert!(rows.row(1) > rows.row(2)); + assert!(rows.row(2) > rows.row(3)); + assert!(rows.row(4) > rows.row(0)); + assert!(rows.row(4) > rows.row(1)); + + let back = converter.convert_rows(&rows).unwrap(); + assert_eq!(back.len(), 1); + back[0].to_data().validate_full().unwrap(); + + // Verify the content matches + let back_list_view = back[0] + .as_any() + .downcast_ref::>() + .unwrap(); + + assert_eq!(back_list_view.len(), orig_list_view.len()); + for i in 0..back_list_view.len() { + assert_eq!(back_list_view.is_valid(i), orig_list_view.is_valid(i)); + if back_list_view.is_valid(i) { + assert_eq!(&back_list_view.value(i), &orig_list_view.value(i)); + } + } + + let options = SortOptions::default().desc().with_nulls_first(false); + let field = SortField::new_with_options(d.clone(), options); + let converter = RowConverter::new(vec![field]).unwrap(); + let rows = converter.convert_columns(&[Arc::clone(&list)]).unwrap(); + + assert!(rows.row(0) < rows.row(1)); + assert!(rows.row(1) < rows.row(2)); + assert!(rows.row(2) < rows.row(3)); + assert!(rows.row(4) > rows.row(0)); + assert!(rows.row(4) < rows.row(1)); + + let back = converter.convert_rows(&rows).unwrap(); + assert_eq!(back.len(), 1); + back[0].to_data().validate_full().unwrap(); + + // Verify the content matches + let back_list_view = back[0] + .as_any() + .downcast_ref::>() + .unwrap(); + + assert_eq!(back_list_view.len(), orig_list_view.len()); + for i in 0..back_list_view.len() { + assert_eq!(back_list_view.is_valid(i), orig_list_view.is_valid(i)); + if back_list_view.is_valid(i) { + assert_eq!(&back_list_view.value(i), &orig_list_view.value(i)); + } + } + + let sliced_list = list.slice(1, 3); + let rows = converter + .convert_columns(&[Arc::clone(&sliced_list)]) + .unwrap(); + + 
assert!(rows.row(0) < rows.row(1)); + assert!(rows.row(1) < rows.row(2)); + + let back = converter.convert_rows(&rows).unwrap(); + assert_eq!(back.len(), 1); + back[0].to_data().validate_full().unwrap(); + } + #[test] fn test_list_view() { test_single_list_view::(); + test_nested_list_view::(); } #[test] fn test_large_list_view() { test_single_list_view::(); + test_nested_list_view::(); } fn test_list_view_with_shared_values() {