extern crate criterion;
use arrow::array::{ArrayRef, Int64Array, OffsetSizeTrait};
use arrow::datatypes::{DataType, Field};
use arrow::util::bench_util::{
create_string_array_with_len, create_string_view_array_with_len,
};
use criterion::{black_box, criterion_group, criterion_main, Criterion, SamplingMode};
use datafusion_common::DataFusionError;
use datafusion_expr::{ColumnarValue, ScalarFunctionArgs};
use datafusion_functions::unicode;
use std::sync::Arc;
/// Builds the two-argument form of the `substr` inputs: `(string, start)`.
///
/// * `size` - number of rows in every generated array.
/// * `str_len` - length passed to the arrow string generators.
/// * `start_half_way` - when `true` every row's start position is
///   `str_len / 2`; otherwise every row starts at `1`.
/// * `force_view_types` - when `true` the string column is produced by
///   `create_string_view_array_with_len`; otherwise by
///   `create_string_array_with_len::<O>`, with `O` selecting the offset width.
///
/// Returns `[string_array, start_array]` wrapped as `ColumnarValue::Array`.
fn create_args_without_count<O: OffsetSizeTrait>(
    size: usize,
    str_len: usize,
    start_half_way: bool,
    force_view_types: bool,
) -> Vec<ColumnarValue> {
    // The start position is the same for every row, so compute it once.
    let start = if start_half_way {
        (str_len / 2) as i64
    } else {
        1_i64
    };
    let start_array: ArrayRef = Arc::new(Int64Array::from(vec![start; size]));

    // `0.1` is forwarded as the generators' second argument (presumably the
    // null density; see `arrow::util::bench_util`).
    let string_array: ArrayRef = if force_view_types {
        Arc::new(create_string_view_array_with_len(size, 0.1, str_len, false))
    } else {
        Arc::new(create_string_array_with_len::<O>(size, 0.1, str_len))
    };

    vec![
        ColumnarValue::Array(string_array),
        ColumnarValue::Array(start_array),
    ]
}
/// Builds the three-argument form of the `substr` inputs:
/// `(string, start, count)`.
///
/// * `size` - number of rows in every generated array.
/// * `str_len` - length passed to the arrow string generators.
/// * `count_max` - requested substring length; it is clamped to `str_len`.
/// * `force_view_types` - when `true` the string column is produced by
///   `create_string_view_array_with_len`; otherwise by
///   `create_string_array_with_len::<O>`, with `O` selecting the offset width.
///
/// Every row uses start position `1` and the same clamped count. Returns
/// `[string_array, start_array, count_array]` wrapped as
/// `ColumnarValue::Array`.
fn create_args_with_count<O: OffsetSizeTrait>(
    size: usize,
    str_len: usize,
    count_max: usize,
    force_view_types: bool,
) -> Vec<ColumnarValue> {
    // Never ask for more characters than the generated strings contain.
    let count = count_max.min(str_len) as i64;
    let start_array: ArrayRef = Arc::new(Int64Array::from(vec![1_i64; size]));
    let count_array: ArrayRef = Arc::new(Int64Array::from(vec![count; size]));

    // `0.1` is forwarded as the generators' second argument (presumably the
    // null density; see `arrow::util::bench_util`).
    let string_array: ArrayRef = if force_view_types {
        Arc::new(create_string_view_array_with_len(size, 0.1, str_len, false))
    } else {
        Arc::new(create_string_array_with_len::<O>(size, 0.1, str_len))
    };

    vec![
        ColumnarValue::Array(string_array),
        ColumnarValue::Array(start_array),
        ColumnarValue::Array(count_array),
    ]
}
/// Invokes `unicode::substr()` once on `args`.
///
/// One nullable `Field` named `arg_{idx}` is derived from each argument's
/// data type and passed as `arg_fields`; `number_rows` is forwarded as-is.
/// The declared return field is always a nullable `Utf8View` named `"f"`.
/// NOTE(review): confirm `Utf8View` matches what `substr` actually returns for
/// `Utf8` / `LargeUtf8` inputs in the DataFusion version in use.
///
/// Returns whatever `invoke_with_args` returns, including its error.
fn invoke_substr_with_args(
    args: Vec<ColumnarValue>,
    number_rows: usize,
) -> Result<ColumnarValue, DataFusionError> {
    let arg_fields = args
        .iter()
        .enumerate()
        .map(|(idx, arg)| Field::new(format!("arg_{idx}"), arg.data_type(), true).into())
        .collect::<Vec<_>>();

    unicode::substr().invoke_with_args(ScalarFunctionArgs {
        // `args` is owned and no longer borrowed here, so move it instead of
        // cloning: this function runs inside the timed benchmark loop.
        args,
        arg_fields,
        number_rows,
        return_field: Field::new("f", DataType::Utf8View, true).into(),
    })
}
fn criterion_benchmark(c: &mut Criterion) {
for size in [1024, 4096] {
let len = 12;
let mut group = c.benchmark_group("SHORTER THAN 12");
group.sampling_mode(SamplingMode::Flat);
group.sample_size(10);
let args = create_args_without_count::<i32>(size, len, true, true);
group.bench_function(
format!("substr_string_view [size={size}, strlen={len}]"),
|b| b.iter(|| black_box(invoke_substr_with_args(args.clone(), size))),
);
let args = create_args_without_count::<i32>(size, len, false, false);
group.bench_function(format!("substr_string [size={size}, strlen={len}]"), |b| {
b.iter(|| black_box(invoke_substr_with_args(args.clone(), size)))
});
let args = create_args_without_count::<i64>(size, len, true, false);
group.bench_function(
format!("substr_large_string [size={size}, strlen={len}]"),
|b| b.iter(|| black_box(invoke_substr_with_args(args.clone(), size))),
);
group.finish();
let len = 128;
let count = 64;
let mut group = c.benchmark_group("LONGER THAN 12");
group.sampling_mode(SamplingMode::Flat);
group.sample_size(10);
let args = create_args_with_count::<i32>(size, len, count, true);
group.bench_function(
format!("substr_string_view [size={size}, count={count}, strlen={len}]",),
|b| b.iter(|| black_box(invoke_substr_with_args(args.clone(), size))),
);
let args = create_args_with_count::<i32>(size, len, count, false);
group.bench_function(
format!("substr_string [size={size}, count={count}, strlen={len}]",),
|b| b.iter(|| black_box(invoke_substr_with_args(args.clone(), size))),
);
let args = create_args_with_count::<i64>(size, len, count, false);
group.bench_function(
format!("substr_large_string [size={size}, count={count}, strlen={len}]",),
|b| b.iter(|| black_box(invoke_substr_with_args(args.clone(), size))),
);
group.finish();
let len = 128;
let count = 6;
let mut group = c.benchmark_group("SRC_LEN > 12, SUB_LEN < 12");
group.sampling_mode(SamplingMode::Flat);
group.sample_size(10);
let args = create_args_with_count::<i32>(size, len, count, true);
group.bench_function(
format!("substr_string_view [size={size}, count={count}, strlen={len}]",),
|b| b.iter(|| black_box(invoke_substr_with_args(args.clone(), size))),
);
let args = create_args_with_count::<i32>(size, len, count, false);
group.bench_function(
format!("substr_string [size={size}, count={count}, strlen={len}]",),
|b| b.iter(|| black_box(invoke_substr_with_args(args.clone(), size))),
);
let args = create_args_with_count::<i64>(size, len, count, false);
group.bench_function(
format!("substr_large_string [size={size}, count={count}, strlen={len}]",),
|b| b.iter(|| black_box(invoke_substr_with_args(args.clone(), size))),
);
group.finish();
}
}
// Register `criterion_benchmark` under the `benches` group and let Criterion
// generate the benchmark binary's entry point for that group.
criterion_group!(benches, criterion_benchmark);
criterion_main!(benches);