extern crate criterion;
use arrow::array::{ArrayRef, StringArray, StringViewBuilder};
use arrow::datatypes::{DataType, Field};
use arrow::util::bench_util::{
create_string_array_with_len, create_string_view_array_with_len,
};
use criterion::{black_box, criterion_group, criterion_main, Criterion};
use datafusion_expr::{ColumnarValue, ScalarFunctionArgs};
use datafusion_functions::string;
use std::sync::Arc;
fn create_args1(size: usize, str_len: usize) -> Vec<ColumnarValue> {
let array = Arc::new(create_string_array_with_len::<i32>(size, 0.2, str_len));
vec![ColumnarValue::Array(array)]
}
/// Builds a one-column argument list whose first value is non-ASCII
/// (`"农历新年"`) and whose remaining values are `"DATAFUSION {i}"` for
/// `i` in `1..size`.
///
/// The non-ASCII value is always emitted, so the column holds at least one row
/// even when `size` is `0`.
fn create_args2(size: usize) -> Vec<ColumnarValue> {
    let ascii_tail = (1..size).map(|i| format!("DATAFUSION {i}"));
    let values: Vec<String> = std::iter::once("农历新年".to_string())
        .chain(ascii_tail)
        .collect();
    let column: ArrayRef = Arc::new(StringArray::from(values));
    vec![ColumnarValue::Array(column)]
}
/// Builds a one-column argument list of `"DATAFUSION {i}"` strings in which
/// the value at index `size / 2` is replaced by the non-ASCII string `"Ⱦ"`.
///
/// The non-ASCII value is always emitted, so the column holds at least one row
/// even when `size` is `0`.
fn create_args3(size: usize) -> Vec<ColumnarValue> {
    let midpoint = size / 2;
    let ascii = |i: usize| format!("DATAFUSION {i}");

    // Same three phases as a manual push loop: ASCII head, the single
    // non-ASCII value, then the ASCII tail (which skips index `midpoint`).
    let values: Vec<String> = (0..midpoint)
        .map(ascii)
        .chain(std::iter::once("Ⱦ".to_string()))
        .chain((midpoint + 1..size).map(ascii))
        .collect();

    let column: ArrayRef = Arc::new(StringArray::from(values));
    vec![ColumnarValue::Array(column)]
}
fn create_args4(
size: usize,
str_len: usize,
null_density: f32,
mixed: bool,
) -> Vec<ColumnarValue> {
let array = Arc::new(create_string_view_array_with_len(
size,
null_density,
str_len,
mixed,
));
vec![ColumnarValue::Array(array)]
}
/// Builds a one-column string-view argument list of `size` rows with a random
/// mix of nulls, a fixed non-ASCII string, and a fixed ASCII string.
///
/// For each row a random `f32` is compared against `null_density`; if it is
/// lower the row is null. Otherwise a second random `f32` is compared against
/// `non_ascii_density` to choose between the non-ASCII and the ASCII value.
fn create_args5(
    size: usize,
    non_ascii_density: f32,
    null_density: f32,
) -> Vec<ColumnarValue> {
    let mut builder = StringViewBuilder::with_capacity(size);
    for _ in 0..size {
        // The second random draw only happens for non-null rows.
        if rand::random::<f32>() < null_density {
            builder.append_null();
        } else if rand::random::<f32>() < non_ascii_density {
            builder.append_value("农历新年农历新年农历新年农历新年农历新年");
        } else {
            builder.append_value("DATAFUSIONDATAFUSIONDATAFUSION");
        }
    }
    let column: ArrayRef = Arc::new(builder.finish());
    vec![ColumnarValue::Array(column)]
}
fn criterion_benchmark(c: &mut Criterion) {
let lower = string::lower();
for size in [1024, 4096, 8192] {
let args = create_args1(size, 32);
let arg_fields = args
.iter()
.enumerate()
.map(|(idx, arg)| {
Field::new(format!("arg_{idx}"), arg.data_type(), true).into()
})
.collect::<Vec<_>>();
c.bench_function(&format!("lower_all_values_are_ascii: {size}"), |b| {
b.iter(|| {
let args_cloned = args.clone();
black_box(lower.invoke_with_args(ScalarFunctionArgs {
args: args_cloned,
arg_fields: arg_fields.clone(),
number_rows: size,
return_field: Field::new("f", DataType::Utf8, true).into(),
}))
})
});
let args = create_args2(size);
let arg_fields = args
.iter()
.enumerate()
.map(|(idx, arg)| {
Field::new(format!("arg_{idx}"), arg.data_type(), true).into()
})
.collect::<Vec<_>>();
c.bench_function(&format!("lower_the_first_value_is_nonascii: {size}"), |b| {
b.iter(|| {
let args_cloned = args.clone();
black_box(lower.invoke_with_args(ScalarFunctionArgs {
args: args_cloned,
arg_fields: arg_fields.clone(),
number_rows: size,
return_field: Field::new("f", DataType::Utf8, true).into(),
}))
})
});
let args = create_args3(size);
let arg_fields = args
.iter()
.enumerate()
.map(|(idx, arg)| {
Field::new(format!("arg_{idx}"), arg.data_type(), true).into()
})
.collect::<Vec<_>>();
c.bench_function(
&format!("lower_the_middle_value_is_nonascii: {size}"),
|b| {
b.iter(|| {
let args_cloned = args.clone();
black_box(lower.invoke_with_args(ScalarFunctionArgs {
args: args_cloned,
arg_fields: arg_fields.clone(),
number_rows: size,
return_field: Field::new("f", DataType::Utf8, true).into(),
}))
})
},
);
}
let sizes = [4096, 8192];
let str_lens = [10, 64, 128];
let mixes = [true, false];
let null_densities = [0.0f32, 0.1f32];
for null_density in &null_densities {
for &mixed in &mixes {
for &str_len in &str_lens {
for &size in &sizes {
let args = create_args4(size, str_len, *null_density, mixed);
let arg_fields = args
.iter()
.enumerate()
.map(|(idx, arg)| {
Field::new(format!("arg_{idx}"), arg.data_type(), true).into()
})
.collect::<Vec<_>>();
c.bench_function(
&format!("lower_all_values_are_ascii_string_views: size: {size}, str_len: {str_len}, null_density: {null_density}, mixed: {mixed}"),
|b| b.iter(|| {
let args_cloned = args.clone();
black_box(lower.invoke_with_args(ScalarFunctionArgs{
args: args_cloned,
arg_fields: arg_fields.clone(),
number_rows: size,
return_field: Field::new("f", DataType::Utf8, true).into(),
}))
}),
);
let args = create_args4(size, str_len, *null_density, mixed);
c.bench_function(
&format!("lower_all_values_are_ascii_string_views: size: {size}, str_len: {str_len}, null_density: {null_density}, mixed: {mixed}"),
|b| b.iter(|| {
let args_cloned = args.clone();
black_box(lower.invoke_with_args(ScalarFunctionArgs{
args: args_cloned,
arg_fields: arg_fields.clone(),
number_rows: size,
return_field: Field::new("f", DataType::Utf8, true).into(),
}))
}),
);
let args = create_args5(size, 0.1, *null_density);
c.bench_function(
&format!("lower_some_values_are_nonascii_string_views: size: {}, str_len: {}, non_ascii_density: {}, null_density: {}, mixed: {}",
size, str_len, 0.1, null_density, mixed),
|b| b.iter(|| {
let args_cloned = args.clone();
black_box(lower.invoke_with_args(ScalarFunctionArgs{
args: args_cloned,
arg_fields: arg_fields.clone(),
number_rows: size,
return_field: Field::new("f", DataType::Utf8, true).into(),
}))
}),
);
}
}
}
}
}
// Declare the `benches` group containing `criterion_benchmark` and hand it to
// Criterion's entry-point macro.
criterion_group!(benches, criterion_benchmark);
criterion_main!(benches);