extern crate criterion;
use arrow::array::{StringArray, StringViewArray};
use arrow::datatypes::{DataType, Field};
use criterion::{black_box, criterion_group, criterion_main, Criterion};
use datafusion_expr::{ColumnarValue, ScalarFunctionArgs};
use rand::distr::Alphanumeric;
use rand::prelude::StdRng;
use rand::{Rng, SeedableRng};
use std::str::Chars;
use std::sync::Arc;
/// Builds the two argument columns for the `strpos` benchmark: a column of
/// haystack strings and a column of substrings taken from those haystacks.
///
/// For every row a single `f32` is drawn from the RNG and compared against the
/// densities:
///
/// * below `null_density`: both columns get a null;
/// * below `null_density + utf8_density`: the haystack is `str_len_chars`
///   characters picked from a corpus mixing 1- to 4-byte UTF-8 characters;
/// * otherwise: the haystack is `str_len_chars` ASCII alphanumeric characters.
///
/// The needle of a non-null row is `random_substring` of its haystack.
///
/// Returns `[haystacks, needles]`, both as `StringViewArray` when
/// `is_string_view` is true and as `StringArray` otherwise. The RNG is seeded
/// with a fixed value, so identical parameters yield identical data.
fn gen_string_array(
    n_rows: usize,
    str_len_chars: usize,
    null_density: f32,
    utf8_density: f32,
    is_string_view: bool,
) -> Vec<ColumnarValue> {
    let mut rng = StdRng::seed_from_u64(42);
    let rng_ref = &mut rng;

    let utf8 = "DatafusionДатаФусион数据融合📊🔥";
    // Decode the corpus once so that picking a character is an O(1) index
    // instead of re-walking the UTF-8 string for every generated character.
    let corpus: Vec<char> = utf8.chars().collect();

    let mut output_string_vec: Vec<Option<String>> = Vec::with_capacity(n_rows);
    let mut output_sub_string_vec: Vec<Option<String>> = Vec::with_capacity(n_rows);
    for _ in 0..n_rows {
        let rand_num = rng_ref.random::<f32>();
        if rand_num < null_density {
            output_sub_string_vec.push(None);
            output_string_vec.push(None);
        } else if rand_num < null_density + utf8_density {
            let mut generated_string = String::with_capacity(str_len_chars);
            for _ in 0..str_len_chars {
                let idx = rng_ref.random_range(0..corpus.len());
                generated_string.push(corpus[idx]);
            }
            output_sub_string_vec.push(Some(random_substring(generated_string.chars())));
            output_string_vec.push(Some(generated_string));
        } else {
            let value = rng_ref
                .sample_iter(&Alphanumeric)
                .take(str_len_chars)
                .collect();
            let value = String::from_utf8(value)
                .expect("alphanumeric samples are always valid UTF-8");
            output_sub_string_vec.push(Some(random_substring(value.chars())));
            output_string_vec.push(Some(value));
        }
    }

    if is_string_view {
        let string_view_array: StringViewArray = output_string_vec.into_iter().collect();
        let sub_string_view_array: StringViewArray =
            output_sub_string_vec.into_iter().collect();
        vec![
            ColumnarValue::Array(Arc::new(string_view_array)),
            ColumnarValue::Array(Arc::new(sub_string_view_array)),
        ]
    } else {
        // The vectors are not used afterwards, so they are consumed directly
        // rather than cloned first.
        let string_array: StringArray = output_string_vec.into_iter().collect();
        let sub_string_array: StringArray = output_sub_string_vec.into_iter().collect();
        vec![
            ColumnarValue::Array(Arc::new(string_array)),
            ColumnarValue::Array(Arc::new(sub_string_array)),
        ]
    }
}
/// Returns a non-empty, contiguous run of characters taken from `chars`.
///
/// The run covers character positions `start..end`, where `start` is drawn
/// from `0..count - 1` and `end` from `start + 1..count`; because `end` is an
/// exclusive bound below `count`, the final character is never included.
///
/// Inputs with fewer than two characters have no such run and are returned
/// unchanged instead of panicking on an empty sampling range.
///
/// NOTE: a fresh RNG with a fixed seed is created on every call, so all inputs
/// with the same character count get the same `start`/`end` positions.
fn random_substring(chars: Chars) -> String {
    let count = chars.clone().count();
    if count < 2 {
        // `0..count - 1` would underflow (count == 0) or be empty (count == 1).
        return chars.collect();
    }

    let mut rng = StdRng::seed_from_u64(44);
    let start = rng.random_range(0..count - 1);
    let end = rng.random_range(start + 1..count);

    // Same characters as filtering on `start <= i < end`, but stops at `end`
    // instead of walking the rest of the string.
    chars.skip(start).take(end - start).collect()
}
/// Benchmarks `strpos` over 8192-row inputs for every combination of string
/// length (8, 32, 128, 4096 characters), array type (`StringArray`,
/// `StringViewArray`) and content (pure ASCII, or half the rows drawn from a
/// multi-byte UTF-8 corpus). Ten percent of the rows are null in every case.
///
/// Benchmark ids follow `strpos_<ArrayType>_<ascii|utf8>_str_len_<len>`.
fn criterion_benchmark(c: &mut Criterion) {
    // (benchmark label, utf8_density, is_string_view), in registration order.
    const VARIANTS: [(&str, f32, bool); 4] = [
        ("StringArray_ascii", 0.0, false),
        ("StringArray_utf8", 0.5, false),
        ("StringViewArray_ascii", 0.0, true),
        ("StringViewArray_utf8", 0.5, true),
    ];

    let strpos = datafusion_functions::unicode::strpos();
    let n_rows = 8192;

    for str_len in [8, 32, 128, 4096] {
        for (label, utf8_density, is_string_view) in VARIANTS {
            let args = gen_string_array(n_rows, str_len, 0.1, utf8_density, is_string_view);

            // One field per argument (haystack and needle), so the metadata
            // lines up with `args`.
            let arg_fields: Vec<_> = args
                .iter()
                .enumerate()
                .map(|(i, arg)| Arc::new(Field::new(format!("arg_{i}"), arg.data_type(), true)))
                .collect();
            let return_field = Arc::new(Field::new("f", DataType::Int32, true));

            c.bench_function(&format!("strpos_{label}_str_len_{str_len}"), |b| {
                b.iter(|| {
                    // `invoke_with_args` takes its arguments by value, so the
                    // (cheap, `Arc`-backed) inputs are cloned per iteration.
                    black_box(strpos.invoke_with_args(ScalarFunctionArgs {
                        args: args.clone(),
                        arg_fields: arg_fields.clone(),
                        number_rows: n_rows,
                        return_field: Arc::clone(&return_field),
                    }))
                })
            });
        }
    }
}
// Register `criterion_benchmark` in the `benches` group and generate the
// benchmark binary's entry point from that group.
criterion_group!(benches, criterion_benchmark);
criterion_main!(benches);