extern crate criterion;
use arrow::array::builder::StringBuilder;
use arrow::array::{ArrayRef, AsArray, Int64Array, StringArray, StringViewArray};
use arrow::compute::cast;
use arrow::datatypes::DataType;
use criterion::{black_box, criterion_group, criterion_main, Criterion};
use datafusion_functions::regex::regexpcount::regexp_count_func;
use datafusion_functions::regex::regexplike::regexp_like;
use datafusion_functions::regex::regexpmatch::regexp_match;
use datafusion_functions::regex::regexpreplace::regexp_replace;
use rand::distr::Alphanumeric;
use rand::prelude::IndexedRandom;
use rand::rngs::ThreadRng;
use rand::Rng;
use std::iter;
use std::sync::Arc;
/// Builds the haystack column for the benchmarks: a `StringArray` holding
/// 1000 values, each consisting of 7 characters sampled from `Alphanumeric`
/// using the supplied `rng`.
fn data(rng: &mut ThreadRng) -> StringArray {
    let values: Vec<String> = (0..1000)
        .map(|_| {
            // One fresh 7-character string per row.
            rng.sample_iter(&Alphanumeric)
                .take(7)
                .map(char::from)
                .collect::<String>()
        })
        .collect();
    StringArray::from(values)
}
fn regex(rng: &mut ThreadRng) -> StringArray {
let samples = [
".*([A-Z]{1}).*".to_string(),
"^(A).*".to_string(),
r#"[\p{Letter}-]+"#.to_string(),
r#"[\p{L}-]+"#.to_string(),
"[a-zA-Z]_[a-zA-Z]{2}".to_string(),
];
let mut data: Vec<String> = vec![];
for _ in 0..1000 {
data.push(samples.choose(rng).unwrap().to_string());
}
StringArray::from(data)
}
/// Builds the `start` argument column: an `Int64Array` of 1000 values, each
/// drawn from the half-open range `1..5` using `rng`.
fn start(rng: &mut ThreadRng) -> Int64Array {
    let positions: Vec<i64> = (0..1000)
        .map(|_| -> i64 { rng.random_range(1..5) })
        .collect();
    Int64Array::from(positions)
}
fn flags(rng: &mut ThreadRng) -> StringArray {
let samples = [Some("i".to_string()), Some("im".to_string()), None];
let mut sb = StringBuilder::new();
for _ in 0..1000 {
let sample = samples.choose(rng).unwrap();
if sample.is_some() {
sb.append_value(sample.clone().unwrap());
} else {
sb.append_null();
}
}
sb.finish()
}
fn criterion_benchmark(c: &mut Criterion) {
c.bench_function("regexp_count_1000 string", |b| {
let mut rng = rand::rng();
let data = Arc::new(data(&mut rng)) as ArrayRef;
let regex = Arc::new(regex(&mut rng)) as ArrayRef;
let start = Arc::new(start(&mut rng)) as ArrayRef;
let flags = Arc::new(flags(&mut rng)) as ArrayRef;
b.iter(|| {
black_box(
regexp_count_func(&[
Arc::clone(&data),
Arc::clone(®ex),
Arc::clone(&start),
Arc::clone(&flags),
])
.expect("regexp_count should work on utf8"),
)
})
});
c.bench_function("regexp_count_1000 utf8view", |b| {
let mut rng = rand::rng();
let data = cast(&data(&mut rng), &DataType::Utf8View).unwrap();
let regex = cast(®ex(&mut rng), &DataType::Utf8View).unwrap();
let start = Arc::new(start(&mut rng)) as ArrayRef;
let flags = cast(&flags(&mut rng), &DataType::Utf8View).unwrap();
b.iter(|| {
black_box(
regexp_count_func(&[
Arc::clone(&data),
Arc::clone(®ex),
Arc::clone(&start),
Arc::clone(&flags),
])
.expect("regexp_count should work on utf8view"),
)
})
});
c.bench_function("regexp_like_1000", |b| {
let mut rng = rand::rng();
let data = Arc::new(data(&mut rng)) as ArrayRef;
let regex = Arc::new(regex(&mut rng)) as ArrayRef;
let flags = Arc::new(flags(&mut rng)) as ArrayRef;
b.iter(|| {
black_box(
regexp_like(&[Arc::clone(&data), Arc::clone(®ex), Arc::clone(&flags)])
.expect("regexp_like should work on valid values"),
)
})
});
c.bench_function("regexp_like_1000 utf8view", |b| {
let mut rng = rand::rng();
let data = cast(&data(&mut rng), &DataType::Utf8View).unwrap();
let regex = cast(®ex(&mut rng), &DataType::Utf8View).unwrap();
let flags = cast(&flags(&mut rng), &DataType::Utf8View).unwrap();
b.iter(|| {
black_box(
regexp_like(&[Arc::clone(&data), Arc::clone(®ex), Arc::clone(&flags)])
.expect("regexp_like should work on valid values"),
)
})
});
c.bench_function("regexp_match_1000", |b| {
let mut rng = rand::rng();
let data = Arc::new(data(&mut rng)) as ArrayRef;
let regex = Arc::new(regex(&mut rng)) as ArrayRef;
let flags = Arc::new(flags(&mut rng)) as ArrayRef;
b.iter(|| {
black_box(
regexp_match(&[
Arc::clone(&data),
Arc::clone(®ex),
Arc::clone(&flags),
])
.expect("regexp_match should work on valid values"),
)
})
});
c.bench_function("regexp_match_1000 utf8view", |b| {
let mut rng = rand::rng();
let data = cast(&data(&mut rng), &DataType::Utf8View).unwrap();
let regex = cast(®ex(&mut rng), &DataType::Utf8View).unwrap();
let flags = cast(&flags(&mut rng), &DataType::Utf8View).unwrap();
b.iter(|| {
black_box(
regexp_match(&[
Arc::clone(&data),
Arc::clone(®ex),
Arc::clone(&flags),
])
.expect("regexp_match should work on valid values"),
)
})
});
c.bench_function("regexp_replace_1000", |b| {
let mut rng = rand::rng();
let data = Arc::new(data(&mut rng)) as ArrayRef;
let regex = Arc::new(regex(&mut rng)) as ArrayRef;
let flags = Arc::new(flags(&mut rng)) as ArrayRef;
let replacement =
Arc::new(StringArray::from_iter_values(iter::repeat_n("XX", 1000)))
as ArrayRef;
b.iter(|| {
black_box(
regexp_replace::<i32, _, _>(
data.as_string::<i32>(),
regex.as_string::<i32>(),
replacement.as_string::<i32>(),
Some(&flags),
)
.expect("regexp_replace should work on valid values"),
)
})
});
c.bench_function("regexp_replace_1000 utf8view", |b| {
let mut rng = rand::rng();
let data = cast(&data(&mut rng), &DataType::Utf8View).unwrap();
let regex = cast(®ex(&mut rng), &DataType::Utf8View).unwrap();
let flags = Arc::new(flags(&mut rng)) as ArrayRef;
let replacement = Arc::new(StringViewArray::from_iter_values(iter::repeat_n(
"XX", 1000,
)));
b.iter(|| {
black_box(
regexp_replace::<i32, _, _>(
data.as_string_view(),
regex.as_string_view(),
&replacement,
Some(&flags),
)
.expect("regexp_replace should work on valid values"),
)
})
});
}
// Declare a benchmark group named `benches` containing `criterion_benchmark`,
// and pass that group to `criterion_main!` as this benchmark's entry point.
criterion_group!(benches, criterion_benchmark);
criterion_main!(benches);