use arrow::error::ArrowError;
use regex::Regex;
use std::collections::hash_map::Entry;
use std::collections::HashMap;
use std::sync::Arc;
// Submodules of this regex package. Each one is referenced below through a
// `<module>::<Type>` path, so each is expected to define the correspondingly
// named UDF implementation type.
pub mod regexpcount;
pub mod regexpinstr;
pub mod regexplike;
pub mod regexpmatch;
pub mod regexpreplace;
// Each invocation pairs a UDF implementation type with the name of a
// zero-argument accessor (`regexp_count()`, `regexp_instr()`, ...) that the
// rest of this file calls, e.g. in `functions()` and in `expr_fn`.
// NOTE(review): `make_udf_function!` is defined outside this file; presumably
// it generates those accessor functions - confirm against the macro source.
make_udf_function!(regexpcount::RegexpCountFunc, regexp_count);
make_udf_function!(regexpinstr::RegexpInstrFunc, regexp_instr);
make_udf_function!(regexpmatch::RegexpMatchFunc, regexp_match);
make_udf_function!(regexplike::RegexpLikeFunc, regexp_like);
make_udf_function!(regexpreplace::RegexpReplaceFunc, regexp_replace);
/// Helper constructors that build an [`Expr`](datafusion_expr::Expr) invoking
/// one of this module's regex UDFs.
///
/// Every helper assembles its argument list positionally: the required
/// arguments come first, then each optional argument is appended only when it
/// is `Some`, in the order the parameters are declared. Consequently, passing
/// `None` for an earlier optional argument and `Some` for a later one shifts
/// the later value into the earlier position of the argument list.
pub mod expr_fn {
    use datafusion_expr::Expr;

    /// Builds a call to the `regexp_count` UDF.
    ///
    /// The argument list is `values`, `regex`, followed by `start` and then
    /// `flags` when they are provided.
    pub fn regexp_count(
        values: Expr,
        regex: Expr,
        start: Option<Expr>,
        flags: Option<Expr>,
    ) -> Expr {
        // `Option<Expr>` iterates over zero or one element, so chaining it
        // appends the value only when it is present.
        let call_args: Vec<Expr> = vec![values, regex]
            .into_iter()
            .chain(start)
            .chain(flags)
            .collect();
        super::regexp_count().call(call_args)
    }

    /// Builds a call to the `regexp_match` UDF.
    ///
    /// The argument list is `values`, `regex`, followed by `flags` when it is
    /// provided.
    pub fn regexp_match(values: Expr, regex: Expr, flags: Option<Expr>) -> Expr {
        let mut call_args = vec![values, regex];
        call_args.extend(flags);
        super::regexp_match().call(call_args)
    }

    /// Builds a call to the `regexp_instr` UDF.
    ///
    /// The argument list is `values`, `regex`, followed by whichever of
    /// `start`, `n`, `endoption`, `flags` and `subexpr` are provided, in that
    /// order.
    pub fn regexp_instr(
        values: Expr,
        regex: Expr,
        start: Option<Expr>,
        n: Option<Expr>,
        endoption: Option<Expr>,
        flags: Option<Expr>,
        subexpr: Option<Expr>,
    ) -> Expr {
        let optional_args = vec![start, n, endoption, flags, subexpr];

        let mut call_args = vec![values, regex];
        // `flatten` drops the `None` entries while keeping declaration order.
        call_args.extend(optional_args.into_iter().flatten());
        super::regexp_instr().call(call_args)
    }

    /// Builds a call to the `regexp_like` UDF.
    ///
    /// The argument list is `values`, `regex`, followed by `flags` when it is
    /// provided.
    pub fn regexp_like(values: Expr, regex: Expr, flags: Option<Expr>) -> Expr {
        let mut call_args = vec![values, regex];
        call_args.extend(flags);
        super::regexp_like().call(call_args)
    }

    /// Builds a call to the `regexp_replace` UDF.
    ///
    /// The argument list is `string`, `pattern`, `replacement`, followed by
    /// `flags` when it is provided.
    pub fn regexp_replace(
        string: Expr,
        pattern: Expr,
        replacement: Expr,
        flags: Option<Expr>,
    ) -> Expr {
        let mut call_args = vec![string, pattern, replacement];
        call_args.extend(flags);
        super::regexp_replace().call(call_args)
    }
}
/// Returns the regex scalar UDFs exposed by this module.
///
/// The list is, in order: `regexp_count`, `regexp_match`, `regexp_instr`,
/// `regexp_like` and `regexp_replace`.
pub fn functions() -> Vec<Arc<datafusion_expr::ScalarUDF>> {
    Vec::from([
        regexp_count(),
        regexp_match(),
        regexp_instr(),
        regexp_like(),
        regexp_replace(),
    ])
}
/// Returns the compiled regex for `(regex, flags)`, compiling it at most once
/// per cache.
///
/// On a cache hit the stored [`Regex`] is returned directly. On a miss the
/// pattern is compiled with [`compile_regex`], stored in `regex_cache` under
/// the `(regex, flags)` key, and a reference to the stored value is returned.
///
/// # Errors
///
/// Propagates the [`ArrowError`] produced by [`compile_regex`]. When
/// compilation fails nothing is inserted into the cache.
pub fn compile_and_cache_regex<'strings, 'cache>(
    regex: &'strings str,
    flags: Option<&'strings str>,
    regex_cache: &'cache mut HashMap<(&'strings str, Option<&'strings str>), Regex>,
) -> Result<&'cache Regex, ArrowError>
where
    'strings: 'cache,
{
    // A single `entry` lookup serves both the hit and the miss path.
    let empty_slot = match regex_cache.entry((regex, flags)) {
        Entry::Occupied(cached) => return Ok(cached.into_mut()),
        Entry::Vacant(empty_slot) => empty_slot,
    };

    // Compile before inserting so a failed compilation leaves the cache as is.
    let fresh = compile_regex(regex, flags)?;
    Ok(empty_slot.insert(fresh))
}
/// Compiles `regex` into a [`Regex`], applying `flags` when any are given.
///
/// When `flags` is `None` or the empty string, `regex` is compiled unchanged.
/// Otherwise the flags are prepended to the pattern as an inline group, i.e.
/// the compiled pattern is `(?<flags>)<regex>`.
///
/// # Errors
///
/// Returns [`ArrowError::ComputeError`] when
/// * `flags` contains `g` (the global flag is rejected), or
/// * the resulting pattern is not accepted by [`Regex::new`]; the message
///   includes the full pattern that failed to compile.
pub fn compile_regex(regex: &str, flags: Option<&str>) -> Result<Regex, ArrowError> {
    // Treat an empty flag string exactly like the absence of flags.
    let pattern = match flags.filter(|given| !given.is_empty()) {
        Some(flags) if flags.contains("g") => {
            return Err(ArrowError::ComputeError(
                "regexp_count()/regexp_instr() does not support the global flag"
                    .to_string(),
            ));
        }
        Some(flags) => format!("(?{flags}){regex}"),
        None => String::from(regex),
    };

    match Regex::new(&pattern) {
        Ok(compiled) => Ok(compiled),
        // The underlying regex error is dropped; only the pattern is reported.
        Err(_) => Err(ArrowError::ComputeError(format!(
            "Regular expression did not compile: {pattern}"
        ))),
    }
}