datafusion_expr/udf.rs
1// Licensed to the Apache Software Foundation (ASF) under one
2// or more contributor license agreements. See the NOTICE file
3// distributed with this work for additional information
4// regarding copyright ownership. The ASF licenses this file
5// to you under the Apache License, Version 2.0 (the
6// "License"); you may not use this file except in compliance
7// with the License. You may obtain a copy of the License at
8//
9// http://www.apache.org/licenses/LICENSE-2.0
10//
11// Unless required by applicable law or agreed to in writing,
12// software distributed under the License is distributed on an
13// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
14// KIND, either express or implied. See the License for the
15// specific language governing permissions and limitations
16// under the License.
17
18//! [`ScalarUDF`]: Scalar User Defined Functions
19
20use crate::async_udf::AsyncScalarUDF;
21use crate::expr::schema_name_from_exprs_comma_separated_without_space;
22use crate::simplify::{ExprSimplifyResult, SimplifyInfo};
23use crate::sort_properties::{ExprProperties, SortProperties};
24use crate::{ColumnarValue, Documentation, Expr, Signature};
25use arrow::datatypes::{DataType, Field, FieldRef};
26use datafusion_common::{not_impl_err, ExprSchema, Result, ScalarValue};
27use datafusion_expr_common::interval_arithmetic::Interval;
28use std::any::Any;
29use std::cmp::Ordering;
30use std::fmt::Debug;
31use std::hash::{DefaultHasher, Hash, Hasher};
32use std::sync::Arc;
33
34/// Logical representation of a Scalar User Defined Function.
35///
36/// A scalar function produces a single row output for each row of input. This
37/// struct contains the information DataFusion needs to plan and invoke
38/// functions you supply such as name, type signature, return type, and actual
39/// implementation.
40///
41/// 1. For simple use cases, use [`create_udf`] (examples in [`simple_udf.rs`]).
42///
43/// 2. For advanced use cases, use [`ScalarUDFImpl`] which provides full API
44/// access (examples in [`advanced_udf.rs`]).
45///
46/// See [`Self::call`] to create an `Expr` which invokes a `ScalarUDF` with arguments.
47///
48/// # API Note
49///
50/// This is a separate struct from [`ScalarUDFImpl`] to maintain backwards
51/// compatibility with the older API.
52///
53/// [`create_udf`]: crate::expr_fn::create_udf
54/// [`simple_udf.rs`]: https://github.com/apache/datafusion/blob/main/datafusion-examples/examples/simple_udf.rs
55/// [`advanced_udf.rs`]: https://github.com/apache/datafusion/blob/main/datafusion-examples/examples/advanced_udf.rs
56#[derive(Debug, Clone)]
57pub struct ScalarUDF {
58 inner: Arc<dyn ScalarUDFImpl>,
59}
60
61impl PartialEq for ScalarUDF {
62 fn eq(&self, other: &Self) -> bool {
63 self.inner.equals(other.inner.as_ref())
64 }
65}
66
67// Manual implementation based on `ScalarUDFImpl::equals`
68impl PartialOrd for ScalarUDF {
69 fn partial_cmp(&self, other: &Self) -> Option<Ordering> {
70 match self.name().partial_cmp(other.name()) {
71 Some(Ordering::Equal) => self.signature().partial_cmp(other.signature()),
72 cmp => cmp,
73 }
74 }
75}
76
77impl Eq for ScalarUDF {}
78
79impl Hash for ScalarUDF {
80 fn hash<H: Hasher>(&self, state: &mut H) {
81 self.inner.hash_value().hash(state)
82 }
83}
84
85impl ScalarUDF {
86 /// Create a new `ScalarUDF` from a `[ScalarUDFImpl]` trait object
87 ///
88 /// Note this is the same as using the `From` impl (`ScalarUDF::from`)
89 pub fn new_from_impl<F>(fun: F) -> ScalarUDF
90 where
91 F: ScalarUDFImpl + 'static,
92 {
93 Self::new_from_shared_impl(Arc::new(fun))
94 }
95
96 /// Create a new `ScalarUDF` from a `[ScalarUDFImpl]` trait object
97 pub fn new_from_shared_impl(fun: Arc<dyn ScalarUDFImpl>) -> ScalarUDF {
98 Self { inner: fun }
99 }
100
101 /// Return the underlying [`ScalarUDFImpl`] trait object for this function
102 pub fn inner(&self) -> &Arc<dyn ScalarUDFImpl> {
103 &self.inner
104 }
105
106 /// Adds additional names that can be used to invoke this function, in
107 /// addition to `name`
108 ///
109 /// If you implement [`ScalarUDFImpl`] directly you should return aliases directly.
110 pub fn with_aliases(self, aliases: impl IntoIterator<Item = &'static str>) -> Self {
111 Self::new_from_impl(AliasedScalarUDFImpl::new(Arc::clone(&self.inner), aliases))
112 }
113
114 /// Returns a [`Expr`] logical expression to call this UDF with specified
115 /// arguments.
116 ///
117 /// This utility allows easily calling UDFs
118 ///
119 /// # Example
120 /// ```no_run
121 /// use datafusion_expr::{col, lit, ScalarUDF};
122 /// # fn my_udf() -> ScalarUDF { unimplemented!() }
123 /// let my_func: ScalarUDF = my_udf();
124 /// // Create an expr for `my_func(a, 12.3)`
125 /// let expr = my_func.call(vec![col("a"), lit(12.3)]);
126 /// ```
127 pub fn call(&self, args: Vec<Expr>) -> Expr {
128 Expr::ScalarFunction(crate::expr::ScalarFunction::new_udf(
129 Arc::new(self.clone()),
130 args,
131 ))
132 }
133
134 /// Returns this function's name.
135 ///
136 /// See [`ScalarUDFImpl::name`] for more details.
137 pub fn name(&self) -> &str {
138 self.inner.name()
139 }
140
141 /// Returns this function's display_name.
142 ///
143 /// See [`ScalarUDFImpl::display_name`] for more details
144 pub fn display_name(&self, args: &[Expr]) -> Result<String> {
145 self.inner.display_name(args)
146 }
147
148 /// Returns this function's schema_name.
149 ///
150 /// See [`ScalarUDFImpl::schema_name`] for more details
151 pub fn schema_name(&self, args: &[Expr]) -> Result<String> {
152 self.inner.schema_name(args)
153 }
154
155 /// Returns the aliases for this function.
156 ///
157 /// See [`ScalarUDF::with_aliases`] for more details
158 pub fn aliases(&self) -> &[String] {
159 self.inner.aliases()
160 }
161
162 /// Returns this function's [`Signature`] (what input types are accepted).
163 ///
164 /// See [`ScalarUDFImpl::signature`] for more details.
165 pub fn signature(&self) -> &Signature {
166 self.inner.signature()
167 }
168
169 /// The datatype this function returns given the input argument types.
170 /// This function is used when the input arguments are [`DataType`]s.
171 ///
172 /// # Notes
173 ///
174 /// If a function implement [`ScalarUDFImpl::return_field_from_args`],
175 /// its [`ScalarUDFImpl::return_type`] should raise an error.
176 ///
177 /// See [`ScalarUDFImpl::return_type`] for more details.
178 pub fn return_type(&self, arg_types: &[DataType]) -> Result<DataType> {
179 self.inner.return_type(arg_types)
180 }
181
182 /// Return the datatype this function returns given the input argument types.
183 ///
184 /// See [`ScalarUDFImpl::return_field_from_args`] for more details.
185 pub fn return_field_from_args(&self, args: ReturnFieldArgs) -> Result<FieldRef> {
186 self.inner.return_field_from_args(args)
187 }
188
189 /// Do the function rewrite
190 ///
191 /// See [`ScalarUDFImpl::simplify`] for more details.
192 pub fn simplify(
193 &self,
194 args: Vec<Expr>,
195 info: &dyn SimplifyInfo,
196 ) -> Result<ExprSimplifyResult> {
197 self.inner.simplify(args, info)
198 }
199
200 #[allow(deprecated)]
201 pub fn is_nullable(&self, args: &[Expr], schema: &dyn ExprSchema) -> bool {
202 self.inner.is_nullable(args, schema)
203 }
204
205 /// Invoke the function on `args`, returning the appropriate result.
206 ///
207 /// See [`ScalarUDFImpl::invoke_with_args`] for details.
208 pub fn invoke_with_args(&self, args: ScalarFunctionArgs) -> Result<ColumnarValue> {
209 self.inner.invoke_with_args(args)
210 }
211
212 /// Get the circuits of inner implementation
213 pub fn short_circuits(&self) -> bool {
214 self.inner.short_circuits()
215 }
216
217 /// Computes the output interval for a [`ScalarUDF`], given the input
218 /// intervals.
219 ///
220 /// # Parameters
221 ///
222 /// * `inputs` are the intervals for the inputs (children) of this function.
223 ///
224 /// # Example
225 ///
226 /// If the function is `ABS(a)`, and the input interval is `a: [-3, 2]`,
227 /// then the output interval would be `[0, 3]`.
228 pub fn evaluate_bounds(&self, inputs: &[&Interval]) -> Result<Interval> {
229 self.inner.evaluate_bounds(inputs)
230 }
231
232 /// Updates bounds for child expressions, given a known interval for this
233 /// function. This is used to propagate constraints down through an expression
234 /// tree.
235 ///
236 /// # Parameters
237 ///
238 /// * `interval` is the currently known interval for this function.
239 /// * `inputs` are the current intervals for the inputs (children) of this function.
240 ///
241 /// # Returns
242 ///
243 /// A `Vec` of new intervals for the children, in order.
244 ///
245 /// If constraint propagation reveals an infeasibility for any child, returns
246 /// [`None`]. If none of the children intervals change as a result of
247 /// propagation, may return an empty vector instead of cloning `children`.
248 /// This is the default (and conservative) return value.
249 ///
250 /// # Example
251 ///
252 /// If the function is `ABS(a)`, the current `interval` is `[4, 5]` and the
253 /// input `a` is given as `[-7, 3]`, then propagation would return `[-5, 3]`.
254 pub fn propagate_constraints(
255 &self,
256 interval: &Interval,
257 inputs: &[&Interval],
258 ) -> Result<Option<Vec<Interval>>> {
259 self.inner.propagate_constraints(interval, inputs)
260 }
261
262 /// Calculates the [`SortProperties`] of this function based on its
263 /// children's properties.
264 pub fn output_ordering(&self, inputs: &[ExprProperties]) -> Result<SortProperties> {
265 self.inner.output_ordering(inputs)
266 }
267
268 pub fn preserves_lex_ordering(&self, inputs: &[ExprProperties]) -> Result<bool> {
269 self.inner.preserves_lex_ordering(inputs)
270 }
271
272 /// See [`ScalarUDFImpl::coerce_types`] for more details.
273 pub fn coerce_types(&self, arg_types: &[DataType]) -> Result<Vec<DataType>> {
274 self.inner.coerce_types(arg_types)
275 }
276
277 /// Returns the documentation for this Scalar UDF.
278 ///
279 /// Documentation can be accessed programmatically as well as
280 /// generating publicly facing documentation.
281 pub fn documentation(&self) -> Option<&Documentation> {
282 self.inner.documentation()
283 }
284
285 /// Return true if this function is an async function
286 pub fn as_async(&self) -> Option<&AsyncScalarUDF> {
287 self.inner().as_any().downcast_ref::<AsyncScalarUDF>()
288 }
289}
290
291impl<F> From<F> for ScalarUDF
292where
293 F: ScalarUDFImpl + 'static,
294{
295 fn from(fun: F) -> Self {
296 Self::new_from_impl(fun)
297 }
298}
299
300/// Arguments passed to [`ScalarUDFImpl::invoke_with_args`] when invoking a
301/// scalar function.
302#[derive(Debug, Clone)]
303pub struct ScalarFunctionArgs {
304 /// The evaluated arguments to the function
305 pub args: Vec<ColumnarValue>,
306 /// Field associated with each arg, if it exists
307 pub arg_fields: Vec<FieldRef>,
308 /// The number of rows in record batch being evaluated
309 pub number_rows: usize,
310 /// The return field of the scalar function returned (from `return_type`
311 /// or `return_field_from_args`) when creating the physical expression
312 /// from the logical expression
313 pub return_field: FieldRef,
314}
315
316impl ScalarFunctionArgs {
317 /// The return type of the function. See [`Self::return_field`] for more
318 /// details.
319 pub fn return_type(&self) -> &DataType {
320 self.return_field.data_type()
321 }
322}
323
324/// Information about arguments passed to the function
325///
326/// This structure contains metadata about how the function was called
327/// such as the type of the arguments, any scalar arguments and if the
328/// arguments can (ever) be null
329///
330/// See [`ScalarUDFImpl::return_field_from_args`] for more information
331#[derive(Debug)]
332pub struct ReturnFieldArgs<'a> {
333 /// The data types of the arguments to the function
334 pub arg_fields: &'a [FieldRef],
335 /// Is argument `i` to the function a scalar (constant)?
336 ///
337 /// If the argument `i` is not a scalar, it will be None
338 ///
339 /// For example, if a function is called like `my_function(column_a, 5)`
340 /// this field will be `[None, Some(ScalarValue::Int32(Some(5)))]`
341 pub scalar_arguments: &'a [Option<&'a ScalarValue>],
342}
343
344/// Trait for implementing user defined scalar functions.
345///
346/// This trait exposes the full API for implementing user defined functions and
347/// can be used to implement any function.
348///
349/// See [`advanced_udf.rs`] for a full example with complete implementation and
350/// [`ScalarUDF`] for other available options.
351///
352/// [`advanced_udf.rs`]: https://github.com/apache/datafusion/blob/main/datafusion-examples/examples/advanced_udf.rs
353///
354/// # Basic Example
355/// ```
356/// # use std::any::Any;
357/// # use std::sync::LazyLock;
358/// # use arrow::datatypes::DataType;
359/// # use datafusion_common::{DataFusionError, plan_err, Result};
360/// # use datafusion_expr::{col, ColumnarValue, Documentation, ScalarFunctionArgs, Signature, Volatility};
361/// # use datafusion_expr::{ScalarUDFImpl, ScalarUDF};
362/// # use datafusion_expr::scalar_doc_sections::DOC_SECTION_MATH;
363/// /// This struct for a simple UDF that adds one to an int32
364/// #[derive(Debug)]
365/// struct AddOne {
366/// signature: Signature,
367/// }
368///
369/// impl AddOne {
370/// fn new() -> Self {
371/// Self {
372/// signature: Signature::uniform(1, vec![DataType::Int32], Volatility::Immutable),
373/// }
374/// }
375/// }
376///
377/// static DOCUMENTATION: LazyLock<Documentation> = LazyLock::new(|| {
378/// Documentation::builder(DOC_SECTION_MATH, "Add one to an int32", "add_one(2)")
379/// .with_argument("arg1", "The int32 number to add one to")
380/// .build()
381/// });
382///
383/// fn get_doc() -> &'static Documentation {
384/// &DOCUMENTATION
385/// }
386///
387/// /// Implement the ScalarUDFImpl trait for AddOne
388/// impl ScalarUDFImpl for AddOne {
389/// fn as_any(&self) -> &dyn Any { self }
390/// fn name(&self) -> &str { "add_one" }
391/// fn signature(&self) -> &Signature { &self.signature }
392/// fn return_type(&self, args: &[DataType]) -> Result<DataType> {
393/// if !matches!(args.get(0), Some(&DataType::Int32)) {
394/// return plan_err!("add_one only accepts Int32 arguments");
395/// }
396/// Ok(DataType::Int32)
397/// }
398/// // The actual implementation would add one to the argument
399/// fn invoke_with_args(&self, args: ScalarFunctionArgs) -> Result<ColumnarValue> {
400/// unimplemented!()
401/// }
402/// fn documentation(&self) -> Option<&Documentation> {
403/// Some(get_doc())
404/// }
405/// }
406///
407/// // Create a new ScalarUDF from the implementation
408/// let add_one = ScalarUDF::from(AddOne::new());
409///
410/// // Call the function `add_one(col)`
411/// let expr = add_one.call(vec![col("a")]);
412/// ```
413pub trait ScalarUDFImpl: Debug + Send + Sync {
414 // Note: When adding any methods (with default implementations), remember to add them also
415 // into the AliasedScalarUDFImpl below!
416
417 /// Returns this object as an [`Any`] trait object
418 fn as_any(&self) -> &dyn Any;
419
420 /// Returns this function's name
421 fn name(&self) -> &str;
422
423 /// Returns any aliases (alternate names) for this function.
424 ///
425 /// Aliases can be used to invoke the same function using different names.
426 /// For example in some databases `now()` and `current_timestamp()` are
427 /// aliases for the same function. This behavior can be obtained by
428 /// returning `current_timestamp` as an alias for the `now` function.
429 ///
430 /// Note: `aliases` should only include names other than [`Self::name`].
431 /// Defaults to `[]` (no aliases)
432 fn aliases(&self) -> &[String] {
433 &[]
434 }
435
436 /// Returns the user-defined display name of function, given the arguments
437 ///
438 /// This can be used to customize the output column name generated by this
439 /// function.
440 ///
441 /// Defaults to `name(args[0], args[1], ...)`
442 fn display_name(&self, args: &[Expr]) -> Result<String> {
443 let names: Vec<String> = args.iter().map(ToString::to_string).collect();
444 // TODO: join with ", " to standardize the formatting of Vec<Expr>, <https://github.com/apache/datafusion/issues/10364>
445 Ok(format!("{}({})", self.name(), names.join(",")))
446 }
447
448 /// Returns the name of the column this expression would create
449 ///
450 /// See [`Expr::schema_name`] for details
451 fn schema_name(&self, args: &[Expr]) -> Result<String> {
452 Ok(format!(
453 "{}({})",
454 self.name(),
455 schema_name_from_exprs_comma_separated_without_space(args)?
456 ))
457 }
458
459 /// Returns the function's [`Signature`] for information about what input
460 /// types are accepted and the function's Volatility.
461 fn signature(&self) -> &Signature;
462
463 /// What [`DataType`] will be returned by this function, given the types of
464 /// the arguments.
465 ///
466 /// # Notes
467 ///
468 /// If you provide an implementation for [`Self::return_field_from_args`],
469 /// DataFusion will not call `return_type` (this function). In such cases
470 /// is recommended to return [`DataFusionError::Internal`].
471 ///
472 /// [`DataFusionError::Internal`]: datafusion_common::DataFusionError::Internal
473 fn return_type(&self, arg_types: &[DataType]) -> Result<DataType>;
474
475 /// What type will be returned by this function, given the arguments?
476 ///
477 /// By default, this function calls [`Self::return_type`] with the
478 /// types of each argument.
479 ///
480 /// # Notes
481 ///
482 /// For the majority of UDFs, implementing [`Self::return_type`] is sufficient,
483 /// as the result type is typically a deterministic function of the input types
484 /// (e.g., `sqrt(f32)` consistently yields `f32`). Implementing this method directly
485 /// is generally unnecessary unless the return type depends on runtime values.
486 ///
487 /// This function can be used for more advanced cases such as:
488 ///
489 /// 1. specifying nullability
490 /// 2. return types based on the **values** of the arguments (rather than
491 /// their **types**.
492 ///
493 /// # Example creating `Field`
494 ///
495 /// Note the name of the [`Field`] is ignored, except for structured types such as
496 /// `DataType::Struct`.
497 ///
498 /// ```rust
499 /// # use std::sync::Arc;
500 /// # use arrow::datatypes::{DataType, Field, FieldRef};
501 /// # use datafusion_common::Result;
502 /// # use datafusion_expr::ReturnFieldArgs;
503 /// # struct Example{}
504 /// # impl Example {
505 /// fn return_field_from_args(&self, args: ReturnFieldArgs) -> Result<FieldRef> {
506 /// // report output is only nullable if any one of the arguments are nullable
507 /// let nullable = args.arg_fields.iter().any(|f| f.is_nullable());
508 /// let field = Arc::new(Field::new("ignored_name", DataType::Int32, true));
509 /// Ok(field)
510 /// }
511 /// # }
512 /// ```
513 ///
514 /// # Output Type based on Values
515 ///
516 /// For example, the following two function calls get the same argument
517 /// types (something and a `Utf8` string) but return different types based
518 /// on the value of the second argument:
519 ///
520 /// * `arrow_cast(x, 'Int16')` --> `Int16`
521 /// * `arrow_cast(x, 'Float32')` --> `Float32`
522 ///
523 /// # Requirements
524 ///
525 /// This function **must** consistently return the same type for the same
526 /// logical input even if the input is simplified (e.g. it must return the same
527 /// value for `('foo' | 'bar')` as it does for ('foobar').
528 fn return_field_from_args(&self, args: ReturnFieldArgs) -> Result<FieldRef> {
529 let data_types = args
530 .arg_fields
531 .iter()
532 .map(|f| f.data_type())
533 .cloned()
534 .collect::<Vec<_>>();
535 let return_type = self.return_type(&data_types)?;
536 Ok(Arc::new(Field::new(self.name(), return_type, true)))
537 }
538
539 #[deprecated(
540 since = "45.0.0",
541 note = "Use `return_field_from_args` instead. if you use `is_nullable` that returns non-nullable with `return_type`, you would need to switch to `return_field_from_args`, you might have error"
542 )]
543 fn is_nullable(&self, _args: &[Expr], _schema: &dyn ExprSchema) -> bool {
544 true
545 }
546
547 /// Invoke the function returning the appropriate result.
548 ///
549 /// # Performance
550 ///
551 /// For the best performance, the implementations should handle the common case
552 /// when one or more of their arguments are constant values (aka
553 /// [`ColumnarValue::Scalar`]).
554 ///
555 /// [`ColumnarValue::values_to_arrays`] can be used to convert the arguments
556 /// to arrays, which will likely be simpler code, but be slower.
557 fn invoke_with_args(&self, args: ScalarFunctionArgs) -> Result<ColumnarValue>;
558
559 /// Optionally apply per-UDF simplification / rewrite rules.
560 ///
561 /// This can be used to apply function specific simplification rules during
562 /// optimization (e.g. `arrow_cast` --> `Expr::Cast`). The default
563 /// implementation does nothing.
564 ///
565 /// Note that DataFusion handles simplifying arguments and "constant
566 /// folding" (replacing a function call with constant arguments such as
567 /// `my_add(1,2) --> 3` ). Thus, there is no need to implement such
568 /// optimizations manually for specific UDFs.
569 ///
570 /// # Arguments
571 /// * `args`: The arguments of the function
572 /// * `info`: The necessary information for simplification
573 ///
574 /// # Returns
575 /// [`ExprSimplifyResult`] indicating the result of the simplification NOTE
576 /// if the function cannot be simplified, the arguments *MUST* be returned
577 /// unmodified
578 fn simplify(
579 &self,
580 args: Vec<Expr>,
581 _info: &dyn SimplifyInfo,
582 ) -> Result<ExprSimplifyResult> {
583 Ok(ExprSimplifyResult::Original(args))
584 }
585
586 /// Returns true if some of this `exprs` subexpressions may not be evaluated
587 /// and thus any side effects (like divide by zero) may not be encountered.
588 ///
589 /// Setting this to true prevents certain optimizations such as common
590 /// subexpression elimination
591 fn short_circuits(&self) -> bool {
592 false
593 }
594
595 /// Computes the output [`Interval`] for a [`ScalarUDFImpl`], given the input
596 /// intervals.
597 ///
598 /// # Parameters
599 ///
600 /// * `children` are the intervals for the children (inputs) of this function.
601 ///
602 /// # Example
603 ///
604 /// If the function is `ABS(a)`, and the input interval is `a: [-3, 2]`,
605 /// then the output interval would be `[0, 3]`.
606 fn evaluate_bounds(&self, _input: &[&Interval]) -> Result<Interval> {
607 // We cannot assume the input datatype is the same of output type.
608 Interval::make_unbounded(&DataType::Null)
609 }
610
611 /// Updates bounds for child expressions, given a known [`Interval`]s for this
612 /// function.
613 ///
614 /// This function is used to propagate constraints down through an
615 /// expression tree.
616 ///
617 /// # Parameters
618 ///
619 /// * `interval` is the currently known interval for this function.
620 /// * `inputs` are the current intervals for the inputs (children) of this function.
621 ///
622 /// # Returns
623 ///
624 /// A `Vec` of new intervals for the children, in order.
625 ///
626 /// If constraint propagation reveals an infeasibility for any child, returns
627 /// [`None`]. If none of the children intervals change as a result of
628 /// propagation, may return an empty vector instead of cloning `children`.
629 /// This is the default (and conservative) return value.
630 ///
631 /// # Example
632 ///
633 /// If the function is `ABS(a)`, the current `interval` is `[4, 5]` and the
634 /// input `a` is given as `[-7, 3]`, then propagation would return `[-5, 3]`.
635 fn propagate_constraints(
636 &self,
637 _interval: &Interval,
638 _inputs: &[&Interval],
639 ) -> Result<Option<Vec<Interval>>> {
640 Ok(Some(vec![]))
641 }
642
643 /// Calculates the [`SortProperties`] of this function based on its children's properties.
644 fn output_ordering(&self, inputs: &[ExprProperties]) -> Result<SortProperties> {
645 if !self.preserves_lex_ordering(inputs)? {
646 return Ok(SortProperties::Unordered);
647 }
648
649 let Some(first_order) = inputs.first().map(|p| &p.sort_properties) else {
650 return Ok(SortProperties::Singleton);
651 };
652
653 if inputs
654 .iter()
655 .skip(1)
656 .all(|input| &input.sort_properties == first_order)
657 {
658 Ok(*first_order)
659 } else {
660 Ok(SortProperties::Unordered)
661 }
662 }
663
664 /// Returns true if the function preserves lexicographical ordering based on
665 /// the input ordering.
666 ///
667 /// For example, `concat(a || b)` preserves lexicographical ordering, but `abs(a)` does not.
668 fn preserves_lex_ordering(&self, _inputs: &[ExprProperties]) -> Result<bool> {
669 Ok(false)
670 }
671
672 /// Coerce arguments of a function call to types that the function can evaluate.
673 ///
674 /// This function is only called if [`ScalarUDFImpl::signature`] returns
675 /// [`crate::TypeSignature::UserDefined`]. Most UDFs should return one of
676 /// the other variants of [`TypeSignature`] which handle common cases.
677 ///
678 /// See the [type coercion module](crate::type_coercion)
679 /// documentation for more details on type coercion
680 ///
681 /// [`TypeSignature`]: crate::TypeSignature
682 ///
683 /// For example, if your function requires a floating point arguments, but the user calls
684 /// it like `my_func(1::int)` (i.e. with `1` as an integer), coerce_types can return `[DataType::Float64]`
685 /// to ensure the argument is converted to `1::double`
686 ///
687 /// # Parameters
688 /// * `arg_types`: The argument types of the arguments this function with
689 ///
690 /// # Return value
691 /// A Vec the same length as `arg_types`. DataFusion will `CAST` the function call
692 /// arguments to these specific types.
693 fn coerce_types(&self, _arg_types: &[DataType]) -> Result<Vec<DataType>> {
694 not_impl_err!("Function {} does not implement coerce_types", self.name())
695 }
696
697 /// Return true if this scalar UDF is equal to the other.
698 ///
699 /// Allows customizing the equality of scalar UDFs.
700 /// *Must* be implemented explicitly if the UDF type has internal state.
701 /// Must be consistent with [`Self::hash_value`] and follow the same rules as [`Eq`]:
702 ///
703 /// - reflexive: `a.equals(a)`;
704 /// - symmetric: `a.equals(b)` implies `b.equals(a)`;
705 /// - transitive: `a.equals(b)` and `b.equals(c)` implies `a.equals(c)`.
706 ///
707 /// By default, compares type, [`Self::name`], [`Self::aliases`] and [`Self::signature`].
708 fn equals(&self, other: &dyn ScalarUDFImpl) -> bool {
709 self.as_any().type_id() == other.as_any().type_id()
710 && self.name() == other.name()
711 && self.aliases() == other.aliases()
712 && self.signature() == other.signature()
713 }
714
715 /// Returns a hash value for this scalar UDF.
716 ///
717 /// Allows customizing the hash code of scalar UDFs.
718 /// *Must* be implemented explicitly whenever [`Self::equals`] is implemented.
719 ///
720 /// Similarly to [`Hash`] and [`Eq`], if [`Self::equals`] returns true for two UDFs,
721 /// their `hash_value`s must be the same.
722 ///
723 /// By default, it is consistent with default implementation of [`Self::equals`].
724 fn hash_value(&self) -> u64 {
725 let hasher = &mut DefaultHasher::new();
726 self.as_any().type_id().hash(hasher);
727 self.name().hash(hasher);
728 self.aliases().hash(hasher);
729 self.signature().hash(hasher);
730 hasher.finish()
731 }
732
733 /// Returns the documentation for this Scalar UDF.
734 ///
735 /// Documentation can be accessed programmatically as well as generating
736 /// publicly facing documentation.
737 fn documentation(&self) -> Option<&Documentation> {
738 None
739 }
740}
741
742/// ScalarUDF that adds an alias to the underlying function. It is better to
743/// implement [`ScalarUDFImpl`], which supports aliases, directly if possible.
744#[derive(Debug)]
745struct AliasedScalarUDFImpl {
746 inner: Arc<dyn ScalarUDFImpl>,
747 aliases: Vec<String>,
748}
749
750impl AliasedScalarUDFImpl {
751 pub fn new(
752 inner: Arc<dyn ScalarUDFImpl>,
753 new_aliases: impl IntoIterator<Item = &'static str>,
754 ) -> Self {
755 let mut aliases = inner.aliases().to_vec();
756 aliases.extend(new_aliases.into_iter().map(|s| s.to_string()));
757 Self { inner, aliases }
758 }
759}
760
761impl ScalarUDFImpl for AliasedScalarUDFImpl {
762 fn as_any(&self) -> &dyn Any {
763 self
764 }
765
766 fn name(&self) -> &str {
767 self.inner.name()
768 }
769
770 fn display_name(&self, args: &[Expr]) -> Result<String> {
771 self.inner.display_name(args)
772 }
773
774 fn schema_name(&self, args: &[Expr]) -> Result<String> {
775 self.inner.schema_name(args)
776 }
777
778 fn signature(&self) -> &Signature {
779 self.inner.signature()
780 }
781
782 fn return_type(&self, arg_types: &[DataType]) -> Result<DataType> {
783 self.inner.return_type(arg_types)
784 }
785
786 fn return_field_from_args(&self, args: ReturnFieldArgs) -> Result<FieldRef> {
787 self.inner.return_field_from_args(args)
788 }
789
790 fn invoke_with_args(&self, args: ScalarFunctionArgs) -> Result<ColumnarValue> {
791 self.inner.invoke_with_args(args)
792 }
793
794 fn aliases(&self) -> &[String] {
795 &self.aliases
796 }
797
798 fn simplify(
799 &self,
800 args: Vec<Expr>,
801 info: &dyn SimplifyInfo,
802 ) -> Result<ExprSimplifyResult> {
803 self.inner.simplify(args, info)
804 }
805
806 fn short_circuits(&self) -> bool {
807 self.inner.short_circuits()
808 }
809
810 fn evaluate_bounds(&self, input: &[&Interval]) -> Result<Interval> {
811 self.inner.evaluate_bounds(input)
812 }
813
814 fn propagate_constraints(
815 &self,
816 interval: &Interval,
817 inputs: &[&Interval],
818 ) -> Result<Option<Vec<Interval>>> {
819 self.inner.propagate_constraints(interval, inputs)
820 }
821
822 fn output_ordering(&self, inputs: &[ExprProperties]) -> Result<SortProperties> {
823 self.inner.output_ordering(inputs)
824 }
825
826 fn preserves_lex_ordering(&self, inputs: &[ExprProperties]) -> Result<bool> {
827 self.inner.preserves_lex_ordering(inputs)
828 }
829
830 fn coerce_types(&self, arg_types: &[DataType]) -> Result<Vec<DataType>> {
831 self.inner.coerce_types(arg_types)
832 }
833
834 fn equals(&self, other: &dyn ScalarUDFImpl) -> bool {
835 if let Some(other) = other.as_any().downcast_ref::<AliasedScalarUDFImpl>() {
836 self.inner.equals(other.inner.as_ref()) && self.aliases == other.aliases
837 } else {
838 false
839 }
840 }
841
842 fn hash_value(&self) -> u64 {
843 let hasher = &mut DefaultHasher::new();
844 std::any::type_name::<Self>().hash(hasher);
845 self.inner.hash_value().hash(hasher);
846 self.aliases.hash(hasher);
847 hasher.finish()
848 }
849
850 fn documentation(&self) -> Option<&Documentation> {
851 self.inner.documentation()
852 }
853}
854
855// Scalar UDF doc sections for use in public documentation
856pub mod scalar_doc_sections {
857 use crate::DocSection;
858
859 pub fn doc_sections() -> Vec<DocSection> {
860 vec![
861 DOC_SECTION_MATH,
862 DOC_SECTION_CONDITIONAL,
863 DOC_SECTION_STRING,
864 DOC_SECTION_BINARY_STRING,
865 DOC_SECTION_REGEX,
866 DOC_SECTION_DATETIME,
867 DOC_SECTION_ARRAY,
868 DOC_SECTION_STRUCT,
869 DOC_SECTION_MAP,
870 DOC_SECTION_HASHING,
871 DOC_SECTION_UNION,
872 DOC_SECTION_OTHER,
873 ]
874 }
875
876 pub const fn doc_sections_const() -> &'static [DocSection] {
877 &[
878 DOC_SECTION_MATH,
879 DOC_SECTION_CONDITIONAL,
880 DOC_SECTION_STRING,
881 DOC_SECTION_BINARY_STRING,
882 DOC_SECTION_REGEX,
883 DOC_SECTION_DATETIME,
884 DOC_SECTION_ARRAY,
885 DOC_SECTION_STRUCT,
886 DOC_SECTION_MAP,
887 DOC_SECTION_HASHING,
888 DOC_SECTION_UNION,
889 DOC_SECTION_OTHER,
890 ]
891 }
892
893 pub const DOC_SECTION_MATH: DocSection = DocSection {
894 include: true,
895 label: "Math Functions",
896 description: None,
897 };
898
899 pub const DOC_SECTION_CONDITIONAL: DocSection = DocSection {
900 include: true,
901 label: "Conditional Functions",
902 description: None,
903 };
904
905 pub const DOC_SECTION_STRING: DocSection = DocSection {
906 include: true,
907 label: "String Functions",
908 description: None,
909 };
910
911 pub const DOC_SECTION_BINARY_STRING: DocSection = DocSection {
912 include: true,
913 label: "Binary String Functions",
914 description: None,
915 };
916
917 pub const DOC_SECTION_REGEX: DocSection = DocSection {
918 include: true,
919 label: "Regular Expression Functions",
920 description: Some(
921 r#"Apache DataFusion uses a [PCRE-like](https://en.wikibooks.org/wiki/Regular_Expressions/Perl-Compatible_Regular_Expressions)
922regular expression [syntax](https://docs.rs/regex/latest/regex/#syntax)
923(minus support for several features including look-around and backreferences).
924The following regular expression functions are supported:"#,
925 ),
926 };
927
928 pub const DOC_SECTION_DATETIME: DocSection = DocSection {
929 include: true,
930 label: "Time and Date Functions",
931 description: None,
932 };
933
934 pub const DOC_SECTION_ARRAY: DocSection = DocSection {
935 include: true,
936 label: "Array Functions",
937 description: None,
938 };
939
940 pub const DOC_SECTION_STRUCT: DocSection = DocSection {
941 include: true,
942 label: "Struct Functions",
943 description: None,
944 };
945
946 pub const DOC_SECTION_MAP: DocSection = DocSection {
947 include: true,
948 label: "Map Functions",
949 description: None,
950 };
951
952 pub const DOC_SECTION_HASHING: DocSection = DocSection {
953 include: true,
954 label: "Hashing Functions",
955 description: None,
956 };
957
958 pub const DOC_SECTION_OTHER: DocSection = DocSection {
959 include: true,
960 label: "Other Functions",
961 description: None,
962 };
963
964 pub const DOC_SECTION_UNION: DocSection = DocSection {
965 include: true,
966 label: "Union Functions",
967 description: Some("Functions to work with the union data type, also know as tagged unions, variant types, enums or sum types. Note: Not related to the SQL UNION operator"),
968 };
969}