datafusion_expr/
udf.rs

1// Licensed to the Apache Software Foundation (ASF) under one
2// or more contributor license agreements.  See the NOTICE file
3// distributed with this work for additional information
4// regarding copyright ownership.  The ASF licenses this file
5// to you under the Apache License, Version 2.0 (the
6// "License"); you may not use this file except in compliance
7// with the License.  You may obtain a copy of the License at
8//
9//   http://www.apache.org/licenses/LICENSE-2.0
10//
11// Unless required by applicable law or agreed to in writing,
12// software distributed under the License is distributed on an
13// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
14// KIND, either express or implied.  See the License for the
15// specific language governing permissions and limitations
16// under the License.
17
18//! [`ScalarUDF`]: Scalar User Defined Functions
19
20use crate::async_udf::AsyncScalarUDF;
21use crate::expr::schema_name_from_exprs_comma_separated_without_space;
22use crate::simplify::{ExprSimplifyResult, SimplifyInfo};
23use crate::sort_properties::{ExprProperties, SortProperties};
24use crate::{ColumnarValue, Documentation, Expr, Signature};
25use arrow::datatypes::{DataType, Field, FieldRef};
26use datafusion_common::{not_impl_err, ExprSchema, Result, ScalarValue};
27use datafusion_expr_common::interval_arithmetic::Interval;
28use std::any::Any;
29use std::cmp::Ordering;
30use std::fmt::Debug;
31use std::hash::{DefaultHasher, Hash, Hasher};
32use std::sync::Arc;
33
/// Logical representation of a Scalar User Defined Function.
///
/// A scalar function produces a single row output for each row of input. This
/// struct contains the information DataFusion needs to plan and invoke
/// functions you supply such as name, type signature, return type, and actual
/// implementation.
///
/// 1. For simple use cases, use [`create_udf`] (examples in [`simple_udf.rs`]).
///
/// 2. For advanced use cases, use [`ScalarUDFImpl`] which provides full API
///    access (examples in [`advanced_udf.rs`]).
///
/// See [`Self::call`] to create an `Expr` which invokes a `ScalarUDF` with arguments.
///
/// # Equality and hashing
///
/// `==` is delegated to [`ScalarUDFImpl::equals`] and hashing to
/// [`ScalarUDFImpl::hash_value`] of the wrapped implementation.
///
/// # API Note
///
/// This is a separate struct from [`ScalarUDFImpl`] to maintain backwards
/// compatibility with the older API.
///
/// [`create_udf`]: crate::expr_fn::create_udf
/// [`simple_udf.rs`]: https://github.com/apache/datafusion/blob/main/datafusion-examples/examples/simple_udf.rs
/// [`advanced_udf.rs`]: https://github.com/apache/datafusion/blob/main/datafusion-examples/examples/advanced_udf.rs
#[derive(Debug, Clone)]
pub struct ScalarUDF {
    // Shared, type-erased implementation that every method of this struct
    // forwards to.
    inner: Arc<dyn ScalarUDFImpl>,
}
60
61impl PartialEq for ScalarUDF {
62    fn eq(&self, other: &Self) -> bool {
63        self.inner.equals(other.inner.as_ref())
64    }
65}
66
67// Manual implementation based on `ScalarUDFImpl::equals`
68impl PartialOrd for ScalarUDF {
69    fn partial_cmp(&self, other: &Self) -> Option<Ordering> {
70        match self.name().partial_cmp(other.name()) {
71            Some(Ordering::Equal) => self.signature().partial_cmp(other.signature()),
72            cmp => cmp,
73        }
74    }
75}
76
// Marker impl: `ScalarUDFImpl::equals` is documented as having to follow the
// same rules as `Eq` (reflexive, symmetric, transitive).
impl Eq for ScalarUDF {}
78
79impl Hash for ScalarUDF {
80    fn hash<H: Hasher>(&self, state: &mut H) {
81        self.inner.hash_value().hash(state)
82    }
83}
84
85impl ScalarUDF {
86    /// Create a new `ScalarUDF` from a `[ScalarUDFImpl]` trait object
87    ///
88    /// Note this is the same as using the `From` impl (`ScalarUDF::from`)
89    pub fn new_from_impl<F>(fun: F) -> ScalarUDF
90    where
91        F: ScalarUDFImpl + 'static,
92    {
93        Self::new_from_shared_impl(Arc::new(fun))
94    }
95
96    /// Create a new `ScalarUDF` from a `[ScalarUDFImpl]` trait object
97    pub fn new_from_shared_impl(fun: Arc<dyn ScalarUDFImpl>) -> ScalarUDF {
98        Self { inner: fun }
99    }
100
101    /// Return the underlying [`ScalarUDFImpl`] trait object for this function
102    pub fn inner(&self) -> &Arc<dyn ScalarUDFImpl> {
103        &self.inner
104    }
105
106    /// Adds additional names that can be used to invoke this function, in
107    /// addition to `name`
108    ///
109    /// If you implement [`ScalarUDFImpl`] directly you should return aliases directly.
110    pub fn with_aliases(self, aliases: impl IntoIterator<Item = &'static str>) -> Self {
111        Self::new_from_impl(AliasedScalarUDFImpl::new(Arc::clone(&self.inner), aliases))
112    }
113
114    /// Returns a [`Expr`] logical expression to call this UDF with specified
115    /// arguments.
116    ///
117    /// This utility allows easily calling UDFs
118    ///
119    /// # Example
120    /// ```no_run
121    /// use datafusion_expr::{col, lit, ScalarUDF};
122    /// # fn my_udf() -> ScalarUDF { unimplemented!() }
123    /// let my_func: ScalarUDF = my_udf();
124    /// // Create an expr for `my_func(a, 12.3)`
125    /// let expr = my_func.call(vec![col("a"), lit(12.3)]);
126    /// ```
127    pub fn call(&self, args: Vec<Expr>) -> Expr {
128        Expr::ScalarFunction(crate::expr::ScalarFunction::new_udf(
129            Arc::new(self.clone()),
130            args,
131        ))
132    }
133
134    /// Returns this function's name.
135    ///
136    /// See [`ScalarUDFImpl::name`] for more details.
137    pub fn name(&self) -> &str {
138        self.inner.name()
139    }
140
141    /// Returns this function's display_name.
142    ///
143    /// See [`ScalarUDFImpl::display_name`] for more details
144    pub fn display_name(&self, args: &[Expr]) -> Result<String> {
145        self.inner.display_name(args)
146    }
147
148    /// Returns this function's schema_name.
149    ///
150    /// See [`ScalarUDFImpl::schema_name`] for more details
151    pub fn schema_name(&self, args: &[Expr]) -> Result<String> {
152        self.inner.schema_name(args)
153    }
154
155    /// Returns the aliases for this function.
156    ///
157    /// See [`ScalarUDF::with_aliases`] for more details
158    pub fn aliases(&self) -> &[String] {
159        self.inner.aliases()
160    }
161
162    /// Returns this function's [`Signature`] (what input types are accepted).
163    ///
164    /// See [`ScalarUDFImpl::signature`] for more details.
165    pub fn signature(&self) -> &Signature {
166        self.inner.signature()
167    }
168
169    /// The datatype this function returns given the input argument types.
170    /// This function is used when the input arguments are [`DataType`]s.
171    ///
172    ///  # Notes
173    ///
174    /// If a function implement [`ScalarUDFImpl::return_field_from_args`],
175    /// its [`ScalarUDFImpl::return_type`] should raise an error.
176    ///
177    /// See [`ScalarUDFImpl::return_type`] for more details.
178    pub fn return_type(&self, arg_types: &[DataType]) -> Result<DataType> {
179        self.inner.return_type(arg_types)
180    }
181
182    /// Return the datatype this function returns given the input argument types.
183    ///
184    /// See [`ScalarUDFImpl::return_field_from_args`] for more details.
185    pub fn return_field_from_args(&self, args: ReturnFieldArgs) -> Result<FieldRef> {
186        self.inner.return_field_from_args(args)
187    }
188
189    /// Do the function rewrite
190    ///
191    /// See [`ScalarUDFImpl::simplify`] for more details.
192    pub fn simplify(
193        &self,
194        args: Vec<Expr>,
195        info: &dyn SimplifyInfo,
196    ) -> Result<ExprSimplifyResult> {
197        self.inner.simplify(args, info)
198    }
199
200    #[allow(deprecated)]
201    pub fn is_nullable(&self, args: &[Expr], schema: &dyn ExprSchema) -> bool {
202        self.inner.is_nullable(args, schema)
203    }
204
205    /// Invoke the function on `args`, returning the appropriate result.
206    ///
207    /// See [`ScalarUDFImpl::invoke_with_args`] for details.
208    pub fn invoke_with_args(&self, args: ScalarFunctionArgs) -> Result<ColumnarValue> {
209        self.inner.invoke_with_args(args)
210    }
211
212    /// Get the circuits of inner implementation
213    pub fn short_circuits(&self) -> bool {
214        self.inner.short_circuits()
215    }
216
217    /// Computes the output interval for a [`ScalarUDF`], given the input
218    /// intervals.
219    ///
220    /// # Parameters
221    ///
222    /// * `inputs` are the intervals for the inputs (children) of this function.
223    ///
224    /// # Example
225    ///
226    /// If the function is `ABS(a)`, and the input interval is `a: [-3, 2]`,
227    /// then the output interval would be `[0, 3]`.
228    pub fn evaluate_bounds(&self, inputs: &[&Interval]) -> Result<Interval> {
229        self.inner.evaluate_bounds(inputs)
230    }
231
232    /// Updates bounds for child expressions, given a known interval for this
233    /// function. This is used to propagate constraints down through an expression
234    /// tree.
235    ///
236    /// # Parameters
237    ///
238    /// * `interval` is the currently known interval for this function.
239    /// * `inputs` are the current intervals for the inputs (children) of this function.
240    ///
241    /// # Returns
242    ///
243    /// A `Vec` of new intervals for the children, in order.
244    ///
245    /// If constraint propagation reveals an infeasibility for any child, returns
246    /// [`None`]. If none of the children intervals change as a result of
247    /// propagation, may return an empty vector instead of cloning `children`.
248    /// This is the default (and conservative) return value.
249    ///
250    /// # Example
251    ///
252    /// If the function is `ABS(a)`, the current `interval` is `[4, 5]` and the
253    /// input `a` is given as `[-7, 3]`, then propagation would return `[-5, 3]`.
254    pub fn propagate_constraints(
255        &self,
256        interval: &Interval,
257        inputs: &[&Interval],
258    ) -> Result<Option<Vec<Interval>>> {
259        self.inner.propagate_constraints(interval, inputs)
260    }
261
262    /// Calculates the [`SortProperties`] of this function based on its
263    /// children's properties.
264    pub fn output_ordering(&self, inputs: &[ExprProperties]) -> Result<SortProperties> {
265        self.inner.output_ordering(inputs)
266    }
267
268    pub fn preserves_lex_ordering(&self, inputs: &[ExprProperties]) -> Result<bool> {
269        self.inner.preserves_lex_ordering(inputs)
270    }
271
272    /// See [`ScalarUDFImpl::coerce_types`] for more details.
273    pub fn coerce_types(&self, arg_types: &[DataType]) -> Result<Vec<DataType>> {
274        self.inner.coerce_types(arg_types)
275    }
276
277    /// Returns the documentation for this Scalar UDF.
278    ///
279    /// Documentation can be accessed programmatically as well as
280    /// generating publicly facing documentation.
281    pub fn documentation(&self) -> Option<&Documentation> {
282        self.inner.documentation()
283    }
284
285    /// Return true if this function is an async function
286    pub fn as_async(&self) -> Option<&AsyncScalarUDF> {
287        self.inner().as_any().downcast_ref::<AsyncScalarUDF>()
288    }
289}
290
291impl<F> From<F> for ScalarUDF
292where
293    F: ScalarUDFImpl + 'static,
294{
295    fn from(fun: F) -> Self {
296        Self::new_from_impl(fun)
297    }
298}
299
/// Arguments passed to [`ScalarUDFImpl::invoke_with_args`] when invoking a
/// scalar function.
#[derive(Debug, Clone)]
pub struct ScalarFunctionArgs {
    /// The evaluated arguments to the function
    pub args: Vec<ColumnarValue>,
    /// Field associated with each arg, if it exists
    pub arg_fields: Vec<FieldRef>,
    /// The number of rows in the record batch being evaluated
    pub number_rows: usize,
    /// The return field of the scalar function (as returned from `return_type`
    /// or `return_field_from_args`) when creating the physical expression
    /// from the logical expression
    pub return_field: FieldRef,
}
315
316impl ScalarFunctionArgs {
317    /// The return type of the function. See [`Self::return_field`] for more
318    /// details.
319    pub fn return_type(&self) -> &DataType {
320        self.return_field.data_type()
321    }
322}
323
/// Information about arguments passed to the function
///
/// This structure contains metadata about how the function was called
/// such as the type of the arguments, any scalar arguments and if the
/// arguments can (ever) be null
///
/// See [`ScalarUDFImpl::return_field_from_args`] for more information
#[derive(Debug)]
pub struct ReturnFieldArgs<'a> {
    /// The fields of the arguments to the function; each field carries the
    /// argument's data type and nullability
    pub arg_fields: &'a [FieldRef],
    /// Is argument `i` to the function a scalar (constant)?
    ///
    /// If the argument `i` is not a scalar, it will be None
    ///
    /// For example, if a function is called like `my_function(column_a, 5)`
    /// this field will be `[None, Some(ScalarValue::Int32(Some(5)))]`
    pub scalar_arguments: &'a [Option<&'a ScalarValue>],
}
343
344/// Trait for implementing user defined scalar functions.
345///
346/// This trait exposes the full API for implementing user defined functions and
347/// can be used to implement any function.
348///
349/// See [`advanced_udf.rs`] for a full example with complete implementation and
350/// [`ScalarUDF`] for other available options.
351///
352/// [`advanced_udf.rs`]: https://github.com/apache/datafusion/blob/main/datafusion-examples/examples/advanced_udf.rs
353///
354/// # Basic Example
355/// ```
356/// # use std::any::Any;
357/// # use std::sync::LazyLock;
358/// # use arrow::datatypes::DataType;
359/// # use datafusion_common::{DataFusionError, plan_err, Result};
360/// # use datafusion_expr::{col, ColumnarValue, Documentation, ScalarFunctionArgs, Signature, Volatility};
361/// # use datafusion_expr::{ScalarUDFImpl, ScalarUDF};
362/// # use datafusion_expr::scalar_doc_sections::DOC_SECTION_MATH;
363/// /// This struct for a simple UDF that adds one to an int32
364/// #[derive(Debug)]
365/// struct AddOne {
366///   signature: Signature,
367/// }
368///
369/// impl AddOne {
370///   fn new() -> Self {
371///     Self {
372///       signature: Signature::uniform(1, vec![DataType::Int32], Volatility::Immutable),
373///      }
374///   }
375/// }
376///
377/// static DOCUMENTATION: LazyLock<Documentation> = LazyLock::new(|| {
378///         Documentation::builder(DOC_SECTION_MATH, "Add one to an int32", "add_one(2)")
379///             .with_argument("arg1", "The int32 number to add one to")
380///             .build()
381///     });
382///
383/// fn get_doc() -> &'static Documentation {
384///     &DOCUMENTATION
385/// }
386///
387/// /// Implement the ScalarUDFImpl trait for AddOne
388/// impl ScalarUDFImpl for AddOne {
389///    fn as_any(&self) -> &dyn Any { self }
390///    fn name(&self) -> &str { "add_one" }
391///    fn signature(&self) -> &Signature { &self.signature }
392///    fn return_type(&self, args: &[DataType]) -> Result<DataType> {
393///      if !matches!(args.get(0), Some(&DataType::Int32)) {
394///        return plan_err!("add_one only accepts Int32 arguments");
395///      }
396///      Ok(DataType::Int32)
397///    }
398///    // The actual implementation would add one to the argument
399///    fn invoke_with_args(&self, args: ScalarFunctionArgs) -> Result<ColumnarValue> {
400///         unimplemented!()
401///    }
402///    fn documentation(&self) -> Option<&Documentation> {
403///         Some(get_doc())
404///     }
405/// }
406///
407/// // Create a new ScalarUDF from the implementation
408/// let add_one = ScalarUDF::from(AddOne::new());
409///
410/// // Call the function `add_one(col)`
411/// let expr = add_one.call(vec![col("a")]);
412/// ```
413pub trait ScalarUDFImpl: Debug + Send + Sync {
414    // Note: When adding any methods (with default implementations), remember to add them also
415    // into the AliasedScalarUDFImpl below!
416
417    /// Returns this object as an [`Any`] trait object
418    fn as_any(&self) -> &dyn Any;
419
420    /// Returns this function's name
421    fn name(&self) -> &str;
422
423    /// Returns any aliases (alternate names) for this function.
424    ///
425    /// Aliases can be used to invoke the same function using different names.
426    /// For example in some databases `now()` and `current_timestamp()` are
427    /// aliases for the same function. This behavior can be obtained by
428    /// returning `current_timestamp` as an alias for the `now` function.
429    ///
430    /// Note: `aliases` should only include names other than [`Self::name`].
431    /// Defaults to `[]` (no aliases)
432    fn aliases(&self) -> &[String] {
433        &[]
434    }
435
436    /// Returns the user-defined display name of function, given the arguments
437    ///
438    /// This can be used to customize the output column name generated by this
439    /// function.
440    ///
441    /// Defaults to `name(args[0], args[1], ...)`
442    fn display_name(&self, args: &[Expr]) -> Result<String> {
443        let names: Vec<String> = args.iter().map(ToString::to_string).collect();
444        // TODO: join with ", " to standardize the formatting of Vec<Expr>, <https://github.com/apache/datafusion/issues/10364>
445        Ok(format!("{}({})", self.name(), names.join(",")))
446    }
447
448    /// Returns the name of the column this expression would create
449    ///
450    /// See [`Expr::schema_name`] for details
451    fn schema_name(&self, args: &[Expr]) -> Result<String> {
452        Ok(format!(
453            "{}({})",
454            self.name(),
455            schema_name_from_exprs_comma_separated_without_space(args)?
456        ))
457    }
458
459    /// Returns the function's [`Signature`] for information about what input
460    /// types are accepted and the function's Volatility.
461    fn signature(&self) -> &Signature;
462
463    /// What [`DataType`] will be returned by this function, given the types of
464    /// the arguments.
465    ///
466    /// # Notes
467    ///
468    /// If you provide an implementation for [`Self::return_field_from_args`],
469    /// DataFusion will not call `return_type` (this function). In such cases
470    /// is recommended to return [`DataFusionError::Internal`].
471    ///
472    /// [`DataFusionError::Internal`]: datafusion_common::DataFusionError::Internal
473    fn return_type(&self, arg_types: &[DataType]) -> Result<DataType>;
474
475    /// What type will be returned by this function, given the arguments?
476    ///
477    /// By default, this function calls [`Self::return_type`] with the
478    /// types of each argument.
479    ///
480    /// # Notes
481    ///
482    /// For the majority of UDFs, implementing [`Self::return_type`] is sufficient,
483    /// as the result type is typically a deterministic function of the input types
484    /// (e.g., `sqrt(f32)` consistently yields `f32`). Implementing this method directly
485    /// is generally unnecessary unless the return type depends on runtime values.
486    ///
487    /// This function can be used for more advanced cases such as:
488    ///
489    /// 1. specifying nullability
490    /// 2. return types based on the **values** of the arguments (rather than
491    ///    their **types**.
492    ///
493    /// # Example creating `Field`
494    ///
495    /// Note the name of the [`Field`] is ignored, except for structured types such as
496    /// `DataType::Struct`.
497    ///
498    /// ```rust
499    /// # use std::sync::Arc;
500    /// # use arrow::datatypes::{DataType, Field, FieldRef};
501    /// # use datafusion_common::Result;
502    /// # use datafusion_expr::ReturnFieldArgs;
503    /// # struct Example{}
504    /// # impl Example {
505    /// fn return_field_from_args(&self, args: ReturnFieldArgs) -> Result<FieldRef> {
506    ///   // report output is only nullable if any one of the arguments are nullable
507    ///   let nullable = args.arg_fields.iter().any(|f| f.is_nullable());
508    ///   let field = Arc::new(Field::new("ignored_name", DataType::Int32, true));
509    ///   Ok(field)
510    /// }
511    /// # }
512    /// ```
513    ///
514    /// # Output Type based on Values
515    ///
516    /// For example, the following two function calls get the same argument
517    /// types (something and a `Utf8` string) but return different types based
518    /// on the value of the second argument:
519    ///
520    /// * `arrow_cast(x, 'Int16')` --> `Int16`
521    /// * `arrow_cast(x, 'Float32')` --> `Float32`
522    ///
523    /// # Requirements
524    ///
525    /// This function **must** consistently return the same type for the same
526    /// logical input even if the input is simplified (e.g. it must return the same
527    /// value for `('foo' | 'bar')` as it does for ('foobar').
528    fn return_field_from_args(&self, args: ReturnFieldArgs) -> Result<FieldRef> {
529        let data_types = args
530            .arg_fields
531            .iter()
532            .map(|f| f.data_type())
533            .cloned()
534            .collect::<Vec<_>>();
535        let return_type = self.return_type(&data_types)?;
536        Ok(Arc::new(Field::new(self.name(), return_type, true)))
537    }
538
539    #[deprecated(
540        since = "45.0.0",
541        note = "Use `return_field_from_args` instead. if you use `is_nullable` that returns non-nullable with `return_type`, you would need to switch to `return_field_from_args`, you might have error"
542    )]
543    fn is_nullable(&self, _args: &[Expr], _schema: &dyn ExprSchema) -> bool {
544        true
545    }
546
547    /// Invoke the function returning the appropriate result.
548    ///
549    /// # Performance
550    ///
551    /// For the best performance, the implementations should handle the common case
552    /// when one or more of their arguments are constant values (aka
553    /// [`ColumnarValue::Scalar`]).
554    ///
555    /// [`ColumnarValue::values_to_arrays`] can be used to convert the arguments
556    /// to arrays, which will likely be simpler code, but be slower.
557    fn invoke_with_args(&self, args: ScalarFunctionArgs) -> Result<ColumnarValue>;
558
559    /// Optionally apply per-UDF simplification / rewrite rules.
560    ///
561    /// This can be used to apply function specific simplification rules during
562    /// optimization (e.g. `arrow_cast` --> `Expr::Cast`). The default
563    /// implementation does nothing.
564    ///
565    /// Note that DataFusion handles simplifying arguments and  "constant
566    /// folding" (replacing a function call with constant arguments such as
567    /// `my_add(1,2) --> 3` ). Thus, there is no need to implement such
568    /// optimizations manually for specific UDFs.
569    ///
570    /// # Arguments
571    /// * `args`: The arguments of the function
572    /// * `info`: The necessary information for simplification
573    ///
574    /// # Returns
575    /// [`ExprSimplifyResult`] indicating the result of the simplification NOTE
576    /// if the function cannot be simplified, the arguments *MUST* be returned
577    /// unmodified
578    fn simplify(
579        &self,
580        args: Vec<Expr>,
581        _info: &dyn SimplifyInfo,
582    ) -> Result<ExprSimplifyResult> {
583        Ok(ExprSimplifyResult::Original(args))
584    }
585
586    /// Returns true if some of this `exprs` subexpressions may not be evaluated
587    /// and thus any side effects (like divide by zero) may not be encountered.
588    ///
589    /// Setting this to true prevents certain optimizations such as common
590    /// subexpression elimination
591    fn short_circuits(&self) -> bool {
592        false
593    }
594
595    /// Computes the output [`Interval`] for a [`ScalarUDFImpl`], given the input
596    /// intervals.
597    ///
598    /// # Parameters
599    ///
600    /// * `children` are the intervals for the children (inputs) of this function.
601    ///
602    /// # Example
603    ///
604    /// If the function is `ABS(a)`, and the input interval is `a: [-3, 2]`,
605    /// then the output interval would be `[0, 3]`.
606    fn evaluate_bounds(&self, _input: &[&Interval]) -> Result<Interval> {
607        // We cannot assume the input datatype is the same of output type.
608        Interval::make_unbounded(&DataType::Null)
609    }
610
611    /// Updates bounds for child expressions, given a known [`Interval`]s for this
612    /// function.
613    ///
614    /// This function is used to propagate constraints down through an
615    /// expression tree.
616    ///
617    /// # Parameters
618    ///
619    /// * `interval` is the currently known interval for this function.
620    /// * `inputs` are the current intervals for the inputs (children) of this function.
621    ///
622    /// # Returns
623    ///
624    /// A `Vec` of new intervals for the children, in order.
625    ///
626    /// If constraint propagation reveals an infeasibility for any child, returns
627    /// [`None`]. If none of the children intervals change as a result of
628    /// propagation, may return an empty vector instead of cloning `children`.
629    /// This is the default (and conservative) return value.
630    ///
631    /// # Example
632    ///
633    /// If the function is `ABS(a)`, the current `interval` is `[4, 5]` and the
634    /// input `a` is given as `[-7, 3]`, then propagation would return `[-5, 3]`.
635    fn propagate_constraints(
636        &self,
637        _interval: &Interval,
638        _inputs: &[&Interval],
639    ) -> Result<Option<Vec<Interval>>> {
640        Ok(Some(vec![]))
641    }
642
643    /// Calculates the [`SortProperties`] of this function based on its children's properties.
644    fn output_ordering(&self, inputs: &[ExprProperties]) -> Result<SortProperties> {
645        if !self.preserves_lex_ordering(inputs)? {
646            return Ok(SortProperties::Unordered);
647        }
648
649        let Some(first_order) = inputs.first().map(|p| &p.sort_properties) else {
650            return Ok(SortProperties::Singleton);
651        };
652
653        if inputs
654            .iter()
655            .skip(1)
656            .all(|input| &input.sort_properties == first_order)
657        {
658            Ok(*first_order)
659        } else {
660            Ok(SortProperties::Unordered)
661        }
662    }
663
664    /// Returns true if the function preserves lexicographical ordering based on
665    /// the input ordering.
666    ///
667    /// For example, `concat(a || b)` preserves lexicographical ordering, but `abs(a)` does not.
668    fn preserves_lex_ordering(&self, _inputs: &[ExprProperties]) -> Result<bool> {
669        Ok(false)
670    }
671
672    /// Coerce arguments of a function call to types that the function can evaluate.
673    ///
674    /// This function is only called if [`ScalarUDFImpl::signature`] returns
675    /// [`crate::TypeSignature::UserDefined`]. Most UDFs should return one of
676    /// the other variants of [`TypeSignature`] which handle common cases.
677    ///
678    /// See the [type coercion module](crate::type_coercion)
679    /// documentation for more details on type coercion
680    ///
681    /// [`TypeSignature`]: crate::TypeSignature
682    ///
683    /// For example, if your function requires a floating point arguments, but the user calls
684    /// it like `my_func(1::int)` (i.e. with `1` as an integer), coerce_types can return `[DataType::Float64]`
685    /// to ensure the argument is converted to `1::double`
686    ///
687    /// # Parameters
688    /// * `arg_types`: The argument types of the arguments  this function with
689    ///
690    /// # Return value
691    /// A Vec the same length as `arg_types`. DataFusion will `CAST` the function call
692    /// arguments to these specific types.
693    fn coerce_types(&self, _arg_types: &[DataType]) -> Result<Vec<DataType>> {
694        not_impl_err!("Function {} does not implement coerce_types", self.name())
695    }
696
697    /// Return true if this scalar UDF is equal to the other.
698    ///
699    /// Allows customizing the equality of scalar UDFs.
700    /// *Must* be implemented explicitly if the UDF type has internal state.
701    /// Must be consistent with [`Self::hash_value`] and follow the same rules as [`Eq`]:
702    ///
703    /// - reflexive: `a.equals(a)`;
704    /// - symmetric: `a.equals(b)` implies `b.equals(a)`;
705    /// - transitive: `a.equals(b)` and `b.equals(c)` implies `a.equals(c)`.
706    ///
707    /// By default, compares type, [`Self::name`], [`Self::aliases`] and [`Self::signature`].
708    fn equals(&self, other: &dyn ScalarUDFImpl) -> bool {
709        self.as_any().type_id() == other.as_any().type_id()
710            && self.name() == other.name()
711            && self.aliases() == other.aliases()
712            && self.signature() == other.signature()
713    }
714
715    /// Returns a hash value for this scalar UDF.
716    ///
717    /// Allows customizing the hash code of scalar UDFs.
718    /// *Must* be implemented explicitly whenever [`Self::equals`] is implemented.
719    ///
720    /// Similarly to [`Hash`] and [`Eq`], if [`Self::equals`] returns true for two UDFs,
721    /// their `hash_value`s must be the same.
722    ///
723    /// By default, it is consistent with default implementation of [`Self::equals`].
724    fn hash_value(&self) -> u64 {
725        let hasher = &mut DefaultHasher::new();
726        self.as_any().type_id().hash(hasher);
727        self.name().hash(hasher);
728        self.aliases().hash(hasher);
729        self.signature().hash(hasher);
730        hasher.finish()
731    }
732
733    /// Returns the documentation for this Scalar UDF.
734    ///
735    /// Documentation can be accessed programmatically as well as generating
736    /// publicly facing documentation.
737    fn documentation(&self) -> Option<&Documentation> {
738        None
739    }
740}
741
/// ScalarUDF that adds an alias to the underlying function. It is better to
/// implement [`ScalarUDFImpl`], which supports aliases, directly if possible.
#[derive(Debug)]
struct AliasedScalarUDFImpl {
    // The wrapped function that the trait methods are forwarded to.
    inner: Arc<dyn ScalarUDFImpl>,
    // Aliases of `inner` followed by the additionally supplied ones.
    aliases: Vec<String>,
}
749
750impl AliasedScalarUDFImpl {
751    pub fn new(
752        inner: Arc<dyn ScalarUDFImpl>,
753        new_aliases: impl IntoIterator<Item = &'static str>,
754    ) -> Self {
755        let mut aliases = inner.aliases().to_vec();
756        aliases.extend(new_aliases.into_iter().map(|s| s.to_string()));
757        Self { inner, aliases }
758    }
759}
760
761impl ScalarUDFImpl for AliasedScalarUDFImpl {
762    fn as_any(&self) -> &dyn Any {
763        self
764    }
765
766    fn name(&self) -> &str {
767        self.inner.name()
768    }
769
770    fn display_name(&self, args: &[Expr]) -> Result<String> {
771        self.inner.display_name(args)
772    }
773
774    fn schema_name(&self, args: &[Expr]) -> Result<String> {
775        self.inner.schema_name(args)
776    }
777
778    fn signature(&self) -> &Signature {
779        self.inner.signature()
780    }
781
782    fn return_type(&self, arg_types: &[DataType]) -> Result<DataType> {
783        self.inner.return_type(arg_types)
784    }
785
786    fn return_field_from_args(&self, args: ReturnFieldArgs) -> Result<FieldRef> {
787        self.inner.return_field_from_args(args)
788    }
789
790    fn invoke_with_args(&self, args: ScalarFunctionArgs) -> Result<ColumnarValue> {
791        self.inner.invoke_with_args(args)
792    }
793
794    fn aliases(&self) -> &[String] {
795        &self.aliases
796    }
797
798    fn simplify(
799        &self,
800        args: Vec<Expr>,
801        info: &dyn SimplifyInfo,
802    ) -> Result<ExprSimplifyResult> {
803        self.inner.simplify(args, info)
804    }
805
806    fn short_circuits(&self) -> bool {
807        self.inner.short_circuits()
808    }
809
810    fn evaluate_bounds(&self, input: &[&Interval]) -> Result<Interval> {
811        self.inner.evaluate_bounds(input)
812    }
813
814    fn propagate_constraints(
815        &self,
816        interval: &Interval,
817        inputs: &[&Interval],
818    ) -> Result<Option<Vec<Interval>>> {
819        self.inner.propagate_constraints(interval, inputs)
820    }
821
822    fn output_ordering(&self, inputs: &[ExprProperties]) -> Result<SortProperties> {
823        self.inner.output_ordering(inputs)
824    }
825
826    fn preserves_lex_ordering(&self, inputs: &[ExprProperties]) -> Result<bool> {
827        self.inner.preserves_lex_ordering(inputs)
828    }
829
830    fn coerce_types(&self, arg_types: &[DataType]) -> Result<Vec<DataType>> {
831        self.inner.coerce_types(arg_types)
832    }
833
834    fn equals(&self, other: &dyn ScalarUDFImpl) -> bool {
835        if let Some(other) = other.as_any().downcast_ref::<AliasedScalarUDFImpl>() {
836            self.inner.equals(other.inner.as_ref()) && self.aliases == other.aliases
837        } else {
838            false
839        }
840    }
841
842    fn hash_value(&self) -> u64 {
843        let hasher = &mut DefaultHasher::new();
844        std::any::type_name::<Self>().hash(hasher);
845        self.inner.hash_value().hash(hasher);
846        self.aliases.hash(hasher);
847        hasher.finish()
848    }
849
850    fn documentation(&self) -> Option<&Documentation> {
851        self.inner.documentation()
852    }
853}
854
855// Scalar UDF doc sections for use in public documentation
856pub mod scalar_doc_sections {
857    use crate::DocSection;
858
859    pub fn doc_sections() -> Vec<DocSection> {
860        vec![
861            DOC_SECTION_MATH,
862            DOC_SECTION_CONDITIONAL,
863            DOC_SECTION_STRING,
864            DOC_SECTION_BINARY_STRING,
865            DOC_SECTION_REGEX,
866            DOC_SECTION_DATETIME,
867            DOC_SECTION_ARRAY,
868            DOC_SECTION_STRUCT,
869            DOC_SECTION_MAP,
870            DOC_SECTION_HASHING,
871            DOC_SECTION_UNION,
872            DOC_SECTION_OTHER,
873        ]
874    }
875
876    pub const fn doc_sections_const() -> &'static [DocSection] {
877        &[
878            DOC_SECTION_MATH,
879            DOC_SECTION_CONDITIONAL,
880            DOC_SECTION_STRING,
881            DOC_SECTION_BINARY_STRING,
882            DOC_SECTION_REGEX,
883            DOC_SECTION_DATETIME,
884            DOC_SECTION_ARRAY,
885            DOC_SECTION_STRUCT,
886            DOC_SECTION_MAP,
887            DOC_SECTION_HASHING,
888            DOC_SECTION_UNION,
889            DOC_SECTION_OTHER,
890        ]
891    }
892
893    pub const DOC_SECTION_MATH: DocSection = DocSection {
894        include: true,
895        label: "Math Functions",
896        description: None,
897    };
898
899    pub const DOC_SECTION_CONDITIONAL: DocSection = DocSection {
900        include: true,
901        label: "Conditional Functions",
902        description: None,
903    };
904
905    pub const DOC_SECTION_STRING: DocSection = DocSection {
906        include: true,
907        label: "String Functions",
908        description: None,
909    };
910
911    pub const DOC_SECTION_BINARY_STRING: DocSection = DocSection {
912        include: true,
913        label: "Binary String Functions",
914        description: None,
915    };
916
917    pub const DOC_SECTION_REGEX: DocSection = DocSection {
918        include: true,
919        label: "Regular Expression Functions",
920        description: Some(
921            r#"Apache DataFusion uses a [PCRE-like](https://en.wikibooks.org/wiki/Regular_Expressions/Perl-Compatible_Regular_Expressions)
922regular expression [syntax](https://docs.rs/regex/latest/regex/#syntax)
923(minus support for several features including look-around and backreferences).
924The following regular expression functions are supported:"#,
925        ),
926    };
927
928    pub const DOC_SECTION_DATETIME: DocSection = DocSection {
929        include: true,
930        label: "Time and Date Functions",
931        description: None,
932    };
933
934    pub const DOC_SECTION_ARRAY: DocSection = DocSection {
935        include: true,
936        label: "Array Functions",
937        description: None,
938    };
939
940    pub const DOC_SECTION_STRUCT: DocSection = DocSection {
941        include: true,
942        label: "Struct Functions",
943        description: None,
944    };
945
946    pub const DOC_SECTION_MAP: DocSection = DocSection {
947        include: true,
948        label: "Map Functions",
949        description: None,
950    };
951
952    pub const DOC_SECTION_HASHING: DocSection = DocSection {
953        include: true,
954        label: "Hashing Functions",
955        description: None,
956    };
957
958    pub const DOC_SECTION_OTHER: DocSection = DocSection {
959        include: true,
960        label: "Other Functions",
961        description: None,
962    };
963
964    pub const DOC_SECTION_UNION: DocSection = DocSection {
965        include: true,
966        label: "Union Functions",
967        description: Some("Functions to work with the union data type, also know as tagged unions, variant types, enums or sum types. Note: Not related to the SQL UNION operator"),
968    };
969}
datafusion_expr/udf.rs

datafusion_expr/
udf.rs