// Licensed to the Apache Software Foundation (ASF) under one
// or more contributor license agreements.  See the NOTICE file
// distributed with this work for additional information
// regarding copyright ownership.  The ASF licenses this file
// to you under the Apache License, Version 2.0 (the
// "License"); you may not use this file except in compliance
// with the License.  You may obtain a copy of the License at
//
//   http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing,
// software distributed under the License is distributed on an
// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
// KIND, either express or implied.  See the License for the
// specific language governing permissions and limitations
// under the License.

#![doc(
    html_logo_url = "https://raw.githubusercontent.com/apache/datafusion/19fe44cf2f30cbdd63d4a4f52c74055163c6cc38/docs/logos/standalone_logo/logo_original.svg",
    html_favicon_url = "https://raw.githubusercontent.com/apache/datafusion/19fe44cf2f30cbdd63d4a4f52c74055163c6cc38/docs/logos/standalone_logo/logo_original.svg"
)]
#![cfg_attr(docsrs, feature(doc_auto_cfg))]
// Make sure fast / cheap clones on Arc are explicit:
// https://github.com/apache/datafusion/issues/11143
#![cfg_attr(not(test), deny(clippy::clone_on_ref_ptr))]
#![warn(missing_docs, clippy::needless_borrow)]

//! [DataFusion] is an extensible query engine written in Rust that
//! uses [Apache Arrow] as its in-memory format. DataFusion's target users are
//! developers building fast and feature-rich database and analytic systems,
//! customized to particular workloads. See [use cases] for examples.
//!
//! "Out of the box," DataFusion offers [SQL] and [`DataFrame`] APIs,
//! excellent [performance], built-in support for CSV, Parquet, JSON, and Avro,
//! extensive customization, and a great community.
//! [Python Bindings] are also available.
//!
//! DataFusion features a full query planner, a columnar, streaming, multi-threaded,
//! vectorized execution engine, and partitioned data sources. You can
//! customize DataFusion at almost all points, including additional data sources,
//! query languages, functions, custom operators, and more.
//! See the [Architecture] section below for more details.
//!
//! [DataFusion]: https://datafusion.apache.org/
//! [Apache Arrow]: https://arrow.apache.org
//! [use cases]: https://datafusion.apache.org/user-guide/introduction.html#use-cases
//! [SQL]: https://datafusion.apache.org/user-guide/sql/index.html
//! [`DataFrame`]: dataframe::DataFrame
//! [performance]: https://benchmark.clickhouse.com/
//! [Python Bindings]: https://github.com/apache/datafusion-python
//! [Architecture]: #architecture
//!
//! # Examples
//!
//! The main entry point for interacting with DataFusion is the
//! [`SessionContext`]. [`Expr`]s represent expressions such as `a + b`.
//!
//! [`SessionContext`]: execution::context::SessionContext
//!
//! ## DataFrame
//!
//! To execute a query against data stored
//! in a CSV file using a [`DataFrame`]:
//!
//! ```rust
//! # use datafusion::prelude::*;
//! # use datafusion::error::Result;
//! # use datafusion::functions_aggregate::expr_fn::min;
//! # use datafusion::arrow::array::RecordBatch;
//!
//! # #[tokio::main]
//! # async fn main() -> Result<()> {
//! let ctx = SessionContext::new();
//!
//! // create the dataframe
//! let df = ctx.read_csv("tests/data/example.csv", CsvReadOptions::new()).await?;
//!
//! // create a plan
//! let df = df.filter(col("a").lt_eq(col("b")))?
//!            .aggregate(vec![col("a")], vec![min(col("b"))])?
//!            .limit(0, Some(100))?;
//!
//! // execute the plan
//! let results: Vec<RecordBatch> = df.collect().await?;
//!
//! // format the results
//! let pretty_results = arrow::util::pretty::pretty_format_batches(&results)?
//!    .to_string();
//!
//! let expected = vec![
//!     "+---+----------------+",
//!     "| a | min(?table?.b) |",
//!     "+---+----------------+",
//!     "| 1 | 2              |",
//!     "+---+----------------+"
//! ];
//!
//! assert_eq!(pretty_results.trim().lines().collect::<Vec<_>>(), expected);
//! # Ok(())
//! # }
//! ```
//!
//! ## SQL
//!
//! To execute a query against a CSV file using [SQL]:
//!
//! ```
//! # use datafusion::prelude::*;
//! # use datafusion::error::Result;
//! # use datafusion::arrow::array::RecordBatch;
//!
//! # #[tokio::main]
//! # async fn main() -> Result<()> {
//! let ctx = SessionContext::new();
//!
//! ctx.register_csv("example", "tests/data/example.csv", CsvReadOptions::new()).await?;
//!
//! // create a plan
//! let df = ctx.sql("SELECT a, MIN(b) FROM example WHERE a <= b GROUP BY a LIMIT 100").await?;
//!
//! // execute the plan
//! let results: Vec<RecordBatch> = df.collect().await?;
//!
//! // format the results
//! let pretty_results = arrow::util::pretty::pretty_format_batches(&results)?
//!   .to_string();
//!
//! let expected = vec![
//!     "+---+----------------+",
//!     "| a | min(example.b) |",
//!     "+---+----------------+",
//!     "| 1 | 2              |",
//!     "+---+----------------+"
//! ];
//!
//! assert_eq!(pretty_results.trim().lines().collect::<Vec<_>>(), expected);
//! # Ok(())
//! # }
//! ```
//!
//! ## More Examples
//!
//! There are many additional annotated examples of using DataFusion in the [datafusion-examples] directory.
//!
//! [datafusion-examples]: https://github.com/apache/datafusion/tree/main/datafusion-examples
//!
//! # Architecture
//!
//! <!-- NOTE: The goal of this section is to provide a high level
//! overview of how DataFusion is organized and then link to other
//! sections of the docs with more details -->
//!
//! You can find a formal description of DataFusion's architecture in our
//! [SIGMOD 2024 Paper].
//!
//! [SIGMOD 2024 Paper]: https://dl.acm.org/doi/10.1145/3626246.3653368
//!
//! ## Design Goals
//! DataFusion's architecture goals are:
//!
//! 1. Work “out of the box”: Provide a very fast, world class query engine with
//!    minimal setup or required configuration.
//!
//! 2. Customizable everything: All behavior should be customizable by
//!    implementing traits.
//!
//! 3. Architecturally boring 🥱: Follow industry best practices rather than
//!    trying cutting-edge, but unproven, techniques.
//!
//! With these principles, users start with a basic, high-performance engine
//! and specialize it over time to suit their needs and available engineering
//! capacity.
//!
//! ## Overview Presentations
//!
//! The following presentations offer high level overviews of the
//! different components and how they interact together.
//!
//! - [Apr 2023]: The Apache DataFusion Architecture talks
//!   - _Query Engine_: [recording](https://youtu.be/NVKujPxwSBA) and [slides](https://docs.google.com/presentation/d/1D3GDVas-8y0sA4c8EOgdCvEjVND4s2E7I6zfs67Y4j8/edit#slide=id.p)
//!   - _Logical Plan and Expressions_: [recording](https://youtu.be/EzZTLiSJnhY) and [slides](https://docs.google.com/presentation/d/1ypylM3-w60kVDW7Q6S99AHzvlBgciTdjsAfqNP85K30)
//!   - _Physical Plan and Execution_: [recording](https://youtu.be/2jkWU3_w6z0) and [slides](https://docs.google.com/presentation/d/1cA2WQJ2qg6tx6y4Wf8FH2WVSm9JQ5UgmBWATHdik0hg)
//! - [July 2022]: DataFusion and Arrow: Supercharge Your Data Analytical Tool with a Rusty Query Engine: [recording](https://www.youtube.com/watch?v=Rii1VTn3seQ) and [slides](https://docs.google.com/presentation/d/1q1bPibvu64k2b7LPi7Yyb0k3gA1BiUYiUbEklqW1Ckc/view#slide=id.g11054eeab4c_0_1165)
//! - [March 2021]: The DataFusion architecture is described in _Query Engine Design and the Rust-Based DataFusion in Apache Arrow_: [recording](https://www.youtube.com/watch?v=K6eCAVEk4kU) (DataFusion content starts [~ 15 minutes in](https://www.youtube.com/watch?v=K6eCAVEk4kU&t=875s)) and [slides](https://www.slideshare.net/influxdata/influxdb-iox-tech-talks-query-engine-design-and-the-rustbased-datafusion-in-apache-arrow-244161934)
//! - [February 2021]: How DataFusion is used within the Ballista Project is described in _Ballista: Distributed Compute with Rust and Apache Arrow_: [recording](https://www.youtube.com/watch?v=ZZHQaOap9pQ)
//!
//! ## Customization and Extension
//!
//! DataFusion is designed to be highly extensible, so you can
//! start with a working, full featured engine, and then
//! specialize any behavior for your use case. For example,
//! some projects may add custom [`ExecutionPlan`] operators, or create their own
//! query language that directly creates [`LogicalPlan`] rather than using the
//! built in SQL planner, [`SqlToRel`].
//!
//! In order to achieve this, DataFusion supports extension at many points:
//!
//! * read from any datasource ([`TableProvider`])
//! * define your own catalogs, schemas, and table lists ([`catalog`] and [`CatalogProvider`])
//! * build your own query language or plans ([`LogicalPlanBuilder`])
//! * declare and use user-defined functions ([`ScalarUDF`], [`AggregateUDF`], and [`WindowUDF`])
//! * add custom plan rewrite passes ([`AnalyzerRule`], [`OptimizerRule`] and [`PhysicalOptimizerRule`])
//! * extend the planner to use user-defined logical and physical nodes ([`QueryPlanner`])
//!
//! You can find examples of each of them in the [datafusion-examples] directory;
//! a minimal sketch of declaring a scalar UDF follows the links below.
//!
//! [`TableProvider`]: crate::datasource::TableProvider
//! [`CatalogProvider`]: crate::catalog::CatalogProvider
//! [`LogicalPlanBuilder`]: datafusion_expr::logical_plan::builder::LogicalPlanBuilder
//! [`ScalarUDF`]: crate::logical_expr::ScalarUDF
//! [`AggregateUDF`]: crate::logical_expr::AggregateUDF
//! [`WindowUDF`]: crate::logical_expr::WindowUDF
//! [`QueryPlanner`]: execution::context::QueryPlanner
//! [`OptimizerRule`]: datafusion_optimizer::optimizer::OptimizerRule
//! [`AnalyzerRule`]: datafusion_optimizer::analyzer::AnalyzerRule
//! [`PhysicalOptimizerRule`]: datafusion_physical_optimizer::PhysicalOptimizerRule
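//!
//! For example, to add a new scalar function callable from SQL or via [`Expr`]s,
//! declare a [`ScalarUDF`] and register it with the `SessionContext`. The
//! following is a minimal sketch; the `add_one` function and its implementation
//! are illustrative only:
//!
//! ```rust
//! # use std::sync::Arc;
//! # use datafusion::arrow::array::{ArrayRef, Int64Array};
//! # use datafusion::arrow::datatypes::DataType;
//! # use datafusion::common::cast::as_int64_array;
//! # use datafusion::error::Result;
//! # use datafusion::logical_expr::{create_udf, ColumnarValue, Volatility};
//! # use datafusion::prelude::*;
//! # fn main() -> Result<()> {
//! // implementation: add one to each value of an Int64 column
//! let add_one = Arc::new(|args: &[ColumnarValue]| {
//!     let args = ColumnarValue::values_to_arrays(args)?;
//!     let input = as_int64_array(&args[0])?;
//!     let result: Int64Array = input.iter().map(|v| v.map(|v| v + 1)).collect();
//!     Ok(ColumnarValue::Array(Arc::new(result) as ArrayRef))
//! });
//! let udf = create_udf(
//!     "add_one",
//!     vec![DataType::Int64],
//!     DataType::Int64,
//!     Volatility::Immutable,
//!     add_one,
//! );
//! // after registration, `add_one(x)` is usable in SQL and as an Expr
//! let ctx = SessionContext::new();
//! ctx.register_udf(udf);
//! # Ok(())
//! # }
//! ```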
//!
//! ## Query Planning and Execution Overview
//!
//! ### SQL
//!
//! ```text
//!                 Parsed with            SqlToRel creates
//!                 sqlparser              initial plan
//! ┌───────────────┐           ┌─────────┐             ┌─────────────┐
//! │   SELECT *    │           │Query {  │             │Project      │
//! │   FROM ...    │──────────▶│..       │────────────▶│  TableScan  │
//! │               │           │}        │             │    ...      │
//! └───────────────┘           └─────────┘             └─────────────┘
//!
//!   SQL String                 sqlparser               LogicalPlan
//!                              AST nodes
//! ```
//!
//! 1. The query string is parsed to an Abstract Syntax Tree (AST)
//!    [`Statement`] using [sqlparser].
//!
//! 2. The AST is converted by [`SqlToRel`] into a [`LogicalPlan`] and logical
//!    expressions ([`Expr`]s) that compute the desired result. This phase
//!    also includes name and type resolution ("binding").
//!
//! [`Statement`]: https://docs.rs/sqlparser/latest/sqlparser/ast/enum.Statement.html
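//!
//! Both phases happen inside `SessionContext::sql`, and the resulting
//! [`DataFrame`] exposes the planned [`LogicalPlan`] (a minimal sketch):
//!
//! ```rust
//! # use datafusion::error::Result;
//! # use datafusion::prelude::*;
//! # #[tokio::main]
//! # async fn main() -> Result<()> {
//! let ctx = SessionContext::new();
//! ctx.register_csv("example", "tests/data/example.csv", CsvReadOptions::new()).await?;
//! // parsing (sqlparser) and planning (SqlToRel) happen in `sql`
//! let df = ctx.sql("SELECT a FROM example WHERE a < 5").await?;
//! println!("{}", df.logical_plan().display_indent());
//! # Ok(())
//! # }
//! ```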
//!
//! ### DataFrame
//!
//! When executing plans using the [`DataFrame`] API, the process is
//! identical to the SQL case, except the DataFrame API builds the
//! [`LogicalPlan`] directly using [`LogicalPlanBuilder`]. Systems
//! that have their own custom query languages typically also build
//! [`LogicalPlan`] directly, as sketched below.
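//!
//! For example, a plan equivalent to `SELECT a FROM t WHERE a < 5` can be
//! built directly (a minimal sketch using an empty table as the source):
//!
//! ```rust
//! # use std::sync::Arc;
//! # use datafusion::arrow::datatypes::{DataType, Field, Schema};
//! # use datafusion::datasource::{empty::EmptyTable, provider_as_source};
//! # use datafusion::error::Result;
//! # use datafusion::logical_expr::LogicalPlanBuilder;
//! # use datafusion::prelude::*;
//! # fn main() -> Result<()> {
//! let schema = Arc::new(Schema::new(vec![Field::new("a", DataType::Int32, false)]));
//! let source = provider_as_source(Arc::new(EmptyTable::new(schema)));
//! // build the LogicalPlan without any SQL planner involved
//! let plan = LogicalPlanBuilder::scan("t", source, None)?
//!     .filter(col("a").lt(lit(5)))?
//!     .project(vec![col("a")])?
//!     .build()?;
//! println!("{}", plan.display_indent());
//! # Ok(())
//! # }
//! ```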
//!
//! ### Planning
//!
//! ```text
//!             AnalyzerRules and      PhysicalPlanner          PhysicalOptimizerRules
//!             OptimizerRules         creates ExecutionPlan    improve performance
//!             rewrite plan
//! ┌─────────────┐        ┌─────────────┐      ┌─────────────────┐        ┌─────────────────┐
//! │Project      │        │Project(x, y)│      │ProjectExec      │        │ProjectExec      │
//! │  TableScan  │──...──▶│  TableScan  │─────▶│  ...            │──...──▶│  ...            │
//! │    ...      │        │    ...      │      │   DataSourceExec│        │   DataSourceExec│
//! └─────────────┘        └─────────────┘      └─────────────────┘        └─────────────────┘
//!
//!  LogicalPlan            LogicalPlan         ExecutionPlan             ExecutionPlan
//! ```
//!
//! To process large datasets with many rows as efficiently as
//! possible, significant effort is spent planning and
//! optimizing, in the following manner (a sketch of driving this
//! pipeline follows the list):
//!
//! 1. The [`LogicalPlan`] is checked and rewritten to enforce
//!    semantic rules, such as type coercion, by [`AnalyzerRule`]s
//!
//! 2. The [`LogicalPlan`] is rewritten by [`OptimizerRule`]s, such as
//!    projection and filter pushdown, to improve its efficiency.
//!
//! 3. The [`LogicalPlan`] is converted to an [`ExecutionPlan`] by a
//!    [`PhysicalPlanner`].
//!
//! 4. The [`ExecutionPlan`] is rewritten by
//!    [`PhysicalOptimizerRule`]s, such as sort and join selection, to
//!    improve its efficiency.
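//!
//! All four steps run when a physical plan is created from a [`DataFrame`]
//! (a minimal sketch):
//!
//! ```rust
//! # use datafusion::error::Result;
//! # use datafusion::physical_plan::displayable;
//! # use datafusion::prelude::*;
//! # #[tokio::main]
//! # async fn main() -> Result<()> {
//! let ctx = SessionContext::new();
//! ctx.register_csv("example", "tests/data/example.csv", CsvReadOptions::new()).await?;
//! let df = ctx.sql("SELECT a, MIN(b) FROM example GROUP BY a").await?;
//! // runs the AnalyzerRules and OptimizerRules, the PhysicalPlanner, and the
//! // PhysicalOptimizerRules, producing an ExecutionPlan
//! let physical_plan = df.create_physical_plan().await?;
//! println!("{}", displayable(physical_plan.as_ref()).indent(true));
//! # Ok(())
//! # }
//! ```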
//!
//! ## Data Sources
//!
//! ```text
//! Planning       │
//! requests       │            TableProvider::scan
//! information    │            creates an
//! such as schema │            ExecutionPlan
//!                │
//!                ▼
//!   ┌─────────────────────────┐         ┌───────────────┐
//!   │                         │         │               │
//!   │impl TableProvider       │────────▶│DataSourceExec │
//!   │                         │         │               │
//!   └─────────────────────────┘         └───────────────┘
//!         TableProvider
//!         (built in or user provided)    ExecutionPlan
//! ```
//!
//! A [`TableProvider`] provides information for planning and
//! [`ExecutionPlan`]s for execution. DataFusion includes [`ListingTable`],
//! which supports reading several common file formats, and you can support any
//! new file format by implementing the [`TableProvider`] trait. See also:
//!
//! 1. [`ListingTable`]: Reads data from Parquet, JSON, CSV, or AVRO
//!    files.  Supports single files or multiple files with HIVE style
//!    partitioning, optional compression, directly reading from remote
//!    object store and more.
//!
//! 2. [`MemTable`]: Reads data from in-memory [`RecordBatch`]es (a
//!    registration sketch follows the links below).
//!
//! 3. [`StreamingTable`]: Reads data from potentially unbounded inputs.
//!
//! [`ListingTable`]: crate::datasource::listing::ListingTable
//! [`MemTable`]: crate::datasource::memory::MemTable
//! [`StreamingTable`]: crate::catalog::streaming::StreamingTable
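//!
//! For example, to register an in-memory table and query it (a minimal
//! sketch):
//!
//! ```rust
//! # use std::sync::Arc;
//! # use datafusion::arrow::array::{ArrayRef, Int32Array, RecordBatch};
//! # use datafusion::arrow::datatypes::{DataType, Field, Schema};
//! # use datafusion::datasource::MemTable;
//! # use datafusion::error::Result;
//! # use datafusion::prelude::*;
//! # #[tokio::main]
//! # async fn main() -> Result<()> {
//! let schema = Arc::new(Schema::new(vec![Field::new("a", DataType::Int32, false)]));
//! let array: ArrayRef = Arc::new(Int32Array::from(vec![1, 2, 3]));
//! let batch = RecordBatch::try_new(Arc::clone(&schema), vec![array])?;
//! // MemTable implements TableProvider over in-memory RecordBatches
//! let table = MemTable::try_new(schema, vec![vec![batch]])?;
//! let ctx = SessionContext::new();
//! ctx.register_table("t", Arc::new(table))?;
//! let df = ctx.sql("SELECT a FROM t").await?;
//! df.show().await?;
//! # Ok(())
//! # }
//! ```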
//!
//! ## Plan Representations
//!
//! ### Logical Plans
//! Logical planning yields [`LogicalPlan`] nodes and [`Expr`]
//! expressions, which are [`Schema`] aware and represent statements
//! independent of how they are physically executed.
//! A [`LogicalPlan`] is a Directed Acyclic Graph (DAG) of other
//! [`LogicalPlan`]s, each potentially containing embedded [`Expr`]s.
//!
//! `LogicalPlan`s can be rewritten with the [`TreeNode`] API; see the
//! [`tree_node module`] for more details.
//!
//! [`Expr`]s can also be rewritten with the [`TreeNode`] API and simplified using
//! [`ExprSimplifier`]. Examples of working with and executing `Expr`s can be
//! found in the [`expr_api`.rs] example; a small rewrite sketch follows the
//! links below.
//!
//! [`TreeNode`]: datafusion_common::tree_node::TreeNode
//! [`tree_node module`]: datafusion_expr::logical_plan::tree_node
//! [`ExprSimplifier`]: crate::optimizer::simplify_expressions::ExprSimplifier
//! [`expr_api`.rs]: https://github.com/apache/datafusion/blob/main/datafusion-examples/examples/expr_api.rs
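//!
//! For example, rewriting every reference to column `a` into a literal with
//! the [`TreeNode`] API (a minimal sketch):
//!
//! ```rust
//! # use datafusion::common::tree_node::{Transformed, TreeNode};
//! # use datafusion::error::Result;
//! # use datafusion::prelude::*;
//! # fn main() -> Result<()> {
//! let expr = col("a").lt(lit(5));
//! // transform rewrites the expression tree bottom up
//! let rewritten = expr
//!     .transform(|e| {
//!         if e == col("a") {
//!             Ok(Transformed::yes(lit(1)))
//!         } else {
//!             Ok(Transformed::no(e))
//!         }
//!     })?
//!     .data;
//! assert_eq!(rewritten, lit(1).lt(lit(5)));
//! # Ok(())
//! # }
//! ```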
//!
//! ### Physical Plans
//!
//! An [`ExecutionPlan`] (sometimes referred to as a "physical plan")
//! is a plan that can be executed against data. It is a DAG of other
//! [`ExecutionPlan`]s, each potentially containing expressions that implement the
//! [`PhysicalExpr`] trait.
//!
//! Compared to a [`LogicalPlan`], an [`ExecutionPlan`] has additional concrete
//! information about how to perform calculations (e.g. hash vs merge
//! join), and how data flows during execution (e.g. partitioning and
//! sortedness).
//!
//! [cp_solver] performs range propagation analysis on [`PhysicalExpr`]s and
//! [`PruningPredicate`] can prove certain boolean [`PhysicalExpr`]s used for
//! filtering can never be `true` using additional statistical information.
//!
//! [cp_solver]: crate::physical_expr::intervals::cp_solver
//! [`PruningPredicate`]: datafusion_physical_optimizer::pruning::PruningPredicate
//! [`PhysicalExpr`]: crate::physical_plan::PhysicalExpr
//!
//! ## Execution
//!
//! ```text
//!            ExecutionPlan::execute             Calling next() on the
//!            produces a stream                  stream produces the data
//!
//! ┌────────────────┐      ┌─────────────────────────┐         ┌────────────┐
//! │ProjectExec     │      │impl                     │    ┌───▶│RecordBatch │
//! │  ...           │─────▶│SendableRecordBatchStream│─────    └────────────┘
//! │  DataSourceExec│      │                         │    │    ┌────────────┐
//! └────────────────┘      └─────────────────────────┘    ├───▶│RecordBatch │
//!               ▲                                        │    └────────────┘
//! ExecutionPlan │                                        │         ...
//!               │                                        │
//!               │                                        │    ┌────────────┐
//!             PhysicalOptimizerRules                     ├───▶│RecordBatch │
//!             request information                        │    └────────────┘
//!             such as partitioning                       │    ┌ ─ ─ ─ ─ ─ ─
//!                                                        └───▶ None        │
//!                                                             └ ─ ─ ─ ─ ─ ─
//! ```
//!
//! [`ExecutionPlan`]s process data using the [Apache Arrow] memory
//! format, making heavy use of functions from the [arrow]
//! crate. Values are represented with [`ColumnarValue`], which are either
//! [`ScalarValue`] (single constant values) or [`ArrayRef`] (Arrow
//! Arrays).
//!
//! Calling [`execute`] produces 1 or more partitions of data,
//! as a [`SendableRecordBatchStream`], which implements a pull based execution
//! API. Calling [`next()`]`.await` will incrementally compute and return the next
//! [`RecordBatch`]. Balanced parallelism is achieved using [Volcano style]
//! "Exchange" operations implemented by [`RepartitionExec`].
//!
//! While some recent research such as [Morsel-Driven Parallelism] describes challenges
//! with the pull style Volcano execution model on NUMA architectures, in practice DataFusion achieves
//! similar scalability to systems that use push driven schedulers [such as DuckDB].
//! See the [DataFusion paper in SIGMOD 2024] for more details.
//!
//! [`execute`]: physical_plan::ExecutionPlan::execute
//! [`SendableRecordBatchStream`]: crate::physical_plan::SendableRecordBatchStream
//! [`ColumnarValue`]: datafusion_expr::ColumnarValue
//! [`ScalarValue`]: crate::scalar::ScalarValue
//! [`ArrayRef`]: arrow::array::ArrayRef
//! [`Stream`]: futures::stream::Stream
//!
//! See the [implementors of `ExecutionPlan`] for a list of physical operators available.
//!
//! [`RepartitionExec`]: https://docs.rs/datafusion/latest/datafusion/physical_plan/repartition/struct.RepartitionExec.html
//! [Volcano style]: https://doi.org/10.1145/93605.98720
//! [Morsel-Driven Parallelism]: https://db.in.tum.de/~leis/papers/morsels.pdf
//! [DataFusion paper in SIGMOD 2024]: https://github.com/apache/datafusion/files/15149988/DataFusion_Query_Engine___SIGMOD_2024-FINAL-mk4.pdf
//! [such as DuckDB]: https://github.com/duckdb/duckdb/issues/1583
//! [implementors of `ExecutionPlan`]: https://docs.rs/datafusion/latest/datafusion/physical_plan/trait.ExecutionPlan.html#implementors
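//!
//! For example, to consume the output of a query incrementally as a stream of
//! [`RecordBatch`]es (a minimal sketch):
//!
//! ```rust
//! # use datafusion::error::Result;
//! # use datafusion::prelude::*;
//! # use futures::StreamExt;
//! # #[tokio::main]
//! # async fn main() -> Result<()> {
//! let ctx = SessionContext::new();
//! ctx.register_csv("example", "tests/data/example.csv", CsvReadOptions::new()).await?;
//! let df = ctx.sql("SELECT a FROM example").await?;
//! // execute_stream combines all partitions into one SendableRecordBatchStream
//! let mut stream = df.execute_stream().await?;
//! while let Some(batch) = stream.next().await {
//!     println!("got {} rows", batch?.num_rows());
//! }
//! # Ok(())
//! # }
//! ```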
//!
//! ## Streaming Execution
//!
//! DataFusion is a "streaming" query engine, which means `ExecutionPlan`s incrementally
//! read from their input(s) and compute output one [`RecordBatch`] at a time
//! by continually polling [`SendableRecordBatchStream`]s. Output and
//! intermediate `RecordBatch`s each have approximately `batch_size` rows,
//! which amortizes per-batch overhead of execution.
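//!
//! The target batch size can be changed via `SessionConfig` (a minimal
//! sketch):
//!
//! ```rust
//! # use datafusion::prelude::*;
//! // target approximately 8192 rows per RecordBatch (the default)
//! let config = SessionConfig::new().with_batch_size(8192);
//! let ctx = SessionContext::new_with_config(config);
//! ```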
//!
//! Note that certain operations, sometimes called "pipeline breakers"
//! (for example full sorts or hash aggregations), are fundamentally non-streaming and
//! must read their input fully before producing **any** output. As much as possible,
//! other operators read a single [`RecordBatch`] from their input to produce a
//! single `RecordBatch` as output.
//!
//! For example, given this SQL query:
//!
//! ```sql
//! SELECT date_trunc('month', time) FROM data WHERE id IN (10,20,30);
//! ```
//!
//! The diagram below shows the call sequence when a consumer calls [`next()`] to
//! get the next `RecordBatch` of output. While it is possible that some
//! steps run on different threads, typically Tokio will use the same thread
//! that called `next()` to read from the input, apply the filter, and
//! return the results without interleaving any other operations. This results
//! in excellent cache locality, as the same CPU core that produces the data often
//! consumes it immediately as well.
//!
//! ```text
//!
//! Step 3: FilterExec calls next()       Step 2: ProjectionExec calls
//!         on input Stream                  next() on input Stream
//!         ┌ ─ ─ ─ ─ ─ ─ ─ ─ ─      ┌ ─ ─ ─ ─ ─ ─ ─ ─ ─ ─ ─ ─ ─ ┐
//!                            │                                               Step 1: Consumer
//!         ▼                        ▼                           │               calls next()
//! ┏━━━━━━━━━━━━━━━━┓     ┏━━━━━┻━━━━━━━━━━━━━┓      ┏━━━━━━━━━━━━━━━━━━━━━━━━┓
//! ┃                ┃     ┃                   ┃      ┃                        ◀ ─ ─ ─ ─ ─ ─ ─ ─ ─ ─ ─
//! ┃  DataSource    ┃     ┃                   ┃      ┃                        ┃
//! ┃    (e.g.       ┃     ┃    FilterExec     ┃      ┃     ProjectionExec     ┃
//! ┃ ParquetSource) ┃     ┃id IN (10, 20, 30) ┃      ┃date_bin('month', time) ┃
//! ┃                ┃     ┃                   ┃      ┃                        ┣ ─ ─ ─ ─ ─ ─ ─ ─ ─ ─ ▶
//! ┃                ┃     ┃                   ┃      ┃                        ┃
//! ┗━━━━━━━━━━━━━━━━┛     ┗━━━━━━━━━━━┳━━━━━━━┛      ┗━━━━━━━━━━━━━━━━━━━━━━━━┛
//!         │                  ▲                                 ▲          Step 6: ProjectionExec
//!                            │     │                           │        computes date_trunc into a
//!         └ ─ ─ ─ ─ ─ ─ ─ ─ ─       ─ ─ ─ ─ ─ ─ ─ ─ ─ ─ ─ ─ ─ ─          new RecordBatch returned
//!              ┌─────────────────────┐                ┌─────────────┐          from client
//!              │     RecordBatch     │                │ RecordBatch │
//!              └─────────────────────┘                └─────────────┘
//!
//!           Step 4: DataSource returns a        Step 5: FilterExec returns a new
//!                single RecordBatch            RecordBatch with only matching rows
//! ```
//!
//! [`next()`]: futures::StreamExt::next
//!
//! ## Thread Scheduling, CPU / IO Thread Pools, and [Tokio] [`Runtime`]s
//!
//! DataFusion automatically runs each plan with multiple CPU cores using
//! a [Tokio] [`Runtime`] as a thread pool. While Tokio is most commonly used
//! for asynchronous network I/O, the combination of an efficient, work-stealing
//! scheduler and first class compiler support for automatic continuation
//! generation (`async`) also makes it a compelling choice for CPU intensive
//! applications, as explained in the [Using Rustlang’s Async Tokio
//! Runtime for CPU-Bound Tasks] blog.
//!
//! The number of cores used is determined by the `target_partitions`
//! configuration setting, which defaults to the number of CPU cores.
//! While preparing for execution, DataFusion tries to create this many distinct
//! `async` [`Stream`]s for each `ExecutionPlan`.
//! The `Stream`s for certain `ExecutionPlan`s, such as [`RepartitionExec`]
//! and [`CoalescePartitionsExec`], spawn [Tokio] [`task`]s, that are run by
//! threads managed by the `Runtime`.
//! Many DataFusion `Stream`s perform CPU intensive processing.
//!
//! Using `async` for CPU intensive tasks makes it easy for [`TableProvider`]s
//! to perform network I/O using standard Rust `async` during execution.
//! However, this design also makes it very easy to mix CPU intensive and latency
//! sensitive I/O work on the same thread pool ([`Runtime`]).
//! Using the same (default) `Runtime` is convenient, and often works well for
//! initial development and processing local files, but it can lead to problems
//! under load and/or when reading from network sources such as AWS S3.
//!
//! If your system does not fully utilize either the CPU or network bandwidth
//! during execution, or you see significantly higher tail (e.g. p99) latencies
//! responding to network requests, **it is likely you need to use a different
//! `Runtime` for CPU intensive DataFusion plans**. This effect can be especially
//! pronounced when running several queries concurrently.
//!
//! As shown in the following figure, using the same `Runtime` for both CPU
//! intensive processing and network requests can introduce significant
//! delays in responding to those network requests. Delays in processing network
//! requests can and do cause network flow control to throttle the available
//! bandwidth in response.
//!
//! ```text
//!                                                                          Legend
//!
//!                                                                          ┏━━━━━━┓
//!                            Processing network request                    ┃      ┃  CPU bound work
//!                            is delayed due to processing                  ┗━━━━━━┛
//!                            CPU bound work                                ┌─┐
//!                                                                          │ │       Network request
//!                                         ││                               └─┘       processing
//!
//!                                         ││
//!                                ─ ─ ─ ─ ─  ─ ─ ─ ─ ─ ─ ─ ─ ─ ─ ─ ─ ─ ─ ─ ─ ─
//!                               │                                            │
//!
//!                               ▼                                            ▼
//! ┌─────────────┐           ┌─┐┌─┐┏━━━━━━━━━━━━━━━━━━━┓┏━━━━━━━━━━━━━━━━━━━┓┌─┐
//! │             │thread 1   │ ││ │┃     Decoding      ┃┃     Filtering     ┃│ │
//! │             │           └─┘└─┘┗━━━━━━━━━━━━━━━━━━━┛┗━━━━━━━━━━━━━━━━━━━┛└─┘
//! │             │           ┏━━━━━━━━━━━━━━┳━━━━━━━━━━━━━━━━━━━┳━━━━━━━━━━━━━━┓
//! │Tokio Runtime│thread 2   ┃   Decoding   ┃     Filtering     ┃   Decoding   ┃       ...
//! │(thread pool)│           ┗━━━━━━━━━━━━━━┻━━━━━━━━━━━━━━━━━━━┻━━━━━━━━━━━━━━┛
//! │             │     ...                               ...
//! │             │           ┏━━━━━━━━━━━━━━━━━━━┳━━━━━━━━━━━━━━━━━━━┓┌─┐ ┏━━━━━━━━━━━━━━┓
//! │             │thread N   ┃     Decoding      ┃     Filtering     ┃│ │ ┃   Decoding   ┃
//! └─────────────┘           ┗━━━━━━━━━━━━━━━━━━━┻━━━━━━━━━━━━━━━━━━━┛└─┘ ┗━━━━━━━━━━━━━━┛
//!                           ─────────────────────────────────────────────────────────────▶
//!                                                                                           time
//! ```
//!
//! The bottleneck resulting from network throttling can be avoided
//! by using separate [`Runtime`]s for the different types of work, as shown
//! in the diagram below.
//!
//! ```text
//!                    A separate thread pool processes network       Legend
//!                    requests, reducing the latency for
//!                    processing each request                        ┏━━━━━━┓
//!                                                                   ┃      ┃  CPU bound work
//!                                         │                         ┗━━━━━━┛
//!                                          │                        ┌─┐
//!                               ┌ ─ ─ ─ ─ ┘                         │ │       Network request
//!                                  ┌ ─ ─ ─ ┘                        └─┘       processing
//!                               │
//!                               ▼  ▼
//! ┌─────────────┐           ┌─┐┌─┐┌─┐
//! │             │thread 1   │ ││ ││ │
//! │             │           └─┘└─┘└─┘
//! │Tokio Runtime│                                          ...
//! │(thread pool)│thread 2
//! │             │
//! │"IO Runtime" │     ...
//! │             │                                                   ┌─┐
//! │             │thread N                                           │ │
//! └─────────────┘                                                   └─┘
//!                           ─────────────────────────────────────────────────────────────▶
//!                                                                                           time
//!
//! ┌─────────────┐           ┏━━━━━━━━━━━━━━━━━━━┓┏━━━━━━━━━━━━━━━━━━━┓
//! │             │thread 1   ┃     Decoding      ┃┃     Filtering     ┃
//! │             │           ┗━━━━━━━━━━━━━━━━━━━┛┗━━━━━━━━━━━━━━━━━━━┛
//! │Tokio Runtime│           ┏━━━━━━━━━━━━━━┳━━━━━━━━━━━━━━━━━━━┳━━━━━━━━━━━━━━┓
//! │(thread pool)│thread 2   ┃   Decoding   ┃     Filtering     ┃   Decoding   ┃       ...
//! │             │           ┗━━━━━━━━━━━━━━┻━━━━━━━━━━━━━━━━━━━┻━━━━━━━━━━━━━━┛
//! │ CPU Runtime │     ...                               ...
//! │             │           ┏━━━━━━━━━━━━━━━━━━━┳━━━━━━━━━━━━━━━━━━━┳━━━━━━━━━━━━━━┓
//! │             │thread N   ┃     Decoding      ┃     Filtering     ┃   Decoding   ┃
//! └─────────────┘           ┗━━━━━━━━━━━━━━━━━━━┻━━━━━━━━━━━━━━━━━━━┻━━━━━━━━━━━━━━┛
//!                          ─────────────────────────────────────────────────────────────▶
//!                                                                                           time
//!```
//!
//! Note that DataFusion does not use [`tokio::task::spawn_blocking`] for
//! CPU-bound work, because `spawn_blocking` is designed for blocking **IO**,
//! not for CPU bound tasks. Among other challenges, spawned blocking
//! tasks can't yield while waiting for input (they can't call `await`), so they
//! can't be used to limit the number of concurrent CPU bound tasks or
//! keep the processing pipeline on the same core.
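//!
//! A dedicated `Runtime` for CPU bound work can be created with standard
//! Tokio APIs (a minimal sketch, not a DataFusion API; see the
//! [datafusion-examples] directory for a complete example):
//!
//! ```rust
//! // build a second runtime dedicated to CPU bound DataFusion work, leaving
//! // the default runtime free to service latency sensitive network IO
//! let cpu_runtime = tokio::runtime::Builder::new_multi_thread()
//!     .worker_threads(4)
//!     .thread_name("datafusion-cpu")
//!     .build()
//!     .unwrap();
//! // spawn DataFusion streams onto `cpu_runtime` and send the resulting
//! // RecordBatches back to the IO runtime via a channel
//! cpu_runtime.shutdown_background();
//! ```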
//!
//! [Tokio]: https://tokio.rs
//! [`Runtime`]: tokio::runtime::Runtime
//! [`task`]: tokio::task
//! [Using Rustlang’s Async Tokio Runtime for CPU-Bound Tasks]: https://thenewstack.io/using-rustlangs-async-tokio-runtime-for-cpu-bound-tasks/
//! [`RepartitionExec`]: physical_plan::repartition::RepartitionExec
//! [`CoalescePartitionsExec`]: physical_plan::coalesce_partitions::CoalescePartitionsExec
//!
//! ## State Management and Configuration
//!
//! [`ConfigOptions`] contain options to control DataFusion's
//! execution.
//!
//! [`ConfigOptions`]: datafusion_common::config::ConfigOptions
//!
//! The state required to execute queries is managed by the following
//! structures:
//!
//! 1. [`SessionContext`]: State needed to create [`LogicalPlan`]s such
//!    as the table definitions and the function registries.
//!
//! 2. [`TaskContext`]: State needed for execution such as the
//!    [`MemoryPool`], [`DiskManager`], and [`ObjectStoreRegistry`].
//!
//! 3. [`ExecutionProps`]: Per-execution properties and data (such as
//!    starting timestamps, etc).
//!
//! [`SessionContext`]: crate::execution::context::SessionContext
//! [`TaskContext`]: crate::execution::context::TaskContext
//! [`ExecutionProps`]: crate::execution::context::ExecutionProps
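//!
//! For example, execution options can be set on a `SessionConfig` when
//! creating the [`SessionContext`] (a minimal sketch):
//!
//! ```rust
//! # use datafusion::prelude::*;
//! // ConfigOptions are exposed through SessionConfig builder methods
//! let config = SessionConfig::new()
//!     .with_target_partitions(4)
//!     .with_information_schema(true);
//! let ctx = SessionContext::new_with_config(config);
//! ```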
//!
//! ### Resource Management
//!
//! The amount of memory and temporary local disk space used by
//! DataFusion when running a plan can be controlled using the
//! [`MemoryPool`] and [`DiskManager`]. Other runtime options can be
//! found on [`RuntimeEnv`].
//!
//! [`DiskManager`]: crate::execution::DiskManager
//! [`MemoryPool`]: crate::execution::memory_pool::MemoryPool
//! [`RuntimeEnv`]: crate::execution::runtime_env::RuntimeEnv
//! [`ObjectStoreRegistry`]: crate::datasource::object_store::ObjectStoreRegistry
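//!
//! For example, to cap memory usage with a shared [`MemoryPool`] (a minimal
//! sketch):
//!
//! ```rust
//! # use std::sync::Arc;
//! # use datafusion::error::Result;
//! # use datafusion::execution::memory_pool::GreedyMemoryPool;
//! # use datafusion::execution::runtime_env::RuntimeEnvBuilder;
//! # use datafusion::prelude::*;
//! # fn main() -> Result<()> {
//! // limit DataFusion to 2 GiB of memory across all operators
//! let runtime_env = RuntimeEnvBuilder::new()
//!     .with_memory_pool(Arc::new(GreedyMemoryPool::new(2 * 1024 * 1024 * 1024)))
//!     .build_arc()?;
//! let ctx = SessionContext::new_with_config_rt(SessionConfig::new(), runtime_env);
//! # Ok(())
//! # }
//! ```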
//!
//! ## Crate Organization
//!
//! Most users interact with DataFusion via this crate (`datafusion`), which re-exports
//! all functionality needed to build and execute queries.
//!
//! There are three other crates that provide additional functionality that
//! must be used directly:
//! * [`datafusion_proto`]: Plan serialization and deserialization
//! * [`datafusion_substrait`]: Support for the Substrait plan serialization format
//! * [`datafusion_sqllogictest`]: The DataFusion SQL logic test runner
//!
//! [`datafusion_proto`]: https://crates.io/crates/datafusion-proto
//! [`datafusion_substrait`]: https://crates.io/crates/datafusion-substrait
//! [`datafusion_sqllogictest`]: https://crates.io/crates/datafusion-sqllogictest
//!
//! DataFusion is internally split into multiple sub crates to
//! enforce modularity and improve compilation times. See the
//! [list of modules](#modules) for all available sub-crates. Major ones are:
//!
//! * [datafusion_common]: Common traits and types
//! * [datafusion_catalog]: Catalog APIs such as [`SchemaProvider`] and [`CatalogProvider`]
//! * [datafusion_datasource]: File and Data IO such as [`FileSource`] and [`DataSink`]
//! * [datafusion_session]: [`Session`] and related structures
//! * [datafusion_execution]: State and structures needed for execution
//! * [datafusion_expr]: [`LogicalPlan`], [`Expr`] and related logical planning structures
//! * [datafusion_functions]: Scalar function packages
//! * [datafusion_functions_aggregate]: Aggregate functions such as `MIN`, `MAX`, `SUM`, etc
//! * [datafusion_functions_nested]: Scalar function packages for `ARRAY`s, `MAP`s and `STRUCT`s
//! * [datafusion_functions_table]: Table Functions such as `GENERATE_SERIES`
//! * [datafusion_functions_window]: Window functions such as `ROW_NUMBER`, `RANK`, etc
//! * [datafusion_optimizer]: [`OptimizerRule`]s and [`AnalyzerRule`]s
//! * [datafusion_physical_expr]: [`PhysicalExpr`] and related expressions
//! * [datafusion_physical_plan]: [`ExecutionPlan`] and related operators
//! * [datafusion_physical_optimizer]: [`PhysicalOptimizerRule`]s that rewrite [`ExecutionPlan`]s
//! * [datafusion_sql]: SQL planner ([`SqlToRel`])
//!
//! [`SchemaProvider`]: datafusion_catalog::SchemaProvider
//! [`CatalogProvider`]: datafusion_catalog::CatalogProvider
//! [`Session`]: datafusion_session::Session
//! [`FileSource`]: datafusion_datasource::file::FileSource
//! [`DataSink`]: datafusion_datasource::sink::DataSink
//!
//! ## Citing DataFusion in Academic Papers
//!
//! You can use the following citation to reference DataFusion in academic papers:
//!
//! ```text
//! @inproceedings{lamb2024apache,
//!   title={Apache Arrow DataFusion: A Fast, Embeddable, Modular Analytic Query Engine},
//!   author={Lamb, Andrew and Shen, Yijie and Heres, Dani{\"e}l and Chakraborty, Jayjeet and Kabak, Mehmet Ozan and Hsieh, Liang-Chi and Sun, Chao},
//!   booktitle={Companion of the 2024 International Conference on Management of Data},
//!   pages={5--17},
//!   year={2024}
//! }
//! ```
//!
//! [sqlparser]: https://docs.rs/sqlparser/latest/sqlparser
//! [`SqlToRel`]: sql::planner::SqlToRel
//! [`Expr`]: datafusion_expr::Expr
//! [`LogicalPlan`]: datafusion_expr::LogicalPlan
//! [`AnalyzerRule`]: datafusion_optimizer::analyzer::AnalyzerRule
//! [`OptimizerRule`]: optimizer::optimizer::OptimizerRule
//! [`ExecutionPlan`]: physical_plan::ExecutionPlan
//! [`PhysicalPlanner`]: physical_planner::PhysicalPlanner
//! [`PhysicalOptimizerRule`]: datafusion_physical_optimizer::PhysicalOptimizerRule
//! [`Schema`]: arrow::datatypes::Schema
//! [`PhysicalExpr`]: physical_plan::PhysicalExpr
//! [`RecordBatch`]: arrow::array::RecordBatch
//! [`RecordBatchReader`]: arrow::record_batch::RecordBatchReader
//! [`Array`]: arrow::array::Array

/// DataFusion crate version
pub const DATAFUSION_VERSION: &str = env!("CARGO_PKG_VERSION");

extern crate core;
extern crate sqlparser;

pub mod dataframe;
pub mod datasource;
pub mod error;
pub mod execution;
pub mod physical_planner;
pub mod prelude;
pub mod scalar;

// re-export dependencies from arrow-rs to minimize version maintenance for crate users
pub use arrow;
#[cfg(feature = "parquet")]
pub use parquet;

// re-export DataFusion sub-crates at the top level. Use `pub use *`
// so that the contents of the subcrates appears in rustdocs
// for details, see https://github.com/apache/datafusion/issues/6648

/// re-export of [`datafusion_common`] crate
pub mod common {
    pub use datafusion_common::*;

    /// re-export of [`datafusion_common_runtime`] crate
    pub mod runtime {
        pub use datafusion_common_runtime::*;
    }
}

// Backwards compatibility
pub use common::config;

// NB datafusion execution is re-exported in the `execution` module

/// re-export of [`datafusion_catalog`] crate
pub mod catalog {
    pub use datafusion_catalog::*;
}

/// re-export of [`datafusion_expr`] crate
pub mod logical_expr {
    pub use datafusion_expr::*;
}

/// re-export of [`datafusion_expr_common`] crate
pub mod logical_expr_common {
    pub use datafusion_expr_common::*;
}

/// re-export of [`datafusion_optimizer`] crate
pub mod optimizer {
    pub use datafusion_optimizer::*;
}

/// re-export of [`datafusion_physical_optimizer`] crate
pub mod physical_optimizer {
    pub use datafusion_physical_optimizer::*;
}

/// re-export of [`datafusion_physical_expr_common`] crate
pub mod physical_expr_common {
    pub use datafusion_physical_expr_common::*;
}

/// re-export of [`datafusion_physical_expr`] crate
pub mod physical_expr {
    pub use datafusion_physical_expr::*;
}

/// re-export of [`datafusion_physical_plan`] crate
pub mod physical_plan {
    pub use datafusion_physical_plan::*;
}

// Reexport testing macros for compatibility
pub use datafusion_common::assert_batches_eq;
pub use datafusion_common::assert_batches_sorted_eq;

/// re-export of [`datafusion_sql`] crate
pub mod sql {
    pub use datafusion_sql::*;
}

/// re-export of [`datafusion_functions`] crate
pub mod functions {
    pub use datafusion_functions::*;
}

/// re-export of [`datafusion_functions_nested`] crate, if "nested_expressions" feature is enabled
pub mod functions_nested {
    #[cfg(feature = "nested_expressions")]
    pub use datafusion_functions_nested::*;
}

/// re-export of [`datafusion_functions_nested`] crate as [`functions_array`] for backward compatibility, if "nested_expressions" feature is enabled
#[deprecated(since = "41.0.0", note = "use datafusion-functions-nested instead")]
pub mod functions_array {
    #[cfg(feature = "nested_expressions")]
    pub use datafusion_functions_nested::*;
}

/// re-export of [`datafusion_functions_aggregate`] crate
pub mod functions_aggregate {
    pub use datafusion_functions_aggregate::*;
}

/// re-export of [`datafusion_functions_window`] crate
pub mod functions_window {
    pub use datafusion_functions_window::*;
}

/// re-export of [`datafusion_functions_table`] crate
pub mod functions_table {
    pub use datafusion_functions_table::*;
}

/// re-export of variable provider for `@name` and `@@name` style runtime values.
pub mod variable {
    pub use datafusion_expr::var_provider::{VarProvider, VarType};
}

#[cfg(not(target_arch = "wasm32"))]
pub mod test;

mod schema_equivalence;
pub mod test_util;

#[cfg(doctest)]
doc_comment::doctest!("../../../README.md", readme_example_test);

// Instructions for Documentation Examples
//
// The following commands test the examples from the user guide as part of
// `cargo test --doc`
//
// # Adding new tests:
//
// Simply add code like this to your .md file and ensure your md file is
// included in the lists below.
//
// ```rust
// <code here will be tested>
// ```
//
// Note that sometimes it helps to author the doctest as a standalone program
// first, and then copy it into the user guide.
//
// # Debugging Test Failures
//
// Unfortunately, the line numbers reported by doctest do not correspond to the
// line numbers in the .md files. Thus, if a doctest fails, use the name of
// the test to find the relevant file in the list below, and then find the
// example in that file to fix.
//
// For example, if `user_guide_expressions(line 123)` fails,
// go to `docs/source/user-guide/expressions.md` to find the relevant problem.
//
#[cfg(doctest)]
doc_comment::doctest!(
    "../../../docs/source/user-guide/concepts-readings-events.md",
    user_guide_concepts_readings_events
);

#[cfg(doctest)]
doc_comment::doctest!(
    "../../../docs/source/user-guide/configs.md",
    user_guide_configs
);

#[cfg(doctest)]
doc_comment::doctest!(
    "../../../docs/source/user-guide/crate-configuration.md",
    user_guide_crate_configuration
);

#[cfg(doctest)]
doc_comment::doctest!(
    "../../../docs/source/user-guide/dataframe.md",
    user_guide_dataframe
);

#[cfg(doctest)]
doc_comment::doctest!(
    "../../../docs/source/user-guide/example-usage.md",
    user_guide_example_usage
);

#[cfg(doctest)]
doc_comment::doctest!(
    "../../../docs/source/user-guide/explain-usage.md",
    user_guide_explain_usage
);

#[cfg(doctest)]
doc_comment::doctest!(
    "../../../docs/source/user-guide/expressions.md",
    user_guide_expressions
);

#[cfg(doctest)]
doc_comment::doctest!("../../../docs/source/user-guide/faq.md", user_guide_faq);

#[cfg(doctest)]
doc_comment::doctest!(
    "../../../docs/source/user-guide/introduction.md",
    user_guide_introduction
);

#[cfg(doctest)]
doc_comment::doctest!(
    "../../../docs/source/user-guide/cli/datasources.md",
    user_guide_cli_datasource
);

#[cfg(doctest)]
doc_comment::doctest!(
    "../../../docs/source/user-guide/cli/installation.md",
    user_guide_cli_installation
);

#[cfg(doctest)]
doc_comment::doctest!(
    "../../../docs/source/user-guide/cli/overview.md",
    user_guide_cli_overview
);

#[cfg(doctest)]
doc_comment::doctest!(
    "../../../docs/source/user-guide/cli/usage.md",
    user_guide_cli_usage
);

#[cfg(doctest)]
doc_comment::doctest!(
    "../../../docs/source/user-guide/features.md",
    user_guide_features
);

#[cfg(doctest)]
doc_comment::doctest!(
    "../../../docs/source/user-guide/sql/aggregate_functions.md",
    user_guide_sql_aggregate_functions
);

#[cfg(doctest)]
doc_comment::doctest!(
    "../../../docs/source/user-guide/sql/data_types.md",
    user_guide_sql_data_types
);

#[cfg(doctest)]
doc_comment::doctest!(
    "../../../docs/source/user-guide/sql/ddl.md",
    user_guide_sql_ddl
);

#[cfg(doctest)]
doc_comment::doctest!(
    "../../../docs/source/user-guide/sql/dml.md",
    user_guide_sql_dml
);

#[cfg(doctest)]
doc_comment::doctest!(
    "../../../docs/source/user-guide/sql/explain.md",
    user_guide_sql_explain
);

#[cfg(doctest)]
doc_comment::doctest!(
    "../../../docs/source/user-guide/sql/information_schema.md",
    user_guide_sql_information_schema
);

#[cfg(doctest)]
doc_comment::doctest!(
    "../../../docs/source/user-guide/sql/operators.md",
    user_guide_sql_operators
);

#[cfg(doctest)]
doc_comment::doctest!(
    "../../../docs/source/user-guide/sql/prepared_statements.md",
    user_guide_prepared_statements
);

#[cfg(doctest)]
doc_comment::doctest!(
    "../../../docs/source/user-guide/sql/scalar_functions.md",
    user_guide_sql_scalar_functions
);

#[cfg(doctest)]
doc_comment::doctest!(
    "../../../docs/source/user-guide/sql/select.md",
    user_guide_sql_select
);

#[cfg(doctest)]
doc_comment::doctest!(
    "../../../docs/source/user-guide/sql/special_functions.md",
    user_guide_sql_special_functions
);

#[cfg(doctest)]
doc_comment::doctest!(
    "../../../docs/source/user-guide/sql/subqueries.md",
    user_guide_sql_subqueries
);

#[cfg(doctest)]
doc_comment::doctest!(
    "../../../docs/source/user-guide/sql/window_functions.md",
    user_guide_sql_window_functions
);

#[cfg(doctest)]
doc_comment::doctest!(
    "../../../docs/source/user-guide/sql/write_options.md",
    user_guide_sql_write_options
);

#[cfg(doctest)]
doc_comment::doctest!(
    "../../../docs/source/library-user-guide/adding-udfs.md",
    library_user_guide_adding_udfs
);

#[cfg(doctest)]
doc_comment::doctest!(
    "../../../docs/source/library-user-guide/building-logical-plans.md",
    library_user_guide_building_logical_plans
);

#[cfg(doctest)]
doc_comment::doctest!(
    "../../../docs/source/library-user-guide/catalogs.md",
    library_user_guide_catalogs
);

#[cfg(doctest)]
doc_comment::doctest!(
    "../../../docs/source/library-user-guide/custom-table-providers.md",
    library_user_guide_custom_table_providers
);

#[cfg(doctest)]
doc_comment::doctest!(
    "../../../docs/source/library-user-guide/extending-operators.md",
    library_user_guide_extending_operators
);

#[cfg(doctest)]
doc_comment::doctest!(
    "../../../docs/source/library-user-guide/extensions.md",
    library_user_guide_extensions
);

#[cfg(doctest)]
doc_comment::doctest!(
    "../../../docs/source/library-user-guide/index.md",
    library_user_guide_index
);

#[cfg(doctest)]
doc_comment::doctest!(
    "../../../docs/source/library-user-guide/profiling.md",
    library_user_guide_profiling
);

#[cfg(doctest)]
doc_comment::doctest!(
    "../../../docs/source/library-user-guide/query-optimizer.md",
    library_user_guide_query_optimizer
);

#[cfg(doctest)]
doc_comment::doctest!(
    "../../../docs/source/library-user-guide/using-the-dataframe-api.md",
    library_user_guide_dataframe_api
);

#[cfg(doctest)]
doc_comment::doctest!(
    "../../../docs/source/library-user-guide/using-the-sql-api.md",
    library_user_guide_sql_api
);

#[cfg(doctest)]
doc_comment::doctest!(
    "../../../docs/source/library-user-guide/working-with-exprs.md",
    library_user_guide_working_with_exprs
);

#[cfg(doctest)]
doc_comment::doctest!(
    "../../../docs/source/library-user-guide/upgrading.md",
    library_user_guide_upgrading
);

#[cfg(doctest)]
doc_comment::doctest!(
    "../../../docs/source/contributor-guide/api-health.md",
    contributor_guide_api_health
);