by Marco Neumann
Rust Hack & Learn Hamburg, October 2024
Public service announcements.
A demo project.
WITH stations_hamburg AS (
SELECT stations.id, stations.name
FROM stations
JOIN station_settings ON stations.id_of_station_setting = station_settings.id
WHERE city = 'Hamburg' AND station_settings.translated_short_name = 'urban'
),
worst_station AS (
SELECT air_quality.station_id, air_quality.index, count(NOT incomplete) AS n
FROM air_quality
JOIN stations_hamburg ON air_quality.station_id = stations_hamburg.id
WHERE date_start >= '2024-01-01' AND date_end < '2024-01-03'
GROUP BY air_quality.station_id, air_quality.index
ORDER BY air_quality.index DESC, n DESC
LIMIT 1
)
SELECT stations_hamburg.name
FROM worst_station
JOIN stations_hamburg ON worst_station.station_id = stations_hamburg.id;
REST
rust struct
arrow
text
arrow
DataFusion
JSON
reqwest
serde
Arrow
glue code
SQL query
parameters
ranges
What you get from using "open data".
{
"i1": 42,
"i2": 1337,
"i3": 7
}
{
"i1": "42",
"i2": "1337",
"i3": 7
}
Normal People
Them
{
"f1": 4.2,
"f2": 13.37
}
{
"f1": "4.2",
"f2": "13.37"
}
Normal People
Them
{
"b1": true,
"b2": false
}
{
"b1": "1",
"b2": "0"
}
Normal People
Them
{
"s1": null,
"s2": "foo"
}
{
"s1": "",
"s2": "foo"
}
Normal People
Them
{
"z1": null,
"z2": null,
"z3": "12345"
}
{
"z1": "",
"z2": "00000",
"z3": "12345"
}
Normal People
Them
{
"t1": "2024-01-02T03:04:05+01:00",
"t2": "2024-01-03T00:00:00+01:00"
}
{
"t1": "2024-01-02 03:04:05",
"t2": "2024-01-00 24:00:00"
}
Normal People
Them
{
"known_key": "foo",
"data": {
"k1": "a",
"k2": "b"
}
}
{
"known_key": "foo",
"k1": "a",
"k2": "b"
}
Normal People
Them
{
"data": [
{
"k1": "a",
"k2": 2,
"sub": [
{
"k1": 1,
"k2": "foo"
},
{
"k1": 2,
"k2": "bar"
}
]
},
{
"k1": "b",
"k2": 3,
"sub": [
{
"k1": 3,
"k2": "baz"
}
]
}
]
}
{
"data": [
[
"a",
2,
[
1,
"foo"
],
[
2,
"bar"
]
],
[
"b",
3,
[
3,
"baz"
]
]
]
}
Normal People
Them
The details.
From query to execution plan.
SELECT station_id, date_start, date_end, index
FROM air_quality
WHERE date_start >= '2024-01-01' AND date_end < '2024-01-02';
{
"Select": {
"distinct": null,
"top": null,
"projection": [
{
"UnnamedExpr": {
"Identifier": {
"value": "station_id",
"quote_style": null
}
}
},
{
"UnnamedExpr": {
"Identifier": {
"value": "date_start",
"quote_style": null
}
}
},
{
"UnnamedExpr": {
"Identifier": {
"value": "date_end",
"quote_style": null
}
}
},
{
"UnnamedExpr": {
"Identifier": {
"value": "index",
"quote_style": null
}
}
}
],
"into": null,
"from": [
{
"relation": {
"Table": {
"name": [
{
"value": "air_quality",
"quote_style": null
}
],
"alias": null,
"args": null,
"with_hints": [],
"version": null,
"with_ordinality": false,
"partitions": []
}
},
"joins": []
}
],
"lateral_views": [],
"prewhere": null,
"selection": {
"BinaryOp": {
"left": {
"BinaryOp": {
"left": {
"Identifier": {
"value": "date_start",
"quote_style": null
}
},
"op": "Eq",
"right": {
"Value": {
"SingleQuotedString": "2024-01-01"
}
}
}
},
"op": "And",
"right": {
"BinaryOp": {
"left": {
"Identifier": {
"value": "date_end",
"quote_style": null
}
},
"op": "Lt",
"right": {
"Value": {
"SingleQuotedString": "2024-01-02"
}
}
}
}
}
},
"group_by": {
"Expressions": [
[],
[]
]
},
"cluster_by": [],
"distribute_by": [],
"sort_by": [],
"having": null,
"named_window": [],
"qualify": null,
"window_before_qualify": false,
"value_table_mode": null,
"connect_by": null
}
}
Projection: air_quality.station_id, air_quality.date_start, air_quality.date_end, air_quality.index
Filter: air_quality.date_start >= Utf8("2024-01-01") AND air_quality.date_end < Utf8("2024-01-02")
TableScan: air_quality
{
"Select": {
"distinct": null,
"top": null,
"projection": [
{
"UnnamedExpr": {
"Identifier": {
"value": "station_id",
"quote_style": null
}
}
},
{
"UnnamedExpr": {
"Identifier": {
"value": "date_start",
"quote_style": null
}
}
},
{
"UnnamedExpr": {
"Identifier": {
"value": "date_end",
"quote_style": null
}
}
},
{
"UnnamedExpr": {
"Identifier": {
"value": "index",
"quote_style": null
}
}
}
],
"into": null,
"from": [
{
"relation": {
"Table": {
"name": [
{
"value": "air_quality",
"quote_style": null
}
],
"alias": null,
"args": null,
"with_hints": [],
"version": null,
"with_ordinality": false,
"partitions": []
}
},
"joins": []
}
],
"lateral_views": [],
"prewhere": null,
"selection": {
"BinaryOp": {
"left": {
"BinaryOp": {
"left": {
"Identifier": {
"value": "date_start",
"quote_style": null
}
},
"op": "Eq",
"right": {
"Value": {
"SingleQuotedString": "2024-01-01"
}
}
}
},
"op": "And",
"right": {
"BinaryOp": {
"left": {
"Identifier": {
"value": "date_end",
"quote_style": null
}
},
"op": "Lt",
"right": {
"Value": {
"SingleQuotedString": "2024-01-02"
}
}
}
}
}
},
"group_by": {
"Expressions": [
[],
[]
]
},
"cluster_by": [],
"distribute_by": [],
"sort_by": [],
"having": null,
"named_window": [],
"qualify": null,
"window_before_qualify": false,
"value_table_mode": null,
"connect_by": null
}
}
Projection: air_quality.station_id, air_quality.date_start, air_quality.date_end, air_quality.index
Filter: air_quality.date_start >= Utf8("2024-01-01") AND air_quality.date_end < Utf8("2024-01-02")
TableScan: air_quality
Projection: air_quality.station_id, air_quality.date_start, air_quality.date_end, air_quality.index
Filter: (
(
CAST(air_quality.date_start AS Timestamp(Nanosecond, Some("CET")))
>= CAST(Utf8("2024-01-01")) AS Timestamp(Nanosecond, Some("CET"))
) AND (
CAST(air_quality.date_end AS Timestamp(Nanosecond, Some("CET")))
< CAST(Utf8("2024-01-02") AS Timestamp(Nanosecond, Some("CET")))
)
)
TableScan: air_quality
Projection: air_quality.station_id, air_quality.date_start, air_quality.date_end, air_quality.index
Filter: (
(
CAST(air_quality.date_start AS Timestamp(Nanosecond, Some("CET")))
>= CAST(Utf8("2024-01-01")) AS Timestamp(Nanosecond, Some("CET"))
) AND (
CAST(air_quality.date_end AS Timestamp(Nanosecond, Some("CET")))
< CAST(Utf8("2024-01-02") AS Timestamp(Nanosecond, Some("CET")))
)
)
TableScan: air_quality
Projection: air_quality.station_id, air_quality.date_start, air_quality.date_end, air_quality.index
Filter: (
(
CAST(air_quality.date_start AS Timestamp(Nanosecond, Some("CET")))
>= TimestampNanosecond(1704063600000000000, Some("CET"))
) AND (
CAST(air_quality.date_end AS Timestamp(Nanosecond, Some("CET")))
< TimestampNanosecond(1704150000000000000, Some("CET"))
)
)
TableScan: air_quality
Projection: air_quality.station_id, air_quality.date_start, air_quality.date_end, air_quality.index
Filter: (
(
CAST(air_quality.date_start AS Timestamp(Nanosecond, Some("CET")))
>= TimestampNanosecond(1704063600000000000, Some("CET"))
) AND (
CAST(air_quality.date_end AS Timestamp(Nanosecond, Some("CET")))
< TimestampNanosecond(1704150000000000000, Some("CET"))
)
)
TableScan: air_quality
Projection: air_quality.station_id, air_quality.date_start, air_quality.date_end, air_quality.index
Filter: (
(
air_quality.date_start >= TimestampSecond(1704063600, Some("CET"))
) AND (
air_quality.date_end < TimestampSecond(1704150000, Some("CET"))
)
)
TableScan: air_quality
Projection: air_quality.station_id, air_quality.date_start, air_quality.date_end, air_quality.index
Filter: (
(
air_quality.date_start >= TimestampSecond(1704063600, Some("CET"))
) AND (
air_quality.date_end < TimestampSecond(1704150000, Some("CET"))
)
)
TableScan: air_quality
Projection: air_quality.station_id, air_quality.date_start, air_quality.date_end, air_quality.index
Filter: (
(
air_quality.date_start >= TimestampSecond(1704063600, Some("CET"))
) AND (
air_quality.date_end < TimestampSecond(1704150000, Some("CET"))
)
)
TableScan: air_quality, partial_filters=[
air_quality.date_start >= TimestampSecond(1704063600, Some("CET")),
air_quality.date_end < TimestampSecond(1704150000, Some("CET")),
]
Projection: air_quality.station_id, air_quality.date_start, air_quality.date_end, air_quality.index
Filter: (
(
air_quality.date_start >= TimestampSecond(1704063600, Some("CET"))
) AND (
air_quality.date_end < TimestampSecond(1704150000, Some("CET"))
)
)
TableScan: air_quality, partial_filters=[
air_quality.date_start >= TimestampSecond(1704063600, Some("CET")),
air_quality.date_end < TimestampSecond(1704150000, Some("CET")),
]
Filter: (
(
air_quality.date_start >= TimestampSecond(1704063600, Some("CET"))
) AND (
air_quality.date_end < TimestampSecond(1704150000, Some("CET"))
)
)
TableScan: air_quality projection=[station_id, date_start, date_end, index], partial_filters=[
air_quality.date_start >= TimestampSecond(1704063600, Some("CET")),
air_quality.date_end < TimestampSecond(1704150000, Some("CET")),
]
Filter: (
(
air_quality.date_start >= TimestampSecond(1704063600, Some("CET"))
) AND (
air_quality.date_end < TimestampSecond(1704150000, Some("CET"))
)
)
TableScan: air_quality projection=[station_id, date_start, date_end, index], partial_filters=[
air_quality.date_start >= TimestampSecond(1704063600, Some("CET")),
air_quality.date_end < TimestampSecond(1704150000, Some("CET")),
]
FilterExec: date_start@1 >= 1704063600 AND date_end@2 < 1704150000
StreamingTableExec: partition_sizes=1, projection=[station_id, date_start, date_end, index]
FilterExec: date_start@1 >= 1704063600 AND date_end@2 < 1704150000
StreamingTableExec: partition_sizes=1, projection=[station_id, date_start, date_end, index]
OutputRequirementExec
FilterExec: date_start@1 >= 1704063600 AND date_end@2 < 1704150000
StreamingTableExec: partition_sizes=1, projection=[station_id, date_start, date_end, index]
OutputRequirementExec
FilterExec: date_start@1 >= 1704063600 AND date_end@2 < 1704150000
StreamingTableExec: partition_sizes=1, projection=[station_id, date_start, date_end, index]
OutputRequirementExec
FilterExec: date_start@1 >= 1704063600 AND date_end@2 < 1704150000
RepartitionExec: partitioning=RoundRobinBatch(16), input_partitions=1
StreamingTableExec: partition_sizes=1, projection=[station_id, date_start, date_end, index]
OutputRequirementExec
FilterExec: date_start@1 >= 1704063600 AND date_end@2 < 1704150000
RepartitionExec: partitioning=RoundRobinBatch(16), input_partitions=1
StreamingTableExec: partition_sizes=1, projection=[station_id, date_start, date_end, index]
OutputRequirementExec
CoalesceBatchesExec: target_batch_size=8192
FilterExec: date_start@1 >= 1704063600 AND date_end@2 < 1704150000
RepartitionExec: partitioning=RoundRobinBatch(16), input_partitions=1
StreamingTableExec: partition_sizes=1, projection=[station_id, date_start, date_end, index]
OutputRequirementExec
CoalesceBatchesExec: target_batch_size=8192
FilterExec: date_start@1 >= 1704063600 AND date_end@2 < 1704150000
RepartitionExec: partitioning=RoundRobinBatch(16), input_partitions=1
StreamingTableExec: partition_sizes=1, projection=[station_id, date_start, date_end, index]
CoalesceBatchesExec: target_batch_size=8192
FilterExec: date_start@1 >= 1704063600 AND date_end@2 < 1704150000
RepartitionExec: partitioning=RoundRobinBatch(16), input_partitions=1
StreamingTableExec: partition_sizes=1, projection=[station_id, date_start, date_end, index]
Show that it works.
What else could we do?
ScalarUDF: map 0, 1, or many input columns or scalars to one output column
AggregateUDF: custom aggregation function
WindowUDF: custom SQL window function
the DataFusion API user, NOT the end user
DataFusion can read and write:
Or add your own.
Hook into:
The
Database
Apache FlightSQL
REST Queries
Read&Write Transaction
REST Insertion
Stream Insertion
Large Batch Insertion
User Auth
Entity Management
Quotas
Observability
Self-protection
Data Compaction
(Auto) Scaling
Works in Cloud
PostgreSQL Wire Protocol