Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

fix(duckdb): return null typed pyarrow arrays and disable creating tables with all null columns in duckdb #9810

Open
wants to merge 11 commits into
base: main
Choose a base branch
from
Open
6 changes: 6 additions & 0 deletions ibis/backends/clickhouse/__init__.py
Original file line number Diff line number Diff line change
Expand Up @@ -247,6 +247,12 @@
n += 1
if not (schema := obj.schema):
raise TypeError(f"Schema is empty for external table {name}")
if null_fields := schema.null_fields:
raise com.IbisTypeError(

Check warning on line 251 in ibis/backends/clickhouse/__init__.py

View check run for this annotation

Codecov / codecov/patch

ibis/backends/clickhouse/__init__.py#L251

Added line #L251 was not covered by tests
"ClickHouse doesn't support NULL-typed fields. "
"Consider assigning a type through casting or on construction. "
f"Got null typed fields: {null_fields}"
)

structure = [
f"{name} {type_mapper.to_string(typ.copy(nullable=not typ.is_nested()))}"
Expand Down
25 changes: 16 additions & 9 deletions ibis/backends/duckdb/__init__.py
Original file line number Diff line number Diff line change
Expand Up @@ -27,7 +27,7 @@
import ibis.expr.types as ir
from ibis import util
from ibis.backends import CanCreateDatabase, UrlFromPath
from ibis.backends.duckdb.converter import DuckDBPandasData
from ibis.backends.duckdb.converter import DuckDBPandasData, DuckDBPyArrowData
from ibis.backends.sql import SQLBackend
from ibis.backends.sql.compilers.base import STAR, AlterTable, C, RenameTable
from ibis.common.dispatch import lazy_singledispatch
Expand Down Expand Up @@ -150,8 +150,6 @@ def create_table(

if obj is None and schema is None:
raise ValueError("Either `obj` or `schema` must be specified")
if schema is not None:
schema = ibis.schema(schema)

quoted = self.compiler.quoted
dialect = self.dialect
Expand All @@ -174,16 +172,25 @@ def create_table(
else:
query = None

if schema is None:
schema = table.schema()
else:
schema = ibis.schema(schema)

if null_fields := schema.null_fields:
raise exc.IbisTypeError(
"DuckDB does not support creating tables with NULL typed columns. "
"Ensure that every column has non-NULL type. "
f"NULL columns: {null_fields}"
)

if overwrite:
temp_name = util.gen_name("duckdb_table")
else:
temp_name = name

initial_table = sg.table(temp_name, catalog=catalog, db=database, quoted=quoted)
target = sge.Schema(
this=initial_table,
expressions=(schema or table.schema()).to_sqlglot(dialect),
)
target = sge.Schema(this=initial_table, expressions=schema.to_sqlglot(dialect))

create_stmt = sge.Create(
kind="TABLE",
Expand Down Expand Up @@ -254,7 +261,7 @@ def table(self, name: str, database: str | None = None) -> ir.Table:

table_schema = self.get_schema(name, catalog=catalog, database=database)
# load geospatial only if geo columns
if any(typ.is_geospatial() for typ in table_schema.types):
if table_schema.geospatial:
self.load_extension("spatial")
return ops.DatabaseTable(
name,
Expand Down Expand Up @@ -1419,7 +1426,7 @@ def to_pyarrow(
**_: Any,
) -> pa.Table:
table = self._to_duckdb_relation(expr, params=params, limit=limit).arrow()
return expr.__pyarrow_result__(table)
return expr.__pyarrow_result__(table, data_mapper=DuckDBPyArrowData)

def execute(
self,
Expand Down
22 changes: 22 additions & 0 deletions ibis/backends/duckdb/converter.py
Original file line number Diff line number Diff line change
@@ -1,9 +1,31 @@
from __future__ import annotations

from typing import TYPE_CHECKING

import pyarrow as pa

from ibis.formats.pandas import PandasData
from ibis.formats.pyarrow import PyArrowData

if TYPE_CHECKING:
import ibis.expr.datatypes as dt


class DuckDBPandasData(PandasData):
    """Pandas result conversions specialized for the DuckDB backend."""

    @staticmethod
    def convert_Array(s, dtype, pandas_type):
        # DuckDB surfaces missing array elements as float NaN; normalize
        # them to None so array values round-trip as proper nulls.
        cleaned = s.replace(float("nan"), None)
        return cleaned


class DuckDBPyArrowData(PyArrowData):
    """PyArrow result conversions specialized for the DuckDB backend.

    Null-typed values are mapped to genuinely null-typed pyarrow objects
    instead of being delegated to the generic conversion machinery.
    """

    @classmethod
    def convert_scalar(cls, scalar: pa.Scalar, dtype: dt.DataType) -> pa.Scalar:
        if not dtype.is_null():
            return super().convert_scalar(scalar, dtype)
        return pa.scalar(None)

    @classmethod
    def convert_column(cls, column: pa.Array, dtype: dt.DataType) -> pa.Array:
        if not dtype.is_null():
            return super().convert_column(column, dtype)
        return pa.nulls(len(column))
22 changes: 22 additions & 0 deletions ibis/backends/duckdb/tests/test_client.py
Original file line number Diff line number Diff line change
Expand Up @@ -11,6 +11,7 @@
from pytest import param

import ibis
import ibis.common.exceptions as com
import ibis.expr.datatypes as dt
from ibis.conftest import LINUX, SANDBOXED, not_windows
from ibis.util import gen_name
Expand Down Expand Up @@ -420,3 +421,24 @@ def test_memtable_doesnt_leak(con, monkeypatch):
df = ibis.memtable({"a": [1, 2, 3]}, name=name).execute()
assert name not in con.list_tables()
assert len(df) == 3


def test_create_table_with_nulls(con):
    """Creating a DuckDB table with a NULL-typed column must be rejected."""
    t = ibis.memtable({"a": [None]})
    schema = t.schema()

    assert schema == ibis.schema({"a": "null"})
    assert schema.null_fields == ("a",)

    name = gen_name("duckdb_all_nulls")
    match = "NULL typed columns"

    # Every way of supplying the table definition should fail identically.
    for kwargs in ({"obj": t}, {"obj": t, "schema": schema}, {"schema": schema}):
        with pytest.raises(com.IbisTypeError, match=match):
            con.create_table(name, **kwargs)
2 changes: 1 addition & 1 deletion ibis/backends/exasol/__init__.py
Original file line number Diff line number Diff line change
Expand Up @@ -278,7 +278,7 @@ def _get_schema_using_query(self, query: str) -> sch.Schema:

def _register_in_memory_table(self, op: ops.InMemoryTable) -> None:
schema = op.schema
if null_columns := [col for col, dtype in schema.items() if dtype.is_null()]:
if null_columns := schema.null_fields:
raise com.IbisTypeError(
"Exasol cannot yet reliably handle `null` typed columns; "
f"got null typed columns: {null_columns}"
Expand Down
17 changes: 13 additions & 4 deletions ibis/backends/flink/__init__.py
Original file line number Diff line number Diff line change
Expand Up @@ -369,13 +369,22 @@
expr, params=params, pretty=pretty
) # Discard `limit` and other kwargs.

def _register_in_memory_table(self, op: ops.InMemoryTable) -> None:
    """Materialize an in-memory table as a temporary Flink view.

    Raises an ``IbisTypeError`` when the memtable schema contains
    NULL-typed columns, which this backend cannot reliably handle.
    """
    null_columns = op.schema.null_fields
    if null_columns:
        raise exc.IbisTypeError(
            f"{self.name} cannot yet reliably handle `null` typed columns; "
            f"got null typed columns: {null_columns}"
        )
    self.create_view(op.name, op.data.to_frame(), schema=op.schema, temp=True)

Check warning on line 378 in ibis/backends/flink/__init__.py

View check run for this annotation

Codecov / codecov/patch

ibis/backends/flink/__init__.py#L378

Added line #L378 was not covered by tests

def _finalize_memtable(self, name: str) -> None:
self.drop_view(name, temp=True, force=True)

Check warning on line 381 in ibis/backends/flink/__init__.py

View check run for this annotation

Codecov / codecov/patch

ibis/backends/flink/__init__.py#L381

Added line #L381 was not covered by tests

def execute(self, expr: ir.Expr, **kwargs: Any) -> Any:
    """Execute an expression."""
    self._verify_in_memory_tables_are_unique(expr)
    self._register_udfs(expr)
    self._run_pre_execute_hooks(expr)

    # Compile the expression as a table query, run it through the Flink
    # table environment, and convert the result back to pandas.
    frame = self._table_env.sql_query(
        self.compile(expr.as_table(), **kwargs)
    ).to_pandas()
    return expr.__pandas_result__(frame)
Expand Down
2 changes: 1 addition & 1 deletion ibis/backends/impala/__init__.py
Original file line number Diff line number Diff line change
Expand Up @@ -1225,7 +1225,7 @@ def explain(

def _register_in_memory_table(self, op: ops.InMemoryTable) -> None:
schema = op.schema
if null_columns := [col for col, dtype in schema.items() if dtype.is_null()]:
if null_columns := schema.null_fields:
raise com.IbisTypeError(
"Impala cannot yet reliably handle `null` typed columns; "
f"got null typed columns: {null_columns}"
Expand Down
2 changes: 1 addition & 1 deletion ibis/backends/mssql/__init__.py
Original file line number Diff line number Diff line change
Expand Up @@ -751,7 +751,7 @@ def create_table(

def _register_in_memory_table(self, op: ops.InMemoryTable) -> None:
schema = op.schema
if null_columns := [col for col, dtype in schema.items() if dtype.is_null()]:
if null_columns := schema.null_fields:
raise com.IbisTypeError(
"MS SQL cannot yet reliably handle `null` typed columns; "
f"got null typed columns: {null_columns}"
Expand Down
2 changes: 1 addition & 1 deletion ibis/backends/mysql/__init__.py
Original file line number Diff line number Diff line change
Expand Up @@ -462,7 +462,7 @@ def create_table(

def _register_in_memory_table(self, op: ops.InMemoryTable) -> None:
schema = op.schema
if null_columns := [col for col, dtype in schema.items() if dtype.is_null()]:
if null_columns := schema.null_fields:
raise com.IbisTypeError(
"MySQL cannot yet reliably handle `null` typed columns; "
f"got null typed columns: {null_columns}"
Expand Down
5 changes: 5 additions & 0 deletions ibis/backends/oracle/__init__.py
Original file line number Diff line number Diff line change
Expand Up @@ -509,6 +509,11 @@

def _register_in_memory_table(self, op: ops.InMemoryTable) -> None:
schema = op.schema
if null_columns := schema.null_fields:
raise exc.IbisTypeError(

Check warning on line 513 in ibis/backends/oracle/__init__.py

View check run for this annotation

Codecov / codecov/patch

ibis/backends/oracle/__init__.py#L513

Added line #L513 was not covered by tests
f"{self.name} cannot yet reliably handle `null` typed columns; "
f"got null typed columns: {null_columns}"
)

name = op.name
quoted = self.compiler.quoted
Expand Down
2 changes: 1 addition & 1 deletion ibis/backends/postgres/__init__.py
Original file line number Diff line number Diff line change
Expand Up @@ -93,7 +93,7 @@ def _register_in_memory_table(self, op: ops.InMemoryTable) -> None:
from psycopg2.extras import execute_batch

schema = op.schema
if null_columns := [col for col, dtype in schema.items() if dtype.is_null()]:
if null_columns := schema.null_fields:
raise exc.IbisTypeError(
f"{self.name} cannot yet reliably handle `null` typed columns; "
f"got null typed columns: {null_columns}"
Expand Down
2 changes: 1 addition & 1 deletion ibis/backends/risingwave/__init__.py
Original file line number Diff line number Diff line change
Expand Up @@ -264,7 +264,7 @@ def create_table(

def _register_in_memory_table(self, op: ops.InMemoryTable) -> None:
schema = op.schema
if null_columns := [col for col, dtype in schema.items() if dtype.is_null()]:
if null_columns := schema.null_fields:
raise com.IbisTypeError(
f"{self.name} cannot yet reliably handle `null` typed columns; "
f"got null typed columns: {null_columns}"
Expand Down
18 changes: 10 additions & 8 deletions ibis/backends/tests/test_array.py
Original file line number Diff line number Diff line change
Expand Up @@ -940,6 +940,9 @@ def test_array_intersect(con, data):
@pytest.mark.notimpl(
["trino"], reason="inserting maps into structs doesn't work", raises=TrinoUserError
)
@pytest.mark.notyet(
["flink"], raises=ValueError, reason="array of struct is not supported"
)
def test_unnest_struct(con):
data = {"value": [[{"a": 1}, {"a": 2}], [{"a": 3}, {"a": 4}]]}
t = ibis.memtable(data, schema=ibis.schema({"value": "!array<!struct<a: !int>>"}))
Expand All @@ -959,8 +962,8 @@ def test_unnest_struct(con):
@pytest.mark.notimpl(
["trino"], reason="inserting maps into structs doesn't work", raises=TrinoUserError
)
@pytest.mark.notimpl(
["flink"], reason="flink unnests a and b as separate columns", raises=Py4JJavaError
@pytest.mark.notyet(
["flink"], raises=ValueError, reason="array of struct is not supported"
)
def test_unnest_struct_with_multiple_fields(con):
data = {
Expand Down Expand Up @@ -1051,9 +1054,7 @@ def test_zip_null(con, fn):
["trino"], reason="inserting maps into structs doesn't work", raises=TrinoUserError
)
@pytest.mark.notyet(
["flink"],
raises=Py4JJavaError,
reason="does not seem to support field selection on unnest",
["flink"], raises=ValueError, reason="array of struct is not supported"
)
def test_array_of_struct_unnest(con):
jobs = ibis.memtable(
Expand Down Expand Up @@ -1573,15 +1574,16 @@ def test_table_unnest_column_expr(backend):
assert set(result.values) == set(expected.replace({np.nan: None}).values)


@pytest.mark.notimpl(
["datafusion", "polars", "flink"], raises=com.OperationNotDefinedError
)
@pytest.mark.notimpl(["datafusion", "polars"], raises=com.OperationNotDefinedError)
@pytest.mark.notimpl(["trino"], raises=TrinoUserError)
@pytest.mark.notimpl(["postgres"], raises=PsycoPg2SyntaxError)
@pytest.mark.notimpl(["risingwave"], raises=PsycoPg2ProgrammingError)
@pytest.mark.notyet(
["risingwave"], raises=PsycoPg2InternalError, reason="not supported in risingwave"
)
@pytest.mark.notyet(
["flink"], raises=ValueError, reason="array of struct is not supported"
)
def test_table_unnest_array_of_struct_of_array(con):
t = ibis.memtable(
{
Expand Down
6 changes: 2 additions & 4 deletions ibis/backends/tests/test_client.py
Original file line number Diff line number Diff line change
Expand Up @@ -1720,9 +1720,7 @@ def test_insert_into_table_missing_columns(con, temp_table):

@pytest.mark.notyet(["druid"], raises=AssertionError, reason="can't drop tables")
@pytest.mark.notyet(
["clickhouse", "flink"],
raises=AssertionError,
reason="memtables are assembled every time",
["clickhouse"], raises=AssertionError, reason="memtables are assembled every time"
)
@pytest.mark.notyet(
["bigquery"], raises=AssertionError, reason="test is flaky", strict=False
Expand Down Expand Up @@ -1768,7 +1766,7 @@ def test_same_name_memtable_is_overwritten(con):


@pytest.mark.notimpl(
["clickhouse", "flink"],
["clickhouse"],
raises=AssertionError,
reason="backend doesn't use _register_in_memory_table",
)
Expand Down
42 changes: 42 additions & 0 deletions ibis/backends/tests/test_export.py
Original file line number Diff line number Diff line change
Expand Up @@ -7,6 +7,7 @@
from pytest import param

import ibis
import ibis.common.exceptions as com
import ibis.expr.datatypes as dt
from ibis import util
from ibis.backends.tests.errors import (
Expand All @@ -27,6 +28,7 @@

pd = pytest.importorskip("pandas")
pa = pytest.importorskip("pyarrow")
pat = pytest.importorskip("pyarrow.types")
Copy link
Contributor

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

It would be safe to just do import pyarrow.types as pat, right? If that gets us better IDE/typing support, I would advocate for that, even if it doesn't match the pa = pytest.importorskip("pyarrow") lines above.

Copy link
Contributor

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

but that's a nit, also fine keeping it as is


limit = [param(42, id="limit")]

Expand Down Expand Up @@ -593,4 +595,44 @@ def test_scalar_to_memory(limit, awards_players, output_format, converter):

expr = awards_players.filter(awards_players.awardID == "DEADBEEF").yearID.min()
res = method(expr)

assert converter(res) is None


# Backends that reject NULL-typed columns as input: each one raises
# IbisTypeError when handed a memtable containing a null-typed column.
# Shared by the all-null round-trip tests below.
mark_notyet_nulls = pytest.mark.notyet(
    [
        "clickhouse",
        "exasol",
        "flink",
        "impala",
        "mssql",
        "mysql",
        "oracle",
        "postgres",
        "risingwave",
        "trino",
    ],
    raises=com.IbisTypeError,
    reason="unable to handle null types as input",
)


@mark_notyet_nulls
def test_all_null_table(con):
    """A table whose only column is null-typed converts to a pyarrow null column."""
    memtable = ibis.memtable({"a": [None]})
    arrow_table = con.to_pyarrow(memtable)
    assert pat.is_null(arrow_table["a"].type)


@mark_notyet_nulls
def test_all_null_column(con):
    """A null-typed column expression converts to a null-typed pyarrow array."""
    memtable = ibis.memtable({"a": [None]})
    arrow_column = con.to_pyarrow(memtable.a)
    assert pat.is_null(arrow_column.type)


@mark_notyet_nulls
def test_all_null_scalar(con):
    """A bare null literal converts to a null-typed pyarrow scalar."""
    result = con.to_pyarrow(ibis.literal(None))
    assert pat.is_null(result.type)
2 changes: 1 addition & 1 deletion ibis/backends/trino/__init__.py
Original file line number Diff line number Diff line change
Expand Up @@ -550,7 +550,7 @@ def _fetch_from_cursor(self, cursor, schema: sch.Schema) -> pd.DataFrame:

def _register_in_memory_table(self, op: ops.InMemoryTable) -> None:
schema = op.schema
if null_columns := [col for col, dtype in schema.items() if dtype.is_null()]:
if null_columns := schema.null_fields:
raise com.IbisTypeError(
"Trino cannot yet reliably handle `null` typed columns; "
f"got null typed columns: {null_columns}"
Expand Down
4 changes: 4 additions & 0 deletions ibis/expr/schema.py
Original file line number Diff line number Diff line change
Expand Up @@ -70,6 +70,10 @@ def types(self):
def geospatial(self) -> tuple[str, ...]:
return tuple(name for name, typ in self.fields.items() if typ.is_geospatial())

@attribute
def null_fields(self) -> tuple[str, ...]:
    """Return the names of all top-level NULL-typed fields, in schema order.

    NOTE(review): nested types are not inspected — a struct, array, or map
    containing a null-typed component is not reported here; recursing into
    nested types is deferred to a follow-up.
    """
    names = []
    for name, typ in self.fields.items():
        if typ.is_null():
            names.append(name)
    return tuple(names)

@attribute
def _name_locs(self) -> dict[str, int]:
return {v: i for i, v in enumerate(self.names)}
Expand Down
6 changes: 6 additions & 0 deletions ibis/expr/tests/test_schema.py
Original file line number Diff line number Diff line change
Expand Up @@ -472,3 +472,9 @@ def test_schema_from_to_pandas_dtypes():
("d", pd.DatetimeTZDtype(tz="US/Eastern", unit="ns")),
]
assert restored_dtypes == expected_dtypes


def test_null_fields():
    """Schema.null_fields reports exactly the null-typed field names."""
    cases = [
        ({"a": "int64", "b": "string"}, ()),
        ({"a": "null", "b": "string"}, ("a",)),
        ({"a": "null", "b": "null"}, ("a", "b")),
    ]
    for spec, expected in cases:
        assert sch.schema(spec).null_fields == expected
Loading
Loading