FIX: drop(index=[pd.NA]) fails for Arrow-backed index (#63304) #63933

New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Closed

dedlYTiTAN wants to merge 19 commits into pandas-dev:main from dedlYTiTAN:fix-gh63304-v3

pandas/core/arrays/arrow/array.py

-Original file line number
+Diff line change
@@ -60,7 +60,6 @@
         pandas_dtype,
     )
     from pandas.core.dtypes.dtypes import DatetimeTZDtype
-    from pandas.core.dtypes.missing import isna
     from pandas.core import (
         algorithms as algos,
@@ -341,6 +340,8 @@ def _from_sequence_of_strings(
             """
             Construct a new ExtensionArray from a sequence of strings.
             """
+            from pandas.core.dtypes.missing import isna
             mask = isna(strings)
             if isinstance(strings, cls):
@@ Expand Down Expand Up @@
             -------
             pa.Scalar
             """
+            from pandas.core.dtypes.missing import isna
             if isinstance(value, pa.Scalar):
                 pa_scalar = value
             elif isna(value) and not (lib.is_float(value) and not is_nan_na()):
@@ -581,6 +584,8 @@ def _box_pa_array(
             -------
             pa.Array or pa.ChunkedArray
             """
+            from pandas.core.dtypes.missing import isna
             value = extract_array(value, extract_numpy=True)
             if isinstance(value, cls):
                 pa_array = value._pa_array
@@ -903,6 +908,8 @@ def __setstate__(self, state) -> None:
             self.__dict__.update(state)
         def _cmp_method(self, other, op) -> ArrowExtensionArray:
+            from pandas.core.dtypes.missing import isna
             pc_func = ARROW_CMP_FUNCS[op.__name__]
             ltype = self._pa_array.type
@@ -1112,6 +1119,8 @@ def __len__(self) -> int:
             return len(self._pa_array)
         def __contains__(self, key) -> bool:
+            from pandas.core.dtypes.missing import isna
             # https://github.com/pandas-dev/pandas/pull/51307#issuecomment-1426372604
             if isna(key) and key is not self.dtype.na_value:
                 if lib.is_float(key) and is_nan_na():
@@ Expand Down Expand Up @@
             if not len(values):
                 return np.zeros(len(self), dtype=bool)
-            result = pc.is_in(self._pa_array, value_set=pa.array(values))
+            value_set = pa.array(values, from_pandas=True)
+            # Fix for GH#63304
+            if pa.types.is_null(value_set.type):
+                has_pd_na = False
+                for x in values:
+                    # GH#63304: Check for pd.NA (NAType) specifically
+                    if is_pdna_or_none(x) and x is not None:
+                        has_pd_na = True
+                        break
+                if has_pd_na:
+                    value_set = value_set.cast(self._pa_array.type)
+                # else: pass-through to pc.is_in below, ensuring
+                # legacy behavior (crash or otherwise) is preserved
+                # for non-pd.NA nulls (e.g. [None], [np.nan])
+            result = pc.is_in(self._pa_array, value_set=value_set)
             # pyarrow 2.0.0 returned nulls, so we explicitly specify dtype to convert nulls
             # to False
             return np.array(result, dtype=np.bool_)
@@ -1599,6 +1624,8 @@ def take(
                 raise IndexError("out of bounds value in 'indices'.")
             if allow_fill:
+                from pandas.core.dtypes.missing import isna
                 fill_mask = indices_array < 0
                 if fill_mask.any():
                     validate_indices(indices_array, len(self._pa_array))
@@ -1671,6 +1698,8 @@ def to_numpy(
             copy: bool = False,
             na_value: object = lib.no_default,
         ) -> np.ndarray:
+            from pandas.core.dtypes.missing import isna
             original_na_value = na_value
             dtype, na_value = to_numpy_dtype_inference(self, dtype, na_value, self._hasna)
             pa_type = self._pa_array.type
@@ Expand Down Expand Up @@
             ]
         def _convert_bool_result(self, result, na=lib.no_default, method_name=None):
+            from pandas.core.dtypes.missing import isna
             if na is not lib.no_default and not isna(na):  # pyright: ignore [reportGeneralTypeIssues]
                 result = result.fill_null(na)
             return self._from_pyarrow_array(result)
@@ Expand Down @@

pandas/tests/indexing/test_gh63304.py

-Original file line number
+Diff line change
@@ -0,0 +1,47 @@
+    import pytest
+    import pandas as pd
+    import pandas._testing as tm
+    pa = pytest.importorskip("pyarrow", minversion="13.0.0")
+    def test_drop_na_arrow_index():
+        # GH#63304: DataFrame.drop(index=[pd.NA]) on a PyArrow-backed index.
+        # A missing pd.NA label must raise KeyError, not pyarrow's ArrowInvalid.
+        # Case 1: int64[pyarrow] index that does not contain pd.NA.
+        df = pd.DataFrame(
+            {"A": [1, 2, 3]}, index=pd.Index([1, 2, 3], dtype="int64[pyarrow]")
+        )
+        # pd.NA is absent from the index, so drop must report the label as missing.
+        with pytest.raises(KeyError, match="not found in axis"):
+            df.drop(index=[pd.NA])
+        # Case 2: string[pyarrow] index that does not contain pd.NA.
+        df = pd.DataFrame(
+            {"A": [1, 2, 3]}, index=pd.Index(["a", "b", "c"], dtype="string[pyarrow]")
+        )
+        with pytest.raises(KeyError, match="not found in axis"):
+            df.drop(index=[pd.NA])
+        # Case 3: binary[pyarrow] index that does not contain pd.NA.
+        df = pd.DataFrame(
+            {"A": [1, 2, 3]},
+            index=pd.Index([b"a", b"b", b"c"], dtype="binary[pyarrow]"),
+        )
+        with pytest.raises(KeyError, match="not found in axis"):
+            df.drop(index=[pd.NA])
+        # Cases 4-5: pd.NA IS in the index (int64, then string); its row is dropped.
+        df = pd.DataFrame({"A": [1, 2]}, index=pd.Index([1, pd.NA], dtype="int64[pyarrow]"))
+        result = df.drop(index=[pd.NA])
+        expected = pd.DataFrame({"A": [1]}, index=pd.Index([1], dtype="int64[pyarrow]"))
+        tm.assert_frame_equal(result, expected)
+        df = pd.DataFrame(
+            {"A": [1, 2]}, index=pd.Index(["a", pd.NA], dtype="string[pyarrow]")
+        )
+        result = df.drop(index=[pd.NA])
+        expected = pd.DataFrame({"A": [1]}, index=pd.Index(["a"], dtype="string[pyarrow]"))
+        tm.assert_frame_equal(result, expected)

Provide feedback

Saved searches

Use saved searches to filter your results more quickly

Uh oh!

FIX: drop(index=[pd.NA]) fails for Arrow-backed index (#63304) #63933

Uh oh!

Diff view

Diff view

There are no files selected for viewing

Uh oh!