pandas/core/arrays/arrow/array.py

-Original file line number
+Diff line change
@@ Expand Up @@
             if not len(values):
                 return np.zeros(len(self), dtype=bool)
-            result = pc.is_in(self._pa_array, value_set=pa.array(values))
+            value_set = pa.array(values, from_pandas=True)
+            if pa.types.is_null(value_set.type):
+                # GH#63304: If we have explicit pd.NA, we want to allow the comparison
+                # to return False (not found) rather than raising ArrowInvalid.
+                # However, we must not swallow other inputs that are also inferred as
+                # null (e.g. [np.nan]): logic elsewhere (e.g. in parsers) might rely
+                # on the resulting error to trigger its fallback.
+                has_pd_na = False
+                for x in values:
+                    # GH#63304: Check for pd.NA (NAType) specifically
+                    if isna(x) and not isinstance(x, (float, np.floating, type(None))):
+                        has_pd_na = True
+                        break
+                if has_pd_na:
+                    value_set = value_set.cast(self._pa_array.type)
+            result = pc.is_in(self._pa_array, value_set=value_set)
             # pyarrow 2.0.0 returned nulls, so we explicitly specify dtype to convert nulls
             # to False
             return np.array(result, dtype=np.bool_)
@@ Expand Down @@

pandas/tests/indexing/test_gh63304.py

-Original file line number
+Diff line change
@@ -0,0 +1,47 @@
+    import pytest
+    import pandas as pd
+    import pandas._testing as tm
+    pa = pytest.importorskip("pyarrow", minversion="13.0.0")
+    def test_drop_na_arrow_index():
+        # GH#63304: DataFrame.drop(index=[pd.NA]) on a PyArrow-backed Index.
+        # Dropping pd.NA from a PyArrow-backed Index must not raise ArrowInvalid.
+        # Integer index that does not contain pd.NA.
+        df = pd.DataFrame(
+            {"A": [1, 2, 3]}, index=pd.Index([1, 2, 3], dtype="int64[pyarrow]")
+        )
+        # pd.NA is absent from the index: expect KeyError, not ArrowInvalid.
+        with pytest.raises(KeyError, match="not found in axis"):
+            df.drop(index=[pd.NA])
+        # String index that does not contain pd.NA: same KeyError expected.
+        df = pd.DataFrame(
+            {"A": [1, 2, 3]}, index=pd.Index(["a", "b", "c"], dtype="string[pyarrow]")
+        )
+        with pytest.raises(KeyError, match="not found in axis"):
+            df.drop(index=[pd.NA])
+        # Binary index that does not contain pd.NA: same KeyError expected.
+        df = pd.DataFrame(
+            {"A": [1, 2, 3]},
+            index=pd.Index([b"a", b"b", b"c"], dtype="binary[pyarrow]"),
+        )
+        with pytest.raises(KeyError, match="not found in axis"):
+            df.drop(index=[pd.NA])
+        # pd.NA IS in the index (int64, then string): only that row is dropped.
+        df = pd.DataFrame({"A": [1, 2]}, index=pd.Index([1, pd.NA], dtype="int64[pyarrow]"))
+        result = df.drop(index=[pd.NA])
+        expected = pd.DataFrame({"A": [1]}, index=pd.Index([1], dtype="int64[pyarrow]"))
+        tm.assert_frame_equal(result, expected)
+        df = pd.DataFrame(
+            {"A": [1, 2]}, index=pd.Index(["a", pd.NA], dtype="string[pyarrow]")
+        )
+        result = df.drop(index=[pd.NA])
+        expected = pd.DataFrame({"A": [1]}, index=pd.Index(["a"], dtype="string[pyarrow]"))
+        tm.assert_frame_equal(result, expected)

BUG: Fix drop(index=[pd.NA]) for PyArrow-backed Index (GH#63304) #63910

New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account-related emails.

Already on GitHub? Sign in to your account

Closed

dedlYTiTAN wants to merge 8 commits into pandas-dev:main from dedlYTiTAN:fix-gh63304-arrow-drop-na-v2

+65 −1

-Original file line number
+Diff line change
@@ Expand Up @@
             if not len(values):
                 return np.zeros(len(self), dtype=bool)
-            result = pc.is_in(self._pa_array, value_set=pa.array(values))
+            value_set = pa.array(values, from_pandas=True)
+            if pa.types.is_null(value_set.type):
+                # GH#63304: If we have explicit pd.NA, we want to allow the comparison
+                # to return False (not found) rather than raising ArrowInvalid.
+                # However, we must not swallow other inputs that are also inferred as
+                # null (e.g. [np.nan]): logic elsewhere (e.g. in parsers) might rely
+                # on the resulting error to trigger its fallback.
+                has_pd_na = False
+                for x in values:
+                    # GH#63304: Check for pd.NA (NAType) specifically
+                    if isna(x) and not isinstance(x, (float, np.floating, type(None))):
+                        has_pd_na = True
+                        break
+                if has_pd_na:
+                    value_set = value_set.cast(self._pa_array.type)
+            result = pc.is_in(self._pa_array, value_set=value_set)
             # pyarrow 2.0.0 returned nulls, so we explicitly specify dtype to convert nulls
             # to False
             return np.array(result, dtype=np.bool_)
@@ Expand Down @@

-Original file line number
+Diff line change
@@ -0,0 +1,47 @@
+    import pytest
+    import pandas as pd
+    import pandas._testing as tm
+    pa = pytest.importorskip("pyarrow", minversion="13.0.0")
+    def test_drop_na_arrow_index():
+        # GH#63304: DataFrame.drop(index=[pd.NA]) on a PyArrow-backed Index.
+        # Dropping pd.NA from a PyArrow-backed Index must not raise ArrowInvalid.
+        # Integer index that does not contain pd.NA.
+        df = pd.DataFrame(
+            {"A": [1, 2, 3]}, index=pd.Index([1, 2, 3], dtype="int64[pyarrow]")
+        )
+        # pd.NA is absent from the index: expect KeyError, not ArrowInvalid.
+        with pytest.raises(KeyError, match="not found in axis"):
+            df.drop(index=[pd.NA])
+        # String index that does not contain pd.NA: same KeyError expected.
+        df = pd.DataFrame(
+            {"A": [1, 2, 3]}, index=pd.Index(["a", "b", "c"], dtype="string[pyarrow]")
+        )
+        with pytest.raises(KeyError, match="not found in axis"):
+            df.drop(index=[pd.NA])
+        # Binary index that does not contain pd.NA: same KeyError expected.
+        df = pd.DataFrame(
+            {"A": [1, 2, 3]},
+            index=pd.Index([b"a", b"b", b"c"], dtype="binary[pyarrow]"),
+        )
+        with pytest.raises(KeyError, match="not found in axis"):
+            df.drop(index=[pd.NA])
+        # pd.NA IS in the index (int64, then string): only that row is dropped.
+        df = pd.DataFrame({"A": [1, 2]}, index=pd.Index([1, pd.NA], dtype="int64[pyarrow]"))
+        result = df.drop(index=[pd.NA])
+        expected = pd.DataFrame({"A": [1]}, index=pd.Index([1], dtype="int64[pyarrow]"))
+        tm.assert_frame_equal(result, expected)
+        df = pd.DataFrame(
+            {"A": [1, 2]}, index=pd.Index(["a", pd.NA], dtype="string[pyarrow]")
+        )
+        result = df.drop(index=[pd.NA])
+        expected = pd.DataFrame({"A": [1]}, index=pd.Index(["a"], dtype="string[pyarrow]"))
+        tm.assert_frame_equal(result, expected)

Provide feedback

Saved searches

Use saved searches to filter your results more quickly

Uh oh!

BUG: Fix drop(index=[pd.NA]) for PyArrow-backed Index (GH#63304) #63910

Diff view

Diff view

There are no files selected for viewing

Uh oh!