diff --git a/pandas/core/arrays/arrow/array.py b/pandas/core/arrays/arrow/array.py
index 610c5f9f1cc33..2375e3e8e4df9 100644
--- a/pandas/core/arrays/arrow/array.py
+++ b/pandas/core/arrays/arrow/array.py
@@ -1418,7 +1418,23 @@ def isin(self, values: ArrayLike) -> npt.NDArray[np.bool_]:
         if not len(values):
             return np.zeros(len(self), dtype=bool)
 
-        result = pc.is_in(self._pa_array, value_set=pa.array(values))
+        value_set = pa.array(values, from_pandas=True)
+        if pa.types.is_null(value_set.type):
+            # GH#63304: If we have explicit pd.NA, we want to allow the comparison
+            # to return False (not found) rather than raising ArrowInvalid.
+            # However, we need to be careful not to swallow other types that might
+            # be inferred as null (e.g. [np.nan]) which logic elsewhere might rely
+            # on crashing to trigger fallback (e.g. in parsers).
+            # pd.NA is detected by exclusion: a missing value that is neither a
+            # float NaN nor None.
+            has_pd_na = any(
+                isna(x) and not isinstance(x, (float, np.floating, type(None)))
+                for x in values
+            )
+            if has_pd_na:
+                value_set = value_set.cast(self._pa_array.type)
+
+        result = pc.is_in(self._pa_array, value_set=value_set)
         # pyarrow 2.0.0 returned nulls, so we explicitly specify dtype to convert nulls
         # to False
         return np.array(result, dtype=np.bool_)
diff --git a/pandas/tests/indexing/test_gh63304.py b/pandas/tests/indexing/test_gh63304.py
new file mode 100644
index 0000000000000..bba6bd329daad
--- /dev/null
+++ b/pandas/tests/indexing/test_gh63304.py
@@ -0,0 +1,47 @@
+import pytest
+
+import pandas as pd
+import pandas._testing as tm
+
+pa = pytest.importorskip("pyarrow", minversion="13.0.0")
+
+
+def test_drop_na_arrow_index():
+    # GH#63304
+    # Test that dropping pd.NA from PyArrow-backed Index does not raise ArrowInvalid
+
+    # integer
+    df = pd.DataFrame(
+        {"A": [1, 2, 3]}, index=pd.Index([1, 2, 3], dtype="int64[pyarrow]")
+    )
+    # pd.NA is not in index, should raise KeyError, but NOT ArrowInvalid
+    with pytest.raises(KeyError, match="not found in axis"):
+        df.drop(index=[pd.NA])
+
+    # string
+    df = pd.DataFrame(
+        {"A": [1, 2, 3]}, index=pd.Index(["a", "b", "c"], dtype="string[pyarrow]")
+    )
+    with pytest.raises(KeyError, match="not found in axis"):
+        df.drop(index=[pd.NA])
+
+    # binary
+    df = pd.DataFrame(
+        {"A": [1, 2, 3]},
+        index=pd.Index([b"a", b"b", b"c"], dtype="binary[pyarrow]"),
+    )
+    with pytest.raises(KeyError, match="not found in axis"):
+        df.drop(index=[pd.NA])
+
+    # Case where NA IS in the index (should verify it drops correctly)
+    df = pd.DataFrame({"A": [1, 2]}, index=pd.Index([1, pd.NA], dtype="int64[pyarrow]"))
+    result = df.drop(index=[pd.NA])
+    expected = pd.DataFrame({"A": [1]}, index=pd.Index([1], dtype="int64[pyarrow]"))
+    tm.assert_frame_equal(result, expected)
+
+    df = pd.DataFrame(
+        {"A": [1, 2]}, index=pd.Index(["a", pd.NA], dtype="string[pyarrow]")
+    )
+    result = df.drop(index=[pd.NA])
+    expected = pd.DataFrame({"A": [1]}, index=pd.Index(["a"], dtype="string[pyarrow]"))
+    tm.assert_frame_equal(result, expected)