Skip to content
Closed
Show file tree
Hide file tree
Changes from all commits
Commits
Show all changes
19 commits
Select commit Hold shift + click to select a range
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
35 changes: 33 additions & 2 deletions pandas/core/arrays/arrow/array.py
Original file line number Diff line number Diff line change
Expand Up @@ -60,7 +60,6 @@
pandas_dtype,
)
from pandas.core.dtypes.dtypes import DatetimeTZDtype
from pandas.core.dtypes.missing import isna

from pandas.core import (
algorithms as algos,
Expand Down Expand Up @@ -341,6 +340,8 @@ def _from_sequence_of_strings(
"""
Construct a new ExtensionArray from a sequence of strings.
"""
from pandas.core.dtypes.missing import isna

mask = isna(strings)

if isinstance(strings, cls):
Expand Down Expand Up @@ -539,6 +540,8 @@ def _box_pa_scalar(cls, value, pa_type: pa.DataType | None = None) -> pa.Scalar:
-------
pa.Scalar
"""
from pandas.core.dtypes.missing import isna

if isinstance(value, pa.Scalar):
pa_scalar = value
elif isna(value) and not (lib.is_float(value) and not is_nan_na()):
Expand Down Expand Up @@ -581,6 +584,8 @@ def _box_pa_array(
-------
pa.Array or pa.ChunkedArray
"""
from pandas.core.dtypes.missing import isna

value = extract_array(value, extract_numpy=True)
if isinstance(value, cls):
pa_array = value._pa_array
Expand Down Expand Up @@ -903,6 +908,8 @@ def __setstate__(self, state) -> None:
self.__dict__.update(state)

def _cmp_method(self, other, op) -> ArrowExtensionArray:
from pandas.core.dtypes.missing import isna

pc_func = ARROW_CMP_FUNCS[op.__name__]
ltype = self._pa_array.type

Expand Down Expand Up @@ -1112,6 +1119,8 @@ def __len__(self) -> int:
return len(self._pa_array)

def __contains__(self, key) -> bool:
from pandas.core.dtypes.missing import isna

# https://github.com/pandas-dev/pandas/pull/51307#issuecomment-1426372604
if isna(key) and key is not self.dtype.na_value:
if lib.is_float(key) and is_nan_na():
Expand Down Expand Up @@ -1418,7 +1427,23 @@ def isin(self, values: ArrayLike) -> npt.NDArray[np.bool_]:
if not len(values):
return np.zeros(len(self), dtype=bool)

result = pc.is_in(self._pa_array, value_set=pa.array(values))
value_set = pa.array(values, from_pandas=True)
# Fix for GH#63304
if pa.types.is_null(value_set.type):
has_pd_na = False
for x in values:
# GH#63304: Check for pd.NA (NAType) specifically
if is_pdna_or_none(x) and x is not None:
has_pd_na = True
break

if has_pd_na:
value_set = value_set.cast(self._pa_array.type)
# else: pass-through to pc.is_in below, ensuring
# legacy behavior (crash or otherwise) is preserved
# for non-pd.NA nulls (e.g. [None], [np.nan])

result = pc.is_in(self._pa_array, value_set=value_set)
# pyarrow 2.0.0 returned nulls, so we explicitly specify dtype to convert nulls
# to False
return np.array(result, dtype=np.bool_)
Expand Down Expand Up @@ -1599,6 +1624,8 @@ def take(
raise IndexError("out of bounds value in 'indices'.")

if allow_fill:
from pandas.core.dtypes.missing import isna

fill_mask = indices_array < 0
if fill_mask.any():
validate_indices(indices_array, len(self._pa_array))
Expand Down Expand Up @@ -1671,6 +1698,8 @@ def to_numpy(
copy: bool = False,
na_value: object = lib.no_default,
) -> np.ndarray:
from pandas.core.dtypes.missing import isna

original_na_value = na_value
dtype, na_value = to_numpy_dtype_inference(self, dtype, na_value, self._hasna)
pa_type = self._pa_array.type
Expand Down Expand Up @@ -2682,6 +2711,8 @@ def _apply_elementwise(self, func: Callable) -> list[list[Any]]:
]

def _convert_bool_result(self, result, na=lib.no_default, method_name=None):
from pandas.core.dtypes.missing import isna

if na is not lib.no_default and not isna(na): # pyright: ignore [reportGeneralTypeIssues]
result = result.fill_null(na)
return self._from_pyarrow_array(result)
Expand Down
47 changes: 47 additions & 0 deletions pandas/tests/indexing/test_gh63304.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,47 @@
import pytest

import pandas as pd
import pandas._testing as tm

# Skip this whole module unless pyarrow (>= 13.0.0) can be imported; the test
# below builds Index objects with "...[pyarrow]" dtypes.
pa = pytest.importorskip("pyarrow", minversion="13.0.0")


def test_drop_na_arrow_index():
    # GH#63304
    # Dropping pd.NA from a PyArrow-backed Index must not raise ArrowInvalid.

    # pd.NA is absent from the index: drop should raise KeyError,
    # but NOT ArrowInvalid. Covers integer, string and binary labels.
    absent_cases = (
        ([1, 2, 3], "int64[pyarrow]"),
        (["a", "b", "c"], "string[pyarrow]"),
        ([b"a", b"b", b"c"], "binary[pyarrow]"),
    )
    for labels, arrow_dtype in absent_cases:
        frame = pd.DataFrame(
            {"A": [1, 2, 3]}, index=pd.Index(labels, dtype=arrow_dtype)
        )
        with pytest.raises(KeyError, match="not found in axis"):
            frame.drop(index=[pd.NA])

    # pd.NA IS in the index: verify that the NA row is dropped and the
    # remaining row (and its Arrow dtype) is preserved.
    present_cases = (
        (1, "int64[pyarrow]"),
        ("a", "string[pyarrow]"),
    )
    for kept_label, arrow_dtype in present_cases:
        frame = pd.DataFrame(
            {"A": [1, 2]}, index=pd.Index([kept_label, pd.NA], dtype=arrow_dtype)
        )
        result = frame.drop(index=[pd.NA])
        expected = pd.DataFrame(
            {"A": [1]}, index=pd.Index([kept_label], dtype=arrow_dtype)
        )
        tm.assert_frame_equal(result, expected)
Loading