Skip to content
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
22 changes: 22 additions & 0 deletions pandas/io/_util.py
Original file line number Diff line number Diff line change
Expand Up @@ -166,4 +166,26 @@ def _post_convert_dtypes(
# GH#44901 reraise to keep api consistent
raise ValueError(str(err)) from err

if (
not using_string_dtype()
and dtype != "str"
and (dtype_backend is lib.no_default or dtype_backend == "numpy")
):
# Convert any StringDtype columns back to object dtype (pyarrow always
# uses string dtype even when the infer_string option is False)
for col, dtype in zip(df.columns, df.dtypes, strict=True):
if isinstance(dtype, pd.StringDtype) and dtype.na_value is np.nan:
df[col] = df[col].astype("object").fillna(None)
if isinstance(dtype, pd.CategoricalDtype):
cat_dtype = dtype.categories.dtype
if (
isinstance(cat_dtype, pd.StringDtype)
and cat_dtype.na_value is np.nan
):
cat_dtype = pd.CategoricalDtype(
categories=dtype.categories.astype("object"),
ordered=dtype.ordered,
)
df[col] = df[col].astype(cat_dtype)

return df
11 changes: 10 additions & 1 deletion pandas/io/feather_format.py
Original file line number Diff line number Diff line change
Expand Up @@ -8,6 +8,8 @@
)
import warnings

import numpy as np

from pandas._config import using_string_dtype

from pandas._libs import lib
Expand All @@ -17,6 +19,7 @@
from pandas.util._validators import check_dtype_backend

from pandas.core.api import DataFrame
from pandas.core.arrays.string_ import StringDtype

from pandas.io._util import arrow_table_to_pandas
from pandas.io.common import get_handle
Expand Down Expand Up @@ -162,9 +165,15 @@ def read_feather(
Pandas4Warning,
)

return feather.read_feather(
df = feather.read_feather(
handles.handle, columns=columns, use_threads=bool(use_threads)
)
# Convert any StringDtype columns to object dtype (pyarrow always
# uses string dtype even when the infer_string option is False)
for col, dtype in zip(df.columns, df.dtypes, strict=True):
if isinstance(dtype, StringDtype) and dtype.na_value is np.nan:
df[col] = df[col].astype("object")
return df

pa_table = feather.read_table(
handles.handle, columns=columns, use_threads=bool(use_threads)
Expand Down
54 changes: 27 additions & 27 deletions pandas/tests/arrays/string_/test_string.py
Original file line number Diff line number Diff line change
Expand Up @@ -6,8 +6,6 @@
import numpy as np
import pytest

from pandas._config import using_string_dtype

from pandas.compat.pyarrow import pa_version_under19p0

from pandas.core.dtypes.common import is_dtype_equal
Expand Down Expand Up @@ -351,18 +349,16 @@ def test_arrow_roundtrip(dtype, string_storage, using_infer_string):
assert table.field("a").type == "large_string"
with pd.option_context("string_storage", string_storage):
result = table.to_pandas()
if dtype.na_value is np.nan and not using_infer_string:
assert result["a"].dtype == "object"
else:
assert isinstance(result["a"].dtype, pd.StringDtype)
expected = df.astype(pd.StringDtype(string_storage, na_value=dtype.na_value))
if using_infer_string:
expected.columns = expected.columns.astype(
pd.StringDtype(string_storage, na_value=np.nan)
)
tm.assert_frame_equal(result, expected)
# ensure the missing value is represented by NA and not np.nan or None
assert result.loc[2, "a"] is result["a"].dtype.na_value

assert isinstance(result["a"].dtype, pd.StringDtype)
expected = df.astype(pd.StringDtype(string_storage, na_value=dtype.na_value))
if using_infer_string:
expected.columns = expected.columns.astype(
pd.StringDtype(string_storage, na_value=np.nan)
)
tm.assert_frame_equal(result, expected)
# ensure the missing value is represented by NA and not np.nan or None
assert result.loc[2, "a"] is result["a"].dtype.na_value


@pytest.mark.filterwarnings("ignore:Passing a BlockManager:DeprecationWarning")
Expand All @@ -373,10 +369,17 @@ def test_arrow_from_string(using_infer_string):

result = table.to_pandas()

if using_infer_string and not pa_version_under19p0:
expected = pd.DataFrame({"a": ["a", "b", None]}, dtype="str")
else:
if not using_infer_string:
if pa_version_under19p0:
expected = pd.DataFrame({"a": ["a", "b", None]}, dtype="object")
else:
expected = pd.DataFrame(
{"a": ["a", "b", None]}, dtype=pd.StringDtype(na_value=np.nan)
)
elif pa_version_under19p0:
expected = pd.DataFrame({"a": ["a", "b", None]}, dtype="object")
else:
expected = pd.DataFrame({"a": ["a", "b", None]}, dtype="str")
tm.assert_frame_equal(result, expected)


Expand All @@ -397,16 +400,13 @@ def test_arrow_load_from_zero_chunks(dtype, string_storage, using_infer_string):
with pd.option_context("string_storage", string_storage):
result = table.to_pandas()

if dtype.na_value is np.nan and not using_string_dtype():
assert result["a"].dtype == "object"
else:
assert isinstance(result["a"].dtype, pd.StringDtype)
expected = df.astype(pd.StringDtype(string_storage, na_value=dtype.na_value))
if using_infer_string:
expected.columns = expected.columns.astype(
pd.StringDtype(string_storage, na_value=np.nan)
)
tm.assert_frame_equal(result, expected)
assert isinstance(result["a"].dtype, pd.StringDtype)
expected = df.astype(pd.StringDtype(string_storage, na_value=dtype.na_value))
if using_infer_string:
expected.columns = expected.columns.astype(
pd.StringDtype(string_storage, na_value=np.nan)
)
tm.assert_frame_equal(result, expected)


def test_value_counts_na(dtype):
Expand Down
5 changes: 4 additions & 1 deletion pandas/tests/frame/test_arrow_interface.py
Original file line number Diff line number Diff line change
@@ -1,5 +1,6 @@
import ctypes

import numpy as np
import pytest

import pandas.util._test_decorators as td
Expand Down Expand Up @@ -65,12 +66,14 @@ def __arrow_c_stream__(self, requested_schema=None):


@td.skip_if_no("pyarrow", min_version="14.0")
def test_dataframe_from_arrow():
def test_dataframe_from_arrow(using_infer_string):
# objects with __arrow_c_stream__
table = pa.table({"a": [1, 2, 3], "b": ["a", "b", "c"]})

result = pd.DataFrame.from_arrow(table)
expected = pd.DataFrame({"a": [1, 2, 3], "b": ["a", "b", "c"]})
if not using_infer_string:
expected["b"] = expected["b"].astype(pd.StringDtype(na_value=np.nan))
tm.assert_frame_equal(result, expected)

# not only pyarrow objects are supported
Expand Down
4 changes: 3 additions & 1 deletion pandas/tests/io/test_common.py
Original file line number Diff line number Diff line change
Expand Up @@ -138,7 +138,7 @@ def test_bytesiowrapper_returns_correct_bytes(self):
assert result == data.encode("utf-8")

# Test that pyarrow can handle a file opened with get_handle
def test_get_handle_pyarrow_compat(self):
def test_get_handle_pyarrow_compat(self, using_infer_string):
pa_csv = pytest.importorskip("pyarrow.csv")

# Test latin1, ucs-2, and ucs-4 chars
Expand All @@ -154,6 +154,8 @@ def test_get_handle_pyarrow_compat(self):
df = pa_csv.read_csv(handles.handle).to_pandas()
if pa_version_under19p0:
expected = expected.astype("object")
elif not using_infer_string:
expected = expected.astype(pd.StringDtype(na_value=np.nan))
tm.assert_frame_equal(df, expected)
assert not s.closed

Expand Down
Loading