pandas/io/_util.py

-Original file line number
+Diff line change
@@ Expand Up / @@ -166,4 +166,26 @@ def _post_convert_dtypes( @@
                 # GH#44901 reraise to keep api consistent
                 raise ValueError(str(err)) from err
+        if (
+            not using_string_dtype()
+            and dtype != "str"
+            and (dtype_backend is lib.no_default or dtype_backend == "numpy")
+        ):
+            # Convert any StringDtype columns back to object dtype (pyarrow always
+            # uses string dtype even when the infer_string option is False)
+            for col, dtype in zip(df.columns, df.dtypes, strict=True):
+                if isinstance(dtype, pd.StringDtype) and dtype.na_value is np.nan:
+                    df[col] = df[col].astype("object").fillna(None)
+                if isinstance(dtype, pd.CategoricalDtype):
+                    cat_dtype = dtype.categories.dtype
+                    if (
+                        isinstance(cat_dtype, pd.StringDtype)
+                        and cat_dtype.na_value is np.nan
+                    ):
+                        cat_dtype = pd.CategoricalDtype(
+                            categories=dtype.categories.astype("object"),
+                            ordered=dtype.ordered,
+                        )
+                        df[col] = df[col].astype(cat_dtype)
         return df

pandas/io/feather_format.py

-Original file line number
+Diff line change
@@ Expand Up / @@ -8,6 +8,8 @@ @@
     )
     import warnings
+    import numpy as np
     from pandas._config import using_string_dtype
     from pandas._libs import lib
@@ Expand All / @@ -17,6 +19,7 @@ @@
     from pandas.util._validators import check_dtype_backend
     from pandas.core.api import DataFrame
+    from pandas.core.arrays.string_ import StringDtype
     from pandas.io._util import arrow_table_to_pandas
     from pandas.io.common import get_handle
@@ Expand Down Expand Up / @@ -162,9 +165,15 @@ def read_feather( @@
                         Pandas4Warning,
                     )
-                    return feather.read_feather(
+                    df = feather.read_feather(
                         handles.handle, columns=columns, use_threads=bool(use_threads)
                     )
+                    # Convert any StringDtype columns to object dtype (pyarrow always
+                    # uses string dtype even when the infer_string option is False)
+                    for col, dtype in zip(df.columns, df.dtypes, strict=True):
+                        if isinstance(dtype, StringDtype) and dtype.na_value is np.nan:
+                            df[col] = df[col].astype("object")
+                    return df
             pa_table = feather.read_table(
                 handles.handle, columns=columns, use_threads=bool(use_threads)
@@ Expand Down @@

pandas/tests/arrays/string_/test_string.py

-Original file line number
+Diff line change
@@ Expand Up / @@ -6,8 +6,6 @@ @@
     import numpy as np
     import pytest
-    from pandas._config import using_string_dtype
     from pandas.compat.pyarrow import pa_version_under19p0
     from pandas.core.dtypes.common import is_dtype_equal
@@ Expand Down Expand Up @@
             assert table.field("a").type == "large_string"
         with pd.option_context("string_storage", string_storage):
             result = table.to_pandas()
-        if dtype.na_value is np.nan and not using_infer_string:
-            assert result["a"].dtype == "object"
-        else:
-            assert isinstance(result["a"].dtype, pd.StringDtype)
-            expected = df.astype(pd.StringDtype(string_storage, na_value=dtype.na_value))
-            if using_infer_string:
-                expected.columns = expected.columns.astype(
-                    pd.StringDtype(string_storage, na_value=np.nan)
-                )
-            tm.assert_frame_equal(result, expected)
-            # ensure the missing value is represented by NA and not np.nan or None
-            assert result.loc[2, "a"] is result["a"].dtype.na_value
+        assert isinstance(result["a"].dtype, pd.StringDtype)
+        expected = df.astype(pd.StringDtype(string_storage, na_value=dtype.na_value))
+        if using_infer_string:
+            expected.columns = expected.columns.astype(
+                pd.StringDtype(string_storage, na_value=np.nan)
+            )
+        tm.assert_frame_equal(result, expected)
+        # ensure the missing value is represented by NA and not np.nan or None
+        assert result.loc[2, "a"] is result["a"].dtype.na_value
     @pytest.mark.filterwarnings("ignore:Passing a BlockManager:DeprecationWarning")
@@ Expand All / @@ -373,10 +369,17 @@ def test_arrow_from_string(using_infer_string): @@
         result = table.to_pandas()
-        if using_infer_string and not pa_version_under19p0:
-            expected = pd.DataFrame({"a": ["a", "b", None]}, dtype="str")
-        else:
+        if not using_infer_string:
+            if pa_version_under19p0:
+                expected = pd.DataFrame({"a": ["a", "b", None]}, dtype="object")
+            else:
+                expected = pd.DataFrame(
+                    {"a": ["a", "b", None]}, dtype=pd.StringDtype(na_value=np.nan)
+                )
+        elif pa_version_under19p0:
             expected = pd.DataFrame({"a": ["a", "b", None]}, dtype="object")
+        else:
+            expected = pd.DataFrame({"a": ["a", "b", None]}, dtype="str")
         tm.assert_frame_equal(result, expected)
@@ Expand All @@
         with pd.option_context("string_storage", string_storage):
             result = table.to_pandas()
-        if dtype.na_value is np.nan and not using_string_dtype():
-            assert result["a"].dtype == "object"
-        else:
-            assert isinstance(result["a"].dtype, pd.StringDtype)
-            expected = df.astype(pd.StringDtype(string_storage, na_value=dtype.na_value))
-            if using_infer_string:
-                expected.columns = expected.columns.astype(
-                    pd.StringDtype(string_storage, na_value=np.nan)
-                )
-            tm.assert_frame_equal(result, expected)
+        assert isinstance(result["a"].dtype, pd.StringDtype)
+        expected = df.astype(pd.StringDtype(string_storage, na_value=dtype.na_value))
+        if using_infer_string:
+            expected.columns = expected.columns.astype(
+                pd.StringDtype(string_storage, na_value=np.nan)
+            )
+        tm.assert_frame_equal(result, expected)
     def test_value_counts_na(dtype):
@@ Expand Down @@

pandas/tests/frame/test_arrow_interface.py

-Original file line number
+Diff line change
@@ -1,5 +1,6 @@
     import ctypes
+    import numpy as np
     import pytest
     import pandas.util._test_decorators as td
@@ Expand Down Expand Up / @@ -65,12 +66,14 @@ def __arrow_c_stream__(self, requested_schema=None): @@
     @td.skip_if_no("pyarrow", min_version="14.0")
-    def test_dataframe_from_arrow():
+    def test_dataframe_from_arrow(using_infer_string):
         # objects with __arrow_c_stream__
         table = pa.table({"a": [1, 2, 3], "b": ["a", "b", "c"]})
         result = pd.DataFrame.from_arrow(table)
         expected = pd.DataFrame({"a": [1, 2, 3], "b": ["a", "b", "c"]})
+        if not using_infer_string:
+            expected["b"] = expected["b"].astype(pd.StringDtype(na_value=np.nan))
         tm.assert_frame_equal(result, expected)
         # not only pyarrow objects are supported
@@ Expand Down @@

pandas/tests/io/test_common.py

-Original file line number
+Diff line change
@@ Expand Up / @@ -138,7 +138,7 @@ def test_bytesiowrapper_returns_correct_bytes(self): @@
                 assert result == data.encode("utf-8")
         # Test that pyarrow can handle a file opened with get_handle
-        def test_get_handle_pyarrow_compat(self):
+        def test_get_handle_pyarrow_compat(self, using_infer_string):
             pa_csv = pytest.importorskip("pyarrow.csv")
             # Test latin1, ucs-2, and ucs-4 chars
@@ Expand All / @@ -154,6 +154,8 @@ def test_get_handle_pyarrow_compat(self): @@
                 df = pa_csv.read_csv(handles.handle).to_pandas()
                 if pa_version_under19p0:
                     expected = expected.astype("object")
+                elif not using_infer_string:
+                    expected = expected.astype(pd.StringDtype(na_value=np.nan))
                 tm.assert_frame_equal(df, expected)
                 assert not s.closed
@@ Expand Down @@

[backport 3.0.x] BUG: still use object dtype for pyarrow-backed IO methods with infer_string disabled (#63900) #63932

New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Merged

mroeschke merged 1 commit into pandas-dev:3.0.x from jorisvandenbossche:backport-63900

Jan 29, 2026

+66 −30

-Original file line number
+Diff line change
@@ Expand Up / @@ -166,4 +166,26 @@ def _post_convert_dtypes( @@
                 # GH#44901 reraise to keep api consistent
                 raise ValueError(str(err)) from err
+        if (
+            not using_string_dtype()
+            and dtype != "str"
+            and (dtype_backend is lib.no_default or dtype_backend == "numpy")
+        ):
+            # Convert any StringDtype columns back to object dtype (pyarrow always
+            # uses string dtype even when the infer_string option is False)
+            for col, dtype in zip(df.columns, df.dtypes, strict=True):
+                if isinstance(dtype, pd.StringDtype) and dtype.na_value is np.nan:
+                    df[col] = df[col].astype("object").fillna(None)
+                if isinstance(dtype, pd.CategoricalDtype):
+                    cat_dtype = dtype.categories.dtype
+                    if (
+                        isinstance(cat_dtype, pd.StringDtype)
+                        and cat_dtype.na_value is np.nan
+                    ):
+                        cat_dtype = pd.CategoricalDtype(
+                            categories=dtype.categories.astype("object"),
+                            ordered=dtype.ordered,
+                        )
+                        df[col] = df[col].astype(cat_dtype)
         return df

-Original file line number
+Diff line change
@@ Expand Up / @@ -8,6 +8,8 @@ @@
     )
     import warnings
+    import numpy as np
     from pandas._config import using_string_dtype
     from pandas._libs import lib
@@ Expand All / @@ -17,6 +19,7 @@ @@
     from pandas.util._validators import check_dtype_backend
     from pandas.core.api import DataFrame
+    from pandas.core.arrays.string_ import StringDtype
     from pandas.io._util import arrow_table_to_pandas
     from pandas.io.common import get_handle
@@ Expand Down Expand Up / @@ -162,9 +165,15 @@ def read_feather( @@
                         Pandas4Warning,
                     )
-                    return feather.read_feather(
+                    df = feather.read_feather(
                         handles.handle, columns=columns, use_threads=bool(use_threads)
                     )
+                    # Convert any StringDtype columns to object dtype (pyarrow always
+                    # uses string dtype even when the infer_string option is False)
+                    for col, dtype in zip(df.columns, df.dtypes, strict=True):
+                        if isinstance(dtype, StringDtype) and dtype.na_value is np.nan:
+                            df[col] = df[col].astype("object")
+                    return df
             pa_table = feather.read_table(
                 handles.handle, columns=columns, use_threads=bool(use_threads)
@@ Expand Down @@

-Original file line number
+Diff line change
@@ Expand Up / @@ -6,8 +6,6 @@ @@
     import numpy as np
     import pytest
-    from pandas._config import using_string_dtype
     from pandas.compat.pyarrow import pa_version_under19p0
     from pandas.core.dtypes.common import is_dtype_equal
@@ Expand Down Expand Up @@
             assert table.field("a").type == "large_string"
         with pd.option_context("string_storage", string_storage):
             result = table.to_pandas()
-        if dtype.na_value is np.nan and not using_infer_string:
-            assert result["a"].dtype == "object"
-        else:
-            assert isinstance(result["a"].dtype, pd.StringDtype)
-            expected = df.astype(pd.StringDtype(string_storage, na_value=dtype.na_value))
-            if using_infer_string:
-                expected.columns = expected.columns.astype(
-                    pd.StringDtype(string_storage, na_value=np.nan)
-                )
-            tm.assert_frame_equal(result, expected)
-            # ensure the missing value is represented by NA and not np.nan or None
-            assert result.loc[2, "a"] is result["a"].dtype.na_value
+        assert isinstance(result["a"].dtype, pd.StringDtype)
+        expected = df.astype(pd.StringDtype(string_storage, na_value=dtype.na_value))
+        if using_infer_string:
+            expected.columns = expected.columns.astype(
+                pd.StringDtype(string_storage, na_value=np.nan)
+            )
+        tm.assert_frame_equal(result, expected)
+        # ensure the missing value is represented by NA and not np.nan or None
+        assert result.loc[2, "a"] is result["a"].dtype.na_value
     @pytest.mark.filterwarnings("ignore:Passing a BlockManager:DeprecationWarning")
@@ Expand All / @@ -373,10 +369,17 @@ def test_arrow_from_string(using_infer_string): @@
         result = table.to_pandas()
-        if using_infer_string and not pa_version_under19p0:
-            expected = pd.DataFrame({"a": ["a", "b", None]}, dtype="str")
-        else:
+        if not using_infer_string:
+            if pa_version_under19p0:
+                expected = pd.DataFrame({"a": ["a", "b", None]}, dtype="object")
+            else:
+                expected = pd.DataFrame(
+                    {"a": ["a", "b", None]}, dtype=pd.StringDtype(na_value=np.nan)
+                )
+        elif pa_version_under19p0:
             expected = pd.DataFrame({"a": ["a", "b", None]}, dtype="object")
+        else:
+            expected = pd.DataFrame({"a": ["a", "b", None]}, dtype="str")
         tm.assert_frame_equal(result, expected)
@@ Expand All @@
         with pd.option_context("string_storage", string_storage):
             result = table.to_pandas()
-        if dtype.na_value is np.nan and not using_string_dtype():
-            assert result["a"].dtype == "object"
-        else:
-            assert isinstance(result["a"].dtype, pd.StringDtype)
-            expected = df.astype(pd.StringDtype(string_storage, na_value=dtype.na_value))
-            if using_infer_string:
-                expected.columns = expected.columns.astype(
-                    pd.StringDtype(string_storage, na_value=np.nan)
-                )
-            tm.assert_frame_equal(result, expected)
+        assert isinstance(result["a"].dtype, pd.StringDtype)
+        expected = df.astype(pd.StringDtype(string_storage, na_value=dtype.na_value))
+        if using_infer_string:
+            expected.columns = expected.columns.astype(
+                pd.StringDtype(string_storage, na_value=np.nan)
+            )
+        tm.assert_frame_equal(result, expected)
     def test_value_counts_na(dtype):
@@ Expand Down @@

-Original file line number
+Diff line change
@@ -1,5 +1,6 @@
     import ctypes
+    import numpy as np
     import pytest
     import pandas.util._test_decorators as td
@@ Expand Down Expand Up / @@ -65,12 +66,14 @@ def __arrow_c_stream__(self, requested_schema=None): @@
     @td.skip_if_no("pyarrow", min_version="14.0")
-    def test_dataframe_from_arrow():
+    def test_dataframe_from_arrow(using_infer_string):
         # objects with __arrow_c_stream__
         table = pa.table({"a": [1, 2, 3], "b": ["a", "b", "c"]})
         result = pd.DataFrame.from_arrow(table)
         expected = pd.DataFrame({"a": [1, 2, 3], "b": ["a", "b", "c"]})
+        if not using_infer_string:
+            expected["b"] = expected["b"].astype(pd.StringDtype(na_value=np.nan))
         tm.assert_frame_equal(result, expected)
         # not only pyarrow objects are supported
@@ Expand Down @@

-Original file line number
+Diff line change
@@ Expand Up / @@ -138,7 +138,7 @@ def test_bytesiowrapper_returns_correct_bytes(self): @@
                 assert result == data.encode("utf-8")
         # Test that pyarrow can handle a file opened with get_handle
-        def test_get_handle_pyarrow_compat(self):
+        def test_get_handle_pyarrow_compat(self, using_infer_string):
             pa_csv = pytest.importorskip("pyarrow.csv")
             # Test latin1, ucs-2, and ucs-4 chars
@@ Expand All / @@ -154,6 +154,8 @@ def test_get_handle_pyarrow_compat(self): @@
                 df = pa_csv.read_csv(handles.handle).to_pandas()
                 if pa_version_under19p0:
                     expected = expected.astype("object")
+                elif not using_infer_string:
+                    expected = expected.astype(pd.StringDtype(na_value=np.nan))
                 tm.assert_frame_equal(df, expected)
                 assert not s.closed
@@ Expand Down @@

Provide feedback

Saved searches

Use saved searches to filter your results more quickly

Uh oh!

[backport 3.0.x] BUG: still use object dtype for pyarrow-backed IO methods with infer_string disabled (#63900) #63932

Diff view

Diff view

There are no files selected for viewing

Uh oh!