Skip to content
Closed
1 change: 1 addition & 0 deletions doc/source/getting_started/install.rst
Original file line number Diff line number Diff line change
Expand Up @@ -315,6 +315,7 @@ Dependency Minimum Version pip ex
`zlib <https://github.com/madler/zlib>`__ hdf5 Compression for HDF5
`fastparquet <https://github.com/dask/fastparquet>`__ 2024.11.0 - Parquet reading / writing (pyarrow is default)
`pyarrow <https://github.com/apache/arrow>`__ 13.0.0 parquet, feather Parquet, ORC, and feather reading / writing
`orjson <https://github.com/ijl/orjson>`__ 3.11.5 - Optional JSON parsing engine for ``read_json``
`PyIceberg <https://py.iceberg.apache.org/>`__ 0.8.1 iceberg Apache Iceberg reading / writing
`pyreadstat <https://github.com/Roche/pyreadstat>`__ 1.2.8 spss SPSS files (.sav) reading
`odfpy <https://github.com/eea/odfpy>`__ 1.4.1 excel Open document format (.odf, .ods, .odt) reading / writing
Expand Down
13 changes: 11 additions & 2 deletions doc/source/user_guide/io.rst
Original file line number Diff line number Diff line change
Expand Up @@ -1915,8 +1915,17 @@ is ``None``. To explicitly force ``Series`` parsing, pass ``typ=series``
* ``lines`` : reads file as one json object per line.
* ``encoding`` : The encoding to use to decode py3 bytes.
* ``chunksize`` : when used in combination with ``lines=True``, return a ``pandas.api.typing.JsonReader`` which reads in ``chunksize`` lines per iteration.
* ``engine``: Either ``"ujson"``, the built-in JSON parser, or ``"pyarrow"`` which dispatches to pyarrow's ``pyarrow.json.read_json``.
The ``"pyarrow"`` is only available when ``lines=True``
* ``engine`` : one of ``"ujson"``, ``"orjson"``, or ``"pyarrow"``; default ``"ujson"``

* ``"ujson"`` is the default built-in parser.
* ``"orjson"`` enables parsing of very large integer values that may fail
with ``"ujson"``. This engine requires the optional dependency
``orjson`` to be installed. Very large integers may be decoded as
floating point values, following ``orjson`` semantics. This engine is
  stricter about JSON compliance; in particular, unquoted ``NaN``,
  ``Infinity``, and ``-Infinity`` values are not supported.
* ``"pyarrow"`` dispatches to ``pyarrow.json.read_json`` and is only
available when ``lines=True``.

The parser will raise one of ``ValueError/TypeError/AssertionError`` if the JSON is not parseable.

Expand Down
2 changes: 2 additions & 0 deletions doc/source/whatsnew/v3.0.0.rst
Original file line number Diff line number Diff line change
Expand Up @@ -1369,6 +1369,8 @@ I/O
- Bug in :meth:`read_stata` where the missing code for double was not recognised for format versions 105 and prior (:issue:`58149`)
- Bug in :meth:`set_option` where setting the pandas option ``display.html.use_mathjax`` to ``False`` has no effect (:issue:`59884`)
- Bug in :meth:`to_excel` where :class:`MultiIndex` columns would be merged to a single row when ``merge_cells=False`` is passed (:issue:`60274`)
- Added ``engine="orjson"`` support to :func:`read_json`, allowing JSON files containing integers larger than 64-bit to be parsed without raising an
  overflow error; such integers are decoded as floating point values, following ``orjson`` semantics (:issue:`63572`)

Period
^^^^^^
Expand Down
2 changes: 1 addition & 1 deletion pandas/_typing.py
Original file line number Diff line number Diff line change
Expand Up @@ -410,7 +410,7 @@ def closed(self) -> bool:
CSVEngine: TypeAlias = Literal["c", "python", "pyarrow", "python-fwf"]

# read_json engines
JSONEngine: TypeAlias = Literal["ujson", "pyarrow"]
JSONEngine: TypeAlias = Literal["ujson", "orjson", "pyarrow"]

# read_xml parsers
XMLParsers: TypeAlias = Literal["lxml", "etree"]
Expand Down
90 changes: 78 additions & 12 deletions pandas/io/json/_json.py
Original file line number Diff line number Diff line change
Expand Up @@ -722,10 +722,18 @@ def read_json(

.. versionadded:: 2.0

engine : {{"ujson", "pyarrow"}}, default "ujson"
engine : {{"ujson", "orjson", "pyarrow"}}, default "ujson"
Parser engine to use. The ``"pyarrow"`` engine is only available when
``lines=True``.

``"orjson"`` enables parsing of very large integer values that may fail
with ``"ujson"``. Requires the optional dependency ``orjson`` to be
installed. Extremely large integers may be decoded as floating point
values, following orjson semantics.

This engine is stricter about JSON compliance. In particular, unquoted
``NaN``, ``Infinity``, and ``-Infinity`` values are not supported.

.. versionadded:: 2.0

Returns
Expand Down Expand Up @@ -808,6 +816,17 @@ def read_json(
index a b c d e
0 0 1 2.5 True a 1577.2
1 1 <NA> 4.5 False b 1577.1

Using the ``orjson`` engine with very large integers:

>>> from io import StringIO
>>> json_str = (
... '[{"composition":"Nb:100","n_atoms":128000,'
... '"space_group":229,"time":1000000000000000000000000}]'
... )
>>> pd.read_json(StringIO(json_str), engine="orjson") # doctest: +SKIP
composition n_atoms space_group time
0 Nb:100 128000 229 1.000000e+24
""" # noqa: E501
if orient == "table" and dtype:
raise ValueError("cannot pass both dtype and orient='table'")
Expand Down Expand Up @@ -904,7 +923,7 @@ def __init__(
self.handles: IOHandles[str] | None = None
self.dtype_backend = dtype_backend

if self.engine not in {"pyarrow", "ujson"}:
if self.engine not in {"pyarrow", "ujson", "orjson"}:
raise ValueError(
f"The engine type {self.engine} is currently not supported."
)
Expand All @@ -927,7 +946,14 @@ def __init__(
"the line-delimited JSON format"
)
self.data = filepath_or_buffer
elif self.engine == "ujson":
elif self.engine in ["ujson", "orjson"]:
# Fail early if orjson not installed
if self.engine == "orjson":
import_optional_dependency(
"orjson",
extra="orjson is required when engine='orjson' for read_json",
)

data = self._get_data_from_filepath(filepath_or_buffer)
# If self.chunksize, we prepare the data for the `__next__` method.
# Otherwise, we read it into memory for the `read` method.
Expand Down Expand Up @@ -987,6 +1013,8 @@ def read(self) -> DataFrame | Series:
obj = self._read_pyarrow()
elif self.engine == "ujson":
obj = self._read_ujson()
elif self.engine == "orjson":
obj = self._read_orjson()

return obj

Expand Down Expand Up @@ -1041,6 +1069,36 @@ def _read_ujson(self) -> DataFrame | Series:
else:
return obj

def _read_orjson(self) -> DataFrame | Series:
    """
    Read JSON using the orjson engine.

    Mirrors ``_read_ujson``: line-delimited input is handled according to
    ``chunksize``/``nrows``, then the parsed object is optionally converted
    to the requested ``dtype_backend``.
    """
    result: DataFrame | Series

    if not self.lines:
        # Whole document parsed in one shot.
        result = self._get_object_parser(self.data)
    elif self.chunksize:
        # Chunked reading: iterating self yields one parsed chunk per
        # ``chunksize`` lines; concat stitches them back together.
        result = concat(self)
    elif self.nrows:
        # Only the first ``nrows`` line-delimited records.
        head = list(islice(self.data, self.nrows))
        result = self._get_object_parser(self._combine_lines(head))
    else:
        # All line-delimited records, re-joined into a single JSON array.
        raw = ensure_str(self.data)
        result = self._get_object_parser(self._combine_lines(raw.split("\n")))

    if self.dtype_backend is lib.no_default:
        return result
    with option_context("future.distinguish_nan_and_na", False):
        return result.convert_dtypes(
            infer_objects=False,
            dtype_backend=self.dtype_backend,
        )

def _get_object_parser(self, json: str) -> DataFrame | Series:
"""
Parses a json document into a pandas object.
Expand All @@ -1056,6 +1114,7 @@ def _get_object_parser(self, json: str) -> DataFrame | Series:
"precise_float": self.precise_float,
"date_unit": self.date_unit,
"dtype_backend": self.dtype_backend,
"engine": self.engine,
}
if typ == "frame":
return FrameParser(json, **kwargs).parse()
Expand Down Expand Up @@ -1155,8 +1214,10 @@ def __init__(
precise_float: bool = False,
date_unit=None,
dtype_backend: DtypeBackend | lib.NoDefault = lib.no_default,
engine: str = "ujson",
) -> None:
self.json = json
self.engine = engine

if orient is None:
orient = self._default_orient
Expand Down Expand Up @@ -1380,7 +1441,11 @@ class SeriesParser(Parser):
_split_keys = ("name", "index", "data")

def _parse(self) -> Series:
data = ujson_loads(self.json, precise_float=self.precise_float)
if self.engine == "orjson":
orjson = import_optional_dependency("orjson")
data = orjson.loads(self.json)
else:
data = ujson_loads(self.json, precise_float=self.precise_float)

if self.orient == "split":
decoded = {str(k): v for k, v in data.items()}
Expand All @@ -1402,11 +1467,14 @@ def _parse(self) -> DataFrame:
json = self.json
orient = self.orient

if self.engine == "orjson":
orjson = import_optional_dependency("orjson")
loads = orjson.loads(json)
else:
loads = ujson_loads(json, precise_float=self.precise_float)

if orient == "split":
decoded = {
str(k): v
for k, v in ujson_loads(json, precise_float=self.precise_float).items()
}
decoded = {str(k): v for k, v in loads.items()}
self.check_keys_split(decoded)
orig_names = [
(tuple(col) if isinstance(col, list) else col)
Expand All @@ -1419,17 +1487,15 @@ def _parse(self) -> DataFrame:
return DataFrame(dtype=None, **decoded)
elif orient == "index":
return DataFrame.from_dict(
ujson_loads(json, precise_float=self.precise_float),
loads,
dtype=None,
orient="index",
)
elif orient == "table":
return parse_table_schema(json, precise_float=self.precise_float)
else:
# includes orient == "columns"
return DataFrame(
ujson_loads(json, precise_float=self.precise_float), dtype=None
)
return DataFrame(loads, dtype=None)

def _try_convert_types(self, obj: DataFrame) -> DataFrame:
arrays = []
Expand Down
75 changes: 75 additions & 0 deletions pandas/tests/io/json/test_orjson.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,75 @@
from io import StringIO

import pytest

from pandas import (
DataFrame,
Series,
read_json,
)
from pandas.testing import (
assert_frame_equal,
assert_series_equal,
)

pytest.importorskip("orjson")


class TestOrjson:
    """Tests for ``read_json`` with ``engine="orjson"``."""

    def test_read_json_very_large_integer(self):
        # "time" exceeds 64 bits; orjson decodes it as a float (1e24)
        # instead of raising an overflow error.
        payload = """
        [
            {
                "composition": "Nb:100",
                "n_atoms": 128000,
                "space_group": 229,
                "time": 1000000000000000000000000
            }
        ]
        """
        expected = DataFrame(
            {
                "composition": ["Nb:100"],
                "n_atoms": [128000],
                "space_group": [229],
                "time": [1e24],
            }
        )

        result = read_json(StringIO(payload), engine="orjson")

        assert_frame_equal(result, expected)

    def test_read_json_very_large_integer_series(self):
        # Same oversized-integer handling, but for typ="series".
        payload = """
        [
            {
                "value": 1000000000000000000000000
            }
        ]
        """
        expected = Series([{"value": 1e24}])

        result = read_json(StringIO(payload), engine="orjson", typ="series")

        assert_series_equal(result, expected)

    @pytest.mark.xfail(reason="orjson does not allow trailing commas")
    def test_read_json_trailing_comma_orjson(self):
        # orjson is stricter than ujson: a trailing comma is a parse error.
        payload = StringIO(
            """
            {
                "a": 1,
            }
            """
        )

        read_json(payload, engine="orjson")
Loading
Loading