Skip to content

Commit 5d6d7f0

Browse files
DEP: Deprecate support for abbreviations in decode_stream_data (#3617)
The inline image abbreviations were previously supported in the public-facing `decode_stream_data` function. Some analysis showed that this support was limited:

* Our own code always maps to the "proper" names beforehand, so this variant was possibly never tested and has been incomplete.
* Only abbreviations for the filter names were supported. Using `/F` instead of `/Filter` for the filter key itself, or specifying `/DP` instead of `/DecodeParms`, would already fail, as would all other abbreviated keys from tables 91 and 92 of the PDF 2.0 specification.

With this in mind, only external API users might have used this API in the full manner, although we never seem to have received specific reports about the unsupported/unmapped parameters. Thus, deprecating the abbreviations specific to inline images in this generic function seems like the best way to move forward.

While analyzing the current behavior, I wrote some basic inline image tests for the filter abbreviations CCF and DCT, which were previously unsupported (at least in `decode_stream_data`). The new abbreviations were added for completeness (although deprecated immediately); the tests show that using `PageObject.images` never errored out and does not run into the deprecation now, thanks to the proper mapping in our own code.
1 parent eb84ca5 commit 5d6d7f0

File tree

2 files changed

+132
-4
lines changed

2 files changed

+132
-4
lines changed

pypdf/filters.py

Lines changed: 20 additions & 3 deletions
Original file line numberDiff line numberDiff line change
@@ -50,6 +50,7 @@
5050
from ._codecs._codecs import LzwCodec as _LzwCodec
5151
from ._utils import (
5252
WHITESPACES_AS_BYTES,
53+
deprecate,
5354
deprecation_with_replacement,
5455
logger_warning,
5556
)
@@ -753,7 +754,16 @@ def _is_binary_compatible() -> bool:
753754
return Version(version) >= Version("0.19")
754755

755756

756-
def decode_stream_data(stream: Any) -> bytes:
757+
def _deprecate_inline_image_filters(filter_name: str, old_name: str, new_name: str) -> None:
758+
if filter_name != old_name:
759+
return
760+
deprecate(
761+
f"The filter name {old_name} is deprecated and will be removed in pypdf 7.0.0. Use {new_name} instead.",
762+
4,
763+
)
764+
765+
766+
def decode_stream_data(stream: StreamObject) -> bytes:
757767
"""
758768
Decode the stream data based on the specified filters.
759769
@@ -787,19 +797,26 @@ def decode_stream_data(stream: Any) -> bytes:
787797
if isinstance(params, NullObject):
788798
params = {}
789799
if filter_name in (FT.ASCII_HEX_DECODE, FTA.AHx):
800+
_deprecate_inline_image_filters(filter_name=filter_name, old_name=FTA.AHx, new_name=FT.ASCII_HEX_DECODE)
790801
data = ASCIIHexDecode.decode(data)
791802
elif filter_name in (FT.ASCII_85_DECODE, FTA.A85):
803+
_deprecate_inline_image_filters(filter_name=filter_name, old_name=FTA.A85, new_name=FT.ASCII_85_DECODE)
792804
data = ASCII85Decode.decode(data)
793805
elif filter_name in (FT.LZW_DECODE, FTA.LZW):
806+
_deprecate_inline_image_filters(filter_name=filter_name, old_name=FTA.LZW, new_name=FT.LZW_DECODE)
794807
data = LZWDecode.decode(data, params)
795808
elif filter_name in (FT.FLATE_DECODE, FTA.FL):
809+
_deprecate_inline_image_filters(filter_name=filter_name, old_name=FTA.FL, new_name=FT.FLATE_DECODE)
796810
data = FlateDecode.decode(data, params)
797811
elif filter_name in (FT.RUN_LENGTH_DECODE, FTA.RL):
812+
_deprecate_inline_image_filters(filter_name=filter_name, old_name=FTA.RL, new_name=FT.RUN_LENGTH_DECODE)
798813
data = RunLengthDecode.decode(data)
799-
elif filter_name == FT.CCITT_FAX_DECODE:
814+
elif filter_name in (FT.CCITT_FAX_DECODE, FTA.CCF):
815+
_deprecate_inline_image_filters(filter_name=filter_name, old_name=FTA.CCF, new_name=FT.CCITT_FAX_DECODE)
800816
height = stream.get(IA.HEIGHT, ())
801817
data = CCITTFaxDecode.decode(data, params, height)
802-
elif filter_name == FT.DCT_DECODE:
818+
elif filter_name in (FT.DCT_DECODE, FTA.DCT):
819+
_deprecate_inline_image_filters(filter_name=filter_name, old_name=FTA.DCT, new_name=FT.DCT_DECODE)
803820
data = DCTDecode.decode(data)
804821
elif filter_name == FT.JPX_DECODE:
805822
data = JPXDecode.decode(data)

tests/test_filters.py

Lines changed: 112 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -13,7 +13,7 @@
1313
import pytest
1414
from PIL import Image, ImageOps
1515

16-
from pypdf import PdfReader
16+
from pypdf import PdfReader, PdfWriter
1717
from pypdf.errors import DependencyError, DeprecationError, LimitReachedError, PdfReadError, PdfStreamError
1818
from pypdf.filters import (
1919
ASCII85Decode,
@@ -24,6 +24,7 @@
2424
FlateDecode,
2525
JBIG2Decode,
2626
RunLengthDecode,
27+
decode_stream_data,
2728
decompress,
2829
)
2930
from pypdf.generic import (
@@ -36,6 +37,7 @@
3637
NullObject,
3738
NumberObject,
3839
StreamObject,
40+
TextStringObject,
3941
)
4042

4143
from . import PILContext, get_data_from_url, get_image_data
@@ -902,3 +904,112 @@ def test_decompress__logging_on_invalid_data(caplog):
902904
assert len(encoded) > 5
903905
assert codec.decode(encoded[5:]) == b""
904906
assert caplog.messages == ["Error -3 while decompressing data: incorrect header check"]
907+
908+
909+
def test_ccittfaxdecode__ccf_inline():
    """Read an inline image that uses the ``/CCF`` abbreviation through ``page.images``."""
    writer = PdfWriter(clone_from=RESOURCE_ROOT / "jpeg.pdf")
    page = writer.pages[0]
    writer.remove_images()

    # Inline image using abbreviated keys (/W, /H, /CS, /BPC, /F, /DP) and the /CCF filter name.
    inline_image = (
        b"\nBI\n /W 16\n /H 16\n /CS /G\n /BPC 1\n /F [/CCF]\n"
        b" /DP [ << /K -1 /BlackIs1 false /Columns 16 /Rows 16 >> ]\nID\n"
        b"&\xa0\xbf\xcc9\x14|G#\x1f\xff\xf1\xcc9\x18\xfe\xbbX\xfc\x00@\x04"
        b"\nEI\n"
    )

    # Drop the XObject drawing call and put the inline image in place of the "\nET" operator.
    contents = page.get_contents()
    patched = contents.get_data().replace(b"/Im4 Do", b"")
    patched = patched.replace(b"\nET", inline_image)
    contents.set_data(patched)
    page.replace_contents(contents)

    reference_page = PdfReader(RESOURCE_ROOT / "imagemagick-CCITTFaxDecode.pdf").pages[0]
    expected = reference_page.images[0].image
    assert get_image_data(expected) == get_image_data(page.images[0].image)
928+
929+
930+
def test_dctdecode__dct_inline():
    """Read an inline image that uses the ``/DCT`` abbreviation through ``page.images``."""
    writer = PdfWriter(clone_from=RESOURCE_ROOT / "jpeg.pdf")
    page = writer.pages[0]
    writer.remove_images()

    # Inline image using abbreviated keys (/W, /H, /CS, /BPC, /F) and the /DCT filter name.
    inline_image = (
        b"\nBI\n /W 16\n /H 16\n /CS /G\n /BPC 8\n /F [/DCT]\nID\n"
        b"\xff\xd8\xff\xe0\x00\x10JFIF\x00\x01\x01\x01\x01,\x01,\x00\x00\xff\xfe\x00\x13Created with GIMP\xff\xe2"
        b"\x02\xb0ICC_PROFILE\x00\x01\x01\x00\x00\x02\xa0lcms\x040\x00\x00mntrRGB XYZ \x07\xe6\x00\x04\x00\x0f\x00"
        b"\t\x00\x1d\x007acspAPPL\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00"
        b"\x00\x00\x00\x00\x00\x00\xf6\xd6\x00\x01\x00\x00\x00\x00\xd3-lcms\x00\x00\x00\x00\x00\x00\x00\x00\x00"
        b"\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00"
        b"\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\rdesc\x00\x00\x01 \x00\x00\x00@cprt\x00\x00\x01`"
        b"\x00\x00\x006wtpt\x00\x00\x01\x98\x00\x00\x00\x14chad\x00\x00\x01\xac\x00\x00\x00,rXYZ\x00\x00\x01\xd8"
        b"\x00\x00\x00\x14bXYZ\x00\x00\x01\xec\x00\x00\x00\x14gXYZ\x00\x00\x02\x00\x00\x00\x00\x14rTRC\x00\x00"
        b"\x02\x14\x00\x00\x00 gTRC\x00\x00\x02\x14\x00\x00\x00 bTRC\x00\x00\x02\x14\x00\x00\x00 chrm\x00\x00"
        b"\x024\x00\x00\x00$dmnd\x00\x00\x02X\x00\x00\x00$dmdd\x00\x00\x02|\x00\x00\x00$mluc\x00\x00\x00\x00"
        b"\x00\x00\x00\x01\x00\x00\x00\x0cenUS\x00\x00\x00$\x00\x00\x00\x1c\x00G\x00I\x00M\x00P\x00 \x00b\x00"
        b"u\x00i\x00l\x00t\x00-\x00i\x00n\x00 \x00s\x00R\x00G\x00Bmluc\x00\x00\x00\x00\x00\x00\x00\x01\x00\x00"
        b"\x00\x0cenUS\x00\x00\x00\x1a\x00\x00\x00\x1c\x00P\x00u\x00b\x00l\x00i\x00c\x00 \x00D\x00o\x00m\x00a"
        b"\x00i\x00n\x00\x00XYZ \x00\x00\x00\x00\x00\x00\xf6\xd6\x00\x01\x00\x00\x00\x00\xd3-sf32\x00\x00\x00"
        b"\x00\x00\x01\x0cB\x00\x00\x05\xde\xff\xff\xf3%\x00\x00\x07\x93\x00\x00\xfd\x90\xff\xff\xfb\xa1\xff"
        b"\xff\xfd\xa2\x00\x00\x03\xdc\x00\x00\xc0nXYZ \x00\x00\x00\x00\x00\x00o\xa0\x00\x008\xf5\x00\x00\x03"
        b"\x90XYZ \x00\x00\x00\x00\x00\x00$\x9f\x00\x00\x0f\x84\x00\x00\xb6\xc4XYZ \x00\x00\x00\x00\x00\x00b"
        b"\x97\x00\x00\xb7\x87\x00\x00\x18\xd9para\x00\x00\x00\x00\x00\x03\x00\x00\x00\x02ff\x00\x00\xf2\xa7"
        b"\x00\x00\rY\x00\x00\x13\xd0\x00\x00\n[chrm\x00\x00\x00\x00\x00\x03\x00\x00\x00\x00\xa3\xd7\x00\x00T|"
        b"\x00\x00L\xcd\x00\x00\x99\x9a\x00\x00&g\x00\x00\x0f\\mluc\x00\x00\x00\x00\x00\x00\x00\x01\x00\x00"
        b"\x00\x0cenUS\x00\x00\x00\x08\x00\x00\x00\x1c\x00G\x00I\x00M\x00Pmluc\x00\x00\x00\x00\x00\x00\x00"
        b"\x01\x00\x00\x00\x0cenUS\x00\x00\x00\x08\x00\x00\x00\x1c\x00s\x00R\x00G\x00B\xff\xdb\x00C\x00\x01"
        b"\x01\x01\x01\x01\x01\x01\x01\x01\x01\x01\x01\x01\x01\x01\x01\x01\x01\x01\x01\x01\x01\x01\x01\x01"
        b"\x01\x01\x01\x01\x01\x01\x01\x01\x01\x01\x01\x01\x01\x01\x01\x01\x01\x01\x01\x01\x01\x01\x01\x01"
        b"\x01\x01\x01\x01\x01\x01\x01\x01\x01\x01\x01\x01\x01\x01\x01\xff\xc0\x00\x0b\x08\x00\x10\x00\x10"
        b"\x01\x01\x11\x00\xff\xc4\x00\x17\x00\x00\x03\x01\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00"
        b"\x00\x06\x07\x08\n\xff\xc4\x00\x1d\x10\x00\x03\x00\x03\x01\x01\x01\x01\x00\x00\x00\x00\x00\x00"
        b"\x00\x00\x05\x06\x07\x01\x04\x08\x02\x03\x13\x15\xff\xda\x00\x08\x01\x01\x00\x00?\x00\xc4D\x0eA"
        b"\x8e\x91\xa8\xf3\xcf5N\xb5\x7f\x87k\xbc_\x96\xe3\x83]\x9c\\\xff\x00\x19f1^=:A\x98jm.\x03\x9f\x10"
        b"mW\xc2\xcbYF\xd2T\x06\xef,OXfX`^\x18\x0ez\xb4U \x91\x17\xd4\xf6\xbe\xc2\xb7\x85s:{\xa1\x8f\xec;}"
        b"\x8f-l/1|\x19\x86|\x14\xc5+j\x8cm\xf0\xde\x10\xba\x7f\xa5=\xe2\x86\xd8\x18\r\xed$o\xab2h\xbc\xad"
        b"\x8cS\x18\xba\xd8,\xb2\xa3\xbf\xd9\xd8I\x84+\x07\x9d\x1ay\x1cr\xba\x81\nu\x0f\xa7yk\xa0%5\xf2\xf4"
        b"\xf4\x9e\x8d\xe6\x19\x90+s;P\xfd\xd1\xb3\x8f\xac\xf8\x0e@5\xf5\x8f(i\xc3\x0e\xf3\xd3\xbc\xf5\xa5"
        b"\xed:\x85<$\xee\xd1@%i\xde\x1ao\xdaF$\t?Vq\xce\x92\xde\xe1\xbd\x14H\x8a'\"\x8d\xbf75\xaef\x90\xc3|"
        b"\xe8~\x82\x04\xab+3O.\xdeX&\xac\xf2t\x89\xcf\xd3\xfa\x85\xbdFu=\x8e*\xa9\xfb!\x96\xed\xfa\xe3S\xe5A"
        b"\xf2\xa8\xf5\xe8\xd7\x85\xa5\x05\t\xf8a\xff\x00\xff\xd9"
        b"\nEI\n"
    )

    # Drop the XObject drawing call and put the inline image in place of the "\nET" operator.
    contents = page.get_contents()
    patched = contents.get_data().replace(b"/Im4 Do", b"")
    patched = patched.replace(b"\nET", inline_image)
    contents.set_data(patched)
    page.replace_contents(contents)

    reference_page = PdfReader(RESOURCE_ROOT / "imagemagick-images.pdf").pages[3]
    expected = reference_page.images[0].image
    assert get_image_data(expected) == get_image_data(page.images[0].image)
983+
984+
985+
def test_deprecate_inline_image_filters():
    """Check that ``decode_stream_data`` warns for ``/CCF`` and decodes with ``/CCITTFaxDecode``."""
    stream = ContentStream(stream=None, pdf=None)
    stream.set_data(b"&\xa0\xbf\xcc9\x14|G#\x1f\xff\xf1\xcc9\x18\xfe\xbbX\xfc\x00@\x04")

    # The abbreviations do not work here, which is one of the reasons for the deprecation.
    stream[NameObject("/Width")] = NumberObject(16)
    stream[NameObject("/Height")] = NumberObject(16)
    stream[NameObject("/ColorSpace")] = NameObject("/DeviceGray")
    stream[NameObject("/BitsPerComponent")] = NumberObject(1)
    stream[NameObject("/Filter")] = NameObject("/CCF")

    fax_parameters = DictionaryObject(
        {
            NameObject("/K"): NumberObject(-1),
            NameObject("/BlackIs1"): TextStringObject("false"),
            NameObject("/Columns"): NumberObject(16),
            NameObject("/Rows"): NumberObject(16),
        }
    )
    # NOTE(review): the PDF specification spells this key "/DecodeParms"; confirm that
    # "/DecodeParams" is intentional here.
    stream[NameObject("/DecodeParams")] = ArrayObject([fax_parameters])

    expected_message = r"^The filter name /CCF is deprecated and will be removed in pypdf 7\.0\.0\. Use /CCITTFaxDecode instead\.$"  # noqa: E501
    with pytest.warns(expected_warning=DeprecationWarning, match=expected_message):
        decode_stream_data(stream)

    # With the full filter name, the decoded data has to start with b"II*".
    stream[NameObject("/Filter")] = NameObject("/CCITTFaxDecode")
    decoded = decode_stream_data(stream)
    assert decoded.startswith(b"II*")

0 commit comments

Comments
 (0)