This repository was archived by the owner on Jun 3, 2023. It is now read-only.
-
Notifications
You must be signed in to change notification settings - Fork 18
Handle marshaling and emitting unicode encoded byte strings #16
Copy link
Copy link
Open
Labels
Description
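I'm having problems encoding unicode correctly when using transit. When the transit Writer is configured with the 'json' protocol, I would expect it to handle unicode the same way the standard json encoder does. In the session below, json.dumps produces identical output for all three variations of the same string: a unicode literal, a plain byte-string literal, and a unicode literal explicitly encoded to UTF-8. The transit writer, by contrast, produces different output from the json encoder for the unicode literal (the non-ASCII characters are emitted unescaped), and it fails with a UnicodeDecodeError on the two byte-string variations.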
In [1]: import json
In [2]: json.dumps(u'Av. Zañartu 1482, Ñuñoa, Santiago, Chile, 7780272')
Out[2]: '"Av. Za\\u00f1artu 1482, \\u00d1u\\u00f1oa, Santiago, Chile, 7780272"'
In [3]: json.dumps('Av. Zañartu 1482, Ñuñoa, Santiago, Chile, 7780272')
Out[3]: '"Av. Za\\u00f1artu 1482, \\u00d1u\\u00f1oa, Santiago, Chile, 7780272"'
In [4]: json.dumps(u'Av. Zañartu 1482, Ñuñoa, Santiago, Chile, 7780272'.encode('utf-8'))
Out[4]: '"Av. Za\\u00f1artu 1482, \\u00d1u\\u00f1oa, Santiago, Chile, 7780272"'
In [5]: from transit.writer import Writer
In [6]: from StringIO import StringIO
In [9]: def transit_dumps(value):
...: io = StringIO()
...: writer = Writer(io, 'json')
...: writer.write(value)
...: return io.getvalue()
...:
In [10]: transit_dumps(u'Av. Zañartu 1482, Ñuñoa, Santiago, Chile, 7780272')
Out[10]: u'["~#\'","Av. Za\xf1artu 1482, \xd1u\xf1oa, Santiago, Chile, 7780272"]'
In [11]: transit_dumps(u'Av. Zañartu 1482, Ñuñoa, Santiago, Chile, 7780272'.encode('utf-8'))
---------------------------------------------------------------------------
UnicodeDecodeError Traceback (most recent call last)
<ipython-input-11-91a52539e1c8> in <module>()
----> 1 transit_dumps(u'Av. Zañartu 1482, Ñuñoa, Santiago, Chile, 7780272'.encode('utf-8'))
<ipython-input-9-f0149f39269b> in transit_dumps(value)
2 io = StringIO()
3 writer = Writer(io, 'json')
----> 4 writer.write(value)
5 return io.getvalue()
6
/Users/nithin/.virtualenvs/weblims/lib/python2.7/site-packages/transit/writer.pyc in write(self, obj)
42 the 'io' source.
43 """
---> 44 self.marshaler.marshal_top(obj)
45
46 def register(self, obj_type, handler_class):
/Users/nithin/.virtualenvs/weblims/lib/python2.7/site-packages/transit/writer.pyc in marshal_top(self, obj, cache)
196 if tag:
197 if len(tag) == 1:
--> 198 self.marshal(TaggedValue(QUOTE, obj), False, cache)
199 else:
200 self.marshal(obj, False, cache)
/Users/nithin/.virtualenvs/weblims/lib/python2.7/site-packages/transit/writer.pyc in marshal(self, obj, as_map_key, cache)
179
180 if f:
--> 181 f(self, rep, as_map_key, cache)
182 else:
183 self.emit_encoded(tag, handler, obj, as_map_key, cache)
/Users/nithin/.virtualenvs/weblims/lib/python2.7/site-packages/transit/writer.pyc in <lambda>(self, rep, _, cache)
227 "n": lambda self, rep, as_map_key, cache: Marshaler.emit_int(self, "n", rep, as_map_key, cache),
228 "d": Marshaler.emit_double,
--> 229 "'": lambda self, rep, _, cache: Marshaler.emit_tagged(self, "'", rep, cache),
230 "array": Marshaler.emit_array,
231 "map": Marshaler.dispatch_map}
/Users/nithin/.virtualenvs/weblims/lib/python2.7/site-packages/transit/writer.pyc in emit_tagged(self, tag, rep, cache)
141 self.emit_array_start(2)
142 self.emit_string(ESC, "#", tag, False, cache)
--> 143 self.marshal(rep, False, cache)
144 self.emit_array_end()
145
/Users/nithin/.virtualenvs/weblims/lib/python2.7/site-packages/transit/writer.pyc in marshal(self, obj, as_map_key, cache)
179
180 if f:
--> 181 f(self, rep, as_map_key, cache)
182 else:
183 self.emit_encoded(tag, handler, obj, as_map_key, cache)
/Users/nithin/.virtualenvs/weblims/lib/python2.7/site-packages/transit/writer.pyc in <lambda>(self, rep, as_map_key, cache)
223 marshal_dispatch = {"_": Marshaler.emit_nil,
224 "?": Marshaler.emit_boolean,
--> 225 "s": lambda self, rep, as_map_key, cache: Marshaler.emit_string(self, "", "", escape(rep), as_map_key, cache),
226 "i": lambda self, rep, as_map_key, cache: Marshaler.emit_int(self, "i", rep, as_map_key, cache),
227 "n": lambda self, rep, as_map_key, cache: Marshaler.emit_int(self, "n", rep, as_map_key, cache),
/Users/nithin/.virtualenvs/weblims/lib/python2.7/site-packages/transit/writer.pyc in emit_string(self, prefix, tag, string, as_map_key, cache)
106 #if "cache_enabled" in self.opts and is_cacheable(encoded, as_map_key):
107 # return self.emit_object(cache.value_to_key[encoded], as_map_key)
--> 108 return self.emit_object(encoded, as_map_key)
109
110 def emit_boolean(self, b, as_map_key, cache):
/Users/nithin/.virtualenvs/weblims/lib/python2.7/site-packages/transit/writer.pyc in emit_object(self, obj, as_map_key)
357 self.io.write(u"".join([(c.encode("unicode_escape"))
358 if c in JSON_ESCAPED_CHARS
--> 359 else c for c in obj]).replace("\"", "\\\""))
360 self.io.write(u"\"")
361 elif tp is int or tp is long or tp is float:
UnicodeDecodeError: 'ascii' codec can't decode byte 0xc3 in position 0: ordinal not in range(128)
In [12]: transit_dumps('Av. Zañartu 1482, Ñuñoa, Santiago, Chile, 7780272')
---------------------------------------------------------------------------
UnicodeDecodeError Traceback (most recent call last)
<ipython-input-12-e53e995c1ef3> in <module>()
----> 1 transit_dumps('Av. Zañartu 1482, Ñuñoa, Santiago, Chile, 7780272')
<ipython-input-9-f0149f39269b> in transit_dumps(value)
2 io = StringIO()
3 writer = Writer(io, 'json')
----> 4 writer.write(value)
5 return io.getvalue()
6
/Users/nithin/.virtualenvs/weblims/lib/python2.7/site-packages/transit/writer.pyc in write(self, obj)
42 the 'io' source.
43 """
---> 44 self.marshaler.marshal_top(obj)
45
46 def register(self, obj_type, handler_class):
/Users/nithin/.virtualenvs/weblims/lib/python2.7/site-packages/transit/writer.pyc in marshal_top(self, obj, cache)
196 if tag:
197 if len(tag) == 1:
--> 198 self.marshal(TaggedValue(QUOTE, obj), False, cache)
199 else:
200 self.marshal(obj, False, cache)
/Users/nithin/.virtualenvs/weblims/lib/python2.7/site-packages/transit/writer.pyc in marshal(self, obj, as_map_key, cache)
179
180 if f:
--> 181 f(self, rep, as_map_key, cache)
182 else:
183 self.emit_encoded(tag, handler, obj, as_map_key, cache)
/Users/nithin/.virtualenvs/weblims/lib/python2.7/site-packages/transit/writer.pyc in <lambda>(self, rep, _, cache)
227 "n": lambda self, rep, as_map_key, cache: Marshaler.emit_int(self, "n", rep, as_map_key, cache),
228 "d": Marshaler.emit_double,
--> 229 "'": lambda self, rep, _, cache: Marshaler.emit_tagged(self, "'", rep, cache),
230 "array": Marshaler.emit_array,
231 "map": Marshaler.dispatch_map}
/Users/nithin/.virtualenvs/weblims/lib/python2.7/site-packages/transit/writer.pyc in emit_tagged(self, tag, rep, cache)
141 self.emit_array_start(2)
142 self.emit_string(ESC, "#", tag, False, cache)
--> 143 self.marshal(rep, False, cache)
144 self.emit_array_end()
145
/Users/nithin/.virtualenvs/weblims/lib/python2.7/site-packages/transit/writer.pyc in marshal(self, obj, as_map_key, cache)
179
180 if f:
--> 181 f(self, rep, as_map_key, cache)
182 else:
183 self.emit_encoded(tag, handler, obj, as_map_key, cache)
/Users/nithin/.virtualenvs/weblims/lib/python2.7/site-packages/transit/writer.pyc in <lambda>(self, rep, as_map_key, cache)
223 marshal_dispatch = {"_": Marshaler.emit_nil,
224 "?": Marshaler.emit_boolean,
--> 225 "s": lambda self, rep, as_map_key, cache: Marshaler.emit_string(self, "", "", escape(rep), as_map_key, cache),
226 "i": lambda self, rep, as_map_key, cache: Marshaler.emit_int(self, "i", rep, as_map_key, cache),
227 "n": lambda self, rep, as_map_key, cache: Marshaler.emit_int(self, "n", rep, as_map_key, cache),
/Users/nithin/.virtualenvs/weblims/lib/python2.7/site-packages/transit/writer.pyc in emit_string(self, prefix, tag, string, as_map_key, cache)
106 #if "cache_enabled" in self.opts and is_cacheable(encoded, as_map_key):
107 # return self.emit_object(cache.value_to_key[encoded], as_map_key)
--> 108 return self.emit_object(encoded, as_map_key)
109
110 def emit_boolean(self, b, as_map_key, cache):
/Users/nithin/.virtualenvs/weblims/lib/python2.7/site-packages/transit/writer.pyc in emit_object(self, obj, as_map_key)
357 self.io.write(u"".join([(c.encode("unicode_escape"))
358 if c in JSON_ESCAPED_CHARS
--> 359 else c for c in obj]).replace("\"", "\\\""))
360 self.io.write(u"\"")
361 elif tp is int or tp is long or tp is float: