Secure your code as it's written. Use Snyk Code to scan source code in minutes - no build needed - and fix issues immediately.
def test_incomplete_sequences():
    """
    Check that incremental decoding is independent of where the input is cut.

    The byte string contains a six-byte CESU-8 surrogate pair and a two-byte
    Java-style null. It is fed to a fresh IncrementalDecoder in two pieces,
    once for every possible split position, and the concatenated output must
    always equal the expected text.
    """
    test_bytes = b'surrogates: \xed\xa0\x80\xed\xb0\x80 / null: \xc0\x80'
    test_string = 'surrogates: \U00010000 / null: \x00'

    # The split points are offsets into test_bytes, so they must range over
    # the length of the *bytes*. The decoded string is shorter than the byte
    # string, so iterating over len(test_string) would never cut through the
    # trailing two-byte null sequence.
    for split_point in range(len(test_bytes) + 1):
        left = test_bytes[:split_point]
        right = test_bytes[split_point:]

        decoder = IncrementalDecoder()
        got = decoder.decode(left, final=False)
        # The second piece is the end of the stream, so flag it as final;
        # anything still buffered at that point would be an error.
        got += decoder.decode(right, final=True)
        eq_(got, test_string)
own will be displayed as a slightly ugly ellipsis instead of a replacement
character.)
Aside from these cases, it acts the same as the "utf-8-variants" decoder.
Encoding with "sloppy-utf-8" is the same as encoding with "utf-8".
"""
from __future__ import unicode_literals
import codecs
from ftfy.bad_codecs.utf8_variants import (
IncrementalEncoder, IncrementalDecoder,
UTF8IncrementalDecoder
)
# Name of this codec; passed as the `name` argument when CODEC_INFO is built.
NAME = 'sloppy-utf-8'
class SloppyIncrementalDecoder(IncrementalDecoder):
def _buffer_decode_step(self, input, errors, final):
"""
There are three possibilities for each decoding step:
- Decode as much apparently-real UTF-8 as possible.
- Decode a six-byte CESU-8 sequence at the current position.
- Decode a Java-style null at the current position.
When decoding "apparently-real UTF-8", we might get an error,
and that's where the sloppiness kicks in. If the error is something
we recognize and can fix, we'll fix it.
"""
# Get a reference to the superclass method that we'll be using for
# most of the real work.
sup = UTF8IncrementalDecoder._buffer_decode
class StreamWriter(codecs.StreamWriter):
    """Stream writer that encodes using this module's ``encode`` function."""

    # codecs.StreamWriter invokes ``self.encode(object, self.errors)``. A
    # plain Python function stored as a class attribute would be bound as a
    # method and receive the instance as its first argument, so wrap it in
    # staticmethod to keep the stateless ``(input, errors)`` signature.
    encode = staticmethod(encode)
class StreamReader(codecs.StreamReader):
    """Stream reader that decodes using this module's ``decode`` function."""

    # codecs.StreamReader invokes ``self.decode(data, self.errors)``. A plain
    # Python function stored as a class attribute would be bound as a method
    # and receive the instance as its first argument, so wrap it in
    # staticmethod to keep the stateless ``(input, errors)`` signature.
    decode = staticmethod(decode)
# Bundle the stateless encode/decode functions, the incremental encoder and
# decoder classes, and the stream reader/writer classes into a single
# codecs.CodecInfo record labelled with NAME.
# NOTE(review): presumably this record is returned from a codec search
# function registered elsewhere -- confirm against the registering module.
CODEC_INFO = codecs.CodecInfo(
name=NAME,
encode=encode,
decode=decode,
incrementalencoder=IncrementalEncoder,
incrementaldecoder=IncrementalDecoder,
streamreader=StreamReader,
streamwriter=StreamWriter,
)
class StreamWriter(codecs.StreamWriter):
    """Stream writer that encodes using this module's ``encode`` function."""

    # codecs.StreamWriter invokes ``self.encode(object, self.errors)``. A
    # plain Python function stored as a class attribute would be bound as a
    # method and receive the instance as its first argument, so wrap it in
    # staticmethod to keep the stateless ``(input, errors)`` signature.
    encode = staticmethod(encode)
class StreamReader(codecs.StreamReader):
    """Stream reader that decodes using this module's ``decode`` function."""

    # codecs.StreamReader invokes ``self.decode(data, self.errors)``. A plain
    # Python function stored as a class attribute would be bound as a method
    # and receive the instance as its first argument, so wrap it in
    # staticmethod to keep the stateless ``(input, errors)`` signature.
    decode = staticmethod(decode)
# Bundle the stateless encode/decode functions, the incremental encoder and
# decoder classes, and the stream reader/writer classes into a single
# codecs.CodecInfo record labelled with NAME.
# NOTE(review): presumably this record is returned from a codec search
# function registered elsewhere -- confirm against the registering module.
CODEC_INFO = codecs.CodecInfo(
name=NAME,
encode=encode,
decode=decode,
incrementalencoder=IncrementalEncoder,
incrementaldecoder=IncrementalDecoder,
streamreader=StreamReader,
streamwriter=StreamWriter,
)
def decode(input, errors='strict'):
    """
    Decode a complete byte string in one shot.

    A fresh IncrementalDecoder is created with the given error-handling
    scheme and asked to decode `input` as the final chunk of a stream.

    Returns a `(text, length)` tuple, where `length` is `len(input)`.
    """
    one_shot_decoder = IncrementalDecoder(errors)
    text = one_shot_decoder.decode(input, final=True)
    consumed = len(input)
    return text, consumed
def decode(input, errors='strict'):
    """
    Decode a complete byte string in one shot.

    A fresh IncrementalDecoder is created with the given error-handling
    scheme and asked to decode `input` as the final chunk of a stream.

    Returns a `(text, length)` tuple, where `length` is `len(input)`.
    """
    # This is a stateless, whole-input decode, so the input must be flagged
    # as final. Otherwise an incomplete trailing sequence would stay in the
    # throwaway decoder's buffer and be silently dropped, even though
    # len(input) is reported as consumed.
    return IncrementalDecoder(errors).decode(input, final=True), len(input)