Edit File by line

[0] Fix | Delete

# Author: Ben Gertzfield, Barry Warsaw

[1] Fix | Delete

# Contact: email-sig@python.org

[2] Fix | Delete

[3] Fix | Delete

"""Header encoding and decoding functionality."""

[4] Fix | Delete

[5] Fix | Delete

__all__ = [

[6] Fix | Delete

'Header',

[7] Fix | Delete

'decode_header',

[8] Fix | Delete

'make_header',

[9] Fix | Delete

]

[10] Fix | Delete

[11] Fix | Delete

import re

[12] Fix | Delete

import binascii

[13] Fix | Delete

[14] Fix | Delete

import email.quoprimime

[15] Fix | Delete

import email.base64mime

[16] Fix | Delete

[17] Fix | Delete

from email.errors import HeaderParseError

[18] Fix | Delete

from email.charset import Charset

[19] Fix | Delete

[20] Fix | Delete

NL = '\n'

[21] Fix | Delete

SPACE = ' '

[22] Fix | Delete

USPACE = u' '

[23] Fix | Delete

SPACE8 = ' ' * 8

[24] Fix | Delete

UEMPTYSTRING = u''

[25] Fix | Delete

[26] Fix | Delete

MAXLINELEN = 76

[27] Fix | Delete

[28] Fix | Delete

USASCII = Charset('us-ascii')

[29] Fix | Delete

UTF8 = Charset('utf-8')

[30] Fix | Delete

[31] Fix | Delete

# Match encoded-word strings in the form =?charset?q?Hello_World?=

[32] Fix | Delete

ecre = re.compile(r'''

[33] Fix | Delete

=\? # literal =?

[34] Fix | Delete

(?P<charset>[^?]*?) # non-greedy up to the next ? is the charset

[35] Fix | Delete

\? # literal ?

[36] Fix | Delete

(?P<encoding>[qb]) # either a "q" or a "b", case insensitive

[37] Fix | Delete

\? # literal ?

[38] Fix | Delete

(?P<encoded>.*?) # non-greedy up to the next ?= is the encoded string

[39] Fix | Delete

\?= # literal ?=

[40] Fix | Delete

(?=[ \t]|$) # whitespace or the end of the string

[41] Fix | Delete

''', re.VERBOSE | re.IGNORECASE | re.MULTILINE)

[42] Fix | Delete

[43] Fix | Delete

# Field name regexp, including trailing colon, but not separating whitespace,

[44] Fix | Delete

# according to RFC 2822. Character range is from tilde to exclamation mark.

[45] Fix | Delete

# For use with .match()

[46] Fix | Delete

fcre = re.compile(r'[\041-\176]+:$')

[47] Fix | Delete

[48] Fix | Delete

# Find a header embedded in a putative header value. Used to check for

[49] Fix | Delete

# header injection attack.

[50] Fix | Delete

_embeded_header = re.compile(r'\n[^ \t]+:')

[51] Fix | Delete

[52] Fix | Delete

[53] Fix | Delete

[54] Fix | Delete

# Helpers

[55] Fix | Delete

_max_append = email.quoprimime._max_append

[56] Fix | Delete

[57] Fix | Delete

[58] Fix | Delete

[59] Fix | Delete

def decode_header(header):
    """Decode a message header value without converting charset.

    Returns a list of (decoded_string, charset) pairs containing each of the
    decoded parts of the header.  Charset is None for non-encoded parts of the
    header, otherwise a lower-case string containing the name of the character
    set specified in the encoded string.

    An email.errors.HeaderParseError is raised when the payload of a
    base64 ("b") encoded word cannot be decoded.
    """
    # If no encoding, just return the header
    header = str(header)
    if not ecre.search(header):
        return [(header, None)]
    decoded = []
    for line in header.splitlines():
        # This line might not have an encoding in it
        if not ecre.search(line):
            decoded.append((line, None))
            continue
        # ecre has three groups, so split() yields plain text at every fourth
        # index, each followed by a (charset, encoding, encoded) triple:
        #   [text, charset, encoding, encoded, text, ...]
        parts = ecre.split(line)
        for start in range(0, len(parts), 4):
            unenc = parts[start].strip()
            if unenc:
                # Should we continue a long line?
                if decoded and decoded[-1][1] is None:
                    decoded[-1] = (decoded[-1][0] + SPACE + unenc, None)
                else:
                    decoded.append((unenc, None))
            if start + 1 >= len(parts):
                # Only trailing plain text was left; no encoded word follows.
                break
            charset, encoding = [p.lower()
                                 for p in parts[start + 1:start + 3]]
            encoded = parts[start + 3]
            dec = None
            if encoding == 'q':
                dec = email.quoprimime.header_decode(encoded)
            elif encoding == 'b':
                paderr = len(encoded) % 4   # Postel's law: add missing padding
                if paderr:
                    encoded += '==='[:4 - paderr]
                try:
                    dec = email.base64mime.decode(encoded)
                except binascii.Error as err:
                    # Turn this into a higher level exception, keeping the
                    # offending word and the low-level reason in the message.
                    raise HeaderParseError(
                        'invalid base64 in encoded word {!r}: {}'.format(
                            encoded, err))
            if dec is None:
                dec = encoded

            # Merge adjacent encoded words that share a charset.
            if decoded and decoded[-1][1] == charset:
                decoded[-1] = (decoded[-1][0] + dec, decoded[-1][1])
            else:
                decoded.append((dec, charset))
    return decoded

[116] Fix | Delete

[117] Fix | Delete

[118] Fix | Delete

[119] Fix | Delete

def make_header(decoded_seq, maxlinelen=None, header_name=None,
                continuation_ws=' '):
    """Build a Header from (decoded_string, charset) pairs.

    decoded_seq is a sequence of pairs in the format returned by
    decode_header(): each pair holds a string and either None, a charset
    name, or a Charset instance.

    Optional maxlinelen, header_name, and continuation_ws are handed to the
    Header constructor unchanged.  Returns the populated Header instance.
    """
    result = Header(header_name=header_name, maxlinelen=maxlinelen,
                    continuation_ws=continuation_ws)
    for text, cs in decoded_seq:
        if cs is None or isinstance(cs, Charset):
            # None (meaning us-ascii) and ready-made Charset objects are
            # passed straight through to append().
            result.append(text, cs)
        else:
            # Anything else is treated as a charset name.
            result.append(text, Charset(cs))
    return result

[139] Fix | Delete

[140] Fix | Delete

[141] Fix | Delete

[142] Fix | Delete

class Header:
    """A header value assembled from (string, charset) chunks.

    Chunks are added with .append() and rendered as a single, possibly
    folded, string with .encode().
    """

    def __init__(self, s=None, charset=None,
                 maxlinelen=None, header_name=None,
                 continuation_ws=' ', errors='strict'):
        """Create a MIME-compliant header that can contain many character sets.

        Optional s is the initial header value.  If None, the initial header
        value is not set.  You can later append to the header with .append()
        method calls.  s may be a byte string or a Unicode string, but see the
        .append() documentation for semantics.

        Optional charset serves two purposes: it has the same meaning as the
        charset argument to the .append() method.  It also sets the default
        character set for all subsequent .append() calls that omit the charset
        argument.  If charset is not provided in the constructor, the us-ascii
        charset is used both as s's initial charset and as the default for
        subsequent .append() calls.

        The maximum line length can be specified explicitly via maxlinelen.
        For splitting the first line to a shorter value (to account for the
        field header which isn't included in s, e.g. `Subject') pass in the
        name of the field in header_name.  The default maxlinelen is 76.

        continuation_ws must be RFC 2822 compliant folding whitespace (usually
        either a space or a hard tab) which will be prepended to continuation
        lines.

        errors is passed through to the .append() call.
        """
        if charset is None:
            charset = USASCII
        if not isinstance(charset, Charset):
            charset = Charset(charset)
        self._charset = charset
        self._continuation_ws = continuation_ws
        # A hard tab in the continuation whitespace counts as eight columns.
        cws_expanded_len = len(continuation_ws.replace('\t', SPACE8))
        # BAW: I believe `chunks' and `maxlinelen' should be non-public.
        self._chunks = []
        if s is not None:
            self.append(s, charset, errors)
        if maxlinelen is None:
            maxlinelen = MAXLINELEN
        if header_name is None:
            # We don't know anything about the field header so the first line
            # is the same length as subsequent lines.
            self._firstlinelen = maxlinelen
        else:
            # The first line should be shorter to take into account the field
            # header.  Also subtract off 2 extra for the colon and space.
            self._firstlinelen = maxlinelen - len(header_name) - 2
        # Second and subsequent lines should subtract off the length in
        # columns of the continuation whitespace prefix.
        self._maxlinelen = maxlinelen - cws_expanded_len

    def __str__(self):
        """A synonym for self.encode()."""
        return self.encode()

    def __unicode__(self):
        """Helper for the built-in unicode function."""
        uchunks = []
        lastcs = None
        for s, charset in self._chunks:
            # We must preserve spaces between encoded and non-encoded word
            # boundaries, which means for us we need to add a space when we go
            # from a charset to None/us-ascii, or from None/us-ascii to a
            # charset.  Only do this for the second and subsequent chunks.
            nextcs = charset
            if uchunks:
                if lastcs not in (None, 'us-ascii'):
                    if nextcs in (None, 'us-ascii'):
                        uchunks.append(USPACE)
                        nextcs = None
                elif nextcs not in (None, 'us-ascii'):
                    uchunks.append(USPACE)
            lastcs = nextcs
            uchunks.append(unicode(s, str(charset)))
        return UEMPTYSTRING.join(uchunks)

    # Rich comparison operators for equality only.  BAW: does it make sense to
    # have or explicitly disable <, <=, >, >= operators?
    def __eq__(self, other):
        """Compare the encoded form of this header with other."""
        # other may be a Header or a string.  Both are fine so coerce
        # ourselves to a string, swap the args and do another comparison.
        return other == self.encode()

    def __ne__(self, other):
        """Return the negation of self == other."""
        return not self == other

    def append(self, s, charset=None, errors='strict'):
        """Append a string to the MIME header.

        Optional charset, if given, should be a Charset instance or the name
        of a character set (which will be converted to a Charset instance).  A
        value of None (the default) means that the charset given in the
        constructor is used.

        s may be a byte string or a Unicode string.  If it is a byte string
        (i.e. isinstance(s, str) is true), then charset is the encoding of
        that byte string, and a UnicodeError will be raised if the string
        cannot be decoded with that charset.  If s is a Unicode string, then
        charset is a hint specifying the character set of the characters in
        the string.  In this case, when producing an RFC 2822 compliant header
        using RFC 2047 rules, the Unicode string will be encoded using the
        following charsets in order: us-ascii, the charset hint, utf-8.  The
        first character set not to provoke a UnicodeError is used.

        Optional `errors' is passed as the third argument to any unicode() or
        ustr.encode() call.

        An AssertionError is raised if a Unicode string cannot be encoded
        with any of the three candidate charsets.
        """
        if charset is None:
            charset = self._charset
        elif not isinstance(charset, Charset):
            charset = Charset(charset)
        # If the charset is our faux 8bit charset, leave the string unchanged
        if charset != '8bit':
            # We need to test that the string can be converted to unicode and
            # back to a byte string, given the input and output codecs of the
            # charset.
            if isinstance(s, str):
                # Possibly raise UnicodeError if the byte string can't be
                # converted to a unicode with the input codec of the charset.
                incodec = charset.input_codec or 'us-ascii'
                ustr = unicode(s, incodec, errors)
                # Now make sure that the unicode could be converted back to a
                # byte string with the output codec, which may be different
                # than the input codec.  Still, use the original byte string.
                outcodec = charset.output_codec or 'us-ascii'
                ustr.encode(outcodec, errors)
            elif isinstance(s, unicode):
                # Now we have to be sure the unicode string can be converted
                # to a byte string with a reasonable output codec.  We want to
                # use the byte string in the chunk.
                for charset in USASCII, charset, UTF8:
                    try:
                        outcodec = charset.output_codec or 'us-ascii'
                        s = s.encode(outcodec, errors)
                        break
                    except UnicodeError:
                        pass
                else:
                    # Raised explicitly rather than via `assert' so that the
                    # failure is not compiled away under -O, which would
                    # store an unencoded unicode chunk.
                    raise AssertionError('utf-8 conversion failed')
        self._chunks.append((s, charset))

    def _split(self, s, charset, maxlinelen, splitchars):
        """Split s into a list of (chunk, charset) pairs.

        maxlinelen bounds the first chunk; the remainder is split
        recursively against self._maxlinelen.  splitchars is only used for
        us-ascii text, where it is handed to self._split_ascii().
        """
        # Split up a header safely for use with encode_chunks.
        splittable = charset.to_splittable(s)
        encoded = charset.from_splittable(splittable, True)
        elen = charset.encoded_header_len(encoded)
        # If the line's encoded length fits, just return it
        if elen <= maxlinelen:
            return [(encoded, charset)]
        # If we have undetermined raw 8bit characters sitting in a byte
        # string, we really don't know what the right thing to do is.  We
        # can't really split it because it might be multibyte data which we
        # could break if we split it between pairs.  The least harm seems to
        # be to not split the header at all, but that means they could go out
        # longer than maxlinelen.
        if charset == '8bit':
            return [(s, charset)]
        # BAW: I'm not sure what the right test here is.  What we're trying to
        # do is be faithful to RFC 2822's recommendation that ($2.2.3):
        #
        # "Note: Though structured field bodies are defined in such a way that
        # folding can take place between many of the lexical tokens (and even
        # within some of the lexical tokens), folding SHOULD be limited to
        # placing the CRLF at higher-level syntactic breaks."
        #
        # For now, I can only imagine doing this when the charset is us-ascii,
        # although it's possible that other charsets may also benefit from the
        # higher-level syntactic breaks.
        elif charset == 'us-ascii':
            return self._split_ascii(s, charset, maxlinelen, splitchars)
        # BAW: should we use encoded?
        elif elen == len(s):
            # We can split on _maxlinelen boundaries because we know that the
            # encoding won't change the size of the string
            splitpnt = maxlinelen
            first = charset.from_splittable(splittable[:splitpnt], False)
            last = charset.from_splittable(splittable[splitpnt:], False)
        else:
            # Binary search for split point
            first, last = _binsplit(splittable, charset, maxlinelen)
        # first is of the proper length so just wrap it in the appropriate
        # chrome.  last must be recursively split.
        fsplittable = charset.to_splittable(first)
        fencoded = charset.from_splittable(fsplittable, True)
        chunk = [(fencoded, charset)]
        return chunk + self._split(last, charset, self._maxlinelen, splitchars)

    def _split_ascii(self, s, charset, firstlen, splitchars):
        """Split ASCII text s and pair every resulting line with charset."""
        chunks = _split_ascii(s, firstlen, self._maxlinelen,
                              self._continuation_ws, splitchars)
        return [(chunk, charset) for chunk in chunks]

    def _encode_chunks(self, newchunks, maxlinelen):
        """Encode (string, charset) pairs and join them into one string."""
        # MIME-encode a header with many different charsets and/or encodings.
        #
        # Given a list of pairs (string, charset), return a MIME-encoded
        # string suitable for use in a header field.  Each pair may have
        # different charsets and/or encodings, and the resulting header will
        # accurately reflect each setting.
        #
        # Each encoding can be email.utils.QP (quoted-printable, for
        # ASCII-like character sets like iso-8859-1), email.utils.BASE64
        # (Base64, for non-ASCII like character sets like KOI8-R and
        # iso-2022-jp), or None (no encoding).
        #
        # Each pair will be represented on a separate line; the resulting
        # string will be in the format:
        #
        # =?charset1?q?Mar=EDa_Gonz=E1lez_Alonso?=\n
        #  =?charset2?b?SvxyZ2VuIEL2aW5n?="
        chunks = []
        for header, charset in newchunks:
            if not header:
                continue
            if charset is None or charset.header_encoding is None:
                s = header
            else:
                s = charset.header_encode(header)
            # Don't add more folding whitespace than necessary
            if chunks and chunks[-1].endswith(' '):
                extra = ''
            else:
                extra = ' '
            _max_append(chunks, s, maxlinelen, extra)
        joiner = NL + self._continuation_ws
        return joiner.join(chunks)

    def encode(self, splitchars=';, '):
        """Encode a message header into an RFC-compliant format.

        There are many issues involved in converting a given string for use in
        an email header.  Only certain character sets are readable in most
        email clients, and as header strings can only contain a subset of
        7-bit ASCII, care must be taken to properly convert and encode (with
        Base64 or quoted-printable) header strings.  In addition, there is a
        75-character length limit on any given encoded header field, so
        line-wrapping must be performed, even with double-byte character sets.

        This method will do its best to convert the string to the correct
        character set used in email, and encode and line wrap it safely with
        the appropriate scheme for that character set.

        Optional splitchars is a string containing characters to split long
        ASCII lines on, in rough support of RFC 2822's `highest level
        syntactic breaks'.  This doesn't affect RFC 2047 encoded lines.

        Raises HeaderParseError if the encoded value appears to contain an
        embedded header (a newline followed by non-whitespace text and a
        colon).
        """
        newchunks = []
        maxlinelen = self._firstlinelen
        lastlen = 0
        for s, charset in self._chunks:
            # The first bit of the next chunk should be just long enough to
            # fill the next line.  Don't forget the space separating the
            # encoded words.
            targetlen = maxlinelen - lastlen - 1
            if targetlen < charset.encoded_header_len(''):
                # Stick it on the next line
                targetlen = maxlinelen
            newchunks += self._split(s, charset, targetlen, splitchars)
            lastchunk, lastcharset = newchunks[-1]
            lastlen = lastcharset.encoded_header_len(lastchunk)
        value = self._encode_chunks(newchunks, maxlinelen)
        if _embeded_header.search(value):
            raise HeaderParseError("header value appears to contain "
                "an embedded header: {!r}".format(value))
        return value

[413] Fix | Delete

[414] Fix | Delete

[415] Fix | Delete

[416] Fix | Delete

def _split_ascii(s, firstlen, restlen, continuation_ws, splitchars):

[417] Fix | Delete

lines = []

[418] Fix | Delete

maxlen = firstlen

[419] Fix | Delete

for line in s.splitlines():

[420] Fix | Delete

# Ignore any leading whitespace (i.e. continuation whitespace) already

[421] Fix | Delete

# on the line, since we'll be adding our own.

[422] Fix | Delete

line = line.lstrip()

[423] Fix | Delete

if len(line) < maxlen:

[424] Fix | Delete

lines.append(line)

[425] Fix | Delete

maxlen = restlen

[426] Fix | Delete

continue

[427] Fix | Delete

# Attempt to split the line at the highest-level syntactic break

[428] Fix | Delete

# possible. Note that we don't have a lot of smarts about field

[429] Fix | Delete

# syntax; we just try to break on semi-colons, then commas, then

[430] Fix | Delete

# whitespace.

[431] Fix | Delete

for ch in splitchars:

[432] Fix | Delete

if ch in line:

[433] Fix | Delete

break

[434] Fix | Delete

else:

[435] Fix | Delete

# There's nothing useful to split the line on, not even spaces, so

[436] Fix | Delete

# just append this line unchanged

[437] Fix | Delete

lines.append(line)

[438] Fix | Delete

maxlen = restlen

[439] Fix | Delete

continue

[440] Fix | Delete

# Now split the line on the character plus trailing whitespace

[441] Fix | Delete

cre = re.compile(r'%s\s*' % ch)

[442] Fix | Delete

if ch in ';,':

[443] Fix | Delete

eol = ch

[444] Fix | Delete

else:

[445] Fix | Delete

eol = ''

[446] Fix | Delete

joiner = eol + ' '

[447] Fix | Delete

joinlen = len(joiner)

[448] Fix | Delete

wslen = len(continuation_ws.replace('\t', SPACE8))

[449] Fix | Delete

this = []

[450] Fix | Delete

linelen = 0

[451] Fix | Delete

for part in cre.split(line):

[452] Fix | Delete

curlen = linelen + max(0, len(this)-1) * joinlen

[453] Fix | Delete

partlen = len(part)

[454] Fix | Delete

onfirstline = not lines

[455] Fix | Delete

# We don't want to split after the field name, if we're on the

[456] Fix | Delete

# first line and the field name is present in the header string.

[457] Fix | Delete

if ch == ' ' and onfirstline and \

[458] Fix | Delete

len(this) == 1 and fcre.match(this[0]):

[459] Fix | Delete

this.append(part)

[460] Fix | Delete

linelen += partlen

[461] Fix | Delete

elif curlen + partlen > maxlen:

[462] Fix | Delete

if this:

[463] Fix | Delete

lines.append(joiner.join(this) + eol)

[464] Fix | Delete

# If this part is longer than maxlen and we aren't already

[465] Fix | Delete

# splitting on whitespace, try to recursively split this line

[466] Fix | Delete

# on whitespace.

[467] Fix | Delete

if partlen > maxlen and ch != ' ':

[468] Fix | Delete

subl = _split_ascii(part, maxlen, restlen,

[469] Fix | Delete

continuation_ws, ' ')

[470] Fix | Delete

lines.extend(subl[:-1])

[471] Fix | Delete

this = [subl[-1]]

[472] Fix | Delete

else:

[473] Fix | Delete

this = [part]

[474] Fix | Delete

linelen = wslen + len(this[-1])

[475] Fix | Delete

maxlen = restlen

[476] Fix | Delete

else:

[477] Fix | Delete

this.append(part)

[478] Fix | Delete

linelen += partlen

[479] Fix | Delete

# Put any left over parts on a line by themselves

[480] Fix | Delete

if this:

[481] Fix | Delete

lines.append(joiner.join(this))

[482] Fix | Delete

return lines

[483] Fix | Delete

[484] Fix | Delete

[485] Fix | Delete

[486] Fix | Delete

def _binsplit(splittable, charset, maxlinelen):

[487] Fix | Delete

i = 0

[488] Fix | Delete

j = len(splittable)

[489] Fix | Delete

while i < j:

[490] Fix | Delete

# Invariants:

[491] Fix | Delete

# 1. splittable[:k] fits for all k <= i (note that we *assume*,

[492] Fix | Delete

# at the start, that splittable[:0] fits).

[493] Fix | Delete

# 2. splittable[:k] does not fit for any k > j (at the start,

[494] Fix | Delete

# this means we shouldn't look at any k > len(splittable)).

[495] Fix | Delete

# 3. We don't know about splittable[:k] for k in i+1..j.

[496] Fix | Delete

# 4. We want to set i to the largest k that fits, with i <= k <= j.

[497] Fix | Delete

[498] Fix | Delete

m = (i+j+1) >> 1 # ceiling((i+j)/2); i < m <= j

[499] Fix | Delete