Edit File by line

[0] Fix | Delete

# Author: Ben Gertzfield

[1] Fix | Delete

# Contact: email-sig@python.org

[2] Fix | Delete

[3] Fix | Delete

"""Quoted-printable content transfer encoding per RFCs 2045-2047.

This module handles the content transfer encoding method defined in RFC 2045
to encode US ASCII-like 8-bit data called `quoted-printable'. It is used to
safely encode text that is in a character set similar to the 7-bit US ASCII
character set, but that includes some 8-bit characters that are normally not
allowed in email bodies or headers.

Quoted-printable is very space-inefficient for encoding binary files; use the
email.base64mime module for that instead.

This module provides an interface to encode and decode both headers and bodies
with quoted-printable encoding.

RFC 2047 defines a method for including character set information in an
`encoded-word' in a header. This method is commonly used for 8-bit real names
in To:/From:/Cc: etc. fields, as well as Subject: lines.

This module does not do the line wrapping or end-of-line character
conversion necessary for proper internationalized headers; it only
does dumb encoding and decoding. To deal with the various line
wrapping issues, use the email.header module.
"""

[26] Fix | Delete

[27] Fix | Delete

# Names exported by ``from <this module> import *``.
__all__ = [
    'body_decode',
    'body_encode',
    'body_length',
    'decode',
    'decodestring',
    'header_decode',
    'header_encode',
    'header_length',
    'quote',
    'unquote',
    ]

[39] Fix | Delete

[40] Fix | Delete

import re

[41] Fix | Delete

[42] Fix | Delete

from string import ascii_letters, digits, hexdigits

[43] Fix | Delete

[44] Fix | Delete

CRLF = '\r\n'
NL = '\n'
EMPTYSTRING = ''

# Build a mapping of octets to the expansion of that octet. Since we're only
# going to have 256 of these things, this isn't terribly inefficient
# space-wise. Remember that headers and bodies have different sets of safe
# characters. Initialize both maps with the full expansion, and then override
# the safe bytes with the more compact form.
_QUOPRI_MAP = ['=%02X' % c for c in range(256)]
_QUOPRI_HEADER_MAP = _QUOPRI_MAP[:]
_QUOPRI_BODY_MAP = _QUOPRI_MAP[:]

# Safe header bytes which need no encoding.
for c in b'-!*+/' + ascii_letters.encode('ascii') + digits.encode('ascii'):
    _QUOPRI_HEADER_MAP[c] = chr(c)
# Headers have one other special encoding; spaces become underscores.
_QUOPRI_HEADER_MAP[ord(' ')] = '_'

# Safe body bytes which need no encoding.  Note that '=' is deliberately
# absent from this set, so it keeps its '=3D' expansion.
for c in (b' !"#$%&\'()*+,-./0123456789:;<>'
          b'?@ABCDEFGHIJKLMNOPQRSTUVWXYZ[\\]^_`'
          b'abcdefghijklmnopqrstuvwxyz{|}~\t'):
    _QUOPRI_BODY_MAP[c] = chr(c)

[68] Fix | Delete

[69] Fix | Delete

[70] Fix | Delete

[71] Fix | Delete

# Helpers

[72] Fix | Delete

def header_check(octet):
    """Return True if the octet should be escaped with header quopri.

    :param octet: An integer byte value used to index the 256-entry header
        map.
    :return: True when the header map entry for the octet differs from the
        plain character, which includes the space -> '_' substitution.
    """
    return chr(octet) != _QUOPRI_HEADER_MAP[octet]

[75] Fix | Delete

[76] Fix | Delete

[77] Fix | Delete

def body_check(octet):
    """Return True if the octet should be escaped with body quopri.

    :param octet: An integer byte value used to index the 256-entry body
        map.
    :return: True when the body map entry for the octet differs from the
        plain character.
    """
    return chr(octet) != _QUOPRI_BODY_MAP[octet]

[80] Fix | Delete

[81] Fix | Delete

[82] Fix | Delete

def header_length(bytearray):
    """Return a header quoted-printable encoding length.

    Note that this does not include any RFC 2047 chrome added by
    `header_encode()`.

    :param bytearray: An array of bytes (a.k.a. octets).  The parameter name
        shadows the builtin but is kept for keyword-argument compatibility.
    :return: The length in bytes of the byte array when it is encoded with
        quoted-printable for headers.
    """
    # Each octet contributes either 1 (safe or '_') or 3 ('=XX') characters.
    return sum(len(_QUOPRI_HEADER_MAP[octet]) for octet in bytearray)

[93] Fix | Delete

[94] Fix | Delete

[95] Fix | Delete

def body_length(bytearray):
    """Return a body quoted-printable encoding length.

    :param bytearray: An array of bytes (a.k.a. octets).  The parameter name
        shadows the builtin but is kept for keyword-argument compatibility.
    :return: The length in bytes of the byte array when it is encoded with
        quoted-printable for bodies.
    """
    # Each octet contributes either 1 (safe) or 3 ('=XX') characters.
    return sum(len(_QUOPRI_BODY_MAP[octet]) for octet in bytearray)

[103] Fix | Delete

[104] Fix | Delete

[105] Fix | Delete

def _max_append(L, s, maxlen, extra=''):

[106] Fix | Delete

if not isinstance(s, str):

[107] Fix | Delete

s = chr(s)

[108] Fix | Delete

if not L:

[109] Fix | Delete

L.append(s.lstrip())

[110] Fix | Delete

elif len(L[-1]) + len(s) <= maxlen:

[111] Fix | Delete

L[-1] += extra + s

[112] Fix | Delete

else:

[113] Fix | Delete

L.append(s.lstrip())

[114] Fix | Delete

[115] Fix | Delete

[116] Fix | Delete

def unquote(s):
    """Turn a string in the form =AB to the ASCII character with value 0xab

    :param s: A string whose characters at index 1 and 2 are hex digits;
        the character at index 0 (normally '=') is ignored.
    :return: The one-character string with that code point.
    :raises ValueError: if s[1:3] is not a valid hexadecimal number.
    """
    return chr(int(s[1:3], 16))

[119] Fix | Delete

[120] Fix | Delete

[121] Fix | Delete

def quote(c):
    """Return the '=XX' quoted-printable expansion of the character c.

    :param c: A single character whose code point is used to index the
        256-entry expansion map.
    :return: The three-character string '=XX' with uppercase hex digits.
    """
    return _QUOPRI_MAP[ord(c)]

[123] Fix | Delete

[124] Fix | Delete

[125] Fix | Delete

def header_encode(header_bytes, charset='iso-8859-1'):
    """Encode a single header line with quoted-printable (like) encoding.

    Defined in RFC 2047, this `Q' encoding is similar to quoted-printable, but
    used specifically for email header fields to allow charsets with mostly 7
    bit characters (and some 8 bit) to remain more or less readable in non-RFC
    2047 aware mail clients.

    :param header_bytes: The header value as bytes.
    :param charset: Names the character set to put in the RFC 2047
        encoded-word.  It defaults to iso-8859-1.
    :return: An empty string for empty input, otherwise the string
        '=?<charset>?q?<encoded text>?='.
    """
    # Return empty headers as an empty string.
    if not header_bytes:
        return ''
    # Decoding as latin-1 turns each byte into the code point of the same
    # value, so translate() can look every one up in the 256-entry map.
    encoded = header_bytes.decode('latin1').translate(_QUOPRI_HEADER_MAP)
    # Now add the RFC chrome around the encoded text.
    return '=?%s?q?%s?=' % (charset, encoded)

[144] Fix | Delete

[145] Fix | Delete

[146] Fix | Delete

# The body encoder must leave CR and LF untouched so that the encoded text
# can still be split into lines afterwards.
_QUOPRI_BODY_ENCODE_MAP = _QUOPRI_BODY_MAP[:]
for c in b'\r\n':
    _QUOPRI_BODY_ENCODE_MAP[c] = chr(c)


def body_encode(body, maxlinelen=76, eol=NL):
    """Encode with quoted-printable, wrapping at maxlinelen characters.

    Each line of encoded text will end with eol, which defaults to "\\n". Set
    this to "\\r\\n" if you will be using the result of this function directly
    in an email.

    Each line will be wrapped at, at most, maxlinelen characters before the
    eol string (maxlinelen defaults to 76 characters, the maximum value
    permitted by RFC 2045). Long lines will have the 'soft line break'
    quoted-printable character "=" appended to them, so the decoded text will
    be identical to the original text.

    The minimum maxlinelen is 4 to have room for a quoted character ("=XX")
    followed by a soft line break. Smaller values will generate a
    ValueError.

    :param body: The text to encode, as a str.
    :param maxlinelen: Maximum encoded line length, not counting eol.
    :param eol: The line separator used to join the encoded lines.
    :return: The encoded text; a falsy body is returned unchanged.
    :raises ValueError: if maxlinelen is less than 4.
    """

    if maxlinelen < 4:
        raise ValueError("maxlinelen must be at least 4")
    if not body:
        return body

    # quote special characters
    body = body.translate(_QUOPRI_BODY_ENCODE_MAP)

    soft_break = '=' + eol
    # leave space for the '=' at the end of a line
    maxlinelen1 = maxlinelen - 1

    encoded_body = []
    append = encoded_body.append

    for line in body.splitlines():
        # break up the line into pieces no longer than maxlinelen - 1
        start = 0
        laststart = len(line) - 1 - maxlinelen
        while start <= laststart:
            stop = start + maxlinelen1
            # make sure we don't break up an escape sequence
            if line[stop - 2] == '=':
                append(line[start:stop - 1])
                start = stop - 2
            elif line[stop - 1] == '=':
                append(line[start:stop])
                start = stop - 1
            else:
                append(line[start:stop] + '=')
                start = stop

        # handle rest of line, special case if line ends in whitespace
        if line and line[-1] in ' \t':
            room = start - laststart
            if room >= 3:
                # It's a whitespace character at end-of-line, and we have room
                # for the three-character quoted encoding.
                q = quote(line[-1])
            elif room == 2:
                # There's room for the whitespace character and a soft break.
                q = line[-1] + soft_break
            else:
                # There's room only for a soft break. The quoted whitespace
                # will be the only content on the subsequent line.
                q = soft_break + quote(line[-1])
            append(line[start:-1] + q)
        else:
            append(line[start:])

    # add back final newline if present
    if body[-1] in CRLF:
        append('')

    return eol.join(encoded_body)

[224] Fix | Delete

[225] Fix | Delete

[226] Fix | Delete

[227] Fix | Delete

# BAW: I'm not sure if the intent was for the signature of this function to be
# the same as base64MIME.decode() or not...
def decode(encoded, eol=NL):
    """Decode a quoted-printable string.

    Lines are separated with eol, which defaults to \\n.

    :param encoded: The quoted-printable text, as a str.
    :param eol: The separator emitted after each decoded line.
    :return: The decoded text; a falsy input is returned unchanged.
    """
    if not encoded:
        return encoded
    # Collect the decoded pieces and join them once at the end instead of
    # growing a string with repeated concatenation.
    pieces = []
    add = pieces.append

    for line in encoded.splitlines():
        # Trailing whitespace on an encoded line is not significant.
        line = line.rstrip()
        if not line:
            add(eol)
            continue

        i = 0
        n = len(line)
        while i < n:
            c = line[i]
            if c != '=':
                add(c)
                i += 1
            # Otherwise, c == "=". Are we at the end of the line? If so, it
            # is a soft line break: emit nothing, not even eol.
            elif i+1 == n:
                i += 1
                continue
            # Decode if in form =AB
            elif i+2 < n and line[i+1] in hexdigits and line[i+2] in hexdigits:
                add(unquote(line[i:i+3]))
                i += 3
            # Otherwise, not in form =AB, pass literally
            else:
                add(c)
                i += 1

            if i == n:
                add(eol)

    decoded = ''.join(pieces)
    # Special case if original string did not end with eol: drop the eol that
    # was appended after the last line.  Strip the whole separator (it may be
    # longer than one character, e.g. '\r\n'), and do nothing for an empty
    # eol, where there is nothing to remove.
    if eol and encoded[-1] not in '\r\n' and decoded.endswith(eol):
        decoded = decoded[:-len(eol)]
    return decoded


# For convenience and backwards compatibility w/ standard base64 module
body_decode = decode
decodestring = decode

[279] Fix | Delete

[280] Fix | Delete

[281] Fix | Delete

[282] Fix | Delete

def _unquote_match(match):
    """Turn a match in the form =AB to the ASCII character with value 0xab

    :param match: A regular expression match object whose whole matched text
        has the form '=AB'.
    :return: The one-character string produced by unquote() for that text.
    """
    s = match.group(0)
    return unquote(s)

[286] Fix | Delete

[287] Fix | Delete

[288] Fix | Delete

# Header decoding is done a bit differently
def header_decode(s):
    """Decode a string encoded with RFC 2047 MIME header `Q' encoding.

    This function does not parse a full MIME header value encoded with
    quoted-printable (like =?iso-8859-1?q?Hello_World?=) -- please use
    the high level email.header class for that functionality.

    :param s: The encoded text of the header, as a str.
    :return: The text with every '_' turned into a space and every '=XX'
        hex escape replaced by the character with that value.
    """
    # Underscores stand for spaces in the header encoding; undo that first.
    s = s.replace('_', ' ')
    return re.sub(r'=[a-fA-F0-9]{2}', _unquote_match, s, flags=re.ASCII)

[298] Fix | Delete

[299] Fix | Delete