# Author: Ben Gertzfield
# Contact: email-sig@python.org

"""Quoted-printable content transfer encoding per RFCs 2045-2047.

This module handles the content transfer encoding method defined in RFC 2045
to encode US ASCII-like 8-bit data called `quoted-printable'.  It is used to
safely encode text that is in a character set similar to the 7-bit US ASCII
character set, but that includes some 8-bit characters that are normally not
allowed in email bodies or headers.

Quoted-printable is very space-inefficient for encoding binary files; use the
email.base64mime module for that instead.

This module provides an interface to encode and decode both headers and bodies
with quoted-printable encoding.

RFC 2047 defines a method for including character set information in an
`encoded-word' in a header.  This method is commonly used for 8-bit real names
in To:/From:/Cc: etc. fields, as well as Subject: lines.

This module does not do the line wrapping or end-of-line character
conversion necessary for proper internationalized headers; it only
does dumb encoding and decoding.  To deal with the various line
wrapping issues, use the email.header module.
"""

__all__ = [
    'body_decode',
    'body_encode',
    'body_quopri_check',
    'body_quopri_len',
    'decode',
    'decodestring',
    'encode',
    'encodestring',
    'header_decode',
    'header_encode',
    'header_quopri_check',
    'header_quopri_len',
    'quote',
    'unquote',
    ]

import re

from string import hexdigits
from email.utils import fix_eols

CRLF = '\r\n'
NL = '\n'

# See also Charset.py
# Number of characters of fixed chrome in an encoded-word ("=?", "?q?" and
# "?="); header_encode() subtracts it, together with the charset name, from
# the per-line budget.
MISC_LEN = 7

# Matches any single character that is NOT an ASCII letter, a digit, one of
# "-!*+/" or a space, i.e. a character that header Q encoding must escape.
hqre = re.compile(r'[^-a-zA-Z0-9!*+/ ]')
# Matches any single character that is NOT a space, a tab, or in the ranges
# "!".."<" and ">".."~"; note that "=" itself falls outside those ranges and
# therefore matches, so it is always escaped in bodies.
bqre = re.compile(r'[^ !-<>-~\t]')



# Helpers

def header_quopri_check(c):
    """Return True if the character should be escaped with header quopri.

    c is tested against the module-level pattern hqre, which matches every
    character outside the set of characters emitted verbatim in headers.
    """
    return bool(hqre.match(c))

def body_quopri_check(c):
    """Return True if the character should be escaped with body quopri.

    c is tested against the module-level pattern bqre, which matches every
    character outside the set of characters emitted verbatim in bodies.
    """
    return bool(bqre.match(c))

def header_quopri_len(s):
    """Return the length of s when it is encoded with header quopri.

    Every character matched by hqre becomes a three character "=XX" escape;
    every other character (including a space, which is written as "_")
    occupies a single character.
    """
    return sum(3 if hqre.match(c) else 1 for c in s)

def body_quopri_len(str):
    """Return the length of str when it is encoded with body quopri.

    Every character matched by bqre becomes a three character "=XX" escape;
    every other character occupies a single character.  Soft line breaks
    that encode() may insert are not counted.
    """
    # NOTE: the parameter shadows the builtin `str`; the name is kept because
    # it is part of the public signature.
    return sum(3 if bqre.match(c) else 1 for c in str)

def _max_append(L, s, maxlen, extra=''):
    """Add s to the list of chunks L without letting a chunk outgrow maxlen.

    If L is non-empty and its last chunk plus s is at most maxlen characters
    long, s is glued onto that chunk, separated by extra.  Otherwise s, with
    leading whitespace stripped, starts a new chunk.  L is modified in place
    and nothing is returned.

    Note that the length of extra is not taken into account by the length
    test, so a chunk can exceed maxlen by len(extra).
    """
    if L and len(L[-1]) + len(s) <= maxlen:
        L[-1] += extra + s
    else:
        L.append(s.lstrip())

def unquote(s):
    """Turn a string in the form =AB to the ASCII character with value 0xab.

    Only s[1:3] is examined: the first character is assumed to be the "="
    marker and anything after the two hex digits is ignored.  int() raises
    ValueError if those two characters are not valid hexadecimal.
    """
    return chr(int(s[1:3], 16))

def quote(c):
    """Return the quoted-printable escape for the single character c.

    The result is "=" followed by ord(c) written as upper-case hexadecimal,
    zero-padded to at least two digits (e.g. "=" becomes "=3D").
    """
    return "=%02X" % ord(c)

def header_encode(header, charset="iso-8859-1", keep_eols=False,
                  maxlinelen=76, eol=NL):
    """Encode a single header line with quoted-printable (like) encoding.

    Defined in RFC 2047, this `Q' encoding is similar to quoted-printable, but
    used specifically for email header fields to allow charsets with mostly 7
    bit characters (and some 8 bit) to remain more or less readable in non-RFC
    2047 aware mail clients.

    charset names the character set to use to encode the header.  It defaults
    to iso-8859-1.

    The resulting string will be in the form:

    "=?charset?q?I_f=E2rt_in_your_g=E8n=E8ral_dire=E7tion?=\\n
     =?charset?q?Silly_=C8nglish_Kn=EEghts?="

    with each line wrapped safely at, at most, maxlinelen characters (defaults
    to 76 characters).  If maxlinelen is None, the entire string is encoded in
    one chunk with no splitting.

    End-of-line characters (\\r, \\n, \\r\\n) will be automatically converted
    to the canonical email line separator \\r\\n unless the keep_eols
    parameter is True (the default is False).

    Each line of the header will be terminated in the value of eol, which
    defaults to "\\n".  Set this to "\\r\\n" if you are using the result of
    this function directly in email.

    An empty header is returned unchanged.
    """
    # Return empty headers unchanged
    if not header:
        return header

    if not keep_eols:
        header = fix_eols(header)

    # Work out how many encoded characters fit in one encoded-word once the
    # RFC chrome ("=?charset?q?...?=") and the continuation line's leading
    # space have been accounted for.
    if maxlinelen is None:
        # An obnoxiously large number that's good enough
        max_encoded = 100000
    else:
        max_encoded = maxlinelen - len(charset) - MISC_LEN - 1

    # Quopri encode character by character.  Each piece (a literal character
    # or a complete "=XX" escape) is handed to _max_append() whole, so an
    # escape is never split across two chunks.
    quoted = []
    for c in header:
        if c == ' ':
            # Space may be represented as _ instead of =20 for readability
            piece = '_'
        elif not hqre.match(c):
            # These characters can be included verbatim
            piece = c
        else:
            # Otherwise, replace with hex value like =E2
            piece = "=%02X" % ord(c)
        _max_append(quoted, piece, max_encoded)

    # Now add the RFC chrome to each encoded chunk and glue the chunks
    # together.  BAW: should we be able to specify the leading whitespace in
    # the joiner?
    joiner = eol + ' '
    return joiner.join(['=?%s?q?%s?=' % (charset, line) for line in quoted])

def encode(body, binary=False, maxlinelen=76, eol=NL):
    """Encode with quoted-printable, wrapping at maxlinelen characters.

    If binary is False (the default), end-of-line characters will be converted
    to the canonical email end-of-line sequence \\r\\n.  Otherwise they will
    be left verbatim.

    Each line of encoded text will end with eol, which defaults to "\\n".  Set
    this to "\\r\\n" if you will be using the result of this function directly
    in an email.

    Each line will be wrapped at, at most, maxlinelen characters (defaults to
    76 characters).  Long lines will have the `soft linefeed' quoted-printable
    character "=" appended to them, so the decoded text will be identical to
    the original text.

    An empty body is returned unchanged.
    """
    if not body:
        return body

    if not binary:
        body = fix_eols(body)

    # Output fragments are collected in a list and joined once at the end,
    # which avoids repeated string concatenation on the (potentially large)
    # body.
    chunks = []
    # Preserve line endings here so we can check later to see an eol needs to
    # be added to the output later.
    lines = body.splitlines(True)
    last_index = len(lines) - 1
    for lineno, raw in enumerate(lines):
        # But strip off line-endings for processing this line.
        if raw.endswith(CRLF):
            line = raw[:-2]
            has_eol = True
        elif raw[-1] in CRLF:
            line = raw[:-1]
            has_eol = True
        else:
            line = raw
            has_eol = False

        encoded_line = ''
        linelen = len(line)
        # Now we need to examine every character to see if it needs to be
        # quopri encoded.
        for j, c in enumerate(line):
            if bqre.match(c):
                c = quote(c)
            elif j + 1 == linelen:
                # Last character of the line and it needs no escaping.
                # Whitespace here is a special case: it is not emitted now
                # but handled after the loop.
                if c not in ' \t':
                    encoded_line += c
                continue
            # Check to see to see if the line has reached its maximum length;
            # if so, flush it with a soft line break.
            if len(encoded_line) + len(c) >= maxlinelen:
                chunks.append(encoded_line + '=' + eol)
                encoded_line = ''
            encoded_line += c

        # Now at end of line..  `trailing` is the raw final character of the
        # line, or '' for an empty line.
        trailing = line[-1:]
        if trailing and trailing in ' \t':
            if lineno == last_index:
                # Special case for whitespace at end of file: it must be
                # escaped.
                trailing = quote(trailing)
                if len(encoded_line) + len(trailing) > maxlinelen:
                    chunks.append(encoded_line + '=' + eol + trailing)
                else:
                    chunks.append(encoded_line + trailing)
            else:
                # Just normal whitespace at end of line: follow it with a
                # soft line break.
                chunks.append(encoded_line + trailing + '=' + eol)
            encoded_line = ''

        # Now look at the line we just finished and it has a line ending, we
        # need to add eol to the end of the line.
        if has_eol:
            chunks.append(encoded_line + eol)
        else:
            chunks.append(encoded_line)
    return ''.join(chunks)

# For convenience and backwards compatibility w/ standard base64 module
body_encode = encode
encodestring = encode

# BAW: I'm not sure if the intent was for the signature of this function to be
# the same as base64MIME.decode() or not...
def decode(encoded, eol=NL):
    """Decode a quoted-printable string.

    Lines are separated with eol, which defaults to \\n.

    Trailing whitespace on each input line is discarded before decoding.  A
    line ending in "=" is a soft line break and is joined to the following
    line.  An "=" that is not followed by two hexadecimal digits is passed
    through literally.  An empty input is returned unchanged.  If the input
    does not end with eol, neither does the result.
    """
    if not encoded:
        return encoded

    # Decoded fragments are collected in a list and joined once at the end
    # rather than being accumulated by string concatenation.
    parts = []

    for line in encoded.splitlines():
        line = line.rstrip()
        if not line:
            parts.append(eol)
            continue

        i = 0
        n = len(line)
        while i < n:
            c = line[i]
            if c != '=':
                parts.append(c)
                i += 1
            # Otherwise, c == "=".  Are we at the end of the line?  If so, add
            # a soft line break, i.e. consume the "=" and skip the eol below.
            elif i + 1 == n:
                i += 1
                continue
            # Decode if in form =AB
            elif (i + 2 < n and line[i + 1] in hexdigits
                  and line[i + 2] in hexdigits):
                parts.append(unquote(line[i:i + 3]))
                i += 3
            # Otherwise, not in form =AB, pass literally
            else:
                parts.append(c)
                i += 1

            # The line was consumed without ending in a soft break.
            if i == n:
                parts.append(eol)

    decoded = ''.join(parts)
    # Special case if original string did not end with eol.  Strip the whole
    # eol sequence, which may be longer than one character (e.g. "\r\n").
    if not encoded.endswith(eol) and decoded.endswith(eol):
        decoded = decoded[:-len(eol)]
    return decoded

# For convenience and backwards compatibility w/ standard base64 module
body_decode = decode
decodestring = decode

def _unquote_match(match):
    """Turn a match in the form =AB to the ASCII character with value 0xab.

    match is a regular expression match object whose whole matched text
    (group 0) is passed to unquote(); this makes the function usable as the
    replacement callback of re.sub().
    """
    return unquote(match.group(0))

# Header decoding is done a bit differently
def header_decode(s):
    """Decode a string encoded with RFC 2047 MIME header `Q' encoding.

    Every underscore in s is turned into a space, and then every "=XX"
    sequence (XX being two hexadecimal digits) is replaced by the character
    with that value.  Anything else is left untouched.

    This function does not parse a full MIME header value encoded with
    quoted-printable (like =?iso-8859-1?q?Hello_World?=) -- please use
    the high level email.header class for that functionality.
    """
    s = s.replace('_', ' ')
    return re.sub(r'=[a-fA-F0-9]{2}', _unquote_match, s)