Edit File by line

"""A parser for XML, using the derived class as static DTD."""

[0] Fix | Delete

[1] Fix | Delete

# Author: Sjoerd Mullender.

[2] Fix | Delete

[3] Fix | Delete

import re

[4] Fix | Delete

import string

[5] Fix | Delete

[6] Fix | Delete

import warnings

[7] Fix | Delete

warnings.warn("The xmllib module is obsolete. Use xml.sax instead.",

[8] Fix | Delete

DeprecationWarning, 2)

[9] Fix | Delete

del warnings

[10] Fix | Delete

[11] Fix | Delete

version = '0.3'

[12] Fix | Delete

[13] Fix | Delete

class Error(RuntimeError):
    """Exception raised by this module's parser.

    Within this file it is raised for an unsupported XML version and for
    internal consistency failures (a parse_* helper called at the wrong
    position, or an unexpected character class in the main loop).
    """

[15] Fix | Delete

[16] Fix | Delete

# Regular expressions used for parsing
#
# The underscore-prefixed names are pattern fragments (plain strings) that
# are spliced into the compiled patterns below.

_S = '[ \t\r\n]+'                       # white space
_opS = '[ \t\r\n]*'                     # optional white space
_Name = '[a-zA-Z_:][-a-zA-Z0-9._:]*'    # valid XML name
_QStr = "(?:'[^']*'|\"[^\"]*\")"        # quoted XML string
illegal = re.compile('[^\t\r\n -\176\240-\377]') # illegal chars in content
interesting = re.compile('[]&<]')       # characters that end a run of data

amp = re.compile('&')
# A reference plus the single character following it; callers inspect that
# character to see whether the reference was terminated by ';'.
ref = re.compile('&(' + _Name + '|#[0-9]+|#x[0-9a-fA-F]+)[^-a-zA-Z0-9._:]')
entityref = re.compile('&(?P<name>' + _Name + ')[^-a-zA-Z0-9._:]')
charref = re.compile('&#(?P<char>[0-9]+[^0-9]|x[0-9a-fA-F]+[^0-9a-fA-F])')
space = re.compile(_S + '$')
newline = re.compile('\n')

attrfind = re.compile(
    _S + '(?P<name>' + _Name + ')'
    '(' + _opS + '=' + _opS +
    '(?P<value>'+_QStr+'|[-a-zA-Z0-9.:+*%?!_#=~]+))?')
starttagopen = re.compile('<' + _Name)
starttagend = re.compile(_opS + '(?P<slash>/?)>')
starttagmatch = re.compile('<(?P<tagname>'+_Name+')'
                           '(?P<attrs>(?:'+attrfind.pattern+')*)'+
                           starttagend.pattern)
endtagopen = re.compile('</')
endbracket = re.compile(_opS + '>')
endbracketfind = re.compile('(?:[^>\'"]|'+_QStr+')*>')
tagfind = re.compile(_Name)
cdataopen = re.compile(r'<!\[CDATA\[')
cdataclose = re.compile(r'\]\]>')
# this matches one of the following:
# SYSTEM SystemLiteral
# PUBLIC PubidLiteral SystemLiteral
# (the two templates take the name of the capturing group via '%')
_SystemLiteral = '(?P<%s>'+_QStr+')'
_PublicLiteral = '(?P<%s>"[-\'+,./:=?;!*#@$_%% \n\ra-zA-Z0-9]*"|' \
                        "'[-+,./:=?;!*#@$_%% \n\ra-zA-Z0-9]*')"
_ExternalId = '(?:SYSTEM|' \
                 'PUBLIC'+_S+_PublicLiteral%'pubid'+ \
              ')'+_S+_SystemLiteral%'syslit'
doctype = re.compile('<!DOCTYPE'+_S+'(?P<name>'+_Name+')'
                     '(?:'+_S+_ExternalId+')?'+_opS)
# Raw strings are used wherever the pattern contains '\?' so that the
# backslash is not treated as a (deprecated) string escape.
xmldecl = re.compile(r'<\?xml'+_S+
                     'version'+_opS+'='+_opS+'(?P<version>'+_QStr+')'+
                     '(?:'+_S+'encoding'+_opS+'='+_opS+
                        "(?P<encoding>'[A-Za-z][-A-Za-z0-9._]*'|"
                        '"[A-Za-z][-A-Za-z0-9._]*"))?'
                     '(?:'+_S+'standalone'+_opS+'='+_opS+
                        '(?P<standalone>\'(?:yes|no)\'|"(?:yes|no)"))?'+
                     _opS+r'\?>')
procopen = re.compile(r'<\?(?P<proc>' + _Name + ')' + _opS)
procclose = re.compile(_opS + r'\?>')
commentopen = re.compile('<!--')
commentclose = re.compile('-->')
doubledash = re.compile('--')
# Translation table that turns space, CR, LF and TAB into a plain space.
# maketrans() requires both arguments to have the same length, hence the
# four spaces in the second argument.
try:
    attrtrans = string.maketrans(' \r\n\t', '    ')
except AttributeError:
    # string.maketrans() is not available on Python 3; str.maketrans()
    # builds an equivalent table for str.translate().
    attrtrans = str.maketrans(' \r\n\t', '    ')

# definitions for XML namespaces
_NCName = '[a-zA-Z_][-a-zA-Z0-9._]*'    # XML Name, minus the ":"
ncname = re.compile(_NCName + '$')
qname = re.compile('(?:(?P<prefix>' + _NCName + '):)?' # optional prefix
                   '(?P<local>' + _NCName + ')$')

xmlns = re.compile('xmlns(?::(?P<ncname>'+_NCName+'))?$')

[80] Fix | Delete

[81] Fix | Delete

# XML parser base class -- find tags and call handler functions.

[82] Fix | Delete

# Usage: p = XMLParser(); p.feed(data); ...; p.close().

[83] Fix | Delete

# The dtd is defined by deriving a class which defines methods with

[84] Fix | Delete

# special names to handle tags: start_foo and end_foo to handle <foo>

[85] Fix | Delete

# and </foo>, respectively. The data between tags is passed to the

[86] Fix | Delete

# parser by calling self.handle_data() with some data as argument (the

[87] Fix | Delete

# data may be split up in arbitrary chunks).

[88] Fix | Delete

[89] Fix | Delete

class XMLParser:

[90] Fix | Delete

attributes = {} # default, to be overridden

[91] Fix | Delete

elements = {} # default, to be overridden

[92] Fix | Delete

[93] Fix | Delete

# parsing options, settable using keyword args in __init__

[94] Fix | Delete

__accept_unquoted_attributes = 0

[95] Fix | Delete

__accept_missing_endtag_name = 0

[96] Fix | Delete

__map_case = 0

[97] Fix | Delete

__accept_utf8 = 0

[98] Fix | Delete

__translate_attribute_references = 1

[99] Fix | Delete

[100] Fix | Delete

# Interface -- initialize and reset this instance

[101] Fix | Delete

def __init__(self, **kw):

[102] Fix | Delete

self.__fixed = 0

[103] Fix | Delete

if 'accept_unquoted_attributes' in kw:

[104] Fix | Delete

self.__accept_unquoted_attributes = kw['accept_unquoted_attributes']

[105] Fix | Delete

if 'accept_missing_endtag_name' in kw:

[106] Fix | Delete

self.__accept_missing_endtag_name = kw['accept_missing_endtag_name']

[107] Fix | Delete

if 'map_case' in kw:

[108] Fix | Delete

self.__map_case = kw['map_case']

[109] Fix | Delete

if 'accept_utf8' in kw:

[110] Fix | Delete

self.__accept_utf8 = kw['accept_utf8']

[111] Fix | Delete

if 'translate_attribute_references' in kw:

[112] Fix | Delete

self.__translate_attribute_references = kw['translate_attribute_references']

[113] Fix | Delete

self.reset()

[114] Fix | Delete

[115] Fix | Delete

def __fixelements(self):

[116] Fix | Delete

self.__fixed = 1

[117] Fix | Delete

self.elements = {}

[118] Fix | Delete

self.__fixdict(self.__dict__)

[119] Fix | Delete

self.__fixclass(self.__class__)

[120] Fix | Delete

[121] Fix | Delete

def __fixclass(self, kl):

[122] Fix | Delete

self.__fixdict(kl.__dict__)

[123] Fix | Delete

for k in kl.__bases__:

[124] Fix | Delete

self.__fixclass(k)

[125] Fix | Delete

[126] Fix | Delete

def __fixdict(self, dict):

[127] Fix | Delete

for key in dict.keys():

[128] Fix | Delete

if key[:6] == 'start_':

[129] Fix | Delete

tag = key[6:]

[130] Fix | Delete

start, end = self.elements.get(tag, (None, None))

[131] Fix | Delete

if start is None:

[132] Fix | Delete

self.elements[tag] = getattr(self, key), end

[133] Fix | Delete

elif key[:4] == 'end_':

[134] Fix | Delete

tag = key[4:]

[135] Fix | Delete

start, end = self.elements.get(tag, (None, None))

[136] Fix | Delete

if end is None:

[137] Fix | Delete

self.elements[tag] = start, getattr(self, key)

[138] Fix | Delete

[139] Fix | Delete

# Interface -- reset this instance. Loses all unprocessed data

[140] Fix | Delete

def reset(self):

[141] Fix | Delete

self.rawdata = ''

[142] Fix | Delete

self.stack = []

[143] Fix | Delete

self.nomoretags = 0

[144] Fix | Delete

self.literal = 0

[145] Fix | Delete

self.lineno = 1

[146] Fix | Delete

self.__at_start = 1

[147] Fix | Delete

self.__seen_doctype = None

[148] Fix | Delete

self.__seen_starttag = 0

[149] Fix | Delete

self.__use_namespaces = 0

[150] Fix | Delete

self.__namespaces = {'xml':None} # xml is implicitly declared

[151] Fix | Delete

# backward compatibility hack: if elements not overridden,

[152] Fix | Delete

# fill it in ourselves

[153] Fix | Delete

if self.elements is XMLParser.elements:

[154] Fix | Delete

self.__fixelements()

[155] Fix | Delete

[156] Fix | Delete

# For derived classes only -- enter literal mode (CDATA) till EOF

[157] Fix | Delete

def setnomoretags(self):

[158] Fix | Delete

self.nomoretags = self.literal = 1

[159] Fix | Delete

[160] Fix | Delete

# For derived classes only -- enter literal mode (CDATA)

[161] Fix | Delete

def setliteral(self, *args):

[162] Fix | Delete

self.literal = 1

[163] Fix | Delete

[164] Fix | Delete

# Interface -- feed some data to the parser. Call this as

[165] Fix | Delete

# often as you want, with as little or as much text as you

[166] Fix | Delete

# want (may include '\n'). (This just saves the text, all the

[167] Fix | Delete

# processing is done by goahead().)

[168] Fix | Delete

def feed(self, data):

[169] Fix | Delete

self.rawdata = self.rawdata + data

[170] Fix | Delete

self.goahead(0)

[171] Fix | Delete

[172] Fix | Delete

# Interface -- handle the remaining data

[173] Fix | Delete

def close(self):

[174] Fix | Delete

self.goahead(1)

[175] Fix | Delete

if self.__fixed:

[176] Fix | Delete

self.__fixed = 0

[177] Fix | Delete

# remove self.elements so that we don't leak

[178] Fix | Delete

del self.elements

[179] Fix | Delete

[180] Fix | Delete

# Interface -- translate references

[181] Fix | Delete

def translate_references(self, data, all = 1):
    """Return *data* with references replaced by the text they stand for.

    data -- string that may contain character references (&#NN; or
            &#xHH;) and entity references (&name;).
    all  -- when true (the default) entity references are looked up in
            self.entitydefs as well; when false only character
            references are translated and well-formed entity references
            are left untouched.

    If the translate_attribute_references option is off, *data* is
    returned unchanged.  Malformed references are reported through
    self.syntax_error(); if that method returns, scanning continues.
    The replacement text of an entity is itself rescanned for
    references.
    """
    if not self.__translate_attribute_references:
        return data
    i = 0
    while 1:
        res = amp.search(data, i)
        if res is None:
            return data
        s = res.start(0)
        res = ref.match(data, s)
        if res is None:
            self.syntax_error("bogus `&'")
            i = s+1
            continue
        i = res.end(0)
        # ``ref`` always consumes exactly one character after the
        # reference; the reference is well-formed only if that
        # character is the terminating ';'.
        terminated = data[i - 1] == ';'
        text = res.group(1)
        rescan = 0
        if text[0] == '#':
            if text[1] == 'x':
                text = chr(int(text[2:], 16))
            else:
                text = chr(int(text[1:]))
            if not terminated:
                self.syntax_error("`;' missing after char reference")
                i = i-1         # keep the character after the digits
        elif all and text in self.entitydefs:
            if not terminated:
                # Report the missing ';' and keep the character that
                # followed the name instead of silently dropping it.
                self.syntax_error("`;' missing after entity reference")
                i = i-1
            text = self.entitydefs[text]
            rescan = 1
        elif not terminated:
            self.syntax_error("bogus `&'")
            i = s + 1           # just past the &
            continue
        elif all:
            self.syntax_error("reference to unknown entity `&%s;'" % text)
            text = '&' + text + ';'
        else:
            # Only character references were requested: leave the entity
            # reference exactly as written and continue after it.
            continue

        # when we get here, text contains the translated text and i
        # points to the end of the string that is to be replaced
        data = data[:s] + text + data[i:]
        if rescan:
            i = s
        else:
            i = s + len(text)

[229] Fix | Delete

[230] Fix | Delete

# Interface - return a dictionary of all namespaces currently valid

[231] Fix | Delete

def getnamespace(self):

[232] Fix | Delete

nsdict = {}

[233] Fix | Delete

for t, d, nst in self.stack:

[234] Fix | Delete

nsdict.update(d)

[235] Fix | Delete

return nsdict

[236] Fix | Delete

[237] Fix | Delete

# Internal -- handle data as far as reasonable. May leave state

[238] Fix | Delete

# and data to be processed by a subsequent call. If 'end' is

[239] Fix | Delete

# true, force handling all data as if followed by EOF marker.

[240] Fix | Delete

def goahead(self, end):
    """Process as much of the buffered input as possible.

    Walks self.rawdata and dispatches character data, start and end
    tags, comments, CDATA sections, the XML declaration, processing
    instructions, the DOCTYPE and character/entity references to the
    corresponding handle_*/parse_* methods, keeping self.lineno up to
    date.  Input that cannot be handled yet (an incomplete construct)
    stays in self.rawdata for the next call.

    end -- true when no more input will follow.  Leftover input is then
           reported as bogus and passed on as data one character at a
           time, and a missing root element or unclosed elements are
           reported (unclosed elements are closed via finish_endtag()).
    """
    rawdata = self.rawdata
    i = 0
    n = len(rawdata)
    while i < n:
        if i > 0:
            self.__at_start = 0
        if self.nomoretags:
            # everything that is left is plain data
            data = rawdata[i:n]
            self.handle_data(data)
            self.lineno += data.count('\n')
            i = n
            break
        res = interesting.search(rawdata, i)
        if res:
            j = res.start(0)
        else:
            j = n
        if i < j:
            # plain character data up to the next special character
            data = rawdata[i:j]
            if self.__at_start and space.match(data) is None:
                self.syntax_error('illegal data at start of file')
            self.__at_start = 0
            if not self.stack and space.match(data) is None:
                self.syntax_error('data not in content')
            if not self.__accept_utf8 and illegal.search(data):
                self.syntax_error('illegal character in content')
            self.handle_data(data)
            self.lineno += data.count('\n')
        i = j
        if i == n:
            break
        if rawdata[i] == '<':
            if starttagopen.match(rawdata, i):
                if self.literal:
                    data = rawdata[i]
                    self.handle_data(data)
                    self.lineno += data.count('\n')
                    i = i+1
                    continue
                k = self.parse_starttag(i)
                if k < 0:
                    break
                self.__seen_starttag = 1
                self.lineno += rawdata[i:k].count('\n')
                i = k
                continue
            if endtagopen.match(rawdata, i):
                k = self.parse_endtag(i)
                if k < 0:
                    break
                self.lineno += rawdata[i:k].count('\n')
                i = k
                continue
            if commentopen.match(rawdata, i):
                if self.literal:
                    data = rawdata[i]
                    self.handle_data(data)
                    self.lineno += data.count('\n')
                    i = i+1
                    continue
                k = self.parse_comment(i)
                if k < 0:
                    break
                self.lineno += rawdata[i:k].count('\n')
                i = k
                continue
            if cdataopen.match(rawdata, i):
                k = self.parse_cdata(i)
                if k < 0:
                    break
                self.lineno += rawdata[i:k].count('\n')
                i = k
                continue
            res = xmldecl.match(rawdata, i)
            if res:
                if not self.__at_start:
                    self.syntax_error("<?xml?> declaration not at start of document")
                version, encoding, standalone = res.group('version',
                                                          'encoding',
                                                          'standalone')
                if version[1:-1] != '1.0':
                    raise Error('only XML version 1.0 supported')
                if encoding:
                    encoding = encoding[1:-1]       # remove quotes
                if standalone:
                    standalone = standalone[1:-1]   # remove quotes
                self.handle_xml(encoding, standalone)
                # The declaration may span several lines; account for
                # them like every other construct does.
                self.lineno += res.group(0).count('\n')
                i = res.end(0)
                continue
            res = procopen.match(rawdata, i)
            if res:
                k = self.parse_proc(i)
                if k < 0:
                    break
                self.lineno += rawdata[i:k].count('\n')
                i = k
                continue
            res = doctype.match(rawdata, i)
            if res:
                if self.literal:
                    data = rawdata[i]
                    self.handle_data(data)
                    self.lineno += data.count('\n')
                    i = i+1
                    continue
                if self.__seen_doctype:
                    self.syntax_error('multiple DOCTYPE elements')
                if self.__seen_starttag:
                    self.syntax_error('DOCTYPE not at beginning of document')
                k = self.parse_doctype(res)
                if k < 0:
                    break
                self.__seen_doctype = res.group('name')
                if self.__map_case:
                    self.__seen_doctype = self.__seen_doctype.lower()
                self.lineno += rawdata[i:k].count('\n')
                i = k
                continue
        elif rawdata[i] == '&':
            if self.literal:
                data = rawdata[i]
                self.handle_data(data)
                i = i+1
                continue
            # Note on line counting for references: the text consumed
            # here ('&', the name or number, and a terminating ';') can
            # never contain a newline.  A newline that follows an
            # unterminated reference is left in the buffer and counted
            # when it is handled as data, so lineno is not touched here.
            res = charref.match(rawdata, i)
            if res is not None:
                i = res.end(0)
                if rawdata[i-1] != ';':
                    self.syntax_error("`;' missing in charref")
                    i = i-1
                if not self.stack:
                    self.syntax_error('data not in content')
                self.handle_charref(res.group('char')[:-1])
                continue
            res = entityref.match(rawdata, i)
            if res is not None:
                i = res.end(0)
                if rawdata[i-1] != ';':
                    self.syntax_error("`;' missing in entityref")
                    i = i-1
                name = res.group('name')
                if self.__map_case:
                    name = name.lower()
                if name in self.entitydefs:
                    # Splice the replacement text into the buffer and
                    # rescan from the start of the replacement.
                    self.rawdata = rawdata = rawdata[:res.start(0)] + self.entitydefs[name] + rawdata[i:]
                    n = len(rawdata)
                    i = res.start(0)
                else:
                    self.unknown_entityref(name)
                continue
        elif rawdata[i] == ']':
            if self.literal:
                data = rawdata[i]
                self.handle_data(data)
                i = i+1
                continue
            if n-i < 3:
                # not enough input yet to rule out ']]>'
                break
            if cdataclose.match(rawdata, i):
                self.syntax_error("bogus `]]>'")
            self.handle_data(rawdata[i])
            i = i+1
            continue
        else:
            raise Error('neither < nor & ??')
        # We get here only if incomplete matches but
        # nothing else
        break
    # end while
    if i > 0:
        self.__at_start = 0
    if end and i < n:
        # No more input will arrive, so the construct at i can never be
        # completed: report its first character, pass it on as data and
        # process the rest again.
        data = rawdata[i]
        self.syntax_error("bogus `%s'" % data)
        if not self.__accept_utf8 and illegal.search(data):
            self.syntax_error('illegal character in content')
        self.handle_data(data)
        self.lineno += data.count('\n')
        self.rawdata = rawdata[i+1:]
        return self.goahead(end)
    self.rawdata = rawdata[i:]
    if end:
        if not self.__seen_starttag:
            self.syntax_error('no elements in file')
        if self.stack:
            self.syntax_error('missing end tags')
            while self.stack:
                self.finish_endtag(self.stack[-1][0])

[422] Fix | Delete

[423] Fix | Delete

# Internal -- parse comment, return length or -1 if not terminated

[424] Fix | Delete

def parse_comment(self, i):
    """Handle the comment that starts at index *i* of self.rawdata.

    Returns the index just past the closing '-->', or -1 if the comment
    is not terminated in the data buffered so far.  The comment text
    (without the delimiters) is passed to self.handle_comment().
    Problems inside the comment ('--' in the text, text ending in '-',
    illegal characters unless accept_utf8 is set) are reported through
    self.syntax_error().

    Raises Error if rawdata does not contain '<!--' at index *i*.
    """
    rawdata = self.rawdata
    if rawdata[i:i+4] != '<!--':
        raise Error('unexpected call to parse_comment')
    res = commentclose.search(rawdata, i+4)
    if res is None:
        return -1
    body_start = i+4
    body_end = res.start(0)
    if doubledash.search(rawdata, body_start, body_end):
        self.syntax_error("`--' inside comment")
    # Only a dash that belongs to the comment text counts here; for the
    # empty comment '<!---->' the character before '-->' is part of the
    # opening delimiter and must not be reported.
    if body_end > body_start and rawdata[body_end-1] == '-':
        self.syntax_error('comment cannot end in three dashes')
    if not self.__accept_utf8 and \
       illegal.search(rawdata, body_start, body_end):
        self.syntax_error('illegal character in comment')
    self.handle_comment(rawdata[body_start:body_end])
    return res.end(0)

[440] Fix | Delete

[441] Fix | Delete

# Internal -- handle DOCTYPE tag, return length or -1 if not terminated

[442] Fix | Delete

def parse_doctype(self, res):
    """Handle a DOCTYPE declaration whose head was matched by *res*.

    res -- match object of the ``doctype`` pattern on self.rawdata; it
           supplies the document type name and the optional public and
           system identifiers.

    Returns the index just past the closing '>' of the declaration, or
    -1 if the declaration is not complete in the buffered data.  On
    success self.handle_doctype(name, pubid, syslit, data) is called,
    where data is the text between '[' and ']' of an internal subset,
    or None when there is no internal subset.
    """
    rawdata = self.rawdata
    size = len(rawdata)
    name = res.group('name')
    if self.__map_case:
        name = name.lower()
    pubid, syslit = res.group('pubid', 'syslit')
    if pubid is not None:
        # remove the quotes, then normalize runs of white space
        pubid = ' '.join(pubid[1:-1].split())
    if syslit is not None:
        syslit = syslit[1:-1]           # remove quotes
    subset_open = pos = res.end(0)
    if pos >= size:
        return -1
    if rawdata[pos] == '[':
        # Internal subset: scan for the matching ']' while skipping
        # quoted strings and keeping track of '<' ... '>' nesting.
        depth = 0
        in_dquote = in_squote = 0
        pos += 1
        while pos < size:
            ch = rawdata[pos]
            if ch == '"' and not in_squote:
                in_dquote = not in_dquote
            elif ch == "'" and not in_dquote:
                in_squote = not in_squote
            elif in_squote or in_dquote:
                pass                    # inside a quoted string
            elif ch == ']' and depth <= 0:
                closing = endbracket.match(rawdata, pos + 1)
                if closing is None:
                    return -1
                self.handle_doctype(name, pubid, syslit,
                                    rawdata[subset_open+1:pos])
                return closing.end(0)
            elif ch == '<':
                depth += 1
            elif ch == '>':
                depth -= 1
                if depth < 0:
                    self.syntax_error("bogus `>' in DOCTYPE")
            pos += 1
    # No internal subset (or its end was not found): look for the '>'
    # that closes the declaration.
    closing = endbracketfind.match(rawdata, pos)
    if closing is None:
        return -1
    if endbracket.match(rawdata, pos) is None:
        self.syntax_error('garbage in DOCTYPE')
    self.handle_doctype(name, pubid, syslit, None)
    return closing.end(0)

[488] Fix | Delete

[489] Fix | Delete

# Internal -- handle CDATA tag, return length or -1 if not terminated

[490] Fix | Delete

def parse_cdata(self, i):

[491] Fix | Delete

rawdata = self.rawdata

[492] Fix | Delete

if rawdata[i:i+9] != '<![CDATA[':

[493] Fix | Delete

raise Error('unexpected call to parse_cdata')

[494] Fix | Delete

res = cdataclose.search(rawdata, i+9)

[495] Fix | Delete

if res is None:

[496] Fix | Delete

return -1

[497] Fix | Delete

if not self.__accept_utf8 and \

[498] Fix | Delete

illegal.search(rawdata, i+9, res.start(0)):

[499] Fix | Delete