Edit File by line

"""A parser for SGML, using the derived class as a static DTD."""

[0] Fix | Delete

[1] Fix | Delete

# XXX This only supports those SGML features used by HTML.

[2] Fix | Delete

[3] Fix | Delete

# XXX There should be a way to distinguish between PCDATA (parsed

[4] Fix | Delete

# character data -- the normal case), RCDATA (replaceable character

[5] Fix | Delete

# data -- only char and entity references and end tags are special)

[6] Fix | Delete

# and CDATA (character data -- only end tags are special). RCDATA is

[7] Fix | Delete

# not supported at all.

[8] Fix | Delete

[9] Fix | Delete

[10] Fix | Delete

from warnings import warnpy3k

[11] Fix | Delete

warnpy3k("the sgmllib module has been removed in Python 3.0",

[12] Fix | Delete

stacklevel=2)

[13] Fix | Delete

del warnpy3k

[14] Fix | Delete

[15] Fix | Delete

import markupbase

[16] Fix | Delete

import re

[17] Fix | Delete

[18] Fix | Delete

__all__ = ["SGMLParser", "SGMLParseError"]

# Regular expressions used for parsing

# Next character that may start markup: an entity/character reference
# ('&') or a tag, declaration or processing instruction ('<').
interesting = re.compile('[&<]')
# Prefix of a reference, start tag, end tag or declaration that has not
# been closed yet; the parser uses it to decide whether to wait for more
# input or to give up and emit the text as data.
incomplete = re.compile('&([a-zA-Z][a-zA-Z0-9]*|#[0-9]*)?|'
                           '<([a-zA-Z][^<>]*|'
                              '/([a-zA-Z][^<>]*)?|'
                              '![^<>]*)?')

# Named entity reference; group 1 is the name.  The final character
# class consumes exactly one terminator, which need not be ';'.
entityref = re.compile('&([a-zA-Z][-.a-zA-Z0-9]*)[^a-zA-Z0-9]')
# Decimal character reference; group 1 is the digits.  As above, one
# non-digit terminator character is consumed.
charref = re.compile('&#([0-9]+)[^0-9]')

# '<' followed by a letter (ordinary start tag) or '>' (the '<>' shorthand).
starttagopen = re.compile('<[>a-zA-Z]')
# Opening of the SGML short-tag form '<tag/'.
shorttagopen = re.compile('<[a-zA-Z][-.a-zA-Z0-9]*/')
# Complete short tag '<tag/data/'; group 1 is the tag, group 2 the data.
shorttag = re.compile('<([a-zA-Z][-.a-zA-Z0-9]*)/([^/]*)/')
# Terminator of a processing instruction.
piclose = re.compile('>')
# First '<' or '>' after a tag opener; used to find where a tag stops.
endbracket = re.compile('[<>]')
# Tag name.
tagfind = re.compile('[a-zA-Z][-_.a-zA-Z0-9]*')
# One attribute: group 1 is the name, group 2 the optional '= value'
# part, group 3 the value itself (single-quoted, double-quoted or bare).
attrfind = re.compile(
    r'\s*([a-zA-Z_][-:.a-zA-Z_0-9]*)(\s*=\s*'
    r'(\'[^\']*\'|"[^"]*"|[][\-a-zA-Z0-9./,:;+*%?!&$_#=~\'"@]*))?')

[40] Fix | Delete

[41] Fix | Delete

[42] Fix | Delete

class SGMLParseError(RuntimeError):
    """Error type raised for every problem the SGML parser reports."""

[45] Fix | Delete

[46] Fix | Delete

[47] Fix | Delete

# SGML parser base class -- find tags and call handler functions.

[48] Fix | Delete

# Usage: p = SGMLParser(); p.feed(data); ...; p.close().

[49] Fix | Delete

# The dtd is defined by deriving a class which defines methods

[50] Fix | Delete

# with special names to handle tags: start_foo and end_foo to handle

[51] Fix | Delete

# <foo> and </foo>, respectively, or do_foo to handle <foo> by itself.

[52] Fix | Delete

# (Tags are converted to lower case for this purpose.) The data

[53] Fix | Delete

# between tags is passed to the parser by calling self.handle_data()

[54] Fix | Delete

# with some data as argument (the data may be split up in arbitrary

[55] Fix | Delete

# chunks). Entity references are passed by calling

[56] Fix | Delete

# self.handle_entityref() with the entity reference as argument.

[57] Fix | Delete

[58] Fix | Delete

class SGMLParser(markupbase.ParserBase):
    """SGML parser base class -- find tags and call handler methods.

    Usage: p = SGMLParser(); p.feed(data); ...; p.close().

    The DTD is expressed by a derived class that defines start_foo() and
    end_foo() to handle <foo> and </foo>, or do_foo() to handle a <foo>
    that has no end tag.  Tag names are lower-cased before the lookup.
    Text between tags is delivered through handle_data(), possibly in
    arbitrary chunks.
    """

    # Matches a reference inside an attribute value: group 1 is an entity
    # name, group 2 the digits of a decimal character reference, group 3
    # the optional terminating ';'.
    entity_or_charref = re.compile('&(?:'
      '([a-zA-Z][-.a-zA-Z0-9]*)|#([0-9]+)'
      ')(;?)')

    def __init__(self, verbose=0):
        """Initialize and reset this instance."""
        self.verbose = verbose
        self.reset()

    def reset(self):
        """Reset this instance. Loses all unprocessed data."""
        self.__starttag_text = None
        self.rawdata = ''
        self.stack = []
        self.lasttag = '???'
        self.nomoretags = 0
        self.literal = 0
        # Let the base class reset whatever state it keeps itself.
        markupbase.ParserBase.reset(self)

    def setnomoretags(self):
        """Enter literal mode (CDATA) till EOF.

        Intended for derived classes only.
        """
        self.nomoretags = self.literal = 1

    def setliteral(self, *args):
        """Enter literal mode (CDATA).

        Extra positional arguments are accepted and ignored.
        Intended for derived classes only.
        """
        self.literal = 1

    def feed(self, data):
        """Feed some data to the parser.

        Call this as often as you want, with as little or as much text
        as you want (may include newlines).  (This just saves the text,
        all the processing is done by goahead().)
        """
        self.rawdata = self.rawdata + data
        self.goahead(0)

    def close(self):
        """Handle the remaining data."""
        self.goahead(1)

    def error(self, message):
        """Raise SGMLParseError carrying *message*."""
        raise SGMLParseError(message)

    def goahead(self, end):
        """Internal -- handle data as far as reasonable.

        May leave state and data to be processed by a subsequent call.
        If *end* is true, force handling all data as if followed by an
        EOF marker.  Whatever could not be consumed stays in
        self.rawdata.
        """
        rawdata = self.rawdata
        i = 0
        n = len(rawdata)
        while i < n:
            if self.nomoretags:
                # Everything up to EOF is plain data.
                self.handle_data(rawdata[i:n])
                i = n
                break
            match = interesting.search(rawdata, i)
            if match:
                j = match.start()
            else:
                j = n
            if i < j:
                self.handle_data(rawdata[i:j])
            i = j
            if i == n:
                break
            if rawdata[i] == '<':
                if starttagopen.match(rawdata, i):
                    if self.literal:
                        # Start tags are not markup in literal mode.
                        self.handle_data(rawdata[i])
                        i = i+1
                        continue
                    k = self.parse_starttag(i)
                    if k < 0:
                        break
                    i = k
                    continue
                if rawdata.startswith("</", i):
                    k = self.parse_endtag(i)
                    if k < 0:
                        break
                    i = k
                    # An end tag terminates literal mode.
                    self.literal = 0
                    continue
                if self.literal:
                    if n > (i + 1):
                        self.handle_data("<")
                        i = i+1
                    else:
                        # incomplete
                        break
                    continue
                if rawdata.startswith("<!--", i):
                    # Strictly speaking, a comment is --.*--
                    # within a declaration tag <!...>.
                    # This should be removed,
                    # and comments handled only in parse_declaration.
                    # (parse_comment is not defined in this class;
                    # presumably inherited from markupbase.ParserBase.)
                    k = self.parse_comment(i)
                    if k < 0:
                        break
                    i = k
                    continue
                if rawdata.startswith("<?", i):
                    # parse_pi returns a length, not an index.
                    k = self.parse_pi(i)
                    if k < 0:
                        break
                    i = i+k
                    continue
                if rawdata.startswith("<!", i):
                    # This is some sort of declaration; in "HTML as
                    # deployed," this should only be the document type
                    # declaration ("<!DOCTYPE html...>").
                    k = self.parse_declaration(i)
                    if k < 0:
                        break
                    i = k
                    continue
            elif rawdata[i] == '&':
                if self.literal:
                    self.handle_data(rawdata[i])
                    i = i+1
                    continue
                match = charref.match(rawdata, i)
                if match:
                    name = match.group(1)
                    self.handle_charref(name)
                    i = match.end(0)
                    # The pattern swallowed one terminator; give it
                    # back unless it was the ';'.
                    if rawdata[i-1] != ';':
                        i = i-1
                    continue
                match = entityref.match(rawdata, i)
                if match:
                    name = match.group(1)
                    self.handle_entityref(name)
                    i = match.end(0)
                    if rawdata[i-1] != ';':
                        i = i-1
                    continue
            else:
                self.error('neither < nor & ??')
            # We get here only if incomplete matches but
            # nothing else
            match = incomplete.match(rawdata, i)
            if not match:
                self.handle_data(rawdata[i])
                i = i+1
                continue
            j = match.end(0)
            if j == n:
                break # Really incomplete
            self.handle_data(rawdata[i:j])
            i = j
        # end while
        if end and i < n:
            self.handle_data(rawdata[i:n])
            i = n
        self.rawdata = rawdata[i:]
        # XXX if end: check for empty stack

    # Extensions for the DOCTYPE scanner:
    _decl_otherchars = '='

    def parse_pi(self, i):
        """Internal -- parse the processing instruction at index *i*.

        Calls handle_pi() with the text between '<?' and the next '>'.
        Returns the number of characters consumed, or -1 if the
        instruction is not terminated yet.
        """
        rawdata = self.rawdata
        if rawdata[i:i+2] != '<?':
            self.error('unexpected call to parse_pi()')
        match = piclose.search(rawdata, i+2)
        if not match:
            return -1
        j = match.start(0)
        self.handle_pi(rawdata[i+2: j])
        j = match.end(0)
        return j-i

    def get_starttag_text(self):
        """Return the source text recorded for the most recent start tag."""
        return self.__starttag_text

    def parse_starttag(self, i):
        """Internal -- handle the start tag beginning at index *i*.

        Returns the index just past the tag (an absolute position in
        self.rawdata, not a length), or -1 if the tag is not terminated.
        """
        self.__starttag_text = None
        start_pos = i
        rawdata = self.rawdata
        if shorttagopen.match(rawdata, i):
            # SGML shorthand: <tag/data/ == <tag>data</tag>
            # XXX Can data contain &... (entity or char refs)?
            # XXX Can data contain < or > (tag characters)?
            # XXX Can there be whitespace before the first /?
            match = shorttag.match(rawdata, i)
            if not match:
                return -1
            tag, data = match.group(1, 2)
            self.__starttag_text = '<%s/' % tag
            tag = tag.lower()
            k = match.end(0)
            self.finish_shorttag(tag, data)
            self.__starttag_text = rawdata[start_pos:match.end(1) + 1]
            return k
        # XXX The following should skip matching quotes (' or ")
        # As a shortcut way to exit, this isn't so bad, but shouldn't
        # be used to locate the actual end of the start tag since the
        # < or > characters may be embedded in an attribute value.
        match = endbracket.search(rawdata, i+1)
        if not match:
            return -1
        j = match.start(0)
        # Now parse the data between i+1 and j into a tag and attrs
        attrs = []
        if rawdata[i:i+2] == '<>':
            # SGML shorthand: <> == <last open tag seen>
            k = j
            tag = self.lasttag
        else:
            match = tagfind.match(rawdata, i+1)
            if not match:
                self.error('unexpected call to parse_starttag')
            k = match.end(0)
            tag = rawdata[i+1:k].lower()
            self.lasttag = tag
        while k < j:
            match = attrfind.match(rawdata, k)
            if not match:
                break
            attrname, rest, attrvalue = match.group(1, 2, 3)
            if not rest:
                # A bare attribute name stands for name="name".
                attrvalue = attrname
            else:
                if (attrvalue[:1] == "'" == attrvalue[-1:] or
                    attrvalue[:1] == '"' == attrvalue[-1:]):
                    # strip quotes
                    attrvalue = attrvalue[1:-1]
                attrvalue = self.entity_or_charref.sub(
                    self._convert_ref, attrvalue)
            attrs.append((attrname.lower(), attrvalue))
            k = match.end(0)
        if rawdata[j] == '>':
            j = j+1
        self.__starttag_text = rawdata[start_pos:j]
        self.finish_starttag(tag, attrs)
        return j

    def _convert_ref(self, match):
        """Internal -- convert one entity or character reference.

        *match* is a match of entity_or_charref.  Returns the replacement
        text, or the reference's own source text when it cannot be
        converted.  A named reference without the trailing ';' is always
        left as it is.
        """
        if match.group(2):
            replacement = self.convert_charref(match.group(2))
            if replacement is None:
                return '&#%s%s' % match.groups()[1:]
            return replacement
        elif match.group(3):
            # Test against None rather than truthiness so that an entity
            # defined as the empty string is honoured, exactly as
            # handle_entityref() does for references in data.
            replacement = self.convert_entityref(match.group(1))
            if replacement is None:
                return '&%s;' % match.group(1)
            return replacement
        else:
            return '&%s' % match.group(1)

    def parse_endtag(self, i):
        """Internal -- handle the end tag beginning at index *i*.

        Returns the index just past the tag, or -1 if it is not
        terminated yet.
        """
        rawdata = self.rawdata
        match = endbracket.search(rawdata, i+1)
        if not match:
            return -1
        j = match.start(0)
        tag = rawdata[i+2:j].strip().lower()
        if rawdata[j] == '>':
            j = j+1
        self.finish_endtag(tag)
        return j

    def finish_shorttag(self, tag, data):
        """Internal -- finish parsing of <tag/data/ (same as <tag>data</tag>)."""
        self.finish_starttag(tag, [])
        self.handle_data(data)
        self.finish_endtag(tag)

    def finish_starttag(self, tag, attrs):
        """Internal -- finish processing of a start tag.

        Returns -1 for an unknown tag, 0 for an open-only tag (do_*
        handler), 1 for a balanced tag (start_* handler; the tag is
        pushed on the stack).
        """
        try:
            method = getattr(self, 'start_' + tag)
        except AttributeError:
            try:
                method = getattr(self, 'do_' + tag)
            except AttributeError:
                self.unknown_starttag(tag, attrs)
                return -1
            else:
                self.handle_starttag(tag, method, attrs)
                return 0
        else:
            self.stack.append(tag)
            self.handle_starttag(tag, method, attrs)
            return 1

    def finish_endtag(self, tag):
        """Internal -- finish processing of an end tag.

        An empty *tag* (from '</>') closes the innermost open element.
        Otherwise every element above and including the innermost open
        *tag* is closed.
        """
        if not tag:
            found = len(self.stack) - 1
            if found < 0:
                self.unknown_endtag(tag)
                return
        else:
            if tag not in self.stack:
                # Not open: it is either unknown or merely unbalanced,
                # depending on whether an end handler exists.
                try:
                    getattr(self, 'end_' + tag)
                except AttributeError:
                    self.unknown_endtag(tag)
                else:
                    self.report_unbalanced(tag)
                return
            # Locate the innermost (last) occurrence of the tag.
            found = len(self.stack)
            for index, open_tag in enumerate(self.stack):
                if open_tag == tag:
                    found = index
        while len(self.stack) > found:
            tag = self.stack[-1]
            method = getattr(self, 'end_' + tag, None)
            if method:
                self.handle_endtag(tag, method)
            else:
                self.unknown_endtag(tag)
            del self.stack[-1]

    def handle_starttag(self, tag, method, attrs):
        """Overridable -- handle start tag by calling method(attrs)."""
        method(attrs)

    def handle_endtag(self, tag, method):
        """Overridable -- handle end tag by calling method()."""
        method()

    def report_unbalanced(self, tag):
        """Example -- report an unbalanced </...> tag when verbose is set."""
        if self.verbose:
            # Single-argument print calls give the same output on
            # Python 2 and Python 3.
            print('*** Unbalanced </' + tag + '>')
            print('*** Stack: %s' % (self.stack,))

    def convert_charref(self, name):
        """Convert character reference, may be overridden.

        Returns None when *name* is not an integer in the range 0..127.
        """
        try:
            n = int(name)
        except ValueError:
            return
        if not 0 <= n <= 127:
            return
        return self.convert_codepoint(n)

    def convert_codepoint(self, codepoint):
        """Return the character for *codepoint*; may be overridden."""
        return chr(codepoint)

    def handle_charref(self, name):
        """Handle character reference, no need to override."""
        replacement = self.convert_charref(name)
        if replacement is None:
            self.unknown_charref(name)
        else:
            self.handle_data(replacement)

    # Definition of entities -- derived classes may override
    entitydefs = \
            {'lt': '<', 'gt': '>', 'amp': '&', 'quot': '"', 'apos': '\''}

    def convert_entityref(self, name):
        """Convert entity references.

        As an alternative to overriding this method; one can tailor the
        results by setting up the self.entitydefs mapping appropriately.
        Returns None for a name that is not in the mapping.
        """
        table = self.entitydefs
        if name in table:
            return table[name]
        else:
            return

    def handle_entityref(self, name):
        """Handle entity references, no need to override."""
        replacement = self.convert_entityref(name)
        if replacement is None:
            self.unknown_entityref(name)
        else:
            self.handle_data(replacement)

    def handle_data(self, data):
        """Example -- handle data, should be overridden."""
        pass

    def handle_comment(self, data):
        """Example -- handle comment, could be overridden."""
        pass

    def handle_decl(self, decl):
        """Example -- handle declaration, could be overridden."""
        pass

    def handle_pi(self, data):
        """Example -- handle processing instruction, could be overridden."""
        pass

    # To be overridden -- handlers for unknown objects
    def unknown_starttag(self, tag, attrs):
        """Called for a start tag with no start_* or do_* handler."""
        pass

    def unknown_endtag(self, tag):
        """Called for an end tag with no end_* handler."""
        pass

    def unknown_charref(self, ref):
        """Called for a character reference that cannot be converted."""
        pass

    def unknown_entityref(self, ref):
        """Called for an entity reference that cannot be converted."""
        pass

[457] Fix | Delete

[458] Fix | Delete

[459] Fix | Delete

class TestSGMLParser(SGMLParser):

[460] Fix | Delete

[461] Fix | Delete

def __init__(self, verbose=0):

[462] Fix | Delete

self.testdata = ""

[463] Fix | Delete

SGMLParser.__init__(self, verbose)

[464] Fix | Delete

[465] Fix | Delete

def handle_data(self, data):

[466] Fix | Delete

self.testdata = self.testdata + data

[467] Fix | Delete

if len(repr(self.testdata)) >= 70:

[468] Fix | Delete

self.flush()

[469] Fix | Delete

[470] Fix | Delete

def flush(self):

[471] Fix | Delete

data = self.testdata

[472] Fix | Delete

if data:

[473] Fix | Delete

self.testdata = ""

[474] Fix | Delete

print 'data:', repr(data)

[475] Fix | Delete

[476] Fix | Delete

def handle_comment(self, data):

[477] Fix | Delete

self.flush()

[478] Fix | Delete

r = repr(data)

[479] Fix | Delete

if len(r) > 68:

[480] Fix | Delete

r = r[:32] + '...' + r[-32:]

[481] Fix | Delete

print 'comment:', r

[482] Fix | Delete

[483] Fix | Delete

def unknown_starttag(self, tag, attrs):

[484] Fix | Delete

self.flush()

[485] Fix | Delete

if not attrs:

[486] Fix | Delete

print 'start tag: <' + tag + '>'

[487] Fix | Delete

else:

[488] Fix | Delete

print 'start tag: <' + tag,

[489] Fix | Delete

for name, value in attrs:

[490] Fix | Delete

print name + '=' + '"' + value + '"',

[491] Fix | Delete

print '>'

[492] Fix | Delete

[493] Fix | Delete

def unknown_endtag(self, tag):

[494] Fix | Delete

self.flush()

[495] Fix | Delete

print 'end tag: </' + tag + '>'

[496] Fix | Delete

[497] Fix | Delete

def unknown_entityref(self, ref):

[498] Fix | Delete

self.flush()

[499] Fix | Delete