"""A parser for XML, using the derived class as static DTD."""
# Author: Sjoerd Mullender.
warnings.warn("The xmllib module is obsolete. Use xml.sax instead.",
class Error(RuntimeError):
# Regular expressions used for parsing
_S = '[ \t\r\n]+' # white space
_opS = '[ \t\r\n]*' # optional white space
_Name = '[a-zA-Z_:][-a-zA-Z0-9._:]*' # valid XML name
_QStr = "(?:'[^']*'|\"[^\"]*\")" # quoted XML string
# Matches any single character outside tab, CR, LF and the octal ranges
# \040-\176 and \240-\377.
illegal = re.compile('[^\t\r\n -\176\240-\377]') # illegal chars in content
# Matches a single ']', '&' or '<'; searched for in the raw data to find
# the next position that needs closer inspection.
interesting = re.compile('[]&<]')
# '&' followed by a Name, a decimal '#NNN' or a hex '#xHHH', plus exactly
# one following character that cannot continue a name.  The terminator is
# not required to be ';' by this pattern.
ref = re.compile('&(' + _Name + '|#[0-9]+|#x[0-9a-fA-F]+)[^-a-zA-Z0-9._:]')
# Entity reference: group 'name' holds the Name; one trailing non-name
# character is consumed by the match but kept out of the group.
entityref = re.compile('&(?P<name>' + _Name + ')[^-a-zA-Z0-9._:]')
# Character reference: group 'char' holds the digits (with a leading 'x'
# for hex) AND the single character that terminates them, so users of the
# group strip the last character (res.group('char')[:-1]).
charref = re.compile('&#(?P<char>[0-9]+[^0-9]|x[0-9a-fA-F]+[^0-9a-fA-F])')
# One or more whitespace characters running to the end of the string;
# with .match() this succeeds only for non-empty, all-whitespace data.
space = re.compile(_S + '$')
# A single line feed.
newline = re.compile('\n')
_S + '(?P<name>' + _Name + ')'
'(' + _opS + '=' + _opS +
'(?P<value>'+_QStr+'|[-a-zA-Z0-9.:+*%?!\(\)_#=~]+))?')
starttagopen = re.compile('<' + _Name)
starttagend = re.compile(_opS + '(?P<slash>/?)>')
starttagmatch = re.compile('<(?P<tagname>'+_Name+')'
'(?P<attrs>(?:'+attrfind.pattern+')*)'+
endtagopen = re.compile('</')
endbracket = re.compile(_opS + '>')
endbracketfind = re.compile('(?:[^>\'"]|'+_QStr+')*>')
tagfind = re.compile(_Name)
cdataopen = re.compile(r'<!\[CDATA\[')
cdataclose = re.compile(r'\]\]>')
# this matches one of the following:
# SYSTEM SystemLiteral
# PUBLIC PubidLiteral SystemLiteral
# Template for a quoted system literal.  The %s is a placeholder for the
# name of the capturing group and is filled in by %-formatting below.
_SystemLiteral = '(?P<%s>'+_QStr+')'
# Template for a quoted public identifier, again with %s standing for the
# group name.  The double-quoted alternative additionally admits "'" in
# its character class; the single-quoted one does not.  '%%' is written
# doubled so that a literal '%' survives the %-formatting step.
_PublicLiteral = '(?P<%s>"[-\'\(\)+,./:=?;!*#@$_%% \n\ra-zA-Z0-9]*"|' \
"'[-\(\)+,./:=?;!*#@$_%% \n\ra-zA-Z0-9]*')"
# Either the keyword SYSTEM, or PUBLIC followed by whitespace and a public
# identifier (group 'pubid'); in both cases followed by whitespace and a
# system literal (group 'syslit').
_ExternalId = '(?:SYSTEM|' \
'PUBLIC'+_S+_PublicLiteral%'pubid'+ \
')'+_S+_SystemLiteral%'syslit'
doctype = re.compile('<!DOCTYPE'+_S+'(?P<name>'+_Name+')'
'(?:'+_S+_ExternalId+')?'+_opS)
xmldecl = re.compile('<\?xml'+_S+
'version'+_opS+'='+_opS+'(?P<version>'+_QStr+')'+
'(?:'+_S+'encoding'+_opS+'='+_opS+
"(?P<encoding>'[A-Za-z][-A-Za-z0-9._]*'|"
'"[A-Za-z][-A-Za-z0-9._]*"))?'
'(?:'+_S+'standalone'+_opS+'='+_opS+
'(?P<standalone>\'(?:yes|no)\'|"(?:yes|no)"))?'+
procopen = re.compile(r'<\?(?P<proc>' + _Name + ')' + _opS)
procclose = re.compile(_opS + r'\?>')
commentopen = re.compile('<!--')
commentclose = re.compile('-->')
doubledash = re.compile('--')
attrtrans = string.maketrans(' \r\n\t', ' ')
# definitions for XML namespaces
_NCName = '[a-zA-Z_][-a-zA-Z0-9._]*' # XML Name, minus the ":"
# An NCName that must run to the end of the string.
ncname = re.compile(_NCName + '$')
# A qualified name: an optional 'prefix:' part (group 'prefix', without
# the colon) followed by the local part (group 'local'), anchored at the
# end of the string.
qname = re.compile('(?:(?P<prefix>' + _NCName + '):)?' # optional prefix
'(?P<local>' + _NCName + ')$')
# A namespace declaration attribute name: plain 'xmlns', or 'xmlns:'
# followed by an NCName captured in group 'ncname' (None when absent).
xmlns = re.compile('xmlns(?::(?P<ncname>'+_NCName+'))?$')
# XML parser base class -- find tags and call handler functions.
# Usage: p = XMLParser(); p.feed(data); ...; p.close().
# The DTD is defined by deriving a class which defines methods with
# special names to handle tags: start_foo and end_foo to handle <foo>
# and </foo>, respectively. The data between tags is passed to the
# parser by calling self.handle_data() with some data as argument (the
# data may be split up in arbitrary chunks).
attributes = {} # default, to be overridden
elements = {} # default, to be overridden
# parsing options, settable using keyword args in __init__
__accept_unquoted_attributes = 0
__accept_missing_endtag_name = 0
__translate_attribute_references = 1
# Interface -- initialize and reset this instance
def __init__(self, **kw):
# Parsing options arrive as keyword arguments.  Each option that is
# present in kw is stored on the instance; for the guarded options an
# absent key leaves the existing default in place.
if 'accept_unquoted_attributes' in kw:
self.__accept_unquoted_attributes = kw['accept_unquoted_attributes']
if 'accept_missing_endtag_name' in kw:
self.__accept_missing_endtag_name = kw['accept_missing_endtag_name']
# NOTE(review): unlike the options around them, 'map_case' and
# 'accept_utf8' are read from kw without a membership test in the lines
# shown here -- confirm the guards are not missing; as written, omitting
# either keyword raises KeyError.
self.__map_case = kw['map_case']
self.__accept_utf8 = kw['accept_utf8']
if 'translate_attribute_references' in kw:
self.__translate_attribute_references = kw['translate_attribute_references']
# Run the handler scan over the instance's own attributes and then over
# its class.
# NOTE(review): these two calls may belong to a separate helper whose
# def line is not visible here -- confirm against the full file.
self.__fixdict(self.__dict__)
self.__fixclass(self.__class__)
def __fixclass(self, kl):
# Apply __fixdict to the attribute dictionary of the class object kl.
self.__fixdict(kl.__dict__)
def __fixdict(self, dict):
# Each entry of self.elements is a (start handler, end handler) pair;
# a tag with no entry yet is treated as (None, None).
# NOTE(review): 'tag' and 'key' are bound by lines not visible here --
# presumably derived from the names found in 'dict'; confirm.
start, end = self.elements.get(tag, (None, None))
# Install the attribute named by key as the start handler, keeping
# whatever end handler was already recorded.
self.elements[tag] = getattr(self, key), end
start, end = self.elements.get(tag, (None, None))
# Install the attribute named by key as the end handler, keeping
# whatever start handler was already recorded.
self.elements[tag] = start, getattr(self, key)
# Interface -- reset this instance. Loses all unprocessed data
self.__seen_doctype = None
self.__use_namespaces = 0
self.__namespaces = {'xml':None} # xml is implicitly declared
# backward compatibility hack: if elements not overridden,
if self.elements is XMLParser.elements:
# For derived classes only -- enter literal mode (CDATA) till EOF
self.nomoretags = self.literal = 1
# For derived classes only -- enter literal mode (CDATA)
def setliteral(self, *args):
# Interface -- feed some data to the parser. Call this as
# often as you want, with as little or as much text as you
# want (may include '\n'). (This just saves the text; all the
# processing is done by goahead().)
self.rawdata = self.rawdata + data
# Interface -- handle the remaining data
# remove self.elements so that we don't leak
# Interface -- translate references
def translate_references(self, data, all = 1):
# Rewrites references found in data by splicing their translation into
# the string (see the final assignment below).
# NOTE(review): the loop and branch structure, the definition of 'amp'
# and the use of the 'all' parameter are not visible in these lines --
# the comments below describe only the statements shown.
if not self.__translate_attribute_references:
# Look for the next candidate reference starting at offset i.
res = amp.search(data, i)
self.syntax_error("bogus `&'")
# Hex character reference: drop the first two characters of the
# reference text and convert the remainder from base 16.
str = chr(int(str[2:], 16))
self.syntax_error("`;' missing after char reference")
# Named entity with a known definition: substitute that definition.
if str in self.entitydefs:
str = self.entitydefs[str]
self.syntax_error("bogus `&'")
i = s + 1 # just past the &
self.syntax_error("reference to unknown entity `&%s;'" % str)
self.syntax_error("bogus `&'")
i = s + 1 # just past the &
# when we get here, str contains the translated text and i points
# to the end of the string that is to be replaced
# s is the offset where the replaced span begins (the code above sets
# i = s + 1 to land "just past the &").
data = data[:s] + str + data[i:]
# Interface - return a dictionary of all namespaces currently valid
for t, d, nst in self.stack:
# Internal -- handle data as far as reasonable. May leave state
# and data to be processed by a subsequent call. If 'end' is
# true, force handling all data as if followed by EOF marker.
self.lineno = self.lineno + data.count('\n')
res = interesting.search(rawdata, i)
if self.__at_start and space.match(data) is None:
self.syntax_error('illegal data at start of file')
if not self.stack and space.match(data) is None:
self.syntax_error('data not in content')
if not self.__accept_utf8 and illegal.search(data):
self.syntax_error('illegal character in content')
self.lineno = self.lineno + data.count('\n')
if starttagopen.match(rawdata, i):
self.lineno = self.lineno + data.count('\n')
k = self.parse_starttag(i)
self.lineno = self.lineno + rawdata[i:k].count('\n')
if endtagopen.match(rawdata, i):
self.lineno = self.lineno + rawdata[i:k].count('\n')
if commentopen.match(rawdata, i):
self.lineno = self.lineno + data.count('\n')
k = self.parse_comment(i)
self.lineno = self.lineno + rawdata[i:k].count('\n')
if cdataopen.match(rawdata, i):
self.lineno = self.lineno + rawdata[i:k].count('\n')
res = xmldecl.match(rawdata, i)
self.syntax_error("<?xml?> declaration not at start of document")
version, encoding, standalone = res.group('version',
if version[1:-1] != '1.0':
raise Error('only XML version 1.0 supported')
if encoding: encoding = encoding[1:-1]
if standalone: standalone = standalone[1:-1]
self.handle_xml(encoding, standalone)
res = procopen.match(rawdata, i)
self.lineno = self.lineno + rawdata[i:k].count('\n')
res = doctype.match(rawdata, i)
self.lineno = self.lineno + data.count('\n')
self.syntax_error('multiple DOCTYPE elements')
self.syntax_error('DOCTYPE not at beginning of document')
k = self.parse_doctype(res)
self.__seen_doctype = res.group('name')
self.__seen_doctype = self.__seen_doctype.lower()
self.lineno = self.lineno + rawdata[i:k].count('\n')
res = charref.match(rawdata, i)
self.syntax_error("`;' missing in charref")
self.syntax_error('data not in content')
self.handle_charref(res.group('char')[:-1])
self.lineno = self.lineno + res.group(0).count('\n')
res = entityref.match(rawdata, i)
self.syntax_error("`;' missing in entityref")
if name in self.entitydefs:
self.rawdata = rawdata = rawdata[:res.start(0)] + self.entitydefs[name] + rawdata[i:]
self.unknown_entityref(name)
self.lineno = self.lineno + res.group(0).count('\n')
if cdataclose.match(rawdata, i):
self.syntax_error("bogus `]]>'")
self.handle_data(rawdata[i])
raise Error('neither < nor & ??')
# We get here only if incomplete matches but
self.syntax_error("bogus `%s'" % data)
if not self.__accept_utf8 and illegal.search(data):
self.syntax_error('illegal character in content')
self.lineno = self.lineno + data.count('\n')
self.rawdata = rawdata[i+1:]
self.rawdata = rawdata[i:]
if not self.__seen_starttag:
self.syntax_error('no elements in file')
self.syntax_error('missing end tags')
self.finish_endtag(self.stack[-1][0])
# Internal -- parse comment, return length or -1 if not terminated
def parse_comment(self, i):
# i is the offset in rawdata at which the comment opener is expected.
# NOTE(review): the binding of 'rawdata', the handling of an
# unterminated comment and the return statement are not visible in
# these lines -- confirm against the full file.
# Sanity check: the four characters at i must be the '<!--' opener.
# NOTE(review): the message names handle_comment although the check
# lives in parse_comment (parse_cdata's equivalent message names itself).
if rawdata[i:i+4] != '<!--':
raise Error('unexpected call to handle_comment')
# Search for the closing '-->' starting just past the opener.
res = commentclose.search(rawdata, i+4)
# A '--' anywhere in the comment text (between opener and closer) is
# reported as a syntax error.
if doubledash.search(rawdata, i+4, res.start(0)):
self.syntax_error("`--' inside comment")
# The character immediately before '-->' must not be another '-'.
if rawdata[res.start(0)-1] == '-':
self.syntax_error('comment cannot end in three dashes')
# The illegal-character check on the comment text is skipped when the
# accept_utf8 option is set.
if not self.__accept_utf8 and \
illegal.search(rawdata, i+4, res.start(0)):
self.syntax_error('illegal character in comment')
# Hand the comment text, without the '<!--' and '-->' delimiters, to
# the handler.
self.handle_comment(rawdata[i+4: res.start(0)])
# Internal -- handle DOCTYPE tag, return length or -1 if not terminated
def parse_doctype(self, res):
# res is a match object exposing the 'pubid' and 'syslit' groups (the
# caller passes the result of doctype.match).
# NOTE(review): the bindings of rawdata, name, j, k, c, dq and level,
# and the scanning loop around the elif branches, are not visible in
# these lines -- the comments below describe only the statements shown.
pubid, syslit = res.group('pubid', 'syslit')
# NOTE(review): as shown, pubid is sliced without a None check, while
# syslit is guarded below -- confirm a guard line is not missing.
pubid = pubid[1:-1] # remove quotes
# Collapse every run of whitespace to a single space and drop leading
# and trailing whitespace.
pubid = ' '.join(pubid.split()) # normalize
if syslit is not None: syslit = syslit[1:-1] # remove quotes
# A single quote only counts when not inside a double-quoted string.
elif not dq and c == "'":
# A ']' seen at nesting level zero or below.
elif level <= 0 and c == ']':
# After that ']' only optional whitespace and '>' may follow.
res = endbracket.match(rawdata, k+1)
# Report the DOCTYPE with the text strictly between offsets j and k
# passed as the fourth argument.
self.handle_doctype(name, pubid, syslit, rawdata[j+1:k])
self.syntax_error("bogus `>' in DOCTYPE")
# Locate the closing '>' from k, skipping over quoted strings.
res = endbracketfind.match(rawdata, k)
# Anything other than optional whitespace before the '>' is garbage.
if endbracket.match(rawdata, k) is None:
self.syntax_error('garbage in DOCTYPE')
# Report the DOCTYPE with None as the fourth argument.
self.handle_doctype(name, pubid, syslit, None)
# Internal -- handle CDATA tag, return length or -1 if not terminated
def parse_cdata(self, i):
if rawdata[i:i+9] != '<![CDATA[':
raise Error('unexpected call to parse_cdata')
res = cdataclose.search(rawdata, i+9)
if not self.__accept_utf8 and \
illegal.search(rawdata, i+9, res.start(0)):