# Copyright (c) 2003-2016 Paul T. McGuire
# Permission is hereby granted, free of charge, to any person obtaining
# a copy of this software and associated documentation files (the
# "Software"), to deal in the Software without restriction, including
# without limitation the rights to use, copy, modify, merge, publish,
# distribute, sublicense, and/or sell copies of the Software, and to
# permit persons to whom the Software is furnished to do so, subject to
# the following conditions:
# The above copyright notice and this permission notice shall be
# included in all copies or substantial portions of the Software.
# THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
# EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
# MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT.
# IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY
# CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT,
# TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION WITH THE
# SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE.
pyparsing module - Classes and methods to define and execute parsing grammars
The pyparsing module is an alternative approach to creating and executing simple grammars,
vs. the traditional lex/yacc approach, or the use of regular expressions. With pyparsing, you
don't need to learn a new syntax for defining grammars or matching expressions - the parsing module
provides a library of classes that you use to construct the grammar directly in Python.
Here is a program to parse "Hello, World!" (or any greeting of the form
C{"<salutation>, <addressee>!"}), built up using L{Word}, L{Literal}, and L{And} elements
(L{'+'<ParserElement.__add__>} operator gives L{And} expressions, strings are auto-converted to
L{Literal} expressions)::
from pyparsing import Word, alphas
# define grammar of a greeting
greet = Word(alphas) + "," + Word(alphas) + "!"
print (hello, "->", greet.parseString(hello))
The program outputs the following::
Hello, World! -> ['Hello', ',', 'World', '!']
The Python representation of the grammar is quite readable, owing to the self-explanatory
class names, and the use of '+', '|' and '^' operators.
The L{ParseResults} object returned from L{ParserElement.parseString<ParserElement.parseString>} can be accessed as a nested list, a dictionary, or an
object with named attributes.
The pyparsing module handles some of the problems that are typically vexing when writing text parsers:
- extra or missing whitespace (the above program will also handle "Hello,World!", "Hello , World !", etc.)
__versionTime__ = "07 Oct 2016 01:31 UTC"
__author__ = "Paul McGuire <ptmcg@users.sourceforge.net>"
from weakref import ref as wkref
from datetime import datetime
from _thread import RLock
from threading import RLock
from collections import OrderedDict as _OrderedDict
from ordereddict import OrderedDict as _OrderedDict
#~ sys.stderr.write( "testing pyparsing module, version %s, %s\n" % (__version__,__versionTime__ ) )
'And', 'CaselessKeyword', 'CaselessLiteral', 'CharsNotIn', 'Combine', 'Dict', 'Each', 'Empty',
'FollowedBy', 'Forward', 'GoToColumn', 'Group', 'Keyword', 'LineEnd', 'LineStart', 'Literal',
'MatchFirst', 'NoMatch', 'NotAny', 'OneOrMore', 'OnlyOnce', 'Optional', 'Or',
'ParseBaseException', 'ParseElementEnhance', 'ParseException', 'ParseExpression', 'ParseFatalException',
'ParseResults', 'ParseSyntaxException', 'ParserElement', 'QuotedString', 'RecursiveGrammarException',
'Regex', 'SkipTo', 'StringEnd', 'StringStart', 'Suppress', 'Token', 'TokenConverter',
'White', 'Word', 'WordEnd', 'WordStart', 'ZeroOrMore',
'alphanums', 'alphas', 'alphas8bit', 'anyCloseTag', 'anyOpenTag', 'cStyleComment', 'col',
'commaSeparatedList', 'commonHTMLEntity', 'countedArray', 'cppStyleComment', 'dblQuotedString',
'dblSlashComment', 'delimitedList', 'dictOf', 'downcaseTokens', 'empty', 'hexnums',
'htmlComment', 'javaStyleComment', 'line', 'lineEnd', 'lineStart', 'lineno',
'makeHTMLTags', 'makeXMLTags', 'matchOnlyAtCol', 'matchPreviousExpr', 'matchPreviousLiteral',
'nestedExpr', 'nullDebugAction', 'nums', 'oneOf', 'opAssoc', 'operatorPrecedence', 'printables',
'punc8bit', 'pythonStyleComment', 'quotedString', 'removeQuotes', 'replaceHTMLEntity',
'replaceWith', 'restOfLine', 'sglQuotedString', 'srange', 'stringEnd',
'stringStart', 'traceParseAction', 'unicodeString', 'upcaseTokens', 'withAttribute',
'indentedBlock', 'originalTextFor', 'ungroup', 'infixNotation','locatedExpr', 'withClass',
'CloseMatch', 'tokenMap', 'pyparsing_common',
system_version = tuple(sys.version_info)[:3]
PY_3 = system_version[0] == 3
# build list of single arg builtins, that can be used as parse actions
singleArgBuiltins = [sum, len, sorted, reversed, list, tuple, set, any, all, min, max]
"""Drop-in replacement for str(obj) that tries to be Unicode friendly. It first tries
str(obj). If that fails with a UnicodeEncodeError, then it tries unicode(obj). It
then < returns the unicode object | encodes it with the default encoding | ... >.
if isinstance(obj,unicode):
# If this works, then _ustr(obj) has the same behaviour as str(obj), so
# it won't break any existing code.
except UnicodeEncodeError:
ret = unicode(obj).encode(sys.getdefaultencoding(), 'xmlcharrefreplace')
xmlcharref = Regex('&#\d+;')
xmlcharref.setParseAction(lambda t: '\\u' + hex(int(t[0][2:-1]))[2:])
return xmlcharref.transformString(ret)
# build list of single arg builtins, tolerant of Python version, that can be used as parse actions
for fname in "sum len sorted reversed list tuple set any all min max".split():
singleArgBuiltins.append(getattr(__builtin__,fname))
_generatorType = type((y for y in range(1)))
"""Escape &, <, >, ", ', etc. in a string of data."""
# ampersand must be replaced first
to_symbols = ('&'+s+';' for s in "amp gt lt quot apos".split())
for from_,to_ in zip(from_symbols, to_symbols):
data = data.replace(from_, to_)
class _Constants(object):
alphas = string.ascii_uppercase + string.ascii_lowercase
hexnums = nums + "ABCDEFabcdef"
alphanums = alphas + nums
printables = "".join(c for c in string.printable if c not in string.whitespace)
class ParseBaseException(Exception):
"""base exception class for all parsing runtime exceptions"""
# Performance tuning: we construct a *lot* of these, so keep this
# constructor as small and fast as possible
def __init__( self, pstr, loc=0, msg=None, elem=None ):
self.parserElement = elem
self.args = (pstr, loc, msg)
def _from_exception(cls, pe):
internal factory method to simplify creating one type of ParseException
from another - avoids having __init__ signature conflicts among subclasses
return cls(pe.pstr, pe.loc, pe.msg, pe.parserElement)
def __getattr__( self, aname ):
"""supported attributes by name are:
- lineno - returns the line number of the exception text
- col - returns the column number of the exception text
- line - returns the line containing the exception text
return lineno( self.loc, self.pstr )
elif( aname in ("col", "column") ):
return col( self.loc, self.pstr )
return line( self.loc, self.pstr )
raise AttributeError(aname)
return "%s (at char %d), (line:%d, col:%d)" % \
( self.msg, self.loc, self.lineno, self.column )
def markInputline( self, markerString = ">!<" ):
"""Extracts the exception line from the input string, and marks
the location of the exception with a special symbol.
line_column = self.column - 1
line_str = "".join((line_str[:line_column],
markerString, line_str[line_column:]))
return "lineno col line".split() + dir(type(self))
class ParseException(ParseBaseException):
Exception thrown when parse expressions don't match class;
supported attributes by name are:
- lineno - returns the line number of the exception text
- col - returns the column number of the exception text
- line - returns the line containing the exception text
Word(nums).setName("integer").parseString("ABC")
except ParseException as pe:
print("column: {}".format(pe.col))
Expected integer (at char 0), (line:1, col:1)
class ParseFatalException(ParseBaseException):
"""user-throwable exception thrown when inconsistent parse content
is found; stops all parsing immediately"""
class ParseSyntaxException(ParseFatalException):
"""just like L{ParseFatalException}, but thrown internally when an
L{ErrorStop<And._ErrorStop>} ('-' operator) indicates that parsing is to stop
immediately because an unbacktrackable syntax error has been found"""
#~ class ReparseException(ParseBaseException):
#~ """Experimental class - parse actions can raise this exception to cause
#~ pyparsing to reparse the input string:
#~ - with a modified input string, and/or
#~ - with a modified start location
#~ Set the values of the ReparseException in the constructor, and raise the
#~ exception in a parse action to cause pyparsing to use the new string/location.
#~ Setting the values as None causes no change to be made.
#~ def __init_( self, newstring, restartLoc ):
#~ self.newParseText = newstring
#~ self.reparseLoc = restartLoc
class RecursiveGrammarException(Exception):
"""exception thrown by L{ParserElement.validate} if the grammar could be improperly recursive"""
def __init__( self, parseElementList ):
self.parseElementTrace = parseElementList
return "RecursiveGrammarException: %s" % self.parseElementTrace
class _ParseResultsWithOffset(object):
def __init__(self,p1,p2):
self.tup = (self.tup[0],i)
class ParseResults(object):
Structured parse results, to provide multiple means of access to the parsed data:
- as a list (C{len(results)})
- by list index (C{results[0], results[1]}, etc.)
- by attribute (C{results.<resultsName>} - see L{ParserElement.setResultsName})
date_str = (integer.setResultsName("year") + '/'
+ integer.setResultsName("month") + '/'
+ integer.setResultsName("day"))
# date_str = integer("year") + '/' + integer("month") + '/' + integer("day")
# parseString returns a ParseResults object
result = date_str.parseString("1999/12/31")
print("%s -> %s" % (s, fn(eval(s))))
test("'month' in result")
test("'minutes' in result")
test("result.dump()", str)
list(result) -> ['1999', '/', '12', '/', '31']
'month' in result -> True
'minutes' in result -> False
result.dump() -> ['1999', '/', '12', '/', '31']
def __new__(cls, toklist=None, name=None, asList=True, modal=True ):
if isinstance(toklist, cls):
retobj = object.__new__(cls)
# Performance tuning: we construct a *lot* of these, so keep this
# constructor as small and fast as possible
def __init__( self, toklist=None, name=None, asList=True, modal=True, isinstance=isinstance ):
if isinstance(toklist, list):
self.__toklist = toklist[:]
elif isinstance(toklist, _generatorType):
self.__toklist = list(toklist)
self.__toklist = [toklist]
if name is not None and name:
self.__accumNames[name] = 0
name = _ustr(name) # will always return a str, but use _ustr for consistency
if not (isinstance(toklist, (type(None), basestring, list)) and toklist in (None,'',[])):
if isinstance(toklist,basestring):
if isinstance(toklist,ParseResults):
self[name] = _ParseResultsWithOffset(toklist.copy(),0)
self[name] = _ParseResultsWithOffset(ParseResults(toklist[0]),0)
except (KeyError,TypeError,IndexError):
def __getitem__( self, i ):
if isinstance( i, (int,slice) ):
if i not in self.__accumNames:
return self.__tokdict[i][-1][0]
return ParseResults([ v[0] for v in self.__tokdict[i] ])
def __setitem__( self, k, v, isinstance=isinstance ):
if isinstance(v,_ParseResultsWithOffset):
self.__tokdict[k] = self.__tokdict.get(k,list()) + [v]
elif isinstance(k,(int,slice)):
self.__tokdict[k] = self.__tokdict.get(k,list()) + [_ParseResultsWithOffset(v,0)]
if isinstance(sub,ParseResults):
sub.__parent = wkref(self)
def __delitem__( self, i ):
if isinstance(i,(int,slice)):
mylen = len( self.__toklist )
removed = list(range(*i.indices(mylen)))
# fixup indices in token dictionary
for name,occurrences in self.__tokdict.items():
for k, (value, position) in enumerate(occurrences):
occurrences[k] = _ParseResultsWithOffset(value, position - (position > j))
def __contains__( self, k ):
return k in self.__tokdict
def __len__( self ): return len( self.__toklist )
def __bool__(self): return ( not not self.__toklist )
def __iter__( self ): return iter( self.__toklist )
def __reversed__( self ): return iter( self.__toklist[::-1] )
if hasattr(self.__tokdict, "iterkeys"):
return self.__tokdict.iterkeys()
return iter(self.__tokdict)
return (self[k] for k in self._iterkeys())
return ((k, self[k]) for k in self._iterkeys())
"""Returns an iterator of all named result keys (Python 3.x only)."""
"""Returns an iterator of all named result values (Python 3.x only)."""
"""Returns an iterator of all named result key-value tuples (Python 3.x only)."""
"""Returns an iterator of all named result keys (Python 2.x only)."""
"""Returns an iterator of all named result values (Python 2.x only)."""
"""Returns an iterator of all named result key-value tuples (Python 2.x only)."""
"""Returns all named result keys (as a list in Python 2.x, as an iterator in Python 3.x)."""
return list(self.iterkeys())
"""Returns all named result values (as a list in Python 2.x, as an iterator in Python 3.x)."""
return list(self.itervalues())
"""Returns all named result key-values (as a list of tuples in Python 2.x, as an iterator in Python 3.x)."""
return list(self.iteritems())
"""Since keys() returns an iterator, this method is helpful in bypassing
code that looks for the existence of any defined results names."""
return bool(self.__tokdict)
def pop( self, *args, **kwargs):
Removes and returns item at specified index (default=C{last}).
Supports both C{list} and C{dict} semantics for C{pop()}. If passed no
argument or an integer argument, it will use C{list} semantics
and pop tokens from the list of parsed tokens. If passed a
non-integer argument (most likely a string), it will use C{dict}
semantics and pop the corresponding value from any defined
results names. A second default return value argument is
supported, just as in C{dict.pop()}.
def remove_first(tokens):