# Lmnl.def
# From LMNLWiki
# (back to LMNL Experiment page LMNL Parser Experiment)
#
# LMNL grammar rendered in the SimpleParse EBNF variant
#
# Matt Palmer, 7 July 2008
#
# LMNL: http://www.lmnl.org/wiki/index.php/Main_Page
# SimpleParse: http://simpleparse.sourceforge.net/simpleparse_grammars.html
#
# , sequences are separated by commas
# / alternatives are separated by forward slash (takes precedence over sequences)
# - not the expression that follows (a negative match consumes one character when that expression does not match)
# + one or more
# * zero to many
# ? zero or one
# ( ) grouped expression
# < > indicates not to output the production in the final parser output
# > < indicates not to output the production, but to output child productions
#
# Current choices for <> and >< have no particular significance,
# and only reflect what I was doing the last time I was playing with the grammar.
# Feel free to tweak them to see how the parser matches the productions.
# Content -> Markup -> Tag -> Metadata -> Annotation -> Content
# adding ? to Content (to allow empty document) - Content itself must match something.
# Document root: the prolog followed by optional content.
LMNLDocument       ::= Prolog, Content?

# Prolog: an optional LMNL declaration, then any number of Misc items.
Prolog             ::= LMNLDeclaration?, Misc*

# LMNL declaration: "[!lmnl", whitespace, version info, optional encoding
# declaration, closing "]".
LMNLDeclaration    ::= "[!lmnl", S, VersionInfo, S?, EncodingDecl?, S?, "]"

# Quoted attribute-style values inside the declaration.
VersionInfo        ::= "version", Eq, Quote, VersionNum, Quote
EncodingDecl       ::= "encoding", Eq, Quote, EncName, Quote

# Encoding name: one ASCII letter, then letters, digits, ".", "_" or "-".
EncName            ::= EncNameStartChar, EncNameChar*
<EncNameStartChar> ::= [a-zA-Z]
<EncNameChar>      ::= [a-zA-Z0-9_.] / "-"

# Version number: one or more of letters, digits, "_", ".", ":" or "-".
VersionNum         ::= ([A-Za-z0-9.:_] / "-")+
# Misc: namespace declaration, comment, or whitespace.  The production itself
# is not reported, but its children are.
>Misc<        ::= NSDeclaration / Comment / S

# Namespace declaration: "[!ns", whitespace, optional "prefix =", a URI
# literal, closing "]".
NSDeclaration ::= "[!ns", S, (Prefix, Eq)?, URILiteral, S?, "]"
# Comment: "[!--", the comment text, then "--]".
Comment ::= OpenComment, CommentText, CloseComment
<OpenComment> ::= '[!--'
<CloseComment> ::= '--]'
# Comment text: zero or more Char, stopping in front of the closing "--]".
# "?-CloseComment" is a zero-width negative lookahead: it succeeds without
# consuming input when CloseComment does not match at the current position.
# A bare "-CloseComment" after Char would itself consume a character, so each
# iteration would eat two characters and could step over the terminator.
CommentText ::= (?-CloseComment, Char)*
# Definition of Content might be problematic - can match zero characters
# Triggers infinite loop when recursive definition of it within Annotation is used?
# Content ::= CharData?, ((Tag / Atom / NSDeclaration / Comment), CharData?)*
# Trying another formulation - * zero to many alternatives, so allowing an empty document,
# but the alternatives themselves always fully match or not.
# ... which worked until I tried to replace Content in Annotation, and it blew up again.
# Changing Content to specify it must match something too using + quantifier - still blows up.
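# Reverted back to * for a while - so zero content could match and an empty document was
# possible - but still couldn't embed Content in Annotation without a big nothing.
# (The definition below uses + again; the empty document is handled by Content? in LMNLDocument.)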
# Content: one or more items, each either plain text or markup.
Content  ::= (Plain / Markup)+

# Plain text is character data; the wrapper is not reported, CharData is.
>Plain<  ::= CharData

# Markup: a tag, an atom, a namespace declaration, or a comment, tried in
# that order.  The wrapper is not reported, the matched child is.
>Markup< ::= Tag / Atom / NSDeclaration / Comment
# Changing CharData to use + quantifier instead of * quantifier.
# Ensures CharData always matches or doesn't match something
# and can't get caught in an infinite loop matching zero width.
# Character data: one or more characters, up to the next reserved character.
CharData             ::= -ReservedCharacters+

# Reserved characters are the two tag-opening delimiters.
<ReservedCharacters> ::= OpenStartTag / OpenEndTag

# Tag delimiters (none of them is reported in the parse output).
<OpenStartTag>       ::= "["
<CloseStartTag>      ::= "}"
<OpenEndTag>         ::= "{"
<CloseEndTag>        ::= "]"
# Tag: a start tag or an end tag (the EmptyTag alternative is disabled).
>Tag<    ::= StartTag / EndTag
#/ EmptyTag

# Start tag: "[" ... "}"
StartTag ::= OpenStartTag, TagContent, CloseStartTag

# End tag: "{" ... "]"
EndTag   ::= OpenEndTag, TagContent, CloseEndTag

# EmptyTag ::= OpenStartTag, TagContent, CloseEndTag
# Grouping of spaces in TagContent needs to change as SimpleParse doesn't do backtracking.
# Otherwise too many spaces match, and Metadata never matches.
#>TagContent< ::= (TagName, S?)?, (IdentitySpec, S?)?, (S, MetaData)?, S?
# Changing tagname to remove ? still creates infinite loop when setting Content in Annotation
# Inside a tag: optional name, optional identity spec (with optional leading
# whitespace), optional whitespace-separated metadata, optional trailing
# whitespace.
>TagContent<   ::= TagName?, (S?, IdentitySpec)?, (S, MetaData)?, S?

# A tag name is a qualified name.
>TagName<      ::= QName

# Identity spec: "=" followed by an identifier.
>IdentitySpec< ::= Eq, Identifier
Identifier     ::= Name

# Metadata: one or more namespace declarations, comments, annotations or
# whitespace runs.
>MetaData<     ::= (NSDeclaration / Comment / Annotation / S)+
# Content in Annotation causes a problem - SimpleParse doesn't come back with anything...
# Replacing with CharData for the time being.
# Might be something to do with "Quantified Potentiality"
# (from "Text Processing in Python", 2003, D. Mertz):
# Consider the following productions:
# a := (b? / c)*
# x := (y?, z?)+
# Either of them can match zero characters, so you get an infinite loop of zero width matches.
# Can appear in complicated ways - can be several productions, or sub-productions involved.
# Not sure if this is the cause.
# CharData?, (Tag, CharData?)* instead of CharData sends it off into nowhere too.
# CharData?, Tag?, CharData returns, but matches almost nothing anyway.
#Annotation ::= (AnnotationStartTag, Content, AnnotationEndTag) / EmptyAnnotationTag
# Annotation: start tag, character data, end tag.  CharData stands in for
# Content here, and the empty-annotation alternative is disabled.
>Annotation<                ::= (AnnotationStartTag, CharData, AnnotationEndTag)
# / EmptyAnnotationTag

# Annotation start tag: "[" name "}" (the metadata-carrying variant is disabled).
#AnnotationStartTag ::= '[', AnnotationName, (S, MetaData)?, S?, '}'
AnnotationStartTag          ::= "[", AnnotationName, S?, "}"

# Annotation end tag: the abbreviated form is tried before the full form.
>AnnotationEndTag<          ::= AbbreviatedAnnotationEndTag / FullAnnotationEndTag
AbbreviatedAnnotationEndTag ::= "{]"
#FullAnnotationEndTag ::= '{', AnnotationName, (S, MetaData)?, S?, ']'
FullAnnotationEndTag        ::= "{", AnnotationName, S?, "]"

# Empty annotation tag: "[" name "]".  Defined, but not referenced while the
# alternative in Annotation stays commented out.
#EmptyAnnotationTag ::= '[', AnnotationName, (S, MetaData)?, S?, ']'
EmptyAnnotationTag          ::= "[", AnnotationName, S?, "]"

# An annotation name is a qualified name.
>AnnotationName<            ::= QName
# Added named productions for the components of the atom
# Atom: named, Unicode code point, or bracket atom, tried in that order.
>Atom<           ::= NamedAtom / UnicodeAtom / BracketAtom

# Atom delimiters (not reported in the parse output).
<OpenAtomChars>  ::= "{{"
<CloseAtomChars> ::= "}}"

# Named atom: "{{", optional name with optional trailing whitespace, "}}".
# The metadata-carrying variant is disabled.
#NamedAtom ::= OpenAtomChars, (TagName, S?)?, (MetaData)?, CloseAtomChars
NamedAtom        ::= OpenAtomChars, (TagName, S?)?, CloseAtomChars

# Unicode atom: "{{#x", hexadecimal code point, "}}".
UnicodeAtom      ::= OpenAtomChars, "#x", CodePoint, CloseAtomChars

# Bracket atom: "{{#", one of the bracket codes, "}}".
BracketAtom      ::= OpenAtomChars, "#", BracketCodes, CloseAtomChars

# One or more hexadecimal digits, either case.
CodePoint        ::= [0-9A-Fa-f]+

# The four bracket codes.
BracketCodes     ::= "lsqb" / "lcub" / "rsqb" / "rcub"
# Literal: a double-quoted string; its data is everything up to the next quote
# and may be empty.
Literal         ::= Quote, LiteralCharData, Quote
LiteralCharData ::= -Quote*

# Need a proper definition for URILiteral:
URILiteral      ::= Literal

# Qualified name: optional "prefix:" followed by a local name.
>QName<         ::= (Prefix, ":")?, LocalName
Prefix          ::= Name
LocalName       ::= Name
# Name: one start character followed by any number of name characters.
<Name> ::= NameStartChar, (NameChar)*

# Characters allowed at the start of a name.  Each range is listed once; the
# supplementary planes have their own range, so the matched set is still
# U+F900 through U+EFFFF at the top end.
<NameStartChar> ::= [A-Z] / "_" / [a-z] / [\u00C0-\u02FF] / [\u0370-\u037D] /
                    [\u037F-\u1FFF] / [\u200C-\u200D] / [\u2070-\u218F] /
                    [\u2C00-\u2FEF] / [\u3001-\uD7FF] / [\uF900-\uFFFF] /
                    [\U00010000-\U000EFFFF]

# Characters allowed after the first: any start character, "-", ".", a digit,
# U+00B7, U+0300-U+036F, or U+203F-U+2040.
# The last range must be written with exactly four hex digits after \u.
# Written as "\u0203F" it reads as U+0203 followed by the range "F"..U+2040,
# which would let a name swallow delimiters such as "[", "]", "{" and "}".
<NameChar> ::= NameStartChar / '-' / '.' / [0-9] /
               [\xB7] / [\u0300-\u036F] / [\u203F-\u2040]
# Char: tab, line feed, carriage return, printable ASCII, U+0085, and the
# listed BMP and supplementary ranges.  Each range appears once.
# Hex escapes use two digits.  NOTE(review): confirm whether the grammar
# parser reads the one-digit form "\x9" as a hex escape at all; the two-digit
# form is unambiguous either way.
Char ::= [\x09] / [\x0A] / [\x0D] / [\x20-\x7E] / [\x85] /
         [\u00A0-\uD7FF] / [\uE000-\uFFFD] / [\U00010000-\U0010FFFF]
# Whitespace: one or more of space, tab, line feed, carriage return.
# Carriage return is included so that CRLF line breaks count as whitespace
# wherever S is used; Char already admits U+000D.
# Hex escapes are written with two digits so they are unambiguous.
<S> ::= ([\x20] / [\x09] / [\x0A] / [\x0D])+
# Equals sign with optional whitespace on either side (not reported).
<Eq>    ::= S?, "=", S?

# The double-quote character (not reported).
<Quote> ::= '"'
