# Lmnl.def
#
# From LMNLWiki
#
# (back to the LMNL experiment page: "LMNL Parser Experiment")

#
# LMNL grammar rendered in the SimpleParse EBNF variant
# 
# Matt Palmer, 7 July 2008
#
# LMNL: 	http://www.lmnl.org/wiki/index.php/Main_Page
# SimpleParse:	http://simpleparse.sourceforge.net/simpleparse_grammars.html
#
#  ,	sequences are separated by commas
#  /	alternatives are separated by forward slash (takes precedence over sequences)
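#  -	negative match: consume one character where the expression that follows does NOT match
#  	(combined with + or *, keep consuming characters until the expression does match)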
#  +	one or more
#  *	zero to many
#  ?	zero or one
# ( )	grouped expression
# < > 	indicates not to output the production in the final parser output
# > < 	indicates not to output the production, but to output child productions
#
# Current choices for <> and >< have no particular significance, 
# and only reflect what I was doing the last time I was playing with the grammar. 
# Feel free to tweak them to see how the parser matches the productions.

# Content -> Markup -> Tag -> Metadata -> Annotation -> Content

# Root production: a Prolog followed by optional Content.
# Content is optional here so that an empty document is allowed;
# the Content production itself must always match something.
LMNLDocument  ::= Prolog, Content?

# Prolog: an optional LMNL declaration, then any number of Misc items.
Prolog        ::= LMNLDeclaration?, Misc*

# LMNL declaration, e.g. [!lmnl version="1.0" encoding="UTF-8"]
# The version is required; the encoding declaration is optional.
LMNLDeclaration     ::= '[!lmnl', S, VersionInfo, S?, EncodingDecl?, S?, ']'

# version="..." : one or more of letters, digits, '_', '.', ':' and '-'.
VersionInfo         ::= 'version', Eq, Quote, VersionNum, Quote
VersionNum          ::= ('-' / [a-zA-Z0-9_.:])+

# encoding="..." : a letter followed by letters, digits, '.', '_' or '-'.
EncodingDecl        ::= 'encoding', Eq, Quote, EncName, Quote
EncName             ::= EncNameStartChar, EncNameChar*
<EncNameStartChar>  ::= [a-zA-Z]
<EncNameChar>       ::= '-' / [a-zA-Z0-9._]

# Misc: the items allowed in the Prolog - namespace declarations, comments
# and whitespace. The production is expanded (> <), so only its children
# are reported.
>Misc<         ::= NSDeclaration / Comment / S

# Namespace declaration: '[!ns', an optional "prefix =", a URI literal, ']'.
NSDeclaration  ::= '[!ns', S, (Prefix, Eq)?, URILiteral, S?, ']'
# Comment: '[!--', arbitrary text, '--]'.
Comment         ::= OpenComment, CommentText, CloseComment
<OpenComment>   ::= '[!--'
<CloseComment>  ::= '--]'

# CommentText consumes every character up to, but not including, the next
# CloseComment; it may be empty.
# In SimpleParse a negative match (-X) itself consumes a character, so pairing
# it with Char in a sequence eats two characters per iteration and can never
# stop exactly at '--]'. The repeated negative match is the same idiom used by
# CharData and LiteralCharData.
# NOTE(review): this form does not restrict the text to the Char production;
# confirm whether that restriction is required.
CommentText     ::= -CloseComment*

# Definition of Content might be problematic - can match zero characters
# Triggers infinite loop when recursive definition of it within Annotation is used?
# Content			::=  	CharData?, ((Tag / Atom / NSDeclaration / Comment), CharData?)*
# Trying another formulation - * zero to many alternatives, so allowing an empty document,
# but the alternatives themselves always fully match or not.
# ... which worked until I tried to replace Content in Annotation, and it blew up again.
# Changing Content to specify it must match something too using + quantifier - still blows up.
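# Reverted back to * at one point - so zero content can match and an empty document is possible -
# but Content still can't be embedded in Annotation without a big nothing.
# NOTE: the definition below currently uses + again; the empty-document case is handled by
# LMNLDocument, which declares Content as optional (Content?).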

# Content: one or more runs of plain character data or markup.
# Plain and Markup are expanded (> <) productions, so only their children
# appear in the parser output.
Content   ::= (Plain / Markup)+
>Plain<   ::= CharData

# Alternatives are tried in the order listed.
>Markup<  ::= Tag / Atom / NSDeclaration / Comment

# Character data: one or more characters up to the next reserved character.
# The + quantifier (rather than *) guarantees CharData either matches at least
# one character or fails, so it cannot be caught in an infinite loop of
# zero-width matches.
CharData              ::= -ReservedCharacters+

# Only the two opening brackets end a run of character data.
# NOTE(review): the closing brackets '}' and ']' are not reserved here -
# confirm that is intended.
<ReservedCharacters>  ::= OpenStartTag / OpenEndTag

# Bracket tokens shared by the tag productions.
<OpenStartTag>        ::= '['
<CloseStartTag>       ::= '}'
<OpenEndTag>          ::= '{'
<CloseEndTag>         ::= ']'

# A tag is either a start tag ( [ ... } ) or an end tag ( { ... ] ).
# The empty tag ( [ ... ] ) alternative is currently disabled.
>Tag<     ::= StartTag / EndTag
# / EmptyTag
StartTag  ::= OpenStartTag, TagContent, CloseStartTag
EndTag    ::= OpenEndTag, TagContent, CloseEndTag
# EmptyTag ::= OpenStartTag, TagContent, CloseEndTag

# The grouping of spaces in TagContent is deliberate: SimpleParse doesn't do backtracking.
# With the earlier formulation (kept below for reference) too many spaces matched,
# and MetaData never matched.
#>TagContent<			::= 	(TagName, S?)?, (IdentitySpec, S?)?, (S, MetaData)?, S?

# Interior of a tag: optional name, optional identity ("= identifier"),
# optional metadata introduced by whitespace, optional trailing whitespace.
# Every part is optional, so TagContent can match zero characters.
# Changing TagName to remove the ? still creates an infinite loop when setting Content in Annotation.
>TagContent<			::= 	TagName?, (S?, IdentitySpec)?, (S, MetaData)?, S?

# Components of TagContent.
# TagName is a (possibly prefixed) qualified name.
>TagName<       ::= QName

# Identity: '=' (with optional surrounding whitespace) followed by a Name.
>IdentitySpec<  ::= Eq, Identifier
Identifier      ::= Name

# Metadata: one or more namespace declarations, comments, annotations
# or whitespace runs.
>MetaData<      ::= (NSDeclaration / Comment / Annotation / S)+

# Content in Annotation causes a problem - SimpleParse doesn't come back with anything...
# Replacing with CharData for the time being.
# Might be something to do with "Quantified Potentiality"
# (from "Text Processing in Python", 2003, D. Mertz):
# Consider the following productions:
# a := (b? / c)*
# x := (y?, z?)+
# Either of them can match zero characters, so you get an infinite loop of zero width matches.  
# Can appear in complicated ways - can be several productions, or sub-productions involved.
# Not sure if this is the cause.
# CharData?, (Tag, CharData?)* instead of CharData sends it off into nowhere too.
# CharData?, Tag?, CharData returns, but matches almost nothing anyway.

# Annotation: start tag, character data, end tag.
# Intended definition, disabled because Content inside Annotation makes
# SimpleParse come back with nothing:
# Annotation ::= (AnnotationStartTag, Content, AnnotationEndTag) / EmptyAnnotationTag
>Annotation<  ::= AnnotationStartTag, CharData, AnnotationEndTag
# / EmptyAnnotationTag

# Annotation tags. Each active production is the simplified form without
# nested MetaData; the MetaData-bearing form is kept as a comment above it.

# AnnotationStartTag ::= '[', AnnotationName, (S, MetaData)?, S?, '}'
AnnotationStartTag           ::= '[', AnnotationName, S?, '}'

# An annotation is closed either by the abbreviated '{]' or by a full, named end tag.
>AnnotationEndTag<           ::= AbbreviatedAnnotationEndTag / FullAnnotationEndTag
AbbreviatedAnnotationEndTag  ::= '{]'

# FullAnnotationEndTag ::= '{', AnnotationName, (S, MetaData)?, S?, ']'
FullAnnotationEndTag         ::= '{', AnnotationName, S?, ']'

# EmptyAnnotationTag ::= '[', AnnotationName, (S, MetaData)?, S?, ']'
EmptyAnnotationTag           ::= '[', AnnotationName, S?, ']'

# Annotation names are (possibly prefixed) qualified names.
>AnnotationName<             ::= QName

# Atoms: '{{' ... '}}'. The components have their own named productions.
# Alternatives are tried in the order listed.
>Atom<            ::= NamedAtom / UnicodeAtom / BracketAtom
<OpenAtomChars>   ::= '{{'
<CloseAtomChars>  ::= '}}'

# Named atom: an optional name between the atom delimiters.
# Form with metadata, currently disabled:
# NamedAtom ::= OpenAtomChars, (TagName, S?)?, (MetaData)?, CloseAtomChars
NamedAtom         ::= OpenAtomChars, (TagName, S?)?, CloseAtomChars

# Unicode atom: '#x' followed by one or more hexadecimal digits.
UnicodeAtom       ::= OpenAtomChars, '#x', CodePoint, CloseAtomChars
CodePoint         ::= [0-9A-Fa-f]+

# Bracket atom: '#' followed by one of the four bracket codes.
BracketAtom       ::= OpenAtomChars, '#', BracketCodes, CloseAtomChars
BracketCodes      ::= 'lsqb' / 'lcub' / 'rsqb' / 'rcub'

# Quoted literal: a double quote, any run of non-quote characters
# (possibly empty), and a closing double quote.
Literal          ::= Quote, LiteralCharData, Quote
LiteralCharData  ::= -Quote*

# Need a proper definition for URILiteral; for now any Literal is accepted.
URILiteral       ::= Literal

# Qualified name: an optional "prefix:" followed by a local name.
>QName<    ::= (Prefix, ':')?, LocalName
Prefix     ::= Name
LocalName  ::= Name

# Name: a start character followed by zero or more name characters.
<Name>     ::= NameStartChar, NameChar*
# Characters that may begin a Name: ASCII letters, underscore, and the
# listed non-ASCII ranges. Each range is listed once; the final range
# runs from U+F900 through U+EFFFF.
<NameStartChar>  ::= [A-Za-z_] /
                     [\u00C0-\u02FF] / [\u0370-\u037D] / [\u037F-\u1FFF] /
                     [\u200C-\u200D] / [\u2070-\u218F] / [\u2C00-\u2FEF] /
                     [\u3001-\uD7FF] / [\uF900-\U000EFFFF]
# Characters that may follow the first character of a Name: any
# NameStartChar, '-', '.', ASCII digits, the middle dot (U+00B7), the
# combining marks U+0300-U+036F, and U+203F-U+2040.
# A \u escape takes exactly four hex digits, so the last range must be
# written \u203F-\u2040. A fifth digit would be read as a separate literal
# character and widen the class to include the bracket characters
# '[', ']', '{' and '}'.
<NameChar>  ::= NameStartChar / '-' / '.' / [0-9] /
                [\xB7] / [\u0300-\u036F] / [\u203F-\u2040]
# Any permitted character: tab, line feed, carriage return, printable ASCII
# (U+0020-U+007E), NEL (U+0085), and the listed non-ASCII ranges, which
# exclude the surrogate block U+D800-U+DFFF.
# Hex escapes are written with two digits (\x09, not \x9) so that they are
# well-formed Python-style escapes. Each range is listed once.
Char  ::= [\x09] / [\x0A] / [\x0D] / [\x20-\x7E] / [\x85] /
          [\u00A0-\uD7FF] / [\uE000-\uFFFD] / [\U00010000-\U0010FFFF]
# Whitespace: one or more spaces, tabs or line feeds.
# Hex escapes are written with two digits (\x09, \x0A) so that they are
# well-formed Python-style escapes.
# NOTE(review): carriage return (\x0D) is not treated as whitespace here -
# confirm whether CRLF input needs to be supported.
<S>      ::= ([\x20] / [\x09] / [\x0A])+

# Equals sign with optional surrounding whitespace.
<Eq>     ::= S?, '=', S?

# Double-quote delimiter used by literals and declarations.
<Quote>  ::= '"'