// LMNL.g
//
// From LMNLWiki
//
// (back to the LMNL Parser Experiment page)

// Combined (parser + lexer) grammar for LMNL documents.
grammar LMNL;

options {
	// Backtracking fixes numerous non-LL(*) recursive rule problems, plus the
	// warnings about multiple independent alternative matches.
	backtrack = true;
	// Target language of the generated recogniser; comment this line out to fall
	// back to the default (Java).
	language = Python;
}

// Entry rule: a prolog, then the document content, then end of input.
lmnlDocument
	:	prolog
		content
		EOF
	;

// Prolog: an optional LMNL declaration followed by any number of misc items.
prolog
	:	lmnlDeclaration?
		misc*
	;
	
// LMNL declaration: '[' '!lmnl' versionInfo [encodingDecl] ']'.
// Optional whitespace is accepted before the opening bracket (open question from
// the author: should it be allowed right at the start?) and before the closing one.
lmnlDeclaration
	:	WS?
		LEFT_SQUARE_BRACKET '!lmnl'
		versionInfo
		encodingDecl?
		WS?
		RIGHT_SQUARE_BRACKET
	;

// Version pseudo-attribute of the declaration: whitespace, the keyword 'version',
// an equals sign and a quoted version number.
versionInfo
	:	WS 'version' Eq
		QUOTE versionNum QUOTE
	;

// The version number itself is a single VERSION_NUM token.
versionNum
	:	VERSION_NUM
	;

// Encoding pseudo-attribute of the declaration: whitespace, the keyword 'encoding',
// an equals sign and a quoted encoding name.
encodingDecl
	:	WS 'encoding' Eq
		QUOTE encName QUOTE
	;

// The encoding name itself is a single ENCODING_NAME token.
encName
	:	ENCODING_NAME
	;

// Miscellaneous items allowed in the prolog and inside metadata:
// a namespace declaration, a comment, or a run of whitespace.
// NOTE(review): `fragment` is normally a lexer-rule modifier; its effect on this
// parser rule is unclear - confirm whether it can simply be dropped.
fragment misc
	:	nsDeclaration
	|	comment
	|	WS
	;

// Namespace declaration: '[' '!ns' whitespace [prefix '='] uriLiteral ']',
// with optional whitespace before the closing bracket.
nsDeclaration
	:	LEFT_SQUARE_BRACKET '!ns' WS
		( prefix Eq )?
		uriLiteral
		WS?
		RIGHT_SQUARE_BRACKET
	;

// Comment: comment text enclosed between START_COMMENT and END_COMMENT.
comment
	:	START_COMMENT
		commentText
		END_COMMENT
	;
	
// Body of a comment: every token up to (but not including) END_COMMENT.
// The previous form, (COMMENT_CHAR ~END_COMMENT)*, consumed tokens in pairs - a
// COMMENT_CHAR followed by any non-END_COMMENT token - so comment text with an odd
// number of tokens could never be matched. Negating END_COMMENT alone expresses the
// intended "anything that does not close the comment".
commentText
	:	( ~END_COMMENT )*
	;
	
// Document (or annotation) content: any interleaving of character data and markup.
content
	:	( charData | markup )*
	;

// A run of character data is a single CHARDATA token.
charData
	:	CHARDATA
	;

// Markup that may appear within content: a tag, an atom, a namespace declaration
// or a comment. Alternatives are tried in this order.
// NOTE(review): `fragment` is normally a lexer-rule modifier; its effect on the
// parser rules in this block is unclear - confirm whether it can be dropped.
fragment markup
	:	tag
	|	atom
	|	nsDeclaration
	|	comment
	;

// A tag is a start tag, an end tag or an empty tag.
fragment tag
	:	startTag
	|	endTag
	|	emptyTag
	;
	
// DEBUG NOTES:
// running with CharData in annotation:
// including annotation in metadata causes fatal non LL(*) recursion - resolve by left factoring, syntactic predicates, etc.
// but does not if annotations cannot themselves contain metadata.
// Can include metadata in Annotation tag if Empty Tag does not exist.
// Clearly can match empty tag + metadata - get multiple alternative matches:
// startTag and emptyTag.
// OPEN_START_TAG NAME OPEN_START_TAG NAME
// [FIRST[SECOND 
// Putting all else (still with CharData in annotation) back as it should be (metadata include)
// Now having emptyTag causes non-LL* recursive rule invocations, and we get multiple alternative matches.
// backtrack = true seems to solve all these issues.

// The three tag forms share the same tagContent and differ only in their delimiters.

// Start tag: openStartTag ... closeStartTag.
startTag
	:	openStartTag
		tagContent
		closeStartTag
	;

// End tag: openEndTag ... closeEndTag.
endTag
	:	openEndTag
		tagContent
		closeEndTag
	;

// Empty tag: opens like a start tag and closes like an end tag.
emptyTag
	:	openStartTag
		tagContent
		closeEndTag
	;

// Inside of a tag: optional name, optional '=' identifier, then any metadata.
// Open question from the author: should the tag name be optional? This is not
// quite as in the original LMNL spec.
// NOTE(review): `fragment` is normally a lexer-rule modifier; its effect on this
// parser rule is unclear - confirm whether it can be dropped.
fragment tagContent
	:	tagName?
		( Eq tagIdentifier )?
		metaData*
	;
	
// An atom is a named atom, a unicode (code point) atom or a shorthand atom.
// Alternatives are tried in this order.
// NOTE(review): `fragment` is normally a lexer-rule modifier; its effect on this
// parser rule is unclear - confirm whether it can be dropped.
fragment atom
	:	namedAtom
	|	unicodeAtom
	|	shortAtom
	;

// Named atom: optional name plus any metadata between the atom delimiters.
// Open question from the author: should the atom name be optional?
namedAtom
	:	openAtom
		atomName?
		metaData*
		closeAtom
	;

// Unicode atom: number sign, hex indicator and a code point between the atom delimiters.
unicodeAtom
	:	openAtom
		NUMBER_SIGN HEX_INDICATOR codePoint
		closeAtom
	;

// Shorthand atom: number sign and one of the shortHand keywords between the atom delimiters.
shortAtom
	:	openAtom
		NUMBER_SIGN shortHand
		closeAtom
	;

// Metadata carried by a tag or atom: an annotation or a misc item.
// NOTE(review): `fragment` is normally a lexer-rule modifier; its effect on this
// parser rule is unclear - confirm whether it can be dropped.
fragment metaData
	:	annotation
	|	misc
	;

// Annotation: either a start tag / content / end tag triple, or an empty annotation tag.
// Author's notes: the content reference could be changed to charData; the empty-tag
// alternative causes an "unexpected end of AST tree" - possibly because it can match
// the same input as the first alternative.
annotation
	:	annotationStartTag content annotationEndTag
	|	emptyAnnotationTag
	;
	
// Annotation start tag: name plus any metadata between the start-tag delimiters.
// No separate whitespace is needed before the metadata - metaData already allows it.
annotationStartTag
	:	openStartTag
		annotationName
		metaData*
		closeStartTag
	;

// Annotation end tag: the abbreviated form is tried first, then the full form.
annotationEndTag
	:	abbreviatedAnnotationEndTag
	|	fullAnnotationEndTag
	;

// NOTE(review): `fragment` is normally a lexer-rule modifier; its effect on the
// three parser rules below is unclear - confirm whether it can be dropped.

// Abbreviated end tag: the end-tag delimiters with nothing in between.
fragment abbreviatedAnnotationEndTag
	:	openEndTag
		closeEndTag
	;

// Full end tag: name plus any metadata between the end-tag delimiters
// (optional whitespace before the closing delimiter was removed).
fragment fullAnnotationEndTag
	:	openEndTag
		annotationName
		metaData*
		closeEndTag
	;

// Empty annotation: opens like a start tag, closes like an end tag
// (optional whitespace before the closing delimiter was removed).
fragment emptyAnnotationTag
	:	openStartTag
		annotationName
		metaData*
		closeEndTag
	;
	

	
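// ---------------------------------------------------------------------------------------
// Token-level PARSER rules
// Thin lower-case wrappers around the basic tokens (names, tag and atom delimiters, ...)
// so that the rules above can refer to them by role.
// The lexer rules themselves are defined in the "Fragments" section further down.
// ---------------------------------------------------------------------------------------


// Note: character data - any run of text that isn't a reserved character, i.e. all
// document text between markup - is matched by the CHARDATA lexer rule.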

// A tag name is a (possibly prefixed) qualified name.
tagName
	:	qName
	;

// A tag identifier is a plain NAME token.
tagIdentifier
	:	NAME
	;
	
// Parser-level aliases for the four tag delimiter tokens.
// NOTE(review): `fragment` is normally a lexer-rule modifier; its effect on these
// parser rules is unclear - confirm whether it can be dropped.

fragment openStartTag
	:	OPEN_START_TAG
	;

fragment closeStartTag
	:	CLOSE_START_TAG
	;

fragment openEndTag
	:	OPEN_END_TAG
	;

fragment closeEndTag
	:	CLOSE_END_TAG
	;

// An atom name is a (possibly prefixed) qualified name.
atomName
	:	qName
	;

// NOTE(review): `fragment` is normally a lexer-rule modifier; its effect on the
// two parser rules below is unclear - confirm whether it can be dropped.

// Atom opener: two consecutive OPEN_END_TAG tokens.
fragment openAtom
	:	OPEN_END_TAG
		OPEN_END_TAG
	;

// Atom closer: two consecutive CLOSE_START_TAG tokens.
fragment closeAtom
	:	CLOSE_START_TAG
		CLOSE_START_TAG
	;

// An annotation name is a (possibly prefixed) qualified name.
annotationName
	:	qName
	;

// Qualified name: an optional "prefix COLON" followed by the local part.
// NOTE(review): `fragment` is normally a lexer-rule modifier; its effect on this
// parser rule is unclear - confirm whether it can be dropped.
fragment qName
	:	( prefix COLON )?
		localPart
	;

// Namespace prefix: a plain NAME token.
prefix
	:	NAME
	;

// Local part of a qualified name: a plain NAME token.
localPart
	:	NAME
	;

// Code point of a unicode atom: one or more hexadecimal digits.
codePoint
	:	HEX_DIGIT+
	;

// Keywords accepted in a shorthand atom.
shortHand
	:	'lsqb'
	|	'rsqb'
	|	'lcub'
	|	'rcub'
	;

// Placeholder: only the literal text 'uriLiteral' is matched until a real
// definition of a URI literal is supplied.
uriLiteral
	:	'uriLiteral'
	;
	
	
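// ---------------------------------------------------------------------------------------
// Fragments - named patterns or literals we can re-use in higher levels 
// (a fragment rule doesn't create a token in the lexer by itself)
// NOTE: every lexer rule in this section is declared as a fragment, so the generated
// lexer presumably emits tokens only for the quoted literals used in the parser rules
// ('!lmnl', 'version', ...). The token-level rules (CHARDATA, NAME, WS, ...) would need
// the fragment modifier removed before the parser can receive them - TODO confirm.
// ---------------------------------------------------------------------------------------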

	
// Character data: one or more characters other than the two reserved characters
// '[' and '{' that open markup.
// The complement is written over a literal character set: the earlier form negated
// a rule reference (~RESERVED_CHARACTERS) which itself expands through further rule
// references, and set complement in a lexer rule needs literal characters to work on.
fragment CHARDATA		:	( ~( '[' | '{' ) )+ ;
// Kept for reference: the characters that open markup ('[' and '{').
fragment RESERVED_CHARACTERS	:	OPEN_START_TAG | OPEN_END_TAG ;
// Name: a name start character followed by any number of name characters.
fragment NAME
	:	NAME_START_CHAR NAME_CHAR*
	;

// Version number: one or more alphanumerics, full stops, underscores, hyphens or colons.
fragment VERSION_NUM
	:	(	ALPHA_NUM_CHAR
		|	FULL_STOP
		|	UNDERSCORE
		|	HYPHEN
		|	COLON
		)+
	;

// Encoding name: a letter followed by alphanumerics, full stops, underscores or hyphens.
fragment ENCODING_NAME
	:	MIXED_CASE_CHAR
		(	ALPHA_NUM_CHAR
		|	FULL_STOP
		|	UNDERSCORE
		|	HYPHEN
		)*
	;
// Comment opener.
fragment START_COMMENT
	:	'[!--'
	;

// Characters allowed inside a comment.
// Open questions from the author: this definition includes the reserved
// characters [ and { - is that a problem? And why not simply use one range from
// SPACE to \uD7FF?
fragment COMMENT_CHAR
	:	TAB
	|	NEW_LINE
	|	CARRIAGE_RETURN
	|	PRINTABLE_ASCII
	|	ELLIPSIS
	|	'\u00A0' .. '\uD7FF'
	|	'\uE000' .. '\uFFFD'
	;
//	|	'\u10000' .. '\u10FFFF'	// left disabled by the author

// Comment closer.
fragment END_COMMENT
	:	'--]'
	;
// Tag delimiters, expressed through the bracket fragments:
// start tags open with a left square bracket and close with a right curly bracket;
// end tags open with a left curly bracket and close with a right square bracket.
fragment OPEN_START_TAG
	:	LEFT_SQUARE_BRACKET
	;
fragment CLOSE_START_TAG
	:	RIGHT_CURLY_BRACKET
	;
fragment OPEN_END_TAG
	:	LEFT_CURLY_BRACKET
	;
fragment CLOSE_END_TAG
	:	RIGHT_SQUARE_BRACKET
	;

// Equals sign with optional whitespace on either side.
fragment Eq
	:	WS? EQUAL WS?
	;
// Whitespace: one or more spaces, tabs, line feeds or carriage returns.
// CARRIAGE_RETURN is included so that input with CRLF line endings is treated as
// whitespace as well; previously only SPACE, TAB and NEW_LINE were accepted.
fragment WS			:	( SPACE | TAB | NEW_LINE | CARRIAGE_RETURN )+ ;
// Characters allowed after the first character of a name: every name start
// character, plus digits, hyphen, full stop, middle dot and two further ranges.
fragment NAME_CHAR
	:	NAME_START_CHAR
	|	DIGIT
	|	HYPHEN
	|	FULL_STOP
	|	MIDDLE_DOT
	|	'\u0300' .. '\u036F'
	|	'\u203F' .. '\u2040'
	;
// Characters allowed as the first character of a name.
// The ranges follow the XML NameStartChar production, minus the colon (prefixes are
// handled by the qName parser rule):
//  - \u00C0..\u02FF is split in three so that the multiplication sign (\u00D7) and
//    the division sign (\u00F7) are no longer accepted as name characters;
//  - the two four-hex-digit ranges \uF900..\uFDCF and \uFDF0..\uFFFD are included.
fragment NAME_START_CHAR	:	UPPER_CASE_CHAR | LOWER_CASE_CHAR | UNDERSCORE
				|	'\u00C0' .. '\u00D6'
				|	'\u00D8' .. '\u00F6'
				|	'\u00F8' .. '\u02FF'
				|	'\u0370' .. '\u037D' // beginning of greek & coptic block?
				|	'\u037F' .. '\u1FFF'
				|	'\u200C' .. '\u200D'
				|	'\u2070' .. '\u218F'
				|	'\u2C00' .. '\u2FEF'
				|	'\u3001' .. '\uD7FF'
				|	'\uF900' .. '\uFDCF'
				|	'\uFDF0' .. '\uFFFD' ;
//	|	'\u10000' .. '\uEFFFF' // supplementary planes - don't think the escape can handle beyond four hex chars
// Basic ASCII character classes.

// Letter or digit.
fragment ALPHA_NUM_CHAR
	:	MIXED_CASE_CHAR
	|	DIGIT
	;
// Upper- or lower-case ASCII letter.
fragment MIXED_CASE_CHAR
	:	UPPER_CASE_CHAR
	|	LOWER_CASE_CHAR
	;
// Space through tilde. Written with literal characters: the author found that
// named fragments are not accepted as the end points of a range.
fragment PRINTABLE_ASCII
	:	' ' .. '~'
	;
// Decimal digit or hexadecimal letter in either case.
fragment HEX_DIGIT
	:	DIGIT
	|	'A'..'F'
	|	'a'..'f'
	;
// The letter that introduces a hexadecimal code point.
fragment HEX_INDICATOR
	:	'x'
	;
fragment DIGIT
	:	'0'..'9'
	;
fragment UPPER_CASE_CHAR
	:	'A'..'Z'
	;
fragment LOWER_CASE_CHAR
	:	'a'..'z'
	;
// Single-character literals: space, brackets, braces, number sign and tilde.
fragment SPACE
	:	' '
	;
fragment LEFT_SQUARE_BRACKET
	:	'['
	;
fragment RIGHT_SQUARE_BRACKET
	:	']'
	;
fragment LEFT_CURLY_BRACKET
	:	'{'
	;
fragment RIGHT_CURLY_BRACKET
	:	'}'
	;
fragment NUMBER_SIGN
	:	'#'
	;
fragment TILDE
	:	'~'
	;
// U+0085 (NEXT LINE), which displays as an ellipsis in Windows-1252 - hence the name.
// It is the only C1 character that the XML 1.1 Char production allows outside the
// restricted set; the previous value, U+008E, is a restricted C1 control character.
fragment ELLIPSIS		: 	'\u0085' ;
// Remaining single-character literals: middle dot, control characters
// (tab U+0009, line feed U+000A, carriage return U+000D) and punctuation.
fragment MIDDLE_DOT
	:	'\u00B7'
	;
fragment TAB
	:	'\u0009'
	;
fragment NEW_LINE
	:	'\u000A'
	;
fragment CARRIAGE_RETURN
	:	'\u000D'
	;
fragment FULL_STOP
	:	'.'
	;
fragment UNDERSCORE
	:	'_'
	;
fragment HYPHEN
	:	'-'
	;
fragment COLON
	:	':'
	;
fragment QUOTE
	:	'"'
	;
fragment EQUAL
	:	'='
	;