// LMNL.g
// From LMNLWiki
// (back to the LMNL Parser Experiment page)
grammar LMNL;

options {
    // Backtracking fixes numerous non-LL(*) recursive rule problems, plus the
    // warnings about multiple independent matches.
    backtrack = true;
    // Target language of the generated recogniser.
    // Comment this out to fall back to the default (Java).
    language = Python;
}
// Entry rule: a prolog, then the document content, then end of input.
lmnlDocument
    :   prolog content EOF
    ;

// Prolog: an optional LMNL declaration followed by any number of misc items.
prolog
    :   lmnlDeclaration? misc*
    ;
// LMNL declaration: '[' '!lmnl' versionInfo, an optional encodingDecl, then ']'.
lmnlDeclaration
    :   WS?                         // allow optional whitespace right at start?
        LEFT_SQUARE_BRACKET '!lmnl'
        versionInfo
        encodingDecl?
        WS?
        RIGHT_SQUARE_BRACKET
    ;

// Version pseudo-attribute: whitespace, 'version', Eq and a quoted version number.
versionInfo
    :   WS 'version' Eq
        QUOTE versionNum QUOTE
    ;

// Parser-level wrapper around the VERSION_NUM token.
versionNum
    :   VERSION_NUM
    ;

// Encoding pseudo-attribute: whitespace, 'encoding', Eq and a quoted encoding name.
encodingDecl
    :   WS 'encoding' Eq
        QUOTE encName QUOTE
    ;

// Parser-level wrapper around the ENCODING_NAME token.
encName
    :   ENCODING_NAME
    ;
// Miscellaneous items: a namespace declaration, a comment, or whitespace.
fragment misc
    :   nsDeclaration
    |   comment
    |   WS
    ;

// Namespace declaration: '[' '!ns', whitespace, an optional "prefix Eq",
// a URI literal, optional whitespace and ']'.
nsDeclaration
    :   LEFT_SQUARE_BRACKET '!ns' WS
        ( prefix Eq )?
        uriLiteral
        WS?
        RIGHT_SQUARE_BRACKET
    ;

// Comment: comment text enclosed between START_COMMENT and END_COMMENT.
comment
    :   START_COMMENT
        commentText
        END_COMMENT
    ;
// Body of a comment: any run of tokens up to (but not including) END_COMMENT.
// The previous form, (COMMENT_CHAR ~END_COMMENT)*, consumed tokens strictly in
// pairs, so a comment body with an odd number of tokens could not be matched
// and only every other token was required to be a COMMENT_CHAR.
commentText
    :   ( ~END_COMMENT )*
    ;
// Document content: any interleaving of character data and markup (possibly empty).
content
    :   (   charData
        |   markup
        )*
    ;

// Parser-level wrapper around the CHARDATA token.
charData
    :   CHARDATA
    ;
// Markup: a tag, an atom, a namespace declaration or a comment.
fragment markup
    :   tag
    |   atom
    |   nsDeclaration
    |   comment
    ;

// Tag: a start tag, an end tag or an empty tag (tried in this order).
fragment tag
    :   startTag
    |   endTag
    |   emptyTag
    ;
// DEBUG NOTES:
// Running with CharData in annotation:
//   - Including annotation in metadata causes a fatal non-LL(*) recursion
//     (resolve by left factoring, syntactic predicates, etc.),
//     but it does not if annotations cannot themselves contain metadata.
//   - Metadata can be included in the Annotation tag if Empty Tag does not exist.
//   - An empty tag + metadata can clearly be matched in more than one way - we get
//     multiple alternative matches (startTag and emptyTag), e.g. for:
//       OPEN_START_TAG NAME OPEN_START_TAG NAME
//       [FIRST[SECOND
// Putting all else (still with CharData in annotation) back as it should be
// (metadata included): now having emptyTag causes non-LL(*) recursive rule
// invocations, and we get multiple alternative matches.
// backtrack = true seems to solve all these issues.
// Start tag: openStartTag, the tag content, then closeStartTag.
startTag
    :   openStartTag
        tagContent
        closeStartTag
    ;

// End tag: openEndTag, the tag content, then closeEndTag.
endTag
    :   openEndTag
        tagContent
        closeEndTag
    ;

// Empty tag: opens like a start tag but closes like an end tag.
emptyTag
    :   openStartTag
        tagContent
        closeEndTag
    ;

// Shared tag interior: optional name, optional "Eq identifier", then metadata.
// Should tag name be optional? Not quite as in original lmnl spec.
fragment tagContent
    :   tagName?
        ( Eq tagIdentifier )?
        metaData*
    ;
// Atom: a named atom, a unicode (code point) atom or a short-hand atom,
// tried in this order.
fragment atom
    :   namedAtom
    |   unicodeAtom
    |   shortAtom
    ;

// Named atom: optional atom name plus metadata between the atom delimiters.
// Should atom name be optional?
namedAtom
    :   openAtom
        atomName?
        metaData*
        closeAtom
    ;

// Unicode atom: NUMBER_SIGN, HEX_INDICATOR and a code point between the atom delimiters.
unicodeAtom
    :   openAtom
        NUMBER_SIGN HEX_INDICATOR codePoint
        closeAtom
    ;

// Short-hand atom: NUMBER_SIGN and a short-hand keyword between the atom delimiters.
shortAtom
    :   openAtom
        NUMBER_SIGN shortHand
        closeAtom
    ;
// Metadata item: an annotation or a misc item.
fragment metaData
    :   annotation
    |   misc
    ;

// Annotation: either a start tag / content / end tag triple, or an empty annotation tag.
annotation
    :   (   annotationStartTag
            content                 // change Content to CharData
            annotationEndTag
        )
    |   emptyAnnotationTag          // causes unexpected end of AST tree - can match the same info somehow?
    ;
// Annotation start tag: openStartTag, annotation name, metadata, closeStartTag.
// MetaData doesn't need S - it includes the possibility.
annotationStartTag
    :   openStartTag
        annotationName
        metaData*
        closeStartTag
    ;

// Annotation end tag: the abbreviated form is tried before the full form.
annotationEndTag
    :   abbreviatedAnnotationEndTag
    |   fullAnnotationEndTag
    ;

// Abbreviated annotation end tag: openEndTag immediately followed by closeEndTag.
fragment abbreviatedAnnotationEndTag
    :   openEndTag
        closeEndTag
    ;

// Full annotation end tag: repeats the annotation name and may carry metadata.
// (Removed S? before end tag.)
fragment fullAnnotationEndTag
    :   openEndTag
        annotationName
        metaData*
        closeEndTag
    ;

// Empty annotation tag: opens like a start tag, closes like an end tag.
// (Removed S? before end tag.)
fragment emptyAnnotationTag
    :   openStartTag
        annotationName
        metaData*
        closeEndTag
    ;
// ---------------------------------------------------------------------------------------
// Token-level PARSER rules
// Thin parser-rule wrappers around the basic tokens recognised in a stream of text.
// (The lexer rules themselves are defined in the fragments section further below.)
// ---------------------------------------------------------------------------------------
// charData is any run of text that isn't a reserved character.
// This matches all document text between markup.
// Name of a tag: a (possibly prefixed) qualified name.
tagName
    :   qName
    ;

// Identifier following Eq inside a tag: a plain NAME.
tagIdentifier
    :   NAME
    ;

// Tag delimiters, each wrapping the lexer token of the same name.
fragment openStartTag
    :   OPEN_START_TAG
    ;

fragment closeStartTag
    :   CLOSE_START_TAG
    ;

fragment openEndTag
    :   OPEN_END_TAG
    ;

fragment closeEndTag
    :   CLOSE_END_TAG
    ;

// Name of an atom: a (possibly prefixed) qualified name.
atomName
    :   qName
    ;

// Atom opener: two consecutive OPEN_END_TAG tokens.
fragment openAtom
    :   OPEN_END_TAG OPEN_END_TAG
    ;

// Atom closer: two consecutive CLOSE_START_TAG tokens.
fragment closeAtom
    :   CLOSE_START_TAG CLOSE_START_TAG
    ;

// Name of an annotation: a (possibly prefixed) qualified name.
annotationName
    :   qName
    ;
// Qualified name: an optional "prefix COLON" followed by a local part.
fragment qName
    :   ( prefix COLON )?
        localPart
    ;

// Namespace prefix: a plain NAME.
prefix
    :   NAME
    ;

// Local part of a qualified name: a plain NAME.
localPart
    :   NAME
    ;

// Code point of a unicode atom: one or more hex digits.
codePoint
    :   HEX_DIGIT+
    ;

// Short-hand atom keywords.
shortHand
    :   'lsqb'
    |   'rsqb'
    |   'lcub'
    |   'rcub'
    ;

// Placeholder: currently matches only the literal text 'uriLiteral'.
// Need definition of URI literal.
uriLiteral
    :   'uriLiteral'
    ;
// ---------------------------------------------------------------------------------------
// Fragments - named patterns or literals we can re-use in higher levels
// (doesn't create a token in the lexer)
// ---------------------------------------------------------------------------------------
// Character data: one or more characters that are not reserved characters.
fragment CHARDATA
    :   ( ~RESERVED_CHARACTERS )+
    ;

// Reserved characters: the characters that open a start tag or an end tag.
fragment RESERVED_CHARACTERS
    :   OPEN_START_TAG
    |   OPEN_END_TAG
    ;
// NOTE(review): every lexer rule in this file is declared as a fragment, which
// (per the section header) does not create a token in the lexer - confirm that
// the parser rules referencing these names actually receive such tokens.

// Name: a name-start character followed by any number of name characters.
fragment NAME
    :   NAME_START_CHAR NAME_CHAR*
    ;

// Version number: one or more alphanumerics, full stops, underscores,
// hyphens or colons.
fragment VERSION_NUM
    :   (   ALPHA_NUM_CHAR
        |   FULL_STOP
        |   UNDERSCORE
        |   HYPHEN
        |   COLON
        )+
    ;

// Encoding name: a letter followed by any number of alphanumerics,
// full stops, underscores or hyphens.
fragment ENCODING_NAME
    :   MIXED_CASE_CHAR
        (   ALPHA_NUM_CHAR
        |   FULL_STOP
        |   UNDERSCORE
        |   HYPHEN
        )*
    ;
// Comment opener.
fragment START_COMMENT
    :   '[!--'
    ;

// Single character permitted inside a comment.
fragment COMMENT_CHAR
    :   TAB
    |   NEW_LINE
    |   CARRIAGE_RETURN
    |   PRINTABLE_ASCII
    |   ELLIPSIS
    |   '\u00A0' .. '\uD7FF'
    |   '\uE000' .. '\uFFFD'
    ;
    // |   '\u10000' .. '\u10FFFF'
    // Definition of char includes reserved characters. [ and {
    // Is this a problem? // why not just a range from SPACE to \uD7FF?

// Comment closer.
fragment END_COMMENT
    :   '--]'
    ;
// Tag delimiters, expressed in terms of the bracket fragments:
// a start tag runs from '[' to '}', an end tag from '{' to ']'.
fragment OPEN_START_TAG
    :   LEFT_SQUARE_BRACKET
    ;

fragment CLOSE_START_TAG
    :   RIGHT_CURLY_BRACKET
    ;

fragment OPEN_END_TAG
    :   LEFT_CURLY_BRACKET
    ;

fragment CLOSE_END_TAG
    :   RIGHT_SQUARE_BRACKET
    ;

// Equals sign with optional whitespace on either side.
fragment Eq
    :   WS? EQUAL WS?
    ;
// Whitespace: one or more spaces, tabs, line feeds or carriage returns.
// CARRIAGE_RETURN is included so that input with CRLF line endings is treated
// as whitespace as well (it was previously only accepted inside comments).
fragment WS : ( SPACE | TAB | NEW_LINE | CARRIAGE_RETURN )+ ;
// Character allowed after the first character of a name: any name-start
// character, plus digits, hyphen, full stop, middle dot and two extra ranges.
fragment NAME_CHAR
    :   NAME_START_CHAR
    |   DIGIT
    |   HYPHEN
    |   FULL_STOP
    |   MIDDLE_DOT
    |   '\u0300' .. '\u036F'
    |   '\u203F' .. '\u2040'
    ;

// Character allowed at the start of a name: letters, underscore and the
// listed non-ASCII ranges.
fragment NAME_START_CHAR
    :   UPPER_CASE_CHAR
    |   LOWER_CASE_CHAR
    |   UNDERSCORE
    |   '\u00C0' .. '\u02FF'
    |   '\u0370' .. '\u037D'    // beginning of greek & coptic block?
    |   '\u037F' .. '\u1FFF'
    |   '\u200C' .. '\u200D'
    |   '\u2070' .. '\u218F'
    |   '\u2C00' .. '\u2FEF'
    |   '\u3001' .. '\uD7FF'
    ;
    // |   '\uF900' .. '\uEFFFF'   // Don't think can handle beyond four hexchars
// Letter or digit.
fragment ALPHA_NUM_CHAR
    :   MIXED_CASE_CHAR
    |   DIGIT
    ;

// Upper- or lower-case ASCII letter.
fragment MIXED_CASE_CHAR
    :   UPPER_CASE_CHAR
    |   LOWER_CASE_CHAR
    ;

// Printable ASCII, from space to tilde.
// Comment: doesn't like using named fragments for ranges, for some reason.
fragment PRINTABLE_ASCII
    :   ' ' .. '~'
    ;

// Hexadecimal digit, either case.
fragment HEX_DIGIT
    :   DIGIT
    |   'A'..'F'
    |   'a'..'f'
    ;

// Marker introducing a hexadecimal code point.
fragment HEX_INDICATOR
    :   'x'
    ;

// Basic ASCII character ranges.
fragment DIGIT
    :   '0'..'9'
    ;

fragment UPPER_CASE_CHAR
    :   'A'..'Z'
    ;

fragment LOWER_CASE_CHAR
    :   'a'..'z'
    ;
// Single-character literals, one fragment per character.
fragment SPACE                  :   ' ' ;

// Brackets.
fragment LEFT_SQUARE_BRACKET    :   '[' ;
fragment RIGHT_SQUARE_BRACKET   :   ']' ;
fragment LEFT_CURLY_BRACKET     :   '{' ;
fragment RIGHT_CURLY_BRACKET    :   '}' ;

fragment NUMBER_SIGN            :   '#' ;
fragment TILDE                  :   '~' ;

// NOTE(review): U+008E is not the Unicode horizontal ellipsis (U+2026) -
// confirm which code point is actually intended here.
fragment ELLIPSIS               :   '\u008E' ;
fragment MIDDLE_DOT             :   '\u00B7' ;

// Control characters.
fragment TAB                    :   '\u0009' ;
fragment NEW_LINE               :   '\u000A' ;
fragment CARRIAGE_RETURN        :   '\u000D' ;

// Punctuation.
fragment FULL_STOP              :   '.' ;
fragment UNDERSCORE             :   '_' ;
fragment HYPHEN                 :   '-' ;
fragment COLON                  :   ':' ;
fragment QUOTE                  :   '"' ;
fragment EQUAL                  :   '=' ;
