From 8371b28de5b64e54952762a5979839a802e94440 Mon Sep 17 00:00:00 2001 From: Nadrieril Date: Tue, 5 Mar 2019 02:23:56 +0100 Subject: Include modified abnf in repo --- dhall_parser/src/dhall.abnf | 698 ++++++++++++++++++++++++++++++++++++++++++++ 1 file changed, 698 insertions(+) create mode 100644 dhall_parser/src/dhall.abnf (limited to 'dhall_parser/src') diff --git a/dhall_parser/src/dhall.abnf b/dhall_parser/src/dhall.abnf new file mode 100644 index 0000000..391741f --- /dev/null +++ b/dhall_parser/src/dhall.abnf @@ -0,0 +1,698 @@ +; ABNF syntax based on RFC 5234 +; +; The character encoding for Dhall is UTF-8 +; +; Some notes on implementing this grammar: +; +; First, do not use a lexer to tokenize the file before parsing. Instead, treat +; the individual characters of the file as the tokens to feed into the parser. +; You should not use a lexer because Dhall's grammar supports two features which +; cannot be correctly supported by a lexer: +; +; * String interpolation (i.e. "foo ${Natural/toInteger bar} baz") +; * Nested block comments (i.e. "{- foo {- bar -} baz -}") +; +; Second, this grammar assumes that your parser can backtrack and/or try +; multiple parses simultaneously. For example, consider this expression: +; +; List ./MyType +; +; A parser might first try to parse the period as the beginning of a field +; selector, only to realize immediately afterwards that `/MyType` is not a valid +; name for a field. A conforming parser must backtrack so that the expression +; `./MyType` can instead be correctly interpreted as a relative path +; +; Third, if there are multiple valid parses then prefer the first parse +; according to the ordering of alternatives. That is, the order of evaluation +; of the alternatives is left-to-right. +; +; For example, the grammar for single quoted string literals is: +; +; single-quote-continue = +; "'''" single-quote-continue +; / "${" complete-expression "}" single-quote-continue +; / "''${" single-quote-continue +; / "''" +; / %x20-10FFFF single-quote-continue +; / tab single-quote-continue +; / end-of-line single-quote-continue +; +; single-quote-literal = "''" single-quote-continue +; +; ... which permits valid parses for the following code: +; +; "''''''''''''''''" +; +; If you tried to parse all alternatives then there are at least two valid +; interpretations for the above code: +; +; * A single quoted literal with four escape sequences of the form "'''" +; * i.e. "''" followed by "'''" four times in a row followed by "''" +; * Four empty single quoted literals +; * i.e. "''''" four times in a row +; +; The correct interpretation is the first one because parsing the escape +; sequence "'''" takes precedence over parsing the termination sequence "''", +; according to the order of the alternatives in the `single-quote-continue` +; rule. +; +; Some parsing libraries do not backtrack by default but allow the user to +; selectively backtrack in certain parts of the grammar. Usually parsing +; libraries do this to improve efficiency and error messages. Dhall's grammar +; takes that into account by minimizing the number of rules that require the +; parser to backtrack and comments below will highlight where you need to +; explicitly backtrack +; +; Specifically, if you see an uninterrupted literal in a grammar rule such as: +; +; "->" +; +; ... or: +; +; %x66.6f.72.61.6c.6c +; +; ... then that string literal is parsed as a single unit, meaning that you +; should backtrack if you parse only part of the literal +; +; In all other cases you can assume that you do not need to backtrack unless +; there is a comment explicitly asking you to backtrack +; +; When parsing a repeated construct, prefer alternatives that parse as many +; repetitions as possible. On in other words: +; +; [a] = a / "" +; +; a* = a* a / "" +; +; Note that the latter rule also specifies that repetition produces +; left-associated expressions. For example, function application is +; left-associative and all operators are left-associative when they are not +; parenthesized. +; +; Additionally, try alternatives in an order that minimizes backtracking +; according to the following rule: +; +; (a / b) (c / d) = a c / a d / b c / b d + +; NOTE: There are many line endings in the wild +; +; See: https://en.wikipedia.org/wiki/Newline +; +; For simplicity this supports Unix and Windows line-endings, which are the most +; common +end-of-line = + %x0A ; "\n" + / %x0D.0A ; "\r\n" + +tab = %x09 ; "\t" + +block-comment = "{-" block-comment-continue + +block-comment-chunk = + block-comment + / %x20-10FFFF + / tab + / end-of-line + +block-comment-continue = "-}" / block-comment-chunk block-comment-continue + +not-end-of-line = %x20-10FFFF / tab + +; NOTE: Slightly different from Haskell-style single-line comments because this +; does not require a space after the dashes +line-comment = "--" *not-end-of-line end-of-line + +whitespace-chunk = + " " + / tab + / end-of-line + / line-comment + / block-comment + +whitespace = *whitespace-chunk + +nonempty-whitespace = 1*whitespace-chunk + +; Uppercase or lowercase ASCII letter +ALPHA = %x41-5A / %x61-7A + +; ASCII digit +DIGIT = %x30-39 ; 0-9 + +HEXDIG = DIGIT / "A" / "B" / "C" / "D" / "E" / "F" + +; A simple label cannot be one of the following reserved keywords: +; +; * if +; * then +; * else +; * let +; * in +; * as +; * using +; * merge +simple-label-first-char = ALPHA / "_" +simple-label-next-char = ALPHA / DIGIT / "-" / "/" / "_" +simple-label = simple-label-first-char *simple-label-next-char + +quoted-label = 1*(ALPHA / DIGIT / "-" / "/" / "_" / ":" / "." / "$") + +; NOTE: Dhall does not support Unicode labels, mainly to minimize the potential +; for code obfuscation +label-raw = ("`" quoted-label "`" / simple-label) +label = label-raw whitespace + +; Dhall's double-quoted strings are equivalent to JSON strings except with +; support for string interpolation (and escaping string interpolation) +; +; Dhall uses almost the same escaping rules as JSON (RFC7159) with one +; exception: Dhall adds a new `\$` escape sequence for dollar signs. This +; additional escape sequences lets you escape string interpolation by writing +; `\${` +; +; > The representation of strings is similar to conventions used in the C +; > family of programming languages. A string begins and ends with +; > quotation marks. All Unicode characters may be placed within the +; > quotation marks, except for the characters that must be escaped: +; > quotation mark, reverse solidus, and the control characters (U+0000 +; > through U+001F). +; > +; > Any character may be escaped. If the character is in the Basic +; > Multilingual Plane (U+0000 through U+FFFF), then it may be +; > represented as a six-character sequence: a reverse solidus, followed +; > by the lowercase letter u, followed by four hexadecimal digits that +; > encode the character's code point. The hexadecimal letters A though +; > F can be upper or lower case. So, for example, a string containing +; > only a single reverse solidus character may be represented as +; > "\u005C". +; > +; > Alternatively, there are two-character sequence escape +; > representations of some popular characters. So, for example, a +; > string containing only a single reverse solidus character may be +; > represented more compactly as "\\". +; > +; > To escape an extended character that is not in the Basic Multilingual +; > Plane, the character is represented as a 12-character sequence, +; > encoding the UTF-16 surrogate pair. So, for example, a string +; > containing only the G clef character (U+1D11E) may be represented as +; > "\uD834\uDD1E". +double-quote-chunk = + interpolation + ; '\' + / %x5C double-quote-escaped + / double-quote-char + +double-quote-escaped = + %x22 ; '"' quotation mark U+0022 + / %x24 ; '$' dollar sign U+0024 + / %x5C ; '\' reverse solidus U+005C + / %x2F ; '/' solidus U+002F + / %x62 ; 'b' backspace U+0008 + / %x66 ; 'f' form feed U+000C + / %x6E ; 'n' line feed U+000A + / %x72 ; 'r' carriage return U+000D + / %x74 ; 't' tab U+0009 + / %x75 4HEXDIG ; 'uXXXX' U+XXXX + +; Printable characters except double quote and backslash +double-quote-char = + %x20-21 + ; %x22 = '"' + / %x23-5B + ; %x5C = "\" + / %x5D-10FFFF + +double-quote-literal = %x22 *double-quote-chunk %x22 + +; NOTE: The only way to end a single-quote string literal with a single quote is +; to either interpolate the single quote, like this: +; +; ''ABC${"'"}'' +; +; ... or concatenate another string, like this: +; +; ''ABC'' ++ "'" +; +; If you try to end the string literal with a single quote then you get "'''", +; which is interpreted as an escaped pair of single quotes +single-quote-continue = + interpolation single-quote-continue + / escaped-quote-pair single-quote-continue + / escaped-interpolation single-quote-continue + / single-quote-char single-quote-continue + / "''" ; End of text literal + +; Escape two single quotes (i.e. replace this sequence with "''") +escaped-quote-pair = "'''" + +; Escape interpolation (i.e. replace this sequence with "${") +escaped-interpolation = "''${" + +single-quote-char = + %x20-10FFFF + / tab + / end-of-line + +single-quote-literal = "''" end-of-line single-quote-continue + +; Interpolation +interpolation = "${" complete-expression "}" + +text-literal-raw = (double-quote-literal / single-quote-literal) + +; RFC 5234 interprets string literals as case-insensitive and recommends using +; hex instead for case-sensitive strings +; +; If you don't feel like reading hex, these are all the same as the rule name, +; except without the '-raw' ending. +if-raw = %x69.66 +then-raw = %x74.68.65.6e +else-raw = %x65.6c.73.65 +let-raw = %x6c.65.74 +in-raw = %x69.6e +as-raw = %x61.73 +using-raw = %x75.73.69.6e.67 +merge-raw = %x6d.65.72.67.65 +missing-raw = %x6d.69.73.73.69.6e.67 +Optional-raw = %x4f.70.74.69.6f.6e.61.6c +Text-raw = %x54.65.78.74 +List-raw = %x4c.69.73.74 +Infinity-raw = %x49.6e.66.69.6e.69.74.79 + +; Whitespaced rules for reserved words, to be used when matching expressions +if = if-raw nonempty-whitespace +then = then-raw nonempty-whitespace +else = else-raw nonempty-whitespace +let = let-raw nonempty-whitespace +in = in-raw nonempty-whitespace +as = as-raw nonempty-whitespace +using = using-raw nonempty-whitespace +merge = merge-raw nonempty-whitespace +Optional = Optional-raw whitespace +Text = Text-raw whitespace +List = List-raw whitespace + +equal = "=" whitespace +or = "||" whitespace +plus = "+" nonempty-whitespace ; To disambiguate `f +2` +text-append = "++" whitespace +list-append = "#" nonempty-whitespace ; To disambiguate `http://a/a#a` +and = "&&" whitespace +times = "*" whitespace +double-equal = "==" whitespace +not-equal = "!=" whitespace +dot = "." whitespace +bar = "|" whitespace +comma = "," whitespace +at = "@" whitespace +colon = ":" nonempty-whitespace ; To disambiguate `env:VARIABLE` from type annotations +import-alt = "?" nonempty-whitespace ; To disambiguate `http://a/a?a` +open-parens = "(" whitespace +close-parens-raw = ")" +close-parens = ")" whitespace +open-brace = "{" whitespace +close-brace-raw = "}" +close-brace = "}" whitespace +open-bracket = "[" whitespace +close-bracket-raw = "]" +close-bracket = "]" whitespace +open-angle = "<" whitespace +close-angle-raw = ">" +close-angle = ">" whitespace + +combine = ( %x2227 / "/\" ) whitespace +combine-types = ( %x2A53 / "//\\" ) whitespace +prefer = ( %x2AFD / "//" ) whitespace +lambda = ( %x3BB / "\" ) whitespace +forall = ( %x2200 / %x66.6f.72.61.6c.6c ) whitespace +arrow = ( %x2192 / "->" ) whitespace + +exponent = "e" [ "+" / "-" ] 1*DIGIT + +double-literal-raw = [ "+" / "-" ] 1*DIGIT ( "." 1*DIGIT [ exponent ] / exponent) + +natural-literal-raw = 1*DIGIT + +integer-literal-raw = ( "+" / "-" ) natural-literal-raw + +identifier-raw = label-raw [ whitespace at natural-literal-raw ] +identifier = identifier-raw whitespace + +; Printable characters other than " ()[]{}<>/\," +; +; Excluding those characters ensures that paths don't have to end with trailing +; whitespace most of the time +path-character = + ; %x20 = " " + %x21 + ; %x22 = "\"" + ; %x23 = "#" + / %x24-27 + ; %x28 = "(" + ; %x29 = ")" + / %x2A-2B + ; %x2C = "," + / %x2D-2E + ; %x2F = "/" + / %x30-3B + ; %x3C = "<" + / %x3D + ; %x3E = ">" + ; %x3F = "?" + / %x40-5A + ; %x5B = "[" + ; %x5C = "\" + ; %x5D = "]" + / %x5E-7A + ; %x7B = "{" + / %x7C + ; %x7D = "}" + / %x7E + +quoted-path-character = + %x20-21 + ; %x22 = "\"" + / %x23-2E + ; %x2F = "/" + / %x30-10FFFF + + +path-component = "/" ( 1*path-character / %x22 1*quoted-path-character %x22 ) + +path = 1*path-component + +local-raw = + ".." path ; Relative path + / "." path ; Relative path + / "~" path ; Home-anchored path + ; NOTE: Backtrack if parsing this alternative fails + ; + ; This is because the first character of this alternative will be "/", but + ; if the second character is "/" or "\" then this should have been parsed + ; as an operator instead of a path + / path ; Absolute path + +; `http[s]` URI grammar based on RFC7230 and RFC 3986 with some differences +; noted below + +scheme = %x68.74.74.70 [ %x73 ] ; "http" [ "s" ] + +; NOTE: This does not match the official grammar for a URI. Specifically, this +; replaces `path-abempty` with `path` +http-raw = scheme "://" authority path [ "?" query ] [ "#" fragment ] + +; NOTE: Backtrack if parsing the optional user info prefix fails +authority = [ userinfo "@" ] host [ ":" port ] + +userinfo = *( unreserved / pct-encoded / sub-delims / ":" ) + +host = IP-literal / IPv4address / reg-name + +port = *DIGIT + +IP-literal = "[" ( IPv6address / IPvFuture ) "]" + +IPvFuture = "v" 1*HEXDIG "." 1*( unreserved / sub-delims / ":" ) + +; NOTE: Backtrack when parsing each alternative +IPv6address = 6( h16 ":" ) ls32 + / "::" 5( h16 ":" ) ls32 + / [ h16 ] "::" 4( h16 ":" ) ls32 + / [ *1( h16 ":" ) h16 ] "::" 3( h16 ":" ) ls32 + / [ *2( h16 ":" ) h16 ] "::" 2( h16 ":" ) ls32 + / [ *3( h16 ":" ) h16 ] "::" h16 ":" ls32 + / [ *4( h16 ":" ) h16 ] "::" ls32 + / [ *5( h16 ":" ) h16 ] "::" h16 + / [ *6( h16 ":" ) h16 ] "::" + +h16 = 1*4HEXDIG + +ls32 = ( h16 ":" h16 ) / IPv4address + +IPv4address = dec-octet "." dec-octet "." dec-octet "." dec-octet + +; NOTE: Backtrack when parsing these alternatives and try them in reverse order +dec-octet = DIGIT ; 0-9 + / %x31-39 DIGIT ; 10-99 + / "1" 2DIGIT ; 100-199 + / "2" %x30-34 DIGIT ; 200-249 + / "25" %x30-35 ; 250-255 + +reg-name = *( unreserved / pct-encoded / sub-delims ) + +pchar = unreserved / pct-encoded / sub-delims / ":" / "@" + +query = *( pchar / "/" / "?" ) + +fragment = *( pchar / "/" / "?" ) + +pct-encoded = "%" HEXDIG HEXDIG + +unreserved = ALPHA / DIGIT / "-" / "." / "_" / "~" + +sub-delims = "!" / "$" / "&" / "'" / "(" / ")" / "*" / "+" / "," / ";" / "=" + +http = + http-raw + [ whitespace using (import-hashed-raw / open-parens import-hashed-raw whitespace close-parens-raw) ] + +; Dhall supports unquoted environment variables that are Bash-compliant or +; quoted environment variables that are POSIX-compliant +env-raw = "env:" + ( bash-environment-variable + / %x22 posix-environment-variable %x22 + ) + +; Bash supports a restricted subset of POSIX environment variables. From the +; Bash `man` page, an environment variable name is: +; +; > A word consisting only of alphanumeric characters and under-scores, and +; > beginning with an alphabetic character or an under-score +bash-environment-variable = (ALPHA / "_") *(ALPHA / DIGIT / "_") + +; The POSIX standard is significantly more flexible about legal environment +; variable names, which can contain alerts (i.e. '\a'), whitespace, or +; punctuation, for example. The POSIX standard says about environment variable +; names: +; +; > The value of an environment variable is a string of characters. For a +; > C-language program, an array of strings called the environment shall be made +; > available when a process begins. The array is pointed to by the external +; > variable environ, which is defined as: +; > +; > extern char **environ; +; > +; > These strings have the form name=value; names shall not contain the +; > character '='. For values to be portable across systems conforming to IEEE +; > Std 1003.1-2001, the value shall be composed of characters from the portable +; > character set (except NUL and as indicated below). +; +; Note that the standard does not explicitly state that the name must have at +; least one character, but `env` does not appear to support this and `env` +; claims to be POSIX-compliant. To be safe, Dhall requires at least one +; character like `env` +posix-environment-variable = 1*posix-environment-variable-character + +; These are all the characters from the POSIX Portable Character Set except for +; '\0' (NUL) and '='. Note that the POSIX standard does not explicitly state +; that environment variable names cannot have NUL. However, this is implicit +; in the fact that environment variables are passed to the program as +; NUL-terminated `name=value` strings, which implies that the `name` portion of +; the string cannot have NUL characters +posix-environment-variable-character = + %x5C ; '\' Beginning of escape sequence + ( %x22 ; '"' quotation mark U+0022 + / %x5C ; '\' reverse solidus U+005C + / %x61 ; 'a' alert U+0007 + / %x62 ; 'b' backspace U+0008 + / %x66 ; 'f' form feed U+000C + / %x6E ; 'n' line feed U+000A + / %x72 ; 'r' carriage return U+000D + / %x74 ; 't' tab U+0009 + / %x76 ; 'v' vertical tab U+000B + ) + ; Printable characters except double quote, backslash and equals + / %x20-21 + ; %x22 = '"' + / %x23-3C + ; %x3D = '=' + / %x3E-5B + ; %x5C = "\" + / %x5D-7E + +import-type-raw = missing-raw / local-raw / http / env-raw + +hash-raw = %x73.68.61.32.35.36.3a 64HEXDIG ; "sha256:XXX...XXX" + +import-hashed-raw = import-type-raw [ whitespace hash-raw ] + +; "http://example.com" +; "./foo/bar" +; "env:FOO" +import-raw = import-hashed-raw [ whitespace as Text-raw ] + +; NOTE: Every rule past this point should only reference rules that end with +; whitespace. This ensures consistent handling of whitespace in the absence of +; a separate lexing step. +; The exception is the rules ending in -raw, which should _not_ end in whitespace. +; This is important to avoid the need for sequential backtracking in application-expression. + +; An arbitrary dhall expression. Only use in a context where parentheses or +; keywords prevent possible ambiguity. See also atomic-expression. +expression = + lambda-expression + / ifthenelse-expression + / let-expression + / forall-expression + ; NOTE: Backtrack if parsing this alternative fails + / arrow-expression + / merge-expression + ; NOTE: Backtrack if parsing this alternative fails since we can't tell + ; from the opening bracket whether or not this will be an empty list or + ; a non-empty list + / empty-list-or-optional + / annotated-expression + +; "\(x : a) -> b" +lambda-expression = lambda open-parens label colon expression close-parens arrow expression + +; "if a then b else c" +ifthenelse-expression = if expression then expression else expression + +; "let x : t = e1 in e2" +; "let x = e1 in e2" +; "let x = e1 let y = e2 in e3" +let-expression = 1*let-binding in expression +let-binding = let label [ colon expression ] equal expression + +; "forall (x : a) -> b" +forall-expression = forall open-parens label colon expression close-parens arrow expression + +; "a -> b" +arrow-expression = operator-expression arrow expression + +; "merge e1 e2 : t" +; "merge e1 e2" +merge-expression = merge atomic-expression atomic-expression [ colon application-expression ] + +; "[] : List t" +; "[] : Optional t" +; "[x] : Optional t" +empty-list-or-optional = open-bracket (empty-collection / non-empty-optional) +empty-collection = close-bracket colon (List / Optional) atomic-expression +non-empty-optional = expression close-bracket colon Optional atomic-expression + +; "x : t" +annotated-expression = operator-expression [ colon expression ] + + +operator-expression = import-alt-expression + +import-alt-expression = or-expression *(import-alt or-expression) +or-expression = plus-expression *(or plus-expression ) +plus-expression = text-append-expression *(plus text-append-expression ) +text-append-expression = list-append-expression *(text-append list-append-expression ) +list-append-expression = and-expression *(list-append and-expression ) +and-expression = combine-expression *(and combine-expression ) +combine-expression = prefer-expression *(combine prefer-expression ) +prefer-expression = combine-types-expression *(prefer combine-types-expression) +combine-types-expression = times-expression *(combine-types times-expression ) +times-expression = equal-expression *(times equal-expression ) +equal-expression = not-equal-expression *(double-equal not-equal-expression ) +not-equal-expression = application-expression *(not-equal application-expression ) + +; Import expressions need to be separated by some whitespace, otherwise there +; would be ambiguity: `./ab` could be interpreted as "import the file `./ab`", +; or "apply the import `./a` to label `b`" +; The -raw handling is important for greedy parsers, that can't do sequential backtracking. +application-expression = atomic-expression-raw *(nonempty-whitespace atomic-expression-raw) whitespace + +; An expression that does not need to be surrounded by parentheses to disambiguate +atomic-expression = atomic-expression-raw whitespace +atomic-expression-raw = + import-raw + / selector-expression-raw + +; `record.field` extracts one field of a record +; `record.{ field0, field1, field2 }` projects out several fields of a record +; +; NOTE: Backtrack when parsing the `*(dot ...)`. The reason why is that you +; can't tell from parsing just the period whether "foo." will become "foo.bar" +; (i.e. accessing field `bar` of the record `foo`) or `foo./bar` (i.e. applying +; the function `foo` to the relative path `./bar`) +selector-expression-raw = primitive-expression-raw *(whitespace dot selector-raw) + +selector-raw = label-raw / labels-raw + +labels-raw = open-brace [ label *(comma label) ] close-brace-raw + + +primitive-expression-raw = + literal-expression-raw + / open-brace record-type-or-literal close-brace-raw + / open-angle union-type-or-literal close-angle-raw + / non-empty-list-literal-raw + / parenthesized-expression-raw + +; NOTE: Backtrack when parsing the first three alternatives (i.e. the numeric +; literals). This is because they share leading characters in common +literal-expression-raw = + ; "2.0" + double-literal-raw + + ; "2" + / natural-literal-raw + + ; "+2" + / integer-literal-raw + + ; "-Infinity" + / "-" Infinity-raw + + ; '"ABC"' + / text-literal-raw + + ; "x" + ; "x@2" + / identifier-raw + +; "{ foo = 1 , bar = True }" +; "{ foo : Integer, bar : Bool }" +record-type-or-literal = + empty-record-literal + / non-empty-record-type-or-literal + / empty-record-type +empty-record-literal = equal +empty-record-type = "" +non-empty-record-type-or-literal = + label (non-empty-record-literal / non-empty-record-type) +non-empty-record-type = colon expression *(comma record-type-entry) +record-type-entry = label colon expression +non-empty-record-literal = equal expression *(comma record-literal-entry) +record-literal-entry = label equal expression + +; "< Foo : Integer | Bar : Bool >" +; "< Foo : Integer | Bar = True >" +union-type-or-literal = + non-empty-union-type-or-literal + / empty-union-type +empty-union-type = "" +non-empty-union-type-or-literal = + label + ( equal expression union-type-entries + / colon expression [ bar non-empty-union-type-or-literal ] + ) +union-type-entries = *(bar union-type-entry) +union-type-entry = label colon expression + +; "[1, 2, 3]" +; `empty-list-or-optional` handles empty lists +non-empty-list-literal-raw = open-bracket expression *(comma expression) close-bracket-raw + +; "( e )" +parenthesized-expression-raw = open-parens expression close-parens-raw + + +; All expressions end with trailing whitespace. This just adds a final +; whitespace prefix for the top-level of the program +complete-expression = whitespace expression -- cgit v1.2.3