From 686cce9504e5390e047dc715b192c89f2c375ab0 Mon Sep 17 00:00:00 2001 From: Nadrieril Date: Sun, 7 Apr 2019 19:28:48 +0200 Subject: Use upstream grammar Closes #43 --- dhall_parser/src/dhall.abnf | 705 +------------------------------------------- 1 file changed, 1 insertion(+), 704 deletions(-) mode change 100644 => 120000 dhall_parser/src/dhall.abnf diff --git a/dhall_parser/src/dhall.abnf b/dhall_parser/src/dhall.abnf deleted file mode 100644 index 847da02..0000000 --- a/dhall_parser/src/dhall.abnf +++ /dev/null @@ -1,704 +0,0 @@ -; ABNF syntax based on RFC 5234 -; -; The character encoding for Dhall is UTF-8 -; -; Some notes on implementing this grammar: -; -; First, do not use a lexer to tokenize the file before parsing. Instead, treat -; the individual characters of the file as the tokens to feed into the parser. -; You should not use a lexer because Dhall's grammar supports two features which -; cannot be correctly supported by a lexer: -; -; * String interpolation (i.e. "foo ${Natural/toInteger bar} baz") -; * Nested block comments (i.e. "{- foo {- bar -} baz -}") -; -; Second, this grammar assumes that your parser can backtrack and/or try -; multiple parses simultaneously. For example, consider this expression: -; -; List ./MyType -; -; A parser might first try to parse the period as the beginning of a field -; selector, only to realize immediately afterwards that `/MyType` is not a valid -; name for a field. A conforming parser must backtrack so that the expression -; `./MyType` can instead be correctly interpreted as a relative path -; -; Third, if there are multiple valid parses then prefer the first parse -; according to the ordering of alternatives. That is, the order of evaluation -; of the alternatives is left-to-right. -; -; For example, the grammar for single quoted string literals is: -; -; single-quote-continue = -; "'''" single-quote-continue -; / "${" complete-expression "}" single-quote-continue -; / "''${" single-quote-continue -; / "''" -; / %x20-10FFFF single-quote-continue -; / tab single-quote-continue -; / end-of-line single-quote-continue -; -; single-quote-literal = "''" single-quote-continue -; -; ... which permits valid parses for the following code: -; -; "''''''''''''''''" -; -; If you tried to parse all alternatives then there are at least two valid -; interpretations for the above code: -; -; * A single quoted literal with four escape sequences of the form "'''" -; * i.e. "''" followed by "'''" four times in a row followed by "''" -; * Four empty single quoted literals -; * i.e. "''''" four times in a row -; -; The correct interpretation is the first one because parsing the escape -; sequence "'''" takes precedence over parsing the termination sequence "''", -; according to the order of the alternatives in the `single-quote-continue` -; rule. -; -; Some parsing libraries do not backtrack by default but allow the user to -; selectively backtrack in certain parts of the grammar. Usually parsing -; libraries do this to improve efficiency and error messages. Dhall's grammar -; takes that into account by minimizing the number of rules that require the -; parser to backtrack and comments below will highlight where you need to -; explicitly backtrack -; -; Specifically, if you see an uninterrupted literal in a grammar rule such as: -; -; "->" -; -; ... or: -; -; %x66.6f.72.61.6c.6c -; -; ... then that string literal is parsed as a single unit, meaning that you -; should backtrack if you parse only part of the literal -; -; In all other cases you can assume that you do not need to backtrack unless -; there is a comment explicitly asking you to backtrack -; -; When parsing a repeated construct, prefer alternatives that parse as many -; repetitions as possible. On in other words: -; -; [a] = a / "" -; -; a* = a* a / "" -; -; Note that the latter rule also specifies that repetition produces -; left-associated expressions. For example, function application is -; left-associative and all operators are left-associative when they are not -; parenthesized. -; -; Additionally, try alternatives in an order that minimizes backtracking -; according to the following rule: -; -; (a / b) (c / d) = a c / a d / b c / b d - -; NOTE: There are many line endings in the wild -; -; See: https://en.wikipedia.org/wiki/Newline -; -; For simplicity this supports Unix and Windows line-endings, which are the most -; common -end-of-line = - %x0A ; "\n" - / %x0D.0A ; "\r\n" - -tab = %x09 ; "\t" - -block-comment = "{-" block-comment-continue - -block-comment-chunk = - block-comment - / %x20-10FFFF - / tab - / end-of-line - -block-comment-continue = "-}" / block-comment-chunk block-comment-continue - -not-end-of-line = %x20-10FFFF / tab - -; NOTE: Slightly different from Haskell-style single-line comments because this -; does not require a space after the dashes -line-comment = "--" *not-end-of-line end-of-line - -whitespace-chunk = - " " - / tab - / end-of-line - / line-comment - / block-comment - -whsp = *whitespace-chunk - -; nonempty whitespace -whsp1 = 1*whitespace-chunk - -; Uppercase or lowercase ASCII letter -ALPHA = %x41-5A / %x61-7A - -; ASCII digit -DIGIT = %x30-39 ; 0-9 - -HEXDIG = DIGIT / "A" / "B" / "C" / "D" / "E" / "F" - -; A simple label cannot be one of the reserved keywords -; listed in the `keyword` rule. -; A PEG parser could use negative lookahead to -; enforce this, e.g. as follows: -; simple-label = -; keyword 1*simple-label-next-char -; / !keyword (simple-label-first-char *simple-label-next-char) -simple-label-first-char = ALPHA / "_" -simple-label-next-char = ALPHA / DIGIT / "-" / "/" / "_" -simple-label = simple-label-first-char *simple-label-next-char - -quoted-label-char = - %x20-5F - ; %x60 = '`' - / %x61-7E - -quoted-label = 1*quoted-label-char - -; NOTE: Dhall does not support Unicode labels, mainly to minimize the potential -; for code obfuscation -label = ("`" quoted-label "`" / simple-label) - -; A nonreserved-label cannot not be any of the reserved identifiers for builtins (unless quoted). -; Their list can be found in semantics.md. This is not enforced by the grammar but -; should be checked by implementations. The only place where this restriction applies -; is bound variables. -; A PEG parser could use negative lookahead to avoid parsing those identifiers. -nonreserved-label = label - -; An any-label is allowed to be one of the reserved identifiers. -any-label = label - - -; Dhall's double-quoted strings are equivalent to JSON strings except with -; support for string interpolation (and escaping string interpolation) -; -; Dhall uses almost the same escaping rules as JSON (RFC7159) with one -; exception: Dhall adds a new `\$` escape sequence for dollar signs. This -; additional escape sequences lets you escape string interpolation by writing -; `\${` -; -; > The representation of strings is similar to conventions used in the C -; > family of programming languages. A string begins and ends with -; > quotation marks. All Unicode characters may be placed within the -; > quotation marks, except for the characters that must be escaped: -; > quotation mark, reverse solidus, and the control characters (U+0000 -; > through U+001F). -; > -; > Any character may be escaped. If the character is in the Basic -; > Multilingual Plane (U+0000 through U+FFFF), then it may be -; > represented as a six-character sequence: a reverse solidus, followed -; > by the lowercase letter u, followed by four hexadecimal digits that -; > encode the character's code point. The hexadecimal letters A though -; > F can be upper or lower case. So, for example, a string containing -; > only a single reverse solidus character may be represented as -; > "\u005C". -; > -; > Alternatively, there are two-character sequence escape -; > representations of some popular characters. So, for example, a -; > string containing only a single reverse solidus character may be -; > represented more compactly as "\\". -; > -; > To escape an extended character that is not in the Basic Multilingual -; > Plane, the character is represented as a 12-character sequence, -; > encoding the UTF-16 surrogate pair. So, for example, a string -; > containing only the G clef character (U+1D11E) may be represented as -; > "\uD834\uDD1E". -double-quote-chunk = - interpolation - ; '\' Beginning of escape sequence - / %x5C double-quote-escaped - / double-quote-char - -double-quote-escaped = - %x22 ; '"' quotation mark U+0022 - / %x24 ; '$' dollar sign U+0024 - / %x5C ; '\' reverse solidus U+005C - / %x2F ; '/' solidus U+002F - / %x62 ; 'b' backspace U+0008 - / %x66 ; 'f' form feed U+000C - / %x6E ; 'n' line feed U+000A - / %x72 ; 'r' carriage return U+000D - / %x74 ; 't' tab U+0009 - / %x75 4HEXDIG ; 'uXXXX' U+XXXX - -; Printable characters except double quote and backslash -double-quote-char = - %x20-21 - ; %x22 = '"' - / %x23-5B - ; %x5C = "\" - / %x5D-10FFFF - -double-quote-literal = %x22 *double-quote-chunk %x22 - -; NOTE: The only way to end a single-quote string literal with a single quote is -; to either interpolate the single quote, like this: -; -; ''ABC${"'"}'' -; -; ... or concatenate another string, like this: -; -; ''ABC'' ++ "'" -; -; If you try to end the string literal with a single quote then you get "'''", -; which is interpreted as an escaped pair of single quotes -single-quote-continue = - interpolation single-quote-continue - / escaped-quote-pair single-quote-continue - / escaped-interpolation single-quote-continue - / single-quote-char single-quote-continue - / "''" ; End of text literal - -; Escape two single quotes (i.e. replace this sequence with "''") -escaped-quote-pair = "'''" - -; Escape interpolation (i.e. replace this sequence with "${") -escaped-interpolation = "''${" - -single-quote-char = - %x20-10FFFF - / tab - / end-of-line - -single-quote-literal = "''" end-of-line single-quote-continue - -interpolation = "${" complete-expression "}" - -text-literal = (double-quote-literal / single-quote-literal) - -; RFC 5234 interprets string literals as case-insensitive and recommends using -; hex instead for case-sensitive strings -; -; If you don't feel like reading hex, these are all the same as the rule name. -; Keywords that should never be parsed as identifiers -if = %x69.66 -then = %x74.68.65.6e -else = %x65.6c.73.65 -let = %x6c.65.74 -in = %x69.6e -as = %x61.73 -using = %x75.73.69.6e.67 -merge = %x6d.65.72.67.65 -missing = %x6d.69.73.73.69.6e.67 -Infinity = %x49.6e.66.69.6e.69.74.79 -NaN = %x4e.61.4e -Some = %x53.6f.6d.65 - -; Unused rule that could be used as negative lookahead in the -; `simple-label` rule for parsers that support this. -keyword = - if / then / else - / let / in - / using / missing / as - / Infinity / NaN - / merge / Some - -; Reserved identifiers, only needed for some special cases of parsing -Optional = %x4f.70.74.69.6f.6e.61.6c -Text = %x54.65.78.74 -List = %x4c.69.73.74 - -combine = %x2227 / "/\" -combine-types = %x2A53 / "//\\" -prefer = %x2AFD / "//" -lambda = %x3BB / "\" -forall = %x2200 / %x66.6f.72.61.6c.6c -arrow = %x2192 / "->" - -exponent = "e" [ "+" / "-" ] 1*DIGIT - -numeric-double-literal = [ "+" / "-" ] 1*DIGIT ( "." 1*DIGIT [ exponent ] / exponent) - -minus-infinity-literal = "-" Infinity -plus-infinity-literal = Infinity - -double-literal = - ; "2.0" - numeric-double-literal - ; "-Infinity" - / minus-infinity-literal - ; "Infinity" - / plus-infinity-literal - ; "NaN" - / NaN - -natural-literal = 1*DIGIT - -integer-literal = ( "+" / "-" ) natural-literal - -; The implementation should recognize reserved names for builtins and treat them as special -; values instead of variables. -identifier = any-label [ whsp "@" whsp natural-literal ] - -; Printable characters other than " ()[]{}<>/\," -; -; Excluding those characters ensures that paths don't have to end with trailing -; whitespace most of the time -path-character = - ; %x20 = " " - %x21 - ; %x22 = "\"" - ; %x23 = "#" - / %x24-27 - ; %x28 = "(" - ; %x29 = ")" - / %x2A-2B - ; %x2C = "," - / %x2D-2E - ; %x2F = "/" - / %x30-3B - ; %x3C = "<" - / %x3D - ; %x3E = ">" - ; %x3F = "?" - / %x40-5A - ; %x5B = "[" - ; %x5C = "\" - ; %x5D = "]" - / %x5E-7A - ; %x7B = "{" - / %x7C - ; %x7D = "}" - / %x7E - -quoted-path-character = - %x20-21 - ; %x22 = "\"" - / %x23-2E - ; %x2F = "/" - / %x30-10FFFF - -unquoted-path-component = 1*path-character -quoted-path-component = 1*quoted-path-character - -path-component = "/" ( unquoted-path-component / %x22 quoted-path-component %x22 ) - -; The last path-component matched by this rule is referred to as "file" in the semantics, -; and the other path-components as "directory". -path = 1*path-component - -local = - parent-path - / here-path - / home-path - ; NOTE: Backtrack if parsing this alternative fails - ; - ; This is because the first character of this alternative will be "/", but - ; if the second character is "/" or "\" then this should have been parsed - ; as an operator instead of a path - / absolute-path - -parent-path = ".." path ; Relative path -here-path = "." path ; Relative path -home-path = "~" path ; Home-anchored path -absolute-path = path ; Absolute path - -; `http[s]` URI grammar based on RFC7230 and RFC 3986 with some differences -; noted below - -scheme = %x68.74.74.70 [ %x73 ] ; "http" [ "s" ] - -; NOTE: This does not match the official grammar for a URI. Specifically: -; -; * this replaces `path-abempty` with `path`, so an empty path is -; not valid -; * this does not support fragment identifiers, which have no meaning within -; Dhall expressions and do not affect import resolution -http-raw = scheme "://" authority path [ "?" query ] - -; NOTE: Backtrack if parsing the optional user info prefix fails -authority = [ userinfo "@" ] host [ ":" port ] - -userinfo = *( unreserved / pct-encoded / sub-delims / ":" ) - -host = IP-literal / IPv4address / reg-name - -port = *DIGIT - -IP-literal = "[" ( IPv6address / IPvFuture ) "]" - -IPvFuture = "v" 1*HEXDIG "." 1*( unreserved / sub-delims / ":" ) - -; NOTE: Backtrack when parsing each alternative -IPv6address = 6( h16 ":" ) ls32 - / "::" 5( h16 ":" ) ls32 - / [ h16 ] "::" 4( h16 ":" ) ls32 - / [ *1( h16 ":" ) h16 ] "::" 3( h16 ":" ) ls32 - / [ *2( h16 ":" ) h16 ] "::" 2( h16 ":" ) ls32 - / [ *3( h16 ":" ) h16 ] "::" h16 ":" ls32 - / [ *4( h16 ":" ) h16 ] "::" ls32 - / [ *5( h16 ":" ) h16 ] "::" h16 - / [ *6( h16 ":" ) h16 ] "::" - -h16 = 1*4HEXDIG - -ls32 = ( h16 ":" h16 ) / IPv4address - -IPv4address = dec-octet "." dec-octet "." dec-octet "." dec-octet - -; NOTE: Backtrack when parsing these alternatives and try them in reverse order -dec-octet = DIGIT ; 0-9 - / %x31-39 DIGIT ; 10-99 - / "1" 2DIGIT ; 100-199 - / "2" %x30-34 DIGIT ; 200-249 - / "25" %x30-35 ; 250-255 - -reg-name = *( unreserved / pct-encoded / sub-delims ) - -pchar = unreserved / pct-encoded / sub-delims / ":" / "@" - -query = *( pchar / "/" / "?" ) - -pct-encoded = "%" HEXDIG HEXDIG - -unreserved = ALPHA / DIGIT / "-" / "." / "_" / "~" - -sub-delims = "!" / "$" / "&" / "'" / "(" / ")" / "*" / "+" / "," / ";" / "=" - -http = - http-raw - [ whsp using whsp1 (import-hashed / "(" whsp import-hashed whsp ")") ] - -; Dhall supports unquoted environment variables that are Bash-compliant or -; quoted environment variables that are POSIX-compliant -env = "env:" - ( bash-environment-variable - / %x22 posix-environment-variable %x22 - ) - -; Bash supports a restricted subset of POSIX environment variables. From the -; Bash `man` page, an environment variable name is: -; -; > A word consisting only of alphanumeric characters and under-scores, and -; > beginning with an alphabetic character or an under-score -bash-environment-variable = (ALPHA / "_") *(ALPHA / DIGIT / "_") - -; The POSIX standard is significantly more flexible about legal environment -; variable names, which can contain alerts (i.e. '\a'), whitespace, or -; punctuation, for example. The POSIX standard says about environment variable -; names: -; -; > The value of an environment variable is a string of characters. For a -; > C-language program, an array of strings called the environment shall be made -; > available when a process begins. The array is pointed to by the external -; > variable environ, which is defined as: -; > -; > extern char **environ; -; > -; > These strings have the form name=value; names shall not contain the -; > character '='. For values to be portable across systems conforming to IEEE -; > Std 1003.1-2001, the value shall be composed of characters from the portable -; > character set (except NUL and as indicated below). -; -; Note that the standard does not explicitly state that the name must have at -; least one character, but `env` does not appear to support this and `env` -; claims to be POSIX-compliant. To be safe, Dhall requires at least one -; character like `env` -posix-environment-variable = 1*posix-environment-variable-character - -; These are all the characters from the POSIX Portable Character Set except for -; '\0' (NUL) and '='. Note that the POSIX standard does not explicitly state -; that environment variable names cannot have NUL. However, this is implicit -; in the fact that environment variables are passed to the program as -; NUL-terminated `name=value` strings, which implies that the `name` portion of -; the string cannot have NUL characters -posix-environment-variable-character = - %x5C ; '\' Beginning of escape sequence - ( %x22 ; '"' quotation mark U+0022 - / %x5C ; '\' reverse solidus U+005C - / %x61 ; 'a' alert U+0007 - / %x62 ; 'b' backspace U+0008 - / %x66 ; 'f' form feed U+000C - / %x6E ; 'n' line feed U+000A - / %x72 ; 'r' carriage return U+000D - / %x74 ; 't' tab U+0009 - / %x76 ; 'v' vertical tab U+000B - ) - ; Printable characters except double quote, backslash and equals - / %x20-21 - ; %x22 = '"' - / %x23-3C - ; %x3D = '=' - / %x3E-5B - ; %x5C = "\" - / %x5D-7E - -import-type = missing / local / http / env - -hash = %x73.68.61.32.35.36.3a 64HEXDIG ; "sha256:XXX...XXX" - -import-hashed = import-type [ whsp hash ] - -; "http://example.com" -; "./foo/bar" -; "env:FOO" -import = import-hashed [ whsp as whsp1 Text ] - -expression = - ; "\(x : a) -> b" - lambda whsp "(" whsp nonreserved-label whsp ":" whsp1 expression whsp ")" whsp arrow whsp expression - - ; "if a then b else c" - / if whsp1 expression whsp then whsp1 expression whsp else whsp1 expression - - ; "let x : t = e1 in e2" - ; "let x = e1 in e2" - ; "let x = e1 let y = e2 in e3" - / 1*let-binding in whsp1 expression - - ; "forall (x : a) -> b" - / forall whsp "(" whsp nonreserved-label whsp ":" whsp1 expression whsp ")" whsp arrow whsp expression - - ; "a -> b" - ; - ; NOTE: Backtrack if parsing this alternative fails - / operator-expression whsp arrow whsp expression - - ; "merge e1 e2 : t" - ; "merge e1 e2" - / merge whsp1 import-expression whsp import-expression [ whsp ":" whsp1 application-expression ] - - ; "[] : List t" - ; "[] : Optional t" - ; "[x] : Optional t" - ; - ; NOTE: Backtrack if parsing this alternative fails since we can't tell - ; from the opening bracket whether or not this will be an empty list or - ; a non-empty list - / "[" whsp (empty-collection / non-empty-optional) - - ; "x : t" - / annotated-expression - -; Nonempty-whitespace to disambiguate `env:VARIABLE` from type annotations -annotated-expression = operator-expression [ whsp ":" whsp1 expression ] - -; "let x = e1" -let-binding = let whsp1 nonreserved-label whsp [ ":" whsp1 expression whsp ] "=" whsp expression whsp - -; "] : List t" -; "] : Optional t" -empty-collection = "]" whsp ":" whsp1 (List / Optional) whsp import-expression - -; "x] : Optional t" -non-empty-optional = expression whsp "]" whsp ":" whsp1 Optional whsp import-expression - -operator-expression = import-alt-expression - -; Nonempty-whitespace to disambiguate `http://a/a?a` -import-alt-expression = or-expression *(whsp "?" whsp1 or-expression) -or-expression = plus-expression *(whsp "||" whsp plus-expression) -; Nonempty-whitespace to disambiguate `f +2` -plus-expression = text-append-expression *(whsp "+" whsp1 text-append-expression) -text-append-expression = list-append-expression *(whsp "++" whsp list-append-expression) -list-append-expression = and-expression *(whsp "#" whsp and-expression) -and-expression = combine-expression *(whsp "&&" whsp combine-expression) -combine-expression = prefer-expression *(whsp combine whsp prefer-expression) -prefer-expression = combine-types-expression *(whsp prefer whsp combine-types-expression) -combine-types-expression = times-expression *(whsp combine-types whsp times-expression) -times-expression = equal-expression *(whsp "*" whsp equal-expression) -equal-expression = not-equal-expression *(whsp "==" whsp not-equal-expression) -not-equal-expression = application-expression *(whsp "!=" whsp application-expression) - -; Import expressions need to be separated by some whitespace, otherwise there -; would be ambiguity: `./ab` could be interpreted as "import the file `./ab`", -; or "apply the import `./a` to label `b`" -application-expression = - [ Some whsp1 ] import-expression *(whsp1 import-expression) - -import-expression = import / selector-expression - -; `record.field` extracts one field of a record -; -; `record.{ field0, field1, field2 }` projects out several fields of a record -; -; NOTE: Backtrack when parsing the `*("." ...)`. The reason why is that you -; can't tell from parsing just the period whether "foo." will become "foo.bar" -; (i.e. accessing field `bar` of the record `foo`) or `foo./bar` (i.e. applying -; the function `foo` to the relative path `./bar`) -selector-expression = primitive-expression *(whsp "." whsp selector) - -selector = any-label / labels - -labels = "{" whsp [ any-label whsp *("," whsp any-label whsp) ] "}" - -; NOTE: Backtrack when parsing the first three alternatives (i.e. the numeric -; literals). This is because they share leading characters in common -primitive-expression = - ; "2.0" - double-literal - - ; "2" - / natural-literal - - ; "+2" - / integer-literal - - ; '"ABC"' - / text-literal - - ; "{ foo = 1 , bar = True }" - ; "{ foo : Integer, bar : Bool }" - / "{" whsp record-type-or-literal whsp "}" - - ; "< Foo : Integer | Bar : Bool >" - ; "< Foo : Integer | Bar = True | Baz : Bool >" - ; "< Foo | Bar : Bool >" - / "<" whsp union-type-or-literal whsp ">" - - ; "[1, 2, 3]" - ; `empty-collection` handles empty lists - / non-empty-list-literal - - ; "x" - ; "x@2" - / identifier - - ; "( e )" - / "(" complete-expression ")" - -record-type-or-literal = - empty-record-literal - / non-empty-record-type-or-literal - / empty-record-type - -empty-record-literal = "=" -empty-record-type = "" - -non-empty-record-type-or-literal = - any-label whsp (non-empty-record-literal / non-empty-record-type) - -non-empty-record-type = ":" whsp1 expression *(whsp "," whsp record-type-entry) -record-type-entry = any-label whsp ":" whsp1 expression - -non-empty-record-literal = "=" whsp expression *(whsp "," whsp record-literal-entry) -record-literal-entry = any-label whsp "=" whsp expression - -union-type-or-literal = - non-empty-union-type-or-literal - / empty-union-type - -empty-union-type = "" - -non-empty-union-type-or-literal = - any-label [ whsp ( union-literal-variant-value / union-type-or-literal-variant-type) ] - -; = True | ... -union-literal-variant-value = "=" whsp expression *(whsp "|" whsp union-type-entry) -union-type-entry = any-label whsp ":" whsp1 expression - -; : Integer | ... -; | ... -union-type-or-literal-variant-type = [ ":" whsp1 expression ] [ whsp "|" whsp non-empty-union-type-or-literal ] - - -non-empty-list-literal = "[" whsp expression whsp *("," whsp expression whsp) "]" - -; This just adds surrounding whitespace for the top-level of the program -complete-expression = whsp expression whsp diff --git a/dhall_parser/src/dhall.abnf b/dhall_parser/src/dhall.abnf new file mode 120000 index 0000000..ce13b8e --- /dev/null +++ b/dhall_parser/src/dhall.abnf @@ -0,0 +1 @@ +../../dhall-lang/standard/dhall.abnf \ No newline at end of file -- cgit v1.2.3