From a35de29355dc85f63e0e8514a2e447bf5322ba20 Mon Sep 17 00:00:00 2001 From: Nadrieril Date: Sat, 30 May 2020 15:05:36 +0100 Subject: Commit grammar file locally The crate publishing process does not allow access to files outside the current crate. --- dhall/build.rs | 2 +- dhall/src/syntax/text/dhall.abnf | 936 +++++++++++++++++++++++++++++++++++++++ dhall/src/syntax/text/parser.rs | 29 ++ 3 files changed, 966 insertions(+), 1 deletion(-) create mode 100644 dhall/src/syntax/text/dhall.abnf diff --git a/dhall/build.rs b/dhall/build.rs index 71c634b..a12e91b 100644 --- a/dhall/build.rs +++ b/dhall/build.rs @@ -348,7 +348,7 @@ fn generate_tests() -> std::io::Result<()> { fn convert_abnf_to_pest() -> std::io::Result<()> { let out_dir = env::var("OUT_DIR").unwrap(); - let abnf_path = "../dhall-lang/standard/dhall.abnf"; + let abnf_path = "src/syntax/text/dhall.abnf"; let visibility_path = "src/syntax/text/dhall.pest.visibility"; let grammar_path = Path::new(&out_dir).join("dhall.pest"); println!("cargo:rerun-if-changed={}", abnf_path); diff --git a/dhall/src/syntax/text/dhall.abnf b/dhall/src/syntax/text/dhall.abnf new file mode 100644 index 0000000..1c3a980 --- /dev/null +++ b/dhall/src/syntax/text/dhall.abnf @@ -0,0 +1,936 @@ +; ABNF syntax based on RFC 5234 +; +; The character encoding for Dhall is UTF-8 +; +; Some notes on implementing this grammar: +; +; First, do not use a lexer to tokenize the file before parsing. Instead, treat +; the individual characters of the file as the tokens to feed into the parser. +; You should not use a lexer because Dhall's grammar supports two features which +; cannot be correctly supported by a lexer: +; +; * String interpolation (i.e. "foo ${Natural/toInteger bar} baz") +; * Nested block comments (i.e. "{- foo {- bar -} baz -}") +; +; Second, this grammar assumes that your parser can backtrack and/or try +; multiple parses simultaneously. 
For example, consider this expression: +; +; List ./MyType +; +; A parser might first try to parse the period as the beginning of a field +; selector, only to realize immediately afterwards that `/MyType` is not a valid +; name for a field. A conforming parser must backtrack so that the expression +; `./MyType` can instead be correctly interpreted as a relative path +; +; Third, if there are multiple valid parses then prefer the first parse +; according to the ordering of alternatives. That is, the order of evaluation +; of the alternatives is left-to-right. +; +; For example, the grammar for single quoted string literals is: +; +; single-quote-continue = +; "'''" single-quote-continue +; / "${" complete-expression "}" single-quote-continue +; / "''${" single-quote-continue +; / "''" +; / %x20-10FFFF single-quote-continue +; / tab single-quote-continue +; / end-of-line single-quote-continue +; +; single-quote-literal = "''" single-quote-continue +; +; ... which permits valid parses for the following code: +; +; "''''''''''''''''" +; +; If you tried to parse all alternatives then there are at least two valid +; interpretations for the above code: +; +; * A single quoted literal with four escape sequences of the form "'''" +; * i.e. "''" followed by "'''" four times in a row followed by "''" +; * Four empty single quoted literals +; * i.e. "''''" four times in a row +; +; The correct interpretation is the first one because parsing the escape +; sequence "'''" takes precedence over parsing the termination sequence "''", +; according to the order of the alternatives in the `single-quote-continue` +; rule. +; +; Some parsing libraries do not backtrack by default but allow the user to +; selectively backtrack in certain parts of the grammar. Usually parsing +; libraries do this to improve efficiency and error messages. 
Dhall's grammar +; takes that into account by minimizing the number of rules that require the +; parser to backtrack and comments below will highlight where you need to +; explicitly backtrack +; +; Specifically, if you see an uninterrupted literal in a grammar rule such as: +; +; "->" +; +; ... or: +; +; %x66.6f.72.61.6c.6c +; +; ... then that string literal is parsed as a single unit, meaning that you +; should backtrack if you parse only part of the literal +; +; In all other cases you can assume that you do not need to backtrack unless +; there is a comment explicitly asking you to backtrack +; +; When parsing a repeated construct, prefer alternatives that parse as many +; repetitions as possible. On in other words: +; +; [a] = a / "" +; +; a* = a* a / "" +; +; Note that the latter rule also specifies that repetition produces +; left-associated expressions. For example, function application is +; left-associative and all operators are left-associative when they are not +; parenthesized. 
+; +; Additionally, try alternatives in an order that minimizes backtracking +; according to the following rule: +; +; (a / b) (c / d) = a c / a d / b c / b d + +; NOTE: There are many line endings in the wild +; +; See: https://en.wikipedia.org/wiki/Newline +; +; For simplicity this supports Unix and Windows line-endings, which are the most +; common +end-of-line = + %x0A ; "\n" + / %x0D.0A ; "\r\n" + +; This rule matches all characters that are not: +; +; * not ASCII +; * not part of a surrogate pair +; * not a "non-character" +valid-non-ascii = + %x80-D7FF + ; %xD800-DFFF = surrogate pairs + / %xE000-FFFD + ; %xFFFE-FFFF = non-characters + / %x10000-1FFFD + ; %x1FFFE-1FFFF = non-characters + / %x20000-2FFFD + ; %x2FFFE-2FFFF = non-characters + / %x30000-3FFFD + ; %x3FFFE-3FFFF = non-characters + / %x40000-4FFFD + ; %x4FFFE-4FFFF = non-characters + / %x50000-5FFFD + ; %x5FFFE-5FFFF = non-characters + / %x60000-6FFFD + ; %x6FFFE-6FFFF = non-characters + / %x70000-7FFFD + ; %x7FFFE-7FFFF = non-characters + / %x80000-8FFFD + ; %x8FFFE-8FFFF = non-characters + / %x90000-9FFFD + ; %x9FFFE-9FFFF = non-characters + / %xA0000-AFFFD + ; %xAFFFE-AFFFF = non-characters + / %xB0000-BFFFD + ; %xBFFFE-BFFFF = non-characters + / %xC0000-CFFFD + ; %xCFFFE-CFFFF = non-characters + / %xD0000-DFFFD + ; %xDFFFE-DFFFF = non-characters + / %xE0000-EFFFD + ; %xEFFFE-EFFFF = non-characters + / %xF0000-FFFFD + ; %xFFFFE-FFFFF = non-characters + / %x100000-10FFFD + ; %x10FFFE-10FFFF = non-characters + +tab = %x09 ; "\t" + +block-comment = "{-" block-comment-continue + +block-comment-char = + %x20-7F + / valid-non-ascii + / tab + / end-of-line + +block-comment-continue = + "-}" + / block-comment block-comment-continue + / block-comment-char block-comment-continue + +not-end-of-line = %x20-7F / valid-non-ascii / tab + +; NOTE: Slightly different from Haskell-style single-line comments because this +; does not require a space after the dashes +line-comment = "--" *not-end-of-line end-of-line 
+ +whitespace-chunk = + " " + / tab + / end-of-line + / line-comment + / block-comment + +whsp = *whitespace-chunk + +; nonempty whitespace +whsp1 = 1*whitespace-chunk + +; Uppercase or lowercase ASCII letter +ALPHA = %x41-5A / %x61-7A + +; ASCII digit +DIGIT = %x30-39 ; 0-9 + +ALPHANUM = ALPHA / DIGIT + +HEXDIG = DIGIT / "A" / "B" / "C" / "D" / "E" / "F" + +; A simple label cannot be one of the reserved keywords +; listed in the `keyword` rule. +; A PEG parser could use negative lookahead to +; enforce this, e.g. as follows: +; simple-label = +; keyword 1*simple-label-next-char +; / !keyword (simple-label-first-char *simple-label-next-char) +simple-label-first-char = ALPHA / "_" +simple-label-next-char = ALPHANUM / "-" / "/" / "_" +simple-label = simple-label-first-char *simple-label-next-char + +quoted-label-char = + %x20-5F + ; %x60 = '`' + / %x61-7E + +quoted-label = 1*quoted-label-char + +; NOTE: Dhall does not support Unicode labels, mainly to minimize the potential +; for code obfuscation +label = ("`" quoted-label "`" / simple-label) + +; A nonreserved-label cannot not be any of the reserved identifiers for builtins +; (unless quoted). +; Their list can be found in the `builtin` rule. +; The only place where this restriction applies is bound variables. +; A PEG parser could use negative lookahead to avoid parsing those identifiers, +; e.g. as follows: +; nonreserved-label = +; builtin 1*simple-label-next-char +; / !builtin label +nonreserved-label = label + +; An any-label is allowed to be one of the reserved identifiers (but not a keyword). +any-label = label + +; Allow specifically `Some` in record and union labels. 
+any-label-or-some = any-label / Some + +; Dhall's double-quoted strings are similar to JSON strings (RFC7159) except: +; +; * Dhall strings support string interpolation +; +; * Dhall strings also support escaping string interpolation by adding a new +; `\$` escape sequence +; +; * Dhall strings also allow Unicode escape sequences of the form `\u{XXX}` +double-quote-chunk = + interpolation + ; '\' Beginning of escape sequence + / %x5C double-quote-escaped + / double-quote-char + +double-quote-escaped = + %x22 ; '"' quotation mark U+0022 + / %x24 ; '$' dollar sign U+0024 + / %x5C ; '\' reverse solidus U+005C + / %x2F ; '/' solidus U+002F + / %x62 ; 'b' backspace U+0008 + / %x66 ; 'f' form feed U+000C + / %x6E ; 'n' line feed U+000A + / %x72 ; 'r' carriage return U+000D + / %x74 ; 't' tab U+0009 + / %x75 unicode-escape ; 'uXXXX' / 'u{XXXX}' U+XXXX + +; Valid Unicode escape sequences are as follows: +; +; * Exactly 4 hexadecimal digits without braces: +; `\uXXXX` +; * 1-6 hexadecimal digits within braces (with optional zero padding): +; `\u{XXXX}`, `\u{000X}`, `\u{XXXXX}`, `\u{00000XXXXX}`, etc. +; Any number of leading zeros are allowed within the braces preceding the 1-6 +; digits specifying the codepoint. +; +; From these sequences, the parser must also reject any codepoints that are in +; the following ranges: +; +; * Surrogate pairs: `%xD800-DFFF` +; * Non-characters: `%xNFFFE-NFFFF` / `%x10FFFE-10FFFF` for `N` in `{ 0 .. F }` +; +; See the `valid-non-ascii` rule for the exact ranges that are not allowed +unicode-escape = unbraced-escape / "{" braced-escape "}" + +; All valid last 4 digits for unicode codepoints (outside Plane 0): `0000-FFFD` +unicode-suffix = (DIGIT / "A" / "B" / "C" / "D" / "E") 3HEXDIG + / "F" 2HEXDIG (DIGIT / "A" / "B" / "C" / "D") + +; All 4-hex digit unicode escape sequences that are not: +; +; * Surrogate pairs (i.e. `%xD800-DFFF`) +; * Non-characters (i.e. 
`%xFFFE-FFFF`) +; +unbraced-escape = + (DIGIT / "A" / "B" / "C") 3HEXDIG + / "D" ("0" / "1" / "2" / "3" / "4" / "5" / "6" / "7") HEXDIG HEXDIG + ; %xD800-DFFF Surrogate pairs + / "E" 3HEXDIG + / "F" 2HEXDIG (DIGIT / "A" / "B" / "C" / "D") + ; %xFFFE-FFFF Non-characters + +; All 1-6 digit unicode codepoints that are not: +; +; * Surrogate pairs: `%xD800-DFFF` +; * Non-characters: `%xNFFFE-NFFFF` / `%x10FFFE-10FFFF` for `N` in `{ 0 .. F }` +; +; See the `valid-non-ascii` rule for the exact ranges that are not allowed +braced-codepoint = + ("1" / "2" / "3" / "4" / "5" / "6" / "7" / "8" / "9" / "A" / "B" / "C" / "D" / "E" / "F" / "10") unicode-suffix; (Planes 1-16) + / unbraced-escape ; (Plane 0) + / 1*3HEXDIG ; %x000-FFF + +; Allow zero padding for braced codepoints +braced-escape = *"0" braced-codepoint + +; Printable characters except double quote and backslash +double-quote-char = + %x20-21 + ; %x22 = '"' + / %x23-5B + ; %x5C = "\" + / %x5D-7F + / valid-non-ascii + +double-quote-literal = %x22 *double-quote-chunk %x22 + +; NOTE: The only way to end a single-quote string literal with a single quote is +; to either interpolate the single quote, like this: +; +; ''ABC${"'"}'' +; +; ... or concatenate another string, like this: +; +; ''ABC'' ++ "'" +; +; If you try to end the string literal with a single quote then you get "'''", +; which is interpreted as an escaped pair of single quotes +single-quote-continue = + interpolation single-quote-continue + / escaped-quote-pair single-quote-continue + / escaped-interpolation single-quote-continue + / "''" ; End of text literal + / single-quote-char single-quote-continue + +; Escape two single quotes (i.e. replace this sequence with "''") +escaped-quote-pair = "'''" + +; Escape interpolation (i.e. 
replace this sequence with "${") +escaped-interpolation = "''${" + +single-quote-char = + %x20-7F + / valid-non-ascii + / tab + / end-of-line + +single-quote-literal = "''" end-of-line single-quote-continue + +interpolation = "${" complete-expression "}" + +text-literal = (double-quote-literal / single-quote-literal) + +; RFC 5234 interprets string literals as case-insensitive and recommends using +; hex instead for case-sensitive strings +; +; If you don't feel like reading hex, these are all the same as the rule name. +; Keywords that should never be parsed as identifiers +if = %x69.66 +then = %x74.68.65.6e +else = %x65.6c.73.65 +let = %x6c.65.74 +in = %x69.6e +as = %x61.73 +using = %x75.73.69.6e.67 +merge = %x6d.65.72.67.65 +missing = %x6d.69.73.73.69.6e.67 +Infinity = %x49.6e.66.69.6e.69.74.79 +NaN = %x4e.61.4e +Some = %x53.6f.6d.65 +toMap = %x74.6f.4d.61.70 +assert = %x61.73.73.65.72.74 +forall = %x2200 / %x66.6f.72.61.6c.6c ; "∀" / "forall" +with = %x77.69.74.68 + +; Unused rule that could be used as negative lookahead in the +; `simple-label` rule for parsers that support this. 
+keyword = + if / then / else + / let / in + / using / missing + / assert / as + / Infinity / NaN + / merge / Some / toMap + / forall + / with + +builtin = + Natural-fold + / Natural-build + / Natural-isZero + / Natural-even + / Natural-odd + / Natural-toInteger + / Natural-show + / Integer-toDouble + / Integer-show + / Integer-negate + / Integer-clamp + / Natural-subtract + / Double-show + / List-build + / List-fold + / List-length + / List-head + / List-last + / List-indexed + / List-reverse + / Optional-fold + / Optional-build + / Text-show + / Bool + / True + / False + / Optional + / None + / Natural + / Integer + / Double + / Text + / List + / Type + / Kind + / Sort + +; Reserved identifiers, needed for some special cases of parsing +Optional = %x4f.70.74.69.6f.6e.61.6c +Text = %x54.65.78.74 +List = %x4c.69.73.74 +Location = %x4c.6f.63.61.74.69.6f.6e + +; Reminder of the reserved identifiers, needed for the `builtin` rule +Bool = %x42.6f.6f.6c +True = %x54.72.75.65 +False = %x46.61.6c.73.65 +None = %x4e.6f.6e.65 +Natural = %x4e.61.74.75.72.61.6c +Integer = %x49.6e.74.65.67.65.72 +Double = %x44.6f.75.62.6c.65 +Type = %x54.79.70.65 +Kind = %x4b.69.6e.64 +Sort = %x53.6f.72.74 +Natural-fold = %x4e.61.74.75.72.61.6c.2f.66.6f.6c.64 +Natural-build = %x4e.61.74.75.72.61.6c.2f.62.75.69.6c.64 +Natural-isZero = %x4e.61.74.75.72.61.6c.2f.69.73.5a.65.72.6f +Natural-even = %x4e.61.74.75.72.61.6c.2f.65.76.65.6e +Natural-odd = %x4e.61.74.75.72.61.6c.2f.6f.64.64 +Natural-toInteger = %x4e.61.74.75.72.61.6c.2f.74.6f.49.6e.74.65.67.65.72 +Natural-show = %x4e.61.74.75.72.61.6c.2f.73.68.6f.77 +Natural-subtract = %x4e.61.74.75.72.61.6c.2f.73.75.62.74.72.61.63.74 +Integer-toDouble = %x49.6e.74.65.67.65.72.2f.74.6f.44.6f.75.62.6c.65 +Integer-show = %x49.6e.74.65.67.65.72.2f.73.68.6f.77 +Integer-negate = %x49.6e.74.65.67.65.72.2f.6e.65.67.61.74.65 +Integer-clamp = %x49.6e.74.65.67.65.72.2f.63.6c.61.6d.70 +Double-show = %x44.6f.75.62.6c.65.2f.73.68.6f.77 +List-build = 
%x4c.69.73.74.2f.62.75.69.6c.64 +List-fold = %x4c.69.73.74.2f.66.6f.6c.64 +List-length = %x4c.69.73.74.2f.6c.65.6e.67.74.68 +List-head = %x4c.69.73.74.2f.68.65.61.64 +List-last = %x4c.69.73.74.2f.6c.61.73.74 +List-indexed = %x4c.69.73.74.2f.69.6e.64.65.78.65.64 +List-reverse = %x4c.69.73.74.2f.72.65.76.65.72.73.65 +Optional-fold = %x4f.70.74.69.6f.6e.61.6c.2f.66.6f.6c.64 +Optional-build = %x4f.70.74.69.6f.6e.61.6c.2f.62.75.69.6c.64 +Text-show = %x54.65.78.74.2f.73.68.6f.77 + +; Operators +combine = %x2227 / "/\" +combine-types = %x2A53 / "//\\" +equivalent = %x2261 / "===" +prefer = %x2AFD / "//" +lambda = %x3BB / "\" +arrow = %x2192 / "->" +complete = "::" + +exponent = "e" [ "+" / "-" ] 1*DIGIT + +numeric-double-literal = [ "+" / "-" ] 1*DIGIT ( "." 1*DIGIT [ exponent ] / exponent) + +minus-infinity-literal = "-" Infinity +plus-infinity-literal = Infinity + +double-literal = + ; "2.0" + numeric-double-literal + ; "-Infinity" + / minus-infinity-literal + ; "Infinity" + / plus-infinity-literal + ; "NaN" + / NaN + +natural-literal = + ; Hexadecimal with "0x" prefix + "0" %x78 1*HEXDIG + ; Decimal; leading 0 digits are not allowed + / ("1" / "2" / "3" / "4" / "5" / "6" / "7" / "8" / "9") *DIGIT + ; ... except for 0 itself + / "0" + +integer-literal = ( "+" / "-" ) natural-literal + +; If the identifier matches one of the names in the `builtin` rule, then it is a +; builtin, and should be treated as the corresponding item in the list of +; "Reserved identifiers for builtins" specified in the `standard/README.md` document. +; It is a syntax error to specify a de Bruijn index in this case. +; Otherwise, this is a variable with name and index matching the label and index. 
+identifier = variable / builtin + +variable = nonreserved-label [ whsp "@" whsp natural-literal ] + +; Printable characters other than " ()[]{}<>/\," +; +; Excluding those characters ensures that paths don't have to end with trailing +; whitespace most of the time +path-character = + ; %x20 = " " + %x21 + ; %x22 = "\"" + ; %x23 = "#" + / %x24-27 + ; %x28 = "(" + ; %x29 = ")" + / %x2A-2B + ; %x2C = "," + / %x2D-2E + ; %x2F = "/" + / %x30-3B + ; %x3C = "<" + / %x3D + ; %x3E = ">" + ; %x3F = "?" + / %x40-5A + ; %x5B = "[" + ; %x5C = "\" + ; %x5D = "]" + / %x5E-7A + ; %x7B = "{" + / %x7C + ; %x7D = "}" + / %x7E + +quoted-path-character = + %x20-21 + ; %x22 = "\"" + / %x23-2E + ; %x2F = "/" + / %x30-7F + / valid-non-ascii + +unquoted-path-component = 1*path-character +quoted-path-component = 1*quoted-path-character + +path-component = "/" ( unquoted-path-component / %x22 quoted-path-component %x22 ) + +; The last path-component matched by this rule is referred to as "file" in the semantics, +; and the other path-components as "directory". +path = 1*path-component + +local = + parent-path + / here-path + / home-path + ; NOTE: Backtrack if parsing this alternative fails + ; + ; This is because the first character of this alternative will be "/", but + ; if the second character is "/" or "\" then this should have been parsed + ; as an operator instead of a path + / absolute-path + +parent-path = ".." path ; Relative path +here-path = "." path ; Relative path +home-path = "~" path ; Home-anchored path +absolute-path = path ; Absolute path + +; `http[s]` URI grammar based on RFC7230 and RFC 3986 with some differences +; noted below + +scheme = %x68.74.74.70 [ %x73 ] ; "http" [ "s" ] + +; NOTE: This does not match the official grammar for a URI. 
Specifically: +; +; * path segments may be quoted instead of using percent-encoding +; * this does not support fragment identifiers, which have no meaning within +; Dhall expressions and do not affect import resolution +; * the characters "(" ")" and "," are not included in the `sub-delims` rule: +; in particular, these characters can't be used in authority, path or query +; strings. This is because those characters have other meaning in Dhall +; and it would be confusing for the comma in +; [http://example.com/foo, bar] +; to be part of the URL instead of part of the list. If you need a URL +; which contains parens or a comma, you must percent-encode them. +; +; Reserved characters in quoted path components should be percent-encoded +; according to https://tools.ietf.org/html/rfc3986#section-2 +http-raw = scheme "://" authority url-path [ "?" query ] + +; Temporary rule to allow old-style `path-component`s and RFC3986 `segment`s in +; the same grammar. Eventually we can just use `path-abempty` from the same +; RFC. See issue #581 + +url-path = *(path-component / "/" segment) + +; NOTE: Backtrack if parsing the optional user info prefix fails +authority = [ userinfo "@" ] host [ ":" port ] + +userinfo = *( unreserved / pct-encoded / sub-delims / ":" ) + +host = IP-literal / IPv4address / domain + +port = *DIGIT + +IP-literal = "[" ( IPv6address / IPvFuture ) "]" + +IPvFuture = "v" 1*HEXDIG "." 1*( unreserved / sub-delims / ":" ) + +; NOTE: Backtrack when parsing each alternative +IPv6address = 6( h16 ":" ) ls32 + / "::" 5( h16 ":" ) ls32 + / [ h16 ] "::" 4( h16 ":" ) ls32 + / [ h16 *1( ":" h16 ) ] "::" 3( h16 ":" ) ls32 + / [ h16 *2( ":" h16 ) ] "::" 2( h16 ":" ) ls32 + / [ h16 *3( ":" h16 ) ] "::" h16 ":" ls32 + / [ h16 *4( ":" h16 ) ] "::" ls32 + / [ h16 *5( ":" h16 ) ] "::" h16 + / [ h16 *6( ":" h16 ) ] "::" + +h16 = 1*4HEXDIG + +ls32 = h16 ":" h16 / IPv4address + +IPv4address = dec-octet "." dec-octet "." dec-octet "." 
dec-octet + +; NOTE: Backtrack when parsing these alternatives +dec-octet = "25" %x30-35 ; 250-255 + / "2" %x30-34 DIGIT ; 200-249 + / "1" 2DIGIT ; 100-199 + / %x31-39 DIGIT ; 10-99 + / DIGIT ; 0-9 + +; Look in RFC3986 3.2.2 for +; "A registered name intended for lookup in the DNS" +domain = domainlabel *("." domainlabel) [ "." ] + +domainlabel = 1*ALPHANUM *(1*"-" 1*ALPHANUM) + +segment = *pchar + +pchar = unreserved / pct-encoded / sub-delims / ":" / "@" + +query = *( pchar / "/" / "?" ) + +pct-encoded = "%" HEXDIG HEXDIG + +unreserved = ALPHANUM / "-" / "." / "_" / "~" + +; this is the RFC3986 sub-delims rule, without "(", ")" or "," +; see comments above the `http-raw` rule above +sub-delims = "!" / "$" / "&" / "'" / "*" / "+" / ";" / "=" + +http = http-raw [ whsp using whsp1 import-expression ] + +; Dhall supports unquoted environment variables that are Bash-compliant or +; quoted environment variables that are POSIX-compliant +env = "env:" + ( bash-environment-variable + / %x22 posix-environment-variable %x22 + ) + +; Bash supports a restricted subset of POSIX environment variables. From the +; Bash `man` page, an environment variable name is: +; +; > A word consisting only of alphanumeric characters and under-scores, and +; > beginning with an alphabetic character or an under-score +bash-environment-variable = (ALPHA / "_") *(ALPHANUM / "_") + +; The POSIX standard is significantly more flexible about legal environment +; variable names, which can contain alerts (i.e. '\a'), whitespace, or +; punctuation, for example. The POSIX standard says about environment variable +; names: +; +; > The value of an environment variable is a string of characters. For a +; > C-language program, an array of strings called the environment shall be made +; > available when a process begins. 
The array is pointed to by the external +; > variable environ, which is defined as: +; > +; > extern char **environ; +; > +; > These strings have the form name=value; names shall not contain the +; > character '='. For values to be portable across systems conforming to IEEE +; > Std 1003.1-2001, the value shall be composed of characters from the portable +; > character set (except NUL and as indicated below). +; +; Note that the standard does not explicitly state that the name must have at +; least one character, but `env` does not appear to support this and `env` +; claims to be POSIX-compliant. To be safe, Dhall requires at least one +; character like `env` +posix-environment-variable = 1*posix-environment-variable-character + +; These are all the characters from the POSIX Portable Character Set except for +; '\0' (NUL) and '='. Note that the POSIX standard does not explicitly state +; that environment variable names cannot have NUL. However, this is implicit +; in the fact that environment variables are passed to the program as +; NUL-terminated `name=value` strings, which implies that the `name` portion of +; the string cannot have NUL characters +posix-environment-variable-character = + %x5C ; '\' Beginning of escape sequence + ( %x22 ; '"' quotation mark U+0022 + / %x5C ; '\' reverse solidus U+005C + / %x61 ; 'a' alert U+0007 + / %x62 ; 'b' backspace U+0008 + / %x66 ; 'f' form feed U+000C + / %x6E ; 'n' line feed U+000A + / %x72 ; 'r' carriage return U+000D + / %x74 ; 't' tab U+0009 + / %x76 ; 'v' vertical tab U+000B + ) + ; Printable characters except double quote, backslash and equals + / %x20-21 + ; %x22 = '"' + / %x23-3C + ; %x3D = '=' + / %x3E-5B + ; %x5C = "\" + / %x5D-7E + +import-type = missing / local / http / env + +hash = %x73.68.61.32.35.36.3a 64HEXDIG ; "sha256:XXX...XXX" + +import-hashed = import-type [ whsp1 hash ] + +; "http://example.com" +; "./foo/bar" +; "env:FOO" +import = import-hashed [ whsp as whsp1 (Text / Location) ] + +expression = + 
; "\(x : a) -> b" + lambda whsp "(" whsp nonreserved-label whsp ":" whsp1 expression whsp ")" whsp arrow whsp expression + + ; "if a then b else c" + / if whsp1 expression whsp then whsp1 expression whsp else whsp1 expression + + ; "let x : t = e1 in e2" + ; "let x = e1 in e2" + ; We allow dropping the `in` between adjacent let-expressions; the following are equivalent: + ; "let x = e1 let y = e2 in e3" + ; "let x = e1 in let y = e2 in e3" + / 1*let-binding in whsp1 expression + + ; "forall (x : a) -> b" + / forall whsp "(" whsp nonreserved-label whsp ":" whsp1 expression whsp ")" whsp arrow whsp expression + + ; "a -> b" + ; + ; NOTE: Backtrack if parsing this alternative fails + / operator-expression whsp arrow whsp expression + + ; "merge e1 e2 : t" + ; + ; NOTE: Backtrack if parsing this alternative fails since we can't tell + ; from the keyword whether there will be a type annotation or not + / merge whsp1 import-expression whsp1 import-expression whsp ":" whsp1 application-expression + + ; "[] : t" + ; + ; NOTE: Backtrack if parsing this alternative fails since we can't tell + ; from the opening bracket whether or not this will be an empty list or + ; a non-empty list + / empty-list-literal + + ; "toMap e : t" + ; + ; NOTE: Backtrack if parsing this alternative fails since we can't tell + ; from the keyword whether there will be a type annotation or not + / toMap whsp1 import-expression whsp ":" whsp1 application-expression + + ; "assert : Natural/even 1 === False" + / assert whsp ":" whsp1 expression + + ; "x : t" + / annotated-expression + +; Nonempty-whitespace to disambiguate `env:VARIABLE` from type annotations +annotated-expression = operator-expression [ whsp ":" whsp1 expression ] + +; "let x = e1" +let-binding = let whsp1 nonreserved-label whsp [ ":" whsp1 expression whsp ] "=" whsp expression whsp + +; "[] : t" +empty-list-literal = + "[" whsp [ "," whsp ] "]" whsp ":" whsp1 application-expression + +operator-expression = import-alt-expression + +; 
Nonempty-whitespace to disambiguate `http://a/a?a` +import-alt-expression = or-expression *(whsp "?" whsp1 or-expression) +or-expression = plus-expression *(whsp "||" whsp plus-expression) +; Nonempty-whitespace to disambiguate `f +2` +plus-expression = text-append-expression *(whsp "+" whsp1 text-append-expression) +text-append-expression = list-append-expression *(whsp "++" whsp list-append-expression) +list-append-expression = and-expression *(whsp "#" whsp and-expression) +and-expression = combine-expression *(whsp "&&" whsp combine-expression) +combine-expression = prefer-expression *(whsp combine whsp prefer-expression) +prefer-expression = combine-types-expression *(whsp prefer whsp combine-types-expression) +combine-types-expression = times-expression *(whsp combine-types whsp times-expression) +times-expression = equal-expression *(whsp "*" whsp equal-expression) +equal-expression = not-equal-expression *(whsp "==" whsp not-equal-expression) +not-equal-expression = equivalent-expression *(whsp "!=" whsp equivalent-expression) +equivalent-expression = with-expression *(whsp equivalent whsp with-expression) + +with-expression = application-expression *(whsp1 with whsp1 with-clause) + +with-clause = + any-label-or-some *(whsp "." 
whsp any-label-or-some) whsp "=" whsp application-expression + + +; Import expressions need to be separated by some whitespace, otherwise there +; would be ambiguity: `./ab` could be interpreted as "import the file `./ab`", +; or "apply the import `./a` to label `b`" +application-expression = + first-application-expression *(whsp1 import-expression) + +first-application-expression = + ; "merge e1 e2" + merge whsp1 import-expression whsp1 import-expression + + ; "Some e" + / Some whsp1 import-expression + + ; "toMap e" + / toMap whsp1 import-expression + + / import-expression + +import-expression = import / completion-expression + +completion-expression = + selector-expression [ whsp complete whsp selector-expression ] + +; `record.field` extracts one field of a record +; +; `record.{ field0, field1, field2 }` projects out several fields of a record +; +; NOTE: Backtrack when parsing the `*("." ...)`. The reason why is that you +; can't tell from parsing just the period whether "foo." will become "foo.bar" +; (i.e. accessing field `bar` of the record `foo`) or `foo./bar` (i.e. applying +; the function `foo` to the relative path `./bar`) +selector-expression = primitive-expression *(whsp "." whsp selector) + +selector = any-label / labels / type-selector + +labels = "{" whsp [ any-label-or-some whsp *("," whsp any-label-or-some whsp) ] "}" + +type-selector = "(" whsp expression whsp ")" +; NOTE: Backtrack when parsing the first three alternatives (i.e. the numeric +; literals). 
This is because they share leading characters in common +primitive-expression = + ; "2.0" + double-literal + + ; "2" + / natural-literal + + ; "+2" + / integer-literal + + ; '"ABC"' + / text-literal + + ; "{ foo = 1 , bar = True }" + ; "{ foo : Integer, bar : Bool }" + / "{" whsp [ "," whsp ] record-type-or-literal whsp "}" + + ; "< Foo : Integer | Bar : Bool >" + ; "< Foo | Bar : Bool >" + / "<" whsp [ "|" whsp ] union-type whsp ">" + + ; "[1, 2, 3]" + / non-empty-list-literal + + ; "x" + ; "x@2" + / identifier + + ; "( e )" + / "(" complete-expression ")" + + +record-type-or-literal = + empty-record-literal + / non-empty-record-type-or-literal + / empty-record-type + +empty-record-literal = "=" +empty-record-type = "" + +non-empty-record-type-or-literal = + (non-empty-record-type / non-empty-record-literal) + +non-empty-record-type = + record-type-entry *(whsp "," whsp record-type-entry) + +record-type-entry = any-label-or-some whsp ":" whsp1 expression + +non-empty-record-literal = + record-literal-entry *(whsp "," whsp record-literal-entry) + +record-literal-entry = + any-label-or-some (record-literal-normal-entry / record-literal-punned-entry) + +record-literal-normal-entry = + *(whsp "." 
/// Checks that the local copy of the grammar file is in sync with the one from dhall-lang.
///
/// The comparison is delegated to `git diff --no-index --ignore-space-change`, so changes
/// in the amount of whitespace are tolerated while any other difference is reported.
/// Both paths are relative to the current working directory.
///
/// # Panics
///
/// Panics if either grammar file is missing, if `git` cannot be spawned, if `git diff`
/// exits unsuccessfully without printing a diff, or if the two files differ.
#[test]
fn test_grammar_files_in_sync() {
    use std::process::Command;

    let spec_abnf_path = "../dhall-lang/standard/dhall.abnf";
    let local_abnf_path = "src/syntax/text/dhall.abnf";

    // Report a missing input explicitly: otherwise `git diff` exits unsuccessfully with an
    // empty stdout and the failure would be misreported as "the files differ".
    for path in &[spec_abnf_path, local_abnf_path] {
        assert!(
            std::path::Path::new(path).is_file(),
            "cannot check that the grammar files are in sync: `{}` was not found \
             (paths are resolved relative to the current working directory)",
            path
        );
    }

    let out = Command::new("git")
        .args(&["diff", "--no-index", "--ignore-space-change", "--color", "--"])
        .arg(spec_abnf_path)
        .arg(local_abnf_path)
        .output()
        .expect("failed to run `git diff` command");

    if out.status.success() {
        return;
    }

    let diff = String::from_utf8_lossy(&out.stdout);
    let errors = String::from_utf8_lossy(&out.stderr);

    // An unsuccessful exit with nothing on stdout is a failure of the command itself rather
    // than a reported difference, so surface stderr instead of claiming the files differ.
    if diff.is_empty() {
        panic!(
            "`git diff` failed ({}) without reporting any difference between `{}` and `{}`:\n{}",
            out.status, spec_abnf_path, local_abnf_path, errors
        );
    }

    panic!(
        "The local dhall.abnf file differs from the one from \
         dhall-lang!\n{}{}",
        diff, errors
    );
}