diff options
author | Nadrieril | 2020-05-30 15:05:36 +0100 |
---|---|---|
committer | Nadrieril | 2020-05-30 15:06:30 +0100 |
commit | a35de29355dc85f63e0e8514a2e447bf5322ba20 (patch) | |
tree | 9eec496ad246e2831d42291d6078c84c8a73879f | |
parent | 97e46bef0fe1729d5de0a9a40ba939842023fea7 (diff) |
Commit grammar file locally
The crate publishing process does not allow access to files outside the
current crate.
-rw-r--r-- | dhall/build.rs | 2 | ||||
-rw-r--r-- | dhall/src/syntax/text/dhall.abnf | 936 | ||||
-rw-r--r-- | dhall/src/syntax/text/parser.rs | 29 |
3 files changed, 966 insertions, 1 deletions
diff --git a/dhall/build.rs b/dhall/build.rs index 71c634b..a12e91b 100644 --- a/dhall/build.rs +++ b/dhall/build.rs @@ -348,7 +348,7 @@ fn generate_tests() -> std::io::Result<()> { fn convert_abnf_to_pest() -> std::io::Result<()> { let out_dir = env::var("OUT_DIR").unwrap(); - let abnf_path = "../dhall-lang/standard/dhall.abnf"; + let abnf_path = "src/syntax/text/dhall.abnf"; let visibility_path = "src/syntax/text/dhall.pest.visibility"; let grammar_path = Path::new(&out_dir).join("dhall.pest"); println!("cargo:rerun-if-changed={}", abnf_path); diff --git a/dhall/src/syntax/text/dhall.abnf b/dhall/src/syntax/text/dhall.abnf new file mode 100644 index 0000000..1c3a980 --- /dev/null +++ b/dhall/src/syntax/text/dhall.abnf @@ -0,0 +1,936 @@ +; ABNF syntax based on RFC 5234
+;
+; The character encoding for Dhall is UTF-8
+;
+; Some notes on implementing this grammar:
+;
+; First, do not use a lexer to tokenize the file before parsing. Instead, treat
+; the individual characters of the file as the tokens to feed into the parser.
+; You should not use a lexer because Dhall's grammar supports two features which
+; cannot be correctly supported by a lexer:
+;
+; * String interpolation (i.e. "foo ${Natural/toInteger bar} baz")
+; * Nested block comments (i.e. "{- foo {- bar -} baz -}")
+;
+; Second, this grammar assumes that your parser can backtrack and/or try
+; multiple parses simultaneously. For example, consider this expression:
+;
+; List ./MyType
+;
+; A parser might first try to parse the period as the beginning of a field
+; selector, only to realize immediately afterwards that `/MyType` is not a valid
+; name for a field. A conforming parser must backtrack so that the expression
+; `./MyType` can instead be correctly interpreted as a relative path
+;
+; Third, if there are multiple valid parses then prefer the first parse
+; according to the ordering of alternatives. That is, the order of evaluation
+; of the alternatives is left-to-right.
+;
+; For example, the grammar for single quoted string literals is:
+;
+; single-quote-continue =
+; "'''" single-quote-continue
+; / "${" complete-expression "}" single-quote-continue
+; / "''${" single-quote-continue
+; / "''"
+; / %x20-10FFFF single-quote-continue
+; / tab single-quote-continue
+; / end-of-line single-quote-continue
+;
+; single-quote-literal = "''" single-quote-continue
+;
+; ... which permits valid parses for the following code:
+;
+; "''''''''''''''''"
+;
+; If you tried to parse all alternatives then there are at least two valid
+; interpretations for the above code:
+;
+; * A single quoted literal with four escape sequences of the form "'''"
+; * i.e. "''" followed by "'''" four times in a row followed by "''"
+; * Four empty single quoted literals
+; * i.e. "''''" four times in a row
+;
+; The correct interpretation is the first one because parsing the escape
+; sequence "'''" takes precedence over parsing the termination sequence "''",
+; according to the order of the alternatives in the `single-quote-continue`
+; rule.
+;
+; Some parsing libraries do not backtrack by default but allow the user to
+; selectively backtrack in certain parts of the grammar. Usually parsing
+; libraries do this to improve efficiency and error messages. Dhall's grammar
+; takes that into account by minimizing the number of rules that require the
+; parser to backtrack and comments below will highlight where you need to
+; explicitly backtrack
+;
+; Specifically, if you see an uninterrupted literal in a grammar rule such as:
+;
+; "->"
+;
+; ... or:
+;
+; %x66.6f.72.61.6c.6c
+;
+; ... then that string literal is parsed as a single unit, meaning that you
+; should backtrack if you parse only part of the literal
+;
+; In all other cases you can assume that you do not need to backtrack unless
+; there is a comment explicitly asking you to backtrack
+;
+; When parsing a repeated construct, prefer alternatives that parse as many
+; repetitions as possible. Or in other words:
+;
+; [a] = a / ""
+;
+; a* = a* a / ""
+;
+; Note that the latter rule also specifies that repetition produces
+; left-associated expressions. For example, function application is
+; left-associative and all operators are left-associative when they are not
+; parenthesized.
+;
+; Additionally, try alternatives in an order that minimizes backtracking
+; according to the following rule:
+;
+; (a / b) (c / d) = a c / a d / b c / b d
+
+; NOTE: There are many line endings in the wild
+;
+; See: https://en.wikipedia.org/wiki/Newline
+;
+; For simplicity this supports Unix and Windows line-endings, which are the most
+; common
+end-of-line =
+ %x0A ; "\n"
+ / %x0D.0A ; "\r\n"
+
+; This rule matches all characters that are:
+;
+; * not ASCII
+; * not part of a surrogate pair
+; * not a "non-character"
+valid-non-ascii =
+ %x80-D7FF
+ ; %xD800-DFFF = surrogate pairs
+ / %xE000-FFFD
+ ; %xFFFE-FFFF = non-characters
+ / %x10000-1FFFD
+ ; %x1FFFE-1FFFF = non-characters
+ / %x20000-2FFFD
+ ; %x2FFFE-2FFFF = non-characters
+ / %x30000-3FFFD
+ ; %x3FFFE-3FFFF = non-characters
+ / %x40000-4FFFD
+ ; %x4FFFE-4FFFF = non-characters
+ / %x50000-5FFFD
+ ; %x5FFFE-5FFFF = non-characters
+ / %x60000-6FFFD
+ ; %x6FFFE-6FFFF = non-characters
+ / %x70000-7FFFD
+ ; %x7FFFE-7FFFF = non-characters
+ / %x80000-8FFFD
+ ; %x8FFFE-8FFFF = non-characters
+ / %x90000-9FFFD
+ ; %x9FFFE-9FFFF = non-characters
+ / %xA0000-AFFFD
+ ; %xAFFFE-AFFFF = non-characters
+ / %xB0000-BFFFD
+ ; %xBFFFE-BFFFF = non-characters
+ / %xC0000-CFFFD
+ ; %xCFFFE-CFFFF = non-characters
+ / %xD0000-DFFFD
+ ; %xDFFFE-DFFFF = non-characters
+ / %xE0000-EFFFD
+ ; %xEFFFE-EFFFF = non-characters
+ / %xF0000-FFFFD
+ ; %xFFFFE-FFFFF = non-characters
+ / %x100000-10FFFD
+ ; %x10FFFE-10FFFF = non-characters
+
+tab = %x09 ; "\t"
+
+block-comment = "{-" block-comment-continue
+
+block-comment-char =
+ %x20-7F
+ / valid-non-ascii
+ / tab
+ / end-of-line
+
+block-comment-continue =
+ "-}"
+ / block-comment block-comment-continue
+ / block-comment-char block-comment-continue
+
+not-end-of-line = %x20-7F / valid-non-ascii / tab
+
+; NOTE: Slightly different from Haskell-style single-line comments because this
+; does not require a space after the dashes
+line-comment = "--" *not-end-of-line end-of-line
+
+whitespace-chunk =
+ " "
+ / tab
+ / end-of-line
+ / line-comment
+ / block-comment
+
+whsp = *whitespace-chunk
+
+; nonempty whitespace
+whsp1 = 1*whitespace-chunk
+
+; Uppercase or lowercase ASCII letter
+ALPHA = %x41-5A / %x61-7A
+
+; ASCII digit
+DIGIT = %x30-39 ; 0-9
+
+ALPHANUM = ALPHA / DIGIT
+
+HEXDIG = DIGIT / "A" / "B" / "C" / "D" / "E" / "F"
+
+; A simple label cannot be one of the reserved keywords
+; listed in the `keyword` rule.
+; A PEG parser could use negative lookahead to
+; enforce this, e.g. as follows:
+; simple-label =
+; keyword 1*simple-label-next-char
+; / !keyword (simple-label-first-char *simple-label-next-char)
+simple-label-first-char = ALPHA / "_"
+simple-label-next-char = ALPHANUM / "-" / "/" / "_"
+simple-label = simple-label-first-char *simple-label-next-char
+
+quoted-label-char =
+ %x20-5F
+ ; %x60 = '`'
+ / %x61-7E
+
+quoted-label = 1*quoted-label-char
+
+; NOTE: Dhall does not support Unicode labels, mainly to minimize the potential
+; for code obfuscation
+label = ("`" quoted-label "`" / simple-label)
+
+; A nonreserved-label cannot be any of the reserved identifiers for builtins
+; (unless quoted).
+; Their list can be found in the `builtin` rule.
+; The only place where this restriction applies is bound variables.
+; A PEG parser could use negative lookahead to avoid parsing those identifiers,
+; e.g. as follows:
+; nonreserved-label =
+; builtin 1*simple-label-next-char
+; / !builtin label
+nonreserved-label = label
+
+; An any-label is allowed to be one of the reserved identifiers (but not a keyword).
+any-label = label
+
+; Allow specifically `Some` in record and union labels.
+any-label-or-some = any-label / Some
+
+; Dhall's double-quoted strings are similar to JSON strings (RFC7159) except:
+;
+; * Dhall strings support string interpolation
+;
+; * Dhall strings also support escaping string interpolation by adding a new
+; `\$` escape sequence
+;
+; * Dhall strings also allow Unicode escape sequences of the form `\u{XXX}`
+double-quote-chunk =
+ interpolation
+ ; '\' Beginning of escape sequence
+ / %x5C double-quote-escaped
+ / double-quote-char
+
+double-quote-escaped =
+ %x22 ; '"' quotation mark U+0022
+ / %x24 ; '$' dollar sign U+0024
+ / %x5C ; '\' reverse solidus U+005C
+ / %x2F ; '/' solidus U+002F
+ / %x62 ; 'b' backspace U+0008
+ / %x66 ; 'f' form feed U+000C
+ / %x6E ; 'n' line feed U+000A
+ / %x72 ; 'r' carriage return U+000D
+ / %x74 ; 't' tab U+0009
+ / %x75 unicode-escape ; 'uXXXX' / 'u{XXXX}' U+XXXX
+
+; Valid Unicode escape sequences are as follows:
+;
+; * Exactly 4 hexadecimal digits without braces:
+; `\uXXXX`
+; * 1-6 hexadecimal digits within braces (with optional zero padding):
+; `\u{XXXX}`, `\u{000X}`, `\u{XXXXX}`, `\u{00000XXXXX}`, etc.
+; Any number of leading zeros are allowed within the braces preceding the 1-6
+; digits specifying the codepoint.
+;
+; From these sequences, the parser must also reject any codepoints that are in
+; the following ranges:
+;
+; * Surrogate pairs: `%xD800-DFFF`
+; * Non-characters: `%xNFFFE-NFFFF` / `%x10FFFE-10FFFF` for `N` in `{ 0 .. F }`
+;
+; See the `valid-non-ascii` rule for the exact ranges that are not allowed
+unicode-escape = unbraced-escape / "{" braced-escape "}"
+
+; All valid last 4 digits for unicode codepoints (outside Plane 0): `0000-FFFD`
+unicode-suffix = (DIGIT / "A" / "B" / "C" / "D" / "E") 3HEXDIG
+ / "F" 2HEXDIG (DIGIT / "A" / "B" / "C" / "D")
+
+; All 4-hex digit unicode escape sequences that are not:
+;
+; * Surrogate pairs (i.e. `%xD800-DFFF`)
+; * Non-characters (i.e. `%xFFFE-FFFF`)
+;
+unbraced-escape =
+ (DIGIT / "A" / "B" / "C") 3HEXDIG
+ / "D" ("0" / "1" / "2" / "3" / "4" / "5" / "6" / "7") HEXDIG HEXDIG
+ ; %xD800-DFFF Surrogate pairs
+ / "E" 3HEXDIG
+ / "F" 2HEXDIG (DIGIT / "A" / "B" / "C" / "D")
+ ; %xFFFE-FFFF Non-characters
+
+; All 1-6 digit unicode codepoints that are not:
+;
+; * Surrogate pairs: `%xD800-DFFF`
+; * Non-characters: `%xNFFFE-NFFFF` / `%x10FFFE-10FFFF` for `N` in `{ 0 .. F }`
+;
+; See the `valid-non-ascii` rule for the exact ranges that are not allowed
+braced-codepoint =
+ ("1" / "2" / "3" / "4" / "5" / "6" / "7" / "8" / "9" / "A" / "B" / "C" / "D" / "E" / "F" / "10") unicode-suffix; (Planes 1-16)
+ / unbraced-escape ; (Plane 0)
+ / 1*3HEXDIG ; %x000-FFF
+
+; Allow zero padding for braced codepoints
+braced-escape = *"0" braced-codepoint
+
+; Printable characters except double quote and backslash
+double-quote-char =
+ %x20-21
+ ; %x22 = '"'
+ / %x23-5B
+ ; %x5C = "\"
+ / %x5D-7F
+ / valid-non-ascii
+
+double-quote-literal = %x22 *double-quote-chunk %x22
+
+; NOTE: The only way to end a single-quote string literal with a single quote is
+; to either interpolate the single quote, like this:
+;
+; ''ABC${"'"}''
+;
+; ... or concatenate another string, like this:
+;
+; ''ABC'' ++ "'"
+;
+; If you try to end the string literal with a single quote then you get "'''",
+; which is interpreted as an escaped pair of single quotes
+single-quote-continue =
+ interpolation single-quote-continue
+ / escaped-quote-pair single-quote-continue
+ / escaped-interpolation single-quote-continue
+ / "''" ; End of text literal
+ / single-quote-char single-quote-continue
+
+; Escape two single quotes (i.e. replace this sequence with "''")
+escaped-quote-pair = "'''"
+
+; Escape interpolation (i.e. replace this sequence with "${")
+escaped-interpolation = "''${"
+
+single-quote-char =
+ %x20-7F
+ / valid-non-ascii
+ / tab
+ / end-of-line
+
+single-quote-literal = "''" end-of-line single-quote-continue
+
+interpolation = "${" complete-expression "}"
+
+text-literal = (double-quote-literal / single-quote-literal)
+
+; RFC 5234 interprets string literals as case-insensitive and recommends using
+; hex instead for case-sensitive strings
+;
+; If you don't feel like reading hex, these are all the same as the rule name.
+; Keywords that should never be parsed as identifiers
+if = %x69.66
+then = %x74.68.65.6e
+else = %x65.6c.73.65
+let = %x6c.65.74
+in = %x69.6e
+as = %x61.73
+using = %x75.73.69.6e.67
+merge = %x6d.65.72.67.65
+missing = %x6d.69.73.73.69.6e.67
+Infinity = %x49.6e.66.69.6e.69.74.79
+NaN = %x4e.61.4e
+Some = %x53.6f.6d.65
+toMap = %x74.6f.4d.61.70
+assert = %x61.73.73.65.72.74
+forall = %x2200 / %x66.6f.72.61.6c.6c ; "∀" / "forall"
+with = %x77.69.74.68
+
+; Unused rule that could be used as negative lookahead in the
+; `simple-label` rule for parsers that support this.
+keyword =
+ if / then / else
+ / let / in
+ / using / missing
+ / assert / as
+ / Infinity / NaN
+ / merge / Some / toMap
+ / forall
+ / with
+
+builtin =
+ Natural-fold
+ / Natural-build
+ / Natural-isZero
+ / Natural-even
+ / Natural-odd
+ / Natural-toInteger
+ / Natural-show
+ / Integer-toDouble
+ / Integer-show
+ / Integer-negate
+ / Integer-clamp
+ / Natural-subtract
+ / Double-show
+ / List-build
+ / List-fold
+ / List-length
+ / List-head
+ / List-last
+ / List-indexed
+ / List-reverse
+ / Optional-fold
+ / Optional-build
+ / Text-show
+ / Bool
+ / True
+ / False
+ / Optional
+ / None
+ / Natural
+ / Integer
+ / Double
+ / Text
+ / List
+ / Type
+ / Kind
+ / Sort
+
+; Reserved identifiers, needed for some special cases of parsing
+Optional = %x4f.70.74.69.6f.6e.61.6c
+Text = %x54.65.78.74
+List = %x4c.69.73.74
+Location = %x4c.6f.63.61.74.69.6f.6e
+
+; Remainder of the reserved identifiers, needed for the `builtin` rule
+Bool = %x42.6f.6f.6c
+True = %x54.72.75.65
+False = %x46.61.6c.73.65
+None = %x4e.6f.6e.65
+Natural = %x4e.61.74.75.72.61.6c
+Integer = %x49.6e.74.65.67.65.72
+Double = %x44.6f.75.62.6c.65
+Type = %x54.79.70.65
+Kind = %x4b.69.6e.64
+Sort = %x53.6f.72.74
+Natural-fold = %x4e.61.74.75.72.61.6c.2f.66.6f.6c.64
+Natural-build = %x4e.61.74.75.72.61.6c.2f.62.75.69.6c.64
+Natural-isZero = %x4e.61.74.75.72.61.6c.2f.69.73.5a.65.72.6f
+Natural-even = %x4e.61.74.75.72.61.6c.2f.65.76.65.6e
+Natural-odd = %x4e.61.74.75.72.61.6c.2f.6f.64.64
+Natural-toInteger = %x4e.61.74.75.72.61.6c.2f.74.6f.49.6e.74.65.67.65.72
+Natural-show = %x4e.61.74.75.72.61.6c.2f.73.68.6f.77
+Natural-subtract = %x4e.61.74.75.72.61.6c.2f.73.75.62.74.72.61.63.74
+Integer-toDouble = %x49.6e.74.65.67.65.72.2f.74.6f.44.6f.75.62.6c.65
+Integer-show = %x49.6e.74.65.67.65.72.2f.73.68.6f.77
+Integer-negate = %x49.6e.74.65.67.65.72.2f.6e.65.67.61.74.65
+Integer-clamp = %x49.6e.74.65.67.65.72.2f.63.6c.61.6d.70
+Double-show = %x44.6f.75.62.6c.65.2f.73.68.6f.77
+List-build = %x4c.69.73.74.2f.62.75.69.6c.64
+List-fold = %x4c.69.73.74.2f.66.6f.6c.64
+List-length = %x4c.69.73.74.2f.6c.65.6e.67.74.68
+List-head = %x4c.69.73.74.2f.68.65.61.64
+List-last = %x4c.69.73.74.2f.6c.61.73.74
+List-indexed = %x4c.69.73.74.2f.69.6e.64.65.78.65.64
+List-reverse = %x4c.69.73.74.2f.72.65.76.65.72.73.65
+Optional-fold = %x4f.70.74.69.6f.6e.61.6c.2f.66.6f.6c.64
+Optional-build = %x4f.70.74.69.6f.6e.61.6c.2f.62.75.69.6c.64
+Text-show = %x54.65.78.74.2f.73.68.6f.77
+
+; Operators
+combine = %x2227 / "/\"
+combine-types = %x2A53 / "//\\"
+equivalent = %x2261 / "==="
+prefer = %x2AFD / "//"
+lambda = %x3BB / "\"
+arrow = %x2192 / "->"
+complete = "::"
+
+exponent = "e" [ "+" / "-" ] 1*DIGIT
+
+numeric-double-literal = [ "+" / "-" ] 1*DIGIT ( "." 1*DIGIT [ exponent ] / exponent)
+
+minus-infinity-literal = "-" Infinity
+plus-infinity-literal = Infinity
+
+double-literal =
+ ; "2.0"
+ numeric-double-literal
+ ; "-Infinity"
+ / minus-infinity-literal
+ ; "Infinity"
+ / plus-infinity-literal
+ ; "NaN"
+ / NaN
+
+natural-literal =
+ ; Hexadecimal with "0x" prefix
+ "0" %x78 1*HEXDIG
+ ; Decimal; leading 0 digits are not allowed
+ / ("1" / "2" / "3" / "4" / "5" / "6" / "7" / "8" / "9") *DIGIT
+ ; ... except for 0 itself
+ / "0"
+
+integer-literal = ( "+" / "-" ) natural-literal
+
+; If the identifier matches one of the names in the `builtin` rule, then it is a
+; builtin, and should be treated as the corresponding item in the list of
+; "Reserved identifiers for builtins" specified in the `standard/README.md` document.
+; It is a syntax error to specify a de Bruijn index in this case.
+; Otherwise, this is a variable with name and index matching the label and index.
+identifier = variable / builtin
+
+variable = nonreserved-label [ whsp "@" whsp natural-literal ]
+
+; Printable ASCII characters other than the space and any of:
+;   " # ( ) , / < > ? [ \ ] { }
+;
+; Excluding those characters ensures that paths don't have to end with trailing
+; whitespace most of the time
+path-character =
+ ; %x20 = " "
+ %x21
+ ; %x22 = "\""
+ ; %x23 = "#"
+ / %x24-27
+ ; %x28 = "("
+ ; %x29 = ")"
+ / %x2A-2B
+ ; %x2C = ","
+ / %x2D-2E
+ ; %x2F = "/"
+ / %x30-3B
+ ; %x3C = "<"
+ / %x3D
+ ; %x3E = ">"
+ ; %x3F = "?"
+ / %x40-5A
+ ; %x5B = "["
+ ; %x5C = "\"
+ ; %x5D = "]"
+ / %x5E-7A
+ ; %x7B = "{"
+ / %x7C
+ ; %x7D = "}"
+ / %x7E
+
+quoted-path-character =
+ %x20-21
+ ; %x22 = "\""
+ / %x23-2E
+ ; %x2F = "/"
+ / %x30-7F
+ / valid-non-ascii
+
+unquoted-path-component = 1*path-character
+quoted-path-component = 1*quoted-path-character
+
+path-component = "/" ( unquoted-path-component / %x22 quoted-path-component %x22 )
+
+; The last path-component matched by this rule is referred to as "file" in the semantics,
+; and the other path-components as "directory".
+path = 1*path-component
+
+local =
+ parent-path
+ / here-path
+ / home-path
+ ; NOTE: Backtrack if parsing this alternative fails
+ ;
+ ; This is because the first character of this alternative will be "/", but
+ ; if the second character is "/" or "\" then this should have been parsed
+ ; as an operator instead of a path
+ / absolute-path
+
+parent-path = ".." path ; Relative path
+here-path = "." path ; Relative path
+home-path = "~" path ; Home-anchored path
+absolute-path = path ; Absolute path
+
+; `http[s]` URI grammar based on RFC7230 and RFC 3986 with some differences
+; noted below
+
+scheme = %x68.74.74.70 [ %x73 ] ; "http" [ "s" ]
+
+; NOTE: This does not match the official grammar for a URI. Specifically:
+;
+; * path segments may be quoted instead of using percent-encoding
+; * this does not support fragment identifiers, which have no meaning within
+; Dhall expressions and do not affect import resolution
+; * the characters "(" ")" and "," are not included in the `sub-delims` rule:
+; in particular, these characters can't be used in authority, path or query
+; strings. This is because those characters have other meaning in Dhall
+; and it would be confusing for the comma in
+; [http://example.com/foo, bar]
+; to be part of the URL instead of part of the list. If you need a URL
+; which contains parens or a comma, you must percent-encode them.
+;
+; Reserved characters in quoted path components should be percent-encoded
+; according to https://tools.ietf.org/html/rfc3986#section-2
+http-raw = scheme "://" authority url-path [ "?" query ]
+
+; Temporary rule to allow old-style `path-component`s and RFC3986 `segment`s in
+; the same grammar. Eventually we can just use `path-abempty` from the same
+; RFC. See issue #581
+
+url-path = *(path-component / "/" segment)
+
+; NOTE: Backtrack if parsing the optional user info prefix fails
+authority = [ userinfo "@" ] host [ ":" port ]
+
+userinfo = *( unreserved / pct-encoded / sub-delims / ":" )
+
+host = IP-literal / IPv4address / domain
+
+port = *DIGIT
+
+IP-literal = "[" ( IPv6address / IPvFuture ) "]"
+
+IPvFuture = "v" 1*HEXDIG "." 1*( unreserved / sub-delims / ":" )
+
+; NOTE: Backtrack when parsing each alternative
+IPv6address = 6( h16 ":" ) ls32
+ / "::" 5( h16 ":" ) ls32
+ / [ h16 ] "::" 4( h16 ":" ) ls32
+ / [ h16 *1( ":" h16 ) ] "::" 3( h16 ":" ) ls32
+ / [ h16 *2( ":" h16 ) ] "::" 2( h16 ":" ) ls32
+ / [ h16 *3( ":" h16 ) ] "::" h16 ":" ls32
+ / [ h16 *4( ":" h16 ) ] "::" ls32
+ / [ h16 *5( ":" h16 ) ] "::" h16
+ / [ h16 *6( ":" h16 ) ] "::"
+
+h16 = 1*4HEXDIG
+
+ls32 = h16 ":" h16 / IPv4address
+
+IPv4address = dec-octet "." dec-octet "." dec-octet "." dec-octet
+
+; NOTE: Backtrack when parsing these alternatives
+dec-octet = "25" %x30-35 ; 250-255
+ / "2" %x30-34 DIGIT ; 200-249
+ / "1" 2DIGIT ; 100-199
+ / %x31-39 DIGIT ; 10-99
+ / DIGIT ; 0-9
+
+; Look in RFC3986 3.2.2 for
+; "A registered name intended for lookup in the DNS"
+domain = domainlabel *("." domainlabel) [ "." ]
+
+domainlabel = 1*ALPHANUM *(1*"-" 1*ALPHANUM)
+
+segment = *pchar
+
+pchar = unreserved / pct-encoded / sub-delims / ":" / "@"
+
+query = *( pchar / "/" / "?" )
+
+pct-encoded = "%" HEXDIG HEXDIG
+
+unreserved = ALPHANUM / "-" / "." / "_" / "~"
+
+; this is the RFC3986 sub-delims rule, without "(", ")" or ","
+; see the comments preceding the `http-raw` rule above
+sub-delims = "!" / "$" / "&" / "'" / "*" / "+" / ";" / "="
+
+http = http-raw [ whsp using whsp1 import-expression ]
+
+; Dhall supports unquoted environment variables that are Bash-compliant or
+; quoted environment variables that are POSIX-compliant
+env = "env:"
+ ( bash-environment-variable
+ / %x22 posix-environment-variable %x22
+ )
+
+; Bash supports a restricted subset of POSIX environment variables. From the
+; Bash `man` page, an environment variable name is:
+;
+; > A word consisting only of alphanumeric characters and under-scores, and
+; > beginning with an alphabetic character or an under-score
+bash-environment-variable = (ALPHA / "_") *(ALPHANUM / "_")
+
+; The POSIX standard is significantly more flexible about legal environment
+; variable names, which can contain alerts (i.e. '\a'), whitespace, or
+; punctuation, for example. The POSIX standard says about environment variable
+; names:
+;
+; > The value of an environment variable is a string of characters. For a
+; > C-language program, an array of strings called the environment shall be made
+; > available when a process begins. The array is pointed to by the external
+; > variable environ, which is defined as:
+; >
+; > extern char **environ;
+; >
+; > These strings have the form name=value; names shall not contain the
+; > character '='. For values to be portable across systems conforming to IEEE
+; > Std 1003.1-2001, the value shall be composed of characters from the portable
+; > character set (except NUL and as indicated below).
+;
+; Note that the standard does not explicitly state that the name must have at
+; least one character, but `env` does not appear to support this and `env`
+; claims to be POSIX-compliant. To be safe, Dhall requires at least one
+; character like `env`
+posix-environment-variable = 1*posix-environment-variable-character
+
+; These are all the characters from the POSIX Portable Character Set except for
+; '\0' (NUL) and '='. Note that the POSIX standard does not explicitly state
+; that environment variable names cannot have NUL. However, this is implicit
+; in the fact that environment variables are passed to the program as
+; NUL-terminated `name=value` strings, which implies that the `name` portion of
+; the string cannot have NUL characters
+posix-environment-variable-character =
+ %x5C ; '\' Beginning of escape sequence
+ ( %x22 ; '"' quotation mark U+0022
+ / %x5C ; '\' reverse solidus U+005C
+ / %x61 ; 'a' alert U+0007
+ / %x62 ; 'b' backspace U+0008
+ / %x66 ; 'f' form feed U+000C
+ / %x6E ; 'n' line feed U+000A
+ / %x72 ; 'r' carriage return U+000D
+ / %x74 ; 't' tab U+0009
+ / %x76 ; 'v' vertical tab U+000B
+ )
+ ; Printable characters except double quote, backslash and equals
+ / %x20-21
+ ; %x22 = '"'
+ / %x23-3C
+ ; %x3D = '='
+ / %x3E-5B
+ ; %x5C = "\"
+ / %x5D-7E
+
+import-type = missing / local / http / env
+
+hash = %x73.68.61.32.35.36.3a 64HEXDIG ; "sha256:XXX...XXX"
+
+import-hashed = import-type [ whsp1 hash ]
+
+; "http://example.com"
+; "./foo/bar"
+; "env:FOO"
+import = import-hashed [ whsp as whsp1 (Text / Location) ]
+
+expression =
+ ; "\(x : a) -> b"
+ lambda whsp "(" whsp nonreserved-label whsp ":" whsp1 expression whsp ")" whsp arrow whsp expression
+
+ ; "if a then b else c"
+ / if whsp1 expression whsp then whsp1 expression whsp else whsp1 expression
+
+ ; "let x : t = e1 in e2"
+ ; "let x = e1 in e2"
+ ; We allow dropping the `in` between adjacent let-expressions; the following are equivalent:
+ ; "let x = e1 let y = e2 in e3"
+ ; "let x = e1 in let y = e2 in e3"
+ / 1*let-binding in whsp1 expression
+
+ ; "forall (x : a) -> b"
+ / forall whsp "(" whsp nonreserved-label whsp ":" whsp1 expression whsp ")" whsp arrow whsp expression
+
+ ; "a -> b"
+ ;
+ ; NOTE: Backtrack if parsing this alternative fails
+ / operator-expression whsp arrow whsp expression
+
+ ; "merge e1 e2 : t"
+ ;
+ ; NOTE: Backtrack if parsing this alternative fails since we can't tell
+ ; from the keyword whether there will be a type annotation or not
+ / merge whsp1 import-expression whsp1 import-expression whsp ":" whsp1 application-expression
+
+ ; "[] : t"
+ ;
+ ; NOTE: Backtrack if parsing this alternative fails since we can't tell
+ ; from the opening bracket whether or not this will be an empty list or
+ ; a non-empty list
+ / empty-list-literal
+
+ ; "toMap e : t"
+ ;
+ ; NOTE: Backtrack if parsing this alternative fails since we can't tell
+ ; from the keyword whether there will be a type annotation or not
+ / toMap whsp1 import-expression whsp ":" whsp1 application-expression
+
+ ; "assert : Natural/even 1 === False"
+ / assert whsp ":" whsp1 expression
+
+ ; "x : t"
+ / annotated-expression
+
+; Nonempty-whitespace to disambiguate `env:VARIABLE` from type annotations
+annotated-expression = operator-expression [ whsp ":" whsp1 expression ]
+
+; "let x = e1"
+let-binding = let whsp1 nonreserved-label whsp [ ":" whsp1 expression whsp ] "=" whsp expression whsp
+
+; "[] : t"
+empty-list-literal =
+ "[" whsp [ "," whsp ] "]" whsp ":" whsp1 application-expression
+
+operator-expression = import-alt-expression
+
+; Nonempty-whitespace to disambiguate `http://a/a?a`
+import-alt-expression = or-expression *(whsp "?" whsp1 or-expression)
+or-expression = plus-expression *(whsp "||" whsp plus-expression)
+; Nonempty-whitespace to disambiguate `f +2`
+plus-expression = text-append-expression *(whsp "+" whsp1 text-append-expression)
+text-append-expression = list-append-expression *(whsp "++" whsp list-append-expression)
+list-append-expression = and-expression *(whsp "#" whsp and-expression)
+and-expression = combine-expression *(whsp "&&" whsp combine-expression)
+combine-expression = prefer-expression *(whsp combine whsp prefer-expression)
+prefer-expression = combine-types-expression *(whsp prefer whsp combine-types-expression)
+combine-types-expression = times-expression *(whsp combine-types whsp times-expression)
+times-expression = equal-expression *(whsp "*" whsp equal-expression)
+equal-expression = not-equal-expression *(whsp "==" whsp not-equal-expression)
+not-equal-expression = equivalent-expression *(whsp "!=" whsp equivalent-expression)
+equivalent-expression = with-expression *(whsp equivalent whsp with-expression)
+
+with-expression = application-expression *(whsp1 with whsp1 with-clause)
+
+with-clause =
+ any-label-or-some *(whsp "." whsp any-label-or-some) whsp "=" whsp application-expression
+
+
+; Import expressions need to be separated by some whitespace, otherwise there
+; would be ambiguity: `./ab` could be interpreted as "import the file `./ab`",
+; or "apply the import `./a` to label `b`"
+application-expression =
+ first-application-expression *(whsp1 import-expression)
+
+first-application-expression =
+ ; "merge e1 e2"
+ merge whsp1 import-expression whsp1 import-expression
+
+ ; "Some e"
+ / Some whsp1 import-expression
+
+ ; "toMap e"
+ / toMap whsp1 import-expression
+
+ / import-expression
+
+import-expression = import / completion-expression
+
+completion-expression =
+ selector-expression [ whsp complete whsp selector-expression ]
+
+; `record.field` extracts one field of a record
+;
+; `record.{ field0, field1, field2 }` projects out several fields of a record
+;
+; NOTE: Backtrack when parsing the `*("." ...)`. The reason why is that you
+; can't tell from parsing just the period whether "foo." will become "foo.bar"
+; (i.e. accessing field `bar` of the record `foo`) or `foo./bar` (i.e. applying
+; the function `foo` to the relative path `./bar`)
+selector-expression = primitive-expression *(whsp "." whsp selector)
+
+selector = any-label / labels / type-selector
+
+labels = "{" whsp [ any-label-or-some whsp *("," whsp any-label-or-some whsp) ] "}"
+
+type-selector = "(" whsp expression whsp ")"
+; NOTE: Backtrack when parsing the first three alternatives (i.e. the numeric
+; literals). This is because they share leading characters in common
+primitive-expression =
+ ; "2.0"
+ double-literal
+
+ ; "2"
+ / natural-literal
+
+ ; "+2"
+ / integer-literal
+
+ ; '"ABC"'
+ / text-literal
+
+ ; "{ foo = 1 , bar = True }"
+ ; "{ foo : Integer, bar : Bool }"
+ / "{" whsp [ "," whsp ] record-type-or-literal whsp "}"
+
+ ; "< Foo : Integer | Bar : Bool >"
+ ; "< Foo | Bar : Bool >"
+ / "<" whsp [ "|" whsp ] union-type whsp ">"
+
+ ; "[1, 2, 3]"
+ / non-empty-list-literal
+
+ ; "x"
+ ; "x@2"
+ / identifier
+
+ ; "( e )"
+ / "(" complete-expression ")"
+
+
+record-type-or-literal =
+ empty-record-literal
+ / non-empty-record-type-or-literal
+ / empty-record-type
+
+empty-record-literal = "="
+empty-record-type = ""
+
+non-empty-record-type-or-literal =
+ (non-empty-record-type / non-empty-record-literal)
+
+non-empty-record-type =
+ record-type-entry *(whsp "," whsp record-type-entry)
+
+record-type-entry = any-label-or-some whsp ":" whsp1 expression
+
+non-empty-record-literal =
+ record-literal-entry *(whsp "," whsp record-literal-entry)
+
+record-literal-entry =
+ any-label-or-some (record-literal-normal-entry / record-literal-punned-entry)
+
+record-literal-normal-entry =
+ *(whsp "." whsp any-label-or-some) whsp "=" whsp expression
+record-literal-punned-entry = ""
+
+
+union-type =
+ non-empty-union-type
+ / empty-union-type
+
+empty-union-type = ""
+
+non-empty-union-type =
+ union-type-entry *(whsp "|" whsp union-type-entry)
+
+; x : Natural
+; x
+union-type-entry = any-label-or-some [ whsp ":" whsp1 expression ]
+
+
+non-empty-list-literal =
+ "[" whsp [ "," whsp ] expression whsp *("," whsp expression whsp) "]"
+
+; This just adds surrounding whitespace for the top-level of the program
+complete-expression = whsp expression whsp
diff --git a/dhall/src/syntax/text/parser.rs b/dhall/src/syntax/text/parser.rs index dcaf5e4..486030f 100644 --- a/dhall/src/syntax/text/parser.rs +++ b/dhall/src/syntax/text/parser.rs @@ -1063,3 +1063,32 @@ pub fn parse_expr(input_str: &str) -> ParseResult<Expr> { [expression(e)] => e, )) } + +#[test] +// Check that the local copy of the grammar file is in sync with the one from dhall-lang. +fn test_grammar_files_in_sync() { + use std::process::Command; + + let spec_abnf_path = "../dhall-lang/standard/dhall.abnf"; + let local_abnf_path = "src/syntax/text/dhall.abnf"; + + let out = Command::new("git") + .arg("diff") + .arg("--no-index") + .arg("--ignore-space-change") + .arg("--color") + .arg("--") + .arg(spec_abnf_path) + .arg(local_abnf_path) + .output() + .expect("failed to run `git diff` command"); + + if !out.status.success() { + let output = String::from_utf8_lossy(&out.stdout); + panic!( + "The local dhall.abnf file differs from the one from \ + dhall-lang!\n{}", + output + ); + } +} |