diff options
| author | Nadrieril | 2020-05-30 15:05:36 +0100 | 
|---|---|---|
| committer | Nadrieril | 2020-05-30 15:06:30 +0100 | 
| commit | a35de29355dc85f63e0e8514a2e447bf5322ba20 (patch) | |
| tree | 9eec496ad246e2831d42291d6078c84c8a73879f /dhall | |
| parent | 97e46bef0fe1729d5de0a9a40ba939842023fea7 (diff) | |
Commit grammar file locally
The crate publishing process does not allow access to files outside the
current crate.
Diffstat (limited to '')
| -rw-r--r-- | dhall/build.rs | 2 | ||||
| -rw-r--r-- | dhall/src/syntax/text/dhall.abnf | 936 | ||||
| -rw-r--r-- | dhall/src/syntax/text/parser.rs | 29 | 
3 files changed, 966 insertions, 1 deletions
| diff --git a/dhall/build.rs b/dhall/build.rs index 71c634b..a12e91b 100644 --- a/dhall/build.rs +++ b/dhall/build.rs @@ -348,7 +348,7 @@ fn generate_tests() -> std::io::Result<()> {  fn convert_abnf_to_pest() -> std::io::Result<()> {      let out_dir = env::var("OUT_DIR").unwrap(); -    let abnf_path = "../dhall-lang/standard/dhall.abnf"; +    let abnf_path = "src/syntax/text/dhall.abnf";      let visibility_path = "src/syntax/text/dhall.pest.visibility";      let grammar_path = Path::new(&out_dir).join("dhall.pest");      println!("cargo:rerun-if-changed={}", abnf_path); diff --git a/dhall/src/syntax/text/dhall.abnf b/dhall/src/syntax/text/dhall.abnf new file mode 100644 index 0000000..1c3a980 --- /dev/null +++ b/dhall/src/syntax/text/dhall.abnf @@ -0,0 +1,936 @@ +; ABNF syntax based on RFC 5234
 +;
 +; The character encoding for Dhall is UTF-8
 +;
 +; Some notes on implementing this grammar:
 +;
 +; First, do not use a lexer to tokenize the file before parsing.  Instead, treat
 +; the individual characters of the file as the tokens to feed into the parser.
 +; You should not use a lexer because Dhall's grammar supports two features which
 +; cannot be correctly supported by a lexer:
 +;
 +; * String interpolation (i.e. "foo ${Natural/toInteger bar} baz")
 +; * Nested block comments (i.e. "{- foo {- bar -} baz -}")
 +;
 +; Second, this grammar assumes that your parser can backtrack and/or try
 +; multiple parses simultaneously.  For example, consider this expression:
 +;
 +;     List ./MyType
 +;
 +; A parser might first try to parse the period as the beginning of a field
 +; selector, only to realize immediately afterwards that `/MyType` is not a valid
 +; name for a field.  A conforming parser must backtrack so that the expression
 +; `./MyType` can instead be correctly interpreted as a relative path
 +;
 +; Third, if there are multiple valid parses then prefer the first parse
 +; according to the ordering of alternatives. That is, the order of evaluation
 +; of the alternatives is left-to-right.
 +;
 +; For example, the grammar for single quoted string literals is:
 +;
 +;     single-quote-continue =
 +;           "'''"               single-quote-continue
 +;         / "${" complete-expression "}" single-quote-continue
 +;         / "''${"              single-quote-continue
 +;         / "''"
 +;         / %x20-10FFFF         single-quote-continue
 +;         / tab                 single-quote-continue
 +;         / end-of-line         single-quote-continue
 +;
 +;         single-quote-literal = "''" single-quote-continue
 +;
 +; ... which permits valid parses for the following code:
 +;
 +;     "''''''''''''''''"
 +;
 +; If you tried to parse all alternatives then there are at least two valid
 +; interpretations for the above code:
 +;
 +; * A single quoted literal with four escape sequences of the form "'''"
 +;     * i.e. "''" followed by "'''"  four times in a row followed by "''"
 +; * Four empty single quoted literals
 +;     * i.e. "''''" four times in a row
 +;
 +; The correct interpretation is the first one because parsing the escape
 +; sequence "'''" takes precedence over parsing the termination sequence "''",
 +; according to the order of the alternatives in the `single-quote-continue`
 +; rule.
 +;
 +; Some parsing libraries do not backtrack by default but allow the user to
 +; selectively backtrack in certain parts of the grammar.  Usually parsing
 +; libraries do this to improve efficiency and error messages.  Dhall's grammar
 +; takes that into account by minimizing the number of rules that require the
 +; parser to backtrack and comments below will highlight where you need to
 +; explicitly backtrack
 +;
 +; Specifically, if you see an uninterrupted literal in a grammar rule such as:
 +;
 +;     "->"
 +;
 +; ... or:
 +;
 +;     %x66.6f.72.61.6c.6c
 +;
 +; ... then that string literal is parsed as a single unit, meaning that you
 +; should backtrack if you parse only part of the literal
 +;
 +; In all other cases you can assume that you do not need to backtrack unless
 +; there is a comment explicitly asking you to backtrack
 +;
 +; When parsing a repeated construct, prefer alternatives that parse as many
 +; repetitions as possible.  On in other words:
 +;
 +;     [a] = a / ""
 +;
 +;     a* = a* a / ""
 +;
 +; Note that the latter rule also specifies that repetition produces
 +; left-associated expressions.  For example, function application is
 +; left-associative and all operators are left-associative when they are not
 +; parenthesized.
 +;
 +; Additionally, try alternatives in an order that minimizes backtracking
 +; according to the following rule:
 +;
 +;     (a / b) (c / d) = a c / a d / b c / b d
 +
 +; NOTE: There are many line endings in the wild
 +;
 +; See: https://en.wikipedia.org/wiki/Newline
 +;
 +; For simplicity this supports Unix and Windows line-endings, which are the most
 +; common
 +end-of-line =
 +      %x0A     ; "\n"
 +    / %x0D.0A  ; "\r\n"
 +
 +; This rule matches all characters that are not:
 +;
 +; * not ASCII
 +; * not part of a surrogate pair
 +; * not a "non-character"
 +valid-non-ascii =
 +      %x80-D7FF
 +    ; %xD800-DFFF = surrogate pairs
 +    / %xE000-FFFD
 +    ; %xFFFE-FFFF = non-characters
 +    / %x10000-1FFFD
 +    ; %x1FFFE-1FFFF = non-characters
 +    / %x20000-2FFFD
 +    ; %x2FFFE-2FFFF = non-characters
 +    / %x30000-3FFFD
 +    ; %x3FFFE-3FFFF = non-characters
 +    / %x40000-4FFFD
 +    ; %x4FFFE-4FFFF = non-characters
 +    / %x50000-5FFFD
 +    ; %x5FFFE-5FFFF = non-characters
 +    / %x60000-6FFFD
 +    ; %x6FFFE-6FFFF = non-characters
 +    / %x70000-7FFFD
 +    ; %x7FFFE-7FFFF = non-characters
 +    / %x80000-8FFFD
 +    ; %x8FFFE-8FFFF = non-characters
 +    / %x90000-9FFFD
 +    ; %x9FFFE-9FFFF = non-characters
 +    / %xA0000-AFFFD
 +    ; %xAFFFE-AFFFF = non-characters
 +    / %xB0000-BFFFD
 +    ; %xBFFFE-BFFFF = non-characters
 +    / %xC0000-CFFFD
 +    ; %xCFFFE-CFFFF = non-characters
 +    / %xD0000-DFFFD
 +    ; %xDFFFE-DFFFF = non-characters
 +    / %xE0000-EFFFD
 +    ; %xEFFFE-EFFFF = non-characters
 +    / %xF0000-FFFFD
 +    ; %xFFFFE-FFFFF = non-characters
 +    / %x100000-10FFFD
 +    ; %x10FFFE-10FFFF = non-characters
 +
 +tab = %x09  ; "\t"
 +
 +block-comment = "{-" block-comment-continue
 +
 +block-comment-char =
 +      %x20-7F
 +    / valid-non-ascii
 +    / tab
 +    / end-of-line
 +
 +block-comment-continue =
 +    "-}"
 +    / block-comment block-comment-continue
 +    / block-comment-char block-comment-continue
 +
 +not-end-of-line = %x20-7F / valid-non-ascii / tab
 +
 +; NOTE: Slightly different from Haskell-style single-line comments because this
 +; does not require a space after the dashes
 +line-comment = "--" *not-end-of-line end-of-line
 +
 +whitespace-chunk =
 +      " "
 +    / tab
 +    / end-of-line
 +    / line-comment
 +    / block-comment
 +
 +whsp = *whitespace-chunk
 +
 +; nonempty whitespace
 +whsp1 = 1*whitespace-chunk
 +
 +; Uppercase or lowercase ASCII letter
 +ALPHA = %x41-5A / %x61-7A
 +
 +; ASCII digit
 +DIGIT = %x30-39  ; 0-9
 +
 +ALPHANUM = ALPHA / DIGIT
 +
 +HEXDIG = DIGIT / "A" / "B" / "C" / "D" / "E" / "F"
 +
 +; A simple label cannot be one of the reserved keywords
 +; listed in the `keyword` rule.
 +; A PEG parser could use negative lookahead to
 +; enforce this, e.g. as follows:
 +; simple-label =
 +;       keyword 1*simple-label-next-char
 +;     / !keyword (simple-label-first-char *simple-label-next-char)
 +simple-label-first-char = ALPHA / "_"
 +simple-label-next-char = ALPHANUM / "-" / "/" / "_"
 +simple-label = simple-label-first-char *simple-label-next-char
 +
 +quoted-label-char =
 +      %x20-5F
 +        ; %x60 = '`'
 +    / %x61-7E
 +
 +quoted-label = 1*quoted-label-char
 +
 +; NOTE: Dhall does not support Unicode labels, mainly to minimize the potential
 +; for code obfuscation
 +label = ("`" quoted-label "`" / simple-label)
 +
 +; A nonreserved-label cannot not be any of the reserved identifiers for builtins
 +; (unless quoted).
 +; Their list can be found in the `builtin` rule.
 +; The only place where this restriction applies is bound variables.
 +; A PEG parser could use negative lookahead to avoid parsing those identifiers,
 +; e.g. as follows:
 +; nonreserved-label =
 +;      builtin 1*simple-label-next-char
 +;    / !builtin label
 +nonreserved-label = label
 +
 +; An any-label is allowed to be one of the reserved identifiers (but not a keyword).
 +any-label = label
 +
 +; Allow specifically `Some` in record and union labels.
 +any-label-or-some = any-label / Some
 +
 +; Dhall's double-quoted strings are similar to JSON strings (RFC7159) except:
 +;
 +; * Dhall strings support string interpolation
 +;
 +; * Dhall strings also support escaping string interpolation by adding a new
 +;   `\$` escape sequence
 +;
 +; * Dhall strings also allow Unicode escape sequences of the form `\u{XXX}`
 +double-quote-chunk =
 +      interpolation
 +      ; '\'    Beginning of escape sequence
 +    / %x5C double-quote-escaped
 +    / double-quote-char
 +
 +double-quote-escaped =
 +      %x22                 ; '"'    quotation mark  U+0022
 +    / %x24                 ; '$'    dollar sign     U+0024
 +    / %x5C                 ; '\'    reverse solidus U+005C
 +    / %x2F                 ; '/'    solidus         U+002F
 +    / %x62                 ; 'b'    backspace       U+0008
 +    / %x66                 ; 'f'    form feed       U+000C
 +    / %x6E                 ; 'n'    line feed       U+000A
 +    / %x72                 ; 'r'    carriage return U+000D
 +    / %x74                 ; 't'    tab             U+0009
 +    / %x75 unicode-escape  ; 'uXXXX' / 'u{XXXX}'    U+XXXX
 +
 +; Valid Unicode escape sequences are as follows:
 +;
 +; * Exactly 4 hexadecimal digits without braces:
 +;       `\uXXXX`
 +; * 1-6 hexadecimal digits within braces (with optional zero padding):
 +;       `\u{XXXX}`, `\u{000X}`, `\u{XXXXX}`, `\u{00000XXXXX}`, etc.
 +;   Any number of leading zeros are allowed within the braces preceding the 1-6
 +;   digits specifying the codepoint.
 +;
 +; From these sequences, the parser must also reject any codepoints that are in
 +; the following ranges:
 +;
 +; * Surrogate pairs: `%xD800-DFFF`
 +; * Non-characters: `%xNFFFE-NFFFF` / `%x10FFFE-10FFFF` for `N` in `{ 0 .. F }`
 +;
 +; See the `valid-non-ascii` rule for the exact ranges that are not allowed
 +unicode-escape = unbraced-escape / "{" braced-escape "}"
 +
 +; All valid last 4 digits for unicode codepoints (outside Plane 0): `0000-FFFD`
 +unicode-suffix = (DIGIT / "A" / "B" / "C" / "D" / "E") 3HEXDIG
 +               / "F" 2HEXDIG (DIGIT / "A" / "B" / "C" / "D")
 +
 +; All 4-hex digit unicode escape sequences that are not:
 +;
 +; * Surrogate pairs (i.e. `%xD800-DFFF`)
 +; * Non-characters (i.e. `%xFFFE-FFFF`)
 +;
 +unbraced-escape =
 +      (DIGIT / "A" / "B" / "C") 3HEXDIG
 +    / "D" ("0" / "1" / "2" / "3" / "4" / "5" / "6" / "7") HEXDIG HEXDIG
 +    ; %xD800-DFFF Surrogate pairs
 +    / "E" 3HEXDIG
 +    / "F" 2HEXDIG (DIGIT / "A" / "B" / "C" / "D")
 +    ; %xFFFE-FFFF Non-characters
 +
 +; All 1-6 digit unicode codepoints that are not:
 +;
 +; * Surrogate pairs: `%xD800-DFFF`
 +; * Non-characters: `%xNFFFE-NFFFF` / `%x10FFFE-10FFFF` for `N` in `{ 0 .. F }`
 +;
 +; See the `valid-non-ascii` rule for the exact ranges that are not allowed
 +braced-codepoint =
 +      ("1" / "2" / "3" / "4" / "5" / "6" / "7" / "8" / "9" / "A" / "B" / "C" / "D" / "E" / "F" / "10") unicode-suffix; (Planes 1-16)
 +    / unbraced-escape ; (Plane 0)
 +    / 1*3HEXDIG ; %x000-FFF
 +
 +; Allow zero padding for braced codepoints
 +braced-escape = *"0" braced-codepoint
 +
 +; Printable characters except double quote and backslash
 +double-quote-char =
 +      %x20-21
 +        ; %x22 = '"'
 +    / %x23-5B
 +        ; %x5C = "\"
 +    / %x5D-7F
 +    / valid-non-ascii
 +
 +double-quote-literal = %x22 *double-quote-chunk %x22
 +
 +; NOTE: The only way to end a single-quote string literal with a single quote is
 +; to either interpolate the single quote, like this:
 +;
 +;     ''ABC${"'"}''
 +;
 +; ... or concatenate another string, like this:
 +;
 +;     ''ABC'' ++ "'"
 +;
 +; If you try to end the string literal with a single quote then you get "'''",
 +; which is interpreted as an escaped pair of single quotes
 +single-quote-continue =
 +      interpolation single-quote-continue
 +    / escaped-quote-pair single-quote-continue
 +    / escaped-interpolation single-quote-continue
 +    / "''" ; End of text literal
 +    / single-quote-char single-quote-continue
 +
 +; Escape two single quotes (i.e. replace this sequence with "''")
 +escaped-quote-pair = "'''"
 +
 +; Escape interpolation (i.e. replace this sequence with "${")
 +escaped-interpolation = "''${"
 +
 +single-quote-char =
 +      %x20-7F
 +    / valid-non-ascii
 +    / tab
 +    / end-of-line
 +
 +single-quote-literal = "''" end-of-line single-quote-continue
 +
 +interpolation = "${" complete-expression "}"
 +
 +text-literal = (double-quote-literal / single-quote-literal)
 +
 +; RFC 5234 interprets string literals as case-insensitive and recommends using
 +; hex instead for case-sensitive strings
 +;
 +; If you don't feel like reading hex, these are all the same as the rule name.
 +; Keywords that should never be parsed as identifiers
 +if                    = %x69.66
 +then                  = %x74.68.65.6e
 +else                  = %x65.6c.73.65
 +let                   = %x6c.65.74
 +in                    = %x69.6e
 +as                    = %x61.73
 +using                 = %x75.73.69.6e.67
 +merge                 = %x6d.65.72.67.65
 +missing               = %x6d.69.73.73.69.6e.67
 +Infinity              = %x49.6e.66.69.6e.69.74.79
 +NaN                   = %x4e.61.4e
 +Some                  = %x53.6f.6d.65
 +toMap                 = %x74.6f.4d.61.70
 +assert                = %x61.73.73.65.72.74
 +forall                = %x2200 / %x66.6f.72.61.6c.6c ; "∀" / "forall"
 +with                  = %x77.69.74.68
 +
 +; Unused rule that could be used as negative lookahead in the
 +; `simple-label` rule for parsers that support this.
 +keyword =
 +      if / then / else
 +    / let / in
 +    / using / missing 
 +    / assert / as
 +    / Infinity / NaN
 +    / merge / Some / toMap
 +    / forall
 +    / with
 +
 +builtin =
 +      Natural-fold
 +    / Natural-build
 +    / Natural-isZero
 +    / Natural-even
 +    / Natural-odd
 +    / Natural-toInteger
 +    / Natural-show
 +    / Integer-toDouble
 +    / Integer-show
 +    / Integer-negate
 +    / Integer-clamp
 +    / Natural-subtract
 +    / Double-show
 +    / List-build
 +    / List-fold
 +    / List-length
 +    / List-head
 +    / List-last
 +    / List-indexed
 +    / List-reverse
 +    / Optional-fold
 +    / Optional-build
 +    / Text-show
 +    / Bool
 +    / True
 +    / False
 +    / Optional
 +    / None
 +    / Natural
 +    / Integer
 +    / Double
 +    / Text
 +    / List
 +    / Type
 +    / Kind
 +    / Sort
 +
 +; Reserved identifiers, needed for some special cases of parsing
 +Optional              = %x4f.70.74.69.6f.6e.61.6c
 +Text                  = %x54.65.78.74
 +List                  = %x4c.69.73.74
 +Location              = %x4c.6f.63.61.74.69.6f.6e
 +
 +; Reminder of the reserved identifiers, needed for the `builtin` rule
 +Bool              = %x42.6f.6f.6c
 +True              = %x54.72.75.65
 +False             = %x46.61.6c.73.65
 +None              = %x4e.6f.6e.65
 +Natural           = %x4e.61.74.75.72.61.6c
 +Integer           = %x49.6e.74.65.67.65.72
 +Double            = %x44.6f.75.62.6c.65
 +Type              = %x54.79.70.65
 +Kind              = %x4b.69.6e.64
 +Sort              = %x53.6f.72.74
 +Natural-fold      = %x4e.61.74.75.72.61.6c.2f.66.6f.6c.64
 +Natural-build     = %x4e.61.74.75.72.61.6c.2f.62.75.69.6c.64
 +Natural-isZero    = %x4e.61.74.75.72.61.6c.2f.69.73.5a.65.72.6f
 +Natural-even      = %x4e.61.74.75.72.61.6c.2f.65.76.65.6e
 +Natural-odd       = %x4e.61.74.75.72.61.6c.2f.6f.64.64
 +Natural-toInteger = %x4e.61.74.75.72.61.6c.2f.74.6f.49.6e.74.65.67.65.72
 +Natural-show      = %x4e.61.74.75.72.61.6c.2f.73.68.6f.77
 +Natural-subtract  = %x4e.61.74.75.72.61.6c.2f.73.75.62.74.72.61.63.74
 +Integer-toDouble  = %x49.6e.74.65.67.65.72.2f.74.6f.44.6f.75.62.6c.65
 +Integer-show      = %x49.6e.74.65.67.65.72.2f.73.68.6f.77
 +Integer-negate    = %x49.6e.74.65.67.65.72.2f.6e.65.67.61.74.65
 +Integer-clamp     = %x49.6e.74.65.67.65.72.2f.63.6c.61.6d.70
 +Double-show       = %x44.6f.75.62.6c.65.2f.73.68.6f.77
 +List-build        = %x4c.69.73.74.2f.62.75.69.6c.64
 +List-fold         = %x4c.69.73.74.2f.66.6f.6c.64
 +List-length       = %x4c.69.73.74.2f.6c.65.6e.67.74.68
 +List-head         = %x4c.69.73.74.2f.68.65.61.64
 +List-last         = %x4c.69.73.74.2f.6c.61.73.74
 +List-indexed      = %x4c.69.73.74.2f.69.6e.64.65.78.65.64
 +List-reverse      = %x4c.69.73.74.2f.72.65.76.65.72.73.65
 +Optional-fold     = %x4f.70.74.69.6f.6e.61.6c.2f.66.6f.6c.64
 +Optional-build    = %x4f.70.74.69.6f.6e.61.6c.2f.62.75.69.6c.64
 +Text-show         = %x54.65.78.74.2f.73.68.6f.77
 +
 +; Operators
 +combine       = %x2227 / "/\"
 +combine-types = %x2A53 / "//\\"
 +equivalent    = %x2261 / "==="
 +prefer        = %x2AFD / "//"
 +lambda        = %x3BB  / "\"
 +arrow         = %x2192 / "->"
 +complete      = "::"
 +
 +exponent = "e" [ "+" / "-" ] 1*DIGIT
 +
 +numeric-double-literal = [ "+" / "-" ] 1*DIGIT ( "." 1*DIGIT [ exponent ] / exponent)
 +
 +minus-infinity-literal = "-" Infinity
 +plus-infinity-literal = Infinity
 +
 +double-literal =
 +    ; "2.0"
 +      numeric-double-literal
 +    ; "-Infinity"
 +    / minus-infinity-literal
 +    ; "Infinity"
 +    / plus-infinity-literal
 +    ; "NaN"
 +    / NaN
 +
 +natural-literal =
 +    ; Hexadecimal with "0x" prefix
 +      "0" %x78 1*HEXDIG
 +    ; Decimal; leading 0 digits are not allowed
 +    / ("1" / "2" / "3" / "4" / "5" / "6" / "7" / "8" / "9") *DIGIT
 +    ; ... except for 0 itself
 +    / "0"
 +
 +integer-literal = ( "+" / "-" ) natural-literal
 +
 +; If the identifier matches one of the names in the `builtin` rule, then it is a
 +; builtin, and should be treated as the corresponding item in the list of
 +; "Reserved identifiers for builtins" specified in the `standard/README.md` document.
 +; It is a syntax error to specify a de Bruijn index in this case.
 +; Otherwise, this is a variable with name and index matching the label and index.
 +identifier = variable / builtin
 +
 +variable = nonreserved-label [ whsp "@" whsp natural-literal ]
 +
 +; Printable characters other than " ()[]{}<>/\,"
 +;
 +; Excluding those characters ensures that paths don't have to end with trailing
 +; whitespace most of the time
 +path-character =
 +        ; %x20 = " "
 +      %x21
 +        ; %x22 = "\""
 +        ; %x23 = "#"
 +    / %x24-27
 +        ; %x28 = "("
 +        ; %x29 = ")"
 +    / %x2A-2B
 +        ; %x2C = ","
 +    / %x2D-2E
 +        ; %x2F = "/"
 +    / %x30-3B
 +        ; %x3C = "<"
 +    / %x3D
 +        ; %x3E = ">"
 +        ; %x3F = "?"
 +    / %x40-5A
 +        ; %x5B = "["
 +        ; %x5C = "\"
 +        ; %x5D = "]"
 +    / %x5E-7A
 +        ; %x7B = "{"
 +    / %x7C
 +        ; %x7D = "}"
 +    / %x7E
 +
 +quoted-path-character =
 +      %x20-21
 +        ; %x22 = "\""
 +    / %x23-2E
 +        ; %x2F = "/"
 +    / %x30-7F
 +    / valid-non-ascii
 +
 +unquoted-path-component = 1*path-character
 +quoted-path-component = 1*quoted-path-character
 +
 +path-component = "/" ( unquoted-path-component / %x22 quoted-path-component %x22 )
 +
 +; The last path-component matched by this rule is referred to as "file" in the semantics,
 +; and the other path-components as "directory".
 +path = 1*path-component
 +
 +local =
 +    parent-path
 +    / here-path
 +    / home-path
 +    ; NOTE: Backtrack if parsing this alternative fails
 +    ;
 +    ; This is because the first character of this alternative will be "/", but
 +    ; if the second character is "/" or "\" then this should have been parsed
 +    ; as an operator instead of a path
 +    / absolute-path
 +
 +parent-path = ".." path  ; Relative path
 +here-path = "."  path  ; Relative path
 +home-path = "~"  path  ; Home-anchored path
 +absolute-path = path  ; Absolute path
 +
 +; `http[s]` URI grammar based on RFC7230 and RFC 3986 with some differences
 +; noted below
 +
 +scheme = %x68.74.74.70 [ %x73 ]  ; "http" [ "s" ]
 +
 +; NOTE: This does not match the official grammar for a URI.  Specifically:
 +;
 +; * path segments may be quoted instead of using percent-encoding
 +; * this does not support fragment identifiers, which have no meaning within
 +;   Dhall expressions and do not affect import resolution
 +; * the characters "(" ")" and "," are not included in the `sub-delims` rule:
 +;   in particular, these characters can't be used in authority, path or query
 +;   strings.  This is because those characters have other meaning in Dhall
 +;   and it would be confusing for the comma in
 +;       [http://example.com/foo, bar]
 +;   to be part of the URL instead of part of the list.  If you need a URL
 +;   which contains parens or a comma, you must percent-encode them.
 +;
 +; Reserved characters in quoted path components should be percent-encoded
 +; according to https://tools.ietf.org/html/rfc3986#section-2
 +http-raw = scheme "://" authority url-path [ "?" query ]
 +
 +; Temporary rule to allow old-style `path-component`s and RFC3986 `segment`s in
 +; the same grammar. Eventually we can just use `path-abempty` from the same
 +; RFC. See issue #581
 +
 +url-path = *(path-component / "/" segment)
 +
 +; NOTE: Backtrack if parsing the optional user info prefix fails
 +authority = [ userinfo "@" ] host [ ":" port ]
 +
 +userinfo = *( unreserved / pct-encoded / sub-delims / ":" )
 +
 +host = IP-literal / IPv4address / domain
 +
 +port = *DIGIT
 +
 +IP-literal = "[" ( IPv6address / IPvFuture  ) "]"
 +
 +IPvFuture = "v" 1*HEXDIG "." 1*( unreserved / sub-delims / ":" )
 +
 +; NOTE: Backtrack when parsing each alternative
 +IPv6address =                            6( h16 ":" ) ls32
 +            /                       "::" 5( h16 ":" ) ls32
 +            / [ h16               ] "::" 4( h16 ":" ) ls32
 +            / [ h16 *1( ":" h16 ) ] "::" 3( h16 ":" ) ls32
 +            / [ h16 *2( ":" h16 ) ] "::" 2( h16 ":" ) ls32
 +            / [ h16 *3( ":" h16 ) ] "::"    h16 ":"   ls32
 +            / [ h16 *4( ":" h16 ) ] "::"              ls32
 +            / [ h16 *5( ":" h16 ) ] "::"              h16
 +            / [ h16 *6( ":" h16 ) ] "::"
 +
 +h16 = 1*4HEXDIG
 +
 +ls32 = h16 ":" h16 / IPv4address
 +
 +IPv4address = dec-octet "." dec-octet "." dec-octet "." dec-octet
 +
 +; NOTE: Backtrack when parsing these alternatives
 +dec-octet = "25" %x30-35       ; 250-255
 +          / "2" %x30-34 DIGIT  ; 200-249
 +          / "1" 2DIGIT         ; 100-199
 +          / %x31-39 DIGIT      ; 10-99
 +          / DIGIT              ; 0-9
 +
 +; Look in RFC3986 3.2.2 for
 +; "A registered name intended for lookup in the DNS"
 +domain = domainlabel *("." domainlabel) [ "." ]
 +
 +domainlabel = 1*ALPHANUM *(1*"-" 1*ALPHANUM)
 +
 +segment = *pchar
 +
 +pchar = unreserved / pct-encoded / sub-delims / ":" / "@"
 +
 +query = *( pchar / "/" / "?" )
 +
 +pct-encoded = "%" HEXDIG HEXDIG
 +
 +unreserved  = ALPHANUM / "-" / "." / "_" / "~"
 +
 +; this is the RFC3986 sub-delims rule, without "(", ")" or ","
 +; see comments above the `http-raw` rule above
 +sub-delims = "!" / "$" / "&" / "'" / "*" / "+" / ";" / "="
 +
 +http = http-raw [ whsp using whsp1 import-expression ]
 +
 +; Dhall supports unquoted environment variables that are Bash-compliant or
 +; quoted environment variables that are POSIX-compliant
 +env = "env:"
 +    ( bash-environment-variable
 +    / %x22 posix-environment-variable %x22
 +    )
 +
 +; Bash supports a restricted subset of POSIX environment variables.  From the
 +; Bash `man` page, an environment variable name is:
 +;
 +; > A word consisting only of  alphanumeric  characters  and  under-scores,  and
 +; > beginning with an alphabetic character or an under-score
 +bash-environment-variable = (ALPHA / "_") *(ALPHANUM / "_")
 +
 +; The POSIX standard is significantly more flexible about legal environment
 +; variable names, which can contain alerts (i.e. '\a'), whitespace, or
 +; punctuation, for example.  The POSIX standard says about environment variable
 +; names:
 +;
 +; > The value of an environment variable is a string of characters. For a
 +; > C-language program, an array of strings called the environment shall be made
 +; > available when a process begins. The array is pointed to by the external
 +; > variable environ, which is defined as:
 +; >
 +; >     extern char **environ;
 +; >
 +; > These strings have the form name=value; names shall not contain the
 +; > character '='. For values to be portable across systems conforming to IEEE
 +; > Std 1003.1-2001, the value shall be composed of characters from the portable
 +; > character set (except NUL and as indicated below).
 +;
 +; Note that the standard does not explicitly state that the name must have at
 +; least one character, but `env` does not appear to support this and `env`
 +; claims to be POSIX-compliant.  To be safe, Dhall requires at least one
 +; character like `env`
 +posix-environment-variable = 1*posix-environment-variable-character
 +
 +; These are all the characters from the POSIX Portable Character Set except for
 +; '\0' (NUL) and '='.  Note that the POSIX standard does not explicitly state
 +; that environment variable names cannot have NUL.  However, this is implicit
 +; in the fact that environment variables are passed to the program as
 +; NUL-terminated `name=value` strings, which implies that the `name` portion of
 +; the string cannot have NUL characters
 +posix-environment-variable-character =
 +      %x5C                 ; '\'    Beginning of escape sequence
 +      ( %x22               ; '"'    quotation mark  U+0022
 +      / %x5C               ; '\'    reverse solidus U+005C
 +      / %x61               ; 'a'    alert           U+0007
 +      / %x62               ; 'b'    backspace       U+0008
 +      / %x66               ; 'f'    form feed       U+000C
 +      / %x6E               ; 'n'    line feed       U+000A
 +      / %x72               ; 'r'    carriage return U+000D
 +      / %x74               ; 't'    tab             U+0009
 +      / %x76               ; 'v'    vertical tab    U+000B
 +      )
 +    ; Printable characters except double quote, backslash and equals
 +    / %x20-21
 +        ; %x22 = '"'
 +    / %x23-3C
 +        ; %x3D = '='
 +    / %x3E-5B
 +        ; %x5C = "\"
 +    / %x5D-7E
 +
 +import-type = missing / local / http / env
 +
 +hash = %x73.68.61.32.35.36.3a 64HEXDIG ; "sha256:XXX...XXX"
 +
 +import-hashed = import-type [ whsp1 hash ]
 +
 +; "http://example.com"
 +; "./foo/bar"
 +; "env:FOO"
 +import = import-hashed [ whsp as whsp1 (Text / Location) ]
 +
 +expression =
 +    ; "\(x : a) -> b"
 +      lambda whsp "(" whsp nonreserved-label whsp ":" whsp1 expression whsp ")" whsp arrow whsp expression
 +    
 +    ; "if a then b else c"
 +    / if whsp1 expression whsp then whsp1 expression whsp else whsp1 expression
 +    
 +    ; "let x : t = e1 in e2"
 +    ; "let x     = e1 in e2"
 +    ; We allow dropping the `in` between adjacent let-expressions; the following are equivalent:
 +    ; "let x = e1 let y = e2 in e3"
 +    ; "let x = e1 in let y = e2 in e3"
 +    / 1*let-binding in whsp1 expression
 +    
 +    ; "forall (x : a) -> b"
 +    / forall whsp "(" whsp nonreserved-label whsp ":" whsp1 expression whsp ")" whsp arrow whsp expression
 +    
 +    ; "a -> b"
 +    ;
 +    ; NOTE: Backtrack if parsing this alternative fails
 +    / operator-expression whsp arrow whsp expression
 +    
 +    ; "merge e1 e2 : t"
 +    ;
 +    ; NOTE: Backtrack if parsing this alternative fails since we can't tell
 +    ; from the keyword whether there will be a type annotation or not
 +    / merge whsp1 import-expression whsp1 import-expression whsp ":" whsp1 application-expression
 +    
 +    ; "[] : t"
 +    ;
 +    ; NOTE: Backtrack if parsing this alternative fails since we can't tell
 +    ; from the opening bracket whether or not this will be an empty list or
 +    ; a non-empty list
 +    / empty-list-literal
 +    
 +    ; "toMap e : t"
 +    ;
 +    ; NOTE: Backtrack if parsing this alternative fails since we can't tell
 +    ; from the keyword whether there will be a type annotation or not
 +    / toMap whsp1 import-expression whsp ":" whsp1 application-expression
 +    
 +    ; "assert : Natural/even 1 === False"
 +    / assert whsp ":" whsp1 expression
 +    
 +    ; "x : t"
 +    / annotated-expression
 +
 +; Nonempty-whitespace to disambiguate `env:VARIABLE` from type annotations
 +annotated-expression = operator-expression [ whsp ":" whsp1 expression ]
 +
 +; "let x = e1"
 +let-binding = let whsp1 nonreserved-label whsp [ ":" whsp1 expression whsp ] "=" whsp expression whsp
 +
 +; "[] : t"
 +empty-list-literal =
 +    "[" whsp [ "," whsp ] "]" whsp ":" whsp1 application-expression
 +
 +operator-expression = import-alt-expression
 +
 +; Nonempty-whitespace to disambiguate `http://a/a?a`
 +import-alt-expression    = or-expression            *(whsp "?" whsp1 or-expression)
 +or-expression            = plus-expression          *(whsp "||" whsp plus-expression)
 +; Nonempty-whitespace to disambiguate `f +2`
 +plus-expression          = text-append-expression   *(whsp "+" whsp1 text-append-expression)
 +text-append-expression   = list-append-expression   *(whsp "++" whsp list-append-expression)
 +list-append-expression   = and-expression           *(whsp "#" whsp and-expression)
 +and-expression           = combine-expression       *(whsp "&&" whsp combine-expression)
 +combine-expression       = prefer-expression        *(whsp combine whsp prefer-expression)
 +prefer-expression        = combine-types-expression *(whsp prefer whsp combine-types-expression)
 +combine-types-expression = times-expression         *(whsp combine-types whsp times-expression)
 +times-expression         = equal-expression         *(whsp "*" whsp equal-expression)
 +equal-expression         = not-equal-expression     *(whsp "==" whsp not-equal-expression)
 +not-equal-expression     = equivalent-expression    *(whsp "!=" whsp equivalent-expression)
 +equivalent-expression    = with-expression          *(whsp equivalent whsp with-expression)
 +
 +with-expression = application-expression *(whsp1 with whsp1 with-clause)
 +
 +with-clause =
 +    any-label-or-some *(whsp "." whsp any-label-or-some) whsp "=" whsp application-expression
 +
 +
 +; Import expressions need to be separated by some whitespace, otherwise there
 +; would be ambiguity: `./ab` could be interpreted as "import the file `./ab`",
 +; or "apply the import `./a` to label `b`"
 +application-expression =
 +    first-application-expression *(whsp1 import-expression)
 +
 +first-application-expression =
 +    ; "merge e1 e2"
 +      merge whsp1 import-expression whsp1 import-expression
 +    
 +    ; "Some e"
 +    / Some whsp1 import-expression
 +    
 +    ; "toMap e"
 +    / toMap whsp1 import-expression
 +    
 +    / import-expression
 +
 +import-expression = import / completion-expression
 +
 +completion-expression =
 +    selector-expression [ whsp complete whsp selector-expression ]
 +
 +; `record.field` extracts one field of a record
 +;
 +; `record.{ field0, field1, field2 }` projects out several fields of a record
 +;
 +; NOTE: Backtrack when parsing the `*("." ...)`.  The reason why is that you
 +; can't tell from parsing just the period whether "foo." will become "foo.bar"
 +; (i.e. accessing field `bar` of the record `foo`) or `foo./bar` (i.e. applying
 +; the function `foo` to the relative path `./bar`)
 +selector-expression = primitive-expression *(whsp "." whsp selector)
 +
 +selector = any-label / labels / type-selector
 +
 +labels = "{" whsp [ any-label-or-some whsp *("," whsp any-label-or-some whsp) ] "}"
 +
 +type-selector = "(" whsp expression whsp ")"
 +; NOTE: Backtrack when parsing the first three alternatives (i.e. the numeric
 +; literals).  This is because they share leading characters in common
 +primitive-expression =
 +    ; "2.0"
 +      double-literal
 +    
 +    ; "2"
 +    / natural-literal
 +    
 +    ; "+2"
 +    / integer-literal
 +    
 +    ; '"ABC"'
 +    / text-literal
 +    
 +    ; "{ foo = 1      , bar = True }"
 +    ; "{ foo : Integer, bar : Bool }"
 +    / "{" whsp [ "," whsp ] record-type-or-literal whsp "}"
 +    
 +    ; "< Foo : Integer | Bar : Bool >"
 +    ; "< Foo | Bar : Bool >"
 +    / "<" whsp [ "|" whsp ] union-type whsp ">"
 +    
 +    ; "[1, 2, 3]"
 +    / non-empty-list-literal
 +    
 +    ; "x"
 +    ; "x@2"
 +    / identifier
 +    
 +    ; "( e )"
 +    / "(" complete-expression ")"
 +
 +
 +record-type-or-literal =
 +      empty-record-literal
 +    / non-empty-record-type-or-literal
 +    / empty-record-type
 +
 +empty-record-literal = "="
 +empty-record-type = ""
 +
 +non-empty-record-type-or-literal =
 +    (non-empty-record-type / non-empty-record-literal)
 +
 +non-empty-record-type =
 +    record-type-entry *(whsp "," whsp record-type-entry)
 +
 +record-type-entry = any-label-or-some whsp ":" whsp1 expression
 +
 +non-empty-record-literal =
 +    record-literal-entry *(whsp "," whsp record-literal-entry)
 +
 +record-literal-entry =
 +    any-label-or-some (record-literal-normal-entry / record-literal-punned-entry)
 +
 +record-literal-normal-entry =
 +    *(whsp "." whsp any-label-or-some) whsp "=" whsp expression
 +record-literal-punned-entry = ""
 +
 +
 +union-type =
 +      non-empty-union-type
 +    / empty-union-type
 +
 +empty-union-type = ""
 +
 +non-empty-union-type =
 +    union-type-entry *(whsp "|" whsp union-type-entry)
 +
 +; x : Natural
 +; x
 +union-type-entry = any-label-or-some [ whsp ":" whsp1 expression ]
 +
 +
 +non-empty-list-literal =
 +    "[" whsp [ "," whsp ] expression whsp *("," whsp expression whsp) "]"
 +
 +; This just adds surrounding whitespace for the top-level of the program
 +complete-expression = whsp expression whsp
 diff --git a/dhall/src/syntax/text/parser.rs b/dhall/src/syntax/text/parser.rs index dcaf5e4..486030f 100644 --- a/dhall/src/syntax/text/parser.rs +++ b/dhall/src/syntax/text/parser.rs @@ -1063,3 +1063,32 @@ pub fn parse_expr(input_str: &str) -> ParseResult<Expr> {          [expression(e)] => e,      ))  } + +#[test] +// Check that the local copy of the grammar file is in sync with the one from dhall-lang. +fn test_grammar_files_in_sync() { +    use std::process::Command; + +    let spec_abnf_path = "../dhall-lang/standard/dhall.abnf"; +    let local_abnf_path = "src/syntax/text/dhall.abnf"; + +    let out = Command::new("git") +        .arg("diff") +        .arg("--no-index") +        .arg("--ignore-space-change") +        .arg("--color") +        .arg("--") +        .arg(spec_abnf_path) +        .arg(local_abnf_path) +        .output() +        .expect("failed to run `git diff` command"); + +    if !out.status.success() { +        let output = String::from_utf8_lossy(&out.stdout); +        panic!( +            "The local dhall.abnf file differs from the one from \ +             dhall-lang!\n{}", +            output +        ); +    } +} | 
