; ABNF syntax based on RFC 5234
;
; The character encoding for Dhall is UTF-8
;
; Some notes on implementing this grammar:
;
; First, do not use a lexer to tokenize the file before parsing.  Instead, treat
; the individual characters of the file as the tokens to feed into the parser.
; You should not use a lexer because Dhall's grammar supports two features which
; cannot be correctly supported by a lexer:
;
; * String interpolation (i.e. "foo ${Natural/toInteger bar} baz")
; * Nested block comments (i.e. "{- foo {- bar -} baz -}")
;
; Second, this grammar assumes that your parser can backtrack and/or try
; multiple parses simultaneously.  For example, consider this expression:
;
;     List ./MyType
;
; A parser might first try to parse the period as the beginning of a field
; selector, only to realize immediately afterwards that `/MyType` is not a valid
; name for a field.  A conforming parser must backtrack so that the expression
; `./MyType` can instead be correctly interpreted as a relative path
;
; Third, if there are multiple valid parses then prefer the first parse
; according to the ordering of alternatives. That is, the order of evaluation
; of the alternatives is left-to-right.
;
; For example, the grammar for single quoted string literals is:
;
;     single-quote-continue =
;           "'''"               single-quote-continue
;         / "${" complete-expression "}" single-quote-continue
;         / "''${"              single-quote-continue
;         / "''"
;         / %x20-10FFFF         single-quote-continue
;         / tab                 single-quote-continue
;         / end-of-line         single-quote-continue
;
;         single-quote-literal = "''" single-quote-continue
;
; ... which permits valid parses for the following code:
;
;     "''''''''''''''''"
;
; If you tried to parse all alternatives then there are at least two valid
; interpretations for the above code:
;
; * A single quoted literal with four escape sequences of the form "'''"
;     * i.e. "''" followed by "'''"  four times in a row followed by "''"
; * Four empty single quoted literals
;     * i.e. "''''" four times in a row
;
; The correct interpretation is the first one because parsing the escape
; sequence "'''" takes precedence over parsing the termination sequence "''",
; according to the order of the alternatives in the `single-quote-continue`
; rule.
;
; Some parsing libraries do not backtrack by default but allow the user to
; selectively backtrack in certain parts of the grammar.  Usually parsing
; libraries do this to improve efficiency and error messages.  Dhall's grammar
; takes that into account by minimizing the number of rules that require the
; parser to backtrack and comments below will highlight where you need to
; explicitly backtrack
;
; Specifically, if you see an uninterrupted literal in a grammar rule such as:
;
;     "->"
;
; ... or:
;
;     %x66.6f.72.61.6c.6c
;
; ... then that string literal is parsed as a single unit, meaning that you
; should backtrack if you parse only part of the literal
;
; In all other cases you can assume that you do not need to backtrack unless
; there is a comment explicitly asking you to backtrack
;
; When parsing a repeated construct, prefer alternatives that parse as many
; repetitions as possible.  On in other words:
;
;     [a] = a / ""
;
;     a* = a* a / ""
;
; Note that the latter rule also specifies that repetition produces
; left-associated expressions.  For example, function application is
; left-associative and all operators are left-associative when they are not
; parenthesized.
;
; Additionally, try alternatives in an order that minimizes backtracking
; according to the following rule:
;
;     (a / b) (c / d) = a c / a d / b c / b d

; NOTE: There are many line endings in the wild
;
; See: https://en.wikipedia.org/wiki/Newline
;
; For simplicity this supports Unix and Windows line-endings, which are the most
; common
end-of-line-silent =
      %x0A     ; "\n"
    / %x0D.0A  ; "\r\n"
end-of-line = end-of-line-silent

tab = %x09  ; "\t"

block-comment = "{-" block-comment-continue

block-comment-chunk =
      block-comment
    / %x20-10FFFF
    / tab
    / end-of-line-silent

block-comment-continue = "-}" / block-comment-chunk block-comment-continue

not-end-of-line = %x20-10FFFF / tab

; NOTE: Slightly different from Haskell-style single-line comments because this
; does not require a space after the dashes
line-comment = "--" *not-end-of-line end-of-line-silent

whitespace-chunk =
      " "
    / tab
    / end-of-line-silent
    / line-comment
    / block-comment

whitespace = *whitespace-chunk

nonempty-whitespace = 1*whitespace-chunk

; Uppercase or lowercase ASCII letter
ALPHA = %x41-5A / %x61-7A

; ASCII digit
DIGIT = %x30-39  ; 0-9

HEXDIG = DIGIT / "A" / "B" / "C" / "D" / "E" / "F"

; A simple label cannot be one of the following reserved keywords:
;
; * if
; * then
; * else
; * let
; * in
; * as
; * using
; * merge
simple-label-first-char = ALPHA / "_"
simple-label-next-char = ALPHA / DIGIT / "-" / "/" / "_"
simple-label = simple-label-first-char *simple-label-next-char

quoted-label = 1*(ALPHA / DIGIT / "-" / "/" / "_" / ":" / "." / "$")

; NOTE: Dhall does not support Unicode labels, mainly to minimize the potential
; for code obfuscation
label-raw = ("`" quoted-label "`" / simple-label)
label = label-raw whitespace

; Dhall's double-quoted strings are equivalent to JSON strings except with
; support for string interpolation (and escaping string interpolation)
;
; Dhall uses almost the same escaping rules as JSON (RFC7159) with one
; exception: Dhall adds a new `\$` escape sequence for dollar signs.  This
; additional escape sequences lets you escape string interpolation by writing
; `\${`
;
; > The representation of strings is similar to conventions used in the C
; > family of programming languages.  A string begins and ends with
; > quotation marks.  All Unicode characters may be placed within the
; > quotation marks, except for the characters that must be escaped:
; > quotation mark, reverse solidus, and the control characters (U+0000
; > through U+001F).
; > 
; > Any character may be escaped.  If the character is in the Basic
; > Multilingual Plane (U+0000 through U+FFFF), then it may be
; > represented as a six-character sequence: a reverse solidus, followed
; > by the lowercase letter u, followed by four hexadecimal digits that
; > encode the character's code point.  The hexadecimal letters A though
; > F can be upper or lower case.  So, for example, a string containing
; > only a single reverse solidus character may be represented as
; > "\u005C".
; > 
; > Alternatively, there are two-character sequence escape
; > representations of some popular characters.  So, for example, a
; > string containing only a single reverse solidus character may be
; > represented more compactly as "\\".
; > 
; > To escape an extended character that is not in the Basic Multilingual
; > Plane, the character is represented as a 12-character sequence,
; > encoding the UTF-16 surrogate pair.  So, for example, a string
; > containing only the G clef character (U+1D11E) may be represented as
; > "\uD834\uDD1E".
double-quote-chunk =
      interpolation
      ; '\'
    / %x5C double-quote-escaped 
    / double-quote-char

double-quote-escaped =
      %x22               ; '"'    quotation mark  U+0022
    / %x24               ; '$'    dollar sign     U+0024
    / %x5C               ; '\'    reverse solidus U+005C
    / %x2F               ; '/'    solidus         U+002F
    / %x62               ; 'b'    backspace       U+0008
    / %x66               ; 'f'    form feed       U+000C
    / %x6E               ; 'n'    line feed       U+000A
    / %x72               ; 'r'    carriage return U+000D
    / %x74               ; 't'    tab             U+0009
    / %x75 4HEXDIG       ; 'uXXXX'                U+XXXX

; Printable characters except double quote and backslash
double-quote-char =
      %x20-21
        ; %x22 = '"'
    / %x23-5B
        ; %x5C = "\"
    / %x5D-10FFFF

double-quote-literal = %x22 *double-quote-chunk %x22

; NOTE: The only way to end a single-quote string literal with a single quote is
; to either interpolate the single quote, like this:
;
;     ''ABC${"'"}''
;
; ... or concatenate another string, like this:
;
;     ''ABC'' ++ "'"
;
; If you try to end the string literal with a single quote then you get "'''",
; which is interpreted as an escaped pair of single quotes
single-quote-continue =
      interpolation single-quote-continue
    / escaped-quote-pair single-quote-continue
    / escaped-interpolation single-quote-continue
    / single-quote-char single-quote-continue
    / "''" ; End of text literal

; Escape two single quotes (i.e. replace this sequence with "''")
escaped-quote-pair = "'''"

; Escape interpolation (i.e. replace this sequence with "${")
escaped-interpolation = "''${"

single-quote-char =
      %x20-10FFFF
    / tab
    / end-of-line

single-quote-literal = "''" end-of-line single-quote-continue

; Interpolation
interpolation = "${" complete-expression "}"

text-literal-raw = (double-quote-literal / single-quote-literal)

; RFC 5234 interprets string literals as case-insensitive and recommends using
; hex instead for case-sensitive strings
;
; If you don't feel like reading hex, these are all the same as the rule name,
; except without the '-raw' ending.
if-raw                = %x69.66
then-raw              = %x74.68.65.6e
else-raw              = %x65.6c.73.65
let-raw               = %x6c.65.74
in-raw                = %x69.6e
as-raw                = %x61.73
using-raw             = %x75.73.69.6e.67
merge-raw             = %x6d.65.72.67.65
missing-raw           = %x6d.69.73.73.69.6e.67
Optional-raw          = %x4f.70.74.69.6f.6e.61.6c
Text-raw              = %x54.65.78.74
List-raw              = %x4c.69.73.74
Infinity-raw          = %x49.6e.66.69.6e.69.74.79
NaN-raw               = %x4e.61.4e

; Whitespaced rules for reserved words, to be used when matching expressions
if       = if-raw           nonempty-whitespace
then     = then-raw         nonempty-whitespace
else     = else-raw         nonempty-whitespace
let      = let-raw          nonempty-whitespace
in       = in-raw           nonempty-whitespace
as       = as-raw           nonempty-whitespace
using    = using-raw        nonempty-whitespace
merge    = merge-raw        nonempty-whitespace
Optional = Optional-raw     whitespace
Text     = Text-raw         whitespace
List     = List-raw         whitespace

equal             = "="  whitespace
or                = "||" whitespace
plus              = "+"  nonempty-whitespace  ; To disambiguate `f +2`
text-append       = "++" whitespace
list-append       = "#"  nonempty-whitespace  ; To disambiguate `http://a/a#a`
and               = "&&" whitespace
times             = "*"  whitespace
double-equal      = "==" whitespace
not-equal         = "!=" whitespace
dot               = "."  whitespace
bar               = "|"  whitespace
comma             = ","  whitespace
at                = "@"  whitespace
colon             = ":"  nonempty-whitespace  ; To disambiguate `env:VARIABLE` from type annotations
import-alt        = "?"  nonempty-whitespace  ; To disambiguate `http://a/a?a`
open-parens       = "("  whitespace
close-parens-raw  = ")"
close-parens      = ")"  whitespace
open-brace        = "{"  whitespace
close-brace-raw   = "}"
close-brace       = "}"  whitespace
open-bracket      = "["  whitespace
close-bracket-raw = "]"
close-bracket     = "]"  whitespace
open-angle        = "<"  whitespace
close-angle-raw   = ">"
close-angle       = ">"  whitespace

combine       = ( %x2227 / "/\"                ) whitespace
combine-types = ( %x2A53 / "//\\"              ) whitespace
prefer        = ( %x2AFD / "//"                ) whitespace
lambda        = ( %x3BB  / "\"                 ) whitespace
forall        = ( %x2200 / %x66.6f.72.61.6c.6c ) whitespace
arrow         = ( %x2192 / "->"                ) whitespace

exponent = "e" [ "+" / "-" ] 1*DIGIT

double-literal-raw = [ "+" / "-" ] 1*DIGIT ( "." 1*DIGIT [ exponent ] / exponent)

natural-literal-raw = 1*DIGIT

integer-literal-raw = ( "+" / "-" ) natural-literal-raw

identifier-raw = label-raw [ whitespace at natural-literal-raw ]
identifier = identifier-raw whitespace

; Printable characters other than " ()[]{}<>/\,"
;
; Excluding those characters ensures that paths don't have to end with trailing
; whitespace most of the time
path-character =
        ; %x20 = " "
      %x21
        ; %x22 = "\""
        ; %x23 = "#"
    / %x24-27
        ; %x28 = "("
        ; %x29 = ")"
    / %x2A-2B
        ; %x2C = ","
    / %x2D-2E
        ; %x2F = "/"
    / %x30-3B
        ; %x3C = "<"
    / %x3D
        ; %x3E = ">"
        ; %x3F = "?"
    / %x40-5A
        ; %x5B = "["
        ; %x5C = "\"
        ; %x5D = "]"
    / %x5E-7A
        ; %x7B = "{"
    / %x7C
        ; %x7D = "}"
    / %x7E

quoted-path-character =
      %x20-21
        ; %x22 = "\""
    / %x23-2E
        ; %x2F = "/"
    / %x30-10FFFF


path-component = "/" ( 1*path-character / %x22 1*quoted-path-character %x22 )

path = 1*path-component

local-raw =
    parent-path
    / here-path
    / home-path
    ; NOTE: Backtrack if parsing this alternative fails
    ;
    ; This is because the first character of this alternative will be "/", but
    ; if the second character is "/" or "\" then this should have been parsed
    ; as an operator instead of a path
    / absolute-path

parent-path = ".." path  ; Relative path
here-path = "."  path  ; Relative path
home-path = "~"  path  ; Home-anchored path
absolute-path = path  ; Absolute path

; `http[s]` URI grammar based on RFC7230 and RFC 3986 with some differences
; noted below

scheme = %x68.74.74.70 [ %x73 ]  ; "http" [ "s" ]

; NOTE: This does not match the official grammar for a URI.  Specifically, this
; replaces `path-abempty` with `path`
http-raw = scheme "://" authority path [ "?" query ] [ "#" fragment ]

; NOTE: Backtrack if parsing the optional user info prefix fails
authority = [ userinfo "@" ] host [ ":" port ]

userinfo = *( unreserved / pct-encoded / sub-delims / ":" )

host = IP-literal / IPv4address / reg-name

port = *DIGIT

IP-literal = "[" ( IPv6address / IPvFuture  ) "]"

IPvFuture = "v" 1*HEXDIG "." 1*( unreserved / sub-delims / ":" )

; NOTE: Backtrack when parsing each alternative
IPv6address =                            6( h16 ":" ) ls32
            /                       "::" 5( h16 ":" ) ls32
            / [               h16 ] "::" 4( h16 ":" ) ls32
            / [ *1( h16 ":" ) h16 ] "::" 3( h16 ":" ) ls32
            / [ *2( h16 ":" ) h16 ] "::" 2( h16 ":" ) ls32
            / [ *3( h16 ":" ) h16 ] "::"    h16 ":"   ls32
            / [ *4( h16 ":" ) h16 ] "::"              ls32
            / [ *5( h16 ":" ) h16 ] "::"              h16
            / [ *6( h16 ":" ) h16 ] "::"

h16 = 1*4HEXDIG

ls32 = ( h16 ":" h16 ) / IPv4address

IPv4address = dec-octet "." dec-octet "." dec-octet "." dec-octet

; NOTE: Backtrack when parsing these alternatives and try them in reverse order
dec-octet = DIGIT              ; 0-9
          / %x31-39 DIGIT      ; 10-99
          / "1" 2DIGIT         ; 100-199
          / "2" %x30-34 DIGIT  ; 200-249
          / "25" %x30-35       ; 250-255

reg-name = *( unreserved / pct-encoded / sub-delims )

pchar = unreserved / pct-encoded / sub-delims / ":" / "@"

query = *( pchar / "/" / "?" )

fragment = *( pchar / "/" / "?" )

pct-encoded = "%" HEXDIG HEXDIG

unreserved  = ALPHA / DIGIT / "-" / "." / "_" / "~"

sub-delims = "!" / "$" / "&" / "'" / "(" / ")" / "*" / "+" / "," / ";" / "="

http =
    http-raw
    [ whitespace using (import-hashed-raw / open-parens import-hashed-raw whitespace close-parens-raw) ] 

; Dhall supports unquoted environment variables that are Bash-compliant or
; quoted environment variables that are POSIX-compliant
env-raw = "env:"
    ( bash-environment-variable
    / %x22 posix-environment-variable %x22
    )

; Bash supports a restricted subset of POSIX environment variables.  From the
; Bash `man` page, an environment variable name is:
;
; > A word consisting only of  alphanumeric  characters  and  under-scores,  and
; > beginning with an alphabetic character or an under-score
bash-environment-variable = (ALPHA / "_") *(ALPHA / DIGIT / "_")

; The POSIX standard is significantly more flexible about legal environment
; variable names, which can contain alerts (i.e. '\a'), whitespace, or
; punctuation, for example.  The POSIX standard says about environment variable
; names:
;
; > The value of an environment variable is a string of characters. For a
; > C-language program, an array of strings called the environment shall be made
; > available when a process begins. The array is pointed to by the external
; > variable environ, which is defined as:
; >
; >     extern char **environ;
; >
; > These strings have the form name=value; names shall not contain the
; > character '='. For values to be portable across systems conforming to IEEE
; > Std 1003.1-2001, the value shall be composed of characters from the portable
; > character set (except NUL and as indicated below).
;
; Note that the standard does not explicitly state that the name must have at
; least one character, but `env` does not appear to support this and `env`
; claims to be POSIX-compliant.  To be safe, Dhall requires at least one
; character like `env`
posix-environment-variable = 1*posix-environment-variable-character

; These are all the characters from the POSIX Portable Character Set except for 
; '\0' (NUL) and '='.  Note that the POSIX standard does not explicitly state
; that environment variable names cannot have NUL.  However, this is implicit
; in the fact that environment variables are passed to the program as
; NUL-terminated `name=value` strings, which implies that the `name` portion of
; the string cannot have NUL characters
posix-environment-variable-character =
      %x5C                 ; '\'    Beginning of escape sequence
      ( %x22               ; '"'    quotation mark  U+0022
      / %x5C               ; '\'    reverse solidus U+005C
      / %x61               ; 'a'    alert           U+0007
      / %x62               ; 'b'    backspace       U+0008
      / %x66               ; 'f'    form feed       U+000C
      / %x6E               ; 'n'    line feed       U+000A
      / %x72               ; 'r'    carriage return U+000D
      / %x74               ; 't'    tab             U+0009
      / %x76               ; 'v'    vertical tab    U+000B
      )
    ; Printable characters except double quote, backslash and equals
    / %x20-21
        ; %x22 = '"'
    / %x23-3C
        ; %x3D = '='
    / %x3E-5B
        ; %x5C = "\"
    / %x5D-7E

import-type-raw = missing-raw / local-raw / http / env-raw

hash-raw = %x73.68.61.32.35.36.3a 64HEXDIG  ; "sha256:XXX...XXX"

import-hashed-raw = import-type-raw [ whitespace hash-raw ]

; "http://example.com"
; "./foo/bar"
; "env:FOO"
import-raw = import-hashed-raw [ whitespace as Text-raw ]

; NOTE: Every rule past this point should only reference rules that end with
; whitespace.  This ensures consistent handling of whitespace in the absence of
; a separate lexing step.
; The exception is the rules ending in -raw, which should _not_ end in whitespace.
; This is important to avoid the need for sequential backtracking in application-expression.

; An arbitrary dhall expression. Only use in a context where parentheses or
; keywords prevent possible ambiguity. See also atomic-expression.
expression =
    lambda-expression
    / ifthenelse-expression
    / let-expression
    / forall-expression
    ; NOTE: Backtrack if parsing this alternative fails
    / arrow-expression
    / merge-expression
    ; NOTE: Backtrack if parsing this alternative fails since we can't tell
    ; from the opening bracket whether or not this will be an empty list or
    ; a non-empty list
    / empty-list-or-optional
    / annotated-expression

; "\(x : a) -> b"
lambda-expression = lambda open-parens label colon expression close-parens arrow expression

; "if a then b else c"
ifthenelse-expression = if expression then expression else expression

; "let x : t = e1 in e2"
; "let x     = e1 in e2"
; "let x = e1 let y = e2 in e3"
let-expression = 1*let-binding in expression
let-binding = let label [ colon expression ] equal expression

; "forall (x : a) -> b"
forall-expression = forall open-parens label colon expression close-parens arrow expression

; "a -> b"
arrow-expression = operator-expression arrow expression

; "merge e1 e2 : t"
; "merge e1 e2"
merge-expression = merge atomic-expression atomic-expression [ colon application-expression ]

; "[]  : List     t"
; "[]  : Optional t"
; "[x] : Optional t"
empty-list-or-optional = open-bracket (empty-collection / non-empty-optional)
empty-collection = close-bracket colon (List / Optional) atomic-expression
non-empty-optional = expression close-bracket colon Optional atomic-expression

; "x : t"
annotated-expression = operator-expression [ colon expression ]


operator-expression = import-alt-expression

import-alt-expression    = or-expression            *(import-alt    or-expression)
or-expression            = plus-expression          *(or            plus-expression         )
plus-expression          = text-append-expression   *(plus          text-append-expression  )
text-append-expression   = list-append-expression   *(text-append   list-append-expression  )
list-append-expression   = and-expression           *(list-append   and-expression          )
and-expression           = combine-expression       *(and           combine-expression      )
combine-expression       = prefer-expression        *(combine       prefer-expression       )
prefer-expression        = combine-types-expression *(prefer        combine-types-expression)
combine-types-expression = times-expression         *(combine-types times-expression        )
times-expression         = equal-expression         *(times         equal-expression        )
equal-expression         = not-equal-expression     *(double-equal  not-equal-expression    )
not-equal-expression     = application-expression   *(not-equal     application-expression  )

; Import expressions need to be separated by some whitespace, otherwise there
; would be ambiguity: `./ab` could be interpreted as "import the file `./ab`",
; or "apply the import `./a` to label `b`"
; The -raw handling is important for greedy parsers, that can't do sequential backtracking.
application-expression = atomic-expression-raw *(nonempty-whitespace atomic-expression-raw) whitespace

; An expression that does not need to be surrounded by parentheses to disambiguate
atomic-expression = atomic-expression-raw whitespace
atomic-expression-raw =
      import-raw
    / selector-expression-raw

; `record.field` extracts one field of a record
; `record.{ field0, field1, field2 }` projects out several fields of a record
;
; NOTE: Backtrack when parsing the `*(dot ...)`.  The reason why is that you
; can't tell from parsing just the period whether "foo." will become "foo.bar"
; (i.e. accessing field `bar` of the record `foo`) or `foo./bar` (i.e. applying
; the function `foo` to the relative path `./bar`)
selector-expression-raw = primitive-expression-raw *(whitespace dot selector-raw)

selector-raw = label-raw / labels-raw

labels-raw = open-brace [ label *(comma label) ] close-brace-raw


primitive-expression-raw =
      literal-expression-raw
    / open-brace record-type-or-literal close-brace-raw
    / open-angle union-type-or-literal  close-angle-raw
    / non-empty-list-literal-raw
    / parenthesized-expression-raw

; NOTE: Backtrack when parsing the first three alternatives (i.e. the numeric
; literals).  This is because they share leading characters in common
literal-expression-raw =
    ; "2.0"
      double-literal-raw
    
    ; "2"
    / natural-literal-raw
    
    ; "+2"
    / integer-literal-raw
    
    ; "-Infinity"
    / minus-infinity-literal
    ; "Infinity"
    / plus-infinity-literal
    ; "NaN"
    / NaN-raw
    
    ; '"ABC"'
    / text-literal-raw
    
    ; "x"
    ; "x@2"
    / identifier-raw

minus-infinity-literal = "-" Infinity-raw
plus-infinity-literal = Infinity-raw

; "{ foo = 1      , bar = True }"
; "{ foo : Integer, bar : Bool }"
record-type-or-literal =
      empty-record-literal
    / non-empty-record-type-or-literal
    / empty-record-type
empty-record-literal = equal
empty-record-type = ""
non-empty-record-type-or-literal =
    label (non-empty-record-literal / non-empty-record-type)
non-empty-record-type    = colon expression *(comma record-type-entry)
record-type-entry = label colon expression
non-empty-record-literal = equal expression *(comma record-literal-entry)
record-literal-entry = label equal expression

; "< Foo : Integer | Bar : Bool >"
; "< Foo : Integer | Bar = True >"
union-type-or-literal =
      non-empty-union-type-or-literal
    / empty-union-type
empty-union-type = ""
non-empty-union-type-or-literal =
    label
    ( equal expression union-type-entries
    / colon expression [ bar non-empty-union-type-or-literal ]
    )
union-type-entries = *(bar union-type-entry)
union-type-entry = label colon expression

; "[1, 2, 3]"
; `empty-list-or-optional` handles empty lists
non-empty-list-literal-raw = open-bracket expression *(comma expression) close-bracket-raw

; "( e )"
parenthesized-expression-raw = open-parens expression close-parens-raw


; All expressions end with trailing whitespace.  This just adds a final
; whitespace prefix for the top-level of the program
complete-expression = whitespace expression