Start rewriting parser in pest

author: Nadrieril 2019-02-28 19:18:04 +0100
committer: Nadrieril 2019-02-28 19:18:04 +0100
commit: 22a5eac0bfb22bfe27973c78ef0e8a9b418ee844 (patch)
tree: d60cfc35762ed3f34ee2d2fd5ff6d9fa866ed8a3 /src
parent: 4267cfef8ad3a929cba9fcc7bbc91b0fe863b0f6 (diff)
2 files changed, 896 insertions, 0 deletions
diff --git a/src/dhall.pest b/src/dhall.pest
new file mode 100644
index 0000000..873428c
--- /dev/null
+++ b/src/dhall.pest
@@ -0,0 +1,836 @@
+/// ; ABNF syntax based on RFC 5234
+/// ;
+/// ; The character encoding for Dhall is UTF-8
+/// ;
+/// ; Some notes on implementing this grammar:
+/// ;
+/// ; First, do not use a lexer to tokenize the file before parsing.  Instead, treat
+/// ; the individual characters of the file as the tokens to feed into the parser.
+/// ; You should not use a lexer because Dhall's grammar supports two features which
+/// ; cannot be correctly supported by a lexer:
+/// ;
+/// ; * String interpolation (i.e. "foo ${Natural/toInteger bar} baz")
+/// ; * Nested block comments (i.e. "{- foo {- bar -} baz -}")
+/// ;
+/// ; Second, this grammar assumes that your parser can backtrack and/or try
+/// ; multiple parses simultaneously.  For example, consider this expression:
+/// ;
+/// ;     List ./MyType
+/// ;
+/// ; A parser might first try to parse the period as the beginning of a field
+/// ; selector, only to realize immediately afterwards that `/MyType` is not a valid
+/// ; name for a field.  A conforming parser must backtrack so that the expression
+/// ; `./MyType` can instead be correctly interpreted as a relative path
+/// ;
+/// ; Third, if there are multiple valid parses then prefer the first parse
+/// ; according to the ordering of alternatives. That is, the order of evaluation
+/// ; of the alternatives is left-to-right.
+/// ;
+/// ; For example, the grammar for single quoted string literals is:
+/// ;
+/// ;     single-quote-continue =
+/// ;           "'''"               single-quote-continue
+/// ;         / "${" complete-expression "}" single-quote-continue
+/// ;         / "''${"              single-quote-continue
+/// ;         / "''"
+/// ;         / %x20-10FFFF         single-quote-continue
+/// ;         / tab                 single-quote-continue
+/// ;         / end-of-line         single-quote-continue
+/// ;
+/// ;         single-quote-literal = "''" single-quote-continue
+/// ;
+/// ; ... which permits valid parses for the following code:
+/// ;
+/// ;     "''''''''''''''''"
+/// ;
+/// ; If you tried to parse all alternatives then there are at least two valid
+/// ; interpretations for the above code:
+/// ;
+/// ; * A single quoted literal with four escape sequences of the form "'''"
+/// ;     * i.e. "''" followed by "'''"  four times in a row followed by "''"
+/// ; * Four empty single quoted literals
+/// ;     * i.e. "''''" four times in a row
+/// ;
+/// ; The correct interpretation is the first one because parsing the escape
+/// ; sequence "'''" takes precedence over parsing the termination sequence "''",
+/// ; according to the order of the alternatives in the `single-quote-continue`
+/// ; rule.
+/// ;
+/// ; Some parsing libraries do not backtrack by default but allow the user to
+/// ; selectively backtrack in certain parts of the grammar.  Usually parsing
+/// ; libraries do this to improve efficiency and error messages.  Dhall's grammar
+/// ; takes that into account by minimizing the number of rules that require the
+/// ; parser to backtrack and comments below will highlight where you need to
+/// ; explicitly backtrack
+/// ;
+/// ; Specifically, if you see an uninterrupted literal in a grammar rule such as:
+/// ;
+/// ;     "->"
+/// ;
+/// ; ... or:
+/// ;
+/// ;     %x66.6f.72.61.6c.6c
+/// ;
+/// ; ... then that string literal is parsed as a single unit, meaning that you
+/// ; should backtrack if you parse only part of the literal
+/// ;
+/// ; In all other cases you can assume that you do not need to backtrack unless
+/// ; there is a comment explicitly asking you to backtrack
+/// ;
+/// ; When parsing a repeated construct, prefer alternatives that parse as many
+/// ; repetitions as possible.  On in other words:
+/// ;
+/// ;     [a] = a / ""
+/// ;
+/// ;     a* = a* a / ""
+/// ;
+/// ; Note that the latter rule also specifies that repetition produces
+/// ; left-associated expressions.  For example, function application is
+/// ; left-associative and all operators are left-associative when they are not
+/// ; parenthesized.
+/// ;
+/// ; Additionally, try alternatives in an order that minimizes backtracking
+/// ; according to the following rule:
+/// ;
+/// ;     (a / b) (c / d) = a c / a d / b c / b d
+/// 
+/// ; NOTE: There are many line endings in the wild
+/// ;
+/// ; See: https://en.wikipedia.org/wiki/Newline
+/// ;
+/// ; For simplicity this supports Unix and Windows line-endings, which are the most
+/// ; common
+/// end-of-line =
+///       %x0A     ; "\n"
+///     / %x0D.0A  ; "\r\n"
+/// 
+/// tab = %x09  ; "\t"
+/// 
+/// block-comment = "{-" block-comment-continue
+/// 
+/// block-comment-chunk =
+///       block-comment
+///     / %x20-10FFFF
+///     / tab
+///     / end-of-line
+/// 
+/// block-comment-continue = "-}" / block-comment-chunk block-comment-continue
+/// 
+/// not-end-of-line = %x20-10FFFF / tab
+/// 
+/// ; NOTE: Slightly different from Haskell-style single-line comments because this
+/// ; does not require a space after the dashes
+/// line-comment = "--" *not-end-of-line end-of-line
+/// 
+/// whitespace-chunk =
+///       " "
+///     / tab
+///     / end-of-line
+///     / line-comment
+///     / block-comment
+whitespace_chunk = _{
+      " "
+    // | tab
+    // | end_of_line
+    // | line_comment
+    // | block_comment
+}
+/// 
+/// whitespace = *whitespace-chunk
+whitespace = _{ whitespace_chunk* }
+/// 
+/// nonempty-whitespace = 1*whitespace-chunk
+nonempty_whitespace = _{ whitespace_chunk+ }
+/// 
+/// ; Uppercase or lowercase ASCII letter
+/// ALPHA = %x41-5A / %x61-7A
+/// 
+/// ; ASCII digit
+/// DIGIT = %x30-39  ; 0-9
+/// 
+/// HEXDIG = DIGIT / "A" / "B" / "C" / "D" / "E" / "F"
+/// 
+/// ; A simple label cannot be one of the following reserved names:
+/// ;
+/// ; * Bool
+/// ; * Optional
+/// ; * None
+/// ; * Natural
+/// ; * Integer
+/// ; * Double
+/// ; * Text
+/// ; * List
+/// ; * True
+/// ; * False
+/// ; * NaN
+/// ; * Infinity
+/// ; * Type
+/// ; * Kind
+/// ; * Sort
+/// ; * Natural/fold
+/// ; * Natural/build
+/// ; * Natural/isZero
+/// ; * Natural/even
+/// ; * Natural/odd
+/// ; * Natural/toInteger
+/// ; * Natural/show
+/// ; * Integer/toDouble
+/// ; * Integer/show
+/// ; * Double/show
+/// ; * List/build
+/// ; * List/fold
+/// ; * List/length
+/// ; * List/head
+/// ; * List/last
+/// ; * List/indexed
+/// ; * List/reverse
+/// ; * Optional/fold
+/// ; * Optional/build
+/// ; * Text/show
+/// ; * if
+/// ; * then
+/// ; * else
+/// ; * let
+/// ; * in
+/// ; * as
+/// ; * using
+/// ; * merge
+/// ; * constructors
+/// ; * Some
+/// simple-label = (ALPHA / "_") *(ALPHA / DIGIT / "-" / "/" / "_")
+/// 
+/// quoted-label = 1*(ALPHA / DIGIT / "-" / "/" / "_" / ":" / "." / "$")
+/// 
+/// ; NOTE: Dhall does not support Unicode labels, mainly to minimize the potential
+/// ; for code obfuscation
+/// label = ("`" quoted-label "`" / simple-label) whitespace
+/// 
+/// ; Dhall's double-quoted strings are equivalent to JSON strings except with
+/// ; support for string interpolation (and escaping string interpolation)
+/// ;
+/// ; Dhall uses almost the same escaping rules as JSON (RFC7159) with one
+/// ; exception: Dhall adds a new `\$` escape sequence for dollar signs.  This
+/// ; additional escape sequences lets you escape string interpolation by writing
+/// ; `\${`
+/// ;
+/// ; > The representation of strings is similar to conventions used in the C
+/// ; > family of programming languages.  A string begins and ends with
+/// ; > quotation marks.  All Unicode characters may be placed within the
+/// ; > quotation marks, except for the characters that must be escaped:
+/// ; > quotation mark, reverse solidus, and the control characters (U+0000
+/// ; > through U+001F).
+/// ; > 
+/// ; > Any character may be escaped.  If the character is in the Basic
+/// ; > Multilingual Plane (U+0000 through U+FFFF), then it may be
+/// ; > represented as a six-character sequence: a reverse solidus, followed
+/// ; > by the lowercase letter u, followed by four hexadecimal digits that
+/// ; > encode the character's code point.  The hexadecimal letters A though
+/// ; > F can be upper or lower case.  So, for example, a string containing
+/// ; > only a single reverse solidus character may be represented as
+/// ; > "\u005C".
+/// ; > 
+/// ; > Alternatively, there are two-character sequence escape
+/// ; > representations of some popular characters.  So, for example, a
+/// ; > string containing only a single reverse solidus character may be
+/// ; > represented more compactly as "\\".
+/// ; > 
+/// ; > To escape an extended character that is not in the Basic Multilingual
+/// ; > Plane, the character is represented as a 12-character sequence,
+/// ; > encoding the UTF-16 surrogate pair.  So, for example, a string
+/// ; > containing only the G clef character (U+1D11E) may be represented as
+/// ; > "\uD834\uDD1E".
+/// double-quote-chunk =
+///       "${" complete-expression "}"  ; Interpolation
+///     / %x5C                 ; '\'    Beginning of escape sequence
+///       ( %x22               ; '"'    quotation mark  U+0022
+///       / %x24               ; '$'    dollar sign     U+0024
+///       / %x5C               ; '\'    reverse solidus U+005C
+///       / %x2F               ; '/'    solidus         U+002F
+///       / %x62               ; 'b'    backspace       U+0008
+///       / %x66               ; 'f'    form feed       U+000C
+///       / %x6E               ; 'n'    line feed       U+000A
+///       / %x72               ; 'r'    carriage return U+000D
+///       / %x74               ; 't'    tab             U+0009
+///       / %x75 4HEXDIG       ; 'uXXXX'                U+XXXX
+///       )
+///     ; Printable characters except double quote and backslash
+///     / %x20-21
+///         ; %x22 = '"'
+///     / %x23-5B
+///         ; %x5C = "\"
+///     / %x5D-10FFFF
+/// 
+/// double-quote-literal = %x22 *double-quote-chunk %x22
+/// 
+/// ; NOTE: The only way to end a single-quote string literal with a single quote is
+/// ; to either interpolate the single quote, like this:
+/// ;
+/// ;     ''ABC${"'"}''
+/// ;
+/// ; ... or concatenate another string, like this:
+/// ;
+/// ;     ''ABC'' ++ "'"
+/// ;
+/// ; If you try to end the string literal with a single quote then you get "'''",
+/// ; which is interpreted as an escaped pair of single quotes
+/// single-quote-continue =
+///       ; Escape two single quotes (i.e. replace this sequence with "''")
+///       "'''"               single-quote-continue
+///       ; Interpolation
+///     / "${" complete-expression "}" single-quote-continue
+///       ; Escape interpolation (i.e. replace this sequence with "${")
+///     / "''${"              single-quote-continue
+///     / "''"                                       ; End of text literal
+///     / %x20-10FFFF         single-quote-continue
+///     / tab                 single-quote-continue
+///     / end-of-line         single-quote-continue
+/// 
+/// single-quote-literal = "''" end-of-line single-quote-continue
+/// 
+/// text-literal = (double-quote-literal / single-quote-literal) whitespace
+/// 
+/// ; RFC 5234 interprets string literals as case-insensitive and recommends using
+/// ; hex instead for case-sensitive strings
+/// ;
+/// ; If you don't feel like reading hex, these are all the same as the rule name,
+/// ; except without the '-raw' ending, and converting dashes in the rule name
+/// ; to forward slashes
+/// if-raw                = %x69.66
+/// then-raw              = %x74.68.65.6e
+/// else-raw              = %x65.6c.73.65
+/// let-raw               = %x6c.65.74
+/// in-raw                = %x69.6e
+/// as-raw                = %x61.73
+/// using-raw             = %x75.73.69.6e.67
+/// merge-raw             = %x6d.65.72.67.65
+/// missing-raw           = %x6d.69.73.73.69.6e.67
+/// Some-raw              = %x53.6f.6d.65
+/// constructors-raw      = %x63.6f.6e.73.74.72.75.63.74.6f.72.73
+/// Natural-fold-raw      = %x4e.61.74.75.72.61.6c.2f.66.6f.6c.64
+/// Natural-build-raw     = %x4e.61.74.75.72.61.6c.2f.62.75.69.6c.64
+/// Natural-isZero-raw    = %x4e.61.74.75.72.61.6c.2f.69.73.5a.65.72.6f
+/// Natural-even-raw      = %x4e.61.74.75.72.61.6c.2f.65.76.65.6e
+/// Natural-odd-raw       = %x4e.61.74.75.72.61.6c.2f.6f.64.64
+/// Natural-toInteger-raw = %x4e.61.74.75.72.61.6c.2f.74.6f.49.6e.74.65.67.65.72
+/// Natural-show-raw      = %x4e.61.74.75.72.61.6c.2f.73.68.6f.77
+/// Integer-toDouble-raw  = %x49.6e.74.65.67.65.72.2f.74.6f.44.6f.75.62.6c.65
+/// Integer-show-raw      = %x49.6e.74.65.67.65.72.2f.73.68.6f.77
+/// Double-show-raw       = %x44.6f.75.62.6c.65.2f.73.68.6f.77
+/// List-build-raw        = %x4c.69.73.74.2f.62.75.69.6c.64
+/// List-fold-raw         = %x4c.69.73.74.2f.66.6f.6c.64
+/// List-length-raw       = %x4c.69.73.74.2f.6c.65.6e.67.74.68
+/// List-head-raw         = %x4c.69.73.74.2f.68.65.61.64
+/// List-last-raw         = %x4c.69.73.74.2f.6c.61.73.74
+/// List-indexed-raw      = %x4c.69.73.74.2f.69.6e.64.65.78.65.64
+/// List-reverse-raw      = %x4c.69.73.74.2f.72.65.76.65.72.73.65
+/// Optional-fold-raw     = %x4f.70.74.69.6f.6e.61.6c.2f.66.6f.6c.64
+/// Optional-build-raw    = %x4f.70.74.69.6f.6e.61.6c.2f.62.75.69.6c.64
+/// Text-show-raw         = %x54.65.78.74.2f.73.68.6f.77
+/// Bool-raw              = %x42.6f.6f.6c
+/// Optional-raw          = %x4f.70.74.69.6f.6e.61.6c
+/// None-raw              = %x4e.6f.6e.65
+/// Natural-raw           = %x4e.61.74.75.72.61.6c
+/// Integer-raw           = %x49.6e.74.65.67.65.72
+/// Double-raw            = %x44.6f.75.62.6c.65
+/// Text-raw              = %x54.65.78.74
+/// List-raw              = %x4c.69.73.74
+/// True-raw              = %x54.72.75.65
+/// False-raw             = %x46.61.6c.73.65
+/// NaN-raw               = %x4e.61.4e
+/// Infinity-raw          = %x49.6e.66.69.6e.69.74.79
+/// Type-raw              = %x54.79.70.65
+/// Kind-raw              = %x4b.69.6e.64
+/// Sort-raw              = %x53.6f.72.74
+/// 
+/// reserved-raw =
+///     Bool-raw
+///   / Optional-raw
+///   / None-raw
+///   / Natural-raw
+///   / Integer-raw
+///   / Double-raw
+///   / Text-raw
+///   / List-raw
+///   / True-raw
+///   / False-raw
+///   / NaN-raw
+///   / Infinity-raw
+///   / Type-raw
+///   / Kind-raw
+///   / Sort-raw
+/// 
+/// reserved-namespaced-raw =
+///     Natural-fold-raw
+///   / Natural-build-raw
+///   / Natural-isZero-raw
+///   / Natural-even-raw
+///   / Natural-odd-raw
+///   / Natural-toInteger-raw
+///   / Natural-show-raw
+///   / Integer-toDouble-raw
+///   / Integer-show-raw
+///   / Double-show-raw
+///   / List-build-raw
+///   / List-fold-raw
+///   / List-length-raw
+///   / List-head-raw
+///   / List-last-raw
+///   / List-indexed-raw
+///   / List-reverse-raw
+///   / Optional-fold-raw
+///   / Optional-build-raw
+///   / Text-show-raw
+/// 
+/// reserved            = reserved-raw            whitespace
+/// reserved-namespaced = reserved-namespaced-raw whitespace
+/// 
+/// ; Whitespaced rules for reserved words, to be used when matching expressions
+/// if           = if-raw           nonempty-whitespace
+/// then         = then-raw         nonempty-whitespace
+/// else         = else-raw         nonempty-whitespace
+/// let          = let-raw          nonempty-whitespace
+/// in           = in-raw           nonempty-whitespace
+/// as           = as-raw           nonempty-whitespace
+/// using        = using-raw        nonempty-whitespace
+/// merge        = merge-raw        nonempty-whitespace
+/// constructors = constructors-raw nonempty-whitespace
+/// Some         = Some-raw         nonempty-whitespace
+/// 
+/// Optional     = Optional-raw     whitespace
+/// Text         = Text-raw         whitespace
+/// List         = List-raw         whitespace
+/// 
+/// equal         = "="  whitespace
+/// or            = "||" whitespace
+/// plus          = "+"  nonempty-whitespace  ; To disambiguate `f +2`
+plus = _{ "+" ~ nonempty_whitespace }
+/// text-append   = "++" whitespace
+/// list-append   = "#"  nonempty-whitespace  ; To disambiguate `http://a/a#a`
+/// and           = "&&" whitespace
+/// times         = "*"  whitespace
+times = _{ "*" ~ nonempty_whitespace }
+/// double-equal  = "==" whitespace
+/// not-equal     = "!=" whitespace
+/// dot           = "."  whitespace
+/// open-brace    = "{"  whitespace
+/// close-brace   = "}"  whitespace
+/// open-bracket  = "["  whitespace
+/// close-bracket = "]"  whitespace
+/// open-angle    = "<"  whitespace
+/// close-angle   = ">"  whitespace
+/// bar           = "|"  whitespace
+/// comma         = ","  whitespace
+/// open-parens   = "("  whitespace
+open_parens = _{ "(" ~ whitespace }
+/// close-parens  = ")"  whitespace
+close_parens = _{ ")" ~ whitespace }
+/// at            = "@"  whitespace
+/// colon         = ":"  nonempty-whitespace  ; To disambiguate `env:VARIABLE` from type annotations
+colon = _{ ":" ~ nonempty_whitespace }
+/// import-alt    = "?"  nonempty-whitespace  ; To disambiguate `http://a/a?a`
+/// 
+/// combine       = ( %x2227 / "/\"                ) whitespace
+/// combine-types = ( %x2A53 / "//\\"              ) whitespace
+/// prefer        = ( %x2AFD / "//"                ) whitespace
+/// lambda        = ( %x3BB  / "\"                 ) whitespace
+/// forall        = ( %x2200 / %x66.6f.72.61.6c.6c ) whitespace
+/// arrow         = ( %x2192 / "->"                ) whitespace
+/// 
+/// exponent = "e" [ "+" / "-" ] 1*DIGIT
+/// 
+/// double-literal = [ "+" / "-" ] 1*DIGIT ( "." 1*DIGIT [ exponent ] / exponent) whitespace
+/// 
+/// natural-literal-raw = 1*DIGIT
+natural_literal_raw = _{ ASCII_DIGIT+ }
+/// 
+/// integer-literal = ( "+" / "-" ) natural-literal-raw whitespace
+/// 
+/// natural-literal = natural-literal-raw whitespace
+natural_literal = { natural_literal_raw ~ whitespace }
+/// 
+/// identifier = label [ at natural-literal-raw whitespace ]
+/// 
+/// identifier-reserved-prefix =
+///     reserved-raw 1*(ALPHA / DIGIT / "-" / "/" / "_") whitespace [ at natural-literal-raw whitespace ]
+/// 
+/// identifier-reserved-namespaced-prefix =
+///     reserved-namespaced-raw 1*(ALPHA / DIGIT / "-" / "/" / "_") whitespace [ at natural-literal-raw whitespace ]
+/// 
+/// missing = missing-raw whitespace
+/// 
+/// ; Printable characters other than " ()[]{}<>/\,"
+/// ;
+/// ; Excluding those characters ensures that paths don't have to end with trailing
+/// ; whitespace most of the time
+/// path-character =
+///         ; %x20 = " "
+///       %x21
+///         ; %x22 = "\""
+///         ; %x23 = "#"
+///     / %x24-27
+///         ; %x28 = "("
+///         ; %x29 = ")"
+///     / %x2A-2B
+///         ; %x2C = ","
+///     / %x2D-2E
+///         ; %x2F = "/"
+///     / %x30-3B
+///         ; %x3C = "<"
+///     / %x3D
+///         ; %x3E = ">"
+///         ; %x3F = "?"
+///     / %x40-5A
+///         ; %x5B = "["
+///         ; %x5C = "\"
+///         ; %x5D = "]"
+///     / %x5E-7A
+///         ; %x7B = "{"
+///     / %x7C
+///         ; %x7D = "}"
+///     / %x7E
+/// 
+/// quoted-path-character =
+///       %x20-21
+///         ; %x22 = "\""
+///     / %x23-2E
+///         ; %x2F = "/"
+///     / %x30-10FFFF
+/// 
+/// 
+/// path-component = "/" ( 1*path-character / %x22 1*quoted-path-character %x22 )
+/// 
+/// directory = *path-component
+/// 
+/// file = path-component
+/// 
+/// local-raw =
+///       ".." directory file  ; Relative path
+///     / "."  directory file  ; Relative path
+///     / "~"  directory file  ; Home-anchored path
+///     ; NOTE: Backtrack if parsing this alternative fails
+///     ;
+///     ; This is because the first character of this alternative will be "/", but
+///     ; if the second character is "/" or "\" then this should have been parsed
+///     ; as an operator instead of a path
+///     / directory file  ; Absolute path
+/// 
+/// local = local-raw whitespace
+/// 
+/// ; `http[s]` URI grammar based on RFC7230 and RFC 3986 with some differences
+/// ; noted below
+/// 
+/// scheme = %x68.74.74.70 [ %x73 ]  ; "http" [ "s" ]
+/// 
+/// ; NOTE: This does not match the official grammar for a URI.  Specifically, this
+/// ; replaces `path-abempty` with `directory file`
+/// http-raw = scheme "://" authority directory file [ "?" query ] [ "#" fragment ]
+/// 
+/// ; NOTE: Backtrack if parsing the optional user info prefix fails
+/// authority = [ userinfo "@" ] host [ ":" port ]
+/// 
+/// userinfo = *( unreserved / pct-encoded / sub-delims / ":" )
+/// 
+/// host = IP-literal / IPv4address / reg-name
+/// 
+/// port = *DIGIT
+/// 
+/// IP-literal = "[" ( IPv6address / IPvFuture  ) "]"
+/// 
+/// IPvFuture = "v" 1*HEXDIG "." 1*( unreserved / sub-delims / ":" )
+/// 
+/// ; NOTE: Backtrack when parsing each alternative
+/// IPv6address =                            6( h16 ":" ) ls32
+///             /                       "::" 5( h16 ":" ) ls32
+///             / [               h16 ] "::" 4( h16 ":" ) ls32
+///             / [ *1( h16 ":" ) h16 ] "::" 3( h16 ":" ) ls32
+///             / [ *2( h16 ":" ) h16 ] "::" 2( h16 ":" ) ls32
+///             / [ *3( h16 ":" ) h16 ] "::"    h16 ":"   ls32
+///             / [ *4( h16 ":" ) h16 ] "::"              ls32
+///             / [ *5( h16 ":" ) h16 ] "::"              h16
+///             / [ *6( h16 ":" ) h16 ] "::"
+/// 
+/// h16 = 1*4HEXDIG
+/// 
+/// ls32 = ( h16 ":" h16 ) / IPv4address
+/// 
+/// IPv4address = dec-octet "." dec-octet "." dec-octet "." dec-octet
+/// 
+/// ; NOTE: Backtrack when parsing these alternatives and try them in reverse order
+/// dec-octet = DIGIT              ; 0-9
+///           / %x31-39 DIGIT      ; 10-99
+///           / "1" 2DIGIT         ; 100-199
+///           / "2" %x30-34 DIGIT  ; 200-249
+///           / "25" %x30-35       ; 250-255
+/// 
+/// reg-name = *( unreserved / pct-encoded / sub-delims )
+/// 
+/// pchar = unreserved / pct-encoded / sub-delims / ":" / "@"
+/// 
+/// query = *( pchar / "/" / "?" )
+/// 
+/// fragment = *( pchar / "/" / "?" )
+/// 
+/// pct-encoded = "%" HEXDIG HEXDIG
+/// 
+/// unreserved  = ALPHA / DIGIT / "-" / "." / "_" / "~"
+/// 
+/// sub-delims = "!" / "$" / "&" / "'" / "(" / ")" / "*" / "+" / "," / ";" / "="
+/// 
+/// http =
+///     http-raw whitespace
+///     [ using (import-hashed / open-parens import-hashed close-parens) ]
+/// 
+/// ; Dhall supports unquoted environment variables that are Bash-compliant or
+/// ; quoted environment variables that are POSIX-compliant
+/// env = "env:"
+///     ( bash-environment-variable
+///     / %x22 posix-environment-variable %x22
+///     )
+///     whitespace
+/// 
+/// ; Bash supports a restricted subset of POSIX environment variables.  From the
+/// ; Bash `man` page, an environment variable name is:
+/// ;
+/// ; > A word consisting only of  alphanumeric  characters  and  under-scores,  and
+/// ; > beginning with an alphabetic character or an under-score
+/// bash-environment-variable = (ALPHA / "_") *(ALPHA / DIGIT / "_")
+/// 
+/// ; The POSIX standard is significantly more flexible about legal environment
+/// ; variable names, which can contain alerts (i.e. '\a'), whitespace, or
+/// ; punctuation, for example.  The POSIX standard says about environment variable
+/// ; names:
+/// ;
+/// ; > The value of an environment variable is a string of characters. For a
+/// ; > C-language program, an array of strings called the environment shall be made
+/// ; > available when a process begins. The array is pointed to by the external
+/// ; > variable environ, which is defined as:
+/// ; >
+/// ; >     extern char **environ;
+/// ; >
+/// ; > These strings have the form name=value; names shall not contain the
+/// ; > character '='. For values to be portable across systems conforming to IEEE
+/// ; > Std 1003.1-2001, the value shall be composed of characters from the portable
+/// ; > character set (except NUL and as indicated below).
+/// ;
+/// ; Note that the standard does not explicitly state that the name must have at
+/// ; least one character, but `env` does not appear to support this and `env`
+/// ; claims to be POSIX-compliant.  To be safe, Dhall requires at least one
+/// ; character like `env`
+/// posix-environment-variable = 1*posix-environment-variable-character
+/// 
+/// ; These are all the characters from the POSIX Portable Character Set except for 
+/// ; '\0' (NUL) and '='.  Note that the POSIX standard does not explicitly state
+/// ; that environment variable names cannot have NUL.  However, this is implicit
+/// ; in the fact that environment variables are passed to the program as
+/// ; NUL-terminated `name=value` strings, which implies that the `name` portion of
+/// ; the string cannot have NUL characters
+/// posix-environment-variable-character =
+///       %x5C                 ; '\'    Beginning of escape sequence
+///       ( %x22               ; '"'    quotation mark  U+0022
+///       / %x5C               ; '\'    reverse solidus U+005C
+///       / %x61               ; 'a'    alert           U+0007
+///       / %x62               ; 'b'    backspace       U+0008
+///       / %x66               ; 'f'    form feed       U+000C
+///       / %x6E               ; 'n'    line feed       U+000A
+///       / %x72               ; 'r'    carriage return U+000D
+///       / %x74               ; 't'    tab             U+0009
+///       / %x76               ; 'v'    vertical tab    U+000B
+///       )
+///     ; Printable characters except double quote, backslash and equals
+///     / %x20-21
+///         ; %x22 = '"'
+///     / %x23-3C
+///         ; %x3D = '='
+///     / %x3E-5B
+///         ; %x5C = "\"
+///     / %x5D-7E
+/// 
+/// import-type = missing / local / http / env
+/// 
+/// hash = %x73.68.61.32.35.36.3a 64HEXDIG whitespace  ; "sha256:XXX...XXX"
+/// 
+/// import-hashed = import-type [ hash ]
+/// 
+/// ; "http://example.com"
+/// ; "./foo/bar"
+/// ; "env:FOO"
+/// import = import-hashed [ as Text ]
+/// 
+/// ; NOTE: Every rule past this point should only reference rules that end with
+/// ; whitespace.  This ensures consistent handling of whitespace in the absence of
+/// ; a separate lexing step
+/// 
+/// expression =
+///     ; "\(x : a) -> b"
+///       lambda open-parens label colon expression close-parens arrow expression
+///     
+///     ; "if a then b else c"
+///     / if expression then expression else expression
+///     
+///     ; "let x : t = e1 in e2"
+///     ; "let x     = e1 in e2"
+///     ; "let x = e1 let y = e2 in e3"
+///     / 1*(let label [ colon expression ] equal expression) in expression
+///     
+///     ; "forall (x : a) -> b"
+///     / forall open-parens label colon expression close-parens arrow expression
+///     
+///     ; "a -> b"
+///     ;
+///     ; NOTE: Backtrack if parsing this alternative fails
+///     / operator-expression arrow expression
+///     
+///     / annotated-expression
+expression = _{ annotated_expression }
+/// 
+/// annotated-expression =
+///     ; "merge e1 e2 : t"
+///     ; "merge e1 e2"
+///       merge import-expression import-expression [ colon application-expression ]
+///     
+///     ; "[]  : List     t"
+///     ; "[]  : Optional t"
+///     ; "[x] : Optional t"
+///     ;
+///     ; NOTE: Backtrack if parsing this alternative fails since we can't tell
+///     ; from the opening bracket whether or not this will be an empty list or
+///     ; non-empty list
+///     / open-bracket (empty-collection / non-empty-optional)
+///     
+///     ; "x : t"
+///     / operator-expression (colon expression / "")
+annotated_expression = _{ operator_expression ~ (colon ~ expression)? }
+/// 
+/// empty-collection = close-bracket colon (List / Optional) import-expression
+/// 
+/// non-empty-optional = expression close-bracket colon Optional import-expression
+/// 
+/// operator-expression = import-alt-expression
+operator_expression = _{ plus_expression }
+/// 
+/// import-alt-expression    = or-expression            *(import-alt    or-expression)
+// import_alt_expression = { application_expression }
+/// or-expression            = plus-expression          *(or            plus-expression         )
+/// plus-expression          = text-append-expression   *(plus          text-append-expression  )
+plus_expression = { times_expression ~ (plus ~ times_expression)* }
+/// text-append-expression   = list-append-expression   *(text-append   list-append-expression  )
+/// list-append-expression   = and-expression           *(list-append   and-expression          )
+/// and-expression           = combine-expression       *(and           combine-expression      )
+/// combine-expression       = prefer-expression        *(combine       prefer-expression       )
+/// prefer-expression        = combine-types-expression *(prefer        combine-types-expression)
+/// combine-types-expression = times-expression         *(combine-types times-expression        )
+/// times-expression         = equal-expression         *(times         equal-expression        )
+times_expression = { primitive_expression ~ (times ~ primitive_expression)* }
+/// equal-expression         = not-equal-expression     *(double-equal  not-equal-expression    )
+/// not-equal-expression     = application-expression   *(not-equal     application-expression  )
+/// 
+/// ; Import expressions need to be separated by some whitespace, otherwise there
+/// ; would be ambiguity: `./ab` could be interpreted as "import the file `./ab`",
+/// ; or "apply the import `./a` to label `b`"
+/// application-expression =
+///     [ constructors / Some ] import-expression *(whitespace-chunk import-expression)
+/// 
+/// import-expression = import / selector-expression
+/// 
+/// ; `record.field` extracts one field of a record
+/// ;
+/// ; `record.{ field0, field1, field2 }` projects out several fields of a record
+/// ;
+/// ; NOTE: Backtrack when parsing the `*(dot ...)`.  The reason why is that you
+/// ; can't tell from parsing just the period whether "foo." will become "foo.bar"
+/// ; (i.e. accessing field `bar` of the record `foo`) or `foo./bar` (i.e. applying
+/// ; the function `foo` to the relative path `./bar`)
+/// selector-expression = primitive-expression *(dot ( label / labels ))
+/// 
+/// ; NOTE: Backtrack when parsing the first three alternatives (i.e. the numeric
+/// ; literals).  This is because they share leading characters in common
+/// 
+/// ; NOTE: The reason why we have three different types of identifiers (that is:
+/// ; identifier, identifier-reserved-prefix, identifier-reserved-namespaced-prefix)
+/// ; is that it's the only way to parse correctly identifiers that start with reserved
+/// ; words, other than using a lexer and use the longest match rule.
+/// ;
+/// ; Since reserved words can include themselves (e.g. 'List/build' includes 'List'),
+/// ; we have to match the "namespaced" reserved words before the identifiers prefixed
+/// ; by a reserved word.
+/// primitive-expression =
+///     ; "2.0"
+///       double-literal
+///     
+///     ; "2"
+///     / natural-literal
+///     
+///     ; "+2"
+///     / integer-literal
+///     
+///     ; "-Infinity"
+///     / "-" Infinity-raw whitespace
+///     
+///     ; '"ABC"'
+///     / text-literal
+///     
+///     ; "{ foo = 1      , bar = True }"
+///     ; "{ foo : Integer, bar : Bool }"
+///     / open-brace record-type-or-literal close-brace
+///     
+///     ; "< Foo : Integer | Bar : Bool >"
+///     ; "< Foo : Integer | Bar = True >"
+///     / open-angle union-type-or-literal  close-angle
+///     
+///     ; "[1, 2, 3]"
+///     / non-empty-list-literal  ; `annotated-expression` handles empty lists
+///     
+///     ; "List/foldWith"
+///     / identifier-reserved-namespaced-prefix
+///     
+///     ; "List/head"
+///     / reserved-namespaced
+///     
+///     ; "List/map"
+///     ; "TypeDefinition"
+///     / identifier-reserved-prefix
+///     
+///     ; "List"
+///     / reserved
+///     
+///     ; "x"
+///     ; "x@2"
+///     / identifier
+///     
+///     ; "( e )"
+///     / open-parens expression close-parens
+primitive_expression = _{
+    natural_literal
+    | open_parens ~ expression ~ close_parens
+}
+/// 
+/// labels = open-brace (  label *(comma label) / "" ) close-brace
+/// 
+/// record-type-or-literal =
+///       equal                             ; Empty record literal
+///     / non-empty-record-type-or-literal
+///     / ""                                ; Empty record type
+/// 
+/// non-empty-record-type-or-literal =
+///     label (non-empty-record-literal / non-empty-record-type)
+/// 
+/// non-empty-record-type    = colon expression *(comma label colon expression)
+/// non-empty-record-literal = equal expression *(comma label equal expression)
+/// 
+/// union-type-or-literal =
+///       non-empty-union-type-or-literal
+///     / ""                               ; Empty union type
+/// 
+/// non-empty-union-type-or-literal =
+///     label
+///     ( equal expression *(bar label colon expression)
+///     / colon expression (bar non-empty-union-type-or-literal / "")
+///     )
+/// 
+/// non-empty-list-literal = open-bracket expression *(comma expression) close-bracket
+/// 
+/// ; All expressions end with trailing whitespace.  This just adds a final
+/// ; whitespace prefix for the top-level of the program
+/// complete-expression = whitespace expression
+
+complete_expression = _{ SOI ~ whitespace ~ expression ~ EOI }
diff --git a/src/parser.rs b/src/parser.rs
index 8416d9b..a0281f4 100644
--- a/src/parser.rs
+++ b/src/parser.rs
@@ -3,6 +3,7 @@ use lalrpop_util;
 use crate::grammar;
 use crate::grammar_util::BoxExpr;
 use crate::lexer::{Lexer, LexicalError, Tok};
+use crate::core::{bx, Expr};
 
 pub type ParseError<'i> = lalrpop_util::ParseError<usize, Tok<'i>, LexicalError>;
 
@@ -10,9 +11,64 @@ pub fn parse_expr(s: &str) -> Result<BoxExpr, ParseError>  {
     grammar::ExprParser::new().parse(Lexer::new(s))
 }
 
+use pest::Parser;
+use pest::error::Error;
+use pest_derive::*;
+
+#[derive(Parser)]
+#[grammar = "dhall.pest"]
+struct DhallParser;
+
+use pest::iterators::Pair;
+fn debug_pair(pair: Pair<Rule>) {
+    fn aux(indent: usize, pair: Pair<Rule>) {
+        let indent_str = "| ".repeat(indent);
+        println!(r#"{}{:?}: "{}""#, indent_str, pair.as_rule(), pair.as_str());
+        for p in pair.into_inner() {
+            aux(indent+1, p);
+        }
+    }
+    aux(0, pair)
+}
+
+pub fn parse_expr_pest(s: &str) -> Result<BoxExpr, Error<Rule>>  {
+    let parsed_expr = DhallParser::parse(Rule::complete_expression, s)?.next().unwrap();
+    debug_pair(parsed_expr.clone());
+    // println!("{}", parsed_expr.clone());
+
+    fn parse_pair(pair: Pair<Rule>) -> BoxExpr {
+        match pair.as_rule() {
+            Rule::natural_literal => bx(Expr::NaturalLit(str::parse(pair.as_str().trim()).unwrap())),
+            Rule::plus_expression => {
+                let mut inner = pair.into_inner().map(parse_pair);
+                let first_expr = inner.next().unwrap();
+                inner.fold(first_expr, |acc, e| bx(Expr::NaturalPlus(acc, e)))
+            }
+            Rule::times_expression => {
+                let mut inner = pair.into_inner().map(parse_pair);
+                let first_expr = inner.next().unwrap();
+                inner.fold(first_expr, |acc, e| bx(Expr::NaturalTimes(acc, e)))
+            }
+            r => panic!("{:?}", r),
+        }
+    }
+
+    Ok(parse_pair(parsed_expr))
+}
+
+
 #[test]
 fn test_parse() {
     use crate::core::Expr::*;
+    let expr = "((22 + 3) * 10)";
+    match parse_expr_pest(expr) {
+        Err(e) => println!("{}", e),
+        ok => println!("{:?}", ok),
+    }
+    println!("{:?}", parse_expr(expr));
+    assert_eq!(parse_expr_pest(expr).unwrap(), parse_expr(expr).unwrap());
+    assert!(false);
+
     println!("test {:?}", parse_expr("3 + 5 * 10"));
     assert!(parse_expr("22").is_ok());
     assert!(parse_expr("(22)").is_ok());
@@ -32,4 +88,8 @@ fn test_parse() {
     println!("{:?}", parse_expr("foo.bar"));
     assert!(parse_expr("foo.bar").is_ok());
     assert!(parse_expr("[] : List Bool").is_ok());
+
+    // println!("{:?}", parse_expr("< Left = True | Right : Natural >"));
+    // println!("{:?}", parse_expr(r#""bl${42}ah""#));
+    // assert!(parse_expr("< Left = True | Right : Natural >").is_ok());
 }
author	Nadrieril	2019-02-28 19:18:04 +0100
committer	Nadrieril	2019-02-28 19:18:04 +0100
commit	22a5eac0bfb22bfe27973c78ef0e8a9b418ee844 (patch)
tree	d60cfc35762ed3f34ee2d2fd5ff6d9fa866ed8a3 /src
parent	4267cfef8ad3a929cba9fcc7bbc91b0fe863b0f6 (diff)