diff options
author | Nadrieril | 2019-02-28 19:18:04 +0100 |
---|---|---|
committer | Nadrieril | 2019-02-28 19:18:04 +0100 |
commit | 22a5eac0bfb22bfe27973c78ef0e8a9b418ee844 (patch) | |
tree | d60cfc35762ed3f34ee2d2fd5ff6d9fa866ed8a3 /src | |
parent | 4267cfef8ad3a929cba9fcc7bbc91b0fe863b0f6 (diff) |
Start rewriting parser in pest
Diffstat (limited to 'src')
-rw-r--r-- | src/dhall.pest | 836 | ||||
-rw-r--r-- | src/parser.rs | 60 |
2 files changed, 896 insertions, 0 deletions
diff --git a/src/dhall.pest b/src/dhall.pest new file mode 100644 index 0000000..873428c --- /dev/null +++ b/src/dhall.pest @@ -0,0 +1,836 @@ +/// ; ABNF syntax based on RFC 5234 +/// ; +/// ; The character encoding for Dhall is UTF-8 +/// ; +/// ; Some notes on implementing this grammar: +/// ; +/// ; First, do not use a lexer to tokenize the file before parsing. Instead, treat +/// ; the individual characters of the file as the tokens to feed into the parser. +/// ; You should not use a lexer because Dhall's grammar supports two features which +/// ; cannot be correctly supported by a lexer: +/// ; +/// ; * String interpolation (i.e. "foo ${Natural/toInteger bar} baz") +/// ; * Nested block comments (i.e. "{- foo {- bar -} baz -}") +/// ; +/// ; Second, this grammar assumes that your parser can backtrack and/or try +/// ; multiple parses simultaneously. For example, consider this expression: +/// ; +/// ; List ./MyType +/// ; +/// ; A parser might first try to parse the period as the beginning of a field +/// ; selector, only to realize immediately afterwards that `/MyType` is not a valid +/// ; name for a field. A conforming parser must backtrack so that the expression +/// ; `./MyType` can instead be correctly interpreted as a relative path +/// ; +/// ; Third, if there are multiple valid parses then prefer the first parse +/// ; according to the ordering of alternatives. That is, the order of evaluation +/// ; of the alternatives is left-to-right. +/// ; +/// ; For example, the grammar for single quoted string literals is: +/// ; +/// ; single-quote-continue = +/// ; "'''" single-quote-continue +/// ; / "${" complete-expression "}" single-quote-continue +/// ; / "''${" single-quote-continue +/// ; / "''" +/// ; / %x20-10FFFF single-quote-continue +/// ; / tab single-quote-continue +/// ; / end-of-line single-quote-continue +/// ; +/// ; single-quote-literal = "''" single-quote-continue +/// ; +/// ; ... 
which permits valid parses for the following code: +/// ; +/// ; "''''''''''''''''" +/// ; +/// ; If you tried to parse all alternatives then there are at least two valid +/// ; interpretations for the above code: +/// ; +/// ; * A single quoted literal with four escape sequences of the form "'''" +/// ; * i.e. "''" followed by "'''" four times in a row followed by "''" +/// ; * Four empty single quoted literals +/// ; * i.e. "''''" four times in a row +/// ; +/// ; The correct interpretation is the first one because parsing the escape +/// ; sequence "'''" takes precedence over parsing the termination sequence "''", +/// ; according to the order of the alternatives in the `single-quote-continue` +/// ; rule. +/// ; +/// ; Some parsing libraries do not backtrack by default but allow the user to +/// ; selectively backtrack in certain parts of the grammar. Usually parsing +/// ; libraries do this to improve efficiency and error messages. Dhall's grammar +/// ; takes that into account by minimizing the number of rules that require the +/// ; parser to backtrack and comments below will highlight where you need to +/// ; explicitly backtrack +/// ; +/// ; Specifically, if you see an uninterrupted literal in a grammar rule such as: +/// ; +/// ; "->" +/// ; +/// ; ... or: +/// ; +/// ; %x66.6f.72.61.6c.6c +/// ; +/// ; ... then that string literal is parsed as a single unit, meaning that you +/// ; should backtrack if you parse only part of the literal +/// ; +/// ; In all other cases you can assume that you do not need to backtrack unless +/// ; there is a comment explicitly asking you to backtrack +/// ; +/// ; When parsing a repeated construct, prefer alternatives that parse as many +/// ; repetitions as possible. Or in other words: +/// ; +/// ; [a] = a / "" +/// ; +/// ; a* = a* a / "" +/// ; +/// ; Note that the latter rule also specifies that repetition produces +/// ; left-associated expressions.
For example, function application is +/// ; left-associative and all operators are left-associative when they are not +/// ; parenthesized. +/// ; +/// ; Additionally, try alternatives in an order that minimizes backtracking +/// ; according to the following rule: +/// ; +/// ; (a / b) (c / d) = a c / a d / b c / b d +/// +/// ; NOTE: There are many line endings in the wild +/// ; +/// ; See: https://en.wikipedia.org/wiki/Newline +/// ; +/// ; For simplicity this supports Unix and Windows line-endings, which are the most +/// ; common +/// end-of-line = +/// %x0A ; "\n" +/// / %x0D.0A ; "\r\n" +/// +/// tab = %x09 ; "\t" +/// +/// block-comment = "{-" block-comment-continue +/// +/// block-comment-chunk = +/// block-comment +/// / %x20-10FFFF +/// / tab +/// / end-of-line +/// +/// block-comment-continue = "-}" / block-comment-chunk block-comment-continue +/// +/// not-end-of-line = %x20-10FFFF / tab +/// +/// ; NOTE: Slightly different from Haskell-style single-line comments because this +/// ; does not require a space after the dashes +/// line-comment = "--" *not-end-of-line end-of-line +/// +/// whitespace-chunk = +/// " " +/// / tab +/// / end-of-line +/// / line-comment +/// / block-comment +whitespace_chunk = _{ + " " + // | tab + // | end_of_line + // | line_comment + // | block_comment +} +/// +/// whitespace = *whitespace-chunk +whitespace = _{ whitespace_chunk* } +/// +/// nonempty-whitespace = 1*whitespace-chunk +nonempty_whitespace = _{ whitespace_chunk+ } +/// +/// ; Uppercase or lowercase ASCII letter +/// ALPHA = %x41-5A / %x61-7A +/// +/// ; ASCII digit +/// DIGIT = %x30-39 ; 0-9 +/// +/// HEXDIG = DIGIT / "A" / "B" / "C" / "D" / "E" / "F" +/// +/// ; A simple label cannot be one of the following reserved names: +/// ; +/// ; * Bool +/// ; * Optional +/// ; * None +/// ; * Natural +/// ; * Integer +/// ; * Double +/// ; * Text +/// ; * List +/// ; * True +/// ; * False +/// ; * NaN +/// ; * Infinity +/// ; * Type +/// ; * Kind +/// ; * Sort +/// ; 
* Natural/fold +/// ; * Natural/build +/// ; * Natural/isZero +/// ; * Natural/even +/// ; * Natural/odd +/// ; * Natural/toInteger +/// ; * Natural/show +/// ; * Integer/toDouble +/// ; * Integer/show +/// ; * Double/show +/// ; * List/build +/// ; * List/fold +/// ; * List/length +/// ; * List/head +/// ; * List/last +/// ; * List/indexed +/// ; * List/reverse +/// ; * Optional/fold +/// ; * Optional/build +/// ; * Text/show +/// ; * if +/// ; * then +/// ; * else +/// ; * let +/// ; * in +/// ; * as +/// ; * using +/// ; * merge +/// ; * constructors +/// ; * Some +/// simple-label = (ALPHA / "_") *(ALPHA / DIGIT / "-" / "/" / "_") +/// +/// quoted-label = 1*(ALPHA / DIGIT / "-" / "/" / "_" / ":" / "." / "$") +/// +/// ; NOTE: Dhall does not support Unicode labels, mainly to minimize the potential +/// ; for code obfuscation +/// label = ("`" quoted-label "`" / simple-label) whitespace +/// +/// ; Dhall's double-quoted strings are equivalent to JSON strings except with +/// ; support for string interpolation (and escaping string interpolation) +/// ; +/// ; Dhall uses almost the same escaping rules as JSON (RFC7159) with one +/// ; exception: Dhall adds a new `\$` escape sequence for dollar signs. This +/// ; additional escape sequence lets you escape string interpolation by writing +/// ; `\${` +/// ; +/// ; > The representation of strings is similar to conventions used in the C +/// ; > family of programming languages. A string begins and ends with +/// ; > quotation marks. All Unicode characters may be placed within the +/// ; > quotation marks, except for the characters that must be escaped: +/// ; > quotation mark, reverse solidus, and the control characters (U+0000 +/// ; > through U+001F). +/// ; > +/// ; > Any character may be escaped.
If the character is in the Basic +/// ; > Multilingual Plane (U+0000 through U+FFFF), then it may be +/// ; > represented as a six-character sequence: a reverse solidus, followed +/// ; > by the lowercase letter u, followed by four hexadecimal digits that +/// ; > encode the character's code point. The hexadecimal letters A though +/// ; > F can be upper or lower case. So, for example, a string containing +/// ; > only a single reverse solidus character may be represented as +/// ; > "\u005C". +/// ; > +/// ; > Alternatively, there are two-character sequence escape +/// ; > representations of some popular characters. So, for example, a +/// ; > string containing only a single reverse solidus character may be +/// ; > represented more compactly as "\\". +/// ; > +/// ; > To escape an extended character that is not in the Basic Multilingual +/// ; > Plane, the character is represented as a 12-character sequence, +/// ; > encoding the UTF-16 surrogate pair. So, for example, a string +/// ; > containing only the G clef character (U+1D11E) may be represented as +/// ; > "\uD834\uDD1E". +/// double-quote-chunk = +/// "${" complete-expression "}" ; Interpolation +/// / %x5C ; '\' Beginning of escape sequence +/// ( %x22 ; '"' quotation mark U+0022 +/// / %x24 ; '$' dollar sign U+0024 +/// / %x5C ; '\' reverse solidus U+005C +/// / %x2F ; '/' solidus U+002F +/// / %x62 ; 'b' backspace U+0008 +/// / %x66 ; 'f' form feed U+000C +/// / %x6E ; 'n' line feed U+000A +/// / %x72 ; 'r' carriage return U+000D +/// / %x74 ; 't' tab U+0009 +/// / %x75 4HEXDIG ; 'uXXXX' U+XXXX +/// ) +/// ; Printable characters except double quote and backslash +/// / %x20-21 +/// ; %x22 = '"' +/// / %x23-5B +/// ; %x5C = "\" +/// / %x5D-10FFFF +/// +/// double-quote-literal = %x22 *double-quote-chunk %x22 +/// +/// ; NOTE: The only way to end a single-quote string literal with a single quote is +/// ; to either interpolate the single quote, like this: +/// ; +/// ; ''ABC${"'"}'' +/// ; +/// ; ... 
or concatenate another string, like this: +/// ; +/// ; ''ABC'' ++ "'" +/// ; +/// ; If you try to end the string literal with a single quote then you get "'''", +/// ; which is interpreted as an escaped pair of single quotes +/// single-quote-continue = +/// ; Escape two single quotes (i.e. replace this sequence with "''") +/// "'''" single-quote-continue +/// ; Interpolation +/// / "${" complete-expression "}" single-quote-continue +/// ; Escape interpolation (i.e. replace this sequence with "${") +/// / "''${" single-quote-continue +/// / "''" ; End of text literal +/// / %x20-10FFFF single-quote-continue +/// / tab single-quote-continue +/// / end-of-line single-quote-continue +/// +/// single-quote-literal = "''" end-of-line single-quote-continue +/// +/// text-literal = (double-quote-literal / single-quote-literal) whitespace +/// +/// ; RFC 5234 interprets string literals as case-insensitive and recommends using +/// ; hex instead for case-sensitive strings +/// ; +/// ; If you don't feel like reading hex, these are all the same as the rule name, +/// ; except without the '-raw' ending, and converting dashes in the rule name +/// ; to forward slashes +/// if-raw = %x69.66 +/// then-raw = %x74.68.65.6e +/// else-raw = %x65.6c.73.65 +/// let-raw = %x6c.65.74 +/// in-raw = %x69.6e +/// as-raw = %x61.73 +/// using-raw = %x75.73.69.6e.67 +/// merge-raw = %x6d.65.72.67.65 +/// missing-raw = %x6d.69.73.73.69.6e.67 +/// Some-raw = %x53.6f.6d.65 +/// constructors-raw = %x63.6f.6e.73.74.72.75.63.74.6f.72.73 +/// Natural-fold-raw = %x4e.61.74.75.72.61.6c.2f.66.6f.6c.64 +/// Natural-build-raw = %x4e.61.74.75.72.61.6c.2f.62.75.69.6c.64 +/// Natural-isZero-raw = %x4e.61.74.75.72.61.6c.2f.69.73.5a.65.72.6f +/// Natural-even-raw = %x4e.61.74.75.72.61.6c.2f.65.76.65.6e +/// Natural-odd-raw = %x4e.61.74.75.72.61.6c.2f.6f.64.64 +/// Natural-toInteger-raw = %x4e.61.74.75.72.61.6c.2f.74.6f.49.6e.74.65.67.65.72 +/// Natural-show-raw = %x4e.61.74.75.72.61.6c.2f.73.68.6f.77 +/// 
Integer-toDouble-raw = %x49.6e.74.65.67.65.72.2f.74.6f.44.6f.75.62.6c.65 +/// Integer-show-raw = %x49.6e.74.65.67.65.72.2f.73.68.6f.77 +/// Double-show-raw = %x44.6f.75.62.6c.65.2f.73.68.6f.77 +/// List-build-raw = %x4c.69.73.74.2f.62.75.69.6c.64 +/// List-fold-raw = %x4c.69.73.74.2f.66.6f.6c.64 +/// List-length-raw = %x4c.69.73.74.2f.6c.65.6e.67.74.68 +/// List-head-raw = %x4c.69.73.74.2f.68.65.61.64 +/// List-last-raw = %x4c.69.73.74.2f.6c.61.73.74 +/// List-indexed-raw = %x4c.69.73.74.2f.69.6e.64.65.78.65.64 +/// List-reverse-raw = %x4c.69.73.74.2f.72.65.76.65.72.73.65 +/// Optional-fold-raw = %x4f.70.74.69.6f.6e.61.6c.2f.66.6f.6c.64 +/// Optional-build-raw = %x4f.70.74.69.6f.6e.61.6c.2f.62.75.69.6c.64 +/// Text-show-raw = %x54.65.78.74.2f.73.68.6f.77 +/// Bool-raw = %x42.6f.6f.6c +/// Optional-raw = %x4f.70.74.69.6f.6e.61.6c +/// None-raw = %x4e.6f.6e.65 +/// Natural-raw = %x4e.61.74.75.72.61.6c +/// Integer-raw = %x49.6e.74.65.67.65.72 +/// Double-raw = %x44.6f.75.62.6c.65 +/// Text-raw = %x54.65.78.74 +/// List-raw = %x4c.69.73.74 +/// True-raw = %x54.72.75.65 +/// False-raw = %x46.61.6c.73.65 +/// NaN-raw = %x4e.61.4e +/// Infinity-raw = %x49.6e.66.69.6e.69.74.79 +/// Type-raw = %x54.79.70.65 +/// Kind-raw = %x4b.69.6e.64 +/// Sort-raw = %x53.6f.72.74 +/// +/// reserved-raw = +/// Bool-raw +/// / Optional-raw +/// / None-raw +/// / Natural-raw +/// / Integer-raw +/// / Double-raw +/// / Text-raw +/// / List-raw +/// / True-raw +/// / False-raw +/// / NaN-raw +/// / Infinity-raw +/// / Type-raw +/// / Kind-raw +/// / Sort-raw +/// +/// reserved-namespaced-raw = +/// Natural-fold-raw +/// / Natural-build-raw +/// / Natural-isZero-raw +/// / Natural-even-raw +/// / Natural-odd-raw +/// / Natural-toInteger-raw +/// / Natural-show-raw +/// / Integer-toDouble-raw +/// / Integer-show-raw +/// / Double-show-raw +/// / List-build-raw +/// / List-fold-raw +/// / List-length-raw +/// / List-head-raw +/// / List-last-raw +/// / List-indexed-raw +/// / List-reverse-raw 
+/// / Optional-fold-raw +/// / Optional-build-raw +/// / Text-show-raw +/// +/// reserved = reserved-raw whitespace +/// reserved-namespaced = reserved-namespaced-raw whitespace +/// +/// ; Whitespaced rules for reserved words, to be used when matching expressions +/// if = if-raw nonempty-whitespace +/// then = then-raw nonempty-whitespace +/// else = else-raw nonempty-whitespace +/// let = let-raw nonempty-whitespace +/// in = in-raw nonempty-whitespace +/// as = as-raw nonempty-whitespace +/// using = using-raw nonempty-whitespace +/// merge = merge-raw nonempty-whitespace +/// constructors = constructors-raw nonempty-whitespace +/// Some = Some-raw nonempty-whitespace +/// +/// Optional = Optional-raw whitespace +/// Text = Text-raw whitespace +/// List = List-raw whitespace +/// +/// equal = "=" whitespace +/// or = "||" whitespace +/// plus = "+" nonempty-whitespace ; To disambiguate `f +2` +plus = _{ "+" ~ nonempty_whitespace } +/// text-append = "++" whitespace +/// list-append = "#" nonempty-whitespace ; To disambiguate `http://a/a#a` +/// and = "&&" whitespace +/// times = "*" whitespace +times = _{ "*" ~ nonempty_whitespace } +/// double-equal = "==" whitespace +/// not-equal = "!=" whitespace +/// dot = "." whitespace +/// open-brace = "{" whitespace +/// close-brace = "}" whitespace +/// open-bracket = "[" whitespace +/// close-bracket = "]" whitespace +/// open-angle = "<" whitespace +/// close-angle = ">" whitespace +/// bar = "|" whitespace +/// comma = "," whitespace +/// open-parens = "(" whitespace +open_parens = _{ "(" ~ whitespace } +/// close-parens = ")" whitespace +close_parens = _{ ")" ~ whitespace } +/// at = "@" whitespace +/// colon = ":" nonempty-whitespace ; To disambiguate `env:VARIABLE` from type annotations +colon = _{ ":" ~ nonempty_whitespace } +/// import-alt = "?" 
nonempty-whitespace ; To disambiguate `http://a/a?a` +/// +/// combine = ( %x2227 / "/\" ) whitespace +/// combine-types = ( %x2A53 / "//\\" ) whitespace +/// prefer = ( %x2AFD / "//" ) whitespace +/// lambda = ( %x3BB / "\" ) whitespace +/// forall = ( %x2200 / %x66.6f.72.61.6c.6c ) whitespace +/// arrow = ( %x2192 / "->" ) whitespace +/// +/// exponent = "e" [ "+" / "-" ] 1*DIGIT +/// +/// double-literal = [ "+" / "-" ] 1*DIGIT ( "." 1*DIGIT [ exponent ] / exponent) whitespace +/// +/// natural-literal-raw = 1*DIGIT +natural_literal_raw = _{ ASCII_DIGIT+ } +/// +/// integer-literal = ( "+" / "-" ) natural-literal-raw whitespace +/// +/// natural-literal = natural-literal-raw whitespace +natural_literal = { natural_literal_raw ~ whitespace } +/// +/// identifier = label [ at natural-literal-raw whitespace ] +/// +/// identifier-reserved-prefix = +/// reserved-raw 1*(ALPHA / DIGIT / "-" / "/" / "_") whitespace [ at natural-literal-raw whitespace ] +/// +/// identifier-reserved-namespaced-prefix = +/// reserved-namespaced-raw 1*(ALPHA / DIGIT / "-" / "/" / "_") whitespace [ at natural-literal-raw whitespace ] +/// +/// missing = missing-raw whitespace +/// +/// ; Printable characters other than " ()[]{}<>/\," +/// ; +/// ; Excluding those characters ensures that paths don't have to end with trailing +/// ; whitespace most of the time +/// path-character = +/// ; %x20 = " " +/// %x21 +/// ; %x22 = "\"" +/// ; %x23 = "#" +/// / %x24-27 +/// ; %x28 = "(" +/// ; %x29 = ")" +/// / %x2A-2B +/// ; %x2C = "," +/// / %x2D-2E +/// ; %x2F = "/" +/// / %x30-3B +/// ; %x3C = "<" +/// / %x3D +/// ; %x3E = ">" +/// ; %x3F = "?" 
+/// / %x40-5A +/// ; %x5B = "[" +/// ; %x5C = "\" +/// ; %x5D = "]" +/// / %x5E-7A +/// ; %x7B = "{" +/// / %x7C +/// ; %x7D = "}" +/// / %x7E +/// +/// quoted-path-character = +/// %x20-21 +/// ; %x22 = "\"" +/// / %x23-2E +/// ; %x2F = "/" +/// / %x30-10FFFF +/// +/// +/// path-component = "/" ( 1*path-character / %x22 1*quoted-path-character %x22 ) +/// +/// directory = *path-component +/// +/// file = path-component +/// +/// local-raw = +/// ".." directory file ; Relative path +/// / "." directory file ; Relative path +/// / "~" directory file ; Home-anchored path +/// ; NOTE: Backtrack if parsing this alternative fails +/// ; +/// ; This is because the first character of this alternative will be "/", but +/// ; if the second character is "/" or "\" then this should have been parsed +/// ; as an operator instead of a path +/// / directory file ; Absolute path +/// +/// local = local-raw whitespace +/// +/// ; `http[s]` URI grammar based on RFC7230 and RFC 3986 with some differences +/// ; noted below +/// +/// scheme = %x68.74.74.70 [ %x73 ] ; "http" [ "s" ] +/// +/// ; NOTE: This does not match the official grammar for a URI. Specifically, this +/// ; replaces `path-abempty` with `directory file` +/// http-raw = scheme "://" authority directory file [ "?" query ] [ "#" fragment ] +/// +/// ; NOTE: Backtrack if parsing the optional user info prefix fails +/// authority = [ userinfo "@" ] host [ ":" port ] +/// +/// userinfo = *( unreserved / pct-encoded / sub-delims / ":" ) +/// +/// host = IP-literal / IPv4address / reg-name +/// +/// port = *DIGIT +/// +/// IP-literal = "[" ( IPv6address / IPvFuture ) "]" +/// +/// IPvFuture = "v" 1*HEXDIG "." 
1*( unreserved / sub-delims / ":" ) +/// +/// ; NOTE: Backtrack when parsing each alternative +/// IPv6address = 6( h16 ":" ) ls32 +/// / "::" 5( h16 ":" ) ls32 +/// / [ h16 ] "::" 4( h16 ":" ) ls32 +/// / [ *1( h16 ":" ) h16 ] "::" 3( h16 ":" ) ls32 +/// / [ *2( h16 ":" ) h16 ] "::" 2( h16 ":" ) ls32 +/// / [ *3( h16 ":" ) h16 ] "::" h16 ":" ls32 +/// / [ *4( h16 ":" ) h16 ] "::" ls32 +/// / [ *5( h16 ":" ) h16 ] "::" h16 +/// / [ *6( h16 ":" ) h16 ] "::" +/// +/// h16 = 1*4HEXDIG +/// +/// ls32 = ( h16 ":" h16 ) / IPv4address +/// +/// IPv4address = dec-octet "." dec-octet "." dec-octet "." dec-octet +/// +/// ; NOTE: Backtrack when parsing these alternatives and try them in reverse order +/// dec-octet = DIGIT ; 0-9 +/// / %x31-39 DIGIT ; 10-99 +/// / "1" 2DIGIT ; 100-199 +/// / "2" %x30-34 DIGIT ; 200-249 +/// / "25" %x30-35 ; 250-255 +/// +/// reg-name = *( unreserved / pct-encoded / sub-delims ) +/// +/// pchar = unreserved / pct-encoded / sub-delims / ":" / "@" +/// +/// query = *( pchar / "/" / "?" ) +/// +/// fragment = *( pchar / "/" / "?" ) +/// +/// pct-encoded = "%" HEXDIG HEXDIG +/// +/// unreserved = ALPHA / DIGIT / "-" / "." / "_" / "~" +/// +/// sub-delims = "!" / "$" / "&" / "'" / "(" / ")" / "*" / "+" / "," / ";" / "=" +/// +/// http = +/// http-raw whitespace +/// [ using (import-hashed / open-parens import-hashed close-parens) ] +/// +/// ; Dhall supports unquoted environment variables that are Bash-compliant or +/// ; quoted environment variables that are POSIX-compliant +/// env = "env:" +/// ( bash-environment-variable +/// / %x22 posix-environment-variable %x22 +/// ) +/// whitespace +/// +/// ; Bash supports a restricted subset of POSIX environment variables. 
From the +/// ; Bash `man` page, an environment variable name is: +/// ; +/// ; > A word consisting only of alphanumeric characters and under-scores, and +/// ; > beginning with an alphabetic character or an under-score +/// bash-environment-variable = (ALPHA / "_") *(ALPHA / DIGIT / "_") +/// +/// ; The POSIX standard is significantly more flexible about legal environment +/// ; variable names, which can contain alerts (i.e. '\a'), whitespace, or +/// ; punctuation, for example. The POSIX standard says about environment variable +/// ; names: +/// ; +/// ; > The value of an environment variable is a string of characters. For a +/// ; > C-language program, an array of strings called the environment shall be made +/// ; > available when a process begins. The array is pointed to by the external +/// ; > variable environ, which is defined as: +/// ; > +/// ; > extern char **environ; +/// ; > +/// ; > These strings have the form name=value; names shall not contain the +/// ; > character '='. For values to be portable across systems conforming to IEEE +/// ; > Std 1003.1-2001, the value shall be composed of characters from the portable +/// ; > character set (except NUL and as indicated below). +/// ; +/// ; Note that the standard does not explicitly state that the name must have at +/// ; least one character, but `env` does not appear to support this and `env` +/// ; claims to be POSIX-compliant. To be safe, Dhall requires at least one +/// ; character like `env` +/// posix-environment-variable = 1*posix-environment-variable-character +/// +/// ; These are all the characters from the POSIX Portable Character Set except for +/// ; '\0' (NUL) and '='. Note that the POSIX standard does not explicitly state +/// ; that environment variable names cannot have NUL. 
However, this is implicit +/// ; in the fact that environment variables are passed to the program as +/// ; NUL-terminated `name=value` strings, which implies that the `name` portion of +/// ; the string cannot have NUL characters +/// posix-environment-variable-character = +/// %x5C ; '\' Beginning of escape sequence +/// ( %x22 ; '"' quotation mark U+0022 +/// / %x5C ; '\' reverse solidus U+005C +/// / %x61 ; 'a' alert U+0007 +/// / %x62 ; 'b' backspace U+0008 +/// / %x66 ; 'f' form feed U+000C +/// / %x6E ; 'n' line feed U+000A +/// / %x72 ; 'r' carriage return U+000D +/// / %x74 ; 't' tab U+0009 +/// / %x76 ; 'v' vertical tab U+000B +/// ) +/// ; Printable characters except double quote, backslash and equals +/// / %x20-21 +/// ; %x22 = '"' +/// / %x23-3C +/// ; %x3D = '=' +/// / %x3E-5B +/// ; %x5C = "\" +/// / %x5D-7E +/// +/// import-type = missing / local / http / env +/// +/// hash = %x73.68.61.32.35.36.3a 64HEXDIG whitespace ; "sha256:XXX...XXX" +/// +/// import-hashed = import-type [ hash ] +/// +/// ; "http://example.com" +/// ; "./foo/bar" +/// ; "env:FOO" +/// import = import-hashed [ as Text ] +/// +/// ; NOTE: Every rule past this point should only reference rules that end with +/// ; whitespace. 
This ensures consistent handling of whitespace in the absence of +/// ; a separate lexing step +/// +/// expression = +/// ; "\(x : a) -> b" +/// lambda open-parens label colon expression close-parens arrow expression +/// +/// ; "if a then b else c" +/// / if expression then expression else expression +/// +/// ; "let x : t = e1 in e2" +/// ; "let x = e1 in e2" +/// ; "let x = e1 let y = e2 in e3" +/// / 1*(let label [ colon expression ] equal expression) in expression +/// +/// ; "forall (x : a) -> b" +/// / forall open-parens label colon expression close-parens arrow expression +/// +/// ; "a -> b" +/// ; +/// ; NOTE: Backtrack if parsing this alternative fails +/// / operator-expression arrow expression +/// +/// / annotated-expression +expression = _{ annotated_expression } +/// +/// annotated-expression = +/// ; "merge e1 e2 : t" +/// ; "merge e1 e2" +/// merge import-expression import-expression [ colon application-expression ] +/// +/// ; "[] : List t" +/// ; "[] : Optional t" +/// ; "[x] : Optional t" +/// ; +/// ; NOTE: Backtrack if parsing this alternative fails since we can't tell +/// ; from the opening bracket whether or not this will be an empty list or +/// ; non-empty list +/// / open-bracket (empty-collection / non-empty-optional) +/// +/// ; "x : t" +/// / operator-expression (colon expression / "") +annotated_expression = _{ operator_expression ~ (colon ~ expression)? 
} +/// +/// empty-collection = close-bracket colon (List / Optional) import-expression +/// +/// non-empty-optional = expression close-bracket colon Optional import-expression +/// +/// operator-expression = import-alt-expression +operator_expression = _{ plus_expression } +/// +/// import-alt-expression = or-expression *(import-alt or-expression) +// import_alt_expression = { application_expression } +/// or-expression = plus-expression *(or plus-expression ) +/// plus-expression = text-append-expression *(plus text-append-expression ) +plus_expression = { times_expression ~ (plus ~ times_expression)* } +/// text-append-expression = list-append-expression *(text-append list-append-expression ) +/// list-append-expression = and-expression *(list-append and-expression ) +/// and-expression = combine-expression *(and combine-expression ) +/// combine-expression = prefer-expression *(combine prefer-expression ) +/// prefer-expression = combine-types-expression *(prefer combine-types-expression) +/// combine-types-expression = times-expression *(combine-types times-expression ) +/// times-expression = equal-expression *(times equal-expression ) +times_expression = { primitive_expression ~ (times ~ primitive_expression)* } +/// equal-expression = not-equal-expression *(double-equal not-equal-expression ) +/// not-equal-expression = application-expression *(not-equal application-expression ) +/// +/// ; Import expressions need to be separated by some whitespace, otherwise there +/// ; would be ambiguity: `./ab` could be interpreted as "import the file `./ab`", +/// ; or "apply the import `./a` to label `b`" +/// application-expression = +/// [ constructors / Some ] import-expression *(whitespace-chunk import-expression) +/// +/// import-expression = import / selector-expression +/// +/// ; `record.field` extracts one field of a record +/// ; +/// ; `record.{ field0, field1, field2 }` projects out several fields of a record +/// ; +/// ; NOTE: Backtrack when parsing the 
`*(dot ...)`. The reason why is that you +/// ; can't tell from parsing just the period whether "foo." will become "foo.bar" +/// ; (i.e. accessing field `bar` of the record `foo`) or `foo./bar` (i.e. applying +/// ; the function `foo` to the relative path `./bar`) +/// selector-expression = primitive-expression *(dot ( label / labels )) +/// +/// ; NOTE: Backtrack when parsing the first three alternatives (i.e. the numeric +/// ; literals). This is because they share leading characters in common +/// +/// ; NOTE: The reason why we have three different types of identifiers (that is: +/// ; identifier, identifier-reserved-prefix, identifier-reserved-namespaced-prefix) +/// ; is that it's the only way to parse correctly identifiers that start with reserved +/// ; words, other than using a lexer and use the longest match rule. +/// ; +/// ; Since reserved words can include themselves (e.g. 'List/build' includes 'List'), +/// ; we have to match the "namespaced" reserved words before the identifiers prefixed +/// ; by a reserved word. 
+/// primitive-expression = +/// ; "2.0" +/// double-literal +/// +/// ; "2" +/// / natural-literal +/// +/// ; "+2" +/// / integer-literal +/// +/// ; "-Infinity" +/// / "-" Infinity-raw whitespace +/// +/// ; '"ABC"' +/// / text-literal +/// +/// ; "{ foo = 1 , bar = True }" +/// ; "{ foo : Integer, bar : Bool }" +/// / open-brace record-type-or-literal close-brace +/// +/// ; "< Foo : Integer | Bar : Bool >" +/// ; "< Foo : Integer | Bar = True >" +/// / open-angle union-type-or-literal close-angle +/// +/// ; "[1, 2, 3]" +/// / non-empty-list-literal ; `annotated-expression` handles empty lists +/// +/// ; "List/foldWith" +/// / identifier-reserved-namespaced-prefix +/// +/// ; "List/head" +/// / reserved-namespaced +/// +/// ; "List/map" +/// ; "TypeDefinition" +/// / identifier-reserved-prefix +/// +/// ; "List" +/// / reserved +/// +/// ; "x" +/// ; "x@2" +/// / identifier +/// +/// ; "( e )" +/// / open-parens expression close-parens +primitive_expression = _{ + natural_literal + | open_parens ~ expression ~ close_parens +} +/// +/// labels = open-brace ( label *(comma label) / "" ) close-brace +/// +/// record-type-or-literal = +/// equal ; Empty record literal +/// / non-empty-record-type-or-literal +/// / "" ; Empty record type +/// +/// non-empty-record-type-or-literal = +/// label (non-empty-record-literal / non-empty-record-type) +/// +/// non-empty-record-type = colon expression *(comma label colon expression) +/// non-empty-record-literal = equal expression *(comma label equal expression) +/// +/// union-type-or-literal = +/// non-empty-union-type-or-literal +/// / "" ; Empty union type +/// +/// non-empty-union-type-or-literal = +/// label +/// ( equal expression *(bar label colon expression) +/// / colon expression (bar non-empty-union-type-or-literal / "") +/// ) +/// +/// non-empty-list-literal = open-bracket expression *(comma expression) close-bracket +/// +/// ; All expressions end with trailing whitespace. 
This just adds a final +/// ; whitespace prefix for the top-level of the program +/// complete-expression = whitespace expression + +complete_expression = _{ SOI ~ whitespace ~ expression ~ EOI } diff --git a/src/parser.rs b/src/parser.rs index 8416d9b..a0281f4 100644 --- a/src/parser.rs +++ b/src/parser.rs @@ -3,6 +3,7 @@ use lalrpop_util; use crate::grammar; use crate::grammar_util::BoxExpr; use crate::lexer::{Lexer, LexicalError, Tok}; +use crate::core::{bx, Expr}; pub type ParseError<'i> = lalrpop_util::ParseError<usize, Tok<'i>, LexicalError>; @@ -10,9 +11,64 @@ pub fn parse_expr(s: &str) -> Result<BoxExpr, ParseError> { grammar::ExprParser::new().parse(Lexer::new(s)) } +use pest::Parser; +use pest::error::Error; +use pest_derive::*; + +#[derive(Parser)] +#[grammar = "dhall.pest"] +struct DhallParser; + +use pest::iterators::Pair; +fn debug_pair(pair: Pair<Rule>) { + fn aux(indent: usize, pair: Pair<Rule>) { + let indent_str = "| ".repeat(indent); + println!(r#"{}{:?}: "{}""#, indent_str, pair.as_rule(), pair.as_str()); + for p in pair.into_inner() { + aux(indent+1, p); + } + } + aux(0, pair) +} + +pub fn parse_expr_pest(s: &str) -> Result<BoxExpr, Error<Rule>> { + let parsed_expr = DhallParser::parse(Rule::complete_expression, s)?.next().unwrap(); + debug_pair(parsed_expr.clone()); + // println!("{}", parsed_expr.clone()); + + fn parse_pair(pair: Pair<Rule>) -> BoxExpr { + match pair.as_rule() { + Rule::natural_literal => bx(Expr::NaturalLit(str::parse(pair.as_str().trim()).unwrap())), + Rule::plus_expression => { + let mut inner = pair.into_inner().map(parse_pair); + let first_expr = inner.next().unwrap(); + inner.fold(first_expr, |acc, e| bx(Expr::NaturalPlus(acc, e))) + } + Rule::times_expression => { + let mut inner = pair.into_inner().map(parse_pair); + let first_expr = inner.next().unwrap(); + inner.fold(first_expr, |acc, e| bx(Expr::NaturalTimes(acc, e))) + } + r => panic!("{:?}", r), + } + } + + Ok(parse_pair(parsed_expr)) +} + + #[test] fn 
test_parse() { use crate::core::Expr::*; + let expr = "((22 + 3) * 10)"; + match parse_expr_pest(expr) { + Err(e) => println!("{}", e), + ok => println!("{:?}", ok), + } + println!("{:?}", parse_expr(expr)); + assert_eq!(parse_expr_pest(expr).unwrap(), parse_expr(expr).unwrap()); + assert!(false); + println!("test {:?}", parse_expr("3 + 5 * 10")); assert!(parse_expr("22").is_ok()); assert!(parse_expr("(22)").is_ok()); @@ -32,4 +88,8 @@ fn test_parse() { println!("{:?}", parse_expr("foo.bar")); assert!(parse_expr("foo.bar").is_ok()); assert!(parse_expr("[] : List Bool").is_ok()); + + // println!("{:?}", parse_expr("< Left = True | Right : Natural >")); + // println!("{:?}", parse_expr(r#""bl${42}ah""#)); + // assert!(parse_expr("< Left = True | Right : Natural >").is_ok()); } |