diff options
Diffstat (limited to '')
-rw-r--r-- | .gitignore | 1 | ||||
-rw-r--r-- | Cargo.lock | 27 | ||||
-rw-r--r-- | Cargo.toml | 2 | ||||
-rw-r--r-- | build.rs | 107 | ||||
-rw-r--r-- | src/dhall.pest | 836 | ||||
-rw-r--r-- | src/dhall.pest.visibility | 188 | ||||
-rw-r--r-- | src/generated_parser.rs | 6 | ||||
-rw-r--r-- | src/lib.rs | 1 | ||||
-rw-r--r-- | src/parser.rs | 10 |
9 files changed, 334 insertions, 844 deletions
@@ -1,2 +1,3 @@ src/grammar.rs target +src/dhall.pest @@ -1,4 +1,12 @@ [[package]] +name = "abnf" +version = "0.1.1" +source = "git+https://github.com/Nadrieril/abnf#b23a789e0ebc57fed08ad7d6cb015cfbfbb78246" +dependencies = [ + "nom 4.2.1 (registry+https://github.com/rust-lang/crates.io-index)", +] + +[[package]] name = "aho-corasick" version = "0.6.10" source = "registry+https://github.com/rust-lang/crates.io-index" @@ -117,7 +125,9 @@ dependencies = [ name = "dhall" version = "0.1.0" dependencies = [ + "abnf 0.1.1 (git+https://github.com/Nadrieril/abnf)", "bytecount 0.5.1 (registry+https://github.com/rust-lang/crates.io-index)", + "itertools 0.8.0 (registry+https://github.com/rust-lang/crates.io-index)", "lalrpop 0.16.3 (registry+https://github.com/rust-lang/crates.io-index)", "lalrpop-util 0.16.3 (registry+https://github.com/rust-lang/crates.io-index)", "nom 3.2.1 (registry+https://github.com/rust-lang/crates.io-index)", @@ -299,6 +309,15 @@ dependencies = [ ] [[package]] +name = "nom" +version = "4.2.1" +source = "registry+https://github.com/rust-lang/crates.io-index" +dependencies = [ + "memchr 2.2.0 (registry+https://github.com/rust-lang/crates.io-index)", + "version_check 0.1.5 (registry+https://github.com/rust-lang/crates.io-index)", +] + +[[package]] name = "opaque-debug" version = "0.2.2" source = "registry+https://github.com/rust-lang/crates.io-index" @@ -688,6 +707,11 @@ version = "1.0.2" source = "registry+https://github.com/rust-lang/crates.io-index" [[package]] +name = "version_check" +version = "0.1.5" +source = "registry+https://github.com/rust-lang/crates.io-index" + +[[package]] name = "winapi" version = "0.2.8" source = "registry+https://github.com/rust-lang/crates.io-index" @@ -717,6 +741,7 @@ version = "0.4.0" source = "registry+https://github.com/rust-lang/crates.io-index" [metadata] +"checksum abnf 0.1.1 (git+https://github.com/Nadrieril/abnf)" = "<none>" "checksum aho-corasick 0.6.10 (registry+https://github.com/rust-lang/crates.io-index)" = "81ce3d38065e618af2d7b77e10c5ad9a069859b4be3c2250f674af3840d9c8a5" "checksum arrayref 0.3.5 (registry+https://github.com/rust-lang/crates.io-index)" = "0d382e583f07208808f6b1249e60848879ba3543f57c32277bf52d69c2f0f0ee" "checksum ascii-canvas 1.0.0 (registry+https://github.com/rust-lang/crates.io-index)" = "b385d69402821a1c254533a011a312531cbcc0e3e24f19bbb4747a5a2daf37e2" @@ -757,6 +782,7 @@ source = "registry+https://github.com/rust-lang/crates.io-index" "checksum memchr 2.2.0 (registry+https://github.com/rust-lang/crates.io-index)" = "2efc7bc57c883d4a4d6e3246905283d8dae951bb3bd32f49d6ef297f546e1c39" "checksum new_debug_unreachable 1.0.3 (registry+https://github.com/rust-lang/crates.io-index)" = "f40f005c60db6e03bae699e414c58bf9aa7ea02a2d0b9bfbcf19286cc4c82b30" "checksum nom 3.2.1 (registry+https://github.com/rust-lang/crates.io-index)" = "05aec50c70fd288702bcd93284a8444607f3292dbdf2a30de5ea5dcdbe72287b" +"checksum nom 4.2.1 (registry+https://github.com/rust-lang/crates.io-index)" = "4836e9d6036552017e107edc598c97b2dee245161ff1b1ad4af215004774b354" "checksum opaque-debug 0.2.2 (registry+https://github.com/rust-lang/crates.io-index)" = "93f5bb2e8e8dec81642920ccff6b61f1eb94fa3020c5a325c9851ff604152409" "checksum ordermap 0.3.5 (registry+https://github.com/rust-lang/crates.io-index)" = "a86ed3f5f244b372d6b1a00b72ef7f8876d0bc6a78a4c9985c53614041512063" "checksum pest 2.1.0 (git+https://github.com/pest-parser/pest)" = "<none>" @@ -804,6 +830,7 @@ source = "registry+https://github.com/rust-lang/crates.io-index" "checksum ucd-util 0.1.3 (registry+https://github.com/rust-lang/crates.io-index)" = "535c204ee4d8434478593480b8f86ab45ec9aae0e83c568ca81abf0fd0e88f86" "checksum unicode-xid 0.1.0 (registry+https://github.com/rust-lang/crates.io-index)" = "fc72304796d0818e357ead4e000d19c9c174ab23dc11093ac919054d20a6a7fc" "checksum utf8-ranges 1.0.2 (registry+https://github.com/rust-lang/crates.io-index)" = "796f7e48bef87609f7ade7e06495a87d5cd06c7866e6a5cbfceffc558a243737" +"checksum version_check 0.1.5 (registry+https://github.com/rust-lang/crates.io-index)" = "914b1a6776c4c929a602fafd8bc742e06365d4bcbe48c30f9cca5824f70dc9dd" "checksum winapi 0.2.8 (registry+https://github.com/rust-lang/crates.io-index)" = "167dc9d6949a9b857f3451275e911c3f44255842c1f7a76f33c55103a909087a" "checksum winapi 0.3.6 (registry+https://github.com/rust-lang/crates.io-index)" = "92c1eb33641e276cfa214a0522acad57be5c56b10cb348b3c5117db75f3ac4b0" "checksum winapi-build 0.1.1 (registry+https://github.com/rust-lang/crates.io-index)" = "2d315eee3b34aca4797b2da6b13ed88266e6d612562a0c46390af8299fc699bc" @@ -7,6 +7,8 @@ build = "build.rs" [build-dependencies] lalrpop = "0.16.3" +abnf = { git = "https://github.com/Nadrieril/abnf" } +itertools = "0.8.0" [dependencies] bytecount = "0.5.1" @@ -1,6 +1,111 @@ +use std::fs::File; +use std::io::{Read,Write,BufReader,BufRead}; +use std::collections::HashMap; +use itertools::Itertools; + use lalrpop; -fn main() { +fn abnf_to_pest(data: &Vec<u8>, visibility_map: &HashMap<String, bool>) -> std::io::Result<String> { + use abnf::abnf::*; + fn format_rule(x: Rule, visibility_map: &HashMap<String, bool>) -> String { + let rulename = format_rulename(x.name); + let contents = format_alternation(x.elements); + let visible = visibility_map.get(&rulename).unwrap_or(&true); + let visible = if *visible {""} else {"_"}; + format!("{} = {}{{ {} }}", rulename, visible, contents) + } + fn format_rulename(x: String) -> String { + let x = x.replace("-", "_"); + if x == "if" || x == "else" || x == "as" || x == "let" || x == "in" || x == "fn" { + x + "_" + } else { + x + } + } + fn format_alternation(x: Alternation) -> String { + x.concatenations.into_iter().map(format_concatenation).join("\n | ") + } + fn format_concatenation(x: Concatenation) -> String { + x.repetitions.into_iter().map(format_repetition).join(" ~ ") + } + fn format_repetition(x: Repetition) -> String { + format!("{}{}", format_element(x.element), x.repeat.map(format_repeat).unwrap_or("".into())) + } + fn format_repeat(x: Repeat) -> String { + match (x.min.unwrap_or(0), x.max) { + (0, None) => "*".into(), + (1, None) => "+".into(), + (0, Some(1)) => "?".into(), + (min, None) => format!("{{{},}}", min), + (min, Some(max)) if min == max => format!("{{{}}}", min), + (min, Some(max)) => format!("{{{},{}}}", min, max), + } + } + fn format_element(x: Element) -> String { + use abnf::abnf::Element::*; + match x { + Rulename(s) => format_rulename(s), + Group(g) => format!("({})", format_alternation(g.alternation)), + Option(o) => format!("({})?", format_alternation(o.alternation)), + CharVal(s) => format!("^\"{}\"", s.replace("\"", "\\\"").replace("\\", "\\\\")), + NumVal(r) => format_range(r), + ProseVal(_) => unimplemented!(), + } + } + fn format_range(x: Range) -> String { + use abnf::abnf::Range::*; + match x { + Range(x, y) => format!("'{}'..'{}'", format_char(x), format_char(y)), + OneOf(v) => format!("\"{}\"", v.into_iter().map(format_char).join("")), + } + } + fn format_char(x: u64) -> String { + if x <= (u8::max_value() as u64) { + let x: u8 = x as u8; + if x.is_ascii_graphic() { + let x: char = x as char; + if x != '"' && x != '\'' && x != '\\' { + return x.to_string(); + } + } + } + format!("\\u{{{:02X}}}", x) + } + let make_err = |e| std::io::Error::new(std::io::ErrorKind::Other, format!("{}", e)); + + let rules = rulelist_comp(&data).map_err(make_err)?.1; + Ok(rules.into_iter().map(|x| format_rule(x, visibility_map)).join("\n")) +} + +fn main() -> std::io::Result<()> { lalrpop::process_root().unwrap(); println!("cargo:rerun-if-changed=src/grammar.lalrpop"); + + + let abnf_path = "dhall-lang/standard/dhall.abnf"; + let visibility_path = "src/dhall.pest.visibility"; + let pest_path = "src/dhall.pest"; + println!("cargo:rerun-if-changed={}", abnf_path); + println!("cargo:rerun-if-changed={}", visibility_path); + + let mut file = File::open(abnf_path)?; + let mut data = Vec::new(); + file.read_to_end(&mut data)?; + data.push('\n' as u8); + + let mut visibility_map: HashMap<String, bool> = HashMap::new(); + for line in BufReader::new(File::open(visibility_path)?).lines() { + let line = line?; + if line.len() >= 2 && &line[0..2] == "# " { + visibility_map.insert(line[2..].into(), false); + } else { + visibility_map.insert(line, true); + } + } + + let mut file = File::create(pest_path)?; + writeln!(&mut file, "{}", abnf_to_pest(&data, &visibility_map)?)?; + writeln!(&mut file, "final_expression = _{{ SOI ~ complete_expression ~ EOI }}")?; + + Ok(()) } diff --git a/src/dhall.pest b/src/dhall.pest deleted file mode 100644 index 873428c..0000000 --- a/src/dhall.pest +++ /dev/null @@ -1,836 +0,0 @@ -/// ; ABNF syntax based on RFC 5234 -/// ; -/// ; The character encoding for Dhall is UTF-8 -/// ; -/// ; Some notes on implementing this grammar: -/// ; -/// ; First, do not use a lexer to tokenize the file before parsing. Instead, treat -/// ; the individual characters of the file as the tokens to feed into the parser. -/// ; You should not use a lexer because Dhall's grammar supports two features which -/// ; cannot be correctly supported by a lexer: -/// ; -/// ; * String interpolation (i.e. "foo ${Natural/toInteger bar} baz") -/// ; * Nested block comments (i.e. "{- foo {- bar -} baz -}") -/// ; -/// ; Second, this grammar assumes that your parser can backtrack and/or try -/// ; multiple parses simultaneously. For example, consider this expression: -/// ; -/// ; List ./MyType -/// ; -/// ; A parser might first try to parse the period as the beginning of a field -/// ; selector, only to realize immediately afterwards that `/MyType` is not a valid -/// ; name for a field. A conforming parser must backtrack so that the expression -/// ; `./MyType` can instead be correctly interpreted as a relative path -/// ; -/// ; Third, if there are multiple valid parses then prefer the first parse -/// ; according to the ordering of alternatives. That is, the order of evaluation -/// ; of the alternatives is left-to-right. -/// ; -/// ; For example, the grammar for single quoted string literals is: -/// ; -/// ; single-quote-continue = -/// ; "'''" single-quote-continue -/// ; / "${" complete-expression "}" single-quote-continue -/// ; / "''${" single-quote-continue -/// ; / "''" -/// ; / %x20-10FFFF single-quote-continue -/// ; / tab single-quote-continue -/// ; / end-of-line single-quote-continue -/// ; -/// ; single-quote-literal = "''" single-quote-continue -/// ; -/// ; ... which permits valid parses for the following code: -/// ; -/// ; "''''''''''''''''" -/// ; -/// ; If you tried to parse all alternatives then there are at least two valid -/// ; interpretations for the above code: -/// ; -/// ; * A single quoted literal with four escape sequences of the form "'''" -/// ; * i.e. "''" followed by "'''" four times in a row followed by "''" -/// ; * Four empty single quoted literals -/// ; * i.e. "''''" four times in a row -/// ; -/// ; The correct interpretation is the first one because parsing the escape -/// ; sequence "'''" takes precedence over parsing the termination sequence "''", -/// ; according to the order of the alternatives in the `single-quote-continue` -/// ; rule. -/// ; -/// ; Some parsing libraries do not backtrack by default but allow the user to -/// ; selectively backtrack in certain parts of the grammar. Usually parsing -/// ; libraries do this to improve efficiency and error messages. Dhall's grammar -/// ; takes that into account by minimizing the number of rules that require the -/// ; parser to backtrack and comments below will highlight where you need to -/// ; explicitly backtrack -/// ; -/// ; Specifically, if you see an uninterrupted literal in a grammar rule such as: -/// ; -/// ; "->" -/// ; -/// ; ... or: -/// ; -/// ; %x66.6f.72.61.6c.6c -/// ; -/// ; ... then that string literal is parsed as a single unit, meaning that you -/// ; should backtrack if you parse only part of the literal -/// ; -/// ; In all other cases you can assume that you do not need to backtrack unless -/// ; there is a comment explicitly asking you to backtrack -/// ; -/// ; When parsing a repeated construct, prefer alternatives that parse as many -/// ; repetitions as possible. On in other words: -/// ; -/// ; [a] = a / "" -/// ; -/// ; a* = a* a / "" -/// ; -/// ; Note that the latter rule also specifies that repetition produces -/// ; left-associated expressions. For example, function application is -/// ; left-associative and all operators are left-associative when they are not -/// ; parenthesized. -/// ; -/// ; Additionally, try alternatives in an order that minimizes backtracking -/// ; according to the following rule: -/// ; -/// ; (a / b) (c / d) = a c / a d / b c / b d -/// -/// ; NOTE: There are many line endings in the wild -/// ; -/// ; See: https://en.wikipedia.org/wiki/Newline -/// ; -/// ; For simplicity this supports Unix and Windows line-endings, which are the most -/// ; common -/// end-of-line = -/// %x0A ; "\n" -/// / %x0D.0A ; "\r\n" -/// -/// tab = %x09 ; "\t" -/// -/// block-comment = "{-" block-comment-continue -/// -/// block-comment-chunk = -/// block-comment -/// / %x20-10FFFF -/// / tab -/// / end-of-line -/// -/// block-comment-continue = "-}" / block-comment-chunk block-comment-continue -/// -/// not-end-of-line = %x20-10FFFF / tab -/// -/// ; NOTE: Slightly different from Haskell-style single-line comments because this -/// ; does not require a space after the dashes -/// line-comment = "--" *not-end-of-line end-of-line -/// -/// whitespace-chunk = -/// " " -/// / tab -/// / end-of-line -/// / line-comment -/// / block-comment -whitespace_chunk = _{ - " " - // | tab - // | end_of_line - // | line_comment - // | block_comment -} -/// -/// whitespace = *whitespace-chunk -whitespace = _{ whitespace_chunk* } -/// -/// nonempty-whitespace = 1*whitespace-chunk -nonempty_whitespace = _{ whitespace_chunk+ } -/// -/// ; Uppercase or lowercase ASCII letter -/// ALPHA = %x41-5A / %x61-7A -/// -/// ; ASCII digit -/// DIGIT = %x30-39 ; 0-9 -/// -/// HEXDIG = DIGIT / "A" / "B" / "C" / "D" / "E" / "F" -/// -/// ; A simple label cannot be one of the following reserved names: -/// ; -/// ; * Bool -/// ; * Optional -/// ; * None -/// ; * Natural -/// ; * Integer -/// ; * Double -/// ; * Text -/// ; * List -/// ; * True -/// ; * False -/// ; * NaN -/// ; * Infinity -/// ; * Type -/// ; * Kind -/// ; * Sort -/// ; * Natural/fold -/// ; * Natural/build -/// ; * Natural/isZero -/// ; * Natural/even -/// ; * Natural/odd -/// ; * Natural/toInteger -/// ; * Natural/show -/// ; * Integer/toDouble -/// ; * Integer/show -/// ; * Double/show -/// ; * List/build -/// ; * List/fold -/// ; * List/length -/// ; * List/head -/// ; * List/last -/// ; * List/indexed -/// ; * List/reverse -/// ; * Optional/fold -/// ; * Optional/build -/// ; * Text/show -/// ; * if -/// ; * then -/// ; * else -/// ; * let -/// ; * in -/// ; * as -/// ; * using -/// ; * merge -/// ; * constructors -/// ; * Some -/// simple-label = (ALPHA / "_") *(ALPHA / DIGIT / "-" / "/" / "_") -/// -/// quoted-label = 1*(ALPHA / DIGIT / "-" / "/" / "_" / ":" / "." / "$") -/// -/// ; NOTE: Dhall does not support Unicode labels, mainly to minimize the potential -/// ; for code obfuscation -/// label = ("`" quoted-label "`" / simple-label) whitespace -/// -/// ; Dhall's double-quoted strings are equivalent to JSON strings except with -/// ; support for string interpolation (and escaping string interpolation) -/// ; -/// ; Dhall uses almost the same escaping rules as JSON (RFC7159) with one -/// ; exception: Dhall adds a new `\$` escape sequence for dollar signs. This -/// ; additional escape sequences lets you escape string interpolation by writing -/// ; `\${` -/// ; -/// ; > The representation of strings is similar to conventions used in the C -/// ; > family of programming languages. A string begins and ends with -/// ; > quotation marks. All Unicode characters may be placed within the -/// ; > quotation marks, except for the characters that must be escaped: -/// ; > quotation mark, reverse solidus, and the control characters (U+0000 -/// ; > through U+001F). -/// ; > -/// ; > Any character may be escaped. If the character is in the Basic -/// ; > Multilingual Plane (U+0000 through U+FFFF), then it may be -/// ; > represented as a six-character sequence: a reverse solidus, followed -/// ; > by the lowercase letter u, followed by four hexadecimal digits that -/// ; > encode the character's code point. The hexadecimal letters A though -/// ; > F can be upper or lower case. So, for example, a string containing -/// ; > only a single reverse solidus character may be represented as -/// ; > "\u005C". -/// ; > -/// ; > Alternatively, there are two-character sequence escape -/// ; > representations of some popular characters. So, for example, a -/// ; > string containing only a single reverse solidus character may be -/// ; > represented more compactly as "\\". -/// ; > -/// ; > To escape an extended character that is not in the Basic Multilingual -/// ; > Plane, the character is represented as a 12-character sequence, -/// ; > encoding the UTF-16 surrogate pair. So, for example, a string -/// ; > containing only the G clef character (U+1D11E) may be represented as -/// ; > "\uD834\uDD1E". -/// double-quote-chunk = -/// "${" complete-expression "}" ; Interpolation -/// / %x5C ; '\' Beginning of escape sequence -/// ( %x22 ; '"' quotation mark U+0022 -/// / %x24 ; '$' dollar sign U+0024 -/// / %x5C ; '\' reverse solidus U+005C -/// / %x2F ; '/' solidus U+002F -/// / %x62 ; 'b' backspace U+0008 -/// / %x66 ; 'f' form feed U+000C -/// / %x6E ; 'n' line feed U+000A -/// / %x72 ; 'r' carriage return U+000D -/// / %x74 ; 't' tab U+0009 -/// / %x75 4HEXDIG ; 'uXXXX' U+XXXX -/// ) -/// ; Printable characters except double quote and backslash -/// / %x20-21 -/// ; %x22 = '"' -/// / %x23-5B -/// ; %x5C = "\" -/// / %x5D-10FFFF -/// -/// double-quote-literal = %x22 *double-quote-chunk %x22 -/// -/// ; NOTE: The only way to end a single-quote string literal with a single quote is -/// ; to either interpolate the single quote, like this: -/// ; -/// ; ''ABC${"'"}'' -/// ; -/// ; ... or concatenate another string, like this: -/// ; -/// ; ''ABC'' ++ "'" -/// ; -/// ; If you try to end the string literal with a single quote then you get "'''", -/// ; which is interpreted as an escaped pair of single quotes -/// single-quote-continue = -/// ; Escape two single quotes (i.e. replace this sequence with "''") -/// "'''" single-quote-continue -/// ; Interpolation -/// / "${" complete-expression "}" single-quote-continue -/// ; Escape interpolation (i.e. replace this sequence with "${") -/// / "''${" single-quote-continue -/// / "''" ; End of text literal -/// / %x20-10FFFF single-quote-continue -/// / tab single-quote-continue -/// / end-of-line single-quote-continue -/// -/// single-quote-literal = "''" end-of-line single-quote-continue -/// -/// text-literal = (double-quote-literal / single-quote-literal) whitespace -/// -/// ; RFC 5234 interprets string literals as case-insensitive and recommends using -/// ; hex instead for case-sensitive strings -/// ; -/// ; If you don't feel like reading hex, these are all the same as the rule name, -/// ; except without the '-raw' ending, and converting dashes in the rule name -/// ; to forward slashes -/// if-raw = %x69.66 -/// then-raw = %x74.68.65.6e -/// else-raw = %x65.6c.73.65 -/// let-raw = %x6c.65.74 -/// in-raw = %x69.6e -/// as-raw = %x61.73 -/// using-raw = %x75.73.69.6e.67 -/// merge-raw = %x6d.65.72.67.65 -/// missing-raw = %x6d.69.73.73.69.6e.67 -/// Some-raw = %x53.6f.6d.65 -/// constructors-raw = %x63.6f.6e.73.74.72.75.63.74.6f.72.73 -/// Natural-fold-raw = %x4e.61.74.75.72.61.6c.2f.66.6f.6c.64 -/// Natural-build-raw = %x4e.61.74.75.72.61.6c.2f.62.75.69.6c.64 -/// Natural-isZero-raw = %x4e.61.74.75.72.61.6c.2f.69.73.5a.65.72.6f -/// Natural-even-raw = %x4e.61.74.75.72.61.6c.2f.65.76.65.6e -/// Natural-odd-raw = %x4e.61.74.75.72.61.6c.2f.6f.64.64 -/// Natural-toInteger-raw = %x4e.61.74.75.72.61.6c.2f.74.6f.49.6e.74.65.67.65.72 -/// Natural-show-raw = %x4e.61.74.75.72.61.6c.2f.73.68.6f.77 -/// Integer-toDouble-raw = %x49.6e.74.65.67.65.72.2f.74.6f.44.6f.75.62.6c.65 -/// Integer-show-raw = %x49.6e.74.65.67.65.72.2f.73.68.6f.77 -/// Double-show-raw = %x44.6f.75.62.6c.65.2f.73.68.6f.77 -/// List-build-raw = %x4c.69.73.74.2f.62.75.69.6c.64 -/// List-fold-raw = %x4c.69.73.74.2f.66.6f.6c.64 -/// List-length-raw = %x4c.69.73.74.2f.6c.65.6e.67.74.68 -/// List-head-raw = %x4c.69.73.74.2f.68.65.61.64 -/// List-last-raw = %x4c.69.73.74.2f.6c.61.73.74 -/// List-indexed-raw = %x4c.69.73.74.2f.69.6e.64.65.78.65.64 -/// List-reverse-raw = %x4c.69.73.74.2f.72.65.76.65.72.73.65 -/// Optional-fold-raw = %x4f.70.74.69.6f.6e.61.6c.2f.66.6f.6c.64 -/// Optional-build-raw = %x4f.70.74.69.6f.6e.61.6c.2f.62.75.69.6c.64 -/// Text-show-raw = %x54.65.78.74.2f.73.68.6f.77 -/// Bool-raw = %x42.6f.6f.6c -/// Optional-raw = %x4f.70.74.69.6f.6e.61.6c -/// None-raw = %x4e.6f.6e.65 -/// Natural-raw = %x4e.61.74.75.72.61.6c -/// Integer-raw = %x49.6e.74.65.67.65.72 -/// Double-raw = %x44.6f.75.62.6c.65 -/// Text-raw = %x54.65.78.74 -/// List-raw = %x4c.69.73.74 -/// True-raw = %x54.72.75.65 -/// False-raw = %x46.61.6c.73.65 -/// NaN-raw = %x4e.61.4e -/// Infinity-raw = %x49.6e.66.69.6e.69.74.79 -/// Type-raw = %x54.79.70.65 -/// Kind-raw = %x4b.69.6e.64 -/// Sort-raw = %x53.6f.72.74 -/// -/// reserved-raw = -/// Bool-raw -/// / Optional-raw -/// / None-raw -/// / Natural-raw -/// / Integer-raw -/// / Double-raw -/// / Text-raw -/// / List-raw -/// / True-raw -/// / False-raw -/// / NaN-raw -/// / Infinity-raw -/// / Type-raw -/// / Kind-raw -/// / Sort-raw -/// -/// reserved-namespaced-raw = -/// Natural-fold-raw -/// / Natural-build-raw -/// / Natural-isZero-raw -/// / Natural-even-raw -/// / Natural-odd-raw -/// / Natural-toInteger-raw -/// / Natural-show-raw -/// / Integer-toDouble-raw -/// / Integer-show-raw -/// / Double-show-raw -/// / List-build-raw -/// / List-fold-raw -/// / List-length-raw -/// / List-head-raw -/// / List-last-raw -/// / List-indexed-raw -/// / List-reverse-raw -/// / Optional-fold-raw -/// / Optional-build-raw -/// / Text-show-raw -/// -/// reserved = reserved-raw whitespace -/// reserved-namespaced = reserved-namespaced-raw whitespace -/// -/// ; Whitespaced rules for reserved words, to be used when matching expressions -/// if = if-raw nonempty-whitespace -/// then = then-raw nonempty-whitespace -/// else = else-raw nonempty-whitespace -/// let = let-raw nonempty-whitespace -/// in = in-raw nonempty-whitespace -/// as = as-raw nonempty-whitespace -/// using = using-raw nonempty-whitespace -/// merge = merge-raw nonempty-whitespace -/// constructors = constructors-raw nonempty-whitespace -/// Some = Some-raw nonempty-whitespace -/// -/// Optional = Optional-raw whitespace -/// Text = Text-raw whitespace -/// List = List-raw whitespace -/// -/// equal = "=" whitespace -/// or = "||" whitespace -/// plus = "+" nonempty-whitespace ; To disambiguate `f +2` -plus = _{ "+" ~ nonempty_whitespace } -/// text-append = "++" whitespace -/// list-append = "#" nonempty-whitespace ; To disambiguate `http://a/a#a` -/// and = "&&" whitespace -/// times = "*" whitespace -times = _{ "*" ~ nonempty_whitespace } -/// double-equal = "==" whitespace -/// not-equal = "!=" whitespace -/// dot = "." whitespace -/// open-brace = "{" whitespace -/// close-brace = "}" whitespace -/// open-bracket = "[" whitespace -/// close-bracket = "]" whitespace -/// open-angle = "<" whitespace -/// close-angle = ">" whitespace -/// bar = "|" whitespace -/// comma = "," whitespace -/// open-parens = "(" whitespace -open_parens = _{ "(" ~ whitespace } -/// close-parens = ")" whitespace -close_parens = _{ ")" ~ whitespace } -/// at = "@" whitespace -/// colon = ":" nonempty-whitespace ; To disambiguate `env:VARIABLE` from type annotations -colon = _{ ":" ~ nonempty_whitespace } -/// import-alt = "?" nonempty-whitespace ; To disambiguate `http://a/a?a` -/// -/// combine = ( %x2227 / "/\" ) whitespace -/// combine-types = ( %x2A53 / "//\\" ) whitespace -/// prefer = ( %x2AFD / "//" ) whitespace -/// lambda = ( %x3BB / "\" ) whitespace -/// forall = ( %x2200 / %x66.6f.72.61.6c.6c ) whitespace -/// arrow = ( %x2192 / "->" ) whitespace -/// -/// exponent = "e" [ "+" / "-" ] 1*DIGIT -/// -/// double-literal = [ "+" / "-" ] 1*DIGIT ( "." 1*DIGIT [ exponent ] / exponent) whitespace -/// -/// natural-literal-raw = 1*DIGIT -natural_literal_raw = _{ ASCII_DIGIT+ } -/// -/// integer-literal = ( "+" / "-" ) natural-literal-raw whitespace -/// -/// natural-literal = natural-literal-raw whitespace -natural_literal = { natural_literal_raw ~ whitespace } -/// -/// identifier = label [ at natural-literal-raw whitespace ] -/// -/// identifier-reserved-prefix = -/// reserved-raw 1*(ALPHA / DIGIT / "-" / "/" / "_") whitespace [ at natural-literal-raw whitespace ] -/// -/// identifier-reserved-namespaced-prefix = -/// reserved-namespaced-raw 1*(ALPHA / DIGIT / "-" / "/" / "_") whitespace [ at natural-literal-raw whitespace ] -/// -/// missing = missing-raw whitespace -/// -/// ; Printable characters other than " ()[]{}<>/\," -/// ; -/// ; Excluding those characters ensures that paths don't have to end with trailing -/// ; whitespace most of the time -/// path-character = -/// ; %x20 = " " -/// %x21 -/// ; %x22 = "\"" -/// ; %x23 = "#" -/// / %x24-27 -/// ; %x28 = "(" -/// ; %x29 = ")" -/// / %x2A-2B -/// ; %x2C = "," -/// / %x2D-2E -/// ; %x2F = "/" -/// / %x30-3B -/// ; %x3C = "<" -/// / %x3D -/// ; %x3E = ">" -/// ; %x3F = "?" -/// / %x40-5A -/// ; %x5B = "[" -/// ; %x5C = "\" -/// ; %x5D = "]" -/// / %x5E-7A -/// ; %x7B = "{" -/// / %x7C -/// ; %x7D = "}" -/// / %x7E -/// -/// quoted-path-character = -/// %x20-21 -/// ; %x22 = "\"" -/// / %x23-2E -/// ; %x2F = "/" -/// / %x30-10FFFF -/// -/// -/// path-component = "/" ( 1*path-character / %x22 1*quoted-path-character %x22 ) -/// -/// directory = *path-component -/// -/// file = path-component -/// -/// local-raw = -/// ".." directory file ; Relative path -/// / "." directory file ; Relative path -/// / "~" directory file ; Home-anchored path -/// ; NOTE: Backtrack if parsing this alternative fails -/// ; -/// ; This is because the first character of this alternative will be "/", but -/// ; if the second character is "/" or "\" then this should have been parsed -/// ; as an operator instead of a path -/// / directory file ; Absolute path -/// -/// local = local-raw whitespace -/// -/// ; `http[s]` URI grammar based on RFC7230 and RFC 3986 with some differences -/// ; noted below -/// -/// scheme = %x68.74.74.70 [ %x73 ] ; "http" [ "s" ] -/// -/// ; NOTE: This does not match the official grammar for a URI. Specifically, this -/// ; replaces `path-abempty` with `directory file` -/// http-raw = scheme "://" authority directory file [ "?" query ] [ "#" fragment ] -/// -/// ; NOTE: Backtrack if parsing the optional user info prefix fails -/// authority = [ userinfo "@" ] host [ ":" port ] -/// -/// userinfo = *( unreserved / pct-encoded / sub-delims / ":" ) -/// -/// host = IP-literal / IPv4address / reg-name -/// -/// port = *DIGIT -/// -/// IP-literal = "[" ( IPv6address / IPvFuture ) "]" -/// -/// IPvFuture = "v" 1*HEXDIG "." 1*( unreserved / sub-delims / ":" ) -/// -/// ; NOTE: Backtrack when parsing each alternative -/// IPv6address = 6( h16 ":" ) ls32 -/// / "::" 5( h16 ":" ) ls32 -/// / [ h16 ] "::" 4( h16 ":" ) ls32 -/// / [ *1( h16 ":" ) h16 ] "::" 3( h16 ":" ) ls32 -/// / [ *2( h16 ":" ) h16 ] "::" 2( h16 ":" ) ls32 -/// / [ *3( h16 ":" ) h16 ] "::" h16 ":" ls32 -/// / [ *4( h16 ":" ) h16 ] "::" ls32 -/// / [ *5( h16 ":" ) h16 ] "::" h16 -/// / [ *6( h16 ":" ) h16 ] "::" -/// -/// h16 = 1*4HEXDIG -/// -/// ls32 = ( h16 ":" h16 ) / IPv4address -/// -/// IPv4address = dec-octet "." dec-octet "." dec-octet "." dec-octet -/// -/// ; NOTE: Backtrack when parsing these alternatives and try them in reverse order -/// dec-octet = DIGIT ; 0-9 -/// / %x31-39 DIGIT ; 10-99 -/// / "1" 2DIGIT ; 100-199 -/// / "2" %x30-34 DIGIT ; 200-249 -/// / "25" %x30-35 ; 250-255 -/// -/// reg-name = *( unreserved / pct-encoded / sub-delims ) -/// -/// pchar = unreserved / pct-encoded / sub-delims / ":" / "@" -/// -/// query = *( pchar / "/" / "?" ) -/// -/// fragment = *( pchar / "/" / "?" ) -/// -/// pct-encoded = "%" HEXDIG HEXDIG -/// -/// unreserved = ALPHA / DIGIT / "-" / "." / "_" / "~" -/// -/// sub-delims = "!" / "$" / "&" / "'" / "(" / ")" / "*" / "+" / "," / ";" / "=" -/// -/// http = -/// http-raw whitespace -/// [ using (import-hashed / open-parens import-hashed close-parens) ] -/// -/// ; Dhall supports unquoted environment variables that are Bash-compliant or -/// ; quoted environment variables that are POSIX-compliant -/// env = "env:" -/// ( bash-environment-variable -/// / %x22 posix-environment-variable %x22 -/// ) -/// whitespace -/// -/// ; Bash supports a restricted subset of POSIX environment variables. From the -/// ; Bash `man` page, an environment variable name is: -/// ; -/// ; > A word consisting only of alphanumeric characters and under-scores, and -/// ; > beginning with an alphabetic character or an under-score -/// bash-environment-variable = (ALPHA / "_") *(ALPHA / DIGIT / "_") -/// -/// ; The POSIX standard is significantly more flexible about legal environment -/// ; variable names, which can contain alerts (i.e. '\a'), whitespace, or -/// ; punctuation, for example. The POSIX standard says about environment variable -/// ; names: -/// ; -/// ; > The value of an environment variable is a string of characters. For a -/// ; > C-language program, an array of strings called the environment shall be made -/// ; > available when a process begins. The array is pointed to by the external -/// ; > variable environ, which is defined as: -/// ; > -/// ; > extern char **environ; -/// ; > -/// ; > These strings have the form name=value; names shall not contain the -/// ; > character '='. For values to be portable across systems conforming to IEEE -/// ; > Std 1003.1-2001, the value shall be composed of characters from the portable -/// ; > character set (except NUL and as indicated below). -/// ; -/// ; Note that the standard does not explicitly state that the name must have at -/// ; least one character, but `env` does not appear to support this and `env` -/// ; claims to be POSIX-compliant. To be safe, Dhall requires at least one -/// ; character like `env` -/// posix-environment-variable = 1*posix-environment-variable-character -/// -/// ; These are all the characters from the POSIX Portable Character Set except for -/// ; '\0' (NUL) and '='. Note that the POSIX standard does not explicitly state -/// ; that environment variable names cannot have NUL. However, this is implicit -/// ; in the fact that environment variables are passed to the program as -/// ; NUL-terminated `name=value` strings, which implies that the `name` portion of -/// ; the string cannot have NUL characters -/// posix-environment-variable-character = -/// %x5C ; '\' Beginning of escape sequence -/// ( %x22 ; '"' quotation mark U+0022 -/// / %x5C ; '\' reverse solidus U+005C -/// / %x61 ; 'a' alert U+0007 -/// / %x62 ; 'b' backspace U+0008 -/// / %x66 ; 'f' form feed U+000C -/// / %x6E ; 'n' line feed U+000A -/// / %x72 ; 'r' carriage return U+000D -/// / %x74 ; 't' tab U+0009 -/// / %x76 ; 'v' vertical tab U+000B -/// ) -/// ; Printable characters except double quote, backslash and equals -/// / %x20-21 -/// ; %x22 = '"' -/// / %x23-3C -/// ; %x3D = '=' -/// / %x3E-5B -/// ; %x5C = "\" -/// / %x5D-7E -/// -/// import-type = missing / local / http / env -/// -/// hash = %x73.68.61.32.35.36.3a 64HEXDIG whitespace ; "sha256:XXX...XXX" -/// -/// import-hashed = import-type [ hash ] -/// -/// ; "http://example.com" -/// ; "./foo/bar" -/// ; "env:FOO" -/// import = import-hashed [ as Text ] -/// -/// ; NOTE: Every rule past this point should only reference rules that end with -/// ; whitespace. This ensures consistent handling of whitespace in the absence of -/// ; a separate lexing step -/// -/// expression = -/// ; "\(x : a) -> b" -/// lambda open-parens label colon expression close-parens arrow expression -/// -/// ; "if a then b else c" -/// / if expression then expression else expression -/// -/// ; "let x : t = e1 in e2" -/// ; "let x = e1 in e2" -/// ; "let x = e1 let y = e2 in e3" -/// / 1*(let label [ colon expression ] equal expression) in expression -/// -/// ; "forall (x : a) -> b" -/// / forall open-parens label colon expression close-parens arrow expression -/// -/// ; "a -> b" -/// ; -/// ; NOTE: Backtrack if parsing this alternative fails -/// / operator-expression arrow expression -/// -/// / annotated-expression -expression = _{ annotated_expression } -/// -/// annotated-expression = -/// ; "merge e1 e2 : t" -/// ; "merge e1 e2" -/// merge import-expression import-expression [ colon application-expression ] -/// -/// ; "[] : List t" -/// ; "[] : Optional t" -/// ; "[x] : Optional t" -/// ; -/// ; NOTE: Backtrack if parsing this alternative fails since we can't tell -/// ; from the opening bracket whether or not this will be an empty list or -/// ; non-empty list -/// / open-bracket (empty-collection / non-empty-optional) -/// -/// ; "x : t" -/// / operator-expression (colon expression / "") -annotated_expression = _{ operator_expression ~ (colon ~ expression)? } -/// -/// empty-collection = close-bracket colon (List / Optional) import-expression -/// -/// non-empty-optional = expression close-bracket colon Optional import-expression -/// -/// operator-expression = import-alt-expression -operator_expression = _{ plus_expression } -/// -/// import-alt-expression = or-expression *(import-alt or-expression) -// import_alt_expression = { application_expression } -/// or-expression = plus-expression *(or plus-expression ) -/// plus-expression = text-append-expression *(plus text-append-expression ) -plus_expression = { times_expression ~ (plus ~ times_expression)* } -/// text-append-expression = list-append-expression *(text-append list-append-expression ) -/// list-append-expression = and-expression *(list-append and-expression ) -/// and-expression = combine-expression *(and combine-expression ) -/// combine-expression = prefer-expression *(combine prefer-expression ) -/// prefer-expression = combine-types-expression *(prefer combine-types-expression) -/// combine-types-expression = times-expression *(combine-types times-expression ) -/// times-expression = equal-expression *(times equal-expression ) -times_expression = { primitive_expression ~ (times ~ primitive_expression)* } -/// equal-expression = not-equal-expression *(double-equal not-equal-expression ) -/// not-equal-expression = application-expression *(not-equal application-expression ) -/// -/// ; Import expressions need to be separated by some whitespace, otherwise there -/// ; would be ambiguity: `./ab` could be interpreted as "import the file `./ab`", -/// ; or "apply the import `./a` to label `b`" -/// application-expression = -/// [ constructors / Some ] import-expression *(whitespace-chunk import-expression) -/// -/// import-expression = import / selector-expression -/// -/// ; `record.field` extracts one field of a record -/// ; -/// ; `record.{ field0, field1, field2 }` projects out several fields of a record -/// ; -/// ; NOTE: Backtrack when parsing the `*(dot ...)`. The reason why is that you -/// ; can't tell from parsing just the period whether "foo." will become "foo.bar" -/// ; (i.e. accessing field `bar` of the record `foo`) or `foo./bar` (i.e. applying -/// ; the function `foo` to the relative path `./bar`) -/// selector-expression = primitive-expression *(dot ( label / labels )) -/// -/// ; NOTE: Backtrack when parsing the first three alternatives (i.e. the numeric -/// ; literals). This is because they share leading characters in common -/// -/// ; NOTE: The reason why we have three different types of identifiers (that is: -/// ; identifier, identifier-reserved-prefix, identifier-reserved-namespaced-prefix) -/// ; is that it's the only way to parse correctly identifiers that start with reserved -/// ; words, other than using a lexer and use the longest match rule. -/// ; -/// ; Since reserved words can include themselves (e.g. 'List/build' includes 'List'), -/// ; we have to match the "namespaced" reserved words before the identifiers prefixed -/// ; by a reserved word. -/// primitive-expression = -/// ; "2.0" -/// double-literal -/// -/// ; "2" -/// / natural-literal -/// -/// ; "+2" -/// / integer-literal -/// -/// ; "-Infinity" -/// / "-" Infinity-raw whitespace -/// -/// ; '"ABC"' -/// / text-literal -/// -/// ; "{ foo = 1 , bar = True }" -/// ; "{ foo : Integer, bar : Bool }" -/// / open-brace record-type-or-literal close-brace -/// -/// ; "< Foo : Integer | Bar : Bool >" -/// ; "< Foo : Integer | Bar = True >" -/// / open-angle union-type-or-literal close-angle -/// -/// ; "[1, 2, 3]" -/// / non-empty-list-literal ; `annotated-expression` handles empty lists -/// -/// ; "List/foldWith" -/// / identifier-reserved-namespaced-prefix -/// -/// ; "List/head" -/// / reserved-namespaced -/// -/// ; "List/map" -/// ; "TypeDefinition" -/// / identifier-reserved-prefix -/// -/// ; "List" -/// / reserved -/// -/// ; "x" -/// ; "x@2" -/// / identifier -/// -/// ; "( e )" -/// / open-parens expression close-parens -primitive_expression = _{ - natural_literal - | open_parens ~ expression ~ close_parens -} -/// -/// labels = open-brace ( label *(comma label) / "" ) close-brace -/// -/// record-type-or-literal = -/// equal ; Empty record literal -/// / non-empty-record-type-or-literal -/// / "" ; Empty record type -/// -/// non-empty-record-type-or-literal = -/// label (non-empty-record-literal / non-empty-record-type) -/// -/// non-empty-record-type = colon expression *(comma label colon expression) -/// non-empty-record-literal = equal expression *(comma label equal expression) -/// -/// union-type-or-literal = -/// non-empty-union-type-or-literal -/// / "" ; Empty union type -/// -/// non-empty-union-type-or-literal = -/// label -/// ( equal expression *(bar label colon expression) -/// / colon expression (bar non-empty-union-type-or-literal / "") -/// ) -/// -/// non-empty-list-literal = open-bracket expression *(comma expression) close-bracket -/// -/// ; All expressions end with trailing whitespace. This just adds a final -/// ; whitespace prefix for the top-level of the program -/// complete-expression = whitespace expression - -complete_expression = _{ SOI ~ whitespace ~ expression ~ EOI } diff --git a/src/dhall.pest.visibility b/src/dhall.pest.visibility new file mode 100644 index 0000000..c09fccf --- /dev/null +++ b/src/dhall.pest.visibility @@ -0,0 +1,188 @@ +# end_of_line +# tab +# block_comment +# block_comment_chunk +# block_comment_continue +# not_end_of_line +# line_comment +# whitespace_chunk +# whitespace +# nonempty_whitespace +# ALPHA +# DIGIT +# HEXDIG +# simple_label +# quoted_label +# label +# double_quote_chunk +# double_quote_literal +# single_quote_continue +# single_quote_literal +# text_literal +# if_raw +# then_raw +# else_raw +# let_raw +# in_raw +# as_raw +# using_raw +# merge_raw +# missing_raw +# Some_raw +# constructors_raw +# Natural_fold_raw +# Natural_build_raw +# Natural_isZero_raw +# Natural_even_raw +# Natural_odd_raw +# Natural_toInteger_raw +# Natural_show_raw +# Integer_toDouble_raw +# Integer_show_raw +# Double_show_raw +# List_build_raw +# List_fold_raw +# List_length_raw +# List_head_raw +# List_last_raw +# List_indexed_raw +# List_reverse_raw +# Optional_fold_raw +# Optional_build_raw +# Text_show_raw +# Bool_raw +# Optional_raw +# None_raw +# Natural_raw +# Integer_raw +# Double_raw +# Text_raw +# List_raw +# True_raw +# False_raw +# NaN_raw +# Infinity_raw +# Type_raw +# Kind_raw +# Sort_raw +# reserved_raw +# reserved_namespaced_raw +# reserved +# reserved_namespaced +# if_ +# then_ +# else_ +# let_ +# in_ +# as_ +# using +# merge +# constructors +# Some +# Optional +# Text +# List +# equal +# or +# plus +# text_append +# list_append +# and +# times +# double_equal +# not_equal +# dot +# open_brace +# close_brace +# open_bracket +# close_bracket +# open_angle +# close_angle +# bar +# comma +# open_parens +# close_parens +# at +# colon +# import_alt +# combine +# combine_types +# prefer +# lambda +# forall +# arrow +# exponent +# double_literal +# natural_literal_raw +integer_literal +natural_literal +# identifier +# identifier_reserved_prefix +# identifier_reserved_namespaced_prefix +# missing +# path_character +# quoted_path_character +# path_component +# directory +# file +# local_raw +# local +# scheme +# http_raw +# authority +# userinfo +# host +# port +# IP_literal +# IPvFuture +# IPv6address +# h16 +# ls32 +# IPv4address +# dec_octet +# reg_name +# pchar +# query +# fragment +# pct_encoded +# unreserved +# sub_delims +# http +# env +# bash_environment_variable +# posix_environment_variable +# posix_environment_variable_character +# import_type +# hash +# import_hashed +# import +# expression +# annotated_expression +# empty_collection +# non_empty_optional +# operator_expression +# import_alt_expression +# or_expression +plus_expression +# text_append_expression +# list_append_expression +# and_expression +# combine_expression +# prefer_expression +# combine_types_expression +times_expression +# equal_expression +# not_equal_expression +# application_expression +# import_expression +# selector_expression +# primitive_expression +# labels +# record_type_or_literal +# non_empty_record_type_or_literal +# non_empty_record_type +# non_empty_record_literal +# union_type_or_literal +# non_empty_union_type_or_literal +# non_empty_list_literal +# complete_expression diff --git a/src/generated_parser.rs b/src/generated_parser.rs new file mode 100644 index 0000000..452b4cd --- /dev/null +++ b/src/generated_parser.rs @@ -0,0 +1,6 @@ +#[allow(unused_imports)] +use pest_derive::*; + +#[derive(Parser)] +#[grammar = "dhall.pest"] +pub struct DhallParser; @@ -6,6 +6,7 @@ pub use crate::core::*; use lalrpop_util::lalrpop_mod; lalrpop_mod!(pub grammar); // synthesized by LALRPOP mod grammar_util; +mod generated_parser; pub mod lexer; pub mod parser; pub mod typecheck; diff --git a/src/parser.rs b/src/parser.rs index a0281f4..057fce2 100644 --- a/src/parser.rs +++ b/src/parser.rs @@ -13,13 +13,9 @@ pub fn parse_expr(s: &str) -> Result<BoxExpr, ParseError> { use pest::Parser; use pest::error::Error; -use pest_derive::*; - -#[derive(Parser)] -#[grammar = "dhall.pest"] -struct DhallParser; - use pest::iterators::Pair; +use crate::generated_parser::{DhallParser, Rule}; + fn debug_pair(pair: Pair<Rule>) { fn aux(indent: usize, pair: Pair<Rule>) { let indent_str = "| ".repeat(indent); @@ -61,11 +57,11 @@ pub fn parse_expr_pest(s: &str) -> Result<BoxExpr, Error<Rule>> { fn test_parse() { use crate::core::Expr::*; let expr = "((22 + 3) * 10)"; + println!("{:?}", parse_expr(expr)); match parse_expr_pest(expr) { Err(e) => println!("{}", e), ok => println!("{:?}", ok), } - println!("{:?}", parse_expr(expr)); assert_eq!(parse_expr_pest(expr).unwrap(), parse_expr(expr).unwrap()); assert!(false); |