aboutsummaryrefslogtreecommitdiff
path: root/stdlib/source/library/lux/data/text/unicode
diff options
context:
space:
mode:
Diffstat (limited to 'stdlib/source/library/lux/data/text/unicode')
-rw-r--r--stdlib/source/library/lux/data/text/unicode/block.lux205
-rw-r--r--stdlib/source/library/lux/data/text/unicode/set.lux240
2 files changed, 445 insertions, 0 deletions
diff --git a/stdlib/source/library/lux/data/text/unicode/block.lux b/stdlib/source/library/lux/data/text/unicode/block.lux
new file mode 100644
index 000000000..24ddb34e2
--- /dev/null
+++ b/stdlib/source/library/lux/data/text/unicode/block.lux
@@ -0,0 +1,205 @@
+(.module:
+ [library
+ [lux #*
+ [abstract
+ [equivalence (#+ Equivalence)]
+ [hash (#+ Hash)]
+ [monoid (#+ Monoid)]
+ ["." interval (#+ Interval)]]
+ [math
+ [number (#+ hex)
+ ["n" nat ("#\." interval)]
+ ["." i64]]]
+ [type
+ abstract]]]
+ [/// (#+ Char)])
+
+(abstract: #export Block
+ (Interval Char)
+
+ (implementation: #export monoid
+ (Monoid Block)
+
+ (def: identity
+ (:abstraction (interval.between n.enum n\top n\bottom)))
+ (def: (compose left right)
+ (let [left (:representation left)
+ right (:representation right)]
+ (:abstraction
+ (interval.between n.enum
+ (n.min (\ left bottom)
+ (\ right bottom))
+ (n.max (\ left top)
+ (\ right top)))))))
+
+ (def: #export (block start end)
+ (-> Char Char Block)
+ (:abstraction (interval.between n.enum (n.min start end) (n.max start end))))
+
+ (template [<name> <slot>]
+ [(def: #export <name>
+ (-> Block Char)
+ (|>> :representation (get@ <slot>)))]
+
+ [start #interval.bottom]
+ [end #interval.top]
+ )
+
+ (def: #export (size block)
+ (-> Block Nat)
+ (let [start (get@ #interval.bottom (:representation block))
+ end (get@ #interval.top (:representation block))]
+ (|> end (n.- start) inc)))
+
+ (def: #export (within? block char)
+ (All [a] (-> Block Char Bit))
+ (interval.within? (:representation block) char))
+ )
+
+(implementation: #export equivalence
+ (Equivalence Block)
+
+ (def: (= reference subject)
+ (and (n.= (..start reference) (..start subject))
+ (n.= (..end reference) (..end subject)))))
+
+(implementation: #export hash
+ (Hash Block)
+
+ (def: &equivalence ..equivalence)
+ (def: (hash value)
+ (i64.or (i64.left_shift 32 (..start value))
+ (..end value))))
+
+(template [<name> <start> <end>]
+ [(def: #export <name> Block (..block (hex <start>) (hex <end>)))]
+
+ ## Normal blocks
+ [basic_latin "0000" "007F"]
+ [latin_1_supplement "00A0" "00FF"]
+ [latin_extended_a "0100" "017F"]
+ [latin_extended_b "0180" "024F"]
+ [ipa_extensions "0250" "02AF"]
+ [spacing_modifier_letters "02B0" "02FF"]
+ [combining_diacritical_marks "0300" "036F"]
+ [greek_and_coptic "0370" "03FF"]
+ [cyrillic "0400" "04FF"]
+ [cyrillic_supplementary "0500" "052F"]
+ [armenian "0530" "058F"]
+ [hebrew "0590" "05FF"]
+ [arabic "0600" "06FF"]
+ [syriac "0700" "074F"]
+ [thaana "0780" "07BF"]
+ [devanagari "0900" "097F"]
+ [bengali "0980" "09FF"]
+ [gurmukhi "0A00" "0A7F"]
+ [gujarati "0A80" "0AFF"]
+ [oriya "0B00" "0B7F"]
+ [tamil "0B80" "0BFF"]
+ [telugu "0C00" "0C7F"]
+ [kannada "0C80" "0CFF"]
+ [malayalam "0D00" "0D7F"]
+ [sinhala "0D80" "0DFF"]
+ [thai "0E00" "0E7F"]
+ [lao "0E80" "0EFF"]
+ [tibetan "0F00" "0FFF"]
+ [myanmar "1000" "109F"]
+ [georgian "10A0" "10FF"]
+ [hangul_jamo "1100" "11FF"]
+ [ethiopic "1200" "137F"]
+ [cherokee "13A0" "13FF"]
+ [unified_canadian_aboriginal_syllabics "1400" "167F"]
+ [ogham "1680" "169F"]
+ [runic "16A0" "16FF"]
+ [tagalog "1700" "171F"]
+ [hanunoo "1720" "173F"]
+ [buhid "1740" "175F"]
+ [tagbanwa "1760" "177F"]
+ [khmer "1780" "17FF"]
+ [mongolian "1800" "18AF"]
+ [limbu "1900" "194F"]
+ [tai_le "1950" "197F"]
+ [khmer_symbols "19E0" "19FF"]
+ [phonetic_extensions "1D00" "1D7F"]
+ [latin_extended_additional "1E00" "1EFF"]
+ [greek_extended "1F00" "1FFF"]
+ [general_punctuation "2000" "206F"]
+ [superscripts_and_subscripts "2070" "209F"]
+ [currency_symbols "20A0" "20CF"]
+ [combining_diacritical_marks_for_symbols "20D0" "20FF"]
+ [letterlike_symbols "2100" "214F"]
+ [number_forms "2150" "218F"]
+ [arrows "2190" "21FF"]
+ [mathematical_operators "2200" "22FF"]
+ [miscellaneous_technical "2300" "23FF"]
+ [control_pictures "2400" "243F"]
+ [optical_character_recognition "2440" "245F"]
+ [enclosed_alphanumerics "2460" "24FF"]
+ [box_drawing "2500" "257F"]
+ [block_elements "2580" "259F"]
+ [geometric_shapes "25A0" "25FF"]
+ [miscellaneous_symbols "2600" "26FF"]
+ [dingbats "2700" "27BF"]
+ [miscellaneous_mathematical_symbols_a "27C0" "27EF"]
+ [supplemental_arrows_a "27F0" "27FF"]
+ [braille_patterns "2800" "28FF"]
+ [supplemental_arrows_b "2900" "297F"]
+ [miscellaneous_mathematical_symbols_b "2980" "29FF"]
+ [supplemental_mathematical_operators "2A00" "2AFF"]
+ [miscellaneous_symbols_and_arrows "2B00" "2BFF"]
+ [cjk_radicals_supplement "2E80" "2EFF"]
+ [kangxi_radicals "2F00" "2FDF"]
+ [ideographic_description_characters "2FF0" "2FFF"]
+ [cjk_symbols_and_punctuation "3000" "303F"]
+ [hiragana "3040" "309F"]
+ [katakana "30A0" "30FF"]
+ [bopomofo "3100" "312F"]
+ [hangul_compatibility_jamo "3130" "318F"]
+ [kanbun "3190" "319F"]
+ [bopomofo_extended "31A0" "31BF"]
+ [katakana_phonetic_extensions "31F0" "31FF"]
+ [enclosed_cjk_letters_and_months "3200" "32FF"]
+ [cjk_compatibility "3300" "33FF"]
+ [cjk_unified_ideographs_extension_a "3400" "4DBF"]
+ [yijing_hexagram_symbols "4DC0" "4DFF"]
+ [cjk_unified_ideographs "4E00" "9FFF"]
+ [yi_syllables "A000" "A48F"]
+ [yi_radicals "A490" "A4CF"]
+ [hangul_syllables "AC00" "D7AF"]
+ [high_surrogates "D800" "DB7F"]
+ [high_private_use_surrogates "DB80" "DBFF"]
+ [low_surrogates "DC00" "DFFF"]
+ [private_use_area "E000" "F8FF"]
+ [cjk_compatibility_ideographs "F900" "FAFF"]
+ [alphabetic_presentation_forms "FB00" "FB4F"]
+ [arabic_presentation_forms_a "FB50" "FDFF"]
+ [variation_selectors "FE00" "FE0F"]
+ [combining_half_marks "FE20" "FE2F"]
+ [cjk_compatibility_forms "FE30" "FE4F"]
+ [small_form_variants "FE50" "FE6F"]
+ [arabic_presentation_forms_b "FE70" "FEFF"]
+ [halfwidth_and_fullwidth_forms "FF00" "FFEF"]
+ [specials "FFF0" "FFFF"]
+ ## [linear_b_syllabary "10000" "1007F"]
+ ## [linear_b_ideograms "10080" "100FF"]
+ ## [aegean_numbers "10100" "1013F"]
+ ## [old_italic "10300" "1032F"]
+ ## [gothic "10330" "1034F"]
+ ## [ugaritic "10380" "1039F"]
+ ## [deseret "10400" "1044F"]
+ ## [shavian "10450" "1047F"]
+ ## [osmanya "10480" "104AF"]
+ ## [cypriot_syllabary "10800" "1083F"]
+ ## [byzantine_musical_symbols "1D000" "1D0FF"]
+ ## [musical_symbols "1D100" "1D1FF"]
+ ## [tai_xuan_jing_symbols "1D300" "1D35F"]
+ ## [mathematical_alphanumeric_symbols "1D400" "1D7FF"]
+ ## [cjk_unified_ideographs_extension_b "20000" "2A6DF"]
+ ## [cjk_compatibility_ideographs_supplement "2F800" "2FA1F"]
+ ## [tags "E0000" "E007F"]
+
+ ## Specialized blocks
+ [basic_latin/decimal "0030" "0039"]
+ [basic_latin/upper "0041" "005A"]
+ [basic_latin/lower "0061" "007A"]
+ )
diff --git a/stdlib/source/library/lux/data/text/unicode/set.lux b/stdlib/source/library/lux/data/text/unicode/set.lux
new file mode 100644
index 000000000..2c48aed41
--- /dev/null
+++ b/stdlib/source/library/lux/data/text/unicode/set.lux
@@ -0,0 +1,240 @@
+(.module:
+ [library
+ [lux #*
+ [abstract
+ [equivalence (#+ Equivalence)]]
+ [data
+ [collection
+ ["." list ("#\." fold functor)]
+ ["." set ("#\." equivalence)]
+ ["." tree #_
+ ["#" finger (#+ Tree)]]]]
+ [type (#+ :by_example)
+ abstract]]]
+ ["." / #_
+ ["/#" // #_
+ [// (#+ Char)]
+ ["#." block (#+ Block)]]])
+
+(def: builder
+ (tree.builder //block.monoid))
+
+(def: :@:
+ (:by_example [@]
+ (tree.Builder @ Block)
+ ..builder
+
+ @))
+
+(abstract: #export Set
+ (Tree :@: Block [])
+
+ (def: #export (compose left right)
+ (-> Set Set Set)
+ (:abstraction
+ (\ builder branch
+ (:representation left)
+ (:representation right))))
+
+ (def: (singleton block)
+ (-> Block Set)
+ (:abstraction
+ (\ builder leaf block [])))
+
+ (def: #export (set [head tail])
+ (-> [Block (List Block)] Set)
+ (list\fold (: (-> Block Set Set)
+ (function (_ block set)
+ (..compose (..singleton block) set)))
+ (..singleton head)
+ tail))
+
+ (def: character/0
+ Set
+ (..set [//block.basic_latin
+ (list //block.latin_1_supplement
+ //block.latin_extended_a
+ //block.latin_extended_b
+ //block.ipa_extensions
+ //block.spacing_modifier_letters
+ //block.combining_diacritical_marks
+ //block.greek_and_coptic
+ //block.cyrillic
+ //block.cyrillic_supplementary
+ //block.armenian
+ //block.hebrew
+ //block.arabic
+ //block.syriac
+ //block.thaana
+ //block.devanagari
+ //block.bengali
+ //block.gurmukhi
+ //block.gujarati
+ //block.oriya
+ //block.tamil
+ //block.telugu
+ //block.kannada
+ //block.malayalam
+ //block.sinhala
+ //block.thai
+ //block.lao
+ //block.tibetan
+ //block.myanmar
+ //block.georgian)]))
+
+ (def: character/1
+ Set
+ (..set [//block.hangul_jamo
+ (list //block.ethiopic
+ //block.cherokee
+ //block.unified_canadian_aboriginal_syllabics
+ //block.ogham
+ //block.runic
+ //block.tagalog
+ //block.hanunoo
+ //block.buhid
+ //block.tagbanwa
+ //block.khmer
+ //block.mongolian
+ //block.limbu
+ //block.tai_le
+ //block.khmer_symbols
+ //block.phonetic_extensions
+ //block.latin_extended_additional
+ //block.greek_extended
+ //block.general_punctuation
+ //block.superscripts_and_subscripts
+ //block.currency_symbols
+ //block.combining_diacritical_marks_for_symbols
+ //block.letterlike_symbols
+ //block.number_forms
+ //block.arrows
+ //block.mathematical_operators
+ //block.miscellaneous_technical
+ //block.control_pictures
+ //block.optical_character_recognition
+ //block.enclosed_alphanumerics
+ //block.box_drawing)]))
+
+ (def: character/2
+ Set
+ (..set [//block.block_elements
+ (list //block.geometric_shapes
+ //block.miscellaneous_symbols
+ //block.dingbats
+ //block.miscellaneous_mathematical_symbols_a
+ //block.supplemental_arrows_a
+ //block.braille_patterns
+ //block.supplemental_arrows_b
+ //block.miscellaneous_mathematical_symbols_b
+ //block.supplemental_mathematical_operators
+ //block.miscellaneous_symbols_and_arrows
+ //block.cjk_radicals_supplement
+ //block.kangxi_radicals
+ //block.ideographic_description_characters
+ //block.cjk_symbols_and_punctuation
+ //block.hiragana
+ //block.katakana
+ //block.bopomofo
+ //block.hangul_compatibility_jamo
+ //block.kanbun
+ //block.bopomofo_extended
+ //block.katakana_phonetic_extensions
+ //block.enclosed_cjk_letters_and_months
+ //block.cjk_compatibility
+ //block.cjk_unified_ideographs_extension_a
+ //block.yijing_hexagram_symbols
+ //block.cjk_unified_ideographs
+ //block.yi_syllables
+ //block.yi_radicals
+ //block.hangul_syllables
+ )]))
+
+ (def: #export character
+ Set
+ ($_ ..compose
+ ..character/0
+ ..character/1
+ ..character/2
+ ))
+
+ (def: #export non_character
+ Set
+ (..set [//block.high_surrogates
+ (list //block.high_private_use_surrogates
+ //block.low_surrogates
+ //block.private_use_area
+ //block.cjk_compatibility_ideographs
+ //block.alphabetic_presentation_forms
+ //block.arabic_presentation_forms_a
+ //block.variation_selectors
+ //block.combining_half_marks
+ //block.cjk_compatibility_forms
+ //block.small_form_variants
+ //block.arabic_presentation_forms_b
+ //block.halfwidth_and_fullwidth_forms
+ //block.specials
+ ## //block.linear_b_syllabary
+ ## //block.linear_b_ideograms
+ ## //block.aegean_numbers
+ ## //block.old_italic
+ ## //block.gothic
+ ## //block.ugaritic
+ ## //block.deseret
+ ## //block.shavian
+ ## //block.osmanya
+ ## //block.cypriot_syllabary
+ ## //block.byzantine_musical_symbols
+ ## //block.musical_symbols
+ ## //block.tai_xuan_jing_symbols
+ ## //block.mathematical_alphanumeric_symbols
+ ## //block.cjk_unified_ideographs_extension_b
+ ## //block.cjk_compatibility_ideographs_supplement
+ ## //block.tags
+ )]))
+
+ (def: #export full
+ Set
+ ($_ ..compose
+ ..character
+ ..non_character
+ ))
+
+ (def: #export (range set)
+ (-> Set [Char Char])
+ (let [tag (tree.tag (:representation set))]
+ [(//block.start tag)
+ (//block.end tag)]))
+
+ (def: #export (member? set character)
+ (-> Set Char Bit)
+ (loop [tree (:representation set)]
+ (if (//block.within? (tree.tag tree) character)
+ (case (tree.root tree)
+ (0 #0 _)
+ true
+
+ (0 #1 left right)
+ (or (recur left)
+ (recur right)))
+ false)))
+
+ (implementation: #export equivalence
+ (Equivalence Set)
+
+ (def: (= reference subject)
+ (set\= (set.from_list //block.hash (tree.tags (:representation reference)))
+ (set.from_list //block.hash (tree.tags (:representation subject))))))
+ )
+
+(template [<name> <blocks>]
+ [(def: #export <name>
+ (..set <blocks>))]
+
+ [ascii [//block.basic_latin (list)]]
+ [ascii/alpha [//block.basic_latin/upper (list //block.basic_latin/lower)]]
+ [ascii/alpha_num [//block.basic_latin/upper (list //block.basic_latin/lower //block.basic_latin/decimal)]]
+ [ascii/numeric [//block.basic_latin/decimal (list)]]
+ [ascii/upper [//block.basic_latin/upper (list)]]
+ [ascii/lower [//block.basic_latin/lower (list)]]
+ )