aboutsummaryrefslogtreecommitdiff
path: root/stdlib/source/lux/data/text/unicode.lux
diff options
context:
space:
mode:
Diffstat (limited to 'stdlib/source/lux/data/text/unicode.lux')
-rw-r--r--stdlib/source/lux/data/text/unicode.lux338
1 files changed, 338 insertions, 0 deletions
diff --git a/stdlib/source/lux/data/text/unicode.lux b/stdlib/source/lux/data/text/unicode.lux
new file mode 100644
index 000000000..7b1eb0fa9
--- /dev/null
+++ b/stdlib/source/lux/data/text/unicode.lux
@@ -0,0 +1,338 @@
+(.module:
+ lux
+ (lux (control [interval #+ Interval]
+ [monoid #+ Monoid])
+ (data [number #+ hex "nat/" Interval<Nat>]
+ (coll [list]
+ (tree [finger #+ Tree])))
+ (type abstract)))
+
+(type: #export Char Nat)
+
+(abstract: #export Segment
+ {}
+ (Interval Char)
+
+ (def: empty (@abstraction (interval.between number.Enum<Nat> nat/top nat/bottom)))
+
+ (struct: _ (Monoid Segment)
+ (def: identity ..empty)
+ (def: (compose left right)
+ (let [left (@representation left)
+ right (@representation right)]
+ (@abstraction
+ (interval.between number.Enum<Nat>
+ (n/min (:: left bottom)
+ (:: right bottom))
+ (n/max (:: left top)
+ (:: right top)))))))
+
+ (def: #export (segment start end)
+ (-> Char Char Segment)
+ (@abstraction (interval.between number.Enum<Nat> (n/min start end) (n/max start end))))
+
+ (do-template [<name> <slot>]
+ [(def: #export <name>
+ (-> Segment Char)
+ (|>> @representation (get@ <slot>)))]
+
+ [start #interval.bottom]
+ [end #interval.top]
+ )
+
+ (def: #export (size segment)
+ (-> Segment Nat)
+ (let [start (get@ #interval.bottom (@representation segment))
+ end (get@ #interval.top (@representation segment))]
+ (|> end (n/- start) inc)))
+
+ (def: #export (within? segment char)
+ (All [a] (-> Segment Char Bool))
+ (interval.within? (@representation segment) char))
+ )
+
+(do-template [<name> <start> <end>]
+ [(def: #export <name> Segment (..segment (hex <start>) (hex <end>)))]
+
+ [basic-latin "+0000" "+007F"]
+ [latin-1-supplement "+00A0" "+00FF"]
+ [latin-extended-a "+0100" "+017F"]
+ [latin-extended-b "+0180" "+024F"]
+ [ipa-extensions "+0250" "+02AF"]
+ [spacing-modifier-letters "+02B0" "+02FF"]
+ [combining-diacritical-marks "+0300" "+036F"]
+ [greek-and-coptic "+0370" "+03FF"]
+ [cyrillic "+0400" "+04FF"]
+ [cyrillic-supplementary "+0500" "+052F"]
+ [armenian "+0530" "+058F"]
+ [hebrew "+0590" "+05FF"]
+ [arabic "+0600" "+06FF"]
+ [syriac "+0700" "+074F"]
+ [thaana "+0780" "+07BF"]
+ [devanagari "+0900" "+097F"]
+ [bengali "+0980" "+09FF"]
+ [gurmukhi "+0A00" "+0A7F"]
+ [gujarati "+0A80" "+0AFF"]
+ [oriya "+0B00" "+0B7F"]
+ [tamil "+0B80" "+0BFF"]
+ [telugu "+0C00" "+0C7F"]
+ [kannada "+0C80" "+0CFF"]
+ [malayalam "+0D00" "+0D7F"]
+ [sinhala "+0D80" "+0DFF"]
+ [thai "+0E00" "+0E7F"]
+ [lao "+0E80" "+0EFF"]
+ [tibetan "+0F00" "+0FFF"]
+ [myanmar "+1000" "+109F"]
+ [georgian "+10A0" "+10FF"]
+ [hangul-jamo "+1100" "+11FF"]
+ [ethiopic "+1200" "+137F"]
+ [cherokee "+13A0" "+13FF"]
+ [unified-canadian-aboriginal-syllabics "+1400" "+167F"]
+ [ogham "+1680" "+169F"]
+ [runic "+16A0" "+16FF"]
+ [tagalog "+1700" "+171F"]
+ [hanunoo "+1720" "+173F"]
+ [buhid "+1740" "+175F"]
+ [tagbanwa "+1760" "+177F"]
+ [khmer "+1780" "+17FF"]
+ [mongolian "+1800" "+18AF"]
+ [limbu "+1900" "+194F"]
+ [tai-le "+1950" "+197F"]
+ [khmer-symbols "+19E0" "+19FF"]
+ [phonetic-extensions "+1D00" "+1D7F"]
+ [latin-extended-additional "+1E00" "+1EFF"]
+ [greek-extended "+1F00" "+1FFF"]
+ [general-punctuation "+2000" "+206F"]
+ [superscripts-and-subscripts "+2070" "+209F"]
+ [currency-symbols "+20A0" "+20CF"]
+ [combining-diacritical-marks-for-symbols "+20D0" "+20FF"]
+ [letterlike-symbols "+2100" "+214F"]
+ [number-forms "+2150" "+218F"]
+ [arrows "+2190" "+21FF"]
+ [mathematical-operators "+2200" "+22FF"]
+ [miscellaneous-technical "+2300" "+23FF"]
+ [control-pictures "+2400" "+243F"]
+ [optical-character-recognition "+2440" "+245F"]
+ [enclosed-alphanumerics "+2460" "+24FF"]
+ [box-drawing "+2500" "+257F"]
+ [block-elements "+2580" "+259F"]
+ [geometric-shapes "+25A0" "+25FF"]
+ [miscellaneous-symbols "+2600" "+26FF"]
+ [dingbats "+2700" "+27BF"]
+ [miscellaneous-mathematical-symbols-a "+27C0" "+27EF"]
+ [supplemental-arrows-a "+27F0" "+27FF"]
+ [braille-patterns "+2800" "+28FF"]
+ [supplemental-arrows-b "+2900" "+297F"]
+ [miscellaneous-mathematical-symbols-b "+2980" "+29FF"]
+ [supplemental-mathematical-operators "+2A00" "+2AFF"]
+ [miscellaneous-symbols-and-arrows "+2B00" "+2BFF"]
+ [cjk-radicals-supplement "+2E80" "+2EFF"]
+ [kangxi-radicals "+2F00" "+2FDF"]
+ [ideographic-description-characters "+2FF0" "+2FFF"]
+ [cjk-symbols-and-punctuation "+3000" "+303F"]
+ [hiragana "+3040" "+309F"]
+ [katakana "+30A0" "+30FF"]
+ [bopomofo "+3100" "+312F"]
+ [hangul-compatibility-jamo "+3130" "+318F"]
+ [kanbun "+3190" "+319F"]
+ [bopomofo-extended "+31A0" "+31BF"]
+ [katakana-phonetic-extensions "+31F0" "+31FF"]
+ [enclosed-cjk-letters-and-months "+3200" "+32FF"]
+ [cjk-compatibility "+3300" "+33FF"]
+ [cjk-unified-ideographs-extension-a "+3400" "+4DBF"]
+ [yijing-hexagram-symbols "+4DC0" "+4DFF"]
+ [cjk-unified-ideographs "+4E00" "+9FFF"]
+ [yi-syllables "+A000" "+A48F"]
+ [yi-radicals "+A490" "+A4CF"]
+ [hangul-syllables "+AC00" "+D7AF"]
+ [high-surrogates "+D800" "+DB7F"]
+ [high-private-use-surrogates "+DB80" "+DBFF"]
+ [low-surrogates "+DC00" "+DFFF"]
+ [private-use-area "+E000" "+F8FF"]
+ [cjk-compatibility-ideographs "+F900" "+FAFF"]
+ [alphabetic-presentation-forms "+FB00" "+FB4F"]
+ [arabic-presentation-forms-a "+FB50" "+FDFF"]
+ [variation-selectors "+FE00" "+FE0F"]
+ [combining-half-marks "+FE20" "+FE2F"]
+ [cjk-compatibility-forms "+FE30" "+FE4F"]
+ [small-form-variants "+FE50" "+FE6F"]
+ [arabic-presentation-forms-b "+FE70" "+FEFF"]
+ [halfwidth-and-fullwidth-forms "+FF00" "+FFEF"]
+ [specials "+FFF0" "+FFFF"]
+ [linear-b-syllabary "+10000" "+1007F"]
+ [linear-b-ideograms "+10080" "+100FF"]
+ [aegean-numbers "+10100" "+1013F"]
+ [old-italic "+10300" "+1032F"]
+ [gothic "+10330" "+1034F"]
+ [ugaritic "+10380" "+1039F"]
+ [deseret "+10400" "+1044F"]
+ [shavian "+10450" "+1047F"]
+ [osmanya "+10480" "+104AF"]
+ [cypriot-syllabary "+10800" "+1083F"]
+ [byzantine-musical-symbols "+1D000" "+1D0FF"]
+ [musical-symbols "+1D100" "+1D1FF"]
+ [tai-xuan-jing-symbols "+1D300" "+1D35F"]
+ [mathematical-alphanumeric-symbols "+1D400" "+1D7FF"]
+ [cjk-unified-ideographs-extension-b "+20000" "+2A6DF"]
+ [cjk-compatibility-ideographs-supplement "+2F800" "+2FA1F"]
+ [tags "+E0000" "+E007F"]
+ )
+
+(type: #export Set (Tree Segment []))
+
+(def: (singleton segment)
+ (-> Segment Set)
+ {#finger.monoid Monoid<Segment>
+ #finger.node (#finger.Leaf segment [])})
+
+(def: #export (set segments)
+ (-> (List Segment) Set)
+ (case segments
+ (^ (list))
+ (..singleton (:: Monoid<Segment> identity))
+
+ (^ (list singleton))
+ (..singleton singleton)
+
+ (^ (list left right))
+ (..singleton (:: Monoid<Segment> compose left right))
+
+ _
+ (let [[sides extra] (n//% +2 (list.size segments))
+ [left+ right+] (list.split (n/+ sides extra) segments)]
+ (finger.branch (set left+)
+ (set right+)))))
+
+(def: half/0
+ (List Segment)
+ (list basic-latin
+ latin-1-supplement
+ latin-extended-a
+ latin-extended-b
+ ipa-extensions
+ spacing-modifier-letters
+ combining-diacritical-marks
+ greek-and-coptic
+ cyrillic
+ cyrillic-supplementary
+ armenian
+ hebrew
+ arabic
+ syriac
+ thaana
+ devanagari
+ bengali
+ gurmukhi
+ gujarati
+ oriya
+ tamil
+ telugu
+ kannada
+ malayalam
+ sinhala
+ thai
+ lao
+ tibetan
+ myanmar
+ georgian
+ hangul-jamo
+ ethiopic
+ cherokee
+ unified-canadian-aboriginal-syllabics
+ ogham
+ runic
+ tagalog
+ hanunoo
+ buhid
+ tagbanwa
+ khmer
+ mongolian
+ limbu
+ tai-le
+ khmer-symbols
+ phonetic-extensions
+ latin-extended-additional
+ greek-extended
+ general-punctuation
+ superscripts-and-subscripts
+ currency-symbols
+ combining-diacritical-marks-for-symbols
+ letterlike-symbols
+ number-forms
+ arrows
+ mathematical-operators
+ miscellaneous-technical
+ control-pictures
+ optical-character-recognition
+ enclosed-alphanumerics
+ box-drawing
+ ))
+
+(def: half/1
+ (List Segment)
+ (list block-elements
+ geometric-shapes
+ miscellaneous-symbols
+ dingbats
+ miscellaneous-mathematical-symbols-a
+ supplemental-arrows-a
+ braille-patterns
+ supplemental-arrows-b
+ miscellaneous-mathematical-symbols-b
+ supplemental-mathematical-operators
+ miscellaneous-symbols-and-arrows
+ cjk-radicals-supplement
+ kangxi-radicals
+ ideographic-description-characters
+ cjk-symbols-and-punctuation
+ hiragana
+ katakana
+ bopomofo
+ hangul-compatibility-jamo
+ kanbun
+ bopomofo-extended
+ katakana-phonetic-extensions
+ enclosed-cjk-letters-and-months
+ cjk-compatibility
+ cjk-unified-ideographs-extension-a
+ yijing-hexagram-symbols
+ cjk-unified-ideographs
+ yi-syllables
+ yi-radicals
+ hangul-syllables
+ high-surrogates
+ high-private-use-surrogates
+ low-surrogates
+ private-use-area
+ cjk-compatibility-ideographs
+ alphabetic-presentation-forms
+ arabic-presentation-forms-a
+ variation-selectors
+ combining-half-marks
+ cjk-compatibility-forms
+ small-form-variants
+ arabic-presentation-forms-b
+ halfwidth-and-fullwidth-forms
+ specials
+ linear-b-syllabary
+ linear-b-ideograms
+ aegean-numbers
+ old-italic
+ gothic
+ ugaritic
+ deseret
+ shavian
+ osmanya
+ cypriot-syllabary
+ byzantine-musical-symbols
+ musical-symbols
+ tai-xuan-jing-symbols
+ mathematical-alphanumeric-symbols
+ cjk-unified-ideographs-extension-b
+ cjk-compatibility-ideographs-supplement
+ tags
+ ))
+
+(def: #export full
+ Set
+ (finger.branch (set half/0) (set half/1)))