Skip to content

Commit

Permalink
Last fixes before release (#94)
Browse files Browse the repository at this point in the history
* Remove allBlockRanges

* When downloading files, create a separate directory for each Unicode version. This facilitate the work on various Unicode versions.

* Improve docs
  • Loading branch information
wismill committed Sep 28, 2022
1 parent cd1e33f commit bf8bb53
Show file tree
Hide file tree
Showing 13 changed files with 60 additions and 45 deletions.
2 changes: 1 addition & 1 deletion README.md
Original file line number Diff line number Diff line change
Expand Up @@ -66,7 +66,7 @@ See `unicode-data`’s [guide](unicode-data/README.md#unicode-database-version-u
## Licensing

`unicode-data*` packages are an [open source](https://github.com/composewell/unicode-data)
project available under a liberal [Apache-2.0 license](LICENSE).
project available under a liberal [Apache-2.0 license](unicode-data/LICENSE).

## Contributing

Expand Down
7 changes: 7 additions & 0 deletions experimental/unicode-data-text/lib/Unicode/Text/Case.hs
Original file line number Diff line number Diff line change
Expand Up @@ -55,14 +55,17 @@ streamUnfold (C.Unfold step inject) = \case
caseConvertStream :: C.Unfold Char Char -> T.Text -> T.Text
caseConvertStream u t = TF.unstream (streamUnfold u (TF.stream t))

-- | Convert to full upper case using 'T.Text' fusion.
{-# INLINE toUpperFusion #-}
toUpperFusion :: T.Text -> T.Text
toUpperFusion = caseConvertStream C.upperCaseMapping

-- | Convert to full lower case using 'T.Text' fusion.
{-# INLINE toLowerFusion #-}
toLowerFusion :: T.Text -> T.Text
toLowerFusion = caseConvertStream C.lowerCaseMapping

-- | Convert to full case fold using 'T.Text' fusion.
{-# INLINE toCaseFoldFusion #-}
toCaseFoldFusion :: T.Text -> T.Text
toCaseFoldFusion = caseConvertStream C.caseFoldMapping
Expand Down Expand Up @@ -105,6 +108,7 @@ streamUnfoldToTitle = case C.lowerCaseMapping of
C.Yield c st' -> TF.Yield c (CC2 s st')
{-# INLINE [0] streamUnfoldToTitle #-}

-- | Convert to full title case using 'T.Text' fusion.
{-# INLINE toTitleFusion #-}
toTitleFusion :: T.Text -> T.Text
toTitleFusion = TF.unstream . streamUnfoldToTitle . TF.stream
Expand Down Expand Up @@ -189,18 +193,21 @@ caseConvertText ascii (C.Unfold (step :: u -> C.Step u Char) inject) (T.Text src
writeMapping (step st) (dstOff + d)
{-# INLINE caseConvertText #-}

-- | Convert to full upper case /without/ 'T.Text' fusion.
{-# INLINE toUpper #-}
toUpper :: T.Text -> T.Text
toUpper = caseConvertText
(\w -> if w - 97 <= 25 then w - 32 else w)
C.upperCaseMapping

-- | Convert to full lower case /without/ 'T.Text' fusion.
{-# INLINE toLower #-}
toLower :: T.Text -> T.Text
toLower = caseConvertText
(\w -> if w - 65 <= 25 then w + 32 else w)
C.lowerCaseMapping

-- | Convert to full case fold /without/ 'T.Text' fusion.
{-# INLINE toCaseFold #-}
toCaseFold :: T.Text -> T.Text
toCaseFold = caseConvertText
Expand Down
4 changes: 2 additions & 2 deletions experimental/unicode-data-text/unicode-data-text.cabal
Original file line number Diff line number Diff line change
Expand Up @@ -4,8 +4,8 @@ version: 0.1.0
synopsis: Unicode features for “text” package
description:
@unicode-data-text@ provides Unicode features from
<https://hackage.haskell.org/package/unicode-data @unicode-data@> package
for the <https://hackage.haskell.org/package/text @text@> package.
<https://hackage.haskell.org/package/unicode-data unicode-data> package
for the <https://hackage.haskell.org/package/text text> package.
homepage: http://github.com/composewell/unicode-data
bug-reports: https://github.com/composewell/unicode-data/issues
license: Apache-2.0
Expand Down
4 changes: 2 additions & 2 deletions ucd.sh
Original file line number Diff line number Diff line change
Expand Up @@ -44,7 +44,7 @@ SECURITY_FILES="\
# and verify the $checksum if $VERIFY_CHECKSUM is enabled
# $1 = file:checksum
download_file() {
local directory="data/$1"
local directory="data/$VERSION/$1"
local url="$2"
local pair="$3"
local file="$(echo "$pair" | cut -f1 -d':')"
Expand Down Expand Up @@ -81,7 +81,7 @@ download_files() {
run_generator() {
# Compile and run ucd2haskell
cabal run --flag ucd2haskell ucd2haskell -- \
--input ./data/ \
--input "./data/$VERSION" \
--output-core ./unicode-data/lib/ \
--output-names ./unicode-data-names/lib/ \
--output-scripts ./unicode-data-scripts/lib/ \
Expand Down
3 changes: 0 additions & 3 deletions unicode-data/bench/Main.hs
Original file line number Diff line number Diff line change
Expand Up @@ -134,9 +134,6 @@ main = defaultMain
, bgroup "blockDefinition"
[ benchRangeNF "unicode-data" (show . B.blockDefinition)
]
, bgroup "allBlockRanges"
[ benchChars "unicode-data" (const B.allBlockRanges)
]
]
, bgroup "Unicode.Char.General.Compat"
[ bgroup' "isAlpha"
Expand Down
10 changes: 1 addition & 9 deletions unicode-data/exe/Parser/Text.hs
Original file line number Diff line number Diff line change
Expand Up @@ -315,7 +315,7 @@ genBlocksModule moduleName = done <$> Fold.foldl' step initial
, "{-# OPTIONS_HADDOCK hide #-}"
, ""
, "module " <> moduleName
, "(Block(..), BlockDefinition(..), block, blockDefinition, allBlockRanges)"
, "(Block(..), BlockDefinition(..), block, blockDefinition)"
, "where"
, ""
, "import Data.Ix (Ix)"
Expand Down Expand Up @@ -344,14 +344,6 @@ genBlocksModule moduleName = done <$> Fold.foldl' step initial
, "blockDefinition :: Block -> BlockDefinition"
, "blockDefinition b = case b of"
, mconcat (reverse defs)
, "-- | All the block ranges, in ascending order."
, "--"
, "-- @since 0.3.1"
, "{-# INLINE allBlockRanges #-}"
, "allBlockRanges :: [(Int, Int)]"
, "allBlockRanges ="
, " " <> show ranges'
, ""
, "-- | Character block, if defined."
, "--"
, "-- @since 0.3.1"
Expand Down
2 changes: 1 addition & 1 deletion unicode-data/lib/Unicode/Char.hs
Original file line number Diff line number Diff line change
Expand Up @@ -35,7 +35,7 @@ module Unicode.Char
, module Unicode.Char.Identifiers
, unicodeVersion

-- * Re-export
-- * Re-export from @base@
, ord
, chr
)
Expand Down
34 changes: 30 additions & 4 deletions unicode-data/lib/Unicode/Char/Case.hs
Original file line number Diff line number Diff line change
Expand Up @@ -9,6 +9,12 @@
--
-- Case and case mapping related functions.
--
-- This module provides /full/ predicates and mappings that are /not/ compatible
-- with those in "Data.Char", which rely on simple properties.
-- See "Unicode.Char.Case.Compat" for a drop-in replacement of the functions in
-- "Data.Char".
--

module Unicode.Char.Case
( -- * Predicates
isLowerCase
Expand Down Expand Up @@ -48,29 +54,49 @@ import qualified Unicode.Internal.Char.SpecialCasing.LowerCaseMapping as C
import qualified Unicode.Internal.Char.SpecialCasing.TitleCaseMapping as C
import qualified Unicode.Internal.Char.SpecialCasing.UpperCaseMapping as C

-- | Returns 'True' for lower-case letters.
-- | Returns 'True' for lower-case characters.
--
-- It uses the character property
-- <https://www.unicode.org/reports/tr44/#Lowercase Lowercase>.
--
-- @since 0.3.0
{-# INLINE isLowerCase #-}
isLowerCase :: Char -> Bool
isLowerCase = P.isLowercase

-- | Returns 'True' for lower-case letters.
-- | Returns 'True' for lower-case characters.
--
-- It uses the character property
-- <https://www.unicode.org/reports/tr44/#Lowercase Lowercase>.
--
-- @since 0.1.0
{-# INLINE isLower #-}
{-# DEPRECATED isLower "Use isLowerCase instead. Note that the behavior of this function does not match base:Data.Char.isLower. See Unicode.Char.Case.Compat for behavior compatible with base:Data.Char." #-}
isLower :: Char -> Bool
isLower = P.isLowercase

-- | Returns 'True' for upper-case letters.
-- | Returns 'True' for upper-case characters.
--
-- It uses the character property
-- <https://www.unicode.org/reports/tr44/#Uppercase Uppercase>.
--
-- Note: it does /not/ match title-cased letters. Those are matched using:
-- @'Unicode.Char.General.generalCategory' c ==
-- 'Unicode.Char.General.TitlecaseLetter'@.
--
-- @since 0.3.0
{-# INLINE isUpperCase #-}
isUpperCase :: Char -> Bool
isUpperCase = P.isUppercase

-- | Returns 'True' for upper-case letters.
-- | Returns 'True' for upper-case characters.
--
-- It uses the character property
-- <https://www.unicode.org/reports/tr44/#Uppercase Uppercase>.
--
-- Note: it does /not/ match title-cased letters. Those are matched using:
-- @'Unicode.Char.General.generalCategory' c ==
-- 'Unicode.Char.General.TitlecaseLetter'@.
--
-- @since 0.1.0
{-# INLINE isUpper #-}
Expand Down
11 changes: 10 additions & 1 deletion unicode-data/lib/Unicode/Char/Case/Compat.hs
Original file line number Diff line number Diff line change
Expand Up @@ -5,7 +5,7 @@
-- Maintainer : streamly@composewell.com
-- Stability : experimental
--
-- Compatibility module for case and case mapping related functions..
-- Compatibility module for case and case mapping related functions.
--
-- The functions of this module are drop-in replacement for those in "Data.Char".
-- They are similar but not identical to some functions in "Unicode.Char.Case",
Expand All @@ -30,6 +30,11 @@ import qualified Unicode.Internal.Char.UnicodeData.SimpleUpperCaseMapping as C
-- Title case is used by a small number of letter ligatures like the
-- single-character form of /Lj/.
--
-- It matches characters with general category 'UppercaseLetter' and
-- 'TitlecaseLetter'.
--
-- See: 'Unicode.Char.Case.isUpperCase' for the /full upper/ case predicate.
--
-- prop> isUpper c == Data.Char.isUpper c
--
-- @since 0.3.0
Expand All @@ -41,6 +46,10 @@ isUpper c = case generalCategory c of

-- | Selects lower-case alphabetic Unicode characters (letters).
--
-- It matches characters with general category 'LowercaseLetter'.
--
-- See: 'Unicode.Char.Case.isLowerCase' for the /full/ lower case predicate.
--
-- prop> isLower c == Data.Char.isLower c
--
-- @since 0.3.0
Expand Down
11 changes: 1 addition & 10 deletions unicode-data/lib/Unicode/Char/General/Blocks.hs
Original file line number Diff line number Diff line change
@@ -1,5 +1,5 @@
-- |
-- Module : Unicode.Char.General
-- Module : Unicode.Char.General.Blocks
-- Copyright : (c) 2020 Composewell Technologies and Contributors
-- License : Apache-2.0
-- Maintainer : streamly@composewell.com
Expand All @@ -14,7 +14,6 @@ module Unicode.Char.General.Blocks
, B.BlockDefinition(..)
, block
, B.blockDefinition
, allBlockRanges
)

where
Expand All @@ -27,11 +26,3 @@ import qualified Unicode.Internal.Char.Blocks as B
{-# INLINE block #-}
block :: Char -> Maybe B.Block
block = fmap toEnum . B.block

-- | All the [block](https://www.unicode.org/glossary/#block) ranges,
-- in ascending order.
--
-- @since 0.3.1
{-# INLINE allBlockRanges #-}
allBlockRanges :: [(Int, Int)]
allBlockRanges = B.allBlockRanges
6 changes: 3 additions & 3 deletions unicode-data/lib/Unicode/Char/Numeric.hs
Original file line number Diff line number Diff line change
Expand Up @@ -10,14 +10,14 @@
-- @since 0.3.0
module Unicode.Char.Numeric
( -- * Predicates
isNumber
, isNumeric
isNumeric
, isNumber

-- * Numeric values
, numericValue
, integerValue

-- * Re-export
-- * Re-export from @base@
, isDigit
, isOctDigit
, isHexDigit
Expand Down
1 change: 1 addition & 0 deletions unicode-data/lib/Unicode/Char/Numeric/Compat.hs
Original file line number Diff line number Diff line change
Expand Up @@ -35,6 +35,7 @@ import Unicode.Char.General (GeneralCategory(..), generalCategory)
-- prop> isNumber c == Data.Char.isNumber c
--
-- @since 0.3.1 moved to Compat module.
--
-- @since 0.3.0
isNumber :: Char -> Bool
isNumber c = case generalCategory c of
Expand Down
10 changes: 1 addition & 9 deletions unicode-data/lib/Unicode/Internal/Char/Blocks.hs
Original file line number Diff line number Diff line change
Expand Up @@ -10,7 +10,7 @@
{-# OPTIONS_HADDOCK hide #-}

module Unicode.Internal.Char.Blocks
(Block(..), BlockDefinition(..), block, blockDefinition, allBlockRanges)
(Block(..), BlockDefinition(..), block, blockDefinition)
where

import Data.Ix (Ix)
Expand Down Expand Up @@ -678,14 +678,6 @@ blockDefinition b = case b of
SupplementaryPrivateUseAreaA -> BlockDefinition (0xf0000, 0xfffff) "Supplementary Private Use Area-A"
SupplementaryPrivateUseAreaB -> BlockDefinition (0x100000, 0x10ffff) "Supplementary Private Use Area-B"

-- | All the block ranges, in ascending order.
--
-- @since 0.3.1
{-# INLINE allBlockRanges #-}
allBlockRanges :: [(Int, Int)]
allBlockRanges =
[(0,127),(128,255),(256,383),(384,591),(592,687),(688,767),(768,879),(880,1023),(1024,1279),(1280,1327),(1328,1423),(1424,1535),(1536,1791),(1792,1871),(1872,1919),(1920,1983),(1984,2047),(2048,2111),(2112,2143),(2144,2159),(2160,2207),(2208,2303),(2304,2431),(2432,2559),(2560,2687),(2688,2815),(2816,2943),(2944,3071),(3072,3199),(3200,3327),(3328,3455),(3456,3583),(3584,3711),(3712,3839),(3840,4095),(4096,4255),(4256,4351),(4352,4607),(4608,4991),(4992,5023),(5024,5119),(5120,5759),(5760,5791),(5792,5887),(5888,5919),(5920,5951),(5952,5983),(5984,6015),(6016,6143),(6144,6319),(6320,6399),(6400,6479),(6480,6527),(6528,6623),(6624,6655),(6656,6687),(6688,6831),(6832,6911),(6912,7039),(7040,7103),(7104,7167),(7168,7247),(7248,7295),(7296,7311),(7312,7359),(7360,7375),(7376,7423),(7424,7551),(7552,7615),(7616,7679),(7680,7935),(7936,8191),(8192,8303),(8304,8351),(8352,8399),(8400,8447),(8448,8527),(8528,8591),(8592,8703),(8704,8959),(8960,9215),(9216,9279),(9280,9311),(9312,9471),(9472,9599),(9600,9631),(9632,9727),(9728,9983),(9984,10175),(10176,10223),(10224,10239),(10240,10495),(10496,10623),(10624,10751),(10752,11007),(11008,11263),(11264,11359),(11360,11391),(11392,11519),(11520,11567),(11568,11647),(11648,11743),(11744,11775),(11776,11903),(11904,12031),(12032,12255),(12272,12287),(12288,12351),(12352,12447),(12448,12543),(12544,12591),(12592,12687),(12688,12703),(12704,12735),(12736,12783),(12784,12799),(12800,13055),(13056,13311),(13312,19903),(19904,19967),(19968,40959),(40960,42127),(42128,42191),(42192,42239),(42240,42559),(42560,42655),(42656,42751),(42752,42783),(42784,43007),(43008,43055),(43056,43071),(43072,43135),(43136,43231),(43232,43263),(43264,43311),(43312,43359),(43360,43391),(43392,43487),(43488,43519),(43520,43615),(43616,43647),(43648,43743),(43744,43775),(43776,43823),(43824,43887),(43888,43967),(43968,44031),(44032,55215),(55216,55295),(55296,56191),(56192,56319),(56320,57343),(57344,63743),(63744,64255),(64256,64335),(64336,65023),(65024,65039),(65040,65055),(65056,65071),(65072,65103),(65104,65135),(65136,65279),(65280,65519),(65520,65535),(65536,65663),(65664,65791),(65792,65855),(65856,65935),(65936,65999),(66000,66047),(66176,66207),(66208,66271),(66272,66303),(66304,66351),(66352,66383),(66384,66431),(66432,66463),(66464,66527),(66560,66639),(66640,66687),(66688,66735),(66736,66815),(66816,66863),(66864,66927),(66928,67007),(67072,67455),(67456,67519),(67584,67647),(67648,67679),(67680,67711),(67712,67759),(67808,67839),(67840,67871),(67872,67903),(67968,67999),(68000,68095),(68096,68191),(68192,68223),(68224,68255),(68288,68351),(68352,68415),(68416,68447),(68448,68479),(68480,68527),(68608,68687),(68736,68863),(68864,68927),(69216,69247),(69248,69311),(69376,69423),(69424,69487),(69488,69551),(69552,69599),(69600,69631),(69632,69759),(69760,69839),(69840,69887),(69888,69967),(69968,70015),(70016,70111),(70112,70143),(70144,70223),(70272,70319),(70320,70399),(70400,70527),(70656,70783),(70784,70879),(71040,71167),(71168,71263),(71264,71295),(71296,71375),(71424,71503),(71680,71759),(71840,71935),(71936,72031),(72096,72191),(72192,72271),(72272,72367),(72368,72383),(72384,72447),(72704,72815),(72816,72895),(72960,73055),(73056,73135),(73440,73471),(73648,73663),(73664,73727),(73728,74751),(74752,74879),(74880,75087),(77712,77823),(77824,78895),(78896,78911),(82944,83583),(92160,92735),(92736,92783),(92784,92879),(92880,92927),(92928,93071),(93760,93855),(93952,94111),(94176,94207),(94208,100351),(100352,101119),(101120,101631),(101632,101759),(110576,110591),(110592,110847),(110848,110895),(110896,110959),(110960,111359),(113664,113823),(113824,113839),(118528,118735),(118784,119039),(119040,119295),(119296,119375),(119520,119551),(119552,119647),(119648,119679),(119808,120831),(120832,121519),(122624,122879),(122880,122927),(123136,123215),(123536,123583),(123584,123647),(124896,124927),(124928,125151),(125184,125279),(126064,126143),(126208,126287),(126464,126719),(126976,127023),(127024,127135),(127136,127231),(127232,127487),(127488,127743),(127744,128511),(128512,128591),(128592,128639),(128640,128767),(128768,128895),(128896,129023),(129024,129279),(129280,129535),(129536,129647),(129648,129791),(129792,130047),(131072,173791),(173824,177983),(177984,178207),(178208,183983),(183984,191471),(194560,195103),(196608,201551),(917504,917631),(917760,917999),(983040,1048575),(1048576,1114111)]

-- | Character block, if defined.
--
-- @since 0.3.1
Expand Down

0 comments on commit bf8bb53

Please sign in to comment.