-- |
-- Module      : Unicode.Char.General
-- Copyright   : (c) 2020 Composewell Technologies and Contributors
-- License     : Apache-2.0
-- Maintainer  : streamly@composewell.com
-- Stability   : experimental
--
-- General character property related functions.
--
module Unicode.Char.General
    ( -- * Types of Code Points
      CodePointType(..)
    , codePointType

      -- * Unicode general categories
    , GeneralCategory(..)
    , generalCategoryAbbr
    , generalCategory

      -- * Character classification
    , isAlphabetic
    , isAlphaNum
    , isControl
    , isMark
    , isPrint
    , isPunctuation
    , isSeparator
    , isSymbol
    , isWhiteSpace
    , isNoncharacter

      -- ** Re-export
    , isAscii
    , isLatin1
    , isAsciiUpper
    , isAsciiLower

    -- * Korean Hangul Characters
    -- | The Hangul script used in the Korean writing system consists of
    -- individual consonant and vowel letters (jamo) that are visually combined
    -- into square display cells to form entire syllable  blocks.  Hangul
    -- syllables  may  be  encoded  directly  as  precomposed  combinations of
    -- individual jamo or as decomposed sequences of conjoining jamo. Modern
    -- Hangul syllable blocks can be expressed with either two or three jamo,
    -- either in the  form  consonant + vowel  or  in  the  form  consonant +
    -- vowel + consonant. The leading consonant is represented as L, the vowel
    -- as V and the trailing consonant as T.
    --
    -- The Unicode Standard contains both a large set of precomposed modern
    -- Hangul syllables and a set of conjoining Hangul jamo, which can be used
    -- to encode archaic Korean syllable blocks as well as modern Korean
    -- syllable blocks.
    --
    -- Hangul characters can be composed or decomposed algorithmically instead
    -- of via mappings.  These APIs are used mainly for Unicode normalization
    -- of Hangul text.
    --
    -- Please refer to the following resources for more information:
    --
    -- * The @Hangul@ section of the @East Asia@ chapter of the [Unicode Standard](https://www.unicode.org/versions/latest)
    -- * Conformance chapter of the [Unicode Standard](https://www.unicode.org/versions/latest)
    -- * [Unicode® Standard Annex #15 - Unicode Normalization Forms](https://www.unicode.org/reports/tr15)
    -- * UCD file @HangulSyllableType.txt@
    -- * https://en.wikipedia.org/wiki/Hangul_Jamo_(Unicode_block)
    -- * https://en.wikipedia.org/wiki/List_of_Hangul_jamo

    -- ** Conjoining Jamo
    -- | Jamo L, V and T letters.
    , isJamo
    , jamoNCount

    -- *** Jamo Leading (L)
    , jamoLFirst
    , jamoLCount
    , jamoLIndex
    , jamoLLast

    -- *** Jamo Vowel (V)
    , jamoVFirst
    , jamoVCount
    , jamoVIndex
    , jamoVLast

    -- *** Jamo Trailing (T)
    , jamoTFirst
    , jamoTCount
    , jamoTIndex
    , jamoTLast

    -- ** Hangul Syllables
    -- | Precomposed Hangul syllables.
    , hangulFirst
    , hangulLast
    , isHangul
    , isHangulLV
    )
where

import Control.Exception (assert)
import Data.Bits ((.&.))
import Data.Char (isAscii, isLatin1, isAsciiUpper, isAsciiLower, ord)
import Data.Ix (Ix)

import qualified Unicode.Char.General.Compat as Compat
import qualified Unicode.Internal.Char.DerivedCoreProperties as P
import qualified Unicode.Internal.Char.PropList as P
import qualified Unicode.Internal.Char.UnicodeData.GeneralCategory as UC
import Unicode.Internal.Division (quotRem28)

--------------------------------------------------------------------------------
-- General Category
--------------------------------------------------------------------------------

{-| Unicode General Categories.

These classes are defined in the
[Unicode Character Database](http://www.unicode.org/reports/tr44/tr44-14.html#GC_Values_Table),
part of the Unicode standard.

__Note:__ the classes must be in the same order they are listed in the Unicode Standard,
because some functions (e.g. 'generalCategory') rely on the 'Enum' instance.

@since 0.3.0
-}
data GeneralCategory
    -- L: Letter
    = UppercaseLetter       -- ^ @Lu@: Letter, Uppercase
    | LowercaseLetter       -- ^ @Ll@: Letter, Lowercase
    | TitlecaseLetter       -- ^ @Lt@: Letter, Titlecase
    | ModifierLetter        -- ^ @Lm@: Letter, Modifier
    | OtherLetter           -- ^ @Lo@: Letter, Other

    -- M: Mark
    | NonSpacingMark        -- ^ @Mn@: Mark, Non-Spacing
    | SpacingCombiningMark  -- ^ @Mc@: Mark, Spacing Combining
    | EnclosingMark         -- ^ @Me@: Mark, Enclosing

    -- N: Number
    | DecimalNumber         -- ^ @Nd@: Number, Decimal
    | LetterNumber          -- ^ @Nl@: Number, Letter
    | OtherNumber           -- ^ @No@: Number, Other

    -- P: Punctuation
    | ConnectorPunctuation  -- ^ @Pc@: Punctuation, Connector
    | DashPunctuation       -- ^ @Pd@: Punctuation, Dash
    | OpenPunctuation       -- ^ @Ps@: Punctuation, Open
    | ClosePunctuation      -- ^ @Pe@: Punctuation, Close
    | InitialQuote          -- ^ @Pi@: Punctuation, Initial quote
    | FinalQuote            -- ^ @Pf@: Punctuation, Final quote
    | OtherPunctuation      -- ^ @Po@: Punctuation, Other

    -- S: Symbol
    | MathSymbol            -- ^ @Sm@: Symbol, Math
    | CurrencySymbol        -- ^ @Sc@: Symbol, Currency
    | ModifierSymbol        -- ^ @Sk@: Symbol, Modifier
    | OtherSymbol           -- ^ @So@: Symbol, Other

    -- Z: Separator
    | Space                 -- ^ @Zs@: Separator, Space
    | LineSeparator         -- ^ @Zl@: Separator, Line
    | ParagraphSeparator    -- ^ @Zp@: Separator, Paragraph

    -- C: Other
    | Control               -- ^ @Cc@: Other, Control
    | Format                -- ^ @Cf@: Other, Format
    | Surrogate             -- ^ @Cs@: Other, Surrogate
    | PrivateUse            -- ^ @Co@: Other, Private Use
    | NotAssigned           -- ^ @Cn@: Other, Not Assigned
    deriving ( Int -> GeneralCategory -> ShowS
[GeneralCategory] -> ShowS
GeneralCategory -> String
(Int -> GeneralCategory -> ShowS)
-> (GeneralCategory -> String)
-> ([GeneralCategory] -> ShowS)
-> Show GeneralCategory
forall a.
(Int -> a -> ShowS) -> (a -> String) -> ([a] -> ShowS) -> Show a
$cshowsPrec :: Int -> GeneralCategory -> ShowS
showsPrec :: Int -> GeneralCategory -> ShowS
$cshow :: GeneralCategory -> String
show :: GeneralCategory -> String
$cshowList :: [GeneralCategory] -> ShowS
showList :: [GeneralCategory] -> ShowS
Show
            , GeneralCategory -> GeneralCategory -> Bool
(GeneralCategory -> GeneralCategory -> Bool)
-> (GeneralCategory -> GeneralCategory -> Bool)
-> Eq GeneralCategory
forall a. (a -> a -> Bool) -> (a -> a -> Bool) -> Eq a
$c== :: GeneralCategory -> GeneralCategory -> Bool
== :: GeneralCategory -> GeneralCategory -> Bool
$c/= :: GeneralCategory -> GeneralCategory -> Bool
/= :: GeneralCategory -> GeneralCategory -> Bool
Eq
            , Eq GeneralCategory
Eq GeneralCategory
-> (GeneralCategory -> GeneralCategory -> Ordering)
-> (GeneralCategory -> GeneralCategory -> Bool)
-> (GeneralCategory -> GeneralCategory -> Bool)
-> (GeneralCategory -> GeneralCategory -> Bool)
-> (GeneralCategory -> GeneralCategory -> Bool)
-> (GeneralCategory -> GeneralCategory -> GeneralCategory)
-> (GeneralCategory -> GeneralCategory -> GeneralCategory)
-> Ord GeneralCategory
GeneralCategory -> GeneralCategory -> Bool
GeneralCategory -> GeneralCategory -> Ordering
GeneralCategory -> GeneralCategory -> GeneralCategory
forall a.
Eq a
-> (a -> a -> Ordering)
-> (a -> a -> Bool)
-> (a -> a -> Bool)
-> (a -> a -> Bool)
-> (a -> a -> Bool)
-> (a -> a -> a)
-> (a -> a -> a)
-> Ord a
$ccompare :: GeneralCategory -> GeneralCategory -> Ordering
compare :: GeneralCategory -> GeneralCategory -> Ordering
$c< :: GeneralCategory -> GeneralCategory -> Bool
< :: GeneralCategory -> GeneralCategory -> Bool
$c<= :: GeneralCategory -> GeneralCategory -> Bool
<= :: GeneralCategory -> GeneralCategory -> Bool
$c> :: GeneralCategory -> GeneralCategory -> Bool
> :: GeneralCategory -> GeneralCategory -> Bool
$c>= :: GeneralCategory -> GeneralCategory -> Bool
>= :: GeneralCategory -> GeneralCategory -> Bool
$cmax :: GeneralCategory -> GeneralCategory -> GeneralCategory
max :: GeneralCategory -> GeneralCategory -> GeneralCategory
$cmin :: GeneralCategory -> GeneralCategory -> GeneralCategory
min :: GeneralCategory -> GeneralCategory -> GeneralCategory
Ord
            , Int -> GeneralCategory
GeneralCategory -> Int
GeneralCategory -> [GeneralCategory]
GeneralCategory -> GeneralCategory
GeneralCategory -> GeneralCategory -> [GeneralCategory]
GeneralCategory
-> GeneralCategory -> GeneralCategory -> [GeneralCategory]
(GeneralCategory -> GeneralCategory)
-> (GeneralCategory -> GeneralCategory)
-> (Int -> GeneralCategory)
-> (GeneralCategory -> Int)
-> (GeneralCategory -> [GeneralCategory])
-> (GeneralCategory -> GeneralCategory -> [GeneralCategory])
-> (GeneralCategory -> GeneralCategory -> [GeneralCategory])
-> (GeneralCategory
    -> GeneralCategory -> GeneralCategory -> [GeneralCategory])
-> Enum GeneralCategory
forall a.
(a -> a)
-> (a -> a)
-> (Int -> a)
-> (a -> Int)
-> (a -> [a])
-> (a -> a -> [a])
-> (a -> a -> [a])
-> (a -> a -> a -> [a])
-> Enum a
$csucc :: GeneralCategory -> GeneralCategory
succ :: GeneralCategory -> GeneralCategory
$cpred :: GeneralCategory -> GeneralCategory
pred :: GeneralCategory -> GeneralCategory
$ctoEnum :: Int -> GeneralCategory
toEnum :: Int -> GeneralCategory
$cfromEnum :: GeneralCategory -> Int
fromEnum :: GeneralCategory -> Int
$cenumFrom :: GeneralCategory -> [GeneralCategory]
enumFrom :: GeneralCategory -> [GeneralCategory]
$cenumFromThen :: GeneralCategory -> GeneralCategory -> [GeneralCategory]
enumFromThen :: GeneralCategory -> GeneralCategory -> [GeneralCategory]
$cenumFromTo :: GeneralCategory -> GeneralCategory -> [GeneralCategory]
enumFromTo :: GeneralCategory -> GeneralCategory -> [GeneralCategory]
$cenumFromThenTo :: GeneralCategory
-> GeneralCategory -> GeneralCategory -> [GeneralCategory]
enumFromThenTo :: GeneralCategory
-> GeneralCategory -> GeneralCategory -> [GeneralCategory]
Enum
            , GeneralCategory
GeneralCategory -> GeneralCategory -> Bounded GeneralCategory
forall a. a -> a -> Bounded a
$cminBound :: GeneralCategory
minBound :: GeneralCategory
$cmaxBound :: GeneralCategory
maxBound :: GeneralCategory
Bounded
            , Ord GeneralCategory
Ord GeneralCategory
-> ((GeneralCategory, GeneralCategory) -> [GeneralCategory])
-> ((GeneralCategory, GeneralCategory) -> GeneralCategory -> Int)
-> ((GeneralCategory, GeneralCategory) -> GeneralCategory -> Int)
-> ((GeneralCategory, GeneralCategory) -> GeneralCategory -> Bool)
-> ((GeneralCategory, GeneralCategory) -> Int)
-> ((GeneralCategory, GeneralCategory) -> Int)
-> Ix GeneralCategory
(GeneralCategory, GeneralCategory) -> Int
(GeneralCategory, GeneralCategory) -> [GeneralCategory]
(GeneralCategory, GeneralCategory) -> GeneralCategory -> Bool
(GeneralCategory, GeneralCategory) -> GeneralCategory -> Int
forall a.
Ord a
-> ((a, a) -> [a])
-> ((a, a) -> a -> Int)
-> ((a, a) -> a -> Int)
-> ((a, a) -> a -> Bool)
-> ((a, a) -> Int)
-> ((a, a) -> Int)
-> Ix a
$crange :: (GeneralCategory, GeneralCategory) -> [GeneralCategory]
range :: (GeneralCategory, GeneralCategory) -> [GeneralCategory]
$cindex :: (GeneralCategory, GeneralCategory) -> GeneralCategory -> Int
index :: (GeneralCategory, GeneralCategory) -> GeneralCategory -> Int
$cunsafeIndex :: (GeneralCategory, GeneralCategory) -> GeneralCategory -> Int
unsafeIndex :: (GeneralCategory, GeneralCategory) -> GeneralCategory -> Int
$cinRange :: (GeneralCategory, GeneralCategory) -> GeneralCategory -> Bool
inRange :: (GeneralCategory, GeneralCategory) -> GeneralCategory -> Bool
$crangeSize :: (GeneralCategory, GeneralCategory) -> Int
rangeSize :: (GeneralCategory, GeneralCategory) -> Int
$cunsafeRangeSize :: (GeneralCategory, GeneralCategory) -> Int
unsafeRangeSize :: (GeneralCategory, GeneralCategory) -> Int
Ix
            )

-- | Abbreviation of 'GeneralCategory' used in the Unicode standard.
--
-- @since 0.3.0
generalCategoryAbbr :: GeneralCategory -> String
generalCategoryAbbr :: GeneralCategory -> String
generalCategoryAbbr = \case
    GeneralCategory
UppercaseLetter      -> String
"Lu"
    GeneralCategory
LowercaseLetter      -> String
"Ll"
    GeneralCategory
TitlecaseLetter      -> String
"Lt"
    GeneralCategory
ModifierLetter       -> String
"Lm"
    GeneralCategory
OtherLetter          -> String
"Lo"
    GeneralCategory
NonSpacingMark       -> String
"Mn"
    GeneralCategory
SpacingCombiningMark -> String
"Mc"
    GeneralCategory
EnclosingMark        -> String
"Me"
    GeneralCategory
DecimalNumber        -> String
"Nd"
    GeneralCategory
LetterNumber         -> String
"Nl"
    GeneralCategory
OtherNumber          -> String
"No"
    GeneralCategory
ConnectorPunctuation -> String
"Pc"
    GeneralCategory
DashPunctuation      -> String
"Pd"
    GeneralCategory
OpenPunctuation      -> String
"Ps"
    GeneralCategory
ClosePunctuation     -> String
"Pe"
    GeneralCategory
InitialQuote         -> String
"Pi"
    GeneralCategory
FinalQuote           -> String
"Pf"
    GeneralCategory
OtherPunctuation     -> String
"Po"
    GeneralCategory
MathSymbol           -> String
"Sm"
    GeneralCategory
CurrencySymbol       -> String
"Sc"
    GeneralCategory
ModifierSymbol       -> String
"Sk"
    GeneralCategory
OtherSymbol          -> String
"So"
    GeneralCategory
Space                -> String
"Zs"
    GeneralCategory
LineSeparator        -> String
"Zl"
    GeneralCategory
ParagraphSeparator   -> String
"Zp"
    GeneralCategory
Control              -> String
"Cc"
    GeneralCategory
Format               -> String
"Cf"
    GeneralCategory
Surrogate            -> String
"Cs"
    GeneralCategory
PrivateUse           -> String
"Co"
    GeneralCategory
NotAssigned          -> String
"Cn"

{-| The Unicode general category of the character.

This property is defined in the column 2 of the @UnicodeData@ table.

This relies on the 'Enum' instance of 'GeneralCategory', which must remain in the
same order as the categories are presented in the Unicode standard.

prop> show (generalCategory c) == show (Data.Char.generalCategory c)

@since 0.3.0
-}
{-# INLINE generalCategory #-}
generalCategory :: Char -> GeneralCategory
generalCategory :: Char -> GeneralCategory
generalCategory = Int -> GeneralCategory
forall a. Enum a => Int -> a
toEnum (Int -> GeneralCategory)
-> (Char -> Int) -> Char -> GeneralCategory
forall b c a. (b -> c) -> (a -> b) -> a -> c
. Char -> Int
UC.generalCategory

--------------------------------------------------------------------------------
-- Types of Code Points
--------------------------------------------------------------------------------

-- | Types of Code Points.
--
-- These classes are defined in the section
-- [2.4 “Code Points and Characters”](https://www.unicode.org/versions/Unicode15.0.0/ch02.pdf#G14527)
-- of the Unicode standard.
--
-- @since 0.4.1
data CodePointType
    = GraphicType
    -- ^ __Graphic__: defined by the following general categories:
    --
    -- * Letters (L): 'UppercaseLetter', 'LowercaseLetter', 'TitlecaseLetter',
    --   'ModifierLetter', 'OtherLetter'.
    -- * Marks (M): 'NonSpacingMark', 'SpacingCombiningMark', 'EnclosingMark'.
    -- * Numbers (N): 'DecimalNumber', 'LetterNumber', 'OtherNumber'.
    -- * Punctuation (P): 'ConnectorPunctuation', 'DashPunctuation',
    --   'OpenPunctuation', 'ClosePunctuation', 'InitialQuote', 'FinalQuote',
    --   'OtherPunctuation'.
    -- * Symbol (S): 'MathSymbol', 'CurrencySymbol', 'ModifierSymbol',
    --   'OtherSymbol'.
    -- * Separators: 'Space'.
    | FormatType
    -- ^ __Format__: invisible but affects neighboring characters.
    --
    -- Defined by the following general categories:
    -- 'LineSeparator', 'ParagraphSeparator', 'Format'.
    | ControlType
    -- ^ __Control__: usage defined by protocols or standards outside the
    -- Unicode Standard.
    --
    -- Defined by the general category 'Control'.
    | PrivateUseType
    -- ^ __Private-use__: usage defined by private agreement outside the
    -- Unicode Standard.
    --
    -- Defined by the general category 'PrivateUse'.
    | SurrogateType
    -- ^ __Surrogate__: Permanently reserved for UTF-16.
    --
    -- Defined by the general category 'Surrogate'.
    | NoncharacterType
    -- ^ __Noncharacter:__ a code point that is permanently reserved for
    -- internal use (see definition D14 in the section
    -- [3.4 “Characters and Encoding”](https://www.unicode.org/versions/Unicode15.0.0/ch03.pdf#G2212)
    -- of the Unicode Standard).
    -- Noncharacters consist of the values @U+nFFFE@ and @U+nFFFF@ (where @n@
    -- is from 0 to 10₁₆) and the values @U+FDD0..U+FDEF@.
    --
    -- They are a subset of the general category 'NotAssigned'.
    | ReservedType
    -- ^ __Reserved:__ any code point of the Unicode Standard that is reserved
    -- for future assignment (see definition D15 in the section
    -- [3.4 “Characters and Encoding”](https://www.unicode.org/versions/Unicode15.0.0/ch03.pdf#G2212)
    -- of the Unicode Standard). Also known as an unassigned code point.
    --
    -- They are a subset of the general category 'NotAssigned'.
    deriving ( Int -> CodePointType -> ShowS
[CodePointType] -> ShowS
CodePointType -> String
(Int -> CodePointType -> ShowS)
-> (CodePointType -> String)
-> ([CodePointType] -> ShowS)
-> Show CodePointType
forall a.
(Int -> a -> ShowS) -> (a -> String) -> ([a] -> ShowS) -> Show a
$cshowsPrec :: Int -> CodePointType -> ShowS
showsPrec :: Int -> CodePointType -> ShowS
$cshow :: CodePointType -> String
show :: CodePointType -> String
$cshowList :: [CodePointType] -> ShowS
showList :: [CodePointType] -> ShowS
Show
             , CodePointType -> CodePointType -> Bool
(CodePointType -> CodePointType -> Bool)
-> (CodePointType -> CodePointType -> Bool) -> Eq CodePointType
forall a. (a -> a -> Bool) -> (a -> a -> Bool) -> Eq a
$c== :: CodePointType -> CodePointType -> Bool
== :: CodePointType -> CodePointType -> Bool
$c/= :: CodePointType -> CodePointType -> Bool
/= :: CodePointType -> CodePointType -> Bool
Eq
             , Eq CodePointType
Eq CodePointType
-> (CodePointType -> CodePointType -> Ordering)
-> (CodePointType -> CodePointType -> Bool)
-> (CodePointType -> CodePointType -> Bool)
-> (CodePointType -> CodePointType -> Bool)
-> (CodePointType -> CodePointType -> Bool)
-> (CodePointType -> CodePointType -> CodePointType)
-> (CodePointType -> CodePointType -> CodePointType)
-> Ord CodePointType
CodePointType -> CodePointType -> Bool
CodePointType -> CodePointType -> Ordering
CodePointType -> CodePointType -> CodePointType
forall a.
Eq a
-> (a -> a -> Ordering)
-> (a -> a -> Bool)
-> (a -> a -> Bool)
-> (a -> a -> Bool)
-> (a -> a -> Bool)
-> (a -> a -> a)
-> (a -> a -> a)
-> Ord a
$ccompare :: CodePointType -> CodePointType -> Ordering
compare :: CodePointType -> CodePointType -> Ordering
$c< :: CodePointType -> CodePointType -> Bool
< :: CodePointType -> CodePointType -> Bool
$c<= :: CodePointType -> CodePointType -> Bool
<= :: CodePointType -> CodePointType -> Bool
$c> :: CodePointType -> CodePointType -> Bool
> :: CodePointType -> CodePointType -> Bool
$c>= :: CodePointType -> CodePointType -> Bool
>= :: CodePointType -> CodePointType -> Bool
$cmax :: CodePointType -> CodePointType -> CodePointType
max :: CodePointType -> CodePointType -> CodePointType
$cmin :: CodePointType -> CodePointType -> CodePointType
min :: CodePointType -> CodePointType -> CodePointType
Ord
             , Int -> CodePointType
CodePointType -> Int
CodePointType -> [CodePointType]
CodePointType -> CodePointType
CodePointType -> CodePointType -> [CodePointType]
CodePointType -> CodePointType -> CodePointType -> [CodePointType]
(CodePointType -> CodePointType)
-> (CodePointType -> CodePointType)
-> (Int -> CodePointType)
-> (CodePointType -> Int)
-> (CodePointType -> [CodePointType])
-> (CodePointType -> CodePointType -> [CodePointType])
-> (CodePointType -> CodePointType -> [CodePointType])
-> (CodePointType
    -> CodePointType -> CodePointType -> [CodePointType])
-> Enum CodePointType
forall a.
(a -> a)
-> (a -> a)
-> (Int -> a)
-> (a -> Int)
-> (a -> [a])
-> (a -> a -> [a])
-> (a -> a -> [a])
-> (a -> a -> a -> [a])
-> Enum a
$csucc :: CodePointType -> CodePointType
succ :: CodePointType -> CodePointType
$cpred :: CodePointType -> CodePointType
pred :: CodePointType -> CodePointType
$ctoEnum :: Int -> CodePointType
toEnum :: Int -> CodePointType
$cfromEnum :: CodePointType -> Int
fromEnum :: CodePointType -> Int
$cenumFrom :: CodePointType -> [CodePointType]
enumFrom :: CodePointType -> [CodePointType]
$cenumFromThen :: CodePointType -> CodePointType -> [CodePointType]
enumFromThen :: CodePointType -> CodePointType -> [CodePointType]
$cenumFromTo :: CodePointType -> CodePointType -> [CodePointType]
enumFromTo :: CodePointType -> CodePointType -> [CodePointType]
$cenumFromThenTo :: CodePointType -> CodePointType -> CodePointType -> [CodePointType]
enumFromThenTo :: CodePointType -> CodePointType -> CodePointType -> [CodePointType]
Enum
             , CodePointType
CodePointType -> CodePointType -> Bounded CodePointType
forall a. a -> a -> Bounded a
$cminBound :: CodePointType
minBound :: CodePointType
$cmaxBound :: CodePointType
maxBound :: CodePointType
Bounded
             , Ord CodePointType
Ord CodePointType
-> ((CodePointType, CodePointType) -> [CodePointType])
-> ((CodePointType, CodePointType) -> CodePointType -> Int)
-> ((CodePointType, CodePointType) -> CodePointType -> Int)
-> ((CodePointType, CodePointType) -> CodePointType -> Bool)
-> ((CodePointType, CodePointType) -> Int)
-> ((CodePointType, CodePointType) -> Int)
-> Ix CodePointType
(CodePointType, CodePointType) -> Int
(CodePointType, CodePointType) -> [CodePointType]
(CodePointType, CodePointType) -> CodePointType -> Bool
(CodePointType, CodePointType) -> CodePointType -> Int
forall a.
Ord a
-> ((a, a) -> [a])
-> ((a, a) -> a -> Int)
-> ((a, a) -> a -> Int)
-> ((a, a) -> a -> Bool)
-> ((a, a) -> Int)
-> ((a, a) -> Int)
-> Ix a
$crange :: (CodePointType, CodePointType) -> [CodePointType]
range :: (CodePointType, CodePointType) -> [CodePointType]
$cindex :: (CodePointType, CodePointType) -> CodePointType -> Int
index :: (CodePointType, CodePointType) -> CodePointType -> Int
$cunsafeIndex :: (CodePointType, CodePointType) -> CodePointType -> Int
unsafeIndex :: (CodePointType, CodePointType) -> CodePointType -> Int
$cinRange :: (CodePointType, CodePointType) -> CodePointType -> Bool
inRange :: (CodePointType, CodePointType) -> CodePointType -> Bool
$crangeSize :: (CodePointType, CodePointType) -> Int
rangeSize :: (CodePointType, CodePointType) -> Int
$cunsafeRangeSize :: (CodePointType, CodePointType) -> Int
unsafeRangeSize :: (CodePointType, CodePointType) -> Int
Ix
             )

-- | Returns the 'CodePointType' of a character.
--
-- @since 0.6.0
codePointType :: Char -> CodePointType
codePointType :: Char -> CodePointType
codePointType Char
c = case Char -> GeneralCategory
generalCategory Char
c of
    GeneralCategory
UppercaseLetter      -> CodePointType
GraphicType
    GeneralCategory
LowercaseLetter      -> CodePointType
GraphicType
    GeneralCategory
TitlecaseLetter      -> CodePointType
GraphicType
    GeneralCategory
ModifierLetter       -> CodePointType
GraphicType
    GeneralCategory
OtherLetter          -> CodePointType
GraphicType
    GeneralCategory
NonSpacingMark       -> CodePointType
GraphicType
    GeneralCategory
SpacingCombiningMark -> CodePointType
GraphicType
    GeneralCategory
EnclosingMark        -> CodePointType
GraphicType
    GeneralCategory
DecimalNumber        -> CodePointType
GraphicType
    GeneralCategory
LetterNumber         -> CodePointType
GraphicType
    GeneralCategory
OtherNumber          -> CodePointType
GraphicType
    GeneralCategory
ConnectorPunctuation -> CodePointType
GraphicType
    GeneralCategory
DashPunctuation      -> CodePointType
GraphicType
    GeneralCategory
OpenPunctuation      -> CodePointType
GraphicType
    GeneralCategory
ClosePunctuation     -> CodePointType
GraphicType
    GeneralCategory
InitialQuote         -> CodePointType
GraphicType
    GeneralCategory
FinalQuote           -> CodePointType
GraphicType
    GeneralCategory
OtherPunctuation     -> CodePointType
GraphicType
    GeneralCategory
MathSymbol           -> CodePointType
GraphicType
    GeneralCategory
CurrencySymbol       -> CodePointType
GraphicType
    GeneralCategory
ModifierSymbol       -> CodePointType
GraphicType
    GeneralCategory
OtherSymbol          -> CodePointType
GraphicType
    GeneralCategory
Space                -> CodePointType
GraphicType
    GeneralCategory
LineSeparator        -> CodePointType
FormatType
    GeneralCategory
ParagraphSeparator   -> CodePointType
FormatType
    GeneralCategory
Control              -> CodePointType
ControlType
    GeneralCategory
Format               -> CodePointType
FormatType
    GeneralCategory
Surrogate            -> CodePointType
SurrogateType
    GeneralCategory
PrivateUse           -> CodePointType
PrivateUseType
    GeneralCategory
NotAssigned
        | Char -> Bool
isNoncharacter Char
c -> CodePointType
NoncharacterType
        | Bool
otherwise        -> CodePointType
ReservedType

--------------------------------------------------------------------------------
-- Predicates
--------------------------------------------------------------------------------

{-| Returns 'True' for alphabetic Unicode characters (lower-case, upper-case
and title-case letters, plus letters of caseless scripts and modifiers
letters).

__Note:__ this function is /not/ equivalent to
'Unicode.Char.General.Compat.isAlpha' / 'Unicode.Char.General.Compat.isLetter':

* 'Unicode.Char.General.Compat.isAlpha' matches the following general categories:

    * 'UppercaseLetter' (@Lu@)
    * 'LowercaseLetter' (@Ll@)
    * 'TitlecaseLetter' (@Lt@)
    * 'ModifierLetter' (@Lm@)
    * 'OtherLetter' (@Lo@)

* whereas 'isAlphabetic' matches:

    * @Uppercase@ [property](https://www.unicode.org/reports/tr44/#Uppercase)
    * @Lowercase@ [property](https://www.unicode.org/reports/tr44/#Lowercase)
    * 'TitlecaseLetter' (@Lt@)
    * 'ModifierLetter' (@Lm@)
    * 'OtherLetter' (@Lo@)
    * 'LetterNumber' (@Nl@)
    * @Other_Alphabetic@ [property](https://www.unicode.org/reports/tr44/#Other_Alphabetic)

@since 0.3.0
-}
{-# INLINE isAlphabetic #-}
isAlphabetic :: Char -> Bool
isAlphabetic :: Char -> Bool
isAlphabetic = Char -> Bool
P.isAlphabetic

{-| Selects alphabetic or numeric Unicode characters.

This function returns 'True' if its argument has one of the
following 'GeneralCategory's, or 'False' otherwise:

* 'UppercaseLetter'
* 'LowercaseLetter'
* 'TitlecaseLetter'
* 'ModifierLetter'
* 'OtherLetter'
* 'DecimalNumber'
* 'LetterNumber'
* 'OtherNumber'

prop> isAlphaNum c == Data.Char.isAlphaNum c

__Note:__ this function is incompatible with 'isAlphabetic':

>>> isAlphabetic '\x345'
True
>>> isAlphaNum '\x345'
False

@since 0.3.0
-}
{-# INLINE isAlphaNum #-}
{-# DEPRECATED isAlphaNum "Use Unicode.Char.General.Compat.isAlphaNum instead." #-}
isAlphaNum :: Char -> Bool
isAlphaNum :: Char -> Bool
isAlphaNum = Char -> Bool
Compat.isAlphaNum

{-| Selects control characters, which are the non-printing characters
of the Latin-1 subset of Unicode.

This function returns 'True' if its argument has the 'GeneralCategory' 'Control'.

prop> isControl c == Data.Char.isControl c

@since 0.3.0
-}
isControl :: Char -> Bool
-- By definition (https://www.unicode.org/reports/tr44/#General_Category_Values)
-- “a C0 or C1 control code”, i.e. the 0x00-0x1f, 0x7f, and 0x80-0x9f.
isControl :: Char -> Bool
isControl Char
c = Int
cp Int -> Int -> Bool
forall a. Ord a => a -> a -> Bool
<= Int
0x9F Bool -> Bool -> Bool
&& Int -> Int
UC.generalCategoryPlanes0To3 Int
cp Int -> Int -> Bool
forall a. Eq a => a -> a -> Bool
== Int
UC.Control
    where cp :: Int
cp = Char -> Int
ord Char
c

{-| Selects Unicode mark characters, for example accents and the
like, which combine with preceding characters.

This function returns 'True' if its argument has one of the
following 'GeneralCategory's, or 'False' otherwise:

* 'NonSpacingMark'
* 'SpacingCombiningMark'
* 'EnclosingMark'

prop> isMark c == Data.Char.isMark c

@since 0.3.0
-}
isMark :: Char -> Bool
isMark :: Char -> Bool
isMark Char
c = Int
UC.NonSpacingMark Int -> Int -> Bool
forall a. Ord a => a -> a -> Bool
<= Int
gc Bool -> Bool -> Bool
&& Int
gc Int -> Int -> Bool
forall a. Ord a => a -> a -> Bool
<= Int
UC.EnclosingMark
    where gc :: Int
gc = Char -> Int
UC.generalCategory Char
c

{-| Selects printable Unicode characters (letters, numbers, marks, punctuation,
symbols and spaces).

This function returns 'False' if its argument has one of the
following 'GeneralCategory's, or 'True' otherwise:

* 'LineSeparator'
* 'ParagraphSeparator'
* 'Control'
* 'Format'
* 'Surrogate'
* 'PrivateUse'
* 'NotAssigned'

prop> isPrint c == Data.Char.isPrint c

@since 0.3.0
-}
isPrint :: Char -> Bool
isPrint :: Char -> Bool
isPrint Char
c = Char -> Int
UC.generalCategory Char
c Int -> Int -> Bool
forall a. Ord a => a -> a -> Bool
< Int
UC.LineSeparator

{-| Selects Unicode punctuation characters, including various kinds
of connectors, brackets and quotes.

This function returns 'True' if its argument has one of the
following 'GeneralCategory's, or 'False' otherwise:

* 'ConnectorPunctuation'
* 'DashPunctuation'
* 'OpenPunctuation'
* 'ClosePunctuation'
* 'InitialQuote'
* 'FinalQuote'
* 'OtherPunctuation'

prop> isPunctuation c == Data.Char.isPunctuation c

@since 0.3.0
-}
isPunctuation :: Char -> Bool
isPunctuation :: Char -> Bool
isPunctuation Char
c = Int
UC.ConnectorPunctuation Int -> Int -> Bool
forall a. Ord a => a -> a -> Bool
<= Int
gc Bool -> Bool -> Bool
&& Int
gc Int -> Int -> Bool
forall a. Ord a => a -> a -> Bool
<= Int
UC.OtherPunctuation
    where gc :: Int
gc = Char -> Int
UC.generalCategory Char
c

{- | Returns 'True' for any whitespace characters, and the control
characters @\\t@, @\\n@, @\\r@, @\\f@, @\\v@.

See: [Unicode @White_Space@](https://www.unicode.org/reports/tr44/#White_Space).

__Note:__ 'isWhiteSpace' is /not/ equivalent to 'Unicode.Char.General.Compat.isSpace'.
'isWhiteSpace' selects the same characters from 'isSpace' plus the following:

* @U+0085@ NEXT LINE (NEL)
* @U+2028@ LINE SEPARATOR
* @U+2029@ PARAGRAPH SEPARATOR

@since 0.3.0
-}
{-# INLINE isWhiteSpace #-}
isWhiteSpace :: Char -> Bool
isWhiteSpace :: Char -> Bool
isWhiteSpace = Char -> Bool
P.isWhite_Space

{-| Selects Unicode space and separator characters.

This function returns 'True' if its argument has one of the
following 'GeneralCategory's, or 'False' otherwise:

* 'Space'
* 'LineSeparator'
* 'ParagraphSeparator'

prop> isSeparator c == Data.Char.isSeparator c

@since 0.3.0
-}
isSeparator :: Char -> Bool
isSeparator :: Char -> Bool
isSeparator Char
c =
    let !cp :: Int
cp = Char -> Int
ord Char
c
    -- NOTE: The guard constant is updated at each Unicode revision.
    --       It must be < 0x40000 to be accepted by generalCategoryPlanes0To3.
    in Int
cp Int -> Int -> Bool
forall a. Ord a => a -> a -> Bool
<= Int
UC.MaxIsSeparator Bool -> Bool -> Bool
&&
        let !gc :: Int
gc = Int -> Int
UC.generalCategoryPlanes0To3 Int
cp
        in Int
UC.Space Int -> Int -> Bool
forall a. Ord a => a -> a -> Bool
<= Int
gc Bool -> Bool -> Bool
&& Int
gc Int -> Int -> Bool
forall a. Ord a => a -> a -> Bool
<= Int
UC.ParagraphSeparator
    -- Use the following in case the previous code is not valid anymore:
    -- UC.Space <= gc && gc <= UC.ParagraphSeparator
    -- where gc = UC.generalCategory c

{-| Selects Unicode symbol characters, including mathematical and currency symbols.

This function returns 'True' if its argument has one of the
following 'GeneralCategory's, or 'False' otherwise:
* 'MathSymbol'
* 'CurrencySymbol'
* 'ModifierSymbol'
* 'OtherSymbol'

prop> isSymbol c == Data.Char.isSymbol c

@since 0.3.0
-}
isSymbol :: Char -> Bool
isSymbol :: Char -> Bool
isSymbol Char
c = Int
UC.MathSymbol Int -> Int -> Bool
forall a. Ord a => a -> a -> Bool
<= Int
gc Bool -> Bool -> Bool
&& Int
gc Int -> Int -> Bool
forall a. Ord a => a -> a -> Bool
<= Int
UC.OtherSymbol
    where gc :: Int
gc = Char -> Int
UC.generalCategory Char
c

-- | Returns 'True' for any /noncharacter/.
--
-- A /noncharacter/ is a code point that is permanently reserved for internal
-- use (see definition D14 in the section
-- [3.4 “Characters and Encoding”](https://www.unicode.org/versions/Unicode15.0.0/ch03.pdf#G2212)
-- of the Unicode Standard).
--
-- Noncharacters consist of the values @U+nFFFE@ and @U+nFFFF@ (where @n@
-- is from 0 to 10₁₆) and the values @U+FDD0..U+FDEF@.
--
-- @since 0.6.0
isNoncharacter :: Char -> Bool
isNoncharacter :: Char -> Bool
isNoncharacter Char
c
    = (Char
'\xFDD0' Char -> Char -> Bool
forall a. Ord a => a -> a -> Bool
<= Char
c Bool -> Bool -> Bool
&& Char
c Char -> Char -> Bool
forall a. Ord a => a -> a -> Bool
<= Char
'\xFDEF')
    Bool -> Bool -> Bool
|| (Char -> Int
ord Char
c Int -> Int -> Int
forall a. Bits a => a -> a -> a
.&. Int
0xFFFF) Int -> Int -> Bool
forall a. Ord a => a -> a -> Bool
>= Int
0xFFFE

-------------------------------------------------------------------------------
-- Korean Hangul
-------------------------------------------------------------------------------

-- jamo leading
jamoLFirst, jamoLCount, jamoLLast :: Int

-- | First leading consonant jamo.
--
-- @since 0.1.0
jamoLFirst :: Int
jamoLFirst  = Int
0x1100

-- | Total count of leading consonant jamo.
--
-- @since 0.3.0
jamoLCount :: Int
jamoLCount = Int
19

-- | Last leading consonant jamo.
--
-- @since 0.1.0
jamoLLast :: Int
jamoLLast = Int
jamoLFirst Int -> Int -> Int
forall a. Num a => a -> a -> a
+ Int
jamoLCount Int -> Int -> Int
forall a. Num a => a -> a -> a
- Int
1

-- jamo vowel
jamoVFirst, jamoVCount, jamoVLast :: Int

-- | First vowel jamo.
--
-- @since 0.1.0
jamoVFirst :: Int
jamoVFirst  = Int
0x1161

-- | Total count of vowel jamo.
--
-- @since 0.1.0
jamoVCount :: Int
jamoVCount = Int
21

-- | Last vowel jamo.
--
-- @since 0.1.0
jamoVLast :: Int
jamoVLast = Int
jamoVFirst Int -> Int -> Int
forall a. Num a => a -> a -> a
+ Int
jamoVCount Int -> Int -> Int
forall a. Num a => a -> a -> a
- Int
1

-- jamo trailing
jamoTFirst, jamoTCount :: Int

-- | The first trailing consonant jamo.
--
-- Note that 'jamoTFirst' does not represent a valid T, it represents a missing
-- T i.e. LV without a T. See comments under 'jamoTIndex' .
--
-- @since 0.1.0
jamoTFirst :: Int
jamoTFirst  = Int
0x11a7

-- | Total count of trailing consonant jamo.
--
-- @since 0.1.0
jamoTCount :: Int
jamoTCount = Int
28

-- | Last trailing consonant jamo.
--
-- @since 0.1.0
jamoTLast :: Int
jamoTLast :: Int
jamoTLast = Int
jamoTFirst Int -> Int -> Int
forall a. Num a => a -> a -> a
+ Int
jamoTCount Int -> Int -> Int
forall a. Num a => a -> a -> a
- Int
1

-- | Total count of all jamo characters.
--
-- @jamoNCount = jamoVCount * jamoTCount@
--
-- @since 0.1.0
jamoNCount :: Int
jamoNCount :: Int
jamoNCount = Int
588

-- Hangul
hangulFirst, hangulLast :: Int

-- | Codepoint of the first pre-composed Hangul character.
--
-- @since 0.1.0
hangulFirst :: Int
hangulFirst = Int
0xac00

-- | Codepoint of the last Hangul character.
--
-- @since 0.1.0
hangulLast :: Int
hangulLast = Int
hangulFirst Int -> Int -> Int
forall a. Num a => a -> a -> a
+ Int
jamoLCount Int -> Int -> Int
forall a. Num a => a -> a -> a
* Int
jamoVCount Int -> Int -> Int
forall a. Num a => a -> a -> a
* Int
jamoTCount Int -> Int -> Int
forall a. Num a => a -> a -> a
- Int
1

-- | Determine if the given character is a precomposed Hangul syllable.
--
-- @since 0.1.0
isHangul :: Char -> Bool
isHangul :: Char -> Bool
isHangul Char
c = Int
n Int -> Int -> Bool
forall a. Ord a => a -> a -> Bool
>= Int
hangulFirst Bool -> Bool -> Bool
&& Int
n Int -> Int -> Bool
forall a. Ord a => a -> a -> Bool
<= Int
hangulLast
    where n :: Int
n = Char -> Int
ord Char
c

-- | Determine if the given character is a Hangul LV syllable.
--
-- __Note:__ this function requires a precomposed Hangul syllable but does /not/
-- check it. Use 'isHangul' to check the input character before passing it to
-- 'isHangulLV'.
--
-- @since 0.1.0
isHangulLV :: Char -> Bool
isHangulLV :: Char -> Bool
isHangulLV Char
c = Bool -> ((Int, Int) -> Int) -> (Int, Int) -> Int
forall a. (?callStack::CallStack) => Bool -> a -> a
assert (Int
jamoTCount Int -> Int -> Bool
forall a. Eq a => a -> a -> Bool
== Int
28)
    (Int, Int) -> Int
forall a b. (a, b) -> b
snd (Int -> (Int, Int)
quotRem28 (Char -> Int
ord Char
c Int -> Int -> Int
forall a. Num a => a -> a -> a
- Int
hangulFirst)) Int -> Int -> Bool
forall a. Eq a => a -> a -> Bool
== Int
0

-- | Determine whether a character is a jamo L, V or T character.
--
-- @since 0.1.0
isJamo :: Char -> Bool
isJamo :: Char -> Bool
isJamo Char
c = Int
n Int -> Int -> Bool
forall a. Ord a => a -> a -> Bool
>= Int
jamoLFirst Bool -> Bool -> Bool
&& Int
n Int -> Int -> Bool
forall a. Ord a => a -> a -> Bool
<= Int
jamoTLast
    where n :: Int
n = Char -> Int
ord Char
c

-- | Given a Unicode character, if it is a leading jamo, return its index in
-- the list of leading jamo consonants, otherwise return 'Nothing'.
--
-- @since 0.1.0
jamoLIndex :: Char -> Maybe Int
jamoLIndex :: Char -> Maybe Int
jamoLIndex Char
c
  | Int
index Int -> Int -> Bool
forall a. Ord a => a -> a -> Bool
>= Int
0 Bool -> Bool -> Bool
&& Int
index Int -> Int -> Bool
forall a. Ord a => a -> a -> Bool
< Int
jamoLCount = Int -> Maybe Int
forall a. a -> Maybe a
Just Int
index
  | Bool
otherwise = Maybe Int
forall a. Maybe a
Nothing
    where index :: Int
index = Char -> Int
ord Char
c Int -> Int -> Int
forall a. Num a => a -> a -> a
- Int
jamoLFirst

-- | Given a Unicode character, if it is a vowel jamo, return its index in the
-- list of vowel jamo, otherwise return 'Nothing'.
--
-- @since 0.1.0
jamoVIndex :: Char -> Maybe Int
jamoVIndex :: Char -> Maybe Int
jamoVIndex Char
c
  | Int
index Int -> Int -> Bool
forall a. Ord a => a -> a -> Bool
>= Int
0 Bool -> Bool -> Bool
&& Int
index Int -> Int -> Bool
forall a. Ord a => a -> a -> Bool
< Int
jamoVCount = Int -> Maybe Int
forall a. a -> Maybe a
Just Int
index
  | Bool
otherwise = Maybe Int
forall a. Maybe a
Nothing
    where index :: Int
index = Char -> Int
ord Char
c Int -> Int -> Int
forall a. Num a => a -> a -> a
- Int
jamoVFirst

-- | Given a Unicode character, if it is a trailing jamo consonant, return its
-- index in the list of trailing jamo consonants, otherwise return 'Nothing'.
--
-- Note that index 0 is not a valid index for a trailing consonant. Index 0
-- corresponds to an LV syllable, without a T.  See "Hangul Syllable
-- Decomposition" in the Conformance chapter of the Unicode standard for more
-- details.
--
-- @since 0.1.0
jamoTIndex :: Char -> Maybe Int
jamoTIndex :: Char -> Maybe Int
jamoTIndex Char
c
  | Int
index Int -> Int -> Bool
forall a. Ord a => a -> a -> Bool
> Int
0 Bool -> Bool -> Bool
&& Int
index Int -> Int -> Bool
forall a. Ord a => a -> a -> Bool
< Int
jamoTCount = Int -> Maybe Int
forall a. a -> Maybe a
Just Int
index
  | Bool
otherwise = Maybe Int
forall a. Maybe a
Nothing
    where index :: Int
index = Char -> Int
ord Char
c Int -> Int -> Int
forall a. Num a => a -> a -> a
- Int
jamoTFirst