{-|
Module      : Gargantext.Core.Text.Ngrams
Description : Ngrams definition and tools
Copyright   : (c) CNRS, 2017 - present
License     : AGPL + CECILL v3
Maintainer  : team@gargantext.org
Stability   : experimental
Portability : POSIX

An @n-gram@ is a contiguous sequence of n items from a given sample of
text. In Gargantext application the items are words, n is a non negative
integer.

Using Latin numerical prefixes, an n-gram of size 1 is referred to as a
"unigram"; size 2 is a "bigram" (or, less commonly, a "digram"); size
3 is a "trigram". English cardinal numbers are sometimes used, e.g.,
"four-gram", "five-gram", and so on.

Source: https://en.wikipedia.org/wiki/Ngrams

TODO
group Ngrams -> Tree
compute occ by node of Tree
group occs according groups

compute cooccurrences
compute graph

-}

{-# LANGUAGE TemplateHaskell   #-}
{-# LANGUAGE ConstrainedClassMethods #-}

module Gargantext.Core.Text.Terms
  where

import Control.Lens
import Data.HashMap.Strict (HashMap)
import Data.Hashable (Hashable)
import Data.Map (Map)
import Data.Text (Text)
import Data.Traversable
import GHC.Base (String)
import GHC.Generics (Generic)
import qualified Data.List           as List
import qualified Data.Set            as Set
import qualified Data.Text           as Text
import qualified Data.HashMap.Strict as HashMap
import Gargantext.Core
import Gargantext.Core.Text (sentences, HasText(..))
import Gargantext.Core.Text.Terms.Eleve (mainEleveWith, Tries, Token, buildTries, toToken)
import Gargantext.Core.Text.Terms.Mono  (monoTerms)
import Gargantext.Core.Text.Terms.Mono.Stem (stem)
import Gargantext.Core.Text.Terms.Mono.Token.En (tokenize)
import Gargantext.Core.Text.Terms.Multi (multiterms)
import Gargantext.Core.Types
import Gargantext.Database.Prelude (Cmd)
import Gargantext.Database.Query.Table.Ngrams (insertNgrams)
import Gargantext.Database.Query.Table.NgramsPostag (NgramsPostag(..), insertNgramsPostag, np_form, np_lem)
import Gargantext.Database.Schema.Ngrams (Ngrams(..), NgramsType(..), ngramsTerms, text2ngrams, NgramsId)
import Gargantext.Prelude

data TermType lang
  = Mono      { TermType lang -> lang
_tt_lang :: !lang }
  | Multi     { _tt_lang :: !lang }
  | MonoMulti { _tt_lang :: !lang }
  | Unsupervised { _tt_lang       :: !lang
                 , TermType lang -> Int
_tt_windowSize :: !Int
                 , TermType lang -> Int
_tt_ngramsSize :: !Int
                 , TermType lang -> Maybe (Tries Token ())
_tt_model      :: !(Maybe (Tries Token ()))
                 }
  deriving ((forall x. TermType lang -> Rep (TermType lang) x)
-> (forall x. Rep (TermType lang) x -> TermType lang)
-> Generic (TermType lang)
forall x. Rep (TermType lang) x -> TermType lang
forall x. TermType lang -> Rep (TermType lang) x
forall a.
(forall x. a -> Rep a x) -> (forall x. Rep a x -> a) -> Generic a
forall lang x. Rep (TermType lang) x -> TermType lang
forall lang x. TermType lang -> Rep (TermType lang) x
$cto :: forall lang x. Rep (TermType lang) x -> TermType lang
$cfrom :: forall lang x. TermType lang -> Rep (TermType lang) x
Generic)

makeLenses ''TermType
--group :: [Text] -> [Text]
--group = undefined

-- remove Stop Words
-- map (filter (\t -> not . elem t)) $ 
------------------------------------------------------------------------
-- | Sugar to extract terms from text (hiddeng mapM from end user).
--extractTerms :: Traversable t => TermType Lang -> t Text -> IO (t [Terms])
extractTerms :: TermType Lang -> [Text] -> IO [[Terms]]

extractTerms :: TermType Lang -> [Text] -> IO [[Terms]]
extractTerms (Unsupervised {Int
Maybe (Tries Token ())
Lang
_tt_model :: Maybe (Tries Token ())
_tt_ngramsSize :: Int
_tt_windowSize :: Int
_tt_lang :: Lang
_tt_model :: forall lang. TermType lang -> Maybe (Tries Token ())
_tt_ngramsSize :: forall lang. TermType lang -> Int
_tt_windowSize :: forall lang. TermType lang -> Int
_tt_lang :: forall lang. TermType lang -> lang
..}) [Text]
xs = (Text -> IO [Terms]) -> [Text] -> IO [[Terms]]
forall (t :: * -> *) (m :: * -> *) a b.
(Traversable t, Monad m) =>
(a -> m b) -> t a -> m (t b)
mapM (TermType Lang -> Text -> IO [Terms]
terms (Unsupervised :: forall lang.
lang -> Int -> Int -> Maybe (Tries Token ()) -> TermType lang
Unsupervised { _tt_model :: Maybe (Tries Token ())
_tt_model = Tries Token () -> Maybe (Tries Token ())
forall a. a -> Maybe a
Just Tries Token ()
m', Int
Lang
_tt_ngramsSize :: Int
_tt_windowSize :: Int
_tt_lang :: Lang
_tt_ngramsSize :: Int
_tt_windowSize :: Int
_tt_lang :: Lang
.. })) [Text]
xs
  where
    m' :: Tries Token ()
m' = case Maybe (Tries Token ())
_tt_model of
      Just Tries Token ()
m''-> Tries Token ()
m''
      Maybe (Tries Token ())
Nothing -> Int -> Text -> Tries Token ()
newTries Int
_tt_windowSize (Text -> [Text] -> Text
Text.intercalate Text
" " [Text]
xs)

extractTerms TermType Lang
termTypeLang [Text]
xs = (Text -> IO [Terms]) -> [Text] -> IO [[Terms]]
forall (t :: * -> *) (m :: * -> *) a b.
(Traversable t, Monad m) =>
(a -> m b) -> t a -> m (t b)
mapM (TermType Lang -> Text -> IO [Terms]
terms TermType Lang
termTypeLang) [Text]
xs


------------------------------------------------------------------------
withLang :: (Foldable t, Functor t, HasText h)
         => TermType Lang
         -> t h
         -> TermType Lang
withLang :: TermType Lang -> t h -> TermType Lang
withLang (Unsupervised {Int
Maybe (Tries Token ())
Lang
_tt_model :: Maybe (Tries Token ())
_tt_ngramsSize :: Int
_tt_windowSize :: Int
_tt_lang :: Lang
_tt_model :: forall lang. TermType lang -> Maybe (Tries Token ())
_tt_ngramsSize :: forall lang. TermType lang -> Int
_tt_windowSize :: forall lang. TermType lang -> Int
_tt_lang :: forall lang. TermType lang -> lang
..}) t h
ns = Unsupervised :: forall lang.
lang -> Int -> Int -> Maybe (Tries Token ()) -> TermType lang
Unsupervised { _tt_model :: Maybe (Tries Token ())
_tt_model = Maybe (Tries Token ())
m', Int
Lang
_tt_ngramsSize :: Int
_tt_windowSize :: Int
_tt_lang :: Lang
_tt_ngramsSize :: Int
_tt_windowSize :: Int
_tt_lang :: Lang
.. }
  where
    m' :: Maybe (Tries Token ())
m' = case Maybe (Tries Token ())
_tt_model of
      Maybe (Tries Token ())
Nothing -> -- trace ("buildTries here" :: String)
               Tries Token () -> Maybe (Tries Token ())
forall a. a -> Maybe a
Just (Tries Token () -> Maybe (Tries Token ()))
-> Tries Token () -> Maybe (Tries Token ())
forall a b. (a -> b) -> a -> b
$ Int -> [[Token]] -> Tries Token ()
buildTries Int
_tt_ngramsSize
                    ([[Token]] -> Tries Token ()) -> [[Token]] -> Tries Token ()
forall a b. (a -> b) -> a -> b
$ ([Text] -> [Token]) -> [[Text]] -> [[Token]]
forall (f :: * -> *) a b. Functor f => (a -> b) -> f a -> f b
fmap [Text] -> [Token]
toToken
                    ([[Text]] -> [[Token]]) -> [[Text]] -> [[Token]]
forall a b. (a -> b) -> a -> b
$ Text -> [[Text]]
uniText
                    (Text -> [[Text]]) -> Text -> [[Text]]
forall a b. (a -> b) -> a -> b
$ Text -> [Text] -> Text
Text.intercalate Text
" . "
                    ([Text] -> Text) -> [Text] -> Text
forall a b. (a -> b) -> a -> b
$ t [Text] -> [Text]
forall (t :: * -> *) a. Foldable t => t [a] -> [a]
List.concat
                    (t [Text] -> [Text]) -> t [Text] -> [Text]
forall a b. (a -> b) -> a -> b
$ (h -> [Text]) -> t h -> t [Text]
forall (f :: * -> *) a b. Functor f => (a -> b) -> f a -> f b
map h -> [Text]
forall h. HasText h => h -> [Text]
hasText t h
ns
      Maybe (Tries Token ())
just_m -> Maybe (Tries Token ())
just_m
withLang TermType Lang
l t h
_ = TermType Lang
l

------------------------------------------------------------------------
data ExtractedNgrams = SimpleNgrams   { ExtractedNgrams -> Ngrams
unSimpleNgrams   :: Ngrams       }
                     | EnrichedNgrams { ExtractedNgrams -> NgramsPostag
unEnrichedNgrams :: NgramsPostag }
  deriving (ExtractedNgrams -> ExtractedNgrams -> Bool
(ExtractedNgrams -> ExtractedNgrams -> Bool)
-> (ExtractedNgrams -> ExtractedNgrams -> Bool)
-> Eq ExtractedNgrams
forall a. (a -> a -> Bool) -> (a -> a -> Bool) -> Eq a
/= :: ExtractedNgrams -> ExtractedNgrams -> Bool
$c/= :: ExtractedNgrams -> ExtractedNgrams -> Bool
== :: ExtractedNgrams -> ExtractedNgrams -> Bool
$c== :: ExtractedNgrams -> ExtractedNgrams -> Bool
Eq, Eq ExtractedNgrams
Eq ExtractedNgrams
-> (ExtractedNgrams -> ExtractedNgrams -> Ordering)
-> (ExtractedNgrams -> ExtractedNgrams -> Bool)
-> (ExtractedNgrams -> ExtractedNgrams -> Bool)
-> (ExtractedNgrams -> ExtractedNgrams -> Bool)
-> (ExtractedNgrams -> ExtractedNgrams -> Bool)
-> (ExtractedNgrams -> ExtractedNgrams -> ExtractedNgrams)
-> (ExtractedNgrams -> ExtractedNgrams -> ExtractedNgrams)
-> Ord ExtractedNgrams
ExtractedNgrams -> ExtractedNgrams -> Bool
ExtractedNgrams -> ExtractedNgrams -> Ordering
ExtractedNgrams -> ExtractedNgrams -> ExtractedNgrams
forall a.
Eq a
-> (a -> a -> Ordering)
-> (a -> a -> Bool)
-> (a -> a -> Bool)
-> (a -> a -> Bool)
-> (a -> a -> Bool)
-> (a -> a -> a)
-> (a -> a -> a)
-> Ord a
min :: ExtractedNgrams -> ExtractedNgrams -> ExtractedNgrams
$cmin :: ExtractedNgrams -> ExtractedNgrams -> ExtractedNgrams
max :: ExtractedNgrams -> ExtractedNgrams -> ExtractedNgrams
$cmax :: ExtractedNgrams -> ExtractedNgrams -> ExtractedNgrams
>= :: ExtractedNgrams -> ExtractedNgrams -> Bool
$c>= :: ExtractedNgrams -> ExtractedNgrams -> Bool
> :: ExtractedNgrams -> ExtractedNgrams -> Bool
$c> :: ExtractedNgrams -> ExtractedNgrams -> Bool
<= :: ExtractedNgrams -> ExtractedNgrams -> Bool
$c<= :: ExtractedNgrams -> ExtractedNgrams -> Bool
< :: ExtractedNgrams -> ExtractedNgrams -> Bool
$c< :: ExtractedNgrams -> ExtractedNgrams -> Bool
compare :: ExtractedNgrams -> ExtractedNgrams -> Ordering
$ccompare :: ExtractedNgrams -> ExtractedNgrams -> Ordering
$cp1Ord :: Eq ExtractedNgrams
Ord, (forall x. ExtractedNgrams -> Rep ExtractedNgrams x)
-> (forall x. Rep ExtractedNgrams x -> ExtractedNgrams)
-> Generic ExtractedNgrams
forall x. Rep ExtractedNgrams x -> ExtractedNgrams
forall x. ExtractedNgrams -> Rep ExtractedNgrams x
forall a.
(forall x. a -> Rep a x) -> (forall x. Rep a x -> a) -> Generic a
$cto :: forall x. Rep ExtractedNgrams x -> ExtractedNgrams
$cfrom :: forall x. ExtractedNgrams -> Rep ExtractedNgrams x
Generic, Int -> ExtractedNgrams -> ShowS
[ExtractedNgrams] -> ShowS
ExtractedNgrams -> String
(Int -> ExtractedNgrams -> ShowS)
-> (ExtractedNgrams -> String)
-> ([ExtractedNgrams] -> ShowS)
-> Show ExtractedNgrams
forall a.
(Int -> a -> ShowS) -> (a -> String) -> ([a] -> ShowS) -> Show a
showList :: [ExtractedNgrams] -> ShowS
$cshowList :: [ExtractedNgrams] -> ShowS
show :: ExtractedNgrams -> String
$cshow :: ExtractedNgrams -> String
showsPrec :: Int -> ExtractedNgrams -> ShowS
$cshowsPrec :: Int -> ExtractedNgrams -> ShowS
Show)

instance Hashable ExtractedNgrams

class ExtractNgramsT h
  where
    extractNgramsT :: HasText h
                   => TermType Lang
                   -> h
                   -> Cmd err (HashMap ExtractedNgrams (Map NgramsType Int))
------------------------------------------------------------------------
enrichedTerms :: Lang -> PosTagAlgo -> POS -> Terms -> NgramsPostag
enrichedTerms :: Lang -> PosTagAlgo -> POS -> Terms -> NgramsPostag
enrichedTerms Lang
l PosTagAlgo
pa POS
po (Terms [Text]
ng1 Stems
ng2) =
  Lang -> PosTagAlgo -> POS -> Ngrams -> Ngrams -> NgramsPostag
NgramsPostag Lang
l PosTagAlgo
pa POS
po Ngrams
form Ngrams
lem
    where
      form :: Ngrams
form = Text -> Ngrams
text2ngrams (Text -> Ngrams) -> Text -> Ngrams
forall a b. (a -> b) -> a -> b
$ Text -> [Text] -> Text
Text.intercalate Text
" " [Text]
ng1
      lem :: Ngrams
lem  = Text -> Ngrams
text2ngrams (Text -> Ngrams) -> Text -> Ngrams
forall a b. (a -> b) -> a -> b
$ Text -> [Text] -> Text
Text.intercalate Text
" " ([Text] -> Text) -> [Text] -> Text
forall a b. (a -> b) -> a -> b
$ Stems -> [Text]
forall a. Set a -> [a]
Set.toList Stems
ng2

------------------------------------------------------------------------
cleanNgrams :: Int -> Ngrams -> Ngrams
cleanNgrams :: Int -> Ngrams -> Ngrams
cleanNgrams Int
s Ngrams
ng 
      | Text -> Int
Text.length (Ngrams
ng Ngrams -> Getting Text Ngrams Text -> Text
forall s a. s -> Getting a s a -> a
^. Getting Text Ngrams Text
Lens' Ngrams Text
ngramsTerms) Int -> Int -> Bool
forall a. Ord a => a -> a -> Bool
< Int
s = Ngrams
ng
      | Bool
otherwise                           = Text -> Ngrams
text2ngrams (Int -> Text -> Text
Text.take Int
s (Ngrams
ng Ngrams -> Getting Text Ngrams Text -> Text
forall s a. s -> Getting a s a -> a
^. Getting Text Ngrams Text
Lens' Ngrams Text
ngramsTerms))

cleanExtractedNgrams :: Int -> ExtractedNgrams -> ExtractedNgrams
cleanExtractedNgrams :: Int -> ExtractedNgrams -> ExtractedNgrams
cleanExtractedNgrams Int
s (SimpleNgrams   Ngrams
ng) = Ngrams -> ExtractedNgrams
SimpleNgrams (Ngrams -> ExtractedNgrams) -> Ngrams -> ExtractedNgrams
forall a b. (a -> b) -> a -> b
$ (Int -> Ngrams -> Ngrams
cleanNgrams Int
s) Ngrams
ng
cleanExtractedNgrams Int
s (EnrichedNgrams NgramsPostag
ng) = NgramsPostag -> ExtractedNgrams
EnrichedNgrams (NgramsPostag -> ExtractedNgrams)
-> NgramsPostag -> ExtractedNgrams
forall a b. (a -> b) -> a -> b
$ ASetter NgramsPostag NgramsPostag Ngrams Ngrams
-> (Ngrams -> Ngrams) -> NgramsPostag -> NgramsPostag
forall s t a b. ASetter s t a b -> (a -> b) -> s -> t
over ASetter NgramsPostag NgramsPostag Ngrams Ngrams
Lens' NgramsPostag Ngrams
np_form (Int -> Ngrams -> Ngrams
cleanNgrams Int
s)
                                                            (NgramsPostag -> NgramsPostag) -> NgramsPostag -> NgramsPostag
forall a b. (a -> b) -> a -> b
$ ASetter NgramsPostag NgramsPostag Ngrams Ngrams
-> (Ngrams -> Ngrams) -> NgramsPostag -> NgramsPostag
forall s t a b. ASetter s t a b -> (a -> b) -> s -> t
over ASetter NgramsPostag NgramsPostag Ngrams Ngrams
Lens' NgramsPostag Ngrams
np_lem  (Int -> Ngrams -> Ngrams
cleanNgrams Int
s) NgramsPostag
ng

extracted2ngrams :: ExtractedNgrams -> Ngrams
extracted2ngrams :: ExtractedNgrams -> Ngrams
extracted2ngrams (SimpleNgrams   Ngrams
ng) = Ngrams
ng
extracted2ngrams (EnrichedNgrams NgramsPostag
ng) = Getting Ngrams NgramsPostag Ngrams -> NgramsPostag -> Ngrams
forall s (m :: * -> *) a. MonadReader s m => Getting a s a -> m a
view Getting Ngrams NgramsPostag Ngrams
Lens' NgramsPostag Ngrams
np_form NgramsPostag
ng

---------------------------
insertExtractedNgrams :: [ ExtractedNgrams ] -> Cmd err (HashMap Text NgramsId)
insertExtractedNgrams :: [ExtractedNgrams] -> Cmd err (HashMap Text Int)
insertExtractedNgrams [ExtractedNgrams]
ngs = do
  let ([ExtractedNgrams]
s, [ExtractedNgrams]
e) = (ExtractedNgrams -> Bool)
-> [ExtractedNgrams] -> ([ExtractedNgrams], [ExtractedNgrams])
forall a. (a -> Bool) -> [a] -> ([a], [a])
List.partition ExtractedNgrams -> Bool
isSimpleNgrams [ExtractedNgrams]
ngs
  HashMap Text Int
m1 <- [Ngrams] -> Cmd err (HashMap Text Int)
forall err. [Ngrams] -> Cmd err (HashMap Text Int)
insertNgrams       ((ExtractedNgrams -> Ngrams) -> [ExtractedNgrams] -> [Ngrams]
forall (f :: * -> *) a b. Functor f => (a -> b) -> f a -> f b
map ExtractedNgrams -> Ngrams
unSimpleNgrams   [ExtractedNgrams]
s)
  --printDebug "others" m1
  
  HashMap Text Int
m2 <- [NgramsPostag] -> Cmd err (HashMap Text Int)
forall err. [NgramsPostag] -> Cmd err (HashMap Text Int)
insertNgramsPostag ((ExtractedNgrams -> NgramsPostag)
-> [ExtractedNgrams] -> [NgramsPostag]
forall (f :: * -> *) a b. Functor f => (a -> b) -> f a -> f b
map ExtractedNgrams -> NgramsPostag
unEnrichedNgrams [ExtractedNgrams]
e)
  --printDebug "terms" m2
 
  let result :: HashMap Text Int
result = HashMap Text Int -> HashMap Text Int -> HashMap Text Int
forall k v.
(Eq k, Hashable k) =>
HashMap k v -> HashMap k v -> HashMap k v
HashMap.union HashMap Text Int
m1 HashMap Text Int
m2
  HashMap Text Int -> m (HashMap Text Int)
forall (f :: * -> *) a. Applicative f => a -> f a
pure HashMap Text Int
result

isSimpleNgrams :: ExtractedNgrams -> Bool
isSimpleNgrams :: ExtractedNgrams -> Bool
isSimpleNgrams (SimpleNgrams Ngrams
_) = Bool
True
isSimpleNgrams ExtractedNgrams
_                = Bool
False

------------------------------------------------------------------------
-- | Terms from Text
-- Mono : mono terms
-- Multi : multi terms
-- MonoMulti : mono and multi
-- TODO : multi terms should exclude mono (intersection is not empty yet)
terms :: TermType Lang -> Text -> IO [Terms]
terms :: TermType Lang -> Text -> IO [Terms]
terms (Mono      Lang
lang) Text
txt = [Terms] -> IO [Terms]
forall (f :: * -> *) a. Applicative f => a -> f a
pure ([Terms] -> IO [Terms]) -> [Terms] -> IO [Terms]
forall a b. (a -> b) -> a -> b
$ Lang -> Text -> [Terms]
monoTerms Lang
lang Text
txt
terms (Multi     Lang
lang) Text
txt = Lang -> Text -> IO [Terms]
multiterms Lang
lang Text
txt
terms (MonoMulti Lang
lang) Text
txt = TermType Lang -> Text -> IO [Terms]
terms (Lang -> TermType Lang
forall lang. lang -> TermType lang
Multi Lang
lang) Text
txt
terms (Unsupervised { Int
Maybe (Tries Token ())
Lang
_tt_model :: Maybe (Tries Token ())
_tt_ngramsSize :: Int
_tt_windowSize :: Int
_tt_lang :: Lang
_tt_model :: forall lang. TermType lang -> Maybe (Tries Token ())
_tt_ngramsSize :: forall lang. TermType lang -> Int
_tt_windowSize :: forall lang. TermType lang -> Int
_tt_lang :: forall lang. TermType lang -> lang
.. }) Text
txt = TermType Lang -> Text -> IO [Terms]
termsUnsupervised (Unsupervised :: forall lang.
lang -> Int -> Int -> Maybe (Tries Token ()) -> TermType lang
Unsupervised { _tt_model :: Maybe (Tries Token ())
_tt_model = Tries Token () -> Maybe (Tries Token ())
forall a. a -> Maybe a
Just Tries Token ()
m', Int
Lang
_tt_ngramsSize :: Int
_tt_windowSize :: Int
_tt_lang :: Lang
_tt_ngramsSize :: Int
_tt_windowSize :: Int
_tt_lang :: Lang
.. }) Text
txt
  where
    m' :: Tries Token ()
m' = Tries Token ()
-> (Tries Token () -> Tries Token ())
-> Maybe (Tries Token ())
-> Tries Token ()
forall b a. b -> (a -> b) -> Maybe a -> b
maybe (Int -> Text -> Tries Token ()
newTries Int
_tt_ngramsSize Text
txt) Tries Token () -> Tries Token ()
forall a. a -> a
identity Maybe (Tries Token ())
_tt_model
-- terms (WithList  list) txt = pure . concat $ extractTermsWithList list txt


------------------------------------------------------------------------
-- | Unsupervised ngrams extraction
-- language agnostic extraction
-- TODO: remove IO
-- TODO: newtype BlockText

type WindowSize = Int
type MinNgramSize = Int

termsUnsupervised :: TermType Lang -> Text -> IO [Terms]
termsUnsupervised :: TermType Lang -> Text -> IO [Terms]
termsUnsupervised (Unsupervised Lang
l Int
n Int
s Maybe (Tries Token ())
m) =
               [Terms] -> IO [Terms]
forall (f :: * -> *) a. Applicative f => a -> f a
pure
             ([Terms] -> IO [Terms]) -> (Text -> [Terms]) -> Text -> IO [Terms]
forall b c a. (b -> c) -> (a -> b) -> a -> c
. ([Text] -> Terms) -> [[Text]] -> [Terms]
forall (f :: * -> *) a b. Functor f => (a -> b) -> f a -> f b
map (Lang -> [Text] -> Terms
text2term Lang
l)
             ([[Text]] -> [Terms]) -> (Text -> [[Text]]) -> Text -> [Terms]
forall b c a. (b -> c) -> (a -> b) -> a -> c
. [[Text]] -> [[Text]]
forall a. Eq a => [a] -> [a]
List.nub
             ([[Text]] -> [[Text]]) -> (Text -> [[Text]]) -> Text -> [[Text]]
forall b c a. (b -> c) -> (a -> b) -> a -> c
. (([Text] -> Bool) -> [[Text]] -> [[Text]]
forall a. (a -> Bool) -> [a] -> [a]
List.filter (\[Text]
l' -> [Text] -> Int
forall (t :: * -> *) a. Foldable t => t a -> Int
List.length [Text]
l' Int -> Int -> Bool
forall a. Ord a => a -> a -> Bool
>= Int
s))
             ([[Text]] -> [[Text]]) -> (Text -> [[Text]]) -> Text -> [[Text]]
forall b c a. (b -> c) -> (a -> b) -> a -> c
. [[[Text]]] -> [[Text]]
forall (t :: * -> *) a. Foldable t => t [a] -> [a]
List.concat
             ([[[Text]]] -> [[Text]])
-> (Text -> [[[Text]]]) -> Text -> [[Text]]
forall b c a. (b -> c) -> (a -> b) -> a -> c
. Tries Token () -> Int -> [[Text]] -> [[[Text]]]
mainEleveWith (Tries Token ()
-> (Tries Token () -> Tries Token ())
-> Maybe (Tries Token ())
-> Tries Token ()
forall b a. b -> (a -> b) -> Maybe a -> b
maybe (Text -> Tries Token ()
forall a. HasCallStack => Text -> a
panic Text
"no model") Tries Token () -> Tries Token ()
forall a. a -> a
identity Maybe (Tries Token ())
m) Int
n
             ([[Text]] -> [[[Text]]])
-> (Text -> [[Text]]) -> Text -> [[[Text]]]
forall b c a. (b -> c) -> (a -> b) -> a -> c
. Text -> [[Text]]
uniText
termsUnsupervised TermType Lang
_ = Text -> IO [Terms]
forall a. HasCallStack => a
undefined



newTries :: Int -> Text -> Tries Token ()
newTries :: Int -> Text -> Tries Token ()
newTries Int
n Text
t = Int -> [[Token]] -> Tries Token ()
buildTries Int
n (([Text] -> [Token]) -> [[Text]] -> [[Token]]
forall (f :: * -> *) a b. Functor f => (a -> b) -> f a -> f b
fmap [Text] -> [Token]
toToken ([[Text]] -> [[Token]]) -> [[Text]] -> [[Token]]
forall a b. (a -> b) -> a -> b
$ Text -> [[Text]]
uniText Text
t)

-- | TODO removing long terms > 24
uniText :: Text -> [[Text]]
uniText :: Text -> [[Text]]
uniText = ([Text] -> [Text]) -> [[Text]] -> [[Text]]
forall (f :: * -> *) a b. Functor f => (a -> b) -> f a -> f b
map ((Text -> Bool) -> [Text] -> [Text]
forall a. (a -> Bool) -> [a] -> [a]
List.filter (Bool -> Bool
not (Bool -> Bool) -> (Text -> Bool) -> Text -> Bool
forall b c a. (b -> c) -> (a -> b) -> a -> c
. Text -> Bool
isPunctuation))
        ([[Text]] -> [[Text]]) -> (Text -> [[Text]]) -> Text -> [[Text]]
forall b c a. (b -> c) -> (a -> b) -> a -> c
. (Text -> [Text]) -> [Text] -> [[Text]]
forall (f :: * -> *) a b. Functor f => (a -> b) -> f a -> f b
map Text -> [Text]
tokenize
        ([Text] -> [[Text]]) -> (Text -> [Text]) -> Text -> [[Text]]
forall b c a. (b -> c) -> (a -> b) -> a -> c
. Text -> [Text]
sentences       -- TODO get sentences according to lang
        (Text -> [Text]) -> (Text -> Text) -> Text -> [Text]
forall b c a. (b -> c) -> (a -> b) -> a -> c
. Text -> Text
Text.toLower

text2term :: Lang -> [Text] -> Terms
text2term :: Lang -> [Text] -> Terms
text2term Lang
_ [] = [Text] -> Stems -> Terms
Terms [] Stems
forall a. Set a
Set.empty
text2term Lang
lang [Text]
txt = [Text] -> Stems -> Terms
Terms [Text]
txt ([Text] -> Stems
forall a. Ord a => [a] -> Set a
Set.fromList ([Text] -> Stems) -> [Text] -> Stems
forall a b. (a -> b) -> a -> b
$ (Text -> Text) -> [Text] -> [Text]
forall (f :: * -> *) a b. Functor f => (a -> b) -> f a -> f b
map (Lang -> Text -> Text
stem Lang
lang) [Text]
txt)

isPunctuation :: Text -> Bool
isPunctuation :: Text -> Bool
isPunctuation Text
x = Text -> [Text] -> Bool
forall (t :: * -> *) a. (Foldable t, Eq a) => a -> t a -> Bool
List.elem Text
x ([Text] -> Bool) -> [Text] -> Bool
forall a b. (a -> b) -> a -> b
$  (String -> Text
Text.pack (String -> Text) -> (Char -> String) -> Char -> Text
forall b c a. (b -> c) -> (a -> b) -> a -> c
. Char -> String
forall (f :: * -> *) a. Applicative f => a -> f a
pure)
                             (Char -> Text) -> String -> [Text]
forall (f :: * -> *) a b. Functor f => (a -> b) -> f a -> f b
<$> (String
"!?(),;.:" :: String)