{-# LANGUAGE TemplateHaskell #-}
{-# LANGUAGE ConstrainedClassMethods #-}
module Gargantext.Core.Text.Terms
where
import Control.Lens
import Data.HashMap.Strict (HashMap)
import Data.Hashable (Hashable)
import Data.Map (Map)
import Data.Text (Text)
import Data.Traversable
import GHC.Base (String)
import GHC.Generics (Generic)
import qualified Data.List as List
import qualified Data.Set as Set
import qualified Data.Text as Text
import qualified Data.HashMap.Strict as HashMap
import Gargantext.Core
import Gargantext.Core.Text (sentences, HasText(..))
import Gargantext.Core.Text.Terms.Eleve (mainEleveWith, Tries, Token, buildTries, toToken)
import Gargantext.Core.Text.Terms.Mono (monoTerms)
import Gargantext.Core.Text.Terms.Mono.Stem (stem)
import Gargantext.Core.Text.Terms.Mono.Token.En (tokenize)
import Gargantext.Core.Text.Terms.Multi (multiterms)
import Gargantext.Core.Types
import Gargantext.Database.Prelude (Cmd)
import Gargantext.Database.Query.Table.Ngrams (insertNgrams)
import Gargantext.Database.Query.Table.NgramsPostag (NgramsPostag(..), insertNgramsPostag, np_form, np_lem)
import Gargantext.Database.Schema.Ngrams (Ngrams(..), NgramsType(..), ngramsTerms, text2ngrams, NgramsId)
import Gargantext.Prelude
data TermType lang
= Mono { TermType lang -> lang
_tt_lang :: !lang }
| Multi { _tt_lang :: !lang }
| MonoMulti { _tt_lang :: !lang }
| Unsupervised { _tt_lang :: !lang
, TermType lang -> Int
_tt_windowSize :: !Int
, TermType lang -> Int
_tt_ngramsSize :: !Int
, TermType lang -> Maybe (Tries Token ())
_tt_model :: !(Maybe (Tries Token ()))
}
deriving ((forall x. TermType lang -> Rep (TermType lang) x)
-> (forall x. Rep (TermType lang) x -> TermType lang)
-> Generic (TermType lang)
forall x. Rep (TermType lang) x -> TermType lang
forall x. TermType lang -> Rep (TermType lang) x
forall a.
(forall x. a -> Rep a x) -> (forall x. Rep a x -> a) -> Generic a
forall lang x. Rep (TermType lang) x -> TermType lang
forall lang x. TermType lang -> Rep (TermType lang) x
$cto :: forall lang x. Rep (TermType lang) x -> TermType lang
$cfrom :: forall lang x. TermType lang -> Rep (TermType lang) x
Generic)
makeLenses ''TermType
extractTerms :: TermType Lang -> [Text] -> IO [[Terms]]
(Unsupervised {Int
Maybe (Tries Token ())
Lang
_tt_model :: Maybe (Tries Token ())
_tt_ngramsSize :: Int
_tt_windowSize :: Int
_tt_lang :: Lang
_tt_model :: forall lang. TermType lang -> Maybe (Tries Token ())
_tt_ngramsSize :: forall lang. TermType lang -> Int
_tt_windowSize :: forall lang. TermType lang -> Int
_tt_lang :: forall lang. TermType lang -> lang
..}) [Text]
xs = (Text -> IO [Terms]) -> [Text] -> IO [[Terms]]
forall (t :: * -> *) (m :: * -> *) a b.
(Traversable t, Monad m) =>
(a -> m b) -> t a -> m (t b)
mapM (TermType Lang -> Text -> IO [Terms]
terms (Unsupervised :: forall lang.
lang -> Int -> Int -> Maybe (Tries Token ()) -> TermType lang
Unsupervised { _tt_model :: Maybe (Tries Token ())
_tt_model = Tries Token () -> Maybe (Tries Token ())
forall a. a -> Maybe a
Just Tries Token ()
m', Int
Lang
_tt_ngramsSize :: Int
_tt_windowSize :: Int
_tt_lang :: Lang
_tt_ngramsSize :: Int
_tt_windowSize :: Int
_tt_lang :: Lang
.. })) [Text]
xs
where
m' :: Tries Token ()
m' = case Maybe (Tries Token ())
_tt_model of
Just Tries Token ()
m''-> Tries Token ()
m''
Maybe (Tries Token ())
Nothing -> Int -> Text -> Tries Token ()
newTries Int
_tt_windowSize (Text -> [Text] -> Text
Text.intercalate Text
" " [Text]
xs)
extractTerms TermType Lang
termTypeLang [Text]
xs = (Text -> IO [Terms]) -> [Text] -> IO [[Terms]]
forall (t :: * -> *) (m :: * -> *) a b.
(Traversable t, Monad m) =>
(a -> m b) -> t a -> m (t b)
mapM (TermType Lang -> Text -> IO [Terms]
terms TermType Lang
termTypeLang) [Text]
xs
withLang :: (Foldable t, Functor t, HasText h)
=> TermType Lang
-> t h
-> TermType Lang
withLang :: TermType Lang -> t h -> TermType Lang
withLang (Unsupervised {Int
Maybe (Tries Token ())
Lang
_tt_model :: Maybe (Tries Token ())
_tt_ngramsSize :: Int
_tt_windowSize :: Int
_tt_lang :: Lang
_tt_model :: forall lang. TermType lang -> Maybe (Tries Token ())
_tt_ngramsSize :: forall lang. TermType lang -> Int
_tt_windowSize :: forall lang. TermType lang -> Int
_tt_lang :: forall lang. TermType lang -> lang
..}) t h
ns = Unsupervised :: forall lang.
lang -> Int -> Int -> Maybe (Tries Token ()) -> TermType lang
Unsupervised { _tt_model :: Maybe (Tries Token ())
_tt_model = Maybe (Tries Token ())
m', Int
Lang
_tt_ngramsSize :: Int
_tt_windowSize :: Int
_tt_lang :: Lang
_tt_ngramsSize :: Int
_tt_windowSize :: Int
_tt_lang :: Lang
.. }
where
m' :: Maybe (Tries Token ())
m' = case Maybe (Tries Token ())
_tt_model of
Maybe (Tries Token ())
Nothing ->
Tries Token () -> Maybe (Tries Token ())
forall a. a -> Maybe a
Just (Tries Token () -> Maybe (Tries Token ()))
-> Tries Token () -> Maybe (Tries Token ())
forall a b. (a -> b) -> a -> b
$ Int -> [[Token]] -> Tries Token ()
buildTries Int
_tt_ngramsSize
([[Token]] -> Tries Token ()) -> [[Token]] -> Tries Token ()
forall a b. (a -> b) -> a -> b
$ ([Text] -> [Token]) -> [[Text]] -> [[Token]]
forall (f :: * -> *) a b. Functor f => (a -> b) -> f a -> f b
fmap [Text] -> [Token]
toToken
([[Text]] -> [[Token]]) -> [[Text]] -> [[Token]]
forall a b. (a -> b) -> a -> b
$ Text -> [[Text]]
uniText
(Text -> [[Text]]) -> Text -> [[Text]]
forall a b. (a -> b) -> a -> b
$ Text -> [Text] -> Text
Text.intercalate Text
" . "
([Text] -> Text) -> [Text] -> Text
forall a b. (a -> b) -> a -> b
$ t [Text] -> [Text]
forall (t :: * -> *) a. Foldable t => t [a] -> [a]
List.concat
(t [Text] -> [Text]) -> t [Text] -> [Text]
forall a b. (a -> b) -> a -> b
$ (h -> [Text]) -> t h -> t [Text]
forall (f :: * -> *) a b. Functor f => (a -> b) -> f a -> f b
map h -> [Text]
forall h. HasText h => h -> [Text]
hasText t h
ns
Maybe (Tries Token ())
just_m -> Maybe (Tries Token ())
just_m
withLang TermType Lang
l t h
_ = TermType Lang
l
data = SimpleNgrams { ExtractedNgrams -> Ngrams
unSimpleNgrams :: Ngrams }
| EnrichedNgrams { ExtractedNgrams -> NgramsPostag
unEnrichedNgrams :: NgramsPostag }
deriving (ExtractedNgrams -> ExtractedNgrams -> Bool
(ExtractedNgrams -> ExtractedNgrams -> Bool)
-> (ExtractedNgrams -> ExtractedNgrams -> Bool)
-> Eq ExtractedNgrams
forall a. (a -> a -> Bool) -> (a -> a -> Bool) -> Eq a
/= :: ExtractedNgrams -> ExtractedNgrams -> Bool
$c/= :: ExtractedNgrams -> ExtractedNgrams -> Bool
== :: ExtractedNgrams -> ExtractedNgrams -> Bool
$c== :: ExtractedNgrams -> ExtractedNgrams -> Bool
Eq, Eq ExtractedNgrams
Eq ExtractedNgrams
-> (ExtractedNgrams -> ExtractedNgrams -> Ordering)
-> (ExtractedNgrams -> ExtractedNgrams -> Bool)
-> (ExtractedNgrams -> ExtractedNgrams -> Bool)
-> (ExtractedNgrams -> ExtractedNgrams -> Bool)
-> (ExtractedNgrams -> ExtractedNgrams -> Bool)
-> (ExtractedNgrams -> ExtractedNgrams -> ExtractedNgrams)
-> (ExtractedNgrams -> ExtractedNgrams -> ExtractedNgrams)
-> Ord ExtractedNgrams
ExtractedNgrams -> ExtractedNgrams -> Bool
ExtractedNgrams -> ExtractedNgrams -> Ordering
ExtractedNgrams -> ExtractedNgrams -> ExtractedNgrams
forall a.
Eq a
-> (a -> a -> Ordering)
-> (a -> a -> Bool)
-> (a -> a -> Bool)
-> (a -> a -> Bool)
-> (a -> a -> Bool)
-> (a -> a -> a)
-> (a -> a -> a)
-> Ord a
min :: ExtractedNgrams -> ExtractedNgrams -> ExtractedNgrams
$cmin :: ExtractedNgrams -> ExtractedNgrams -> ExtractedNgrams
max :: ExtractedNgrams -> ExtractedNgrams -> ExtractedNgrams
$cmax :: ExtractedNgrams -> ExtractedNgrams -> ExtractedNgrams
>= :: ExtractedNgrams -> ExtractedNgrams -> Bool
$c>= :: ExtractedNgrams -> ExtractedNgrams -> Bool
> :: ExtractedNgrams -> ExtractedNgrams -> Bool
$c> :: ExtractedNgrams -> ExtractedNgrams -> Bool
<= :: ExtractedNgrams -> ExtractedNgrams -> Bool
$c<= :: ExtractedNgrams -> ExtractedNgrams -> Bool
< :: ExtractedNgrams -> ExtractedNgrams -> Bool
$c< :: ExtractedNgrams -> ExtractedNgrams -> Bool
compare :: ExtractedNgrams -> ExtractedNgrams -> Ordering
$ccompare :: ExtractedNgrams -> ExtractedNgrams -> Ordering
$cp1Ord :: Eq ExtractedNgrams
Ord, (forall x. ExtractedNgrams -> Rep ExtractedNgrams x)
-> (forall x. Rep ExtractedNgrams x -> ExtractedNgrams)
-> Generic ExtractedNgrams
forall x. Rep ExtractedNgrams x -> ExtractedNgrams
forall x. ExtractedNgrams -> Rep ExtractedNgrams x
forall a.
(forall x. a -> Rep a x) -> (forall x. Rep a x -> a) -> Generic a
$cto :: forall x. Rep ExtractedNgrams x -> ExtractedNgrams
$cfrom :: forall x. ExtractedNgrams -> Rep ExtractedNgrams x
Generic, Int -> ExtractedNgrams -> ShowS
[ExtractedNgrams] -> ShowS
ExtractedNgrams -> String
(Int -> ExtractedNgrams -> ShowS)
-> (ExtractedNgrams -> String)
-> ([ExtractedNgrams] -> ShowS)
-> Show ExtractedNgrams
forall a.
(Int -> a -> ShowS) -> (a -> String) -> ([a] -> ShowS) -> Show a
showList :: [ExtractedNgrams] -> ShowS
$cshowList :: [ExtractedNgrams] -> ShowS
show :: ExtractedNgrams -> String
$cshow :: ExtractedNgrams -> String
showsPrec :: Int -> ExtractedNgrams -> ShowS
$cshowsPrec :: Int -> ExtractedNgrams -> ShowS
Show)
instance Hashable ExtractedNgrams
class h
where
:: HasText h
=> TermType Lang
-> h
-> Cmd err (HashMap ExtractedNgrams (Map NgramsType Int))
enrichedTerms :: Lang -> PosTagAlgo -> POS -> Terms -> NgramsPostag
enrichedTerms :: Lang -> PosTagAlgo -> POS -> Terms -> NgramsPostag
enrichedTerms Lang
l PosTagAlgo
pa POS
po (Terms [Text]
ng1 Stems
ng2) =
Lang -> PosTagAlgo -> POS -> Ngrams -> Ngrams -> NgramsPostag
NgramsPostag Lang
l PosTagAlgo
pa POS
po Ngrams
form Ngrams
lem
where
form :: Ngrams
form = Text -> Ngrams
text2ngrams (Text -> Ngrams) -> Text -> Ngrams
forall a b. (a -> b) -> a -> b
$ Text -> [Text] -> Text
Text.intercalate Text
" " [Text]
ng1
lem :: Ngrams
lem = Text -> Ngrams
text2ngrams (Text -> Ngrams) -> Text -> Ngrams
forall a b. (a -> b) -> a -> b
$ Text -> [Text] -> Text
Text.intercalate Text
" " ([Text] -> Text) -> [Text] -> Text
forall a b. (a -> b) -> a -> b
$ Stems -> [Text]
forall a. Set a -> [a]
Set.toList Stems
ng2
cleanNgrams :: Int -> Ngrams -> Ngrams
cleanNgrams :: Int -> Ngrams -> Ngrams
cleanNgrams Int
s Ngrams
ng
| Text -> Int
Text.length (Ngrams
ng Ngrams -> Getting Text Ngrams Text -> Text
forall s a. s -> Getting a s a -> a
^. Getting Text Ngrams Text
Lens' Ngrams Text
ngramsTerms) Int -> Int -> Bool
forall a. Ord a => a -> a -> Bool
< Int
s = Ngrams
ng
| Bool
otherwise = Text -> Ngrams
text2ngrams (Int -> Text -> Text
Text.take Int
s (Ngrams
ng Ngrams -> Getting Text Ngrams Text -> Text
forall s a. s -> Getting a s a -> a
^. Getting Text Ngrams Text
Lens' Ngrams Text
ngramsTerms))
cleanExtractedNgrams :: Int -> ExtractedNgrams -> ExtractedNgrams
Int
s (SimpleNgrams Ngrams
ng) = Ngrams -> ExtractedNgrams
SimpleNgrams (Ngrams -> ExtractedNgrams) -> Ngrams -> ExtractedNgrams
forall a b. (a -> b) -> a -> b
$ (Int -> Ngrams -> Ngrams
cleanNgrams Int
s) Ngrams
ng
cleanExtractedNgrams Int
s (EnrichedNgrams NgramsPostag
ng) = NgramsPostag -> ExtractedNgrams
EnrichedNgrams (NgramsPostag -> ExtractedNgrams)
-> NgramsPostag -> ExtractedNgrams
forall a b. (a -> b) -> a -> b
$ ASetter NgramsPostag NgramsPostag Ngrams Ngrams
-> (Ngrams -> Ngrams) -> NgramsPostag -> NgramsPostag
forall s t a b. ASetter s t a b -> (a -> b) -> s -> t
over ASetter NgramsPostag NgramsPostag Ngrams Ngrams
Lens' NgramsPostag Ngrams
np_form (Int -> Ngrams -> Ngrams
cleanNgrams Int
s)
(NgramsPostag -> NgramsPostag) -> NgramsPostag -> NgramsPostag
forall a b. (a -> b) -> a -> b
$ ASetter NgramsPostag NgramsPostag Ngrams Ngrams
-> (Ngrams -> Ngrams) -> NgramsPostag -> NgramsPostag
forall s t a b. ASetter s t a b -> (a -> b) -> s -> t
over ASetter NgramsPostag NgramsPostag Ngrams Ngrams
Lens' NgramsPostag Ngrams
np_lem (Int -> Ngrams -> Ngrams
cleanNgrams Int
s) NgramsPostag
ng
extracted2ngrams :: ExtractedNgrams -> Ngrams
(SimpleNgrams Ngrams
ng) = Ngrams
ng
extracted2ngrams (EnrichedNgrams NgramsPostag
ng) = Getting Ngrams NgramsPostag Ngrams -> NgramsPostag -> Ngrams
forall s (m :: * -> *) a. MonadReader s m => Getting a s a -> m a
view Getting Ngrams NgramsPostag Ngrams
Lens' NgramsPostag Ngrams
np_form NgramsPostag
ng
insertExtractedNgrams :: [ ExtractedNgrams ] -> Cmd err (HashMap Text NgramsId)
[ExtractedNgrams]
ngs = do
let ([ExtractedNgrams]
s, [ExtractedNgrams]
e) = (ExtractedNgrams -> Bool)
-> [ExtractedNgrams] -> ([ExtractedNgrams], [ExtractedNgrams])
forall a. (a -> Bool) -> [a] -> ([a], [a])
List.partition ExtractedNgrams -> Bool
isSimpleNgrams [ExtractedNgrams]
ngs
HashMap Text Int
m1 <- [Ngrams] -> Cmd err (HashMap Text Int)
forall err. [Ngrams] -> Cmd err (HashMap Text Int)
insertNgrams ((ExtractedNgrams -> Ngrams) -> [ExtractedNgrams] -> [Ngrams]
forall (f :: * -> *) a b. Functor f => (a -> b) -> f a -> f b
map ExtractedNgrams -> Ngrams
unSimpleNgrams [ExtractedNgrams]
s)
HashMap Text Int
m2 <- [NgramsPostag] -> Cmd err (HashMap Text Int)
forall err. [NgramsPostag] -> Cmd err (HashMap Text Int)
insertNgramsPostag ((ExtractedNgrams -> NgramsPostag)
-> [ExtractedNgrams] -> [NgramsPostag]
forall (f :: * -> *) a b. Functor f => (a -> b) -> f a -> f b
map ExtractedNgrams -> NgramsPostag
unEnrichedNgrams [ExtractedNgrams]
e)
let result :: HashMap Text Int
result = HashMap Text Int -> HashMap Text Int -> HashMap Text Int
forall k v.
(Eq k, Hashable k) =>
HashMap k v -> HashMap k v -> HashMap k v
HashMap.union HashMap Text Int
m1 HashMap Text Int
m2
HashMap Text Int -> m (HashMap Text Int)
forall (f :: * -> *) a. Applicative f => a -> f a
pure HashMap Text Int
result
isSimpleNgrams :: ExtractedNgrams -> Bool
isSimpleNgrams :: ExtractedNgrams -> Bool
isSimpleNgrams (SimpleNgrams Ngrams
_) = Bool
True
isSimpleNgrams ExtractedNgrams
_ = Bool
False
terms :: TermType Lang -> Text -> IO [Terms]
terms :: TermType Lang -> Text -> IO [Terms]
terms (Mono Lang
lang) Text
txt = [Terms] -> IO [Terms]
forall (f :: * -> *) a. Applicative f => a -> f a
pure ([Terms] -> IO [Terms]) -> [Terms] -> IO [Terms]
forall a b. (a -> b) -> a -> b
$ Lang -> Text -> [Terms]
monoTerms Lang
lang Text
txt
terms (Multi Lang
lang) Text
txt = Lang -> Text -> IO [Terms]
multiterms Lang
lang Text
txt
terms (MonoMulti Lang
lang) Text
txt = TermType Lang -> Text -> IO [Terms]
terms (Lang -> TermType Lang
forall lang. lang -> TermType lang
Multi Lang
lang) Text
txt
terms (Unsupervised { Int
Maybe (Tries Token ())
Lang
_tt_model :: Maybe (Tries Token ())
_tt_ngramsSize :: Int
_tt_windowSize :: Int
_tt_lang :: Lang
_tt_model :: forall lang. TermType lang -> Maybe (Tries Token ())
_tt_ngramsSize :: forall lang. TermType lang -> Int
_tt_windowSize :: forall lang. TermType lang -> Int
_tt_lang :: forall lang. TermType lang -> lang
.. }) Text
txt = TermType Lang -> Text -> IO [Terms]
termsUnsupervised (Unsupervised :: forall lang.
lang -> Int -> Int -> Maybe (Tries Token ()) -> TermType lang
Unsupervised { _tt_model :: Maybe (Tries Token ())
_tt_model = Tries Token () -> Maybe (Tries Token ())
forall a. a -> Maybe a
Just Tries Token ()
m', Int
Lang
_tt_ngramsSize :: Int
_tt_windowSize :: Int
_tt_lang :: Lang
_tt_ngramsSize :: Int
_tt_windowSize :: Int
_tt_lang :: Lang
.. }) Text
txt
where
m' :: Tries Token ()
m' = Tries Token ()
-> (Tries Token () -> Tries Token ())
-> Maybe (Tries Token ())
-> Tries Token ()
forall b a. b -> (a -> b) -> Maybe a -> b
maybe (Int -> Text -> Tries Token ()
newTries Int
_tt_ngramsSize Text
txt) Tries Token () -> Tries Token ()
forall a. a -> a
identity Maybe (Tries Token ())
_tt_model
type WindowSize = Int
type MinNgramSize = Int
termsUnsupervised :: TermType Lang -> Text -> IO [Terms]
termsUnsupervised :: TermType Lang -> Text -> IO [Terms]
termsUnsupervised (Unsupervised Lang
l Int
n Int
s Maybe (Tries Token ())
m) =
[Terms] -> IO [Terms]
forall (f :: * -> *) a. Applicative f => a -> f a
pure
([Terms] -> IO [Terms]) -> (Text -> [Terms]) -> Text -> IO [Terms]
forall b c a. (b -> c) -> (a -> b) -> a -> c
. ([Text] -> Terms) -> [[Text]] -> [Terms]
forall (f :: * -> *) a b. Functor f => (a -> b) -> f a -> f b
map (Lang -> [Text] -> Terms
text2term Lang
l)
([[Text]] -> [Terms]) -> (Text -> [[Text]]) -> Text -> [Terms]
forall b c a. (b -> c) -> (a -> b) -> a -> c
. [[Text]] -> [[Text]]
forall a. Eq a => [a] -> [a]
List.nub
([[Text]] -> [[Text]]) -> (Text -> [[Text]]) -> Text -> [[Text]]
forall b c a. (b -> c) -> (a -> b) -> a -> c
. (([Text] -> Bool) -> [[Text]] -> [[Text]]
forall a. (a -> Bool) -> [a] -> [a]
List.filter (\[Text]
l' -> [Text] -> Int
forall (t :: * -> *) a. Foldable t => t a -> Int
List.length [Text]
l' Int -> Int -> Bool
forall a. Ord a => a -> a -> Bool
>= Int
s))
([[Text]] -> [[Text]]) -> (Text -> [[Text]]) -> Text -> [[Text]]
forall b c a. (b -> c) -> (a -> b) -> a -> c
. [[[Text]]] -> [[Text]]
forall (t :: * -> *) a. Foldable t => t [a] -> [a]
List.concat
([[[Text]]] -> [[Text]])
-> (Text -> [[[Text]]]) -> Text -> [[Text]]
forall b c a. (b -> c) -> (a -> b) -> a -> c
. Tries Token () -> Int -> [[Text]] -> [[[Text]]]
mainEleveWith (Tries Token ()
-> (Tries Token () -> Tries Token ())
-> Maybe (Tries Token ())
-> Tries Token ()
forall b a. b -> (a -> b) -> Maybe a -> b
maybe (Text -> Tries Token ()
forall a. HasCallStack => Text -> a
panic Text
"no model") Tries Token () -> Tries Token ()
forall a. a -> a
identity Maybe (Tries Token ())
m) Int
n
([[Text]] -> [[[Text]]])
-> (Text -> [[Text]]) -> Text -> [[[Text]]]
forall b c a. (b -> c) -> (a -> b) -> a -> c
. Text -> [[Text]]
uniText
termsUnsupervised TermType Lang
_ = Text -> IO [Terms]
forall a. HasCallStack => a
undefined
newTries :: Int -> Text -> Tries Token ()
newTries :: Int -> Text -> Tries Token ()
newTries Int
n Text
t = Int -> [[Token]] -> Tries Token ()
buildTries Int
n (([Text] -> [Token]) -> [[Text]] -> [[Token]]
forall (f :: * -> *) a b. Functor f => (a -> b) -> f a -> f b
fmap [Text] -> [Token]
toToken ([[Text]] -> [[Token]]) -> [[Text]] -> [[Token]]
forall a b. (a -> b) -> a -> b
$ Text -> [[Text]]
uniText Text
t)
uniText :: Text -> [[Text]]
uniText :: Text -> [[Text]]
uniText = ([Text] -> [Text]) -> [[Text]] -> [[Text]]
forall (f :: * -> *) a b. Functor f => (a -> b) -> f a -> f b
map ((Text -> Bool) -> [Text] -> [Text]
forall a. (a -> Bool) -> [a] -> [a]
List.filter (Bool -> Bool
not (Bool -> Bool) -> (Text -> Bool) -> Text -> Bool
forall b c a. (b -> c) -> (a -> b) -> a -> c
. Text -> Bool
isPunctuation))
([[Text]] -> [[Text]]) -> (Text -> [[Text]]) -> Text -> [[Text]]
forall b c a. (b -> c) -> (a -> b) -> a -> c
. (Text -> [Text]) -> [Text] -> [[Text]]
forall (f :: * -> *) a b. Functor f => (a -> b) -> f a -> f b
map Text -> [Text]
tokenize
([Text] -> [[Text]]) -> (Text -> [Text]) -> Text -> [[Text]]
forall b c a. (b -> c) -> (a -> b) -> a -> c
. Text -> [Text]
sentences
(Text -> [Text]) -> (Text -> Text) -> Text -> [Text]
forall b c a. (b -> c) -> (a -> b) -> a -> c
. Text -> Text
Text.toLower
text2term :: Lang -> [Text] -> Terms
text2term :: Lang -> [Text] -> Terms
text2term Lang
_ [] = [Text] -> Stems -> Terms
Terms [] Stems
forall a. Set a
Set.empty
text2term Lang
lang [Text]
txt = [Text] -> Stems -> Terms
Terms [Text]
txt ([Text] -> Stems
forall a. Ord a => [a] -> Set a
Set.fromList ([Text] -> Stems) -> [Text] -> Stems
forall a b. (a -> b) -> a -> b
$ (Text -> Text) -> [Text] -> [Text]
forall (f :: * -> *) a b. Functor f => (a -> b) -> f a -> f b
map (Lang -> Text -> Text
stem Lang
lang) [Text]
txt)
isPunctuation :: Text -> Bool
isPunctuation :: Text -> Bool
isPunctuation Text
x = Text -> [Text] -> Bool
forall (t :: * -> *) a. (Foldable t, Eq a) => a -> t a -> Bool
List.elem Text
x ([Text] -> Bool) -> [Text] -> Bool
forall a b. (a -> b) -> a -> b
$ (String -> Text
Text.pack (String -> Text) -> (Char -> String) -> Char -> Text
forall b c a. (b -> c) -> (a -> b) -> a -> c
. Char -> String
forall (f :: * -> *) a. Applicative f => a -> f a
pure)
(Char -> Text) -> String -> [Text]
forall (f :: * -> *) a b. Functor f => (a -> b) -> f a -> f b
<$> (String
"!?(),;.:" :: String)