{-|
Module      : Gargantext.Core.Text
Description : Ngrams tools
Copyright   : (c) CNRS, 2018
License     : AGPL + CECILL v3
Maintainer  : team@gargantext.org
Stability   : experimental
Portability : POSIX

Text gathers terms in unit of contexts.

-}


module Gargantext.Core.Text
  where

import Data.Text (Text, split)
import Gargantext.Prelude hiding (filter)
import NLP.FullStop (segment)
import qualified Data.Text as DT

-----------------------------------------------------------------
class HasText h
  where
    hasText :: h -> [Text]

-----------------------------------------------------------------
-- French words to distinguish contexts
newtype Texte      = Texte      Text
newtype Paragraphe = Paragraphe Text
newtype Phrase     = Phrase     Text
newtype MultiTerme = MultiTerme Text
newtype Mot        = Mot        Text
newtype Lettre     = Lettre     Text

-- | Type syn seems obvious
type    Titre      = Phrase

-----------------------------------------------------------------

instance Show Texte where
  show :: Texte -> String
show (Texte Text
t) = Text -> String
forall a. Show a => a -> String
show Text
t

instance Show Paragraphe where
  show :: Paragraphe -> String
show (Paragraphe Text
p) = Text -> String
forall a. Show a => a -> String
show Text
p

instance Show Phrase where
  show :: Phrase -> String
show (Phrase Text
p) = Text -> String
forall a. Show a => a -> String
show Text
p

instance Show MultiTerme where
  show :: MultiTerme -> String
show (MultiTerme Text
mt) = Text -> String
forall a. Show a => a -> String
show Text
mt

instance Show Mot where
  show :: Mot -> String
show (Mot Text
t) = Text -> String
forall a. Show a => a -> String
show Text
t

instance Show Lettre where
  show :: Lettre -> String
show (Lettre Text
l) = Text -> String
forall a. Show a => a -> String
show Text
l

-----------------------------------------------------------------

class Collage sup inf where
  dec ::  sup  -> [inf]
  inc :: [inf] -> sup

instance Collage Texte Paragraphe where
  dec :: Texte -> [Paragraphe]
dec (Texte Text
t) = (Text -> Paragraphe) -> [Text] -> [Paragraphe]
forall (f :: * -> *) a b. Functor f => (a -> b) -> f a -> f b
map Text -> Paragraphe
Paragraphe ([Text] -> [Paragraphe]) -> [Text] -> [Paragraphe]
forall a b. (a -> b) -> a -> b
$ Text -> Text -> [Text]
DT.splitOn Text
"\n" Text
t
  inc :: [Paragraphe] -> Texte
inc           = Text -> Texte
Texte (Text -> Texte) -> ([Paragraphe] -> Text) -> [Paragraphe] -> Texte
forall b c a. (b -> c) -> (a -> b) -> a -> c
. Text -> [Text] -> Text
DT.intercalate Text
"\n" ([Text] -> Text)
-> ([Paragraphe] -> [Text]) -> [Paragraphe] -> Text
forall b c a. (b -> c) -> (a -> b) -> a -> c
. (Paragraphe -> Text) -> [Paragraphe] -> [Text]
forall (f :: * -> *) a b. Functor f => (a -> b) -> f a -> f b
map (\(Paragraphe Text
t) -> Text
t)

instance Collage Paragraphe Phrase where
  dec :: Paragraphe -> [Phrase]
dec (Paragraphe Text
t) = (Text -> Phrase) -> [Text] -> [Phrase]
forall (f :: * -> *) a b. Functor f => (a -> b) -> f a -> f b
map Text -> Phrase
Phrase ([Text] -> [Phrase]) -> [Text] -> [Phrase]
forall a b. (a -> b) -> a -> b
$ Text -> [Text]
sentences Text
t
  inc :: [Phrase] -> Paragraphe
inc                = Text -> Paragraphe
Paragraphe (Text -> Paragraphe)
-> ([Phrase] -> Text) -> [Phrase] -> Paragraphe
forall b c a. (b -> c) -> (a -> b) -> a -> c
. [Text] -> Text
DT.unwords ([Text] -> Text) -> ([Phrase] -> [Text]) -> [Phrase] -> Text
forall b c a. (b -> c) -> (a -> b) -> a -> c
. (Phrase -> Text) -> [Phrase] -> [Text]
forall (f :: * -> *) a b. Functor f => (a -> b) -> f a -> f b
map (\(Phrase Text
p) -> Text
p)

instance Collage Phrase MultiTerme where
  dec :: Phrase -> [MultiTerme]
dec (Phrase Text
t) = (Text -> MultiTerme) -> [Text] -> [MultiTerme]
forall (f :: * -> *) a b. Functor f => (a -> b) -> f a -> f b
map Text -> MultiTerme
MultiTerme ([Text] -> [MultiTerme]) -> [Text] -> [MultiTerme]
forall a b. (a -> b) -> a -> b
$ Text -> [Text]
DT.words Text
t
  inc :: [MultiTerme] -> Phrase
inc            = Text -> Phrase
Phrase (Text -> Phrase)
-> ([MultiTerme] -> Text) -> [MultiTerme] -> Phrase
forall b c a. (b -> c) -> (a -> b) -> a -> c
. [Text] -> Text
DT.unwords ([Text] -> Text)
-> ([MultiTerme] -> [Text]) -> [MultiTerme] -> Text
forall b c a. (b -> c) -> (a -> b) -> a -> c
. (MultiTerme -> Text) -> [MultiTerme] -> [Text]
forall (f :: * -> *) a b. Functor f => (a -> b) -> f a -> f b
map (\(MultiTerme Text
p) -> Text
p)

instance Collage MultiTerme Mot where
  dec :: MultiTerme -> [Mot]
dec (MultiTerme Text
mt) = (Text -> Mot) -> [Text] -> [Mot]
forall (f :: * -> *) a b. Functor f => (a -> b) -> f a -> f b
map Text -> Mot
Mot ([Text] -> [Mot]) -> [Text] -> [Mot]
forall a b. (a -> b) -> a -> b
$ Text -> [Text]
DT.words Text
mt
  inc :: [Mot] -> MultiTerme
inc                 = Text -> MultiTerme
MultiTerme (Text -> MultiTerme) -> ([Mot] -> Text) -> [Mot] -> MultiTerme
forall b c a. (b -> c) -> (a -> b) -> a -> c
. Text -> [Text] -> Text
DT.intercalate Text
" " ([Text] -> Text) -> ([Mot] -> [Text]) -> [Mot] -> Text
forall b c a. (b -> c) -> (a -> b) -> a -> c
. (Mot -> Text) -> [Mot] -> [Text]
forall (f :: * -> *) a b. Functor f => (a -> b) -> f a -> f b
map (\(Mot Text
m) -> Text
m)

-------------------------------------------------------------------
-- Contexts of text
sentences :: Text -> [Text]
sentences :: Text -> [Text]
sentences Text
txt = (String -> Text) -> [String] -> [Text]
forall (f :: * -> *) a b. Functor f => (a -> b) -> f a -> f b
map String -> Text
DT.pack ([String] -> [Text]) -> [String] -> [Text]
forall a b. (a -> b) -> a -> b
$ String -> [String]
segment (String -> [String]) -> String -> [String]
forall a b. (a -> b) -> a -> b
$ Text -> String
DT.unpack Text
txt

sentences' :: Text -> [Text]
sentences' :: Text -> [Text]
sentences' Text
txt = (Char -> Bool) -> Text -> [Text]
split Char -> Bool
isCharStop Text
txt

isCharStop :: Char -> Bool
isCharStop :: Char -> Bool
isCharStop Char
c = Char
c Char -> String -> Bool
forall (t :: * -> *) a. (Foldable t, Eq a) => a -> t a -> Bool
`elem` [Char
'.',Char
'?',Char
'!']

unsentences :: [Text] -> Text
unsentences :: [Text] -> Text
unsentences [Text]
txts = Text -> [Text] -> Text
DT.intercalate Text
" " [Text]
txts

-- | Ngrams size
size :: Text -> Int
size :: Text -> Int
size Text
t = Int
1 Int -> Int -> Int
forall a. Num a => a -> a -> a
+ Text -> Text -> Int
DT.count Text
" " Text
t