{-|
Module      : Gargantext.Core.Text.Metrics.Count
Description : 
Copyright   : (c) CNRS, 2017-Present
License     : AGPL + CECILL v3
Maintainer  : team@gargantext.org
Stability   : experimental
Portability : POSIX

Token and occurrence

An occurrence is not necessarily a token. Considering the sentence:
"A rose is a rose is a rose". We may equally correctly state that there
are eight or three words in the sentence. There are, in fact, three word
types in the sentence: "rose", "is" and "a". There are eight word tokens
in a token copy of the line. The line itself is a type. There are not
eight word types in the line. It contains (as stated) only the three
word types, 'a', 'is' and 'rose', each of which is unique. So what do we
call what there are eight of? They are occurrences of words. There are
three occurrences of the word type 'a', two of 'is' and three of 'rose'.
Source : https://en.wikipedia.org/wiki/Type%E2%80%93token_distinction#Occurrences

-}


module Gargantext.Core.Text.Metrics.Count
  where

import Data.Text (Text)
import Control.Arrow (Arrow(..), (***))
import qualified Data.List as List

import qualified Data.Map.Strict as DMS
import Data.Map.Strict  ( Map, empty, singleton
                        , insertWith, unionWith, unionsWith
                        , mapKeys
                        )
import Data.Set (Set)
import Data.Text (pack)


------------------------------------------------------------------------
import Gargantext.Prelude
import Gargantext.Core.Types
------------------------------------------------------------------------
type Occ  a = Map      a  Int
type Cooc a = Map (a,  a) Int
type FIS  a = Map (Set a) Int

data Group = ByStem | ByOntology

type Grouped = Stems


{-
-- >> let testData = ["blue lagoon", "blues lagoon", "red lagoon"]
-- >> map occurrences <$> Prelude.mapM (terms Mono EN) 
-- [fromList [(fromList ["blue"],1),(fromList ["lagoon"],1)],fromList [(fromList ["blue"],1),(fromList ["lagoon"],1)],fromList [(fromList ["lagoon"],1),(fromList ["red"],1)]]
--λ:   cooc <$> Prelude.map occurrences <$> Prelude.mapM (terms Mono EN) ["blue lagoon", "blues lagoon", "red lagoon"]
--fromList [((fromList ["blue"],fromList ["lagoon"]),2),((fromList ["lagoon"],fromList ["red"]),1)]
--λ:   cooc <$> Prelude.map occurrences <$> Prelude.mapM (terms Mono EN) ["blue lagoon", "blues lagoon", "red lagoon", "red lagoon"]
--fromList [((fromList ["blue"],fromList ["lagoon"]),2),((fromList ["lagoon"],fromList ["red"]),2)]
--λ:   cooc <$> Prelude.map occurrences <$> Prelude.mapM (terms Mono EN) ["blue lagoon", "blues lagoon", "red lagoon red lagoon", "red lagoon"]
--fromList [((fromList ["blue"],fromList ["lagoon"]),2),((fromList ["lagoon"],fromList ["red"]),2)]
--λ:   cooc <$> Prelude.map occurrences <$> Prelude.mapM (terms Mono EN) ["blue lagoon", "blues lagoon blues lagoon", "red lagoon red lagoon", "red lagoon"]
--fromList [((fromList ["blue"],fromList ["lagoon"]),2),((fromList ["lagoon"],fromList ["red"]),2)]
---- 
           -}

type Occs      = Int
type Coocs     = Int
type Threshold = Int

removeApax :: Threshold -> Map ([Text], [Text]) Int -> Map ([Text], [Text]) Int
removeApax :: Threshold
-> Map ([Text], [Text]) Threshold -> Map ([Text], [Text]) Threshold
removeApax Threshold
t = (Threshold -> Bool)
-> Map ([Text], [Text]) Threshold -> Map ([Text], [Text]) Threshold
forall a k. (a -> Bool) -> Map k a -> Map k a
DMS.filter (Threshold -> Threshold -> Bool
forall a. Ord a => a -> a -> Bool
> Threshold
t)

cooc :: [[Terms]] -> Map ([Text], [Text]) Int
cooc :: [[Terms]] -> Map ([Text], [Text]) Threshold
cooc [[Terms]]
tss = (Terms -> Stems)
-> (Stems -> [Text]) -> [[Terms]] -> Map ([Text], [Text]) Threshold
forall label b a.
(Ord label, Ord b) =>
(a -> b) -> (b -> label) -> [[a]] -> Map (label, label) Threshold
coocOnWithLabel Terms -> Stems
_terms_stem (Map Stems [Text] -> Stems -> [Text]
useLabelPolicy Map Stems [Text]
label_policy) [[Terms]]
tss
  where
    terms_occs :: Map Stems (Map Terms Threshold)
terms_occs = (Terms -> Stems) -> [Terms] -> Map Stems (Map Terms Threshold)
forall a b.
(Ord a, Ord b) =>
(a -> b) -> [a] -> Map b (Map a Threshold)
occurrencesOn Terms -> Stems
_terms_stem ([[Terms]] -> [Terms]
forall (t :: * -> *) a. Foldable t => t [a] -> [a]
List.concat [[Terms]]
tss)
    label_policy :: Map Stems [Text]
label_policy = Map Stems (Map Terms Threshold) -> Map Stems [Text]
mkLabelPolicy Map Stems (Map Terms Threshold)
terms_occs


coocOnWithLabel :: (Ord label, Ord b) => (a -> b) -> (b -> label)
                                      -> [[a]] -> Map (label, label) Coocs
coocOnWithLabel :: (a -> b) -> (b -> label) -> [[a]] -> Map (label, label) Threshold
coocOnWithLabel a -> b
on' b -> label
policy [[a]]
tss = ((b, b) -> (label, label))
-> Map (b, b) Threshold -> Map (label, label) Threshold
forall k2 k1 a. Ord k2 => (k1 -> k2) -> Map k1 a -> Map k2 a
mapKeys ((b -> label) -> (b, b) -> (label, label)
forall (a :: * -> * -> *) b' c'.
Arrow a =>
a b' c' -> a (b', b') (c', c')
delta b -> label
policy) (Map (b, b) Threshold -> Map (label, label) Threshold)
-> Map (b, b) Threshold -> Map (label, label) Threshold
forall a b. (a -> b) -> a -> b
$ (a -> b) -> [[a]] -> Map (b, b) Threshold
forall b a. Ord b => (a -> b) -> [[a]] -> Map (b, b) Threshold
coocOn a -> b
on' [[a]]
tss
  where
    delta :: Arrow a => a b' c' -> a (b', b') (c', c')
    delta :: a b' c' -> a (b', b') (c', c')
delta a b' c'
f = a b' c'
f a b' c' -> a b' c' -> a (b', b') (c', c')
forall (a :: * -> * -> *) b c b' c'.
Arrow a =>
a b c -> a b' c' -> a (b, b') (c, c')
*** a b' c'
f


mkLabelPolicy :: Map Grouped (Map Terms Occs) -> Map Grouped [Text]
mkLabelPolicy :: Map Stems (Map Terms Threshold) -> Map Stems [Text]
mkLabelPolicy = (Map Terms Threshold -> [Text])
-> Map Stems (Map Terms Threshold) -> Map Stems [Text]
forall a b k. (a -> b) -> Map k a -> Map k b
DMS.map Map Terms Threshold -> [Text]
f where
  f :: Map Terms Threshold -> [Text]
f = Terms -> [Text]
_terms_label (Terms -> [Text])
-> (Map Terms Threshold -> Terms) -> Map Terms Threshold -> [Text]
forall b c a. (b -> c) -> (a -> b) -> a -> c
. (Terms, Threshold) -> Terms
forall a b. (a, b) -> a
fst ((Terms, Threshold) -> Terms)
-> (Map Terms Threshold -> (Terms, Threshold))
-> Map Terms Threshold
-> Terms
forall b c a. (b -> c) -> (a -> b) -> a -> c
. ((Terms, Threshold) -> Threshold)
-> [(Terms, Threshold)] -> (Terms, Threshold)
forall a1 (t :: * -> *) a2.
(Ord a1, Foldable t) =>
(a2 -> a1) -> t a2 -> a2
maximumWith (Terms, Threshold) -> Threshold
forall a b. (a, b) -> b
snd ([(Terms, Threshold)] -> (Terms, Threshold))
-> (Map Terms Threshold -> [(Terms, Threshold)])
-> Map Terms Threshold
-> (Terms, Threshold)
forall b c a. (b -> c) -> (a -> b) -> a -> c
. Map Terms Threshold -> [(Terms, Threshold)]
forall k a. Map k a -> [(k, a)]
DMS.toList
     -- TODO use the Foldable instance of Map instead of building a list

useLabelPolicy :: Map Grouped [Text] -> Grouped -> [Text]
useLabelPolicy :: Map Stems [Text] -> Stems -> [Text]
useLabelPolicy Map Stems [Text]
m Stems
g = case Stems -> Map Stems [Text] -> Maybe [Text]
forall k a. Ord k => k -> Map k a -> Maybe a
DMS.lookup Stems
g Map Stems [Text]
m of
  Just [Text]
label -> [Text]
label
  Maybe [Text]
Nothing    -> Text -> [Text]
forall a. HasCallStack => Text -> a
panic (Text -> [Text]) -> Text -> [Text]
forall a b. (a -> b) -> a -> b
$ Text
"Label of Grouped not found: " Text -> Text -> Text
forall a. Semigroup a => a -> a -> a
<> (String -> Text
pack (String -> Text) -> String -> Text
forall a b. (a -> b) -> a -> b
$ Stems -> String
forall a. Show a => a -> String
show Stems
g)
                -- TODO: use a non-fatal error if this can happen in practice
{-
labelPolicy :: Map Grouped (Map Terms Occs) -> Grouped -> Label
labelPolicy m g =  case _terms_label <$> fst <$> maximumWith snd <$> DMS.toList <$> lookup g m of
                     Just label -> label
                     Nothing    -> panic $ "Label of Grouped not found: " <> (pack $ show g)
-}

coocOn :: Ord b => (a -> b) -> [[a]] -> Map (b, b) Int
coocOn :: (a -> b) -> [[a]] -> Map (b, b) Threshold
coocOn a -> b
f [[a]]
as = (Threshold -> Threshold -> Threshold)
-> [Map (b, b) Threshold] -> Map (b, b) Threshold
forall (f :: * -> *) k a.
(Foldable f, Ord k) =>
(a -> a -> a) -> f (Map k a) -> Map k a
DMS.unionsWith Threshold -> Threshold -> Threshold
forall a. Num a => a -> a -> a
(+) ([Map (b, b) Threshold] -> Map (b, b) Threshold)
-> [Map (b, b) Threshold] -> Map (b, b) Threshold
forall a b. (a -> b) -> a -> b
$ ([a] -> Map (b, b) Threshold) -> [[a]] -> [Map (b, b) Threshold]
forall (f :: * -> *) a b. Functor f => (a -> b) -> f a -> f b
map ((a -> b) -> [a] -> Map (b, b) Threshold
forall b a. Ord b => (a -> b) -> [a] -> Map (b, b) Threshold
coocOn' a -> b
f) [[a]]
as

coocOn' :: Ord b => (a -> b) -> [a] -> Map (b, b) Int
coocOn' :: (a -> b) -> [a] -> Map (b, b) Threshold
coocOn' a -> b
fun [a]
ts = (Threshold -> Threshold -> Threshold)
-> [((b, b), Threshold)] -> Map (b, b) Threshold
forall k a. Ord k => (a -> a -> a) -> [(k, a)] -> Map k a
DMS.fromListWith Threshold -> Threshold -> Threshold
forall a. Num a => a -> a -> a
(+) [((b, b), Threshold)]
xs
  where
      ts' :: [b]
ts' = [b] -> [b]
forall a. Eq a => [a] -> [a]
List.nub ([b] -> [b]) -> [b] -> [b]
forall a b. (a -> b) -> a -> b
$ (a -> b) -> [a] -> [b]
forall (f :: * -> *) a b. Functor f => (a -> b) -> f a -> f b
map a -> b
fun [a]
ts
      xs :: [((b, b), Threshold)]
xs = [ ((b
x, b
y), Threshold
1)
           | b
x <- [b]
ts'
           , b
y <- [b]
ts'
           , b
x b -> b -> Bool
forall a. Ord a => a -> a -> Bool
>= b
y
           ]


------------------------------------------------------------------------
coocOnContexts :: (a -> [Text]) -> [[a]] -> Map ([Text], [Text]) Int
coocOnContexts :: (a -> [Text]) -> [[a]] -> Map ([Text], [Text]) Threshold
coocOnContexts a -> [Text]
fun = (Threshold -> Threshold -> Threshold)
-> [(([Text], [Text]), Threshold)]
-> Map ([Text], [Text]) Threshold
forall k a. Ord k => (a -> a -> a) -> [(k, a)] -> Map k a
DMS.fromListWith Threshold -> Threshold -> Threshold
forall a. Num a => a -> a -> a
(+) ([(([Text], [Text]), Threshold)] -> Map ([Text], [Text]) Threshold)
-> ([[a]] -> [(([Text], [Text]), Threshold)])
-> [[a]]
-> Map ([Text], [Text]) Threshold
forall b c a. (b -> c) -> (a -> b) -> a -> c
. [[(([Text], [Text]), Threshold)]]
-> [(([Text], [Text]), Threshold)]
forall (t :: * -> *) a. Foldable t => t [a] -> [a]
List.concat ([[(([Text], [Text]), Threshold)]]
 -> [(([Text], [Text]), Threshold)])
-> ([[a]] -> [[(([Text], [Text]), Threshold)]])
-> [[a]]
-> [(([Text], [Text]), Threshold)]
forall b c a. (b -> c) -> (a -> b) -> a -> c
. ([a] -> [(([Text], [Text]), Threshold)])
-> [[a]] -> [[(([Text], [Text]), Threshold)]]
forall (f :: * -> *) a b. Functor f => (a -> b) -> f a -> f b
map ((a -> [Text]) -> [a] -> [(([Text], [Text]), Threshold)]
forall a. (a -> [Text]) -> [a] -> [(([Text], [Text]), Threshold)]
coocOnSingleContext a -> [Text]
fun)

coocOnSingleContext :: (a -> [Text]) -> [a] -> [(([Text], [Text]), Int)]
coocOnSingleContext :: (a -> [Text]) -> [a] -> [(([Text], [Text]), Threshold)]
coocOnSingleContext a -> [Text]
fun [a]
ts = [(([Text], [Text]), Threshold)]
xs
  where
      ts' :: [[Text]]
ts' = [[Text]] -> [[Text]]
forall a. Eq a => [a] -> [a]
List.nub ([[Text]] -> [[Text]]) -> [[Text]] -> [[Text]]
forall a b. (a -> b) -> a -> b
$ (a -> [Text]) -> [a] -> [[Text]]
forall (f :: * -> *) a b. Functor f => (a -> b) -> f a -> f b
map a -> [Text]
fun [a]
ts
      xs :: [(([Text], [Text]), Threshold)]
xs = [ (([Text]
x, [Text]
y), Threshold
1)
           | [Text]
x <- [[Text]]
ts'
           , [Text]
y <- [[Text]]
ts'
           , [Text]
x [Text] -> [Text] -> Bool
forall a. Ord a => a -> a -> Bool
>= [Text]
y
           ]
------------------------------------------------------------------------


-- | Compute the grouped occurrences (occ)
occurrences :: [Terms] -> Map Grouped (Map Terms Int)
occurrences :: [Terms] -> Map Stems (Map Terms Threshold)
occurrences = (Terms -> Stems) -> [Terms] -> Map Stems (Map Terms Threshold)
forall a b.
(Ord a, Ord b) =>
(a -> b) -> [a] -> Map b (Map a Threshold)
occurrencesOn Terms -> Stems
_terms_stem

occurrencesOn :: (Ord a, Ord b) => (a -> b) -> [a] -> Map b (Map a Int)
occurrencesOn :: (a -> b) -> [a] -> Map b (Map a Threshold)
occurrencesOn a -> b
f = (Map b (Map a Threshold) -> a -> Map b (Map a Threshold))
-> Map b (Map a Threshold) -> [a] -> Map b (Map a Threshold)
forall (t :: * -> *) b a.
Foldable t =>
(b -> a -> b) -> b -> t a -> b
foldl' (\Map b (Map a Threshold)
m a
a -> (Map a Threshold -> Map a Threshold -> Map a Threshold)
-> b
-> Map a Threshold
-> Map b (Map a Threshold)
-> Map b (Map a Threshold)
forall k a. Ord k => (a -> a -> a) -> k -> a -> Map k a -> Map k a
insertWith ((Threshold -> Threshold -> Threshold)
-> Map a Threshold -> Map a Threshold -> Map a Threshold
forall k a. Ord k => (a -> a -> a) -> Map k a -> Map k a -> Map k a
unionWith Threshold -> Threshold -> Threshold
forall a. Num a => a -> a -> a
(+)) (a -> b
f a
a) (a -> Threshold -> Map a Threshold
forall k a. k -> a -> Map k a
singleton a
a Threshold
1) Map b (Map a Threshold)
m) Map b (Map a Threshold)
forall k a. Map k a
empty

occurrencesWith :: (Foldable list, Ord k, Num a) => (b -> k) -> list b -> Map k a
occurrencesWith :: (b -> k) -> list b -> Map k a
occurrencesWith b -> k
f list b
xs = (Map k a -> b -> Map k a) -> Map k a -> list b -> Map k a
forall (t :: * -> *) b a.
Foldable t =>
(b -> a -> b) -> b -> t a -> b
foldl' (\Map k a
x b
y -> (a -> a -> a) -> k -> a -> Map k a -> Map k a
forall k a. Ord k => (a -> a -> a) -> k -> a -> Map k a -> Map k a
insertWith a -> a -> a
forall a. Num a => a -> a -> a
(+) (b -> k
f b
y) a
1 Map k a
x) Map k a
forall k a. Map k a
empty list b
xs

-- TODO add groups and filter stops

sumOcc :: Ord a => [Occ a] -> Occ a
sumOcc :: [Occ a] -> Occ a
sumOcc [Occ a]
xs = (Threshold -> Threshold -> Threshold) -> [Occ a] -> Occ a
forall (f :: * -> *) k a.
(Foldable f, Ord k) =>
(a -> a -> a) -> f (Map k a) -> Map k a
unionsWith Threshold -> Threshold -> Threshold
forall a. Num a => a -> a -> a
(+) [Occ a]
xs