{-|
Module      : Gargantext.Core.Text.Search
Description : Full-text search engine over CSV documents.
Copyright   : (c) CNRS, 2017 - present
License     : AGPL + CECILL v3
Maintainer  : team@gargantext.org
Stability   : experimental
Portability : POSIX

This search engine was first made to clean a CSV file according to a query.

Starting from this model, a specific Gargantext engine will be made
(using more metrics scores/features).
-}

module Gargantext.Core.Text.Search where

import Data.SearchEngine

import Data.Ix

-- Useful for handling stopwords
-- import Data.Set (Set)
-- import qualified Data.Set as Set
import Data.Text (Text)

import Gargantext.Prelude
import Gargantext.Core.Text.Terms.Mono (monoTexts)
import Gargantext.Core.Text.Terms.Mono.Stem as ST
import Gargantext.Core.Text.Corpus.Parsers.CSV

-- | Key under which a document is stored in the 'DocSearchEngine'.
type DocId = Int

-- | 'SearchEngine' specialised to 'CsvGargV3' documents keyed by 'DocId',
-- searched over the fields of 'DocField', with no extra ranking features.
type DocSearchEngine =
  SearchEngine CsvGargV3 DocId DocField NoFeatures

-- | Fields of a document that are indexed and searched separately.
--
-- The 'Ix' and 'Bounded' instances are required by 'initSearchEngine' for
-- the field type.
data DocField = TitleField     -- ^ The document title.
              | AbstractField  -- ^ The document abstract.
  deriving (Eq, Ord, Enum, Bounded, Ix, Show)

-- | Initial document search engine, built with 'initSearchEngine' from
-- 'docSearchConfig' and 'defaultSearchRankParameters'.
initialDocSearchEngine :: DocSearchEngine
initialDocSearchEngine =
    initSearchEngine docSearchConfig defaultSearchRankParameters

-- | How 'CsvGargV3' documents are indexed and how queries are normalised.
--
-- * Documents are keyed by 'd_docId'.
-- * Terms are extracted per field with 'monoTexts' from 'd_title' and
--   'd_abstract'.
-- * Query tokens are stemmed with @ST.stem ST.EN@, whatever the field.
-- * No document feature is scored ('NoFeatures').
--
-- NOTE(review): query tokens are stemmed here while document terms come
-- straight from 'monoTexts'; confirm that 'monoTexts' applies a compatible
-- normalisation, otherwise stemmed queries may miss unstemmed terms.
docSearchConfig :: SearchConfig CsvGargV3 DocId DocField NoFeatures
docSearchConfig =
    SearchConfig {
      documentKey           = d_docId,
      extractDocumentTerms  = extractTerms,
      transformQueryTerm    = normaliseQueryToken,
      documentFeatureValue  = const noFeatures
    }
  where
    -- Terms of one field of a document.
    extractTerms :: CsvGargV3 -> DocField -> [Text]
    extractTerms doc TitleField       = monoTexts (d_title doc)
    extractTerms doc AbstractField    = monoTexts (d_abstract doc)

    -- Both fields currently share the same stemming; the explicit case keeps
    -- the match exhaustive so a new 'DocField' constructor is flagged here.
    normaliseQueryToken :: Text -> DocField -> Text
    normaliseQueryToken tok =
      let tokStem = ST.stem ST.EN
       in \field -> case field of
                      TitleField    -> tokStem tok
                      AbstractField -> tokStem tok

-- | Default ranking parameters for the document search engine.
--
-- Title matches are weighted 20 and abstract matches 5. Result sets have a
-- soft limit of 2000 and a hard limit of 4000; both autosuggest filter
-- limits are 500. No feature weights or functions are used ('NoFeatures').
--
-- NOTE(review): 'paramK1' and 'paramB' are presumably the BM25 @k1@ and
-- per-field @b@ parameters -- confirm against the "Data.SearchEngine"
-- documentation before tuning them.
defaultSearchRankParameters :: SearchRankParameters DocField NoFeatures
defaultSearchRankParameters =
    SearchRankParameters {
      paramK1                 = k1,
      paramB                  = fieldB,
      paramFieldWeights       = fieldWeight,
      paramFeatureWeights     = noFeatures,
      paramFeatureFunctions   = noFeatures,
      paramResultsetSoftLimit = 2000,
      paramResultsetHardLimit = 4000,
      paramAutosuggestPrefilterLimit  = 500,
      paramAutosuggestPostfilterLimit = 500
    }
  where
    k1 :: Float
    k1 = 1.5

    -- Per-field value for 'paramB'.
    fieldB :: DocField -> Float
    fieldB TitleField      = 0.9
    fieldB AbstractField   = 0.5

    -- Relative weight of a match in each field.
    fieldWeight :: DocField -> Float
    fieldWeight TitleField    = 20
    fieldWeight AbstractField = 5