Copyright | (c) CNRS 2017 - present |
---|---|
License | AGPL + CECILL v3 |
Maintainer | team@gargantext.org |
Stability | experimental |
Portability | POSIX |
Safe Haskell | None |
Language | Haskell2010 |
Gargantext.Core.Text.Terms
Description
An n-gram
is a contiguous sequence of n items from a given sample of
text. In the Gargantext application, the items are words and n is a
non-negative integer.
Using Latin numerical prefixes, an n-gram of size 1 is referred to as a "unigram"; size 2 is a "bigram" (or, less commonly, a "digram"); size 3 is a "trigram". English cardinal numbers are sometimes used, e.g., "four-gram", "five-gram", and so on.
Source: https://en.wikipedia.org/wiki/Ngrams
TODO: group Ngrams -> Tree; compute occ by node of Tree; group occs according to groups;
compute cooccurrences; compute graph.
Synopsis
- data TermType lang
- = Mono {
- _tt_lang :: !lang
- | Multi {
- _tt_lang :: !lang
- | MonoMulti {
- _tt_lang :: !lang
- | Unsupervised {
- _tt_lang :: !lang
- _tt_windowSize :: !Int
- _tt_ngramsSize :: !Int
- _tt_model :: !(Maybe (Tries Token ()))
- = Mono {
- tt_windowSize :: forall lang. Traversal' (TermType lang) Int
- tt_ngramsSize :: forall lang. Traversal' (TermType lang) Int
- tt_model :: forall lang. Traversal' (TermType lang) (Maybe (Tries Token ()))
- tt_lang :: forall lang lang. Lens (TermType lang) (TermType lang) lang lang
- extractTerms :: TermType Lang -> [Text] -> IO [[Terms]]
- withLang :: (Foldable t, Functor t, HasText h) => TermType Lang -> t h -> TermType Lang
- data ExtractedNgrams
- = SimpleNgrams {
- unSimpleNgrams :: Ngrams
- | EnrichedNgrams { }
- = SimpleNgrams {
- class ExtractNgramsT h where
- extractNgramsT :: HasText h => TermType Lang -> h -> Cmd err (HashMap ExtractedNgrams (Map NgramsType Int))
- enrichedTerms :: Lang -> PosTagAlgo -> POS -> Terms -> NgramsPostag
- cleanNgrams :: Int -> Ngrams -> Ngrams
- cleanExtractedNgrams :: Int -> ExtractedNgrams -> ExtractedNgrams
- extracted2ngrams :: ExtractedNgrams -> Ngrams
- insertExtractedNgrams :: [ExtractedNgrams] -> Cmd err (HashMap Text NgramsId)
- isSimpleNgrams :: ExtractedNgrams -> Bool
- terms :: TermType Lang -> Text -> IO [Terms]
- type WindowSize = Int
- type MinNgramSize = Int
- termsUnsupervised :: TermType Lang -> Text -> IO [Terms]
- newTries :: Int -> Text -> Tries Token ()
- uniText :: Text -> [[Text]]
- text2term :: Lang -> [Text] -> Terms
- isPunctuation :: Text -> Bool
Documentation
Constructors
Mono | |
Fields
| |
Multi | |
Fields
| |
MonoMulti | |
Fields
| |
Unsupervised | |
Fields
|
tt_windowSize :: forall lang. Traversal' (TermType lang) Int Source #
tt_ngramsSize :: forall lang. Traversal' (TermType lang) Int Source #
extractTerms :: TermType Lang -> [Text] -> IO [[Terms]] Source #
Sugar to extract terms from text (hiding mapM from the end user). extractTerms :: Traversable t => TermType Lang -> t Text -> IO (t [Terms])
data ExtractedNgrams Source #
Constructors
SimpleNgrams | |
Fields
| |
EnrichedNgrams | |
Fields |
Instances
class ExtractNgramsT h where Source #
Methods
extractNgramsT :: HasText h => TermType Lang -> h -> Cmd err (HashMap ExtractedNgrams (Map NgramsType Int)) Source #
Instances
ExtractNgramsT HyperdataDocument Source # | |
Defined in Gargantext.Database.Action.Flow Methods extractNgramsT :: HasText HyperdataDocument => TermType Lang -> HyperdataDocument -> Cmd err (HashMap ExtractedNgrams (Map NgramsType Int)) Source # | |
ExtractNgramsT HyperdataContact Source # | |
Defined in Gargantext.Database.Action.Flow Methods extractNgramsT :: HasText HyperdataContact => TermType Lang -> HyperdataContact -> Cmd err (HashMap ExtractedNgrams (Map NgramsType Int)) Source # | |
(ExtractNgramsT a, HasText a) => ExtractNgramsT (Node a) Source # | |
Defined in Gargantext.Database.Action.Flow |
enrichedTerms :: Lang -> PosTagAlgo -> POS -> Terms -> NgramsPostag Source #
cleanNgrams :: Int -> Ngrams -> Ngrams Source #
extracted2ngrams :: ExtractedNgrams -> Ngrams Source #
insertExtractedNgrams :: [ExtractedNgrams] -> Cmd err (HashMap Text NgramsId) Source #
isSimpleNgrams :: ExtractedNgrams -> Bool Source #
terms :: TermType Lang -> Text -> IO [Terms] Source #
Terms from Text. Mono: mono terms; Multi: multi terms; MonoMulti: mono and multi. TODO: multi terms should exclude mono (intersection is not empty yet).
type WindowSize = Int Source #
Unsupervised ngrams extraction: language-agnostic extraction. TODO: remove IO. TODO: newtype BlockText.
type MinNgramSize = Int Source #
isPunctuation :: Text -> Bool Source #