Copyright | (c) CNRS 2017 - present |
---|---|
License | AGPL + CECILL v3 |
Maintainer | team@gargantext.org |
Stability | experimental |
Portability | POSIX |
Safe Haskell | Safe-Inferred |
Language | Haskell2010 |
An n-gram
is a contiguous sequence of n items from a given sample of
text. In the Gargantext application, the items are words and n is a
non-negative integer.
Using Latin numerical prefixes, an n-gram of size 1 is referred to as a "unigram"; size 2 is a "bigram" (or, less commonly, a "digram"); size 3 is a "trigram". English cardinal numbers are sometimes used, e.g., "four-gram", "five-gram", and so on.
Source: https://en.wikipedia.org/wiki/Ngrams
TODO: group Ngrams -> Tree; compute occ by node of Tree; group occs according to groups;
compute cooccurrences; compute graph.
Synopsis
- data TermType lang
- tt_windowSize :: forall lang. Traversal' (TermType lang) Int
- tt_ngramsSize :: forall lang. Traversal' (TermType lang) Int
- tt_model :: forall lang. Traversal' (TermType lang) (Maybe (Tries Token ()))
- tt_lang :: forall lang lang. Lens (TermType lang) (TermType lang) lang lang
- extractTerms :: NLPServerConfig -> TermType Lang -> [Text] -> IO [[TermsWithCount]]
- withLang :: (Foldable t, Functor t, HasText h) => TermType Lang -> t h -> TermType Lang
- data ExtractedNgrams
- = SimpleNgrams { }
- | EnrichedNgrams { }
- class ExtractNgramsT h where
- extractNgramsT :: HasText h => NLPServerConfig -> TermType Lang -> h -> DBCmd err (HashMap ExtractedNgrams (Map NgramsType Int, TermsCount))
- enrichedTerms :: Lang -> PosTagAlgo -> POS -> Terms -> NgramsPostag
- cleanNgrams :: Int -> Ngrams -> Ngrams
- cleanExtractedNgrams :: Int -> ExtractedNgrams -> ExtractedNgrams
- extracted2ngrams :: ExtractedNgrams -> Ngrams
- insertExtractedNgrams :: [ExtractedNgrams] -> DBCmd err (HashMap Text NgramsId)
- isSimpleNgrams :: ExtractedNgrams -> Bool
- terms :: NLPServerConfig -> TermType Lang -> Text -> IO [TermsWithCount]
- type WindowSize = Int
- type MinNgramSize = Int
- termsUnsupervised :: TermType Lang -> Text -> [TermsWithCount]
- newTries :: Int -> Text -> Tries Token ()
- uniText :: Text -> [[Text]]
- text2term :: Lang -> [Text] -> Terms
- isPunctuation :: Text -> Bool
Documentation
Mono | |
| |
Multi | |
| |
MonoMulti | |
| |
Unsupervised | |
|
Instances
tt_windowSize :: forall lang. Traversal' (TermType lang) Int #
tt_ngramsSize :: forall lang. Traversal' (TermType lang) Int #
extractTerms :: NLPServerConfig -> TermType Lang -> [Text] -> IO [[TermsWithCount]] #
Sugar to extract terms from text (hiding mapM
from the end user).
extractTerms :: Traversable t => TermType Lang -> t Text -> IO (t [Terms])
data ExtractedNgrams #
Instances
class ExtractNgramsT h where #
A typeclass that represents extracting ngrams from an entity.
extractNgramsT :: HasText h => NLPServerConfig -> TermType Lang -> h -> DBCmd err (HashMap ExtractedNgrams (Map NgramsType Int, TermsCount)) #
Instances
ExtractNgramsT HyperdataContact # | |
Defined in Gargantext.Database.Action.Flow.Extract extractNgramsT :: HasText HyperdataContact => NLPServerConfig -> TermType Lang -> HyperdataContact -> DBCmd err (HashMap ExtractedNgrams (Map NgramsType Int, TermsCount)) # | |
ExtractNgramsT HyperdataDocument # | Main ngrams extraction functionality. For NgramsTerms, this calls the NLP server under the hood. For Sources, Institutes, Authors, this uses a simple split on " ". |
Defined in Gargantext.Database.Action.Flow.Extract extractNgramsT :: HasText HyperdataDocument => NLPServerConfig -> TermType Lang -> HyperdataDocument -> DBCmd err (HashMap ExtractedNgrams (Map NgramsType Int, TermsCount)) # | |
(ExtractNgramsT a, HasText a) => ExtractNgramsT (Node a) # | |
Defined in Gargantext.Database.Action.Flow.Extract extractNgramsT :: HasText (Node a) => NLPServerConfig -> TermType Lang -> Node a -> DBCmd err (HashMap ExtractedNgrams (Map NgramsType Int, TermsCount)) # |
enrichedTerms :: Lang -> PosTagAlgo -> POS -> Terms -> NgramsPostag #
cleanNgrams :: Int -> Ngrams -> Ngrams #
insertExtractedNgrams :: [ExtractedNgrams] -> DBCmd err (HashMap Text NgramsId) #
isSimpleNgrams :: ExtractedNgrams -> Bool #
terms :: NLPServerConfig -> TermType Lang -> Text -> IO [TermsWithCount] #
type WindowSize = Int #
type MinNgramSize = Int #
termsUnsupervised :: TermType Lang -> Text -> [TermsWithCount] #
Unsupervised, language-agnostic ngrams extraction. TODO: newtype BlockText
isPunctuation :: Text -> Bool #