Copyright | (c) CNRS 2019-Present |
---|---|
License | AGPL + CECILL v3 |
Maintainer | team@gargantext.org |
Stability | experimental |
Portability | POSIX |
Safe Haskell | Safe-Inferred |
Language | Haskell2010 |
# Implementation of Unsupervised Word Segmentation
References:
- Python implementation (Korantin August, Emmanuel Navarro): EleVe
- Unsupervized Word Segmentation: the case for Mandarin Chinese. Pierre Magistry, Benoît Sagot, Alpage, INRIA & Univ. Paris 7, Proceedings of the 50th Annual Meeting of the Association for Computational Linguistics, pages 383–387. PDF
Notes for current implementation:
- TODO extract longer ngrams (see paper above, the Viterbi algorithm can be used)
- TODO AD TEST: prop (Node c _e f) = c == Map.size f
- AD: Real ngrams extraction test:
    from Gargantext.Core.Text.Terms import extractTermsUnsupervised
    docs <- runCmdRepl $ selectDocs 1004
    extractTermsUnsupervised 3 $ DT.intercalate " " $ catMaybes $ Gargantext.map _hyperdataDocument_abstract docs
Synopsis
- nan :: Floating e => e
- noNaNs :: RealFloat e => [e] -> [e]
- updateIfDefined :: RealFloat e => e -> e -> e
- sim :: Entropy e => e -> e -> Bool
- subst :: Entropy e => (e, e) -> e -> e
- type Entropy e = (Fractional e, Floating e, RealFloat e, Show e)
- data I e = I {
- _info_entropy :: e
- _info_entropy_var :: e
- _info_autonomy :: e
- }
- info_entropy_var :: forall e. Lens' (I e) e
- info_entropy :: forall e. Lens' (I e) e
- info_autonomy :: forall e. Lens' (I e) e
- type ModEntropy i o e = (e -> e) -> i -> o
- set_autonomy :: Entropy e => ModEntropy (I e) (I e) e
- set_entropy_var :: Entropy e => Setter e (I e) e e
- data StartStop
- data Token
- isTerminal :: Token -> Bool
- nonTerminals :: [Token] -> [Text]
- parseToken :: Text -> Token
- toToken :: [Text] -> [Token]
- printToken :: Token -> Text
- data Trie k e
- = Node {
- _node_count :: Int
- _node_entropy :: e
- _node_children :: Map k (Trie k e)
- }
- | Leaf {
- _node_count :: Int
- }
- node_entropy :: forall k e. Traversal' (Trie k e) e
- node_count :: forall k e. Lens' (Trie k e) Int
- node_children :: forall k e k. Traversal (Trie k e) (Trie k e) (Map k (Trie k e)) (Map k (Trie k e))
- insertTrie :: Ord k => [k] -> Trie k () -> Trie k ()
- emptyTrie :: Trie k e
- mkTrie :: Monoid e => Int -> Map k (Trie k e) -> Trie k e
- toTree :: k -> Trie k e -> Tree (k, Int, Maybe e)
- normalizeLevel :: Entropy e => e -> e -> e -> e
- chunkAlongEleve :: Int -> [a] -> [[a]]
- data Direction
- buildTrie :: Direction -> Int -> [[Token]] -> Trie Token ()
- class IsTrie trie where
- entropyTrie :: Entropy e => (k -> Bool) -> trie k () -> trie k e
- nodeEntropy :: Entropy e => Getting e i e -> trie k i -> e
- nodeChild :: Ord k => k -> trie k e -> trie k e
- findTrie :: Ord k => [k] -> trie k e -> trie k e
- printTrie :: (Show i, Entropy e) => Getting e i e -> trie Token i -> IO ()
- evTrie :: Entropy e => Getting e i e -> Setter i o e e -> trie k i -> trie k o
- normalizeEntropy :: Entropy e => Getting e i e -> ModEntropy i o e -> trie k i -> trie k o
- levels :: Trie k e -> [[Trie k e]]
- entropyLevels :: Entropy e => Getting e i e -> Trie k i -> [[e]]
- normalizationLevels :: Entropy e => Getting e i e -> Trie k i -> [(e, e, Int)]
- data Tries k e = Tries {}
- fwd :: forall k e. Lens' (Tries k e) (Trie k e)
- bwd :: forall k e. Lens' (Tries k e) (Trie k e)
- buildTries :: Int -> [[Token]] -> Tries Token ()
- onTries :: (Trie k i -> Trie k o) -> Tries k i -> Tries k o
- mayCons :: [a] -> [[a]] -> [[a]]
- split :: Entropy e => Int -> Lens' i e -> Tries Token i -> [Token] -> [[Text]]
- mainEleve :: Int -> [[Text]] -> [[[Text]]]
- mainEleve' :: Int -> [[Text]] -> [[Text]] -> [[[Text]]]
- mainEleve'' :: Int -> [[Text]] -> [[Text]] -> [[[Text]]]
- mainEleveWith :: Tries Token () -> Int -> [[Text]] -> [[[Text]]]
- type Checks e = [(Text, Int, e, e, e, e, e, e, e, e, e)]
- testEleve :: e ~ Double => Bool -> Int -> [Text] -> Checks e -> IO Bool
- example0 :: [Text]
- example1 :: [Text]
- example2 :: [Text]
- example3 :: [Text]
- example4 :: [Text]
- example5 :: [Text]
- example6 :: [Text]
- example7 :: [Text]
- example8 :: [Text]
- example9 :: [Text]
- checks0 :: Checks Double
- checks2 :: Checks Double
- checks7 :: Checks Double
- checks8 :: Checks Double
- checks9 :: Checks Double
- runTestsEleve :: Bool -> IO ()
Documentation
updateIfDefined :: RealFloat e => e -> e -> e #
type Entropy e = (Fractional e, Floating e, RealFloat e, Show e) #
TODO: the Show instance is only used for debugging
Example and tests for development
I | |
|
info_entropy_var :: forall e. Lens' (I e) e #
info_entropy :: forall e. Lens' (I e) e #
info_autonomy :: forall e. Lens' (I e) e #
type ModEntropy i o e = (e -> e) -> i -> o #
set_autonomy :: Entropy e => ModEntropy (I e) (I e) e #
set_entropy_var :: Entropy e => Setter e (I e) e e #
isTerminal :: Token -> Bool #
nonTerminals :: [Token] -> [Text] #
parseToken :: Text -> Token #
printToken :: Token -> Text #
Node | |
| |
Leaf | |
|
Instances
IsTrie Trie # | |
Defined in Gargantext.Core.Text.Terms.Eleve entropyTrie :: Entropy e => (k -> Bool) -> Trie k () -> Trie k e # nodeEntropy :: Entropy e => Getting e i e -> Trie k i -> e # nodeChild :: Ord k => k -> Trie k e -> Trie k e # findTrie :: Ord k => [k] -> Trie k e -> Trie k e # printTrie :: (Show i, Entropy e) => Getting e i e -> Trie Token i -> IO () # evTrie :: Entropy e => Getting e i e -> Setter i o e e -> Trie k i -> Trie k o # normalizeEntropy :: Entropy e => Getting e i e -> ModEntropy i o e -> Trie k i -> Trie k o # | |
(Show e, Show k) => Show (Trie k e) # | |
node_entropy :: forall k e. Traversal' (Trie k e) e #
node_count :: forall k e. Lens' (Trie k e) Int #
node_children :: forall k e k. Traversal (Trie k e) (Trie k e) (Map k (Trie k e)) (Map k (Trie k e)) #
insertTrie :: Ord k => [k] -> Trie k () -> Trie k () #
normalizeLevel :: Entropy e => e -> e -> e -> e #
chunkAlongEleve :: Int -> [a] -> [[a]] #
entropyTrie :: Entropy e => (k -> Bool) -> trie k () -> trie k e #
nodeEntropy :: Entropy e => Getting e i e -> trie k i -> e #
nodeChild :: Ord k => k -> trie k e -> trie k e #
findTrie :: Ord k => [k] -> trie k e -> trie k e #
printTrie :: (Show i, Entropy e) => Getting e i e -> trie Token i -> IO () #
evTrie :: Entropy e => Getting e i e -> Setter i o e e -> trie k i -> trie k o #
normalizeEntropy :: Entropy e => Getting e i e -> ModEntropy i o e -> trie k i -> trie k o #
Instances
entropyLevels :: Entropy e => Getting e i e -> Trie k i -> [[e]] #
Instances
IsTrie Tries # | |
Defined in Gargantext.Core.Text.Terms.Eleve entropyTrie :: Entropy e => (k -> Bool) -> Tries k () -> Tries k e # nodeEntropy :: Entropy e => Getting e i e -> Tries k i -> e # nodeChild :: Ord k => k -> Tries k e -> Tries k e # findTrie :: Ord k => [k] -> Tries k e -> Tries k e # printTrie :: (Show i, Entropy e) => Getting e i e -> Tries Token i -> IO () # evTrie :: Entropy e => Getting e i e -> Setter i o e e -> Tries k i -> Tries k o # normalizeEntropy :: Entropy e => Getting e i e -> ModEntropy i o e -> Tries k i -> Tries k o # | |
(Show k, Show e) => Show (Tries k e) # | |
mainEleve'' :: Int -> [[Text]] -> [[Text]] -> [[[Text]]] #
This function should take the longest possible chain of the following results:
    mainEleve'' n x y = maxChainSizeOf [ mainEleve' n x y , mainEleve' n x x , mainEleve' n y y ]
runTestsEleve :: Bool -> IO () #