{-|
Module      : Gargantext.Core.Text.Corpus.Parsers
Description : All parsers of Gargantext in one file.
Copyright   : (c) CNRS, 2017
License     : AGPL + CECILL v3
Maintainer  : team@gargantext.org
Stability   : experimental
Portability : POSIX

Gargantext enables analyzing semi-structured text that should be parsed
in order to be analyzed.

The parsers suppose we know the format of the Text (TextFormat data
type) according to which the right parser is chosen among the list of
available parsers.

This module mainly describe how to add a new parser to Gargantext,
please follow the types.
-}

{-# LANGUAGE PackageImports    #-}

module Gargantext.Core.Text.Corpus.Parsers (FileFormat(..), clean, parseFile, cleanText, parseFormat)
    where

import "zip" Codec.Archive.Zip (withArchive, getEntry, getEntries)
import Control.Concurrent.Async as CCA (mapConcurrently)
import Data.Attoparsec.ByteString (parseOnly, Parser)
import Control.Monad (join)
import Data.Either(Either(..))
import Data.Either.Extra (partitionEithers)
import Data.List (concat, lookup)
import Data.Ord()
import Data.String (String())
import Data.String()
import Data.Text (Text)
import Data.Text.Encoding (decodeUtf8)
import Data.Tuple.Extra (both, first, second)
import System.FilePath (FilePath(), takeExtension)
import qualified Data.ByteString       as DB
import qualified Data.ByteString.Char8 as DBC
import qualified Data.ByteString.Lazy  as DBL
import qualified Data.Map              as DM
import qualified Data.Text             as DT
import qualified Prelude as Prelude
import System.IO.Temp (emptySystemTempFile)

import Gargantext.Core (Lang(..))
import Gargantext.Database.Admin.Types.Hyperdata (HyperdataDocument(..))
import Gargantext.Prelude
import Gargantext.Core.Text.Corpus.Parsers.CSV (parseHal, parseHal', parseCsv, parseCsv')
import Gargantext.Core.Text.Corpus.Parsers.RIS.Presse (presseEnrich)
-- import Gargantext.Core.Text.Learn (detectLangDefault)
import qualified Gargantext.Core.Text.Corpus.Parsers.Date as Date
import qualified Gargantext.Core.Text.Corpus.Parsers.RIS  as RIS
import qualified Gargantext.Core.Text.Corpus.Parsers.WOS  as WOS
------------------------------------------------------------------------

type ParseError = String
--type Field      = Text
--type Document   = DM.Map Field Text
--type FilesParsed = DM.Map FilePath FileParsed
--data FileParsed  = FileParsed { _fileParsed_errors ::  Maybe ParseError
--                              , _fileParsed_result :: [Document]
--                              } deriving (Show)


-- | According to the format of Input file,
-- different parser are available.
data FileFormat = WOS | RIS | RisPresse
                | CsvGargV3 | CsvHal
                | ZIP
  deriving (Int -> FileFormat -> ShowS
[FileFormat] -> ShowS
FileFormat -> String
(Int -> FileFormat -> ShowS)
-> (FileFormat -> String)
-> ([FileFormat] -> ShowS)
-> Show FileFormat
forall a.
(Int -> a -> ShowS) -> (a -> String) -> ([a] -> ShowS) -> Show a
showList :: [FileFormat] -> ShowS
$cshowList :: [FileFormat] -> ShowS
show :: FileFormat -> String
$cshow :: FileFormat -> String
showsPrec :: Int -> FileFormat -> ShowS
$cshowsPrec :: Int -> FileFormat -> ShowS
Show)

-- Implemented (ISI Format)
--                | DOC        -- Not Implemented / import Pandoc
--                | ODT        -- Not Implemented / import Pandoc
--                | PDF        -- Not Implemented / pdftotext and import Pandoc ?
--                | XML        -- Not Implemented / see :


parseFormat :: FileFormat -> DB.ByteString -> IO (Either Prelude.String [HyperdataDocument])
parseFormat :: FileFormat -> ByteString -> IO (Either String [HyperdataDocument])
parseFormat FileFormat
CsvGargV3 ByteString
bs = Either String [HyperdataDocument]
-> IO (Either String [HyperdataDocument])
forall (f :: * -> *) a. Applicative f => a -> f a
pure (Either String [HyperdataDocument]
 -> IO (Either String [HyperdataDocument]))
-> Either String [HyperdataDocument]
-> IO (Either String [HyperdataDocument])
forall a b. (a -> b) -> a -> b
$ ByteString -> Either String [HyperdataDocument]
parseCsv' (ByteString -> Either String [HyperdataDocument])
-> ByteString -> Either String [HyperdataDocument]
forall a b. (a -> b) -> a -> b
$ ByteString -> ByteString
DBL.fromStrict ByteString
bs
parseFormat FileFormat
CsvHal    ByteString
bs = Either String [HyperdataDocument]
-> IO (Either String [HyperdataDocument])
forall (f :: * -> *) a. Applicative f => a -> f a
pure (Either String [HyperdataDocument]
 -> IO (Either String [HyperdataDocument]))
-> Either String [HyperdataDocument]
-> IO (Either String [HyperdataDocument])
forall a b. (a -> b) -> a -> b
$ ByteString -> Either String [HyperdataDocument]
parseHal' (ByteString -> Either String [HyperdataDocument])
-> ByteString -> Either String [HyperdataDocument]
forall a b. (a -> b) -> a -> b
$ ByteString -> ByteString
DBL.fromStrict ByteString
bs
parseFormat FileFormat
RisPresse ByteString
bs = do
  [HyperdataDocument]
docs <- ([(Text, Text)] -> IO HyperdataDocument)
-> [[(Text, Text)]] -> IO [HyperdataDocument]
forall (t :: * -> *) (m :: * -> *) a b.
(Traversable t, Monad m) =>
(a -> m b) -> t a -> m (t b)
mapM (FileFormat -> [(Text, Text)] -> IO HyperdataDocument
toDoc FileFormat
RIS)
          ([[(Text, Text)]] -> IO [HyperdataDocument])
-> (([String], [[(Text, Text)]]) -> [[(Text, Text)]])
-> ([String], [[(Text, Text)]])
-> IO [HyperdataDocument]
forall (f :: * -> *) a b. Functor f => (a -> b) -> f a -> f b
<$> ([String], [[(Text, Text)]]) -> [[(Text, Text)]]
forall a b. (a, b) -> b
snd
          (([String], [[(Text, Text)]]) -> IO [HyperdataDocument])
-> (([String], [[[(ByteString, ByteString)]]])
    -> ([String], [[(Text, Text)]]))
-> ([String], [[[(ByteString, ByteString)]]])
-> IO [HyperdataDocument]
forall (f :: * -> *) a b. Functor f => (a -> b) -> f a -> f b
<$> FileFormat
-> ([String], [[[(ByteString, ByteString)]]])
-> ([String], [[(Text, Text)]])
forall a.
FileFormat
-> (a, [[[(ByteString, ByteString)]]]) -> (a, [[(Text, Text)]])
enrichWith FileFormat
RisPresse
          (([String], [[[(ByteString, ByteString)]]])
 -> IO [HyperdataDocument])
-> ([String], [[[(ByteString, ByteString)]]])
-> IO [HyperdataDocument]
forall a b. (a -> b) -> a -> b
$ [Either String [[(ByteString, ByteString)]]]
-> ([String], [[[(ByteString, ByteString)]]])
forall a b. [Either a b] -> ([a], [b])
partitionEithers
          ([Either String [[(ByteString, ByteString)]]]
 -> ([String], [[[(ByteString, ByteString)]]]))
-> [Either String [[(ByteString, ByteString)]]]
-> ([String], [[[(ByteString, ByteString)]]])
forall a b. (a -> b) -> a -> b
$ [FileFormat
-> ByteString -> Either String [[(ByteString, ByteString)]]
runParser'  FileFormat
RisPresse ByteString
bs]
  Either String [HyperdataDocument]
-> IO (Either String [HyperdataDocument])
forall (f :: * -> *) a. Applicative f => a -> f a
pure (Either String [HyperdataDocument]
 -> IO (Either String [HyperdataDocument]))
-> Either String [HyperdataDocument]
-> IO (Either String [HyperdataDocument])
forall a b. (a -> b) -> a -> b
$ [HyperdataDocument] -> Either String [HyperdataDocument]
forall a b. b -> Either a b
Right [HyperdataDocument]
docs
parseFormat FileFormat
WOS ByteString
bs = do
  [HyperdataDocument]
docs <- ([(Text, Text)] -> IO HyperdataDocument)
-> [[(Text, Text)]] -> IO [HyperdataDocument]
forall (t :: * -> *) (m :: * -> *) a b.
(Traversable t, Monad m) =>
(a -> m b) -> t a -> m (t b)
mapM (FileFormat -> [(Text, Text)] -> IO HyperdataDocument
toDoc FileFormat
WOS)
          ([[(Text, Text)]] -> IO [HyperdataDocument])
-> (([String], [[(Text, Text)]]) -> [[(Text, Text)]])
-> ([String], [[(Text, Text)]])
-> IO [HyperdataDocument]
forall (f :: * -> *) a b. Functor f => (a -> b) -> f a -> f b
<$> ([String], [[(Text, Text)]]) -> [[(Text, Text)]]
forall a b. (a, b) -> b
snd
          (([String], [[(Text, Text)]]) -> IO [HyperdataDocument])
-> (([String], [[[(ByteString, ByteString)]]])
    -> ([String], [[(Text, Text)]]))
-> ([String], [[[(ByteString, ByteString)]]])
-> IO [HyperdataDocument]
forall (f :: * -> *) a b. Functor f => (a -> b) -> f a -> f b
<$> FileFormat
-> ([String], [[[(ByteString, ByteString)]]])
-> ([String], [[(Text, Text)]])
forall a.
FileFormat
-> (a, [[[(ByteString, ByteString)]]]) -> (a, [[(Text, Text)]])
enrichWith FileFormat
WOS
          (([String], [[[(ByteString, ByteString)]]])
 -> IO [HyperdataDocument])
-> ([String], [[[(ByteString, ByteString)]]])
-> IO [HyperdataDocument]
forall a b. (a -> b) -> a -> b
$ [Either String [[(ByteString, ByteString)]]]
-> ([String], [[[(ByteString, ByteString)]]])
forall a b. [Either a b] -> ([a], [b])
partitionEithers
          ([Either String [[(ByteString, ByteString)]]]
 -> ([String], [[[(ByteString, ByteString)]]]))
-> [Either String [[(ByteString, ByteString)]]]
-> ([String], [[[(ByteString, ByteString)]]])
forall a b. (a -> b) -> a -> b
$ [FileFormat
-> ByteString -> Either String [[(ByteString, ByteString)]]
runParser'  FileFormat
WOS ByteString
bs]
  Either String [HyperdataDocument]
-> IO (Either String [HyperdataDocument])
forall (f :: * -> *) a. Applicative f => a -> f a
pure (Either String [HyperdataDocument]
 -> IO (Either String [HyperdataDocument]))
-> Either String [HyperdataDocument]
-> IO (Either String [HyperdataDocument])
forall a b. (a -> b) -> a -> b
$ [HyperdataDocument] -> Either String [HyperdataDocument]
forall a b. b -> Either a b
Right [HyperdataDocument]
docs
parseFormat FileFormat
ZIP ByteString
bs = do
  String
path <- String -> IO String
emptySystemTempFile String
"parsed-zip"
  String -> ByteString -> IO ()
DB.writeFile String
path ByteString
bs
  [EntrySelector]
parsedZip <- String -> ZipArchive [EntrySelector] -> IO [EntrySelector]
forall (m :: * -> *) a. MonadIO m => String -> ZipArchive a -> m a
withArchive String
path (ZipArchive [EntrySelector] -> IO [EntrySelector])
-> ZipArchive [EntrySelector] -> IO [EntrySelector]
forall a b. (a -> b) -> a -> b
$ do
    Map EntrySelector EntryDescription -> [EntrySelector]
forall k a. Map k a -> [k]
DM.keys (Map EntrySelector EntryDescription -> [EntrySelector])
-> ZipArchive (Map EntrySelector EntryDescription)
-> ZipArchive [EntrySelector]
forall (f :: * -> *) a b. Functor f => (a -> b) -> f a -> f b
<$> ZipArchive (Map EntrySelector EntryDescription)
getEntries
  Either String [HyperdataDocument]
-> IO (Either String [HyperdataDocument])
forall (f :: * -> *) a. Applicative f => a -> f a
pure (Either String [HyperdataDocument]
 -> IO (Either String [HyperdataDocument]))
-> Either String [HyperdataDocument]
-> IO (Either String [HyperdataDocument])
forall a b. (a -> b) -> a -> b
$ String -> Either String [HyperdataDocument]
forall a b. a -> Either a b
Left (String -> Either String [HyperdataDocument])
-> String -> Either String [HyperdataDocument]
forall a b. (a -> b) -> a -> b
$ String
"Not implemented for ZIP, parsedZip" String -> ShowS
forall a. Semigroup a => a -> a -> a
<> [EntrySelector] -> String
forall a. Show a => a -> String
show [EntrySelector]
parsedZip
parseFormat FileFormat
_ ByteString
_ = IO (Either String [HyperdataDocument])
forall a. HasCallStack => a
undefined

-- | Parse file into documents
-- TODO manage errors here
-- TODO: to debug maybe add the filepath in error message
parseFile :: FileFormat -> FilePath -> IO (Either Prelude.String [HyperdataDocument])
parseFile :: FileFormat -> String -> IO (Either String [HyperdataDocument])
parseFile FileFormat
CsvHal    String
p = String -> IO (Either String [HyperdataDocument])
parseHal String
p
parseFile FileFormat
CsvGargV3 String
p = String -> IO (Either String [HyperdataDocument])
parseCsv String
p
parseFile FileFormat
RisPresse String
p = do
  [HyperdataDocument]
docs <- IO (IO [HyperdataDocument]) -> IO [HyperdataDocument]
forall (m :: * -> *) a. Monad m => m (m a) -> m a
join (IO (IO [HyperdataDocument]) -> IO [HyperdataDocument])
-> IO (IO [HyperdataDocument]) -> IO [HyperdataDocument]
forall a b. (a -> b) -> a -> b
$ ([(Text, Text)] -> IO HyperdataDocument)
-> [[(Text, Text)]] -> IO [HyperdataDocument]
forall (t :: * -> *) (m :: * -> *) a b.
(Traversable t, Monad m) =>
(a -> m b) -> t a -> m (t b)
mapM (FileFormat -> [(Text, Text)] -> IO HyperdataDocument
toDoc FileFormat
RIS) ([[(Text, Text)]] -> IO [HyperdataDocument])
-> (([String], [[(Text, Text)]]) -> [[(Text, Text)]])
-> ([String], [[(Text, Text)]])
-> IO [HyperdataDocument]
forall (f :: * -> *) a b. Functor f => (a -> b) -> f a -> f b
<$> ([String], [[(Text, Text)]]) -> [[(Text, Text)]]
forall a b. (a, b) -> b
snd (([String], [[(Text, Text)]]) -> IO [HyperdataDocument])
-> (([String], [[[(ByteString, ByteString)]]])
    -> ([String], [[(Text, Text)]]))
-> ([String], [[[(ByteString, ByteString)]]])
-> IO [HyperdataDocument]
forall (f :: * -> *) a b. Functor f => (a -> b) -> f a -> f b
<$> FileFormat
-> ([String], [[[(ByteString, ByteString)]]])
-> ([String], [[(Text, Text)]])
forall a.
FileFormat
-> (a, [[[(ByteString, ByteString)]]]) -> (a, [[(Text, Text)]])
enrichWith FileFormat
RisPresse (([String], [[[(ByteString, ByteString)]]])
 -> IO [HyperdataDocument])
-> IO ([String], [[[(ByteString, ByteString)]]])
-> IO (IO [HyperdataDocument])
forall (f :: * -> *) a b. Functor f => (a -> b) -> f a -> f b
<$> FileFormat
-> String -> IO ([String], [[[(ByteString, ByteString)]]])
readFileWith FileFormat
RIS String
p
  Either String [HyperdataDocument]
-> IO (Either String [HyperdataDocument])
forall (f :: * -> *) a. Applicative f => a -> f a
pure (Either String [HyperdataDocument]
 -> IO (Either String [HyperdataDocument]))
-> Either String [HyperdataDocument]
-> IO (Either String [HyperdataDocument])
forall a b. (a -> b) -> a -> b
$ [HyperdataDocument] -> Either String [HyperdataDocument]
forall a b. b -> Either a b
Right [HyperdataDocument]
docs
parseFile FileFormat
WOS       String
p = do
  [HyperdataDocument]
docs <- IO (IO [HyperdataDocument]) -> IO [HyperdataDocument]
forall (m :: * -> *) a. Monad m => m (m a) -> m a
join (IO (IO [HyperdataDocument]) -> IO [HyperdataDocument])
-> IO (IO [HyperdataDocument]) -> IO [HyperdataDocument]
forall a b. (a -> b) -> a -> b
$ ([(Text, Text)] -> IO HyperdataDocument)
-> [[(Text, Text)]] -> IO [HyperdataDocument]
forall (t :: * -> *) (m :: * -> *) a b.
(Traversable t, Monad m) =>
(a -> m b) -> t a -> m (t b)
mapM (FileFormat -> [(Text, Text)] -> IO HyperdataDocument
toDoc FileFormat
WOS) ([[(Text, Text)]] -> IO [HyperdataDocument])
-> (([String], [[(Text, Text)]]) -> [[(Text, Text)]])
-> ([String], [[(Text, Text)]])
-> IO [HyperdataDocument]
forall (f :: * -> *) a b. Functor f => (a -> b) -> f a -> f b
<$> ([String], [[(Text, Text)]]) -> [[(Text, Text)]]
forall a b. (a, b) -> b
snd (([String], [[(Text, Text)]]) -> IO [HyperdataDocument])
-> (([String], [[[(ByteString, ByteString)]]])
    -> ([String], [[(Text, Text)]]))
-> ([String], [[[(ByteString, ByteString)]]])
-> IO [HyperdataDocument]
forall (f :: * -> *) a b. Functor f => (a -> b) -> f a -> f b
<$> FileFormat
-> ([String], [[[(ByteString, ByteString)]]])
-> ([String], [[(Text, Text)]])
forall a.
FileFormat
-> (a, [[[(ByteString, ByteString)]]]) -> (a, [[(Text, Text)]])
enrichWith FileFormat
WOS       (([String], [[[(ByteString, ByteString)]]])
 -> IO [HyperdataDocument])
-> IO ([String], [[[(ByteString, ByteString)]]])
-> IO (IO [HyperdataDocument])
forall (f :: * -> *) a b. Functor f => (a -> b) -> f a -> f b
<$> FileFormat
-> String -> IO ([String], [[[(ByteString, ByteString)]]])
readFileWith FileFormat
WOS String
p
  Either String [HyperdataDocument]
-> IO (Either String [HyperdataDocument])
forall (f :: * -> *) a. Applicative f => a -> f a
pure (Either String [HyperdataDocument]
 -> IO (Either String [HyperdataDocument]))
-> Either String [HyperdataDocument]
-> IO (Either String [HyperdataDocument])
forall a b. (a -> b) -> a -> b
$ [HyperdataDocument] -> Either String [HyperdataDocument]
forall a b. b -> Either a b
Right [HyperdataDocument]
docs
parseFile FileFormat
ff        String
p = do
  [HyperdataDocument]
docs <- IO (IO [HyperdataDocument]) -> IO [HyperdataDocument]
forall (m :: * -> *) a. Monad m => m (m a) -> m a
join (IO (IO [HyperdataDocument]) -> IO [HyperdataDocument])
-> IO (IO [HyperdataDocument]) -> IO [HyperdataDocument]
forall a b. (a -> b) -> a -> b
$ ([(Text, Text)] -> IO HyperdataDocument)
-> [[(Text, Text)]] -> IO [HyperdataDocument]
forall (t :: * -> *) (m :: * -> *) a b.
(Traversable t, Monad m) =>
(a -> m b) -> t a -> m (t b)
mapM (FileFormat -> [(Text, Text)] -> IO HyperdataDocument
toDoc FileFormat
ff)  ([[(Text, Text)]] -> IO [HyperdataDocument])
-> (([String], [[(Text, Text)]]) -> [[(Text, Text)]])
-> ([String], [[(Text, Text)]])
-> IO [HyperdataDocument]
forall (f :: * -> *) a b. Functor f => (a -> b) -> f a -> f b
<$> ([String], [[(Text, Text)]]) -> [[(Text, Text)]]
forall a b. (a, b) -> b
snd (([String], [[(Text, Text)]]) -> IO [HyperdataDocument])
-> (([String], [[[(ByteString, ByteString)]]])
    -> ([String], [[(Text, Text)]]))
-> ([String], [[[(ByteString, ByteString)]]])
-> IO [HyperdataDocument]
forall (f :: * -> *) a b. Functor f => (a -> b) -> f a -> f b
<$> FileFormat
-> ([String], [[[(ByteString, ByteString)]]])
-> ([String], [[(Text, Text)]])
forall a.
FileFormat
-> (a, [[[(ByteString, ByteString)]]]) -> (a, [[(Text, Text)]])
enrichWith FileFormat
ff        (([String], [[[(ByteString, ByteString)]]])
 -> IO [HyperdataDocument])
-> IO ([String], [[[(ByteString, ByteString)]]])
-> IO (IO [HyperdataDocument])
forall (f :: * -> *) a b. Functor f => (a -> b) -> f a -> f b
<$> FileFormat
-> String -> IO ([String], [[[(ByteString, ByteString)]]])
readFileWith FileFormat
ff  String
p
  Either String [HyperdataDocument]
-> IO (Either String [HyperdataDocument])
forall (f :: * -> *) a. Applicative f => a -> f a
pure (Either String [HyperdataDocument]
 -> IO (Either String [HyperdataDocument]))
-> Either String [HyperdataDocument]
-> IO (Either String [HyperdataDocument])
forall a b. (a -> b) -> a -> b
$ [HyperdataDocument] -> Either String [HyperdataDocument]
forall a b. b -> Either a b
Right [HyperdataDocument]
docs

toDoc :: FileFormat -> [(Text, Text)] -> IO HyperdataDocument
-- TODO use language for RIS
toDoc :: FileFormat -> [(Text, Text)] -> IO HyperdataDocument
toDoc FileFormat
ff [(Text, Text)]
d = do
      -- let abstract = lookup "abstract" d
      let lang :: Lang
lang = Lang
EN -- maybe EN identity (join $ detectLangDefault <$> (fmap (DT.take 50) abstract))

      let dateToParse :: Maybe Text
dateToParse = Text -> Text -> Text -> Text
DT.replace Text
"-" Text
" " (Text -> Text) -> Maybe Text -> Maybe Text
forall (f :: * -> *) a b. Functor f => (a -> b) -> f a -> f b
<$> Text -> [(Text, Text)] -> Maybe Text
forall a b. Eq a => a -> [(a, b)] -> Maybe b
lookup Text
"PY" [(Text, Text)]
d Maybe Text -> Maybe Text -> Maybe Text
forall a. Semigroup a => a -> a -> a
<> Text -> Maybe Text
forall a. a -> Maybe a
Just Text
" " Maybe Text -> Maybe Text -> Maybe Text
forall a. Semigroup a => a -> a -> a
<> Text -> [(Text, Text)] -> Maybe Text
forall a b. Eq a => a -> [(a, b)] -> Maybe b
lookup Text
"publication_date" [(Text, Text)]
d

      (Maybe UTCTime
utcTime, (Maybe Int
pub_year, Maybe Int
pub_month, Maybe Int
pub_day)) <- Lang
-> Maybe Text
-> IO (Maybe UTCTime, (Maybe Int, Maybe Int, Maybe Int))
Date.dateSplit Lang
lang  Maybe Text
dateToParse

      HyperdataDocument -> IO HyperdataDocument
forall (f :: * -> *) a. Applicative f => a -> f a
pure (HyperdataDocument -> IO HyperdataDocument)
-> HyperdataDocument -> IO HyperdataDocument
forall a b. (a -> b) -> a -> b
$ HyperdataDocument :: Maybe Text
-> Maybe Text
-> Maybe Text
-> Maybe Text
-> Maybe Text
-> Maybe Int
-> Maybe Text
-> Maybe Text
-> Maybe Text
-> Maybe Text
-> Maybe Text
-> Maybe Text
-> Maybe Int
-> Maybe Int
-> Maybe Int
-> Maybe Int
-> Maybe Int
-> Maybe Int
-> Maybe Text
-> HyperdataDocument
HyperdataDocument { _hd_bdd :: Maybe Text
_hd_bdd = Text -> Maybe Text
forall a. a -> Maybe a
Just (Text -> Maybe Text) -> Text -> Maybe Text
forall a b. (a -> b) -> a -> b
$ String -> Text
DT.pack (String -> Text) -> String -> Text
forall a b. (a -> b) -> a -> b
$ FileFormat -> String
forall a. Show a => a -> String
show FileFormat
ff
                               , _hd_doi :: Maybe Text
_hd_doi = Text -> [(Text, Text)] -> Maybe Text
forall a b. Eq a => a -> [(a, b)] -> Maybe b
lookup Text
"doi" [(Text, Text)]
d
                               , _hd_url :: Maybe Text
_hd_url = Text -> [(Text, Text)] -> Maybe Text
forall a b. Eq a => a -> [(a, b)] -> Maybe b
lookup Text
"URL" [(Text, Text)]
d
                               , _hd_uniqId :: Maybe Text
_hd_uniqId = Maybe Text
forall a. Maybe a
Nothing
                               , _hd_uniqIdBdd :: Maybe Text
_hd_uniqIdBdd = Maybe Text
forall a. Maybe a
Nothing
                               , _hd_page :: Maybe Int
_hd_page = Maybe Int
forall a. Maybe a
Nothing
                               , _hd_title :: Maybe Text
_hd_title = Text -> [(Text, Text)] -> Maybe Text
forall a b. Eq a => a -> [(a, b)] -> Maybe b
lookup Text
"title" [(Text, Text)]
d
                               , _hd_authors :: Maybe Text
_hd_authors = Maybe Text
forall a. Maybe a
Nothing
                               , _hd_institutes :: Maybe Text
_hd_institutes = Text -> [(Text, Text)] -> Maybe Text
forall a b. Eq a => a -> [(a, b)] -> Maybe b
lookup Text
"authors" [(Text, Text)]
d
                               , _hd_source :: Maybe Text
_hd_source = Text -> [(Text, Text)] -> Maybe Text
forall a b. Eq a => a -> [(a, b)] -> Maybe b
lookup Text
"source" [(Text, Text)]
d
                               , _hd_abstract :: Maybe Text
_hd_abstract = Text -> [(Text, Text)] -> Maybe Text
forall a b. Eq a => a -> [(a, b)] -> Maybe b
lookup Text
"abstract" [(Text, Text)]
d
                               , _hd_publication_date :: Maybe Text
_hd_publication_date = (UTCTime -> Text) -> Maybe UTCTime -> Maybe Text
forall (f :: * -> *) a b. Functor f => (a -> b) -> f a -> f b
fmap (String -> Text
DT.pack (String -> Text) -> (UTCTime -> String) -> UTCTime -> Text
forall b c a. (b -> c) -> (a -> b) -> a -> c
. UTCTime -> String
forall a. Show a => a -> String
show) Maybe UTCTime
utcTime
                               , _hd_publication_year :: Maybe Int
_hd_publication_year = Maybe Int
pub_year
                               , _hd_publication_month :: Maybe Int
_hd_publication_month = Maybe Int
pub_month
                               , _hd_publication_day :: Maybe Int
_hd_publication_day = Maybe Int
pub_day
                               , _hd_publication_hour :: Maybe Int
_hd_publication_hour = Maybe Int
forall a. Maybe a
Nothing
                               , _hd_publication_minute :: Maybe Int
_hd_publication_minute = Maybe Int
forall a. Maybe a
Nothing
                               , _hd_publication_second :: Maybe Int
_hd_publication_second = Maybe Int
forall a. Maybe a
Nothing
                               , _hd_language_iso2 :: Maybe Text
_hd_language_iso2 = Text -> Maybe Text
forall a. a -> Maybe a
Just (Text -> Maybe Text) -> Text -> Maybe Text
forall a b. (a -> b) -> a -> b
$ (String -> Text
DT.pack (String -> Text) -> (Lang -> String) -> Lang -> Text
forall b c a. (b -> c) -> (a -> b) -> a -> c
. Lang -> String
forall a. Show a => a -> String
show) Lang
lang }

enrichWith :: FileFormat
           ->  (a, [[[(DB.ByteString, DB.ByteString)]]]) -> (a, [[(Text, Text)]])
enrichWith :: FileFormat
-> (a, [[[(ByteString, ByteString)]]]) -> (a, [[(Text, Text)]])
enrichWith FileFormat
RisPresse = ([(ByteString, ByteString)] -> [(ByteString, ByteString)])
-> (a, [[[(ByteString, ByteString)]]]) -> (a, [[(Text, Text)]])
forall a.
([(ByteString, ByteString)] -> [(ByteString, ByteString)])
-> (a, [[[(ByteString, ByteString)]]]) -> (a, [[(Text, Text)]])
enrichWith' [(ByteString, ByteString)] -> [(ByteString, ByteString)]
presseEnrich
enrichWith FileFormat
WOS       = ([(ByteString, ByteString)] -> [(ByteString, ByteString)])
-> (a, [[[(ByteString, ByteString)]]]) -> (a, [[(Text, Text)]])
forall a.
([(ByteString, ByteString)] -> [(ByteString, ByteString)])
-> (a, [[[(ByteString, ByteString)]]]) -> (a, [[(Text, Text)]])
enrichWith' (((ByteString, ByteString) -> (ByteString, ByteString))
-> [(ByteString, ByteString)] -> [(ByteString, ByteString)]
forall (f :: * -> *) a b. Functor f => (a -> b) -> f a -> f b
map ((ByteString -> ByteString)
-> (ByteString, ByteString) -> (ByteString, ByteString)
forall a a' b. (a -> a') -> (a, b) -> (a', b)
first ByteString -> ByteString
WOS.keys))
enrichWith FileFormat
_         = ([(ByteString, ByteString)] -> [(ByteString, ByteString)])
-> (a, [[[(ByteString, ByteString)]]]) -> (a, [[(Text, Text)]])
forall a.
([(ByteString, ByteString)] -> [(ByteString, ByteString)])
-> (a, [[[(ByteString, ByteString)]]]) -> (a, [[(Text, Text)]])
enrichWith' [(ByteString, ByteString)] -> [(ByteString, ByteString)]
forall a. a -> a
identity


enrichWith' ::      ([(DB.ByteString, DB.ByteString)] -> [(DB.ByteString, DB.ByteString)])
           ->  (a, [[[(DB.ByteString, DB.ByteString)]]]) -> (a, [[(Text, Text)]])
enrichWith' :: ([(ByteString, ByteString)] -> [(ByteString, ByteString)])
-> (a, [[[(ByteString, ByteString)]]]) -> (a, [[(Text, Text)]])
enrichWith' [(ByteString, ByteString)] -> [(ByteString, ByteString)]
f = ([[[(ByteString, ByteString)]]] -> [[(Text, Text)]])
-> (a, [[[(ByteString, ByteString)]]]) -> (a, [[(Text, Text)]])
forall b b' a. (b -> b') -> (a, b) -> (a, b')
second (([(ByteString, ByteString)] -> [(Text, Text)])
-> [[(ByteString, ByteString)]] -> [[(Text, Text)]]
forall (f :: * -> *) a b. Functor f => (a -> b) -> f a -> f b
map [(ByteString, ByteString)] -> [(Text, Text)]
both' ([[(ByteString, ByteString)]] -> [[(Text, Text)]])
-> ([[[(ByteString, ByteString)]]] -> [[(ByteString, ByteString)]])
-> [[[(ByteString, ByteString)]]]
-> [[(Text, Text)]]
forall b c a. (b -> c) -> (a -> b) -> a -> c
. ([(ByteString, ByteString)] -> [(ByteString, ByteString)])
-> [[(ByteString, ByteString)]] -> [[(ByteString, ByteString)]]
forall (f :: * -> *) a b. Functor f => (a -> b) -> f a -> f b
map [(ByteString, ByteString)] -> [(ByteString, ByteString)]
f ([[(ByteString, ByteString)]] -> [[(ByteString, ByteString)]])
-> ([[[(ByteString, ByteString)]]] -> [[(ByteString, ByteString)]])
-> [[[(ByteString, ByteString)]]]
-> [[(ByteString, ByteString)]]
forall b c a. (b -> c) -> (a -> b) -> a -> c
. [[[(ByteString, ByteString)]]] -> [[(ByteString, ByteString)]]
forall (t :: * -> *) a. Foldable t => t [a] -> [a]
concat)
  where
    both' :: [(ByteString, ByteString)] -> [(Text, Text)]
both'   = ((ByteString, ByteString) -> (Text, Text))
-> [(ByteString, ByteString)] -> [(Text, Text)]
forall (f :: * -> *) a b. Functor f => (a -> b) -> f a -> f b
map ((ByteString -> Text) -> (ByteString, ByteString) -> (Text, Text)
forall a b. (a -> b) -> (a, a) -> (b, b)
both ByteString -> Text
decodeUtf8)



readFileWith :: FileFormat -> FilePath
       -> IO ([ParseError], [[[(DB.ByteString, DB.ByteString)]]])
readFileWith :: FileFormat
-> String -> IO ([String], [[[(ByteString, ByteString)]]])
readFileWith FileFormat
format String
path = do
    [ByteString]
files <- case ShowS
takeExtension String
path of
              String
".zip" -> String -> IO [ByteString]
openZip              String
path
              String
_      -> ByteString -> [ByteString]
forall (f :: * -> *) a. Applicative f => a -> f a
pure (ByteString -> [ByteString])
-> (ByteString -> ByteString) -> ByteString -> [ByteString]
forall (f :: * -> *) a b. Functor f => (a -> b) -> f a -> f b
<$> ByteString -> ByteString
clean (ByteString -> [ByteString]) -> IO ByteString -> IO [ByteString]
forall (f :: * -> *) a b. Functor f => (a -> b) -> f a -> f b
<$> String -> IO ByteString
DB.readFile String
path
    [Either String [[(ByteString, ByteString)]]]
-> ([String], [[[(ByteString, ByteString)]]])
forall a b. [Either a b] -> ([a], [b])
partitionEithers ([Either String [[(ByteString, ByteString)]]]
 -> ([String], [[[(ByteString, ByteString)]]]))
-> IO [Either String [[(ByteString, ByteString)]]]
-> IO ([String], [[[(ByteString, ByteString)]]])
forall (f :: * -> *) a b. Functor f => (a -> b) -> f a -> f b
<$> (ByteString -> IO (Either String [[(ByteString, ByteString)]]))
-> [ByteString] -> IO [Either String [[(ByteString, ByteString)]]]
forall (t :: * -> *) a b.
Traversable t =>
(a -> IO b) -> t a -> IO (t b)
mapConcurrently (FileFormat
-> ByteString -> IO (Either String [[(ByteString, ByteString)]])
runParser FileFormat
format) [ByteString]
files


-- | withParser:
-- According to the format of the text, choose the right parser.
-- TODO  withParser :: FileFormat -> Parser [Document]
withParser :: FileFormat -> Parser [[(DB.ByteString, DB.ByteString)]]
withParser :: FileFormat -> Parser [[(ByteString, ByteString)]]
withParser FileFormat
WOS = Parser [[(ByteString, ByteString)]]
WOS.parser
withParser FileFormat
RIS = Parser [[(ByteString, ByteString)]]
RIS.parser
--withParser ODT = odtParser
--withParser XML = xmlParser
withParser FileFormat
_   = Text -> Parser [[(ByteString, ByteString)]]
forall a. HasCallStack => Text -> a
panic Text
"[ERROR] Parser not implemented yet"

runParser :: FileFormat -> DB.ByteString
          -> IO (Either String [[(DB.ByteString, DB.ByteString)]])
runParser :: FileFormat
-> ByteString -> IO (Either String [[(ByteString, ByteString)]])
runParser FileFormat
format ByteString
text = Either String [[(ByteString, ByteString)]]
-> IO (Either String [[(ByteString, ByteString)]])
forall (f :: * -> *) a. Applicative f => a -> f a
pure (Either String [[(ByteString, ByteString)]]
 -> IO (Either String [[(ByteString, ByteString)]]))
-> Either String [[(ByteString, ByteString)]]
-> IO (Either String [[(ByteString, ByteString)]])
forall a b. (a -> b) -> a -> b
$ FileFormat
-> ByteString -> Either String [[(ByteString, ByteString)]]
runParser' FileFormat
format ByteString
text

runParser' :: FileFormat -> DB.ByteString
          -> (Either String [[(DB.ByteString, DB.ByteString)]])
runParser' :: FileFormat
-> ByteString -> Either String [[(ByteString, ByteString)]]
runParser' FileFormat
format ByteString
text = Parser [[(ByteString, ByteString)]]
-> ByteString -> Either String [[(ByteString, ByteString)]]
forall a. Parser a -> ByteString -> Either String a
parseOnly (FileFormat -> Parser [[(ByteString, ByteString)]]
withParser FileFormat
format) ByteString
text

openZip :: FilePath -> IO [DB.ByteString]
openZip :: String -> IO [ByteString]
openZip String
fp = do
    [EntrySelector]
entries <- String -> ZipArchive [EntrySelector] -> IO [EntrySelector]
forall (m :: * -> *) a. MonadIO m => String -> ZipArchive a -> m a
withArchive String
fp (Map EntrySelector EntryDescription -> [EntrySelector]
forall k a. Map k a -> [k]
DM.keys (Map EntrySelector EntryDescription -> [EntrySelector])
-> ZipArchive (Map EntrySelector EntryDescription)
-> ZipArchive [EntrySelector]
forall (f :: * -> *) a b. Functor f => (a -> b) -> f a -> f b
<$> ZipArchive (Map EntrySelector EntryDescription)
getEntries)
    [ByteString]
bs      <- (EntrySelector -> IO ByteString)
-> [EntrySelector] -> IO [ByteString]
forall (t :: * -> *) a b.
Traversable t =>
(a -> IO b) -> t a -> IO (t b)
mapConcurrently (\EntrySelector
s -> String -> ZipArchive ByteString -> IO ByteString
forall (m :: * -> *) a. MonadIO m => String -> ZipArchive a -> m a
withArchive String
fp (EntrySelector -> ZipArchive ByteString
getEntry EntrySelector
s)) [EntrySelector]
entries
    [ByteString] -> IO [ByteString]
forall (f :: * -> *) a. Applicative f => a -> f a
pure [ByteString]
bs

cleanText :: Text -> Text
cleanText :: Text -> Text
cleanText = ByteString -> Text
forall a b. ConvertibleStrings a b => a -> b
cs (ByteString -> Text) -> (Text -> ByteString) -> Text -> Text
forall b c a. (b -> c) -> (a -> b) -> a -> c
. ByteString -> ByteString
clean (ByteString -> ByteString)
-> (Text -> ByteString) -> Text -> ByteString
forall b c a. (b -> c) -> (a -> b) -> a -> c
. Text -> ByteString
forall a b. ConvertibleStrings a b => a -> b
cs

clean :: DB.ByteString -> DB.ByteString
clean :: ByteString -> ByteString
clean ByteString
txt = (Char -> Char) -> ByteString -> ByteString
DBC.map Char -> Char
clean' ByteString
txt
  where
    clean' :: Char -> Char
clean' Char
'’' = Char
'\''
    clean' Char
'\r' = Char
' '
    clean' Char
'\t' = Char
' '
    clean' Char
';' = Char
'.'
    clean' Char
c  = Char
c