{-# LANGUAGE PackageImports #-}
module Gargantext.Core.Text.Corpus.Parsers (FileFormat(..), clean, parseFile, cleanText, parseFormat)
where
import "zip" Codec.Archive.Zip (withArchive, getEntry, getEntries)
import Control.Concurrent.Async as CCA (mapConcurrently)
import Data.Attoparsec.ByteString (parseOnly, Parser)
import Control.Monad (join)
import Data.Either(Either(..))
import Data.Either.Extra (partitionEithers)
import Data.List (concat, lookup)
import Data.Ord()
import Data.String (String())
import Data.String()
import Data.Text (Text)
import Data.Text.Encoding (decodeUtf8)
import Data.Tuple.Extra (both, first, second)
import System.FilePath (FilePath(), takeExtension)
import qualified Data.ByteString as DB
import qualified Data.ByteString.Char8 as DBC
import qualified Data.ByteString.Lazy as DBL
import qualified Data.Map as DM
import qualified Data.Text as DT
import qualified Prelude as Prelude
import System.IO.Temp (emptySystemTempFile)
import Gargantext.Core (Lang(..))
import Gargantext.Database.Admin.Types.Hyperdata (HyperdataDocument(..))
import Gargantext.Prelude
import Gargantext.Core.Text.Corpus.Parsers.CSV (parseHal, parseHal', parseCsv, parseCsv')
import Gargantext.Core.Text.Corpus.Parsers.RIS.Presse (presseEnrich)
import qualified Gargantext.Core.Text.Corpus.Parsers.Date as Date
import qualified Gargantext.Core.Text.Corpus.Parsers.RIS as RIS
import qualified Gargantext.Core.Text.Corpus.Parsers.WOS as WOS
type ParseError = String
data FileFormat = WOS | RIS | RisPresse
| CsvGargV3 | CsvHal
| ZIP
deriving (Int -> FileFormat -> ShowS
[FileFormat] -> ShowS
FileFormat -> String
(Int -> FileFormat -> ShowS)
-> (FileFormat -> String)
-> ([FileFormat] -> ShowS)
-> Show FileFormat
forall a.
(Int -> a -> ShowS) -> (a -> String) -> ([a] -> ShowS) -> Show a
showList :: [FileFormat] -> ShowS
$cshowList :: [FileFormat] -> ShowS
show :: FileFormat -> String
$cshow :: FileFormat -> String
showsPrec :: Int -> FileFormat -> ShowS
$cshowsPrec :: Int -> FileFormat -> ShowS
Show)
parseFormat :: FileFormat -> DB.ByteString -> IO (Either Prelude.String [HyperdataDocument])
parseFormat :: FileFormat -> ByteString -> IO (Either String [HyperdataDocument])
parseFormat FileFormat
CsvGargV3 ByteString
bs = Either String [HyperdataDocument]
-> IO (Either String [HyperdataDocument])
forall (f :: * -> *) a. Applicative f => a -> f a
pure (Either String [HyperdataDocument]
-> IO (Either String [HyperdataDocument]))
-> Either String [HyperdataDocument]
-> IO (Either String [HyperdataDocument])
forall a b. (a -> b) -> a -> b
$ ByteString -> Either String [HyperdataDocument]
parseCsv' (ByteString -> Either String [HyperdataDocument])
-> ByteString -> Either String [HyperdataDocument]
forall a b. (a -> b) -> a -> b
$ ByteString -> ByteString
DBL.fromStrict ByteString
bs
parseFormat FileFormat
CsvHal ByteString
bs = Either String [HyperdataDocument]
-> IO (Either String [HyperdataDocument])
forall (f :: * -> *) a. Applicative f => a -> f a
pure (Either String [HyperdataDocument]
-> IO (Either String [HyperdataDocument]))
-> Either String [HyperdataDocument]
-> IO (Either String [HyperdataDocument])
forall a b. (a -> b) -> a -> b
$ ByteString -> Either String [HyperdataDocument]
parseHal' (ByteString -> Either String [HyperdataDocument])
-> ByteString -> Either String [HyperdataDocument]
forall a b. (a -> b) -> a -> b
$ ByteString -> ByteString
DBL.fromStrict ByteString
bs
parseFormat FileFormat
RisPresse ByteString
bs = do
[HyperdataDocument]
docs <- ([(Text, Text)] -> IO HyperdataDocument)
-> [[(Text, Text)]] -> IO [HyperdataDocument]
forall (t :: * -> *) (m :: * -> *) a b.
(Traversable t, Monad m) =>
(a -> m b) -> t a -> m (t b)
mapM (FileFormat -> [(Text, Text)] -> IO HyperdataDocument
toDoc FileFormat
RIS)
([[(Text, Text)]] -> IO [HyperdataDocument])
-> (([String], [[(Text, Text)]]) -> [[(Text, Text)]])
-> ([String], [[(Text, Text)]])
-> IO [HyperdataDocument]
forall (f :: * -> *) a b. Functor f => (a -> b) -> f a -> f b
<$> ([String], [[(Text, Text)]]) -> [[(Text, Text)]]
forall a b. (a, b) -> b
snd
(([String], [[(Text, Text)]]) -> IO [HyperdataDocument])
-> (([String], [[[(ByteString, ByteString)]]])
-> ([String], [[(Text, Text)]]))
-> ([String], [[[(ByteString, ByteString)]]])
-> IO [HyperdataDocument]
forall (f :: * -> *) a b. Functor f => (a -> b) -> f a -> f b
<$> FileFormat
-> ([String], [[[(ByteString, ByteString)]]])
-> ([String], [[(Text, Text)]])
forall a.
FileFormat
-> (a, [[[(ByteString, ByteString)]]]) -> (a, [[(Text, Text)]])
enrichWith FileFormat
RisPresse
(([String], [[[(ByteString, ByteString)]]])
-> IO [HyperdataDocument])
-> ([String], [[[(ByteString, ByteString)]]])
-> IO [HyperdataDocument]
forall a b. (a -> b) -> a -> b
$ [Either String [[(ByteString, ByteString)]]]
-> ([String], [[[(ByteString, ByteString)]]])
forall a b. [Either a b] -> ([a], [b])
partitionEithers
([Either String [[(ByteString, ByteString)]]]
-> ([String], [[[(ByteString, ByteString)]]]))
-> [Either String [[(ByteString, ByteString)]]]
-> ([String], [[[(ByteString, ByteString)]]])
forall a b. (a -> b) -> a -> b
$ [FileFormat
-> ByteString -> Either String [[(ByteString, ByteString)]]
runParser' FileFormat
RisPresse ByteString
bs]
Either String [HyperdataDocument]
-> IO (Either String [HyperdataDocument])
forall (f :: * -> *) a. Applicative f => a -> f a
pure (Either String [HyperdataDocument]
-> IO (Either String [HyperdataDocument]))
-> Either String [HyperdataDocument]
-> IO (Either String [HyperdataDocument])
forall a b. (a -> b) -> a -> b
$ [HyperdataDocument] -> Either String [HyperdataDocument]
forall a b. b -> Either a b
Right [HyperdataDocument]
docs
parseFormat FileFormat
WOS ByteString
bs = do
[HyperdataDocument]
docs <- ([(Text, Text)] -> IO HyperdataDocument)
-> [[(Text, Text)]] -> IO [HyperdataDocument]
forall (t :: * -> *) (m :: * -> *) a b.
(Traversable t, Monad m) =>
(a -> m b) -> t a -> m (t b)
mapM (FileFormat -> [(Text, Text)] -> IO HyperdataDocument
toDoc FileFormat
WOS)
([[(Text, Text)]] -> IO [HyperdataDocument])
-> (([String], [[(Text, Text)]]) -> [[(Text, Text)]])
-> ([String], [[(Text, Text)]])
-> IO [HyperdataDocument]
forall (f :: * -> *) a b. Functor f => (a -> b) -> f a -> f b
<$> ([String], [[(Text, Text)]]) -> [[(Text, Text)]]
forall a b. (a, b) -> b
snd
(([String], [[(Text, Text)]]) -> IO [HyperdataDocument])
-> (([String], [[[(ByteString, ByteString)]]])
-> ([String], [[(Text, Text)]]))
-> ([String], [[[(ByteString, ByteString)]]])
-> IO [HyperdataDocument]
forall (f :: * -> *) a b. Functor f => (a -> b) -> f a -> f b
<$> FileFormat
-> ([String], [[[(ByteString, ByteString)]]])
-> ([String], [[(Text, Text)]])
forall a.
FileFormat
-> (a, [[[(ByteString, ByteString)]]]) -> (a, [[(Text, Text)]])
enrichWith FileFormat
WOS
(([String], [[[(ByteString, ByteString)]]])
-> IO [HyperdataDocument])
-> ([String], [[[(ByteString, ByteString)]]])
-> IO [HyperdataDocument]
forall a b. (a -> b) -> a -> b
$ [Either String [[(ByteString, ByteString)]]]
-> ([String], [[[(ByteString, ByteString)]]])
forall a b. [Either a b] -> ([a], [b])
partitionEithers
([Either String [[(ByteString, ByteString)]]]
-> ([String], [[[(ByteString, ByteString)]]]))
-> [Either String [[(ByteString, ByteString)]]]
-> ([String], [[[(ByteString, ByteString)]]])
forall a b. (a -> b) -> a -> b
$ [FileFormat
-> ByteString -> Either String [[(ByteString, ByteString)]]
runParser' FileFormat
WOS ByteString
bs]
Either String [HyperdataDocument]
-> IO (Either String [HyperdataDocument])
forall (f :: * -> *) a. Applicative f => a -> f a
pure (Either String [HyperdataDocument]
-> IO (Either String [HyperdataDocument]))
-> Either String [HyperdataDocument]
-> IO (Either String [HyperdataDocument])
forall a b. (a -> b) -> a -> b
$ [HyperdataDocument] -> Either String [HyperdataDocument]
forall a b. b -> Either a b
Right [HyperdataDocument]
docs
parseFormat FileFormat
ZIP ByteString
bs = do
String
path <- String -> IO String
emptySystemTempFile String
"parsed-zip"
String -> ByteString -> IO ()
DB.writeFile String
path ByteString
bs
[EntrySelector]
parsedZip <- String -> ZipArchive [EntrySelector] -> IO [EntrySelector]
forall (m :: * -> *) a. MonadIO m => String -> ZipArchive a -> m a
withArchive String
path (ZipArchive [EntrySelector] -> IO [EntrySelector])
-> ZipArchive [EntrySelector] -> IO [EntrySelector]
forall a b. (a -> b) -> a -> b
$ do
Map EntrySelector EntryDescription -> [EntrySelector]
forall k a. Map k a -> [k]
DM.keys (Map EntrySelector EntryDescription -> [EntrySelector])
-> ZipArchive (Map EntrySelector EntryDescription)
-> ZipArchive [EntrySelector]
forall (f :: * -> *) a b. Functor f => (a -> b) -> f a -> f b
<$> ZipArchive (Map EntrySelector EntryDescription)
getEntries
Either String [HyperdataDocument]
-> IO (Either String [HyperdataDocument])
forall (f :: * -> *) a. Applicative f => a -> f a
pure (Either String [HyperdataDocument]
-> IO (Either String [HyperdataDocument]))
-> Either String [HyperdataDocument]
-> IO (Either String [HyperdataDocument])
forall a b. (a -> b) -> a -> b
$ String -> Either String [HyperdataDocument]
forall a b. a -> Either a b
Left (String -> Either String [HyperdataDocument])
-> String -> Either String [HyperdataDocument]
forall a b. (a -> b) -> a -> b
$ String
"Not implemented for ZIP, parsedZip" String -> ShowS
forall a. Semigroup a => a -> a -> a
<> [EntrySelector] -> String
forall a. Show a => a -> String
show [EntrySelector]
parsedZip
parseFormat FileFormat
_ ByteString
_ = IO (Either String [HyperdataDocument])
forall a. HasCallStack => a
undefined
parseFile :: FileFormat -> FilePath -> IO (Either Prelude.String [HyperdataDocument])
parseFile :: FileFormat -> String -> IO (Either String [HyperdataDocument])
parseFile FileFormat
CsvHal String
p = String -> IO (Either String [HyperdataDocument])
parseHal String
p
parseFile FileFormat
CsvGargV3 String
p = String -> IO (Either String [HyperdataDocument])
parseCsv String
p
parseFile FileFormat
RisPresse String
p = do
[HyperdataDocument]
docs <- IO (IO [HyperdataDocument]) -> IO [HyperdataDocument]
forall (m :: * -> *) a. Monad m => m (m a) -> m a
join (IO (IO [HyperdataDocument]) -> IO [HyperdataDocument])
-> IO (IO [HyperdataDocument]) -> IO [HyperdataDocument]
forall a b. (a -> b) -> a -> b
$ ([(Text, Text)] -> IO HyperdataDocument)
-> [[(Text, Text)]] -> IO [HyperdataDocument]
forall (t :: * -> *) (m :: * -> *) a b.
(Traversable t, Monad m) =>
(a -> m b) -> t a -> m (t b)
mapM (FileFormat -> [(Text, Text)] -> IO HyperdataDocument
toDoc FileFormat
RIS) ([[(Text, Text)]] -> IO [HyperdataDocument])
-> (([String], [[(Text, Text)]]) -> [[(Text, Text)]])
-> ([String], [[(Text, Text)]])
-> IO [HyperdataDocument]
forall (f :: * -> *) a b. Functor f => (a -> b) -> f a -> f b
<$> ([String], [[(Text, Text)]]) -> [[(Text, Text)]]
forall a b. (a, b) -> b
snd (([String], [[(Text, Text)]]) -> IO [HyperdataDocument])
-> (([String], [[[(ByteString, ByteString)]]])
-> ([String], [[(Text, Text)]]))
-> ([String], [[[(ByteString, ByteString)]]])
-> IO [HyperdataDocument]
forall (f :: * -> *) a b. Functor f => (a -> b) -> f a -> f b
<$> FileFormat
-> ([String], [[[(ByteString, ByteString)]]])
-> ([String], [[(Text, Text)]])
forall a.
FileFormat
-> (a, [[[(ByteString, ByteString)]]]) -> (a, [[(Text, Text)]])
enrichWith FileFormat
RisPresse (([String], [[[(ByteString, ByteString)]]])
-> IO [HyperdataDocument])
-> IO ([String], [[[(ByteString, ByteString)]]])
-> IO (IO [HyperdataDocument])
forall (f :: * -> *) a b. Functor f => (a -> b) -> f a -> f b
<$> FileFormat
-> String -> IO ([String], [[[(ByteString, ByteString)]]])
readFileWith FileFormat
RIS String
p
Either String [HyperdataDocument]
-> IO (Either String [HyperdataDocument])
forall (f :: * -> *) a. Applicative f => a -> f a
pure (Either String [HyperdataDocument]
-> IO (Either String [HyperdataDocument]))
-> Either String [HyperdataDocument]
-> IO (Either String [HyperdataDocument])
forall a b. (a -> b) -> a -> b
$ [HyperdataDocument] -> Either String [HyperdataDocument]
forall a b. b -> Either a b
Right [HyperdataDocument]
docs
parseFile FileFormat
WOS String
p = do
[HyperdataDocument]
docs <- IO (IO [HyperdataDocument]) -> IO [HyperdataDocument]
forall (m :: * -> *) a. Monad m => m (m a) -> m a
join (IO (IO [HyperdataDocument]) -> IO [HyperdataDocument])
-> IO (IO [HyperdataDocument]) -> IO [HyperdataDocument]
forall a b. (a -> b) -> a -> b
$ ([(Text, Text)] -> IO HyperdataDocument)
-> [[(Text, Text)]] -> IO [HyperdataDocument]
forall (t :: * -> *) (m :: * -> *) a b.
(Traversable t, Monad m) =>
(a -> m b) -> t a -> m (t b)
mapM (FileFormat -> [(Text, Text)] -> IO HyperdataDocument
toDoc FileFormat
WOS) ([[(Text, Text)]] -> IO [HyperdataDocument])
-> (([String], [[(Text, Text)]]) -> [[(Text, Text)]])
-> ([String], [[(Text, Text)]])
-> IO [HyperdataDocument]
forall (f :: * -> *) a b. Functor f => (a -> b) -> f a -> f b
<$> ([String], [[(Text, Text)]]) -> [[(Text, Text)]]
forall a b. (a, b) -> b
snd (([String], [[(Text, Text)]]) -> IO [HyperdataDocument])
-> (([String], [[[(ByteString, ByteString)]]])
-> ([String], [[(Text, Text)]]))
-> ([String], [[[(ByteString, ByteString)]]])
-> IO [HyperdataDocument]
forall (f :: * -> *) a b. Functor f => (a -> b) -> f a -> f b
<$> FileFormat
-> ([String], [[[(ByteString, ByteString)]]])
-> ([String], [[(Text, Text)]])
forall a.
FileFormat
-> (a, [[[(ByteString, ByteString)]]]) -> (a, [[(Text, Text)]])
enrichWith FileFormat
WOS (([String], [[[(ByteString, ByteString)]]])
-> IO [HyperdataDocument])
-> IO ([String], [[[(ByteString, ByteString)]]])
-> IO (IO [HyperdataDocument])
forall (f :: * -> *) a b. Functor f => (a -> b) -> f a -> f b
<$> FileFormat
-> String -> IO ([String], [[[(ByteString, ByteString)]]])
readFileWith FileFormat
WOS String
p
Either String [HyperdataDocument]
-> IO (Either String [HyperdataDocument])
forall (f :: * -> *) a. Applicative f => a -> f a
pure (Either String [HyperdataDocument]
-> IO (Either String [HyperdataDocument]))
-> Either String [HyperdataDocument]
-> IO (Either String [HyperdataDocument])
forall a b. (a -> b) -> a -> b
$ [HyperdataDocument] -> Either String [HyperdataDocument]
forall a b. b -> Either a b
Right [HyperdataDocument]
docs
parseFile FileFormat
ff String
p = do
[HyperdataDocument]
docs <- IO (IO [HyperdataDocument]) -> IO [HyperdataDocument]
forall (m :: * -> *) a. Monad m => m (m a) -> m a
join (IO (IO [HyperdataDocument]) -> IO [HyperdataDocument])
-> IO (IO [HyperdataDocument]) -> IO [HyperdataDocument]
forall a b. (a -> b) -> a -> b
$ ([(Text, Text)] -> IO HyperdataDocument)
-> [[(Text, Text)]] -> IO [HyperdataDocument]
forall (t :: * -> *) (m :: * -> *) a b.
(Traversable t, Monad m) =>
(a -> m b) -> t a -> m (t b)
mapM (FileFormat -> [(Text, Text)] -> IO HyperdataDocument
toDoc FileFormat
ff) ([[(Text, Text)]] -> IO [HyperdataDocument])
-> (([String], [[(Text, Text)]]) -> [[(Text, Text)]])
-> ([String], [[(Text, Text)]])
-> IO [HyperdataDocument]
forall (f :: * -> *) a b. Functor f => (a -> b) -> f a -> f b
<$> ([String], [[(Text, Text)]]) -> [[(Text, Text)]]
forall a b. (a, b) -> b
snd (([String], [[(Text, Text)]]) -> IO [HyperdataDocument])
-> (([String], [[[(ByteString, ByteString)]]])
-> ([String], [[(Text, Text)]]))
-> ([String], [[[(ByteString, ByteString)]]])
-> IO [HyperdataDocument]
forall (f :: * -> *) a b. Functor f => (a -> b) -> f a -> f b
<$> FileFormat
-> ([String], [[[(ByteString, ByteString)]]])
-> ([String], [[(Text, Text)]])
forall a.
FileFormat
-> (a, [[[(ByteString, ByteString)]]]) -> (a, [[(Text, Text)]])
enrichWith FileFormat
ff (([String], [[[(ByteString, ByteString)]]])
-> IO [HyperdataDocument])
-> IO ([String], [[[(ByteString, ByteString)]]])
-> IO (IO [HyperdataDocument])
forall (f :: * -> *) a b. Functor f => (a -> b) -> f a -> f b
<$> FileFormat
-> String -> IO ([String], [[[(ByteString, ByteString)]]])
readFileWith FileFormat
ff String
p
Either String [HyperdataDocument]
-> IO (Either String [HyperdataDocument])
forall (f :: * -> *) a. Applicative f => a -> f a
pure (Either String [HyperdataDocument]
-> IO (Either String [HyperdataDocument]))
-> Either String [HyperdataDocument]
-> IO (Either String [HyperdataDocument])
forall a b. (a -> b) -> a -> b
$ [HyperdataDocument] -> Either String [HyperdataDocument]
forall a b. b -> Either a b
Right [HyperdataDocument]
docs
toDoc :: FileFormat -> [(Text, Text)] -> IO HyperdataDocument
toDoc :: FileFormat -> [(Text, Text)] -> IO HyperdataDocument
toDoc FileFormat
ff [(Text, Text)]
d = do
let lang :: Lang
lang = Lang
EN
let dateToParse :: Maybe Text
dateToParse = Text -> Text -> Text -> Text
DT.replace Text
"-" Text
" " (Text -> Text) -> Maybe Text -> Maybe Text
forall (f :: * -> *) a b. Functor f => (a -> b) -> f a -> f b
<$> Text -> [(Text, Text)] -> Maybe Text
forall a b. Eq a => a -> [(a, b)] -> Maybe b
lookup Text
"PY" [(Text, Text)]
d Maybe Text -> Maybe Text -> Maybe Text
forall a. Semigroup a => a -> a -> a
<> Text -> Maybe Text
forall a. a -> Maybe a
Just Text
" " Maybe Text -> Maybe Text -> Maybe Text
forall a. Semigroup a => a -> a -> a
<> Text -> [(Text, Text)] -> Maybe Text
forall a b. Eq a => a -> [(a, b)] -> Maybe b
lookup Text
"publication_date" [(Text, Text)]
d
(Maybe UTCTime
utcTime, (Maybe Int
pub_year, Maybe Int
pub_month, Maybe Int
pub_day)) <- Lang
-> Maybe Text
-> IO (Maybe UTCTime, (Maybe Int, Maybe Int, Maybe Int))
Date.dateSplit Lang
lang Maybe Text
dateToParse
HyperdataDocument -> IO HyperdataDocument
forall (f :: * -> *) a. Applicative f => a -> f a
pure (HyperdataDocument -> IO HyperdataDocument)
-> HyperdataDocument -> IO HyperdataDocument
forall a b. (a -> b) -> a -> b
$ HyperdataDocument :: Maybe Text
-> Maybe Text
-> Maybe Text
-> Maybe Text
-> Maybe Text
-> Maybe Int
-> Maybe Text
-> Maybe Text
-> Maybe Text
-> Maybe Text
-> Maybe Text
-> Maybe Text
-> Maybe Int
-> Maybe Int
-> Maybe Int
-> Maybe Int
-> Maybe Int
-> Maybe Int
-> Maybe Text
-> HyperdataDocument
HyperdataDocument { _hd_bdd :: Maybe Text
_hd_bdd = Text -> Maybe Text
forall a. a -> Maybe a
Just (Text -> Maybe Text) -> Text -> Maybe Text
forall a b. (a -> b) -> a -> b
$ String -> Text
DT.pack (String -> Text) -> String -> Text
forall a b. (a -> b) -> a -> b
$ FileFormat -> String
forall a. Show a => a -> String
show FileFormat
ff
, _hd_doi :: Maybe Text
_hd_doi = Text -> [(Text, Text)] -> Maybe Text
forall a b. Eq a => a -> [(a, b)] -> Maybe b
lookup Text
"doi" [(Text, Text)]
d
, _hd_url :: Maybe Text
_hd_url = Text -> [(Text, Text)] -> Maybe Text
forall a b. Eq a => a -> [(a, b)] -> Maybe b
lookup Text
"URL" [(Text, Text)]
d
, _hd_uniqId :: Maybe Text
_hd_uniqId = Maybe Text
forall a. Maybe a
Nothing
, _hd_uniqIdBdd :: Maybe Text
_hd_uniqIdBdd = Maybe Text
forall a. Maybe a
Nothing
, _hd_page :: Maybe Int
_hd_page = Maybe Int
forall a. Maybe a
Nothing
, _hd_title :: Maybe Text
_hd_title = Text -> [(Text, Text)] -> Maybe Text
forall a b. Eq a => a -> [(a, b)] -> Maybe b
lookup Text
"title" [(Text, Text)]
d
, _hd_authors :: Maybe Text
_hd_authors = Maybe Text
forall a. Maybe a
Nothing
, _hd_institutes :: Maybe Text
_hd_institutes = Text -> [(Text, Text)] -> Maybe Text
forall a b. Eq a => a -> [(a, b)] -> Maybe b
lookup Text
"authors" [(Text, Text)]
d
, _hd_source :: Maybe Text
_hd_source = Text -> [(Text, Text)] -> Maybe Text
forall a b. Eq a => a -> [(a, b)] -> Maybe b
lookup Text
"source" [(Text, Text)]
d
, _hd_abstract :: Maybe Text
_hd_abstract = Text -> [(Text, Text)] -> Maybe Text
forall a b. Eq a => a -> [(a, b)] -> Maybe b
lookup Text
"abstract" [(Text, Text)]
d
, _hd_publication_date :: Maybe Text
_hd_publication_date = (UTCTime -> Text) -> Maybe UTCTime -> Maybe Text
forall (f :: * -> *) a b. Functor f => (a -> b) -> f a -> f b
fmap (String -> Text
DT.pack (String -> Text) -> (UTCTime -> String) -> UTCTime -> Text
forall b c a. (b -> c) -> (a -> b) -> a -> c
. UTCTime -> String
forall a. Show a => a -> String
show) Maybe UTCTime
utcTime
, _hd_publication_year :: Maybe Int
_hd_publication_year = Maybe Int
pub_year
, _hd_publication_month :: Maybe Int
_hd_publication_month = Maybe Int
pub_month
, _hd_publication_day :: Maybe Int
_hd_publication_day = Maybe Int
pub_day
, _hd_publication_hour :: Maybe Int
_hd_publication_hour = Maybe Int
forall a. Maybe a
Nothing
, _hd_publication_minute :: Maybe Int
_hd_publication_minute = Maybe Int
forall a. Maybe a
Nothing
, _hd_publication_second :: Maybe Int
_hd_publication_second = Maybe Int
forall a. Maybe a
Nothing
, _hd_language_iso2 :: Maybe Text
_hd_language_iso2 = Text -> Maybe Text
forall a. a -> Maybe a
Just (Text -> Maybe Text) -> Text -> Maybe Text
forall a b. (a -> b) -> a -> b
$ (String -> Text
DT.pack (String -> Text) -> (Lang -> String) -> Lang -> Text
forall b c a. (b -> c) -> (a -> b) -> a -> c
. Lang -> String
forall a. Show a => a -> String
show) Lang
lang }
enrichWith :: FileFormat
-> (a, [[[(DB.ByteString, DB.ByteString)]]]) -> (a, [[(Text, Text)]])
enrichWith :: FileFormat
-> (a, [[[(ByteString, ByteString)]]]) -> (a, [[(Text, Text)]])
enrichWith FileFormat
RisPresse = ([(ByteString, ByteString)] -> [(ByteString, ByteString)])
-> (a, [[[(ByteString, ByteString)]]]) -> (a, [[(Text, Text)]])
forall a.
([(ByteString, ByteString)] -> [(ByteString, ByteString)])
-> (a, [[[(ByteString, ByteString)]]]) -> (a, [[(Text, Text)]])
enrichWith' [(ByteString, ByteString)] -> [(ByteString, ByteString)]
presseEnrich
enrichWith FileFormat
WOS = ([(ByteString, ByteString)] -> [(ByteString, ByteString)])
-> (a, [[[(ByteString, ByteString)]]]) -> (a, [[(Text, Text)]])
forall a.
([(ByteString, ByteString)] -> [(ByteString, ByteString)])
-> (a, [[[(ByteString, ByteString)]]]) -> (a, [[(Text, Text)]])
enrichWith' (((ByteString, ByteString) -> (ByteString, ByteString))
-> [(ByteString, ByteString)] -> [(ByteString, ByteString)]
forall (f :: * -> *) a b. Functor f => (a -> b) -> f a -> f b
map ((ByteString -> ByteString)
-> (ByteString, ByteString) -> (ByteString, ByteString)
forall a a' b. (a -> a') -> (a, b) -> (a', b)
first ByteString -> ByteString
WOS.keys))
enrichWith FileFormat
_ = ([(ByteString, ByteString)] -> [(ByteString, ByteString)])
-> (a, [[[(ByteString, ByteString)]]]) -> (a, [[(Text, Text)]])
forall a.
([(ByteString, ByteString)] -> [(ByteString, ByteString)])
-> (a, [[[(ByteString, ByteString)]]]) -> (a, [[(Text, Text)]])
enrichWith' [(ByteString, ByteString)] -> [(ByteString, ByteString)]
forall a. a -> a
identity
enrichWith' :: ([(DB.ByteString, DB.ByteString)] -> [(DB.ByteString, DB.ByteString)])
-> (a, [[[(DB.ByteString, DB.ByteString)]]]) -> (a, [[(Text, Text)]])
enrichWith' :: ([(ByteString, ByteString)] -> [(ByteString, ByteString)])
-> (a, [[[(ByteString, ByteString)]]]) -> (a, [[(Text, Text)]])
enrichWith' [(ByteString, ByteString)] -> [(ByteString, ByteString)]
f = ([[[(ByteString, ByteString)]]] -> [[(Text, Text)]])
-> (a, [[[(ByteString, ByteString)]]]) -> (a, [[(Text, Text)]])
forall b b' a. (b -> b') -> (a, b) -> (a, b')
second (([(ByteString, ByteString)] -> [(Text, Text)])
-> [[(ByteString, ByteString)]] -> [[(Text, Text)]]
forall (f :: * -> *) a b. Functor f => (a -> b) -> f a -> f b
map [(ByteString, ByteString)] -> [(Text, Text)]
both' ([[(ByteString, ByteString)]] -> [[(Text, Text)]])
-> ([[[(ByteString, ByteString)]]] -> [[(ByteString, ByteString)]])
-> [[[(ByteString, ByteString)]]]
-> [[(Text, Text)]]
forall b c a. (b -> c) -> (a -> b) -> a -> c
. ([(ByteString, ByteString)] -> [(ByteString, ByteString)])
-> [[(ByteString, ByteString)]] -> [[(ByteString, ByteString)]]
forall (f :: * -> *) a b. Functor f => (a -> b) -> f a -> f b
map [(ByteString, ByteString)] -> [(ByteString, ByteString)]
f ([[(ByteString, ByteString)]] -> [[(ByteString, ByteString)]])
-> ([[[(ByteString, ByteString)]]] -> [[(ByteString, ByteString)]])
-> [[[(ByteString, ByteString)]]]
-> [[(ByteString, ByteString)]]
forall b c a. (b -> c) -> (a -> b) -> a -> c
. [[[(ByteString, ByteString)]]] -> [[(ByteString, ByteString)]]
forall (t :: * -> *) a. Foldable t => t [a] -> [a]
concat)
where
both' :: [(ByteString, ByteString)] -> [(Text, Text)]
both' = ((ByteString, ByteString) -> (Text, Text))
-> [(ByteString, ByteString)] -> [(Text, Text)]
forall (f :: * -> *) a b. Functor f => (a -> b) -> f a -> f b
map ((ByteString -> Text) -> (ByteString, ByteString) -> (Text, Text)
forall a b. (a -> b) -> (a, a) -> (b, b)
both ByteString -> Text
decodeUtf8)
readFileWith :: FileFormat -> FilePath
-> IO ([ParseError], [[[(DB.ByteString, DB.ByteString)]]])
readFileWith :: FileFormat
-> String -> IO ([String], [[[(ByteString, ByteString)]]])
readFileWith FileFormat
format String
path = do
[ByteString]
files <- case ShowS
takeExtension String
path of
String
".zip" -> String -> IO [ByteString]
openZip String
path
String
_ -> ByteString -> [ByteString]
forall (f :: * -> *) a. Applicative f => a -> f a
pure (ByteString -> [ByteString])
-> (ByteString -> ByteString) -> ByteString -> [ByteString]
forall (f :: * -> *) a b. Functor f => (a -> b) -> f a -> f b
<$> ByteString -> ByteString
clean (ByteString -> [ByteString]) -> IO ByteString -> IO [ByteString]
forall (f :: * -> *) a b. Functor f => (a -> b) -> f a -> f b
<$> String -> IO ByteString
DB.readFile String
path
[Either String [[(ByteString, ByteString)]]]
-> ([String], [[[(ByteString, ByteString)]]])
forall a b. [Either a b] -> ([a], [b])
partitionEithers ([Either String [[(ByteString, ByteString)]]]
-> ([String], [[[(ByteString, ByteString)]]]))
-> IO [Either String [[(ByteString, ByteString)]]]
-> IO ([String], [[[(ByteString, ByteString)]]])
forall (f :: * -> *) a b. Functor f => (a -> b) -> f a -> f b
<$> (ByteString -> IO (Either String [[(ByteString, ByteString)]]))
-> [ByteString] -> IO [Either String [[(ByteString, ByteString)]]]
forall (t :: * -> *) a b.
Traversable t =>
(a -> IO b) -> t a -> IO (t b)
mapConcurrently (FileFormat
-> ByteString -> IO (Either String [[(ByteString, ByteString)]])
runParser FileFormat
format) [ByteString]
files
withParser :: FileFormat -> Parser [[(DB.ByteString, DB.ByteString)]]
withParser :: FileFormat -> Parser [[(ByteString, ByteString)]]
withParser FileFormat
WOS = Parser [[(ByteString, ByteString)]]
WOS.parser
withParser FileFormat
RIS = Parser [[(ByteString, ByteString)]]
RIS.parser
withParser FileFormat
_ = Text -> Parser [[(ByteString, ByteString)]]
forall a. HasCallStack => Text -> a
panic Text
"[ERROR] Parser not implemented yet"
runParser :: FileFormat -> DB.ByteString
-> IO (Either String [[(DB.ByteString, DB.ByteString)]])
runParser :: FileFormat
-> ByteString -> IO (Either String [[(ByteString, ByteString)]])
runParser FileFormat
format ByteString
text = Either String [[(ByteString, ByteString)]]
-> IO (Either String [[(ByteString, ByteString)]])
forall (f :: * -> *) a. Applicative f => a -> f a
pure (Either String [[(ByteString, ByteString)]]
-> IO (Either String [[(ByteString, ByteString)]]))
-> Either String [[(ByteString, ByteString)]]
-> IO (Either String [[(ByteString, ByteString)]])
forall a b. (a -> b) -> a -> b
$ FileFormat
-> ByteString -> Either String [[(ByteString, ByteString)]]
runParser' FileFormat
format ByteString
text
runParser' :: FileFormat -> DB.ByteString
-> (Either String [[(DB.ByteString, DB.ByteString)]])
runParser' :: FileFormat
-> ByteString -> Either String [[(ByteString, ByteString)]]
runParser' FileFormat
format ByteString
text = Parser [[(ByteString, ByteString)]]
-> ByteString -> Either String [[(ByteString, ByteString)]]
forall a. Parser a -> ByteString -> Either String a
parseOnly (FileFormat -> Parser [[(ByteString, ByteString)]]
withParser FileFormat
format) ByteString
text
openZip :: FilePath -> IO [DB.ByteString]
openZip :: String -> IO [ByteString]
openZip String
fp = do
[EntrySelector]
entries <- String -> ZipArchive [EntrySelector] -> IO [EntrySelector]
forall (m :: * -> *) a. MonadIO m => String -> ZipArchive a -> m a
withArchive String
fp (Map EntrySelector EntryDescription -> [EntrySelector]
forall k a. Map k a -> [k]
DM.keys (Map EntrySelector EntryDescription -> [EntrySelector])
-> ZipArchive (Map EntrySelector EntryDescription)
-> ZipArchive [EntrySelector]
forall (f :: * -> *) a b. Functor f => (a -> b) -> f a -> f b
<$> ZipArchive (Map EntrySelector EntryDescription)
getEntries)
[ByteString]
bs <- (EntrySelector -> IO ByteString)
-> [EntrySelector] -> IO [ByteString]
forall (t :: * -> *) a b.
Traversable t =>
(a -> IO b) -> t a -> IO (t b)
mapConcurrently (\EntrySelector
s -> String -> ZipArchive ByteString -> IO ByteString
forall (m :: * -> *) a. MonadIO m => String -> ZipArchive a -> m a
withArchive String
fp (EntrySelector -> ZipArchive ByteString
getEntry EntrySelector
s)) [EntrySelector]
entries
[ByteString] -> IO [ByteString]
forall (f :: * -> *) a. Applicative f => a -> f a
pure [ByteString]
bs
cleanText :: Text -> Text
cleanText :: Text -> Text
cleanText = ByteString -> Text
forall a b. ConvertibleStrings a b => a -> b
cs (ByteString -> Text) -> (Text -> ByteString) -> Text -> Text
forall b c a. (b -> c) -> (a -> b) -> a -> c
. ByteString -> ByteString
clean (ByteString -> ByteString)
-> (Text -> ByteString) -> Text -> ByteString
forall b c a. (b -> c) -> (a -> b) -> a -> c
. Text -> ByteString
forall a b. ConvertibleStrings a b => a -> b
cs
clean :: DB.ByteString -> DB.ByteString
clean :: ByteString -> ByteString
clean ByteString
txt = (Char -> Char) -> ByteString -> ByteString
DBC.map Char -> Char
clean' ByteString
txt
where
clean' :: Char -> Char
clean' Char
'’' = Char
'\''
clean' Char
'\r' = Char
' '
clean' Char
'\t' = Char
' '
clean' Char
';' = Char
'.'
clean' Char
c = Char
c