在不知道子标签顺序的情况下流解析 xml
stream parsing xml without knowing the ordering of child tags
我必须解析一些 xml,我决定使用 xml-conduit 完成该任务并使用它的流媒体部分。
xml 的结构由 xsd 文件给出,其中包含元素及其出现的频率。但是不是他们期望的顺序。
如何使用 Text.XML.Stream.Parse
解析 xml 结构的子结构的所有可能重新排序?
问题
假设我们有一个 xml 描述,例如
Root
/ \
A B
那么 <Root><A>atext</A><B>btext</B></Root>
和 <Root><B>btext</B><A>atext</A></Root>
都是这个 xml 结构的有效实例。
但是在流设置中解析需要排序才能成功。
我想使用 parseRoot1 <|> parseRoot2
之类的东西,但后来我不得不实现 Alternative
实例并手动编写所有可能性,我真的不想这样做。
这是一个最小示例 haskell 程序。
Example.hs
{-# LANGUAGE OverloadedStrings #-}
{-# LANGUAGE Rank2Types #-}
module Main where
import Control.Exception
import Control.Monad.Trans.Resource (MonadThrow)
import Text.XML.Stream.Parse
import Data.Monoid ((<>))
import Data.Maybe
import Data.Text (Text)
import Data.XML.Types (Event)
import Data.Conduit (ConduitM, Consumer, yield, ($=), ($$))
data Root = Root {a :: A, b :: B} deriving (Show, Eq)
data A = A Text deriving (Show, Eq)
data B = B Text deriving (Show, Eq)
ex1, ex2 :: Text
ex1 = "<Root>"<>
"<A>Atest</A>"<>
"<B>Btest</B>"<>
"</Root>"
ex2 = "<Root>"<>
"<B>Btest</B>"<>
"<A>Atest</A>"<>
"</Root>"
ex :: Root
ex = Root {a = A "Atest", b = B "Btest"}
parseA :: MonadThrow m => ConduitM Event o m (Maybe A)
parseA = tagIgnoreAttrs "A"
$ do result <- content
return (A $ result)
parseB :: MonadThrow m => ConduitM Event o m (Maybe B)
parseB = tagIgnoreAttrs "B"
$ do result <- content
return (B result)
parseRoot1 :: MonadThrow m => ConduitM Event o m (Maybe Root)
parseRoot1 = tagIgnoreAttrs "Root" $ do
a' <- fromMaybe (error "error parsing A") <$> parseA
b' <- fromMaybe (error "error parsing B") <$> parseB
return $ Root{a = a', b = b'}
parseRoot2 :: MonadThrow m => ConduitM Event o m (Maybe Root)
parseRoot2 = tagIgnoreAttrs "Root" $ do
b' <- fromMaybe (error "error parsing B") <$> parseB
a' <- fromMaybe (error "error parsing A") <$> parseA
return $ Root{a = a', b = b'}
parseTxt :: Consumer Event (Either SomeException) (Maybe a)
-> Text
-> Either SomeException (Maybe a)
parseTxt p inTxt = yield inTxt
$= parseText' def
$$ p
main :: IO ()
main = do putStrLn "Poor Mans Test Suite"
putStrLn "===================="
putStrLn "test1 Root -> A - B " -- works
print $ parseTxt parseRoot1 ex1
putStrLn "test1 Root -> B - A " -- fails
print $ parseTxt parseRoot1 ex2
putStrLn "test2 Root -> A - B " -- fails
print $ parseTxt parseRoot2 ex1
putStrLn "test2 Root -> B - A " -- works again
print $ parseTxt parseRoot2 ex2
注意
example.cabal
[...]
build-depends: base >=4.8 && <4.9
, conduit
, resourcet
, text
, xml-conduit
, xml-types
[...]
这是我的想法...
首先是一些定义:
{-# LANGUAGE OverloadedStrings, MultiWayIf #-}
import Control.Monad.Trans.Resource
import Data.Conduit
import Data.Text (Text, unpack)
import Data.XML.Types
import Text.XML.Stream.Parse
data SumType = A Text | B Text | C Text
我们从接受 A 或 B 标签的管道开始,忽略
属性和 returns 名称和内容:
parseAorB :: MonadThrow m => ConduitM Event o m (Maybe (Name, Text))
parseAorB =
tag (\n -> if (n == "A" || n == "B") then Just n else Nothing) -- accept either A or B
(\n -> return n) -- ignore attributes
(\n -> do c <- content; return (n,c)) -- extract content
然后我们用它来写一个解析两个标签的管道,确保
一个是A,另一个是B:
parseAB :: MonadThrow m => ConduitM Event o m (Maybe (SumType, SumType))
parseAB = do
t1 <- parseAorB
case t1 of
Nothing -> return Nothing
Just (n1,c1) -> do
t2 <- parseAorB
case t2 of
Nothing -> return Nothing
Just (n2,c2) -> do
if | "A" == n1 && "B" == n2 -> return $ Just (A c1, B c2)
| "A" == n2 && "B" == n1 -> return $ Just (A c2, B c1)
| otherwise -> return Nothing
更新
您可以使用 MaybeT
转换器减少 parseAB
中的样板代码:
import Control.Monad.Trans.Maybe
import Control.Monad.Trans
parseAB' :: MonadThrow m => MaybeT (ConduitM Event o m) (SumType, SumType)
parseAB' = do
(n1, c1) <- MaybeT parseAorB
(n2, c2) <- MaybeT parseAorB
if | "A" == n1 && "B" == n2 -> return (A c1, B c2)
| "A" == n2 && "B" == n1 -> return (A c2, B c1)
| otherwise -> MaybeT $ return Nothing
如果您有多个构造函数,我会考虑这样做:
allkids = do
kids <- many parseAorB
let sorted = sort kids -- automatically sorts by name
if map fst kids == [ "A", "B", "C", "D", "E", "F", "G", "H"]
then let [ca, cb, cc, cd, ce, cf, cg, ch] = map snd kids
in return (A ca, B cb, C cc, D cd, E ce, F cf, G cg, H ch)
else ...error...
many
组合子来自 Tet.XML.Stream.Parse。
我必须解析一些 xml,我决定使用 xml-conduit 完成该任务并使用它的流媒体部分。
xml 的结构由 xsd 文件给出,其中包含元素及其出现的频率。但是不是他们期望的顺序。
如何使用 Text.XML.Stream.Parse
解析 xml 结构的子结构的所有可能重新排序?
问题
假设我们有一个 xml 描述,例如
Root
/ \
A B
那么 <Root><A>atext</A><B>btext</B></Root>
和 <Root><B>btext</B><A>atext</A></Root>
都是这个 xml 结构的有效实例。
但是在流设置中解析需要排序才能成功。
我想使用 parseRoot1 <|> parseRoot2
之类的东西,但后来我不得不实现 Alternative
实例并手动编写所有可能性,我真的不想这样做。
这是一个最小示例 haskell 程序。
Example.hs
{-# LANGUAGE OverloadedStrings #-}
{-# LANGUAGE Rank2Types #-}
module Main where
import Control.Exception
import Control.Monad.Trans.Resource (MonadThrow)
import Text.XML.Stream.Parse
import Data.Monoid ((<>))
import Data.Maybe
import Data.Text (Text)
import Data.XML.Types (Event)
import Data.Conduit (ConduitM, Consumer, yield, ($=), ($$))
data Root = Root {a :: A, b :: B} deriving (Show, Eq)
data A = A Text deriving (Show, Eq)
data B = B Text deriving (Show, Eq)
ex1, ex2 :: Text
ex1 = "<Root>"<>
"<A>Atest</A>"<>
"<B>Btest</B>"<>
"</Root>"
ex2 = "<Root>"<>
"<B>Btest</B>"<>
"<A>Atest</A>"<>
"</Root>"
ex :: Root
ex = Root {a = A "Atest", b = B "Btest"}
parseA :: MonadThrow m => ConduitM Event o m (Maybe A)
parseA = tagIgnoreAttrs "A"
$ do result <- content
return (A $ result)
parseB :: MonadThrow m => ConduitM Event o m (Maybe B)
parseB = tagIgnoreAttrs "B"
$ do result <- content
return (B result)
parseRoot1 :: MonadThrow m => ConduitM Event o m (Maybe Root)
parseRoot1 = tagIgnoreAttrs "Root" $ do
a' <- fromMaybe (error "error parsing A") <$> parseA
b' <- fromMaybe (error "error parsing B") <$> parseB
return $ Root{a = a', b = b'}
parseRoot2 :: MonadThrow m => ConduitM Event o m (Maybe Root)
parseRoot2 = tagIgnoreAttrs "Root" $ do
b' <- fromMaybe (error "error parsing B") <$> parseB
a' <- fromMaybe (error "error parsing A") <$> parseA
return $ Root{a = a', b = b'}
parseTxt :: Consumer Event (Either SomeException) (Maybe a)
-> Text
-> Either SomeException (Maybe a)
parseTxt p inTxt = yield inTxt
$= parseText' def
$$ p
main :: IO ()
main = do putStrLn "Poor Mans Test Suite"
putStrLn "===================="
putStrLn "test1 Root -> A - B " -- works
print $ parseTxt parseRoot1 ex1
putStrLn "test1 Root -> B - A " -- fails
print $ parseTxt parseRoot1 ex2
putStrLn "test2 Root -> A - B " -- fails
print $ parseTxt parseRoot2 ex1
putStrLn "test2 Root -> B - A " -- works again
print $ parseTxt parseRoot2 ex2
注意
example.cabal
[...]
build-depends: base >=4.8 && <4.9
, conduit
, resourcet
, text
, xml-conduit
, xml-types
[...]
这是我的想法...
首先是一些定义:
{-# LANGUAGE OverloadedStrings, MultiWayIf #-}
import Control.Monad.Trans.Resource
import Data.Conduit
import Data.Text (Text, unpack)
import Data.XML.Types
import Text.XML.Stream.Parse
data SumType = A Text | B Text | C Text
我们从接受 A 或 B 标签的管道开始,忽略 属性和 returns 名称和内容:
parseAorB :: MonadThrow m => ConduitM Event o m (Maybe (Name, Text))
parseAorB =
tag (\n -> if (n == "A" || n == "B") then Just n else Nothing) -- accept either A or B
(\n -> return n) -- ignore attributes
(\n -> do c <- content; return (n,c)) -- extract content
然后我们用它来写一个解析两个标签的管道,确保 一个是A,另一个是B:
parseAB :: MonadThrow m => ConduitM Event o m (Maybe (SumType, SumType))
parseAB = do
t1 <- parseAorB
case t1 of
Nothing -> return Nothing
Just (n1,c1) -> do
t2 <- parseAorB
case t2 of
Nothing -> return Nothing
Just (n2,c2) -> do
if | "A" == n1 && "B" == n2 -> return $ Just (A c1, B c2)
| "A" == n2 && "B" == n1 -> return $ Just (A c2, B c1)
| otherwise -> return Nothing
更新
您可以使用 MaybeT
转换器减少 parseAB
中的样板代码:
import Control.Monad.Trans.Maybe
import Control.Monad.Trans
parseAB' :: MonadThrow m => MaybeT (ConduitM Event o m) (SumType, SumType)
parseAB' = do
(n1, c1) <- MaybeT parseAorB
(n2, c2) <- MaybeT parseAorB
if | "A" == n1 && "B" == n2 -> return (A c1, B c2)
| "A" == n2 && "B" == n1 -> return (A c2, B c1)
| otherwise -> MaybeT $ return Nothing
如果您有多个构造函数,我会考虑这样做:
allkids = do
kids <- many parseAorB
let sorted = sort kids -- automatically sorts by name
if map fst kids == [ "A", "B", "C", "D", "E", "F", "G", "H"]
then let [ca, cb, cc, cd, ce, cf, cg, ch] = map snd kids
in return (A ca, B cb, C cc, D cd, E ce, F cf, G cg, H ch)
else ...error...
many
组合子来自 Tet.XML.Stream.Parse。