attoparsec 高内存使用率读取大文件
attoparsec high memory usage reading huge file
我已经尝试了很多逐行解析文件内容的方法,但目前无法正常工作,并且在运行时会占用大量内存(超过 16GB)。
这是我要解析的文件的子集http://lpaste.net/144719
我要三种错误:
1) 回溯错误(多行,第一行像 3))
2) 多一行的单一错误
3) 单行错误
这是我当前的代码:
import qualified Data.ByteString as B
import Data.ByteString.Char8 as B8 hiding (lines, filter, unlines, head, readFile, take, length,
putStrLn, tail, map, concat, or, writeFile, intersperse,
groupBy, hGetContents)
import qualified Data.Text as T
import qualified Data.Text.IO as TIO
import Data.Attoparsec.Text hiding (take)
import Control.Applicative
import Control.Monad (replicateM, mapM)
import Data.Either (either)
import Data.List (intersperse, groupBy)
import System.Environment
import qualified System.IO as SIO
data TimeStamp = MkTimeStamp T.Text
deriving Show
data LogFileInfo = BackTraceLineInfo T.Text
| BackTraceInfo TimeStamp T.Text T.Text [LogFileInfo]
| Error TimeStamp T.Text
| LargeError TimeStamp T.Text T.Text
deriving Show
data LineType = SingleLineError TimeStamp T.Text
| DirectoryInfo T.Text
| ErrorInfo T.Text
| LineBackTraceInfo T.Text
| BackTraceString T.Text
| BackTraceLine T.Text
deriving Show
parseTimeStamp :: Parser TimeStamp
parseTimeStamp = do
year <- many digit
char '-'
month <- many digit
char '-'
day <- many digit
char ' '
hour <- many digit
char ':'
minute <- many digit
char ':'
second <- many digit
char ' '
(return . MkTimeStamp) $ T.pack $ year ++ "-" ++ month ++ "-" ++ day ++ " " ++ hour ++ ":" ++ minute ++ ":" ++ second
parseError :: Parser LineType
parseError = do
string $ T.pack "ERROR - "
timeStamp <- parseTimeStamp
errorInfo <- parseAnyLine
return $ SingleLineError timeStamp errorInfo
parseDirectoryInfo :: Parser LineType
parseDirectoryInfo = do
char '/'
directoryInfo <- parseAnyLine
(return . DirectoryInfo) $ T.append (T.pack "/") directoryInfo
parseErrorInfo :: Parser LineType
parseErrorInfo = do
errorInfo <- parseAnyLine
(return . ErrorInfo) errorInfo
parseBackTraceString :: Parser LineType
parseBackTraceString = do
let backTraceStr = T.pack " Backtrace: "
string backTraceStr
return $ BackTraceString backTraceStr
parseBacktraceLine :: Parser LineType
parseBacktraceLine = do
char '#'
number <- many1 digit
backTraceInfo <- parseAnyLine
let numberPart = T.pack $ '#' : number
return $ LineBackTraceInfo $ T.append numberPart backTraceInfo
parseAnyLine :: Parser T.Text
parseAnyLine = fmap T.pack $ many anyChar
-- Skips n lines for allowing other parsers to succeed
skipNLines n = replicateM n $ manyTill anyChar endOfLine
-- performParser :: Parser a -> T.Text -> BackTraceInfo
performParser = parseOnly
getEitherRight :: Either a b -> b
getEitherRight (Right b) = b
parseLogFile :: [T.Text] -> [LineType]
parseLogFile textxs =
let listaEithers = mapM (parseOnly $
try parseError
<|> try parseDirectoryInfo
<|> try parseBacktraceLine
<|> try parseBackTraceString
<|> parseErrorInfo) textxs
in getEitherRight listaEithers
customUnlines :: [String] -> String
customUnlines [] = []
customUnlines (x:xs) = if x == "\n"
then '\n':customUnlines xs
else x ++ "\n" ++ customUnlines xs
main = do
(fileName : _) <- getArgs
h <- SIO.openFile fileName SIO.ReadMode
SIO.hSetEncoding h SIO.latin1
fileContents <- SIO.hGetContents h
let titleLength = length fileName
titleWithoutExtension = take (titleLength - 4) fileName
allNonEmptyLines = map T.pack $ intersperse "\n" $ tail $ filter (/= "") $ lines fileContents -- [T.Text]
listParseResults = parseLogFile allNonEmptyLines -- [LineType]
-- onlyModelErrors = filter isModelError parseResult -- [LogFileInfo]
-- onlyOneRepresentative = map head $ groupBy equalErrors onlyModelErrors
listOfStrings = map show listParseResults
writeFile (titleWithoutExtension ++ ".logsummary") $ customUnlines listOfStrings
第一个问题是解析器没有解析任何东西。第二个问题是使用 16GB 的 RAM。如何改进我的方法?
至少有两个问题 - writeFile
和 customUnlines
。
writeFile
需要在写入之前收集所有输出,所以我会先看看这是否会产生输出:
h <- openFile "summary.txt" WriteMode
forM_ listOfStrings (hPutStrLn h)
hClose h
如果 listOfStrings
是惰性列表,这应该以流方式处理日志文件。
假设这可行,为了实现您的 customUnlines
逻辑,我会这样做:
h <- openFile "summary.txt" WriteMode
forM_ listOfStrings $ \x -> do
if x == "\n"
then hPutStr h "\n"
else hPutStrLn h "\n"
hClose h
如果 listOfStrings
不是惰性列表,那么我需要你的导入来进一步调试问题。
更新
事实证明 listOfStrings
不是惰性列表,因为 parseLogFile
。
请注意 listaEithers
的类型为 Either String [LineType]
。这意味着您必须解析它之前的所有行 returns。相反,您应该单独解析每一行:
forM_ allNonEmptyLines $ \x -> do
case parseOnly parseLogLine x of
Left e -> error "oops"
Right a -> print a -- a is a LineType
这里parseLogLine
是:
parseLogLine =
try parseError
<|> try parseDirectoryInfo
<|> try parseBacktraceLine
<|> try parseBackTraceString
<|> parseErrorInfo
我已经尝试了很多逐行解析文件内容的方法,但目前无法正常工作,并且在运行时会占用大量内存(超过 16GB)。
这是我要解析的文件的子集http://lpaste.net/144719
我要三种错误:
1) 回溯错误(多行,第一行像 3))
2) 多一行的单一错误
3) 单行错误
这是我当前的代码:
import qualified Data.ByteString as B
import Data.ByteString.Char8 as B8 hiding (lines, filter, unlines, head, readFile, take, length,
putStrLn, tail, map, concat, or, writeFile, intersperse,
groupBy, hGetContents)
import qualified Data.Text as T
import qualified Data.Text.IO as TIO
import Data.Attoparsec.Text hiding (take)
import Control.Applicative
import Control.Monad (replicateM, mapM)
import Data.Either (either)
import Data.List (intersperse, groupBy)
import System.Environment
import qualified System.IO as SIO
data TimeStamp = MkTimeStamp T.Text
deriving Show
data LogFileInfo = BackTraceLineInfo T.Text
| BackTraceInfo TimeStamp T.Text T.Text [LogFileInfo]
| Error TimeStamp T.Text
| LargeError TimeStamp T.Text T.Text
deriving Show
data LineType = SingleLineError TimeStamp T.Text
| DirectoryInfo T.Text
| ErrorInfo T.Text
| LineBackTraceInfo T.Text
| BackTraceString T.Text
| BackTraceLine T.Text
deriving Show
parseTimeStamp :: Parser TimeStamp
parseTimeStamp = do
year <- many digit
char '-'
month <- many digit
char '-'
day <- many digit
char ' '
hour <- many digit
char ':'
minute <- many digit
char ':'
second <- many digit
char ' '
(return . MkTimeStamp) $ T.pack $ year ++ "-" ++ month ++ "-" ++ day ++ " " ++ hour ++ ":" ++ minute ++ ":" ++ second
parseError :: Parser LineType
parseError = do
string $ T.pack "ERROR - "
timeStamp <- parseTimeStamp
errorInfo <- parseAnyLine
return $ SingleLineError timeStamp errorInfo
parseDirectoryInfo :: Parser LineType
parseDirectoryInfo = do
char '/'
directoryInfo <- parseAnyLine
(return . DirectoryInfo) $ T.append (T.pack "/") directoryInfo
parseErrorInfo :: Parser LineType
parseErrorInfo = do
errorInfo <- parseAnyLine
(return . ErrorInfo) errorInfo
parseBackTraceString :: Parser LineType
parseBackTraceString = do
let backTraceStr = T.pack " Backtrace: "
string backTraceStr
return $ BackTraceString backTraceStr
parseBacktraceLine :: Parser LineType
parseBacktraceLine = do
char '#'
number <- many1 digit
backTraceInfo <- parseAnyLine
let numberPart = T.pack $ '#' : number
return $ LineBackTraceInfo $ T.append numberPart backTraceInfo
parseAnyLine :: Parser T.Text
parseAnyLine = fmap T.pack $ many anyChar
-- Skips n lines for allowing other parsers to succeed
skipNLines n = replicateM n $ manyTill anyChar endOfLine
-- performParser :: Parser a -> T.Text -> BackTraceInfo
performParser = parseOnly
getEitherRight :: Either a b -> b
getEitherRight (Right b) = b
parseLogFile :: [T.Text] -> [LineType]
parseLogFile textxs =
let listaEithers = mapM (parseOnly $
try parseError
<|> try parseDirectoryInfo
<|> try parseBacktraceLine
<|> try parseBackTraceString
<|> parseErrorInfo) textxs
in getEitherRight listaEithers
customUnlines :: [String] -> String
customUnlines [] = []
customUnlines (x:xs) = if x == "\n"
then '\n':customUnlines xs
else x ++ "\n" ++ customUnlines xs
main = do
(fileName : _) <- getArgs
h <- SIO.openFile fileName SIO.ReadMode
SIO.hSetEncoding h SIO.latin1
fileContents <- SIO.hGetContents h
let titleLength = length fileName
titleWithoutExtension = take (titleLength - 4) fileName
allNonEmptyLines = map T.pack $ intersperse "\n" $ tail $ filter (/= "") $ lines fileContents -- [T.Text]
listParseResults = parseLogFile allNonEmptyLines -- [LineType]
-- onlyModelErrors = filter isModelError parseResult -- [LogFileInfo]
-- onlyOneRepresentative = map head $ groupBy equalErrors onlyModelErrors
listOfStrings = map show listParseResults
writeFile (titleWithoutExtension ++ ".logsummary") $ customUnlines listOfStrings
第一个问题是解析器没有解析任何东西。第二个问题是使用 16GB 的 RAM。如何改进我的方法?
至少有两个问题 - writeFile
和 customUnlines
。
writeFile
需要在写入之前收集所有输出,所以我会先看看这是否会产生输出:
h <- openFile "summary.txt" WriteMode
forM_ listOfStrings (hPutStrLn h)
hClose h
如果 listOfStrings
是惰性列表,这应该以流方式处理日志文件。
假设这可行,为了实现您的 customUnlines
逻辑,我会这样做:
h <- openFile "summary.txt" WriteMode
forM_ listOfStrings $ \x -> do
if x == "\n"
then hPutStr h "\n"
else hPutStrLn h "\n"
hClose h
如果 listOfStrings
不是惰性列表,那么我需要你的导入来进一步调试问题。
更新
事实证明 listOfStrings
不是惰性列表,因为 parseLogFile
。
请注意 listaEithers
的类型为 Either String [LineType]
。这意味着您必须解析它之前的所有行 returns。相反,您应该单独解析每一行:
forM_ allNonEmptyLines $ \x -> do
case parseOnly parseLogLine x of
Left e -> error "oops"
Right a -> print a -- a is a LineType
这里parseLogLine
是:
parseLogLine =
try parseError
<|> try parseDirectoryInfo
<|> try parseBacktraceLine
<|> try parseBackTraceString
<|> parseErrorInfo