为什么这不会 运行 在常量内存中?

Why does this not run in constant memory?

我正在尝试将大量数据写入常量内存中的文件。

import qualified Data.ByteString.Lazy as B

{- Creates and writes num grids of dimensions aa x aa -}
writeGrids :: Int -> Int -> IO ()
writeGrids num aa = do
    rng <- newPureMT
    let (grids,shuffleds) = createGrids rng aa
    createDirectoryIfMissing True "data/grids/"
    B.writeFile (gridFileName num aa)
                (encode (take num grids))
    B.writeFile (shuffledFileName num aa)
                (encode (take num shuffleds))

然而,这消耗的内存与 num 的大小成正比。我知道 createGrids 是一个足够惰性的函数,因为我已经通过将 error "not lazy enough" (如 Haskell wiki here 所建议的)附加到列表的末尾来测试它 returns 并且没有错误被提出。 take 是在 Data.List 中定义的惰性函数。 encode也是Data.Binary中定义的惰性函数。 B.writeFile 定义在 Data.ByteString.Lazy.

这是完整的代码,您可以执行它:

import Control.Arrow (first)
import Data.Binary
import GHC.Float (double2Float)
import System.Random (next)
import System.Random.Mersenne.Pure64 (PureMT, newPureMT, randomDouble)
import System.Random.Shuffle (shuffle')
import qualified Data.ByteString.Lazy as B

main :: IO ()
main = writeGrids 1000 64

{- Creates and writes num grids of dimensions aa x aa -}
writeGrids :: Int -> Int -> IO ()
writeGrids num aa = do
    rng <- newPureMT
    let (grids,shuffleds) = createGrids rng aa
    B.writeFile "grids.bin" (encode (take num grids))
    B.writeFile "shuffleds.bin" (encode (take num shuffleds))

{- a random number generator, dimension of grids to make
   returns a pair of lists, the first is a list of grids of dimensions
   aa x aa, the second is a list of the shuffled grids corresponding to the first list -}
createGrids :: PureMT -> Int -> ([[(Float,Float)]],[[(Float,Float)]])
createGrids rng aa = (grids,shuffleds) where
       rs = randomFloats rng
       grids = map (getGridR aa) (chunksOf (2 * aa * aa) rs) 
       shuffleds = shuffler (aa * aa) rng grids

{- length of each grid, a random number generator, a list of grids
   returns a the list with each grid shuffled -}
shuffler :: Int -> PureMT -> [[(Float,Float)]] -> [[(Float,Float)]]
shuffler n rng (xs:xss) = shuffle' xs n rng : shuffler n (snd (next rng))         xss
shuffler _ _ [] = []

{- divides list into chunks of size n -}
chunksOf :: Int -> [a] -> [[a]]
chunksOf n = go 
     where go xs = case splitAt n xs of
              (ys,zs) | null ys -> []
                      | otherwise -> ys : go zs

{- dimension of grid, list of random floats [0,1]
   returns a list of (x,y) points of length n^2 such that all
   points are in the range [0,1] and the points are a randomly 
   perturbed regular grid -}
getGridR :: Int -> [Float] -> [(Float,Float)]
getGridR n rs = pts where
   nn = n * n
   (irs,jrs) = splitAt nn rs
   n' = fromIntegral n
   grid = [ (p,q) | p <- [0..n'-1], q <- [0..n'-1] ]
   pts = zipWith (\(p,q) (ir,jr) -> ((p+ir)/n',(q+jr)/n')) grid (zip irs jrs)

{- an infinite list of random floats in range [0,1] -}
randomFloats :: PureMT -> [Float]
randomFloats rng = let (d,rng') = first double2Float (randomDouble rng)
                   in d : randomFloats rng'

所需的软件包是: , 字节串 , 二进制 , 随机的 , mersenne-random-pure64 , 随机洗牌

内存占用的两个原因:

首先Data.Binary.encode似乎并不运行常量space。以下程序使用 910 MB 内存:

import Data.Binary
import qualified Data.ByteString.Lazy as B

len = 10000000 :: Int 

main = B.writeFile "grids.bin" $ encode [0..len]

如果我们从 len 中保留 0,我们将使用 97 MB 的内存。

相比之下,以下程序使用 1 MB:

import qualified Data.ByteString.Lazy.Char8 as B

main = B.writeFile "grids.bin" $ B.pack $ show [0..(1000000::Int)]

Second,在您的程序 shuffleds 中包含对 grids 内容的引用,这会阻止 grids 的垃圾回收。所以当我们打印 grids 时,我们也会评估它,然后它必须坐在内存中,直到我们完成打印 shuffleds。您的程序的以下版本仍然消耗大量内存,但如果我们用 B.writeFile 注释掉两行之一,它会使用常量 space。

import qualified Data.ByteString.Lazy.Char8 as B

writeGrids :: Int -> Int -> IO ()
writeGrids num aa = do
    rng <- newPureMT
    let (grids,shuffleds) = createGrids rng aa
    B.writeFile "grids.bin" (B.pack $ show (take num grids))
    B.writeFile "shuffleds.bin" (B.pack $ show (take num shuffleds))

不管怎样,这里有一个结合了这里每个人的想法的完整解决方案。内存消耗恒定在 ~6MB(使用 -O2 编译)。

import Control.Arrow (first)
import Control.Monad.State (state, evalState)
import Data.Binary
import GHC.Float (double2Float)
import System.Random (next)
import System.Random.Mersenne.Pure64 (PureMT, newPureMT, randomDouble)
import System.Random.Shuffle (shuffle')
import qualified Data.ByteString as B (hPut)
import qualified Pipes.Binary as P (encode)
import qualified Pipes.Prelude as P (zip, mapM, drain)
import Pipes (runEffect, (>->))
import System.IO (withFile, IOMode(AppendMode))

main :: IO ()
main = writeGrids 1000 64

{- Creates and writes num grids of dimensions aa x aa -}
writeGrids :: Int -> Int -> IO ()
writeGrids num aa = do
    rng <- newPureMT
    let (grids, shuffleds) = createGrids rng aa
        gridFile = "grids.bin"
        shuffledFile = "shuffleds.bin"
        encoder = P.encode . SerList . take num
    writeFile gridFile ""
    writeFile shuffledFile ""
    withFile gridFile AppendMode $ \hGr ->
        withFile shuffledFile AppendMode $ \hSh ->
            runEffect
                $ P.zip (encoder grids) (encoder shuffleds)
                >-> P.mapM (\(ch1, ch2) -> B.hPut hGr ch1 >> B.hPut hSh ch2)
                >-> P.drain -- discards the stream of () results.

{- a random number generator, dimension of grids to make
   returns a pair of lists, the first is a list of grids of dimensions
   aa x aa, the second is a list of the shuffled grids corresponding to the first list -}
createGrids :: PureMT -> Int -> ( [[(Float,Float)]], [[(Float,Float)]] )
createGrids rng aa = unzip gridsAndShuffleds where
       rs = randomFloats rng
       grids =  map (getGridR aa) (chunksOf (2 * aa * aa) rs)
       gridsAndShuffleds = shuffler (aa * aa) rng grids

{- length of each grid, a random number generator, a list of grids
   returns a the list with each grid shuffled -}
shuffler :: Int -> PureMT -> [[(Float,Float)]] -> [( [(Float,Float)], [(Float,Float)] )]
shuffler n rng xss = evalState (traverse oneShuffle xss) rng
    where
    oneShuffle xs = state $ \r -> ((xs, shuffle' xs n r), snd (next r))

newtype SerList a = SerList { runSerList :: [a] }
    deriving (Show)

instance Binary a => Binary (SerList a) where
    put (SerList (x:xs)) = put False >> put x >> put (SerList xs)
    put _                = put True
    get = do
        stop <- get :: Get Bool
        if stop
            then return (SerList [])
            else do
                x          <- get
                SerList xs <- get
                return (SerList (x : xs))

{- divides list into chunks of size n -}
chunksOf :: Int -> [a] -> [[a]]
chunksOf n = go 
     where go xs = case splitAt n xs of
              (ys,zs) | null ys -> []
                      | otherwise -> ys : go zs

{- dimension of grid, list of random floats [0,1]
   returns a list of (x,y) points of length n^2 such that all
   points are in the range [0,1] and the points are a randomly 
   perturbed regular grid -}
getGridR :: Int -> [Float] -> [(Float,Float)]
getGridR n rs = pts where
   nn = n * n
   (irs,jrs) = splitAt nn rs
   n' = fromIntegral n
   grid = [ (p,q) | p <- [0..n'-1], q <- [0..n'-1] ]
   pts = zipWith (\(p,q) (ir,jr) -> ((p+ir)/n',(q+jr)/n')) grid (zip irs jrs)

{- an infinite list of random floats in range [0,1] -}
randomFloats :: PureMT -> [Float]
randomFloats rng = let (d,rng') = first double2Float (randomDouble rng)
                   in d : randomFloats rng'

对更改的评论:

  • shuffler 现在是 State 函子的遍历。它通过一次输入列表生成一个对列表,其中每个网格都与其打乱后的版本配对。 createGrids 然后(懒惰地)解压这个列表。

  • 文件是使用 pipes machinery, in a way loosely inspired by this answer 写入的(我最初是使用 P.foldM 编写的)。请注意,我使用的 hPut 是严格的字节串,因为它作用于生产者使用 P.zip 提供的严格块(从本质上讲,它是一对惰性字节串,成对提供块).

  • SerList 是用来保存 Thomas M. DuBuisson 提到的自定义 Binary 实例的。请注意,我并没有过多考虑实例的 get 方法中的惰性和严格性。如果这给您带来麻烦,this question 看起来很有用。