如何使用 RealmSwift 解决我的最大匹配算法中的内存问题?

How can I fix this memory issue in my maximum matching algorithm with RealmSwift?

我在 Swift 中写了自己的最大匹配函数来对中文句子进行分词。它工作正常,只是在处理异常长的句子时,内存使用量会上升到 1 GB 以上。我需要帮助弄清楚如何修改我的代码,以避免这个内存问题。我不确定这是与我使用 RealmSwift 的方式有关,还是我的算法本身的问题。

这是我的代码:

    /// Segments `string` into words by greedy maximum matching from the front.
    ///
    /// The candidate starts as the whole remaining string and is shortened by one
    /// character from its end until it matches the `simplified` field of a Realm
    /// `Word` object, or until only one character is left. An accepted candidate
    /// is removed from the front of the working string and the search restarts on
    /// the remainder.
    ///
    /// - Parameter string: The sentence to segment.
    /// - Returns: The accepted words, in the order they occur in the sentence.
    func splitSentenceIntoWordsWithDictionaryMaximumMatching(string: String) -> [String] {
    // Mutable working copy; accepted words are dropped from its front.
    var string = string
    var foundWordsArray: [String] = []
    // Length, in characters, of the candidate prefix currently being tried.
    var position = count(string)

    while position > 0
    {
        // `index` is re-declared on every pass, so the later assignments to it
        // inside the branches below are never read.
        var index = advance(string.startIndex, position)
        let partialString = string.substringToIndex(index)
        // A new `Realm()` is created and queried on every iteration.
        // NOTE(review): the predicate is built by string interpolation; a candidate
        // containing a single quote would presumably yield a malformed predicate —
        // confirm, and consider argument substitution instead.
        if let found = Realm().objects(Word).filter("simplified == '\(partialString)'").first
        {
            foundWordsArray.append(partialString)
            // This decrement is overwritten below before `position` is read again.
            position = position - 1
            // Remove the matched word from the front, one character at a time.
            var partialStringCount = count(partialString)
            while partialStringCount > 0
            {
                string = dropFirst(string)
                partialStringCount -= 1
            }
            // Restart with the whole remainder as the next candidate.
            position = count(string)
            index = advance(string.startIndex, position)
        }
        else if count(partialString) == 1
        {
            // A single character with no dictionary match is still accepted as a word.
            // `addNewEntryToDictionaryInTransaction` is defined outside this snippet;
            // presumably it stores the character as a new dictionary entry — TODO confirm.
            addNewEntryToDictionaryInTransaction(partialString, "", [partialString], partialString)
            foundWordsArray.append(partialString)
            // Remove the accepted character from the front of the working string.
            var partialStringCount = count(partialString)
            while partialStringCount > 0
            {
                string = dropFirst(string)
                partialStringCount -= 1
            }
            position = count(string)
            index = advance(string.startIndex, position)
        }
        else
        {
            // No match yet: shorten the candidate by one character and retry.
            position = position - 1
            index = advance(string.startIndex, position)
        }
    }

    return foundWordsArray
}

在这种情况下,您应该在循环体中使用 autoreleasepool,使每次迭代产生的临时对象在该次迭代结束时就被释放(参见:Use Local Autorelease Pool Blocks to Reduce Peak Memory Footprint):

    // Sketch only: the original loop body (elided as `...`) is placed inside an
    // `autoreleasepool` block, one pool per iteration.
    while position > 0 {
        autoreleasepool {
            var index = advance(string.startIndex, position)
            ...
        }
    }

顺便说一句,您的代码中有太多的内存复制操作,以及针对 String.Index 的 O(N) 操作(count() 和 advance()),这可能会导致严重的性能问题。相反,您应该像下面这样高效地使用 String.Index:

import Foundation

// Sample in-memory dictionary for the demo; the function below also inserts
// unknown single characters into it.
var  words:Set = ["今日", "献立", "魚", "味噌汁", "定食", "焼き魚", "です"]
/// Splits `string` into words by forward maximum matching against the global
/// `words` set.
///
/// Beginning at the front of the sentence, the longest remaining slice is tried
/// first and shortened by one character from its end until it is found in
/// `words` or is a single character. A single character that is not yet in
/// `words` is inserted into `words` and accepted as a word.
///
/// - Parameter string: The sentence to segment.
/// - Returns: The accepted words in sentence order; empty for an empty string.
func splitSentenceIntoWordsWithDictionaryMaximumMatching(string: String) -> [String] {
    var tokens: [String] = []
    var lower = string.startIndex
    var upper = string.endIndex

    while lower != upper {
        // With the built-in `Set` the pool is probably unnecessary; it is kept
        // here purely as a demonstration of where it goes.
        autoreleasepool {
            let candidate = string[lower ..< upper]
            // `upper.predecessor() == lower` is a cheap test for a one-character
            // candidate, avoiding `count(candidate) == 1`.
            if words.contains(candidate) || upper.predecessor() == lower {
                // No-op for a known word; registers an unknown single character.
                words.insert(candidate)
                tokens.append(candidate)
                // Continue with everything after the accepted word.
                lower = upper
                upper = string.endIndex
            } else {
                // Not a word yet: shorten the candidate by one character.
                upper = upper.predecessor()
            }
        }
    }

    return tokens
}

// Demo: segment a sample sentence using the in-memory `words` set.
var str = "今日の献立は焼き魚定食と味噌汁です"
let result = splitSentenceIntoWordsWithDictionaryMaximumMatching(str)

// Expected output is shown in the trailing comments; per that output, `words`
// has gained the single characters "の", "は" and "と".
debugPrintln(result) // ["今日", "の", "献立", "は", "焼き魚", "定食", "と", "味噌汁", "です"]
debugPrintln(words) // Set(["献立", "焼き魚", "今日", "魚", "味噌汁", "定食", "は", "と", "です", "の"])

我在这里使用了内置的 Set,但我认为您可以很容易地把它换成您的 Realm 代码 :)


已添加:回复评论:

反转版本:

// Sample in-memory dictionary for the reversed-matching demo; the function
// below also inserts unknown single characters into it.
var  words:Set = ["研究", "研究生", "生命", "起源"]
/// Splits `string` into words by reverse maximum matching against the global
/// `words` set.
///
/// Working from the end of the sentence, the longest slice ending at the current
/// position is tried first and shortened by one character from its front until
/// it is found in `words` or is a single character. A single character that is
/// not yet in `words` is inserted into `words` and accepted as a word.
///
/// - Parameter string: The sentence to segment.
/// - Returns: The accepted words in sentence order; empty for an empty string.
func splitSentenceIntoWordsWithDictionaryMaximumMatchingReversed(string: String) -> [String] {
    // Words are collected back-to-front and reversed once at the end.
    var tokensFromBack: [String] = []
    var lower = string.startIndex
    var upper = string.endIndex

    while lower != upper {
        autoreleasepool {
            let candidate = string[lower ..< upper]
            // `lower.successor() == upper` means the candidate is one character long.
            if words.contains(candidate) || lower.successor() == upper {
                // No-op for a known word; registers an unknown single character.
                words.insert(candidate)
                tokensFromBack.append(candidate)
                // Continue with everything before the accepted word.
                upper = lower
                lower = string.startIndex
            } else {
                // Not a word yet: drop one character from the front of the candidate.
                lower = lower.successor()
            }
        }
    }

    return tokensFromBack.reverse()
}

// Demo: segment a sample sentence with the reversed matcher; the expected
// output is shown in the trailing comment.
var str = "研究生命起源"
let result = splitSentenceIntoWordsWithDictionaryMaximumMatchingReversed(str)

debugPrintln(result) // ["研究", "生命", "起源"]