如何使用 RealmSwift 解决我的最大匹配算法中的内存问题?
How can I fix this memory issue in my maximum matching algorithm with RealmSwift?
我用 Swift 编写了自己的最大匹配函数,用于对中文句子进行分词。它工作正常,但在处理异常长的句子时,内存使用量会上升到 1 GB 以上。我需要帮助弄清楚如何修改我的代码,以避免这个内存问题。我不确定这是与我使用 RealmSwift 的方式有关,还是我的算法本身的问题。
这是我的代码:
/// Splits `string` into words with greedy forward maximum matching against the
/// Realm-backed dictionary.
///
/// The candidate starts as the whole remaining text and loses one trailing
/// character per pass until it equals the `simplified` field of a stored `Word`.
/// A single character without a match is handed to
/// `addNewEntryToDictionaryInTransaction` (presumably registering it as a new
/// entry; confirm against that helper) and is emitted as a word of its own.
///
/// :param: string The sentence to segment.
/// :returns: The segments in their original order; empty for an empty input.
func splitSentenceIntoWordsWithDictionaryMaximumMatching(string: String) -> [String] {
    var remaining = string
    var foundWordsArray: [String] = []
    var position = count(remaining)
    while position > 0 {
        // One pool per pass: temporaries autoreleased by the query and the
        // substring calls are drained each iteration instead of piling up
        // until the function returns.
        autoreleasepool {
            let index = advance(remaining.startIndex, position)
            let partialString = remaining.substringToIndex(index)
            // Pass the candidate as a predicate argument rather than interpolating
            // it, so text containing quotes cannot break or alter the query.
            let predicate = NSPredicate(format: "simplified == %@", partialString)
            let isKnownWord = Realm().objects(Word).filter(predicate).first != nil

            // `partialString` always holds exactly `position` characters.
            if !isKnownWord && position > 1 {
                // No match yet: retry with a candidate one character shorter.
                position -= 1
                return
            }
            if !isKnownWord {
                // Unknown single character: record it, then treat it as a word.
                addNewEntryToDictionaryInTransaction(partialString, "", [partialString], partialString)
            }
            foundWordsArray.append(partialString)
            // Strip the consumed prefix and restart from the longest candidate.
            remaining = remaining.substringFromIndex(index)
            position = count(remaining)
        }
    }
    return foundWordsArray
}
在这种情况下,您应该在循环中使用 autoreleasepool(参见:Use Local Autorelease Pool Blocks to Reduce Peak Memory Footprint):
// Excerpt: only the loop skeleton is shown.
while position > 0 {
// Run each pass inside its own autorelease pool so that objects autoreleased
// during the pass are drained when the closure returns.
autoreleasepool {
var index = advance(string.startIndex, position)
... // placeholder for the remainder of the original loop body
}
}
顺便说一句,您的代码中有太多内存复制操作,以及针对 String.Index 的 O(N) 操作(count() 和 advance()),这可能会导致严重的性能问题。相反,您应该更高效地使用 String.Index,例如:
import Foundation
// Demo dictionary; mutable because the segmenter inserts unknown single characters into it.
var words: Set<String> = ["今日", "献立", "魚", "味噌汁", "定食", "焼き魚", "です"]
/// Segments `string` by forward maximum matching against the global `words` set.
///
/// The candidate is the slice `lower ..< upper`. It shrinks from the right until it
/// is found in `words` or is a single character; an unknown single character is
/// inserted into `words` before being emitted.
///
/// :param: string The sentence to segment.
/// :returns: The segments in their original order.
func splitSentenceIntoWordsWithDictionaryMaximumMatching(string: String) -> [String] {
    var segments: [String] = []
    var lower = string.startIndex
    var upper = string.endIndex
    while lower != upper {
        // With the builtin `Set` the pool is probably unnecessary; it is kept for demonstration.
        autoreleasepool {
            let candidate = string[lower ..< upper]
            let isKnown = words.contains(candidate)
            // Index comparison is a cheaper check than `count(candidate) == 1`.
            let isSingleCharacter = upper.predecessor() == lower
            if isKnown || isSingleCharacter {
                if !isKnown {
                    words.insert(candidate)
                }
                segments.append(candidate)
                // Continue after the emitted word with the longest candidate again.
                lower = upper
                upper = string.endIndex
            } else {
                // No match: drop the last character and retry on the next pass.
                upper = upper.predecessor()
            }
        }
    }
    return segments
}
// Demo: segment a sample sentence, then show the dictionary after unknown
// single characters were added to it.
let str = "今日の献立は焼き魚定食と味噌汁です"
let result = splitSentenceIntoWordsWithDictionaryMaximumMatching(str)
debugPrintln(result) // ["今日", "の", "献立", "は", "焼き魚", "定食", "と", "味噌汁", "です"]
debugPrintln(words) // Set(["献立", "焼き魚", "今日", "魚", "味噌汁", "定食", "は", "と", "です", "の"])
我在这里使用了内置的 Set,但我认为您可以轻松地将其替换为您的 Realm 代码 :)
补充(回复评论):
反向匹配版本:
// Demo dictionary for the reversed matcher; mutable because unknown single characters are inserted into it.
var words: Set<String> = ["研究", "研究生", "生命", "起源"]
/// Segments `string` by backward maximum matching against the global `words` set.
///
/// The candidate is the slice `lower ..< upper`. It shrinks from the left until it
/// is found in `words` or is a single character; an unknown single character is
/// inserted into `words` before being emitted. Words are collected from the end of
/// the sentence towards the start and reversed before returning.
///
/// :param: string The sentence to segment.
/// :returns: The segments in their original order.
func splitSentenceIntoWordsWithDictionaryMaximumMatchingReversed(string: String) -> [String] {
    var segmentsFromEnd: [String] = []
    var lower = string.startIndex
    var upper = string.endIndex
    while lower != upper {
        autoreleasepool {
            let candidate = string[lower ..< upper]
            let isKnown = words.contains(candidate)
            let isSingleCharacter = lower.successor() == upper
            if isKnown || isSingleCharacter {
                if !isKnown {
                    words.insert(candidate)
                }
                segmentsFromEnd.append(candidate)
                // Continue with everything before the emitted word.
                upper = lower
                lower = string.startIndex
            } else {
                // No match: drop the first character and retry on the next pass.
                lower = lower.successor()
            }
        }
    }
    return segmentsFromEnd.reverse()
}
// Demo: backward matching yields 研究 / 生命 / 起源 for this sentence.
let str = "研究生命起源"
let result = splitSentenceIntoWordsWithDictionaryMaximumMatchingReversed(str)
debugPrintln(result) // ["研究", "生命", "起源"]
我用 Swift 编写了自己的最大匹配函数,用于对中文句子进行分词。它工作正常,但在处理异常长的句子时,内存使用量会上升到 1 GB 以上。我需要帮助弄清楚如何修改我的代码,以避免这个内存问题。我不确定这是与我使用 RealmSwift 的方式有关,还是我的算法本身的问题。
这是我的代码:
/// Splits `string` into words with greedy forward maximum matching against the
/// Realm-backed dictionary.
///
/// The candidate starts as the whole remaining text and loses one trailing
/// character per pass until it equals the `simplified` field of a stored `Word`.
/// A single character without a match is handed to
/// `addNewEntryToDictionaryInTransaction` (presumably registering it as a new
/// entry; confirm against that helper) and is emitted as a word of its own.
///
/// :param: string The sentence to segment.
/// :returns: The segments in their original order; empty for an empty input.
func splitSentenceIntoWordsWithDictionaryMaximumMatching(string: String) -> [String] {
    var remaining = string
    var foundWordsArray: [String] = []
    var position = count(remaining)
    while position > 0 {
        // One pool per pass: temporaries autoreleased by the query and the
        // substring calls are drained each iteration instead of piling up
        // until the function returns.
        autoreleasepool {
            let index = advance(remaining.startIndex, position)
            let partialString = remaining.substringToIndex(index)
            // Pass the candidate as a predicate argument rather than interpolating
            // it, so text containing quotes cannot break or alter the query.
            let predicate = NSPredicate(format: "simplified == %@", partialString)
            let isKnownWord = Realm().objects(Word).filter(predicate).first != nil

            // `partialString` always holds exactly `position` characters.
            if !isKnownWord && position > 1 {
                // No match yet: retry with a candidate one character shorter.
                position -= 1
                return
            }
            if !isKnownWord {
                // Unknown single character: record it, then treat it as a word.
                addNewEntryToDictionaryInTransaction(partialString, "", [partialString], partialString)
            }
            foundWordsArray.append(partialString)
            // Strip the consumed prefix and restart from the longest candidate.
            remaining = remaining.substringFromIndex(index)
            position = count(remaining)
        }
    }
    return foundWordsArray
}
在这种情况下,您应该在循环中使用 autoreleasepool(参见:Use Local Autorelease Pool Blocks to Reduce Peak Memory Footprint):
// Excerpt: only the loop skeleton is shown.
while position > 0 {
// Run each pass inside its own autorelease pool so that objects autoreleased
// during the pass are drained when the closure returns.
autoreleasepool {
var index = advance(string.startIndex, position)
... // placeholder for the remainder of the original loop body
}
}
顺便说一句,您的代码中有太多内存复制操作,以及针对 String.Index 的 O(N) 操作(count() 和 advance()),这可能会导致严重的性能问题。相反,您应该更高效地使用 String.Index,例如:
import Foundation
// Demo dictionary; mutable because the segmenter inserts unknown single characters into it.
var words: Set<String> = ["今日", "献立", "魚", "味噌汁", "定食", "焼き魚", "です"]
/// Segments `string` by forward maximum matching against the global `words` set.
///
/// The candidate is the slice `lower ..< upper`. It shrinks from the right until it
/// is found in `words` or is a single character; an unknown single character is
/// inserted into `words` before being emitted.
///
/// :param: string The sentence to segment.
/// :returns: The segments in their original order.
func splitSentenceIntoWordsWithDictionaryMaximumMatching(string: String) -> [String] {
    var segments: [String] = []
    var lower = string.startIndex
    var upper = string.endIndex
    while lower != upper {
        // With the builtin `Set` the pool is probably unnecessary; it is kept for demonstration.
        autoreleasepool {
            let candidate = string[lower ..< upper]
            let isKnown = words.contains(candidate)
            // Index comparison is a cheaper check than `count(candidate) == 1`.
            let isSingleCharacter = upper.predecessor() == lower
            if isKnown || isSingleCharacter {
                if !isKnown {
                    words.insert(candidate)
                }
                segments.append(candidate)
                // Continue after the emitted word with the longest candidate again.
                lower = upper
                upper = string.endIndex
            } else {
                // No match: drop the last character and retry on the next pass.
                upper = upper.predecessor()
            }
        }
    }
    return segments
}
// Demo: segment a sample sentence, then show the dictionary after unknown
// single characters were added to it.
let str = "今日の献立は焼き魚定食と味噌汁です"
let result = splitSentenceIntoWordsWithDictionaryMaximumMatching(str)
debugPrintln(result) // ["今日", "の", "献立", "は", "焼き魚", "定食", "と", "味噌汁", "です"]
debugPrintln(words) // Set(["献立", "焼き魚", "今日", "魚", "味噌汁", "定食", "は", "と", "です", "の"])
我在这里使用了内置的 Set,但我认为您可以轻松地将其替换为您的 Realm 代码 :)
补充(回复评论):
反向匹配版本:
// Demo dictionary for the reversed matcher; mutable because unknown single characters are inserted into it.
var words: Set<String> = ["研究", "研究生", "生命", "起源"]
/// Segments `string` by backward maximum matching against the global `words` set.
///
/// The candidate is the slice `lower ..< upper`. It shrinks from the left until it
/// is found in `words` or is a single character; an unknown single character is
/// inserted into `words` before being emitted. Words are collected from the end of
/// the sentence towards the start and reversed before returning.
///
/// :param: string The sentence to segment.
/// :returns: The segments in their original order.
func splitSentenceIntoWordsWithDictionaryMaximumMatchingReversed(string: String) -> [String] {
    var segmentsFromEnd: [String] = []
    var lower = string.startIndex
    var upper = string.endIndex
    while lower != upper {
        autoreleasepool {
            let candidate = string[lower ..< upper]
            let isKnown = words.contains(candidate)
            let isSingleCharacter = lower.successor() == upper
            if isKnown || isSingleCharacter {
                if !isKnown {
                    words.insert(candidate)
                }
                segmentsFromEnd.append(candidate)
                // Continue with everything before the emitted word.
                upper = lower
                lower = string.startIndex
            } else {
                // No match: drop the first character and retry on the next pass.
                lower = lower.successor()
            }
        }
    }
    return segmentsFromEnd.reverse()
}
// Demo: backward matching yields 研究 / 生命 / 起源 for this sentence.
let str = "研究生命起源"
let result = splitSentenceIntoWordsWithDictionaryMaximumMatchingReversed(str)
debugPrintln(result) // ["研究", "生命", "起源"]