查找字符串中第 N 个单词的范围

Find the Range of the Nth word in a String

我想要的是这样的东西

"word1 word2 word3".rangeOfWord(2) => 6 to 10

结果可以是范围或元组或其他形式。

我宁愿不对字符进行迭代并使用状态机。为什么要重新发明词法分析器?有没有更好的方法?

在你的例子中,每个单词都是唯一的,因此可以使用下面的方法:

// Locate the Nth word by splitting on single spaces and then searching the
// original string for that word's text.
let myString = "word1 word2 word3"
let wordNum = 2
let nthWord = myString.componentsSeparatedByString(" ")[wordNum - 1] // wordNum is 1-based
let myRange = myString.rangeOfString(nthWord)
    // 6..<11

正如安德鲁·邓肯 (Andrew Duncan) 在下面的评论中指出的那样,上述方法仅在各个单词互不重复时才有效。如果字符串中存在重复的单词,可以使用下面这种不那么简洁的方法:

// Find the range of the Nth word when the same word may occur several times:
// compute the word's character offset from the lengths of the words before it.
let myString = "word1 word2 word3 word2 word1 word3 word1"
let wordNum = 7 // 2nd instance (out of 3) of "word1"
let arr = myString.componentsSeparatedByString(" ")
// Offset of the target = total length of the preceding words plus one
// single-space separator for each of them.
let fromIndex = arr[0..<wordNum-1].map { $0.characters.count }.reduce(0, combine: +) + wordNum - 1

let myRange = Range<String.Index>(start: myString.startIndex.advancedBy(fromIndex), end: myString.startIndex.advancedBy(fromIndex+arr[wordNum-1].characters.count))
let myWord = myString.substringWithRange(myRange)
    // string "word1" (from range 36..<41)

最后,让我们使用后者来构造 String 的扩展,正如您在问题示例中所希望的那样:

extension String {
    /// Returns the range of the `wordNum`-th component obtained by splitting
    /// the receiver on `wordSeparator`.
    ///
    /// - Parameter wordNum: 1-based position of the wanted word.
    /// - Parameter wordSeparator: The exact string that separates words.
    /// - Returns: The range of the word, or `nil` when `wordNum` is less than 1
    ///   or greater than the number of components.
    private func rangeOfNthWord(wordNum: Int, wordSeparator: String) -> Range<String.Index>? {
        // Split the receiver itself rather than any outer variable.
        let arr = self.componentsSeparatedByString(wordSeparator)

        // A position below 1 would form an invalid slice further down; a position
        // past the last component has no word to return.
        guard wordNum >= 1 && wordNum <= arr.count else {
            return nil
        }

        // Offset of the target = total length of the preceding words plus one
        // separator for each of them.
        let fromIndex = arr[0..<wordNum-1].map { $0.characters.count }.reduce(0, combine: +) + (wordNum - 1)*wordSeparator.characters.count
        let start = self.startIndex.advancedBy(fromIndex)
        let end = start.advancedBy(arr[wordNum-1].characters.count)
        return Range<String.Index>(start: start, end: end)
    }
}

// Example: print the 7th space-separated word, if the string has that many.
let myString = "word1 word2 word3 word2 word1 word3 word1"
let wordNum = 7 // 2nd instance (out of 3) of "word1"

let nthWordRange = myString.rangeOfNthWord(wordNum, wordSeparator: " ")
if let myRange = nthWordRange {
        // myRange: 36..<41
    let nthWord = myString.substringWithRange(myRange)
    print(nthWord) // prints "word1"
}

如果单词之间的分隔符并不统一(例如某些单词之间由两个空格 "  " 分隔),您可以相应地调整 .rangeOfNthWord(...) 方法。


下面的评论中还指出,使用 .rangeOfString(...) 严格来说并不是纯 Swift 的做法。然而,这绝不是坏习惯。引自 Swift Language Guide - Strings and Characters:

Swift’s String type is bridged with Foundation’s NSString class. If you are working with the Foundation framework in Cocoa, the entire NSString API is available to call on any String value you create when type cast to NSString, as described in AnyObject. You can also use a String value with any API that requires an NSString instance.

另见 NSString 类参考文档中关于 rangeOfString 方法的说明:

// Swift Declaration:
func rangeOfString(_ searchString: String) -> NSRange

我最终还是动手写了一个状态机。(叹气……)仅供参考,代码如下:

// Word lookup implemented as a small state machine over the string's characters.
// Written against older Swift APIs (`characters.enumerate()`, `advancedBy`,
// `Range(start:end:)`).
extension String {
    /// Scans the characters once and returns the half-open interval
    /// `(start, end)` of character offsets covered by the `n`-th block of
    /// non-separator characters.
    ///
    /// - Parameter n: 1-based position of the wanted block; values below 1 yield `nil`.
    /// - Parameter sep: The separator character. When `nil`, a space or a
    ///   newline counts as a separator.
    /// - Returns: `(start, end)` with `end` exclusive, or `nil` when the scan
    ///   finishes without ever entering the `n`-th block.
    private func halfOpenIntervalOfBlock(n:Int, separator sep:Character? = nil) -> (Int, Int)? {
        // Scanner states:
        //   InSeparator          - in separators (or at the start) with earlier words still to skip
        //   InPrecedingSeparator - in the separators directly before the target (or at the start when n == 1)
        //   InWord               - inside a word that comes before the target
        //   InTarget             - inside the target word
        //   Done                 - the separator that ends the target has been seen
        enum State {
            case InSeparator
            case InPrecedingSeparator
            case InWord
            case InTarget
            case Done
        }

        guard n > 0 else {
            return nil
        }

        // For the first word there is nothing to skip, so the next non-separator
        // character already starts the target.
        var state:State
        if n == 1 {
            state = .InPrecedingSeparator
        } else {
            state = .InSeparator
        }

        // Number of word-to-separator transitions seen so far, i.e. words fully skipped.
        var separatorNum = 0
        // Character offsets of the result. Note: the local `endIndex` shadows
        // `String.endIndex` inside this method.
        var startIndex:Int = 0
        var endIndex:Int = 0

        for (i, c) in self.characters.enumerate() {
            let inSeparator:Bool
            // A bit inefficient to keep doing this test.
            if let s = sep {
                inSeparator = c == s
            } else {
                inSeparator = c == " " || c == "\n"
            }
            // Always track the current offset: when the target ends on a separator
            // this is the exclusive end, and when the string ends first it is the
            // offset of the last character.
            endIndex = i

            switch state {
            case .InPrecedingSeparator:
                // First non-separator character starts the target word.
                if !inSeparator {
                    state = .InTarget
                    startIndex = i
                }

            case .InTarget:
                // The separator after the target finishes the scan.
                if inSeparator {
                    state = .Done
                }

            case .InWord:
                // Leaving a skipped word: once n - 1 words are behind us, the next
                // word is the target.
                if inSeparator {
                    separatorNum += 1
                    if separatorNum == n - 1 {
                        state = .InPrecedingSeparator
                    } else {
                        state = .InSeparator
                    }
                }

            case .InSeparator:
                // A non-separator character starts another word to skip.
                if !inSeparator {
                    state = .InWord
                }

            case .Done:
                break
            }

            // Stop scanning as soon as the target's end has been found.
            if state == .Done {
                break
            }
        }

        if state == .Done {
            return (startIndex, endIndex)
        } else if state == .InTarget {
            return (startIndex, endIndex + 1) // We ran off end.
        } else {
            // Fewer than n words were found.
            return nil
        }
    }

    /// Returns the range of the `n`-th word (1-based), using the default
    /// separators of `halfOpenIntervalOfBlock`, or `nil` when that call finds
    /// no such word.
    func rangeOfWord(n:Int) -> Range<Index>? {
        guard let (s, e) = self.halfOpenIntervalOfBlock(n) else {
            return nil
        }
        // Convert the character offsets into string indices.
        let ss = self.startIndex.advancedBy(s)
        let ee = self.startIndex.advancedBy(e)
        return Range(start:ss, end:ee)
    }

 }

目前还不清楚字符串是否必须被视为由它可能包含的分隔符分成单词,或者您是否只是在寻找特定的子字符串出现。 无论如何,我认为这两种情况都可以通过这种方式解决:

extension String {
    /// Calls `body` for each non-overlapping occurrence of `pattern`, scanning
    /// from the start of the string towards its end.
    ///
    /// Nothing is enumerated when `pattern` is empty or longer than the receiver.
    ///
    /// - Parameter pattern: The substring to look for.
    /// - Parameter body: Receives the range of each occurrence and an `inout`
    ///   flag; setting the flag to `true` ends the enumeration after that call.
    /// - Throws: Rethrows any error thrown by `body`.
    func enumerateOccurencies(of pattern: String, _ body: (Range<String.Index>, inout Bool) throws -> Void) rethrows {
        if pattern.isEmpty || count < pattern.count {
            return
        }

        var shouldStop = false
        var searchStart = startIndex
        while !shouldStop, searchStart < endIndex, let match = self[searchStart..<endIndex].range(of: pattern) {
            try body(match, &shouldStop)
            // Resume right after this match so occurrences never overlap.
            searchStart = match.upperBound
        }
    }
}

一旦达到所需的出现次数,您将在 body 闭包中将 stop 设置为 true 并捕获传递给它的 range

// Example: capture the range of the third occurrence of "word1".
let words = "word1, word1, word2, word3, word1, word3"
var matches = 0
var rangeOfThirdOccurencyOfWord1: Range<String.Index>? = nil
words.enumerateOccurencies(of: "word1") { range, stop in
    matches += 1
    // Ask the enumeration to end once the third match has been reached.
    stop = matches == 3
    if stop {
        rangeOfThirdOccurencyOfWord1 = range
    }
}

关于 DFA:最近我写了一个利用 Hashable、并使用字典数组作为状态节点的实现,但我发现上面的方法更快,可能是因为 range(of:) 内部使用了指纹算法。

更新

另外,你也可以通过下面这种方式实现你提到的 API:

import Foundation

extension String {
    /// Returns the range of the `order`-th word, where words are the non-empty
    /// runs of text between occurrences of `separator`.
    ///
    /// Consecutive separators, and separators at the start or end of the
    /// string, do not produce empty words.
    ///
    /// - Parameter order: 1-based position of the wanted word. Must be positive.
    /// - Parameter separator: The string that delimits words. Defaults to a single space.
    /// - Returns: The range of the word, or `nil` when the receiver or the
    ///   separator is empty, or when there are fewer than `order` words.
    func rangeOfWord(order: Int, separator: String = " ") -> Range<String.Index>? {
        precondition(order > 0, "order is 1-based and must be positive")
        // No length comparison between separator and receiver here: a string no
        // longer than the separator (e.g. "a" with " ") can still be one word.
        guard
            !isEmpty,
            !separator.isEmpty
        else { return nil }

        var wordsSoFar = 0
        var lo = startIndex
        while let r = self[lo..<endIndex].range(of: separator) {
            // A separator found right at `lo` encloses no word; step over it.
            guard
                r.lowerBound != lo
            else {
                lo = r.upperBound
                continue
            }
            wordsSoFar += 1
            guard
                wordsSoFar < order
            else { return lo..<r.lowerBound }

            lo = r.upperBound
        }

        // Text left after the last separator is the final word.
        if
            lo < endIndex,
            wordsSoFar + 1 == order
        {
            return lo..<endIndex
        }

        return nil
    }
}

// Example: print the fourth space-separated word, or a notice when it is absent.
let words = "word anotherWord oneMore lastOne"
switch words.rangeOfWord(order: 4, separator: " ") {
case .some(let r):
    print(words[r])
case .none:
    print("not found")
}

这里的 order 参数指的是单词在字符串中的序号(第 n 个),从 1 开始。我还添加了 separator 参数,用来指定在字符串中分隔单词的标记字符串(也可以让它默认为 " ",这样调用该函数时就无需显式指定)。

这是我在 Swift 5.5 中更新答案的尝试:

import Foundation

extension String {

    /// Returns the range of the word at the given position, where words are the
    /// substrings reported by `enumerateSubstrings` with the `.byWords` option.
    ///
    /// - Parameter wordAt: Position of the wanted word; the first word is 1.
    /// - Returns: The word's range, or `nil` when no word has that position.
    func rangeOfWord(atPosition wordAt: Int) -> Range<String.Index>? {
        var match: Range<String.Index>?
        var wordsSeen = 0

        enumerateSubstrings(in: startIndex..<endIndex, options: .byWords) { _, wordRange, _, stop in
            wordsSeen += 1
            guard wordsSeen == wordAt else { return }
            // Found it: remember the range and end the enumeration early.
            match = wordRange
            stop = true
        }

        return match
    }
}

// Example: report the eighth word of the sample sentence.
let lorem = "Morbi leo risus, porta ac consectetur ac, vestibulum at eros."

switch lorem.rangeOfWord(atPosition: 8) {
case .some(let found):
    print("found: \(lorem[found])")
case .none:
    print("not found.")
}

此解决方案不会创建一个新数组来包含单词,因此使用的内存更少(我没有测试过,但理论上它应该使用更少的内存)。尽可能使用内置方法,因此出现错误的可能性较小。