如何获取句子的上下文
How to get context of a sentence
所以我正在开发一个可以修补损坏单词的应用程序。
让我们采取:
mny people say there is a error in this sentence
有了 swift 在这里我们可以 UITextChecker
并得到 mny
这个词实际上可能是什么的美妙结果......但是,我实际上有几个选择,其中一个是 many
,另一个是 money
,所以很明显,money 不适合用在这句话中。有没有办法检查句子本身是否合乎逻辑?
认为这还有待改进。我更新了 this swift 3 solution to Swift 5. Worth to mention that it was originally inspired by this python tutorial
创建一个新的 iOS 项目,添加一个名为 bigtext.txt
的文本文件,其中将包含 this text。这将是我们的 "learning" 字典。
然后在 ViewController
:
import UIKit
import NaturalLanguage
class ViewController: UIViewController {
override func viewDidLoad() {
super.viewDidLoad()
let inputString = "mny people say there is a error in this sentence"
var newString = inputString
// Read a text file and "study" the model
guard let path = Bundle.main.path(forResource: "bigtext", ofType: "txt") else {
print("Path not available")
return
}
let checker = SpellChecker(contentsOfFile: path)
// better to use this to iterate between words in a sentence
let tokenizer = NLTokenizer(unit: .word)
tokenizer.string = inputString
tokenizer.enumerateTokens(in: inputString.startIndex..<inputString.endIndex) { tokenRange, _ in
let word = String(inputString[tokenRange])
let checked = checker?.correct(word: word)
let candidates = checker?.candidates(word: word)
if word == checked {
print("\(word) unchanged")
} else {
if let checked = checked {
newString.replaceSubrange(tokenRange, with: checked)
}
print("Correct:\t\(word) -> \(String(describing: checked))")
print("Candidates:\t\(word) -> \(String(describing: candidates))")
}
return true
}
print("Result: \(newString)")
}
}
func edits(word: String) -> Set<String> {
if word.isEmpty { return [] }
let splits = word.indices.map {
(word[word.startIndex..<[=10=]], word[[=10=]..<word.endIndex])
}
let deletes = splits.map { [=10=].0 + String([=10=].1.dropFirst()) }
let transposes: [String] = splits.map { left, right in
if let fst = right.first {
let drop1 = String(right.dropFirst())
if let snd = drop1.first {
let drop2 = String(drop1.dropFirst())
return "\(left)\(snd)\(fst)\(drop2)"
}
}
return ""
}.filter { ![=10=].isEmpty }
let alphabet = "abcdefghijklmnopqrstuvwxyz"
let replaces = splits.flatMap { left, right in
alphabet.map { "\(left)\([=10=])\(String(right.dropFirst()))" }
}
let inserts = splits.flatMap { left, right in
alphabet.map { "\(left)\([=10=])\(right)" }
}
let setString = [String(deletes.first!)] + transposes + replaces + inserts
return Set(setString)
}
struct SpellChecker {
var knownWords: [String:Int] = [:]
mutating func train(word: String) {
if let idx = knownWords[word] {
knownWords[word] = idx + 1
}
else {
knownWords[word] = 1
}
}
init?(contentsOfFile file: String) {
do {
let text = try String(contentsOfFile: file, encoding: .utf8).lowercased()
let words = text.unicodeScalars.split(whereSeparator: { !("a"..."z").contains([=10=]) }).map { String([=10=]) }
for word in words { self.train(word: word) }
}
catch {
return nil
}
}
func knownEdits2(word: String) -> Set<String>? {
var known_edits: Set<String> = []
for edit in edits(word: word) {
if let k = known(words: edits(word: edit)) {
known_edits.formUnion(k)
}
}
return known_edits.isEmpty ? nil : known_edits
}
func known<S: Sequence>(words: S) -> Set<String>? where S.Iterator.Element == String {
let s = Set(words.filter { self.knownWords.index(forKey: [=10=]) != nil })
return s.isEmpty ? nil : s
}
func candidates(word: String) -> Set<String> {
guard let result = known(words: [word]) ?? known(words: edits(word: word)) ?? knownEdits2(word: word) else {
return Set<String>()
}
return result
}
func correct(word: String) -> String {
return candidates(word: word).reduce(word) {
(knownWords[[=10=]] ?? 1) < (knownWords[] ?? 1) ? : [=10=]
}
}
}
会输出你:
Correct: mny -> Optional("may")
Candidates: mny -> Optional(Set(["any", "ny", "may", "many"]))
people unchanged
say unchanged
there unchanged
is unchanged
a unchanged
error unchanged
in unchanged
this unchanged
sentence unchanged
Result: may people say there is a error in this sentence
请考虑我们采用了第一个更正候选。
需要先弄清楚词序,理解句子的上下文。
所以我正在开发一个可以修补损坏单词的应用程序。
让我们采取:
mny people say there is a error in this sentence
有了 swift 在这里我们可以 UITextChecker
并得到 mny
这个词实际上可能是什么的美妙结果......但是,我实际上有几个选择,其中一个是 many
,另一个是 money
,所以很明显,money 不适合用在这句话中。有没有办法检查句子本身是否合乎逻辑?
认为这还有待改进。我更新了 this swift 3 solution to Swift 5. Worth to mention that it was originally inspired by this python tutorial
创建一个新的 iOS 项目,添加一个名为 bigtext.txt
的文本文件,其中将包含 this text。这将是我们的 "learning" 字典。
然后在 ViewController
:
import UIKit
import NaturalLanguage
class ViewController: UIViewController {
override func viewDidLoad() {
super.viewDidLoad()
let inputString = "mny people say there is a error in this sentence"
var newString = inputString
// Read a text file and "study" the model
guard let path = Bundle.main.path(forResource: "bigtext", ofType: "txt") else {
print("Path not available")
return
}
let checker = SpellChecker(contentsOfFile: path)
// better to use this to iterate between words in a sentence
let tokenizer = NLTokenizer(unit: .word)
tokenizer.string = inputString
tokenizer.enumerateTokens(in: inputString.startIndex..<inputString.endIndex) { tokenRange, _ in
let word = String(inputString[tokenRange])
let checked = checker?.correct(word: word)
let candidates = checker?.candidates(word: word)
if word == checked {
print("\(word) unchanged")
} else {
if let checked = checked {
newString.replaceSubrange(tokenRange, with: checked)
}
print("Correct:\t\(word) -> \(String(describing: checked))")
print("Candidates:\t\(word) -> \(String(describing: candidates))")
}
return true
}
print("Result: \(newString)")
}
}
func edits(word: String) -> Set<String> {
if word.isEmpty { return [] }
let splits = word.indices.map {
(word[word.startIndex..<[=10=]], word[[=10=]..<word.endIndex])
}
let deletes = splits.map { [=10=].0 + String([=10=].1.dropFirst()) }
let transposes: [String] = splits.map { left, right in
if let fst = right.first {
let drop1 = String(right.dropFirst())
if let snd = drop1.first {
let drop2 = String(drop1.dropFirst())
return "\(left)\(snd)\(fst)\(drop2)"
}
}
return ""
}.filter { ![=10=].isEmpty }
let alphabet = "abcdefghijklmnopqrstuvwxyz"
let replaces = splits.flatMap { left, right in
alphabet.map { "\(left)\([=10=])\(String(right.dropFirst()))" }
}
let inserts = splits.flatMap { left, right in
alphabet.map { "\(left)\([=10=])\(right)" }
}
let setString = [String(deletes.first!)] + transposes + replaces + inserts
return Set(setString)
}
struct SpellChecker {
var knownWords: [String:Int] = [:]
mutating func train(word: String) {
if let idx = knownWords[word] {
knownWords[word] = idx + 1
}
else {
knownWords[word] = 1
}
}
init?(contentsOfFile file: String) {
do {
let text = try String(contentsOfFile: file, encoding: .utf8).lowercased()
let words = text.unicodeScalars.split(whereSeparator: { !("a"..."z").contains([=10=]) }).map { String([=10=]) }
for word in words { self.train(word: word) }
}
catch {
return nil
}
}
func knownEdits2(word: String) -> Set<String>? {
var known_edits: Set<String> = []
for edit in edits(word: word) {
if let k = known(words: edits(word: edit)) {
known_edits.formUnion(k)
}
}
return known_edits.isEmpty ? nil : known_edits
}
func known<S: Sequence>(words: S) -> Set<String>? where S.Iterator.Element == String {
let s = Set(words.filter { self.knownWords.index(forKey: [=10=]) != nil })
return s.isEmpty ? nil : s
}
func candidates(word: String) -> Set<String> {
guard let result = known(words: [word]) ?? known(words: edits(word: word)) ?? knownEdits2(word: word) else {
return Set<String>()
}
return result
}
func correct(word: String) -> String {
return candidates(word: word).reduce(word) {
(knownWords[[=10=]] ?? 1) < (knownWords[] ?? 1) ? : [=10=]
}
}
}
会输出你:
Correct: mny -> Optional("may")
Candidates: mny -> Optional(Set(["any", "ny", "may", "many"]))
people unchanged
say unchanged
there unchanged
is unchanged
a unchanged
error unchanged
in unchanged
this unchanged
sentence unchanged
Result: may people say there is a error in this sentence
请考虑我们采用了第一个更正候选。 需要先弄清楚词序,理解句子的上下文。