优化从字符串中提取链接的代码
Extract links from string optimization
我从网站获取数据(HTML 字符串)。我想提取所有链接。我写了函数(它可以工作),但是它太慢了...
你能帮我优化一下吗?我可以使用哪些标准功能?
函数逻辑:在 text 中找到 "http://" 字符串,然后逐个字符读取,直到遇到 "\""(双引号)为止。
// Integer-based subscripts for String, built on `advance` from `startIndex`.
// NOTE: every access advances from the start of the string, so calling these
// repeatedly inside a loop repeats that walk each time.
extension String {
// Returns the Character found `i` positions after `startIndex`.
subscript (i: Int) -> Character {
return self[advance(self.startIndex, i)]
}
// Returns the same element as the Character subscript, wrapped in a String.
subscript (i: Int) -> String {
return String(self[i] as Character)
}
// Returns the substring for the integer range `r`; both bounds are measured from `startIndex`.
subscript (r: Range<Int>) -> String {
return substringWithRange(Range(start: advance(startIndex, r.startIndex), end: advance(startIndex, r.endIndex)))
}}
// Scans `text` for the literal prefix "http://" and, for each hit, collects the characters
// from the start of the prefix up to (not including) the next `"` character.
// - Parameter text: the string to search.
// - Returns: the collected substrings, in the order they were found.
// NOTE(review): `countElements(...)` and the Int subscripts are re-evaluated on every
// iteration; the surrounding discussion identifies this as the cause of the slowness.
func extractAllLinks(text:String) -> Array<String>{
var stringArray = Array<String>()
var find = "http://" as String
// `i` runs `countElements(find)` ahead of the text offset actually inspected, so
// `i - countElements(find)` starts at offset 0.
for (var i = countElements(find); i<countElements(text); i++)
{
var ch:Character = text[i - Int(countElements(find))]
// Only start a prefix comparison when the current character equals the first prefix character.
if (ch == find[0])
{
// `j` counts how many prefix characters have matched so far.
var j = 0
while (ch == find[j])
{
// `ch2` is assigned here but never read afterwards.
var ch2:Character = find[j]
// Stop once the last prefix character has been compared successfully.
if(countElements(find)-1 == j)
{
break
}
j++
i++
ch = text[i - Int(countElements(find))]
}
// Rewind `i` to the position where the comparison started.
i -= j
// NOTE(review): the loop above can also exit with j == countElements(find)-1 when the
// final prefix character does NOT match — confirm whether that is intended.
if (j == (countElements(find)-1))
{
var str = ""
// Copy characters until a double quote is reached.
// NOTE(review): there is no end-of-string check here; this presumably relies on a
// closing quote always being present — verify against real input.
for (; text[i - Int(countElements(find))] != "\""; i++)
{
str += text[i - Int(countElements(find))]
}
stringArray.append(str)
}
}
}
return stringArray}
实际上有一个名为 NSDataDetector
的 class 可以为您检测 link。
您可以在 NSHipster 上找到它的示例:http://nshipster.com/nsdatadetector/
我想知道您是否意识到每次调用 countElements 时都会调用一个主要的复杂函数,该函数必须扫描字符串中的所有 Unicode 字符,并从中提取扩展字素簇并对其进行计数。如果您不知道什么是扩展字素簇,那么您应该能够想象这并不便宜而且是大材小用。
只需将其转换为 NSString*,调用 rangeOfString 即可。
显然,你的做法完全不可靠,因为出现 http:// 并不意味着那里就有一个链接。你不能只在 HTML 中查找某个字符串就指望它管用;这样行不通。此外还有 https、Http、hTtp、htTp、httP 等各种写法。不过这些都还容易处理,真正可怕的问题请参见 Uttam Sinha 评论中的链接。
就像上面AdamPro13说的那样,使用NSDataDetector
就可以很容易的得到所有的url,看下面的代码:
// Original (pre-Swift 2.0) version: detect every link in `text` with NSDataDetector and print it.
let text = "http://www.google.com. http://www.bla.com"
let types: NSTextCheckingType = .Link
var error: NSError?
// The initializer can fail, so bind the result instead of force-unwrapping it.
if let detector = NSDataDetector(types: types.rawValue, error: &error) {
    // NSRange is measured in UTF-16 code units, so pass the UTF-16 length; the Character
    // count is shorter whenever the text contains emoji or similar characters.
    let matches = detector.matchesInString(text, options: nil, range: NSMakeRange(0, count(text.utf16)))
    for match in matches {
        println(match.URL!)
    }
} else {
    println("Could not create the link detector: \(error)")
}
它输出:
http://www.google.com
http://www.bla.com
Updated to Swift 2.0
// Swift 2.0 version: detect every link in `text` with NSDataDetector and print it.
let text = "http://www.google.com. http://www.bla.com"
let types: NSTextCheckingType = .Link
let detector = try? NSDataDetector(types: types.rawValue)
// `guard ... else { return }` is only valid inside a function, so this snippet has to
// live in one.
guard let detect = detector else {
    return
}
// NSRange is measured in UTF-16 code units; `text.characters.count` would be too short
// whenever the text contains emoji or other multi-unit characters.
let matches = detect.matchesInString(text, options: .ReportCompletion, range: NSMakeRange(0, text.utf16.count))
for match in matches {
    // Skip a match without a URL instead of crashing on a force-unwrap.
    if let url = match.URL {
        print(url)
    }
}
请记住,上面的示例使用了 guard 语句,
因此这段代码必须放在函数或循环内部。
希望对您有所帮助。
正如其他人所指出的,您最好使用正则表达式、数据检测器或解析库。但是,作为对您的字符串处理的具体反馈:
Swift 字符串的关键是拥抱它们的只进性质。通常情况下,整数索引和随机访问是不必要的。正如@gnasher729 指出的那样,每次调用 count
时,您都在遍历字符串。同样,整数索引扩展是线性的,因此如果您在循环中使用它们,您很容易不小心创建二次或三次复杂度算法。
但在这种情况下,根本没有必要费那么大劲把字符串索引转换为可随机访问的整数。我认为下面这个版本只使用原生字符串索引就能实现类似的逻辑(先查找前缀,再从该处查找 " 字符——这里忽略了它不适用于 https、大小写不同等情况):
/// Returns every substring of `text` that starts with "http://" and runs up to
/// (not including) the next `"` character, using only native String indices.
///
/// Does not handle https, mixed-case schemes, or links that are not followed by a quote.
///
/// - Parameter text: the string to search.
/// - Returns: the links in the order they appear.
func extractAllLinks(text: String) -> [String] {
    var links: [String] = []
    let prefix = "http://"
    for var idx = text.startIndex; idx != text.endIndex; ++idx {
        // Look at the remainder of the text starting at the current position.
        let candidate = text[idx..<text.endIndex]
        if candidate.hasPrefix(prefix),
            let closingQuote = find(candidate, "\"") {
                let link = candidate[candidate.startIndex..<closingQuote]
                links.append(link)
                // Jump to the closing quote; the loop's ++idx then steps past it.
                idx = advance(idx, count(link))
        }
    }
    return links
}

// Sample input with two quoted links.
let text = "This contains the link \"http://www.whatever.com/\" and"
    + " the link \"http://google.com\""
extractAllLinks(text)
如果有其他辅助函数(例如 findFromIndex 等),
或者愿意放弃字符串切片、手写对结束字符的搜索,这段代码还可以更高效。
非常有帮助的帖子!这是一个在 Swift 1.2 中有效的示例,基于 Victor Sigler 的回答。
// extract first link (if available) and open it!
let text = "How technology is changing our relationships to each other: http://t.ted.com/mzRtRfX"
let types: NSTextCheckingType = .Link
do {
    let detector = try NSDataDetector(types: types.rawValue)
    // NSRange is measured in UTF-16 code units, not Characters.
    let matches = detector.matchesInString(text, options: .ReportCompletion, range: NSMakeRange(0, text.utf16.count))
    // An empty result just means no link was found; there is nothing to open in that case.
    if let url = matches.first?.URL {
        print("Opening URL: \(url)")
        UIApplication.sharedApplication().openURL(url)
    }
} catch {
    // Only reached when the detector itself could not be created.
    print ("error in findAndOpenURL detector")
}
这就是 Swift 5.0
的答案
let text = "http://www.google.com. http://www.bla.com"

/// Finds every link in `text` using `NSDataDetector`.
///
/// - Parameter text: The text to scan.
/// - Returns: The URLs of all detected links, in order of appearance; empty when none are
///   found or when the detector cannot be created.
func checkForUrls(text: String) -> [URL] {
    let types: NSTextCheckingResult.CheckingType = .link
    do {
        let detector = try NSDataDetector(types: types.rawValue)
        // NSRange counts UTF-16 code units; `text.count` (Characters) would cut the range
        // short whenever the text contains emoji or other multi-unit characters.
        let fullRange = NSRange(text.startIndex..<text.endIndex, in: text)
        let matches = detector.matches(in: text, options: .reportCompletion, range: fullRange)
        return matches.compactMap { $0.url }
    } catch {
        debugPrint(error.localizedDescription)
        return []
    }
}

checkForUrls(text: text)
详情
- Swift5.2,Xcode11.4 (11E146)
解决方案
// MARK: DataDetector
/// Thin wrapper around `NSDataDetector` that returns the matched substrings.
class DataDetector {

    /// Runs a detector for `type` over `string` and passes each matched substring to
    /// `iterationClosure`, stopping as soon as the closure returns `false`.
    /// Does nothing when the detector cannot be created.
    private class func _find(all type: NSTextCheckingResult.CheckingType,
                             in string: String, iterationClosure: (String) -> Bool) {
        guard let detector = try? NSDataDetector(types: type.rawValue) else { return }
        let range = NSRange(string.startIndex ..< string.endIndex, in: string)
        let matches = detector.matches(in: string, options: [], range: range)
        loop: for match in matches {
            for i in 0 ..< match.numberOfRanges {
                // NSRange offsets are UTF-16 code units. Convert through Range(_:in:)
                // rather than offsetting by Characters, which yields the wrong substring
                // (or traps) once the text contains emoji or other multi-unit characters.
                guard let matchRange = Range(match.range(at: i), in: string) else { continue }
                guard iterationClosure(String(string[matchRange])) else { break loop }
            }
        }
    }

    /// Returns every substring of `string` detected as `type`, in order of appearance.
    class func find(all type: NSTextCheckingResult.CheckingType, in string: String) -> [String] {
        var results = [String]()
        _find(all: type, in: string) {
            results.append($0)
            return true
        }
        return results
    }

    /// Returns the first substring of `string` detected as `type`, or `nil` when there is none.
    class func first(type: NSTextCheckingResult.CheckingType, in string: String) -> String? {
        var result: String?
        _find(all: type, in: string) {
            result = $0
            // Returning false stops the iteration after the first match.
            return false
        }
        return result
    }
}
// MARK: String extension
extension String {
    /// All links detected in the string, as the matched substrings.
    var detectedLinks: [String] { DataDetector.find(all: .link, in: self) }

    /// The first detected link, or `nil` when there is none.
    var detectedFirstLink: String? { DataDetector.first(type: .link, in: self) }

    /// The detected links that `URL(string:)` is able to parse.
    var detectedURLs: [URL] { detectedLinks.compactMap { URL(string: $0) } }

    /// The first detected link as a `URL`, or `nil` when none is found or it cannot be parsed.
    var detectedFirstURL: URL? {
        guard let urlString = detectedFirstLink else { return nil }
        return URL(string: urlString)
    }
}
用法
// Sample text that mentions "apple.com/", "http://gooogle.com" and "yahoo.com" inside filler prose.
let text = """
Lorm Ipsum is simply dummy text of the printing and typesetting industry. apple.com/ Lorem Ipsum has been the industry's standard dummy text ever since the 1500s, when an unknown printer took a galley of type and scrambled it to make a type specimen book. http://gooogle.com. It has survived not only five centuries, but also the leap into electronic typesetting, remaining essentially unchanged. yahoo.com It was popularised in the 1960s with the release of Letraset sheets containing Lorem Ipsum passages, and more recently with desktop publishing software like Aldus PageMaker including versions of Lorem Ipsum.
"""
// Matched substrings as [String].
print(text.detectedLinks)
// String? — printed in its Optional(...) form.
print(text.detectedFirstLink)
// The same matches converted to URL values.
print(text.detectedURLs)
// URL? — printed in its Optional(...) form.
print(text.detectedFirstURL)
控制台输出
["apple.com/", "http://gooogle.com", "yahoo.com"]
Optional("apple.com/")
[apple.com/, http://gooogle.com, yahoo.com]
Optional(apple.com/)
我从网站获取数据(HTML 字符串)。我想提取所有链接。我写了函数(它可以工作),但是它太慢了...
你能帮我优化一下吗?我可以使用哪些标准功能? 函数逻辑:在 text 中找到 "http://" 字符串,然后逐个字符读取,直到遇到 "\""(双引号)为止。
// Integer-based subscripts for String, built on `advance` from `startIndex`.
// NOTE: every access advances from the start of the string, so calling these
// repeatedly inside a loop repeats that walk each time.
extension String {
// Returns the Character found `i` positions after `startIndex`.
subscript (i: Int) -> Character {
return self[advance(self.startIndex, i)]
}
// Returns the same element as the Character subscript, wrapped in a String.
subscript (i: Int) -> String {
return String(self[i] as Character)
}
// Returns the substring for the integer range `r`; both bounds are measured from `startIndex`.
subscript (r: Range<Int>) -> String {
return substringWithRange(Range(start: advance(startIndex, r.startIndex), end: advance(startIndex, r.endIndex)))
}}
// Scans `text` for the literal prefix "http://" and, for each hit, collects the characters
// from the start of the prefix up to (not including) the next `"` character.
// - Parameter text: the string to search.
// - Returns: the collected substrings, in the order they were found.
// NOTE(review): `countElements(...)` and the Int subscripts are re-evaluated on every
// iteration; the surrounding discussion identifies this as the cause of the slowness.
func extractAllLinks(text:String) -> Array<String>{
var stringArray = Array<String>()
var find = "http://" as String
// `i` runs `countElements(find)` ahead of the text offset actually inspected, so
// `i - countElements(find)` starts at offset 0.
for (var i = countElements(find); i<countElements(text); i++)
{
var ch:Character = text[i - Int(countElements(find))]
// Only start a prefix comparison when the current character equals the first prefix character.
if (ch == find[0])
{
// `j` counts how many prefix characters have matched so far.
var j = 0
while (ch == find[j])
{
// `ch2` is assigned here but never read afterwards.
var ch2:Character = find[j]
// Stop once the last prefix character has been compared successfully.
if(countElements(find)-1 == j)
{
break
}
j++
i++
ch = text[i - Int(countElements(find))]
}
// Rewind `i` to the position where the comparison started.
i -= j
// NOTE(review): the loop above can also exit with j == countElements(find)-1 when the
// final prefix character does NOT match — confirm whether that is intended.
if (j == (countElements(find)-1))
{
var str = ""
// Copy characters until a double quote is reached.
// NOTE(review): there is no end-of-string check here; this presumably relies on a
// closing quote always being present — verify against real input.
for (; text[i - Int(countElements(find))] != "\""; i++)
{
str += text[i - Int(countElements(find))]
}
stringArray.append(str)
}
}
}
return stringArray}
实际上有一个名为 NSDataDetector
的 class 可以为您检测 link。
您可以在 NSHipster 上找到它的示例:http://nshipster.com/nsdatadetector/
我想知道您是否意识到每次调用 countElements 时都会调用一个主要的复杂函数,该函数必须扫描字符串中的所有 Unicode 字符,并从中提取扩展字素簇并对其进行计数。如果您不知道什么是扩展字素簇,那么您应该能够想象这并不便宜而且是大材小用。
只需将其转换为 NSString*,调用 rangeOfString 即可。
显然,你的做法完全不可靠,因为出现 http:// 并不意味着那里就有一个链接。你不能只在 HTML 中查找某个字符串就指望它管用;这样行不通。此外还有 https、Http、hTtp、htTp、httP 等各种写法。不过这些都还容易处理,真正可怕的问题请参见 Uttam Sinha 评论中的链接。
就像上面AdamPro13说的那样,使用NSDataDetector
就可以很容易的得到所有的url,看下面的代码:
// Original (pre-Swift 2.0) version: detect every link in `text` with NSDataDetector and print it.
let text = "http://www.google.com. http://www.bla.com"
let types: NSTextCheckingType = .Link
var error: NSError?
// The initializer can fail, so bind the result instead of force-unwrapping it.
if let detector = NSDataDetector(types: types.rawValue, error: &error) {
    // NSRange is measured in UTF-16 code units, so pass the UTF-16 length; the Character
    // count is shorter whenever the text contains emoji or similar characters.
    let matches = detector.matchesInString(text, options: nil, range: NSMakeRange(0, count(text.utf16)))
    for match in matches {
        println(match.URL!)
    }
} else {
    println("Could not create the link detector: \(error)")
}
它输出:
http://www.google.com
http://www.bla.com
Updated to Swift 2.0
// Swift 2.0 version: detect every link in `text` with NSDataDetector and print it.
let text = "http://www.google.com. http://www.bla.com"
let types: NSTextCheckingType = .Link
let detector = try? NSDataDetector(types: types.rawValue)
// `guard ... else { return }` is only valid inside a function, so this snippet has to
// live in one.
guard let detect = detector else {
    return
}
// NSRange is measured in UTF-16 code units; `text.characters.count` would be too short
// whenever the text contains emoji or other multi-unit characters.
let matches = detect.matchesInString(text, options: .ReportCompletion, range: NSMakeRange(0, text.utf16.count))
for match in matches {
    // Skip a match without a URL instead of crashing on a force-unwrap.
    if let url = match.URL {
        print(url)
    }
}
请记住,上面的示例使用了 guard 语句,
因此这段代码必须放在函数或循环内部。
希望对您有所帮助。
正如其他人所指出的,您最好使用正则表达式、数据检测器或解析库。但是,作为对您的字符串处理的具体反馈:
Swift 字符串的关键是拥抱它们的只进性质。通常情况下,整数索引和随机访问是不必要的。正如@gnasher729 指出的那样,每次调用 count
时,您都在遍历字符串。同样,整数索引扩展是线性的,因此如果您在循环中使用它们,您很容易不小心创建二次或三次复杂度算法。
但在这种情况下,根本没有必要费那么大劲把字符串索引转换为可随机访问的整数。我认为下面这个版本只使用原生字符串索引就能实现类似的逻辑(先查找前缀,再从该处查找 " 字符——这里忽略了它不适用于 https、大小写不同等情况):
/// Returns every substring of `text` that starts with "http://" and runs up to
/// (not including) the next `"` character, using only native String indices.
///
/// Does not handle https, mixed-case schemes, or links that are not followed by a quote.
///
/// - Parameter text: the string to search.
/// - Returns: the links in the order they appear.
func extractAllLinks(text: String) -> [String] {
    var links: [String] = []
    let prefix = "http://"
    for var idx = text.startIndex; idx != text.endIndex; ++idx {
        // Look at the remainder of the text starting at the current position.
        let candidate = text[idx..<text.endIndex]
        if candidate.hasPrefix(prefix),
            let closingQuote = find(candidate, "\"") {
                let link = candidate[candidate.startIndex..<closingQuote]
                links.append(link)
                // Jump to the closing quote; the loop's ++idx then steps past it.
                idx = advance(idx, count(link))
        }
    }
    return links
}

// Sample input with two quoted links.
let text = "This contains the link \"http://www.whatever.com/\" and"
    + " the link \"http://google.com\""
extractAllLinks(text)
如果有其他辅助函数(例如 findFromIndex 等),
或者愿意放弃字符串切片、手写对结束字符的搜索,这段代码还可以更高效。
非常有帮助的帖子!这是一个在 Swift 1.2 中有效的示例,基于 Victor Sigler 的回答。
// extract first link (if available) and open it!
let text = "How technology is changing our relationships to each other: http://t.ted.com/mzRtRfX"
let types: NSTextCheckingType = .Link
do {
    let detector = try NSDataDetector(types: types.rawValue)
    // NSRange is measured in UTF-16 code units, not Characters.
    let matches = detector.matchesInString(text, options: .ReportCompletion, range: NSMakeRange(0, text.utf16.count))
    // An empty result just means no link was found; there is nothing to open in that case.
    if let url = matches.first?.URL {
        print("Opening URL: \(url)")
        UIApplication.sharedApplication().openURL(url)
    }
} catch {
    // Only reached when the detector itself could not be created.
    print ("error in findAndOpenURL detector")
}
这就是 Swift 5.0
的答案
let text = "http://www.google.com. http://www.bla.com"
/// Finds every link in `text` using `NSDataDetector`.
///
/// - Parameter text: The text to scan.
/// - Returns: The URLs of all detected links, in order of appearance; empty when none are
///   found or when the detector cannot be created.
func checkForUrls(text: String) -> [URL] {
    let types: NSTextCheckingResult.CheckingType = .link
    do {
        let detector = try NSDataDetector(types: types.rawValue)
        // NSRange counts UTF-16 code units; `text.count` (Characters) would cut the range
        // short whenever the text contains emoji or other multi-unit characters.
        let fullRange = NSRange(text.startIndex..<text.endIndex, in: text)
        let matches = detector.matches(in: text, options: .reportCompletion, range: fullRange)
        return matches.compactMap { $0.url }
    } catch {
        debugPrint(error.localizedDescription)
        return []
    }
}
// Calls checkForUrls on the sample `text`; the returned array is not stored.
checkForUrls(text: text)
详情
- Swift5.2,Xcode11.4 (11E146)
解决方案
// MARK: DataDetector
/// Thin wrapper around `NSDataDetector` that returns the matched substrings.
class DataDetector {

    /// Runs a detector for `type` over `string` and passes each matched substring to
    /// `iterationClosure`, stopping as soon as the closure returns `false`.
    /// Does nothing when the detector cannot be created.
    private class func _find(all type: NSTextCheckingResult.CheckingType,
                             in string: String, iterationClosure: (String) -> Bool) {
        guard let detector = try? NSDataDetector(types: type.rawValue) else { return }
        let range = NSRange(string.startIndex ..< string.endIndex, in: string)
        let matches = detector.matches(in: string, options: [], range: range)
        loop: for match in matches {
            for i in 0 ..< match.numberOfRanges {
                // NSRange offsets are UTF-16 code units. Convert through Range(_:in:)
                // rather than offsetting by Characters, which yields the wrong substring
                // (or traps) once the text contains emoji or other multi-unit characters.
                guard let matchRange = Range(match.range(at: i), in: string) else { continue }
                guard iterationClosure(String(string[matchRange])) else { break loop }
            }
        }
    }

    /// Returns every substring of `string` detected as `type`, in order of appearance.
    class func find(all type: NSTextCheckingResult.CheckingType, in string: String) -> [String] {
        var results = [String]()
        _find(all: type, in: string) {
            results.append($0)
            return true
        }
        return results
    }

    /// Returns the first substring of `string` detected as `type`, or `nil` when there is none.
    class func first(type: NSTextCheckingResult.CheckingType, in string: String) -> String? {
        var result: String?
        _find(all: type, in: string) {
            result = $0
            // Returning false stops the iteration after the first match.
            return false
        }
        return result
    }
}
// MARK: String extension
extension String {
    /// All links detected in the string, as the matched substrings.
    var detectedLinks: [String] { DataDetector.find(all: .link, in: self) }

    /// The first detected link, or `nil` when there is none.
    var detectedFirstLink: String? { DataDetector.first(type: .link, in: self) }

    /// The detected links that `URL(string:)` is able to parse.
    var detectedURLs: [URL] { detectedLinks.compactMap { URL(string: $0) } }

    /// The first detected link as a `URL`, or `nil` when none is found or it cannot be parsed.
    var detectedFirstURL: URL? {
        guard let urlString = detectedFirstLink else { return nil }
        return URL(string: urlString)
    }
}
用法
// Sample text that mentions "apple.com/", "http://gooogle.com" and "yahoo.com" inside filler prose.
let text = """
Lorm Ipsum is simply dummy text of the printing and typesetting industry. apple.com/ Lorem Ipsum has been the industry's standard dummy text ever since the 1500s, when an unknown printer took a galley of type and scrambled it to make a type specimen book. http://gooogle.com. It has survived not only five centuries, but also the leap into electronic typesetting, remaining essentially unchanged. yahoo.com It was popularised in the 1960s with the release of Letraset sheets containing Lorem Ipsum passages, and more recently with desktop publishing software like Aldus PageMaker including versions of Lorem Ipsum.
"""
// Matched substrings as [String].
print(text.detectedLinks)
// String? — printed in its Optional(...) form.
print(text.detectedFirstLink)
// The same matches converted to URL values.
print(text.detectedURLs)
// URL? — printed in its Optional(...) form.
print(text.detectedFirstURL)
控制台输出
["apple.com/", "http://gooogle.com", "yahoo.com"]
Optional("apple.com/")
[apple.com/, http://gooogle.com, yahoo.com]
Optional(apple.com/)