在杂乱的字符串数据库上解析和修复时间
Parsing and fixing time on a messy string database
我必须解析数据库中的 10K 个条目。
这个数据库有一个叫做工作时间的字段,它显示了汽车经销商的办公室工作时间。
问题是该字段包含如下描述性内容:
Office working hours from 10 am - 4 pm
Open from 9AM to 5PM
Main Showroom from 10:00AM - 5:00PM
Open 10 AM to 13 PM
Office 10AM to 3PM -- Showroom 9AM to 4PM
所以你可以看到风格各不相同:am 和 pm 有小写也有大写,数字和 am/pm 之间有的带空格、有的不带,小时有的带零和冒号(如 10:00)、有的不带,甚至还有 13 PM 这种把两种风格混在一起而产生的错误时间。换句话说,一团糟。另外,每行还可能包含多个时间范围。
我想把整个东西转换成 24 小时格式,比如。
Office working hours from 10:00 - 16:00 hours
Open from 9:00 to 17:00 hours
Main Showroom from 10:00 - 17:00 hours
Open 10:00 to 13:00 hours
Office 10:00 to 15:00 -- Showroom 9:00 to 16:00 hours
我可以针对每个小时写无数个这样的 if:
// Naive approach: one hard-coded check per hour and per spelling variant.
if ([text containsString:@"7 PM"]) {
// Rewrites every literal "7 PM" occurrence in text as "19:00".
text = [text stringByReplacingOccurrencesOfString:@"7 PM" withString:@"19:00"];
}
但这样会写出无数行代码,而且效率不高。我还得分别测试大写、小写、有无空格以及错误的条目。
一定有更简单的方法...
有什么想法吗?
NSRegularExpression 和 NSDateFormatter 的组合可以胜任这项工作。结果是这样的:
13PM 需要手动编辑,或者可能有自动修复的方法。
正则表达式并不完美,它还会匹配到 36 AM 之类的内容,但这种情况下日期格式化程序会返回 nil。
这是代码,准备运行:
// Demo script: finds every 12-hour time ("10 am", "9AM", "5:00PM", ...) in the
// sample strings, rewrites it as a 24-hour "HH:mm" time, and logs the results.
// Times the date formatter cannot parse (such as "13 PM") are left unchanged.
NSString *test1 = @"Office working hours from 10 am - 4 pm";
NSString *test2 = @"Open from 9AM to 5PM";
NSString *test3 = @"Main Showroom from 10:00AM - 5:00PM";
NSString *test4 = @"Open 10 AM to 13 PM";
NSString *test5 = @"Office 10AM to 3PM -- Showroom 9AM to 4PM";
NSMutableArray<NSString *> *newStrings = [NSMutableArray array];

// [0-9]+        -> one or more digits (the hour)
// (?::[0-9]+)?  -> optionally ":" followed by one or more digits (the minutes)
// ( )*          -> zero or more spaces
// (am|pm)       -> meridiem; the case-insensitive option also accepts aM, Pm, AM, pm
NSString *hourPattern = @"([0-9]+(?::[0-9]+)?( )*(am|pm))";
NSError *error = nil;
// Only NSRegularExpressionOptions flags belong here. NSRegularExpressionSearch is an
// NSString compare option and must not be OR-ed into this mask.
NSRegularExpression *miniFormatter =
    [NSRegularExpression regularExpressionWithPattern:hourPattern
                                              options:NSRegularExpressionCaseInsensitive
                                                error:&error];
// Failure is signalled by the nil return value, not by the error pointer.
if (!miniFormatter)
{
    NSLog(@"%@", error.localizedDescription);
    return;
}

// The formatters are loop-invariant, so build them once. A fixed-format parser must not
// depend on the user's region or 12/24-hour preference, hence the en_US_POSIX locale.
NSLocale *posixLocale = [[NSLocale alloc] initWithLocaleIdentifier:@"en_US_POSIX"];
NSTimeZone *timeZone = [NSTimeZone timeZoneWithName:@"GMT"]; // You may change it accordingly.

NSDateFormatter *twelveHourParser = [[NSDateFormatter alloc] init];
twelveHourParser.locale = posixLocale;
twelveHourParser.timeZone = timeZone;

NSDateFormatter *twentyFourHourWriter = [[NSDateFormatter alloc] init];
twentyFourHourWriter.locale = posixLocale;
twentyFourHourWriter.timeZone = timeZone;
twentyFourHourWriter.dateFormat = @"HH:mm";

for (NSString *text in @[test1, test2, test3, test4, test5])
{
    NSArray<NSTextCheckingResult *> *matches =
        [miniFormatter matchesInString:text
                               options:0
                                 range:NSMakeRange(0, text.length)];
    NSMutableString *textToChange = [text mutableCopy];

    // Walk the matches back to front and replace by exact range. Replacing a later match
    // never shifts the range of an earlier one, and no text search is needed. A search
    // could hit the wrong spot, e.g. "3 PM" inside a skipped "13 PM".
    for (NSTextCheckingResult *result in [matches reverseObjectEnumerator])
    {
        NSString *foundTime = [text substringWithRange:result.range];
        // Step 1: Remove whitespace for parsing.
        foundTime = [foundTime stringByReplacingOccurrencesOfString:@" " withString:@""];
        // Step 2: Make am/pm uppercase.
        foundTime = [foundTime uppercaseString];
        // Step 3: Detect if it's in hh:mm format or hh format.
        twelveHourParser.dateFormat = [foundTime containsString:@":"] ? @"hh:mma" : @"hha";
        NSDate *foundDate = [twelveHourParser dateFromString:foundTime];
        if (!foundDate)
        {
            // Not a valid 12-hour time (such as 13PM or 36AM); leave the original text
            // in place so it can be fixed manually.
            continue;
        }
        // Step 4: Convert to 24-hour and splice it in at the matched position.
        NSString *convertedTime = [twentyFourHourWriter stringFromDate:foundDate];
        [textToChange replaceCharactersInRange:result.range withString:convertedTime];
    }
    [newStrings addObject:[textToChange copy]];
}

for (NSString *text in newStrings)
{
    NSLog(@"%@", text);
}
希望对您有所帮助。
我必须解析数据库中的 10K 个条目。
这个数据库有一个叫做工作时间的字段,它显示了汽车经销商的办公室工作时间。
问题是该字段包含如下描述性内容:
Office working hours from 10 am - 4 pm
Open from 9AM to 5PM
Main Showroom from 10:00AM - 5:00PM
Open 10 AM to 13 PM
Office 10AM to 3PM -- Showroom 9AM to 4PM
所以你可以看到风格各不相同:am 和 pm 有小写也有大写,数字和 am/pm 之间有的带空格、有的不带,小时有的带零和冒号(如 10:00)、有的不带,甚至还有 13 PM 这种把两种风格混在一起而产生的错误时间。换句话说,一团糟。另外,每行还可能包含多个时间范围。
我想把整个东西转换成 24 小时格式,比如。
Office working hours from 10:00 - 16:00 hours
Open from 9:00 to 17:00 hours
Main Showroom from 10:00 - 17:00 hours
Open 10:00 to 13:00 hours
Office 10:00 to 15:00 -- Showroom 9:00 to 16:00 hours
我可以针对每个小时写无数个这样的 if:
// Naive approach: one hard-coded check per hour and per spelling variant.
if ([text containsString:@"7 PM"]) {
// Rewrites every literal "7 PM" occurrence in text as "19:00".
text = [text stringByReplacingOccurrencesOfString:@"7 PM" withString:@"19:00"];
}
但这样会写出无数行代码,而且效率不高。我还得分别测试大写、小写、有无空格以及错误的条目。
一定有更简单的方法...
有什么想法吗?
NSRegularExpression 和 NSDateFormatter 的组合可以胜任这项工作。结果是这样的:
13PM 需要手动编辑,或者可能有自动修复的方法。
正则表达式并不完美,它还会匹配到 36 AM 之类的内容,但这种情况下日期格式化程序会返回 nil。
这是代码,准备运行:
// Demo script: finds every 12-hour time ("10 am", "9AM", "5:00PM", ...) in the
// sample strings, rewrites it as a 24-hour "HH:mm" time, and logs the results.
// Times the date formatter cannot parse (such as "13 PM") are left unchanged.
NSString *test1 = @"Office working hours from 10 am - 4 pm";
NSString *test2 = @"Open from 9AM to 5PM";
NSString *test3 = @"Main Showroom from 10:00AM - 5:00PM";
NSString *test4 = @"Open 10 AM to 13 PM";
NSString *test5 = @"Office 10AM to 3PM -- Showroom 9AM to 4PM";
NSMutableArray<NSString *> *newStrings = [NSMutableArray array];

// [0-9]+        -> one or more digits (the hour)
// (?::[0-9]+)?  -> optionally ":" followed by one or more digits (the minutes)
// ( )*          -> zero or more spaces
// (am|pm)       -> meridiem; the case-insensitive option also accepts aM, Pm, AM, pm
NSString *hourPattern = @"([0-9]+(?::[0-9]+)?( )*(am|pm))";
NSError *error = nil;
// Only NSRegularExpressionOptions flags belong here. NSRegularExpressionSearch is an
// NSString compare option and must not be OR-ed into this mask.
NSRegularExpression *miniFormatter =
    [NSRegularExpression regularExpressionWithPattern:hourPattern
                                              options:NSRegularExpressionCaseInsensitive
                                                error:&error];
// Failure is signalled by the nil return value, not by the error pointer.
if (!miniFormatter)
{
    NSLog(@"%@", error.localizedDescription);
    return;
}

// The formatters are loop-invariant, so build them once. A fixed-format parser must not
// depend on the user's region or 12/24-hour preference, hence the en_US_POSIX locale.
NSLocale *posixLocale = [[NSLocale alloc] initWithLocaleIdentifier:@"en_US_POSIX"];
NSTimeZone *timeZone = [NSTimeZone timeZoneWithName:@"GMT"]; // You may change it accordingly.

NSDateFormatter *twelveHourParser = [[NSDateFormatter alloc] init];
twelveHourParser.locale = posixLocale;
twelveHourParser.timeZone = timeZone;

NSDateFormatter *twentyFourHourWriter = [[NSDateFormatter alloc] init];
twentyFourHourWriter.locale = posixLocale;
twentyFourHourWriter.timeZone = timeZone;
twentyFourHourWriter.dateFormat = @"HH:mm";

for (NSString *text in @[test1, test2, test3, test4, test5])
{
    NSArray<NSTextCheckingResult *> *matches =
        [miniFormatter matchesInString:text
                               options:0
                                 range:NSMakeRange(0, text.length)];
    NSMutableString *textToChange = [text mutableCopy];

    // Walk the matches back to front and replace by exact range. Replacing a later match
    // never shifts the range of an earlier one, and no text search is needed. A search
    // could hit the wrong spot, e.g. "3 PM" inside a skipped "13 PM".
    for (NSTextCheckingResult *result in [matches reverseObjectEnumerator])
    {
        NSString *foundTime = [text substringWithRange:result.range];
        // Step 1: Remove whitespace for parsing.
        foundTime = [foundTime stringByReplacingOccurrencesOfString:@" " withString:@""];
        // Step 2: Make am/pm uppercase.
        foundTime = [foundTime uppercaseString];
        // Step 3: Detect if it's in hh:mm format or hh format.
        twelveHourParser.dateFormat = [foundTime containsString:@":"] ? @"hh:mma" : @"hha";
        NSDate *foundDate = [twelveHourParser dateFromString:foundTime];
        if (!foundDate)
        {
            // Not a valid 12-hour time (such as 13PM or 36AM); leave the original text
            // in place so it can be fixed manually.
            continue;
        }
        // Step 4: Convert to 24-hour and splice it in at the matched position.
        NSString *convertedTime = [twentyFourHourWriter stringFromDate:foundDate];
        [textToChange replaceCharactersInRange:result.range withString:convertedTime];
    }
    [newStrings addObject:[textToChange copy]];
}

for (NSString *text in newStrings)
{
    NSLog(@"%@", text);
}
希望对您有所帮助。