来自 NSInputStream 的字符串不是有效的 UTF-8。如何以更"有损"('lossy')的方式将其转换为 UTF-8?
String from NSInputStream is not valid utf8. How to convert to utf8 more 'lossy'
我有一个从服务器读取数据的应用程序。有时,数据似乎不是有效的 UTF-8。如果我把字节数组转换为 UTF-8 字符串,得到的字符串是 nil。字节数组中一定含有一些无效的(非 UTF-8)字节。有没有办法以 'lossy'(有损)的方式将字节数组转换为 UTF-8,只过滤掉其中的无效字符?
有什么想法吗?
我的代码如下所示:
- (void)stream:(NSStream *)theStream handleEvent:(NSStreamEvent)streamEvent {
switch (streamEvent){
case NSStreamEventHasBytesAvailable:
{
uint8_t buffer[1024];
int len;
NSMutableData * inputData = [NSMutableData data];
while ([directoryStream hasBytesAvailable]){
len = [directoryStream read:buffer maxLength:sizeof(buffer)];
if (len> 0) {
[inputData appendBytes:(const void *)buffer length:len];
}
}
NSString *directoryString = [[NSString alloc] initWithData:data encoding:NSUTF8StringEncoding];
}
NSLog(@"directoryString: %@", directoryString);
...
有没有办法以更 'lossy' 的方式进行这种转换?
如您所见,我首先将数据块附加到 NSData 值,并在读取所有内容后转换为 utf8。这可以防止(多字节)utf8 字符被拆分,从而导致更多无效(空)utf8 字符串。
有效!通过结合 Larme 的代码片段和关于 UTF-8 字符大小的注释,我设法创建了一个 'lossy' NSData 到 UTF-8 NSString 的转换方法。
/**
 * Converts raw bytes to an NSString, tolerating invalid UTF-8.
 *
 * The strict UTF-8 conversion is tried first. If it fails, the data is walked
 * byte by byte: every well-formed UTF-8 sequence is kept and every byte that
 * cannot start (or complete) a well-formed sequence is dropped.
 *
 * @param data The bytes to decode. May be empty.
 * @return The decoded string; never nil on the lossy path (it may be empty).
 */
+ (NSString *) data2UTF8String:(NSData *) data {
    // First try to do the 'standard' UTF-8 conversion
    NSString *bufferStr = [[NSString alloc] initWithData:data
                                                encoding:NSUTF8StringEncoding];
    if (bufferStr) {
        return bufferStr;
    }

    // If it fails, do the 'lossy' UTF-8 conversion
    const Byte *buffer = [data bytes];
    NSUInteger total = [data length];
    NSMutableString *filteredString = [[NSMutableString alloc] init];

    NSUInteger i = 0;
    while (i < total) {
        Byte lead = buffer[i];

        // Sequence length announced by the lead byte. Continuation bytes
        // (10xxxxxx) and 0xF8..0xFF are never legal lead bytes in UTF-8
        // (RFC 3629 removed the 5- and 6-byte forms), so they are attempted
        // as a single byte, which fails to decode and gets skipped.
        NSUInteger expectedLength = 1;
        if ((lead & 0xE0) == 0xC0) expectedLength = 2;
        else if ((lead & 0xF0) == 0xE0) expectedLength = 3;
        else if ((lead & 0xF8) == 0xF0) expectedLength = 4;

        NSString *possibleString = nil;
        if (expectedLength <= total - i) {
            // Decode with an explicit length: the bytes are not NUL-terminated,
            // so a C-string based API would read past the end of the buffer.
            possibleString = [[NSString alloc] initWithBytes:&buffer[i]
                                                      length:expectedLength
                                                    encoding:NSUTF8StringEncoding];
        }

        if (possibleString) {
            [filteredString appendString:possibleString];
            i += expectedLength;
        } else {
            // Drop only the offending byte and resynchronise on the next one,
            // so valid bytes following a bad lead byte are not lost.
            i += 1;
        }
    }
    return filteredString;
}
如果您有任何意见,请告诉我。
谢谢 Larme!
我创建了一个带有 -[validUTF8String] 方法的 NSString 类别:当 UTF8String 返回 NULL 时,先去除无效的代理字符(surrogate),然后在清理后的字符串上调用 UTF8String:
@interface NSString (ValidUTF8String)

/// Returns -UTF8String when it is non-NULL. Otherwise strips unpaired
/// surrogates and retries, falling back to "" so the result is never NULL.
/// NOTE(review): on the fallback path the pointer comes from a temporary
/// string; its lifetime is presumably tied to that object / the current
/// autorelease pool - copy the bytes if they must outlive the call site.
- (const char *)validUTF8String;

/// Returns a copy of the receiver with every unpaired UTF-16 surrogate removed.
/// Valid high+low surrogate pairs and all other characters are kept in order.
/// Runs in a single pass but always allocates a full copy, so it is best
/// called only when the string is known to be invalid (e.g. -UTF8String is NULL).
- (NSString *)stringByStrippingInvalidUnicode;

@end

// Arguments are parenthesized so the macros stay correct for any expression.
#define isHighSurrogate(k) (((k)>=0xD800) && ((k)<=0xDBFF))
#define isLowSurrogate(k) (((k)>=0xDC00) && ((k)<=0xDFFF))

@implementation NSString (ValidUTF8String)

- (const char *)validUTF8String
{
    const char *result=[self UTF8String];
    if (!result)
    {
        // Conversion failed: retry on a copy without unpaired surrogates.
        result=[[self stringByStrippingInvalidUnicode] UTF8String];
        if (!result)
            result="";
    }
    return result;
}

- (NSString *)stringByStrippingInvalidUnicode
{
    NSUInteger length=[self length];
    if (length==0)
        return [NSString string];

    // Work on a flat UTF-16 buffer owned by an NSMutableData, so the memory is
    // managed identically under ARC and manual reference counting.
    NSMutableData *storage=[NSMutableData dataWithLength:length*sizeof(unichar)];
    unichar *units=(unichar *)[storage mutableBytes];
    [self getCharacters:units range:NSMakeRange(0, length)];

    // Compact the buffer in place: `kept` never runs ahead of `idx`.
    NSUInteger kept=0;
    NSUInteger idx=0;
    while (idx<length)
    {
        unichar k=units[idx];
        if (isHighSurrogate(k))
        {
            if (idx+1<length && isLowSurrogate(units[idx+1]))
            {
                // Well-formed pair: keep both halves together.
                unichar low=units[idx+1];
                units[kept++]=k;
                units[kept++]=low;
                idx+=2;
            }
            else
            {
                // High surrogate without a following low surrogate: drop it.
                idx++;
            }
        }
        else if (isLowSurrogate(k))
        {
            // A low surrogate reached here was not consumed as part of a pair,
            // so it has no preceding high surrogate: drop it.
            idx++;
        }
        else
        {
            units[kept++]=k;
            idx++;
        }
    }
    return [NSString stringWithCharacters:units length:kept];
}

@end
我有一个从服务器读取数据的应用程序。有时,数据似乎不是有效的 UTF-8。如果我把字节数组转换为 UTF-8 字符串,得到的字符串是 nil。字节数组中一定含有一些无效的(非 UTF-8)字节。有没有办法以 'lossy'(有损)的方式将字节数组转换为 UTF-8,只过滤掉其中的无效字符?
有什么想法吗?
我的代码如下所示:
- (void)stream:(NSStream *)theStream handleEvent:(NSStreamEvent)streamEvent {
switch (streamEvent){
case NSStreamEventHasBytesAvailable:
{
uint8_t buffer[1024];
int len;
NSMutableData * inputData = [NSMutableData data];
while ([directoryStream hasBytesAvailable]){
len = [directoryStream read:buffer maxLength:sizeof(buffer)];
if (len> 0) {
[inputData appendBytes:(const void *)buffer length:len];
}
}
NSString *directoryString = [[NSString alloc] initWithData:data encoding:NSUTF8StringEncoding];
}
NSLog(@"directoryString: %@", directoryString);
...
有没有办法以更 'lossy' 的方式进行这种转换?
如您所见,我首先将数据块附加到 NSData 值,并在读取所有内容后转换为 utf8。这可以防止(多字节)utf8 字符被拆分,从而导致更多无效(空)utf8 字符串。
有效!通过结合 Larme 的代码片段和关于 UTF-8 字符大小的注释,我设法创建了一个 'lossy' NSData 到 UTF-8 NSString 的转换方法。
/**
 * Converts raw bytes to an NSString, tolerating invalid UTF-8.
 *
 * The strict UTF-8 conversion is tried first. If it fails, the data is walked
 * byte by byte: every well-formed UTF-8 sequence is kept and every byte that
 * cannot start (or complete) a well-formed sequence is dropped.
 *
 * @param data The bytes to decode. May be empty.
 * @return The decoded string; never nil on the lossy path (it may be empty).
 */
+ (NSString *) data2UTF8String:(NSData *) data {
    // First try to do the 'standard' UTF-8 conversion
    NSString *bufferStr = [[NSString alloc] initWithData:data
                                                encoding:NSUTF8StringEncoding];
    if (bufferStr) {
        return bufferStr;
    }

    // If it fails, do the 'lossy' UTF-8 conversion
    const Byte *buffer = [data bytes];
    NSUInteger total = [data length];
    NSMutableString *filteredString = [[NSMutableString alloc] init];

    NSUInteger i = 0;
    while (i < total) {
        Byte lead = buffer[i];

        // Sequence length announced by the lead byte. Continuation bytes
        // (10xxxxxx) and 0xF8..0xFF are never legal lead bytes in UTF-8
        // (RFC 3629 removed the 5- and 6-byte forms), so they are attempted
        // as a single byte, which fails to decode and gets skipped.
        NSUInteger expectedLength = 1;
        if ((lead & 0xE0) == 0xC0) expectedLength = 2;
        else if ((lead & 0xF0) == 0xE0) expectedLength = 3;
        else if ((lead & 0xF8) == 0xF0) expectedLength = 4;

        NSString *possibleString = nil;
        if (expectedLength <= total - i) {
            // Decode with an explicit length: the bytes are not NUL-terminated,
            // so a C-string based API would read past the end of the buffer.
            possibleString = [[NSString alloc] initWithBytes:&buffer[i]
                                                      length:expectedLength
                                                    encoding:NSUTF8StringEncoding];
        }

        if (possibleString) {
            [filteredString appendString:possibleString];
            i += expectedLength;
        } else {
            // Drop only the offending byte and resynchronise on the next one,
            // so valid bytes following a bad lead byte are not lost.
            i += 1;
        }
    }
    return filteredString;
}
如果您有任何意见,请告诉我。 谢谢 Larme!
我创建了一个带有 -[validUTF8String] 方法的 NSString 类别:当 UTF8String 返回 NULL 时,先去除无效的代理字符(surrogate),然后在清理后的字符串上调用 UTF8String:
@interface NSString (ValidUTF8String)

/// Returns -UTF8String when it is non-NULL. Otherwise strips unpaired
/// surrogates and retries, falling back to "" so the result is never NULL.
/// NOTE(review): on the fallback path the pointer comes from a temporary
/// string; its lifetime is presumably tied to that object / the current
/// autorelease pool - copy the bytes if they must outlive the call site.
- (const char *)validUTF8String;

/// Returns a copy of the receiver with every unpaired UTF-16 surrogate removed.
/// Valid high+low surrogate pairs and all other characters are kept in order.
/// Runs in a single pass but always allocates a full copy, so it is best
/// called only when the string is known to be invalid (e.g. -UTF8String is NULL).
- (NSString *)stringByStrippingInvalidUnicode;

@end

// Arguments are parenthesized so the macros stay correct for any expression.
#define isHighSurrogate(k) (((k)>=0xD800) && ((k)<=0xDBFF))
#define isLowSurrogate(k) (((k)>=0xDC00) && ((k)<=0xDFFF))

@implementation NSString (ValidUTF8String)

- (const char *)validUTF8String
{
    const char *result=[self UTF8String];
    if (!result)
    {
        // Conversion failed: retry on a copy without unpaired surrogates.
        result=[[self stringByStrippingInvalidUnicode] UTF8String];
        if (!result)
            result="";
    }
    return result;
}

- (NSString *)stringByStrippingInvalidUnicode
{
    NSUInteger length=[self length];
    if (length==0)
        return [NSString string];

    // Work on a flat UTF-16 buffer owned by an NSMutableData, so the memory is
    // managed identically under ARC and manual reference counting.
    NSMutableData *storage=[NSMutableData dataWithLength:length*sizeof(unichar)];
    unichar *units=(unichar *)[storage mutableBytes];
    [self getCharacters:units range:NSMakeRange(0, length)];

    // Compact the buffer in place: `kept` never runs ahead of `idx`.
    NSUInteger kept=0;
    NSUInteger idx=0;
    while (idx<length)
    {
        unichar k=units[idx];
        if (isHighSurrogate(k))
        {
            if (idx+1<length && isLowSurrogate(units[idx+1]))
            {
                // Well-formed pair: keep both halves together.
                unichar low=units[idx+1];
                units[kept++]=k;
                units[kept++]=low;
                idx+=2;
            }
            else
            {
                // High surrogate without a following low surrogate: drop it.
                idx++;
            }
        }
        else if (isLowSurrogate(k))
        {
            // A low surrogate reached here was not consumed as part of a pair,
            // so it has no preceding high surrogate: drop it.
            idx++;
        }
        else
        {
            units[kept++]=k;
            idx++;
        }
    }
    return [NSString stringWithCharacters:units length:kept];
}

@end