将文本中的表情符号从 MySQL 复制到 SQL 服务器
Copying emojis in text from MySQL to SQL Server
我正在使用链接服务器将数据从 MySQL 复制到 SQL 服务器。
-- Fetch the comment column from the MySQL table search_data through the linked server.
SELECT src.comment
FROM OPENQUERY(my_linked_server, 'SELECT comment FROM search_data') AS src;
MySQL 表中该列的文本是 xxx 🤘 xxx。而当我在 SQL Server 中收到它时,它变成了 xxx ðŸ¤˜ xxx。
MySQL 表使用的字符集是 utf8mb4,我也已经在链接服务器的 ODBC 配置中设置使用该字符集。我使用的是 MySQL ODBC 5.3.13
如有任何建议,我们将不胜感激。 SQL服务器版本是2016,我看过例子
select N''
等,但不知道如何将其应用于上述查询。
ðŸ¤˜ 看起来是 4 个字符
ð = u00f0 , dec = 240
Ÿ = u0178 , dec = 376
¤ = u00a4 , dec = 164
˜ = u02dc , dec = 732
🤘 = ud83e, dec = 55358
有趣的是这甚至不起作用
-- Round-trip the emoji U+1F918 through UNICODE()/NCHAR(); the emoji literal had been lost from the N'' strings.
select nchar(unicode(N'🤘')),unicode(N'🤘')
返回 � 符号
字符 🤘 的 Unicode 代码点是 U+1F918,这意味着它位于基本多文种平面(Basic Multilingual Plane,BMP)之外,而 BMP 只涵盖到 U+FFFF 为止的代码点。
要处理 BMP 以外的 Unicode 字符,需要使用支持补充字符(Supplementary Characters)的排序规则,这类排序规则的名称以 _SC 结尾:
SQL Server 2012 (11.x) introduced a new family of supplementary character (_SC) collations that can be used with the nchar, nvarchar, and sql_variant data types to represent the full Unicode character range (000000–10FFFF)
比较此 SQL 语句的结果
-- Compare how the emoji U+1F918 behaves with an explicit supplementary-character (_SC)
-- collation versus the database default collation. The emoji literal had been lost
-- from the N'' strings; it is restored here.
select
    nchar(unicode(N'🤘' collate Latin1_General_100_CI_AS_SC)) as EmojiSC,        -- code point fed back through NCHAR() under the database collation
    unicode(N'🤘' collate Latin1_General_100_CI_AS_SC) as EmojiSCUnicode,        -- code point as seen with an _SC collation
    cast(N'🤘' as varbinary) as EmojiBinary,                                     -- raw UTF-16 bytes of the literal
    cast(nchar(unicode(N'🤘')) as varbinary) as EmojiConvBinary,                 -- bytes after a UNICODE()/NCHAR() round trip
    unicode(N'🤘') as EmojiUnicode                                               -- value UNICODE() reports under the database collation
在排序规则为 Latin1_General_CI_AS 的数据库上运行,结果是:
EmojiSC EmojiSCUnicode EmojiBinary EmojiConvBinary EmojiUnicode
NULL 129304 0x3ED818DD 0x3ED8 55358
而在排序规则设置为 Latin1_General_100_CI_AI_SC 的数据库上运行,结果是:
EmojiSC EmojiSCUnicode EmojiBinary EmojiConvBinary EmojiUnicode
🤘 129304 0x3ED818DD 0x3ED818DD 129304
为什么会看到“ðŸ¤˜”?
U+1F918 的 UTF-8 编码是 0xF0 0x9F 0xA4 0x98,而这几个字符正是把这些字节逐个当作 ANSI 字符来解读的结果。
为什么看到“�”?
字符 � 是 Unicode 替换字符(REPLACEMENT CHARACTER),
它用来替换未知、无法识别或无法表示的字符。
之所以出现它,是因为 U+D83E 不是一个有效的 Unicode 代码点,而只是该字符的 UTF-16 编码(0xD83E 0xDD18)中的第一个字(高位代理项)。
检查存储的内容,而不是显示的内容
显示 Unicode 数据可能很棘手,找出幕后情况的最有效方法是查看字节。在TSQL中,用cast(... as varbinary)
分析Unicode数据操作哪里出错了
我制定了解决方案并发布,这样其他人就不会花一天时间做同样的事情
select ab_test.dbo.GetEmojisInString('👌💖🤷â€â™‚ï¸ðŸ˜ŽðŸ±â€ðŸ’»ðŸ˜‰â¤ðŸ±â€ðŸ‘¤ðŸ¤žðŸ¤£ðŸ‘💕✌ðŸ±â€ðŸðŸ’‹ðŸŽ‚🎉🤦â€â™‚ï¸ðŸ˜ŠðŸŒ¹ðŸ‘ðŸ±â€ðŸ‰ðŸŽ¶ðŸ˜ðŸ¤¦â€â™€ï¸ðŸ˜ðŸ™ŒðŸ±â€ðŸš€ðŸ˜œðŸ˜˜ðŸ±â€ðŸ‘“😢😒🤳😂')
将return
♂️❤✌♂️♀️
下面有 5 个函数,它们可能并不完美,也许存在更简短/更好的写法,但它们是可以工作的。如果发现任何错误,请告诉我。
注意:我不得不把函数拆分到两个数据库中才能正常工作,因为排序规则需要带有 _SC(支持补充字符);下面方案中的 bi_library 数据库已被锁定,我无法更改它的排序规则,所以目前只是新建了一个 ab_test 数据库。
USE [bi_library]
GO
CREATE FUNCTION [dbo].[GetDecimalFromOtherBase]
( @p_in_value varchar(100),
@p_from_base int -- ie 16 for hex, 8 for octal, 2 for bin
) returns int
as
begin
    -- Purpose : parse @p_in_value as an unsigned number written in base @p_from_base.
    -- Params  : @p_in_value  - digit string made of 0-9 and A-Z (upper or lower case)
    --           @p_from_base - radix, 2 to 36
    -- Returns : the integer value; 0 for an empty or NULL string;
    --           NULL when the base is outside 2..36 or a character is not a valid
    --           digit for that base.
    -- Note    : a value that does not fit in an int raises an arithmetic overflow error.
    declare @l_len int = len(@p_in_value)
    declare @l_pos int = 1
    declare @l_code int
    declare @l_val int
    declare @l_total int = 0

    if @p_from_base is null or @p_from_base not between 2 and 36
        return null

    -- Walk the digits left to right (Horner's method): no reverse()/power() needed
    -- and the arithmetic stays in integers.
    while @l_pos <= @l_len
    begin
        set @l_code = ascii(substring(@p_in_value, @l_pos, 1))
        -- Classify by character code rather than isnumeric(), which also accepts
        -- characters such as '+', '-', '.' and '$'.
        set @l_val = case
                         when @l_code between 48 and 57  then @l_code - 48 -- '0'..'9'
                         when @l_code between 65 and 90  then @l_code - 55 -- 'A'..'Z' -> 10..35
                         when @l_code between 97 and 122 then @l_code - 87 -- 'a'..'z' -> 10..35
                     end
        if @l_val is null or @l_val >= @p_from_base
            return null
        set @l_total = @l_total * @p_from_base + @l_val
        set @l_pos = @l_pos + 1
    end
    return @l_total
end
GO
CREATE FUNCTION [dbo].[GetOtherBaseFromDecimal]
( @p_in_value int,
@p_to_base int -- ie 16 for hex, 8 for octal, 2 for bin
) returns varchar(100)
as
begin
    -- Purpose : render a non-negative integer as a digit string in base @p_to_base.
    -- Params  : @p_in_value - value to convert (>= 0)
    --           @p_to_base  - radix, 2 to 36
    -- Returns : digits 0-9 then A-Z (upper case), most significant first; '0' for 0;
    --           NULL when the value is NULL or negative, or the base is outside 2..36
    --           (a base of 1 would never terminate and a base of 0 would divide by zero).
    declare @l_digits char(36) = '0123456789ABCDEFGHIJKLMNOPQRSTUVWXYZ'
    declare @l_dec int = @p_in_value
    declare @l_ret_str varchar(100) = ''

    if @l_dec is null or @l_dec < 0
        return null
    if @p_to_base is null or @p_to_base not between 2 and 36
        return null
    if @l_dec = 0
        return '0'

    while @l_dec > 0
    begin
        -- Prepend each remainder so the string is built in reading order.
        set @l_ret_str = substring(@l_digits, (@l_dec % @p_to_base) + 1, 1) + @l_ret_str
        set @l_dec = @l_dec / @p_to_base
    end
    return @l_ret_str
end
GO
CREATE FUNCTION [dbo].[GetBaseFromOtherBase]
( @p_in_value varchar(100),
@p_in_base bigint, -- ie 16 for hex, 8 for octal, 2 for bin
@p_to_base bigint -- ie 16 for hex, 8 for octal, 2 for bin
) returns varchar(100)
as
begin
    -- Converts a digit string from base @p_in_base to base @p_to_base in two steps:
    -- first to an integer, then from that integer to the target base.
    declare @l_as_decimal int
    declare @l_converted varchar(100)

    set @l_as_decimal = bi_library.dbo.GetDecimalFromOtherBase(@p_in_value, @p_in_base)
    set @l_converted = bi_library.dbo.GetOtherBaseFromDecimal(@l_as_decimal, @p_to_base)

    return @l_converted
end
GO
USE [ab_test]
GO
ALTER function [dbo].[GetEmojisInString] (@p_in_string nvarchar(max)) returns nvarchar(max)
as
begin
    -- Purpose : repair text whose UTF-8 bytes were each read as a single-byte (ANSI)
    --           character, e.g. turn the four characters "ðŸ¤˜" back into U+1F918.
    -- Params  : @p_in_string - text that may contain such mis-decoded sequences
    -- Returns : the text with every well-formed UTF-8 sequence replaced by the character
    --           it encodes; all other characters are copied through. NULL input gives ''.
    -- Notes   : * ALTER requires the function to exist already; use CREATE on first deployment.
    --           * The input is converted to varchar so that ascii() yields the original byte
    --             of each character. Assumes the database default code page is the one the
    --             bytes were mis-read with (e.g. Windows-1252) - TODO confirm for your server.
    --             Characters that do not exist in that code page are changed by this conversion.
    --           * Characters above U+FFFF are emitted as a UTF-16 surrogate pair, so no
    --             supplementary-character (_SC) collation is needed.
    declare @l_string varchar(max) = @p_in_string     -- (max): do not truncate long input
    declare @l_len int = len(@l_string + '.') - 1     -- len() alone would ignore trailing spaces
    declare @l_pos int = 1
    declare @l_str nvarchar(max) = N''
    declare @l_b1 int
    declare @l_b2 int
    declare @l_b3 int
    declare @l_b4 int
    declare @l_seq_len int
    declare @l_cp int

    while @l_pos <= @l_len
    begin
        set @l_b1 = ascii(substring(@l_string, @l_pos, 1))

        -- The lead byte fixes the length of a UTF-8 sequence.
        set @l_seq_len = case
                             when @l_b1 between 194 and 223 then 2 -- C2..DF
                             when @l_b1 between 224 and 239 then 3 -- E0..EF
                             when @l_b1 between 240 and 244 then 4 -- F0..F4
                             else 1                                -- ASCII, or not a valid lead byte
                         end
        set @l_cp = null

        if @l_seq_len > 1 and @l_pos + @l_seq_len - 1 <= @l_len
        begin
            -- Bytes the sequence does not use default to 128 so the range check below passes.
            set @l_b2 = ascii(substring(@l_string, @l_pos + 1, 1))
            set @l_b3 = case when @l_seq_len >= 3 then ascii(substring(@l_string, @l_pos + 2, 1)) else 128 end
            set @l_b4 = case when @l_seq_len = 4 then ascii(substring(@l_string, @l_pos + 3, 1)) else 128 end

            -- Every continuation byte must be 80..BF.
            if @l_b2 between 128 and 191 and @l_b3 between 128 and 191 and @l_b4 between 128 and 191
            begin
                set @l_cp = case @l_seq_len
                                when 2 then (@l_b1 & 31) * 64 + (@l_b2 & 63)
                                when 3 then (@l_b1 & 15) * 4096 + (@l_b2 & 63) * 64 + (@l_b3 & 63)
                                else (@l_b1 & 7) * 262144 + (@l_b2 & 63) * 4096 + (@l_b3 & 63) * 64 + (@l_b4 & 63)
                            end

                -- Reject overlong encodings, surrogate code points and values above U+10FFFF.
                if (@l_seq_len = 3 and (@l_cp < 2048 or @l_cp between 55296 and 57343))
                   or (@l_seq_len = 4 and (@l_cp < 65536 or @l_cp > 1114111))
                    set @l_cp = null
            end
        end

        if @l_cp is null
        begin
            -- Not the start of a well-formed sequence: keep the character as it is.
            set @l_str = @l_str + substring(@l_string, @l_pos, 1)
            set @l_pos = @l_pos + 1
        end
        else
        begin
            set @l_str = @l_str
                       + case
                             when @l_cp < 65536 then nchar(@l_cp)
                             -- high surrogate + low surrogate
                             else nchar(55296 + (@l_cp - 65536) / 1024) + nchar(56320 + (@l_cp - 65536) % 1024)
                         end
            set @l_pos = @l_pos + @l_seq_len
        end
    end

    return @l_str
end
GO
CREATE function [dbo].[HasEmojisInString] (@p_in_string nvarchar(max)) returns int
as
begin
    -- Purpose : report whether dbo.GetEmojisInString changes the given text.
    -- Params  : @p_in_string - text to check
    -- Returns : 1 when the converted text differs from the input, otherwise 0
    --           (a comparison involving NULL is not "different", so it also gives 0).
    -- The converted text is kept as nvarchar(max): a varchar(1000) variable would truncate
    -- it and replace characters that do not exist in the database code page.
    declare @l_string_emojified nvarchar(max) = dbo.GetEmojisInString(@p_in_string)

    return case when @l_string_emojified <> @p_in_string then 1 else 0 end
end
GO
我正在使用链接服务器将数据从 MySQL 复制到 SQL 服务器。
-- Fetch the comment column from the MySQL table search_data through the linked server.
SELECT src.comment
FROM OPENQUERY(my_linked_server, 'SELECT comment FROM search_data') AS src;
MySQL 表中该列的文本是 xxx 🤘 xxx。而当我在 SQL Server 中收到它时,它变成了 xxx ðŸ¤˜ xxx。
MySQL 表使用的字符集是 utf8mb4,我也已经在链接服务器的 ODBC 配置中设置使用该字符集。我使用的是 MySQL ODBC 5.3.13
如有任何建议,我们将不胜感激。 SQL服务器版本是2016,我看过例子
select N''
等,但不知道如何将其应用于上述查询。
ðŸ¤˜ 看起来是 4 个字符
ð = u00f0 , dec = 240
Ÿ = u0178 , dec = 376
¤ = u00a4 , dec = 164
˜ = u02dc , dec = 732
🤘 = ud83e, dec = 55358
有趣的是这甚至不起作用
-- Round-trip the emoji U+1F918 through UNICODE()/NCHAR(); the emoji literal had been lost from the N'' strings.
select nchar(unicode(N'🤘')),unicode(N'🤘')
返回 � 符号
字符 🤘 的 Unicode 代码点是 U+1F918,这意味着它位于基本多文种平面(Basic Multilingual Plane,BMP)之外,而 BMP 只涵盖到 U+FFFF 为止的代码点。
要处理 BMP 以外的 Unicode 字符,需要使用支持补充字符(Supplementary Characters)的排序规则,这类排序规则的名称以 _SC 结尾:
SQL Server 2012 (11.x) introduced a new family of supplementary character (_SC) collations that can be used with the nchar, nvarchar, and sql_variant data types to represent the full Unicode character range (000000–10FFFF)
比较此 SQL 语句的结果
-- Compare how the emoji U+1F918 behaves with an explicit supplementary-character (_SC)
-- collation versus the database default collation. The emoji literal had been lost
-- from the N'' strings; it is restored here.
select
    nchar(unicode(N'🤘' collate Latin1_General_100_CI_AS_SC)) as EmojiSC,        -- code point fed back through NCHAR() under the database collation
    unicode(N'🤘' collate Latin1_General_100_CI_AS_SC) as EmojiSCUnicode,        -- code point as seen with an _SC collation
    cast(N'🤘' as varbinary) as EmojiBinary,                                     -- raw UTF-16 bytes of the literal
    cast(nchar(unicode(N'🤘')) as varbinary) as EmojiConvBinary,                 -- bytes after a UNICODE()/NCHAR() round trip
    unicode(N'🤘') as EmojiUnicode                                               -- value UNICODE() reports under the database collation
在排序规则为 Latin1_General_CI_AS 的数据库上运行,结果是:
EmojiSC EmojiSCUnicode EmojiBinary EmojiConvBinary EmojiUnicode
NULL 129304 0x3ED818DD 0x3ED8 55358
而在排序规则设置为 Latin1_General_100_CI_AI_SC 的数据库上运行,结果是:
EmojiSC EmojiSCUnicode EmojiBinary EmojiConvBinary EmojiUnicode
🤘 129304 0x3ED818DD 0x3ED818DD 129304
为什么会看到“ðŸ¤˜”?
U+1F918 的 UTF-8 编码是 0xF0 0x9F 0xA4 0x98,而这几个字符正是把这些字节逐个当作 ANSI 字符来解读的结果。
为什么看到“�”?
字符 � 是 Unicode 替换字符(REPLACEMENT CHARACTER),
它用来替换未知、无法识别或无法表示的字符。
之所以出现它,是因为 U+D83E 不是一个有效的 Unicode 代码点,而只是该字符的 UTF-16 编码(0xD83E 0xDD18)中的第一个字(高位代理项)。
检查存储的内容,而不是显示的内容
显示 Unicode 数据可能很棘手,找出幕后情况的最有效方法是查看字节。在TSQL中,用cast(... as varbinary)
分析Unicode数据操作哪里出错了
我制定了解决方案并发布,这样其他人就不会花一天时间做同样的事情
select ab_test.dbo.GetEmojisInString('👌💖🤷â€â™‚ï¸ðŸ˜ŽðŸ±â€ðŸ’»ðŸ˜‰â¤ðŸ±â€ðŸ‘¤ðŸ¤žðŸ¤£ðŸ‘💕✌ðŸ±â€ðŸðŸ’‹ðŸŽ‚🎉🤦â€â™‚ï¸ðŸ˜ŠðŸŒ¹ðŸ‘ðŸ±â€ðŸ‰ðŸŽ¶ðŸ˜ðŸ¤¦â€â™€ï¸ðŸ˜ðŸ™ŒðŸ±â€ðŸš€ðŸ˜œðŸ˜˜ðŸ±â€ðŸ‘“😢😒🤳😂')
将return
♂️❤✌♂️♀️
下面有 5 个函数,它们可能并不完美,也许存在更简短/更好的写法,但它们是可以工作的。如果发现任何错误,请告诉我。
注意:我不得不把函数拆分到两个数据库中才能正常工作,因为排序规则需要带有 _SC(支持补充字符);下面方案中的 bi_library 数据库已被锁定,我无法更改它的排序规则,所以目前只是新建了一个 ab_test 数据库。
USE [bi_library]
GO
CREATE FUNCTION [dbo].[GetDecimalFromOtherBase]
( @p_in_value varchar(100),
@p_from_base int -- ie 16 for hex, 8 for octal, 2 for bin
) returns int
as
begin
    -- Purpose : parse @p_in_value as an unsigned number written in base @p_from_base.
    -- Params  : @p_in_value  - digit string made of 0-9 and A-Z (upper or lower case)
    --           @p_from_base - radix, 2 to 36
    -- Returns : the integer value; 0 for an empty or NULL string;
    --           NULL when the base is outside 2..36 or a character is not a valid
    --           digit for that base.
    -- Note    : a value that does not fit in an int raises an arithmetic overflow error.
    declare @l_len int = len(@p_in_value)
    declare @l_pos int = 1
    declare @l_code int
    declare @l_val int
    declare @l_total int = 0

    if @p_from_base is null or @p_from_base not between 2 and 36
        return null

    -- Walk the digits left to right (Horner's method): no reverse()/power() needed
    -- and the arithmetic stays in integers.
    while @l_pos <= @l_len
    begin
        set @l_code = ascii(substring(@p_in_value, @l_pos, 1))
        -- Classify by character code rather than isnumeric(), which also accepts
        -- characters such as '+', '-', '.' and '$'.
        set @l_val = case
                         when @l_code between 48 and 57  then @l_code - 48 -- '0'..'9'
                         when @l_code between 65 and 90  then @l_code - 55 -- 'A'..'Z' -> 10..35
                         when @l_code between 97 and 122 then @l_code - 87 -- 'a'..'z' -> 10..35
                     end
        if @l_val is null or @l_val >= @p_from_base
            return null
        set @l_total = @l_total * @p_from_base + @l_val
        set @l_pos = @l_pos + 1
    end
    return @l_total
end
GO
CREATE FUNCTION [dbo].[GetOtherBaseFromDecimal]
( @p_in_value int,
@p_to_base int -- ie 16 for hex, 8 for octal, 2 for bin
) returns varchar(100)
as
begin
    -- Purpose : render a non-negative integer as a digit string in base @p_to_base.
    -- Params  : @p_in_value - value to convert (>= 0)
    --           @p_to_base  - radix, 2 to 36
    -- Returns : digits 0-9 then A-Z (upper case), most significant first; '0' for 0;
    --           NULL when the value is NULL or negative, or the base is outside 2..36
    --           (a base of 1 would never terminate and a base of 0 would divide by zero).
    declare @l_digits char(36) = '0123456789ABCDEFGHIJKLMNOPQRSTUVWXYZ'
    declare @l_dec int = @p_in_value
    declare @l_ret_str varchar(100) = ''

    if @l_dec is null or @l_dec < 0
        return null
    if @p_to_base is null or @p_to_base not between 2 and 36
        return null
    if @l_dec = 0
        return '0'

    while @l_dec > 0
    begin
        -- Prepend each remainder so the string is built in reading order.
        set @l_ret_str = substring(@l_digits, (@l_dec % @p_to_base) + 1, 1) + @l_ret_str
        set @l_dec = @l_dec / @p_to_base
    end
    return @l_ret_str
end
GO
CREATE FUNCTION [dbo].[GetBaseFromOtherBase]
( @p_in_value varchar(100),
@p_in_base bigint, -- ie 16 for hex, 8 for octal, 2 for bin
@p_to_base bigint -- ie 16 for hex, 8 for octal, 2 for bin
) returns varchar(100)
as
begin
    -- Converts a digit string from base @p_in_base to base @p_to_base in two steps:
    -- first to an integer, then from that integer to the target base.
    declare @l_as_decimal int
    declare @l_converted varchar(100)

    set @l_as_decimal = bi_library.dbo.GetDecimalFromOtherBase(@p_in_value, @p_in_base)
    set @l_converted = bi_library.dbo.GetOtherBaseFromDecimal(@l_as_decimal, @p_to_base)

    return @l_converted
end
GO
USE [ab_test]
GO
ALTER function [dbo].[GetEmojisInString] (@p_in_string nvarchar(max)) returns nvarchar(max)
as
begin
    -- Purpose : repair text whose UTF-8 bytes were each read as a single-byte (ANSI)
    --           character, e.g. turn the four characters "ðŸ¤˜" back into U+1F918.
    -- Params  : @p_in_string - text that may contain such mis-decoded sequences
    -- Returns : the text with every well-formed UTF-8 sequence replaced by the character
    --           it encodes; all other characters are copied through. NULL input gives ''.
    -- Notes   : * ALTER requires the function to exist already; use CREATE on first deployment.
    --           * The input is converted to varchar so that ascii() yields the original byte
    --             of each character. Assumes the database default code page is the one the
    --             bytes were mis-read with (e.g. Windows-1252) - TODO confirm for your server.
    --             Characters that do not exist in that code page are changed by this conversion.
    --           * Characters above U+FFFF are emitted as a UTF-16 surrogate pair, so no
    --             supplementary-character (_SC) collation is needed.
    declare @l_string varchar(max) = @p_in_string     -- (max): do not truncate long input
    declare @l_len int = len(@l_string + '.') - 1     -- len() alone would ignore trailing spaces
    declare @l_pos int = 1
    declare @l_str nvarchar(max) = N''
    declare @l_b1 int
    declare @l_b2 int
    declare @l_b3 int
    declare @l_b4 int
    declare @l_seq_len int
    declare @l_cp int

    while @l_pos <= @l_len
    begin
        set @l_b1 = ascii(substring(@l_string, @l_pos, 1))

        -- The lead byte fixes the length of a UTF-8 sequence.
        set @l_seq_len = case
                             when @l_b1 between 194 and 223 then 2 -- C2..DF
                             when @l_b1 between 224 and 239 then 3 -- E0..EF
                             when @l_b1 between 240 and 244 then 4 -- F0..F4
                             else 1                                -- ASCII, or not a valid lead byte
                         end
        set @l_cp = null

        if @l_seq_len > 1 and @l_pos + @l_seq_len - 1 <= @l_len
        begin
            -- Bytes the sequence does not use default to 128 so the range check below passes.
            set @l_b2 = ascii(substring(@l_string, @l_pos + 1, 1))
            set @l_b3 = case when @l_seq_len >= 3 then ascii(substring(@l_string, @l_pos + 2, 1)) else 128 end
            set @l_b4 = case when @l_seq_len = 4 then ascii(substring(@l_string, @l_pos + 3, 1)) else 128 end

            -- Every continuation byte must be 80..BF.
            if @l_b2 between 128 and 191 and @l_b3 between 128 and 191 and @l_b4 between 128 and 191
            begin
                set @l_cp = case @l_seq_len
                                when 2 then (@l_b1 & 31) * 64 + (@l_b2 & 63)
                                when 3 then (@l_b1 & 15) * 4096 + (@l_b2 & 63) * 64 + (@l_b3 & 63)
                                else (@l_b1 & 7) * 262144 + (@l_b2 & 63) * 4096 + (@l_b3 & 63) * 64 + (@l_b4 & 63)
                            end

                -- Reject overlong encodings, surrogate code points and values above U+10FFFF.
                if (@l_seq_len = 3 and (@l_cp < 2048 or @l_cp between 55296 and 57343))
                   or (@l_seq_len = 4 and (@l_cp < 65536 or @l_cp > 1114111))
                    set @l_cp = null
            end
        end

        if @l_cp is null
        begin
            -- Not the start of a well-formed sequence: keep the character as it is.
            set @l_str = @l_str + substring(@l_string, @l_pos, 1)
            set @l_pos = @l_pos + 1
        end
        else
        begin
            set @l_str = @l_str
                       + case
                             when @l_cp < 65536 then nchar(@l_cp)
                             -- high surrogate + low surrogate
                             else nchar(55296 + (@l_cp - 65536) / 1024) + nchar(56320 + (@l_cp - 65536) % 1024)
                         end
            set @l_pos = @l_pos + @l_seq_len
        end
    end

    return @l_str
end
GO
CREATE function [dbo].[HasEmojisInString] (@p_in_string nvarchar(max)) returns int
as
begin
    -- Purpose : report whether dbo.GetEmojisInString changes the given text.
    -- Params  : @p_in_string - text to check
    -- Returns : 1 when the converted text differs from the input, otherwise 0
    --           (a comparison involving NULL is not "different", so it also gives 0).
    -- The converted text is kept as nvarchar(max): a varchar(1000) variable would truncate
    -- it and replace characters that do not exist in the database code page.
    declare @l_string_emojified nvarchar(max) = dbo.GetEmojisInString(@p_in_string)

    return case when @l_string_emojified <> @p_in_string then 1 else 0 end
end
GO