将文本中的表情符号从 MySQL 复制到 SQL 服务器

Copying emojis in text from MySQL to SQL Server

我正在使用链接服务器将数据从 MySQL 复制到 SQL 服务器。

-- Pass-through query: the inner SELECT runs on the linked MySQL server,
-- and only its result set comes back to SQL Server.
SELECT src.comment
FROM OPENQUERY(my_linked_server, 'SELECT comment FROM search_data') AS src;

MySQL 表的列中的文本是 xxx 🤘 xxx。当我在 SQL Server 中收到它时,它变成了 xxx ðŸ¤˜ xxx。MySQL 表使用 utf8mb4,并且我已经把链接服务器的 ODBC 配置设置为使用该字符集。我使用的是 MySQL ODBC 5.3.13。

如有任何建议,将不胜感激。SQL Server 的版本是 2016。我看过类似下面这样的例子

-- The N prefix makes this an nvarchar (UTF-16) literal, so the emoji is not
-- squeezed through the database's single-byte code page.
-- NOTE(review): the emoji inside the literal was lost in extraction; restored
-- with the one discussed in this article — confirm against the original post.
select N'🤘'

等等,但不知道如何把它应用到上面的查询中。

ðŸ¤˜ 看起来是 4 个字符

ð = u00f0 , dec = 240
Ÿ = u0178 , dec = 376
¤ = u00a4 , dec = 164
˜ = u02dc , dec =  732

🤘 = ud83e, dec = 55358

有趣的是这甚至不起作用

-- Without a supplementary-character (_SC) collation, unicode() sees only the
-- first UTF-16 code unit of the emoji (the high surrogate), and nchar() of
-- that lone surrogate displays as the replacement character.
select nchar(unicode(N'🤘')),unicode(N'🤘') 

返回 � 符号

该字符的 Unicode 码点是 U+1F918,这意味着它位于基本多文种平面(Basic Multilingual Plane,BMP)之外,而 BMP 只涵盖到 U+FFFF 为止的码点。

要处理 BMP 以外的 Unicode 字符,需要使用支持增补字符(Supplementary Characters)的排序规则,其名称以 *_SC 结尾:

SQL Server 2012 (11.x) introduced a new family of supplementary character (_SC) collations that can be used with the nchar, nvarchar, and sql_variant data types to represent the full Unicode character range (000000–10FFFF)

比较此 SQL 语句的结果

-- Shows how the same emoji literal behaves with and without a
-- supplementary-character (_SC) collation. The explicit COLLATE clauses force
-- SC semantics for the first two columns; the last two columns follow the
-- database default collation, which is what differs between the two runs.
select 
    nchar(unicode(N'🤘' collate Latin1_General_100_CI_AS_SC)) as EmojiSC,        -- round trip through the full code point
    unicode(N'🤘' collate Latin1_General_100_CI_AS_SC) as EmojiSCUnicode,        -- full code point (129304 = U+1F918)
    cast(N'🤘' as varbinary) as EmojiBinary,                                     -- raw UTF-16LE bytes actually stored
    cast(nchar(unicode(N'🤘')) as varbinary) as EmojiConvBinary,                 -- bytes after a round trip under the default collation
    unicode(N'🤘') as EmojiUnicode                                               -- code point as seen by the default collation

在使用 Latin1_General_CI_AS 排序规则的数据库上运行时,结果为:

EmojiSC EmojiSCUnicode  EmojiBinary EmojiConvBinary EmojiUnicode
NULL    129304          0x3ED818DD  0x3ED8          55358

与数据库设置为 Latin1_General_100_CI_AI_SC

EmojiSC EmojiSCUnicode  EmojiBinary EmojiConvBinary EmojiUnicode
🤘      129304          0x3ED818DD  0x3ED818DD      129304

为什么会看到“ðŸ¤˜”?

U+1F918 的 UTF-8 编码是 0xF0 0x9F 0xA4 0x98,这些字符正是把这四个字节逐个当作 ANSI 字符解释的结果。

为什么看到“�”?

字符 � 是 Unicode REPLACEMENT CHARACTER 并且是

used to replace an unknown, unrecognized or unrepresentable character

这是因为 U+D83E 不是一个有效的 Unicode 码点,它只是该码点以 UTF-16 编码(0xD83E 0xDD18)时的第一个码元(高位代理)。

检查存储的内容,而不是显示的内容

显示 Unicode 数据可能很棘手,弄清幕后情况最有效的方法是查看字节。在 T-SQL 中,可以用 cast(... as varbinary) 来分析 Unicode 数据的处理在哪一步出了问题。

我制定了解决方案并发布,这样其他人就不会花一天时间做同样的事情

select ab_test.dbo.GetEmojisInString('👌💖🤷â€â™‚ï¸ðŸ˜ŽðŸ±â€ðŸ’»ðŸ˜‰â¤ðŸ±â€ðŸ‘¤ðŸ¤žðŸ¤£ðŸ‘💕✌ðŸ±â€ðŸðŸ’‹ðŸŽ‚🎉🤦â€â™‚ï¸ðŸ˜ŠðŸŒ¹ðŸ‘ðŸ±â€ðŸ‰ðŸŽ¶ðŸ˜ðŸ¤¦â€â™€ï¸ðŸ˜ðŸ™ŒðŸ±â€ðŸš€ðŸ˜œðŸ˜˜ðŸ±â€ðŸ‘“😢😒🤳😂')

将返回

‍♂️‍❤‍✌‍‍♂️‍‍♀️‍‍

下面是 5 个函数,可能并不完美,也许有更简短/更好的方法,但它们能用。如果发现任何错误,请告诉我。

注意:我必须把函数拆分到两个数据库中才能工作,因为排序规则需要带有 _SC。下面方案中的 bi_library 数据库已被锁定,我无法更改它的排序规则,所以目前只是另外创建了一个 ab_test 数据库。

USE [bi_library]
GO

CREATE FUNCTION [dbo].[GetDecimalFromOtherBase]
(  @p_in_value   varchar(100),
   @p_from_base  int -- ie 16 for hex, 8 for octal, 2 for bin
) returns int
as
begin
    /*
      Parses @p_in_value as an unsigned number written in base @p_from_base
      and returns its integer value.

      Digits:  '0'-'9' are 0-9, 'A' is 10, 'B' is 11, ... (character code - 55).
               For bases up to 36, lower-case letters are accepted as well.
      Returns: the value; 0 for an empty string;
               NULL when an argument is NULL, the base is below 2, or the
               text contains a character that is not a digit of that base.
      Values that do not fit in an int raise an arithmetic overflow error.
    */
    declare @l_text  varchar(100) = ltrim(rtrim(@p_in_value))
    declare @l_len   int = len(@l_text)
    declare @l_pos   int = 1
    declare @l_code  int
    declare @l_val   int
    declare @l_total int = 0

    if @p_in_value is null or @p_from_base is null or @p_from_base < 2
        return null

    -- Left-to-right accumulation (total = total * base + digit) needs neither
    -- reverse() nor power().
    while @l_pos <= @l_len
    begin
        set @l_code = ascii(substring(@l_text, @l_pos, 1))

        -- Fold a-z onto A-Z only where that cannot collide with a real digit.
        if @p_from_base <= 36 and @l_code between 97 and 122
            set @l_code = @l_code - 32

        -- Explicit code ranges instead of isnumeric(), which also accepts
        -- characters such as '+', '-', '.' and '$'.
        set @l_val = case
                         when @l_code between 48 and 57 then @l_code - 48
                         when @l_code >= 65             then @l_code - 55
                     end

        if @l_val is null or @l_val >= @p_from_base
            return null

        set @l_total = @l_total * @p_from_base + @l_val
        set @l_pos = @l_pos + 1
    end

    return @l_total
end
GO

CREATE FUNCTION [dbo].[GetOtherBaseFromDecimal]
(  @p_in_value   int,
   @p_to_base  int -- ie 16 for hex, 8 for octal, 2 for bin
) returns varchar(100)
as
begin
    /*
      Renders the non-negative integer @p_in_value in base @p_to_base.

      Digits:  0-9 are '0'-'9', 10 is 'A', 11 is 'B', ... (char(55 + digit)).
      Returns: the digit string, most significant digit first; '0' for 0;
               NULL when an argument is NULL, the value is negative, or the
               base is below 2. Base 1 would never terminate and base 0
               would divide by zero.
    */
    declare @l_dec     int = @p_in_value
    declare @l_ret_str varchar(100) = ''
    declare @l_rem     int

    if @p_in_value is null or @p_in_value < 0 or @p_to_base is null or @p_to_base < 2
        return null

    -- The loop below emits nothing for zero, so handle it explicitly.
    if @l_dec = 0
        return '0'

    while @l_dec > 0
    begin
        set @l_rem = @l_dec % @p_to_base
        -- Prepend each digit so the result needs no final reverse().
        set @l_ret_str = case
                             when @l_rem >= 10 then char(55 + @l_rem)
                             else char(48 + @l_rem)
                         end + @l_ret_str
        set @l_dec = @l_dec / @p_to_base
    end

    return @l_ret_str
end
GO


CREATE FUNCTION [dbo].[GetBaseFromOtherBase]
(  @p_in_value varchar(100),
   @p_in_base  bigint, -- ie 16 for hex, 8 for octal, 2 for bin
   @p_to_base  bigint -- ie 16 for hex, 8 for octal, 2 for bin
) returns varchar(100)
as
begin
   -- Converts a number written in base @p_in_base into its representation in
   -- base @p_to_base, going through an int:
   --   text (in base) -> GetDecimalFromOtherBase -> GetOtherBaseFromDecimal -> text (to base)
   declare @l_as_decimal int

   set @l_as_decimal = bi_library.dbo.GetDecimalFromOtherBase(@p_in_value, @p_in_base)

   return bi_library.dbo.GetOtherBaseFromDecimal(@l_as_decimal, @p_to_base)
end
GO

USE [ab_test]
GO



ALTER function [dbo].[GetEmojisInString] (@p_in_string nvarchar(max)) returns nvarchar(max)
as
begin
    /*
      Repairs "mojibake": text in which every byte of a UTF-8 sequence was
      read as a separate single-byte (ANSI) character. Each such run of
      2, 3 or 4 characters is decoded back into the Unicode character it
      encodes; everything else is copied through unchanged.

      @p_in_string  text to repair.
      Returns       the repaired text; an empty string for NULL or empty input.
                    Trailing spaces are not preserved, because the scan
                    length comes from len().

      How it works
        * Converting the input to varchar turns each character back into the
          byte it was read from.
          NOTE(review): this assumes the database default collation uses a
          single-byte code page such as 1252 — confirm for other collations.
        * The lead byte decides the sequence length (C2-DF: 2, E0-EF: 3,
          F0-F4: 4) and every following byte must be a continuation byte
          (80-BF). Deciding the length from the decoded value instead
          mis-reads 4-byte sequences such as F0 A0 80 80 as 3-byte characters.
        * A character that does not start a well-formed sequence is copied
          to the output as-is rather than dropped.
    */
    declare @l_bytes  varchar(max) = @p_in_string   -- no fixed length, so long inputs are not truncated
    declare @l_len    int = len(@l_bytes)
    declare @l_pos    int = 1
    declare @l_out    nvarchar(max) = N''
    declare @l_lead   int      -- byte value at the current position
    declare @l_need   int      -- sequence length announced by the lead byte
    declare @l_next   int      -- byte value of a continuation byte
    declare @l_idx    int      -- offset of the continuation byte being read
    declare @l_cp     int      -- decoded code point, NULL when the sequence is not valid

    while @l_pos <= @l_len
    begin
        set @l_lead = ascii(substring(@l_bytes, @l_pos, 1))
        set @l_need = case
                          when @l_lead between 194 and 223 then 2   -- 0xC2-0xDF
                          when @l_lead between 224 and 239 then 3   -- 0xE0-0xEF
                          when @l_lead between 240 and 244 then 4   -- 0xF0-0xF4
                          else 1
                      end
        set @l_cp = null

        if @l_need > 1 and @l_pos + @l_need - 1 <= @l_len
        begin
            -- Payload bits of the lead byte: 5 (2-byte), 4 (3-byte) or 3 (4-byte).
            set @l_cp = @l_lead & case @l_need when 2 then 31 when 3 then 15 else 7 end
            set @l_idx = 1

            while @l_idx < @l_need
            begin
                set @l_next = ascii(substring(@l_bytes, @l_pos + @l_idx, 1))
                if @l_next between 128 and 191                       -- 0x80-0xBF
                begin
                    set @l_cp = @l_cp * 64 + (@l_next & 63)          -- append the 6 payload bits
                end
                else
                begin
                    set @l_cp = null
                    break
                end
                set @l_idx = @l_idx + 1
            end

            -- Reject over-long forms, UTF-16 surrogates and values beyond U+10FFFF.
            if  (@l_need = 3 and (@l_cp < 2048 or @l_cp between 55296 and 57343))
             or (@l_need = 4 and @l_cp not between 65536 and 1114111)
                set @l_cp = null
        end

        if @l_cp is null
        begin
            -- Plain ASCII, or an extended character that is not part of a
            -- valid sequence: keep it unchanged.
            set @l_out = @l_out + substring(@l_bytes, @l_pos, 1)
            set @l_pos = @l_pos + 1
        end
        else
        begin
            if @l_cp < 65536
            begin
                set @l_out = @l_out + nchar(@l_cp)
            end
            else
            begin
                -- nchar() returns NULL for values above 65535 unless the
                -- database collation is supplementary-character aware (_SC).
                -- Fall back to the UTF-16 surrogate pair so the function
                -- works under any collation.
                set @l_out = @l_out + coalesce(
                                 nchar(@l_cp),
                                 nchar(55296 + (@l_cp - 65536) / 1024) + nchar(56320 + (@l_cp - 65536) % 1024))
            end
            set @l_pos = @l_pos + @l_need
        end
    end

    return @l_out
end
GO

CREATE function [dbo].[HasEmojisInString] (@p_in_string nvarchar(max)) returns int
as
begin
    /*
      Reports whether dbo.GetEmojisInString changes @p_in_string.

      Returns 1 when the converted text differs from the input, otherwise 0.
      A NULL input also gives 0, because the comparison is then not true.
    */
    -- Same type as the function result: a varchar(1000) variable would cut
    -- the converted text at 1000 characters and make every longer input
    -- compare as different.
    declare @l_string_emojified nvarchar(max)

    set @l_string_emojified = dbo.GetEmojisInString(@p_in_string)

    if @l_string_emojified <> @p_in_string
    begin
       return 1
    end
    return 0
end
GO