有没有办法在 SQL Server 中进行 HTML 解码?

Is there any way to do HTML decode in SQL Server?

我的其中一张表(table)中有以下记录:
CD&amp;amp;amp;M Communications 
auburndale oil &amp;amp;amp; propane inc  
C F La Fountaine #7561  
Laramie County Fire District # 2  
AmeriGas Propane LP #2250  

有没有办法删除 &amp;amp;amp;、#7561、#2250 等字符?

按照 C# HtmlDecode 函数的行为,"&amp;amp;amp;" 应被替换为 "&"。

以下 SQL 函数适用于您的情况,或者可以作为您进一步扩展的良好起点。但是,请注意,与在应用程序层中进行字符串操作相比,在数据库(SQL Server)中进行字符串操作会更慢。

GO

IF OBJECT_ID('dbo.MyHTMLDecode') IS NOT NULL BEGIN DROP FUNCTION dbo.MyHTMLDecode END

GO
-- dbo.MyHTMLDecode: best-effort HTML decoder for VARCHAR text.
--
-- Parameter:
--   @vcWhat  text that may contain HTML entities.
-- Returns:
--   The text with CHAR(0) removed, leading/trailing spaces trimmed, and:
--     * '&#160;' / '&nbsp;' and any resulting non-breaking space turned into ' ',
--     * decimal entities of 2 to 4 digits ('&#39;', '&#233;', '&#8217;') replaced
--       by NCHAR(code); the result is VARCHAR, so the character is converted to
--       the VARCHAR code page on assignment,
--     * every '#' followed by four digits (e.g. '#7561') removed,
--     * '&amp;', '&quot;', '&lt;', '&gt;' and a remaining '&amp;amp;' decoded.
--   NULL input yields NULL.
-- Note: '&amp;' is decoded before the other named entities and '&amp;amp;' is
-- decoded afterwards, so multiply-encoded input is unwrapped more than one level.
CREATE FUNCTION dbo.MyHTMLDecode (@vcWhat VARCHAR(MAX))
RETURNS VARCHAR(MAX)
AS
BEGIN
    DECLARE @vcResult VARCHAR(MAX)
    DECLARE @siPos INT
        ,@vcEncoded VARCHAR(7)
        ,@siChar INT
        ,@siDigits INT
        ,@vcPattern VARCHAR(32)

    -- Binary collation so that CHAR(0) is matched exactly.
    SET @vcResult = RTRIM(LTRIM(CAST(REPLACE(@vcWhat COLLATE Latin1_General_BIN, CHAR(0), '') AS VARCHAR(MAX))))

    -- Both spellings of the non-breaking space entity become a plain space.
    SET @vcResult = REPLACE(REPLACE(@vcResult, '&#160;', ' '), '&nbsp;', ' ')

    IF @vcResult = ''
        RETURN @vcResult

    -- Decimal entities '&#NN;' .. '&#NNNN;'. The assignments are separate SET
    -- statements so each one sees the value computed by the previous one.
    SET @siDigits = 2
    WHILE @siDigits <= 4
    BEGIN
        SET @vcPattern = '%&#' + REPLICATE('[0-9]', @siDigits) + ';%'
        SET @siPos = PATINDEX(@vcPattern, @vcResult)

        WHILE @siPos > 0
        BEGIN
            SET @vcEncoded = SUBSTRING(@vcResult, @siPos, @siDigits + 3)
            SET @siChar = CAST(SUBSTRING(@vcEncoded, 3, @siDigits) AS INT)
            -- REPLACE handles every occurrence of this particular entity at once.
            SET @vcResult = REPLACE(@vcResult, @vcEncoded, NCHAR(@siChar))
            SET @siPos = PATINDEX(@vcPattern, @vcResult)
        END

        SET @siDigits = @siDigits + 1
    END

    -- Strip '#' + four digits (the '#7561' / '#2250' suffixes in the sample data).
    SET @siPos = PATINDEX('%#[0-9][0-9][0-9][0-9]%', @vcResult)

    WHILE @siPos > 0
    BEGIN
        SET @vcEncoded = SUBSTRING(@vcResult, @siPos, 5)
        SET @vcResult = REPLACE(@vcResult, @vcEncoded, '')
        SET @siPos = PATINDEX('%#[0-9][0-9][0-9][0-9]%', @vcResult)
    END

    -- Non-breaking spaces produced by the numeric pass become plain spaces.
    SET @vcResult = REPLACE(REPLACE(@vcResult, NCHAR(160), ' '), CHAR(160), ' ')

    SET @vcResult = REPLACE(
        REPLACE(
            REPLACE(
                REPLACE(
                    REPLACE(@vcResult, '&amp;', '&'),
                    '&quot;', '"'),
                '&lt;', '<'),
            '&gt;', '>'),
        '&amp;amp;', '&')

    RETURN @vcResult
END

GO

示例:

  -- Sample data from the question; the variable is referenced with the same
  -- casing it was declared with so the batch also runs on case-sensitive servers.
  DECLARE @S VARCHAR(MAX)='CD&amp;amp;amp;M Communications 
    auburndale oil &amp;amp;amp; propane inc  
    C F La Fountaine #7561  
    Laramie County Fire District # 2  
    AmeriGas Propane LP #2250'

    SELECT dbo.MyHTMLDecode (@S)

输出:

CD&M Communications 
auburndale oil & propane inc  
C F La Fountaine   
Laramie County Fire District # 2  
AmeriGas Propane LP 

上面的版本不适用于日文、韩文等字符……下面是修正后的版本:

GO
IF OBJECT_ID('dbo.fn_HTMLDecode') IS NOT NULL BEGIN DROP FUNCTION dbo.fn_HTMLDecode END
GO
-- dbo.fn_HTMLDecode: best-effort HTML decoder for Unicode (NVARCHAR) text.
--
-- Parameters:
--   @vcWhat                  text that may contain HTML entities.
--   @toDecodeMainISOSymbols  1 = decode &amp; &quot; &lt; &gt; and the
--                            punctuation entities (quotes, dashes, bullet, ...).
--   @toDecodeISOChars        1 = decode the accented-letter entities
--                            (&eacute;, &Ntilde;, &szlig;, ...).
-- Returns:
--   The decoded text with CHAR(0) removed and leading/trailing spaces trimmed.
--   '&#160;' / '&nbsp;' become a plain space. Decimal entities '&#N;' with 1 to
--   7 digits are always decoded, regardless of the two flags. NULL yields NULL.
-- Notes:
--   * Numeric entities are decoded first and '&amp;' is decoded before the other
--     named entities, so multiply-encoded input ('&amp;lt;') is unwrapped more
--     than one level.
--   * Replacement literals use the N prefix so they do not depend on the
--     database code page.
CREATE FUNCTION dbo.fn_HTMLDecode(
    @vcWhat NVARCHAR(MAX)
    ,@toDecodeMainISOSymbols bit = 1
    ,@toDecodeISOChars bit = 1
)
RETURNS NVARCHAR(MAX)
AS
BEGIN
    DECLARE @vcResult NVARCHAR(MAX);
    DECLARE @siPos INT, @vcEncoded NVARCHAR(10), @siChar INT;
    DECLARE @vcDecoded NVARCHAR(2), @vcPattern VARCHAR(64), @siDigits INT;

    -- Binary collation so that CHAR(0) is matched exactly.
    SET @vcResult = RTRIM(LTRIM(CAST(REPLACE(@vcWhat COLLATE Latin1_General_BIN, CHAR(0), '') AS NVARCHAR(MAX))));
    SET @vcResult = REPLACE(REPLACE(@vcResult, '&#160;', ' '), '&nbsp;', ' ');
    IF @vcResult = '' RETURN @vcResult;

    -- Decimal entities, longest first. The pattern includes the closing ';' so a
    -- run of digits that is not terminated by ';' is left untouched.
    SET @siDigits = 7;
    WHILE @siDigits > 0
    BEGIN
        SET @vcPattern = '%&#' + REPLICATE('[0-9]', @siDigits) + ';%';
        SET @siPos = PATINDEX(@vcPattern, @vcResult);
        WHILE @siPos > 0
        BEGIN
            SET @vcEncoded = SUBSTRING(@vcResult, @siPos, @siDigits + 3);
            SET @siChar = CAST(SUBSTRING(@vcEncoded, 3, @siDigits) AS INT);
            SET @vcDecoded = CASE
                -- 0 and values beyond U+10FFFF are not characters: use U+FFFD.
                WHEN @siChar = 0 OR @siChar > 1114111 THEN NCHAR(65533)
                -- Above the BMP: build the UTF-16 surrogate pair by hand, because
                -- NCHAR() returns NULL for these values on non-SC collations.
                WHEN @siChar > 65535 THEN
                    NCHAR(55296 + (@siChar - 65536) / 1024)
                    + NCHAR(56320 + (@siChar - 65536) % 1024)
                ELSE NCHAR(@siChar)
            END;
            -- REPLACE handles every occurrence of this particular entity at once.
            SET @vcResult = REPLACE(@vcResult, @vcEncoded, @vcDecoded);
            SET @siPos = PATINDEX(@vcPattern, @vcResult);
        END
        SET @siDigits = @siDigits - 1;
    END

    IF @toDecodeMainISOSymbols = 1
    BEGIN
        SET @vcResult = REPLACE(REPLACE(@vcResult, NCHAR(160), ' '), CHAR(160), ' ');

        -- Matched with the default collation of the database.
        SET @vcResult = REPLACE(@vcResult, '&amp;', '&');
        SET @vcResult = REPLACE(@vcResult, '&quot;', '"');
        SET @vcResult = REPLACE(@vcResult, '&lt;', '<');
        SET @vcResult = REPLACE(@vcResult, '&gt;', '>');
        SET @vcResult = REPLACE(@vcResult, '&amp;amp;', '&');
        SET @vcResult = REPLACE(@vcResult, '&ldquo;', N'“');
        SET @vcResult = REPLACE(@vcResult, '&rdquo;', N'”');
        SET @vcResult = REPLACE(@vcResult, '&bdquo;', N'„');
        SET @vcResult = REPLACE(@vcResult, '&ndash;', N'–');
        SET @vcResult = REPLACE(@vcResult, '&mdash;', N'—');
        SET @vcResult = REPLACE(@vcResult, '&lsquo;', N'‘');
        SET @vcResult = REPLACE(@vcResult, '&rsquo;', N'’');
        SET @vcResult = REPLACE(@vcResult, '&bull;', N'•');
        SET @vcResult = REPLACE(@vcResult, '&hellip;', N'…');
        SET @vcResult = REPLACE(@vcResult, '&permil;', N'‰');

        -- Binary (case-sensitive) matching: &prime; and &Prime; differ only by case.
        SET @vcResult = REPLACE(@vcResult COLLATE Latin1_General_BIN, '&prime;', N'′');
        SET @vcResult = REPLACE(@vcResult COLLATE Latin1_General_BIN, '&Prime;', N'″');
        SET @vcResult = REPLACE(@vcResult COLLATE Latin1_General_BIN, '&circ;', N'ˆ');
        SET @vcResult = REPLACE(@vcResult COLLATE Latin1_General_BIN, '&tilde;', N'˜');
        SET @vcResult = REPLACE(@vcResult COLLATE Latin1_General_BIN, '&nbsp;', ' ');
    END

    IF @toDecodeISOChars = 1
    BEGIN
        -- Every pair below differs only by case, so all matching is binary.
        SET @vcResult = REPLACE(@vcResult COLLATE Latin1_General_BIN, '&Scaron;', N'Š');
        SET @vcResult = REPLACE(@vcResult COLLATE Latin1_General_BIN, '&scaron;', N'š');
        SET @vcResult = REPLACE(@vcResult COLLATE Latin1_General_BIN, '&Ccedil;', N'Ç');
        SET @vcResult = REPLACE(@vcResult COLLATE Latin1_General_BIN, '&ccedil;', N'ç');

        SET @vcResult = REPLACE(@vcResult COLLATE Latin1_General_BIN, '&Agrave;', N'À');
        SET @vcResult = REPLACE(@vcResult COLLATE Latin1_General_BIN, '&agrave;', N'à');
        SET @vcResult = REPLACE(@vcResult COLLATE Latin1_General_BIN, '&Aacute;', N'Á');
        SET @vcResult = REPLACE(@vcResult COLLATE Latin1_General_BIN, '&aacute;', N'á');
        SET @vcResult = REPLACE(@vcResult COLLATE Latin1_General_BIN, '&Acirc;', N'Â');
        SET @vcResult = REPLACE(@vcResult COLLATE Latin1_General_BIN, '&acirc;', N'â');
        SET @vcResult = REPLACE(@vcResult COLLATE Latin1_General_BIN, '&Atilde;', N'Ã');
        SET @vcResult = REPLACE(@vcResult COLLATE Latin1_General_BIN, '&atilde;', N'ã');
        SET @vcResult = REPLACE(@vcResult COLLATE Latin1_General_BIN, '&Auml;', N'Ä');
        SET @vcResult = REPLACE(@vcResult COLLATE Latin1_General_BIN, '&auml;', N'ä');
        SET @vcResult = REPLACE(@vcResult COLLATE Latin1_General_BIN, '&Aring;', N'Å');
        SET @vcResult = REPLACE(@vcResult COLLATE Latin1_General_BIN, '&aring;', N'å');
        SET @vcResult = REPLACE(@vcResult COLLATE Latin1_General_BIN, '&AElig;', N'Æ');
        SET @vcResult = REPLACE(@vcResult COLLATE Latin1_General_BIN, '&aelig;', N'æ');

        SET @vcResult = REPLACE(@vcResult COLLATE Latin1_General_BIN, '&Egrave;', N'È');
        SET @vcResult = REPLACE(@vcResult COLLATE Latin1_General_BIN, '&egrave;', N'è');
        SET @vcResult = REPLACE(@vcResult COLLATE Latin1_General_BIN, '&Eacute;', N'É');
        SET @vcResult = REPLACE(@vcResult COLLATE Latin1_General_BIN, '&eacute;', N'é');
        SET @vcResult = REPLACE(@vcResult COLLATE Latin1_General_BIN, '&Ecirc;', N'Ê');
        SET @vcResult = REPLACE(@vcResult COLLATE Latin1_General_BIN, '&ecirc;', N'ê');
        SET @vcResult = REPLACE(@vcResult COLLATE Latin1_General_BIN, '&Euml;', N'Ë');
        SET @vcResult = REPLACE(@vcResult COLLATE Latin1_General_BIN, '&euml;', N'ë');

        SET @vcResult = REPLACE(@vcResult COLLATE Latin1_General_BIN, '&Igrave;', N'Ì');
        SET @vcResult = REPLACE(@vcResult COLLATE Latin1_General_BIN, '&igrave;', N'ì');
        SET @vcResult = REPLACE(@vcResult COLLATE Latin1_General_BIN, '&Iacute;', N'Í');
        SET @vcResult = REPLACE(@vcResult COLLATE Latin1_General_BIN, '&iacute;', N'í');
        SET @vcResult = REPLACE(@vcResult COLLATE Latin1_General_BIN, '&Icirc;', N'Î');
        SET @vcResult = REPLACE(@vcResult COLLATE Latin1_General_BIN, '&icirc;', N'î');
        SET @vcResult = REPLACE(@vcResult COLLATE Latin1_General_BIN, '&Iuml;', N'Ï');
        SET @vcResult = REPLACE(@vcResult COLLATE Latin1_General_BIN, '&iuml;', N'ï');

        SET @vcResult = REPLACE(@vcResult COLLATE Latin1_General_BIN, '&Ograve;', N'Ò');
        SET @vcResult = REPLACE(@vcResult COLLATE Latin1_General_BIN, '&ograve;', N'ò');
        SET @vcResult = REPLACE(@vcResult COLLATE Latin1_General_BIN, '&Oacute;', N'Ó');
        SET @vcResult = REPLACE(@vcResult COLLATE Latin1_General_BIN, '&oacute;', N'ó');
        SET @vcResult = REPLACE(@vcResult COLLATE Latin1_General_BIN, '&Ocirc;', N'Ô');
        SET @vcResult = REPLACE(@vcResult COLLATE Latin1_General_BIN, '&ocirc;', N'ô');
        SET @vcResult = REPLACE(@vcResult COLLATE Latin1_General_BIN, '&Otilde;', N'Õ');
        SET @vcResult = REPLACE(@vcResult COLLATE Latin1_General_BIN, '&otilde;', N'õ');
        SET @vcResult = REPLACE(@vcResult COLLATE Latin1_General_BIN, '&Ouml;', N'Ö');
        SET @vcResult = REPLACE(@vcResult COLLATE Latin1_General_BIN, '&ouml;', N'ö');
        SET @vcResult = REPLACE(@vcResult COLLATE Latin1_General_BIN, '&Oslash;', N'Ø');
        SET @vcResult = REPLACE(@vcResult COLLATE Latin1_General_BIN, '&oslash;', N'ø');

        SET @vcResult = REPLACE(@vcResult COLLATE Latin1_General_BIN, '&Ugrave;', N'Ù');
        SET @vcResult = REPLACE(@vcResult COLLATE Latin1_General_BIN, '&ugrave;', N'ù');
        SET @vcResult = REPLACE(@vcResult COLLATE Latin1_General_BIN, '&Uacute;', N'Ú');
        SET @vcResult = REPLACE(@vcResult COLLATE Latin1_General_BIN, '&uacute;', N'ú');
        SET @vcResult = REPLACE(@vcResult COLLATE Latin1_General_BIN, '&Ucirc;', N'Û');
        SET @vcResult = REPLACE(@vcResult COLLATE Latin1_General_BIN, '&ucirc;', N'û');
        SET @vcResult = REPLACE(@vcResult COLLATE Latin1_General_BIN, '&Uuml;', N'Ü');
        SET @vcResult = REPLACE(@vcResult COLLATE Latin1_General_BIN, '&uuml;', N'ü');

        SET @vcResult = REPLACE(@vcResult COLLATE Latin1_General_BIN, '&ETH;', N'Ð');
        SET @vcResult = REPLACE(@vcResult COLLATE Latin1_General_BIN, '&eth;', N'ð');
        SET @vcResult = REPLACE(@vcResult COLLATE Latin1_General_BIN, '&Ntilde;', N'Ñ');
        SET @vcResult = REPLACE(@vcResult COLLATE Latin1_General_BIN, '&ntilde;', N'ñ');
        SET @vcResult = REPLACE(@vcResult COLLATE Latin1_General_BIN, '&Yacute;', N'Ý');
        SET @vcResult = REPLACE(@vcResult COLLATE Latin1_General_BIN, '&yacute;', N'ý');
        SET @vcResult = REPLACE(@vcResult COLLATE Latin1_General_BIN, '&THORN;', N'Þ');
        SET @vcResult = REPLACE(@vcResult COLLATE Latin1_General_BIN, '&thorn;', N'þ');
        SET @vcResult = REPLACE(@vcResult COLLATE Latin1_General_BIN, '&szlig;', N'ß');
    END

    RETURN @vcResult;
END
-- test:
-- select dbo.fn_HTMLDecode(N'A fine example of man and nature co-existing is Slovenia&#8217;s ecological tourist farms.',1,1)
-- select dbo.fn_HTMLDecode(N'm0 &#50752;&#51064;&#48516;&#50556;&#50640;&#49436; m1 &#44032;&#51109; m2 &#50689;&#54693;&#47141; m3&#51080;&#45716;m10',1,1)

有一个更简单的解决方案...

SQL Server 支持 XML 数据类型,并且支持解码 XML/HTML 编码的实体。只要把字符串转换为 XML 数据类型,就可以使用内置的解码功能。

看起来像这样:

-- Casting to XML resolves the entity; .value() reads the text back as NVARCHAR.
SELECT CAST('Q &amp; A' AS XML).value('.[1]', 'nvarchar(max)');

把它变成一个函数以便于使用:

-- dbo.xmlDecode: decodes XML entities by casting the text to XML and reading
-- its string value back.
--
-- Parameter:
--   @string  text containing XML entities (&amp;, &lt;, &#8364;, &#xC9;, ...).
-- Returns:
--   The decoded text as NVARCHAR(MAX), so decoded Unicode characters are not
--   narrowed to the VARCHAR code page. NULL input yields NULL.
-- The cast fails when @string is not well-formed XML content, for example
-- when it contains a bare '&' or '<'.
create function dbo.xmlDecode (@string nvarchar(max))
returns nvarchar(max)
as
begin
    return cast(@string as XML).value('.[1]', 'nvarchar(max)')
end;

请记住,在 OP 的示例中,字符串似乎连续编码了 3 次。 & 变成了 &amp;,然后变成了 &amp;amp;,然后变成了 &amp;amp;amp;。结果,要取回 "original" 字符串,必须使用 decode 函数 3 次。

使用来自 @Wouter 的转换(cast)方法可以处理十进制和十六进制实体,而被接受的答案做不到这一点。

但是,还需要额外的处理逻辑来应对输入中的 XML 特殊字符。

XML 特殊字符:<, >, ", ', &

-- [dbo].[XMLdecode]: decodes XML entities in free text that may also contain
-- XML special characters (<, >, ", ', &) that are not part of an entity.
--
-- Parameter:
--   @input  text to decode; NULL is handled like an empty string.
-- Returns:
--   The text with the five XML named entities (&amp; &lt; &gt; &quot; &apos;)
--   and decimal / hexadecimal character references decoded.
--
-- How it works: the text is copied chunk by chunk. Each '&' is followed up to
-- the next ';'. When the text in between is one of the five XML names, or a
-- numeric reference to a character XML 1.0 allows, the entity is kept
-- (lower-cased, as XML requires '&#x' and lower-case names). Otherwise the '&'
-- is escaped as '&amp;' so it survives as a literal character. The remaining
-- special characters are then escaped and the result is cast to XML, which
-- performs the actual decoding.
--
-- Consequences: anything that merely looks like an entity ('&;', '&check;',
-- '&#x2;', 'A & B; C') comes back unchanged instead of raising an XML parse
-- error, and trailing spaces of @input are processed like any other text.
create function [dbo].[XMLdecode] (@input nvarchar(max))
returns nvarchar(max)
begin

    declare @text nvarchar(max) = coalesce(@input, N'')
    declare @output nvarchar(max) = N''
    declare @pos int = 1          -- first character of @text not yet copied
    declare @amp int              -- position of the next '&'
    declare @semi int             -- position of the first ';' after that '&'
    declare @body nvarchar(10)    -- lower-cased text between '&' and ';'
    declare @code int             -- code point of a numeric reference
    declare @isEntity bit

    while 1 = 1
    begin

        set @amp = charindex(N'&', @text, @pos)

        if @amp = 0
        begin
            -- No more ampersands: copy the tail. datalength() is only an upper
            -- bound for the length and, unlike len(), counts trailing spaces.
            set @output += substring(@text, @pos, datalength(@text))
            break
        end

        -- Literal text in front of the ampersand.
        set @output += substring(@text, @pos, @amp - @pos)

        set @isEntity = 0
        set @code = null
        set @semi = charindex(N';', @text, @amp + 1)

        -- 1 to 10 characters between '&' and ';' covers '#x' + 8 hex digits
        -- and '#' + 9 decimal digits; both still fit into an int.
        if @semi - @amp between 2 and 11
        begin
            set @body = lower(substring(@text, @amp + 1, @semi - @amp - 1))

            -- The ';' is appended so trailing blanks cannot compare as equal.
            if (@body + N';') collate Latin1_General_BIN
                in (N'amp;', N'lt;', N'gt;', N'quot;', N'apos;')
            begin
                set @isEntity = 1
            end
            else
            begin
                -- Binary collation keeps [0-9] and [a-f] to exactly those characters.
                if @body collate Latin1_General_BIN like N'#[0-9]%'
                    and @body collate Latin1_General_BIN not like N'#%[^0-9]%'
                begin
                    set @code = cast(substring(@body, 2, 9) as int)
                end
                else if @body collate Latin1_General_BIN like N'#x[0-9a-f]%'
                    and substring(@body, 3, 8) collate Latin1_General_BIN not like N'%[^0-9a-f]%'
                begin
                    -- Left-pad to 8 hex digits and convert through varbinary(4);
                    -- values with the high bit set come out negative and are rejected below.
                    set @code = convert(int, convert(varbinary(4),
                        right('00000000' + cast(substring(@body, 3, 8) as varchar(8)), 8), 2))
                end

                -- Characters permitted by XML 1.0; anything else would make the cast fail.
                if @code in (9, 10, 13)
                    or @code between 32 and 55295
                    or @code between 57344 and 65533
                    or @code between 65536 and 1114111
                begin
                    set @isEntity = 1
                end
            end
        end

        if @isEntity = 1
        begin
            set @output += N'&' + @body + N';'
            set @pos = @semi + 1
        end
        else
        begin
            -- Not an entity: escape the ampersand and rescan right after it.
            set @output += N'&amp;'
            set @pos = @amp + 1
        end
    end

    set @output = replace(@output,'<','&lt;')
    set @output = replace(@output,'>','&gt;')
    set @output = replace(@output,'"','&quot;')
    set @output = replace(@output,'''','&apos;')

    return cast(@output as XML).value('.[1]','nvarchar(max)')

end;

用法示例:

-- Mixed hex / decimal entities plus bare '&' and '<' characters in one call.
SELECT dbo.XMLdecode(
    'this is a t&#xc9;st &#xba;f HEX &amp; DECIMAL &#8364;ntities & <<<< non-entities too! &#9745;'
)
------------------------------------------------------------------------------
returns: 'this is a tÉst ºf HEX & DECIMAL €ntities & <<<< non-entities too! ☑'

它不是防弹的,但处理了我所有的案例。

看起来像 HTML 实体但实际上不是的东西仍然会造成麻烦。例如:&#x2; 甚至 &;

它也不处理 non-XML 命名实体,例如 &check;&hearts;