我正在使用地图进行流式传输和分组，需要有关如何提高性能的建议

Question

我的源数据是这样的，它真的很大 xml 2+ GB。

    <?xml version="1.0" encoding="UTF-8"?>
    <Journal_Lines>
        <jrnl1 CY="USD" CCD="1001" CC="11062" IsPyJrl="1" AID="11382" LAI="107709"
            TLCCr="11062" TCAmt="222.85" TDAmt="0" CDI="C" CDAmt="222.85" DN=""
            EDt="2019-06-16-07:00" SCd="" HURCl="0"/>
        <jrnl1 CY="USD" CCD="1001" CC="11062" IsPyJrl="1" AID="11382" LAI="240997"
            TLCCr="11062" TCAmt="0" TDAmt="222.85" CDI="D" CDAmt="222.85" DN=""
            EDt="2019-06-16-07:00" SCd="" HURCl="0"/>
        <jrnl1 CY="USD" CCD="1001" CC="16835" IsPyJrl="1" AID="12661" LAI="107769"
            TLCCr="16835" TCAmt="94.06" TDAmt="0" CDI="C" CDAmt="94.06" DN="" EDt="2019-06-16-07:00"
            SCd="" HURCl="0"/>
        <jrnl1 CY="USD" CCD="1001" CC="16835" IsPyJrl="1" AID="12661" LAI="240997"
            TLCCr="16835" TCAmt="0" TDAmt="94.06" CDI="D" CDAmt="94.06" DN="" EDt="2019-06-16-07:00"
            SCd="" HURCl="0"/>
        <jrnl1 CY="USD" CCD="1001" CC="19655" IsPyJrl="1" AID="12731" LAI="240997"
            TLCCr="19655" TCAmt="0" TDAmt="899.11" CDI="D" CDAmt="899.11" DN=""
            EDt="2019-06-16-07:00" SCd="" HURCl="0"/>
        <jrnl1 CY="USD" CCD="1001" CC="19655" IsPyJrl="1" AID="12731" LAI="107709"
            TLCCr="19655" TCAmt="899.11" TDAmt="0" CDI="C" CDAmt="899.11" DN=""
            EDt="2019-06-16-07:00" SCd="" HURCl="0"/>
    </Journal_Lines>

我的输出是

<Journal_Lines xmlns:xs="http://www.w3.org/2001/XMLSchema"
    xmlns:map="http://www.w3.org/2005/xpath-functions/map">
    <Group CCD="1001" CC="11062">
        <Jrnln CY="USD" CCD="1001" CC="11062" IsPyJrl="1" AID="11382" LAI="107709" TLCCr="11062"
            TCAmt="222.85" TDAmt="0" CDI="C" CDAmt="222.85" DN="" EDt="2019-06-16-07:00" SCd=""
            HURCl="0"/>
        <Jrnln CY="USD" CCD="1001" CC="11062" IsPyJrl="1" AID="11382" LAI="240997" TLCCr="11062"
            TCAmt="0" TDAmt="222.85" CDI="D" CDAmt="222.85" DN="" EDt="2019-06-16-07:00" SCd=""
            HURCl="0"/>
    </Group>
    <Group CCD="1001" CC="16835">
        <Jrnln CY="USD" CCD="1001" CC="16835" IsPyJrl="1" AID="12661" LAI="107769" TLCCr="16835"
            TCAmt="94.06" TDAmt="0" CDI="C" CDAmt="94.06" DN="" EDt="2019-06-16-07:00" SCd=""
            HURCl="0"/>
        <Jrnln CY="USD" CCD="1001" CC="16835" IsPyJrl="1" AID="12661" LAI="240997" TLCCr="16835"
            TCAmt="0" TDAmt="94.06" CDI="D" CDAmt="94.06" DN="" EDt="2019-06-16-07:00" SCd=""
            HURCl="0"/>
    </Group>
    <Group CCD="1001" CC="19655">
        <Jrnln CY="USD" CCD="1001" CC="19655" IsPyJrl="1" AID="12731" LAI="240997" TLCCr="19655"
            TCAmt="0" TDAmt="899.11" CDI="D" CDAmt="899.11" DN="" EDt="2019-06-16-07:00" SCd=""
            HURCl="0"/>
        <Jrnln CY="USD" CCD="1001" CC="19655" IsPyJrl="1" AID="12731" LAI="107709" TLCCr="19655"
            TCAmt="899.11" TDAmt="0" CDI="C" CDAmt="899.11" DN="" EDt="2019-06-16-07:00" SCd=""
            HURCl="0"/>
    </Group>
</Journal_Lines>

我正在按 CC 和 CCD 进行分组和排序，我当前的代码在下面并且可以正常工作。但是需要很长时间。

    <?xml version="1.0" encoding="UTF-8"?>
    <!--
        Streams over Journal_Lines/jrnl1, collects the rows of each (CCD, CC)
        group into one delimited string held in a map, and writes the grouped
        and sorted result once the whole input has been read.

        String encoding used for the map:
          key   = CCD ^ CC
          value = rows separated by ^, attribute values inside a row separated by |
    -->
    <xsl:stylesheet version="3.0" xmlns:xsl="http://www.w3.org/1999/XSL/Transform"
        xmlns:xs="http://www.w3.org/2001/XMLSchema"
        xmlns:map="http://www.w3.org/2005/xpath-functions/map">

        <xsl:output indent="no"/>
        <xsl:mode streamable="yes" on-no-match="shallow-skip"/>
        <!-- Maps the 1-based position of a value inside a row string to the
             attribute name it is written back under in the output. -->
        <xsl:variable name="vElementMap" as="map(*)" 
            select="map { 
            1:'CY', 2:'CCD', 3:'CC', 4:'IsPyJrl', 5:'AID',
            6:'LAI', 7:'TLCCr', 8:'TCAmt', 9:'TDAmt', 10:'CDI',
            11:'CDAmt', 12:'DN', 13:'EDt', 14:'SCd', 15:'HURCl' }"
        />

        <xsl:template match="/">
            <xsl:iterate select="Journal_Lines/jrnl1">
                <!-- Accumulator carried from one jrnl1 to the next: group key to
                     the delimited string of all rows seen so far for that group. -->
                <xsl:param name="mapJournalLines" as="map(xs:string, xs:string)" select="map{}"/>

                <!-- Runs once after the last jrnl1; this is the only place where
                     output is produced. -->
                <xsl:on-completion>
                    <Journal_Lines>
                        <!-- Iterate over the group keys, sorted as strings. -->
                        <xsl:for-each select="map:for-each($mapJournalLines, function ($k, $v) {$k})">
                            <xsl:sort select="."/>
                            <!-- Split the key back into its CCD and CC parts. -->
                            <Group CCD="{substring-before(.,'^')}" CC="{substring-after(.,'^')}">
                                <!-- One token per row of the group. -->
                                <xsl:for-each select="tokenize($mapJournalLines(.),'\^')">
                                    <Jrnln>
                                        <!-- One token per attribute value; the token
                                             position selects the attribute name. -->
                                        <xsl:for-each select="tokenize(.,'\|')">
                                            <xsl:attribute name="{$vElementMap(position())}">
                                                <xsl:value-of select="."/>
                                            </xsl:attribute>
                                        </xsl:for-each>
                                    </Jrnln>
                                </xsl:for-each>
                            </Group>                        
                        </xsl:for-each>
                    </Journal_Lines>
                </xsl:on-completion>

                <!-- In-memory copy of the current jrnl1 element. -->
                <xsl:variable name="current-entry" select="copy-of()"/>
                <xsl:variable name="vKey" select="$current-entry/@CCD || '^' || $current-entry/@CC"/>
                <!-- Row string: all attribute values joined with |.
                     NOTE(review): the values are written in the order @* returns
                     them and read back by position through vElementMap, so this
                     assumes every jrnl1 carries the same attributes in the same
                     order as vElementMap; confirm against the real input. -->
                <xsl:variable name="vValue">
                    <xsl:for-each select="$current-entry/@*">
                        <xsl:if test="position() ne 1">|</xsl:if>
                        <xsl:value-of select="."/>
                    </xsl:for-each>
                </xsl:variable>

                <!-- Append the row to the string of an existing group, or start a
                     new group. Each append concatenates onto the whole string
                     accumulated so far for that group. -->
                <xsl:next-iteration>

                    <xsl:with-param name="mapJournalLines"
                        select="
                        if (map:contains($mapJournalLines, xs:string($vKey))) then
                        map:put($mapJournalLines, xs:string($vKey), $mapJournalLines(xs:string($vKey)) || '^' || xs:string($vValue)) 
                        else 
                        map:put($mapJournalLines, xs:string($vKey), xs:string($vValue))"
                    />

                </xsl:next-iteration>
            </xsl:iterate>

        </xsl:template>

    </xsl:stylesheet>

我正在将单个 jrnl1 节点转换为单个竖线分隔线，多个分组线由 ^ 分隔这适用于小负载，但需要永远处理大数据。

感谢任何帮助。

Answer 1

您以属性为中心的数据似乎是流式传输的良好输入 xsl:fork/xsl:for-each-group:

<xsl:stylesheet xmlns:xsl="http://www.w3.org/1999/XSL/Transform" version="3.0"
    xmlns:xs="http://www.w3.org/2001/XMLSchema" exclude-result-prefixes="#all">

    <!--
        Streaming stylesheet that groups the jrnl1 children of Journal_Lines by
        the composite key (CCD, CC) and wraps every group in a Group element.
        No xsl:sort is present, so the groups are not sorted by key here.
    -->

    <xsl:output indent="yes"/>

    <xsl:mode streamable="yes" on-no-match="shallow-skip"/>

    <!-- Copies the root element and emits one Group per distinct (CCD, CC). -->
    <xsl:template match="Journal_Lines">
        <xsl:copy>
            <!-- The group-by grouping is placed inside xsl:fork. -->
            <xsl:fork>
                <xsl:for-each-group select="jrnl1" composite="yes" group-by="@CCD, @CC">
                    <!-- With composite="yes" the grouping key is a sequence:
                         item 1 comes from @CCD, item 2 from @CC. -->
                    <Group CCD="{current-grouping-key()[1]}" CC="{current-grouping-key()[2]}">
                        <xsl:apply-templates select="current-group()"/>
                    </Group>
                </xsl:for-each-group>
            </xsl:fork>
        </xsl:copy>
    </xsl:template>

    <!-- Renames jrnl1 to Jrnln and keeps all of its attributes. -->
    <xsl:template match="jrnl1">
        <Jrnln>
            <xsl:copy-of select="@*"/>
        </Jrnln>
    </xsl:template>

</xsl:stylesheet>

然而，即使是这种方法也需要 XSLT 处理器在内存中缓冲组，因为直到最后一个元素被处理它属于哪个组时才可以确定，或者换句话说，它不能推出和关闭任何在处理最后一个元素之前进行分组。只有 group-adjacent 会减少缓冲的需要（以及 xsl:fork 的使用），但这显然需要输入将元素分组在一起，这些元素已经相互跟随。

使用排序与 XSLT 3 中的流式处理并不真正兼容，我认为对它的任何使用都会破坏任何流式分析，您需要加入 copy-of()，我不确定它是否有任何优势优于传统 XSLT：

<!-- Variant of the Journal_Lines template that also sorts the groups:
     each jrnl1 is first copied with copy-of(), then the copies are grouped
     by the composite key (CCD, CC) and sorted by CCD, then by CC. -->
<xsl:template match="Journal_Lines">
    <xsl:copy>
            <xsl:for-each-group select="jrnl1!copy-of()" composite="yes" group-by="@CCD, @CC">
                <!-- Primary sort key: CCD; secondary sort key: CC. -->
                <xsl:sort select="current-grouping-key()[1]"/>
                <xsl:sort select="current-grouping-key()[2]"/>
                <Group CCD="{current-grouping-key()[1]}" CC="{current-grouping-key()[2]}">
                    <xsl:apply-templates select="current-group()"/>
                </Group>
            </xsl:for-each-group>          
    </xsl:copy>
</xsl:template>

可能值得研究专用的 XML 数据库系统，例如 eXist 或 BaseX，如果它们（主要基于 XQuery）的处理允许比独立的 XSLT 3 处理器具有更高的性能和更少的内存密集型排序和分组。

对于您当前使用字符串映射的方法，您连接并标记化可能值得检查使用嵌套数组或 array/sequence 的嵌套是否性能更好，或者可能只是存储您已经拥有的元素 copy-of()ed 也比连接和拆分字符串更快：

<xsl:stylesheet version="3.0" xmlns:xsl="http://www.w3.org/1999/XSL/Transform"
    xmlns:xs="http://www.w3.org/2001/XMLSchema"
    xmlns:map="http://www.w3.org/2005/xpath-functions/map"
    exclude-result-prefixes="#all">

    <!--
        Streams over Journal_Lines/jrnl1 and collects an in-memory copy of every
        element in a map keyed by the string CCD ^ CC. After the last element
        the groups are written in the string order of their keys.

        exclude-result-prefixes keeps the xs and map namespace declarations,
        which are only needed by the stylesheet itself, out of the result.
    -->

    <xsl:output indent="yes"/>
    <xsl:mode streamable="yes" on-no-match="shallow-skip"/>

    <xsl:template match="/">
        <xsl:iterate select="Journal_Lines/jrnl1">
            <!-- Accumulator: group key to the copied jrnl1 elements of that
                 group, in the order in which they were read. -->
            <xsl:param name="mapJournalLines" as="map(xs:string, element(jrnl1)*)" select="map{}"/>

            <!-- Runs once after the last jrnl1; the only place that writes output. -->
            <xsl:on-completion>
                <Journal_Lines>
                    <!-- Visit the group keys sorted as strings. -->
                    <xsl:for-each select="map:keys($mapJournalLines)">
                        <xsl:sort select="."/>
                        <!-- Split the key back into its CCD and CC parts. -->
                        <Group CCD="{substring-before(., '^')}" CC="{substring-after(., '^')}">
                            <xsl:for-each select="$mapJournalLines(.)">
                                <Jrnln>
                                    <xsl:copy-of select="@*"/>
                                </Jrnln>
                            </xsl:for-each>
                        </Group>
                    </xsl:for-each>
                </Journal_Lines>
            </xsl:on-completion>

            <!-- In-memory copy of the current jrnl1 element; this is what gets stored. -->
            <xsl:variable name="current-entry" select="copy-of()"/>
            <xsl:variable name="vKey" as="xs:string" select="$current-entry/@CCD || '^' || $current-entry/@CC"/>

            <xsl:next-iteration>
                <!-- Looking up a key that is not in the map yields the empty
                     sequence, so appending to the lookup result covers both the
                     first and every later element of a group. -->
                <xsl:with-param name="mapJournalLines"
                    select="map:put($mapJournalLines, $vKey, ($mapJournalLines($vKey), $current-entry))"/>
            </xsl:next-iteration>
        </xsl:iterate>

    </xsl:template>

</xsl:stylesheet>

最后，为了保持字符串数据映射的原始方法，但为了避免所有的连接和标记化，您可以尝试 map(xs:string, array(xs:string)*)，即将每组数据存储为序列的映射字符串数组，其中每个数组代表最终输出中的一行：

<xsl:stylesheet version="3.0" xmlns:xsl="http://www.w3.org/1999/XSL/Transform"
    xmlns:xs="http://www.w3.org/2001/XMLSchema"
    xmlns:map="http://www.w3.org/2005/xpath-functions/map"
    exclude-result-prefixes="#all">

    <!--
        Streams over Journal_Lines/jrnl1 and stores, per CCD ^ CC key, a sequence
        of arrays; each array holds the attribute values of one jrnl1 as strings.
        After the last element the groups are written in the string order of
        their keys, one Jrnln per stored array.

        exclude-result-prefixes keeps the xs and map namespace declarations,
        which are only needed by the stylesheet itself, out of the result.
    -->

    <xsl:output indent="yes"/>
    <xsl:mode streamable="yes" on-no-match="shallow-skip"/>

    <!-- Maps the 1-based position of a value inside a stored array to the
         attribute name it is written back under in the output. -->
    <xsl:variable name="vElementMap" as="map(*)"
        select="map {
        1:'CY', 2:'CCD', 3:'CC', 4:'IsPyJrl', 5:'AID',
        6:'LAI', 7:'TLCCr', 8:'TCAmt', 9:'TDAmt', 10:'CDI',
        11:'CDAmt', 12:'DN', 13:'EDt', 14:'SCd', 15:'HURCl' }"
    />

    <xsl:template match="/">
        <xsl:iterate select="Journal_Lines/jrnl1">
            <!-- Accumulator: group key to the row arrays of that group, in the
                 order in which the rows were read. -->
            <xsl:param name="mapJournalLines" as="map(xs:string, array(xs:string)*)" select="map{}"/>

            <!-- Runs once after the last jrnl1; the only place that writes output. -->
            <xsl:on-completion>
                <Journal_Lines>
                    <!-- Visit the group keys sorted as strings. -->
                    <xsl:for-each select="map:keys($mapJournalLines)">
                        <xsl:sort select="."/>
                        <!-- Split the key back into its CCD and CC parts. -->
                        <Group CCD="{substring-before(., '^')}" CC="{substring-after(., '^')}">
                            <!-- One array per row of the group. -->
                            <xsl:for-each select="$mapJournalLines(.)">
                                <Jrnln>
                                    <!-- ?* yields the array members in order; the
                                         member position selects the attribute name. -->
                                    <xsl:for-each select="?*">
                                        <xsl:attribute name="{$vElementMap(position())}" select="."/>
                                    </xsl:for-each>
                                </Jrnln>
                            </xsl:for-each>
                        </Group>
                    </xsl:for-each>
                </Journal_Lines>
            </xsl:on-completion>

            <xsl:variable name="vKey" as="xs:string" select="@CCD || '^' || @CC"/>

            <!-- Exactly one array per jrnl1, holding its attribute values as strings.
                 NOTE(review): the values are stored in the order @* returns them
                 and read back by position through vElementMap, so this assumes
                 every jrnl1 carries the same attributes in the same order as
                 vElementMap; confirm against the real input. -->
            <xsl:variable name="vValue" as="array(xs:string)" select="array { @*!string() }"/>

            <xsl:next-iteration>
                <!-- Looking up a key that is not in the map yields the empty
                     sequence, so appending to the lookup result covers both the
                     first and every later row of a group. -->
                <xsl:with-param name="mapJournalLines"
                    select="map:put($mapJournalLines, $vKey, ($mapJournalLines($vKey), $vValue))"/>
            </xsl:next-iteration>
        </xsl:iterate>

    </xsl:template>

</xsl:stylesheet>

我正在使用地图进行流式传输和分组，需要有关如何提高性能的建议

I'm using maps for streaming and grouping, need advice on how to improve the performance

performance

xslt-grouping

xslt-3.0