java 中给定压缩文本的 lzw 解压算法

lzw decompression algorithm in java for given compressed text

我有以下作为字节数组给出的压缩文本:

byte[] compressed = [97, 2, 10, 28, 72, -80, -96, -63, -125, 8, 19, 42, 92, -56, -80, -95, -61, -121, 16, 35, 74, -100, 72, -79, -94, -59, -117, 24, 51, 106, -36, -56, -79, -93, -57, -113, 32, 67, -118, 28, 73, -78, -92, -55, -109, 40, 83, -86, 92, -55, -78, -91, -53, -105, 48, 99, -54, -100, 73, -77, -90, -51, -101, 56, 115, -22, -36, -55, -77, -89, -49, -97, 64, -125, 10, 29, 74, -76, -88, -47, -93, 72, -109, 42, 93, -54, -76, -87, -45, -89, 80, -93, 74, -99, 74, -75, -86, -43, -85, 88, -77, 106, -35, -54, -75, -85, -41, -81, 96, -61, -118, 29, 75, -74, -84, -39, -77, 104, -45, -86, 93, -53, -74, -83, -37, -73, 112, -29, -54, -99, 75, -73, -82, -35, -69, 120, -13, -22, -35, -53, -73, -81, -33, -65, -128, 3, 11, 30, 76, -72, -80, -31, -61, -120, 19, 43, 94, -52, -72, -79, -29, -57, -112, 35, 75, -98, 76, -71, -78, -27, -53, -104, 51, 107, -34, -52, -71, -77, -25, -49, -96, 67, -117, 30, 77, -70, -76, -23, -45, -88, 83, -85, 94, -51, -70, -75, -21, -41, -80, 99, -53, -98, 77, -69, -74, -19, -37, -72, 115, -21, -34, -51, -69, -73, -17, -33, -64, -125, 11, 31, 78, -68, -72, -15, -29, -56, -109, 43, 95, -50, -68, -71, -13, -25, -48, -93, 75, -97, 78, -67, -70, -11, -21, -40, -77, 107, -33, -50, -67, -69, -9, -17, -32, -61, -117, 31, 79, -66, -68, -7, -13, -24, -45, -85, 95, -49, -66, -67, -5, -9, -16, -29, -53, -97, 79, -65, -66, -3, -5, -8, -13, -21, -33, -49, -65, -65, -1, -1, 0, -61, 4, 20, 56, -112, 96, 65, -125, 7, 17, 38, 84, -72, -112, 97, 67, -121, 15, 33, 70, -108, 56, -111, 98, 69, -117, 23, 49, 102, -44, -72, -111, 99, 71, -113, 31, 65, -122, 20, 57, -110, 100, 73, -109, 39, 81, -90, 84, -71, -110, 101, 75, -105, 47, 97, -58, -108, 57, -109, 102, 77, -101, 55, 113, -26, -44, -71, -109, 103, 79, -97, 63, -127, 6, 21, 58, -108, 104, 81, -93, 71, -111, 38, 85, -70, -108, 105, 83, -89, 79, -95, 70, -107, 58, -107, 106, 85, -85, 87, -79, 102, -43, -70, -107, 107, 87, -81, 95, -63, -122, 21, 59, -106, 108, 89, -77, 103, -47, -90, 85, -69, -106, 109, 91, -73, 111, 
91, 2];

以及给定的码字长度:9 位(bits)。

解压后的字符串应该是 aaaaaaaaaaaaaaaaaaaaaaaaa...,全部由字符 *a* 组成,总长度为 39270。

我想编写一个解压缩函数,用于解压这个压缩后的字节数组,并返回由 *a* 组成的结果字符串。

我尝试了正常的 LZW 实现,但效果不是很好。给定的 9 位码字长度和字节数组中的负值让我有些头疼。

我的理解是我必须将整个字节数组转换为一个二进制字符串并每 9 位(码字长度)读取每个值?

有没有人能给一些提示或建议,告诉我该如何做到这一点?谢谢,非常感谢您的支持。

编辑:

下面是我到目前为止尝试过的一些代码,它适用于

byte[] compressed = [68, 0, 97, 0, 115, 0, 32, 0, 105, 0, 115, 0, 116, 0, 32, 0, 101, 0, 105, 0, 110, 0, 32, 0, 107, 0, 117, 0, 114, 0, 122, 0, 101, 0, 114, 0, 32, 0, 84, 0, 101, 0, 120, 0, 116, 0] 

和给定的 16 位码字长度。

结果字符串是 Das ist ein kurzer Text.

/**
 * Splits the bit representation of {@code compressedData} into groups of
 * {@code codeWordLength} bits.
 *
 * <p>Every byte is rendered with {@code byteToBinaryString}; the renderings are
 * concatenated in array order and the resulting character sequence is cut into
 * consecutive groups. If the last group is incomplete, the bits that are present
 * are kept and the group is right-padded with {@code '0'} characters so that every
 * returned string has exactly {@code codeWordLength} characters.
 *
 * @param compressedData the bytes to split; must not be {@code null}
 * @param codeWordLength number of bits per group; must be positive
 * @return the groups in input order
 * @throws IllegalArgumentException if {@code codeWordLength} is not positive
 */
public static List<String> convertByteArrayToBinaryStringList(byte[] compressedData, int codeWordLength) {
    if (codeWordLength <= 0) {
        // A non-positive length would never advance the read position below.
        throw new IllegalArgumentException("codeWordLength must be positive: " + codeWordLength);
    }

    StringBuilder bits = new StringBuilder();
    for (byte b : compressedData) {
        bits.append(byteToBinaryString(b));
    }

    List<String> binaryCompressedValues = new ArrayList<String>();
    int totalBits = bits.length();

    for (int start = 0; start < totalBits; start += codeWordLength) {
        int end = Math.min(start + codeWordLength, totalBits);
        StringBuilder group = new StringBuilder(codeWordLength);
        group.append(bits, start, end);

        if (end - start < codeWordLength) {
            // Incomplete final group: keep the real bits and pad only the missing positions.
            System.out.println("End reached!");
            for (int i = end - start; i < codeWordLength; i++) {
                group.append('0');
            }
        }

        binaryCompressedValues.add(group.toString());
    }

    return binaryCompressedValues;
}

/**
 * Adds one to a number written as a string of binary digits.
 *
 * <p>The result is left-padded with {@code '0'} so that it is at least as long as
 * the input; if the incremented value needs more digits than the input had, the
 * longer, unpadded binary form is returned.
 *
 * @param currentDictSize the current value in base 2
 * @return the value plus one in base 2, zero-padded to the input's length
 * @throws NumberFormatException if {@code currentDictSize} cannot be parsed as a base-2 {@code int}
 */
public String incrementDictSize(String currentDictSize) {
    final int next = Integer.parseInt(currentDictSize, 2) + 1;
    final String digits = Integer.toBinaryString(next);

    final StringBuilder out = new StringBuilder();
    // Restore the leading zeros that the int round trip dropped.
    for (int missing = currentDictSize.length() - digits.length(); missing > 0; missing--) {
        out.append('0');
    }
    return out.append(digits).toString();
}

/**
 * Decompresses LZW data stored as fixed-width 16-bit code words.
 *
 * <p>The input is split into 16-bit groups by
 * {@code convertByteArrayToBinaryStringList}, and the dictionary is keyed by those
 * bit strings. It is seeded with the 256 single-character entries, whose keys are
 * the 8-bit binary form of the character value followed by eight zero bits. Entries
 * created while decoding are stored under the keys for codes 256, 257, ... built in
 * that same layout (low byte first, then high byte).
 * NOTE(review): this assumes the bit strings produced for each byte are 8 characters,
 * most significant bit first, as the seeded keys already require - confirm against
 * {@code byteToBinaryString}.
 *
 * @param compressedData the compressed bytes
 * @return the decompressed text, one byte per character (ISO-8859-1)
 * @throws IllegalArgumentException if a code word is neither in the dictionary nor
 *         the next code that would be assigned
 */
public byte[] uncompress(byte[] compressedData) {
    int codeWordLength = 16;

    List<String> binaryCompressedValues =
            convertByteArrayToBinaryStringList(compressedData, codeWordLength);

    Map<String, String> dictionary = new HashMap<String, String>();
    for (int i = 0; i < 256; i++) {
        dictionary.put(codeKey(i), "" + (char) i);
    }

    // Codes 0..255 are the seeded single characters; new entries start at 256.
    int nextCode = 256;
    int maxCodes = 1 << codeWordLength;

    // Previously decoded string; empty until the first code word has been read.
    String w = "";
    StringBuilder result = new StringBuilder();

    for (String k : binaryCompressedValues) {
        String entry = dictionary.get(k);

        if (entry == null) {
            // The only legal unknown code is the one about to be defined:
            // the previous string followed by its own first character.
            if (w.isEmpty() || nextCode >= maxCodes || !k.equals(codeKey(nextCode))) {
                throw new IllegalArgumentException("Invalid LZW code word: " + k);
            }
            entry = w + w.charAt(0);
        }

        // The first code word has no predecessor, so it defines no new entry.
        if (!w.isEmpty() && nextCode < maxCodes) {
            dictionary.put(codeKey(nextCode++), w + entry.charAt(0));
        }

        result.append(entry);
        w = entry;
    }

    // Every character is in 0..255, so ISO-8859-1 maps each one to exactly one byte,
    // independent of the platform default charset.
    return result.toString().getBytes(java.nio.charset.StandardCharsets.ISO_8859_1);
}

/** Builds the 16-character dictionary key of a code: low byte bits, then high byte bits. */
private static String codeKey(int code) {
    return eightBits(code & 0xFF) + eightBits((code >>> 8) & 0xFF);
}

/** Renders a value in 0..255 as exactly eight binary digits. */
private static String eightBits(int value) {
    return String.format("%8s", Integer.toBinaryString(value)).replace(' ', '0');
}

下面是按 LSB 优先(最低有效位在前)读取 n 位码字的示例代码。
请留意一些琐碎的细节,例如码字长度发生变化时的对齐方式。

/** An <code>NBitsInputStream</code> reads its <code>InputStream</code>
 *  as codes of n bits, least significant bits first.
 */// extend with set/increaseCodeLength() as needed
class NBitsInputStream extends java.io.FilterInputStream {
    /**
     * Largest supported code length. Up to 24 leftover bits plus one freshly read
     * byte must fit into the 32-bit buffer.
     */
    private static final int MAX_CODE_LENGTH = 25;

    /** Bits read from the underlying stream but not yet returned; bit 0 is the oldest. */
    int buffer, validBits;
    int codeLength, codeMask;

    /**
     * @param in the stream to read bytes from
     * @param n  the number of bits per code, from 1 to 25
     * @throws IllegalArgumentException if {@code n} is outside that range
     */
    protected NBitsInputStream(InputStream in, int n) {
        super(in);
        if (n < 1 || n > MAX_CODE_LENGTH) {
            throw new IllegalArgumentException(
                    "code length must be between 1 and " + MAX_CODE_LENGTH + ": " + n);
        }
        codeLength = n;
        codeMask = (1 << n) - 1;
    }

    /** Reads a code of n bits, least significant bits first.
     * Bits of an incomplete trailing code are not returned.
     * @return  the code, or -1 if -1 is read. */
    @Override
    public int read() throws IOException {
        while (validBits < codeLength) {
            int high = super.read();
            if (high < 0)   // EOF
                return high;
            buffer |= high << validBits;
            validBits += 8;
        }
        int code = buffer & codeMask;
        validBits -= codeLength;
        // Unsigned shift: the buffer may use bit 31, which must not be sign-extended.
        buffer >>>= codeLength;
        return code;
    }
}

(我用下面的循环试过:

    // Decode loop: each code read extends the dictionary by one entry and appends
    // the decoded string to the result.
    for (int code ; 0 <= (code = codes.read()) ; ) {
        String entry = (String) dictionary.get(code);
        // New dictionary entry: previous string plus the first character of the
        // current one. For a code not yet in the dictionary, the current string
        // starts with the previous string, so its first character is w's.
        w += (null != entry ? entry : w).charAt(0);
        dictionary.put(++currentDictSize, w);
        // An unknown code denotes exactly the entry just created; without this,
        // null would be appended to the result and carried over as w.
        if (null == entry)
            entry = w;
        result.append(w = entry);
    }

没有观察到任何意外情况。)

请注意,您的字典不仅包含最长的字符串,还包含它们的每一个前缀;使用 16 位码字时,字典中最多约有 2130641537 个字符(此外 result 本身的长度也几乎与之相当)。