java 中给定压缩文本的 lzw 解压算法
lzw decompression algorithm in java for given compressed text
我有以下作为字节数组给出的压缩文本:
byte[] compressed = [97, 2, 10, 28, 72, -80, -96, -63, -125, 8, 19, 42, 92, -56, -80, -95, -61, -121, 16, 35, 74, -100, 72, -79, -94, -59, -117, 24, 51, 106, -36, -56, -79, -93, -57, -113, 32, 67, -118, 28, 73, -78, -92, -55, -109, 40, 83, -86, 92, -55, -78, -91, -53, -105, 48, 99, -54, -100, 73, -77, -90, -51, -101, 56, 115, -22, -36, -55, -77, -89, -49, -97, 64, -125, 10, 29, 74, -76, -88, -47, -93, 72, -109, 42, 93, -54, -76, -87, -45, -89, 80, -93, 74, -99, 74, -75, -86, -43, -85, 88, -77, 106, -35, -54, -75, -85, -41, -81, 96, -61, -118, 29, 75, -74, -84, -39, -77, 104, -45, -86, 93, -53, -74, -83, -37, -73, 112, -29, -54, -99, 75, -73, -82, -35, -69, 120, -13, -22, -35, -53, -73, -81, -33, -65, -128, 3, 11, 30, 76, -72, -80, -31, -61, -120, 19, 43, 94, -52, -72, -79, -29, -57, -112, 35, 75, -98, 76, -71, -78, -27, -53, -104, 51, 107, -34, -52, -71, -77, -25, -49, -96, 67, -117, 30, 77, -70, -76, -23, -45, -88, 83, -85, 94, -51, -70, -75, -21, -41, -80, 99, -53, -98, 77, -69, -74, -19, -37, -72, 115, -21, -34, -51, -69, -73, -17, -33, -64, -125, 11, 31, 78, -68, -72, -15, -29, -56, -109, 43, 95, -50, -68, -71, -13, -25, -48, -93, 75, -97, 78, -67, -70, -11, -21, -40, -77, 107, -33, -50, -67, -69, -9, -17, -32, -61, -117, 31, 79, -66, -68, -7, -13, -24, -45, -85, 95, -49, -66, -67, -5, -9, -16, -29, -53, -97, 79, -65, -66, -3, -5, -8, -13, -21, -33, -49, -65, -65, -1, -1, 0, -61, 4, 20, 56, -112, 96, 65, -125, 7, 17, 38, 84, -72, -112, 97, 67, -121, 15, 33, 70, -108, 56, -111, 98, 69, -117, 23, 49, 102, -44, -72, -111, 99, 71, -113, 31, 65, -122, 20, 57, -110, 100, 73, -109, 39, 81, -90, 84, -71, -110, 101, 75, -105, 47, 97, -58, -108, 57, -109, 102, 77, -101, 55, 113, -26, -44, -71, -109, 103, 79, -97, 63, -127, 6, 21, 58, -108, 104, 81, -93, 71, -111, 38, 85, -70, -108, 105, 83, -89, 79, -95, 70, -107, 58, -107, 106, 85, -85, 87, -79, 102, -43, -70, -107, 107, 87, -81, 95, -63, -122, 21, 59, -106, 108, 89, -77, 103, -47, -90, 85, -69, -106, 109, 91, -73, 111, 
91, 2];
以及给定的码字长度:9 位。
解压后得到的字符串应该是 aaaaaaaaaaaaaaaaaaaaaaaaa...,全部由字符 a 组成,总长度为 39270。
我想编写一个解压缩函数,对压缩后的字节数组进行解压,并返回由 a 组成的结果字符串。
我尝试了正常的 LZW 实现,但效果不是很好。给定的 9 位码字长度和字节数组中的负值让我有些头疼。
我的理解是我必须将整个字节数组转换为一个二进制字符串并每 9 位(码字长度)读取每个值?
有没有人提示或建议如何做到这一点?谢谢,非常感谢您的支持。
编辑:
这是一些代码,到目前为止我已经尝试过,它适用于
byte[] compressed = [68, 0, 97, 0, 115, 0, 32, 0, 105, 0, 115, 0, 116, 0, 32, 0, 101, 0, 105, 0, 110, 0, 32, 0, 107, 0, 117, 0, 114, 0, 122, 0, 101, 0, 114, 0, 32, 0, 84, 0, 101, 0, 120, 0, 116, 0]
和给定的 16 位码字长度。
解压后的结果字符串是 Das ist ein kurzer Text。
/**
 * Splits the bit representation of {@code compressedData} into code words of
 * {@code codeWordLength} bits each.
 *
 * <p>Every byte is rendered with {@code byteToBinaryString}; the renderings are concatenated in
 * array order and the concatenation is cut into consecutive chunks of {@code codeWordLength}
 * characters. If the input does not fill the last chunk, the bits that are present are kept and
 * the chunk is padded on the right with {@code '0'}; "End reached!" is printed once in that case.
 *
 * @param compressedData the bytes to split; must not be {@code null}
 * @param codeWordLength number of bits per code word; must be positive
 * @return the code words, each a string of exactly {@code codeWordLength} characters, in input order
 * @throws IllegalArgumentException if {@code codeWordLength} is not positive
 */
public static List<String> convertByteArrayToBinaryStringList(byte[] compressedData, int codeWordLength) {
    if (codeWordLength <= 0) {
        // A non-positive width would never advance the read position in the loop below.
        throw new IllegalArgumentException("codeWordLength must be positive: " + codeWordLength);
    }

    StringBuilder bits = new StringBuilder();
    for (byte b : compressedData) {
        bits.append(byteToBinaryString(b));
    }

    List<String> binaryCompressedValues = new ArrayList<String>();
    int totalBits = bits.length();
    for (int start = 0; start < totalBits; start += codeWordLength) {
        int end = start + codeWordLength;
        if (end <= totalBits) {
            binaryCompressedValues.add(bits.substring(start, end));
            continue;
        }
        // Incomplete last code word: keep the bits that exist and pad only the missing positions.
        System.out.println("End reached!");
        StringBuilder lastWord = new StringBuilder(bits.substring(start));
        while (lastWord.length() < codeWordLength) {
            lastWord.append('0');
        }
        binaryCompressedValues.add(lastWord.toString());
    }
    return binaryCompressedValues;
}
/**
 * Adds one to a number written as a binary string and left-pads the result with zeros to the
 * width of the input.
 *
 * <p>If the incremented value needs more digits than the input had, the longer string is
 * returned without padding.
 *
 * @param currentDictSize the current value as a string of binary digits
 * @return the value plus one as a binary string, at least as long as {@code currentDictSize}
 * @throws NumberFormatException if {@code currentDictSize} is not a binary number that fits an int
 */
public String incrementDictSize(String currentDictSize) {
    final int successor = Integer.parseInt(currentDictSize, 2) + 1;
    final String successorBits = Integer.toBinaryString(successor);

    final StringBuilder padded = new StringBuilder();
    for (int missing = currentDictSize.length() - successorBits.length(); missing > 0; missing--) {
        padded.append('0');
    }
    return padded.append(successorBits).toString();
}
/**
 * Decompresses LZW data whose codes are stored as 16-bit code words.
 *
 * <p>Equivalent to {@code uncompress(compressedData, 16)}.
 *
 * @param compressedData the packed code words; must not be {@code null}
 * @return the decompressed bytes
 * @throws IllegalArgumentException if the data contains a code that cannot be resolved
 */
public byte[] uncompress(byte[] compressedData) {
    return uncompress(compressedData, 16);
}

/**
 * Decompresses LZW data whose codes are stored as fixed-width code words.
 *
 * <p>Stream layout handled here:
 * <ul>
 *   <li>codes are packed least significant bit first; trailing bits that do not fill a whole
 *       code word are ignored as padding;</li>
 *   <li>codes 0-255 stand for the single byte of that value;</li>
 *   <li>code 256 resets the dictionary, and new entries are numbered from 257.</li>
 * </ul>
 * NOTE(review): the role of code 256 and the numbering from 257 were derived from a 9-bit sample
 * stream; they are assumed to hold for every code width - confirm against the compressor.
 *
 * <p>Output bytes are produced directly from the decoded values, so the result does not depend
 * on the platform charset.
 *
 * @param compressedData the packed code words; must not be {@code null}
 * @param codeWordLength width of one code word in bits, from 9 to 16
 * @return the decompressed bytes
 * @throws IllegalArgumentException if {@code codeWordLength} is out of range or the data contains
 *         a code that cannot be resolved
 */
public byte[] uncompress(byte[] compressedData, int codeWordLength) {
    if (codeWordLength < 9 || codeWordLength > 16) {
        throw new IllegalArgumentException(
                "codeWordLength must be between 9 and 16 bits: " + codeWordLength);
    }
    final int clearCode = 256;
    final int firstFreeCode = clearCode + 1;
    final int maxCode = (1 << codeWordLength) - 1;

    Map<Integer, String> dictionary = new HashMap<Integer, String>();
    int nextCode = firstFreeCode;
    String w = null; // previously decoded string; null right after start or a reset
    StringBuilder result = new StringBuilder();

    int bitBuffer = 0; // bits read but not yet consumed; the oldest bit is bit 0
    int bitCount = 0;
    for (byte b : compressedData) {
        // Mask to 0..255 so that negative bytes do not smear sign bits into the buffer.
        bitBuffer |= (b & 0xFF) << bitCount;
        bitCount += 8;
        while (bitCount >= codeWordLength) {
            int code = bitBuffer & maxCode;
            bitBuffer >>>= codeWordLength;
            bitCount -= codeWordLength;

            if (code == clearCode) {
                dictionary.clear();
                nextCode = firstFreeCode;
                w = null;
                continue;
            }

            String entry;
            if (code < clearCode) {
                entry = String.valueOf((char) code);
            } else if (dictionary.containsKey(code)) {
                entry = dictionary.get(code);
            } else if (w != null && code == nextCode) {
                // The code being defined by this very step: previous string + its first char.
                entry = w + w.charAt(0);
            } else {
                throw new IllegalArgumentException("Bad compressed code: " + code);
            }

            result.append(entry);
            // The first code after start/reset has no predecessor and defines no entry.
            if (w != null && nextCode <= maxCode) {
                dictionary.put(nextCode++, w + entry.charAt(0));
            }
            w = entry;
        }
    }

    // Every char is in 0..255, so the narrowing cast restores the original byte value.
    byte[] out = new byte[result.length()];
    for (int i = 0; i < out.length; i++) {
        out[i] = (byte) result.charAt(i);
    }
    return out;
}
下面是按 LSB 优先(最低有效位在前)读取 n 位码字的示例代码。
请留意一些棘手的细节,例如码字长度发生变化时的对齐方式。
/**
 * An {@code NBitsInputStream} reads its {@code InputStream} as codes of n bits,
 * least significant bits first.
 *
 * <p>Bytes are taken from the underlying stream one at a time and appended above the bits that
 * are still pending, so the first bit of a byte is its least significant one.
 */
// extend with set/increaseCodeLength() as needed
class NBitsInputStream extends java.io.FilterInputStream {
    /** Bits read from the underlying stream but not yet returned; the oldest bit is bit 0. */
    int buffer;
    /** Number of pending bits in {@link #buffer}. */
    int validBits;
    /** Width of one code in bits. */
    int codeLength;
    /** Mask selecting the low {@link #codeLength} bits. */
    int codeMask;

    /**
     * Creates a reader for codes of {@code n} bits.
     *
     * @param in the stream to read bytes from
     * @param n width of one code in bits, from 1 to 25
     * @throws IllegalArgumentException if {@code n} is outside 1..25
     */
    protected NBitsInputStream(InputStream in, int n) {
        super(in);
        // Up to n - 1 pending bits plus one fresh byte must fit into the 32-bit buffer.
        if (n < 1 || n > 25) {
            throw new IllegalArgumentException("code length must be between 1 and 25 bits: " + n);
        }
        codeLength = n;
        codeMask = (1 << n) - 1;
    }

    /**
     * Reads a code of n bits, least significant bits first.
     *
     * <p>If the underlying stream ends before a whole code is available, the pending bits are
     * not returned.
     *
     * @return the code, or -1 if -1 is read.
     */
    @Override
    public int read() throws IOException {
        while (validBits < codeLength) {
            int high = super.read();
            if (high < 0) { // EOF
                return high;
            }
            buffer |= high << validBits;
            validBits += 8;
        }
        int code = buffer & codeMask;
        validBits -= codeLength;
        // Unsigned shift: bit 31 may hold data and must not be sign-extended.
        buffer >>>= codeLength;
        return code;
    }
}
(我用下面的循环试过:
// LZW decoding loop: takes codes until codes.read() returns a negative value and appends the
// string each code stands for to result.
for (int code; 0 <= (code = codes.read()); ) {
    String entry = (String) dictionary.get(code);
    // The entry defined in this step is the previous string plus the first character of the
    // current one; a code without an entry yet falls back to the previous string itself.
    w += (null != entry ? entry : w).charAt(0);
    dictionary.put(++currentDictSize, w);
    if (null == entry) {
        // Use the string just built; otherwise null is appended to result as the text "null"
        // and w becomes null for the next step.
        entry = w;
    }
    result.append(w = entry);
}
没有观察到意外情况。)
请注意,您的字典不仅包含最长的字符串,还包含它们的每一个前缀;使用 16 位码字时,字典中最多约有 2130641537 个字符(此外还有长度几乎相同的 result)。
我有以下作为字节数组给出的压缩文本:
byte[] compressed = [97, 2, 10, 28, 72, -80, -96, -63, -125, 8, 19, 42, 92, -56, -80, -95, -61, -121, 16, 35, 74, -100, 72, -79, -94, -59, -117, 24, 51, 106, -36, -56, -79, -93, -57, -113, 32, 67, -118, 28, 73, -78, -92, -55, -109, 40, 83, -86, 92, -55, -78, -91, -53, -105, 48, 99, -54, -100, 73, -77, -90, -51, -101, 56, 115, -22, -36, -55, -77, -89, -49, -97, 64, -125, 10, 29, 74, -76, -88, -47, -93, 72, -109, 42, 93, -54, -76, -87, -45, -89, 80, -93, 74, -99, 74, -75, -86, -43, -85, 88, -77, 106, -35, -54, -75, -85, -41, -81, 96, -61, -118, 29, 75, -74, -84, -39, -77, 104, -45, -86, 93, -53, -74, -83, -37, -73, 112, -29, -54, -99, 75, -73, -82, -35, -69, 120, -13, -22, -35, -53, -73, -81, -33, -65, -128, 3, 11, 30, 76, -72, -80, -31, -61, -120, 19, 43, 94, -52, -72, -79, -29, -57, -112, 35, 75, -98, 76, -71, -78, -27, -53, -104, 51, 107, -34, -52, -71, -77, -25, -49, -96, 67, -117, 30, 77, -70, -76, -23, -45, -88, 83, -85, 94, -51, -70, -75, -21, -41, -80, 99, -53, -98, 77, -69, -74, -19, -37, -72, 115, -21, -34, -51, -69, -73, -17, -33, -64, -125, 11, 31, 78, -68, -72, -15, -29, -56, -109, 43, 95, -50, -68, -71, -13, -25, -48, -93, 75, -97, 78, -67, -70, -11, -21, -40, -77, 107, -33, -50, -67, -69, -9, -17, -32, -61, -117, 31, 79, -66, -68, -7, -13, -24, -45, -85, 95, -49, -66, -67, -5, -9, -16, -29, -53, -97, 79, -65, -66, -3, -5, -8, -13, -21, -33, -49, -65, -65, -1, -1, 0, -61, 4, 20, 56, -112, 96, 65, -125, 7, 17, 38, 84, -72, -112, 97, 67, -121, 15, 33, 70, -108, 56, -111, 98, 69, -117, 23, 49, 102, -44, -72, -111, 99, 71, -113, 31, 65, -122, 20, 57, -110, 100, 73, -109, 39, 81, -90, 84, -71, -110, 101, 75, -105, 47, 97, -58, -108, 57, -109, 102, 77, -101, 55, 113, -26, -44, -71, -109, 103, 79, -97, 63, -127, 6, 21, 58, -108, 104, 81, -93, 71, -111, 38, 85, -70, -108, 105, 83, -89, 79, -95, 70, -107, 58, -107, 106, 85, -85, 87, -79, 102, -43, -70, -107, 107, 87, -81, 95, -63, -122, 21, 59, -106, 108, 89, -77, 103, -47, -90, 85, -69, -106, 109, 91, -73, 111, 
91, 2];
以及给定的码字长度:9 位。
解压后得到的字符串应该是 aaaaaaaaaaaaaaaaaaaaaaaaa...,全部由字符 a 组成。
我想编写一个解压缩函数,对压缩后的字节数组进行解压,并返回由 a 组成的结果字符串。
我尝试了正常的 LZW 实现,但效果不是很好。给定的 9 位码字长度和字节数组中的负值让我有些头疼。
我的理解是我必须将整个字节数组转换为一个二进制字符串并每 9 位(码字长度)读取每个值?
有没有人提示或建议如何做到这一点?谢谢,非常感谢您的支持。
编辑:
这是一些代码,到目前为止我已经尝试过,它适用于
byte[] compressed = [68, 0, 97, 0, 115, 0, 32, 0, 105, 0, 115, 0, 116, 0, 32, 0, 101, 0, 105, 0, 110, 0, 32, 0, 107, 0, 117, 0, 114, 0, 122, 0, 101, 0, 114, 0, 32, 0, 84, 0, 101, 0, 120, 0, 116, 0]
和给定的 16 位码字长度。
解压后的结果字符串是 Das ist ein kurzer Text。
/**
 * Splits the bit representation of {@code compressedData} into code words of
 * {@code codeWordLength} bits each.
 *
 * <p>Every byte is rendered with {@code byteToBinaryString}; the renderings are concatenated in
 * array order and the concatenation is cut into consecutive chunks of {@code codeWordLength}
 * characters. If the input does not fill the last chunk, the bits that are present are kept and
 * the chunk is padded on the right with {@code '0'}; "End reached!" is printed once in that case.
 *
 * @param compressedData the bytes to split; must not be {@code null}
 * @param codeWordLength number of bits per code word; must be positive
 * @return the code words, each a string of exactly {@code codeWordLength} characters, in input order
 * @throws IllegalArgumentException if {@code codeWordLength} is not positive
 */
public static List<String> convertByteArrayToBinaryStringList(byte[] compressedData, int codeWordLength) {
    if (codeWordLength <= 0) {
        // A non-positive width would never advance the read position in the loop below.
        throw new IllegalArgumentException("codeWordLength must be positive: " + codeWordLength);
    }

    StringBuilder bits = new StringBuilder();
    for (byte b : compressedData) {
        bits.append(byteToBinaryString(b));
    }

    List<String> binaryCompressedValues = new ArrayList<String>();
    int totalBits = bits.length();
    for (int start = 0; start < totalBits; start += codeWordLength) {
        int end = start + codeWordLength;
        if (end <= totalBits) {
            binaryCompressedValues.add(bits.substring(start, end));
            continue;
        }
        // Incomplete last code word: keep the bits that exist and pad only the missing positions.
        System.out.println("End reached!");
        StringBuilder lastWord = new StringBuilder(bits.substring(start));
        while (lastWord.length() < codeWordLength) {
            lastWord.append('0');
        }
        binaryCompressedValues.add(lastWord.toString());
    }
    return binaryCompressedValues;
}
/**
 * Adds one to a number written as a binary string and left-pads the result with zeros to the
 * width of the input.
 *
 * <p>If the incremented value needs more digits than the input had, the longer string is
 * returned without padding.
 *
 * @param currentDictSize the current value as a string of binary digits
 * @return the value plus one as a binary string, at least as long as {@code currentDictSize}
 * @throws NumberFormatException if {@code currentDictSize} is not a binary number that fits an int
 */
public String incrementDictSize(String currentDictSize) {
    final int successor = Integer.parseInt(currentDictSize, 2) + 1;
    final String successorBits = Integer.toBinaryString(successor);

    final StringBuilder padded = new StringBuilder();
    for (int missing = currentDictSize.length() - successorBits.length(); missing > 0; missing--) {
        padded.append('0');
    }
    return padded.append(successorBits).toString();
}
/**
 * Decompresses LZW data whose codes are stored as 16-bit code words.
 *
 * <p>Equivalent to {@code uncompress(compressedData, 16)}.
 *
 * @param compressedData the packed code words; must not be {@code null}
 * @return the decompressed bytes
 * @throws IllegalArgumentException if the data contains a code that cannot be resolved
 */
public byte[] uncompress(byte[] compressedData) {
    return uncompress(compressedData, 16);
}

/**
 * Decompresses LZW data whose codes are stored as fixed-width code words.
 *
 * <p>Stream layout handled here:
 * <ul>
 *   <li>codes are packed least significant bit first; trailing bits that do not fill a whole
 *       code word are ignored as padding;</li>
 *   <li>codes 0-255 stand for the single byte of that value;</li>
 *   <li>code 256 resets the dictionary, and new entries are numbered from 257.</li>
 * </ul>
 * NOTE(review): the role of code 256 and the numbering from 257 were derived from a 9-bit sample
 * stream; they are assumed to hold for every code width - confirm against the compressor.
 *
 * <p>Output bytes are produced directly from the decoded values, so the result does not depend
 * on the platform charset.
 *
 * @param compressedData the packed code words; must not be {@code null}
 * @param codeWordLength width of one code word in bits, from 9 to 16
 * @return the decompressed bytes
 * @throws IllegalArgumentException if {@code codeWordLength} is out of range or the data contains
 *         a code that cannot be resolved
 */
public byte[] uncompress(byte[] compressedData, int codeWordLength) {
    if (codeWordLength < 9 || codeWordLength > 16) {
        throw new IllegalArgumentException(
                "codeWordLength must be between 9 and 16 bits: " + codeWordLength);
    }
    final int clearCode = 256;
    final int firstFreeCode = clearCode + 1;
    final int maxCode = (1 << codeWordLength) - 1;

    Map<Integer, String> dictionary = new HashMap<Integer, String>();
    int nextCode = firstFreeCode;
    String w = null; // previously decoded string; null right after start or a reset
    StringBuilder result = new StringBuilder();

    int bitBuffer = 0; // bits read but not yet consumed; the oldest bit is bit 0
    int bitCount = 0;
    for (byte b : compressedData) {
        // Mask to 0..255 so that negative bytes do not smear sign bits into the buffer.
        bitBuffer |= (b & 0xFF) << bitCount;
        bitCount += 8;
        while (bitCount >= codeWordLength) {
            int code = bitBuffer & maxCode;
            bitBuffer >>>= codeWordLength;
            bitCount -= codeWordLength;

            if (code == clearCode) {
                dictionary.clear();
                nextCode = firstFreeCode;
                w = null;
                continue;
            }

            String entry;
            if (code < clearCode) {
                entry = String.valueOf((char) code);
            } else if (dictionary.containsKey(code)) {
                entry = dictionary.get(code);
            } else if (w != null && code == nextCode) {
                // The code being defined by this very step: previous string + its first char.
                entry = w + w.charAt(0);
            } else {
                throw new IllegalArgumentException("Bad compressed code: " + code);
            }

            result.append(entry);
            // The first code after start/reset has no predecessor and defines no entry.
            if (w != null && nextCode <= maxCode) {
                dictionary.put(nextCode++, w + entry.charAt(0));
            }
            w = entry;
        }
    }

    // Every char is in 0..255, so the narrowing cast restores the original byte value.
    byte[] out = new byte[result.length()];
    for (int i = 0; i < out.length; i++) {
        out[i] = (byte) result.charAt(i);
    }
    return out;
}
下面是按 LSB 优先(最低有效位在前)读取 n 位码字的示例代码。
请留意一些棘手的细节,例如码字长度发生变化时的对齐方式。
/**
 * An {@code NBitsInputStream} reads its {@code InputStream} as codes of n bits,
 * least significant bits first.
 *
 * <p>Bytes are taken from the underlying stream one at a time and appended above the bits that
 * are still pending, so the first bit of a byte is its least significant one.
 */
// extend with set/increaseCodeLength() as needed
class NBitsInputStream extends java.io.FilterInputStream {
    /** Bits read from the underlying stream but not yet returned; the oldest bit is bit 0. */
    int buffer;
    /** Number of pending bits in {@link #buffer}. */
    int validBits;
    /** Width of one code in bits. */
    int codeLength;
    /** Mask selecting the low {@link #codeLength} bits. */
    int codeMask;

    /**
     * Creates a reader for codes of {@code n} bits.
     *
     * @param in the stream to read bytes from
     * @param n width of one code in bits, from 1 to 25
     * @throws IllegalArgumentException if {@code n} is outside 1..25
     */
    protected NBitsInputStream(InputStream in, int n) {
        super(in);
        // Up to n - 1 pending bits plus one fresh byte must fit into the 32-bit buffer.
        if (n < 1 || n > 25) {
            throw new IllegalArgumentException("code length must be between 1 and 25 bits: " + n);
        }
        codeLength = n;
        codeMask = (1 << n) - 1;
    }

    /**
     * Reads a code of n bits, least significant bits first.
     *
     * <p>If the underlying stream ends before a whole code is available, the pending bits are
     * not returned.
     *
     * @return the code, or -1 if -1 is read.
     */
    @Override
    public int read() throws IOException {
        while (validBits < codeLength) {
            int high = super.read();
            if (high < 0) { // EOF
                return high;
            }
            buffer |= high << validBits;
            validBits += 8;
        }
        int code = buffer & codeMask;
        validBits -= codeLength;
        // Unsigned shift: bit 31 may hold data and must not be sign-extended.
        buffer >>>= codeLength;
        return code;
    }
}
(我用下面的循环试过:
// LZW decoding loop: takes codes until codes.read() returns a negative value and appends the
// string each code stands for to result.
for (int code; 0 <= (code = codes.read()); ) {
    String entry = (String) dictionary.get(code);
    // The entry defined in this step is the previous string plus the first character of the
    // current one; a code without an entry yet falls back to the previous string itself.
    w += (null != entry ? entry : w).charAt(0);
    dictionary.put(++currentDictSize, w);
    if (null == entry) {
        // Use the string just built; otherwise null is appended to result as the text "null"
        // and w becomes null for the next step.
        entry = w;
    }
    result.append(w = entry);
}
没有观察到意外情况。)
请注意,您的字典不仅包含最长的字符串,还包含它们的每一个前缀;使用 16 位码字时,字典中最多约有 2130641537 个字符(此外还有长度几乎相同的 result)。