gzip 解压缩的改进 java
Improvements in gzip decompression java
场景:
我在 oracle 数据库中有将近 1500 万条记录,每条记录都有一个压缩的列。任务是导出相同的 table 但列值已解压缩。我的解决步骤如下,
- Read a chunk of data using jdbcTemplate (returns List)
- For each of the record above decompress the column value and form an updated list
- Use the above list to insert into another table (This is being executed by another thread).
因此这里对一批48842条记录进行分析,
- Reading takes around 9 seconds
- Writing takes around 47 seconds
- Decompression takes around 135 seconds
根据上述处理 1500 万条记录的分析,该过程大约需要 16 - 17 个小时。有没有办法改进它呢?
我正在广泛寻找各种解压缩技术。在我的情况下,即使解压缩技术只有少量改进,也会产生非常大的差异。任何帮助将不胜感激。
下面是我使用的解压方法,
/**
 * Decompresses a Base64-encoded, gzip-compressed string.
 *
 * @param message Base64 text wrapping gzip bytes; UTF-8 payload expected.
 * @return the decompressed text with line terminators removed (lines are
 *         joined with no separator — kept for backward compatibility with
 *         the original readLine() behavior), or "" for empty input or on
 *         any decompression error (errors are logged, not propagated).
 * @throws Exception declared for interface compatibility; in practice all
 *         exceptions are caught, logged and mapped to "".
 */
public String decompressMessage(String message)
throws Exception
{
    // Guard: nothing to decompress for null/empty input.
    if (message == null || "".equals(message))
    {
        logger.error("Decompress is not possible as the string is empty");
        return "";
    }
    // Base64 text -> raw gzip bytes. NOTE(review): `Base64` here is a
    // project/helper class (java.util.Base64 has no static decode(String)).
    byte[] compressByteArray = Base64.decode(message);
    // try-with-resources closes the whole stream chain even on failure,
    // replacing the manual finally block whose close() calls could throw
    // and mask the original exception.
    try (BufferedReader bufferedReader = new BufferedReader(
            new InputStreamReader(
                new GZIPInputStream(new ByteArrayInputStream(compressByteArray)),
                "UTF-8")))
    {
        // StringBuilder makes accumulation O(n); the previous String
        // concatenation in a loop was O(n^2) and dominated the runtime.
        StringBuilder decompressedMessage = new StringBuilder();
        String line;
        while ((line = bufferedReader.readLine()) != null)
        {
            decompressedMessage.append(line);
        }
        return decompressedMessage.toString();
    }
    catch (Exception e)
    {
        // Preserve original contract: log and return "" instead of throwing.
        logger.error("Exception while decompressing the message with details {}", e);
        return "";
    }
}
当然,最大的问题是在循环中连接字符串。字符串是不可变的,这意味着您将 O(n²) 的时间复杂度强加给本质上为 O(n) 的作业。
用 StringWriter 替换字符串拼接,并去掉输入端的 BufferedReader。使用 Reader#read(char[]) 循环读取,把每次读到的数据累加到 StringWriter 中,最后通过 StringWriter.toString() 得到结果字符串。
让Oracle数据库来做吧。例如:
-- NOTE: This example would be simpler if compressed_data were a RAW type...
CREATE TABLE matt1 ( compressed_data VARCHAR2(4000) );

-- Put 100,000 rows of compressed data in there
INSERT INTO matt1 (compressed_data)
SELECT utl_raw.cast_to_varchar2(
           utl_compress.lz_compress(
               src => utl_raw.cast_to_raw(
                          dbms_random.string('a',30)
                          || 'UNCOMPRESSED_DATA'
                          || lpad(rownum,10,'0')
                          || dbms_random.string('a',30))))
FROM dual
CONNECT BY rownum <= 100000;

-- Create the uncompressed version of the table to export
CREATE TABLE matt1_uncompressed AS
SELECT utl_raw.cast_to_varchar2(
           utl_compress.lz_uncompress(
               src => utl_raw.cast_to_raw(compressed_data))) uncompressed_data
FROM matt1
WHERE rownum <= 100000;
--- execution time was 3.448 seconds
更新 OP 发布的样本数据
您的示例中的数据看起来是经过 base64 编码的。试试这个:
-- UTL_COMPRESS.LZ_UNCOMPRESS returns RAW; cast the result back to VARCHAR2
-- so the exported column is readable text (consistent with the matt1 example).
SELECT utl_raw.cast_to_varchar2(
           utl_compress.lz_uncompress(src =>
               utl_encode.base64_decode(
                   utl_raw.cast_to_raw(your_table.compressed_column)))) uncompressed_column
from your_table;
场景: 我在 oracle 数据库中有将近 1500 万条记录,每条记录都有一个压缩的列。任务是导出相同的 table 但列值已解压缩。我的解决步骤如下,
- Read a chunk of data using jdbcTemplate (returns List)
- For each of the record above decompress the column value and form an updated list
- Use the above list to insert into another table (This is being executed by another thread).
因此这里对一批48842条记录进行分析,
- Reading takes around 9 seconds
- Writing takes around 47 seconds
- Decompression takes around 135 seconds
根据上述处理 1500 万条记录的分析,该过程大约需要 16 - 17 个小时。有没有办法改进它呢? 我正在广泛寻找各种解压缩技术。在我的情况下,即使解压缩技术只有少量改进,也会产生非常大的差异。任何帮助将不胜感激。
下面是我使用的解压方法,
// Decompresses a Base64-encoded, gzip-compressed string and returns the
// concatenated UTF-8 text; returns "" for empty input or on any error
// (errors are logged, never propagated despite the `throws` clause).
public String decompressMessage(String message)
throws Exception
{
ByteArrayInputStream byteArrayIPStream = null;
GZIPInputStream gZipIPStream = null;
BufferedReader bufferedReader = null;
// NOTE(review): accumulating onto an immutable String inside the read loop
// is O(n^2); a StringBuilder/StringWriter would make this O(n).
String decompressedMessage = "";
String line="";
byte[] compressByteArray = null;
try{
// Guard: nothing to decompress for null/empty input.
if(message==null || "".equals(message))
{
logger.error("Decompress is not possible as the string is empty");
return "";
}
// Base64 text -> raw gzip bytes. NOTE(review): `Base64` appears to be a
// project/helper class (java.util.Base64 has no static decode(String)).
compressByteArray = Base64.decode(message);
byteArrayIPStream = new ByteArrayInputStream(compressByteArray);
gZipIPStream = new GZIPInputStream(byteArrayIPStream);
bufferedReader = new BufferedReader(new InputStreamReader(gZipIPStream, "UTF-8"));
// readLine() strips line terminators, so all lines are joined with no
// separator between them in the returned string.
while ((line = bufferedReader.readLine()) != null) {
decompressedMessage = decompressedMessage + line;
}
return decompressedMessage;
}
catch(Exception e)
{
// Errors are swallowed: log and map to "" instead of rethrowing.
logger.error("Exception while decompressing the message with details {}",e);
return "";
}
finally{
// NOTE(review): nulling locals has no practical effect; closing only the
// outermost reader would close the whole chain, and a close() failure
// here can mask the original exception.
line = null;
compressByteArray = null;
if(byteArrayIPStream!=null)
byteArrayIPStream.close();
if(gZipIPStream!=null)
gZipIPStream.close();
if(bufferedReader!=null)
bufferedReader.close();
}
}
当然,最大的问题是在循环中连接字符串。字符串是不可变的,这意味着您将 O(n²) 的时间复杂度强加给本质上为 O(n) 的作业。
用 StringWriter 替换字符串拼接,并去掉输入端的 BufferedReader。使用 Reader#read(char[]) 循环读取,把每次读到的数据累加到 StringWriter 中,最后通过 StringWriter.toString() 得到结果字符串。
让Oracle数据库来做吧。例如:
-- NOTE: This example would be simpler if compressed_data were a RAW type...
-- Demo schema: compressed data stored as VARCHAR2 (cast from RAW).
create table matt1 ( compressed_data VARCHAR2(4000) );
-- Put 100,000 rows of compressed data in there
-- Each row: 30 random chars + marker + zero-padded rownum + 30 random chars,
-- LZ-compressed via UTL_COMPRESS and cast back to VARCHAR2 for storage.
insert into matt1 (compressed_data)
select utl_raw.cast_to_varchar2(utl_compress.lz_compress(src => utl_raw.cast_to_raw(dbms_random.string('a',30) || 'UNCOMPRESSED_DATA' || lpad(rownum,10,'0') || dbms_random.string('a',30))))
from dual
connect by rownum <= 100000;
-- Create the uncompressed version of the table to export
-- Decompression happens entirely inside the database (cast to RAW,
-- LZ-uncompress, cast back to VARCHAR2) -- no per-row client round-trips.
create table matt1_uncompressed as
select utl_raw.cast_to_varchar2(utl_compress.lz_uncompress(src => utl_raw.cast_to_raw(compressed_data))) uncompressed_data
from matt1
where rownum <= 100000;
--- execution time was 3.448 seconds
更新 OP 发布的样本数据
您的示例中的数据看起来是经过 base64 编码的。试试这个:
-- UTL_COMPRESS.LZ_UNCOMPRESS returns RAW; cast the result back to VARCHAR2
-- so the exported column is readable text (consistent with the matt1 example).
SELECT utl_raw.cast_to_varchar2(
           utl_compress.lz_uncompress(src =>
               utl_encode.base64_decode(
                   utl_raw.cast_to_raw(your_table.compressed_column)))) uncompressed_column
from your_table;