Optimization problem on splitting files based on size
I am facing an optimization problem and I would like to find an algorithm that solves it.

Assumption: the sum of the file sizes is at least 15 MB, so the total size lies in [15 MB, +∞). For simplicity, treat infinity as 100 GB.

Problem: I have a list of files whose sizes range from 3 KB to 4 MB. I have to zip these files, and I need to guarantee that the sum of the sizes of the files grouped together, before zipping them, is between 15 MB and 150 MB.

Is there a known algorithm that handles this problem? To keep the algorithm computationally cheap, it is acceptable not to minimize the number of chunks (so each chunk does not have to be as large as possible).

Thanks,
Giuseppe
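
To make the stated requirement concrete, here is a minimal sketch of the constraint; the helper name is_valid_chunk and the decimal MB constant are illustrative additions, not part of the question.

MB = 1000 * 1000  # decimal megabytes


def is_valid_chunk(file_sizes, min_total=15 * MB, max_total=150 * MB):
    # A group of files may be zipped together only if the sum of their
    # (uncompressed) sizes falls within the required window.
    return min_total <= sum(file_sizes) <= max_total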
We can adapt the well-known first-fit decreasing algorithm to do this. The twist is that every new chunk is opened with only max_chunk_size - min_chunk_size of capacity; that reserved headroom is returned to a chunk once it can no longer end up below the minimum, and a final step merges a possibly undersized last chunk into the chunk before it, which still has enough reserved room to absorb it.
import random

# Size units (decimal): 1 KB = 1,000 bytes, 1 MB = 1,000,000 bytes.
K = 1000
B = 1
KB = K * B
MB = K * KB


class File:
    """A fake file with a random size between 3 KB and 4 MB."""

    def __init__(self):
        self.size = random.randrange(3 * KB, 4 * MB)


class Chunk:
    """A group of files whose total size may not exceed the chunk's capacity."""

    def __init__(self, max_chunk_size):
        self.free = max_chunk_size
        self.files = []

    def add(self, file):
        # Place the file in this chunk if it fits; report success.
        if self.free < file.size:
            return False
        self.free -= file.size
        self.files.append(file)
        return True

    def enlarge(self, delta_free):
        # Grant extra capacity (used to return the reserved headroom).
        assert delta_free >= 0
        self.free += delta_free

    def size(self):
        return sum(file.size for file in self.files)


def first_fit_decreasing(files, min_chunk_size, max_chunk_size):
    chunks = []
    for file in sorted(files, key=lambda file: file.size, reverse=True):
        # First fit: put the file into the first existing chunk that can hold it.
        for existing_chunk in chunks:
            if existing_chunk.add(file):
                break
        else:
            # The file fits in no existing chunk, so open a new one with
            # min_chunk_size of capacity held back as a reserve. Every existing
            # chunk just rejected a file of at most 4 MB, so it already holds
            # far more than min_chunk_size; the chunk two positions back can
            # therefore safely get its reserve back and grow up to max_chunk_size.
            if len(chunks) >= 2:
                chunks[-2].enlarge(min_chunk_size)
            new_chunk = Chunk(max_chunk_size - min_chunk_size)
            new_chunk.add(file)
            chunks.append(new_chunk)
    # Only the last chunk can still be below the minimum. If it is, move its
    # files into the chunk before it: that chunk never grew past
    # max_chunk_size - min_chunk_size, so once its reserve is returned the
    # undersized remainder (less than min_chunk_size) is guaranteed to fit.
    if chunks[-1].size() < min_chunk_size:
        chunks[-2].enlarge(min_chunk_size)
        for file in chunks[-1].files:
            chunks[-2].add(file)
        del chunks[-1]
    return chunks


def test(n):
    files = [File() for i in range(n)]
    min_chunk_size = 15 * MB
    max_chunk_size = 150 * MB
    chunks = first_fit_decreasing(files, min_chunk_size, max_chunk_size)
    # Every file must end up in exactly one chunk ...
    assert sorted(id(file) for file in files) == sorted(
        id(file) for chunk in chunks for file in chunk.files
    )
    # ... and every chunk must respect both size bounds.
    for chunk in chunks:
        assert min_chunk_size <= chunk.size() <= max_chunk_size
    print(len(chunks), "chunks")
    print(sum(chunk.free for chunk in chunks) / MB, "MB free")


if __name__ == "__main__":
    for t in range(1000):
        test(150)
    test(10000)
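
If the File objects carry real paths instead of random sizes, the same chunking can drive the actual zipping. Below is a minimal sketch that reuses first_fit_decreasing and the MB constant defined above; the PathFile class, the zip_in_chunks helper and the chunk_0000.zip naming are illustrative choices, not part of the original answer.

import os
import zipfile


class PathFile:
    """Like File above, but backed by a real file on disk."""

    def __init__(self, path):
        self.path = path
        self.size = os.path.getsize(path)


def zip_in_chunks(paths, out_dir, min_chunk_size=15 * MB, max_chunk_size=150 * MB):
    # Group the files first, then write one zip archive per chunk.
    files = [PathFile(p) for p in paths]
    chunks = first_fit_decreasing(files, min_chunk_size, max_chunk_size)
    for i, chunk in enumerate(chunks):
        archive = os.path.join(out_dir, f"chunk_{i:04d}.zip")
        with zipfile.ZipFile(archive, "w", compression=zipfile.ZIP_DEFLATED) as zf:
            for file in chunk.files:
                zf.write(file.path, arcname=os.path.basename(file.path))

Note that the 15 MB to 150 MB window is checked against the uncompressed sizes, which matches the requirement that the sum is constrained before the files are zipped together.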