在 Python 中读取 bz2 文件的前几行
Reading first lines of bz2 files in python
我正在尝试从 bz2 文件中提取 10'000 行。
# Question snippet, kept exactly as posted; it raises the error reported below.
import bz2
file = "file.bz2"
file_10000 = "file.txt"
# NOTE(review): codecs is used here but this snippet shows no `import codecs`.
output_file = codecs.open(file_10000,'w+','utf-8')
# NOTE(review): presumably the call the reported error points at, i.e. the
# bz2 module in use has no `open` attribute; confirm the Python version.
source_file = bz2.open(file, "r")
count = 0
for line in source_file:
count += 1
# NOTE(review): `< 10000` lets only 9999 lines through, not 10000.
if count < 10000:
# NOTE(review): file objects have no `writerow` method.
output_file.writerow(line)
但我收到错误“'module' object has no attribute 'open'”。你有什么想法吗?或者有没有其他方法可以把前 10'000 行保存到 txt 文件中?我使用的是 Windows。
这是一个完整的可运行示例,它会写入并读取一个比你的 10000 行小得多的测试文件。问题中最好附带可运行的示例,这样我们就能方便地测试。
import bz2
import codecs
import itertools

file = "file.bz2"
file_10000 = "file.txt"

# write test file with 9 lines
# (BZ2File only accepts bytes, so the text is encoded before it is written)
with bz2.BZ2File(file, "w") as fp:
    fp.write('\n'.join('123456789').encode('ascii'))

# the original script using BZ2File ... and 3 lines for test
# ...and fixing bugs:
#     1) it only writes 9999 instead of 10000
#     2) files don't do writerow
#     3) close the files (the with statement does it, even on errors)
#     4) BZ2File yields bytes but the utf-8 writer wants text -> decode lines
with bz2.BZ2File(file, "r") as source_file, \
        codecs.open(file_10000, 'w+', 'utf-8') as output_file:
    count = 0
    # This version still walks the whole archive; only the first 3 lines
    # are copied to the output file.
    for line in source_file:
        count += 1
        if count <= 3:
            output_file.write(line.decode('utf-8'))

# show what you got
print('---- Test 1 ----')
with open(file_10000) as result:
    print(repr(result.read()))
一种更高效的方法是在读取完所需的行之后就跳出 for 循环。你甚至可以像这样利用迭代器来精简代码:
# a faster way to read first 3 lines
# (continues the script above: uses its imports and its `file` / `file_10000`)
with bz2.BZ2File(file) as source_file, \
        codecs.open(file_10000, 'w+', 'utf-8') as output_file:
    # islice stops after 3 lines, or earlier if the archive is shorter.
    # BZ2File yields bytes, so each line is decoded for the utf-8 text writer.
    output_file.writelines(
        line.decode('utf-8') for line in itertools.islice(source_file, 3))

# show what you got
print('---- Test 2 ----')
with open(file_10000) as result:
    print(repr(result.read()))
这绝对是比其他答案更简单的做法,而且在 Python 2 和 Python 3 中都同样简单易用。此外,如果文件不足 10,000 行,循环会提前结束。
from bz2 import BZ2File as bzopen
from itertools import islice

# writing to a file
with bzopen("file.bz2", "w") as bzfout:
    for i in range(123456):
        # BZ2File works on bytes, hence the bytes format string
        bzfout.write(b"%i\n" % i)

# reading a bz2 archive
with bzopen("file.bz2", "r") as bzfin:
    # Handle lines here.
    # islice stops after 10000 lines, or earlier if the archive is shorter.
    lines = [line.rstrip() for line in islice(bzfin, 10000)]

print(lines)
只是另一种变体。
import bz2
from itertools import islice

# Raw strings: in a normal literal the "\r" of "\random" is a carriage return,
# which silently corrupts the path.
myfile = r'c:\my_dir\random.txt.bz2'
newfile = r'c:\my_dir\random_10000.txt'

# BZ2File yields bytes, so the output is opened in binary mode and the lines
# are copied through unchanged. Both files are closed by the with statement.
with bz2.BZ2File(myfile) as stream, open(newfile, 'wb') as f:
    # islice takes 10000 lines (range(1, 10000) only gave 9999) and stops
    # early if the archive is shorter.
    f.writelines(islice(stream, 10000))
这对我有用:
# Installs the python-dev package through apt-get (presumably a Debian/Ubuntu
# system, not the asker's Windows; confirm).
sudo apt-get install python-dev
# NOTE(review): backports.lzma is an lzma package, not bz2; how it relates to
# the bz2 error in the question is not shown here. Confirm before relying on it.
sudo pip install backports.lzma
我正在尝试从 bz2 文件中提取 10'000 行。
# Question snippet, kept exactly as posted; it raises the error reported below.
import bz2
file = "file.bz2"
file_10000 = "file.txt"
# NOTE(review): codecs is used here but this snippet shows no `import codecs`.
output_file = codecs.open(file_10000,'w+','utf-8')
# NOTE(review): presumably the call the reported error points at, i.e. the
# bz2 module in use has no `open` attribute; confirm the Python version.
source_file = bz2.open(file, "r")
count = 0
for line in source_file:
count += 1
# NOTE(review): `< 10000` lets only 9999 lines through, not 10000.
if count < 10000:
# NOTE(review): file objects have no `writerow` method.
output_file.writerow(line)
但我收到错误“'module' object has no attribute 'open'”。你有什么想法吗?或者有没有其他方法可以把前 10'000 行保存到 txt 文件中?我使用的是 Windows。
这是一个完整的可运行示例,它会写入并读取一个比你的 10000 行小得多的测试文件。问题中最好附带可运行的示例,这样我们就能方便地测试。
import bz2
import codecs
import itertools

file = "file.bz2"
file_10000 = "file.txt"

# write test file with 9 lines
# (BZ2File only accepts bytes, so the text is encoded before it is written)
with bz2.BZ2File(file, "w") as fp:
    fp.write('\n'.join('123456789').encode('ascii'))

# the original script using BZ2File ... and 3 lines for test
# ...and fixing bugs:
#     1) it only writes 9999 instead of 10000
#     2) files don't do writerow
#     3) close the files (the with statement does it, even on errors)
#     4) BZ2File yields bytes but the utf-8 writer wants text -> decode lines
with bz2.BZ2File(file, "r") as source_file, \
        codecs.open(file_10000, 'w+', 'utf-8') as output_file:
    count = 0
    # This version still walks the whole archive; only the first 3 lines
    # are copied to the output file.
    for line in source_file:
        count += 1
        if count <= 3:
            output_file.write(line.decode('utf-8'))

# show what you got
print('---- Test 1 ----')
with open(file_10000) as result:
    print(repr(result.read()))
一种更高效的方法是在读取完所需的行之后就跳出 for 循环。你甚至可以像这样利用迭代器来精简代码:
# a faster way to read first 3 lines
# (continues the script above: uses its imports and its `file` / `file_10000`)
with bz2.BZ2File(file) as source_file, \
        codecs.open(file_10000, 'w+', 'utf-8') as output_file:
    # islice stops after 3 lines, or earlier if the archive is shorter.
    # BZ2File yields bytes, so each line is decoded for the utf-8 text writer.
    output_file.writelines(
        line.decode('utf-8') for line in itertools.islice(source_file, 3))

# show what you got
print('---- Test 2 ----')
with open(file_10000) as result:
    print(repr(result.read()))
这绝对是比其他答案更简单的做法,而且在 Python 2 和 Python 3 中都同样简单易用。此外,如果文件不足 10,000 行,循环会提前结束。
from bz2 import BZ2File as bzopen
from itertools import islice

# writing to a file
with bzopen("file.bz2", "w") as bzfout:
    for i in range(123456):
        # BZ2File works on bytes, hence the bytes format string
        bzfout.write(b"%i\n" % i)

# reading a bz2 archive
with bzopen("file.bz2", "r") as bzfin:
    # Handle lines here.
    # islice stops after 10000 lines, or earlier if the archive is shorter.
    lines = [line.rstrip() for line in islice(bzfin, 10000)]

print(lines)
只是另一种变体。
import bz2
from itertools import islice

# Raw strings: in a normal literal the "\r" of "\random" is a carriage return,
# which silently corrupts the path.
myfile = r'c:\my_dir\random.txt.bz2'
newfile = r'c:\my_dir\random_10000.txt'

# BZ2File yields bytes, so the output is opened in binary mode and the lines
# are copied through unchanged. Both files are closed by the with statement.
with bz2.BZ2File(myfile) as stream, open(newfile, 'wb') as f:
    # islice takes 10000 lines (range(1, 10000) only gave 9999) and stops
    # early if the archive is shorter.
    f.writelines(islice(stream, 10000))
这对我有用:
# Installs the python-dev package through apt-get (presumably a Debian/Ubuntu
# system, not the asker's Windows; confirm).
sudo apt-get install python-dev
# NOTE(review): backports.lzma is an lzma package, not bz2; how it relates to
# the bz2 error in the question is not shown here. Confirm before relying on it.
sudo pip install backports.lzma