Reading large FASTQ files faster with Python
I have several FASTQ files averaging 500,000,000 lines (125,000,000 sequences) each. Is there a fast way to read these FASTQ files?

What I want to do is read each sequence, use its first 16 bases as a barcode, and then count the number of each barcode per file.

Here is my script, which takes hours to run:
import os
import gzip
from Bio import SeqIO

# Keep only the gzipped FASTQ files in the current directory
files = [f for f in os.listdir(".") if f.endswith(".fastq.gz")]

maps = {}
for file in files:
    print("Now parsing file %s" % file)
    maps[file] = {}
    # "rt" gives SeqIO a text stream rather than raw bytes
    with gzip.open(file, "rt") as handle:
        recs = SeqIO.parse(handle, "fastq")
        for rec in recs:
            tag = str(rec.seq)[:16]  # first 16 bases are the barcode
            if tag not in maps[file]:
                maps[file][tag] = 1
            else:
                maps[file][tag] += 1
I have 250 GB of RAM and 20 CPUs available for multithreading...

Thanks.
Untested, but here's a way you could do it in an 'embarrassingly parallel' fashion:
import multiprocessing as mp
import os
import gzip
from Bio import SeqIO

def ImportFile(file):
    # Count 16-base barcodes within a single file
    maps = {}
    with gzip.open(file, "rt") as handle:
        recs = SeqIO.parse(handle, "fastq")
        for rec in recs:
            tag = str(rec.seq)[:16]
            if tag not in maps:  # test membership on the dict directly, not .keys()
                maps[tag] = 1
            else:
                maps[tag] += 1
    return {file: maps}

files = [f for f in os.listdir(".") if f.endswith(".fastq.gz")]

# I'd test this with smaller numbers before using up all 20 cores
pool = mp.Pool(processes=10)
output = pool.map(ImportFile, files)
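Note that pool.map returns a list of one-entry {file: maps} dicts, so you still need to merge them into a single mapping. And if most of the time turns out to be spent in Biopython's per-record parsing rather than in decompression, a further (untested) variant is to skip SeqIO entirely: a FASTQ record is four lines with the sequence on the second line, so you can slice out every fourth line and feed its first 16 characters straight into a collections.Counter. The sketch below (count_barcodes is just an illustrative name) assumes well-formed, unwrapped records with one line per sequence, as Illumina writes them:

import multiprocessing as mp
import os
import gzip
from collections import Counter
from itertools import islice

def count_barcodes(path):
    # Sequences sit on lines 2, 6, 10, ... of a FASTQ file, so
    # islice(handle, 1, None, 4) yields exactly the sequence lines.
    with gzip.open(path, "rt") as handle:
        tags = Counter(line[:16] for line in islice(handle, 1, None, 4))
    return path, dict(tags)

if __name__ == "__main__":
    files = [f for f in os.listdir(".") if f.endswith(".fastq.gz")]
    pool = mp.Pool(processes=10)
    # Collapse the list of (path, counts) pairs into one {file: {barcode: count}} dict
    counts = dict(pool.map(count_barcodes, files))

With 250 GB of RAM, holding all the per-file counters in memory is not a concern; the single-threaded gzip decompression per file is the likely remaining bottleneck.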