ValueError: Can't specify both mapper_raw and mapper in Python
ValueError: Can't specify both mapper_raw and mapper in Python
我正在尝试在 Python 中使用 mrjob
读取 fna
文件。
这是我的load_read.py
程序,所有代码都可以在不使用mrjob
的情况下正常工作。
from mrjob.job import MRJob
from Bio import SeqIO
from Bio.Seq import Seq
import re
from operator import itemgetter
import sys
def format_read(read):
    """Return the sequence and source label of one parsed read.

    The record description is split on runs of ``|``, ``=``, ``{`` and
    ``,``; the fourth resulting field is used as the label. For a header
    such as ``r1.1 |SOURCES={GI=15668172,fw,...}|...`` that field is the
    value following ``GI=``.

    Args:
        read: Object exposing a ``description`` string and a ``seq``.

    Returns:
        A ``(read.seq, label)`` tuple.

    Raises:
        IndexError: If the description yields fewer than four fields.
    """
    fields = re.split('[|={,]+', read.description)
    return read.seq, fields[3]
class LoadMetaRead(MRJob):
    """mrjob job that loads reads from a FASTA (.fna) file.

    NOTE: this is the failing version. It defines both ``mapper_raw`` and
    ``mapper``, and running it raises
    ``ValueError: Can't specify both mapper_raw and mapper``.
    Only one of the two may be defined on a job.
    """

    def mapper_raw(self, file_path, file_uri):
        """Parse the whole input file and emit one record per read (pair)."""
        from Bio import SeqIO
        from Bio.Seq import Seq
        # NOTE(review): the working version of this job passes 'fasta' as
        # the second positional argument instead of type='fna' -- confirm
        # against the Bio.SeqIO.parse signature.
        seqs = list(SeqIO.parse(file_path, type='fna'))
        # Treated as paired-end when there are more than two records and
        # the first two ids end in different characters.
        is_paired_end = False
        if len(seqs) > 2 and seqs[0].id[-1:] != seqs[1].id[-1:]:
            is_paired_end = True
        # Maps each distinct label to the order in which it was first seen.
        label_list = dict()
        label_index = 0
        for i in range(0, len(seqs), 2 if is_paired_end else 1):
            read, label = format_read(seqs[i])
            if is_paired_end:
                # Append the mate's sequence to the first read of the pair.
                read2, _ = format_read(seqs[i + 1])
                read += read2
            if label not in label_list:
                label_list[label] = label_index
                label_index += 1
            # NOTE(review): three values are yielded here; the working
            # version yields a (key, value) pair -- confirm what mrjob
            # expects from a mapper.
            yield str(i), str(read), str(label_list[label])

    def mapper(self, _, line):
        """Emit every input line under the constant key 'read'."""
        yield 'read', line

    def reducer(self, key, values):
        """Emit the key together with its iterable of values, unchanged."""
        yield key, values

    # The reducer doubles as the combiner.
    combiner = reducer
if __name__ == '__main__':
    # Run the job when executed as a script, e.g. `python load_read.py R4.fna`.
    LoadMetaRead.run()
数据文件示例R4.fna
:
>r1.1 |SOURCES={GI=15668172,fw,1146130-1146958}|ERRORS={52_1:A,78_1:G,78_2:G,78_3:G,641_1:G}|SOURCE_1="Methanocaldococcus jannaschii DSM 2661 chromosome" (392b1054a4bf536ea1cc349545ace50120973c3a)
AAACCCTCTTCCACGAACCCTCTTGAAAATCCCCCACATCCACAAAATAAATCAAATAAATTTCA
ACATTATCACCAAAAGGGTAAAAGGTTATTTAAAAAATAAAATAAATTTAAAAATTTAAATTAAA
TACCAAAAAAGCCAAATAACTTATTGTGATTCTTGAGCTTTCTTTAACTCTGCCTTCATATCTTG
ATAGACTTTAGTCCATTTTAATTTTCTTGGATTTCTTCCCATTCTGTAGCTTTTCTCACATTTGG
ATGAGCAGAAATATAATACAGTCCCATCTTTTTCTACGACCATTTTTCCTTTTCCTGGCTCAATT
TCATAACCACAAAAGCTGCATGTTCTCCATTCTGGCATAGCTATCCCCCTTTAATAGTGTTTCAG
TGATTTTAAAATAATTTAAGATTAAATTATTTATCTTCTTCTGTCTAATGGTCTTGCTTCTCTCT
CTGTTTCTCTTAACATAATAATGTCTCCAACTTTAACTGGACCTTTAACGTTTCTAACTAAAACT
CTTCCAGTATCTTTTCCACCTAAGATTTTACATCTAACTTGTATAATTCCTCCAGTAACCCCTGT
TCTACCAATGACTTCAATAACTTCAGCAGCTACTGCTTCCTTATAAACAAATTCATCTTCCGATC
CTCATCACCTAATATTAATGAAGGTTTAAAATTTATAAAAAAGTTAGTAGTAGTGTTTCATAATT
TATATAATAATAACTATATACTATTGATTGATGGTTAAATAGCGTTCTAATAATTTACTGCTTCA
AAACATTTACCTTTTCAATTAATACCTTTAACTCTTCAGCATCTCCTTCGTTG
>r2.1 |SOURCES={GI=15668172,bw,239211-239971}|ERRORS={113:-,217_1:C,281_1:G,627_1:G,717_1:T}|SOURCE_1="Methanocaldococcus jannaschii DSM 2661 chromosome" (392b1054a4bf536ea1cc349545ace50120973c3a)
TAGCATGTAAATCCCTTATTTCTTAATTTCTCCCAGAATTATTTCTATTGCTTTATCAACTGCCT
TGGCAACCTCTTCAGACAACCCTGGTTTTATGTCTGGCATTGTAAATTTTTACCTTGACAACCAA
TAACCACGACTTCTATGCCTTTATTATGTAAATCTTTGAGAAATGGGGCTAATGGAACGTTATGG
GCATCGAAAGAATATTTTTTAACTATTCGGTAATTCATCAACATCTATCTTTTTTATTGTTCCAG
GTTCTAAATCAAAATCAATGGCGATCAACAACAATAATCTTTTTTATATCTTCATCAACCAACGT
CATTAAATAGTATGCTCCACTTGCCCCAGCATCTATAACTTCAACGTTATCTGGCAAGTTCATTT
TTTCTAATTTGCTAACAACCTCACATCCAAAGCCATCATCTCCAAACAACAGATTTCCACAACCA
ACAATTAATATATCCTTCTTTTTCATTTTATCACTTATTTAGCATTTCTTTATATTTTTTAGCCT
CTTCTTTAGGATTTTGTGATTGATAGATTGCCCTTCCAACAATGACGTAATCATTCTCATCTAAA
ATATTTAAAATATCCTCAATCTTCCCTCCCTGAGCTCCGACTCCGTGGTGTTATTACTGGCAATT
CTGCAATTTCTTTAATTTCTTTAAGCCTTTCAGGCCTTGTTGATGGAGCAACTATAGCATCAACT
TTTAGTTTTTTTAGCCATCTCTGACAATTTATCTGCTATTGGCTGTAG
当我使用以下命令运行程序时:
python load_read.py R4.fna
它引发了这个错误:
ValueError: Can't specify both mapper_raw and mapper
你知道如何解决这个问题吗?
所以我发现我不能同时定义mapper_raw()
和mapper
。我只需要定义其中之一。
我使用 mapper_raw()
是因为我读取了整个文件,而不是逐行读取。
class LoadMetaRead(MRJob):
    """mrjob job that loads reads from a FASTA (.fna) file.

    Only ``mapper_raw`` is defined (no ``mapper``): the input is parsed as
    a whole file rather than line by line, and a job may not define both.
    """

    def mapper_raw(self, file_path, file_uri):
        """Parse the input file and emit one value per read (or read pair).

        Args:
            file_path: Path of the input file, handed to ``SeqIO.parse``.
            file_uri: Unused.

        Yields:
            ``(None, (sequence, label_index))`` where both tuple members
            are strings and ``label_index`` is the order in which the
            read's label was first seen in this file.
        """
        from Bio import SeqIO

        seqs = list(SeqIO.parse(file_path, 'fasta'))
        # Treated as paired-end when there are more than two records and
        # the first two ids end in different characters.
        # NOTE(review): a file holding exactly one pair (two records) is
        # not detected as paired-end -- confirm whether `>= 2` was meant.
        is_paired_end = (
            len(seqs) > 2 and seqs[0].id[-1:] != seqs[1].id[-1:]
        )
        step = 2 if is_paired_end else 1
        # Maps each distinct label to the order in which it was first seen.
        label_indices = {}
        for i in range(0, len(seqs), step):
            read, label = format_read(seqs[i])
            # With an odd number of records the last read has no mate;
            # emit it on its own instead of failing with IndexError.
            if is_paired_end and i + 1 < len(seqs):
                mate, _ = format_read(seqs[i + 1])
                read += mate
            index = label_indices.setdefault(label, len(label_indices))
            yield None, (str(read), str(index))

    def reducer(self, key, values):
        """Emit every value under its key, converted with ``str``."""
        for value in values:
            yield key, str(value)
此代码按预期工作。
我正在尝试在 Python 中使用 mrjob
读取 fna
文件。
这是我的load_read.py
程序,所有代码都可以在不使用mrjob
的情况下正常工作。
from mrjob.job import MRJob
from Bio import SeqIO
from Bio.Seq import Seq
import re
from operator import itemgetter
import sys
def format_read(read):
    """Return the sequence and source label of one parsed read.

    The record description is split on runs of ``|``, ``=``, ``{`` and
    ``,``; the fourth resulting field is used as the label. For a header
    such as ``r1.1 |SOURCES={GI=15668172,fw,...}|...`` that field is the
    value following ``GI=``.

    Args:
        read: Object exposing a ``description`` string and a ``seq``.

    Returns:
        A ``(read.seq, label)`` tuple.

    Raises:
        IndexError: If the description yields fewer than four fields.
    """
    fields = re.split('[|={,]+', read.description)
    return read.seq, fields[3]
class LoadMetaRead(MRJob):
    """mrjob job that loads reads from a FASTA (.fna) file.

    NOTE: this is the failing version. It defines both ``mapper_raw`` and
    ``mapper``, and running it raises
    ``ValueError: Can't specify both mapper_raw and mapper``.
    Only one of the two may be defined on a job.
    """

    def mapper_raw(self, file_path, file_uri):
        """Parse the whole input file and emit one record per read (pair)."""
        from Bio import SeqIO
        from Bio.Seq import Seq
        # NOTE(review): the working version of this job passes 'fasta' as
        # the second positional argument instead of type='fna' -- confirm
        # against the Bio.SeqIO.parse signature.
        seqs = list(SeqIO.parse(file_path, type='fna'))
        # Treated as paired-end when there are more than two records and
        # the first two ids end in different characters.
        is_paired_end = False
        if len(seqs) > 2 and seqs[0].id[-1:] != seqs[1].id[-1:]:
            is_paired_end = True
        # Maps each distinct label to the order in which it was first seen.
        label_list = dict()
        label_index = 0
        for i in range(0, len(seqs), 2 if is_paired_end else 1):
            read, label = format_read(seqs[i])
            if is_paired_end:
                # Append the mate's sequence to the first read of the pair.
                read2, _ = format_read(seqs[i + 1])
                read += read2
            if label not in label_list:
                label_list[label] = label_index
                label_index += 1
            # NOTE(review): three values are yielded here; the working
            # version yields a (key, value) pair -- confirm what mrjob
            # expects from a mapper.
            yield str(i), str(read), str(label_list[label])

    def mapper(self, _, line):
        """Emit every input line under the constant key 'read'."""
        yield 'read', line

    def reducer(self, key, values):
        """Emit the key together with its iterable of values, unchanged."""
        yield key, values

    # The reducer doubles as the combiner.
    combiner = reducer
if __name__ == '__main__':
    # Run the job when executed as a script, e.g. `python load_read.py R4.fna`.
    LoadMetaRead.run()
数据文件示例R4.fna
:
>r1.1 |SOURCES={GI=15668172,fw,1146130-1146958}|ERRORS={52_1:A,78_1:G,78_2:G,78_3:G,641_1:G}|SOURCE_1="Methanocaldococcus jannaschii DSM 2661 chromosome" (392b1054a4bf536ea1cc349545ace50120973c3a)
AAACCCTCTTCCACGAACCCTCTTGAAAATCCCCCACATCCACAAAATAAATCAAATAAATTTCA
ACATTATCACCAAAAGGGTAAAAGGTTATTTAAAAAATAAAATAAATTTAAAAATTTAAATTAAA
TACCAAAAAAGCCAAATAACTTATTGTGATTCTTGAGCTTTCTTTAACTCTGCCTTCATATCTTG
ATAGACTTTAGTCCATTTTAATTTTCTTGGATTTCTTCCCATTCTGTAGCTTTTCTCACATTTGG
ATGAGCAGAAATATAATACAGTCCCATCTTTTTCTACGACCATTTTTCCTTTTCCTGGCTCAATT
TCATAACCACAAAAGCTGCATGTTCTCCATTCTGGCATAGCTATCCCCCTTTAATAGTGTTTCAG
TGATTTTAAAATAATTTAAGATTAAATTATTTATCTTCTTCTGTCTAATGGTCTTGCTTCTCTCT
CTGTTTCTCTTAACATAATAATGTCTCCAACTTTAACTGGACCTTTAACGTTTCTAACTAAAACT
CTTCCAGTATCTTTTCCACCTAAGATTTTACATCTAACTTGTATAATTCCTCCAGTAACCCCTGT
TCTACCAATGACTTCAATAACTTCAGCAGCTACTGCTTCCTTATAAACAAATTCATCTTCCGATC
CTCATCACCTAATATTAATGAAGGTTTAAAATTTATAAAAAAGTTAGTAGTAGTGTTTCATAATT
TATATAATAATAACTATATACTATTGATTGATGGTTAAATAGCGTTCTAATAATTTACTGCTTCA
AAACATTTACCTTTTCAATTAATACCTTTAACTCTTCAGCATCTCCTTCGTTG
>r2.1 |SOURCES={GI=15668172,bw,239211-239971}|ERRORS={113:-,217_1:C,281_1:G,627_1:G,717_1:T}|SOURCE_1="Methanocaldococcus jannaschii DSM 2661 chromosome" (392b1054a4bf536ea1cc349545ace50120973c3a)
TAGCATGTAAATCCCTTATTTCTTAATTTCTCCCAGAATTATTTCTATTGCTTTATCAACTGCCT
TGGCAACCTCTTCAGACAACCCTGGTTTTATGTCTGGCATTGTAAATTTTTACCTTGACAACCAA
TAACCACGACTTCTATGCCTTTATTATGTAAATCTTTGAGAAATGGGGCTAATGGAACGTTATGG
GCATCGAAAGAATATTTTTTAACTATTCGGTAATTCATCAACATCTATCTTTTTTATTGTTCCAG
GTTCTAAATCAAAATCAATGGCGATCAACAACAATAATCTTTTTTATATCTTCATCAACCAACGT
CATTAAATAGTATGCTCCACTTGCCCCAGCATCTATAACTTCAACGTTATCTGGCAAGTTCATTT
TTTCTAATTTGCTAACAACCTCACATCCAAAGCCATCATCTCCAAACAACAGATTTCCACAACCA
ACAATTAATATATCCTTCTTTTTCATTTTATCACTTATTTAGCATTTCTTTATATTTTTTAGCCT
CTTCTTTAGGATTTTGTGATTGATAGATTGCCCTTCCAACAATGACGTAATCATTCTCATCTAAA
ATATTTAAAATATCCTCAATCTTCCCTCCCTGAGCTCCGACTCCGTGGTGTTATTACTGGCAATT
CTGCAATTTCTTTAATTTCTTTAAGCCTTTCAGGCCTTGTTGATGGAGCAACTATAGCATCAACT
TTTAGTTTTTTTAGCCATCTCTGACAATTTATCTGCTATTGGCTGTAG
当我使用以下命令运行程序时:
python load_read.py R4.fna
它引发了这个错误:
ValueError: Can't specify both mapper_raw and mapper
你知道如何解决这个问题吗?
所以我发现我不能同时定义mapper_raw()
和mapper
。我只需要定义其中之一。
我使用 mapper_raw()
是因为我读取了整个文件,而不是逐行读取。
class LoadMetaRead(MRJob):
    """mrjob job that loads reads from a FASTA (.fna) file.

    Only ``mapper_raw`` is defined (no ``mapper``): the input is parsed as
    a whole file rather than line by line, and a job may not define both.
    """

    def mapper_raw(self, file_path, file_uri):
        """Parse the input file and emit one value per read (or read pair).

        Args:
            file_path: Path of the input file, handed to ``SeqIO.parse``.
            file_uri: Unused.

        Yields:
            ``(None, (sequence, label_index))`` where both tuple members
            are strings and ``label_index`` is the order in which the
            read's label was first seen in this file.
        """
        from Bio import SeqIO

        seqs = list(SeqIO.parse(file_path, 'fasta'))
        # Treated as paired-end when there are more than two records and
        # the first two ids end in different characters.
        # NOTE(review): a file holding exactly one pair (two records) is
        # not detected as paired-end -- confirm whether `>= 2` was meant.
        is_paired_end = (
            len(seqs) > 2 and seqs[0].id[-1:] != seqs[1].id[-1:]
        )
        step = 2 if is_paired_end else 1
        # Maps each distinct label to the order in which it was first seen.
        label_indices = {}
        for i in range(0, len(seqs), step):
            read, label = format_read(seqs[i])
            # With an odd number of records the last read has no mate;
            # emit it on its own instead of failing with IndexError.
            if is_paired_end and i + 1 < len(seqs):
                mate, _ = format_read(seqs[i + 1])
                read += mate
            index = label_indices.setdefault(label, len(label_indices))
            yield None, (str(read), str(index))

    def reducer(self, key, values):
        """Emit every value under its key, converted with ``str``."""
        for value in values:
            yield key, str(value)
此代码按预期工作。