使用 Python 将多个文件中的项目组合成一个矩阵
Combining items from several files into one matrix using Python
我正在尝试将一个文件夹中多个文件 (1.20.1_Indel_allEff.vcf, 1.20.2_Indel_allEff.vcf ..... 1.200.1_Indel_allEff.vcf) 里的条目合并起来,以便得到一个如下所示的矩阵。
Fm Chromosome Position Ref Alt Gene X1.20.1 X1.20.2 X1.20.3
Fm chrI 100007 AT A CAR2 0 0 0
Fm chrX 3000676 G T HYM1 0 0 0.5
其中,X1.20.1、X1.20.2、X1.20.3 ..... X1.200.3 是文件夹中各个文件的名称,列中的值为各文件对应的频率值。
我用 Python 写了一段代码 (F1_comparison.py):
# Python 2 script (it uses the print statement) that merges per-timepoint
# variant files into one tab-separated table printed on stdout.
# NOTE(review): this paste has lost its indentation, so the nesting described
# in the comments below is inferred -- confirm against the original file.
# NOTE(review): defaultdict, listdir and sys are used but their imports are
# not part of this excerpt.
# snps maps a variant key string -> {timepoint label -> frequency string};
# looking up a missing timepoint yields "".
snps = defaultdict(lambda: defaultdict(str))
# listdir() returns bare file names, without the directory given on the command line.
myfiles=listdir(str(sys.argv[1]))
for f1 in myfiles:
# NOTE(review): f1 carries no directory component, so open() resolves it
# against the current working directory, not against sys.argv[1].
f = open(f1)
# Timepoint label: the text before the first "_" in the file name, rebuilt
# from its first three dot-separated pieces (e.g. "1.20.1").
tpp = f1.split("_")[0].split(".")
tp=tpp[0]+'.'+tpp[1]+'.'+tpp[2]
for l in f:
ls = l.split()
# Skip every line that contains "#" anywhere, and lines with fewer than
# seven whitespace-separated fields.
if l.find("#") == -1 and len(ls) > 6:
chrom = ls[0]
pos = ls[1]
ref = ls[2]
alt = ls[3]
freq = ls[4]
typ = ls[5]
gene = ls[6]
# Single-character ALT: one entry whose key joins ref and alt with "-".
if len(alt) == 1:
snps[pos+"_"+ref+"-"+alt+"_"+chrom+"_"+gene+"_"+typ][tp] = freq
# Longer ALT: split on "," and pair each allele with the frequency at the
# same index. Here ref and the allele are concatenated with no separator.
elif len(alt) > 1:
for k in range (0,len(alt.split(","))):
snps[pos+"_"+ref+alt.split(",")[k]+"_"+chrom+"_"+gene+"_"+typ][tp] = freq.split(",")[k]
f.close()
# Running row number written to the "Trajectory" column.
traj = 1
tp_list = ['1.20.1','1.20.2','1.20.3','1.30.1','1.30.2','1.30.3','1.40.1','1.40.2','1.40.3','1.50.1','1.50.2','1.50.3','1.60.1','1.60.2','1.60.3','1.90.1','1.90.2','1.90.3','1.100.1','1.100.2','1.100.3','1.130.1','1.130.2','1.130.3','1.200.1','1.200.2','1.200.3']
# NOTE(review): this header has no "1.40.2" column although tp_list contains
# that timepoint, so the header has one column fewer than the data rows.
print "Fermentor\tTrajectory\tChromosome\tPosition\tMutation\tGene\tEffect\t1.20.1\t1.20.2\t1.20.3\t1.30.1\t1.30.2\t1.30.3\t1.40.1\t1.40.3\t1.50.1\t1.50.2\t1.50.3\t1.60.1\t1.60.2\t1.60.3\t1.90.1\t1.90.2\t1.90.3\t1.100.1\t1.100.2\t1.100.3\t1.130.1\t1.130.2\t1.130.3\t1.200.1\t1.200.2\t1.200.3"
# Keys are sorted as plain strings, so rows are ordered lexicographically by
# the key text (which starts with the position).
for pos in sorted(snps.keys()):
# Take the key apart again; this relies on no field containing "_".
pos1 = pos.split("_")[0]
mut = pos.split("_")[1]
chrom = pos.split("_")[2]
gene = pos.split("_")[3]
typ = pos.split("_")[4]
tp_string = ""
for tp in tp_list:
# A timepoint with no stored frequency is reported as "0/0".
if len(snps[pos][tp])>0:
tp_string += "\t"+str(snps[pos][tp])
else:
tp_string += "\t"+str("0/0")
# NOTE(review): tp_string already starts with a tab and another "\t" is added
# in front of it here, which puts an empty field after the Effect column.
print "F1"+"\t"+str(traj)+"\t"+chrom+"\t"+pos1+"\t"+mut+"\t"+gene+"\t"+typ+"\t"+tp_string
traj += 1
但是,我收到一个错误,其中代码无法识别文件夹中的某些文件,尽管它们的格式都相同。
我的命令和我得到的错误:
python F1_comparison.py Fer1 > output.csv
Traceback (most recent call last):
File "Fer1_comparison.py", line 18, in <module>
f = open(f1)
IOError: [Errno 2] No such file or directory: '1.30.2_INDEL_allEff.vcf'
有人可以帮我解决这个问题吗?这将是一个很大的帮助。谢谢
您需要把目录路径和文件名拼接起来:
# Minimal fix for the IOError: open each file through its directory.
import sys
from os import path, listdir

pth = sys.argv[1]  # directory that holds the input files
myfiles = listdir(pth)  # bare file names only, without the directory part
for f1 in myfiles:
    # listdir() gives names relative to pth, so join them back onto pth before
    # opening; the with-block also closes the file for you.
    with open(path.join(pth, f1)) as f:
        tpp = f1.split("_", 1)[0].split(".")
        tp = ".".join(tpp[0:3])  # same as tp=tpp[0]+'.'+tpp[1]+'.'+tpp[2]
        for line in f:
            pass  # continue your code ...
你可以把代码写得更简洁、更高效:使用切片、解包和 str.format,而不是重复拆分:
"""Merge per-timepoint variant tables from one directory into a single matrix.

Usage: python F1_comparison.py <directory>

Every regular file in <directory> is read. The timepoint label is taken from
the file name (text before the first "_", first three dot-separated pieces,
e.g. "1.20.1_Indel_allEff.vcf" -> "1.20.1"). Lines containing "#" or having
fewer than seven whitespace-separated fields are ignored; the first seven
fields are read as chromosome, position, ref, alt, frequency, effect, gene.
The result is written as a tab-separated table with one row per variant and
one column per timepoint in TP_LIST.
"""
from collections import defaultdict
from os import listdir, path
import sys

# Output file name; it is created in the current working directory.
OUTPUT_FILE = "Fer1_INDELs_clones_filtered.csv"

# Timepoint columns, in output order. The header is generated from this list
# so that header and data rows can never get out of step.
TP_LIST = [
    '1.20.1', '1.20.2', '1.20.3', '1.30.1', '1.30.2', '1.30.3',
    '1.40.1', '1.40.2', '1.40.3', '1.50.1', '1.50.2', '1.50.3',
    '1.60.1', '1.60.2', '1.60.3', '1.90.1', '1.90.2', '1.90.3',
    '1.100.1', '1.100.2', '1.100.3', '1.130.1', '1.130.2', '1.130.3',
    '1.200.1', '1.200.2', '1.200.3',
]

FIXED_COLUMNS = [
    "Fermentor", "Trajectory", "Chromosome", "Position",
    "Mutation", "Gene", "Effect",
]

FERMENTOR = "F1"
MISSING = "0/0"  # written when a variant has no frequency at a timepoint


def parse_timepoint(filename):
    """Return the timepoint label encoded at the start of *filename*."""
    stem = filename.split("_", 1)[0]
    return ".".join(stem.split(".")[:3])


def collect_variants(directory, exclude=None):
    """Read every regular file in *directory* and gather frequencies.

    Returns a mapping ``(pos, mutation, chrom, gene, effect)`` ->
    ``{timepoint: frequency_string}``. Tuple keys are used instead of a
    "_"-joined string so that a field containing "_" cannot shift columns.

    *exclude* is an optional path that is skipped if it lives in *directory*
    (used to avoid re-reading a previous output file).

    Raises ValueError when a line lists more ALT alleles than frequencies.
    """
    snps = defaultdict(dict)
    skip = path.abspath(exclude) if exclude else None
    # sorted() makes the read order, and thus any overwrite, deterministic.
    for name in sorted(listdir(directory)):
        full_path = path.join(directory, name)
        if not path.isfile(full_path) or path.abspath(full_path) == skip:
            continue
        tp = parse_timepoint(name)
        with open(full_path) as handle:
            for line in handle:
                fields = line.split()
                if "#" in line or len(fields) < 7:
                    continue
                chrom, pos, ref, alt, freq, typ, gene = fields[:7]
                alleles = alt.split(",")
                # A one-character ALT keeps its frequency field untouched;
                # otherwise frequencies are matched to alleles by position.
                freqs = freq.split(",") if len(alt) > 1 else [freq]
                if len(freqs) < len(alleles):
                    raise ValueError(
                        "{}: {} ALT alleles but {} frequencies in line: {}".format(
                            full_path, len(alleles), len(freqs), line.strip()))
                for allele, allele_freq in zip(alleles, freqs):
                    mutation = "{}-{}".format(ref, allele)
                    snps[(pos, mutation, chrom, gene, typ)][tp] = allele_freq
    return snps


def write_matrix(directory, out_path=OUTPUT_FILE):
    """Collect variants from *directory* and write the matrix to *out_path*."""
    snps = collect_variants(directory, exclude=out_path)
    with open(out_path, "w") as out:
        out.write("\t".join(FIXED_COLUMNS + TP_LIST) + "\n")
        # Sorting on the "_"-joined key keeps plain string ordering of
        # position, mutation, chromosome, gene, effect.
        for traj, key in enumerate(sorted(snps, key="_".join), 1):
            pos, mutation, chrom, gene, typ = key
            per_tp = snps[key]
            # .get() avoids inserting empty entries; "" also counts as missing.
            values = [per_tp.get(tp) or MISSING for tp in TP_LIST]
            row = [FERMENTOR, str(traj), chrom, pos, mutation, gene, typ]
            out.write("\t".join(row + values) + "\n")


def main(argv=None):
    """Command-line entry point: ``argv[1]`` is the input directory."""
    argv = sys.argv if argv is None else argv
    if len(argv) < 2:
        sys.exit("usage: {} <directory>".format(argv[0] if argv else "script"))
    write_matrix(argv[1])


if __name__ == "__main__":
    main()
我正在尝试将一个文件夹中多个文件 (1.20.1_Indel_allEff.vcf, 1.20.2_Indel_allEff.vcf ..... 1.200.1_Indel_allEff.vcf) 里的条目合并起来,以便得到一个如下所示的矩阵。
Fm Chromosome Position Ref Alt Gene X1.20.1 X1.20.2 X1.20.3
Fm chrI 100007 AT A CAR2 0 0 0
Fm chrX 3000676 G T HYM1 0 0 0.5
其中,X1.20.1、X1.20.2、X1.20.3 ..... X1.200.3 是文件夹中各个文件的名称,列中的值为各文件对应的频率值。
我用 Python 写了一段代码 (F1_comparison.py):
# Python 2 script (it uses the print statement) that merges per-timepoint
# variant files into one tab-separated table printed on stdout.
# NOTE(review): this paste has lost its indentation, so the nesting described
# in the comments below is inferred -- confirm against the original file.
# NOTE(review): defaultdict, listdir and sys are used but their imports are
# not part of this excerpt.
# snps maps a variant key string -> {timepoint label -> frequency string};
# looking up a missing timepoint yields "".
snps = defaultdict(lambda: defaultdict(str))
# listdir() returns bare file names, without the directory given on the command line.
myfiles=listdir(str(sys.argv[1]))
for f1 in myfiles:
# NOTE(review): f1 carries no directory component, so open() resolves it
# against the current working directory, not against sys.argv[1].
f = open(f1)
# Timepoint label: the text before the first "_" in the file name, rebuilt
# from its first three dot-separated pieces (e.g. "1.20.1").
tpp = f1.split("_")[0].split(".")
tp=tpp[0]+'.'+tpp[1]+'.'+tpp[2]
for l in f:
ls = l.split()
# Skip every line that contains "#" anywhere, and lines with fewer than
# seven whitespace-separated fields.
if l.find("#") == -1 and len(ls) > 6:
chrom = ls[0]
pos = ls[1]
ref = ls[2]
alt = ls[3]
freq = ls[4]
typ = ls[5]
gene = ls[6]
# Single-character ALT: one entry whose key joins ref and alt with "-".
if len(alt) == 1:
snps[pos+"_"+ref+"-"+alt+"_"+chrom+"_"+gene+"_"+typ][tp] = freq
# Longer ALT: split on "," and pair each allele with the frequency at the
# same index. Here ref and the allele are concatenated with no separator.
elif len(alt) > 1:
for k in range (0,len(alt.split(","))):
snps[pos+"_"+ref+alt.split(",")[k]+"_"+chrom+"_"+gene+"_"+typ][tp] = freq.split(",")[k]
f.close()
# Running row number written to the "Trajectory" column.
traj = 1
tp_list = ['1.20.1','1.20.2','1.20.3','1.30.1','1.30.2','1.30.3','1.40.1','1.40.2','1.40.3','1.50.1','1.50.2','1.50.3','1.60.1','1.60.2','1.60.3','1.90.1','1.90.2','1.90.3','1.100.1','1.100.2','1.100.3','1.130.1','1.130.2','1.130.3','1.200.1','1.200.2','1.200.3']
# NOTE(review): this header has no "1.40.2" column although tp_list contains
# that timepoint, so the header has one column fewer than the data rows.
print "Fermentor\tTrajectory\tChromosome\tPosition\tMutation\tGene\tEffect\t1.20.1\t1.20.2\t1.20.3\t1.30.1\t1.30.2\t1.30.3\t1.40.1\t1.40.3\t1.50.1\t1.50.2\t1.50.3\t1.60.1\t1.60.2\t1.60.3\t1.90.1\t1.90.2\t1.90.3\t1.100.1\t1.100.2\t1.100.3\t1.130.1\t1.130.2\t1.130.3\t1.200.1\t1.200.2\t1.200.3"
# Keys are sorted as plain strings, so rows are ordered lexicographically by
# the key text (which starts with the position).
for pos in sorted(snps.keys()):
# Take the key apart again; this relies on no field containing "_".
pos1 = pos.split("_")[0]
mut = pos.split("_")[1]
chrom = pos.split("_")[2]
gene = pos.split("_")[3]
typ = pos.split("_")[4]
tp_string = ""
for tp in tp_list:
# A timepoint with no stored frequency is reported as "0/0".
if len(snps[pos][tp])>0:
tp_string += "\t"+str(snps[pos][tp])
else:
tp_string += "\t"+str("0/0")
# NOTE(review): tp_string already starts with a tab and another "\t" is added
# in front of it here, which puts an empty field after the Effect column.
print "F1"+"\t"+str(traj)+"\t"+chrom+"\t"+pos1+"\t"+mut+"\t"+gene+"\t"+typ+"\t"+tp_string
traj += 1
但是,我收到一个错误,其中代码无法识别文件夹中的某些文件,尽管它们的格式都相同。
我的命令和我得到的错误:
python F1_comparison.py Fer1 > output.csv
Traceback (most recent call last):
File "Fer1_comparison.py", line 18, in <module>
f = open(f1)
IOError: [Errno 2] No such file or directory: '1.30.2_INDEL_allEff.vcf'
有人可以帮我解决这个问题吗?这将是一个很大的帮助。谢谢
您需要把目录路径和文件名拼接起来:
# Minimal fix for the IOError: open each file through its directory.
import sys
from os import path, listdir

pth = sys.argv[1]  # directory that holds the input files
myfiles = listdir(pth)  # bare file names only, without the directory part
for f1 in myfiles:
    # listdir() gives names relative to pth, so join them back onto pth before
    # opening; the with-block also closes the file for you.
    with open(path.join(pth, f1)) as f:
        tpp = f1.split("_", 1)[0].split(".")
        tp = ".".join(tpp[0:3])  # same as tp=tpp[0]+'.'+tpp[1]+'.'+tpp[2]
        for line in f:
            pass  # continue your code ...
你可以把代码写得更简洁、更高效:使用切片、解包和 str.format,而不是重复拆分:
"""Merge per-timepoint variant tables from one directory into a single matrix.

Usage: python F1_comparison.py <directory>

Every regular file in <directory> is read. The timepoint label is taken from
the file name (text before the first "_", first three dot-separated pieces,
e.g. "1.20.1_Indel_allEff.vcf" -> "1.20.1"). Lines containing "#" or having
fewer than seven whitespace-separated fields are ignored; the first seven
fields are read as chromosome, position, ref, alt, frequency, effect, gene.
The result is written as a tab-separated table with one row per variant and
one column per timepoint in TP_LIST.
"""
from collections import defaultdict
from os import listdir, path
import sys

# Output file name; it is created in the current working directory.
OUTPUT_FILE = "Fer1_INDELs_clones_filtered.csv"

# Timepoint columns, in output order. The header is generated from this list
# so that header and data rows can never get out of step.
TP_LIST = [
    '1.20.1', '1.20.2', '1.20.3', '1.30.1', '1.30.2', '1.30.3',
    '1.40.1', '1.40.2', '1.40.3', '1.50.1', '1.50.2', '1.50.3',
    '1.60.1', '1.60.2', '1.60.3', '1.90.1', '1.90.2', '1.90.3',
    '1.100.1', '1.100.2', '1.100.3', '1.130.1', '1.130.2', '1.130.3',
    '1.200.1', '1.200.2', '1.200.3',
]

FIXED_COLUMNS = [
    "Fermentor", "Trajectory", "Chromosome", "Position",
    "Mutation", "Gene", "Effect",
]

FERMENTOR = "F1"
MISSING = "0/0"  # written when a variant has no frequency at a timepoint


def parse_timepoint(filename):
    """Return the timepoint label encoded at the start of *filename*."""
    stem = filename.split("_", 1)[0]
    return ".".join(stem.split(".")[:3])


def collect_variants(directory, exclude=None):
    """Read every regular file in *directory* and gather frequencies.

    Returns a mapping ``(pos, mutation, chrom, gene, effect)`` ->
    ``{timepoint: frequency_string}``. Tuple keys are used instead of a
    "_"-joined string so that a field containing "_" cannot shift columns.

    *exclude* is an optional path that is skipped if it lives in *directory*
    (used to avoid re-reading a previous output file).

    Raises ValueError when a line lists more ALT alleles than frequencies.
    """
    snps = defaultdict(dict)
    skip = path.abspath(exclude) if exclude else None
    # sorted() makes the read order, and thus any overwrite, deterministic.
    for name in sorted(listdir(directory)):
        full_path = path.join(directory, name)
        if not path.isfile(full_path) or path.abspath(full_path) == skip:
            continue
        tp = parse_timepoint(name)
        with open(full_path) as handle:
            for line in handle:
                fields = line.split()
                if "#" in line or len(fields) < 7:
                    continue
                chrom, pos, ref, alt, freq, typ, gene = fields[:7]
                alleles = alt.split(",")
                # A one-character ALT keeps its frequency field untouched;
                # otherwise frequencies are matched to alleles by position.
                freqs = freq.split(",") if len(alt) > 1 else [freq]
                if len(freqs) < len(alleles):
                    raise ValueError(
                        "{}: {} ALT alleles but {} frequencies in line: {}".format(
                            full_path, len(alleles), len(freqs), line.strip()))
                for allele, allele_freq in zip(alleles, freqs):
                    mutation = "{}-{}".format(ref, allele)
                    snps[(pos, mutation, chrom, gene, typ)][tp] = allele_freq
    return snps


def write_matrix(directory, out_path=OUTPUT_FILE):
    """Collect variants from *directory* and write the matrix to *out_path*."""
    snps = collect_variants(directory, exclude=out_path)
    with open(out_path, "w") as out:
        out.write("\t".join(FIXED_COLUMNS + TP_LIST) + "\n")
        # Sorting on the "_"-joined key keeps plain string ordering of
        # position, mutation, chromosome, gene, effect.
        for traj, key in enumerate(sorted(snps, key="_".join), 1):
            pos, mutation, chrom, gene, typ = key
            per_tp = snps[key]
            # .get() avoids inserting empty entries; "" also counts as missing.
            values = [per_tp.get(tp) or MISSING for tp in TP_LIST]
            row = [FERMENTOR, str(traj), chrom, pos, mutation, gene, typ]
            out.write("\t".join(row + values) + "\n")


def main(argv=None):
    """Command-line entry point: ``argv[1]`` is the input directory."""
    argv = sys.argv if argv is None else argv
    if len(argv) < 2:
        sys.exit("usage: {} <directory>".format(argv[0] if argv else "script"))
    write_matrix(argv[1])


if __name__ == "__main__":
    main()