在特定情况下连接文件列表中的文件
Concatenate files from list of files in specific cases
我在名为 data.tsv 的文件中有一个文件名列表。每行对应同一个样本 ID,每个 ID 最多可能有 8 个文件。我需要把以“_1.[可变扩展名]”结尾的文件合并在一起,并把以“_2.[可变扩展名]”结尾的文件合并在一起。
数据如下(Stack Overflow 会把制表符转换为空格,实际文件应以制表符分隔):
I5_fastq_path_1 I5_fastq_path_2 I5_fastq_path_3 I5_fastq_path_4 I5_fastq_path_5 I5_fastq_path_6 I5_fastq_path_7 I5_fastq_path_8
/some/path/to/directory/PD7597b_1_1.fastq.gz /some/path/to/directory/PD7597b_1_2.fastq.gz /some/path/to/directory/PD7597b_2_1.fastq.gz /some/path/to/directory/PD7597b_2_2.fastq.gz /some/path/to/directory/PD7597b_3_1.fastq.gz /some/path/to/directory/PD7597b_3_2.fastq.gz NA NA
/some/path/to/directory/WTCHG_65902_709501_1.fastq.gz /some/path/to/directory/WTCHG_65902_709501_2.fastq.gz /some/path/to/directory/WTCHG_68106_709501_1.fastq.gz /some/path/to/directory/WTCHG_68106_709501_2.fastq.gz /some/path/to/directory/WTCHG_68107_709501_1.fastq.gz /some/path/to/directory/WTCHG_68107_709501_2.fastq.gz /some/path/to/directory/WTCHG_68108_709501_1.fastq.gz /some/path/to/directory/WTCHG_68108_709501_1.fastq.gz
/some/path/to/directory/WTCHG_65902_702501_1.fastq.gz /some/path/to/directory/WTCHG_65902_702501_2.fastq.gz /some/path/to/directory/WTCHG_68106_702501_1.fastq.gz /some/path/to/directory/WTCHG_68106_702501_2.fastq.gz /some/path/to/directory/WTCHG_68107_702501_1.fastq.gz /some/path/to/directory/WTCHG_68107_702501_2.fastq.gz NA NA
/some/path/to/directory/WTCHG_87945_712502_1.fastq.gz /some/path/to/directory/WTCHG_87945_712502_2.fastq.gz /some/path/to/directory/WTCHG_88506_712502_1.fastq.gz /some/path/to/directory/WTCHG_88506_712502_2.fastq.gz /some/path/to/directory/WTCHG_88507_712502_1.fastq.gz /some/path/to/directory/WTCHG_88507_712502_2.fastq.gz NA NA
/some/path/to/directory/IFS003-W_DHE04956-7_HW3LFCCXX_L5_1.fq.gz /some/path/to/directory/IFS003-W_DHE04956-7_HW3LFCCXX_L5_2.fq.gz /some/path/to/directory/IFS003-W_DHE04956-7_HW3LFCCXX_L6_1.fq.gz /some/path/to/directory/IFS003-W_DHE04956-7_HW3LFCCXX_L6_2.fq.gz NA NA NA NA
/some/path/to/directory/WTCHG_65902_710501_1.fastq.gz /some/path/to/directory/WTCHG_65902_710501_2.fastq.gz /some/path/to/directory/WTCHG_68106_710501_1.fastq.gz /some/path/to/directory/WTCHG_68106_710501_2.fastq.gz /some/path/to/directory/WTCHG_68107_710501_1.fastq.gz /some/path/to/directory/WTCHG_68107_710501_2.fastq.gz NA NA
/some/path/to/directory/NG178_S1_R1_001.fastq.gz /some/path/to/directory/NG178_S1_R2_001.fastq.gz /some/path/to/directory/NG178_S3_R1_001.fastq.gz /some/path/to/directory/NG178_S3_R2_001.fastq.gz NA NA NA NA
/some/path/to/directory/NG232_S8_R1_001.fastq.gz /some/path/to/directory/NG232_S8_R2_001.fastq.gz /some/path/to/directory/NG232_S2_R1_001.fastq.gz /some/path/to/directory/NG232_S2_R2_001.fastq.gz NA NA NA NA
/some/path/to/directory/NG367_S19_R1_001.fastq.gz /some/path/to/directory/NG367_S19_R2_001.fastq.gz /some/path/to/directory/NG367_S6_R1_001.fastq.gz /some/path/to/directory/NG367_S6_R2_001.fastq.gz NA NA NA NA
/research/HGIGData/2022_NGS_BACKUP/IBD/PR0006_NDHE04397-A5-A5_HMTN3CCXY_L1_1.fq.gz /research/HGIGData/2022_NGS_BACKUP/IBD/PR0006_NDHE04397-A5-A5_HMTN3CCXY_L1_2.fq.gz /research/HGIGData/2022_NGS_BACKUP/IBD/PR0006_NDHE04397-A5-A5_HMTVYCCXY_L3_1.fq.gz /research/HGIGData/2022_NGS_BACKUP/IBD/PR0006_NDHE04397-A5-A5_HMTVYCCXY_L3_2.fq.gz /research/HGIGData/2022_NGS_BACKUP/IBD/PR0006_NDHE04397-A5-A5_HMVCKCCXY_L5_1.fq.gz /research/HGIGData/2022_NGS_BACKUP/IBD/PR0006_NDHE04397-A5-A5_HMVCKCCXY_L5_2.fq.gz NA NA
如果更方便,您可以运行下面的 HTML 代码片段;把渲染出的表格复制到文本编辑器后,各列即以制表符分隔。
<table>
<thead>
<tr>
<td>I5_fastq_path_1</td>
<td>I5_fastq_path_2</td>
<td>I5_fastq_path_3</td>
<td>I5_fastq_path_4</td>
<td>I5_fastq_path_5</td>
<td>I5_fastq_path_6</td>
<td>I5_fastq_path_7</td>
<td>I5_fastq_path_8</td>
</tr>
</thead>
<tr>
<td>/some/path/to/directory/PD7597b_1_1.fastq.gz</td>
<td>/some/path/to/directory/PD7597b_1_2.fastq.gz</td>
<td>/some/path/to/directory/PD7597b_2_1.fastq.gz</td>
<td>/some/path/to/directory/PD7597b_2_2.fastq.gz</td>
<td>/some/path/to/directory/PD7597b_3_1.fastq.gz</td>
<td>/some/path/to/directory/PD7597b_3_2.fastq.gz</td>
<td>NA</td>
<td>NA</td>
</tr>
<tr>
<td>/some/path/to/directory/WTCHG_65902_709501_1.fastq.gz</td>
<td>/some/path/to/directory/WTCHG_65902_709501_2.fastq.gz</td>
<td>/some/path/to/directory/WTCHG_68106_709501_1.fastq.gz</td>
<td>/some/path/to/directory/WTCHG_68106_709501_2.fastq.gz</td>
<td>/some/path/to/directory/WTCHG_68107_709501_1.fastq.gz</td>
<td>/some/path/to/directory/WTCHG_68107_709501_2.fastq.gz</td>
<td>/some/path/to/directory/WTCHG_68108_709501_1.fastq.gz</td>
<td>/some/path/to/directory/WTCHG_68108_709501_1.fastq.gz</td>
</tr>
<tr>
<td>/some/path/to/directory/WTCHG_65902_702501_1.fastq.gz</td>
<td>/some/path/to/directory/WTCHG_65902_702501_2.fastq.gz</td>
<td>/some/path/to/directory/WTCHG_68106_702501_1.fastq.gz</td>
<td>/some/path/to/directory/WTCHG_68106_702501_2.fastq.gz</td>
<td>/some/path/to/directory/WTCHG_68107_702501_1.fastq.gz</td>
<td>/some/path/to/directory/WTCHG_68107_702501_2.fastq.gz</td>
<td>NA</td>
<td>NA</td>
</tr>
<tr>
<td>/some/path/to/directory/WTCHG_87945_712502_1.fastq.gz</td>
<td>/some/path/to/directory/WTCHG_87945_712502_2.fastq.gz</td>
<td>/some/path/to/directory/WTCHG_88506_712502_1.fastq.gz</td>
<td>/some/path/to/directory/WTCHG_88506_712502_2.fastq.gz</td>
<td>/some/path/to/directory/WTCHG_88507_712502_1.fastq.gz</td>
<td>/some/path/to/directory/WTCHG_88507_712502_2.fastq.gz</td>
<td>NA</td>
<td>NA</td>
</tr>
<tr>
<td>/some/path/to/directory/IFS003-W_DHE04956-7_HW3LFCCXX_L5_1.fq.gz</td>
<td>/some/path/to/directory/IFS003-W_DHE04956-7_HW3LFCCXX_L5_2.fq.gz</td>
<td>/some/path/to/directory/IFS003-W_DHE04956-7_HW3LFCCXX_L6_1.fq.gz</td>
<td>/some/path/to/directory/IFS003-W_DHE04956-7_HW3LFCCXX_L6_2.fq.gz</td>
<td>NA</td>
<td>NA</td>
<td>NA</td>
<td>NA</td>
</tr>
<tr>
<td>/some/path/to/directory/WTCHG_65902_710501_1.fastq.gz</td>
<td>/some/path/to/directory/WTCHG_65902_710501_2.fastq.gz</td>
<td>/some/path/to/directory/WTCHG_68106_710501_1.fastq.gz</td>
<td>/some/path/to/directory/WTCHG_68106_710501_2.fastq.gz</td>
<td>/some/path/to/directory/WTCHG_68107_710501_1.fastq.gz</td>
<td>/some/path/to/directory/WTCHG_68107_710501_2.fastq.gz</td>
<td>NA</td>
<td>NA</td>
</tr>
<tr>
<td>/some/path/to/directory/NG178_S1_R1_001.fastq.gz</td>
<td>/some/path/to/directory/NG178_S1_R2_001.fastq.gz</td>
<td>/some/path/to/directory/NG178_S3_R1_001.fastq.gz</td>
<td>/some/path/to/directory/NG178_S3_R2_001.fastq.gz</td>
<td>NA</td>
<td>NA</td>
<td>NA</td>
<td>NA</td>
</tr>
<tr>
<td>/some/path/to/directory/NG232_S8_R1_001.fastq.gz</td>
<td>/some/path/to/directory/NG232_S8_R2_001.fastq.gz</td>
<td>/some/path/to/directory/NG232_S2_R1_001.fastq.gz</td>
<td>/some/path/to/directory/NG232_S2_R2_001.fastq.gz</td>
<td>NA</td>
<td>NA</td>
<td>NA</td>
<td>NA</td>
</tr>
<tr>
<td>/some/path/to/directory/NG367_S19_R1_001.fastq.gz</td>
<td>/some/path/to/directory/NG367_S19_R2_001.fastq.gz</td>
<td>/some/path/to/directory/NG367_S6_R1_001.fastq.gz</td>
<td>/some/path/to/directory/NG367_S6_R2_001.fastq.gz</td>
<td>NA</td>
<td>NA</td>
<td>NA</td>
<td>NA</td>
</tr>
<tr>
<td>/research/HGIGData/2022_NGS_BACKUP/IBD/PR0006_NDHE04397-A5-A5_HMTN3CCXY_L1_1.fq.gz</td>
<td>/research/HGIGData/2022_NGS_BACKUP/IBD/PR0006_NDHE04397-A5-A5_HMTN3CCXY_L1_2.fq.gz</td>
<td>/research/HGIGData/2022_NGS_BACKUP/IBD/PR0006_NDHE04397-A5-A5_HMTVYCCXY_L3_1.fq.gz</td>
<td>/research/HGIGData/2022_NGS_BACKUP/IBD/PR0006_NDHE04397-A5-A5_HMTVYCCXY_L3_2.fq.gz</td>
<td>/research/HGIGData/2022_NGS_BACKUP/IBD/PR0006_NDHE04397-A5-A5_HMVCKCCXY_L5_1.fq.gz</td>
<td>/research/HGIGData/2022_NGS_BACKUP/IBD/PR0006_NDHE04397-A5-A5_HMVCKCCXY_L5_2.fq.gz</td>
<td>NA</td>
<td>NA</td>
</tr>
</table>
如果每行有 4 列,则合并第 1 列和第 3 列中的文件,并合并第 2 列和第 4 列中的文件。
如果每行有 6 列,则合并第 1、3、5 列中的文件,并合并第 2、4、6 列中的文件。
此模式最多可延续到 8 列。请注意,并非所有行都具有相同数量的文件,但每行的文件数始终是偶数。
我已经成功地完成了 4 列,但我想要一种方法来为 6、8 甚至更多列执行此操作,而无需对其进行硬编码...我的硬编码尝试如下:
# Hard-coded attempt: merges exactly two file pairs per row (columns 1-4
# of the tab-separated data.tsv) and appends the new paths to new_files.
{
  # Skip the header row so its column titles are not used as file paths
  read -r _
  while read -r line; do
    path1=$(printf '%s\n' "$line" | cut -f1)
    path2=$(printf '%s\n' "$line" | cut -f2)
    path3=$(printf '%s\n' "$line" | cut -f3)
    path4=$(printf '%s\n' "$line" | cut -f4)
    #extract sample ID and append 'merged'
    # Strip the directory before the extension so a '.' in a directory
    # name cannot truncate the ID
    ID=${path1##*/}
    ID=${ID%%.*}
    # Replace the first '_1' in the file name with '_merged'
    ID=$(printf '%s\n' "$ID" | sed 's/_1/_merged/')
    #merge files
    # Quoted so paths containing spaces or glob characters stay intact
    cat -- "$path1" "$path3" > "/research/merged_fq/${ID}_1.fq.gz"
    cat -- "$path2" "$path4" > "/research/merged_fq/${ID}_2.fq.gz"
    #create file with new file paths in
    a="/research/merged_fq/${ID}_1.fq.gz"
    b="/research/merged_fq/${ID}_2.fq.gz"
    echo "$a" "$b" >>new_files
  done
} < data.tsv
我修正了你的整个脚本,并在各处添加了注释,方便你了解它的工作细节:
#!/usr/bin/env bash
# Merge paired fastq files listed in data.tsv.
# For every data row, the files in odd columns (1, 3, 5, ...) are
# concatenated into one file and the files in even columns (2, 4, 6, ...)
# into another, for any number of columns. A field equal to 'NA' ends the row.
# Input:  data.tsv (current directory), first row is a header.
# Output: merged files under $mergePath, and merged_files.tsv listing them.
# Note: fields are split on any whitespace, so paths must not contain spaces.

# Path where to store merged fq files
mergePath='/research/merged_fq'

# Make sure the output directory exists before redirecting into it
mkdir -p -- "$mergePath" || exit 1

{
  # Print header for merged files tsv
  printf 'MergedOddFile\tMergedEvenFile\n'

  # Read in dummy variable to skip header row of data.tsv
  read -r _

  # Iterate reading rows of data until End Of File;
  # the '|| [[ -n ... ]]' part keeps a last row that lacks a trailing newline
  while IFS= read -r row || [[ -n "$row" ]]; do
    # Drop a trailing carriage return left by Windows line endings,
    # otherwise a final 'NA' field would not be recognised
    row="${row%$'\r'}"

    # Map row fields to files array
    read -ra files <<<"$row"

    # Initialize arrays for odd and even files
    oddFiles=()
    evenFiles=()

    # Iterate the files array indexes
    for i in "${!files[@]}"; do
      # Stop processing files when name is 'NA' or empty
      [[ 'NA' == "${files[i]}" || -z "${files[i]}" ]] && break
      # Add file to corresponding merge array
      # (array index 0 is column 1, hence an odd index is an even column)
      if ((i % 2)); then
        evenFiles+=("${files[i]}")
      else
        oddFiles+=("${files[i]}")
      fi
    done

    # Skip rows without at least one odd and one even file: cat without
    # operands would read standard input, i.e. swallow the rest of data.tsv
    if ((${#oddFiles[@]} == 0 || ${#evenFiles[@]} == 0)); then
      if [[ -n "$row" ]]; then
        printf 'warning: skipping row without a complete file pair: %s\n' "$row" >&2
      fi
      continue
    fi

    # Extract sample ID _merged, suffix for odd and even fastq files
    # Remove path to get file name only
    oddBaseName="${oddFiles[0]##*/}"
    # Remove everything from after the last '_' and add '_merged'
    oddID="${oddBaseName%_*}_merged"
    # Remove everything until the last '_'
    oddSuffix="${oddBaseName##*_}"
    # Strip the all the .extension like .fq.gz
    oddSuffix="${oddSuffix%%.*}"

    # Same for even
    evenBaseName="${evenFiles[0]##*/}"
    evenID="${evenBaseName%_*}_merged"
    evenSuffix="${evenBaseName##*_}"
    evenSuffix="${evenSuffix%%.*}"

    # Compose merged file names
    oddMerge="${mergePath}/${oddID}_${oddSuffix}.fq.gz"
    evenMerge="${mergePath}/${evenID}_${evenSuffix}.fq.gz"

    # Merge odd and even files separately; only log the pair when both
    # concatenations succeeded so the listing never names a broken merge
    if cat -- "${oddFiles[@]}" > "$oddMerge" &&
      cat -- "${evenFiles[@]}" > "$evenMerge"; then
      # Log merged files
      printf '%s\t%s\n' "$oddMerge" "$evenMerge"
    else
      printf 'warning: failed to merge files for row: %s\n' "$row" >&2
    fi
  done
} < data.tsv > merged_files.tsv
我在名为 data.tsv 的文件中有一个文件名列表。每行对应同一个样本 ID,每个 ID 最多可能有 8 个文件。我需要把以“_1.[可变扩展名]”结尾的文件合并在一起,并把以“_2.[可变扩展名]”结尾的文件合并在一起。
数据如下(Stack Overflow 会把制表符转换为空格,实际文件应以制表符分隔):
I5_fastq_path_1 I5_fastq_path_2 I5_fastq_path_3 I5_fastq_path_4 I5_fastq_path_5 I5_fastq_path_6 I5_fastq_path_7 I5_fastq_path_8
/some/path/to/directory/PD7597b_1_1.fastq.gz /some/path/to/directory/PD7597b_1_2.fastq.gz /some/path/to/directory/PD7597b_2_1.fastq.gz /some/path/to/directory/PD7597b_2_2.fastq.gz /some/path/to/directory/PD7597b_3_1.fastq.gz /some/path/to/directory/PD7597b_3_2.fastq.gz NA NA
/some/path/to/directory/WTCHG_65902_709501_1.fastq.gz /some/path/to/directory/WTCHG_65902_709501_2.fastq.gz /some/path/to/directory/WTCHG_68106_709501_1.fastq.gz /some/path/to/directory/WTCHG_68106_709501_2.fastq.gz /some/path/to/directory/WTCHG_68107_709501_1.fastq.gz /some/path/to/directory/WTCHG_68107_709501_2.fastq.gz /some/path/to/directory/WTCHG_68108_709501_1.fastq.gz /some/path/to/directory/WTCHG_68108_709501_1.fastq.gz
/some/path/to/directory/WTCHG_65902_702501_1.fastq.gz /some/path/to/directory/WTCHG_65902_702501_2.fastq.gz /some/path/to/directory/WTCHG_68106_702501_1.fastq.gz /some/path/to/directory/WTCHG_68106_702501_2.fastq.gz /some/path/to/directory/WTCHG_68107_702501_1.fastq.gz /some/path/to/directory/WTCHG_68107_702501_2.fastq.gz NA NA
/some/path/to/directory/WTCHG_87945_712502_1.fastq.gz /some/path/to/directory/WTCHG_87945_712502_2.fastq.gz /some/path/to/directory/WTCHG_88506_712502_1.fastq.gz /some/path/to/directory/WTCHG_88506_712502_2.fastq.gz /some/path/to/directory/WTCHG_88507_712502_1.fastq.gz /some/path/to/directory/WTCHG_88507_712502_2.fastq.gz NA NA
/some/path/to/directory/IFS003-W_DHE04956-7_HW3LFCCXX_L5_1.fq.gz /some/path/to/directory/IFS003-W_DHE04956-7_HW3LFCCXX_L5_2.fq.gz /some/path/to/directory/IFS003-W_DHE04956-7_HW3LFCCXX_L6_1.fq.gz /some/path/to/directory/IFS003-W_DHE04956-7_HW3LFCCXX_L6_2.fq.gz NA NA NA NA
/some/path/to/directory/WTCHG_65902_710501_1.fastq.gz /some/path/to/directory/WTCHG_65902_710501_2.fastq.gz /some/path/to/directory/WTCHG_68106_710501_1.fastq.gz /some/path/to/directory/WTCHG_68106_710501_2.fastq.gz /some/path/to/directory/WTCHG_68107_710501_1.fastq.gz /some/path/to/directory/WTCHG_68107_710501_2.fastq.gz NA NA
/some/path/to/directory/NG178_S1_R1_001.fastq.gz /some/path/to/directory/NG178_S1_R2_001.fastq.gz /some/path/to/directory/NG178_S3_R1_001.fastq.gz /some/path/to/directory/NG178_S3_R2_001.fastq.gz NA NA NA NA
/some/path/to/directory/NG232_S8_R1_001.fastq.gz /some/path/to/directory/NG232_S8_R2_001.fastq.gz /some/path/to/directory/NG232_S2_R1_001.fastq.gz /some/path/to/directory/NG232_S2_R2_001.fastq.gz NA NA NA NA
/some/path/to/directory/NG367_S19_R1_001.fastq.gz /some/path/to/directory/NG367_S19_R2_001.fastq.gz /some/path/to/directory/NG367_S6_R1_001.fastq.gz /some/path/to/directory/NG367_S6_R2_001.fastq.gz NA NA NA NA
/research/HGIGData/2022_NGS_BACKUP/IBD/PR0006_NDHE04397-A5-A5_HMTN3CCXY_L1_1.fq.gz /research/HGIGData/2022_NGS_BACKUP/IBD/PR0006_NDHE04397-A5-A5_HMTN3CCXY_L1_2.fq.gz /research/HGIGData/2022_NGS_BACKUP/IBD/PR0006_NDHE04397-A5-A5_HMTVYCCXY_L3_1.fq.gz /research/HGIGData/2022_NGS_BACKUP/IBD/PR0006_NDHE04397-A5-A5_HMTVYCCXY_L3_2.fq.gz /research/HGIGData/2022_NGS_BACKUP/IBD/PR0006_NDHE04397-A5-A5_HMVCKCCXY_L5_1.fq.gz /research/HGIGData/2022_NGS_BACKUP/IBD/PR0006_NDHE04397-A5-A5_HMVCKCCXY_L5_2.fq.gz NA NA
如果更方便,您可以运行下面的 HTML 代码片段;把渲染出的表格复制到文本编辑器后,各列即以制表符分隔。
<table>
<thead>
<tr>
<td>I5_fastq_path_1</td>
<td>I5_fastq_path_2</td>
<td>I5_fastq_path_3</td>
<td>I5_fastq_path_4</td>
<td>I5_fastq_path_5</td>
<td>I5_fastq_path_6</td>
<td>I5_fastq_path_7</td>
<td>I5_fastq_path_8</td>
</tr>
</thead>
<tr>
<td>/some/path/to/directory/PD7597b_1_1.fastq.gz</td>
<td>/some/path/to/directory/PD7597b_1_2.fastq.gz</td>
<td>/some/path/to/directory/PD7597b_2_1.fastq.gz</td>
<td>/some/path/to/directory/PD7597b_2_2.fastq.gz</td>
<td>/some/path/to/directory/PD7597b_3_1.fastq.gz</td>
<td>/some/path/to/directory/PD7597b_3_2.fastq.gz</td>
<td>NA</td>
<td>NA</td>
</tr>
<tr>
<td>/some/path/to/directory/WTCHG_65902_709501_1.fastq.gz</td>
<td>/some/path/to/directory/WTCHG_65902_709501_2.fastq.gz</td>
<td>/some/path/to/directory/WTCHG_68106_709501_1.fastq.gz</td>
<td>/some/path/to/directory/WTCHG_68106_709501_2.fastq.gz</td>
<td>/some/path/to/directory/WTCHG_68107_709501_1.fastq.gz</td>
<td>/some/path/to/directory/WTCHG_68107_709501_2.fastq.gz</td>
<td>/some/path/to/directory/WTCHG_68108_709501_1.fastq.gz</td>
<td>/some/path/to/directory/WTCHG_68108_709501_1.fastq.gz</td>
</tr>
<tr>
<td>/some/path/to/directory/WTCHG_65902_702501_1.fastq.gz</td>
<td>/some/path/to/directory/WTCHG_65902_702501_2.fastq.gz</td>
<td>/some/path/to/directory/WTCHG_68106_702501_1.fastq.gz</td>
<td>/some/path/to/directory/WTCHG_68106_702501_2.fastq.gz</td>
<td>/some/path/to/directory/WTCHG_68107_702501_1.fastq.gz</td>
<td>/some/path/to/directory/WTCHG_68107_702501_2.fastq.gz</td>
<td>NA</td>
<td>NA</td>
</tr>
<tr>
<td>/some/path/to/directory/WTCHG_87945_712502_1.fastq.gz</td>
<td>/some/path/to/directory/WTCHG_87945_712502_2.fastq.gz</td>
<td>/some/path/to/directory/WTCHG_88506_712502_1.fastq.gz</td>
<td>/some/path/to/directory/WTCHG_88506_712502_2.fastq.gz</td>
<td>/some/path/to/directory/WTCHG_88507_712502_1.fastq.gz</td>
<td>/some/path/to/directory/WTCHG_88507_712502_2.fastq.gz</td>
<td>NA</td>
<td>NA</td>
</tr>
<tr>
<td>/some/path/to/directory/IFS003-W_DHE04956-7_HW3LFCCXX_L5_1.fq.gz</td>
<td>/some/path/to/directory/IFS003-W_DHE04956-7_HW3LFCCXX_L5_2.fq.gz</td>
<td>/some/path/to/directory/IFS003-W_DHE04956-7_HW3LFCCXX_L6_1.fq.gz</td>
<td>/some/path/to/directory/IFS003-W_DHE04956-7_HW3LFCCXX_L6_2.fq.gz</td>
<td>NA</td>
<td>NA</td>
<td>NA</td>
<td>NA</td>
</tr>
<tr>
<td>/some/path/to/directory/WTCHG_65902_710501_1.fastq.gz</td>
<td>/some/path/to/directory/WTCHG_65902_710501_2.fastq.gz</td>
<td>/some/path/to/directory/WTCHG_68106_710501_1.fastq.gz</td>
<td>/some/path/to/directory/WTCHG_68106_710501_2.fastq.gz</td>
<td>/some/path/to/directory/WTCHG_68107_710501_1.fastq.gz</td>
<td>/some/path/to/directory/WTCHG_68107_710501_2.fastq.gz</td>
<td>NA</td>
<td>NA</td>
</tr>
<tr>
<td>/some/path/to/directory/NG178_S1_R1_001.fastq.gz</td>
<td>/some/path/to/directory/NG178_S1_R2_001.fastq.gz</td>
<td>/some/path/to/directory/NG178_S3_R1_001.fastq.gz</td>
<td>/some/path/to/directory/NG178_S3_R2_001.fastq.gz</td>
<td>NA</td>
<td>NA</td>
<td>NA</td>
<td>NA</td>
</tr>
<tr>
<td>/some/path/to/directory/NG232_S8_R1_001.fastq.gz</td>
<td>/some/path/to/directory/NG232_S8_R2_001.fastq.gz</td>
<td>/some/path/to/directory/NG232_S2_R1_001.fastq.gz</td>
<td>/some/path/to/directory/NG232_S2_R2_001.fastq.gz</td>
<td>NA</td>
<td>NA</td>
<td>NA</td>
<td>NA</td>
</tr>
<tr>
<td>/some/path/to/directory/NG367_S19_R1_001.fastq.gz</td>
<td>/some/path/to/directory/NG367_S19_R2_001.fastq.gz</td>
<td>/some/path/to/directory/NG367_S6_R1_001.fastq.gz</td>
<td>/some/path/to/directory/NG367_S6_R2_001.fastq.gz</td>
<td>NA</td>
<td>NA</td>
<td>NA</td>
<td>NA</td>
</tr>
<tr>
<td>/research/HGIGData/2022_NGS_BACKUP/IBD/PR0006_NDHE04397-A5-A5_HMTN3CCXY_L1_1.fq.gz</td>
<td>/research/HGIGData/2022_NGS_BACKUP/IBD/PR0006_NDHE04397-A5-A5_HMTN3CCXY_L1_2.fq.gz</td>
<td>/research/HGIGData/2022_NGS_BACKUP/IBD/PR0006_NDHE04397-A5-A5_HMTVYCCXY_L3_1.fq.gz</td>
<td>/research/HGIGData/2022_NGS_BACKUP/IBD/PR0006_NDHE04397-A5-A5_HMTVYCCXY_L3_2.fq.gz</td>
<td>/research/HGIGData/2022_NGS_BACKUP/IBD/PR0006_NDHE04397-A5-A5_HMVCKCCXY_L5_1.fq.gz</td>
<td>/research/HGIGData/2022_NGS_BACKUP/IBD/PR0006_NDHE04397-A5-A5_HMVCKCCXY_L5_2.fq.gz</td>
<td>NA</td>
<td>NA</td>
</tr>
</table>
如果每行有 4 列,则合并第 1 列和第 3 列中的文件,并合并第 2 列和第 4 列中的文件。
如果每行有 6 列,则合并第 1、3、5 列中的文件,并合并第 2、4、6 列中的文件。
此模式最多可延续到 8 列。请注意,并非所有行都具有相同数量的文件,但每行的文件数始终是偶数。
我已经成功地完成了 4 列,但我想要一种方法来为 6、8 甚至更多列执行此操作,而无需对其进行硬编码...我的硬编码尝试如下:
# Hard-coded attempt: merges exactly two file pairs per row (columns 1-4
# of the tab-separated data.tsv) and appends the new paths to new_files.
{
  # Skip the header row so its column titles are not used as file paths
  read -r _
  while read -r line; do
    path1=$(printf '%s\n' "$line" | cut -f1)
    path2=$(printf '%s\n' "$line" | cut -f2)
    path3=$(printf '%s\n' "$line" | cut -f3)
    path4=$(printf '%s\n' "$line" | cut -f4)
    #extract sample ID and append 'merged'
    # Strip the directory before the extension so a '.' in a directory
    # name cannot truncate the ID
    ID=${path1##*/}
    ID=${ID%%.*}
    # Replace the first '_1' in the file name with '_merged'
    ID=$(printf '%s\n' "$ID" | sed 's/_1/_merged/')
    #merge files
    # Quoted so paths containing spaces or glob characters stay intact
    cat -- "$path1" "$path3" > "/research/merged_fq/${ID}_1.fq.gz"
    cat -- "$path2" "$path4" > "/research/merged_fq/${ID}_2.fq.gz"
    #create file with new file paths in
    a="/research/merged_fq/${ID}_1.fq.gz"
    b="/research/merged_fq/${ID}_2.fq.gz"
    echo "$a" "$b" >>new_files
  done
} < data.tsv
我修正了你的整个脚本,并在各处添加了注释,方便你了解它的工作细节:
#!/usr/bin/env bash
# Merge paired fastq files listed in data.tsv.
# For every data row, the files in odd columns (1, 3, 5, ...) are
# concatenated into one file and the files in even columns (2, 4, 6, ...)
# into another, for any number of columns. A field equal to 'NA' ends the row.
# Input:  data.tsv (current directory), first row is a header.
# Output: merged files under $mergePath, and merged_files.tsv listing them.
# Note: fields are split on any whitespace, so paths must not contain spaces.

# Path where to store merged fq files
mergePath='/research/merged_fq'

# Make sure the output directory exists before redirecting into it
mkdir -p -- "$mergePath" || exit 1

{
  # Print header for merged files tsv
  printf 'MergedOddFile\tMergedEvenFile\n'

  # Read in dummy variable to skip header row of data.tsv
  read -r _

  # Iterate reading rows of data until End Of File;
  # the '|| [[ -n ... ]]' part keeps a last row that lacks a trailing newline
  while IFS= read -r row || [[ -n "$row" ]]; do
    # Drop a trailing carriage return left by Windows line endings,
    # otherwise a final 'NA' field would not be recognised
    row="${row%$'\r'}"

    # Map row fields to files array
    read -ra files <<<"$row"

    # Initialize arrays for odd and even files
    oddFiles=()
    evenFiles=()

    # Iterate the files array indexes
    for i in "${!files[@]}"; do
      # Stop processing files when name is 'NA' or empty
      [[ 'NA' == "${files[i]}" || -z "${files[i]}" ]] && break
      # Add file to corresponding merge array
      # (array index 0 is column 1, hence an odd index is an even column)
      if ((i % 2)); then
        evenFiles+=("${files[i]}")
      else
        oddFiles+=("${files[i]}")
      fi
    done

    # Skip rows without at least one odd and one even file: cat without
    # operands would read standard input, i.e. swallow the rest of data.tsv
    if ((${#oddFiles[@]} == 0 || ${#evenFiles[@]} == 0)); then
      if [[ -n "$row" ]]; then
        printf 'warning: skipping row without a complete file pair: %s\n' "$row" >&2
      fi
      continue
    fi

    # Extract sample ID _merged, suffix for odd and even fastq files
    # Remove path to get file name only
    oddBaseName="${oddFiles[0]##*/}"
    # Remove everything from after the last '_' and add '_merged'
    oddID="${oddBaseName%_*}_merged"
    # Remove everything until the last '_'
    oddSuffix="${oddBaseName##*_}"
    # Strip the all the .extension like .fq.gz
    oddSuffix="${oddSuffix%%.*}"

    # Same for even
    evenBaseName="${evenFiles[0]##*/}"
    evenID="${evenBaseName%_*}_merged"
    evenSuffix="${evenBaseName##*_}"
    evenSuffix="${evenSuffix%%.*}"

    # Compose merged file names
    oddMerge="${mergePath}/${oddID}_${oddSuffix}.fq.gz"
    evenMerge="${mergePath}/${evenID}_${evenSuffix}.fq.gz"

    # Merge odd and even files separately; only log the pair when both
    # concatenations succeeded so the listing never names a broken merge
    if cat -- "${oddFiles[@]}" > "$oddMerge" &&
      cat -- "${evenFiles[@]}" > "$evenMerge"; then
      # Log merged files
      printf '%s\t%s\n' "$oddMerge" "$evenMerge"
    else
      printf 'warning: failed to merge files for row: %s\n' "$row" >&2
    fi
  done
} < data.tsv > merged_files.tsv