在特定情况下连接文件列表中的文件

Concatenate files from list of files in specific cases

我在名为 data.tsv 的文件中有一个文件名列表。每一行对应同一个样本 ID,每个 ID 最多有 8 个文件。我需要分别合并以“_1.[可变扩展名]”结尾的文件和以“_2.[可变扩展名]”结尾的文件。

数据如下(Stack Overflow 把制表符转换成了空格,实际文件应以制表符分隔):

I5_fastq_path_1 I5_fastq_path_2 I5_fastq_path_3 I5_fastq_path_4 I5_fastq_path_5 I5_fastq_path_6 I5_fastq_path_7 I5_fastq_path_8                             
/some/path/to/directory/PD7597b_1_1.fastq.gz    /some/path/to/directory/PD7597b_1_2.fastq.gz    /some/path/to/directory/PD7597b_2_1.fastq.gz    /some/path/to/directory/PD7597b_2_2.fastq.gz    /some/path/to/directory/PD7597b_3_1.fastq.gz    /some/path/to/directory/PD7597b_3_2.fastq.gz    NA  NA                              
/some/path/to/directory/WTCHG_65902_709501_1.fastq.gz   /some/path/to/directory/WTCHG_65902_709501_2.fastq.gz   /some/path/to/directory/WTCHG_68106_709501_1.fastq.gz   /some/path/to/directory/WTCHG_68106_709501_2.fastq.gz   /some/path/to/directory/WTCHG_68107_709501_1.fastq.gz   /some/path/to/directory/WTCHG_68107_709501_2.fastq.gz   /some/path/to/directory/WTCHG_68108_709501_1.fastq.gz   /some/path/to/directory/WTCHG_68108_709501_1.fastq.gz                               
/some/path/to/directory/WTCHG_65902_702501_1.fastq.gz   /some/path/to/directory/WTCHG_65902_702501_2.fastq.gz   /some/path/to/directory/WTCHG_68106_702501_1.fastq.gz   /some/path/to/directory/WTCHG_68106_702501_2.fastq.gz   /some/path/to/directory/WTCHG_68107_702501_1.fastq.gz   /some/path/to/directory/WTCHG_68107_702501_2.fastq.gz   NA  NA                              
/some/path/to/directory/WTCHG_87945_712502_1.fastq.gz   /some/path/to/directory/WTCHG_87945_712502_2.fastq.gz   /some/path/to/directory/WTCHG_88506_712502_1.fastq.gz   /some/path/to/directory/WTCHG_88506_712502_2.fastq.gz   /some/path/to/directory/WTCHG_88507_712502_1.fastq.gz   /some/path/to/directory/WTCHG_88507_712502_2.fastq.gz   NA  NA                              
/some/path/to/directory/IFS003-W_DHE04956-7_HW3LFCCXX_L5_1.fq.gz    /some/path/to/directory/IFS003-W_DHE04956-7_HW3LFCCXX_L5_2.fq.gz    /some/path/to/directory/IFS003-W_DHE04956-7_HW3LFCCXX_L6_1.fq.gz    /some/path/to/directory/IFS003-W_DHE04956-7_HW3LFCCXX_L6_2.fq.gz    NA  NA  NA  NA                              
/some/path/to/directory/WTCHG_65902_710501_1.fastq.gz   /some/path/to/directory/WTCHG_65902_710501_2.fastq.gz   /some/path/to/directory/WTCHG_68106_710501_1.fastq.gz   /some/path/to/directory/WTCHG_68106_710501_2.fastq.gz   /some/path/to/directory/WTCHG_68107_710501_1.fastq.gz   /some/path/to/directory/WTCHG_68107_710501_2.fastq.gz   NA  NA                              
/some/path/to/directory/NG178_S1_R1_001.fastq.gz    /some/path/to/directory/NG178_S1_R2_001.fastq.gz    /some/path/to/directory/NG178_S3_R1_001.fastq.gz    /some/path/to/directory/NG178_S3_R2_001.fastq.gz    NA  NA  NA  NA                              
/some/path/to/directory/NG232_S8_R1_001.fastq.gz    /some/path/to/directory/NG232_S8_R2_001.fastq.gz    /some/path/to/directory/NG232_S2_R1_001.fastq.gz    /some/path/to/directory/NG232_S2_R2_001.fastq.gz    NA  NA  NA  NA                              
/some/path/to/directory/NG367_S19_R1_001.fastq.gz   /some/path/to/directory/NG367_S19_R2_001.fastq.gz   /some/path/to/directory/NG367_S6_R1_001.fastq.gz    /some/path/to/directory/NG367_S6_R2_001.fastq.gz    NA  NA  NA  NA                              
/research/HGIGData/2022_NGS_BACKUP/IBD/PR0006_NDHE04397-A5-A5_HMTN3CCXY_L1_1.fq.gz  /research/HGIGData/2022_NGS_BACKUP/IBD/PR0006_NDHE04397-A5-A5_HMTN3CCXY_L1_2.fq.gz  /research/HGIGData/2022_NGS_BACKUP/IBD/PR0006_NDHE04397-A5-A5_HMTVYCCXY_L3_1.fq.gz  /research/HGIGData/2022_NGS_BACKUP/IBD/PR0006_NDHE04397-A5-A5_HMTVYCCXY_L3_2.fq.gz  /research/HGIGData/2022_NGS_BACKUP/IBD/PR0006_NDHE04397-A5-A5_HMVCKCCXY_L5_1.fq.gz  /research/HGIGData/2022_NGS_BACKUP/IBD/PR0006_NDHE04397-A5-A5_HMVCKCCXY_L5_2.fq.gz  NA  NA

如果这样更方便,您也可以运行下面的 HTML 代码;把渲染出的表格复制到文本编辑器后,各列即以制表符分隔。

<table>
    <thead>
        <tr>
            <td>I5_fastq_path_1</td>
            <td>I5_fastq_path_2</td>
            <td>I5_fastq_path_3</td>
            <td>I5_fastq_path_4</td>
            <td>I5_fastq_path_5</td>
            <td>I5_fastq_path_6</td>
            <td>I5_fastq_path_7</td>
            <td>I5_fastq_path_8</td>
        </tr>
    </thead>
    <tr>
        <td>/some/path/to/directory/PD7597b_1_1.fastq.gz</td>
        <td>/some/path/to/directory/PD7597b_1_2.fastq.gz</td>
        <td>/some/path/to/directory/PD7597b_2_1.fastq.gz</td>
        <td>/some/path/to/directory/PD7597b_2_2.fastq.gz</td>
        <td>/some/path/to/directory/PD7597b_3_1.fastq.gz</td>
        <td>/some/path/to/directory/PD7597b_3_2.fastq.gz</td>
        <td>NA</td>
        <td>NA</td>
    </tr>
    <tr>
        <td>/some/path/to/directory/WTCHG_65902_709501_1.fastq.gz</td>
        <td>/some/path/to/directory/WTCHG_65902_709501_2.fastq.gz</td>
        <td>/some/path/to/directory/WTCHG_68106_709501_1.fastq.gz</td>
        <td>/some/path/to/directory/WTCHG_68106_709501_2.fastq.gz</td>
        <td>/some/path/to/directory/WTCHG_68107_709501_1.fastq.gz</td>
        <td>/some/path/to/directory/WTCHG_68107_709501_2.fastq.gz</td>
        <td>/some/path/to/directory/WTCHG_68108_709501_1.fastq.gz</td>
        <td>/some/path/to/directory/WTCHG_68108_709501_1.fastq.gz</td>
    </tr>
    <tr>
        <td>/some/path/to/directory/WTCHG_65902_702501_1.fastq.gz</td>
        <td>/some/path/to/directory/WTCHG_65902_702501_2.fastq.gz</td>
        <td>/some/path/to/directory/WTCHG_68106_702501_1.fastq.gz</td>
        <td>/some/path/to/directory/WTCHG_68106_702501_2.fastq.gz</td>
        <td>/some/path/to/directory/WTCHG_68107_702501_1.fastq.gz</td>
        <td>/some/path/to/directory/WTCHG_68107_702501_2.fastq.gz</td>
        <td>NA</td>
        <td>NA</td>
    </tr>
    <tr>
        <td>/some/path/to/directory/WTCHG_87945_712502_1.fastq.gz</td>
        <td>/some/path/to/directory/WTCHG_87945_712502_2.fastq.gz</td>
        <td>/some/path/to/directory/WTCHG_88506_712502_1.fastq.gz</td>
        <td>/some/path/to/directory/WTCHG_88506_712502_2.fastq.gz</td>
        <td>/some/path/to/directory/WTCHG_88507_712502_1.fastq.gz</td>
        <td>/some/path/to/directory/WTCHG_88507_712502_2.fastq.gz</td>
        <td>NA</td>
        <td>NA</td>
    </tr>
    <tr>
        <td>/some/path/to/directory/IFS003-W_DHE04956-7_HW3LFCCXX_L5_1.fq.gz</td>
        <td>/some/path/to/directory/IFS003-W_DHE04956-7_HW3LFCCXX_L5_2.fq.gz</td>
        <td>/some/path/to/directory/IFS003-W_DHE04956-7_HW3LFCCXX_L6_1.fq.gz</td>
        <td>/some/path/to/directory/IFS003-W_DHE04956-7_HW3LFCCXX_L6_2.fq.gz</td>
        <td>NA</td>
        <td>NA</td>
        <td>NA</td>
        <td>NA</td>
    </tr>
    <tr>
        <td>/some/path/to/directory/WTCHG_65902_710501_1.fastq.gz</td>
        <td>/some/path/to/directory/WTCHG_65902_710501_2.fastq.gz</td>
        <td>/some/path/to/directory/WTCHG_68106_710501_1.fastq.gz</td>
        <td>/some/path/to/directory/WTCHG_68106_710501_2.fastq.gz</td>
        <td>/some/path/to/directory/WTCHG_68107_710501_1.fastq.gz</td>
        <td>/some/path/to/directory/WTCHG_68107_710501_2.fastq.gz</td>
        <td>NA</td>
        <td>NA</td>
    </tr>
    <tr>
        <td>/some/path/to/directory/NG178_S1_R1_001.fastq.gz</td>
        <td>/some/path/to/directory/NG178_S1_R2_001.fastq.gz</td>
        <td>/some/path/to/directory/NG178_S3_R1_001.fastq.gz</td>
        <td>/some/path/to/directory/NG178_S3_R2_001.fastq.gz</td>
        <td>NA</td>
        <td>NA</td>
        <td>NA</td>
        <td>NA</td>
    </tr>
    <tr>
        <td>/some/path/to/directory/NG232_S8_R1_001.fastq.gz</td>
        <td>/some/path/to/directory/NG232_S8_R2_001.fastq.gz</td>
        <td>/some/path/to/directory/NG232_S2_R1_001.fastq.gz</td>
        <td>/some/path/to/directory/NG232_S2_R2_001.fastq.gz</td>
        <td>NA</td>
        <td>NA</td>
        <td>NA</td>
        <td>NA</td>
    </tr>
    <tr>
        <td>/some/path/to/directory/NG367_S19_R1_001.fastq.gz</td>
        <td>/some/path/to/directory/NG367_S19_R2_001.fastq.gz</td>
        <td>/some/path/to/directory/NG367_S6_R1_001.fastq.gz</td>
        <td>/some/path/to/directory/NG367_S6_R2_001.fastq.gz</td>
        <td>NA</td>
        <td>NA</td>
        <td>NA</td>
        <td>NA</td>
    </tr>
    <tr>
        <td>/research/HGIGData/2022_NGS_BACKUP/IBD/PR0006_NDHE04397-A5-A5_HMTN3CCXY_L1_1.fq.gz</td>
        <td>/research/HGIGData/2022_NGS_BACKUP/IBD/PR0006_NDHE04397-A5-A5_HMTN3CCXY_L1_2.fq.gz</td>
        <td>/research/HGIGData/2022_NGS_BACKUP/IBD/PR0006_NDHE04397-A5-A5_HMTVYCCXY_L3_1.fq.gz</td>
        <td>/research/HGIGData/2022_NGS_BACKUP/IBD/PR0006_NDHE04397-A5-A5_HMTVYCCXY_L3_2.fq.gz</td>
        <td>/research/HGIGData/2022_NGS_BACKUP/IBD/PR0006_NDHE04397-A5-A5_HMVCKCCXY_L5_1.fq.gz</td>
        <td>/research/HGIGData/2022_NGS_BACKUP/IBD/PR0006_NDHE04397-A5-A5_HMVCKCCXY_L5_2.fq.gz</td>
        <td>NA</td>
        <td>NA</td>
    </tr>
</table>

如果每行有 4 列,则合并第 1 列和第 3 列中的文件,并合并第 2 列和第 4 列中的文件。

如果每行有 6 列,则合并第 1、3、5 列中的文件,并合并第 2、4、6 列中的文件。

此模式最多可延续到 8 列。请注意,并非所有列都具有相同数量的文件,但它们始终是偶数。

我已经成功地完成了 4 列,但我想要一种方法来为 6、8 甚至更多列执行此操作,而无需对其进行硬编码...我的硬编码尝试如下:

# Hard-coded attempt: merges columns 1+3 and columns 2+4 only, so it is
# correct just for rows that hold exactly four files.
{
  # Skip the header row of data.tsv so it is not treated as file paths
  read -r _

  # Split each row on tabs; anything after the fourth field is ignored.
  # Tab is IFS whitespace, so empty fields collapse: missing files must be
  # written as a placeholder such as NA, never left blank.
  while IFS=$'\t' read -r path1 path2 path3 path4 _rest; do
    # Extract sample ID: file name without directory and extensions,
    # with the first '_1' replaced by '_merged'
    ID=${path1##*/}
    ID=${ID%%.*}
    ID=${ID/_1/_merged}

    # Merge files (quoted so paths are never word-split or globbed)
    cat -- "$path1" "$path3" > "/research/merged_fq/${ID}_1.fq.gz"
    cat -- "$path2" "$path4" > "/research/merged_fq/${ID}_2.fq.gz"

    # Append the two new file paths, space-separated, to new_files
    a="/research/merged_fq/${ID}_1.fq.gz"
    b="/research/merged_fq/${ID}_2.fq.gz"
    printf '%s %s\n' "$a" "$b" >> new_files
  done
} < data.tsv

我修正了你的整个脚本,并在各处添加了注释,方便你了解它的工作细节:

#!/usr/bin/env bash
#
# Merge the fastq files listed in data.tsv row by row: files in odd columns
# (1, 3, 5, ...) are concatenated into one file and files in even columns
# (2, 4, 6, ...) into another. The merged file names are logged to
# merged_files.tsv.

# Path where to store merged fq files
mergePath='/research/merged_fq'

#######################################
# Merge odd- and even-column files for every row read from stdin.
# Arguments: $1 - directory in which the merged files are written
#                 (it is not created here)
# Inputs:    stdin - one header row, then one row per sample holding
#            whitespace-separated file paths; 'NA' ends a row's file list
# Outputs:   stdout - header line plus one tab-separated line per merged row
#            stderr - a note for every row that was skipped
#######################################
merge_fastq_rows() {
  local mergeDir=$1
  local row i
  local oddBaseName oddID oddSuffix oddMerge
  local evenBaseName evenID evenSuffix evenMerge
  local -a files oddFiles evenFiles

  # Print header for merged files tsv
  printf 'MergedOddFile\tMergedEvenFile\n'

  # Read in dummy variable to skip header row of the input table
  read -r _

  # Iterate reading rows of data until End Of File; the extra test keeps a
  # final row that has no trailing newline
  while IFS= read -r row || [[ -n "$row" ]]; do
    # Map row fields to files array
    read -ra files <<<"$row"

    # Initialize arrays for odd and even files
    oddFiles=()
    evenFiles=()

    # Iterate the files array indexes
    for i in "${!files[@]}"; do

      # Stop processing files when name is 'NA' or empty
      [[ 'NA' == "${files[i]}" || -z "${files[i]}" ]] && break

      # Add file to corresponding merge array (index 0 is column 1, so
      # even indexes are odd columns)
      if ((i % 2)); then
        evenFiles+=("${files[i]}")
      else
        oddFiles+=("${files[i]}")
      fi
    done

    # Without this guard 'cat' would get no operands and read stdin
    # instead, swallowing every remaining row of the table
    if ((${#oddFiles[@]} == 0 || ${#evenFiles[@]} == 0)); then
      printf 'Skipping row without a complete file pair: %s\n' "$row" >&2
      continue
    fi

    # Extract sample ID _merged, suffix for odd and even fastq files

    # Remove path to get file name only
    oddBaseName="${oddFiles[0]##*/}"

    # Remove everything from after the last '_' and add '_merged'
    oddID="${oddBaseName%_*}_merged"

    # Remove everything until the last '_'
    oddSuffix="${oddBaseName##*_}"

    # Strip all the extensions like .fq.gz
    oddSuffix="${oddSuffix%%.*}"

    # Same for even
    evenBaseName="${evenFiles[0]##*/}"
    evenID="${evenBaseName%_*}_merged"
    evenSuffix="${evenBaseName##*_}"
    evenSuffix="${evenSuffix%%.*}"

    # Compose merged file names
    oddMerge="${mergeDir}/${oddID}_${oddSuffix}.fq.gz"
    evenMerge="${mergeDir}/${evenID}_${evenSuffix}.fq.gz"

    # Merge odd and even files separately
    cat -- "${oddFiles[@]}" > "$oddMerge"
    cat -- "${evenFiles[@]}" > "$evenMerge"

    # Log merged files
    printf '%s\t%s\n' "$oddMerge" "$evenMerge"
  done
}

merge_fastq_rows "$mergePath" < data.tsv > merged_files.tsv