在比较数字时合并两个大文本文件

Merge two large textfiles while comparing numbers

我想使用 Java,根据两个大文本文件(每个大约 2 到 3 GB)中的数据生成一个大文本文件。我必须在比较这两个文件中的数字的同时,把它们合并成一个文件。其中一个文件包含如下信息:

    chr1  100  200  abcd  +
    chr2  150  227  abba  +
    .......................
    .......................

它只是一个 BED 文件(用于生物信息学)。另一个文件包含如下信息:

    >chr1:
    AATTTATTTATTTTATTTTTTTATTTACCCACCCCCCCATTATTTACCAGGGGAGGGATTT
    ATTTTTTTTTTTTTTTTTTTTCCCCCCCCCCCCCCCCCCCCCAATTTTTT...........
    .............................................................
    >chr2:
    ATTTTTTTATTTACCCACCCCCCCATTATTTACCAGGGGAGGGATTTCCCCCCCCCCCCCC
    ATTTTTTTTTTTTTTTTTTTTCCCCCCCCCCCCCCCCCCCCCAATTTTTT...........
    .............................................................
    >chr3:
    AATTTATTTATTTTATTTTTTTATTTACCCACCCCCCCATTATTTACCAGGGGAGGGATTT
    ATTTTTTTTTTTTTTTTTTTTCCCCCCCCCCCCCCCCCCCCCAATTTTTT...........
    .............................................................

它只是一个 FASTA 文件(用于生物信息学)。我需要做的是:从 BED 文件中取出一行,根据该行第二列和第三列给出的起始和结束位置,从 FASTA 文件中提取对应染色体上该区间的序列,并生成如下文件:

    chr1  100  200  abcd  +  ATTTATCC.....ATTT
    chr2  150  227  abba  +  TTATCC.....ATTTCC
    ..........................................
    ..........................................

我可以用小文件来做,而且很有效。我拆分每个输入文件的行并将它们存储在两个 ArrayList 中。然后,我比较两个 ArrayList 的元素。如果元素匹配,我合并两个文件的特定行。

这是我的适用于小文件的代码:

import java.io.File;
import java.io.FileNotFoundException;
import java.io.FileWriter;
import java.io.IOException;
import java.util.ArrayList;
import java.util.Scanner;

/**
 * Joins records of {@code region.txt} with lines of {@code specific.txt} and writes the
 * merged lines to {@code result.txt}.
 *
 * <p>Both inputs are read as flat lists of whitespace-separated tokens and are walked in
 * strides of {@link #RECORD_WIDTH} tokens. For a region record, token 0 is the chromosome
 * name, tokens 1 and 2 are the inclusive start/end of the interval and token 3 is the region
 * label. For a {@code specific.txt} record, token 0 is the chromosome name and token 1 is the
 * number tested against the interval.
 *
 * <p>Everything is held in memory, so this approach only suits inputs that fit in the heap.
 */
public class RetrieveFromTwoFile{

    /**
     * Number of tokens per record in both input files. The join also assumes that every
     * line of {@code specific.txt} holds exactly this many tokens, because a token offset is
     * mapped back to a line by stepping one line per record.
     */
    private static final int RECORD_WIDTH = 6;

    /**
     * Reads a file and returns all of its whitespace-separated tokens in file order.
     *
     * @param f1 path of the file to read
     * @return every non-empty token of the file
     * @throws FileNotFoundException if {@code f1} cannot be opened
     */
    private static ArrayList<String> store(String f1) throws FileNotFoundException{
        ArrayList<String> list = new ArrayList<String>();
        // try-with-resources: the Scanner (and the underlying file handle) is always released.
        try (Scanner read = new Scanner(new File(f1))) {
            while (read.hasNext()) {
                // "\\s+" is the regex for a run of any whitespace (spaces AND tabs).
                String[] sts = read.nextLine().split("\\s+");
                for (String token : sts) {
                    // A line with leading whitespace yields one empty leading token; drop it.
                    if (!token.isEmpty()) {
                        list.add(token);
                    }
                }
            }
        }
        return list;
    }

    /**
     * Reads a file and returns its lines in file order.
     *
     * @param f1 path of the file to read
     * @return the lines of the file, without line terminators
     * @throws FileNotFoundException if {@code f1} cannot be opened
     */
    private static ArrayList<String> storeLine(String f1) throws FileNotFoundException{
        ArrayList<String> list1 = new ArrayList<String>();
        try (Scanner read = new Scanner(new File(f1))) {
            // hasNext() (rather than hasNextLine()) stops once only whitespace remains,
            // so trailing blank lines are not stored.
            while (read.hasNext()) {
                list1.add(read.nextLine());
            }
        }
        return list1;
    }

    /**
     * Writes each element of {@code out} to {@code fname}, one per line.
     *
     * @param out   lines to write
     * @param fname path of the output file; it is created or overwritten
     * @throws IOException if the file cannot be opened or written
     */
    private static void writer(ArrayList<String> out,String fname) throws IOException{
        // try-with-resources: the writer is closed (and flushed) even if a write fails.
        try (FileWriter writr = new FileWriter(new File(fname))) {
            for (String line : out) {
                writr.write(line + "\n");
            }
        }
    }

    /**
     * Loads {@code region.txt} and {@code specific.txt} from the working directory, pairs
     * every region with each {@code specific.txt} line that has the same chromosome name and
     * whose number lies inside the region's interval, and writes the merged lines
     * ({@code <region label> TAB <specific.txt line>}) to {@code result.txt}.
     *
     * @param args unused
     * @throws Exception if a file cannot be read or written, or a numeric token is malformed
     */
    public static void main(String [] args) throws Exception{
        ArrayList<String> finl = new ArrayList<String>();
        ArrayList<String> file1 = store("region.txt");       // tokens of region.txt
        ArrayList<String> file2 = store("specific.txt");     // tokens of specific.txt
        ArrayList<String> file3 = storeLine("specific.txt"); // whole lines of specific.txt

        // One iteration per region record.
        for (int i = 0; i < file1.size(); i += RECORD_WIDTH) {
            long initial = Long.parseLong(file1.get(i + 1));
            long end = Long.parseLong(file1.get(i + 2));
            String chrom = file1.get(i);
            System.out.println("chrome for file1 : "+chrom);
            String region = file1.get(i + 3);
            System.out.println("region for file1 : "+region);

            // x walks the token list of specific.txt, line walks its line list in lockstep.
            for (int x = 0, line = 0; x < file2.size() && line < file3.size(); x += RECORD_WIDTH, line++) {
                long res = Long.parseLong(file2.get(x + 1)); // number to test against [initial, end]
                String match = file2.get(x);
                System.out.println("chrom type : "+chrom+" "+match);

                if (match.equals(chrom)) {
                    System.out.println("hi");
                    if (res >= initial && res <= end) {
                        System.out.println("hi1");
                        String ress = file3.get(line);
                        // Merged output line: region label, a tab, then the specific.txt line.
                        String finress = region + "\t" + ress;
                        System.out.println("Initial : "+initial+" end : "+end+" item :"+res);

                        System.out.println("The item is :"+ress);

                        finl.add(finress);
                        System.out.println("The item is :"+finress);
                    }
                }
            }
            System.out.println("h2i");
        }

        for (int i = 0; i < finl.size(); i++) {
            System.out.println("******* item is**** :"+finl.get(i));
        }
        writer(finl, "result.txt");
    }
}

您可以尝试增大虚拟内存的大小。如果程序适用于小文件而不适用于大文件,那么很可能是运行时内存不足。每个文件 2-3 GB,对于文本文件来说确实很大。