Java 将 avro 文件转换为文本文件的程序

Java program to convert avro files to text file

我需要一个 Java 程序方面的帮助:该程序以 Avro 架构文件(.avsc)和 Avro 数据文件作为输入,并用 Java 将其转换为文本文件。

下面这段 Java 代码对我有用,希望对其他人也有帮助。除了下面列出的导入之外,还需要:import java.io.*; import java.util.*;

import org.apache.avro.*;
import org.apache.avro.generic.*;
import org.apache.avro.file.*;
import org.apache.avro.io.*;

public class AvrotoTextFormatter

{

public static void main ( String args[]) throws Exception

{

    InputStream in = null;
    in = new FileInputStream(args[0]);
    BufferedReader br;
    BufferedInputStream inStream = new BufferedInputStream(in);
    PrintWriter pr1 = new PrintWriter(args[1], "UTF-8");
    PrintWriter pr = new PrintWriter(args[2], "UTF-8");
    StringTokenizer st;
    StringTokenizer st1;
    int row_counter = 0;
    String header_fields = "";
    String content_records = "";
    String sCurrentLine = "";


    GenericDatumReader<Object> reader = new GenericDatumReader<Object>();
    DataFileStream<Object> fileReader = new DataFileStream<Object>(inStream, reader);



    pr1.println(fileReader.getSchema().getFields());

    pr1.close();

    br = new BufferedReader(new java.io.FileReader(args[1]));


    while ((sCurrentLine = br.readLine()) != null)
    {
    st = new StringTokenizer(sCurrentLine," ");
    while (st.hasMoreTokens())
    {
        header_fields = header_fields + st.nextToken() + "|";
        st.nextToken();
        st.nextToken();             
    }        
    }
    header_fields = header_fields.substring(1,header_fields.length()-1); 
    pr.println(header_fields);


    File file = new File(args[0]);
    DatumReader<GenericRecord> datumReader = new GenericDatumReader<GenericRecord>(fileReader.getSchema());
    DataFileReader<GenericRecord> dataFileReader = new DataFileReader<GenericRecord>(file, datumReader);
    GenericRecord user = null;
        while (dataFileReader.hasNext())
           {
              content_records = "";
              user = dataFileReader.next(user);
              st1 = new StringTokenizer(header_fields,"|");
              while (st1.hasMoreTokens())
            {
                content_records = content_records + user.get(st1.nextToken()) + "|";              
                }
    content_records = content_records.substring(0,content_records.length()-1);
    pr.println(content_records); 
            } 
    fileReader.close();
    br.close();
    pr.close();  

}

}

我发现用 Spark 完成同样的操作更快、更简单。分享下面的代码片段,希望对其他人有帮助。

import org.apache.avro.mapreduce.AvroKeyInputFormat
import org.apache.avro.mapred.AvroKey
import org.apache.hadoop.io.NullWritable
// Location of the Avro data file to convert; replace with your own path.
val inputPath = "/sit/data/presentation/bbsbi/alayer/test/000000_0"

// Read the file as Avro generic records. The record type is spelled out in full
// so that no additional import is required.
// NOTE: Hadoop input formats may reuse the same key object for every record, so
// each record is converted to a String right away (below) and this RDD is not cached.
val avroRecords = sc.newAPIHadoopFile(
  inputPath,
  classOf[AvroKeyInputFormat[org.apache.avro.generic.GenericRecord]],
  classOf[AvroKey[org.apache.avro.generic.GenericRecord]],
  classOf[NullWritable]).keys

// JSON text of each record (the record's toString).
val avroRdd = avroRecords.map(_.toString)

// One pipe-delimited line per record, built from the field values themselves
// rather than by splitting the JSON text on "," and ":". That splitting broke on
// values containing those characters and threw on unquoted values such as numbers or null.
val n = avroRecords.map { key =>
  val record = key.datum()
  val fieldCount = record.getSchema.getFields.size
  (0 until fieldCount).map(i => String.valueOf(record.get(i))).mkString("|")
}
n.collect.foreach(println)

下面的代码适合我

/**
 * Reads every record of an Avro data file and returns them as a JSON array.
 *
 * <p>Each record is converted through its {@code toString()} text, which is then
 * parsed into a {@code JSONObject}.
 *
 * @param absFilePath path of the Avro data file to read
 * @return an array holding one JSON object per record, in file order
 * @throws IOException if the file cannot be opened or read
 * @throws InterruptedException never thrown by this implementation; kept in the
 *         signature so that existing callers which catch it still compile
 */
private static JSONArray readJsonFromAvro(String absFilePath)throws IOException,
InterruptedException
{
    JSONArray jsonarray = new JSONArray();
    // Both resources are opened inside try-with-resources, so the file stream is
    // closed even when the DataFileStream constructor rejects the input.
    try (InputStream inStream = new BufferedInputStream(new FileInputStream(absFilePath));
         DataFileStream<Object> fileReader =
             new DataFileStream<Object>(inStream, new GenericDatumReader<Object>())) {
        for (final Object datum : fileReader) {
            jsonarray.put(new JSONObject(datum.toString()));
        }
    }
    return jsonarray;
}

我该如何运行下面的程序?我的意思是,应该提供哪 3 个命令行参数?

import org.apache.avro.*;
import org.apache.avro.generic.*;
import org.apache.avro.file.*;
import org.apache.avro.io.*;

public class AvrotoTextFormatter

{

public static void main ( String args[]) throws Exception

{

    InputStream in = null;
    in = new FileInputStream(args[0]);
    BufferedReader br;
    BufferedInputStream inStream = new BufferedInputStream(in);
    PrintWriter pr1 = new PrintWriter(args[1], "UTF-8");
    PrintWriter pr = new PrintWriter(args[2], "UTF-8");
    StringTokenizer st;
    StringTokenizer st1;
    int row_counter = 0;
    String header_fields = "";
    String content_records = "";
    String sCurrentLine = "";


    GenericDatumReader<Object> reader = new GenericDatumReader<Object>();
    DataFileStream<Object> fileReader = new DataFileStream<Object>(inStream, reader);



    pr1.println(fileReader.getSchema().getFields());

    pr1.close();

    br = new BufferedReader(new java.io.FileReader(args[1]));


    while ((sCurrentLine = br.readLine()) != null)
    {
    st = new StringTokenizer(sCurrentLine," ");
    while (st.hasMoreTokens())
    {
        header_fields = header_fields + st.nextToken() + "|";
        st.nextToken();
        st.nextToken();             
    }        
    }
    header_fields = header_fields.substring(1,header_fields.length()-1); 
    pr.println(header_fields);


    File file = new File(args[0]);
    DatumReader<GenericRecord> datumReader = new GenericDatumReader<GenericRecord>(fileReader.getSchema());
    DataFileReader<GenericRecord> dataFileReader = new DataFileReader<GenericRecord>(file, datumReader);
    GenericRecord user = null;
        while (dataFileReader.hasNext())
           {
              content_records = "";
              user = dataFileReader.next(user);
              st1 = new StringTokenizer(header_fields,"|");
              while (st1.hasMoreTokens())
            {
                content_records = content_records + user.get(st1.nextToken()) + "|";              
                }
    content_records = content_records.substring(0,content_records.length()-1);
    pr.println(content_records); 
            } 
    fileReader.close();
    br.close();
    pr.close();  

}

}