Apache Pig 过滤元组中的空值或文字
Apache Pig filtering null values or literals in tuple
我编写了下面的 Pig UDF 来测试 chararray 列是否具有有效的 'yyyy-MM-dd' 日期格式。但是在使用以下脚本进行测试时,出现以下错误。数据是否有任何问题,因为我正在处理空元组以考虑数据中的 NULL 值以及不存在的值。另外,我是否应该删除数据文件中的空行?
Caused by: org.apache.pig.backend.executionengine.ExecException: ERROR 2114: Expected input to be chararray, but got NULL
at IsValidDateTime.exec(IsValidDateTime.java:41)
at IsValidDateTime.exec(IsValidDateTime.java:18)
dates.txt
2019-12-27,2020-08-20
2017-05-09,2018-10-04
2016-09-25,2020-01-19
,2020-08-20
NULL,2017-09-28
2016-11-15,NULL
2018-04-17,Thu Aug-20 2020
2017-05-09,2020-08-20
Mon Jan-20 2020,2020-08-20
<empty line>
------------------------------------------
dates_valid (expected all valid 'yyyy-MM-dd' start_dt and end_dt)
2019-12-27,2020-08-20
2017-05-09,2018-10-04
2016-09-25,2020-01-19
2017-05-09,2020-08-20
Pig script
REGISTER 'IsValidDateTime.jar'
DEFINE IsValidDateTime IsValidDateTime();
dates = LOAD 'dates.txt' USING PigStorage(',') AS (start_dt:chararray, end_dt:chararray);
DUMP dates;
dates_valid = FILTER dates BY (IsValidDateTime(start_dt) AND IsValidDateTime(end_dt));
DUMP dates_valid;
IsValidDateTime Filter UDF
import org.apache.pig.FilterFunc;
import org.apache.pig.PigException;
import org.apache.pig.backend.executionengine.ExecException;
import org.apache.pig.data.DataType;
import org.apache.pig.data.Tuple;
import java.io.IOException;
import java.text.ParseException;
import java.text.SimpleDateFormat;
public class IsValidDateTime extends FilterFunc {
private static String datePattern = "yyyy-MM-dd";
public Boolean exec(Tuple input) throws IOException {
if (input == null || input.size() == 0)
return false;
try {
Object date = input.get(0);
if(DataType.findType(date) == DataType.CHARARRAY){
String dateStr = String.valueOf(date);
if(dateStr != null && dateStr.length() != 0) {
try {
SimpleDateFormat format = new SimpleDateFormat(datePattern);
format.setLenient(false);
format.parse(dateStr);
} catch (ParseException | IllegalArgumentException e) {
return false; //date string does not match 'yyyy-MM-dd' format
}
return true; //date string is of valid format 'yyyy-MM-dd'
}
return false; //empty or null date string
} else {
int errCode = 2114;
String msg = "Expected input to be chararray, but got " + DataType.findTypeName(date) ;
throw new ExecException(msg, errCode, PigException.BUG);
}
} catch(ExecException ee) {
throw ee;
}
}
}
删除了外部 if-condition 检查 DataType.CHARARRAY,只需将输入元组中的值放入字符串中并检查是否为 null 或为空。这是唯一需要的条件。下面是最终代码。
import org.apache.pig.FilterFunc;
import org.apache.pig.backend.executionengine.ExecException;
import org.apache.pig.data.Tuple;
import java.io.IOException;
import java.text.ParseException;
import java.text.SimpleDateFormat;
public class IsValidDateTime extends FilterFunc {
private static String datePattern = "yyyy-MM-dd";
public Boolean exec(Tuple input) throws IOException {
try {
String date = (String)input.get(0);
if(date != null && date.length() != 0) {
try {
SimpleDateFormat format = new SimpleDateFormat(datePattern);
format.setLenient(false);
format.parse(date);
} catch (ParseException | IllegalArgumentException e) {
return false; //date string does not match 'yyyy-MM-dd' format
}
return true; //date string is of valid format 'yyyy-MM-dd'
} else {
return false; //empty or null date string
}
} catch(ExecException ee) {
throw ee;
}
}
}
Getting the expected output
(2019-12-27,2020-08-20)
(2017-05-09,2018-10-04)
(2016-09-25,2020-01-19)
(2017-05-09,2020-08-20)
我编写了下面的 Pig UDF 来测试 chararray 列是否具有有效的 'yyyy-MM-dd' 日期格式。但是在使用以下脚本进行测试时,出现以下错误。数据是否有任何问题,因为我正在处理空元组以考虑数据中的 NULL 值以及不存在的值。另外,我是否应该删除数据文件中的空行?
Caused by: org.apache.pig.backend.executionengine.ExecException: ERROR 2114: Expected input to be chararray, but got NULL
at IsValidDateTime.exec(IsValidDateTime.java:41)
at IsValidDateTime.exec(IsValidDateTime.java:18)
dates.txt
2019-12-27,2020-08-20
2017-05-09,2018-10-04
2016-09-25,2020-01-19
,2020-08-20
NULL,2017-09-28
2016-11-15,NULL
2018-04-17,Thu Aug-20 2020
2017-05-09,2020-08-20
Mon Jan-20 2020,2020-08-20
<empty line>
------------------------------------------
dates_valid (expected all valid 'yyyy-MM-dd' start_dt and end_dt)
2019-12-27,2020-08-20
2017-05-09,2018-10-04
2016-09-25,2020-01-19
2017-05-09,2020-08-20
Pig script
REGISTER 'IsValidDateTime.jar'
DEFINE IsValidDateTime IsValidDateTime();
dates = LOAD 'dates.txt' USING PigStorage(',') AS (start_dt:chararray, end_dt:chararray);
DUMP dates;
dates_valid = FILTER dates BY (IsValidDateTime(start_dt) AND IsValidDateTime(end_dt));
DUMP dates_valid;
IsValidDateTime Filter UDF
import org.apache.pig.FilterFunc;
import org.apache.pig.PigException;
import org.apache.pig.backend.executionengine.ExecException;
import org.apache.pig.data.DataType;
import org.apache.pig.data.Tuple;
import java.io.IOException;
import java.text.ParseException;
import java.text.SimpleDateFormat;
public class IsValidDateTime extends FilterFunc {
private static String datePattern = "yyyy-MM-dd";
public Boolean exec(Tuple input) throws IOException {
if (input == null || input.size() == 0)
return false;
try {
Object date = input.get(0);
if(DataType.findType(date) == DataType.CHARARRAY){
String dateStr = String.valueOf(date);
if(dateStr != null && dateStr.length() != 0) {
try {
SimpleDateFormat format = new SimpleDateFormat(datePattern);
format.setLenient(false);
format.parse(dateStr);
} catch (ParseException | IllegalArgumentException e) {
return false; //date string does not match 'yyyy-MM-dd' format
}
return true; //date string is of valid format 'yyyy-MM-dd'
}
return false; //empty or null date string
} else {
int errCode = 2114;
String msg = "Expected input to be chararray, but got " + DataType.findTypeName(date) ;
throw new ExecException(msg, errCode, PigException.BUG);
}
} catch(ExecException ee) {
throw ee;
}
}
}
删除了外部 if-condition 检查 DataType.CHARARRAY,只需将输入元组中的值放入字符串中并检查是否为 null 或为空。这是唯一需要的条件。下面是最终代码。
import org.apache.pig.FilterFunc;
import org.apache.pig.backend.executionengine.ExecException;
import org.apache.pig.data.Tuple;
import java.io.IOException;
import java.text.ParseException;
import java.text.SimpleDateFormat;
public class IsValidDateTime extends FilterFunc {
private static String datePattern = "yyyy-MM-dd";
public Boolean exec(Tuple input) throws IOException {
try {
String date = (String)input.get(0);
if(date != null && date.length() != 0) {
try {
SimpleDateFormat format = new SimpleDateFormat(datePattern);
format.setLenient(false);
format.parse(date);
} catch (ParseException | IllegalArgumentException e) {
return false; //date string does not match 'yyyy-MM-dd' format
}
return true; //date string is of valid format 'yyyy-MM-dd'
} else {
return false; //empty or null date string
}
} catch(ExecException ee) {
throw ee;
}
}
}
Getting the expected output
(2019-12-27,2020-08-20)
(2017-05-09,2018-10-04)
(2016-09-25,2020-01-19)
(2017-05-09,2020-08-20)