Pig 抛出不兼容的类型错误
Pig throwing incompatible type error
我正在使用以下代码，通过 DataFu 的 Sessionize UDF 在 Pig 中生成 sessionId。
-- Purpose: build per-(member, browser) sessions from one day of page-view events
-- by joining them to the page-key dimension and running the Sessionize UDF.
-- Job-level settings: minimum input split size of 1073741824 bytes (2^30),
-- the 'marathon' job queue, and compressed job output.
SET mapred.min.split.size 1073741824
SET mapred.job.queue.name 'marathon'
SET mapred.output.compress true;
--SET avro.output.codec snappy;
--SET pig.maxCombinedSplitSize 536870912;
-- Load the page-view events (date range fixed to 20150226) and the page-key dimension.
page_view_pre = LOAD '/data/tracking/PageViewEvent/' USING LiAvroStorage('date.range','start.date=20150226;end.date=20150226;error.on.missing=true'); -----logic is currently for 2015-02-26,will later replace them with date parameters
p_key = LOAD '/projects/dwh/dwh_dim/dim_page_key/#LATEST' USING LiAvroStorage();
-- Drop rows whose user agent or browser id is literally 'CRAWLER', and rows that
-- IsTestMemberId flags (UDF defined outside this snippet; presumably marks test members).
page_view_pre = FILTER page_view_pre BY (requestHeader.userAgent != 'CRAWLER' and requestHeader.browserId != 'CRAWLER') and NOT IsTestMemberId(header.memberId);
-- Project the nested header/requestHeader fields into a flat record.
page_view_pre = FOREACH page_view_pre GENERATE
-- Negative member ids are mapped to -9.
(int) (header.memberId <0 ? -9 : header.memberId ) as member_sk,
(chararray) requestHeader.browserId as browserId,
--(chararray) requestHeader.sessionId as sessionId,
(chararray) UnixToISO(header.time) as pageViewTime,
header.time as pv_time,
(chararray) requestHeader.path as path,
(chararray) requestHeader.referer as referer,
(chararray) epochToFormat(header.time, 'yyyyMMdd', 'America/Los_Angeles') as tracking_date,
(chararray) requestHeader.pageKey as pageKey,
-- SUBSTRING(..., 0, 500) keeps at most the first 500 characters of the tracking code.
(chararray) SUBSTRING(requestHeader.trackingCode, 0, 500) as trackingCode,
FLATTEN(botLookup(requestHeader.userAgent, requestHeader.browserId)) as (is_crawler, crawler_type),
(int) totalTime as totalTime,
-- bounce_flag is 1 when totalTime is below 20, otherwise 0.
((int) totalTime < 20 ? 1 :0) as bounce_flag;
-- Keep only rows that botLookup marked with is_crawler == 'N'.
page_view_pre = FILTER page_view_pre BY is_crawler == 'N' ;
-- Keep only page keys with is_aggregate == 1, then inner-join page views to them.
p_key = FILTER p_key By is_aggregate ==1;
page_view_agg = JOIN page_view_pre by pageKey ,p_key by page_key;
-- Re-project the joined relation with explicit casts; this yields 12 fields,
-- with member_sk first and member_sk now cast to chararray.
page_view_agg = FOREACH page_view_agg GENERATE
(chararray)page_view_pre::member_sk as member_sk,
(chararray)page_view_pre::browserId as browserId,
--page_view_pre::sessionId as sessionId,
(chararray)page_view_pre::pageViewTime as pageViewTime,
(long)page_view_pre::pv_time as pv_time,
(chararray)page_view_pre::tracking_date as tracking_date,
(chararray)page_view_pre::path as path,
(chararray)page_view_pre::referer as referer,
(chararray)page_view_pre::pageKey as pageKey,
(int)p_key::page_key_sk as page_key_sk,
(chararray)page_view_pre::trackingCode as trackingCode,
(int)page_view_pre::totalTime as totalTime,
(int)page_view_pre::bounce_flag as bounce_flag;
-- Keep rows where at least one of member_sk / browserId is non-null.
page_view_agg = FILTER page_view_agg By (member_sk is NOT null) OR (browserId IS NOT NULL) ;
pvs_by_member_browser_pair = GROUP page_view_agg BY (member_sk,browserId);
-- The '***' markers below come from the original post's bold formatting; they are not Pig syntax.
-- For each (member_sk, browserId) group: order the page views by pv_time, run Sessionize
-- (DEFINE not shown in this snippet), and flatten the resulting bag.
-- NOTE(review): the AS clause names 12 fields, while the reported ERROR 1031 shows the UDF
-- output has 13 fields (the 12 fields of page_view_agg in their projected order, including
-- bounce_flag, plus session_id). The count and the order both differ from the AS clause.
-- NOTE(review): DataFu's Sessionize documentation expects the timestamp to be the first
-- field of each tuple; here the first field is member_sk -- confirm against the UDF docs.
***session_groups = FOREACH pvs_by_member_browser_pair {
visits = ORDER page_view_agg BY pv_time;
GENERATE FLATTEN(Sessionize(visits)) AS (
pageViewTime,member_sk, pv_time,tracking_date, pageKey,page_key_sk,browserId,referer ,path, trackingCode,totalTime, sessionId
);
}***
粗体部分（即用 *** 标记的代码）报出以下错误：
ERROR 1031: Incompatable schema: left is "pageViewTime:NULL,member_sk:NULL,pv_time:NULL,tracking_date:NULL,pageKey:NULL,page_key_sk:NULL,browserId:NULL,referer:NULL,path:NULL,trackingCode:NULL,totalTime:NULL,sessionId:NULL", right is "datafu.pig.sessions.sessionize_visits_43::member_sk:chararray,datafu.pig.sessions.sessionize_visits_43::browserId:chararray,datafu.pig.sessions.sessionize_visits_43::pageViewTime:chararray,datafu.pig.sessions.sessionize_visits_43::pv_time:long,datafu.pig.sessions.sessionize_visits_43::tracking_date:chararray,datafu.pig.sessions.sessionize_visits_43::path:chararray,datafu.pig.sessions.sessionize_visits_43::referer:chararray,datafu.pig.sessions.sessionize_visits_43::pageKey:chararray,datafu.pig.sessions.sessionize_visits_43::page_key_sk:int,datafu.pig.sessions.sessionize_visits_43::trackingCode:chararray,datafu.pig.sessions.sessionize_visits_43::totalTime:int,datafu.pig.sessions.sessionize_visits_43::bounce_flag:int,datafu.pig.sessions.sessionize_visits_43::session_id:chararray"
我最初认为这与值为 null 的成员 id 或浏览器 id 有关，因此也对它们做了过滤，但错误仍然存在。我已经在这里卡了好几个小时，非常感谢任何解决此问题的建议或方案。
谢谢
这是模式不匹配的经典案例:
-- The LOAD statement from the question: reads PageViewEvent data for the fixed date range 20150226-20150226 through LiAvroStorage.
page_view_pre = LOAD '/data/tracking/PageViewEvent/' USING LiAvroStorage('date.range','start.date=20150226;end.date=20150226;error.on.missing=true'); -----logic is currently for 2015-02-26,will later replace them with date parameters
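只需在这一行之后添加 illustrate page_view_pre（或 DESCRIBE page_view_pre），
即可确定实际的模式（schema）。从错误信息可以看出：AS 子句只列出了 12 个字段，而 Sessionize 的实际输出有 13 个字段（包含 bounce_flag，且最后一个字段名为 session_id），字段顺序也与 AS 子句不同；让 AS 子句中的字段个数和顺序与实际输出的模式保持一致即可。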
我正在使用以下代码，通过 DataFu 的 Sessionize UDF 在 Pig 中生成 sessionId。
-- Purpose: build per-(member, browser) sessions from one day of page-view events
-- by joining them to the page-key dimension and running the Sessionize UDF.
-- Job-level settings: minimum input split size of 1073741824 bytes (2^30),
-- the 'marathon' job queue, and compressed job output.
SET mapred.min.split.size 1073741824
SET mapred.job.queue.name 'marathon'
SET mapred.output.compress true;
--SET avro.output.codec snappy;
--SET pig.maxCombinedSplitSize 536870912;
-- Load the page-view events (date range fixed to 20150226) and the page-key dimension.
page_view_pre = LOAD '/data/tracking/PageViewEvent/' USING LiAvroStorage('date.range','start.date=20150226;end.date=20150226;error.on.missing=true'); -----logic is currently for 2015-02-26,will later replace them with date parameters
p_key = LOAD '/projects/dwh/dwh_dim/dim_page_key/#LATEST' USING LiAvroStorage();
-- Drop rows whose user agent or browser id is literally 'CRAWLER', and rows that
-- IsTestMemberId flags (UDF defined outside this snippet; presumably marks test members).
page_view_pre = FILTER page_view_pre BY (requestHeader.userAgent != 'CRAWLER' and requestHeader.browserId != 'CRAWLER') and NOT IsTestMemberId(header.memberId);
-- Project the nested header/requestHeader fields into a flat record.
page_view_pre = FOREACH page_view_pre GENERATE
-- Negative member ids are mapped to -9.
(int) (header.memberId <0 ? -9 : header.memberId ) as member_sk,
(chararray) requestHeader.browserId as browserId,
--(chararray) requestHeader.sessionId as sessionId,
(chararray) UnixToISO(header.time) as pageViewTime,
header.time as pv_time,
(chararray) requestHeader.path as path,
(chararray) requestHeader.referer as referer,
(chararray) epochToFormat(header.time, 'yyyyMMdd', 'America/Los_Angeles') as tracking_date,
(chararray) requestHeader.pageKey as pageKey,
-- SUBSTRING(..., 0, 500) keeps at most the first 500 characters of the tracking code.
(chararray) SUBSTRING(requestHeader.trackingCode, 0, 500) as trackingCode,
FLATTEN(botLookup(requestHeader.userAgent, requestHeader.browserId)) as (is_crawler, crawler_type),
(int) totalTime as totalTime,
-- bounce_flag is 1 when totalTime is below 20, otherwise 0.
((int) totalTime < 20 ? 1 :0) as bounce_flag;
-- Keep only rows that botLookup marked with is_crawler == 'N'.
page_view_pre = FILTER page_view_pre BY is_crawler == 'N' ;
-- Keep only page keys with is_aggregate == 1, then inner-join page views to them.
p_key = FILTER p_key By is_aggregate ==1;
page_view_agg = JOIN page_view_pre by pageKey ,p_key by page_key;
-- Re-project the joined relation with explicit casts; this yields 12 fields,
-- with member_sk first and member_sk now cast to chararray.
page_view_agg = FOREACH page_view_agg GENERATE
(chararray)page_view_pre::member_sk as member_sk,
(chararray)page_view_pre::browserId as browserId,
--page_view_pre::sessionId as sessionId,
(chararray)page_view_pre::pageViewTime as pageViewTime,
(long)page_view_pre::pv_time as pv_time,
(chararray)page_view_pre::tracking_date as tracking_date,
(chararray)page_view_pre::path as path,
(chararray)page_view_pre::referer as referer,
(chararray)page_view_pre::pageKey as pageKey,
(int)p_key::page_key_sk as page_key_sk,
(chararray)page_view_pre::trackingCode as trackingCode,
(int)page_view_pre::totalTime as totalTime,
(int)page_view_pre::bounce_flag as bounce_flag;
-- Keep rows where at least one of member_sk / browserId is non-null.
page_view_agg = FILTER page_view_agg By (member_sk is NOT null) OR (browserId IS NOT NULL) ;
pvs_by_member_browser_pair = GROUP page_view_agg BY (member_sk,browserId);
-- The '***' markers below come from the original post's bold formatting; they are not Pig syntax.
-- For each (member_sk, browserId) group: order the page views by pv_time, run Sessionize
-- (DEFINE not shown in this snippet), and flatten the resulting bag.
-- NOTE(review): the AS clause names 12 fields, while the reported ERROR 1031 shows the UDF
-- output has 13 fields (the 12 fields of page_view_agg in their projected order, including
-- bounce_flag, plus session_id). The count and the order both differ from the AS clause.
-- NOTE(review): DataFu's Sessionize documentation expects the timestamp to be the first
-- field of each tuple; here the first field is member_sk -- confirm against the UDF docs.
***session_groups = FOREACH pvs_by_member_browser_pair {
visits = ORDER page_view_agg BY pv_time;
GENERATE FLATTEN(Sessionize(visits)) AS (
pageViewTime,member_sk, pv_time,tracking_date, pageKey,page_key_sk,browserId,referer ,path, trackingCode,totalTime, sessionId
);
}***
粗体部分（即用 *** 标记的代码）报出以下错误：
ERROR 1031: Incompatable schema: left is "pageViewTime:NULL,member_sk:NULL,pv_time:NULL,tracking_date:NULL,pageKey:NULL,page_key_sk:NULL,browserId:NULL,referer:NULL,path:NULL,trackingCode:NULL,totalTime:NULL,sessionId:NULL", right is "datafu.pig.sessions.sessionize_visits_43::member_sk:chararray,datafu.pig.sessions.sessionize_visits_43::browserId:chararray,datafu.pig.sessions.sessionize_visits_43::pageViewTime:chararray,datafu.pig.sessions.sessionize_visits_43::pv_time:long,datafu.pig.sessions.sessionize_visits_43::tracking_date:chararray,datafu.pig.sessions.sessionize_visits_43::path:chararray,datafu.pig.sessions.sessionize_visits_43::referer:chararray,datafu.pig.sessions.sessionize_visits_43::pageKey:chararray,datafu.pig.sessions.sessionize_visits_43::page_key_sk:int,datafu.pig.sessions.sessionize_visits_43::trackingCode:chararray,datafu.pig.sessions.sessionize_visits_43::totalTime:int,datafu.pig.sessions.sessionize_visits_43::bounce_flag:int,datafu.pig.sessions.sessionize_visits_43::session_id:chararray"
我最初认为这与值为 null 的成员 id 或浏览器 id 有关，因此也对它们做了过滤，但错误仍然存在。我已经在这里卡了好几个小时，非常感谢任何解决此问题的建议或方案。
谢谢
这是模式不匹配的经典案例:
-- The LOAD statement from the question: reads PageViewEvent data for the fixed date range 20150226-20150226 through LiAvroStorage.
page_view_pre = LOAD '/data/tracking/PageViewEvent/' USING LiAvroStorage('date.range','start.date=20150226;end.date=20150226;error.on.missing=true'); -----logic is currently for 2015-02-26,will later replace them with date parameters
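只需在这一行之后添加 illustrate page_view_pre（或 DESCRIBE page_view_pre），
即可确定实际的模式（schema）。从错误信息可以看出：AS 子句只列出了 12 个字段，而 Sessionize 的实际输出有 13 个字段（包含 bounce_flag，且最后一个字段名为 session_id），字段顺序也与 AS 子句不同；让 AS 子句中的字段个数和顺序与实际输出的模式保持一致即可。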