Hive SQL:如何按前缀计算唯一字符串
Hive SQL: How to count unique strings by prefix
假设有如下一张表:
text
----
h
he
hel // All above are prefixes
helll123 // hel is a prefix of helll123; this is the first occurrence of helll123
helll123 // second helll123
f
fa
fals
falst0 // fals is a prefix of falst0
下面的查询是伪代码,用来演示我想要实现的效果:
-- Pseudo code: unique_by_prefix() is a hypothetical function, not a Hive built-in.
-- It stands for "keep only the strings that are not a prefix of another string".
SELECT
    unique_by_prefix(text) AS unique_text,
    COUNT(*)
FROM
    my_table
-- Group by the expression itself: a positional GROUP BY 1 is only honoured by
-- Hive when position aliases are enabled in the session configuration.
GROUP BY
    unique_by_prefix(text)
应该生成以下结果
unique_text count
helll123 2
falst0 1
也就是说,忽略那些只是其他文本前缀的字符串,只统计剩下的唯一文本。
我认为您无法通过一次查询在 Hive 中完成此操作。
这是一种可能性:
-- Count every text in t that is not a proper prefix of some other text in t.
-- Note: LIKE treats any '%' or '_' contained in t.text as a wildcard.
select t.text, count(*)
from t
where not exists (select 1
                  from t t2
                  -- t2 is a different string that starts with t.text
                  where t2.text <> t.text and t2.text like concat(t.text, '%')
                 )
group by t.text;
尽管这表达了所需的逻辑,但我怀疑 Hive 要求关联子查询中的关联条件必须是等值比较。
这是一种方法。
-- T-SQL (SQL Server) script, not HiveQL: it relies on SELECT ... INTO,
-- IDENTITY columns, @variables and WHILE loops.

-- Step 1: collect the distinct texts and number them with an identity column.
select distinct text into my_table1 from my_table;
alter table my_table1 add sno int identity;

-- Step 2: for each distinct text, store how many distinct texts start with it.
create table my_table2 (text varchar(max), counter int);

declare @i int = 0;
-- my_table1 is not modified inside the loop, so its row count is read once.
declare @total int = (select count(*) from my_table1);
declare @text varchar(max);

while (@i < @total)
begin
    set @i = @i + 1;
    set @text = (select text from my_table1 where sno = @i);
    -- counter = 1 means only the text itself matches, i.e. it is not a
    -- prefix of any other distinct text.
    insert into my_table2 (text, counter)
    values (@text, (select count(*) from my_table1 where text like @text + '%'));
end;

-- Step 3: count occurrences in the original table of the non-prefix texts.
-- The filter on B.counter discards unmatched rows, so this is an inner join.
select A.text, count(*)
from my_table A
inner join my_table2 B on A.text = B.text
where B.counter = 1
group by A.text;
窗口函数
https://cwiki.apache.org/confluence/display/Hive/LanguageManual+WindowingAndAnalytics
-- For every row, expose the neighbouring text values in sorted order:
-- next_text is the following row's text, pre_text the preceding row's.
SELECT
    text,
    LEAD(text) OVER w AS next_text,
    LAG(text) OVER w AS pre_text
FROM my_table
WINDOW w AS (ORDER BY text);
结果将是:
text next_text pre_text
f fa NULL
fa fals f
fals falst0 fa
falst0 h fals
h he falst0
he hel h
hel helll123 he
helll123 helll123 hel
helll123 NULL helll123
然后可以利用这些值进行比较:如果 next_text 以 text 开头,说明该行只是一个前缀,不是想要的记录;否则保留该行。
-- Keep text only when the next row's text does not start with it
-- (instr(...) = 1 means text occurs at position 1 of next_text).
case when instr(next_text, text) = 1 then null else text end as text_u_want
然后过滤掉 null,再与 my_table 进行连接(join),即可得到每个文本的计数。
假设有如下一张表:
text
----
h
he
hel // All above are prefixes
helll123 // hel is a prefix of helll123; this is the first occurrence of helll123
helll123 // second helll123
f
fa
fals
falst0 // fals is a prefix of falst0
下面的查询是伪代码,用来演示我想要实现的效果:
-- Pseudo code: unique_by_prefix() is a hypothetical function, not a Hive built-in.
-- It stands for "keep only the strings that are not a prefix of another string".
SELECT
    unique_by_prefix(text) AS unique_text,
    COUNT(*)
FROM
    my_table
-- Group by the expression itself: a positional GROUP BY 1 is only honoured by
-- Hive when position aliases are enabled in the session configuration.
GROUP BY
    unique_by_prefix(text)
应该生成以下结果
unique_text count
helll123 2
falst0 1
也就是说,忽略那些只是其他文本前缀的字符串,只统计剩下的唯一文本。
我认为您无法通过一次查询在 Hive 中完成此操作。
这是一种可能性:
-- Count every text in t that is not a proper prefix of some other text in t.
-- Note: LIKE treats any '%' or '_' contained in t.text as a wildcard.
select t.text, count(*)
from t
where not exists (select 1
                  from t t2
                  -- t2 is a different string that starts with t.text
                  where t2.text <> t.text and t2.text like concat(t.text, '%')
                 )
group by t.text;
尽管这表达了所需的逻辑,但我怀疑 Hive 要求关联子查询中的关联条件必须是等值比较。
这是一种方法。
-- T-SQL (SQL Server) script, not HiveQL: it relies on SELECT ... INTO,
-- IDENTITY columns, @variables and WHILE loops.

-- Step 1: collect the distinct texts and number them with an identity column.
select distinct text into my_table1 from my_table;
alter table my_table1 add sno int identity;

-- Step 2: for each distinct text, store how many distinct texts start with it.
create table my_table2 (text varchar(max), counter int);

declare @i int = 0;
-- my_table1 is not modified inside the loop, so its row count is read once.
declare @total int = (select count(*) from my_table1);
declare @text varchar(max);

while (@i < @total)
begin
    set @i = @i + 1;
    set @text = (select text from my_table1 where sno = @i);
    -- counter = 1 means only the text itself matches, i.e. it is not a
    -- prefix of any other distinct text.
    insert into my_table2 (text, counter)
    values (@text, (select count(*) from my_table1 where text like @text + '%'));
end;

-- Step 3: count occurrences in the original table of the non-prefix texts.
-- The filter on B.counter discards unmatched rows, so this is an inner join.
select A.text, count(*)
from my_table A
inner join my_table2 B on A.text = B.text
where B.counter = 1
group by A.text;
窗口函数 https://cwiki.apache.org/confluence/display/Hive/LanguageManual+WindowingAndAnalytics
-- For every row, expose the neighbouring text values in sorted order:
-- next_text is the following row's text, pre_text the preceding row's.
SELECT
    text,
    LEAD(text) OVER w AS next_text,
    LAG(text) OVER w AS pre_text
FROM my_table
WINDOW w AS (ORDER BY text);
结果将是:
text next_text pre_text
f fa NULL
fa fals f
fals falst0 fa
falst0 h fals
h he falst0
he hel h
hel helll123 he
helll123 helll123 hel
helll123 NULL helll123
然后可以利用这些值进行比较:如果 next_text 以 text 开头,说明该行只是一个前缀,不是想要的记录;否则保留该行。
-- Keep text only when the next row's text does not start with it
-- (instr(...) = 1 means text occurs at position 1 of next_text).
case when instr(next_text, text) = 1 then null else text end as text_u_want
然后过滤掉 null,再与 my_table 进行连接(join),即可得到每个文本的计数。