MySQL - 长文本字段的词频计数
MySQL - word frequency count on long textual field
我有一个 MySQL
table,其中一个字段包含文本描述(~5-200 字)。
例如Reviews
:
Rev_id Place_id Stars Category Text
1 12 3 Food Nice food but a bad dirty place.
2 31 4 Sport Not bad, they have everything.
3 55 1 Bar Poor place,bad audience.
我想做一些字数分析,比如一般的词频统计(每个词出现了多少次)或者每个类别的前 K 个词。
示例中:
word count
bad 3
place 2
...
有没有办法在不涉及编程语言的情况下只用 MySQL
做到这一点?
我这道题的逻辑是:把所有的单词都抽出来统计!
因此,创建一个 table 类似于您存储的数据:
CREATE TABLE `tbltest` (
`Rev_id` int(11) NOT NULL AUTO_INCREMENT,
`place_id` int(11) DEFAULT NULL,
`Stars` int(11) DEFAULT NULL,
`Category` varchar(45) DEFAULT NULL,
`Text` varchar(255) DEFAULT NULL,
PRIMARY KEY (`Rev_id`),
UNIQUE KEY `id_UNIQUE` (`Rev_id`)
) ENGINE=InnoDB AUTO_INCREMENT=4 DEFAULT CHARSET=utf8;
并为单词创建 table:
CREATE TABLE `counting` (
`word` varchar(45) NOT NULL,
`counts` int(11) DEFAULT NULL,
PRIMARY KEY (`word`),
UNIQUE KEY `word_UNIQUE` (`word`)
) ENGINE=InnoDB DEFAULT CHARSET=utf8;
现在,创建 MySQL Stored Procedure 用于拆分句子和计算单词:
drop procedure if exists sentence_words;
delimiter #
create procedure sentence_words(IN Cat VARCHAR(45))
begin
declare w_max int unsigned default 1;
declare w_counter int unsigned default 0;
declare done int unsigned default 0;
declare sentence varchar(255) default null;
declare cur cursor for select `text` from `tbltest` where `Category` = Cat;
declare continue handler for not found set done=1;
set done=0;
open cur;
myloop: loop
fetch cur into sentence;
if done = 1 then leave myloop; end if;
-- refine sentence!
set sentence = replace(replace(replace(replace(
sentence
,'.',' '),'!',' '),',',' '),';',' ');
set sentence = replace(trim(sentence),' ',' ');
set w_max = length(sentence)-length(replace(sentence,' ',''))+1;
start transaction;
while w_counter < w_max do
insert into `counting`(counts,word) values
(1, substring_index( substring_index(
sentence,' ',w_counter+1) ,' ',-1)
)
ON DUPLICATE KEY UPDATE counts=counts+1;
set w_counter=w_counter+1;
end while;
commit;
end loop;
close cur;
end #
delimiter ;
最后,您可以调用该过程并在 counting
table 中查找单词和计数。如果您需要将每个类别的字数分开,请记住在为每个类别调用过程之前 truncate
或备份 counting
table。
truncate `counting`;
call sentence_words('Bar');
select * from `counting` order by counts desc; -- ? where length(word)>2
-- words | counts --
'audience', '1'
'bad', '1'
'place', '1'
'Poor', '1'
我有一个 MySQL
table,其中一个字段包含文本描述(~5-200 字)。
例如Reviews
:
Rev_id Place_id Stars Category Text
1 12 3 Food Nice food but a bad dirty place.
2 31 4 Sport Not bad, they have everything.
3 55 1 Bar Poor place,bad audience.
我想做一些字数分析,比如一般的词频统计(每个词出现了多少次)或者每个类别的前 K 个词。
示例中:
word count
bad 3
place 2
...
有没有办法在不涉及编程语言的情况下只用 MySQL
做到这一点?
我这道题的逻辑是:把所有的单词都抽出来统计!
因此,创建一个 table 类似于您存储的数据:
CREATE TABLE `tbltest` (
`Rev_id` int(11) NOT NULL AUTO_INCREMENT,
`place_id` int(11) DEFAULT NULL,
`Stars` int(11) DEFAULT NULL,
`Category` varchar(45) DEFAULT NULL,
`Text` varchar(255) DEFAULT NULL,
PRIMARY KEY (`Rev_id`),
UNIQUE KEY `id_UNIQUE` (`Rev_id`)
) ENGINE=InnoDB AUTO_INCREMENT=4 DEFAULT CHARSET=utf8;
并为单词创建 table:
CREATE TABLE `counting` (
`word` varchar(45) NOT NULL,
`counts` int(11) DEFAULT NULL,
PRIMARY KEY (`word`),
UNIQUE KEY `word_UNIQUE` (`word`)
) ENGINE=InnoDB DEFAULT CHARSET=utf8;
现在,创建 MySQL Stored Procedure 用于拆分句子和计算单词:
drop procedure if exists sentence_words;
delimiter #
create procedure sentence_words(IN Cat VARCHAR(45))
begin
declare w_max int unsigned default 1;
declare w_counter int unsigned default 0;
declare done int unsigned default 0;
declare sentence varchar(255) default null;
declare cur cursor for select `text` from `tbltest` where `Category` = Cat;
declare continue handler for not found set done=1;
set done=0;
open cur;
myloop: loop
fetch cur into sentence;
if done = 1 then leave myloop; end if;
-- refine sentence!
set sentence = replace(replace(replace(replace(
sentence
,'.',' '),'!',' '),',',' '),';',' ');
set sentence = replace(trim(sentence),' ',' ');
set w_max = length(sentence)-length(replace(sentence,' ',''))+1;
start transaction;
while w_counter < w_max do
insert into `counting`(counts,word) values
(1, substring_index( substring_index(
sentence,' ',w_counter+1) ,' ',-1)
)
ON DUPLICATE KEY UPDATE counts=counts+1;
set w_counter=w_counter+1;
end while;
commit;
end loop;
close cur;
end #
delimiter ;
最后,您可以调用该过程并在 counting
table 中查找单词和计数。如果您需要将每个类别的字数分开,请记住在为每个类别调用过程之前 truncate
或备份 counting
table。
truncate `counting`;
call sentence_words('Bar');
select * from `counting` order by counts desc; -- ? where length(word)>2
-- words | counts --
'audience', '1'
'bad', '1'
'place', '1'
'Poor', '1'