从大型语料库填充句子 table
Populating sentences from large corpus table
我有五个 table 包含词汇数据。我想显示从语料库到给定冰岛语词条(包括所有单词形式)的句子。使用以下方法,找到 5 个句子需要 2 秒。我正在寻找一种可以显示所有可用句子的解决方案。
预期结果:
包含查询中指定的给定词条的所有单词形式的句子列表。
当前结果:
当前结果returns仅匹配关键词基本形式的句子:
word_form w_id s_id pos sentence
hest 11484 794930 1 Sentence 1. .....
hest 11484 795623 12 Sentence 2 .....
预期结果:
word_form w_id s_id pos sentence
hest 11484 794930 1 Sentence 1. .....
hest 11484 795623 12 Sentence 2 .....
...
hestur .. .. .. Sentence 13.
hestur .. .. .. Sentence 14.
...
hesti .. .. .. Sentence 21.
...
建议的查询有更改,但以错误结束。
SELECT w0.keyword, w.word_form, w3.w_id, w4.s_id, w4.pos, s.sentence
FROM `1_headword` w0
INNER JOIN `2_wordform` w ON w.keyword = w0.keyword
INNER JOIN `3_words` w3 ON w3.word = w.word_form
INNER JOIN `4_inv_w` w4 ON w4.w_id = w3.w_id
INNER JOIN `5_sentences` s
ON s.s_id = w4.s_id WHERE w0.keyword like 'hestur' group by w4.s_id
备注:
关键字是一个,基本形式 - 在本例中为 "hestur"。在这种情况下,单词形式是 - "hest"、"hesti"、"hestar"(请参阅插入 table)等。
换句话说,查询应该采用给定词条的所有词形式并匹配词形式出现的句子。
更新二.
很少观察。
1.The 以下简化查询接收 w_id 所有单词形式 returns 行,第一个单词形式重复 w_id。
2. 3_words
table.
中的单词形式可以有几行
SELECT w.keyword, w.word_form, w3.w_id FROM `2_wordform1` w
JOIN `3_words` w3
ON w3.word = w.keyword and w3.gram = w.gram
WHERE w.keyword like 'tala' and w.gram = 'f'
行
tala tala 8809
tala tala 89664
tala tala 97991
Tala Tala 8809
Tala Tala 89664
Tala Tala 97991
tala tölur 8809
tala tölur 89664
tala tölur 97991
表格和数据
table - 词条,70000 行
CREATE TABLE IF NOT EXISTS `1_headword` (
`id` int(9) NOT NULL,
`keyword` varchar(100) CHARACTER SET utf8 COLLATE utf8_icelandic_ci NOT NULL,
`num_keyword` int(9) NOT NULL DEFAULT '0',
`gram` varchar(40) NOT NULL
) ENGINE=MyISAM DEFAULT CHARSET=utf8 ROW_FORMAT=DYNAMIC AUTO_INCREMENT=55328 ;
ALTER TABLE `1_headword`
ADD PRIMARY KEY (`id`), ADD KEY `keyword` (`keyword`);
table - 单词形式 - 700 000 行
CREATE TABLE IF NOT EXISTS `2_wordform` (
`id` int(10) NOT NULL,
`keyword` varchar(120) CHARACTER SET utf8 COLLATE utf8_icelandic_ci NOT NULL,
`num_keyword` int(4) NOT NULL,
`word_form` varchar(120) CHARACTER SET utf8 COLLATE utf8_icelandic_ci NOT NULL
) ENGINE=MyISAM DEFAULT CHARSET=utf8 AUTO_INCREMENT=678480 ;
ALTER TABLE `2_wordform`
ADD PRIMARY KEY (`id`), ADD KEY `word_form` (`word_form`);
table - 从语料库中标记的单词形式 w_id(单词 id),100 万行
CREATE TABLE `3_words` (
`w_id` int(10) unsigned NOT NULL AUTO_INCREMENT,
`word` varchar(255) CHARACTER SET utf8 COLLATE utf8_bin DEFAULT NULL,
`gram` varchar(255) DEFAULT NULL,
`freq` int(10) unsigned DEFAULT NULL,
PRIMARY KEY (`w_id`),
KEY `word` (`word`),
KEY `w_id` (`w_id`)
) ENGINE=MyISAM AUTO_INCREMENT=800468 DEFAULT CHARSET=utf8;
table - w_id(word id)连接到s_id(sentence id),word可以在几个句子中找到,加上在句子中的位置,2200万行
CREATE TABLE `4_inv_w` (
`w_id` int(10) unsigned NOT NULL DEFAULT '0',
`s_id` int(10) unsigned NOT NULL DEFAULT '0',
`pos` mediumint(2) unsigned NOT NULL DEFAULT '0',
KEY `w_id` (`w_id`),
KEY `s_id` (`s_id`),
KEY `w_s` (`w_id`,`s_id`)
) ENGINE=MyISAM DEFAULT CHARSET=utf8;
table - s_id (sentence id) with sentence, 100万行
CREATE TABLE `5_sentences` (
`s_id` int(10) unsigned NOT NULL AUTO_INCREMENT,
`sentence` text,
KEY `s_id` (`s_id`)
) ENGINE=MyISAM AUTO_INCREMENT=999953 DEFAULT CHARSET=utf8;
进程
select 给定词条的所有词形式 f.e "hestur"(英语中的马)
SELECT `word_form` FROM `2_wordform` WHERE `keyword` like 'hestur'
结果由16至50个结果组成,现在将结果循环为f.e。 "hestur"
的宾格 "hest"
SELECT `w_id` FROM `3_words` WHERE `word` like 'hest'
结果可以包含多个w_id、f.e。使用“10138”
SELECT `s_id`, `pos` FROM `4_inv_w` WHERE `w_id` = '10138' group by `s_id`
结果可以包含几个句子,以显示f.e。句子 '7201'
SELECT `sentence` FROM `5_sentences` WHERE `s_id` = '7201'
更新
插入 2_wordform
(id
, keyword
, num_keyword
, word_form
) 值 (42490, 'hestur', 0, 'hest') ;
插入 2_wordform
(id
, keyword
, num_keyword
, word_form
) 值 (42498, 'hestur', 0, 'hesta') ;
插入 2_wordform
(id
, keyword
, num_keyword
, word_form
) 值 (42501, 'hestur', 0, 'hestana') ;
插入 2_wordform
(id
, keyword
, num_keyword
, word_form
) 值 (42503, 'hestur', 0, 'hestanna') ;
插入 2_wordform
(id
, keyword
, num_keyword
, word_form
) 值 (42497, 'hestur', 0, 'hestar') ;
插入 2_wordform
(id
, keyword
, num_keyword
, word_form
) 值 (42500, 'hestur', 0, 'hestarnir') ;
插入 2_wordform
(id
, keyword
, num_keyword
, word_form
) 值 (42491, 'hestur', 0, 'hesti') ;
插入 2_wordform
(id
, keyword
, num_keyword
, word_form
) 值 (42494, 'hestur', 0, 'hestinn') ;
插入 2_wordform
(id
, keyword
, num_keyword
, word_form
) 值 (42495, 'hestur', 0, 'hestinum') ;
插入 2_wordform
(id
, keyword
, num_keyword
, word_form
) 值 (42492, 'hestur', 0, 'hests') ;
插入 2_wordform
(id
, keyword
, num_keyword
, word_form
) 值 (42496, 'hestur', 0, 'hestsins') ;
插入 2_wordform
(id
, keyword
, num_keyword
, word_form
) 值 (42499, 'hestur', 0, 'hestum') ;
插入 2_wordform
(id
, keyword
, num_keyword
, word_form
) 值 (42502, 'hestur', 0, 'hestunum') ;
插入 2_wordform
(id
, keyword
, num_keyword
, word_form
) 值 (42489, 'hestur', 0, 'hestur') ;
插入 2_wordform
(id
, keyword
, num_keyword
, word_form
) 值 (42493, 'hestur', 0, 'hesturinn') ;
插入 3_words
(w_id
、word
、gram
、freq
)值
(11484, 'hestur', 'nken', 122),
(60681, 'Hestur', 'nken', 15),
(484318, 'HESTUR', 'nken', 1),
(491111, 'Hestur', 'nken-s', 1);
插入 3_words
(w_id
、word
、gram
、freq
)值
(10138, 'hest', 'nkeo', 141),
(159967, 'Hest', 'nkeo', 4),
(491114, 'Hest', 'ssm', 1);
插入 4_inv_w
(w_id
、s_id
、pos
)值
(11484, 2671, 4),
(11484, 22522, 7),
(11484, 30169, 8),
(11484, 32487, 4),
(11484, 33841, 9),
(11484, 38116, 5),
(11484, 40450, 6),
(11484, 42741, 32),
(11484, 45789, 10),
(11484, 58998, 3),
(11484, 74343, 4),
(11484, 76001, 3),
(11484, 99014, 9),
(11484, 99688, 6),
(11484, 109849, 21),
(11484, 119708, 21),
(11484, 131353, 34),
(11484, 147820, 6),
(11484, 148326, 25),
(11484, 160475, 40),
(11484, 167227, 2),
(11484, 170401, 3),
(11484, 178416, 18),
(11484, 197295, 12),
(11484, 197295, 6),
(11484, 198420, 19),
(11484, 203446, 28),
(11484, 204448, 1),
(11484, 215402, 1),
(11484, 237323, 4),
(11484, 249282, 4),
(11484, 263949, 1),
(11484, 263949, 22),
(11484, 266489, 27),
(11484, 270540, 5),
(11484, 272543, 5),
(11484, 272560, 1),
(11484, 272560, 8),
(11484, 282170, 20),
(11484, 284407, 26),
(11484, 290524, 6),
(11484, 291438, 10),
(11484, 293344, 6),
(11484, 294034, 49),
(11484, 317007, 7),
(11484, 325049, 22),
(11484, 328392, 14),
(11484, 368188, 47),
(11484, 391892, 14),
(11484, 401157, 11),
(11484, 412656, 24),
(11484, 421635, 17),
(11484, 439320, 3),
(11484, 467063, 5),
(11484, 469324, 23),
(11484, 477392, 2),
(11484, 480318, 4),
(11484, 487883, 1),
(11484, 490577, 42),
(11484, 499783, 9),
(11484, 500405, 23),
(11484, 501118, 15),
(11484, 527227, 3),
(11484, 539686, 25),
(11484, 543056, 9),
(11484, 544261, 3),
(11484, 547700, 20),
(11484, 555638, 19),
(11484, 570234, 2),
(11484, 592710, 2),
(11484, 616662, 1),
(11484, 619011, 16),
(11484, 632123, 2),
(11484, 633124, 2),
(11484, 636792, 8),
(11484, 636792, 3),
(11484, 646603, 17),
(11484, 664738, 4),
(11484, 670017, 4),
(11484, 685997, 4),
(11484, 686202, 1),
(11484, 691794, 12),
(11484, 698341, 2),
(11484, 715281, 3),
(11484, 715984, 37),
(11484, 716970, 10),
(11484, 716970, 4),
(11484, 752605, 36),
(11484, 756660, 19),
(11484, 760277, 3),
(11484, 776593, 3),
(11484, 785701, 24),
(11484, 789099, 3),
(11484, 794930, 1),
(11484, 795623, 12),
(11484, 802997, 6),
(11484, 812806, 6),
(11484, 814046, 21),
(11484, 820178, 6),
(11484, 823173, 22),
(11484, 843094, 3),
(11484, 844156, 1),
(11484, 844736, 24),
(11484, 853350, 18),
(11484, 869322, 3),
(11484, 885176, 2),
(11484, 899545, 22),
(11484, 904086, 16),
(11484, 907863, 9),
(11484, 909396, 9),
(11484, 912876, 3),
(11484, 919994, 4),
(11484, 927840, 24),
(11484, 927840, 5),
(11484, 934220, 40),
(11484, 936941, 11),
(11484, 952837, 13),
(11484, 969201, 11),
(11484, 970240, 1),
(11484, 970836, 19),
(11484, 972107, 1),
(11484, 990474, 6);
插入 4_inv_w
(w_id
、s_id
、pos
)值
(10138, 7201, 27),
(10138, 18772, 3),
(10138, 30001, 6),
(10138, 42089, 4),
(10138, 42089, 14),
(10138, 42234, 4),
(10138, 49383, 5),
(10138, 54795, 18),
(10138, 57564, 23),
(10138, 88542, 7),
(10138, 93027, 10),
(10138, 101097, 21),
(10138, 134312, 12),
(10138, 139116, 33),
(10138, 139522, 6),
(10138, 159109, 7),
(10138, 159109, 16),
(10138, 161497, 21),
(10138, 163948, 2),
(10138, 165301, 20),
(10138, 166478, 21),
(10138, 183452, 6),
(10138, 184390, 20),
(10138, 189930, 25),
(10138, 201629, 9),
(10138, 204590, 4),
(10138, 211374, 5),
(10138, 216483, 14),
(10138, 223617, 5),
(10138, 233652, 12),
(10138, 236571, 11),
(10138, 241302, 8),
(10138, 246485, 10),
(10138, 256910, 16),
(10138, 262349, 3),
(10138, 262925, 5),
(10138, 267047, 28),
(10138, 291988, 18),
(10138, 292680, 22),
(10138, 294814, 32),
(10138, 326917, 6),
(10138, 330019, 12),
(10138, 333411, 35),
(10138, 337880, 5),
(10138, 342003, 13),
(10138, 355325, 12),
(10138, 356409, 13),
(10138, 363795, 5),
(10138, 365735, 26),
(10138, 376570, 25),
(10138, 378214, 10),
(10138, 379159, 11),
(10138, 379236, 4),
(10138, 379533, 2),
(10138, 388753, 8),
(10138, 420633, 18),
(10138, 433121, 5),
(10138, 434645, 10),
(10138, 435895, 3),
(10138, 455575, 5),
(10138, 461900, 23),
(10138, 464040, 6),
(10138, 466657, 6),
(10138, 469848, 11),
(10138, 475569, 17),
(10138, 482701, 41),
(10138, 527708, 29),
(10138, 527708, 16),
(10138, 529426, 7),
(10138, 530753, 10),
(10138, 538071, 27),
(10138, 542685, 10),
(10138, 553742, 22),
(10138, 553742, 13),
(10138, 557216, 4),
(10138, 563747, 9),
(10138, 564716, 4),
(10138, 569146, 7),
(10138, 578368, 3),
(10138, 581713, 9),
(10138, 595890, 9),
(10138, 599015, 5),
(10138, 608570, 30),
(10138, 610218, 11),
(10138, 610218, 2),
(10138, 612099, 9),
(10138, 612568, 14),
(10138, 612894, 9),
(10138, 615361, 19),
(10138, 618001, 14),
(10138, 624969, 7),
(10138, 628252, 16),
(10138, 628635, 12),
(10138, 635977, 10),
(10138, 643675, 8),
(10138, 650487, 9),
(10138, 651489, 3),
(10138, 657552, 18),
(10138, 672884, 12),
(10138, 677130, 2),
(10138, 678841, 7),
(10138, 678841, 26),
(10138, 682904, 4),
(10138, 691251, 19),
(10138, 706325, 9),
(10138, 714680, 45),
(10138, 717460, 5),
(10138, 717489, 11),
(10138, 722393, 5),
(10138, 729972, 12),
(10138, 735745, 12),
(10138, 738334, 7),
(10138, 740791, 21),
(10138, 775696, 8),
(10138, 776984, 16),
(10138, 786073, 31),
(10138, 793185, 17),
(10138, 821475, 4),
(10138, 835234, 7),
(10138, 842713, 3),
(10138, 842730, 8),
(10138, 847372, 9),
(10138, 849612, 20),
(10138, 861768, 26),
(10138, 864231, 6),
(10138, 865927, 7),
(10138, 873939, 7),
(10138, 883591, 29),
(10138, 884260, 19),
(10138, 894952, 17),
(10138, 898453, 19),
(10138, 899290, 4),
(10138, 909225, 29),
(10138, 910173, 4),
(10138, 922447, 2),
(10138, 939319, 2),
(10138, 956278, 4),
(10138, 967342, 18),
(10138, 977090, 3),
(10138, 991346, 31),
(10138, 991346, 40);
插入 5_sentences
(s_id
, sentence
) 值
(2671, 'Hrímnir|nken-s frá|aþ Hrafnagili|nkeþ-s Glæsilegasti|lkenve hestur|nken aldar|nvee !|!');
插入 5_sentences
(s_id
, sentence
) 值
(7201, 'Hann|fpken heilsar|sfg3en öllum|fokfþ nema|c Braga|nkeþ-s sem|ct nú|aa dregur|sfg3en í|ao land|nheo og|c vill|sfg3en friðmælast|snm við|ao Loka|nkeo-s með|aþ loforði|nheþ um|ao góðar|lvfosf gjafir|nvfo ,|, sverð|nhfo ,|, hest|nkeo og|c hring|nkeo en|c hann|fpken svarar|sfg3en bara|aa með|aþ illu|lheþsf .|.');
任何数据样本与预期结果一起都会非常有帮助。
到目前为止你可以从这个尝试开始:
http://sqlfiddle.com/#!9/7110ef/4
SELECT w.word_form,
w3.w_id,
w4.s_id,
w4.pos,
s.sentence
FROM `2_wordform` w
INNER JOIN `3_words` w3
ON w3.`word` = w.keyword
INNER JOIN `4_inv_w` w4
ON w4.w_id = w3.w_id
INNER JOIN `5_sentences` s
ON s.s_id = w4.s_id
WHERE w.keyword like '%hestur%'
如果你只需要一句话你可以:
SELECT DISTINCT s.sentence
FROM `2_wordform` w
INNER JOIN `3_words` w3
ON w3.`word` = w.keyword
INNER JOIN `4_inv_w` w4
ON w4.w_id = w3.w_id
INNER JOIN `5_sentences` s
ON s.s_id = w4.s_id
WHERE w.keyword like '%hestur%'
我有五个 table 包含词汇数据。我想显示从语料库到给定冰岛语词条(包括所有单词形式)的句子。使用以下方法,找到 5 个句子需要 2 秒。我正在寻找一种可以显示所有可用句子的解决方案。
预期结果:
包含查询中指定的给定词条的所有单词形式的句子列表。
当前结果:
当前结果returns仅匹配关键词基本形式的句子:
word_form w_id s_id pos sentence
hest 11484 794930 1 Sentence 1. .....
hest 11484 795623 12 Sentence 2 .....
预期结果:
word_form w_id s_id pos sentence
hest 11484 794930 1 Sentence 1. .....
hest 11484 795623 12 Sentence 2 .....
...
hestur .. .. .. Sentence 13.
hestur .. .. .. Sentence 14.
...
hesti .. .. .. Sentence 21.
...
建议的查询有更改,但以错误结束。
SELECT w0.keyword, w.word_form, w3.w_id, w4.s_id, w4.pos, s.sentence
FROM `1_headword` w0
INNER JOIN `2_wordform` w ON w.keyword = w0.keyword
INNER JOIN `3_words` w3 ON w3.word = w.word_form
INNER JOIN `4_inv_w` w4 ON w4.w_id = w3.w_id
INNER JOIN `5_sentences` s
ON s.s_id = w4.s_id WHERE w0.keyword like 'hestur' group by w4.s_id
备注: 关键字是一个,基本形式 - 在本例中为 "hestur"。在这种情况下,单词形式是 - "hest"、"hesti"、"hestar"(请参阅插入 table)等。 换句话说,查询应该采用给定词条的所有词形式并匹配词形式出现的句子。
更新二.
很少观察。
1.The 以下简化查询接收 w_id 所有单词形式 returns 行,第一个单词形式重复 w_id。
2. 3_words
table.
SELECT w.keyword, w.word_form, w3.w_id FROM `2_wordform1` w
JOIN `3_words` w3
ON w3.word = w.keyword and w3.gram = w.gram
WHERE w.keyword like 'tala' and w.gram = 'f'
行
tala tala 8809
tala tala 89664
tala tala 97991
Tala Tala 8809
Tala Tala 89664
Tala Tala 97991
tala tölur 8809
tala tölur 89664
tala tölur 97991
表格和数据
table - 词条,70000 行
CREATE TABLE IF NOT EXISTS `1_headword` (
`id` int(9) NOT NULL,
`keyword` varchar(100) CHARACTER SET utf8 COLLATE utf8_icelandic_ci NOT NULL,
`num_keyword` int(9) NOT NULL DEFAULT '0',
`gram` varchar(40) NOT NULL
) ENGINE=MyISAM DEFAULT CHARSET=utf8 ROW_FORMAT=DYNAMIC AUTO_INCREMENT=55328 ;
ALTER TABLE `1_headword`
ADD PRIMARY KEY (`id`), ADD KEY `keyword` (`keyword`);
table - 单词形式 - 700 000 行
CREATE TABLE IF NOT EXISTS `2_wordform` (
`id` int(10) NOT NULL,
`keyword` varchar(120) CHARACTER SET utf8 COLLATE utf8_icelandic_ci NOT NULL,
`num_keyword` int(4) NOT NULL,
`word_form` varchar(120) CHARACTER SET utf8 COLLATE utf8_icelandic_ci NOT NULL
) ENGINE=MyISAM DEFAULT CHARSET=utf8 AUTO_INCREMENT=678480 ;
ALTER TABLE `2_wordform`
ADD PRIMARY KEY (`id`), ADD KEY `word_form` (`word_form`);
table - 从语料库中标记的单词形式 w_id(单词 id),100 万行
CREATE TABLE `3_words` (
`w_id` int(10) unsigned NOT NULL AUTO_INCREMENT,
`word` varchar(255) CHARACTER SET utf8 COLLATE utf8_bin DEFAULT NULL,
`gram` varchar(255) DEFAULT NULL,
`freq` int(10) unsigned DEFAULT NULL,
PRIMARY KEY (`w_id`),
KEY `word` (`word`),
KEY `w_id` (`w_id`)
) ENGINE=MyISAM AUTO_INCREMENT=800468 DEFAULT CHARSET=utf8;
table - w_id(word id)连接到s_id(sentence id),word可以在几个句子中找到,加上在句子中的位置,2200万行
CREATE TABLE `4_inv_w` (
`w_id` int(10) unsigned NOT NULL DEFAULT '0',
`s_id` int(10) unsigned NOT NULL DEFAULT '0',
`pos` mediumint(2) unsigned NOT NULL DEFAULT '0',
KEY `w_id` (`w_id`),
KEY `s_id` (`s_id`),
KEY `w_s` (`w_id`,`s_id`)
) ENGINE=MyISAM DEFAULT CHARSET=utf8;
table - s_id (sentence id) with sentence, 100万行
CREATE TABLE `5_sentences` (
`s_id` int(10) unsigned NOT NULL AUTO_INCREMENT,
`sentence` text,
KEY `s_id` (`s_id`)
) ENGINE=MyISAM AUTO_INCREMENT=999953 DEFAULT CHARSET=utf8;
进程
select 给定词条的所有词形式 f.e "hestur"(英语中的马)
SELECT `word_form` FROM `2_wordform` WHERE `keyword` like 'hestur'
结果由16至50个结果组成,现在将结果循环为f.e。 "hestur"
的宾格 "hest"SELECT `w_id` FROM `3_words` WHERE `word` like 'hest'
结果可以包含多个w_id、f.e。使用“10138”
SELECT `s_id`, `pos` FROM `4_inv_w` WHERE `w_id` = '10138' group by `s_id`
结果可以包含几个句子,以显示f.e。句子 '7201'
SELECT `sentence` FROM `5_sentences` WHERE `s_id` = '7201'
更新
插入 2_wordform
(id
, keyword
, num_keyword
, word_form
) 值 (42490, 'hestur', 0, 'hest') ;
插入 2_wordform
(id
, keyword
, num_keyword
, word_form
) 值 (42498, 'hestur', 0, 'hesta') ;
插入 2_wordform
(id
, keyword
, num_keyword
, word_form
) 值 (42501, 'hestur', 0, 'hestana') ;
插入 2_wordform
(id
, keyword
, num_keyword
, word_form
) 值 (42503, 'hestur', 0, 'hestanna') ;
插入 2_wordform
(id
, keyword
, num_keyword
, word_form
) 值 (42497, 'hestur', 0, 'hestar') ;
插入 2_wordform
(id
, keyword
, num_keyword
, word_form
) 值 (42500, 'hestur', 0, 'hestarnir') ;
插入 2_wordform
(id
, keyword
, num_keyword
, word_form
) 值 (42491, 'hestur', 0, 'hesti') ;
插入 2_wordform
(id
, keyword
, num_keyword
, word_form
) 值 (42494, 'hestur', 0, 'hestinn') ;
插入 2_wordform
(id
, keyword
, num_keyword
, word_form
) 值 (42495, 'hestur', 0, 'hestinum') ;
插入 2_wordform
(id
, keyword
, num_keyword
, word_form
) 值 (42492, 'hestur', 0, 'hests') ;
插入 2_wordform
(id
, keyword
, num_keyword
, word_form
) 值 (42496, 'hestur', 0, 'hestsins') ;
插入 2_wordform
(id
, keyword
, num_keyword
, word_form
) 值 (42499, 'hestur', 0, 'hestum') ;
插入 2_wordform
(id
, keyword
, num_keyword
, word_form
) 值 (42502, 'hestur', 0, 'hestunum') ;
插入 2_wordform
(id
, keyword
, num_keyword
, word_form
) 值 (42489, 'hestur', 0, 'hestur') ;
插入 2_wordform
(id
, keyword
, num_keyword
, word_form
) 值 (42493, 'hestur', 0, 'hesturinn') ;
插入 3_words
(w_id
、word
、gram
、freq
)值
(11484, 'hestur', 'nken', 122),
(60681, 'Hestur', 'nken', 15),
(484318, 'HESTUR', 'nken', 1),
(491111, 'Hestur', 'nken-s', 1);
插入 3_words
(w_id
、word
、gram
、freq
)值
(10138, 'hest', 'nkeo', 141),
(159967, 'Hest', 'nkeo', 4),
(491114, 'Hest', 'ssm', 1);
插入 4_inv_w
(w_id
、s_id
、pos
)值
(11484, 2671, 4),
(11484, 22522, 7),
(11484, 30169, 8),
(11484, 32487, 4),
(11484, 33841, 9),
(11484, 38116, 5),
(11484, 40450, 6),
(11484, 42741, 32),
(11484, 45789, 10),
(11484, 58998, 3),
(11484, 74343, 4),
(11484, 76001, 3),
(11484, 99014, 9),
(11484, 99688, 6),
(11484, 109849, 21),
(11484, 119708, 21),
(11484, 131353, 34),
(11484, 147820, 6),
(11484, 148326, 25),
(11484, 160475, 40),
(11484, 167227, 2),
(11484, 170401, 3),
(11484, 178416, 18),
(11484, 197295, 12),
(11484, 197295, 6),
(11484, 198420, 19),
(11484, 203446, 28),
(11484, 204448, 1),
(11484, 215402, 1),
(11484, 237323, 4),
(11484, 249282, 4),
(11484, 263949, 1),
(11484, 263949, 22),
(11484, 266489, 27),
(11484, 270540, 5),
(11484, 272543, 5),
(11484, 272560, 1),
(11484, 272560, 8),
(11484, 282170, 20),
(11484, 284407, 26),
(11484, 290524, 6),
(11484, 291438, 10),
(11484, 293344, 6),
(11484, 294034, 49),
(11484, 317007, 7),
(11484, 325049, 22),
(11484, 328392, 14),
(11484, 368188, 47),
(11484, 391892, 14),
(11484, 401157, 11),
(11484, 412656, 24),
(11484, 421635, 17),
(11484, 439320, 3),
(11484, 467063, 5),
(11484, 469324, 23),
(11484, 477392, 2),
(11484, 480318, 4),
(11484, 487883, 1),
(11484, 490577, 42),
(11484, 499783, 9),
(11484, 500405, 23),
(11484, 501118, 15),
(11484, 527227, 3),
(11484, 539686, 25),
(11484, 543056, 9),
(11484, 544261, 3),
(11484, 547700, 20),
(11484, 555638, 19),
(11484, 570234, 2),
(11484, 592710, 2),
(11484, 616662, 1),
(11484, 619011, 16),
(11484, 632123, 2),
(11484, 633124, 2),
(11484, 636792, 8),
(11484, 636792, 3),
(11484, 646603, 17),
(11484, 664738, 4),
(11484, 670017, 4),
(11484, 685997, 4),
(11484, 686202, 1),
(11484, 691794, 12),
(11484, 698341, 2),
(11484, 715281, 3),
(11484, 715984, 37),
(11484, 716970, 10),
(11484, 716970, 4),
(11484, 752605, 36),
(11484, 756660, 19),
(11484, 760277, 3),
(11484, 776593, 3),
(11484, 785701, 24),
(11484, 789099, 3),
(11484, 794930, 1),
(11484, 795623, 12),
(11484, 802997, 6),
(11484, 812806, 6),
(11484, 814046, 21),
(11484, 820178, 6),
(11484, 823173, 22),
(11484, 843094, 3),
(11484, 844156, 1),
(11484, 844736, 24),
(11484, 853350, 18),
(11484, 869322, 3),
(11484, 885176, 2),
(11484, 899545, 22),
(11484, 904086, 16),
(11484, 907863, 9),
(11484, 909396, 9),
(11484, 912876, 3),
(11484, 919994, 4),
(11484, 927840, 24),
(11484, 927840, 5),
(11484, 934220, 40),
(11484, 936941, 11),
(11484, 952837, 13),
(11484, 969201, 11),
(11484, 970240, 1),
(11484, 970836, 19),
(11484, 972107, 1),
(11484, 990474, 6);
插入 4_inv_w
(w_id
、s_id
、pos
)值
(10138, 7201, 27),
(10138, 18772, 3),
(10138, 30001, 6),
(10138, 42089, 4),
(10138, 42089, 14),
(10138, 42234, 4),
(10138, 49383, 5),
(10138, 54795, 18),
(10138, 57564, 23),
(10138, 88542, 7),
(10138, 93027, 10),
(10138, 101097, 21),
(10138, 134312, 12),
(10138, 139116, 33),
(10138, 139522, 6),
(10138, 159109, 7),
(10138, 159109, 16),
(10138, 161497, 21),
(10138, 163948, 2),
(10138, 165301, 20),
(10138, 166478, 21),
(10138, 183452, 6),
(10138, 184390, 20),
(10138, 189930, 25),
(10138, 201629, 9),
(10138, 204590, 4),
(10138, 211374, 5),
(10138, 216483, 14),
(10138, 223617, 5),
(10138, 233652, 12),
(10138, 236571, 11),
(10138, 241302, 8),
(10138, 246485, 10),
(10138, 256910, 16),
(10138, 262349, 3),
(10138, 262925, 5),
(10138, 267047, 28),
(10138, 291988, 18),
(10138, 292680, 22),
(10138, 294814, 32),
(10138, 326917, 6),
(10138, 330019, 12),
(10138, 333411, 35),
(10138, 337880, 5),
(10138, 342003, 13),
(10138, 355325, 12),
(10138, 356409, 13),
(10138, 363795, 5),
(10138, 365735, 26),
(10138, 376570, 25),
(10138, 378214, 10),
(10138, 379159, 11),
(10138, 379236, 4),
(10138, 379533, 2),
(10138, 388753, 8),
(10138, 420633, 18),
(10138, 433121, 5),
(10138, 434645, 10),
(10138, 435895, 3),
(10138, 455575, 5),
(10138, 461900, 23),
(10138, 464040, 6),
(10138, 466657, 6),
(10138, 469848, 11),
(10138, 475569, 17),
(10138, 482701, 41),
(10138, 527708, 29),
(10138, 527708, 16),
(10138, 529426, 7),
(10138, 530753, 10),
(10138, 538071, 27),
(10138, 542685, 10),
(10138, 553742, 22),
(10138, 553742, 13),
(10138, 557216, 4),
(10138, 563747, 9),
(10138, 564716, 4),
(10138, 569146, 7),
(10138, 578368, 3),
(10138, 581713, 9),
(10138, 595890, 9),
(10138, 599015, 5),
(10138, 608570, 30),
(10138, 610218, 11),
(10138, 610218, 2),
(10138, 612099, 9),
(10138, 612568, 14),
(10138, 612894, 9),
(10138, 615361, 19),
(10138, 618001, 14),
(10138, 624969, 7),
(10138, 628252, 16),
(10138, 628635, 12),
(10138, 635977, 10),
(10138, 643675, 8),
(10138, 650487, 9),
(10138, 651489, 3),
(10138, 657552, 18),
(10138, 672884, 12),
(10138, 677130, 2),
(10138, 678841, 7),
(10138, 678841, 26),
(10138, 682904, 4),
(10138, 691251, 19),
(10138, 706325, 9),
(10138, 714680, 45),
(10138, 717460, 5),
(10138, 717489, 11),
(10138, 722393, 5),
(10138, 729972, 12),
(10138, 735745, 12),
(10138, 738334, 7),
(10138, 740791, 21),
(10138, 775696, 8),
(10138, 776984, 16),
(10138, 786073, 31),
(10138, 793185, 17),
(10138, 821475, 4),
(10138, 835234, 7),
(10138, 842713, 3),
(10138, 842730, 8),
(10138, 847372, 9),
(10138, 849612, 20),
(10138, 861768, 26),
(10138, 864231, 6),
(10138, 865927, 7),
(10138, 873939, 7),
(10138, 883591, 29),
(10138, 884260, 19),
(10138, 894952, 17),
(10138, 898453, 19),
(10138, 899290, 4),
(10138, 909225, 29),
(10138, 910173, 4),
(10138, 922447, 2),
(10138, 939319, 2),
(10138, 956278, 4),
(10138, 967342, 18),
(10138, 977090, 3),
(10138, 991346, 31),
(10138, 991346, 40);
插入 5_sentences
(s_id
, sentence
) 值
(2671, 'Hrímnir|nken-s frá|aþ Hrafnagili|nkeþ-s Glæsilegasti|lkenve hestur|nken aldar|nvee !|!');
插入 5_sentences
(s_id
, sentence
) 值
(7201, 'Hann|fpken heilsar|sfg3en öllum|fokfþ nema|c Braga|nkeþ-s sem|ct nú|aa dregur|sfg3en í|ao land|nheo og|c vill|sfg3en friðmælast|snm við|ao Loka|nkeo-s með|aþ loforði|nheþ um|ao góðar|lvfosf gjafir|nvfo ,|, sverð|nhfo ,|, hest|nkeo og|c hring|nkeo en|c hann|fpken svarar|sfg3en bara|aa með|aþ illu|lheþsf .|.');
任何数据样本与预期结果一起都会非常有帮助。
到目前为止你可以从这个尝试开始:
http://sqlfiddle.com/#!9/7110ef/4
SELECT w.word_form,
w3.w_id,
w4.s_id,
w4.pos,
s.sentence
FROM `2_wordform` w
INNER JOIN `3_words` w3
ON w3.`word` = w.keyword
INNER JOIN `4_inv_w` w4
ON w4.w_id = w3.w_id
INNER JOIN `5_sentences` s
ON s.s_id = w4.s_id
WHERE w.keyword like '%hestur%'
如果你只需要一句话你可以:
SELECT DISTINCT s.sentence
FROM `2_wordform` w
INNER JOIN `3_words` w3
ON w3.`word` = w.keyword
INNER JOIN `4_inv_w` w4
ON w4.w_id = w3.w_id
INNER JOIN `5_sentences` s
ON s.s_id = w4.s_id
WHERE w.keyword like '%hestur%'