非常大 table JOIN with GROUP BY
Very large table JOIN with GROUP BY
我需要将来自 2700 万行 table 的信息与 700 万行 table 的信息结合起来并进行一些过滤。
-- Participation records: which person took part in which event at which place.
-- No PRIMARY KEY is declared; only the three single-column secondary indexes
-- below exist.
CREATE TABLE event_participation (
    place_id          int(4),
    person_id         varchar(12),
    event_id          varchar(10),
    event_description varchar(230)
    -- ... more columns about that specific participation (elided)
) ENGINE=InnoDB DEFAULT CHARSET=utf8;

CREATE INDEX IDX_1 ON event_participation (place_id);
CREATE INDEX IDX_2 ON event_participation (person_id);
CREATE INDEX IDX_3 ON event_participation (event_id);
-- Person data; person_id is the column event_participation joins on.
-- No PRIMARY KEY is declared; person_id only has a plain secondary index.
CREATE TABLE person (
    person_id     varchar(12),
    last_name     varchar(25),
    first_name    varchar(20),
    middle_name   varchar(20),
    person_attr1  varchar(20),
    person_attr2  varchar(20),  -- spelled out because IDX_12 indexes it; type assumed to match its siblings
    -- ... person_attr3 through person_attr49 elided
    person_attr50 varchar(20)   -- last column: no trailing comma before ')'
) ENGINE=InnoDB DEFAULT CHARSET=utf8;

CREATE INDEX IDX_10 ON person (person_id);
CREATE INDEX IDX_11 ON person (person_attr1);
CREATE INDEX IDX_12 ON person (person_attr2);
-- ... further single-column indexes elided
我在查询中使用的所有属性都有索引。
event_participation table 有 2700 万行,table 人有 700 万行。
我需要 运行 这样的查询:
-- People with at least three matching participation rows at place 90 for the
-- listed events, restricted to person_attr1 = 'A' and person_attr2 = 'B'.
-- NOTE: the WHERE predicates on person.* reject the NULL-extended rows, so
-- this LEFT JOIN returns the same rows an INNER JOIN would.
SELECT
    person.last_name,
    person.first_name
FROM event_participation
LEFT JOIN person
    ON person.person_id = event_participation.person_id
WHERE event_participation.place_id = 90
  AND event_participation.event_id IN (
        "event 1", "event 2", "event 3", "event 4",
        "event 5", "event 6", "event 7"
      )
  AND person.person_attr1 = 'A'
  AND person.person_attr2 = 'B'
GROUP BY event_participation.person_id
HAVING COUNT(event_participation.event_id) >= 3
说明是:
*************************** 1. row ***************************
id: 1
select_type: SIMPLE
table: event_participation
type: ref
possible_keys: person_id,event_id,place_id
key: place_id
key_len: 5
ref: const
rows: 6437170
Extra: Using where; Using temporary; Using filesort
*************************** 2. row ***************************
id: 1
select_type: SIMPLE
table: person
type: ref
possible_keys: person_id,person_attr1,person_attr2
key: person_id
key_len: 39
ref: event_participation.person_id
rows: 1
Extra: Using where
我正在寻找参加列表中至少 3 项活动并满足其他一些条件的活跃人士。通常我会修复与事件和 运行 多个查询相关的标准,这些查询仅改变人员属性。
即使对于非常小的子集,此查询也非常慢,因此我寻找替代方法。
我创建了一个缓存 table:
-- Cache table holding one row per qualifying person_id.
CREATE TABLE temp_name (
    person_id VARCHAR(12) PRIMARY KEY
) ENGINE = InnoDB DEFAULT CHARSET = utf8;
运行 查询如:
-- Fill the cache with every person_id that has at least three matching
-- participation rows at place 90 for the listed events.
-- The target column is person_id (temp_name is the table, not a column).
-- GROUP BY already yields one row per person_id, so DISTINCT is not needed.
INSERT INTO temp_name (person_id)
SELECT event_participation.person_id
FROM event_participation
WHERE place_id = 90
  AND event_id IN ("event 1", "event 2", "event 3", "event 4",
                   "event 5", "event 6", "event 7")
GROUP BY event_participation.person_id
HAVING COUNT(event_id) >= 3
然后 运行 过滤查询,例如:
-- Apply the person-attribute filters to the cached person_ids.
-- NOTE: filtering on person.* in WHERE drops unmatched rows, so this
-- LEFT JOIN yields the same result an INNER JOIN would.
SELECT
    person.last_name,
    person.first_name
FROM temp_name
LEFT JOIN person
    ON person.person_id = temp_name.person_id
WHERE person.person_attr1 = 'A'
  AND person.person_attr2 = 'B'
虽然我可以接受最终查询的当前性能,但临时 tables 的创建和管理让我很痛苦。任何建议将不胜感激。
你能试试这个吗:
-- Aggregate the event side first, then join the qualifying ids to person.
-- The person table is aliased as pers, so every outer reference uses pers.
SELECT
    pers.last_name,
    pers.first_name
FROM person AS pers
INNER JOIN (
    -- person_ids with at least three matching participation rows at place 90.
    -- Only event_participation columns are visible here, so the person
    -- attribute filters belong to the outer query.
    SELECT
        person_id,
        COUNT(*) AS event_cnt
    FROM event_participation
    WHERE place_id = 90
      AND event_id IN ("event 1", "event 2", "event 3", "event 4",
                       "event 5", "event 6", "event 7")
    GROUP BY person_id
    HAVING COUNT(*) >= 3
) AS event_count
    ON event_count.person_id = pers.person_id
WHERE pers.person_attr1 = 'A'
  AND pers.person_attr2 = 'B'
更新:我忘了将组依据添加到内部查询。
每个 InnoDB table 都应该有一个 explicit PRIMARY KEY
。我怀疑 person_id
应该是 person
.
的 PK
-- Replace the plain secondary index on person.person_id with a PRIMARY KEY.
-- IDX_10 is the name given by `CREATE INDEX IDX_10 ON person (person_id)`;
-- use the real index name if it differs (the EXPLAIN output lists it as
-- person_id). This fails if person_id contains duplicates or NULLs.
ALTER TABLE person
    DROP INDEX IDX_10,
    ADD PRIMARY KEY (person_id);
这里有什么?...
WHERE person.person_attr1 = 'A'
AND person.person_attr1 = 'B'
是'impossible'。也就是说,您应该没有行。请编辑问题。同时,假设您指的是其中之一的 attr2,这个综合索引可能会有很大帮助:
-- An index cannot span two tables or list a column twice, so the suggestion
-- is split per table.
-- person: both equality filters in one index.
ALTER TABLE person
    ADD INDEX person_attr1_attr2_idx (person_attr1, person_attr2);
-- event_participation: the equality column (place_id) first, then the IN-list
-- column (event_id); person_id is appended so the index also holds the
-- grouping/join column.
ALTER TABLE event_participation
    ADD INDEX event_participation_place_event_person_idx (place_id, event_id, person_id);
LEFT
有什么原因吗?如果不是,请将其删除,以便研究更多优化。
我需要将来自 2700 万行 table 的信息与 700 万行 table 的信息结合起来并进行一些过滤。
-- Participation records: which person took part in which event at which place.
-- No PRIMARY KEY is declared; only the three single-column secondary indexes
-- below exist.
CREATE TABLE event_participation (
    place_id          int(4),
    person_id         varchar(12),
    event_id          varchar(10),
    event_description varchar(230)
    -- ... more columns about that specific participation (elided)
) ENGINE=InnoDB DEFAULT CHARSET=utf8;

CREATE INDEX IDX_1 ON event_participation (place_id);
CREATE INDEX IDX_2 ON event_participation (person_id);
CREATE INDEX IDX_3 ON event_participation (event_id);
-- Person data; person_id is the column event_participation joins on.
-- No PRIMARY KEY is declared; person_id only has a plain secondary index.
CREATE TABLE person (
    person_id     varchar(12),
    last_name     varchar(25),
    first_name    varchar(20),
    middle_name   varchar(20),
    person_attr1  varchar(20),
    person_attr2  varchar(20),  -- spelled out because IDX_12 indexes it; type assumed to match its siblings
    -- ... person_attr3 through person_attr49 elided
    person_attr50 varchar(20)   -- last column: no trailing comma before ')'
) ENGINE=InnoDB DEFAULT CHARSET=utf8;

CREATE INDEX IDX_10 ON person (person_id);
CREATE INDEX IDX_11 ON person (person_attr1);
CREATE INDEX IDX_12 ON person (person_attr2);
-- ... further single-column indexes elided
我在查询中使用的所有属性都有索引。
event_participation table 有 2700 万行,table 人有 700 万行。
我需要 运行 这样的查询:
-- People with at least three matching participation rows at place 90 for the
-- listed events, restricted to person_attr1 = 'A' and person_attr2 = 'B'.
-- NOTE: the WHERE predicates on person.* reject the NULL-extended rows, so
-- this LEFT JOIN returns the same rows an INNER JOIN would.
SELECT
    person.last_name,
    person.first_name
FROM event_participation
LEFT JOIN person
    ON person.person_id = event_participation.person_id
WHERE event_participation.place_id = 90
  AND event_participation.event_id IN (
        "event 1", "event 2", "event 3", "event 4",
        "event 5", "event 6", "event 7"
      )
  AND person.person_attr1 = 'A'
  AND person.person_attr2 = 'B'
GROUP BY event_participation.person_id
HAVING COUNT(event_participation.event_id) >= 3
说明是:
*************************** 1. row ***************************
id: 1
select_type: SIMPLE
table: event_participation
type: ref
possible_keys: person_id,event_id,place_id
key: place_id
key_len: 5
ref: const
rows: 6437170
Extra: Using where; Using temporary; Using filesort
*************************** 2. row ***************************
id: 1
select_type: SIMPLE
table: person
type: ref
possible_keys: person_id,person_attr1,person_attr2
key: person_id
key_len: 39
ref: event_participation.person_id
rows: 1
Extra: Using where
我正在寻找参加列表中至少 3 项活动并满足其他一些条件的活跃人士。通常我会修复与事件和 运行 多个查询相关的标准,这些查询仅改变人员属性。
即使对于非常小的子集,此查询也非常慢,因此我寻找替代方法。 我创建了一个缓存 table:
-- Cache table holding one row per qualifying person_id.
CREATE TABLE temp_name (
    person_id VARCHAR(12) PRIMARY KEY
) ENGINE = InnoDB DEFAULT CHARSET = utf8;
运行 查询如:
-- Fill the cache with every person_id that has at least three matching
-- participation rows at place 90 for the listed events.
-- The target column is person_id (temp_name is the table, not a column).
-- GROUP BY already yields one row per person_id, so DISTINCT is not needed.
INSERT INTO temp_name (person_id)
SELECT event_participation.person_id
FROM event_participation
WHERE place_id = 90
  AND event_id IN ("event 1", "event 2", "event 3", "event 4",
                   "event 5", "event 6", "event 7")
GROUP BY event_participation.person_id
HAVING COUNT(event_id) >= 3
然后 运行 过滤查询,例如:
-- Apply the person-attribute filters to the cached person_ids.
-- NOTE: filtering on person.* in WHERE drops unmatched rows, so this
-- LEFT JOIN yields the same result an INNER JOIN would.
SELECT
    person.last_name,
    person.first_name
FROM temp_name
LEFT JOIN person
    ON person.person_id = temp_name.person_id
WHERE person.person_attr1 = 'A'
  AND person.person_attr2 = 'B'
虽然我可以接受最终查询的当前性能,但临时 tables 的创建和管理让我很痛苦。任何建议将不胜感激。
你能试试这个吗:
-- Aggregate the event side first, then join the qualifying ids to person.
-- The person table is aliased as pers, so every outer reference uses pers.
SELECT
    pers.last_name,
    pers.first_name
FROM person AS pers
INNER JOIN (
    -- person_ids with at least three matching participation rows at place 90.
    -- Only event_participation columns are visible here, so the person
    -- attribute filters belong to the outer query.
    SELECT
        person_id,
        COUNT(*) AS event_cnt
    FROM event_participation
    WHERE place_id = 90
      AND event_id IN ("event 1", "event 2", "event 3", "event 4",
                       "event 5", "event 6", "event 7")
    GROUP BY person_id
    HAVING COUNT(*) >= 3
) AS event_count
    ON event_count.person_id = pers.person_id
WHERE pers.person_attr1 = 'A'
  AND pers.person_attr2 = 'B'
更新:我忘了将组依据添加到内部查询。
每个 InnoDB table 都应该有一个 explicit PRIMARY KEY
。我怀疑 person_id
应该是 person
.
-- Replace the plain secondary index on person.person_id with a PRIMARY KEY.
-- IDX_10 is the name given by `CREATE INDEX IDX_10 ON person (person_id)`;
-- use the real index name if it differs (the EXPLAIN output lists it as
-- person_id). This fails if person_id contains duplicates or NULLs.
ALTER TABLE person
    DROP INDEX IDX_10,
    ADD PRIMARY KEY (person_id);
这里有什么?...
WHERE person.person_attr1 = 'A'
AND person.person_attr1 = 'B'
是'impossible'。也就是说,您应该没有行。请编辑问题。同时,假设您指的是其中之一的 attr2,这个综合索引可能会有很大帮助:
-- An index cannot span two tables or list a column twice, so the suggestion
-- is split per table.
-- person: both equality filters in one index.
ALTER TABLE person
    ADD INDEX person_attr1_attr2_idx (person_attr1, person_attr2);
-- event_participation: the equality column (place_id) first, then the IN-list
-- column (event_id); person_id is appended so the index also holds the
-- grouping/join column.
ALTER TABLE event_participation
    ADD INDEX event_participation_place_event_person_idx (place_id, event_id, person_id);
LEFT
有什么原因吗?如果不是,请将其删除,以便研究更多优化。