使用覆盖索引优化查询
Optimization of query using covering indices
我有以下带有子查询和自连接的查询:
-- Returns patient_sid (as sid) once per matching pair of rows:
--   * an attribute-36 row of clinical_data, and
--   * an attribute-33 row with string_value '2160-0' of the same patient
--     whose lft/rgt range contains the attribute-36 row's lft/rgt.
SELECT bucket.patient_sid AS sid
FROM (
    SELECT
        clinical_data.patient_sid,
        clinical_data.lft,
        clinical_data.rgt
    FROM clinical_data
    INNER JOIN (
        -- Anchor rows: attribute 33 carrying the requested string value.
        SELECT
            clinical_data.patient_sid,
            clinical_data.lft,
            clinical_data.rgt,
            clinical_data.attribute_id
        FROM clinical_data
        WHERE clinical_data.attribute_id = '33'
          AND clinical_data.string_value = '2160-0'
    ) AS attribute
        ON clinical_data.patient_sid = attribute.patient_sid
       AND clinical_data.lft >= attribute.lft
       AND clinical_data.rgt <= attribute.rgt
    WHERE clinical_data.attribute_id = '36'
) AS bucket;
我在此定义了以下索引:
KEY `idx_bucket` (`attribute_id`,`string_value`)
KEY `idx_self_join` (`patient_sid`,`attribute_id`,`lft`,`rgt`)
当我用 EXPLAIN 查看该查询时,使用覆盖索引 idx_bucket 的子查询显然已经得到了优化,但自连接和 WHERE 子句却没有。此外,为什么它报告只有 patient_sid 和 attribute_id 用于 used_key_parts,而 lft 和 rgt 却出现在 attached_condition 中(这是什么意思?)?lft 和 rgt 都只是定义为没有特殊属性的整数,那么为什么我的覆盖索引没有用到它们?
更奇怪的是当我定义
KEY `idx_self_join` (`patient_sid`,`lft`,`rgt`,`attribute_id`)
只有 patient_sid 出现在 used_key_parts 中,而且 filtered 从 11.00% 下降到了 1.60%!
{
"query_block": {
"select_id": 1,
"cost_info": {
"query_cost": "645186.71"
},
"nested_loop": [
{
"table": {
"table_name": "clinical_data",
"access_type": "ref",
"possible_keys": [
"fk_attribute_idx",
"idx_value_string",
"idx_value_double",
"idx_bucket",
"idx_self_join_idx"
],
"key": "idx_bucket",
"used_key_parts": [
"attribute_id",
"string_value"
],
"key_length": "308",
"ref": [
"const",
"const"
],
"rows_examined_per_scan": 126402,
"rows_produced_per_join": 126402,
"filtered": "100.00",
"cost_info": {
"read_cost": "126402.00",
"eval_cost": "25280.40",
"prefix_cost": "151682.40",
"data_read_per_join": "46M"
},
"used_columns": [
"patient_sid",
"string_value",
"attribute_id",
"lft",
"rgt"
],
"attached_condition": "(`ns_large2`.`clinical_data`.`patient_sid` is not null)"
}
},
{
"table": {
"table_name": "clinical_data",
"access_type": "ref",
"possible_keys": [
"fk_attribute_idx",
"idx_value_string",
"idx_value_double",
"idx_bucket",
"idx_self_join_idx"
],
"key": "idx_self_join_idx",
"used_key_parts": [
"attribute_id",
"patient_sid"
],
"key_length": "10",
"ref": [
"const",
"ns_large2.clinical_data.patient_sid"
],
"rows_examined_per_scan": 14,
"rows_produced_per_join": 201169,
"filtered": "11.11",
"using_index": true,
"cost_info": {
"read_cost": "131327.39",
"eval_cost": "40233.83",
"prefix_cost": "645186.71",
"data_read_per_join": "73M"
},
"used_columns": [
"patient_sid",
"attribute_id",
"lft",
"rgt"
],
"attached_condition": "((`ns_large2`.`clinical_data`.`lft` >= `ns_large2`.`clinical_data`.`lft`) and (`ns_large2`.`clinical_data`.`rgt` <= `ns_large2`.`clinical_data`.`rgt`))"
}
}
]
}
}
"used_columns" 表明该索引是"覆盖"(covering)索引("using_index": true 也说明了这一点)。最后一个表的 "used_key_parts" 并未包含索引的全部列,因为其余的列(lft、rgt)用在范围("range")比较中,而不是 '=' 比较中。
去掉外层查询:
-- Same join without the outer wrapper query.
-- The derived table "attribute" holds the attribute-33 rows whose string_value
-- is '2160-0'; each attribute-36 row of the same patient whose lft/rgt lies
-- within an "attribute" row's lft/rgt is returned (once per matching pair).
SELECT
    clinical_data.patient_sid,
    clinical_data.lft,
    clinical_data.rgt
FROM clinical_data
INNER JOIN (
    SELECT
        clinical_data.patient_sid,
        clinical_data.lft,
        clinical_data.rgt,
        clinical_data.attribute_id
    FROM clinical_data
    WHERE clinical_data.attribute_id = '33'
      AND clinical_data.string_value = '2160-0'
) AS attribute
    ON clinical_data.patient_sid = attribute.patient_sid
   AND clinical_data.lft >= attribute.lft
   AND clinical_data.rgt <= attribute.rgt
WHERE clinical_data.attribute_id = '36'
抱歉,lft-rgt 架构效率不高。
这是您的基本 JOIN:
-- Skeleton of the self-join on clinical_data:
--   cd1 = attribute-36 rows, cd2 = attribute-33 rows of the same patient.
-- The string_value and lft/rgt conditions are intentionally not part of this
-- basic form. The select list projects the patient id, because a SELECT
-- needs at least one select expression to be valid SQL.
SELECT
    cd1.patient_sid AS sid
FROM clinical_data AS cd1
INNER JOIN clinical_data AS cd2
    ON cd1.patient_sid = cd2.patient_sid
   AND cd2.attribute_id = '33'
WHERE cd1.attribute_id = '36'
这是我最终得到的查询:
-- Flat self-join, no derived tables.
--   cd2 = anchor rows: attribute 33 with string_value '2160-0'
--   cd1 = attribute-36 rows of the same patient whose lft/rgt lies within
--         the anchor row's lft/rgt
-- One sid is returned per matching (cd1, cd2) pair.
SELECT cd1.patient_sid AS sid
FROM clinical_data AS cd1
INNER JOIN clinical_data AS cd2
    ON cd2.patient_sid = cd1.patient_sid
   AND cd2.lft <= cd1.lft
   AND cd2.rgt >= cd1.rgt
WHERE cd2.attribute_id = '33'
  AND cd2.string_value = '2160-0'
  AND cd1.attribute_id = '36'
我有以下带有子查询和自连接的查询:
-- Returns patient_sid (as sid) once per matching pair of rows:
--   * an attribute-36 row of clinical_data, and
--   * an attribute-33 row with string_value '2160-0' of the same patient
--     whose lft/rgt range contains the attribute-36 row's lft/rgt.
SELECT bucket.patient_sid AS sid
FROM (
    SELECT
        clinical_data.patient_sid,
        clinical_data.lft,
        clinical_data.rgt
    FROM clinical_data
    INNER JOIN (
        -- Anchor rows: attribute 33 carrying the requested string value.
        SELECT
            clinical_data.patient_sid,
            clinical_data.lft,
            clinical_data.rgt,
            clinical_data.attribute_id
        FROM clinical_data
        WHERE clinical_data.attribute_id = '33'
          AND clinical_data.string_value = '2160-0'
    ) AS attribute
        ON clinical_data.patient_sid = attribute.patient_sid
       AND clinical_data.lft >= attribute.lft
       AND clinical_data.rgt <= attribute.rgt
    WHERE clinical_data.attribute_id = '36'
) AS bucket;
我在此定义了以下索引:
KEY `idx_bucket` (`attribute_id`,`string_value`)
KEY `idx_self_join` (`patient_sid`,`attribute_id`,`lft`,`rgt`)
当我用 EXPLAIN 查看该查询时,使用覆盖索引 idx_bucket 的子查询显然已经得到了优化,但自连接和 WHERE 子句却没有。此外,为什么它报告只有 patient_sid 和 attribute_id 用于 used_key_parts,而 lft 和 rgt 却出现在 attached_condition 中(这是什么意思?)?lft 和 rgt 都只是定义为没有特殊属性的整数,那么为什么我的覆盖索引没有用到它们?
更奇怪的是当我定义
KEY `idx_self_join` (`patient_sid`,`lft`,`rgt`,`attribute_id`)
只有 patient_sid 出现在 used_key_parts 中,而且 filtered 从 11.00% 下降到了 1.60%!
{
"query_block": {
"select_id": 1,
"cost_info": {
"query_cost": "645186.71"
},
"nested_loop": [
{
"table": {
"table_name": "clinical_data",
"access_type": "ref",
"possible_keys": [
"fk_attribute_idx",
"idx_value_string",
"idx_value_double",
"idx_bucket",
"idx_self_join_idx"
],
"key": "idx_bucket",
"used_key_parts": [
"attribute_id",
"string_value"
],
"key_length": "308",
"ref": [
"const",
"const"
],
"rows_examined_per_scan": 126402,
"rows_produced_per_join": 126402,
"filtered": "100.00",
"cost_info": {
"read_cost": "126402.00",
"eval_cost": "25280.40",
"prefix_cost": "151682.40",
"data_read_per_join": "46M"
},
"used_columns": [
"patient_sid",
"string_value",
"attribute_id",
"lft",
"rgt"
],
"attached_condition": "(`ns_large2`.`clinical_data`.`patient_sid` is not null)"
}
},
{
"table": {
"table_name": "clinical_data",
"access_type": "ref",
"possible_keys": [
"fk_attribute_idx",
"idx_value_string",
"idx_value_double",
"idx_bucket",
"idx_self_join_idx"
],
"key": "idx_self_join_idx",
"used_key_parts": [
"attribute_id",
"patient_sid"
],
"key_length": "10",
"ref": [
"const",
"ns_large2.clinical_data.patient_sid"
],
"rows_examined_per_scan": 14,
"rows_produced_per_join": 201169,
"filtered": "11.11",
"using_index": true,
"cost_info": {
"read_cost": "131327.39",
"eval_cost": "40233.83",
"prefix_cost": "645186.71",
"data_read_per_join": "73M"
},
"used_columns": [
"patient_sid",
"attribute_id",
"lft",
"rgt"
],
"attached_condition": "((`ns_large2`.`clinical_data`.`lft` >= `ns_large2`.`clinical_data`.`lft`) and (`ns_large2`.`clinical_data`.`rgt` <= `ns_large2`.`clinical_data`.`rgt`))"
}
}
]
}
}
"used_columns" 表明该索引是"覆盖"(covering)索引("using_index": true 也说明了这一点)。最后一个表的 "used_key_parts" 并未包含索引的全部列,因为其余的列(lft、rgt)用在范围("range")比较中,而不是 '=' 比较中。
去掉外层查询:
-- Same join without the outer wrapper query.
-- The derived table "attribute" holds the attribute-33 rows whose string_value
-- is '2160-0'; each attribute-36 row of the same patient whose lft/rgt lies
-- within an "attribute" row's lft/rgt is returned (once per matching pair).
SELECT
    clinical_data.patient_sid,
    clinical_data.lft,
    clinical_data.rgt
FROM clinical_data
INNER JOIN (
    SELECT
        clinical_data.patient_sid,
        clinical_data.lft,
        clinical_data.rgt,
        clinical_data.attribute_id
    FROM clinical_data
    WHERE clinical_data.attribute_id = '33'
      AND clinical_data.string_value = '2160-0'
) AS attribute
    ON clinical_data.patient_sid = attribute.patient_sid
   AND clinical_data.lft >= attribute.lft
   AND clinical_data.rgt <= attribute.rgt
WHERE clinical_data.attribute_id = '36'
抱歉,lft-rgt 架构效率不高。
这是您的基本 JOIN:
-- Skeleton of the self-join on clinical_data:
--   cd1 = attribute-36 rows, cd2 = attribute-33 rows of the same patient.
-- The string_value and lft/rgt conditions are intentionally not part of this
-- basic form. The select list projects the patient id, because a SELECT
-- needs at least one select expression to be valid SQL.
SELECT
    cd1.patient_sid AS sid
FROM clinical_data AS cd1
INNER JOIN clinical_data AS cd2
    ON cd1.patient_sid = cd2.patient_sid
   AND cd2.attribute_id = '33'
WHERE cd1.attribute_id = '36'
这是我最终得到的查询:
-- Flat self-join, no derived tables.
--   cd2 = anchor rows: attribute 33 with string_value '2160-0'
--   cd1 = attribute-36 rows of the same patient whose lft/rgt lies within
--         the anchor row's lft/rgt
-- One sid is returned per matching (cd1, cd2) pair.
SELECT cd1.patient_sid AS sid
FROM clinical_data AS cd1
INNER JOIN clinical_data AS cd2
    ON cd2.patient_sid = cd1.patient_sid
   AND cd2.lft <= cd1.lft
   AND cd2.rgt >= cd1.rgt
WHERE cd2.attribute_id = '33'
  AND cd2.string_value = '2160-0'
  AND cd1.attribute_id = '36'