Snowflake:提升 IP 与网段(Network)连接查询的性能
Snowflake: improving the performance of joining IPs to networks
我想查出一批 IP 所属的国家名称,问题是这个连接(JOIN)的执行时间非常长。
我的思路如下:
-- Resolve every IP in UNIQUE_IP_NUMBER to a country name by range-joining it
-- against the network blocks.
WITH IP_GEO_CITY AS (
    -- One row per network block: the integer bounds of the block (taken from
    -- PARSE_IP) together with the country name of its location.
    SELECT
        AS_INTEGER(PARSE_IP(A.NETWORK, 'INET'):ipv4_range_start) AS IP_START,
        AS_INTEGER(PARSE_IP(A.NETWORK, 'INET'):ipv4_range_end)   AS IP_END,
        B.COUNTRY_NAME
    FROM GEOLITE_CITY_BLOCK_IPV4 AS A
    LEFT JOIN GEOLITE_LOCATIONS AS B
        ON A.GEONAME_ID = B.GEONAME_ID
    ORDER BY IP_START
)
SELECT
    A.UNIQUE_IP_NUMBER,
    B.COUNTRY_NAME
FROM UNIQUE_IP_NUMBER AS A
LEFT JOIN IP_GEO_CITY AS B
    -- Inclusive range predicate: the IP lies within [IP_START, IP_END].
    ON  A.UNIQUE_IP_NUMBER >= B.IP_START
    AND A.UNIQUE_IP_NUMBER <= B.IP_END
使用下面这种 LEFT JOIN 写法,我得到相同的结果:
-- Same lookup without a CTE: the block bounds are computed inline inside the
-- join predicate, then the location table supplies the country name.
SELECT
    A.UNIQUE_IP_NUMBER,
    C.COUNTRY_NAME
FROM UNIQUE_IP_NUMBER AS A
LEFT JOIN GEOLITE_CITY_BLOCK_IPV4 AS B
    ON  A.UNIQUE_IP_NUMBER >= AS_INTEGER(PARSE_IP(B.NETWORK, 'INET'):ipv4_range_start)
    AND A.UNIQUE_IP_NUMBER <= AS_INTEGER(PARSE_IP(B.NETWORK, 'INET'):ipv4_range_end)
LEFT JOIN GEOLITE_LOCATIONS AS C
    ON B.GEONAME_ID = C.GEONAME_ID
我认为问题出在 ON 条件上:
ON
A.UNIQUE_IP_NUMBER >= B.IP_START AND
A.UNIQUE_IP_NUMBER <= B.IP_END
但我不知道如何避免这种情况。
GEOLITE_CITY_BLOCK_IPV4 可以包含如下行:
NETWORK GEONAME_ID REGISTERED_COUNTRY_GEONAME_ID REPRESENTED_COUNTRY_GEONAME_ID IS_ANONYMOUS_PROXY IS_SATELLITE_PROVIDER ETL_ID ETL_TIMESTAMP FILENAME
1.0.0.0/24 2077456 2077456 0 0 2019-10-25 00:00:00.000000000 2019-10-25 08:39:19.000000000 GeoLite2-Country-CSV_20191022/GeoLite2-Country-Blocks-IPv4.csv
1.0.1.0/24 1814991 1814991 0 0 2019-10-25 00:00:00.000000000 2019-10-25 08:39:19.000000000 GeoLite2-Country-CSV_20191022/GeoLite2-Country-Blocks-IPv4.csv
1.0.2.0/23 1814991 1814991 0 0 2019-10-25 00:00:00.000000000 2019-10-25 08:39:19.000000000 GeoLite2-Country-CSV_20191022/GeoLite2-Country-Blocks-IPv4.csv
1.0.4.0/22 2077456 2077456 0 0 2019-10-25 00:00:00.000000000 2019-10-25 08:39:19.000000000 GeoLite2-Country-CSV_20191022/GeoLite2-Country-Blocks-IPv4.csv
1.0.8.0/21 1814991 1814991 0 0 2019-10-25 00:00:00.000000000 2019-10-25 08:39:19.000000000 GeoLite2-Country-CSV_20191022/GeoLite2-Country-Blocks-IPv4.csv
1.0.16.0/20 1861060 1861060 0 0 2019-10-25 00:00:00.000000000 2019-10-25 08:39:19.000000000 GeoLite2-Country-CSV_20191022/GeoLite2-Country-Blocks-IPv4.csv
1.0.32.0/19 1814991 1814991 0 0 2019-10-25 00:00:00.000000000 2019-10-25 08:39:19.000000000 GeoLite2-Country-CSV_20191022/GeoLite2-Country-Blocks-IPv4.csv
1.0.64.0/18 1861060 1861060 0 0 2019-10-25 00:00:00.000000000 2019-10-25 08:39:19.000000000 GeoLite2-Country-CSV_20191022/GeoLite2-Country-Blocks-IPv4.csv
1.0.128.0/17 1605651 1605651 0 0 2019-10-25 00:00:00.000000000 2019-10-25 08:39:19.000000000 GeoLite2-Country-CSV_20191022/GeoLite2-Country-Blocks-IPv4.csv
1.1.0.0/24 1814991 1814991 0 0 2019-10-25 00:00:00.000000000 2019-10-25 08:39:19.000000000 GeoLite2-Country-CSV_20191022/GeoLite2-Country-Blocks-IPv4.csv
UNIQUE_IP_NUMBER 是把普通 IP 地址转换成整数后的值。
Snowflake 在处理像您这样的范围扫描(范围连接)时往往表现不佳:
ON
A.UNIQUE_IP_NUMBER >= B.IP_START AND
A.UNIQUE_IP_NUMBER <= B.IP_END
我们发现 BETWEEN 在某些情况下有帮助,而您的匹配正是一个可能因此获得更好性能的例子。这也是 PARSE_IP 帮助文档中使用的写法:
ON A.UNIQUE_IP_NUMBER BETWEEN B.IP_START AND B.IP_END
我们甚至发现,使用 BETWEEN 并在末尾追加一个 < 条件,比使用 >= 与 < 的组合更快:
ON A.UNIQUE_IP_NUMBER BETWEEN B.IP_START AND B.IP_END
AND B.IP_START < B.IP_END
但我们发现 Snowflake 真正擅长的是等值连接(equi join)。按如下方式把数据扩大 32 倍之后,性能反而快了很多,即使是 10 亿行连接 1 亿行的情况也是如此。
-- Resolve every IP to a country name with an equi-join instead of a range join.
--
-- Each IP is expanded into one candidate key per prefix length (/32 .. /1):
--     lookup_key = (prefix_length << 32) | (ip & netmask)
-- Each network block gets the same kind of key, built from its own prefix
-- length and its first address, so a candidate key equals a block key only
-- when the IP lies inside that block.
WITH ip_geo_city AS (
    SELECT
        PARSE_IP(a.network, 'INET') AS ip,
        -- The key must use the block's FIRST address: masking an IP with the
        -- netmask yields the start of the range, never its end.
        AS_INTEGER(ip:ipv4_range_start) AS ip_start,
        AS_INTEGER(ip:netmask_prefix_length) AS ip_netlenmask,
        BITOR(BITSHIFTLEFT(ip_netlenmask, 32), ip_start) AS lookup_key,
        b.country_name
    FROM geolite_city_block_ipv4 AS a
    LEFT JOIN geolite_locations AS b
        ON a.geoname_id = b.geoname_id
    -- No ORDER BY here: row order inside a CTE does not affect the join result.
),
ipv4_masks AS (
    -- 32 rows: host_bits 0 .. 31, i.e. prefix lengths /32 down to /1.
    SELECT
        ROW_NUMBER() OVER (ORDER BY TRUE) - 1 AS host_bits,
        32 - host_bits AS net_len,
        -- Two's complement: -(2^host_bits) has its low host_bits bits cleared;
        -- AND-ing with 0xFFFFFFFF keeps only the 32 IPv4 bits.
        BITAND(4294967295, -BITSHIFTLEFT(1, host_bits)) AS net_mask
    FROM TABLE(GENERATOR(ROWCOUNT => 32))
),
unique_ip_number_lookups AS (
    -- Deliberate cross join: every IP gets one candidate key per prefix length.
    SELECT
        a.unique_ip_number,
        BITOR(
            BITSHIFTLEFT(m.net_len, 32),
            BITAND(m.net_mask, a.unique_ip_number)
        ) AS lookup_key
    FROM unique_ip_number AS a
    CROSS JOIN ipv4_masks AS m
),
matched_networks AS (
    -- Keep only the candidate keys that correspond to an actual network block.
    SELECT
        l.unique_ip_number,
        g.country_name
    FROM unique_ip_number_lookups AS l
    INNER JOIN ip_geo_city AS g
        ON l.lookup_key = g.lookup_key
)
-- Join back to the IP list so that IPs without a matching block are kept
-- (with a NULL country) and the 32 candidate rows per IP do not leak out.
-- NOTE(review): assumes unique_ip_number values are distinct and network
-- blocks do not overlap, so each IP yields at most one row -- confirm.
SELECT
    a.unique_ip_number,
    b.country_name
FROM unique_ip_number AS a
LEFT JOIN matched_networks AS b
    ON a.unique_ip_number = b.unique_ip_number
我想查出一批 IP 所属的国家名称,问题是这个连接(JOIN)的执行时间非常长。我的思路如下:
-- Resolve every IP in UNIQUE_IP_NUMBER to a country name by range-joining it
-- against the network blocks.
WITH IP_GEO_CITY AS (
    -- One row per network block: the integer bounds of the block (taken from
    -- PARSE_IP) together with the country name of its location.
    SELECT
        AS_INTEGER(PARSE_IP(A.NETWORK, 'INET'):ipv4_range_start) AS IP_START,
        AS_INTEGER(PARSE_IP(A.NETWORK, 'INET'):ipv4_range_end)   AS IP_END,
        B.COUNTRY_NAME
    FROM GEOLITE_CITY_BLOCK_IPV4 AS A
    LEFT JOIN GEOLITE_LOCATIONS AS B
        ON A.GEONAME_ID = B.GEONAME_ID
    ORDER BY IP_START
)
SELECT
    A.UNIQUE_IP_NUMBER,
    B.COUNTRY_NAME
FROM UNIQUE_IP_NUMBER AS A
LEFT JOIN IP_GEO_CITY AS B
    -- Inclusive range predicate: the IP lies within [IP_START, IP_END].
    ON  A.UNIQUE_IP_NUMBER >= B.IP_START
    AND A.UNIQUE_IP_NUMBER <= B.IP_END
使用下面这种 LEFT JOIN 写法,我得到相同的结果:
-- Same lookup without a CTE: the block bounds are computed inline inside the
-- join predicate, then the location table supplies the country name.
SELECT
    A.UNIQUE_IP_NUMBER,
    C.COUNTRY_NAME
FROM UNIQUE_IP_NUMBER AS A
LEFT JOIN GEOLITE_CITY_BLOCK_IPV4 AS B
    ON  A.UNIQUE_IP_NUMBER >= AS_INTEGER(PARSE_IP(B.NETWORK, 'INET'):ipv4_range_start)
    AND A.UNIQUE_IP_NUMBER <= AS_INTEGER(PARSE_IP(B.NETWORK, 'INET'):ipv4_range_end)
LEFT JOIN GEOLITE_LOCATIONS AS C
    ON B.GEONAME_ID = C.GEONAME_ID
我认为问题出在 ON 条件上:
ON
A.UNIQUE_IP_NUMBER >= B.IP_START AND
A.UNIQUE_IP_NUMBER <= B.IP_END
但我不知道如何避免这种情况。
GEOLITE_CITY_BLOCK_IPV4 可以包含如下行:
NETWORK GEONAME_ID REGISTERED_COUNTRY_GEONAME_ID REPRESENTED_COUNTRY_GEONAME_ID IS_ANONYMOUS_PROXY IS_SATELLITE_PROVIDER ETL_ID ETL_TIMESTAMP FILENAME
1.0.0.0/24 2077456 2077456 0 0 2019-10-25 00:00:00.000000000 2019-10-25 08:39:19.000000000 GeoLite2-Country-CSV_20191022/GeoLite2-Country-Blocks-IPv4.csv
1.0.1.0/24 1814991 1814991 0 0 2019-10-25 00:00:00.000000000 2019-10-25 08:39:19.000000000 GeoLite2-Country-CSV_20191022/GeoLite2-Country-Blocks-IPv4.csv
1.0.2.0/23 1814991 1814991 0 0 2019-10-25 00:00:00.000000000 2019-10-25 08:39:19.000000000 GeoLite2-Country-CSV_20191022/GeoLite2-Country-Blocks-IPv4.csv
1.0.4.0/22 2077456 2077456 0 0 2019-10-25 00:00:00.000000000 2019-10-25 08:39:19.000000000 GeoLite2-Country-CSV_20191022/GeoLite2-Country-Blocks-IPv4.csv
1.0.8.0/21 1814991 1814991 0 0 2019-10-25 00:00:00.000000000 2019-10-25 08:39:19.000000000 GeoLite2-Country-CSV_20191022/GeoLite2-Country-Blocks-IPv4.csv
1.0.16.0/20 1861060 1861060 0 0 2019-10-25 00:00:00.000000000 2019-10-25 08:39:19.000000000 GeoLite2-Country-CSV_20191022/GeoLite2-Country-Blocks-IPv4.csv
1.0.32.0/19 1814991 1814991 0 0 2019-10-25 00:00:00.000000000 2019-10-25 08:39:19.000000000 GeoLite2-Country-CSV_20191022/GeoLite2-Country-Blocks-IPv4.csv
1.0.64.0/18 1861060 1861060 0 0 2019-10-25 00:00:00.000000000 2019-10-25 08:39:19.000000000 GeoLite2-Country-CSV_20191022/GeoLite2-Country-Blocks-IPv4.csv
1.0.128.0/17 1605651 1605651 0 0 2019-10-25 00:00:00.000000000 2019-10-25 08:39:19.000000000 GeoLite2-Country-CSV_20191022/GeoLite2-Country-Blocks-IPv4.csv
1.1.0.0/24 1814991 1814991 0 0 2019-10-25 00:00:00.000000000 2019-10-25 08:39:19.000000000 GeoLite2-Country-CSV_20191022/GeoLite2-Country-Blocks-IPv4.csv
UNIQUE_IP_NUMBER 是把普通 IP 地址转换成整数后的值。
Snowflake 在处理像您这样的范围扫描(范围连接)时往往表现不佳:
ON
A.UNIQUE_IP_NUMBER >= B.IP_START AND
A.UNIQUE_IP_NUMBER <= B.IP_END
我们发现 BETWEEN 在某些情况下有帮助,而您的匹配正是一个可能因此获得更好性能的例子。这也是 PARSE_IP 帮助文档中使用的写法:
ON A.UNIQUE_IP_NUMBER BETWEEN B.IP_START AND B.IP_END
我们甚至发现,使用 BETWEEN 并在末尾追加一个 < 条件,比使用 >= 与 < 的组合更快:
ON A.UNIQUE_IP_NUMBER BETWEEN B.IP_START AND B.IP_END
AND B.IP_START < B.IP_END
但我们发现 Snowflake 真正擅长的是等值连接(equi join)。按如下方式把数据扩大 32 倍之后,性能反而快了很多,即使是 10 亿行连接 1 亿行的情况也是如此。
-- Resolve every IP to a country name with an equi-join instead of a range join.
--
-- Each IP is expanded into one candidate key per prefix length (/32 .. /1):
--     lookup_key = (prefix_length << 32) | (ip & netmask)
-- Each network block gets the same kind of key, built from its own prefix
-- length and its first address, so a candidate key equals a block key only
-- when the IP lies inside that block.
WITH ip_geo_city AS (
    SELECT
        PARSE_IP(a.network, 'INET') AS ip,
        -- The key must use the block's FIRST address: masking an IP with the
        -- netmask yields the start of the range, never its end.
        AS_INTEGER(ip:ipv4_range_start) AS ip_start,
        AS_INTEGER(ip:netmask_prefix_length) AS ip_netlenmask,
        BITOR(BITSHIFTLEFT(ip_netlenmask, 32), ip_start) AS lookup_key,
        b.country_name
    FROM geolite_city_block_ipv4 AS a
    LEFT JOIN geolite_locations AS b
        ON a.geoname_id = b.geoname_id
    -- No ORDER BY here: row order inside a CTE does not affect the join result.
),
ipv4_masks AS (
    -- 32 rows: host_bits 0 .. 31, i.e. prefix lengths /32 down to /1.
    SELECT
        ROW_NUMBER() OVER (ORDER BY TRUE) - 1 AS host_bits,
        32 - host_bits AS net_len,
        -- Two's complement: -(2^host_bits) has its low host_bits bits cleared;
        -- AND-ing with 0xFFFFFFFF keeps only the 32 IPv4 bits.
        BITAND(4294967295, -BITSHIFTLEFT(1, host_bits)) AS net_mask
    FROM TABLE(GENERATOR(ROWCOUNT => 32))
),
unique_ip_number_lookups AS (
    -- Deliberate cross join: every IP gets one candidate key per prefix length.
    SELECT
        a.unique_ip_number,
        BITOR(
            BITSHIFTLEFT(m.net_len, 32),
            BITAND(m.net_mask, a.unique_ip_number)
        ) AS lookup_key
    FROM unique_ip_number AS a
    CROSS JOIN ipv4_masks AS m
),
matched_networks AS (
    -- Keep only the candidate keys that correspond to an actual network block.
    SELECT
        l.unique_ip_number,
        g.country_name
    FROM unique_ip_number_lookups AS l
    INNER JOIN ip_geo_city AS g
        ON l.lookup_key = g.lookup_key
)
-- Join back to the IP list so that IPs without a matching block are kept
-- (with a NULL country) and the 32 candidate rows per IP do not leak out.
-- NOTE(review): assumes unique_ip_number values are distinct and network
-- blocks do not overlap, so each IP yields at most one row -- confirm.
SELECT
    a.unique_ip_number,
    b.country_name
FROM unique_ip_number AS a
LEFT JOIN matched_networks AS b
    ON a.unique_ip_number = b.unique_ip_number