Bigquery:将 IP 地址与另一个 table 中的 IP CIDR 匹配
Bigquery: Match IP address to IP CIDR in another table
我参考这篇文章是为了查找国家/地区的 IP 地址:
https://cloud.google.com/blog/products/data-analytics/geolocation-with-bigquery-de-identify-76-million-ip-addresses-in-20-seconds
但是,public 数据集:fh-bigquery.geocode.201806_geolite2_city_ipv4_locs 不再可用。所以我不得不导入自己的查找 table.
我在 Bigquery 中有 2 个 table:
Row| IP_CIDR (String)| Country_Code
-------------------------
1 | 160.181.205.0/24| ME
2 | 197.157.224.0/24| SL
3 | 196.32.200.0/21 | CF
4 | 160.181.224.0/24| SI
...
BQ 中有 40 万行
和
Row| IP (String)
-------------------------
1 | 160.181.205.222
2 | 197.157.224.123
3 | 196.32.200.33
4 | 160.181.224.44
...
在 BQ 中有 10k 行
如何将 IP 地址与另一个 table 中的 IP_CIDR 匹配,这样结果将是:
Row| IP (String) |Country_Code
-------------------------
1 | 160.181.205.222 | ME
2 | 197.157.224.123 | SL
3 | 196.32.200.33 | CF
4 | 160.181.224.44 | SI
...
如果在 IP_CIDR
中找不到 IP,则为 null
这是文章推荐的原创查询:
WITH source_of_ip_addresses AS (
SELECT REGEXP_REPLACE(contributor_ip, 'xxx', '0') ip, COUNT(*) c
FROM `publicdata.samples.wikipedia`
WHERE contributor_ip IS NOT null
GROUP BY 1
)
SELECT country_name, SUM(c) c
FROM (
SELECT ip, country_name, c
FROM (
SELECT *, NET.SAFE_IP_FROM_STRING(ip) & NET.IP_NET_MASK(4, mask) network_bin
FROM source_of_ip_addresses, UNNEST(GENERATE_ARRAY(9,32)) mask
WHERE BYTE_LENGTH(NET.SAFE_IP_FROM_STRING(ip)) = 4
)
JOIN `fh-bigquery.geocode.201806_geolite2_city_ipv4_locs`
USING (network_bin, mask)
)
GROUP BY 1
ORDER BY 2 DESC
您可以使用如下查询
WITH table2 as (
SELECT 1 as Row, '160.181.205.0/24' as IP_CIDRString,'ME' as Country_Code
UNION ALL
SELECT 4 as Row, '160.181.224.0/24' as IP_CIDRString,'SI' as Country_Code
)
, Table1 as (
SELECT 1 as Row,'160.181.205.222' as IPString
UNION ALL
SELECT 2 as Row,'197.157.224.123' as IPString
UNION ALL
SELECT 3 as Row,'196.32.200.33' as IPString
UNION ALL
SELECT 4 as Row,'160.181.224.44' as IPString
),
maptable as
(
select Row, Country_Code, POW(2,(32- CAST(RIGHT(IP_CIDRString,2) AS INT64)))-1 as cidrrange, SPLIT(IP_CIDRString,'.') as IPblocks
from Table2
),
sourcetable as
(
select Row, IPString, SPLIT(IPString,'.') as IPblocks
from Table1
)
select
A.Row
,A.IPString
,B.Country_Code
from
sourcetable A
left join maptable B
on A.IPblocks[ORDINAL(1)] = B.IPblocks[ORDINAL(1)]
AND A.IPblocks[ORDINAL(2)] = B.IPblocks[ORDINAL(2)]
AND A.IPblocks[ORDINAL(3)] = B.IPblocks[ORDINAL(3)]
AND CAST(A.IPblocks[ORDINAL(4)] AS INT64) BETWEEN 0 AND cidrrange
查询结果
考虑以下方法
create temp function SubnetToRange(CIDR string)
returns struct<start_ip string, end_ip string>
language js as '''
var beg = CIDR.substr(CIDR,CIDR.indexOf('/'));
var end = beg;
var off = (1<<(32-parseInt(CIDR.substr(CIDR.indexOf('/')+1))))-1;
var sub = beg.split('.').map(function(a){return parseInt(a)});
var buf = new ArrayBuffer(4);
var i32 = new Uint32Array(buf);
i32[0] = (sub[0]<<24) + (sub[1]<<16) + (sub[2]<<8) + (sub[3]) + off;
var end = Array.apply([],new Uint8Array(buf)).reverse().join('.');
return {start_ip: beg, end_ip: end};
''';
select IP, Country_Code
from table2
left join (
select IP_CIDR, Country_Code, SubnetToRange(IP_CIDR).*
from table1
)
on net.ipv4_to_int64(net.ip_from_string(IP))
between net.ipv4_to_int64(net.ip_from_string(start_ip))
and net.ipv4_to_int64(net.ip_from_string(end_ip))
如果应用于您问题中的示例数据
with table1 as (
select '160.181.205.0/24' IP_CIDR, 'ME' Country_Code union all
select '197.157.224.0/24', 'SL' union all
select '196.32.200.0/21', 'CF' union all
select '160.181.224.0/24', 'SI'
), table2 as (
select '160.181.205.222' IP union all
select '197.157.224.123' union all
select '196.32.200.33' union all
select '160.181.224.44'
)
输出是
我参考这篇文章是为了查找国家/地区的 IP 地址: https://cloud.google.com/blog/products/data-analytics/geolocation-with-bigquery-de-identify-76-million-ip-addresses-in-20-seconds
但是,public 数据集:fh-bigquery.geocode.201806_geolite2_city_ipv4_locs 不再可用。所以我不得不导入自己的查找 table.
我在 Bigquery 中有 2 个 table:
Row| IP_CIDR (String)| Country_Code
-------------------------
1 | 160.181.205.0/24| ME
2 | 197.157.224.0/24| SL
3 | 196.32.200.0/21 | CF
4 | 160.181.224.0/24| SI
...
BQ 中有 40 万行
和
Row| IP (String)
-------------------------
1 | 160.181.205.222
2 | 197.157.224.123
3 | 196.32.200.33
4 | 160.181.224.44
...
在 BQ 中有 10k 行
如何将 IP 地址与另一个 table 中的 IP_CIDR 匹配,这样结果将是:
Row| IP (String) |Country_Code
-------------------------
1 | 160.181.205.222 | ME
2 | 197.157.224.123 | SL
3 | 196.32.200.33 | CF
4 | 160.181.224.44 | SI
...
如果在 IP_CIDR
中找不到 IP,则为 null这是文章推荐的原创查询:
WITH source_of_ip_addresses AS (
SELECT REGEXP_REPLACE(contributor_ip, 'xxx', '0') ip, COUNT(*) c
FROM `publicdata.samples.wikipedia`
WHERE contributor_ip IS NOT null
GROUP BY 1
)
SELECT country_name, SUM(c) c
FROM (
SELECT ip, country_name, c
FROM (
SELECT *, NET.SAFE_IP_FROM_STRING(ip) & NET.IP_NET_MASK(4, mask) network_bin
FROM source_of_ip_addresses, UNNEST(GENERATE_ARRAY(9,32)) mask
WHERE BYTE_LENGTH(NET.SAFE_IP_FROM_STRING(ip)) = 4
)
JOIN `fh-bigquery.geocode.201806_geolite2_city_ipv4_locs`
USING (network_bin, mask)
)
GROUP BY 1
ORDER BY 2 DESC
您可以使用如下查询
WITH table2 as (
SELECT 1 as Row, '160.181.205.0/24' as IP_CIDRString,'ME' as Country_Code
UNION ALL
SELECT 4 as Row, '160.181.224.0/24' as IP_CIDRString,'SI' as Country_Code
)
, Table1 as (
SELECT 1 as Row,'160.181.205.222' as IPString
UNION ALL
SELECT 2 as Row,'197.157.224.123' as IPString
UNION ALL
SELECT 3 as Row,'196.32.200.33' as IPString
UNION ALL
SELECT 4 as Row,'160.181.224.44' as IPString
),
maptable as
(
select Row, Country_Code, POW(2,(32- CAST(RIGHT(IP_CIDRString,2) AS INT64)))-1 as cidrrange, SPLIT(IP_CIDRString,'.') as IPblocks
from Table2
),
sourcetable as
(
select Row, IPString, SPLIT(IPString,'.') as IPblocks
from Table1
)
select
A.Row
,A.IPString
,B.Country_Code
from
sourcetable A
left join maptable B
on A.IPblocks[ORDINAL(1)] = B.IPblocks[ORDINAL(1)]
AND A.IPblocks[ORDINAL(2)] = B.IPblocks[ORDINAL(2)]
AND A.IPblocks[ORDINAL(3)] = B.IPblocks[ORDINAL(3)]
AND CAST(A.IPblocks[ORDINAL(4)] AS INT64) BETWEEN 0 AND cidrrange
查询结果
考虑以下方法
create temp function SubnetToRange(CIDR string)
returns struct<start_ip string, end_ip string>
language js as '''
var beg = CIDR.substr(CIDR,CIDR.indexOf('/'));
var end = beg;
var off = (1<<(32-parseInt(CIDR.substr(CIDR.indexOf('/')+1))))-1;
var sub = beg.split('.').map(function(a){return parseInt(a)});
var buf = new ArrayBuffer(4);
var i32 = new Uint32Array(buf);
i32[0] = (sub[0]<<24) + (sub[1]<<16) + (sub[2]<<8) + (sub[3]) + off;
var end = Array.apply([],new Uint8Array(buf)).reverse().join('.');
return {start_ip: beg, end_ip: end};
''';
select IP, Country_Code
from table2
left join (
select IP_CIDR, Country_Code, SubnetToRange(IP_CIDR).*
from table1
)
on net.ipv4_to_int64(net.ip_from_string(IP))
between net.ipv4_to_int64(net.ip_from_string(start_ip))
and net.ipv4_to_int64(net.ip_from_string(end_ip))
如果应用于您问题中的示例数据
with table1 as (
select '160.181.205.0/24' IP_CIDR, 'ME' Country_Code union all
select '197.157.224.0/24', 'SL' union all
select '196.32.200.0/21', 'CF' union all
select '160.181.224.0/24', 'SI'
), table2 as (
select '160.181.205.222' IP union all
select '197.157.224.123' union all
select '196.32.200.33' union all
select '160.181.224.44'
)
输出是