如何对相邻的经纬度坐标进行分组,并在 SQL Server 中为每个分组分配一个名称/编号?
How to group nearby latitude and longitude and assign a Name/Number to that group in SQL Server?
我有一个包含 ID、LATITUDE、LONGITUDE、COUNTRY_CD 列的表,我正在尝试把彼此相距 40 米以内的 ID 分组(聚类),并为每个分组分配一个名称/编号。例如,下面的记录中有 7 个 ID 彼此在 40 米距离内,需要为它们分配一个名称/编号。
我的table有10万条全球经纬度记录,一个国家会有100多个簇,不知道每个国家有多少簇。
我可以接受把相邻的点串成链归入同一组,例如 ID1 和 ID3 都与 ID6“接近”(但 ID1 和 ID3 彼此并不接近)。
-- Sample data for the clustering question: one row per point, keyed by ID.
-- LATITUDE / LONGITUDE are presumably decimal degrees (TODO confirm);
-- COUNTRY_CD is a two-character country code.
CREATE TABLE #temp
(
    ID         VARCHAR(10),
    LATITUDE   DECIMAL(11, 8),
    LONGITUDE  DECIMAL(11, 8),
    COUNTRY_CD CHAR(2)
);

-- Nine sample points, all with country code 'IN'.
INSERT INTO #temp (ID, LATITUDE, LONGITUDE, COUNTRY_CD)
VALUES
    ('ID1', 10.81583689, 78.61898689, 'IN'),
    ('ID2', 10.81513789, 78.61898789, 'IN'),
    ('ID3', 10.81514889, 78.61894889, 'IN'),
    ('ID4', 10.81523989, 78.61898989, 'IN'),
    ('ID5', 10.81521089, 78.61891089, 'IN'),
    ('ID6', 10.81551189, 78.61891189, 'IN'),
    ('ID7', 10.81551189, 78.61791189, 'IN'),
    ('ID8', 10.81561189, 78.61792189, 'IN'),
    ('ID9', 10.81571189, 78.61793189, 'IN');
-- For every point in #temp, list the other points of the same country whose
-- computed distance is at most 40, together with that distance and a
-- per-point rank (nearest first).
SELECT
    src.ID,
    nbr.ID,
    src.LATITUDE,
    src.LONGITUDE,
    src.COUNTRY_CD,
    dist.MAPPING_DISTANCE,
    ROW_NUMBER() OVER (PARTITION BY src.ID
                       ORDER BY dist.MAPPING_DISTANCE ASC) AS DISTANCE_RANK
FROM #temp AS src
-- Pair each point with candidate neighbours from the same table.
INNER JOIN #temp AS nbr
    ON  nbr.COUNTRY_CD = src.COUNTRY_CD
    AND nbr.ID <> src.ID -- exclude the same ID
    -- Coarse bounding-box pre-filter on raw coordinates (the original note
    -- describes it as roughly a 75 metre radius).
    AND nbr.LATITUDE  BETWEEN (src.LATITUDE  - 0.00056) AND (src.LATITUDE  + 0.00056)
    AND nbr.LONGITUDE BETWEEN (src.LONGITUDE - 0.00076) AND (src.LONGITUDE + 0.00076)
-- Distance in metres, computed once per pair and reused for output, ranking
-- and filtering. 6378137.0 is the sphere radius used by the formula; the
-- result is cast to INTEGER before use.
-- NOTE(review): the integer cast looks like it drops the fractional part, so
-- "<= 40" would admit distances just under 41 m - confirm this is intended.
CROSS APPLY (
    SELECT CAST(6378137.0 * SQRT(
                    POWER((RADIANS(src.LATITUDE) - RADIANS(nbr.LATITUDE)), 2)
                  + POWER((RADIANS(src.LONGITUDE) - RADIANS(nbr.LONGITUDE)) * COS(RADIANS(src.LATITUDE)), 2)
                ) AS INTEGER) AS MAPPING_DISTANCE
) AS dist
WHERE dist.MAPPING_DISTANCE <= 40 -- limit to 40 meters
上面的查询可以找出 40 米以内的 ID,但我不知道如何把这些 ID 归入各自的集群,例如 'Cluster_1'。
参考这张图片:上面的 9 个 ID 构成 2 个集群。
请注意,我不会提供任何特定坐标作为输入,但查询必须从 table 中的可用坐标自动选择距离内的 ID。
我的预期结果如下,
ID LATITUDE LONGITUDE COUNTRY_CD CLUSTER_NAME
ID1 10.81583689 78.61898689 IN Cluster_1
ID2 10.81513789 78.61898789 IN Cluster_1
ID3 10.81514889 78.61894889 IN Cluster_1
ID4 10.81523989 78.61898989 IN Cluster_1
ID5 10.81521089 78.61891089 IN Cluster_1
ID6 10.81551189 78.61891189 IN Cluster_1
ID7 10.81551189 78.61791189 IN Cluster_2
ID8 10.81561189 78.61792189 IN Cluster_2
ID9 10.81571189 78.61793189 IN Cluster_2
关于如何过滤集群中的 ID 的任何建议?如果有任何其他简单的方法可以做到这一点,那就太好了!
首先,让我们创建一个用于存储位置坐标的 geography 类型计算列。我们将使用此列让 SQL Server 为我们计算距离:
-- Persisted computed column holding each row's position as a geography point
-- (SRID 4326), so distances can be computed with the geography methods.
-- geography::Point takes (latitude, longitude, SRID) directly, which avoids
-- building a WKT string from CAST decimals and the longitude-first ordering
-- that WKT requires.
ALTER TABLE #temp
ADD Point_Geolocation AS geography::Point(LATITUDE, LONGITUDE, 4326) PERSISTED
其次,让我们创建一个包含附近所有位置的 table:
-- (Re)create the scratch table of point pairs whose geography distance is
-- below the threshold. Each pair is stored once, with ID_1 < ID_2.
IF OBJECT_ID('tempdb..#Nearby_Points') IS NOT NULL
    DROP TABLE #Nearby_Points

CREATE TABLE #Nearby_Points
(
    ID_1 VARCHAR(10) NOT NULL,
    ID_2 VARCHAR(10) NOT NULL,
    PRIMARY KEY (ID_1, ID_2)
)

-- Self-join #temp against itself; the "<" on ID keeps each unordered pair once
-- and never pairs a point with itself.
-- Threshold unit is whatever STDistance returns - presumably metres for the
-- SRID used by Point_Geolocation (TODO confirm).
INSERT INTO #Nearby_Points (ID_1, ID_2)
SELECT
    a.ID,
    b.ID
FROM #temp AS a
INNER JOIN #temp AS b
    ON  a.ID < b.ID
    AND a.Point_Geolocation.STDistance(b.Point_Geolocation) < 40 -- Specify distance criteria here
-- To inspect the pairs, query ID_1 and ID_2 from #Nearby_Points.
注意:对于 10 万条以上的坐标,上述查询大约需要进行 50 亿次距离计算:(100,000 ^ 2) / 2,因此可能需要一段时间才能执行完成。
第三,让我们创建一个table来存储我们的集群列表:
-- (Re)create the scratch table that records cluster membership:
-- one row per (cluster, point) combination.
IF OBJECT_ID('tempdb..#Clusters') IS NOT NULL
    DROP TABLE #Clusters;

CREATE TABLE #Clusters
(
    Cluster_ID INT         NOT NULL,
    Point_ID   VARCHAR(10) NOT NULL,
    PRIMARY KEY CLUSTERED (Cluster_ID, Point_ID)
);

-- Secondary index keyed by point; it may improve performance a little.
CREATE NONCLUSTERED INDEX IX_Point_ID
    ON #Clusters (Point_ID);
最后,下面的代码会:
- 为第一个尚未属于任何集群的点创建一个新的集群。
- 重复re-scan集群table并向现有集群添加额外的点,直到每个集群包含所有应该属于它的点。
- 转到上面的步骤 1 并重复,直到没有新的集群被创建。
-- Fill #Clusters from the pairs in #Nearby_Points, one cluster at a time:
--   1. Seed a new cluster (next Cluster_ID) with the smallest ID_1 that is not
--      yet in #Clusters.
--   2. Repeatedly add every point paired, in either column of #Nearby_Points,
--      with a point already in #Clusters, until a pass adds no rows.
--   3. Go back to step 1; stop when no seed is left.
DECLARE @Rowcount INT

WHILE 1 = 1
BEGIN
    -- Step 1: seed. New Cluster_ID is the current maximum + 1 (1 when the
    -- table is empty); no row is inserted when every ID_1 is already placed.
    INSERT INTO #Clusters (Cluster_ID, Point_ID)
    SELECT TOP (1)
           COALESCE((SELECT MAX(existing.Cluster_ID) FROM #Clusters AS existing), 0) + 1,
           pair.ID_1
    FROM #Nearby_Points AS pair
    WHERE NOT EXISTS (
              SELECT 1
              FROM #Clusters AS taken
              WHERE taken.Point_ID = pair.ID_1
          )
    ORDER BY pair.ID_1

    SET @Rowcount = @@ROWCOUNT

    -- No seed found: clustering is complete.
    IF @Rowcount = 0
        BREAK

    -- Step 2: grow until a pass inserts nothing.
    WHILE @Rowcount > 0
    BEGIN
        INSERT INTO #Clusters (Cluster_ID, Point_ID)
        SELECT reach.Cluster_ID,
               reach.Point_ID
        FROM (
            -- Partner stored in ID_2 of a pair whose ID_1 is a cluster member
            SELECT member.Cluster_ID,
                   pair.ID_2 AS Point_ID
            FROM #Clusters AS member
            INNER JOIN #Nearby_Points AS pair
                ON pair.ID_1 = member.Point_ID
            UNION
            -- Partner stored in ID_1 of a pair whose ID_2 is a cluster member
            SELECT member.Cluster_ID,
                   pair.ID_1
            FROM #Clusters AS member
            INNER JOIN #Nearby_Points AS pair
                ON pair.ID_2 = member.Point_ID
        ) AS reach
        -- Skip memberships that are already recorded.
        WHERE NOT EXISTS (
                  SELECT 1
                  FROM #Clusters AS already
                  WHERE already.Cluster_ID = reach.Cluster_ID
                    AND already.Point_ID = reach.Point_ID
              )

        SET @Rowcount = @@ROWCOUNT
    END
END
瞧瞧:
-- Show the final assignment: one row per point with its cluster number.
-- Columns are listed explicitly and the rows are ordered so the output is
-- deterministic (cluster first, then point ID in string order).
SELECT c.Cluster_ID,
       c.Point_ID
FROM #Clusters c
ORDER BY c.Cluster_ID,
         c.Point_ID
|Cluster_ID | Point_ID|
|-----------|---------|
| 1 | ID1 |
| 1 | ID2 |
| 1 | ID3 |
| 1 | ID4 |
| 1 | ID5 |
| 1 | ID6 |
| 2 | ID7 |
| 2 | ID8 |
| 2 | ID9 |
我有一个包含 ID、LATITUDE、LONGITUDE、COUNTRY_CD 列的表,我正在尝试把彼此相距 40 米以内的 ID 分组(聚类),并为每个分组分配一个名称/编号。例如,下面的记录中有 7 个 ID 彼此在 40 米距离内,需要为它们分配一个名称/编号。
我的table有10万条全球经纬度记录,一个国家会有100多个簇,不知道每个国家有多少簇。
我可以接受把相邻的点串成链归入同一组,例如 ID1 和 ID3 都与 ID6“接近”(但 ID1 和 ID3 彼此并不接近)。
-- Sample data for the clustering question: one row per point, keyed by ID.
-- LATITUDE / LONGITUDE are presumably decimal degrees (TODO confirm);
-- COUNTRY_CD is a two-character country code.
CREATE TABLE #temp
(
    ID         VARCHAR(10),
    LATITUDE   DECIMAL(11, 8),
    LONGITUDE  DECIMAL(11, 8),
    COUNTRY_CD CHAR(2)
);

-- Nine sample points, all with country code 'IN'.
INSERT INTO #temp (ID, LATITUDE, LONGITUDE, COUNTRY_CD)
VALUES
    ('ID1', 10.81583689, 78.61898689, 'IN'),
    ('ID2', 10.81513789, 78.61898789, 'IN'),
    ('ID3', 10.81514889, 78.61894889, 'IN'),
    ('ID4', 10.81523989, 78.61898989, 'IN'),
    ('ID5', 10.81521089, 78.61891089, 'IN'),
    ('ID6', 10.81551189, 78.61891189, 'IN'),
    ('ID7', 10.81551189, 78.61791189, 'IN'),
    ('ID8', 10.81561189, 78.61792189, 'IN'),
    ('ID9', 10.81571189, 78.61793189, 'IN');
-- For every point in #temp, list the other points of the same country whose
-- computed distance is at most 40, together with that distance and a
-- per-point rank (nearest first).
SELECT
    src.ID,
    nbr.ID,
    src.LATITUDE,
    src.LONGITUDE,
    src.COUNTRY_CD,
    dist.MAPPING_DISTANCE,
    ROW_NUMBER() OVER (PARTITION BY src.ID
                       ORDER BY dist.MAPPING_DISTANCE ASC) AS DISTANCE_RANK
FROM #temp AS src
-- Pair each point with candidate neighbours from the same table.
INNER JOIN #temp AS nbr
    ON  nbr.COUNTRY_CD = src.COUNTRY_CD
    AND nbr.ID <> src.ID -- exclude the same ID
    -- Coarse bounding-box pre-filter on raw coordinates (the original note
    -- describes it as roughly a 75 metre radius).
    AND nbr.LATITUDE  BETWEEN (src.LATITUDE  - 0.00056) AND (src.LATITUDE  + 0.00056)
    AND nbr.LONGITUDE BETWEEN (src.LONGITUDE - 0.00076) AND (src.LONGITUDE + 0.00076)
-- Distance in metres, computed once per pair and reused for output, ranking
-- and filtering. 6378137.0 is the sphere radius used by the formula; the
-- result is cast to INTEGER before use.
-- NOTE(review): the integer cast looks like it drops the fractional part, so
-- "<= 40" would admit distances just under 41 m - confirm this is intended.
CROSS APPLY (
    SELECT CAST(6378137.0 * SQRT(
                    POWER((RADIANS(src.LATITUDE) - RADIANS(nbr.LATITUDE)), 2)
                  + POWER((RADIANS(src.LONGITUDE) - RADIANS(nbr.LONGITUDE)) * COS(RADIANS(src.LATITUDE)), 2)
                ) AS INTEGER) AS MAPPING_DISTANCE
) AS dist
WHERE dist.MAPPING_DISTANCE <= 40 -- limit to 40 meters
上面的查询可以找出 40 米以内的 ID,但我不知道如何把这些 ID 归入各自的集群,例如 'Cluster_1'。
参考这张图片:上面的 9 个 ID 构成 2 个集群。
请注意,我不会提供任何特定坐标作为输入,但查询必须从 table 中的可用坐标自动选择距离内的 ID。
我的预期结果如下,
ID LATITUDE LONGITUDE COUNTRY_CD CLUSTER_NAME
ID1 10.81583689 78.61898689 IN Cluster_1
ID2 10.81513789 78.61898789 IN Cluster_1
ID3 10.81514889 78.61894889 IN Cluster_1
ID4 10.81523989 78.61898989 IN Cluster_1
ID5 10.81521089 78.61891089 IN Cluster_1
ID6 10.81551189 78.61891189 IN Cluster_1
ID7 10.81551189 78.61791189 IN Cluster_2
ID8 10.81561189 78.61792189 IN Cluster_2
ID9 10.81571189 78.61793189 IN Cluster_2
关于如何过滤集群中的 ID 的任何建议?如果有任何其他简单的方法可以做到这一点,那就太好了!
首先,让我们创建一个用于存储位置坐标的 geography 类型计算列。我们将使用此列让 SQL Server 为我们计算距离:
-- Persisted computed column holding each row's position as a geography point
-- (SRID 4326), so distances can be computed with the geography methods.
-- geography::Point takes (latitude, longitude, SRID) directly, which avoids
-- building a WKT string from CAST decimals and the longitude-first ordering
-- that WKT requires.
ALTER TABLE #temp
ADD Point_Geolocation AS geography::Point(LATITUDE, LONGITUDE, 4326) PERSISTED
其次,让我们创建一个包含附近所有位置的 table:
-- (Re)create the scratch table of point pairs whose geography distance is
-- below the threshold. Each pair is stored once, with ID_1 < ID_2.
IF OBJECT_ID('tempdb..#Nearby_Points') IS NOT NULL
    DROP TABLE #Nearby_Points

CREATE TABLE #Nearby_Points
(
    ID_1 VARCHAR(10) NOT NULL,
    ID_2 VARCHAR(10) NOT NULL,
    PRIMARY KEY (ID_1, ID_2)
)

-- Self-join #temp against itself; the "<" on ID keeps each unordered pair once
-- and never pairs a point with itself.
-- Threshold unit is whatever STDistance returns - presumably metres for the
-- SRID used by Point_Geolocation (TODO confirm).
INSERT INTO #Nearby_Points (ID_1, ID_2)
SELECT
    a.ID,
    b.ID
FROM #temp AS a
INNER JOIN #temp AS b
    ON  a.ID < b.ID
    AND a.Point_Geolocation.STDistance(b.Point_Geolocation) < 40 -- Specify distance criteria here
-- To inspect the pairs, query ID_1 and ID_2 from #Nearby_Points.
注意:对于 10 万条以上的坐标,上述查询大约需要进行 50 亿次距离计算:(100,000 ^ 2) / 2,因此可能需要一段时间才能执行完成。
第三,让我们创建一个table来存储我们的集群列表:
-- (Re)create the scratch table that records cluster membership:
-- one row per (cluster, point) combination.
IF OBJECT_ID('tempdb..#Clusters') IS NOT NULL
    DROP TABLE #Clusters;

CREATE TABLE #Clusters
(
    Cluster_ID INT         NOT NULL,
    Point_ID   VARCHAR(10) NOT NULL,
    PRIMARY KEY CLUSTERED (Cluster_ID, Point_ID)
);

-- Secondary index keyed by point; it may improve performance a little.
CREATE NONCLUSTERED INDEX IX_Point_ID
    ON #Clusters (Point_ID);
最后,下面的代码会:
- 为第一个尚未属于任何集群的点创建一个新的集群。
- 重复re-scan集群table并向现有集群添加额外的点,直到每个集群包含所有应该属于它的点。
- 转到上面的步骤 1 并重复,直到没有新的集群被创建。
-- Fill #Clusters from the pairs in #Nearby_Points, one cluster at a time:
--   1. Seed a new cluster (next Cluster_ID) with the smallest ID_1 that is not
--      yet in #Clusters.
--   2. Repeatedly add every point paired, in either column of #Nearby_Points,
--      with a point already in #Clusters, until a pass adds no rows.
--   3. Go back to step 1; stop when no seed is left.
DECLARE @Rowcount INT

WHILE 1 = 1
BEGIN
    -- Step 1: seed. New Cluster_ID is the current maximum + 1 (1 when the
    -- table is empty); no row is inserted when every ID_1 is already placed.
    INSERT INTO #Clusters (Cluster_ID, Point_ID)
    SELECT TOP (1)
           COALESCE((SELECT MAX(existing.Cluster_ID) FROM #Clusters AS existing), 0) + 1,
           pair.ID_1
    FROM #Nearby_Points AS pair
    WHERE NOT EXISTS (
              SELECT 1
              FROM #Clusters AS taken
              WHERE taken.Point_ID = pair.ID_1
          )
    ORDER BY pair.ID_1

    SET @Rowcount = @@ROWCOUNT

    -- No seed found: clustering is complete.
    IF @Rowcount = 0
        BREAK

    -- Step 2: grow until a pass inserts nothing.
    WHILE @Rowcount > 0
    BEGIN
        INSERT INTO #Clusters (Cluster_ID, Point_ID)
        SELECT reach.Cluster_ID,
               reach.Point_ID
        FROM (
            -- Partner stored in ID_2 of a pair whose ID_1 is a cluster member
            SELECT member.Cluster_ID,
                   pair.ID_2 AS Point_ID
            FROM #Clusters AS member
            INNER JOIN #Nearby_Points AS pair
                ON pair.ID_1 = member.Point_ID
            UNION
            -- Partner stored in ID_1 of a pair whose ID_2 is a cluster member
            SELECT member.Cluster_ID,
                   pair.ID_1
            FROM #Clusters AS member
            INNER JOIN #Nearby_Points AS pair
                ON pair.ID_2 = member.Point_ID
        ) AS reach
        -- Skip memberships that are already recorded.
        WHERE NOT EXISTS (
                  SELECT 1
                  FROM #Clusters AS already
                  WHERE already.Cluster_ID = reach.Cluster_ID
                    AND already.Point_ID = reach.Point_ID
              )

        SET @Rowcount = @@ROWCOUNT
    END
END
瞧瞧:
-- Show the final assignment: one row per point with its cluster number.
-- Columns are listed explicitly and the rows are ordered so the output is
-- deterministic (cluster first, then point ID in string order).
SELECT c.Cluster_ID,
       c.Point_ID
FROM #Clusters c
ORDER BY c.Cluster_ID,
         c.Point_ID
|Cluster_ID | Point_ID|
|-----------|---------|
| 1 | ID1 |
| 1 | ID2 |
| 1 | ID3 |
| 1 | ID4 |
| 1 | ID5 |
| 1 | ID6 |
| 2 | ID7 |
| 2 | ID8 |
| 2 | ID9 |