列表的内存中完全外部连接
In-memory full outer join for lists
如何着手在具有以下签名的列表上编写类似 SQL 的完全连接操作?
fullJoin :: [a] -> [b] -> (a -> b -> Bool) -> [(Maybe a, Maybe b)]
fullJoin xs [] _ = map (\x -> (Just x, Nothing)) xs
fullJoin [] ys _ = map (\y -> (Nothing, Just y)) ys
fullJoin xs@(xh:xt) ys@(yh:yt) on =
if on xh yh
then (Just xh, Just yh) : fullJoin xt yt
else ???
所写条件由a -> b -> Bool
提供。在示例中,它被设置为 (\n i -> length n == i)
,这意味着来自 names
的记录与 nums
中的数字具有相同的长度。
示例:
names = ["Foo","Bar", "John" , "Emily", "Connor"]
nums = [1,2,3,4,5]
fullJoin names nums (\n i -> length n == i)
== [ (Just "Foo", Just 3)
, (Just "Bar", Just 3)
, (Just "John", Just 4)
, (Just "Emily", Just 5)
, (Just "Connor", Nothing)
, (Nothing, Just 1)
, (Nothing, Just 2)
]
为了阐明上述函数的确切 SQL 等价物,下面是它在 PostgreSQL 中的写法:
create table names as
select * from (values
('Foo'),
('Bar'),
('John'),
('Emily'),
('Connor')
)
as z(name);
create table nums as
select * from (values
(1),
(2),
(3),
(4),
(5)
)
as z(num);
select * from names
full join nums
on char_length(name) = num
并且 运行 这将产生:
name num
(null) 1
(null) 2
Foo 3
Bar 3
John 4
Emily 5
Connor (null)
Fiddle link : sqlfiddle
只是迟钝地实现你想要它做的事情。超级低效:
fullJoin :: [a] -> [b] -> (a -> b -> Bool)
-> [(Maybe a, Maybe b)]
fullJoin xs ys p = concatMap (map (Just *** Just) . snd) a
++ [(Just x, Nothing) | (x, []) <- a]
++ [(Nothing, Just y) | (y, []) <- b]
where
a = [(x, [(x, y) | y <- ys, p x y]) | x <- xs]
b = [(y, [(x, y) | x <- xs, p x y]) | y <- ys]
如果允许我们在 a
和 b
类型上添加 Eq
约束,我们可以使用 Data.List.\
来查找不匹配的元素,而不是使第二扫.
测试:
> mapM_ print $ fullJoin names nums (\n i -> length n == i)
(Just "Foo",Just 3)
(Just "Bar",Just 3)
(Just "John",Just 4)
(Just "Emily",Just 5)
(Just "Connor",Nothing)
(Nothing,Just 1)
(Nothing,Just 2)
条件 rel(x, y)
下表 X
和 Y
之间的完全外部联接可以被认为是三个(不相交的)部分的并集:
- 对的集合
(x, y)
其中 rel(x,y)
- 对
(x0, null)
的集合,其中对于所有 y in Y
、not (rel(x0, y))
- 对
(null, y0)
的集合,其中对于所有 x in X
、not (rel(x, y0))
我们可以用类似的方式构造我们的 Haskell 程序:
fullJoin :: [a] -> [b] -> (a -> b -> Bool) -> [(Maybe a, Maybe b)]
fullJoin xs ys rel = concat $
[[(Just x, Just y) | y <- ys, x `rel` y] | x <- xs] -- for each x, find all the related ys
<> [[(Just x, Nothing)] | x <- xs, all (\y -> not (x `rel` y)) ys] -- find all xs that have no related ys
<> [[(Nothing, Just y)] | y <- ys, all (\x -> not (x `rel` y)) xs] -- find all ys that have no related xs
问题的提出方式,你不能比 O(n^2)
更快。也就是说,我的解决方案虽然是 O(n^2)
,但并不是最优的:它遍历了 xs
两次。你可以想想你必须做什么才能只遍历 xs
一次。它与找到一种方法来跟踪您已经看过的xs
有关。
这是一个天真的 Python 实现。是O(n^2)
.
#! /usr/bin/env python
def full_join_cond (list1, list2, condition_fn):
answer = []
for x in list1:
matches = []
for y in list2:
if condition_fn(x, y):
matches.append(y)
if len(matches):
answer.extend([[x, y] for y in matches])
else:
answer.append([x, None])
for y in list2:
matched = False
for x in list1:
if condition_fn(x, y):
matched = True
break
if not matched:
answer.append([None, y])
return answer
names = ["Foo","Bar", "John" , "Emily", "Connor"]
nums = [1,2,3,4,5]
cond = lambda x, y: len(x) == y
print(full_join_cond(names, nums, cond))
这是一个更接近于数据库执行此连接的方式的实现。 O(size_of_output)
经常是 O(n)
.
def list_map_to_dict (l, m):
d = {}
for x in l:
mx = m(x)
if mx in d:
d[mx].append(x)
else:
d[mx] = [x]
return d
def full_join_map (list1, map1, list2, map2):
dict1 = list_map_to_dict(list1, map1)
dict2 = list_map_to_dict(list2, map2)
answer = []
for mx, xs in dict1.iteritems():
if mx in dict2:
for y in dict2[mx]:
answer.extend([[x, y] for x in xs])
dict2.pop(mx)
else:
answer.extend([[x, None] for x in xs])
for my, ys in dict2.iteritems():
answer.extend([[None, y] for y in ys])
return answer
print(full_join_map(names, lambda x: len(x), nums, lambda y: y))
对于咯咯笑和咧嘴笑,这两者可以结合在一个既通用又可以 运行 快速的函数中。 (我没有尝试使函数签名合理 - 只是想展示这个想法。)
def full_join (list1, map1, list2, map2, cond=None):
if map1 is None:
map1 = lambda x: None
if map2 is None:
map2 = lambda y: None
if cond is None:
cond = lambda x, y: True
dict1 = list_map_to_dict(list1, map1)
dict2 = list_map_to_dict(list2, map2)
answer = []
for mx, xs in dict1.iteritems():
if mx in dict2:
answer.extend(full_join_cond(xs, dict2[mx], cond))
dict2.pop(mx)
else:
answer.extend([[x, None] for x in xs])
for my, ys in dict2.iteritems():
answer.extend([[None, y] for y in ys])
return answer
如何着手在具有以下签名的列表上编写类似 SQL 的完全连接操作?
fullJoin :: [a] -> [b] -> (a -> b -> Bool) -> [(Maybe a, Maybe b)]
fullJoin xs [] _ = map (\x -> (Just x, Nothing)) xs
fullJoin [] ys _ = map (\y -> (Nothing, Just y)) ys
fullJoin xs@(xh:xt) ys@(yh:yt) on =
if on xh yh
then (Just xh, Just yh) : fullJoin xt yt
else ???
所写条件由a -> b -> Bool
提供。在示例中,它被设置为 (\n i -> length n == i)
,这意味着来自 names
的记录与 nums
中的数字具有相同的长度。
示例:
names = ["Foo","Bar", "John" , "Emily", "Connor"]
nums = [1,2,3,4,5]
fullJoin names nums (\n i -> length n == i)
== [ (Just "Foo", Just 3)
, (Just "Bar", Just 3)
, (Just "John", Just 4)
, (Just "Emily", Just 5)
, (Just "Connor", Nothing)
, (Nothing, Just 1)
, (Nothing, Just 2)
]
为了阐明上述函数的确切 SQL 等价物,下面是它在 PostgreSQL 中的写法:
create table names as
select * from (values
('Foo'),
('Bar'),
('John'),
('Emily'),
('Connor')
)
as z(name);
create table nums as
select * from (values
(1),
(2),
(3),
(4),
(5)
)
as z(num);
select * from names
full join nums
on char_length(name) = num
并且 运行 这将产生:
name num
(null) 1
(null) 2
Foo 3
Bar 3
John 4
Emily 5
Connor (null)
Fiddle link : sqlfiddle
只是迟钝地实现你想要它做的事情。超级低效:
fullJoin :: [a] -> [b] -> (a -> b -> Bool)
-> [(Maybe a, Maybe b)]
fullJoin xs ys p = concatMap (map (Just *** Just) . snd) a
++ [(Just x, Nothing) | (x, []) <- a]
++ [(Nothing, Just y) | (y, []) <- b]
where
a = [(x, [(x, y) | y <- ys, p x y]) | x <- xs]
b = [(y, [(x, y) | x <- xs, p x y]) | y <- ys]
如果允许我们在 a
和 b
类型上添加 Eq
约束,我们可以使用 Data.List.\
来查找不匹配的元素,而不是使第二扫.
测试:
> mapM_ print $ fullJoin names nums (\n i -> length n == i)
(Just "Foo",Just 3)
(Just "Bar",Just 3)
(Just "John",Just 4)
(Just "Emily",Just 5)
(Just "Connor",Nothing)
(Nothing,Just 1)
(Nothing,Just 2)
条件 rel(x, y)
下表 X
和 Y
之间的完全外部联接可以被认为是三个(不相交的)部分的并集:
- 对的集合
(x, y)
其中rel(x,y)
- 对
(x0, null)
的集合,其中对于所有y in Y
、not (rel(x0, y))
- 对
(null, y0)
的集合,其中对于所有x in X
、not (rel(x, y0))
我们可以用类似的方式构造我们的 Haskell 程序:
fullJoin :: [a] -> [b] -> (a -> b -> Bool) -> [(Maybe a, Maybe b)]
fullJoin xs ys rel = concat $
[[(Just x, Just y) | y <- ys, x `rel` y] | x <- xs] -- for each x, find all the related ys
<> [[(Just x, Nothing)] | x <- xs, all (\y -> not (x `rel` y)) ys] -- find all xs that have no related ys
<> [[(Nothing, Just y)] | y <- ys, all (\x -> not (x `rel` y)) xs] -- find all ys that have no related xs
问题的提出方式,你不能比 O(n^2)
更快。也就是说,我的解决方案虽然是 O(n^2)
,但并不是最优的:它遍历了 xs
两次。你可以想想你必须做什么才能只遍历 xs
一次。它与找到一种方法来跟踪您已经看过的xs
有关。
这是一个天真的 Python 实现。是O(n^2)
.
#! /usr/bin/env python
def full_join_cond (list1, list2, condition_fn):
answer = []
for x in list1:
matches = []
for y in list2:
if condition_fn(x, y):
matches.append(y)
if len(matches):
answer.extend([[x, y] for y in matches])
else:
answer.append([x, None])
for y in list2:
matched = False
for x in list1:
if condition_fn(x, y):
matched = True
break
if not matched:
answer.append([None, y])
return answer
names = ["Foo","Bar", "John" , "Emily", "Connor"]
nums = [1,2,3,4,5]
cond = lambda x, y: len(x) == y
print(full_join_cond(names, nums, cond))
这是一个更接近于数据库执行此连接的方式的实现。 O(size_of_output)
经常是 O(n)
.
def list_map_to_dict (l, m):
d = {}
for x in l:
mx = m(x)
if mx in d:
d[mx].append(x)
else:
d[mx] = [x]
return d
def full_join_map (list1, map1, list2, map2):
dict1 = list_map_to_dict(list1, map1)
dict2 = list_map_to_dict(list2, map2)
answer = []
for mx, xs in dict1.iteritems():
if mx in dict2:
for y in dict2[mx]:
answer.extend([[x, y] for x in xs])
dict2.pop(mx)
else:
answer.extend([[x, None] for x in xs])
for my, ys in dict2.iteritems():
answer.extend([[None, y] for y in ys])
return answer
print(full_join_map(names, lambda x: len(x), nums, lambda y: y))
对于咯咯笑和咧嘴笑,这两者可以结合在一个既通用又可以 运行 快速的函数中。 (我没有尝试使函数签名合理 - 只是想展示这个想法。)
def full_join (list1, map1, list2, map2, cond=None):
if map1 is None:
map1 = lambda x: None
if map2 is None:
map2 = lambda y: None
if cond is None:
cond = lambda x, y: True
dict1 = list_map_to_dict(list1, map1)
dict2 = list_map_to_dict(list2, map2)
answer = []
for mx, xs in dict1.iteritems():
if mx in dict2:
answer.extend(full_join_cond(xs, dict2[mx], cond))
dict2.pop(mx)
else:
answer.extend([[x, None] for x in xs])
for my, ys in dict2.iteritems():
answer.extend([[None, y] for y in ys])
return answer