删除已排序 Go 切片重复项的最快方法
Fastest way to remove duplicates of a sorted Go slice
mySlice := make([]uint32, 0, 4294967290)
// ...
// Sort the slice
sort.Slice(mySlice, func(i, j int) bool {
x := mySlice[i]
y := mySlice[j]
return x < y
})
删除切片重复项的最快方法是什么?
我如何利用 slice 已经排序的事实?
更新
我想到了这个:
func RemoveDuplicates(s []uint32) {
if len(s) < 2 {
return
}
tmp := make([]uint32, 0, len(s))
for i := uint32(0); i < uint32(len(s)); i++ {
// If current is not equal to next then store the current
if s[i] != s[i+1] {
tmp = append(tmp, s[i])
}
}
// The last must be stored
// Note that if it was repeated, the duplicates are NOT stored before
tmp = append(tmp, s[len(s)-1])
// Modify original slice
s = nil
s = append(s, tmp...)
}
有错吗?任何错误?有什么改进的方法吗?
更新
如@mh-cbon 所述,正确的循环最大值应该是 i < uint32(len(s)) - 1
:
for i := uint32(0); i < uint32(len(s)) - 1; i++ {
不是最快的答案,而是循序渐进的应用Go语言优化一段代码的方法。
有关什么是最快的非常正式的见解,请参阅
您的代码有错误。总是写一个测试。
首先,让我们写一个main
package main
import (
"math/rand"
"sort"
)
func main() {
}
func randSlice(max int) (ret []uint32) {
// we should check that max does not exceed maxUINT32
ret = make([]uint32, 0, max)
r := rand.New(rand.NewSource(99))
for i := 0; i < max; i++ {
ret = append(ret, uint32(r.Intn(max)))
}
sort.Slice(ret, func(i, j int) bool {
return ret[i] < ret[j]
})
return
}
func dedup1(s []uint32) []uint32 {
if len(s) < 2 {
return s
}
tmp := make([]uint32, 0, len(s))
for i := uint32(0); i < uint32(len(s)); i++ {
// If current is not equal to next then store the current
if s[i] != s[i+1] {
tmp = append(tmp, s[i])
}
}
// The last must be stored
// Note that if it was repeated, the duplicates are NOT stored before
tmp = append(tmp, s[len(s)-1])
// Modify original slice
s = nil
s = append(s, tmp...)
return s
}
以及伴随的测试
package main
import "testing"
func TestDedup1(t *testing.T) {
s := randSlice(10)
res := dedup1(s)
uniq := map[uint32]bool{}
for _, r := range res {
_, ok := uniq[r]
if ok {
t.Fatalf("found duplicates\ninput=%#v\nresult=%#v\n", s, res)
}
uniq[r] = true
}
}
运行 我们得到
$ go test -v .
=== RUN TestDedup1
--- FAIL: TestDedup1 (0.00s)
panic: runtime error: index out of range [10] with length 10 [recovered]
panic: runtime error: index out of range [10] with length 10
goroutine 18 [running]:
testing.tRunner.func1.1(0x536680, 0xc0000da040)
/home/mh-cbon/.gvm/gos/go1.15.2/src/testing/testing.go:1076 +0x30d
testing.tRunner.func1(0xc000082600)
/home/mh-cbon/.gvm/gos/go1.15.2/src/testing/testing.go:1079 +0x41a
panic(0x536680, 0xc0000da040)
/home/mh-cbon/.gvm/gos/go1.15.2/src/runtime/panic.go:969 +0x175
test/d/dup.dedup1(0xc000094060, 0xa, 0xa, 0xa, 0x6124a0, 0xc00003c770)
/home/mh-cbon/gow/src/test/d/dup/main.go:32 +0x248
test/d/dup.TestDedup1(0xc000082600)
/home/mh-cbon/gow/src/test/d/dup/main_test.go:7 +0x70
testing.tRunner(0xc000082600, 0x54fbf0)
/home/mh-cbon/.gvm/gos/go1.15.2/src/testing/testing.go:1127 +0xef
created by testing.(*T).Run
/home/mh-cbon/.gvm/gos/go1.15.2/src/testing/testing.go:1178 +0x386
FAIL test/d/dup 0.006s
FAIL
我们通过适当检查切片边界来解决这个问题。
在dedup1
中,将此条件if s[i] != s[i+1] {
更改为if i+1 < uint32(len(s)) && s[i] != s[i+1] {
,或者更好,将迭代最大值减少一个for i := uint32(0); i < uint32(len(s))-1; i++ {
接下来,编写一个函数来生成具有随机重复项的切片。
func randSliceWithDups(max int) (ret []uint32) {
ret = randSlice(max / 2)
ret = append(ret, ret...)
sort.Slice(ret, func(i, j int) bool {
return ret[i] < ret[j]
})
return
}
写randSliceWithDups(50)
得到切片如[0 0 1 1 2 2 3 3 3 3 4 4 5 5 5 5 6 6 6 6 7 7 8 8 9 9 9 9 12 12 13 13 18 18 18 18 18 18 19 19 20 20 20 20 21 21 22 22 24 24]
再次更新测试
func TestDedup1_with_dups(t *testing.T) {
s := randSliceWithDups(10)
res := dedup1(s)
uniq := map[uint32]bool{}
for _, r := range res {
_, ok := uniq[r]
if ok {
t.Fatalf("found duplicates\ninput=%#v\nresult=%#v\n", s, res)
}
uniq[r] = true
}
}
接下来,添加一个基准;它将帮助我们发现性能问题并随着时间的推移保持性能。
func BenchmarkDedup1_1000(b *testing.B) {
s := randSliceWithDups(100)
b.ResetTimer()
b.ReportAllocs()
for i := 0; i < b.N; i++ {
_ = dedup1(s)
}
}
运行 我们得到:
$ go test -v . -bench=.
=== RUN TestDedup1
--- PASS: TestDedup1 (0.00s)
=== RUN TestDedup1_with_dups
--- PASS: TestDedup1_with_dups (0.00s)
goos: linux
goarch: amd64
pkg: test/d/dup
BenchmarkDedup1_1000
BenchmarkDedup1_1000-4 172087 6353 ns/op 5504 B/op 2 allocs/op
PASS
ok test/d/dup 1.174s
让我们声明显而易见的是,每个人都已经发现阅读您的初始代码,甚至没有编写基准测试,您正在分配。
这就提出了一个问题,即是否允许您就地修改输入切片。如果您可以就地更改它,我们可能会利用它来防止分配并加快您的程序。
一个解决方案,从头开始编写(考虑在网站上搜索 geekforgeeks 以获得普遍接受的解决方案),迭代切片并维护下一个写入位置的索引。当找到非重复项时,将非重复项保存到最后一个位置,然后将该索引递增 1。最后,return 切片达到递增索引的值。
func dedup2(s []uint32) []uint32 {
if len(s) < 2 {
return s
}
var e int
for i := 1; i < len(s); i++ {
if s[i] == s[i-1] {
continue
}
s[e] = s[i]
e++
}
return s[:e]
}
再次添加测试和基准测试,并检查结果。
func TestDedup2(t *testing.T) {
s := randSlice(10)
res := dedup2(s)
uniq := map[uint32]bool{}
for _, r := range res {
_, ok := uniq[r]
if ok {
t.Fatalf("found duplicates\ninput=%#v\nresult=%#v\n", s, res)
}
uniq[r] = true
}
}
func TestDedup2_with_dups(t *testing.T) {
s := randSliceWithDups(10)
res := dedup2(s)
uniq := map[uint32]bool{}
for _, r := range res {
_, ok := uniq[r]
if ok {
t.Fatalf("found duplicates\ninput=%#v\nresult=%#v\n", s, res)
}
uniq[r] = true
}
}
func BenchmarkDedup2_1000(b *testing.B) {
s := randSliceWithDups(100)
b.ResetTimer()
b.ReportAllocs()
for i := 0; i < b.N; i++ {
_ = dedup2(s)
}
}
产生:
$ go test -v . -bench=.
=== RUN TestDedup1
--- PASS: TestDedup1 (0.00s)
=== RUN TestDedup1_with_dups
--- PASS: TestDedup1_with_dups (0.00s)
=== RUN TestDedup2
--- PASS: TestDedup2 (0.00s)
=== RUN TestDedup2_with_dups
--- PASS: TestDedup2_with_dups (0.00s)
goos: linux
goarch: amd64
pkg: test/d/dup
BenchmarkDedup1_1000
BenchmarkDedup1_1000-4 1764574 673 ns/op 544 B/op 2 allocs/op
BenchmarkDedup2_1000
BenchmarkDedup2_1000-4 7758907 152 ns/op 0 B/op 0 allocs/op
PASS
ok test/d/dup 3.224s
4 倍的改进!凉爽的 !下一步是什么 ?接下来是找到一个更好的算法,它产生更少的执行、更少的查找等等。
尽管如此,上一个版本包含一个错误!你发现了吗?
看到这个测试,它比另一个更好,因为它不依赖于随机数,而是依赖于具有强相等性检查的静态值。使用这些类型的测试,您可以定制输入以检查细粒度情况。
func TestDedup2_static(t *testing.T) {
type expectation struct {
input []uint32
want []uint32
}
expectations := []expectation{
expectation{
input: []uint32{0, 0, 1, 2, 3, 3, 3, 4, 4, 5},
want: []uint32{0, 1, 2, 3, 4, 5},
},
expectation{
input: []uint32{0, 1, 2, 3, 3, 3, 4, 4, 5},
want: []uint32{0, 1, 2, 3, 4, 5},
},
}
for _, e := range expectations {
res := dedup2(e.input)
if !reflect.DeepEqual(res, e.want) {
t.Fatalf("invlaid result, wanted=%#v\ngot=%#v\n", e.want, res)
}
}
}
它使用 table 路测,如 https://dave.cheney.net/2013/06/09/writing-table-driven-tests-in-go
所述
让我们解决这个问题:
func dedup2(s []uint32) []uint32 {
if len(s) < 2 {
return s
}
var e int = 1
for i := 1; i < len(s); i++ {
if s[i] == s[i-1] {
continue
}
s[e] = s[i]
e++
}
return s[:e]
}
要删除切片的重复元素,您可以创建一个映射并将切片值指定为映射的键,然后遍历映射并将键值附加到新的 slice.Here 是代码同样的逻辑:
package main
import (
"fmt"
"sort"
)
func removeDupe(slc []int) []int {
var tmpSlice []int
chkMap := make(map[int]bool)
for _, val := range slc {
chkMap[val] = true
}
for k, _ := range chkMap {
tmpSlice = append(tmpSlice, k)
}
sort.Ints(tmpSlice)
return tmpSlice
}
func main() {
mySlice := []int{1, 2, 3, 4, 5, 1, 2, 3, 9, 0}
formattedSlice := removeDupe(mySlice)
fmt.Println(formattedSlice)
}
输出:
[0 1 2 3 4 5 9]
mySlice := make([]uint32, 0, 4294967290)
// ...
// Sort the slice
sort.Slice(mySlice, func(i, j int) bool {
x := mySlice[i]
y := mySlice[j]
return x < y
})
删除切片重复项的最快方法是什么?
我如何利用 slice 已经排序的事实?
更新
我想到了这个:
func RemoveDuplicates(s []uint32) {
if len(s) < 2 {
return
}
tmp := make([]uint32, 0, len(s))
for i := uint32(0); i < uint32(len(s)); i++ {
// If current is not equal to next then store the current
if s[i] != s[i+1] {
tmp = append(tmp, s[i])
}
}
// The last must be stored
// Note that if it was repeated, the duplicates are NOT stored before
tmp = append(tmp, s[len(s)-1])
// Modify original slice
s = nil
s = append(s, tmp...)
}
有错吗?任何错误?有什么改进的方法吗?
更新
如@mh-cbon 所述,正确的循环最大值应该是 i < uint32(len(s)) - 1
:
for i := uint32(0); i < uint32(len(s)) - 1; i++ {
不是最快的答案,而是循序渐进的应用Go语言优化一段代码的方法。
有关什么是最快的非常正式的见解,请参阅
您的代码有错误。总是写一个测试。
首先,让我们写一个main
package main
import (
"math/rand"
"sort"
)
func main() {
}
func randSlice(max int) (ret []uint32) {
// we should check that max does not exceed maxUINT32
ret = make([]uint32, 0, max)
r := rand.New(rand.NewSource(99))
for i := 0; i < max; i++ {
ret = append(ret, uint32(r.Intn(max)))
}
sort.Slice(ret, func(i, j int) bool {
return ret[i] < ret[j]
})
return
}
func dedup1(s []uint32) []uint32 {
if len(s) < 2 {
return s
}
tmp := make([]uint32, 0, len(s))
for i := uint32(0); i < uint32(len(s)); i++ {
// If current is not equal to next then store the current
if s[i] != s[i+1] {
tmp = append(tmp, s[i])
}
}
// The last must be stored
// Note that if it was repeated, the duplicates are NOT stored before
tmp = append(tmp, s[len(s)-1])
// Modify original slice
s = nil
s = append(s, tmp...)
return s
}
以及伴随的测试
package main
import "testing"
func TestDedup1(t *testing.T) {
s := randSlice(10)
res := dedup1(s)
uniq := map[uint32]bool{}
for _, r := range res {
_, ok := uniq[r]
if ok {
t.Fatalf("found duplicates\ninput=%#v\nresult=%#v\n", s, res)
}
uniq[r] = true
}
}
运行 我们得到
$ go test -v .
=== RUN TestDedup1
--- FAIL: TestDedup1 (0.00s)
panic: runtime error: index out of range [10] with length 10 [recovered]
panic: runtime error: index out of range [10] with length 10
goroutine 18 [running]:
testing.tRunner.func1.1(0x536680, 0xc0000da040)
/home/mh-cbon/.gvm/gos/go1.15.2/src/testing/testing.go:1076 +0x30d
testing.tRunner.func1(0xc000082600)
/home/mh-cbon/.gvm/gos/go1.15.2/src/testing/testing.go:1079 +0x41a
panic(0x536680, 0xc0000da040)
/home/mh-cbon/.gvm/gos/go1.15.2/src/runtime/panic.go:969 +0x175
test/d/dup.dedup1(0xc000094060, 0xa, 0xa, 0xa, 0x6124a0, 0xc00003c770)
/home/mh-cbon/gow/src/test/d/dup/main.go:32 +0x248
test/d/dup.TestDedup1(0xc000082600)
/home/mh-cbon/gow/src/test/d/dup/main_test.go:7 +0x70
testing.tRunner(0xc000082600, 0x54fbf0)
/home/mh-cbon/.gvm/gos/go1.15.2/src/testing/testing.go:1127 +0xef
created by testing.(*T).Run
/home/mh-cbon/.gvm/gos/go1.15.2/src/testing/testing.go:1178 +0x386
FAIL test/d/dup 0.006s
FAIL
我们通过适当检查切片边界来解决这个问题。
在dedup1
中,将此条件if s[i] != s[i+1] {
更改为if i+1 < uint32(len(s)) && s[i] != s[i+1] {
,或者更好,将迭代最大值减少一个for i := uint32(0); i < uint32(len(s))-1; i++ {
接下来,编写一个函数来生成具有随机重复项的切片。
func randSliceWithDups(max int) (ret []uint32) {
ret = randSlice(max / 2)
ret = append(ret, ret...)
sort.Slice(ret, func(i, j int) bool {
return ret[i] < ret[j]
})
return
}
写randSliceWithDups(50)
得到切片如[0 0 1 1 2 2 3 3 3 3 4 4 5 5 5 5 6 6 6 6 7 7 8 8 9 9 9 9 12 12 13 13 18 18 18 18 18 18 19 19 20 20 20 20 21 21 22 22 24 24]
再次更新测试
func TestDedup1_with_dups(t *testing.T) {
s := randSliceWithDups(10)
res := dedup1(s)
uniq := map[uint32]bool{}
for _, r := range res {
_, ok := uniq[r]
if ok {
t.Fatalf("found duplicates\ninput=%#v\nresult=%#v\n", s, res)
}
uniq[r] = true
}
}
接下来,添加一个基准;它将帮助我们发现性能问题并随着时间的推移保持性能。
func BenchmarkDedup1_1000(b *testing.B) {
s := randSliceWithDups(100)
b.ResetTimer()
b.ReportAllocs()
for i := 0; i < b.N; i++ {
_ = dedup1(s)
}
}
运行 我们得到:
$ go test -v . -bench=.
=== RUN TestDedup1
--- PASS: TestDedup1 (0.00s)
=== RUN TestDedup1_with_dups
--- PASS: TestDedup1_with_dups (0.00s)
goos: linux
goarch: amd64
pkg: test/d/dup
BenchmarkDedup1_1000
BenchmarkDedup1_1000-4 172087 6353 ns/op 5504 B/op 2 allocs/op
PASS
ok test/d/dup 1.174s
让我们声明显而易见的是,每个人都已经发现阅读您的初始代码,甚至没有编写基准测试,您正在分配。
这就提出了一个问题,即是否允许您就地修改输入切片。如果您可以就地更改它,我们可能会利用它来防止分配并加快您的程序。
一个解决方案,从头开始编写(考虑在网站上搜索 geekforgeeks 以获得普遍接受的解决方案),迭代切片并维护下一个写入位置的索引。当找到非重复项时,将非重复项保存到最后一个位置,然后将该索引递增 1。最后,return 切片达到递增索引的值。
func dedup2(s []uint32) []uint32 {
if len(s) < 2 {
return s
}
var e int
for i := 1; i < len(s); i++ {
if s[i] == s[i-1] {
continue
}
s[e] = s[i]
e++
}
return s[:e]
}
再次添加测试和基准测试,并检查结果。
func TestDedup2(t *testing.T) {
s := randSlice(10)
res := dedup2(s)
uniq := map[uint32]bool{}
for _, r := range res {
_, ok := uniq[r]
if ok {
t.Fatalf("found duplicates\ninput=%#v\nresult=%#v\n", s, res)
}
uniq[r] = true
}
}
func TestDedup2_with_dups(t *testing.T) {
s := randSliceWithDups(10)
res := dedup2(s)
uniq := map[uint32]bool{}
for _, r := range res {
_, ok := uniq[r]
if ok {
t.Fatalf("found duplicates\ninput=%#v\nresult=%#v\n", s, res)
}
uniq[r] = true
}
}
func BenchmarkDedup2_1000(b *testing.B) {
s := randSliceWithDups(100)
b.ResetTimer()
b.ReportAllocs()
for i := 0; i < b.N; i++ {
_ = dedup2(s)
}
}
产生:
$ go test -v . -bench=.
=== RUN TestDedup1
--- PASS: TestDedup1 (0.00s)
=== RUN TestDedup1_with_dups
--- PASS: TestDedup1_with_dups (0.00s)
=== RUN TestDedup2
--- PASS: TestDedup2 (0.00s)
=== RUN TestDedup2_with_dups
--- PASS: TestDedup2_with_dups (0.00s)
goos: linux
goarch: amd64
pkg: test/d/dup
BenchmarkDedup1_1000
BenchmarkDedup1_1000-4 1764574 673 ns/op 544 B/op 2 allocs/op
BenchmarkDedup2_1000
BenchmarkDedup2_1000-4 7758907 152 ns/op 0 B/op 0 allocs/op
PASS
ok test/d/dup 3.224s
4 倍的改进!凉爽的 !下一步是什么 ?接下来是找到一个更好的算法,它产生更少的执行、更少的查找等等。
尽管如此,上一个版本包含一个错误!你发现了吗?
看到这个测试,它比另一个更好,因为它不依赖于随机数,而是依赖于具有强相等性检查的静态值。使用这些类型的测试,您可以定制输入以检查细粒度情况。
func TestDedup2_static(t *testing.T) {
type expectation struct {
input []uint32
want []uint32
}
expectations := []expectation{
expectation{
input: []uint32{0, 0, 1, 2, 3, 3, 3, 4, 4, 5},
want: []uint32{0, 1, 2, 3, 4, 5},
},
expectation{
input: []uint32{0, 1, 2, 3, 3, 3, 4, 4, 5},
want: []uint32{0, 1, 2, 3, 4, 5},
},
}
for _, e := range expectations {
res := dedup2(e.input)
if !reflect.DeepEqual(res, e.want) {
t.Fatalf("invlaid result, wanted=%#v\ngot=%#v\n", e.want, res)
}
}
}
它使用 table 路测,如 https://dave.cheney.net/2013/06/09/writing-table-driven-tests-in-go
所述让我们解决这个问题:
func dedup2(s []uint32) []uint32 {
if len(s) < 2 {
return s
}
var e int = 1
for i := 1; i < len(s); i++ {
if s[i] == s[i-1] {
continue
}
s[e] = s[i]
e++
}
return s[:e]
}
要删除切片的重复元素,您可以创建一个映射并将切片值指定为映射的键,然后遍历映射并将键值附加到新的 slice.Here 是代码同样的逻辑:
package main
import (
"fmt"
"sort"
)
func removeDupe(slc []int) []int {
var tmpSlice []int
chkMap := make(map[int]bool)
for _, val := range slc {
chkMap[val] = true
}
for k, _ := range chkMap {
tmpSlice = append(tmpSlice, k)
}
sort.Ints(tmpSlice)
return tmpSlice
}
func main() {
mySlice := []int{1, 2, 3, 4, 5, 1, 2, 3, 9, 0}
formattedSlice := removeDupe(mySlice)
fmt.Println(formattedSlice)
}
输出:
[0 1 2 3 4 5 9]