如何使数组不同

How to make array distinct

我有一个整数和字符串字段数组。为了使其与众不同,我目前逐行复制到新数组中,并且对于每条记录,我检查该记录是否已存在于新数组中,如果不存在,我将其复制。最后,我将新数组复制回原始数组。

有效,但速度较慢。有没有更快、更简单的方法来做到这一点?

TArrayMixed = record
    Field1: integer;
    Field2: integer;
    Field3: integer;
    Field4: string;
    Field5: string;
    Field6: string;
 end;

procedure TForm1.Button10Click(Sender: TObject);
var
  ArrayMixed, ArrayMixed_tmp: array of TArrayMixed;
  i, j, vIdx: integer;
  vExists: boolean;
begin
  SetLength(ArrayMixed, 100000);
  for i := 0 to 99999 do
  begin
    ArrayMixed[i].Field1 := 1 + Random(5);
    ArrayMixed[i].Field2 := 1 + Random(5);
    ArrayMixed[i].Field3 := 1 + Random(5);
    ArrayMixed[i].Field4 := 'String';
    ArrayMixed[i].Field5 := 'Another string';
    ArrayMixed[i].Field6 := 'New string';
  end;

  // Sort
  TArray.Sort<TArrayMixed > (ArrayMixed, TComparer<TArrayMixed > .Construct(function(const Left, Right: TArrayMixed): Integer
    begin
      Result := MyCompareAMixed(Left, Right);
    end
    ));

  // Distinct
  SetLength(ArrayMixed_tmp, Length(ArrayMixed));
  vIdx := 0;
  for i := Low(ArrayMixed) to High(ArrayMixed) do
  begin
    vExists := False;
    for j := Low(ArrayMixed_tmp) to vIdx - 1 do
      if (ArrayMixed_tmp[j].Field1 = ArrayMixed[i].Field1) and
        (ArrayMixed_tmp[j].Field2 = ArrayMixed[i].Field2) and
        (ArrayMixed_tmp[j].Field3 = ArrayMixed[i].Field3) and
        (ArrayMixed_tmp[j].Field4 = ArrayMixed[i].Field4) and
        (ArrayMixed_tmp[j].Field5 = ArrayMixed[i].Field5) and
        (ArrayMixed_tmp[j].Field6 = ArrayMixed[i].Field6) then
      begin
        vExists := True;
        Break;
      end;

    if not vExists then
    begin
      ArrayMixed_tmp[vIdx] := ArrayMixed[i];
      Inc(vIdx);
    end;
  end;
  SetLength(ArrayMixed_tmp, vIdx);

  // now copy back to original array
  SetLength(ArrayMixed, 0);
  SetLength(ArrayMixed, Length(ArrayMixed_tmp));
  for i := Low(ArrayMixed_tmp) to High(ArrayMixed_tmp) do
    ArrayMixed[i] := ArrayMixed_tmp[i];

  sleep(0);

end;

编辑:

由于在实际数据中,字符串并不完全相同,因此当原始数组像这样填充时,它创建不同数组的部分速度较慢:

编辑 #2:(在编辑 #1 中复制了错误的代码)

for i := 0 to 999999 do
  begin
    ArrayMixed[i].Field1 := 1 + Random(5);
    ArrayMixed[i].Field2 := 1 + Random(5);
    ArrayMixed[i].Field3 := 1 + Random(5);
    ArrayMixed[i].Field4 := 'String'+IntToStr(i mod 5);
    ArrayMixed[i].Field5 := 'Another string'+IntToStr(i mod 5);
    ArrayMixed[i].Field6 := 'New string'+IntToStr( i mod 5);
  end;

编辑 #3:发布排序代码 - 仅对前 3 个字段进行排序!

TMyArray3 = array[1..3] of Integer;

    function CompareIntegerArray3(const lhs, rhs: TMyArray3): Integer;
    var
      i: Integer;
    begin
      Assert(Length(lhs) = Length(rhs));
      for i := low(lhs) to high(lhs) do
        if lhs[i] < rhs[i] then
          exit(-1)
        else if lhs[i] > rhs[i] then
          exit(1);

      exit(0);
    end;

    function GetMyArrayAMixed(const Value: TArrayMixed): TMyArray3;
    begin
      Result[1] := Value.Field1;
      Result[2] := Value.Field2;
      Result[3] := Value.Field3;
    end;

    function MyCompareAMixed(const lhs, rhs: TArrayMixed): Integer;
    begin
      Result := CompareIntegerArray3(GetMyArrayAMixed(lhs), GetMyArrayAMixed(rhs));
    end;

由于您已经对 ArrayMixed 进行了排序,因此无需相互比较每个项目即可找到重复项。重复项已经彼此相邻放置。因此,您只需遍历 ArrayMixed 并将当前项目与 ArrayMixed_tmp 中的最后一项进行比较。 因此,复制不同的项目可以进行得更快,看起来像这样:

vIdx := 0;
for i := Low(ArrayMixed) to High(ArrayMixed) do
begin
  if (vIdx = 0) or // the first item can never be a duplicate
     (ArrayMixed_tmp[vIdx].Field1 <> ArrayMixed[i].Field1) or
     (ArrayMixed_tmp[vIdx].Field2 <> ArrayMixed[i].Field2) or
     (ArrayMixed_tmp[vIdx].Field3 <> ArrayMixed[i].Field3) or
     (ArrayMixed_tmp[vIdx].Field4 <> ArrayMixed[i].Field4) or
     (ArrayMixed_tmp[vIdx].Field5 <> ArrayMixed[i].Field5) or
     (ArrayMixed_tmp[vIdx].Field6 <> ArrayMixed[i].Field6) then
  begin
    ArrayMixed_tmp[vIdx] := ArrayMixed[i];
    Inc(vIdx);
  end;
end;

只需检查先前索引旁边的重复项,因为数组已排序。这里也是重用的排序比较器。

function RemoveDuplicates(const anArray: array of TArrayMixed): TArray<TArrayMixed>;
var
  j, vIdx: integer;
begin
  // Sort
  TArray.Sort<TArrayMixed > (anArray, TComparer<TArrayMixed >.Construct(function(const Left, Right: TArrayMixed): Integer
    begin
      Result := MyCompareAMixed(Left, Right);
    end
    ));

  // Distinct
  SetLength(Result, Length(anArray));
  vIdx := 0;
  j := 0;
  while (j <= High(anArray) do
  begin
    Result[vIdx] := anArray[j];
    Inc(j);
    While (j <= High(anArray)) and (MyCompareAMixed(Result[vIdx],anArray[j]) = 0) do
      Inc(j);
   Inc(vIdx);
  end;
  SetLength(Result, vIdx);
end;

更新:

在问题的更新中指出数组仅部分排序。减少迭代次数以删除重复项的一种方法是:

  • 查找共享第一个排序条件的项目的开始和停止索引。
  • 在它们之间进行迭代以排除重复项。

您尚未发布 MyCompareAMixed() 函数的代码,因此无法测试包含此未定义函数的实际代码的性能,包括当前排序性能.

但是,由于您发布的重复检测方法不依赖于排序的数组,我只是从代码中删除了排序。

在没有任何进一步优化的情况下,所产生的重复数据删除过程在不到 50 毫秒的时间内完成,这不是 "slow" 在我的书中对 100,000 个复杂项目的重复数据删除。即不是单个值,而是记录多个值的项目。

如果由于其他原因需要排序那么你可以保留排序并根据其他人给出的答案优化重复数据删除过程,但我首先会质疑为什么你认为这个过程很慢并且如果低于50毫秒真的很慢,你的目标是什么?

可能是排序增加了开销(正如我所说,没有你的比较功能,我们无法量化这增加的开销)所以如果由于其他原因不需要这种排序并且如果低于 50 -msecs 可以接受这个数组的重复数据删除,然后我会继续其他任务。

  • 为记录实施一些方法(相等性检查、哈希码)以摆脱大量样板代码

type
  TArrayMixed = record
    Field1: integer;
    Field2: integer;
    Field3: integer;
    Field4: string;
    Field5: string;
    Field6: string;
    class operator Equal( const a, b: TArrayMixed ): Boolean;
    class function Compare( const L, R: TArrayMixed ): integer; overload; static;
    function Compare( const Other: TArrayMixed ): integer; overload;
    function GetHashCode( ): integer;
  end;

  { TArrayMixed }

class function TArrayMixed.Compare( const L, R: TArrayMixed ): integer;
begin
  Result := L.Compare( R );
end;

function TArrayMixed.Compare( const Other: TArrayMixed ): integer;
begin
  Result := Field1 - Other.Field1;
  if Result = 0
  then
    begin
      Result := Field2 - Other.Field2;
      if Result = 0
      then
        begin
          Result := Field3 - Other.Field3;
          if Result = 0
          then
            begin
              Result := CompareStr( Field4, Other.Field4 );
              if Result = 0
              then
                begin
                  Result := CompareStr( Field5, Other.Field5 );
                  if Result = 0
                  then
                    begin
                      Result := CompareStr( Field6, Other.Field6 );
                    end;
                end;
            end;
        end;
    end;
end;

class operator TArrayMixed.Equal( const a, b: TArrayMixed ): Boolean;
begin
  Result := true
  {} and ( a.Field1 = b.Field1 )
  {} and ( a.Field2 = b.Field2 )
  {} and ( a.Field3 = b.Field3 )
  {} and ( a.Field4 = b.Field4 )
  {} and ( a.Field5 = b.Field5 )
  {} and ( a.Field6 = b.Field6 );
end;

function TArrayMixed.GetHashCode: integer;
begin
{$IFOPT Q+}
{$Q-}
{$DEFINE SET_Q_ON}
{$ENDIF}
  Result := 17;
  Result := Result * 31 + Field1;
  Result := Result * 31 + Field2;
  Result := Result * 31 + Field3;
  Result := Result * 31 + Field4.GetHashCode;
  Result := Result * 31 + Field5.GetHashCode;
  Result := Result * 31 + Field6.GetHashCode;
{$IFDEF SET_Q_ON}
{$Q+}
{$UNDEF SET_Q_ON}
{$ENDIF}
end;
  • 使用 TDictionary 作为 HashSet 检查重复项

procedure Test;
var
  arr1, arr2: TArray<TArrayMixed>;
  idx       : integer;
  lst       : TDictionary<TArrayMixed, integer>;
begin
  // fill the array
  SetLength( arr1, 100000 );
  for idx := low( arr1 ) to high( arr1 ) do
    begin
      arr1[ idx ].Field1 := 1 + Random( 5 );
      arr1[ idx ].Field2 := 1 + Random( 5 );
      arr1[ idx ].Field3 := 1 + Random( 5 );
      arr1[ idx ].Field4 := 'String' + IntToStr( idx mod 5 );
      arr1[ idx ].Field5 := 'Another string' + IntToStr( idx mod 5 );
      arr1[ idx ].Field6 := 'New string' + IntToStr( idx mod 5 );
    end;

  // distinct
  lst := TDictionary<TArrayMixed, integer>.Create( TEqualityComparer<TArrayMixed>.Construct(
    function( const L, R: TArrayMixed ): Boolean
    begin
      Result := ( L = R );
    end,
    function( const i: TArrayMixed ): integer
    begin
      Result := i.GetHashCode( );
    end ) );
  try
    for idx := low( arr1 ) to high( arr1 ) do
      begin
        lst.AddOrSetValue( arr1[ idx ], 0 );
      end;

    arr2 := lst.Keys.ToArray;

  finally
    lst.Free;
  end;
end;

我会做这样的事情。您基本上可以即时创建结果并使用二分搜索同时进行排序以删除重复项。

function RemoveDuplicates(aSourceArray: TArray<TArrayMixed>): TArray<TArrayMixed>;
var
  i: Integer;
  index: Integer;
  sortList: TList<TArrayMixed>;
begin
  sortList := TList<TArrayMixed>.Create;
  try
    for i := Low(aSourceArray) to High(aSourceArray) do
    begin
      if not sortList.BinarySearch(aSourceArray[i], index,
        TDelegatedComparer<TArrayMixed>.Construct(
        function(const L, R: TArrayMixed): integer
        begin
          Result := L.Field1 - R.Field1;
          if Result <> 0 then Exit;
          Result := L.Field2 - R.Field2;
          if Result <> 0 then Exit;
          Result := L.Field3 - R.Field3;
          if Result <> 0 then Exit;
          Result := CompareStr(L.Field4, R.Field4);
          if Result <> 0 then Exit;
          Result := CompareStr(L.Field5, R.Field5);
          if Result <> 0 then Exit;
          Result := CompareStr(L.Field6, R.Field6);
        end)) then
      begin
        sortList.Insert(index, aSourceArray[i]);
      end;
    end;
    Result := sortList.ToArray;
  finally
    sortList.Free;
  end;
end;

要使用此代码,您可以这样做:

procedure TForm1.Button10Click(Sender: TObject);
var
  ArrayMixed, ArrayMixed_tmp: TArray<TArrayMixed>;
  i: Integer;
begin
  SetLength(ArrayMixed, 100000);
  for i := 0 to 999999 do
  begin
    ArrayMixed[i].Field1 := 1 + Random(5);
    ArrayMixed[i].Field2 := 1 + Random(5);
    ArrayMixed[i].Field3 := 1 + Random(5);
    ArrayMixed[i].Field4 := 'String'+IntToStr(i mod 5);
    ArrayMixed[i].Field5 := 'Another string'+IntToStr(i mod 5);
    ArrayMixed[i].Field6 := 'New string'+IntToStr( i mod 5);
  end;
  ArrayMixed_tmp := RemoveDuplicates(ArrayMixed);
end;