从 Common Lisp / SBCL 中榨取更多的速度

Question

This paper 声称可以使某个 Lisp 程序运行比它的 C 程序快相等的。试图重现结果，我能够接近（Lisp 比 C) 慢 50% 但想知道是否有人知道如何挤压更多性能超出 SBCL 1.3.1.

目标问题是向 800 x 中的每个单元格添加一个常量单浮点数 800 个单浮点数组。方法是用C写程序，在和Common Lisp 比较时代。使用这个 portable timer，C 代码如下如下：

#include <stddef.h>
#include <stdint.h>
#include <stdio.h>
#include <stdlib.h>
#include <assert.h>
#include <time.h>

#include "./modules/tictoc/tictoc.h"

const int HORZ = 800;
const int VERT = 800;

#define PERF_REPS 1000

typedef float DATA_T;

struct image_s {
    size_t n;
    size_t w, h;
    DATA_T * data;
};
typedef struct image_s image;

image * make_image (size_t w, size_t h) {
    size_t n = w * h;
    DATA_T * data = (DATA_T *)malloc(sizeof(DATA_T) * n);
    assert (NULL != data);
    image * result = (image *)malloc(sizeof(image));
    assert (NULL != result);
    result->n = n;
    result->w = w;
    result->h = h;
    result->data = data;
    return result;
}

void free_image (image * it) {
    assert (NULL != it);
    assert (NULL != it->data);
    free (it->data);
    free (it);
}

image * init_to_value (image * it, DATA_T val) {
    assert (NULL != it);
    assert (NULL != it->data);
    size_t i;
    const size_t n = it->n;
    for (i = 0; i < n; ++i) {
        it->data[i] = val;
    }
    return it;
}

void add (image * to, image * from, DATA_T val) {
    assert (NULL != to);
    assert (NULL != to->data);
    assert (NULL != from);
    assert (NULL != from->data);
    size_t i;
    const size_t n = to->n;
    for (i = 0; i < n; ++i) {
        to->data[i] = from->data[i] + val;
    }
}

int main (int argc, char ** argv) {
    image * from = init_to_value (make_image (HORZ, VERT), 0.0f);
    image * to = init_to_value (make_image (HORZ, VERT), 0.0f);
    TicTocTimer clock = tic();
    for (size_t i = 0; i < PERF_REPS; ++i)
        add (to, from, 42.0);
    printf("Elapsed time %f seconds.\n",toc(&clock));
    free_image (to);
    free_image (from);
    return 0;
}

我编译运行代码如下：

gcc -O3 image-add.c ./modules/tictoc/libtictoc.a && ./a.out

我的 mac book pro 的典型时间约为 0.178 秒。挺好看的

等效的 Lisp 代码，使用了我在 Lisp 中能够找到的每个选项 hyperspec, in the new book Common Lisp Recipes, and in the SBCL user manual, 是如下。评论表明我尝试过的一些事情没有成功区别。

;;; None of the commented-out declarations made any difference in speed. 

(declaim (optimize speed (safety 0)))

(defun image-add (to from val)
  (declare (type (simple-array single-float (*))
                 to from))
  (declare (type single-float val))
  (let ((size (array-dimension to 0)))
    ;(declare (type fixnum size))
    (dotimes (i size)
      ;(declare (type fixnum i))
      (setf (aref to i) (+ (aref from i) val)))))

(defparameter HORZ 800)
(defparameter VERT 800)

(defparameter PERF-REPS 1000)

(let ((to (make-array (* HORZ VERT) :element-type 'single-float))
      (fm (make-array (* HORZ VERT) :element-type 'single-float)))
  ;(declare (type fixnum HORZ))
  ;(declare (type fixnum VERT))
  (time (dotimes (_ PERF-REPS)
          ;(declare (type fixnum PERF-REPS))
          ;(declare (type fixnum _))
          ;(declare (inline image-add))
          (image-add to fm 42.0))))

我编译运行如下：

sbcl --script image-perf.lisp

和典型的运行次是 0.276。不错，但我想要更好。当然，练习的重点是 Lisp 代码更短，但是任何人都知道如何使它变得更快或更快？

Answer 1

SBCL 提供了一堆信息

当我保存你的代码（并将最终的 let 示例包装到一个单独的函数中），并使用 SBCL 编译它时，我实际上得到了一堆诊断输出，通知我们一些可以生成更好代码的地方。有很多，但请浏览这里，虽然它都在 test 中，所以它可能有帮助也可能没有帮助。但是，由于测试代码可能会减慢速度，因此值得一试。

CL-USER> (compile-file ".../compile.lisp")
; compiling file ".../compile.lisp" (written 25 JAN 2016 01:53:23 PM):
; compiling (DECLAIM (OPTIMIZE SPEED ...))
; compiling (DEFUN IMAGE-ADD ...)
; compiling (DEFPARAMETER HORZ ...)
; compiling (DEFPARAMETER VERT ...)
; compiling (DEFPARAMETER PERF-REPS ...)
; compiling (DEFUN TEST ...)

; file: /home/taylorj/tmp/compile.lisp
; in: DEFUN TEST
;     (* HORZ VERT)
; 
; note: unable to
;   convert x*2^k to shift
; due to type uncertainty:
;   The first argument is a NUMBER, not a INTEGER.
;   The second argument is a NUMBER, not a INTEGER.
; 
; note: unable to
;   convert x*2^k to shift
; due to type uncertainty:
;   The first argument is a NUMBER, not a INTEGER.
;   The second argument is a NUMBER, not a INTEGER.

;     (DOTIMES (_ PERF-REPS) (IMAGE-ADD TO FM 42.0))
; --> DO BLOCK LET TAGBODY UNLESS IF >= IF 
; ==>
;   (< SB-C::X SB-C::Y)
; 
; note: forced to do GENERIC-< (cost 10)
;       unable to do inline fixnum comparison (cost 4) because:
;       The first argument is a UNSIGNED-BYTE, not a FIXNUM.
;       The second argument is a INTEGER, not a FIXNUM.

; --> DO BLOCK LET TAGBODY PSETQ PSETF LET* MULTIPLE-VALUE-BIND LET 1+ 
; ==>
;   (+ _ 1)
; 
; note: forced to do GENERIC-+ (cost 10)
;       unable to do inline fixnum arithmetic (cost 1) because:
;       The first argument is a UNSIGNED-BYTE, not a FIXNUM.
;       The result is a (VALUES (INTEGER 1) &OPTIONAL), not a (VALUES FIXNUM
;                                                                     &REST T).
;       unable to do inline fixnum arithmetic (cost 2) because:
;       The first argument is a UNSIGNED-BYTE, not a FIXNUM.
;       The result is a (VALUES (INTEGER 1) &OPTIONAL), not a (VALUES FIXNUM
;                                                                     &REST T).
;       etc.

;     (* HORZ VERT)
; 
; note: forced to do GENERIC-* (cost 30)
;       unable to do inline fixnum arithmetic (cost 4) because:
;       The first argument is a NUMBER, not a FIXNUM.
;       The second argument is a NUMBER, not a FIXNUM.
;       unable to do inline (signed-byte 64) arithmetic (cost 5) because:
;       The first argument is a NUMBER, not a (SIGNED-BYTE 64).
;       The second argument is a NUMBER, not a (SIGNED-BYTE 64).
;       etc.
; 
; note: forced to do GENERIC-* (cost 30)
;       unable to do inline fixnum arithmetic (cost 4) because:
;       The first argument is a NUMBER, not a FIXNUM.
;       The second argument is a NUMBER, not a FIXNUM.
;       unable to do inline (signed-byte 64) arithmetic (cost 5) because:
;       The first argument is a NUMBER, not a (SIGNED-BYTE 64).
;       The second argument is a NUMBER, not a (SIGNED-BYTE 64).
;       etc.
; 
; compilation unit finished
;   printed 6 notes

; .../compile.fasl written
; compilation finished in 0:00:00.009

这些东西都在 test 中，但是因为你 are 计时你的 dotimes 循环，至少修复该比较可能是有意义的。这是带有声明的 test 代码，应该使 dotimes 更快一点：

(defun test ()
  (declare (type fixnum HORZ VERT PERF-REPS))
  (let ((to (make-array (* HORZ VERT) :element-type 'single-float))
        (fm (make-array (* HORZ VERT) :element-type 'single-float)))
    (time (dotimes (_ PERF-REPS)
            (image-add to fm 42.0)))))

之后，您可能想要研究可能的循环展开、缓存数组维度并考虑数组的内存局部性。不过，这些都是非常通用的提示。我不确定哪些具体的事情可以在这里提供更多帮助。

关于"be sure to use optimize and safety"

的标准答案

Edit: shoot, I missed the toplevel declaim that does reference speed and safety. It's still worth checking into whether those are having the desired effect, but if they are, then this answer is mostly redundant.

在大多数情况下，允许编译器完全忽略声明。（唯一的例外是 special 声明，它改变了变量的绑定语义。）所以编译器如何处理它们取决于它。像 type 这样的声明至少可以以两种不同的方式使用。如果您正在尝试编译非常安全的代码，类型声明会让编译器知道可以添加 额外的 检查。当然，这会导致代码变慢，但会更安全。另一方面，如果您试图生成非常快速的代码，那么编译器可以将这些类型声明作为您保证值始终是正确类型的保证，因此生成更快的代码。

您似乎只是在添加类型声明。如果您想要更快（或更安全）的代码，您也需要添加声明来说明这一点。在这种情况下，您可能需要 (declare (optimize (speed 3) (safety 0)))。例如，看一下 returns 其 fixnum 参数的简单函数的一些反汇编。首先，仅声明类型，代码最终为 18 个字节：

(defun int-identity (x)
  (declare (type fixnum x))
  x)

INT-IDENTITY
CL-USER> (disassemble 'int-identity)
; disassembly for INT-IDENTITY
; Size: 18 bytes. Origin: #x100470619A
; 9A:       488BE5           MOV RSP, RBP                     ; no-arg-parsing entry point
; 9D:       F8               CLC
; 9E:       5D               POP RBP
; 9F:       C3               RET
; A0:       CC0A             BREAK 10                         ; error trap
; A2:       02               BYTE #X02
; A3:       19               BYTE #X19                        ; INVALID-ARG-COUNT-ERROR
; A4:       9A               BYTE #X9A                        ; RCX
; A5:       CC0A             BREAK 10                         ; error trap
; A7:       04               BYTE #X04
; A8:       08               BYTE #X08                        ; OBJECT-NOT-FIXNUM-ERROR
; A9:       FE1B01           BYTE #XFE, #X1B, #X01            ; RDX
NIL

现在，如果我们还添加速度优化，代码大小会增加一点点。（但这不一定是坏事。一些速度优化，例如循环展开或函数内联，会生成更大的代码。）

CL-USER> 
(defun int-identity (x)
  (declare (type fixnum x)
           (optimize (speed 3)))
  x)

STYLE-WARNING: redefining COMMON-LISP-USER::INT-IDENTITY in DEFUN
INT-IDENTITY
CL-USER> (disassemble 'int-identity)
; disassembly for INT-IDENTITY
; Size: 20 bytes. Origin: #x1004A5D23D
; 3D:       488BE5           MOV RSP, RBP                     ; no-arg-parsing entry point
; 40:       F8               CLC
; 41:       5D               POP RBP
; 42:       C3               RET
; 43:       CC0A             BREAK 10                         ; error trap
; 45:       04               BYTE #X04
; 46:       19               BYTE #X19                        ; INVALID-ARG-COUNT-ERROR
; 47:       FE9A01           BYTE #XFE, #X9A, #X01            ; RBX
; 4A:       CC0A             BREAK 10                         ; error trap
; 4C:       04               BYTE #X04
; 4D:       08               BYTE #X08                        ; OBJECT-NOT-FIXNUM-ERROR
; 4E:       FE1B01           BYTE #XFE, #X1B, #X01            ; RDX
NIL

最后，当我们去掉安全后，我们终于得到一些真正的短代码，只有9个字节：

CL-USER> 
(defun int-identity (x)
  (declare (type fixnum x)
           (optimize (speed 3)
                     (safety 0)))
  x)

STYLE-WARNING: redefining COMMON-LISP-USER::INT-IDENTITY in DEFUN
INT-IDENTITY
CL-USER> (disassemble 'int-identity)
; disassembly for INT-IDENTITY
; Size: 9 bytes. Origin: #x1004AFF3E2
; 2:       488BD1           MOV RDX, RCX                      ; no-arg-parsing entry point
; 5:       488BE5           MOV RSP, RBP
; 8:       F8               CLC
; 9:       5D               POP RBP
; A:       C3               RET

Answer 2

使用@coredump 的:lparallel 建议，我能够持续运行 0.125 秒，明显快于 gcc -O3 的 0.175。编译文件和内联 image-add 函数的注释中建议的各种技术并未产生明显的加速。这是迄今为止最快的代码。

(load "~/quicklisp/setup.lisp")
(ql:quickload :lparallel)

(declaim (optimize (speed 3) (safety 0)))

(defparameter HORZ 800)
(defparameter VERT 800)

(defparameter PERF-REPS 1000)

(setf lparallel:*kernel* (lparallel:make-kernel 4))

(defun test ()
  (declare (type fixnum HORZ VERT PERF-REPS))
  (let ((to (make-array (* HORZ VERT) :element-type 'single-float))
        (fm (make-array (* HORZ VERT) :element-type 'single-float)))
    (time (dotimes (_ PERF-REPS)
            (lparallel:pmap-into to (lambda (x) (+ x 42f0)) fm)))))

(test)

编辑：我会注意到这并不完全公平：我向 Lisp 代码添加了显式并行性，但没有向 C 代码添加。然而，值得注意的是，使用 Lisp 做到这一点是多么容易。因为在这种情况下 Lisp 相对于 C 的主要优势是代码简洁和添加并行性等功能相对容易，所以 trade-off 的方向是正确的（说明 Lisp 的相对灵活性）。我怀疑并行 C 代码（如果以及何时可以实现它）将再次比 Lisp 代码更快。

Answer 3

作为参考，这里有一些稍微修改过的结果。

C版

C版平均耗时0.197s.

Lisp 版本

(declaim (optimize (speed 3) (debug 0) (safety 0)))

(defconstant HORZ 800)
(defconstant VERT 800)
(defconstant PERF-REPS 1000)

(defun test ()
  (let ((target #1=(make-array (* HORZ VERT)
                               :element-type 'single-float
                               :initial-element 0f0))
        (source #1#))
    (declare (type (simple-array single-float (*)) target source))
    (time 
      (dotimes (_ PERF-REPS)
        (map-into target
                  (lambda (x)
                    (declare (single-float x))
                    (the single-float (+ x 42f0)))
                  source)))))

这是输出：

Evaluation took:                                                                                                 
  0.372 seconds of real time                                                                                     
  0.372024 seconds of total run time (0.368023 user, 0.004001 system)                                            
  100.00% CPU                                                                                                    
  965,075,988 processor cycles                                                                                   
  0 bytes consed

将map-into替换为lparallel:pmap-into，最短时间是用一个由4个worker组成的内核获得的，并给出：

Evaluation took:                                                                                                 
 0.122 seconds of real time                                                                                     
 0.496031 seconds of total run time (0.492030 user, 0.004001 system)                                            
 406.56% CPU                                                                                                    
 316,445,789 processor cycles                                                                                   
 753,280 bytes consed

注意内存使用的差异。

Answer 4

这是很老的 post 但您可能仍然感兴趣。

如果我追求速度，我会这样做！

当我们比较 C 和 Common Lisp 的速度时，我们需要考虑 C 编译器在高优化设置下可以做什么。

由于带有 -O3 的 C 会执行而 SBCL 不会执行自动矢量化，因此我们在进行比较时必须小心。

通过矢量化，我们可以看到下面的 SBCL 应该非常快。

请先安装sb-simd到~/quicklisp/local-projects。

然后运行 (ql:quickload :sb-simd) 来构建它。

你需要 sbcl-2.1.11 到运行最新的 sb-simd。

生成的矢量化 SBCL 可执行映像比同样进行矢量化的 C 更快，并且很可能还使用 -O3 编译器标志进行循环展开。

编辑：我刚刚检查了生成的程序集，它是矢量化的。使用 --march=native --mavx2 速度确实略有提高。

(declaim (optimize speed (safety 0)))
(ql:quickload :sb-simd :silent t)
(use-package :sb-simd)

(declaim (ftype (function (f32vec f32vec f32) null) image-add))
(defun image-add (to from val)
  (do-vectorized (i 0 (array-total-size to))
    (:unroll 2)
    (setf (f32-aref to i) (f32+ (f32-aref from i) val))))

(defparameter *HORZ* 800)
(defparameter *VERT* 800)

(defparameter *PERF-REPS* 1000)

(defun main ()
  (declare (type fixnum *HORZ* *VERT* *PERF-REPS*))  
  (let ((to   (make-array (* *HORZ* *VERT*) :element-type 'f32))
        (from (make-array (* *HORZ* *VERT*) :element-type 'f32)))
    (time (dotimes (_ *PERF-REPS*)
            (image-add to from 42.0)))))

(save-lisp-and-die "image-add" :executable t :toplevel #'main)

编辑：在我的 i7-7700HQ CPU 我得到：

$ gcc -O3 image-add.c ./tictoc/build/libtictoc.a
$ ./a.out 
Elapsed time 0.124956 seconds.

$ gcc -O3 -march=native -mavx2 image-add.c ./tictoc/build/libtictoc.a
$ ./a.out
Elapsed time 0.116989 seconds.

$ ./image-add 
Evaluation took:
  0.072 seconds of real time
  0.070440 seconds of total run time (0.070236 user, 0.000204 system)
  97.22% CPU
  199,079,678 processor cycles
  0 bytes consed

从 Common Lisp / SBCL 中榨取更多的速度

Squeeze more speed from Common Lisp / SBCL

sbcl

common-lisp

SBCL 提供了一堆信息

关于"be sure to use optimize and safety"

C版

Lisp 版本