Clojure 中指定大小的随机 AST 生成

Question

我想生成一个随机的抽象语法树

(def terminal-set #{'x 'R})
(def function-arity {'+ 2, '- 2, '* 2, '% 2})
(def function-set (into #{} (keys function-arity)))
(def terminal-vec (into [] terminal-set))
(def function-vec (into [] function-set))

;; protected division
(defn % [^Number x ^Number y]
  (if (zero? y)
    0
    (/ x y)))

具有指定尺寸

(defn treesize [tree] (count (flatten tree)))

遵循 Sean Luke 一书的算法，2013 年，Essentials of Metaheuristics，Lulu，第二版，可在 https://cs.gmu.edu/~sean/book/metaheuristics/

获取

We randomly extend the horizon of a tree with nonleaf nodes until the number of nonleaf nodes, plus the remaining spots, is greater than or equal to the desired size. We then populate the remaining slots with leaf nodes:

例如

(+ (* x (+ x x)) x)

尺寸为 7。

书上的算法用的是pointers/referencesQ在那里很方便。在我的例子中，我必须使用某种递归来构造树。问题是我无法在所有使用递归的算法之间保持树的状态 size，这会导致更大的树：

(defn ptc2-tree
  "Generate a random tree up to its `max-size`.
  Note: `max-size` is the number of nodes, not the same as its depth."
  [max-size]
  (if (> 2 max-size)
    (rand-nth terminal-vec)
    (let [fun   (rand-nth function-vec)
          arity (function-arity fun)]
      (cons fun (repeatedly arity #(ptc2-tree (- max-size arity 1)))))))

我也尝试过使用 atom 作为大小，但仍然无法获得我想要的确切树大小，根据实施情况，它要么太小要么太大。

除此之外，我还必须以某种方式随机化插入新 node/tree 的位置。

这个算法怎么写？

编辑： 正确解决方案的最后一步：

(defn sequentiate [v] 
  (map #(if (seqable? %) (sequentiate %) %) (seq v)))

Answer 1

以下是文章中PTC2算法的逐字逐句翻译。这不是很地道的 Clojure 代码；您可能希望将其拆分为您认为合理的函数/更小的块。

(defn ptc2 [target-size]
  (if (= 1 target-size)
    (rand-nth terminal-vec)
    (let [f (rand-nth function-vec)
          arity (function-arity f)]
      ;; Generate a tree like [`+ nil nil] and iterate upon it
      (loop [ast (into [f] (repeat arity nil))
             ;; q will be something like ([1] [2]), being a list of paths to the
             ;; nil elements in the AST
             q (for [i (range arity)] [(inc i)])
             c 1]
        (if (< (+ c (count q)) target-size)
          ;; Replace one of the nils in the tree with a new node
          (let [a (rand-nth q)
                f (rand-nth function-vec)
                arity (function-arity f)]
            (recur (assoc-in ast a (into [f] (repeat arity nil)))
                   (into (remove #{a} q)
                         (for [i (range arity)] (conj a (inc i))))
                   (inc c)))
          ;; In the end, fill all remaining slots with terminals
          (reduce (fn [t path] (assoc-in t path (rand-nth terminal-vec)))
                  ast q))))))

您可以使用 Clojure 的 loop 构造（或 reduce 来保持迭代的状态——在此算法中，状态包括）：

ast，这是一个嵌套向量，表示正在构建的公式，其中未完成的节点标记为nil;
q，对应伪代码中的Q，是ast,
和c，也就是树中非叶节点的个数。

在结果中，您会得到如下内容：

(ptc2 10) ;; => [* [- R [% R [% x x]]] [- x R]]

我们使用向量（而不是列表）制作 AST，因为它允许我们使用 assoc-in 逐步构建树；如果需要，您可能希望自己将其转换为嵌套列表。

Answer 2

巧合的是，我一直在研究the Tupelo Forest library. You can see example code here, and a video from the 2017 Clojure/Conj here中的AST操作代码。

下面显示我将如何解决这个问题。我尽量让名字不言自明，这样应该很容易理解算法是如何进行的。

基础知识：

(def op->arity {:add 2
                :sub 2
                :mul 2
                :div 2
                :pow 2})
(def op-set (set (keys op->arity)))
(defn choose-rand-op [] (rand-elem op-set))

(def arg-set #{:x :y})
(defn choose-rand-arg [] (rand-elem arg-set))

(defn num-hids [] (count (all-hids)))

辅助函数：

(s/defn hid->empty-kids :- s/Int
  [hid :- HID]
  (let [op             (hid->attr hid :op)
        arity          (grab op op->arity)
        kid-slots-used (count (hid->kids hid))
        result         (- arity kid-slots-used)]
    (verify (= 2 arity))
    (verify (not (neg? result)))
    result))

(s/defn node-has-empty-slot? :- s/Bool
  [hid :- HID]
  (pos? (hid->empty-kids hid)))

(s/defn total-empty-kids :- s/Int
  []
  (reduce +
    (mapv hid->empty-kids (all-hids))))

(s/defn add-op-node :- HID
  [op :- s/Keyword]
  (add-node {:tag :op :op op} )) ; add node w no kids

(s/defn add-leaf-node :- tsk/KeyMap
  [parent-hid :- HID
   arg :- s/Keyword]
  (kids-append parent-hid [(add-leaf {:tag :arg :arg arg})]))

(s/defn need-more-op? :- s/Bool
  [tgt-size :- s/Int]
  (let [num-op            (num-hids)
        total-size-so-far (+ num-op (total-empty-kids))
        result            (< total-size-so-far tgt-size)]
    result))

主要算法：

(s/defn build-rand-ast :- tsk/Vec ; bush result
  [ast-size]
  (verify (<= 3 ast-size)) ; 1 op & 2 args minimum;  #todo refine this
  (with-debug-hid
    (with-forest (new-forest)
      (let [root-hid (add-op-node (choose-rand-op))] ; root of AST
        ; Fill in random op nodes into the tree
        (while (need-more-op? ast-size)
          (let [node-hid (rand-elem (all-hids))]
            (when (node-has-empty-slot? node-hid)
              (kids-append node-hid
                [(add-op-node (choose-rand-op))]))))
        ; Fill in random arg nodes in empty leaf slots
        (doseq [node-hid (all-hids)]
          (while (node-has-empty-slot? node-hid)
            (add-leaf-node node-hid (choose-rand-arg))))
        (hid->bush root-hid)))))

(defn bush->form [it]
  (let [head (xfirst it)
        tag  (grab :tag head)]
    (if (= :op tag)
      (list (kw->sym (grab :op head))
        (bush->form (xsecond it))
        (bush->form (xthird it)))
      (kw->sym (grab :arg head)))))

(dotest
  (let [tgt-size 13]
    (dotimes [i 5]
      (let [ast     (build-rand-ast tgt-size)
            res-str (pretty-str ast)]
        (nl)
        (println res-str)
        (println (pretty-str (bush->form ast))) ))))

它以层次 "bush" 格式和 lispy 格式打印结果。以下是 2 个典型结果：

[{:tag :op, :op :mul}
 [{:tag :op, :op :div}
  [{:tag :op, :op :pow}
   [{:tag :op, :op :sub}
    [{:tag :arg, :arg :y, :value nil}]
    [{:tag :arg, :arg :x, :value nil}]]
   [{:tag :op, :op :div}
    [{:tag :arg, :arg :y, :value nil}]
    [{:tag :arg, :arg :y, :value nil}]]]
  [{:tag :arg, :arg :y, :value nil}]]
 [{:tag :op, :op :pow}
  [{:tag :arg, :arg :x, :value nil}]
  [{:tag :arg, :arg :y, :value nil}]]]

(mul (div (pow (sub y x) (div y y)) y) (pow x y))


[{:tag :op, :op :div}
 [{:tag :op, :op :mul}
  [{:tag :op, :op :pow}
   [{:tag :arg, :arg :x, :value nil}]
   [{:tag :arg, :arg :y, :value nil}]]
  [{:tag :op, :op :add}
   [{:tag :op, :op :div}
    [{:tag :arg, :arg :x, :value nil}]
    [{:tag :arg, :arg :y, :value nil}]]
   [{:tag :arg, :arg :x, :value nil}]]]
 [{:tag :op, :op :mul}
  [{:tag :arg, :arg :x, :value nil}]
  [{:tag :arg, :arg :y, :value nil}]]]

(div (mul (pow x y) (add (div x y) x)) (mul x y))

为简单起见，我使用了三个字母的操作码而不是数学符号，但它们可以很容易地替换为 Clojure 函数符号名称以输入 eval。

Clojure 中指定大小的随机 AST 生成

Random AST generation with the specified size in Clojure

random

algorithm

tree

clojure

genetic-programming