将符号方程转换为整数序列

Convert symbolic equations into integer sequence

为了避免在编译 C++ 程序时反复等待几分钟,我想检测 R 中的某些方程形式并将它们转换为整数序列,以供某些进一步的函数(已经编译过)处理在主要软件中)。所讨论的方程目前仅由一个未知参数/参数、一些偏移/缩放类型的操作以及对数和指数的一些组合组成。一个小例子如下:

    #user defined list of equations
eqs <- c('2*log(1+exp(0+3*d))-2', '2*d+1')


#replace this next line with code to detect / arrange equation list into integers
inteqs <- data.frame(type=c(1L,0L),outeroffset = c(-2L,1L),
  inneroffset=c(0L,0L),outermultiply=c(2L,2L),innermultiply=c(3L,1L))

#transformation function within main program
tform <- function(d,type,outeroffset,inneroffset, outermultiply, innermultiply){
  if(type==0) return(outeroffset + outermultiply * (inneroffset + innermultiply *d))
  if(type==1) return(outeroffset + outermultiply * log(1+exp(inneroffset + innermultiply *d)))
}

for(d in c(-2,0,3.1)){ #arbitrary values of the unknown / parameter
  print(2*log(1+exp(0+3*d))-2) #true value eq1
  print(do.call(tform,c(d=d,inteqs[1,]))) #function returned value eq1
  print(2*d+1) #true value eq2
  print(do.call(tform,c(d=d,inteqs[2,])))#function returned value eq2
}

如示例中所述,我想要一些函数将所描述形式的任意方程转换为适当的整数序列。

这是一种方法。这不是最有效或最稳健的方式,但它可以处理您的示例中给出的方程式,并且(据我测试过)对操作数顺序和括号使用方面的差异具有稳健性。它处理non-atomic偏移值(例如,d + -3,而不是d - 3)。

# This recursive function walks down the tree and extracts offsets/multipliers.
# The "e" argument contains the expression we want to parse.  The "l" argument
# specifies the current level we're trying to parse (outer offset -> outer
# multiplier -> type -> inner offset -> inner multiplier).  It returns a named
# list with the relevant values specified.
library(dplyr)
extract.integers = function(e, l) {

  # If we're done, no need to do anything else.
  if(l == "done") { return(list(type = 0)) }

  # Initialize the list of values we're going to return.
  values.to.return = list()

  # Based on the current level, which level will we explore next?
  next.level = "done"
  if(l == "outeroffset") { next.level = "outermultiply" }
  else if(l == "outermultiply") { next.level = "type" }
  else if(l == "type") { next.level = "inneroffset" }
  else if(l == "inneroffset") { next.level = "innermultiply" }

  # If we're finding an offset, determine its value by adding to (or
  # subtracting from) zero.  If we're finding a multiplier, determine its value
  # by multiplying by (or dividing into) one.
  default.arg = 0
  if(grepl("multiply", l)) {
    default.arg = 1
  }

  # If the expression does not involve an operation, we've hit the bottom of
  # the tree.
  if(!is.call(e)) {
    return(list())
  }
  # Otherwise, get the top-level operation.
  else {
    operation = e[[1]]
    log.equation = F
  }

  # If this is a log or parentheses, go straight down to the next level (and
  # set the equation type appropriately if this is a log).
  if(is.element(as.character(operation), c("log", "("))) {
    next.expression = e[[2]]
    if(as.character(operation) == "log") {
      log.equation = T
      if(is.numeric(next.expression[[2]])) {
        next.expression = next.expression[[3]]
      } else {
        next.expression = next.expression[[2]]
      }
      next.expression = next.expression[[2]]
    }
    else {
      next.level = l
    }
  }
  # Otherwise, figure out which argument has the actual value of the
  # offset/multiplier and which has the next expression we're going to parse.
  else {
    arg1 = e[[2]]
    arg2 = e[[3]]
    arg.with.value = arg1
    next.expression = arg2
    if(is.numeric(arg2)) {
      arg.with.value = arg2
      next.expression = arg1
    }
    # If the operation matches the level we're trying to identify, proceed.
    if((grepl("offset", l) & is.element(as.character(operation), c("+", "-"))) |
       (grepl("multiply", l) & is.element(as.character(operation), c("*", "/")))) {
      values.to.return[[l]] = eval(as.call(list(operation, default.arg, arg.with.value)))
    }
    # Otherwise, try the next level down.
    else {
      next.expression = e
    }
  }

  # Recursive call to get values "lower down" in the expression.  Fill in the
  # value of the equation type, if known.
  sub.values.to.return = extract.integers(next.expression, next.level)
  if(log.equation) {
    values.to.return[["type"]] = 1
  }

  # Collect all the values we know so far.  Order is important, because we want
  # to overwrite default/earlier values appropriately.
  return(c(list(type = 0,
                outeroffset = 0,
                inneroffset = 0,
                outermultiply = 1,
                innermultiply = 1),
           sub.values.to.return,
           values.to.return))

}

使用提供的方程式和一些变体进行测试:

# Test equations.
eqs <- c('2*log(1+exp(0+3*d))-2',
         '2*log(1+exp(3*d))-2',
         'log(1+exp(3*d+0))*2-2',
         '2*d+1',
         '(2*d)+1',
         '(1)+(2*d)')
# Parse test equations.
inteqs = do.call(
  "bind_rows",
  lapply(
    eqs,
    function(x) {
      extract.integers(parse(text = x)[[1]], "outeroffset")
    }
  )
) %>%
  select(type, outeroffset, inneroffset, outermultiply, innermultiply)
# Check whether parses are correct.
for(d in c(-2, 0, 3.1)) { #arbitrary values of the unknown / parameter
  print(2*log(1+exp(0+3*d))-2) #true value eq1
  print(do.call(tform,c(d=d,inteqs[1,]))) #function returned value eq1
  print(2*d+1) #true value eq2
  print(do.call(tform,c(d=d,inteqs[4,])))#function returned value eq2
}

完全不同的方法:使用user-provided公式生成数据点,然后对这些点进行曲线拟合以恢复参数。作为奖励,我们可以使用 AIC(或您喜欢的任何度量)来确定用户首先提供的公式类型。

优点: 对编写相同公式的各种方式都很稳健。相对容易地扩大到不同类型的配方。代码比 "tree-walking" 替代方案更具可读性。

缺点: 说真的,这似乎有点矫枉过正。也有可能有点嘈杂(即不能 100% 保证恢复正确的公式)。函数可能需要进行一些调整以使用不同范围的 x 值,具体取决于我们正在测试的公式。您还必须手动指定哪些公式类型允许哪些参数。

# Function to figure out the parameters of a user-written formula.
library(plyr)
library(dplyr)
fit.eqs = function(e) {
  # List the types of formulas we might encounter.
  formula.types = data.frame(
    type = 0:3,
    formula = c(
      "y ~ a * x + b",
      "y ~ a * log(1 + exp(c * x + d)) + b",
      "y ~ a * log(x + d) + b",
      "y ~ a * (exp(x) / log(1 + exp(x + d))) + b"
    ),
    outeroffset = 0,
    inneroffset = c(NA, 0, 0, 0),
    outermultiply = 1,
    innermultiply = c(NA, 1, NA, NA),
    aic = NA,
    stringsAsFactors = F
  )
  # Get some x values over a wide range, and compute the corresponding y
  # values.
  xs = seq(0.01, 10, 0.01)
  ys = eval(eval(substitute(substitute(e, list(d = xs)), list(e = as.quoted(e)[[1]]))))
  data.to.fit = data.frame(x = xs, y = ys + rnorm(length(ys), 0, min(diff(ys)) / 100))
  # Try to fit each formula to the data.
  for(i in 1:nrow(formula.types)) {
    start.params = list(a = 1, b = 0)
    if(!is.na(formula.types$innermultiply[i])) {
      start.params[["c"]] = 1
    }
    if(!is.na(formula.types$inneroffset[i])) {
      start.params[["d"]] = 0
    }
    fit = nls(as.formula(formula.types$formula[i]),
              data = data.to.fit,
              start = start.params,
              control = list(warnOnly = T))
    formula.types$outeroffset[i] = round(coef(fit)[["b"]])
    formula.types$outermultiply[i] = round(coef(fit)[["a"]])
    if(!is.na(formula.types$innermultiply[i])) {
      formula.types$innermultiply[i] = round(coef(fit)[["c"]])
    }
    if(!is.na(formula.types$inneroffset[i])) {
      formula.types$inneroffset[i] = round(coef(fit)[["d"]])
    }
    formula.types$aic[i] = AIC(fit)
  }
  # Return the values we found.
  return(formula.types %>%
           filter(aic == min(aic)) %>%
           mutate(inneroffset = coalesce(inneroffset, 0),
                  innermultiply = coalesce(innermultiply, 1)) %>%
           select(type, outeroffset, inneroffset, outermultiply, innermultiply))
}

# Equations for testing.
eqs <- c('2*log(1+exp(0+3*d))-2',
         '2*log(1+exp(3*d))-2',
         'log(1+exp(3*d+0))*2-2',
         '2*d+1',
         '(2*d)+1',
         '(1)+(2*d)',
         'log(1+d)*2')

# Parse the equations and produce the correct integers.
inteqs.fitted = do.call(
  "bind_rows",
  lapply(eqs, fit.eqs)
)

由于发布的曲线拟合方法 A.S.K 不适用于身份不明的系统(例如 y ~ a * (c * x + d) + b),我更改了一些元素,完整的解决方案可能有帮助,所以我在这里发帖。这不会 return 指定的确切方程,而是一个可比较的方程。

library(plyr)
library(dplyr)
library(mize)

types=0:3
fit.eqs = function(e) {
  # List the types of formulas we might encounter.
  formula.types = data.frame(
    type =types,
    formula = c(
      "y ~ a * x + b",
      "y ~ a * log(1 + exp(c * x + d)) + b",
      "y ~ a * exp(c * x + d) + b",
      "y ~ a * (exp(c * x + d) / (1 + exp(c * x + d))) + b"
    ),
    outeroffset = 0,
    inneroffset = c(NA, rep(0,length(types)-1)),
    outermultiply = 1,
    innermultiply = c(NA, rep(1,length(types)-1)),
    lsfit = NA,
    stringsAsFactors = FALSE
  )
  # Get some x values over a wide range, and compute the corresponding y
  # values.
  x = c(seq(-2, 2, .1),seq(-10,10,.5),c(rnorm(10)))
  y = eval(eval(substitute(substitute(e, list(param = x)), list(e = as.quoted(e)[[1]]))))
  x <- x[abs(y) < 1e5]; 
  y <- y[abs(y) < 1e5]

  # Try to fit each formula to the data.
  for(i in 1:nrow(formula.types)) {
    start.params = list(a = 1.01, b = 0.01)
    if(!is.na(formula.types$innermultiply[i])) {
      start.params[["c"]] = 1.01
    }
    if(!is.na(formula.types$inneroffset[i])) {
      start.params[["d"]] = 0.01
    }

    ff <- function(pars){
      a=pars[1];b=pars[2];c=pars[3];d=pars[4]
      yest<- eval(parse(text=gsub('y ~','',as.character(formula.types$formula[i]),fixed=TRUE)))
      res <- (sum( ((y-yest)^2)/(abs(y)+.01)))
      if(is.na(res)) res <- 1e100
      return(res)
    }

    ffg <- function(pars){
      g=try(numDeriv::grad(ff,pars,method='simple',
        method.args=list(eps=1e-8,d=1e-10,r=2) 
      ),silent=TRUE)
      if(class(g)=='try-error') g <- rnorm(pars)
      if(any(is.na(g))) g[is.na(g)] <- rnorm(sum(is.na(g)))
      return(g)
    }

    fit = try(mize(par = unlist(start.params),
      fg = list(fn=ff,gr=ffg),
      max_iter=100,abs_tol=1e-3,rel_tol=1e-5,
      method='BFGS'))

    if(fit$f < .1 && fit$f > 1e-5) {
      message('close, ', round(fit$f,3))
      fit = try(mize(par = unlist(start.params), #if close, refine estimate
      fg = list(fn=ff,gr=ffg),
      max_iter=500,abs_tol=1e-5,rel_tol=1e-6,
      method='BFGS'))
    }

    if(class(fit)=='try-error') browser()
    formula.types$outeroffset[i] = fit$par[2] #round(coef(fit)[["b"]])
    formula.types$outermultiply[i] = fit$par[1] #round(coef(fit)[["a"]])
    if(!is.na(formula.types$innermultiply[i])) {
      formula.types$innermultiply[i] = fit$par[3] #round(coef(fit)[["c"]])
    }
    if(!is.na(formula.types$inneroffset[i])) {
      formula.types$inneroffset[i] = fit$par[4] #round(coef(fit)[["d"]])
    }
    formula.types$lsfit[i] = fit$f #AIC(fit)
  }
  # Return the values we found.
  print(formula.types)
  return(formula.types %>%
      filter(lsfit == min(lsfit)) %>%
      mutate(inneroffset = coalesce(inneroffset, 0),
        innermultiply = coalesce(innermultiply, 1)) %>%
      select(type, outeroffset, inneroffset, outermultiply, innermultiply,lsfit))
}

# Equations for testing.
eqs <- c('2*log(1+exp(0+3*param))-2',
  '2*log(1+(exp(3*param)))-2',
  '1.4 * (exp(3 * param + 8) / (1 + exp(3 * param + 8))) + .4',
  '2*param+1',
  '(2*param)+1',
  '(1)+(2*param)',
  'exp(1+param*1)*3',
  'param^3',
  'sqrt(exp((param^2)))'
  )

# Parse the equations and produce the correct integers.
inteqs.fitted = do.call(
  "bind_rows",
  lapply(eqs, fit.eqs)
)

round(inteqs.fitted,3)