将转译后的代码映射回原始标记脚本

Question

最近有人问是否有如下简单的方法来转换自定义标记，包括嵌套标记。示例包括...

对于 \k[hello]，输出将是 <b>hello</b>
对于 \i[world]，输出将是 <em>world</em>
对于 hello \k[dear \i[world]]，输出将是 hello <b>dear <em>world</em></b>
对于 \b[some text](url)，输出将是 <a href="url">some text</a>
对于 \r[some text](url)，输出将是 <img alt="some text" src="url" />

有趣的是，将上面的内容转换为 javascript，包括嵌套的考虑，非常简单，尤其是在标记语法一致的情况下。

//
// Define the syntax and translation to javascript.
//
// Each `syntax` entry maps a markup token (`markUp`) to the JavaScript text
// (`javascript`) that replaces it.  After every token has been replaced, the
// markup has become the inside of a JavaScript string expression made of
// nested grammar.oneArg(...) / grammar.twoArgs(...) calls.
//
// NOTE: the markUp tokens are written in untagged template literals, so the
// escapes are processed by JavaScript: `\k[` and `\i[` evaluate to `k[` and
// `i[`, while `\b[` and `\r[` evaluate to backspace + `[` and carriage
// return + `[`.  They therefore match markup that went through the same
// escape processing (e.g. a markup string written as a template literal).
//
// Entry order matters: `](` must be replaced before the bare `)` and `]`.
//
const grammar = {

  syntax: {
    k:      {markUp: `\k[`, javascript: `"+grammar.oneArg("k","`,  pre: `<b>`,  post: `</b>`},
    i:      {markUp: `\i[`, javascript: `"+grammar.oneArg("i","`,  pre: `<em>`, post: `</em>`},
    // `{1}` is replaced by the bracketed text, `{2}` by the parenthesised text.
    b:      {markUp: `\b[`, javascript: `"+grammar.twoArgs("b","`, pattern: `<a href="{2}">{1}</a>`},
    r:      {markUp: `\r[`, javascript: `"+grammar.twoArgs("r","`, pattern: `<img alt="{1}" src="{2}"/>`},
    close0: {markUp: `](`,   javascript: `","`},
    close1: {markUp: `)`,    javascript: `")+"`},
    close2: {markUp: `]`,    javascript: `")+"`}
  },

  /**
   * Wraps a single argument in the command's `pre` and `post` strings.
   *
   * @param {string} command - Key of a one-argument entry in `grammar.syntax`.
   * @param {string} arg1 - Already-rendered inner text.
   * @returns {string} `pre + arg1 + post`.
   */
  oneArg: function( command, arg1 ) {
    const { pre, post } = grammar.syntax[ command ];
    return pre + arg1 + post;
  },

  /**
   * Fills the command's `pattern`, substituting `{1}` with arg1 and `{2}`
   * with arg2.
   *
   * The substitution is done in a single pass with a replacer function, so
   * placeholder-looking text inside an argument is never substituted again
   * and `$` sequences in an argument are inserted literally.
   *
   * @param {string} command - Key of a two-argument entry in `grammar.syntax`.
   * @param {string} arg1 - Text taken from inside `[...]`.
   * @param {string} arg2 - Text taken from inside `(...)`.
   * @returns {string} The filled-in pattern.
   */
  twoArgs: function( command, arg1, arg2 ) {
    const slots = { 1: arg1, 2: arg2 };
    return grammar.syntax[ command ].pattern.replace( /\{([12])\}/g, ( placeholder, slot ) => slots[ slot ] );
  }
};


/**
 * Converts a markup string to a JavaScript string expression by replacing
 * every grammar token with its JavaScript counterpart, then evaluates that
 * expression and returns the resulting text.
 *
 * The tokens are replaced in the declaration order of `grammar.syntax`.
 *
 * NOTE(security): the converted markup is compiled with `new Function`, so
 * the markup must come from a trusted source.  Text containing `"`, a
 * backslash or a line break is not escaped here and will alter or break the
 * generated expression.
 *
 * @param {string} markUpString - Markup to convert.
 * @returns {string} The rendered output.
 * @throws {SyntaxError} If the converted markup is not valid JavaScript
 *   (for example because of unbalanced brackets).
 */
function transpileAndExecute( markUpString ) {
  // Convert the markUp to javascript.  A block-scoped loop variable avoids
  // leaking an implicit global (which is a ReferenceError in strict mode).
  let javascriptSource = markUpString;
  for ( const command of Object.values( grammar.syntax ) ) {
    javascriptSource = javascriptSource.split( command.markUp ).join( command.javascript );
  }

  // With the markUp now converted to javascript, let's execute it!
  // `grammar` is passed in explicitly because functions created with
  // `new Function` only see the global scope, not the enclosing one.
  return new Function( `grammar`, `return "${ javascriptSource }"` )( grammar );
}

// Sample markup exercising nested one-argument commands and both
// two-argument commands.  The escapes in this template literal are processed
// by JavaScript before the transpiler sees the text (`\k`/`\i` become `k`/`i`,
// `\b`/`\r` become backspace/carriage return), which is the same processing
// the grammar's markUp tokens go through.
var markUpTest = `Hello \k[dear \i[world!]] \b[\i[Search:] \k[Engine 1]](http://www.google.com) \r[\i[Search:] \k[Engine 2]](http://www.yahoo.com)`;

// Transpile, execute, and print the rendered result.
console.log( transpileAndExecute( markUpTest ) );

请注意，显然还有一些预处理问题必须解决，例如如何处理普通文本中恰好出现的标记字符。举例来说，如果文本内容本身包含“]”，转译器就会出错；因此可以强制执行一条规则，例如用“\]”表示字面量“]”：在转译之前先把所有出现的“\]”替换为无害的占位文本，转译之后再替换回来，即可简单地解决这个问题...

在转译方面，使用上面定义的语法，以下标记...

Hello \k[dear \i[world!]] \b[\i[Search:] \k[Engine 1]](http://www.google.com) \r[\i[Search:] \k[Engine 2]](http://www.yahoo.com)

...被转译为...

"Hello "+grammar.oneArg("k","dear "+grammar.oneArg("i","world!")+"")+" "+grammar.twoArgs("b",""+grammar.oneArg("i","Search:")+" "+grammar.oneArg("k","Engine 1")+"","http://www.google.com")+" "+grammar.twoArgs("r",""+grammar.oneArg("i","Search:")+" "+grammar.oneArg("k","Engine 2")+"","http://www.yahoo.com")+""

...一旦作为 javascript 函数执行，结果...

Hello <b>dear <em>world!</em></b> <a href="http://www.google.com"><em>Search:</em> <b>Engine 1</b></a> <img alt="<em>Search:</em> <b>Engine 2</b>" src="http://www.yahoo.com"/>

但真正的挑战是语法错误的处理，尤其是当有大量标记需要转译时。CertainPerformance 给出的非常清晰的回答（参见 Find details of SyntaxError thrown by javascript new Function() constructor）提供了一种从动态编译的 javascript 函数中捕获语法错误的行号和列号的方法，但我不太确定将转译代码的语法错误映射回原始标记的最佳方法。

例如，如果一个额外的']'不合适（在"Goodbye"之后）...

Hello World! \b[\i[Goodbye]]] \k[World!]]

...这转换为...

"Hello World! "+grammar.twoArgs("b",""+grammar.oneArg("i","Goodbye")+"")+"")+" "+grammar.oneArg("k","World!")+"")+""
                                                                           ^

...而 CertainPerformance 的 checkSyntax 函数返回 "Error thrown at: 1:76"，正如预期的那样，即上面用“^”标记的位置。

问题是，如何将其映射回原始标记，以帮助缩小标记中错误的范围？（显然，在这个例子中很容易看出标记中的错误，但如果要转译的是成页的标记，就必须有办法协助缩小语法错误的范围。）维护标记和转译代码之间的映射似乎很棘手，因为转译器在遍历语法转换表时会把标记逐步变换为 javascript 代码。我的直觉告诉我有一个更简单的方法...感谢您的关注。

Answer 1

我建议您编写一个语法检查器，有点像 jsonlint 或 jslint 等...在实际将文本编译为人类可读的文本之前，先检查所有标记是否都已正确打开和关闭。

这样既便于调试，又能防止格式错误的代码运行后造成混乱，还可以让您提供一个在编辑文本时高亮显示错误的文档编辑器。

下面是一个概念证明，它只检查括号是否正确闭合。

/**
 * Proof-of-concept linter that only checks that command brackets are balanced.
 *
 * A command opening is a backslash followed by one character and then `[`
 * (for example `\k[`).  Every `]` closes the most recent opening; the
 * characters `(` and `)` are not examined.
 *
 * @param {string} text - Markup to check; may span several lines.
 * @returns {undefined} Nothing when the brackets are balanced.
 * @throws {Error} When a `]` is found without a matching opening (the message
 *   carries the 1-based line and character position), or when openings remain
 *   unclosed at the end of the text.
 */
const grammarLint = function(text) {
  // The backslash must be escaped: a lone '\' would escape the closing quote
  // and leave the string literal unterminated.
  const isCommand = (ch) => ch === '\\';
  const isOpen = (ch) => ch === '[';
  const isClose = (ch) => ch === ']';

  let nestingCounter = 0;
  const lines = text.split('\n');
  for (let i = 0; i < lines.length; i++) {
    const line = lines[i];
    for (let c = 0; c < line.length; c++) {
      const ch = line.charAt(c);
      if (isCommand(ch) && isOpen(line.charAt(c + 2))) {
        // Skip the command letter and the '[' that were just consumed.
        c += 2;
        nestingCounter++;
        continue;
      }
      if (isClose(ch)) {
        nestingCounter--;
        if (nestingCounter < 0) {
          throw new Error('Command closed but not opened at on line ' + (i + 1) + ' char ' + (c + 1));
        }
      }
    }
  }
  if (nestingCounter > 0) {
    throw new Error(nestingCounter + ' Unclosed command brackets found');
  }
};
// Demo runs for grammarLint.  The backslashes are escaped so that the sample
// strings really contain `\b[`, `\i[` and `\k[`; written unescaped in an
// ordinary string literal, `\b` would become a backspace and `\i` / `\k` plain
// letters, leaving no backslash for the linter to recognise as a command.

// Sample with surplus closing brackets.
let text = 'Hello World! \\b[\\i[Goodbye]]] \\k[World!]]';
try {
   grammarLint(text);
}
catch(e) {
   console.error(e.message);
}

// Sample with a missing closing bracket.
text = 'Hello World! \\b[\\i[Goodbye \\k[World!]]';
try {
   grammarLint(text);
}
catch(e) {
   console.error(e.message);
}

Answer 2

我继续沿着这个思路探索：利用 javascript 编译器捕获转译代码中的语法错误，并将其对应回原始标记。简而言之，该方案在转译代码中加入注释，以便能够回溯到标记中的位置，从而提供缩小标记错误范围的方法。（一个小缺点是，错误消息实际上是转译后代码的语法错误，不一定与标记错误完全对应，但至少提供了找出标记问题所在的机会。）

该算法还利用了 CertainPerformance 技术 ( Find details of SyntaxError thrown by javascript new Function() constructor ) 的概念，即使用 setTimeout 捕获转译代码的语法错误。我穿插了一个 javascript Promise 来平滑流程。

"use strict";

//
// Define the syntax and translation to javascript.
//
class Transpiler {

  static _syntaxCheckCounter = 0;
  static _syntaxCheck = {};
  static _currentSyntaxCheck = null;

  constructor() {
    this.grammar = {

      syntax: {
        k:      {markUp: `\k[`, javascript: `"►+grammar.oneArg("k",◄"`,  pre: `<b>`,  post: `</b>`},
        i:      {markUp: `\i[`, javascript: `"►+grammar.oneArg("i",◄"`,  pre: `<em>`, post: `</em>`},
        b:      {markUp: `\b[`, javascript: `"►+grammar.twoArgs("b",◄"`, pattern: `<a href=""></a>`},
        r:      {markUp: `\r[`, javascript: `"►+grammar.twoArgs("r",◄"`, pattern: `<img alt="" src=""/>`},
        close0: {markUp: `](`,   javascript: `"►,◄"`},
        close1: {markUp: `)`,    javascript: `"►)+◄"`},
        close2: {markUp: `]`,    javascript: `"►)+◄"`}
      },

      marker: {           // https://www.w3schools.com/charsets/ref_utf_geometric.asp
        begMarker: `►`,   // 25ba
        endMarker: `◄`,   // 25c4
        begComment: `◆`,  // 25c6
        endComment: `◇`,  // 25c7
        fillerChar: `●`   // 25cf
      },

      oneArg: function( command, arg1 ) {
        return this.syntax[ command ].pre + arg1 + this.syntax[ command ].post;
      },

      twoArgs: function( command, arg1, arg2 ) {
        return this.syntax[ command ].pattern.split( `` ).join( arg1 ).split( `` ).join( arg2 );
      }
    };
  };

  static transpilerSyntaxChecker(err) {
    // Uncomment the following line to disable default console error message.
    //err.preventDefault();

    let transpiledLine = Transpiler._syntaxCheck[ Transpiler._currentSyntaxCheck ].transpiledFunction.split(`\n`)[1];

    let lo = parseInt( transpiledLine.substr( transpiledLine.substr( 0, err.colno ).lastIndexOf( `●` ) + 1 ) );
    let hi = parseInt( transpiledLine.substr( transpiledLine.substr( err.colno ).indexOf( `●` ) + err.colno + 1 ) );

    let markUpLine = Transpiler._syntaxCheck[ Transpiler._currentSyntaxCheck ].markUp;
    let errString = markUpLine.substring( lo - 40, hi + 40 ).split(`\n`).join(`↵`) + `\n`;
    errString += ( `.`.repeat( lo ) + `^`.repeat( hi - lo ) ).substring( lo - 40, hi + 40 );

    Transpiler._syntaxCheck[Transpiler._currentSyntaxCheck].rejectFunction( new Error(`'${ err.message }' in transpiled code, corresponding to character range ${ lo }:${ hi } in the markup.\n${ errString }`) );

    window.removeEventListener('error', Transpiler.transpilerSyntaxChecker);
    delete Transpiler._syntaxCheck[Transpiler._currentSyntaxCheck];
  };

  async transpileAndExecute( markUpString ) {
    // Convert the markUp to javascript.

    console.log( markUpString );

    let gm = this.grammar.marker;
    let markUpIndex = markUpString;
    let transpiled = markUpString;
    for ( let n in this.grammar.syntax ) {
      let command = this.grammar.syntax[ n ];
      let markUpIndexSplit = markUpIndex.split( command.markUp );
      let transpiledSplit = transpiled.split( command.markUp );

      if ( markUpIndexSplit.length !== transpiledSplit.length ) {
        throw `Ambiguous grammar when searching for "${ command.markUp }" to replace with "${ command.javascript }".`;
      }

      for ( let i = 0; i < markUpIndexSplit.length; i++ ) {
        if ( i === 0 ) {
          markUpIndex = markUpIndexSplit[ 0 ];
          transpiled = transpiledSplit[ 0 ];
        } else {
          let js = command.javascript.replace( gm.begMarker, gm.begComment + gm.fillerChar + markUpIndex.length + gm.endComment );
          markUpIndex += gm.fillerChar.repeat( command.markUp.length );
          js = js.replace( gm.endMarker, gm.begComment + gm.fillerChar + markUpIndex.length + gm.endComment );
          markUpIndex += markUpIndexSplit[ i ];
          transpiled += js + transpiledSplit[ i ];
        }
      }
    };

    transpiled = transpiled.split( gm.begComment ).join( `/*` );
    transpiled = transpiled.split( gm.endComment ).join( `*/` );
    transpiled = `/*${ gm.fillerChar }0*/"${ transpiled }"/*${ gm.fillerChar }${ markUpIndex.length + 1 }*/`;

    console.log( markUpIndex );
    console.log( transpiled );

    let self = this;

    var id = ++Transpiler._syntaxCheckCounter;
    Transpiler._syntaxCheck[id] = {};

    let transpiledFunction = `"use strict"; if ( run ) return\n${ transpiled.split(`\n`).join(` `) }`;
    Transpiler._syntaxCheck[id].markUp = markUpString;
    Transpiler._syntaxCheck[id].transpiledFunction = transpiledFunction;

    //
    // Here's where it gets tricky.  (See "CertainPerformance's" post at
    // 
    // for details behind the concept.)  In this implementation a Promise
    // is created, which on success of the JS compiler syntax check, is resolved
    // immediately.  Otherwise, if there is a syntax error, the transpilerSyntaxChecker
    // routine, which has access to a reference to the Promise reject function,
    // calls the reject function to resolve the promise, returning the error back
    // to the calling process. 
    // 
    let checkSyntaxPromise = new Promise((resolve, reject) => {
      setTimeout( () => {
        Transpiler._currentSyntaxCheck = id;
        window.addEventListener('error', Transpiler.transpilerSyntaxChecker);

        // Perform the syntax check by attempting to compile the transpiled function.
        new Function( `grammar`, `run`, transpiledFunction )( self.grammar );

        resolve( null );
        window.removeEventListener('error', Transpiler.transpilerSyntaxChecker);
        delete Transpiler._syntaxCheck[id];
      });
      Transpiler._syntaxCheck[id].rejectFunction = reject;
    });

    let result = await checkSyntaxPromise;

    // With the markUp now converted to javascript and syntax checked, let's execute it!
    return ( new Function( `grammar`, `run`, transpiledFunction.replace(`return\n`,`return `) )( this.grammar, true ) );

  };

}

下面是一些使用有错误的标记的示例运行，以及相应的控制台输出。以下标记多了一个 ]...

// Sample run: the markup has one more "]" than it has command openings.
// The result is logged on success; a rejection is logged by the catch handler.
let markUp = `Hello World \k[Goodbye]] World`;
new Transpiler().transpileAndExecute( markUp ).then(result => console.log( result )).catch( err => console.log( err ));

...导致转译代码...

/*●0*/"Hello World "/*●12*/+grammar.oneArg("k",/*●14*/"Goodbye"/*●21*/)+/*●22*/""/*●22*/)+/*●23*/" World"/*●30*/

注意散布的注释，它们指向原始标记中的字符位置。然后，当 javascript 编译器抛出错误时，它被 transpilerSyntaxChecker 捕获，它使用嵌入的注释来识别标记中的位置，将以下结果转储到控制台...

Uncaught SyntaxError: Unexpected token )
    at new Function (<anonymous>)
    at markUp.html:127
Error: 'Uncaught SyntaxError: Unexpected token )' in transpiled code, corresponding to character range 22:23 in the markup.
Hello World k[Goodbye]] World
......................^
    at transpilerSyntaxChecker (markUp.html:59)

请注意，Unexpected token ) 消息指的是转译后的代码，而不是标记脚本，但输出指向有问题的 ]。

这是另一个示例运行，在本例中缺少关闭 ]...

// Sample run: the second command opening is never closed.
// The result is logged on success; a rejection is logged by the catch handler.
let markUp = `\i[Hello World] \k[\i[Goodbye] World`;
new Transpiler().transpileAndExecute( markUp ).then(result => console.log( result )).catch(err => console.log( err ));

...生成以下转译代码...

/*●0*/""/*●0*/+grammar.oneArg("i",/*●2*/"Hello World"/*●13*/)+/*●14*/" "/*●15*/+grammar.oneArg("k",/*●17*/""/*●17*/+grammar.oneArg("i",/*●19*/"Goodbye"/*●26*/)+/*●27*/" World"/*●34*/

...引发以下错误...

Uncaught SyntaxError: missing ) after argument list
    at new Function (<anonymous>)
    at markUp.html:127
Error: 'Uncaught SyntaxError: missing ) after argument list' in transpiled code, corresponding to character range 27:34 in the markup.
i[Hello World] k[i[Goodbye] World
...........................^^^^^^^
    at transpilerSyntaxChecker (markUp.html:59)

也许不是最好的解决方案，但却是懒人的解决方案。 Tschallacka 的响应在对标记执行真正的语法检查方面具有优点（即，自定义语法检查器或使用类似 Jison 的东西），没有 setTimeout / Promise 复杂性，也没有使用转译器错误消息引用原始标记的有点不精确的方法...

将转译后的代码映射回原始标记脚本

Mapping transpiled code back to the original markup script

javascript

algorithm

transpiler