使用 ocamllex 进行 Lexing 和 include 指令
Lexing and include directive with ocamllex
我正在为必须支持#include 指令(仅在文件开头)的类 C 语言制作编译器
一种简单但不优雅的方法是创建一个子程序,该子程序查找每次出现的指令,并用新临时文件中的相应文件替换。
现在这一点都不好。所以我尝试了以下方法:
lexer = parse
| "#include \"" ( [^'"' '\n']* as filename) '"'
{ lexer (Lexing.from_channel (open_in filename)) ; lexer lexbuf }
想法如下:每当您找到一个包含,使用给定的文件名打开一个新频道,并在该频道上递归调用 "lexer" 规则。之后,继续你的 lexing-buffer 的当前状态并继续 lexing。
问题是,它从来没有奏效。
我还看到当缓冲区 lexbuf 达到 eof 时可以重新填充。但我没能找到更多信息。这让我想到将上面的代码更改为以下代码:
lexer = parse
| "#include \"" ( [^'"' '\n']* as filename) '"'
{ addCurrentLexBufToAStack lexbuf ;lexer (Lexing.from_channel (open_in filename)); }
而在笔芯中,您将从堆栈的头部继续
但似乎对工作很有野心。
有什么想法吗?
P.s。从另一个模块调用词法分析器(以及解析器)(我们称之为 Main.ml)
嗯,您对词法分析和语法分析有点困惑吗?
我看到的是:
If my lexeme is #include ident I want to parse what's in the
file pointed by ident and add it.
然后你混淆了解析和词法分析
你可以这样写:(这是一个小程序,但它可以工作 ;-))
ast.mli
type operation =
| Plus of operation * operation
| Minus of operation * operation
| Int of int
type prog = string list * operation list
lexer.mll
{
open Parser
open Lexing
open Ast
let current_pos b =
lexeme_start_p b,
lexeme_end_p b
}
let newline = '\n'
let space = [' ' '\t' '\r']
let digit = ['0' - '9']
let integer = digit+
rule token = parse
| newline { token lexbuf}
| space+ { token lexbuf}
| "#include \"" ( [^'"' '\n']* as filename) '"' { INCLUDE filename }
| integer as i { INTEGER (int_of_string i) }
| "+" { PLUSI }
| "-" { MINUSI }
| ";" { SC }
| "main" { MAIN }
| eof
{ EOF }
parser.mly
%{
open Ast
%}
%token <string> INCLUDE
%token EOF SC
%token PLUSI
%token MINUSI
%token MAIN
%token <int> INTEGER
%left PLUSI MINUSI
%start <Ast.prog> prog
%%
prog:
include_list MAIN operations EOF { (, ) }
include_list:
| { [] }
| INCLUDE include_list { :: }
operation:
| operation PLUSI operation { Plus (, ) }
| operation MINUSI operation { Minus (, ) }
| INTEGER { Int }
operations:
| operation { [] }
| operation SC operations { :: }
因此,如您所见,当我解析时,我会记住我必须解析的文件名并且
main.ml
open Lexing
open Ast
let rec print_op fmt op =
match op with
| Plus (op1, op2) ->
Format.fprintf fmt "(%a + %a)"
print_op op1 print_op op2
| Minus (op1, op2) ->
Format.fprintf fmt "(%a - %a)"
print_op op1 print_op op2
| Int i -> Format.fprintf fmt "%d" i
let rec read_includes fl =
List.fold_left (fun acc f ->
let c = open_in f in
let lb = Lexing.from_channel c in
let fl, p = Parser.prog Lexer.token lb in
close_in c;
let acc' = read_includes fl in
acc' @ p
) [] fl
let () =
try
let p = read_includes [Sys.argv.(1)] in
List.iter (Format.eprintf "%a@." print_op) p
with _ -> Format.eprintf "Bad Boy !@."
这意味着当我解析完第一个文件时,我解析了包含的文件。
最重要的是你对词法分析的困惑(这是编译器中最愚蠢的事情,你只要问 "What is the next token that you see ?" 他就回答 "I see #include "filename"
" 而解析器不是那么愚蠢并说“嘿,词法分析器看到 #include "filename"
所以我会记住这个文件名,因为我可能需要它并且我会继续。
如果我有这三个文件:
文件1
#include "file2"
main
6; 7
文件2
#include "file3"
main
4; 5
文件3
main
1; 2; 3
如果我调用 ./compile file1
,我会得到我想要的输出 1 2 3 4 5 6
。 ;-)
[编辑]
使用词法分析器处理包含的新版本:
ast.mli
type operation =
| Plus of operation * operation
| Minus of operation * operation
| Int of int
type prog = operation list
lexer.mll
{
open Parser
let fset = Hashtbl.create 17
(* set keeping all the filenames *)
}
let newline = '\n'
let space = [' ' '\t' '\r']
let digit = ['0' - '9']
let integer = digit+
rule token = parse
| newline { token lexbuf}
| space+ { token lexbuf}
| "#include \"" ( [^'"' '\n']* as filename) '"'
{ if Hashtbl.mem fset filename then
raise Exit
else
let c = open_in filename in
Hashtbl.add fset filename ();
let lb = Lexing.from_channel c in
let p = Parser.prog token lb in
INCLUDE p
}
| integer as i { INTEGER (int_of_string i) }
| "+" { PLUSI }
| "-" { MINUSI }
| ";" { SC }
| "main" { MAIN }
| eof
{ EOF }
parser.mly
%{
open Ast
%}
%token <Ast.prog> INCLUDE
%token EOF SC
%token PLUSI
%token MINUSI
%token MAIN
%token <int> INTEGER
%left PLUSI MINUSI
%start <Ast.prog> prog
%%
prog:
include_list MAIN operations EOF { List.rev_append (List.rev ) }
include_list:
| { [] }
| INCLUDE include_list { List.rev_append (List.rev ) }
operation:
| operation PLUSI operation { Plus (, ) }
| operation MINUSI operation { Minus (, ) }
| INTEGER { Int }
operations:
| operation { [] }
| operation SC operations { :: }
main.ml
open Lexing
open Ast
let rec print_op fmt op =
match op with
| Plus (op1, op2) ->
Format.fprintf fmt "(%a + %a)"
print_op op1 print_op op2
| Minus (op1, op2) ->
Format.fprintf fmt "(%a - %a)"
print_op op1 print_op op2
| Int i -> Format.fprintf fmt "%d" i
let () =
try
let c = open_in Sys.argv.(1) in
let lb = Lexing.from_channel c in
let p = Parser.prog Lexer.token lb in
close_in c;
List.iter (Format.eprintf "%a@." print_op) p
with _ -> Format.eprintf "Bad Boy !@."
所以,在词法分析器中,当我看到 #include filename
时,我立即调用由 filename
和 returns 链接的文件上的解析器,Ast.prog
解析为之前的解析调用。
我希望你已经清楚了 ;-)
[第二次编辑]
我不能让这段代码变成这样,我对其进行了编辑以避免包含循环(在 lexer.mll 中);-)
我正在为必须支持#include 指令(仅在文件开头)的类 C 语言制作编译器
一种简单但不优雅的方法是创建一个子程序,该子程序查找每次出现的指令,并用新临时文件中的相应文件替换。
现在这一点都不好。所以我尝试了以下方法:
lexer = parse
| "#include \"" ( [^'"' '\n']* as filename) '"'
{ lexer (Lexing.from_channel (open_in filename)) ; lexer lexbuf }
想法如下:每当您找到一个包含,使用给定的文件名打开一个新频道,并在该频道上递归调用 "lexer" 规则。之后,继续你的 lexing-buffer 的当前状态并继续 lexing。
问题是,它从来没有奏效。
我还看到当缓冲区 lexbuf 达到 eof 时可以重新填充。但我没能找到更多信息。这让我想到将上面的代码更改为以下代码:
lexer = parse
| "#include \"" ( [^'"' '\n']* as filename) '"'
{ addCurrentLexBufToAStack lexbuf ;lexer (Lexing.from_channel (open_in filename)); }
而在笔芯中,您将从堆栈的头部继续
但似乎对工作很有野心。
有什么想法吗?
P.s。从另一个模块调用词法分析器(以及解析器)(我们称之为 Main.ml)
嗯,您对词法分析和语法分析有点困惑吗?
我看到的是:
If my lexeme is #include ident I want to parse what's in the file pointed by ident and add it.
然后你混淆了解析和词法分析
你可以这样写:(这是一个小程序,但它可以工作 ;-))
ast.mli
type operation =
| Plus of operation * operation
| Minus of operation * operation
| Int of int
type prog = string list * operation list
lexer.mll
{
open Parser
open Lexing
open Ast
let current_pos b =
lexeme_start_p b,
lexeme_end_p b
}
let newline = '\n'
let space = [' ' '\t' '\r']
let digit = ['0' - '9']
let integer = digit+
rule token = parse
| newline { token lexbuf}
| space+ { token lexbuf}
| "#include \"" ( [^'"' '\n']* as filename) '"' { INCLUDE filename }
| integer as i { INTEGER (int_of_string i) }
| "+" { PLUSI }
| "-" { MINUSI }
| ";" { SC }
| "main" { MAIN }
| eof
{ EOF }
parser.mly
%{
open Ast
%}
%token <string> INCLUDE
%token EOF SC
%token PLUSI
%token MINUSI
%token MAIN
%token <int> INTEGER
%left PLUSI MINUSI
%start <Ast.prog> prog
%%
prog:
include_list MAIN operations EOF { (, ) }
include_list:
| { [] }
| INCLUDE include_list { :: }
operation:
| operation PLUSI operation { Plus (, ) }
| operation MINUSI operation { Minus (, ) }
| INTEGER { Int }
operations:
| operation { [] }
| operation SC operations { :: }
因此,如您所见,当我解析时,我会记住我必须解析的文件名并且
main.ml
open Lexing
open Ast
let rec print_op fmt op =
match op with
| Plus (op1, op2) ->
Format.fprintf fmt "(%a + %a)"
print_op op1 print_op op2
| Minus (op1, op2) ->
Format.fprintf fmt "(%a - %a)"
print_op op1 print_op op2
| Int i -> Format.fprintf fmt "%d" i
let rec read_includes fl =
List.fold_left (fun acc f ->
let c = open_in f in
let lb = Lexing.from_channel c in
let fl, p = Parser.prog Lexer.token lb in
close_in c;
let acc' = read_includes fl in
acc' @ p
) [] fl
let () =
try
let p = read_includes [Sys.argv.(1)] in
List.iter (Format.eprintf "%a@." print_op) p
with _ -> Format.eprintf "Bad Boy !@."
这意味着当我解析完第一个文件时,我解析了包含的文件。
最重要的是你对词法分析的困惑(这是编译器中最愚蠢的事情,你只要问 "What is the next token that you see ?" 他就回答 "I see #include "filename"
" 而解析器不是那么愚蠢并说“嘿,词法分析器看到 #include "filename"
所以我会记住这个文件名,因为我可能需要它并且我会继续。
如果我有这三个文件:
文件1
#include "file2"
main
6; 7
文件2
#include "file3"
main
4; 5
文件3
main
1; 2; 3
如果我调用 ./compile file1
,我会得到我想要的输出 1 2 3 4 5 6
。 ;-)
[编辑]
使用词法分析器处理包含的新版本:
ast.mli
type operation =
| Plus of operation * operation
| Minus of operation * operation
| Int of int
type prog = operation list
lexer.mll
{
open Parser
let fset = Hashtbl.create 17
(* set keeping all the filenames *)
}
let newline = '\n'
let space = [' ' '\t' '\r']
let digit = ['0' - '9']
let integer = digit+
rule token = parse
| newline { token lexbuf}
| space+ { token lexbuf}
| "#include \"" ( [^'"' '\n']* as filename) '"'
{ if Hashtbl.mem fset filename then
raise Exit
else
let c = open_in filename in
Hashtbl.add fset filename ();
let lb = Lexing.from_channel c in
let p = Parser.prog token lb in
INCLUDE p
}
| integer as i { INTEGER (int_of_string i) }
| "+" { PLUSI }
| "-" { MINUSI }
| ";" { SC }
| "main" { MAIN }
| eof
{ EOF }
parser.mly
%{
open Ast
%}
%token <Ast.prog> INCLUDE
%token EOF SC
%token PLUSI
%token MINUSI
%token MAIN
%token <int> INTEGER
%left PLUSI MINUSI
%start <Ast.prog> prog
%%
prog:
include_list MAIN operations EOF { List.rev_append (List.rev ) }
include_list:
| { [] }
| INCLUDE include_list { List.rev_append (List.rev ) }
operation:
| operation PLUSI operation { Plus (, ) }
| operation MINUSI operation { Minus (, ) }
| INTEGER { Int }
operations:
| operation { [] }
| operation SC operations { :: }
main.ml
open Lexing
open Ast
let rec print_op fmt op =
match op with
| Plus (op1, op2) ->
Format.fprintf fmt "(%a + %a)"
print_op op1 print_op op2
| Minus (op1, op2) ->
Format.fprintf fmt "(%a - %a)"
print_op op1 print_op op2
| Int i -> Format.fprintf fmt "%d" i
let () =
try
let c = open_in Sys.argv.(1) in
let lb = Lexing.from_channel c in
let p = Parser.prog Lexer.token lb in
close_in c;
List.iter (Format.eprintf "%a@." print_op) p
with _ -> Format.eprintf "Bad Boy !@."
所以,在词法分析器中,当我看到 #include filename
时,我立即调用由 filename
和 returns 链接的文件上的解析器,Ast.prog
解析为之前的解析调用。
我希望你已经清楚了 ;-)
[第二次编辑]
我不能让这段代码变成这样,我对其进行了编辑以避免包含循环(在 lexer.mll 中);-)