From mboxrd@z Thu Jan 1 00:00:00 1970 Received: by margaux.inria.fr, Tue, 20 Oct 92 15:02:22 +0100 Received: from pomerol.inria.fr by margaux.inria.fr, Tue, 20 Oct 92 14:45:31 +0100 Received: by pomerol.inria.fr, Tue, 20 Oct 92 14:45:29 +0100 From: Xavier Leroy Message-Id: <9210201345.AA01696@pomerol.inria.fr> Subject: A "generic" lexer in Caml Light To: caml-list@margaux Date: Tue, 20 Oct 92 14:45:29 MET Sender: weis@margaux Some users have complained about the lack of some kind of ``standard'' lexical analyzer in Caml Light, similar to the one used by the grammars of Caml V3.1. This would make it easier to write a parser for a small embedded language that has roughly the same lexical conventions as Caml Light. The examples/lexer directory in the Caml Light distribution contains a sample lexer from which you can do cutting and pasting. But it seems that something more basic might be useful, so here it is. The following is a simple ``standard'' lexer, that turns a character stream into a token stream. It factors out the recognition of identifiers, numbers, strings, etc. It is parameterized by a set of keywords and identifiers. The interface is: ------------------------------ file genlex.mli ----------------------- (* A generic lexer *) type token = Kwd of string | Ident of string | Int of int | Float of float | String of string | Char of char ;; value make_lexer : string list -> (char stream -> token stream);; ---------------------------------------------------------------------- The lexical classes recognized are: * integer and floating-point numbers - same syntax as in Caml Light * string literals (between double quotes) * character literals (between backquotes) * identifiers: - either sequences of letters/digits/underscores/quotes - or sequences of "operator characters" (+, -, *, etc) * keywords: - either identifiers (both kinds) - or single "special characters" such as ( ) [ ] { } The first argument to make_lexer is the list of keywords. Identifiers s that are in this list as returned as (Kwd s); those that are not in the list are returned as (Ident s). Special characters that are in the list are returned as (Kwd s); those that are not in the list cause a lexical error. And now, the standard example: a parser for lambda-terms. ---------------------------------------------------------------------- #open "genlex";; type lambda = Var of string | Abstr of string * lambda | Appl of lambda * lambda;; let rec parse_top = function [< 'Kwd "\\"; 'Ident x; 'Kwd "."; parse_top l >] -> Abstr(x,l) | [< parse_simple l1; (parse_appl l1) l2 >] -> l2 and parse_appl l1 = function [< parse_simple l2; (parse_appl (Appl(l1,l2))) l3 >] -> l3 | [< >] -> l1 and parse_simple = function [< 'Kwd "("; parse_top l; 'Kwd ")" >] -> l | [< 'Ident x >] -> Var x ;; let lexer = make_lexer ["\\"; "."; "("; ")"];; let parse_lambda s = parse_lambda(lexer s);; ---------------------------------------------------------------------- The ``keywords'' here are backslash and dot (for lambda abstractions), plus the parentheses. The parse_lambda function has type char stream -> lambda. Weaknesses: comments are not recognized. The matching of keywords is not as efficient as it could be, because we're comparing strings instead of e.g. constant constructors, but this is intended for toy parsers anyway. If you've read so far, you might be interested in the implementation of genlex: ------------------------- file genlex.ml ------------------------- (* A generic lexer *) (* The string buffering machinery *) let initial_buffer = create_string 256;; let buffer = ref initial_buffer;; let bufpos = ref 0;; let reset_buffer () = buffer := initial_buffer; bufpos := 0 ;; let store c = if !bufpos >= string_length !buffer then begin let newbuffer = create_string (2 * !bufpos) in blit_string !buffer 0 newbuffer 0 !bufpos; buffer := newbuffer end; set_nth_char !buffer !bufpos c; incr bufpos ;; let get_string () = let s = sub_string !buffer 0 !bufpos in buffer := initial_buffer; s ;; (* The lexer *) let make_lexer keywords = let kwd_table = hashtbl__new 17 in do_list (fun s -> hashtbl__add kwd_table s (Kwd s)) keywords; let ident_or_keyword id = try hashtbl__find kwd_table id with Not_found -> Ident id and keyword_or_error c = let s = make_string 1 c in try hashtbl__find kwd_table s with Not_found -> raise Parse_error in let rec next_token = function [< '`A`..`Z`|`a`..`z` as c; s>] -> reset_buffer(); store c; ident s | [< '`!`|`%`|`&`|`$`|`#`|`+`|`/`|`:`|`<`|`=`|`>`|`?`|`@`|`\\`| `~`|`^`|`|`|`*` as c; s >] -> reset_buffer(); store c; ident2 s | [< '`0`..`9` as c; s>] -> reset_buffer(); store c; number s | [< '` `|`\n`|`\t`|`\r`; s >] -> next_token s | [< '`\``; char c; '`\`` >] -> Char c | [< '`"`; string str >] -> String str | [< '`-`; s >] -> neg_number s | [< 'c >] -> keyword_or_error c and ident = function [< '`A`..`Z`|`a`..`z`|`0`..`9`|`_`|`'` as c; s>] -> store c; ident s | [< >] -> ident_or_keyword(get_string()) and ident2 = function [< '`!`|`%`|`&`|`$`|`#`|`+`|`-`|`/`|`:`|`<`|`=`|`>`|`?`|`@`|`\\`| `~`|`^`|`|`|`*` as c; s >] -> store c; ident2 s | [< >] -> ident_or_keyword(get_string()) and neg_number = function [< '`0`..`9` as c; s >] -> reset_buffer(); store `-`; store c; number s | [< >] -> keyword_or_error `-` and number = function [< '`0`..`9` as c; s >] -> store c; number s | [< '`.`; s >] -> store `.`; decimal_part s | [< '`e`|`E`; s >] -> store `E`; exponent_part s | [< >] -> Int(int_of_string(get_string())) and decimal_part = function [< '`0`..`9` as c; s >] -> store c; decimal_part s | [< '`e`|`E`; s >] -> store `E`; exponent_part s | [< >] -> Float(float_of_string(get_string())) and exponent_part = function [< '`+`|`-` as c; s >] -> store c; end_exponent_part s | [< s >] -> end_exponent_part s and end_exponent_part = function [< '`0`..`9` as c; s >] -> store c; end_exponent_part s | [< >] -> Float(float_of_string(get_string())) and string = function [< '`"` >] -> get_string() | [< '`\\`; escape c; s >] -> store c; string s | [< 'c; s >] -> store c; string s and char = function [< '`\\`; escape c >] -> c | [< 'c >] -> c and escape = function [< '`n` >] -> `\n` | [< '`r` >] -> `\r` | [< '`t` >] -> `\t` | [< 'c >] -> c in fun input -> stream_from (fun () -> next_token input) ;; ---------------------------------------------------------------------- Enjoy, - Xavier Leroy