From mboxrd@z Thu Jan  1 00:00:00 1970
Received: by margaux.inria.fr, Tue, 20 Oct 92 15:02:22 +0100
Received: from pomerol.inria.fr by margaux.inria.fr, Tue, 20 Oct 92 14:45:31 +0100
Received: by pomerol.inria.fr, Tue, 20 Oct 92 14:45:29 +0100
From: Xavier Leroy <xleroy@pomerol>
Message-Id: <9210201345.AA01696@pomerol.inria.fr>
Subject: A "generic" lexer in Caml Light
To: caml-list@margaux
Date: Tue, 20 Oct 92 14:45:29 MET
Sender: weis@margaux

Some users have complained about the lack of some kind of ``standard''
lexical analyzer in Caml Light, similar to the one used by the
grammars of Caml V3.1.  This would make it easier to write a
parser for a small embedded language that has roughly the same lexical
conventions as Caml Light.

The examples/lexer directory in the Caml Light distribution contains a
sample lexer from which you can do cutting and pasting. But it seems
that something more basic might be useful, so here it is.

The following is a simple ``standard'' lexer, that turns a character
stream into a token stream. It factors out the recognition of
identifiers, numbers, strings, etc. It is parameterized by a set of
keywords and identifiers.

The interface is:

------------------------------ file genlex.mli -----------------------
(* A generic lexer *)

type token =
    Kwd of string
  | Ident of string
  | Int of int
  | Float of float
  | String of string
  | Char of char
;;

value make_lexer : string list -> (char stream -> token stream);;
----------------------------------------------------------------------

The lexical classes recognized are:

      * integer and floating-point numbers - same syntax as in Caml Light
      * string literals (between double quotes)
      * character literals (between backquotes)
      * identifiers:
          - either sequences of letters/digits/underscores/quotes
          - or sequences of "operator characters" (+, -, *, etc)
      * keywords:
          - either identifiers (both kinds)
          - or single "special characters" such as ( ) [ ] { }

The first argument to make_lexer is the list of keywords. Identifiers s
that are in this list as returned as (Kwd s); those that are not in
the list are returned as (Ident s). Special characters that are in the
list are returned as (Kwd s); those that are not in the list cause a
lexical error.

And now, the standard example: a parser for lambda-terms. 

----------------------------------------------------------------------
#open "genlex";;

type lambda =
  Var of string | Abstr of string * lambda | Appl of lambda * lambda;;

let rec parse_top = function
    [< 'Kwd "\\"; 'Ident x; 'Kwd "."; parse_top l >] -> Abstr(x,l)
  | [< parse_simple l1; (parse_appl l1) l2 >] -> l2

and parse_appl l1 = function
    [< parse_simple l2; (parse_appl (Appl(l1,l2))) l3 >] -> l3
  | [< >] -> l1

and parse_simple = function
    [< 'Kwd "("; parse_top l; 'Kwd ")" >] -> l
  | [< 'Ident x >] -> Var x
;;

let lexer = make_lexer ["\\"; "."; "("; ")"];;

let parse_lambda s = parse_lambda(lexer s);;
----------------------------------------------------------------------

The ``keywords'' here are backslash and dot (for lambda abstractions), plus
the parentheses. The parse_lambda function has type char stream -> lambda.

Weaknesses: comments are not recognized. The matching of keywords is
not as efficient as it could be, because we're comparing strings
instead of e.g. constant constructors, but this is intended for toy
parsers anyway.

If you've read so far, you might be interested in the implementation
of genlex:

------------------------- file genlex.ml -------------------------
(* A generic lexer *)

(* The string buffering machinery *)

let initial_buffer = create_string 256;;

let buffer = ref initial_buffer;;
let bufpos = ref 0;;

let reset_buffer () =
  buffer := initial_buffer;
  bufpos := 0
;;

let store c =
  if !bufpos >= string_length !buffer then begin
    let newbuffer = create_string (2 * !bufpos) in
    blit_string !buffer 0 newbuffer 0 !bufpos;
    buffer := newbuffer
  end;
  set_nth_char !buffer !bufpos c;
  incr bufpos
;;

let get_string () =
  let s = sub_string !buffer 0 !bufpos in buffer := initial_buffer; s
;;

(* The lexer *)

let make_lexer keywords =

  let kwd_table = hashtbl__new 17 in
  do_list (fun s -> hashtbl__add kwd_table s (Kwd s)) keywords;

  let ident_or_keyword id =
    try hashtbl__find kwd_table id with Not_found -> Ident id
  and keyword_or_error c =
    let s = make_string 1 c in
      try hashtbl__find kwd_table s
      with Not_found -> raise Parse_error in

  let rec next_token = function
    [< '`A`..`Z`|`a`..`z` as c; s>] ->
      reset_buffer(); store c; ident s
  | [< '`!`|`%`|`&`|`$`|`#`|`+`|`/`|`:`|`<`|`=`|`>`|`?`|`@`|`\\`|
        `~`|`^`|`|`|`*` as c; s >] ->
      reset_buffer(); store c; ident2 s
  | [< '`0`..`9` as c; s>] ->
      reset_buffer(); store c; number s
  | [< '` `|`\n`|`\t`|`\r`; s >] ->
      next_token s
  | [< '`\``; char c; '`\`` >] ->
      Char c
  | [< '`"`; string str >] ->
      String str
  | [< '`-`; s >] ->
      neg_number s
  | [< 'c >] ->
      keyword_or_error c
      
  and ident = function
    [< '`A`..`Z`|`a`..`z`|`0`..`9`|`_`|`'` as c; s>] ->
      store c; ident s
  | [< >] ->
      ident_or_keyword(get_string())

  and ident2 = function
    [< '`!`|`%`|`&`|`$`|`#`|`+`|`-`|`/`|`:`|`<`|`=`|`>`|`?`|`@`|`\\`|
        `~`|`^`|`|`|`*` as c; s >] ->
      store c; ident2 s
  | [< >] ->
      ident_or_keyword(get_string())

  and neg_number = function
    [< '`0`..`9` as c; s >] ->
      reset_buffer(); store `-`; store c; number s
  | [< >] ->
      keyword_or_error `-`
    
  and number = function
    [< '`0`..`9` as c; s >] ->
      store c; number s
  | [< '`.`; s >] ->
      store `.`; decimal_part s
  | [< '`e`|`E`; s >] ->
      store `E`; exponent_part s
  | [< >] ->
      Int(int_of_string(get_string()))

  and decimal_part = function
    [< '`0`..`9` as c; s >] ->
      store c; decimal_part s
  | [< '`e`|`E`; s >] ->
      store `E`; exponent_part s
  | [< >] ->
      Float(float_of_string(get_string()))

  and exponent_part = function
    [< '`+`|`-` as c; s >] ->
      store c; end_exponent_part s
  | [< s >] ->
      end_exponent_part s

  and end_exponent_part = function
    [< '`0`..`9` as c; s >] ->
      store c; end_exponent_part s
  | [< >] ->
      Float(float_of_string(get_string()))

  and string = function
    [< '`"` >] -> get_string()
  | [< '`\\`; escape c; s >] -> store c; string s
  | [< 'c; s >] -> store c; string s

  and char = function
    [< '`\\`; escape c >] -> c
  | [< 'c >] -> c

  and escape = function
    [< '`n` >] -> `\n`
  | [< '`r` >] -> `\r`
  | [< '`t` >] -> `\t`
  | [< 'c >] -> c

  in fun input -> stream_from (fun () -> next_token input)
;;
----------------------------------------------------------------------

Enjoy,

- Xavier Leroy