From: Pietro Abate <Pietro.Abate@pps.jussieu.fr>
To: caml-list@yquem.inria.fr
Subject: Re: [Caml-list] camlp4 and lexers
Date: Fri, 16 May 2008 17:24:04 +0200 [thread overview]
Message-ID: <20080516152404.GA23302@uranium.pps.jussieu.fr> (raw)
In-Reply-To: <20080515150033.GA31934@uranium.pps.jussieu.fr>
[-- Attachment #1: Type: text/plain, Size: 2607 bytes --]
Hi again.
I have a minimal (?) lexer (attached) working with the grammar below.
For the purpose of this excercise I used ulex. I started with the cduce
lexer and removed all cduce-specific functions. However I'm not enterely
happy.
First I'd like to have another example using ocamllex and not ulex (one
less dependecy), but I guess this is not too hard to do.
Second, I've copy-pasted some code in the lexer to instanciate the
camlp4 modules, but I'm not sure what is required and what is not. I
mean, I can look at the camlp4 modules sigs, but without documentation
there are a lot of functions that I don't really understand. Can anybody
explain the signature of the Loc, Token and Error modules ?
How these function used within the camlp4 parsing machinery ?
- Token.match_keyword
- Token.extract_string
- Token.Filter.mk
- Token.Filter.filter
- Token.Filter.define_filter
- Token.Filter.keyword_added
- Token.Filter.keyword_removed
Third, I'm not sure if this is the real minimal example I was looking
for. I've the impression I could reuse the Camlp4.PreCast.Loc module,
but I'm not sure if I can reuse the Camlp4.PreCast.Token since it is
linked with the token type definition. I don't think I can reuse/extend
the caml_token type... Making the lexer extensible would be a great !
Hope this helps.
comments ?
pietro
This is the _tags file to compile it:
---------- _tags -------
"parser.ml": use_camlp4, pp(camlp4of)
"ulexer.ml": pkg_ulex, use_camlp4, syntax_camlp4o
"ulexer.mli": use_camlp4, pkg_ulex
-----------
+ nicolas' universal myocamlbuil.ml
-------------------- parser.ml -----------------------
type t =
Seq of t * t
| Alt of t * t
| Opt of t
| Star of t
| Plus of t
| Dot
| Sym of char
open Ulexer
module RegExGram = Camlp4.Struct.Grammar.Static.Make(Ulexer)
let regex = RegExGram.Entry.mk "regex"
(* I guess I don't need to use KWD *)
EXTEND RegExGram
GLOBAL: regex;
regex: [[ e1 = SELF ; `KWD "|" ; e2 = concat -> Alt(e1,e2)
| e1 = concat -> e1 ]
];
concat:[[ e1 = SELF ; `KWD ";"; e2 = seq -> Seq(e1,e2)
| e1 = SELF ; e2 = seq -> Seq(e1,e2)
| e1 = seq -> e1 ]
];
seq: [[ e1 = simple ; `KWD "?" -> Opt e1
| e1 = simple ; `KWD "*" -> Star e1
| e1 = simple ; `KWD "+" -> Plus e1
| e1 = simple -> e1 ]
];
simple:[[ `KWD "." -> Dot
| `KWD "("; e1 = regex; `KWD ")" -> e1
| `CHAR(s) -> Sym s ]
];
END
let from_string s = RegExGram.parse_string regex (Loc.mk "<string>") s
------------------------------------------------------
[-- Attachment #2: ulexer.ml --]
[-- Type: text/plain, Size: 4658 bytes --]
open Camlp4.PreCast
module Loc = struct
type t = int * int
let mk _ = (0,0)
let ghost = (-1,-1)
let of_lexing_position _ = assert false
let to_ocaml_location _ = assert false
let of_ocaml_location _ = assert false
let of_lexbuf _ = assert false
let of_tuple _ = assert false
let to_tuple _ = assert false
let merge (x1, x2) (y1, y2) = (min x1 y1, max x2 y2)
let join (x1, _) = (x1, x1)
let move _ _ _ = assert false
let shift _ _ = assert false
let move_line _ _ = assert false
let file_name _ = assert false
let start_line _ = assert false
let stop_line _ = assert false
let start_bol _ = assert false
let stop_bol _ = assert false
let start_off = fst
let stop_off = snd
let start_pos _ = assert false
let stop_pos _ = assert false
let is_ghost _ = assert false
let ghostify _ = assert false
let set_file_name _ = assert false
let strictly_before _ = assert false
let make_absolute _ = assert false
let print _ = assert false
let dump _ = assert false
let to_string _ = assert false
exception Exc_located of t * exn
let raise loc exn =
match exn with
| Exc_located _ -> raise exn
| _ -> raise (Exc_located (loc, exn))
let name = ref "_loc"
end
type token =
| KWD of string
| CHAR of char
| EOI
module Token = struct
open Format
module Loc = Loc
type t = token
type token = t
let sf = Printf.sprintf
let to_string =
function
| CHAR s -> sf "CHAR %c" s
| KWD s -> sf "KWD \"%s\"" s
| EOI -> sf "EOI"
let print ppf x = pp_print_string ppf (to_string x)
let match_keyword kwd =
function
| KWD kwd' when kwd = kwd' -> true
| _ -> false
let extract_string =
function
| KWD s -> s
| CHAR c -> String.make 1 c
| tok ->
invalid_arg ("Cannot extract a string from this token: "^
to_string tok)
module Error = struct
type t = string
exception E of string
let print = pp_print_string
let to_string x = x
end
module Filter = struct
type token_filter = (t, Loc.t) Camlp4.Sig.stream_filter
type t =
{ is_kwd : string -> bool;
mutable filter : token_filter }
let mk is_kwd =
{ is_kwd = is_kwd;
filter = (fun s -> s) }
let filter x =
let f tok loc =
let tok' = tok in
(tok', loc)
in
let rec filter =
parser
| [< '(tok, loc); s >] -> [< ' f tok loc; filter s >]
| [< >] -> [< >]
in
fun strm -> x.filter (filter strm)
let define_filter x f = x.filter <- f x.filter
let keyword_added _ _ _ = ()
let keyword_removed _ _ = ()
end
end
module Error = Camlp4.Struct.EmptyError
module L = Ulexing
exception Error of int * int * string
let error i j s = raise (Error (i,j,s))
(***********************************************************)
(* Buffer for string literals *)
let string_buff = Buffer.create 1024
let store_lexeme lexbuf =
Buffer.add_string string_buff (Ulexing.utf8_lexeme lexbuf)
let store_ascii = Buffer.add_char string_buff
let store_code = Utf8.store string_buff
let clear_buff () = Buffer.clear string_buff
let get_stored_string () =
let s = Buffer.contents string_buff in
clear_buff ();
Buffer.clear string_buff;
s
(***********************************************************)
(* Lexer *)
let illegal lexbuf =
error
(L.lexeme_start lexbuf)
(L.lexeme_end lexbuf)
"Illegal character"
let return lexbuf tok = (tok, L.loc lexbuf)
let return_loc i j tok = (tok, (i,j))
let rec token = lexer
| [' ' '\t'] -> token lexbuf
| ['*' '?' '+' '(' ')' ';' '|' ] ->
let k = KWD (L.latin1_lexeme lexbuf) in return lexbuf k
| ['A'-'Z' 'a'-'z'] ->
let c = CHAR (L.latin1_lexeme_char lexbuf 0) in return lexbuf c
| eof -> return lexbuf EOI
| _ -> illegal lexbuf
(***********************************************************)
let enc = ref L.Latin1
let lexbuf = ref None
let last_tok = ref (KWD "DUMMY")
let raise_clean e =
clear_buff ();
raise e
let mk () _loc cs =
let lb = L.from_var_enc_stream enc cs in
lexbuf := Some lb;
let next _ =
let tok, loc =
try token lb
with
| Ulexing.Error ->
raise_clean (Error (Ulexing.lexeme_end lb, Ulexing.lexeme_end lb,
"Unexpected character"))
| Ulexing.InvalidCodepoint i ->
raise_clean (Error (Ulexing.lexeme_end lb, Ulexing.lexeme_end lb,
"Code point invalid for the current encoding"))
| e -> raise_clean e
in
last_tok := tok;
Some (tok, loc)
in
Stream.from next
[-- Attachment #3: ulexer.mli --]
[-- Type: text/plain, Size: 312 bytes --]
open Camlp4.Sig
type token =
| KWD of string
| CHAR of char
| EOI
exception Error of int * int * string
module Loc : Loc with type t = int * int
module Token : Token with module Loc = Loc and type t = token
module Error : Error
val mk : unit -> (Loc.t -> char Stream.t -> (Token.t * Loc.t) Stream.t)
[-- Attachment #4: myocamlbuild.ml --]
[-- Type: text/plain, Size: 2045 bytes --]
open Ocamlbuild_plugin
open Command (* no longer needed for OCaml >= 3.10.2 *)
(* these functions are not really officially exported *)
let run_and_read = Ocamlbuild_pack.My_unix.run_and_read
let blank_sep_strings = Ocamlbuild_pack.Lexers.blank_sep_strings
(* this lists all supported packages *)
let find_packages () =
blank_sep_strings &
Lexing.from_string &
run_and_read "ocamlfind list | cut -d' ' -f1"
(* this is supposed to list available syntaxes, but I don't know how to do it. *)
let find_syntaxes () = ["camlp4o"; "camlp4r"]
(* ocamlfind command *)
let ocamlfind x = S[A"ocamlfind"; x]
let _ = dispatch begin function
| Before_options ->
(* override default commands by ocamlfind ones *)
Options.ocamlc := ocamlfind & A"ocamlc";
Options.ocamlopt := ocamlfind & A"ocamlopt";
Options.ocamldep := ocamlfind & A"ocamldep";
Options.ocamldoc := ocamlfind & A"ocamldoc"
| After_rules ->
(* When one link an OCaml library/binary/package, one should use -linkpkg *)
flag ["ocaml"; "link"] & A"-linkpkg";
(* For each ocamlfind package one inject the -package option when
* compiling, computing dependencies, generating documentation and
* linking. *)
List.iter begin fun pkg ->
flag ["ocaml"; "compile"; "pkg_"^pkg] & S[A"-package"; A pkg];
flag ["ocaml"; "ocamldep"; "pkg_"^pkg] & S[A"-package"; A pkg];
flag ["ocaml"; "doc"; "pkg_"^pkg] & S[A"-package"; A pkg];
flag ["ocaml"; "link"; "pkg_"^pkg] & S[A"-package"; A pkg];
end (find_packages ());
(* Like -package but for extensions syntax. Morover -syntax is useless
* when linking. *)
List.iter begin fun syntax ->
flag ["ocaml"; "compile"; "syntax_"^syntax] & S[A"-syntax"; A syntax];
flag ["ocaml"; "ocamldep"; "syntax_"^syntax] & S[A"-syntax"; A syntax];
flag ["ocaml"; "doc"; "syntax_"^syntax] & S[A"-syntax"; A syntax];
end (find_syntaxes ());
| _ -> ()
end
prev parent reply other threads:[~2008-05-16 15:25 UTC|newest]
Thread overview: 2+ messages / expand[flat|nested] mbox.gz Atom feed top
2008-05-15 15:00 Pietro Abate
2008-05-16 15:24 ` Pietro Abate [this message]
Reply instructions:
You may reply publicly to this message via plain-text email
using any one of the following methods:
* Save the following mbox file, import it into your mail client,
and reply-to-all from there: mbox
Avoid top-posting and favor interleaved quoting:
https://en.wikipedia.org/wiki/Posting_style#Interleaved_style
* Reply using the --to, --cc, and --in-reply-to
switches of git-send-email(1):
git send-email \
--in-reply-to=20080516152404.GA23302@uranium.pps.jussieu.fr \
--to=pietro.abate@pps.jussieu.fr \
--cc=caml-list@yquem.inria.fr \
/path/to/YOUR_REPLY
https://kernel.org/pub/software/scm/git/docs/git-send-email.html
* If your mail client supports setting the In-Reply-To header
via mailto: links, try the mailto: link
Be sure your reply has a Subject: header at the top and a blank line
before the message body.
This is a public inbox, see mirroring instructions
for how to clone and mirror all data and code used for this inbox