From mboxrd@z Thu Jan 1 00:00:00 1970 Received: (from weis@localhost) by pauillac.inria.fr (8.7.6/8.7.3) id PAA05640 for caml-redistribution; Mon, 20 Sep 1999 15:33:17 +0200 (MET DST) Received: from nez-perce.inria.fr (nez-perce.inria.fr [192.93.2.78]) by pauillac.inria.fr (8.7.6/8.7.3) with ESMTP id WAA17555 for ; Sun, 19 Sep 1999 22:05:13 +0200 (MET DST) Received: from spog.gaertner.de (spog.gaertner.de [194.45.135.2]) by nez-perce.inria.fr (8.8.7/8.8.7) with ESMTP id WAA13749 for ; Sun, 19 Sep 1999 22:05:12 +0200 (MET DST) Received: from cumulus.is.gaertner.de (postfix@cumulus.is.gaertner.de [194.45.135.209]) by spog.gaertner.de (8.8.8/8.8.8/Nase) with ESMTP id WAA27649 for ; Sun, 19 Sep 1999 22:05:08 +0200 Received: by cumulus.is.gaertner.de (Postfix, from userid 1000) id 4F2879771; Sun, 19 Sep 1999 22:03:40 +0200 (CEST) Date: Sun, 19 Sep 1999 22:03:40 +0200 From: Christian Lindig To: Caml Mailing List Subject: Lightweight Regular Expressions Message-ID: <19990919220339.B249@cumulus> Reply-To: Christian Lindig Mail-Followup-To: Caml Mailing List Mime-Version: 1.0 Content-Type: text/plain; charset=us-ascii X-Mailer: Mutt 0.95.5i Sender: weis OCamls regular expression module Str is expressive and fast but requires custom linking. An algorithm by Mark Hopkins (once posted to comp.compilers) permits to implement regular expression matching in about 100 lines of code; the OCaml module is appended below in the hope that is useful. Someone mentioned this algorithm in comp.text.xml for checking XML DTDs and provided a Haskell source. The module below is mostly a port to OCaml. + no precompiling of regexp's - they are interpreted directly + small OCaml module, no custom linking, no C runtime + can match any sequence of symbols, not just characters - only regexp matching, no subexpression matching, no substitution - not as efficient as an automaton based approach -- Christian -- Christian Lindig Gaertner Datensysteme GbR, Braunschweig, Germany http://www.gaertner.de/~lindig lindig@gaertner.de (* $Id: rx.ml,v 1.1 1999/08/29 09:03:01 lindig Exp $ This module provides regular expression matching. Regular expressions don't need to be compiled before they can be matched against some input. Instead the automaton is build while the input is consumed. The drawback of this method is that no subexpression matching is possible to extract matching substrings. This algorithm used here is by Mark Hopkins who posted it once to comp.compiles. Search the comp.compilers archive for a detailed article and an implementation in C. 1999 Christian Lindig *) type 'a rx = | RXzero (* {} *) | RXunit (* "" *) | RXsym of 'a (* 'x' *) | RXmany of ('a rx) (* e* *) | RXsome of ('a rx) (* e+ *) | RXopt of ('a rx) (* e? *) | RXseq of ('a rx) * ('a rx) (* e1 e2 *) | RXalt of ('a rx) * ('a rx) (* e1 | e2 *) let zero = RXzero let unit = RXunit let sym x = RXsym x let many = function | RXunit -> RXunit | RXzero -> RXunit | x -> RXmany x let some = function | RXunit -> RXunit | RXzero -> RXzero | x -> RXsome x let opt = function | RXunit -> RXunit | RXzero -> RXunit | x -> RXopt x let seq x y = match (x,y) with | RXzero, x -> RXzero | RXunit, x -> x | x , RXzero-> RXzero | x , RXunit-> x | x , y -> RXseq(x,y) let alt x y = match (x,y) with | RXzero, x -> x | x , RXzero-> x | x , y -> RXalt(x,y) (* two convenience infix operators *) let ( ||| ) = alt let ( *** ) = seq (* [nullable e] is true, iff the empty sequence (RXzero) is recognized by [e]. *) let rec nullable = function | RXzero -> false | RXunit -> true | RXsym x -> false | RXmany e -> true | RXsome e -> nullable e | RXopt e -> true | RXseq(e1,e2) -> nullable e1 && nullable e2 | RXalt(e1,e2) -> nullable e1 || nullable e2 (* [residual e x] returns a regular expression e' that recognizes the language L(e') = { w | xw \in L(e)}. *) let rec residual e' x = match e' with | RXzero -> RXzero | RXunit -> RXzero | RXsym x' -> if x' = x then RXunit else RXzero | RXmany e -> seq (residual e x) (many e) | RXsome e -> seq (residual e x) (many e) | RXopt e -> residual e x | RXseq(e1,e2) -> if nullable e1 then alt (seq (residual e1 x) e2) (residual e2 x) else seq (residual e1 x) e2 | RXalt(e1,e2) -> alt (residual e1 x) (residual e2 x) (* [matches e syms] is true, iff the word [syms] is an element of L(e), i.e. [e] matches the symbols [syms]. *) let matches e syms = nullable (List.fold_left residual e syms) (* [matchstr e str] is true, iff string [str] is matched by regular expression [e] *) let matchstr e str = let len = String.length str in let rec loop e i = if i = len then nullable e else loop (residual e (String.get str i)) (i+1) in loop e 0 (* examples *) let e1 = many (sym 'a') *** some (sym 'b') let e2 = sym 'a' *** opt (sym 'b') *** some (sym 'a')