Mailing list for all users of the OCaml language and system.
 help / color / mirror / Atom feed
From: Christian Lindig <lindig@is.gaertner.de>
To: Caml Mailing List <caml-list@inria.fr>
Subject: Lightweight Regular Expressions
Date: Sun, 19 Sep 1999 22:03:40 +0200	[thread overview]
Message-ID: <19990919220339.B249@cumulus> (raw)


OCamls regular expression module Str is expressive and fast but
requires custom linking.  An algorithm by Mark Hopkins (once posted to
comp.compilers) permits to implement regular expression matching in
about 100 lines of code; the OCaml module is appended below in the
hope that is useful.  Someone mentioned this algorithm in
comp.text.xml for checking XML DTDs and provided a Haskell source. 
The module below is mostly a port to OCaml.

    + no precompiling of regexp's - they are interpreted directly
    + small OCaml module, no custom linking, no C runtime
    + can match any sequence of symbols, not just characters
    
    - only regexp matching, no subexpression matching, no substitution
    - not as efficient as an automaton based approach

-- Christian

-- 
 Christian Lindig       Gaertner Datensysteme GbR, Braunschweig,  Germany 
                        http://www.gaertner.de/~lindig lindig@gaertner.de


(*
    $Id: rx.ml,v 1.1 1999/08/29 09:03:01 lindig Exp $

    This module provides regular expression matching.  Regular
    expressions don't need to be compiled before they can be matched
    against some input.  Instead the automaton is build while the
    input is consumed.  The drawback of this method is that no
    subexpression matching is possible to extract matching substrings.

    This algorithm used here is by Mark Hopkins who posted it once to
    comp.compiles.  Search the comp.compilers archive for a detailed
    article and an implementation in C.
     
    1999 Christian Lindig <lindig@gaertner.de> 
*)


type 'a rx      = 
                | RXzero                                (* {}           *)
                | RXunit                                (* ""           *)
                | RXsym         of 'a                   (* 'x'          *)
                | RXmany        of ('a rx)              (* e*           *)
                | RXsome        of ('a rx)              (* e+           *)
                | RXopt         of ('a rx)              (* e?           *)
                | RXseq         of ('a rx) * ('a rx)    (* e1 e2        *)
                | RXalt         of ('a rx) * ('a rx)    (* e1 | e2      *)

let zero        = RXzero
let unit        = RXunit
let sym x       = RXsym x
let many        = function
                | RXunit        -> RXunit
                | RXzero        -> RXunit
                | x             -> RXmany x
let some        = function
                | RXunit        -> RXunit
                | RXzero        -> RXzero
                | x             -> RXsome x
let opt         = function
                | RXunit        -> RXunit
                | RXzero        -> RXunit
                | x             -> RXopt x
let seq x y     = match (x,y) with
                | RXzero, x     -> RXzero
                | RXunit, x     -> x
                | x     , RXzero-> RXzero
                | x     , RXunit-> x
                | x     , y     -> RXseq(x,y)
let alt x y     = match (x,y) with
                | RXzero, x     -> x
                | x     , RXzero-> x
                | x     , y     -> RXalt(x,y)

(* two convenience infix operators *)

let ( ||| ) = alt
let ( *** ) = seq

(* [nullable e] is true, iff the empty sequence (RXzero) is recognized 
   by [e]. *)

let rec nullable = function
    | RXzero            -> false
    | RXunit            -> true
    | RXsym x           -> false
    | RXmany e          -> true
    | RXsome e          -> nullable e
    | RXopt e           -> true
    | RXseq(e1,e2)      -> nullable e1 && nullable e2
    | RXalt(e1,e2)      -> nullable e1 || nullable e2

(* [residual e x] returns a regular expression e' that recognizes the
   language L(e') = { w | xw \in L(e)}. *)

let rec residual e' x = match e' with
    | RXzero            -> RXzero
    | RXunit            -> RXzero
    | RXsym x'          -> if   x' = x 
                           then RXunit
                           else RXzero
    | RXmany e          -> seq (residual e x) (many e)
    | RXsome e          -> seq (residual e x) (many e)
    | RXopt e           -> residual e x
    | RXseq(e1,e2)      -> if   nullable e1
                           then alt (seq (residual e1 x) e2) (residual e2 x)
                           else seq (residual e1 x) e2
    | RXalt(e1,e2)      -> alt (residual e1 x) (residual e2 x)
    
(* [matches e syms] is true, iff the word [syms] is an element of L(e), i.e. 
   [e] matches the symbols [syms]. *)

let matches e syms      = nullable (List.fold_left residual e syms)

(* [matchstr e str] is true, iff string [str] is matched by 
   regular expression [e] *)
 
let matchstr e str =
    let len = String.length str         in
    let rec loop e i =
        if   i = len 
        then nullable e
        else loop (residual e (String.get str i)) (i+1)
    in
        loop e 0

(* examples *)

let e1 = many (sym 'a') *** some (sym 'b')
let e2 = sym 'a' *** opt (sym 'b') *** some (sym 'a')




                




                        





                 reply	other threads:[~1999-09-20 13:33 UTC|newest]

Thread overview: [no followups] expand[flat|nested]  mbox.gz  Atom feed

Reply instructions:

You may reply publicly to this message via plain-text email
using any one of the following methods:

* Save the following mbox file, import it into your mail client,
  and reply-to-all from there: mbox

  Avoid top-posting and favor interleaved quoting:
  https://en.wikipedia.org/wiki/Posting_style#Interleaved_style

* Reply using the --to, --cc, and --in-reply-to
  switches of git-send-email(1):

  git send-email \
    --in-reply-to=19990919220339.B249@cumulus \
    --to=lindig@is.gaertner.de \
    --cc=caml-list@inria.fr \
    --cc=lindig@gaertner.de \
    /path/to/YOUR_REPLY

  https://kernel.org/pub/software/scm/git/docs/git-send-email.html

* If your mail client supports setting the In-Reply-To header
  via mailto: links, try the mailto: link
Be sure your reply has a Subject: header at the top and a blank line before the message body.
This is a public inbox, see mirroring instructions
for how to clone and mirror all data and code used for this inbox