ExtLib/Camomile UTF8 - Berke Durak

Mailing list for all users of the OCaml language and system.
 help / color / mirror / Atom feed

From: "Berke Durak" <berke.durak@gmail.com>
To: "Yoriyuki Yamagata" <yoriyuki.y@gmail.com>
Cc: ocaml-lib-devel@lists.sourceforge.net,
	caml-list <caml-list@inria.fr>,
	"Richard Jones" <rich@annexia.org>
Subject: ExtLib/Camomile UTF8
Date: Mon, 5 May 2008 11:55:08 +0200	[thread overview]
Message-ID: <b903a8570805050255q2534a6c8l4bcc4de38e050cba@mail.gmail.com> (raw)


[-- Attachment #1.1: Type: text/plain, Size: 661 bytes --]

Hello,

A few months ago I submitted a small patch to ExtLib's UTF8 module,
adding three small but important functions:

  - substring
  - output_uchar
  - eof

The patch hasn't been integrated in the ExtLib SVN (I've checked the trunk
and tags on the SVN at code.google.com).
As it's actually a patch against Camomile (at the time I didn't know
that UTF8 was taken from Camomile),
Richard Jones suggested that this is where the patch should go.

Could you provide a substring function or integrate the patch into
Camomile?  Also maybe you could host Camomile at
forge.ocamlcore.org...

Attached is the diff against ExtLib 1.5.
-- 
Berke Durak

[-- Attachment #1.2: Type: text/html, Size: 811 bytes --]

[-- Attachment #2: extlib-patch --]
[-- Type: application/octet-stream, Size: 6266 bytes --]

Index: test/test_bd_UTF8_001.ml
===================================================================
--- test/test_bd_UTF8_001.ml	(revision 0)
+++ test/test_bd_UTF8_001.ml	(revision 0)
@@ -0,0 +1,115 @@
+let substring_inputs = (* each array lists, in order, every character prefix of its last string *)
+[
+  [|
+    "";
+    "⟿";
+    "⟿ቄ";
+    "⟿ቄş";
+    "⟿ቄş龟";
+    "⟿ቄş龟¯";
+  |];
+  [|
+    "";
+    "ç";
+    "çe";
+    "çek";
+    "çeko";
+    "çekos";
+    "çekosl";
+    "çekoslo";
+    "çekoslov";
+    "çekoslova";
+    "çekoslovak";
+    "çekoslovaky";
+    "çekoslovakya";
+    "çekoslovakyal";
+    "çekoslovakyala";
+    "çekoslovakyalaş";
+    "çekoslovakyalaşt";
+    "çekoslovakyalaştı";
+    "çekoslovakyalaştır";
+    "çekoslovakyalaştıra";
+    "çekoslovakyalaştıram";
+    "çekoslovakyalaştırama";
+    "çekoslovakyalaştıramad";
+    "çekoslovakyalaştıramadı";
+    "çekoslovakyalaştıramadık";
+    "çekoslovakyalaştıramadıkl";
+    "çekoslovakyalaştıramadıkla";
+    "çekoslovakyalaştıramadıklar";
+    "çekoslovakyalaştıramadıkları";
+    "çekoslovakyalaştıramadıklarım";
+    "çekoslovakyalaştıramadıklarımı";
+    "çekoslovakyalaştıramadıklarımız";
+    "çekoslovakyalaştıramadıklarımızd";
+    "çekoslovakyalaştıramadıklarımızda";
+    "çekoslovakyalaştıramadıklarımızdan";
+    "çekoslovakyalaştıramadıklarımızdanm";
+    "çekoslovakyalaştıramadıklarımızdanmı";
+    "çekoslovakyalaştıramadıklarımızdanmıs";
+    "çekoslovakyalaştıramadıklarımızdanmısı";
+    "çekoslovakyalaştıramadıklarımızdanmısın";
+    "çekoslovakyalaştıramadıklarımızdanmısını";
+    "çekoslovakyalaştıramadıklarımızdanmısınız";
+  |]
+]
+
+let test_substring () =
+  (* [prefixes.(k)] must equal the first [k] characters of the last entry *)
+  let check prefixes =
+    let count = Array.length prefixes - 1 in
+    let whole = prefixes.(count) in
+    assert (count = UTF8.length whole);
+    Array.iteri
+      (fun k expected -> assert (UTF8.substring whole 0 k = expected))
+      prefixes;
+    for first = 0 to count - 1 do
+      for len = 1 to count - first do
+        (* run [UTF8.validate] on every non-empty in-range substring *)
+        UTF8.validate (UTF8.substring whole first len)
+      done
+    done
+  in List.iter check substring_inputs
+
+let split_inputs = [ (* (input, expected words) pairs: [test_split] splits each input on spaces *)
+  "", [];
+  "de ne me", ["de";"ne";"me"];
+  "yoğurtun tadı ılık iken pek güzel olmaz", ["yoğurtun";"tadı";"ılık";"iken";"pek";"güzel";"olmaz"]
+]
+
+let split_at f u =
+  (* Returns, in order, the maximal runs of characters of [u] on which
+     [f] is false; characters satisfying [f] act as separators. *)
+  let m = UTF8.eof u in
+  let b = UTF8.Buf.create m in
+  let rec loop0 r i = (* between runs: skip separators; [r] is reversed *)
+    if i >= m then List.rev r
+    else if f (UTF8.look u i) then loop0 r (UTF8.next u i)
+    else loop1 r i
+  and loop1 r i = (* inside a run: accumulate characters into [b] *)
+    (* [>=] rather than [=], matching [loop0]: if an index ever steps past
+       [m], the run is closed instead of calling [look] out of range. *)
+    if i >= m || f (UTF8.look u i) then
+      begin
+        let x = UTF8.Buf.contents b in
+        UTF8.Buf.clear b;
+        loop0 (x::r) (UTF8.next u i) (* [next] leaves an index >= length as is *)
+      end
+    else
+      begin
+        UTF8.Buf.add_char b (UTF8.look u i);
+        loop1 r (UTF8.next u i)
+      end
+  in
+  loop0 [] 0
+
+let test_split () =
+  (* the separator predicate is built once: equality with the space character *)
+  let is_space = (=) (UChar.of_char ' ') in
+  List.iter
+    (fun (input, expected) -> assert (expected = split_at is_space input))
+    split_inputs
+
+let test () =
+  let run name body = Util.run_test ~test_name:name body in (* same two cases, same order *)
+  run "bd_UTF.substring" test_substring; run "bd_UTF.split" test_split
Index: extlib/uTF8.mli
===================================================================
--- extlib/uTF8.mli	(revision 381)
+++ extlib/uTF8.mli	(working copy)
@@ -62,10 +62,18 @@
 (** The position of the head of the last Unicode character. *)
 val last : t -> index
 
+(** The (invalid) position of the head just after the last Unicode character:
+    [next u (last u) = eof u]. *)
+val eof : t -> index
+
 (** [look s i]
    returns the Unicode character of the location [i] in the string [s]. *)
 val look : t -> index -> uchar
 
+(** [substring s i m] returns the [m] Unicode characters of [s] at character
+   positions [i] to [i + m - 1] inclusive. The result is always a fresh copy. *)
+val substring : t -> int -> int -> t
+
 (** [out_of_range s i]
    tests whether [i] is a position inside of [s]. *)
 val out_of_range : t -> index -> bool
@@ -113,6 +121,9 @@
    a negative integer if [s1] < [s2]. *)
 val compare : t -> t -> int
 
+(** [output_uchar oc u] writes the UTF-8 encoding of [u] to the binary channel [oc]. *)
+val output_uchar : out_channel -> uchar -> unit
+
 (** Buffer module for UTF-8 strings *)
 module Buf : sig
   (** Buffers for UTF-8 strings. *) 
Index: extlib/uTF8.ml
===================================================================
--- extlib/uTF8.ml	(revision 381)
+++ extlib/uTF8.ml	(working copy)
@@ -76,6 +76,7 @@
   search_head s (i + 1)
 
 let next s i = 
+  if i >= String.length s then i else
   let n = Char.code s.[i] in
   if n < 0x80 then i + 1 else
   if n < 0xc0 then search_head s (i + 1) else
@@ -108,14 +109,56 @@
 
 let nth s n = nth_aux s 0 n
 
+let substring s i n = (* copy of the [n] characters of [s] from character [i] *)
+  let first = nth s i in (* byte index of character number [i] *)
+  let past = nth_aux s first n in (* index [nth_aux] gives for [n] from [first] *)
+  String.sub s first (past - first)
+
 let last s = search_head_backward s (String.length s - 1)
 
+let eof = String.length (* index just past the last byte of the string *)
+
 let out_of_range s i = i < 0 || i >= String.length s
 
 let compare_index _ i j = i - j
 
 let get s n = look s (nth s n)
 
+let generic_output_uchar f oc u = (* encode [u], passing each byte value to [f oc] *)
+  let emit = f oc in (* [emit b] hands one byte value to the sink *)
+  let k = int_of_uchar u in
+  let cont sh = emit (0x80 lor ((k lsr sh) land 0b111111)) in (* 10xxxxxx byte *)
+  if k < 0 || k >= 0x4000000 then begin (* six-byte form *)
+    emit (0xfc + (k lsr 30));
+    cont 24;
+    cont 18;
+    cont 12;
+    cont 6;
+    cont 0
+  end else if k <= 0x7f then (* single byte *)
+    emit k
+  else if k <= 0x7ff then begin (* two bytes *)
+    emit (0xc0 lor (k lsr 6));
+    cont 0
+  end else if k <= 0xffff then begin (* three bytes *)
+    emit (0xe0 lor (k lsr 12));
+    cont 6;
+    cont 0
+  end else if k <= 0x1fffff then begin (* four bytes *)
+    emit (0xf0 + (k lsr 18));
+    cont 12;
+    cont 6;
+    cont 0
+  end else begin (* five bytes *)
+    emit (0xf8 + (k lsr 24));
+    cont 18;
+    cont 12;
+    cont 6;
+    cont 0
+  end 
+
+let output_uchar = generic_output_uchar output_byte (* each byte goes through [output_byte oc] *)
+
 let add_uchar buf u =
   let masq = 0b111111 in
   let k = int_of_uchar u in

next             reply	other threads:[~2008-05-05  9:55 UTC|newest]

Thread overview: 4+ messages / expand[flat|nested]  mbox.gz  Atom feed  top
2008-05-05  9:55 Berke Durak [this message]
2008-05-14  7:54 ` [Ocaml-lib-devel] " Richard Jones
2010-01-26 14:31 ` Yoriyuki Yamagata
2010-01-26 16:01   ` Sylvain Le Gall

Reply instructions:

You may reply publicly to this message via plain-text email
using any one of the following methods:

* Save the following mbox file, import it into your mail client,
  and reply-to-all from there: mbox

  Avoid top-posting and favor interleaved quoting:
  https://en.wikipedia.org/wiki/Posting_style#Interleaved_style

* Reply using the --to, --cc, and --in-reply-to
  switches of git-send-email(1):

  git send-email \
    --in-reply-to=b903a8570805050255q2534a6c8l4bcc4de38e050cba@mail.gmail.com \
    --to=berke.durak@gmail.com \
    --cc=caml-list@inria.fr \
    --cc=ocaml-lib-devel@lists.sourceforge.net \
    --cc=rich@annexia.org \
    --cc=yoriyuki.y@gmail.com \
    /path/to/YOUR_REPLY

  https://kernel.org/pub/software/scm/git/docs/git-send-email.html

* If your mail client supports setting the In-Reply-To header
  via mailto: links, try the mailto: link

Be sure your reply has a Subject: header at the top and a blank line before the message body.

This is a public inbox, see mirroring instructions
for how to clone and mirror all data and code used for this inbox