Mailing list for all users of the OCaml language and system.
 help / color / mirror / Atom feed
* ExtLib/Camomile UTF8
@ 2008-05-05  9:55 Berke Durak
  2008-05-14  7:54 ` [Ocaml-lib-devel] " Richard Jones
  2010-01-26 14:31 ` Yoriyuki Yamagata
  0 siblings, 2 replies; 4+ messages in thread
From: Berke Durak @ 2008-05-05  9:55 UTC (permalink / raw)
  To: Yoriyuki Yamagata; +Cc: ocaml-lib-devel, caml-list, Richard Jones


[-- Attachment #1.1: Type: text/plain, Size: 661 bytes --]

Hello,

A few months ago I submitted a small patch to ExtLib to the UTF8 module,
adding three small but important functions

  - substring
  - output_uchar
  - eof

The patch hasn't been integrated in the ExtLib SVN (I've checked the trunk
and tags on the SVN at code.google.com).
As it's actually a patch against Camomile, (at the time I didn't know at the
time that UTF8 was taken from Camomile),
Richard Jones suggested that this is where the patch should go.

Could you provide a substring function or integrate the patch into
Camomile?  Also maybe you could host Camomile at
forge.ocamlcore.org...

Attached is the diff against ExtLib 1.5.
-- 
Berke Durak

[-- Attachment #1.2: Type: text/html, Size: 811 bytes --]

[-- Attachment #2: extlib-patch --]
[-- Type: application/octet-stream, Size: 6266 bytes --]

Index: test/test_bd_UTF8_001.ml
===================================================================
--- test/test_bd_UTF8_001.ml	(revision 0)
+++ test/test_bd_UTF8_001.ml	(revision 0)
@@ -0,0 +1,115 @@
+let substring_inputs =
+[
+  [|
+    "";
+    "⟿";
+    "⟿ቄ";
+    "⟿ቄş";
+    "⟿ቄş龟";
+    "⟿ቄş龟¯";
+  |];
+  [|
+    "";
+    "ç";
+    "çe";
+    "çek";
+    "çeko";
+    "çekos";
+    "çekosl";
+    "çekoslo";
+    "çekoslov";
+    "çekoslova";
+    "çekoslovak";
+    "çekoslovaky";
+    "çekoslovakya";
+    "çekoslovakyal";
+    "çekoslovakyala";
+    "çekoslovakyalaş";
+    "çekoslovakyalaşt";
+    "çekoslovakyalaştı";
+    "çekoslovakyalaştır";
+    "çekoslovakyalaştıra";
+    "çekoslovakyalaştıram";
+    "çekoslovakyalaştırama";
+    "çekoslovakyalaştıramad";
+    "çekoslovakyalaştıramadı";
+    "çekoslovakyalaştıramadık";
+    "çekoslovakyalaştıramadıkl";
+    "çekoslovakyalaştıramadıkla";
+    "çekoslovakyalaştıramadıklar";
+    "çekoslovakyalaştıramadıkları";
+    "çekoslovakyalaştıramadıklarım";
+    "çekoslovakyalaştıramadıklarımı";
+    "çekoslovakyalaştıramadıklarımız";
+    "çekoslovakyalaştıramadıklarımızd";
+    "çekoslovakyalaştıramadıklarımızda";
+    "çekoslovakyalaştıramadıklarımızdan";
+    "çekoslovakyalaştıramadıklarımızdanm";
+    "çekoslovakyalaştıramadıklarımızdanmı";
+    "çekoslovakyalaştıramadıklarımızdanmıs";
+    "çekoslovakyalaştıramadıklarımızdanmısı";
+    "çekoslovakyalaştıramadıklarımızdanmısın";
+    "çekoslovakyalaştıramadıklarımızdanmısını";
+    "çekoslovakyalaştıramadıklarımızdanmısınız";
+  |]
+]
+
+let test_substring () =
+  let test a =
+    let m = Array.length a - 1 in
+    let v = a.(m) in
+    assert(UTF8.length v = m);
+    for i = 0 to m do
+      assert(a.(i) = UTF8.substring v 0 i);
+    done;
+    for i = 0 to m - 1 do
+      for j = i to m - 1 do
+        let u = UTF8.substring v i (j - i + 1) in
+        UTF8.validate u
+      done
+    done
+  in
+  List.iter test substring_inputs
+
+let split_inputs = [
+  "", [];
+  "de ne me", ["de";"ne";"me"];
+  "yoğurtun tadı ılık iken pek güzel olmaz", ["yoğurtun";"tadı";"ılık";"iken";"pek";"güzel";"olmaz"]
+]
+
+let split_at f u =
+  let m = UTF8.eof u in
+  let b = UTF8.Buf.create m in
+  let rec loop0 r i =
+    if i >= m then
+      List.rev r
+    else
+      if f (UTF8.look u i) then
+        loop0 r (UTF8.next u i)
+      else
+        loop1 r i
+  and loop1 r i =
+    if i = m || f (UTF8.look u i) then
+      begin
+        let x = UTF8.Buf.contents b in
+        UTF8.Buf.clear b;
+        loop0 (x::r) (UTF8.next u i)
+      end
+    else
+      begin
+        UTF8.Buf.add_char b (UTF8.look u i);
+        loop1 r (UTF8.next u i)
+      end
+  in
+  loop0 [] 0
+
+let test_split () =
+  List.iter
+    (fun (u, ul) ->
+      let space = UChar.of_char ' ' in
+      assert(ul = split_at ((=) space) u))
+    split_inputs
+
+let test () =
+  Util.run_test ~test_name:"bd_UTF.substring" test_substring;
+  Util.run_test ~test_name:"bd_UTF.split" test_split
Index: extlib/uTF8.mli
===================================================================
--- extlib/uTF8.mli	(revision 381)
+++ extlib/uTF8.mli	(working copy)
@@ -62,10 +62,18 @@
 (** The position of the head of the last Unicode character. *)
 val last : t -> index
 
+(** The (invalid) position of the head after the last Unicode character.
+    [next (last u i) = eof i] *)
+val eof : t -> index
+
 (** [look s i]
    returns the Unicode character of the location [i] in the string [s]. *)
 val look : t -> index -> uchar
 
+(** [substring s i m] returns the substring made of the Unicode locations [i] to [i + m - 1] inclusive.
+   The string is always copied *)
+val substring : t -> int -> int -> t
+
 (** [out_of_range s i]
    tests whether [i] is a position inside of [s]. *)
 val out_of_range : t -> index -> bool
@@ -113,6 +121,9 @@
    a negative integer if [s1] < [s2]. *)
 val compare : t -> t -> int
 
+(** Output the given char in UTF8 format over a binary channel *)
+val output_uchar : out_channel -> uchar -> unit
+
 (** Buffer module for UTF-8 strings *)
 module Buf : sig
   (** Buffers for UTF-8 strings. *) 
Index: extlib/uTF8.ml
===================================================================
--- extlib/uTF8.ml	(revision 381)
+++ extlib/uTF8.ml	(working copy)
@@ -76,6 +76,7 @@
   search_head s (i + 1)
 
 let next s i = 
+  if i >= String.length s then i else
   let n = Char.code s.[i] in
   if n < 0x80 then i + 1 else
   if n < 0xc0 then search_head s (i + 1) else
@@ -108,14 +109,56 @@
 
 let nth s n = nth_aux s 0 n
 
+let substring s i n =
+  let j = nth s i in
+  let j' = (nth_aux s j n) - 1 in
+  String.sub s j (j' - j + 1)
+
 let last s = search_head_backward s (String.length s - 1)
 
+let eof s = String.length s
+
 let out_of_range s i = i < 0 || i >= String.length s
 
 let compare_index _ i j = i - j
 
 let get s n = look s (nth s n)
 
+let generic_output_uchar f oc u =
+  let masq = 0b111111 in
+  let f = f oc in
+  let k = int_of_uchar u in
+  if k < 0 || k >= 0x4000000 then begin
+    f (0xfc + (k lsr 30));
+    f (0x80 lor ((k lsr 24) land masq)); 
+    f (0x80 lor ((k lsr 18) land masq));
+    f (0x80 lor ((k lsr 12) land masq));
+    f (0x80 lor ((k lsr 6) land masq));
+    f (0x80 lor (k land masq))
+  end else if k <= 0x7f then
+    f k
+  else if k <= 0x7ff then begin
+    f (0xc0 lor (k lsr 6));
+    f (0x80 lor (k land masq))
+  end else if k <= 0xffff then begin
+    f (0xe0 lor (k lsr 12));
+    f (0x80 lor ((k lsr 6) land masq));
+    f (0x80 lor (k land masq))
+  end else if k <= 0x1fffff then begin
+    f (0xf0 + (k lsr 18));
+    f (0x80 lor ((k lsr 12) land masq));
+    f (0x80 lor ((k lsr 6) land masq));
+    f (0x80 lor (k land masq))
+  end else begin
+    f (0xf8 + (k lsr 24));
+    f (0x80 lor ((k lsr 18) land masq));
+    f (0x80 lor ((k lsr 12) land masq));
+    f (0x80 lor ((k lsr 6) land masq));
+    f (0x80 lor (k land masq))
+  end 
+
+let output_uchar oc u = generic_output_uchar output_byte oc u
+
 let add_uchar buf u =
   let masq = 0b111111 in
   let k = int_of_uchar u in

^ permalink raw reply	[flat|nested] 4+ messages in thread

end of thread, other threads:[~2010-01-26 16:01 UTC | newest]

Thread overview: 4+ messages (download: mbox.gz / follow: Atom feed)
-- links below jump to the message on this page --
2008-05-05  9:55 ExtLib/Camomile UTF8 Berke Durak
2008-05-14  7:54 ` [Ocaml-lib-devel] " Richard Jones
2010-01-26 14:31 ` Yoriyuki Yamagata
2010-01-26 16:01   ` Sylvain Le Gall

This is a public inbox, see mirroring instructions
for how to clone and mirror all data and code used for this inbox