ExtLib/Camomile UTF8

Mailing list for all users of the OCaml language and system.
 help / color / mirror / Atom feed

* ExtLib/Camomile UTF8
@ 2008-05-05  9:55 Berke Durak
  2008-05-14  7:54 ` [Ocaml-lib-devel] " Richard Jones
  2010-01-26 14:31 ` Yoriyuki Yamagata
  0 siblings, 2 replies; 4+ messages in thread
From: Berke Durak @ 2008-05-05  9:55 UTC (permalink / raw)
  To: Yoriyuki Yamagata; +Cc: ocaml-lib-devel, caml-list, Richard Jones


[-- Attachment #1.1: Type: text/plain, Size: 661 bytes --]

Hello,

A few months ago I submitted a small patch to ExtLib to the UTF8 module,
adding three small but important functions

  - substring
  - output_uchar
  - eof

The patch hasn't been integrated in the ExtLib SVN (I've checked the trunk
and tags on the SVN at code.google.com).
As it's actually a patch against Camomile (at the time I didn't know
that UTF8 was taken from Camomile),
Richard Jones suggested that this is where the patch should go.

Could you provide a substring function or integrate the patch into
Camomile?  Also maybe you could host Camomile at
forge.ocamlcore.org...

Attached is the diff against ExtLib 1.5.
-- 
Berke Durak

[-- Attachment #1.2: Type: text/html, Size: 811 bytes --]

[-- Attachment #2: extlib-patch --]
[-- Type: application/octet-stream, Size: 6266 bytes --]

Index: test/test_bd_UTF8_001.ml
===================================================================
--- test/test_bd_UTF8_001.ml	(revision 0)
+++ test/test_bd_UTF8_001.ml	(revision 0)
@@ -0,0 +1,115 @@
+let substring_inputs =
+[
+  [|
+    "";
+    "⟿";
+    "⟿ቄ";
+    "⟿ቄş";
+    "⟿ቄş龟";
+    "⟿ቄş龟¯";
+  |];
+  [|
+    "";
+    "ç";
+    "çe";
+    "çek";
+    "çeko";
+    "çekos";
+    "çekosl";
+    "çekoslo";
+    "çekoslov";
+    "çekoslova";
+    "çekoslovak";
+    "çekoslovaky";
+    "çekoslovakya";
+    "çekoslovakyal";
+    "çekoslovakyala";
+    "çekoslovakyalaş";
+    "çekoslovakyalaşt";
+    "çekoslovakyalaştı";
+    "çekoslovakyalaştır";
+    "çekoslovakyalaştıra";
+    "çekoslovakyalaştıram";
+    "çekoslovakyalaştırama";
+    "çekoslovakyalaştıramad";
+    "çekoslovakyalaştıramadı";
+    "çekoslovakyalaştıramadık";
+    "çekoslovakyalaştıramadıkl";
+    "çekoslovakyalaştıramadıkla";
+    "çekoslovakyalaştıramadıklar";
+    "çekoslovakyalaştıramadıkları";
+    "çekoslovakyalaştıramadıklarım";
+    "çekoslovakyalaştıramadıklarımı";
+    "çekoslovakyalaştıramadıklarımız";
+    "çekoslovakyalaştıramadıklarımızd";
+    "çekoslovakyalaştıramadıklarımızda";
+    "çekoslovakyalaştıramadıklarımızdan";
+    "çekoslovakyalaştıramadıklarımızdanm";
+    "çekoslovakyalaştıramadıklarımızdanmı";
+    "çekoslovakyalaştıramadıklarımızdanmıs";
+    "çekoslovakyalaştıramadıklarımızdanmısı";
+    "çekoslovakyalaştıramadıklarımızdanmısın";
+    "çekoslovakyalaştıramadıklarımızdanmısını";
+    "çekoslovakyalaştıramadıklarımızdanmısınız";
+  |]
+]
+
+let test_substring () =
+  let test a =
+    let m = Array.length a - 1 in
+    let v = a.(m) in
+    assert(UTF8.length v = m);
+    for i = 0 to m do
+      assert(a.(i) = UTF8.substring v 0 i);
+    done;
+    for i = 0 to m - 1 do
+      for j = i to m - 1 do
+        let u = UTF8.substring v i (j - i + 1) in
+        UTF8.validate u
+      done
+    done
+  in
+  List.iter test substring_inputs
+
+let split_inputs = [
+  "", [];
+  "de ne me", ["de";"ne";"me"];
+  "yoğurtun tadı ılık iken pek güzel olmaz", ["yoğurtun";"tadı";"ılık";"iken";"pek";"güzel";"olmaz"]
+]
+
+let split_at f u =
+  let m = UTF8.eof u in
+  let b = UTF8.Buf.create m in
+  let rec loop0 r i =
+    if i >= m then
+      List.rev r
+    else
+      if f (UTF8.look u i) then
+        loop0 r (UTF8.next u i)
+      else
+        loop1 r i
+  and loop1 r i =
+    if i = m || f (UTF8.look u i) then
+      begin
+        let x = UTF8.Buf.contents b in
+        UTF8.Buf.clear b;
+        loop0 (x::r) (UTF8.next u i)
+      end
+    else
+      begin
+        UTF8.Buf.add_char b (UTF8.look u i);
+        loop1 r (UTF8.next u i)
+      end
+  in
+  loop0 [] 0
+
+let test_split () =
+  List.iter
+    (fun (u, ul) ->
+      let space = UChar.of_char ' ' in
+      assert(ul = split_at ((=) space) u))
+    split_inputs
+
+let test () =
+  Util.run_test ~test_name:"bd_UTF.substring" test_substring;
+  Util.run_test ~test_name:"bd_UTF.split" test_split
Index: extlib/uTF8.mli
===================================================================
--- extlib/uTF8.mli	(revision 381)
+++ extlib/uTF8.mli	(working copy)
@@ -62,10 +62,18 @@
 (** The position of the head of the last Unicode character. *)
 val last : t -> index
 
+(** The (invalid) position just past the last Unicode character.
+    For a non-empty, valid [u], [next u (last u) = eof u]. *)
+val eof : t -> index
+
 (** [look s i]
    returns the Unicode character of the location [i] in the string [s]. *)
 val look : t -> index -> uchar
 
+(** [substring s i m] returns the substring of [s] made of the [m] Unicode characters at character positions [i] to [i + m - 1] inclusive ([i] and [m] count characters, not bytes).
+   The result is always a fresh copy. *)
+val substring : t -> int -> int -> t
+
 (** [out_of_range s i]
    tests whether [i] is a position inside of [s]. *)
 val out_of_range : t -> index -> bool
@@ -113,6 +121,9 @@
    a negative integer if [s1] < [s2]. *)
 val compare : t -> t -> int
 
+(** Output the given char in UTF8 format over a binary channel *)
+val output_uchar : out_channel -> uchar -> unit
+
 (** Buffer module for UTF-8 strings *)
 module Buf : sig
   (** Buffers for UTF-8 strings. *) 
Index: extlib/uTF8.ml
===================================================================
--- extlib/uTF8.ml	(revision 381)
+++ extlib/uTF8.ml	(working copy)
@@ -76,6 +76,7 @@
   search_head s (i + 1)
 
 let next s i = 
+  if i >= String.length s then i else
   let n = Char.code s.[i] in
   if n < 0x80 then i + 1 else
   if n < 0xc0 then search_head s (i + 1) else
@@ -108,14 +109,56 @@
 
 let nth s n = nth_aux s 0 n
 
+let substring s i n =
+  let j = nth s i in
+  let j' = (nth_aux s j n) - 1 in
+  String.sub s j (j' - j + 1)
+
 let last s = search_head_backward s (String.length s - 1)
 
+let eof s = String.length s
+
 let out_of_range s i = i < 0 || i >= String.length s
 
 let compare_index _ i j = i - j
 
 let get s n = look s (nth s n)
 
+let generic_output_uchar f oc u =
+  let masq = 0b111111 in
+  let f = f oc in
+  let k = int_of_uchar u in
+  if k < 0 || k >= 0x4000000 then begin
+    f (0xfc + (k lsr 30));
+    f (0x80 lor ((k lsr 24) land masq)); 
+    f (0x80 lor ((k lsr 18) land masq));
+    f (0x80 lor ((k lsr 12) land masq));
+    f (0x80 lor ((k lsr 6) land masq));
+    f (0x80 lor (k land masq))
+  end else if k <= 0x7f then
+    f k
+  else if k <= 0x7ff then begin
+    f (0xc0 lor (k lsr 6));
+    f (0x80 lor (k land masq))
+  end else if k <= 0xffff then begin
+    f (0xe0 lor (k lsr 12));
+    f (0x80 lor ((k lsr 6) land masq));
+    f (0x80 lor (k land masq))
+  end else if k <= 0x1fffff then begin
+    f (0xf0 + (k lsr 18));
+    f (0x80 lor ((k lsr 12) land masq));
+    f (0x80 lor ((k lsr 6) land masq));
+    f (0x80 lor (k land masq))
+  end else begin
+    f (0xf8 + (k lsr 24));
+    f (0x80 lor ((k lsr 18) land masq));
+    f (0x80 lor ((k lsr 12) land masq));
+    f (0x80 lor ((k lsr 6) land masq));
+    f (0x80 lor (k land masq))
+  end 
+
+let output_uchar oc u = generic_output_uchar output_byte oc u
+
 let add_uchar buf u =
   let masq = 0b111111 in
   let k = int_of_uchar u in

^ permalink raw reply	[flat|nested] 4+ messages in thread

* Re: [Ocaml-lib-devel] ExtLib/Camomile UTF8
  2008-05-05  9:55 ExtLib/Camomile UTF8 Berke Durak
@ 2008-05-14  7:54 ` Richard Jones
  2010-01-26 14:31 ` Yoriyuki Yamagata
  1 sibling, 0 replies; 4+ messages in thread
From: Richard Jones @ 2008-05-14  7:54 UTC (permalink / raw)
  To: Berke Durak; +Cc: Yoriyuki Yamagata, ocaml-lib-devel, caml-list

On Mon, May 05, 2008 at 11:55:08AM +0200, Berke Durak wrote:
> As it's actually a patch against Camomile (at the time I didn't know
> that UTF8 was taken from Camomile),
> Richard Jones suggested that this is where the patch should go.

Yup, I think this should go into Camomile, and the UTF8 module should
be dropped from Extlib.

It's an unnecessary pain keeping two modules with the same name in
synch ... and if it is needed at all, then it is something which
should be done by the distributions.

Rich.

-- 
Richard Jones
Red Hat


^ permalink raw reply	[flat|nested] 4+ messages in thread

* Re: ExtLib/Camomile UTF8
  2008-05-05  9:55 ExtLib/Camomile UTF8 Berke Durak
  2008-05-14  7:54 ` [Ocaml-lib-devel] " Richard Jones
@ 2010-01-26 14:31 ` Yoriyuki Yamagata
  2010-01-26 16:01   ` Sylvain Le Gall
  1 sibling, 1 reply; 4+ messages in thread
From: Yoriyuki Yamagata @ 2010-01-26 14:31 UTC (permalink / raw)
  To: Berke Durak; +Cc: ocaml-lib-devel, caml-list, Richard Jones

Sorry for answering a very old message.  I was looking through
Camomile-related email and found this message.

2008/5/5 Berke Durak <berke.durak@gmail.com>:
> Hello,
>
> A few months ago I submitted a small patch to ExtLib to the UTF8 module,
> adding three small but important functions
>
>   - substring
>   - output_uchar
>   - eof

Camomile does have substring (SubText module) and output_char (this is
done by the CharEncoding module).   I'm not sure that EoF is required.
The idea behind Camomile is that channels raise the End_of_file exception
when no character is available to read.  I think today most files do
not use an EOF character to signify the end of file.  They simply
terminate.

> The patch hasn't been integrated in the ExtLib SVN (I've checked the trunk
> and tags on the SVN at code.google.com).
> As it's actually a patch against Camomile (at the time I didn't know
> that UTF8 was taken from Camomile),
> Richard Jones suggested that this is where the patch should go.

As I stated, Camomile does have the things you suggested.  It is a different
matter whether Extlib should have them.  That is a matter for the
judgement of the Extlib team.

> Also maybe you could host Camomile at
> forge.ocamlcore.org...

What is the benefit of it?
-- 
Yoriyuki　Yamagata
yoriyuki.y@gmail.com

^ permalink raw reply	[flat|nested] 4+ messages in thread

* Re: ExtLib/Camomile UTF8
  2010-01-26 14:31 ` Yoriyuki Yamagata
@ 2010-01-26 16:01   ` Sylvain Le Gall
  0 siblings, 0 replies; 4+ messages in thread
From: Sylvain Le Gall @ 2010-01-26 16:01 UTC (permalink / raw)
  To: caml-list; +Cc: ocaml-lib-devel

On 26-01-2010, Yoriyuki Yamagata <yoriyuki.y@gmail.com> wrote:
>
>> Also maybe you could host Camomile at
>> forge.ocamlcore.org...
>
> What is the benefit of it?

There are many benefits:
- better visibility among other OCaml projects 
- beginners can easily find your library
- people that don't know OCaml can easily see that there are many
  libraries for OCaml, just having a look at forge.ocamlcore.org
- integration of news from your project directly into the feed of the
  forge which goes to planet.ocamlcore.org
- people that could file a bug report have a high probability to be
  subscribed to forge.ocamlcore.org for their own projects

On the other hand, I don't like to criticize but Sourceforge is not
crystal clear with its UI (I agree that GForge/forge.ocamlcore.org is also
not very good at this). Moreover, we do have a lot of problems with
tracking new releases of Sourceforge projects in Debian: we cannot easily
access the download section.

Regards,
Sylvain Le Gall


^ permalink raw reply	[flat|nested] 4+ messages in thread

end of thread, other threads:[~2010-01-26 16:01 UTC | newest]

Thread overview: 4+ messages (download: mbox.gz / follow: Atom feed)
-- links below jump to the message on this page --
2008-05-05  9:55 ExtLib/Camomile UTF8 Berke Durak
2008-05-14  7:54 ` [Ocaml-lib-devel] " Richard Jones
2010-01-26 14:31 ` Yoriyuki Yamagata
2010-01-26 16:01   ` Sylvain Le Gall

This is a public inbox, see mirroring instructions
for how to clone and mirror all data and code used for this inbox