From: "Berke Durak" <berke.durak@gmail.com>
To: "Yoriyuki Yamagata" <yoriyuki.y@gmail.com>
Cc: ocaml-lib-devel@lists.sourceforge.net,
caml-list <caml-list@inria.fr>,
"Richard Jones" <rich@annexia.org>
Subject: ExtLib/Camomile UTF8
Date: Mon, 5 May 2008 11:55:08 +0200 [thread overview]
Message-ID: <b903a8570805050255q2534a6c8l4bcc4de38e050cba@mail.gmail.com> (raw)
[-- Attachment #1.1: Type: text/plain, Size: 661 bytes --]
Hello,
A few months ago I submitted a small patch to ExtLib's UTF8 module,
adding three small but important functions:
- substring
- output_uchar
- eof
The patch hasn't been integrated in the ExtLib SVN (I've checked the trunk
and tags on the SVN at code.google.com).
As it's actually a patch against Camomile (at the time I didn't know
that UTF8 was taken from Camomile),
Richard Jones suggested that this is where the patch should go.
Could you provide a substring function or integrate the patch into
Camomile? Also maybe you could host Camomile at
forge.ocamlcore.org...
Attached is the diff against ExtLib 1.5.
--
Berke Durak
[-- Attachment #1.2: Type: text/html, Size: 811 bytes --]
[-- Attachment #2: extlib-patch --]
[-- Type: application/octet-stream, Size: 6266 bytes --]
Index: test/test_bd_UTF8_001.ml
===================================================================
--- test/test_bd_UTF8_001.ml (revision 0)
+++ test/test_bd_UTF8_001.ml (revision 0)
@@ -0,0 +1,115 @@
+let substring_inputs =
+[
+ [|
+ "";
+ "⟿";
+ "⟿ቄ";
+ "⟿ቄş";
+ "⟿ቄş龟";
+ "⟿ቄş龟¯";
+ |];
+ [|
+ "";
+ "ç";
+ "çe";
+ "çek";
+ "çeko";
+ "çekos";
+ "çekosl";
+ "çekoslo";
+ "çekoslov";
+ "çekoslova";
+ "çekoslovak";
+ "çekoslovaky";
+ "çekoslovakya";
+ "çekoslovakyal";
+ "çekoslovakyala";
+ "çekoslovakyalaş";
+ "çekoslovakyalaşt";
+ "çekoslovakyalaştı";
+ "çekoslovakyalaştır";
+ "çekoslovakyalaştıra";
+ "çekoslovakyalaştıram";
+ "çekoslovakyalaştırama";
+ "çekoslovakyalaştıramad";
+ "çekoslovakyalaştıramadı";
+ "çekoslovakyalaştıramadık";
+ "çekoslovakyalaştıramadıkl";
+ "çekoslovakyalaştıramadıkla";
+ "çekoslovakyalaştıramadıklar";
+ "çekoslovakyalaştıramadıkları";
+ "çekoslovakyalaştıramadıklarım";
+ "çekoslovakyalaştıramadıklarımı";
+ "çekoslovakyalaştıramadıklarımız";
+ "çekoslovakyalaştıramadıklarımızd";
+ "çekoslovakyalaştıramadıklarımızda";
+ "çekoslovakyalaştıramadıklarımızdan";
+ "çekoslovakyalaştıramadıklarımızdanm";
+ "çekoslovakyalaştıramadıklarımızdanmı";
+ "çekoslovakyalaştıramadıklarımızdanmıs";
+ "çekoslovakyalaştıramadıklarımızdanmısı";
+ "çekoslovakyalaştıramadıklarımızdanmısın";
+ "çekoslovakyalaştıramadıklarımızdanmısını";
+ "çekoslovakyalaştıramadıklarımızdanmısınız";
+ |]
+]
+
+let test_substring () =
+ let test a =
+ let m = Array.length a - 1 in
+ let v = a.(m) in
+ assert(UTF8.length v = m);
+ for i = 0 to m do
+ assert(a.(i) = UTF8.substring v 0 i);
+ done;
+ for i = 0 to m - 1 do
+ for j = i to m - 1 do
+ let u = UTF8.substring v i (j - i + 1) in
+ UTF8.validate u
+ done
+ done
+ in
+ List.iter test substring_inputs
+
+let split_inputs = [
+ "", [];
+ "de ne me", ["de";"ne";"me"];
+ "yoğurtun tadı ılık iken pek güzel olmaz", ["yoğurtun";"tadı";"ılık";"iken";"pek";"güzel";"olmaz"]
+]
+
+let split_at f u =
+ let m = UTF8.eof u in
+ let b = UTF8.Buf.create m in
+ let rec loop0 r i =
+ if i >= m then
+ List.rev r
+ else
+ if f (UTF8.look u i) then
+ loop0 r (UTF8.next u i)
+ else
+ loop1 r i
+ and loop1 r i =
+ if i = m || f (UTF8.look u i) then
+ begin
+ let x = UTF8.Buf.contents b in
+ UTF8.Buf.clear b;
+ loop0 (x::r) (UTF8.next u i)
+ end
+ else
+ begin
+ UTF8.Buf.add_char b (UTF8.look u i);
+ loop1 r (UTF8.next u i)
+ end
+ in
+ loop0 [] 0
+
+let test_split () =
+ List.iter
+ (fun (u, ul) ->
+ let space = UChar.of_char ' ' in
+ assert(ul = split_at ((=) space) u))
+ split_inputs
+
+let test () =
+ Util.run_test ~test_name:"bd_UTF.substring" test_substring;
+ Util.run_test ~test_name:"bd_UTF.split" test_split
Index: extlib/uTF8.mli
===================================================================
--- extlib/uTF8.mli (revision 381)
+++ extlib/uTF8.mli (working copy)
@@ -62,10 +62,18 @@
(** The position of the head of the last Unicode character. *)
val last : t -> index
+(** The (invalid) position of the head after the last Unicode character.
+ [next u (last u) = eof u] *)
+val eof : t -> index
+
(** [look s i]
returns the Unicode character of the location [i] in the string [s]. *)
val look : t -> index -> uchar
+(** [substring s i m] returns the substring made of the [m] Unicode characters at character positions [i] to [i + m - 1] inclusive.
+ The string is always copied. *)
+val substring : t -> int -> int -> t
+
(** [out_of_range s i]
tests whether [i] is a position inside of [s]. *)
val out_of_range : t -> index -> bool
@@ -113,6 +121,9 @@
a negative integer if [s1] < [s2]. *)
val compare : t -> t -> int
+(** Output the given char in UTF8 format over a binary channel *)
+val output_uchar : out_channel -> uchar -> unit
+
(** Buffer module for UTF-8 strings *)
module Buf : sig
(** Buffers for UTF-8 strings. *)
Index: extlib/uTF8.ml
===================================================================
--- extlib/uTF8.ml (revision 381)
+++ extlib/uTF8.ml (working copy)
@@ -76,6 +76,7 @@
search_head s (i + 1)
let next s i =
+ if i >= String.length s then i else
let n = Char.code s.[i] in
if n < 0x80 then i + 1 else
if n < 0xc0 then search_head s (i + 1) else
@@ -108,14 +109,56 @@
let nth s n = nth_aux s 0 n
+let substring s i n =
+ let j = nth s i in
+ let j' = (nth_aux s j n) - 1 in
+ String.sub s j (j' - j + 1)
+
let last s = search_head_backward s (String.length s - 1)
+let eof s = String.length s
+
let out_of_range s i = i < 0 || i >= String.length s
let compare_index _ i j = i - j
let get s n = look s (nth s n)
+let generic_output_uchar f oc u =
+ let masq = 0b111111 in
+ let f = f oc in
+ let k = int_of_uchar u in
+ if k < 0 || k >= 0x4000000 then begin
+ f (0xfc + (k lsr 30));
+ f (0x80 lor ((k lsr 24) land masq));
+ f (0x80 lor ((k lsr 18) land masq));
+ f (0x80 lor ((k lsr 12) land masq));
+ f (0x80 lor ((k lsr 6) land masq));
+ f (0x80 lor (k land masq))
+ end else if k <= 0x7f then
+ f k
+ else if k <= 0x7ff then begin
+ f (0xc0 lor (k lsr 6));
+ f (0x80 lor (k land masq))
+ end else if k <= 0xffff then begin
+ f (0xe0 lor (k lsr 12));
+ f (0x80 lor ((k lsr 6) land masq));
+ f (0x80 lor (k land masq))
+ end else if k <= 0x1fffff then begin
+ f (0xf0 + (k lsr 18));
+ f (0x80 lor ((k lsr 12) land masq));
+ f (0x80 lor ((k lsr 6) land masq));
+ f (0x80 lor (k land masq))
+ end else begin
+ f (0xf8 + (k lsr 24));
+ f (0x80 lor ((k lsr 18) land masq));
+ f (0x80 lor ((k lsr 12) land masq));
+ f (0x80 lor ((k lsr 6) land masq));
+ f (0x80 lor (k land masq))
+ end
+
+let output_uchar oc u = generic_output_uchar output_byte oc u
+
let add_uchar buf u =
let masq = 0b111111 in
let k = int_of_uchar u in
next reply other threads:[~2008-05-05 9:55 UTC|newest]
Thread overview: 4+ messages / expand[flat|nested] mbox.gz Atom feed top
2008-05-05 9:55 Berke Durak [this message]
2008-05-14 7:54 ` [Ocaml-lib-devel] " Richard Jones
2010-01-26 14:31 ` Yoriyuki Yamagata
2010-01-26 16:01 ` Sylvain Le Gall
Reply instructions:
You may reply publicly to this message via plain-text email
using any one of the following methods:
* Save the following mbox file, import it into your mail client,
and reply-to-all from there: mbox
Avoid top-posting and favor interleaved quoting:
https://en.wikipedia.org/wiki/Posting_style#Interleaved_style
* Reply using the --to, --cc, and --in-reply-to
switches of git-send-email(1):
git send-email \
--in-reply-to=b903a8570805050255q2534a6c8l4bcc4de38e050cba@mail.gmail.com \
--to=berke.durak@gmail.com \
--cc=caml-list@inria.fr \
--cc=ocaml-lib-devel@lists.sourceforge.net \
--cc=rich@annexia.org \
--cc=yoriyuki.y@gmail.com \
/path/to/YOUR_REPLY
https://kernel.org/pub/software/scm/git/docs/git-send-email.html
* If your mail client supports setting the In-Reply-To header
via mailto: links, try the mailto: link
Be sure your reply has a Subject: header at the top and a blank line
before the message body.
This is a public inbox, see mirroring instructions
for how to clone and mirror all data and code used for this inbox