From: Max Skaller <maxs@in.ot.com.au>
To: luther@dpt-info.u-strasbg.fr
Cc: John Max Skaller <skaller@maxtal.com.au>, caml-list@inria.fr
Subject: Re: Unsigned integers?
Date: Thu, 23 Mar 2000 13:08:54 +1100 [thread overview]
Message-ID: <38D97CB6.1465CD8@in.ot.com.au> (raw)
In-Reply-To: <20000322172215.A9391@dpt-info.u-strasbg.fr>
Sven LUTHER wrote:
>
> On Wed, Mar 22, 2000 at 09:22:15AM +1100, John Max Skaller wrote:
> > I have some code for processing ISO-10646 characters and UTF-8,
> > which uses caml integers. ISO-10646 has 2^31 code points, which
> > can be covered by caml integers on a 32bit machine. Using an
> > unboxed type is mandatory for performance.
> >
> > Unfortunately, caml integers are signed, which makes most of the
> > code I have written wrong (I haven't taken the care to handle
> > integers over 2^30 correctly).
> >
> > What is the best way to handle this problem?
> > Would a (standard?) library module (written in C), that treats
> > integers as unsigned be a reasonable solution?
> >
> > [This may require writing 'uint_add x y' instead of 'x+y',
> > but that doesn't matter in the above mentioned application,
> > since the integers are being used to represent characters]
>
> Just use the caml integer and ignore the fact that they are signed ?
>
> after the moto : that doesn't matter in the above mentioned application,
Perhaps my explanation was unclear. In my code, I must
calculate a UTF-8 encoding from an ISO-10646 code point,
and calculate an ISO-10646 code point from a UTF-8 encoding.
The code is below. The code works for values <2^30,
but fails when an int goes negative.
I would be happy to replace, in this code,
every use of 'lor', 'land', + - * < etc with
'ulor' 'uland' 'uplus' 'uminus' 'uless' etc, if only
I could define them. (I could do this in C .. but then,
I could write the below routines in C too)
Note these operations MUST be extremely fast,
and in particular, compact storage of ISO-10646
code points in arrays of integers is OK,
while arrays of boxed values is out of the question.
(So I can't use int32).
-------------------------------------------------------
(** [parse_utf8 s i] decodes the UTF-8 sequence starting at byte offset [i]
    of [s] and returns [(code_point, next)], where [next] is the offset of
    the byte that follows the decoded sequence.

    - If [i] is at or past the end of [s], ["FAILURE"] is printed on standard
      output and [(-1, i)] is returned.
    - If the lead byte does not announce a sequence, or [s] is too short to
      hold the announced sequence, the lead byte itself is returned as the
      code point together with [i + 1].

    Sequences of one to six bytes are accepted. Continuation bytes are not
    validated: only their low six bits are used. A six-byte sequence carries
    31 payload bits, so its value only fits in a non-negative [int] when
    [int] is wider than 31 bits.

    @raise Invalid_argument if [i] is negative. *)
let parse_utf8 (s : string) (i : int) : int * int =
  let ord = int_of_char in
  (* Six payload bits of the continuation byte at offset [i + k]. *)
  let cont k = ord s.[i + k] land 0x3F in
  (* Number of bytes available from offset [i] onwards. *)
  let n = String.length s - i in
  if n <= 0 then begin
    print_endline "FAILURE";
    (-1, i)
  end
  else
    let lead = ord s.[i] in
    (* The payload mask of the lead byte shrinks by one bit for every extra
       byte in the sequence; a fixed 0x1F mask would let the length-marker
       bits of 4-, 5- and 6-byte lead bytes leak into the code point. *)
    if lead land 0x80 = 0 then (lead land 0x7F, i + 1) (* ASCII *)
    else if lead land 0xE0 = 0xC0 && n > 1 then
      (((lead land 0x1F) lsl 6) lor cont 1, i + 2)
    else if lead land 0xF0 = 0xE0 && n > 2 then
      (((lead land 0x0F) lsl 12) lor (cont 1 lsl 6) lor cont 2, i + 3)
    else if lead land 0xF8 = 0xF0 && n > 3 then
      ( ((lead land 0x07) lsl 18)
        lor (cont 1 lsl 12)
        lor (cont 2 lsl 6)
        lor cont 3,
        i + 4 )
    else if lead land 0xFC = 0xF8 && n > 4 then
      ( ((lead land 0x03) lsl 24)
        lor (cont 1 lsl 18)
        lor (cont 2 lsl 12)
        lor (cont 3 lsl 6)
        lor cont 4,
        i + 5 )
    else if lead land 0xFE = 0xFC && n > 5 then
      ( ((lead land 0x01) lsl 30)
        lor (cont 1 lsl 24)
        lor (cont 2 lsl 18)
        lor (cont 3 lsl 12)
        lor (cont 4 lsl 6)
        lor cont 5,
        i + 6 )
    else (lead, i + 1) (* error, just use bad character *)
(** [utf8_of_int i] returns the UTF-8 byte sequence encoding the code point
    [i] as a string of one to six bytes.

    The length is chosen from the magnitude of [i]: below [0x80] one byte,
    below [0x800] two, below [0x10000] three, below [0x200000] four, below
    [0x4000000] five, and six bytes otherwise. In the six-byte form only the
    low 31 bits of [i] are represented.

    @raise Invalid_argument if [i] is negative (from [Char.chr]). *)
let utf8_of_int i =
  (* [encode marker mask extra] builds a sequence of [extra + 1] bytes: a
     lead byte made of [marker] plus the top payload bits of [i] selected by
     [mask], followed by [extra] continuation bytes of six bits each, most
     significant group first. *)
  let encode marker mask extra =
    String.init (extra + 1) (fun k ->
        let shift = 6 * (extra - k) in
        if k = 0 then Char.chr (marker lor ((i lsr shift) land mask))
        else Char.chr (0x80 lor ((i lsr shift) land 0x3F)))
  in
  if i < 0x80 then String.make 1 (Char.chr i)
  else if i < 0x800 then encode 0xC0 0x1F 1
  else if i < 0x10000 then encode 0xE0 0xF 2
  else if i < 0x200000 then encode 0xF0 0x7 3
  else if i < 0x4000000 then encode 0xF8 0x3 4
  else encode 0xFC 0x1 5
--
John (Max) Skaller at OTT [Open Telecommications Ltd]
mailto:maxs@in.ot.com.au -- at work
mailto:skaller@maxtal.com.au -- at home
next prev parent reply other threads:[~2000-03-23 12:50 UTC|newest]
Thread overview: 60+ messages / expand[flat|nested] mbox.gz Atom feed top
2000-03-14 16:53 Syntax for label Don Syme
2000-03-14 18:05 ` Pierre Weis
2000-03-15 3:15 ` Syntax for label, NEW PROPOSAL Jacques Garrigue
2000-03-15 6:58 ` Christophe Raffalli
2000-03-15 21:54 ` Julian Assange
2000-03-15 11:56 ` Wolfram Kahl
2000-03-15 13:58 ` Pierre Weis
2000-03-15 15:26 ` Sven LUTHER
2000-03-17 7:44 ` Pierre Weis
2000-03-15 17:04 ` John Prevost
2000-03-17 10:11 ` Jacques Garrigue
2000-03-15 17:06 ` Markus Mottl
2000-03-15 19:11 ` Remi VANICAT
2000-03-17 8:30 ` Pierre Weis
2000-03-17 14:05 ` Jacques Garrigue
2000-03-17 16:08 ` Pierre Weis
2000-03-18 10:32 ` Syntax for label, NEW SOLUTION Christophe Raffalli
2000-03-19 2:29 ` Jacques Garrigue
2000-03-20 18:25 ` Christophe Raffalli
2000-03-22 8:37 ` Claudio Sacerdoti Coen
2000-03-21 23:29 ` John Max Skaller
2000-03-29 8:42 ` Semantic of label: The best (only ?) solution to merge both mode Christophe Raffalli
2000-03-29 9:53 ` Christophe Raffalli
2000-03-30 9:49 ` John Max Skaller
2000-03-30 9:39 ` John Max Skaller
2000-03-31 4:34 ` Jacques Garrigue
2000-04-01 1:53 ` John Max Skaller
2000-04-02 19:24 ` Christophe Raffalli
2000-04-04 5:50 ` Jacques Garrigue
2000-04-03 7:57 ` backward compatibility Christophe Raffalli
2000-03-15 21:30 ` Syntax for label, NEW PROPOSAL John Max Skaller
2000-03-16 2:55 ` Jacques Garrigue
2000-03-17 15:13 ` Pierre Weis
2000-03-17 17:33 ` Wolfram Kahl
2000-03-18 11:59 ` Jacques Garrigue
2000-03-21 16:51 ` Pascal Brisset
2000-03-23 11:14 ` Nicolas barnier
2000-03-24 9:54 ` labels & ocaml 3 & co David Mentré
2000-03-24 12:19 ` David Mentré
2000-03-21 22:22 ` Unsigned integers? John Max Skaller
2000-03-22 16:22 ` Sven LUTHER
2000-03-23 2:08 ` Max Skaller [this message]
2000-03-23 7:50 ` Sven LUTHER
2000-03-24 2:50 ` Jacques Garrigue
2000-03-24 15:59 ` Xavier Leroy
2000-03-25 4:03 ` John Max Skaller
2000-03-24 14:50 ` Xavier Leroy
2000-03-22 17:05 ` Jean-Christophe Filliatre
2000-03-22 19:10 ` Markus Mottl
2000-03-23 2:41 ` Max Skaller
2000-03-22 19:47 ` Xavier Leroy
2000-03-23 12:55 ` John Max Skaller
2000-03-16 8:50 ` Syntax for label, NEW PROPOSAL Pascal Brisset
2000-03-17 11:15 ` Sven LUTHER
2000-03-18 0:04 ` Syntax for label, ANOTHER " Steven Thomson
2000-03-15 20:39 ` Syntax for label (and more) Xavier Leroy
2000-03-17 10:03 ` Christian RINDERKNECHT
2000-03-17 17:19 ` Christophe Raffalli
2000-03-21 1:29 ` Markus Mottl
2000-03-23 19:42 Unsigned integers? Damien Doligez
Reply instructions:
You may reply publicly to this message via plain-text email
using any one of the following methods:
* Save the following mbox file, import it into your mail client,
and reply-to-all from there: mbox
Avoid top-posting and favor interleaved quoting:
https://en.wikipedia.org/wiki/Posting_style#Interleaved_style
* Reply using the --to, --cc, and --in-reply-to
switches of git-send-email(1):
git send-email \
--in-reply-to=38D97CB6.1465CD8@in.ot.com.au \
--to=maxs@in.ot.com.au \
--cc=caml-list@inria.fr \
--cc=luther@dpt-info.u-strasbg.fr \
--cc=skaller@maxtal.com.au \
/path/to/YOUR_REPLY
https://kernel.org/pub/software/scm/git/docs/git-send-email.html
* If your mail client supports setting the In-Reply-To header
via mailto: links, try the mailto: link
Be sure your reply has a Subject: header at the top and a blank line
before the message body.
This is a public inbox, see mirroring instructions
for how to clone and mirror all data and code used for this inbox