From: Julien Moutinho <julien.moutinho@gmail.com>
To: caml-list@inria.fr
Subject: Re: Warning on home-made functions dealing with UTF-8.
Date: Tue, 16 Oct 2007 20:46:21 +0200 [thread overview]
Message-ID: <20071016184621.GA12628@localhost> (raw)
In-Reply-To: <20071015203509.GA5212@localhost>
Here, I have reused some old code of mine to secure and extend J.Skaller's:
unicode_of_utf8 ~ parse_utf8
utf8_of_unicode ~ utf8_of_int
May it help, and may it not be too bugged.
exception Bad_utf8 of string * (string * int * int * int)
(* raised with an error description and its location:
* bytes
* start (0 < start <= String.length bytes)
* size (0 < size <= String.length bytes)
* position (0 <= position <= size) *)
exception Insufficient of int
(* raised when more bytes are needed.
* The absolute value of the integer is the minimal amount of bytes needed.
* A positive sign means that they have to be appended.
* A negative sign means that they have to be prepended. *)
let in_bounds
~(size: int)
~(pos: int) =
if size <> 0 then begin
if pos < 0 then begin
let i = size - ((- pos) mod size) in
if i = size then 0 else i
end else (pos mod size)
end else 0
let position__char_size__offset
(bytes: string)
?(start = 0)
?(size = String.length bytes)
~(pos: int) : int * int * int =
if size <= 0 then (0, 0, 0)
else begin
let pos = in_bounds ~size ~pos in
let char_pos = start + pos in
let char_start = ref char_pos in
let on_tail = ref true in
let loc = (bytes, start, size, pos) in
(* go backward to find a head *)
while !on_tail do
if char_pos - !char_start > 3
then raise (Bad_utf8 ("cannot find a head nearby", loc))
else if !char_start < start
then raise (Insufficient (-1))
else begin
let cod = Char.code bytes.[!char_start] in
if (cod land 0b1100_0000) = 0b1000_0000 (* on a trailing byte *)
then decr char_start
else on_tail := false
end
done;
let char_start = !char_start in
(* decode the head *)
let head = Char.code bytes.[char_start] in
let overlong boo =
(* check for overlong forms (when a character uses more trailing bytes than needed),
* see http://en.wikipedia.org/wiki/UTF-8#Overlong_forms.2C_invalid_input.2C_and_security_considerations *)
if boo then raise (Bad_utf8 ("overlong form", loc))
in
let may_be_overlong = ref false in
let char_size = (* get the size of the character *)
(* 0zzzzzzz -> 0zzzzzzz = 7 bits *)
if (head land 0b1_0000000) = 0b0_0000000 then 1
(* 110YYYYy 10zzzzzz -> 00000yyy yyzzzzzz = 11 bits *)
else if (head land 0b111_00000) = 0b110_00000
then (overlong ((head land 0b000_11110) = 0); 2)
(* 1110XXXX 10Yyyyyy 10zzzzzz -> xxxxyyyy yyzzzzzz = 16 bits *)
else if (head land 0b1111_0000) = 0b1110_0000
then (may_be_overlong := ((head land 0b0000_1111) = 0); 3)
(* 11110WWW 10XXxxxx 10yyyyyy 10zzzzzz -> 000wwwxx xxxxyyyy yyzzzzzz = 21 bits *)
else if (head land 0b1111_1000) = 0b1111_0000
then (may_be_overlong := ((head land 0b00000_111) = 0); 4)
(* 4 bytes is the maximun size of an UTF-8 character by now *)
else raise (Bad_utf8 ("invalid head", loc))
in
(* decode the tail *)
let off = ref (char_start + 1) in
let t_end = start + size in
let char_end = char_start + char_size in
let max_off = min char_end t_end in
(* check whether the trailing bytes of a character
* are of the form 0b10_xxxxxx *)
while !off < max_off do
let cod = (Char.code bytes.[!off]) in
if (cod land 0b11_000000) <> 0b10_000000
then raise (Bad_utf8 ("invalid tail", loc));
incr off
done;
(* complete the overlong check *)
if max_off >= char_start + 1 (* if there is a second byte *)
&& !may_be_overlong
then overlong
( (char_size = 3
&& ((Char.code bytes.[char_start + 1]) land 0b00_100000) = 0)
|| (char_size = 4
&& ((Char.code bytes.[char_start + 1]) land 0b00_110000) = 0) );
(* check the tail length *)
if char_end > t_end
then raise (Insufficient (char_end - (char_pos + 1)));
(pos, char_size, char_pos - char_start)
end
let unicode_of_utf8
(bytes: string)
?(start = 0)
?(size = String.length bytes)
(pos: int) : int * int =
let pos, char_size, offset =
position__char_size__offset bytes ~start ~size ~pos in
let char_start = pos - offset in
let unicode =
match char_size with
| 1 -> (* 0zzzzzzz -> 0zzzzzzz *)
Char.code bytes.[char_start]
| 2 -> (* 110yyyyy 10zzzzzz -> 00000yyy yyzzzzzz *)
let cod0 = Char.code bytes.[char_start] in
let cod1 = Char.code bytes.[char_start + 1]
in ((cod0 land 0b000_11111) lsl 6)
lor (cod1 land 0b00_111111)
| 3 -> (* 1110xxxx 10yyyyyy 10zzzzzz -> xxxxyyyy yyzzzzzz *)
let cod0 = Char.code bytes.[char_start] in
let cod1 = Char.code bytes.[char_start + 1] in
let cod2 = Char.code bytes.[char_start + 2]
in ((cod0 land 0b0000_1111) lsl 12)
lor ((cod1 land 0b00_111111) lsl 6)
lor (cod2 land 0b00_111111)
| 4 -> (* 11110www 10xxxxxx 10yyyyyy 10zzzzzz -> 000wwwxx xxxxyyyy yyzzzzzz *)
let cod0 = Char.code bytes.[char_start] in
let cod1 = Char.code bytes.[char_start + 1] in
let cod2 = Char.code bytes.[char_start + 2] in
let cod3 = Char.code bytes.[char_start + 3]
in ((cod0 land 0b00000_111) lsl 18)
lor ((cod1 land 0b00_111111) lsl 12)
lor ((cod2 land 0b00_111111) lsl 6)
lor (cod3 land 0b00_111111)
| _ -> assert false
in
match unicode with
| cod when cod >= 0xD800 && cod <= 0xDFFF ->
(* The definition of UTF-8 prohibits encoding character numbers between
* U+D800 and U+DFFF, which are reserved for use with the UTF-16
* encoding form (as surrogate pairs) and do not directly represent characters. *)
raise (Bad_utf8 ("prohibited code point", (bytes, start, size, pos)))
| cod when cod > 0x10FFFF ->
raise (Bad_utf8 ("invalid code point", (bytes, start, size, pos)))
| _ -> (unicode, (char_size - offset))
exception Bad_unicode of string * int
(* raised with an error description and an integer
* which is either a prohibited or an invalid unicode code point *)
let utf8_of_unicode :
int -> string =
function
| cod when cod >= 0x00 && cod <= 0x7F -> (* 0zzzzzzz -> 0zzzzzzz *)
String.make 1 (Char.chr cod)
| cod when cod <= 0x07FF -> (* 00000yyy yyzzzzzz -> 110yyyyy 10zzzzzz *)
let str = String.create 2 in
str.[0] <- Char.chr (0b110_00000 lor (cod lsr 6));
str.[1] <- Char.chr (0b10_000000 lor (cod land 0b00_111111));
str
| cod when cod >= 0xD800 && cod <= 0xDFFF ->
(* The definition of UTF-8 prohibits encoding character numbers between
* U+D800 and U+DFFF, which are reserved for use with the UTF-16
* encoding form (as surrogate pairs) and do not directly represent characters. *)
raise (Bad_unicode ("prohibited code point", cod))
| cod when cod <= 0xFFFF -> (* xxxxyyyy yyzzzzzz -> 1110xxxx 10yyyyyy 10zzzzzz *)
let str = String.create 3 in
str.[0] <- Char.chr (0b1110_0000 lor (cod lsr 12));
str.[1] <- Char.chr (0b10_000000 lor ((cod lsr 6) land 0b00_111111));
str.[2] <- Char.chr (0b10_000000 lor ( cod land 0b00_111111));
str
| cod when cod <= 0x10FFFF -> (* 000wwwxx xxxxyyyy yyzzzzzz -> 11110www 10xxxxxx 10yyyyyy 10zzzzzz *)
let str = String.create 4 in
str.[0] <- Char.chr (0b11110_000 lor ( cod lsr 18));
str.[1] <- Char.chr (0b10_000000 lor ((cod lsr 12) land 0b00_111111));
str.[2] <- Char.chr (0b10_000000 lor ((cod lsr 6) land 0b00_111111));
str.[3] <- Char.chr (0b10_000000 lor ( cod land 0b00_111111));
str
| cod -> raise (Bad_unicode ("invalid code point", cod))
next prev parent reply other threads:[~2007-10-16 18:45 UTC|newest]
Thread overview: 51+ messages / expand[flat|nested] mbox.gz Atom feed top
2007-10-08 15:08 Correct way of programming a CGI script Tom
2007-10-08 15:32 ` [Caml-list] " Dario Teixeira
2007-10-08 16:04 ` Gerd Stolpmann
2007-10-08 21:37 ` skaller
2007-10-08 22:21 ` Erik de Castro Lopo
2007-10-08 23:05 ` skaller
2007-10-08 23:19 ` skaller
2007-10-08 23:23 ` Arnaud Spiwack
2007-10-08 23:47 ` skaller
2007-10-09 5:49 ` David Teller
2007-10-09 10:15 ` Christophe TROESTLER
2007-10-09 15:29 ` skaller
2007-10-09 15:49 ` Vincent Hanquez
2007-10-09 16:00 ` Jon Harrop
2007-10-09 14:02 ` William D. Neumann
2007-10-09 15:25 ` skaller
2007-10-09 15:33 ` William D. Neumann
2007-10-09 15:48 ` Jon Harrop
2007-10-08 23:37 ` skaller
2007-10-09 10:20 ` Christophe TROESTLER
2007-10-09 13:40 ` Rope is the new string Jon Harrop
2007-10-09 15:57 ` [Caml-list] " Vincent Hanquez
2007-10-09 16:42 ` Loup Vaillant
2007-10-09 16:55 ` Vincent Hanquez
2007-10-09 17:32 ` Loup Vaillant
2007-10-09 19:51 ` Vincent Hanquez
2007-10-09 21:06 ` Loup Vaillant
2007-10-10 7:35 ` Vincent Hanquez
2007-10-10 8:05 ` Loup Vaillant
2007-10-11 13:23 ` Vincent Hanquez
2007-10-09 22:04 ` Chris King
2007-10-11 13:03 ` Vincent Hanquez
2007-10-11 13:54 ` skaller
2007-10-11 14:21 ` Vincent Hanquez
2007-10-11 14:27 ` Benjamin Monate
2007-10-11 14:48 ` skaller
2007-10-11 21:16 ` Alain Frisch
2007-10-15 20:35 ` Warning on home-made functions dealing with UTF-8 Julien Moutinho
2007-10-15 23:51 ` [Caml-list] " skaller
2007-10-16 2:21 ` Julien Moutinho
2007-10-16 18:46 ` Julien Moutinho [this message]
2007-10-16 18:51 ` Julien Moutinho
2007-10-17 2:23 ` [Caml-list] " skaller
2007-10-09 10:26 ` [Caml-list] Correct way of programming a CGI script Gerd Stolpmann
2007-10-09 15:16 ` skaller
2007-10-09 15:31 ` William D. Neumann
2007-10-09 12:52 ` Brian Hurt
2007-10-09 13:56 ` Jon Harrop
2007-10-09 15:18 ` William D. Neumann
2007-10-08 16:11 ` Loup Vaillant
2007-10-08 19:07 ` Christophe TROESTLER
Reply instructions:
You may reply publicly to this message via plain-text email
using any one of the following methods:
* Save the following mbox file, import it into your mail client,
and reply-to-all from there: mbox
Avoid top-posting and favor interleaved quoting:
https://en.wikipedia.org/wiki/Posting_style#Interleaved_style
* Reply using the --to, --cc, and --in-reply-to
switches of git-send-email(1):
git send-email \
--in-reply-to=20071016184621.GA12628@localhost \
--to=julien.moutinho@gmail.com \
--cc=caml-list@inria.fr \
/path/to/YOUR_REPLY
https://kernel.org/pub/software/scm/git/docs/git-send-email.html
* If your mail client supports setting the In-Reply-To header
via mailto: links, try the mailto: link
Be sure your reply has a Subject: header at the top and a blank line
before the message body.
This is a public inbox, see mirroring instructions
for how to clone and mirror all data and code used for this inbox