* Utf8 (and other) code for ocaml (fwd)
@ 1999-10-21 17:06 Pierre Weis
0 siblings, 0 replies; only message in thread
From: Pierre Weis @ 1999-10-21 17:06 UTC (permalink / raw)
To: caml-list
Dear Caml list readers,
Would you please note that, for security reasons, no attachments are
allowed in messages sent to you from the Caml list. So, please, do not
send messages with attachments to the list, since I'm obliged to send
these messages back, or to remove attachments by hand, as I did for
this message from John Skaller. I will not do this again.
Thank you in advance.
[Pierre Weis as moderator of the Caml list]
----- Forwarded message from skaller -----
>From weis@pauillac.inria.fr Thu Oct 21 06:52:19 1999
Sender: root@ruby
Message-ID: <380E9AA4.8AEC3665@maxtal.com.au>
Date: Thu, 21 Oct 1999 14:46:28 +1000
From: skaller <skaller@maxtal.com.au>
Organization: Maxtal
X-Mailer: Mozilla 4.51 [en] (X11; I; Linux 2.2.12 i586)
X-Accept-Language: en
MIME-Version: 1.0
CC: caml-list@inria.fr
Subject: Utf8 (and other) code for ocaml
References: <380CB30E.56D1A8A2@maxtal.com.au> <99102100543400.15513@ice>
John Skaller, mailto:skaller@maxtal.com.au
1/10 Toxteth Rd Glebe NSW 2037 Australia
homepage: http://www.maxtal.com.au/~skaller
downloads: http://www.triode.net.au/~skaller
let hexchar_of_int i =
if i < 10
then char_of_int (i + (int_of_char '0'))
else char_of_int (i- 10 + (int_of_char 'A'))
;;
let hex4 i =
let j = ref i in
let s = String.create 4 in
for k = 0 to 3 do s.[3-k] <- hexchar_of_int (!j mod 16); j := !j / 16 done;
s
;;
let escape_of_char quote ch =
if ch = '\\' then "\\\\"
else if ch = quote then "\\" ^ (String.make 1 quote)
else if ch = '\n' then "\\n"
else if ch < ' '
or ch > char_of_int 126
then "\\u" ^ (hex4 (Char.code ch))
else String.make 1 ch
;;
let escape_of_string quote x =
let esc = escape_of_char quote in
let res = ref "" in
for i = 0 to (String.length x -1) do
res := !res ^ (esc x.[i])
done;
(String.make 1) quote ^ !res ^ (String.make 1 quote)
;;
let dquote_of_string = escape_of_string '"';;
let quote_of_string = escape_of_string '\'';;
let hex_char2int s =
let c = Char.code s in
match s with
_ when (s >= '0' & s <= '9') ->
c - (Char.code '0')
| _ when (s >= 'a' & s <= 'f') ->
(c - (Char.code 'a')) + 10
| _ when (s >= 'A' & s <= 'F') ->
(c - (Char.code 'A')) + 10
| _ -> raise (Py_exceptions.LexError "in hexadecimal character")
;;
let oct_char2int s =
let c = Char.code s in
match s with
_ when (s >= '0' & s <= '7') ->
c - (Char.code '0')
| _ -> raise (Py_exceptions.LexError "in octal character")
;;
let dec_char2int s =
let c = Char.code s in
match s with
_ when (s >= '0' & s <= '9') ->
c - (Char.code '0')
| _ -> raise (Py_exceptions.LexError "in decimal character")
;;
let len = String.length;;
let hexint_of_string s =
let len = len s in
let value = ref (hex_char2int s.[2]) in
for i = 3 to (len - 1) do
value := !value * 16 + (hex_char2int s.[i])
done;
!value
;;
let hexbig_int_of_string s =
let len = len s in
let value = ref (Big_int.big_int_of_int (hex_char2int s.[2])) in
for i = 3 to (len - 1) do
value :=
Big_int.add_int_big_int
(hex_char2int s.[i])
(Big_int.mult_int_big_int 16 !value)
done;
!value
;;
(* WARNING: THIS CODE WILL NOT WORK FOR THE HIGHER PLANES
BECAUSE OCAML ONLY SUPPORTS 31 bit signed integers;
THIS CODE REQUIRES 32 bits [This can be fixed by using
negative codes but hasn't been done]
*)
(* parse the first utf8 encoded character of a string s
starting at index position i, return a pair
consisting of the decoded integers, and the position
of the first character not decoded.
If the first character is bad, it is returned,
otherwise if the encoding is bad, the result is
an unspecified value.
COMPATIBILITY NOTE: if this function is called
with a SINGLE character string, it will return
the usual value for the character, in range
0 .. 255
*)
let parse_utf8 (s:string) (i:int) : int * int =
let ord = int_of_char
and n = (String.length s) - i
in if n <= 0 then begin print_endline "FAILURE"; (-1),i end
else let lead = ord (s.[i]) in
if (lead land 0x80) = 0 then
lead land 0x7F,i+1 (* ASCII *)
else if lead land 0xE0 = 0xC0 && n > 1 then
((lead land 0x1F) lsl 6) lor
(ord(s.[i+1]) land 0x3F),i+2
else if lead land 0xF0 = 0xE0 && n > 2 then
((lead land 0x1F) lsl 12) lor
((ord(s.[i+1]) land 0x3F) lsl 6) lor
(ord(s.[i+2]) land 0x3F),i+3
else if lead land 0xF8 = 0xF0 && n > 3 then
((lead land 0x1F) lsl 18) lor
((ord(s.[i+1]) land 0x3F) lsl 12) lor
((ord(s.[i+2]) land 0x3F) lsl 6) lor
(ord(s.[i+3]) land 0x3F),i+4
else if lead land 0xFC = 0xF8 && n > 4 then
((lead land 0x1F) lsl 24) lor
((ord(s.[i+1]) land 0x3F) lsl 18) lor
((ord(s.[i+2]) land 0x3F) lsl 12) lor
((ord(s.[i+3]) land 0x3F) lsl 6) lor
(ord(s.[i+4]) land 0x3F),i+5
else if lead land 0xFE = 0xFC && n > 5 then
((lead land 0x1F) lsl 30) lor
((ord(s.[i+1]) land 0x3F) lsl 24) lor
((ord(s.[i+2]) land 0x3F) lsl 18) lor
((ord(s.[i+3]) land 0x3F) lsl 12) lor
((ord(s.[i+4]) land 0x3F) lsl 6) lor
(ord(s.[i+5]) land 0x3F),i+6
else lead, i+1 (* error, just use bad character *)
;;
(* convert an integer into a utf-8 encoded string of bytes *)
let utf8_of_int i =
let chr x = String.make 1 (Char.chr x) in
if i < 0x80 then
chr(i)
else if i < 0x800 then
chr(0xC0 lor ((i lsr 6) land 0x1F)) ^
chr(0x80 lor (i land 0x3F))
else if i < 0x10000 then
chr(0xE0 lor ((i lsr 12) land 0xF)) ^
chr(0x80 lor ((i lsr 6) land 0x3F)) ^
chr(0x80 lor (i land 0x3F))
else if i < 0x200000 then
chr(0xF0 lor ((i lsr 18) land 0x7)) ^
chr(0x80 lor ((i lsr 12) land 0x3F)) ^
chr(0x80 lor ((i lsr 6) land 0x3F)) ^
chr(0x80 lor (i land 0x3F))
else if i < 0x4000000 then
chr(0xF8 lor ((i lsr 24) land 0x3)) ^
chr(0x80 lor ((i lsr 18) land 0x3F)) ^
chr(0x80 lor ((i lsr 12) land 0x3F)) ^
chr(0x80 lor ((i lsr 6) land 0x3F)) ^
chr(0x80 lor (i land 0x3F))
else chr(0xFC lor ((i lsr 30) land 0x1)) ^
chr(0x80 lor ((i lsr 24) land 0x3F)) ^
chr(0x80 lor ((i lsr 18) land 0x3F)) ^
chr(0x80 lor ((i lsr 12) land 0x3F)) ^
chr(0x80 lor ((i lsr 6) land 0x3F)) ^
chr(0x80 lor (i land 0x3F))
;;
let unescape s =
let hex_limit = 2 in
let n = len s in
let s' = ref "" in
let tack_string ss = s':= !s' ^ ss in
let tack_char ch = tack_string (String.make 1 ch) in
let tack_utf8 code = tack_string (utf8_of_int code) in
let i= ref 0 in
while !i< n do let ch = s.[!i] in
if ch = '\\' then begin
incr i;
if !i = n then tack_char '\\'
else match s.[!i] with
| 'a' -> tack_char '\007'; incr i (* 7 *)
| 't' -> tack_char '\t'; incr i (* 9 *)
| 'n' -> tack_char '\n'; incr i (* 10 *)
| 'r' -> tack_char '\r'; incr i (* 13 *)
| 'v' -> tack_char '\011'; incr i
| 'f' -> tack_char '\012'; incr i
| 'b' -> tack_char '\008'; incr i
| '\\' -> tack_char '\\'; incr i
| '"' -> tack_char '"'; incr i (* NOTE OCAMLLEX BUG: TWO SPACES REQUIRED *)
| '\'' -> tack_char '\''; incr i
| 'x' ->
begin
incr i;
let j = ref 0 and value = ref 0 in
while
(!i < n) &
(!j < hex_limit) &
(String.contains "0123456789ABCDEFabcdef" s.[!i]) do
value := !value * 16 + (hex_char2int s.[!i]);
incr i;
incr j
done;
tack_utf8 !value
end
| 'u' ->
begin
incr i;
let j = ref 0 and value = ref 0 in
while
(!i < n) &
(!j < 4) &
(String.contains "0123456789ABCDEFabcdef" s.[!i]) do
value := !value * 16 + (hex_char2int s.[!i]);
incr i;
incr j
done;
tack_utf8 !value
end
| 'U' ->
begin
incr i;
let j = ref 0 and value = ref 0 in
while
(!i < n) &
(!j < 8) &
(String.contains "0123456789ABCDEFabcdef" s.[!i]) do
value := !value * 16 + (hex_char2int s.[!i]);
incr i;
incr j
done;
tack_utf8 !value
end
| 'd' ->
begin
incr i;
let j = ref 0 and value = ref 0 in
while
(!i < n) &
(!j < 3) &
(String.contains "0123456789" s.[!i]) do
value := !value * 10 + (dec_char2int s.[!i]);
incr i;
incr j
done;
tack_utf8 !value
end
| 'o' ->
begin
incr i;
let j = ref 0 and value = ref 0 in
while
(!i < n) &
(!j < 3) &
(String.contains "01234567" s.[!i]) do
value := !value * 8 + (oct_char2int s.[!i]);
incr i;
incr j
done;
tack_utf8 !value
end
| '0'
| '1'
| '2'
| '3'
| '4'
| '5'
| '6'
| '7' ->
begin
let j = ref 0 and value = ref 0 in
while
(!i < n) &
(!j < 3) &
(String.contains "01234567" s.[!i]) do
value := !value * 8 + (oct_char2int s.[!i]);
incr i;
incr j
done;
tack_utf8 !value
end
| x -> tack_char '\\'; tack_char x;
incr i;
end else (tack_char s.[!i]; incr i)
done;
!s'
;;
----- End of forwarded message from skaller -----
^ permalink raw reply [flat|nested] only message in thread
only message in thread, other threads:[~1999-10-21 17:10 UTC | newest]
Thread overview: (only message) (download: mbox.gz / follow: Atom feed)
-- links below jump to the message on this page --
1999-10-21 17:06 Utf8 (and other) code for ocaml (fwd) Pierre Weis
This is a public inbox, see mirroring instructions
for how to clone and mirror all data and code used for this inbox