From: malc <malc@pulsesoft.com>
To: Gerd Stolpmann <info@gerd-stolpmann.de>
Cc: caml-list@inria.fr
Subject: Re: [Caml-list] Re: zcat vs CamlZip
Date: Wed, 30 Aug 2006 04:44:44 +0400 (MSD) [thread overview]
Message-ID: <Pine.LNX.4.64.0608300433220.8632@home.oyster.ru> (raw)
In-Reply-To: <1156881856.21883.124.camel@localhost.localdomain>
On Tue, 29 Aug 2006, Gerd Stolpmann wrote:
> Am Dienstag, den 29.08.2006, 15:15 -0400 schrieb Sam Steingold:
>> at any rate, do you really expect that using Gzip.input and then
>> searching the result for a newline, slicing and dicing to get the
>> individual input lines, &c &c would be faster?
>
> Ah yes, and there is an easy solution with ocamlnet:
[..snip..]
> This adds a buffering layer.
The Netchannels buffering looks very elegant, but my (admittedly rather
cursory) testing shows that it's also rather slow.
The following code implements four line readers:
Sam's original [char]
Netchannels [net]
open_process_in [zcat]
and buffered (trying to stay compatible with original interface) [block]
While Netchannels does beat the original implementation, it loses to all
the other methods (on my machine).
(* Scratch buffer shared by every call to [gz_input_line]. *)
let buf = Buffer.create 1024

(** [gz_input_line gz_in char_counter line_counter] reads one line from
    [gz_in] one character at a time and returns it without the trailing
    newline.

    [char_counter] is bumped once per character read (the newline
    included); [line_counter] is bumped once per line returned.

    @raise End_of_file when the input ends before any character of a new
    line was read.  A final line lacking a newline is still returned. *)
let gz_input_line gz_in char_counter line_counter =
  Buffer.clear buf;
  let at_newline = ref false in
  (try
     while not !at_newline do
       let c = Gzip.input_char gz_in in
       char_counter := Int64.succ !char_counter;
       if c = '\n' then at_newline := true else Buffer.add_char buf c
     done
   with End_of_file ->
     (* Only a completely empty read counts as end of input. *)
     if Buffer.length buf = 0 then raise End_of_file);
  incr line_counter;
  Buffer.contents buf
(** Adapter exposing a Gzip input channel as a
    [Netchannels.rec_in_channel].  A zero-byte read from [Gzip.input] is
    turned into [End_of_file]; any other count is passed through. *)
class input_gzip_rec gzip_ch : Netchannels.rec_in_channel =
  object
    method input dst ofs len =
      match Gzip.input gzip_ch dst ofs len with
      | 0 -> raise End_of_file
      | count -> count

    method close_in () = Gzip.close_in gzip_ch
  end
(** [wrap_gz gz_in] returns a line reader [next] over [gz_in] that pulls
    data in 4096-byte blocks instead of one character at a time.

    [next char_counter line_counter] returns the next line without its
    trailing newline.  [char_counter] is increased by the byte count of
    every block obtained from [Gzip.input]; [line_counter] is increased
    once per line returned.

    A final line that is not newline-terminated is returned (and counted)
    rather than discarded, matching [gz_input_line].

    @raise End_of_file when a read yields zero bytes and no partial line
    is pending. *)
let wrap_gz gz_in =
  (* NOTE(review): [String.create] and filling a string in place follow the
     mutable-string API this code was written against; porting to a current
     compiler would presumably need [Bytes] -- confirm before changing. *)
  let s = String.create 4096 in
  (* Accumulates a line that spans more than one block. *)
  let b = Buffer.create 1024 in
  (* Continuation for the next call: either "scan the rest of the current
     block" or "read a fresh block". *)
  let r = ref (fun _ _ -> assert false) in
  (* Index of the first '\n' in [s] within [start, finish), if any. *)
  let findlf s start finish =
    let rec loop pos =
      if pos >= finish then None
      else if String.unsafe_get s pos = '\n' then Some pos
      else loop (succ pos)
    in
    loop start
  in
  let rec cont pos char_counter line_counter =
    let n = Gzip.input gz_in s pos (String.length s - pos) in
    let rec subcont pos len char_counter line_counter =
      let finish = pos + len in
      match findlf s pos finish with
      | None ->
        (* No newline in the remainder: stash it and refill.  Point [r] at
           a fresh read first, so that the stale slice is never rescanned
           if this call ends at end of input. *)
        Buffer.add_substring b s pos len;
        r := cont 0;
        cont 0 char_counter line_counter
      | Some lfpos ->
        let runlen = lfpos - pos in
        incr line_counter;
        Buffer.add_substring b s pos runlen;
        let line = Buffer.contents b in
        Buffer.clear b;
        (* Resume just past the newline on the next call. *)
        r := subcont (succ lfpos) (len - succ runlen);
        line
    in
    if n = 0 then begin
      (* A zero-length read is treated as end of input. *)
      if Buffer.length b = 0 then raise End_of_file
      else begin
        (* Flush the unterminated last line instead of losing it. *)
        incr line_counter;
        let line = Buffer.contents b in
        Buffer.clear b;
        line
      end
    end
    else begin
      char_counter := Int64.add (Int64.of_int n) !char_counter;
      subcont pos n char_counter line_counter
    end
  in
  let exec c l = !r c l in
  r := cont 0;
  exec
(** Benchmark driver for the character-at-a-time reader: decompresses
    stdin with [gz_input_line] until [End_of_file], then prints the
    character and line totals. *)
let char () =
  let gz = Gzip.open_in_chan stdin in
  let chars = ref 0L in
  let lines = ref 0 in
  (* Lines are read only for their effect on the counters. *)
  let rec drain () =
    ignore (gz_input_line gz chars lines);
    drain ()
  in
  try drain ()
  with End_of_file -> Format.printf "cc=%Ld lc=%d@." !chars !lines
(** Benchmark driver for the block-buffered reader: decompresses stdin
    through the reader built by [wrap_gz] until [End_of_file], then prints
    the character and line totals. *)
let block () =
  let gz = Gzip.open_in_chan stdin in
  let chars = ref 0L in
  let lines = ref 0 in
  let next_line = wrap_gz gz in
  try
    while true do
      (* The line itself is discarded; only the counters matter. *)
      ignore (next_line chars lines)
    done
  with End_of_file -> Format.printf "cc=%Ld lc=%d@." !chars !lines
(** Benchmark driver that delegates decompression to an external [zcat]
    process and reads its output with [input_line].

    Each line adds its length plus one (for the newline) to the character
    total and one to the line total.  On [End_of_file] the child process
    is closed and reaped, then the totals are printed. *)
let zcat () =
  let ic = Unix.open_process_in "zcat" in
  let cc = ref 0L in
  let lc = ref 0 in
  try
    while true do
      let line = input_line ic in
      cc := Int64.add (Int64.of_int (String.length line + 1)) !cc;
      incr lc
    done
  with End_of_file ->
    (* Close the pipe and wait for the child so it is not left behind;
       its exit status is deliberately not inspected here. *)
    ignore (Unix.close_process_in ic);
    Format.printf "cc=%Ld lc=%d@." !cc !lc
(** Benchmark driver for the Netchannels reader: wraps the gzip stream on
    stdin in [input_gzip_rec], lifts it with [Netchannels.lift_in], and
    reads lines until [End_of_file] before printing the totals.  Each line
    adds its length plus one to the character total. *)
let net () =
  let gz_in = Gzip.open_in_chan stdin in
  let gz_ch = Netchannels.lift_in (`Rec (new input_gzip_rec gz_in)) in
  let chars = ref 0L in
  let lines = ref 0 in
  let rec pump () =
    let len = String.length (gz_ch#input_line ()) in
    chars := Int64.add (Int64.of_int (len + 1)) !chars;
    incr lines;
    pump ()
  in
  try pump ()
  with End_of_file -> Format.printf "cc=%Ld lc=%d@." !chars !lines
(* Entry point: the single command-line argument selects which reader to
   benchmark; any other invocation prints a usage line on stderr. *)
let () =
  let usage () = prerr_endline (Sys.argv.(0) ^ ": [char|zcat|block|net]") in
  if Array.length Sys.argv <> 2 then usage ()
  else
    match Sys.argv.(1) with
    | "char" -> char ()
    | "zcat" -> zcat ()
    | "block" -> block ()
    | "net" -> net ()
    | _ -> usage ()
--
mailto:malc@pulsesoft.com
next prev parent reply other threads:[~2006-08-30 0:45 UTC|newest]
Thread overview: 12+ messages / expand[flat|nested] mbox.gz Atom feed top
2006-08-29 18:40 Sam Steingold
2006-08-29 18:54 ` Bardur Arantsson
2006-08-29 19:01 ` [Caml-list] " Florian Hars
2006-08-29 19:15 ` Sam Steingold
2006-08-29 19:48 ` Bárður Árantsson
2006-08-29 19:54 ` [Caml-list] " Gerd Stolpmann
2006-08-29 20:04 ` Gerd Stolpmann
2006-08-30 0:44 ` malc [this message]
2006-08-30 0:53 ` Jonathan Roewen
2006-08-29 19:37 ` John Carr
2006-08-29 19:11 ` [Caml-list] " Eric Cooper
2006-08-30 6:12 ` Jeff Henrikson
Reply instructions:
You may reply publicly to this message via plain-text email
using any one of the following methods:
* Save the following mbox file, import it into your mail client,
and reply-to-all from there: mbox
Avoid top-posting and favor interleaved quoting:
https://en.wikipedia.org/wiki/Posting_style#Interleaved_style
* Reply using the --to, --cc, and --in-reply-to
switches of git-send-email(1):
git send-email \
--in-reply-to=Pine.LNX.4.64.0608300433220.8632@home.oyster.ru \
--to=malc@pulsesoft.com \
--cc=caml-list@inria.fr \
--cc=info@gerd-stolpmann.de \
/path/to/YOUR_REPLY
https://kernel.org/pub/software/scm/git/docs/git-send-email.html
* If your mail client supports setting the In-Reply-To header
via mailto: links, try the mailto: link
Be sure your reply has a Subject: header at the top and a blank line
before the message body.
This is a public inbox, see mirroring instructions
for how to clone and mirror all data and code used for this inbox