From: Michal Moskal <malekith@pld-linux.org>
To: Ram Bhamidipaty <ramb@sonic.net>
Cc: caml-list@pauillac.inria.fr
Subject: Re: [Caml-list] what is the functional way to solve this problem?
Date: Thu, 2 Oct 2003 16:53:27 +0200 [thread overview]
Message-ID: <20031002145327.GA4690@roke.freak> (raw)
In-Reply-To: <16252.6810.502406.679837@oak.ramandgita.com>
[-- Attachment #1: Type: text/plain, Size: 2415 bytes --]
On Thu, Oct 02, 2003 at 05:31:22AM -0700, Ram Bhamidipaty wrote:
> Someone suggested that my previous question about dynamically resizing
> arrays hinted that my solution may be going in a non-functional
> direction. That might be true.
>
> So here is the problem I am trying to solve. I would like to solve it
> in a "functional way".
Use Map. You won't notice a difference.
[...]
> I have already written a python application that can read this data file and
> generate the data I am looking for. I was not happy with the python solution
> because it was not very fast. Even with using a heap to store the top 100
> largest files I was not able to create a python solution that could beat
> a "grep | sort" pipeline (on unix of course).
[...]
./stats < /shm/d 0.11s user 0.00s system 78% cpu 0.139 total
grep '^F' /shm/d 0.02s user 0.00s system 22% cpu 0.090 total
sort -n -t' ' -k2,2 0.32s user 0.05s system 71% cpu 0.517 total
tail -100 0.02s user 0.00s system 3% cpu 0.514 total
86287 263948 1912177 /shm/d
Did I win something? ;-)
[...]
> My goal of implementing this in OCaml is to beat the grep+sort
> combination. If I can create a solution that can output all the
> date I want - in one pass - AND still be faster than the grep+sort
> partial solution - _that_ would be cool!
>
> Having said all that - I wanted to use an array to hold the data for
> each directory. I hoped that using an array would be faster than a
> hash table since I know that the directory numbers are assigned
> sequentially.
Map! Map! Stop thinking in such an imperative way ;-)
Key points of my implementation:
1. It also uses heap (simple one from pure-fun)
2. It is compiled with ocamlopt (ocamlc version is 6 times slower)
3. It doesn't use Scanf. For such linear task as this Scanf takes 10 or
more times to parse input then actual computations.
4. There is little trick, that heap holds 101 elements, and always
discards last one, so we don't need to check if newly inserted
element is really bigger then min we just deleted.
5. The code isn't very pretty (it would be better to make heap functor
with item count, and so on). It is left as an exercise to the reader.
You're free to use it. I also attach program I used to create input.
--
: Michal Moskal :: http://www.kernel.pl/~malekith : GCS {C,UL}++++$ a? !tv
: When in doubt, use brute force. -- Ken Thompson : {E-,w}-- {b++,e}>+++ h
[-- Attachment #2: stats.ml --]
[-- Type: text/plain, Size: 4105 bytes --]
exception Empty
module Heap (Element : Map.OrderedType) =
struct
module Elem = Element
type heap = E | T of int * Elem.t * heap * heap
let rank = function E -> 0 | T (r,_,_,_) -> r
let makeT x a b =
if rank a >= rank b then T (rank b + 1, x, a, b)
else T (rank a + 1, x, b, a)
let empty = E
let is_empty h = h = E
let rec merge h1 h2 = match h1, h2 with
| _, E -> h1
| E, _ -> h2
| T (_, x, a1, b1), T (_, y, a2, b2) ->
if Elem.compare x y <= 0 then makeT x a1 (merge b1 h2)
else makeT y a2 (merge h1 b2)
let insert x h = merge (T (1, x, E, E)) h
let find_min = function E -> raise Empty | T (_, x, _, _) -> x
let delete_min = function E -> raise Empty | T (_, x, a, b) -> merge a b
let flatten x =
let rec loop acc = function
| E -> acc
| T (_, x, a, b) -> loop (loop (x :: acc) a) b
in loop [] x
end
module Dir = struct
type t =
{
size : int;
name : string;
parent : t option;
no : int;
}
let compare x y = x.size - y.size
end
module Int = struct
type t = int
let compare x y = x - y
end
module File = struct
type t =
{
size : int;
dir : Dir.t;
name : string;
}
let compare x y = x.size - y.size
end
module Dir_set = Map.Make(Int)
module Dir_heap = Heap(Dir)
module File_heap = Heap(File)
type line =
| D of int * int * string
| F of int * string
| Eof
let limit = 100
let read () =
try
let l = read_line () in
let rec scan_int pos =
match l.[pos] with
| '0' .. '9' -> scan_int (pos + 1)
| _ -> pos
in
let int_from pos =
let xend = scan_int pos in
(int_of_string (String.sub l pos (xend - pos)), xend + 1)
in
match l.[0] with
| 'D' ->
let (x, p) = int_from 2 in
let (y, p) = int_from p in
let s = String.sub l p (String.length l - p) in
D (x, y, s)
| 'F' ->
let (x, p) = int_from 2 in
let s = String.sub l p (String.length l - p) in
F (x, s)
| _ -> invalid_arg "read"
with End_of_file -> Eof
let rec loop dh fh ds cur_dir =
let push_dir () =
let (dh_sz, dh) = dh in
if dh_sz > limit then
(dh_sz, Dir_heap.insert cur_dir (Dir_heap.delete_min dh))
else
(dh_sz + 1, Dir_heap.insert cur_dir dh)
in
function
| Eof ->
(push_dir (), fh)
| D (no, par, nam) ->
let cur_dir =
{
Dir.parent = Some (Dir_set.find par ds);
Dir.name = nam;
Dir.size = 0;
Dir.no = no;
}
in loop (push_dir ()) fh (Dir_set.add cur_dir.Dir.no cur_dir ds) cur_dir (read ())
| F (sz, name) ->
let f = { File.size = sz; File.name = name; File.dir = cur_dir } in
let fh =
let (fh_sz, fh) = fh in
if fh_sz > limit then
(fh_sz, File_heap.insert f (File_heap.delete_min fh))
else
(fh_sz + 1, File_heap.insert f fh)
in loop dh fh ds {cur_dir with Dir.size = cur_dir.Dir.size + sz} (read ())
let main =
let start nam =
let dir = { Dir.name = nam; Dir.size = 0; Dir.no = 0; Dir.parent = None } in
let ds = Dir_set.add 0 dir Dir_set.empty in
loop (0, Dir_heap.empty) (0, File_heap.empty) ds dir (read ())
in
let l = read_line () in
let (dh, fh) = Scanf.sscanf l "R 0 %[^\n]" start in
let dir_name d =
let rec loop acc = function
| { Dir.name = n; Dir.parent = None } -> n :: acc
| { Dir.name = n; Dir.parent = Some p } -> loop (n :: acc) p
in String.concat "/" (loop [] d)
in
let print_dir d =
Printf.printf "%10d %s\n" d.Dir.size (dir_name d)
in
let print_file f =
Printf.printf "%10d %s/%s\n" f.File.size (dir_name f.File.dir) f.File.name
in
let dh =
let (dh_sz, dh) = dh in
if dh_sz > limit then Dir_heap.delete_min dh
else dh
in
Printf.printf "DIRS:\n";
List.iter print_dir (List.sort Dir.compare (Dir_heap.flatten dh));
let fh =
let (fh_sz, fh) = fh in
if fh_sz > limit then File_heap.delete_min fh
else fh
in
Printf.printf "FILES:\n";
List.iter print_file (List.sort File.compare (File_heap.flatten fh));
[-- Attachment #3: mkd.ml --]
[-- Type: text/plain, Size: 695 bytes --]
let id = ref 0
let rec go dir_name par_id =
Unix.chdir dir_name;
incr id;
let id = !id in
Printf.printf "D %d %d %s\n" id par_id dir_name;
let d = Unix.opendir "." in
let dirs = ref [] in
let rec loop () =
let f = Unix.readdir d in
let st = Unix.lstat f in
let _ =
match f.[0], st.Unix.st_kind with
| '.', _ -> ()
| _, Unix.S_DIR -> dirs := f :: !dirs
| _, Unix.S_REG -> Printf.printf "F %d %s\n" st.Unix.st_size f
| _ -> ()
in loop ()
in
begin
List.iter (fun f -> go f id)
(try
loop ()
with End_of_file -> Unix.closedir d; !dirs)
end;
Unix.chdir ".."
let _ =
Printf.printf "R 0 .\n";
go "." 0
next prev parent reply other threads:[~2003-10-02 14:55 UTC|newest]
Thread overview: 6+ messages / expand[flat|nested] mbox.gz Atom feed top
2003-10-02 12:31 Ram Bhamidipaty
2003-10-02 14:53 ` Michal Moskal [this message]
2003-10-08 14:48 ` Pierre Weis
2003-10-08 16:09 ` Michal Moskal
2003-10-08 20:34 ` Pierre Weis
2003-10-08 21:22 ` scanf (Re: [Caml-list] what is the functional way to solve this problem?) Michal Moskal
Reply instructions:
You may reply publicly to this message via plain-text email
using any one of the following methods:
* Save the following mbox file, import it into your mail client,
and reply-to-all from there: mbox
Avoid top-posting and favor interleaved quoting:
https://en.wikipedia.org/wiki/Posting_style#Interleaved_style
* Reply using the --to, --cc, and --in-reply-to
switches of git-send-email(1):
git send-email \
--in-reply-to=20031002145327.GA4690@roke.freak \
--to=malekith@pld-linux.org \
--cc=caml-list@pauillac.inria.fr \
--cc=ramb@sonic.net \
/path/to/YOUR_REPLY
https://kernel.org/pub/software/scm/git/docs/git-send-email.html
* If your mail client supports setting the In-Reply-To header
via mailto: links, try the mailto: link
Be sure your reply has a Subject: header at the top and a blank line
before the message body.
This is a public inbox, see mirroring instructions
for how to clone and mirror all data and code used for this inbox