From: Paul Stodghill <stodghil@CS.Cornell.EDU>
To: caml-list@inria.fr
Subject: [Caml-list] matrix-matrix multiply - O'Caml is 6 times slower than C
Date: Fri, 18 Oct 2002 16:09:59 -0400 [thread overview]
Message-ID: <r99u1jjh5k8.fsf@quimby-xp.cs.cornell.edu> (raw)
[-- Attachment #1: Type: text/plain, Size: 1959 bytes --]
Doug Bagley's Great Language Shootout says that O'Caml and GCC get about
the same performance for Matrix-Matrix Multiplication (MMM). I'm seeing
somewhat different results for my code. In particular, on my machine
(1.2GHz PIIIM, 256M), the O'Caml version of the code is 6 times slower
than the C version.
The codes are attached. A few of the differences between my code and
Doug's code is that I have tiled the loops and I use constants for the
matrices and the blocksize. I have not (yet) hoisted the invariant
expressions.
Looking at the assembly produced by O'Caml and GCC, it appears that GCC
is performance loop unrolling (as requested with -funroll-loops) and
strength reduction in the inner loops. I can easily see why these two
optimizations can result in such a tremendous performance difference.
My question is this: I can obviously performance loop unrolling myself
by hand - does ocamlopt perform strength reduction? Is there anyway that
I can get the O'Caml code to close to the performance of the C code?
I can provide additonal information about my set up if that would help.
Thanks.
quimby-xp$ uname -a
CYGWIN_NT-5.1 QUIMBY-XP 1.3.13(0.62/3/2) 2002-10-13 23:15 i686 unknown
quimby-xp$ ocamlopt.opt -v
The Objective Caml native-code compiler, version 3.06
Standard library directory: /usr/local/ocaml/lib/ocaml
quimby-xp$ gcc --version
gcc (GCC) 3.2 20020818 (prerelease)
Copyright (C) 2002 Free Software Foundation, Inc.
This is free software; see the source for copying conditions. There is NO
warranty; not even for MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.
quimby-xp$ ocamlopt.opt -S -o mmm_ml.exe unix.cmxa mmm_ml.ml
quimby-xp$ gcc -O4 -funroll-loops -o mmm_c.exe mmm_c.c
quimby-xp$ ./mmm_ml
(* 86.83 mflops *)
quimby-xp$ ./mmm_ml
(* 88.16 mflops *)
quimby-xp$ ./mmm_ml
(* 89.07 mflops *)
quimby-xp$ ./mmm_c
/* 523.64 mflops */
quimby-xp$ ./mmm_c
/* 523.64 mflops */
quimby-xp$ ./mmm_c
/* 523.64 mflops */
quimby-xp$
[-- Attachment #2: mmm_ml.ml --]
[-- Type: application/octet-stream, Size: 1683 bytes --]
(* ocamlopt.opt -S -o mmm_ml.exe unix.cmxa mmm_ml.ml *)
open Printf
let t = 5;;
let n = 120;;
let rec f create mmm n =
let (a,b,c) = create () in
(* do the work once to load the cache *)
mmm n a b c;
(* do the work t times and time it *)
let begin_time = Unix.gettimeofday () in
let _ =
for t = 1 to t do
mmm n a b c
done
in let end_time = Unix.gettimeofday () in
let n_f = float_of_int n in
let total_flops = 2.*.(float_of_int t)*.n_f*.n_f*.n_f in
let mflops = (total_flops /. (end_time -. begin_time)) /. 1e6 in
printf "(* %.2f mflops *)\n" mflops;
()
;;
(* ***************************************************************** *)
let create_arrays () =
let a = Array.create (n*n) 0. and
b = Array.create (n*n) 0. and
c = Array.create (n*n) 0. in
(* Init a and b*)
for i = 0 to n-1 do
for j = 0 to n-1 do
a.(i*n+j) <- (float_of_int (i+j));
b.(i*n+j) <- (float_of_int (i+j));
done
done;
(a,b,c)
(* ***************************************************************** *)
let mmm n (a: float array) (b: float array) (c: float array) =
assert (120 mod 30 = 0);
let num_blocks = 120/30 in
for bi = 0 to num_blocks-1 do
for bj = 0 to num_blocks-1 do
for bk = 0 to num_blocks-1 do
for i = bi*30 to (bi+1)*30-1 do
for j = bj*30 to (bj+1)*30-1 do
for k = bk*30 to (bk+1)*30-1 do
Array.unsafe_set c (i*120+j)
((Array.unsafe_get c (i*120+j))
+. (Array.unsafe_get a (i*120+k))
*. (Array.unsafe_get b (k*120+j)));
done
done
done
done
done
done
;;
(* 83.08 mflops (0.03% of peak) *)
let _ = f create_arrays mmm n;;
[-- Attachment #3: mmm_c.c --]
[-- Type: application/octet-stream, Size: 1304 bytes --]
#include <assert.h>
#include <stdio.h>
#include <sys/time.h>
#define T 5
#define N 120
#define BB 30
static double A[N*N];
static double B[N*N];
static double C[N*N];
void mmm();
int main(int argc, char** argv)
{
int i, j, k;
int t;
struct timeval start_time;
struct timeval end_time;
double seconds;
double total_flops;
double mflops;
/* init the matrices */
for (i=0; i<N; i++)
for (j=0; j<N; j++)
A[i*N+j] = B[i*N+j] = i+j;
/* warm the cache */
mmm();
/* do the work T times and time it */
gettimeofday(&start_time,NULL);
for (t=1; t<=T; t++)
mmm();
gettimeofday(&end_time,NULL);
seconds = (double)(end_time.tv_usec - start_time.tv_usec)/1e6;
seconds += (double)(end_time.tv_sec - start_time.tv_sec);
total_flops = 2.*(double)T*(double)N*(double)N*(double)N;
mflops = (total_flops / seconds) / 1e6;
printf("/* %.2f mflops */\n", mflops);
}
#define NUM_BLOCKS (N/BB)
void mmm()
{
int i, j, k;
int bi, bj, bk;
assert (N % BB == 0);
for (bi=0; bi<NUM_BLOCKS; bi++)
for (bj=0; bj<NUM_BLOCKS; bj++)
for (bk=0; bk<NUM_BLOCKS; bk++)
for (i=bi*BB; i<(bi+1)*BB; i++)
for (j=bj*BB; j<(bj+1)*BB; j++)
for (k=bk*BB; k<(bk+1)*BB; k++)
C[i*N+j] += A[i*N+k] * B[k*N+j];
}
next reply other threads:[~2002-10-18 20:10 UTC|newest]
Thread overview: 8+ messages / expand[flat|nested] mbox.gz Atom feed top
2002-10-18 20:09 Paul Stodghill [this message]
2002-10-20 9:42 ` Xavier Leroy
2002-10-20 18:06 ` Paul Stodghill
2002-10-20 21:58 ` Issac Trotts
2002-10-23 10:16 ` [Caml-list] Re: float boxing (was: matrix-matrix multiply) Christophe TROESTLER
2002-10-23 12:08 ` malc
2002-10-24 0:00 ` malc
2002-10-21 12:53 ` [Caml-list] matrix-matrix multiply - O'Caml is 6 times slower than C Xavier Leroy
Reply instructions:
You may reply publicly to this message via plain-text email
using any one of the following methods:
* Save the following mbox file, import it into your mail client,
and reply-to-all from there: mbox
Avoid top-posting and favor interleaved quoting:
https://en.wikipedia.org/wiki/Posting_style#Interleaved_style
* Reply using the --to, --cc, and --in-reply-to
switches of git-send-email(1):
git send-email \
--in-reply-to=r99u1jjh5k8.fsf@quimby-xp.cs.cornell.edu \
--to=stodghil@cs.cornell.edu \
--cc=caml-list@inria.fr \
/path/to/YOUR_REPLY
https://kernel.org/pub/software/scm/git/docs/git-send-email.html
* If your mail client supports setting the In-Reply-To header
via mailto: links, try the mailto: link
Be sure your reply has a Subject: header at the top and a blank line
before the message body.
This is a public inbox, see mirroring instructions
for how to clone and mirror all data and code used for this inbox