From mboxrd@z Thu Jan  1 00:00:00 1970
Return-Path: <thelema314@gmail.com>
X-Spam-Checker-Version: SpamAssassin 3.1.3 (2006-06-01) on yquem.inria.fr
X-Spam-Level: *
X-Spam-Status: No, score=1.0 required=5.0 tests=AWL,SPF_NEUTRAL 
	autolearn=disabled version=3.1.3
Received: from mail4-relais-sop.national.inria.fr (mail4-relais-sop.national.inria.fr [192.134.164.105])
	by yquem.inria.fr (Postfix) with ESMTP id B7EA0BC69
	for <caml-list@yquem.inria.fr>; Thu,  3 Jan 2008 18:11:59 +0100 (CET)
X-IronPort-Anti-Spam-Filtered: true
X-IronPort-Anti-Spam-Result: AgAAALelfEdA6aaxkGdsb2JhbACQDwEBAQEHBAQkB4EUknqFWg
X-IronPort-AV: E=Sophos;i="4.24,240,1196636400"; 
   d="scan'208";a="20879209"
Received: from py-out-1112.google.com ([64.233.166.177])
  by mail4-smtp-sop.national.inria.fr with ESMTP; 03 Jan 2008 18:11:59 +0100
Received: by py-out-1112.google.com with SMTP id v53so2083064pyh.33
        for <caml-list@yquem.inria.fr>; Thu, 03 Jan 2008 09:11:58 -0800 (PST)
DKIM-Signature: v=1; a=rsa-sha256; c=relaxed/relaxed;
        d=gmail.com; s=gamma;
        h=domainkey-signature:received:received:subject:from:to:cc:in-reply-to:references:content-type:date:message-id:mime-version:x-mailer:content-transfer-encoding;
        bh=zulr1X4v6vhcdzgJLPSRVbbq9RT7RuQTtyaUypNWvLo=;
        b=jrgMXzcWvUZcMgsCmUV82pOdcbf3H8DZBDpPUUj92MsgHA+Qdtx3ka8z4YgP/K+SFW3zu43mvT5hqmBdt/auSHHiof+wM7a2b3T004Y634WtWAde6Isri0djrEqx5AUoQsiOleLIGGwrRZKiksNHAlIdaUFtH+yQClm5ryLTa5Y=
DomainKey-Signature: a=rsa-sha1; c=nofws;
        d=gmail.com; s=gamma;
        h=subject:from:to:cc:in-reply-to:references:content-type:date:message-id:mime-version:x-mailer:content-transfer-encoding;
        b=sT1H1Fo4tHWboYnLCqXn51VA6dWXta3t8264ZYO15ffroyTz/rEyW9lLw9ANa3HpNgbz1MdqhebvxH1noVjwi9vBW758RJVxBCf5hwlnC8BLz00Xirqj+GWi0tW+gJqUi59zhdqnbn99twVCWMic02UCFvNJKuGakxnubQSRyG8=
Received: by 10.35.26.14 with SMTP id d14mr19429364pyj.8.1199380318160;
        Thu, 03 Jan 2008 09:11:58 -0800 (PST)
Received: from ?192.168.0.14? ( [69.152.95.238])
        by mx.google.com with ESMTPS id p77sm26719432pyb.16.2008.01.03.09.11.56
        (version=TLSv1/SSLv3 cipher=RC4-MD5);
        Thu, 03 Jan 2008 09:11:57 -0800 (PST)
Subject: Re: [Caml-list] Performance questions, -inline, ...
From: Edgar Friendly <thelema314@gmail.com>
To: Kuba Ober <ober.14@osu.edu>
Cc: caml-list@yquem.inria.fr
In-Reply-To: <200801031128.30183.ober.14@osu.edu>
References: <200801031128.30183.ober.14@osu.edu>
Content-Type: text/plain
Date: Thu, 03 Jan 2008 11:11:45 -0600
Message-Id: <1199380305.6057.13.camel@thelema-laptop>
Mime-Version: 1.0
X-Mailer: Evolution 2.12.1 
Content-Transfer-Encoding: 7bit
X-Spam: no; 0.00; -inline:01 ocamlopt:01 -inline:01 -unsafe:01 mips:01 utime:01 vectors:01 inlining:01 cheers:01 mips:01 const:01 accum:01 accum:01 const:01 arrays:01 


On Thu, 2008-01-03 at 11:28 -0500, Kuba Ober wrote:
> I haven't looked at assembly output yet, but I've run into some unexpected
> behavior in my benchmarks.
> 
> This was compiled by ocamlopt -inline 100 -unsafe, the results and code are 
> below (MIPS is obtained by dividing 50 million iterations by (Unix.times 
> ()) . Unix.tms_utime it took to run). I haven't included the timing etc. code 
> (it's part of a larger benchmark).
> 
> What I wonder is why vector-to-vector add is so much faster than (constant) 
> scalar to vector add. Vectors are preinitialized each time with a 1.0000, 
> 1.0001, ... sequence.
> 
> Also, the very bad performance from generic vector-to-vector *with* inlining 
> is another puzzler, whereas generic add of scalar-to-scalar performs 
> similarly to straight-coded one.
> 
> Cheers, Kuba
> 
> * add1: add scalar to scalar   120 MIPS
> * add3: add scalar to vector   250 MIPS
> * add5: add vector to vector   320 MIPS
> * add2: generic add scalar to scalar   100 MIPS
> * add4: generic add vector to vector   38 MIPS
> 
> let start = 1.3
> 
> (* generic scalar operation *)
> let op1 op const nloop =
> 	let accum = ref start in
> 	for i = 1 to nloop do
> 		accum := op !accum const
> 	done
> 
> (* generic vector operation *)
> let op2 op const a b (nloop : int) =
> 	let len = Array.length a in
> 	for j = 0 to len-1 do
> 		for i = 0 to len-1 do
> 			b.(i) <- op a.(i) b.(i)
> 		done;
> 	done
> 
> (** addition **)
> let add1 nloop =
> 	let accum = ref start in
> 	for i = 1 to nloop do
> 		accum := !accum +. addconst
> 	done
> let add2 = op1 ( +. ) addconst
> let add3 a b nloop =
> 	let len = Array.length a in
> 	for j = 0 to len-1 do
> 		for i = 0 to len-1 do
> 			b.(i) <- a.(i) +. addconst
> 		done;
> 	done
> let add4 = op2 ( +. ) addconst
> let add5 a b nloop =
> 	let len = Array.length a in
> 	for j = 0 to len-1 do
> 		for i = 0 to len-1 do
> 			b.(i) <- a.(i) +. b.(i)
> 		done;
> 	done
> 
how about:

(** [op2 op a b nloop] is a generic element-wise vector operation.
    One pass replaces every [b.(i)] with [op a.(i) b.(i)] for [i] ranging
    over the indices of [a]; the pass is repeated exactly [nloop] times
    (no pass at all when [nloop <= 0]).  [b] is updated in place and is
    indexed with the indices of [a], so it must be at least as long as
    [a]. *)
let op2 op a b nloop =
	let len = Array.length a in
	(* 1 to nloop gives exactly nloop passes; starting the counter at 0
	   would run one pass too many.  The counter itself is never read,
	   hence the leading underscore. *)
	for _pass = 1 to nloop do
		for i = 0 to len - 1 do
			b.(i) <- op a.(i) b.(i)
		done
	done

(** [add4 a b nloop] is [op2] specialised to float addition: it hands
    [( +. )] together with [a], [b] and [nloop] to [op2]. *)
let add4 a b nloop = op2 ( +. ) a b nloop


Why does your code have the j loops?  You add a constant (or vector
element) a number of times equal to the length of your vector?  Did you
mean "for j = 1 to nloop do ..."?  And I'd move that loop out of the test
function if I could, into the testing harness.

Arrays of floats have some optimizations built in to the compiler (no
boxing, even though they're not 31-bit values), so you should get about
as good performance as you can expect.

E.