benchmarks

Fri, 9 Nov 2001 10:33:22 -0500 (EST)

> > Strangely, there are some decent speedups in ratio-regions and
> > wc-input1, a 2X speed up in matrix-multiply, and an amazing 9X
> > speedup in md5!
> 
> Truly amazing.  Hopefully we can figure it out and attribute it to
> something other than caching effects.  We should keep in mind that
> another possibility is bugs :-(.

I think that md5 is a real speedup.  What I believe happened is there were
two functions that were just a little too big to be inlined by the CPS
inliner; the second round of flattening in SSA eliminated enough selects
from those functions so that the SSA inliner did inline them. 
(There is more discussion after all the code.)

md5.inlineCPS.post.cps:

fun transform_1 (x_1018) = ...
   where
   fun L_1802 () [] = 
      let
	 val x_3362 = Word8Vector_subWord (x_1026, x_3630)
	 val x_1028 = (x_1021,
		       x_1023,
		       x_1024,
		       x_1025,
		       x_3470,
		       global_182,
		       global_195)
	 fun L_379 = ...
      in
	 L_379 (x_1027 (x_1028))
      end
   fun L_379 (x_1183) [] = 
      let
	 val x_1029 = (x_1025,
		       x_1183,
		       x_1023,
		       x_1024,
		       x_3386,
		       global_180,
		       global_194)
	 fun L_381 = ...
      in
	 L_381 (x_1027 (x_1029))
      end
   ... 13 more non-tail calls to x_1027 ...
   fun L_407 (x_1169) [] = 
      let
	 val x_1043 = (x_1172,
		       x_1169,
		       x_1170,
		       x_1171,
		       x_3362,
		       global_176,
		       global_177)
	 fun L_409 = ...
      in
	 L_409 (x_1027 (x_1043))
      end
   fun L_409 (x_1168) [] = 
      let
	 val x_1045 = (x_1171,
		       x_1168,
		       x_1169,
		       x_1170,
		       x_3386,
		       global_162,
		       global_175)
	 fun L_411 = ...
      in
	 L_411 (x_1044 (x_1045))
      end
   ... 14 more non-tail calls to x_1044 ...
   fun L_439 (x_1153) [] = 
      let
	 val x_1060 = (x_1156,
		       x_1153,
		       x_1154,
		       x_1155,
		       x_3422,
		       global_156,
		       global_157)
	 fun L_441 = ...
      in
	 L_441 (x_1044 (x_1060))
      end
   fun L_441 (x_1152) [] = 
      let
	 val x_3629 = Word32_xorb (x_1153, x_1154)
         ... > 300 bit fiddles ...
	 val x_1100 = (x_2486, x_2485, x_2484, x_2483)
      in
	 x_1100
      end
fun x_1044 (x_1295) = 
   let
      val x_1303 = #7 x_1295
      val x_1302 = #6 x_1295
      val x_1301 = #5 x_1295
      val x_1297 = #4 x_1295
      val x_1300 = #3 x_1295
      val x_1298 = #2 x_1295
      val x_1299 = #1 x_1295
      val x_2516 = Word32_andb (x_1298, x_1297)
      val x_2517 = Word32_notb (x_1297)
      val x_2515 = Word32_andb (x_1300, x_2517)
      val x_2514 = Word32_orb (x_2516, x_2515)
      val x_2513 = Word32_add (x_2514, x_1301)
      val x_2512 = Word32_add (x_2513, x_1303)
      val x_2511 = Word32_add (x_1299, x_2512)
      fun L_637 = ...
      fun L_1929 = ...
      fun L_1930 = ...
      val x_3725 = Word32_ge (x_1302, global_198)
   in
      case x_3725 of
	false => L_1930 | true => L_1929
   end
   where
   fun L_1930 () [] = 
      let
	 val x_3726 = Word32_lshift (x_2511, x_1302)
      in
	 L_637 (x_3726)
      end
   fun L_1929 () [] = 
      L_637 (global_74)
   fun L_1931 () [] = 
      let
	 val x_3727 = Word32_rshift (x_2511, x_1317)
      in
	 L_639 (x_3727)
      end
   fun L_1932 () [] = 
      L_639 (global_74)
   fun L_639 (x_1315) [] = 
      let
	 val x_2519 = Word32_orb (x_1316, x_1315)
	 val x_2518 = Word32_add (x_2519, x_1298)
      in
	 x_2518
      end
   fun L_637 (x_1316) [] = 
      let
	 val x_1317 = Word32_sub (global_198, x_1302)
	 fun L_639 = ...
	 fun L_1932 = ...
	 fun L_1931 = ...
	 val x_3728 = Word32_ge (x_1317, global_198)
      in
	 case x_3728 of
	   false => L_1931 | true => L_1932
      end
fun x_1027 (x_1325) = 
   let
      val x_1333 = #7 x_1325
      val x_1332 = #6 x_1325
      val x_1331 = #5 x_1325
      val x_1330 = #4 x_1325
      val x_1327 = #3 x_1325
      val x_1328 = #2 x_1325
      val x_1329 = #1 x_1325
      val x_2525 = Word32_andb (x_1328, x_1327)
      val x_2526 = Word32_notb (x_1328)
      val x_2524 = Word32_andb (x_2526, x_1330)
      val x_2523 = Word32_orb (x_2525, x_2524)
      val x_2522 = Word32_add (x_2523, x_1331)
      val x_2521 = Word32_add (x_2522, x_1333)
      val x_2520 = Word32_add (x_1329, x_2521)
      fun L_659 = ...
      fun L_1933 = ...
      fun L_1934 = ...
      val x_3729 = Word32_ge (x_1332, global_198)
   in
      case x_3729 of
	false => L_1934 | true => L_1933
   end
   where
   fun L_1934 () [] = 
      let
	 val x_3730 = Word32_lshift (x_2520, x_1332)
      in
	 L_659 (x_3730)
      end
   fun L_1933 () [] = 
      L_659 (global_74)
   fun L_1935 () [] = 
      let
	 val x_3731 = Word32_rshift (x_2520, x_1347)
      in
	 L_661 (x_3731)
      end
   fun L_1936 () [] = 
      L_661 (global_74)
   fun L_661 (x_1345) [] = 
      let
	 val x_2528 = Word32_orb (x_1346, x_1345)
	 val x_2527 = Word32_add (x_2528, x_1328)
      in
	 x_2527
      end
   fun L_659 (x_1346) [] = 
      let
	 val x_1347 = Word32_sub (global_198, x_1332)
	 fun L_661 = ...
	 fun L_1936 = ...
	 fun L_1935 = ...
	 val x_3732 = Word32_ge (x_1347, global_198)
      in
	 case x_3732 of
	   false => L_1935 | true => L_1936
      end

md5.inlineSSA.pre.ssa: (i.e., after flattenSSA)

fun x_1027 (x_4048, x_4047, x_4046, x_4045, x_4044, x_4043, x_4042) =
L_2188()
  L_2188 ()
    x_2525 = Word32_andb (x_4044, x_4043)
    x_2526 = Word32_notb (x_4043)
    x_2524 = Word32_andb (x_2526, x_4045)
    x_2523 = Word32_orb (x_2524, x_2525)
    x_2522 = Word32_add (x_2523, x_4046)
    x_2521 = Word32_add (x_2522, x_4048)
    x_2520 = Word32_add (x_4042, x_2521)
    x_3729 = Word32_ge (x_4047, global_383)
    case x_3729 of
      false => L_1934 | true => L_2097
  L_1934 ()
    x_3730 = Word32_lshift (x_2520, x_4047)
    L_659 (x_3730)
  L_2097 ()
    L_659 (global_384)
  L_659 (x_1346)
    x_1347 = Word32_sub (global_383, x_4047)
    x_3732 = Word32_ge (x_1347, global_383)
    case x_3732 of
      false => L_1935 | true => L_2098
  L_1935 ()
    x_3731 = Word32_rshift (x_2520, x_1347)
    L_661 (x_3731)
  L_2098 ()
    L_661 (global_384)
  L_661 (x_1345)
    x_2528 = Word32_orb (x_1346, x_1345)
    x_2527 = Word32_add (x_2528, x_4043)
    x_2527
fun x_1044 (x_4041, x_4040, x_4039, x_4038, x_4037, x_4036, x_4035) =
L_2187()
  L_2187 ()
    x_2516 = Word32_andb (x_4038, x_4036)
    x_2517 = Word32_notb (x_4038)
    x_2515 = Word32_andb (x_4037, x_2517)
    x_2514 = Word32_orb (x_2515, x_2516)
    x_2513 = Word32_add (x_2514, x_4039)
    x_2512 = Word32_add (x_2513, x_4041)
    x_2511 = Word32_add (x_4035, x_2512)
    x_3725 = Word32_ge (x_4040, global_383)
    case x_3725 of
      false => L_1930 | true => L_2095
  L_1930 ()
    x_3726 = Word32_lshift (x_2511, x_4040)
    L_637 (x_3726)
  L_2095 ()
    L_637 (global_384)
  L_637 (x_1316)
    x_1317 = Word32_sub (global_383, x_4040)
    x_3728 = Word32_ge (x_1317, global_383)
    case x_3728 of
      false => L_1931 | true => L_2096
  L_1931 ()
    x_3727 = Word32_rshift (x_2511, x_1317)
    L_639 (x_3727)
  L_2096 ()
    L_639 (global_384)
  L_639 (x_1315)
    x_2519 = Word32_orb (x_1315, x_1316)
    x_2518 = Word32_add (x_2519, x_4036)
    x_2518
fun transform_1 (x_4034, x_4033, x_4032) = L_2138()
  ...
  L_1802 ()
    x_3362 = Word8Vector_subWord (x_4034, x_3630)
    L_379 (x_1027 (global_386,
		   global_387,
		   x_3470,
		   x_1025,
		   x_1024,
		   x_1023,
		   x_1021)) None
  ... 14 more non-tail calls to x_1027 ...
  L_407 (x_1169)
    L_409 (x_1027 (global_405,
		   global_393,
		   x_3362,
		   x_1171,
		   x_1170,
		   x_1169,
		   x_1172)) None
  L_409 (x_1168)
    L_411 (x_1044 (global_406,
		   global_407,
		   x_3386,
		   x_1170,
		   x_1169,
		   x_1168,
		   x_1171)) None
  ... 14 more non-tail calls to x_1044 ...
  L_439 (x_1153)
    L_441 (x_1044 (global_425,
		   global_413,
		   x_3422,
		   x_1155,
		   x_1154,
		   x_1153,
		   x_1156)) None
  L_441 (x_1152)
    x_3629 = Word32_xorb (x_1154, x_1153)
    ... > 300 bit fiddles ...
    x_2483 = Word32_add (x_1025, x_3278)
    (x_2483, x_2484, x_2485, x_2486)

Now we inline all calls to x_1027 and x_1044.

The speed up seems primarily due to eliminating the ephemeral tuple
allocation and selects and elimination of the non-tail calls.  Of course,
we should be paying for it in code size, but turns out we're not:
                   release   cps & ssa simplification
md5                34,038    30,249
Not entirely sure why, although considering the size of those inlined
functions, the setup and return from a non-tail call, plus a limit check
at function entry, may just be too big.  That might suggest that the
defaults for inlining could do with a little more tweaking (although, the
SSA inliner is using a slightly different size heuristic anyways, so we
should probably try some different numbers).

And, it gets even better -- after inlining, all of the conditional
branches that were in x_1027 and x_1044 are based on arithmetic and
comparisons on globals; so the SSA shrinker should constant fold and
eliminate the conditional test.  What will be left is a (huge) block with
approximately 620 bit fiddles -- a pipelined processor's dream come true!