benchmarks
Matthew Fluet
Matthew Fluet <fluet@CS.Cornell.EDU>
Fri, 9 Nov 2001 10:33:22 -0500 (EST)
> > Strangely, there are some decent speedups in ratio-regions and
> > wc-input1, a 2X speed up in matrix-multiply, and an amazing 9X
> > speedup in md5!
>
> Truly amazing. Hopefully we can figure it out and attribute it to
> something other than caching effects. We should keep in mind that
> another possibility is bugs :-(.
I think that md5 is a real speedup. What I believe happened is there were
two functions that were just a little too big to be inlined by the CPS
inliner; the second round of flattening in SSA eliminated enough selects
from those functions so that the SSA inliner did inline them.
(There is more discussion after all the code.)
md5.inlineCPS.post.cps:
fun transform_1 (x_1018) = ...
where
fun L_1802 () [] =
let
val x_3362 = Word8Vector_subWord (x_1026, x_3630)
val x_1028 = (x_1021,
x_1023,
x_1024,
x_1025,
x_3470,
global_182,
global_195)
fun L_379 = ...
in
L_379 (x_1027 (x_1028))
end
fun L_379 (x_1183) [] =
let
val x_1029 = (x_1025,
x_1183,
x_1023,
x_1024,
x_3386,
global_180,
global_194)
fun L_381 = ...
in
L_381 (x_1027 (x_1029))
end
... 13 more non-tail calls to x_1027 ...
fun L_407 (x_1169) [] =
let
val x_1043 = (x_1172,
x_1169,
x_1170,
x_1171,
x_3362,
global_176,
global_177)
fun L_409 = ...
in
L_409 (x_1027 (x_1043))
end
fun L_409 (x_1168) [] =
let
val x_1045 = (x_1171,
x_1168,
x_1169,
x_1170,
x_3386,
global_162,
global_175)
fun L_411 = ...
in
L_411 (x_1044 (x_1045))
end
... 14 more non-tail calls to x_1044 ...
fun L_439 (x_1153) [] =
let
val x_1060 = (x_1156,
x_1153,
x_1154,
x_1155,
x_3422,
global_156,
global_157)
fun L_441 = ...
in
L_441 (x_1044 (x_1060))
end
fun L_441 (x_1152) [] =
let
val x_3629 = Word32_xorb (x_1153, x_1154)
... > 300 bit fiddles ...
val x_1100 = (x_2486, x_2485, x_2484, x_2483)
in
x_1100
end
fun x_1044 (x_1295) =
let
val x_1303 = #7 x_1295
val x_1302 = #6 x_1295
val x_1301 = #5 x_1295
val x_1297 = #4 x_1295
val x_1300 = #3 x_1295
val x_1298 = #2 x_1295
val x_1299 = #1 x_1295
val x_2516 = Word32_andb (x_1298, x_1297)
val x_2517 = Word32_notb (x_1297)
val x_2515 = Word32_andb (x_1300, x_2517)
val x_2514 = Word32_orb (x_2516, x_2515)
val x_2513 = Word32_add (x_2514, x_1301)
val x_2512 = Word32_add (x_2513, x_1303)
val x_2511 = Word32_add (x_1299, x_2512)
fun L_637 = ...
fun L_1929 = ...
fun L_1930 = ...
val x_3725 = Word32_ge (x_1302, global_198)
in
case x_3725 of
false => L_1930 | true => L_1929
end
where
fun L_1930 () [] =
let
val x_3726 = Word32_lshift (x_2511, x_1302)
in
L_637 (x_3726)
end
fun L_1929 () [] =
L_637 (global_74)
fun L_1931 () [] =
let
val x_3727 = Word32_rshift (x_2511, x_1317)
in
L_639 (x_3727)
end
fun L_1932 () [] =
L_639 (global_74)
fun L_639 (x_1315) [] =
let
val x_2519 = Word32_orb (x_1316, x_1315)
val x_2518 = Word32_add (x_2519, x_1298)
in
x_2518
end
fun L_637 (x_1316) [] =
let
val x_1317 = Word32_sub (global_198, x_1302)
fun L_639 = ...
fun L_1932 = ...
fun L_1931 = ...
val x_3728 = Word32_ge (x_1317, global_198)
in
case x_3728 of
false => L_1931 | true => L_1932
end
fun x_1027 (x_1325) =
let
val x_1333 = #7 x_1325
val x_1332 = #6 x_1325
val x_1331 = #5 x_1325
val x_1330 = #4 x_1325
val x_1327 = #3 x_1325
val x_1328 = #2 x_1325
val x_1329 = #1 x_1325
val x_2525 = Word32_andb (x_1328, x_1327)
val x_2526 = Word32_notb (x_1328)
val x_2524 = Word32_andb (x_2526, x_1330)
val x_2523 = Word32_orb (x_2525, x_2524)
val x_2522 = Word32_add (x_2523, x_1331)
val x_2521 = Word32_add (x_2522, x_1333)
val x_2520 = Word32_add (x_1329, x_2521)
fun L_659 = ...
fun L_1933 = ...
fun L_1934 = ...
val x_3729 = Word32_ge (x_1332, global_198)
in
case x_3729 of
false => L_1934 | true => L_1933
end
where
fun L_1934 () [] =
let
val x_3730 = Word32_lshift (x_2520, x_1332)
in
L_659 (x_3730)
end
fun L_1933 () [] =
L_659 (global_74)
fun L_1935 () [] =
let
val x_3731 = Word32_rshift (x_2520, x_1347)
in
L_661 (x_3731)
end
fun L_1936 () [] =
L_661 (global_74)
fun L_661 (x_1345) [] =
let
val x_2528 = Word32_orb (x_1346, x_1345)
val x_2527 = Word32_add (x_2528, x_1328)
in
x_2527
end
fun L_659 (x_1346) [] =
let
val x_1347 = Word32_sub (global_198, x_1332)
fun L_661 = ...
fun L_1936 = ...
fun L_1935 = ...
val x_3732 = Word32_ge (x_1347, global_198)
in
case x_3732 of
false => L_1935 | true => L_1936
end
md5.inlineSSA.pre.ssa: (i.e., after flattenSSA)
fun x_1027 (x_4048, x_4047, x_4046, x_4045, x_4044, x_4043, x_4042) =
L_2188()
L_2188 ()
x_2525 = Word32_andb (x_4044, x_4043)
x_2526 = Word32_notb (x_4043)
x_2524 = Word32_andb (x_2526, x_4045)
x_2523 = Word32_orb (x_2524, x_2525)
x_2522 = Word32_add (x_2523, x_4046)
x_2521 = Word32_add (x_2522, x_4048)
x_2520 = Word32_add (x_4042, x_2521)
x_3729 = Word32_ge (x_4047, global_383)
case x_3729 of
false => L_1934 | true => L_2097
L_1934 ()
x_3730 = Word32_lshift (x_2520, x_4047)
L_659 (x_3730)
L_2097 ()
L_659 (global_384)
L_659 (x_1346)
x_1347 = Word32_sub (global_383, x_4047)
x_3732 = Word32_ge (x_1347, global_383)
case x_3732 of
false => L_1935 | true => L_2098
L_1935 ()
x_3731 = Word32_rshift (x_2520, x_1347)
L_661 (x_3731)
L_2098 ()
L_661 (global_384)
L_661 (x_1345)
x_2528 = Word32_orb (x_1346, x_1345)
x_2527 = Word32_add (x_2528, x_4043)
x_2527
fun x_1044 (x_4041, x_4040, x_4039, x_4038, x_4037, x_4036, x_4035) =
L_2187()
L_2187 ()
x_2516 = Word32_andb (x_4038, x_4036)
x_2517 = Word32_notb (x_4038)
x_2515 = Word32_andb (x_4037, x_2517)
x_2514 = Word32_orb (x_2515, x_2516)
x_2513 = Word32_add (x_2514, x_4039)
x_2512 = Word32_add (x_2513, x_4041)
x_2511 = Word32_add (x_4035, x_2512)
x_3725 = Word32_ge (x_4040, global_383)
case x_3725 of
false => L_1930 | true => L_2095
L_1930 ()
x_3726 = Word32_lshift (x_2511, x_4040)
L_637 (x_3726)
L_2095 ()
L_637 (global_384)
L_637 (x_1316)
x_1317 = Word32_sub (global_383, x_4040)
x_3728 = Word32_ge (x_1317, global_383)
case x_3728 of
false => L_1931 | true => L_2096
L_1931 ()
x_3727 = Word32_rshift (x_2511, x_1317)
L_639 (x_3727)
L_2096 ()
L_639 (global_384)
L_639 (x_1315)
x_2519 = Word32_orb (x_1315, x_1316)
x_2518 = Word32_add (x_2519, x_4036)
x_2518
fun transform_1 (x_4034, x_4033, x_4032) = L_2138()
...
L_1802 ()
x_3362 = Word8Vector_subWord (x_4034, x_3630)
L_379 (x_1027 (global_386,
global_387,
x_3470,
x_1025,
x_1024,
x_1023,
x_1021)) None
... 14 more non-tail calls to x_1027 ...
L_407 (x_1169)
L_409 (x_1027 (global_405,
global_393,
x_3362,
x_1171,
x_1170,
x_1169,
x_1172)) None
L_409 (x_1168)
L_411 (x_1044 (global_406,
global_407,
x_3386,
x_1170,
x_1169,
x_1168,
x_1171)) None
... 14 more non-tail calls to x_1044 ...
L_439 (x_1153)
L_441 (x_1044 (global_425,
global_413,
x_3422,
x_1155,
x_1154,
x_1153,
x_1156)) None
L_441 (x_1152)
x_3629 = Word32_xorb (x_1154, x_1153)
... > 300 bit fiddles ...
x_2483 = Word32_add (x_1025, x_3278)
(x_2483, x_2484, x_2485, x_2486)
Now we inline all calls to x_1027 and x_1044.
The speed up seems primarily due to eliminating the ephemeral tuple
allocation and selects and elimination of the non-tail calls. Of course,
we should be paying for it in code size, but turns out we're not:
release cps & ssa simplification
md5 34,038 30,249
Not entirely sure why, although considering the size of those inlined
functions, the setup and return from a non-tail call, plus a limit check
at function entry, may just be too big. That might suggest that the
defaults for inlining could do with a little more tweaking (although, the
SSA inliner is using a slightly different size heuristic anyways, so we
should probably try some different numbers).
And, it gets even better -- after inlining, all of the conditional
branches that were in x_1027 and x_1044 are based on arithmetic and
comparisons on globals; so the SSA shrinker should constant fold and
eliminate the conditional test. What will be left is a (huge) block with
approximately 620 bit fiddles -- a pipelined processor's dream come true!