[MLton-user] Why is recursion slower than C?

Tue Apr 24 08:33:37 PDT 2007

According to
   http://shootout.alioth.debian.org/debian/benchmark.php? 
test=recursive&lang=all
MLton is much slower than C. Indeed, if one looks at the simplest  
case where it's slower (Fibonacci) the difference is over 50%.

In looking over the assembler generated for the SML function
   fun fib n = if n < 2 then 1 else fib (n - 2) + fib (n - 1)
as compared to the C function
   int fib(int n) {
     if (n < 2) {
       return 1;
     }
     return fib(n - 2) + fib(n - 1);
   }
The main differences appear to be that:
   MLton checks for stack and integer overflow (the gcc code doesn't)
   MLton uses jmp instead of call/return
   gcc uses leal to do the subtraction, while MLton uses subl.

What I don't understand is why the MLton version is so much slower.  
The overflow checks don't seem particularly costly. I'm assuming the  
problem is that MLton uses a lot of MOVs+JMP instead of the CALL  
instruction. Is this correct? My assembler is really rusty, so I'm  
not sure what's slower here.

 From a theoretical point of view, shouldn't MLton have a performance  
advantage from not having to conform to the C calling convention?  
Most of the function prelude in gcc's assembler seems superfluous.

Also, it seems MLton is using %ebp for the stack pointer, instead of % 
esp. Why is this? I gather this is why call/return are out.

GCC:
fib:
         pushl   %ebp
         movl    $1, %eax
         movl    %esp, %ebp
         subl    $12, %esp
         movl    %esi, -4(%ebp)
         movl    8(%ebp), %esi
         movl    %ebx, -8(%ebp)
         cmpl    $1, %esi
         jle     .L12
         leal    -2(%esi), %eax
         movl    %eax, (%esp)
         call    fib
         movl    %eax, %ebx
         leal    -1(%esi), %eax
         movl    %eax, (%esp)
         call    fib
         leal    (%eax,%ebx), %eax
.L12:
         movl    -8(%ebp), %ebx
         movl    -4(%ebp), %esi
         movl    %ebp, %esp
         popl    %ebp
         ret

MLton:
fib_0:
L_534:
         cmpl %ebp,((gcState+12)+(0*4)) // guard against stack overflow
         jnb L_533
L_525:
// This seems to all be about growing the stack.
         movl (c_stackP+(0*4)),%esi
         xchgl %esi,%esp
         subl $12,%esp
         pushl $(__LINE__+9)
         pushl $fileName
         pushl $0x0
         pushl $0x0
         pushl $gcState
         addl $8,%ebp
         movl $L_524,(0+(-1*4))(%ebp)
         movl %ebp,((gcState+8)+(0*4))
         movl %esi,((gcState+0)+(0*4))
         call GC_collect
         addl $32,%esp
         movl ((gcState+0)+(0*4)),%esi
         xchgl %esi,%esp
         movl ((gcState+8)+(0*4)),%ebp
         jmp L_524
.p2align 4
.long 21
L_524:
         addl $-8,%ebp
L_533:
// Method starts here
         movl (0+((0*1)+(0*1)))(%ebp),%esi
         cmpl $0x2,%esi
         jl L_1284
L_532:
         subl $0x2,%esi
         jo L_1283 // catch integer overflow
L_530:
         movl %esi,(0+((8*1)+(0*1)))(%ebp)
         addl $8,%ebp
         movl $L_529,(0+(-1*4))(%ebp)
         jmp fib_0 // resumes control at L_529
.p2align 4
L_1283:
L_531:
         movl $0x1,(globalPointerNonRoot+((0*4)+(0*1)))
         movl ((gcState+1028)+(0*4)),%ebp
         addl ((gcState+16)+(0*4)),%ebp
         jmp *(0+(-1*4))(%ebp)
.p2align 4
L_1284:
L_526:
         movl $0x1,(0+((0*1)+(0*1)))(%ebp)
         jmp *(0+(-1*4))(%ebp)
.p2align 4
.long 48
L_529:
         addl $-8,%ebp
         movl (0+((8*1)+(0*1)))(%ebp),%esi
         movl %esi,(0+((4*1)+(0*1)))(%ebp)
         movl (0+((0*1)+(0*1)))(%ebp),%esi
         decl %esi
         movl %esi,(0+((12*1)+(0*1)))(%ebp)
         addl $12,%ebp
         movl $L_528,(0+(-1*4))(%ebp)
         jmp fib_0
.p2align 4
.long 4
L_528:
         addl $-12,%ebp
         movl (0+((12*1)+(0*1)))(%ebp),%esi
         movl %esi,(0+((8*1)+(0*1)))(%ebp)
         movl (0+((4*1)+(0*1)))(%ebp),%edi
         addl %esi,%edi
         jo L_531
L_527:
         movl %edi,(0+((0*1)+(0*1)))(%ebp)
         jmp *(0+(-1*4))(%ebp)