// file kernel/n/x86/toom.S: Toom multiplication of natural integers
/*-----------------------------------------------------------------------+
 |  Copyright 2005-2006, Michel Quercia (michel.quercia@prepas.org)      |
 |                                                                       |
 |  This file is part of Numerix. Numerix is free software; you can      |
 |  redistribute it and/or modify it under the terms of the GNU Lesser   |
 |  General Public License as published by the Free Software Foundation; |
 |  either version 2.1 of the License, or (at your option) any later     |
 |  version.                                                             |
 |                                                                       |
 |  The Numerix Library is distributed in the hope that it will be       |
 |  useful, but WITHOUT ANY WARRANTY; without even the implied warranty  |
 |  of MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU  |
 |  Lesser General Public License for more details.                      |
 |                                                                       |
 |  You should have received a copy of the GNU Lesser General Public     |
 |  License along with the GNU MP Library; see the file COPYING. If not, |
 |  write to the Free Software Foundation, Inc., 59 Temple Place -       |
 |  Suite 330, Boston, MA 02111-1307, USA.                               |
 +-----------------------------------------------------------------------+
 |                                                                       |
 |                          Multiplication de Toom                       |
 |                                                                       |
 +-----------------------------------------------------------------------*/

#if defined(assembly_sn_toommul) || defined(assembly_sn_toomsqr)
        
                         # +------------------------+
                         # |  Addition/subtraction  |
                         # +------------------------+

# purpose:
#   view a as three slices a0 = a[0..p-1], a1 = a[p..2p-1], a2 = a[2p..2p+q-1]
#   and produce a0+a1+a2 together with |a0-a1+a2| and the sign of a0-a1+a2
#
# input:
#   a = natural number of length 2p+q   esi = &a,  edx = p,  ecx = q
#   b = natural number of length 2p+2   edi = &b
# constraints: 0 < q <= p
#
# output:
#   b[0..p]      <-  a[0..p-1] + a[p..2p-1] + a[2p..2p+q-1]
#   b[p+1..2p+1] <- |a[0..p-1] - a[p..2p-1] + a[2p..2p+q-1]|
#   CF           <- sign of a[0..p-1] - a[p..2p-1] + a[2p..2p+q-1]
#                   (the SSE2 path returns CF = 1 when it is negative)
#
# clobbered registers:
#   ecx <- 0
#   eax,ebx,edx,esi,edi,ebp <- undefined

#undef L
#define L(x) .Lsn_fadd_sub3_##x
        ALIGN_32
.Lsn_fadd_sub3:

#ifdef use_sse2

        # SSE2 path: sum and difference are built in a single pass over a.
        # mm3 holds the running sum column, mm4 the running difference column;
        # after each digit is stored, pshufw $0xfe moves the upper dword
        # (the carry) down so that it is added into the next column.
        # The loop counters are negative indices counted up to zero; the MMX
        # instructions sit between incl and the conditional jump, which still
        # tests the ZF produced by incl.
        leal  (%esi,%ecx,4), %esi       # esi <- &a[q]
        leal  (%edi,%ecx,4), %edi       # edi <- &b[q]
        leal  (%esi,%edx,4), %eax       # eax <- &a[p+q]
        leal 4(%edi,%edx,4), %ebp       # ebp <- &b[p+q+1]
        leal  (%eax,%edx,4), %ebx       # ebx <- &a[2p+q]
        subl   %ecx,    %edx            # edx <- p-q
        negl   %ecx                     # ecx <- -q, index running up to 0

        # add/subtract the first q digits (all three slices contribute);
        # digit 0 is handled apart because there is no incoming carry
        movd  (%esi,%ecx,4), %mm3       # mm3 <- a[0]
        movd  (%ebx,%ecx,4), %mm4       # mm4 <- a[2p]
        movd  (%eax,%ecx,4), %mm2       # mm2 <- a[p]
        incl   %ecx                     # ZF <- (q = 1), tested by jz below
        paddq  %mm4,    %mm3            # mm3 <- a[0]+a[2p]
        movq   %mm3,    %mm4            # mm4 <- a[0]+a[2p]
        paddq  %mm2,    %mm3            # mm3 <- a[0]+a[2p]+a[p]
        psubq  %mm2,    %mm4            # mm4 <- a[0]+a[2p]-a[p]
        movd   %mm3, -4(%edi,%ecx,4)    # b[0]   <- low dword of the sum
        movd   %mm4, -4(%ebp,%ecx,4)    # b[p+1] <- low dword of the difference
        pshufw $0xfe,   %mm3, %mm3      # mm3 <- carry (upper dword moved down)
        pshufw $0xfe,   %mm4, %mm4      # mm4 <- signed carry of the difference
        jz     2f                       # q = 1: common part finished
        ALIGN_4
1:
        movd  (%esi,%ecx,4), %mm0       # mm0 <- a0[i]
        movd  (%ebx,%ecx,4), %mm1       # mm1 <- a2[i]
        movd  (%eax,%ecx,4), %mm2       # mm2 <- a1[i]
        incl   %ecx                     # next digit, ZF set after the last one
        paddq  %mm1,    %mm0            # mm0 <- a0[i]+a2[i]
        movq   %mm0,    %mm1            # mm1 <- a0[i]+a2[i]
        paddq  %mm2,    %mm0            # mm0 <- a0[i]+a2[i]+a1[i]
        psubq  %mm2,    %mm1            # mm1 <- a0[i]+a2[i]-a1[i]
        paddq  %mm0,    %mm3            # mm3 <- sum column + carry
        paddq  %mm1,    %mm4            # mm4 <- difference column + carry
        movd   %mm3, -4(%edi,%ecx,4)    # b[i]     <- sum digit
        movd   %mm4, -4(%ebp,%ecx,4)    # b[p+1+i] <- difference digit
        pshufw $0xfe,   %mm3, %mm3      # mm3 <- carry
        pshufw $0xfe,   %mm4, %mm4      # mm4 <- signed carry
        jne    1b
2:

        # continue with the remaining p-q digits (a2 is exhausted)
        leal  (%esi,%edx,4), %esi       # esi <- &a[p]
        leal  (%eax,%edx,4), %eax       # eax <- &a[2p]
        leal  (%edi,%edx,4), %edi       # edi <- &b[p]
        leal  (%ebp,%edx,4), %ebp       # ebp <- &b[2p+1]
        negl   %edx                     # edx <- q-p, ZF <- (p = q)
        jz     2f                       # nothing left when p = q
        ALIGN_4
1:
        movd  (%esi,%edx,4), %mm0       # mm0 <- a0[i]
        movd  (%eax,%edx,4), %mm2       # mm2 <- a1[i]
        movq   %mm0,    %mm1            # mm1 <- a0[i]
        incl   %edx                     # next digit, ZF set after the last one
        paddq  %mm2,    %mm0            # mm0 <- a0[i]+a1[i]
        psubq  %mm2,    %mm1            # mm1 <- a0[i]-a1[i]
        paddq  %mm0,    %mm3            # mm3 <- sum column + carry
        paddq  %mm1,    %mm4            # mm4 <- difference column + carry
        movd   %mm3, -4(%edi,%edx,4)    # b[i]     <- sum digit
        movd   %mm4, -4(%ebp,%edx,4)    # b[p+1+i] <- difference digit
        pshufw $0xfe,   %mm3, %mm3      # mm3 <- carry
        pshufw $0xfe,   %mm4, %mm4      # mm4 <- signed carry
        jne    1b
2:
        movd   %mm3,   (%edi)           # b[p] <- addition carry
        movd   %mm4,    %eax            # eax <- signed carry of the difference
        testl  %eax,    %eax            # CF <- 0, SF <- sign of the difference
        jns    L(positif)

        # negative difference: negate b[p+1..2p] in place
        leal 4(%edi),   %eax
        subl   %ebp,    %eax
        sarl   $2,      %eax            # eax <- -p
        pxor   %mm4,    %mm4            # mm4 <- 0 (carry)
        ALIGN_4
1:
        movd  (%ebp,%eax,4), %mm0       # mm0 <- digit of the difference
        psubq  %mm0,    %mm4            # mm4 <- borrow - digit
        movd   %mm4,   (%ebp,%eax,4)    # store the negated digit
        incl   %eax                     # eax reaches 0 after the last digit
        pshufw $0xfe,   %mm4, %mm4      # mm4 <- borrow
        jne    1b
        stc                             # CF <- 1

L(positif):     
        movl   %eax,   (%ebp)           # b[2p+1] <- subtraction carry (eax = 0 after a negation)
        emms                            # leave MMX state before returning
        ret

#else /* use_sse2 */

        # generic path built on helpers defined outside this section
        # (.Lsn_fadd, .Lsn_fasub, .Lsn_finc_1); the per-call notes below
        # restate the original annotations, the helper contracts are not
        # visible here -- TODO confirm against their definitions
        pushl  %edx                    # save p
        pushl  %edi                    # save &b0

        leal   (%esi,%edx,8), %ebx     # ebx <- &a2
        call   .Lsn_fadd               # b0 <- a0+a2
        adcl   %ecx,    %ecx           # save the carry (presumably ecx = 0 on return -- TODO confirm)
        movl   %ecx,   (%edi)          # store it as the digit following the sum
        
        leal 4(%edi),   %edi           # edi <- &b1
        movl   %esi,    %ebx           # ebx <- &a1
        movl  (%esp),   %esi           # esi <- &b0
        movl 4(%esp),   %ecx           # ecx <- p
        leal 1(%ecx),   %edx           # edx <- p+1
        pushl  %ebx                    # save &a1
        call   .Lsn_fasub              # b1 <- |a0-a1+a2|

        popl   %ebx                    # ebx <- &a1
        popl   %esi                    # esi <- &b0
        popl   %ecx                    # ecx <- p
        pushf                          # save the sign of a0-a1+a2
        call   .Lsn_finc_1             # b0 <- a0+a1+a2
        adcl   %ecx,   (%esi)          # last digit

        popf                           # restore the sign of a0-a1+a2
        ret
        
#endif /* use_sse2 */

        
                        # +-----------------------+
                        # |  Addition with shift  |
                        # +-----------------------+

# purpose:
#   view a as three slices a0 = a[0..p-1], a1 = a[p..2p-1], a2 = a[2p..2p+q-1]
#   and add them with a1 shifted by one digit and a2 shifted by two digits
#
# input:
#   a = natural number of length 2p+q   esi = &a,  edx = p,  ecx = q
#   b = natural number of length p+3    edi = &b
# constraints: 0 < q <= p, p > 2
#
# output:
#   b <-  a[0..p-1] + BASE*a[p..2p-1] + BASE^2*a[2p..2p+q-1]
#
# clobbered registers:
#   eax,ebx,ecx,edx,esi,edi,ebp <- undefined

#undef L
#define L(x) .Lsn_fadd_base_##x
        ALIGN_32
.Lsn_fadd_base:

#ifdef use_sse2

        # SSE2 path: mm3 accumulates each column; after a digit is stored,
        # pshufw $0xfe moves the upper dword (the carry) down into the next
        # column. Loop counters are negative indices counted up to zero.

        # b[0] <- a0[0]
        movl  (%esi),   %eax
        movl   %eax,   (%edi)

        # add the min(p-2,q) digits common to the three slices
        leal -2(%edx),  %eax            # eax <- p-2
        cmpl   %eax,    %ecx            # CF <- (q < p-2)
/*      cmovb  %ecx,    %eax            # eax <- r = min(p-2,q), emitted as raw bytes below */
	.byte  0x0f, 0x42, 0xc1
        leal  8(%esi,%eax,4),%esi       # esi <- &a0[r+2]
        leal -4(%esi,%edx,4),%ebx       # ebx <- &a1[r+1]
        leal -4(%ebx,%edx,4),%ebp       # ebp <- &a2[r]
        leal  8(%edi,%eax,4),%edi       # edi <- &b[r+2]
        subl   %eax,    %ecx            # ecx <- q-r
        subl   %eax,    %edx            # edx <- p-r
        negl   %eax                     # eax <- -r, index running up to 0
        
        movd -4(%esi,%eax,4), %mm3      # mm3 <- a0[1]
        movd -4(%ebx,%eax,4), %mm1      # mm1 <- a1[0]
        paddq  %mm1,    %mm3            # mm3 <- a0[1]+a1[0]
        movd   %mm3, -4(%edi,%eax,4)    # b[1] <- low dword
        pshufw $0xfe,   %mm3, %mm3      # mm3 <- carry
        ALIGN_4
1:
        movd  (%esi,%eax,4), %mm0       # mm0 <- a0[i]
        movd  (%ebx,%eax,4), %mm1       # mm1 <- a1[i-1]
        movd  (%ebp,%eax,4), %mm2       # mm2 <- a2[i-2]
        incl   %eax                     # ZF set after the r-th digit
        paddq  %mm0,    %mm1            # mm1 <- a0[i]+a1[i-1]
        paddq  %mm2,    %mm3            # mm3 <- carry+a2[i-2]
        paddq  %mm1,    %mm3            # mm3 <- full column sum
        movd   %mm3, -4(%edi,%eax,4)    # b[i] <- low dword
        pshufw $0xfe,   %mm3, %mm3      # mm3 <- carry
        jne    1b

        # finish a0
        subl   $2,      %edx            # edx <- p-(r+2)
        jbe    2f                       # r = p-2: a0 already consumed
        leal  (%esi,%edx,4), %esi       # esi <- &a0[p]
        leal  (%ebx,%edx,4), %ebx       # ebx <- &a1[p-1]
        leal  (%edi,%edx,4), %edi       # edi <- &b[p]
        negl   %edx                     # edx <- negative index running up to 0
        ALIGN_4
1:
        movd  (%esi,%edx,4), %mm0       # mm0 <- a0[i]
        movd  (%ebx,%edx,4), %mm1       # mm1 <- a1[i-1]
        incl   %edx                     # ZF set after the last digit of a0
        paddq  %mm0,    %mm1            # mm1 <- a0[i]+a1[i-1]
        paddq  %mm1,    %mm3            # mm3 <- column sum + carry
        movd   %mm3, -4(%edi,%edx,4)    # b[i] <- low dword
        pshufw $0xfe,   %mm3, %mm3      # mm3 <- carry
        jne    1b
2:
        
        # last digit of a1, leftover digits of a2 (ecx = q-r of them) and carries
        movd  (%ebx),   %mm0            # mm0 <- a1[p-1]
        jecxz  1f                       # no digit of a2 left
        movd  (%ebp),   %mm1            # mm1 <- next digit of a2
        paddq  %mm1,    %mm0            # mm0 <- a1[p-1] + that digit
        decl   %ecx                     # one digit of a2 consumed
1:
        paddq  %mm0,    %mm3            # mm3 <- column sum + carry
        movd   %mm3,   (%edi)           # b[p] <- low dword
        pshufw $0xfe,   %mm3, %mm3      # mm3 <- carry
        jecxz  2f                       # no digit of a2 left
        movd 4(%ebp),   %mm0            # mm0 <- last digit of a2
        paddq  %mm0,    %mm3            # mm3 <- that digit + carry
2:
        movd   %mm3,  4(%edi)           # b[p+1] <- low dword
        pshufw $0xfe,   %mm3, %mm3      # mm3 <- carry
        movd   %mm3,  8(%edi)           # b[p+2] <- final carry

        emms                            # leave MMX state before returning
        ret
        
#else  /* use_sse2 */

        # generic path built on helpers defined outside this section
        # (.Lsn_fadd_1, .Lsn_finc); their contracts are not visible here,
        # notes about what they leave in registers are assumptions to confirm
        pushl  %edx                     # save p
        pushl  %ecx                     # save q
        pushl  %edi                     # save &b
        
        # b <- a0 + BASE*a1
        movl  (%esi),   %eax
        movl   %eax,   (%edi)           # b[0] <- a0[0]
        leal  (%esi,%edx,4), %ebx       # ebx <- &a1
        leal 4(%esi),   %esi            # esi <- &a0[1]
        leal 4(%edi),   %edi            # edi <- &b[1]
        leal -1(%edx),  %ecx            # ecx <- p-1
        call  .Lsn_fadd_1

        # presumably on return: ecx = 0, CF = carry, ebx and edi advanced
        # past the p-1 digits just added -- TODO confirm
        movl  (%ebx),   %eax            # eax <- next digit of a1
        adcl   %ecx,    %eax            # eax += CF (+ ecx)
        movl   %eax,   (%edi)           # store that digit
        movl   %ecx,  8(%edi)           # store ecx two digits further
        adcl   %ecx,    %ecx            # ecx <- 2*ecx + CF
        movl   %ecx,  4(%edi)           # store it one digit further

        # b <- b + BASE^2*a2
        popl   %esi                     # esi <- &b
        popl   %ecx                     # ecx <- q
        popl   %edx                     # edx <- p
        leal 4(%ebx),   %ebx            # ebx <- &a2
        leal 8(%esi),   %esi            # esi <- &b[2]
        leal 1(%edx),   %edx            # edx <- p+1
        jmp  .Lsn_finc                  # tail jump: no return to this routine

#endif /* use_sse2 */

#endif /* defined(assembly_sn_toommul) || defined(assembly_sn_toomsqr) */

                            # +------------------+
                            # |  Multiplication  |
                            # +------------------+
        
# input:
#   a = natural number of length la     esi = &a, edx = la
#   b = natural number of length lb     ebx = &b, ecx = lb
#   c = natural number of length la+lb  edi = &c
# constraints: 0 < lb <= la
#
# output:
#   c <- a * b
#
# clobbered registers:
#   eax,ebx,ecx,edx,esi,edi,ebp <- undefined
        
#ifdef assembly_sn_toommul
        ALIGN_4
#ifdef debug_toommul
.Lsn_ftoommul_buggy:
#else
.Lsn_ftoommul:
#endif

#undef L
#define L(x) .Lsn_ftoommul_##x

        # petite multiplication => algorithme de Karatsuba
        cmpl   $toommul_lim, %ecx
        jbe    .Lsn_fkaramul
        
        leal   2(%edx), %eax
        xorl   %edx,    %edx
        movl   $3,      %ebp
        divl   %ebp                     # eax <- p = ceil(la/3)
	movl   %eax,    %ebp            # ebp <- p
        shll   $1,      %eax            # eax <- 2p
        subl   %eax,    %ecx            # ecx <- r = lb - 2p
        jbe    L(tranches)              # si lb <= 2p, dcoupe a en tranches

        # variables locales
        #undef  _a_
        #undef  _b_
        #undef  _c_
        #undef  _d_
        #undef  _e_
        #undef  _f_
        #undef  _p_
        #undef  _q_
        #undef  _r_
        #undef  _x_
        
        #define _a_   36(%esp)
        #define _b_   32(%esp)
        #define _c_   28(%esp)
        #define _d_   24(%esp)
        #define _e_   20(%esp)
        #define _f_   16(%esp)
        #define _p_   12(%esp)
        #define _q_    8(%esp)
        #define _r_    4(%esp)
        #define _x_     (%esp)

        leal  10(%eax,%eax,2), %eax     # eax <- 6p+10
        leal  (,%eax,4), %eax
        ALLOCA                          # rserve 6p+10 chiffres dans la pile
        leal   -2(%edx,%ebp,1), %edx    # edx <- q

        pushl  %esi                     # sauve &a
        pushl  %ebx                     # sauve &b
        pushl  %edi                     # sauve &c
        leal   12(%esp), %esi
        pushl  %esi                     # sauve &d
        leal   8(%esi,%ebp,8), %esi
        pushl  %esi                     # sauve &e
        leal   8(%esi,%ebp,8), %esi
        pushl  %esi                     # sauve &f
        pushl  %ebp                     # sauve p
        pushl  %edx                     # sauve q
        pushl  %ecx                     # sauve r
        pushl  $0                       # x <- 0

        # c[0..p] <- a0 + a1 + a2, c[p+1..2p+1] <- |a0 - a1 + a2|
        movl   _a_,     %esi
        movl   _c_,     %edi
        movl   _p_,     %edx
        movl   _q_,     %ecx
        call   .Lsn_fadd_sub3
        adcl   %ecx,    _x_
        
        # c[2p+2..3p+2] <- b0 + b1 + b2, c[3p+3..4p+3] <- |b0 - b1 + b2|
        movl   _b_,     %esi
        movl   _c_,     %edi
        movl   _p_,     %edx
        movl   _r_,     %ecx
        leal   8(%edi,%edx,8), %edi
        call   .Lsn_fadd_sub3
        adcl   %ecx,    _x_

        # d <- (a0 + a1 + a2)(b0 + b1 + b2) = c0 + c1 + c2 + c3 + c4
        movl   _p_,     %ecx
        movl   %ecx,    %edx
        movl   _c_,     %ebx
        leal  8(%ebx,%ecx,8), %esi
        movl   _d_,     %edi
        movl   $0,  (%edi,%edx,8)
        testl  $-1, (%esi,%edx,4)
        jz     1f
        incl   %edx
1:
        testl  $-1, (%ebx,%ecx,4)
        jz     2f
        incl   %ecx
        xchgl  %ecx,    %edx
        xchgl  %ebx,    %esi
2:
        call   .Lsn_ftoommul
        
        # e <- |a0 - a1 + a2|*|b0 - b1 + b2| = |c0 - c1 + c2 - c3 + c4|
        movl   _p_,     %ecx
        movl   %ecx,    %edx
        movl   _c_,     %ebx
        leal  4(%ebx,%ecx,4), %ebx
        leal  8(%ebx,%ecx,8), %esi
        movl   _e_,     %edi
        movl   $0,  (%edi,%edx,8)
        testl  $-1, (%esi,%edx,4)
        jz     1f
        incl   %edx
1:
        testl  $-1, (%ebx,%ecx,4)
        jz     2f
        incl   %ecx
        xchgl  %ecx,    %edx
        xchgl  %ebx,    %esi
2:
        call   .Lsn_ftoommul
        
        # c[0..p+2] <- a0 + BASE*a1 + BASE^2*a2
        movl   _a_,     %esi
        movl   _c_,     %edi
        movl   _p_,     %edx
        movl   _q_,     %ecx
        call   .Lsn_fadd_base
        
        # c[p+3..2p+5] <- b0 + BASE*b1 + BASE^2*b2
        movl   _b_,     %esi
        movl   _c_,     %edi
        movl   _p_,     %edx
        movl   _r_,     %ecx
        leal 12(%edi,%edx,4), %edi
        call   .Lsn_fadd_base

        # f <- (a0 + BASE*a1 + BASE^2*a2)*(b0 + BASE*b1 + BASE^2*b2)
        #    = c0 + BASE*c1 + BASE^2*c2 + BASE^3*c3 + BASE^4*c4
        movl   _p_,     %ecx
        movl   _f_,     %edi
        xorl   %eax,    %eax
#if 0
        # ceci est inutile, les chiffres de rang 2p+2  2p+5
        # de f vont tre ignors (voir plus bas)
        movl   %eax,    8(%edi,%ecx,8)  # f[2p+2] <- 0
        movl   %eax,   12(%edi,%ecx,8)  # f[2p+3] <- 0
        movl   %eax,   16(%edi,%ecx,8)  # f[2p+4] <- 0
        movl   %eax,   20(%edi,%ecx,8)  # f[2p+5] <- 0
#endif
        addl   $3,      %ecx            # ecx <- p+3
        movl   %ecx,    %edx            # edx <- p+3
        movl   _c_,     %esi
        leal   (%esi,%edx,4), %ebx      # ebx <- &c[p+3]
        cmpl  -4(%esi,%edx,4), %eax     # edx <- lg(a0+BASE*a1+BASE^2*a2)
        adcl   $-1,     %edx
        cmpl  -4(%esi,%edx,4), %eax
        adcl   $-1,     %edx
        cmpl  -4(%ebx,%ecx,4), %eax     # ecx <- lg(b0+BASE*b1+BASE^2*b2)
        adcl   $-1,     %ecx
        cmpl  -4(%ebx,%ecx,4), %eax
        adcl   $-1,     %ecx
        cmpl   %edx,    %ecx            # classe les arguments pour avoir
        jbe    1f                       # ... lg(esi,edx) >= lg(ebx,ecx)
        xchgl  %ecx,    %edx
        xchgl  %ebx,    %esi
1:
        call   .Lsn_ftoommul

        # c[0..2p-1] <- a0*b0 = c0
        movl   _a_,     %esi
        movl   _b_,     %ebx
        movl   _c_,     %edi
        movl   _p_,     %edx
        movl   %edx,    %ecx
        call   .Lsn_ftoommul

        # c[4*p..4p+q+r-1] <- a2*b2 = c4
        movl   _p_,     %eax
        leal   (,%eax,2), %eax          # eax <- 2p
        movl   _a_,     %esi
        movl   _b_,     %ebx
        movl   _c_,     %edi
        leal   (%esi,%eax,4), %esi      # esi <- &a2
        leal   (%ebx,%eax,4), %ebx      # ebx <- &b2
        leal   (%edi,%eax,8), %edi      # edi <- &c[4p]
        movl   _q_,     %edx
        movl   _r_,     %ecx
        call   .Lsn_ftoommul
        
        # point de chute pour toom_sqr
.Lsn_toom_aux:

#ifdef use_sse2
        
        # x:c[2p..4p-1] <- (d+e)/2 = c0 + c2 + c4, d <- (d-e)/2 = c1 + c3
        movl   _p_,     %ecx
        leal   1(,%ecx,2), %ecx         # ecx <- 2p+1
        movl   _d_,     %esi
        movl   _e_,     %ebx
        movl   _c_,     %edi
        leal -4(%edi,%ecx,4), %edi      # edi <- &c[2p]
        movl   %esi,    %edx
        movl -4(%edi,%ecx,4), %eax      # sauve c[4p] dans x
        xchgl  %eax,    _x_
        bt     $0,      %eax            # CF <- signe(e)
        jnc    1f
        xchgl  %edi,    %edx
1:
        call   .Lsn_fhalf_add_sub
        
        # x:c[2p..4p-1] <- c[2p..4p] - c0 - c4 = c2
        movl   _p_,     %edx
        leal   (,%edx,2), %edx
        movl   _c_,     %esi
        movl   _q_,     %ecx
        addl   _r_,     %ecx            # ecx <- q+r
        movl   (%esi,%edx,8), %eax
        xchgl   %eax,    _x_            # restaure c[4p]
        movl    %eax,  (%esi,%edx,8)
        leal   (%esi,%ecx,4), %esi      # esi <- &c0[q+r]
        leal   (%esi,%edx,4), %edi      # edi <- &c2[q+r]
        leal   (%esi,%edx,8), %ebx      # ebx <- &c4[q+r]
        subl   %ecx,    %edx            # edx <- 2p-q-r
        negl   %ecx

        movd  (%esi,%ecx,4), %mm0       # chiffres communs  c0,c2,c4
        movd  (%ebx,%ecx,4), %mm1
        movd  (%edi,%ecx,4), %mm3
        incl   %ecx
        paddq  %mm0,    %mm1
        psubq  %mm1,    %mm3
        movd   %mm3, -4(%edi,%ecx,4)
        ALIGN_4
1:
        movd  (%esi,%ecx,4), %mm0
        movd  (%ebx,%ecx,4), %mm1
        movd  (%edi,%ecx,4), %mm2
        incl   %ecx
        pshufw $0xfe,   %mm3, %mm3
        paddq  %mm0,    %mm1
        paddq  %mm2,    %mm3
        psubq  %mm1,    %mm3
        movd   %mm3, -4(%edi,%ecx,4)
        jne    1b

        leal  (%esi,%edx,4), %esi       # esi <- &c0[2p]
        leal  (%edi,%edx,4), %edi       # edi <- &c2[2p]
        negl   %edx
        jz     2f
        ALIGN_4
1:
        movd  (%esi,%edx,4), %mm0       # fin de c0 et c2
        movd  (%edi,%edx,4), %mm2
        incl   %edx
        pshufw $0xfe,   %mm3, %mm3
        paddq  %mm2,    %mm3
        psubq  %mm0,    %mm3
        movd   %mm3, -4(%edi,%edx,4)
        jne    1b
2:
        pshufw $0xfe,   %mm3, %mm3
        movd   %mm3,    %eax
        addl   %eax,    _x_             # x -= retenue

        # f <- f - c0 - BASE*d - BASE^2*c2 - BASE^4*c4 = BASE*(BASE^2 - 1)*c3
        #
        # rmq1 : f a 2p+6 chiffres mais on s en sert pour calculer -BASE*c3
        # qui tient sur p+q+2 chiffres -> on peut ignorer les chiffres de rang
        # 2p+2  2p+5 (d ailleurs on ne les a peut-tre mme pas calculs)
        #
        # rmq2 : f et c0 ont mme chiffre des units, donc on peut commencer
        # la soustraction au rang 1. Ce n est mme pas la peine de forcer
        # le premier chiffre  zro, on ne s en servira pas

        # traite les chiffres de rang 1,2,3  part car il n y a pas c4
        movl   _f_,     %edi
        leal 4(%edi),   %edi            # edi <- &f[1]
        movl   _c_,     %esi
        leal 4(%esi),   %esi            # esi <- &c0[1]
        movl   _d_,     %ebx            # ebx <- &d[0]
        movl   _p_,     %edx
        leal -4(,%edx,2),%edx           # edx <- 2p-4
        leal  8(%esi,%edx,4), %ebp      # ebp <- &c2[-1]

        movd  (%edi),   %mm1            # mm1 <- f[1]
        movd  (%esi),   %mm2            # mm2 <- c0[1]
        movd  (%ebx),   %mm3            # mm2 <- d[0]
        paddq  %mm2,    %mm3            # mm3 <- c0[1]+d[0]
        psubq  %mm3,    %mm1            # mm1 -= c0[1]+d[0]
        movd   %mm1,   (%edi)           # sauve f[1]

        pshufw $0xfe,   %mm1, %mm1      # mm1 <- retenue
        movd 4(%edi),   %mm0            # mm0 <- f[2]
        movd 4(%esi),   %mm2            # mm2 <- c0[2]
        movd 4(%ebx),   %mm3            # mm2 <- d[1]
        movd 4(%ebp),   %mm4            # mm4 <- c2[0]
        paddq  %mm0,    %mm1            # mm1 <- f[2] + ret
        paddq  %mm2,    %mm3            # mm3 <- c0[2]+d[1]
        psubq  %mm3,    %mm1            # mm1 -= c0[2]+d[1]
        psubq  %mm4,    %mm1            # mm1 -= c2[0]
        movd   %mm1,  4(%edi)           # sauve f[2]

        pshufw $0xfe,   %mm1, %mm1      # mm1 <- retenue
        movd 8(%edi),   %mm0            # mm0 <- f[3]
        movd 8(%esi),   %mm2            # mm2 <- c0[3]
        movd 8(%ebx),   %mm3            # mm2 <- d[2]
        movd 8(%ebp),   %mm4            # mm4 <- c2[1]
        paddq  %mm0,    %mm1            # mm1 <- f[3] + ret
        paddq  %mm2,    %mm3            # mm3 <- c0[3]+d[2]
        psubq  %mm3,    %mm1            # mm1 -= c0[3]+d[2]
        psubq  %mm4,    %mm1            # mm1 -= c2[1]
        movd   %mm1,  8(%edi)           # sauve f[2]

        # partie commune  tous les nombres de longueur min(q+r,2p-4)
        # on fait comme si le minimum vaut q+r, quitte  changer c0 et c4
        movl   _q_,     %ecx
        addl   _r_,     %ecx            # ecx <- q+r
        leal 2(%edx),   %eax            # eax <- 2p-2
        cmpl   %eax,    %ecx
/*      cmova  %eax,    %ecx            # ecx <- min(q+r,2p-2) */
	.byte 0x0f, 0x47, 0xc8
        movd   %eax,    %mm7
        leal 8(%ebp,%edx,4), %eax       # eax <- &c4[-3]
        cmpl   %ecx,    %edx
        jae    1f
        xchgl  %ecx,    %edx            # ecx <- min(q+r,2p-4)
        xchgl  %eax,    %esi            # edx <- max(min(q+r,2p-2),2p-4)
1:
        movd   %edx,    %mm6
        psubq  %mm6,    %mm7            # mm7 <- 2p-2-max
        subl   %ecx,    %edx            # edx <- max-min
        leal 12(%edi,%ecx,4), %edi      # edi <- &f[q+r+4]
        leal 12(%esi,%ecx,4), %esi      # esi <- &c0[q+r+4]
        leal 12(%ebx,%ecx,4), %ebx      # ebx <- &d[q+r+3]
        leal 12(%ebp,%ecx,4), %ebp      # ebp <- &c2[q+r+2]
        leal 12(%eax,%ecx,4), %eax      # ebp <- &c4[q+r]
        negl   %ecx
        
        ALIGN_4
1:
        pshufw $0xfe,   %mm1, %mm1      # mm1 <- retenue
        movd  (%edi,%ecx,4), %mm0       # mm0 <- f[i]
        movd  (%esi,%ecx,4), %mm2       # mm2 <- c0[i]
        movd  (%ebx,%ecx,4), %mm3       # mm3 <- d[i-1]
        movd  (%ebp,%ecx,4), %mm4       # mm4 <- c2[i-2]
        movd  (%eax,%ecx,4), %mm5       # mm5 <- c4[i-4]
        incl   %ecx
        paddq  %mm0,    %mm1            # mm1 <- f[i] + ret
        paddq  %mm2,    %mm3            # mm3 <- c0[i]+d[i-1]
        paddq  %mm4,    %mm5            # mm5 <- c2[i-2]+c4[i-4]
        psubq  %mm3,    %mm1            # mm1 -= c0[i]+d[i-1]
        psubq  %mm5,    %mm1            # mm1 -= c2[i-2]+c4[i-4]
        movd   %mm1, -4(%edi,%ecx,4)    # sauve f[i]
        jne    1b

        # partie suivante commune  quatre nombres (f,c0,d,c2)
        leal  (%edi,%edx,4), %edi       # edi <- &f[2p]
        leal  (%esi,%edx,4), %esi       # esi <- &c0[2p]
        leal  (%ebx,%edx,4), %ebx       # ebx <- &d[2p-1]
        leal  (%ebp,%edx,4), %ebp       # ebp <- &c2[2p-2]
        negl   %edx
        jz     2f
        ALIGN_4
1:
        pshufw $0xfe,   %mm1, %mm1      # mm1 <- retenue
        movd  (%edi,%edx,4), %mm0       # mm0 <- f[i]
        movd  (%esi,%edx,4), %mm2       # mm2 <- c0[i]
        movd  (%ebx,%edx,4), %mm3       # mm3 <- d[i-1]
        movd  (%ebp,%edx,4), %mm4       # mm4 <- c2[i-2]
        incl   %edx
        paddq  %mm0,    %mm1            # mm1 <- f[i] + ret
        paddq  %mm2,    %mm3            # mm3 <- c0[i]+d[i-1]
        psubq  %mm3,    %mm1            # mm1 -= c0[i]+d[i-1]
        psubq  %mm4,    %mm1            # mm1 -= c2[i-2]
        movd   %mm1, -4(%edi,%edx,4)    # sauve f[i]
        jne    1b
2:
        
        # dernire partie commune  trois nombres (f,d,c2)
        movd   %mm7,    %ecx            # ecx <- 2p-2-max
        leal  (%edi,%ecx,4), %edi       # edi <- &f[2p+2]
        leal  (%ebx,%ecx,4), %ebx       # ebx <- &d[2p+1]
        leal  (%ebp,%ecx,4), %ebp       # ebp <- &c2[2p]
        negl   %ecx
        jz     2f
        ALIGN_4
1:
        pshufw $0xfe,   %mm1, %mm1      # mm1 <- retenue
        movd  (%edi,%ecx,4), %mm0       # mm0 <- f[i]
        movd  (%ebx,%ecx,4), %mm3       # mm3 <- d[i-1]
        movd  (%ebp,%ecx,4), %mm4       # mm4 <- c2[i-2]
        incl   %ecx
        paddq  %mm0,    %mm1            # mm1 <- f[i] + ret
        paddq  %mm3,    %mm4            # mm4 <- d[i-1]+c2[i-2]
        psubq  %mm4,    %mm1            # mm1 -= d[i-1]+c2[i-2]
        movd   %mm1, -4(%edi,%ecx,4)    # sauve f[i]
        jne    1b
2:

        # divise f/BASE par 1-BASE^2 -> -c3 mod BASE^(2p+1)
        # ajoute  d -> c1
        # ajoute c1  c[p...] et retranche -c3 de c[3p...]
        # ... le tout en une seule passe !
                
        movl   _p_,     %ecx
        movl   _f_,     %esi
        movl   _d_,     %ebx
        movl   _c_,     %edi
        leal  (%edi,%ecx,4), %edi       # edi <- &c[p]
        leal  (%edi,%ecx,8), %ebp       # ebp <- &c[3p]

        # traite le 1er chiffre  part pour amorcer la division
        movd  4(%esi),  %mm2            # mm2 <- f[1] = -c3[0]
        movd  8(%esi),  %mm1            # mm1 <- f[2] = -c3[1]
        movd 12(%esi),  %mm4            # mm4 <- f[3]
        movd  (%ebx),   %mm0            # mm0 <- d[0]
        movd  (%edi),   %mm6            # mm6 <- c[p]
        movd  (%ebp),   %mm7            # mm7 <- c[3p]
        paddq  %mm2,    %mm4            # mm4 <- f[3]-c3[0] = -c3[2]
        paddq  %mm0,    %mm6
        paddq  %mm2,    %mm6            # mm6 <- c[p]+d[0]-c3[0] = c[p]+c1[0]
        psubq  %mm2,    %mm7            # mm7 <- c[3p]+c3[0]
        movd   %mm6,   (%edi)           # sauve c[p]+c1[0]
        movd   %mm7,   (%ebp)           # sauve c[3p]+c3[0]
        
        leal  8(%esi,%ecx,8), %esi      # esi <- &f[2p+2]
        leal -4(%ebx,%ecx,8), %ebx      # ebx <- &d[2p-1]
        leal -4(%edi,%ecx,8), %edi      # edi <- &c[3p-1]
        leal -4(%ebp,%ecx,8), %ebp      # ebp <- &c[5p-1]
        decl   %ecx                     # ecx <- p-1
        negl   %ecx

        # traite les 2p-2 chiffres suivants
        # rmq: c est assez long pour l addition de c3 car q+r >= p-1
        ALIGN_4
1:
        pshufw $0xfe,   %mm4, %mm3      # mm3 <- retenue(-c3[2i])
        pshufw $0xf4,   %mm4, %mm2      # mm2 <- -c3[2i]
        pshufw $0xfe,   %mm6, %mm6      # mm6 <- retenue(c[p+2i-2]+c1[2i-2])
        pshufw $0xfe,   %mm7, %mm7      # mm7 <- retenue(c[3p+2i-2]+c3[2i-2])
        movd   (%esi,%ecx,8), %mm4      # mm4 <- f[2i+2]
        movd   (%ebx,%ecx,8), %mm0      # mm0 <- d[2i-1]
        movd   (%edi,%ecx,8), %mm5      # mm5 <- c[p+2i-1]
        paddq  %mm4,    %mm3            # mm3 <- f[2i+2] + ret
        movd   (%ebp,%ecx,8), %mm4      # mm4 <- c[3p+2i-1]
        paddq  %mm0,    %mm5            # mm5 <- c[p+2i-1] + d[2i-1]
        paddq  %mm1,    %mm6            # mm6 <- -c3[2i-1] + ret
        psubq  %mm1,    %mm7            # mm7 <- c3[2i-1] + ret
        paddq  %mm1,    %mm3            # mm3 <- f[2i+2]-c3[2i-1] = -c3[2i+1]
        paddq  %mm5,    %mm6            # mm6 <- c[p+2i-1] + c1[2i-1]
        paddq  %mm4,    %mm7            # mm7 <- c[3p+2i-1] + c3[2i-1]
        movd   %mm6,   (%edi,%ecx,8)    # sauve c[p+2i-1] + c1[2i-1]
        movd   %mm7,   (%ebp,%ecx,8)    # sauve c[3p+2i-1] + c3[2i-1]
        
        pshufw $0xfe,   %mm3, %mm4      # mm4 <- retenue(-c3[2i+1])
        pshufw $0xf4,   %mm3, %mm1      # mm1 <- -c3[2i+1]
        pshufw $0xfe,   %mm6, %mm6      # mm6 <- retenue(c[p+2i-1]+c1[2i-1])
        pshufw $0xfe,   %mm7, %mm7      # mm7 <- retenue(c[3p+2i-1]+c3[2i-1])
        movd  4(%esi,%ecx,8), %mm3      # mm3 <- f[2i+3]
        movd  4(%ebx,%ecx,8), %mm0      # mm0 <- d{2i]
        movd  4(%edi,%ecx,8), %mm5      # mm5 <- c[p+2i]
        paddq  %mm3,    %mm4            # mm4 <- f[2i+3] + ret
        movd  4(%ebp,%ecx,8), %mm3      # mm3 <- c[3p+2i]
        incl   %ecx
        paddq  %mm2,    %mm6            # mm6 <- -c3[2i] + ret
        psubq  %mm2,    %mm7            # mm7 <- c3[2i] + ret
        paddq  %mm0,    %mm5            # mm5 <- c[p+2i] + d{2i]
        paddq  %mm2,    %mm4            # mm4 <- f[2i+3]-c3[2i] = -c3[2i+2]
        paddq  %mm5,    %mm6            # mm6 <- c[p+2i] + c1[2i]
        paddq  %mm3,    %mm7            # mm7 <- c[3p+2i] + c3[2i]
        movd   %mm6, -4(%edi,%ecx,8)    # sauve c[p+2i] + c1[2i]
        movd   %mm7, -4(%ebp,%ecx,8)    # sauve c[3p+2i] + c3[2i]
        jne    1b

        # termine le calcul et l addition de c1
        pshufw $0xf4,   %mm4, %mm2      # mm2 <- -c3[2p]
        pshufw $0xfe,   %mm6, %mm6      # mm6 <- retenue(c[3p-2] + c1[2p-2])
        movd  (%ebx),   %mm0            # mm0 <- d[2p-1]
        movd  (%edi),   %mm5            # mm5 <- c[3p-1]
        paddq  %mm1,    %mm6            # mm6 <- -c3[2p-1] + ret
        paddq  %mm0,    %mm5            # mm5 <- c[3p-1] + d[2p-1]
        paddq  %mm5,    %mm6            # mm6 <- c[3p-1] + c1[2p-1]
        movd   %mm6,   (%edi)           # sauve c[3p-1] + c1[2p-1]
        pshufw $0xfe,   %mm6, %mm6      # mm6 <- retenue(c[3p-1] + c1[2p-1])
        movd 4(%ebx),   %mm0            # mm0 <- d[2p]
        movd 4(%edi),   %mm5            # mm5 <- c[3p]
        paddq  %mm2,    %mm6            # mm6 <- -c3[2p] + ret
        paddq  %mm0,    %mm5            # mm5 <- c[3p] + d[2p]
        paddq  %mm5,    %mm6            # mm6 <- c[3p] + c1[2p]
        movd   %mm6,  4(%edi)           # sauve c[3p] + c1[2p]
        pshufw $0xfe,   %mm6, %mm6      # mm6 <- retenue(c[3p] + c1[2p])
        
        # termine l addition de c3: il reste q+r-p+1 chiffres  traiter
        pshufw $0xfe,   %mm7, %mm7      # mm7 <- retenue(c[5p-2] + c3[2p-2])
        movl   _q_,     %ecx
        addl   _r_,     %ecx
        subl   _p_,     %ecx
        incl   %ecx
        jz     1f
        movd  (%ebp),   %mm5            # mm5 <- c[5p-1]
        psubq  %mm1,    %mm7            # mm7 <- c3[2p-1] + ret
        paddq  %mm5,    %mm7            # mm7 <- c[5p-1] + c3[2p-1]
        movd   %mm7,   (%ebp)           # sauve c[5p-1]
        pshufw $0xfe,   %mm7, %mm7      # mm7 <- retenue(c[5p-1] + c3[2p-1])
        decl   %ecx
        jz     1f
        movd 4(%ebp),   %mm5            # mm5 <- c[5p]
        psubq  %mm2,    %mm7            # mm7 <- c3[2p] + ret
        paddq  %mm5,    %mm7            # mm7 <- c[5p] + c3[2p]
        movd   %mm7,  4(%ebp)           # sauve c[5p] + c3[2p]
        pshufw $0xfe,   %mm7, %mm7      # mm6 <- retenue(c[5p] + c3[2p])
1:

        # ici mm2 = -c3[2p] mod BASE, mm6 = retenue de c1, mm7 = retenue de c3
        # valeurs possibles pour mm2 :
        #
        #    si c3 = 0            alors mm2 = 0
        #    si 0 < c3 <= BASE^2p alors mm2 = BASE-1
        #    si BASE^2p < c3      alors mm2 = BASE-2
        #
        # dans les deux derniers cas, il faut diminuer mm6 de 1 et augmenter
        # mm7 de 1 pour tenir compte de la retenue sur -c3

        pshufw $0x55,   %mm2, %mm2      # mm2 <- 0 ou -1
        paddq  %mm2,    %mm6
        psubq  %mm2,    %mm7
        
        # propage la retenue sur c1
        movd   %mm6,    %ecx
        jecxz  2f
1:
        leal  4(%edi),  %edi
        incl  4(%edi)
        jz    1b
2:
        # propage la retenue sur c3
        movd   %mm7,    %ecx
        jecxz  2f
1:
        leal  4(%ebp),  %ebp
        incl  4(%ebp)
        jz    1b
2:
        emms

        
#else /* use_sse2 */
        
        # c[2p..4p] <- (d+e)/2 = c0 + c2 + c4, d <- (d-e)/2 = c1 + c3
        movl   _c_,     %edi
        movl   _d_,     %esi
        movl   _e_,     %ebx
        movl   _p_,     %ecx
        leal   1(,%ecx,2), %ecx         # ecx <- 2p+1
        leal   -4(%edi,%ecx,4), %edi    # edi <- &c[2p]
        movl   -4(%edi,%ecx,4), %eax    # sauve c[4p] dans x
        xchgl  %eax,    _x_
        bt     $0,      %eax            # CF <- signe(e)
        jnc    1f
        call   .Lsn_fsub_1              # c[2p..4p] <- d - |e|
        jmp    2f
1:
        call   .Lsn_fadd_1              # c[2p..4p] <- d + |e|
2:
        movl   _c_,     %esi
        movl   _p_,     %ecx
        leal   1(,%ecx,2), %ecx         # ecx <- 2p+1
        movl   %ecx,    %edx            # edx <- 2p+1
        leal   -4(%esi,%ecx,4), %esi    # esi <- &c[2p]
        movl   %esi,    %ebx            # ebx <- &c[2p]
        call   .Lsn_fhalf               # c[2p..4p] /= 2
        movl   %edx,    %ecx            # ecx <- 2p+1
        movl   _d_,     %esi
        call   .Lsn_fdec_1              # d -= c[2p..4p]
        movl   -4(%ebx), %eax
        xchgl   %eax,    _x_            # restaure c[4p]
        movl    %eax,   -4(%ebx)

        # c[2p..4p] <- c[2p..4p] - c0 - c4 = c2
        movl   _p_,     %ecx
        leal   (,%ecx,2), %ecx          # ecx <- 2p
        movl   _c_,     %ebx
        leal   (%ebx,%ecx,4), %esi      # esi <- &c[2p]
        call   .Lsn_fdec_1              # c[2p..4p-1] -= c0
        sbbl   %ecx,    _x_             # x -= retenue
        movl   %ebx,    %esi            # esi <- &c[2p]
        movl   _p_,     %edx
        leal   (,%edx,2), %edx          # edx <- 2p
        movl   _r_,     %ecx
        addl   _q_,     %ecx            # ecx <- q+r
        leal   (%esi,%edx,4), %ebx      # ebx <- &c[4p]
        call   .Lsn_fdec                # c[2p..4p-1] -= c4
        sbbl   %ecx,    _x_             # x -= retenue

        # f <- f - c0 - BASE^2*c2 - BASE^4*c4 = BASE*c1 + BASE^3*c3
        #
        # rmq1 : f a 2p+6 chiffres mais on s en sert pour calculer -BASE*c3
        # qui tient sur p+q+2 chiffres -> on peut ignorer les chiffres de rang
        # 2p+2  2p+5 (d ailleurs on ne les a peut-tre mme pas calculs)
        #
        # rmq2 : f et c0 ont mme chiffre des units, donc on peut commencer
        # la soustraction au rang 1. Ce n est mme pas la peine de forcer
        # le premier chiffre  zro, on ne s en servira pas
        movl   _p_,     %ecx
        leal   -1(,%ecx,2), %ecx        # ecx <- 2p-1
        movl   _c_,     %ebx
        leal   4(%ebx), %ebx            # ebx <- &c[1]
        movl   _f_,     %esi
        leal   4(%esi), %esi            # esi <- &f[1]
        call   .Lsn_fdec_1              # f -= c0
        sbbl   %ecx,    (%esi)          # propage la retenue
        sbbl   %ecx,   4(%esi)
        movl   _p_,     %ecx
        leal   (,%ecx,2), %ecx          # ecx <- 2p
        movl   _f_,     %esi
        leal   8(%esi), %esi            # esi <- &f[2]
        call   .Lsn_fdec_1              # f -= c2*BASE^2
        movl   _p_,     %edx
        # on pourrait prendre edx = 2p-2 ici puisqu on ne veut que les
        # 2p+2 premiers chiffres de f, mais sn_fdec va planter si
        # on a q+r > 2p-2.
        leal   (,%edx,2), %edx          # edx <- 2p
        movl   _f_,     %esi
        leal   16(%esi), %esi           # esi <- &f[4]
        movl   _r_,     %ecx
        addl   _q_,     %ecx            # ecx <- q+r
        call   .Lsn_fdec                # f -= c4*BASE^4

        # f <- f - BASE*d = BASE*(BASE^2 - 1)*c3
        movl   _p_,     %ecx
        leal   1(,%ecx,2), %ecx         # ecx <- 2p+1
        movl   _f_,     %esi
        leal   4(%esi), %esi            # esi <- &f[1]
        movl   _d_,     %ebx            # ebx <- &d
        call   .Lsn_fdec_1              # f -= BASE*d

        # f <- -f/(BASE^2 - 1) = -BASE*c3 mod BASE^(2p+2)
        movl   _p_,     %ecx
        leal   -1(,%ecx,2), %ecx        # ecx <- 2p - 1
        movl   _f_,     %ebx
        leal   4(%ebx), %ebx            # ebx <- &f[1]
        leal   8(%ebx), %esi            # esi <- &f[3]
        call   .Lsn_finc_1              # divise par 1 - BASE^2
        testl  $-1,  -4(%esi)
        jz     L(c3_nul)                # ZF = 1 ssi c3 = 0

        # c[3p..4p+q+r-1] += c3
        movl   _c_,     %esi
        movl   _p_,     %ecx
        leal   (%esi,%ecx,4), %esi
        leal   (%esi,%ecx,8), %esi      # esi <- &c[3p]
        addl   _q_,     %ecx
        incl   %ecx                     # ecx <- p+q+1
        movl   _f_,     %ebx
        leal   4(%ebx), %ebx            # ebx <- &f[1]
        call   .Lsn_fdec_1              # retranche BASE^(p+q+1) - c3
        jb     2f                       # s il n y a pas de retenue, alors
1:                                      # il faut ajouter BASE^(p+q+1)
        incl   (%esi)
        leal   4(%esi), %esi
        jz     1b
2:
        
        # d <- d + f/BASE = c1
        movl   _p_,     %ecx
        leal   1(,%ecx,2), %ecx         # ecx <- 2p+1
        movl   _d_,     %esi            # esi <- &d
        movl   _f_,     %ebx
        leal   4(%ebx), %ebx            # ebx <- &f[1]
        call   .Lsn_finc_1              # d -= c3

        # c[p..4p+q+r-1] += c1
L(c3_nul):
        movl   _c_,     %esi
        movl   _p_,     %edx
        leal   (%esi,%edx,4), %esi      # esi <- &c[p]
        leal   1(,%edx,2), %ecx         # ecx <- 2p+1
        addl   _r_,     %edx
        addl   _q_,     %edx
        leal  -1(%edx,%ecx,1), %edx     # edx <- 3p+q+r
        movl   _d_,     %ebx
        call   .Lsn_finc                # ajoute c1

#endif /* use_sse2 */

        # c[4p] += x
        movl   _c_,     %esi
        movl   _p_,     %ecx
        leal   (%esi,%ecx,8), %esi
        leal   (%esi,%ecx,8), %esi      # esi <- &c[4p]
        movl   _x_,     %eax
        addl   %eax,    (%esi)          # c[4p] += x
        jnc    2f
1:
        leal   4(%esi), %esi            # propage la retenue
        incl   (%esi)
        jz     1b
2:

        # termin
        movl   _p_,     %eax
        leal  (%eax,%eax,2), %eax       # eax <- 3p
        leal  80(%esp,%eax,8), %esp     # nettoie la pile
        ret

        # ici lb <= 2*ceil(la/3) : dcoupage en tranches
        ALIGN_4
L(tranches):
        
        addl   %eax,    %ecx            # ecx <- lb
        leal   -2(%edx,%ebp,1), %edx    # edx <- q
        leal   (%edx,%ebp,2),   %edx    # edx <- la

        # Le code qui suit est recopi mot  mot dans karamul, en remplaant
        # les deux appels  sn_ftoommul par des appels  sn_fkaramul.
        # Attention  rpercuter les mises  jour !

        # variables locales
        #undef  _a_
        #undef  _b_
        #undef  _c_
        #undef  _d_
        #undef  _la_
        #undef  _lb_
        #define _d_  20(%esp)
        #define _la_ 16(%esp)
        #define _lb_ 12(%esp)
        #define _a_   8(%esp)
        #define _b_   4(%esp)
        #define _c_    (%esp)
        
        leal   (,%ecx,4), %eax
	ALLOCA                          # rserve lb chiffres dans la pile
        pushl  %edx                     # sauve la
        pushl  %ecx                     # sauve lb

        # premire multiplication : c <- a[0..(la % lb)-1]*b
        movl   %edx,    %eax
        movl   $0,      %edx            # edx:eax <- la
        divl   %ecx                     # edx <- la % lb
        testl  %edx,    %edx            # si la est multiple de lb ...
        jnz    1f
        movl   %ecx,    %edx
1:
        xchgl  %ebx,    %esi            # permute les arguments ...
        xchgl  %ecx,    %edx            # pour avoir edx >= ecx
        leal   (%ebx,%ecx,4), %eax
        pushl  %eax                     # a += a[la % lb]
        pushl  %esi                     # sauve &b
        leal   (%edi,%ecx,4), %eax
        pushl  %eax                     # c += c[la % lb]
        subl   %ecx,    _la_            # la -= la % lb
        call   .Lsn_ftoommul

        # multiplications suivantes
        ALIGN_4
L(loop):
        movl   _c_,     %esi
        leal   _d_,     %edi
        movl   _lb_,    %ecx
        cld;   REP(movsl)               # d <- c[0..lb-1]
        
        movl   _c_,     %edi
        movl   _b_,     %esi
        movl   _a_,     %ebx
        movl   _lb_,    %edx
        movl   %edx,    %ecx            # ecx <- lb
        call   .Lsn_ftoommul            # c[0..2lb-1] <- a[0..lb-1]*b

        movl   _c_,     %esi
        leal   _d_,     %ebx
        movl   _lb_,    %ecx
        leal   (,%ecx,2), %edx          # edx <- 2*lb
        call   .Lsn_finc                # c <- c + d

        movl   _lb_,    %eax
        leal   (,%eax,4), %ecx
        addl   %ecx,    _c_             # c+=lb
        addl   %ecx,    _a_             # a+=lb
        subl   %eax,    _la_            # la -= lb
        jne    L(loop)

        # termin
        leal   20(%esp,%eax,4), %esp    # nettoie la pile
        ret

                              # +---------------+
                              # |  Interface C  |
                              # +---------------+
                
#  void xn(toommul)(chiffre *a, long la, chiffre *b, long lb, chiffre *c)
#
#  C-callable entry point of the Toom multiplication.
#
#  input:
#  a = natural number of length la
#  b = natural number of length lb
#  c = natural number of length la+lb, not overlapping a or b
#  constraints: 0 < lb <= la
#
#  output:
#  c <- a*b
#
#  The five C arguments are copied into the registers used by the internal
#  routine (esi = &a, edx = la, ebx = &b, ecx = lb, edi = &c), which is then
#  called. When debug_toommul is defined, both this entry point and the
#  routine it calls carry the _buggy suffix.

#ifdef debug_toommul
ENTER(sn_toommul_buggy)
#else
ENTER(sn_toommul)
#endif

        movl   arg1,    %esi            # esi <- &a
        movl   arg2,    %edx            # edx <- la
        movl   arg3,    %ebx            # ebx <- &b
        movl   arg4,    %ecx            # ecx <- lb
        movl   arg5,    %edi            # edi <- &c
#ifdef debug_toommul
        call   .Lsn_ftoommul_buggy      # perform the multiplication
#else
        call   .Lsn_ftoommul            # perform the multiplication
#endif
        # NOTE(review): the macro below is defined elsewhere - presumably it
        # restores the caller stack pointer and registers, then returns. Confirm.
        RETURN_WITH_SP
#endif /* assembly_sn_toommul */

        # Case where the assembly version is disabled, or is being debugged
        # (debug_toommul defined): sn_ftoommul forwards to the C version.
        #
        # Register arguments on entry, same mapping as the one set up by the
        # C interface of this file:
        #   esi = &a, edx = la, ebx = &b, ecx = lb, edi = &c
        # They are pushed in reverse order of the C prototype
        # (a, la, b, lb, c) and the 20 bytes are dropped after the call.

#if !defined(assembly_sn_toommul) || defined(debug_toommul)
        ALIGN_32
.Lsn_ftoommul:

        pushl  %edi                     # 5th argument: &c
        pushl  %ecx                     # 4th argument: lb
        pushl  %ebx                     # 3rd argument: &b
        pushl  %edx                     # 2nd argument: la
        pushl  %esi                     # 1st argument: &a
        call   SUBR(sn_toommul)         # c <- a*b, computed by the C function
        leal   20(%esp), %esp           # drop the five pushed arguments
        ret

#endif /* !defined(assembly_sn_toommul) || defined(debug_toommul) */


                                 # +---------+
                                 # |  Square |
                                 # +---------+

# input:
#   a = natural number of length la     esi = &a, edx = la
#   c = natural number of length 2*la   edi = &c
# constraints: 0 < la
#
# output:
#   c <- a^2
#
# modified registers:
#   eax,ebx,ecx,edx,esi,edi,ebp <- undefined
#
# outline:
#   operands of at most toomsqr_lim digits are handed to sn_fkarasqr.
#   Otherwise a is cut into a0, a1 (p digits each) and a2 (q digits) with
#   p = ceil(la/3) and q = la - 2p, and five squares are computed by
#   recursive calls:
#       d         <- (a0 + a1 + a2)^2
#       e         <- (a0 - a1 + a2)^2
#       f         <- (a0 + BASE*a1 + BASE^2*a2)^2
#       c[0..]    <- a0^2
#       c[4p..]   <- a2^2
#   The routine then jumps to sn_toom_aux to finish the computation, the
#   b and r slots of the frame being filled with copies of &a and q.
#   The recursive calls always target the label sn_ftoomsqr, also when
#   this routine itself is assembled under the _buggy name.

#ifdef assembly_sn_toomsqr
        ALIGN_32
#ifdef debug_toommul
.Lsn_ftoomsqr_buggy:
#else
.Lsn_ftoomsqr:
#endif

        # prefix used for the local labels of this routine
#undef L
#define L(x) .Lsn_ftoomsqr_##x

        # small square => Karatsuba algorithm
        cmpl   $toomsqr_lim, %edx
        jbe    .Lsn_fkarasqr

        movl   %edx,    %ecx            # ecx <- la
        leal   2(%edx), %eax            # eax <- la + 2
        xorl   %edx,    %edx            # edx:eax <- la + 2
        movl   $3,      %ebp
        divl   %ebp                     # eax <- p = ceil(la/3)
	movl   %eax,    %ebp            # ebp <- p
        shll   $1,      %eax            # eax <- 2p
        subl   %eax,    %ecx            # ecx <- q = la - 2p

        # local variables
        # (the offsets are valid once the ten pushes below have been done)
        #undef  _a_
        #undef  _b_
        #undef  _c_
        #undef  _d_
        #undef  _e_
        #undef  _f_
        #undef  _p_
        #undef  _q_
        #undef  _r_
        #undef  _x_
        
        #define _a_   36(%esp)
        #define _b_   32(%esp)
        #define _c_   28(%esp)
        #define _d_   24(%esp)
        #define _e_   20(%esp)
        #define _f_   16(%esp)
        #define _p_   12(%esp)
        #define _q_    8(%esp)
        #define _r_    4(%esp)
        #define _x_     (%esp)
        
        leal  10(%eax,%eax,2), %eax     # eax <- 6p+10
        leal  (,%eax,4), %eax           # eax <- size in bytes of 6p+10 digits
        ALLOCA                          # reserve 6p+10 digits on the stack
        pushl  %esi                     # save &a
        pushl  %esi                     # save &b (= &a)
        pushl  %edi                     # save &c
        leal   12(%esp), %esi           # esi <- value of esp before the 3 pushes
        pushl  %esi                     # save &d
        leal   8(%esi,%ebp,8), %esi     # esi <- &d[2p+2]
        pushl  %esi                     # save &e
        leal   8(%esi,%ebp,8), %esi     # esi <- &e[2p+2]
        pushl  %esi                     # save &f
        pushl  %ebp                     # save p
        pushl  %ecx                     # save q
        pushl  %ecx                     # save r (= q)
        pushl  $0                       # x <- 0

        # c[0..p] <- a0 + a1 + a2, c[p+1..2p+1] <- |a0 - a1 + a2|
        movl   _a_,     %esi
        movl   _c_,     %edi
        movl   _p_,     %edx
        movl   _q_,     %ecx
        call   .Lsn_fadd_sub3

        # d <- (a0 + a1 + a2)^2 = c0 + c1 + c2 + c3 + c4
        movl   _p_,     %edx
        movl   _c_,     %esi
        movl   _d_,     %edi
        movl   $0,  (%edi,%edx,8)       # d[2p] <- 0
        testl  $-1, (%esi,%edx,4)       # top digit c[p] = 0 ?
        jz     1f
        incl   %edx                     # no => the operand has p+1 digits
1:
        call   .Lsn_ftoomsqr

        # e <- (a0 - a1 + a2)^2 = c0 - c1 + c2 - c3 + c4
        movl   _p_,     %edx
        movl   _c_,     %esi
        leal  4(%esi,%edx,4), %esi      # esi <- &c[p+1]
        movl   _e_,     %edi
        movl   $0,  (%edi,%edx,8)       # e[2p] <- 0
        testl  $-1, (%esi,%edx,4)       # top digit c[2p+1] = 0 ?
        jz     1f
        incl   %edx                     # no => the operand has p+1 digits
1:
        call   .Lsn_ftoomsqr
        
        # c[0..p+2] <- a0 + BASE*a1 + BASE^2*a2
        movl   _a_,     %esi
        movl   _c_,     %edi
        movl   _p_,     %edx
        movl   _q_,     %ecx
        call   .Lsn_fadd_base
        
        # f <- (a0 + BASE*a1 + BASE^2*a2)^2
        #    = c0 + BASE*c1 + BASE^2*c2 + BASE^3*c3 + BASE^4*c4
        movl   _p_,     %edx
        movl   _f_,     %edi
        xorl   %eax,    %eax            # eax <- 0
#if 0
        # this is useless, the digits of rank 2p+2 to 2p+5
        # of f are going to be ignored (see above)
        movl   %eax,    8(%edi,%edx,8)  # f[2p+2] <- 0
        movl   %eax,   12(%edi,%edx,8)  # f[2p+3] <- 0
        movl   %eax,   16(%edi,%edx,8)  # f[2p+4] <- 0
        movl   %eax,   20(%edi,%edx,8)  # f[2p+5] <- 0
#endif
        addl   $3,      %edx            # edx <- p+3
        movl   _c_,     %esi
        # each cmpl/adcl pair below removes one from edx when the digit
        # c[edx-1] is zero (eax = 0, so CF is set only by a nonzero digit)
        cmpl  -4(%esi,%edx,4), %eax     # edx <- length(a0+BASE*a1+BASE^2*a2)
        adcl   $-1,     %edx
        cmpl  -4(%esi,%edx,4), %eax
        adcl   $-1,     %edx
        call   .Lsn_ftoomsqr

        # c[0..2p-1] <- a0^2 = c0
        movl   _a_,     %esi
        movl   _c_,     %edi
        movl   _p_,     %edx
        call   .Lsn_ftoomsqr

        # c[4*p..4p+q+r-1] <- a2^2 = c4
        movl   _p_,     %eax
        leal   (,%eax,2), %eax          # eax <- 2p
        movl   _a_,     %esi
        movl   _c_,     %edi
        leal   (%esi,%eax,4), %esi      # esi <- &a2
        leal   (%edi,%eax,8), %edi      # edi <- &c[4p]
        movl   _q_,     %edx
        call   .Lsn_ftoomsqr
        
        jmp    .Lsn_toom_aux            # continue with the toommul code

        
                              # +---------------+
                              # |  interface C  |
                              # +---------------+

#  void xn(toomsqr)(chiffre *a, long la, chiffre *b)
#
#  C-callable entry point of the Toom squaring.
#
#  input:
#  a = natural number of length la
#  b = natural number of length 2*la, not overlapping a
#  constraints: 0 < la
#
#  output:
#  b <- a^2
#
#  The three C arguments are copied into the registers used by the internal
#  routine (esi = &a, edx = la, edi = &b), which is then called. When
#  debug_toommul is defined, both this entry point and the routine it calls
#  carry the _buggy suffix.

#ifdef debug_toommul
ENTER(sn_toomsqr_buggy)
#else
ENTER(sn_toomsqr)
#endif

        movl   arg1,    %esi            # esi <- &a
        movl   arg2,    %edx            # edx <- la
        movl   arg3,    %edi            # edi <- &b
#ifdef debug_toommul
        call   .Lsn_ftoomsqr_buggy      # compute the square
#else
        call   .Lsn_ftoomsqr            # compute the square
#endif
        # NOTE(review): the macro below is defined elsewhere - presumably it
        # restores the caller stack pointer and registers, then returns. Confirm.
        RETURN_WITH_SP
#endif /* assembly_sn_toomsqr */

        # Case where the assembly version is disabled or is being debugged:
        # sn_ftoomsqr forwards to the C version.
        #
        # Register arguments on entry: esi = &a, edx = la, edi = &c.
        # They are pushed in reverse order of the C prototype (a, la, b)
        # and the 12 bytes are dropped after the call.
        
#if !defined(assembly_sn_toomsqr) || defined(debug_toommul)
        ALIGN_32
.Lsn_ftoomsqr:

        pushl  %edi                     # 3rd argument: address of the result
        pushl  %edx                     # 2nd argument: la
        pushl  %esi                     # 1st argument: &a
        call   SUBR(sn_toomsqr)         # result <- a^2, computed by the C function
        leal   12(%esp), %esp           # drop the three pushed arguments
        ret
        
#endif /* !defined(assembly_sn_toomsqr) || defined(debug_toommul) */




