optimize exponential asm for i386

Up to 30% faster exp2 by avoiding the slow frndint and fscale instructions. expm1 also takes a much more direct path for small arguments (the expected use case).
author: Rich Felker <dalias@aerifal.cx> 2012-03-19 09:00:30 -0400
committer: Rich Felker <dalias@aerifal.cx> 2012-03-19 09:00:30 -0400
commit: 02db27d9deaee71b244c91e720ec819c74dab150 (patch)
tree: ef2543fd54a6fdbca8839cb14c71bb10ffdaa8f1 /src/math/i386
parent: da7458a602a6f0bdea25d6b9b613372048a974e6 (diff)
download: musl-02db27d9deaee71b244c91e720ec819c74dab150.tar.gz
musl-02db27d9deaee71b244c91e720ec819c74dab150.tar.bz2
musl-02db27d9deaee71b244c91e720ec819c74dab150.tar.xz
musl-02db27d9deaee71b244c91e720ec819c74dab150.zip
2 files changed, 77 insertions, 58 deletions
diff --git a/src/math/i386/exp.s b/src/math/i386/exp.s
index f4769d59..76ab4d64 100644
--- a/src/math/i386/exp.s
+++ b/src/math/i386/exp.s
@@ -1,3 +1,37 @@
+.global expm1f
+.type expm1f,@function
+expm1f:	# expm1 entry point taking a 32-bit float argument
+	flds 4(%esp)	# st0 = x, read as a float from the first stack argument
+	jmp 1f	# continue in the shared body at label 1 below
+
+.global expm1l
+.type expm1l,@function
+expm1l:	# expm1 entry point taking an 80-bit long double argument
+	fldt 4(%esp)	# st0 = x, read as an extended-precision value
+	jmp 1f	# continue in the shared body at label 1 below
+
+.global expm1
+.type expm1,@function
+expm1:	# expm1 entry point taking a 64-bit double argument; falls into the shared body
+	fldl 4(%esp)	# st0 = x, read as a double
+1:	fldl2e	# shared body: push log2(e)
+	fmulp	# st0 = y = x * log2(e), so that 2^y = e^x
+	fld1	# st0 = 1, st1 = y
+	fld %st(1)	# st0 = y (copy), st1 = 1, st2 = y
+	fabs	# st0 = |y|
+	fucom %st(1)	# compare |y| against 1 (unordered compare, nothing popped)
+	fnstsw %ax	# copy the FPU status word (compare result) into %ax
+	fstp %st(0)	# pop |y|
+	fstp %st(0)	# pop 1; st0 = y again
+	sahf	# move the condition bits held in %ah into EFLAGS
+	ja 1f	# taken only when |y| > 1: use the general path below
+	f2xm1	# small-argument path: st0 = 2^y - 1 computed directly
+	ret	# result is left in st0
+1:	call 1f	# calls the next local label 1 further down the file; that target is outside these added lines (presumably the shared exp2 body, entered with y in st0 -- confirm)
+	fld1	# st0 = 1, st1 = value left in st0 by the call
+	fsubrp	# st0 = (returned value) - 1; the constant 1 is popped
+	ret	# result is left in st0
+
 .global exp2f
 .type exp2f,@function
 exp2f:
@@ -34,22 +68,53 @@ exp:
 .type exp2,@function
 exp2:
 	fldl 4(%esp)
-1:	fxam
-	fnstsw %ax
+1:	mov $0x47000000,%eax	# 0x47000000 is the single-precision bit pattern of 32768.0 (2^15)
+	push %eax	# spill it so it can be loaded into the FPU; this slot later becomes 8(%esp)
+	flds (%esp)	# st0 = 32768.0, st1 = x
+	shl $7,%eax	# eax = 0x80000000 (upper bits shifted out)
+	push %eax	# high 32 bits of an extended-precision mantissa: only the integer bit set
+	add %eax,%eax	# eax = 0 (0x80000000 doubled wraps to zero)
+	push %eax	# low 32 bits of the mantissa; (%esp) now holds mantissa 1.0, with the exponent word at 8(%esp) filled in later
+	fld %st(1)	# st0 = x (copy), st1 = 32768.0, st2 = x
+	fabs	# st0 = |x|
+	fucom %st(1)	# compare |x| against 32768.0
+	fnstsw	# store the FPU status word (operand-less form; the following sahf reads %ah, so %ax is the intended target)
 	sahf
-	jnp 1f
-	jnc 1f
-	fstps 4(%esp)
-	mov $0xfe,%al
-	and %al,7(%esp)
-	flds 4(%esp)
-1:	fld %st(0)
-	frndint
+	ja 2f	# |x| > 32768: handle at the first label 2 below (overflow/underflow/NaN path)
+	fstp %st(0)	# pop |x|
+	fstp %st(0)	# pop 32768.0; st0 = x
+	fld %st(0)	# duplicate x so one copy can be consumed by the integer store
+	fistpl 8(%esp)	# 8(%esp) = n = x converted to a 32-bit integer (current FPU rounding mode), popped
+	fildl 8(%esp)	# st0 = n reloaded as floating point, st1 = x; the fxch/fsub that follow leave st0 = x - n, st1 = n
 	fxch %st(1)
 	fsub %st(1)
+	mov $0x3fff,%eax	# 0x3fff is the extended-precision exponent bias
+	add %eax,8(%esp)	# exponent word = n + bias, so the 10 bytes at (%esp) encode 2^n. NOTE(review): the cutoff above admits |x| up to 32768, but n + 0x3fff only fits the 15-bit exponent field for a narrower range of n -- verify behavior for large |n|
 	f2xm1
 	fld1
 	faddp
-	fscale
+	fldt (%esp)	# push the hand-built 2^n; st1 = 2^(x-n) produced by the f2xm1/fld1/faddp above
+	fmulp	# st0 = 2^n * 2^(x-n); the fstp %st(1) that follows discards the leftover n
 	fstp %st(1)
+	add $12,%esp	# release the three pushed dwords
+	ret	# result is left in st0
+
+2:	fstp %st(0)	# large-|x| path: pop |x|
+	fstp %st(0)	# pop 32768.0; st0 = x
+	fsts 8(%esp)	# store x as a single-precision float (not popped) to inspect its bits
+	mov 8(%esp),%eax	# eax = float bit pattern of x
+	lea (%eax,%eax),%ecx	# ecx = bits shifted left by one, dropping the sign bit
+	cmp $0xff000000,%ecx	# 0xff000000 = all-ones exponent with zero mantissa (infinity) after the shift
+	ja 2f	# above that means exponent all ones with nonzero mantissa (NaN): return x itself via the exit label
+	fstp %st(0)	# pop x; the result is rebuilt from scratch below
+	xor %ecx,%ecx	# ecx = 0
+	inc %ecx	# ecx = 1: exponent word for the negative-x case (smallest normal exponent)
+	add %eax,%eax	# shift the sign bit of x into the carry flag
+	jc 1f	# x negative: keep exponent word 1
+	mov $0x7ffe,%ecx	# x positive: largest finite exponent word
+1:	mov %ecx,8(%esp)	# install the chosen exponent word above the 1.0 mantissa at (%esp)
+	fldt (%esp)	# st0 = tiny (x < 0) or huge (x > 0) power of two
+	fld %st(0)	# duplicate it
+	fmulp	# square it: result underflows for negative x or overflows for positive x
+2:	add $12,%esp	# common exit: release the three pushed dwords
 	ret
diff --git a/src/math/i386/expm1.s b/src/math/i386/expm1.s
index bbb5d12e..f335a3e5 100644
--- a/src/math/i386/expm1.s
+++ b/src/math/i386/expm1.s
@@ -1,47 +1 @@
-.global expm1f
-.type expm1f,@function
-expm1f:
-	flds 4(%esp)
-	jmp 1f
-
-.global expm1l
-.type expm1l,@function
-expm1l:
-	fldt 4(%esp)
-	jmp 1f
-
-.global expm1
-.type expm1,@function
-expm1:
-	fldl 4(%esp)
-1:	fxam
-	fnstsw %ax
-	sahf
-	jnp 1f
-	jnc 1f
-	fstps 4(%esp)
-	mov $0xfe,%al
-	and %al,7(%esp)
-	flds 4(%esp)
-1:	fldl2e
-	fmulp
-	fld %st(0)
-	frndint
-	fldz
-	fcomp
-	fnstsw %ax
-	sahf
-	jnz 1f
-	fstp %st(0)
-	f2xm1
-	ret
-1:	fxch %st(1)
-	fsub %st(1)
-	f2xm1
-	fld1
-	faddp
-	fscale
-	fld1
-	fsubrp
-	fstp %st(1)
-	ret
+# see exp.s
author	Rich Felker <dalias@aerifal.cx>	2012-03-19 09:00:30 -0400
committer	Rich Felker <dalias@aerifal.cx>	2012-03-19 09:00:30 -0400
commit	02db27d9deaee71b244c91e720ec819c74dab150 (patch)
tree	ef2543fd54a6fdbca8839cb14c71bb10ffdaa8f1 /src/math/i386
parent	da7458a602a6f0bdea25d6b9b613372048a974e6 (diff)
download	musl-02db27d9deaee71b244c91e720ec819c74dab150.tar.gz musl-02db27d9deaee71b244c91e720ec819c74dab150.tar.bz2 musl-02db27d9deaee71b244c91e720ec819c74dab150.tar.xz musl-02db27d9deaee71b244c91e720ec819c74dab150.zip