summaryrefslogtreecommitdiff
diff options
context:
space:
mode:
author Rich Felker <dalias@aerifal.cx> 2012-03-20 00:51:32 -0400
committer Rich Felker <dalias@aerifal.cx> 2012-03-20 00:51:32 -0400
commit baa43bca0a051e8deb0d6a9a8882ceeea5c27249 (patch)
tree f5fe7ae916d9039adfe82217716e2aafd08702fb
parent 7513d3ecabb998e2c8c4cb9ed5de48c4b64a166b (diff)
downloadmusl-baa43bca0a051e8deb0d6a9a8882ceeea5c27249.tar.gz
musl-baa43bca0a051e8deb0d6a9a8882ceeea5c27249.tar.bz2
musl-baa43bca0a051e8deb0d6a9a8882ceeea5c27249.tar.xz
musl-baa43bca0a051e8deb0d6a9a8882ceeea5c27249.zip
optimize scalbn family
the fscale instruction is slow everywhere, probably because it involves a costly and unnecessary integer truncation operation that ends up being a no-op in common usages. instead, construct a floating point scale value with integer arithmetic and simply multiply by it, when possible. for float and double, this is always possible by going to the next-larger type. we use some cheap but effective saturating arithmetic tricks to make sure even very large-magnitude exponents fit. for long double, if the scaling exponent is too large to fit in the exponent of a long double value, we simply fall back to the expensive fscale method. on an atom cpu, these changes speed up scalbn by over 30%. (min rdtsc timing dropped from 110 cycles to 70 cycles.)
-rw-r--r-- src/math/i386/scalbn.s 19
-rw-r--r-- src/math/i386/scalbnf.s 18
-rw-r--r-- src/math/i386/scalbnl.s 16
3 files changed, 46 insertions, 7 deletions
diff --git a/src/math/i386/scalbn.s b/src/math/i386/scalbn.s
index e275d14f..8bf302f2 100644
--- a/src/math/i386/scalbn.s
+++ b/src/math/i386/scalbn.s
@@ -11,10 +11,23 @@ scalbln:
.global scalbn
.type scalbn,@function
scalbn:  # scales the double at 4(%esp) by 2^n (n = 32-bit int at 12(%esp)); result is left in st(0)
- fildl 12(%esp)
+ mov 12(%esp),%eax  # eax = n
+ add $0x3ffe,%eax  # bias n so a single unsigned compare tests -0x3ffe <= n <= 0x3ffe
+ cmp $0x7ffd,%eax
+ jb 1f  # in range: eax+1 is directly usable as a biased exponent (1..0x7ffd)
+ sub $0x3ffe,%eax  # out of range: undo the bias to recover n
+ sar $31,%eax  # 0 if n >= 0, -1 if n < 0
+ xor $0xfff,%eax  # saturate the exponent: 0xfff for positive n, -0x1000 for negative n
+ add $0x3ffe,%eax  # re-apply the bias; after the inc below the scale is 2^4095 or 2^-4096
+1: inc %eax  # eax = exponent + 0x3fff, the biased exponent field of an 80-bit extended value
 fldl 4(%esp)  # push x now, because its stack slot is overwritten below
- fscale
- fstp %st(1)
+ mov %eax,12(%esp)  # bytes 8-9 of the 80-bit image: sign (0) and biased exponent; fldt ignores the upper 16 bits
+ mov $0x80000000,%eax
+ mov %eax,8(%esp)  # high 32 bits of the significand: only the explicit integer bit set (1.0)
+ xor %eax,%eax
+ mov %eax,4(%esp)  # low 32 bits of the significand: zero
+ fldt 4(%esp)  # push the scale factor 2^exponent as an extended-precision value
+ fmulp  # st(0) = x * scale, popping the scale
fstpl 4(%esp)  # store as a 64-bit double, rounding the extended-precision product
fldl 4(%esp)  # reload the rounded value as the return value
ret
diff --git a/src/math/i386/scalbnf.s b/src/math/i386/scalbnf.s
index 40232b6a..9cb9ef5f 100644
--- a/src/math/i386/scalbnf.s
+++ b/src/math/i386/scalbnf.s
@@ -11,10 +11,22 @@ scalblnf:
.global scalbnf
.type scalbnf,@function
scalbnf:  # scales the float at 4(%esp) by 2^n (n = 32-bit int at 8(%esp)); result is left in st(0)
- fildl 8(%esp)
+ mov 8(%esp),%eax  # eax = n
+ add $0x3fe,%eax  # bias n so a single unsigned compare tests -0x3fe <= n <= 0x3fe
+ cmp $0x7fd,%eax
+ jb 1f  # in range: eax+1 is directly usable as a biased double exponent (1..0x7fd)
+ sub $0x3fe,%eax  # out of range: undo the bias to recover n
+ sar $31,%eax  # 0 if n >= 0, -1 if n < 0
+ xor $0x1ff,%eax  # saturate the exponent: 0x1ff for positive n, -0x200 for negative n
+ add $0x3fe,%eax  # re-apply the bias; after the inc below the scale is 2^511 or 2^-512
+1: inc %eax  # eax = exponent + 0x3ff, the biased exponent of a 64-bit double
+ shl $20,%eax  # move the exponent into bits 20-30 of the double's high dword (sign and mantissa bits zero)
 flds 4(%esp)  # push x now, because its stack slot is overwritten below
- fscale
- fstp %st(1)
+ mov %eax,8(%esp)  # high dword of the double scale factor
+ xor %eax,%eax
+ mov %eax,4(%esp)  # low dword of the double scale factor: zero mantissa bits
+ fldl 4(%esp)  # push the scale factor 2^exponent as a double
+ fmulp  # st(0) = x * scale, popping the scale
fstps 4(%esp)  # store as a 32-bit float, rounding the product
flds 4(%esp)  # reload the rounded value as the return value
ret
diff --git a/src/math/i386/scalbnl.s b/src/math/i386/scalbnl.s
index 224b1bef..54414c2e 100644
--- a/src/math/i386/scalbnl.s
+++ b/src/math/i386/scalbnl.s
@@ -11,7 +11,21 @@ scalblnl:
.global scalbnl
.type scalbnl,@function
scalbnl:
- fildl 16(%esp)
+ mov 16(%esp),%eax
+ add $0x3ffe,%eax
+ cmp $0x7ffd,%eax
+ jae 1f
+ inc %eax
+ fldt 4(%esp)
+ mov %eax,12(%esp)
+ mov $0x80000000,%eax
+ mov %eax,8(%esp)
+ xor %eax,%eax
+ mov %eax,4(%esp)
+ fldt 4(%esp)
+ fmulp
+ ret
+1: fildl 16(%esp)
fldt 4(%esp)
fscale
fstp %st(1)