diff options
author | Rich Felker <dalias@aerifal.cx> | 2013-08-28 03:34:57 -0400 |
---|---|---|
committer | Rich Felker <dalias@aerifal.cx> | 2013-08-28 03:34:57 -0400 |
commit | 90edf1cc15cec685c18ec2485ddce5b655963464 (patch) | |
tree | 97b728abcd94fbb0e41a2a1d14a7a15d70b24c36 /src/string/memcpy.c | |
parent | 38e6acbf89afd3dfabb4f4d0506319c339b13663 (diff) | |
download | musl-90edf1cc15cec685c18ec2485ddce5b655963464.tar.gz musl-90edf1cc15cec685c18ec2485ddce5b655963464.tar.bz2 musl-90edf1cc15cec685c18ec2485ddce5b655963464.tar.xz musl-90edf1cc15cec685c18ec2485ddce5b655963464.zip |
optimized C memcpy
unlike the old C memcpy, this version handles word-at-a-time reads and
writes even for misaligned copies. it does not require that the cpu
support misaligned accesses; instead, it performs bit shifts to
realign the bytes for the destination.
essentially, this is the C version of the ARM assembly language
memcpy. the ideas are all the same, and it should perform well on any
arch with a decent number of general-purpose registers that has a
barrel shift operation. since the barrel shifter is an optional cpu
feature on microblaze, it may be desirable to provide an alternate asm
implementation on microblaze, but otherwise the C code provides a
competitive implementation for "generic risc-y" cpu archs that should
alleviate the urgent need for arch-specific memcpy asm.
Diffstat (limited to 'src/string/memcpy.c')
-rw-r--r-- | src/string/memcpy.c | 127 |
1 files changed, 111 insertions, 16 deletions
diff --git a/src/string/memcpy.c b/src/string/memcpy.c index 8e98302f..06e88742 100644 --- a/src/string/memcpy.c +++ b/src/string/memcpy.c @@ -1,29 +1,124 @@ #include <string.h> -#include <stdlib.h> #include <stdint.h> - -#define SS (sizeof(size_t)) -#define ALIGN (sizeof(size_t)-1) -#define ONES ((size_t)-1/UCHAR_MAX) +#include <endian.h> void *memcpy(void *restrict dest, const void *restrict src, size_t n) { unsigned char *d = dest; const unsigned char *s = src; - if (((uintptr_t)d & ALIGN) != ((uintptr_t)s & ALIGN)) - goto misaligned; +#ifdef __GNUC__ + +#if __BYTE_ORDER == __LITTLE_ENDIAN +#define LS >> +#define RS << +#else +#define LS << +#define RS >> +#endif + + typedef uint32_t __attribute__((__may_alias__)) u32; + uint32_t w, x; - for (; ((uintptr_t)d & ALIGN) && n; n--) *d++ = *s++; - if (n) { - size_t *wd = (void *)d; - const size_t *ws = (const void *)s; + for (; (uintptr_t)s % 4 && n; n--) *d++ = *s++; - for (; n>=SS; n-=SS) *wd++ = *ws++; - d = (void *)wd; - s = (const void *)ws; -misaligned: - for (; n; n--) *d++ = *s++; + if ((uintptr_t)d % 4 == 0) { + for (; n>=16; s+=16, d+=16, n-=16) { + *(u32 *)(d+0) = *(u32 *)(s+0); + *(u32 *)(d+4) = *(u32 *)(s+4); + *(u32 *)(d+8) = *(u32 *)(s+8); + *(u32 *)(d+12) = *(u32 *)(s+12); + } + if (n&8) { + *(u32 *)(d+0) = *(u32 *)(s+0); + *(u32 *)(d+4) = *(u32 *)(s+4); + d += 8; s += 8; + } + if (n&4) { + *(u32 *)(d+0) = *(u32 *)(s+0); + d += 4; s += 4; + } + if (n&2) { + *d++ = *s++; *d++ = *s++; + } + if (n&1) { + *d = *s; + } + return dest; + } + + if (n >= 32) switch ((uintptr_t)d % 4) { + case 1: + w = *(u32 *)s; + *d++ = *s++; + *d++ = *s++; + *d++ = *s++; + n -= 3; + for (; n>=17; s+=16, d+=16, n-=16) { + x = *(u32 *)(s+1); + *(u32 *)(d+0) = (w LS 24) | (x RS 8); + w = *(u32 *)(s+5); + *(u32 *)(d+4) = (x LS 24) | (w RS 8); + x = *(u32 *)(s+9); + *(u32 *)(d+8) = (w LS 24) | (x RS 8); + w = *(u32 *)(s+13); + *(u32 *)(d+12) = (x LS 24) | (w RS 8); + } + break; + case 2: + w = *(u32 *)s; + *d++ = *s++; + *d++ = *s++; + n -= 2; + for (; n>=18; s+=16, d+=16, n-=16) { + x = *(u32 *)(s+2); + *(u32 *)(d+0) = (w LS 16) | (x RS 16); + w = *(u32 *)(s+6); + *(u32 *)(d+4) = (x LS 16) | (w RS 16); + x = *(u32 *)(s+10); + *(u32 *)(d+8) = (w LS 16) | (x RS 16); + w = *(u32 *)(s+14); + *(u32 *)(d+12) = (x LS 16) | (w RS 16); + } + break; + case 3: + w = *(u32 *)s; + *d++ = *s++; + n -= 1; + for (; n>=19; s+=16, d+=16, n-=16) { + x = *(u32 *)(s+3); + *(u32 *)(d+0) = (w LS 8) | (x RS 24); + w = *(u32 *)(s+7); + *(u32 *)(d+4) = (x LS 8) | (w RS 24); + x = *(u32 *)(s+11); + *(u32 *)(d+8) = (w LS 8) | (x RS 24); + w = *(u32 *)(s+15); + *(u32 *)(d+12) = (x LS 8) | (w RS 24); + } + break; + } + if (n&16) { + *d++ = *s++; *d++ = *s++; *d++ = *s++; *d++ = *s++; + *d++ = *s++; *d++ = *s++; *d++ = *s++; *d++ = *s++; + *d++ = *s++; *d++ = *s++; *d++ = *s++; *d++ = *s++; + *d++ = *s++; *d++ = *s++; *d++ = *s++; *d++ = *s++; } + if (n&8) { + *d++ = *s++; *d++ = *s++; *d++ = *s++; *d++ = *s++; + *d++ = *s++; *d++ = *s++; *d++ = *s++; *d++ = *s++; + } + if (n&4) { + *d++ = *s++; *d++ = *s++; *d++ = *s++; *d++ = *s++; + } + if (n&2) { + *d++ = *s++; *d++ = *s++; + } + if (n&1) { + *d = *s; + } + return dest; +#endif + + for (; n; n--) *d++ = *s++; return dest; } |