# void *memset(void *dest, int c, size_t n)
#
# Fills n bytes at dest with the low byte of c and returns dest.
# Syntax: GNU as, AT&T. Register use follows the SysV AMD64 convention:
#   In:       rdi = dest, esi = c (only the low byte is used), rdx = n
#   Out:      rax = dest
#   Clobbers: rax, rcx, rsi, rdi, r8, flags
# No stack is used. The bulk path uses rep stosq and therefore relies on
# the direction flag being clear, as the ABI specifies at function entry.
.global memset
.type memset,@function
memset:
        movzbl  %sil, %esi                      # esi = (unsigned char)c
        movabsq $0x0101010101010101, %rax
        # The 64-bit imul has 3-7 cycles latency, launch early
        imulq   %rsi, %rax                      # rax = fill byte copied into all 8 byte lanes

        cmpq    $16, %rdx
        jb      .Lmemset_small                  # n < 16: handled with overlapping scalar stores

        # Bulk path, n >= 16. rep stosq writes (n - 1) / 8 qwords starting at
        # dest, which leaves between 1 and 8 trailing bytes unwritten; the
        # single unaligned qword store to [dest + n - 8, dest + n) covers them.
        leaq    -1(%rdx), %rcx
        movq    %rdi, %r8                       # stosq advances rdi; keep dest for the return value
        shrq    $3, %rcx                        # rcx = (n - 1) / 8
        movq    %rax, -8(%rdi,%rdx)             # tail qword, stored before rdi is modified
        rep stosq
        movq    %r8, %rax
        ret

        # Small path, 0 <= n <= 15. Each step stores the same width at the
        # front and at the back of the buffer; the two stores may overlap,
        # so every length in the step's range is covered without a loop.
        # rdx < 16 here, so the 32-bit views of rdx hold the full length.
.Lmemset_small:
        testl   %edx, %edx
        jz      .Lmemset_done                   # n == 0: nothing to store

        movb    %al, (%rdi)                     # byte 0
        movb    %al, -1(%rdi,%rdx)              # byte n-1
        cmpl    $2, %edx
        jbe     .Lmemset_done                   # n = 1..2 fully written

        movb    %al, 1(%rdi)                    # byte 1
        movb    %al, -2(%rdi,%rdx)              # byte n-2
        cmpl    $4, %edx
        jbe     .Lmemset_done                   # n = 3..4 fully written

        movl    %eax, (%rdi)                    # bytes 0..3
        movl    %eax, -4(%rdi,%rdx)             # bytes n-4..n-1
        cmpl    $8, %edx
        jbe     .Lmemset_done                   # n = 5..8 fully written

        movl    %eax, 4(%rdi)                   # bytes 4..7
        movl    %eax, -8(%rdi,%rdx)             # bytes n-8..n-5; n = 9..15 fully written

.Lmemset_done:
        movq    %rdi, %rax                      # rdi is untouched on this path
        ret

.size memset, .-memset