# void *memset(void *dest, int c, size_t n)
#
# Fills n bytes starting at dest with the low byte of c and returns dest.
#
# Syntax:   GNU as, AT&T operand order
# In:       %rdi = dest, %sil = fill byte, %rdx = n
# Out:      %rax = dest
# Clobbers: %rsi, flags; additionally %rcx, %r8 and %rdi when n >= 16
#
# Strategy:
#   n >= 16  store the last 8 bytes directly, then rep stosq over the
#            first n/8 quadwords; the tail store covers the n%8 remainder.
#   n <  16  a ladder of paired stores, one anchored at the start of the
#            buffer and one at the end, that overlap so that every length
#            in each range is covered without a loop.
.global memset
.type memset,@function
memset:
	movzbl	%sil, %esi			# rsi = fill byte, zero-extended
	movabsq	$0x0101010101010101, %rax
	# 64-bit imul has 3-7 cycles latency, so start it before the size check
	imulq	%rsi, %rax			# rax = fill byte replicated into all 8 bytes

	cmpq	$16, %rdx
	jb	.Lmemset_small

	# n >= 16
	movq	%rdx, %rcx
	movq	%rdi, %r8			# keep dest: stosq advances rdi
	shrq	$3, %rcx			# rcx = n / 8 quadwords
	movq	%rax, -8(%rdi,%rdx)		# final 8 bytes, i.e. [dest+n-8, dest+n)
	rep stosq				# relies on the direction flag being clear
	movq	%r8, %rax
	ret

.Lmemset_small:
	# n < 16 here, so the 32-bit view of rdx holds the whole count
	testl	%edx, %edx
	jz	.Lmemset_done			# n == 0: nothing to store

	movb	%al, (%rdi)			# first byte
	movb	%al, -1(%rdi,%rdx)		# last byte
	cmpl	$2, %edx
	jbe	.Lmemset_done			# n = 1..2 fully written

	movb	%al, 1(%rdi)			# second byte
	movb	%al, -2(%rdi,%rdx)		# second-to-last byte
	cmpl	$4, %edx
	jbe	.Lmemset_done			# n = 3..4 fully written

	movl	%eax, (%rdi)			# first 4 bytes
	movl	%eax, -4(%rdi,%rdx)		# last 4 bytes
	cmpl	$8, %edx
	jbe	.Lmemset_done			# n = 5..8 fully written

	movl	%eax, 4(%rdi)			# bytes 4..7
	movl	%eax, -8(%rdi,%rdx)		# bytes n-8..n-5; n = 9..15 fully written

.Lmemset_done:
	movq	%rdi, %rax			# rdi is unmodified on this path
	ret