Just wrote a 64-bit version of fillchar.
It's always about 2x-3x faster than the current RTL code. And it should also work under OSX, because PIC is free in x64.
I don't have access to every CPU out there, so if anyone wants to speed test it on their CPU, please feel free. If it's slower I'd be really interested.
On my CPUs (K10 and Intel Core2) it is faster for any size (even very small sizes).
{$ifdef CPUX64}
procedure FillCharJB(var Dest; Count: NativeInt; Value: Byte);
//rcx = dest
//rdx=count
//r8b=value
asm
.noframe
.align 16
movzx r8,r8b //There's no need to optimize for count <= 3
mov rax,$0101010101010101
mov r9d,edx
imul rax,r8 //fill rax with value.
cmp edx,32
jl @Below32
@Above32: mov r11,rcx
mov r8b,7 //code shrink to help alignment.
lea r9,[rcx+rdx] //r9=end of array
and r11,r8 //and 7 See if dest is aligned
jz @tail
@NotAligned: mov [rcx],rax //unaligned write
xor rcx,r11 //align dest
lea rdx,[rdx+r11-8]
add rcx,8
@tail: test r9,r8 //and 7 is tail aligned?
jz @alignOK
@tailwrite: mov [r9-8],rax //no, we need to do a tail write
and r9,r8 //and 7
sub rdx,r9 //dec(count, tailcount)
@alignOK: mov r10,rdx
mov r8b,64 //code shrink to help alignment.
and edx,(32+16+8) //count the partial iterations of the loop
mov r9,rdx
jz @Initloop64
@partialloop: shr r9,1 //every instruction is 4 bytes
lea r11,[rip +@partial+(4*7)] //start at the end of the loop
sub r11,r9 //step back as needed
add rcx,rdx //add the partial loop count to dest
test r10,r10 //do we need to do more loops?
jmp r11 //do a partial loop
rep nop //nop2
@Initloop64: shr r10,6 //any work left?
jz @done //no, return
mov rdx,r10
shr r10,(19-6) //use non-temporal move for > 512kb
jnz @InitFillHuge
@Doloop64: add rcx,r8
dec edx
mov [rcx-64+00H],rax
mov [rcx-64+08H],rax
mov [rcx-64+10H],rax
mov [rcx-64+18H],rax
mov [rcx-64+20H],rax
mov [rcx-64+28H],rax
mov [rcx-64+30H],rax
mov [rcx-64+38H],rax
jnz @DoLoop64
@done: rep ret
db $66,$66,$0f,$1f,$44,$00,$00 //nop7
@partial: mov [rcx-64+08H],rax
mov [rcx-64+10H],rax
mov [rcx-64+18H],rax
mov [rcx-64+20H],rax
mov [rcx-64+28H],rax
mov [rcx-64+30H],rax
mov [rcx-64+38H],rax
jnz @Initloop64 //are we done with all loops?
rep ret
@InitFillHuge:
@FillHuge: add rcx,r8
dec rdx
db $48,$0F,$C3,$41,$C0 // movnti [rcx-64+00H],rax
db $48,$0F,$C3,$41,$C8 // movnti [rcx-64+08H],rax
db $48,$0F,$C3,$41,$D0 // movnti [rcx-64+10H],rax
db $48,$0F,$C3,$41,$D8 // movnti [rcx-64+18H],rax
db $48,$0F,$C3,$41,$E0 // movnti [rcx-64+20H],rax
db $48,$0F,$C3,$41,$E8 // movnti [rcx-64+28H],rax
db $48,$0F,$C3,$41,$F0 // movnti [rcx-64+30H],rax
db $48,$0F,$C3,$41,$F8 // movnti [rcx-64+38H],rax
jnz @FillHuge
@donefillhuge:mfence
ret
@Below32: and r9d,not(3)
jz @SizeIs3
@FillTail: lea r10,[rip +@SmallFill+ (7*4)]
sub r10,r9
sub edx,4
jmp r10
@SmallFill: rep mov [rcx+24], eax
rep mov [rcx+20], eax
rep mov [rcx+16], eax
rep mov [rcx+12], eax
rep mov [rcx+08], eax
rep mov [rcx+04], eax
mov [rcx],eax
@Fallthough: mov [rcx+rdx],eax //unaligned write to fix up tail
ret
@SizeIs3: shl edx,2 //r9 <= 3 r9*4
lea r10,[rip +@do3+ (4*3)]
sub r10,rdx
jmp r10
@do3: rep mov [rcx+2],al
@do2: mov [rcx],ax
ret
@do1: mov [rcx],al
rep ret
@do0: rep ret
end;
{$endif}
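For anyone who wants to take up the speed-test offer, here is a minimal benchmark sketch. It is not part of the original routine: it assumes a console application, that the FillCharJB above is in scope, and that TStopwatch from System.Diagnostics is available (XE2 or later). Block sizes and loop counts are arbitrary choices for illustration.

uses
  System.SysUtils, System.Diagnostics;

procedure BenchFillChar;
const
  Sizes: array[0..4] of NativeInt = (16, 256, 4096, 64*1024, 1024*1024);
  Loops = 10000;
var
  Buf: TBytes;
  Size: NativeInt;
  i: Integer;
  SW: TStopwatch;
begin
  for Size in Sizes do
  begin
    SetLength(Buf, Size);
    SW := TStopwatch.StartNew;
    for i := 1 to Loops do
      System.FillChar(Buf[0], Size, $55);   // stock RTL FillChar
    Writeln(Format('RTL %8d bytes: %d ms', [Size, SW.ElapsedMilliseconds]));
    SW := TStopwatch.StartNew;
    for i := 1 to Loops do
      FillCharJB(Buf[0], Size, $55);        // routine posted above
    Writeln(Format('JB  %8d bytes: %d ms', [Size, SW.ElapsedMilliseconds]));
  end;
end;

Timings at the small sizes are dominated by call overhead, so run enough iterations to get stable numbers.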
PS I also have a faster version of FillChar32, but my new 32-bit version is still slower for structures > 256 bytes.
A. Bouchez, would it be possible for me to raise issues in the Synopse GitHub? I've got a few other speed issues to take care of and don't want to spam the Google+ group with that all the time.
Just as a bonus, an even faster version of fillchar: +5% (a full 300% faster than stock fillchar).
{$ifdef CPUX64}
procedure FillChar(var Dest; Count: NativeInt; Value: Byte);
//rcx = dest
//rdx=count
//r8b=value
asm
.noframe
.align 16
movzx r8,r8b //There's no need to optimize for count <= 3
mov rax,$0101010101010101
mov r9d,edx
imul rax,r8 //fill rax with value.
cmp edx,59 //Use simple code for small blocks.
jl @Below32
@Above32: mov r11,rcx
rep mov r8b,7 //code shrink to help alignment.
lea r9,[rcx+rdx] //r9=end of array
sub rdx,8
rep mov [rcx],rax
add rcx,8
and r11,r8 //and 7 See if dest is aligned
jz @tail
@NotAligned: xor rcx,r11 //align dest
lea rdx,[rdx+r11]
@tail: test r9,r8 //and 7 is tail aligned?
jz @alignOK
@tailwrite: mov [r9-8],rax //no, we need to do a tail write
and r9,r8 //and 7
sub rdx,r9 //dec(count, tailcount)
@alignOK: mov r10,rdx
and edx,(32+16+8) //count the partial iterations of the loop
mov r8b,64 //code shrink to help alignment.
mov r9,rdx
jz @Initloop64
@partialloop: shr r9,1 //every instruction is 4 bytes
lea r11,[rip +@partial+(4*7)] //start at the end of the loop
sub r11,r9 //step back as needed
add rcx,rdx //add the partial loop count to dest
cmp r10,r8 //do we need to do more loops?
jmp r11 //do a partial loop
@Initloop64: shr r10,6 //any work left?
jz @done //no, return
mov rdx,r10
shr r10,(19-6) //use non-temporal move for > 512kb
jnz @InitFillHuge
@Doloop64: add rcx,r8
dec edx
mov [rcx-64+00H],rax
mov [rcx-64+08H],rax
mov [rcx-64+10H],rax
mov [rcx-64+18H],rax
mov [rcx-64+20H],rax
mov [rcx-64+28H],rax
mov [rcx-64+30H],rax
mov [rcx-64+38H],rax
jnz @DoLoop64
@done: rep ret
//db $66,$66,$0f,$1f,$44,$00,$00 //nop7
@partial: mov [rcx-64+08H],rax
mov [rcx-64+10H],rax
mov [rcx-64+18H],rax
mov [rcx-64+20H],rax
mov [rcx-64+28H],rax
mov [rcx-64+30H],rax
mov [rcx-64+38H],rax
jge @Initloop64 //are we done with all loops?
rep ret
db $0F,$1F,$40,$00
@InitFillHuge:
@FillHuge: add rcx,r8
dec rdx
db $48,$0F,$C3,$41,$C0 // movnti [rcx-64+00H],rax
db $48,$0F,$C3,$41,$C8 // movnti [rcx-64+08H],rax
db $48,$0F,$C3,$41,$D0 // movnti [rcx-64+10H],rax
db $48,$0F,$C3,$41,$D8 // movnti [rcx-64+18H],rax
db $48,$0F,$C3,$41,$E0 // movnti [rcx-64+20H],rax
db $48,$0F,$C3,$41,$E8 // movnti [rcx-64+28H],rax
db $48,$0F,$C3,$41,$F0 // movnti [rcx-64+30H],rax
db $48,$0F,$C3,$41,$F8 // movnti [rcx-64+38H],rax
jnz @FillHuge
@donefillhuge:mfence
rep ret
db $0F,$1F,$44,$00,$00 //db $0F,$1F,$40,$00
@Below32: and r9d,not(3)
jz @SizeIs3
@FillTail: sub edx,4
lea r10,[rip +@SmallFill+ (15*4)]
sub r10,r9
jmp r10
@SmallFill: rep mov [rcx+56], eax
rep mov [rcx+52], eax
rep mov [rcx+48], eax
rep mov [rcx+44], eax
rep mov [rcx+40], eax
rep mov [rcx+36], eax
rep mov [rcx+32], eax
rep mov [rcx+28], eax
rep mov [rcx+24], eax
rep mov [rcx+20], eax
rep mov [rcx+16], eax
rep mov [rcx+12], eax
rep mov [rcx+08], eax
rep mov [rcx+04], eax
mov [rcx],eax
@Fallthough: mov [rcx+rdx],eax //unaligned write to fix up tail
rep ret
@SizeIs3: shl edx,2 //r9 <= 3 r9*4
lea r10,[rip +@do3+ (4*3)]
sub r10,rdx
jmp r10
@do3: rep mov [rcx+2],al
@do2: mov [rcx],ax
ret
@do1: mov [rcx],al
rep ret
@do0: rep ret
end;
{$endif}
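Because the head/tail handling and the computed jumps are easy to get wrong, a brute-force correctness check is also worth running. The sketch below is a hypothetical harness, not from the post: it assumes it lives in the same unit as the FillChar above, so the unqualified call resolves to the new routine while System.FillChar still reaches the RTL one.

uses
  System.SysUtils;

procedure VerifyFillChar;
var
  A, B: array[0..1023] of Byte;
  Ofs, Len: Integer;
begin
  for Ofs := 0 to 15 do            // exercise every destination alignment
    for Len := 0 to 512 do         // small and medium lengths, including 0
    begin
      System.FillChar(A, SizeOf(A), $CC);  // sentinel to catch overruns
      System.FillChar(B, SizeOf(B), $CC);
      System.FillChar(A[Ofs], Len, $5A);   // reference: RTL behaviour
      FillChar(B[Ofs], Len, $5A);          // routine above (same unit)
      if not CompareMem(@A, @B, SizeOf(A)) then
        raise Exception.CreateFmt('Mismatch at offset %d, length %d', [Ofs, Len]);
    end;
end;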
Johan Bontes, or rather use the synopse.info forum...
A. Bouchez, hmm, OK, if you insist. I find the GitHub route much easier though.