Just wrote a 64-bit version of fillchar.

Just wrote a 64-bit version of fillchar.

It's consistently about 2x to 3x faster than the current RTL code. It should also work under OSX, because position-independent code (PIC) is free in x64.

I don't have access to every CPU out there, so if anyone wants to speed-test it on their CPU, please feel free. If it's slower, I'd be really interested.
On my CPUs (AMD K10 and Intel Core 2) it is faster for any size (even very small sizes).

{$ifdef CPUX64}
// FillCharJB - fill Count bytes at Dest with Value (x64 inline assembly).
//
// Register use on entry (as relied on by the code below):
//   rcx = Dest, rdx = Count, r8b = Value
// NOTE(review): that is the Microsoft x64 argument order; confirm the register
// assignment before building for a target with a different calling convention.
// Scratch: rax, rcx, rdx, r8, r9, r10, r11 and the flags. No stack is used.
//
// Strategy:
//   Count <= 0  : return without writing anything (see FIX notes below).
//   Count 1..3  : computed jump into the 4-byte slots starting at @do3.
//   Count 4..31 : computed jump into the dword stores at @SmallFill, followed
//                 by one overlapping dword store that covers the last 4 bytes.
//   Count >= 32 : one unaligned qword store at each unaligned end, then
//                 aligned qword stores: 0..7 of them via a computed jump into
//                 @partial, the rest in 64-byte blocks (@Doloop64) or, from
//                 8192 blocks (512 KB) upwards, with non-temporal stores
//                 (@FillHuge).
//
// The computed jumps rely on every table entry being encoded in exactly
// 4 bytes; the "rep" prefixes on some stores only pad the encoding to that
// size. Recount the bytes before changing any of those instructions.
//
// FIX: the original compared only edx against 32, so a negative Count (or any
// Count whose low 32 bits looked negative or small) reached @Below32 and made
// a wild computed jump. Count is now compared as a 64-bit signed value and
// Count <= 0 returns immediately.
procedure FillCharJB(var Dest; Count: NativeInt; Value: Byte);
asm
              .noframe
              .align 16
              movzx r8,r8b           //r8 = Value, upper bits cleared
              mov rax,$0101010101010101
              mov r9d,edx            //r9d = Count; consumed only by @Below32
              imul rax,r8            //rax = Value in each of its 8 bytes
              cmp rdx,32             //FIX: test all 64 bits of Count (was edx)
              jl @Below32            //signed compare: Count < 0 also lands there
@Above32:     mov r11,rcx
              mov r8b,7              //r8 = 7, the alignment mask (r8 was zero-extended)
              lea r9,[rcx+rdx]       //r9 = address one past the last byte
              and r11,r8             //r11 = Dest and 7; zero when Dest is 8-byte aligned
              jz @tail
@NotAligned:  mov [rcx],rax          //unaligned store covers the bytes up to the boundary
              xor rcx,r11            //clear the low 3 bits of Dest (round down) ...
              lea rdx,[rdx+r11-8]    //Count := Count - (8 - misalignment)
              add rcx,8              //... and step to the next 8-byte boundary
@tail:        test r9,r8             //is the end address 8-byte aligned?
              jz @alignOK
@tailwrite:   mov [r9-8],rax         //no: unaligned store covers the last 8 bytes
              and r9,r8              //r9 = number of bytes past the last boundary
              sub rdx,r9             //Count := Count - those tail bytes
@alignOK:     mov r10,rdx            //r10 = aligned bytes still to fill
              mov r8b,64             //r8 = 64, the block size of the loops below
              and edx,(32+16+8)      //edx = bytes that do not fill a whole 64-byte block
              mov r9,rdx             //mov keeps the flags produced by the "and"
              jz @Initloop64         //no partial block to write
@partialloop: shr r9,1               //(bytes/8)*4: every table store is 4 bytes long
              lea r11,[rip +@partial+(4*7)] //address just past the 7 table stores
              sub r11,r9             //step back one store per qword to write
              add rcx,rdx            //advance Dest past the partial bytes
              test r10,r10           //ZF=0 here: r10 includes the partial bytes
              jmp r11                //execute the last (edx/8) stores of @partial
              nop                    //FIX: never executed; 1-byte filler (was the 2-byte
                                     //"rep nop") so the extra REX byte of "cmp rdx,32"
                                     //leaves the offsets from @Initloop64 up to @Below32 unchanged
@Initloop64:  shr r10,6              //r10 = number of whole 64-byte blocks
              jz @done               //none left: finished
              mov rdx,r10            //rdx = block counter
              shr r10,(19-6)         //non-zero from 8192 blocks (512 KB) upwards
              jnz @InitFillHuge      //large fill: use non-temporal stores
@Doloop64:    add rcx,r8             //advance first; the stores use negative offsets
              dec edx                //sets ZF for the jnz below (mov leaves flags alone);
                                     //32 bits suffice, the counter is below 8192 here
              mov [rcx-64+00H],rax
              mov [rcx-64+08H],rax
              mov [rcx-64+10H],rax
              mov [rcx-64+18H],rax
              mov [rcx-64+20H],rax
              mov [rcx-64+28H],rax
              mov [rcx-64+30H],rax
              mov [rcx-64+38H],rax
              jnz @DoLoop64
@done:        rep ret
              db $66,$66,$0f,$1f,$44,$00,$00 //nop7 (filler, never executed)
//Jump table for the partial block: entered at @partial+4*(7-n) to perform the
//last n stores. rcx has already been advanced, so they fill the n qwords
//immediately below rcx.
@partial:     mov [rcx-64+08H],rax
              mov [rcx-64+10H],rax
              mov [rcx-64+18H],rax
              mov [rcx-64+20H],rax
              mov [rcx-64+28H],rax
              mov [rcx-64+30H],rax
              mov [rcx-64+38H],rax
              jnz @Initloop64        //flags come from "test r10,r10" above
              rep ret
@InitFillHuge:
@FillHuge:    add rcx,r8             //advance first; the stores use negative offsets
              dec rdx                //64-bit block counter; sets ZF for the jnz below
              db $48,$0F,$C3,$41,$C0 // movnti  [rcx-64+00H],rax
              db $48,$0F,$C3,$41,$C8 // movnti  [rcx-64+08H],rax
              db $48,$0F,$C3,$41,$D0 // movnti  [rcx-64+10H],rax
              db $48,$0F,$C3,$41,$D8 // movnti  [rcx-64+18H],rax
              db $48,$0F,$C3,$41,$E0 // movnti  [rcx-64+20H],rax
              db $48,$0F,$C3,$41,$E8 // movnti  [rcx-64+28H],rax
              db $48,$0F,$C3,$41,$F0 // movnti  [rcx-64+30H],rax
              db $48,$0F,$C3,$41,$F8 // movnti  [rcx-64+38H],rax
              jnz @FillHuge
@donefillhuge:mfence                 //fence after the non-temporal stores
              ret
@Below32:     test rdx,rdx           //FIX: Count <= 0 means there is nothing to fill
              jle @do0
              and  r9d,not(3)        //r9 = Count rounded down to a multiple of 4
              jz @SizeIs3            //Count is 1..3
@FillTail:    lea   r10,[rip +@SmallFill+ (7*4)] //address just past the 7-entry table
              sub   r10,r9           //step back 4 code bytes per 4 data bytes
              sub   edx,4            //edx = offset of the last dword of the range
              jmp   r10
//Jump table: each entry is 4 code bytes (the rep prefix is padding only) and
//fills 4 data bytes.
@SmallFill:   rep mov [rcx+24], eax
              rep mov [rcx+20], eax
              rep mov [rcx+16], eax
              rep mov [rcx+12], eax
              rep mov [rcx+08], eax
              rep mov [rcx+04], eax
              mov [rcx],eax
@Fallthough:  mov [rcx+rdx],eax  //overlapping unaligned store fixes up the tail
              ret
@SizeIs3:     shl edx,2           //Count (1..3) * 4 = code bytes to step back
              lea r10,[rip +@do3+ (4*3)]  //address of @do0
              sub r10,rdx
              jmp r10
//Four 4-byte slots: @do3 falls through into @do2.
@do3:         rep mov [rcx+2],al
@do2:         mov [rcx],ax
              ret
@do1:         mov [rcx],al
              rep ret
@do0:         rep ret
end;
{$endif}


PS: I also have a faster version of FillChar32, but my new 32-bit version is still slower for structures > 256 bytes.

Comments

  1. A. Bouchez
     Would it be possible for me to raise issues in the Synopse GitHub? I've got a few other speed issues to take care of and don't want to spam the Google+ group with them all the time.


    Just as a bonus, here is an even faster version of FillChar: +5% (a full 300% faster than the stock FillChar).

    {$ifdef CPUX64}
    // FillChar - fill Count bytes at Dest with Value (x64 inline assembly).
    //
    // NOTE: this listing was posted across two blog comments; the page text
    // between them ("ReplyDelete" and the "2." comment number) is kept where
    // it appears in the middle of @FillHuge and is not part of the procedure.
    //
    // Register use on entry (as relied on by the code below):
    //   rcx = Dest, rdx = Count, r8b = Value
    // NOTE(review): that is the Microsoft x64 argument order; confirm the
    // register assignment before building for another calling convention.
    // Scratch: rax, rcx, rdx, r8, r9, r10, r11 and the flags. No stack is used.
    //
    // Strategy:
    //   Count <= 0  : return without writing anything (see FIX notes below).
    //   Count 1..3  : computed jump into the 4-byte slots starting at @do3.
    //   Count 4..58 : computed jump into the dword stores at @SmallFill, then
    //                 one overlapping dword store that covers the last 4 bytes.
    //   Count >= 59 : always store one qword at Dest, one unaligned qword at
    //                 the end if the end is unaligned, then aligned qword
    //                 stores: 0..7 via a computed jump into @partial, the rest
    //                 in 64-byte blocks (@Doloop64) or, from 8192 blocks
    //                 (512 KB) upwards, with non-temporal stores (@FillHuge).
    //
    // The computed jumps rely on every table entry being encoded in exactly
    // 4 bytes; the "rep" prefixes on mov instructions are padding only.
    // Recount the bytes before changing any of those instructions.
    //
    // FIX: the original compared only edx against 59, so a negative Count (or
    // any Count whose low 32 bits looked negative or small) reached @Below32
    // and made a wild computed jump. Count is now compared as a 64-bit signed
    // value and Count <= 0 returns immediately.
    procedure FillChar(var Dest; Count: NativeInt; Value: Byte);
    asm
                  .noframe
                  .align 16
                  movzx r8,r8b           //r8 = Value, upper bits cleared
                  mov rax,$0101010101010101
                  mov r9d,edx            //r9d = Count; consumed only by @Below32
                  imul rax,r8            //rax = Value in each of its 8 bytes
                  cmp rdx,59             //FIX: test all 64 bits of Count (was edx)
                  jl @Below32            //signed compare: Count < 0 also lands there
    @Above32:     mov r11,rcx
                  mov r8b,7              //r8 = 7, the alignment mask. FIX: the padding-only
                                         //"rep" prefix was dropped here so the extra REX byte
                                         //of "cmp rdx,59" leaves the offsets from here up to
                                         //@Below32 unchanged
                  lea r9,[rcx+rdx]       //r9 = address one past the last byte
                  sub rdx,8              //account for the head qword written next
                  rep mov [rcx],rax      //head store, possibly unaligned (rep = padding)
                  add rcx,8
                  and r11,r8             //r11 = Dest and 7; zero when Dest was aligned
                  jz @tail
    @NotAligned:  xor rcx,r11            //clear the low 3 bits: rcx = next 8-byte boundary
                  lea rdx,[rdx+r11]      //give back the bytes the aligned stores will redo
    @tail:        test r9,r8             //is the end address 8-byte aligned?
                  jz @alignOK
    @tailwrite:   mov [r9-8],rax         //no: unaligned store covers the last 8 bytes
                  and r9,r8              //r9 = number of bytes past the last boundary
                  sub rdx,r9             //Count := Count - those tail bytes
    @alignOK:     mov r10,rdx            //r10 = aligned bytes still to fill
                  and edx,(32+16+8)      //edx = bytes that do not fill a whole 64-byte block
                  mov r8b,64             //r8 = 64; mov keeps the flags of the "and"
                  mov r9,rdx
                  jz @Initloop64         //no partial block to write
    @partialloop: shr r9,1               //(bytes/8)*4: every table store is 4 bytes long
                  lea r11,[rip +@partial+(4*7)] //address just past the 7 table stores
                  sub r11,r9             //step back one store per qword to write
                  add rcx,rdx            //advance Dest past the partial bytes
                  cmp r10,r8             //is at least one whole 64-byte block left?
                  jmp r11                //execute the last (edx/8) stores of @partial
    @Initloop64:  shr r10,6              //r10 = number of whole 64-byte blocks
                  jz @done               //none left: finished
                  mov rdx,r10            //rdx = block counter
                  shr r10,(19-6)         //non-zero from 8192 blocks (512 KB) upwards
                  jnz @InitFillHuge      //large fill: use non-temporal stores
    @Doloop64:    add rcx,r8             //advance first; the stores use negative offsets
                  dec edx                //sets ZF for the jnz below (mov leaves flags alone);
                                         //32 bits suffice, the counter is below 8192 here
                  mov [rcx-64+00H],rax
                  mov [rcx-64+08H],rax
                  mov [rcx-64+10H],rax
                  mov [rcx-64+18H],rax
                  mov [rcx-64+20H],rax
                  mov [rcx-64+28H],rax
                  mov [rcx-64+30H],rax
                  mov [rcx-64+38H],rax
                  jnz @DoLoop64
    @done:        rep ret
                  //db $66,$66,$0f,$1f,$44,$00,$00 //nop7
    //Jump table for the partial block: entered at @partial+4*(7-n) to perform
    //the last n stores. rcx has already been advanced, so they fill the n
    //qwords immediately below rcx.
    @partial:     mov [rcx-64+08H],rax
                  mov [rcx-64+10H],rax
                  mov [rcx-64+18H],rax
                  mov [rcx-64+20H],rax
                  mov [rcx-64+28H],rax
                  mov [rcx-64+30H],rax
                  mov [rcx-64+38H],rax
                  jge @Initloop64        //flags come from "cmp r10,r8": more blocks to do
                  rep ret
                  db $0F,$1F,$40,$00     //4-byte nop (filler, never executed)
    @InitFillHuge:
    @FillHuge:    add rcx,r8             //advance first; the stores use negative offsets
                  dec rdx                //64-bit block counter; sets ZF for the jnz below
                  db $48,$0F,$C3,$41,$C0 // movnti  [rcx-64+00H],rax
                  db $48,$0F,$C3,$41,$C8 // movnti  [rcx-64+08H],rax
                  db $48,$0F,$C3,$41,$D0 // movnti  [rcx-64+10H],rax
                  db $48,$0F,$C3,$41,$D8 // movnti  [rcx-64+18H],rax
                  db $48,$0F,$C3,$41,$E0 // movnti  [rcx-64+20H],rax
                  db $48,$0F,$C3,$41,$E8 // movnti  [rcx-64+28H],rax

    ReplyDelete
  2. db $48,$0F,$C3,$41,$F0 // movnti  [rcx-64+30H],rax
                  db $48,$0F,$C3,$41,$F8 // movnti  [rcx-64+38H],rax
                  jnz @FillHuge
    @donefillhuge:mfence                 //fence after the non-temporal stores
                  rep ret
                  db $0F,$1F,$44,$00,$00  //db $0F,$1F,$40,$00
    @Below32:     test rdx,rdx           //FIX: Count <= 0 means there is nothing to fill
                  jle @do0
                  and  r9d,not(3)        //r9 = Count rounded down to a multiple of 4
                  jz @SizeIs3            //Count is 1..3
    @FillTail:    sub   edx,4            //edx = offset of the last dword of the range
                  lea   r10,[rip +@SmallFill+ (15*4)] //address just past the 15-entry table
                  sub   r10,r9           //step back 4 code bytes per 4 data bytes
                  jmp   r10
    //Jump table: each entry is 4 code bytes (the rep prefix is padding only)
    //and fills 4 data bytes. With Count <= 58 the first entry is never the
    //jump target.
    @SmallFill:   rep mov [rcx+56], eax
                  rep mov [rcx+52], eax
                  rep mov [rcx+48], eax
                  rep mov [rcx+44], eax
                  rep mov [rcx+40], eax
                  rep mov [rcx+36], eax
                  rep mov [rcx+32], eax
                  rep mov [rcx+28], eax
                  rep mov [rcx+24], eax
                  rep mov [rcx+20], eax
                  rep mov [rcx+16], eax
                  rep mov [rcx+12], eax
                  rep mov [rcx+08], eax
                  rep mov [rcx+04], eax
                  mov [rcx],eax
    @Fallthough:  mov [rcx+rdx],eax  //overlapping unaligned store fixes up the tail
                  rep ret

    @SizeIs3:     shl edx,2           //Count (1..3) * 4 = code bytes to step back
                  lea r10,[rip +@do3+ (4*3)]  //address of @do0
                  sub r10,rdx
                  jmp r10
    //Four 4-byte slots: @do3 falls through into @do2.
    @do3:         rep mov [rcx+2],al
    @do2:         mov [rcx],ax
                  ret
    @do1:         mov [rcx],al
                  rep ret
    @do0:         rep ret
    end;
    {$endif}

    ReplyDelete
  3. Johan Bontes or rather use the synopse.info forum...

    ReplyDelete
  4. A. Bouchez
    Hmm OK if you insist. I find the github route much easier though.

    ReplyDelete

Post a Comment