//
// Copyright (c) 2012. OverHertz Ltd
//
// mmx_strlcpy(dest, source, len)
//
// Copies len bytes from source to dest using 8-byte MMX moves.
//   dest   - destination address (loaded into edi)
//   source - source address (loaded into esi)
//   len    - number of bytes to copy (loaded into ecx)
//
// The copy runs in three stages:
//   1. 64-byte blocks through mm0..mm7,
//   2. 8-byte qwords through mm0,
//   3. the last 0..7 bytes with rep movsb.
//
// edi, esi, ecx and edx are saved on entry and restored on exit.
// mm0..mm7 and the flags are overwritten; emms is issued at the end so the
// MMX state is cleared before any following x87 code.
// No terminating byte is written and overlapping buffers are not handled.
inline procedure mmx_strlcpy(dest, source, len) {
  push edi
  push esi
  push ecx
  push edx
  edi = $dest;
  esi = $source;
  ecx = $len;
  edx = ecx;                // edx keeps the original length for all three stages

  // Stage 1: len / 64 full blocks, eight qwords per iteration.
  shr ecx, 6
  if (ecx <> 0) {
    @loop1:
      movq mm0, qword[esi+0]
      movq mm1, qword[esi+8]
      movq mm2, qword[esi+16]
      movq mm3, qword[esi+24]
      movq mm4, qword[esi+32]
      movq mm5, qword[esi+40]
      movq mm6, qword[esi+48]
      movq mm7, qword[esi+56]

      movq qword[edi+0], mm0
      movq qword[edi+8], mm1
      movq qword[edi+16], mm2
      movq qword[edi+24], mm3
      movq qword[edi+32], mm4
      movq qword[edi+40], mm5
      movq qword[edi+48], mm6
      movq qword[edi+56], mm7

      esi += 64;
      edi += 64;

      ecx--;
    jnz @loop1;
  }

  // Stage 2: (len mod 64) / 8 single qwords.
  ecx = edx;
  and ecx, 63
  shr ecx, 3

  if (ecx <> 0) {
    @loop2:
      movq mm0, qword[esi]
      movq qword[edi], mm0

      esi += 8;
      edi += 8;

      ecx--;
    jnz @loop2;
  }

  // Stage 3: len mod 8 trailing bytes. The mask must be 7, not 3: stage 2
  // works in 8-byte units, so up to 7 bytes can be left over.
  ecx = edx;
  and ecx, 7
  rep movsb                 // copies nothing when ecx is 0
  pop edx
  pop ecx
  pop esi
  pop edi

  emms
}
program WIN32CUI 'test';
#include 'ch.zir';
//
// Copyright (c) 2012. OverHertz Ltd
//
// mmx_strlcpy2(dest, source, len)
//
// Copies len bytes from source to dest, one 8-byte MMX move at a time,
// then finishes the last 0..7 bytes with rep movsb.
//   dest   - destination address (loaded into edi)
//   source - source address (loaded into esi)
//   len    - number of bytes to copy (loaded into ecx)
//
// edi, esi and ecx are saved on entry and restored on exit. mm0 and the
// flags are overwritten; emms is issued at the end.
// No terminating byte is written and overlapping buffers are not handled.
inline procedure mmx_strlcpy2(dest, source, len) {
  push edi
  push esi
  push ecx
  edi = $dest;
  esi = $source;
  ecx = $len;

  shr ecx, 3
  // Guard the loop: with len < 8 the count is 0, and entering the loop
  // anyway would wrap ecx and run far past both buffers.
  if (ecx <> 0) {
    @loop:
      movq mm0, qword[esi]
      movq qword[edi], mm0

      esi += 8;
      edi += 8;

      ecx--;
    jnz @loop;
  }

  // Trailing bytes: the loop above works in 8-byte units, so the remainder
  // is len mod 8 (mask 7), not len mod 4.
  ecx = $len;
  and ecx, 7
  rep movsb                 // copies nothing when ecx is 0
  pop ecx
  pop esi
  pop edi

  emms
}
//
// Copyright (c) 2012. OverHertz Ltd
//
// mmx_strlcpy(dest, source, len)
//
// Copies len bytes from source to dest using 8-byte MMX moves.
//   dest   - destination address (loaded into edi)
//   source - source address (loaded into esi)
//   len    - number of bytes to copy (loaded into ecx)
//
// The copy runs in three stages:
//   1. 64-byte blocks through mm0..mm7,
//   2. 8-byte qwords through mm0,
//   3. the last 0..7 bytes with rep movsb.
//
// edi, esi, ecx and edx are saved on entry and restored on exit.
// mm0..mm7 and the flags are overwritten; emms is issued at the end so the
// MMX state is cleared before any following x87 code.
// No terminating byte is written and overlapping buffers are not handled.
inline procedure mmx_strlcpy(dest, source, len) {
  push edi
  push esi
  push ecx
  push edx
  edi = $dest;
  esi = $source;
  ecx = $len;
  edx = ecx;                // edx keeps the original length for all three stages

  // Stage 1: len / 64 full blocks, eight qwords per iteration.
  shr ecx, 6
  if (ecx <> 0) {
    @loop1:
      movq mm0, qword[esi+0]
      movq mm1, qword[esi+8]
      movq mm2, qword[esi+16]
      movq mm3, qword[esi+24]
      movq mm4, qword[esi+32]
      movq mm5, qword[esi+40]
      movq mm6, qword[esi+48]
      movq mm7, qword[esi+56]

      movq qword[edi+0], mm0
      movq qword[edi+8], mm1
      movq qword[edi+16], mm2
      movq qword[edi+24], mm3
      movq qword[edi+32], mm4
      movq qword[edi+40], mm5
      movq qword[edi+48], mm6
      movq qword[edi+56], mm7

      esi += 64;
      edi += 64;

      ecx--;
    jnz @loop1;
  }

  // Stage 2: (len mod 64) / 8 single qwords.
  ecx = edx;
  and ecx, 63
  shr ecx, 3

  if (ecx <> 0) {
    @loop2:
      movq mm0, qword[esi]
      movq qword[edi], mm0

      esi += 8;
      edi += 8;

      ecx--;
    jnz @loop2;
  }

  // Stage 3: len mod 8 trailing bytes. The mask must be 7, not 3: stage 2
  // works in 8-byte units, so up to 7 bytes can be left over.
  ecx = edx;
  and ecx, 7
  rep movsb                 // copies nothing when ecx is 0
  pop edx
  pop ecx
  pop esi
  pop edi

  emms
}
// Benchmark driver: times 500000 calls of each copy routine on an
// 8192-byte buffer and prints the GetTickCount difference for each one.
//
// Register use in every section below:
//   edi - tick count taken before the loop
//   esi - loop counter
//   eax - tick count taken after the loop, then the elapsed difference
// The mmx_* procedures push/pop edi and esi, so both survive those calls.
// NOTE(review): strlcpy is defined outside this file; the first section
// assumes it also leaves edi and esi intact - confirm.

// Source (str1) and destination (str2) buffers.
// NOTE(review): str1 is never written here, so its contents are whatever
// the declaration provides - confirm that suits what strlcpy copies.
char str1[8192];
char str2[8192];
//////////// Section 1: strlcpy (baseline)
edi = GetTickCount();
for (esi = 1 to 500000) {
  strlcpy(@str2, @str1, 8192);
}
eax = GetTickCount();
eax -= edi;
print('strlcpy: ', eax, 'ms\r\n');
////////// Section 2: mmx_strlcpy2, one qword per loop iteration
edi = GetTickCount();
for (esi = 1 to 500000) {
  mmx_strlcpy2(@str2, @str1, 8192);
}
eax = GetTickCount();
eax -= edi;
print('mmx_strlcpy (old): ', eax, 'ms\r\n');
////////// Section 3: mmx_strlcpy, 64 bytes per loop iteration
edi = GetTickCount();
for (esi = 1 to 500000) {
  mmx_strlcpy(@str2, @str1, 8192);
}
eax = GetTickCount();
eax -= edi;
print('mmx_strlcpy: ', eax, 'ms\r\n');
// Called after the results are printed and before exiting.
// NOTE(review): presumably waits for a key press so the output stays visible - confirm.
wait_key(0);
    
ExitProcess(0);
strlcpy: 3531ms
mmx_strlcpy (old): 2969ms
mmx_strlcpy: 1328ms