|
/*
 * Greatest common divisor by repeated subtraction.
 *
 * Returns 1 when both arguments are zero, and the non-zero argument when
 * exactly one of them is zero.  Otherwise the larger value is reduced by
 * the smaller one until the two are equal.
 */
unsigned int gcd (unsigned int a, unsigned int b)
{
    if (a == 0 && b == 0)
        return 1;
    if (b == 0)
        return a;
    if (a == 0)
        return b;

    while (a != b) {
        if (a < b)
            b -= a;
        else
            a -= b;
    }
    return b;
}
; WATCOM C/C++ v10.0a output gcd: mov ebx,eax mov eax,edx test ebx,ebx jne L1 test edx,edx jne L1 mov eax,1 ret L1: test eax,eax jne L2 mov eax,ebx ret L2: test ebx,ebx je L5 L3: cmp ebx,eax je L5 jae L4 sub eax,ebx jmp L3 L4: sub ebx,eax jmp L3 L5: ret |
; Author: Paul Hsieh gcd: neg eax je L3 L1: neg eax xchg eax,edx L2: sub eax,edx jg L2 jne L1 L3: add eax,edx jne L4 inc eax L4: ret |
x = min(a,b); y = max(a,b); while( y != 0 ) { x = min(x,y-x); // min (x,y-x) y -= x; // (y-x)+(x) - min(x,y-x) } |
/*
 * Subtractive GCD whose loop body contains no conditional branch.
 *
 * Intended for non-negative x and y whose sum fits in an int.
 * Returns 1 when both arguments are zero, and the non-zero argument when
 * exactly one of them is zero.
 *
 * Throughout the loop `y` holds the SUM of the current pair and `x` holds
 * one member of it, so the other member is always (y - x).  Each pass
 * replaces the pair (x, other) by (min, max - min), whose sum is max.
 */
int gcd(int x, int y)
{
    int t;

    y += x;                     /* y = x + other                          */
    if (y == 0)
        return 1;               /* both inputs were zero                  */

    do {
        t = (y - x) - x;        /* other - x, without ever forming 2*x    */
        x += t & -(t < 0);      /* add t only when negative: min(x,other) */
        y -= x;                 /* sum - min == max(x, other)             */
    } while (x != 0);

    return y;
}
; WATCOM C/C++ v10.0a output gcd: add edx,eax jne L1 mov eax,1 jmp L2 L1: mov ebx,edx ; 1 dep: edx add eax,eax ; 1 sub ebx,eax ; 2 dep: eax mov ecx,ebx ; 3 dep: ebx sar ecx,1fH ; 4 dep: ecx sar eax,1 ; 4/5 shifters and ebx,ecx ; 5 dep: ecx add eax,ebx ; 6 dep: ebx sub edx,eax ; 7 dep: eax test eax,eax ; 7 jne L1 ; 7 mov eax,edx L2: |
; Author: Paul Hsieh gcd: add edx,eax jne L1 mov eax,1 jmp L2 L1: mov ebx,edx ; 1 dep: edx add eax,eax ; 1 sub ebx,eax ; 2 dep: eax shr eax,1 ; 2 mov ecx,ebx ; 3 dep: ebx sar ebx,1fH ; 3 and ebx,ecx ; 4 dep: ecx add eax,ebx ; 5 dep: ebx sub edx,eax ; 6 dep: eax test eax,eax ; 6 jne L1 ; 6 mov eax,edx L2: |
#define SLIM (64)

/*
 * Lookup table of gcd values for operands below SLIM, indexed [a][b].
 * Nothing visible here fills it; it must be populated before uintGcd is
 * called (only entries with a >= b >= 1 are read by uintGcd).
 */
static unsigned smallGcds[SLIM][SLIM];

/*
 * GCD of two unsigned values using shift-and-subtract reduction, finishing
 * with a table lookup once the larger operand drops below SLIM.
 *
 * If exactly one argument is zero the other is returned; if both are zero
 * the result is 1.  These cases are handled up front because the reduction
 * loop below cannot terminate with a zero divisor.
 */
static unsigned uintGcd (unsigned a, unsigned b)
{
    unsigned c, d;

    if (a == 0 || b == 0) {
        c = a | b;
        return (c != 0) ? c : 1;
    }

    /* Divide out common low powers of 2 (to decrease a, b) */
    d = 1;
    while (((a | b) & 1) == 0) {
        a >>= 1;
        b >>= 1;
        d <<= 1;
    }

    for (;;) {
        /* Keep a >= b; a == 0 means the last subtraction was exact */
        if (a < b) {
            if (a == 0)
                return d * b;
            c = a; a = b; b = c;
        }

        if (a < SLIM)
            break;

        /*
         * Find the largest 2^t*b that is <= a.  Testing c <= a - c instead
         * of doubling first keeps c from wrapping around when a >= 2^31.
         */
        c = b;
        while (c <= a - c)
            c += c;

        /* a -= 2^t*b */
        a -= c;
    }

    /* Return with precalculated small gcds */
    return d * smallGcds[a][b];
}
; int gcdAsm(int a, int b) ; ; computes gcd(a,b) according to: ; 1. a even, b even: gcd(a,b) = 2 * gcd(a/2,b/2), ; and remember how often this happened ; 2. a even, b uneven: gcd(a,b) = gcd(a/2,b) ; 3. a uneven, b even: gcd(a,b) = gcd(a,b/2) ; 4. a uneven, b uneven: a>b ? a -= b : b -= a, ; i.e. gcd(a,b) = gcd(min(a,b),max(a,b) - min(a,b)) ; do 1., repeat 2. - 4. until a = 0 or b = 0 ; return (a + b) corrected by the remembered value from 1. BITS 32 GLOBAL _gcdAsm SECTION .text _gcdAsm: push ebp mov ebp,esp push ebx push ecx push edx push edi mov eax,[ebp + 8] ; eax = a (0 <= a <= 2^31 - 1) mov ebx,[ebp + 12] ; ebx = b (0 <= b <= 2^31 - 1) ; by definition: gcd(a,0) = a, gcd(0,b) = b, gcd(0,0) = 1 ! mov ecx,eax or ecx,ebx bsf ecx,ecx ; greatest common power of 2 of a and b jnz notBoth0 mov eax,1 ; if a = 0 and b = 0, return 1 jmp done notBoth0: mov edi,ecx test eax,eax jnz aNot0 mov eax,ebx ; if a = 0, return b jmp done aNot0: test ebx,ebx jz done ; if b = 0, return a bsf ecx,eax ; "simplify" a as much as possible bsf ecx,ebx ; "simplify" b as much as possible mainLoop: mov ecx,ebx sub ecx,eax ; b - a sbb edx,edx ; edx = 0 if b >= a or - 1 if a > b and ecx,edx ; ecx = 0 if b >= a or b - a if a > b add eax,ecx ; a-new = min(a,b) sub ebx,ecx ; b- new = max(a,b) sub ebx,eax ; the difference is >= 0 bsf ecx,eax ; "simplify" as much as possible by 2 shr eax,cl bsf ecx,ebx ; "simplify" as much as possible by 2 shr ebx,cl jnz mainLoop ; keep looping until ebx = 0 mov ecx,edi ; shift back with common power of 2 shl eax,cl done: pop edi pop edx pop ecx pop ebx mov esp,ebp pop ebp ret ; eax = gcd(a,b) |
/*---------------------------------------------------//*! * brief Computes GCD of two 32-bit unsigned integers * param x First unsigned integer * param y Second unsigned integer * return gcd(x,y) * note gcd(x,0) -> x, gcd(0,y) -> y * note Implemented in x86 assembler (PentiumPro and * above only as cmov instructions are used) * note Send all comments/whines to wili@hybrid.fi * todo [wili 021026] Implement another version that * uses sbb trickery rather than cmov * instructions *-----------------------------------------------------*/ #pragma warning(disable:4035) // no missing return value inline unsigned int gcd (Uint32 x, Uint32 y) { __asm { mov ecx, dword ptr [y] mov edx, dword ptr [x] test ecx, ecx mov eax, edx je done ;// if (y = 0) -> return x test eax, eax mov eax, ecx je done ;// if (x = 0) -> return y push edi bsf ebx, eax ;// ebx = trailingZeroes(y) bsf ecx, edx ;// ecx = trailingZeroes(x) cmp ebx, ecx mov edi, ecx cmovl edi, ebx ;// edi = min(ebx,ecx) shr edx, cl ;// x >>= trailingzeroes (x) mov ecx, ebx shr eax, cl ;// y >>= trailingzeroes (y) align 8 mainloop: ;// for (;;) cmp eax, edx mov ecx, eax je skiploop ;// if (x == y) -> break cmovb eax, edx cmovb edx, ecx ;// if (y > x) swap(x,y) sub eax, edx ;// x -= y bsf ecx, eax shr eax, cl ;// x >>= trailingzeroes (x) jmp mainloop align 8 skiploop: mov ecx, edi shl eax, cl ;// return x<<finalShiftLeft pop edi done: ;// return value in eax } } #pragma warning(default:4035) |
/*
 * Scan the first `tablelength` entries of `table` and return the key of the
 * first entry whose key is non-zero, or 0 when every key is zero (or the
 * table is empty).
 *
 * The index is unsigned so that it has the same type as the length it is
 * compared against.
 */
unsigned int existnzkey (struct foobar * table, unsigned int tablelength)
{
    unsigned int i;

    for (i = 0; i < tablelength; i++) {
        if (table[i].key != 0)
            return table[i].key;
    }
    return 0;
}
/*
 * Return the first non-zero key among the first `tablelength` entries of
 * `table`.  When no entry qualifies the value returned is the last key
 * examined (necessarily 0), or 0 for an empty table.
 */
static unsigned int existnzkey (unsigned int tablelength, struct foobar *table)
{
    int remaining = tablelength;    /* entries still to examine      */
    int idx = 0;                    /* position of the current entry */
    int found = 0;                  /* most recently read key        */

    while (remaining != 0) {
        found = table[idx].key;
        if (found != 0)
            return found;
        ++idx;
        --remaining;
    }
    return found;
}
/* One table entry as scanned by existnzkey(). */
struct foobar {
    unsigned key;       /* existnzkey() looks for the first entry where this is non-zero */
    void    *content;   /* opaque pointer; not touched by the code shown here */
};
; compiler xor eax,eax test ecx,ecx je L2 L1: mov eax,[esi] ; 1 U - test eax,eax ; 2 U jne L2 ; 2 V add esi,8 ; 3 U dec ecx ; 3 V jne L1 ; 5 - V +1brt L2: |
; by Pisen Chiang and Paul Hsieh xor eax,eax test ecx,ecx je L2 sub esi,8 L1: add esi,8 ; 1 U sub ecx,1 ; 1 V sbb eax,eax ; 2 U - or eax,[esi] ; 3 U jz L1 ; 4 V +1brt mov eax,[esi] L2: |
typedef unsigned long UINT;

// Updates the two-word value (b1:b0) in place.
//   new b0 = ((b1 << 1) | (b0 >> 31)) ^ ((b1 << 2) | (b0 >> 30))
//   new b1 = (b1 >> 31) ^ (b1 >> 30) ^ (b0 & 0x7FFFFFFF)
// Author's note on the intent: x = (x>>31)^(x>>30)^(x<<32) (mod 2^63).
// NOTE(review): the shift counts suggest UINT is expected to be 32 bits wide -- confirm.
void TestC(UINT &b0, UINT &b1)
{
    const UINT lo = b0;
    const UINT hi = b1;

    // (b1:b0) shifted left by one and by two, low word of each.
    const UINT shiftedBy1 = (hi << 1) | (lo >> 31);
    const UINT shiftedBy2 = (hi << 2) | (lo >> 30);

    // Top bits of the high word.
    const UINT top1 = hi >> 31;
    const UINT top2 = hi >> 30;

    // '&' binds tighter than '^'; the grouping is spelled out here.
    b1 = top1 ^ top2 ^ (lo & 0x7FFFFFFF);
    b0 = shiftedBy1 ^ shiftedBy2;
}
; compiler lea esi,[edx*2] ; 1 U - shr ebx,31 ; 2 U - or esi,ebx ; 3 U - mov ebx,eax ; 4 U - lea ecx,[edx*4] ; 5 U shr ebx,30 ; 5 V or ebx,ecx ; 7 U - +agi mov ecx,edx ; 8 U - shr ecx,31 ; 9 U - shr edx,30 ;10 U - xor edx,ecx ;11 U - xor edx,eax ;12 U - mov eax,esi ;13 U and edx,07FFFFFFFh ;13 V xor eax,ebx ;14 U |
; by Paul Hsieh and Pisen Chiang mov esi,edx ; 1 U xor cl,cl ; 1 V shld edx,eax,1 ; 5 NP +4c* shld esi,eax,2 ; 9 NP +4c adc cl,0 ;10 U xor edx,esi ;10 V xchg eax,edx ;12 NP +2c xor dl,cl ;13 U |
; by hand for Pentiums mov esi,edx ; 1 U xor cl,cl ; 1 V shl esi,2 ; 2 U mov ebx,eax ; 2 V adc cl,0 ; 3 U lea edx,[edx*2] ; 3 V shr ebx,30 ; 4 U xor edx,esi ; 4 V xor edx,ebx ; 5 U xor al,cl ; 5 V shr ebx,1 ; 6 U - xchg eax,edx ; 8 NP +2c xor eax,ebx ; 9 U |
; by Andrew Howe outerloop: lea ebx,[edi+ecx*4] mov eax,[edi] cmploop: sub ebx,4 cmp eax,[ebx] jle notyet xchg eax,[ebx] notyet: cmp ebx,edi jnz cmploop stosd loop outerloop |
; compiler mov edx,ebx cmp byte ptr [ebx],0 je l2 l1: mov ah,[ebx+1] ; U inc ebx ; V test ah,ah ; U jne l1 ; V +1brt l2: sub ebx,edx |
; by Paul Hsieh lea ecx,[ebx-1] l1: inc ecx test ecx,3 jz l2 cmp byte ptr [ecx],0 jne l1 jmp l6 l2: mov eax,[ecx] ; U add ecx,4 ; V test al,al ; U jz l5 ; V test ah,ah ; U jz l4 ; V test eax,0ff0000h ; U jz l3 ; V test eax,0ff000000h ; U jnz l2 ; V +1brt inc ecx l3: inc ecx l4: inc ecx l5: sub ecx,4 l6: sub ecx,ebx |
; by Paul Hsieh lea ecx,[ebx-1] l1: inc ecx test ecx,3 jnz l3 l2: mov edx,[ecx] ; U mov eax,07F7F7F7Fh ; V and eax,edx ; U add ecx,4 ; V add eax,07F7F7F7Fh ; U or eax,edx ; U and eax,080808080h ; U cmp eax,080808080h ; U je l2 ; V +1brt sub ecx,4 l3: cmp byte ptr [ecx],0 jne l1 sub ecx,ebx |
/*
 * Length of the NUL-terminated string `s`, scanning one machine word at a
 * time whenever the cursor is word-aligned.
 *
 * For a word d, (((d & M1) + M1) | d) & M2 has bit 7 of a byte set exactly
 * when that byte is non-zero, so the result equals M2 only if none of the
 * four bytes is zero.  The masks are written for a 32-bit int.
 *
 * The word is handled as unsigned so that the addition cannot overflow a
 * signed int.  Aligned word reads may fetch bytes past the terminator, but
 * never beyond the aligned word that contains it.
 */
int fstrlen(char *s) {
    char *p;
    unsigned int d;
#define M1 (0x7f7f7f7f)
#define M2 (0x80808080)
#define SW (sizeof(int)/sizeof(char))

    p = s;
    for (;;) {
        /* Only the low address bits matter for the alignment test. */
        if ((((unsigned long)p) & (SW-1)) == 0) {
            do {
                d = *((unsigned int *)p);
                p += SW;
                d = (((d&M1)+M1)|d) &M2;
            } while (d == M2);
            p -= SW;    /* step back to the word that holds a zero byte */
        }
        if (*p == 0)
            break;
        p++;
    }
    return (int)(p - s);
}
; MMX version by Ryan Mack ; Roughly 13 + 3n + BRANCH clocks on a P-II const unsigned __int64 STRINGTBL[8] = {0, 0xff, 0xffff, 0xffffff, 0xffffffff, 0xffffffffff, 0xffffffffffff, 0xffffffffffffff} /* ... */ pxor mm1, mm1 mov ecx, eax mov edx, eax and ecx, -8 and eax, 7 movq mm0, [ecx] por mm0, [STRINGTBL+eax*8] MAIN: add ecx, 8 pcmpeqb mm0, mm1 packsswb mm0, mm0 movd eax, mm0 movq mm0, [ecx] test eax, eax jz MAIN bsf eax, eax shr eax, 2 lea ecx, [ecx+eax-8] sub ecx, edx |
Nakon nekoliko minuta istraživanja pokazalo se da je originalno otkrivanje nultog bajta Alana Mycrofta superiornije u odnosu na formulu datu na vašem sajtu, jer ima kraće lance
zavisnosti. Na PIII su performanse funkcije strcpy() skočile za skoro 20% nakon zamene jedne formule drugom. Mycroftov makro
glasi:
Uostalom, jedna od originalnih poruka Mycroft-a o ovom macro se može naći na Deja. Poruka datira iz 27-04-1987! |
/*
 * Non-zero when any of the four bytes of the 32-bit value x is zero.
 * The argument is parenthesised and the constants are unsigned so the
 * subtraction wraps instead of overflowing a signed int.
 */
#define hasNulByte(x) (((x) - 0x01010101u) & ~(x) & 0x80808080u)
#define SW (sizeof (int) / sizeof (char))

/*
 * Length of the NUL-terminated string `s`.  Bytes are examined one at a
 * time until the cursor is word-aligned, then whole words are tested with
 * hasNulByte() until one contains a zero byte; the byte loop then locates
 * it.  Aligned word reads may fetch bytes past the terminator, but never
 * beyond the aligned word that contains it.
 */
int xstrlen (const char *s)
{
    const char *p = s;
    unsigned int d;

    for (;;) {
        /* Only the low address bits matter for the alignment test. */
        if ((((unsigned long) p) & (SW - 1)) == 0) {
            do {
                d = *((const unsigned int *) p);
                p += SW;
            } while (!hasNulByte (d));
            p -= SW;    /* step back to the word that holds a zero byte */
        }
        if (*p == 0)
            break;
        p++;
    }
    return (int) (p - s);
}
char * srcptr; char * destptr; /* ... */ /* copy scanline except where transparent. */ for(x=x0;x<xl;x++) { if( srcptr[x]!=0 ) destptr[x]=srcptr[x]; } |
; by Russ Williams pushad mov esi,[pbSrc] mov ecx,[w] shr ecx,2 mov edi,[pbDest] sub edi,esi looptop: xor ebx,ebx mov eax,[esi] add esi,4 mov edx,eax ror edx,16 add al,255 ; cf = 1 if al!=0 or 0 if al==0 sbb bl,0 ; bl = -1 if al!=0 or 0 if al==0 add ah,255 sbb bh,0 mov eax,edx shl ebx,16 add al,255 sbb bl,0 add ah,255 sbb bh,0 mov eax,[edi] ror edx,16 and eax,ebx ; dest &bg.mask (P6: PRS) or eax,edx ; merge with src dec ecx mov [edi+esi],eax jnz looptop popad |
; by Paul Hsieh sub esi,edi looptop: mov eax,[esi+edi] ; read sprite source bits mov ebx,7f7f7f7fh and ebx,eax add ebx,7f7f7f7fh or ebx,eax ; 80808080 bits have non-zero status of bytes. and ebx,80808080h shr ebx,7 ; move to 01010101 bits. add ebx,7f7f7f7fh ; 80==on or 7f==off values in each byte. and ebx,7f7f7f7fh ; 00==on or 7f==off masks. lea eax,[ebx+ebx] ; eax has 00==on or fe==off masks. (P5: AGI stall) or eax,ebx ; eax has 00==on or ff==off masks. mov ebx,eax not ebx ; ebx has 00==off or ff==on masks. and eax,[edi] ; background bits or eax,[esi+edi] ; merge with sprite bits for final result mov [edi],eax ; draw! add edi,4 dec ecx jnz looptop |
; Idea from Terje Mathisen, implemented by Sean T. Barrett sub esi,edi mov edx,temp sub edx,edi ; edx = (temp-edi) ... loop: mov al,[esi+edi] ; al=src cmp al,1 ; cf = 1 if src==0, 0 if src!=0 sbb ebx,ebx ; ebx = -1 if src==0, 0 if src!=0 and ebx,edx ; ebx = (temp-edi) if src==0, 0 if src! =0 mov [ebx+edi],al ; store to dest (edi) or garbage (temp) inc edi dec ecx jnz loop ; well predicted branch |
; by Paolo Bonzini mov eax, [esi+edi] ; read sprite source mov ebx, [edi] ; source mov ecx, eax and ecx, 7f7f7f7fh add ecx, 7f7f7f7fh ; set bit 7 set if bits 6-0 nonzero or ecx, eax ; set bit 7 if byte nonzero and ecx, 80808080h ; 80 == on or 00 == off xor eax, ebx ; eax = dest ^ source shr ecx, 7 ; 01010101 bits set if byte nonzero add ecx, 7f7f7f7fh ; 80 == on or 7f == off xor ecx, 7f7f7f7fh ; ff == on or 00 == off and eax, ecx ; d^s== on or 00 == off xor ebx, eax ; set to source if on mov [edi], ebx ; set to source if on |
movq mm1, [esi+edi] pxor mm0, mm0 movq mm2, [edi] pcmpeqb mm0, mm1 ; mm0 = 0 if on, ff if off pand mm2, mm0 ; mm2 = 0 if on, d if off por mm2, mm1 ; mm2 = s if on, d if off movq [edi], mm2 |
64-bitni MMX registar ima korisne vrednosti samo u byte7 i byte3 (AXXXBXXX, gde je A/B korisno, a X beskorisno). Želim da kopiram byte7 u byte6, byte5 i byte4, a byte3 u byte2, byte1 i byte0. |
psrld mm0, 24
; 0 0 0 A 0 0 0 B packssdw mm0, mm0 ; 0 A 0 B 0 A 0 B punpckhwd mm0, mm0 ; 0 A 0 A 0 B 0 B pmullw mm0, Const_0101010101010101 ; A A A A B B B B |
psrld mm0, 24 ; 0 0 0 A 0 0 0 B packssdw mm0, mm0 ; 0 A 0 B 0 A 0 B packuswb mm0, mm0 ; A B A B A B A B punpcklbw mm0, mm0 ; A A B B A A B B punpcklbw mm0, mm0 ; A A A A B B B B |
/*
 * Generalised 32-bit shift: a positive x shifts v left by x, a negative x
 * shifts v right by -x, and x == 0 returns v.  Any shift distance of 32 or
 * more in either direction yields 0.
 *
 * The result is confined to the low 32 bits.  The shift is only evaluated
 * when its count is in range, because C leaves out-of-range and negative
 * shift counts undefined, and the negation is done in unsigned arithmetic
 * so that x == INT_MIN is well defined.
 */
unsigned long gensh(unsigned long v, int x)
{
    const unsigned int up = (unsigned int)x;    /* left-shift distance  */
    const unsigned int down = 0u - up;          /* right-shift distance */
    unsigned long a = 0ul;
    unsigned long b = 0ul;

    if (up < 32u)
        a = (v << up) & 0xFFFFFFFFul;
    if (down < 32u)
        b = (v >> down) & 0xFFFFFFFFul;

    return a | b;
}
; by Paul Hsieh mov ebx, eax ; 1 shl eax, cl ; 1 cmp ecx, 32 ; 1 sbb edx, edx ; 2 dep: CF neg ecx ; 2 issue and eax, edx ; 3 dep: edx cmp ecx, 32 ; 3 dep: ecx sbb edx, edx ; 4 dep: CF shr ebx, cl ; 3 dep: ecx and ebx, edx ; 5 dep: edx or eax, ebx ; 6 dep: |
; Microsoft Visual C++ cmp ecx, 32 ; 1 mov esi, eax ; 1 sbb edx, edx ; 2 dep: CF shl esi, cl ; 2 dep: esi neg edx ; 3 dep: edx ?? neg edx ; 4 dep: edx ?? neg ecx ; 1 and edx, esi ; 5 dep: edx cmp ecx, 32 ; 2 dep: ecx sbb esi, esi ; 3 dep: CF sar eax, cl ; 3 issue (ANSI) neg esi ; 4 dep: esi ?? neg esi ; 5 dep: esi ?? and eax, esi ; 6 dep: esi or eax, edx ; 7 dep: |
; WATCOM C/C++ cmp edx, 32 ; 1 setb cl ; 2 dep: CF mov ebx, ecx ; 4 dep: ecx, size mov esi, eax ; 1 and ebx, 255 ; 5 dep: ebx mov cl, dl ; 1 rename cl neg ebx ; 6 dep: ebx shl esi, cl ; 2 dep: esi,cl neg edx ; 2 issue mov ecx, esi ; 3 dep: esi and ebx, esi ; 7 dep: ebx cmp edx, 32 ; 3 dep: edx setb dh ; 4 dep: CF xor ecx, esi ; 4 dep: ecx ?? mov cl, dh ; 5 dep: dh mov esi, ecx ; 7 dep: ecx, size mov cl, dl ; 3 dep: edx neg esi ; 8 dep: esi sar eax, cl ; 5 issue (ANSI) and eax, esi ; 9 dep: esi or eax, ebx ; 10 dep: |
; by Norbert Juffa mov eax, dword ptr [x] ; x (lo) mov ebx, dword ptr [y] ; y (lo) mov edx, dword ptr [x+4] ; x (hi) mov ecx, dword ptr [y+4] ; y (hi) ; here: EDX:EAX = augend, ECX:EBX = addend mov esi, eax ; x lea edi, [eax+66666666h] ; x + 0x66666666 xor esi, ebx ; x ^ y add eax, ebx ; x + y shr esi, 1 ; t1 = (x ^ y) >> 1 add edi, ebx ; x + y + 0x66666666 sbb ebx, ebx ; capture carry rcr edi, 1 ; t2 = (x + y + 0x66666666) >> 1 xor esi, edi ; t1 ^ t2 and esi, 88888888h ; t3 = (t1 ^ t2) & 0x88888888 add eax, esi ; x + y + t3 shr esi, 2 ; t3 >> 2 sub eax, esi ; x + y + t3 - (t3 >> 2) sub edx, ebx ; propagate carry mov esi, edx ; x lea edi, [edx+66666666h] ; x + 0x66666666 xor esi, ecx ; x ^ y add edx, ecx ; x + y shr esi, 1 ; t1 = (x ^ y) >> 1 add edi, ecx ; x + y + 0x66666666 ;;sbb esi, esi ; capture carry rcr edi, 1 ; t2 = (x + y + 0x66666666) >> 1 xor esi, edi ; t1 ^ t2 and esi, 88888888h ; t3 = (t1 ^ t2) & 0x88888888 add edx, esi ; x + y + t3 shr esi, 2 ; t3 >> 2 sub edx, esi ; x + y + t3 - (t3 >> 2) ; here EDX:EAX = sum mov edi, z mov [edi], eax mov [edi+4], edx |
MagicConst dd 48124812h,48124812h ;.... movq mm7, MagicConst pcmpeqb mm6, mm6 ; 1111111111111111 psllw mm6, 12 ; 1111000000000000 @quads: movq mm0, [esi] ; 0006000500080007 movq mm1, mm6 ; XXXX000000000000 pmullw mm0, mm7 ; 8765705800870070 pand mm1, mm0 ; 8765000000000000 psrld mm0, 28 ; 0000000000004321 psrlw mm1, 8 ; 0000000087650000 por mm0, mm1 ; 0000000087654321 packuswb mm0, mm0 ; [?B?A?B?A] packuswb mm0, mm0 ; [BABABABA] movd eax, mm0 mov [edi],ax add esi, 8 add edi, 2 dec ecx jnz @quads |
; Integer Solution by Paul Hsieh LTop: mov eax, [esi] mov ebx, [edx] mov ebp, eax and eax, ebx xor ebx, ebp shr ebx, 1 and ebx, 7F7F7F7Fh add eax, ebx mov [edi], eax dec ecx jnz LTop |
; MMX Solution by Paul Hsieh LTop: movq mm0, [esi] movq mm1, [edx] movq mm2, mm0 pand mm0, mm1 pxor mm1, mm2 psrlb mm1, 1 paddb mm0, mm1 movq [edi], mm0 dec ecx jnz LTop |
Ova rešenja se zasnivaju na sledećim identitetima:
A + B = (A and B) * 2 + (A xor B)
(A + B)/2 = (A and B) + (A xor B)/2 |
; Integer solution by Paul Hsieh mov eax, [esi] ; src0 mov ebx, [edi] ; src1 mov ecx, eax mov edx, ebx and ecx, ebx ; first bit carry xor edx, eax ; first bit add (mod 2) and eax, 0xFEFEFEFE and ebx, 0xFEFEFEFE add eax, ebx ; Add top 7 bits (A&0xFE)+(B&0xFE) sar eax, 1 ; >>= 1 to capture carry bits and ecx, 0x01010101 ; Is there a carry to the second bit? add eax, ecx ; (...)>>1 mov ecx, eax and edx, 0x01010101 ; first bit and eax, 0x7F7F7F7F shr ecx, 7 shl eax, 1 ; (...)&0xFE and ecx, 0x01010101 ; overflows or eax, edx ; ((...)&0xFE) + (((A&0x01)+(B&0x01))&1) xor ecx, 0x81818181 ; blockers sub ecx, 0x01010101 ; 0->80, 1->7F and ecx, 0x7F7F7F7F ; 0->00, 1->7F or eax, ecx shl ecx, 1 or eax, ecx |
A = (A&0xFE) + (A&0x01) B = (B&0xFE) + (B&0x01)
|
Moram da konvertujem vrednost boje (u stvari teksel preuzet iz A4R4G4B4 teksture) u A15R15G15B15 vrednost boje koja se čuva u MMX registrima. |
movzx eax, word ptr input_A4R4G4B4 movzx ebx, al shr eax, 8 movd mm0, dword ptr Table_G4B4_to_G15B15 [ebx*4] movd mm1, dword ptr Table_A4R4_to_A15R15 [eax*4] punpckldq mm0, mm1 |
; Integer SIMD uppercase(string) by Paul Hsieh
mov eax, src[esi] |
; Integer SIMD uppercase(string) on UTF-8 data by Paul Hsieh
mov eax, src[esi] |
; Integer SIMD uppercase(string) on UTF-8 data v2 by Paul Hsieh
mov eax, src[esi] |
; Integer SIMD uppercase(string) on UTF-8 data v2 by Paul Hsieh
uint32_t upperCaseSIMD (uint32_t x) { |
MVBITS (FROM, FROMPOS, LEN, TO, TOPOS) Opis: Kopira sekvencu bitova (bit polje) sa jedne lokacije na drugu. Klasa: Elementarna podrutina Argumenti: Postoje pet argumenata:
|
; Solution by Norbert Juffa mov ecx, [LEN] ; len mov eax, [FROM] ; from cmp ecx, 32 ; (len < 32) ? CY : NC sbb edx, edx ; (len < 32) ? ~0 : 0 shl edx, cl ; (len < 32) ? ((~0) << len) : 0 mov ecx, [FROMPOS]; frompos not edx ; mask = (len < 32) ? ~((~0) << len) : ~0 shr eax, cl ; from >> frompos mov ecx, [TOPOS] ; topos and eax, edx ; (from >> frompos) & mask shl edx, cl ; mask << topos shl eax, cl ; bits << topos not edx ; ~(mask << topos) and edx, [TO] ; to & ~(mask << topos) or eax, edx ; to=(to&~(mask<<topos)|((from>>frompos)&mask) |
if (ECX == 0) { EAX = EDX EDX = 0 } else { /* ECX == 0xFFFFFFFF */ EAX = EAX EDX = EDX } |
; Solution by Norbert Juffa sub eax, edx ; a - d and eax, ecx ; c ? a - d : 0 add eax, edx ; c ? a : d and edx, ecx ; c ? d : 0 |
; Solution by Norbert Juffa __declspec (naked) __int64 __stdcall MYLSHIFT64 (const __int64 *i, const int *sh) { __asm { mov ecx, [esp+8] ; &sh mov eax, [esp+4] ; &i mov ecx, [ecx] ; sh mov edx, [eax+4] ; i_hi mov eax, [eax] ; i_lo shld edx, eax, cl ; sll (i,sh & 0x1f) shl eax, cl ; #if (CPU == i386) test ecx, 32 ; sh >= 32 ? jz $lshift_done ; nope, done mov edx, eax ; sll (i,32) xor eax, eax ; $lshift_done: cmp ecx, 64 ; (sh>=64)||(sh<0) ? NC : CY sbb ecx, ecx ; (sh>=64)||(sh<0) ? 0 : ~0 and eax, ecx ; (sh>=64)||(sh<0) ? 0 : sll(i,sh) and edx, ecx ; #else /* Athlon, P6+ */ test ecx, -64 ; (sh>=64)||(sh<0) ? NZ : ZR ror ecx, 6 ; (64>sh>=32) ? CY : NC(ZF safe) mov ecx, 0 ; 0 cmovc edx, eax ; i=(64>sh>=32) ? sll(i,32) : i cmovc eax, ecx ; cmovnz edx, ecx ; (sh>=64)||(sh<0) ? 0 : sll(i,sh) cmovnz eax, ecx ; #endif ret 8 ; pop two DWORD parms and ret } } |
Imam 2 64-bitna (nikad iznad 2^40, za napomenu) celih brojeva u
C, radim sledeće:
Kompajler pretvara pomeranje u SHL i SHLD, sa čime nemam problem. Međutim, if naredba se prevodi pomoću cmp i skokova, što me, pošto je uslov veoma nepredvidiv, košta dela performansi. Kako bi izgledao 64-bitni kod bez grananja (branchless bit-twiddling) koji radi isto što i gornji? Znam da je 32-bitni ekvivalent neka neobična kombinacija instrukcija sub, sbb, and i sličnih, ali da li se to može preneti na 64 bita? Da li uopšte postoji verzija sa cmovcc? |
Pa, to izgleda kao da želiš nešto ovako:
Dakle, kako da izračunamo -(tpow > p) na potpuno predikatni način? To je isto što i -(p - tpow < 0), pa želimo da izvršimo 64-bitno oduzimanje tpow od p i uhvatimo prenos (carry), zatim ga iskoristimo kao masku nad p (operacijom AND) i rezultat oduzmemo od tpow.
Voila! U redu, mislim da bi cmovCC trebalo da bude još lakši.Vaš kod je ekvivalentan:
Tako da ćemo samo izračunati samo tpow - p, uhvatiti CF zastave, zatim uslovno zameniti tpow:
|
mov eax, 1 lock add [counter], eax ; [[1]] mov eax, 0 lock adc [counter+4], eax ; [[2]] |
; Eighth generation (64bit) x86 code. mov rax, 1 lock xadd [counter], rax inc rax |
retry: mov eax, dword ptr [counter]
; [[1]] mov edx, dword ptr [counter+4] ; [[2]] mov ebx, eax mov ecx, edx add ebx, 1 adc ecx, 0 lock cmpxchg8b qword ptr [counter] ; [[3]] jnz retry |
; by Paul Hsieh, Matt Taylor and Terje Mathisen incrementCounterAndRead: mov eax, 1 ;;cli mov edx, [counter+4] ; [[1]] lock xadd [counter], eax cmp eax, 07FFFFFFFh jne incrDone inc edx mov [counter+4],edx ; [[2]] mov ebx, 080000000h lock add [counter], ebx ; [[3]] ;;sti jmp readDone incrDone: ;;sti jb noLockBit retry: cmp edx, [counter+4] ; Early correction. jne topIsUpToDate mov ebx, [counter] cmp ebx, 07FFFFFFFh ja retry ; Wait for "lock bit" to reset. topIsUpToDate: mov edx, [counter+4] jmp readDone noLockBit: mov ebx, [counter+4] cmp edx, ebx je readDone cmp eax, 040000000h ; bit 30 on -> before wrap. jae readDone mov edx, ebx ; after wrap. readDone: lea eax, [2*eax+1] ; previous low + 1. shrd eax, edx, 1 shr edx, 1 |