; Code Search for Developers
;
; x86-32.asm from guliverkli at Krugle
;
; Show x86-32.asm syntax highlighted


	.686
	.model flat
	.mmx
	.xmm

	.const

	__uvmin DD 0d01502f9r ; -1e+010
	__uvmax DD 0501502f9r ; +1e+010

	.code
	
;
; memsetd
;

@memsetd@12 proc public

	; Fill a run of dwords with a single 32-bit value.
	;   ecx      = destination address
	;   edx      = value to store
	;   [esp+4]  = number of dwords to store (offset as seen on entry)
	; The stack argument is removed by the callee (ret 4).
	; edi is saved and restored; eax and ecx are overwritten.

	push	edi						; stosd writes through edi

	mov		eax, edx				; eax = fill value
	mov		edi, ecx				; edi = destination
	mov		ecx, [esp+8]			; ecx = dword count (above return address and saved edi)
	cld								; store towards higher addresses
	rep		stosd

	pop		edi

	ret		4

@memsetd@12 endp

;
; SaturateColor
;
			
@SaturateColor_sse2@4 proc public

	; Clamp the four signed dwords at [ecx] to the range 0..255, in place.
	; [ecx] is read and written with movdqa, so it has to be 16-byte aligned.
	; Overwrites xmm0 and xmm1.

	movdqa		xmm1, [ecx]			; the four input dwords
	pxor		xmm0, xmm0			; zero filler for the pack/unpack steps

	packssdw	xmm1, xmm0			; dwords to words with signed saturation
	packuswb	xmm1, xmm0			; words to bytes with unsigned saturation (0..255)
	punpcklbw	xmm1, xmm0			; zero-extend the bytes back to words
	punpcklwd	xmm1, xmm0			; zero-extend the words back to dwords

	movdqa		[ecx], xmm1

	ret

@SaturateColor_sse2@4 endp

@SaturateColor_asm@4 proc public

	; Scalar counterpart of the SSE2 routine: clamp each of the four signed
	; dwords at [ecx] to the range 0..255, in place.
	; esi is saved and restored; eax, ecx and edx are overwritten.

	push	esi

	mov		esi, ecx				; esi = address of the four values

	mov		edx, 000000ffh			; upper bound
	xor		eax, eax				; lower bound

	; One clamp sequence per dword, generated for byte offsets 0, 4, 8 and 12.
	for		bytepos, <0, 4, 8, 12>
	mov		ecx, [esi+bytepos]
	cmp		ecx, eax
	cmovl	ecx, eax				; signed below the lower bound: use the lower bound
	cmp		ecx, edx
	cmovg	ecx, edx				; signed above the upper bound: use the upper bound
	mov		[esi+bytepos], ecx
	endm

	pop		esi

	ret

@SaturateColor_asm@4 endp

;
; helper macros for swizzling/unswizzling, followed by the unswizzling routines
;

punpck macro op, sd0, sd2, s1, s3, d1, d3

	;; Interleave two register pairs at once, producing both the low and the
	;; high halves of each interleave.
	;;
	;; op            suffix appended to punpckl/punpckh (used in this file with
	;;               bw, wd, dq and qdq)
	;; sd0, sd2      xmm register numbers: first operands, overwritten with the
	;;               punpckl results
	;; s1, s3        xmm register numbers: second operands (only read)
	;; d1, d3        xmm register numbers: receive the punpckh results
	;;
	;; Result:
	;;   xmm<sd0> = punpckl<op>(xmm<sd0>, xmm<s1>)
	;;   xmm<d1>  = punpckh<op>(old xmm<sd0>, xmm<s1>)
	;;   xmm<sd2> = punpckl<op>(xmm<sd2>, xmm<s3>)
	;;   xmm<d3>  = punpckh<op>(old xmm<sd2>, xmm<s3>)
	;;
	;; Both copies are made before any unpack, so d1/d3 must not name a register
	;; that is still needed as s1/s3. pshufd with 0e4h is the identity shuffle,
	;; i.e. it is used here as a register copy.

	movdqa					@CatStr(xmm, %d1),	@CatStr(xmm, %sd0)
	pshufd					@CatStr(xmm, %d3),	@CatStr(xmm, %sd2), 0e4h
	
	@CatStr(punpckl, op)	@CatStr(xmm, %sd0),	@CatStr(xmm, %s1)
	@CatStr(punpckh, op)	@CatStr(xmm, %d1),	@CatStr(xmm, %s1)
	@CatStr(punpckl, op)	@CatStr(xmm, %sd2),	@CatStr(xmm, %s3)
	@CatStr(punpckh, op)	@CatStr(xmm, %d3),	@CatStr(xmm, %s3)

	endm
	
punpcknb macro

	;; Nibble-level regrouping of xmm0..xmm3 followed by a byte interleave.
	;;
	;; Expects xmm7 to hold the mask that selects the bits kept from the first
	;; register of each pair; every caller in this file loads it with 0f0f0f0fh
	;; broadcast to all four dwords (the low nibble of each byte).
	;; Uses xmm4, xmm5 and xmm6 as scratch.
	;;
	;; With that mask, for the pair (xmm0, xmm1):
	;;   xmm0 = low nibbles of xmm0, with the low nibbles of xmm1 moved into the
	;;          high-nibble positions
	;;   xmm1 = high nibbles of xmm0 moved into the low-nibble positions, with
	;;          the high nibbles of xmm1 left in place
	;; The pair (xmm2, xmm3) is processed the same way.
	;; The 64-bit shifts carry bits across byte boundaries, but those bits land
	;; in positions that the pand/pandn masks discard.
	;; The macro ends with "punpck bw, 0, 2, 1, 3, 4, 6", so on exit the results
	;; are in xmm0, xmm2, xmm4 and xmm6.

	;; pair (xmm0, xmm1)

	movdqa	xmm4, xmm0
	pshufd	xmm5, xmm1, 0e4h

	psllq	xmm1, 4
	psrlq	xmm4, 4

	movdqa	xmm6, xmm7
	pand	xmm0, xmm7
	pandn	xmm6, xmm1
	por		xmm0, xmm6

	movdqa	xmm6, xmm7
	pand	xmm4, xmm7
	pandn	xmm6, xmm5
	por		xmm4, xmm6

	movdqa	xmm1, xmm4

	;; pair (xmm2, xmm3)

	movdqa	xmm4, xmm2
	pshufd	xmm5, xmm3, 0e4h

	psllq	xmm3, 4
	psrlq	xmm4, 4

	movdqa	xmm6, xmm7
	pand	xmm2, xmm7
	pandn	xmm6, xmm3
	por		xmm2, xmm6

	movdqa	xmm6, xmm7
	pand	xmm4, xmm7
	pandn	xmm6, xmm5
	por		xmm4, xmm6

	movdqa	xmm3, xmm4

	;; byte interleave of the regrouped registers

	punpck	bw, 0, 2, 1, 3, 4, 6

	endm

;
; unSwizzleBlock32
;

@unSwizzleBlock32_sse2@12 proc public

	; Unpack a 256-byte block into 8 destination rows of 32 bytes each.
	;   ecx      = source block, read as sixteen 16-byte vectors (movdqa)
	;   edx      = destination, written with movdqa
	;   [esp+4]  = destination pitch in bytes (offset as seen on entry)
	; Every group of four source vectors is regrouped with a 64-bit interleave
	; and yields two destination rows. The stack argument is popped by ret 4.
	; ebx is saved and restored; eax, edx and xmm0-xmm6 are overwritten.

	push		ebx

	mov			ebx, [esp+8]			; ebx = pitch
	lea			eax, [ebx+ebx*2]		; eax = 3 * pitch

	; source vectors 0..3 -> rows 0 and 1

	movdqa		xmm0, [ecx+16*0]
	movdqa		xmm1, [ecx+16*1]
	movdqa		xmm2, [ecx+16*2]
	movdqa		xmm3, [ecx+16*3]

	punpck		qdq, 0, 2, 1, 3, 4, 6

	movdqa		[edx], xmm0
	movdqa		[edx+16], xmm2
	movdqa		[edx+ebx], xmm4
	movdqa		[edx+ebx+16], xmm6

	; source vectors 4..7 -> rows 2 and 3

	movdqa		xmm0, [ecx+16*4]
	movdqa		xmm1, [ecx+16*5]
	movdqa		xmm2, [ecx+16*6]
	movdqa		xmm3, [ecx+16*7]

	punpck		qdq, 0, 2, 1, 3, 4, 6

	movdqa		[edx+ebx*2], xmm0
	movdqa		[edx+ebx*2+16], xmm2
	movdqa		[edx+eax], xmm4
	movdqa		[edx+eax+16], xmm6

	lea			edx, [edx+ebx*4]		; continue four rows further down

	; source vectors 8..11 -> rows 4 and 5

	movdqa		xmm0, [ecx+16*8]
	movdqa		xmm1, [ecx+16*9]
	movdqa		xmm2, [ecx+16*10]
	movdqa		xmm3, [ecx+16*11]

	punpck		qdq, 0, 2, 1, 3, 4, 6

	movdqa		[edx], xmm0
	movdqa		[edx+16], xmm2
	movdqa		[edx+ebx], xmm4
	movdqa		[edx+ebx+16], xmm6

	; source vectors 12..15 -> rows 6 and 7

	movdqa		xmm0, [ecx+16*12]
	movdqa		xmm1, [ecx+16*13]
	movdqa		xmm2, [ecx+16*14]
	movdqa		xmm3, [ecx+16*15]

	punpck		qdq, 0, 2, 1, 3, 4, 6

	movdqa		[edx+ebx*2], xmm0
	movdqa		[edx+ebx*2+16], xmm2
	movdqa		[edx+eax], xmm4
	movdqa		[edx+eax+16], xmm6

	pop			ebx

	ret			4

@unSwizzleBlock32_sse2@12 endp

;
; unSwizzleBlock16
;

@unSwizzleBlock16_sse2@12 proc public

	; Unpack a 256-byte block into 8 destination rows of 32 bytes each,
	; regrouping at 16-bit granularity.
	;   ecx      = source block, read with movdqa, 64 bytes per pass
	;   edx      = destination, written with movdqa, two rows per pass
	;   [esp+4]  = destination pitch in bytes (offset as seen on entry)
	; Four passes. The stack argument is popped by ret 4.
	; ebx is saved and restored; eax, ecx, edx and xmm0-xmm6 are overwritten.

	push		ebx

	mov			ebx, [esp+8]			; ebx = pitch
	mov			eax, 4					; passes left

	align 16
unSwizzleBlock16_sse2@pass:
	movdqa		xmm0, [ecx+16*0]
	movdqa		xmm1, [ecx+16*1]
	movdqa		xmm2, [ecx+16*2]
	movdqa		xmm3, [ecx+16*3]

	; word interleave, dword interleave, word interleave again
	punpck		wd, 0, 2, 1, 3, 4, 6
	punpck		dq, 0, 4, 2, 6, 1, 3
	punpck		wd, 0, 4, 1, 3, 2, 6

	movdqa		[edx], xmm0
	movdqa		[edx+16], xmm2
	movdqa		[edx+ebx], xmm4
	movdqa		[edx+ebx+16], xmm6

	lea			edx, [edx+ebx*2]		; two rows down
	add			ecx, 64					; next four source vectors

	dec			eax
	jnz			unSwizzleBlock16_sse2@pass

	pop			ebx

	ret			4

@unSwizzleBlock16_sse2@12 endp

;
; unSwizzleBlock8
;

@unSwizzleBlock8_sse2@12 proc public

	; Unpack a 256-byte block into 16 destination rows of 16 bytes each,
	; regrouping at byte granularity.
	;   ecx      = source block, read with movdqa, 128 bytes per loop iteration
	;   edx      = destination, written with movdqa, one 16-byte vector per row
	;   [esp+4]  = destination pitch in bytes (offset as seen on entry)
	; The loop runs twice; each iteration handles two 64-byte halves and
	; writes four rows per half. The halves differ only in which result
	; registers get the adjacent-dword swap (pshufd 0b1h): xmm1/xmm3 in the
	; first half, xmm0/xmm2 in the second.
	; The stack argument is popped by ret 4.
	; ebx is saved and restored; eax, ecx, edx and xmm0-xmm6 are overwritten.

	push		ebx

	; ebx = pitch, eax = loop counter
	mov			ebx, [esp+4+4]
	mov			eax, 2

	align 16
@@:
	; col 0, 2

	movdqa		xmm0, [ecx+16*0]
	movdqa		xmm1, [ecx+16*1]
	movdqa		xmm4, [ecx+16*2]
	movdqa		xmm5, [ecx+16*3]

	; byte, word, byte and qword interleaves; results end up in xmm0..xmm3
	punpck		bw,  0, 4, 1, 5, 2, 6
	punpck		wd,  0, 2, 4, 6, 1, 3
	punpck		bw,  0, 2, 1, 3, 4, 6
	punpck		qdq, 0, 2, 4, 6, 1, 3

	; swap adjacent dwords in the vectors stored to the third and fourth row
	pshufd		xmm1, xmm1, 0b1h
	pshufd		xmm3, xmm3, 0b1h

	movdqa		[edx], xmm0
	movdqa		[edx+ebx], xmm2
	lea			edx, [edx+ebx*2]

	movdqa		[edx], xmm1
	movdqa		[edx+ebx], xmm3
	lea			edx, [edx+ebx*2]

	; col 1, 3

	movdqa		xmm0, [ecx+16*4]
	movdqa		xmm1, [ecx+16*5]
	movdqa		xmm4, [ecx+16*6]
	movdqa		xmm5, [ecx+16*7]

	punpck		bw,  0, 4, 1, 5, 2, 6
	punpck		wd,  0, 2, 4, 6, 1, 3
	punpck		bw,  0, 2, 1, 3, 4, 6
	punpck		qdq, 0, 2, 4, 6, 1, 3

	; here the swap applies to the vectors stored to the first and second row
	pshufd		xmm0, xmm0, 0b1h
	pshufd		xmm2, xmm2, 0b1h

	movdqa		[edx], xmm0
	movdqa		[edx+ebx], xmm2
	lea			edx, [edx+ebx*2]

	movdqa		[edx], xmm1
	movdqa		[edx+ebx], xmm3
	lea			edx, [edx+ebx*2]

	; advance to the second 128 bytes of the source block
	add			ecx, 128

	dec			eax
	jnz			@B

	pop			ebx
	
	ret			4

@unSwizzleBlock8_sse2@12 endp

;
; unSwizzleBlock4
;

@unSwizzleBlock4_sse2@12 proc public

	; Unpack a 256-byte block into 16 destination rows of 16 bytes each,
	; regrouping at nibble granularity (via the punpcknb macro).
	;   ecx      = source block, read with movdqa, 128 bytes per loop iteration
	;   edx      = destination, written with movdqa, one 16-byte vector per row
	;   [esp+4]  = destination pitch in bytes (offset as seen on entry)
	; The loop runs twice; each iteration handles two 64-byte halves and
	; writes four rows per half. The halves differ only in which result
	; registers get the adjacent-word swap (pshuflw/pshufhw 0b1h): xmm1/xmm3
	; in the first half, xmm0/xmm2 in the second.
	; The stack argument is popped by ret 4.
	; ebx is saved and restored; eax, ecx, edx and xmm0-xmm7 are overwritten.

	push		ebx

	; xmm7 = 0fh in every byte, the nibble mask punpcknb relies on
	mov         eax, 0f0f0f0fh
	movd        xmm7, eax 
	pshufd      xmm7, xmm7, 0

	; ebx = pitch, eax = loop counter
	mov			ebx, [esp+4+4]
	mov			eax, 2

	align 16
@@:
	; col 0, 2

	movdqa		xmm0, [ecx+16*0]
	movdqa		xmm1, [ecx+16*1]
	movdqa		xmm4, [ecx+16*2]
	movdqa		xmm3, [ecx+16*3]

	punpck		dq, 0, 4, 1, 3, 2, 6
	punpck		dq, 0, 2, 4, 6, 1, 3
	punpcknb
	punpck		bw, 0, 2, 4, 6, 1, 3
	punpck		wd, 0, 2, 1, 3, 4, 6

	; 0d8h swaps the two middle dwords of each vector
	pshufd		xmm0, xmm0, 0d8h
	pshufd		xmm2, xmm2, 0d8h
	pshufd		xmm4, xmm4, 0d8h
	pshufd		xmm6, xmm6, 0d8h

	punpck		qdq, 0, 2, 4, 6, 1, 3

	; swap adjacent words in the vectors stored to the third and fourth row
	pshuflw		xmm1, xmm1, 0b1h
	pshuflw		xmm3, xmm3, 0b1h
	pshufhw		xmm1, xmm1, 0b1h
	pshufhw		xmm3, xmm3, 0b1h

	movdqa		[edx], xmm0
	movdqa		[edx+ebx], xmm2
	lea			edx, [edx+ebx*2]

	movdqa		[edx], xmm1
	movdqa		[edx+ebx], xmm3
	lea			edx, [edx+ebx*2]

	; col 1, 3

	movdqa		xmm0, [ecx+16*4]
	movdqa		xmm1, [ecx+16*5]
	movdqa		xmm4, [ecx+16*6]
	movdqa		xmm3, [ecx+16*7]

	punpck		dq, 0, 4, 1, 3, 2, 6
	punpck		dq, 0, 2, 4, 6, 1, 3
	punpcknb
	punpck		bw, 0, 2, 4, 6, 1, 3
	punpck		wd, 0, 2, 1, 3, 4, 6

	pshufd		xmm0, xmm0, 0d8h
	pshufd		xmm2, xmm2, 0d8h
	pshufd		xmm4, xmm4, 0d8h
	pshufd		xmm6, xmm6, 0d8h

	punpck		qdq, 0, 2, 4, 6, 1, 3

	; here the word swap applies to the vectors stored to the first and second row
	pshuflw		xmm0, xmm0, 0b1h
	pshuflw		xmm2, xmm2, 0b1h
	pshufhw		xmm0, xmm0, 0b1h
	pshufhw		xmm2, xmm2, 0b1h

	movdqa		[edx], xmm0
	movdqa		[edx+ebx], xmm2
	lea			edx, [edx+ebx*2]

	movdqa		[edx], xmm1
	movdqa		[edx+ebx], xmm3
	lea			edx, [edx+ebx*2]

	; advance to the second 128 bytes of the source block
	add			ecx, 128

	dec			eax
	jnz			@B

	pop			ebx
	
	ret			4

@unSwizzleBlock4_sse2@12 endp

;
; unSwizzleBlock8HP
;

@unSwizzleBlock8HP_sse2@12 proc public

	; Extract the top byte (bits 24..31) of every dword in a 256-byte block
	; and write the bytes as 8 destination rows of 8 bytes each.
	;   ecx      = source block, read with movdqa, 64 bytes per pass
	;   edx      = destination, 8 bytes written per row (movlps/movhps)
	;   [esp+4]  = destination pitch in bytes (offset as seen on entry)
	; Four passes, two rows each. The stack argument is popped by ret 4.
	; ebx is saved and restored; eax, ecx, edx and xmm0-xmm6 are overwritten.

	push		ebx

	mov			ebx, [esp+8]			; ebx = pitch
	mov			eax, 4					; passes left

	align 16
unSwizzleBlock8HP_sse2@pass:
	movdqa		xmm0, [ecx+16*0]
	movdqa		xmm1, [ecx+16*1]
	movdqa		xmm2, [ecx+16*2]
	movdqa		xmm3, [ecx+16*3]

	punpck		qdq, 0, 2, 1, 3, 4, 6

	; keep only bits 24..31 of each dword, as a value 0..255
	psrld		xmm0, 24
	psrld		xmm2, 24
	psrld		xmm4, 24
	psrld		xmm6, 24

	; narrow 16 dwords to 16 bytes; the values already fit, so nothing saturates
	packssdw	xmm0, xmm2
	packssdw	xmm4, xmm6
	packuswb	xmm0, xmm4

	movlps		qword ptr [edx], xmm0		; first row: low 8 bytes
	movhps		qword ptr [edx+ebx], xmm0	; second row: high 8 bytes

	lea			edx, [edx+ebx*2]		; two rows down
	add			ecx, 64					; next four source vectors

	dec			eax
	jnz			unSwizzleBlock8HP_sse2@pass

	pop			ebx

	ret			4

@unSwizzleBlock8HP_sse2@12 endp

;
; unSwizzleBlock4HLP
;

@unSwizzleBlock4HLP_sse2@12 proc public

	; Extract bits 24..27 of every dword in a 256-byte block and write them,
	; one value (0..15) per byte, as 8 destination rows of 8 bytes each.
	;   ecx      = source block, read with movdqa, 64 bytes per pass
	;   edx      = destination, 8 bytes written per row (movlps/movhps)
	;   [esp+4]  = destination pitch in bytes (offset as seen on entry)
	; Four passes, two rows each. The stack argument is popped by ret 4.
	; ebx is saved and restored; eax, ecx, edx and xmm0-xmm7 are overwritten.

	push		ebx

	; xmm7 = 0fh in every byte
	mov			eax, 0f0f0f0fh
	movd		xmm7, eax
	pshufd		xmm7, xmm7, 0

	mov			ebx, [esp+8]			; ebx = pitch
	mov			eax, 4					; passes left

	align 16
unSwizzleBlock4HLP_sse2@pass:
	movdqa		xmm0, [ecx+16*0]
	movdqa		xmm1, [ecx+16*1]
	movdqa		xmm2, [ecx+16*2]
	movdqa		xmm3, [ecx+16*3]

	punpck		qdq, 0, 2, 1, 3, 4, 6

	; bring bits 24..31 of each dword down to bits 0..7
	psrld		xmm0, 24
	psrld		xmm2, 24
	psrld		xmm4, 24
	psrld		xmm6, 24

	; narrow 16 dwords to 16 bytes; the values already fit, so nothing saturates
	packssdw	xmm0, xmm2
	packssdw	xmm4, xmm6
	packuswb	xmm0, xmm4

	pand		xmm0, xmm7				; keep the low nibble of each byte

	movlps		qword ptr [edx], xmm0		; first row: low 8 bytes
	movhps		qword ptr [edx+ebx], xmm0	; second row: high 8 bytes

	lea			edx, [edx+ebx*2]		; two rows down
	add			ecx, 64					; next four source vectors

	dec			eax
	jnz			unSwizzleBlock4HLP_sse2@pass

	pop			ebx

	ret			4

@unSwizzleBlock4HLP_sse2@12 endp

;
; unSwizzleBlock4HHP
;

@unSwizzleBlock4HHP_sse2@12 proc public

	; Extract bits 28..31 of every dword in a 256-byte block and write them,
	; one value (0..15) per byte, as 8 destination rows of 8 bytes each.
	;   ecx      = source block, read with movdqa, 64 bytes per pass
	;   edx      = destination, 8 bytes written per row (movlps/movhps)
	;   [esp+4]  = destination pitch in bytes (offset as seen on entry)
	; Four passes, two rows each. The stack argument is popped by ret 4.
	; ebx is saved and restored; eax, ecx, edx and xmm0-xmm6 are overwritten.

	push		ebx

	mov			ebx, [esp+8]			; ebx = pitch
	mov			eax, 4					; passes left

	align 16
unSwizzleBlock4HHP_sse2@pass:
	movdqa		xmm0, [ecx+16*0]
	movdqa		xmm1, [ecx+16*1]
	movdqa		xmm2, [ecx+16*2]
	movdqa		xmm3, [ecx+16*3]

	punpck		qdq, 0, 2, 1, 3, 4, 6

	; keep only bits 28..31 of each dword, as a value 0..15
	psrld		xmm0, 28
	psrld		xmm2, 28
	psrld		xmm4, 28
	psrld		xmm6, 28

	; narrow 16 dwords to 16 bytes; the values already fit, so nothing saturates
	packssdw	xmm0, xmm2
	packssdw	xmm4, xmm6
	packuswb	xmm0, xmm4

	movlps		qword ptr [edx], xmm0		; first row: low 8 bytes
	movhps		qword ptr [edx+ebx], xmm0	; second row: high 8 bytes

	lea			edx, [edx+ebx*2]		; two rows down
	add			ecx, 64					; next four source vectors

	dec			eax
	jnz			unSwizzleBlock4HHP_sse2@pass

	pop			ebx

	ret			4

@unSwizzleBlock4HHP_sse2@12 endp

;
; unSwizzleBlock4P
;

@unSwizzleBlock4P_sse2@12 proc public

	; Unpack a 256-byte block of packed nibbles into 16 destination rows of
	; 32 bytes each, one nibble value (0..15) per output byte.
	;   ecx      = source block, read as sixteen 16-byte vectors (movdqa)
	;   edx      = destination, written with movdqa
	;   [esp+4]  = destination pitch in bytes (offset as seen on entry)
	; The code is unrolled into four 64-byte "columns" that write four rows
	; each. Per column, after three byte/word interleaves, each result vector
	; is split into its low nibbles (pand with xmm7) and its high nibbles
	; (pandn with xmm7, then shifted right by 4). Low-nibble vectors go to
	; rows 0 and 1 of the column, high-nibble vectors to rows 2 and 3.
	; Columns 0 and 2 additionally swap adjacent dwords (pshufd 0b1h) in the
	; high-nibble vectors; columns 1 and 3 swap them in the low-nibble vectors.
	; The stack argument is popped by ret 4.
	; esi and edi are saved and restored; eax, edx and xmm0-xmm7 are overwritten.

	push		esi
	push		edi

	; xmm7 = 0fh in every byte (low-nibble mask)
	mov         eax, 0f0f0f0fh
	movd        xmm7, eax 
	pshufd      xmm7, xmm7, 0

	; esi = pitch (two registers were pushed, hence +8), edi = 3 * pitch
	mov			esi, [esp+4+8]
	lea			edi, [esi*2]
	add			edi, esi

	; col 0

	movdqa		xmm0, [ecx+16*0]
	movdqa		xmm1, [ecx+16*1]
	movdqa		xmm2, [ecx+16*2]
	movdqa		xmm3, [ecx+16*3]

	punpck		bw, 0, 2, 1, 3, 4, 6
	punpck		wd, 0, 4, 2, 6, 1, 3
	punpck		bw, 0, 4, 1, 3, 2, 6

	; xmm1 = high nibbles of xmm0 (dword-swapped, moved to the low position),
	; xmm0 = low nibbles of xmm0
	movdqa		xmm1, xmm7
	pandn		xmm1, xmm0
	pand		xmm0, xmm7
	pshufd		xmm1, xmm1, 0b1h
	psrlq		xmm1, 4

	; same split for xmm2 into xmm2 (low) and xmm3 (high)
	movdqa		xmm3, xmm7
	pandn		xmm3, xmm2
	pand		xmm2, xmm7
	pshufd		xmm3, xmm3, 0b1h
	psrlq		xmm3, 4

	; rows 0 and 2 of this column
	movdqa		[edx], xmm0
	movdqa		[edx+16], xmm2
	movdqa		[edx+esi*2], xmm1
	movdqa		[edx+esi*2+16], xmm3
	
	; split xmm4 and xmm6 the same way (xmm1/xmm3 are reused for the high nibbles)
	movdqa		xmm1, xmm7
	pandn		xmm1, xmm4
	pand		xmm4, xmm7
	pshufd		xmm1, xmm1, 0b1h
	psrlq		xmm1, 4
	
	movdqa		xmm3, xmm7
	pandn		xmm3, xmm6
	pand		xmm6, xmm7
	pshufd		xmm3, xmm3, 0b1h
	psrlq		xmm3, 4

	; rows 1 and 3 of this column
	movdqa		[edx+esi], xmm4
	movdqa		[edx+esi+16], xmm6
	movdqa		[edx+edi], xmm1
	movdqa		[edx+edi+16], xmm3

	lea			edx, [edx+esi*4]

	; col 1

	movdqa		xmm0, [ecx+16*4]
	movdqa		xmm1, [ecx+16*5]
	movdqa		xmm2, [ecx+16*6]
	movdqa		xmm3, [ecx+16*7]

	punpck		bw, 0, 2, 1, 3, 4, 6
	punpck		wd, 0, 4, 2, 6, 1, 3
	punpck		bw, 0, 4, 1, 3, 2, 6

	; as in col 0, except that the dword swap is applied to the low-nibble
	; vector (xmm0/xmm2) instead of the high-nibble one
	movdqa		xmm1, xmm7
	pandn		xmm1, xmm0
	pand		xmm0, xmm7
	pshufd		xmm0, xmm0, 0b1h
	psrlq		xmm1, 4

	movdqa		xmm3, xmm7
	pandn		xmm3, xmm2
	pand		xmm2, xmm7
	pshufd		xmm2, xmm2, 0b1h
	psrlq		xmm3, 4

	movdqa		[edx], xmm0
	movdqa		[edx+16], xmm2
	movdqa		[edx+esi*2], xmm1
	movdqa		[edx+esi*2+16], xmm3
	
	movdqa		xmm1, xmm7
	pandn		xmm1, xmm4
	pand		xmm4, xmm7
	pshufd		xmm4, xmm4, 0b1h
	psrlq		xmm1, 4
	
	movdqa		xmm3, xmm7
	pandn		xmm3, xmm6
	pand		xmm6, xmm7
	pshufd		xmm6, xmm6, 0b1h
	psrlq		xmm3, 4

	movdqa		[edx+esi], xmm4
	movdqa		[edx+esi+16], xmm6
	movdqa		[edx+edi], xmm1
	movdqa		[edx+edi+16], xmm3

	lea			edx, [edx+esi*4]

	; col 2 (same sequence as col 0)

	movdqa		xmm0, [ecx+16*8]
	movdqa		xmm1, [ecx+16*9]
	movdqa		xmm2, [ecx+16*10]
	movdqa		xmm3, [ecx+16*11]

	punpck		bw, 0, 2, 1, 3, 4, 6
	punpck		wd, 0, 4, 2, 6, 1, 3
	punpck		bw, 0, 4, 1, 3, 2, 6

	movdqa		xmm1, xmm7
	pandn		xmm1, xmm0
	pand		xmm0, xmm7
	pshufd		xmm1, xmm1, 0b1h
	psrlq		xmm1, 4

	movdqa		xmm3, xmm7
	pandn		xmm3, xmm2
	pand		xmm2, xmm7
	pshufd		xmm3, xmm3, 0b1h
	psrlq		xmm3, 4

	movdqa		[edx], xmm0
	movdqa		[edx+16], xmm2
	movdqa		[edx+esi*2], xmm1
	movdqa		[edx+esi*2+16], xmm3
	
	movdqa		xmm1, xmm7
	pandn		xmm1, xmm4
	pand		xmm4, xmm7
	pshufd		xmm1, xmm1, 0b1h
	psrlq		xmm1, 4
	
	movdqa		xmm3, xmm7
	pandn		xmm3, xmm6
	pand		xmm6, xmm7
	pshufd		xmm3, xmm3, 0b1h
	psrlq		xmm3, 4

	movdqa		[edx+esi], xmm4
	movdqa		[edx+esi+16], xmm6
	movdqa		[edx+edi], xmm1
	movdqa		[edx+edi+16], xmm3

	lea			edx, [edx+esi*4]

	; col 3 (same sequence as col 1)

	movdqa		xmm0, [ecx+16*12]
	movdqa		xmm1, [ecx+16*13]
	movdqa		xmm2, [ecx+16*14]
	movdqa		xmm3, [ecx+16*15]

	punpck		bw, 0, 2, 1, 3, 4, 6
	punpck		wd, 0, 4, 2, 6, 1, 3
	punpck		bw, 0, 4, 1, 3, 2, 6

	movdqa		xmm1, xmm7
	pandn		xmm1, xmm0
	pand		xmm0, xmm7
	pshufd		xmm0, xmm0, 0b1h
	psrlq		xmm1, 4

	movdqa		xmm3, xmm7
	pandn		xmm3, xmm2
	pand		xmm2, xmm7
	pshufd		xmm2, xmm2, 0b1h
	psrlq		xmm3, 4

	movdqa		[edx], xmm0
	movdqa		[edx+16], xmm2
	movdqa		[edx+esi*2], xmm1
	movdqa		[edx+esi*2+16], xmm3
	
	movdqa		xmm1, xmm7
	pandn		xmm1, xmm4
	pand		xmm4, xmm7
	pshufd		xmm4, xmm4, 0b1h
	psrlq		xmm1, 4
	
	movdqa		xmm3, xmm7
	pandn		xmm3, xmm6
	pand		xmm6, xmm7
	pshufd		xmm6, xmm6, 0b1h
	psrlq		xmm3, 4

	movdqa		[edx+esi], xmm4
	movdqa		[edx+esi+16], xmm6
	movdqa		[edx+edi], xmm1
	movdqa		[edx+edi+16], xmm3

	; last column: the destination pointer is not advanced any further
	; lea			edx, [edx+esi*4]

	pop			edi
	pop			esi

	ret			4

@unSwizzleBlock4P_sse2@12 endp

;
; swizzling
;

;
; SwizzleBlock32
;

@SwizzleBlock32_sse2@16 proc public

	; Pack 8 source rows of 32 bytes each into a 256-byte block, with an
	; optional per-dword write mask.
	;   ecx      = destination block (copied to edi), accessed with movdqa
	;   edx      = source (copied to esi), read with movdqa
	;   [esp+4]  = source pitch in bytes (offset as seen on entry)
	;   [esp+8]  = 32-bit write mask (offset as seen on entry)
	; A mask of 0ffffffffh takes the plain-store loop. Any other value takes
	; the loop at SwizzleBlock32_sse2@WM, which stores
	; (new AND mask) OR (old destination AND NOT mask) with the mask
	; replicated into all four dwords.
	; Both loops run four times, consuming two rows and producing 64 bytes
	; per iteration. The two stack arguments are popped by ret 8.
	; esi and edi are saved and restored; eax, ecx, edx and xmm0-xmm7 are
	; overwritten (xmm3 and xmm7 only on the masked path).

	push		esi
	push		edi

	; edi = destination, esi = source, edx = pitch, ecx = loop counter
	mov			edi, ecx
	mov			esi, edx
	mov			edx, [esp+4+8]
	mov			ecx, 4

	; eax = write mask; anything but all-ones needs the merging loop
	mov			eax, [esp+8+8]
	cmp			eax, 0ffffffffh
	jnz			SwizzleBlock32_sse2@WM

	align 16
@@:
	movdqa		xmm0, [esi]
	movdqa		xmm4, [esi+16]
	movdqa		xmm1, [esi+edx]
	movdqa		xmm5, [esi+edx+16]

	punpck		qdq, 0, 4, 1, 5, 2, 6

	movdqa		[edi+16*0], xmm0
	movdqa		[edi+16*1], xmm2
	movdqa		[edi+16*2], xmm4
	movdqa		[edi+16*3], xmm6

	lea			esi, [esi+edx*2]
	add			edi, 64

	dec			ecx
	jnz			@B

	pop			edi
	pop			esi

	ret			8

SwizzleBlock32_sse2@WM:

	; xmm7 = write mask replicated into all four dwords
	movd		xmm7, eax
	pshufd		xmm7, xmm7, 0
	
	align 16
@@:
	movdqa		xmm0, [esi]
	movdqa		xmm4, [esi+16]
	movdqa		xmm1, [esi+edx]
	movdqa		xmm5, [esi+edx+16]

	punpck		qdq, 0, 4, 1, 5, 2, 6

	; xmm3 and xmm5 = copies of the mask (pshufd 0e4h is a plain copy);
	; pandn turns each into (old destination AND NOT mask)
	movdqa		xmm3, xmm7
	pshufd		xmm5, xmm7, 0e4h

	pandn		xmm3, [edi+16*0]
	pand		xmm0, xmm7
	por			xmm0, xmm3
	movdqa		[edi+16*0], xmm0

	pandn		xmm5, [edi+16*1]
	pand		xmm2, xmm7
	por			xmm2, xmm5
	movdqa		[edi+16*1], xmm2

	movdqa		xmm3, xmm7
	pshufd		xmm5, xmm7, 0e4h

	pandn		xmm3, [edi+16*2]
	pand		xmm4, xmm7
	por			xmm4, xmm3
	movdqa		[edi+16*2], xmm4

	pandn		xmm5, [edi+16*3]
	pand		xmm6, xmm7
	por			xmm6, xmm5
	movdqa		[edi+16*3], xmm6

	lea			esi, [esi+edx*2]
	add			edi, 64

	dec			ecx
	jnz			@B

	pop			edi
	pop			esi

	ret			8
	
@SwizzleBlock32_sse2@16 endp

;
; SwizzleBlock16
;

@SwizzleBlock16_sse2@12 proc public

	; Pack 8 source rows of 32 bytes each into a 256-byte block, regrouping
	; at 16-bit granularity.
	;   ecx      = destination block, written with movdqa, 64 bytes per pass
	;   edx      = source, read with movdqa, two rows per pass
	;   [esp+4]  = source pitch in bytes (offset as seen on entry)
	; Four passes. The stack argument is popped by ret 4.
	; ebx is saved and restored; eax, ecx, edx and xmm0-xmm6 are overwritten.

	push		ebx

	mov			ebx, [esp+8]			; ebx = pitch
	mov			eax, 4					; passes left

	align 16
SwizzleBlock16_sse2@pass:
	movdqa		xmm0, [edx]
	movdqa		xmm1, [edx+16]
	movdqa		xmm2, [edx+ebx]
	movdqa		xmm3, [edx+ebx+16]

	; word interleave within each row, then qword interleave across the rows
	punpck		wd, 0, 2, 1, 3, 4, 6
	punpck		qdq, 0, 4, 2, 6, 1, 5

	movdqa		[ecx+16*0], xmm0
	movdqa		[ecx+16*1], xmm1
	movdqa		[ecx+16*2], xmm4
	movdqa		[ecx+16*3], xmm5

	add			ecx, 64					; next 64 bytes of the block
	lea			edx, [edx+ebx*2]		; two source rows down

	dec			eax
	jnz			SwizzleBlock16_sse2@pass

	pop			ebx

	ret			4

@SwizzleBlock16_sse2@12 endp

;
; SwizzleBlock8
;

@SwizzleBlock8_sse2@12 proc public

	; Pack 16 source rows of 16 bytes each into a 256-byte block, regrouping
	; at byte granularity.
	;   ecx      = destination block, written with movdqa, 128 bytes per loop iteration
	;   edx      = source, one 16-byte vector read per row; all reads (including
	;              the pshufd memory operands) require 16-byte alignment
	;   [esp+4]  = source pitch in bytes (offset as seen on entry)
	; The loop runs twice; each iteration consumes two groups of four rows.
	; In the first group rows 2 and 3 are loaded with adjacent dwords swapped
	; (pshufd 0b1h); in the second group rows 0 and 1 are.
	; The stack argument is popped by ret 4.
	; ebx is saved and restored; eax, ecx, edx and xmm0-xmm6 are overwritten.

	push		ebx

	; ebx = pitch, eax = loop counter
	mov			ebx, [esp+4+4]
	mov			eax, 2

	align 16
@@:
	; col 0, 2

	movdqa		xmm0, [edx]
	movdqa		xmm2, [edx+ebx]
	lea			edx, [edx+ebx*2]

	; load rows 2 and 3 of the group with adjacent dwords swapped
	pshufd		xmm1, [edx], 0b1h
	pshufd		xmm3, [edx+ebx], 0b1h
	lea			edx, [edx+ebx*2]

	punpck		bw, 0, 2, 1, 3, 4, 6
	punpck		wd, 0, 2, 4, 6, 1, 3
	punpck		qdq, 0, 1, 2, 3, 4, 5

	movdqa		[ecx+16*0], xmm0
	movdqa		[ecx+16*1], xmm4
	movdqa		[ecx+16*2], xmm1
	movdqa		[ecx+16*3], xmm5

	; col 1, 3

	; here rows 0 and 1 of the group get the dword swap instead
	pshufd		xmm0, [edx], 0b1h
	pshufd		xmm2, [edx+ebx], 0b1h
	lea			edx, [edx+ebx*2]

	movdqa		xmm1, [edx]
	movdqa		xmm3, [edx+ebx]
	lea			edx, [edx+ebx*2]

	punpck		bw, 0, 2, 1, 3, 4, 6
	punpck		wd, 0, 2, 4, 6, 1, 3
	punpck		qdq, 0, 1, 2, 3, 4, 5

	movdqa		[ecx+16*4], xmm0
	movdqa		[ecx+16*5], xmm4
	movdqa		[ecx+16*6], xmm1
	movdqa		[ecx+16*7], xmm5

	; advance to the second 128 bytes of the destination block
	add			ecx, 128

	dec			eax
	jnz			@B

	pop			ebx

	ret			4
	
@SwizzleBlock8_sse2@12 endp

;
; SwizzleBlock4
;

@SwizzleBlock4_sse2@12 proc public

	; Pack 16 source rows of 16 bytes each into a 256-byte block, regrouping
	; at nibble granularity (via the punpcknb macro).
	;   ecx      = destination block, written with movdqa, 128 bytes per loop iteration
	;   edx      = source, one 16-byte vector read per row with movdqa
	;   [esp+4]  = source pitch in bytes (offset as seen on entry)
	; The loop runs twice; each iteration consumes two groups of four rows.
	; In the first group rows 2 and 3 get their adjacent words swapped
	; (pshuflw/pshufhw 0b1h) before the interleave; in the second group rows
	; 0 and 1 do.
	; The stack argument is popped by ret 4.
	; ebx is saved and restored; eax, ecx, edx and xmm0-xmm7 are overwritten.

	push		ebx
	
	; xmm7 = 0fh in every byte, the nibble mask punpcknb relies on
	mov         eax, 0f0f0f0fh
	movd        xmm7, eax 
	pshufd      xmm7, xmm7, 0

	; ebx = pitch, eax = loop counter
	mov			ebx, [esp+4+4]
	mov			eax, 2

	align 16
@@:
	; col 0, 2

	movdqa		xmm0, [edx]
	movdqa		xmm2, [edx+ebx]
	lea			edx, [edx+ebx*2]

	movdqa		xmm1, [edx]
	movdqa		xmm3, [edx+ebx]
	lea			edx, [edx+ebx*2]

	; swap adjacent words in rows 2 and 3 of the group
	pshuflw		xmm1, xmm1, 0b1h
	pshuflw		xmm3, xmm3, 0b1h
	pshufhw		xmm1, xmm1, 0b1h
	pshufhw		xmm3, xmm3, 0b1h

	punpcknb
	punpck		bw, 0, 2, 4, 6, 1, 3
	punpck		bw, 0, 2, 1, 3, 4, 6
	punpck		qdq, 0, 4, 2, 6, 1, 3

	movdqa		[ecx+16*0], xmm0
	movdqa		[ecx+16*1], xmm1
	movdqa		[ecx+16*2], xmm4
	movdqa		[ecx+16*3], xmm3

	; col 1, 3

	movdqa		xmm0, [edx]
	movdqa		xmm2, [edx+ebx]
	lea			edx, [edx+ebx*2]

	movdqa		xmm1, [edx]
	movdqa		xmm3, [edx+ebx]
	lea			edx, [edx+ebx*2]

	; here rows 0 and 1 of the group get the word swap instead
	pshuflw		xmm0, xmm0, 0b1h
	pshuflw		xmm2, xmm2, 0b1h
	pshufhw		xmm0, xmm0, 0b1h
	pshufhw		xmm2, xmm2, 0b1h

	punpcknb
	punpck		bw, 0, 2, 4, 6, 1, 3
	punpck		bw, 0, 2, 1, 3, 4, 6
	punpck		qdq, 0, 4, 2, 6, 1, 3

	movdqa		[ecx+16*4], xmm0
	movdqa		[ecx+16*5], xmm1
	movdqa		[ecx+16*6], xmm4
	movdqa		[ecx+16*7], xmm3

	; advance to the second 128 bytes of the destination block
	add			ecx, 128

	dec			eax
	jnz			@B

	pop			ebx

	ret			4

@SwizzleBlock4_sse2@12 endp

;
; swizzling with unaligned reads
;

;
; SwizzleBlock32u
;

@SwizzleBlock32u_sse2@16 proc public

	; Variant of the 32-bit block packer that reads the source with unaligned
	; loads (movdqu). Packs 8 source rows of 32 bytes each into a 256-byte
	; block, with an optional per-dword write mask.
	;   ecx      = destination block (copied to edi), accessed with movdqa
	;   edx      = source (copied to esi), read with movdqu
	;   [esp+4]  = source pitch in bytes (offset as seen on entry)
	;   [esp+8]  = 32-bit write mask (offset as seen on entry)
	; A mask of 0ffffffffh takes the plain-store loop. Any other value takes
	; the loop at SwizzleBlock32u_sse2@WM, which stores
	; (new AND mask) OR (old destination AND NOT mask) with the mask
	; replicated into all four dwords.
	; Both loops run four times, consuming two rows and producing 64 bytes
	; per iteration. The two stack arguments are popped by ret 8.
	; esi and edi are saved and restored; eax, ecx, edx and xmm0-xmm7 are
	; overwritten (xmm3 and xmm7 only on the masked path).

	push		esi
	push		edi

	; edi = destination, esi = source, edx = pitch, ecx = loop counter
	mov			edi, ecx
	mov			esi, edx
	mov			edx, [esp+4+8]
	mov			ecx, 4

	; eax = write mask; anything but all-ones needs the merging loop
	mov			eax, [esp+8+8]
	cmp			eax, 0ffffffffh
	jnz			SwizzleBlock32u_sse2@WM

	align 16
@@:
	movdqu		xmm0, [esi]
	movdqu		xmm4, [esi+16]
	movdqu		xmm1, [esi+edx]
	movdqu		xmm5, [esi+edx+16]

	punpck		qdq, 0, 4, 1, 5, 2, 6

	movdqa		[edi+16*0], xmm0
	movdqa		[edi+16*1], xmm2
	movdqa		[edi+16*2], xmm4
	movdqa		[edi+16*3], xmm6

	lea			esi, [esi+edx*2]
	add			edi, 64

	dec			ecx
	jnz			@B

	pop			edi
	pop			esi

	ret			8

SwizzleBlock32u_sse2@WM:

	; xmm7 = write mask replicated into all four dwords
	movd		xmm7, eax
	pshufd		xmm7, xmm7, 0
	
	align 16
@@:
	movdqu		xmm0, [esi]
	movdqu		xmm4, [esi+16]
	movdqu		xmm1, [esi+edx]
	movdqu		xmm5, [esi+edx+16]

	punpck		qdq, 0, 4, 1, 5, 2, 6

	; xmm3 and xmm5 = copies of the mask (pshufd 0e4h is a plain copy);
	; pandn turns each into (old destination AND NOT mask)
	movdqa		xmm3, xmm7
	pshufd		xmm5, xmm7, 0e4h

	pandn		xmm3, [edi+16*0]
	pand		xmm0, xmm7
	por			xmm0, xmm3
	movdqa		[edi+16*0], xmm0

	pandn		xmm5, [edi+16*1]
	pand		xmm2, xmm7
	por			xmm2, xmm5
	movdqa		[edi+16*1], xmm2

	movdqa		xmm3, xmm7
	pshufd		xmm5, xmm7, 0e4h

	pandn		xmm3, [edi+16*2]
	pand		xmm4, xmm7
	por			xmm4, xmm3
	movdqa		[edi+16*2], xmm4

	pandn		xmm5, [edi+16*3]
	pand		xmm6, xmm7
	por			xmm6, xmm5
	movdqa		[edi+16*3], xmm6

	lea			esi, [esi+edx*2]
	add			edi, 64

	dec			ecx
	jnz			@B

	pop			edi
	pop			esi

	ret			8
	
@SwizzleBlock32u_sse2@16 endp

;
; SwizzleBlock16u
;

@SwizzleBlock16u_sse2@12 proc public

	; Variant of the 16-bit block packer that reads the source with unaligned
	; loads (movdqu). Packs 8 source rows of 32 bytes each into a 256-byte
	; block, regrouping at 16-bit granularity.
	;   ecx      = destination block, written with movdqa, 64 bytes per pass
	;   edx      = source, read with movdqu, two rows per pass
	;   [esp+4]  = source pitch in bytes (offset as seen on entry)
	; Four passes. The stack argument is popped by ret 4.
	; ebx is saved and restored; eax, ecx, edx and xmm0-xmm6 are overwritten.

	push		ebx

	mov			ebx, [esp+8]			; ebx = pitch
	mov			eax, 4					; passes left

	align 16
SwizzleBlock16u_sse2@pass:
	movdqu		xmm0, [edx]
	movdqu		xmm1, [edx+16]
	movdqu		xmm2, [edx+ebx]
	movdqu		xmm3, [edx+ebx+16]

	; word interleave within each row, then qword interleave across the rows
	punpck		wd, 0, 2, 1, 3, 4, 6
	punpck		qdq, 0, 4, 2, 6, 1, 5

	movdqa		[ecx+16*0], xmm0
	movdqa		[ecx+16*1], xmm1
	movdqa		[ecx+16*2], xmm4
	movdqa		[ecx+16*3], xmm5

	add			ecx, 64					; next 64 bytes of the block
	lea			edx, [edx+ebx*2]		; two source rows down

	dec			eax
	jnz			SwizzleBlock16u_sse2@pass

	pop			ebx

	ret			4

@SwizzleBlock16u_sse2@12 endp

;
; SwizzleBlock8u
;

@SwizzleBlock8u_sse2@12 proc public

	; Variant of the 8-bit block packer that reads the source with unaligned
	; loads (movdqu). Packs 16 source rows of 16 bytes each into a 256-byte
	; block, regrouping at byte granularity.
	;   ecx      = destination block, written with movdqa, 128 bytes per loop iteration
	;   edx      = source, one 16-byte vector read per row with movdqu
	;   [esp+4]  = source pitch in bytes (offset as seen on entry)
	; The loop runs twice; each iteration consumes two groups of four rows.
	; In the first group rows 2 and 3 get their adjacent dwords swapped
	; (pshufd 0b1h) after loading; in the second group rows 0 and 1 do.
	; The stack argument is popped by ret 4.
	; ebx is saved and restored; eax, ecx, edx and xmm0-xmm6 are overwritten.

	push		ebx

	; ebx = pitch, eax = loop counter
	mov			ebx, [esp+4+4]
	mov			eax, 2

	align 16
@@:
	; col 0, 2

	movdqu		xmm0, [edx]
	movdqu		xmm2, [edx+ebx]
	lea			edx, [edx+ebx*2]

	; the swap is done register-to-register, after the unaligned load
	movdqu		xmm1, [edx]
	movdqu		xmm3, [edx+ebx]
	pshufd		xmm1, xmm1, 0b1h
	pshufd		xmm3, xmm3, 0b1h
	lea			edx, [edx+ebx*2]

	punpck		bw, 0, 2, 1, 3, 4, 6
	punpck		wd, 0, 2, 4, 6, 1, 3
	punpck		qdq, 0, 1, 2, 3, 4, 5

	movdqa		[ecx+16*0], xmm0
	movdqa		[ecx+16*1], xmm4
	movdqa		[ecx+16*2], xmm1
	movdqa		[ecx+16*3], xmm5

	; col 1, 3

	; here rows 0 and 1 of the group get the dword swap instead
	movdqu		xmm0, [edx]
	movdqu		xmm2, [edx+ebx]
	pshufd		xmm0, xmm0, 0b1h
	pshufd		xmm2, xmm2, 0b1h
	lea			edx, [edx+ebx*2]

	movdqu		xmm1, [edx]
	movdqu		xmm3, [edx+ebx]
	lea			edx, [edx+ebx*2]

	punpck		bw, 0, 2, 1, 3, 4, 6
	punpck		wd, 0, 2, 4, 6, 1, 3
	punpck		qdq, 0, 1, 2, 3, 4, 5

	movdqa		[ecx+16*4], xmm0
	movdqa		[ecx+16*5], xmm4
	movdqa		[ecx+16*6], xmm1
	movdqa		[ecx+16*7], xmm5

	; advance to the second 128 bytes of the destination block
	add			ecx, 128

	dec			eax
	jnz			@B

	pop			ebx

	ret			4
	
@SwizzleBlock8u_sse2@12 endp

;
; SwizzleBlock4u
;

@SwizzleBlock4u_sse2@12 proc public

	; Variant of the 4-bit block packer that reads the source with unaligned
	; loads (movdqu). Packs 16 source rows of 16 bytes each into a 256-byte
	; block, regrouping at nibble granularity (via the punpcknb macro).
	;   ecx      = destination block, written with movdqa, 128 bytes per loop iteration
	;   edx      = source, one 16-byte vector read per row with movdqu
	;   [esp+4]  = source pitch in bytes (offset as seen on entry)
	; The loop runs twice; each iteration consumes two groups of four rows.
	; In the first group rows 2 and 3 get their adjacent words swapped
	; (pshuflw/pshufhw 0b1h) before the interleave; in the second group rows
	; 0 and 1 do.
	; The stack argument is popped by ret 4.
	; ebx is saved and restored; eax, ecx, edx and xmm0-xmm7 are overwritten.

	push		ebx
	
	; xmm7 = 0fh in every byte, the nibble mask punpcknb relies on
	mov         eax, 0f0f0f0fh
	movd        xmm7, eax 
	pshufd      xmm7, xmm7, 0

	; ebx = pitch, eax = loop counter
	mov			ebx, [esp+4+4]
	mov			eax, 2

	align 16
@@:
	; col 0, 2

	movdqu		xmm0, [edx]
	movdqu		xmm2, [edx+ebx]
	lea			edx, [edx+ebx*2]

	movdqu		xmm1, [edx]
	movdqu		xmm3, [edx+ebx]
	lea			edx, [edx+ebx*2]

	; swap adjacent words in rows 2 and 3 of the group
	pshuflw		xmm1, xmm1, 0b1h
	pshuflw		xmm3, xmm3, 0b1h
	pshufhw		xmm1, xmm1, 0b1h
	pshufhw		xmm3, xmm3, 0b1h

	punpcknb
	punpck		bw, 0, 2, 4, 6, 1, 3
	punpck		bw, 0, 2, 1, 3, 4, 6
	punpck		qdq, 0, 4, 2, 6, 1, 3

	movdqa		[ecx+16*0], xmm0
	movdqa		[ecx+16*1], xmm1
	movdqa		[ecx+16*2], xmm4
	movdqa		[ecx+16*3], xmm3

	; col 1, 3

	movdqu		xmm0, [edx]
	movdqu		xmm2, [edx+ebx]
	lea			edx, [edx+ebx*2]

	movdqu		xmm1, [edx]
	movdqu		xmm3, [edx+ebx]
	lea			edx, [edx+ebx*2]

	; here rows 0 and 1 of the group get the word swap instead
	pshuflw		xmm0, xmm0, 0b1h
	pshuflw		xmm2, xmm2, 0b1h
	pshufhw		xmm0, xmm0, 0b1h
	pshufhw		xmm2, xmm2, 0b1h

	punpcknb
	punpck		bw, 0, 2, 4, 6, 1, 3
	punpck		bw, 0, 2, 1, 3, 4, 6
	punpck		qdq, 0, 4, 2, 6, 1, 3

	movdqa		[ecx+16*4], xmm0
	movdqa		[ecx+16*5], xmm1
	movdqa		[ecx+16*6], xmm4
	movdqa		[ecx+16*7], xmm3

	; advance to the second 128 bytes of the destination block
	add			ecx, 128

	dec			eax
	jnz			@B

	pop			ebx

	ret			4

@SwizzleBlock4u_sse2@12 endp

	end



See more files for this project here

guliverkli

Home of VobSub, Media Player Classic (MPC) and other misc utils.

Project homepage: http://sourceforge.net/projects/guliverkli
Programming language(s): C,C++,PHP
License: other

  res/
    GSdx9.rc2
    hlsl_merge.fx
    hlsl_rb.fx
    hlsl_tfx.fx
    logo1.bmp
    ps11_en00.psh
    ps11_en01.psh
    ps11_en10.psh
    ps11_en11.psh
    ps11_tfx000.psh
    ps11_tfx010.psh
    ps11_tfx011.psh
    ps11_tfx1x0.psh
    ps11_tfx1x1.psh
    ps11_tfx200.psh
    ps11_tfx210.psh
    ps11_tfx211.psh
    ps11_tfx300.psh
    ps11_tfx310.psh
    ps11_tfx311.psh
    ps11_tfx4xx.psh
    ps14_en00.psh
    ps14_en01.psh
    ps14_en10.psh
    ps14_en11.psh
  GS.cpp
  GS.h
  GSCapture.cpp
  GSCapture.h
  GSCaptureDlg.cpp
  GSCaptureDlg.h
  GSHash.cpp
  GSHash.h
  GSLocalMemory.cpp
  GSLocalMemory.h
  GSPerfMon.cpp
  GSPerfMon.h
  GSRegs.cpp
  GSRenderer.cpp
  GSRenderer.h
  GSRendererHW.cpp
  GSRendererHW.h
  GSRendererNull.cpp
  GSRendererNull.h
  GSRendererSoft.cpp
  GSRendererSoft.h
  GSSettingsDlg.cpp
  GSSettingsDlg.h
  GSSoftVertex.cpp
  GSSoftVertex.h
  GSState.cpp
  GSState.h
  GSTables.cpp
  GSTables.h
  GSTextureCache.cpp
  GSTextureCache.h
  GSTransfer.cpp
  GSUtil.cpp
  GSUtil.h
  GSVertexList.cpp
  GSVertexList.h
  GSWnd.cpp
  GSWnd.h
  GSdx9.cpp
  GSdx9.def
  GSdx9.h
  GSdx9.icproj
  GSdx9.rc
  GSdx9.sln
  GSdx9.vcproj
  GSdx9_ic.sln
  GSdx9_vs2005.sln
  GSdx9_vs2005.vcproj
  resource.h
  stdafx.cpp
  stdafx.h
  x86-32.asm
  x86-64.asm
  x86.cpp
  x86.h