;
; Copyright 2009, Christian Packmann.
; All rights reserved.
; Distributed under the terms of the MIT License, see
; http://www.opensource.org/licenses/mit-license.php

; Assembly code for Painter::_DrawBitmapBilinearCopy32() in Painter.cpp
; This code implements only the inner x-loop, all other processing
; is done in the C code.


; ******  GENERAL NOTES  *****

; The implemented algorithm looks like this:
; (pixLT * leftWeight  +  pixRT * rightWeight) * topWeight
;                            +
; (pixLB * leftWeight  +  pixRB * rightWeight) * bottomWeight
;
; with LT = LeftTop, RT = RightTop, LB = LeftBottom, RB = RightBottom
;
; For more detailed information, see the C implementation in
; Painter.cpp
;
; Implementation notes:
; The calculations are performed with 16-bit arithmetic. All values
; are held in vars/registers as 8-bit values high-shifted by 8 bits;
; i.e. 255<<8. This works because PMULHUW is used for MULs, and this
; algorithm limits the variable values appropriately during all steps.
; This will not work for all algorithms, so take note of that if you
; want to recycle some of the code.

; Notes on the code itself:
; I've tried to keep the code small. That's why I'm using memory accesses
; via index registers as much as possible. This costs execution time due
; to the generated µops, but should minimize decode bandwidth pressure
; due to the many MMX instructions.
; Temporary variables are always stored to the stack instead of global
; data space for this reason. So far I haven't exceeded 8-byte offsets,
; so the instructions only need to encode a BYTE-offset instead of a DWORD.

; Notes on code formatting/comments:
; - integer and vector instructions are indented differently. I find this
;   helpful when parsing code, especially when I haven't looked at it for a
;   longer time.
; - I've tried to comment the code so that it will be understandable and
;   maintainable in the future, and also by other persons than myself.
;   The current comments aren't yet fully standardized, I'm still working
;   on a coherent system for indicating the variables held within a register
;   which will help in understanding the data flow. Any suggestions
;   regarding this are welcome.
; - Abbreviations for datatypes:
;   B = Byte		  8 bit
;   W = Word		 16 bit
;   DW = Doubleword	 32 bit
;   QW = Quadword	 64 bit
;	DQ = Doublequad	128 bit
;	A "p" in front of one of the datatypes signifies that the
;	variable/register is encoded in packed form; i.e. pW means
;	"packed Words"; four Words for a MMX register, 8 for a SSE register.
;	This should help in understanding the logical meaning of the data
;	transformations.
;	For better readability, the datatype indicator for a register is
;   breacketed with '#', a MMX register with 2 uint32 of value 255 would be
;   #pD# 255 255


; ******  Global exports  *****

; Do NOT use '_' in front of your defines, this is done
; with YASMs --prefix option at assembly time.
GLOBAL bilinear_scale_xloop_mmxsse


; ********************
; ******  DATA  ******
; ********************
SECTION .data

DATA_SECTION:
ALIGN 16
DATA_SSSE3:
; data which is identical for MMX and SSE code is shared by declaring
; it as DQ but providing two labels. MMX code just accesses the
; first half.
c4x16UW_129_LShift8: 	TIMES 4 dw 129<<8
c4x16UW_255_LShift8:	TIMES 4 dw 255<<8
c2x32UD_ff000000:		TIMES 4 DD 0xff000000

; Argument definitions

; Parameter offsets assume "push ebp"
PAR_srcPtr EQU 	8
PAR_dstPtr EQU 	12
PAR_xWeightPtr EQU 16
PAR_xmin EQU 	20
PAR_xmax EQU 	24
PAR_wTop EQU 	28
PAR_srcBPR EQU 	32

; Stack storage definitions
ST_Q_wTop					EQU 0
ST_Q_wBottom				EQU 8
ST_Q_c4x16UW_129_LShift8	EQU 16
ST_Q_c4x16UW_255_LShift8	EQU 24
ST_Q_lftWeight_A			EQU 32
ST_Q_rgtWeight_A			EQU 40
ST_Q_lftWeight_B			EQU 48
ST_Q_rgtWeight_B			EQU 56


; ********************
; ******  CODE  ******
; ********************
SECTION .code


; void bilinear_scale_xloop_mmxsse(void* src, void* dst, void* xWeights,
;				uint32 xmin, uint32 xmax, uint16 wTop, uint32 srcBPR )
; Loop stats:
;		34 instructions (6 moves, 5 integer, 23 vector)
; 		12 memory accesses
ALIGN 16
bilinear_scale_xloop_mmxsse:
	push	ebp
	mov		ebp, esp
	and		esp, 0xfffffff8	; align stack to 8-byte boundary
	push	ebx
	push	edi
	push	esi
	sub		esp, 4 + 32	; +4 aligns to 8-byte boundary again; add 4 x QW
; xmin > xmax?
	mov		eax, [ebp + PAR_xmin]
	cmp		eax, [ebp + PAR_xmax]
	ja		.exit
; preparations
	; prepare wTop
	mov		eax, [ebp + PAR_wTop]	; #pB#: 0 0 0 top
	shl		eax, 8					; #pB#: 0 0 top 0
	movd		mm0, eax			; #pW# 0 0 0 top
	pshufw		mm0, mm0, 00000000b	; #pW# top top top top
	movq		[esp + ST_Q_wTop], mm0
	; move constants
	movq		mm5, [c4x16UW_255_LShift8]
	movq		[esp + ST_Q_c4x16UW_255_LShift8], mm5
	; prepare wBottom
	movq		mm1, mm5	; #pW# 255 255 255 255
	psubw		mm1, mm0	; 255 - wTop = wBottom
	movq		[esp + ST_Q_wBottom], mm1

; load params; leave ebx, ecx as scratch
	mov		eax, [ebp + PAR_xmin]	; loop counter
	mov		edx, [ebp + PAR_xWeightPtr]	; xWeights array
	mov		esi, [ebp + PAR_srcPtr]		; source bitmap
	mov		edi, [ebp + PAR_dstPtr]		; desination bitmap
	movq	mm6, [c4x16UW_129_LShift8]
	movq	mm7, [c2x32UD_ff000000]

; main loop
ALIGN 16
.loop:
	; load Left/Right weights into mm0/mm1
	movzx	ebx, WORD [edx + eax*4 + 2] ; xWeights + x*4 + 2-> FilterInfo[x].weight
	shl			ebx, 8		; #pB# 0 0 leftW 0
	pxor		mm2, mm2	; clear before use
	movd		mm0, ebx	; #pW# 0 0 0 leftW
	movq		mm1, [esp + ST_Q_c4x16UW_255_LShift8]
	pshufw		mm0, mm0, 00000000b	; #pW# lW lW lW lW
	psubw		mm1, mm0			; #pW# rW rW rW rW
	movzx	ecx, WORD [edx + eax*4] ; xWeights + x*4 -> FilterInfo[x].index
	pxor		mm3, mm3	; clear before use
	mov		ebx, ecx
	; process top and bottom pixels, interleave instructions to avoid latencies
	pxor		mm4, mm4	; clear before use
	; unpack pixel to high byte
	punpcklbw	mm2, [esi + ecx]	; pixLeftTop
	; unpack pixel to high byte
	punpcklbw	mm3, [esi + ecx + 4] ; pixRightTop

	add		ebx, [ebp + PAR_srcBPR]	; address:bottom pixels
	pmulhuw		mm2, mm0	; pixLT * leftWeight
	pmulhuw		mm3, mm1	; pixRT * rightWeight
	; calc address for bottom pix
	pxor		mm5, mm5	; clear before use
	punpcklbw	mm4, [esi + ebx]	; pixLeftBottom
	punpcklbw	mm5, [esi + ebx + 4] ; pixRightBottom
	pmulhuw		mm4, mm0	; pixLB * leftWeight
	pmulhuw		mm5, mm1	; pixRB * rightWeight

	paddw		mm2, mm3	; pixLT + pixRT
	paddw		mm4, mm5	; pixLB + pixRB
	pmulhuw		mm2, [esp + ST_Q_wTop]	; * weightTop
	pmulhuw		mm4, [esp + ST_Q_wBottom]	; * weightBottom

	; add both temp results
	paddw		mm2, mm4
	; divide by 65025 using integer reciprocal: (*129 >> 7)
	pmulhuw		mm2, mm6
	psrlw		mm2, 7
	; pack & store
	packuswb	mm2, mm2
	por			mm2, mm7	; | 0xff000000
	movd		[edi], mm2	; store pixel as DWord
	add		edi, 4
; loopctr <= xmax?
	inc		eax
	cmp		eax, [ebp + PAR_xmax]
	jle		.loop
.exit:
	emms	; Don't EVER forget to call EMMS!
	add		esp, 4 + 32	; restore  stack pointer
	pop		esi
	pop		edi
	pop		ebx
	mov		esp, ebp
	pop		ebp
	ret