;
; Copyright 2009, Christian Packmann.
; All rights reserved.
; Distributed under the terms of the MIT License, see
; http://www.opensource.org/licenses/mit-license.php

; Assembly code for Painter::_DrawBitmapBilinearCopy32() in Painter.cpp
; This code implements only the inner x-loop, all other processing
; is done in the C code.


; ****** GENERAL NOTES *****

; The implemented algorithm looks like this:
;   (pixLT * leftWeight + pixRT * rightWeight) * topWeight
;   +
;   (pixLB * leftWeight + pixRB * rightWeight) * bottomWeight
;
; with LT = LeftTop, RT = RightTop, LB = LeftBottom, RB = RightBottom
;
; For more detailed information, see the C implementation in
; Painter.cpp
;
; Implementation notes:
; The calculations are performed with 16-bit arithmetic. All values
; are held in vars/registers as 8-bit values high-shifted by 8 bits;
; i.e. 255<<8. This works because PMULHUW is used for MULs, and this
; algorithm limits the variable values appropriately during all steps.
; This will not work for all algorithms, so take note of that if you
; want to recycle some of the code.

; Notes on the code itself:
; I've tried to keep the code small. That's why I'm using memory accesses
; via index registers as much as possible. This costs execution time due
; to the generated µops, but should minimize decode bandwidth pressure
; due to the many MMX instructions.
; Temporary variables are always stored to the stack instead of global
; data space for this reason. So far I haven't exceeded 8-bit offsets,
; so the instructions only need to encode a BYTE-offset instead of a DWORD.

; Notes on code formatting/comments:
; - integer and vector instructions are indented differently. I find this
;   helpful when parsing code, especially when I haven't looked at it for a
;   longer time.
; - I've tried to comment the code so that it will be understandable and
;   maintainable in the future, and also by other persons than myself.
;   The current comments aren't yet fully standardized, I'm still working
;   on a coherent system for indicating the variables held within a register
;   which will help in understanding the data flow. Any suggestions
;   regarding this are welcome.
; - Abbreviations for datatypes:
;   B  = Byte         8 bit
;   W  = Word        16 bit
;   DW = Doubleword  32 bit
;   QW = Quadword    64 bit
;   DQ = Doublequad 128 bit
;   A "p" in front of one of the datatypes signifies that the
;   variable/register is encoded in packed form; i.e. pW means
;   "packed Words"; four Words for a MMX register, 8 for a SSE register.
;   This should help in understanding the logical meaning of the data
;   transformations.
;   For better readability, the datatype indicator for a register is
;   bracketed with '#', a MMX register with 2 uint32 of value 255 would be
;   #pD# 255 255



; ****** Global exports *****

; Do NOT use '_' in front of your defines, this is done
; with YASM's --prefix option at assembly time.
GLOBAL bilinear_scale_xloop_mmxsse


; ********************
; ******  DATA  ******
; ********************
SECTION .data

DATA_SECTION:
ALIGN 16
DATA_SSSE3:
; data which is identical for MMX and SSE code is shared by declaring
; it as DQ but providing two labels. MMX code just accesses the
; first half.
c4x16UW_129_LShift8:    TIMES 4 dw 129<<8
c4x16UW_255_LShift8:    TIMES 4 dw 255<<8
c2x32UD_ff000000:       TIMES 4 DD 0xff000000

; Argument definitions

; Parameter offsets assume "push ebp"
; (i.e. [ebp + 0] = saved ebp, [ebp + 4] = return address)
PAR_srcPtr      EQU 8
PAR_dstPtr      EQU 12
PAR_xWeightPtr  EQU 16
PAR_xmin        EQU 20
PAR_xmax        EQU 24
PAR_wTop        EQU 28
PAR_srcBPR      EQU 32

; Stack storage definitions (offsets relative to esp after the prologue).
; NOTE: the prologue below reserves only 32 bytes of scratch (offsets
; 0..31). The slots at offset 32 and above are not used by this function;
; using them without enlarging the frame would overwrite the alignment
; padding and the saved esi/edi/ebx.
ST_Q_wTop                   EQU 0
ST_Q_wBottom                EQU 8
ST_Q_c4x16UW_129_LShift8    EQU 16
ST_Q_c4x16UW_255_LShift8    EQU 24
ST_Q_lftWeight_A            EQU 32
ST_Q_rgtWeight_A            EQU 40
ST_Q_lftWeight_B            EQU 48
ST_Q_rgtWeight_B            EQU 56


; ********************
; ******  CODE  ******
; ********************
; The code goes into ".text": this is the section name the assembler
; treats as executable code. A non-standard name such as ".code" gets the
; default attributes for unknown sections, which for ELF output are
; non-executable.
SECTION .text


;-----------------------------------------------------------------------
; void bilinear_scale_xloop_mmxsse(void* src, void* dst, void* xWeights,
;     uint32 xmin, uint32 xmax, uint16 wTop, uint32 srcBPR )
;
; Computes one row of bilinear-filtered 32-bit pixels, for every x in
; [xmin, xmax] (both inclusive, compared as unsigned values).
;
; ABI:   32-bit x86, all arguments on the stack (see PAR_* offsets),
;        caller removes the arguments (plain "ret").
; In:    src      - base address of the top source row; the bottom row is
;                   read at the same offsets plus srcBPR
;        dst      - address of the first output pixel; one DWord is
;                   written per x and the pointer advances by 4
;        xWeights - array of 4-byte entries indexed by x:
;                   WORD at +0 is added to src as a byte offset,
;                   WORD at +2 is the left weight (right = 255 - left)
;        wTop     - weight of the top row (bottom = 255 - wTop)
;        srcBPR   - byte distance from the top to the bottom source row
; Out:   nothing; the top byte of each stored DWord is forced to 0xff
; Saves: ebx, esi, edi, ebp
; Clobb: eax, ecx, edx, mm0-mm7, flags (EMMS is executed before return)
; Needs: MMX plus the SSE integer extensions (PSHUFW, PMULHUW)
;
; Loop stats:
;   34 instructions (6 moves, 5 integer, 23 vector)
;   12 memory accesses
;-----------------------------------------------------------------------
ALIGN 16
bilinear_scale_xloop_mmxsse:
    push    ebp
    mov     ebp, esp
    and     esp, 0xfffffff8                     ; align stack to 8-byte boundary
    push    ebx
    push    edi
    push    esi
    sub     esp, 4 + 32                         ; +4 aligns to 8-byte boundary again; add 4 x QW
; xmin > xmax? (unsigned) -> nothing to do
    mov     eax, [ebp + PAR_xmin]
    cmp     eax, [ebp + PAR_xmax]
    ja      .exit
; preparations
    ; prepare wTop
    mov     eax, [ebp + PAR_wTop]               ; #pB#: 0 0 0 top
    shl     eax, 8                              ; #pB#: 0 0 top 0
        movd        mm0, eax                    ; #pW# 0 0 0 top
        pshufw      mm0, mm0, 00000000b         ; #pW# top top top top
        movq        [esp + ST_Q_wTop], mm0
    ; move constants
        movq        mm5, [c4x16UW_255_LShift8]
        movq        [esp + ST_Q_c4x16UW_255_LShift8], mm5
    ; prepare wBottom
        movq        mm1, mm5                    ; #pW# 255 255 255 255
        psubw       mm1, mm0                    ; 255 - wTop = wBottom
        movq        [esp + ST_Q_wBottom], mm1

; load params; leave ebx, ecx as scratch
    mov     eax, [ebp + PAR_xmin]               ; loop counter
    mov     edx, [ebp + PAR_xWeightPtr]         ; xWeights array
    mov     esi, [ebp + PAR_srcPtr]             ; source bitmap
    mov     edi, [ebp + PAR_dstPtr]             ; destination bitmap
        movq        mm6, [c4x16UW_129_LShift8]
        movq        mm7, [c2x32UD_ff000000]

; main loop
; Invariants: eax = x, edx = xWeights, esi = src, edi = current dst pixel,
;             mm6 = 129<<8 (packed words), mm7 = 0xff000000 (packed dwords)
ALIGN 16
.loop:
    ; load Left/Right weights into mm0/mm1
    movzx   ebx, WORD [edx + eax*4 + 2]         ; xWeights + x*4 + 2 -> FilterInfo[x].weight
    shl     ebx, 8                              ; #pB# 0 0 leftW 0
        pxor        mm2, mm2                    ; clear before use
        movd        mm0, ebx                    ; #pW# 0 0 0 leftW
        movq        mm1, [esp + ST_Q_c4x16UW_255_LShift8]
        pshufw      mm0, mm0, 00000000b         ; #pW# lW lW lW lW
        psubw       mm1, mm0                    ; #pW# rW rW rW rW
    movzx   ecx, WORD [edx + eax*4]             ; xWeights + x*4 -> FilterInfo[x].index
        pxor        mm3, mm3                    ; clear before use
    mov     ebx, ecx
    ; process top and bottom pixels, interleave instructions to avoid latencies
        pxor        mm4, mm4                    ; clear before use
        ; unpack pixel to high byte (zero bytes from the cleared register
        ; end up in the low half of every word -> channel << 8)
        punpcklbw   mm2, [esi + ecx]            ; pixLeftTop
        ; unpack pixel to high byte
        punpcklbw   mm3, [esi + ecx + 4]        ; pixRightTop

    ; calc address for bottom pix
    add     ebx, [ebp + PAR_srcBPR]             ; address:bottom pixels
        pmulhuw     mm2, mm0                    ; pixLT * leftWeight
        pmulhuw     mm3, mm1                    ; pixRT * rightWeight
        pxor        mm5, mm5                    ; clear before use
        punpcklbw   mm4, [esi + ebx]            ; pixLeftBottom
        punpcklbw   mm5, [esi + ebx + 4]        ; pixRightBottom
        pmulhuw     mm4, mm0                    ; pixLB * leftWeight
        pmulhuw     mm5, mm1                    ; pixRB * rightWeight

        paddw       mm2, mm3                    ; pixLT + pixRT
        paddw       mm4, mm5                    ; pixLB + pixRB
        pmulhuw     mm2, [esp + ST_Q_wTop]      ; * weightTop
        pmulhuw     mm4, [esp + ST_Q_wBottom]   ; * weightBottom

        ; add both temp results
        paddw       mm2, mm4
        ; divide by 65025 using integer reciprocal: (*129 >> 7)
        pmulhuw     mm2, mm6
        psrlw       mm2, 7
        ; pack & store
        packuswb    mm2, mm2
        por         mm2, mm7                    ; | 0xff000000
        movd        [edi], mm2                  ; store pixel as DWord
    add     edi, 4
; loopctr <= xmax?
; x and xmax are uint32, so the comparison has to be unsigned (jbe), the
; same way the range check at function entry uses "ja". A signed "jle"
; gives wrong results as soon as either value reaches 0x80000000.
; NOTE: xmax == 0xffffffff is not supported (the counter would wrap).
    inc     eax
    cmp     eax, [ebp + PAR_xmax]
    jbe     .loop
.exit:
        emms                                    ; Don't EVER forget to call EMMS!
    add     esp, 4 + 32                         ; restore stack pointer
    pop     esi
    pop     edi
    pop     ebx
    mov     esp, ebp                            ; undo the 8-byte alignment of esp
    pop     ebp
    ret