1 /* 2 Copyright (c) 2004, Thomas Kurschel 3 4 5 Part of Radeon In add-on 6 7 YUV converter 8 9 The Rage Theatre always provides YUV422 data and starting with Radeon, ATI 10 moved colour converter from 2D to 3D unit, so if you need another format you 11 must convert it manually, unless you get 3D working (which is, starting with r300, 12 hopeless anyway as there is no spec). 13 14 Colour temperature is according to BT.601; for YCbCr format see also GraphicsDefs.h 15 16 This header is included from VideoIn.cpp, with various defines to convert to 17 the wished format (RGB15, RGB16 or RGB32). 18 19 Things to improve: 20 - colour components should be interpolated for odd pixels 21 */ 22 23 static const int8 c_offs[8] = 24 { 128, 128, 128, 128, 128, 128, 128, 128 }; 25 26 static const int16 y_offs[4] = 27 { 16*128, 16*128, 16*128, 16*128 }; 28 29 static const uint16 masks[2][4] = { 30 // high byte mask 31 { 0xff00, 0xff00, 0xff00, 0xff00 }, 32 // low byte mask 33 { 0x00ff, 0x00ff, 0x00ff, 0x00ff }, 34 }; 35 36 static const int16 scale[5][4] = { 37 // Y pre-scale 38 { (int16)(1.1678 * 512), (int16)(1.1678 * 512), (int16)(1.1678 * 512), (int16)(1.1678 * 512) }, 39 // CbG CrG CbG CrG 40 { (int16)(-0.3929 * 256), (int16)(-0.8154 * 256), (int16)(-0.3929 * 256), (int16)(-0.8154 * 256) }, 41 // CbB CrR CbB CrR 42 { (int16)(2.0232 * 256), (int16)(1.6007 * 256), (int16)(2.0232 * 256), (int16)(1.6007 * 256) }, 43 }; 44 45 static const int8 masks_8bit[2][8] = { 46 // r/b 16 bit mask and r/g/b 15 bit mask 47 { 0xf8, 0xf8, 0xf8, 0xf8, 0xf8, 0xf8, 0xf8, 0xf8 }, 48 // g 16 bit mask 49 { 0xfc, 0xfc, 0xfc, 0xfc, 0xfc, 0xfc, 0xfc, 0xfc }, 50 }; 51 52 asm volatile( 53 "2:\n" 54 "pxor %%mm7,%%mm7\n" 55 56 "1:\n" 57 "movq (%0),%%mm0\n" // mm0 = Cr2'Y3' Cb2'Y2' Cr0'Y1' Cb0'Y0' 58 "movq %%mm0,%%mm1\n" // mm1 = Cr2'Y3' Cb2'Y2' Cr0'Y1' Cb0'Y0' 59 60 // Y is in 16..235 61 // we need to substract 16 and scale to full range 62 // as standard MMX has a _signed_ integer multiply only, the highest bit must 63 // be zero before scaling, i.e. we use signed format 9.7 64 "pand 8+%4,%%mm0\n" // mm2 = Y3' Y2' Y1' Y0' 65 "psllw $7,%%mm0\n" 66 "psubusw %3,%%mm0\n" 67 "pmulhw %5,%%mm0\n" // mm0 = Y3 Y2 Y1 Y0 68 69 // Cb and Cr is biased; compensate that 70 "psubb %2,%%mm1\n" // mm1 = Cr2 xxx Cb2 xxx Cr0 xxx Cb0 xxx 71 "pand %4,%%mm1\n" // mm1 = Cr2 Cb2 Cr0 Cb0 72 73 // transform Cb and Cr to green component 74 "movq %%mm1,%%mm2\n" 75 "pmaddwd 8+%5,%%mm1\n" // mm1 = CbCrG2 xxxxxxx CbCrG0 xxxxxxx 76 "psrad $16,%%mm1\n" // mm1 = CbCrG2 CbCrG0 77 "packssdw %%mm1,%%mm1\n" // mm1 = CbCrG2 CbCrG0 CbCrG2 CbCrG0 78 "punpcklwd %%mm1,%%mm1\n" // mm1 = CbCrG2 CbCrG2 CbCrG0 CbCrG0 79 80 // transform Cb to blue and Cr to red component 81 "pmulhw 16+%5,%%mm2\n" // mm2 = CrR2 CbB2 CrR0 CbB0 82 83 // nasty shuffling to separate and duplicate components 84 "movq %%mm2,%%mm3\n" 85 "punpcklwd %%mm3,%%mm3\n" // mm3 = CrR0 CrR0 CbB0 CbB0 86 "punpckhwd %%mm2,%%mm2\n" // mm2 = CrR2 CrR2 CbB2 CbB2 87 88 "movq %%mm3,%%mm4\n" 89 "punpckldq %%mm2,%%mm3\n" // mm3 = CbB2 CbB2 CbB0 CbB0 90 "punpckhdq %%mm2,%%mm4\n" // mm4 = CrR2 CrR2 CrR0 CrR0 91 92 // add Y to get final RGB 93 "paddsw %%mm0,%%mm1\n" // mm1 = G3 G2 G1 G0 94 "paddsw %%mm0,%%mm3\n" // mm3 = B3 B2 B1 B0 95 "paddsw %%mm0,%%mm4\n" // mm4 = R3 R2 R1 R0 96 97 // now, RBG can be converted to 8 bits each 98 "packuswb %%mm0,%%mm1\n" // mm1 = Y3 Y2 Y1 Y0 G3 G2 G1 G0 99 "packuswb %%mm4,%%mm3\n" // mm3 = R3 R2 R1 R0 B3 B2 B1 B0 100 101 #ifdef RGB32 102 // convertion to RGB32 103 "movq %%mm3,%%mm2\n" 104 "punpckhbw %%mm1,%%mm3\n" // mm3 = Y3 R3 Y2 R2 Y1 R1 Y0 R0 105 "punpcklbw %%mm1,%%mm2\n" // mm2 = G3 B3 G2 B2 G1 B1 G0 B0 106 107 "movq %%mm2,%%mm1\n" 108 "punpcklwd %%mm3,%%mm2\n" 109 "movq %%mm2,0x00(%1)\n" // dst = Y1 R1 G1 B1 Y0 R0 G0 B0 110 111 "punpckhwd %%mm3,%%mm1\n" 112 "movq %%mm1,0x08(%1)\n" // dst = Y3 R3 G3 B3 Y2 R2 G2 B2 113 114 "addl $0x08,%0\n" // source += 8 115 "addl $0x10,%1\n" // destination += 16 116 "subl $0x10,%7\n" // next pixels 117 #endif 118 119 #ifdef RGB16 120 // convertion to RGB16 121 // things would be much easier if Intel had added a RGB32->RGB16 instruction 122 "pand %6,%%mm3\n" // mm3 - R3 R2 R1 R0 B3 B2 B1 B0 (masked) 123 "pand 8+%6,%%mm1\n" // mm1 - Y3 Y2 Y1 Y0 G3 G2 G1 G0 (masked) 124 125 "punpcklbw %%mm7,%%mm1\n" // mm1 - G3 G2 G1 G0 126 "movq %%mm7,%%mm2\n" 127 "punpckhbw %%mm3,%%mm2\n" // mm2 - R3 R2 R1 R0 128 "punpcklbw %%mm7,%%mm3\n" // mm3 - B3 B2 B1 B0 129 130 "psllw $3,%%mm1\n" // mm1 - G3 G2 G1 G0 131 "psrlw $3,%%mm3\n" // mm3 - B3 B2 B1 B0 132 133 "por %%mm2,%%mm1\n" 134 "por %%mm3,%%mm1\n" 135 "movq %%mm1,(%1)\n" 136 137 "addl $0x08,%0\n" // source += 8 138 "addl $0x08,%1\n" // destination += 8 139 "subl $0x08,%7\n" // next pixels 140 #endif 141 142 #ifdef RGB15 143 // convertion to RGB15 144 // same problem as before 145 "pand %6,%%mm3\n" // mm3 - R3 R2 R1 R0 B3 B2 B1 B0 (masked) 146 "pand %6,%%mm1\n" // mm1 - Y3 Y2 Y1 Y0 G3 G2 G1 G0 (masked) 147 148 "punpcklbw %%mm7,%%mm1\n" // mm1 - G3 G2 G1 G0 149 "movq %%mm7,%%mm2\n" 150 "punpckhbw %%mm3,%%mm2\n" // mm2 - R3 R2 R1 R0 151 "punpcklbw %%mm7,%%mm3\n" // mm3 - B3 B2 B1 B0 152 153 "psllw $2,%%mm1\n" // mm1 - G3 G2 G1 G0 154 "psrlw $1,%%mm2\n" // mm2 - R3 R2 R1 R0 155 "psrlw $3,%%mm3\n" // mm3 - B3 B2 B1 B0 156 157 "por %%mm2,%%mm1\n" 158 "por %%mm3,%%mm1\n" 159 "movq %%mm1,(%1)\n" 160 161 "addl $0x08,%0\n" // source += 8 162 "addl $0x08,%1\n" // destination += 8 163 "subl $0x08,%7\n" // next pixels 164 #endif 165 166 // next 167 "jg 1b\n" 168 169 "movl %9,%7\n" 170 "subl %7,%8\n" 171 172 "jg 2b\n" 173 "emms\n" 174 : 175 : "a" (convert_buffer), "d" (bits), 176 "g" (c_offs), "g" (y_offs), "g" (masks), "g" (scale), "g" (masks_8bit), 177 "c" (bytesPerRow), "S" (bitsLength), "D" (bytesPerRow)); 178