00001
00002
00003
00004
00005
00006
00007
00008
00009
00010
00011
00012
00013
00014
00015
00016
00017
00018
00019
00020
00021
00022 #include "libavcodec/dsputil.h"
00023 #include "libavcodec/simple_idct.h"
00024 #include "dsputil_mmx.h"
00025
00026
00027
00028
00029
00030
00031
00032
00033
00034
00035
/*
 * Fixed-point cosine weights: Ci is cos(i*M_PI/16)*sqrt(2) scaled by 1<<14.
 * C0..C3 and C5..C7 are rounded to nearest (the "+ 0.5" in the formulas
 * below, followed by truncation to an integer).
 */
00036 #define C0 23170 // cos(0*M_PI/16)*sqrt(2)*(1<<14) + 0.5
00037 #define C1 22725 // cos(1*M_PI/16)*sqrt(2)*(1<<14) + 0.5
00038 #define C2 21407 // cos(2*M_PI/16)*sqrt(2)*(1<<14) + 0.5
00039 #define C3 19266 // cos(3*M_PI/16)*sqrt(2)*(1<<14) + 0.5
/*
 * cos(4*M_PI/16)*sqrt(2)*(1<<14) is exactly 16384. The disabled branch keeps
 * that value; the active branch uses 16383 (one less). The reason for the
 * smaller value is not stated in this part of the file.
 */
00040 #if 0
00041 #define C4 16384 // cos(4*M_PI/16)*sqrt(2)*(1<<14) + 0.5
00042 #else
00043 #define C4 16383 // cos(4*M_PI/16)*sqrt(2)*(1<<14) - 0.5
00044 #endif
00045 #define C5 12873 // cos(5*M_PI/16)*sqrt(2)*(1<<14) + 0.5
00046 #define C6 8867 // cos(6*M_PI/16)*sqrt(2)*(1<<14) + 0.5
00047 #define C7 4520 // cos(7*M_PI/16)*sqrt(2)*(1<<14) + 0.5
00048
/*
 * Right shifts applied to the 32-bit sums of the row pass and the column
 * pass. 1<<(SHIFT-1) is the matching rounding term (see the first entries of
 * coeffs and the disabled C reference code). The asm macro invocations in
 * idct() pass the literals 11 and 20 rather than these names.
 */
00049 #define ROW_SHIFT 11 // row pass: sums are shifted right by 11
00050 #define COL_SHIFT 20 // column pass shift; original note: "6" (meaning not stated here; TODO confirm)
00051
/*
 * 64-bit constants referenced from the inline asm in idct() via MANGLE().
 *
 * wm1010: AND mask that keeps 16-bit words 1 and 3 of a quadword and clears
 *         words 0 and 2. DC_COND_IDCT applies it (pand) to the first source
 *         quadword before OR-ing in the other three; the result is zero only
 *         when nothing except words 0 and 2 of that first quadword is set.
 * d40000: 0x40000 in the low dword, zero in the high dword. It is added
 *         (paddd) in the shortcut path of DC_COND_IDCT, between
 *         "pslld $16" and "psrad $13".
 */
00052 DECLARE_ASM_CONST(8, uint64_t, wm1010)= 0xFFFF0000FFFF0000ULL;
00053 DECLARE_ASM_CONST(8, uint64_t, d40000)= 0x0000000000040000ULL;
00054
/*
 * Constant table for the MMX code in idct(). The asm addresses its constants
 * as N(%2) with N = 0, 8, 16, ... 104; the operand list is outside this view,
 * so %2 is presumably this table (TODO confirm).
 *
 * Every source line below is four int16_t = 8 bytes, i.e. one quadword. The
 * weight rows repeat one 16-bit pair twice, so a single pmaddwd against them
 * produces the same two-term dot product in both 32-bit halves.
 * The byte offset of each quadword is given in the trailing comments; the
 * order of the entries must therefore not be changed.
 */
00055 DECLARE_ALIGNED(8, static const int16_t, coeffs)[]= {
00056 1<<(ROW_SHIFT-1), 0, 1<<(ROW_SHIFT-1), 0, /* +0: rounder, 1<<(ROW_SHIFT-1) in each dword ("paddd (%2)") */
00057
00058
00059 1<<(ROW_SHIFT-1), 1, 1<<(ROW_SHIFT-1), 0, /* +8: same rounder plus 1<<16 in the low dword ("paddd 8(%2)", used for the first row group) */
00060 /* note: ((1<<(COL_SHIFT-1))/C4)<<ROW_SHIFT == 1<<16, so the extra 1 presumably */
00061 /* pre-adds the column-pass rounding through the DC term -- TODO confirm */
00062
00063
00064 C4, C4, C4, C4, /* +16 */
00065 C4, -C4, C4, -C4, /* +24 */
00066
00067 C2, C6, C2, C6, /* +32 */
00068 C6, -C2, C6, -C2, /* +40 */
00069
00070 C1, C3, C1, C3, /* +48 */
00071 C5, C7, C5, C7, /* +56 */
00072
00073 C3, -C7, C3, -C7, /* +64 */
00074 -C1, -C5, -C1, -C5, /* +72 */
00075
00076 C5, -C1, C5, -C1, /* +80 */
00077 C7, C3, C7, C3, /* +88 */
00078
00079 C7, -C5, C7, -C5, /* +96 */
00080 C3, -C1, C3, -C1 /* +104 */
00081 };
00082
00083 #if 0
00084 static void unused_var_killer(void)
00085 {
00086 int a= wm1010 + d40000;
00087 temp[0]=a;
00088 }
00089
00090 static void inline idctCol (int16_t * col, int16_t *input)
00091 {
00092 #undef C0
00093 #undef C1
00094 #undef C2
00095 #undef C3
00096 #undef C4
00097 #undef C5
00098 #undef C6
00099 #undef C7
00100 int a0, a1, a2, a3, b0, b1, b2, b3;
00101 const int C0 = 23170;
00102 const int C1 = 22725;
00103 const int C2 = 21407;
00104 const int C3 = 19266;
00105 const int C4 = 16383;
00106 const int C5 = 12873;
00107 const int C6 = 8867;
00108 const int C7 = 4520;
00109
00110
00111
00112
00113
00114
00115
00116 col[8*0] = input[8*0 + 0];
00117 col[8*1] = input[8*2 + 0];
00118 col[8*2] = input[8*0 + 1];
00119 col[8*3] = input[8*2 + 1];
00120 col[8*4] = input[8*4 + 0];
00121 col[8*5] = input[8*6 + 0];
00122 col[8*6] = input[8*4 + 1];
00123 col[8*7] = input[8*6 + 1];
00124
00125 a0 = C4*col[8*0] + C2*col[8*2] + C4*col[8*4] + C6*col[8*6] + (1<<(COL_SHIFT-1));
00126 a1 = C4*col[8*0] + C6*col[8*2] - C4*col[8*4] - C2*col[8*6] + (1<<(COL_SHIFT-1));
00127 a2 = C4*col[8*0] - C6*col[8*2] - C4*col[8*4] + C2*col[8*6] + (1<<(COL_SHIFT-1));
00128 a3 = C4*col[8*0] - C2*col[8*2] + C4*col[8*4] - C6*col[8*6] + (1<<(COL_SHIFT-1));
00129
00130 b0 = C1*col[8*1] + C3*col[8*3] + C5*col[8*5] + C7*col[8*7];
00131 b1 = C3*col[8*1] - C7*col[8*3] - C1*col[8*5] - C5*col[8*7];
00132 b2 = C5*col[8*1] - C1*col[8*3] + C7*col[8*5] + C3*col[8*7];
00133 b3 = C7*col[8*1] - C5*col[8*3] + C3*col[8*5] - C1*col[8*7];
00134
00135 col[8*0] = (a0 + b0) >> COL_SHIFT;
00136 col[8*1] = (a1 + b1) >> COL_SHIFT;
00137 col[8*2] = (a2 + b2) >> COL_SHIFT;
00138 col[8*3] = (a3 + b3) >> COL_SHIFT;
00139 col[8*4] = (a3 - b3) >> COL_SHIFT;
00140 col[8*5] = (a2 - b2) >> COL_SHIFT;
00141 col[8*6] = (a1 - b1) >> COL_SHIFT;
00142 col[8*7] = (a0 - b0) >> COL_SHIFT;
00143 }
00144
00145 static void inline idctRow (int16_t * output, int16_t * input)
00146 {
00147 int16_t row[8];
00148
00149 int a0, a1, a2, a3, b0, b1, b2, b3;
00150 const int C0 = 23170;
00151 const int C1 = 22725;
00152 const int C2 = 21407;
00153 const int C3 = 19266;
00154 const int C4 = 16383;
00155 const int C5 = 12873;
00156 const int C6 = 8867;
00157 const int C7 = 4520;
00158
00159 row[0] = input[0];
00160 row[2] = input[1];
00161 row[4] = input[4];
00162 row[6] = input[5];
00163 row[1] = input[8];
00164 row[3] = input[9];
00165 row[5] = input[12];
00166 row[7] = input[13];
00167
00168 if( !(row[1] | row[2] |row[3] |row[4] |row[5] |row[6] | row[7]) ) {
00169 row[0] = row[1] = row[2] = row[3] = row[4] =
00170 row[5] = row[6] = row[7] = row[0]<<3;
00171 output[0] = row[0];
00172 output[2] = row[1];
00173 output[4] = row[2];
00174 output[6] = row[3];
00175 output[8] = row[4];
00176 output[10] = row[5];
00177 output[12] = row[6];
00178 output[14] = row[7];
00179 return;
00180 }
00181
00182 a0 = C4*row[0] + C2*row[2] + C4*row[4] + C6*row[6] + (1<<(ROW_SHIFT-1));
00183 a1 = C4*row[0] + C6*row[2] - C4*row[4] - C2*row[6] + (1<<(ROW_SHIFT-1));
00184 a2 = C4*row[0] - C6*row[2] - C4*row[4] + C2*row[6] + (1<<(ROW_SHIFT-1));
00185 a3 = C4*row[0] - C2*row[2] + C4*row[4] - C6*row[6] + (1<<(ROW_SHIFT-1));
00186
00187 b0 = C1*row[1] + C3*row[3] + C5*row[5] + C7*row[7];
00188 b1 = C3*row[1] - C7*row[3] - C1*row[5] - C5*row[7];
00189 b2 = C5*row[1] - C1*row[3] + C7*row[5] + C3*row[7];
00190 b3 = C7*row[1] - C5*row[3] + C3*row[5] - C1*row[7];
00191
00192 row[0] = (a0 + b0) >> ROW_SHIFT;
00193 row[1] = (a1 + b1) >> ROW_SHIFT;
00194 row[2] = (a2 + b2) >> ROW_SHIFT;
00195 row[3] = (a3 + b3) >> ROW_SHIFT;
00196 row[4] = (a3 - b3) >> ROW_SHIFT;
00197 row[5] = (a2 - b2) >> ROW_SHIFT;
00198 row[6] = (a1 - b1) >> ROW_SHIFT;
00199 row[7] = (a0 - b0) >> ROW_SHIFT;
00200
00201 output[0] = row[0];
00202 output[2] = row[1];
00203 output[4] = row[2];
00204 output[6] = row[3];
00205 output[8] = row[4];
00206 output[10] = row[5];
00207 output[12] = row[6];
00208 output[14] = row[7];
00209 }
00210 #endif
00211
00212 static inline void idct(int16_t *block)
00213 {
00214 DECLARE_ALIGNED(8, int64_t, align_tmp)[16];
00215 int16_t * const temp= (int16_t*)align_tmp;
00216
00217 __asm__ volatile(
00218 #if 0 //Alternative, simpler variant
00219
00220 #define ROW_IDCT(src0, src4, src1, src5, dst, rounder, shift) \
00221 "movq " #src0 ", %%mm0 \n\t" \
00222 "movq " #src4 ", %%mm1 \n\t" \
00223 "movq " #src1 ", %%mm2 \n\t" \
00224 "movq " #src5 ", %%mm3 \n\t" \
00225 "movq 16(%2), %%mm4 \n\t" \
00226 "pmaddwd %%mm0, %%mm4 \n\t" \
00227 "movq 24(%2), %%mm5 \n\t" \
00228 "pmaddwd %%mm5, %%mm0 \n\t" \
00229 "movq 32(%2), %%mm5 \n\t" \
00230 "pmaddwd %%mm1, %%mm5 \n\t" \
00231 "movq 40(%2), %%mm6 \n\t" \
00232 "pmaddwd %%mm6, %%mm1 \n\t" \
00233 "movq 48(%2), %%mm7 \n\t" \
00234 "pmaddwd %%mm2, %%mm7 \n\t" \
00235 #rounder ", %%mm4 \n\t"\
00236 "movq %%mm4, %%mm6 \n\t" \
00237 "paddd %%mm5, %%mm4 \n\t" \
00238 "psubd %%mm5, %%mm6 \n\t" \
00239 "movq 56(%2), %%mm5 \n\t" \
00240 "pmaddwd %%mm3, %%mm5 \n\t" \
00241 #rounder ", %%mm0 \n\t"\
00242 "paddd %%mm0, %%mm1 \n\t" \
00243 "paddd %%mm0, %%mm0 \n\t" \
00244 "psubd %%mm1, %%mm0 \n\t" \
00245 "pmaddwd 64(%2), %%mm2 \n\t" \
00246 "paddd %%mm5, %%mm7 \n\t" \
00247 "movq 72(%2), %%mm5 \n\t" \
00248 "pmaddwd %%mm3, %%mm5 \n\t" \
00249 "paddd %%mm4, %%mm7 \n\t" \
00250 "paddd %%mm4, %%mm4 \n\t" \
00251 "psubd %%mm7, %%mm4 \n\t" \
00252 "paddd %%mm2, %%mm5 \n\t" \
00253 "psrad $" #shift ", %%mm7 \n\t"\
00254 "psrad $" #shift ", %%mm4 \n\t"\
00255 "movq %%mm1, %%mm2 \n\t" \
00256 "paddd %%mm5, %%mm1 \n\t" \
00257 "psubd %%mm5, %%mm2 \n\t" \
00258 "psrad $" #shift ", %%mm1 \n\t"\
00259 "psrad $" #shift ", %%mm2 \n\t"\
00260 "packssdw %%mm1, %%mm7 \n\t" \
00261 "packssdw %%mm4, %%mm2 \n\t" \
00262 "movq %%mm7, " #dst " \n\t"\
00263 "movq " #src1 ", %%mm1 \n\t" \
00264 "movq 80(%2), %%mm4 \n\t" \
00265 "movq %%mm2, 24+" #dst " \n\t"\
00266 "pmaddwd %%mm1, %%mm4 \n\t" \
00267 "movq 88(%2), %%mm7 \n\t" \
00268 "pmaddwd 96(%2), %%mm1 \n\t" \
00269 "pmaddwd %%mm3, %%mm7 \n\t" \
00270 "movq %%mm0, %%mm2 \n\t" \
00271 "pmaddwd 104(%2), %%mm3 \n\t" \
00272 "paddd %%mm7, %%mm4 \n\t" \
00273 "paddd %%mm4, %%mm2 \n\t" \
00274 "psubd %%mm4, %%mm0 \n\t" \
00275 "psrad $" #shift ", %%mm2 \n\t"\
00276 "psrad $" #shift ", %%mm0 \n\t"\
00277 "movq %%mm6, %%mm4 \n\t" \
00278 "paddd %%mm1, %%mm3 \n\t" \
00279 "paddd %%mm3, %%mm6 \n\t" \
00280 "psubd %%mm3, %%mm4 \n\t" \
00281 "psrad $" #shift ", %%mm6 \n\t"\
00282 "packssdw %%mm6, %%mm2 \n\t" \
00283 "movq %%mm2, 8+" #dst " \n\t"\
00284 "psrad $" #shift ", %%mm4 \n\t"\
00285 "packssdw %%mm0, %%mm4 \n\t" \
00286 "movq %%mm4, 16+" #dst " \n\t"\
00287
00288 #define COL_IDCT(src0, src4, src1, src5, dst, shift) \
00289 "movq " #src0 ", %%mm0 \n\t" \
00290 "movq " #src4 ", %%mm1 \n\t" \
00291 "movq " #src1 ", %%mm2 \n\t" \
00292 "movq " #src5 ", %%mm3 \n\t" \
00293 "movq 16(%2), %%mm4 \n\t" \
00294 "pmaddwd %%mm0, %%mm4 \n\t" \
00295 "movq 24(%2), %%mm5 \n\t" \
00296 "pmaddwd %%mm5, %%mm0 \n\t" \
00297 "movq 32(%2), %%mm5 \n\t" \
00298 "pmaddwd %%mm1, %%mm5 \n\t" \
00299 "movq 40(%2), %%mm6 \n\t" \
00300 "pmaddwd %%mm6, %%mm1 \n\t" \
00301 "movq %%mm4, %%mm6 \n\t" \
00302 "movq 48(%2), %%mm7 \n\t" \
00303 "pmaddwd %%mm2, %%mm7 \n\t" \
00304 "paddd %%mm5, %%mm4 \n\t" \
00305 "psubd %%mm5, %%mm6 \n\t" \
00306 "movq %%mm0, %%mm5 \n\t" \
00307 "paddd %%mm1, %%mm0 \n\t" \
00308 "psubd %%mm1, %%mm5 \n\t" \
00309 "movq 56(%2), %%mm1 \n\t" \
00310 "pmaddwd %%mm3, %%mm1 \n\t" \
00311 "pmaddwd 64(%2), %%mm2 \n\t" \
00312 "paddd %%mm1, %%mm7 \n\t" \
00313 "movq 72(%2), %%mm1 \n\t" \
00314 "pmaddwd %%mm3, %%mm1 \n\t" \
00315 "paddd %%mm4, %%mm7 \n\t" \
00316 "paddd %%mm4, %%mm4 \n\t" \
00317 "psubd %%mm7, %%mm4 \n\t" \
00318 "paddd %%mm2, %%mm1 \n\t" \
00319 "psrad $" #shift ", %%mm7 \n\t"\
00320 "psrad $" #shift ", %%mm4 \n\t"\
00321 "movq %%mm0, %%mm2 \n\t" \
00322 "paddd %%mm1, %%mm0 \n\t" \
00323 "psubd %%mm1, %%mm2 \n\t" \
00324 "psrad $" #shift ", %%mm0 \n\t"\
00325 "psrad $" #shift ", %%mm2 \n\t"\
00326 "packssdw %%mm7, %%mm7 \n\t" \
00327 "movd %%mm7, " #dst " \n\t"\
00328 "packssdw %%mm0, %%mm0 \n\t" \
00329 "movd %%mm0, 16+" #dst " \n\t"\
00330 "packssdw %%mm2, %%mm2 \n\t" \
00331 "movd %%mm2, 96+" #dst " \n\t"\
00332 "packssdw %%mm4, %%mm4 \n\t" \
00333 "movd %%mm4, 112+" #dst " \n\t"\
00334 "movq " #src1 ", %%mm0 \n\t" \
00335 "movq 80(%2), %%mm4 \n\t" \
00336 "pmaddwd %%mm0, %%mm4 \n\t" \
00337 "movq 88(%2), %%mm7 \n\t" \
00338 "pmaddwd 96(%2), %%mm0 \n\t" \
00339 "pmaddwd %%mm3, %%mm7 \n\t" \
00340 "movq %%mm5, %%mm2 \n\t" \
00341 "pmaddwd 104(%2), %%mm3 \n\t" \
00342 "paddd %%mm7, %%mm4 \n\t" \
00343 "paddd %%mm4, %%mm2 \n\t" \
00344 "psubd %%mm4, %%mm5 \n\t" \
00345 "psrad $" #shift ", %%mm2 \n\t"\
00346 "psrad $" #shift ", %%mm5 \n\t"\
00347 "movq %%mm6, %%mm4 \n\t" \
00348 "paddd %%mm0, %%mm3 \n\t" \
00349 "paddd %%mm3, %%mm6 \n\t" \
00350 "psubd %%mm3, %%mm4 \n\t" \
00351 "psrad $" #shift ", %%mm6 \n\t"\
00352 "psrad $" #shift ", %%mm4 \n\t"\
00353 "packssdw %%mm2, %%mm2 \n\t" \
00354 "packssdw %%mm6, %%mm6 \n\t" \
00355 "movd %%mm2, 32+" #dst " \n\t"\
00356 "packssdw %%mm4, %%mm4 \n\t" \
00357 "packssdw %%mm5, %%mm5 \n\t" \
00358 "movd %%mm6, 48+" #dst " \n\t"\
00359 "movd %%mm4, 64+" #dst " \n\t"\
00360 "movd %%mm5, 80+" #dst " \n\t"\
00361
00362
00363 #define DC_COND_ROW_IDCT(src0, src4, src1, src5, dst, rounder, shift) \
00364 "movq " #src0 ", %%mm0 \n\t" \
00365 "movq " #src4 ", %%mm1 \n\t" \
00366 "movq " #src1 ", %%mm2 \n\t" \
00367 "movq " #src5 ", %%mm3 \n\t" \
00368 "movq "MANGLE(wm1010)", %%mm4 \n\t"\
00369 "pand %%mm0, %%mm4 \n\t"\
00370 "por %%mm1, %%mm4 \n\t"\
00371 "por %%mm2, %%mm4 \n\t"\
00372 "por %%mm3, %%mm4 \n\t"\
00373 "packssdw %%mm4,%%mm4 \n\t"\
00374 "movd %%mm4, %%eax \n\t"\
00375 "orl %%eax, %%eax \n\t"\
00376 "jz 1f \n\t"\
00377 "movq 16(%2), %%mm4 \n\t" \
00378 "pmaddwd %%mm0, %%mm4 \n\t" \
00379 "movq 24(%2), %%mm5 \n\t" \
00380 "pmaddwd %%mm5, %%mm0 \n\t" \
00381 "movq 32(%2), %%mm5 \n\t" \
00382 "pmaddwd %%mm1, %%mm5 \n\t" \
00383 "movq 40(%2), %%mm6 \n\t" \
00384 "pmaddwd %%mm6, %%mm1 \n\t" \
00385 "movq 48(%2), %%mm7 \n\t" \
00386 "pmaddwd %%mm2, %%mm7 \n\t" \
00387 #rounder ", %%mm4 \n\t"\
00388 "movq %%mm4, %%mm6 \n\t" \
00389 "paddd %%mm5, %%mm4 \n\t" \
00390 "psubd %%mm5, %%mm6 \n\t" \
00391 "movq 56(%2), %%mm5 \n\t" \
00392 "pmaddwd %%mm3, %%mm5 \n\t" \
00393 #rounder ", %%mm0 \n\t"\
00394 "paddd %%mm0, %%mm1 \n\t" \
00395 "paddd %%mm0, %%mm0 \n\t" \
00396 "psubd %%mm1, %%mm0 \n\t" \
00397 "pmaddwd 64(%2), %%mm2 \n\t" \
00398 "paddd %%mm5, %%mm7 \n\t" \
00399 "movq 72(%2), %%mm5 \n\t" \
00400 "pmaddwd %%mm3, %%mm5 \n\t" \
00401 "paddd %%mm4, %%mm7 \n\t" \
00402 "paddd %%mm4, %%mm4 \n\t" \
00403 "psubd %%mm7, %%mm4 \n\t" \
00404 "paddd %%mm2, %%mm5 \n\t" \
00405 "psrad $" #shift ", %%mm7 \n\t"\
00406 "psrad $" #shift ", %%mm4 \n\t"\
00407 "movq %%mm1, %%mm2 \n\t" \
00408 "paddd %%mm5, %%mm1 \n\t" \
00409 "psubd %%mm5, %%mm2 \n\t" \
00410 "psrad $" #shift ", %%mm1 \n\t"\
00411 "psrad $" #shift ", %%mm2 \n\t"\
00412 "packssdw %%mm1, %%mm7 \n\t" \
00413 "packssdw %%mm4, %%mm2 \n\t" \
00414 "movq %%mm7, " #dst " \n\t"\
00415 "movq " #src1 ", %%mm1 \n\t" \
00416 "movq 80(%2), %%mm4 \n\t" \
00417 "movq %%mm2, 24+" #dst " \n\t"\
00418 "pmaddwd %%mm1, %%mm4 \n\t" \
00419 "movq 88(%2), %%mm7 \n\t" \
00420 "pmaddwd 96(%2), %%mm1 \n\t" \
00421 "pmaddwd %%mm3, %%mm7 \n\t" \
00422 "movq %%mm0, %%mm2 \n\t" \
00423 "pmaddwd 104(%2), %%mm3 \n\t" \
00424 "paddd %%mm7, %%mm4 \n\t" \
00425 "paddd %%mm4, %%mm2 \n\t" \
00426 "psubd %%mm4, %%mm0 \n\t" \
00427 "psrad $" #shift ", %%mm2 \n\t"\
00428 "psrad $" #shift ", %%mm0 \n\t"\
00429 "movq %%mm6, %%mm4 \n\t" \
00430 "paddd %%mm1, %%mm3 \n\t" \
00431 "paddd %%mm3, %%mm6 \n\t" \
00432 "psubd %%mm3, %%mm4 \n\t" \
00433 "psrad $" #shift ", %%mm6 \n\t"\
00434 "packssdw %%mm6, %%mm2 \n\t" \
00435 "movq %%mm2, 8+" #dst " \n\t"\
00436 "psrad $" #shift ", %%mm4 \n\t"\
00437 "packssdw %%mm0, %%mm4 \n\t" \
00438 "movq %%mm4, 16+" #dst " \n\t"\
00439 "jmp 2f \n\t"\
00440 "1: \n\t"\
00441 "pslld $16, %%mm0 \n\t"\
00442 "#paddd "MANGLE(d40000)", %%mm0 \n\t"\
00443 "psrad $13, %%mm0 \n\t"\
00444 "packssdw %%mm0, %%mm0 \n\t"\
00445 "movq %%mm0, " #dst " \n\t"\
00446 "movq %%mm0, 8+" #dst " \n\t"\
00447 "movq %%mm0, 16+" #dst " \n\t"\
00448 "movq %%mm0, 24+" #dst " \n\t"\
00449 "2: \n\t"
00450
00451
00452
00453 ROW_IDCT( (%0), 8(%0), 16(%0), 24(%0), 0(%1),paddd 8(%2), 11)
00454
00455
00456
00457
00458 DC_COND_ROW_IDCT( 32(%0), 40(%0), 48(%0), 56(%0), 32(%1),paddd (%2), 11)
00459 DC_COND_ROW_IDCT( 64(%0), 72(%0), 80(%0), 88(%0), 64(%1),paddd (%2), 11)
00460 DC_COND_ROW_IDCT( 96(%0),104(%0),112(%0),120(%0), 96(%1),paddd (%2), 11)
00461
00462
00463
00464 COL_IDCT( (%1), 64(%1), 32(%1), 96(%1), 0(%0), 20)
00465 COL_IDCT( 8(%1), 72(%1), 40(%1), 104(%1), 4(%0), 20)
00466 COL_IDCT( 16(%1), 80(%1), 48(%1), 112(%1), 8(%0), 20)
00467 COL_IDCT( 24(%1), 88(%1), 56(%1), 120(%1), 12(%0), 20)
00468
00469 #else
00470
00471 #define DC_COND_IDCT(src0, src4, src1, src5, dst, rounder, shift) \
00472 "movq " #src0 ", %%mm0 \n\t" \
00473 "movq " #src4 ", %%mm1 \n\t" \
00474 "movq " #src1 ", %%mm2 \n\t" \
00475 "movq " #src5 ", %%mm3 \n\t" \
00476 "movq "MANGLE(wm1010)", %%mm4 \n\t"\
00477 "pand %%mm0, %%mm4 \n\t"\
00478 "por %%mm1, %%mm4 \n\t"\
00479 "por %%mm2, %%mm4 \n\t"\
00480 "por %%mm3, %%mm4 \n\t"\
00481 "packssdw %%mm4,%%mm4 \n\t"\
00482 "movd %%mm4, %%eax \n\t"\
00483 "orl %%eax, %%eax \n\t"\
00484 "jz 1f \n\t"\
00485 "movq 16(%2), %%mm4 \n\t" \
00486 "pmaddwd %%mm0, %%mm4 \n\t" \
00487 "movq 24(%2), %%mm5 \n\t" \
00488 "pmaddwd %%mm5, %%mm0 \n\t" \
00489 "movq 32(%2), %%mm5 \n\t" \
00490 "pmaddwd %%mm1, %%mm5 \n\t" \
00491 "movq 40(%2), %%mm6 \n\t" \
00492 "pmaddwd %%mm6, %%mm1 \n\t" \
00493 "movq 48(%2), %%mm7 \n\t" \
00494 "pmaddwd %%mm2, %%mm7 \n\t" \
00495 #rounder ", %%mm4 \n\t"\
00496 "movq %%mm4, %%mm6 \n\t" \
00497 "paddd %%mm5, %%mm4 \n\t" \
00498 "psubd %%mm5, %%mm6 \n\t" \
00499 "movq 56(%2), %%mm5 \n\t" \
00500 "pmaddwd %%mm3, %%mm5 \n\t" \
00501 #rounder ", %%mm0 \n\t"\
00502 "paddd %%mm0, %%mm1 \n\t" \
00503 "paddd %%mm0, %%mm0 \n\t" \
00504 "psubd %%mm1, %%mm0 \n\t" \
00505 "pmaddwd 64(%2), %%mm2 \n\t" \
00506 "paddd %%mm5, %%mm7 \n\t" \
00507 "movq 72(%2), %%mm5 \n\t" \
00508 "pmaddwd %%mm3, %%mm5 \n\t" \
00509 "paddd %%mm4, %%mm7 \n\t" \
00510 "paddd %%mm4, %%mm4 \n\t" \
00511 "psubd %%mm7, %%mm4 \n\t" \
00512 "paddd %%mm2, %%mm5 \n\t" \
00513 "psrad $" #shift ", %%mm7 \n\t"\
00514 "psrad $" #shift ", %%mm4 \n\t"\
00515 "movq %%mm1, %%mm2 \n\t" \
00516 "paddd %%mm5, %%mm1 \n\t" \
00517 "psubd %%mm5, %%mm2 \n\t" \
00518 "psrad $" #shift ", %%mm1 \n\t"\
00519 "psrad $" #shift ", %%mm2 \n\t"\
00520 "packssdw %%mm1, %%mm7 \n\t" \
00521 "packssdw %%mm4, %%mm2 \n\t" \
00522 "movq %%mm7, " #dst " \n\t"\
00523 "movq " #src1 ", %%mm1 \n\t" \
00524 "movq 80(%2), %%mm4 \n\t" \
00525 "movq %%mm2, 24+" #dst " \n\t"\
00526 "pmaddwd %%mm1, %%mm4 \n\t" \
00527 "movq 88(%2), %%mm7 \n\t" \
00528 "pmaddwd 96(%2), %%mm1 \n\t" \
00529 "pmaddwd %%mm3, %%mm7 \n\t" \
00530 "movq %%mm0, %%mm2 \n\t" \
00531 "pmaddwd 104(%2), %%mm3 \n\t" \
00532 "paddd %%mm7, %%mm4 \n\t" \
00533 "paddd %%mm4, %%mm2 \n\t" \
00534 "psubd %%mm4, %%mm0 \n\t" \
00535 "psrad $" #shift ", %%mm2 \n\t"\
00536 "psrad $" #shift ", %%mm0 \n\t"\
00537 "movq %%mm6, %%mm4 \n\t" \
00538 "paddd %%mm1, %%mm3 \n\t" \
00539 "paddd %%mm3, %%mm6 \n\t" \
00540 "psubd %%mm3, %%mm4 \n\t" \
00541 "psrad $" #shift ", %%mm6 \n\t"\
00542 "packssdw %%mm6, %%mm2 \n\t" \
00543 "movq %%mm2, 8+" #dst " \n\t"\
00544 "psrad $" #shift ", %%mm4 \n\t"\
00545 "packssdw %%mm0, %%mm4 \n\t" \
00546 "movq %%mm4, 16+" #dst " \n\t"\
00547 "jmp 2f \n\t"\
00548 "1: \n\t"\
00549 "pslld $16, %%mm0 \n\t"\
00550 "paddd "MANGLE(d40000)", %%mm0 \n\t"\
00551 "psrad $13, %%mm0 \n\t"\
00552 "packssdw %%mm0, %%mm0 \n\t"\
00553 "movq %%mm0, " #dst " \n\t"\
00554 "movq %%mm0, 8+" #dst " \n\t"\
00555 "movq %%mm0, 16+" #dst " \n\t"\
00556 "movq %%mm0, 24+" #dst " \n\t"\
00557 "2: \n\t"
00558
00559 #define Z_COND_IDCT(src0, src4, src1, src5, dst, rounder, shift, bt) \
00560 "movq " #src0 ", %%mm0 \n\t" \
00561 "movq " #src4 ", %%mm1 \n\t" \
00562 "movq " #src1 ", %%mm2 \n\t" \
00563 "movq " #src5 ", %%mm3 \n\t" \
00564 "movq %%mm0, %%mm4 \n\t"\
00565 "por %%mm1, %%mm4 \n\t"\
00566 "por %%mm2, %%mm4 \n\t"\
00567 "por %%mm3, %%mm4 \n\t"\
00568 "packssdw %%mm4,%%mm4 \n\t"\
00569 "movd %%mm4, %%eax \n\t"\
00570 "orl %%eax, %%eax \n\t"\
00571 "jz " #bt " \n\t"\
00572 "movq 16(%2), %%mm4 \n\t" \
00573 "pmaddwd %%mm0, %%mm4 \n\t" \
00574 "movq 24(%2), %%mm5 \n\t" \
00575 "pmaddwd %%mm5, %%mm0 \n\t" \
00576 "movq 32(%2), %%mm5 \n\t" \
00577 "pmaddwd %%mm1, %%mm5 \n\t" \
00578 "movq 40(%2), %%mm6 \n\t" \
00579 "pmaddwd %%mm6, %%mm1 \n\t" \
00580 "movq 48(%2), %%mm7 \n\t" \
00581 "pmaddwd %%mm2, %%mm7 \n\t" \
00582 #rounder ", %%mm4 \n\t"\
00583 "movq %%mm4, %%mm6 \n\t" \
00584 "paddd %%mm5, %%mm4 \n\t" \
00585 "psubd %%mm5, %%mm6 \n\t" \
00586 "movq 56(%2), %%mm5 \n\t" \
00587 "pmaddwd %%mm3, %%mm5 \n\t" \
00588 #rounder ", %%mm0 \n\t"\
00589 "paddd %%mm0, %%mm1 \n\t" \
00590 "paddd %%mm0, %%mm0 \n\t" \
00591 "psubd %%mm1, %%mm0 \n\t" \
00592 "pmaddwd 64(%2), %%mm2 \n\t" \
00593 "paddd %%mm5, %%mm7 \n\t" \
00594 "movq 72(%2), %%mm5 \n\t" \
00595 "pmaddwd %%mm3, %%mm5 \n\t" \
00596 "paddd %%mm4, %%mm7 \n\t" \
00597 "paddd %%mm4, %%mm4 \n\t" \
00598 "psubd %%mm7, %%mm4 \n\t" \
00599 "paddd %%mm2, %%mm5 \n\t" \
00600 "psrad $" #shift ", %%mm7 \n\t"\
00601 "psrad $" #shift ", %%mm4 \n\t"\
00602 "movq %%mm1, %%mm2 \n\t" \
00603 "paddd %%mm5, %%mm1 \n\t" \
00604 "psubd %%mm5, %%mm2 \n\t" \
00605 "psrad $" #shift ", %%mm1 \n\t"\
00606 "psrad $" #shift ", %%mm2 \n\t"\
00607 "packssdw %%mm1, %%mm7 \n\t" \
00608 "packssdw %%mm4, %%mm2 \n\t" \
00609 "movq %%mm7, " #dst " \n\t"\
00610 "movq " #src1 ", %%mm1 \n\t" \
00611 "movq 80(%2), %%mm4 \n\t" \
00612 "movq %%mm2, 24+" #dst " \n\t"\
00613 "pmaddwd %%mm1, %%mm4 \n\t" \
00614 "movq 88(%2), %%mm7 \n\t" \
00615 "pmaddwd 96(%2), %%mm1 \n\t" \
00616 "pmaddwd %%mm3, %%mm7 \n\t" \
00617 "movq %%mm0, %%mm2 \n\t" \
00618 "pmaddwd 104(%2), %%mm3 \n\t" \
00619 "paddd %%mm7, %%mm4 \n\t" \
00620 "paddd %%mm4, %%mm2 \n\t" \
00621 "psubd %%mm4, %%mm0 \n\t" \
00622 "psrad $" #shift ", %%mm2 \n\t"\
00623 "psrad $" #shift ", %%mm0 \n\t"\
00624 "movq %%mm6, %%mm4 \n\t" \
00625 "paddd %%mm1, %%mm3 \n\t" \
00626 "paddd %%mm3, %%mm6 \n\t" \
00627 "psubd %%mm3, %%mm4 \n\t" \
00628 "psrad $" #shift ", %%mm6 \n\t"\
00629 "packssdw %%mm6, %%mm2 \n\t" \
00630 "movq %%mm2, 8+" #dst " \n\t"\
00631 "psrad $" #shift ", %%mm4 \n\t"\
00632 "packssdw %%mm0, %%mm4 \n\t" \
00633 "movq %%mm4, 16+" #dst " \n\t"\
00634
00635 #define ROW_IDCT(src0, src4, src1, src5, dst, rounder, shift) \
00636 "movq " #src0 ", %%mm0 \n\t" \
00637 "movq " #src4 ", %%mm1 \n\t" \
00638 "movq " #src1 ", %%mm2 \n\t" \
00639 "movq " #src5 ", %%mm3 \n\t" \
00640 "movq 16(%2), %%mm4 \n\t" \
00641 "pmaddwd %%mm0, %%mm4 \n\t" \
00642 "movq 24(%2), %%mm5 \n\t" \
00643 "pmaddwd %%mm5, %%mm0 \n\t" \
00644 "movq 32(%2), %%mm5 \n\t" \
00645 "pmaddwd %%mm1, %%mm5 \n\t" \
00646 "movq 40(%2), %%mm6 \n\t" \
00647 "pmaddwd %%mm6, %%mm1 \n\t" \
00648 "movq 48(%2), %%mm7 \n\t" \
00649 "pmaddwd %%mm2, %%mm7 \n\t" \
00650 #rounder ", %%mm4 \n\t"\
00651 "movq %%mm4, %%mm6 \n\t" \
00652 "paddd %%mm5, %%mm4 \n\t" \
00653 "psubd %%mm5, %%mm6 \n\t" \
00654 "movq 56(%2), %%mm5 \n\t" \
00655 "pmaddwd %%mm3, %%mm5 \n\t" \
00656 #rounder ", %%mm0 \n\t"\
00657 "paddd %%mm0, %%mm1 \n\t" \
00658 "paddd %%mm0, %%mm0 \n\t" \
00659 "psubd %%mm1, %%mm0 \n\t" \
00660 "pmaddwd 64(%2), %%mm2 \n\t" \
00661 "paddd %%mm5, %%mm7 \n\t" \
00662 "movq 72(%2), %%mm5 \n\t" \
00663 "pmaddwd %%mm3, %%mm5 \n\t" \
00664 "paddd %%mm4, %%mm7 \n\t" \
00665 "paddd %%mm4, %%mm4 \n\t" \
00666 "psubd %%mm7, %%mm4 \n\t" \
00667 "paddd %%mm2, %%mm5 \n\t" \
00668 "psrad $" #shift ", %%mm7 \n\t"\
00669 "psrad $" #shift ", %%mm4 \n\t"\
00670 "movq %%mm1, %%mm2 \n\t" \
00671 "paddd %%mm5, %%mm1 \n\t" \
00672 "psubd %%mm5, %%mm2 \n\t" \
00673 "psrad $" #shift ", %%mm1 \n\t"\
00674 "psrad $" #shift ", %%mm2 \n\t"\
00675 "packssdw %%mm1, %%mm7 \n\t" \
00676 "packssdw %%mm4, %%mm2 \n\t" \
00677 "movq %%mm7, " #dst " \n\t"\
00678 "movq " #src1 ", %%mm1 \n\t" \
00679 "movq 80(%2), %%mm4 \n\t" \
00680 "movq %%mm2, 24+" #dst " \n\t"\
00681 "pmaddwd %%mm1, %%mm4 \n\t" \
00682 "movq 88(%2), %%mm7 \n\t" \
00683 "pmaddwd 96(%2), %%mm1 \n\t" \
00684 "pmaddwd %%mm3, %%mm7 \n\t" \
00685 "movq %%mm0, %%mm2 \n\t" \
00686 "pmaddwd 104(%2), %%mm3 \n\t" \
00687 "paddd %%mm7, %%mm4 \n\t" \
00688 "paddd %%mm4, %%mm2 \n\t" \
00689 "psubd %%mm4, %%mm0 \n\t" \
00690 "psrad $" #shift ", %%mm2 \n\t"\
00691 "psrad $" #shift ", %%mm0 \n\t"\
00692 "movq %%mm6, %%mm4 \n\t" \
00693 "paddd %%mm1, %%mm3 \n\t" \
00694 "paddd %%mm3, %%mm6 \n\t" \
00695 "psubd %%mm3, %%mm4 \n\t" \
00696 "psrad $" #shift ", %%mm6 \n\t"\
00697 "packssdw %%mm6, %%mm2 \n\t" \
00698 "movq %%mm2, 8+" #dst " \n\t"\
00699 "psrad $" #shift ", %%mm4 \n\t"\
00700 "packssdw %%mm0, %%mm4 \n\t" \
00701 "movq %%mm4, 16+" #dst " \n\t"\
00702
00703
00704 DC_COND_IDCT( 0(%0), 8(%0), 16(%0), 24(%0), 0(%1),paddd 8(%2), 11)
00705 Z_COND_IDCT( 32(%0), 40(%0), 48(%0), 56(%0), 32(%1),paddd (%2), 11, 4f)
00706 Z_COND_IDCT( 64(%0), 72(%0), 80(%0), 88(%0), 64(%1),paddd (%2), 11, 2f)
00707 Z_COND_IDCT( 96(%0),104(%0),112(%0),120(%0), 96(%1),paddd (%2), 11, 1f)
00708
00709 #undef IDCT
00710 #define IDCT(src0, src4, src1, src5, dst, shift) \
00711 "movq " #src0 ", %%mm0 \n\t" \
00712 "movq " #src4 ", %%mm1 \n\t" \
00713 "movq " #src1 ", %%mm2 \n\t" \
00714 "movq " #src5 ", %%mm3 \n\t" \
00715 "movq 16(%2), %%mm4 \n\t" \
00716 "pmaddwd %%mm0, %%mm4 \n\t" \
00717 "movq 24(%2), %%mm5 \n\t" \
00718 "pmaddwd %%mm5, %%mm0 \n\t" \
00719 "movq 32(%2), %%mm5 \n\t" \
00720 "pmaddwd %%mm1, %%mm5 \n\t" \
00721 "movq 40(%2), %%mm6 \n\t" \
00722 "pmaddwd %%mm6, %%mm1 \n\t" \
00723 "movq %%mm4, %%mm6 \n\t" \
00724 "movq 48(%2), %%mm7 \n\t" \
00725 "pmaddwd %%mm2, %%mm7 \n\t" \
00726 "paddd %%mm5, %%mm4 \n\t" \
00727 "psubd %%mm5, %%mm6 \n\t" \
00728 "movq %%mm0, %%mm5 \n\t" \
00729 "paddd %%mm1, %%mm0 \n\t" \
00730 "psubd %%mm1, %%mm5 \n\t" \
00731 "movq 56(%2), %%mm1 \n\t" \
00732 "pmaddwd %%mm3, %%mm1 \n\t" \
00733 "pmaddwd 64(%2), %%mm2 \n\t" \
00734 "paddd %%mm1, %%mm7 \n\t" \
00735 "movq 72(%2), %%mm1 \n\t" \
00736 "pmaddwd %%mm3, %%mm1 \n\t" \
00737 "paddd %%mm4, %%mm7 \n\t" \
00738 "paddd %%mm4, %%mm4 \n\t" \
00739 "psubd %%mm7, %%mm4 \n\t" \
00740 "paddd %%mm2, %%mm1 \n\t" \
00741 "psrad $" #shift ", %%mm7 \n\t"\
00742 "psrad $" #shift ", %%mm4 \n\t"\
00743 "movq %%mm0, %%mm2 \n\t" \
00744 "paddd %%mm1, %%mm0 \n\t" \
00745 "psubd %%mm1, %%mm2 \n\t" \
00746 "psrad $" #shift ", %%mm0 \n\t"\
00747 "psrad $" #shift ", %%mm2 \n\t"\
00748 "packssdw %%mm7, %%mm7 \n\t" \
00749 "movd %%mm7, " #dst " \n\t"\
00750 "packssdw %%mm0, %%mm0 \n\t" \
00751 "movd %%mm0, 16+" #dst " \n\t"\
00752 "packssdw %%mm2, %%mm2 \n\t" \
00753 "movd %%mm2, 96+" #dst " \n\t"\
00754 "packssdw %%mm4, %%mm4 \n\t" \
00755 "movd %%mm4, 112+" #dst " \n\t"\
00756 "movq " #src1 ", %%mm0 \n\t" \
00757 "movq 80(%2), %%mm4 \n\t" \
00758 "pmaddwd %%mm0, %%mm4 \n\t" \
00759 "movq 88(%2), %%mm7 \n\t" \
00760 "pmaddwd 96(%2), %%mm0 \n\t" \
00761 "pmaddwd %%mm3, %%mm7 \n\t" \
00762 "movq %%mm5, %%mm2 \n\t" \
00763 "pmaddwd 104(%2), %%mm3 \n\t" \
00764 "paddd %%mm7, %%mm4 \n\t" \
00765 "paddd %%mm4, %%mm2 \n\t" \
00766 "psubd %%mm4, %%mm5 \n\t" \
00767 "psrad $" #shift ", %%mm2 \n\t"\
00768 "psrad $" #shift ", %%mm5 \n\t"\
00769 "movq %%mm6, %%mm4 \n\t" \
00770 "paddd %%mm0, %%mm3 \n\t" \
00771 "paddd %%mm3, %%mm6 \n\t" \
00772 "psubd %%mm3, %%mm4 \n\t" \
00773 "psrad $" #shift ", %%mm6 \n\t"\
00774 "psrad $" #shift ", %%mm4 \n\t"\
00775 "packssdw %%mm2, %%mm2 \n\t" \
00776 "packssdw %%mm6, %%mm6 \n\t" \
00777 "movd %%mm2, 32+" #dst " \n\t"\
00778 "packssdw %%mm4, %%mm4 \n\t" \
00779 "packssdw %%mm5, %%mm5 \n\t" \
00780 "movd %%mm6, 48+" #dst " \n\t"\
00781 "movd %%mm4, 64+" #dst " \n\t"\
00782 "movd %%mm5, 80+" #dst " \n\t"
00783
00784
00785
00786 IDCT( (%1), 64(%1), 32(%1), 96(%1), 0(%0), 20)
00787 IDCT( 8(%1), 72(%1), 40(%1), 104(%1), 4(%0), 20)
00788 IDCT( 16(%1), 80(%1), 48(%1), 112(%1), 8(%0), 20)
00789 IDCT( 24(%1), 88(%1), 56(%1), 120(%1), 12(%0), 20)
00790 "jmp 9f \n\t"
00791
00792 "# .p2align 4 \n\t"\
00793 "4: \n\t"
00794 Z_COND_IDCT( 64(%0), 72(%0), 80(%0), 88(%0), 64(%1),paddd (%2), 11, 6f)
00795 Z_COND_IDCT( 96(%0),104(%0),112(%0),120(%0), 96(%1),paddd (%2), 11, 5f)
00796
00797 #undef IDCT
00798 #define IDCT(src0, src4, src1, src5, dst, shift) \
00799 "movq " #src0 ", %%mm0 \n\t" \
00800 "movq " #src4 ", %%mm1 \n\t" \
00801 "movq " #src5 ", %%mm3 \n\t" \
00802 "movq 16(%2), %%mm4 \n\t" \
00803 "pmaddwd %%mm0, %%mm4 \n\t" \
00804 "movq 24(%2), %%mm5 \n\t" \
00805 "pmaddwd %%mm5, %%mm0 \n\t" \
00806 "movq 32(%2), %%mm5 \n\t" \
00807 "pmaddwd %%mm1, %%mm5 \n\t" \
00808 "movq 40(%2), %%mm6 \n\t" \
00809 "pmaddwd %%mm6, %%mm1 \n\t" \
00810 "movq %%mm4, %%mm6 \n\t" \
00811 "paddd %%mm5, %%mm4 \n\t" \
00812 "psubd %%mm5, %%mm6 \n\t" \
00813 "movq %%mm0, %%mm5 \n\t" \
00814 "paddd %%mm1, %%mm0 \n\t" \
00815 "psubd %%mm1, %%mm5 \n\t" \
00816 "movq 56(%2), %%mm1 \n\t" \
00817 "pmaddwd %%mm3, %%mm1 \n\t" \
00818 "movq 72(%2), %%mm7 \n\t" \
00819 "pmaddwd %%mm3, %%mm7 \n\t" \
00820 "paddd %%mm4, %%mm1 \n\t" \
00821 "paddd %%mm4, %%mm4 \n\t" \
00822 "psubd %%mm1, %%mm4 \n\t" \
00823 "psrad $" #shift ", %%mm1 \n\t"\
00824 "psrad $" #shift ", %%mm4 \n\t"\
00825 "movq %%mm0, %%mm2 \n\t" \
00826 "paddd %%mm7, %%mm0 \n\t" \
00827 "psubd %%mm7, %%mm2 \n\t" \
00828 "psrad $" #shift ", %%mm0 \n\t"\
00829 "psrad $" #shift ", %%mm2 \n\t"\
00830 "packssdw %%mm1, %%mm1 \n\t" \
00831 "movd %%mm1, " #dst " \n\t"\
00832 "packssdw %%mm0, %%mm0 \n\t" \
00833 "movd %%mm0, 16+" #dst " \n\t"\
00834 "packssdw %%mm2, %%mm2 \n\t" \
00835 "movd %%mm2, 96+" #dst " \n\t"\
00836 "packssdw %%mm4, %%mm4 \n\t" \
00837 "movd %%mm4, 112+" #dst " \n\t"\
00838 "movq 88(%2), %%mm1 \n\t" \
00839 "pmaddwd %%mm3, %%mm1 \n\t" \
00840 "movq %%mm5, %%mm2 \n\t" \
00841 "pmaddwd 104(%2), %%mm3 \n\t" \
00842 "paddd %%mm1, %%mm2 \n\t" \
00843 "psubd %%mm1, %%mm5 \n\t" \
00844 "psrad $" #shift ", %%mm2 \n\t"\
00845 "psrad $" #shift ", %%mm5 \n\t"\
00846 "movq %%mm6, %%mm1 \n\t" \
00847 "paddd %%mm3, %%mm6 \n\t" \
00848 "psubd %%mm3, %%mm1 \n\t" \
00849 "psrad $" #shift ", %%mm6 \n\t"\
00850 "psrad $" #shift ", %%mm1 \n\t"\
00851 "packssdw %%mm2, %%mm2 \n\t" \
00852 "packssdw %%mm6, %%mm6 \n\t" \
00853 "movd %%mm2, 32+" #dst " \n\t"\
00854 "packssdw %%mm1, %%mm1 \n\t" \
00855 "packssdw %%mm5, %%mm5 \n\t" \
00856 "movd %%mm6, 48+" #dst " \n\t"\
00857 "movd %%mm1, 64+" #dst " \n\t"\
00858 "movd %%mm5, 80+" #dst " \n\t"
00859
00860
00861 IDCT( (%1), 64(%1), 32(%1), 96(%1), 0(%0), 20)
00862 IDCT( 8(%1), 72(%1), 40(%1), 104(%1), 4(%0), 20)
00863 IDCT( 16(%1), 80(%1), 48(%1), 112(%1), 8(%0), 20)
00864 IDCT( 24(%1), 88(%1), 56(%1), 120(%1), 12(%0), 20)
00865 "jmp 9f \n\t"
00866
00867 "# .p2align 4 \n\t"\
00868 "6: \n\t"
00869 Z_COND_IDCT( 96(%0),104(%0),112(%0),120(%0), 96(%1),paddd (%2), 11, 7f)
00870
00871 #undef IDCT
00872 #define IDCT(src0, src4, src1, src5, dst, shift) \
00873 "movq " #src0 ", %%mm0 \n\t" \
00874 "movq " #src5 ", %%mm3 \n\t" \
00875 "movq 16(%2), %%mm4 \n\t" \
00876 "pmaddwd %%mm0, %%mm4 \n\t" \
00877 "movq 24(%2), %%mm5 \n\t" \
00878 "pmaddwd %%mm5, %%mm0 \n\t" \
00879 "movq %%mm4, %%mm6 \n\t" \
00880 "movq %%mm0, %%mm5 \n\t" \
00881 "movq 56(%2), %%mm1 \n\t" \
00882 "pmaddwd %%mm3, %%mm1 \n\t" \
00883 "movq 72(%2), %%mm7 \n\t" \
00884 "pmaddwd %%mm3, %%mm7 \n\t" \
00885 "paddd %%mm4, %%mm1 \n\t" \
00886 "paddd %%mm4, %%mm4 \n\t" \
00887 "psubd %%mm1, %%mm4 \n\t" \
00888 "psrad $" #shift ", %%mm1 \n\t"\
00889 "psrad $" #shift ", %%mm4 \n\t"\
00890 "movq %%mm0, %%mm2 \n\t" \
00891 "paddd %%mm7, %%mm0 \n\t" \
00892 "psubd %%mm7, %%mm2 \n\t" \
00893 "psrad $" #shift ", %%mm0 \n\t"\
00894 "psrad $" #shift ", %%mm2 \n\t"\
00895 "packssdw %%mm1, %%mm1 \n\t" \
00896 "movd %%mm1, " #dst " \n\t"\
00897 "packssdw %%mm0, %%mm0 \n\t" \
00898 "movd %%mm0, 16+" #dst " \n\t"\
00899 "packssdw %%mm2, %%mm2 \n\t" \
00900 "movd %%mm2, 96+" #dst " \n\t"\
00901 "packssdw %%mm4, %%mm4 \n\t" \
00902 "movd %%mm4, 112+" #dst " \n\t"\
00903 "movq 88(%2), %%mm1 \n\t" \
00904 "pmaddwd %%mm3, %%mm1 \n\t" \
00905 "movq %%mm5, %%mm2 \n\t" \
00906 "pmaddwd 104(%2), %%mm3 \n\t" \
00907 "paddd %%mm1, %%mm2 \n\t" \
00908 "psubd %%mm1, %%mm5 \n\t" \
00909 "psrad $" #shift ", %%mm2 \n\t"\
00910 "psrad $" #shift ", %%mm5 \n\t"\
00911 "movq %%mm6, %%mm1 \n\t" \
00912 "paddd %%mm3, %%mm6 \n\t" \
00913 "psubd %%mm3, %%mm1 \n\t" \
00914 "psrad $" #shift ", %%mm6 \n\t"\
00915 "psrad $" #shift ", %%mm1 \n\t"\
00916 "packssdw %%mm2, %%mm2 \n\t" \
00917 "packssdw %%mm6, %%mm6 \n\t" \
00918 "movd %%mm2, 32+" #dst " \n\t"\
00919 "packssdw %%mm1, %%mm1 \n\t" \
00920 "packssdw %%mm5, %%mm5 \n\t" \
00921 "movd %%mm6, 48+" #dst " \n\t"\
00922 "movd %%mm1, 64+" #dst " \n\t"\
00923 "movd %%mm5, 80+" #dst " \n\t"
00924
00925
00926
00927 IDCT( (%1), 64(%1), 32(%1), 96(%1), 0(%0), 20)
00928 IDCT( 8(%1), 72(%1), 40(%1), 104(%1), 4(%0), 20)
00929 IDCT( 16(%1), 80(%1), 48(%1), 112(%1), 8(%0), 20)
00930 IDCT( 24(%1), 88(%1), 56(%1), 120(%1), 12(%0), 20)
00931 "jmp 9f \n\t"
00932
00933 "# .p2align 4 \n\t"\
00934 "2: \n\t"
00935 Z_COND_IDCT( 96(%0),104(%0),112(%0),120(%0), 96(%1),paddd (%2), 11, 3f)
00936
00937 #undef IDCT
00938 #define IDCT(src0, src4, src1, src5, dst, shift) \
00939 "movq " #src0 ", %%mm0 \n\t" \
00940 "movq " #src1 ", %%mm2 \n\t" \
00941 "movq " #src5 ", %%mm3 \n\t" \
00942 "movq 16(%2), %%mm4 \n\t" \
00943 "pmaddwd %%mm0, %%mm4 \n\t" \
00944 "movq 24(%2), %%mm5 \n\t" \
00945 "pmaddwd %%mm5, %%mm0 \n\t" \
00946 "movq %%mm4, %%mm6 \n\t" \
00947 "movq 48(%2), %%mm7 \n\t" \
00948 "pmaddwd %%mm2, %%mm7 \n\t" \
00949 "movq %%mm0, %%mm5 \n\t" \
00950 "movq 56(%2), %%mm1 \n\t" \
00951 "pmaddwd %%mm3, %%mm1 \n\t" \
00952 "pmaddwd 64(%2), %%mm2 \n\t" \
00953 "paddd %%mm1, %%mm7 \n\t" \
00954 "movq 72(%2), %%mm1 \n\t" \
00955 "pmaddwd %%mm3, %%mm1 \n\t" \
00956 "paddd %%mm4, %%mm7 \n\t" \
00957 "paddd %%mm4, %%mm4 \n\t" \
00958 "psubd %%mm7, %%mm4 \n\t" \
00959 "paddd %%mm2, %%mm1 \n\t" \
00960 "psrad $" #shift ", %%mm7 \n\t"\
00961 "psrad $" #shift ", %%mm4 \n\t"\
00962 "movq %%mm0, %%mm2 \n\t" \
00963 "paddd %%mm1, %%mm0 \n\t" \
00964 "psubd %%mm1, %%mm2 \n\t" \
00965 "psrad $" #shift ", %%mm0 \n\t"\
00966 "psrad $" #shift ", %%mm2 \n\t"\
00967 "packssdw %%mm7, %%mm7 \n\t" \
00968 "movd %%mm7, " #dst " \n\t"\
00969 "packssdw %%mm0, %%mm0 \n\t" \
00970 "movd %%mm0, 16+" #dst " \n\t"\
00971 "packssdw %%mm2, %%mm2 \n\t" \
00972 "movd %%mm2, 96+" #dst " \n\t"\
00973 "packssdw %%mm4, %%mm4 \n\t" \
00974 "movd %%mm4, 112+" #dst " \n\t"\
00975 "movq " #src1 ", %%mm0 \n\t" \
00976 "movq 80(%2), %%mm4 \n\t" \
00977 "pmaddwd %%mm0, %%mm4 \n\t" \
00978 "movq 88(%2), %%mm7 \n\t" \
00979 "pmaddwd 96(%2), %%mm0 \n\t" \
00980 "pmaddwd %%mm3, %%mm7 \n\t" \
00981 "movq %%mm5, %%mm2 \n\t" \
00982 "pmaddwd 104(%2), %%mm3 \n\t" \
00983 "paddd %%mm7, %%mm4 \n\t" \
00984 "paddd %%mm4, %%mm2 \n\t" \
00985 "psubd %%mm4, %%mm5 \n\t" \
00986 "psrad $" #shift ", %%mm2 \n\t"\
00987 "psrad $" #shift ", %%mm5 \n\t"\
00988 "movq %%mm6, %%mm4 \n\t" \
00989 "paddd %%mm0, %%mm3 \n\t" \
00990 "paddd %%mm3, %%mm6 \n\t" \
00991 "psubd %%mm3, %%mm4 \n\t" \
00992 "psrad $" #shift ", %%mm6 \n\t"\
00993 "psrad $" #shift ", %%mm4 \n\t"\
00994 "packssdw %%mm2, %%mm2 \n\t" \
00995 "packssdw %%mm6, %%mm6 \n\t" \
00996 "movd %%mm2, 32+" #dst " \n\t"\
00997 "packssdw %%mm4, %%mm4 \n\t" \
00998 "packssdw %%mm5, %%mm5 \n\t" \
00999 "movd %%mm6, 48+" #dst " \n\t"\
01000 "movd %%mm4, 64+" #dst " \n\t"\
01001 "movd %%mm5, 80+" #dst " \n\t"
01002
01003
01004 IDCT( (%1), 64(%1), 32(%1), 96(%1), 0(%0), 20)
01005 IDCT( 8(%1), 72(%1), 40(%1), 104(%1), 4(%0), 20)
01006 IDCT( 16(%1), 80(%1), 48(%1), 112(%1), 8(%0), 20)
01007 IDCT( 24(%1), 88(%1), 56(%1), 120(%1), 12(%0), 20)
01008 "jmp 9f \n\t"
01009
01010 "# .p2align 4 \n\t"\
01011 "3: \n\t"
01012 #undef IDCT
01013 #define IDCT(src0, src4, src1, src5, dst, shift) \
01014 "movq " #src0 ", %%mm0 \n\t" \
01015 "movq " #src1 ", %%mm2 \n\t" \
01016 "movq 16(%2), %%mm4 \n\t" \
01017 "pmaddwd %%mm0, %%mm4 \n\t" \
01018 "movq 24(%2), %%mm5 \n\t" \
01019 "pmaddwd %%mm5, %%mm0 \n\t" \
01020 "movq %%mm4, %%mm6 \n\t" \
01021 "movq 48(%2), %%mm7 \n\t" \
01022 "pmaddwd %%mm2, %%mm7 \n\t" \
01023 "movq %%mm0, %%mm5 \n\t" \
01024 "movq 64(%2), %%mm3 \n\t"\
01025 "pmaddwd %%mm2, %%mm3 \n\t" \
01026 "paddd %%mm4, %%mm7 \n\t" \
01027 "paddd %%mm4, %%mm4 \n\t" \
01028 "psubd %%mm7, %%mm4 \n\t" \
01029 "psrad $" #shift ", %%mm7 \n\t"\
01030 "psrad $" #shift ", %%mm4 \n\t"\
01031 "movq %%mm0, %%mm1 \n\t" \
01032 "paddd %%mm3, %%mm0 \n\t" \
01033 "psubd %%mm3, %%mm1 \n\t" \
01034 "psrad $" #shift ", %%mm0 \n\t"\
01035 "psrad $" #shift ", %%mm1 \n\t"\
01036 "packssdw %%mm7, %%mm7 \n\t" \
01037 "movd %%mm7, " #dst " \n\t"\
01038 "packssdw %%mm0, %%mm0 \n\t" \
01039 "movd %%mm0, 16+" #dst " \n\t"\
01040 "packssdw %%mm1, %%mm1 \n\t" \
01041 "movd %%mm1, 96+" #dst " \n\t"\
01042 "packssdw %%mm4, %%mm4 \n\t" \
01043 "movd %%mm4, 112+" #dst " \n\t"\
01044 "movq 80(%2), %%mm4 \n\t" \
01045 "pmaddwd %%mm2, %%mm4 \n\t" \
01046 "pmaddwd 96(%2), %%mm2 \n\t" \
01047 "movq %%mm5, %%mm1 \n\t" \
01048 "paddd %%mm4, %%mm1 \n\t" \
01049 "psubd %%mm4, %%mm5 \n\t" \
01050 "psrad $" #shift ", %%mm1 \n\t"\
01051 "psrad $" #shift ", %%mm5 \n\t"\
01052 "movq %%mm6, %%mm4 \n\t" \
01053 "paddd %%mm2, %%mm6 \n\t" \
01054 "psubd %%mm2, %%mm4 \n\t" \
01055 "psrad $" #shift ", %%mm6 \n\t"\
01056 "psrad $" #shift ", %%mm4 \n\t"\
01057 "packssdw %%mm1, %%mm1 \n\t" \
01058 "packssdw %%mm6, %%mm6 \n\t" \
01059 "movd %%mm1, 32+" #dst " \n\t"\
01060 "packssdw %%mm4, %%mm4 \n\t" \
01061 "packssdw %%mm5, %%mm5 \n\t" \
01062 "movd %%mm6, 48+" #dst " \n\t"\
01063 "movd %%mm4, 64+" #dst " \n\t"\
01064 "movd %%mm5, 80+" #dst " \n\t"
01065
01066
01067
01068 IDCT( (%1), 64(%1), 32(%1), 96(%1), 0(%0), 20)
01069 IDCT( 8(%1), 72(%1), 40(%1), 104(%1), 4(%0), 20)
01070 IDCT( 16(%1), 80(%1), 48(%1), 112(%1), 8(%0), 20)
01071 IDCT( 24(%1), 88(%1), 56(%1), 120(%1), 12(%0), 20)
01072 "jmp 9f \n\t"
01073
01074 "# .p2align 4 \n\t"\
01075 "5: \n\t"
01076 #undef IDCT
01077 #define IDCT(src0, src4, src1, src5, dst, shift) \
01078 "movq " #src0 ", %%mm0 \n\t" \
01079 "movq " #src4 ", %%mm1 \n\t" \
01080 "movq 16(%2), %%mm4 \n\t" \
01081 "pmaddwd %%mm0, %%mm4 \n\t" \
01082 "movq 24(%2), %%mm5 \n\t" \
01083 "pmaddwd %%mm5, %%mm0 \n\t" \
01084 "movq 32(%2), %%mm5 \n\t" \
01085 "pmaddwd %%mm1, %%mm5 \n\t" \
01086 "movq 40(%2), %%mm6 \n\t" \
01087 "pmaddwd %%mm6, %%mm1 \n\t" \
01088 "movq %%mm4, %%mm6 \n\t" \
01089 "paddd %%mm5, %%mm4 \n\t" \
01090 "psubd %%mm5, %%mm6 \n\t" \
01091 "movq %%mm0, %%mm5 \n\t" \
01092 "paddd %%mm1, %%mm0 \n\t" \
01093 "psubd %%mm1, %%mm5 \n\t" \
01094 "movq 8+" #src0 ", %%mm2 \n\t" \
01095 "movq 8+" #src4 ", %%mm3 \n\t" \
01096 "movq 16(%2), %%mm1 \n\t" \
01097 "pmaddwd %%mm2, %%mm1 \n\t" \
01098 "movq 24(%2), %%mm7 \n\t" \
01099 "pmaddwd %%mm7, %%mm2 \n\t" \
01100 "movq 32(%2), %%mm7 \n\t" \
01101 "pmaddwd %%mm3, %%mm7 \n\t" \
01102 "pmaddwd 40(%2), %%mm3 \n\t" \
01103 "paddd %%mm1, %%mm7 \n\t" \
01104 "paddd %%mm1, %%mm1 \n\t" \
01105 "psubd %%mm7, %%mm1 \n\t" \
01106 "paddd %%mm2, %%mm3 \n\t" \
01107 "paddd %%mm2, %%mm2 \n\t" \
01108 "psubd %%mm3, %%mm2 \n\t" \
01109 "psrad $" #shift ", %%mm4 \n\t"\
01110 "psrad $" #shift ", %%mm7 \n\t"\
01111 "psrad $" #shift ", %%mm3 \n\t"\
01112 "packssdw %%mm7, %%mm4 \n\t" \
01113 "movq %%mm4, " #dst " \n\t"\
01114 "psrad $" #shift ", %%mm0 \n\t"\
01115 "packssdw %%mm3, %%mm0 \n\t" \
01116 "movq %%mm0, 16+" #dst " \n\t"\
01117 "movq %%mm0, 96+" #dst " \n\t"\
01118 "movq %%mm4, 112+" #dst " \n\t"\
01119 "psrad $" #shift ", %%mm5 \n\t"\
01120 "psrad $" #shift ", %%mm6 \n\t"\
01121 "psrad $" #shift ", %%mm2 \n\t"\
01122 "packssdw %%mm2, %%mm5 \n\t" \
01123 "movq %%mm5, 32+" #dst " \n\t"\
01124 "psrad $" #shift ", %%mm1 \n\t"\
01125 "packssdw %%mm1, %%mm6 \n\t" \
01126 "movq %%mm6, 48+" #dst " \n\t"\
01127 "movq %%mm6, 64+" #dst " \n\t"\
01128 "movq %%mm5, 80+" #dst " \n\t"
01129
01130
01131
01132 IDCT( 0(%1), 64(%1), 32(%1), 96(%1), 0(%0), 20)
01133
01134 IDCT( 16(%1), 80(%1), 48(%1), 112(%1), 8(%0), 20)
01135
01136 "jmp 9f \n\t"
01137
01138
01139 "# .p2align 4 \n\t"\
01140 "1: \n\t"
01141 #undef IDCT
01142 #define IDCT(src0, src4, src1, src5, dst, shift) \
01143 "movq " #src0 ", %%mm0 \n\t" \
01144 "movq " #src4 ", %%mm1 \n\t" \
01145 "movq " #src1 ", %%mm2 \n\t" \
01146 "movq 16(%2), %%mm4 \n\t" \
01147 "pmaddwd %%mm0, %%mm4 \n\t" \
01148 "movq 24(%2), %%mm5 \n\t" \
01149 "pmaddwd %%mm5, %%mm0 \n\t" \
01150 "movq 32(%2), %%mm5 \n\t" \
01151 "pmaddwd %%mm1, %%mm5 \n\t" \
01152 "movq 40(%2), %%mm6 \n\t" \
01153 "pmaddwd %%mm6, %%mm1 \n\t" \
01154 "movq %%mm4, %%mm6 \n\t" \
01155 "movq 48(%2), %%mm7 \n\t" \
01156 "pmaddwd %%mm2, %%mm7 \n\t" \
01157 "paddd %%mm5, %%mm4 \n\t" \
01158 "psubd %%mm5, %%mm6 \n\t" \
01159 "movq %%mm0, %%mm5 \n\t" \
01160 "paddd %%mm1, %%mm0 \n\t" \
01161 "psubd %%mm1, %%mm5 \n\t" \
01162 "movq 64(%2), %%mm1 \n\t"\
01163 "pmaddwd %%mm2, %%mm1 \n\t" \
01164 "paddd %%mm4, %%mm7 \n\t" \
01165 "paddd %%mm4, %%mm4 \n\t" \
01166 "psubd %%mm7, %%mm4 \n\t" \
01167 "psrad $" #shift ", %%mm7 \n\t"\
01168 "psrad $" #shift ", %%mm4 \n\t"\
01169 "movq %%mm0, %%mm3 \n\t" \
01170 "paddd %%mm1, %%mm0 \n\t" \
01171 "psubd %%mm1, %%mm3 \n\t" \
01172 "psrad $" #shift ", %%mm0 \n\t"\
01173 "psrad $" #shift ", %%mm3 \n\t"\
01174 "packssdw %%mm7, %%mm7 \n\t" \
01175 "movd %%mm7, " #dst " \n\t"\
01176 "packssdw %%mm0, %%mm0 \n\t" \
01177 "movd %%mm0, 16+" #dst " \n\t"\
01178 "packssdw %%mm3, %%mm3 \n\t" \
01179 "movd %%mm3, 96+" #dst " \n\t"\
01180 "packssdw %%mm4, %%mm4 \n\t" \
01181 "movd %%mm4, 112+" #dst " \n\t"\
01182 "movq 80(%2), %%mm4 \n\t" \
01183 "pmaddwd %%mm2, %%mm4 \n\t" \
01184 "pmaddwd 96(%2), %%mm2 \n\t" \
01185 "movq %%mm5, %%mm3 \n\t" \
01186 "paddd %%mm4, %%mm3 \n\t" \
01187 "psubd %%mm4, %%mm5 \n\t" \
01188 "psrad $" #shift ", %%mm3 \n\t"\
01189 "psrad $" #shift ", %%mm5 \n\t"\
01190 "movq %%mm6, %%mm4 \n\t" \
01191 "paddd %%mm2, %%mm6 \n\t" \
01192 "psubd %%mm2, %%mm4 \n\t" \
01193 "psrad $" #shift ", %%mm6 \n\t"\
01194 "packssdw %%mm3, %%mm3 \n\t" \
01195 "movd %%mm3, 32+" #dst " \n\t"\
01196 "psrad $" #shift ", %%mm4 \n\t"\
01197 "packssdw %%mm6, %%mm6 \n\t" \
01198 "movd %%mm6, 48+" #dst " \n\t"\
01199 "packssdw %%mm4, %%mm4 \n\t" \
01200 "packssdw %%mm5, %%mm5 \n\t" \
01201 "movd %%mm4, 64+" #dst " \n\t"\
01202 "movd %%mm5, 80+" #dst " \n\t"
01203
01204
01205
01206 IDCT( (%1), 64(%1), 32(%1), 96(%1), 0(%0), 20)
01207 IDCT( 8(%1), 72(%1), 40(%1), 104(%1), 4(%0), 20)
01208 IDCT( 16(%1), 80(%1), 48(%1), 112(%1), 8(%0), 20)
01209 IDCT( 24(%1), 88(%1), 56(%1), 120(%1), 12(%0), 20)
01210 "jmp 9f \n\t"
01211
01212
01213 "# .p2align 4 \n\t"
01214 "7: \n\t"
01215 #undef IDCT
01216 #define IDCT(src0, src4, src1, src5, dst, shift) \
01217 "movq " #src0 ", %%mm0 \n\t" \
01218 "movq 16(%2), %%mm4 \n\t" \
01219 "pmaddwd %%mm0, %%mm4 \n\t" \
01220 "movq 24(%2), %%mm5 \n\t" \
01221 "pmaddwd %%mm5, %%mm0 \n\t" \
01222 "psrad $" #shift ", %%mm4 \n\t"\
01223 "psrad $" #shift ", %%mm0 \n\t"\
01224 "movq 8+" #src0 ", %%mm2 \n\t" \
01225 "movq 16(%2), %%mm1 \n\t" \
01226 "pmaddwd %%mm2, %%mm1 \n\t" \
01227 "movq 24(%2), %%mm7 \n\t" \
01228 "pmaddwd %%mm7, %%mm2 \n\t" \
01229 "movq 32(%2), %%mm7 \n\t" \
01230 "psrad $" #shift ", %%mm1 \n\t"\
01231 "packssdw %%mm1, %%mm4 \n\t" \
01232 "movq %%mm4, " #dst " \n\t"\
01233 "psrad $" #shift ", %%mm2 \n\t"\
01234 "packssdw %%mm2, %%mm0 \n\t" \
01235 "movq %%mm0, 16+" #dst " \n\t"\
01236 "movq %%mm0, 96+" #dst " \n\t"\
01237 "movq %%mm4, 112+" #dst " \n\t"\
01238 "movq %%mm0, 32+" #dst " \n\t"\
01239 "movq %%mm4, 48+" #dst " \n\t"\
01240 "movq %%mm4, 64+" #dst " \n\t"\
01241 "movq %%mm0, 80+" #dst " \n\t"
01242
01243
01244 IDCT( 0(%1), 64(%1), 32(%1), 96(%1), 0(%0), 20)
01245
01246 IDCT( 16(%1), 80(%1), 48(%1), 112(%1), 8(%0), 20)
01247
01248
01249
01250 #endif
01251
01252
01253
01254
01255
01256
01257
01258
01259
01260
01261
01262
01263
01264
01265
01266
01267
01268
01269
01270
01271
01272
01273
01274 "9: \n\t"
01275 :: "r" (block), "r" (temp), "r" (coeffs)
01276 : "%eax"
01277 );
01278 }
01279
/**
 * Public entry point that runs the MMX inverse DCT on a coefficient block.
 *
 * This is a thin wrapper: it forwards to the file-local idct() routine and
 * does nothing else. idct() stores its output through the same pointer it
 * receives, so the caller's coefficients are overwritten by the result.
 *
 * @param block coefficient buffer, transformed in place.
 *              NOTE(review): presumably 64 16-bit coefficients (one 8x8
 *              block) with the alignment idct() requires -- confirm
 *              against idct() and the callers.
 *
 * NOTE(review): this wrapper takes int16_t * while the put/add wrappers in
 * this file take DCTELEM *; presumably DCTELEM is a 16-bit typedef --
 * confirm in dsputil.h.
 */
01280 void ff_simple_idct_mmx(int16_t *block)
01281 {
01282 idct(block); /* in-place transform; see idct() earlier in this file */
01283 }
01284
01285
01286
/**
 * Runs the MMX inverse DCT on a block, then hands the result to
 * ff_put_pixels_clamped_mmx().
 *
 * The block is transformed in place first, so its contents are overwritten
 * before being passed on.
 *
 * @param dest      destination buffer, forwarded unchanged to
 *                  ff_put_pixels_clamped_mmx().
 * @param line_size forwarded unchanged to ff_put_pixels_clamped_mmx().
 *                  NOTE(review): presumably the byte stride between
 *                  destination rows -- confirm against that helper.
 * @param block     coefficient buffer; modified by idct().
 *
 * NOTE(review): judging by its name, ff_put_pixels_clamped_mmx() stores
 * the clamped samples into dest (overwriting it); the helper is declared
 * outside this file, so verify there.
 */
01287 void ff_simple_idct_put_mmx(uint8_t *dest, int line_size, DCTELEM *block)
01288 {
01289 idct(block); /* transform the coefficients in place */
01290 ff_put_pixels_clamped_mmx(block, dest, line_size); /* note argument order: block first */
01291 }
/**
 * Runs the MMX inverse DCT on a block, then hands the result to
 * ff_add_pixels_clamped_mmx().
 *
 * The block is transformed in place first, so its contents are overwritten
 * before being passed on.
 *
 * @param dest      destination buffer, forwarded unchanged to
 *                  ff_add_pixels_clamped_mmx().
 * @param line_size forwarded unchanged to ff_add_pixels_clamped_mmx().
 *                  NOTE(review): presumably the byte stride between
 *                  destination rows -- confirm against that helper.
 * @param block     coefficient buffer; modified by idct().
 *
 * NOTE(review): judging by its name, ff_add_pixels_clamped_mmx() adds the
 * samples to what is already in dest and clamps the sum; the helper is
 * declared outside this file, so verify there.
 */
01292 void ff_simple_idct_add_mmx(uint8_t *dest, int line_size, DCTELEM *block)
01293 {
01294 idct(block); /* transform the coefficients in place */
01295 ff_add_pixels_clamped_mmx(block, dest, line_size); /* note argument order: block first */
01296 }