00001
00002
00003
00004
00005
00006
00007
00008
00009
00010
00011
00012
00013
00014
00015
00016
00017
00018
00019
00020
00021
00022
00023
00024
00025
00026
00027
00028
00029
00030
00031
00032
00033
00034
00035 #include <stdio.h>
00036 #include <stdlib.h>
00037 #include <string.h>
00038 #include <inttypes.h>
00039 #include <math.h>
00040
00041 #include "config.h"
00042
00043 #include "mp_msg.h"
00044 #include "cpudetect.h"
00045 #include "img_format.h"
00046 #include "mp_image.h"
00047 #include "vf.h"
00048 #include "vd_ffmpeg.h"
00049 #include "libvo/fastmemcpy.h"
00050
00051 #include "libavutil/internal.h"
00052 #include "libavutil/intreadwrite.h"
00053 #include "libavutil/mem.h"
00054 #include "libavcodec/avcodec.h"
00055 #include "libavcodec/dsputil.h"
00056
00057 #undef free
00058 #undef malloc
00059
00060
/* Sizing constant for filter(): it dimensions the block/block3 work buffers
 * and sets the horizontal chunk width of 8*(BLOCKSZ-1) pixels per pass. */
00061 #define BLOCKSZ 12
00062
/*
 * Base 8x8 threshold table in row-major order.
 * vf_open() scales every entry by bias/71 and repacks the result into
 * vf_priv_s.threshold_mtx_noq.
 */
static const short custom_threshold[64] = {
     71, 296, 295, 237,  71,  40,  38,  19,
    245, 193, 185, 121, 102,  73,  53,  27,
    158, 129, 141, 107,  97,  73,  50,  26,
    102, 116, 109,  98,  82,  66,  45,  23,
     71,  94,  95,  81,  70,  56,  38,  20,
     56,  77,  74,  66,  56,  44,  30,  15,
     38,  53,  50,  45,  38,  30,  21,  11,
     20,  27,  26,  23,  20,  15,  11,   5
};
00076
/*
 * 8x8 ordered-dither matrix holding each value 0..63 exactly once.
 * The store_slice routines index it by output row and add the (scaled)
 * entries before the final right shift.
 */
static const uint8_t __attribute__((aligned(32))) dither[8][8] = {
    {  0, 48, 12, 60,  3, 51, 15, 63 },
    { 32, 16, 44, 28, 35, 19, 47, 31 },
    {  8, 56,  4, 52, 11, 59,  7, 55 },
    { 40, 24, 36, 20, 43, 27, 39, 23 },
    {  2, 50, 14, 62,  1, 49, 13, 61 },
    { 34, 18, 46, 30, 33, 17, 45, 29 },
    { 10, 58,  6, 54,  9, 57,  5, 53 },
    { 42, 26, 38, 22, 41, 25, 37, 21 }
};
00087
/* Per-instance state of the fspp filter. */
00088 struct vf_priv_s {
/* Thresholds as packed by vf_open(): four 16-bit values per 64-bit word,
 * not yet multiplied by a quantiser. */
00089 uint64_t threshold_mtx_noq[8*2];
/* threshold_mtx_noq multiplied by the current quantiser (see mul_thrmat_*).
 * mul_thrmat_mmx() reaches this array at a fixed 8*8*2-byte offset from
 * threshold_mtx_noq, so the two arrays must stay adjacent and in this order. */
00090 uint64_t threshold_mtx[8*2];
00091
/* Quality setting; filter() derives its row step as 6-log2_count. */
00092 int log2_count;
/* Row stride of the work buffers, computed in config(). */
00093 int temp_stride;
/* Forced quantiser parsed from the filter arguments; 0 means use the per-block table. */
00094 int qp;
/* Copy of mpi->qscale_type, passed to norm_qscale() in filter(). */
00095 int mpeg2;
/* Quantiser the thresholds in threshold_mtx were last scaled with. */
00096 int prev_q;
/* Padded copy of the plane being filtered (allocated in config()). */
00097 uint8_t *src;
/* 16-bit accumulation buffer (allocated in config()). */
00098 int16_t *temp;
/* Parsed from the filter arguments; when non-zero put_image() uses the
 * current frame's qscale table instead of the saved one. */
00099 int bframes;
/* Copy of the qscale table of the last frame with pict_type != 3
 * (allocated with malloc() in put_image()). */
00100 char *non_b_qp;
00101 };
00102
00103
00104 #if !HAVE_MMX
00105
00106
00107 static void store_slice_c(uint8_t *dst, int16_t *src, int dst_stride, int src_stride, int width, int height, int log2_scale)
00108 {int y, x;
00109 #define STORE(pos) \
00110 temp= (src[x + pos] + (d[pos]>>log2_scale))>>(6-log2_scale); \
00111 src[x + pos]=src[x + pos - 8*src_stride]=0; \
00112 if(temp & 0x100) temp= ~(temp>>31); \
00113 dst[x + pos]= temp;
00114
00115 for(y=0; y<height; y++){
00116 const uint8_t *d= dither[y];
00117 for(x=0; x<width; x+=8){
00118 int temp;
00119 STORE(0);
00120 STORE(1);
00121 STORE(2);
00122 STORE(3);
00123 STORE(4);
00124 STORE(5);
00125 STORE(6);
00126 STORE(7);
00127 }
00128 src+=src_stride;
00129 dst+=dst_stride;
00130 }
00131 }
00132
00133
00134 static void store_slice2_c(uint8_t *dst, int16_t *src, int dst_stride, int src_stride, int width, int height, int log2_scale)
00135 {int y, x;
00136 #define STORE2(pos) \
00137 temp= (src[x + pos] + src[x + pos + 16*src_stride] + (d[pos]>>log2_scale))>>(6-log2_scale); \
00138 src[x + pos + 16*src_stride]=0; \
00139 if(temp & 0x100) temp= ~(temp>>31); \
00140 dst[x + pos]= temp;
00141
00142 for(y=0; y<height; y++){
00143 const uint8_t *d= dither[y];
00144 for(x=0; x<width; x+=8){
00145 int temp;
00146 STORE2(0);
00147 STORE2(1);
00148 STORE2(2);
00149 STORE2(3);
00150 STORE2(4);
00151 STORE2(5);
00152 STORE2(6);
00153 STORE2(7);
00154 }
00155 src+=src_stride;
00156 dst+=dst_stride;
00157 }
00158 }
00159
00160 static void mul_thrmat_c(struct vf_priv_s *p,int q)
00161 {
00162 int a;
00163 for(a=0;a<64;a++)
00164 ((short*)p->threshold_mtx)[a]=q * ((short*)p->threshold_mtx_noq)[a];
00165 }
00166
/* Forward declarations of the C transform kernels used by filter().
 * column_fidct_c is defined further down; the other two are presumably
 * defined later in the file as well. */
00167 static void column_fidct_c(int16_t* thr_adr, DCTELEM *data, DCTELEM *output, int cnt);
00168 static void row_idct_c(DCTELEM* workspace,
00169 int16_t* output_adr, int output_stride, int cnt);
00170 static void row_fdct_c(DCTELEM *data, const uint8_t *pixels, int line_size, int cnt);
00171
00172
/* Map the backend-neutral "_s" names used by filter() and vf_open()
 * onto the plain C implementations (this is the !HAVE_MMX branch). */
00173 #define store_slice_s store_slice_c
00174 #define store_slice2_s store_slice2_c
00175 #define mul_thrmat_s mul_thrmat_c
00176 #define column_fidct_s column_fidct_c
00177 #define row_idct_s row_idct_c
00178 #define row_fdct_s row_fdct_c
00179
00180 #else
00181
00182
/*
 * MMX counterpart of store_slice_c: adds scaled dither to the 16-bit
 * accumulator, shifts, saturates to bytes and writes them to dst, clearing
 * the accumulator cell and the cell 8 rows above it.
 *
 * Differences visible in the code: width is rounded up to a multiple of 8,
 * and packuswb performs a full unsigned saturation.
 */
00183 static void store_slice_mmx(uint8_t *dst, int16_t *src, long dst_stride, long src_stride, long width, long height, long log2_scale)
00184 {
/* od/end delimit the dither rows to walk: one 8-byte row per output line. */
00185 const uint8_t *od=&dither[0][0];
00186 const uint8_t *end=&dither[height][0];
00187 width = (width+7)&~7;
/* The inner loop advances dst by width, so only the remainder is added per row. */
00188 dst_stride-=width;
00189
00190 __asm__ volatile(
/* Setup: REG_d=log2_scale, REG_S=src, REG_D=dst, REG_a=src_stride. */
00191 "mov %5, %%"REG_d" \n\t"
00192 "mov %6, %%"REG_S" \n\t"
00193 "mov %7, %%"REG_D" \n\t"
00194 "mov %1, %%"REG_a" \n\t"
/* mm5 = log2_scale (dither shift); REG_d becomes ~log2_scale + 7 = 6 - log2_scale. */
00195 "movd %%"REG_d", %%mm5 \n\t"
00196 "xor $-1, %%"REG_d" \n\t"
00197 "mov %%"REG_a", %%"REG_c" \n\t"
00198 "add $7, %%"REG_d" \n\t"
00199 "neg %%"REG_a" \n\t"
/* REG_c = 2*(src_stride - width): bytes to skip at the end of each 16-bit row. */
00200 "sub %0, %%"REG_c" \n\t"
00201 "add %%"REG_c", %%"REG_c" \n\t"
/* mm2 = 6 - log2_scale (final shift). */
00202 "movd %%"REG_d", %%mm2 \n\t"
/* NOTE(review): this stores into operand %1 (src_stride), which is declared
 * as an input-only "m" operand, and the clobber list has no "memory". */
00203 "mov %%"REG_c", %1 \n\t"
00204 "mov %2, %%"REG_d" \n\t"
/* REG_a = -16*src_stride bytes, i.e. 8 rows of int16_t above the current row. */
00205 "shl $4, %%"REG_a" \n\t"
00206
/* Outer loop (label 2): one output row per dither row. */
00207 "2: \n\t"
/* Load 8 dither bytes, widen to 2x4 words, scale down by log2_scale. */
00208 "movq (%%"REG_d"), %%mm3 \n\t"
00209 "movq %%mm3, %%mm4 \n\t"
00210 "pxor %%mm7, %%mm7 \n\t"
00211 "punpcklbw %%mm7, %%mm3 \n\t"
00212 "punpckhbw %%mm7, %%mm4 \n\t"
00213 "mov %0, %%"REG_c" \n\t"
00214 "psraw %%mm5, %%mm3 \n\t"
00215 "psraw %%mm5, %%mm4 \n\t"
/* Inner loop (label 1): 8 pixels per iteration. */
00216 "1: \n\t"
/* Clear the cells 8 rows above, then fetch the 8 current accumulators. */
00217 "movq %%mm7, (%%"REG_S",%%"REG_a",) \n\t"
00218 "movq (%%"REG_S"), %%mm0 \n\t"
00219 "movq 8(%%"REG_S"), %%mm1 \n\t"
00220
00221 "movq %%mm7, 8(%%"REG_S",%%"REG_a",) \n\t"
00222 "paddw %%mm3, %%mm0 \n\t"
00223 "paddw %%mm4, %%mm1 \n\t"
00224
/* Clear the current accumulators and apply the final shift. */
00225 "movq %%mm7, (%%"REG_S") \n\t"
00226 "psraw %%mm2, %%mm0 \n\t"
00227 "psraw %%mm2, %%mm1 \n\t"
00228
00229 "movq %%mm7, 8(%%"REG_S") \n\t"
/* Saturate the 8 words to unsigned bytes. */
00230 "packuswb %%mm1, %%mm0 \n\t"
00231 "add $16, %%"REG_S" \n\t"
00232
00233 "movq %%mm0, (%%"REG_D") \n\t"
00234 "add $8, %%"REG_D" \n\t"
00235 "sub $8, %%"REG_c" \n\t"
00236 "jg 1b \n\t"
/* Advance src/dst to the next row and move to the next dither row. */
00237 "add %1, %%"REG_S" \n\t"
00238 "add $8, %%"REG_d" \n\t"
00239 "add %3, %%"REG_D" \n\t"
00240 "cmp %4, %%"REG_d" \n\t"
00241 "jl 2b \n\t"
00242
00243 :
00244 : "m" (width), "m" (src_stride), "erm" (od), "m" (dst_stride), "erm" (end),
00245 "m" (log2_scale), "m" (src), "m" (dst)
00246 : "%"REG_a, "%"REG_c, "%"REG_d, "%"REG_S, "%"REG_D
00247 );
00248 }
00249
00250
/*
 * MMX counterpart of store_slice2_c: each output byte is built from the
 * accumulator cell plus the cell 16 rows below it plus scaled dither,
 * shifted and saturated. Only the cells 16 rows below are cleared.
 * width is rounded up to a multiple of 8.
 */
00251 static void store_slice2_mmx(uint8_t *dst, int16_t *src, long dst_stride, long src_stride, long width, long height, long log2_scale)
00252 {
/* od/end delimit the dither rows to walk: one 8-byte row per output line. */
00253 const uint8_t *od=&dither[0][0];
00254 const uint8_t *end=&dither[height][0];
00255 width = (width+7)&~7;
/* The inner loop advances dst by width, so only the remainder is added per row. */
00256 dst_stride-=width;
00257
00258 __asm__ volatile(
/* Setup: REG_d=log2_scale, REG_S=src, REG_D=dst, REG_a=src_stride. */
00259 "mov %5, %%"REG_d" \n\t"
00260 "mov %6, %%"REG_S" \n\t"
00261 "mov %7, %%"REG_D" \n\t"
00262 "mov %1, %%"REG_a" \n\t"
/* mm5 = log2_scale (dither shift); REG_d becomes ~log2_scale + 7 = 6 - log2_scale. */
00263 "movd %%"REG_d", %%mm5 \n\t"
00264 "xor $-1, %%"REG_d" \n\t"
00265 "mov %%"REG_a", %%"REG_c" \n\t"
00266 "add $7, %%"REG_d" \n\t"
/* REG_c = 2*(src_stride - width): bytes to skip at the end of each 16-bit row. */
00267 "sub %0, %%"REG_c" \n\t"
00268 "add %%"REG_c", %%"REG_c" \n\t"
/* mm2 = 6 - log2_scale (final shift). */
00269 "movd %%"REG_d", %%mm2 \n\t"
/* NOTE(review): this stores into operand %1 (src_stride), which is declared
 * as an input-only "m" operand, and the clobber list has no "memory". */
00270 "mov %%"REG_c", %1 \n\t"
00271 "mov %2, %%"REG_d" \n\t"
/* REG_a = 32*src_stride bytes, i.e. 16 rows of int16_t below the current row. */
00272 "shl $5, %%"REG_a" \n\t"
00273
/* Outer loop (label 2): one output row per dither row. */
00274 "2: \n\t"
/* Load 8 dither bytes, widen to 2x4 words, scale down by log2_scale. */
00275 "movq (%%"REG_d"), %%mm3 \n\t"
00276 "movq %%mm3, %%mm4 \n\t"
00277 "pxor %%mm7, %%mm7 \n\t"
00278 "punpcklbw %%mm7, %%mm3 \n\t"
00279 "punpckhbw %%mm7, %%mm4 \n\t"
00280 "mov %0, %%"REG_c" \n\t"
00281 "psraw %%mm5, %%mm3 \n\t"
00282 "psraw %%mm5, %%mm4 \n\t"
/* Inner loop (label 1): 8 pixels per iteration. */
00283 "1: \n\t"
00284 "movq (%%"REG_S"), %%mm0 \n\t"
00285 "movq 8(%%"REG_S"), %%mm1 \n\t"
00286 "paddw %%mm3, %%mm0 \n\t"
00287
/* Add the accumulators 16 rows below. */
00288 "paddw (%%"REG_S",%%"REG_a",), %%mm0 \n\t"
00289 "paddw %%mm4, %%mm1 \n\t"
00290 "movq 8(%%"REG_S",%%"REG_a",), %%mm6 \n\t"
00291
/* Clear the cells 16 rows below; the current row is left as is. */
00292 "movq %%mm7, (%%"REG_S",%%"REG_a",) \n\t"
00293 "psraw %%mm2, %%mm0 \n\t"
00294 "paddw %%mm6, %%mm1 \n\t"
00295
00296 "movq %%mm7, 8(%%"REG_S",%%"REG_a",) \n\t"
00297 "psraw %%mm2, %%mm1 \n\t"
/* Saturate the 8 words to unsigned bytes. */
00298 "packuswb %%mm1, %%mm0 \n\t"
00299
00300 "movq %%mm0, (%%"REG_D") \n\t"
00301 "add $16, %%"REG_S" \n\t"
00302 "add $8, %%"REG_D" \n\t"
00303 "sub $8, %%"REG_c" \n\t"
00304 "jg 1b \n\t"
/* Advance src/dst to the next row and move to the next dither row. */
00305 "add %1, %%"REG_S" \n\t"
00306 "add $8, %%"REG_d" \n\t"
00307 "add %3, %%"REG_D" \n\t"
00308 "cmp %4, %%"REG_d" \n\t"
00309 "jl 2b \n\t"
00310
00311 :
00312 : "m" (width), "m" (src_stride), "erm" (od), "m" (dst_stride), "erm" (end),
00313 "m" (log2_scale), "m" (src), "m" (dst)
00314 : "%"REG_a, "%"REG_c, "%"REG_d, "%"REG_D, "%"REG_S
00315 );
00316 }
00317
/*
 * MMX counterpart of mul_thrmat_c: multiplies all 16 quadwords (64 words)
 * of p->threshold_mtx_noq by q, keeping the low 16 bits of each product,
 * and stores them 8*8*2 bytes further on. That destination is
 * p->threshold_mtx only because the two arrays are adjacent in vf_priv_s.
 */
00318 static void mul_thrmat_mmx(struct vf_priv_s *p, int q)
00319 {
/* Both REG_S and REG_D start at the unscaled matrix; REG_D is offset below. */
00320 uint64_t *adr=&p->threshold_mtx_noq[0];
00321 __asm__ volatile(
/* mm7 = q; REG_D += 128 bytes (size of threshold_mtx_noq). */
00322 "movd %0, %%mm7 \n\t"
00323 "add $8*8*2, %%"REG_D" \n\t"
00324 "movq 0*8(%%"REG_S"), %%mm0 \n\t"
/* Broadcast the low word of q to all four words of mm7. */
00325 "punpcklwd %%mm7, %%mm7 \n\t"
00326 "movq 1*8(%%"REG_S"), %%mm1 \n\t"
00327 "punpckldq %%mm7, %%mm7 \n\t"
/* From here on loads, multiplies and stores are interleaved: quadwords 0..6. */
00328 "pmullw %%mm7, %%mm0 \n\t"
00329
00330 "movq 2*8(%%"REG_S"), %%mm2 \n\t"
00331 "pmullw %%mm7, %%mm1 \n\t"
00332
00333 "movq 3*8(%%"REG_S"), %%mm3 \n\t"
00334 "pmullw %%mm7, %%mm2 \n\t"
00335
00336 "movq %%mm0, 0*8(%%"REG_D") \n\t"
00337 "movq 4*8(%%"REG_S"), %%mm4 \n\t"
00338 "pmullw %%mm7, %%mm3 \n\t"
00339
00340 "movq %%mm1, 1*8(%%"REG_D") \n\t"
00341 "movq 5*8(%%"REG_S"), %%mm5 \n\t"
00342 "pmullw %%mm7, %%mm4 \n\t"
00343
00344 "movq %%mm2, 2*8(%%"REG_D") \n\t"
00345 "movq 6*8(%%"REG_S"), %%mm6 \n\t"
00346 "pmullw %%mm7, %%mm5 \n\t"
00347
/* Quadwords 7..13. */
00348 "movq %%mm3, 3*8(%%"REG_D") \n\t"
00349 "movq 7*8+0*8(%%"REG_S"), %%mm0 \n\t"
00350 "pmullw %%mm7, %%mm6 \n\t"
00351
00352 "movq %%mm4, 4*8(%%"REG_D") \n\t"
00353 "movq 7*8+1*8(%%"REG_S"), %%mm1 \n\t"
00354 "pmullw %%mm7, %%mm0 \n\t"
00355
00356 "movq %%mm5, 5*8(%%"REG_D") \n\t"
00357 "movq 7*8+2*8(%%"REG_S"), %%mm2 \n\t"
00358 "pmullw %%mm7, %%mm1 \n\t"
00359
00360 "movq %%mm6, 6*8(%%"REG_D") \n\t"
00361 "movq 7*8+3*8(%%"REG_S"), %%mm3 \n\t"
00362 "pmullw %%mm7, %%mm2 \n\t"
00363
00364 "movq %%mm0, 7*8+0*8(%%"REG_D") \n\t"
00365 "movq 7*8+4*8(%%"REG_S"), %%mm4 \n\t"
00366 "pmullw %%mm7, %%mm3 \n\t"
00367
00368 "movq %%mm1, 7*8+1*8(%%"REG_D") \n\t"
00369 "movq 7*8+5*8(%%"REG_S"), %%mm5 \n\t"
00370 "pmullw %%mm7, %%mm4 \n\t"
00371
00372 "movq %%mm2, 7*8+2*8(%%"REG_D") \n\t"
00373 "movq 7*8+6*8(%%"REG_S"), %%mm6 \n\t"
00374 "pmullw %%mm7, %%mm5 \n\t"
00375
/* Quadwords 14 and 15. */
00376 "movq %%mm3, 7*8+3*8(%%"REG_D") \n\t"
00377 "movq 14*8+0*8(%%"REG_S"), %%mm0 \n\t"
00378 "pmullw %%mm7, %%mm6 \n\t"
00379
00380 "movq %%mm4, 7*8+4*8(%%"REG_D") \n\t"
00381 "movq 14*8+1*8(%%"REG_S"), %%mm1 \n\t"
00382 "pmullw %%mm7, %%mm0 \n\t"
00383
00384 "movq %%mm5, 7*8+5*8(%%"REG_D") \n\t"
00385 "pmullw %%mm7, %%mm1 \n\t"
00386
00387 "movq %%mm6, 7*8+6*8(%%"REG_D") \n\t"
00388 "movq %%mm0, 14*8+0*8(%%"REG_D") \n\t"
00389 "movq %%mm1, 14*8+1*8(%%"REG_D") \n\t"
00390
/* NOTE(review): the asm writes memory through REG_D but declares no
 * "memory" clobber. */
00391 : "+g" (q), "+S" (adr), "+D" (adr)
00392 :
00393 );
00394 }
00395
/* Forward declarations of the MMX transform kernels used by filter().
 * column_fidct_mmx is defined further down; the other two are presumably
 * defined later in the file as well. */
00396 static void column_fidct_mmx(int16_t* thr_adr, DCTELEM *data, DCTELEM *output, int cnt);
00397 static void row_idct_mmx(DCTELEM* workspace,
00398 int16_t* output_adr, int output_stride, int cnt);
00399 static void row_fdct_mmx(DCTELEM *data, const uint8_t *pixels, int line_size, int cnt);
00400
/* Map the backend-neutral "_s" names used by filter() and vf_open()
 * onto the MMX implementations (this is the HAVE_MMX branch). */
00401 #define store_slice_s store_slice_mmx
00402 #define store_slice2_s store_slice2_mmx
00403 #define mul_thrmat_s mul_thrmat_mmx
00404 #define column_fidct_s column_fidct_mmx
00405 #define row_idct_s row_idct_mmx
00406 #define row_fdct_s row_fdct_mmx
00407 #endif // HAVE_MMX
00408
/*
 * Filter one image plane.
 *
 * p          filter state (work buffers, thresholds, quality setting)
 * dst/src    output / input plane and their strides
 * width,
 * height     plane dimensions in pixels
 * qp_store   per-block quantiser table, indexed with qp_stride per row;
 *            only read when p->qp is 0
 * is_luma    selects the work-buffer stride and the quantiser index shift
 *
 * The plane is copied into p->src with a mirrored 8-pixel border, then
 * processed in horizontal strips: row_fdct -> column_fidct (thresholding)
 * -> row_idct accumulate into p->temp, and finished 8-row slices are
 * written to dst by store_slice_s / store_slice2_s.
 */
00409 static void filter(struct vf_priv_s *p, uint8_t *dst, uint8_t *src,
00410 int dst_stride, int src_stride,
00411 int width, int height,
00412 uint8_t *qp_store, int qp_stride, int is_luma)
00413 {
00414 int x, x0, y, es, qy, t;
/* Luma uses the stride computed in config(); chroma uses width+16. */
00415 const int stride= is_luma ? p->temp_stride : (width+16);
/* Vertical step between processed rows. */
00416 const int step=6-p->log2_count;
/* Shift that maps a pixel coordinate to a qp_store index (4 for luma, 3 for chroma). */
00417 const int qps= 3 + is_luma;
/* One aligned buffer split in two halves: block (row-transformed input)
 * and block3 (column-transform output). */
00418 int32_t __attribute__((aligned(32))) block_align[4*8*BLOCKSZ+ 4*8*BLOCKSZ];
00419 DCTELEM *block= (DCTELEM *)block_align;
00420 DCTELEM *block3=(DCTELEM *)(block_align+4*8*BLOCKSZ);
00421
/* NOTE(review): this clears 4*8*BLOCKSZ bytes, i.e. only the first quarter
 * of the block3 half (which is 4*8*BLOCKSZ int32_t) - confirm intentional. */
00422 memset(block3, 0, 4*8*BLOCKSZ);
00423
00424
00425 if (!src || !dst) return;
/* Copy the plane into p->src at offset (8,8) and mirror 8 pixels on the
 * left and right of every row. */
00426 for(y=0; y<height; y++){
00427 int index= 8 + 8*stride + y*stride;
00428 fast_memcpy(p->src + index, src + y*src_stride, width);
00429 for(x=0; x<8; x++){
00430 p->src[index - x - 1]= p->src[index + x ];
00431 p->src[index + width + x ]= p->src[index + width - x - 1];
00432 }
00433 }
/* Mirror 8 full rows above the top and below the bottom. */
00434 for(y=0; y<8; y++){
00435 fast_memcpy(p->src + ( 7-y)*stride, p->src + ( y+8)*stride, stride);
00436 fast_memcpy(p->src + (height+8+y)*stride, p->src + (height-y+7)*stride, stride);
00437 }
00438
00439
/* Clear accumulator rows 8..23 before the first strip. */
00440 for(y=8; y<24; y++)
00441 memset(p->temp+ 8 +y*stride, 0,width*sizeof(int16_t));
00442
00443 for(y=step; y<height+8; y+=step){
/* Row of the quantiser table for this strip, clamped to the picture. */
00444 qy=y-4;
00445 if (qy>height-1) qy=height-1;
00446 if (qy<0) qy=0;
00447 qy=(qy>>qps)*qp_stride;
/* Prime the first block of the strip; 2-(y&1) alternates the horizontal
 * start offset between 2 and 1 on even/odd rows. */
00448 row_fdct_s(block, p->src + y*stride +2-(y&1), stride, 2);
/* Full-width chunks of 8*(BLOCKSZ-1) pixels. */
00449 for(x0=0; x0<width+8-8*(BLOCKSZ-1); x0+=8*(BLOCKSZ-1)){
00450 row_fdct_s(block+8*8, p->src + y*stride+8+x0 +2-(y&1), stride, 2*(BLOCKSZ-1));
/* Fixed quantiser: thresholds are already scaled, do the chunk in one call. */
00451 if(p->qp)
00452 column_fidct_s((int16_t*)(&p->threshold_mtx[0]), block+0*8, block3+0*8, 8*(BLOCKSZ-1));
00453 else
/* Per-block quantiser: look it up every 8 columns and rescale the
 * thresholds only when it changes. */
00454 for (x=0; x<8*(BLOCKSZ-1); x+=8) {
00455 t=x+x0-2;
00456 if (t<0) t=0;
00457 t=qp_store[qy+(t>>qps)];
00458 t=norm_qscale(t, p->mpeg2);
00459 if (t!=p->prev_q) p->prev_q=t, mul_thrmat_s(p, t);
00460 column_fidct_s((int16_t*)(&p->threshold_mtx[0]), block+x*8, block3+x*8, 8);
00461 }
00462 row_idct_s(block3+0*8, p->temp + (y&15)*stride+x0+2-(y&1), stride, 2*(BLOCKSZ-1));
/* Carry the tail of both buffers over to the start for the next chunk. */
00463 memmove(block, block+(BLOCKSZ-1)*64, 8*8*sizeof(DCTELEM));
00464 memmove(block3, block3+(BLOCKSZ-1)*64, 6*8*sizeof(DCTELEM));
00465 }
00466
/* Remaining columns at the right edge of the strip. */
00467 es=width+8-x0;
00468 if (es>8)
00469 row_fdct_s(block+8*8, p->src + y*stride+8+x0 +2-(y&1), stride, (es-4)>>2);
00470 column_fidct_s((int16_t*)(&p->threshold_mtx[0]), block, block3, es&(~1));
00471 row_idct_s(block3+0*8, p->temp + (y&15)*stride+x0+2-(y&1), stride, es>>2);
/* Whenever y1 reaches a non-zero multiple of 8, write the finished 8 rows
 * to dst; odd/even multiples of 8 select store_slice_s / store_slice2_s. */
00472 {const int y1=y-8+step;
00473 if (!(y1&7) && y1) {
00474 if (y1&8) store_slice_s(dst + (y1-8)*dst_stride, p->temp+ 8 +8*stride,
00475 dst_stride, stride, width, 8, 5-p->log2_count);
00476 else store_slice2_s(dst + (y1-8)*dst_stride, p->temp+ 8 +0*stride,
00477 dst_stride, stride, width, 8, 5-p->log2_count);
00478 } }
00479 }
00480
/* Flush the last, partial slice (y&7 rows). */
00481 if (y&7) {
00482 if (y&8) store_slice_s(dst + ((y-8)&~7)*dst_stride, p->temp+ 8 +8*stride,
00483 dst_stride, stride, width, y&7, 5-p->log2_count);
00484 else store_slice2_s(dst + ((y-8)&~7)*dst_stride, p->temp+ 8 +0*stride,
00485 dst_stride, stride, width, y&7, 5-p->log2_count);
00486 }
00487 }
00488
00489 static int config(struct vf_instance *vf,
00490 int width, int height, int d_width, int d_height,
00491 unsigned int flags, unsigned int outfmt)
00492 {
00493 int h= (height+16+15)&(~15);
00494
00495 vf->priv->temp_stride= (width+16+15)&(~15);
00496 vf->priv->temp= (int16_t*)av_mallocz(vf->priv->temp_stride*3*8*sizeof(int16_t));
00497
00498 vf->priv->src = (uint8_t*)av_malloc(vf->priv->temp_stride*h*sizeof(uint8_t));
00499
00500 return vf_next_config(vf,width,height,d_width,d_height,flags,outfmt);
00501 }
00502
00503 static void get_image(struct vf_instance *vf, mp_image_t *mpi)
00504 {
00505 if(mpi->flags&MP_IMGFLAG_PRESERVE) return;
00506
00507 vf->dmpi=vf_get_image(vf->next,mpi->imgfmt,
00508 mpi->type, mpi->flags, mpi->width, mpi->height);
00509 mpi->planes[0]=vf->dmpi->planes[0];
00510 mpi->stride[0]=vf->dmpi->stride[0];
00511 mpi->width=vf->dmpi->width;
00512 if(mpi->flags&MP_IMGFLAG_PLANAR){
00513 mpi->planes[1]=vf->dmpi->planes[1];
00514 mpi->planes[2]=vf->dmpi->planes[2];
00515 mpi->stride[1]=vf->dmpi->stride[1];
00516 mpi->stride[2]=vf->dmpi->stride[2];
00517 }
00518 mpi->flags|=MP_IMGFLAG_DIRECT;
00519 }
00520
00521 static int put_image(struct vf_instance *vf, mp_image_t *mpi, double pts)
00522 {
00523 mp_image_t *dmpi;
00524 if(!(mpi->flags&MP_IMGFLAG_DIRECT)){
00525
00526 dmpi=vf_get_image(vf->next,mpi->imgfmt,
00527 MP_IMGTYPE_TEMP,
00528 MP_IMGFLAG_ACCEPT_STRIDE|MP_IMGFLAG_PREFER_ALIGNED_STRIDE,
00529 mpi->width,mpi->height);
00530 vf_clone_mpi_attributes(dmpi, mpi);
00531 }else{
00532 dmpi=vf->dmpi;
00533 }
00534
00535 vf->priv->mpeg2= mpi->qscale_type;
00536 if(mpi->pict_type != 3 && mpi->qscale && !vf->priv->qp){
00537 int w = mpi->qstride;
00538 int h = (mpi->h + 15) >> 4;
00539 if (!w) {
00540 w = (mpi->w + 15) >> 4;
00541 h = 1;
00542 }
00543 if(!vf->priv->non_b_qp)
00544 vf->priv->non_b_qp= malloc(w*h);
00545 fast_memcpy(vf->priv->non_b_qp, mpi->qscale, w*h);
00546 }
00547 if(vf->priv->log2_count || !(mpi->flags&MP_IMGFLAG_DIRECT)){
00548 char *qp_tab= vf->priv->non_b_qp;
00549 if(vf->priv->bframes || !qp_tab)
00550 qp_tab= mpi->qscale;
00551
00552 if(qp_tab || vf->priv->qp){
00553 filter(vf->priv, dmpi->planes[0], mpi->planes[0], dmpi->stride[0], mpi->stride[0],
00554 mpi->w, mpi->h, qp_tab, mpi->qstride, 1);
00555 filter(vf->priv, dmpi->planes[1], mpi->planes[1], dmpi->stride[1], mpi->stride[1],
00556 mpi->w>>mpi->chroma_x_shift, mpi->h>>mpi->chroma_y_shift, qp_tab, mpi->qstride, 0);
00557 filter(vf->priv, dmpi->planes[2], mpi->planes[2], dmpi->stride[2], mpi->stride[2],
00558 mpi->w>>mpi->chroma_x_shift, mpi->h>>mpi->chroma_y_shift, qp_tab, mpi->qstride, 0);
00559 }else{
00560 memcpy_pic(dmpi->planes[0], mpi->planes[0], mpi->w, mpi->h, dmpi->stride[0], mpi->stride[0]);
00561 memcpy_pic(dmpi->planes[1], mpi->planes[1], mpi->w>>mpi->chroma_x_shift, mpi->h>>mpi->chroma_y_shift, dmpi->stride[1], mpi->stride[1]);
00562 memcpy_pic(dmpi->planes[2], mpi->planes[2], mpi->w>>mpi->chroma_x_shift, mpi->h>>mpi->chroma_y_shift, dmpi->stride[2], mpi->stride[2]);
00563 }
00564 }
00565
00566 #if HAVE_MMX
00567 if(gCpuCaps.hasMMX) __asm__ volatile ("emms\n\t");
00568 #endif
00569 #if HAVE_MMX2
00570 if(gCpuCaps.hasMMX2) __asm__ volatile ("sfence\n\t");
00571 #endif
00572 return vf_next_put_image(vf,dmpi, pts);
00573 }
00574
00575 static void uninit(struct vf_instance *vf)
00576 {
00577 if(!vf->priv) return;
00578
00579 av_free(vf->priv->temp);
00580 vf->priv->temp= NULL;
00581 av_free(vf->priv->src);
00582 vf->priv->src= NULL;
00583
00584
00585 free(vf->priv->non_b_qp);
00586 vf->priv->non_b_qp= NULL;
00587
00588 av_free(vf->priv);
00589 vf->priv=NULL;
00590 }
00591
00592
00593
00594 static int query_format(struct vf_instance *vf, unsigned int fmt)
00595 {
00596 switch(fmt){
00597 case IMGFMT_YVU9:
00598 case IMGFMT_IF09:
00599 case IMGFMT_YV12:
00600 case IMGFMT_I420:
00601 case IMGFMT_IYUV:
00602 case IMGFMT_CLPL:
00603 case IMGFMT_Y800:
00604 case IMGFMT_Y8:
00605 case IMGFMT_444P:
00606 case IMGFMT_422P:
00607 case IMGFMT_411P:
00608 return vf_next_query_format(vf,fmt);
00609 }
00610 return 0;
00611 }
00612
00613 static int control(struct vf_instance *vf, int request, void* data)
00614 {
00615 switch(request){
00616 case VFCTRL_QUERY_MAX_PP_LEVEL:
00617 return 5;
00618 case VFCTRL_SET_PP_LEVEL:
00619 vf->priv->log2_count= *((unsigned int*)data);
00620 if (vf->priv->log2_count < 4) vf->priv->log2_count=4;
00621 return CONTROL_TRUE;
00622 }
00623 return vf_next_control(vf,request,data);
00624 }
00625
00626 static int vf_open(vf_instance_t *vf, char *args)
00627 {
00628 int i=0, bias;
00629 int custom_threshold_m[64];
00630 int log2c=-1;
00631
00632 vf->config=config;
00633 vf->put_image=put_image;
00634 vf->get_image=get_image;
00635 vf->query_format=query_format;
00636 vf->uninit=uninit;
00637 vf->control= control;
00638 vf->priv=av_mallocz(sizeof(struct vf_priv_s));
00639
00640 init_avcodec();
00641
00642
00643
00644
00645 vf->priv->log2_count= 4;
00646 vf->priv->bframes = 0;
00647
00648 if (args) sscanf(args, "%d:%d:%d:%d", &log2c, &vf->priv->qp, &i, &vf->priv->bframes);
00649
00650 if( log2c >=4 && log2c <=5 )
00651 vf->priv->log2_count = log2c;
00652 else if( log2c >= 6 )
00653 vf->priv->log2_count = 5;
00654
00655 if(vf->priv->qp < 0)
00656 vf->priv->qp = 0;
00657
00658 if (i < -15) i = -15;
00659 if (i > 32) i = 32;
00660
00661 bias= (1<<4)+i;
00662 vf->priv->prev_q=0;
00663
00664 for(i=0;i<64;i++)
00665 custom_threshold_m[i]=(int)(custom_threshold[i]*(bias/71.)+ 0.5);
00666 for(i=0;i<8;i++){
00667 vf->priv->threshold_mtx_noq[2*i]=(uint64_t)custom_threshold_m[i*8+2]
00668 |(((uint64_t)custom_threshold_m[i*8+6])<<16)
00669 |(((uint64_t)custom_threshold_m[i*8+0])<<32)
00670 |(((uint64_t)custom_threshold_m[i*8+4])<<48);
00671 vf->priv->threshold_mtx_noq[2*i+1]=(uint64_t)custom_threshold_m[i*8+5]
00672 |(((uint64_t)custom_threshold_m[i*8+3])<<16)
00673 |(((uint64_t)custom_threshold_m[i*8+1])<<32)
00674 |(((uint64_t)custom_threshold_m[i*8+7])<<48);
00675 }
00676
00677 if (vf->priv->qp) vf->priv->prev_q=vf->priv->qp, mul_thrmat_s(vf->priv, vf->priv->qp);
00678
00679 return 1;
00680 }
00681
/* Registration record for the filter. The field meanings are defined by
 * vf_info_t in vf.h; presumably description, short name, authors, comment,
 * open callback and option table, in that order - confirm against vf.h. */
00682 const vf_info_t vf_info_fspp = {
00683 "fast simple postprocess",
00684 "fspp",
00685 "Michael Niedermayer, Nikolaj Poroshin",
00686 "",
00687 vf_open,
00688 NULL
00689 };
00690
00691
00692
00693
00694
00695
00696
00697
00698
/* Transform size, as a number and as a string for use in asm templates. */
#define DCTSIZE 8
#define DCTSIZE_S "8"

/* Fixed-point constant: x scaled by 2^s, rounded, reduced to 16 bits. */
#define FIX(x,s) ((int) ((x) * (1<<(s)) + 0.5)&0xffff)
/* Replicate a 16-bit value x (0..0xffff) into all four words of a 64-bit
 * value. Every shift is done in uint64_t so values >= 0x8000 do not
 * overflow a signed int, and the expansion is fully parenthesised. */
#define C64(x) ((uint64_t)(x)<<48 | (uint64_t)(x)<<32 | (uint64_t)(x)<<16 | (uint64_t)(x))
#define FIX64(x,s) C64(FIX(x,s))

/* High 16 bits of a 16x16 fixed-point product. */
#define MULTIPLY16H(x,k) (((x)*(k))>>16)
/* r = x if x lies outside [-t, t], otherwise r = 0.
 * Expands to an if/else statement that already ends in ';'. */
#define THRESHOLD(r,x,t) if(((unsigned)((x)+(t)))>(t)*2) r=(x);else r=0;
/* Right shift by n with rounding to nearest. */
#define DESCALE(x,n) (((x) + (1 << ((n)-1))) >> (n))
00709
00710 #if HAVE_MMX
00711
/* Fixed-point multipliers for the MMX kernels: each 16-bit constant from
 * FIX() is replicated into all four words of a 64-bit value (FIX64) so it
 * can be used directly as a pmulhw operand. The first group uses 14
 * fractional bits. */
00712 DECLARE_ASM_CONST(8, uint64_t, MM_FIX_0_382683433)=FIX64(0.382683433, 14);
00713 DECLARE_ASM_CONST(8, uint64_t, MM_FIX_0_541196100)=FIX64(0.541196100, 14);
00714 DECLARE_ASM_CONST(8, uint64_t, MM_FIX_0_707106781)=FIX64(0.707106781, 14);
00715 DECLARE_ASM_CONST(8, uint64_t, MM_FIX_1_306562965)=FIX64(1.306562965, 14);
00716
00717 DECLARE_ASM_CONST(8, uint64_t, MM_FIX_1_414213562_A)=FIX64(1.414213562, 14);
00718
/* 13 fractional bits. Note MM_FIX_2_613125930 holds the negated value. */
00719 DECLARE_ASM_CONST(8, uint64_t, MM_FIX_1_847759065)=FIX64(1.847759065, 13);
00720 DECLARE_ASM_CONST(8, uint64_t, MM_FIX_2_613125930)=FIX64(-2.613125930, 13);
00721 DECLARE_ASM_CONST(8, uint64_t, MM_FIX_1_414213562)=FIX64(1.414213562, 13);
00722 DECLARE_ASM_CONST(8, uint64_t, MM_FIX_1_082392200)=FIX64(1.082392200, 13);
00723
/* Multipliers used by the shortcut path of column_fidct_mmx (14 fractional bits). */
00724 DECLARE_ASM_CONST(8, uint64_t, MM_FIX_0_847759065)=FIX64(0.847759065, 14);
00725 DECLARE_ASM_CONST(8, uint64_t, MM_FIX_0_566454497)=FIX64(0.566454497, 14);
00726 DECLARE_ASM_CONST(8, uint64_t, MM_FIX_0_198912367)=FIX64(0.198912367, 14);
00727
/* Small integer constants replicated into four words. */
00728 DECLARE_ASM_CONST(8, uint64_t, MM_DESCALE_RND)=C64(4);
00729 DECLARE_ASM_CONST(8, uint64_t, MM_2)=C64(2);
00730
00731 #else
00732
/* Scalar type used for intermediates in the C kernels (32-bit here). */
00733 typedef int32_t int_simd16_t;
/* Fixed-point multipliers for the C kernels, used with MULTIPLY16H.
 * These use 14 fractional bits. */
00734 static const int16_t FIX_0_382683433=FIX(0.382683433, 14);
00735 static const int16_t FIX_0_541196100=FIX(0.541196100, 14);
00736 static const int16_t FIX_0_707106781=FIX(0.707106781, 14);
00737 static const int16_t FIX_1_306562965=FIX(1.306562965, 14);
00738 static const int16_t FIX_1_414213562_A=FIX(1.414213562, 14);
/* These use 13 fractional bits. Note FIX_2_613125930 holds the negated value. */
00739 static const int16_t FIX_1_847759065=FIX(1.847759065, 13);
00740 static const int16_t FIX_2_613125930=FIX(-2.613125930, 13);
00741 static const int16_t FIX_1_414213562=FIX(1.414213562, 13);
00742 static const int16_t FIX_1_082392200=FIX(1.082392200, 13);
00743
00744 #endif
00745
00746 #if !HAVE_MMX
00747
/*
 * Column pass: forward transform, thresholding and inverse transform in one go.
 *
 * thr_adr  threshold matrix; entry [k*8 + column] is the limit for
 *          coefficient k of that column
 * data     input, 8 values per column at a stride of DCTSIZE
 * output   result buffer; rows 0..5 are accumulated (+=), rows 6..7 are
 *          overwritten (=)
 * cnt      amount of work; every outer iteration handles 8 columns, consumes
 *          2 from cnt and advances data/output by 16 elements
 *
 * For each column the 8 inputs are transformed, every coefficient whose
 * magnitude does not exceed its threshold is zeroed (THRESHOLD), and the
 * surviving coefficients are transformed back.
 */
00748 static void column_fidct_c(int16_t* thr_adr, DCTELEM *data, DCTELEM *output, int cnt)
00749 {
00750 int_simd16_t tmp0, tmp1, tmp2, tmp3, tmp4, tmp5, tmp6, tmp7;
00751 int_simd16_t tmp10, tmp11, tmp12, tmp13;
00752 int_simd16_t z1,z2,z3,z4,z5, z10, z11, z12, z13;
00753 int_simd16_t d0, d1, d2, d3, d4, d5, d6, d7;
00754
00755 DCTELEM* dataptr;
00756 DCTELEM* wsptr;
00757 int16_t *threshold;
00758 int ctr;
00759
00760 dataptr = data;
00761 wsptr = output;
00762
00763 for (; cnt > 0; cnt-=2) {
/* The threshold pointer restarts for every group of 8 columns. */
00764 threshold=(int16_t*)thr_adr;
00765 for (ctr = DCTSIZE; ctr > 0; ctr--) {
00766
/* Butterfly stage: sums (tmp0..3) and differences (tmp4..7) of mirrored inputs. */
00767 tmp0 = dataptr[DCTSIZE*0] + dataptr[DCTSIZE*7];
00768 tmp7 = dataptr[DCTSIZE*0] - dataptr[DCTSIZE*7];
00769
00770 tmp1 = dataptr[DCTSIZE*1] + dataptr[DCTSIZE*6];
00771 tmp6 = dataptr[DCTSIZE*1] - dataptr[DCTSIZE*6];
00772
00773 tmp2 = dataptr[DCTSIZE*2] + dataptr[DCTSIZE*5];
00774 tmp5 = dataptr[DCTSIZE*2] - dataptr[DCTSIZE*5];
00775
00776 tmp3 = dataptr[DCTSIZE*3] + dataptr[DCTSIZE*4];
00777 tmp4 = dataptr[DCTSIZE*3] - dataptr[DCTSIZE*4];
00778
00779
00780
/* Even part of the forward transform: coefficients d0, d2, d4, d6. */
00781 tmp10 = tmp0 + tmp3;
00782 tmp13 = tmp0 - tmp3;
00783 tmp11 = tmp1 + tmp2;
00784 tmp12 = tmp1 - tmp2;
00785
00786 d0 = tmp10 + tmp11;
00787 d4 = tmp10 - tmp11;
00788
00789 z1 = MULTIPLY16H((tmp12 + tmp13) <<2, FIX_0_707106781);
00790 d2 = tmp13 + z1;
00791 d6 = tmp13 - z1;
00792
00793
00794
/* Zero even coefficients that are within their threshold, then run the
 * even part of the inverse transform. */
00795 THRESHOLD(tmp0, d0, threshold[0*8]);
00796 THRESHOLD(tmp1, d2, threshold[2*8]);
00797 THRESHOLD(tmp2, d4, threshold[4*8]);
00798 THRESHOLD(tmp3, d6, threshold[6*8]);
/* +2 acts as a rounding bias for the >>2 that follows. */
00799 tmp0+=2;
00800 tmp10 = (tmp0 + tmp2)>>2;
00801 tmp11 = (tmp0 - tmp2)>>2;
00802
00803 tmp13 = (tmp1 + tmp3)>>2;
00804 tmp12 = MULTIPLY16H((tmp1 - tmp3), FIX_1_414213562_A) - tmp13;
00805
00806 tmp0 = tmp10 + tmp13;
00807 tmp3 = tmp10 - tmp13;
00808 tmp1 = tmp11 + tmp12;
00809 tmp2 = tmp11 - tmp12;
00810
00811
00812
/* Odd part of the forward transform: coefficients d1, d3, d5, d7. */
00813 tmp10 = tmp4 + tmp5;
00814 tmp11 = tmp5 + tmp6;
00815 tmp12 = tmp6 + tmp7;
00816
00817 z5 = MULTIPLY16H((tmp10 - tmp12)<<2, FIX_0_382683433);
00818 z2 = MULTIPLY16H(tmp10 <<2, FIX_0_541196100) + z5;
00819 z4 = MULTIPLY16H(tmp12 <<2, FIX_1_306562965) + z5;
00820 z3 = MULTIPLY16H(tmp11 <<2, FIX_0_707106781);
00821
00822 z11 = tmp7 + z3;
00823 z13 = tmp7 - z3;
00824
00825 d5 = z13 + z2;
00826 d3 = z13 - z2;
00827 d1 = z11 + z4;
00828 d7 = z11 - z4;
00829
00830
00831
/* Zero odd coefficients that are within their threshold. */
00832 THRESHOLD(tmp4, d1, threshold[1*8]);
00833 THRESHOLD(tmp5, d3, threshold[3*8]);
00834 THRESHOLD(tmp6, d5, threshold[5*8]);
00835 THRESHOLD(tmp7, d7, threshold[7*8]);
00836
00837
/* Odd part of the inverse transform. */
00838 z13 = tmp6 + tmp5;
00839 z10 = (tmp6 - tmp5)<<1;
00840 z11 = tmp4 + tmp7;
00841 z12 = (tmp4 - tmp7)<<1;
00842
00843 tmp7 = (z11 + z13)>>2;
00844 tmp11 = MULTIPLY16H((z11 - z13)<<1, FIX_1_414213562);
00845 z5 = MULTIPLY16H(z10 + z12, FIX_1_847759065);
00846 tmp10 = MULTIPLY16H(z12, FIX_1_082392200) - z5;
/* FIX_2_613125930 is stored negated, hence "+ z5" here. */
00847 tmp12 = MULTIPLY16H(z10, FIX_2_613125930) + z5;
00848
00849 tmp6 = tmp12 - tmp7;
00850 tmp5 = tmp11 - tmp6;
00851 tmp4 = tmp10 + tmp5;
00852
/* Combine even and odd halves. Rows 0..5 add to what is already in the
 * output; rows 6 and 7 replace it. */
00853 wsptr[DCTSIZE*0]+= (tmp0 + tmp7);
00854 wsptr[DCTSIZE*1]+= (tmp1 + tmp6);
00855 wsptr[DCTSIZE*2]+= (tmp2 + tmp5);
00856 wsptr[DCTSIZE*3]+= (tmp3 - tmp4);
00857 wsptr[DCTSIZE*4]+= (tmp3 + tmp4);
00858 wsptr[DCTSIZE*5]+= (tmp2 - tmp5);
00859 wsptr[DCTSIZE*6]= (tmp1 - tmp6);
00860 wsptr[DCTSIZE*7]= (tmp0 - tmp7);
00861
/* Next column. */
00862 dataptr++;
00863 wsptr++;
00864 threshold++;
00865 }
/* Together with the 8 single steps above this advances by 16 elements. */
00866 dataptr+=8;
00867 wsptr +=8;
00868 }
00869 }
00870
00871 #else
00872
00873 static void column_fidct_mmx(int16_t* thr_adr, DCTELEM *data, DCTELEM *output, int cnt)
00874 {
00875 uint64_t __attribute__((aligned(8))) temps[4];
00876 __asm__ volatile(
00877 ASMALIGN(4)
00878 "1: \n\t"
00879 "movq "DCTSIZE_S"*0*2(%%"REG_S"), %%mm1 \n\t"
00880
00881 "movq "DCTSIZE_S"*3*2(%%"REG_S"), %%mm7 \n\t"
00882 "movq %%mm1, %%mm0 \n\t"
00883
00884 "paddw "DCTSIZE_S"*7*2(%%"REG_S"), %%mm1 \n\t"
00885 "movq %%mm7, %%mm3 \n\t"
00886
00887 "paddw "DCTSIZE_S"*4*2(%%"REG_S"), %%mm7 \n\t"
00888 "movq %%mm1, %%mm5 \n\t"
00889
00890 "movq "DCTSIZE_S"*1*2(%%"REG_S"), %%mm6 \n\t"
00891 "psubw %%mm7, %%mm1 \n\t"
00892
00893 "movq "DCTSIZE_S"*2*2(%%"REG_S"), %%mm2 \n\t"
00894 "movq %%mm6, %%mm4 \n\t"
00895
00896 "paddw "DCTSIZE_S"*6*2(%%"REG_S"), %%mm6 \n\t"
00897 "paddw %%mm7, %%mm5 \n\t"
00898
00899 "paddw "DCTSIZE_S"*5*2(%%"REG_S"), %%mm2 \n\t"
00900 "movq %%mm6, %%mm7 \n\t"
00901
00902 "paddw %%mm2, %%mm6 \n\t"
00903 "psubw %%mm2, %%mm7 \n\t"
00904
00905 "movq %%mm5, %%mm2 \n\t"
00906 "paddw %%mm6, %%mm5 \n\t"
00907
00908 "psubw %%mm6, %%mm2 \n\t"
00909 "paddw %%mm1, %%mm7 \n\t"
00910
00911 "movq 4*16(%%"REG_d"), %%mm6 \n\t"
00912 "psllw $2, %%mm7 \n\t"
00913
00914 "psubw 0*16(%%"REG_d"), %%mm5 \n\t"
00915 "psubw %%mm6, %%mm2 \n\t"
00916
00917 "paddusw 0*16(%%"REG_d"), %%mm5 \n\t"
00918 "paddusw %%mm6, %%mm2 \n\t"
00919
00920 "pmulhw "MANGLE(MM_FIX_0_707106781)", %%mm7 \n\t"
00921
00922 "paddw 0*16(%%"REG_d"), %%mm5 \n\t"
00923 "paddw %%mm6, %%mm2 \n\t"
00924
00925 "psubusw 0*16(%%"REG_d"), %%mm5 \n\t"
00926 "psubusw %%mm6, %%mm2 \n\t"
00927
00928
00929
00930
00931 "paddw "MANGLE(MM_2)", %%mm5 \n\t"
00932 "movq %%mm2, %%mm6 \n\t"
00933
00934 "paddw %%mm5, %%mm2 \n\t"
00935 "psubw %%mm6, %%mm5 \n\t"
00936
00937 "movq %%mm1, %%mm6 \n\t"
00938 "paddw %%mm7, %%mm1 \n\t"
00939
00940 "psubw 2*16(%%"REG_d"), %%mm1 \n\t"
00941 "psubw %%mm7, %%mm6 \n\t"
00942
00943 "movq 6*16(%%"REG_d"), %%mm7 \n\t"
00944 "psraw $2, %%mm5 \n\t"
00945
00946 "paddusw 2*16(%%"REG_d"), %%mm1 \n\t"
00947 "psubw %%mm7, %%mm6 \n\t"
00948
00949
00950 "paddw 2*16(%%"REG_d"), %%mm1 \n\t"
00951 "paddusw %%mm7, %%mm6 \n\t"
00952
00953 "psubusw 2*16(%%"REG_d"), %%mm1 \n\t"
00954 "paddw %%mm7, %%mm6 \n\t"
00955
00956 "psubw "DCTSIZE_S"*4*2(%%"REG_S"), %%mm3 \n\t"
00957 "psubusw %%mm7, %%mm6 \n\t"
00958
00959
00960
00961 "movq %%mm1, %%mm7 \n\t"
00962 "psraw $2, %%mm2 \n\t"
00963
00964 "psubw "DCTSIZE_S"*6*2(%%"REG_S"), %%mm4 \n\t"
00965 "psubw %%mm6, %%mm1 \n\t"
00966
00967 "psubw "DCTSIZE_S"*7*2(%%"REG_S"), %%mm0 \n\t"
00968 "paddw %%mm7, %%mm6 \n\t"
00969
00970 "psraw $2, %%mm6 \n\t"
00971 "movq %%mm2, %%mm7 \n\t"
00972
00973 "pmulhw "MANGLE(MM_FIX_1_414213562_A)", %%mm1 \n\t"
00974 "paddw %%mm6, %%mm2 \n\t"
00975
00976 "movq %%mm2, 0*8+%3 \n\t"
00977 "psubw %%mm6, %%mm7 \n\t"
00978
00979 "movq "DCTSIZE_S"*2*2(%%"REG_S"), %%mm2 \n\t"
00980 "psubw %%mm6, %%mm1 \n\t"
00981
00982 "psubw "DCTSIZE_S"*5*2(%%"REG_S"), %%mm2 \n\t"
00983 "movq %%mm5, %%mm6 \n\t"
00984
00985 "movq %%mm7, 3*8+%3 \n\t"
00986 "paddw %%mm2, %%mm3 \n\t"
00987
00988 "paddw %%mm4, %%mm2 \n\t"
00989 "paddw %%mm0, %%mm4 \n\t"
00990
00991 "movq %%mm3, %%mm7 \n\t"
00992 "psubw %%mm4, %%mm3 \n\t"
00993
00994 "psllw $2, %%mm3 \n\t"
00995 "psllw $2, %%mm7 \n\t"
00996
00997 "pmulhw "MANGLE(MM_FIX_0_382683433)", %%mm3 \n\t"
00998 "psllw $2, %%mm4 \n\t"
00999
01000 "pmulhw "MANGLE(MM_FIX_0_541196100)", %%mm7 \n\t"
01001 "psllw $2, %%mm2 \n\t"
01002
01003 "pmulhw "MANGLE(MM_FIX_1_306562965)", %%mm4 \n\t"
01004 "paddw %%mm1, %%mm5 \n\t"
01005
01006 "pmulhw "MANGLE(MM_FIX_0_707106781)", %%mm2 \n\t"
01007 "psubw %%mm1, %%mm6 \n\t"
01008
01009
01010 "paddw %%mm3, %%mm7 \n\t"
01011
01012 "movq %%mm5, 1*8+%3 \n\t"
01013 "paddw %%mm3, %%mm4 \n\t"
01014
01015 "movq 3*16(%%"REG_d"), %%mm3 \n\t"
01016 "movq %%mm0, %%mm1 \n\t"
01017
01018 "movq %%mm6, 2*8+%3 \n\t"
01019 "psubw %%mm2, %%mm1 \n\t"
01020
01021
01022 "paddw %%mm2, %%mm0 \n\t"
01023 "movq %%mm1, %%mm5 \n\t"
01024
01025 "movq 5*16(%%"REG_d"), %%mm2 \n\t"
01026 "psubw %%mm7, %%mm1 \n\t"
01027
01028 "paddw %%mm7, %%mm5 \n\t"
01029 "psubw %%mm3, %%mm1 \n\t"
01030
01031 "movq 1*16(%%"REG_d"), %%mm7 \n\t"
01032 "psubw %%mm2, %%mm5 \n\t"
01033
01034 "movq %%mm0, %%mm6 \n\t"
01035 "paddw %%mm4, %%mm0 \n\t"
01036
01037 "paddusw %%mm3, %%mm1 \n\t"
01038 "psubw %%mm4, %%mm6 \n\t"
01039
01040
01041 "movq 7*16(%%"REG_d"), %%mm4 \n\t"
01042 "psubw %%mm7, %%mm0 \n\t"
01043
01044 "psubw %%mm4, %%mm6 \n\t"
01045 "paddusw %%mm2, %%mm5 \n\t"
01046
01047 "paddusw %%mm4, %%mm6 \n\t"
01048 "paddw %%mm3, %%mm1 \n\t"
01049
01050 "paddw %%mm2, %%mm5 \n\t"
01051 "paddw %%mm4, %%mm6 \n\t"
01052
01053 "psubusw %%mm3, %%mm1 \n\t"
01054 "psubusw %%mm2, %%mm5 \n\t"
01055
01056 "psubusw %%mm4, %%mm6 \n\t"
01057 "movq %%mm1, %%mm4 \n\t"
01058
01059 "por %%mm5, %%mm4 \n\t"
01060 "paddusw %%mm7, %%mm0 \n\t"
01061
01062 "por %%mm6, %%mm4 \n\t"
01063 "paddw %%mm7, %%mm0 \n\t"
01064
01065 "packssdw %%mm4, %%mm4 \n\t"
01066 "psubusw %%mm7, %%mm0 \n\t"
01067
01068 "movd %%mm4, %%"REG_a" \n\t"
01069 "or %%"REG_a", %%"REG_a" \n\t"
01070 "jnz 2f \n\t"
01071
01072
01073
01074
01075
01076
01077
01078 "movq 0*8+%3, %%mm4 \n\t"
01079 "movq %%mm0, %%mm1 \n\t"
01080
01081 "pmulhw "MANGLE(MM_FIX_0_847759065)", %%mm0 \n\t"
01082 "movq %%mm1, %%mm2 \n\t"
01083
01084 "movq "DCTSIZE_S"*0*2(%%"REG_D"), %%mm5 \n\t"
01085 "movq %%mm2, %%mm3 \n\t"
01086
01087 "pmulhw "MANGLE(MM_FIX_0_566454497)", %%mm1 \n\t"
01088 "paddw %%mm4, %%mm5 \n\t"
01089
01090 "movq 1*8+%3, %%mm6 \n\t"
01091
01092 "psraw $2, %%mm3 \n\t"
01093
01094 "pmulhw "MANGLE(MM_FIX_0_198912367)", %%mm2 \n\t"
01095 "psubw %%mm3, %%mm4 \n\t"
01096
01097 "movq "DCTSIZE_S"*1*2(%%"REG_D"), %%mm7 \n\t"
01098 "paddw %%mm3, %%mm5 \n\t"
01099
01100 "movq %%mm4, "DCTSIZE_S"*7*2(%%"REG_D") \n\t"
01101 "paddw %%mm6, %%mm7 \n\t"
01102
01103 "movq 2*8+%3, %%mm3 \n\t"
01104 "psubw %%mm0, %%mm6 \n\t"
01105
01106 "movq "DCTSIZE_S"*2*2(%%"REG_D"), %%mm4 \n\t"
01107 "paddw %%mm0, %%mm7 \n\t"
01108
01109 "movq %%mm5, "DCTSIZE_S"*0*2(%%"REG_D") \n\t"
01110 "paddw %%mm3, %%mm4 \n\t"
01111
01112 "movq %%mm6, "DCTSIZE_S"*6*2(%%"REG_D") \n\t"
01113 "psubw %%mm1, %%mm3 \n\t"
01114
01115 "movq "DCTSIZE_S"*5*2(%%"REG_D"), %%mm5 \n\t"
01116 "paddw %%mm1, %%mm4 \n\t"
01117
01118 "movq "DCTSIZE_S"*3*2(%%"REG_D"), %%mm6 \n\t"
01119 "paddw %%mm3, %%mm5 \n\t"
01120
01121 "movq 3*8+%3, %%mm0 \n\t"
01122 "add $8, %%"REG_S" \n\t"
01123
01124 "movq %%mm7, "DCTSIZE_S"*1*2(%%"REG_D") \n\t"
01125 "paddw %%mm0, %%mm6 \n\t"
01126
01127 "movq %%mm4, "DCTSIZE_S"*2*2(%%"REG_D") \n\t"
01128 "psubw %%mm2, %%mm0 \n\t"
01129
01130 "movq "DCTSIZE_S"*4*2(%%"REG_D"), %%mm7 \n\t"
01131 "paddw %%mm2, %%mm6 \n\t"
01132
01133 "movq %%mm5, "DCTSIZE_S"*5*2(%%"REG_D") \n\t"
01134 "paddw %%mm0, %%mm7 \n\t"
01135
01136 "movq %%mm6, "DCTSIZE_S"*3*2(%%"REG_D") \n\t"
01137
01138 "movq %%mm7, "DCTSIZE_S"*4*2(%%"REG_D") \n\t"
01139 "add $8, %%"REG_D" \n\t"
01140 "jmp 4f \n\t"
01141
01142 "2: \n\t"
01143
01144
01145
01146
01147
01148 "movq %%mm5, %%mm3 \n\t"
01149 "psubw %%mm1, %%mm5 \n\t"
01150
01151 "psllw $1, %%mm5 \n\t"
01152 "paddw %%mm1, %%mm3 \n\t"
01153
01154 "movq %%mm0, %%mm2 \n\t"
01155 "psubw %%mm6, %%mm0 \n\t"
01156
01157 "movq %%mm5, %%mm1 \n\t"
01158 "psllw $1, %%mm0 \n\t"
01159
01160 "pmulhw "MANGLE(MM_FIX_2_613125930)", %%mm1 \n\t"
01161 "paddw %%mm0, %%mm5 \n\t"
01162
01163 "pmulhw "MANGLE(MM_FIX_1_847759065)", %%mm5 \n\t"
01164 "paddw %%mm6, %%mm2 \n\t"
01165
01166 "pmulhw "MANGLE(MM_FIX_1_082392200)", %%mm0 \n\t"
01167 "movq %%mm2, %%mm7 \n\t"
01168
01169
01170 "movq 0*8+%3, %%mm4 \n\t"
01171 "psubw %%mm3, %%mm2 \n\t"
01172
01173 "psllw $1, %%mm2 \n\t"
01174 "paddw %%mm3, %%mm7 \n\t"
01175
01176 "pmulhw "MANGLE(MM_FIX_1_414213562)", %%mm2 \n\t"
01177 "movq %%mm4, %%mm6 \n\t"
01178
01179 "psraw $2, %%mm7 \n\t"
01180
01181 "paddw "DCTSIZE_S"*0*2(%%"REG_D"), %%mm4 \n\t"
01182 "psubw %%mm7, %%mm6 \n\t"
01183
01184 "movq 1*8+%3, %%mm3 \n\t"
01185 "paddw %%mm7, %%mm4 \n\t"
01186
01187 "movq %%mm6, "DCTSIZE_S"*7*2(%%"REG_D") \n\t"
01188 "paddw %%mm5, %%mm1 \n\t"
01189
01190 "movq %%mm4, "DCTSIZE_S"*0*2(%%"REG_D") \n\t"
01191 "psubw %%mm7, %%mm1 \n\t"
01192
01193 "movq 2*8+%3, %%mm7 \n\t"
01194 "psubw %%mm5, %%mm0 \n\t"
01195
01196 "movq 3*8+%3, %%mm6 \n\t"
01197 "movq %%mm3, %%mm5 \n\t"
01198
01199 "paddw "DCTSIZE_S"*1*2(%%"REG_D"), %%mm3 \n\t"
01200 "psubw %%mm1, %%mm5 \n\t"
01201
01202 "psubw %%mm1, %%mm2 \n\t"
01203 "paddw %%mm1, %%mm3 \n\t"
01204
01205 "movq %%mm5, "DCTSIZE_S"*6*2(%%"REG_D") \n\t"
01206 "movq %%mm7, %%mm4 \n\t"
01207
01208 "paddw "DCTSIZE_S"*2*2(%%"REG_D"), %%mm7 \n\t"
01209 "psubw %%mm2, %%mm4 \n\t"
01210
01211 "paddw "DCTSIZE_S"*5*2(%%"REG_D"), %%mm4 \n\t"
01212 "paddw %%mm2, %%mm7 \n\t"
01213
01214 "movq %%mm3, "DCTSIZE_S"*1*2(%%"REG_D") \n\t"
01215 "paddw %%mm2, %%mm0 \n\t"
01216
01217
01218 "movq %%mm7, "DCTSIZE_S"*2*2(%%"REG_D") \n\t"
01219 "movq %%mm6, %%mm1 \n\t"
01220
01221 "paddw "DCTSIZE_S"*4*2(%%"REG_D"), %%mm6 \n\t"
01222 "psubw %%mm0, %%mm1 \n\t"
01223
01224 "paddw "DCTSIZE_S"*3*2(%%"REG_D"), %%mm1 \n\t"
01225 "paddw %%mm0, %%mm6 \n\t"
01226
01227 "movq %%mm4, "DCTSIZE_S"*5*2(%%"REG_D") \n\t"
01228 "add $8, %%"REG_S" \n\t"
01229
01230 "movq %%mm6, "DCTSIZE_S"*4*2(%%"REG_D") \n\t"
01231
01232 "movq %%mm1, "DCTSIZE_S"*3*2(%%"REG_D") \n\t"
01233 "add $8, %%"REG_D" \n\t"
01234
01235 "4: \n\t"
01236
01237 "movq "DCTSIZE_S"*0*2(%%"REG_S"), %%mm1 \n\t"
01238
01239 "movq "DCTSIZE_S"*3*2(%%"REG_S"), %%mm7 \n\t"
01240 "movq %%mm1, %%mm0 \n\t"
01241
01242 "paddw "DCTSIZE_S"*7*2(%%"REG_S"), %%mm1 \n\t"
01243 "movq %%mm7, %%mm3 \n\t"
01244
01245 "paddw "DCTSIZE_S"*4*2(%%"REG_S"), %%mm7 \n\t"
01246 "movq %%mm1, %%mm5 \n\t"
01247
01248 "movq "DCTSIZE_S"*1*2(%%"REG_S"), %%mm6 \n\t"
01249 "psubw %%mm7, %%mm1 \n\t"
01250
01251 "movq "DCTSIZE_S"*2*2(%%"REG_S"), %%mm2 \n\t"
01252 "movq %%mm6, %%mm4 \n\t"
01253
01254 "paddw "DCTSIZE_S"*6*2(%%"REG_S"), %%mm6 \n\t"
01255 "paddw %%mm7, %%mm5 \n\t"
01256
01257 "paddw "DCTSIZE_S"*5*2(%%"REG_S"), %%mm2 \n\t"
01258 "movq %%mm6, %%mm7 \n\t"
01259
01260 "paddw %%mm2, %%mm6 \n\t"
01261 "psubw %%mm2, %%mm7 \n\t"
01262
01263 "movq %%mm5, %%mm2 \n\t"
01264 "paddw %%mm6, %%mm5 \n\t"
01265
01266 "psubw %%mm6, %%mm2 \n\t"
01267 "paddw %%mm1, %%mm7 \n\t"
01268
01269 "movq 1*8+4*16(%%"REG_d"), %%mm6 \n\t"
01270 "psllw $2, %%mm7 \n\t"
01271
01272 "psubw 1*8+0*16(%%"REG_d"), %%mm5 \n\t"
01273 "psubw %%mm6, %%mm2 \n\t"
01274
01275 "paddusw 1*8+0*16(%%"REG_d"), %%mm5 \n\t"
01276 "paddusw %%mm6, %%mm2 \n\t"
01277
01278 "pmulhw "MANGLE(MM_FIX_0_707106781)", %%mm7 \n\t"
01279
01280 "paddw 1*8+0*16(%%"REG_d"), %%mm5 \n\t"
01281 "paddw %%mm6, %%mm2 \n\t"
01282
01283 "psubusw 1*8+0*16(%%"REG_d"), %%mm5 \n\t"
01284 "psubusw %%mm6, %%mm2 \n\t"
01285
01286
01287
01288
01289 "paddw "MANGLE(MM_2)", %%mm5 \n\t"
01290 "movq %%mm2, %%mm6 \n\t"
01291
01292 "paddw %%mm5, %%mm2 \n\t"
01293 "psubw %%mm6, %%mm5 \n\t"
01294
01295 "movq %%mm1, %%mm6 \n\t"
01296 "paddw %%mm7, %%mm1 \n\t"
01297
01298 "psubw 1*8+2*16(%%"REG_d"), %%mm1 \n\t"
01299 "psubw %%mm7, %%mm6 \n\t"
01300
01301 "movq 1*8+6*16(%%"REG_d"), %%mm7 \n\t"
01302 "psraw $2, %%mm5 \n\t"
01303
01304 "paddusw 1*8+2*16(%%"REG_d"), %%mm1 \n\t"
01305 "psubw %%mm7, %%mm6 \n\t"
01306
01307
01308 "paddw 1*8+2*16(%%"REG_d"), %%mm1 \n\t"
01309 "paddusw %%mm7, %%mm6 \n\t"
01310
01311 "psubusw 1*8+2*16(%%"REG_d"), %%mm1 \n\t"
01312 "paddw %%mm7, %%mm6 \n\t"
01313
01314 "psubw "DCTSIZE_S"*4*2(%%"REG_S"), %%mm3 \n\t"
01315 "psubusw %%mm7, %%mm6 \n\t"
01316
01317
01318
01319 "movq %%mm1, %%mm7 \n\t"
01320 "psraw $2, %%mm2 \n\t"
01321
01322 "psubw "DCTSIZE_S"*6*2(%%"REG_S"), %%mm4 \n\t"
01323 "psubw %%mm6, %%mm1 \n\t"
01324
01325 "psubw "DCTSIZE_S"*7*2(%%"REG_S"), %%mm0 \n\t"
01326 "paddw %%mm7, %%mm6 \n\t"
01327
01328 "psraw $2, %%mm6 \n\t"
01329 "movq %%mm2, %%mm7 \n\t"
01330
01331 "pmulhw "MANGLE(MM_FIX_1_414213562_A)", %%mm1 \n\t"
01332 "paddw %%mm6, %%mm2 \n\t"
01333
01334 "movq %%mm2, 0*8+%3 \n\t"
01335 "psubw %%mm6, %%mm7 \n\t"
01336
01337 "movq "DCTSIZE_S"*2*2(%%"REG_S"), %%mm2 \n\t"
01338 "psubw %%mm6, %%mm1 \n\t"
01339
01340 "psubw "DCTSIZE_S"*5*2(%%"REG_S"), %%mm2 \n\t"
01341 "movq %%mm5, %%mm6 \n\t"
01342
01343 "movq %%mm7, 3*8+%3 \n\t"
01344 "paddw %%mm2, %%mm3 \n\t"
01345
01346 "paddw %%mm4, %%mm2 \n\t"
01347 "paddw %%mm0, %%mm4 \n\t"
01348
01349 "movq %%mm3, %%mm7 \n\t"
01350 "psubw %%mm4, %%mm3 \n\t"
01351
01352 "psllw $2, %%mm3 \n\t"
01353 "psllw $2, %%mm7 \n\t"
01354
01355 "pmulhw "MANGLE(MM_FIX_0_382683433)", %%mm3 \n\t"
01356 "psllw $2, %%mm4 \n\t"
01357
01358 "pmulhw "MANGLE(MM_FIX_0_541196100)", %%mm7 \n\t"
01359 "psllw $2, %%mm2 \n\t"
01360
01361 "pmulhw "MANGLE(MM_FIX_1_306562965)", %%mm4 \n\t"
01362 "paddw %%mm1, %%mm5 \n\t"
01363
01364 "pmulhw "MANGLE(MM_FIX_0_707106781)", %%mm2 \n\t"
01365 "psubw %%mm1, %%mm6 \n\t"
01366
01367
01368 "paddw %%mm3, %%mm7 \n\t"
01369
01370 "movq %%mm5, 1*8+%3 \n\t"
01371 "paddw %%mm3, %%mm4 \n\t"
01372
01373 "movq 1*8+3*16(%%"REG_d"), %%mm3 \n\t"
01374 "movq %%mm0, %%mm1 \n\t"
01375
01376 "movq %%mm6, 2*8+%3 \n\t"
01377 "psubw %%mm2, %%mm1 \n\t"
01378
01379
01380 "paddw %%mm2, %%mm0 \n\t"
01381 "movq %%mm1, %%mm5 \n\t"
01382
01383 "movq 1*8+5*16(%%"REG_d"), %%mm2 \n\t"
01384 "psubw %%mm7, %%mm1 \n\t"
01385
01386 "paddw %%mm7, %%mm5 \n\t"
01387 "psubw %%mm3, %%mm1 \n\t"
01388
01389 "movq 1*8+1*16(%%"REG_d"), %%mm7 \n\t"
01390 "psubw %%mm2, %%mm5 \n\t"
01391
01392 "movq %%mm0, %%mm6 \n\t"
01393 "paddw %%mm4, %%mm0 \n\t"
01394
01395 "paddusw %%mm3, %%mm1 \n\t"
01396 "psubw %%mm4, %%mm6 \n\t"
01397
01398
01399 "movq 1*8+7*16(%%"REG_d"), %%mm4 \n\t"
01400 "psubw %%mm7, %%mm0 \n\t"
01401
01402 "psubw %%mm4, %%mm6 \n\t"
01403 "paddusw %%mm2, %%mm5 \n\t"
01404
01405 "paddusw %%mm4, %%mm6 \n\t"
01406 "paddw %%mm3, %%mm1 \n\t"
01407
01408 "paddw %%mm2, %%mm5 \n\t"
01409 "paddw %%mm4, %%mm6 \n\t"
01410
01411 "psubusw %%mm3, %%mm1 \n\t"
01412 "psubusw %%mm2, %%mm5 \n\t"
01413
01414 "psubusw %%mm4, %%mm6 \n\t"
01415 "movq %%mm1, %%mm4 \n\t"
01416
01417 "por %%mm5, %%mm4 \n\t"
01418 "paddusw %%mm7, %%mm0 \n\t"
01419
01420 "por %%mm6, %%mm4 \n\t"
01421 "paddw %%mm7, %%mm0 \n\t"
01422
01423 "packssdw %%mm4, %%mm4 \n\t"
01424 "psubusw %%mm7, %%mm0 \n\t"
01425
01426 "movd %%mm4, %%"REG_a" \n\t"
01427 "or %%"REG_a", %%"REG_a" \n\t"
01428 "jnz 3f \n\t"
01429
01430
01431
01432
01433
01434
01435
01436 "movq 0*8+%3, %%mm4 \n\t"
01437 "movq %%mm0, %%mm1 \n\t"
01438
01439 "pmulhw "MANGLE(MM_FIX_0_847759065)", %%mm0 \n\t"
01440 "movq %%mm1, %%mm2 \n\t"
01441
01442 "movq "DCTSIZE_S"*0*2(%%"REG_D"), %%mm5 \n\t"
01443 "movq %%mm2, %%mm3 \n\t"
01444
01445 "pmulhw "MANGLE(MM_FIX_0_566454497)", %%mm1 \n\t"
01446 "paddw %%mm4, %%mm5 \n\t"
01447
01448 "movq 1*8+%3, %%mm6 \n\t"
01449
01450 "psraw $2, %%mm3 \n\t"
01451
01452 "pmulhw "MANGLE(MM_FIX_0_198912367)", %%mm2 \n\t"
01453 "psubw %%mm3, %%mm4 \n\t"
01454
01455 "movq "DCTSIZE_S"*1*2(%%"REG_D"), %%mm7 \n\t"
01456 "paddw %%mm3, %%mm5 \n\t"
01457
01458 "movq %%mm4, "DCTSIZE_S"*7*2(%%"REG_D") \n\t"
01459 "paddw %%mm6, %%mm7 \n\t"
01460
01461 "movq 2*8+%3, %%mm3 \n\t"
01462 "psubw %%mm0, %%mm6 \n\t"
01463
01464 "movq "DCTSIZE_S"*2*2(%%"REG_D"), %%mm4 \n\t"
01465 "paddw %%mm0, %%mm7 \n\t"
01466
01467 "movq %%mm5, "DCTSIZE_S"*0*2(%%"REG_D") \n\t"
01468 "paddw %%mm3, %%mm4 \n\t"
01469
01470 "movq %%mm6, "DCTSIZE_S"*6*2(%%"REG_D") \n\t"
01471 "psubw %%mm1, %%mm3 \n\t"
01472
01473 "movq "DCTSIZE_S"*5*2(%%"REG_D"), %%mm5 \n\t"
01474 "paddw %%mm1, %%mm4 \n\t"
01475
01476 "movq "DCTSIZE_S"*3*2(%%"REG_D"), %%mm6 \n\t"
01477 "paddw %%mm3, %%mm5 \n\t"
01478
01479 "movq 3*8+%3, %%mm0 \n\t"
01480 "add $24, %%"REG_S" \n\t"
01481
01482 "movq %%mm7, "DCTSIZE_S"*1*2(%%"REG_D") \n\t"
01483 "paddw %%mm0, %%mm6 \n\t"
01484
01485 "movq %%mm4, "DCTSIZE_S"*2*2(%%"REG_D") \n\t"
01486 "psubw %%mm2, %%mm0 \n\t"
01487
01488 "movq "DCTSIZE_S"*4*2(%%"REG_D"), %%mm7 \n\t"
01489 "paddw %%mm2, %%mm6 \n\t"
01490
01491 "movq %%mm5, "DCTSIZE_S"*5*2(%%"REG_D") \n\t"
01492 "paddw %%mm0, %%mm7 \n\t"
01493
01494 "movq %%mm6, "DCTSIZE_S"*3*2(%%"REG_D") \n\t"
01495
01496 "movq %%mm7, "DCTSIZE_S"*4*2(%%"REG_D") \n\t"
01497 "add $24, %%"REG_D" \n\t"
01498 "sub $2, %%"REG_c" \n\t"
01499 "jnz 1b \n\t"
01500 "jmp 5f \n\t"
01501
01502 "3: \n\t"
01503
01504
01505
01506
01507
01508 "movq %%mm5, %%mm3 \n\t"
01509 "psubw %%mm1, %%mm5 \n\t"
01510
01511 "psllw $1, %%mm5 \n\t"
01512 "paddw %%mm1, %%mm3 \n\t"
01513
01514 "movq %%mm0, %%mm2 \n\t"
01515 "psubw %%mm6, %%mm0 \n\t"
01516
01517 "movq %%mm5, %%mm1 \n\t"
01518 "psllw $1, %%mm0 \n\t"
01519
01520 "pmulhw "MANGLE(MM_FIX_2_613125930)", %%mm1 \n\t"
01521 "paddw %%mm0, %%mm5 \n\t"
01522
01523 "pmulhw "MANGLE(MM_FIX_1_847759065)", %%mm5 \n\t"
01524 "paddw %%mm6, %%mm2 \n\t"
01525
01526 "pmulhw "MANGLE(MM_FIX_1_082392200)", %%mm0 \n\t"
01527 "movq %%mm2, %%mm7 \n\t"
01528
01529
01530 "movq 0*8+%3, %%mm4 \n\t"
01531 "psubw %%mm3, %%mm2 \n\t"
01532
01533 "psllw $1, %%mm2 \n\t"
01534 "paddw %%mm3, %%mm7 \n\t"
01535
01536 "pmulhw "MANGLE(MM_FIX_1_414213562)", %%mm2 \n\t"
01537 "movq %%mm4, %%mm6 \n\t"
01538
01539 "psraw $2, %%mm7 \n\t"
01540
01541 "paddw "DCTSIZE_S"*0*2(%%"REG_D"), %%mm4 \n\t"
01542 "psubw %%mm7, %%mm6 \n\t"
01543
01544 "movq 1*8+%3, %%mm3 \n\t"
01545 "paddw %%mm7, %%mm4 \n\t"
01546
01547 "movq %%mm6, "DCTSIZE_S"*7*2(%%"REG_D") \n\t"
01548 "paddw %%mm5, %%mm1 \n\t"
01549
01550 "movq %%mm4, "DCTSIZE_S"*0*2(%%"REG_D") \n\t"
01551 "psubw %%mm7, %%mm1 \n\t"
01552
01553 "movq 2*8+%3, %%mm7 \n\t"
01554 "psubw %%mm5, %%mm0 \n\t"
01555
01556 "movq 3*8+%3, %%mm6 \n\t"
01557 "movq %%mm3, %%mm5 \n\t"
01558
01559 "paddw "DCTSIZE_S"*1*2(%%"REG_D"), %%mm3 \n\t"
01560 "psubw %%mm1, %%mm5 \n\t"
01561
01562 "psubw %%mm1, %%mm2 \n\t"
01563 "paddw %%mm1, %%mm3 \n\t"
01564
01565 "movq %%mm5, "DCTSIZE_S"*6*2(%%"REG_D") \n\t"
01566 "movq %%mm7, %%mm4 \n\t"
01567
01568 "paddw "DCTSIZE_S"*2*2(%%"REG_D"), %%mm7 \n\t"
01569 "psubw %%mm2, %%mm4 \n\t"
01570
01571 "paddw "DCTSIZE_S"*5*2(%%"REG_D"), %%mm4 \n\t"
01572 "paddw %%mm2, %%mm7 \n\t"
01573
01574 "movq %%mm3, "DCTSIZE_S"*1*2(%%"REG_D") \n\t"
01575 "paddw %%mm2, %%mm0 \n\t"
01576
01577
01578 "movq %%mm7, "DCTSIZE_S"*2*2(%%"REG_D") \n\t"
01579 "movq %%mm6, %%mm1 \n\t"
01580
01581 "paddw "DCTSIZE_S"*4*2(%%"REG_D"), %%mm6 \n\t"
01582 "psubw %%mm0, %%mm1 \n\t"
01583
01584 "paddw "DCTSIZE_S"*3*2(%%"REG_D"), %%mm1 \n\t"
01585 "paddw %%mm0, %%mm6 \n\t"
01586
01587 "movq %%mm4, "DCTSIZE_S"*5*2(%%"REG_D") \n\t"
01588 "add $24, %%"REG_S" \n\t"
01589
01590 "movq %%mm6, "DCTSIZE_S"*4*2(%%"REG_D") \n\t"
01591
01592 "movq %%mm1, "DCTSIZE_S"*3*2(%%"REG_D") \n\t"
01593 "add $24, %%"REG_D" \n\t"
01594 "sub $2, %%"REG_c" \n\t"
01595 "jnz 1b \n\t"
01596 "5: \n\t"
01597
01598 : "+S"(data), "+D"(output), "+c"(cnt), "=o"(temps)
01599 : "d"(thr_adr)
01600 : "%"REG_a
01601 );
01602 }
01603
01604 #endif // HAVE_MMX
01605
01606 #if !HAVE_MMX
01607
/**
 * Row pass of the inverse DCT, portable C version (compiled when HAVE_MMX
 * is not set).
 *
 * The loop runs cnt*4 times. Each iteration reads eight coefficients
 * wsptr[0..7], ADDS the eight descaled results to
 * outptr[k*output_stride] for k = 0..7, then advances outptr by one
 * element and wsptr by DCTSIZE elements.
 *
 * @param workspace      input coefficients, DCTSIZE elements apart per iteration
 * @param output_adr     accumulation buffer; updated in place with "+="
 * @param output_stride  distance, in int16_t elements, between the eight
 *                       values written by one iteration
 * @param cnt            number of groups of four iterations
 *
 * MULTIPLY16H, DESCALE, the FIX_* constants, DCTSIZE, DCTELEM and
 * int_simd16_t are defined elsewhere in this file.
 */
01608 static void row_idct_c(DCTELEM* workspace,
01609 int16_t* output_adr, int output_stride, int cnt)
01610 {
01611 int_simd16_t tmp0, tmp1, tmp2, tmp3, tmp4, tmp5, tmp6, tmp7;
01612 int_simd16_t tmp10, tmp11, tmp12, tmp13;
01613 int_simd16_t z5, z10, z11, z12, z13;
01614 int16_t* outptr;
01615 DCTELEM* wsptr;
01616
/* four iterations per unit of cnt */
01617 cnt*=4;
01618 wsptr = workspace;
01619 outptr = output_adr;
01620 for (; cnt > 0; cnt--) {
01621
/* First half: built from wsptr[0..3]. */
01622
01623 tmp10 = ( wsptr[2] + wsptr[3]);
01624 tmp11 = ( wsptr[2] - wsptr[3]);
01625
01626 tmp13 = ( wsptr[0] + wsptr[1]);
/* Scale with "* 4" rather than "<< 2": the product comes from a signed
 * difference and may be negative, and left-shifting a negative signed
 * value is undefined behaviour in C. */
01627 tmp12 = (MULTIPLY16H( wsptr[0] - wsptr[1], FIX_1_414213562_A) * 4) - tmp13;
01628
01629 tmp0 = tmp10 + tmp13;
01630 tmp3 = tmp10 - tmp13;
01631 tmp1 = tmp11 + tmp12;
01632 tmp2 = tmp11 - tmp12;
01633
01634
/* Second half: built from wsptr[4..7]. */
01635
01636
01637
01638
01639
01640 z13 = wsptr[4] + wsptr[5];
01641 z10 = wsptr[4] - wsptr[5];
01642 z11 = wsptr[6] + wsptr[7];
01643 z12 = wsptr[6] - wsptr[7];
01644
01645 tmp7 = z11 + z13;
01646 tmp11 = MULTIPLY16H(z11 - z13, FIX_1_414213562);
01647
01648 z5 = MULTIPLY16H(z10 + z12, FIX_1_847759065);
01649 tmp10 = MULTIPLY16H(z12, FIX_1_082392200) - z5;
01650 tmp12 = MULTIPLY16H(z10, FIX_2_613125930) + z5;
01651
/* "* 8" instead of "<< 3" for the same reason as above: tmp10..tmp12
 * can be negative. */
01652 tmp6 = (tmp12 * 8) - tmp7;
01653 tmp5 = (tmp11 * 8) - tmp6;
01654 tmp4 = (tmp10 * 8) + tmp5;
01655
/* Accumulate the eight results into the output, one per stride step. */
01656
01657 outptr[0*output_stride]+= DESCALE(tmp0 + tmp7, 3);
01658 outptr[1*output_stride]+= DESCALE(tmp1 + tmp6, 3);
01659 outptr[2*output_stride]+= DESCALE(tmp2 + tmp5, 3);
01660 outptr[3*output_stride]+= DESCALE(tmp3 - tmp4, 3);
01661 outptr[4*output_stride]+= DESCALE(tmp3 + tmp4, 3);
01662 outptr[5*output_stride]+= DESCALE(tmp2 - tmp5, 3);
01663 outptr[6*output_stride]+= DESCALE(tmp1 - tmp6, 3);
01664 outptr[7*output_stride]+= DESCALE(tmp0 - tmp7, 3);
01665 outptr++;
01666
01667 wsptr += DCTSIZE;
01668 }
01669 }
01670
01671 #else
01672
/**
 * Row pass of the inverse DCT, MMX inline-assembly version (compiled when
 * HAVE_MMX is set).
 *
 * Each loop iteration loads two groups of four quadwords from the
 * workspace (byte offsets DCTSIZE_S*k*2 and DCTSIZE_S*k*2+DCTSIZE_S,
 * k = 0..3), interleaves them with punpck*, and adds the rounded,
 * >>3-descaled results to eight output rows (8 bytes each) located
 * at byte offsets 0..7 times the stride from the output pointer.
 * It then advances the workspace pointer by DCTSIZE_S*2*4 bytes and the
 * output pointer by 8 bytes. The loop body runs before the counter is
 * tested (dec/jnz), so cnt must be at least 1.
 *
 * @param workspace      input coefficients (bound to REG_S)
 * @param output_adr     accumulation buffer, updated in place (bound to REG_D)
 * @param output_stride  output row distance in int16_t elements; it is
 *                       converted to bytes with sizeof(short) for REG_a
 * @param cnt            number of loop iterations (bound to REG_c)
 *
 * temps is stack scratch for the asm; only its first two quadwords are
 * referenced (0*8+%3 and 1*8+%3).
 *
 * NOTE(review): cnt is an int bound to REG_c; if REG_c names a 64-bit
 * register the upper half is not guaranteed to be defined -- confirm.
 * NOTE(review): the MMX registers are not listed as clobbers and no emms
 * is issued here -- presumably handled by the caller; verify.
 */
01673 static void row_idct_mmx (DCTELEM* workspace,
01674 int16_t* output_adr, int output_stride, int cnt)
01675 {
01676 uint64_t __attribute__((aligned(8))) temps[4];
01677 __asm__ volatile(
/* REG_d = 3 * REG_a, i.e. three output rows in bytes */
01678 "lea (%%"REG_a",%%"REG_a",2), %%"REG_d" \n\t"
01679 "1: \n\t"
/* first group of four quadwords, interleaved with punpck* */
01680 "movq "DCTSIZE_S"*0*2(%%"REG_S"), %%mm0 \n\t"
01681
01682
01683 "movq "DCTSIZE_S"*1*2(%%"REG_S"), %%mm1 \n\t"
01684 "movq %%mm0, %%mm4 \n\t"
01685
01686 "movq "DCTSIZE_S"*2*2(%%"REG_S"), %%mm2 \n\t"
01687 "punpcklwd %%mm1, %%mm0 \n\t"
01688
01689 "movq "DCTSIZE_S"*3*2(%%"REG_S"), %%mm3 \n\t"
01690 "punpckhwd %%mm1, %%mm4 \n\t"
01691
01692
01693 "movq %%mm2, %%mm7 \n\t"
01694 "punpcklwd %%mm3, %%mm2 \n\t"
01695
01696 "movq %%mm0, %%mm6 \n\t"
01697 "punpckldq %%mm2, %%mm0 \n\t"
01698
01699 "punpckhdq %%mm2, %%mm6 \n\t"
01700 "movq %%mm0, %%mm5 \n\t"
01701
01702 "punpckhwd %%mm3, %%mm7 \n\t"
01703 "psubw %%mm6, %%mm0 \n\t"
01704
01705 "pmulhw "MANGLE(MM_FIX_1_414213562_A)", %%mm0 \n\t"
01706 "movq %%mm4, %%mm2 \n\t"
01707
01708 "punpckldq %%mm7, %%mm4 \n\t"
01709 "paddw %%mm6, %%mm5 \n\t"
01710
01711 "punpckhdq %%mm7, %%mm2 \n\t"
01712 "movq %%mm4, %%mm1 \n\t"
01713
01714 "psllw $2, %%mm0 \n\t"
01715 "paddw %%mm2, %%mm4 \n\t"
01716
/* second group of four quadwords, DCTSIZE_S bytes further on */
01717 "movq "DCTSIZE_S"*0*2+"DCTSIZE_S"(%%"REG_S"), %%mm3 \n\t"
01718 "psubw %%mm2, %%mm1 \n\t"
01719
01720 "movq "DCTSIZE_S"*1*2+"DCTSIZE_S"(%%"REG_S"), %%mm2 \n\t"
01721 "psubw %%mm5, %%mm0 \n\t"
01722
01723 "movq %%mm4, %%mm6 \n\t"
01724 "paddw %%mm5, %%mm4 \n\t"
01725
01726 "psubw %%mm5, %%mm6 \n\t"
01727 "movq %%mm1, %%mm7 \n\t"
01728
01729 "movq "DCTSIZE_S"*2*2+"DCTSIZE_S"(%%"REG_S"), %%mm5 \n\t"
01730 "paddw %%mm0, %%mm1 \n\t"
01731
/* spill two intermediates to temps[0] and temps[1] */
01732 "movq %%mm4, 0*8+%3 \n\t"
01733 "movq %%mm3, %%mm4 \n\t"
01734
01735 "movq %%mm6, 1*8+%3 \n\t"
01736 "punpcklwd %%mm2, %%mm3 \n\t"
01737
01738
01739 "movq "DCTSIZE_S"*3*2+"DCTSIZE_S"(%%"REG_S"), %%mm6 \n\t"
01740 "punpckhwd %%mm2, %%mm4 \n\t"
01741
01742 "movq %%mm5, %%mm2 \n\t"
01743 "punpcklwd %%mm6, %%mm5 \n\t"
01744
01745 "psubw %%mm0, %%mm7 \n\t"
01746 "punpckhwd %%mm6, %%mm2 \n\t"
01747
01748 "movq %%mm3, %%mm0 \n\t"
01749 "punpckldq %%mm5, %%mm3 \n\t"
01750
01751 "punpckhdq %%mm5, %%mm0 \n\t"
01752 "movq %%mm4, %%mm5 \n\t"
01753
01754
01755 "movq %%mm3, %%mm6 \n\t"
01756 "punpckldq %%mm2, %%mm4 \n\t"
01757
01758 "psubw %%mm0, %%mm3 \n\t"
01759 "punpckhdq %%mm2, %%mm5 \n\t"
01760
01761 "paddw %%mm0, %%mm6 \n\t"
01762 "movq %%mm4, %%mm2 \n\t"
01763
01764 "movq %%mm3, %%mm0 \n\t"
01765 "psubw %%mm5, %%mm4 \n\t"
01766
01767 "pmulhw "MANGLE(MM_FIX_2_613125930)", %%mm0 \n\t"
01768 "paddw %%mm4, %%mm3 \n\t"
01769
01770 "pmulhw "MANGLE(MM_FIX_1_847759065)", %%mm3 \n\t"
01771 "paddw %%mm5, %%mm2 \n\t"
01772
01773 "pmulhw "MANGLE(MM_FIX_1_082392200)", %%mm4 \n\t"
01774 "movq %%mm2, %%mm5 \n\t"
01775
01776 "psubw %%mm6, %%mm2 \n\t"
01777 "paddw %%mm6, %%mm5 \n\t"
01778
01779 "pmulhw "MANGLE(MM_FIX_1_414213562)", %%mm2 \n\t"
01780 "paddw %%mm3, %%mm0 \n\t"
01781
01782 "psllw $3, %%mm0 \n\t"
01783 "psubw %%mm3, %%mm4 \n\t"
01784
01785 "movq 0*8+%3, %%mm6 \n\t"
01786 "movq %%mm1, %%mm3 \n\t"
01787
01788 "psllw $3, %%mm4 \n\t"
01789 "psubw %%mm5, %%mm0 \n\t"
01790
01791 "psllw $3, %%mm2 \n\t"
01792 "paddw %%mm0, %%mm1 \n\t"
01793
01794 "psubw %%mm0, %%mm2 \n\t"
01795 "psubw %%mm0, %%mm3 \n\t"
01796
01797 "paddw %%mm2, %%mm4 \n\t"
01798 "movq %%mm7, %%mm0 \n\t"
01799
01800 "paddw %%mm2, %%mm7 \n\t"
01801 "psubw %%mm2, %%mm0 \n\t"
01802
/* mm2 = rounding constant added before each psraw $3 below */
01803 "movq "MANGLE(MM_DESCALE_RND)", %%mm2 \n\t"
01804 "psubw %%mm5, %%mm6 \n\t"
01805
01806 "paddw 0*8+%3, %%mm5 \n\t"
01807 "paddw %%mm2, %%mm1 \n\t"
01808
01809 "paddw %%mm2, %%mm5 \n\t"
01810 "psraw $3, %%mm1 \n\t"
01811
01812 "paddw %%mm2, %%mm7 \n\t"
01813 "psraw $3, %%mm5 \n\t"
01814
/* accumulate into output rows 0, 1 and 2 and store them back */
01815 "paddw (%%"REG_D"), %%mm5 \n\t"
01816 "psraw $3, %%mm7 \n\t"
01817
01818 "paddw (%%"REG_D",%%"REG_a",), %%mm1 \n\t"
01819 "paddw %%mm2, %%mm0 \n\t"
01820
01821 "paddw (%%"REG_D",%%"REG_a",2), %%mm7 \n\t"
01822 "paddw %%mm2, %%mm3 \n\t"
01823
01824 "movq %%mm5, (%%"REG_D") \n\t"
01825 "paddw %%mm2, %%mm6 \n\t"
01826
01827 "movq %%mm1, (%%"REG_D",%%"REG_a",) \n\t"
01828 "psraw $3, %%mm0 \n\t"
01829
01830 "movq %%mm7, (%%"REG_D",%%"REG_a",2) \n\t"
/* REG_D now points at output row 3; the addresses below are therefore
 * row 5 (D,a,2), row 6 (D,d), row 7 (D,a,4), row 3 (D) and row 4 (D,a) */
01831 "add %%"REG_d", %%"REG_D" \n\t"
01832
01833 "movq 1*8+%3, %%mm5 \n\t"
01834 "psraw $3, %%mm3 \n\t"
01835
01836 "paddw (%%"REG_D",%%"REG_a",2), %%mm0 \n\t"
01837 "psubw %%mm4, %%mm5 \n\t"
01838
01839 "paddw (%%"REG_D",%%"REG_d",), %%mm3 \n\t"
01840 "psraw $3, %%mm6 \n\t"
01841
01842 "paddw 1*8+%3, %%mm4 \n\t"
01843 "paddw %%mm2, %%mm5 \n\t"
01844
01845 "paddw (%%"REG_D",%%"REG_a",4), %%mm6 \n\t"
01846 "paddw %%mm2, %%mm4 \n\t"
01847
01848 "movq %%mm0, (%%"REG_D",%%"REG_a",2) \n\t"
01849 "psraw $3, %%mm5 \n\t"
01850
01851 "paddw (%%"REG_D"), %%mm5 \n\t"
01852 "psraw $3, %%mm4 \n\t"
01853
01854 "paddw (%%"REG_D",%%"REG_a",), %%mm4 \n\t"
/* advance the workspace pointer past the data consumed this iteration */
01855 "add $"DCTSIZE_S"*2*4, %%"REG_S" \n\t"
01856
01857 "movq %%mm3, (%%"REG_D",%%"REG_d",) \n\t"
01858 "movq %%mm6, (%%"REG_D",%%"REG_a",4) \n\t"
01859 "movq %%mm5, (%%"REG_D") \n\t"
01860 "movq %%mm4, (%%"REG_D",%%"REG_a",) \n\t"
01861
/* back to row 0, then 8 bytes (four int16_t) to the right */
01862 "sub %%"REG_d", %%"REG_D" \n\t"
01863 "add $8, %%"REG_D" \n\t"
01864 "dec %%"REG_c" \n\t"
01865 "jnz 1b \n\t"
01866
01867 : "+S"(workspace), "+D"(output_adr), "+c"(cnt), "=o"(temps)
01868 : "a"(output_stride*sizeof(short))
/* "memory": the asm reads the workspace and read-modify-writes the output
 * buffer through REG_S/REG_D, neither of which is a declared memory
 * operand, so the compiler must not cache or reorder those accesses. */
01869 : "%"REG_d, "memory"
01870 );
01871 }
01872
01873 #endif // HAVE_MMX
01874
01875 #if !HAVE_MMX
01876
/**
 * Row pass of the forward DCT, portable C version (compiled when HAVE_MMX
 * is not set).
 *
 * The loop runs cnt*4 times. Each iteration reads the eight bytes
 * pixels[line_size*k], k = 0..7, writes eight results to dataptr[0..7],
 * then advances pixels by one byte and dataptr by DCTSIZE elements.
 *
 * @param data       output coefficients, DCTSIZE elements per iteration
 * @param pixels     input samples (read only)
 * @param line_size  distance, in bytes, between the eight samples read by
 *                   one iteration
 * @param cnt        number of groups of four iterations
 *
 * MULTIPLY16H, the FIX_* constants, DCTSIZE, DCTELEM and int_simd16_t are
 * defined elsewhere in this file.
 */
01877 static void row_fdct_c(DCTELEM *data, const uint8_t *pixels, int line_size, int cnt)
01878 {
01879 int_simd16_t tmp0, tmp1, tmp2, tmp3, tmp4, tmp5, tmp6, tmp7;
01880 int_simd16_t tmp10, tmp11, tmp12, tmp13;
01881 int_simd16_t z1, z2, z3, z4, z5, z11, z13;
01882 DCTELEM *dataptr;
01883
/* four iterations per unit of cnt */
01884 cnt*=4;
01885
01886
01887 dataptr = data;
01888 for (; cnt > 0; cnt--) {
/* sums and differences of the mirrored sample pairs (0,7) (1,6) (2,5) (3,4) */
01889 tmp0 = pixels[line_size*0] + pixels[line_size*7];
01890 tmp7 = pixels[line_size*0] - pixels[line_size*7];
01891 tmp1 = pixels[line_size*1] + pixels[line_size*6];
01892 tmp6 = pixels[line_size*1] - pixels[line_size*6];
01893 tmp2 = pixels[line_size*2] + pixels[line_size*5];
01894 tmp5 = pixels[line_size*2] - pixels[line_size*5];
01895 tmp3 = pixels[line_size*3] + pixels[line_size*4];
01896 tmp4 = pixels[line_size*3] - pixels[line_size*4];
01897
01898
/* Part built from the pair sums tmp0..tmp3; results go to dataptr[0..3]. */
01899
01900 tmp10 = tmp0 + tmp3;
01901 tmp13 = tmp0 - tmp3;
01902 tmp11 = tmp1 + tmp2;
01903 tmp12 = tmp1 - tmp2;
01904
01905
01906
01907 dataptr[2] = tmp10 + tmp11;
01908 dataptr[3] = tmp10 - tmp11;
01909
/* Scale with "* 4" rather than "<< 2": these sums of signed differences
 * can be negative, and left-shifting a negative signed value is undefined
 * behaviour in C. */
01910 z1 = MULTIPLY16H((tmp12 + tmp13) * 4, FIX_0_707106781);
01911 dataptr[0] = tmp13 + z1;
01912 dataptr[1] = tmp13 - z1;
01913
01914
/* Part built from the pair differences tmp4..tmp7; results go to
 * dataptr[4..7]. Same "* 4" instead of "<< 2" as above. */
01915
01916 tmp10 = (tmp4 + tmp5) * 4;
01917 tmp11 = (tmp5 + tmp6) * 4;
01918 tmp12 = (tmp6 + tmp7) * 4;
01919
01920 z5 = MULTIPLY16H(tmp10 - tmp12, FIX_0_382683433);
01921 z2 = MULTIPLY16H(tmp10, FIX_0_541196100) + z5;
01922 z4 = MULTIPLY16H(tmp12, FIX_1_306562965) + z5;
01923 z3 = MULTIPLY16H(tmp11, FIX_0_707106781);
01924
01925 z11 = tmp7 + z3;
01926 z13 = tmp7 - z3;
01927
01928 dataptr[4] = z13 + z2;
01929 dataptr[5] = z13 - z2;
01930 dataptr[6] = z11 + z4;
01931 dataptr[7] = z11 - z4;
01932
01933 pixels++;
01934 dataptr += DCTSIZE;
01935 }
01936 }
01937
01938 #else
01939
01940 static void row_fdct_mmx(DCTELEM *data, const uint8_t *pixels, int line_size, int cnt)
01941 {
01942 uint64_t __attribute__((aligned(8))) temps[4];
01943 __asm__ volatile(
01944 "lea (%%"REG_a",%%"REG_a",2), %%"REG_d" \n\t"
01945 "6: \n\t"
01946 "movd (%%"REG_S"), %%mm0 \n\t"
01947 "pxor %%mm7, %%mm7 \n\t"
01948
01949 "movd (%%"REG_S",%%"REG_a",), %%mm1 \n\t"
01950 "punpcklbw %%mm7, %%mm0 \n\t"
01951
01952 "movd (%%"REG_S",%%"REG_a",2), %%mm2 \n\t"
01953 "punpcklbw %%mm7, %%mm1 \n\t"
01954
01955 "punpcklbw %%mm7, %%mm2 \n\t"
01956 "add %%"REG_d", %%"REG_S" \n\t"
01957
01958 "movq %%mm0, %%mm5 \n\t"
01959
01960
01961 "movd (%%"REG_S",%%"REG_a",4), %%mm3 \n\t"
01962 "movq %%mm1, %%mm6 \n\t"
01963
01964 "movd (%%"REG_S",%%"REG_d",), %%mm4 \n\t"
01965 "punpcklbw %%mm7, %%mm3 \n\t"
01966
01967 "psubw %%mm3, %%mm5 \n\t"
01968 "punpcklbw %%mm7, %%mm4 \n\t"
01969
01970 "paddw %%mm3, %%mm0 \n\t"
01971 "psubw %%mm4, %%mm6 \n\t"
01972
01973 "movd (%%"REG_S",%%"REG_a",2), %%mm3 \n\t"
01974 "paddw %%mm4, %%mm1 \n\t"
01975
01976 "movq %%mm5, 0*8+%3 \n\t"
01977 "punpcklbw %%mm7, %%mm3 \n\t"
01978
01979 "movq %%mm6, 1*8+%3 \n\t"
01980 "movq %%mm2, %%mm4 \n\t"
01981
01982 "movd (%%"REG_S"), %%mm5 \n\t"
01983 "paddw %%mm3, %%mm2 \n\t"
01984
01985 "movd (%%"REG_S",%%"REG_a",), %%mm6 \n\t"
01986 "punpcklbw %%mm7, %%mm5 \n\t"
01987
01988 "psubw %%mm3, %%mm4 \n\t"
01989 "punpcklbw %%mm7, %%mm6 \n\t"
01990
01991 "movq %%mm5, %%mm3 \n\t"
01992 "paddw %%mm6, %%mm5 \n\t"
01993
01994 "psubw %%mm6, %%mm3 \n\t"
01995 "movq %%mm0, %%mm6 \n\t"
01996
01997 "movq %%mm1, %%mm7 \n\t"
01998 "psubw %%mm5, %%mm0 \n\t"
01999
02000 "psubw %%mm2, %%mm1 \n\t"
02001 "paddw %%mm2, %%mm7 \n\t"
02002
02003 "paddw %%mm0, %%mm1 \n\t"
02004 "movq %%mm7, %%mm2 \n\t"
02005
02006 "psllw $2, %%mm1 \n\t"
02007 "paddw %%mm5, %%mm6 \n\t"
02008
02009 "pmulhw "MANGLE(MM_FIX_0_707106781)", %%mm1 \n\t"
02010 "paddw %%mm6, %%mm7 \n\t"
02011
02012 "psubw %%mm2, %%mm6 \n\t"
02013 "movq %%mm0, %%mm5 \n\t"
02014
02015
02016 "movq %%mm7, %%mm2 \n\t"
02017 "punpcklwd %%mm6, %%mm7 \n\t"
02018
02019 "paddw %%mm1, %%mm0 \n\t"
02020 "punpckhwd %%mm6, %%mm2 \n\t"
02021
02022 "psubw %%mm1, %%mm5 \n\t"
02023 "movq %%mm0, %%mm6 \n\t"
02024
02025 "movq 1*8+%3, %%mm1 \n\t"
02026 "punpcklwd %%mm5, %%mm0 \n\t"
02027
02028 "punpckhwd %%mm5, %%mm6 \n\t"
02029 "movq %%mm0, %%mm5 \n\t"
02030
02031 "punpckldq %%mm7, %%mm0 \n\t"
02032 "paddw %%mm4, %%mm3 \n\t"
02033
02034 "punpckhdq %%mm7, %%mm5 \n\t"
02035 "movq %%mm6, %%mm7 \n\t"
02036
02037 "movq %%mm0, "DCTSIZE_S"*0*2(%%"REG_D") \n\t"
02038 "punpckldq %%mm2, %%mm6 \n\t"
02039
02040 "movq %%mm5, "DCTSIZE_S"*1*2(%%"REG_D") \n\t"
02041 "punpckhdq %%mm2, %%mm7 \n\t"
02042
02043 "movq %%mm6, "DCTSIZE_S"*2*2(%%"REG_D") \n\t"
02044 "paddw %%mm1, %%mm4 \n\t"
02045
02046 "movq %%mm7, "DCTSIZE_S"*3*2(%%"REG_D") \n\t"
02047 "psllw $2, %%mm3 \n\t"
02048
02049 "movq 0*8+%3, %%mm2 \n\t"
02050 "psllw $2, %%mm4 \n\t"
02051
02052 "pmulhw "MANGLE(MM_FIX_0_707106781)", %%mm4 \n\t"
02053 "paddw %%mm2, %%mm1 \n\t"
02054
02055 "psllw $2, %%mm1 \n\t"
02056 "movq %%mm3, %%mm0 \n\t"
02057
02058 "pmulhw "MANGLE(MM_FIX_0_541196100)", %%mm0 \n\t"
02059 "psubw %%mm1, %%mm3 \n\t"
02060
02061 "pmulhw "MANGLE(MM_FIX_0_382683433)", %%mm3 \n\t"
02062 "movq %%mm2, %%mm5 \n\t"
02063
02064 "pmulhw "MANGLE(MM_FIX_1_306562965)", %%mm1 \n\t"
02065 "psubw %%mm4, %%mm2 \n\t"
02066
02067 "paddw %%mm4, %%mm5 \n\t"
02068 "movq %%mm2, %%mm6 \n\t"
02069
02070 "paddw %%mm3, %%mm0 \n\t"
02071 "movq %%mm5, %%mm7 \n\t"
02072
02073 "paddw %%mm0, %%mm2 \n\t"
02074 "psubw %%mm0, %%mm6 \n\t"
02075
02076 "movq %%mm2, %%mm4 \n\t"
02077 "paddw %%mm3, %%mm1 \n\t"
02078
02079
02080 "punpcklwd %%mm6, %%mm2 \n\t"
02081 "paddw %%mm1, %%mm5 \n\t"
02082
02083 "punpckhwd %%mm6, %%mm4 \n\t"
02084 "psubw %%mm1, %%mm7 \n\t"
02085
02086 "movq %%mm5, %%mm6 \n\t"
02087 "punpcklwd %%mm7, %%mm5 \n\t"
02088
02089 "punpckhwd %%mm7, %%mm6 \n\t"
02090 "movq %%mm2, %%mm7 \n\t"
02091
02092 "punpckldq %%mm5, %%mm2 \n\t"
02093 "sub %%"REG_d", %%"REG_S" \n\t"
02094
02095 "punpckhdq %%mm5, %%mm7 \n\t"
02096 "movq %%mm4, %%mm5 \n\t"
02097
02098 "movq %%mm2, "DCTSIZE_S"*0*2+"DCTSIZE_S"(%%"REG_D") \n\t"
02099 "punpckldq %%mm6, %%mm4 \n\t"
02100
02101 "movq %%mm7, "DCTSIZE_S"*1*2+"DCTSIZE_S"(%%"REG_D") \n\t"
02102 "punpckhdq %%mm6, %%mm5 \n\t"
02103
02104 "movq %%mm4, "DCTSIZE_S"*2*2+"DCTSIZE_S"(%%"REG_D") \n\t"
02105 "add $4, %%"REG_S" \n\t"
02106
02107 "movq %%mm5, "DCTSIZE_S"*3*2+"DCTSIZE_S"(%%"REG_D") \n\t"
02108 "add $"DCTSIZE_S"*2*4, %%"REG_D" \n\t"
02109 "dec %%"REG_c" \n\t"
02110 "jnz 6b \n\t"
02111
02112 : "+S"(pixels), "+D"(data), "+c"(cnt), "=o"(temps)
02113 : "a"(line_size)
02114 : "%"REG_d);
02115 }
02116
02117 #endif // HAVE_MMX