00001
00002
00003
00004
00005
00006
00007
00008
00009
00010
00011
00012
00013
00014
00015
00016
00017
00018
00019
00020
00021
00028 #include <stdlib.h>
00029 #include <stdio.h>
00030 #include <string.h>
00031 #include <sys/time.h>
00032 #include <unistd.h>
00033 #include <math.h>
00034
00035 #include "libavutil/cpu.h"
00036 #include "libavutil/common.h"
00037 #include "libavutil/lfg.h"
00038
00039 #include "simple_idct.h"
00040 #include "aandcttab.h"
00041 #include "faandct.h"
00042 #include "faanidct.h"
00043 #include "x86/idct_xvid.h"
00044 #include "dctref.h"
00045
00046 #undef printf
00047
00048 void ff_mmx_idct(DCTELEM *data);
00049 void ff_mmxext_idct(DCTELEM *data);
00050
00051 void odivx_idct_c(short *block);
00052
00053
00054 void ff_bfin_idct(DCTELEM *block);
00055 void ff_bfin_fdct(DCTELEM *block);
00056
00057
00058 void fdct_altivec(DCTELEM *block);
00059
00060
00061
00062 void ff_j_rev_dct_arm(DCTELEM *data);
00063 void ff_simple_idct_arm(DCTELEM *data);
00064 void ff_simple_idct_armv5te(DCTELEM *data);
00065 void ff_simple_idct_armv6(DCTELEM *data);
00066 void ff_simple_idct_neon(DCTELEM *data);
00067
00068 void ff_simple_idct_axp(DCTELEM *data);
00069
00070 struct algo {
00071 const char *name;
00072 enum { FDCT, IDCT } is_idct;
00073 void (* func) (DCTELEM *block);
00074 void (* ref) (DCTELEM *block);
00075 enum formattag { NO_PERM,MMX_PERM, MMX_SIMPLE_PERM, SCALE_PERM, SSE2_PERM, PARTTRANS_PERM } format;
00076 int mm_support;
00077 };
00078
00079 #ifndef FAAN_POSTSCALE
00080 #define FAAN_SCALE SCALE_PERM
00081 #else
00082 #define FAAN_SCALE NO_PERM
00083 #endif
00084
00085 static int cpu_flags;
00086
00087 struct algo algos[] = {
00088 {"REF-DBL", 0, ff_ref_fdct, ff_ref_fdct, NO_PERM},
00089 {"FAAN", 0, ff_faandct, ff_ref_fdct, FAAN_SCALE},
00090 {"FAANI", 1, ff_faanidct, ff_ref_idct, NO_PERM},
00091 {"IJG-AAN-INT", 0, fdct_ifast, ff_ref_fdct, SCALE_PERM},
00092 {"IJG-LLM-INT", 0, ff_jpeg_fdct_islow, ff_ref_fdct, NO_PERM},
00093 {"REF-DBL", 1, ff_ref_idct, ff_ref_idct, NO_PERM},
00094 {"INT", 1, j_rev_dct, ff_ref_idct, MMX_PERM},
00095 {"SIMPLE-C", 1, ff_simple_idct, ff_ref_idct, NO_PERM},
00096
00097 #if HAVE_MMX
00098 {"MMX", 0, ff_fdct_mmx, ff_ref_fdct, NO_PERM, AV_CPU_FLAG_MMX},
00099 #if HAVE_MMX2
00100 {"MMX2", 0, ff_fdct_mmx2, ff_ref_fdct, NO_PERM, AV_CPU_FLAG_MMX2},
00101 {"SSE2", 0, ff_fdct_sse2, ff_ref_fdct, NO_PERM, AV_CPU_FLAG_SSE2},
00102 #endif
00103
00104 #if CONFIG_GPL
00105 {"LIBMPEG2-MMX", 1, ff_mmx_idct, ff_ref_idct, MMX_PERM, AV_CPU_FLAG_MMX},
00106 {"LIBMPEG2-MMX2", 1, ff_mmxext_idct, ff_ref_idct, MMX_PERM, AV_CPU_FLAG_MMX2},
00107 #endif
00108 {"SIMPLE-MMX", 1, ff_simple_idct_mmx, ff_ref_idct, MMX_SIMPLE_PERM, AV_CPU_FLAG_MMX},
00109 {"XVID-MMX", 1, ff_idct_xvid_mmx, ff_ref_idct, NO_PERM, AV_CPU_FLAG_MMX},
00110 {"XVID-MMX2", 1, ff_idct_xvid_mmx2, ff_ref_idct, NO_PERM, AV_CPU_FLAG_MMX2},
00111 {"XVID-SSE2", 1, ff_idct_xvid_sse2, ff_ref_idct, SSE2_PERM, AV_CPU_FLAG_SSE2},
00112 #endif
00113
00114 #if HAVE_ALTIVEC
00115 {"altivecfdct", 0, fdct_altivec, ff_ref_fdct, NO_PERM, AV_CPU_FLAG_ALTIVEC},
00116 #endif
00117
00118 #if ARCH_BFIN
00119 {"BFINfdct", 0, ff_bfin_fdct, ff_ref_fdct, NO_PERM},
00120 {"BFINidct", 1, ff_bfin_idct, ff_ref_idct, NO_PERM},
00121 #endif
00122
00123 #if ARCH_ARM
00124 {"SIMPLE-ARM", 1, ff_simple_idct_arm, ff_ref_idct, NO_PERM },
00125 {"INT-ARM", 1, ff_j_rev_dct_arm, ff_ref_idct, MMX_PERM },
00126 #if HAVE_ARMV5TE
00127 {"SIMPLE-ARMV5TE", 1, ff_simple_idct_armv5te, ff_ref_idct, NO_PERM },
00128 #endif
00129 #if HAVE_ARMV6
00130 {"SIMPLE-ARMV6", 1, ff_simple_idct_armv6, ff_ref_idct, MMX_PERM },
00131 #endif
00132 #if HAVE_NEON
00133 {"SIMPLE-NEON", 1, ff_simple_idct_neon, ff_ref_idct, PARTTRANS_PERM },
00134 #endif
00135 #endif
00136
00137 #if ARCH_ALPHA
00138 {"SIMPLE-ALPHA", 1, ff_simple_idct_axp, ff_ref_idct, NO_PERM },
00139 #endif
00140
00141 { 0 }
00142 };
00143
00144 #define AANSCALE_BITS 12
00145
00146 uint8_t cropTbl[256 + 2 * MAX_NEG_CROP];
00147
/**
 * Read the current time with gettimeofday().
 *
 * @return seconds * 1000000 + microseconds of the value reported by
 *         gettimeofday(), as a 64-bit integer
 */
static int64_t gettime(void)
{
    struct timeval now;
    int64_t usec;

    gettimeofday(&now, NULL);

    /* Widen before multiplying so the seconds part cannot overflow. */
    usec  = (int64_t)now.tv_sec * 1000000;
    usec += now.tv_usec;
    return usec;
}
00154
/* Number of blocks used for the accuracy measurement. */
#define NB_ITS 20000
/* Number of transform calls between two clock reads in the speed measurement. */
#define NB_ITS_SPEED 50000

/* Coefficient permutation for MMX_PERM implementations; filled by idct_mmx_init(). */
static short idct_mmx_perm[64];

/* Coefficient permutation for MMX_SIMPLE_PERM implementations. */
static short idct_simple_mmx_perm[64] = {
    0x00, 0x08, 0x04, 0x09, 0x01, 0x0C, 0x05, 0x0D,
    0x10, 0x18, 0x14, 0x19, 0x11, 0x1C, 0x15, 0x1D,
    0x20, 0x28, 0x24, 0x29, 0x21, 0x2C, 0x25, 0x2D,
    0x12, 0x1A, 0x16, 0x1B, 0x13, 0x1E, 0x17, 0x1F,
    0x02, 0x0A, 0x06, 0x0B, 0x03, 0x0E, 0x07, 0x0F,
    0x30, 0x38, 0x34, 0x39, 0x31, 0x3C, 0x35, 0x3D,
    0x22, 0x2A, 0x26, 0x2B, 0x23, 0x2E, 0x27, 0x2F,
    0x32, 0x3A, 0x36, 0x3B, 0x33, 0x3E, 0x37, 0x3F,
};

/* Column order inside each row of 8 for SSE2_PERM implementations. */
static const uint8_t idct_sse2_row_perm[8] = { 0, 4, 1, 5, 2, 6, 3, 7 };

/**
 * Fill idct_mmx_perm[]: the row bits (0x38) of the index are kept, the two
 * upper column bits move down by one position and the lowest column bit
 * moves up to bit 2.
 */
static void idct_mmx_init(void)
{
    int n;

    for (n = 0; n < 64; n++) {
        int row_bits  = n & 0x38;
        int col_upper = (n & 6) >> 1;
        int col_low   = (n & 1) << 2;

        idct_mmx_perm[n] = row_bits | col_upper | col_low;
    }
}
00183
00184 DECLARE_ALIGNED(16, static DCTELEM, block)[64];
00185 DECLARE_ALIGNED(8, static DCTELEM, block1)[64];
00186 DECLARE_ALIGNED(8, static DCTELEM, block_org)[64];
00187
00188 static inline void mmx_emms(void)
00189 {
00190 #if HAVE_MMX
00191 if (cpu_flags & AV_CPU_FLAG_MMX)
00192 __asm__ volatile ("emms\n\t");
00193 #endif
00194 }
00195
00196 static void dct_error(const char *name, int is_idct,
00197 void (*fdct_func)(DCTELEM *block),
00198 void (*fdct_ref)(DCTELEM *block), int form, int test, const int bits)
00199 {
00200 int it, i, scale;
00201 int err_inf, v;
00202 int64_t err2, ti, ti1, it1;
00203 int64_t sysErr[64], sysErrMax=0;
00204 int maxout=0;
00205 int blockSumErrMax=0, blockSumErr;
00206 AVLFG prng;
00207 const int vals=1<<bits;
00208
00209 av_lfg_init(&prng, 1);
00210
00211 err_inf = 0;
00212 err2 = 0;
00213 for(i=0; i<64; i++) sysErr[i]=0;
00214 for(it=0;it<NB_ITS;it++) {
00215 for(i=0;i<64;i++)
00216 block1[i] = 0;
00217 switch(test){
00218 case 0:
00219 for(i=0;i<64;i++)
00220 block1[i] = (av_lfg_get(&prng) % (2*vals)) -vals;
00221 if (is_idct){
00222 ff_ref_fdct(block1);
00223
00224 for(i=0;i<64;i++)
00225 block1[i]>>=3;
00226 }
00227 break;
00228 case 1:{
00229 int num = av_lfg_get(&prng) % 10 + 1;
00230 for(i=0;i<num;i++)
00231 block1[av_lfg_get(&prng) % 64] = av_lfg_get(&prng) % (2*vals) -vals;
00232 }break;
00233 case 2:
00234 block1[0] = av_lfg_get(&prng) % (16*vals) - (8*vals);
00235 block1[63]= (block1[0]&1)^1;
00236 break;
00237 }
00238
00239 #if 0 // simulate mismatch control
00240 { int sum=0;
00241 for(i=0;i<64;i++)
00242 sum+=block1[i];
00243
00244 if((sum&1)==0) block1[63]^=1;
00245 }
00246 #endif
00247
00248 for(i=0; i<64; i++)
00249 block_org[i]= block1[i];
00250
00251 if (form == MMX_PERM) {
00252 for(i=0;i<64;i++)
00253 block[idct_mmx_perm[i]] = block1[i];
00254 } else if (form == MMX_SIMPLE_PERM) {
00255 for(i=0;i<64;i++)
00256 block[idct_simple_mmx_perm[i]] = block1[i];
00257
00258 } else if (form == SSE2_PERM) {
00259 for(i=0; i<64; i++)
00260 block[(i&0x38) | idct_sse2_row_perm[i&7]] = block1[i];
00261 } else if (form == PARTTRANS_PERM) {
00262 for(i=0; i<64; i++)
00263 block[(i&0x24) | ((i&3)<<3) | ((i>>3)&3)] = block1[i];
00264 } else {
00265 for(i=0; i<64; i++)
00266 block[i]= block1[i];
00267 }
00268 #if 0 // simulate mismatch control for tested IDCT but not the ref
00269 { int sum=0;
00270 for(i=0;i<64;i++)
00271 sum+=block[i];
00272
00273 if((sum&1)==0) block[63]^=1;
00274 }
00275 #endif
00276
00277 fdct_func(block);
00278 mmx_emms();
00279
00280 if (form == SCALE_PERM) {
00281 for(i=0; i<64; i++) {
00282 scale = 8*(1 << (AANSCALE_BITS + 11)) / ff_aanscales[i];
00283 block[i] = (block[i] * scale ) >> AANSCALE_BITS;
00284 }
00285 }
00286
00287 fdct_ref(block1);
00288
00289 blockSumErr=0;
00290 for(i=0;i<64;i++) {
00291 v = abs(block[i] - block1[i]);
00292 if (v > err_inf)
00293 err_inf = v;
00294 err2 += v * v;
00295 sysErr[i] += block[i] - block1[i];
00296 blockSumErr += v;
00297 if( abs(block[i])>maxout) maxout=abs(block[i]);
00298 }
00299 if(blockSumErrMax < blockSumErr) blockSumErrMax= blockSumErr;
00300 #if 0 // print different matrix pairs
00301 if(blockSumErr){
00302 printf("\n");
00303 for(i=0; i<64; i++){
00304 if((i&7)==0) printf("\n");
00305 printf("%4d ", block_org[i]);
00306 }
00307 for(i=0; i<64; i++){
00308 if((i&7)==0) printf("\n");
00309 printf("%4d ", block[i] - block1[i]);
00310 }
00311 }
00312 #endif
00313 }
00314 for(i=0; i<64; i++) sysErrMax= FFMAX(sysErrMax, FFABS(sysErr[i]));
00315
00316 for(i=0; i<64; i++){
00317 if(i%8==0) printf("\n");
00318 printf("%7d ", (int)sysErr[i]);
00319 }
00320 printf("\n");
00321
00322 printf("%s %s: err_inf=%d err2=%0.8f syserr=%0.8f maxout=%d blockSumErr=%d\n",
00323 is_idct ? "IDCT" : "DCT",
00324 name, err_inf, (double)err2 / NB_ITS / 64.0, (double)sysErrMax / NB_ITS, maxout, blockSumErrMax);
00325
00326
00327 for(i=0;i<64;i++)
00328 block1[i] = 0;
00329 switch(test){
00330 case 0:
00331 for(i=0;i<64;i++)
00332 block1[i] = av_lfg_get(&prng) % (2*vals) -vals;
00333 if (is_idct){
00334 ff_ref_fdct(block1);
00335
00336 for(i=0;i<64;i++)
00337 block1[i]>>=3;
00338 }
00339 break;
00340 case 1:{
00341 case 2:
00342 block1[0] = av_lfg_get(&prng) % (2*vals) -vals;
00343 block1[1] = av_lfg_get(&prng) % (2*vals) -vals;
00344 block1[2] = av_lfg_get(&prng) % (2*vals) -vals;
00345 block1[3] = av_lfg_get(&prng) % (2*vals) -vals;
00346 }break;
00347 }
00348
00349 if (form == MMX_PERM) {
00350 for(i=0;i<64;i++)
00351 block[idct_mmx_perm[i]] = block1[i];
00352 } else if(form == MMX_SIMPLE_PERM) {
00353 for(i=0;i<64;i++)
00354 block[idct_simple_mmx_perm[i]] = block1[i];
00355 } else {
00356 for(i=0; i<64; i++)
00357 block[i]= block1[i];
00358 }
00359
00360 ti = gettime();
00361 it1 = 0;
00362 do {
00363 for(it=0;it<NB_ITS_SPEED;it++) {
00364 for(i=0; i<64; i++)
00365 block[i]= block1[i];
00366
00367
00368 fdct_func(block);
00369 }
00370 it1 += NB_ITS_SPEED;
00371 ti1 = gettime() - ti;
00372 } while (ti1 < 1000000);
00373 mmx_emms();
00374
00375 printf("%s %s: %0.1f kdct/s\n",
00376 is_idct ? "IDCT" : "DCT",
00377 name, (double)it1 * 1000.0 / (double)ti1);
00378 }
00379
00380 DECLARE_ALIGNED(8, static uint8_t, img_dest)[64];
00381 DECLARE_ALIGNED(8, static uint8_t, img_dest1)[64];
00382
00383 static void idct248_ref(uint8_t *dest, int linesize, int16_t *block)
00384 {
00385 static int init;
00386 static double c8[8][8];
00387 static double c4[4][4];
00388 double block1[64], block2[64], block3[64];
00389 double s, sum, v;
00390 int i, j, k;
00391
00392 if (!init) {
00393 init = 1;
00394
00395 for(i=0;i<8;i++) {
00396 sum = 0;
00397 for(j=0;j<8;j++) {
00398 s = (i==0) ? sqrt(1.0/8.0) : sqrt(1.0/4.0);
00399 c8[i][j] = s * cos(M_PI * i * (j + 0.5) / 8.0);
00400 sum += c8[i][j] * c8[i][j];
00401 }
00402 }
00403
00404 for(i=0;i<4;i++) {
00405 sum = 0;
00406 for(j=0;j<4;j++) {
00407 s = (i==0) ? sqrt(1.0/4.0) : sqrt(1.0/2.0);
00408 c4[i][j] = s * cos(M_PI * i * (j + 0.5) / 4.0);
00409 sum += c4[i][j] * c4[i][j];
00410 }
00411 }
00412 }
00413
00414
00415 s = 0.5 * sqrt(2.0);
00416 for(i=0;i<4;i++) {
00417 for(j=0;j<8;j++) {
00418 block1[8*(2*i)+j] = (block[8*(2*i)+j] + block[8*(2*i+1)+j]) * s;
00419 block1[8*(2*i+1)+j] = (block[8*(2*i)+j] - block[8*(2*i+1)+j]) * s;
00420 }
00421 }
00422
00423
00424 for(i=0;i<8;i++) {
00425 for(j=0;j<8;j++) {
00426 sum = 0;
00427 for(k=0;k<8;k++)
00428 sum += c8[k][j] * block1[8*i+k];
00429 block2[8*i+j] = sum;
00430 }
00431 }
00432
00433
00434 for(i=0;i<8;i++) {
00435 for(j=0;j<4;j++) {
00436
00437 sum = 0;
00438 for(k=0;k<4;k++)
00439 sum += c4[k][j] * block2[8*(2*k)+i];
00440 block3[8*(2*j)+i] = sum;
00441
00442
00443 sum = 0;
00444 for(k=0;k<4;k++)
00445 sum += c4[k][j] * block2[8*(2*k+1)+i];
00446 block3[8*(2*j+1)+i] = sum;
00447 }
00448 }
00449
00450
00451 for(i=0;i<8;i++) {
00452 for(j=0;j<8;j++) {
00453 v = block3[8*i+j];
00454 if (v < 0)
00455 v = 0;
00456 else if (v > 255)
00457 v = 255;
00458 dest[i * linesize + j] = (int)rint(v);
00459 }
00460 }
00461 }
00462
00463 static void idct248_error(const char *name,
00464 void (*idct248_put)(uint8_t *dest, int line_size, int16_t *block))
00465 {
00466 int it, i, it1, ti, ti1, err_max, v;
00467
00468 AVLFG prng;
00469
00470 av_lfg_init(&prng, 1);
00471
00472
00473
00474 err_max = 0;
00475 for(it=0;it<NB_ITS;it++) {
00476
00477
00478 for(i=0;i<64;i++)
00479 block1[i] = av_lfg_get(&prng) % 256 - 128;
00480 block1[0] += 1024;
00481
00482 for(i=0; i<64; i++)
00483 block[i]= block1[i];
00484 idct248_ref(img_dest1, 8, block);
00485
00486 for(i=0; i<64; i++)
00487 block[i]= block1[i];
00488 idct248_put(img_dest, 8, block);
00489
00490 for(i=0;i<64;i++) {
00491 v = abs((int)img_dest[i] - (int)img_dest1[i]);
00492 if (v == 255)
00493 printf("%d %d\n", img_dest[i], img_dest1[i]);
00494 if (v > err_max)
00495 err_max = v;
00496 }
00497 #if 0
00498 printf("ref=\n");
00499 for(i=0;i<8;i++) {
00500 int j;
00501 for(j=0;j<8;j++) {
00502 printf(" %3d", img_dest1[i*8+j]);
00503 }
00504 printf("\n");
00505 }
00506
00507 printf("out=\n");
00508 for(i=0;i<8;i++) {
00509 int j;
00510 for(j=0;j<8;j++) {
00511 printf(" %3d", img_dest[i*8+j]);
00512 }
00513 printf("\n");
00514 }
00515 #endif
00516 }
00517 printf("%s %s: err_inf=%d\n",
00518 1 ? "IDCT248" : "DCT248",
00519 name, err_max);
00520
00521 ti = gettime();
00522 it1 = 0;
00523 do {
00524 for(it=0;it<NB_ITS_SPEED;it++) {
00525 for(i=0; i<64; i++)
00526 block[i]= block1[i];
00527
00528
00529 idct248_put(img_dest, 8, block);
00530 }
00531 it1 += NB_ITS_SPEED;
00532 ti1 = gettime() - ti;
00533 } while (ti1 < 1000000);
00534 mmx_emms();
00535
00536 printf("%s %s: %0.1f kdct/s\n",
00537 1 ? "IDCT248" : "DCT248",
00538 name, (double)it1 * 1000.0 / (double)ti1);
00539 }
00540
/*
 * Usage text.  The synopsis lists every option and positional argument
 * that main() accepts: -i, -4, the test number and the optional bit count.
 */
static const char usage_text[] =
    "dct-test [-i] [-4] [<test-number>] [<bits>]\n"
    "test-number 0 -> test with random matrixes\n"
    " 1 -> test with random sparse matrixes\n"
    " 2 -> do 3. test from mpeg4 std\n"
    "bits number of bits of the random test values, 8 is default\n"
    "-i test IDCT implementations\n"
    "-4 test IDCT248 implementations\n";

/**
 * Print the command line usage to stdout.
 */
static void help(void)
{
    printf("%s", usage_text);
}
00550
00551 int main(int argc, char **argv)
00552 {
00553 int test_idct = 0, test_248_dct = 0;
00554 int c,i;
00555 int test=1;
00556 int bits=8;
00557 cpu_flags = av_get_cpu_flags();
00558
00559 ff_ref_dct_init();
00560 idct_mmx_init();
00561
00562 for(i=0;i<256;i++) cropTbl[i + MAX_NEG_CROP] = i;
00563 for(i=0;i<MAX_NEG_CROP;i++) {
00564 cropTbl[i] = 0;
00565 cropTbl[i + MAX_NEG_CROP + 256] = 255;
00566 }
00567
00568 for(;;) {
00569 c = getopt(argc, argv, "ih4");
00570 if (c == -1)
00571 break;
00572 switch(c) {
00573 case 'i':
00574 test_idct = 1;
00575 break;
00576 case '4':
00577 test_248_dct = 1;
00578 break;
00579 default :
00580 case 'h':
00581 help();
00582 return 0;
00583 }
00584 }
00585
00586 if(optind <argc) test= atoi(argv[optind]);
00587 if(optind+1 < argc) bits= atoi(argv[optind+1]);
00588
00589 printf("ffmpeg DCT/IDCT test\n");
00590
00591 if (test_248_dct) {
00592 idct248_error("SIMPLE-C", ff_simple_idct248_put);
00593 } else {
00594 for (i=0;algos[i].name;i++)
00595 if (algos[i].is_idct == test_idct && !(~cpu_flags & algos[i].mm_support)) {
00596 dct_error (algos[i].name, algos[i].is_idct, algos[i].func, algos[i].ref, algos[i].format, test, bits);
00597 }
00598 }
00599 return 0;
00600 }