FFmpeg
 All Data Structures Namespaces Files Functions Variables Typedefs Enumerations Enumerator Macros Groups Pages
dct-test.c
Go to the documentation of this file.
1 /*
2  * (c) 2001 Fabrice Bellard
3  * 2007 Marc Hoffman <marc.hoffman@analog.com>
4  *
5  * This file is part of FFmpeg.
6  *
7  * FFmpeg is free software; you can redistribute it and/or
8  * modify it under the terms of the GNU Lesser General Public
9  * License as published by the Free Software Foundation; either
10  * version 2.1 of the License, or (at your option) any later version.
11  *
12  * FFmpeg is distributed in the hope that it will be useful,
13  * but WITHOUT ANY WARRANTY; without even the implied warranty of
14  * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
15  * Lesser General Public License for more details.
16  *
17  * You should have received a copy of the GNU Lesser General Public
18  * License along with FFmpeg; if not, write to the Free Software
19  * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
20  */
21 
22 /**
23  * @file
24  * DCT test (c) 2001 Fabrice Bellard
25  * Started from sample code by Juan J. Sierralta P.
26  */
27 
28 #include "config.h"
29 #include <stdlib.h>
30 #include <stdio.h>
31 #include <string.h>
32 #if HAVE_UNISTD_H
33 #include <unistd.h>
34 #endif
35 #include <math.h>
36 
37 #include "libavutil/cpu.h"
38 #include "libavutil/common.h"
39 #include "libavutil/lfg.h"
40 #include "libavutil/time.h"
41 
42 #include "dct.h"
43 #include "simple_idct.h"
44 #include "aandcttab.h"
45 #include "faandct.h"
46 #include "faanidct.h"
47 #include "x86/idct_xvid.h"
48 #include "dctref.h"
49 
/*
 * Prototypes of architecture-specific transforms referenced by the test
 * tables below.  Each takes a pointer to a block of int16_t coefficients.
 */

/* ARM */
void ff_j_rev_dct_arm(int16_t *data);
void ff_simple_idct_arm(int16_t *data);
void ff_simple_idct_armv5te(int16_t *data);
void ff_simple_idct_armv6(int16_t *data);
void ff_simple_idct_neon(int16_t *data);

/* ALTIVEC */
void ff_fdct_altivec(int16_t *block);

/* BFIN */
void ff_bfin_idct(int16_t *block);
void ff_bfin_fdct(int16_t *block);
63 
/**
 * Description of one transform implementation under test.
 *
 * Table entries are written positionally, so the member order
 * (name, func, format, mm_support, nonspec) must not change.
 */
struct algo {
    const char *name;               /* label printed in the report */
    void (*func)(int16_t *block);   /* transform, applied to the block in place */
    enum formats {                  /* input coefficient order, see permute() */
        NO_PERM,
        MMX_PERM,
        MMX_SIMPLE_PERM,
        SCALE_PERM,
        SSE2_PERM,
        PARTTRANS_PERM,
        TRANSPOSE_PERM
    } format;
    int mm_support;                 /* AV_CPU_FLAG_* bits required; 0 = always run */
    int nonspec;                    /* non-zero: missing the precision limits is not an error */
};

/* CPU feature bits; main() skips entries whose mm_support bits are not all set here. */
static int cpu_flags;
74 
75 static const struct algo fdct_tab[] = {
76  { "REF-DBL", ff_ref_fdct, NO_PERM },
77  { "FAAN", ff_faandct, NO_PERM },
78  { "IJG-AAN-INT", ff_fdct_ifast, SCALE_PERM },
79  { "IJG-LLM-INT", ff_jpeg_fdct_islow_8, NO_PERM },
80 
81 #if HAVE_MMX_INLINE
82  { "MMX", ff_fdct_mmx, NO_PERM, AV_CPU_FLAG_MMX },
83 #endif
84 #if HAVE_MMXEXT_INLINE
86 #endif
87 #if HAVE_SSE2_INLINE
89 #endif
90 
91 #if HAVE_ALTIVEC
92  { "altivecfdct", ff_fdct_altivec, NO_PERM, AV_CPU_FLAG_ALTIVEC },
93 #endif
94 
95 #if ARCH_BFIN
96  { "BFINfdct", ff_bfin_fdct, NO_PERM },
97 #endif
98 
99  { 0 }
100 };
101 
102 static void ff_prores_idct_wrap(int16_t *dst){
103  DECLARE_ALIGNED(16, static int16_t, qmat)[64];
104  int i;
105 
106  for(i=0; i<64; i++){
107  qmat[i]=4;
108  }
109  ff_prores_idct(dst, qmat);
110  for(i=0; i<64; i++) {
111  dst[i] -= 512;
112  }
113 }
#if ARCH_X86_64 && HAVE_MMX && HAVE_YASM
void ff_prores_idct_put_10_sse2(uint16_t *dst, int linesize,
                                int16_t *block, int16_t *qmat);

/**
 * Adapter that lets ff_prores_idct_put_10_sse2() be called through the
 * single-argument function pointer stored in struct algo.
 *
 * The input coefficients are copied to a scratch buffer, the put function
 * writes its result back into dst, and 512 is subtracted from each of the
 * 64 results.
 *
 * @param dst 64 coefficients, overwritten with the output
 */
static void ff_prores_idct_put_10_sse2_wrap(int16_t *dst)
{
    DECLARE_ALIGNED(16, static int16_t, qmat)[64];
    DECLARE_ALIGNED(16, static int16_t, tmp)[64];
    int i;

    for (i = 0; i < 64; i++) {
        qmat[i] = 4;
        tmp[i]  = dst[i];
    }

    /* The put function takes uint16_t *; convert explicitly rather than
     * relying on an implicit incompatible-pointer conversion.
     * Line size 16 -- presumably 8 samples of 2 bytes; confirm against the
     * assembly implementation. */
    ff_prores_idct_put_10_sse2((uint16_t *) dst, 16, tmp, qmat);

    for (i = 0; i < 64; i++)
        dst[i] -= 512;
}
#endif
134 
135 static const struct algo idct_tab[] = {
136  { "FAANI", ff_faanidct, NO_PERM },
137  { "REF-DBL", ff_ref_idct, NO_PERM },
138  { "INT", ff_j_rev_dct, MMX_PERM },
139  { "SIMPLE-C", ff_simple_idct_8, NO_PERM },
140  { "PR-C", ff_prores_idct_wrap, NO_PERM, 0, 1 },
141 
142 #if HAVE_MMX_INLINE
144  { "XVID-MMX", ff_idct_xvid_mmx, NO_PERM, AV_CPU_FLAG_MMX, 1 },
145 #endif
146 #if HAVE_MMXEXT_INLINE
147  { "XVID-MMXEXT", ff_idct_xvid_mmxext, NO_PERM, AV_CPU_FLAG_MMXEXT, 1 },
148 #endif
149 #if HAVE_SSE2_INLINE
150  { "XVID-SSE2", ff_idct_xvid_sse2, SSE2_PERM, AV_CPU_FLAG_SSE2, 1 },
151 #if ARCH_X86_64 && HAVE_YASM
152  { "PR-SSE2", ff_prores_idct_put_10_sse2_wrap, TRANSPOSE_PERM, AV_CPU_FLAG_SSE2, 1 },
153 #endif
154 #endif
155 
156 #if ARCH_BFIN
157  { "BFINidct", ff_bfin_idct, NO_PERM },
158 #endif
159 
160 #if ARCH_ARM
161  { "SIMPLE-ARM", ff_simple_idct_arm, NO_PERM },
162  { "INT-ARM", ff_j_rev_dct_arm, MMX_PERM },
163 #endif
164 #if HAVE_ARMV5TE
165  { "SIMPLE-ARMV5TE", ff_simple_idct_armv5te,NO_PERM, AV_CPU_FLAG_ARMV5TE },
166 #endif
167 #if HAVE_ARMV6
168  { "SIMPLE-ARMV6", ff_simple_idct_armv6, MMX_PERM, AV_CPU_FLAG_ARMV6 },
169 #endif
170 #if HAVE_NEON && ARCH_ARM
172 #endif
173 
174  { 0 }
175 };
176 
/* Shift applied in dct_error() when undoing the scaling of SCALE_PERM transforms. */
#define AANSCALE_BITS 12

/* Number of random blocks run through each implementation for the accuracy test. */
#define NB_ITS 20000
/* Number of transform calls per timing batch in the speed tests. */
#define NB_ITS_SPEED 50000

/* Destination index for each source coefficient under MMX_PERM;
 * filled in by idct_mmx_init(). */
static short idct_mmx_perm[64];

/* Destination index for each source coefficient under MMX_SIMPLE_PERM. */
static short idct_simple_mmx_perm[64] = {
    0x00, 0x08, 0x04, 0x09,   0x01, 0x0C, 0x05, 0x0D,
    0x10, 0x18, 0x14, 0x19,   0x11, 0x1C, 0x15, 0x1D,
    0x20, 0x28, 0x24, 0x29,   0x21, 0x2C, 0x25, 0x2D,
    0x12, 0x1A, 0x16, 0x1B,   0x13, 0x1E, 0x17, 0x1F,
    0x02, 0x0A, 0x06, 0x0B,   0x03, 0x0E, 0x07, 0x0F,
    0x30, 0x38, 0x34, 0x39,   0x31, 0x3C, 0x35, 0x3D,
    0x22, 0x2A, 0x26, 0x2B,   0x23, 0x2E, 0x27, 0x2F,
    0x32, 0x3A, 0x36, 0x3B,   0x33, 0x3E, 0x37, 0x3F,
};

/* Column reordering applied within each row of 8 under SSE2_PERM. */
static const uint8_t idct_sse2_row_perm[8] = { 0, 4, 1, 5, 2, 6, 3, 7 };
196 
197 static void idct_mmx_init(void)
198 {
199  int i;
200 
201  /* the mmx/mmxext idct uses a reordered input, so we patch scan tables */
202  for (i = 0; i < 64; i++) {
203  idct_mmx_perm[i] = (i & 0x38) | ((i & 6) >> 1) | ((i & 1) << 2);
204  }
205 }
206 
207 DECLARE_ALIGNED(16, static int16_t, block)[64];
208 DECLARE_ALIGNED(8, static int16_t, block1)[64];
209 
210 static void init_block(int16_t block[64], int test, int is_idct, AVLFG *prng, int vals)
211 {
212  int i, j;
213 
214  memset(block, 0, 64 * sizeof(*block));
215 
216  switch (test) {
217  case 0:
218  for (i = 0; i < 64; i++)
219  block[i] = (av_lfg_get(prng) % (2*vals)) -vals;
220  if (is_idct) {
221  ff_ref_fdct(block);
222  for (i = 0; i < 64; i++)
223  block[i] >>= 3;
224  }
225  break;
226  case 1:
227  j = av_lfg_get(prng) % 10 + 1;
228  for (i = 0; i < j; i++) {
229  int idx = av_lfg_get(prng) % 64;
230  block[idx] = av_lfg_get(prng) % (2*vals) -vals;
231  }
232  break;
233  case 2:
234  block[ 0] = av_lfg_get(prng) % (16*vals) - (8*vals);
235  block[63] = (block[0] & 1) ^ 1;
236  break;
237  }
238 }
239 
240 static void permute(int16_t dst[64], const int16_t src[64], int perm)
241 {
242  int i;
243 
244  if (perm == MMX_PERM) {
245  for (i = 0; i < 64; i++)
246  dst[idct_mmx_perm[i]] = src[i];
247  } else if (perm == MMX_SIMPLE_PERM) {
248  for (i = 0; i < 64; i++)
249  dst[idct_simple_mmx_perm[i]] = src[i];
250  } else if (perm == SSE2_PERM) {
251  for (i = 0; i < 64; i++)
252  dst[(i & 0x38) | idct_sse2_row_perm[i & 7]] = src[i];
253  } else if (perm == PARTTRANS_PERM) {
254  for (i = 0; i < 64; i++)
255  dst[(i & 0x24) | ((i & 3) << 3) | ((i >> 3) & 3)] = src[i];
256  } else if (perm == TRANSPOSE_PERM) {
257  for (i = 0; i < 64; i++)
258  dst[(i>>3) | ((i<<3)&0x38)] = src[i];
259  } else {
260  for (i = 0; i < 64; i++)
261  dst[i] = src[i];
262  }
263 }
264 
265 static int dct_error(const struct algo *dct, int test, int is_idct, int speed, const int bits)
266 {
267  void (*ref)(int16_t *block) = is_idct ? ff_ref_idct : ff_ref_fdct;
268  int it, i, scale;
269  int err_inf, v;
270  int64_t err2, ti, ti1, it1, err_sum = 0;
271  int64_t sysErr[64], sysErrMax = 0;
272  int maxout = 0;
273  int blockSumErrMax = 0, blockSumErr;
274  AVLFG prng;
275  const int vals=1<<bits;
276  double omse, ome;
277  int spec_err;
278 
279  av_lfg_init(&prng, 1);
280 
281  err_inf = 0;
282  err2 = 0;
283  for (i = 0; i < 64; i++)
284  sysErr[i] = 0;
285  for (it = 0; it < NB_ITS; it++) {
286  init_block(block1, test, is_idct, &prng, vals);
287  permute(block, block1, dct->format);
288 
289  dct->func(block);
290  emms_c();
291 
292  if (dct->format == SCALE_PERM) {
293  for (i = 0; i < 64; i++) {
294  scale = 8 * (1 << (AANSCALE_BITS + 11)) / ff_aanscales[i];
295  block[i] = (block[i] * scale) >> AANSCALE_BITS;
296  }
297  }
298 
299  ref(block1);
300  if (!strcmp(dct->name, "PR-SSE2"))
301  for (i = 0; i < 64; i++)
302  block1[i] = av_clip(block1[i], 4-512, 1019-512);
303 
304  blockSumErr = 0;
305  for (i = 0; i < 64; i++) {
306  int err = block[i] - block1[i];
307  err_sum += err;
308  v = abs(err);
309  if (v > err_inf)
310  err_inf = v;
311  err2 += v * v;
312  sysErr[i] += block[i] - block1[i];
313  blockSumErr += v;
314  if (abs(block[i]) > maxout)
315  maxout = abs(block[i]);
316  }
317  if (blockSumErrMax < blockSumErr)
318  blockSumErrMax = blockSumErr;
319  }
320  for (i = 0; i < 64; i++)
321  sysErrMax = FFMAX(sysErrMax, FFABS(sysErr[i]));
322 
323  for (i = 0; i < 64; i++) {
324  if (i % 8 == 0)
325  printf("\n");
326  printf("%7d ", (int) sysErr[i]);
327  }
328  printf("\n");
329 
330  omse = (double) err2 / NB_ITS / 64;
331  ome = (double) err_sum / NB_ITS / 64;
332 
333  spec_err = is_idct && (err_inf > 1 || omse > 0.02 || fabs(ome) > 0.0015);
334 
335  printf("%s %s: max_err=%d omse=%0.8f ome=%0.8f syserr=%0.8f maxout=%d blockSumErr=%d\n",
336  is_idct ? "IDCT" : "DCT", dct->name, err_inf,
337  omse, ome, (double) sysErrMax / NB_ITS,
338  maxout, blockSumErrMax);
339 
340  if (spec_err && !dct->nonspec)
341  return 1;
342 
343  if (!speed)
344  return 0;
345 
346  /* speed test */
347 
348  init_block(block, test, is_idct, &prng, vals);
349  permute(block1, block, dct->format);
350 
351  ti = av_gettime();
352  it1 = 0;
353  do {
354  for (it = 0; it < NB_ITS_SPEED; it++) {
355  memcpy(block, block1, sizeof(block));
356  dct->func(block);
357  }
358  emms_c();
359  it1 += NB_ITS_SPEED;
360  ti1 = av_gettime() - ti;
361  } while (ti1 < 1000000);
362 
363  printf("%s %s: %0.1f kdct/s\n", is_idct ? "IDCT" : "DCT", dct->name,
364  (double) it1 * 1000.0 / (double) ti1);
365 
366  return 0;
367 }
368 
371 
372 static void idct248_ref(uint8_t *dest, int linesize, int16_t *block)
373 {
374  static int init;
375  static double c8[8][8];
376  static double c4[4][4];
377  double block1[64], block2[64], block3[64];
378  double s, sum, v;
379  int i, j, k;
380 
381  if (!init) {
382  init = 1;
383 
384  for (i = 0; i < 8; i++) {
385  sum = 0;
386  for (j = 0; j < 8; j++) {
387  s = (i == 0) ? sqrt(1.0 / 8.0) : sqrt(1.0 / 4.0);
388  c8[i][j] = s * cos(M_PI * i * (j + 0.5) / 8.0);
389  sum += c8[i][j] * c8[i][j];
390  }
391  }
392 
393  for (i = 0; i < 4; i++) {
394  sum = 0;
395  for (j = 0; j < 4; j++) {
396  s = (i == 0) ? sqrt(1.0 / 4.0) : sqrt(1.0 / 2.0);
397  c4[i][j] = s * cos(M_PI * i * (j + 0.5) / 4.0);
398  sum += c4[i][j] * c4[i][j];
399  }
400  }
401  }
402 
403  /* butterfly */
404  s = 0.5 * sqrt(2.0);
405  for (i = 0; i < 4; i++) {
406  for (j = 0; j < 8; j++) {
407  block1[8 * (2 * i) + j] =
408  (block[8 * (2 * i) + j] + block[8 * (2 * i + 1) + j]) * s;
409  block1[8 * (2 * i + 1) + j] =
410  (block[8 * (2 * i) + j] - block[8 * (2 * i + 1) + j]) * s;
411  }
412  }
413 
414  /* idct8 on lines */
415  for (i = 0; i < 8; i++) {
416  for (j = 0; j < 8; j++) {
417  sum = 0;
418  for (k = 0; k < 8; k++)
419  sum += c8[k][j] * block1[8 * i + k];
420  block2[8 * i + j] = sum;
421  }
422  }
423 
424  /* idct4 */
425  for (i = 0; i < 8; i++) {
426  for (j = 0; j < 4; j++) {
427  /* top */
428  sum = 0;
429  for (k = 0; k < 4; k++)
430  sum += c4[k][j] * block2[8 * (2 * k) + i];
431  block3[8 * (2 * j) + i] = sum;
432 
433  /* bottom */
434  sum = 0;
435  for (k = 0; k < 4; k++)
436  sum += c4[k][j] * block2[8 * (2 * k + 1) + i];
437  block3[8 * (2 * j + 1) + i] = sum;
438  }
439  }
440 
441  /* clamp and store the result */
442  for (i = 0; i < 8; i++) {
443  for (j = 0; j < 8; j++) {
444  v = block3[8 * i + j];
445  if (v < 0) v = 0;
446  else if (v > 255) v = 255;
447  dest[i * linesize + j] = (int) rint(v);
448  }
449  }
450 }
451 
452 static void idct248_error(const char *name,
453  void (*idct248_put)(uint8_t *dest, int line_size,
454  int16_t *block),
455  int speed)
456 {
457  int it, i, it1, ti, ti1, err_max, v;
458  AVLFG prng;
459 
460  av_lfg_init(&prng, 1);
461 
462  /* just one test to see if code is correct (precision is less
463  important here) */
464  err_max = 0;
465  for (it = 0; it < NB_ITS; it++) {
466  /* XXX: use forward transform to generate values */
467  for (i = 0; i < 64; i++)
468  block1[i] = av_lfg_get(&prng) % 256 - 128;
469  block1[0] += 1024;
470 
471  for (i = 0; i < 64; i++)
472  block[i] = block1[i];
473  idct248_ref(img_dest1, 8, block);
474 
475  for (i = 0; i < 64; i++)
476  block[i] = block1[i];
477  idct248_put(img_dest, 8, block);
478 
479  for (i = 0; i < 64; i++) {
480  v = abs((int) img_dest[i] - (int) img_dest1[i]);
481  if (v == 255)
482  printf("%d %d\n", img_dest[i], img_dest1[i]);
483  if (v > err_max)
484  err_max = v;
485  }
486 #if 0
487  printf("ref=\n");
488  for(i=0;i<8;i++) {
489  int j;
490  for(j=0;j<8;j++) {
491  printf(" %3d", img_dest1[i*8+j]);
492  }
493  printf("\n");
494  }
495 
496  printf("out=\n");
497  for(i=0;i<8;i++) {
498  int j;
499  for(j=0;j<8;j++) {
500  printf(" %3d", img_dest[i*8+j]);
501  }
502  printf("\n");
503  }
504 #endif
505  }
506  printf("%s %s: err_inf=%d\n", 1 ? "IDCT248" : "DCT248", name, err_max);
507 
508  if (!speed)
509  return;
510 
511  ti = av_gettime();
512  it1 = 0;
513  do {
514  for (it = 0; it < NB_ITS_SPEED; it++) {
515  for (i = 0; i < 64; i++)
516  block[i] = block1[i];
517  idct248_put(img_dest, 8, block);
518  }
519  emms_c();
520  it1 += NB_ITS_SPEED;
521  ti1 = av_gettime() - ti;
522  } while (ti1 < 1000000);
523 
524  printf("%s %s: %0.1f kdct/s\n", 1 ? "IDCT248" : "DCT248", name,
525  (double) it1 * 1000.0 / (double) ti1);
526 }
527 
/**
 * Print the command-line usage summary to standard output.
 */
static void help(void)
{
    static const char usage[] =
        "dct-test [-i] [<test-number>] [<bits>]\n"
        "test-number 0 -> test with random matrixes\n"
        " 1 -> test with random sparse matrixes\n"
        " 2 -> do 3. test from mpeg4 std\n"
        "bits Number of time domain bits to use, 8 is default\n"
        "-i test IDCT implementations\n"
        "-4 test IDCT248 implementations\n"
        "-t speed test\n";

    fputs(usage, stdout);
}
539 
540 #if !HAVE_GETOPT
541 #include "compat/getopt.c"
542 #endif
543 
544 int main(int argc, char **argv)
545 {
546  int test_idct = 0, test_248_dct = 0;
547  int c, i;
548  int test = 1;
549  int speed = 0;
550  int err = 0;
551  int bits=8;
552 
554 
555  ff_ref_dct_init();
556  idct_mmx_init();
557 
558  for (;;) {
559  c = getopt(argc, argv, "ih4t");
560  if (c == -1)
561  break;
562  switch (c) {
563  case 'i':
564  test_idct = 1;
565  break;
566  case '4':
567  test_248_dct = 1;
568  break;
569  case 't':
570  speed = 1;
571  break;
572  default:
573  case 'h':
574  help();
575  return 0;
576  }
577  }
578 
579  if (optind < argc)
580  test = atoi(argv[optind]);
581  if(optind+1 < argc) bits= atoi(argv[optind+1]);
582 
583  printf("ffmpeg DCT/IDCT test\n");
584 
585  if (test_248_dct) {
586  idct248_error("SIMPLE-C", ff_simple_idct248_put, speed);
587  } else {
588  const struct algo *algos = test_idct ? idct_tab : fdct_tab;
589  for (i = 0; algos[i].name; i++)
590  if (!(~cpu_flags & algos[i].mm_support)) {
591  err |= dct_error(&algos[i], test, test_idct, speed, bits);
592  }
593  }
594 
595  if (err)
596  printf("Error: %d.\n", err);
597 
598  return !!err;
599 }