postprocess_altivec_template.c
1 /*
2  * AltiVec optimizations (C) 2004 Romain Dolbeau <romain@dolbeau.org>
3  *
4  * based on code by Copyright (C) 2001-2003 Michael Niedermayer (michaelni@gmx.at)
5  *
6  * This file is part of FFmpeg.
7  *
8  * FFmpeg is free software; you can redistribute it and/or modify
9  * it under the terms of the GNU General Public License as published by
10  * the Free Software Foundation; either version 2 of the License, or
11  * (at your option) any later version.
12  *
13  * FFmpeg is distributed in the hope that it will be useful,
14  * but WITHOUT ANY WARRANTY; without even the implied warranty of
15  * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
16  * GNU General Public License for more details.
17  *
18  * You should have received a copy of the GNU General Public License
19  * along with FFmpeg; if not, write to the Free Software
20  * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
21  */
22 
23 #include "libavutil/avutil.h"
24 #include "libavutil/mem_internal.h"
25 
26 #define ALTIVEC_TRANSPOSE_8x8_SHORT(src_a,src_b,src_c,src_d,src_e,src_f,src_g,src_h) \
27  do { \
28  __typeof__(src_a) tempA1, tempB1, tempC1, tempD1; \
29  __typeof__(src_a) tempE1, tempF1, tempG1, tempH1; \
30  __typeof__(src_a) tempA2, tempB2, tempC2, tempD2; \
31  __typeof__(src_a) tempE2, tempF2, tempG2, tempH2; \
32  tempA1 = vec_mergeh (src_a, src_e); \
33  tempB1 = vec_mergel (src_a, src_e); \
34  tempC1 = vec_mergeh (src_b, src_f); \
35  tempD1 = vec_mergel (src_b, src_f); \
36  tempE1 = vec_mergeh (src_c, src_g); \
37  tempF1 = vec_mergel (src_c, src_g); \
38  tempG1 = vec_mergeh (src_d, src_h); \
39  tempH1 = vec_mergel (src_d, src_h); \
40  tempA2 = vec_mergeh (tempA1, tempE1); \
41  tempB2 = vec_mergel (tempA1, tempE1); \
42  tempC2 = vec_mergeh (tempB1, tempF1); \
43  tempD2 = vec_mergel (tempB1, tempF1); \
44  tempE2 = vec_mergeh (tempC1, tempG1); \
45  tempF2 = vec_mergel (tempC1, tempG1); \
46  tempG2 = vec_mergeh (tempD1, tempH1); \
47  tempH2 = vec_mergel (tempD1, tempH1); \
48  src_a = vec_mergeh (tempA2, tempE2); \
49  src_b = vec_mergel (tempA2, tempE2); \
50  src_c = vec_mergeh (tempB2, tempF2); \
51  src_d = vec_mergel (tempB2, tempF2); \
52  src_e = vec_mergeh (tempC2, tempG2); \
53  src_f = vec_mergel (tempC2, tempG2); \
54  src_g = vec_mergeh (tempD2, tempH2); \
55  src_h = vec_mergel (tempD2, tempH2); \
56  } while (0)
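/* Editorial note: the macro above is an in-place 8x8 transpose of eight
 * vectors of 16-bit elements, done with three rounds of vec_mergeh/vec_mergel.
 * A minimal usage sketch, assuming eight rows are already in registers
 * (variable names are hypothetical):
 *
 *     vector signed short r0, r1, r2, r3, r4, r5, r6, r7;
 *     // ... load the eight rows ...
 *     ALTIVEC_TRANSPOSE_8x8_SHORT(r0, r1, r2, r3, r4, r5, r6, r7);
 *     // r0..r7 now hold what used to be the eight columns
 */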
57 
58 
59 static inline int vertClassify_altivec(uint8_t src[], int stride, PPContext *c) {
60  /*
61  This code makes no assumptions about src or stride.
62  One could avoid recomputing the perm
63  vector by assuming (stride % 16) == 0; unfortunately,
64  this is not always true.
65  */
66  short data_0 = ((c->nonBQP*c->ppMode.baseDcDiff)>>8) + 1;
67  DECLARE_ALIGNED(16, short, data)[8] =
68  {
69  data_0,
70  data_0 * 2 + 1,
71  c->QP * 2,
72  c->QP * 4
73  };
74  int numEq;
75  uint8_t *src2 = src;
76  vector signed short v_dcOffset;
77  vector signed short v2QP;
78  vector unsigned short v4QP;
79  vector unsigned short v_dcThreshold;
80  const int properStride = (stride % 16);
81  const int srcAlign = ((unsigned long)src2 % 16);
82  const int two_vectors = ((srcAlign > 8) || properStride) ? 1 : 0;
83  const vector signed int zero = vec_splat_s32(0);
84  const vector signed short mask = vec_splat_s16(1);
85  vector signed int v_numEq = vec_splat_s32(0);
86  vector signed short v_data = vec_ld(0, data);
87  vector signed short v_srcAss0, v_srcAss1, v_srcAss2, v_srcAss3,
88  v_srcAss4, v_srcAss5, v_srcAss6, v_srcAss7;
89 //FIXME avoid this mess if possible
90  register int j0 = 0,
91  j1 = stride,
92  j2 = 2 * stride,
93  j3 = 3 * stride,
94  j4 = 4 * stride,
95  j5 = 5 * stride,
96  j6 = 6 * stride,
97  j7 = 7 * stride;
98  vector unsigned char v_srcA0, v_srcA1, v_srcA2, v_srcA3,
99  v_srcA4, v_srcA5, v_srcA6, v_srcA7;
100 
101  v_dcOffset = vec_splat(v_data, 0);
102  v_dcThreshold = (vector unsigned short)vec_splat(v_data, 1);
103  v2QP = vec_splat(v_data, 2);
104  v4QP = (vector unsigned short)vec_splat(v_data, 3);
105 
106  src2 += stride * 4;
107 
108 #define LOAD_LINE(i) \
109  { \
110  vector unsigned char perm##i = vec_lvsl(j##i, src2); \
111  vector unsigned char v_srcA2##i; \
112  vector unsigned char v_srcA1##i = vec_ld(j##i, src2); \
113  if (two_vectors) \
114  v_srcA2##i = vec_ld(j##i + 16, src2); \
115  v_srcA##i = \
116  vec_perm(v_srcA1##i, v_srcA2##i, perm##i); \
117  v_srcAss##i = \
118  (vector signed short)vec_mergeh((vector signed char)zero, \
119  (vector signed char)v_srcA##i); }
120 
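/* Editorial note: LOAD_LINE is the usual AltiVec unaligned-load idiom.
 * vec_ld can only load from 16-byte aligned addresses, so the code loads the
 * aligned vector(s) covering the wanted bytes and uses the permute vector
 * returned by vec_lvsl to rotate them into place.  A minimal sketch of the
 * idiom in isolation (hypothetical helper, not part of this file):
 *
 *     static inline vector unsigned char load_unaligned(const uint8_t *p)
 *     {
 *         vector unsigned char align = vec_lvsl(0, p);
 *         vector unsigned char hi    = vec_ld(0, p);
 *         vector unsigned char lo    = vec_ld(15, p); // same quadword as hi if p is aligned
 *         return vec_perm(hi, lo, align);
 *     }
 *
 * Here only the low 8 source bytes are needed (they are widened to shorts
 * with vec_mergeh), so the second aligned load is performed only when those
 * bytes may straddle a 16-byte boundary (the two_vectors test above). */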
121 #define LOAD_LINE_ALIGNED(i) \
122  v_srcA##i = vec_ld(j##i, src2); \
123  v_srcAss##i = \
124  (vector signed short)vec_mergeh((vector signed char)zero, \
125  (vector signed char)v_srcA##i)
126 
127  /* Special-casing the aligned case is worthwhile, as all calls from
128  * the (transposed) horizontal deblocks will be aligned, in addition
129  * to the naturally aligned vertical deblocks. */
130  if (properStride && srcAlign) {
131  LOAD_LINE_ALIGNED(0);
132  LOAD_LINE_ALIGNED(1);
133  LOAD_LINE_ALIGNED(2);
134  LOAD_LINE_ALIGNED(3);
135  LOAD_LINE_ALIGNED(4);
136  LOAD_LINE_ALIGNED(5);
137  LOAD_LINE_ALIGNED(6);
138  LOAD_LINE_ALIGNED(7);
139  } else {
140  LOAD_LINE(0);
141  LOAD_LINE(1);
142  LOAD_LINE(2);
143  LOAD_LINE(3);
144  LOAD_LINE(4);
145  LOAD_LINE(5);
146  LOAD_LINE(6);
147  LOAD_LINE(7);
148  }
149 #undef LOAD_LINE
150 #undef LOAD_LINE_ALIGNED
151 
152 #define ITER(i, j) \
153  const vector signed short v_diff##i = \
154  vec_sub(v_srcAss##i, v_srcAss##j); \
155  const vector signed short v_sum##i = \
156  vec_add(v_diff##i, v_dcOffset); \
157  const vector signed short v_comp##i = \
158  (vector signed short)vec_cmplt((vector unsigned short)v_sum##i, \
159  v_dcThreshold); \
160  const vector signed short v_part##i = vec_and(mask, v_comp##i);
161 
162  {
163  ITER(0, 1)
164  ITER(1, 2)
165  ITER(2, 3)
166  ITER(3, 4)
167  ITER(4, 5)
168  ITER(5, 6)
169  ITER(6, 7)
170 
171  v_numEq = vec_sum4s(v_part0, v_numEq);
172  v_numEq = vec_sum4s(v_part1, v_numEq);
173  v_numEq = vec_sum4s(v_part2, v_numEq);
174  v_numEq = vec_sum4s(v_part3, v_numEq);
175  v_numEq = vec_sum4s(v_part4, v_numEq);
176  v_numEq = vec_sum4s(v_part5, v_numEq);
177  v_numEq = vec_sum4s(v_part6, v_numEq);
178  }
179 
180 #undef ITER
181 
182  v_numEq = vec_sums(v_numEq, zero);
183 
184  v_numEq = vec_splat(v_numEq, 3);
185  vec_ste(v_numEq, 0, &numEq);
186 
187  if (numEq > c->ppMode.flatnessThreshold){
188  const vector unsigned char mmoP1 = (const vector unsigned char)
189  {0x1f, 0x1f, 0x1f, 0x1f, 0x1f, 0x1f, 0x1f, 0x1f,
190  0x00, 0x01, 0x12, 0x13, 0x08, 0x09, 0x1A, 0x1B};
191  const vector unsigned char mmoP2 = (const vector unsigned char)
192  {0x04, 0x05, 0x16, 0x17, 0x0C, 0x0D, 0x1E, 0x1F,
193  0x1f, 0x1f, 0x1f, 0x1f, 0x1f, 0x1f, 0x1f, 0x1f};
194  const vector unsigned char mmoP = (const vector unsigned char)
195  vec_lvsl(8, (unsigned char*)0);
196 
197  vector signed short mmoL1 = vec_perm(v_srcAss0, v_srcAss2, mmoP1);
198  vector signed short mmoL2 = vec_perm(v_srcAss4, v_srcAss6, mmoP2);
199  vector signed short mmoL = vec_perm(mmoL1, mmoL2, mmoP);
200  vector signed short mmoR1 = vec_perm(v_srcAss5, v_srcAss7, mmoP1);
201  vector signed short mmoR2 = vec_perm(v_srcAss1, v_srcAss3, mmoP2);
202  vector signed short mmoR = vec_perm(mmoR1, mmoR2, mmoP);
203  vector signed short mmoDiff = vec_sub(mmoL, mmoR);
204  vector unsigned short mmoSum = (vector unsigned short)vec_add(mmoDiff, v2QP);
205 
206  if (vec_any_gt(mmoSum, v4QP))
207  return 0;
208  else
209  return 1;
210  }
211  else return 2;
212 }
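/* Editorial note: per column, the classification above counts, for each of
 * the 7 vertical neighbour pairs, whether the difference plus the DC offset
 * stays (as an unsigned value) below the DC threshold, and compares the total
 * with ppMode.flatnessThreshold.  A scalar sketch of the per-column count
 * (illustrative helper only, not part of libpostproc):
 *
 *     static int count_flat_pairs(const uint8_t *col, int stride,
 *                                 int dcOffset, unsigned dcThreshold)
 *     {
 *         int numEq = 0;
 *         for (int y = 0; y < 7; y++)
 *             numEq += (unsigned)(col[y * stride] - col[(y + 1) * stride]
 *                                 + dcOffset) < dcThreshold;
 *         return numEq;
 *     }
 *
 * The return value encodes the decision: 2 if the block is not flat enough,
 * 1 if it is flat and the extra range check against 2*QP/4*QP passes,
 * 0 if it is flat but that check fails. */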
213 
214 static inline void doVertLowPass_altivec(uint8_t *src, int stride, PPContext *c) {
215  /*
216  This code makes no assumptions about src or stride.
217  One could avoid recomputing the perm
218  vector by assuming (stride % 16) == 0; unfortunately,
219  this is not always true. Quite a lot of loads/stores
220  could be removed by assuming proper alignment of
221  src & stride :-(
222  */
223  uint8_t *src2 = src;
224  const vector signed int zero = vec_splat_s32(0);
225  const int properStride = (stride % 16);
226  const int srcAlign = ((unsigned long)src2 % 16);
227  DECLARE_ALIGNED(16, short, qp)[8] = {c->QP};
228  vector signed short vqp = vec_ld(0, qp);
229  vector signed short vb0, vb1, vb2, vb3, vb4, vb5, vb6, vb7, vb8, vb9;
230  vector unsigned char vbA0, av_uninit(vbA1), av_uninit(vbA2), av_uninit(vbA3), av_uninit(vbA4), av_uninit(vbA5), av_uninit(vbA6), av_uninit(vbA7), av_uninit(vbA8), vbA9;
231  vector unsigned char vbB0, av_uninit(vbB1), av_uninit(vbB2), av_uninit(vbB3), av_uninit(vbB4), av_uninit(vbB5), av_uninit(vbB6), av_uninit(vbB7), av_uninit(vbB8), vbB9;
232  vector unsigned char vbT0, vbT1, vbT2, vbT3, vbT4, vbT5, vbT6, vbT7, vbT8, vbT9;
233  vector unsigned char perml0, perml1, perml2, perml3, perml4,
234  perml5, perml6, perml7, perml8, perml9;
235  register int j0 = 0,
236  j1 = stride,
237  j2 = 2 * stride,
238  j3 = 3 * stride,
239  j4 = 4 * stride,
240  j5 = 5 * stride,
241  j6 = 6 * stride,
242  j7 = 7 * stride,
243  j8 = 8 * stride,
244  j9 = 9 * stride;
245 
246  vqp = vec_splat(vqp, 0);
247 
248  src2 += stride*3;
249 
250 #define LOAD_LINE(i) \
251  perml##i = vec_lvsl(i * stride, src2); \
252  vbA##i = vec_ld(i * stride, src2); \
253  vbB##i = vec_ld(i * stride + 16, src2); \
254  vbT##i = vec_perm(vbA##i, vbB##i, perml##i); \
255  vb##i = \
256  (vector signed short)vec_mergeh((vector unsigned char)zero, \
257  (vector unsigned char)vbT##i)
258 
259 #define LOAD_LINE_ALIGNED(i) \
260  vbT##i = vec_ld(j##i, src2); \
261  vb##i = \
262  (vector signed short)vec_mergeh((vector signed char)zero, \
263  (vector signed char)vbT##i)
264 
265  /* Special-casing the aligned case is worthwhile, as all calls from
266  * the (transposed) horizontal deblocks will be aligned, in addition
267  * to the naturally aligned vertical deblocks. */
268  if (properStride && srcAlign) {
269  LOAD_LINE_ALIGNED(0);
270  LOAD_LINE_ALIGNED(1);
271  LOAD_LINE_ALIGNED(2);
272  LOAD_LINE_ALIGNED(3);
273  LOAD_LINE_ALIGNED(4);
274  LOAD_LINE_ALIGNED(5);
275  LOAD_LINE_ALIGNED(6);
276  LOAD_LINE_ALIGNED(7);
277  LOAD_LINE_ALIGNED(8);
278  LOAD_LINE_ALIGNED(9);
279  } else {
280  LOAD_LINE(0);
281  LOAD_LINE(1);
282  LOAD_LINE(2);
283  LOAD_LINE(3);
284  LOAD_LINE(4);
285  LOAD_LINE(5);
286  LOAD_LINE(6);
287  LOAD_LINE(7);
288  LOAD_LINE(8);
289  LOAD_LINE(9);
290  }
291 #undef LOAD_LINE
292 #undef LOAD_LINE_ALIGNED
293  {
294  const vector unsigned short v_2 = vec_splat_u16(2);
295  const vector unsigned short v_4 = vec_splat_u16(4);
296 
297  const vector signed short v_diff01 = vec_sub(vb0, vb1);
298  const vector unsigned short v_cmp01 =
299  (const vector unsigned short) vec_cmplt(vec_abs(v_diff01), vqp);
300  const vector signed short v_first = vec_sel(vb1, vb0, v_cmp01);
301  const vector signed short v_diff89 = vec_sub(vb8, vb9);
302  const vector unsigned short v_cmp89 =
303  (const vector unsigned short) vec_cmplt(vec_abs(v_diff89), vqp);
304  const vector signed short v_last = vec_sel(vb8, vb9, v_cmp89);
305 
306  const vector signed short temp01 = vec_mladd(v_first, (vector signed short)v_4, vb1);
307  const vector signed short temp02 = vec_add(vb2, vb3);
308  const vector signed short temp03 = vec_add(temp01, (vector signed short)v_4);
309  const vector signed short v_sumsB0 = vec_add(temp02, temp03);
310 
311  const vector signed short temp11 = vec_sub(v_sumsB0, v_first);
312  const vector signed short v_sumsB1 = vec_add(temp11, vb4);
313 
314  const vector signed short temp21 = vec_sub(v_sumsB1, v_first);
315  const vector signed short v_sumsB2 = vec_add(temp21, vb5);
316 
317  const vector signed short temp31 = vec_sub(v_sumsB2, v_first);
318  const vector signed short v_sumsB3 = vec_add(temp31, vb6);
319 
320  const vector signed short temp41 = vec_sub(v_sumsB3, v_first);
321  const vector signed short v_sumsB4 = vec_add(temp41, vb7);
322 
323  const vector signed short temp51 = vec_sub(v_sumsB4, vb1);
324  const vector signed short v_sumsB5 = vec_add(temp51, vb8);
325 
326  const vector signed short temp61 = vec_sub(v_sumsB5, vb2);
327  const vector signed short v_sumsB6 = vec_add(temp61, v_last);
328 
329  const vector signed short temp71 = vec_sub(v_sumsB6, vb3);
330  const vector signed short v_sumsB7 = vec_add(temp71, v_last);
331 
332  const vector signed short temp81 = vec_sub(v_sumsB7, vb4);
333  const vector signed short v_sumsB8 = vec_add(temp81, v_last);
334 
335  const vector signed short temp91 = vec_sub(v_sumsB8, vb5);
336  const vector signed short v_sumsB9 = vec_add(temp91, v_last);
337 
338  #define COMPUTE_VR(i, j, k) \
339  const vector signed short temps1##i = \
340  vec_add(v_sumsB##i, v_sumsB##k); \
341  const vector signed short temps2##i = \
342  vec_mladd(vb##j, (vector signed short)v_2, temps1##i); \
343  const vector signed short vr##j = vec_sra(temps2##i, v_4)
344 
345  COMPUTE_VR(0, 1, 2);
346  COMPUTE_VR(1, 2, 3);
347  COMPUTE_VR(2, 3, 4);
348  COMPUTE_VR(3, 4, 5);
349  COMPUTE_VR(4, 5, 6);
350  COMPUTE_VR(5, 6, 7);
351  COMPUTE_VR(6, 7, 8);
352  COMPUTE_VR(7, 8, 9);
353 
354  const vector signed char neg1 = vec_splat_s8(-1);
355  const vector unsigned char permHH = (const vector unsigned char){0x00, 0x01, 0x02, 0x03, 0x04, 0x05, 0x06, 0x07,
356  0x18, 0x19, 0x1A, 0x1B, 0x1C, 0x1D, 0x1E, 0x1F};
357 
358 #define PACK_AND_STORE(i) \
359 { const vector unsigned char perms##i = \
360  vec_lvsr(i * stride, src2); \
361  const vector unsigned char vf##i = \
362  vec_packsu(vr##i, (vector signed short)zero); \
363  const vector unsigned char vg##i = \
364  vec_perm(vf##i, vbT##i, permHH); \
365  const vector unsigned char mask##i = \
366  vec_perm((vector unsigned char)zero, (vector unsigned char)neg1, perms##i); \
367  const vector unsigned char vg2##i = \
368  vec_perm(vg##i, vg##i, perms##i); \
369  const vector unsigned char svA##i = \
370  vec_sel(vbA##i, vg2##i, mask##i); \
371  const vector unsigned char svB##i = \
372  vec_sel(vg2##i, vbB##i, mask##i); \
373  vec_st(svA##i, i * stride, src2); \
374  vec_st(svB##i, i * stride + 16, src2);}
375 
376 #define PACK_AND_STORE_ALIGNED(i) \
377 { const vector unsigned char vf##i = \
378  vec_packsu(vr##i, (vector signed short)zero); \
379  const vector unsigned char vg##i = \
380  vec_perm(vf##i, vbT##i, permHH); \
381  vec_st(vg##i, i * stride, src2);}
382 
383  /* Special-casing the aligned case is worthwhile, as all calls from
384  * the (transposed) horizontal deblocks will be aligned, in addition
385  * to the naturally aligned vertical deblocks. */
386  if (properStride && srcAlign) {
387  PACK_AND_STORE_ALIGNED(1)
388  PACK_AND_STORE_ALIGNED(2)
389  PACK_AND_STORE_ALIGNED(3)
390  PACK_AND_STORE_ALIGNED(4)
391  PACK_AND_STORE_ALIGNED(5)
392  PACK_AND_STORE_ALIGNED(6)
393  PACK_AND_STORE_ALIGNED(7)
394  PACK_AND_STORE_ALIGNED(8)
395  } else {
396  PACK_AND_STORE(1)
397  PACK_AND_STORE(2)
398  PACK_AND_STORE(3)
399  PACK_AND_STORE(4)
400  PACK_AND_STORE(5)
401  PACK_AND_STORE(6)
402  PACK_AND_STORE(7)
403  PACK_AND_STORE(8)
404  }
405  #undef PACK_AND_STORE
406  #undef PACK_AND_STORE_ALIGNED
407  }
408 }
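/* Editorial note: the low pass above maintains a sliding window sum
 * (v_sumsB0..v_sumsB9) that is updated per output row by subtracting the
 * sample leaving the window and adding the one entering it, rather than
 * re-summing the whole window; COMPUTE_VR then forms
 * (sum[i] + sum[k] + 2*row[j]) >> 4 for each filtered row. */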
409 
410 
411 
412 static inline void doVertDefFilter_altivec(uint8_t src[], int stride, PPContext *c) {
413  /*
414  This code makes no assumptions about src or stride.
415  One could avoid recomputing the perm
416  vector by assuming (stride % 16) == 0; unfortunately,
417  this is not always true. Quite a lot of loads/stores
418  could be removed by assuming proper alignment of
419  src & stride :-(
420  */
421  uint8_t *src2 = src + stride*3;
422  const vector signed int zero = vec_splat_s32(0);
423  DECLARE_ALIGNED(16, short, qp)[8] = {8*c->QP};
424  vector signed short vqp = vec_splat(
425  (vector signed short)vec_ld(0, qp), 0);
426 
427 #define LOAD_LINE(i) \
428  const vector unsigned char perm##i = \
429  vec_lvsl(i * stride, src2); \
430  const vector unsigned char vbA##i = \
431  vec_ld(i * stride, src2); \
432  const vector unsigned char vbB##i = \
433  vec_ld(i * stride + 16, src2); \
434  const vector unsigned char vbT##i = \
435  vec_perm(vbA##i, vbB##i, perm##i); \
436  const vector signed short vb##i = \
437  (vector signed short)vec_mergeh((vector unsigned char)zero, \
438  (vector unsigned char)vbT##i)
439 
440  LOAD_LINE(1);
441  LOAD_LINE(2);
442  LOAD_LINE(3);
443  LOAD_LINE(4);
444  LOAD_LINE(5);
445  LOAD_LINE(6);
446  LOAD_LINE(7);
447  LOAD_LINE(8);
448 #undef LOAD_LINE
449 
450  const vector signed short v_1 = vec_splat_s16(1);
451  const vector signed short v_2 = vec_splat_s16(2);
452  const vector signed short v_5 = vec_splat_s16(5);
453  const vector signed short v_32 = vec_sl(v_1,
454  (vector unsigned short)v_5);
455  /* middle energy */
456  const vector signed short l3minusl6 = vec_sub(vb3, vb6);
457  const vector signed short l5minusl4 = vec_sub(vb5, vb4);
458  const vector signed short twotimes_l3minusl6 = vec_mladd(v_2, l3minusl6, (vector signed short)zero);
459  const vector signed short mE = vec_mladd(v_5, l5minusl4, twotimes_l3minusl6);
460  const vector signed short absmE = vec_abs(mE);
461  /* left & right energy */
462  const vector signed short l1minusl4 = vec_sub(vb1, vb4);
463  const vector signed short l3minusl2 = vec_sub(vb3, vb2);
464  const vector signed short l5minusl8 = vec_sub(vb5, vb8);
465  const vector signed short l7minusl6 = vec_sub(vb7, vb6);
466  const vector signed short twotimes_l1minusl4 = vec_mladd(v_2, l1minusl4, (vector signed short)zero);
467  const vector signed short twotimes_l5minusl8 = vec_mladd(v_2, l5minusl8, (vector signed short)zero);
468  const vector signed short lE = vec_mladd(v_5, l3minusl2, twotimes_l1minusl4);
469  const vector signed short rE = vec_mladd(v_5, l7minusl6, twotimes_l5minusl8);
470  /* d */
471  const vector signed short ddiff = vec_sub(absmE,
472  vec_min(vec_abs(lE),
473  vec_abs(rE)));
474  const vector signed short ddiffclamp = vec_max(ddiff, (vector signed short)zero);
475  const vector signed short dtimes64 = vec_mladd(v_5, ddiffclamp, v_32);
476  const vector signed short d = vec_sra(dtimes64, vec_splat_u16(6));
477  const vector signed short minusd = vec_sub((vector signed short)zero, d);
478  const vector signed short finald = vec_sel(minusd,
479  d,
480  vec_cmpgt(vec_sub((vector signed short)zero, mE),
481  (vector signed short)zero));
482  /* q */
483  const vector signed short qtimes2 = vec_sub(vb4, vb5);
484  /* for a shift right to behave like /2, we need to add one
485  to all negative integers */
486  const vector signed short rounddown = vec_sel((vector signed short)zero,
487  v_1,
488  vec_cmplt(qtimes2, (vector signed short)zero));
489  const vector signed short q = vec_sra(vec_add(qtimes2, rounddown), vec_splat_u16(1));
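    /* Editorial example of the rounding fix above: in C, -3 / 2 == -1
     * (truncation toward zero), while an arithmetic shift rounds toward
     * minus infinity, so -3 >> 1 == -2; adding 1 to negative values first
     * gives (-3 + 1) >> 1 == -1, matching the /2 result. */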
490  /* clamp */
491  const vector signed short dclamp_P1 = vec_max((vector signed short)zero, finald);
492  const vector signed short dclamp_P = vec_min(dclamp_P1, q);
493  const vector signed short dclamp_N1 = vec_min((vector signed short)zero, finald);
494  const vector signed short dclamp_N = vec_max(dclamp_N1, q);
495 
496  const vector signed short dclampedfinal = vec_sel(dclamp_N,
497  dclamp_P,
498  vec_cmpgt(q, (vector signed short)zero));
499  const vector signed short dornotd = vec_sel((vector signed short)zero,
500  dclampedfinal,
501  vec_cmplt(absmE, vqp));
502  /* add/subtract to l4 and l5 */
503  const vector signed short vb4minusd = vec_sub(vb4, dornotd);
504  const vector signed short vb5plusd = vec_add(vb5, dornotd);
505  /* finally, stores */
506  const vector unsigned char st4 = vec_packsu(vb4minusd, (vector signed short)zero);
507  const vector unsigned char st5 = vec_packsu(vb5plusd, (vector signed short)zero);
508 
509  const vector signed char neg1 = vec_splat_s8(-1);
510  const vector unsigned char permHH = (const vector unsigned char){0x00, 0x01, 0x02, 0x03, 0x04, 0x05, 0x06, 0x07,
511  0x18, 0x19, 0x1A, 0x1B, 0x1C, 0x1D, 0x1E, 0x1F};
512 
513 #define STORE(i) \
514 { const vector unsigned char perms##i = \
515  vec_lvsr(i * stride, src2); \
516  const vector unsigned char vg##i = \
517  vec_perm(st##i, vbT##i, permHH); \
518  const vector unsigned char mask##i = \
519  vec_perm((vector unsigned char)zero, (vector unsigned char)neg1, perms##i); \
520  const vector unsigned char vg2##i = \
521  vec_perm(vg##i, vg##i, perms##i); \
522  const vector unsigned char svA##i = \
523  vec_sel(vbA##i, vg2##i, mask##i); \
524  const vector unsigned char svB##i = \
525  vec_sel(vg2##i, vbB##i, mask##i); \
526  vec_st(svA##i, i * stride, src2); \
527  vec_st(svB##i, i * stride + 16, src2);}
528 
529  STORE(4)
530  STORE(5)
531 }
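/* Editorial note: for each column of rows l1..l8, the filter above evaluates
 *     middle energy  mE = 2*(l3 - l6) + 5*(l5 - l4)
 *     left   energy  lE = 2*(l1 - l4) + 5*(l3 - l2)
 *     right  energy  rE = 2*(l5 - l8) + 5*(l7 - l6)
 * builds a correction d = (5 * max(|mE| - min(|lE|, |rE|), 0) + 32) >> 6 with
 * the sign of -mE, clamps it against (l4 - l5)/2, and applies it (subtracted
 * from l4, added to l5) only where |mE| < 8*QP (the vqp vector). */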
532 
533 static inline void dering_altivec(uint8_t src[], int stride, PPContext *c, int leftborder, int rightborder, int topborder) {
534  const vector signed int vsint32_8 = vec_splat_s32(8);
535  const vector unsigned int vuint32_4 = vec_splat_u32(4);
536  const vector signed char neg1 = vec_splat_s8(-1);
537 
538  const vector unsigned char permA1 = (vector unsigned char)
539  {0x00, 0x01, 0x02, 0x10, 0x11, 0x12, 0x1F, 0x1F,
540  0x1F, 0x1F, 0x1F, 0x1F, 0x1F, 0x1F, 0x1F, 0x1F};
541  const vector unsigned char permA2 = (vector unsigned char)
542  {0x00, 0x01, 0x02, 0x03, 0x04, 0x05, 0x10, 0x11,
543  0x12, 0x1F, 0x1F, 0x1F, 0x1F, 0x1F, 0x1F, 0x1F};
544  const vector unsigned char permA1inc = (vector unsigned char)
545  {0x01, 0x01, 0x01, 0x01, 0x01, 0x01, 0x00, 0x00,
546  0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00};
547  const vector unsigned char permA2inc = (vector unsigned char)
548  {0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x01, 0x01,
549  0x01, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00};
550  const vector unsigned char magic = (vector unsigned char)
551  {0x01, 0x02, 0x01, 0x02, 0x04, 0x02, 0x01, 0x02,
552  0x01, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00};
553  const vector unsigned char extractPerm = (vector unsigned char)
554  {0x10, 0x10, 0x10, 0x01, 0x10, 0x10, 0x10, 0x01,
555  0x10, 0x10, 0x10, 0x01, 0x10, 0x10, 0x10, 0x01};
556  const vector unsigned char extractPermInc = (vector unsigned char)
557  {0x00, 0x00, 0x00, 0x01, 0x00, 0x00, 0x00, 0x01,
558  0x00, 0x00, 0x00, 0x01, 0x00, 0x00, 0x00, 0x01};
559  const vector unsigned char identity = vec_lvsl(0,(unsigned char *)0);
560  const vector unsigned char tenRight = (vector unsigned char)
561  {0x00, 0x10, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00,
562  0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00};
563  const vector unsigned char eightLeft = (vector unsigned char)
564  {0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00,
565  0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x08};
566 
567  /*
568  This code makes no assumptions about src or stride.
569  One could avoid recomputing the perm
570  vector by assuming (stride % 16) == 0; unfortunately,
571  this is not always true. Quite a lot of loads/stores
572  could be removed by assuming proper alignment of
573  src & stride :-(
574  */
575  uint8_t *srcCopy = src;
576  DECLARE_ALIGNED(16, uint8_t, dt)[16] = { DERING_THRESHOLD };
577  const vector signed int zero = vec_splat_s32(0);
578  vector unsigned char v_dt = vec_splat(vec_ld(0, dt), 0);
579 
580  if (topborder)
581  return;
582 
583 #define LOAD_LINE(i) \
584  const vector unsigned char perm##i = \
585  vec_lvsl(i * stride, srcCopy); \
586  vector unsigned char sA##i = vec_ld(i * stride, srcCopy); \
587  vector unsigned char sB##i = vec_ld(i * stride + 16, srcCopy); \
588  vector unsigned char src##i = vec_perm(sA##i, sB##i, perm##i)
589 
590  LOAD_LINE(0);
591  LOAD_LINE(1);
592  LOAD_LINE(2);
593  LOAD_LINE(3);
594  LOAD_LINE(4);
595  LOAD_LINE(5);
596  LOAD_LINE(6);
597  LOAD_LINE(7);
598  LOAD_LINE(8);
599  LOAD_LINE(9);
600 #undef LOAD_LINE
601 
602  vector unsigned char v_avg;
603  DECLARE_ALIGNED(16, signed int, S)[8];
604  DECLARE_ALIGNED(16, int, tQP2)[4] = { c->QP/2 + 1 };
605  vector signed int vQP2 = vec_ld(0, tQP2);
606  vQP2 = vec_splat(vQP2, 0);
607 
608  {
609  const vector unsigned char trunc_perm = (vector unsigned char)
610  {0x01, 0x02, 0x03, 0x04, 0x05, 0x06, 0x07, 0x08,
611  0x11, 0x12, 0x13, 0x14, 0x15, 0x16, 0x17, 0x18};
612  const vector unsigned char trunc_src12 = vec_perm(src1, src2, trunc_perm);
613  const vector unsigned char trunc_src34 = vec_perm(src3, src4, trunc_perm);
614  const vector unsigned char trunc_src56 = vec_perm(src5, src6, trunc_perm);
615  const vector unsigned char trunc_src78 = vec_perm(src7, src8, trunc_perm);
616 
617 #define EXTRACT(op) do { \
618  const vector unsigned char s_1 = vec_##op(trunc_src12, trunc_src34); \
619  const vector unsigned char s_2 = vec_##op(trunc_src56, trunc_src78); \
620  const vector unsigned char s_6 = vec_##op(s_1, s_2); \
621  const vector unsigned char s_8h = vec_mergeh(s_6, s_6); \
622  const vector unsigned char s_8l = vec_mergel(s_6, s_6); \
623  const vector unsigned char s_9 = vec_##op(s_8h, s_8l); \
624  const vector unsigned char s_9h = vec_mergeh(s_9, s_9); \
625  const vector unsigned char s_9l = vec_mergel(s_9, s_9); \
626  const vector unsigned char s_10 = vec_##op(s_9h, s_9l); \
627  const vector unsigned char s_10h = vec_mergeh(s_10, s_10); \
628  const vector unsigned char s_10l = vec_mergel(s_10, s_10); \
629  const vector unsigned char s_11 = vec_##op(s_10h, s_10l); \
630  const vector unsigned char s_11h = vec_mergeh(s_11, s_11); \
631  const vector unsigned char s_11l = vec_mergel(s_11, s_11); \
632  v_##op = vec_##op(s_11h, s_11l); \
633 } while (0)
634 
635  vector unsigned char v_min;
636  vector unsigned char v_max;
637  EXTRACT(min);
638  EXTRACT(max);
639 #undef EXTRACT
640 
641  if (vec_all_lt(vec_sub(v_max, v_min), v_dt))
642  return;
643 
644  v_avg = vec_avg(v_min, v_max);
645  }
646 
647  {
648  const vector unsigned short mask1 = (vector unsigned short)
649  {0x0001, 0x0002, 0x0004, 0x0008,
650  0x0010, 0x0020, 0x0040, 0x0080};
651  const vector unsigned short mask2 = (vector unsigned short)
652  {0x0100, 0x0200, 0x0000, 0x0000,
653  0x0000, 0x0000, 0x0000, 0x0000};
654 
655  const vector unsigned int vuint32_16 = vec_sl(vec_splat_u32(1), vec_splat_u32(4));
656  const vector unsigned int vuint32_1 = vec_splat_u32(1);
657 
658  vector signed int sumA2;
659  vector signed int sumB2;
660  vector signed int sum0, sum1, sum2, sum3, sum4;
661  vector signed int sum5, sum6, sum7, sum8, sum9;
662 
663 #define COMPARE(i) \
664  do { \
665  const vector unsigned char cmp = \
666  (vector unsigned char)vec_cmpgt(src##i, v_avg); \
667  const vector unsigned short cmpHi = \
668  (vector unsigned short)vec_mergeh(cmp, cmp); \
669  const vector unsigned short cmpLi = \
670  (vector unsigned short)vec_mergel(cmp, cmp); \
671  const vector signed short cmpHf = \
672  (vector signed short)vec_and(cmpHi, mask1); \
673  const vector signed short cmpLf = \
674  (vector signed short)vec_and(cmpLi, mask2); \
675  const vector signed int sump = vec_sum4s(cmpHf, zero); \
676  const vector signed int sumq = vec_sum4s(cmpLf, sump); \
677  sum##i = vec_sums(sumq, zero); \
678  } while (0)
679 
680  COMPARE(0);
681  COMPARE(1);
682  COMPARE(2);
683  COMPARE(3);
684  COMPARE(4);
685  COMPARE(5);
686  COMPARE(6);
687  COMPARE(7);
688  COMPARE(8);
689  COMPARE(9);
690 #undef COMPARE
691 
692  {
693  const vector signed int sump02 = vec_mergel(sum0, sum2);
694  const vector signed int sump13 = vec_mergel(sum1, sum3);
695  const vector signed int sumA = vec_mergel(sump02, sump13);
696 
697  const vector signed int sump46 = vec_mergel(sum4, sum6);
698  const vector signed int sump57 = vec_mergel(sum5, sum7);
699  const vector signed int sumB = vec_mergel(sump46, sump57);
700 
701  const vector signed int sump8A = vec_mergel(sum8, zero);
702  const vector signed int sump9B = vec_mergel(sum9, zero);
703  const vector signed int sumC = vec_mergel(sump8A, sump9B);
704 
705  const vector signed int tA = vec_sl(vec_nor(zero, sumA), vuint32_16);
706  const vector signed int tB = vec_sl(vec_nor(zero, sumB), vuint32_16);
707  const vector signed int tC = vec_sl(vec_nor(zero, sumC), vuint32_16);
708  const vector signed int t2A = vec_or(sumA, tA);
709  const vector signed int t2B = vec_or(sumB, tB);
710  const vector signed int t2C = vec_or(sumC, tC);
711  const vector signed int t3A = vec_and(vec_sra(t2A, vuint32_1),
712  vec_sl(t2A, vuint32_1));
713  const vector signed int t3B = vec_and(vec_sra(t2B, vuint32_1),
714  vec_sl(t2B, vuint32_1));
715  const vector signed int t3C = vec_and(vec_sra(t2C, vuint32_1),
716  vec_sl(t2C, vuint32_1));
717  const vector signed int yA = vec_and(t2A, t3A);
718  const vector signed int yB = vec_and(t2B, t3B);
719  const vector signed int yC = vec_and(t2C, t3C);
720 
721  const vector unsigned char strangeperm1 = vec_lvsl(4, (unsigned char*)0);
722  const vector unsigned char strangeperm2 = vec_lvsl(8, (unsigned char*)0);
723  const vector signed int sumAd4 = vec_perm(yA, yB, strangeperm1);
724  const vector signed int sumAd8 = vec_perm(yA, yB, strangeperm2);
725  const vector signed int sumBd4 = vec_perm(yB, yC, strangeperm1);
726  const vector signed int sumBd8 = vec_perm(yB, yC, strangeperm2);
727  const vector signed int sumAp = vec_and(yA,
728  vec_and(sumAd4,sumAd8));
729  const vector signed int sumBp = vec_and(yB,
730  vec_and(sumBd4,sumBd8));
731  sumA2 = vec_or(sumAp,
732  vec_sra(sumAp,
733  vuint32_16));
734  sumB2 = vec_or(sumBp,
735  vec_sra(sumBp,
736  vuint32_16));
737  }
738  vec_st(sumA2, 0, S);
739  vec_st(sumB2, 16, S);
740  }
741 
742  /* I'm not sure the following is actually faster
743  than straight, unvectorized C code :-( */
744 
745 #define F_INIT() \
746  vector unsigned char tenRightM = tenRight; \
747  vector unsigned char permA1M = permA1; \
748  vector unsigned char permA2M = permA2; \
749  vector unsigned char extractPermM = extractPerm
750 
751 #define F2(i, j, k, l) \
752  if (S[i] & (1 << (l+1))) { \
753  const vector unsigned char a_A = vec_perm(src##i, src##j, permA1M); \
754  const vector unsigned char a_B = vec_perm(a_A, src##k, permA2M); \
755  const vector signed int a_sump = \
756  (vector signed int)vec_msum(a_B, magic, (vector unsigned int)zero);\
757  vector signed int F = vec_sr(vec_sums(a_sump, vsint32_8), vuint32_4); \
758  const vector signed int p = \
759  (vector signed int)vec_perm(src##j, (vector unsigned char)zero, \
760  extractPermM); \
761  const vector signed int sum = vec_add(p, vQP2); \
762  const vector signed int diff = vec_sub(p, vQP2); \
763  vector signed int newpm; \
764  vector unsigned char newpm2, mask; \
765  F = vec_splat(F, 3); \
766  if (vec_all_lt(sum, F)) \
767  newpm = sum; \
768  else if (vec_all_gt(diff, F)) \
769  newpm = diff; \
770  else newpm = F; \
771  newpm2 = vec_splat((vector unsigned char)newpm, 15); \
772  mask = vec_add(identity, tenRightM); \
773  src##j = vec_perm(src##j, newpm2, mask); \
774  } \
775  permA1M = vec_add(permA1M, permA1inc); \
776  permA2M = vec_add(permA2M, permA2inc); \
777  tenRightM = vec_sro(tenRightM, eightLeft); \
778  extractPermM = vec_add(extractPermM, extractPermInc)
779 
780 #define ITER(i, j, k) do { \
781  F_INIT(); \
782  F2(i, j, k, 0); \
783  F2(i, j, k, 1); \
784  F2(i, j, k, 2); \
785  F2(i, j, k, 3); \
786  F2(i, j, k, 4); \
787  F2(i, j, k, 5); \
788  F2(i, j, k, 6); \
789  F2(i, j, k, 7); \
790 } while (0)
791 
792  ITER(0, 1, 2);
793  ITER(1, 2, 3);
794  ITER(2, 3, 4);
795  ITER(3, 4, 5);
796  ITER(4, 5, 6);
797  ITER(5, 6, 7);
798  ITER(6, 7, 8);
799  ITER(7, 8, 9);
800 
801 #define STORE_LINE(i) do { \
802  const vector unsigned char permST = \
803  vec_lvsr(i * stride, srcCopy); \
804  const vector unsigned char maskST = \
805  vec_perm((vector unsigned char)zero, \
806  (vector unsigned char)neg1, permST); \
807  src##i = vec_perm(src##i ,src##i, permST); \
808  sA##i= vec_sel(sA##i, src##i, maskST); \
809  sB##i= vec_sel(src##i, sB##i, maskST); \
810  vec_st(sA##i, i * stride, srcCopy); \
811  vec_st(sB##i, i * stride + 16, srcCopy); \
812 } while (0)
813 
814  STORE_LINE(1);
815  STORE_LINE(2);
816  STORE_LINE(3);
817  STORE_LINE(4);
818  STORE_LINE(5);
819  STORE_LINE(6);
820  STORE_LINE(7);
821  STORE_LINE(8);
822 
823 #undef STORE_LINE
824 #undef ITER
825 #undef F2
826 }
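/* Editorial note: the deringing filter above loads 10 input lines, derives
 * the block's min/max via the EXTRACT reductions, and returns early when the
 * dynamic range is below DERING_THRESHOLD.  Otherwise it builds per-line
 * "above average" bitmasks (COMPARE) and, for each flagged pixel, replaces it
 * with a 3x3 weighted mean (the "magic" weights 1 2 1 / 2 4 2 / 1 2 1,
 * rounded and divided by 16) clamped to the original value +/- (QP/2 + 1). */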
827 
828 #define doHorizLowPass_altivec(a...) doHorizLowPass_C(a)
829 #define doHorizDefFilter_altivec(a...) doHorizDefFilter_C(a)
830 #define do_a_deblock_altivec(a...) do_a_deblock_C(a)
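/* Editorial note: no AltiVec versions are provided for the horizontal
 * low-pass, the horizontal default filter and do_a_deblock; the three macros
 * above simply fall back to the generic C implementations. */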
831 
832 static inline void tempNoiseReducer_altivec(uint8_t *src, int stride,
833  uint8_t *tempBlurred, uint32_t *tempBlurredPast, int *maxNoise)
834 {
835  const vector signed char neg1 = vec_splat_s8(-1);
836  const vector unsigned char permHH = (const vector unsigned char){0x00, 0x01, 0x02, 0x03, 0x04, 0x05, 0x06, 0x07,
837  0x18, 0x19, 0x1A, 0x1B, 0x1C, 0x1D, 0x1E, 0x1F};
838 
839  const vector signed int zero = vec_splat_s32(0);
840  const vector signed short vsint16_1 = vec_splat_s16(1);
841  vector signed int v_dp = zero;
842  vector signed int v_sysdp = zero;
843  int d, sysd, i;
844 
845 #define LOAD_LINE(src, i) \
846  register int j##src##i = i * stride; \
847  vector unsigned char perm##src##i = vec_lvsl(j##src##i, src); \
848  const vector unsigned char v_##src##A1##i = vec_ld(j##src##i, src); \
849  const vector unsigned char v_##src##A2##i = vec_ld(j##src##i + 16, src); \
850  const vector unsigned char v_##src##A##i = \
851  vec_perm(v_##src##A1##i, v_##src##A2##i, perm##src##i); \
852  vector signed short v_##src##Ass##i = \
853  (vector signed short)vec_mergeh((vector signed char)zero, \
854  (vector signed char)v_##src##A##i)
855 
856  LOAD_LINE(src, 0);
857  LOAD_LINE(src, 1);
858  LOAD_LINE(src, 2);
859  LOAD_LINE(src, 3);
860  LOAD_LINE(src, 4);
861  LOAD_LINE(src, 5);
862  LOAD_LINE(src, 6);
863  LOAD_LINE(src, 7);
864 
865  LOAD_LINE(tempBlurred, 0);
866  LOAD_LINE(tempBlurred, 1);
867  LOAD_LINE(tempBlurred, 2);
868  LOAD_LINE(tempBlurred, 3);
869  LOAD_LINE(tempBlurred, 4);
870  LOAD_LINE(tempBlurred, 5);
871  LOAD_LINE(tempBlurred, 6);
872  LOAD_LINE(tempBlurred, 7);
873 #undef LOAD_LINE
874 
875 #define ACCUMULATE_DIFFS(i) do { \
876  vector signed short v_d = vec_sub(v_tempBlurredAss##i, \
877  v_srcAss##i); \
878  v_dp = vec_msums(v_d, v_d, v_dp); \
879  v_sysdp = vec_msums(v_d, vsint16_1, v_sysdp); \
880  } while (0)
881 
882  ACCUMULATE_DIFFS(0);
883  ACCUMULATE_DIFFS(1);
884  ACCUMULATE_DIFFS(2);
885  ACCUMULATE_DIFFS(3);
886  ACCUMULATE_DIFFS(4);
887  ACCUMULATE_DIFFS(5);
888  ACCUMULATE_DIFFS(6);
889  ACCUMULATE_DIFFS(7);
890 #undef ACCUMULATE_DIFFS
891 
892  tempBlurredPast[127]= maxNoise[0];
893  tempBlurredPast[128]= maxNoise[1];
894  tempBlurredPast[129]= maxNoise[2];
895 
896  v_dp = vec_sums(v_dp, zero);
897  v_sysdp = vec_sums(v_sysdp, zero);
898 
899  v_dp = vec_splat(v_dp, 3);
900  v_sysdp = vec_splat(v_sysdp, 3);
901 
902  vec_ste(v_dp, 0, &d);
903  vec_ste(v_sysdp, 0, &sysd);
904 
905  i = d;
906  d = (4*d
907  +(*(tempBlurredPast-256))
908  +(*(tempBlurredPast-1))+ (*(tempBlurredPast+1))
909  +(*(tempBlurredPast+256))
910  +4)>>3;
911 
912  *tempBlurredPast=i;
913 
914  if (d > maxNoise[1]) {
915  if (d < maxNoise[2]) {
916 #define OP(i) v_tempBlurredAss##i = vec_avg(v_tempBlurredAss##i, v_srcAss##i);
917 
918  OP(0);
919  OP(1);
920  OP(2);
921  OP(3);
922  OP(4);
923  OP(5);
924  OP(6);
925  OP(7);
926 #undef OP
927  } else {
928 #define OP(i) v_tempBlurredAss##i = v_srcAss##i;
929 
930  OP(0);
931  OP(1);
932  OP(2);
933  OP(3);
934  OP(4);
935  OP(5);
936  OP(6);
937  OP(7);
938 #undef OP
939  }
940  } else {
941  if (d < maxNoise[0]) {
942  const vector signed short vsint16_7 = vec_splat_s16(7);
943  const vector signed short vsint16_4 = vec_splat_s16(4);
944  const vector unsigned short vuint16_3 = vec_splat_u16(3);
945 
946 #define OP(i) do { \
947  const vector signed short v_temp = \
948  vec_mladd(v_tempBlurredAss##i, vsint16_7, v_srcAss##i); \
949  const vector signed short v_temp2 = vec_add(v_temp, vsint16_4); \
950  v_tempBlurredAss##i = vec_sr(v_temp2, vuint16_3); \
951  } while (0)
952 
953  OP(0);
954  OP(1);
955  OP(2);
956  OP(3);
957  OP(4);
958  OP(5);
959  OP(6);
960  OP(7);
961 #undef OP
962  } else {
963  const vector signed short vsint16_3 = vec_splat_s16(3);
964  const vector signed short vsint16_2 = vec_splat_s16(2);
965 
966 #define OP(i) do { \
967  const vector signed short v_temp = \
968  vec_mladd(v_tempBlurredAss##i, vsint16_3, v_srcAss##i); \
969  const vector signed short v_temp2 = vec_add(v_temp, vsint16_2); \
970  v_tempBlurredAss##i = \
971  vec_sr(v_temp2, (vector unsigned short)vsint16_2); \
972  } while (0)
973 
974  OP(0);
975  OP(1);
976  OP(2);
977  OP(3);
978  OP(4);
979  OP(5);
980  OP(6);
981  OP(7);
982 #undef OP
983  }
984  }
985 
986 #define PACK_AND_STORE(src, i) do { \
987  const vector unsigned char perms = vec_lvsr(i * stride, src); \
988  const vector unsigned char vf = \
989  vec_packsu(v_tempBlurredAss##i, (vector signed short)zero); \
990  const vector unsigned char vg = vec_perm(vf, v_##src##A##i, permHH); \
991  const vector unsigned char mask = \
992  vec_perm((vector unsigned char)zero, (vector unsigned char)neg1, perms); \
993  const vector unsigned char vg2 = vec_perm(vg, vg, perms); \
994  const vector unsigned char svA = vec_sel(v_##src##A1##i, vg2, mask); \
995  const vector unsigned char svB = vec_sel(vg2, v_##src##A2##i, mask); \
996  vec_st(svA, i * stride, src); \
997  vec_st(svB, i * stride + 16, src); \
998 } while (0)
999 
1000  PACK_AND_STORE(src, 0);
1001  PACK_AND_STORE(src, 1);
1002  PACK_AND_STORE(src, 2);
1003  PACK_AND_STORE(src, 3);
1004  PACK_AND_STORE(src, 4);
1005  PACK_AND_STORE(src, 5);
1006  PACK_AND_STORE(src, 6);
1007  PACK_AND_STORE(src, 7);
1008  PACK_AND_STORE(tempBlurred, 0);
1009  PACK_AND_STORE(tempBlurred, 1);
1010  PACK_AND_STORE(tempBlurred, 2);
1011  PACK_AND_STORE(tempBlurred, 3);
1012  PACK_AND_STORE(tempBlurred, 4);
1013  PACK_AND_STORE(tempBlurred, 5);
1014  PACK_AND_STORE(tempBlurred, 6);
1015  PACK_AND_STORE(tempBlurred, 7);
1016 #undef PACK_AND_STORE
1017 }
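/* Editorial note: the temporal noise reducer above computes the sum of
 * squared differences d between the current block and tempBlurred, smooths
 * it with four neighbouring entries of tempBlurredPast, and then blends
 * according to the maxNoise thresholds, following the branches above:
 *     d >  maxNoise[1]:  d < maxNoise[2]  ->  vec_avg(old, new)
 *                        otherwise        ->  take the new block as-is
 *     d <= maxNoise[1]:  d < maxNoise[0]  ->  (7*old + new + 4) >> 3
 *                        otherwise        ->  (3*old + new + 2) >> 2 */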
1018 
1019 static inline void transpose_16x8_char_toPackedAlign_altivec(unsigned char* dst, unsigned char* src, int stride) {
1020  const vector unsigned char zero = vec_splat_u8(0);
1021 
1022 #define LOAD_DOUBLE_LINE(i, j) \
1023  vector unsigned char perm1##i = vec_lvsl(i * stride, src); \
1024  vector unsigned char perm2##i = vec_lvsl(j * stride, src); \
1025  vector unsigned char srcA##i = vec_ld(i * stride, src); \
1026  vector unsigned char srcB##i = vec_ld(i * stride + 16, src); \
1027  vector unsigned char srcC##i = vec_ld(j * stride, src); \
1028  vector unsigned char srcD##i = vec_ld(j * stride+ 16, src); \
1029  vector unsigned char src##i = vec_perm(srcA##i, srcB##i, perm1##i); \
1030  vector unsigned char src##j = vec_perm(srcC##i, srcD##i, perm2##i)
1031 
1032  LOAD_DOUBLE_LINE(0, 1);
1033  LOAD_DOUBLE_LINE(2, 3);
1034  LOAD_DOUBLE_LINE(4, 5);
1035  LOAD_DOUBLE_LINE(6, 7);
1036 #undef LOAD_DOUBLE_LINE
1037 
1038  vector unsigned char tempA = vec_mergeh(src0, zero);
1039  vector unsigned char tempB = vec_mergel(src0, zero);
1040  vector unsigned char tempC = vec_mergeh(src1, zero);
1041  vector unsigned char tempD = vec_mergel(src1, zero);
1042  vector unsigned char tempE = vec_mergeh(src2, zero);
1043  vector unsigned char tempF = vec_mergel(src2, zero);
1044  vector unsigned char tempG = vec_mergeh(src3, zero);
1045  vector unsigned char tempH = vec_mergel(src3, zero);
1046  vector unsigned char tempI = vec_mergeh(src4, zero);
1047  vector unsigned char tempJ = vec_mergel(src4, zero);
1048  vector unsigned char tempK = vec_mergeh(src5, zero);
1049  vector unsigned char tempL = vec_mergel(src5, zero);
1050  vector unsigned char tempM = vec_mergeh(src6, zero);
1051  vector unsigned char tempN = vec_mergel(src6, zero);
1052  vector unsigned char tempO = vec_mergeh(src7, zero);
1053  vector unsigned char tempP = vec_mergel(src7, zero);
1054 
1055  vector unsigned char temp0 = vec_mergeh(tempA, tempI);
1056  vector unsigned char temp1 = vec_mergel(tempA, tempI);
1057  vector unsigned char temp2 = vec_mergeh(tempB, tempJ);
1058  vector unsigned char temp3 = vec_mergel(tempB, tempJ);
1059  vector unsigned char temp4 = vec_mergeh(tempC, tempK);
1060  vector unsigned char temp5 = vec_mergel(tempC, tempK);
1061  vector unsigned char temp6 = vec_mergeh(tempD, tempL);
1062  vector unsigned char temp7 = vec_mergel(tempD, tempL);
1063  vector unsigned char temp8 = vec_mergeh(tempE, tempM);
1064  vector unsigned char temp9 = vec_mergel(tempE, tempM);
1065  vector unsigned char temp10 = vec_mergeh(tempF, tempN);
1066  vector unsigned char temp11 = vec_mergel(tempF, tempN);
1067  vector unsigned char temp12 = vec_mergeh(tempG, tempO);
1068  vector unsigned char temp13 = vec_mergel(tempG, tempO);
1069  vector unsigned char temp14 = vec_mergeh(tempH, tempP);
1070  vector unsigned char temp15 = vec_mergel(tempH, tempP);
1071 
1072  tempA = vec_mergeh(temp0, temp8);
1073  tempB = vec_mergel(temp0, temp8);
1074  tempC = vec_mergeh(temp1, temp9);
1075  tempD = vec_mergel(temp1, temp9);
1076  tempE = vec_mergeh(temp2, temp10);
1077  tempF = vec_mergel(temp2, temp10);
1078  tempG = vec_mergeh(temp3, temp11);
1079  tempH = vec_mergel(temp3, temp11);
1080  tempI = vec_mergeh(temp4, temp12);
1081  tempJ = vec_mergel(temp4, temp12);
1082  tempK = vec_mergeh(temp5, temp13);
1083  tempL = vec_mergel(temp5, temp13);
1084  tempM = vec_mergeh(temp6, temp14);
1085  tempN = vec_mergel(temp6, temp14);
1086  tempO = vec_mergeh(temp7, temp15);
1087  tempP = vec_mergel(temp7, temp15);
1088 
1089  temp0 = vec_mergeh(tempA, tempI);
1090  temp1 = vec_mergel(tempA, tempI);
1091  temp2 = vec_mergeh(tempB, tempJ);
1092  temp3 = vec_mergel(tempB, tempJ);
1093  temp4 = vec_mergeh(tempC, tempK);
1094  temp5 = vec_mergel(tempC, tempK);
1095  temp6 = vec_mergeh(tempD, tempL);
1096  temp7 = vec_mergel(tempD, tempL);
1097  temp8 = vec_mergeh(tempE, tempM);
1098  temp9 = vec_mergel(tempE, tempM);
1099  temp10 = vec_mergeh(tempF, tempN);
1100  temp11 = vec_mergel(tempF, tempN);
1101  temp12 = vec_mergeh(tempG, tempO);
1102  temp13 = vec_mergel(tempG, tempO);
1103  temp14 = vec_mergeh(tempH, tempP);
1104  temp15 = vec_mergel(tempH, tempP);
1105 
1106  vec_st(temp0, 0, dst);
1107  vec_st(temp1, 16, dst);
1108  vec_st(temp2, 32, dst);
1109  vec_st(temp3, 48, dst);
1110  vec_st(temp4, 64, dst);
1111  vec_st(temp5, 80, dst);
1112  vec_st(temp6, 96, dst);
1113  vec_st(temp7, 112, dst);
1114  vec_st(temp8, 128, dst);
1115  vec_st(temp9, 144, dst);
1116  vec_st(temp10, 160, dst);
1117  vec_st(temp11, 176, dst);
1118  vec_st(temp12, 192, dst);
1119  vec_st(temp13, 208, dst);
1120  vec_st(temp14, 224, dst);
1121  vec_st(temp15, 240, dst);
1122 }
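/* Editorial note: the helper above transposes an 8-line by 16-column block
 * of bytes from its strided layout into a packed, 16-byte aligned scratch
 * buffer; transpose_8x16_char_fromPackedAlign_altivec below performs the
 * reverse step.  Together they let the horizontal deblocking reuse the
 * vertical AltiVec filters on transposed data. */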
1123 
1124 static inline void transpose_8x16_char_fromPackedAlign_altivec(unsigned char* dst, unsigned char* src, int stride) {
1125  const vector unsigned char zero = vec_splat_u8(0);
1126  const vector signed char neg1 = vec_splat_s8(-1);
1127 
1128 #define LOAD_DOUBLE_LINE(i, j) \
1129  vector unsigned char src##i = vec_ld(i * 16, src); \
1130  vector unsigned char src##j = vec_ld(j * 16, src)
1131 
1132  LOAD_DOUBLE_LINE(0, 1);
1133  LOAD_DOUBLE_LINE(2, 3);
1134  LOAD_DOUBLE_LINE(4, 5);
1135  LOAD_DOUBLE_LINE(6, 7);
1136  LOAD_DOUBLE_LINE(8, 9);
1137  LOAD_DOUBLE_LINE(10, 11);
1138  LOAD_DOUBLE_LINE(12, 13);
1139  LOAD_DOUBLE_LINE(14, 15);
1140 #undef LOAD_DOUBLE_LINE
1141 
1142  vector unsigned char tempA = vec_mergeh(src0, src8);
1143  vector unsigned char tempB;
1144  vector unsigned char tempC = vec_mergeh(src1, src9);
1145  vector unsigned char tempD;
1146  vector unsigned char tempE = vec_mergeh(src2, src10);
1147  vector unsigned char tempG = vec_mergeh(src3, src11);
1148  vector unsigned char tempI = vec_mergeh(src4, src12);
1149  vector unsigned char tempJ;
1150  vector unsigned char tempK = vec_mergeh(src5, src13);
1151  vector unsigned char tempL;
1152  vector unsigned char tempM = vec_mergeh(src6, src14);
1153  vector unsigned char tempO = vec_mergeh(src7, src15);
1154 
1155  vector unsigned char temp0 = vec_mergeh(tempA, tempI);
1156  vector unsigned char temp1 = vec_mergel(tempA, tempI);
1157  vector unsigned char temp2;
1158  vector unsigned char temp3;
1159  vector unsigned char temp4 = vec_mergeh(tempC, tempK);
1160  vector unsigned char temp5 = vec_mergel(tempC, tempK);
1161  vector unsigned char temp6;
1162  vector unsigned char temp7;
1163  vector unsigned char temp8 = vec_mergeh(tempE, tempM);
1164  vector unsigned char temp9 = vec_mergel(tempE, tempM);
1165  vector unsigned char temp12 = vec_mergeh(tempG, tempO);
1166  vector unsigned char temp13 = vec_mergel(tempG, tempO);
1167 
1168  tempA = vec_mergeh(temp0, temp8);
1169  tempB = vec_mergel(temp0, temp8);
1170  tempC = vec_mergeh(temp1, temp9);
1171  tempD = vec_mergel(temp1, temp9);
1172  tempI = vec_mergeh(temp4, temp12);
1173  tempJ = vec_mergel(temp4, temp12);
1174  tempK = vec_mergeh(temp5, temp13);
1175  tempL = vec_mergel(temp5, temp13);
1176 
1177  temp0 = vec_mergeh(tempA, tempI);
1178  temp1 = vec_mergel(tempA, tempI);
1179  temp2 = vec_mergeh(tempB, tempJ);
1180  temp3 = vec_mergel(tempB, tempJ);
1181  temp4 = vec_mergeh(tempC, tempK);
1182  temp5 = vec_mergel(tempC, tempK);
1183  temp6 = vec_mergeh(tempD, tempL);
1184  temp7 = vec_mergel(tempD, tempL);
1185 
1186 
1187 #define STORE_DOUBLE_LINE(i, j) do { \
1188  vector unsigned char dstAi = vec_ld(i * stride, dst); \
1189  vector unsigned char dstBi = vec_ld(i * stride + 16, dst); \
1190  vector unsigned char dstAj = vec_ld(j * stride, dst); \
1191  vector unsigned char dstBj = vec_ld(j * stride+ 16, dst); \
1192  vector unsigned char aligni = vec_lvsr(i * stride, dst); \
1193  vector unsigned char alignj = vec_lvsr(j * stride, dst); \
1194  vector unsigned char maski = \
1195  vec_perm(zero, (vector unsigned char)neg1, aligni); \
1196  vector unsigned char maskj = \
1197  vec_perm(zero, (vector unsigned char)neg1, alignj); \
1198  vector unsigned char dstRi = vec_perm(temp##i, temp##i, aligni); \
1199  vector unsigned char dstRj = vec_perm(temp##j, temp##j, alignj); \
1200  vector unsigned char dstAFi = vec_sel(dstAi, dstRi, maski); \
1201  vector unsigned char dstBFi = vec_sel(dstRi, dstBi, maski); \
1202  vector unsigned char dstAFj = vec_sel(dstAj, dstRj, maskj); \
1203  vector unsigned char dstBFj = vec_sel(dstRj, dstBj, maskj); \
1204  vec_st(dstAFi, i * stride, dst); \
1205  vec_st(dstBFi, i * stride + 16, dst); \
1206  vec_st(dstAFj, j * stride, dst); \
1207  vec_st(dstBFj, j * stride + 16, dst); \
1208 } while (0)
1209 
1210  STORE_DOUBLE_LINE(0,1);
1211  STORE_DOUBLE_LINE(2,3);
1212  STORE_DOUBLE_LINE(4,5);
1213  STORE_DOUBLE_LINE(6,7);
1214 }