FFmpeg
swscale_vsx.c
Go to the documentation of this file.
1 /*
2  * AltiVec-enhanced yuv2yuvX
3  *
4  * Copyright (C) 2004 Romain Dolbeau <romain@dolbeau.org>
5  * based on the equivalent C code in swscale.c
6  *
7  * This file is part of FFmpeg.
8  *
9  * FFmpeg is free software; you can redistribute it and/or
10  * modify it under the terms of the GNU Lesser General Public
11  * License as published by the Free Software Foundation; either
12  * version 2.1 of the License, or (at your option) any later version.
13  *
14  * FFmpeg is distributed in the hope that it will be useful,
15  * but WITHOUT ANY WARRANTY; without even the implied warranty of
16  * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
17  * Lesser General Public License for more details.
18  *
19  * You should have received a copy of the GNU Lesser General Public
20  * License along with FFmpeg; if not, write to the Free Software
21  * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
22  */
23 
24 #include <inttypes.h>
25 
26 #include "config.h"
27 #include "libswscale/swscale.h"
29 #include "libavutil/attributes.h"
30 #include "libavutil/cpu.h"
31 #include "libavutil/mem_internal.h"
32 #include "yuv2rgb_altivec.h"
34 
35 #if HAVE_VSX
/* All-zero vector of signed 32-bit lanes. */
36 #define vzero vec_splat_s32(0)
37 
38 #if !HAVE_BIGENDIAN
/*
 * The macros below are the little-endian VSX building blocks consumed by
 * "swscale_ppc_template.c" (included further down).  Several parameters
 * (c, perm, p, per, v0, v1, vf0, vf1) are not used by these definitions;
 * presumably they exist so the template can share one call shape with
 * other variants -- confirm against the template.
 */

/*
 * GET_LS: copy `a` into `ls` (a variable that must exist at the expansion
 * site), then reload `a` from `s` at byte offset ((b) << 1) + 16.
 */
39 #define GET_LS(a,b,c,s) {\
40  ls = a;\
41  a = vec_vsx_ld(((b) << 1) + 16, s);\
42  }
43 
/*
 * yuv2planeX_8: one multiply-accumulate step over 8 int16 samples.
 * Fetches the samples through GET_LS, forms the widening even/odd products
 * filter * ls, merges them into two 4 x int32 vectors and adds those to the
 * accumulators d1 and d2.
 */
44 #define yuv2planeX_8(d1, d2, l1, src, x, perm, filter) do {\
45  vector signed short ls;\
46  vector signed int vf1, vf2, i1, i2;\
47  GET_LS(l1, x, perm, src);\
48  i1 = vec_mule(filter, ls);\
49  i2 = vec_mulo(filter, ls);\
50  vf1 = vec_mergeh(i1, i2);\
51  vf2 = vec_mergel(i1, i2);\
52  d1 = vec_add(d1, vf1);\
53  d2 = vec_add(d2, vf2);\
54  } while (0)
55 
/* LOAD_FILTER: load vf from f at byte offset `joffset` (taken from the expansion site). */
56 #define LOAD_FILTER(vf,f) {\
57  vf = vec_vsx_ld(joffset, f);\
58 }
/* LOAD_L1: load ll1 from s at byte offset `xoffset` (taken from the expansion site). */
59 #define LOAD_L1(ll1,s,p){\
60  ll1 = vec_vsx_ld(xoffset, s);\
61 }
62 
63 // The shift by 3 in GET_VF4 below is 2 (filterSize == 4) + 1 (sizeof(short) == 2).
64 
65 // The neat trick: We only care for half the elements,
66 // high or low depending on (i<<3)%16 (it's 0 or 8 here),
67 // and we're going to use vec_mule, so we choose
68 // carefully how to "unpack" the elements into the even slots.
/* GET_VF4: load from f at byte offset a << 3, then merge the high half with zeros. */
69 #define GET_VF4(a, vf, f) {\
70  vf = (vector signed short)vec_vsx_ld(a << 3, f);\
71  vf = vec_mergeh(vf, (vector signed short)vzero);\
72 }
/* FIRST_LOAD and UPDATE_PTR expand to nothing in this variant. */
73 #define FIRST_LOAD(sv, pos, s, per) {}
74 #define UPDATE_PTR(s0, d0, s1, d1) {}
/* LOAD_SRCV: load vf from s at byte offset pos + a. */
75 #define LOAD_SRCV(pos, a, s, per, v0, v1, vf) {\
76  vf = vec_vsx_ld(pos + a, s);\
77 }
/* LOAD_SRCV8 is an alias of LOAD_SRCV here. */
78 #define LOAD_SRCV8(pos, a, s, per, v0, v1, vf) LOAD_SRCV(pos, a, s, per, v0, v1, vf)
/*
 * GET_VFD: load vf from f at byte offset (a * 2 * filterSize) + (b * 2) + off;
 * `filterSize` is taken from the expansion site.
 */
79 #define GET_VFD(a, b, f, vf0, vf1, per, vf, off) {\
80  vf = vec_vsx_ld((a * 2 * filterSize) + (b * 2) + off, f);\
81 }
82 
/* FUNC(name) appends the _vsx suffix to identifiers while the template is included. */
83 #define FUNC(name) name ## _vsx
84 #include "swscale_ppc_template.c"
85 #undef FUNC
86 
/* vzero is only undefined on this (little-endian) path. */
87 #undef vzero
88 
89 #endif /* !HAVE_BIGENDIAN */
90 
/**
 * Scalar 8-bit plane writer for pixels [start, dstW).
 *
 * Each output is (src[x] + dither[(x + offset) & 7]) >> 7, clipped to the
 * uint8 range.  Used for the unaligned head and the leftover tail of the
 * vectorised variant.
 *
 * @param src    int16 input samples
 * @param dest   8-bit output row
 * @param dstW   exclusive end index
 * @param dither 8-entry dither table, indexed modulo 8
 * @param offset phase added to the pixel index before the table lookup
 * @param start  first pixel index to write
 */
static void yuv2plane1_8_u(const int16_t *src, uint8_t *dest, int dstW,
                           const uint8_t *dither, int offset, int start)
{
    int x = start;

    while (x < dstW) {
        const int dither_val = dither[(x + offset) & 7];
        const int scaled     = (src[x] + dither_val) >> 7;

        dest[x] = av_clip_uint8(scaled);
        x++;
    }
}
100 
101 static void yuv2plane1_8_vsx(const int16_t *src, uint8_t *dest, int dstW,
102  const uint8_t *dither, int offset)
103 {
104  const int dst_u = -(uintptr_t)dest & 15;
105  int i, j;
106  LOCAL_ALIGNED(16, int16_t, val, [16]);
107  const vec_u16 shifts = (vec_u16) {7, 7, 7, 7, 7, 7, 7, 7};
108  vec_s16 vi, vileft, ditherleft, ditherright;
109  vec_u8 vd;
110 
111  for (j = 0; j < 16; j++) {
112  val[j] = dither[(dst_u + offset + j) & 7];
113  }
114 
115  ditherleft = vec_ld(0, val);
116  ditherright = vec_ld(0, &val[8]);
117 
118  yuv2plane1_8_u(src, dest, dst_u, dither, offset, 0);
119 
120  for (i = dst_u; i < dstW - 15; i += 16) {
121 
122  vi = vec_vsx_ld(0, &src[i]);
123  vi = vec_adds(ditherleft, vi);
124  vileft = vec_sra(vi, shifts);
125 
126  vi = vec_vsx_ld(0, &src[i + 8]);
127  vi = vec_adds(ditherright, vi);
128  vi = vec_sra(vi, shifts);
129 
130  vd = vec_packsu(vileft, vi);
131  vec_st(vd, 0, &dest[i]);
132  }
133 
134  yuv2plane1_8_u(src, dest, dstW, dither, offset, i);
135 }
136 
137 #if !HAVE_BIGENDIAN
138 
/*
 * Store one sample as a 16-bit word: (val >> shift) clipped to an unsigned
 * output_bits-bit value, written big- or little-endian.
 *
 * `big_endian`, `shift` and `output_bits` are taken from the expansion site.
 * Wrapped in do { } while (0) so the macro behaves as a single statement.
 */
#define output_pixel(pos, val) \
    do { \
        if (big_endian) { \
            AV_WB16(pos, av_clip_uintp2((val) >> shift, output_bits)); \
        } else { \
            AV_WL16(pos, av_clip_uintp2((val) >> shift, output_bits)); \
        } \
    } while (0)
145 
/**
 * Scalar 9..14-bit plane writer for pixels [start, dstW).
 *
 * Each sample is rounded, shifted right by (15 - output_bits), clipped to
 * output_bits bits and stored with the requested endianness through
 * output_pixel().
 *
 * @param src         int16 input samples
 * @param dest        16-bit output row
 * @param dstW        exclusive end index
 * @param big_endian  non-zero to store big-endian words
 * @param output_bits bit depth of the output samples
 * @param start       first pixel index to write
 */
static void yuv2plane1_nbps_u(const int16_t *src, uint16_t *dest, int dstW,
                              int big_endian, int output_bits, int start)
{
    const int shift    = 15 - output_bits;
    const int rounding = 1 << (shift - 1);
    int x = start;

    while (x < dstW) {
        const int val = src[x] + rounding;
        output_pixel(&dest[x], val);
        x++;
    }
}
157 
158 static av_always_inline void yuv2plane1_nbps_vsx(const int16_t *src,
159  uint16_t *dest, int dstW,
160  const int big_endian,
161  const int output_bits)
162 {
163  const int dst_u = -(uintptr_t)dest & 7;
164  const int shift = 15 - output_bits;
165  const int add = (1 << (shift - 1));
166  const int clip = (1 << output_bits) - 1;
167  const vec_u16 vadd = (vec_u16) {add, add, add, add, add, add, add, add};
168  const vec_u16 vswap = (vec_u16) vec_splat_u16(big_endian ? 8 : 0);
169  const vec_u16 vshift = (vec_u16) vec_splat_u16(shift);
170  const vec_u16 vlargest = (vec_u16) {clip, clip, clip, clip, clip, clip, clip, clip};
171  vec_u16 v;
172  int i;
173 
174  yuv2plane1_nbps_u(src, dest, dst_u, big_endian, output_bits, 0);
175 
176  for (i = dst_u; i < dstW - 7; i += 8) {
177  v = vec_vsx_ld(0, (const uint16_t *) &src[i]);
178  v = vec_add(v, vadd);
179  v = vec_sr(v, vshift);
180  v = vec_min(v, vlargest);
181  v = vec_rl(v, vswap);
182  vec_st(v, 0, &dest[i]);
183  }
184 
185  yuv2plane1_nbps_u(src, dest, dstW, big_endian, output_bits, i);
186 }
187 
/**
 * Scalar multi-tap vertical filter for 9..14-bit output, pixels [start, dstW).
 *
 * For every pixel the filterSize products src[j][i] * filter[j] are summed
 * on top of a rounding constant, shifted right by (27 - output_bits),
 * clipped to output_bits bits and stored through output_pixel().
 *
 * @param filter      filter coefficients, filterSize entries
 * @param filterSize  number of taps / source rows
 * @param src         array of filterSize int16 source rows
 * @param dest        16-bit output row
 * @param dstW        exclusive end index
 * @param big_endian  non-zero to store big-endian words
 * @param output_bits bit depth of the output samples
 * @param start       first pixel index to write
 */
static void yuv2planeX_nbps_u(const int16_t *filter, int filterSize,
                              const int16_t **src, uint16_t *dest, int dstW,
                              int big_endian, int output_bits, int start)
{
    const int shift    = 11 + 16 - output_bits;
    const int rounding = 1 << (shift - 1);
    int x;

    for (x = start; x < dstW; x++) {
        int val = rounding;
        int tap = 0;

        while (tap < filterSize) {
            val += src[tap][x] * filter[tap];
            tap++;
        }

        output_pixel(&dest[x], val);
    }
}
205 
206 static void yuv2planeX_nbps_vsx(const int16_t *filter, int filterSize,
207  const int16_t **src, uint16_t *dest, int dstW,
208  int big_endian, int output_bits)
209 {
210  const int dst_u = -(uintptr_t)dest & 7;
211  const int shift = 11 + 16 - output_bits;
212  const int add = (1 << (shift - 1));
213  const int clip = (1 << output_bits) - 1;
214  const uint16_t swap = big_endian ? 8 : 0;
215  const vec_u32 vadd = (vec_u32) {add, add, add, add};
216  const vec_u32 vshift = (vec_u32) {shift, shift, shift, shift};
217  const vec_u16 vswap = (vec_u16) {swap, swap, swap, swap, swap, swap, swap, swap};
218  const vec_u16 vlargest = (vec_u16) {clip, clip, clip, clip, clip, clip, clip, clip};
219  const vec_s16 vzero = vec_splat_s16(0);
220  const vec_u8 vperm = (vec_u8) {0, 1, 8, 9, 2, 3, 10, 11, 4, 5, 12, 13, 6, 7, 14, 15};
221  vec_s16 vfilter[MAX_FILTER_SIZE], vin;
222  vec_u16 v;
223  vec_u32 vleft, vright, vtmp;
224  int i, j;
225 
226  for (i = 0; i < filterSize; i++) {
227  vfilter[i] = (vec_s16) {filter[i], filter[i], filter[i], filter[i],
228  filter[i], filter[i], filter[i], filter[i]};
229  }
230 
231  yuv2planeX_nbps_u(filter, filterSize, src, dest, dst_u, big_endian, output_bits, 0);
232 
233  for (i = dst_u; i < dstW - 7; i += 8) {
234  vleft = vright = vadd;
235 
236  for (j = 0; j < filterSize; j++) {
237  vin = vec_vsx_ld(0, &src[j][i]);
238  vtmp = (vec_u32) vec_mule(vin, vfilter[j]);
239  vleft = vec_add(vleft, vtmp);
240  vtmp = (vec_u32) vec_mulo(vin, vfilter[j]);
241  vright = vec_add(vright, vtmp);
242  }
243 
244  vleft = vec_sra(vleft, vshift);
245  vright = vec_sra(vright, vshift);
246  v = vec_packsu(vleft, vright);
247  v = (vec_u16) vec_max((vec_s16) v, vzero);
248  v = vec_min(v, vlargest);
249  v = vec_rl(v, vswap);
250  v = vec_perm(v, v, vperm);
251  vec_st(v, 0, &dest[i]);
252  }
253 
254  yuv2planeX_nbps_u(filter, filterSize, src, dest, dstW, big_endian, output_bits, i);
255 }
256 
257 
258 #undef output_pixel
259 
/*
 * Store one 16-bit sample: bias + av_clip_<signedness>16(val >> shift),
 * written big- or little-endian.
 *
 * `big_endian` and `shift` are taken from the expansion site.
 * Wrapped in do { } while (0) so the macro behaves as a single statement.
 */
#define output_pixel(pos, val, bias, signedness) \
    do { \
        if (big_endian) { \
            AV_WB16(pos, (bias) + av_clip_ ## signedness ## 16((val) >> shift)); \
        } else { \
            AV_WL16(pos, (bias) + av_clip_ ## signedness ## 16((val) >> shift)); \
        } \
    } while (0)
266 
267 static void yuv2plane1_16_u(const int32_t *src, uint16_t *dest, int dstW,
268  int big_endian, int output_bits, int start)
269 {
270  int i;
271  const int shift = 3;
272 
273  for (i = start; i < dstW; i++) {
274  int val = src[i] + (1 << (shift - 1));
275  output_pixel(&dest[i], val, 0, uint);
276  }
277 }
278 
279 static av_always_inline void yuv2plane1_16_vsx(const int32_t *src,
280  uint16_t *dest, int dstW,
281  const int big_endian,
282  int output_bits)
283 {
284  const int dst_u = -(uintptr_t)dest & 7;
285  const int shift = 3;
286  const int add = (1 << (shift - 1));
287  const vec_u32 vadd = (vec_u32) {add, add, add, add};
288  const vec_u16 vswap = (vec_u16) vec_splat_u16(big_endian ? 8 : 0);
289  const vec_u32 vshift = (vec_u32) vec_splat_u32(shift);
290  vec_u32 v, v2;
291  vec_u16 vd;
292  int i;
293 
294  yuv2plane1_16_u(src, dest, dst_u, big_endian, output_bits, 0);
295 
296  for (i = dst_u; i < dstW - 7; i += 8) {
297  v = vec_vsx_ld(0, (const uint32_t *) &src[i]);
298  v = vec_add(v, vadd);
299  v = vec_sr(v, vshift);
300 
301  v2 = vec_vsx_ld(0, (const uint32_t *) &src[i + 4]);
302  v2 = vec_add(v2, vadd);
303  v2 = vec_sr(v2, vshift);
304 
305  vd = vec_packsu(v, v2);
306  vd = vec_rl(vd, vswap);
307 
308  vec_st(vd, 0, &dest[i]);
309  }
310 
311  yuv2plane1_16_u(src, dest, dstW, big_endian, output_bits, i);
312 }
313 
314 #if HAVE_POWER8
315 
316 static void yuv2planeX_16_u(const int16_t *filter, int filterSize,
317  const int32_t **src, uint16_t *dest, int dstW,
318  int big_endian, int output_bits, int start)
319 {
320  int i;
321  int shift = 15;
322 
323  for (i = start; i < dstW; i++) {
324  int val = 1 << (shift - 1);
325  int j;
326 
327  /* range of val is [0,0x7FFFFFFF], so 31 bits, but with lanczos/spline
328  * filters (or anything with negative coeffs, the range can be slightly
329  * wider in both directions. To account for this overflow, we subtract
330  * a constant so it always fits in the signed range (assuming a
331  * reasonable filterSize), and re-add that at the end. */
332  val -= 0x40000000;
333  for (j = 0; j < filterSize; j++)
334  val += src[j][i] * (unsigned)filter[j];
335 
336  output_pixel(&dest[i], val, 0x8000, int);
337  }
338 }
339 
340 static void yuv2planeX_16_vsx(const int16_t *filter, int filterSize,
341  const int32_t **src, uint16_t *dest, int dstW,
342  int big_endian, int output_bits)
343 {
344  const int dst_u = -(uintptr_t)dest & 7;
345  const int shift = 15;
346  const int bias = 0x8000;
347  const int add = (1 << (shift - 1)) - 0x40000000;
348  const uint16_t swap = big_endian ? 8 : 0;
349  const vec_u32 vadd = (vec_u32) {add, add, add, add};
350  const vec_u32 vshift = (vec_u32) {shift, shift, shift, shift};
351  const vec_u16 vswap = (vec_u16) {swap, swap, swap, swap, swap, swap, swap, swap};
352  const vec_u16 vbias = (vec_u16) {bias, bias, bias, bias, bias, bias, bias, bias};
353  vec_s32 vfilter[MAX_FILTER_SIZE];
354  vec_u16 v;
355  vec_u32 vleft, vright, vtmp;
356  vec_s32 vin32l, vin32r;
357  int i, j;
358 
359  for (i = 0; i < filterSize; i++) {
360  vfilter[i] = (vec_s32) {filter[i], filter[i], filter[i], filter[i]};
361  }
362 
363  yuv2planeX_16_u(filter, filterSize, src, dest, dst_u, big_endian, output_bits, 0);
364 
365  for (i = dst_u; i < dstW - 7; i += 8) {
366  vleft = vright = vadd;
367 
368  for (j = 0; j < filterSize; j++) {
369  vin32l = vec_vsx_ld(0, &src[j][i]);
370  vin32r = vec_vsx_ld(0, &src[j][i + 4]);
371 
372  vtmp = (vec_u32) vec_mul(vin32l, vfilter[j]);
373  vleft = vec_add(vleft, vtmp);
374  vtmp = (vec_u32) vec_mul(vin32r, vfilter[j]);
375  vright = vec_add(vright, vtmp);
376  }
377 
378  vleft = vec_sra(vleft, vshift);
379  vright = vec_sra(vright, vshift);
380  v = (vec_u16) vec_packs((vec_s32) vleft, (vec_s32) vright);
381  v = vec_add(v, vbias);
382  v = vec_rl(v, vswap);
383  vec_st(v, 0, &dest[i]);
384  }
385 
386  yuv2planeX_16_u(filter, filterSize, src, dest, dstW, big_endian, output_bits, i);
387 }
388 
389 #endif /* HAVE_POWER8 */
390 
/*
 * Wrapper generators.
 *
 * yuv2NBPS1 emits yuv2plane1_<bits><BE|LE>_vsx and yuv2NBPSX emits
 * yuv2planeX_<bits><BE|LE>_vsx.  Each wrapper has a uint8_t *dest / dither /
 * offset signature, casts src to typeX_t and dest to uint16_t *, and forwards
 * to the yuv2plane1_<template_size>_vsx / yuv2planeX_<template_size>_vsx
 * worker with is_be and bits as arguments.  dither and offset are not
 * forwarded.  yuv2NBPS emits both wrappers.
 */
391 #define yuv2NBPS(bits, BE_LE, is_be, template_size, typeX_t) \
392  yuv2NBPS1(bits, BE_LE, is_be, template_size, typeX_t) \
393  yuv2NBPSX(bits, BE_LE, is_be, template_size, typeX_t)
394 
395 #define yuv2NBPS1(bits, BE_LE, is_be, template_size, typeX_t) \
396 static void yuv2plane1_ ## bits ## BE_LE ## _vsx(const int16_t *src, \
397  uint8_t *dest, int dstW, \
398  const uint8_t *dither, int offset) \
399 { \
400  yuv2plane1_ ## template_size ## _vsx((const typeX_t *) src, \
401  (uint16_t *) dest, dstW, is_be, bits); \
402 }
403 
404 #define yuv2NBPSX(bits, BE_LE, is_be, template_size, typeX_t) \
405 static void yuv2planeX_ ## bits ## BE_LE ## _vsx(const int16_t *filter, int filterSize, \
406  const int16_t **src, uint8_t *dest, int dstW, \
407  const uint8_t *dither, int offset)\
408 { \
409  yuv2planeX_## template_size ## _vsx(filter, \
410  filterSize, (const typeX_t **) src, \
411  (uint16_t *) dest, dstW, is_be, bits); \
412 }
413 
/* 9/10/12/14-bit wrappers use the "nbps" workers with int16_t samples. */
414 yuv2NBPS( 9, BE, 1, nbps, int16_t)
415 yuv2NBPS( 9, LE, 0, nbps, int16_t)
416 yuv2NBPS(10, BE, 1, nbps, int16_t)
417 yuv2NBPS(10, LE, 0, nbps, int16_t)
418 yuv2NBPS(12, BE, 1, nbps, int16_t)
419 yuv2NBPS(12, LE, 0, nbps, int16_t)
420 yuv2NBPS(14, BE, 1, nbps, int16_t)
421 yuv2NBPS(14, LE, 0, nbps, int16_t)
422 
/* 16-bit wrappers use the "16" workers with int32_t samples; the multi-tap
 * variant is only generated when HAVE_POWER8 is set, matching the guard
 * around yuv2planeX_16_vsx. */
423 yuv2NBPS1(16, BE, 1, 16, int32_t)
424 yuv2NBPS1(16, LE, 0, 16, int32_t)
425 #if HAVE_POWER8
426 yuv2NBPSX(16, BE, 1, 16, int32_t)
427 yuv2NBPSX(16, LE, 0, 16, int32_t)
428 #endif
429 
/*
 * WRITERGB: convert the 8 pixels held in R_l/R_r, G_l/G_r, B_l/B_r
 * (4 x int32 each) to bytes and store them in the layout chosen by `target`.
 *
 * Steps: clamp every lane to [0, rgbclip], shift right by shift22, pack with
 * unsigned saturation to 16 bits and then to 8 bits (rd/gd/bd carry the 8
 * results in their first half), interleave according to `target`, store, and
 * advance `dest` by 24 bytes (RGB24/BGR24) or 32 bytes (the four 32-bit
 * formats).  Other `target` values store nothing.
 *
 * Everything is taken from the expansion site: the R/G/B vectors, zero32,
 * rgbclip, shift22, zero16, rd16/gd16/bd16, rd/gd/bd/ad, out0/out1/tmp8,
 * perm3rg0/perm3rg1/perm3tb0/perm3tb1, target and dest.
 *
 * NOTE(review): the 24-bit cases issue two 16-byte stores (offsets 0 and 16)
 * while dest only advances by 24, so 8 bytes beyond each pixel group are
 * written as well -- confirm callers provide that much slack at the row end.
 */
430 #define WRITERGB \
431  R_l = vec_max(R_l, zero32); \
432  R_r = vec_max(R_r, zero32); \
433  G_l = vec_max(G_l, zero32); \
434  G_r = vec_max(G_r, zero32); \
435  B_l = vec_max(B_l, zero32); \
436  B_r = vec_max(B_r, zero32); \
437 \
438  R_l = vec_min(R_l, rgbclip); \
439  R_r = vec_min(R_r, rgbclip); \
440  G_l = vec_min(G_l, rgbclip); \
441  G_r = vec_min(G_r, rgbclip); \
442  B_l = vec_min(B_l, rgbclip); \
443  B_r = vec_min(B_r, rgbclip); \
444 \
445  R_l = vec_sr(R_l, shift22); \
446  R_r = vec_sr(R_r, shift22); \
447  G_l = vec_sr(G_l, shift22); \
448  G_r = vec_sr(G_r, shift22); \
449  B_l = vec_sr(B_l, shift22); \
450  B_r = vec_sr(B_r, shift22); \
451 \
452  rd16 = vec_packsu(R_l, R_r); \
453  gd16 = vec_packsu(G_l, G_r); \
454  bd16 = vec_packsu(B_l, B_r); \
455  rd = vec_packsu(rd16, zero16); \
456  gd = vec_packsu(gd16, zero16); \
457  bd = vec_packsu(bd16, zero16); \
458 \
459  switch(target) { \
460  case AV_PIX_FMT_RGB24: \
461  out0 = vec_perm(rd, gd, perm3rg0); \
462  out0 = vec_perm(out0, bd, perm3tb0); \
463  out1 = vec_perm(rd, gd, perm3rg1); \
464  out1 = vec_perm(out1, bd, perm3tb1); \
465 \
466  vec_vsx_st(out0, 0, dest); \
467  vec_vsx_st(out1, 16, dest); \
468 \
469  dest += 24; \
470  break; \
471  case AV_PIX_FMT_BGR24: \
472  out0 = vec_perm(bd, gd, perm3rg0); \
473  out0 = vec_perm(out0, rd, perm3tb0); \
474  out1 = vec_perm(bd, gd, perm3rg1); \
475  out1 = vec_perm(out1, rd, perm3tb1); \
476 \
477  vec_vsx_st(out0, 0, dest); \
478  vec_vsx_st(out1, 16, dest); \
479 \
480  dest += 24; \
481  break; \
482  case AV_PIX_FMT_BGRA: \
483  out0 = vec_mergeh(bd, gd); \
484  out1 = vec_mergeh(rd, ad); \
485 \
486  tmp8 = (vec_u8) vec_mergeh((vec_u16) out0, (vec_u16) out1); \
487  vec_vsx_st(tmp8, 0, dest); \
488  tmp8 = (vec_u8) vec_mergel((vec_u16) out0, (vec_u16) out1); \
489  vec_vsx_st(tmp8, 16, dest); \
490 \
491  dest += 32; \
492  break; \
493  case AV_PIX_FMT_RGBA: \
494  out0 = vec_mergeh(rd, gd); \
495  out1 = vec_mergeh(bd, ad); \
496 \
497  tmp8 = (vec_u8) vec_mergeh((vec_u16) out0, (vec_u16) out1); \
498  vec_vsx_st(tmp8, 0, dest); \
499  tmp8 = (vec_u8) vec_mergel((vec_u16) out0, (vec_u16) out1); \
500  vec_vsx_st(tmp8, 16, dest); \
501 \
502  dest += 32; \
503  break; \
504  case AV_PIX_FMT_ARGB: \
505  out0 = vec_mergeh(ad, rd); \
506  out1 = vec_mergeh(gd, bd); \
507 \
508  tmp8 = (vec_u8) vec_mergeh((vec_u16) out0, (vec_u16) out1); \
509  vec_vsx_st(tmp8, 0, dest); \
510  tmp8 = (vec_u8) vec_mergel((vec_u16) out0, (vec_u16) out1); \
511  vec_vsx_st(tmp8, 16, dest); \
512 \
513  dest += 32; \
514  break; \
515  case AV_PIX_FMT_ABGR: \
516  out0 = vec_mergeh(ad, bd); \
517  out1 = vec_mergeh(gd, rd); \
518 \
519  tmp8 = (vec_u8) vec_mergeh((vec_u16) out0, (vec_u16) out1); \
520  vec_vsx_st(tmp8, 0, dest); \
521  tmp8 = (vec_u8) vec_mergel((vec_u16) out0, (vec_u16) out1); \
522  vec_vsx_st(tmp8, 16, dest); \
523 \
524  dest += 32; \
525  break; \
526  }
527 
/**
 * Multi-tap vertical filter plus YUV -> packed RGB conversion with one
 * chroma sample per pixel, 8 pixels per loop iteration.
 *
 * Per iteration:
 *   1. accumulate lumSrc[j][i] * lumFilter[j] over lumFilterSize rows into
 *      32-bit lanes starting from 1 << 9, and likewise U and V over
 *      chrFilterSize rows starting from (1 << 9) - (128 << 19);
 *   2. shift all accumulators right by 10;
 *   3. Y = (Y - yuv2rgb_y_offset) * yuv2rgb_y_coeff + (1 << 21),
 *      R = Y + V * v2r,  G = Y + V * v2g + U * u2g,  B = Y + U * u2b;
 *   4. WRITERGB clamps, narrows and stores the pixels and advances dest.
 *
 * alpSrc, y and hasAlpha are not read by this body; the alpha bytes written
 * for 32-bit targets are always 255.
 *
 * The loop steps by 8 while i < dstW, so the last iteration processes a full
 * group of 8 even when dstW is not a multiple of 8.
 * NOTE(review): source rows are read with vec_ld, the aligned AltiVec load --
 * presumably the row pointers are 16-byte aligned; confirm.
 * lumFilterSize and chrFilterSize are assumed not to exceed MAX_FILTER_SIZE.
 */
528 static av_always_inline void
529 yuv2rgb_full_X_vsx_template(SwsInternal *c, const int16_t *lumFilter,
530  const int16_t **lumSrc, int lumFilterSize,
531  const int16_t *chrFilter, const int16_t **chrUSrc,
532  const int16_t **chrVSrc, int chrFilterSize,
533  const int16_t **alpSrc, uint8_t *dest,
534  int dstW, int y, enum AVPixelFormat target, int hasAlpha)
535 {
536  vec_s16 vv;
537  vec_s32 vy32_l, vy32_r, vu32_l, vu32_r, vv32_l, vv32_r, tmp32;
538  vec_s32 R_l, R_r, G_l, G_r, B_l, B_r;
539  vec_s32 tmp, tmp2, tmp3, tmp4;
540  vec_u16 rd16, gd16, bd16;
541  vec_u8 rd, bd, gd, ad, out0, out1, tmp8;
542  vec_s16 vlumFilter[MAX_FILTER_SIZE], vchrFilter[MAX_FILTER_SIZE];
543  const vec_s32 ystart = vec_splats(1 << 9);
544  const vec_s32 uvstart = vec_splats((1 << 9) - (128 << 19));
545  const vec_u16 zero16 = vec_splat_u16(0);
546  const vec_s32 y_offset = vec_splats(c->yuv2rgb_y_offset);
547  const vec_s32 y_coeff = vec_splats(c->yuv2rgb_y_coeff);
548  const vec_s32 y_add = vec_splats(1 << 21);
549  const vec_s32 v2r_coeff = vec_splats(c->yuv2rgb_v2r_coeff);
550  const vec_s32 v2g_coeff = vec_splats(c->yuv2rgb_v2g_coeff);
551  const vec_s32 u2g_coeff = vec_splats(c->yuv2rgb_u2g_coeff);
552  const vec_s32 u2b_coeff = vec_splats(c->yuv2rgb_u2b_coeff);
553  const vec_s32 rgbclip = vec_splats(1 << 30);
554  const vec_s32 zero32 = vec_splat_s32(0);
555  const vec_u32 shift22 = vec_splats(22U);
556  const vec_u32 shift10 = vec_splat_u32(10);
557  int i, j;
558 
559  // Various permutations
// perm3rg0/1 spread bytes of the first two colours into 3-byte slots (third
// byte is a placeholder); perm3tb0/1 then drop the third colour into it.
// Initialiser entries not listed are zero.
560  const vec_u8 perm3rg0 = (vec_u8) {0x0, 0x10, 0,
561  0x1, 0x11, 0,
562  0x2, 0x12, 0,
563  0x3, 0x13, 0,
564  0x4, 0x14, 0,
565  0x5 };
566  const vec_u8 perm3rg1 = (vec_u8) { 0x15, 0,
567  0x6, 0x16, 0,
568  0x7, 0x17, 0 };
569  const vec_u8 perm3tb0 = (vec_u8) {0x0, 0x1, 0x10,
570  0x3, 0x4, 0x11,
571  0x6, 0x7, 0x12,
572  0x9, 0xa, 0x13,
573  0xc, 0xd, 0x14,
574  0xf };
575  const vec_u8 perm3tb1 = (vec_u8) { 0x0, 0x15,
576  0x2, 0x3, 0x16,
577  0x5, 0x6, 0x17 };
578 
// Alpha is a constant 255 in this template.
579  ad = vec_splats((uint8_t) 255);
580 
// Broadcast every filter coefficient across a vector once, up front.
581  for (i = 0; i < lumFilterSize; i++)
582  vlumFilter[i] = vec_splats(lumFilter[i]);
583  for (i = 0; i < chrFilterSize; i++)
584  vchrFilter[i] = vec_splats(chrFilter[i]);
585 
586  for (i = 0; i < dstW; i += 8) {
587  vy32_l =
588  vy32_r = ystart;
589  vu32_l =
590  vu32_r =
591  vv32_l =
592  vv32_r = uvstart;
593 
// Luma: widening even/odd multiplies, merged back into pixel order, then
// accumulated with saturation.
594  for (j = 0; j < lumFilterSize; j++) {
595  vv = vec_ld(0, &lumSrc[j][i]);
596  tmp = vec_mule(vv, vlumFilter[j]);
597  tmp2 = vec_mulo(vv, vlumFilter[j]);
598  tmp3 = vec_mergeh(tmp, tmp2);
599  tmp4 = vec_mergel(tmp, tmp2);
600 
601  vy32_l = vec_adds(vy32_l, tmp3);
602  vy32_r = vec_adds(vy32_r, tmp4);
603  }
604 
// Chroma: the same scheme for U and then V, sharing the chroma coefficients.
605  for (j = 0; j < chrFilterSize; j++) {
606  vv = vec_ld(0, &chrUSrc[j][i]);
607  tmp = vec_mule(vv, vchrFilter[j]);
608  tmp2 = vec_mulo(vv, vchrFilter[j]);
609  tmp3 = vec_mergeh(tmp, tmp2);
610  tmp4 = vec_mergel(tmp, tmp2);
611 
612  vu32_l = vec_adds(vu32_l, tmp3);
613  vu32_r = vec_adds(vu32_r, tmp4);
614 
615  vv = vec_ld(0, &chrVSrc[j][i]);
616  tmp = vec_mule(vv, vchrFilter[j]);
617  tmp2 = vec_mulo(vv, vchrFilter[j]);
618  tmp3 = vec_mergeh(tmp, tmp2);
619  tmp4 = vec_mergel(tmp, tmp2);
620 
621  vv32_l = vec_adds(vv32_l, tmp3);
622  vv32_r = vec_adds(vv32_r, tmp4);
623  }
624 
625  vy32_l = vec_sra(vy32_l, shift10);
626  vy32_r = vec_sra(vy32_r, shift10);
627  vu32_l = vec_sra(vu32_l, shift10);
628  vu32_r = vec_sra(vu32_r, shift10);
629  vv32_l = vec_sra(vv32_l, shift10);
630  vv32_r = vec_sra(vv32_r, shift10);
631 
// Y = (Y - y_offset) * y_coeff + (1 << 21)
632  vy32_l = vec_sub(vy32_l, y_offset);
633  vy32_r = vec_sub(vy32_r, y_offset);
634  vy32_l = vec_mul(vy32_l, y_coeff);
635  vy32_r = vec_mul(vy32_r, y_coeff);
636  vy32_l = vec_add(vy32_l, y_add);
637  vy32_r = vec_add(vy32_r, y_add);
638 
// R = Y + V * v2r;  G = Y + V * v2g + U * u2g
639  R_l = vec_mul(vv32_l, v2r_coeff);
640  R_l = vec_add(R_l, vy32_l);
641  R_r = vec_mul(vv32_r, v2r_coeff);
642  R_r = vec_add(R_r, vy32_r);
643  G_l = vec_mul(vv32_l, v2g_coeff);
644  tmp32 = vec_mul(vu32_l, u2g_coeff);
645  G_l = vec_add(G_l, vy32_l);
646  G_l = vec_add(G_l, tmp32);
647  G_r = vec_mul(vv32_r, v2g_coeff);
648  tmp32 = vec_mul(vu32_r, u2g_coeff);
649  G_r = vec_add(G_r, vy32_r);
650  G_r = vec_add(G_r, tmp32);
651 
// B = Y + U * u2b
652  B_l = vec_mul(vu32_l, u2b_coeff);
653  B_l = vec_add(B_l, vy32_l);
654  B_r = vec_mul(vu32_r, u2b_coeff);
655  B_r = vec_add(B_r, vy32_r);
656 
657  WRITERGB
658  }
659 }
660 
/*
 * SETUP: blend 8 int16 samples from two rows as buf0 * alpha1 + buf1 * alpha
 * in 32-bit precision.
 *
 * x is used as scratch for the two aligned loads (vec_ld).  The widening
 * even/odd products are merged back together; the sums end up in tmp3
 * (merge-high half) and tmp4 (merge-low half).  tmp, tmp2, tmp3, tmp4, tmp5
 * and tmp6 must exist at the expansion site and are all overwritten.
 */
661 #define SETUP(x, buf0, alpha1, buf1, alpha) { \
662  x = vec_ld(0, buf0); \
663  tmp = vec_mule(x, alpha1); \
664  tmp2 = vec_mulo(x, alpha1); \
665  tmp3 = vec_mergeh(tmp, tmp2); \
666  tmp4 = vec_mergel(tmp, tmp2); \
667 \
668  x = vec_ld(0, buf1); \
669  tmp = vec_mule(x, alpha); \
670  tmp2 = vec_mulo(x, alpha); \
671  tmp5 = vec_mergeh(tmp, tmp2); \
672  tmp6 = vec_mergel(tmp, tmp2); \
673 \
674  tmp3 = vec_add(tmp3, tmp5); \
675  tmp4 = vec_add(tmp4, tmp6); \
676 }
677 
678 
/**
 * Two-row blend plus YUV -> packed RGB conversion with one chroma sample per
 * pixel, 8 pixels per loop iteration.
 *
 * Luma (and alpha) are blended as row0 * (4096 - yalpha) + row1 * yalpha,
 * chroma as row0 * (4096 - uvalpha) + row1 * uvalpha, via SETUP.  Luma is
 * shifted right by 10; chroma has 128 << 19 subtracted first and is then
 * shifted right by 10.  The colour matrix afterwards is
 *   Y = (Y - yuv2rgb_y_offset) * yuv2rgb_y_coeff + (1 << 21),
 *   R = Y + V * v2r,  G = Y + V * v2g + U * u2g,  B = Y + U * u2b,
 * and WRITERGB clamps, narrows, stores and advances dest.
 *
 * With hasAlpha set, alpha is (blend + (1 << 18)) >> 19 packed to bytes with
 * saturation; otherwise the alpha bytes are 255.  abuf is only dereferenced
 * when hasAlpha is non-zero.  y is not read by this body.
 *
 * The loop steps by 8 while i < dstW, so the last iteration processes a full
 * group of 8 even when dstW is not a multiple of 8.
 */
679 static av_always_inline void
680 yuv2rgb_full_2_vsx_template(SwsInternal *c, const int16_t *buf[2],
681  const int16_t *ubuf[2], const int16_t *vbuf[2],
682  const int16_t *abuf[2], uint8_t *dest, int dstW,
683  int yalpha, int uvalpha, int y,
684  enum AVPixelFormat target, int hasAlpha)
685 {
686  const int16_t *buf0 = buf[0], *buf1 = buf[1],
687  *ubuf0 = ubuf[0], *ubuf1 = ubuf[1],
688  *vbuf0 = vbuf[0], *vbuf1 = vbuf[1],
689  *abuf0 = hasAlpha ? abuf[0] : NULL,
690  *abuf1 = hasAlpha ? abuf[1] : NULL;
// Weights of the first row; the blend weights sum to 4096.
691  const int16_t yalpha1 = 4096 - yalpha;
692  const int16_t uvalpha1 = 4096 - uvalpha;
693  vec_s16 vy, vu, vv, A = vec_splat_s16(0);
694  vec_s32 vy32_l, vy32_r, vu32_l, vu32_r, vv32_l, vv32_r, tmp32;
695  vec_s32 R_l, R_r, G_l, G_r, B_l, B_r;
696  vec_s32 tmp, tmp2, tmp3, tmp4, tmp5, tmp6;
697  vec_u16 rd16, gd16, bd16;
698  vec_u8 rd, bd, gd, ad, out0, out1, tmp8;
699  const vec_s16 vyalpha1 = vec_splats(yalpha1);
700  const vec_s16 vuvalpha1 = vec_splats(uvalpha1);
701  const vec_s16 vyalpha = vec_splats((int16_t) yalpha);
702  const vec_s16 vuvalpha = vec_splats((int16_t) uvalpha);
703  const vec_u16 zero16 = vec_splat_u16(0);
704  const vec_s32 y_offset = vec_splats(c->yuv2rgb_y_offset);
705  const vec_s32 y_coeff = vec_splats(c->yuv2rgb_y_coeff);
706  const vec_s32 y_add = vec_splats(1 << 21);
707  const vec_s32 v2r_coeff = vec_splats(c->yuv2rgb_v2r_coeff);
708  const vec_s32 v2g_coeff = vec_splats(c->yuv2rgb_v2g_coeff);
709  const vec_s32 u2g_coeff = vec_splats(c->yuv2rgb_u2g_coeff);
710  const vec_s32 u2b_coeff = vec_splats(c->yuv2rgb_u2b_coeff);
711  const vec_s32 rgbclip = vec_splats(1 << 30);
712  const vec_s32 zero32 = vec_splat_s32(0);
713  const vec_u32 shift19 = vec_splats(19U);
714  const vec_u32 shift22 = vec_splats(22U);
715  const vec_u32 shift10 = vec_splat_u32(10);
716  const vec_s32 dec128 = vec_splats(128 << 19);
717  const vec_s32 add18 = vec_splats(1 << 18);
718  int i;
719 
720  // Various permutations
// perm3rg0/1 spread bytes of the first two colours into 3-byte slots (third
// byte is a placeholder); perm3tb0/1 then drop the third colour into it.
// Initialiser entries not listed are zero.
721  const vec_u8 perm3rg0 = (vec_u8) {0x0, 0x10, 0,
722  0x1, 0x11, 0,
723  0x2, 0x12, 0,
724  0x3, 0x13, 0,
725  0x4, 0x14, 0,
726  0x5 };
727  const vec_u8 perm3rg1 = (vec_u8) { 0x15, 0,
728  0x6, 0x16, 0,
729  0x7, 0x17, 0 };
730  const vec_u8 perm3tb0 = (vec_u8) {0x0, 0x1, 0x10,
731  0x3, 0x4, 0x11,
732  0x6, 0x7, 0x12,
733  0x9, 0xa, 0x13,
734  0xc, 0xd, 0x14,
735  0xf };
736  const vec_u8 perm3tb1 = (vec_u8) { 0x0, 0x15,
737  0x2, 0x3, 0x16,
738  0x5, 0x6, 0x17 };
739 
740  av_assert2(yalpha <= 4096U);
741  av_assert2(uvalpha <= 4096U);
742 
743  for (i = 0; i < dstW; i += 8) {
// SETUP leaves the blended 32-bit values in tmp3 / tmp4.
744  SETUP(vy, &buf0[i], vyalpha1, &buf1[i], vyalpha);
745  vy32_l = vec_sra(tmp3, shift10);
746  vy32_r = vec_sra(tmp4, shift10);
747 
748  SETUP(vu, &ubuf0[i], vuvalpha1, &ubuf1[i], vuvalpha);
749  tmp3 = vec_sub(tmp3, dec128);
750  tmp4 = vec_sub(tmp4, dec128);
751  vu32_l = vec_sra(tmp3, shift10);
752  vu32_r = vec_sra(tmp4, shift10);
753 
754  SETUP(vv, &vbuf0[i], vuvalpha1, &vbuf1[i], vuvalpha);
755  tmp3 = vec_sub(tmp3, dec128);
756  tmp4 = vec_sub(tmp4, dec128);
757  vv32_l = vec_sra(tmp3, shift10);
758  vv32_r = vec_sra(tmp4, shift10);
759 
760  if (hasAlpha) {
// Alpha: rounded shift by 19, then saturating packs down to bytes.
761  SETUP(A, &abuf0[i], vyalpha1, &abuf1[i], vyalpha);
762  tmp3 = vec_add(tmp3, add18);
763  tmp4 = vec_add(tmp4, add18);
764  tmp3 = vec_sra(tmp3, shift19);
765  tmp4 = vec_sra(tmp4, shift19);
766  A = vec_packs(tmp3, tmp4);
767  ad = vec_packsu(A, (vec_s16) zero16);
768  } else {
769  ad = vec_splats((uint8_t) 255);
770  }
771 
// Y = (Y - y_offset) * y_coeff + (1 << 21)
772  vy32_l = vec_sub(vy32_l, y_offset);
773  vy32_r = vec_sub(vy32_r, y_offset);
774  vy32_l = vec_mul(vy32_l, y_coeff);
775  vy32_r = vec_mul(vy32_r, y_coeff);
776  vy32_l = vec_add(vy32_l, y_add);
777  vy32_r = vec_add(vy32_r, y_add);
778 
// R = Y + V * v2r;  G = Y + V * v2g + U * u2g
779  R_l = vec_mul(vv32_l, v2r_coeff);
780  R_l = vec_add(R_l, vy32_l);
781  R_r = vec_mul(vv32_r, v2r_coeff);
782  R_r = vec_add(R_r, vy32_r);
783  G_l = vec_mul(vv32_l, v2g_coeff);
784  tmp32 = vec_mul(vu32_l, u2g_coeff);
785  G_l = vec_add(G_l, vy32_l);
786  G_l = vec_add(G_l, tmp32);
787  G_r = vec_mul(vv32_r, v2g_coeff);
788  tmp32 = vec_mul(vu32_r, u2g_coeff);
789  G_r = vec_add(G_r, vy32_r);
790  G_r = vec_add(G_r, tmp32);
791 
// B = Y + U * u2b
792  B_l = vec_mul(vu32_l, u2b_coeff);
793  B_l = vec_add(B_l, vy32_l);
794  B_r = vec_mul(vu32_r, u2b_coeff);
795  B_r = vec_add(B_r, vy32_r);
796 
797  WRITERGB
798  }
799 }
800 
801 static av_always_inline void
802 yuv2rgb_2_vsx_template(SwsInternal *c, const int16_t *buf[2],
803  const int16_t *ubuf[2], const int16_t *vbuf[2],
804  const int16_t *abuf[2], uint8_t *dest, int dstW,
805  int yalpha, int uvalpha, int y,
806  enum AVPixelFormat target, int hasAlpha)
807 {
808  const int16_t *buf0 = buf[0], *buf1 = buf[1],
809  *ubuf0 = ubuf[0], *ubuf1 = ubuf[1],
810  *vbuf0 = vbuf[0], *vbuf1 = vbuf[1],
811  *abuf0 = hasAlpha ? abuf[0] : NULL,
812  *abuf1 = hasAlpha ? abuf[1] : NULL;
813  const int16_t yalpha1 = 4096 - yalpha;
814  const int16_t uvalpha1 = 4096 - uvalpha;
815  vec_s16 vy, vu, vv, A = vec_splat_s16(0);
816  vec_s32 vy32_l, vy32_r, vu32_l, vu32_r, vv32_l, vv32_r, tmp32;
817  vec_s32 R_l, R_r, G_l, G_r, B_l, B_r, vud32_l, vud32_r, vvd32_l, vvd32_r;
818  vec_s32 tmp, tmp2, tmp3, tmp4, tmp5, tmp6;
819  vec_u16 rd16, gd16, bd16;
820  vec_u8 rd, bd, gd, ad, out0, out1, tmp8;
821  const vec_s16 vyalpha1 = vec_splats(yalpha1);
822  const vec_s16 vuvalpha1 = vec_splats(uvalpha1);
823  const vec_s16 vyalpha = vec_splats((int16_t) yalpha);
824  const vec_s16 vuvalpha = vec_splats((int16_t) uvalpha);
825  const vec_u16 zero16 = vec_splat_u16(0);
826  const vec_s32 y_offset = vec_splats(c->yuv2rgb_y_offset);
827  const vec_s32 y_coeff = vec_splats(c->yuv2rgb_y_coeff);
828  const vec_s32 y_add = vec_splats(1 << 21);
829  const vec_s32 v2r_coeff = vec_splats(c->yuv2rgb_v2r_coeff);
830  const vec_s32 v2g_coeff = vec_splats(c->yuv2rgb_v2g_coeff);
831  const vec_s32 u2g_coeff = vec_splats(c->yuv2rgb_u2g_coeff);
832  const vec_s32 u2b_coeff = vec_splats(c->yuv2rgb_u2b_coeff);
833  const vec_s32 rgbclip = vec_splats(1 << 30);
834  const vec_s32 zero32 = vec_splat_s32(0);
835  const vec_u32 shift19 = vec_splats(19U);
836  const vec_u32 shift22 = vec_splats(22U);
837  const vec_u32 shift10 = vec_splat_u32(10);
838  const vec_s32 dec128 = vec_splats(128 << 19);
839  const vec_s32 add18 = vec_splats(1 << 18);
840  int i;
841 
842  // Various permutations
843  const vec_u8 doubleleft = (vec_u8) {0, 1, 2, 3,
844  0, 1, 2, 3,
845  4, 5, 6, 7,
846  4, 5, 6, 7 };
847  const vec_u8 doubleright = (vec_u8) {8, 9, 10, 11,
848  8, 9, 10, 11,
849  12, 13, 14, 15,
850  12, 13, 14, 15 };
851  const vec_u8 perm3rg0 = (vec_u8) {0x0, 0x10, 0,
852  0x1, 0x11, 0,
853  0x2, 0x12, 0,
854  0x3, 0x13, 0,
855  0x4, 0x14, 0,
856  0x5 };
857  const vec_u8 perm3rg1 = (vec_u8) { 0x15, 0,
858  0x6, 0x16, 0,
859  0x7, 0x17, 0 };
860  const vec_u8 perm3tb0 = (vec_u8) {0x0, 0x1, 0x10,
861  0x3, 0x4, 0x11,
862  0x6, 0x7, 0x12,
863  0x9, 0xa, 0x13,
864  0xc, 0xd, 0x14,
865  0xf };
866  const vec_u8 perm3tb1 = (vec_u8) { 0x0, 0x15,
867  0x2, 0x3, 0x16,
868  0x5, 0x6, 0x17 };
869 
870  av_assert2(yalpha <= 4096U);
871  av_assert2(uvalpha <= 4096U);
872 
873  for (i = 0; i < (dstW + 1) >> 1; i += 8) {
874  SETUP(vy, &buf0[i * 2], vyalpha1, &buf1[i * 2], vyalpha);
875  vy32_l = vec_sra(tmp3, shift10);
876  vy32_r = vec_sra(tmp4, shift10);
877 
878  SETUP(vu, &ubuf0[i], vuvalpha1, &ubuf1[i], vuvalpha);
879  tmp3 = vec_sub(tmp3, dec128);
880  tmp4 = vec_sub(tmp4, dec128);
881  vu32_l = vec_sra(tmp3, shift10);
882  vu32_r = vec_sra(tmp4, shift10);
883 
884  SETUP(vv, &vbuf0[i], vuvalpha1, &vbuf1[i], vuvalpha);
885  tmp3 = vec_sub(tmp3, dec128);
886  tmp4 = vec_sub(tmp4, dec128);
887  vv32_l = vec_sra(tmp3, shift10);
888  vv32_r = vec_sra(tmp4, shift10);
889 
890  if (hasAlpha) {
891  SETUP(A, &abuf0[i], vyalpha1, &abuf1[i], vyalpha);
892  tmp3 = vec_add(tmp3, add18);
893  tmp4 = vec_add(tmp4, add18);
894  tmp3 = vec_sra(tmp3, shift19);
895  tmp4 = vec_sra(tmp4, shift19);
896  A = vec_packs(tmp3, tmp4);
897  ad = vec_packsu(A, (vec_s16) zero16);
898  } else {
899  ad = vec_splats((uint8_t) 255);
900  }
901 
902  vy32_l = vec_sub(vy32_l, y_offset);
903  vy32_r = vec_sub(vy32_r, y_offset);
904  vy32_l = vec_mul(vy32_l, y_coeff);
905  vy32_r = vec_mul(vy32_r, y_coeff);
906  vy32_l = vec_add(vy32_l, y_add);
907  vy32_r = vec_add(vy32_r, y_add);
908 
909  // Use the first UV half
910  vud32_l = vec_perm(vu32_l, vu32_l, doubleleft);
911  vud32_r = vec_perm(vu32_l, vu32_l, doubleright);
912  vvd32_l = vec_perm(vv32_l, vv32_l, doubleleft);
913  vvd32_r = vec_perm(vv32_l, vv32_l, doubleright);
914 
915  R_l = vec_mul(vvd32_l, v2r_coeff);
916  R_l = vec_add(R_l, vy32_l);
917  R_r = vec_mul(vvd32_r, v2r_coeff);
918  R_r = vec_add(R_r, vy32_r);
919  G_l = vec_mul(vvd32_l, v2g_coeff);
920  tmp32 = vec_mul(vud32_l, u2g_coeff);
921  G_l = vec_add(G_l, vy32_l);
922  G_l = vec_add(G_l, tmp32);
923  G_r = vec_mul(vvd32_r, v2g_coeff);
924  tmp32 = vec_mul(vud32_r, u2g_coeff);
925  G_r = vec_add(G_r, vy32_r);
926  G_r = vec_add(G_r, tmp32);
927 
928  B_l = vec_mul(vud32_l, u2b_coeff);
929  B_l = vec_add(B_l, vy32_l);
930  B_r = vec_mul(vud32_r, u2b_coeff);
931  B_r = vec_add(B_r, vy32_r);
932 
933  WRITERGB
934 
935  // New Y for the second half
936  SETUP(vy, &buf0[i * 2 + 8], vyalpha1, &buf1[i * 2 + 8], vyalpha);
937  vy32_l = vec_sra(tmp3, shift10);
938  vy32_r = vec_sra(tmp4, shift10);
939 
940  vy32_l = vec_sub(vy32_l, y_offset);
941  vy32_r = vec_sub(vy32_r, y_offset);
942  vy32_l = vec_mul(vy32_l, y_coeff);
943  vy32_r = vec_mul(vy32_r, y_coeff);
944  vy32_l = vec_add(vy32_l, y_add);
945  vy32_r = vec_add(vy32_r, y_add);
946 
947  // Second UV half
948  vud32_l = vec_perm(vu32_r, vu32_r, doubleleft);
949  vud32_r = vec_perm(vu32_r, vu32_r, doubleright);
950  vvd32_l = vec_perm(vv32_r, vv32_r, doubleleft);
951  vvd32_r = vec_perm(vv32_r, vv32_r, doubleright);
952 
953  R_l = vec_mul(vvd32_l, v2r_coeff);
954  R_l = vec_add(R_l, vy32_l);
955  R_r = vec_mul(vvd32_r, v2r_coeff);
956  R_r = vec_add(R_r, vy32_r);
957  G_l = vec_mul(vvd32_l, v2g_coeff);
958  tmp32 = vec_mul(vud32_l, u2g_coeff);
959  G_l = vec_add(G_l, vy32_l);
960  G_l = vec_add(G_l, tmp32);
961  G_r = vec_mul(vvd32_r, v2g_coeff);
962  tmp32 = vec_mul(vud32_r, u2g_coeff);
963  G_r = vec_add(G_r, vy32_r);
964  G_r = vec_add(G_r, tmp32);
965 
966  B_l = vec_mul(vud32_l, u2b_coeff);
967  B_l = vec_add(B_l, vy32_l);
968  B_r = vec_mul(vud32_r, u2b_coeff);
969  B_r = vec_add(B_r, vy32_r);
970 
971  WRITERGB
972  }
973 }
974 
975 #undef SETUP
976 
977 static av_always_inline void
978 yuv2rgb_full_1_vsx_template(SwsInternal *c, const int16_t *buf0,
979  const int16_t *ubuf[2], const int16_t *vbuf[2],
980  const int16_t *abuf0, uint8_t *dest, int dstW,
981  int uvalpha, int y, enum AVPixelFormat target,
982  int hasAlpha)
983 {
984  const int16_t *ubuf0 = ubuf[0], *vbuf0 = vbuf[0];
985  const int16_t *ubuf1 = ubuf[1], *vbuf1 = vbuf[1];
986  vec_s16 vy, vu, vv, A = vec_splat_s16(0), tmp16;
987  vec_s32 vy32_l, vy32_r, vu32_l, vu32_r, vv32_l, vv32_r, tmp32, tmp32_2;
988  vec_s32 R_l, R_r, G_l, G_r, B_l, B_r;
989  vec_u16 rd16, gd16, bd16;
990  vec_u8 rd, bd, gd, ad, out0, out1, tmp8;
991  const vec_u16 zero16 = vec_splat_u16(0);
992  const vec_s32 y_offset = vec_splats(c->yuv2rgb_y_offset);
993  const vec_s32 y_coeff = vec_splats(c->yuv2rgb_y_coeff);
994  const vec_s32 y_add = vec_splats(1 << 21);
995  const vec_s32 v2r_coeff = vec_splats(c->yuv2rgb_v2r_coeff);
996  const vec_s32 v2g_coeff = vec_splats(c->yuv2rgb_v2g_coeff);
997  const vec_s32 u2g_coeff = vec_splats(c->yuv2rgb_u2g_coeff);
998  const vec_s32 u2b_coeff = vec_splats(c->yuv2rgb_u2b_coeff);
999  const vec_s32 rgbclip = vec_splats(1 << 30);
1000  const vec_s32 zero32 = vec_splat_s32(0);
1001  const vec_u32 shift2 = vec_splat_u32(2);
1002  const vec_u32 shift22 = vec_splats(22U);
1003  const vec_u16 sub7 = vec_splats((uint16_t) (128 << 7));
1004  const vec_u16 sub8 = vec_splats((uint16_t) (128 << 8));
1005  const vec_s16 mul4 = vec_splat_s16(4);
1006  const vec_s16 mul8 = vec_splat_s16(8);
1007  const vec_s16 add64 = vec_splat_s16(64);
1008  const vec_u16 shift7 = vec_splat_u16(7);
1009  const vec_s16 max255 = vec_splat_s16(255);
1010  int i;
1011 
1012  // Various permutations
1013  const vec_u8 perm3rg0 = (vec_u8) {0x0, 0x10, 0,
1014  0x1, 0x11, 0,
1015  0x2, 0x12, 0,
1016  0x3, 0x13, 0,
1017  0x4, 0x14, 0,
1018  0x5 };
1019  const vec_u8 perm3rg1 = (vec_u8) { 0x15, 0,
1020  0x6, 0x16, 0,
1021  0x7, 0x17, 0 };
1022  const vec_u8 perm3tb0 = (vec_u8) {0x0, 0x1, 0x10,
1023  0x3, 0x4, 0x11,
1024  0x6, 0x7, 0x12,
1025  0x9, 0xa, 0x13,
1026  0xc, 0xd, 0x14,
1027  0xf };
1028  const vec_u8 perm3tb1 = (vec_u8) { 0x0, 0x15,
1029  0x2, 0x3, 0x16,
1030  0x5, 0x6, 0x17 };
1031 
1032  for (i = 0; i < dstW; i += 8) { // The x86 asm also overwrites padding bytes.
1033  vy = vec_ld(0, &buf0[i]);
1034  vy32_l = vec_unpackh(vy);
1035  vy32_r = vec_unpackl(vy);
1036  vy32_l = vec_sl(vy32_l, shift2);
1037  vy32_r = vec_sl(vy32_r, shift2);
1038 
1039  vu = vec_ld(0, &ubuf0[i]);
1040  vv = vec_ld(0, &vbuf0[i]);
1041  if (uvalpha < 2048) {
1042  vu = (vec_s16) vec_sub((vec_u16) vu, sub7);
1043  vv = (vec_s16) vec_sub((vec_u16) vv, sub7);
1044 
1045  tmp32 = vec_mule(vu, mul4);
1046  tmp32_2 = vec_mulo(vu, mul4);
1047  vu32_l = vec_mergeh(tmp32, tmp32_2);
1048  vu32_r = vec_mergel(tmp32, tmp32_2);
1049  tmp32 = vec_mule(vv, mul4);
1050  tmp32_2 = vec_mulo(vv, mul4);
1051  vv32_l = vec_mergeh(tmp32, tmp32_2);
1052  vv32_r = vec_mergel(tmp32, tmp32_2);
1053  } else {
1054  tmp16 = vec_ld(0, &ubuf1[i]);
1055  vu = vec_add(vu, tmp16);
1056  vu = (vec_s16) vec_sub((vec_u16) vu, sub8);
1057  tmp16 = vec_ld(0, &vbuf1[i]);
1058  vv = vec_add(vv, tmp16);
1059  vv = (vec_s16) vec_sub((vec_u16) vv, sub8);
1060 
1061  vu32_l = vec_mule(vu, mul8);
1062  vu32_r = vec_mulo(vu, mul8);
1063  vv32_l = vec_mule(vv, mul8);
1064  vv32_r = vec_mulo(vv, mul8);
1065  }
1066 
1067  if (hasAlpha) {
1068  A = vec_ld(0, &abuf0[i]);
1069  A = vec_add(A, add64);
1070  A = vec_sr(A, shift7);
1071  A = vec_max(A, max255);
1072  ad = vec_packsu(A, (vec_s16) zero16);
1073  } else {
1074  ad = vec_splats((uint8_t) 255);
1075  }
1076 
1077  vy32_l = vec_sub(vy32_l, y_offset);
1078  vy32_r = vec_sub(vy32_r, y_offset);
1079  vy32_l = vec_mul(vy32_l, y_coeff);
1080  vy32_r = vec_mul(vy32_r, y_coeff);
1081  vy32_l = vec_add(vy32_l, y_add);
1082  vy32_r = vec_add(vy32_r, y_add);
1083 
1084  R_l = vec_mul(vv32_l, v2r_coeff);
1085  R_l = vec_add(R_l, vy32_l);
1086  R_r = vec_mul(vv32_r, v2r_coeff);
1087  R_r = vec_add(R_r, vy32_r);
1088  G_l = vec_mul(vv32_l, v2g_coeff);
1089  tmp32 = vec_mul(vu32_l, u2g_coeff);
1090  G_l = vec_add(G_l, vy32_l);
1091  G_l = vec_add(G_l, tmp32);
1092  G_r = vec_mul(vv32_r, v2g_coeff);
1093  tmp32 = vec_mul(vu32_r, u2g_coeff);
1094  G_r = vec_add(G_r, vy32_r);
1095  G_r = vec_add(G_r, tmp32);
1096 
1097  B_l = vec_mul(vu32_l, u2b_coeff);
1098  B_l = vec_add(B_l, vy32_l);
1099  B_r = vec_mul(vu32_r, u2b_coeff);
1100  B_r = vec_add(B_r, vy32_r);
1101 
1102  WRITERGB
1103  }
1104 }
1105 
1106 static av_always_inline void
1107 yuv2rgb_1_vsx_template(SwsInternal *c, const int16_t *buf0,
1108  const int16_t *ubuf[2], const int16_t *vbuf[2],
1109  const int16_t *abuf0, uint8_t *dest, int dstW,
1110  int uvalpha, int y, enum AVPixelFormat target,
1111  int hasAlpha)
1112 {
1113  const int16_t *ubuf0 = ubuf[0], *vbuf0 = vbuf[0];
1114  const int16_t *ubuf1 = ubuf[1], *vbuf1 = vbuf[1];
1115  vec_s16 vy, vu, vv, A = vec_splat_s16(0), tmp16;
1116  vec_s32 vy32_l, vy32_r, vu32_l, vu32_r, vv32_l, vv32_r, tmp32, tmp32_2;
1117  vec_s32 vud32_l, vud32_r, vvd32_l, vvd32_r;
1118  vec_s32 R_l, R_r, G_l, G_r, B_l, B_r;
1119  vec_u16 rd16, gd16, bd16;
1120  vec_u8 rd, bd, gd, ad, out0, out1, tmp8;
1121  const vec_u16 zero16 = vec_splat_u16(0);
1122  const vec_s32 y_offset = vec_splats(c->yuv2rgb_y_offset);
1123  const vec_s32 y_coeff = vec_splats(c->yuv2rgb_y_coeff);
1124  const vec_s32 y_add = vec_splats(1 << 21);
1125  const vec_s32 v2r_coeff = vec_splats(c->yuv2rgb_v2r_coeff);
1126  const vec_s32 v2g_coeff = vec_splats(c->yuv2rgb_v2g_coeff);
1127  const vec_s32 u2g_coeff = vec_splats(c->yuv2rgb_u2g_coeff);
1128  const vec_s32 u2b_coeff = vec_splats(c->yuv2rgb_u2b_coeff);
1129  const vec_s32 rgbclip = vec_splats(1 << 30);
1130  const vec_s32 zero32 = vec_splat_s32(0);
1131  const vec_u32 shift2 = vec_splat_u32(2);
1132  const vec_u32 shift22 = vec_splats(22U);
1133  const vec_u16 sub7 = vec_splats((uint16_t) (128 << 7));
1134  const vec_u16 sub8 = vec_splats((uint16_t) (128 << 8));
1135  const vec_s16 mul4 = vec_splat_s16(4);
1136  const vec_s16 mul8 = vec_splat_s16(8);
1137  const vec_s16 add64 = vec_splat_s16(64);
1138  const vec_u16 shift7 = vec_splat_u16(7);
1139  const vec_s16 max255 = vec_splat_s16(255);
1140  int i;
1141 
1142  // Various permutations
1143  const vec_u8 doubleleft = (vec_u8) {0, 1, 2, 3,
1144  0, 1, 2, 3,
1145  4, 5, 6, 7,
1146  4, 5, 6, 7 };
1147  const vec_u8 doubleright = (vec_u8) {8, 9, 10, 11,
1148  8, 9, 10, 11,
1149  12, 13, 14, 15,
1150  12, 13, 14, 15 };
1151  const vec_u8 perm3rg0 = (vec_u8) {0x0, 0x10, 0,
1152  0x1, 0x11, 0,
1153  0x2, 0x12, 0,
1154  0x3, 0x13, 0,
1155  0x4, 0x14, 0,
1156  0x5 };
1157  const vec_u8 perm3rg1 = (vec_u8) { 0x15, 0,
1158  0x6, 0x16, 0,
1159  0x7, 0x17, 0 };
1160  const vec_u8 perm3tb0 = (vec_u8) {0x0, 0x1, 0x10,
1161  0x3, 0x4, 0x11,
1162  0x6, 0x7, 0x12,
1163  0x9, 0xa, 0x13,
1164  0xc, 0xd, 0x14,
1165  0xf };
1166  const vec_u8 perm3tb1 = (vec_u8) { 0x0, 0x15,
1167  0x2, 0x3, 0x16,
1168  0x5, 0x6, 0x17 };
1169 
1170  for (i = 0; i < (dstW + 1) >> 1; i += 8) { // The x86 asm also overwrites padding bytes.
1171  vy = vec_ld(0, &buf0[i * 2]);
1172  vy32_l = vec_unpackh(vy);
1173  vy32_r = vec_unpackl(vy);
1174  vy32_l = vec_sl(vy32_l, shift2);
1175  vy32_r = vec_sl(vy32_r, shift2);
1176 
1177  vu = vec_ld(0, &ubuf0[i]);
1178  vv = vec_ld(0, &vbuf0[i]);
1179  if (uvalpha < 2048) {
1180  vu = (vec_s16) vec_sub((vec_u16) vu, sub7);
1181  vv = (vec_s16) vec_sub((vec_u16) vv, sub7);
1182 
1183  tmp32 = vec_mule(vu, mul4);
1184  tmp32_2 = vec_mulo(vu, mul4);
1185  vu32_l = vec_mergeh(tmp32, tmp32_2);
1186  vu32_r = vec_mergel(tmp32, tmp32_2);
1187  tmp32 = vec_mule(vv, mul4);
1188  tmp32_2 = vec_mulo(vv, mul4);
1189  vv32_l = vec_mergeh(tmp32, tmp32_2);
1190  vv32_r = vec_mergel(tmp32, tmp32_2);
1191  } else {
1192  tmp16 = vec_ld(0, &ubuf1[i]);
1193  vu = vec_add(vu, tmp16);
1194  vu = (vec_s16) vec_sub((vec_u16) vu, sub8);
1195  tmp16 = vec_ld(0, &vbuf1[i]);
1196  vv = vec_add(vv, tmp16);
1197  vv = (vec_s16) vec_sub((vec_u16) vv, sub8);
1198 
1199  vu32_l = vec_mule(vu, mul8);
1200  vu32_r = vec_mulo(vu, mul8);
1201  vv32_l = vec_mule(vv, mul8);
1202  vv32_r = vec_mulo(vv, mul8);
1203  }
1204 
1205  if (hasAlpha) {
1206  A = vec_ld(0, &abuf0[i]);
1207  A = vec_add(A, add64);
1208  A = vec_sr(A, shift7);
1209  A = vec_max(A, max255);
1210  ad = vec_packsu(A, (vec_s16) zero16);
1211  } else {
1212  ad = vec_splats((uint8_t) 255);
1213  }
1214 
1215  vy32_l = vec_sub(vy32_l, y_offset);
1216  vy32_r = vec_sub(vy32_r, y_offset);
1217  vy32_l = vec_mul(vy32_l, y_coeff);
1218  vy32_r = vec_mul(vy32_r, y_coeff);
1219  vy32_l = vec_add(vy32_l, y_add);
1220  vy32_r = vec_add(vy32_r, y_add);
1221 
1222  // Use the first UV half
1223  vud32_l = vec_perm(vu32_l, vu32_l, doubleleft);
1224  vud32_r = vec_perm(vu32_l, vu32_l, doubleright);
1225  vvd32_l = vec_perm(vv32_l, vv32_l, doubleleft);
1226  vvd32_r = vec_perm(vv32_l, vv32_l, doubleright);
1227 
1228  R_l = vec_mul(vvd32_l, v2r_coeff);
1229  R_l = vec_add(R_l, vy32_l);
1230  R_r = vec_mul(vvd32_r, v2r_coeff);
1231  R_r = vec_add(R_r, vy32_r);
1232  G_l = vec_mul(vvd32_l, v2g_coeff);
1233  tmp32 = vec_mul(vud32_l, u2g_coeff);
1234  G_l = vec_add(G_l, vy32_l);
1235  G_l = vec_add(G_l, tmp32);
1236  G_r = vec_mul(vvd32_r, v2g_coeff);
1237  tmp32 = vec_mul(vud32_r, u2g_coeff);
1238  G_r = vec_add(G_r, vy32_r);
1239  G_r = vec_add(G_r, tmp32);
1240 
1241  B_l = vec_mul(vud32_l, u2b_coeff);
1242  B_l = vec_add(B_l, vy32_l);
1243  B_r = vec_mul(vud32_r, u2b_coeff);
1244  B_r = vec_add(B_r, vy32_r);
1245 
1246  WRITERGB
1247 
1248  // New Y for the second half
1249  vy = vec_ld(16, &buf0[i * 2]);
1250  vy32_l = vec_unpackh(vy);
1251  vy32_r = vec_unpackl(vy);
1252  vy32_l = vec_sl(vy32_l, shift2);
1253  vy32_r = vec_sl(vy32_r, shift2);
1254 
1255  vy32_l = vec_sub(vy32_l, y_offset);
1256  vy32_r = vec_sub(vy32_r, y_offset);
1257  vy32_l = vec_mul(vy32_l, y_coeff);
1258  vy32_r = vec_mul(vy32_r, y_coeff);
1259  vy32_l = vec_add(vy32_l, y_add);
1260  vy32_r = vec_add(vy32_r, y_add);
1261 
1262  // Second UV half
1263  vud32_l = vec_perm(vu32_r, vu32_r, doubleleft);
1264  vud32_r = vec_perm(vu32_r, vu32_r, doubleright);
1265  vvd32_l = vec_perm(vv32_r, vv32_r, doubleleft);
1266  vvd32_r = vec_perm(vv32_r, vv32_r, doubleright);
1267 
1268  R_l = vec_mul(vvd32_l, v2r_coeff);
1269  R_l = vec_add(R_l, vy32_l);
1270  R_r = vec_mul(vvd32_r, v2r_coeff);
1271  R_r = vec_add(R_r, vy32_r);
1272  G_l = vec_mul(vvd32_l, v2g_coeff);
1273  tmp32 = vec_mul(vud32_l, u2g_coeff);
1274  G_l = vec_add(G_l, vy32_l);
1275  G_l = vec_add(G_l, tmp32);
1276  G_r = vec_mul(vvd32_r, v2g_coeff);
1277  tmp32 = vec_mul(vud32_r, u2g_coeff);
1278  G_r = vec_add(G_r, vy32_r);
1279  G_r = vec_add(G_r, tmp32);
1280 
1281  B_l = vec_mul(vud32_l, u2b_coeff);
1282  B_l = vec_add(B_l, vy32_l);
1283  B_r = vec_mul(vud32_r, u2b_coeff);
1284  B_r = vec_add(B_r, vy32_r);
1285 
1286  WRITERGB
1287  }
1288 }
1289 
1290 #undef WRITERGB
1291 
/*
 * Emit  <prefix><suffix>_X_vsx(): a fixed-signature entry point that forwards
 * all of its arguments to  <prefix><kind>_X_vsx_template()  and appends the
 * compile-time pixel format and alpha flag.
 */
#define YUV2RGBWRAPPERX(prefix, kind, suffix, pixfmt, alpha)                  \
static void prefix ## suffix ## _X_vsx(SwsInternal *c,                        \
                                       const int16_t *lumFilter,              \
                                       const int16_t **lumSrc,                \
                                       int lumFilterSize,                     \
                                       const int16_t *chrFilter,              \
                                       const int16_t **chrUSrc,               \
                                       const int16_t **chrVSrc,               \
                                       int chrFilterSize,                     \
                                       const int16_t **alpSrc,                \
                                       uint8_t *dest, int dstW, int y)        \
{                                                                             \
    prefix ## kind ## _X_vsx_template(c, lumFilter, lumSrc, lumFilterSize,    \
                                      chrFilter, chrUSrc, chrVSrc,            \
                                      chrFilterSize, alpSrc, dest, dstW, y,   \
                                      pixfmt, alpha);                         \
}
1304 
/*
 * Emit  <prefix><suffix>_2_vsx(): a fixed-signature entry point that forwards
 * all of its arguments to  <prefix><kind>_2_vsx_template()  and appends the
 * compile-time pixel format and alpha flag.
 */
#define YUV2RGBWRAPPERX2(prefix, kind, suffix, pixfmt, alpha)                 \
static void prefix ## suffix ## _2_vsx(SwsInternal *c,                        \
                                       const int16_t *buf[2],                 \
                                       const int16_t *ubuf[2],                \
                                       const int16_t *vbuf[2],                \
                                       const int16_t *abuf[2],                \
                                       uint8_t *dest, int dstW,               \
                                       int yalpha, int uvalpha, int y)        \
{                                                                             \
    prefix ## kind ## _2_vsx_template(c, buf, ubuf, vbuf, abuf, dest, dstW,   \
                                      yalpha, uvalpha, y, pixfmt, alpha);     \
}
1314 
/*
 * Emit  <prefix><suffix>_1_vsx(): a fixed-signature entry point that forwards
 * all of its arguments to  <prefix><kind>_1_vsx_template()  and appends the
 * compile-time pixel format and alpha flag.
 */
#define YUV2RGBWRAPPER(prefix, kind, suffix, pixfmt, alpha)                   \
static void prefix ## suffix ## _1_vsx(SwsInternal *c,                        \
                                       const int16_t *buf0,                   \
                                       const int16_t *ubuf[2],                \
                                       const int16_t *vbuf[2],                \
                                       const int16_t *abuf0,                  \
                                       uint8_t *dest, int dstW,               \
                                       int uvalpha, int y)                    \
{                                                                             \
    prefix ## kind ## _1_vsx_template(c, buf0, ubuf, vbuf, abuf0, dest,       \
                                      dstW, uvalpha, y, pixfmt, alpha);       \
}
1324 
1325 YUV2RGBWRAPPER(yuv2, rgb, bgrx32, AV_PIX_FMT_BGRA, 0)
1326 YUV2RGBWRAPPER(yuv2, rgb, rgbx32, AV_PIX_FMT_RGBA, 0)
1327 YUV2RGBWRAPPER(yuv2, rgb, xrgb32, AV_PIX_FMT_ARGB, 0)
1328 YUV2RGBWRAPPER(yuv2, rgb, xbgr32, AV_PIX_FMT_ABGR, 0)
1329 
1330 YUV2RGBWRAPPER(yuv2, rgb, rgb24, AV_PIX_FMT_RGB24, 0)
1331 YUV2RGBWRAPPER(yuv2, rgb, bgr24, AV_PIX_FMT_BGR24, 0)
1332 
1333 YUV2RGBWRAPPERX2(yuv2, rgb, bgrx32, AV_PIX_FMT_BGRA, 0)
1334 YUV2RGBWRAPPERX2(yuv2, rgb, rgbx32, AV_PIX_FMT_RGBA, 0)
1335 YUV2RGBWRAPPERX2(yuv2, rgb, xrgb32, AV_PIX_FMT_ARGB, 0)
1336 YUV2RGBWRAPPERX2(yuv2, rgb, xbgr32, AV_PIX_FMT_ABGR, 0)
1337 
1338 YUV2RGBWRAPPERX2(yuv2, rgb, rgb24, AV_PIX_FMT_RGB24, 0)
1339 YUV2RGBWRAPPERX2(yuv2, rgb, bgr24, AV_PIX_FMT_BGR24, 0)
1340 
1341 YUV2RGBWRAPPER(yuv2, rgb_full, bgrx32_full, AV_PIX_FMT_BGRA, 0)
1342 YUV2RGBWRAPPER(yuv2, rgb_full, rgbx32_full, AV_PIX_FMT_RGBA, 0)
1343 YUV2RGBWRAPPER(yuv2, rgb_full, xrgb32_full, AV_PIX_FMT_ARGB, 0)
1344 YUV2RGBWRAPPER(yuv2, rgb_full, xbgr32_full, AV_PIX_FMT_ABGR, 0)
1345 
1346 YUV2RGBWRAPPER(yuv2, rgb_full, rgb24_full, AV_PIX_FMT_RGB24, 0)
1347 YUV2RGBWRAPPER(yuv2, rgb_full, bgr24_full, AV_PIX_FMT_BGR24, 0)
1348 
1349 YUV2RGBWRAPPERX2(yuv2, rgb_full, bgrx32_full, AV_PIX_FMT_BGRA, 0)
1350 YUV2RGBWRAPPERX2(yuv2, rgb_full, rgbx32_full, AV_PIX_FMT_RGBA, 0)
1351 YUV2RGBWRAPPERX2(yuv2, rgb_full, xrgb32_full, AV_PIX_FMT_ARGB, 0)
1352 YUV2RGBWRAPPERX2(yuv2, rgb_full, xbgr32_full, AV_PIX_FMT_ABGR, 0)
1353 
1354 YUV2RGBWRAPPERX2(yuv2, rgb_full, rgb24_full, AV_PIX_FMT_RGB24, 0)
1355 YUV2RGBWRAPPERX2(yuv2, rgb_full, bgr24_full, AV_PIX_FMT_BGR24, 0)
1356 
1357 YUV2RGBWRAPPERX(yuv2, rgb_full, bgrx32_full, AV_PIX_FMT_BGRA, 0)
1358 YUV2RGBWRAPPERX(yuv2, rgb_full, rgbx32_full, AV_PIX_FMT_RGBA, 0)
1359 YUV2RGBWRAPPERX(yuv2, rgb_full, xrgb32_full, AV_PIX_FMT_ARGB, 0)
1360 YUV2RGBWRAPPERX(yuv2, rgb_full, xbgr32_full, AV_PIX_FMT_ABGR, 0)
1361 
1362 YUV2RGBWRAPPERX(yuv2, rgb_full, rgb24_full, AV_PIX_FMT_RGB24, 0)
1363 YUV2RGBWRAPPERX(yuv2, rgb_full, bgr24_full, AV_PIX_FMT_BGR24, 0)
1364 
1365 static av_always_inline void
1366 write422(const vec_s16 vy1, const vec_s16 vy2,
1367  const vec_s16 vu, const vec_s16 vv,
1368  uint8_t *dest, const enum AVPixelFormat target)
1369 {
1370  vec_u8 vd1, vd2, tmp;
1371  const vec_u8 yuyv1 = (vec_u8) {
1372  0x0, 0x10, 0x1, 0x18,
1373  0x2, 0x11, 0x3, 0x19,
1374  0x4, 0x12, 0x5, 0x1a,
1375  0x6, 0x13, 0x7, 0x1b };
1376  const vec_u8 yuyv2 = (vec_u8) {
1377  0x8, 0x14, 0x9, 0x1c,
1378  0xa, 0x15, 0xb, 0x1d,
1379  0xc, 0x16, 0xd, 0x1e,
1380  0xe, 0x17, 0xf, 0x1f };
1381  const vec_u8 yvyu1 = (vec_u8) {
1382  0x0, 0x18, 0x1, 0x10,
1383  0x2, 0x19, 0x3, 0x11,
1384  0x4, 0x1a, 0x5, 0x12,
1385  0x6, 0x1b, 0x7, 0x13 };
1386  const vec_u8 yvyu2 = (vec_u8) {
1387  0x8, 0x1c, 0x9, 0x14,
1388  0xa, 0x1d, 0xb, 0x15,
1389  0xc, 0x1e, 0xd, 0x16,
1390  0xe, 0x1f, 0xf, 0x17 };
1391  const vec_u8 uyvy1 = (vec_u8) {
1392  0x10, 0x0, 0x18, 0x1,
1393  0x11, 0x2, 0x19, 0x3,
1394  0x12, 0x4, 0x1a, 0x5,
1395  0x13, 0x6, 0x1b, 0x7 };
1396  const vec_u8 uyvy2 = (vec_u8) {
1397  0x14, 0x8, 0x1c, 0x9,
1398  0x15, 0xa, 0x1d, 0xb,
1399  0x16, 0xc, 0x1e, 0xd,
1400  0x17, 0xe, 0x1f, 0xf };
1401 
1402  vd1 = vec_packsu(vy1, vy2);
1403  vd2 = vec_packsu(vu, vv);
1404 
1405  switch (target) {
1406  case AV_PIX_FMT_YUYV422:
1407  tmp = vec_perm(vd1, vd2, yuyv1);
1408  vec_st(tmp, 0, dest);
1409  tmp = vec_perm(vd1, vd2, yuyv2);
1410  vec_st(tmp, 16, dest);
1411  break;
1412  case AV_PIX_FMT_YVYU422:
1413  tmp = vec_perm(vd1, vd2, yvyu1);
1414  vec_st(tmp, 0, dest);
1415  tmp = vec_perm(vd1, vd2, yvyu2);
1416  vec_st(tmp, 16, dest);
1417  break;
1418  case AV_PIX_FMT_UYVY422:
1419  tmp = vec_perm(vd1, vd2, uyvy1);
1420  vec_st(tmp, 0, dest);
1421  tmp = vec_perm(vd1, vd2, uyvy2);
1422  vec_st(tmp, 16, dest);
1423  break;
1424  }
1425 }
1426 
1427 static av_always_inline void
1428 yuv2422_X_vsx_template(SwsInternal *c, const int16_t *lumFilter,
1429  const int16_t **lumSrc, int lumFilterSize,
1430  const int16_t *chrFilter, const int16_t **chrUSrc,
1431  const int16_t **chrVSrc, int chrFilterSize,
1432  const int16_t **alpSrc, uint8_t *dest, int dstW,
1433  int y, enum AVPixelFormat target)
1434 {
1435  int i, j;
1436  vec_s16 vy1, vy2, vu, vv;
1437  vec_s32 vy32[4], vu32[2], vv32[2], tmp, tmp2, tmp3, tmp4;
1438  vec_s16 vlumFilter[MAX_FILTER_SIZE], vchrFilter[MAX_FILTER_SIZE];
1439  const vec_s32 start = vec_splats(1 << 18);
1440  const vec_u32 shift19 = vec_splats(19U);
1441 
1442  for (i = 0; i < lumFilterSize; i++)
1443  vlumFilter[i] = vec_splats(lumFilter[i]);
1444  for (i = 0; i < chrFilterSize; i++)
1445  vchrFilter[i] = vec_splats(chrFilter[i]);
1446 
1447  for (i = 0; i < ((dstW + 1) >> 1); i += 8) {
1448  vy32[0] =
1449  vy32[1] =
1450  vy32[2] =
1451  vy32[3] =
1452  vu32[0] =
1453  vu32[1] =
1454  vv32[0] =
1455  vv32[1] = start;
1456 
1457  for (j = 0; j < lumFilterSize; j++) {
1458  vv = vec_ld(0, &lumSrc[j][i * 2]);
1459  tmp = vec_mule(vv, vlumFilter[j]);
1460  tmp2 = vec_mulo(vv, vlumFilter[j]);
1461  tmp3 = vec_mergeh(tmp, tmp2);
1462  tmp4 = vec_mergel(tmp, tmp2);
1463 
1464  vy32[0] = vec_adds(vy32[0], tmp3);
1465  vy32[1] = vec_adds(vy32[1], tmp4);
1466 
1467  vv = vec_ld(0, &lumSrc[j][(i + 4) * 2]);
1468  tmp = vec_mule(vv, vlumFilter[j]);
1469  tmp2 = vec_mulo(vv, vlumFilter[j]);
1470  tmp3 = vec_mergeh(tmp, tmp2);
1471  tmp4 = vec_mergel(tmp, tmp2);
1472 
1473  vy32[2] = vec_adds(vy32[2], tmp3);
1474  vy32[3] = vec_adds(vy32[3], tmp4);
1475  }
1476 
1477  for (j = 0; j < chrFilterSize; j++) {
1478  vv = vec_ld(0, &chrUSrc[j][i]);
1479  tmp = vec_mule(vv, vchrFilter[j]);
1480  tmp2 = vec_mulo(vv, vchrFilter[j]);
1481  tmp3 = vec_mergeh(tmp, tmp2);
1482  tmp4 = vec_mergel(tmp, tmp2);
1483 
1484  vu32[0] = vec_adds(vu32[0], tmp3);
1485  vu32[1] = vec_adds(vu32[1], tmp4);
1486 
1487  vv = vec_ld(0, &chrVSrc[j][i]);
1488  tmp = vec_mule(vv, vchrFilter[j]);
1489  tmp2 = vec_mulo(vv, vchrFilter[j]);
1490  tmp3 = vec_mergeh(tmp, tmp2);
1491  tmp4 = vec_mergel(tmp, tmp2);
1492 
1493  vv32[0] = vec_adds(vv32[0], tmp3);
1494  vv32[1] = vec_adds(vv32[1], tmp4);
1495  }
1496 
1497  for (j = 0; j < 4; j++) {
1498  vy32[j] = vec_sra(vy32[j], shift19);
1499  }
1500  for (j = 0; j < 2; j++) {
1501  vu32[j] = vec_sra(vu32[j], shift19);
1502  vv32[j] = vec_sra(vv32[j], shift19);
1503  }
1504 
1505  vy1 = vec_packs(vy32[0], vy32[1]);
1506  vy2 = vec_packs(vy32[2], vy32[3]);
1507  vu = vec_packs(vu32[0], vu32[1]);
1508  vv = vec_packs(vv32[0], vv32[1]);
1509 
1510  write422(vy1, vy2, vu, vv, &dest[i * 4], target);
1511  }
1512 }
1513 
1514 #define SETUP(x, buf0, buf1, alpha) { \
1515  x = vec_ld(0, buf0); \
1516  tmp = vec_mule(x, alpha); \
1517  tmp2 = vec_mulo(x, alpha); \
1518  tmp3 = vec_mergeh(tmp, tmp2); \
1519  tmp4 = vec_mergel(tmp, tmp2); \
1520 \
1521  x = vec_ld(0, buf1); \
1522  tmp = vec_mule(x, alpha); \
1523  tmp2 = vec_mulo(x, alpha); \
1524  tmp5 = vec_mergeh(tmp, tmp2); \
1525  tmp6 = vec_mergel(tmp, tmp2); \
1526 \
1527  tmp3 = vec_add(tmp3, tmp5); \
1528  tmp4 = vec_add(tmp4, tmp6); \
1529 \
1530  tmp3 = vec_sra(tmp3, shift19); \
1531  tmp4 = vec_sra(tmp4, shift19); \
1532  x = vec_packs(tmp3, tmp4); \
1533 }
1534 
1535 static av_always_inline void
1536 yuv2422_2_vsx_template(SwsInternal *c, const int16_t *buf[2],
1537  const int16_t *ubuf[2], const int16_t *vbuf[2],
1538  const int16_t *abuf[2], uint8_t *dest, int dstW,
1539  int yalpha, int uvalpha, int y,
1540  enum AVPixelFormat target)
1541 {
1542  const int16_t *buf0 = buf[0], *buf1 = buf[1],
1543  *ubuf0 = ubuf[0], *ubuf1 = ubuf[1],
1544  *vbuf0 = vbuf[0], *vbuf1 = vbuf[1];
1545  const int16_t yalpha1 = 4096 - yalpha;
1546  const int16_t uvalpha1 = 4096 - uvalpha;
1547  vec_s16 vy1, vy2, vu, vv;
1548  vec_s32 tmp, tmp2, tmp3, tmp4, tmp5, tmp6;
1549  const vec_s16 vyalpha1 = vec_splats(yalpha1);
1550  const vec_s16 vuvalpha1 = vec_splats(uvalpha1);
1551  const vec_u32 shift19 = vec_splats(19U);
1552  int i;
1553  av_assert2(yalpha <= 4096U);
1554  av_assert2(uvalpha <= 4096U);
1555 
1556  for (i = 0; i < ((dstW + 1) >> 1); i += 8) {
1557 
1558  SETUP(vy1, &buf0[i * 2], &buf1[i * 2], vyalpha1)
1559  SETUP(vy2, &buf0[(i + 4) * 2], &buf1[(i + 4) * 2], vyalpha1)
1560  SETUP(vu, &ubuf0[i], &ubuf1[i], vuvalpha1)
1561  SETUP(vv, &vbuf0[i], &vbuf1[i], vuvalpha1)
1562 
1563  write422(vy1, vy2, vu, vv, &dest[i * 4], target);
1564  }
1565 }
1566 
1567 #undef SETUP
1568 
1569 static av_always_inline void
1570 yuv2422_1_vsx_template(SwsInternal *c, const int16_t *buf0,
1571  const int16_t *ubuf[2], const int16_t *vbuf[2],
1572  const int16_t *abuf0, uint8_t *dest, int dstW,
1573  int uvalpha, int y, enum AVPixelFormat target)
1574 {
1575  const int16_t *ubuf0 = ubuf[0], *vbuf0 = vbuf[0];
1576  vec_s16 vy1, vy2, vu, vv, tmp;
1577  const vec_s16 add64 = vec_splats((int16_t) 64);
1578  const vec_s16 add128 = vec_splats((int16_t) 128);
1579  const vec_u16 shift7 = vec_splat_u16(7);
1580  const vec_u16 shift8 = vec_splat_u16(8);
1581  int i;
1582 
1583  if (uvalpha < 2048) {
1584  for (i = 0; i < ((dstW + 1) >> 1); i += 8) {
1585  vy1 = vec_ld(0, &buf0[i * 2]);
1586  vy2 = vec_ld(0, &buf0[(i + 4) * 2]);
1587  vu = vec_ld(0, &ubuf0[i]);
1588  vv = vec_ld(0, &vbuf0[i]);
1589 
1590  vy1 = vec_add(vy1, add64);
1591  vy2 = vec_add(vy2, add64);
1592  vu = vec_add(vu, add64);
1593  vv = vec_add(vv, add64);
1594 
1595  vy1 = vec_sra(vy1, shift7);
1596  vy2 = vec_sra(vy2, shift7);
1597  vu = vec_sra(vu, shift7);
1598  vv = vec_sra(vv, shift7);
1599 
1600  write422(vy1, vy2, vu, vv, &dest[i * 4], target);
1601  }
1602  } else {
1603  const int16_t *ubuf1 = ubuf[1], *vbuf1 = vbuf[1];
1604  for (i = 0; i < ((dstW + 1) >> 1); i += 8) {
1605  vy1 = vec_ld(0, &buf0[i * 2]);
1606  vy2 = vec_ld(0, &buf0[(i + 4) * 2]);
1607  vu = vec_ld(0, &ubuf0[i]);
1608  tmp = vec_ld(0, &ubuf1[i]);
1609  vu = vec_adds(vu, tmp);
1610  vv = vec_ld(0, &vbuf0[i]);
1611  tmp = vec_ld(0, &vbuf1[i]);
1612  vv = vec_adds(vv, tmp);
1613 
1614  vy1 = vec_add(vy1, add64);
1615  vy2 = vec_add(vy2, add64);
1616  vu = vec_adds(vu, add128);
1617  vv = vec_adds(vv, add128);
1618 
1619  vy1 = vec_sra(vy1, shift7);
1620  vy2 = vec_sra(vy2, shift7);
1621  vu = vec_sra(vu, shift8);
1622  vv = vec_sra(vv, shift8);
1623 
1624  write422(vy1, vy2, vu, vv, &dest[i * 4], target);
1625  }
1626  }
1627 }
1628 
/*
 * Emit  <prefix><suffix>_X_vsx(): a fixed-signature entry point that forwards
 * all of its arguments to  <prefix><kind>_X_vsx_template()  and appends the
 * compile-time pixel format.
 */
#define YUV2PACKEDWRAPPERX(prefix, kind, suffix, pixfmt)                      \
static void prefix ## suffix ## _X_vsx(SwsInternal *c,                        \
                                       const int16_t *lumFilter,              \
                                       const int16_t **lumSrc,                \
                                       int lumFilterSize,                     \
                                       const int16_t *chrFilter,              \
                                       const int16_t **chrUSrc,               \
                                       const int16_t **chrVSrc,               \
                                       int chrFilterSize,                     \
                                       const int16_t **alpSrc,                \
                                       uint8_t *dest, int dstW, int y)        \
{                                                                             \
    prefix ## kind ## _X_vsx_template(c, lumFilter, lumSrc, lumFilterSize,    \
                                      chrFilter, chrUSrc, chrVSrc,            \
                                      chrFilterSize, alpSrc, dest, dstW, y,   \
                                      pixfmt);                                \
}
1641 
/*
 * Emit the _X_vsx() entry point via YUV2PACKEDWRAPPERX and, in addition,
 * <prefix><suffix>_2_vsx(), which forwards to
 * <prefix><kind>_2_vsx_template()  with the compile-time pixel format.
 */
#define YUV2PACKEDWRAPPER2(prefix, kind, suffix, pixfmt)                      \
YUV2PACKEDWRAPPERX(prefix, kind, suffix, pixfmt)                              \
static void prefix ## suffix ## _2_vsx(SwsInternal *c,                        \
                                       const int16_t *buf[2],                 \
                                       const int16_t *ubuf[2],                \
                                       const int16_t *vbuf[2],                \
                                       const int16_t *abuf[2],                \
                                       uint8_t *dest, int dstW,               \
                                       int yalpha, int uvalpha, int y)        \
{                                                                             \
    prefix ## kind ## _2_vsx_template(c, buf, ubuf, vbuf, abuf, dest, dstW,   \
                                      yalpha, uvalpha, y, pixfmt);            \
}
1652 
1653 #define YUV2PACKEDWRAPPER(name, base, ext, fmt) \
1654 YUV2PACKEDWRAPPER2(name, base, ext, fmt) \
1655 static void name ## ext ## _1_vsx(SwsInternal *c, const int16_t *buf0, \
1656  const int16_t *ubuf[2], const int16_t *vbuf[2], \
1657  const int16_t *abuf0, uint8_t *dest, int dstW, \
1658  int uvalpha, int y) \
1659 { \
1660  name ## base ## _1_vsx_template(c, buf0, ubuf, vbuf, \
1661  abuf0, dest, dstW, uvalpha, \
1662  y, fmt); \
1663 }
1664 
1665 YUV2PACKEDWRAPPER(yuv2, 422, yuyv422, AV_PIX_FMT_YUYV422)
1666 YUV2PACKEDWRAPPER(yuv2, 422, yvyu422, AV_PIX_FMT_YVYU422)
1667 YUV2PACKEDWRAPPER(yuv2, 422, uyvy422, AV_PIX_FMT_UYVY422)
1668 
1669 static void hyscale_fast_vsx(SwsInternal *c, int16_t *dst, int dstWidth,
1670  const uint8_t *src, int srcW, int xInc)
1671 {
1672  int i;
1673  unsigned int xpos = 0, xx;
1674  vec_u8 vin, vin2, vperm;
1675  vec_s8 vmul, valpha;
1676  vec_s16 vtmp, vtmp2, vtmp3, vtmp4;
1677  vec_u16 vd_l, vd_r, vcoord16[2];
1678  vec_u32 vcoord[4];
1679  const vec_u32 vadd = (vec_u32) {
1680  0,
1681  xInc * 1,
1682  xInc * 2,
1683  xInc * 3,
1684  };
1685  const vec_u16 vadd16 = (vec_u16) { // Modulo math
1686  0,
1687  xInc * 1,
1688  xInc * 2,
1689  xInc * 3,
1690  xInc * 4,
1691  xInc * 5,
1692  xInc * 6,
1693  xInc * 7,
1694  };
1695  const vec_u32 vshift16 = vec_splats((uint32_t) 16);
1696  const vec_u16 vshift9 = vec_splat_u16(9);
1697  const vec_u8 vzero = vec_splat_u8(0);
1698  const vec_u16 vshift = vec_splat_u16(7);
1699 
1700  for (i = 0; i < dstWidth; i += 16) {
1701  vcoord16[0] = vec_splats((uint16_t) xpos);
1702  vcoord16[1] = vec_splats((uint16_t) (xpos + xInc * 8));
1703 
1704  vcoord16[0] = vec_add(vcoord16[0], vadd16);
1705  vcoord16[1] = vec_add(vcoord16[1], vadd16);
1706 
1707  vcoord16[0] = vec_sr(vcoord16[0], vshift9);
1708  vcoord16[1] = vec_sr(vcoord16[1], vshift9);
1709  valpha = (vec_s8) vec_pack(vcoord16[0], vcoord16[1]);
1710 
1711  xx = xpos >> 16;
1712  vin = vec_vsx_ld(0, &src[xx]);
1713 
1714  vcoord[0] = vec_splats(xpos & 0xffff);
1715  vcoord[1] = vec_splats((xpos & 0xffff) + xInc * 4);
1716  vcoord[2] = vec_splats((xpos & 0xffff) + xInc * 8);
1717  vcoord[3] = vec_splats((xpos & 0xffff) + xInc * 12);
1718 
1719  vcoord[0] = vec_add(vcoord[0], vadd);
1720  vcoord[1] = vec_add(vcoord[1], vadd);
1721  vcoord[2] = vec_add(vcoord[2], vadd);
1722  vcoord[3] = vec_add(vcoord[3], vadd);
1723 
1724  vcoord[0] = vec_sr(vcoord[0], vshift16);
1725  vcoord[1] = vec_sr(vcoord[1], vshift16);
1726  vcoord[2] = vec_sr(vcoord[2], vshift16);
1727  vcoord[3] = vec_sr(vcoord[3], vshift16);
1728 
1729  vcoord16[0] = vec_pack(vcoord[0], vcoord[1]);
1730  vcoord16[1] = vec_pack(vcoord[2], vcoord[3]);
1731  vperm = vec_pack(vcoord16[0], vcoord16[1]);
1732 
1733  vin = vec_perm(vin, vin, vperm);
1734 
1735  vin2 = vec_vsx_ld(1, &src[xx]);
1736  vin2 = vec_perm(vin2, vin2, vperm);
1737 
1738  vmul = (vec_s8) vec_sub(vin2, vin);
1739  vtmp = vec_mule(vmul, valpha);
1740  vtmp2 = vec_mulo(vmul, valpha);
1741  vtmp3 = vec_mergeh(vtmp, vtmp2);
1742  vtmp4 = vec_mergel(vtmp, vtmp2);
1743 
1744  vd_l = (vec_u16) vec_mergeh(vin, vzero);
1745  vd_r = (vec_u16) vec_mergel(vin, vzero);
1746  vd_l = vec_sl(vd_l, vshift);
1747  vd_r = vec_sl(vd_r, vshift);
1748 
1749  vd_l = vec_add(vd_l, (vec_u16) vtmp3);
1750  vd_r = vec_add(vd_r, (vec_u16) vtmp4);
1751 
1752  vec_st((vec_s16) vd_l, 0, &dst[i]);
1753  vec_st((vec_s16) vd_r, 0, &dst[i + 8]);
1754 
1755  xpos += xInc * 16;
1756  }
1757  for (i=dstWidth-1; (i*xInc)>>16 >=srcW-1; i--)
1758  dst[i] = src[srcW-1]*128;
1759 }
1760 
1761 #define HCSCALE(in, out) \
1762  vin = vec_vsx_ld(0, &in[xx]); \
1763  vin = vec_perm(vin, vin, vperm); \
1764 \
1765  vin2 = vec_vsx_ld(1, &in[xx]); \
1766  vin2 = vec_perm(vin2, vin2, vperm); \
1767 \
1768  vtmp = vec_mule(vin, valphaxor); \
1769  vtmp2 = vec_mulo(vin, valphaxor); \
1770  vtmp3 = vec_mergeh(vtmp, vtmp2); \
1771  vtmp4 = vec_mergel(vtmp, vtmp2); \
1772 \
1773  vtmp = vec_mule(vin2, valpha); \
1774  vtmp2 = vec_mulo(vin2, valpha); \
1775  vd_l = vec_mergeh(vtmp, vtmp2); \
1776  vd_r = vec_mergel(vtmp, vtmp2); \
1777 \
1778  vd_l = vec_add(vd_l, vtmp3); \
1779  vd_r = vec_add(vd_r, vtmp4); \
1780 \
1781  vec_st((vec_s16) vd_l, 0, &out[i]); \
1782  vec_st((vec_s16) vd_r, 0, &out[i + 8])
1783 
1784 static void hcscale_fast_vsx(SwsInternal *c, int16_t *dst1, int16_t *dst2,
1785  int dstWidth, const uint8_t *src1,
1786  const uint8_t *src2, int srcW, int xInc)
1787 {
1788  int i;
1789  unsigned int xpos = 0, xx;
1790  vec_u8 vin, vin2, vperm;
1791  vec_u8 valpha, valphaxor;
1792  vec_u16 vtmp, vtmp2, vtmp3, vtmp4;
1793  vec_u16 vd_l, vd_r, vcoord16[2];
1794  vec_u32 vcoord[4];
1795  const vec_u8 vxor = vec_splats((uint8_t) 127);
1796  const vec_u32 vadd = (vec_u32) {
1797  0,
1798  xInc * 1,
1799  xInc * 2,
1800  xInc * 3,
1801  };
1802  const vec_u16 vadd16 = (vec_u16) { // Modulo math
1803  0,
1804  xInc * 1,
1805  xInc * 2,
1806  xInc * 3,
1807  xInc * 4,
1808  xInc * 5,
1809  xInc * 6,
1810  xInc * 7,
1811  };
1812  const vec_u32 vshift16 = vec_splats((uint32_t) 16);
1813  const vec_u16 vshift9 = vec_splat_u16(9);
1814 
1815  for (i = 0; i < dstWidth; i += 16) {
1816  vcoord16[0] = vec_splats((uint16_t) xpos);
1817  vcoord16[1] = vec_splats((uint16_t) (xpos + xInc * 8));
1818 
1819  vcoord16[0] = vec_add(vcoord16[0], vadd16);
1820  vcoord16[1] = vec_add(vcoord16[1], vadd16);
1821 
1822  vcoord16[0] = vec_sr(vcoord16[0], vshift9);
1823  vcoord16[1] = vec_sr(vcoord16[1], vshift9);
1824  valpha = vec_pack(vcoord16[0], vcoord16[1]);
1825  valphaxor = vec_xor(valpha, vxor);
1826 
1827  xx = xpos >> 16;
1828 
1829  vcoord[0] = vec_splats(xpos & 0xffff);
1830  vcoord[1] = vec_splats((xpos & 0xffff) + xInc * 4);
1831  vcoord[2] = vec_splats((xpos & 0xffff) + xInc * 8);
1832  vcoord[3] = vec_splats((xpos & 0xffff) + xInc * 12);
1833 
1834  vcoord[0] = vec_add(vcoord[0], vadd);
1835  vcoord[1] = vec_add(vcoord[1], vadd);
1836  vcoord[2] = vec_add(vcoord[2], vadd);
1837  vcoord[3] = vec_add(vcoord[3], vadd);
1838 
1839  vcoord[0] = vec_sr(vcoord[0], vshift16);
1840  vcoord[1] = vec_sr(vcoord[1], vshift16);
1841  vcoord[2] = vec_sr(vcoord[2], vshift16);
1842  vcoord[3] = vec_sr(vcoord[3], vshift16);
1843 
1844  vcoord16[0] = vec_pack(vcoord[0], vcoord[1]);
1845  vcoord16[1] = vec_pack(vcoord[2], vcoord[3]);
1846  vperm = vec_pack(vcoord16[0], vcoord16[1]);
1847 
1848  HCSCALE(src1, dst1);
1849  HCSCALE(src2, dst2);
1850 
1851  xpos += xInc * 16;
1852  }
1853  for (i=dstWidth-1; (i*xInc)>>16 >=srcW-1; i--) {
1854  dst1[i] = src1[srcW-1]*128;
1855  dst2[i] = src2[srcW-1]*128;
1856  }
1857 }
1858 
1859 #undef HCSCALE
1860 
1861 static void hScale16To19_vsx(SwsInternal *c, int16_t *_dst, int dstW,
1862  const uint8_t *_src, const int16_t *filter,
1863  const int32_t *filterPos, int filterSize)
1864 {
1865  const AVPixFmtDescriptor *desc = av_pix_fmt_desc_get(c->opts.src_format);
1866  int i, j;
1867  int32_t *dst = (int32_t *) _dst;
1868  const uint16_t *src = (const uint16_t *) _src;
1869  int bits = desc->comp[0].depth - 1;
1870  int sh = bits - 4;
1871  vec_s16 vfilter, vin;
1872  vec_s32 vout, vtmp, vtmp2, vfilter32_l, vfilter32_r;
1873  const vec_u8 vzero = vec_splat_u8(0);
1874  const vec_u8 vunusedtab[8] = {
1875  (vec_u8) {0x0, 0x1, 0x2, 0x3, 0x4, 0x5, 0x6, 0x7,
1876  0x8, 0x9, 0xa, 0xb, 0xc, 0xd, 0xe, 0xf},
1877  (vec_u8) {0x0, 0x1, 0x10, 0x10, 0x10, 0x10, 0x10, 0x10,
1878  0x10, 0x10, 0x10, 0x10, 0x10, 0x10, 0x10, 0x10},
1879  (vec_u8) {0x0, 0x1, 0x2, 0x3, 0x10, 0x10, 0x10, 0x10,
1880  0x10, 0x10, 0x10, 0x10, 0x10, 0x10, 0x10, 0x10},
1881  (vec_u8) {0x0, 0x1, 0x2, 0x3, 0x4, 0x5, 0x10, 0x10,
1882  0x10, 0x10, 0x10, 0x10, 0x10, 0x10, 0x10, 0x10},
1883  (vec_u8) {0x0, 0x1, 0x2, 0x3, 0x4, 0x5, 0x6, 0x7,
1884  0x10, 0x10, 0x10, 0x10, 0x10, 0x10, 0x10, 0x10},
1885  (vec_u8) {0x0, 0x1, 0x2, 0x3, 0x4, 0x5, 0x6, 0x7,
1886  0x8, 0x9, 0x10, 0x10, 0x10, 0x10, 0x10, 0x10},
1887  (vec_u8) {0x0, 0x1, 0x2, 0x3, 0x4, 0x5, 0x6, 0x7,
1888  0x8, 0x9, 0xa, 0xb, 0x10, 0x10, 0x10, 0x10},
1889  (vec_u8) {0x0, 0x1, 0x2, 0x3, 0x4, 0x5, 0x6, 0x7,
1890  0x8, 0x9, 0xa, 0xb, 0xc, 0xd, 0x10, 0x10},
1891  };
1892  const vec_u8 vunused = vunusedtab[filterSize % 8];
1893 
1894  if ((isAnyRGB(c->opts.src_format) || c->opts.src_format==AV_PIX_FMT_PAL8) && desc->comp[0].depth<16) {
1895  sh = 9;
1896  } else if (desc->flags & AV_PIX_FMT_FLAG_FLOAT) { /* float input are process like uint 16bpc */
1897  sh = 16 - 1 - 4;
1898  }
1899 
1900  if (filterSize == 1) {
1901  for (i = 0; i < dstW; i++) {
1902  int srcPos = filterPos[i];
1903  int val = 0;
1904 
1905  for (j = 0; j < filterSize; j++) {
1906  val += src[srcPos + j] * filter[filterSize * i + j];
1907  }
1908  // filter=14 bit, input=16 bit, output=30 bit, >> 11 makes 19 bit
1909  dst[i] = FFMIN(val >> sh, (1 << 19) - 1);
1910  }
1911  } else {
1912  for (i = 0; i < dstW; i++) {
1913  const int srcPos = filterPos[i];
1914  vout = vec_splat_s32(0);
1915  for (j = 0; j < filterSize; j += 8) {
1916  vin = (vec_s16) vec_vsx_ld(0, &src[srcPos + j]);
1917  if (j + 8 > filterSize) // Remove the unused elements on the last round
1918  vin = vec_perm(vin, (vec_s16) vzero, vunused);
1919 
1920  vfilter = vec_vsx_ld(0, &filter[filterSize * i + j]);
1921  vfilter32_l = vec_unpackh(vfilter);
1922  vfilter32_r = vec_unpackl(vfilter);
1923 
1924  vtmp = (vec_s32) vec_mergeh(vin, (vec_s16) vzero);
1925  vtmp2 = (vec_s32) vec_mergel(vin, (vec_s16) vzero);
1926 
1927  vtmp = vec_mul(vtmp, vfilter32_l);
1928  vtmp2 = vec_mul(vtmp2, vfilter32_r);
1929 
1930  vout = vec_adds(vout, vtmp);
1931  vout = vec_adds(vout, vtmp2);
1932  }
1933  vout = vec_sums(vout, (vec_s32) vzero);
1934  dst[i] = FFMIN(vout[3] >> sh, (1 << 19) - 1);
1935  }
1936  }
1937 }
1938 
1939 static void hScale16To15_vsx(SwsInternal *c, int16_t *dst, int dstW,
1940  const uint8_t *_src, const int16_t *filter,
1941  const int32_t *filterPos, int filterSize)
1942 {
1943  const AVPixFmtDescriptor *desc = av_pix_fmt_desc_get(c->opts.src_format);
1944  int i, j;
1945  const uint16_t *src = (const uint16_t *) _src;
1946  int sh = desc->comp[0].depth - 1;
1947  vec_s16 vfilter, vin;
1948  vec_s32 vout, vtmp, vtmp2, vfilter32_l, vfilter32_r;
1949  const vec_u8 vzero = vec_splat_u8(0);
1950  const vec_u8 vunusedtab[8] = {
1951  (vec_u8) {0x0, 0x1, 0x2, 0x3, 0x4, 0x5, 0x6, 0x7,
1952  0x8, 0x9, 0xa, 0xb, 0xc, 0xd, 0xe, 0xf},
1953  (vec_u8) {0x0, 0x1, 0x10, 0x10, 0x10, 0x10, 0x10, 0x10,
1954  0x10, 0x10, 0x10, 0x10, 0x10, 0x10, 0x10, 0x10},
1955  (vec_u8) {0x0, 0x1, 0x2, 0x3, 0x10, 0x10, 0x10, 0x10,
1956  0x10, 0x10, 0x10, 0x10, 0x10, 0x10, 0x10, 0x10},
1957  (vec_u8) {0x0, 0x1, 0x2, 0x3, 0x4, 0x5, 0x10, 0x10,
1958  0x10, 0x10, 0x10, 0x10, 0x10, 0x10, 0x10, 0x10},
1959  (vec_u8) {0x0, 0x1, 0x2, 0x3, 0x4, 0x5, 0x6, 0x7,
1960  0x10, 0x10, 0x10, 0x10, 0x10, 0x10, 0x10, 0x10},
1961  (vec_u8) {0x0, 0x1, 0x2, 0x3, 0x4, 0x5, 0x6, 0x7,
1962  0x8, 0x9, 0x10, 0x10, 0x10, 0x10, 0x10, 0x10},
1963  (vec_u8) {0x0, 0x1, 0x2, 0x3, 0x4, 0x5, 0x6, 0x7,
1964  0x8, 0x9, 0xa, 0xb, 0x10, 0x10, 0x10, 0x10},
1965  (vec_u8) {0x0, 0x1, 0x2, 0x3, 0x4, 0x5, 0x6, 0x7,
1966  0x8, 0x9, 0xa, 0xb, 0xc, 0xd, 0x10, 0x10},
1967  };
1968  const vec_u8 vunused = vunusedtab[filterSize % 8];
1969 
1970  if (sh<15) {
1971  sh = isAnyRGB(c->opts.src_format) || c->opts.src_format==AV_PIX_FMT_PAL8 ? 13 : (desc->comp[0].depth - 1);
1972  } else if (desc->flags & AV_PIX_FMT_FLAG_FLOAT) { /* float input are process like uint 16bpc */
1973  sh = 16 - 1;
1974  }
1975 
1976  if (filterSize == 1) {
1977  for (i = 0; i < dstW; i++) {
1978  int srcPos = filterPos[i];
1979  int val = 0;
1980 
1981  for (j = 0; j < filterSize; j++) {
1982  val += src[srcPos + j] * filter[filterSize * i + j];
1983  }
1984  // filter=14 bit, input=16 bit, output=30 bit, >> 15 makes 15 bit
1985  dst[i] = FFMIN(val >> sh, (1 << 15) - 1);
1986  }
1987  } else {
1988  for (i = 0; i < dstW; i++) {
1989  const int srcPos = filterPos[i];
1990  vout = vec_splat_s32(0);
1991  for (j = 0; j < filterSize; j += 8) {
1992  vin = (vec_s16) vec_vsx_ld(0, &src[srcPos + j]);
1993  if (j + 8 > filterSize) // Remove the unused elements on the last round
1994  vin = vec_perm(vin, (vec_s16) vzero, vunused);
1995 
1996  vfilter = vec_vsx_ld(0, &filter[filterSize * i + j]);
1997  vfilter32_l = vec_unpackh(vfilter);
1998  vfilter32_r = vec_unpackl(vfilter);
1999 
2000  vtmp = (vec_s32) vec_mergeh(vin, (vec_s16) vzero);
2001  vtmp2 = (vec_s32) vec_mergel(vin, (vec_s16) vzero);
2002 
2003  vtmp = vec_mul(vtmp, vfilter32_l);
2004  vtmp2 = vec_mul(vtmp2, vfilter32_r);
2005 
2006  vout = vec_adds(vout, vtmp);
2007  vout = vec_adds(vout, vtmp2);
2008  }
2009  vout = vec_sums(vout, (vec_s32) vzero);
2010  dst[i] = FFMIN(vout[3] >> sh, (1 << 15) - 1);
2011  }
2012  }
2013 }
2014 
2015 #endif /* !HAVE_BIGENDIAN */
2016 
2017 #endif /* HAVE_VSX */
2018 
2020 {
2021 #if HAVE_VSX
2022  enum AVPixelFormat dstFormat = c->opts.dst_format;
2023  const int cpu_flags = av_get_cpu_flags();
2024  const unsigned char power8 = HAVE_POWER8 && cpu_flags & AV_CPU_FLAG_POWER8;
2025 
2026  if (!(cpu_flags & AV_CPU_FLAG_VSX))
2027  return;
2028 
2029 #if !HAVE_BIGENDIAN
2030  if (c->srcBpc == 8) {
2031  if (c->dstBpc <= 14) {
2032  c->hyScale = c->hcScale = hScale_real_vsx;
2033  if (c->opts.flags & SWS_FAST_BILINEAR && c->opts.dst_w >= c->opts.src_w && c->chrDstW >= c->chrSrcW) {
2034  c->hyscale_fast = hyscale_fast_vsx;
2035  c->hcscale_fast = hcscale_fast_vsx;
2036  }
2037  }
2038  } else {
2039  if (power8) {
2040  c->hyScale = c->hcScale = c->dstBpc > 14 ? hScale16To19_vsx
2041  : hScale16To15_vsx;
2042  }
2043  }
2044  if (!is16BPS(dstFormat) && !isNBPS(dstFormat) && !isSemiPlanarYUV(dstFormat) &&
2045  dstFormat != AV_PIX_FMT_GRAYF32BE && dstFormat != AV_PIX_FMT_GRAYF32LE &&
2046  !c->needAlpha) {
2047  c->yuv2planeX = yuv2planeX_vsx;
2048  }
2049 #endif
2050 
2051  if (!(c->opts.flags & (SWS_BITEXACT | SWS_FULL_CHR_H_INT)) && !c->needAlpha) {
2052  switch (c->dstBpc) {
2053  case 8:
2054  c->yuv2plane1 = yuv2plane1_8_vsx;
2055  break;
2056 #if !HAVE_BIGENDIAN
2057  case 9:
2058  c->yuv2plane1 = isBE(dstFormat) ? yuv2plane1_9BE_vsx : yuv2plane1_9LE_vsx;
2059  c->yuv2planeX = isBE(dstFormat) ? yuv2planeX_9BE_vsx : yuv2planeX_9LE_vsx;
2060  break;
2061  case 10:
2062  c->yuv2plane1 = isBE(dstFormat) ? yuv2plane1_10BE_vsx : yuv2plane1_10LE_vsx;
2063  c->yuv2planeX = isBE(dstFormat) ? yuv2planeX_10BE_vsx : yuv2planeX_10LE_vsx;
2064  break;
2065  case 12:
2066  c->yuv2plane1 = isBE(dstFormat) ? yuv2plane1_12BE_vsx : yuv2plane1_12LE_vsx;
2067  c->yuv2planeX = isBE(dstFormat) ? yuv2planeX_12BE_vsx : yuv2planeX_12LE_vsx;
2068  break;
2069  case 14:
2070  c->yuv2plane1 = isBE(dstFormat) ? yuv2plane1_14BE_vsx : yuv2plane1_14LE_vsx;
2071  c->yuv2planeX = isBE(dstFormat) ? yuv2planeX_14BE_vsx : yuv2planeX_14LE_vsx;
2072  break;
2073  case 16:
2074  c->yuv2plane1 = isBE(dstFormat) ? yuv2plane1_16BE_vsx : yuv2plane1_16LE_vsx;
2075 #if HAVE_POWER8
2076  if (cpu_flags & AV_CPU_FLAG_POWER8) {
2077  c->yuv2planeX = isBE(dstFormat) ? yuv2planeX_16BE_vsx : yuv2planeX_16LE_vsx;
2078  }
2079 #endif /* HAVE_POWER8 */
2080  break;
2081 #endif /* !HAVE_BIGENDIAN */
2082  }
2083  }
2084 
2085  if (c->opts.flags & SWS_BITEXACT)
2086  return;
2087 
2088 #if !HAVE_BIGENDIAN
2089  if (c->opts.flags & SWS_FULL_CHR_H_INT) {
2090  switch (dstFormat) {
2091  case AV_PIX_FMT_RGB24:
2092  if (power8) {
2093  c->yuv2packed1 = yuv2rgb24_full_1_vsx;
2094  c->yuv2packed2 = yuv2rgb24_full_2_vsx;
2095  c->yuv2packedX = yuv2rgb24_full_X_vsx;
2096  }
2097  break;
2098  case AV_PIX_FMT_BGR24:
2099  if (power8) {
2100  c->yuv2packed1 = yuv2bgr24_full_1_vsx;
2101  c->yuv2packed2 = yuv2bgr24_full_2_vsx;
2102  c->yuv2packedX = yuv2bgr24_full_X_vsx;
2103  }
2104  break;
2105  case AV_PIX_FMT_BGRA:
2106  if (power8) {
2107  if (!c->needAlpha) {
2108  c->yuv2packed1 = yuv2bgrx32_full_1_vsx;
2109  c->yuv2packed2 = yuv2bgrx32_full_2_vsx;
2110  c->yuv2packedX = yuv2bgrx32_full_X_vsx;
2111  }
2112  }
2113  break;
2114  case AV_PIX_FMT_RGBA:
2115  if (power8) {
2116  if (!c->needAlpha) {
2117  c->yuv2packed1 = yuv2rgbx32_full_1_vsx;
2118  c->yuv2packed2 = yuv2rgbx32_full_2_vsx;
2119  c->yuv2packedX = yuv2rgbx32_full_X_vsx;
2120  }
2121  }
2122  break;
2123  case AV_PIX_FMT_ARGB:
2124  if (power8) {
2125  if (!c->needAlpha) {
2126  c->yuv2packed1 = yuv2xrgb32_full_1_vsx;
2127  c->yuv2packed2 = yuv2xrgb32_full_2_vsx;
2128  c->yuv2packedX = yuv2xrgb32_full_X_vsx;
2129  }
2130  }
2131  break;
2132  case AV_PIX_FMT_ABGR:
2133  if (power8) {
2134  if (!c->needAlpha) {
2135  c->yuv2packed1 = yuv2xbgr32_full_1_vsx;
2136  c->yuv2packed2 = yuv2xbgr32_full_2_vsx;
2137  c->yuv2packedX = yuv2xbgr32_full_X_vsx;
2138  }
2139  }
2140  break;
2141  }
2142  } else { /* !SWS_FULL_CHR_H_INT */
2143  switch (dstFormat) {
2144  case AV_PIX_FMT_YUYV422:
2145  c->yuv2packed1 = yuv2yuyv422_1_vsx;
2146  c->yuv2packed2 = yuv2yuyv422_2_vsx;
2147  c->yuv2packedX = yuv2yuyv422_X_vsx;
2148  break;
2149  case AV_PIX_FMT_YVYU422:
2150  c->yuv2packed1 = yuv2yvyu422_1_vsx;
2151  c->yuv2packed2 = yuv2yvyu422_2_vsx;
2152  c->yuv2packedX = yuv2yvyu422_X_vsx;
2153  break;
2154  case AV_PIX_FMT_UYVY422:
2155  c->yuv2packed1 = yuv2uyvy422_1_vsx;
2156  c->yuv2packed2 = yuv2uyvy422_2_vsx;
2157  c->yuv2packedX = yuv2uyvy422_X_vsx;
2158  break;
2159  case AV_PIX_FMT_BGRA:
2160  if (power8) {
2161  if (!c->needAlpha) {
2162  c->yuv2packed1 = yuv2bgrx32_1_vsx;
2163  c->yuv2packed2 = yuv2bgrx32_2_vsx;
2164  }
2165  }
2166  break;
2167  case AV_PIX_FMT_RGBA:
2168  if (power8) {
2169  if (!c->needAlpha) {
2170  c->yuv2packed1 = yuv2rgbx32_1_vsx;
2171  c->yuv2packed2 = yuv2rgbx32_2_vsx;
2172  }
2173  }
2174  break;
2175  case AV_PIX_FMT_ARGB:
2176  if (power8) {
2177  if (!c->needAlpha) {
2178  c->yuv2packed1 = yuv2xrgb32_1_vsx;
2179  c->yuv2packed2 = yuv2xrgb32_2_vsx;
2180  }
2181  }
2182  break;
2183  case AV_PIX_FMT_ABGR:
2184  if (power8) {
2185  if (!c->needAlpha) {
2186  c->yuv2packed1 = yuv2xbgr32_1_vsx;
2187  c->yuv2packed2 = yuv2xbgr32_2_vsx;
2188  }
2189  }
2190  break;
2191  case AV_PIX_FMT_RGB24:
2192  if (power8) {
2193  c->yuv2packed1 = yuv2rgb24_1_vsx;
2194  c->yuv2packed2 = yuv2rgb24_2_vsx;
2195  }
2196  break;
2197  case AV_PIX_FMT_BGR24:
2198  if (power8) {
2199  c->yuv2packed1 = yuv2bgr24_1_vsx;
2200  c->yuv2packed2 = yuv2bgr24_2_vsx;
2201  }
2202  break;
2203  }
2204  }
2205 #endif /* !HAVE_BIGENDIAN */
2206 
2207 #endif /* HAVE_VSX */
2208 }
A
#define A(x)
Definition: vpx_arith.h:28
_dst
uint8_t * _dst
Definition: dsp.h:52
YUV2PACKEDWRAPPER
#define YUV2PACKEDWRAPPER(name, base, ext, fmt)
Definition: output.c:765
AVPixelFormat
AVPixelFormat
Pixel format.
Definition: pixfmt.h:71
yuv2NBPS
yuv2NBPS(yuv2NBPS(9, yuv2NBPS(BE, yuv2NBPS(1, yuv2NBPS(10, int16_t)
Definition: output.c:377
mem_internal.h
vec_s8
#define vec_s8
Definition: util_altivec.h:35
av_pix_fmt_desc_get
const AVPixFmtDescriptor * av_pix_fmt_desc_get(enum AVPixelFormat pix_fmt)
Definition: pixdesc.c:3170
src1
const pixel * src1
Definition: h264pred_template.c:421
AV_PIX_FMT_FLAG_FLOAT
#define AV_PIX_FMT_FLAG_FLOAT
The pixel format contains IEEE-754 floating point values.
Definition: pixdesc.h:158
MAX_FILTER_SIZE
#define MAX_FILTER_SIZE
Definition: af_dynaudnorm.c:36
tmp
static uint8_t tmp[11]
Definition: aes_ctr.c:28
SWS_BITEXACT
@ SWS_BITEXACT
Definition: swscale.h:156
filter
void(* filter)(uint8_t *src, int stride, int qscale)
Definition: h263dsp.c:29
AV_PIX_FMT_BGR24
@ AV_PIX_FMT_BGR24
packed RGB 8:8:8, 24bpp, BGRBGR...
Definition: pixfmt.h:76
AV_PIX_FMT_BGRA
@ AV_PIX_FMT_BGRA
packed BGRA 8:8:8:8, 32bpp, BGRABGRA...
Definition: pixfmt.h:102
av_get_cpu_flags
int av_get_cpu_flags(void)
Return the flags which specify extensions supported by the CPU.
Definition: cpu.c:109
cpu_flags
static atomic_int cpu_flags
Definition: cpu.c:56
AV_PIX_FMT_GRAYF32LE
@ AV_PIX_FMT_GRAYF32LE
IEEE-754 single precision Y, 32bpp, little-endian.
Definition: pixfmt.h:364
_src
uint8_t ptrdiff_t const uint8_t * _src
Definition: dsp.h:52
SWS_FAST_BILINEAR
@ SWS_FAST_BILINEAR
Scaler selection options.
Definition: swscale.h:98
is16BPS
static av_always_inline int is16BPS(enum AVPixelFormat pix_fmt)
Definition: swscale_internal.h:727
rgb
Definition: rpzaenc.c:60
YUV2RGBWRAPPER
#define YUV2RGBWRAPPER(name, base, ext, fmt, hasAlpha)
Definition: output_lasx.c:808
val
static double val(void *priv, double ch)
Definition: aeval.c:77
isNBPS
static av_always_inline int isNBPS(enum AVPixelFormat pix_fmt)
Definition: swscale_internal.h:741
AV_CPU_FLAG_VSX
#define AV_CPU_FLAG_VSX
ISA 2.06.
Definition: cpu.h:62
vec_s32
#define vec_s32
Definition: util_altivec.h:39
vec_s16
#define vec_s16
Definition: util_altivec.h:37
av_cold
#define av_cold
Definition: attributes.h:90
LOCAL_ALIGNED
#define LOCAL_ALIGNED(a, t, v,...)
Definition: mem_internal.h:124
clip
clip
Definition: af_crystalizer.c:122
bits
uint8_t bits
Definition: vp3data.h:128
AV_PIX_FMT_RGBA
@ AV_PIX_FMT_RGBA
packed RGBA 8:8:8:8, 32bpp, RGBARGBA...
Definition: pixfmt.h:100
isSemiPlanarYUV
static av_always_inline int isSemiPlanarYUV(enum AVPixelFormat pix_fmt)
Definition: swscale_internal.h:773
NULL
#define NULL
Definition: coverity.c:32
bias
static int bias(int x, int c)
Definition: vqcdec.c:115
AV_PIX_FMT_YUYV422
@ AV_PIX_FMT_YUYV422
packed YUV 4:2:2, 16bpp, Y0 Cb Y1 Cr
Definition: pixfmt.h:74
AV_PIX_FMT_ABGR
@ AV_PIX_FMT_ABGR
packed ABGR 8:8:8:8, 32bpp, ABGRABGR...
Definition: pixfmt.h:101
c
Undefined Behavior In the C some operations are like signed integer dereferencing freed accessing outside allocated Undefined Behavior must not occur in a C it is not safe even if the output of undefined operations is unused The unsafety may seem nit picking but Optimizing compilers have in fact optimized code on the assumption that no undefined Behavior occurs Optimizing code based on wrong assumptions can and has in some cases lead to effects beyond the output of computations The signed integer overflow problem in speed critical code Code which is highly optimized and works with signed integers sometimes has the problem that often the output of the computation does not c
Definition: undefined.txt:32
isBE
static av_always_inline int isBE(enum AVPixelFormat pix_fmt)
Definition: swscale_internal.h:748
AV_PIX_FMT_RGB24
@ AV_PIX_FMT_RGB24
packed RGB 8:8:8, 24bpp, RGBRGB...
Definition: pixfmt.h:75
vec_u32
#define vec_u32
Definition: util_altivec.h:38
ff_sws_init_swscale_vsx
av_cold void ff_sws_init_swscale_vsx(SwsInternal *c)
Definition: swscale_vsx.c:2019
shift
static int shift(int a, int b)
Definition: bonk.c:261
dst
uint8_t ptrdiff_t const uint8_t ptrdiff_t int intptr_t intptr_t int int16_t * dst
Definition: dsp.h:83
cpu.h
isAnyRGB
static av_always_inline int isAnyRGB(enum AVPixelFormat pix_fmt)
Definition: swscale_internal.h:856
output_pixel
#define output_pixel(pos, val, bias, signedness)
Definition: output.c:2979
yuv2rgb_altivec.h
vec_u8
#define vec_u8
Definition: util_altivec.h:34
offset
it s the only field you need to keep assuming you have a context There is some magic you don t need to care about around this just let it vf offset
Definition: writing_filters.txt:86
attributes.h
AV_PIX_FMT_ARGB
@ AV_PIX_FMT_ARGB
packed ARGB 8:8:8:8, 32bpp, ARGBARGB...
Definition: pixfmt.h:99
av_assert2
#define av_assert2(cond)
assert() equivalent, that does lie in speed critical code.
Definition: avassert.h:67
i
#define i(width, name, range_min, range_max)
Definition: cbs_h2645.c:256
src2
const pixel * src2
Definition: h264pred_template.c:422
YUV2RGBWRAPPERX
#define YUV2RGBWRAPPERX(name, base, ext, fmt, hasAlpha)
Definition: output_lasx.c:784
av_always_inline
#define av_always_inline
Definition: attributes.h:49
swscale_internal.h
FFMIN
#define FFMIN(a, b)
Definition: macros.h:49
shift2
static const uint8_t shift2[6]
Definition: dxa.c:49
AV_PIX_FMT_PAL8
@ AV_PIX_FMT_PAL8
8 bits with AV_PIX_FMT_RGB32 palette
Definition: pixfmt.h:84
AV_PIX_FMT_YVYU422
@ AV_PIX_FMT_YVYU422
packed YUV 4:2:2, 16bpp, Y0 Cr Y1 Cb
Definition: pixfmt.h:207
vshift
static int vshift(enum AVPixelFormat fmt, int plane)
Definition: graph.c:99
SwsInternal
Definition: swscale_internal.h:317
YUV2RGBWRAPPERX2
#define YUV2RGBWRAPPERX2(name, base, ext, fmt, hasAlpha)
Definition: output_lasx.c:797
AV_PIX_FMT_GRAYF32BE
@ AV_PIX_FMT_GRAYF32BE
IEEE-754 single precision Y, 32bpp, big-endian.
Definition: pixfmt.h:363
SWS_FULL_CHR_H_INT
@ SWS_FULL_CHR_H_INT
Perform full chroma upsampling when upscaling to RGB.
Definition: swscale.h:132
swscale_ppc_template.c
AV_PIX_FMT_UYVY422
@ AV_PIX_FMT_UYVY422
packed YUV 4:2:2, 16bpp, Cb Y0 Cr Y1
Definition: pixfmt.h:88
U
#define U(x)
Definition: vpx_arith.h:37
SETUP
@ SETUP
Definition: rtspcodes.h:133
mul8
#define mul8(a, b)
Definition: texturedspenc.c:143
AV_CPU_FLAG_POWER8
#define AV_CPU_FLAG_POWER8
ISA 2.07.
Definition: cpu.h:63
av_clip_uint8
#define av_clip_uint8
Definition: common.h:106
desc
const char * desc
Definition: libsvtav1.c:79
AVPixFmtDescriptor
Descriptor that unambiguously describes how the bits of a pixel are stored in the up to 4 data planes...
Definition: pixdesc.h:69
util_altivec.h
int32_t
int32_t
Definition: audioconvert.c:56
src
#define src
Definition: vp8dsp.c:248
vec_u16
#define vec_u16
Definition: util_altivec.h:36
swscale.h
shifts
static const uint8_t shifts[2][12]
Definition: camellia.c:178
dither
static const uint8_t dither[8][8]
Definition: vf_fspp.c:62