FFmpeg
uops_tmpl.c
Go to the documentation of this file.
1 /**
2  * Copyright (C) 2026 Niklas Haas
3  *
4  * This file is part of FFmpeg.
5  *
6  * FFmpeg is free software; you can redistribute it and/or
7  * modify it under the terms of the GNU Lesser General Public
8  * License as published by the Free Software Foundation; either
9  * version 2.1 of the License, or (at your option) any later version.
10  *
11  * FFmpeg is distributed in the hope that it will be useful,
12  * but WITHOUT ANY WARRANTY; without even the implied warranty of
13  * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
14  * Lesser General Public License for more details.
15  *
16  * You should have received a copy of the GNU Lesser General Public
17  * License along with FFmpeg; if not, write to the Free Software
18  * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
19  */
20 
21 #include <libavutil/bswap.h>
22 
23 #include "uops_tmpl.h"
24 
/* Fall back to 8-bit integer pixels when the includer did not pick a depth */
#ifndef BIT_DEPTH
#  define BIT_DEPTH 8
#endif

/*
 * Per-instantiation type configuration:
 *   pixel_t    - element type the kernels below operate on
 *   inter_t    - wider type used to accumulate sums in the horizontal filter
 *   PX / px    - type token handed to SWS_FOR() / matching block_t member name
 *   PIXEL_MAX  - all-ones value (integer types only; gates expand_bit, clear)
 *   PIXEL_SWAP - byte-swap primitive (multi-byte integer types only)
 *
 * The float case is only selected for BIT_DEPTH == 32; any other depth uses
 * the integer configuration regardless of IS_FLOAT.
 */
#if BIT_DEPTH == 8
#  define pixel_t    uint8_t
#  define inter_t    int32_t
#  define PIXEL_MAX  0xFFu
#  define PX         U8
#  define px         u8
#elif BIT_DEPTH == 16
#  define pixel_t    uint16_t
#  define inter_t    int64_t
#  define PIXEL_MAX  0xFFFFu
#  define PIXEL_SWAP av_bswap16
#  define PX         U16
#  define px         u16
#elif BIT_DEPTH == 32 && IS_FLOAT
/* NOTE(review): PIXEL_TYPE is defined only in this branch and is not part of
 * the #undef list at the end of this file - confirm it is still needed. */
#  define PIXEL_TYPE SWS_PIXEL_F32
#  define pixel_t    float
#  define inter_t    float
#  define PX         F32
#  define px         f32
#elif BIT_DEPTH == 32
#  define pixel_t    uint32_t
#  define inter_t    int64_t
#  define PIXEL_MAX  0xFFFFFFFFu
#  define PIXEL_SWAP av_bswap32
#  define PX         U32
#  define px         u32
#else
#  error Invalid BIT_DEPTH
#endif
58 
59 /*********************************
60  * Generic read/write operations *
61  *********************************/
62 
63 DECL_READ(read_planar, const SwsCompMask mask)
64 {
65  SWS_LOOP
66  for (int i = 0; i < SWS_BLOCK_SIZE; i++) {
67  if (X) x[i] = in0[i];
68  if (Y) y[i] = in1[i];
69  if (Z) z[i] = in2[i];
70  if (W) w[i] = in3[i];
71  }
72 
73  if (X) iter->in[0] += SIZEOF_BLOCK;
74  if (Y) iter->in[1] += SIZEOF_BLOCK;
75  if (Z) iter->in[2] += SIZEOF_BLOCK;
76  if (W) iter->in[3] += SIZEOF_BLOCK;
77 
78  CONTINUE(x, y, z, w);
79 }
80 
81 DECL_READ(read_packed, const SwsCompMask mask)
82 {
83  const int elems = W ? 4 : Z ? 3 : Y ? 2 : 1;
84 
85  SWS_LOOP
86  for (int i = 0; i < SWS_BLOCK_SIZE; i++) {
87  if (X) x[i] = in0[elems * i + 0];
88  if (Y) y[i] = in0[elems * i + 1];
89  if (Z) z[i] = in0[elems * i + 2];
90  if (W) w[i] = in0[elems * i + 3];
91  }
92 
93  iter->in[0] += SIZEOF_BLOCK * elems;
94  CONTINUE(x, y, z, w);
95 }
96 
97 DECL_WRITE(write_planar, const SwsCompMask mask)
98 {
99  SWS_LOOP
100  for (int i = 0; i < SWS_BLOCK_SIZE; i++) {
101  if (X) out0[i] = x[i];
102  if (Y) out1[i] = y[i];
103  if (Z) out2[i] = z[i];
104  if (W) out3[i] = w[i];
105  }
106 
107  if (X) iter->out[0] += SIZEOF_BLOCK;
108  if (Y) iter->out[1] += SIZEOF_BLOCK;
109  if (Z) iter->out[2] += SIZEOF_BLOCK;
110  if (W) iter->out[3] += SIZEOF_BLOCK;
111 }
112 
113 DECL_WRITE(write_packed, const SwsCompMask mask)
114 {
115  const int elems = W ? 4 : Z ? 3 : Y ? 2 : 1;
116 
117  SWS_LOOP
118  for (int i = 0; i < SWS_BLOCK_SIZE; i++) {
119  if (X) out0[elems * i + 0] = x[i];
120  if (Y) out0[elems * i + 1] = y[i];
121  if (Z) out0[elems * i + 2] = z[i];
122  if (W) out0[elems * i + 3] = w[i];
123  }
124 
125  iter->out[0] += SIZEOF_BLOCK * elems;
126 }
127 
128 #if BIT_DEPTH == 8
129 
131 {
133 
134  SWS_LOOP
135  for (int i = 0; i < SWS_BLOCK_SIZE; i += 8) {
136  const pixel_t val = ((const pixel_t *) in0)[i >> 3];
137  x[i + 0] = (val >> 7) & 1;
138  x[i + 1] = (val >> 6) & 1;
139  x[i + 2] = (val >> 5) & 1;
140  x[i + 3] = (val >> 4) & 1;
141  x[i + 4] = (val >> 3) & 1;
142  x[i + 5] = (val >> 2) & 1;
143  x[i + 6] = (val >> 1) & 1;
144  x[i + 7] = (val >> 0) & 1;
145  }
146 
147  iter->in[0] += SIZEOF_BLOCK >> 3;
148  CONTINUE(x, y, z, w);
149 }
150 
151 DECL_READ(read_nibble, const SwsCompMask mask)
152 {
154 
155  SWS_LOOP
156  for (int i = 0; i < SWS_BLOCK_SIZE; i += 2) {
157  const pixel_t val = in0[i >> 1];
158  x[i + 0] = val >> 4; /* high nibble */
159  x[i + 1] = val & 0xF; /* low nibble */
160  }
161 
162  iter->in[0] += SIZEOF_BLOCK >> 1;
163  CONTINUE(x, y, z, w);
164 }
165 
166 DECL_READ(read_palette, const SwsCompMask mask)
167 {
169 
170  SWS_LOOP
171  for (int i = 0; i < SWS_BLOCK_SIZE; i++) {
172  const pixel_t index = in0[i];
173  const pixel_t *value = &in1[index * 4];
174  x[i] = value[0];
175  y[i] = value[1];
176  z[i] = value[2];
177  w[i] = value[3];
178  }
179 
180  iter->in[0] += SIZEOF_BLOCK;
181  CONTINUE(x, y, z, w);
182 }
183 
184 DECL_WRITE(write_bit, const SwsCompMask mask)
185 {
187 
188  SWS_LOOP
189  for (int i = 0; i < SWS_BLOCK_SIZE; i += 8) {
190  out0[i >> 3] = x[i + 0] << 7 |
191  x[i + 1] << 6 |
192  x[i + 2] << 5 |
193  x[i + 3] << 4 |
194  x[i + 4] << 3 |
195  x[i + 5] << 2 |
196  x[i + 6] << 1 |
197  x[i + 7];
198  }
199 
200  iter->out[0] += SIZEOF_BLOCK >> 3;
201 }
202 
203 DECL_WRITE(write_nibble, const SwsCompMask mask)
204 {
206 
207  SWS_LOOP
208  for (int i = 0; i < SWS_BLOCK_SIZE; i += 2)
209  out0[i >> 1] = x[i] << 4 | x[i + 1];
210 
211  iter->out[0] += SIZEOF_BLOCK >> 1;
212 }
213 
214 #endif /* BIT_DEPTH == 8 */
215 
216 SWS_FOR(PX, READ_PLANAR, DECL_IMPL_READ, read_planar)
217 SWS_FOR(PX, READ_PACKED, DECL_IMPL_READ, read_packed)
218 SWS_FOR(PX, READ_NIBBLE, DECL_IMPL_READ, read_nibble)
219 SWS_FOR(PX, READ_BIT, DECL_IMPL_READ, read_bit)
220 SWS_FOR(PX, READ_PALETTE, DECL_IMPL_READ, read_palette)
221 SWS_FOR(PX, WRITE_PLANAR, DECL_IMPL_WRITE, write_planar)
222 SWS_FOR(PX, WRITE_PACKED, DECL_IMPL_WRITE, write_packed)
223 SWS_FOR(PX, WRITE_NIBBLE, DECL_IMPL_WRITE, write_nibble)
224 SWS_FOR(PX, WRITE_BIT, DECL_IMPL_WRITE, write_bit)
225 
226 SWS_FOR_STRUCT(PX, READ_PLANAR, DECL_ENTRY)
227 SWS_FOR_STRUCT(PX, READ_PACKED, DECL_ENTRY)
228 SWS_FOR_STRUCT(PX, READ_NIBBLE, DECL_ENTRY)
229 SWS_FOR_STRUCT(PX, READ_BIT, DECL_ENTRY)
230 SWS_FOR_STRUCT(PX, READ_PALETTE, DECL_ENTRY)
231 SWS_FOR_STRUCT(PX, WRITE_PLANAR, DECL_ENTRY)
232 SWS_FOR_STRUCT(PX, WRITE_PACKED, DECL_ENTRY)
233 SWS_FOR_STRUCT(PX, WRITE_NIBBLE, DECL_ENTRY)
234 SWS_FOR_STRUCT(PX, WRITE_BIT, DECL_ENTRY)
235 
236 /*****************************
237  * Scaling / filtering reads *
238  *****************************/
239 
241 {
242  if (params->uop->par.filter.type != SWS_PIXEL_F32)
243  return AVERROR(ENOTSUP);
244 
245  const SwsFilterWeights *filter = params->uop->data.kernel;
246  static_assert(sizeof(out->priv.ptr) <= sizeof(int32_t[2]),
247  ">8 byte pointers not supported");
248 
249  /* Pre-convert weights to float */
250  float *weights = av_calloc(filter->num_weights, sizeof(float));
251  if (!weights)
252  return AVERROR(ENOMEM);
253 
254  for (int i = 0; i < filter->num_weights; i++)
255  weights[i] = (float) filter->weights[i] / SWS_FILTER_SCALE;
256 
257  out->priv.ptr = weights;
258  out->priv.i32[2] = filter->filter_size;
259  out->free = ff_op_priv_free;
260  return 0;
261 }
262 
263 /* Fully general vertical planar filter case */
264 DECL_READ(read_planar_fv, const SwsCompMask mask, const SwsPixelType type)
265 {
267  const SwsOpExec *exec = iter->exec;
268  const float *restrict weights = impl->priv.ptr;
269  const int filter_size = impl->priv.i32[2];
270  weights += filter_size * iter->y;
271 
272  block_t xs, ys, zs, ws;
273  if (X) memset(&xs.f32, 0, sizeof(xs.f32));
274  if (Y) memset(&ys.f32, 0, sizeof(ys.f32));
275  if (Z) memset(&zs.f32, 0, sizeof(zs.f32));
276  if (W) memset(&ws.f32, 0, sizeof(ws.f32));
277 
278  for (int j = 0; j < filter_size; j++) {
279  const float weight = weights[j];
280 
281  SWS_LOOP
282  for (int i = 0; i < SWS_BLOCK_SIZE; i++) {
283  if (X) xs.f32[i] += weight * in0[i];
284  if (Y) ys.f32[i] += weight * in1[i];
285  if (Z) zs.f32[i] += weight * in2[i];
286  if (W) ws.f32[i] += weight * in3[i];
287  }
288 
289  if (X) in0 = bump_ptr(in0, exec->in_stride[0]);
290  if (Y) in1 = bump_ptr(in1, exec->in_stride[1]);
291  if (Z) in2 = bump_ptr(in2, exec->in_stride[2]);
292  if (W) in3 = bump_ptr(in3, exec->in_stride[3]);
293  }
294 
295  if (X) iter->in[0] += SIZEOF_BLOCK;
296  if (Y) iter->in[1] += SIZEOF_BLOCK;
297  if (Z) iter->in[2] += SIZEOF_BLOCK;
298  if (W) iter->in[3] += SIZEOF_BLOCK;
299 
300  CONTINUE(&xs, &ys, &zs, &ws);
301 }
302 
304 {
305  if (params->uop->par.filter.type != SWS_PIXEL_F32)
306  return AVERROR(ENOTSUP);
307 
308  SwsFilterWeights *filter = params->uop->data.kernel;
309  out->priv.ptr = av_refstruct_ref(filter->weights);
310  out->priv.i32[2] = filter->filter_size;
311  out->free = ff_op_priv_unref;
312  return 0;
313 }
314 
315 /* Fully general horizontal planar filter case */
316 DECL_READ(read_planar_fh, const SwsCompMask mask, const SwsPixelType type)
317 {
319  const SwsOpExec *exec = iter->exec;
320  const int *restrict weights = impl->priv.ptr;
321  const int filter_size = impl->priv.i32[2];
322  const float scale = 1.0f / SWS_FILTER_SCALE;
323  const int xpos = iter->x;
324  weights += filter_size * iter->x;
325 
326  block_t xs, ys, zs, ws;
327  for (int i = 0; i < SWS_BLOCK_SIZE; i++) {
328  const int offset = exec->in_offset_x[xpos + i];
329  pixel_t *start0 = bump_ptr(in0, offset);
330  pixel_t *start1 = bump_ptr(in1, offset);
331  pixel_t *start2 = bump_ptr(in2, offset);
332  pixel_t *start3 = bump_ptr(in3, offset);
333 
334  inter_t sx = 0, sy = 0, sz = 0, sw = 0;
335  for (int j = 0; j < filter_size; j++) {
336  const int weight = weights[j];
337  if (X) sx += weight * start0[j];
338  if (Y) sy += weight * start1[j];
339  if (Z) sz += weight * start2[j];
340  if (W) sw += weight * start3[j];
341  }
342 
343  if (X) xs.f32[i] = (float) sx * scale;
344  if (Y) ys.f32[i] = (float) sy * scale;
345  if (Z) zs.f32[i] = (float) sz * scale;
346  if (W) ws.f32[i] = (float) sw * scale;
347 
348  weights += filter_size;
349  }
350 
351  CONTINUE(&xs, &ys, &zs, &ws);
352 }
353 
354 SWS_FOR(PX, READ_PLANAR_FV, DECL_IMPL_READ, read_planar_fv)
355 SWS_FOR(PX, READ_PLANAR_FH, DECL_IMPL_READ, read_planar_fh)
356 SWS_FOR_STRUCT(PX, READ_PLANAR_FV, DECL_ENTRY, .setup = fn(setup_filter_v) )
357 SWS_FOR_STRUCT(PX, READ_PLANAR_FH, DECL_ENTRY, .setup = fn(setup_filter_h) )
358 
359 /***************************
360  * Permutation and copying *
361  ***************************/
362 
/*
 * A permutation needs no data movement: the generated NAME##_c() simply
 * forwards its four input pointers to the continuation in a different order.
 * SRC0..SRC3 are pasted onto `in`, so each must complete one of in0..in3.
 * DUMMY, TYPE, UOP and MASK are accepted but not used by this macro.
 */
#define DECL_PERMUTE(DUMMY, NAME, TYPE, UOP, MASK, SRC0, SRC1, SRC2, SRC3)      \
    static void NAME##_c(SwsOpIter *restrict iter,                              \
                         const SwsOpImpl *restrict impl,                        \
                         void *restrict in0, void *restrict in1,                \
                         void *restrict in2, void *restrict in3)                \
    {                                                                           \
        CONTINUE(in##SRC0, in##SRC1, in##SRC2, in##SRC3);                       \
    }
372 
/*
 * Like a permutation, but the components enabled in MASK are copied into
 * fresh local blocks (taken from the inputs selected by SRC0..SRC3) instead
 * of being forwarded by pointer. Components not enabled in MASK are passed
 * through unchanged from their own position (in0..in3).
 * DUMMY, TYPE and UOP are accepted but not used by this macro.
 */
#define DECL_COPY(DUMMY, NAME, TYPE, UOP, MASK, SRC0, SRC1, SRC2, SRC3)         \
    static void NAME##_c(SwsOpIter *restrict iter,                              \
                         const SwsOpImpl *restrict impl,                        \
                         void *restrict in0, void *restrict in1,                \
                         void *restrict in2, void *restrict in3)                \
    {                                                                           \
        const SwsCompMask mask = (MASK);                                        \
        block_t x_copy, y_copy, z_copy, w_copy;                                 \
                                                                                \
        if (X)                                                                  \
            memcpy(&x_copy.px, in##SRC0, SIZEOF_BLOCK);                         \
        if (Y)                                                                  \
            memcpy(&y_copy.px, in##SRC1, SIZEOF_BLOCK);                         \
        if (Z)                                                                  \
            memcpy(&z_copy.px, in##SRC2, SIZEOF_BLOCK);                         \
        if (W)                                                                  \
            memcpy(&w_copy.px, in##SRC3, SIZEOF_BLOCK);                         \
                                                                                \
        CONTINUE(X ? &x_copy : in0, Y ? &y_copy : in1,                          \
                 Z ? &z_copy : in2, W ? &w_copy : in3);                         \
    }
389 
390 SWS_FOR(PX, PERMUTE, DECL_PERMUTE)
392 SWS_FOR_STRUCT(PX, PERMUTE, DECL_ENTRY)
394 
395 /*********************
396  * Format conversion *
397  *********************/
398 
399 #define DECL_CAST(DST, dst) \
400  DECL_FUNC(to_##dst, const SwsCompMask mask) \
401  { \
402  block_t xx, yy, zz, ww; \
403  \
404  SWS_LOOP \
405  for (int i = 0; i < SWS_BLOCK_SIZE; i++) { \
406  if (X) xx.dst[i] = x[i]; \
407  if (Y) yy.dst[i] = y[i]; \
408  if (Z) zz.dst[i] = z[i]; \
409  if (W) ww.dst[i] = w[i]; \
410  } \
411  \
412  CONTINUE(&xx, &yy, &zz, &ww); \
413  } \
414  \
415  SWS_FOR(PX, TO_##DST, DECL_IMPL, to_##dst) \
416  SWS_FOR_STRUCT(PX, TO_##DST, DECL_ENTRY)
417 
418 DECL_CAST(U8, u8)
419 DECL_CAST(U16, u16)
420 DECL_CAST(U32, u32)
421 DECL_CAST(F32, f32)
422 
423 /********************
424  * Bit manipulation *
425  ********************/
426 
427 #if !IS_FLOAT
428 DECL_FUNC(lshift, const SwsCompMask mask, const uint8_t amount)
429 {
430  SWS_LOOP
431  for (int i = 0; i < SWS_BLOCK_SIZE; i++) {
432  if (X) x[i] <<= amount;
433  if (Y) y[i] <<= amount;
434  if (Z) z[i] <<= amount;
435  if (W) w[i] <<= amount;
436  }
437 
438  CONTINUE(x, y, z, w);
439 }
440 
441 DECL_FUNC(rshift, const SwsCompMask mask, const uint8_t amount)
442 {
443  SWS_LOOP
444  for (int i = 0; i < SWS_BLOCK_SIZE; i++) {
445  if (X) x[i] >>= amount;
446  if (Y) y[i] >>= amount;
447  if (Z) z[i] >>= amount;
448  if (W) w[i] >>= amount;
449  }
450 
451  CONTINUE(x, y, z, w);
452 }
453 #endif
454 
455 SWS_FOR(PX, LSHIFT, DECL_IMPL, lshift)
456 SWS_FOR(PX, RSHIFT, DECL_IMPL, rshift)
457 
458 SWS_FOR_STRUCT(PX, LSHIFT, DECL_ENTRY)
460 
#ifdef PIXEL_SWAP
/**
 * Reverse the byte order of every enabled component, in place, using the
 * PIXEL_SWAP primitive chosen for the current pixel type. Only built when
 * PIXEL_SWAP is defined (the 16- and 32-bit integer configurations).
 */
DECL_FUNC(swap_bytes, const SwsCompMask mask)
{
    SWS_LOOP
    for (int n = 0; n < SWS_BLOCK_SIZE; n++) {
        if (X) {
            const pixel_t v = x[n];
            x[n] = PIXEL_SWAP(v);
        }
        if (Y) {
            const pixel_t v = y[n];
            y[n] = PIXEL_SWAP(v);
        }
        if (Z) {
            const pixel_t v = z[n];
            z[n] = PIXEL_SWAP(v);
        }
        if (W) {
            const pixel_t v = w[n];
            w[n] = PIXEL_SWAP(v);
        }
    }

    CONTINUE(x, y, z, w);
}
#endif /* PIXEL_SWAP */
475 
476 SWS_FOR(PX, SWAP_BYTES, DECL_IMPL, swap_bytes)
477 SWS_FOR_STRUCT(PX, SWAP_BYTES, DECL_ENTRY)
478 
#ifdef PIXEL_MAX
/**
 * Expand a boolean per component to full range, in place: any non-zero value
 * becomes PIXEL_MAX and zero stays zero. Only built when PIXEL_MAX is defined
 * (the integer configurations).
 */
DECL_FUNC(expand_bit, const SwsCompMask mask)
{
    SWS_LOOP
    for (int n = 0; n < SWS_BLOCK_SIZE; n++) {
        if (X)
            x[n] = (x[n] != 0) ? PIXEL_MAX : 0;
        if (Y)
            y[n] = (y[n] != 0) ? PIXEL_MAX : 0;
        if (Z)
            z[n] = (z[n] != 0) ? PIXEL_MAX : 0;
        if (W)
            w[n] = (w[n] != 0) ? PIXEL_MAX : 0;
    }

    CONTINUE(x, y, z, w);
}
#endif
493 
494 #if BIT_DEPTH == 8
495 DECL_FUNC(expand_pair, const SwsCompMask mask)
496 {
497  block_t x16, y16, z16, w16;
498 
499  SWS_LOOP
500  for (int i = 0; i < SWS_BLOCK_SIZE; i++) {
501  if (X) x16.u16[i] = x[i] << 8 | x[i];
502  if (Y) y16.u16[i] = y[i] << 8 | y[i];
503  if (Z) z16.u16[i] = z[i] << 8 | z[i];
504  if (W) w16.u16[i] = w[i] << 8 | w[i];
505  }
506 
507  CONTINUE(&x16, &y16, &z16, &w16);
508 }
509 
510 DECL_FUNC(expand_quad, const SwsCompMask mask)
511 {
512  block_t x32, y32, z32, w32;
513 
514  SWS_LOOP
515  for (int i = 0; i < SWS_BLOCK_SIZE; i++) {
516  if (X) x32.u32[i] = (uint32_t) x[i] << 24 | x[i] << 16 | x[i] << 8 | x[i];
517  if (Y) y32.u32[i] = (uint32_t) y[i] << 24 | y[i] << 16 | y[i] << 8 | y[i];
518  if (Z) z32.u32[i] = (uint32_t) z[i] << 24 | z[i] << 16 | z[i] << 8 | z[i];
519  if (W) w32.u32[i] = (uint32_t) w[i] << 24 | w[i] << 16 | w[i] << 8 | w[i];
520  }
521 
522  CONTINUE(&x32, &y32, &z32, &w32);
523 }
524 #endif /* BIT_DEPTH == 8 */
525 
526 SWS_FOR(PX, EXPAND_BIT, DECL_IMPL, expand_bit)
527 SWS_FOR(PX, EXPAND_PAIR, DECL_IMPL, expand_pair)
528 SWS_FOR(PX, EXPAND_QUAD, DECL_IMPL, expand_quad)
529 SWS_FOR_STRUCT(PX, EXPAND_BIT, DECL_ENTRY)
530 SWS_FOR_STRUCT(PX, EXPAND_PAIR, DECL_ENTRY)
531 SWS_FOR_STRUCT(PX, EXPAND_QUAD, DECL_ENTRY)
532 
533 /*************************
534  * Packing and unpacking *
535  ************************/
536 
537 #if !IS_FLOAT
539  const uint8_t bx, const uint8_t by,
540  const uint8_t bz, const uint8_t bw)
541 {
542  const uint8_t sx = bw + bz + by;
543  const uint8_t sy = bw + bz;
544  const uint8_t sz = bw;
545  const uint8_t sw = 0;
546 
547  const pixel_t mx = (1 << bx) - 1;
548  const pixel_t my = (1 << by) - 1;
549  const pixel_t mz = (1 << bz) - 1;
550  const pixel_t mw = (1 << bw) - 1;
551 
552  SWS_LOOP
553  for (int i = 0; i < SWS_BLOCK_SIZE; i++) {
554  const pixel_t val = x[i];
555  if (X) x[i] = (val >> sx) & mx;
556  if (Y) y[i] = (val >> sy) & my;
557  if (Z) z[i] = (val >> sz) & mz;
558  if (W) w[i] = (val >> sw) & mw;
559  }
560 
561  CONTINUE(x, y, z, w);
562 }
563 
565  const uint8_t bx, const uint8_t by,
566  const uint8_t bz, const uint8_t bw)
567 {
568  const uint8_t sx = bw + bz + by;
569  const uint8_t sy = bw + bz;
570  const uint8_t sz = bw;
571  const uint8_t sw = 0;
572 
573  SWS_LOOP
574  for (int i = 0; i < SWS_BLOCK_SIZE; i++) {
575  pixel_t val = 0;
576  if (X) val |= x[i] << sx;
577  if (Y) val |= y[i] << sy;
578  if (Z) val |= z[i] << sz;
579  if (W) val |= w[i] << sw;
580  x[i] = val;
581  }
582 
583  CONTINUE(x, y, z, w);
584 }
585 #endif /* !IS_FLOAT */
586 
587 SWS_FOR(PX, UNPACK, DECL_IMPL, unpack)
588 SWS_FOR(PX, PACK, DECL_IMPL, pack)
589 SWS_FOR_STRUCT(PX, UNPACK, DECL_ENTRY)
591 
592 /***********************
593  * Pixel data clearing *
594  ***********************/
595 
#ifdef PIXEL_MAX
/**
 * Overwrite every enabled component with a constant. For component N the
 * constant is PIXEL_MAX if N is set in `one`, otherwise 0 if N is set in
 * `zero`, otherwise the value stored in impl->priv for that component.
 * Only built when PIXEL_MAX is defined (the integer configurations).
 */
DECL_FUNC(clear, const SwsCompMask mask, const SwsCompMask one,
          const SwsCompMask zero)
{
    #define ONE(N) SWS_COMP_TEST(one, N)
    #define ZERO(N) SWS_COMP_TEST(zero, N)

    /* `one` takes precedence over `zero`; the private value is only read
     * when neither flag is set for the component */
    pixel_t cx, cy, cz, cw;

    if (ONE(0))
        cx = PIXEL_MAX;
    else if (ZERO(0))
        cx = 0;
    else
        cx = impl->priv.px[0];

    if (ONE(1))
        cy = PIXEL_MAX;
    else if (ZERO(1))
        cy = 0;
    else
        cy = impl->priv.px[1];

    if (ONE(2))
        cz = PIXEL_MAX;
    else if (ZERO(2))
        cz = 0;
    else
        cz = impl->priv.px[2];

    if (ONE(3))
        cw = PIXEL_MAX;
    else if (ZERO(3))
        cw = 0;
    else
        cw = impl->priv.px[3];

    SWS_LOOP
    for (int n = 0; n < SWS_BLOCK_SIZE; n++) {
        if (X)
            x[n] = cx;
        if (Y)
            y[n] = cy;
        if (Z)
            z[n] = cz;
        if (W)
            w[n] = cw;
    }

    CONTINUE(x, y, z, w);
}
#endif
618 
619 SWS_FOR(PX, CLEAR, DECL_IMPL, clear)
621 
622 /*************************
623  * Arithmetic operations *
624  *************************/
625 
627 {
628  const pixel_t scale = impl->priv.px[0];
629 
630  SWS_LOOP
631  for (int i = 0; i < SWS_BLOCK_SIZE; i++) {
632  if (X) x[i] *= scale;
633  if (Y) y[i] *= scale;
634  if (Z) z[i] *= scale;
635  if (W) w[i] *= scale;
636  }
637 
638  CONTINUE(x, y, z, w);
639 }
640 
642 {
643  SWS_LOOP
644  for (int i = 0; i < SWS_BLOCK_SIZE; i++) {
645  if (X) x[i] += impl->priv.px[0];
646  if (Y) y[i] += impl->priv.px[1];
647  if (Z) z[i] += impl->priv.px[2];
648  if (W) w[i] += impl->priv.px[3];
649  }
650 
651  CONTINUE(x, y, z, w);
652 }
653 
655 {
656  SWS_LOOP
657  for (int i = 0; i < SWS_BLOCK_SIZE; i++) {
658  if (X) x[i] = FFMIN(x[i], impl->priv.px[0]);
659  if (Y) y[i] = FFMIN(y[i], impl->priv.px[1]);
660  if (Z) z[i] = FFMIN(z[i], impl->priv.px[2]);
661  if (W) w[i] = FFMIN(w[i], impl->priv.px[3]);
662  }
663 
664  CONTINUE(x, y, z, w);
665 }
666 
668 {
669  SWS_LOOP
670  for (int i = 0; i < SWS_BLOCK_SIZE; i++) {
671  if (X) x[i] = FFMAX(x[i], impl->priv.px[0]);
672  if (Y) y[i] = FFMAX(y[i], impl->priv.px[1]);
673  if (Z) z[i] = FFMAX(z[i], impl->priv.px[2]);
674  if (W) w[i] = FFMAX(w[i], impl->priv.px[3]);
675  }
676 
677  CONTINUE(x, y, z, w);
678 }
679 
681 SWS_FOR(PX, ADD, DECL_IMPL, add)
688 
689 /*************
690  * Dithering *
691  *************/
692 
694 {
695  const SwsUOp *uop = params->uop;
696  const SwsDitherUOp *dither = &uop->par.dither;
697  const int size = 1 << dither->size_log2;
698  if (size >= SWS_BLOCK_SIZE) {
699  /* No extra padding needed */
700  out->priv.ptr = av_refstruct_ref(uop->data.ptr);
701  out->free = ff_op_priv_unref;
702  return 0;
703  }
704 
705  const int stride = FFMAX(size, SWS_BLOCK_SIZE);
706  const int height = ff_sws_dither_height(dither);
707  pixel_t *matrix = av_malloc(sizeof(pixel_t) * height * stride);
708  if (!matrix)
709  return AVERROR(ENOMEM);
710  out->priv.ptr = matrix;
711  out->free = ff_op_priv_free;
712 
713  /* Pad to multiple of block size. We don't need extra padding for the
714  * height because ff_sws_dither_height() already includes any padding
715  * necessary for the y_offset */
716  for (int y = 0; y < height; y++) {
717  pixel_t *row = &matrix[y * stride];
718  for (int x = 0; x < size; x++)
719  row[x] = uop->data.ptr[y * size + x].px;
720  for (int x = size; x < stride; x++)
721  row[x] = row[x % size];
722  }
723 
724  return 0;
725 }
726 
728  const uint8_t off0, const uint8_t off1,
729  const uint8_t off2, const uint8_t off3,
730  const uint8_t size_log2)
731 {
732  const int size = 1 << size_log2;
733  const int stride = FFMAX(size, SWS_BLOCK_SIZE);
734 
735  const pixel_t *matrix = impl->priv.ptr;
736  matrix += (iter->y & (size - 1)) * stride;
737  matrix += (iter->x & (size - 1)) & ~(SWS_BLOCK_SIZE - 1);
738 
739  const pixel_t *const row0 = &matrix[off0 * stride];
740  const pixel_t *const row1 = &matrix[off1 * stride];
741  const pixel_t *const row2 = &matrix[off2 * stride];
742  const pixel_t *const row3 = &matrix[off3 * stride];
743 
744  SWS_LOOP
745  for (int i = 0; i < SWS_BLOCK_SIZE; i++) {
746  if (X) x[i] += row0[i];
747  if (Y) y[i] += row1[i];
748  if (Z) z[i] += row2[i];
749  if (W) w[i] += row3[i];
750  }
751 
752  CONTINUE(x, y, z, w);
753 }
754 
755 SWS_FOR(PX, DITHER, DECL_IMPL, dither)
756 SWS_FOR_STRUCT(PX, DITHER, DECL_ENTRY, .setup = fn(setup_dither) )
757 
758 /*********************
759  * Linear operations *
760  *********************/
761 
762 typedef struct {
763  /* Stored in split form for convenience */
764  pixel_t m[4][4];
765  pixel_t k[4];
766 } fn(LinCoeffs);
767 
769 {
770  const SwsUOp *uop = params->uop;
771  fn(LinCoeffs) c;
772 
773  for (int i = 0; i < 4; i++) {
774  for (int j = 0; j < 4; j++)
775  c.m[i][j] = uop->data.mat4[i][j].px;
776  c.k[i] = uop->data.mat4[i][4].px;
777  }
778 
779  out->priv.ptr = av_memdup(&c, sizeof(c));
780  out->free = ff_op_priv_free;
781  return out->priv.ptr ? 0 : AVERROR(ENOMEM);
782 }
783 
784 /**
785  * Fully general case for a 5x5 linear affine transformation. Should never be
786  * called without constant `mask`. This function will compile down to the
787  * appropriately optimized version for the required subset of operations when
788  * called with a constant mask.
789  */
790 DECL_FUNC(linear, const SwsCompMask mask, const uint32_t one, const uint32_t zero)
791 {
792  const fn(LinCoeffs) c = *(const fn(LinCoeffs) *) impl->priv.ptr;
793 
794  SWS_LOOP
795  for (int i = 0; i < SWS_BLOCK_SIZE; i++) {
796  const pixel_t xx = x[i];
797  const pixel_t yy = y[i];
798  const pixel_t zz = z[i];
799  const pixel_t ww = w[i];
800 
801 #define LIN_VAL(I, J, val) \
802  ((one & SWS_MASK(I, J)) ? (val) : c.m[I][J] * (val))
803 
804 #define LIN_ROW(I, var) do { \
805  var[i] = (zero & SWS_MASK(I, 4)) ? 0 : c.k[I]; \
806  if (!(zero & SWS_MASK(I, 0))) var[i] += LIN_VAL(I, 0, xx); \
807  if (!(zero & SWS_MASK(I, 1))) var[i] += LIN_VAL(I, 1, yy); \
808  if (!(zero & SWS_MASK(I, 2))) var[i] += LIN_VAL(I, 2, zz); \
809  if (!(zero & SWS_MASK(I, 3))) var[i] += LIN_VAL(I, 3, ww); \
810 } while (0)
811 
812  if (X) LIN_ROW(0, x);
813  if (Y) LIN_ROW(1, y);
814  if (Z) LIN_ROW(2, z);
815  if (W) LIN_ROW(3, w);
816  }
817 
818  CONTINUE(x, y, z, w);
819 }
820 
823 
824 #undef PIXEL_MAX
825 #undef PIXEL_SWAP
826 #undef pixel_t
827 #undef inter_t
828 #undef block_t
829 #undef PX
830 #undef px
DECL_IMPL_WRITE
#define DECL_IMPL_WRITE(...)
Definition: uops_tmpl.h:133
PIXEL_MAX
#define PIXEL_MAX
Definition: uops_tmpl.c:50
AVERROR
Filter the word “frame” indicates either a video frame or a group of audio as stored in an AVFrame structure Format for each input and each output the list of supported formats For video that means pixel format For audio that means channel sample they are references to shared objects When the negotiation mechanism computes the intersection of the formats supported at each end of a all references to both lists are replaced with a reference to the intersection And when a single format is eventually chosen for a link amongst the remaining all references to the list are updated That means that if a filter requires that its input and output have the same format amongst a supported all it has to do is use a reference to the same list of formats query_formats can leave some formats unset and return AVERROR(EAGAIN) to cause the negotiation mechanism toagain later. That can be used by filters with complex requirements to use the format negotiated on one link to set the formats supported on another. Frame references ownership and permissions
MAX
#define MAX
Definition: blend_modes.c:46
SWS_FILTER_SCALE
@ SWS_FILTER_SCALE
14-bit coefficients are picked to fit comfortably within int16_t for efficient SIMD processing (e....
Definition: filters.h:40
out
static FILE * out
Definition: movenc.c:55
X
@ X
Definition: vf_addroi.c:27
SIZEOF_BLOCK
#define SIZEOF_BLOCK
Definition: uops_tmpl.h:50
SwsUOp::data
union SwsUOp::@589 data
Z
#define Z
Definition: uops_tmpl.h:83
matrix
Definition: vc1dsp.c:43
F32
@ F32
Definition: sw_ops.c:44
block_t::f32
float f32[SWS_BLOCK_SIZE]
Definition: uops_tmpl.h:47
DECL_ENTRY
#define DECL_ENTRY(SETUP, NAME,...)
Definition: uops_tmpl.h:139
mask
int mask
Definition: mediacodecdec_common.c:154
SwsFilterWeights
Represents a computed filter kernel.
Definition: filters.h:85
DECL_COPY
#define DECL_COPY(DUMMY, NAME, TYPE, UOP, MASK, IDX0, IDX1, IDX2, IDX3)
Definition: uops_tmpl.c:373
linear
static int linear(InterplayACMContext *s, unsigned ind, unsigned col)
Definition: interplayacm.c:135
filter
void(* filter)(uint8_t *src, int stride, int qscale)
Definition: h263dsp.c:29
max
#define max(a, b)
Definition: cuda_runtime.h:33
FFMAX
#define FFMAX(a, b)
Definition: macros.h:47
SwsOpExec::in_stride
ptrdiff_t in_stride[4]
Definition: ops_dispatch.h:41
ff_op_priv_unref
static void ff_op_priv_unref(SwsOpPriv *priv)
Definition: ops_chain.h:149
setup_linear
static int setup_linear(const SwsImplParams *params, SwsImplResult *out)
Definition: ops.c:280
ONE
#define ONE(N)
av_memdup
void * av_memdup(const void *p, size_t size)
Duplicate a buffer with av_malloc().
Definition: mem.c:304
setup_dither
static int setup_dither(const SwsImplParams *params, SwsImplResult *out)
Definition: ops.c:273
mx
uint8_t ptrdiff_t const uint8_t ptrdiff_t int intptr_t mx
Definition: dsp.h:57
DECL_WRITE
DECL_WRITE(write_planar, const SwsCompMask mask)
Definition: uops_tmpl.c:97
PX
#define PX
Definition: uops_tmpl.c:53
weight
const h264_weight_func weight
Definition: h264dsp_init.c:33
val
static double val(void *priv, double ch)
Definition: aeval.c:77
type
it s the only field you need to keep assuming you have a context There is some magic you don t need to care about around this just let it vf type
Definition: writing_filters.txt:86
SWS_COMP_ELEMS
#define SWS_COMP_ELEMS(N)
Definition: uops.h:73
SWS_FOR
#define SWS_FOR(TYPE, UOP, MACRO,...)
Definition: uops_macros.h:17
SWS_FOR_STRUCT
#define SWS_FOR_STRUCT(TYPE, UOP, MACRO,...)
Definition: uops_macros.h:19
float
float
Definition: af_crystalizer.c:122
W
#define W(a, i, v)
Definition: jpegls.h:119
dither
static const uint16_t dither[8][8]
Definition: vf_gradfun.c:46
SwsUOp::uop
SwsUOpType uop
Definition: uops.h:224
SCALE
#define SCALE(c)
Definition: dcadata.c:7338
LINEAR
#define LINEAR
Definition: vf_perspective.c:36
SwsCompMask
uint8_t SwsCompMask
Bit-mask of components.
Definition: uops.h:61
COPY
#define COPY(src, name)
RSHIFT
#define RSHIFT(a, b)
Definition: common.h:56
my
uint8_t ptrdiff_t const uint8_t ptrdiff_t int intptr_t intptr_t my
Definition: dsp.h:57
read_bit
static unsigned int BS_FUNC() read_bit(BSCTX *bc)
Return one bit from the buffer.
Definition: bitstream_template.h:211
SwsOpExec
Copyright (C) 2026 Niklas Haas.
Definition: ops_dispatch.h:35
LIN_ROW
#define LIN_ROW(I, var)
xs
#define xs(width, name, var, subs,...)
Definition: cbs_vp9.c:305
SwsUOp::mat4
SwsPixel mat4[4][5]
Definition: uops.h:234
ADD
#define ADD(a, b)
Definition: dct32_template.c:123
block_t
Definition: uops_tmpl.h:43
DECL_IMPL
#define DECL_IMPL(FUNC, NAME, TYPE, UOP,...)
Definition: uops_tmpl.h:119
SWS_BLOCK_SIZE
#define SWS_BLOCK_SIZE
Copyright (C) 2026 Niklas Haas.
Definition: uops_tmpl.h:40
SwsPixelType
SwsPixelType
Definition: uops.h:38
index
int index
Definition: gxfenc.c:90
c
Undefined Behavior In the C some operations are like signed integer dereferencing freed accessing outside allocated Undefined Behavior must not occur in a C it is not safe even if the output of undefined operations is unused The unsafety may seem nit picking but Optimizing compilers have in fact optimized code on the assumption that no undefined Behavior occurs Optimizing code based on wrong assumptions can and has in some cases lead to effects beyond the output of computations The signed integer overflow problem in speed critical code Code which is highly optimized and works with signed integers sometimes has the problem that often the output of the computation does not c
Definition: undefined.txt:32
SwsUOp::par
SwsUOpParams par
Definition: uops.h:226
ff_sws_setup_vec4
int ff_sws_setup_vec4(const SwsImplParams *params, SwsImplResult *out)
Definition: ops_chain.c:200
SwsUOp
Definition: uops.h:221
SWS_LOOP
#define SWS_LOOP
Definition: uops_tmpl.h:68
height
#define height
Definition: dsp.h:89
i
#define i(width, name, range_min, range_max)
Definition: cbs_h264.c:63
for
for(k=2;k<=8;++k)
Definition: h264pred_template.c:424
size
int size
Definition: twinvq_data.h:10344
U16
@ U16
Definition: sw_ops.c:42
mz
static double mz(int i, double w0, double r, double alpha)
Definition: af_atilt.c:55
inter_t
#define inter_t
Definition: uops_tmpl.c:52
fn
#define fn(a)
Definition: aap_template.c:37
av_refstruct_ref
void * av_refstruct_ref(void *obj)
Create a new reference to an object managed via this API, i.e.
Definition: refstruct.c:140
offset
it s the only field you need to keep assuming you have a context There is some magic you don t need to care about around this just let it vf offset
Definition: writing_filters.txt:86
zero
static int zero(InterplayACMContext *s, unsigned ind, unsigned col)
Definition: interplayacm.c:121
Y
#define Y
Definition: boxblur.h:37
CONTINUE
#define CONTINUE(...)
Definition: uops_tmpl.h:107
av_assert2
#define av_assert2(cond)
assert() equivalent, that does lie in speed critical code.
Definition: avassert.h:68
unpack
static int unpack(const uint8_t *src, const uint8_t *src_end, uint8_t *dst, int width, int height)
Unpack buffer.
Definition: eatgv.c:73
av_malloc
#define av_malloc(s)
Definition: ops_asmgen.c:44
SwsOpExec::in_offset_x
int32_t * in_offset_x
Pixel offset map; for horizontal scaling, in bytes.
Definition: ops_dispatch.h:80
weights
static const int weights[]
Definition: hevc_pel.c:32
value
it s the only field you need to keep assuming you have a context There is some magic you don t need to care about around this just let it vf default value
Definition: writing_filters.txt:86
FFMIN
#define FFMIN(a, b)
Definition: macros.h:49
MIN
#define MIN(a, b)
Definition: qt-faststart.c:45
av_calloc
void * av_calloc(size_t nmemb, size_t size)
Definition: mem.c:264
ff_op_priv_free
static void ff_op_priv_free(SwsOpPriv *priv)
Definition: ops_chain.h:144
bswap.h
bump_ptr
#define bump_ptr(ptr, bump)
Definition: uops_tmpl.h:78
DECL_FUNC
DECL_FUNC(lshift, const SwsCompMask mask, const uint8_t amount)
Definition: uops_tmpl.c:428
DECL_SETUP
DECL_SETUP(setup_filter_v, params, out)
Definition: uops_tmpl.c:240
DECL_CAST
#define DECL_CAST(DST, dst)
Definition: uops_tmpl.c:399
SwsUOp::ptr
SwsPixel * ptr
Definition: uops.h:231
pixel_t
#define pixel_t
Definition: uops_tmpl.c:51
block_t::u32
uint32_t u32[SWS_BLOCK_SIZE]
Definition: uops_tmpl.h:46
CLEAR
#define CLEAR(destin)
Definition: wavpackenc.c:50
ff_sws_setup_scalar
int ff_sws_setup_scalar(const SwsImplParams *params, SwsImplResult *out)
Definition: ops_chain.c:185
block_t::u16
uint16_t u16[SWS_BLOCK_SIZE]
Definition: uops_tmpl.h:45
SwsDitherUOp
Definition: uops.h:199
SwsUOpParams::dither
SwsDitherUOp dither
Definition: uops.h:218
ZERO
#define ZERO(N)
U32
@ U32
Definition: sw_ops.c:43
SWS_PIXEL_F32
@ SWS_PIXEL_F32
Definition: uops.h:43
w
uint8_t w
Definition: llvidencdsp.c:39
scale
static void scale(int *out, const int *in, const int w, const int h, const int shift)
Definition: intra.c:278
uops_tmpl.h
setup_filter_v
static int setup_filter_v(const SwsImplParams *params, SwsImplResult *out)
Definition: ops.c:46
DECL_PERMUTE
#define DECL_PERMUTE(DUMMY, NAME, TYPE, UOP, MASK, IDX0, IDX1, IDX2, IDX3)
Definition: uops_tmpl.c:364
DECL_READ
DECL_READ(read_planar, const SwsCompMask mask)
Definition: uops_tmpl.c:63
int32_t
int32_t
Definition: audioconvert.c:56
ff_sws_dither_height
int ff_sws_dither_height(const SwsDitherUOp *dither)
Computes (1 << size_log2) + MAX(y_offset).
Definition: uops.c:222
stride
#define stride
Definition: h264pred_template.c:536
U8
@ U8
Definition: sw_ops.c:41
DECL_IMPL_READ
#define DECL_IMPL_READ(...)
Definition: uops_tmpl.h:128
min
float min
Definition: vorbis_enc_data.h:429
setup_filter_h
static int setup_filter_h(const SwsImplParams *params, SwsImplResult *out)
Definition: ops.c:76