FFmpeg
ops.c
Go to the documentation of this file.
1 /**
2  * Copyright (C) 2025 Niklas Haas
3  *
4  * This file is part of FFmpeg.
5  *
6  * FFmpeg is free software; you can redistribute it and/or
7  * modify it under the terms of the GNU Lesser General Public
8  * License as published by the Free Software Foundation; either
9  * version 2.1 of the License, or (at your option) any later version.
10  *
11  * FFmpeg is distributed in the hope that it will be useful,
12  * but WITHOUT ANY WARRANTY; without even the implied warranty of
13  * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
14  * Lesser General Public License for more details.
15  *
16  * You should have received a copy of the GNU Lesser General Public
17  * License along with FFmpeg; if not, write to the Free Software
18  * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
19  */
20 
21 #include <float.h>
22 
23 #include "libavutil/avassert.h"
24 #include "libavutil/mem.h"
25 
26 #include "../ops_chain.h"
27 
/* Define the static table entry `op_<NAME>` with .type = SWS_PIXEL_<TYPE>;
 * all remaining arguments are pasted in as further designated initializers. */
28 #define DECL_ENTRY(TYPE, NAME, ...) \
29  static const SwsOpEntry op_##NAME = { \
30  .type = SWS_PIXEL_##TYPE, \
31  __VA_ARGS__ \
32  }
33 
/* Declare the symbol `ff_<NAME>` (as `void (void)`) and define the matching
 * entry `op_<NAME>` whose .func points at it. */
34 #define DECL_ASM(TYPE, NAME, ...) \
35  void ff_##NAME(void); \
36  DECL_ENTRY(TYPE, NAME, \
37  .func = ff_##NAME, \
38  __VA_ARGS__)
39 
/* Like DECL_ASM, but for a per-component pattern: X/Y/Z/W are pasted into the
 * name (`p<X><Y><Z><W>_<NAME>`) and their negations fill the .unused array. */
40 #define DECL_PATTERN(TYPE, NAME, X, Y, Z, W, ...) \
41  DECL_ASM(TYPE, p##X##Y##Z##W##_##NAME, \
42  .unused = { !X, !Y, !Z, !W }, \
43  __VA_ARGS__ \
44  )
45 
/* Address of the entry defined by DECL_PATTERN with the same NAME and flags */
46 #define REF_PATTERN(NAME, X, Y, Z, W) \
47  &op_p##X##Y##Z##W##_##NAME
48 
/* Declare NAME for the four patterns 1000, 1001, 1110 and 1111 */
49 #define DECL_COMMON_PATTERNS(TYPE, NAME, ...) \
50  DECL_PATTERN(TYPE, NAME, 1, 0, 0, 0, __VA_ARGS__); \
51  DECL_PATTERN(TYPE, NAME, 1, 0, 0, 1, __VA_ARGS__); \
52  DECL_PATTERN(TYPE, NAME, 1, 1, 1, 0, __VA_ARGS__); \
53  DECL_PATTERN(TYPE, NAME, 1, 1, 1, 1, __VA_ARGS__) \
54 
/* Comma-separated references to the four entries declared by
 * DECL_COMMON_PATTERNS, for use inside a table's entry list */
55 #define REF_COMMON_PATTERNS(NAME) \
56  REF_PATTERN(NAME, 1, 0, 0, 0), \
57  REF_PATTERN(NAME, 1, 0, 0, 1), \
58  REF_PATTERN(NAME, 1, 1, 1, 0), \
59  REF_PATTERN(NAME, 1, 1, 1, 1)
60 
61 static int setup_rw(const SwsImplParams *params, SwsImplResult *out)
62 {
63  const SwsOp *op = params->op;
64 
65  /* 3-component reads/writes process one extra garbage word */
66  if (op->rw.packed && op->rw.elems == 3) {
67  switch (op->op) {
68  case SWS_OP_READ: out->over_read = sizeof(uint32_t); break;
69  case SWS_OP_WRITE: out->over_write = sizeof(uint32_t); break;
70  }
71  }
72 
73  return 0;
74 }
75 
/* Declare the read/write entry `<NAME><ELEMS><EXT>` for SWS_OP_<OP> with the
 * given element count, packed flag and frac value; uses setup_rw as setup. */
76 #define DECL_RW(EXT, TYPE, NAME, OP, ELEMS, PACKED, FRAC) \
77  DECL_ASM(TYPE, NAME##ELEMS##EXT, \
78  .op = SWS_OP_##OP, \
79  .rw = { .elems = ELEMS, .packed = PACKED, .frac = FRAC }, \
80  .setup = setup_rw, \
81  );
82 
/* Declare packed reads and writes with 2, 3 and 4 elements for type U<DEPTH> */
83 #define DECL_PACKED_RW(EXT, DEPTH) \
84  DECL_RW(EXT, U##DEPTH, read##DEPTH##_packed, READ, 2, true, 0) \
85  DECL_RW(EXT, U##DEPTH, read##DEPTH##_packed, READ, 3, true, 0) \
86  DECL_RW(EXT, U##DEPTH, read##DEPTH##_packed, READ, 4, true, 0) \
87  DECL_RW(EXT, U##DEPTH, write##DEPTH##_packed, WRITE, 2, true, 0) \
88  DECL_RW(EXT, U##DEPTH, write##DEPTH##_packed, WRITE, 3, true, 0) \
89  DECL_RW(EXT, U##DEPTH, write##DEPTH##_packed, WRITE, 4, true, 0) \
90 
/* Declare the `pack_<XYZW><EXT>` and `unpack_<XYZW><EXT>` entries, both with
 * .pack.pattern = {X, Y, Z, W} */
91 #define DECL_PACK_UNPACK(EXT, TYPE, X, Y, Z, W) \
92  DECL_ASM(TYPE, pack_##X##Y##Z##W##EXT, \
93  .op = SWS_OP_PACK, \
94  .pack.pattern = {X, Y, Z, W}, \
95  ); \
96  \
97  DECL_ASM(TYPE, unpack_##X##Y##Z##W##EXT, \
98  .op = SWS_OP_UNPACK, \
99  .pack.pattern = {X, Y, Z, W}, \
100  ); \
101 
102 static int setup_swap_bytes(const SwsImplParams *params, SwsImplResult *out)
103 {
104  const int mask = ff_sws_pixel_type_size(params->op->type) - 1;
105  for (int i = 0; i < 16; i++)
106  out->priv.u8[i] = (i & ~mask) | (mask - (i & mask));
107  return 0;
108 }
109 
/* Declare a byte-swap entry for one component pattern. Unlike DECL_ASM this
 * does not declare its own function: .func is the separately declared
 * `ff_p<XYZW>_shuffle<EXT>` symbol, and setup_swap_bytes supplies the
 * per-type index table. */
110 #define DECL_SWAP_BYTES(EXT, TYPE, X, Y, Z, W) \
111  DECL_ENTRY(TYPE, p##X##Y##Z##W##_swap_bytes_##TYPE##EXT, \
112  .op = SWS_OP_SWAP_BYTES, \
113  .unused = { !X, !Y, !Z, !W }, \
114  .func = ff_p##X##Y##Z##W##_shuffle##EXT, \
115  .setup = setup_swap_bytes, \
116  );
117 
/* Declare a fixed-value U8 clear entry with .clear_value = -1 and
 * .unused[IDX] set */
118 #define DECL_CLEAR_ALPHA(EXT, IDX) \
119  DECL_ASM(U8, clear_alpha##IDX##EXT, \
120  .op = SWS_OP_CLEAR, \
121  .clear_value = -1, \
122  .unused[IDX] = true, \
123  ); \
124 
/* Same as DECL_CLEAR_ALPHA, but with .clear_value = 0 */
125 #define DECL_CLEAR_ZERO(EXT, IDX) \
126  DECL_ASM(U8, clear_zero##IDX##EXT, \
127  .op = SWS_OP_CLEAR, \
128  .clear_value = 0, \
129  .unused[IDX] = true, \
130  );
131 
132 static int setup_clear(const SwsImplParams *params, SwsImplResult *out)
133 {
134  const SwsOp *op = params->op;
135  for (int i = 0; i < 4; i++)
136  out->priv.u32[i] = (uint32_t) op->c.q4[i].num;
137  return 0;
138 }
139 
/* Declare a generic U8 clear entry for one component pattern; the values are
 * provided at setup time by setup_clear and the entry is marked flexible. */
140 #define DECL_CLEAR(EXT, X, Y, Z, W) \
141  DECL_PATTERN(U8, clear##EXT, X, Y, Z, W, \
142  .op = SWS_OP_CLEAR, \
143  .setup = setup_clear, \
144  .flexible = true, \
145  );
146 
/* Declare the U8 swizzle entry `swizzle_<XYZW><EXT>` with
 * .swizzle.in = {X, Y, Z, W} */
147 #define DECL_SWIZZLE(EXT, X, Y, Z, W) \
148  DECL_ASM(U8, swizzle_##X##Y##Z##W##EXT, \
149  .op = SWS_OP_SWIZZLE, \
150  .swizzle.in = {X, Y, Z, W}, \
151  );
152 
/* Declare FROM -> TO conversion entries for the common component patterns */
153 #define DECL_CONVERT(EXT, FROM, TO) \
154  DECL_COMMON_PATTERNS(FROM, convert_##FROM##_##TO##EXT, \
155  .op = SWS_OP_CONVERT, \
156  .convert.to = SWS_PIXEL_##TO, \
157  );
158 
/* Same as DECL_CONVERT, but with .convert.expand set */
159 #define DECL_EXPAND(EXT, FROM, TO) \
160  DECL_COMMON_PATTERNS(FROM, expand_##FROM##_##TO##EXT, \
161  .op = SWS_OP_CONVERT, \
162  .convert.to = SWS_PIXEL_##TO, \
163  .convert.expand = true, \
164  );
165 
166 static int setup_shift(const SwsImplParams *params, SwsImplResult *out)
167 {
168  out->priv.u16[0] = params->op->c.u;
169  return 0;
170 }
171 
/* Declare flexible U16 left/right shift entries for the common patterns;
 * the shift amount is stored by setup_shift */
172 #define DECL_SHIFT16(EXT) \
173  DECL_COMMON_PATTERNS(U16, lshift16##EXT, \
174  .op = SWS_OP_LSHIFT, \
175  .setup = setup_shift, \
176  .flexible = true, \
177  ); \
178  \
179  DECL_COMMON_PATTERNS(U16, rshift16##EXT, \
180  .op = SWS_OP_RSHIFT, \
181  .setup = setup_shift, \
182  .flexible = true, \
183  );
184 
/* Declare flexible F32 min/max entries for the common patterns; constants are
 * set up through ff_sws_setup_q4 */
185 #define DECL_MIN_MAX(EXT) \
186  DECL_COMMON_PATTERNS(F32, min##EXT, \
187  .op = SWS_OP_MIN, \
188  .setup = ff_sws_setup_q4, \
189  .flexible = true, \
190  ); \
191  \
192  DECL_COMMON_PATTERNS(F32, max##EXT, \
193  .op = SWS_OP_MAX, \
194  .setup = ff_sws_setup_q4, \
195  .flexible = true, \
196  );
197 
/* Declare flexible F32 scale entries for the common patterns; the factor is
 * set up through ff_sws_setup_q */
198 #define DECL_SCALE(EXT) \
199  DECL_COMMON_PATTERNS(F32, scale##EXT, \
200  .op = SWS_OP_SCALE, \
201  .setup = ff_sws_setup_q, \
202  .flexible = true, \
203  );
204 
/* Declare a non-flexible SWS_OP_SCALE entry with the fixed factor
 * (2^BITS - 1) / 1 for type U<BITS> */
205 #define DECL_EXPAND_BITS(EXT, BITS) \
206  DECL_ASM(U##BITS, expand_bits##BITS##EXT, \
207  .op = SWS_OP_SCALE, \
208  .scale = { .num = ((1 << (BITS)) - 1), .den = 1 }, \
209  );
210 
211 static int setup_dither(const SwsImplParams *params, SwsImplResult *out)
212 {
213  const SwsOp *op = params->op;
214  /* 1x1 matrix / single constant */
215  if (!op->dither.size_log2) {
216  const AVRational k = op->dither.matrix[0];
217  out->priv.f32[0] = (float) k.num / k.den;
218  return 0;
219  }
220 
221  const int size = 1 << op->dither.size_log2;
222  const int8_t *off = op->dither.y_offset;
223  int max_offset = 0;
224  for (int i = 0; i < 4; i++) {
225  if (off[i] >= 0)
226  max_offset = FFMAX(max_offset, off[i] & (size - 1));
227  }
228 
229  /* Allocate extra rows to allow over-reading for row offsets. Note that
230  * max_offset is currently never larger than 5, so the extra space needed
231  * for this over-allocation is bounded by 5 * size * sizeof(float),
232  * typically 320 bytes for a 16x16 dither matrix. */
233  const int stride = size * sizeof(float);
234  const int num_rows = size + max_offset;
235  float *matrix = out->priv.ptr = av_mallocz(num_rows * stride);
236  if (!matrix)
237  return AVERROR(ENOMEM);
238  out->free = ff_op_priv_free;
239 
240  for (int i = 0; i < size * size; i++)
241  matrix[i] = (float) op->dither.matrix[i].num / op->dither.matrix[i].den;
242 
243  memcpy(&matrix[size * size], matrix, max_offset * stride);
244 
245  /* Store relative pointer offset to each row inside extra space */
246  static_assert(sizeof(out->priv.ptr) <= sizeof(int16_t[4]),
247  ">8 byte pointers not supported");
248  assert(max_offset * stride <= INT16_MAX);
249  int16_t *off_out = &out->priv.i16[4];
250  for (int i = 0; i < 4; i++)
251  off_out[i] = off[i] >= 0 ? (off[i] & (size - 1)) * stride : -1;
252 
253  return 0;
254 }
255 
/* Declare the F32 dither entry `dither<SIZE><EXT>` with .dither_size = SIZE.
 * DECL_MACRO selects how it is declared (DECL_ASM for a single entry or
 * DECL_COMMON_PATTERNS for one entry per common pattern). */
256 #define DECL_DITHER(DECL_MACRO, EXT, SIZE) \
257  DECL_MACRO(F32, dither##SIZE##EXT, \
258  .op = SWS_OP_DITHER, \
259  .setup = setup_dither, \
260  .dither_size = SIZE, \
261  );
262 
263 static int setup_linear(const SwsImplParams *params, SwsImplResult *out)
264 {
265  const SwsOp *op = params->op;
266 
267  float *matrix = out->priv.ptr = av_mallocz(sizeof(float[4][5]));
268  if (!matrix)
269  return AVERROR(ENOMEM);
270  out->free = ff_op_priv_free;
271 
272  for (int y = 0; y < 4; y++) {
273  for (int x = 0; x < 5; x++)
274  matrix[y * 5 + x] = (float) op->lin.m[y][x].num / op->lin.m[y][x].den;
275  }
276 
277  return 0;
278 }
279 
/* Declare the F32 linear entry `<NAME><EXT>` with .linear_mask = MASK; the
 * coefficient matrix is prepared by setup_linear */
280 #define DECL_LINEAR(EXT, NAME, MASK) \
281  DECL_ASM(F32, NAME##EXT, \
282  .op = SWS_OP_LINEAR, \
283  .setup = setup_linear, \
284  .linear_mask = (MASK), \
285  );
286 
/* Declare every 8-bit op entry for the suffix EXT and collect them in the
 * NULL-terminated table `ops8<EXT>`, tagged with AV_CPU_FLAG_<FLAG> and block
 * size SIZE. The `ff_p*_shuffle<EXT>` prototypes declared here are the
 * functions referenced by DECL_SWAP_BYTES. */
287 #define DECL_FUNCS_8(SIZE, EXT, FLAG) \
288  DECL_RW(EXT, U8, read_planar, READ, 1, false, 0) \
289  DECL_RW(EXT, U8, read_planar, READ, 2, false, 0) \
290  DECL_RW(EXT, U8, read_planar, READ, 3, false, 0) \
291  DECL_RW(EXT, U8, read_planar, READ, 4, false, 0) \
292  DECL_RW(EXT, U8, write_planar, WRITE, 1, false, 0) \
293  DECL_RW(EXT, U8, write_planar, WRITE, 2, false, 0) \
294  DECL_RW(EXT, U8, write_planar, WRITE, 3, false, 0) \
295  DECL_RW(EXT, U8, write_planar, WRITE, 4, false, 0) \
296  DECL_RW(EXT, U8, read_nibbles, READ, 1, false, 1) \
297  DECL_RW(EXT, U8, read_bits, READ, 1, false, 3) \
298  DECL_RW(EXT, U8, write_bits, WRITE, 1, false, 3) \
299  DECL_EXPAND_BITS(EXT, 8) \
300  DECL_PACKED_RW(EXT, 8) \
301  DECL_PACK_UNPACK(EXT, U8, 1, 2, 1, 0) \
302  DECL_PACK_UNPACK(EXT, U8, 3, 3, 2, 0) \
303  DECL_PACK_UNPACK(EXT, U8, 2, 3, 3, 0) \
304  void ff_p1000_shuffle##EXT(void); \
305  void ff_p1001_shuffle##EXT(void); \
306  void ff_p1110_shuffle##EXT(void); \
307  void ff_p1111_shuffle##EXT(void); \
308  DECL_SWIZZLE(EXT, 3, 0, 1, 2) \
309  DECL_SWIZZLE(EXT, 3, 0, 2, 1) \
310  DECL_SWIZZLE(EXT, 2, 1, 0, 3) \
311  DECL_SWIZZLE(EXT, 3, 2, 1, 0) \
312  DECL_SWIZZLE(EXT, 3, 1, 0, 2) \
313  DECL_SWIZZLE(EXT, 3, 2, 0, 1) \
314  DECL_SWIZZLE(EXT, 1, 2, 0, 3) \
315  DECL_SWIZZLE(EXT, 1, 0, 2, 3) \
316  DECL_SWIZZLE(EXT, 2, 0, 1, 3) \
317  DECL_SWIZZLE(EXT, 2, 3, 1, 0) \
318  DECL_SWIZZLE(EXT, 2, 1, 3, 0) \
319  DECL_SWIZZLE(EXT, 1, 2, 3, 0) \
320  DECL_SWIZZLE(EXT, 1, 3, 2, 0) \
321  DECL_SWIZZLE(EXT, 0, 2, 1, 3) \
322  DECL_SWIZZLE(EXT, 0, 2, 3, 1) \
323  DECL_SWIZZLE(EXT, 0, 3, 1, 2) \
324  DECL_SWIZZLE(EXT, 3, 1, 2, 0) \
325  DECL_SWIZZLE(EXT, 0, 3, 2, 1) \
326  DECL_SWIZZLE(EXT, 0, 0, 0, 3) \
327  DECL_SWIZZLE(EXT, 3, 0, 0, 0) \
328  DECL_SWIZZLE(EXT, 0, 0, 0, 1) \
329  DECL_SWIZZLE(EXT, 1, 0, 0, 0) \
330  DECL_CLEAR_ALPHA(EXT, 0) \
331  DECL_CLEAR_ALPHA(EXT, 1) \
332  DECL_CLEAR_ALPHA(EXT, 3) \
333  DECL_CLEAR_ZERO(EXT, 0) \
334  DECL_CLEAR_ZERO(EXT, 1) \
335  DECL_CLEAR_ZERO(EXT, 3) \
336  DECL_CLEAR(EXT, 1, 1, 1, 0) \
337  DECL_CLEAR(EXT, 0, 1, 1, 1) \
338  DECL_CLEAR(EXT, 0, 0, 1, 1) \
339  DECL_CLEAR(EXT, 1, 0, 0, 1) \
340  DECL_CLEAR(EXT, 1, 1, 0, 0) \
341  DECL_CLEAR(EXT, 0, 1, 0, 1) \
342  DECL_CLEAR(EXT, 1, 0, 1, 0) \
343  DECL_CLEAR(EXT, 1, 0, 0, 0) \
344  DECL_CLEAR(EXT, 0, 1, 0, 0) \
345  DECL_CLEAR(EXT, 0, 0, 1, 0) \
346  \
347 static const SwsOpTable ops8##EXT = { \
348  .cpu_flags = AV_CPU_FLAG_##FLAG, \
349  .block_size = SIZE, \
350  .entries = { \
351  &op_read_planar1##EXT, \
352  &op_read_planar2##EXT, \
353  &op_read_planar3##EXT, \
354  &op_read_planar4##EXT, \
355  &op_write_planar1##EXT, \
356  &op_write_planar2##EXT, \
357  &op_write_planar3##EXT, \
358  &op_write_planar4##EXT, \
359  &op_read8_packed2##EXT, \
360  &op_read8_packed3##EXT, \
361  &op_read8_packed4##EXT, \
362  &op_write8_packed2##EXT, \
363  &op_write8_packed3##EXT, \
364  &op_write8_packed4##EXT, \
365  &op_read_nibbles1##EXT, \
366  &op_read_bits1##EXT, \
367  &op_write_bits1##EXT, \
368  &op_expand_bits8##EXT, \
369  &op_pack_1210##EXT, \
370  &op_pack_3320##EXT, \
371  &op_pack_2330##EXT, \
372  &op_unpack_1210##EXT, \
373  &op_unpack_3320##EXT, \
374  &op_unpack_2330##EXT, \
375  &op_swizzle_3012##EXT, \
376  &op_swizzle_3021##EXT, \
377  &op_swizzle_2103##EXT, \
378  &op_swizzle_3210##EXT, \
379  &op_swizzle_3102##EXT, \
380  &op_swizzle_3201##EXT, \
381  &op_swizzle_1203##EXT, \
382  &op_swizzle_1023##EXT, \
383  &op_swizzle_2013##EXT, \
384  &op_swizzle_2310##EXT, \
385  &op_swizzle_2130##EXT, \
386  &op_swizzle_1230##EXT, \
387  &op_swizzle_1320##EXT, \
388  &op_swizzle_0213##EXT, \
389  &op_swizzle_0231##EXT, \
390  &op_swizzle_0312##EXT, \
391  &op_swizzle_3120##EXT, \
392  &op_swizzle_0321##EXT, \
393  &op_swizzle_0003##EXT, \
394  &op_swizzle_0001##EXT, \
395  &op_swizzle_3000##EXT, \
396  &op_swizzle_1000##EXT, \
397  &op_clear_alpha0##EXT, \
398  &op_clear_alpha1##EXT, \
399  &op_clear_alpha3##EXT, \
400  &op_clear_zero0##EXT, \
401  &op_clear_zero1##EXT, \
402  &op_clear_zero3##EXT, \
403  REF_PATTERN(clear##EXT, 1, 1, 1, 0), \
404  REF_PATTERN(clear##EXT, 0, 1, 1, 1), \
405  REF_PATTERN(clear##EXT, 0, 0, 1, 1), \
406  REF_PATTERN(clear##EXT, 1, 0, 0, 1), \
407  REF_PATTERN(clear##EXT, 1, 1, 0, 0), \
408  REF_PATTERN(clear##EXT, 0, 1, 0, 1), \
409  REF_PATTERN(clear##EXT, 1, 0, 1, 0), \
410  REF_PATTERN(clear##EXT, 1, 0, 0, 0), \
411  REF_PATTERN(clear##EXT, 0, 1, 0, 0), \
412  REF_PATTERN(clear##EXT, 0, 0, 1, 0), \
413  NULL \
414  }, \
415 };
416 
/* Declare the 16-bit op entries for the suffix EXT (packed read/write, bit
 * expansion, pack/unpack, byte swap, shifts and U8<->U16 conversions) and
 * collect them in the NULL-terminated table `ops16<EXT>`, tagged with
 * AV_CPU_FLAG_<FLAG> and block size SIZE. */
417 #define DECL_FUNCS_16(SIZE, EXT, FLAG) \
418  DECL_PACKED_RW(EXT, 16) \
419  DECL_EXPAND_BITS(EXT, 16) \
420  DECL_PACK_UNPACK(EXT, U16, 4, 4, 4, 0) \
421  DECL_PACK_UNPACK(EXT, U16, 5, 5, 5, 0) \
422  DECL_PACK_UNPACK(EXT, U16, 5, 6, 5, 0) \
423  DECL_SWAP_BYTES(EXT, U16, 1, 0, 0, 0) \
424  DECL_SWAP_BYTES(EXT, U16, 1, 0, 0, 1) \
425  DECL_SWAP_BYTES(EXT, U16, 1, 1, 1, 0) \
426  DECL_SWAP_BYTES(EXT, U16, 1, 1, 1, 1) \
427  DECL_SHIFT16(EXT) \
428  DECL_CONVERT(EXT, U8, U16) \
429  DECL_CONVERT(EXT, U16, U8) \
430  DECL_EXPAND(EXT, U8, U16) \
431  \
432 static const SwsOpTable ops16##EXT = { \
433  .cpu_flags = AV_CPU_FLAG_##FLAG, \
434  .block_size = SIZE, \
435  .entries = { \
436  &op_read16_packed2##EXT, \
437  &op_read16_packed3##EXT, \
438  &op_read16_packed4##EXT, \
439  &op_write16_packed2##EXT, \
440  &op_write16_packed3##EXT, \
441  &op_write16_packed4##EXT, \
442  &op_pack_4440##EXT, \
443  &op_pack_5550##EXT, \
444  &op_pack_5650##EXT, \
445  &op_unpack_4440##EXT, \
446  &op_unpack_5550##EXT, \
447  &op_unpack_5650##EXT, \
448  &op_expand_bits16##EXT, \
449  REF_COMMON_PATTERNS(swap_bytes_U16##EXT), \
450  REF_COMMON_PATTERNS(convert_U8_U16##EXT), \
451  REF_COMMON_PATTERNS(convert_U16_U8##EXT), \
452  REF_COMMON_PATTERNS(expand_U8_U16##EXT), \
453  REF_COMMON_PATTERNS(lshift16##EXT), \
454  REF_COMMON_PATTERNS(rshift16##EXT), \
455  NULL \
456  }, \
457 };
458 
/* Declare the 32-bit / float op entries for the suffix EXT and collect them
 * in the NULL-terminated table `ops32<EXT>`, tagged with AV_CPU_FLAG_<FLAG>
 * and block size SIZE. Note that the packed read/write, pack/unpack and
 * byte-swap entries use the suffix `_m2<EXT>`, while all other entries use
 * plain EXT. */
459 #define DECL_FUNCS_32(SIZE, EXT, FLAG) \
460  DECL_PACKED_RW(_m2##EXT, 32) \
461  DECL_PACK_UNPACK(_m2##EXT, U32, 10, 10, 10, 2) \
462  DECL_PACK_UNPACK(_m2##EXT, U32, 2, 10, 10, 10) \
463  DECL_SWAP_BYTES(_m2##EXT, U32, 1, 0, 0, 0) \
464  DECL_SWAP_BYTES(_m2##EXT, U32, 1, 0, 0, 1) \
465  DECL_SWAP_BYTES(_m2##EXT, U32, 1, 1, 1, 0) \
466  DECL_SWAP_BYTES(_m2##EXT, U32, 1, 1, 1, 1) \
467  DECL_CONVERT(EXT, U8, U32) \
468  DECL_CONVERT(EXT, U32, U8) \
469  DECL_CONVERT(EXT, U16, U32) \
470  DECL_CONVERT(EXT, U32, U16) \
471  DECL_CONVERT(EXT, U8, F32) \
472  DECL_CONVERT(EXT, F32, U8) \
473  DECL_CONVERT(EXT, U16, F32) \
474  DECL_CONVERT(EXT, F32, U16) \
475  DECL_EXPAND(EXT, U8, U32) \
476  DECL_MIN_MAX(EXT) \
477  DECL_SCALE(EXT) \
478  DECL_DITHER(DECL_COMMON_PATTERNS, EXT, 0) \
479  DECL_DITHER(DECL_ASM, EXT, 1) \
480  DECL_DITHER(DECL_ASM, EXT, 2) \
481  DECL_DITHER(DECL_ASM, EXT, 3) \
482  DECL_DITHER(DECL_ASM, EXT, 4) \
483  DECL_DITHER(DECL_ASM, EXT, 5) \
484  DECL_DITHER(DECL_ASM, EXT, 6) \
485  DECL_DITHER(DECL_ASM, EXT, 7) \
486  DECL_DITHER(DECL_ASM, EXT, 8) \
487  DECL_LINEAR(EXT, luma, SWS_MASK_LUMA) \
488  DECL_LINEAR(EXT, alpha, SWS_MASK_ALPHA) \
489  DECL_LINEAR(EXT, lumalpha, SWS_MASK_LUMA | SWS_MASK_ALPHA) \
490  DECL_LINEAR(EXT, dot3, 0x7) \
491  DECL_LINEAR(EXT, row0, SWS_MASK_ROW(0)) \
492  DECL_LINEAR(EXT, row0a, SWS_MASK_ROW(0) | SWS_MASK_ALPHA) \
493  DECL_LINEAR(EXT, diag3, SWS_MASK_DIAG3) \
494  DECL_LINEAR(EXT, diag4, SWS_MASK_DIAG4) \
495  DECL_LINEAR(EXT, diagoff3, SWS_MASK_DIAG3 | SWS_MASK_OFF3) \
496  DECL_LINEAR(EXT, matrix3, SWS_MASK_MAT3) \
497  DECL_LINEAR(EXT, affine3, SWS_MASK_MAT3 | SWS_MASK_OFF3) \
498  DECL_LINEAR(EXT, affine3a, SWS_MASK_MAT3 | SWS_MASK_OFF3 | SWS_MASK_ALPHA) \
499  DECL_LINEAR(EXT, matrix4, SWS_MASK_MAT4) \
500  DECL_LINEAR(EXT, affine4, SWS_MASK_MAT4 | SWS_MASK_OFF4) \
501  \
502 static const SwsOpTable ops32##EXT = { \
503  .cpu_flags = AV_CPU_FLAG_##FLAG, \
504  .block_size = SIZE, \
505  .entries = { \
506  &op_read32_packed2_m2##EXT, \
507  &op_read32_packed3_m2##EXT, \
508  &op_read32_packed4_m2##EXT, \
509  &op_write32_packed2_m2##EXT, \
510  &op_write32_packed3_m2##EXT, \
511  &op_write32_packed4_m2##EXT, \
512  &op_pack_1010102_m2##EXT, \
513  &op_pack_2101010_m2##EXT, \
514  &op_unpack_1010102_m2##EXT, \
515  &op_unpack_2101010_m2##EXT, \
516  REF_COMMON_PATTERNS(swap_bytes_U32_m2##EXT), \
517  REF_COMMON_PATTERNS(convert_U8_U32##EXT), \
518  REF_COMMON_PATTERNS(convert_U32_U8##EXT), \
519  REF_COMMON_PATTERNS(convert_U16_U32##EXT), \
520  REF_COMMON_PATTERNS(convert_U32_U16##EXT), \
521  REF_COMMON_PATTERNS(convert_U8_F32##EXT), \
522  REF_COMMON_PATTERNS(convert_F32_U8##EXT), \
523  REF_COMMON_PATTERNS(convert_U16_F32##EXT), \
524  REF_COMMON_PATTERNS(convert_F32_U16##EXT), \
525  REF_COMMON_PATTERNS(expand_U8_U32##EXT), \
526  REF_COMMON_PATTERNS(min##EXT), \
527  REF_COMMON_PATTERNS(max##EXT), \
528  REF_COMMON_PATTERNS(scale##EXT), \
529  REF_COMMON_PATTERNS(dither0##EXT), \
530  &op_dither1##EXT, \
531  &op_dither2##EXT, \
532  &op_dither3##EXT, \
533  &op_dither4##EXT, \
534  &op_dither5##EXT, \
535  &op_dither6##EXT, \
536  &op_dither7##EXT, \
537  &op_dither8##EXT, \
538  &op_luma##EXT, \
539  &op_alpha##EXT, \
540  &op_lumalpha##EXT, \
541  &op_dot3##EXT, \
542  &op_row0##EXT, \
543  &op_row0a##EXT, \
544  &op_diag3##EXT, \
545  &op_diag4##EXT, \
546  &op_diagoff3##EXT, \
547  &op_matrix3##EXT, \
548  &op_affine3##EXT, \
549  &op_affine3a##EXT, \
550  &op_matrix4##EXT, \
551  &op_affine4##EXT, \
552  NULL \
553  }, \
554 };
555 
/* 8-bit tables: SSE4 and AVX2 variants, each with an _m1 and an _m2 suffix;
 * the first argument is the table's block size */
556 DECL_FUNCS_8(16, _m1_sse4, SSE4)
557 DECL_FUNCS_8(32, _m1_avx2, AVX2)
558 DECL_FUNCS_8(32, _m2_sse4, SSE4)
559 DECL_FUNCS_8(64, _m2_avx2, AVX2)
560 
/* 16-bit tables: AVX2 only */
561 DECL_FUNCS_16(16, _m1_avx2, AVX2)
562 DECL_FUNCS_16(32, _m2_avx2, AVX2)
563 
/* 32-bit / float table: AVX2 only */
564 DECL_FUNCS_32(16, _avx2, AVX2)
565 
/* All op tables defined above, in the order they are handed to
 * ff_sws_op_compile_tables() */
566 static const SwsOpTable *const tables[] = {
567  &ops8_m1_sse4,
568  &ops8_m1_avx2,
569  &ops8_m2_sse4,
570  &ops8_m2_avx2,
571  &ops16_m1_avx2,
572  &ops16_m2_avx2,
573  &ops32_avx2,
574 };
575 
576 static av_const int get_mmsize(const int cpu_flags)
577 {
579  return 64;
580  else if (cpu_flags & AV_CPU_FLAG_AVX2)
581  return 32;
582  else if (cpu_flags & AV_CPU_FLAG_SSE4)
583  return 16;
584  else
585  return AVERROR(ENOTSUP);
586 }
587 
588 /**
589  * Returns true if the operation's implementation only depends on the block
590  * size, and not the underlying pixel type
591  */
592 static bool op_is_type_invariant(const SwsOp *op)
593 {
594  switch (op->op) {
595  case SWS_OP_READ:
596  case SWS_OP_WRITE:
597  return !(op->rw.elems > 1 && op->rw.packed) && !op->rw.frac;
598  case SWS_OP_SWIZZLE:
599  case SWS_OP_CLEAR:
600  return true;
601  }
602 
603  return false;
604 }
605 
606 static int solve_shuffle(const SwsOpList *ops, int mmsize, SwsCompiledOp *out)
607 {
608  uint8_t shuffle[16];
609  int read_bytes, write_bytes;
610  int pixels;
611 
612  /* Solve the shuffle mask for one 128-bit lane only */
613  pixels = ff_sws_solve_shuffle(ops, shuffle, 16, 0x80, &read_bytes, &write_bytes);
614  if (pixels < 0)
615  return pixels;
616 
617  /* We can't shuffle acress lanes, so restrict the vector size to XMM
618  * whenever the read/write size would be a subset of the full vector */
619  if (read_bytes < 16 || write_bytes < 16)
620  mmsize = 16;
621 
622  const int num_lanes = mmsize / 16;
623  const int in_total = num_lanes * read_bytes;
624  const int out_total = num_lanes * write_bytes;
625  const int read_size = in_total <= 4 ? 4 : /* movd */
626  in_total <= 8 ? 8 : /* movq */
627  mmsize; /* movu */
628 
629  *out = (SwsCompiledOp) {
630  .priv = av_memdup(shuffle, sizeof(shuffle)),
631  .free = av_free,
632  .slice_align = 1,
633  .block_size = pixels * num_lanes,
634  .over_read = read_size - in_total,
635  .over_write = mmsize - out_total,
636  .cpu_flags = mmsize > 32 ? AV_CPU_FLAG_AVX512 :
637  mmsize > 16 ? AV_CPU_FLAG_AVX2 :
639  };
640 
641  if (!out->priv)
642  return AVERROR(ENOMEM);
643 
644 #define ASSIGN_SHUFFLE_FUNC(IN, OUT, EXT) \
645 do { \
646  SWS_DECL_FUNC(ff_packed_shuffle##IN##_##OUT##_##EXT); \
647  if (in_total == IN && out_total == OUT) \
648  out->func = ff_packed_shuffle##IN##_##OUT##_##EXT; \
649 } while (0)
650 
651  ASSIGN_SHUFFLE_FUNC( 5, 15, sse4);
652  ASSIGN_SHUFFLE_FUNC( 4, 16, sse4);
653  ASSIGN_SHUFFLE_FUNC( 2, 12, sse4);
654  ASSIGN_SHUFFLE_FUNC(16, 8, sse4);
655  ASSIGN_SHUFFLE_FUNC(10, 15, sse4);
656  ASSIGN_SHUFFLE_FUNC( 8, 16, sse4);
657  ASSIGN_SHUFFLE_FUNC( 4, 12, sse4);
658  ASSIGN_SHUFFLE_FUNC(15, 15, sse4);
659  ASSIGN_SHUFFLE_FUNC(12, 16, sse4);
660  ASSIGN_SHUFFLE_FUNC( 6, 12, sse4);
661  ASSIGN_SHUFFLE_FUNC(16, 12, sse4);
662  ASSIGN_SHUFFLE_FUNC(16, 16, sse4);
663  ASSIGN_SHUFFLE_FUNC( 8, 12, sse4);
664  ASSIGN_SHUFFLE_FUNC(12, 12, sse4);
665  ASSIGN_SHUFFLE_FUNC(32, 32, avx2);
666  ASSIGN_SHUFFLE_FUNC(64, 64, avx512);
667  av_assert1(out->func);
668  return 0;
669 }
670 
671 /* Normalize clear values into 32-bit integer constants */
672 static void normalize_clear(SwsOp *op)
673 {
674  static_assert(sizeof(uint32_t) == sizeof(int), "int size mismatch");
675  SwsImplResult res;
676  union {
677  uint32_t u32;
678  int i;
679  } c;
680 
681  ff_sws_setup_q4(&(const SwsImplParams) { .op = op }, &res);
682 
683  for (int i = 0; i < 4; i++) {
684  if (!op->c.q4[i].den)
685  continue;
686  switch (ff_sws_pixel_type_size(op->type)) {
687  case 1: c.u32 = 0x1010101U * res.priv.u8[i]; break;
688  case 2: c.u32 = (uint32_t) res.priv.u16[i] << 16 | res.priv.u16[i]; break;
689  case 4: c.u32 = res.priv.u32[i]; break;
690  }
691 
692  op->c.q4[i].num = c.i;
693  op->c.q4[i].den = 1;
694  }
695 }
696 
698 {
699  int ret;
700  const int cpu_flags = av_get_cpu_flags();
701  const int mmsize = get_mmsize(cpu_flags);
702  if (mmsize < 0)
703  return mmsize;
704 
705  /* Special fast path for in-place packed shuffle */
706  ret = solve_shuffle(ops, mmsize, out);
707  if (ret != AVERROR(ENOTSUP))
708  return ret;
709 
711  if (!chain)
712  return AVERROR(ENOMEM);
713 
714  *out = (SwsCompiledOp) {
715  .priv = chain,
716  .slice_align = 1,
718 
719  /* Use at most two full YMM regs during the widest precision section */
720  .block_size = 2 * FFMIN(mmsize, 32) / ff_sws_op_list_max_size(ops),
721  };
722 
723  /* Make on-stack copy of `ops` to iterate over */
724  SwsOpList rest = *ops;
725  do {
726  int op_block_size = out->block_size;
727  SwsOp *op = &rest.ops[0];
728 
729  if (op_is_type_invariant(op)) {
730  if (op->op == SWS_OP_CLEAR)
732  op_block_size *= ff_sws_pixel_type_size(op->type);
733  op->type = SWS_PIXEL_U8;
734  }
735 
737  &rest, op_block_size, chain);
738  } while (ret == AVERROR(EAGAIN));
739 
740  if (ret < 0) {
741  ff_sws_op_chain_free(chain);
742  if (rest.num_ops < ops->num_ops) {
743  av_log(ctx, AV_LOG_TRACE, "Uncompiled remainder:\n");
745  }
746  return ret;
747  }
748 
749 #define ASSIGN_PROCESS_FUNC(NAME) \
750  do { \
751  SWS_DECL_FUNC(NAME); \
752  void NAME##_return(void); \
753  ret = ff_sws_op_chain_append(chain, NAME##_return, \
754  NULL, &(SwsOpPriv) {0}); \
755  out->func = NAME; \
756  } while (0)
757 
758  const SwsOp *read = ff_sws_op_list_input(ops);
759  const SwsOp *write = ff_sws_op_list_output(ops);
760  const int read_planes = read ? (read->rw.packed ? 1 : read->rw.elems) : 0;
761  const int write_planes = write->rw.packed ? 1 : write->rw.elems;
762  switch (FFMAX(read_planes, write_planes)) {
763  case 1: ASSIGN_PROCESS_FUNC(ff_sws_process1_x86); break;
764  case 2: ASSIGN_PROCESS_FUNC(ff_sws_process2_x86); break;
765  case 3: ASSIGN_PROCESS_FUNC(ff_sws_process3_x86); break;
766  case 4: ASSIGN_PROCESS_FUNC(ff_sws_process4_x86); break;
767  }
768 
769  if (ret < 0) {
770  ff_sws_op_chain_free(chain);
771  return ret;
772  }
773 
774  out->cpu_flags = chain->cpu_flags;
775  out->over_read = chain->over_read;
776  out->over_write = chain->over_write;
777  return 0;
778 }
779 
781  .name = "x86",
782  .compile = compile,
783  .hw_format = AV_PIX_FMT_NONE,
784 };
SWS_OP_READ
@ SWS_OP_READ
Definition: ops.h:47
SwsOpTable
Copyright (C) 2025 Niklas Haas.
Definition: ops_chain.h:153
SWS_OP_SWIZZLE
@ SWS_OP_SWIZZLE
Definition: ops.h:50
AVERROR
Filter the word “frame” indicates either a video frame or a group of audio as stored in an AVFrame structure Format for each input and each output the list of supported formats For video that means pixel format For audio that means channel sample they are references to shared objects When the negotiation mechanism computes the intersection of the formats supported at each end of a all references to both lists are replaced with a reference to the intersection And when a single format is eventually chosen for a link amongst the remaining all references to the list are updated That means that if a filter requires that its input and output have the same format amongst a supported all it has to do is use a reference to the same list of formats query_formats can leave some formats unset and return AVERROR(EAGAIN) to cause the negotiation mechanism toagain later. That can be used by filters with complex requirements to use the format negotiated on one link to set the formats supported on another. Frame references ownership and permissions
ASSIGN_PROCESS_FUNC
#define ASSIGN_PROCESS_FUNC(NAME)
get_mmsize
static av_const int get_mmsize(const int cpu_flags)
Definition: ops.c:576
out
static FILE * out
Definition: movenc.c:55
ff_sws_op_list_input
const SwsOp * ff_sws_op_list_input(const SwsOpList *ops)
Returns the input operation for a given op list, or NULL if there is none (e.g.
Definition: ops.c:570
SWS_OP_CLEAR
@ SWS_OP_CLEAR
Definition: ops.h:59
ff_sws_op_list_max_size
int ff_sws_op_list_max_size(const SwsOpList *ops)
Returns the size of the largest pixel type used in ops.
Definition: ops.c:646
backend_x86
const SwsOpBackend backend_x86
Definition: ops.c:780
matrix
Definition: vc1dsp.c:43
mask
int mask
Definition: mediacodecdec_common.c:154
SwsOp::rw
SwsReadWriteOp rw
Definition: ops.h:193
normalize_clear
static void normalize_clear(SwsOp *op)
Definition: ops.c:672
av_const
#define av_const
Definition: attributes.h:105
read_bytes
static void read_bytes(const uint8_t *src, float *dst, int src_stride, int dst_stride, int width, int height, float scale)
Definition: vf_nnedi.c:442
float.h
DECL_FUNCS_32
#define DECL_FUNCS_32(SIZE, EXT, FLAG)
Definition: ops.c:459
FFMAX
#define FFMAX(a, b)
Definition: macros.h:47
av_get_cpu_flags
int av_get_cpu_flags(void)
Return the flags which specify extensions supported by the CPU.
Definition: cpu.c:109
cpu_flags
static atomic_int cpu_flags
Definition: cpu.c:56
setup_linear
static int setup_linear(const SwsImplParams *params, SwsImplResult *out)
Definition: ops.c:263
SwsOpBackend::name
const char * name
Definition: ops_internal.h:56
ff_sws_pixel_type_size
int ff_sws_pixel_type_size(SwsPixelType type)
Definition: ops.c:63
SwsOpChain::cpu_flags
int cpu_flags
Definition: ops_chain.h:89
av_memdup
void * av_memdup(const void *p, size_t size)
Duplicate a buffer with av_malloc().
Definition: mem.c:304
SwsOpPriv::u32
uint32_t u32[4]
Definition: ops_chain.h:54
setup_dither
static int setup_dither(const SwsImplParams *params, SwsImplResult *out)
Definition: ops.c:211
ff_sws_op_list_print
void ff_sws_op_list_print(void *log, int lev, int lev_extra, const SwsOpList *ops)
Print out the contents of an operation list.
Definition: ops.c:754
SwsOpList::num_ops
int num_ops
Definition: ops.h:226
SwsOp::c
SwsConst c
Definition: ops.h:198
SWS_PIXEL_U8
@ SWS_PIXEL_U8
Definition: ops.h:32
AVRational::num
int num
Numerator.
Definition: rational.h:59
SwsOpChain::over_read
int over_read
Definition: ops_chain.h:90
AV_CPU_FLAG_AVX512
#define AV_CPU_FLAG_AVX512
AVX-512 functions: requires OS support even if YMM/ZMM registers aren't used.
Definition: cpu.h:60
SwsOpChain::free
void(* free[SWS_MAX_OPS+1])(SwsOpPriv *)
Definition: ops_chain.h:87
avassert.h
AV_LOG_TRACE
#define AV_LOG_TRACE
Extremely verbose debugging, useful for libav* development.
Definition: log.h:236
FF_ARRAY_ELEMS
#define FF_ARRAY_ELEMS(a)
Definition: sinewin_tablegen.c:29
float
float
Definition: af_crystalizer.c:122
ff_sws_op_chain_alloc
SwsOpChain * ff_sws_op_chain_alloc(void)
Definition: ops_chain.c:29
op
static int op(uint8_t **dst, const uint8_t *dst_end, GetByteContext *gb, int pixel, int count, int *x, int width, int linesize)
Perform decode operation.
Definition: anm.c:76
setup_clear
static int setup_clear(const SwsImplParams *params, SwsImplResult *out)
Definition: ops.c:132
ctx
static AVFormatContext * ctx
Definition: movenc.c:49
AV_CPU_FLAG_SSE4
#define AV_CPU_FLAG_SSE4
Penryn SSE4.1 functions.
Definition: cpu.h:47
ff_sws_op_list_output
const SwsOp * ff_sws_op_list_output(const SwsOpList *ops)
Returns the output operation for a given op list, or NULL if there is none.
Definition: ops.c:579
av_mallocz
#define av_mallocz(s)
Definition: tableprint_vlc.h:31
SwsOpBackend
Definition: ops_internal.h:55
SwsOpChain
Compiled "chain" of operations, which can be dispatched efficiently.
Definition: ops_chain.h:84
AVRational
Rational number (pair of numerator and denominator).
Definition: rational.h:58
SwsImplParams::op
const SwsOp * op
Definition: ops_chain.h:107
tables
static const SwsOpTable *const tables[]
Definition: ops.c:566
setup_rw
static int setup_rw(const SwsImplParams *params, SwsImplResult *out)
Definition: ops.c:61
solve_shuffle
static int solve_shuffle(const SwsOpList *ops, int mmsize, SwsCompiledOp *out)
Definition: ops.c:606
c
Undefined Behavior In the C some operations are like signed integer dereferencing freed accessing outside allocated Undefined Behavior must not occur in a C it is not safe even if the output of undefined operations is unused The unsafety may seem nit picking but Optimizing compilers have in fact optimized code on the assumption that no undefined Behavior occurs Optimizing code based on wrong assumptions can and has in some cases lead to effects beyond the output of computations The signed integer overflow problem in speed critical code Code which is highly optimized and works with signed integers sometimes has the problem that often the output of the computation does not c
Definition: undefined.txt:32
SwsImplParams
Definition: ops_chain.h:105
AV_CPU_FLAG_AVX2
#define AV_CPU_FLAG_AVX2
AVX2 functions: requires OS support even if YMM registers aren't used.
Definition: cpu.h:56
i
#define i(width, name, range_min, range_max)
Definition: cbs_h264.c:63
SwsOp::type
SwsPixelType type
Definition: ops.h:190
SwsOpPriv::u8
uint8_t u8[16]
Definition: ops_chain.h:50
size
int size
Definition: twinvq_data.h:10344
setup_swap_bytes
static int setup_swap_bytes(const SwsImplParams *params, SwsImplResult *out)
Definition: ops.c:102
ff_sws_op_compile_tables
int ff_sws_op_compile_tables(SwsContext *ctx, const SwsOpTable *const tables[], int num_tables, SwsOpList *ops, const int block_size, SwsOpChain *chain)
"Compile" a single op by looking it up in a list of fixed size op tables.
Definition: ops_chain.c:196
SWS_OP_WRITE
@ SWS_OP_WRITE
Definition: ops.h:48
ff_sws_setup_q4
int ff_sws_setup_q4(const SwsImplParams *params, SwsImplResult *out)
Definition: ops_chain.c:290
SwsOpPriv::u16
uint16_t u16[8]
Definition: ops_chain.h:52
ff_sws_op_chain_free_cb
void ff_sws_op_chain_free_cb(void *ptr)
Definition: ops_chain.c:34
compile
static int compile(SwsContext *ctx, SwsOpList *ops, SwsCompiledOp *out)
Definition: ops.c:697
ff_sws_op_chain_free
static void ff_sws_op_chain_free(SwsOpChain *chain)
Definition: ops_chain.h:96
SwsOpList::ops
SwsOp * ops
Definition: ops.h:225
op_is_type_invariant
static bool op_is_type_invariant(const SwsOp *op)
Returns true if the operation's implementation only depends on the block size, and not the underlying...
Definition: ops.c:592
av_assert1
#define av_assert1(cond)
assert() equivalent, that does not lie in speed critical code.
Definition: avassert.h:58
FFMIN
#define FFMIN(a, b)
Definition: macros.h:49
SwsOp
Definition: ops.h:188
write_bytes
static void write_bytes(const float *src, uint8_t *dst, int src_stride, int dst_stride, int width, int height, int depth, float scale)
Definition: vf_nnedi.c:484
ff_op_priv_free
static void ff_op_priv_free(SwsOpPriv *priv)
Definition: ops_chain.h:148
ret
ret
Definition: filter_design.txt:187
SwsCompiledOp
Definition: ops_dispatch.h:75
setup_shift
static int setup_shift(const SwsImplParams *params, SwsImplResult *out)
Definition: ops.c:166
U
#define U(x)
Definition: vpx_arith.h:37
ASSIGN_SHUFFLE_FUNC
#define ASSIGN_SHUFFLE_FUNC(IN, OUT, EXT)
SwsConst::u
unsigned u
Definition: ops.h:85
SwsImplResult::priv
SwsOpPriv priv
Definition: ops_chain.h:113
AVRational::den
int den
Denominator.
Definition: rational.h:60
SwsReadWriteOp::packed
bool packed
Definition: ops.h:103
AV_PIX_FMT_NONE
@ AV_PIX_FMT_NONE
Definition: pixfmt.h:72
ff_sws_solve_shuffle
int ff_sws_solve_shuffle(const SwsOpList *ops, uint8_t shuffle[], int size, uint8_t clear_val, int *read_bytes, int *write_bytes)
"Solve" an op list into a fixed shuffle mask, with an optional ability to also directly clear the out...
Definition: ops_optimizer.c:686
SwsReadWriteOp::elems
uint8_t elems
Definition: ops.h:101
mem.h
av_free
#define av_free(p)
Definition: tableprint_vlc.h:34
av_log
#define av_log(a,...)
Definition: tableprint_vlc.h:27
DECL_FUNCS_16
#define DECL_FUNCS_16(SIZE, EXT, FLAG)
Definition: ops.c:417
stride
#define stride
Definition: h264pred_template.c:536
SwsOpList
Helper struct for representing a list of operations.
Definition: ops.h:224
DECL_FUNCS_8
#define DECL_FUNCS_8(SIZE, EXT, FLAG)
Definition: ops.c:287
SwsContext
Main external API structure.
Definition: swscale.h:206
shuffle
static uint64_t shuffle(uint64_t in, const uint8_t *shuffle, int shuffle_len)
Definition: des.c:179
SwsImplResult
Definition: ops_chain.h:111
read
static uint32_t BS_FUNC() read(BSCTX *bc, unsigned int n)
Return n bits from the buffer, n has to be in the 0-32 range.
Definition: bitstream_template.h:239
SwsOpChain::over_write
int over_write
Definition: ops_chain.h:91