FFmpeg
ops.c
Go to the documentation of this file.
1 /**
2  * Copyright (C) 2025 Niklas Haas
3  *
4  * This file is part of FFmpeg.
5  *
6  * FFmpeg is free software; you can redistribute it and/or
7  * modify it under the terms of the GNU Lesser General Public
8  * License as published by the Free Software Foundation; either
9  * version 2.1 of the License, or (at your option) any later version.
10  *
11  * FFmpeg is distributed in the hope that it will be useful,
12  * but WITHOUT ANY WARRANTY; without even the implied warranty of
13  * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
14  * Lesser General Public License for more details.
15  *
16  * You should have received a copy of the GNU Lesser General Public
17  * License along with FFmpeg; if not, write to the Free Software
18  * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
19  */
20 
21 #include <float.h>
22 
23 #include "libavutil/avassert.h"
24 #include "libavutil/mem.h"
25 
26 #include "../ops_chain.h"
27 
/* Declare a static SwsOpEntry named op_<NAME> of the given pixel TYPE,
 * with any extra designated initializers passed through __VA_ARGS__. */
#define DECL_ENTRY(TYPE, NAME, ...)                                         \
    static const SwsOpEntry op_##NAME = {                                   \
        .type = SWS_PIXEL_##TYPE,                                           \
        __VA_ARGS__                                                         \
    }

/* Declare the external asm implementation ff_<NAME> and an entry
 * referencing it as the kernel function. */
#define DECL_ASM(TYPE, NAME, ...)                                           \
    void ff_##NAME(void);                                                   \
    DECL_ENTRY(TYPE, NAME,                                                  \
        .func = ff_##NAME,                                                  \
        __VA_ARGS__)

/* Declare an asm entry specialized for the component-usage pattern XYZW;
 * a 0 in a position marks that component as unused. */
#define DECL_PATTERN(TYPE, NAME, X, Y, Z, W, ...)                           \
    DECL_ASM(TYPE, p##X##Y##Z##W##_##NAME,                                  \
        .unused = { !X, !Y, !Z, !W },                                       \
        __VA_ARGS__                                                         \
    )

/* Reference a previously declared pattern-specialized entry. */
#define REF_PATTERN(NAME, X, Y, Z, W)                                       \
    &op_p##X##Y##Z##W##_##NAME

/* Declare the four patterns commonly needed in practice:
 * gray, gray+alpha, 3 components, 4 components. */
#define DECL_COMMON_PATTERNS(TYPE, NAME, ...)                               \
    DECL_PATTERN(TYPE, NAME, 1, 0, 0, 0, __VA_ARGS__);                      \
    DECL_PATTERN(TYPE, NAME, 1, 0, 0, 1, __VA_ARGS__);                      \
    DECL_PATTERN(TYPE, NAME, 1, 1, 1, 0, __VA_ARGS__);                      \
    DECL_PATTERN(TYPE, NAME, 1, 1, 1, 1, __VA_ARGS__)

/* Reference all four common pattern specializations of NAME. */
#define REF_COMMON_PATTERNS(NAME)                                           \
    REF_PATTERN(NAME, 1, 0, 0, 0),                                          \
    REF_PATTERN(NAME, 1, 0, 0, 1),                                          \
    REF_PATTERN(NAME, 1, 1, 1, 0),                                          \
    REF_PATTERN(NAME, 1, 1, 1, 1)
60 
/* Declare a read/write kernel entry: ELEMS components, PACKED layout flag,
 * and FRAC fractional-step shift (for sub-byte reads/writes). */
#define DECL_RW(EXT, TYPE, NAME, OP, ELEMS, PACKED, FRAC)                   \
    DECL_ASM(TYPE, NAME##ELEMS##EXT,                                        \
        .op = SWS_OP_##OP,                                                  \
        .rw = { .elems = ELEMS, .packed = PACKED, .frac = FRAC },           \
    );

/* Declare packed read/write kernels for 2, 3 and 4 interleaved components
 * at the given bit DEPTH. */
#define DECL_PACKED_RW(EXT, DEPTH)                                          \
    DECL_RW(EXT, U##DEPTH, read##DEPTH##_packed,  READ,  2, true, 0)        \
    DECL_RW(EXT, U##DEPTH, read##DEPTH##_packed,  READ,  3, true, 0)        \
    DECL_RW(EXT, U##DEPTH, read##DEPTH##_packed,  READ,  4, true, 0)        \
    DECL_RW(EXT, U##DEPTH, write##DEPTH##_packed, WRITE, 2, true, 0)        \
    DECL_RW(EXT, U##DEPTH, write##DEPTH##_packed, WRITE, 3, true, 0)        \
    DECL_RW(EXT, U##DEPTH, write##DEPTH##_packed, WRITE, 4, true, 0)

/* Declare a matching pair of bit-pack and bit-unpack kernels for the
 * per-component bit widths X, Y, Z, W. */
#define DECL_PACK_UNPACK(EXT, TYPE, X, Y, Z, W)                             \
    DECL_ASM(TYPE, pack_##X##Y##Z##W##EXT,                                  \
        .op = SWS_OP_PACK,                                                  \
        .pack.pattern = {X, Y, Z, W},                                       \
    );                                                                      \
                                                                            \
    DECL_ASM(TYPE, unpack_##X##Y##Z##W##EXT,                                \
        .op = SWS_OP_UNPACK,                                                \
        .pack.pattern = {X, Y, Z, W},                                       \
    );
86 static int setup_swap_bytes(const SwsOp *op, SwsOpPriv *out)
87 {
88  const int mask = ff_sws_pixel_type_size(op->type) - 1;
89  for (int i = 0; i < 16; i++)
90  out->u8[i] = (i & ~mask) | (mask - (i & mask));
91  return 0;
92 }
93 
/* Declare a byte-swap entry implemented via the generic shuffle kernel,
 * with its mask generated at setup time by setup_swap_bytes(). */
#define DECL_SWAP_BYTES(EXT, TYPE, X, Y, Z, W)                              \
    DECL_ENTRY(TYPE, p##X##Y##Z##W##_swap_bytes_##TYPE##EXT,                \
        .op = SWS_OP_SWAP_BYTES,                                            \
        .unused = { !X, !Y, !Z, !W },                                       \
        .func = ff_p##X##Y##Z##W##_shuffle##EXT,                            \
        .setup = setup_swap_bytes,                                          \
    );

/* Declare a kernel that clears component IDX to all-ones (opaque alpha). */
#define DECL_CLEAR_ALPHA(EXT, IDX)                                          \
    DECL_ASM(U8, clear_alpha##IDX##EXT,                                     \
        .op = SWS_OP_CLEAR,                                                 \
        .clear_value = -1,                                                  \
        .unused[IDX] = true,                                                \
    );

/* Declare a kernel that clears component IDX to zero. */
#define DECL_CLEAR_ZERO(EXT, IDX)                                           \
    DECL_ASM(U8, clear_zero##IDX##EXT,                                      \
        .op = SWS_OP_CLEAR,                                                 \
        .clear_value = 0,                                                   \
        .unused[IDX] = true,                                                \
    );
115 
116 static int setup_clear(const SwsOp *op, SwsOpPriv *out)
117 {
118  for (int i = 0; i < 4; i++)
119  out->u32[i] = (uint32_t) op->c.q4[i].num;
120  return 0;
121 }
122 
/* Declare a generic clear kernel for a given component pattern; the clear
 * values are flexible and provided at setup time. */
#define DECL_CLEAR(EXT, X, Y, Z, W)                                         \
    DECL_PATTERN(U8, clear##EXT, X, Y, Z, W,                                \
        .op = SWS_OP_CLEAR,                                                 \
        .setup = setup_clear,                                               \
        .flexible = true,                                                   \
    );

/* Declare a component-swizzle kernel with the fixed input mapping XYZW. */
#define DECL_SWIZZLE(EXT, X, Y, Z, W)                                       \
    DECL_ASM(U8, swizzle_##X##Y##Z##W##EXT,                                 \
        .op = SWS_OP_SWIZZLE,                                               \
        .swizzle.in = {X, Y, Z, W},                                         \
    );

/* Declare pixel type conversion kernels FROM -> TO for the common
 * component patterns. */
#define DECL_CONVERT(EXT, FROM, TO)                                         \
    DECL_COMMON_PATTERNS(FROM, convert_##FROM##_##TO##EXT,                  \
        .op = SWS_OP_CONVERT,                                               \
        .convert.to = SWS_PIXEL_##TO,                                       \
    );

/* Like DECL_CONVERT, but bit-replicating ("expanding") the value into the
 * wider type instead of a plain integer conversion. */
#define DECL_EXPAND(EXT, FROM, TO)                                          \
    DECL_COMMON_PATTERNS(FROM, expand_##FROM##_##TO##EXT,                   \
        .op = SWS_OP_CONVERT,                                               \
        .convert.to = SWS_PIXEL_##TO,                                       \
        .convert.expand = true,                                             \
    );
148 
149 static int setup_shift(const SwsOp *op, SwsOpPriv *out)
150 {
151  out->u16[0] = op->c.u;
152  return 0;
153 }
154 
/* Declare left/right shift kernels on 16-bit values; the shift amount is
 * flexible and supplied at setup time. */
#define DECL_SHIFT16(EXT)                                                   \
    DECL_COMMON_PATTERNS(U16, lshift16##EXT,                                \
        .op = SWS_OP_LSHIFT,                                                \
        .setup = setup_shift,                                               \
        .flexible = true,                                                   \
    );                                                                      \
                                                                            \
    DECL_COMMON_PATTERNS(U16, rshift16##EXT,                                \
        .op = SWS_OP_RSHIFT,                                                \
        .setup = setup_shift,                                               \
        .flexible = true,                                                   \
    );

/* Declare float min/max clamping kernels; the per-component bounds are
 * flexible and converted at setup time from the op's rational constants. */
#define DECL_MIN_MAX(EXT)                                                   \
    DECL_COMMON_PATTERNS(F32, min##EXT,                                     \
        .op = SWS_OP_MIN,                                                   \
        .setup = ff_sws_setup_q4,                                           \
        .flexible = true,                                                   \
    );                                                                      \
                                                                            \
    DECL_COMMON_PATTERNS(F32, max##EXT,                                     \
        .op = SWS_OP_MAX,                                                   \
        .setup = ff_sws_setup_q4,                                           \
        .flexible = true,                                                   \
    );

/* Declare a float scaling kernel with a single rational scale factor. */
#define DECL_SCALE(EXT)                                                     \
    DECL_COMMON_PATTERNS(F32, scale##EXT,                                   \
        .op = SWS_OP_SCALE,                                                 \
        .setup = ff_sws_setup_q,                                            \
    );
186 
187 static int setup_dither(const SwsOp *op, SwsOpPriv *out)
188 {
189  /* 1x1 matrix / single constant */
190  if (!op->dither.size_log2) {
191  const AVRational k = op->dither.matrix[0];
192  out->f32[0] = (float) k.num / k.den;
193  return 0;
194  }
195 
196  const int size = 1 << op->dither.size_log2;
197  int max_offset = 0;
198  for (int i = 0; i < 4; i++) {
199  const int offset = op->dither.y_offset[i] & (size - 1);
200  max_offset = FFMAX(max_offset, offset);
201  }
202 
203  /* Allocate extra rows to allow over-reading for row offsets. Note that
204  * max_offset is currently never larger than 5, so the extra space needed
205  * for this over-allocation is bounded by 5 * size * sizeof(float),
206  * typically 320 bytes for a 16x16 dither matrix. */
207  const int stride = size * sizeof(float);
208  const int num_rows = size + max_offset;
209  float *matrix = out->ptr = av_mallocz(num_rows * stride);
210  if (!matrix)
211  return AVERROR(ENOMEM);
212 
213  for (int i = 0; i < size * size; i++)
214  matrix[i] = (float) op->dither.matrix[i].num / op->dither.matrix[i].den;
215 
216  memcpy(&matrix[size * size], matrix, max_offset * stride);
217 
218  /* Store relative pointer offset to each row inside extra space */
219  static_assert(sizeof(out->ptr) <= sizeof(uint16_t[4]), ">8 byte pointers not supported");
220  assert(max_offset * stride <= UINT16_MAX);
221  uint16_t *offset = &out->u16[4];
222  for (int i = 0; i < 4; i++)
223  offset[i] = (op->dither.y_offset[i] & (size - 1)) * stride;
224 
225  return 0;
226 }
227 
/* Declare a dither kernel for a 2^SIZE x 2^SIZE matrix. SIZE == 0 uses an
 * inline constant and therefore needs no free callback. */
#define DECL_DITHER(EXT, SIZE)                                              \
    DECL_COMMON_PATTERNS(F32, dither##SIZE##EXT,                            \
        .op = SWS_OP_DITHER,                                                \
        .setup = setup_dither,                                              \
        .free = (SIZE) ? av_free : NULL,                                    \
        .dither_size = SIZE,                                                \
    );
235 
236 static int setup_linear(const SwsOp *op, SwsOpPriv *out)
237 {
238  float *matrix = out->ptr = av_mallocz(sizeof(float[4][5]));
239  if (!matrix)
240  return AVERROR(ENOMEM);
241 
242  for (int y = 0; y < 4; y++) {
243  for (int x = 0; x < 5; x++)
244  matrix[y * 5 + x] = (float) op->lin.m[y][x].num / op->lin.m[y][x].den;
245  }
246 
247  return 0;
248 }
249 
/* Declare a linear-transform kernel specialized for a particular mask of
 * active matrix coefficients (e.g. luma only, diagonal, full affine). */
#define DECL_LINEAR(EXT, NAME, MASK)                                        \
    DECL_ASM(F32, NAME##EXT,                                                \
        .op = SWS_OP_LINEAR,                                                \
        .setup = setup_linear,                                              \
        .free = av_free,                                                    \
        .linear_mask = (MASK),                                              \
    );
257 
/* Declare all 8-bit kernels for one block size / CPU flag combination, and
 * collect them into the op table ops8<EXT>. */
#define DECL_FUNCS_8(SIZE, EXT, FLAG)                                       \
    DECL_RW(EXT, U8, read_planar,  READ,  1, false, 0)                      \
    DECL_RW(EXT, U8, read_planar,  READ,  2, false, 0)                      \
    DECL_RW(EXT, U8, read_planar,  READ,  3, false, 0)                      \
    DECL_RW(EXT, U8, read_planar,  READ,  4, false, 0)                      \
    DECL_RW(EXT, U8, write_planar, WRITE, 1, false, 0)                      \
    DECL_RW(EXT, U8, write_planar, WRITE, 2, false, 0)                      \
    DECL_RW(EXT, U8, write_planar, WRITE, 3, false, 0)                      \
    DECL_RW(EXT, U8, write_planar, WRITE, 4, false, 0)                      \
    DECL_RW(EXT, U8, read_nibbles, READ,  1, false, 1)                      \
    DECL_RW(EXT, U8, read_bits,    READ,  1, false, 3)                      \
    DECL_RW(EXT, U8, write_bits,   WRITE, 1, false, 3)                      \
    DECL_PACKED_RW(EXT, 8)                                                  \
    DECL_PACK_UNPACK(EXT, U8, 1, 2, 1, 0)                                   \
    DECL_PACK_UNPACK(EXT, U8, 3, 3, 2, 0)                                   \
    DECL_PACK_UNPACK(EXT, U8, 2, 3, 3, 0)                                   \
    void ff_p1000_shuffle##EXT(void);                                       \
    void ff_p1001_shuffle##EXT(void);                                       \
    void ff_p1110_shuffle##EXT(void);                                       \
    void ff_p1111_shuffle##EXT(void);                                       \
    DECL_SWIZZLE(EXT, 3, 0, 1, 2)                                           \
    DECL_SWIZZLE(EXT, 3, 0, 2, 1)                                           \
    DECL_SWIZZLE(EXT, 2, 1, 0, 3)                                           \
    DECL_SWIZZLE(EXT, 3, 2, 1, 0)                                           \
    DECL_SWIZZLE(EXT, 3, 1, 0, 2)                                           \
    DECL_SWIZZLE(EXT, 3, 2, 0, 1)                                           \
    DECL_SWIZZLE(EXT, 1, 2, 0, 3)                                           \
    DECL_SWIZZLE(EXT, 1, 0, 2, 3)                                           \
    DECL_SWIZZLE(EXT, 2, 0, 1, 3)                                           \
    DECL_SWIZZLE(EXT, 2, 3, 1, 0)                                           \
    DECL_SWIZZLE(EXT, 2, 1, 3, 0)                                           \
    DECL_SWIZZLE(EXT, 1, 2, 3, 0)                                           \
    DECL_SWIZZLE(EXT, 1, 3, 2, 0)                                           \
    DECL_SWIZZLE(EXT, 0, 2, 1, 3)                                           \
    DECL_SWIZZLE(EXT, 0, 2, 3, 1)                                           \
    DECL_SWIZZLE(EXT, 0, 3, 1, 2)                                           \
    DECL_SWIZZLE(EXT, 3, 1, 2, 0)                                           \
    DECL_SWIZZLE(EXT, 0, 3, 2, 1)                                           \
    DECL_SWIZZLE(EXT, 0, 0, 0, 3)                                           \
    DECL_SWIZZLE(EXT, 3, 0, 0, 0)                                           \
    DECL_SWIZZLE(EXT, 0, 0, 0, 1)                                           \
    DECL_SWIZZLE(EXT, 1, 0, 0, 0)                                           \
    DECL_CLEAR_ALPHA(EXT, 0)                                                \
    DECL_CLEAR_ALPHA(EXT, 1)                                                \
    DECL_CLEAR_ALPHA(EXT, 3)                                                \
    DECL_CLEAR_ZERO(EXT, 0)                                                 \
    DECL_CLEAR_ZERO(EXT, 1)                                                 \
    DECL_CLEAR_ZERO(EXT, 3)                                                 \
    DECL_CLEAR(EXT, 1, 1, 1, 0)                                             \
    DECL_CLEAR(EXT, 0, 1, 1, 1)                                             \
    DECL_CLEAR(EXT, 0, 0, 1, 1)                                             \
    DECL_CLEAR(EXT, 1, 0, 0, 1)                                             \
    DECL_CLEAR(EXT, 1, 1, 0, 0)                                             \
    DECL_CLEAR(EXT, 0, 1, 0, 1)                                             \
    DECL_CLEAR(EXT, 1, 0, 1, 0)                                             \
    DECL_CLEAR(EXT, 1, 0, 0, 0)                                             \
    DECL_CLEAR(EXT, 0, 1, 0, 0)                                             \
    DECL_CLEAR(EXT, 0, 0, 1, 0)                                             \
                                                                            \
static const SwsOpTable ops8##EXT = {                                       \
    .cpu_flags = AV_CPU_FLAG_##FLAG,                                        \
    .block_size = SIZE,                                                     \
    .entries = {                                                            \
        &op_read_planar1##EXT,                                              \
        &op_read_planar2##EXT,                                              \
        &op_read_planar3##EXT,                                              \
        &op_read_planar4##EXT,                                              \
        &op_write_planar1##EXT,                                             \
        &op_write_planar2##EXT,                                             \
        &op_write_planar3##EXT,                                             \
        &op_write_planar4##EXT,                                             \
        &op_read8_packed2##EXT,                                             \
        &op_read8_packed3##EXT,                                             \
        &op_read8_packed4##EXT,                                             \
        &op_write8_packed2##EXT,                                            \
        &op_write8_packed3##EXT,                                            \
        &op_write8_packed4##EXT,                                            \
        &op_read_nibbles1##EXT,                                             \
        &op_read_bits1##EXT,                                                \
        &op_write_bits1##EXT,                                               \
        &op_pack_1210##EXT,                                                 \
        &op_pack_3320##EXT,                                                 \
        &op_pack_2330##EXT,                                                 \
        &op_unpack_1210##EXT,                                               \
        &op_unpack_3320##EXT,                                               \
        &op_unpack_2330##EXT,                                               \
        &op_swizzle_3012##EXT,                                              \
        &op_swizzle_3021##EXT,                                              \
        &op_swizzle_2103##EXT,                                              \
        &op_swizzle_3210##EXT,                                              \
        &op_swizzle_3102##EXT,                                              \
        &op_swizzle_3201##EXT,                                              \
        &op_swizzle_1203##EXT,                                              \
        &op_swizzle_1023##EXT,                                              \
        &op_swizzle_2013##EXT,                                              \
        &op_swizzle_2310##EXT,                                              \
        &op_swizzle_2130##EXT,                                              \
        &op_swizzle_1230##EXT,                                              \
        &op_swizzle_1320##EXT,                                              \
        &op_swizzle_0213##EXT,                                              \
        &op_swizzle_0231##EXT,                                              \
        &op_swizzle_0312##EXT,                                              \
        &op_swizzle_3120##EXT,                                              \
        &op_swizzle_0321##EXT,                                              \
        &op_swizzle_0003##EXT,                                              \
        &op_swizzle_0001##EXT,                                              \
        &op_swizzle_3000##EXT,                                              \
        &op_swizzle_1000##EXT,                                              \
        &op_clear_alpha0##EXT,                                              \
        &op_clear_alpha1##EXT,                                              \
        &op_clear_alpha3##EXT,                                              \
        &op_clear_zero0##EXT,                                               \
        &op_clear_zero1##EXT,                                               \
        &op_clear_zero3##EXT,                                               \
        REF_PATTERN(clear##EXT, 1, 1, 1, 0),                                \
        REF_PATTERN(clear##EXT, 0, 1, 1, 1),                                \
        REF_PATTERN(clear##EXT, 0, 0, 1, 1),                                \
        REF_PATTERN(clear##EXT, 1, 0, 0, 1),                                \
        REF_PATTERN(clear##EXT, 1, 1, 0, 0),                                \
        REF_PATTERN(clear##EXT, 0, 1, 0, 1),                                \
        REF_PATTERN(clear##EXT, 1, 0, 1, 0),                                \
        REF_PATTERN(clear##EXT, 1, 0, 0, 0),                                \
        REF_PATTERN(clear##EXT, 0, 1, 0, 0),                                \
        REF_PATTERN(clear##EXT, 0, 0, 1, 0),                                \
        NULL                                                                \
    },                                                                      \
};
385 
/* Declare all 16-bit kernels for one block size / CPU flag combination, and
 * collect them into the op table ops16<EXT>. */
#define DECL_FUNCS_16(SIZE, EXT, FLAG)                                      \
    DECL_PACKED_RW(EXT, 16)                                                 \
    DECL_PACK_UNPACK(EXT, U16, 4, 4, 4, 0)                                  \
    DECL_PACK_UNPACK(EXT, U16, 5, 5, 5, 0)                                  \
    DECL_PACK_UNPACK(EXT, U16, 5, 6, 5, 0)                                  \
    DECL_SWAP_BYTES(EXT, U16, 1, 0, 0, 0)                                   \
    DECL_SWAP_BYTES(EXT, U16, 1, 0, 0, 1)                                   \
    DECL_SWAP_BYTES(EXT, U16, 1, 1, 1, 0)                                   \
    DECL_SWAP_BYTES(EXT, U16, 1, 1, 1, 1)                                   \
    DECL_SHIFT16(EXT)                                                       \
    DECL_CONVERT(EXT, U8, U16)                                              \
    DECL_CONVERT(EXT, U16, U8)                                              \
    DECL_EXPAND(EXT, U8, U16)                                               \
                                                                            \
static const SwsOpTable ops16##EXT = {                                      \
    .cpu_flags = AV_CPU_FLAG_##FLAG,                                        \
    .block_size = SIZE,                                                     \
    .entries = {                                                            \
        &op_read16_packed2##EXT,                                            \
        &op_read16_packed3##EXT,                                            \
        &op_read16_packed4##EXT,                                            \
        &op_write16_packed2##EXT,                                           \
        &op_write16_packed3##EXT,                                           \
        &op_write16_packed4##EXT,                                           \
        &op_pack_4440##EXT,                                                 \
        &op_pack_5550##EXT,                                                 \
        &op_pack_5650##EXT,                                                 \
        &op_unpack_4440##EXT,                                               \
        &op_unpack_5550##EXT,                                               \
        &op_unpack_5650##EXT,                                               \
        REF_COMMON_PATTERNS(swap_bytes_U16##EXT),                           \
        REF_COMMON_PATTERNS(convert_U8_U16##EXT),                           \
        REF_COMMON_PATTERNS(convert_U16_U8##EXT),                           \
        REF_COMMON_PATTERNS(expand_U8_U16##EXT),                            \
        REF_COMMON_PATTERNS(lshift16##EXT),                                 \
        REF_COMMON_PATTERNS(rshift16##EXT),                                 \
        NULL                                                                \
    },                                                                      \
};
425 
/* Declare all 32-bit integer and float kernels for one block size / CPU
 * flag combination, and collect them into the op table ops32<EXT>. The
 * packed I/O and swap kernels use the double-width (_m2) variants. */
#define DECL_FUNCS_32(SIZE, EXT, FLAG)                                      \
    DECL_PACKED_RW(_m2##EXT, 32)                                            \
    DECL_PACK_UNPACK(_m2##EXT, U32, 10, 10, 10, 2)                          \
    DECL_PACK_UNPACK(_m2##EXT, U32, 2, 10, 10, 10)                          \
    DECL_SWAP_BYTES(_m2##EXT, U32, 1, 0, 0, 0)                              \
    DECL_SWAP_BYTES(_m2##EXT, U32, 1, 0, 0, 1)                              \
    DECL_SWAP_BYTES(_m2##EXT, U32, 1, 1, 1, 0)                              \
    DECL_SWAP_BYTES(_m2##EXT, U32, 1, 1, 1, 1)                              \
    DECL_CONVERT(EXT, U8, U32)                                              \
    DECL_CONVERT(EXT, U32, U8)                                              \
    DECL_CONVERT(EXT, U16, U32)                                             \
    DECL_CONVERT(EXT, U32, U16)                                             \
    DECL_CONVERT(EXT, U8, F32)                                              \
    DECL_CONVERT(EXT, F32, U8)                                              \
    DECL_CONVERT(EXT, U16, F32)                                             \
    DECL_CONVERT(EXT, F32, U16)                                             \
    DECL_EXPAND(EXT, U8, U32)                                               \
    DECL_MIN_MAX(EXT)                                                       \
    DECL_SCALE(EXT)                                                         \
    DECL_DITHER(EXT, 0)                                                     \
    DECL_DITHER(EXT, 1)                                                     \
    DECL_DITHER(EXT, 2)                                                     \
    DECL_DITHER(EXT, 3)                                                     \
    DECL_DITHER(EXT, 4)                                                     \
    DECL_DITHER(EXT, 5)                                                     \
    DECL_DITHER(EXT, 6)                                                     \
    DECL_DITHER(EXT, 7)                                                     \
    DECL_DITHER(EXT, 8)                                                     \
    DECL_LINEAR(EXT, luma,      SWS_MASK_LUMA)                              \
    DECL_LINEAR(EXT, alpha,     SWS_MASK_ALPHA)                             \
    DECL_LINEAR(EXT, lumalpha,  SWS_MASK_LUMA | SWS_MASK_ALPHA)             \
    DECL_LINEAR(EXT, dot3,      0x7)                                        \
    DECL_LINEAR(EXT, row0,      SWS_MASK_ROW(0))                            \
    DECL_LINEAR(EXT, row0a,     SWS_MASK_ROW(0) | SWS_MASK_ALPHA)           \
    DECL_LINEAR(EXT, diag3,     SWS_MASK_DIAG3)                             \
    DECL_LINEAR(EXT, diag4,     SWS_MASK_DIAG4)                             \
    DECL_LINEAR(EXT, diagoff3,  SWS_MASK_DIAG3 | SWS_MASK_OFF3)             \
    DECL_LINEAR(EXT, matrix3,   SWS_MASK_MAT3)                              \
    DECL_LINEAR(EXT, affine3,   SWS_MASK_MAT3 | SWS_MASK_OFF3)              \
    DECL_LINEAR(EXT, affine3a,  SWS_MASK_MAT3 | SWS_MASK_OFF3 | SWS_MASK_ALPHA) \
    DECL_LINEAR(EXT, matrix4,   SWS_MASK_MAT4)                              \
    DECL_LINEAR(EXT, affine4,   SWS_MASK_MAT4 | SWS_MASK_OFF4)              \
                                                                            \
static const SwsOpTable ops32##EXT = {                                      \
    .cpu_flags = AV_CPU_FLAG_##FLAG,                                        \
    .block_size = SIZE,                                                     \
    .entries = {                                                            \
        &op_read32_packed2_m2##EXT,                                         \
        &op_read32_packed3_m2##EXT,                                         \
        &op_read32_packed4_m2##EXT,                                         \
        &op_write32_packed2_m2##EXT,                                        \
        &op_write32_packed3_m2##EXT,                                        \
        &op_write32_packed4_m2##EXT,                                        \
        &op_pack_1010102_m2##EXT,                                           \
        &op_pack_2101010_m2##EXT,                                           \
        &op_unpack_1010102_m2##EXT,                                         \
        &op_unpack_2101010_m2##EXT,                                         \
        REF_COMMON_PATTERNS(swap_bytes_U32_m2##EXT),                        \
        REF_COMMON_PATTERNS(convert_U8_U32##EXT),                           \
        REF_COMMON_PATTERNS(convert_U32_U8##EXT),                           \
        REF_COMMON_PATTERNS(convert_U16_U32##EXT),                          \
        REF_COMMON_PATTERNS(convert_U32_U16##EXT),                          \
        REF_COMMON_PATTERNS(convert_U8_F32##EXT),                           \
        REF_COMMON_PATTERNS(convert_F32_U8##EXT),                           \
        REF_COMMON_PATTERNS(convert_U16_F32##EXT),                          \
        REF_COMMON_PATTERNS(convert_F32_U16##EXT),                          \
        REF_COMMON_PATTERNS(expand_U8_U32##EXT),                            \
        REF_COMMON_PATTERNS(min##EXT),                                      \
        REF_COMMON_PATTERNS(max##EXT),                                      \
        REF_COMMON_PATTERNS(scale##EXT),                                    \
        REF_COMMON_PATTERNS(dither0##EXT),                                  \
        REF_COMMON_PATTERNS(dither1##EXT),                                  \
        REF_COMMON_PATTERNS(dither2##EXT),                                  \
        REF_COMMON_PATTERNS(dither3##EXT),                                  \
        REF_COMMON_PATTERNS(dither4##EXT),                                  \
        REF_COMMON_PATTERNS(dither5##EXT),                                  \
        REF_COMMON_PATTERNS(dither6##EXT),                                  \
        REF_COMMON_PATTERNS(dither7##EXT),                                  \
        REF_COMMON_PATTERNS(dither8##EXT),                                  \
        &op_luma##EXT,                                                      \
        &op_alpha##EXT,                                                     \
        &op_lumalpha##EXT,                                                  \
        &op_dot3##EXT,                                                      \
        &op_row0##EXT,                                                      \
        &op_row0a##EXT,                                                     \
        &op_diag3##EXT,                                                     \
        &op_diag4##EXT,                                                     \
        &op_diagoff3##EXT,                                                  \
        &op_matrix3##EXT,                                                   \
        &op_affine3##EXT,                                                   \
        &op_affine3a##EXT,                                                  \
        &op_matrix4##EXT,                                                   \
        &op_affine4##EXT,                                                   \
        NULL                                                                \
    },                                                                      \
};
522 
523 DECL_FUNCS_8(16, _m1_sse4, SSE4)
524 DECL_FUNCS_8(32, _m1_avx2, AVX2)
525 DECL_FUNCS_8(32, _m2_sse4, SSE4)
526 DECL_FUNCS_8(64, _m2_avx2, AVX2)
527 
528 DECL_FUNCS_16(16, _m1_avx2, AVX2)
529 DECL_FUNCS_16(32, _m2_avx2, AVX2)
530 
531 DECL_FUNCS_32(16, _avx2, AVX2)
532 
533 static av_const int get_mmsize(const int cpu_flags)
534 {
536  return 64;
537  else if (cpu_flags & AV_CPU_FLAG_AVX2)
538  return 32;
539  else if (cpu_flags & AV_CPU_FLAG_SSE4)
540  return 16;
541  else
542  return AVERROR(ENOTSUP);
543 }
544 
545 /**
546  * Returns true if the operation's implementation only depends on the block
547  * size, and not the underlying pixel type
548  */
549 static bool op_is_type_invariant(const SwsOp *op)
550 {
551  switch (op->op) {
552  case SWS_OP_READ:
553  case SWS_OP_WRITE:
554  return !op->rw.packed && !op->rw.frac;
555  case SWS_OP_SWIZZLE:
556  case SWS_OP_CLEAR:
557  return true;
558  }
559 
560  return false;
561 }
562 
563 static int solve_shuffle(const SwsOpList *ops, int mmsize, SwsCompiledOp *out)
564 {
565  uint8_t shuffle[16];
566  int read_bytes, write_bytes;
567  int pixels;
568 
569  /* Solve the shuffle mask for one 128-bit lane only */
570  pixels = ff_sws_solve_shuffle(ops, shuffle, 16, 0x80, &read_bytes, &write_bytes);
571  if (pixels < 0)
572  return pixels;
573 
574  /* We can't shuffle acress lanes, so restrict the vector size to XMM
575  * whenever the read/write size would be a subset of the full vector */
576  if (read_bytes < 16 || write_bytes < 16)
577  mmsize = 16;
578 
579  const int num_lanes = mmsize / 16;
580  const int in_total = num_lanes * read_bytes;
581  const int out_total = num_lanes * write_bytes;
582  const int read_size = in_total <= 4 ? 4 : /* movd */
583  in_total <= 8 ? 8 : /* movq */
584  mmsize; /* movu */
585 
586  *out = (SwsCompiledOp) {
587  .priv = av_memdup(shuffle, sizeof(shuffle)),
588  .free = av_free,
589  .block_size = pixels * num_lanes,
590  .over_read = read_size - in_total,
591  .over_write = mmsize - out_total,
592  .cpu_flags = mmsize > 32 ? AV_CPU_FLAG_AVX512 :
593  mmsize > 16 ? AV_CPU_FLAG_AVX2 :
595  };
596 
597  if (!out->priv)
598  return AVERROR(ENOMEM);
599 
600 #define ASSIGN_SHUFFLE_FUNC(IN, OUT, EXT) \
601 do { \
602  SWS_DECL_FUNC(ff_packed_shuffle##IN##_##OUT##_##EXT); \
603  if (in_total == IN && out_total == OUT) \
604  out->func = ff_packed_shuffle##IN##_##OUT##_##EXT; \
605 } while (0)
606 
607  ASSIGN_SHUFFLE_FUNC( 5, 15, sse4);
608  ASSIGN_SHUFFLE_FUNC( 4, 16, sse4);
609  ASSIGN_SHUFFLE_FUNC( 2, 12, sse4);
610  ASSIGN_SHUFFLE_FUNC(16, 8, sse4);
611  ASSIGN_SHUFFLE_FUNC(10, 15, sse4);
612  ASSIGN_SHUFFLE_FUNC( 8, 16, sse4);
613  ASSIGN_SHUFFLE_FUNC( 4, 12, sse4);
614  ASSIGN_SHUFFLE_FUNC(15, 15, sse4);
615  ASSIGN_SHUFFLE_FUNC(12, 16, sse4);
616  ASSIGN_SHUFFLE_FUNC( 6, 12, sse4);
617  ASSIGN_SHUFFLE_FUNC(16, 12, sse4);
618  ASSIGN_SHUFFLE_FUNC(16, 16, sse4);
619  ASSIGN_SHUFFLE_FUNC( 8, 12, sse4);
620  ASSIGN_SHUFFLE_FUNC(12, 12, sse4);
621  ASSIGN_SHUFFLE_FUNC(32, 32, avx2);
622  ASSIGN_SHUFFLE_FUNC(64, 64, avx512);
623  av_assert1(out->func);
624  return 0;
625 }
626 
627 /* Normalize clear values into 32-bit integer constants */
628 static void normalize_clear(SwsOp *op)
629 {
630  static_assert(sizeof(uint32_t) == sizeof(int), "int size mismatch");
631  SwsOpPriv priv;
632  union {
633  uint32_t u32;
634  int i;
635  } c;
636 
637  ff_sws_setup_q4(op, &priv);
638  for (int i = 0; i < 4; i++) {
639  if (!op->c.q4[i].den)
640  continue;
641  switch (ff_sws_pixel_type_size(op->type)) {
642  case 1: c.u32 = 0x1010101U * priv.u8[i]; break;
643  case 2: c.u32 = (uint32_t)priv.u16[i] << 16 | priv.u16[i]; break;
644  case 4: c.u32 = priv.u32[i]; break;
645  }
646 
647  op->c.q4[i].num = c.i;
648  op->c.q4[i].den = 1;
649  }
650 }
651 
653 {
654  const int cpu_flags = av_get_cpu_flags();
655  const int mmsize = get_mmsize(cpu_flags);
656  if (mmsize < 0)
657  return mmsize;
658 
659  av_assert1(ops->num_ops > 0);
660  const SwsOp read = ops->ops[0];
661  const SwsOp write = ops->ops[ops->num_ops - 1];
662  int ret;
663 
664  /* Special fast path for in-place packed shuffle */
665  ret = solve_shuffle(ops, mmsize, out);
666  if (ret != AVERROR(ENOTSUP))
667  return ret;
668 
670  if (!chain)
671  return AVERROR(ENOMEM);
672 
673  *out = (SwsCompiledOp) {
674  .priv = chain,
676 
677  /* Use at most two full YMM regs during the widest precision section */
678  .block_size = 2 * FFMIN(mmsize, 32) / ff_sws_op_list_max_size(ops),
679  };
680 
681  /* 3-component reads/writes process one extra garbage word */
682  if (read.rw.packed && read.rw.elems == 3)
683  out->over_read = sizeof(uint32_t);
684  if (write.rw.packed && write.rw.elems == 3)
685  out->over_write = sizeof(uint32_t);
686 
687  static const SwsOpTable *const tables[] = {
688  &ops8_m1_sse4,
689  &ops8_m1_avx2,
690  &ops8_m2_sse4,
691  &ops8_m2_avx2,
692  &ops16_m1_avx2,
693  &ops16_m2_avx2,
694  &ops32_avx2,
695  };
696 
697  do {
698  int op_block_size = out->block_size;
699  SwsOp *op = &ops->ops[0];
700 
701  if (op_is_type_invariant(op)) {
702  if (op->op == SWS_OP_CLEAR)
704  op_block_size *= ff_sws_pixel_type_size(op->type);
705  op->type = SWS_PIXEL_U8;
706  }
707 
709  op_block_size, chain);
710  } while (ret == AVERROR(EAGAIN));
711  if (ret < 0) {
712  ff_sws_op_chain_free(chain);
713  return ret;
714  }
715 
716 #define ASSIGN_PROCESS_FUNC(NAME) \
717  do { \
718  SWS_DECL_FUNC(NAME); \
719  void NAME##_return(void); \
720  ret = ff_sws_op_chain_append(chain, NAME##_return, \
721  NULL, &(SwsOpPriv) {0}); \
722  out->func = NAME; \
723  } while (0)
724 
725  const int read_planes = read.rw.packed ? 1 : read.rw.elems;
726  const int write_planes = write.rw.packed ? 1 : write.rw.elems;
727  switch (FFMAX(read_planes, write_planes)) {
728  case 1: ASSIGN_PROCESS_FUNC(ff_sws_process1_x86); break;
729  case 2: ASSIGN_PROCESS_FUNC(ff_sws_process2_x86); break;
730  case 3: ASSIGN_PROCESS_FUNC(ff_sws_process3_x86); break;
731  case 4: ASSIGN_PROCESS_FUNC(ff_sws_process4_x86); break;
732  }
733 
734  if (ret < 0) {
735  ff_sws_op_chain_free(chain);
736  return ret;
737  }
738 
739  out->cpu_flags = chain->cpu_flags;
740  return 0;
741 }
742 
744  .name = "x86",
745  .compile = compile,
746 };
SWS_OP_READ
@ SWS_OP_READ
Definition: ops.h:47
SwsOpTable
Definition: ops_chain.h:122
SWS_OP_SWIZZLE
@ SWS_OP_SWIZZLE
Definition: ops.h:57
AVERROR
Filter the word “frame” indicates either a video frame or a group of audio as stored in an AVFrame structure Format for each input and each output the list of supported formats For video that means pixel format For audio that means channel sample they are references to shared objects When the negotiation mechanism computes the intersection of the formats supported at each end of a all references to both lists are replaced with a reference to the intersection And when a single format is eventually chosen for a link amongst the remaining all references to the list are updated That means that if a filter requires that its input and output have the same format amongst a supported all it has to do is use a reference to the same list of formats query_formats can leave some formats unset and return AVERROR(EAGAIN) to cause the negotiation mechanism toagain later. That can be used by filters with complex requirements to use the format negotiated on one link to set the formats supported on another. Frame references ownership and permissions
ASSIGN_PROCESS_FUNC
#define ASSIGN_PROCESS_FUNC(NAME)
get_mmsize
static av_const int get_mmsize(const int cpu_flags)
Definition: ops.c:533
out
FILE * out
Definition: movenc.c:55
setup_linear
static int setup_linear(const SwsOp *op, SwsOpPriv *out)
Definition: ops.c:236
SWS_OP_CLEAR
@ SWS_OP_CLEAR
Definition: ops.h:54
ff_sws_op_list_max_size
int ff_sws_op_list_max_size(const SwsOpList *ops)
Returns the size of the largest pixel type used in ops.
Definition: ops.c:302
backend_x86
const SwsOpBackend backend_x86
Definition: ops.c:743
matrix
Definition: vc1dsp.c:43
mask
int mask
Definition: mediacodecdec_common.c:154
SwsOp::rw
SwsReadWriteOp rw
Definition: ops.h:184
normalize_clear
static void normalize_clear(SwsOp *op)
Definition: ops.c:628
av_const
#define av_const
Definition: attributes.h:100
read_bytes
static void read_bytes(const uint8_t *src, float *dst, int src_stride, int dst_stride, int width, int height, float scale)
Definition: vf_nnedi.c:442
float.h
DECL_FUNCS_32
#define DECL_FUNCS_32(SIZE, EXT, FLAG)
Definition: ops.c:426
FFMAX
#define FFMAX(a, b)
Definition: macros.h:47
av_get_cpu_flags
int av_get_cpu_flags(void)
Return the flags which specify extensions supported by the CPU.
Definition: cpu.c:109
cpu_flags
static atomic_int cpu_flags
Definition: cpu.c:56
SwsOpBackend::name
const char * name
Definition: ops_internal.h:104
ff_sws_pixel_type_size
int ff_sws_pixel_type_size(SwsPixelType type)
Definition: ops.c:64
SwsOpChain::cpu_flags
int cpu_flags
Definition: ops_chain.h:85
av_memdup
void * av_memdup(const void *p, size_t size)
Duplicate a buffer with av_malloc().
Definition: mem.c:304
SwsOpPriv::u32
uint32_t u32[4]
Definition: ops_chain.h:49
SwsOpList::num_ops
int num_ops
Definition: ops.h:211
tables
Writing a table generator This documentation is preliminary Parts of the API are not good and should be changed Basic concepts A table generator consists of two *_tablegen c and *_tablegen h The h file will provide the variable declarations and initialization code for the tables
Definition: tablegen.txt:10
SWS_PIXEL_U8
@ SWS_PIXEL_U8
Definition: ops.h:32
AVRational::num
int num
Numerator.
Definition: rational.h:59
AV_CPU_FLAG_AVX512
#define AV_CPU_FLAG_AVX512
AVX-512 functions: requires OS support even if YMM/ZMM registers aren't used.
Definition: cpu.h:59
avassert.h
setup_dither
static int setup_dither(const SwsOp *op, SwsOpPriv *out)
Definition: ops.c:187
FF_ARRAY_ELEMS
#define FF_ARRAY_ELEMS(a)
Definition: sinewin_tablegen.c:29
float
float
Definition: af_crystalizer.c:122
ff_sws_op_chain_alloc
SwsOpChain * ff_sws_op_chain_alloc(void)
Definition: ops_chain.c:29
op
static int op(uint8_t **dst, const uint8_t *dst_end, GetByteContext *gb, int pixel, int count, int *x, int width, int linesize)
Perform decode operation.
Definition: anm.c:76
ctx
AVFormatContext * ctx
Definition: movenc.c:49
AV_CPU_FLAG_SSE4
#define AV_CPU_FLAG_SSE4
Penryn SSE4.1 functions.
Definition: cpu.h:47
av_mallocz
#define av_mallocz(s)
Definition: tableprint_vlc.h:31
SwsOpBackend
Definition: ops_internal.h:103
SwsOpChain
Compiled "chain" of operations, which can be dispatched efficiently.
Definition: ops_chain.h:80
ff_sws_op_compile_tables
int ff_sws_op_compile_tables(const SwsOpTable *const tables[], int num_tables, SwsOpList *ops, const int block_size, SwsOpChain *chain)
"Compile" a single op by looking it up in a list of fixed size op tables.
Definition: ops_chain.c:196
AVRational
Rational number (pair of numerator and denominator).
Definition: rational.h:58
solve_shuffle
static int solve_shuffle(const SwsOpList *ops, int mmsize, SwsCompiledOp *out)
Definition: ops.c:563
c
Undefined Behavior In the C some operations are like signed integer dereferencing freed accessing outside allocated Undefined Behavior must not occur in a C it is not safe even if the output of undefined operations is unused The unsafety may seem nit picking but Optimizing compilers have in fact optimized code on the assumption that no undefined Behavior occurs Optimizing code based on wrong assumptions can and has in some cases lead to effects beyond the output of computations The signed integer overflow problem in speed critical code Code which is highly optimized and works with signed integers sometimes has the problem that often the output of the computation does not c
Definition: undefined.txt:32
AV_CPU_FLAG_AVX2
#define AV_CPU_FLAG_AVX2
AVX2 functions: requires OS support even if YMM registers aren't used.
Definition: cpu.h:55
SwsOpPriv::u8
uint8_t u8[16]
Definition: ops_chain.h:47
size
int size
Definition: twinvq_data.h:10344
SWS_OP_WRITE
@ SWS_OP_WRITE
Definition: ops.h:48
SwsOpPriv::u16
uint16_t u16[8]
Definition: ops_chain.h:48
offset
it s the only field you need to keep assuming you have a context There is some magic you don t need to care about around this just let it vf offset
Definition: writing_filters.txt:86
ff_sws_op_chain_free_cb
void ff_sws_op_chain_free_cb(void *ptr)
Definition: ops_chain.c:34
i
#define i(width, name, range_min, range_max)
Definition: cbs_h2645.c:256
compile
static int compile(SwsContext *ctx, SwsOpList *ops, SwsCompiledOp *out)
Definition: ops.c:652
SwsOpChain::free
void(* free[SWS_MAX_OPS+1])(void *)
Definition: ops_chain.h:83
setup_swap_bytes
static int setup_swap_bytes(const SwsOp *op, SwsOpPriv *out)
Definition: ops.c:86
ff_sws_op_chain_free
static void ff_sws_op_chain_free(SwsOpChain *chain)
Definition: ops_chain.h:90
SwsOpList::ops
SwsOp * ops
Definition: ops.h:210
op_is_type_invariant
static bool op_is_type_invariant(const SwsOp *op)
Returns true if the operation's implementation only depends on the block size, and not the underlying...
Definition: ops.c:549
ff_sws_setup_q4
int ff_sws_setup_q4(const SwsOp *op, SwsOpPriv *out)
Definition: ops_chain.c:279
av_assert1
#define av_assert1(cond)
assert() equivalent, that does not lie in speed critical code.
Definition: avassert.h:57
FFMIN
#define FFMIN(a, b)
Definition: macros.h:49
SwsOp
Definition: ops.h:179
write_bytes
static void write_bytes(const float *src, uint8_t *dst, int src_stride, int dst_stride, int width, int height, int depth, float scale)
Definition: vf_nnedi.c:484
ret
ret
Definition: filter_design.txt:187
SwsCompiledOp
Definition: ops_internal.h:90
U
#define U(x)
Definition: vpx_arith.h:37
ASSIGN_SHUFFLE_FUNC
#define ASSIGN_SHUFFLE_FUNC(IN, OUT, EXT)
AVRational::den
int den
Denominator.
Definition: rational.h:60
SwsReadWriteOp::packed
bool packed
Definition: ops.h:98
ff_sws_solve_shuffle
int ff_sws_solve_shuffle(const SwsOpList *ops, uint8_t shuffle[], int size, uint8_t clear_val, int *read_bytes, int *write_bytes)
"Solve" an op list into a fixed shuffle mask, with an optional ability to also directly clear the out...
Definition: ops_optimizer.c:843
setup_shift
static int setup_shift(const SwsOp *op, SwsOpPriv *out)
Definition: ops.c:149
SwsReadWriteOp::elems
uint8_t elems
Definition: ops.h:96
mem.h
setup_clear
static int setup_clear(const SwsOp *op, SwsOpPriv *out)
Definition: ops.c:116
av_free
#define av_free(p)
Definition: tableprint_vlc.h:34
DECL_FUNCS_16
#define DECL_FUNCS_16(SIZE, EXT, FLAG)
Definition: ops.c:386
stride
#define stride
Definition: h264pred_template.c:536
SwsOpList
Helper struct for representing a list of operations.
Definition: ops.h:209
DECL_FUNCS_8
#define DECL_FUNCS_8(SIZE, EXT, FLAG)
Definition: ops.c:258
SwsContext
Main external API structure.
Definition: swscale.h:189
SwsOpPriv
Copyright (C) 2025 Niklas Haas.
Definition: ops_chain.h:42
shuffle
static uint64_t shuffle(uint64_t in, const uint8_t *shuffle, int shuffle_len)
Definition: des.c:179
read
static uint32_t BS_FUNC() read(BSCTX *bc, unsigned int n)
Return n bits from the buffer, n has to be in the 0-32 range.
Definition: bitstream_template.h:239