FFmpeg
vvcdsp_init.c
Go to the documentation of this file.
1 /*
2  * VVC DSP init for x86
3  *
4  * Copyright (C) 2022-2024 Nuo Mi
5  * Copyright (c) 2023-2024 Wu Jianhua
6  *
7  * This file is part of FFmpeg.
8  *
9  * FFmpeg is free software; you can redistribute it and/or
10  * modify it under the terms of the GNU Lesser General Public
11  * License as published by the Free Software Foundation; either
12  * version 2.1 of the License, or (at your option) any later version.
13  *
14  * FFmpeg is distributed in the hope that it will be useful,
15  * but WITHOUT ANY WARRANTY; without even the implied warranty of
16  * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
17  * Lesser General Public License for more details.
18  *
19  * You should have received a copy of the GNU Lesser General Public
20  * License along with FFmpeg; if not, write to the Free Software
21  * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
22  */
23 
24 #include "config.h"
25 
26 #include "libavutil/cpu.h"
27 #include "libavutil/x86/cpu.h"
28 #include "libavcodec/vvc/dec.h"
29 #include "libavcodec/vvc/ctu.h"
30 #include "libavcodec/vvc/dsp.h"
32 
/*
 * Prototypes for the x86 "put" entry points.
 *
 * Every symbol is spelled ff_vvc_put_<kernel><size>_<depth>_<isa> and all of
 * them share one signature; the macros below only enumerate the names.
 */
#define PUT_PROTOTYPE(kernel, bits, isa)                                      \
    void ff_vvc_put_##kernel##_##bits##_##isa(int16_t *dst,                   \
                                              const uint8_t *src,             \
                                              ptrdiff_t srcstride,            \
                                              int height,                     \
                                              const int8_t *hf,               \
                                              const int8_t *vf,               \
                                              int width);

/* One prototype for each size suffix, 2 through 128. */
#define PUT_PROTOTYPES(kernel, bits, isa)                                     \
    PUT_PROTOTYPE(kernel##2,   bits, isa)                                     \
    PUT_PROTOTYPE(kernel##4,   bits, isa)                                     \
    PUT_PROTOTYPE(kernel##8,   bits, isa)                                     \
    PUT_PROTOTYPE(kernel##12,  bits, isa)                                     \
    PUT_PROTOTYPE(kernel##16,  bits, isa)                                     \
    PUT_PROTOTYPE(kernel##24,  bits, isa)                                     \
    PUT_PROTOTYPE(kernel##32,  bits, isa)                                     \
    PUT_PROTOTYPE(kernel##48,  bits, isa)                                     \
    PUT_PROTOTYPE(kernel##64,  bits, isa)                                     \
    PUT_PROTOTYPE(kernel##128, bits, isa)

/* Repeat the size list for the 8-, 10- and 12-bit variants. */
#define PUT_BPC_PROTOTYPES(kernel, isa)                                       \
    PUT_PROTOTYPES(kernel,  8, isa)                                           \
    PUT_PROTOTYPES(kernel, 10, isa)                                           \
    PUT_PROTOTYPES(kernel, 12, isa)

/* Horizontal, vertical and combined (hv) kernels for a given tap count. */
#define PUT_TAP_PROTOTYPES(taps, isa)                                         \
    PUT_BPC_PROTOTYPES(taps##tap_h,  isa)                                     \
    PUT_BPC_PROTOTYPES(taps##tap_v,  isa)                                     \
    PUT_BPC_PROTOTYPES(taps##tap_hv, isa)

/* Unfiltered copies. */
PUT_BPC_PROTOTYPES(pixels, sse4)
PUT_BPC_PROTOTYPES(pixels, avx2)

/* 4-tap and 8-tap filters, for both instruction sets. */
PUT_TAP_PROTOTYPES(4, sse4)
PUT_TAP_PROTOTYPES(8, sse4)
PUT_TAP_PROTOTYPES(4, avx2)
PUT_TAP_PROTOTYPES(8, avx2)
65 
/*
 * Symbol-name builders used throughout this file:
 *   bf(fn, 10, avx2) -> fn_10_avx2      (one symbol per bit depth)
 *   BF(fn, 16, avx2) -> fn_16bpc_avx2   (one symbol per "bpc" value)
 */
#define bf(sym, depth, isa)       sym##_##depth##_##isa
#define BF(sym, sample_bits, isa) sym##_##sample_bits##bpc_##isa

/*
 * "bpc" averaging entry points: integer arguments are intptr_t and the
 * maximum pixel value is passed explicitly as the last argument.
 */
#define AVG_BPC_PROTOTYPES(sample_bits, isa)                                  \
    void BF(ff_vvc_avg, sample_bits, isa)(uint8_t *dst,                       \
                                          ptrdiff_t dst_stride,               \
                                          const int16_t *src0,                \
                                          const int16_t *src1,                \
                                          intptr_t width, intptr_t height,    \
                                          intptr_t pixel_max);                \
    void BF(ff_vvc_w_avg, sample_bits, isa)(uint8_t *dst,                     \
                                            ptrdiff_t dst_stride,             \
                                            const int16_t *src0,              \
                                            const int16_t *src1,              \
                                            intptr_t width, intptr_t height,  \
                                            intptr_t denom,                   \
                                            intptr_t w0, intptr_t w1,         \
                                            intptr_t o0, intptr_t o1,         \
                                            intptr_t pixel_max);

/*
 * Per-bit-depth averaging entry points: plain int arguments and no
 * pixel_max parameter.
 */
#define AVG_PROTOTYPES(depth, isa)                                            \
    void bf(ff_vvc_avg, depth, isa)(uint8_t *dst, ptrdiff_t dst_stride,       \
                                    const int16_t *src0,                      \
                                    const int16_t *src1,                      \
                                    int width, int height);                   \
    void bf(ff_vvc_w_avg, depth, isa)(uint8_t *dst, ptrdiff_t dst_stride,     \
                                      const int16_t *src0,                    \
                                      const int16_t *src1,                    \
                                      int width, int height,                  \
                                      int denom, int w0, int w1,              \
                                      int o0, int o1);

AVG_BPC_PROTOTYPES( 8, avx2)
AVG_BPC_PROTOTYPES(16, avx2)

AVG_PROTOTYPES( 8, avx2)
AVG_PROTOTYPES(10, avx2)
AVG_PROTOTYPES(12, avx2)
89 
90 #define ALF_BPC_PROTOTYPES(bpc, opt) \
91 void BF(ff_vvc_alf_filter_luma, bpc, opt)(uint8_t *dst, ptrdiff_t dst_stride, \
92  const uint8_t *src, ptrdiff_t src_stride, ptrdiff_t width, ptrdiff_t height, \
93  const int16_t *filter, const int16_t *clip, ptrdiff_t stride, ptrdiff_t vb_pos, ptrdiff_t pixel_max); \
94 void BF(ff_vvc_alf_filter_chroma, bpc, opt)(uint8_t *dst, ptrdiff_t dst_stride, \
95  const uint8_t *src, ptrdiff_t src_stride, ptrdiff_t width, ptrdiff_t height, \
96  const int16_t *filter, const int16_t *clip, ptrdiff_t stride, ptrdiff_t vb_pos, ptrdiff_t pixel_max); \
97 void BF(ff_vvc_alf_classify_grad, bpc, opt)(int *gradient_sum, \
98  const uint8_t *src, ptrdiff_t src_stride, intptr_t width, intptr_t height, intptr_t vb_pos); \
99 void BF(ff_vvc_alf_classify, bpc, opt)(int *class_idx, int *transpose_idx, const int *gradient_sum, \
100  intptr_t width, intptr_t height, intptr_t vb_pos, intptr_t bit_depth); \
101 
102 #define ALF_PROTOTYPES(bpc, bd, opt) \
103 void bf(ff_vvc_alf_filter_luma, bd, opt)(uint8_t *dst, ptrdiff_t dst_stride, const uint8_t *src, ptrdiff_t src_stride, \
104  int width, int height, const int16_t *filter, const int16_t *clip, const int vb_pos); \
105 void bf(ff_vvc_alf_filter_chroma, bd, opt)(uint8_t *dst, ptrdiff_t dst_stride, const uint8_t *src, ptrdiff_t src_stride, \
106  int width, int height, const int16_t *filter, const int16_t *clip, const int vb_pos); \
107 void bf(ff_vvc_alf_classify, bd, opt)(int *class_idx, int *transpose_idx, \
108  const uint8_t *src, ptrdiff_t src_stride, int width, int height, int vb_pos, int *gradient_tmp); \
109 
110 ALF_BPC_PROTOTYPES(8, avx2)
111 ALF_BPC_PROTOTYPES(16, avx2)
112 
113 ALF_PROTOTYPES(8, 8, avx2)
114 ALF_PROTOTYPES(16, 10, avx2)
115 ALF_PROTOTYPES(16, 12, avx2)
116 
117 #if ARCH_X86_64
118 #if HAVE_SSE4_EXTERNAL
119 #define FW_PUT(name, depth, opt) \
120 void ff_vvc_put_ ## name ## _ ## depth ## _##opt(int16_t *dst, const uint8_t *src, ptrdiff_t srcstride, \
121  int height, const int8_t *hf, const int8_t *vf, int width) \
122 { \
123  ff_h2656_put_## name ## _ ## depth ## _##opt(dst, 2 * MAX_PB_SIZE, src, srcstride, height, hf, vf, width); \
124 }
125 
126 #define FW_PUT_TAP(fname, bitd, opt ) \
127  FW_PUT(fname##4, bitd, opt ) \
128  FW_PUT(fname##8, bitd, opt ) \
129  FW_PUT(fname##16, bitd, opt ) \
130  FW_PUT(fname##32, bitd, opt ) \
131  FW_PUT(fname##64, bitd, opt ) \
132  FW_PUT(fname##128, bitd, opt ) \
133 
134 #define FW_PUT_4TAP(fname, bitd, opt) \
135  FW_PUT(fname ## 2, bitd, opt) \
136  FW_PUT_TAP(fname, bitd, opt)
137 
138 #define FW_PUT_4TAP_SSE4(bitd) \
139  FW_PUT_4TAP(pixels, bitd, sse4) \
140  FW_PUT_4TAP(4tap_h, bitd, sse4) \
141  FW_PUT_4TAP(4tap_v, bitd, sse4) \
142  FW_PUT_4TAP(4tap_hv, bitd, sse4)
143 
144 #define FW_PUT_8TAP_SSE4(bitd) \
145  FW_PUT_TAP(8tap_h, bitd, sse4) \
146  FW_PUT_TAP(8tap_v, bitd, sse4) \
147  FW_PUT_TAP(8tap_hv, bitd, sse4)
148 
149 #define FW_PUT_SSE4(bitd) \
150  FW_PUT_4TAP_SSE4(bitd) \
151  FW_PUT_8TAP_SSE4(bitd)
152 
153 FW_PUT_SSE4( 8)
154 FW_PUT_SSE4(10)
155 FW_PUT_SSE4(12)
156 #endif
157 
158 #if HAVE_AVX2_EXTERNAL
159 #define FW_PUT_TAP_AVX2(n, bitd) \
160  FW_PUT(n ## tap_h32, bitd, avx2) \
161  FW_PUT(n ## tap_h64, bitd, avx2) \
162  FW_PUT(n ## tap_h128, bitd, avx2) \
163  FW_PUT(n ## tap_v32, bitd, avx2) \
164  FW_PUT(n ## tap_v64, bitd, avx2) \
165  FW_PUT(n ## tap_v128, bitd, avx2)
166 
167 #define FW_PUT_AVX2(bitd) \
168  FW_PUT(pixels32, bitd, avx2) \
169  FW_PUT(pixels64, bitd, avx2) \
170  FW_PUT(pixels128, bitd, avx2) \
171  FW_PUT_TAP_AVX2(4, bitd) \
172  FW_PUT_TAP_AVX2(8, bitd) \
173 
174 FW_PUT_AVX2( 8)
175 FW_PUT_AVX2(10)
176 FW_PUT_AVX2(12)
177 
178 #define FW_PUT_TAP_16BPC_AVX2(n, bitd) \
179  FW_PUT(n ## tap_h16, bitd, avx2) \
180  FW_PUT(n ## tap_v16, bitd, avx2) \
181  FW_PUT(n ## tap_hv16, bitd, avx2) \
182  FW_PUT(n ## tap_hv32, bitd, avx2) \
183  FW_PUT(n ## tap_hv64, bitd, avx2) \
184  FW_PUT(n ## tap_hv128, bitd, avx2)
185 
186 #define FW_PUT_16BPC_AVX2(bitd) \
187  FW_PUT(pixels16, bitd, avx2) \
188  FW_PUT_TAP_16BPC_AVX2(4, bitd) \
189  FW_PUT_TAP_16BPC_AVX2(8, bitd)
190 
191 FW_PUT_16BPC_AVX2(10)
192 FW_PUT_16BPC_AVX2(12)
193 
194 #define AVG_FUNCS(bpc, bd, opt) \
195 void bf(ff_vvc_avg, bd, opt)(uint8_t *dst, ptrdiff_t dst_stride, \
196  const int16_t *src0, const int16_t *src1, int width, int height) \
197 { \
198  BF(ff_vvc_avg, bpc, opt)(dst, dst_stride, src0, src1, width, height, (1 << bd) - 1); \
199 } \
200 void bf(ff_vvc_w_avg, bd, opt)(uint8_t *dst, ptrdiff_t dst_stride, \
201  const int16_t *src0, const int16_t *src1, int width, int height, \
202  int denom, int w0, int w1, int o0, int o1) \
203 { \
204  BF(ff_vvc_w_avg, bpc, opt)(dst, dst_stride, src0, src1, width, height, \
205  denom, w0, w1, o0, o1, (1 << bd) - 1); \
206 }
207 
208 AVG_FUNCS(8, 8, avx2)
209 AVG_FUNCS(16, 10, avx2)
210 AVG_FUNCS(16, 12, avx2)
211 
212 #define ALF_FUNCS(bpc, bd, opt) \
213 void bf(ff_vvc_alf_filter_luma, bd, opt)(uint8_t *dst, ptrdiff_t dst_stride, const uint8_t *src, ptrdiff_t src_stride, \
214  int width, int height, const int16_t *filter, const int16_t *clip, const int vb_pos) \
215 { \
216  const int param_stride = (width >> 2) * ALF_NUM_COEFF_LUMA; \
217  BF(ff_vvc_alf_filter_luma, bpc, opt)(dst, dst_stride, src, src_stride, width, height, \
218  filter, clip, param_stride, vb_pos, (1 << bd) - 1); \
219 } \
220 void bf(ff_vvc_alf_filter_chroma, bd, opt)(uint8_t *dst, ptrdiff_t dst_stride, const uint8_t *src, ptrdiff_t src_stride, \
221  int width, int height, const int16_t *filter, const int16_t *clip, const int vb_pos) \
222 { \
223  BF(ff_vvc_alf_filter_chroma, bpc, opt)(dst, dst_stride, src, src_stride, width, height, \
224  filter, clip, 0, vb_pos,(1 << bd) - 1); \
225 } \
226 void bf(ff_vvc_alf_classify, bd, opt)(int *class_idx, int *transpose_idx, \
227  const uint8_t *src, ptrdiff_t src_stride, int width, int height, int vb_pos, int *gradient_tmp) \
228 { \
229  BF(ff_vvc_alf_classify_grad, bpc, opt)(gradient_tmp, src, src_stride, width, height, vb_pos); \
230  BF(ff_vvc_alf_classify, bpc, opt)(class_idx, transpose_idx, gradient_tmp, width, height, vb_pos, bd); \
231 } \
232 
233 ALF_FUNCS(8, 8, avx2)
234 ALF_FUNCS(16, 10, avx2)
235 ALF_FUNCS(16, 12, avx2)
236 
237 #endif
238 
/*
 * Store one kernel pair into the function tables.
 *
 * table[comp][size_idx][my][mx] receives the VVC-named wrapper, and the
 * sibling array "<table>_uni" at the same indices receives the matching
 * ff_h2656_put_uni_* function.  Expands to two complete statements.
 */
#define PEL_LINK(table, comp, size_idx, my, mx, kernel, bits, isa)            \
    table[comp][size_idx][my][mx] =                                           \
        ff_vvc_put_##kernel##_##bits##_##isa;                                 \
    table##_uni[comp][size_idx][my][mx] =                                     \
        ff_h2656_put_uni_##kernel##_##bits##_##isa;

/* Size suffixes 4..128 occupy table slots 1..6. */
#define MC_TAP_LINKS(table, comp, my, mx, kernel, bits, isa)                  \
    PEL_LINK(table, comp, 1, my, mx, kernel##4,   bits, isa);                 \
    PEL_LINK(table, comp, 2, my, mx, kernel##8,   bits, isa);                 \
    PEL_LINK(table, comp, 3, my, mx, kernel##16,  bits, isa);                 \
    PEL_LINK(table, comp, 4, my, mx, kernel##32,  bits, isa);                 \
    PEL_LINK(table, comp, 5, my, mx, kernel##64,  bits, isa);                 \
    PEL_LINK(table, comp, 6, my, mx, kernel##128, bits, isa);

/* The 8-tap kernels are installed in the LUMA tables. */
#define MC_8TAP_LINKS(table, my, mx, kernel, bits, isa)                       \
    MC_TAP_LINKS(table, LUMA, my, mx, kernel, bits, isa)

#define MC_8TAP_LINKS_SSE4(bits)                                              \
    MC_8TAP_LINKS(c->inter.put, 0, 0, pixels,  bits, sse4);                   \
    MC_8TAP_LINKS(c->inter.put, 0, 1, 8tap_h,  bits, sse4);                   \
    MC_8TAP_LINKS(c->inter.put, 1, 0, 8tap_v,  bits, sse4);                   \
    MC_8TAP_LINKS(c->inter.put, 1, 1, 8tap_hv, bits, sse4)

/* The 4-tap kernels go into the CHROMA tables, with size 2 in slot 0. */
#define MC_4TAP_LINKS(table, my, mx, kernel, bits, isa)                       \
    PEL_LINK(table, CHROMA, 0, my, mx, kernel##2, bits, isa);                 \
    MC_TAP_LINKS(table, CHROMA, my, mx, kernel, bits, isa)

#define MC_4TAP_LINKS_SSE4(bits)                                              \
    MC_4TAP_LINKS(c->inter.put, 0, 0, pixels,  bits, sse4);                   \
    MC_4TAP_LINKS(c->inter.put, 0, 1, 4tap_h,  bits, sse4);                   \
    MC_4TAP_LINKS(c->inter.put, 1, 0, 4tap_v,  bits, sse4);                   \
    MC_4TAP_LINKS(c->inter.put, 1, 1, 4tap_hv, bits, sse4)

/* All SSE4 links for one bit depth; expects a context pointer named c. */
#define MC_LINK_SSE4(bits)                                                    \
    MC_4TAP_LINKS_SSE4(bits)                                                  \
    MC_8TAP_LINKS_SSE4(bits)
273 
/*
 * AVX2 links that exist for every bit depth: the copy, horizontal and
 * vertical kernels for table slots 4, 5 and 6 (sizes 32, 64, 128).
 * Expects a context pointer named c in the expanding scope.
 */
#define MC_TAP_LINKS_AVX2(comp, taps, bits) do {                              \
    PEL_LINK(c->inter.put, comp, 4, 0, 0, pixels32,        bits, avx2)        \
    PEL_LINK(c->inter.put, comp, 4, 0, 1, taps##tap_h32,   bits, avx2)        \
    PEL_LINK(c->inter.put, comp, 4, 1, 0, taps##tap_v32,   bits, avx2)        \
    PEL_LINK(c->inter.put, comp, 5, 0, 0, pixels64,        bits, avx2)        \
    PEL_LINK(c->inter.put, comp, 5, 0, 1, taps##tap_h64,   bits, avx2)        \
    PEL_LINK(c->inter.put, comp, 5, 1, 0, taps##tap_v64,   bits, avx2)        \
    PEL_LINK(c->inter.put, comp, 6, 0, 0, pixels128,       bits, avx2)        \
    PEL_LINK(c->inter.put, comp, 6, 0, 1, taps##tap_h128,  bits, avx2)        \
    PEL_LINK(c->inter.put, comp, 6, 1, 0, taps##tap_v128,  bits, avx2)        \
} while (0)

/* 8-tap kernels for LUMA, 4-tap kernels for CHROMA. */
#define MC_LINKS_AVX2(bits)                                                   \
    MC_TAP_LINKS_AVX2(LUMA,   8, bits);                                       \
    MC_TAP_LINKS_AVX2(CHROMA, 4, bits);

/*
 * AVX2 links used only for 10 and 12 bits: every slot-3 (size 16) kernel
 * and the hv kernels for slots 4, 5 and 6.
 */
#define MC_TAP_LINKS_16BPC_AVX2(comp, taps, bits) do {                        \
    PEL_LINK(c->inter.put, comp, 3, 0, 0, pixels16,        bits, avx2)        \
    PEL_LINK(c->inter.put, comp, 3, 0, 1, taps##tap_h16,   bits, avx2)        \
    PEL_LINK(c->inter.put, comp, 3, 1, 0, taps##tap_v16,   bits, avx2)        \
    PEL_LINK(c->inter.put, comp, 3, 1, 1, taps##tap_hv16,  bits, avx2)        \
    PEL_LINK(c->inter.put, comp, 4, 1, 1, taps##tap_hv32,  bits, avx2)        \
    PEL_LINK(c->inter.put, comp, 5, 1, 1, taps##tap_hv64,  bits, avx2)        \
    PEL_LINK(c->inter.put, comp, 6, 1, 1, taps##tap_hv128, bits, avx2)        \
} while (0)

#define MC_LINKS_16BPC_AVX2(bits)                                             \
    MC_TAP_LINKS_16BPC_AVX2(LUMA,   8, bits);                                 \
    MC_TAP_LINKS_16BPC_AVX2(CHROMA, 4, bits);
303 
/*
 * Point the averaging callbacks of the context at the given variant.
 * Expects a context pointer named c in the expanding scope.
 */
#define AVG_INIT(depth, isa) do {                                             \
    c->inter.avg   = bf(ff_vvc_avg,   depth, isa);                            \
    c->inter.w_avg = bf(ff_vvc_w_avg, depth, isa);                            \
} while (0)

/*
 * Point the ALF callbacks (luma filter, chroma filter, classifier) at the
 * AVX2 variant for the given bit depth.  Expects a context pointer named c.
 */
#define ALF_INIT(depth) do {                                                  \
    c->alf.filter[LUMA]   = ff_vvc_alf_filter_luma_##depth##_avx2;            \
    c->alf.filter[CHROMA] = ff_vvc_alf_filter_chroma_##depth##_avx2;          \
    c->alf.classify       = ff_vvc_alf_classify_##depth##_avx2;               \
} while (0)
314 #endif
315 
316 void ff_vvc_dsp_init_x86(VVCDSPContext *const c, const int bd)
317 {
318 #if ARCH_X86_64
319  const int cpu_flags = av_get_cpu_flags();
320 
321  switch (bd) {
322  case 8:
323  if (EXTERNAL_SSE4(cpu_flags)) {
324  MC_LINK_SSE4(8);
325  }
327  ALF_INIT(8);
328  AVG_INIT(8, avx2);
329  MC_LINKS_AVX2(8);
330  }
331  break;
332  case 10:
333  if (EXTERNAL_SSE4(cpu_flags)) {
334  MC_LINK_SSE4(10);
335  }
337  ALF_INIT(10);
338  AVG_INIT(10, avx2);
339  MC_LINKS_AVX2(10);
340  MC_LINKS_16BPC_AVX2(10);
341  }
342  break;
343  case 12:
344  if (EXTERNAL_SSE4(cpu_flags)) {
345  MC_LINK_SSE4(12);
346  }
348  ALF_INIT(12);
349  AVG_INIT(12, avx2);
350  MC_LINKS_AVX2(12);
351  MC_LINKS_16BPC_AVX2(12);
352  }
353  break;
354  default:
355  break;
356  }
357 #endif
358 }
ff_vvc_dsp_init_x86
void ff_vvc_dsp_init_x86(VVCDSPContext *const c, const int bd)
Definition: vvcdsp_init.c:316
cpu.h
ALF_PROTOTYPES
#define ALF_PROTOTYPES(bpc, bd, opt)
Definition: vvcdsp_init.c:102
dsp.h
EXTERNAL_AVX2_FAST
#define EXTERNAL_AVX2_FAST(flags)
Definition: cpu.h:79
av_get_cpu_flags
int av_get_cpu_flags(void)
Return the flags which specify extensions supported by the CPU.
Definition: cpu.c:103
cpu_flags
static atomic_int cpu_flags
Definition: cpu.c:52
ALF_BPC_PROTOTYPES
#define ALF_BPC_PROTOTYPES(bpc, opt)
Definition: vvcdsp_init.c:90
AVG_PROTOTYPES
#define AVG_PROTOTYPES(bd, opt)
Definition: vvcdsp_init.c:76
c
Undefined Behavior In the C some operations are like signed integer dereferencing freed accessing outside allocated Undefined Behavior must not occur in a C it is not safe even if the output of undefined operations is unused The unsafety may seem nit picking but Optimizing compilers have in fact optimized code on the assumption that no undefined Behavior occurs Optimizing code based on wrong assumptions can and has in some cases lead to effects beyond the output of computations The signed integer overflow problem in speed critical code Code which is highly optimized and works with signed integers sometimes has the problem that often the output of the computation does not c
Definition: undefined.txt:32
h2656dsp.h
cpu.h
PUT_BPC_PROTOTYPES
#define PUT_BPC_PROTOTYPES(name, opt)
Definition: vvcdsp_init.c:48
EXTERNAL_SSE4
#define EXTERNAL_SSE4(flags)
Definition: cpu.h:68
AVG_BPC_PROTOTYPES
#define AVG_BPC_PROTOTYPES(bpc, opt)
Definition: vvcdsp_init.c:69
ctu.h
PUT_TAP_PROTOTYPES
#define PUT_TAP_PROTOTYPES(n, opt)
Definition: vvcdsp_init.c:53
dec.h
VVCDSPContext
Definition: dsp.h:158