FFmpeg
dsp_init.c
Go to the documentation of this file.
1 /*
2  * VVC filters DSP
3  *
4  * Copyright (C) 2024 Zhao Zhili
5  *
6  * This file is part of FFmpeg.
7  *
8  * FFmpeg is free software; you can redistribute it and/or
9  * modify it under the terms of the GNU Lesser General Public
10  * License as published by the Free Software Foundation; either
11  * version 2.1 of the License, or (at your option) any later version.
12  *
13  * FFmpeg is distributed in the hope that it will be useful,
14  * but WITHOUT ANY WARRANTY; without even the implied warranty of
15  * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
16  * Lesser General Public License for more details.
17  *
18  * You should have received a copy of the GNU Lesser General Public
19  * License along with FFmpeg; if not, write to the Free Software
20  * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
21  */
22 
23 #include "libavutil/cpu.h"
24 #include "libavutil/aarch64/cpu.h"
26 #include "libavcodec/vvc/dsp.h"
27 #include "libavcodec/vvc/dec.h"
28 #include "libavcodec/vvc/ctu.h"
29 
/*
 * Block-size constants (16 and 4).
 * NOTE(review): neither macro is referenced directly in this file; presumably
 * they are consumed by the of_template.c code included below - confirm there.
 */
30 #define BDOF_BLOCK_SIZE 16
31 #define BDOF_MIN_BLOCK_SIZE 4
32 
/*
 * Prototype of an external NEON routine; the definition is not in this file.
 *
 * From the signature alone: gradient_h and gradient_v are writable int16_t
 * buffers that share gradient_stride, and _src is a read-only int16_t buffer
 * with its own src_stride.
 * NOTE(review): not called directly in this file; presumably used by the
 * included of_template.c, with width/height giving the processed area -
 * confirm against the template and the assembly.
 */
void ff_vvc_prof_grad_filter_8x_neon(int16_t *gradient_h, int16_t *gradient_v,
                                     ptrdiff_t gradient_stride,
                                     const int16_t *_src, ptrdiff_t src_stride,
                                     int width, int height);
39 
/*
 * Prototype of an external NEON routine; the definition is not in this file.
 *
 * From the signature alone: _src0/_src1 are read-only int16_t buffers,
 * gradient_h/gradient_v are arrays of pointers to read-only int16_t data, and
 * vx/vy are writable int16_t outputs.
 * NOTE(review): not called directly in this file; presumably used by the
 * included of_template.c. The meaning of pad_mask is not visible here -
 * confirm against the template and the assembly.
 */
void ff_vvc_derive_bdof_vx_vy_neon(const int16_t *_src0,
                                   const int16_t *_src1,
                                   int pad_mask,
                                   const int16_t **gradient_h,
                                   const int16_t **gradient_v,
                                   int16_t *vx,
                                   int16_t *vy);
/*
 * Include the two template sources once per bit depth (8, 10 and 12).
 * BIT_DEPTH is defined before each pair of inclusions and undefined again
 * afterwards, so every pass sees exactly one value.
 * NOTE(review): apply_bdof_<bd> and alf_filter_{luma,chroma}_<bd>_neon, which
 * ff_vvc_dsp_init_aarch64() installs, are not defined in this file; presumably
 * these templates generate them - confirm in the template sources.
 */
45 #define BIT_DEPTH 8
46 #include "alf_template.c"
47 #include "of_template.c"
48 #undef BIT_DEPTH
49 
50 #define BIT_DEPTH 10
51 #include "alf_template.c"
52 #include "of_template.c"
53 #undef BIT_DEPTH
54 
55 #define BIT_DEPTH 12
56 #include "alf_template.c"
57 #include "of_template.c"
58 #undef BIT_DEPTH
59 
/*
 * Prototype of the external NEON SAD routine; the definition is not in this
 * file. ff_vvc_dsp_init_aarch64() installs it as c->inter.sad for every bit
 * depth.
 */
int ff_vvc_sad_neon(const int16_t *src0,
                    const int16_t *src1,
                    int dx,
                    int dy,
                    const int block_w,
                    const int block_h);
62 
/*
 * Prototypes of the external NEON averaging routines, one per bit depth
 * (8, 10, 12); the definitions are not in this file. All three share the same
 * signature, and ff_vvc_dsp_init_aarch64() installs the matching one as
 * c->inter.avg.
 */
void ff_vvc_avg_8_neon(uint8_t *dst, ptrdiff_t dst_stride,
                       const int16_t *src0, const int16_t *src1,
                       int width, int height);

void ff_vvc_avg_10_neon(uint8_t *dst, ptrdiff_t dst_stride,
                        const int16_t *src0, const int16_t *src1,
                        int width, int height);

void ff_vvc_avg_12_neon(uint8_t *dst, ptrdiff_t dst_stride,
                        const int16_t *src0, const int16_t *src1,
                        int width, int height);
72 
/*
 * Prototypes of the external NEON weighted-average routines, one per bit
 * depth (8, 10, 12); the definitions are not in this file.
 *
 * They are not installed into the DSP context directly: the vvc_w_avg_<bd>()
 * wrappers in this file call them, passing the two weights packed into
 * w0_w1 and the offset/shift pair packed into offset_shift.
 */
void ff_vvc_w_avg_8_neon(uint8_t *_dst, ptrdiff_t _dst_stride,
                         const int16_t *src0, const int16_t *src1,
                         int width, int height,
                         uintptr_t w0_w1, uintptr_t offset_shift);

void ff_vvc_w_avg_10_neon(uint8_t *_dst, ptrdiff_t _dst_stride,
                          const int16_t *src0, const int16_t *src1,
                          int width, int height,
                          uintptr_t w0_w1, uintptr_t offset_shift);

void ff_vvc_w_avg_12_neon(uint8_t *_dst, ptrdiff_t _dst_stride,
                          const int16_t *src0, const int16_t *src1,
                          int width, int height,
                          uintptr_t w0_w1, uintptr_t offset_shift);
/*
 * Apple platforms depart from the standard ARM64 ABI when function arguments
 * have to be passed on the stack. To keep the assembly interface portable,
 * the NEON routines use a signature that does not need stack arguments: each
 * vvc_w_avg_<bit_depth>() wrapper generated below accepts the full
 * eleven-argument form and hands eight arguments to the assembly.
 *
 * Values computed by the wrapper:
 *   shift        = denom + FFMAX(3, 15 - bit_depth)
 *   offset       = ((o0 + o1) * (1 << (bit_depth - 8)) + 1) * (1 << (shift - 1))
 *   w0_w1        = (w0     << 32) | (uint32_t)w1
 *   offset_shift = (offset << 32) | (uint32_t)shift
 *
 * bit_depth must be a plain literal (8, 10 or 12 here) because it is pasted
 * into the generated identifiers.
 */
#define W_AVG_FUN(bit_depth)                                                   \
static void vvc_w_avg_ ## bit_depth(uint8_t *dst, ptrdiff_t dst_stride,        \
                                    const int16_t *src0, const int16_t *src1,  \
                                    int width, int height, int denom,          \
                                    int w0, int w1, int o0, int o1)            \
{                                                                              \
    const int       sh        = denom + FFMAX(3, 15 - (bit_depth));            \
    const int       off       = ((o0 + o1) * (1 << ((bit_depth) - 8)) + 1) *   \
                                (1 << (sh - 1));                               \
    const uintptr_t packed_w  = ((uintptr_t)w0  << 32) | (uint32_t)w1;         \
    const uintptr_t packed_os = ((uintptr_t)off << 32) | (uint32_t)sh;         \
                                                                               \
    ff_vvc_w_avg_ ## bit_depth ## _neon(dst, dst_stride, src0, src1,           \
                                        width, height, packed_w, packed_os);   \
}

/* One wrapper per bit depth handled by ff_vvc_dsp_init_aarch64(). */
W_AVG_FUN(8)
W_AVG_FUN(10)
W_AVG_FUN(12)
105 
/*
 * Declare the external NEON routine ff_vvc_dmvr_<fn><bd>_neon.
 *
 * fn is a name infix (empty or "hv_" below) and bd the bit depth. The macro
 * body already ends in ';', so each use is written without a trailing
 * semicolon.
 */
#define DMVR_FUN(fn, bd)                                                       \
    void ff_vvc_dmvr_ ## fn ## bd ## _neon(int16_t *dst,                       \
                                           const uint8_t *_src,                \
                                           ptrdiff_t _src_stride,              \
                                           int height,                         \
                                           intptr_t mx, intptr_t my,           \
                                           int width);

/* Plain variants: 8 and 12 bit only (no plain 10-bit routine is declared). */
DMVR_FUN(, 8)
DMVR_FUN(, 12)
/* hv variants: 8, 10 and 12 bit. */
DMVR_FUN(hv_, 8)
DMVR_FUN(hv_, 10)
DMVR_FUN(hv_, 12)
116 
117 void ff_vvc_dsp_init_aarch64(VVCDSPContext *const c, const int bd)
118 {
119  int cpu_flags = av_get_cpu_flags();
120  if (!have_neon(cpu_flags))
121  return;
122 
123  if (bd == 8) {
124  c->inter.put[0][1][0][0] = ff_vvc_put_pel_pixels4_8_neon;
125  c->inter.put[0][2][0][0] = ff_vvc_put_pel_pixels8_8_neon;
126  c->inter.put[0][3][0][0] = ff_vvc_put_pel_pixels16_8_neon;
127  c->inter.put[0][4][0][0] = ff_vvc_put_pel_pixels32_8_neon;
128  c->inter.put[0][5][0][0] = ff_vvc_put_pel_pixels64_8_neon;
129  c->inter.put[0][6][0][0] = ff_vvc_put_pel_pixels128_8_neon;
130 
131  c->inter.put[0][1][0][1] = ff_vvc_put_qpel_h4_8_neon;
132  c->inter.put[0][2][0][1] = ff_vvc_put_qpel_h8_8_neon;
133  c->inter.put[0][3][0][1] = ff_vvc_put_qpel_h16_8_neon;
134  c->inter.put[0][4][0][1] =
135  c->inter.put[0][5][0][1] =
136  c->inter.put[0][6][0][1] = ff_vvc_put_qpel_h32_8_neon;
137 
138  c->inter.put[0][1][1][0] = ff_vvc_put_qpel_v4_8_neon;
139  c->inter.put[0][2][1][0] =
140  c->inter.put[0][3][1][0] =
141  c->inter.put[0][4][1][0] =
142  c->inter.put[0][5][1][0] =
143  c->inter.put[0][6][1][0] = ff_vvc_put_qpel_v8_8_neon;
144 
145  c->inter.put[0][1][1][1] = ff_vvc_put_qpel_hv4_8_neon;
146  c->inter.put[0][2][1][1] = ff_vvc_put_qpel_hv8_8_neon;
147  c->inter.put[0][3][1][1] = ff_vvc_put_qpel_hv16_8_neon;
148  c->inter.put[0][4][1][1] = ff_vvc_put_qpel_hv32_8_neon;
149  c->inter.put[0][5][1][1] = ff_vvc_put_qpel_hv64_8_neon;
150  c->inter.put[0][6][1][1] = ff_vvc_put_qpel_hv128_8_neon;
151 
152  c->inter.put[1][1][0][0] = ff_vvc_put_pel_pixels4_8_neon;
153  c->inter.put[1][2][0][0] = ff_vvc_put_pel_pixels8_8_neon;
154  c->inter.put[1][3][0][0] = ff_vvc_put_pel_pixels16_8_neon;
155  c->inter.put[1][4][0][0] = ff_vvc_put_pel_pixels32_8_neon;
156  c->inter.put[1][5][0][0] = ff_vvc_put_pel_pixels64_8_neon;
157  c->inter.put[1][6][0][0] = ff_vvc_put_pel_pixels128_8_neon;
158 
159  c->inter.put[1][1][0][1] = ff_vvc_put_epel_h4_8_neon;
160  c->inter.put[1][2][0][1] = ff_vvc_put_epel_h8_8_neon;
161  c->inter.put[1][3][0][1] = ff_vvc_put_epel_h16_8_neon;
162  c->inter.put[1][4][0][1] =
163  c->inter.put[1][5][0][1] =
164  c->inter.put[1][6][0][1] = ff_vvc_put_epel_h32_8_neon;
165 
166  c->inter.put[1][1][1][1] = ff_vvc_put_epel_hv4_8_neon;
167  c->inter.put[1][2][1][1] = ff_vvc_put_epel_hv8_8_neon;
168  c->inter.put[1][3][1][1] = ff_vvc_put_epel_hv16_8_neon;
169  c->inter.put[1][4][1][1] = ff_vvc_put_epel_hv32_8_neon;
170  c->inter.put[1][5][1][1] = ff_vvc_put_epel_hv64_8_neon;
171  c->inter.put[1][6][1][1] = ff_vvc_put_epel_hv128_8_neon;
172 
173  c->inter.put_uni[0][1][0][0] = ff_vvc_put_pel_uni_pixels4_8_neon;
174  c->inter.put_uni[0][2][0][0] = ff_vvc_put_pel_uni_pixels8_8_neon;
175  c->inter.put_uni[0][3][0][0] = ff_vvc_put_pel_uni_pixels16_8_neon;
176  c->inter.put_uni[0][4][0][0] = ff_vvc_put_pel_uni_pixels32_8_neon;
177  c->inter.put_uni[0][5][0][0] = ff_vvc_put_pel_uni_pixels64_8_neon;
178  c->inter.put_uni[0][6][0][0] = ff_vvc_put_pel_uni_pixels128_8_neon;
179 
180  c->inter.put_uni[0][1][0][1] = ff_vvc_put_qpel_uni_h4_8_neon;
181  c->inter.put_uni[0][2][0][1] = ff_vvc_put_qpel_uni_h8_8_neon;
182  c->inter.put_uni[0][3][0][1] = ff_vvc_put_qpel_uni_h16_8_neon;
183  c->inter.put_uni[0][4][0][1] =
184  c->inter.put_uni[0][5][0][1] =
185  c->inter.put_uni[0][6][0][1] = ff_vvc_put_qpel_uni_h32_8_neon;
186 
187  c->inter.put_uni_w[0][1][0][0] = ff_vvc_put_pel_uni_w_pixels4_8_neon;
188  c->inter.put_uni_w[0][2][0][0] = ff_vvc_put_pel_uni_w_pixels8_8_neon;
189  c->inter.put_uni_w[0][3][0][0] = ff_vvc_put_pel_uni_w_pixels16_8_neon;
190  c->inter.put_uni_w[0][4][0][0] = ff_vvc_put_pel_uni_w_pixels32_8_neon;
191  c->inter.put_uni_w[0][5][0][0] = ff_vvc_put_pel_uni_w_pixels64_8_neon;
192  c->inter.put_uni_w[0][6][0][0] = ff_vvc_put_pel_uni_w_pixels128_8_neon;
193 
194  c->inter.avg = ff_vvc_avg_8_neon;
195  c->inter.w_avg = vvc_w_avg_8;
196  c->inter.dmvr[0][0] = ff_vvc_dmvr_8_neon;
197  c->inter.dmvr[1][1] = ff_vvc_dmvr_hv_8_neon;
198  c->inter.apply_bdof = apply_bdof_8;
199 
200  for (int i = 0; i < FF_ARRAY_ELEMS(c->sao.band_filter); i++)
201  c->sao.band_filter[i] = ff_h26x_sao_band_filter_8x8_8_neon;
202  c->sao.edge_filter[0] = ff_vvc_sao_edge_filter_8x8_8_neon;
203  for (int i = 1; i < FF_ARRAY_ELEMS(c->sao.edge_filter); i++)
204  c->sao.edge_filter[i] = ff_vvc_sao_edge_filter_16x16_8_neon;
205  c->alf.filter[LUMA] = alf_filter_luma_8_neon;
206  c->alf.filter[CHROMA] = alf_filter_chroma_8_neon;
207 
208  if (have_i8mm(cpu_flags)) {
209  c->inter.put[0][1][0][1] = ff_vvc_put_qpel_h4_8_neon_i8mm;
210  c->inter.put[0][2][0][1] = ff_vvc_put_qpel_h8_8_neon_i8mm;
211  c->inter.put[0][3][0][1] = ff_vvc_put_qpel_h16_8_neon_i8mm;
212  c->inter.put[0][4][0][1] = ff_vvc_put_qpel_h32_8_neon_i8mm;
213  c->inter.put[0][5][0][1] = ff_vvc_put_qpel_h64_8_neon_i8mm;
214  c->inter.put[0][6][0][1] = ff_vvc_put_qpel_h128_8_neon_i8mm;
215 
216  c->inter.put[0][1][1][1] = ff_vvc_put_qpel_hv4_8_neon_i8mm;
217  c->inter.put[0][2][1][1] = ff_vvc_put_qpel_hv8_8_neon_i8mm;
218  c->inter.put[0][3][1][1] = ff_vvc_put_qpel_hv16_8_neon_i8mm;
219  c->inter.put[0][4][1][1] = ff_vvc_put_qpel_hv32_8_neon_i8mm;
220  c->inter.put[0][5][1][1] = ff_vvc_put_qpel_hv64_8_neon_i8mm;
221  c->inter.put[0][6][1][1] = ff_vvc_put_qpel_hv128_8_neon_i8mm;
222 
223  c->inter.put[1][1][0][1] = ff_vvc_put_epel_h4_8_neon_i8mm;
224  c->inter.put[1][2][0][1] = ff_vvc_put_epel_h8_8_neon_i8mm;
225  c->inter.put[1][3][0][1] = ff_vvc_put_epel_h16_8_neon_i8mm;
226  c->inter.put[1][4][0][1] = ff_vvc_put_epel_h32_8_neon_i8mm;
227  c->inter.put[1][5][0][1] = ff_vvc_put_epel_h64_8_neon_i8mm;
228  c->inter.put[1][6][0][1] = ff_vvc_put_epel_h128_8_neon_i8mm;
229 
230  c->inter.put[1][1][1][1] = ff_vvc_put_epel_hv4_8_neon_i8mm;
231  c->inter.put[1][2][1][1] = ff_vvc_put_epel_hv8_8_neon_i8mm;
232  c->inter.put[1][3][1][1] = ff_vvc_put_epel_hv16_8_neon_i8mm;
233  c->inter.put[1][4][1][1] = ff_vvc_put_epel_hv32_8_neon_i8mm;
234  c->inter.put[1][5][1][1] = ff_vvc_put_epel_hv64_8_neon_i8mm;
235  c->inter.put[1][6][1][1] = ff_vvc_put_epel_hv128_8_neon_i8mm;
236  }
237  } else if (bd == 10) {
238  c->inter.avg = ff_vvc_avg_10_neon;
239  c->inter.w_avg = vvc_w_avg_10;
240  c->inter.dmvr[1][1] = ff_vvc_dmvr_hv_10_neon;
241  c->inter.apply_bdof = apply_bdof_10;
242 
243  c->alf.filter[LUMA] = alf_filter_luma_10_neon;
244  c->alf.filter[CHROMA] = alf_filter_chroma_10_neon;
245  } else if (bd == 12) {
246  c->inter.avg = ff_vvc_avg_12_neon;
247  c->inter.w_avg = vvc_w_avg_12;
248  c->inter.dmvr[0][0] = ff_vvc_dmvr_12_neon;
249  c->inter.dmvr[1][1] = ff_vvc_dmvr_hv_12_neon;
250  c->inter.apply_bdof = apply_bdof_12;
251 
252  c->alf.filter[LUMA] = alf_filter_luma_12_neon;
253  c->alf.filter[CHROMA] = alf_filter_chroma_12_neon;
254  }
255 
256  c->inter.sad = ff_vvc_sad_neon;
257 }
_dst
uint8_t * _dst
Definition: dsp.h:52
LUMA
#define LUMA
Definition: filter.c:31
src1
const pixel * src1
Definition: h264pred_template.c:421
ff_vvc_put_qpel_v8_8_neon
void ff_vvc_put_qpel_v8_8_neon(int16_t *dst, const uint8_t *_src, ptrdiff_t _srcstride, int height, const int8_t *hf, const int8_t *vf, int width)
of_template.c
av_get_cpu_flags
int av_get_cpu_flags(void)
Return the flags which specify extensions supported by the CPU.
Definition: cpu.c:109
ff_vvc_dsp_init_aarch64
void ff_vvc_dsp_init_aarch64(VVCDSPContext *const c, const int bd)
Definition: dsp_init.c:117
cpu_flags
static atomic_int cpu_flags
Definition: cpu.c:56
_src
uint8_t ptrdiff_t const uint8_t * _src
Definition: dsp.h:52
dsp.h
W_AVG_FUN
#define W_AVG_FUN(bit_depth)
Definition: dsp_init.c:90
ff_vvc_w_avg_10_neon
void ff_vvc_w_avg_10_neon(uint8_t *_dst, ptrdiff_t _dst_stride, const int16_t *src0, const int16_t *src1, int width, int height, uintptr_t w0_w1, uintptr_t offset_shift)
ff_vvc_put_qpel_v4_8_neon
void ff_vvc_put_qpel_v4_8_neon(int16_t *dst, const uint8_t *_src, ptrdiff_t _srcstride, int height, const int8_t *hf, const int8_t *vf, int width)
FF_ARRAY_ELEMS
#define FF_ARRAY_ELEMS(a)
Definition: sinewin_tablegen.c:29
dsp.h
DMVR_FUN
#define DMVR_FUN(fn, bd)
Definition: dsp_init.c:106
if
if(ret)
Definition: filter_design.txt:179
ff_vvc_sao_edge_filter_8x8_8_neon
void ff_vvc_sao_edge_filter_8x8_8_neon(uint8_t *dst, const uint8_t *src, ptrdiff_t stride_dst, const int16_t *sao_offset_val, int eo, int width, int height)
ff_vvc_derive_bdof_vx_vy_neon
void ff_vvc_derive_bdof_vx_vy_neon(const int16_t *_src0, const int16_t *_src1, int pad_mask, const int16_t **gradient_h, const int16_t **gradient_v, int16_t *vx, int16_t *vy)
c
Undefined Behavior In the C some operations are like signed integer dereferencing freed accessing outside allocated Undefined Behavior must not occur in a C it is not safe even if the output of undefined operations is unused The unsafety may seem nit picking but Optimizing compilers have in fact optimized code on the assumption that no undefined Behavior occurs Optimizing code based on wrong assumptions can and has in some cases lead to effects beyond the output of computations The signed integer overflow problem in speed critical code Code which is highly optimized and works with signed integers sometimes has the problem that often the output of the computation does not c
Definition: undefined.txt:32
have_i8mm
#define have_i8mm(flags)
Definition: cpu.h:29
height
#define height
Definition: dsp.h:85
dst
uint8_t ptrdiff_t const uint8_t ptrdiff_t int intptr_t intptr_t int int16_t * dst
Definition: dsp.h:83
cpu.h
for
for(k=2;k<=8;++k)
Definition: h264pred_template.c:425
ff_vvc_avg_8_neon
void ff_vvc_avg_8_neon(uint8_t *dst, ptrdiff_t dst_stride, const int16_t *src0, const int16_t *src1, int width, int height)
have_neon
#define have_neon(flags)
Definition: cpu.h:26
ff_h26x_sao_band_filter_8x8_8_neon
void ff_h26x_sao_band_filter_8x8_8_neon(uint8_t *_dst, const uint8_t *_src, ptrdiff_t stride_dst, ptrdiff_t stride_src, const int16_t *sao_offset_val, int sao_left_class, int width, int height)
ff_vvc_avg_10_neon
void ff_vvc_avg_10_neon(uint8_t *dst, ptrdiff_t dst_stride, const int16_t *src0, const int16_t *src1, int width, int height)
i
#define i(width, name, range_min, range_max)
Definition: cbs_h2645.c:256
CHROMA
@ CHROMA
Definition: vf_waveform.c:49
ff_vvc_w_avg_8_neon
void ff_vvc_w_avg_8_neon(uint8_t *_dst, ptrdiff_t _dst_stride, const int16_t *src0, const int16_t *src1, int width, int height, uintptr_t w0_w1, uintptr_t offset_shift)
ff_vvc_w_avg_12_neon
void ff_vvc_w_avg_12_neon(uint8_t *_dst, ptrdiff_t _dst_stride, const int16_t *src0, const int16_t *src1, int width, int height, uintptr_t w0_w1, uintptr_t offset_shift)
src0
const pixel *const src0
Definition: h264pred_template.c:420
ff_vvc_avg_12_neon
void ff_vvc_avg_12_neon(uint8_t *dst, ptrdiff_t dst_stride, const int16_t *src0, const int16_t *src1, int width, int height)
ff_vvc_sao_edge_filter_16x16_8_neon
void ff_vvc_sao_edge_filter_16x16_8_neon(uint8_t *dst, const uint8_t *src, ptrdiff_t stride_dst, const int16_t *sao_offset_val, int eo, int width, int height)
alf_template.c
ff_vvc_sad_neon
int ff_vvc_sad_neon(const int16_t *src0, const int16_t *src1, int dx, int dy, const int block_w, const int block_h)
ctu.h
width
#define width
Definition: dsp.h:85
cpu.h
ff_vvc_prof_grad_filter_8x_neon
void ff_vvc_prof_grad_filter_8x_neon(int16_t *gradient_h, int16_t *gradient_v, ptrdiff_t gradient_stride, const int16_t *_src, ptrdiff_t src_stride, int width, int height)
dec.h
VVCDSPContext
Definition: dsp.h:169