FFmpeg
h264dsp_init.c
Go to the documentation of this file.
1 /*
2  * Copyright (c) 2004-2005 Michael Niedermayer, Loren Merritt
3  *
4  * This file is part of FFmpeg.
5  *
6  * FFmpeg is free software; you can redistribute it and/or
7  * modify it under the terms of the GNU Lesser General Public
8  * License as published by the Free Software Foundation; either
9  * version 2.1 of the License, or (at your option) any later version.
10  *
11  * FFmpeg is distributed in the hope that it will be useful,
12  * but WITHOUT ANY WARRANTY; without even the implied warranty of
13  * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
14  * Lesser General Public License for more details.
15  *
16  * You should have received a copy of the GNU Lesser General Public
17  * License along with FFmpeg; if not, write to the Free Software
18  * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
19  */
20 
21 #include "libavutil/attributes.h"
22 #include "libavutil/cpu.h"
23 #include "libavutil/x86/asm.h"
24 #include "libavutil/x86/cpu.h"
25 #include "libavcodec/h264dsp.h"
26 
27 /***********************************/
28 /* IDCT */
/*
 * Prototype generator for the single-block IDCT-and-add routines.
 *
 * IDCT_ADD_FUNC(VARIANT, BITS, ISA) expands to the declaration of
 *     void ff_h264_idct<VARIANT>_add_<BITS>_<ISA>(uint8_t *dst,
 *                                                 int16_t *block,
 *                                                 int stride);
 * VARIANT may be empty. Only prototypes are produced here; the routines
 * themselves are defined outside this file.
 */
#define IDCT_ADD_FUNC(VARIANT, BITS, ISA)                                  \
void ff_h264_idct ## VARIANT ## _add_ ## BITS ## _ ## ISA(uint8_t *dst,    \
                                                          int16_t *block,  \
                                                          int stride);

/* 8-bit variants */
IDCT_ADD_FUNC(,      8, sse2)
IDCT_ADD_FUNC(,      8, avx)
IDCT_ADD_FUNC(_dc,   8, sse2)
IDCT_ADD_FUNC(_dc,   8, avx)
IDCT_ADD_FUNC(8_dc,  8, mmxext)
IDCT_ADD_FUNC(8,     8, sse2)

/* 10-bit variants */
IDCT_ADD_FUNC(,     10, sse2)
IDCT_ADD_FUNC(,     10, avx)
IDCT_ADD_FUNC(_dc,  10, mmxext)
IDCT_ADD_FUNC(8_dc, 10, sse2)
IDCT_ADD_FUNC(8_dc, 10, avx)
IDCT_ADD_FUNC(8,    10, sse2)
IDCT_ADD_FUNC(8,    10, avx)
47 
48 
/*
 * Prototype generator for the multi-block IDCT-and-add routines that
 * write through a single destination pointer.
 *
 * IDCT_ADD_REP_FUNC(VARIANT, COUNT, BITS, ISA) declares
 *     void ff_h264_idct<VARIANT>_add<COUNT>_<BITS>_<ISA>(
 *              uint8_t *dst, const int *block_offset, int16_t *block,
 *              int stride, const uint8_t nnzc[5 * 8]);
 * VARIANT may be empty; COUNT is pasted verbatim (e.g. 4, 16, 16intra).
 */
#define IDCT_ADD_REP_FUNC(VARIANT, COUNT, BITS, ISA)                        \
void ff_h264_idct ## VARIANT ## _add ## COUNT ## _ ## BITS ## _ ## ISA      \
    (uint8_t *dst, const int *block_offset,                                 \
     int16_t *block, int stride, const uint8_t nnzc[5 * 8]);

/* 8-bit variants */
IDCT_ADD_REP_FUNC(8, 4,        8, sse2)
IDCT_ADD_REP_FUNC(,  16,       8, sse2)
IDCT_ADD_REP_FUNC(,  16intra,  8, sse2)

/* 10-bit variants */
IDCT_ADD_REP_FUNC(8, 4,       10, sse2)
IDCT_ADD_REP_FUNC(8, 4,       10, avx)
IDCT_ADD_REP_FUNC(,  16,      10, sse2)
IDCT_ADD_REP_FUNC(,  16,      10, avx)
IDCT_ADD_REP_FUNC(,  16intra, 10, sse2)
IDCT_ADD_REP_FUNC(,  16intra, 10, avx)
63 
64 
/*
 * Prototype generator for the multi-block IDCT-and-add routines that
 * take an array of destination pointers (uint8_t **dst).
 *
 * IDCT_ADD_REP_FUNC2(VARIANT, COUNT, BITS, ISA) declares
 *     void ff_h264_idct<VARIANT>_add<COUNT>_<BITS>_<ISA>(
 *              uint8_t **dst, const int *block_offset, int16_t *block,
 *              int stride, const uint8_t nnzc[15 * 8]);
 * Apart from the dst type and the nnzc array bound this mirrors
 * IDCT_ADD_REP_FUNC.
 */
#define IDCT_ADD_REP_FUNC2(VARIANT, COUNT, BITS, ISA)                       \
void ff_h264_idct ## VARIANT ## _add ## COUNT ## _ ## BITS ## _ ## ISA      \
    (uint8_t **dst, const int *block_offset,                                \
     int16_t *block, int stride, const uint8_t nnzc[15 * 8]);

/* "add8" variants */
IDCT_ADD_REP_FUNC2(, 8,      8, sse2)
IDCT_ADD_REP_FUNC2(, 8,     10, sse2)
IDCT_ADD_REP_FUNC2(, 8,     10, avx)

/* "add8_422" variants */
IDCT_ADD_REP_FUNC2(, 8_422,  8, mmx)
IDCT_ADD_REP_FUNC2(, 8_422, 10, sse2)
IDCT_ADD_REP_FUNC2(, 8_422, 10, avx)
78 
/*
 * External SSE2 routine; only its prototype lives in this file.
 * The init function below stores it in the luma_dc_dequant_idct slot
 * of the DSP context for 8-bit content when SSE2 is usable.
 */
void ff_h264_luma_dc_dequant_idct_sse2(int16_t *output,
                                       int16_t *input,
                                       int qmul);
80 
81 /***********************************/
82 /* deblocking */
83 
/*
 * External MMXEXT routine; only its prototype lives in this file.
 * The init function below stores it in the loop_filter_strength slot
 * of the DSP context when MMXEXT is usable and chroma_format_idc <= 1.
 */
void ff_h264_loop_filter_strength_mmxext(int16_t bS[2][4][4],
                                         uint8_t nnz[40],
                                         int8_t  ref[2][40],
                                         int16_t mv[2][40][2],
                                         int bidir,
                                         int edges,
                                         int step,
                                         int mask_mv0,
                                         int mask_mv1,
                                         int field);
89 
/*
 * Prototype generators for the external deblocking (loop) filters.
 *
 * LF_FUNC  declares ff_deblock_<DIR>_<TYPE>_<DEPTH>_<OPT> with a trailing
 *          int8_t *tc0 argument.
 * LF_IFUNC declares the same name pattern without tc0; it is used below
 *          for the *_intra filter types.
 */
#define LF_FUNC(DIR, TYPE, DEPTH, OPT)                                        \
void ff_deblock_ ## DIR ## _ ## TYPE ## _ ## DEPTH ## _ ## OPT(uint8_t *pix,  \
        ptrdiff_t stride, int alpha, int beta, int8_t *tc0);

#define LF_IFUNC(DIR, TYPE, DEPTH, OPT)                                       \
void ff_deblock_ ## DIR ## _ ## TYPE ## _ ## DEPTH ## _ ## OPT(uint8_t *pix,  \
        ptrdiff_t stride, int alpha, int beta);

/*
 * Declare the full set of luma/chroma/chroma422 filters (normal and
 * intra) for one bit depth, once for sse2 and once for avx.
 * The `type` argument is accepted but not used by the expansion.
 */
#define LF_FUNCS(type, depth)                      \
/* sse2: tc0-taking filters */                     \
LF_FUNC (h, luma,            depth, sse2)          \
LF_FUNC (v, luma,            depth, sse2)          \
LF_FUNC (h, chroma,          depth, sse2)          \
LF_FUNC (v, chroma,          depth, sse2)          \
LF_FUNC (h, chroma422,       depth, sse2)          \
/* sse2: intra filters */                          \
LF_IFUNC(h, luma_intra,      depth, sse2)          \
LF_IFUNC(v, luma_intra,      depth, sse2)          \
LF_IFUNC(h, chroma_intra,    depth, sse2)          \
LF_IFUNC(v, chroma_intra,    depth, sse2)          \
LF_IFUNC(h, chroma422_intra, depth, sse2)          \
/* avx: tc0-taking filters */                      \
LF_FUNC (h, luma,            depth, avx)           \
LF_FUNC (v, luma,            depth, avx)           \
LF_FUNC (h, chroma,          depth, avx)           \
LF_FUNC (v, chroma,          depth, avx)           \
LF_FUNC (h, chroma422,       depth, avx)           \
/* avx: intra filters */                           \
LF_IFUNC(h, luma_intra,      depth, avx)           \
LF_IFUNC(v, luma_intra,      depth, avx)           \
LF_IFUNC(h, chroma_intra,    depth, avx)           \
LF_IFUNC(v, chroma_intra,    depth, avx)           \
LF_IFUNC(h, chroma422_intra, depth, avx)

/* Full filter sets for both supported bit depths. */
LF_FUNCS(uint8_t,   8)
LF_FUNCS(uint16_t, 10)

/* The MBAFF horizontal luma filter is declared for 8-bit only. */
LF_FUNC(h, luma_mbaff, 8, sse2)
LF_FUNC(h, luma_mbaff, 8, avx)
129 
130 /***********************************/
131 /* weighted prediction */
132 
/*
 * Prototype generators for the 8-bit weighted-prediction routines.
 *
 * H264_WEIGHT(W, OPT)   declares ff_h264_weight_<W>_<OPT>.
 * H264_BIWEIGHT(W, OPT) declares ff_h264_biweight_<W>_<OPT>, which takes
 *                       an additional src pointer and two weights.
 */
#define H264_WEIGHT(W, OPT)                                             \
void ff_h264_weight_ ## W ## _ ## OPT(uint8_t *dst, ptrdiff_t stride,   \
                                      int height, int log2_denom,       \
                                      int weight, int offset);

#define H264_BIWEIGHT(W, OPT)                                           \
void ff_h264_biweight_ ## W ## _ ## OPT(uint8_t *dst, uint8_t *src,     \
                                        ptrdiff_t stride, int height,   \
                                        int log2_denom, int weightd,    \
                                        int weights, int offset);

/* Both mmxext prototypes (weight + biweight) for one block width. */
#define H264_BIWEIGHT_MMX(W)                                            \
    H264_WEIGHT(W, mmxext)                                              \
    H264_BIWEIGHT(W, mmxext)

/*
 * sse2 weight + biweight and ssse3 biweight prototypes for one block
 * width (no ssse3 weight routine is declared).
 */
#define H264_BIWEIGHT_SSE(W)                                            \
    H264_WEIGHT(W, sse2)                                                \
    H264_BIWEIGHT(W, sse2)                                              \
    H264_BIWEIGHT(W, ssse3)

/*
 * Instantiate the prototypes the init function assigns for 8-bit
 * content: width 4 via mmxext, widths 8 and 16 via sse2/ssse3.
 * Without these the referenced routines would be undeclared.
 */
H264_BIWEIGHT_MMX(4)
H264_BIWEIGHT_SSE(8)
H264_BIWEIGHT_SSE(16)
156 
/*
 * Prototype generators for the high-bit-depth weighted-prediction
 * routines. They mirror the 8-bit ones but carry the bit depth in the
 * symbol name: ff_h264_[bi]weight_<W>_<DEPTH>_<OPT>.
 */
#define H264_WEIGHT_10(W, DEPTH, OPT)                                   \
void ff_h264_weight_ ## W ## _ ## DEPTH ## _ ## OPT(uint8_t *dst,       \
                                                    ptrdiff_t stride,   \
                                                    int height,         \
                                                    int log2_denom,     \
                                                    int weight,         \
                                                    int offset);

#define H264_BIWEIGHT_10(W, DEPTH, OPT)                                 \
void ff_h264_biweight_ ## W ## _ ## DEPTH ## _ ## OPT(uint8_t *dst,     \
                                                      uint8_t *src,     \
                                                      ptrdiff_t stride, \
                                                      int height,       \
                                                      int log2_denom,   \
                                                      int weightd,      \
                                                      int weights,      \
                                                      int offset);

/* sse2 and sse4 weight + biweight prototypes for one block width. */
#define H264_BIWEIGHT_10_SSE(W, DEPTH)                                  \
    H264_WEIGHT_10(W, DEPTH, sse2)                                      \
    H264_WEIGHT_10(W, DEPTH, sse4)                                      \
    H264_BIWEIGHT_10(W, DEPTH, sse2)                                    \
    H264_BIWEIGHT_10(W, DEPTH, sse4)

/*
 * The init function assigns the 10-bit routines for widths 16, 8 and 4,
 * so all three widths must be declared here.
 */
H264_BIWEIGHT_10_SSE(16, 10)
H264_BIWEIGHT_10_SSE(8,  10)
H264_BIWEIGHT_10_SSE(4,  10)
184 
186  const int chroma_format_idc)
187 {
188  int cpu_flags = av_get_cpu_flags();
189 
190  if (EXTERNAL_MMXEXT(cpu_flags) && chroma_format_idc <= 1)
191  c->loop_filter_strength = ff_h264_loop_filter_strength_mmxext;
192 
193  if (bit_depth == 8) {
194  if (EXTERNAL_MMX(cpu_flags)) {
195  if (chroma_format_idc <= 1) {
196  } else {
197  c->idct_add8 = ff_h264_idct_add8_422_8_mmx;
198  }
199  }
200  if (EXTERNAL_MMXEXT(cpu_flags)) {
201  c->idct8_dc_add = ff_h264_idct8_dc_add_8_mmxext;
202 
203  c->weight_pixels_tab[2] = ff_h264_weight_4_mmxext;
204 
205  c->biweight_pixels_tab[2] = ff_h264_biweight_4_mmxext;
206  }
207  if (EXTERNAL_SSE2(cpu_flags)) {
208  c->idct8_add = ff_h264_idct8_add_8_sse2;
209 
210  c->idct_add16 = ff_h264_idct_add16_8_sse2;
211  c->idct8_add4 = ff_h264_idct8_add4_8_sse2;
212  if (chroma_format_idc <= 1)
213  c->idct_add8 = ff_h264_idct_add8_8_sse2;
214  c->idct_add16intra = ff_h264_idct_add16intra_8_sse2;
215  c->luma_dc_dequant_idct = ff_h264_luma_dc_dequant_idct_sse2;
216 
217  c->weight_pixels_tab[0] = ff_h264_weight_16_sse2;
218  c->weight_pixels_tab[1] = ff_h264_weight_8_sse2;
219 
220  c->biweight_pixels_tab[0] = ff_h264_biweight_16_sse2;
221  c->biweight_pixels_tab[1] = ff_h264_biweight_8_sse2;
222 
223  c->v_loop_filter_luma = ff_deblock_v_luma_8_sse2;
224  c->h_loop_filter_luma = ff_deblock_h_luma_8_sse2;
225  c->v_loop_filter_luma_intra = ff_deblock_v_luma_intra_8_sse2;
226  c->h_loop_filter_luma_intra = ff_deblock_h_luma_intra_8_sse2;
227 
228 #if ARCH_X86_64
229  c->h_loop_filter_luma_mbaff = ff_deblock_h_luma_mbaff_8_sse2;
230 #endif
231 
232  c->v_loop_filter_chroma = ff_deblock_v_chroma_8_sse2;
233  c->v_loop_filter_chroma_intra = ff_deblock_v_chroma_intra_8_sse2;
234  if (chroma_format_idc <= 1) {
235  c->h_loop_filter_chroma = ff_deblock_h_chroma_8_sse2;
236  c->h_loop_filter_chroma_intra = ff_deblock_h_chroma_intra_8_sse2;
237  } else {
238  c->h_loop_filter_chroma = ff_deblock_h_chroma422_8_sse2;
239  c->h_loop_filter_chroma_intra = ff_deblock_h_chroma422_intra_8_sse2;
240  }
241 
242  c->idct_add = ff_h264_idct_add_8_sse2;
243  c->idct_dc_add = ff_h264_idct_dc_add_8_sse2;
244  }
245  if (EXTERNAL_SSSE3(cpu_flags)) {
246  c->biweight_pixels_tab[0] = ff_h264_biweight_16_ssse3;
247  c->biweight_pixels_tab[1] = ff_h264_biweight_8_ssse3;
248  }
249  if (EXTERNAL_AVX(cpu_flags)) {
250  c->v_loop_filter_luma = ff_deblock_v_luma_8_avx;
251  c->h_loop_filter_luma = ff_deblock_h_luma_8_avx;
252  c->v_loop_filter_luma_intra = ff_deblock_v_luma_intra_8_avx;
253  c->h_loop_filter_luma_intra = ff_deblock_h_luma_intra_8_avx;
254 #if ARCH_X86_64
255  c->h_loop_filter_luma_mbaff = ff_deblock_h_luma_mbaff_8_avx;
256 #endif
257 
258  c->v_loop_filter_chroma = ff_deblock_v_chroma_8_avx;
259  c->v_loop_filter_chroma_intra = ff_deblock_v_chroma_intra_8_avx;
260  if (chroma_format_idc <= 1) {
261  c->h_loop_filter_chroma = ff_deblock_h_chroma_8_avx;
262  c->h_loop_filter_chroma_intra = ff_deblock_h_chroma_intra_8_avx;
263  } else {
264  c->h_loop_filter_chroma = ff_deblock_h_chroma422_8_avx;
265  c->h_loop_filter_chroma_intra = ff_deblock_h_chroma422_intra_8_avx;
266  }
267 
268  c->idct_add = ff_h264_idct_add_8_avx;
269  c->idct_dc_add = ff_h264_idct_dc_add_8_avx;
270  }
271  } else if (bit_depth == 10) {
272  if (EXTERNAL_MMXEXT(cpu_flags)) {
273  c->idct_dc_add = ff_h264_idct_dc_add_10_mmxext;
274  }
275  if (EXTERNAL_SSE2(cpu_flags)) {
276  c->idct_add = ff_h264_idct_add_10_sse2;
277  c->idct8_dc_add = ff_h264_idct8_dc_add_10_sse2;
278 
279  c->idct_add16 = ff_h264_idct_add16_10_sse2;
280  if (chroma_format_idc <= 1) {
281  c->idct_add8 = ff_h264_idct_add8_10_sse2;
282  } else {
283  c->idct_add8 = ff_h264_idct_add8_422_10_sse2;
284  }
285  c->idct_add16intra = ff_h264_idct_add16intra_10_sse2;
286 #if HAVE_ALIGNED_STACK
287  c->idct8_add = ff_h264_idct8_add_10_sse2;
288  c->idct8_add4 = ff_h264_idct8_add4_10_sse2;
289 #endif /* HAVE_ALIGNED_STACK */
290 
291  c->weight_pixels_tab[0] = ff_h264_weight_16_10_sse2;
292  c->weight_pixels_tab[1] = ff_h264_weight_8_10_sse2;
293  c->weight_pixels_tab[2] = ff_h264_weight_4_10_sse2;
294 
295  c->biweight_pixels_tab[0] = ff_h264_biweight_16_10_sse2;
296  c->biweight_pixels_tab[1] = ff_h264_biweight_8_10_sse2;
297  c->biweight_pixels_tab[2] = ff_h264_biweight_4_10_sse2;
298 
299  c->v_loop_filter_chroma = ff_deblock_v_chroma_10_sse2;
300  c->v_loop_filter_chroma_intra = ff_deblock_v_chroma_intra_10_sse2;
301  if (chroma_format_idc <= 1) {
302  c->h_loop_filter_chroma = ff_deblock_h_chroma_10_sse2;
303  } else {
304  c->h_loop_filter_chroma = ff_deblock_h_chroma422_10_sse2;
305  }
306  c->v_loop_filter_luma = ff_deblock_v_luma_10_sse2;
307  c->h_loop_filter_luma = ff_deblock_h_luma_10_sse2;
308  c->v_loop_filter_luma_intra = ff_deblock_v_luma_intra_10_sse2;
309  c->h_loop_filter_luma_intra = ff_deblock_h_luma_intra_10_sse2;
310  }
311  if (EXTERNAL_SSE4(cpu_flags)) {
312  c->weight_pixels_tab[0] = ff_h264_weight_16_10_sse4;
313  c->weight_pixels_tab[1] = ff_h264_weight_8_10_sse4;
314  c->weight_pixels_tab[2] = ff_h264_weight_4_10_sse4;
315 
316  c->biweight_pixels_tab[0] = ff_h264_biweight_16_10_sse4;
317  c->biweight_pixels_tab[1] = ff_h264_biweight_8_10_sse4;
318  c->biweight_pixels_tab[2] = ff_h264_biweight_4_10_sse4;
319  }
320  if (EXTERNAL_AVX(cpu_flags)) {
321  c->idct_dc_add =
322  c->idct_add = ff_h264_idct_add_10_avx;
323  c->idct8_dc_add = ff_h264_idct8_dc_add_10_avx;
324 
325  c->idct_add16 = ff_h264_idct_add16_10_avx;
326  if (chroma_format_idc <= 1) {
327  c->idct_add8 = ff_h264_idct_add8_10_avx;
328  } else {
329  c->idct_add8 = ff_h264_idct_add8_422_10_avx;
330  }
331  c->idct_add16intra = ff_h264_idct_add16intra_10_avx;
332 #if HAVE_ALIGNED_STACK
333  c->idct8_add = ff_h264_idct8_add_10_avx;
334  c->idct8_add4 = ff_h264_idct8_add4_10_avx;
335 #endif /* HAVE_ALIGNED_STACK */
336 
337  c->v_loop_filter_chroma = ff_deblock_v_chroma_10_avx;
338  c->v_loop_filter_chroma_intra = ff_deblock_v_chroma_intra_10_avx;
339  if (chroma_format_idc <= 1) {
340  c->h_loop_filter_chroma = ff_deblock_h_chroma_10_avx;
341  } else {
342  c->h_loop_filter_chroma = ff_deblock_h_chroma422_10_avx;
343  }
344  c->v_loop_filter_luma = ff_deblock_v_luma_10_avx;
345  c->h_loop_filter_luma = ff_deblock_h_luma_10_avx;
346  c->v_loop_filter_luma_intra = ff_deblock_v_luma_intra_10_avx;
347  c->h_loop_filter_luma_intra = ff_deblock_h_luma_intra_10_avx;
348  }
349  }
350 }
cpu.h
IDCT_ADD_REP_FUNC
#define IDCT_ADD_REP_FUNC(NUM, REP, DEPTH, OPT)
Definition: h264dsp_init.c:49
mv
static const int8_t mv[256][2]
Definition: 4xm.c:81
output
filter_frame For filters that do not use the this method is called when a frame is pushed to the filter s input It can be called at any time except in a reentrant way If the input frame is enough to produce output
Definition: filter_design.txt:226
H264_BIWEIGHT_MMX
#define H264_BIWEIGHT_MMX(W)
Definition: h264dsp_init.c:144
step
trying all byte sequences megabyte in length and selecting the best looking sequence will yield cases to try But a word about which is also called distortion Distortion can be quantified by almost any quality measurement one chooses the sum of squared differences is used but more complex methods that consider psychovisual effects can be used as well It makes no difference in this discussion First step
Definition: rate_distortion.txt:58
ff_h264_luma_dc_dequant_idct_sse2
void ff_h264_luma_dc_dequant_idct_sse2(int16_t *output, int16_t *input, int qmul)
av_get_cpu_flags
int av_get_cpu_flags(void)
Return the flags which specify extensions supported by the CPU.
Definition: cpu.c:109
bit_depth
static void bit_depth(AudioStatsContext *s, const uint64_t *const mask, uint8_t *depth)
Definition: af_astats.c:246
cpu_flags
static atomic_int cpu_flags
Definition: cpu.c:56
IDCT_ADD_REP_FUNC2
#define IDCT_ADD_REP_FUNC2(NUM, REP, DEPTH, OPT)
Definition: h264dsp_init.c:65
ff_h264dsp_init_x86
av_cold void ff_h264dsp_init_x86(H264DSPContext *c, const int bit_depth, const int chroma_format_idc)
Definition: h264dsp_init.c:185
av_cold
#define av_cold
Definition: attributes.h:106
field
it s the only field you need to keep assuming you have a context There is some magic you don t need to care about around this field
Definition: writing_filters.txt:78
H264_BIWEIGHT_10_SSE
#define H264_BIWEIGHT_10_SSE(W, DEPTH)
Definition: h264dsp_init.c:175
LF_FUNC
#define LF_FUNC(DIR, TYPE, DEPTH, OPT)
Definition: h264dsp_init.c:90
asm.h
H264_BIWEIGHT_SSE
#define H264_BIWEIGHT_SSE(W)
Definition: h264dsp_init.c:148
h264dsp.h
c
Undefined Behavior In the C some operations are like signed integer dereferencing freed accessing outside allocated Undefined Behavior must not occur in a C it is not safe even if the output of undefined operations is unused The unsafety may seem nit picking but Optimizing compilers have in fact optimized code on the assumption that no undefined Behavior occurs Optimizing code based on wrong assumptions can and has in some cases lead to effects beyond the output of computations The signed integer overflow problem in speed critical code Code which is highly optimized and works with signed integers sometimes has the problem that often the output of the computation does not c
Definition: undefined.txt:32
H264DSPContext
Context for storing H.264 DSP functions.
Definition: h264dsp.h:42
cpu.h
attributes.h
EXTERNAL_SSE2
#define EXTERNAL_SSE2(flags)
Definition: cpu.h:53
input
and forward the test the status of outputs and forward it to the corresponding return FFERROR_NOT_READY If the filters stores internally one or a few frame for some input
Definition: filter_design.txt:172
LF_FUNCS
#define LF_FUNCS(type, depth)
Definition: h264dsp_init.c:102
EXTERNAL_AVX
#define EXTERNAL_AVX(flags)
Definition: cpu.h:64
EXTERNAL_SSE4
#define EXTERNAL_SSE4(flags)
Definition: cpu.h:62
ref
static int ref[MAX_W *MAX_W]
Definition: jpeg2000dwt.c:117
ff_h264_loop_filter_strength_mmxext
void ff_h264_loop_filter_strength_mmxext(int16_t bS[2][4][4], uint8_t nnz[40], int8_t ref[2][40], int16_t mv[2][40][2], int bidir, int edges, int step, int mask_mv0, int mask_mv1, int field)
IDCT_ADD_FUNC
#define IDCT_ADD_FUNC(NUM, DEPTH, OPT)
Definition: h264dsp_init.c:29
h
h
Definition: vp9dsp_template.c:2070
EXTERNAL_SSSE3
#define EXTERNAL_SSSE3(flags)
Definition: cpu.h:59
EXTERNAL_MMX
#define EXTERNAL_MMX(flags)
Definition: cpu.h:50
EXTERNAL_MMXEXT
#define EXTERNAL_MMXEXT(flags)
Definition: cpu.h:51