FFmpeg
vp9dsp_template.c
Go to the documentation of this file.
1 /*
2  * VP9 compatible video decoder
3  *
4  * Copyright (C) 2013 Ronald S. Bultje <rsbultje gmail com>
5  * Copyright (C) 2013 Clément Bœsch <u pkh me>
6  *
7  * This file is part of FFmpeg.
8  *
9  * FFmpeg is free software; you can redistribute it and/or
10  * modify it under the terms of the GNU Lesser General Public
11  * License as published by the Free Software Foundation; either
12  * version 2.1 of the License, or (at your option) any later version.
13  *
14  * FFmpeg is distributed in the hope that it will be useful,
15  * but WITHOUT ANY WARRANTY; without even the implied warranty of
16  * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
17  * Lesser General Public License for more details.
18  *
19  * You should have received a copy of the GNU Lesser General Public
20  * License along with FFmpeg; if not, write to the Free Software
21  * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
22  */
23 
24 #include "libavutil/common.h"
25 #include "bit_depth_template.c"
26 #include "vp9dsp.h"
27 
28 #if BIT_DEPTH != 12
29 
30 // FIXME see whether we can merge parts of this (perhaps at least 4x4 and 8x8)
31 // back with h264pred.[ch]
32 
33 static void vert_4x4_c(uint8_t *restrict _dst, ptrdiff_t stride,
34  const uint8_t *left, const uint8_t *_top)
35 {
36  pixel *dst = (pixel *) _dst;
37  const pixel *top = (const pixel *) _top;
38  pixel4 p4 = AV_RN4PA(top);
39 
40  stride /= sizeof(pixel);
41  AV_WN4PA(dst + stride * 0, p4);
42  AV_WN4PA(dst + stride * 1, p4);
43  AV_WN4PA(dst + stride * 2, p4);
44  AV_WN4PA(dst + stride * 3, p4);
45 }
46 
47 static void vert_8x8_c(uint8_t *restrict _dst, ptrdiff_t stride,
48  const uint8_t *left, const uint8_t *_top)
49 {
50  pixel *dst = (pixel *) _dst;
51  const pixel *top = (const pixel *) _top;
52 #if BIT_DEPTH == 8
53  uint64_t p8 = AV_RN64A(top);
54 #else
55  pixel4 p4a = AV_RN4PA(top + 0);
56  pixel4 p4b = AV_RN4PA(top + 4);
57 #endif
58  int y;
59 
60  stride /= sizeof(pixel);
61  for (y = 0; y < 8; y++) {
62 #if BIT_DEPTH == 8
63  AV_WN64A(dst, p8);
64 #else
65  AV_WN4PA(dst + 0, p4a);
66  AV_WN4PA(dst + 4, p4b);
67 #endif
68  dst += stride;
69  }
70 }
71 
72 static void vert_16x16_c(uint8_t *restrict _dst, ptrdiff_t stride,
73  const uint8_t *left, const uint8_t *_top)
74 {
75  pixel *dst = (pixel *) _dst;
76  const pixel *top = (const pixel *) _top;
77 #if BIT_DEPTH == 8
78  uint64_t p8a = AV_RN64A(top);
79  uint64_t p8b = AV_RN64A(top + 8);
80 #else
81  pixel4 p4a = AV_RN4PA(top + 0);
82  pixel4 p4b = AV_RN4PA(top + 4);
83  pixel4 p4c = AV_RN4PA(top + 8);
84  pixel4 p4d = AV_RN4PA(top + 12);
85 #endif
86  int y;
87 
88  stride /= sizeof(pixel);
89  for (y = 0; y < 16; y++) {
90 #if BIT_DEPTH == 8
91  AV_WN64A(dst + 0, p8a);
92  AV_WN64A(dst + 8, p8b);
93 #else
94  AV_WN4PA(dst + 0, p4a);
95  AV_WN4PA(dst + 4, p4b);
96  AV_WN4PA(dst + 8, p4c);
97  AV_WN4PA(dst + 12, p4d);
98 #endif
99  dst += stride;
100  }
101 }
102 
103 static void vert_32x32_c(uint8_t *restrict _dst, ptrdiff_t stride,
104  const uint8_t *left, const uint8_t *_top)
105 {
106  pixel *dst = (pixel *) _dst;
107  const pixel *top = (const pixel *) _top;
108 #if BIT_DEPTH == 8
109  uint64_t p8a = AV_RN64A(top);
110  uint64_t p8b = AV_RN64A(top + 8);
111  uint64_t p8c = AV_RN64A(top + 16);
112  uint64_t p8d = AV_RN64A(top + 24);
113 #else
114  pixel4 p4a = AV_RN4PA(top + 0);
115  pixel4 p4b = AV_RN4PA(top + 4);
116  pixel4 p4c = AV_RN4PA(top + 8);
117  pixel4 p4d = AV_RN4PA(top + 12);
118  pixel4 p4e = AV_RN4PA(top + 16);
119  pixel4 p4f = AV_RN4PA(top + 20);
120  pixel4 p4g = AV_RN4PA(top + 24);
121  pixel4 p4h = AV_RN4PA(top + 28);
122 #endif
123  int y;
124 
125  stride /= sizeof(pixel);
126  for (y = 0; y < 32; y++) {
127 #if BIT_DEPTH == 8
128  AV_WN64A(dst + 0, p8a);
129  AV_WN64A(dst + 8, p8b);
130  AV_WN64A(dst + 16, p8c);
131  AV_WN64A(dst + 24, p8d);
132 #else
133  AV_WN4PA(dst + 0, p4a);
134  AV_WN4PA(dst + 4, p4b);
135  AV_WN4PA(dst + 8, p4c);
136  AV_WN4PA(dst + 12, p4d);
137  AV_WN4PA(dst + 16, p4e);
138  AV_WN4PA(dst + 20, p4f);
139  AV_WN4PA(dst + 24, p4g);
140  AV_WN4PA(dst + 28, p4h);
141 #endif
142  dst += stride;
143  }
144 }
145 
146 static void hor_4x4_c(uint8_t *_dst, ptrdiff_t stride,
147  const uint8_t *_left, const uint8_t *top)
148 {
149  pixel *dst = (pixel *) _dst;
150  const pixel *left = (const pixel *) _left;
151 
152  stride /= sizeof(pixel);
153  AV_WN4PA(dst + stride * 0, PIXEL_SPLAT_X4(left[3]));
154  AV_WN4PA(dst + stride * 1, PIXEL_SPLAT_X4(left[2]));
155  AV_WN4PA(dst + stride * 2, PIXEL_SPLAT_X4(left[1]));
156  AV_WN4PA(dst + stride * 3, PIXEL_SPLAT_X4(left[0]));
157 }
158 
159 static void hor_8x8_c(uint8_t *_dst, ptrdiff_t stride,
160  const uint8_t *_left, const uint8_t *top)
161 {
162  pixel *dst = (pixel *) _dst;
163  const pixel *left = (const pixel *) _left;
164  int y;
165 
166  stride /= sizeof(pixel);
167  for (y = 0; y < 8; y++) {
168  pixel4 p4 = PIXEL_SPLAT_X4(left[7 - y]);
169 
170  AV_WN4PA(dst + 0, p4);
171  AV_WN4PA(dst + 4, p4);
172  dst += stride;
173  }
174 }
175 
176 static void hor_16x16_c(uint8_t *_dst, ptrdiff_t stride,
177  const uint8_t *_left, const uint8_t *top)
178 {
179  pixel *dst = (pixel *) _dst;
180  const pixel *left = (const pixel *) _left;
181  int y;
182 
183  stride /= sizeof(pixel);
184  for (y = 0; y < 16; y++) {
185  pixel4 p4 = PIXEL_SPLAT_X4(left[15 - y]);
186 
187  AV_WN4PA(dst + 0, p4);
188  AV_WN4PA(dst + 4, p4);
189  AV_WN4PA(dst + 8, p4);
190  AV_WN4PA(dst + 12, p4);
191  dst += stride;
192  }
193 }
194 
195 static void hor_32x32_c(uint8_t *_dst, ptrdiff_t stride,
196  const uint8_t *_left, const uint8_t *top)
197 {
198  pixel *dst = (pixel *) _dst;
199  const pixel *left = (const pixel *) _left;
200  int y;
201 
202  stride /= sizeof(pixel);
203  for (y = 0; y < 32; y++) {
204  pixel4 p4 = PIXEL_SPLAT_X4(left[31 - y]);
205 
206  AV_WN4PA(dst + 0, p4);
207  AV_WN4PA(dst + 4, p4);
208  AV_WN4PA(dst + 8, p4);
209  AV_WN4PA(dst + 12, p4);
210  AV_WN4PA(dst + 16, p4);
211  AV_WN4PA(dst + 20, p4);
212  AV_WN4PA(dst + 24, p4);
213  AV_WN4PA(dst + 28, p4);
214  dst += stride;
215  }
216 }
217 
218 #endif /* BIT_DEPTH != 12 */
219 
220 static void tm_4x4_c(uint8_t *_dst, ptrdiff_t stride,
221  const uint8_t *_left, const uint8_t *_top)
222 {
223  pixel *dst = (pixel *) _dst;
224  const pixel *left = (const pixel *) _left;
225  const pixel *top = (const pixel *) _top;
226  int y, tl = top[-1];
227 
228  stride /= sizeof(pixel);
229  for (y = 0; y < 4; y++) {
230  int l_m_tl = left[3 - y] - tl;
231 
232  dst[0] = av_clip_pixel(top[0] + l_m_tl);
233  dst[1] = av_clip_pixel(top[1] + l_m_tl);
234  dst[2] = av_clip_pixel(top[2] + l_m_tl);
235  dst[3] = av_clip_pixel(top[3] + l_m_tl);
236  dst += stride;
237  }
238 }
239 
240 static void tm_8x8_c(uint8_t *_dst, ptrdiff_t stride,
241  const uint8_t *_left, const uint8_t *_top)
242 {
243  pixel *dst = (pixel *) _dst;
244  const pixel *left = (const pixel *) _left;
245  const pixel *top = (const pixel *) _top;
246  int y, tl = top[-1];
247 
248  stride /= sizeof(pixel);
249  for (y = 0; y < 8; y++) {
250  int l_m_tl = left[7 - y] - tl;
251 
252  dst[0] = av_clip_pixel(top[0] + l_m_tl);
253  dst[1] = av_clip_pixel(top[1] + l_m_tl);
254  dst[2] = av_clip_pixel(top[2] + l_m_tl);
255  dst[3] = av_clip_pixel(top[3] + l_m_tl);
256  dst[4] = av_clip_pixel(top[4] + l_m_tl);
257  dst[5] = av_clip_pixel(top[5] + l_m_tl);
258  dst[6] = av_clip_pixel(top[6] + l_m_tl);
259  dst[7] = av_clip_pixel(top[7] + l_m_tl);
260  dst += stride;
261  }
262 }
263 
264 static void tm_16x16_c(uint8_t *_dst, ptrdiff_t stride,
265  const uint8_t *_left, const uint8_t *_top)
266 {
267  pixel *dst = (pixel *) _dst;
268  const pixel *left = (const pixel *) _left;
269  const pixel *top = (const pixel *) _top;
270  int y, tl = top[-1];
271 
272  stride /= sizeof(pixel);
273  for (y = 0; y < 16; y++) {
274  int l_m_tl = left[15 - y] - tl;
275 
276  dst[ 0] = av_clip_pixel(top[ 0] + l_m_tl);
277  dst[ 1] = av_clip_pixel(top[ 1] + l_m_tl);
278  dst[ 2] = av_clip_pixel(top[ 2] + l_m_tl);
279  dst[ 3] = av_clip_pixel(top[ 3] + l_m_tl);
280  dst[ 4] = av_clip_pixel(top[ 4] + l_m_tl);
281  dst[ 5] = av_clip_pixel(top[ 5] + l_m_tl);
282  dst[ 6] = av_clip_pixel(top[ 6] + l_m_tl);
283  dst[ 7] = av_clip_pixel(top[ 7] + l_m_tl);
284  dst[ 8] = av_clip_pixel(top[ 8] + l_m_tl);
285  dst[ 9] = av_clip_pixel(top[ 9] + l_m_tl);
286  dst[10] = av_clip_pixel(top[10] + l_m_tl);
287  dst[11] = av_clip_pixel(top[11] + l_m_tl);
288  dst[12] = av_clip_pixel(top[12] + l_m_tl);
289  dst[13] = av_clip_pixel(top[13] + l_m_tl);
290  dst[14] = av_clip_pixel(top[14] + l_m_tl);
291  dst[15] = av_clip_pixel(top[15] + l_m_tl);
292  dst += stride;
293  }
294 }
295 
296 static void tm_32x32_c(uint8_t *_dst, ptrdiff_t stride,
297  const uint8_t *_left, const uint8_t *_top)
298 {
299  pixel *dst = (pixel *) _dst;
300  const pixel *left = (const pixel *) _left;
301  const pixel *top = (const pixel *) _top;
302  int y, tl = top[-1];
303 
304  stride /= sizeof(pixel);
305  for (y = 0; y < 32; y++) {
306  int l_m_tl = left[31 - y] - tl;
307 
308  dst[ 0] = av_clip_pixel(top[ 0] + l_m_tl);
309  dst[ 1] = av_clip_pixel(top[ 1] + l_m_tl);
310  dst[ 2] = av_clip_pixel(top[ 2] + l_m_tl);
311  dst[ 3] = av_clip_pixel(top[ 3] + l_m_tl);
312  dst[ 4] = av_clip_pixel(top[ 4] + l_m_tl);
313  dst[ 5] = av_clip_pixel(top[ 5] + l_m_tl);
314  dst[ 6] = av_clip_pixel(top[ 6] + l_m_tl);
315  dst[ 7] = av_clip_pixel(top[ 7] + l_m_tl);
316  dst[ 8] = av_clip_pixel(top[ 8] + l_m_tl);
317  dst[ 9] = av_clip_pixel(top[ 9] + l_m_tl);
318  dst[10] = av_clip_pixel(top[10] + l_m_tl);
319  dst[11] = av_clip_pixel(top[11] + l_m_tl);
320  dst[12] = av_clip_pixel(top[12] + l_m_tl);
321  dst[13] = av_clip_pixel(top[13] + l_m_tl);
322  dst[14] = av_clip_pixel(top[14] + l_m_tl);
323  dst[15] = av_clip_pixel(top[15] + l_m_tl);
324  dst[16] = av_clip_pixel(top[16] + l_m_tl);
325  dst[17] = av_clip_pixel(top[17] + l_m_tl);
326  dst[18] = av_clip_pixel(top[18] + l_m_tl);
327  dst[19] = av_clip_pixel(top[19] + l_m_tl);
328  dst[20] = av_clip_pixel(top[20] + l_m_tl);
329  dst[21] = av_clip_pixel(top[21] + l_m_tl);
330  dst[22] = av_clip_pixel(top[22] + l_m_tl);
331  dst[23] = av_clip_pixel(top[23] + l_m_tl);
332  dst[24] = av_clip_pixel(top[24] + l_m_tl);
333  dst[25] = av_clip_pixel(top[25] + l_m_tl);
334  dst[26] = av_clip_pixel(top[26] + l_m_tl);
335  dst[27] = av_clip_pixel(top[27] + l_m_tl);
336  dst[28] = av_clip_pixel(top[28] + l_m_tl);
337  dst[29] = av_clip_pixel(top[29] + l_m_tl);
338  dst[30] = av_clip_pixel(top[30] + l_m_tl);
339  dst[31] = av_clip_pixel(top[31] + l_m_tl);
340  dst += stride;
341  }
342 }
343 
344 #if BIT_DEPTH != 12
345 
346 static void dc_4x4_c(uint8_t *_dst, ptrdiff_t stride,
347  const uint8_t *_left, const uint8_t *_top)
348 {
349  pixel *dst = (pixel *) _dst;
350  const pixel *left = (const pixel *) _left;
351  const pixel *top = (const pixel *) _top;
352  pixel4 dc = PIXEL_SPLAT_X4((left[0] + left[1] + left[2] + left[3] +
353  top[0] + top[1] + top[2] + top[3] + 4) >> 3);
354 
355  stride /= sizeof(pixel);
356  AV_WN4PA(dst + stride * 0, dc);
357  AV_WN4PA(dst + stride * 1, dc);
358  AV_WN4PA(dst + stride * 2, dc);
359  AV_WN4PA(dst + stride * 3, dc);
360 }
361 
362 static void dc_8x8_c(uint8_t *_dst, ptrdiff_t stride,
363  const uint8_t *_left, const uint8_t *_top)
364 {
365  pixel *dst = (pixel *) _dst;
366  const pixel *left = (const pixel *) _left;
367  const pixel *top = (const pixel *) _top;
369  ((left[0] + left[1] + left[2] + left[3] + left[4] + left[5] +
370  left[6] + left[7] + top[0] + top[1] + top[2] + top[3] +
371  top[4] + top[5] + top[6] + top[7] + 8) >> 4);
372  int y;
373 
374  stride /= sizeof(pixel);
375  for (y = 0; y < 8; y++) {
376  AV_WN4PA(dst + 0, dc);
377  AV_WN4PA(dst + 4, dc);
378  dst += stride;
379  }
380 }
381 
382 static void dc_16x16_c(uint8_t *_dst, ptrdiff_t stride,
383  const uint8_t *_left, const uint8_t *_top)
384 {
385  pixel *dst = (pixel *) _dst;
386  const pixel *left = (const pixel *) _left;
387  const pixel *top = (const pixel *) _top;
389  ((left[0] + left[1] + left[2] + left[3] + left[4] + left[5] + left[6] +
390  left[7] + left[8] + left[9] + left[10] + left[11] + left[12] +
391  left[13] + left[14] + left[15] + top[0] + top[1] + top[2] + top[3] +
392  top[4] + top[5] + top[6] + top[7] + top[8] + top[9] + top[10] +
393  top[11] + top[12] + top[13] + top[14] + top[15] + 16) >> 5);
394  int y;
395 
396  stride /= sizeof(pixel);
397  for (y = 0; y < 16; y++) {
398  AV_WN4PA(dst + 0, dc);
399  AV_WN4PA(dst + 4, dc);
400  AV_WN4PA(dst + 8, dc);
401  AV_WN4PA(dst + 12, dc);
402  dst += stride;
403  }
404 }
405 
406 static void dc_32x32_c(uint8_t *_dst, ptrdiff_t stride,
407  const uint8_t *_left, const uint8_t *_top)
408 {
409  pixel *dst = (pixel *) _dst;
410  const pixel *left = (const pixel *) _left;
411  const pixel *top = (const pixel *) _top;
413  ((left[0] + left[1] + left[2] + left[3] + left[4] + left[5] + left[6] +
414  left[7] + left[8] + left[9] + left[10] + left[11] + left[12] +
415  left[13] + left[14] + left[15] + left[16] + left[17] + left[18] +
416  left[19] + left[20] + left[21] + left[22] + left[23] + left[24] +
417  left[25] + left[26] + left[27] + left[28] + left[29] + left[30] +
418  left[31] + top[0] + top[1] + top[2] + top[3] + top[4] + top[5] +
419  top[6] + top[7] + top[8] + top[9] + top[10] + top[11] + top[12] +
420  top[13] + top[14] + top[15] + top[16] + top[17] + top[18] + top[19] +
421  top[20] + top[21] + top[22] + top[23] + top[24] + top[25] + top[26] +
422  top[27] + top[28] + top[29] + top[30] + top[31] + 32) >> 6);
423  int y;
424 
425  stride /= sizeof(pixel);
426  for (y = 0; y < 32; y++) {
427  AV_WN4PA(dst + 0, dc);
428  AV_WN4PA(dst + 4, dc);
429  AV_WN4PA(dst + 8, dc);
430  AV_WN4PA(dst + 12, dc);
431  AV_WN4PA(dst + 16, dc);
432  AV_WN4PA(dst + 20, dc);
433  AV_WN4PA(dst + 24, dc);
434  AV_WN4PA(dst + 28, dc);
435  dst += stride;
436  }
437 }
438 
439 static void dc_left_4x4_c(uint8_t *_dst, ptrdiff_t stride,
440  const uint8_t *_left, const uint8_t *top)
441 {
442  pixel *dst = (pixel *) _dst;
443  const pixel *left = (const pixel *) _left;
444  pixel4 dc = PIXEL_SPLAT_X4((left[0] + left[1] + left[2] + left[3] + 2) >> 2);
445 
446  stride /= sizeof(pixel);
447  AV_WN4PA(dst + stride * 0, dc);
448  AV_WN4PA(dst + stride * 1, dc);
449  AV_WN4PA(dst + stride * 2, dc);
450  AV_WN4PA(dst + stride * 3, dc);
451 }
452 
453 static void dc_left_8x8_c(uint8_t *_dst, ptrdiff_t stride,
454  const uint8_t *_left, const uint8_t *top)
455 {
456  pixel *dst = (pixel *) _dst;
457  const pixel *left = (const pixel *) _left;
459  ((left[0] + left[1] + left[2] + left[3] +
460  left[4] + left[5] + left[6] + left[7] + 4) >> 3);
461  int y;
462 
463  stride /= sizeof(pixel);
464  for (y = 0; y < 8; y++) {
465  AV_WN4PA(dst + 0, dc);
466  AV_WN4PA(dst + 4, dc);
467  dst += stride;
468  }
469 }
470 
471 static void dc_left_16x16_c(uint8_t *_dst, ptrdiff_t stride,
472  const uint8_t *_left, const uint8_t *top)
473 {
474  pixel *dst = (pixel *) _dst;
475  const pixel *left = (const pixel *) _left;
477  ((left[0] + left[1] + left[2] + left[3] + left[4] + left[5] +
478  left[6] + left[7] + left[8] + left[9] + left[10] + left[11] +
479  left[12] + left[13] + left[14] + left[15] + 8) >> 4);
480  int y;
481 
482  stride /= sizeof(pixel);
483  for (y = 0; y < 16; y++) {
484  AV_WN4PA(dst + 0, dc);
485  AV_WN4PA(dst + 4, dc);
486  AV_WN4PA(dst + 8, dc);
487  AV_WN4PA(dst + 12, dc);
488  dst += stride;
489  }
490 }
491 
492 static void dc_left_32x32_c(uint8_t *_dst, ptrdiff_t stride,
493  const uint8_t *_left, const uint8_t *top)
494 {
495  pixel *dst = (pixel *) _dst;
496  const pixel *left = (const pixel *) _left;
498  ((left[0] + left[1] + left[2] + left[3] + left[4] + left[5] +
499  left[6] + left[7] + left[8] + left[9] + left[10] + left[11] +
500  left[12] + left[13] + left[14] + left[15] + left[16] + left[17] +
501  left[18] + left[19] + left[20] + left[21] + left[22] + left[23] +
502  left[24] + left[25] + left[26] + left[27] + left[28] + left[29] +
503  left[30] + left[31] + 16) >> 5);
504  int y;
505 
506  stride /= sizeof(pixel);
507  for (y = 0; y < 32; y++) {
508  AV_WN4PA(dst + 0, dc);
509  AV_WN4PA(dst + 4, dc);
510  AV_WN4PA(dst + 8, dc);
511  AV_WN4PA(dst + 12, dc);
512  AV_WN4PA(dst + 16, dc);
513  AV_WN4PA(dst + 20, dc);
514  AV_WN4PA(dst + 24, dc);
515  AV_WN4PA(dst + 28, dc);
516  dst += stride;
517  }
518 }
519 
520 static void dc_top_4x4_c(uint8_t *_dst, ptrdiff_t stride,
521  const uint8_t *left, const uint8_t *_top)
522 {
523  pixel *dst = (pixel *) _dst;
524  const pixel *top = (const pixel *) _top;
525  pixel4 dc = PIXEL_SPLAT_X4((top[0] + top[1] + top[2] + top[3] + 2) >> 2);
526 
527  stride /= sizeof(pixel);
528  AV_WN4PA(dst + stride * 0, dc);
529  AV_WN4PA(dst + stride * 1, dc);
530  AV_WN4PA(dst + stride * 2, dc);
531  AV_WN4PA(dst + stride * 3, dc);
532 }
533 
534 static void dc_top_8x8_c(uint8_t *_dst, ptrdiff_t stride,
535  const uint8_t *left, const uint8_t *_top)
536 {
537  pixel *dst = (pixel *) _dst;
538  const pixel *top = (const pixel *) _top;
540  ((top[0] + top[1] + top[2] + top[3] +
541  top[4] + top[5] + top[6] + top[7] + 4) >> 3);
542  int y;
543 
544  stride /= sizeof(pixel);
545  for (y = 0; y < 8; y++) {
546  AV_WN4PA(dst + 0, dc);
547  AV_WN4PA(dst + 4, dc);
548  dst += stride;
549  }
550 }
551 
552 static void dc_top_16x16_c(uint8_t *_dst, ptrdiff_t stride,
553  const uint8_t *left, const uint8_t *_top)
554 {
555  pixel *dst = (pixel *) _dst;
556  const pixel *top = (const pixel *) _top;
558  ((top[0] + top[1] + top[2] + top[3] + top[4] + top[5] +
559  top[6] + top[7] + top[8] + top[9] + top[10] + top[11] +
560  top[12] + top[13] + top[14] + top[15] + 8) >> 4);
561  int y;
562 
563  stride /= sizeof(pixel);
564  for (y = 0; y < 16; y++) {
565  AV_WN4PA(dst + 0, dc);
566  AV_WN4PA(dst + 4, dc);
567  AV_WN4PA(dst + 8, dc);
568  AV_WN4PA(dst + 12, dc);
569  dst += stride;
570  }
571 }
572 
573 static void dc_top_32x32_c(uint8_t *_dst, ptrdiff_t stride,
574  const uint8_t *left, const uint8_t *_top)
575 {
576  pixel *dst = (pixel *) _dst;
577  const pixel *top = (const pixel *) _top;
579  ((top[0] + top[1] + top[2] + top[3] + top[4] + top[5] +
580  top[6] + top[7] + top[8] + top[9] + top[10] + top[11] +
581  top[12] + top[13] + top[14] + top[15] + top[16] + top[17] +
582  top[18] + top[19] + top[20] + top[21] + top[22] + top[23] +
583  top[24] + top[25] + top[26] + top[27] + top[28] + top[29] +
584  top[30] + top[31] + 16) >> 5);
585  int y;
586 
587  stride /= sizeof(pixel);
588  for (y = 0; y < 32; y++) {
589  AV_WN4PA(dst + 0, dc);
590  AV_WN4PA(dst + 4, dc);
591  AV_WN4PA(dst + 8, dc);
592  AV_WN4PA(dst + 12, dc);
593  AV_WN4PA(dst + 16, dc);
594  AV_WN4PA(dst + 20, dc);
595  AV_WN4PA(dst + 24, dc);
596  AV_WN4PA(dst + 28, dc);
597  dst += stride;
598  }
599 }
600 
601 #endif /* BIT_DEPTH != 12 */
602 
603 static void dc_128_4x4_c(uint8_t *_dst, ptrdiff_t stride,
604  const uint8_t *left, const uint8_t *top)
605 {
606  pixel *dst = (pixel *) _dst;
607  pixel4 val = PIXEL_SPLAT_X4(128 << (BIT_DEPTH - 8));
608 
609  stride /= sizeof(pixel);
610  AV_WN4PA(dst + stride * 0, val);
611  AV_WN4PA(dst + stride * 1, val);
612  AV_WN4PA(dst + stride * 2, val);
613  AV_WN4PA(dst + stride * 3, val);
614 }
615 
616 static void dc_128_8x8_c(uint8_t *_dst, ptrdiff_t stride,
617  const uint8_t *left, const uint8_t *top)
618 {
619  pixel *dst = (pixel *) _dst;
620  pixel4 val = PIXEL_SPLAT_X4(128 << (BIT_DEPTH - 8));
621  int y;
622 
623  stride /= sizeof(pixel);
624  for (y = 0; y < 8; y++) {
625  AV_WN4PA(dst + 0, val);
626  AV_WN4PA(dst + 4, val);
627  dst += stride;
628  }
629 }
630 
631 static void dc_128_16x16_c(uint8_t *_dst, ptrdiff_t stride,
632  const uint8_t *left, const uint8_t *top)
633 {
634  pixel *dst = (pixel *) _dst;
635  pixel4 val = PIXEL_SPLAT_X4(128 << (BIT_DEPTH - 8));
636  int y;
637 
638  stride /= sizeof(pixel);
639  for (y = 0; y < 16; y++) {
640  AV_WN4PA(dst + 0, val);
641  AV_WN4PA(dst + 4, val);
642  AV_WN4PA(dst + 8, val);
643  AV_WN4PA(dst + 12, val);
644  dst += stride;
645  }
646 }
647 
648 static void dc_128_32x32_c(uint8_t *_dst, ptrdiff_t stride,
649  const uint8_t *left, const uint8_t *top)
650 {
651  pixel *dst = (pixel *) _dst;
652  pixel4 val = PIXEL_SPLAT_X4(128 << (BIT_DEPTH - 8));
653  int y;
654 
655  stride /= sizeof(pixel);
656  for (y = 0; y < 32; y++) {
657  AV_WN4PA(dst + 0, val);
658  AV_WN4PA(dst + 4, val);
659  AV_WN4PA(dst + 8, val);
660  AV_WN4PA(dst + 12, val);
661  AV_WN4PA(dst + 16, val);
662  AV_WN4PA(dst + 20, val);
663  AV_WN4PA(dst + 24, val);
664  AV_WN4PA(dst + 28, val);
665  dst += stride;
666  }
667 }
668 
669 static void dc_127_4x4_c(uint8_t *_dst, ptrdiff_t stride,
670  const uint8_t *left, const uint8_t *top)
671 {
672  pixel *dst = (pixel *) _dst;
673  pixel4 val = PIXEL_SPLAT_X4((128 << (BIT_DEPTH - 8)) - 1);
674 
675  stride /= sizeof(pixel);
676  AV_WN4PA(dst + stride * 0, val);
677  AV_WN4PA(dst + stride * 1, val);
678  AV_WN4PA(dst + stride * 2, val);
679  AV_WN4PA(dst + stride * 3, val);}
680 
681 static void dc_127_8x8_c(uint8_t *_dst, ptrdiff_t stride,
682  const uint8_t *left, const uint8_t *top)
683 {
684  pixel *dst = (pixel *) _dst;
685  pixel4 val = PIXEL_SPLAT_X4((128 << (BIT_DEPTH - 8)) - 1);
686  int y;
687 
688  stride /= sizeof(pixel);
689  for (y = 0; y < 8; y++) {
690  AV_WN4PA(dst + 0, val);
691  AV_WN4PA(dst + 4, val);
692  dst += stride;
693  }
694 }
695 
696 static void dc_127_16x16_c(uint8_t *_dst, ptrdiff_t stride,
697  const uint8_t *left, const uint8_t *top)
698 {
699  pixel *dst = (pixel *) _dst;
700  pixel4 val = PIXEL_SPLAT_X4((128 << (BIT_DEPTH - 8)) - 1);
701  int y;
702 
703  stride /= sizeof(pixel);
704  for (y = 0; y < 16; y++) {
705  AV_WN4PA(dst + 0, val);
706  AV_WN4PA(dst + 4, val);
707  AV_WN4PA(dst + 8, val);
708  AV_WN4PA(dst + 12, val);
709  dst += stride;
710  }
711 }
712 
713 static void dc_127_32x32_c(uint8_t *_dst, ptrdiff_t stride,
714  const uint8_t *left, const uint8_t *top)
715 {
716  pixel *dst = (pixel *) _dst;
717  pixel4 val = PIXEL_SPLAT_X4((128 << (BIT_DEPTH - 8)) - 1);
718  int y;
719 
720  stride /= sizeof(pixel);
721  for (y = 0; y < 32; y++) {
722  AV_WN4PA(dst + 0, val);
723  AV_WN4PA(dst + 4, val);
724  AV_WN4PA(dst + 8, val);
725  AV_WN4PA(dst + 12, val);
726  AV_WN4PA(dst + 16, val);
727  AV_WN4PA(dst + 20, val);
728  AV_WN4PA(dst + 24, val);
729  AV_WN4PA(dst + 28, val);
730  dst += stride;
731  }
732 }
733 
734 static void dc_129_4x4_c(uint8_t *_dst, ptrdiff_t stride,
735  const uint8_t *left, const uint8_t *top)
736 {
737  pixel *dst = (pixel *) _dst;
738  pixel4 val = PIXEL_SPLAT_X4((128 << (BIT_DEPTH - 8)) + 1);
739 
740  stride /= sizeof(pixel);
741  AV_WN4PA(dst + stride * 0, val);
742  AV_WN4PA(dst + stride * 1, val);
743  AV_WN4PA(dst + stride * 2, val);
744  AV_WN4PA(dst + stride * 3, val);
745 }
746 
747 static void dc_129_8x8_c(uint8_t *_dst, ptrdiff_t stride,
748  const uint8_t *left, const uint8_t *top)
749 {
750  pixel *dst = (pixel *) _dst;
751  pixel4 val = PIXEL_SPLAT_X4((128 << (BIT_DEPTH - 8)) + 1);
752  int y;
753 
754  stride /= sizeof(pixel);
755  for (y = 0; y < 8; y++) {
756  AV_WN4PA(dst + 0, val);
757  AV_WN4PA(dst + 4, val);
758  dst += stride;
759  }
760 }
761 
762 static void dc_129_16x16_c(uint8_t *_dst, ptrdiff_t stride,
763  const uint8_t *left, const uint8_t *top)
764 {
765  pixel *dst = (pixel *) _dst;
766  pixel4 val = PIXEL_SPLAT_X4((128 << (BIT_DEPTH - 8)) + 1);
767  int y;
768 
769  stride /= sizeof(pixel);
770  for (y = 0; y < 16; y++) {
771  AV_WN4PA(dst + 0, val);
772  AV_WN4PA(dst + 4, val);
773  AV_WN4PA(dst + 8, val);
774  AV_WN4PA(dst + 12, val);
775  dst += stride;
776  }
777 }
778 
779 static void dc_129_32x32_c(uint8_t *_dst, ptrdiff_t stride,
780  const uint8_t *left, const uint8_t *top)
781 {
782  pixel *dst = (pixel *) _dst;
783  pixel4 val = PIXEL_SPLAT_X4((128 << (BIT_DEPTH - 8)) + 1);
784  int y;
785 
786  stride /= sizeof(pixel);
787  for (y = 0; y < 32; y++) {
788  AV_WN4PA(dst + 0, val);
789  AV_WN4PA(dst + 4, val);
790  AV_WN4PA(dst + 8, val);
791  AV_WN4PA(dst + 12, val);
792  AV_WN4PA(dst + 16, val);
793  AV_WN4PA(dst + 20, val);
794  AV_WN4PA(dst + 24, val);
795  AV_WN4PA(dst + 28, val);
796  dst += stride;
797  }
798 }
799 
800 #if BIT_DEPTH != 12
801 
#if BIT_DEPTH == 8
/* One byte per sample: plain memset() does the job. */
#define memset_bpc memset
#else
/**
 * Store val (converted to uint16_t) into the first len elements of dst.
 * Nothing is written when len is zero or negative.
 */
static inline void memset_bpc(uint16_t *dst, int val, int len)
{
    int remaining;

    for (remaining = len; remaining > 0; remaining--)
        *dst++ = val;
}
#endif
812 
/* Element at column x, row y of the local "dst" buffer; relies on the
 * locals "dst" and "stride" (in elements) of the enclosing function. */
#define DST(x, y) dst[(y) * stride + (x)]
814 
815 static void diag_downleft_4x4_c(uint8_t *_dst, ptrdiff_t stride,
816  const uint8_t *left, const uint8_t *_top)
817 {
818  pixel *dst = (pixel *) _dst;
819  const pixel *top = (const pixel *) _top;
820  int a0 = top[0], a1 = top[1], a2 = top[2], a3 = top[3],
821  a4 = top[4], a5 = top[5], a6 = top[6], a7 = top[7];
822 
823  stride /= sizeof(pixel);
824  DST(0,0) = (a0 + a1 * 2 + a2 + 2) >> 2;
825  DST(1,0) = DST(0,1) = (a1 + a2 * 2 + a3 + 2) >> 2;
826  DST(2,0) = DST(1,1) = DST(0,2) = (a2 + a3 * 2 + a4 + 2) >> 2;
827  DST(3,0) = DST(2,1) = DST(1,2) = DST(0,3) = (a3 + a4 * 2 + a5 + 2) >> 2;
828  DST(3,1) = DST(2,2) = DST(1,3) = (a4 + a5 * 2 + a6 + 2) >> 2;
829  DST(3,2) = DST(2,3) = (a5 + a6 * 2 + a7 + 2) >> 2;
830  DST(3,3) = a7; // note: this is different from vp8 and such
831 }
832 
/*
 * Generates diag_downleft_<size>x<size>_c().
 *
 * The top edge is smoothed with a rounded (1,2,1) filter into a table of
 * size - 1 entries; row r of the output is that table starting at entry r,
 * padded on the right with r + 1 copies of top[size - 1].
 * The generated function does not read left.
 */
#define def_diag_downleft(size) \
static void diag_downleft_##size##x##size##_c(uint8_t *_dst, ptrdiff_t stride, \
                                              const uint8_t *left, const uint8_t *_top) \
{ \
    const pixel *top = (const pixel *) _top; \
    pixel *dst = (pixel *) _dst; \
    pixel filt[size - 1]; \
    int k, row; \
\
    stride /= sizeof(pixel); \
    for (k = 0; k + 2 < size; k++) \
        filt[k] = (top[k] + top[k + 1] * 2 + top[k + 2] + 2) >> 2; \
    /* last entry: top[size - 1] is weighted 3 in place of a sample past the edge */ \
    filt[size - 2] = (top[size - 2] + top[size - 1] * 3 + 2) >> 2; \
\
    for (row = 0; row < size; row++) { \
        pixel *line = dst + row*stride; \
        const int ncopy = size - 1 - row; \
\
        memcpy(line, filt + row, ncopy * sizeof(pixel)); \
        memset_bpc(line + ncopy, top[size - 1], row + 1); \
    } \
}
852 
856 
857 static void diag_downright_4x4_c(uint8_t *_dst, ptrdiff_t stride,
858  const uint8_t *_left, const uint8_t *_top)
859 {
860  pixel *dst = (pixel *) _dst;
861  const pixel *top = (const pixel *) _top;
862  const pixel *left = (const pixel *) _left;
863  int tl = top[-1], a0 = top[0], a1 = top[1], a2 = top[2], a3 = top[3],
864  l0 = left[3], l1 = left[2], l2 = left[1], l3 = left[0];
865 
866  stride /= sizeof(pixel);
867  DST(0,3) = (l1 + l2 * 2 + l3 + 2) >> 2;
868  DST(0,2) = DST(1,3) = (l0 + l1 * 2 + l2 + 2) >> 2;
869  DST(0,1) = DST(1,2) = DST(2,3) = (tl + l0 * 2 + l1 + 2) >> 2;
870  DST(0,0) = DST(1,1) = DST(2,2) = DST(3,3) = (l0 + tl * 2 + a0 + 2) >> 2;
871  DST(1,0) = DST(2,1) = DST(3,2) = (tl + a0 * 2 + a1 + 2) >> 2;
872  DST(2,0) = DST(3,1) = (a0 + a1 * 2 + a2 + 2) >> 2;
873  DST(3,0) = (a1 + a2 * 2 + a3 + 2) >> 2;
874 }
875 
/*
 * Generates diag_downright_<size>x<size>_c().
 *
 * A table of 2 * size - 1 rounded (1,2,1)-filtered values is built from
 * left[], top[-1] and top[]; row r of the output is the size-long window
 * of that table starting at entry size - 1 - r.
 */
#define def_diag_downright(size) \
static void diag_downright_##size##x##size##_c(uint8_t *_dst, ptrdiff_t stride, \
                                               const uint8_t *_left, const uint8_t *_top) \
{ \
    const pixel *left = (const pixel *) _left; \
    const pixel *top = (const pixel *) _top; \
    pixel *dst = (pixel *) _dst; \
    pixel diag[size + size - 1]; \
    int k, row; \
\
    stride /= sizeof(pixel); \
    for (k = 0; k < size - 2; k++) { \
        diag[k] = (left[k] + left[k + 1] * 2 + left[k + 2] + 2) >> 2; \
        diag[size + 1 + k] = (top[k] + top[k + 1] * 2 + top[k + 2] + 2) >> 2; \
    } \
    /* the three entries whose filter window includes top[-1] */ \
    diag[size - 2] = (left[size - 2] + left[size - 1] * 2 + top[-1] + 2) >> 2; \
    diag[size - 1] = (left[size - 1] + top[-1] * 2 + top[0] + 2) >> 2; \
    diag[size] = (top[-1] + top[0] * 2 + top[1] + 2) >> 2; \
\
    for (row = 0; row < size; row++) \
        memcpy(dst + row*stride, diag + size - 1 - row, size * sizeof(pixel)); \
}
898 
902 
903 static void vert_right_4x4_c(uint8_t *_dst, ptrdiff_t stride,
904  const uint8_t *_left, const uint8_t *_top)
905 {
906  pixel *dst = (pixel *) _dst;
907  const pixel *top = (const pixel *) _top;
908  const pixel *left = (const pixel *) _left;
909  int tl = top[-1], a0 = top[0], a1 = top[1], a2 = top[2], a3 = top[3],
910  l0 = left[3], l1 = left[2], l2 = left[1];
911 
912  stride /= sizeof(pixel);
913  DST(0,3) = (l0 + l1 * 2 + l2 + 2) >> 2;
914  DST(0,2) = (tl + l0 * 2 + l1 + 2) >> 2;
915  DST(0,0) = DST(1,2) = (tl + a0 + 1) >> 1;
916  DST(0,1) = DST(1,3) = (l0 + tl * 2 + a0 + 2) >> 2;
917  DST(1,0) = DST(2,2) = (a0 + a1 + 1) >> 1;
918  DST(1,1) = DST(2,3) = (tl + a0 * 2 + a1 + 2) >> 2;
919  DST(2,0) = DST(3,2) = (a1 + a2 + 1) >> 1;
920  DST(2,1) = DST(3,3) = (a0 + a1 * 2 + a2 + 2) >> 2;
921  DST(3,0) = (a2 + a3 + 1) >> 1;
922  DST(3,1) = (a1 + a2 * 2 + a3 + 2) >> 2;
923 }
924 
/*
 * Generates vert_right_<size>x<size>_c().
 *
 * Two tables of size + size/2 - 1 entries are built, one for even output
 * rows and one for odd output rows. Their first size/2 - 1 entries come
 * from the left edge, the rest from top[-1] and the top edge. Row pair p
 * copies the size-long window starting at entry size/2 - 1 - p.
 */
#define def_vert_right(size) \
static void vert_right_##size##x##size##_c(uint8_t *_dst, ptrdiff_t stride, \
                                           const uint8_t *_left, const uint8_t *_top) \
{ \
    const pixel *left = (const pixel *) _left; \
    const pixel *top = (const pixel *) _top; \
    pixel *dst = (pixel *) _dst; \
    pixel even[size + size/2 - 1], odd[size + size/2 - 1]; \
    const int half = size / 2; \
    int k, pair; \
\
    stride /= sizeof(pixel); \
    /* entries derived from the left edge only */ \
    for (k = 0; k < half - 2; k++) { \
        odd[k] = (left[k*2 + 3] + left[k*2 + 2] * 2 + left[k*2 + 1] + 2) >> 2; \
        even[k] = (left[k*2 + 4] + left[k*2 + 3] * 2 + left[k*2 + 2] + 2) >> 2; \
    } \
    odd[half - 2] = (left[size - 1] + left[size - 2] * 2 + left[size - 3] + 2) >> 2; \
    even[half - 2] = (top[-1] + left[size - 1] * 2 + left[size - 2] + 2) >> 2; \
\
    /* entries centred on top[-1] */ \
    even[half - 1] = (top[-1] + top[0] + 1) >> 1; \
    odd[half - 1] = (left[size - 1] + top[-1] * 2 + top[0] + 2) >> 2; \
    /* entries derived from the top edge */ \
    for (k = 0; k < size - 1; k++) { \
        even[half + k] = (top[k] + top[k + 1] + 1) >> 1; \
        odd[half + k] = (top[k - 1] + top[k] * 2 + top[k + 1] + 2) >> 2; \
    } \
\
    for (pair = 0; pair < half; pair++) { \
        memcpy(dst + pair*2 * stride, even + half - 1 - pair, size * sizeof(pixel)); \
        memcpy(dst + (pair*2 + 1) * stride, odd + half - 1 - pair, size * sizeof(pixel)); \
    } \
}
955 
957 def_vert_right(16)
958 def_vert_right(32)
959 
960 static void hor_down_4x4_c(uint8_t *_dst, ptrdiff_t stride,
961  const uint8_t *_left, const uint8_t *_top)
962 {
963  pixel *dst = (pixel *) _dst;
964  const pixel *top = (const pixel *) _top;
965  const pixel *left = (const pixel *) _left;
966  int l0 = left[3], l1 = left[2], l2 = left[1], l3 = left[0],
967  tl = top[-1], a0 = top[0], a1 = top[1], a2 = top[2];
968 
969  stride /= sizeof(pixel);
970  DST(2,0) = (tl + a0 * 2 + a1 + 2) >> 2;
971  DST(3,0) = (a0 + a1 * 2 + a2 + 2) >> 2;
972  DST(0,0) = DST(2,1) = (tl + l0 + 1) >> 1;
973  DST(1,0) = DST(3,1) = (a0 + tl * 2 + l0 + 2) >> 2;
974  DST(0,1) = DST(2,2) = (l0 + l1 + 1) >> 1;
975  DST(1,1) = DST(3,2) = (tl + l0 * 2 + l1 + 2) >> 2;
976  DST(0,2) = DST(2,3) = (l1 + l2 + 1) >> 1;
977  DST(1,2) = DST(3,3) = (l0 + l1 * 2 + l2 + 2) >> 2;
978  DST(0,3) = (l2 + l3 + 1) >> 1;
979  DST(1,3) = (l1 + l2 * 2 + l3 + 2) >> 2;
980 }
981 
/*
 * Generates hor_down_<size>x<size>_c().
 *
 * A table of 3 * size - 2 entries is built: the first 2 * size entries
 * interleave 2-tap averages and 3-tap filters along the left edge (ending
 * at top[-1]), the remaining entries are 3-tap filters along the top edge.
 * Row r copies the size-long window starting at entry 2 * size - 2 - 2 * r.
 */
#define def_hor_down(size) \
static void hor_down_##size##x##size##_c(uint8_t *_dst, ptrdiff_t stride, \
                                         const uint8_t *_left, const uint8_t *_top) \
{ \
    const pixel *left = (const pixel *) _left; \
    const pixel *top = (const pixel *) _top; \
    pixel *dst = (pixel *) _dst; \
    pixel tab[size * 3 - 2]; \
    int k, row; \
\
    stride /= sizeof(pixel); \
    for (k = 0; k < size - 2; k++) { \
        tab[k*2] = (left[k + 1] + left[k + 0] + 1) >> 1; \
        tab[k*2 + 1] = (left[k + 2] + left[k + 1] * 2 + left[k + 0] + 2) >> 2; \
        tab[size*2 + k] = (top[k - 1] + top[k] * 2 + top[k + 1] + 2) >> 2; \
    } \
    /* the four entries around the junction of left[size - 1] and top[-1] */ \
    tab[size*2 - 4] = (left[size - 1] + left[size - 2] + 1) >> 1; \
    tab[size*2 - 3] = (top[-1] + left[size - 1] * 2 + left[size - 2] + 2) >> 2; \
    tab[size*2 - 2] = (top[-1] + left[size - 1] + 1) >> 1; \
    tab[size*2 - 1] = (top[0] + top[-1] * 2 + left[size - 1] + 2) >> 2; \
\
    for (row = 0; row < size; row++) \
        memcpy(dst + row*stride, tab + size*2 - 2 - row*2, size * sizeof(pixel)); \
}
1006 
1008 def_hor_down(16)
1009 def_hor_down(32)
1010 
1011 static void vert_left_4x4_c(uint8_t *_dst, ptrdiff_t stride,
1012  const uint8_t *left, const uint8_t *_top)
1013 {
1014  pixel *dst = (pixel *) _dst;
1015  const pixel *top = (const pixel *) _top;
1016  int a0 = top[0], a1 = top[1], a2 = top[2], a3 = top[3],
1017  a4 = top[4], a5 = top[5], a6 = top[6];
1018 
1019  stride /= sizeof(pixel);
1020  DST(0,0) = (a0 + a1 + 1) >> 1;
1021  DST(0,1) = (a0 + a1 * 2 + a2 + 2) >> 2;
1022  DST(1,0) = DST(0,2) = (a1 + a2 + 1) >> 1;
1023  DST(1,1) = DST(0,3) = (a1 + a2 * 2 + a3 + 2) >> 2;
1024  DST(2,0) = DST(1,2) = (a2 + a3 + 1) >> 1;
1025  DST(2,1) = DST(1,3) = (a2 + a3 * 2 + a4 + 2) >> 2;
1026  DST(3,0) = DST(2,2) = (a3 + a4 + 1) >> 1;
1027  DST(3,1) = DST(2,3) = (a3 + a4 * 2 + a5 + 2) >> 2;
1028  DST(3,2) = (a4 + a5 + 1) >> 1;
1029  DST(3,3) = (a4 + a5 * 2 + a6 + 2) >> 2;
1030 }
1031 
1032 #define def_vert_left(size) \
1033 static void vert_left_##size##x##size##_c(uint8_t *_dst, ptrdiff_t stride, \
1034  const uint8_t *left, const uint8_t *_top) \
1035 { \
1036  pixel *dst = (pixel *) _dst; \
1037  const pixel *top = (const pixel *) _top; \
1038  int i, j; \
1039  pixel ve[size - 1], vo[size - 1]; \
1040 \
1041  stride /= sizeof(pixel); \
1042  for (i = 0; i < size - 2; i++) { \
1043  ve[i] = (top[i] + top[i + 1] + 1) >> 1; \
1044  vo[i] = (top[i] + top[i + 1] * 2 + top[i + 2] + 2) >> 2; \
1045  } \
1046  ve[size - 2] = (top[size - 2] + top[size - 1] + 1) >> 1; \
1047  vo[size - 2] = (top[size - 2] + top[size - 1] * 3 + 2) >> 2; \
1048 \
1049  for (j = 0; j < size / 2; j++) { \
1050  memcpy(dst + j*2 * stride, ve + j, (size - j - 1) * sizeof(pixel)); \
1051  memset_bpc(dst + j*2 * stride + size - j - 1, top[size - 1], j + 1); \
1052  memcpy(dst + (j*2 + 1) * stride, vo + j, (size - j - 1) * sizeof(pixel)); \
1053  memset_bpc(dst + (j*2 + 1) * stride + size - j - 1, top[size - 1], j + 1); \
1054  } \
1055 }
1056 
1058 def_vert_left(16)
1059 def_vert_left(32)
1060 
1061 static void hor_up_4x4_c(uint8_t *_dst, ptrdiff_t stride,
1062  const uint8_t *_left, const uint8_t *top)
1063 {
1064  pixel *dst = (pixel *) _dst;
1065  const pixel *left = (const pixel *) _left;
1066  int l0 = left[0], l1 = left[1], l2 = left[2], l3 = left[3];
1067 
1068  stride /= sizeof(pixel);
1069  DST(0,0) = (l0 + l1 + 1) >> 1;
1070  DST(1,0) = (l0 + l1 * 2 + l2 + 2) >> 2;
1071  DST(0,1) = DST(2,0) = (l1 + l2 + 1) >> 1;
1072  DST(1,1) = DST(3,0) = (l1 + l2 * 2 + l3 + 2) >> 2;
1073  DST(0,2) = DST(2,1) = (l2 + l3 + 1) >> 1;
1074  DST(1,2) = DST(3,1) = (l2 + l3 * 3 + 2) >> 2;
1075  DST(0,3) = DST(1,3) = DST(2,2) = DST(2,3) = DST(3,2) = DST(3,3) = l3;
1076 }
1077 
1078 #define def_hor_up(size) \
1079 static void hor_up_##size##x##size##_c(uint8_t *_dst, ptrdiff_t stride, \
1080  const uint8_t *_left, const uint8_t *top) \
1081 { \
1082  pixel *dst = (pixel *) _dst; \
1083  const pixel *left = (const pixel *) _left; \
1084  int i, j; \
1085  pixel v[size*2 - 2]; \
1086 \
1087  stride /= sizeof(pixel); \
1088  for (i = 0; i < size - 2; i++) { \
1089  v[i*2 ] = (left[i] + left[i + 1] + 1) >> 1; \
1090  v[i*2 + 1] = (left[i] + left[i + 1] * 2 + left[i + 2] + 2) >> 2; \
1091  } \
1092  v[size*2 - 4] = (left[size - 2] + left[size - 1] + 1) >> 1; \
1093  v[size*2 - 3] = (left[size - 2] + left[size - 1] * 3 + 2) >> 2; \
1094 \
1095  for (j = 0; j < size / 2; j++) \
1096  memcpy(dst + j*stride, v + j*2, size * sizeof(pixel)); \
1097  for (j = size / 2; j < size; j++) { \
1098  memcpy(dst + j*stride, v + j*2, (size*2 - 2 - j*2) * sizeof(pixel)); \
1099  memset_bpc(dst + j*stride + size*2 - 2 - j*2, left[size - 1], \
1100  2 + j*2 - size); \
1101  } \
1102 }
1103 
1105 def_hor_up(16)
1106 def_hor_up(32)
1107 
1108 #undef DST
1109 
1110 #endif /* BIT_DEPTH != 12 */
1111 
1112 #if BIT_DEPTH != 8
1113 void ff_vp9dsp_intrapred_init_10(VP9DSPContext *dsp);
1114 #endif
1115 #if BIT_DEPTH != 10
1116 static
1117 #endif
1118 av_cold void FUNC(ff_vp9dsp_intrapred_init)(VP9DSPContext *dsp)
1119 {
1120 #define init_intra_pred_bd_aware(tx, sz) \
1121  dsp->intra_pred[tx][TM_VP8_PRED] = tm_##sz##_c; \
1122  dsp->intra_pred[tx][DC_128_PRED] = dc_128_##sz##_c; \
1123  dsp->intra_pred[tx][DC_127_PRED] = dc_127_##sz##_c; \
1124  dsp->intra_pred[tx][DC_129_PRED] = dc_129_##sz##_c
1125 
1126 #if BIT_DEPTH == 12
1127  ff_vp9dsp_intrapred_init_10(dsp);
1128 #define init_intra_pred(tx, sz) \
1129  init_intra_pred_bd_aware(tx, sz)
1130 #else
1131  #define init_intra_pred(tx, sz) \
1132  dsp->intra_pred[tx][VERT_PRED] = vert_##sz##_c; \
1133  dsp->intra_pred[tx][HOR_PRED] = hor_##sz##_c; \
1134  dsp->intra_pred[tx][DC_PRED] = dc_##sz##_c; \
1135  dsp->intra_pred[tx][DIAG_DOWN_LEFT_PRED] = diag_downleft_##sz##_c; \
1136  dsp->intra_pred[tx][DIAG_DOWN_RIGHT_PRED] = diag_downright_##sz##_c; \
1137  dsp->intra_pred[tx][VERT_RIGHT_PRED] = vert_right_##sz##_c; \
1138  dsp->intra_pred[tx][HOR_DOWN_PRED] = hor_down_##sz##_c; \
1139  dsp->intra_pred[tx][VERT_LEFT_PRED] = vert_left_##sz##_c; \
1140  dsp->intra_pred[tx][HOR_UP_PRED] = hor_up_##sz##_c; \
1141  dsp->intra_pred[tx][LEFT_DC_PRED] = dc_left_##sz##_c; \
1142  dsp->intra_pred[tx][TOP_DC_PRED] = dc_top_##sz##_c; \
1143  init_intra_pred_bd_aware(tx, sz)
1144 #endif
1145 
1146  init_intra_pred(TX_4X4, 4x4);
1147  init_intra_pred(TX_8X8, 8x8);
1148  init_intra_pred(TX_16X16, 16x16);
1149  init_intra_pred(TX_32X32, 32x32);
1150 
1151 #undef init_intra_pred
1152 #undef init_intra_pred_bd_aware
1153 }
1154 
/*
 * Generate <type_a>_<type_b>_<sz>x<sz>_add_c(): run two 1-D transform
 * passes over an sz x sz coefficient block and add the result to the
 * pixels at _dst.
 *
 *   pass 0: type_a<sz>_1d() reads block + x with a stride of sz and writes
 *           sz results to tmp + x * sz;
 *   pass 1: type_b<sz>_1d() reads tmp + x with a stride of sz; its sz
 *           outputs are added, after rounding, to the samples
 *           dst[x + y * stride] for y = 0 .. sz-1, and the sum is clipped
 *           with av_clip_pixel().
 *
 * bits is the final rounding shift (0 means no shift).  When has_dconly is
 * set and eob == 1, only block[0] is used: it is scaled twice by
 * 11585 / 2^14 and the same value is added to every pixel.
 * The coefficients that were consumed are cleared (block[0] on the DC-only
 * path, the whole block otherwise).  stride is passed in bytes.
 */
#define itxfm_wrapper(type_a, type_b, sz, bits, has_dconly) \
static void type_a##_##type_b##_##sz##x##sz##_add_c(uint8_t *_dst, \
                                                    ptrdiff_t stride, \
                                                    int16_t *_block, int eob) \
{ \
    dctcoef *block = (dctcoef *) _block; \
    dctcoef tmp[sz * sz], out[sz]; \
    pixel *dst = (pixel *) _dst; \
    int x, y; \
\
    stride /= sizeof(pixel); \
    if (has_dconly && eob == 1) { \
        const int t  = ((((dctint) block[0] * 11585 + (1 << 13)) >> 14) \
                                            * 11585 + (1 << 13)) >> 14; \
        /* identical for every pixel, so round it once */ \
        const int dc = (bits ? \
                        (int)(t + (1U << (bits - 1))) >> bits : \
                        t); \
\
        block[0] = 0; \
        for (x = 0; x < sz; x++) { \
            pixel *col = dst + x; \
            for (y = 0; y < sz; y++) \
                col[y * stride] = av_clip_pixel(col[y * stride] + dc); \
        } \
        return; \
    } \
\
    for (x = 0; x < sz; x++) \
        type_a##sz##_1d(block + x, sz, tmp + x * sz, 0); \
    memset(block, 0, sz * sz * sizeof(*block)); \
    for (x = 0; x < sz; x++) { \
        pixel *col = dst + x; \
\
        type_b##sz##_1d(tmp + x, sz, out, 1); \
        for (y = 0; y < sz; y++) \
            col[y * stride] = av_clip_pixel(col[y * stride] + \
                                            (bits ? \
                                             (int)(out[y] + (1U << (bits - 1))) >> bits : \
                                             out[y])); \
    } \
}
1193 
/*
 * Generate all four DCT/ADST combinations for one transform size.
 * Only the idct/idct combination gets the DC-only shortcut
 * (has_dconly = 1).
 */
#define itxfm_wrap(sz, bits) \
itxfm_wrapper(idct,  idct,  sz, bits, 1) \
itxfm_wrapper(idct,  iadst, sz, bits, 0) \
itxfm_wrapper(iadst, idct,  sz, bits, 0) \
itxfm_wrapper(iadst, iadst, sz, bits, 0)
1199 
/* Fetch input coefficient number idx (strided access into in[]), widened to
 * dctint.  Relies on `in` and `stride` being in scope at the point of use. */
#define IN(idx) ((dctint) in[(idx) * stride])
1201 
1202 static av_always_inline void idct4_1d(const dctcoef *in, ptrdiff_t stride,
1203  dctcoef *out, int pass)
1204 {
1205  dctint t0, t1, t2, t3;
1206 
1207  t0 = ((IN(0) + IN(2)) * 11585 + (1 << 13)) >> 14;
1208  t1 = ((IN(0) - IN(2)) * 11585 + (1 << 13)) >> 14;
1209  t2 = (IN(1) * 6270 - IN(3) * 15137 + (1 << 13)) >> 14;
1210  t3 = (IN(1) * 15137 + IN(3) * 6270 + (1 << 13)) >> 14;
1211 
1212  out[0] = t0 + t3;
1213  out[1] = t1 + t2;
1214  out[2] = t1 - t2;
1215  out[3] = t0 - t3;
1216 }
1217 
1218 static av_always_inline void iadst4_1d(const dctcoef *in, ptrdiff_t stride,
1219  dctcoef *out, int pass)
1220 {
1221  dctint t0, t1, t2, t3;
1222 
1223  t0 = 5283 * IN(0) + 15212 * IN(2) + 9929 * IN(3);
1224  t1 = 9929 * IN(0) - 5283 * IN(2) - 15212 * IN(3);
1225  t2 = 13377 * (IN(0) - IN(2) + IN(3));
1226  t3 = 13377 * IN(1);
1227 
1228  out[0] = (t0 + t3 + (1 << 13)) >> 14;
1229  out[1] = (t1 + t3 + (1 << 13)) >> 14;
1230  out[2] = (t2 + (1 << 13)) >> 14;
1231  out[3] = (t0 + t1 - t3 + (1 << 13)) >> 14;
1232 }
1233 
1235 
1236 static av_always_inline void idct8_1d(const dctcoef *in, ptrdiff_t stride,
1237  dctcoef *out, int pass)
1238 {
1239  dctint t0, t0a, t1, t1a, t2, t2a, t3, t3a, t4, t4a, t5, t5a, t6, t6a, t7, t7a;
1240 
1241  t0a = ((IN(0) + IN(4)) * 11585 + (1 << 13)) >> 14;
1242  t1a = ((IN(0) - IN(4)) * 11585 + (1 << 13)) >> 14;
1243  t2a = (IN(2) * 6270 - IN(6) * 15137 + (1 << 13)) >> 14;
1244  t3a = (IN(2) * 15137 + IN(6) * 6270 + (1 << 13)) >> 14;
1245  t4a = (IN(1) * 3196 - IN(7) * 16069 + (1 << 13)) >> 14;
1246  t5a = (IN(5) * 13623 - IN(3) * 9102 + (1 << 13)) >> 14;
1247  t6a = (IN(5) * 9102 + IN(3) * 13623 + (1 << 13)) >> 14;
1248  t7a = (IN(1) * 16069 + IN(7) * 3196 + (1 << 13)) >> 14;
1249 
1250  t0 = t0a + t3a;
1251  t1 = t1a + t2a;
1252  t2 = t1a - t2a;
1253  t3 = t0a - t3a;
1254  t4 = t4a + t5a;
1255  t5a = t4a - t5a;
1256  t7 = t7a + t6a;
1257  t6a = t7a - t6a;
1258 
1259  t5 = ((t6a - t5a) * 11585 + (1 << 13)) >> 14;
1260  t6 = ((t6a + t5a) * 11585 + (1 << 13)) >> 14;
1261 
1262  out[0] = t0 + t7;
1263  out[1] = t1 + t6;
1264  out[2] = t2 + t5;
1265  out[3] = t3 + t4;
1266  out[4] = t3 - t4;
1267  out[5] = t2 - t5;
1268  out[6] = t1 - t6;
1269  out[7] = t0 - t7;
1270 }
1271 
/*
 * One 1-D pass of the 8-point transform named iadst8.
 *
 * Reads eight inputs through IN(0)..IN(7), i.e. in[k * stride] widened to
 * dctint, and writes out[0..7].  `pass` is not used here.
 * Fixed point: sums of products are rounded by adding 1 << 13 and shifted
 * right by 14.
 */
1272 static av_always_inline void iadst8_1d(const dctcoef *in, ptrdiff_t stride,
1273  dctcoef *out, int pass)
1274 {
1275  dctint t0, t0a, t1, t1a, t2, t2a, t3, t3a, t4, t4a, t5, t5a, t6, t6a, t7, t7a;
1276 
/* Stage 1: weighted sums of the input pairs (7,0), (5,2), (3,4), (1,6),
 * kept at full product precision. */
1277  t0a = 16305 * IN(7) + 1606 * IN(0);
1278  t1a = 1606 * IN(7) - 16305 * IN(0);
1279  t2a = 14449 * IN(5) + 7723 * IN(2);
1280  t3a = 7723 * IN(5) - 14449 * IN(2);
1281  t4a = 10394 * IN(3) + 12665 * IN(4);
1282  t5a = 12665 * IN(3) - 10394 * IN(4);
1283  t6a = 4756 * IN(1) + 15679 * IN(6);
1284  t7a = 15679 * IN(1) - 4756 * IN(6);
1285 
/* Stage 2: sums and differences of the stage-1 terms, rounded and scaled
 * back down by 14 bits. */
1286  t0 = (t0a + t4a + (1 << 13)) >> 14;
1287  t1 = (t1a + t5a + (1 << 13)) >> 14;
1288  t2 = (t2a + t6a + (1 << 13)) >> 14;
1289  t3 = (t3a + t7a + (1 << 13)) >> 14;
1290  t4 = (t0a - t4a + (1 << 13)) >> 14;
1291  t5 = (t1a - t5a + (1 << 13)) >> 14;
1292  t6 = (t2a - t6a + (1 << 13)) >> 14;
1293  t7 = (t3a - t7a + (1 << 13)) >> 14;
1294 
/* Stage 3: second set of weighted sums on t4..t7.
 * NOTE(review): the U suffix makes these products unsigned whenever dctint
 * is no wider than unsigned int - presumably so that overflow wraps instead
 * of being undefined; confirm against dctint's definition. */
1295  t4a = 15137U * t4 + 6270U * t5;
1296  t5a = 6270U * t4 - 15137U * t5;
1297  t6a = 15137U * t7 - 6270U * t6;
1298  t7a = 6270U * t7 + 15137U * t6;
1299 
/* out[0] / out[7] come straight from t0..t3; t2 and t3 are then reused to
 * hold the matching differences. */
1300  out[0] = t0 + t2;
1301  out[7] = -(t1 + t3);
1302  t2 = t0 - t2;
1303  t3 = t1 - t3;
1304 
/* The rounding term is added first as 1U << 13, and the sum is cast back to
 * dctint before the right shift.  t6 and t7 are reused for the differences. */
1305  out[1] = -((dctint)((1U << 13) + t4a + t6a) >> 14);
1306  out[6] = (dctint)((1U << 13) + t5a + t7a) >> 14;
1307  t6 = (dctint)((1U << 13) + t4a - t6a) >> 14;
1308  t7 = (dctint)((1U << 13) + t5a - t7a) >> 14;
1309 
/* Remaining four outputs: sum or difference of a pair scaled by
 * 11585 / 2^14; out[3] and out[5] are negated after the shift. */
1310  out[3] = -((dctint)((t2 + t3) * 11585U + (1 << 13)) >> 14);
1311  out[4] = (dctint)((t2 - t3) * 11585U + (1 << 13)) >> 14;
1312  out[2] = (dctint)((t6 + t7) * 11585U + (1 << 13)) >> 14;
1313  out[5] = -((dctint)((t6 - t7) * 11585U + (1 << 13)) >> 14);
1314 }
1315 
1317 
/*
 * One 1-D pass of the 16-point transform named idct16.
 *
 * Reads sixteen inputs through IN(0)..IN(15), i.e. in[k * stride] widened
 * to dctint, and writes out[0..15].  `pass` is not used here.
 * Fixed point: every weighted sum is rounded by adding 1 << 13 and shifted
 * right by 14.
 * NOTE(review): the U-suffixed constants make the products unsigned whenever
 * dctint is no wider than unsigned int - presumably so overflow wraps rather
 * than being undefined; the (dctint) cast restores signedness before the
 * shift.  Confirm against dctint's definition.
 */
1318 static av_always_inline void idct16_1d(const dctcoef *in, ptrdiff_t stride,
1319  dctcoef *out, int pass)
1320 {
1321  dctint t0, t1, t2, t3, t4, t5, t6, t7, t8, t9, t10, t11, t12, t13, t14, t15;
1322  dctint t0a, t1a, t2a, t3a, t4a, t5a, t6a, t7a;
1323  dctint t8a, t9a, t10a, t11a, t12a, t13a, t14a, t15a;
1324 
/* Stage 1: weighted sums/differences of input pairs. */
1325  t0a = (dctint)((IN(0) + IN(8)) * 11585U + (1 << 13)) >> 14;
1326  t1a = (dctint)((IN(0) - IN(8)) * 11585U + (1 << 13)) >> 14;
1327  t2a = (dctint)(IN(4) * 6270U - IN(12) * 15137U + (1 << 13)) >> 14;
1328  t3a = (dctint)(IN(4) * 15137U + IN(12) * 6270U + (1 << 13)) >> 14;
1329  t4a = (dctint)(IN(2) * 3196U - IN(14) * 16069U + (1 << 13)) >> 14;
1330  t7a = (dctint)(IN(2) * 16069U + IN(14) * 3196U + (1 << 13)) >> 14;
1331  t5a = (dctint)(IN(10) * 13623U - IN(6) * 9102U + (1 << 13)) >> 14;
1332  t6a = (dctint)(IN(10) * 9102U + IN(6) * 13623U + (1 << 13)) >> 14;
1333  t8a = (dctint)(IN(1) * 1606U - IN(15) * 16305U + (1 << 13)) >> 14;
1334  t15a = (dctint)(IN(1) * 16305U + IN(15) * 1606U + (1 << 13)) >> 14;
1335  t9a = (dctint)(IN(9) * 12665U - IN(7) * 10394U + (1 << 13)) >> 14;
1336  t14a = (dctint)(IN(9) * 10394U + IN(7) * 12665U + (1 << 13)) >> 14;
1337  t10a = (dctint)(IN(5) * 7723U - IN(11) * 14449U + (1 << 13)) >> 14;
1338  t13a = (dctint)(IN(5) * 14449U + IN(11) * 7723U + (1 << 13)) >> 14;
1339  t11a = (dctint)(IN(13) * 15679U - IN(3) * 4756U + (1 << 13)) >> 14;
1340  t12a = (dctint)(IN(13) * 4756U + IN(3) * 15679U + (1 << 13)) >> 14;
1341 
/* Stage 2: plain sums and differences of the stage-1 results. */
1342  t0 = t0a + t3a;
1343  t1 = t1a + t2a;
1344  t2 = t1a - t2a;
1345  t3 = t0a - t3a;
1346  t4 = t4a + t5a;
1347  t5 = t4a - t5a;
1348  t6 = t7a - t6a;
1349  t7 = t7a + t6a;
1350  t8 = t8a + t9a;
1351  t9 = t8a - t9a;
1352  t10 = t11a - t10a;
1353  t11 = t11a + t10a;
1354  t12 = t12a + t13a;
1355  t13 = t12a - t13a;
1356  t14 = t15a - t14a;
1357  t15 = t15a + t14a;
1358 
/* Stage 3: weighted sums on (t5, t6), (t9, t14) and (t10, t13); the other
 * terms pass through unchanged. */
1359  t5a = (dctint)((t6 - t5) * 11585U + (1 << 13)) >> 14;
1360  t6a = (dctint)((t6 + t5) * 11585U + (1 << 13)) >> 14;
1361  t9a = (dctint)( t14 * 6270U - t9 * 15137U + (1 << 13)) >> 14;
1362  t14a = (dctint)( t14 * 15137U + t9 * 6270U + (1 << 13)) >> 14;
1363  t10a = (dctint)(-(t13 * 15137U + t10 * 6270U) + (1 << 13)) >> 14;
1364  t13a = (dctint)( t13 * 6270U - t10 * 15137U + (1 << 13)) >> 14;
1365 
/* Stage 4: sums and differences; note that t4..t7 and t9, t10, t13, t14
 * are overwritten in place here. */
1366  t0a = t0 + t7;
1367  t1a = t1 + t6a;
1368  t2a = t2 + t5a;
1369  t3a = t3 + t4;
1370  t4 = t3 - t4;
1371  t5 = t2 - t5a;
1372  t6 = t1 - t6a;
1373  t7 = t0 - t7;
1374  t8a = t8 + t11;
1375  t9 = t9a + t10a;
1376  t10 = t9a - t10a;
1377  t11a = t8 - t11;
1378  t12a = t15 - t12;
1379  t13 = t14a - t13a;
1380  t14 = t14a + t13a;
1381  t15a = t15 + t12;
1382 
/* Stage 5: 11585 / 2^14 scaling of the (t10, t13) and (t11a, t12a) pairs. */
1383  t10a = (dctint)((t13 - t10) * 11585U + (1 << 13)) >> 14;
1384  t13a = (dctint)((t13 + t10) * 11585U + (1 << 13)) >> 14;
1385  t11 = (dctint)((t12a - t11a) * 11585U + (1 << 13)) >> 14;
1386  t12 = (dctint)((t12a + t11a) * 11585U + (1 << 13)) >> 14;
1387 
/* Output: out[k] and out[15 - k] are the sum and the difference of the
 * same pair of terms. */
1388  out[ 0] = t0a + t15a;
1389  out[ 1] = t1a + t14;
1390  out[ 2] = t2a + t13a;
1391  out[ 3] = t3a + t12;
1392  out[ 4] = t4 + t11;
1393  out[ 5] = t5 + t10a;
1394  out[ 6] = t6 + t9;
1395  out[ 7] = t7 + t8a;
1396  out[ 8] = t7 - t8a;
1397  out[ 9] = t6 - t9;
1398  out[10] = t5 - t10a;
1399  out[11] = t4 - t11;
1400  out[12] = t3a - t12;
1401  out[13] = t2a - t13a;
1402  out[14] = t1a - t14;
1403  out[15] = t0a - t15a;
1404 }
1405 
/*
 * One 1-D pass of the 16-point transform named iadst16.
 *
 * Reads sixteen inputs through IN(0)..IN(15), i.e. in[k * stride] widened
 * to dctint, and writes out[0..15].  `pass` is not used here.
 * Fixed point: sums of products are rounded by adding 1 << 13 (written as
 * 1U << 13 and placed first in most sums) and shifted right by 14.
 * NOTE(review): the U-suffixed constants make the arithmetic unsigned
 * whenever dctint is no wider than unsigned int - presumably so overflow
 * wraps rather than being undefined; the (dctint) cast restores signedness
 * before each shift.  Confirm against dctint's definition.
 */
1406 static av_always_inline void iadst16_1d(const dctcoef *in, ptrdiff_t stride,
1407  dctcoef *out, int pass)
1408 {
1409  dctint t0, t1, t2, t3, t4, t5, t6, t7, t8, t9, t10, t11, t12, t13, t14, t15;
1410  dctint t0a, t1a, t2a, t3a, t4a, t5a, t6a, t7a;
1411  dctint t8a, t9a, t10a, t11a, t12a, t13a, t14a, t15a;
1412 
/* Stage 1: weighted sums of the input pairs (15,0), (13,2), (11,4), (9,6),
 * (7,8), (5,10), (3,12), (1,14), kept at full product precision. */
1413  t0 = IN(15) * 16364U + IN(0) * 804U;
1414  t1 = IN(15) * 804U - IN(0) * 16364U;
1415  t2 = IN(13) * 15893U + IN(2) * 3981U;
1416  t3 = IN(13) * 3981U - IN(2) * 15893U;
1417  t4 = IN(11) * 14811U + IN(4) * 7005U;
1418  t5 = IN(11) * 7005U - IN(4) * 14811U;
1419  t6 = IN(9) * 13160U + IN(6) * 9760U;
1420  t7 = IN(9) * 9760U - IN(6) * 13160U;
1421  t8 = IN(7) * 11003U + IN(8) * 12140U;
1422  t9 = IN(7) * 12140U - IN(8) * 11003U;
1423  t10 = IN(5) * 8423U + IN(10) * 14053U;
1424  t11 = IN(5) * 14053U - IN(10) * 8423U;
1425  t12 = IN(3) * 5520U + IN(12) * 15426U;
1426  t13 = IN(3) * 15426U - IN(12) * 5520U;
1427  t14 = IN(1) * 2404U + IN(14) * 16207U;
1428  t15 = IN(1) * 16207U - IN(14) * 2404U;
1429 
/* Stage 2: sums (t0a..t7a) and differences (t8a..t15a) of term k and
 * term k + 8, rounded and scaled down by 14 bits. */
1430  t0a = (dctint)((1U << 13) + t0 + t8 ) >> 14;
1431  t1a = (dctint)((1U << 13) + t1 + t9 ) >> 14;
1432  t2a = (dctint)((1U << 13) + t2 + t10) >> 14;
1433  t3a = (dctint)((1U << 13) + t3 + t11) >> 14;
1434  t4a = (dctint)((1U << 13) + t4 + t12) >> 14;
1435  t5a = (dctint)((1U << 13) + t5 + t13) >> 14;
1436  t6a = (dctint)((1U << 13) + t6 + t14) >> 14;
1437  t7a = (dctint)((1U << 13) + t7 + t15) >> 14;
1438  t8a = (dctint)((1U << 13) + t0 - t8 ) >> 14;
1439  t9a = (dctint)((1U << 13) + t1 - t9 ) >> 14;
1440  t10a = (dctint)((1U << 13) + t2 - t10) >> 14;
1441  t11a = (dctint)((1U << 13) + t3 - t11) >> 14;
1442  t12a = (dctint)((1U << 13) + t4 - t12) >> 14;
1443  t13a = (dctint)((1U << 13) + t5 - t13) >> 14;
1444  t14a = (dctint)((1U << 13) + t6 - t14) >> 14;
1445  t15a = (dctint)((1U << 13) + t7 - t15) >> 14;
1446 
/* Stage 3: weighted sums on the difference terms t8a..t15a. */
1447  t8 = t8a * 16069U + t9a * 3196U;
1448  t9 = t8a * 3196U - t9a * 16069U;
1449  t10 = t10a * 9102U + t11a * 13623U;
1450  t11 = t10a * 13623U - t11a * 9102U;
1451  t12 = t13a * 16069U - t12a * 3196U;
1452  t13 = t13a * 3196U + t12a * 16069U;
1453  t14 = t15a * 9102U - t14a * 13623U;
1454  t15 = t15a * 13623U + t14a * 9102U;
1455 
/* Stage 4: sums and differences at a distance of four terms; the
 * t8..t15 group is rounded and scaled down again. */
1456  t0 = t0a + t4a;
1457  t1 = t1a + t5a;
1458  t2 = t2a + t6a;
1459  t3 = t3a + t7a;
1460  t4 = t0a - t4a;
1461  t5 = t1a - t5a;
1462  t6 = t2a - t6a;
1463  t7 = t3a - t7a;
1464  t8a = (dctint)((1U << 13) + t8 + t12) >> 14;
1465  t9a = (dctint)((1U << 13) + t9 + t13) >> 14;
1466  t10a = (dctint)((1U << 13) + t10 + t14) >> 14;
1467  t11a = (dctint)((1U << 13) + t11 + t15) >> 14;
1468  t12a = (dctint)((1U << 13) + t8 - t12) >> 14;
1469  t13a = (dctint)((1U << 13) + t9 - t13) >> 14;
1470  t14a = (dctint)((1U << 13) + t10 - t14) >> 14;
1471  t15a = (dctint)((1U << 13) + t11 - t15) >> 14;
1472 
/* Stage 5: weighted sums with the 15137 / 6270 pair on t4..t7 and on
 * t12a..t15a. */
1473  t4a = t4 * 15137U + t5 * 6270U;
1474  t5a = t4 * 6270U - t5 * 15137U;
1475  t6a = t7 * 15137U - t6 * 6270U;
1476  t7a = t7 * 6270U + t6 * 15137U;
1477  t12 = t12a * 15137U + t13a * 6270U;
1478  t13 = t12a * 6270U - t13a * 15137U;
1479  t14 = t15a * 15137U - t14a * 6270U;
1480  t15 = t15a * 6270U + t14a * 15137U;
1481 
/* Stage 6: eight outputs are final here (some negated); the matching
 * differences are kept in t2a, t3a, t6, t7, t10, t11, t14a, t15a. */
1482  out[ 0] = t0 + t2;
1483  out[15] = -(t1 + t3);
1484  t2a = t0 - t2;
1485  t3a = t1 - t3;
1486  out[ 3] = -((dctint)((1U << 13) + t4a + t6a) >> 14);
1487  out[12] = (dctint)((1U << 13) + t5a + t7a) >> 14;
1488  t6 = (dctint)((1U << 13) + t4a - t6a) >> 14;
1489  t7 = (dctint)((1U << 13) + t5a - t7a) >> 14;
1490  out[ 1] = -(t8a + t10a);
1491  out[14] = t9a + t11a;
1492  t10 = t8a - t10a;
1493  t11 = t9a - t11a;
1494  out[ 2] = (dctint)((1U << 13) + t12 + t14) >> 14;
1495  out[13] = -((dctint)((1U << 13) + t13 + t15) >> 14);
1496  t14a = (dctint)((1U << 13) + t12 - t14) >> 14;
1497  t15a = (dctint)((1U << 13) + t13 - t15) >> 14;
1498 
/* Stage 7: remaining eight outputs, each a sum or difference of one kept
 * pair scaled by 11585 / 2^14.  For out[7] and out[5] the negation is
 * applied to the pair sum before the multiplication. */
1499  out[ 7] = (dctint)(-(t2a + t3a) * 11585U + (1 << 13)) >> 14;
1500  out[ 8] = (dctint)( (t2a - t3a) * 11585U + (1 << 13)) >> 14;
1501  out[ 4] = (dctint)( (t7 + t6) * 11585U + (1 << 13)) >> 14;
1502  out[11] = (dctint)( (t7 - t6) * 11585U + (1 << 13)) >> 14;
1503  out[ 6] = (dctint)( (t11 + t10) * 11585U + (1 << 13)) >> 14;
1504  out[ 9] = (dctint)( (t11 - t10) * 11585U + (1 << 13)) >> 14;
1505  out[ 5] = (dctint)(-(t14a + t15a) * 11585U + (1 << 13)) >> 14;
1506  out[10] = (dctint)( (t14a - t15a) * 11585U + (1 << 13)) >> 14;
1507 }
1508 
1510 
/*
 * One 1-D pass of the 32-point transform named idct32.
 *
 * Reads thirty-two inputs through IN(0)..IN(31), i.e. in[k * stride]
 * widened to dctint, and writes out[0..31].  `pass` is not used here.
 * Fixed point: every weighted sum is rounded by adding 1 << 13 and shifted
 * right by 14.
 * NOTE(review): the U-suffixed constants make the products unsigned whenever
 * dctint is no wider than unsigned int - presumably so overflow wraps rather
 * than being undefined; the (dctint) cast restores signedness before the
 * shift.  Confirm against dctint's definition.
 * The tN / tNa names are reused from stage to stage, so statement order
 * within each group matters.
 */
1511 static av_always_inline void idct32_1d(const dctcoef *in, ptrdiff_t stride,
1512  dctcoef *out, int pass)
1513 {
/* Stage 1: weighted sums/differences of input pairs.  t0a..t15a use the
 * even-numbered inputs, t16a..t31a the odd-numbered ones. */
1514  dctint t0a = (dctint)((IN(0) + IN(16)) * 11585U + (1 << 13)) >> 14;
1515  dctint t1a = (dctint)((IN(0) - IN(16)) * 11585U + (1 << 13)) >> 14;
1516  dctint t2a = (dctint)(IN( 8) * 6270U - IN(24) * 15137U + (1 << 13)) >> 14;
1517  dctint t3a = (dctint)(IN( 8) * 15137U + IN(24) * 6270U + (1 << 13)) >> 14;
1518  dctint t4a = (dctint)(IN( 4) * 3196U - IN(28) * 16069U + (1 << 13)) >> 14;
1519  dctint t7a = (dctint)(IN( 4) * 16069U + IN(28) * 3196U + (1 << 13)) >> 14;
1520  dctint t5a = (dctint)(IN(20) * 13623U - IN(12) * 9102U + (1 << 13)) >> 14;
1521  dctint t6a = (dctint)(IN(20) * 9102U + IN(12) * 13623U + (1 << 13)) >> 14;
1522  dctint t8a = (dctint)(IN( 2) * 1606U - IN(30) * 16305U + (1 << 13)) >> 14;
1523  dctint t15a = (dctint)(IN( 2) * 16305U + IN(30) * 1606U + (1 << 13)) >> 14;
1524  dctint t9a = (dctint)(IN(18) * 12665U - IN(14) * 10394U + (1 << 13)) >> 14;
1525  dctint t14a = (dctint)(IN(18) * 10394U + IN(14) * 12665U + (1 << 13)) >> 14;
1526  dctint t10a = (dctint)(IN(10) * 7723U - IN(22) * 14449U + (1 << 13)) >> 14;
1527  dctint t13a = (dctint)(IN(10) * 14449U + IN(22) * 7723U + (1 << 13)) >> 14;
1528  dctint t11a = (dctint)(IN(26) * 15679U - IN( 6) * 4756U + (1 << 13)) >> 14;
1529  dctint t12a = (dctint)(IN(26) * 4756U + IN( 6) * 15679U + (1 << 13)) >> 14;
1530  dctint t16a = (dctint)(IN( 1) * 804U - IN(31) * 16364U + (1 << 13)) >> 14;
1531  dctint t31a = (dctint)(IN( 1) * 16364U + IN(31) * 804U + (1 << 13)) >> 14;
1532  dctint t17a = (dctint)(IN(17) * 12140U - IN(15) * 11003U + (1 << 13)) >> 14;
1533  dctint t30a = (dctint)(IN(17) * 11003U + IN(15) * 12140U + (1 << 13)) >> 14;
1534  dctint t18a = (dctint)(IN( 9) * 7005U - IN(23) * 14811U + (1 << 13)) >> 14;
1535  dctint t29a = (dctint)(IN( 9) * 14811U + IN(23) * 7005U + (1 << 13)) >> 14;
1536  dctint t19a = (dctint)(IN(25) * 15426U - IN( 7) * 5520U + (1 << 13)) >> 14;
1537  dctint t28a = (dctint)(IN(25) * 5520U + IN( 7) * 15426U + (1 << 13)) >> 14;
1538  dctint t20a = (dctint)(IN( 5) * 3981U - IN(27) * 15893U + (1 << 13)) >> 14;
1539  dctint t27a = (dctint)(IN( 5) * 15893U + IN(27) * 3981U + (1 << 13)) >> 14;
1540  dctint t21a = (dctint)(IN(21) * 14053U - IN(11) * 8423U + (1 << 13)) >> 14;
1541  dctint t26a = (dctint)(IN(21) * 8423U + IN(11) * 14053U + (1 << 13)) >> 14;
1542  dctint t22a = (dctint)(IN(13) * 9760U - IN(19) * 13160U + (1 << 13)) >> 14;
1543  dctint t25a = (dctint)(IN(13) * 13160U + IN(19) * 9760U + (1 << 13)) >> 14;
1544  dctint t23a = (dctint)(IN(29) * 16207U - IN( 3) * 2404U + (1 << 13)) >> 14;
1545  dctint t24a = (dctint)(IN(29) * 2404U + IN( 3) * 16207U + (1 << 13)) >> 14;
1546 
/* Stage 2: plain sums and differences of neighbouring stage-1 results. */
1547  dctint t0 = t0a + t3a;
1548  dctint t1 = t1a + t2a;
1549  dctint t2 = t1a - t2a;
1550  dctint t3 = t0a - t3a;
1551  dctint t4 = t4a + t5a;
1552  dctint t5 = t4a - t5a;
1553  dctint t6 = t7a - t6a;
1554  dctint t7 = t7a + t6a;
1555  dctint t8 = t8a + t9a;
1556  dctint t9 = t8a - t9a;
1557  dctint t10 = t11a - t10a;
1558  dctint t11 = t11a + t10a;
1559  dctint t12 = t12a + t13a;
1560  dctint t13 = t12a - t13a;
1561  dctint t14 = t15a - t14a;
1562  dctint t15 = t15a + t14a;
1563  dctint t16 = t16a + t17a;
1564  dctint t17 = t16a - t17a;
1565  dctint t18 = t19a - t18a;
1566  dctint t19 = t19a + t18a;
1567  dctint t20 = t20a + t21a;
1568  dctint t21 = t20a - t21a;
1569  dctint t22 = t23a - t22a;
1570  dctint t23 = t23a + t22a;
1571  dctint t24 = t24a + t25a;
1572  dctint t25 = t24a - t25a;
1573  dctint t26 = t27a - t26a;
1574  dctint t27 = t27a + t26a;
1575  dctint t28 = t28a + t29a;
1576  dctint t29 = t28a - t29a;
1577  dctint t30 = t31a - t30a;
1578  dctint t31 = t31a + t30a;
1579 
/* Stage 3: weighted sums on selected pairs; terms not listed here pass
 * through unchanged. */
1580  t5a = (dctint)((t6 - t5) * 11585U + (1 << 13)) >> 14;
1581  t6a = (dctint)((t6 + t5) * 11585U + (1 << 13)) >> 14;
1582  t9a = (dctint)( t14 * 6270U - t9 * 15137U + (1 << 13)) >> 14;
1583  t14a = (dctint)( t14 * 15137U + t9 * 6270U + (1 << 13)) >> 14;
1584  t10a = (dctint)(-(t13 * 15137U + t10 * 6270U) + (1 << 13)) >> 14;
1585  t13a = (dctint)( t13 * 6270U - t10 * 15137U + (1 << 13)) >> 14;
1586  t17a = (dctint)( t30 * 3196U - t17 * 16069U + (1 << 13)) >> 14;
1587  t30a = (dctint)( t30 * 16069U + t17 * 3196U + (1 << 13)) >> 14;
1588  t18a = (dctint)(-(t29 * 16069U + t18 * 3196U) + (1 << 13)) >> 14;
1589  t29a = (dctint)( t29 * 3196U - t18 * 16069U + (1 << 13)) >> 14;
1590  t21a = (dctint)( t26 * 13623U - t21 * 9102U + (1 << 13)) >> 14;
1591  t26a = (dctint)( t26 * 9102U + t21 * 13623U + (1 << 13)) >> 14;
1592  t22a = (dctint)(-(t25 * 9102U + t22 * 13623U) + (1 << 13)) >> 14;
1593  t25a = (dctint)( t25 * 13623U - t22 * 9102U + (1 << 13)) >> 14;
1594 
/* Stage 4: sums and differences; several tN are overwritten in place. */
1595  t0a = t0 + t7;
1596  t1a = t1 + t6a;
1597  t2a = t2 + t5a;
1598  t3a = t3 + t4;
1599  t4a = t3 - t4;
1600  t5 = t2 - t5a;
1601  t6 = t1 - t6a;
1602  t7a = t0 - t7;
1603  t8a = t8 + t11;
1604  t9 = t9a + t10a;
1605  t10 = t9a - t10a;
1606  t11a = t8 - t11;
1607  t12a = t15 - t12;
1608  t13 = t14a - t13a;
1609  t14 = t14a + t13a;
1610  t15a = t15 + t12;
1611  t16a = t16 + t19;
1612  t17 = t17a + t18a;
1613  t18 = t17a - t18a;
1614  t19a = t16 - t19;
1615  t20a = t23 - t20;
1616  t21 = t22a - t21a;
1617  t22 = t22a + t21a;
1618  t23a = t23 + t20;
1619  t24a = t24 + t27;
1620  t25 = t25a + t26a;
1621  t26 = t25a - t26a;
1622  t27a = t24 - t27;
1623  t28a = t31 - t28;
1624  t29 = t30a - t29a;
1625  t30 = t30a + t29a;
1626  t31a = t31 + t28;
1627 
/* Stage 5: weighted sums on the inner pairs of the 8..15 and 16..31
 * groups. */
1628  t10a = (dctint)((t13 - t10) * 11585U + (1 << 13)) >> 14;
1629  t13a = (dctint)((t13 + t10) * 11585U + (1 << 13)) >> 14;
1630  t11 = (dctint)((t12a - t11a) * 11585U + (1 << 13)) >> 14;
1631  t12 = (dctint)((t12a + t11a) * 11585U + (1 << 13)) >> 14;
1632  t18a = (dctint)( t29 * 6270U - t18 * 15137U + (1 << 13)) >> 14;
1633  t29a = (dctint)( t29 * 15137U + t18 * 6270U + (1 << 13)) >> 14;
1634  t19 = (dctint)( t28a * 6270U - t19a * 15137U + (1 << 13)) >> 14;
1635  t28 = (dctint)( t28a * 15137U + t19a * 6270U + (1 << 13)) >> 14;
1636  t20 = (dctint)(-(t27a * 15137U + t20a * 6270U) + (1 << 13)) >> 14;
1637  t27 = (dctint)( t27a * 6270U - t20a * 15137U + (1 << 13)) >> 14;
1638  t21a = (dctint)(-(t26 * 15137U + t21 * 6270U) + (1 << 13)) >> 14;
1639  t26a = (dctint)( t26 * 6270U - t21 * 15137U + (1 << 13)) >> 14;
1640 
/* Stage 6: sums and differences; after this t0..t15 (with or without the
 * 'a' suffix) hold the terms that are combined with the 16..31 group at
 * the output. */
1641  t0 = t0a + t15a;
1642  t1 = t1a + t14;
1643  t2 = t2a + t13a;
1644  t3 = t3a + t12;
1645  t4 = t4a + t11;
1646  t5a = t5 + t10a;
1647  t6a = t6 + t9;
1648  t7 = t7a + t8a;
1649  t8 = t7a - t8a;
1650  t9a = t6 - t9;
1651  t10 = t5 - t10a;
1652  t11a = t4a - t11;
1653  t12a = t3a - t12;
1654  t13 = t2a - t13a;
1655  t14a = t1a - t14;
1656  t15 = t0a - t15a;
1657  t16 = t16a + t23a;
1658  t17a = t17 + t22;
1659  t18 = t18a + t21a;
1660  t19a = t19 + t20;
1661  t20a = t19 - t20;
1662  t21 = t18a - t21a;
1663  t22a = t17 - t22;
1664  t23 = t16a - t23a;
1665  t24 = t31a - t24a;
1666  t25a = t30 - t25;
1667  t26 = t29a - t26a;
1668  t27a = t28 - t27;
1669  t28a = t28 + t27;
1670  t29 = t29a + t26a;
1671  t30a = t30 + t25;
1672  t31 = t31a + t24a;
1673 
/* Stage 7: 11585 / 2^14 scaling of the (20,27), (21,26), (22,25) and
 * (23,24) pairs. */
1674  t20 = (dctint)((t27a - t20a) * 11585U + (1 << 13)) >> 14;
1675  t27 = (dctint)((t27a + t20a) * 11585U + (1 << 13)) >> 14;
1676  t21a = (dctint)((t26 - t21 ) * 11585U + (1 << 13)) >> 14;
1677  t26a = (dctint)((t26 + t21 ) * 11585U + (1 << 13)) >> 14;
1678  t22 = (dctint)((t25a - t22a) * 11585U + (1 << 13)) >> 14;
1679  t25 = (dctint)((t25a + t22a) * 11585U + (1 << 13)) >> 14;
1680  t23a = (dctint)((t24 - t23 ) * 11585U + (1 << 13)) >> 14;
1681  t24a = (dctint)((t24 + t23 ) * 11585U + (1 << 13)) >> 14;
1682 
/* Output: out[k] and out[31 - k] are the sum and the difference of the
 * same pair of terms. */
1683  out[ 0] = t0 + t31;
1684  out[ 1] = t1 + t30a;
1685  out[ 2] = t2 + t29;
1686  out[ 3] = t3 + t28a;
1687  out[ 4] = t4 + t27;
1688  out[ 5] = t5a + t26a;
1689  out[ 6] = t6a + t25;
1690  out[ 7] = t7 + t24a;
1691  out[ 8] = t8 + t23a;
1692  out[ 9] = t9a + t22;
1693  out[10] = t10 + t21a;
1694  out[11] = t11a + t20;
1695  out[12] = t12a + t19a;
1696  out[13] = t13 + t18;
1697  out[14] = t14a + t17a;
1698  out[15] = t15 + t16;
1699  out[16] = t15 - t16;
1700  out[17] = t14a - t17a;
1701  out[18] = t13 - t18;
1702  out[19] = t12a - t19a;
1703  out[20] = t11a - t20;
1704  out[21] = t10 - t21a;
1705  out[22] = t9a - t22;
1706  out[23] = t8 - t23a;
1707  out[24] = t7 - t24a;
1708  out[25] = t6a - t25;
1709  out[26] = t5a - t26a;
1710  out[27] = t4 - t27;
1711  out[28] = t3 - t28a;
1712  out[29] = t2 - t29;
1713  out[30] = t1 - t30a;
1714  out[31] = t0 - t31;
1715 }
1716 
1718 
1719 static av_always_inline void iwht4_1d(const dctcoef *in, ptrdiff_t stride,
1720  dctcoef *out, int pass)
1721 {
1722  int t0, t1, t2, t3, t4;
1723 
1724  if (pass == 0) {
1725  t0 = IN(0) >> 2;
1726  t1 = IN(3) >> 2;
1727  t2 = IN(1) >> 2;
1728  t3 = IN(2) >> 2;
1729  } else {
1730  t0 = IN(0);
1731  t1 = IN(3);
1732  t2 = IN(1);
1733  t3 = IN(2);
1734  }
1735 
1736  t0 += t2;
1737  t3 -= t1;
1738  t4 = (t0 - t3) >> 1;
1739  t1 = t4 - t1;
1740  t2 = t4 - t2;
1741  t0 -= t1;
1742  t3 += t2;
1743 
1744  out[0] = t0;
1745  out[1] = t1;
1746  out[2] = t2;
1747  out[3] = t3;
1748 }
1749 
1750 itxfm_wrapper(iwht, iwht, 4, 0, 0)
1751 
1752 #undef IN
1753 #undef itxfm_wrapper
1754 #undef itxfm_wrap
1755 
1756 static av_cold void vp9dsp_itxfm_init(VP9DSPContext *dsp)
1757 {
1758 #define init_itxfm(tx, sz) \
1759  dsp->itxfm_add[tx][DCT_DCT] = idct_idct_##sz##_add_c; \
1760  dsp->itxfm_add[tx][DCT_ADST] = iadst_idct_##sz##_add_c; \
1761  dsp->itxfm_add[tx][ADST_DCT] = idct_iadst_##sz##_add_c; \
1762  dsp->itxfm_add[tx][ADST_ADST] = iadst_iadst_##sz##_add_c
1763 
1764 #define init_idct(tx, nm) \
1765  dsp->itxfm_add[tx][DCT_DCT] = \
1766  dsp->itxfm_add[tx][ADST_DCT] = \
1767  dsp->itxfm_add[tx][DCT_ADST] = \
1768  dsp->itxfm_add[tx][ADST_ADST] = nm##_add_c
1769 
1770  init_itxfm(TX_4X4, 4x4);
1771  init_itxfm(TX_8X8, 8x8);
1772  init_itxfm(TX_16X16, 16x16);
1773  init_idct(TX_32X32, idct_idct_32x32);
1774  init_idct(4 /* lossless */, iwht_iwht_4x4);
1775 
1776 #undef init_itxfm
1777 #undef init_idct
1778 }
1779 
/*
 * Filter eight positions along one block edge, in place.
 *
 * dst points at the first sample on the "q" side of the edge.  For each of
 * the 8 iterations the samples across the edge are addressed as
 * dst[strideb * k] (k < 0: p side, k >= 0: q side) and dst then advances by
 * stridea, so stridea steps along the edge and strideb steps across it.
 * Both strides are in units of pixel.
 *
 * E, I and H are thresholds given at 8-bit scale; they are shifted left by
 * BIT_DEPTH - 8 before use:
 *   I bounds the difference between neighbouring samples on each side,
 *   E bounds |p0 - q0| * 2 + |p1 - q1| / 2,
 *   H is the limit above which |p1 - p0| or |q1 - q0| sets hev.
 * wd selects how wide a filter may be applied: the 7-tap-per-side path
 * needs wd >= 16, the 3-tap-per-side path needs wd >= 8.
 */
1780 static av_always_inline void loop_filter(pixel *dst, int E, int I, int H,
1781  ptrdiff_t stridea, ptrdiff_t strideb,
1782  int wd)
1783 {
/* F: flatness threshold, 1 at 8 bits and scaled with the bit depth */
1784  int i, F = 1 << (BIT_DEPTH - 8);
1785 
1786  E <<= (BIT_DEPTH - 8);
1787  I <<= (BIT_DEPTH - 8);
1788  H <<= (BIT_DEPTH - 8);
1789  for (i = 0; i < 8; i++, dst += stridea) {
1790  int p7, p6, p5, p4;
1791  int p3 = dst[strideb * -4], p2 = dst[strideb * -3];
1792  int p1 = dst[strideb * -2], p0 = dst[strideb * -1];
1793  int q0 = dst[strideb * +0], q1 = dst[strideb * +1];
1794  int q2 = dst[strideb * +2], q3 = dst[strideb * +3];
1795  int q4, q5, q6, q7;
/* fm: filter mask - all neighbour steps within I and the edge step
 * within E */
1796  int fm = FFABS(p3 - p2) <= I && FFABS(p2 - p1) <= I &&
1797  FFABS(p1 - p0) <= I && FFABS(q1 - q0) <= I &&
1798  FFABS(q2 - q1) <= I && FFABS(q3 - q2) <= I &&
1799  FFABS(p0 - q0) * 2 + (FFABS(p1 - q1) >> 1) <= E;
1800  int flat8out, flat8in;
1801 
/* nothing is written for this position when the mask fails */
1802  if (!fm)
1803  continue;
1804 
/* The outer taps p7..p4 / q4..q7 are only loaded, and flat8out only
 * computed, for wd >= 16; every later use is guarded by the same test. */
1805  if (wd >= 16) {
1806  p7 = dst[strideb * -8];
1807  p6 = dst[strideb * -7];
1808  p5 = dst[strideb * -6];
1809  p4 = dst[strideb * -5];
1810  q4 = dst[strideb * +4];
1811  q5 = dst[strideb * +5];
1812  q6 = dst[strideb * +6];
1813  q7 = dst[strideb * +7];
1814 
1815  flat8out = FFABS(p7 - p0) <= F && FFABS(p6 - p0) <= F &&
1816  FFABS(p5 - p0) <= F && FFABS(p4 - p0) <= F &&
1817  FFABS(q4 - q0) <= F && FFABS(q5 - q0) <= F &&
1818  FFABS(q6 - q0) <= F && FFABS(q7 - q0) <= F;
1819  }
1820 
/* flat8in: the inner three samples on each side are within F of p0 / q0;
 * only computed (and only read) when wd >= 8 */
1821  if (wd >= 8)
1822  flat8in = FFABS(p3 - p0) <= F && FFABS(p2 - p0) <= F &&
1823  FFABS(p1 - p0) <= F && FFABS(q1 - q0) <= F &&
1824  FFABS(q2 - q0) <= F && FFABS(q3 - q0) <= F;
1825 
/* Widest path: rewrites p6..q6.  Each output is a 16-weight sum (target
 * sample counted twice, p7 / q7 repeated where the window would run past
 * them) rounded with + 8 and shifted right by 4.  All taps were read into
 * locals above, so the in-place stores do not feed back. */
1826  if (wd >= 16 && flat8out && flat8in) {
1827  dst[strideb * -7] = (p7 + p7 + p7 + p7 + p7 + p7 + p7 + p6 * 2 +
1828  p5 + p4 + p3 + p2 + p1 + p0 + q0 + 8) >> 4;
1829  dst[strideb * -6] = (p7 + p7 + p7 + p7 + p7 + p7 + p6 + p5 * 2 +
1830  p4 + p3 + p2 + p1 + p0 + q0 + q1 + 8) >> 4;
1831  dst[strideb * -5] = (p7 + p7 + p7 + p7 + p7 + p6 + p5 + p4 * 2 +
1832  p3 + p2 + p1 + p0 + q0 + q1 + q2 + 8) >> 4;
1833  dst[strideb * -4] = (p7 + p7 + p7 + p7 + p6 + p5 + p4 + p3 * 2 +
1834  p2 + p1 + p0 + q0 + q1 + q2 + q3 + 8) >> 4;
1835  dst[strideb * -3] = (p7 + p7 + p7 + p6 + p5 + p4 + p3 + p2 * 2 +
1836  p1 + p0 + q0 + q1 + q2 + q3 + q4 + 8) >> 4;
1837  dst[strideb * -2] = (p7 + p7 + p6 + p5 + p4 + p3 + p2 + p1 * 2 +
1838  p0 + q0 + q1 + q2 + q3 + q4 + q5 + 8) >> 4;
1839  dst[strideb * -1] = (p7 + p6 + p5 + p4 + p3 + p2 + p1 + p0 * 2 +
1840  q0 + q1 + q2 + q3 + q4 + q5 + q6 + 8) >> 4;
1841  dst[strideb * +0] = (p6 + p5 + p4 + p3 + p2 + p1 + p0 + q0 * 2 +
1842  q1 + q2 + q3 + q4 + q5 + q6 + q7 + 8) >> 4;
1843  dst[strideb * +1] = (p5 + p4 + p3 + p2 + p1 + p0 + q0 + q1 * 2 +
1844  q2 + q3 + q4 + q5 + q6 + q7 + q7 + 8) >> 4;
1845  dst[strideb * +2] = (p4 + p3 + p2 + p1 + p0 + q0 + q1 + q2 * 2 +
1846  q3 + q4 + q5 + q6 + q7 + q7 + q7 + 8) >> 4;
1847  dst[strideb * +3] = (p3 + p2 + p1 + p0 + q0 + q1 + q2 + q3 * 2 +
1848  q4 + q5 + q6 + q7 + q7 + q7 + q7 + 8) >> 4;
1849  dst[strideb * +4] = (p2 + p1 + p0 + q0 + q1 + q2 + q3 + q4 * 2 +
1850  q5 + q6 + q7 + q7 + q7 + q7 + q7 + 8) >> 4;
1851  dst[strideb * +5] = (p1 + p0 + q0 + q1 + q2 + q3 + q4 + q5 * 2 +
1852  q6 + q7 + q7 + q7 + q7 + q7 + q7 + 8) >> 4;
1853  dst[strideb * +6] = (p0 + q0 + q1 + q2 + q3 + q4 + q5 + q6 * 2 +
1854  q7 + q7 + q7 + q7 + q7 + q7 + q7 + 8) >> 4;
/* Medium path: rewrites p2..q2 with 8-weight sums, rounded with + 4 and
 * shifted right by 3. */
1855  } else if (wd >= 8 && flat8in) {
1856  dst[strideb * -3] = (p3 + p3 + p3 + 2 * p2 + p1 + p0 + q0 + 4) >> 3;
1857  dst[strideb * -2] = (p3 + p3 + p2 + 2 * p1 + p0 + q0 + q1 + 4) >> 3;
1858  dst[strideb * -1] = (p3 + p2 + p1 + 2 * p0 + q0 + q1 + q2 + 4) >> 3;
1859  dst[strideb * +0] = (p2 + p1 + p0 + 2 * q0 + q1 + q2 + q3 + 4) >> 3;
1860  dst[strideb * +1] = (p1 + p0 + q0 + 2 * q1 + q2 + q3 + q3 + 4) >> 3;
1861  dst[strideb * +2] = (p0 + q0 + q1 + 2 * q2 + q3 + q3 + q3 + 4) >> 3;
/* Narrow path: adjust p0 / q0, and also p1 / q1 when hev is not set. */
1862  } else {
1863  int hev = FFABS(p1 - p0) > H || FFABS(q1 - q0) > H;
1864 
1865  if (hev) {
/* hev set: the correction includes the clipped p1 - q1 term and only
 * p0 and q0 are modified */
1866  int f = av_clip_intp2(p1 - q1, BIT_DEPTH - 1), f1, f2;
1867  f = av_clip_intp2(3 * (q0 - p0) + f, BIT_DEPTH - 1);
1868 
1869  f1 = FFMIN(f + 4, (1 << (BIT_DEPTH - 1)) - 1) >> 3;
1870  f2 = FFMIN(f + 3, (1 << (BIT_DEPTH - 1)) - 1) >> 3;
1871 
1872  dst[strideb * -1] = av_clip_pixel(p0 + f2);
1873  dst[strideb * +0] = av_clip_pixel(q0 - f1);
1874  } else {
/* hev clear: correction from q0 - p0 only; half of f1 (rounded) is
 * additionally applied to p1 and q1 */
1875  int f = av_clip_intp2(3 * (q0 - p0), BIT_DEPTH - 1), f1, f2;
1876 
1877  f1 = FFMIN(f + 4, (1 << (BIT_DEPTH - 1)) - 1) >> 3;
1878  f2 = FFMIN(f + 3, (1 << (BIT_DEPTH - 1)) - 1) >> 3;
1879 
1880  dst[strideb * -1] = av_clip_pixel(p0 + f2);
1881  dst[strideb * +0] = av_clip_pixel(q0 - f1);
1882 
1883  f = (f1 + 1) >> 1;
1884  dst[strideb * -2] = av_clip_pixel(p1 + f);
1885  dst[strideb * +1] = av_clip_pixel(q1 - f);
1886  }
1887  }
1888  }
1889 }
1890 
/*
 * Generate loop_filter_<dir>_<wd>_8_c(): reinterpret the byte pointer as
 * pixel *, convert the byte stride into a pixel stride and forward to
 * loop_filter() with filter width wd.
 * stridea/strideb are pasted textually into the loop_filter() call, so the
 * token "stride" there names the already-converted local stride.
 */
1891 #define lf_8_fn(dir, wd, stridea, strideb) \
1892 static void loop_filter_##dir##_##wd##_8_c(uint8_t *_dst, \
1893  ptrdiff_t stride, \
1894  int E, int I, int H) \
1895 { \
1896  pixel *dst = (pixel *) _dst; \
1897  stride /= sizeof(pixel); \
1898  loop_filter(dst, E, I, H, stridea, strideb, wd); \
1899 }
1900 
/* Emit the h and v variants; they differ only by swapping the two strides. */
1901 #define lf_8_fns(wd) \
1902 lf_8_fn(h, wd, stride, 1) \
1903 lf_8_fn(v, wd, 1, stride)
1904 
/*
 * The 4-wide instantiation was absent from this listing (gap at 1905), but
 * loop_filter_{h,v}_4_8_c are required by lf_mix_fns(4, x) / lf_mix_fns(x, 4)
 * and by vp9dsp_loopfilter_init().
 */
1905 lf_8_fns(4)
1906 lf_8_fns(8)
1907 lf_8_fns(16)
1908 
1909 #undef lf_8_fn
1910 #undef lf_8_fns
1911 
/*
 * Generate loop_filter_<dir>_16_16_c() by running the 16-wide "_8" filter
 * twice: once at dst and once at dst + 8 * stridea.
 * dst is a uint8_t * here, so stridea is a byte step: the byte stride for
 * the h variant and sizeof(pixel) for the v variant.
 */
1912 #define lf_16_fn(dir, stridea) \
1913 static void loop_filter_##dir##_16_16_c(uint8_t *dst, \
1914  ptrdiff_t stride, \
1915  int E, int I, int H) \
1916 { \
1917  loop_filter_##dir##_16_8_c(dst, stride, E, I, H); \
1918  loop_filter_##dir##_16_8_c(dst + 8 * stridea, stride, E, I, H); \
1919 }
1920 
1921 lf_16_fn(h, stride)
1922 lf_16_fn(v, sizeof(pixel))
1923 
1924 #undef lf_16_fn
1925 
/*
 * Generate loop_filter_<dir>_<wd1><wd2>_16_c(): two adjacent "_8" filters
 * with possibly different widths.
 * E, I and H each carry two packed values: the low 8 bits go to the first
 * call (width wd1, at dst), the bits above them go to the second call
 * (width wd2, at dst + 8 * stridea, stridea being a byte step).
 */
1926 #define lf_mix_fn(dir, wd1, wd2, stridea) \
1927 static void loop_filter_##dir##_##wd1##wd2##_16_c(uint8_t *dst, \
1928  ptrdiff_t stride, \
1929  int E, int I, int H) \
1930 { \
1931  loop_filter_##dir##_##wd1##_8_c(dst, stride, E & 0xff, I & 0xff, H & 0xff); \
1932  loop_filter_##dir##_##wd2##_8_c(dst + 8 * stridea, stride, E >> 8, I >> 8, H >> 8); \
1933 }
1934 
/* Emit the h and v variants for one (wd1, wd2) pair. */
1935 #define lf_mix_fns(wd1, wd2) \
1936 lf_mix_fn(h, wd1, wd2, stride) \
1937 lf_mix_fn(v, wd1, wd2, sizeof(pixel))
1938 
1939 lf_mix_fns(4, 4)
1940 lf_mix_fns(4, 8)
1941 lf_mix_fns(8, 4)
1942 lf_mix_fns(8, 8)
1943 
1944 #undef lf_mix_fn
1945 #undef lf_mix_fns
1946 
/*
 * Fill the loop-filter function pointers of dsp with the C implementations
 * generated above.
 *
 * In every table the last index selects the direction: 0 = h, 1 = v.
 * loop_filter_8[i]       : i = 0/1/2 selects filter width 4/8/16.
 * loop_filter_16         : the 16-wide filter applied to two 8-pixel halves.
 * loop_filter_mix2[a][b] : a and b select the width (0 = 4, 1 = 8) of the
 *                          first and second half respectively.
 */
1947 static av_cold void vp9dsp_loopfilter_init(VP9DSPContext *dsp)
1948 {
1949  dsp->loop_filter_8[0][0] = loop_filter_h_4_8_c;
1950  dsp->loop_filter_8[0][1] = loop_filter_v_4_8_c;
1951  dsp->loop_filter_8[1][0] = loop_filter_h_8_8_c;
1952  dsp->loop_filter_8[1][1] = loop_filter_v_8_8_c;
1953  dsp->loop_filter_8[2][0] = loop_filter_h_16_8_c;
1954  dsp->loop_filter_8[2][1] = loop_filter_v_16_8_c;
1955 
1956  dsp->loop_filter_16[0] = loop_filter_h_16_16_c;
1957  dsp->loop_filter_16[1] = loop_filter_v_16_16_c;
1958 
1959  dsp->loop_filter_mix2[0][0][0] = loop_filter_h_44_16_c;
1960  dsp->loop_filter_mix2[0][0][1] = loop_filter_v_44_16_c;
1961  dsp->loop_filter_mix2[0][1][0] = loop_filter_h_48_16_c;
1962  dsp->loop_filter_mix2[0][1][1] = loop_filter_v_48_16_c;
1963  dsp->loop_filter_mix2[1][0][0] = loop_filter_h_84_16_c;
1964  dsp->loop_filter_mix2[1][0][1] = loop_filter_v_84_16_c;
1965  dsp->loop_filter_mix2[1][1][0] = loop_filter_h_88_16_c;
1966  dsp->loop_filter_mix2[1][1][1] = loop_filter_v_88_16_c;
1967 }
1968 
1969 #if BIT_DEPTH != 12
1970 
/*
 * Copy a block of w pixels by h rows from src to dst.
 *
 * dst_stride and src_stride are byte strides (they are added to uint8_t
 * pointers); w is in pixels and is scaled by sizeof(pixel) for memcpy().
 * The do/while form runs the body before testing, so h must be >= 1.
 */
1971 static av_always_inline void copy_c(uint8_t *restrict dst, ptrdiff_t dst_stride,
1972  const uint8_t *restrict src,
1973  ptrdiff_t src_stride, int w, int h)
1974 {
1975  do {
1976  memcpy(dst, src, w * sizeof(pixel));
1977 
1978  dst += dst_stride;
1979  src += src_stride;
1980  } while (--h);
1981 }
1982 
/*
 * Combine a w x h block of src into dst, four pixels at a time, using
 * rnd_avg_pixel4() (defined elsewhere; presumably a per-pixel rounded
 * average of its two arguments).
 *
 * Strides arrive in bytes and are converted to pixel units up front.
 * dst is read and written through the aligned accessors (AV_RN4PA /
 * AV_WN4PA) while src is read through AV_RN4P.
 * The inner loop steps x by 4, so w is expected to be a multiple of 4;
 * the do/while loop requires h >= 1.
 */
1983 static av_always_inline void avg_c(uint8_t *restrict _dst, ptrdiff_t dst_stride,
1984  const uint8_t *restrict _src,
1985  ptrdiff_t src_stride, int w, int h)
1986 {
1987  pixel *dst = (pixel *) _dst;
1988  const pixel *src = (const pixel *) _src;
1989 
1990  dst_stride /= sizeof(pixel);
1991  src_stride /= sizeof(pixel);
1992  do {
1993  int x;
1994 
1995  for (x = 0; x < w; x += 4)
1996  AV_WN4PA(&dst[x], rnd_avg_pixel4(AV_RN4PA(&dst[x]), AV_RN4P(&src[x])));
1997 
1998  dst += dst_stride;
1999  src += src_stride;
2000  } while (--h);
2001 }
2002 
/*
 * Generate <type><sz>_c(): a full-pel motion-compensation entry point with a
 * fixed block width sz that forwards to copy_c() or avg_c().
 * mx and my are part of the common signature but are not used here.
 */
2003 #define fpel_fn(type, sz) \
2004 static void type##sz##_c(uint8_t *dst, ptrdiff_t dst_stride, \
2005  const uint8_t *src, ptrdiff_t src_stride, \
2006  int h, int mx, int my) \
2007 { \
2008  type##_c(dst, dst_stride, src, src_stride, sz, h); \
2009 }
2010 
/* Emit both the copy and the avg variant for one block width. */
2011 #define copy_avg_fn(sz) \
2012 fpel_fn(copy, sz) \
2013 fpel_fn(avg, sz)
2014 
/*
 * The 64-wide instantiation was absent from this listing (gap at 2015), but
 * copy64_c / avg64_c are required by init_copy_avg(0, 64) in
 * ff_vp9dsp_mc_init().
 */
2015 copy_avg_fn(64)
2016 copy_avg_fn(32)
2017 copy_avg_fn(16)
2018 copy_avg_fn(8)
2019 copy_avg_fn(4)
2020 
2021 #undef fpel_fn
2022 #undef copy_avg_fn
2023 
2024 #endif /* BIT_DEPTH != 12 */
2025 
/*
 * Apply the 8-tap filter F at position x of src along a line with element
 * step `stride`: taps cover x - 3 * stride .. x + 4 * stride.
 * The weighted sum is rounded (+64) and scaled down by 7 bits, then clamped
 * with av_clip_pixel().
 * The arguments are pasted unparenthesized, so callers pass plain
 * identifiers or constants.
 */
2026 #define FILTER_8TAP(src, x, F, stride) \
2027  av_clip_pixel((F[0] * src[x + -3 * stride] + \
2028  F[1] * src[x + -2 * stride] + \
2029  F[2] * src[x + -1 * stride] + \
2030  F[3] * src[x + +0 * stride] + \
2031  F[4] * src[x + +1 * stride] + \
2032  F[5] * src[x + +2 * stride] + \
2033  F[6] * src[x + +3 * stride] + \
2034  F[7] * src[x + +4 * stride] + 64) >> 7)
2035 
/*
 * One-dimensional 8-tap filtering of a w x h block.
 *
 * dst_stride / src_stride are byte strides, converted to pixel units here.
 * ds is the step, in pixels, between consecutive filter taps (1 for
 * horizontal filtering, the source pixel stride for vertical filtering).
 * filter points at the 8 coefficients. When avg is non-zero the filtered
 * value is averaged with the existing dst pixel, rounding up ((a + b + 1)
 * >> 1); otherwise it is stored directly.
 * The taps reach 3 steps before and 4 steps after each output position, so
 * src must be readable over that margin. Requires h >= 1.
 */
2036 static av_always_inline void do_8tap_1d_c(uint8_t *_dst, ptrdiff_t dst_stride,
2037  const uint8_t *_src, ptrdiff_t src_stride,
2038  int w, int h, ptrdiff_t ds,
2039  const int16_t *filter, int avg)
2040 {
2041  pixel *dst = (pixel *) _dst;
2042  const pixel *src = (const pixel *) _src;
2043 
2044  dst_stride /= sizeof(pixel);
2045  src_stride /= sizeof(pixel);
2046  do {
2047  int x;
2048 
2049  for (x = 0; x < w; x++)
2050  if (avg) {
2051  dst[x] = (dst[x] + FILTER_8TAP(src, x, filter, ds) + 1) >> 1;
2052  } else {
2053  dst[x] = FILTER_8TAP(src, x, filter, ds);
2054  }
2055 
2056  dst += dst_stride;
2057  src += src_stride;
2058  } while (--h);
2059 }
2060 
/*
 * Generate <opn>_8tap_1d_<dir>_c(): a non-inlined specialization of
 * do_8tap_1d_c() with a fixed averaging mode (opa) and tap step (ds).
 * ds is evaluated inside the generated function, where src_stride is still
 * the byte stride, hence the division by sizeof(pixel) for the v variants.
 */
2061 #define filter_8tap_1d_fn(opn, opa, dir, ds) \
2062 static av_noinline void opn##_8tap_1d_##dir##_c(uint8_t *dst, ptrdiff_t dst_stride, \
2063  const uint8_t *src, ptrdiff_t src_stride, \
2064  int w, int h, const int16_t *filter) \
2065 { \
2066  do_8tap_1d_c(dst, dst_stride, src, src_stride, w, h, ds, filter, opa); \
2067 }
2068 
2069 filter_8tap_1d_fn(put, 0, v, src_stride / sizeof(pixel))
/*
 * The put/h instantiation was absent from this listing (gap at 2070), but
 * put_8tap_1d_h_c is required by filter_fn_1d(sz, h, ..., put).
 */
2070 filter_8tap_1d_fn(put, 0, h, 1)
2071 filter_8tap_1d_fn(avg, 1, v, src_stride / sizeof(pixel))
2072 filter_8tap_1d_fn(avg, 1, h, 1)
2073 
2074 #undef filter_8tap_1d_fn
2075 
/*
 * Separable two-dimensional 8-tap filtering of a w x h block.
 *
 * Pass 1 filters h + 7 source rows horizontally with filterx, starting 3
 * rows above src, into tmp (row pitch 64 pixels). Pass 2 filters tmp
 * vertically with filtery, starting at tmp row 3 so that the 8 vertical taps
 * (-3 .. +4 rows) stay inside the rows produced by pass 1.
 * tmp holds 64 x 71 pixels, so w must be <= 64 and h <= 64.
 * When avg is non-zero the result is averaged with the existing dst pixel
 * ((a + b + 1) >> 1); otherwise it is stored directly.
 */
2076 static av_always_inline void do_8tap_2d_c(uint8_t *_dst, ptrdiff_t dst_stride,
2077  const uint8_t *_src, ptrdiff_t src_stride,
2078  int w, int h, const int16_t *filterx,
2079  const int16_t *filtery, int avg)
2080 {
2081  int tmp_h = h + 7;
2082  pixel tmp[64 * 71], *tmp_ptr = tmp;
2083  pixel *dst = (pixel *) _dst;
2084  const pixel *src = (const pixel *) _src;
2085 
2086  dst_stride /= sizeof(pixel);
2087  src_stride /= sizeof(pixel);
 /* Back up 3 rows: the vertical pass needs 3 rows above the block. */
2088  src -= src_stride * 3;
2089  do {
2090  int x;
2091 
2092  for (x = 0; x < w; x++)
2093  tmp_ptr[x] = FILTER_8TAP(src, x, filterx, 1);
2094 
2095  tmp_ptr += 64;
2096  src += src_stride;
2097  } while (--tmp_h);
2098 
 /* Row 3 of tmp corresponds to the first output row. */
2099  tmp_ptr = tmp + 64 * 3;
2100  do {
2101  int x;
2102 
2103  for (x = 0; x < w; x++)
2104  if (avg) {
2105  dst[x] = (dst[x] + FILTER_8TAP(tmp_ptr, x, filtery, 64) + 1) >> 1;
2106  } else {
2107  dst[x] = FILTER_8TAP(tmp_ptr, x, filtery, 64);
2108  }
2109 
2110  tmp_ptr += 64;
2111  dst += dst_stride;
2112  } while (--h);
2113 }
2114 
/*
 * Generate <opn>_8tap_2d_hv_c(): a non-inlined specialization of
 * do_8tap_2d_c() with a fixed averaging mode (opa).
 */
2115 #define filter_8tap_2d_fn(opn, opa) \
2116 static av_noinline void opn##_8tap_2d_hv_c(uint8_t *dst, ptrdiff_t dst_stride, \
2117  const uint8_t *src, ptrdiff_t src_stride, \
2118  int w, int h, const int16_t *filterx, \
2119  const int16_t *filtery) \
2120 { \
2121  do_8tap_2d_c(dst, dst_stride, src, src_stride, w, h, filterx, filtery, opa); \
2122 }
2123 
/*
 * Both instantiations were absent from this listing (gap at 2124-2125), but
 * put_8tap_2d_hv_c / avg_8tap_2d_hv_c are required by filter_fn_2d().
 */
2124 filter_8tap_2d_fn(put, 0)
2125 filter_8tap_2d_fn(avg, 1)
2126 
2127 #undef filter_8tap_2d_fn
2128 
/*
 * Generate <avg>_8tap_<type>_<sz><dir>_c(): a fixed-width (sz) 1-D subpel
 * entry point. The macro parameter named `avg` is the operation prefix
 * (put or avg), not a flag. dir_m is pasted as the identifier (mx or my)
 * that indexes the coefficient row in ff_vp9_subpel_filters[type_idx].
 */
2129 #define filter_fn_1d(sz, dir, dir_m, type, type_idx, avg) \
2130 static void avg##_8tap_##type##_##sz##dir##_c(uint8_t *dst, ptrdiff_t dst_stride, \
2131  const uint8_t *src, ptrdiff_t src_stride, \
2132  int h, int mx, int my) \
2133 { \
2134  avg##_8tap_1d_##dir##_c(dst, dst_stride, src, src_stride, sz, h, \
2135  ff_vp9_subpel_filters[type_idx][dir_m]); \
2136 }
2137 
/*
 * Generate <avg>_8tap_<type>_<sz>hv_c(): the 2-D counterpart, selecting the
 * horizontal coefficients with mx and the vertical ones with my.
 */
2138 #define filter_fn_2d(sz, type, type_idx, avg) \
2139 static void avg##_8tap_##type##_##sz##hv_c(uint8_t *dst, ptrdiff_t dst_stride, \
2140  const uint8_t *src, ptrdiff_t src_stride, \
2141  int h, int mx, int my) \
2142 { \
2143  avg##_8tap_2d_hv_c(dst, dst_stride, src, src_stride, sz, h, \
2144  ff_vp9_subpel_filters[type_idx][mx], \
2145  ff_vp9_subpel_filters[type_idx][my]); \
2146 }
2147 
2148 #if BIT_DEPTH != 12
2149 
/*
 * Bilinear interpolation between src[x] and src[x + stride] with weight
 * mxy in 1/16 units: src[x] + round(mxy * delta / 16), the +8 providing the
 * rounding before the 4-bit shift.
 */
2150 #define FILTER_BILIN(src, x, mxy, stride) \
2151  (src[x] + ((mxy * (src[x + stride] - src[x]) + 8) >> 4))
2152 
/*
 * One-dimensional bilinear filtering of a w x h block.
 *
 * Strides arrive in bytes and are converted to pixel units. ds is the step,
 * in pixels, to the second sample of each interpolation (1 for horizontal,
 * the source pixel stride for vertical); mxy is the 1/16-unit weight.
 * When avg is non-zero the result is averaged with the existing dst pixel
 * ((a + b + 1) >> 1); otherwise it is stored directly. Requires h >= 1.
 */
2153 static av_always_inline void do_bilin_1d_c(uint8_t *_dst, ptrdiff_t dst_stride,
2154  const uint8_t *_src, ptrdiff_t src_stride,
2155  int w, int h, ptrdiff_t ds, int mxy, int avg)
2156 {
2157  pixel *dst = (pixel *) _dst;
2158  const pixel *src = (const pixel *) _src;
2159 
2160  dst_stride /= sizeof(pixel);
2161  src_stride /= sizeof(pixel);
2162  do {
2163  int x;
2164 
2165  for (x = 0; x < w; x++)
2166  if (avg) {
2167  dst[x] = (dst[x] + FILTER_BILIN(src, x, mxy, ds) + 1) >> 1;
2168  } else {
2169  dst[x] = FILTER_BILIN(src, x, mxy, ds);
2170  }
2171 
2172  dst += dst_stride;
2173  src += src_stride;
2174  } while (--h);
2175 }
2176 
/*
 * Generate <opn>_bilin_1d_<dir>_c(): a non-inlined specialization of
 * do_bilin_1d_c() with a fixed averaging mode (opa) and sample step (ds).
 * ds is evaluated inside the generated function, where src_stride is still
 * the byte stride, hence the division by sizeof(pixel) for the v variants.
 */
2177 #define bilin_1d_fn(opn, opa, dir, ds) \
2178 static av_noinline void opn##_bilin_1d_##dir##_c(uint8_t *dst, ptrdiff_t dst_stride, \
2179  const uint8_t *src, ptrdiff_t src_stride, \
2180  int w, int h, int mxy) \
2181 { \
2182  do_bilin_1d_c(dst, dst_stride, src, src_stride, w, h, ds, mxy, opa); \
2183 }
2184 
2185 bilin_1d_fn(put, 0, v, src_stride / sizeof(pixel))
2186 bilin_1d_fn(put, 0, h, 1)
2187 bilin_1d_fn(avg, 1, v, src_stride / sizeof(pixel))
2188 bilin_1d_fn(avg, 1, h, 1)
2189 
2190 #undef bilin_1d_fn
2191 
/*
 * Separable two-dimensional bilinear filtering of a w x h block.
 *
 * Pass 1 interpolates h + 1 source rows horizontally with weight mx into
 * tmp (row pitch 64 pixels); pass 2 interpolates vertically between
 * consecutive tmp rows with weight my. The extra row is the "next row"
 * needed by the last output row.
 * tmp holds 64 x 65 pixels, so w must be <= 64 and h <= 64.
 * When avg is non-zero the result is averaged with the existing dst pixel
 * ((a + b + 1) >> 1); otherwise it is stored directly.
 */
2192 static av_always_inline void do_bilin_2d_c(uint8_t *_dst, ptrdiff_t dst_stride,
2193  const uint8_t *_src, ptrdiff_t src_stride,
2194  int w, int h, int mx, int my, int avg)
2195 {
2196  pixel tmp[64 * 65], *tmp_ptr = tmp;
2197  int tmp_h = h + 1;
2198  pixel *dst = (pixel *) _dst;
2199  const pixel *src = (const pixel *) _src;
2200 
2201  dst_stride /= sizeof(pixel);
2202  src_stride /= sizeof(pixel);
2203  do {
2204  int x;
2205 
2206  for (x = 0; x < w; x++)
2207  tmp_ptr[x] = FILTER_BILIN(src, x, mx, 1);
2208 
2209  tmp_ptr += 64;
2210  src += src_stride;
2211  } while (--tmp_h);
2212 
2213  tmp_ptr = tmp;
2214  do {
2215  int x;
2216 
2217  for (x = 0; x < w; x++)
2218  if (avg) {
2219  dst[x] = (dst[x] + FILTER_BILIN(tmp_ptr, x, my, 64) + 1) >> 1;
2220  } else {
2221  dst[x] = FILTER_BILIN(tmp_ptr, x, my, 64);
2222  }
2223 
2224  tmp_ptr += 64;
2225  dst += dst_stride;
2226  } while (--h);
2227 }
2228 
/*
 * Generate <opn>_bilin_2d_hv_c(): a non-inlined specialization of
 * do_bilin_2d_c() with a fixed averaging mode (opa).
 */
2229 #define bilin_2d_fn(opn, opa) \
2230 static av_noinline void opn##_bilin_2d_hv_c(uint8_t *dst, ptrdiff_t dst_stride, \
2231  const uint8_t *src, ptrdiff_t src_stride, \
2232  int w, int h, int mx, int my) \
2233 { \
2234  do_bilin_2d_c(dst, dst_stride, src, src_stride, w, h, mx, my, opa); \
2235 }
2236 
/*
 * The put instantiation was absent from this listing (gap at 2237), but
 * put_bilin_2d_hv_c is required by bilinf_fn_2d(sz, put).
 */
2237 bilin_2d_fn(put, 0)
2238 bilin_2d_fn(avg, 1)
2239 
2240 #undef bilin_2d_fn
2241 
/*
 * Generate <avg>_bilin_<sz><dir>_c(): a fixed-width (sz) 1-D bilinear entry
 * point. The macro parameter named `avg` is the operation prefix (put or
 * avg); dir_m is pasted as the identifier (mx or my) passed as the weight.
 */
2242 #define bilinf_fn_1d(sz, dir, dir_m, avg) \
2243 static void avg##_bilin_##sz##dir##_c(uint8_t *dst, ptrdiff_t dst_stride, \
2244  const uint8_t *src, ptrdiff_t src_stride, \
2245  int h, int mx, int my) \
2246 { \
2247  avg##_bilin_1d_##dir##_c(dst, dst_stride, src, src_stride, sz, h, dir_m); \
2248 }
2249 
/* Generate <avg>_bilin_<sz>hv_c(): the 2-D counterpart using both mx and my. */
2250 #define bilinf_fn_2d(sz, avg) \
2251 static void avg##_bilin_##sz##hv_c(uint8_t *dst, ptrdiff_t dst_stride, \
2252  const uint8_t *src, ptrdiff_t src_stride, \
2253  int h, int mx, int my) \
2254 { \
2255  avg##_bilin_2d_hv_c(dst, dst_stride, src, src_stride, sz, h, mx, my); \
2256 }
2257 
2258 #else
2259 
/* BIT_DEPTH == 12 build: the bilinear generators expand to nothing. */
2260 #define bilinf_fn_1d(a, b, c, d)
2261 #define bilinf_fn_2d(a, b)
2262 
2263 #endif
2264 
/*
 * For one block width sz and one operation (put or avg), emit every subpel
 * entry point: h, v and hv variants for the regular, smooth and sharp 8-tap
 * filters, plus the bilinear h, v and hv variants (which expand to nothing
 * when BIT_DEPTH == 12).
 */
2265 #define filter_fn(sz, avg) \
2266 filter_fn_1d(sz, h, mx, regular, FILTER_8TAP_REGULAR, avg) \
2267 filter_fn_1d(sz, v, my, regular, FILTER_8TAP_REGULAR, avg) \
2268 filter_fn_2d(sz, regular, FILTER_8TAP_REGULAR, avg) \
2269 filter_fn_1d(sz, h, mx, smooth, FILTER_8TAP_SMOOTH, avg) \
2270 filter_fn_1d(sz, v, my, smooth, FILTER_8TAP_SMOOTH, avg) \
2271 filter_fn_2d(sz, smooth, FILTER_8TAP_SMOOTH, avg) \
2272 filter_fn_1d(sz, h, mx, sharp, FILTER_8TAP_SHARP, avg) \
2273 filter_fn_1d(sz, v, my, sharp, FILTER_8TAP_SHARP, avg) \
2274 filter_fn_2d(sz, sharp, FILTER_8TAP_SHARP, avg) \
2275 bilinf_fn_1d(sz, h, mx, avg) \
2276 bilinf_fn_1d(sz, v, my, avg) \
2277 bilinf_fn_2d(sz, avg)
2278 
/* Emit filter_fn() for all five block widths. */
2279 #define filter_fn_set(avg) \
2280 filter_fn(64, avg) \
2281 filter_fn(32, avg) \
2282 filter_fn(16, avg) \
2283 filter_fn(8, avg) \
2284 filter_fn(4, avg)
2285 
2286 filter_fn_set(put)
/*
 * The avg instantiation was absent from this listing (gap at 2287), but the
 * avg_* entry points are required by init_subpel3(1, avg) in
 * ff_vp9dsp_mc_init().
 */
2287 filter_fn_set(avg)
2288 
2289 #undef filter_fn
2290 #undef filter_fn_set
2291 #undef filter_fn_1d
2292 #undef filter_fn_2d
2293 #undef bilinf_fn_1d
2294 #undef bilinf_fn_2d
2295 
/*
 * Fill dsp->mc[][][][][] with the motion-compensation functions generated
 * above.
 *
 * Index layout used below: mc[size][filter][op][idxh][idxv], where
 *   size : 0..4 selects block width 64/32/16/8/4,
 *   op   : 0 = put (copy for full-pel), 1 = avg,
 *   idxh / idxv : 1 when a horizontal / vertical subpel filter is applied
 *                 ([0][0] is the full-pel copy/avg).
 *
 * The function has external linkage only in the BIT_DEPTH == 10 build and is
 * static otherwise. In the BIT_DEPTH == 12 build it first calls
 * ff_vp9dsp_mc_init_10() and then overrides only the 8-tap entries; no
 * full-pel or bilinear functions are generated at that depth.
 */
2296 #if BIT_DEPTH != 8
2297 void ff_vp9dsp_mc_init_10(VP9DSPContext *dsp);
2298 #endif
2299 #if BIT_DEPTH != 10
2300 static
2301 #endif
2302 av_cold void FUNC(ff_vp9dsp_mc_init)(VP9DSPContext *dsp)
2303 {
2304 #if BIT_DEPTH == 12
2305  ff_vp9dsp_mc_init_10(dsp);
2306 #else /* BIT_DEPTH == 12 */
2307 
/* Full-pel: the same copy/avg function serves every filter type. */
2308 #define init_fpel(idx1, idx2, sz, type) \
2309  dsp->mc[idx1][FILTER_8TAP_SMOOTH ][idx2][0][0] = type##sz##_c; \
2310  dsp->mc[idx1][FILTER_8TAP_REGULAR][idx2][0][0] = type##sz##_c; \
2311  dsp->mc[idx1][FILTER_8TAP_SHARP ][idx2][0][0] = type##sz##_c; \
2312  dsp->mc[idx1][FILTER_BILINEAR ][idx2][0][0] = type##sz##_c
2313 
2314 #define init_copy_avg(idx, sz) \
2315  init_fpel(idx, 0, sz, copy); \
2316  init_fpel(idx, 1, sz, avg)
2317 
2318  init_copy_avg(0, 64);
2319  init_copy_avg(1, 32);
2320  init_copy_avg(2, 16);
2321  init_copy_avg(3, 8);
2322  init_copy_avg(4, 4);
2323 
2324 #undef init_copy_avg
2325 #undef init_fpel
2326 
2327 #endif /* BIT_DEPTH == 12 */
2328 
/* 8-tap entries, installed at every bit depth. */
2329 #define init_subpel1_bd_aware(idx1, idx2, idxh, idxv, sz, dir, type) \
2330  dsp->mc[idx1][FILTER_8TAP_SMOOTH ][idx2][idxh][idxv] = type##_8tap_smooth_##sz##dir##_c; \
2331  dsp->mc[idx1][FILTER_8TAP_REGULAR][idx2][idxh][idxv] = type##_8tap_regular_##sz##dir##_c; \
2332  dsp->mc[idx1][FILTER_8TAP_SHARP ][idx2][idxh][idxv] = type##_8tap_sharp_##sz##dir##_c
2333 
/* Outside the 12-bit build the bilinear entry is installed as well. */
2334 #if BIT_DEPTH == 12
2335 #define init_subpel1 init_subpel1_bd_aware
2336 #else
2337 #define init_subpel1(idx1, idx2, idxh, idxv, sz, dir, type) \
2338  init_subpel1_bd_aware(idx1, idx2, idxh, idxv, sz, dir, type); \
2339  dsp->mc[idx1][FILTER_BILINEAR ][idx2][idxh][idxv] = type##_bilin_##sz##dir##_c
2340 #endif
2341 
2342 #define init_subpel2(idx, idxh, idxv, dir, type) \
2343  init_subpel1(0, idx, idxh, idxv, 64, dir, type); \
2344  init_subpel1(1, idx, idxh, idxv, 32, dir, type); \
2345  init_subpel1(2, idx, idxh, idxv, 16, dir, type); \
2346  init_subpel1(3, idx, idxh, idxv, 8, dir, type); \
2347  init_subpel1(4, idx, idxh, idxv, 4, dir, type)
2348 
2349 #define init_subpel3(idx, type) \
2350  init_subpel2(idx, 1, 1, hv, type); \
2351  init_subpel2(idx, 0, 1, v, type); \
2352  init_subpel2(idx, 1, 0, h, type)
2353 
2354  init_subpel3(0, put);
2355  init_subpel3(1, avg);
2356 
2357 #undef init_subpel1
2358 #undef init_subpel2
2359 #undef init_subpel3
2360 #undef init_subpel1_bd_aware
2361 }
2362 
/*
 * Scaled (resampling) separable 8-tap filtering of a w x h output block.
 *
 * mx / my are the initial sub-pixel phases and dx / dy the per-output-pixel
 * position increments, all in 1/16 pixel units: after each step the integer
 * part (>> 4) advances the source position and the fractional part (& 0xf)
 * selects the coefficient row in filters[].
 *
 * Pass 1 filters tmp_h source rows horizontally, starting 3 rows above src,
 * into tmp (row pitch 64 pixels); the horizontal phase restarts at mx on
 * every row. tmp_h is the vertical source span of the h output rows plus the
 * 8 rows the vertical taps need. Pass 2 walks tmp vertically from row 3,
 * picking the vertical filter per output row.
 * tmp holds 64 x 135 pixels, so w must be <= 64 and tmp_h <= 135.
 * When avg is non-zero the result is averaged with the existing dst pixel
 * ((a + b + 1) >> 1); otherwise it is stored directly.
 */
2363 static av_always_inline void do_scaled_8tap_c(uint8_t *_dst, ptrdiff_t dst_stride,
2364  const uint8_t *_src, ptrdiff_t src_stride,
2365  int w, int h, int mx, int my,
2366  int dx, int dy, int avg,
2367  const int16_t (*filters)[8])
2368 {
2369  int tmp_h = (((h - 1) * dy + my) >> 4) + 8;
2370  pixel tmp[64 * 135], *tmp_ptr = tmp;
2371  pixel *dst = (pixel *) _dst;
2372  const pixel *src = (const pixel *) _src;
2373 
2374  dst_stride /= sizeof(pixel);
2375  src_stride /= sizeof(pixel);
2376  src -= src_stride * 3;
2377  do {
2378  int x;
2379  int imx = mx, ioff = 0;
2380 
2381  for (x = 0; x < w; x++) {
2382  tmp_ptr[x] = FILTER_8TAP(src, ioff, filters[imx], 1);
 /* Advance the horizontal position: whole pixels into ioff, fraction kept in imx. */
2383  imx += dx;
2384  ioff += imx >> 4;
2385  imx &= 0xf;
2386  }
2387 
2388  tmp_ptr += 64;
2389  src += src_stride;
2390  } while (--tmp_h);
2391 
2392  tmp_ptr = tmp + 64 * 3;
2393  do {
2394  int x;
2395  const int16_t *filter = filters[my];
2396 
2397  for (x = 0; x < w; x++)
2398  if (avg) {
2399  dst[x] = (dst[x] + FILTER_8TAP(tmp_ptr, x, filter, 64) + 1) >> 1;
2400  } else {
2401  dst[x] = FILTER_8TAP(tmp_ptr, x, filter, 64);
2402  }
2403 
 /* Advance the vertical position: whole rows move tmp_ptr, fraction kept in my. */
2404  my += dy;
2405  tmp_ptr += (my >> 4) * 64;
2406  my &= 0xf;
2407  dst += dst_stride;
2408  } while (--h);
2409 }
2410 
/*
 * Generate <opn>_scaled_8tap_c(): a non-inlined specialization of
 * do_scaled_8tap_c() with a fixed averaging mode (opa).
 */
2411 #define scaled_filter_8tap_fn(opn, opa) \
2412 static av_noinline void opn##_scaled_8tap_c(uint8_t *dst, ptrdiff_t dst_stride, \
2413  const uint8_t *src, ptrdiff_t src_stride, \
2414  int w, int h, int mx, int my, int dx, int dy, \
2415  const int16_t (*filters)[8]) \
2416 { \
2417  do_scaled_8tap_c(dst, dst_stride, src, src_stride, w, h, mx, my, dx, dy, \
2418  opa, filters); \
2419 }
2420 
/*
 * Both instantiations were absent from this listing (gap at 2421-2422), but
 * put_scaled_8tap_c / avg_scaled_8tap_c are required by scaled_filter_fn().
 */
2421 scaled_filter_8tap_fn(put, 0)
2422 scaled_filter_8tap_fn(avg, 1)
2423 
2424 #undef scaled_filter_8tap_fn
2425 
2426 #undef FILTER_8TAP
2427 
/*
 * Generate <avg>_scaled_<type>_<sz>_c(): a fixed-width (sz) scaled 8-tap
 * entry point that passes the whole coefficient table for type_idx to
 * <avg>_scaled_8tap_c(). The macro parameter named `avg` is the operation
 * prefix (put or avg).
 */
2428 #define scaled_filter_fn(sz, type, type_idx, avg) \
2429 static void avg##_scaled_##type##_##sz##_c(uint8_t *dst, ptrdiff_t dst_stride, \
2430  const uint8_t *src, ptrdiff_t src_stride, \
2431  int h, int mx, int my, int dx, int dy) \
2432 { \
2433  avg##_scaled_8tap_c(dst, dst_stride, src, src_stride, sz, h, mx, my, dx, dy, \
2434  ff_vp9_subpel_filters[type_idx]); \
2435 }
2436 
2437 #if BIT_DEPTH != 12
2438 
/*
 * Scaled (resampling) separable bilinear filtering of a w x h output block.
 *
 * mx / my are the initial sub-pixel phases and dx / dy the per-output-pixel
 * position increments, all in 1/16 pixel units: after each step the integer
 * part (>> 4) advances the source position and the fractional part (& 0xf)
 * is the interpolation weight.
 *
 * Pass 1 interpolates tmp_h source rows horizontally into tmp (row pitch 64
 * pixels), restarting the horizontal phase at mx on every row. tmp_h is the
 * vertical source span of the h output rows plus the extra row needed by the
 * vertical interpolation. Pass 2 interpolates vertically between consecutive
 * tmp rows.
 * tmp holds 64 x 129 pixels, so w must be <= 64 and tmp_h <= 129.
 * When avg is non-zero the result is averaged with the existing dst pixel
 * ((a + b + 1) >> 1); otherwise it is stored directly.
 */
2439 static av_always_inline void do_scaled_bilin_c(uint8_t *_dst, ptrdiff_t dst_stride,
2440  const uint8_t *_src, ptrdiff_t src_stride,
2441  int w, int h, int mx, int my,
2442  int dx, int dy, int avg)
2443 {
2444  pixel tmp[64 * 129], *tmp_ptr = tmp;
2445  int tmp_h = (((h - 1) * dy + my) >> 4) + 2;
2446  pixel *dst = (pixel *) _dst;
2447  const pixel *src = (const pixel *) _src;
2448 
2449  dst_stride /= sizeof(pixel);
2450  src_stride /= sizeof(pixel);
2451  do {
2452  int x;
2453  int imx = mx, ioff = 0;
2454 
2455  for (x = 0; x < w; x++) {
2456  tmp_ptr[x] = FILTER_BILIN(src, ioff, imx, 1);
 /* Advance the horizontal position: whole pixels into ioff, fraction kept in imx. */
2457  imx += dx;
2458  ioff += imx >> 4;
2459  imx &= 0xf;
2460  }
2461 
2462  tmp_ptr += 64;
2463  src += src_stride;
2464  } while (--tmp_h);
2465 
2466  tmp_ptr = tmp;
2467  do {
2468  int x;
2469 
2470  for (x = 0; x < w; x++)
2471  if (avg) {
2472  dst[x] = (dst[x] + FILTER_BILIN(tmp_ptr, x, my, 64) + 1) >> 1;
2473  } else {
2474  dst[x] = FILTER_BILIN(tmp_ptr, x, my, 64);
2475  }
2476 
 /* Advance the vertical position: whole rows move tmp_ptr, fraction kept in my. */
2477  my += dy;
2478  tmp_ptr += (my >> 4) * 64;
2479  my &= 0xf;
2480  dst += dst_stride;
2481  } while (--h);
2482 }
2483 
/*
 * Generate <opn>_scaled_bilin_c(): a non-inlined specialization of
 * do_scaled_bilin_c() with a fixed averaging mode (opa).
 */
2484 #define scaled_bilin_fn(opn, opa) \
2485 static av_noinline void opn##_scaled_bilin_c(uint8_t *dst, ptrdiff_t dst_stride, \
2486  const uint8_t *src, ptrdiff_t src_stride, \
2487  int w, int h, int mx, int my, int dx, int dy) \
2488 { \
2489  do_scaled_bilin_c(dst, dst_stride, src, src_stride, w, h, mx, my, dx, dy, opa); \
2490 }
2491 
/*
 * The put instantiation was absent from this listing (gap at 2492), but
 * put_scaled_bilin_c is required by scaled_bilinf_fn(sz, put).
 */
2492 scaled_bilin_fn(put, 0)
2493 scaled_bilin_fn(avg, 1)
2494 
2495 #undef scaled_bilin_fn
2496 
2497 #undef FILTER_BILIN
2498 
/*
 * Generate <avg>_scaled_bilin_<sz>_c(): a fixed-width (sz) scaled bilinear
 * entry point. The macro parameter named `avg` is the operation prefix
 * (put or avg).
 */
2499 #define scaled_bilinf_fn(sz, avg) \
2500 static void avg##_scaled_bilin_##sz##_c(uint8_t *dst, ptrdiff_t dst_stride, \
2501  const uint8_t *src, ptrdiff_t src_stride, \
2502  int h, int mx, int my, int dx, int dy) \
2503 { \
2504  avg##_scaled_bilin_c(dst, dst_stride, src, src_stride, sz, h, mx, my, dx, dy); \
2505 }
2506 
2507 #else
2508 
/* BIT_DEPTH == 12 build: the scaled bilinear generator expands to nothing. */
2509 #define scaled_bilinf_fn(a, b)
2510 
2511 #endif
2512 
/*
 * For one block width sz and one operation (put or avg), emit the scaled
 * entry points for the regular, smooth and sharp 8-tap filters plus the
 * bilinear one (which expands to nothing when BIT_DEPTH == 12).
 */
2513 #define scaled_filter_fns(sz, avg) \
2514 scaled_filter_fn(sz, regular, FILTER_8TAP_REGULAR, avg) \
2515 scaled_filter_fn(sz, smooth, FILTER_8TAP_SMOOTH, avg) \
2516 scaled_filter_fn(sz, sharp, FILTER_8TAP_SHARP, avg) \
2517 scaled_bilinf_fn(sz, avg)
2518 
/* Emit scaled_filter_fns() for all five block widths. */
2519 #define scaled_filter_fn_set(avg) \
2520 scaled_filter_fns(64, avg) \
2521 scaled_filter_fns(32, avg) \
2522 scaled_filter_fns(16, avg) \
2523 scaled_filter_fns(8, avg) \
2524 scaled_filter_fns(4, avg)
2525 
/*
 * Both instantiations were absent from this listing (gap at 2526-2527), but
 * the put_scaled_* and avg_scaled_* entry points are required by
 * init_scaled_put_avg() in ff_vp9dsp_scaled_mc_init().
 */
2526 scaled_filter_fn_set(put)
2527 scaled_filter_fn_set(avg)
2528 
2529 #undef scaled_filter_fns
2530 #undef scaled_filter_fn_set
2531 #undef scaled_filter_fn
2532 #undef scaled_bilinf_fn
2533 
/*
 * Fill dsp->smc[][][] with the scaled motion-compensation functions
 * generated above.
 *
 * Index layout used below: smc[size][filter][op], where size 0..4 selects
 * block width 64/32/16/8/4 and op is 0 = put, 1 = avg.
 *
 * The function has external linkage only in the BIT_DEPTH == 10 build and is
 * static otherwise. In the BIT_DEPTH == 12 build it first calls
 * ff_vp9dsp_scaled_mc_init_10() and then overrides only the 8-tap entries;
 * no scaled bilinear functions are generated at that depth.
 */
2534 #if BIT_DEPTH != 8
2535 void ff_vp9dsp_scaled_mc_init_10(VP9DSPContext *dsp);
2536 #endif
2537 #if BIT_DEPTH != 10
2538 static
2539 #endif
2540 av_cold void FUNC(ff_vp9dsp_scaled_mc_init)(VP9DSPContext *dsp)
2541 {
/* 8-tap entries, installed at every bit depth. */
2542 #define init_scaled_bd_aware(idx1, idx2, sz, type) \
2543  dsp->smc[idx1][FILTER_8TAP_SMOOTH ][idx2] = type##_scaled_smooth_##sz##_c; \
2544  dsp->smc[idx1][FILTER_8TAP_REGULAR][idx2] = type##_scaled_regular_##sz##_c; \
2545  dsp->smc[idx1][FILTER_8TAP_SHARP ][idx2] = type##_scaled_sharp_##sz##_c
2546 
2547 #if BIT_DEPTH == 12
2548  ff_vp9dsp_scaled_mc_init_10(dsp);
2549 #define init_scaled(a,b,c,d) init_scaled_bd_aware(a,b,c,d)
2550 #else
2551 #define init_scaled(idx1, idx2, sz, type) \
2552  init_scaled_bd_aware(idx1, idx2, sz, type); \
2553  dsp->smc[idx1][FILTER_BILINEAR ][idx2] = type##_scaled_bilin_##sz##_c
2554 #endif
2555 
2556 #define init_scaled_put_avg(idx, sz) \
2557  init_scaled(idx, 0, sz, put); \
2558  init_scaled(idx, 1, sz, avg)
2559 
2560  init_scaled_put_avg(0, 64);
2561  init_scaled_put_avg(1, 32);
2562  init_scaled_put_avg(2, 16);
2563  init_scaled_put_avg(3, 8);
2564  init_scaled_put_avg(4, 4);
2565 
2566 #undef init_scaled_put_avg
2567 #undef init_scaled
2568 #undef init_scaled_bd_aware
2569 }
2570 
/*
 * Top-level initializer for this bit depth: runs the intra-prediction,
 * inverse-transform, loop-filter, motion-compensation and scaled
 * motion-compensation initializers on dsp, in that order.
 *
 * NOTE(review): the signature line was absent from this listing (gap at
 * 2571); it is restored here following the FUNC() naming pattern of the
 * other initializers in this file - confirm against the upstream source.
 */
2571 av_cold void FUNC(ff_vp9dsp_init)(VP9DSPContext *dsp)
2572 {
2573  FUNC(ff_vp9dsp_intrapred_init)(dsp);
2574  vp9dsp_itxfm_init(dsp);
2575  vp9dsp_loopfilter_init(dsp);
2576  FUNC(ff_vp9dsp_mc_init)(dsp);
2577  FUNC(ff_vp9dsp_scaled_mc_init)(dsp);
2578 }
_dst
uint8_t * _dst
Definition: dsp.h:52
q1
static const uint8_t q1[256]
Definition: twofish.c:100
FILTER_8TAP
#define FILTER_8TAP(src, x, F, stride)
out
FILE * out
Definition: movenc.c:55
init_intra_pred
#define init_intra_pred(tx, sz)
dc_top_32x32_c
static void dc_top_32x32_c(uint8_t *_dst, ptrdiff_t stride, const uint8_t *left, const uint8_t *_top)
Definition: vp9dsp_template.c:573
idct4_1d
static av_always_inline void idct4_1d(const dctcoef *in, ptrdiff_t stride, dctcoef *out, int pass)
Definition: vp9dsp_template.c:1202
init_idct
#define init_idct(tx, nm)
lf_16_fn
#define lf_16_fn(dir, stridea)
init_copy_avg
#define init_copy_avg(idx, sz)
AV_RN4P
#define AV_RN4P
Definition: bit_depth_template.c:88
VP9DSPContext::loop_filter_8
void(* loop_filter_8[3][2])(uint8_t *dst, ptrdiff_t stride, int mb_lim, int lim, int hev_thr)
Definition: vp9dsp.h:81
tmp
static uint8_t tmp[11]
Definition: aes_ctr.c:28
copy_c
static av_always_inline void copy_c(uint8_t *restrict dst, ptrdiff_t dst_stride, const uint8_t *restrict src, ptrdiff_t src_stride, int w, int h)
Definition: vp9dsp_template.c:1971
w
uint8_t w
Definition: llviddspenc.c:38
lf_mix_fns
#define lf_mix_fns(wd1, wd2)
VP9DSPContext
Definition: vp9dsp.h:40
F
#define F(x)
filter
void(* filter)(uint8_t *src, int stride, int qscale)
Definition: h263dsp.c:29
hor_8x8_c
static void hor_8x8_c(uint8_t *_dst, ptrdiff_t stride, const uint8_t *_left, const uint8_t *top)
Definition: vp9dsp_template.c:159
tm_32x32_c
static void tm_32x32_c(uint8_t *_dst, ptrdiff_t stride, const uint8_t *_left, const uint8_t *_top)
Definition: vp9dsp_template.c:296
init_itxfm
#define init_itxfm(tx, sz)
def_diag_downleft
#define def_diag_downleft(size)
Definition: vp9dsp_template.c:833
_src
uint8_t ptrdiff_t const uint8_t * _src
Definition: dsp.h:52
pixel4
#define pixel4
Definition: bit_depth_template.c:81
mx
uint8_t ptrdiff_t const uint8_t ptrdiff_t int intptr_t mx
Definition: dsp.h:53
vert_32x32_c
static void vert_32x32_c(uint8_t *restrict _dst, ptrdiff_t stride, const uint8_t *left, const uint8_t *_top)
Definition: vp9dsp_template.c:103
memset_bpc
static void memset_bpc(uint16_t *dst, int val, int len)
Definition: vp9dsp_template.c:805
diag_downleft_4x4_c
static void diag_downleft_4x4_c(uint8_t *_dst, ptrdiff_t stride, const uint8_t *left, const uint8_t *_top)
Definition: vp9dsp_template.c:815
VP9DSPContext::loop_filter_mix2
void(* loop_filter_mix2[2][2][2])(uint8_t *dst, ptrdiff_t stride, int mb_lim, int lim, int hev_thr)
Definition: vp9dsp.h:103
loop_filter
static av_always_inline void loop_filter(pixel *dst, int E, int I, int H, ptrdiff_t stridea, ptrdiff_t strideb, int wd)
Definition: vp9dsp_template.c:1780
dc_left_32x32_c
static void dc_left_32x32_c(uint8_t *_dst, ptrdiff_t stride, const uint8_t *_left, const uint8_t *top)
Definition: vp9dsp_template.c:492
lf_8_fns
#define lf_8_fns(wd)
Definition: vp9dsp_template.c:1901
dctcoef
#define dctcoef
Definition: bit_depth_template.c:82
val
static double val(void *priv, double ch)
Definition: aeval.c:77
iadst8_1d
static av_always_inline void iadst8_1d(const dctcoef *in, ptrdiff_t stride, dctcoef *out, int pass)
Definition: vp9dsp_template.c:1272
a2
static double a2(void *priv, double x, double y)
Definition: vf_xfade.c:2030
do_bilin_2d_c
static av_always_inline void do_bilin_2d_c(uint8_t *_dst, ptrdiff_t dst_stride, const uint8_t *_src, ptrdiff_t src_stride, int w, int h, int mx, int my, int avg)
Definition: vp9dsp_template.c:2192
idct
static void idct(int16_t block[64])
Definition: 4xm.c:167
rnd_avg_pixel4
#define rnd_avg_pixel4
Definition: bit_depth_template.c:86
dctint
#define dctint
Definition: vp9dsp_10bpp.c:25
itxfm_wrapper
#define itxfm_wrapper(type_a, type_b, sz, bits, has_dconly)
Definition: vp9dsp_template.c:1155
def_hor_up
#define def_hor_up(size)
Definition: vp9dsp_template.c:1078
av_cold
#define av_cold
Definition: attributes.h:90
tm_4x4_c
static void tm_4x4_c(uint8_t *_dst, ptrdiff_t stride, const uint8_t *_left, const uint8_t *_top)
Definition: vp9dsp_template.c:220
t15
static int t15(InterplayACMContext *s, unsigned ind, unsigned col)
Definition: interplayacm.c:339
AV_WN4PA
#define AV_WN4PA
Definition: bit_depth_template.c:92
dc_129_16x16_c
static void dc_129_16x16_c(uint8_t *_dst, ptrdiff_t stride, const uint8_t *left, const uint8_t *top)
Definition: vp9dsp_template.c:762
dc_127_4x4_c
static void dc_127_4x4_c(uint8_t *_dst, ptrdiff_t stride, const uint8_t *left, const uint8_t *top)
Definition: vp9dsp_template.c:669
scaled_bilin_fn
#define scaled_bilin_fn(opn, opa)
Definition: vp9dsp_template.c:2484
filters
#define filters(fmt, type, inverse, clp, inverset, clip, one, clip_fn, packed)
Definition: af_crystalizer.c:55
dc_128_32x32_c
static void dc_128_32x32_c(uint8_t *_dst, ptrdiff_t stride, const uint8_t *left, const uint8_t *top)
Definition: vp9dsp_template.c:648
PIXEL_SPLAT_X4
#define PIXEL_SPLAT_X4(x)
Definition: bit_depth_template.c:93
tm_16x16_c
static void tm_16x16_c(uint8_t *_dst, ptrdiff_t stride, const uint8_t *_left, const uint8_t *_top)
Definition: vp9dsp_template.c:264
q0
static const uint8_t q0[256]
Definition: twofish.c:81
E
#define E
Definition: avdct.c:33
my
uint8_t ptrdiff_t const uint8_t ptrdiff_t int intptr_t intptr_t my
Definition: dsp.h:53
FFABS
#define FFABS(a)
Absolute value, Note, INT_MIN / INT64_MIN result in undefined behavior as they are not representable ...
Definition: common.h:74
do_scaled_8tap_c
static av_always_inline void do_scaled_8tap_c(uint8_t *_dst, ptrdiff_t dst_stride, const uint8_t *_src, ptrdiff_t src_stride, int w, int h, int mx, int my, int dx, int dy, int avg, const int16_t(*filters)[8])
Definition: vp9dsp_template.c:2363
tm_8x8_c
static void tm_8x8_c(uint8_t *_dst, ptrdiff_t stride, const uint8_t *_left, const uint8_t *_top)
Definition: vp9dsp_template.c:240
av_clip_intp2
#define av_clip_intp2
Definition: common.h:121
FILTER_BILIN
#define FILTER_BILIN(src, x, mxy, stride)
a3
static double a3(void *priv, double x, double y)
Definition: vf_xfade.c:2031
pixel
uint8_t pixel
Definition: tiny_ssim.c:41
TX_8X8
@ TX_8X8
Definition: vp9.h:29
dc_127_32x32_c
static void dc_127_32x32_c(uint8_t *_dst, ptrdiff_t stride, const uint8_t *left, const uint8_t *top)
Definition: vp9dsp_template.c:713
dc_16x16_c
static void dc_16x16_c(uint8_t *_dst, ptrdiff_t stride, const uint8_t *_left, const uint8_t *_top)
Definition: vp9dsp_template.c:382
bit_depth_template.c
hor_32x32_c
static void hor_32x32_c(uint8_t *_dst, ptrdiff_t stride, const uint8_t *_left, const uint8_t *top)
Definition: vp9dsp_template.c:195
dc_127_8x8_c
static void dc_127_8x8_c(uint8_t *_dst, ptrdiff_t stride, const uint8_t *left, const uint8_t *top)
Definition: vp9dsp_template.c:681
TX_16X16
@ TX_16X16
Definition: vp9.h:30
dc_left_8x8_c
static void dc_left_8x8_c(uint8_t *_dst, ptrdiff_t stride, const uint8_t *_left, const uint8_t *top)
Definition: vp9dsp_template.c:453
dc_128_8x8_c
static void dc_128_8x8_c(uint8_t *_dst, ptrdiff_t stride, const uint8_t *left, const uint8_t *top)
Definition: vp9dsp_template.c:616
dc_4x4_c
static void dc_4x4_c(uint8_t *_dst, ptrdiff_t stride, const uint8_t *_left, const uint8_t *_top)
Definition: vp9dsp_template.c:346
filter_8tap_1d_fn
#define filter_8tap_1d_fn(opn, opa, dir, ds)
Definition: vp9dsp_template.c:2061
vp9dsp.h
f
f
Definition: af_crystalizer.c:122
init_scaled_put_avg
#define init_scaled_put_avg(idx, sz)
dc
Tag MUST be and< 10hcoeff half pel interpolation filter coefficients, hcoeff[0] are the 2 middle coefficients[1] are the next outer ones and so on, resulting in a filter like:...eff[2], hcoeff[1], hcoeff[0], hcoeff[0], hcoeff[1], hcoeff[2] ... the sign of the coefficients is not explicitly stored but alternates after each coeff and coeff[0] is positive, so ...,+,-,+,-,+,+,-,+,-,+,... hcoeff[0] is not explicitly stored but found by subtracting the sum of all stored coefficients with signs from 32 hcoeff[0]=32 - hcoeff[1] - hcoeff[2] - ... a good choice for hcoeff and htaps is htaps=6 hcoeff={40,-10, 2} an alternative which requires more computations at both encoder and decoder side and may or may not be better is htaps=8 hcoeff={42,-14, 6,-2}ref_frames minimum of the number of available reference frames and max_ref_frames for example the first frame after a key frame always has ref_frames=1spatial_decomposition_type wavelet type 0 is a 9/7 symmetric compact integer wavelet 1 is a 5/3 symmetric compact integer wavelet others are reserved stored as delta from last, last is reset to 0 if always_reset||keyframeqlog quality(logarithmic quantizer scale) stored as delta from last, last is reset to 0 if always_reset||keyframemv_scale stored as delta from last, last is reset to 0 if always_reset||keyframe FIXME check that everything works fine if this changes between framesqbias dequantization bias stored as delta from last, last is reset to 0 if always_reset||keyframeblock_max_depth maximum depth of the block tree stored as delta from last, last is reset to 0 if always_reset||keyframequant_table quantization tableHighlevel bitstream structure:==============================--------------------------------------------|Header|--------------------------------------------|------------------------------------|||Block0||||split?||||yes no||||......... intra?||||:Block01 :yes no||||:Block02 :....... 
..........||||:Block03 ::y DC ::ref index:||||:Block04 ::cb DC ::motion x :||||......... :cr DC ::motion y :||||....... ..........|||------------------------------------||------------------------------------|||Block1|||...|--------------------------------------------|------------ ------------ ------------|||Y subbands||Cb subbands||Cr subbands||||--- ---||--- ---||--- ---|||||LL0||HL0||||LL0||HL0||||LL0||HL0|||||--- ---||--- ---||--- ---||||--- ---||--- ---||--- ---|||||LH0||HH0||||LH0||HH0||||LH0||HH0|||||--- ---||--- ---||--- ---||||--- ---||--- ---||--- ---|||||HL1||LH1||||HL1||LH1||||HL1||LH1|||||--- ---||--- ---||--- ---||||--- ---||--- ---||--- ---|||||HH1||HL2||||HH1||HL2||||HH1||HL2|||||...||...||...|||------------ ------------ ------------|--------------------------------------------Decoding process:=================------------|||Subbands|------------||||------------|Intra DC||||LL0 subband prediction ------------|\ Dequantization ------------------- \||Reference frames|\ IDWT|------- -------|Motion \|||Frame 0||Frame 1||Compensation . OBMC v -------|------- -------|--------------. \------> Frame n output Frame Frame<----------------------------------/|...|------------------- Range Coder:============Binary Range Coder:------------------- The implemented range coder is an adapted version based upon "Range encoding: an algorithm for removing redundancy from a digitised message." by G. N. N. Martin. The symbols encoded by the Snow range coder are bits(0|1). The associated probabilities are not fix but change depending on the symbol mix seen so far. 
bit seen|new state ---------+----------------------------------------------- 0|256 - state_transition_table[256 - old_state];1|state_transition_table[old_state];state_transition_table={ 0, 0, 0, 0, 0, 0, 0, 0, 20, 21, 22, 23, 24, 25, 26, 27, 28, 29, 30, 31, 32, 33, 34, 35, 36, 37, 37, 38, 39, 40, 41, 42, 43, 44, 45, 46, 47, 48, 49, 50, 51, 52, 53, 54, 55, 56, 56, 57, 58, 59, 60, 61, 62, 63, 64, 65, 66, 67, 68, 69, 70, 71, 72, 73, 74, 75, 75, 76, 77, 78, 79, 80, 81, 82, 83, 84, 85, 86, 87, 88, 89, 90, 91, 92, 93, 94, 94, 95, 96, 97, 98, 99, 100, 101, 102, 103, 104, 105, 106, 107, 108, 109, 110, 111, 112, 113, 114, 114, 115, 116, 117, 118, 119, 120, 121, 122, 123, 124, 125, 126, 127, 128, 129, 130, 131, 132, 133, 133, 134, 135, 136, 137, 138, 139, 140, 141, 142, 143, 144, 145, 146, 147, 148, 149, 150, 151, 152, 152, 153, 154, 155, 156, 157, 158, 159, 160, 161, 162, 163, 164, 165, 166, 167, 168, 169, 170, 171, 171, 172, 173, 174, 175, 176, 177, 178, 179, 180, 181, 182, 183, 184, 185, 186, 187, 188, 189, 190, 190, 191, 192, 194, 194, 195, 196, 197, 198, 199, 200, 201, 202, 202, 204, 205, 206, 207, 208, 209, 209, 210, 211, 212, 213, 215, 215, 216, 217, 218, 219, 220, 220, 222, 223, 224, 225, 226, 227, 227, 229, 229, 230, 231, 232, 234, 234, 235, 236, 237, 238, 239, 240, 241, 242, 243, 244, 245, 246, 247, 248, 248, 0, 0, 0, 0, 0, 0, 0};FIXME Range Coding of integers:------------------------- FIXME Neighboring Blocks:===================left and top are set to the respective blocks unless they are outside of the image in which case they are set to the Null block top-left is set to the top left block unless it is outside of the image in which case it is set to the left block if this block has no larger parent block or it is at the left side of its parent block and the top right block is not outside of the image then the top right block is used for top-right else the top-left block is used Null block y, cb, cr are 128 level, ref, mx and my are 0 Motion Vector 
Prediction:=========================1. the motion vectors of all the neighboring blocks are scaled to compensate for the difference of reference frames scaled_mv=(mv *(256 *(current_reference+1)/(mv.reference+1))+128)> the median of the scaled top and top right vectors is used as motion vector prediction the used motion vector is the sum of the predictor and(mvx_diff, mvy_diff) *mv_scale Intra DC Prediction block[y][x] dc[1]
Definition: snow.txt:400
vert_8x8_c
static void vert_8x8_c(uint8_t *restrict _dst, ptrdiff_t stride, const uint8_t *left, const uint8_t *_top)
Definition: vp9dsp_template.c:47
dst
uint8_t ptrdiff_t const uint8_t ptrdiff_t int intptr_t intptr_t int int16_t * dst
Definition: dsp.h:83
def_hor_down
#define def_hor_down(size)
Definition: vp9dsp_template.c:982
dc_8x8_c
static void dc_8x8_c(uint8_t *_dst, ptrdiff_t stride, const uint8_t *_left, const uint8_t *_top)
Definition: vp9dsp_template.c:362
AV_RN64A
#define AV_RN64A(p)
Definition: intreadwrite.h:526
t27
static int t27(InterplayACMContext *s, unsigned ind, unsigned col)
Definition: interplayacm.c:368
dc_129_8x8_c
static void dc_129_8x8_c(uint8_t *_dst, ptrdiff_t stride, const uint8_t *left, const uint8_t *top)
Definition: vp9dsp_template.c:747
AV_RN4PA
#define AV_RN4PA
Definition: bit_depth_template.c:89
avg
#define avg(a, b, c, d)
Definition: colorspacedsp_template.c:28
TX_4X4
@ TX_4X4
Definition: vp9.h:28
idct16_1d
static void av_always_inline idct16_1d(float *dst, const float *src, int dst_stridea, int dst_strideb, int src_stridea, int src_strideb, int add)
Definition: vf_dctdnoiz.c:261
ff_vp9dsp_init
av_cold void FUNC() ff_vp9dsp_init(VP9DSPContext *dsp)
Definition: vp9dsp_template.c:2571
a0
static double a0(void *priv, double x, double y)
Definition: vf_xfade.c:2028
H
#define H
Definition: pixlet.c:39
IN
#define IN(x)
Definition: vp9dsp_template.c:1200
iadst16_1d
static av_always_inline void iadst16_1d(const dctcoef *in, ptrdiff_t stride, dctcoef *out, int pass)
Definition: vp9dsp_template.c:1406
dc_top_16x16_c
static void dc_top_16x16_c(uint8_t *_dst, ptrdiff_t stride, const uint8_t *left, const uint8_t *_top)
Definition: vp9dsp_template.c:552
dc_128_4x4_c
static void dc_128_4x4_c(uint8_t *_dst, ptrdiff_t stride, const uint8_t *left, const uint8_t *top)
Definition: vp9dsp_template.c:603
dc_top_8x8_c
static void dc_top_8x8_c(uint8_t *_dst, ptrdiff_t stride, const uint8_t *left, const uint8_t *_top)
Definition: vp9dsp_template.c:534
vert_4x4_c
static void vert_4x4_c(uint8_t *restrict _dst, ptrdiff_t stride, const uint8_t *left, const uint8_t *_top)
Definition: vp9dsp_template.c:33
i
#define i(width, name, range_min, range_max)
Definition: cbs_h2645.c:256
avg_c
static av_always_inline void avg_c(uint8_t *restrict _dst, ptrdiff_t dst_stride, const uint8_t *restrict _src, ptrdiff_t src_stride, int w, int h)
Definition: vp9dsp_template.c:1983
iadst4_1d
static av_always_inline void iadst4_1d(const dctcoef *in, ptrdiff_t stride, dctcoef *out, int pass)
Definition: vp9dsp_template.c:1218
common.h
av_always_inline
#define av_always_inline
Definition: attributes.h:49
FFMIN
#define FFMIN(a, b)
Definition: macros.h:49
scaled_filter_fn_set
#define scaled_filter_fn_set(avg)
len
int len
Definition: vorbis_enc_data.h:426
stride
#define stride
Definition: h264pred_template.c:536
dc_129_4x4_c
static void dc_129_4x4_c(uint8_t *_dst, ptrdiff_t stride, const uint8_t *left, const uint8_t *top)
Definition: vp9dsp_template.c:734
def_diag_downright
#define def_diag_downright(size)
Definition: vp9dsp_template.c:876
hor_4x4_c
static void hor_4x4_c(uint8_t *_dst, ptrdiff_t stride, const uint8_t *_left, const uint8_t *top)
Definition: vp9dsp_template.c:146
dc_left_4x4_c
static void dc_left_4x4_c(uint8_t *_dst, ptrdiff_t stride, const uint8_t *_left, const uint8_t *top)
Definition: vp9dsp_template.c:439
av_clip_pixel
#define av_clip_pixel(a)
Definition: bit_depth_template.c:95
dc_128_16x16_c
static void dc_128_16x16_c(uint8_t *_dst, ptrdiff_t stride, const uint8_t *left, const uint8_t *top)
Definition: vp9dsp_template.c:631
left
Tag MUST be and< 10hcoeff half pel interpolation filter coefficients, hcoeff[0] are the 2 middle coefficients[1] are the next outer ones and so on, resulting in a filter like:...eff[2], hcoeff[1], hcoeff[0], hcoeff[0], hcoeff[1], hcoeff[2] ... the sign of the coefficients is not explicitly stored but alternates after each coeff and coeff[0] is positive, so ...,+,-,+,-,+,+,-,+,-,+,... hcoeff[0] is not explicitly stored but found by subtracting the sum of all stored coefficients with signs from 32 hcoeff[0]=32 - hcoeff[1] - hcoeff[2] - ... a good choice for hcoeff and htaps is htaps=6 hcoeff={40,-10, 2} an alternative which requires more computations at both encoder and decoder side and may or may not be better is htaps=8 hcoeff={42,-14, 6,-2}ref_frames minimum of the number of available reference frames and max_ref_frames for example the first frame after a key frame always has ref_frames=1spatial_decomposition_type wavelet type 0 is a 9/7 symmetric compact integer wavelet 1 is a 5/3 symmetric compact integer wavelet others are reserved stored as delta from last, last is reset to 0 if always_reset||keyframeqlog quality(logarithmic quantizer scale) stored as delta from last, last is reset to 0 if always_reset||keyframemv_scale stored as delta from last, last is reset to 0 if always_reset||keyframe FIXME check that everything works fine if this changes between framesqbias dequantization bias stored as delta from last, last is reset to 0 if always_reset||keyframeblock_max_depth maximum depth of the block tree stored as delta from last, last is reset to 0 if always_reset||keyframequant_table quantization tableHighlevel bitstream structure:==============================--------------------------------------------|Header|--------------------------------------------|------------------------------------|||Block0||||split?||||yes no||||......... intra?||||:Block01 :yes no||||:Block02 :....... 
..........||||:Block03 ::y DC ::ref index:||||:Block04 ::cb DC ::motion x :||||......... :cr DC ::motion y :||||....... ..........|||------------------------------------||------------------------------------|||Block1|||...|--------------------------------------------|------------ ------------ ------------|||Y subbands||Cb subbands||Cr subbands||||--- ---||--- ---||--- ---|||||LL0||HL0||||LL0||HL0||||LL0||HL0|||||--- ---||--- ---||--- ---||||--- ---||--- ---||--- ---|||||LH0||HH0||||LH0||HH0||||LH0||HH0|||||--- ---||--- ---||--- ---||||--- ---||--- ---||--- ---|||||HL1||LH1||||HL1||LH1||||HL1||LH1|||||--- ---||--- ---||--- ---||||--- ---||--- ---||--- ---|||||HH1||HL2||||HH1||HL2||||HH1||HL2|||||...||...||...|||------------ ------------ ------------|--------------------------------------------Decoding process:=================------------|||Subbands|------------||||------------|Intra DC||||LL0 subband prediction ------------|\ Dequantization ------------------- \||Reference frames|\ IDWT|------- -------|Motion \|||Frame 0||Frame 1||Compensation . OBMC v -------|------- -------|--------------. \------> Frame n output Frame Frame<----------------------------------/|...|------------------- Range Coder:============Binary Range Coder:------------------- The implemented range coder is an adapted version based upon "Range encoding: an algorithm for removing redundancy from a digitised message." by G. N. N. Martin. The symbols encoded by the Snow range coder are bits(0|1). The associated probabilities are not fix but change depending on the symbol mix seen so far. 
bit seen|new state ---------+----------------------------------------------- 0|256 - state_transition_table[256 - old_state];1|state_transition_table[old_state];state_transition_table={ 0, 0, 0, 0, 0, 0, 0, 0, 20, 21, 22, 23, 24, 25, 26, 27, 28, 29, 30, 31, 32, 33, 34, 35, 36, 37, 37, 38, 39, 40, 41, 42, 43, 44, 45, 46, 47, 48, 49, 50, 51, 52, 53, 54, 55, 56, 56, 57, 58, 59, 60, 61, 62, 63, 64, 65, 66, 67, 68, 69, 70, 71, 72, 73, 74, 75, 75, 76, 77, 78, 79, 80, 81, 82, 83, 84, 85, 86, 87, 88, 89, 90, 91, 92, 93, 94, 94, 95, 96, 97, 98, 99, 100, 101, 102, 103, 104, 105, 106, 107, 108, 109, 110, 111, 112, 113, 114, 114, 115, 116, 117, 118, 119, 120, 121, 122, 123, 124, 125, 126, 127, 128, 129, 130, 131, 132, 133, 133, 134, 135, 136, 137, 138, 139, 140, 141, 142, 143, 144, 145, 146, 147, 148, 149, 150, 151, 152, 152, 153, 154, 155, 156, 157, 158, 159, 160, 161, 162, 163, 164, 165, 166, 167, 168, 169, 170, 171, 171, 172, 173, 174, 175, 176, 177, 178, 179, 180, 181, 182, 183, 184, 185, 186, 187, 188, 189, 190, 190, 191, 192, 194, 194, 195, 196, 197, 198, 199, 200, 201, 202, 202, 204, 205, 206, 207, 208, 209, 209, 210, 211, 212, 213, 215, 215, 216, 217, 218, 219, 220, 220, 222, 223, 224, 225, 226, 227, 227, 229, 229, 230, 231, 232, 234, 234, 235, 236, 237, 238, 239, 240, 241, 242, 243, 244, 245, 246, 247, 248, 248, 0, 0, 0, 0, 0, 0, 0};FIXME Range Coding of integers:------------------------- FIXME Neighboring Blocks:===================left and top are set to the respective blocks unless they are outside of the image in which case they are set to the Null block top-left is set to the top left block unless it is outside of the image in which case it is set to the left block if this block has no larger parent block or it is at the left side of its parent block and the top right block is not outside of the image then the top right block is used for top-right else the top-left block is used Null block y, cb, cr are 128 level, ref, mx and my are 0 Motion Vector 
Prediction:=========================1. the motion vectors of all the neighboring blocks are scaled to compensate for the difference of reference frames scaled_mv=(mv *(256 *(current_reference+1)/(mv.reference+1))+128)> the median of the scaled left
Definition: snow.txt:386
U
#define U(x)
Definition: vpx_arith.h:37
AV_WN64A
#define AV_WN64A(p, v)
Definition: intreadwrite.h:538
copy_avg_fn
#define copy_avg_fn(sz)
Definition: vp9dsp_template.c:2011
dc_left_16x16_c
static void dc_left_16x16_c(uint8_t *_dst, ptrdiff_t stride, const uint8_t *_left, const uint8_t *top)
Definition: vp9dsp_template.c:471
idct8_1d
static void av_always_inline idct8_1d(float *dst, const float *src, int dst_stridea, int dst_strideb, int src_stridea, int src_strideb, int add)
Definition: vf_dctdnoiz.c:133
FUNC
#define FUNC(a)
Definition: bit_depth_template.c:101
vert_16x16_c
static void vert_16x16_c(uint8_t *restrict _dst, ptrdiff_t stride, const uint8_t *left, const uint8_t *_top)
Definition: vp9dsp_template.c:72
BIT_DEPTH
#define BIT_DEPTH
Definition: dsp_init.c:55
bilin_1d_fn
#define bilin_1d_fn(opn, opa, dir, ds)
Definition: vp9dsp_template.c:2177
filter_8tap_2d_fn
#define filter_8tap_2d_fn(opn, opa)
Definition: vp9dsp_template.c:2115
filter_fn_set
#define filter_fn_set(avg)
DST
#define DST(x, y)
Definition: vp9dsp_template.c:813
itxfm_wrap
#define itxfm_wrap(sz, bits)
Definition: vp9dsp_template.c:1194
TX_32X32
@ TX_32X32
Definition: vp9.h:31
hev
static av_always_inline int hev(uint8_t *p, ptrdiff_t stride, int thresh)
Definition: vp8dsp_mmi.c:731
dc_top_4x4_c
static void dc_top_4x4_c(uint8_t *_dst, ptrdiff_t stride, const uint8_t *left, const uint8_t *_top)
Definition: vp9dsp_template.c:520
scaled_filter_8tap_fn
#define scaled_filter_8tap_fn(opn, opa)
Definition: vp9dsp_template.c:2411
dc_127_16x16_c
static void dc_127_16x16_c(uint8_t *_dst, ptrdiff_t stride, const uint8_t *left, const uint8_t *top)
Definition: vp9dsp_template.c:696
init_subpel3
#define init_subpel3(idx, type)
def_vert_left
#define def_vert_left(size)
Definition: vp9dsp_template.c:1032
a1
static double a1(void *priv, double x, double y)
Definition: vf_xfade.c:2029
h
h
Definition: vp9dsp_template.c:2070
VP9DSPContext::loop_filter_16
void(* loop_filter_16[2])(uint8_t *dst, ptrdiff_t stride, int mb_lim, int lim, int hev_thr)
Definition: vp9dsp.h:89
dc_32x32_c
static void dc_32x32_c(uint8_t *_dst, ptrdiff_t stride, const uint8_t *_left, const uint8_t *_top)
Definition: vp9dsp_template.c:406
def_vert_right
#define def_vert_right(size)
Definition: vp9dsp_template.c:925
hor_16x16_c
static void hor_16x16_c(uint8_t *_dst, ptrdiff_t stride, const uint8_t *_left, const uint8_t *top)
Definition: vp9dsp_template.c:176
do_8tap_2d_c
static av_always_inline void do_8tap_2d_c(uint8_t *_dst, ptrdiff_t dst_stride, const uint8_t *_src, ptrdiff_t src_stride, int w, int h, const int16_t *filterx, const int16_t *filtery, int avg)
Definition: vp9dsp_template.c:2076
bilin_2d_fn
#define bilin_2d_fn(opn, opa)
Definition: vp9dsp_template.c:2229
src
#define src
Definition: vp8dsp.c:248
dc_129_32x32_c
static void dc_129_32x32_c(uint8_t *_dst, ptrdiff_t stride, const uint8_t *left, const uint8_t *top)
Definition: vp9dsp_template.c:779