FFmpeg
vp9_lpf_msa.c
Go to the documentation of this file.
/*
 * Copyright (c) 2015 - 2017 Shivraj Patil (Shivraj.Patil@imgtec.com)
 *
 * This file is part of FFmpeg.
 *
 * FFmpeg is free software; you can redistribute it and/or
 * modify it under the terms of the GNU Lesser General Public
 * License as published by the Free Software Foundation; either
 * version 2.1 of the License, or (at your option) any later version.
 *
 * FFmpeg is distributed in the hope that it will be useful,
 * but WITHOUT ANY WARRANTY; without even the implied warranty of
 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
 * Lesser General Public License for more details.
 *
 * You should have received a copy of the GNU Lesser General Public
 * License along with FFmpeg; if not, write to the Free Software
 * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
 */
20 
21 #include "libavcodec/vp9dsp.h"
23 #include "vp9dsp_mips.h"
24 
/*
 * VP9 4-tap loop filter core, applied to every byte lane of the vectors.
 *
 * Inputs : p1_in, p0_in, q0_in, q1_in - pixel vectors on both sides of the edge
 *          mask_in - per-lane mask selecting the lanes to be filtered
 *          hev_in  - per-lane "high edge variance" mask
 * Outputs: p1_out, p0_out, q0_out, q1_out - filtered pixel vectors
 *
 * All four pixel inputs are copied into locals before the first output is
 * written, so an output argument may name the same variable as its input.
 *
 * Side effect: hev_in must be a modifiable lvalue; on exit it holds the
 * bitwise inverse of the value passed in.
 */
#define VP9_LPF_FILTER4_4W(p1_in, p0_in, q0_in, q1_in, mask_in, hev_in,   \
                           p1_out, p0_out, q0_out, q1_out)                \
{                                                                         \
    v16i8 p1_m, p0_m, q0_m, q1_m, q0_sub_p0_m;                            \
    v16i8 filt_m, filt1_m, filt2_m;                                       \
    const v16i8 cnst4b_m = __msa_ldi_b(4);                                \
    const v16i8 cnst3b_m = __msa_ldi_b(3);                                \
                                                                          \
    /* flip the top bit so the pixels can go through signed saturating */ \
    /* arithmetic; the same xor is applied again before writing back   */ \
    p1_m = (v16i8) __msa_xori_b(p1_in, 0x80);                             \
    p0_m = (v16i8) __msa_xori_b(p0_in, 0x80);                             \
    q0_m = (v16i8) __msa_xori_b(q0_in, 0x80);                             \
    q1_m = (v16i8) __msa_xori_b(q1_in, 0x80);                             \
                                                                          \
    /* (p1 - q1) contributes only where hev is set */                     \
    filt_m = __msa_subs_s_b(p1_m, q1_m);                                  \
    filt_m = filt_m & (v16i8) (hev_in);                                   \
                                                                          \
    /* add 3 * (q0 - p0), saturating after every step */                  \
    q0_sub_p0_m = __msa_subs_s_b(q0_m, p0_m);                             \
    filt_m = __msa_adds_s_b(filt_m, q0_sub_p0_m);                         \
    filt_m = __msa_adds_s_b(filt_m, q0_sub_p0_m);                         \
    filt_m = __msa_adds_s_b(filt_m, q0_sub_p0_m);                         \
    filt_m = filt_m & (v16i8) (mask_in);                                  \
                                                                          \
    filt1_m = __msa_adds_s_b(filt_m, cnst4b_m);                           \
    filt1_m >>= 3;                                                        \
                                                                          \
    filt2_m = __msa_adds_s_b(filt_m, cnst3b_m);                           \
    filt2_m >>= 3;                                                        \
                                                                          \
    q0_m = __msa_subs_s_b(q0_m, filt1_m);                                 \
    q0_out = __msa_xori_b((v16u8) q0_m, 0x80);                            \
    p0_m = __msa_adds_s_b(p0_m, filt2_m);                                 \
    p0_out = __msa_xori_b((v16u8) p0_m, 0x80);                            \
                                                                          \
    /* outer taps: rounded half of filt1, only where hev is clear */      \
    filt_m = __msa_srari_b(filt1_m, 1);                                   \
    hev_in = __msa_xori_b((v16u8) (hev_in), 0xff);                        \
    filt_m = filt_m & (v16i8) (hev_in);                                   \
                                                                          \
    q1_m = __msa_subs_s_b(q1_m, filt_m);                                  \
    q1_out = __msa_xori_b((v16u8) q1_m, 0x80);                            \
    p1_m = __msa_adds_s_b(p1_m, filt_m);                                  \
    p1_out = __msa_xori_b((v16u8) p1_m, 0x80);                            \
}
67 
/*
 * Compute the per-lane "flat" mask used to select the 8-tap filter.
 *
 * Inputs : p3_in, p2_in, p0_in, q0_in, q2_in, q3_in - pixel vectors
 * In/out : flat_out - on entry a per-lane maximum that is folded into the
 *          test (the functions in this file pass the value produced by
 *          LPF_MASK_HEV); on exit 0xff in lanes where that value and
 *          |p2 - p0|, |q2 - q0|, |p3 - p0|, |q3 - q0| are all <= 1 and the
 *          caller's mask is set, 0 elsewhere.
 *
 * NOTE: this macro is not self-contained - it reads a variable named
 * `mask` from the scope in which it is expanded.
 */
#define VP9_FLAT4(p3_in, p2_in, p0_in, q0_in, q2_in, q3_in, flat_out)   \
{                                                                       \
    v16u8 one_m, p2_a_sub_p0_m, q2_a_sub_q0_m;                          \
    v16u8 p3_a_sub_p0_m, q3_a_sub_q0_m;                                 \
    v16u8 zero_m = { 0 };                                               \
                                                                        \
    /* every lane = 1: the flatness threshold */                        \
    one_m = __msa_ori_b(zero_m, 1);                                     \
    p2_a_sub_p0_m = __msa_asub_u_b(p2_in, p0_in);                       \
    q2_a_sub_q0_m = __msa_asub_u_b(q2_in, q0_in);                       \
    p3_a_sub_p0_m = __msa_asub_u_b(p3_in, p0_in);                       \
    q3_a_sub_q0_m = __msa_asub_u_b(q3_in, q0_in);                       \
                                                                        \
    p2_a_sub_p0_m = __msa_max_u_b(p2_a_sub_p0_m, q2_a_sub_q0_m);        \
    flat_out = __msa_max_u_b(p2_a_sub_p0_m, flat_out);                  \
    p3_a_sub_p0_m = __msa_max_u_b(p3_a_sub_p0_m, q3_a_sub_q0_m);        \
    flat_out = __msa_max_u_b(p3_a_sub_p0_m, flat_out);                  \
                                                                        \
    /* lanes whose maximum exceeds 1 are not flat; invert to get flat */ \
    flat_out = (one_m < (v16u8) (flat_out));                            \
    flat_out = __msa_xori_b(flat_out, 0xff);                            \
    flat_out = flat_out & (mask);                                       \
}
88 
/*
 * Compute the per-lane "flat2" mask used to select the widest filter.
 *
 * Inputs : p7_in..p4_in, p0_in, q0_in, q4_in..q7_in - pixel vectors
 *          flat_in - flat mask the result is ANDed with
 * Output : flat2_out - 0xff in lanes where |pN - p0| and |qN - q0| are
 *          <= 1 for N = 4..7 and flat_in is set, 0 elsewhere.
 */
#define VP9_FLAT5(p7_in, p6_in, p5_in, p4_in, p0_in, q0_in, q4_in,  \
                  q5_in, q6_in, q7_in, flat_in, flat2_out)          \
{                                                                   \
    v16u8 one_m, zero_m = { 0 };                                    \
    v16u8 p4_a_sub_p0_m, q4_a_sub_q0_m, p5_a_sub_p0_m, q5_a_sub_q0_m; \
    v16u8 p6_a_sub_p0_m, q6_a_sub_q0_m, p7_a_sub_p0_m, q7_a_sub_q0_m; \
                                                                    \
    /* every lane = 1: the flatness threshold */                    \
    one_m = __msa_ori_b(zero_m, 1);                                 \
    p4_a_sub_p0_m = __msa_asub_u_b(p4_in, p0_in);                   \
    q4_a_sub_q0_m = __msa_asub_u_b(q4_in, q0_in);                   \
    p5_a_sub_p0_m = __msa_asub_u_b(p5_in, p0_in);                   \
    q5_a_sub_q0_m = __msa_asub_u_b(q5_in, q0_in);                   \
    p6_a_sub_p0_m = __msa_asub_u_b(p6_in, p0_in);                   \
    q6_a_sub_q0_m = __msa_asub_u_b(q6_in, q0_in);                   \
    p7_a_sub_p0_m = __msa_asub_u_b(p7_in, p0_in);                   \
    q7_a_sub_q0_m = __msa_asub_u_b(q7_in, q0_in);                   \
                                                                    \
    /* reduce the eight differences to one per-lane maximum */      \
    p4_a_sub_p0_m = __msa_max_u_b(p4_a_sub_p0_m, q4_a_sub_q0_m);    \
    flat2_out = __msa_max_u_b(p5_a_sub_p0_m, q5_a_sub_q0_m);        \
    flat2_out = __msa_max_u_b(p4_a_sub_p0_m, flat2_out);            \
    p6_a_sub_p0_m = __msa_max_u_b(p6_a_sub_p0_m, q6_a_sub_q0_m);    \
    flat2_out = __msa_max_u_b(p6_a_sub_p0_m, flat2_out);            \
    p7_a_sub_p0_m = __msa_max_u_b(p7_a_sub_p0_m, q7_a_sub_q0_m);    \
    flat2_out = __msa_max_u_b(p7_a_sub_p0_m, flat2_out);            \
                                                                    \
    /* lanes above the threshold are not flat; invert to get flat */ \
    flat2_out = (one_m < (v16u8) (flat2_out));                      \
    flat2_out = __msa_xori_b(flat2_out, 0xff);                      \
    flat2_out = flat2_out & (flat_in);                              \
}
118 
/*
 * VP9 8-tap smoothing filter on 16-bit lanes.
 *
 * Inputs : p3_in..p0_in, q0_in..q3_in - pixel vectors widened to v8u16
 * Outputs: six v8i16 vectors, each a weighted sum of the inputs divided by
 *          8 with rounding (__msa_srari_h(..., 3)):
 *
 *   p2 = 3*p3 + 2*p2 +   p1 +   p0 +   q0
 *   p1 = 2*p3 +   p2 + 2*p1 +   p0 +   q0 +   q1
 *   p0 =   p3 +   p2 +   p1 + 2*p0 +   q0 +   q1 +   q2
 *   q0 =          p2 +   p1 +   p0 + 2*q0 +   q1 +   q2 +   q3
 *   q1 =                 p1 +   p0 +   q0 + 2*q1 +   q2 + 2*q3
 *   q2 =                        p0 +   q0 +   q1 + 2*q2 + 3*q3
 *
 * Inputs are still read after the first output has been written, so the
 * output arguments must not name the same variables as the inputs.
 */
#define VP9_FILTER8(p3_in, p2_in, p1_in, p0_in,                        \
                    q0_in, q1_in, q2_in, q3_in,                        \
                    p2_filt8_out, p1_filt8_out, p0_filt8_out,          \
                    q0_filt8_out, q1_filt8_out, q2_filt8_out)          \
{                                                                      \
    v8u16 acc0_m, acc1_m, acc2_m;                                      \
                                                                       \
    acc2_m = (p2_in) + (p1_in) + (p0_in);                              \
    acc0_m = (p3_in) << 1;                                             \
                                                                       \
    acc0_m = acc0_m + acc2_m + (q0_in);                                \
    acc1_m = acc0_m + (p3_in) + (p2_in);                               \
    p2_filt8_out = (v8i16) __msa_srari_h((v8i16) acc1_m, 3);           \
                                                                       \
    acc1_m = acc0_m + (p1_in) + (q1_in);                               \
    p1_filt8_out = (v8i16) __msa_srari_h((v8i16) acc1_m, 3);           \
                                                                       \
    acc1_m = (q2_in) + (q1_in) + (q0_in);                              \
    acc2_m = acc2_m + acc1_m;                                          \
    acc0_m = acc2_m + (p0_in);                                         \
    acc0_m = acc0_m + (p3_in);                                         \
    p0_filt8_out = (v8i16) __msa_srari_h((v8i16) acc0_m, 3);           \
                                                                       \
    acc0_m = (q2_in) + (q3_in);                                        \
    acc0_m = (p0_in) + acc1_m + acc0_m;                                \
    acc1_m = (q3_in) + (q3_in);                                        \
    acc1_m = acc1_m + acc0_m;                                          \
    q2_filt8_out = (v8i16) __msa_srari_h((v8i16) acc1_m, 3);           \
                                                                       \
    acc0_m = acc2_m + (q3_in);                                         \
    acc1_m = acc0_m + (q0_in);                                         \
    q0_filt8_out = (v8i16) __msa_srari_h((v8i16) acc1_m, 3);           \
                                                                       \
    acc1_m = acc0_m - (p2_in);                                         \
    acc0_m = (q1_in) + (q3_in);                                        \
    acc1_m = acc0_m + acc1_m;                                          \
    q1_filt8_out = (v8i16) __msa_srari_h((v8i16) acc1_m, 3);           \
}
157 
/*
 * Compute the loop filter decision masks for every byte lane.
 *
 * Inputs : p3_in..p0_in, q0_in..q3_in - pixel vectors around the edge
 *          limit_in, b_limit_in, thresh_in - threshold vectors
 * Outputs: hev_out  - set in lanes where max(|p1 - p0|, |q1 - q0|)
 *                     is greater than thresh_in
 *          mask_out - 0xff in lanes that pass the limit / b_limit tests
 *                     (lanes to be filtered), 0 elsewhere
 *          flat_out - max(|p1 - p0|, |q1 - q0|) per lane; the functions in
 *                     this file feed this value into VP9_FLAT4
 */
#define LPF_MASK_HEV(p3_in, p2_in, p1_in, p0_in,                     \
                     q0_in, q1_in, q2_in, q3_in,                     \
                     limit_in, b_limit_in, thresh_in,                \
                     hev_out, mask_out, flat_out)                    \
{                                                                    \
    v16u8 p3_asub_p2_m, p2_asub_p1_m, p1_asub_p0_m, q1_asub_q0_m;    \
    v16u8 p1_asub_q1_m, p0_asub_q0_m, q3_asub_q2_m, q2_asub_q1_m;    \
                                                                     \
    /* absolute differences between pixel vectors */                 \
    p3_asub_p2_m = __msa_asub_u_b(p3_in, p2_in);                     \
    p2_asub_p1_m = __msa_asub_u_b(p2_in, p1_in);                     \
    p1_asub_p0_m = __msa_asub_u_b(p1_in, p0_in);                     \
    q1_asub_q0_m = __msa_asub_u_b(q1_in, q0_in);                     \
    q2_asub_q1_m = __msa_asub_u_b(q2_in, q1_in);                     \
    q3_asub_q2_m = __msa_asub_u_b(q3_in, q2_in);                     \
    p0_asub_q0_m = __msa_asub_u_b(p0_in, q0_in);                     \
    p1_asub_q1_m = __msa_asub_u_b(p1_in, q1_in);                     \
                                                                     \
    /* hev: inner-pixel difference exceeds thresh */                 \
    flat_out = __msa_max_u_b(p1_asub_p0_m, q1_asub_q0_m);            \
    hev_out = (thresh_in) < (v16u8) (flat_out);                      \
                                                                     \
    /* edge test value: 2 * |p0 - q0| + |p1 - q1| / 2, saturating */ \
    p0_asub_q0_m = __msa_adds_u_b(p0_asub_q0_m, p0_asub_q0_m);       \
    p1_asub_q1_m >>= 1;                                              \
    p0_asub_q0_m = __msa_adds_u_b(p0_asub_q0_m, p1_asub_q1_m);       \
                                                                     \
    /* lanes failing the b_limit test become all ones here, then   */ \
    /* every neighbour difference is folded in with a running max  */ \
    mask_out = (b_limit_in) < p0_asub_q0_m;                          \
    mask_out = __msa_max_u_b(flat_out, mask_out);                    \
    p3_asub_p2_m = __msa_max_u_b(p3_asub_p2_m, p2_asub_p1_m);        \
    mask_out = __msa_max_u_b(p3_asub_p2_m, mask_out);                \
    q2_asub_q1_m = __msa_max_u_b(q2_asub_q1_m, q3_asub_q2_m);        \
    mask_out = __msa_max_u_b(q2_asub_q1_m, mask_out);                \
                                                                     \
    /* lanes whose maximum exceeds limit are excluded; invert so   */ \
    /* that set lanes mean "filter this lane"                      */ \
    mask_out = (limit_in) < (v16u8) (mask_out);                      \
    mask_out = __msa_xori_b(mask_out, 0xff);                         \
}
195 
196 void ff_loop_filter_v_4_8_msa(uint8_t *src, ptrdiff_t pitch,
197  int32_t b_limit_ptr,
198  int32_t limit_ptr,
199  int32_t thresh_ptr)
200 {
201  uint64_t p1_d, p0_d, q0_d, q1_d;
202  v16u8 mask, hev, flat, thresh, b_limit, limit;
203  v16u8 p3, p2, p1, p0, q3, q2, q1, q0, p1_out, p0_out, q0_out, q1_out;
204 
205  /* load vector elements */
206  LD_UB8((src - 4 * pitch), pitch, p3, p2, p1, p0, q0, q1, q2, q3);
207 
208  thresh = (v16u8) __msa_fill_b(thresh_ptr);
209  b_limit = (v16u8) __msa_fill_b(b_limit_ptr);
210  limit = (v16u8) __msa_fill_b(limit_ptr);
211 
212  LPF_MASK_HEV(p3, p2, p1, p0, q0, q1, q2, q3, limit, b_limit, thresh,
213  hev, mask, flat);
214  VP9_LPF_FILTER4_4W(p1, p0, q0, q1, mask, hev, p1_out, p0_out, q0_out,
215  q1_out);
216 
217  p1_d = __msa_copy_u_d((v2i64) p1_out, 0);
218  p0_d = __msa_copy_u_d((v2i64) p0_out, 0);
219  q0_d = __msa_copy_u_d((v2i64) q0_out, 0);
220  q1_d = __msa_copy_u_d((v2i64) q1_out, 0);
221  SD4(p1_d, p0_d, q0_d, q1_d, (src - 2 * pitch), pitch);
222 }
223 
224 
225 void ff_loop_filter_v_44_16_msa(uint8_t *src, ptrdiff_t pitch,
226  int32_t b_limit_ptr,
227  int32_t limit_ptr,
228  int32_t thresh_ptr)
229 {
230  v16u8 mask, hev, flat, thresh0, b_limit0, limit0, thresh1, b_limit1, limit1;
231  v16u8 p3, p2, p1, p0, q3, q2, q1, q0;
232 
233  /* load vector elements */
234  LD_UB8((src - 4 * pitch), pitch, p3, p2, p1, p0, q0, q1, q2, q3);
235 
236  thresh0 = (v16u8) __msa_fill_b(thresh_ptr);
237  thresh1 = (v16u8) __msa_fill_b(thresh_ptr >> 8);
238  thresh0 = (v16u8) __msa_ilvr_d((v2i64) thresh1, (v2i64) thresh0);
239 
240  b_limit0 = (v16u8) __msa_fill_b(b_limit_ptr);
241  b_limit1 = (v16u8) __msa_fill_b(b_limit_ptr >> 8);
242  b_limit0 = (v16u8) __msa_ilvr_d((v2i64) b_limit1, (v2i64) b_limit0);
243 
244  limit0 = (v16u8) __msa_fill_b(limit_ptr);
245  limit1 = (v16u8) __msa_fill_b(limit_ptr >> 8);
246  limit0 = (v16u8) __msa_ilvr_d((v2i64) limit1, (v2i64) limit0);
247 
248  LPF_MASK_HEV(p3, p2, p1, p0, q0, q1, q2, q3, limit0, b_limit0, thresh0,
249  hev, mask, flat);
250  VP9_LPF_FILTER4_4W(p1, p0, q0, q1, mask, hev, p1, p0, q0, q1);
251 
252  ST_UB4(p1, p0, q0, q1, (src - 2 * pitch), pitch);
253 }
254 
255 void ff_loop_filter_v_8_8_msa(uint8_t *src, ptrdiff_t pitch,
256  int32_t b_limit_ptr,
257  int32_t limit_ptr,
258  int32_t thresh_ptr)
259 {
260  uint64_t p2_d, p1_d, p0_d, q0_d, q1_d, q2_d;
261  v16u8 mask, hev, flat, thresh, b_limit, limit;
262  v16u8 p3, p2, p1, p0, q3, q2, q1, q0;
263  v16u8 p2_out, p1_out, p0_out, q0_out, q1_out, q2_out;
264  v8i16 p2_filter8, p1_filter8, p0_filter8;
265  v8i16 q0_filter8, q1_filter8, q2_filter8;
266  v8u16 p3_r, p2_r, p1_r, p0_r, q3_r, q2_r, q1_r, q0_r;
267  v16i8 zero = { 0 };
268 
269  /* load vector elements */
270  LD_UB8((src - 4 * pitch), pitch, p3, p2, p1, p0, q0, q1, q2, q3);
271 
272  thresh = (v16u8) __msa_fill_b(thresh_ptr);
273  b_limit = (v16u8) __msa_fill_b(b_limit_ptr);
274  limit = (v16u8) __msa_fill_b(limit_ptr);
275 
276  LPF_MASK_HEV(p3, p2, p1, p0, q0, q1, q2, q3, limit, b_limit, thresh,
277  hev, mask, flat);
278  VP9_FLAT4(p3, p2, p0, q0, q2, q3, flat);
279  VP9_LPF_FILTER4_4W(p1, p0, q0, q1, mask, hev, p1_out, p0_out, q0_out,
280  q1_out);
281 
282  flat = (v16u8) __msa_ilvr_d((v2i64) zero, (v2i64) flat);
283 
284  /* if flat is zero for all pixels, then no need to calculate other filter */
285  if (__msa_test_bz_v(flat)) {
286  p1_d = __msa_copy_u_d((v2i64) p1_out, 0);
287  p0_d = __msa_copy_u_d((v2i64) p0_out, 0);
288  q0_d = __msa_copy_u_d((v2i64) q0_out, 0);
289  q1_d = __msa_copy_u_d((v2i64) q1_out, 0);
290  SD4(p1_d, p0_d, q0_d, q1_d, (src - 2 * pitch), pitch);
291  } else {
292  ILVR_B8_UH(zero, p3, zero, p2, zero, p1, zero, p0, zero, q0, zero, q1,
293  zero, q2, zero, q3, p3_r, p2_r, p1_r, p0_r, q0_r, q1_r,
294  q2_r, q3_r);
295  VP9_FILTER8(p3_r, p2_r, p1_r, p0_r, q0_r, q1_r, q2_r, q3_r, p2_filter8,
296  p1_filter8, p0_filter8, q0_filter8, q1_filter8, q2_filter8);
297 
298  /* convert 16 bit output data into 8 bit */
299  PCKEV_B4_SH(zero, p2_filter8, zero, p1_filter8, zero, p0_filter8,
300  zero, q0_filter8, p2_filter8, p1_filter8, p0_filter8,
301  q0_filter8);
302  PCKEV_B2_SH(zero, q1_filter8, zero, q2_filter8, q1_filter8, q2_filter8);
303 
304  /* store pixel values */
305  p2_out = __msa_bmnz_v(p2, (v16u8) p2_filter8, flat);
306  p1_out = __msa_bmnz_v(p1_out, (v16u8) p1_filter8, flat);
307  p0_out = __msa_bmnz_v(p0_out, (v16u8) p0_filter8, flat);
308  q0_out = __msa_bmnz_v(q0_out, (v16u8) q0_filter8, flat);
309  q1_out = __msa_bmnz_v(q1_out, (v16u8) q1_filter8, flat);
310  q2_out = __msa_bmnz_v(q2, (v16u8) q2_filter8, flat);
311 
312  p2_d = __msa_copy_u_d((v2i64) p2_out, 0);
313  p1_d = __msa_copy_u_d((v2i64) p1_out, 0);
314  p0_d = __msa_copy_u_d((v2i64) p0_out, 0);
315  q0_d = __msa_copy_u_d((v2i64) q0_out, 0);
316  q1_d = __msa_copy_u_d((v2i64) q1_out, 0);
317  q2_d = __msa_copy_u_d((v2i64) q2_out, 0);
318 
319  src -= 3 * pitch;
320 
321  SD4(p2_d, p1_d, p0_d, q0_d, src, pitch);
322  src += (4 * pitch);
323  SD(q1_d, src);
324  src += pitch;
325  SD(q2_d, src);
326  }
327 }
328 
329 void ff_loop_filter_v_88_16_msa(uint8_t *src, ptrdiff_t pitch,
330  int32_t b_limit_ptr,
331  int32_t limit_ptr,
332  int32_t thresh_ptr)
333 {
334  v16u8 p3, p2, p1, p0, q3, q2, q1, q0;
335  v16u8 p2_out, p1_out, p0_out, q0_out, q1_out, q2_out;
336  v16u8 flat, mask, hev, tmp, thresh, b_limit, limit;
337  v8u16 p3_r, p2_r, p1_r, p0_r, q0_r, q1_r, q2_r, q3_r;
338  v8u16 p3_l, p2_l, p1_l, p0_l, q0_l, q1_l, q2_l, q3_l;
339  v8i16 p2_filt8_r, p1_filt8_r, p0_filt8_r;
340  v8i16 q0_filt8_r, q1_filt8_r, q2_filt8_r;
341  v8i16 p2_filt8_l, p1_filt8_l, p0_filt8_l;
342  v8i16 q0_filt8_l, q1_filt8_l, q2_filt8_l;
343  v16u8 zero = { 0 };
344 
345  /* load vector elements */
346  LD_UB8(src - (4 * pitch), pitch, p3, p2, p1, p0, q0, q1, q2, q3);
347 
348  thresh = (v16u8) __msa_fill_b(thresh_ptr);
349  tmp = (v16u8) __msa_fill_b(thresh_ptr >> 8);
350  thresh = (v16u8) __msa_ilvr_d((v2i64) tmp, (v2i64) thresh);
351 
352  b_limit = (v16u8) __msa_fill_b(b_limit_ptr);
353  tmp = (v16u8) __msa_fill_b(b_limit_ptr >> 8);
354  b_limit = (v16u8) __msa_ilvr_d((v2i64) tmp, (v2i64) b_limit);
355 
356  limit = (v16u8) __msa_fill_b(limit_ptr);
357  tmp = (v16u8) __msa_fill_b(limit_ptr >> 8);
358  limit = (v16u8) __msa_ilvr_d((v2i64) tmp, (v2i64) limit);
359 
360  /* mask and hev */
361  LPF_MASK_HEV(p3, p2, p1, p0, q0, q1, q2, q3, limit, b_limit, thresh,
362  hev, mask, flat);
363  VP9_FLAT4(p3, p2, p0, q0, q2, q3, flat);
364  VP9_LPF_FILTER4_4W(p1, p0, q0, q1, mask, hev, p1_out, p0_out, q0_out,
365  q1_out);
366 
367  /* if flat is zero for all pixels, then no need to calculate other filter */
368  if (__msa_test_bz_v(flat)) {
369  ST_UB4(p1_out, p0_out, q0_out, q1_out, (src - 2 * pitch), pitch);
370  } else {
371  ILVR_B8_UH(zero, p3, zero, p2, zero, p1, zero, p0, zero, q0, zero, q1,
372  zero, q2, zero, q3, p3_r, p2_r, p1_r, p0_r, q0_r, q1_r,
373  q2_r, q3_r);
374  VP9_FILTER8(p3_r, p2_r, p1_r, p0_r, q0_r, q1_r, q2_r, q3_r, p2_filt8_r,
375  p1_filt8_r, p0_filt8_r, q0_filt8_r, q1_filt8_r, q2_filt8_r);
376 
377  ILVL_B4_UH(zero, p3, zero, p2, zero, p1, zero, p0, p3_l, p2_l, p1_l,
378  p0_l);
379  ILVL_B4_UH(zero, q0, zero, q1, zero, q2, zero, q3, q0_l, q1_l, q2_l,
380  q3_l);
381  VP9_FILTER8(p3_l, p2_l, p1_l, p0_l, q0_l, q1_l, q2_l, q3_l, p2_filt8_l,
382  p1_filt8_l, p0_filt8_l, q0_filt8_l, q1_filt8_l, q2_filt8_l);
383 
384  /* convert 16 bit output data into 8 bit */
385  PCKEV_B4_SH(p2_filt8_l, p2_filt8_r, p1_filt8_l, p1_filt8_r, p0_filt8_l,
386  p0_filt8_r, q0_filt8_l, q0_filt8_r, p2_filt8_r, p1_filt8_r,
387  p0_filt8_r, q0_filt8_r);
388  PCKEV_B2_SH(q1_filt8_l, q1_filt8_r, q2_filt8_l, q2_filt8_r,
389  q1_filt8_r, q2_filt8_r);
390 
391  /* store pixel values */
392  p2_out = __msa_bmnz_v(p2, (v16u8) p2_filt8_r, flat);
393  p1_out = __msa_bmnz_v(p1_out, (v16u8) p1_filt8_r, flat);
394  p0_out = __msa_bmnz_v(p0_out, (v16u8) p0_filt8_r, flat);
395  q0_out = __msa_bmnz_v(q0_out, (v16u8) q0_filt8_r, flat);
396  q1_out = __msa_bmnz_v(q1_out, (v16u8) q1_filt8_r, flat);
397  q2_out = __msa_bmnz_v(q2, (v16u8) q2_filt8_r, flat);
398 
399  src -= 3 * pitch;
400 
401  ST_UB4(p2_out, p1_out, p0_out, q0_out, src, pitch);
402  src += (4 * pitch);
403  ST_UB2(q1_out, q2_out, src, pitch);
404  src += (2 * pitch);
405  }
406 }
407 
408 void ff_loop_filter_v_84_16_msa(uint8_t *src, ptrdiff_t pitch,
409  int32_t b_limit_ptr,
410  int32_t limit_ptr,
411  int32_t thresh_ptr)
412 {
413  v16u8 p3, p2, p1, p0, q3, q2, q1, q0;
414  v16u8 p2_out, p1_out, p0_out, q0_out, q1_out, q2_out;
415  v16u8 flat, mask, hev, tmp, thresh, b_limit, limit;
416  v8u16 p3_r, p2_r, p1_r, p0_r, q0_r, q1_r, q2_r, q3_r;
417  v8i16 p2_filt8_r, p1_filt8_r, p0_filt8_r;
418  v8i16 q0_filt8_r, q1_filt8_r, q2_filt8_r;
419  v16u8 zero = { 0 };
420 
421  /* load vector elements */
422  LD_UB8(src - (4 * pitch), pitch, p3, p2, p1, p0, q0, q1, q2, q3);
423 
424  thresh = (v16u8) __msa_fill_b(thresh_ptr);
425  tmp = (v16u8) __msa_fill_b(thresh_ptr >> 8);
426  thresh = (v16u8) __msa_ilvr_d((v2i64) tmp, (v2i64) thresh);
427 
428  b_limit = (v16u8) __msa_fill_b(b_limit_ptr);
429  tmp = (v16u8) __msa_fill_b(b_limit_ptr >> 8);
430  b_limit = (v16u8) __msa_ilvr_d((v2i64) tmp, (v2i64) b_limit);
431 
432  limit = (v16u8) __msa_fill_b(limit_ptr);
433  tmp = (v16u8) __msa_fill_b(limit_ptr >> 8);
434  limit = (v16u8) __msa_ilvr_d((v2i64) tmp, (v2i64) limit);
435 
436  /* mask and hev */
437  LPF_MASK_HEV(p3, p2, p1, p0, q0, q1, q2, q3, limit, b_limit, thresh,
438  hev, mask, flat);
439  VP9_FLAT4(p3, p2, p0, q0, q2, q3, flat);
440  VP9_LPF_FILTER4_4W(p1, p0, q0, q1, mask, hev, p1_out, p0_out, q0_out,
441  q1_out);
442 
443  flat = (v16u8) __msa_ilvr_d((v2i64) zero, (v2i64) flat);
444 
445  /* if flat is zero for all pixels, then no need to calculate other filter */
446  if (__msa_test_bz_v(flat)) {
447  ST_UB4(p1_out, p0_out, q0_out, q1_out, (src - 2 * pitch), pitch);
448  } else {
449  ILVR_B8_UH(zero, p3, zero, p2, zero, p1, zero, p0, zero, q0, zero, q1,
450  zero, q2, zero, q3, p3_r, p2_r, p1_r, p0_r, q0_r, q1_r,
451  q2_r, q3_r);
452  VP9_FILTER8(p3_r, p2_r, p1_r, p0_r, q0_r, q1_r, q2_r, q3_r, p2_filt8_r,
453  p1_filt8_r, p0_filt8_r, q0_filt8_r, q1_filt8_r, q2_filt8_r);
454 
455  /* convert 16 bit output data into 8 bit */
456  PCKEV_B4_SH(p2_filt8_r, p2_filt8_r, p1_filt8_r, p1_filt8_r,
457  p0_filt8_r, p0_filt8_r, q0_filt8_r, q0_filt8_r,
458  p2_filt8_r, p1_filt8_r, p0_filt8_r, q0_filt8_r);
459  PCKEV_B2_SH(q1_filt8_r, q1_filt8_r, q2_filt8_r, q2_filt8_r,
460  q1_filt8_r, q2_filt8_r);
461 
462  /* store pixel values */
463  p2_out = __msa_bmnz_v(p2, (v16u8) p2_filt8_r, flat);
464  p1_out = __msa_bmnz_v(p1_out, (v16u8) p1_filt8_r, flat);
465  p0_out = __msa_bmnz_v(p0_out, (v16u8) p0_filt8_r, flat);
466  q0_out = __msa_bmnz_v(q0_out, (v16u8) q0_filt8_r, flat);
467  q1_out = __msa_bmnz_v(q1_out, (v16u8) q1_filt8_r, flat);
468  q2_out = __msa_bmnz_v(q2, (v16u8) q2_filt8_r, flat);
469 
470  src -= 3 * pitch;
471 
472  ST_UB4(p2_out, p1_out, p0_out, q0_out, src, pitch);
473  src += (4 * pitch);
474  ST_UB2(q1_out, q2_out, src, pitch);
475  src += (2 * pitch);
476  }
477 }
478 
479 void ff_loop_filter_v_48_16_msa(uint8_t *src, ptrdiff_t pitch,
480  int32_t b_limit_ptr,
481  int32_t limit_ptr,
482  int32_t thresh_ptr)
483 {
484  v16u8 p3, p2, p1, p0, q3, q2, q1, q0;
485  v16u8 p2_out, p1_out, p0_out, q0_out, q1_out, q2_out;
486  v16u8 flat, mask, hev, tmp, thresh, b_limit, limit;
487  v8u16 p3_l, p2_l, p1_l, p0_l, q0_l, q1_l, q2_l, q3_l;
488  v8i16 p2_filt8_l, p1_filt8_l, p0_filt8_l;
489  v8i16 q0_filt8_l, q1_filt8_l, q2_filt8_l;
490  v16u8 zero = { 0 };
491 
492  /* load vector elements */
493  LD_UB8(src - (4 * pitch), pitch, p3, p2, p1, p0, q0, q1, q2, q3);
494 
495  thresh = (v16u8) __msa_fill_b(thresh_ptr);
496  tmp = (v16u8) __msa_fill_b(thresh_ptr >> 8);
497  thresh = (v16u8) __msa_ilvr_d((v2i64) tmp, (v2i64) thresh);
498 
499  b_limit = (v16u8) __msa_fill_b(b_limit_ptr);
500  tmp = (v16u8) __msa_fill_b(b_limit_ptr >> 8);
501  b_limit = (v16u8) __msa_ilvr_d((v2i64) tmp, (v2i64) b_limit);
502 
503  limit = (v16u8) __msa_fill_b(limit_ptr);
504  tmp = (v16u8) __msa_fill_b(limit_ptr >> 8);
505  limit = (v16u8) __msa_ilvr_d((v2i64) tmp, (v2i64) limit);
506 
507  /* mask and hev */
508  LPF_MASK_HEV(p3, p2, p1, p0, q0, q1, q2, q3, limit, b_limit, thresh,
509  hev, mask, flat);
510  VP9_FLAT4(p3, p2, p0, q0, q2, q3, flat);
511  VP9_LPF_FILTER4_4W(p1, p0, q0, q1, mask, hev, p1_out, p0_out, q0_out,
512  q1_out);
513 
514  flat = (v16u8) __msa_insve_d((v2i64) flat, 0, (v2i64) zero);
515 
516  /* if flat is zero for all pixels, then no need to calculate other filter */
517  if (__msa_test_bz_v(flat)) {
518  ST_UB4(p1_out, p0_out, q0_out, q1_out, (src - 2 * pitch), pitch);
519  } else {
520  ILVL_B4_UH(zero, p3, zero, p2, zero, p1, zero, p0, p3_l, p2_l, p1_l,
521  p0_l);
522  ILVL_B4_UH(zero, q0, zero, q1, zero, q2, zero, q3, q0_l, q1_l, q2_l,
523  q3_l);
524  VP9_FILTER8(p3_l, p2_l, p1_l, p0_l, q0_l, q1_l, q2_l, q3_l, p2_filt8_l,
525  p1_filt8_l, p0_filt8_l, q0_filt8_l, q1_filt8_l, q2_filt8_l);
526 
527  /* convert 16 bit output data into 8 bit */
528  PCKEV_B4_SH(p2_filt8_l, p2_filt8_l, p1_filt8_l, p1_filt8_l,
529  p0_filt8_l, p0_filt8_l, q0_filt8_l, q0_filt8_l,
530  p2_filt8_l, p1_filt8_l, p0_filt8_l, q0_filt8_l);
531  PCKEV_B2_SH(q1_filt8_l, q1_filt8_l, q2_filt8_l, q2_filt8_l,
532  q1_filt8_l, q2_filt8_l);
533 
534  /* store pixel values */
535  p2_out = __msa_bmnz_v(p2, (v16u8) p2_filt8_l, flat);
536  p1_out = __msa_bmnz_v(p1_out, (v16u8) p1_filt8_l, flat);
537  p0_out = __msa_bmnz_v(p0_out, (v16u8) p0_filt8_l, flat);
538  q0_out = __msa_bmnz_v(q0_out, (v16u8) q0_filt8_l, flat);
539  q1_out = __msa_bmnz_v(q1_out, (v16u8) q1_filt8_l, flat);
540  q2_out = __msa_bmnz_v(q2, (v16u8) q2_filt8_l, flat);
541 
542  src -= 3 * pitch;
543 
544  ST_UB4(p2_out, p1_out, p0_out, q0_out, src, pitch);
545  src += (4 * pitch);
546  ST_UB2(q1_out, q2_out, src, pitch);
547  src += (2 * pitch);
548  }
549 }
550 
551 static int32_t vp9_hz_lpf_t4_and_t8_16w(uint8_t *src, ptrdiff_t pitch,
552  uint8_t *filter48,
553  int32_t b_limit_ptr,
554  int32_t limit_ptr,
555  int32_t thresh_ptr)
556 {
557  v16u8 p3, p2, p1, p0, q3, q2, q1, q0;
558  v16u8 p2_out, p1_out, p0_out, q0_out, q1_out, q2_out;
559  v16u8 flat, mask, hev, thresh, b_limit, limit;
560  v8u16 p3_r, p2_r, p1_r, p0_r, q0_r, q1_r, q2_r, q3_r;
561  v8u16 p3_l, p2_l, p1_l, p0_l, q0_l, q1_l, q2_l, q3_l;
562  v8i16 p2_filt8_r, p1_filt8_r, p0_filt8_r;
563  v8i16 q0_filt8_r, q1_filt8_r, q2_filt8_r;
564  v8i16 p2_filt8_l, p1_filt8_l, p0_filt8_l;
565  v8i16 q0_filt8_l, q1_filt8_l, q2_filt8_l;
566  v16u8 zero = { 0 };
567 
568  /* load vector elements */
569  LD_UB8(src - (4 * pitch), pitch, p3, p2, p1, p0, q0, q1, q2, q3);
570 
571  thresh = (v16u8) __msa_fill_b(thresh_ptr);
572  b_limit = (v16u8) __msa_fill_b(b_limit_ptr);
573  limit = (v16u8) __msa_fill_b(limit_ptr);
574 
575  /* mask and hev */
576  LPF_MASK_HEV(p3, p2, p1, p0, q0, q1, q2, q3, limit, b_limit, thresh,
577  hev, mask, flat);
578  VP9_FLAT4(p3, p2, p0, q0, q2, q3, flat);
579  VP9_LPF_FILTER4_4W(p1, p0, q0, q1, mask, hev, p1_out, p0_out, q0_out,
580  q1_out);
581 
582  /* if flat is zero for all pixels, then no need to calculate other filter */
583  if (__msa_test_bz_v(flat)) {
584  ST_UB4(p1_out, p0_out, q0_out, q1_out, (src - 2 * pitch), pitch);
585 
586  return 1;
587  } else {
588  ILVR_B8_UH(zero, p3, zero, p2, zero, p1, zero, p0, zero, q0, zero, q1,
589  zero, q2, zero, q3, p3_r, p2_r, p1_r, p0_r, q0_r, q1_r,
590  q2_r, q3_r);
591  VP9_FILTER8(p3_r, p2_r, p1_r, p0_r, q0_r, q1_r, q2_r, q3_r, p2_filt8_r,
592  p1_filt8_r, p0_filt8_r, q0_filt8_r, q1_filt8_r, q2_filt8_r);
593 
594  ILVL_B4_UH(zero, p3, zero, p2, zero, p1, zero, p0, p3_l, p2_l, p1_l,
595  p0_l);
596  ILVL_B4_UH(zero, q0, zero, q1, zero, q2, zero, q3, q0_l, q1_l, q2_l,
597  q3_l);
598  VP9_FILTER8(p3_l, p2_l, p1_l, p0_l, q0_l, q1_l, q2_l, q3_l, p2_filt8_l,
599  p1_filt8_l, p0_filt8_l, q0_filt8_l, q1_filt8_l, q2_filt8_l);
600 
601  /* convert 16 bit output data into 8 bit */
602  PCKEV_B4_SH(p2_filt8_l, p2_filt8_r, p1_filt8_l, p1_filt8_r, p0_filt8_l,
603  p0_filt8_r, q0_filt8_l, q0_filt8_r, p2_filt8_r, p1_filt8_r,
604  p0_filt8_r, q0_filt8_r);
605  PCKEV_B2_SH(q1_filt8_l, q1_filt8_r, q2_filt8_l, q2_filt8_r, q1_filt8_r,
606  q2_filt8_r);
607 
608  /* store pixel values */
609  p2_out = __msa_bmnz_v(p2, (v16u8) p2_filt8_r, flat);
610  p1_out = __msa_bmnz_v(p1_out, (v16u8) p1_filt8_r, flat);
611  p0_out = __msa_bmnz_v(p0_out, (v16u8) p0_filt8_r, flat);
612  q0_out = __msa_bmnz_v(q0_out, (v16u8) q0_filt8_r, flat);
613  q1_out = __msa_bmnz_v(q1_out, (v16u8) q1_filt8_r, flat);
614  q2_out = __msa_bmnz_v(q2, (v16u8) q2_filt8_r, flat);
615 
616  ST_UB4(p2_out, p1_out, p0_out, q0_out, filter48, 16);
617  filter48 += (4 * 16);
618  ST_UB2(q1_out, q2_out, filter48, 16);
619  filter48 += (2 * 16);
620  ST_UB(flat, filter48);
621 
622  return 0;
623  }
624 }
625 
626 static void vp9_hz_lpf_t16_16w(uint8_t *src, ptrdiff_t pitch, uint8_t *filter48)
627 {
628  v16u8 flat, flat2, filter8;
629  v16i8 zero = { 0 };
630  v16u8 p7, p6, p5, p4, p3, p2, p1, p0, q0, q1, q2, q3, q4, q5, q6, q7;
631  v8u16 p7_r_in, p6_r_in, p5_r_in, p4_r_in;
632  v8u16 p3_r_in, p2_r_in, p1_r_in, p0_r_in;
633  v8u16 q7_r_in, q6_r_in, q5_r_in, q4_r_in;
634  v8u16 q3_r_in, q2_r_in, q1_r_in, q0_r_in;
635  v8u16 p7_l_in, p6_l_in, p5_l_in, p4_l_in;
636  v8u16 p3_l_in, p2_l_in, p1_l_in, p0_l_in;
637  v8u16 q7_l_in, q6_l_in, q5_l_in, q4_l_in;
638  v8u16 q3_l_in, q2_l_in, q1_l_in, q0_l_in;
639  v8u16 tmp0_r, tmp1_r, tmp0_l, tmp1_l;
640  v8i16 l_out, r_out;
641 
642  flat = LD_UB(filter48 + 96);
643 
644  LD_UB8((src - 8 * pitch), pitch, p7, p6, p5, p4, p3, p2, p1, p0);
645  LD_UB8(src, pitch, q0, q1, q2, q3, q4, q5, q6, q7);
646  VP9_FLAT5(p7, p6, p5, p4, p0, q0, q4, q5, q6, q7, flat, flat2);
647 
648  /* if flat2 is zero for all pixels, then no need to calculate other filter */
649  if (__msa_test_bz_v(flat2)) {
650  LD_UB4(filter48, 16, p2, p1, p0, q0);
651  LD_UB2(filter48 + 4 * 16, 16, q1, q2);
652 
653  src -= 3 * pitch;
654  ST_UB4(p2, p1, p0, q0, src, pitch);
655  src += (4 * pitch);
656  ST_UB2(q1, q2, src, pitch);
657  } else {
658  src -= 7 * pitch;
659 
660  ILVR_B8_UH(zero, p7, zero, p6, zero, p5, zero, p4, zero, p3, zero, p2,
661  zero, p1, zero, p0, p7_r_in, p6_r_in, p5_r_in, p4_r_in,
662  p3_r_in, p2_r_in, p1_r_in, p0_r_in);
663 
664  q0_r_in = (v8u16) __msa_ilvr_b(zero, (v16i8) q0);
665 
666  tmp0_r = p7_r_in << 3;
667  tmp0_r -= p7_r_in;
668  tmp0_r += p6_r_in;
669  tmp0_r += q0_r_in;
670  tmp1_r = p6_r_in + p5_r_in;
671  tmp1_r += p4_r_in;
672  tmp1_r += p3_r_in;
673  tmp1_r += p2_r_in;
674  tmp1_r += p1_r_in;
675  tmp1_r += p0_r_in;
676  tmp1_r += tmp0_r;
677  r_out = __msa_srari_h((v8i16) tmp1_r, 4);
678 
679  ILVL_B4_UH(zero, p7, zero, p6, zero, p5, zero, p4, p7_l_in, p6_l_in,
680  p5_l_in, p4_l_in);
681  ILVL_B4_UH(zero, p3, zero, p2, zero, p1, zero, p0, p3_l_in, p2_l_in,
682  p1_l_in, p0_l_in);
683  q0_l_in = (v8u16) __msa_ilvl_b(zero, (v16i8) q0);
684 
685  tmp0_l = p7_l_in << 3;
686  tmp0_l -= p7_l_in;
687  tmp0_l += p6_l_in;
688  tmp0_l += q0_l_in;
689  tmp1_l = p6_l_in + p5_l_in;
690  tmp1_l += p4_l_in;
691  tmp1_l += p3_l_in;
692  tmp1_l += p2_l_in;
693  tmp1_l += p1_l_in;
694  tmp1_l += p0_l_in;
695  tmp1_l += tmp0_l;
696  l_out = __msa_srari_h((v8i16) tmp1_l, 4);
697 
698  r_out = (v8i16) __msa_pckev_b((v16i8) l_out, (v16i8) r_out);
699  p6 = __msa_bmnz_v(p6, (v16u8) r_out, flat2);
700  ST_UB(p6, src);
701  src += pitch;
702 
703  /* p5 */
704  q1_r_in = (v8u16) __msa_ilvr_b(zero, (v16i8) q1);
705  tmp0_r = p5_r_in - p6_r_in;
706  tmp0_r += q1_r_in;
707  tmp0_r -= p7_r_in;
708  tmp1_r += tmp0_r;
709  r_out = __msa_srari_h((v8i16) tmp1_r, 4);
710 
711  q1_l_in = (v8u16) __msa_ilvl_b(zero, (v16i8) q1);
712  tmp0_l = p5_l_in - p6_l_in;
713  tmp0_l += q1_l_in;
714  tmp0_l -= p7_l_in;
715  tmp1_l += tmp0_l;
716  l_out = __msa_srari_h((v8i16) tmp1_l, 4);
717 
718  r_out = (v8i16) __msa_pckev_b((v16i8) l_out, (v16i8) r_out);
719  p5 = __msa_bmnz_v(p5, (v16u8) r_out, flat2);
720  ST_UB(p5, src);
721  src += pitch;
722 
723  /* p4 */
724  q2_r_in = (v8u16) __msa_ilvr_b(zero, (v16i8) q2);
725  tmp0_r = p4_r_in - p5_r_in;
726  tmp0_r += q2_r_in;
727  tmp0_r -= p7_r_in;
728  tmp1_r += tmp0_r;
729  r_out = (v8i16) __msa_srari_h((v8i16) tmp1_r, 4);
730 
731  q2_l_in = (v8u16) __msa_ilvl_b(zero, (v16i8) q2);
732  tmp0_l = p4_l_in - p5_l_in;
733  tmp0_l += q2_l_in;
734  tmp0_l -= p7_l_in;
735  tmp1_l += tmp0_l;
736  l_out = __msa_srari_h((v8i16) tmp1_l, 4);
737 
738  r_out = (v8i16) __msa_pckev_b((v16i8) l_out, (v16i8) r_out);
739  p4 = __msa_bmnz_v(p4, (v16u8) r_out, flat2);
740  ST_UB(p4, src);
741  src += pitch;
742 
743  /* p3 */
744  q3_r_in = (v8u16) __msa_ilvr_b(zero, (v16i8) q3);
745  tmp0_r = p3_r_in - p4_r_in;
746  tmp0_r += q3_r_in;
747  tmp0_r -= p7_r_in;
748  tmp1_r += tmp0_r;
749  r_out = __msa_srari_h((v8i16) tmp1_r, 4);
750 
751  q3_l_in = (v8u16) __msa_ilvl_b(zero, (v16i8) q3);
752  tmp0_l = p3_l_in - p4_l_in;
753  tmp0_l += q3_l_in;
754  tmp0_l -= p7_l_in;
755  tmp1_l += tmp0_l;
756  l_out = __msa_srari_h((v8i16) tmp1_l, 4);
757 
758  r_out = (v8i16) __msa_pckev_b((v16i8) l_out, (v16i8) r_out);
759  p3 = __msa_bmnz_v(p3, (v16u8) r_out, flat2);
760  ST_UB(p3, src);
761  src += pitch;
762 
763  /* p2 */
764  q4_r_in = (v8u16) __msa_ilvr_b(zero, (v16i8) q4);
765  filter8 = LD_UB(filter48);
766  tmp0_r = p2_r_in - p3_r_in;
767  tmp0_r += q4_r_in;
768  tmp0_r -= p7_r_in;
769  tmp1_r += tmp0_r;
770  r_out = __msa_srari_h((v8i16) tmp1_r, 4);
771 
772  q4_l_in = (v8u16) __msa_ilvl_b(zero, (v16i8) q4);
773  tmp0_l = p2_l_in - p3_l_in;
774  tmp0_l += q4_l_in;
775  tmp0_l -= p7_l_in;
776  tmp1_l += tmp0_l;
777  l_out = __msa_srari_h((v8i16) tmp1_l, 4);
778 
779  r_out = (v8i16) __msa_pckev_b((v16i8) l_out, (v16i8) r_out);
780  filter8 = __msa_bmnz_v(filter8, (v16u8) r_out, flat2);
781  ST_UB(filter8, src);
782  src += pitch;
783 
784  /* p1 */
785  q5_r_in = (v8u16) __msa_ilvr_b(zero, (v16i8) q5);
786  filter8 = LD_UB(filter48 + 16);
787  tmp0_r = p1_r_in - p2_r_in;
788  tmp0_r += q5_r_in;
789  tmp0_r -= p7_r_in;
790  tmp1_r += tmp0_r;
791  r_out = __msa_srari_h((v8i16) tmp1_r, 4);
792 
793  q5_l_in = (v8u16) __msa_ilvl_b(zero, (v16i8) q5);
794  tmp0_l = p1_l_in - p2_l_in;
795  tmp0_l += q5_l_in;
796  tmp0_l -= p7_l_in;
797  tmp1_l += tmp0_l;
798  l_out = __msa_srari_h((v8i16) tmp1_l, 4);
799 
800  r_out = (v8i16) __msa_pckev_b((v16i8) l_out, (v16i8) r_out);
801  filter8 = __msa_bmnz_v(filter8, (v16u8) r_out, flat2);
802  ST_UB(filter8, src);
803  src += pitch;
804 
805  /* p0 */
806  q6_r_in = (v8u16) __msa_ilvr_b(zero, (v16i8) q6);
807  filter8 = LD_UB(filter48 + 32);
808  tmp0_r = p0_r_in - p1_r_in;
809  tmp0_r += q6_r_in;
810  tmp0_r -= p7_r_in;
811  tmp1_r += tmp0_r;
812  r_out = __msa_srari_h((v8i16) tmp1_r, 4);
813 
814  q6_l_in = (v8u16) __msa_ilvl_b(zero, (v16i8) q6);
815  tmp0_l = p0_l_in - p1_l_in;
816  tmp0_l += q6_l_in;
817  tmp0_l -= p7_l_in;
818  tmp1_l += tmp0_l;
819  l_out = __msa_srari_h((v8i16) tmp1_l, 4);
820 
821  r_out = (v8i16) __msa_pckev_b((v16i8) l_out, (v16i8) r_out);
822  filter8 = __msa_bmnz_v(filter8, (v16u8) r_out, flat2);
823  ST_UB(filter8, src);
824  src += pitch;
825 
826  /* q0 */
827  q7_r_in = (v8u16) __msa_ilvr_b(zero, (v16i8) q7);
828  filter8 = LD_UB(filter48 + 48);
829  tmp0_r = q7_r_in - p0_r_in;
830  tmp0_r += q0_r_in;
831  tmp0_r -= p7_r_in;
832  tmp1_r += tmp0_r;
833  r_out = __msa_srari_h((v8i16) tmp1_r, 4);
834 
835  q7_l_in = (v8u16) __msa_ilvl_b(zero, (v16i8) q7);
836  tmp0_l = q7_l_in - p0_l_in;
837  tmp0_l += q0_l_in;
838  tmp0_l -= p7_l_in;
839  tmp1_l += tmp0_l;
840  l_out = __msa_srari_h((v8i16) tmp1_l, 4);
841 
842  r_out = (v8i16) __msa_pckev_b((v16i8) l_out, (v16i8) r_out);
843  filter8 = __msa_bmnz_v(filter8, (v16u8) r_out, flat2);
844  ST_UB(filter8, src);
845  src += pitch;
846 
847  /* q1 */
848  filter8 = LD_UB(filter48 + 64);
849  tmp0_r = q7_r_in - q0_r_in;
850  tmp0_r += q1_r_in;
851  tmp0_r -= p6_r_in;
852  tmp1_r += tmp0_r;
853  r_out = __msa_srari_h((v8i16) tmp1_r, 4);
854 
855  tmp0_l = q7_l_in - q0_l_in;
856  tmp0_l += q1_l_in;
857  tmp0_l -= p6_l_in;
858  tmp1_l += tmp0_l;
859  l_out = __msa_srari_h((v8i16) tmp1_l, 4);
860 
861  r_out = (v8i16) __msa_pckev_b((v16i8) l_out, (v16i8) r_out);
862  filter8 = __msa_bmnz_v(filter8, (v16u8) r_out, flat2);
863  ST_UB(filter8, src);
864  src += pitch;
865 
866  /* q2 */
867  filter8 = LD_UB(filter48 + 80);
868  tmp0_r = q7_r_in - q1_r_in;
869  tmp0_r += q2_r_in;
870  tmp0_r -= p5_r_in;
871  tmp1_r += tmp0_r;
872  r_out = __msa_srari_h((v8i16) tmp1_r, 4);
873 
874  tmp0_l = q7_l_in - q1_l_in;
875  tmp0_l += q2_l_in;
876  tmp0_l -= p5_l_in;
877  tmp1_l += tmp0_l;
878  l_out = __msa_srari_h((v8i16) tmp1_l, 4);
879 
880  r_out = (v8i16) __msa_pckev_b((v16i8) l_out, (v16i8) r_out);
881  filter8 = __msa_bmnz_v(filter8, (v16u8) r_out, flat2);
882  ST_UB(filter8, src);
883  src += pitch;
884 
885  /* q3 */
886  tmp0_r = q7_r_in - q2_r_in;
887  tmp0_r += q3_r_in;
888  tmp0_r -= p4_r_in;
889  tmp1_r += tmp0_r;
890  r_out = __msa_srari_h((v8i16) tmp1_r, 4);
891 
892  tmp0_l = q7_l_in - q2_l_in;
893  tmp0_l += q3_l_in;
894  tmp0_l -= p4_l_in;
895  tmp1_l += tmp0_l;
896  l_out = __msa_srari_h((v8i16) tmp1_l, 4);
897 
898  r_out = (v8i16) __msa_pckev_b((v16i8) l_out, (v16i8) r_out);
899  q3 = __msa_bmnz_v(q3, (v16u8) r_out, flat2);
900  ST_UB(q3, src);
901  src += pitch;
902 
903  /* q4 */
904  tmp0_r = q7_r_in - q3_r_in;
905  tmp0_r += q4_r_in;
906  tmp0_r -= p3_r_in;
907  tmp1_r += tmp0_r;
908  r_out = __msa_srari_h((v8i16) tmp1_r, 4);
909 
910  tmp0_l = q7_l_in - q3_l_in;
911  tmp0_l += q4_l_in;
912  tmp0_l -= p3_l_in;
913  tmp1_l += tmp0_l;
914  l_out = __msa_srari_h((v8i16) tmp1_l, 4);
915 
916  r_out = (v8i16) __msa_pckev_b((v16i8) l_out, (v16i8) r_out);
917  q4 = __msa_bmnz_v(q4, (v16u8) r_out, flat2);
918  ST_UB(q4, src);
919  src += pitch;
920 
921  /* q5 */
922  tmp0_r = q7_r_in - q4_r_in;
923  tmp0_r += q5_r_in;
924  tmp0_r -= p2_r_in;
925  tmp1_r += tmp0_r;
926  r_out = __msa_srari_h((v8i16) tmp1_r, 4);
927 
928  tmp0_l = q7_l_in - q4_l_in;
929  tmp0_l += q5_l_in;
930  tmp0_l -= p2_l_in;
931  tmp1_l += tmp0_l;
932  l_out = __msa_srari_h((v8i16) tmp1_l, 4);
933 
934  r_out = (v8i16) __msa_pckev_b((v16i8) l_out, (v16i8) r_out);
935  q5 = __msa_bmnz_v(q5, (v16u8) r_out, flat2);
936  ST_UB(q5, src);
937  src += pitch;
938 
939  /* q6 */
940  tmp0_r = q7_r_in - q5_r_in;
941  tmp0_r += q6_r_in;
942  tmp0_r -= p1_r_in;
943  tmp1_r += tmp0_r;
944  r_out = __msa_srari_h((v8i16) tmp1_r, 4);
945 
946  tmp0_l = q7_l_in - q5_l_in;
947  tmp0_l += q6_l_in;
948  tmp0_l -= p1_l_in;
949  tmp1_l += tmp0_l;
950  l_out = __msa_srari_h((v8i16) tmp1_l, 4);
951 
952  r_out = (v8i16) __msa_pckev_b((v16i8) l_out, (v16i8) r_out);
953  q6 = __msa_bmnz_v(q6, (v16u8) r_out, flat2);
954  ST_UB(q6, src);
955  }
956 }
957 
958 void ff_loop_filter_v_16_16_msa(uint8_t *src, ptrdiff_t pitch,
959  int32_t b_limit_ptr,
960  int32_t limit_ptr,
961  int32_t thresh_ptr)
962 {
963  uint8_t filter48[16 * 8] ALLOC_ALIGNED(ALIGNMENT);
964  uint8_t early_exit = 0;
965 
966  early_exit = vp9_hz_lpf_t4_and_t8_16w(src, pitch, &filter48[0],
967  b_limit_ptr, limit_ptr, thresh_ptr);
968 
969  if (0 == early_exit) {
970  vp9_hz_lpf_t16_16w(src, pitch, filter48);
971  }
972 }
973 
974 void ff_loop_filter_v_16_8_msa(uint8_t *src, ptrdiff_t pitch,
975  int32_t b_limit_ptr,
976  int32_t limit_ptr,
977  int32_t thresh_ptr)
978 {
979  uint64_t p2_d, p1_d, p0_d, q0_d, q1_d, q2_d;
980  uint64_t dword0, dword1;
981  v16u8 flat2, mask, hev, flat, thresh, b_limit, limit;
982  v16u8 p3, p2, p1, p0, q3, q2, q1, q0, p7, p6, p5, p4, q4, q5, q6, q7;
983  v16u8 p2_out, p1_out, p0_out, q0_out, q1_out, q2_out;
984  v16u8 p0_filter16, p1_filter16;
985  v8i16 p2_filter8, p1_filter8, p0_filter8;
986  v8i16 q0_filter8, q1_filter8, q2_filter8;
987  v8u16 p7_r, p6_r, p5_r, p4_r, q7_r, q6_r, q5_r, q4_r;
988  v8u16 p3_r, p2_r, p1_r, p0_r, q3_r, q2_r, q1_r, q0_r;
989  v16i8 zero = { 0 };
990  v8u16 tmp0, tmp1, tmp2;
991 
992  /* load vector elements */
993  LD_UB8((src - 4 * pitch), pitch, p3, p2, p1, p0, q0, q1, q2, q3);
994 
995  thresh = (v16u8) __msa_fill_b(thresh_ptr);
996  b_limit = (v16u8) __msa_fill_b(b_limit_ptr);
997  limit = (v16u8) __msa_fill_b(limit_ptr);
998 
999  LPF_MASK_HEV(p3, p2, p1, p0, q0, q1, q2, q3, limit, b_limit, thresh,
1000  hev, mask, flat);
1001  VP9_FLAT4(p3, p2, p0, q0, q2, q3, flat);
1002  VP9_LPF_FILTER4_4W(p1, p0, q0, q1, mask, hev, p1_out, p0_out, q0_out,
1003  q1_out);
1004 
1005  flat = (v16u8) __msa_ilvr_d((v2i64) zero, (v2i64) flat);
1006 
1007  /* if flat is zero for all pixels, then no need to calculate other filter */
1008  if (__msa_test_bz_v(flat)) {
1009  p1_d = __msa_copy_u_d((v2i64) p1_out, 0);
1010  p0_d = __msa_copy_u_d((v2i64) p0_out, 0);
1011  q0_d = __msa_copy_u_d((v2i64) q0_out, 0);
1012  q1_d = __msa_copy_u_d((v2i64) q1_out, 0);
1013  SD4(p1_d, p0_d, q0_d, q1_d, src - 2 * pitch, pitch);
1014  } else {
1015  /* convert 8 bit input data into 16 bit */
1016  ILVR_B8_UH(zero, p3, zero, p2, zero, p1, zero, p0, zero, q0, zero,
1017  q1, zero, q2, zero, q3, p3_r, p2_r, p1_r, p0_r, q0_r,
1018  q1_r, q2_r, q3_r);
1019  VP9_FILTER8(p3_r, p2_r, p1_r, p0_r, q0_r, q1_r, q2_r, q3_r,
1020  p2_filter8, p1_filter8, p0_filter8, q0_filter8,
1021  q1_filter8, q2_filter8);
1022 
1023  /* convert 16 bit output data into 8 bit */
1024  PCKEV_B4_SH(zero, p2_filter8, zero, p1_filter8, zero, p0_filter8,
1025  zero, q0_filter8, p2_filter8, p1_filter8, p0_filter8,
1026  q0_filter8);
1027  PCKEV_B2_SH(zero, q1_filter8, zero, q2_filter8, q1_filter8,
1028  q2_filter8);
1029 
1030  /* store pixel values */
1031  p2_out = __msa_bmnz_v(p2, (v16u8) p2_filter8, flat);
1032  p1_out = __msa_bmnz_v(p1_out, (v16u8) p1_filter8, flat);
1033  p0_out = __msa_bmnz_v(p0_out, (v16u8) p0_filter8, flat);
1034  q0_out = __msa_bmnz_v(q0_out, (v16u8) q0_filter8, flat);
1035  q1_out = __msa_bmnz_v(q1_out, (v16u8) q1_filter8, flat);
1036  q2_out = __msa_bmnz_v(q2, (v16u8) q2_filter8, flat);
1037 
1038  /* load 16 vector elements */
1039  LD_UB4((src - 8 * pitch), pitch, p7, p6, p5, p4);
1040  LD_UB4(src + (4 * pitch), pitch, q4, q5, q6, q7);
1041 
1042  VP9_FLAT5(p7, p6, p5, p4, p0, q0, q4, q5, q6, q7, flat, flat2);
1043 
1044  /* if flat2 is zero for all pixels, then no need to calculate other filter */
1045  if (__msa_test_bz_v(flat2)) {
1046  p2_d = __msa_copy_u_d((v2i64) p2_out, 0);
1047  p1_d = __msa_copy_u_d((v2i64) p1_out, 0);
1048  p0_d = __msa_copy_u_d((v2i64) p0_out, 0);
1049  q0_d = __msa_copy_u_d((v2i64) q0_out, 0);
1050  q1_d = __msa_copy_u_d((v2i64) q1_out, 0);
1051  q2_d = __msa_copy_u_d((v2i64) q2_out, 0);
1052 
1053  SD4(p2_d, p1_d, p0_d, q0_d, src - 3 * pitch, pitch);
1054  SD(q1_d, src + pitch);
1055  SD(q2_d, src + 2 * pitch);
1056  } else {
1057  /* LSB(right) 8 pixel operation */
1058  ILVR_B8_UH(zero, p7, zero, p6, zero, p5, zero, p4, zero, q4,
1059  zero, q5, zero, q6, zero, q7, p7_r, p6_r, p5_r, p4_r,
1060  q4_r, q5_r, q6_r, q7_r);
1061 
1062  tmp0 = p7_r << 3;
1063  tmp0 -= p7_r;
1064  tmp0 += p6_r;
1065  tmp0 += q0_r;
1066 
1067  src -= 7 * pitch;
1068 
1069  /* calculation of p6 and p5 */
1070  tmp1 = p6_r + p5_r + p4_r + p3_r;
1071  tmp1 += (p2_r + p1_r + p0_r);
1072  tmp1 += tmp0;
1073  p0_filter16 = (v16u8) __msa_srari_h((v8i16) tmp1, 4);
1074  tmp0 = p5_r - p6_r + q1_r - p7_r;
1075  tmp1 += tmp0;
1076  p1_filter16 = (v16u8) __msa_srari_h((v8i16) tmp1, 4);
1077  PCKEV_B2_UB(zero, p0_filter16, zero, p1_filter16,
1078  p0_filter16, p1_filter16);
1079  p0_filter16 = __msa_bmnz_v(p6, p0_filter16, flat2);
1080  p1_filter16 = __msa_bmnz_v(p5, p1_filter16, flat2);
1081  dword0 = __msa_copy_u_d((v2i64) p0_filter16, 0);
1082  dword1 = __msa_copy_u_d((v2i64) p1_filter16, 0);
1083  SD(dword0, src);
1084  src += pitch;
1085  SD(dword1, src);
1086  src += pitch;
1087 
1088  /* calculation of p4 and p3 */
1089  tmp0 = p4_r - p5_r + q2_r - p7_r;
1090  tmp2 = p3_r - p4_r + q3_r - p7_r;
1091  tmp1 += tmp0;
1092  p0_filter16 = (v16u8) __msa_srari_h((v8i16) tmp1, 4);
1093  tmp1 += tmp2;
1094  p1_filter16 = (v16u8) __msa_srari_h((v8i16) tmp1, 4);
1095  PCKEV_B2_UB(zero, p0_filter16, zero, p1_filter16,
1096  p0_filter16, p1_filter16);
1097  p0_filter16 = __msa_bmnz_v(p4, p0_filter16, flat2);
1098  p1_filter16 = __msa_bmnz_v(p3, p1_filter16, flat2);
1099  dword0 = __msa_copy_u_d((v2i64) p0_filter16, 0);
1100  dword1 = __msa_copy_u_d((v2i64) p1_filter16, 0);
1101  SD(dword0, src);
1102  src += pitch;
1103  SD(dword1, src);
1104  src += pitch;
1105 
1106  /* calculation of p2 and p1 */
1107  tmp0 = p2_r - p3_r + q4_r - p7_r;
1108  tmp2 = p1_r - p2_r + q5_r - p7_r;
1109  tmp1 += tmp0;
1110  p0_filter16 = (v16u8) __msa_srari_h((v8i16) tmp1, 4);
1111  tmp1 += tmp2;
1112  p1_filter16 = (v16u8) __msa_srari_h((v8i16) tmp1, 4);
1113  PCKEV_B2_UB(zero, p0_filter16, zero, p1_filter16,
1114  p0_filter16, p1_filter16);
1115  p0_filter16 = __msa_bmnz_v(p2_out, p0_filter16, flat2);
1116  p1_filter16 = __msa_bmnz_v(p1_out, p1_filter16, flat2);
1117  dword0 = __msa_copy_u_d((v2i64) p0_filter16, 0);
1118  dword1 = __msa_copy_u_d((v2i64) p1_filter16, 0);
1119  SD(dword0, src);
1120  src += pitch;
1121  SD(dword1, src);
1122  src += pitch;
1123 
1124  /* calculation of p0 and q0 */
1125  tmp0 = (p0_r - p1_r) + (q6_r - p7_r);
1126  tmp2 = (q7_r - p0_r) + (q0_r - p7_r);
1127  tmp1 += tmp0;
1128  p0_filter16 = (v16u8) __msa_srari_h((v8i16) tmp1, 4);
1129  tmp1 += tmp2;
1130  p1_filter16 = (v16u8) __msa_srari_h((v8i16) tmp1, 4);
1131  PCKEV_B2_UB(zero, p0_filter16, zero, p1_filter16,
1132  p0_filter16, p1_filter16);
1133  p0_filter16 = __msa_bmnz_v(p0_out, p0_filter16, flat2);
1134  p1_filter16 = __msa_bmnz_v(q0_out, p1_filter16, flat2);
1135  dword0 = __msa_copy_u_d((v2i64) p0_filter16, 0);
1136  dword1 = __msa_copy_u_d((v2i64) p1_filter16, 0);
1137  SD(dword0, src);
1138  src += pitch;
1139  SD(dword1, src);
1140  src += pitch;
1141 
1142  /* calculation of q1 and q2 */
1143  tmp0 = q7_r - q0_r + q1_r - p6_r;
1144  tmp2 = q7_r - q1_r + q2_r - p5_r;
1145  tmp1 += tmp0;
1146  p0_filter16 = (v16u8) __msa_srari_h((v8i16) tmp1, 4);
1147  tmp1 += tmp2;
1148  p1_filter16 = (v16u8) __msa_srari_h((v8i16) tmp1, 4);
1149  PCKEV_B2_UB(zero, p0_filter16, zero, p1_filter16,
1150  p0_filter16, p1_filter16);
1151  p0_filter16 = __msa_bmnz_v(q1_out, p0_filter16, flat2);
1152  p1_filter16 = __msa_bmnz_v(q2_out, p1_filter16, flat2);
1153  dword0 = __msa_copy_u_d((v2i64) p0_filter16, 0);
1154  dword1 = __msa_copy_u_d((v2i64) p1_filter16, 0);
1155  SD(dword0, src);
1156  src += pitch;
1157  SD(dword1, src);
1158  src += pitch;
1159 
1160  /* calculation of q3 and q4 */
1161  tmp0 = (q7_r - q2_r) + (q3_r - p4_r);
1162  tmp2 = (q7_r - q3_r) + (q4_r - p3_r);
1163  tmp1 += tmp0;
1164  p0_filter16 = (v16u8) __msa_srari_h((v8i16) tmp1, 4);
1165  tmp1 += tmp2;
1166  p1_filter16 = (v16u8) __msa_srari_h((v8i16) tmp1, 4);
1167  PCKEV_B2_UB(zero, p0_filter16, zero, p1_filter16,
1168  p0_filter16, p1_filter16);
1169  p0_filter16 = __msa_bmnz_v(q3, p0_filter16, flat2);
1170  p1_filter16 = __msa_bmnz_v(q4, p1_filter16, flat2);
1171  dword0 = __msa_copy_u_d((v2i64) p0_filter16, 0);
1172  dword1 = __msa_copy_u_d((v2i64) p1_filter16, 0);
1173  SD(dword0, src);
1174  src += pitch;
1175  SD(dword1, src);
1176  src += pitch;
1177 
1178  /* calculation of q5 and q6 */
1179  tmp0 = (q7_r - q4_r) + (q5_r - p2_r);
1180  tmp2 = (q7_r - q5_r) + (q6_r - p1_r);
1181  tmp1 += tmp0;
1182  p0_filter16 = (v16u8) __msa_srari_h((v8i16) tmp1, 4);
1183  tmp1 += tmp2;
1184  p1_filter16 = (v16u8) __msa_srari_h((v8i16) tmp1, 4);
1185  PCKEV_B2_UB(zero, p0_filter16, zero, p1_filter16,
1186  p0_filter16, p1_filter16);
1187  p0_filter16 = __msa_bmnz_v(q5, p0_filter16, flat2);
1188  p1_filter16 = __msa_bmnz_v(q6, p1_filter16, flat2);
1189  dword0 = __msa_copy_u_d((v2i64) p0_filter16, 0);
1190  dword1 = __msa_copy_u_d((v2i64) p1_filter16, 0);
1191  SD(dword0, src);
1192  src += pitch;
1193  SD(dword1, src);
1194  }
1195  }
1196 }
1197 
1198 void ff_loop_filter_h_4_8_msa(uint8_t *src, ptrdiff_t pitch,
1199  int32_t b_limit_ptr,
1200  int32_t limit_ptr,
1201  int32_t thresh_ptr)
1202 {
1203  v16u8 mask, hev, flat, limit, thresh, b_limit;
1204  v16u8 p3, p2, p1, p0, q3, q2, q1, q0;
1205  v8i16 vec0, vec1, vec2, vec3;
1206 
1207  LD_UB8((src - 4), pitch, p3, p2, p1, p0, q0, q1, q2, q3);
1208 
1209  thresh = (v16u8) __msa_fill_b(thresh_ptr);
1210  b_limit = (v16u8) __msa_fill_b(b_limit_ptr);
1211  limit = (v16u8) __msa_fill_b(limit_ptr);
1212 
1213  TRANSPOSE8x8_UB_UB(p3, p2, p1, p0, q0, q1, q2, q3,
1214  p3, p2, p1, p0, q0, q1, q2, q3);
1215  LPF_MASK_HEV(p3, p2, p1, p0, q0, q1, q2, q3, limit, b_limit, thresh,
1216  hev, mask, flat);
1217  VP9_LPF_FILTER4_4W(p1, p0, q0, q1, mask, hev, p1, p0, q0, q1);
1218  ILVR_B2_SH(p0, p1, q1, q0, vec0, vec1);
1219  ILVRL_H2_SH(vec1, vec0, vec2, vec3);
1220 
1221  src -= 2;
1222  ST_W8(vec2, vec3, 0, 1, 2, 3, 0, 1, 2, 3, src, pitch);
1223 }
1224 
1225 void ff_loop_filter_h_44_16_msa(uint8_t *src, ptrdiff_t pitch,
1226  int32_t b_limit_ptr,
1227  int32_t limit_ptr,
1228  int32_t thresh_ptr)
1229 {
1230  v16u8 mask, hev, flat;
1231  v16u8 thresh0, b_limit0, limit0, thresh1, b_limit1, limit1;
1232  v16u8 p3, p2, p1, p0, q3, q2, q1, q0;
1233  v16u8 row0, row1, row2, row3, row4, row5, row6, row7;
1234  v16u8 row8, row9, row10, row11, row12, row13, row14, row15;
1235  v8i16 tmp0, tmp1, tmp2, tmp3, tmp4, tmp5;
1236 
1237  LD_UB8(src - 4, pitch, row0, row1, row2, row3, row4, row5, row6, row7);
1238  LD_UB8(src - 4 + (8 * pitch), pitch,
1239  row8, row9, row10, row11, row12, row13, row14, row15);
1240 
1241  TRANSPOSE16x8_UB_UB(row0, row1, row2, row3, row4, row5, row6, row7,
1242  row8, row9, row10, row11, row12, row13, row14, row15,
1243  p3, p2, p1, p0, q0, q1, q2, q3);
1244 
1245  thresh0 = (v16u8) __msa_fill_b(thresh_ptr);
1246  thresh1 = (v16u8) __msa_fill_b(thresh_ptr >> 8);
1247  thresh0 = (v16u8) __msa_ilvr_d((v2i64) thresh1, (v2i64) thresh0);
1248 
1249  b_limit0 = (v16u8) __msa_fill_b(b_limit_ptr);
1250  b_limit1 = (v16u8) __msa_fill_b(b_limit_ptr >> 8);
1251  b_limit0 = (v16u8) __msa_ilvr_d((v2i64) b_limit1, (v2i64) b_limit0);
1252 
1253  limit0 = (v16u8) __msa_fill_b(limit_ptr);
1254  limit1 = (v16u8) __msa_fill_b(limit_ptr >> 8);
1255  limit0 = (v16u8) __msa_ilvr_d((v2i64) limit1, (v2i64) limit0);
1256 
1257  LPF_MASK_HEV(p3, p2, p1, p0, q0, q1, q2, q3, limit0, b_limit0, thresh0,
1258  hev, mask, flat);
1259  VP9_LPF_FILTER4_4W(p1, p0, q0, q1, mask, hev, p1, p0, q0, q1);
1260  ILVR_B2_SH(p0, p1, q1, q0, tmp0, tmp1);
1261  ILVRL_H2_SH(tmp1, tmp0, tmp2, tmp3);
1262  ILVL_B2_SH(p0, p1, q1, q0, tmp0, tmp1);
1263  ILVRL_H2_SH(tmp1, tmp0, tmp4, tmp5);
1264 
1265  src -= 2;
1266 
1267  ST_W8(tmp2, tmp3, 0, 1, 2, 3, 0, 1, 2, 3, src, pitch);
1268  ST_W8(tmp4, tmp5, 0, 1, 2, 3, 0, 1, 2, 3, src + 8 * pitch, pitch);
1269 }
1270 
1271 void ff_loop_filter_h_8_8_msa(uint8_t *src, ptrdiff_t pitch,
1272  int32_t b_limit_ptr,
1273  int32_t limit_ptr,
1274  int32_t thresh_ptr)
1275 {
1276  v16u8 p3, p2, p1, p0, q3, q2, q1, q0;
1277  v16u8 p1_out, p0_out, q0_out, q1_out;
1278  v16u8 flat, mask, hev, thresh, b_limit, limit;
1279  v8u16 p3_r, p2_r, p1_r, p0_r, q0_r, q1_r, q2_r, q3_r;
1280  v8i16 p2_filt8_r, p1_filt8_r, p0_filt8_r;
1281  v8i16 q0_filt8_r, q1_filt8_r, q2_filt8_r;
1282  v16u8 zero = { 0 };
1283  v8i16 vec0, vec1, vec2, vec3, vec4;
1284 
1285  /* load vector elements */
1286  LD_UB8(src - 4, pitch, p3, p2, p1, p0, q0, q1, q2, q3);
1287 
1288  TRANSPOSE8x8_UB_UB(p3, p2, p1, p0, q0, q1, q2, q3,
1289  p3, p2, p1, p0, q0, q1, q2, q3);
1290 
1291  thresh = (v16u8) __msa_fill_b(thresh_ptr);
1292  b_limit = (v16u8) __msa_fill_b(b_limit_ptr);
1293  limit = (v16u8) __msa_fill_b(limit_ptr);
1294 
1295  /* mask and hev */
1296  LPF_MASK_HEV(p3, p2, p1, p0, q0, q1, q2, q3, limit, b_limit, thresh,
1297  hev, mask, flat);
1298  /* flat4 */
1299  VP9_FLAT4(p3, p2, p0, q0, q2, q3, flat);
1300  /* filter4 */
1301  VP9_LPF_FILTER4_4W(p1, p0, q0, q1, mask, hev, p1_out, p0_out, q0_out,
1302  q1_out);
1303 
1304  flat = (v16u8) __msa_ilvr_d((v2i64) zero, (v2i64) flat);
1305 
1306  /* if flat is zero for all pixels, then no need to calculate other filter */
1307  if (__msa_test_bz_v(flat)) {
1308  /* Store 4 pixels p1-_q1 */
1309  ILVR_B2_SH(p0_out, p1_out, q1_out, q0_out, vec0, vec1);
1310  ILVRL_H2_SH(vec1, vec0, vec2, vec3);
1311 
1312  src -= 2;
1313  ST_W8(vec2, vec3, 0, 1, 2, 3, 0, 1, 2, 3, src, pitch);
1314  } else {
1315  ILVR_B8_UH(zero, p3, zero, p2, zero, p1, zero, p0, zero, q0, zero, q1,
1316  zero, q2, zero, q3, p3_r, p2_r, p1_r, p0_r, q0_r, q1_r, q2_r,
1317  q3_r);
1318  VP9_FILTER8(p3_r, p2_r, p1_r, p0_r, q0_r, q1_r, q2_r, q3_r, p2_filt8_r,
1319  p1_filt8_r, p0_filt8_r, q0_filt8_r, q1_filt8_r, q2_filt8_r);
1320  /* convert 16 bit output data into 8 bit */
1321  PCKEV_B4_SH(p2_filt8_r, p2_filt8_r, p1_filt8_r, p1_filt8_r, p0_filt8_r,
1322  p0_filt8_r, q0_filt8_r, q0_filt8_r, p2_filt8_r, p1_filt8_r,
1323  p0_filt8_r, q0_filt8_r);
1324  PCKEV_B2_SH(q1_filt8_r, q1_filt8_r, q2_filt8_r, q2_filt8_r, q1_filt8_r,
1325  q2_filt8_r);
1326 
1327  /* store pixel values */
1328  p2 = __msa_bmnz_v(p2, (v16u8) p2_filt8_r, flat);
1329  p1 = __msa_bmnz_v(p1_out, (v16u8) p1_filt8_r, flat);
1330  p0 = __msa_bmnz_v(p0_out, (v16u8) p0_filt8_r, flat);
1331  q0 = __msa_bmnz_v(q0_out, (v16u8) q0_filt8_r, flat);
1332  q1 = __msa_bmnz_v(q1_out, (v16u8) q1_filt8_r, flat);
1333  q2 = __msa_bmnz_v(q2, (v16u8) q2_filt8_r, flat);
1334 
1335  /* Store 6 pixels p2-_q2 */
1336  ILVR_B2_SH(p1, p2, q0, p0, vec0, vec1);
1337  ILVRL_H2_SH(vec1, vec0, vec2, vec3);
1338  vec4 = (v8i16) __msa_ilvr_b((v16i8) q2, (v16i8) q1);
1339 
1340  src -= 3;
1341  ST_W4(vec2, 0, 1, 2, 3, src, pitch);
1342  ST_H4(vec4, 0, 1, 2, 3, src + 4, pitch);
1343  src += (4 * pitch);
1344  ST_W4(vec3, 0, 1, 2, 3, src, pitch);
1345  ST_H4(vec4, 4, 5, 6, 7, src + 4, pitch);
1346  }
1347 }
1348 
1349 void ff_loop_filter_h_88_16_msa(uint8_t *src, ptrdiff_t pitch,
1350  int32_t b_limit_ptr,
1351  int32_t limit_ptr,
1352  int32_t thresh_ptr)
1353 {
1354  uint8_t *temp_src;
1355  v16u8 p3, p2, p1, p0, q3, q2, q1, q0;
1356  v16u8 p1_out, p0_out, q0_out, q1_out;
1357  v16u8 flat, mask, hev, thresh, b_limit, limit;
1358  v16u8 row4, row5, row6, row7, row12, row13, row14, row15;
1359  v8u16 p3_r, p2_r, p1_r, p0_r, q0_r, q1_r, q2_r, q3_r;
1360  v8u16 p3_l, p2_l, p1_l, p0_l, q0_l, q1_l, q2_l, q3_l;
1361  v8i16 p2_filt8_r, p1_filt8_r, p0_filt8_r;
1362  v8i16 q0_filt8_r, q1_filt8_r, q2_filt8_r;
1363  v8i16 p2_filt8_l, p1_filt8_l, p0_filt8_l;
1364  v8i16 q0_filt8_l, q1_filt8_l, q2_filt8_l;
1365  v16u8 zero = { 0 };
1366  v8i16 vec0, vec1, vec2, vec3, vec4, vec5, vec6, vec7;
1367 
1368  temp_src = src - 4;
1369 
1370  LD_UB8(temp_src, pitch, p0, p1, p2, p3, row4, row5, row6, row7);
1371  temp_src += (8 * pitch);
1372  LD_UB8(temp_src, pitch, q3, q2, q1, q0, row12, row13, row14, row15);
1373 
1374  /* transpose 16x8 matrix into 8x16 */
1375  TRANSPOSE16x8_UB_UB(p0, p1, p2, p3, row4, row5, row6, row7,
1376  q3, q2, q1, q0, row12, row13, row14, row15,
1377  p3, p2, p1, p0, q0, q1, q2, q3);
1378 
1379  thresh = (v16u8) __msa_fill_b(thresh_ptr);
1380  vec0 = (v8i16) __msa_fill_b(thresh_ptr >> 8);
1381  thresh = (v16u8) __msa_ilvr_d((v2i64) vec0, (v2i64) thresh);
1382 
1383  b_limit = (v16u8) __msa_fill_b(b_limit_ptr);
1384  vec0 = (v8i16) __msa_fill_b(b_limit_ptr >> 8);
1385  b_limit = (v16u8) __msa_ilvr_d((v2i64) vec0, (v2i64) b_limit);
1386 
1387  limit = (v16u8) __msa_fill_b(limit_ptr);
1388  vec0 = (v8i16) __msa_fill_b(limit_ptr >> 8);
1389  limit = (v16u8) __msa_ilvr_d((v2i64) vec0, (v2i64) limit);
1390 
1391  /* mask and hev */
1392  LPF_MASK_HEV(p3, p2, p1, p0, q0, q1, q2, q3, limit, b_limit, thresh,
1393  hev, mask, flat);
1394  /* flat4 */
1395  VP9_FLAT4(p3, p2, p0, q0, q2, q3, flat);
1396  /* filter4 */
1397  VP9_LPF_FILTER4_4W(p1, p0, q0, q1, mask, hev, p1_out, p0_out, q0_out,
1398  q1_out);
1399 
1400  /* if flat is zero for all pixels, then no need to calculate other filter */
1401  if (__msa_test_bz_v(flat)) {
1402  ILVR_B2_SH(p0_out, p1_out, q1_out, q0_out, vec0, vec1);
1403  ILVRL_H2_SH(vec1, vec0, vec2, vec3);
1404  ILVL_B2_SH(p0_out, p1_out, q1_out, q0_out, vec0, vec1);
1405  ILVRL_H2_SH(vec1, vec0, vec4, vec5);
1406 
1407  src -= 2;
1408  ST_W8(vec2, vec3, 0, 1, 2, 3, 0, 1, 2, 3, src, pitch);
1409  ST_W8(vec4, vec5, 0, 1, 2, 3, 0, 1, 2, 3, src + 8 * pitch, pitch);
1410  } else {
1411  ILVR_B8_UH(zero, p3, zero, p2, zero, p1, zero, p0, zero, q0, zero, q1,
1412  zero, q2, zero, q3, p3_r, p2_r, p1_r, p0_r, q0_r, q1_r, q2_r,
1413  q3_r);
1414  VP9_FILTER8(p3_r, p2_r, p1_r, p0_r, q0_r, q1_r, q2_r, q3_r, p2_filt8_r,
1415  p1_filt8_r, p0_filt8_r, q0_filt8_r, q1_filt8_r, q2_filt8_r);
1416 
1417  ILVL_B4_UH(zero, p3, zero, p2, zero, p1, zero, p0, p3_l, p2_l, p1_l,
1418  p0_l);
1419  ILVL_B4_UH(zero, q0, zero, q1, zero, q2, zero, q3, q0_l, q1_l, q2_l,
1420  q3_l);
1421 
1422  /* filter8 */
1423  VP9_FILTER8(p3_l, p2_l, p1_l, p0_l, q0_l, q1_l, q2_l, q3_l, p2_filt8_l,
1424  p1_filt8_l, p0_filt8_l, q0_filt8_l, q1_filt8_l, q2_filt8_l);
1425 
1426  /* convert 16 bit output data into 8 bit */
1427  PCKEV_B4_SH(p2_filt8_l, p2_filt8_r, p1_filt8_l, p1_filt8_r, p0_filt8_l,
1428  p0_filt8_r, q0_filt8_l, q0_filt8_r, p2_filt8_r, p1_filt8_r,
1429  p0_filt8_r, q0_filt8_r);
1430  PCKEV_B2_SH(q1_filt8_l, q1_filt8_r, q2_filt8_l, q2_filt8_r, q1_filt8_r,
1431  q2_filt8_r);
1432 
1433  /* store pixel values */
1434  p2 = __msa_bmnz_v(p2, (v16u8) p2_filt8_r, flat);
1435  p1 = __msa_bmnz_v(p1_out, (v16u8) p1_filt8_r, flat);
1436  p0 = __msa_bmnz_v(p0_out, (v16u8) p0_filt8_r, flat);
1437  q0 = __msa_bmnz_v(q0_out, (v16u8) q0_filt8_r, flat);
1438  q1 = __msa_bmnz_v(q1_out, (v16u8) q1_filt8_r, flat);
1439  q2 = __msa_bmnz_v(q2, (v16u8) q2_filt8_r, flat);
1440 
1441  ILVR_B2_SH(p1, p2, q0, p0, vec0, vec1);
1442  ILVRL_H2_SH(vec1, vec0, vec3, vec4);
1443  ILVL_B2_SH(p1, p2, q0, p0, vec0, vec1);
1444  ILVRL_H2_SH(vec1, vec0, vec6, vec7);
1445  ILVRL_B2_SH(q2, q1, vec2, vec5);
1446 
1447  src -= 3;
1448  ST_W4(vec3, 0, 1, 2, 3, src, pitch);
1449  ST_H4(vec2, 0, 1, 2, 3, src + 4, pitch);
1450  src += (4 * pitch);
1451  ST_W4(vec4, 0, 1, 2, 3, src, pitch);
1452  ST_H4(vec2, 4, 5, 6, 7, src + 4, pitch);
1453  src += (4 * pitch);
1454  ST_W4(vec6, 0, 1, 2, 3, src, pitch);
1455  ST_H4(vec5, 0, 1, 2, 3, src + 4, pitch);
1456  src += (4 * pitch);
1457  ST_W4(vec7, 0, 1, 2, 3, src, pitch);
1458  ST_H4(vec5, 4, 5, 6, 7, src + 4, pitch);
1459  }
1460 }
1461 
1462 void ff_loop_filter_h_84_16_msa(uint8_t *src, ptrdiff_t pitch,
1463  int32_t b_limit_ptr,
1464  int32_t limit_ptr,
1465  int32_t thresh_ptr)
1466 {
1467  uint8_t *temp_src;
1468  v16u8 p3, p2, p1, p0, q3, q2, q1, q0;
1469  v16u8 p1_out, p0_out, q0_out, q1_out;
1470  v16u8 flat, mask, hev, thresh, b_limit, limit;
1471  v16u8 row4, row5, row6, row7, row12, row13, row14, row15;
1472  v8u16 p3_r, p2_r, p1_r, p0_r, q0_r, q1_r, q2_r, q3_r;
1473  v8i16 p2_filt8_r, p1_filt8_r, p0_filt8_r;
1474  v8i16 q0_filt8_r, q1_filt8_r, q2_filt8_r;
1475  v16u8 zero = { 0 };
1476  v8i16 vec0, vec1, vec2, vec3, vec4, vec5, vec6, vec7;
1477 
1478  temp_src = src - 4;
1479 
1480  LD_UB8(temp_src, pitch, p0, p1, p2, p3, row4, row5, row6, row7);
1481  temp_src += (8 * pitch);
1482  LD_UB8(temp_src, pitch, q3, q2, q1, q0, row12, row13, row14, row15);
1483 
1484  /* transpose 16x8 matrix into 8x16 */
1485  TRANSPOSE16x8_UB_UB(p0, p1, p2, p3, row4, row5, row6, row7,
1486  q3, q2, q1, q0, row12, row13, row14, row15,
1487  p3, p2, p1, p0, q0, q1, q2, q3);
1488 
1489  thresh = (v16u8) __msa_fill_b(thresh_ptr);
1490  vec0 = (v8i16) __msa_fill_b(thresh_ptr >> 8);
1491  thresh = (v16u8) __msa_ilvr_d((v2i64) vec0, (v2i64) thresh);
1492 
1493  b_limit = (v16u8) __msa_fill_b(b_limit_ptr);
1494  vec0 = (v8i16) __msa_fill_b(b_limit_ptr >> 8);
1495  b_limit = (v16u8) __msa_ilvr_d((v2i64) vec0, (v2i64) b_limit);
1496 
1497  limit = (v16u8) __msa_fill_b(limit_ptr);
1498  vec0 = (v8i16) __msa_fill_b(limit_ptr >> 8);
1499  limit = (v16u8) __msa_ilvr_d((v2i64) vec0, (v2i64) limit);
1500 
1501  /* mask and hev */
1502  LPF_MASK_HEV(p3, p2, p1, p0, q0, q1, q2, q3, limit, b_limit, thresh,
1503  hev, mask, flat);
1504  /* flat4 */
1505  VP9_FLAT4(p3, p2, p0, q0, q2, q3, flat);
1506  /* filter4 */
1507  VP9_LPF_FILTER4_4W(p1, p0, q0, q1, mask, hev, p1_out, p0_out, q0_out,
1508  q1_out);
1509 
1510  flat = (v16u8) __msa_ilvr_d((v2i64) zero, (v2i64) flat);
1511 
1512  /* if flat is zero for all pixels, then no need to calculate other filter */
1513  if (__msa_test_bz_v(flat)) {
1514  ILVR_B2_SH(p0_out, p1_out, q1_out, q0_out, vec0, vec1);
1515  ILVRL_H2_SH(vec1, vec0, vec2, vec3);
1516  ILVL_B2_SH(p0_out, p1_out, q1_out, q0_out, vec0, vec1);
1517  ILVRL_H2_SH(vec1, vec0, vec4, vec5);
1518 
1519  src -= 2;
1520  ST_W8(vec2, vec3, 0, 1, 2, 3, 0, 1, 2, 3, src, pitch);
1521  ST_W8(vec4, vec5, 0, 1, 2, 3, 0, 1, 2, 3, src + 8 * pitch, pitch);
1522  } else {
1523  ILVR_B8_UH(zero, p3, zero, p2, zero, p1, zero, p0, zero, q0, zero, q1,
1524  zero, q2, zero, q3, p3_r, p2_r, p1_r, p0_r, q0_r, q1_r, q2_r,
1525  q3_r);
1526  VP9_FILTER8(p3_r, p2_r, p1_r, p0_r, q0_r, q1_r, q2_r, q3_r, p2_filt8_r,
1527  p1_filt8_r, p0_filt8_r, q0_filt8_r, q1_filt8_r, q2_filt8_r);
1528 
1529  /* convert 16 bit output data into 8 bit */
1530  PCKEV_B4_SH(p2_filt8_r, p2_filt8_r, p1_filt8_r, p1_filt8_r,
1531  p0_filt8_r, p0_filt8_r, q0_filt8_r, q0_filt8_r,
1532  p2_filt8_r, p1_filt8_r, p0_filt8_r, q0_filt8_r);
1533  PCKEV_B2_SH(q1_filt8_r, q1_filt8_r, q2_filt8_r, q2_filt8_r,
1534  q1_filt8_r, q2_filt8_r);
1535 
1536  /* store pixel values */
1537  p2 = __msa_bmnz_v(p2, (v16u8) p2_filt8_r, flat);
1538  p1 = __msa_bmnz_v(p1_out, (v16u8) p1_filt8_r, flat);
1539  p0 = __msa_bmnz_v(p0_out, (v16u8) p0_filt8_r, flat);
1540  q0 = __msa_bmnz_v(q0_out, (v16u8) q0_filt8_r, flat);
1541  q1 = __msa_bmnz_v(q1_out, (v16u8) q1_filt8_r, flat);
1542  q2 = __msa_bmnz_v(q2, (v16u8) q2_filt8_r, flat);
1543 
1544  ILVR_B2_SH(p1, p2, q0, p0, vec0, vec1);
1545  ILVRL_H2_SH(vec1, vec0, vec3, vec4);
1546  ILVL_B2_SH(p1, p2, q0, p0, vec0, vec1);
1547  ILVRL_H2_SH(vec1, vec0, vec6, vec7);
1548  ILVRL_B2_SH(q2, q1, vec2, vec5);
1549 
1550  src -= 3;
1551  ST_W4(vec3, 0, 1, 2, 3, src, pitch);
1552  ST_H4(vec2, 0, 1, 2, 3, src + 4, pitch);
1553  src += (4 * pitch);
1554  ST_W4(vec4, 0, 1, 2, 3, src, pitch);
1555  ST_H4(vec2, 4, 5, 6, 7, src + 4, pitch);
1556  src += (4 * pitch);
1557  ST_W4(vec6, 0, 1, 2, 3, src, pitch);
1558  ST_H4(vec5, 0, 1, 2, 3, src + 4, pitch);
1559  src += (4 * pitch);
1560  ST_W4(vec7, 0, 1, 2, 3, src, pitch);
1561  ST_H4(vec5, 4, 5, 6, 7, src + 4, pitch);
1562  }
1563 }
1564 
1565 void ff_loop_filter_h_48_16_msa(uint8_t *src, ptrdiff_t pitch,
1566  int32_t b_limit_ptr,
1567  int32_t limit_ptr,
1568  int32_t thresh_ptr)
1569 {
1570  uint8_t *temp_src;
1571  v16u8 p3, p2, p1, p0, q3, q2, q1, q0;
1572  v16u8 p1_out, p0_out, q0_out, q1_out;
1573  v16u8 flat, mask, hev, thresh, b_limit, limit;
1574  v16u8 row4, row5, row6, row7, row12, row13, row14, row15;
1575  v8u16 p3_l, p2_l, p1_l, p0_l, q0_l, q1_l, q2_l, q3_l;
1576  v8i16 p2_filt8_l, p1_filt8_l, p0_filt8_l;
1577  v8i16 q0_filt8_l, q1_filt8_l, q2_filt8_l;
1578  v16u8 zero = { 0 };
1579  v8i16 vec0, vec1, vec2, vec3, vec4, vec5, vec6, vec7;
1580 
1581  temp_src = src - 4;
1582 
1583  LD_UB8(temp_src, pitch, p0, p1, p2, p3, row4, row5, row6, row7);
1584  temp_src += (8 * pitch);
1585  LD_UB8(temp_src, pitch, q3, q2, q1, q0, row12, row13, row14, row15);
1586 
1587  /* transpose 16x8 matrix into 8x16 */
1588  TRANSPOSE16x8_UB_UB(p0, p1, p2, p3, row4, row5, row6, row7,
1589  q3, q2, q1, q0, row12, row13, row14, row15,
1590  p3, p2, p1, p0, q0, q1, q2, q3);
1591 
1592  thresh = (v16u8) __msa_fill_b(thresh_ptr);
1593  vec0 = (v8i16) __msa_fill_b(thresh_ptr >> 8);
1594  thresh = (v16u8) __msa_ilvr_d((v2i64) vec0, (v2i64) thresh);
1595 
1596  b_limit = (v16u8) __msa_fill_b(b_limit_ptr);
1597  vec0 = (v8i16) __msa_fill_b(b_limit_ptr >> 8);
1598  b_limit = (v16u8) __msa_ilvr_d((v2i64) vec0, (v2i64) b_limit);
1599 
1600  limit = (v16u8) __msa_fill_b(limit_ptr);
1601  vec0 = (v8i16) __msa_fill_b(limit_ptr >> 8);
1602  limit = (v16u8) __msa_ilvr_d((v2i64) vec0, (v2i64) limit);
1603 
1604  /* mask and hev */
1605  LPF_MASK_HEV(p3, p2, p1, p0, q0, q1, q2, q3, limit, b_limit, thresh,
1606  hev, mask, flat);
1607  /* flat4 */
1608  VP9_FLAT4(p3, p2, p0, q0, q2, q3, flat);
1609  /* filter4 */
1610  VP9_LPF_FILTER4_4W(p1, p0, q0, q1, mask, hev, p1_out, p0_out, q0_out,
1611  q1_out);
1612 
1613  flat = (v16u8) __msa_insve_d((v2i64) flat, 0, (v2i64) zero);
1614 
1615  /* if flat is zero for all pixels, then no need to calculate other filter */
1616  if (__msa_test_bz_v(flat)) {
1617  ILVR_B2_SH(p0_out, p1_out, q1_out, q0_out, vec0, vec1);
1618  ILVRL_H2_SH(vec1, vec0, vec2, vec3);
1619  ILVL_B2_SH(p0_out, p1_out, q1_out, q0_out, vec0, vec1);
1620  ILVRL_H2_SH(vec1, vec0, vec4, vec5);
1621 
1622  src -= 2;
1623  ST_W8(vec2, vec3, 0, 1, 2, 3, 0, 1, 2, 3, src, pitch);
1624  ST_W8(vec4, vec5, 0, 1, 2, 3, 0, 1, 2, 3, src + 8 * pitch, pitch);
1625  } else {
1626  ILVL_B4_UH(zero, p3, zero, p2, zero, p1, zero, p0, p3_l, p2_l, p1_l,
1627  p0_l);
1628  ILVL_B4_UH(zero, q0, zero, q1, zero, q2, zero, q3, q0_l, q1_l, q2_l,
1629  q3_l);
1630 
1631  VP9_FILTER8(p3_l, p2_l, p1_l, p0_l, q0_l, q1_l, q2_l, q3_l, p2_filt8_l,
1632  p1_filt8_l, p0_filt8_l, q0_filt8_l, q1_filt8_l, q2_filt8_l);
1633 
1634  /* convert 16 bit output data into 8 bit */
1635  PCKEV_B4_SH(p2_filt8_l, p2_filt8_l, p1_filt8_l, p1_filt8_l,
1636  p0_filt8_l, p0_filt8_l, q0_filt8_l, q0_filt8_l,
1637  p2_filt8_l, p1_filt8_l, p0_filt8_l, q0_filt8_l);
1638  PCKEV_B2_SH(q1_filt8_l, q1_filt8_l, q2_filt8_l, q2_filt8_l,
1639  q1_filt8_l, q2_filt8_l);
1640 
1641  /* store pixel values */
1642  p2 = __msa_bmnz_v(p2, (v16u8) p2_filt8_l, flat);
1643  p1 = __msa_bmnz_v(p1_out, (v16u8) p1_filt8_l, flat);
1644  p0 = __msa_bmnz_v(p0_out, (v16u8) p0_filt8_l, flat);
1645  q0 = __msa_bmnz_v(q0_out, (v16u8) q0_filt8_l, flat);
1646  q1 = __msa_bmnz_v(q1_out, (v16u8) q1_filt8_l, flat);
1647  q2 = __msa_bmnz_v(q2, (v16u8) q2_filt8_l, flat);
1648 
1649  ILVR_B2_SH(p1, p2, q0, p0, vec0, vec1);
1650  ILVRL_H2_SH(vec1, vec0, vec3, vec4);
1651  ILVL_B2_SH(p1, p2, q0, p0, vec0, vec1);
1652  ILVRL_H2_SH(vec1, vec0, vec6, vec7);
1653  ILVRL_B2_SH(q2, q1, vec2, vec5);
1654 
1655  src -= 3;
1656  ST_W4(vec3, 0, 1, 2, 3, src, pitch);
1657  ST_H4(vec2, 0, 1, 2, 3, src + 4, pitch);
1658  src += (4 * pitch);
1659  ST_W4(vec4, 0, 1, 2, 3, src, pitch);
1660  ST_H4(vec2, 4, 5, 6, 7, src + 4, pitch);
1661  src += (4 * pitch);
1662  ST_W4(vec6, 0, 1, 2, 3, src, pitch);
1663  ST_H4(vec5, 0, 1, 2, 3, src + 4, pitch);
1664  src += (4 * pitch);
1665  ST_W4(vec7, 0, 1, 2, 3, src, pitch);
1666  ST_H4(vec5, 4, 5, 6, 7, src + 4, pitch);
1667  }
1668 }
1669 
1670 static void vp9_transpose_16x8_to_8x16(uint8_t *input, int32_t in_pitch,
1671  uint8_t *output, int32_t out_pitch)
1672 {
1673  v16u8 p7_org, p6_org, p5_org, p4_org, p3_org, p2_org, p1_org, p0_org;
1674  v16i8 tmp0, tmp1, tmp2, tmp3, tmp4, tmp5, tmp6, tmp7;
1675  v16u8 p7, p6, p5, p4, p3, p2, p1, p0, q0, q1, q2, q3, q4, q5, q6, q7;
1676  v16i8 zeros = { 0 };
1677 
1678  LD_UB8(input, in_pitch,
1679  p7_org, p6_org, p5_org, p4_org, p3_org, p2_org, p1_org, p0_org);
1680  /* 8x8 transpose */
1681  TRANSPOSE8x8_UB_UB(p7_org, p6_org, p5_org, p4_org, p3_org, p2_org, p1_org,
1682  p0_org, p7, p6, p5, p4, p3, p2, p1, p0);
1683  /* 8x8 transpose */
1684  ILVL_B4_SB(p5_org, p7_org, p4_org, p6_org, p1_org, p3_org, p0_org, p2_org,
1685  tmp0, tmp1, tmp2, tmp3);
1686  ILVR_B2_SB(tmp1, tmp0, tmp3, tmp2, tmp4, tmp6);
1687  ILVL_B2_SB(tmp1, tmp0, tmp3, tmp2, tmp5, tmp7);
1688  ILVR_W2_UB(tmp6, tmp4, tmp7, tmp5, q0, q4);
1689  ILVL_W2_UB(tmp6, tmp4, tmp7, tmp5, q2, q6);
1690  SLDI_B4_UB(zeros, q0, zeros, q2, zeros, q4, zeros, q6, 8, q1, q3, q5, q7);
1691 
1692  ST_UB8(p7, p6, p5, p4, p3, p2, p1, p0, output, out_pitch);
1693  output += (8 * out_pitch);
1694  ST_UB8(q0, q1, q2, q3, q4, q5, q6, q7, output, out_pitch);
1695 }
1696 
1697 static void vp9_transpose_8x16_to_16x8(uint8_t *input, int32_t in_pitch,
1698  uint8_t *output, int32_t out_pitch)
1699 {
1700  v16u8 p7_o, p6_o, p5_o, p4_o, p3_o, p2_o, p1_o, p0_o;
1701  v16u8 p7, p6, p5, p4, p3, p2, p1, p0, q0, q1, q2, q3, q4, q5, q6, q7;
1702 
1703  LD_UB8(input, in_pitch, p7, p6, p5, p4, p3, p2, p1, p0);
1704  LD_UB8(input + (8 * in_pitch), in_pitch, q0, q1, q2, q3, q4, q5, q6, q7);
1705  TRANSPOSE16x8_UB_UB(p7, p6, p5, p4, p3, p2, p1, p0, q0, q1, q2, q3, q4, q5,
1706  q6, q7, p7_o, p6_o, p5_o, p4_o, p3_o, p2_o, p1_o, p0_o);
1707  ST_UB8(p7_o, p6_o, p5_o, p4_o, p3_o, p2_o, p1_o, p0_o, output, out_pitch);
1708 }
1709 
/**
 * Transpose a 16x16 block of bytes.
 *
 * Sixteen 16-byte rows are loaded from input (stride in_pitch) and sixteen
 * 16-byte rows are stored to output (stride out_pitch). The first eight
 * output rows (p7..p0) come from TRANSPOSE16x8_UB_UB; the last eight
 * (q0..q7) are built by hand from the odd (upper) doublewords of the input
 * rows.
 *
 * @param input     first source row
 * @param in_pitch  byte stride between source rows
 * @param output    first destination row
 * @param out_pitch byte stride between destination rows
 */
1710 static void vp9_transpose_16x16(uint8_t *input, int32_t in_pitch,
1711  uint8_t *output, int32_t out_pitch)
1712 {
1713  v16u8 row0, row1, row2, row3, row4, row5, row6, row7;
1714  v16u8 row8, row9, row10, row11, row12, row13, row14, row15;
1715  v8i16 tmp0, tmp1, tmp4, tmp5, tmp6, tmp7;
1716  v4i32 tmp2, tmp3;
1717  v16u8 p7, p6, p5, p4, p3, p2, p1, p0, q0, q1, q2, q3, q4, q5, q6, q7;
1718 
1719  LD_UB8(input, in_pitch, row0, row1, row2, row3, row4, row5, row6, row7);
1720  input += (8 * in_pitch);
1721  LD_UB8(input, in_pitch,
1722  row8, row9, row10, row11, row12, row13, row14, row15);
1723 
      /* first eight output rows */
1724  TRANSPOSE16x8_UB_UB(row0, row1, row2, row3, row4, row5, row6, row7,
1725  row8, row9, row10, row11, row12, row13, row14, row15,
1726  p7, p6, p5, p4, p3, p2, p1, p0);
1727 
1728  /* transpose 16x8 matrix into 8x16 */
1729  /* total 8 intermediate register and 32 instructions */
      /* gather the odd doubleword of row N and row N + 8 into one vector.
       * NOTE(review): the reads of row0..row15 here assume
       * TRANSPOSE16x8_UB_UB leaves its inputs in the state these lines
       * expect - confirm against the macro definition. */
1730  q7 = (v16u8) __msa_ilvod_d((v2i64) row8, (v2i64) row0);
1731  q6 = (v16u8) __msa_ilvod_d((v2i64) row9, (v2i64) row1);
1732  q5 = (v16u8) __msa_ilvod_d((v2i64) row10, (v2i64) row2);
1733  q4 = (v16u8) __msa_ilvod_d((v2i64) row11, (v2i64) row3);
1734  q3 = (v16u8) __msa_ilvod_d((v2i64) row12, (v2i64) row4);
1735  q2 = (v16u8) __msa_ilvod_d((v2i64) row13, (v2i64) row5);
1736  q1 = (v16u8) __msa_ilvod_d((v2i64) row14, (v2i64) row6);
1737  q0 = (v16u8) __msa_ilvod_d((v2i64) row15, (v2i64) row7);
1738 
      /* byte-level even/odd interleave of the pairs (q7,q6) and (q5,q4) */
1739  ILVEV_B2_SH(q7, q6, q5, q4, tmp0, tmp1);
1740  tmp4 = (v8i16) __msa_ilvod_b((v16i8) q6, (v16i8) q7);
1741  tmp5 = (v8i16) __msa_ilvod_b((v16i8) q4, (v16i8) q5);
1742 
      /* q5 and q7 are reused as scratch from here on: their gathered values
       * were already consumed into tmp0/tmp1/tmp4/tmp5 above */
1743  ILVEV_B2_UB(q3, q2, q1, q0, q5, q7);
1744  tmp6 = (v8i16) __msa_ilvod_b((v16i8) q2, (v16i8) q3);
1745  tmp7 = (v8i16) __msa_ilvod_b((v16i8) q0, (v16i8) q1);
1746 
      /* halfword then word interleaves produce the final rows q0..q7 */
1747  ILVEV_H2_SW(tmp0, tmp1, q5, q7, tmp2, tmp3);
1748  q0 = (v16u8) __msa_ilvev_w(tmp3, tmp2);
1749  q4 = (v16u8) __msa_ilvod_w(tmp3, tmp2);
1750 
1751  tmp2 = (v4i32) __msa_ilvod_h(tmp1, tmp0);
1752  tmp3 = (v4i32) __msa_ilvod_h((v8i16) q7, (v8i16) q5);
1753  q2 = (v16u8) __msa_ilvev_w(tmp3, tmp2);
1754  q6 = (v16u8) __msa_ilvod_w(tmp3, tmp2);
1755 
1756  ILVEV_H2_SW(tmp4, tmp5, tmp6, tmp7, tmp2, tmp3);
1757  q1 = (v16u8) __msa_ilvev_w(tmp3, tmp2);
1758  q5 = (v16u8) __msa_ilvod_w(tmp3, tmp2);
1759 
1760  tmp2 = (v4i32) __msa_ilvod_h(tmp5, tmp4);
1761  tmp3 = (v4i32) __msa_ilvod_h(tmp7, tmp6);
1762  q3 = (v16u8) __msa_ilvev_w(tmp3, tmp2);
1763  q7 = (v16u8) __msa_ilvod_w(tmp3, tmp2);
1764 
      /* output rows 0..7 = p7..p0, rows 8..15 = q0..q7 */
1765  ST_UB8(p7, p6, p5, p4, p3, p2, p1, p0, output, out_pitch);
1766  output += (8 * out_pitch);
1767  ST_UB8(q0, q1, q2, q3, q4, q5, q6, q7, output, out_pitch);
1768 }
1769 
1770 static int32_t vp9_vt_lpf_t4_and_t8_8w(uint8_t *src, uint8_t *filter48,
1771  uint8_t *src_org, int32_t pitch_org,
1772  int32_t b_limit_ptr,
1773  int32_t limit_ptr,
1774  int32_t thresh_ptr)
1775 {
1776  v16u8 p3, p2, p1, p0, q3, q2, q1, q0;
1777  v16u8 p2_out, p1_out, p0_out, q0_out, q1_out, q2_out;
1778  v16u8 flat, mask, hev, thresh, b_limit, limit;
1779  v8u16 p3_r, p2_r, p1_r, p0_r, q0_r, q1_r, q2_r, q3_r;
1780  v8i16 p2_filt8_r, p1_filt8_r, p0_filt8_r;
1781  v8i16 q0_filt8_r, q1_filt8_r, q2_filt8_r;
1782  v16i8 zero = { 0 };
1783  v8i16 vec0, vec1, vec2, vec3;
1784 
1785  /* load vector elements */
1786  LD_UB8(src - (4 * 16), 16, p3, p2, p1, p0, q0, q1, q2, q3);
1787 
1788  thresh = (v16u8) __msa_fill_b(thresh_ptr);
1789  b_limit = (v16u8) __msa_fill_b(b_limit_ptr);
1790  limit = (v16u8) __msa_fill_b(limit_ptr);
1791 
1792  /* mask and hev */
1793  LPF_MASK_HEV(p3, p2, p1, p0, q0, q1, q2, q3, limit, b_limit, thresh,
1794  hev, mask, flat);
1795  /* flat4 */
1796  VP9_FLAT4(p3, p2, p0, q0, q2, q3, flat);
1797  /* filter4 */
1798  VP9_LPF_FILTER4_4W(p1, p0, q0, q1, mask, hev, p1_out, p0_out, q0_out,
1799  q1_out);
1800 
1801  flat = (v16u8) __msa_ilvr_d((v2i64) zero, (v2i64) flat);
1802 
1803  /* if flat is zero for all pixels, then no need to calculate other filter */
1804  if (__msa_test_bz_v(flat)) {
1805  ILVR_B2_SH(p0_out, p1_out, q1_out, q0_out, vec0, vec1);
1806  ILVRL_H2_SH(vec1, vec0, vec2, vec3);
1807  ST_W8(vec2, vec3, 0, 1, 2, 3, 0, 1, 2, 3, (src_org - 2), pitch_org);
1808  return 1;
1809  } else {
1810  ILVR_B8_UH(zero, p3, zero, p2, zero, p1, zero, p0, zero, q0, zero, q1,
1811  zero, q2, zero, q3, p3_r, p2_r, p1_r, p0_r, q0_r, q1_r, q2_r,
1812  q3_r);
1813  VP9_FILTER8(p3_r, p2_r, p1_r, p0_r, q0_r, q1_r, q2_r, q3_r, p2_filt8_r,
1814  p1_filt8_r, p0_filt8_r, q0_filt8_r, q1_filt8_r, q2_filt8_r);
1815 
1816  /* convert 16 bit output data into 8 bit */
1817  p2_r = (v8u16) __msa_pckev_b((v16i8) p2_filt8_r, (v16i8) p2_filt8_r);
1818  p1_r = (v8u16) __msa_pckev_b((v16i8) p1_filt8_r, (v16i8) p1_filt8_r);
1819  p0_r = (v8u16) __msa_pckev_b((v16i8) p0_filt8_r, (v16i8) p0_filt8_r);
1820  q0_r = (v8u16) __msa_pckev_b((v16i8) q0_filt8_r, (v16i8) q0_filt8_r);
1821  q1_r = (v8u16) __msa_pckev_b((v16i8) q1_filt8_r, (v16i8) q1_filt8_r);
1822  q2_r = (v8u16) __msa_pckev_b((v16i8) q2_filt8_r, (v16i8) q2_filt8_r);
1823 
1824  /* store pixel values */
1825  p2_out = __msa_bmnz_v(p2, (v16u8) p2_r, flat);
1826  p1_out = __msa_bmnz_v(p1_out, (v16u8) p1_r, flat);
1827  p0_out = __msa_bmnz_v(p0_out, (v16u8) p0_r, flat);
1828  q0_out = __msa_bmnz_v(q0_out, (v16u8) q0_r, flat);
1829  q1_out = __msa_bmnz_v(q1_out, (v16u8) q1_r, flat);
1830  q2_out = __msa_bmnz_v(q2, (v16u8) q2_r, flat);
1831 
1832  ST_UB4(p2_out, p1_out, p0_out, q0_out, filter48, 16);
1833  filter48 += (4 * 16);
1834  ST_UB2(q1_out, q2_out, filter48, 16);
1835  filter48 += (2 * 16);
1836  ST_UB(flat, filter48);
1837 
1838  return 0;
1839  }
1840 }
1841 
/**
 * Wide (16-pixel) filter stage for 8 pixel positions, operating on the
 * transposed scratch buffer (row stride 16 bytes).
 *
 * @param src      q0 row inside the transposed buffer; p7..p0 are read from
 *                 the 8 rows before it and q0..q7 from the 8 rows starting
 *                 at it
 * @param src_org  position in the original picture; only written on the
 *                 early-exit path
 * @param pitch    byte stride of the original picture
 * @param filter48 rows p2, p1, p0, q0, q1, q2 produced by the filter4/filter8
 *                 stage, followed by the flat mask at offset 6 * 16
 * @return 1 if flat2 was zero everywhere; the six filter48 rows have then
 *         been stored directly at src_org - 3 (6 bytes per row, 8 rows).
 *         0 if the wide filter ran; rows p6..q6 of the transposed buffer
 *         have then been updated (low 8 bytes each) and the caller has to
 *         transpose the buffer back (see ff_loop_filter_h_16_8_msa).
 */
1842 static int32_t vp9_vt_lpf_t16_8w(uint8_t *src, uint8_t *src_org, ptrdiff_t pitch,
1843  uint8_t *filter48)
1844 {
1845  v16i8 zero = { 0 };
1846  v16u8 filter8, flat, flat2;
1847  v16u8 p7, p6, p5, p4, p3, p2, p1, p0, q0, q1, q2, q3, q4, q5, q6, q7;
1848  v8u16 p7_r_in, p6_r_in, p5_r_in, p4_r_in;
1849  v8u16 p3_r_in, p2_r_in, p1_r_in, p0_r_in;
1850  v8u16 q7_r_in, q6_r_in, q5_r_in, q4_r_in;
1851  v8u16 q3_r_in, q2_r_in, q1_r_in, q0_r_in;
1852  v8u16 tmp0_r, tmp1_r;
1853  v8i16 r_out;
1854 
      /* flat mask saved by the filter4/filter8 stage */
1855  flat = LD_UB(filter48 + 6 * 16);
1856 
1857  LD_UB8((src - 8 * 16), 16, p7, p6, p5, p4, p3, p2, p1, p0);
1858  LD_UB8(src, 16, q0, q1, q2, q3, q4, q5, q6, q7);
1859 
1860  VP9_FLAT5(p7, p6, p5, p4, p0, q0, q4, q5, q6, q7, flat, flat2);
1861 
1862  /* if flat2 is zero for all pixels, then no need to calculate other filter */
1863  if (__msa_test_bz_v(flat2)) {
1864  v8i16 vec0, vec1, vec2, vec3, vec4;
1865 
1866  LD_UB4(filter48, 16, p2, p1, p0, q0);
1867  LD_UB2(filter48 + 4 * 16, 16, q1, q2);
1868 
      /* interleave back to picture order: vec3/vec4 carry p2 p1 p0 q0 as one
       * word per row, vec2 carries q1 q2 as one halfword per row */
1869  ILVR_B2_SH(p1, p2, q0, p0, vec0, vec1);
1870  ILVRL_H2_SH(vec1, vec0, vec3, vec4);
1871  vec2 = (v8i16) __msa_ilvr_b((v16i8) q2, (v16i8) q1);
1872 
      /* 4 + 2 bytes per row, starting 3 pixels before src_org, 8 rows */
1873  src_org -= 3;
1874  ST_W4(vec3, 0, 1, 2, 3, src_org, pitch);
1875  ST_H4(vec2, 0, 1, 2, 3, (src_org + 4), pitch);
1876  src_org += (4 * pitch);
1877  ST_W4(vec4, 0, 1, 2, 3, src_org, pitch);
1878  ST_H4(vec2, 4, 5, 6, 7, (src_org + 4), pitch);
1879 
1880  return 1;
1881  } else {
      /* step back to the p6 row; every ST_D1 below writes the low 8 bytes
       * of one row and src then advances by one row (16 bytes) */
1882  src -= 7 * 16;
1883 
1884  ILVR_B8_UH(zero, p7, zero, p6, zero, p5, zero, p4, zero, p3, zero, p2,
1885  zero, p1, zero, p0, p7_r_in, p6_r_in, p5_r_in, p4_r_in,
1886  p3_r_in, p2_r_in, p1_r_in, p0_r_in);
1887  q0_r_in = (v8u16) __msa_ilvr_b(zero, (v16i8) q0);
1888 
      /* p6: initial 16-weight sum
       *   7 * p7 + 2 * p6 + p5 + p4 + p3 + p2 + p1 + p0 + q0
       * kept in tmp1_r. Each following tap updates tmp1_r by adding two
       * samples and subtracting two (tmp0_r holds that delta); the output
       * is the sum shifted right by 4 with rounding. */
1889  tmp0_r = p7_r_in << 3;
1890  tmp0_r -= p7_r_in;
1891  tmp0_r += p6_r_in;
1892  tmp0_r += q0_r_in;
1893  tmp1_r = p6_r_in + p5_r_in;
1894  tmp1_r += p4_r_in;
1895  tmp1_r += p3_r_in;
1896  tmp1_r += p2_r_in;
1897  tmp1_r += p1_r_in;
1898  tmp1_r += p0_r_in;
1899  tmp1_r += tmp0_r;
1900 
1901  r_out = __msa_srari_h((v8i16) tmp1_r, 4);
1902  r_out = (v8i16) __msa_pckev_b((v16i8) r_out, (v16i8) r_out);
      /* take the wide-filter value only where flat2 is set */
1903  p6 = __msa_bmnz_v(p6, (v16u8) r_out, flat2);
1904  ST_D1(p6, 0, src);
1905  src += 16;
1906 
1907  /* p5 */
1908  q1_r_in = (v8u16) __msa_ilvr_b(zero, (v16i8) q1);
1909  tmp0_r = p5_r_in - p6_r_in;
1910  tmp0_r += q1_r_in;
1911  tmp0_r -= p7_r_in;
1912  tmp1_r += tmp0_r;
1913  r_out = __msa_srari_h((v8i16) tmp1_r, 4);
1914  r_out = (v8i16) __msa_pckev_b((v16i8) r_out, (v16i8) r_out);
1915  p5 = __msa_bmnz_v(p5, (v16u8) r_out, flat2);
1916  ST_D1(p5, 0, src);
1917  src += 16;
1918 
1919  /* p4 */
1920  q2_r_in = (v8u16) __msa_ilvr_b(zero, (v16i8) q2);
1921  tmp0_r = p4_r_in - p5_r_in;
1922  tmp0_r += q2_r_in;
1923  tmp0_r -= p7_r_in;
1924  tmp1_r += tmp0_r;
1925  r_out = __msa_srari_h((v8i16) tmp1_r, 4);
1926  r_out = (v8i16) __msa_pckev_b((v16i8) r_out, (v16i8) r_out);
1927  p4 = __msa_bmnz_v(p4, (v16u8) r_out, flat2);
1928  ST_D1(p4, 0, src);
1929  src += 16;
1930 
1931  /* p3 */
1932  q3_r_in = (v8u16) __msa_ilvr_b(zero, (v16i8) q3);
1933  tmp0_r = p3_r_in - p4_r_in;
1934  tmp0_r += q3_r_in;
1935  tmp0_r -= p7_r_in;
1936  tmp1_r += tmp0_r;
1937  r_out = __msa_srari_h((v8i16) tmp1_r, 4);
1938  r_out = (v8i16) __msa_pckev_b((v16i8) r_out, (v16i8) r_out);
1939  p3 = __msa_bmnz_v(p3, (v16u8) r_out, flat2);
1940  ST_D1(p3, 0, src);
1941  src += 16;
1942 
1943  /* p2 */
      /* from p2 through q2 the fallback value (where flat2 is clear) is the
       * matching filter48 row instead of the unfiltered pixel */
1944  q4_r_in = (v8u16) __msa_ilvr_b(zero, (v16i8) q4);
1945  filter8 = LD_UB(filter48);
1946  tmp0_r = p2_r_in - p3_r_in;
1947  tmp0_r += q4_r_in;
1948  tmp0_r -= p7_r_in;
1949  tmp1_r += tmp0_r;
1950  r_out = __msa_srari_h((v8i16) tmp1_r, 4);
1951  r_out = (v8i16) __msa_pckev_b((v16i8) r_out, (v16i8) r_out);
1952  filter8 = __msa_bmnz_v(filter8, (v16u8) r_out, flat2);
1953  ST_D1(filter8, 0, src);
1954  src += 16;
1955 
1956  /* p1 */
1957  q5_r_in = (v8u16) __msa_ilvr_b(zero, (v16i8) q5);
1958  filter8 = LD_UB(filter48 + 16);
1959  tmp0_r = p1_r_in - p2_r_in;
1960  tmp0_r += q5_r_in;
1961  tmp0_r -= p7_r_in;
1962  tmp1_r += tmp0_r;
1963  r_out = __msa_srari_h((v8i16) tmp1_r, 4);
1964  r_out = (v8i16) __msa_pckev_b((v16i8) r_out, (v16i8) r_out);
1965  filter8 = __msa_bmnz_v(filter8, (v16u8) r_out, flat2);
1966  ST_D1(filter8, 0, src);
1967  src += 16;
1968 
1969  /* p0 */
1970  q6_r_in = (v8u16) __msa_ilvr_b(zero, (v16i8) q6);
1971  filter8 = LD_UB(filter48 + 32);
1972  tmp0_r = p0_r_in - p1_r_in;
1973  tmp0_r += q6_r_in;
1974  tmp0_r -= p7_r_in;
1975  tmp1_r += tmp0_r;
1976  r_out = __msa_srari_h((v8i16) tmp1_r, 4);
1977  r_out = (v8i16) __msa_pckev_b((v16i8) r_out, (v16i8) r_out);
1978  filter8 = __msa_bmnz_v(filter8, (v16u8) r_out, flat2);
1979  ST_D1(filter8, 0, src);
1980  src += 16;
1981 
1982  /* q0 */
1983  q7_r_in = (v8u16) __msa_ilvr_b(zero, (v16i8) q7);
1984  filter8 = LD_UB(filter48 + 48);
1985  tmp0_r = q7_r_in - p0_r_in;
1986  tmp0_r += q0_r_in;
1987  tmp0_r -= p7_r_in;
1988  tmp1_r += tmp0_r;
1989  r_out = __msa_srari_h((v8i16) tmp1_r, 4);
1990  r_out = (v8i16) __msa_pckev_b((v16i8) r_out, (v16i8) r_out);
1991  filter8 = __msa_bmnz_v(filter8, (v16u8) r_out, flat2);
1992  ST_D1(filter8, 0, src);
1993  src += 16;
1994 
1995  /* q1 */
      /* from here on the sample added on the far side is always q7 and the
       * sample dropped moves from p6 towards p1 */
1996  filter8 = LD_UB(filter48 + 64);
1997  tmp0_r = q7_r_in - q0_r_in;
1998  tmp0_r += q1_r_in;
1999  tmp0_r -= p6_r_in;
2000  tmp1_r += tmp0_r;
2001  r_out = __msa_srari_h((v8i16) tmp1_r, 4);
2002  r_out = (v8i16) __msa_pckev_b((v16i8) r_out, (v16i8) r_out);
2003  filter8 = __msa_bmnz_v(filter8, (v16u8) r_out, flat2);
2004  ST_D1(filter8, 0, src);
2005  src += 16;
2006 
2007  /* q2 */
2008  filter8 = LD_UB(filter48 + 80);
2009  tmp0_r = q7_r_in - q1_r_in;
2010  tmp0_r += q2_r_in;
2011  tmp0_r -= p5_r_in;
2012  tmp1_r += tmp0_r;
2013  r_out = __msa_srari_h((v8i16) tmp1_r, 4);
2014  r_out = (v8i16) __msa_pckev_b((v16i8) r_out, (v16i8) r_out);
2015  filter8 = __msa_bmnz_v(filter8, (v16u8) r_out, flat2);
2016  ST_D1(filter8, 0, src);
2017  src += 16;
2018 
2019  /* q3 */
2020  tmp0_r = q7_r_in - q2_r_in;
2021  tmp0_r += q3_r_in;
2022  tmp0_r -= p4_r_in;
2023  tmp1_r += tmp0_r;
2024  r_out = __msa_srari_h((v8i16) tmp1_r, 4);
2025  r_out = (v8i16) __msa_pckev_b((v16i8) r_out, (v16i8) r_out);
2026  q3 = __msa_bmnz_v(q3, (v16u8) r_out, flat2);
2027  ST_D1(q3, 0, src);
2028  src += 16;
2029 
2030  /* q4 */
2031  tmp0_r = q7_r_in - q3_r_in;
2032  tmp0_r += q4_r_in;
2033  tmp0_r -= p3_r_in;
2034  tmp1_r += tmp0_r;
2035  r_out = __msa_srari_h((v8i16) tmp1_r, 4);
2036  r_out = (v8i16) __msa_pckev_b((v16i8) r_out, (v16i8) r_out);
2037  q4 = __msa_bmnz_v(q4, (v16u8) r_out, flat2);
2038  ST_D1(q4, 0, src);
2039  src += 16;
2040 
2041  /* q5 */
2042  tmp0_r = q7_r_in - q4_r_in;
2043  tmp0_r += q5_r_in;
2044  tmp0_r -= p2_r_in;
2045  tmp1_r += tmp0_r;
2046  r_out = __msa_srari_h((v8i16) tmp1_r, 4);
2047  r_out = (v8i16) __msa_pckev_b((v16i8) r_out, (v16i8) r_out);
2048  q5 = __msa_bmnz_v(q5, (v16u8) r_out, flat2);
2049  ST_D1(q5, 0, src);
2050  src += 16;
2051 
2052  /* q6 */
2053  tmp0_r = q7_r_in - q5_r_in;
2054  tmp0_r += q6_r_in;
2055  tmp0_r -= p1_r_in;
2056  tmp1_r += tmp0_r;
2057  r_out = __msa_srari_h((v8i16) tmp1_r, 4);
2058  r_out = (v8i16) __msa_pckev_b((v16i8) r_out, (v16i8) r_out);
2059  q6 = __msa_bmnz_v(q6, (v16u8) r_out, flat2);
2060  ST_D1(q6, 0, src);
2061 
2062  return 0;
2063  }
2064 }
2065 
2066 void ff_loop_filter_h_16_8_msa(uint8_t *src, ptrdiff_t pitch,
2067  int32_t b_limit_ptr,
2068  int32_t limit_ptr,
2069  int32_t thresh_ptr)
2070 {
2071  uint8_t early_exit = 0;
2072  uint8_t transposed_input[16 * 24] ALLOC_ALIGNED(ALIGNMENT);
2073  uint8_t *filter48 = &transposed_input[16 * 16];
2074 
2075  vp9_transpose_16x8_to_8x16(src - 8, pitch, transposed_input, 16);
2076 
2077  early_exit = vp9_vt_lpf_t4_and_t8_8w((transposed_input + 16 * 8),
2078  &filter48[0], src, pitch,
2079  b_limit_ptr, limit_ptr, thresh_ptr);
2080 
2081  if (0 == early_exit) {
2082  early_exit = vp9_vt_lpf_t16_8w((transposed_input + 16 * 8), src, pitch,
2083  &filter48[0]);
2084 
2085  if (0 == early_exit) {
2086  vp9_transpose_8x16_to_16x8(transposed_input, 16, src - 8, pitch);
2087  }
2088  }
2089 }
2090 
/**
 * Filter4/filter8 stage of the loop filter for 16 pixel positions, operating
 * on a transposed scratch buffer with a row stride of 16 bytes.
 *
 * p3..p0 are read from the four rows before src and q0..q3 from the four
 * rows starting at src.
 *
 * @param src         q0 row inside the transposed buffer
 * @param filter48    scratch output: rows p2, p1, p0, q0, q1, q2 (16 bytes
 *                    each) followed by the flat mask at offset 6 * 16; only
 *                    written when 0 is returned
 * @param src_org     position in the original picture; only written on the
 *                    early-exit path
 * @param pitch       byte stride of the original picture
 * @param b_limit_ptr value splatted into the b_limit vector for LPF_MASK_HEV
 * @param limit_ptr   value splatted into the limit vector for LPF_MASK_HEV
 * @param thresh_ptr  value splatted into the thresh vector for LPF_MASK_HEV
 * @return 1 if flat was zero for all 16 positions, in which case the filter4
 *         result (4 bytes per row, 16 rows) has been stored directly at
 *         src_org - 2; 0 if the filter8 results were written to filter48 and
 *         the wide-filter stage still has to run
 */
2091 static int32_t vp9_vt_lpf_t4_and_t8_16w(uint8_t *src, uint8_t *filter48,
2092  uint8_t *src_org, ptrdiff_t pitch,
2093  int32_t b_limit_ptr,
2094  int32_t limit_ptr,
2095  int32_t thresh_ptr)
2096 {
2097  v16u8 p3, p2, p1, p0, q3, q2, q1, q0;
2098  v16u8 p2_out, p1_out, p0_out, q0_out, q1_out, q2_out;
2099  v16u8 flat, mask, hev, thresh, b_limit, limit;
2100  v8u16 p3_r, p2_r, p1_r, p0_r, q0_r, q1_r, q2_r, q3_r;
2101  v8u16 p3_l, p2_l, p1_l, p0_l, q0_l, q1_l, q2_l, q3_l;
2102  v8i16 p2_filt8_r, p1_filt8_r, p0_filt8_r;
2103  v8i16 q0_filt8_r, q1_filt8_r, q2_filt8_r;
2104  v8i16 p2_filt8_l, p1_filt8_l, p0_filt8_l;
2105  v8i16 q0_filt8_l, q1_filt8_l, q2_filt8_l;
2106  v16i8 zero = { 0 };
2107  v8i16 vec0, vec1, vec2, vec3, vec4, vec5;
2108 
2109  /* load vector elements */
2110  LD_UB8(src - (4 * 16), 16, p3, p2, p1, p0, q0, q1, q2, q3);
2111 
2112  thresh = (v16u8) __msa_fill_b(thresh_ptr);
2113  b_limit = (v16u8) __msa_fill_b(b_limit_ptr);
2114  limit = (v16u8) __msa_fill_b(limit_ptr);
2115 
2116  /* mask and hev */
2117  LPF_MASK_HEV(p3, p2, p1, p0, q0, q1, q2, q3, limit, b_limit, thresh,
2118  hev, mask, flat);
2119  /* flat4 */
2120  VP9_FLAT4(p3, p2, p0, q0, q2, q3, flat);
2121  /* filter4 */
2122  VP9_LPF_FILTER4_4W(p1, p0, q0, q1, mask, hev, p1_out, p0_out, q0_out,
2123  q1_out);
2124 
2125  /* if flat is zero for all pixels, then no need to calculate other filter */
2126  if (__msa_test_bz_v(flat)) {
      /* re-interleave p1, p0, q0, q1 so that each 32-bit word holds one
       * picture row: vec2/vec3 from the right halves, vec4/vec5 from the
       * left halves */
2127  ILVR_B2_SH(p0_out, p1_out, q1_out, q0_out, vec0, vec1);
2128  ILVRL_H2_SH(vec1, vec0, vec2, vec3);
2129  ILVL_B2_SH(p0_out, p1_out, q1_out, q0_out, vec0, vec1);
2130  ILVRL_H2_SH(vec1, vec0, vec4, vec5);
2131 
      /* 4 bytes per row starting 2 pixels before src_org, 2 x 8 rows */
2132  src_org -= 2;
2133  ST_W8(vec2, vec3, 0, 1, 2, 3, 0, 1, 2, 3, src_org, pitch);
2134  ST_W8(vec4, vec5, 0, 1, 2, 3, 0, 1, 2, 3, src_org + 8 * pitch, pitch);
2135 
2136  return 1;
2137  } else {
      /* widen the right halves to 16 bit and run filter8 on them */
2138  ILVR_B8_UH(zero, p3, zero, p2, zero, p1, zero, p0, zero, q0, zero, q1,
2139  zero, q2, zero, q3, p3_r, p2_r, p1_r, p0_r, q0_r, q1_r, q2_r,
2140  q3_r);
2141  VP9_FILTER8(p3_r, p2_r, p1_r, p0_r, q0_r, q1_r, q2_r, q3_r, p2_filt8_r,
2142  p1_filt8_r, p0_filt8_r, q0_filt8_r, q1_filt8_r, q2_filt8_r);
      /* same for the left halves */
2143  ILVL_B4_UH(zero, p3, zero, p2, zero, p1, zero, p0, p3_l, p2_l, p1_l,
2144  p0_l);
2145  ILVL_B4_UH(zero, q0, zero, q1, zero, q2, zero, q3, q0_l, q1_l, q2_l,
2146  q3_l);
2147  VP9_FILTER8(p3_l, p2_l, p1_l, p0_l, q0_l, q1_l, q2_l, q3_l, p2_filt8_l,
2148  p1_filt8_l, p0_filt8_l, q0_filt8_l, q1_filt8_l, q2_filt8_l);
2149 
2150  /* convert 16 bit output data into 8 bit */
2151  PCKEV_B4_SH(p2_filt8_l, p2_filt8_r, p1_filt8_l, p1_filt8_r, p0_filt8_l,
2152  p0_filt8_r, q0_filt8_l, q0_filt8_r, p2_filt8_r, p1_filt8_r,
2153  p0_filt8_r, q0_filt8_r);
2154  PCKEV_B2_SH(q1_filt8_l, q1_filt8_r, q2_filt8_l, q2_filt8_r, q1_filt8_r,
2155  q2_filt8_r);
2156 
2157  /* store pixel values */
      /* filter8 result where flat is set, otherwise the filter4 result
       * (p1..q1) or the unfiltered pixel (p2, q2) */
2158  p2_out = __msa_bmnz_v(p2, (v16u8) p2_filt8_r, flat);
2159  p1_out = __msa_bmnz_v(p1_out, (v16u8) p1_filt8_r, flat);
2160  p0_out = __msa_bmnz_v(p0_out, (v16u8) p0_filt8_r, flat);
2161  q0_out = __msa_bmnz_v(q0_out, (v16u8) q0_filt8_r, flat);
2162  q1_out = __msa_bmnz_v(q1_out, (v16u8) q1_filt8_r, flat);
2163  q2_out = __msa_bmnz_v(q2, (v16u8) q2_filt8_r, flat);
2164 
      /* hand the six rows plus the flat mask over to the wide-filter stage */
2165  ST_UB4(p2_out, p1_out, p0_out, q0_out, filter48, 16);
2166  filter48 += (4 * 16);
2167  ST_UB2(q1_out, q2_out, filter48, 16);
2168  filter48 += (2 * 16);
2169  ST_UB(flat, filter48);
2170 
2171  return 0;
2172  }
2173 }
2174 
/**
 * Wide (16-pixel) filter stage for 16 pixel positions, operating on the
 * transposed scratch buffer (row stride 16 bytes). The right (_r) and left
 * (_l) halves of every vector are widened to 16 bit and processed with the
 * same arithmetic, then packed back into one 16-byte row.
 *
 * @param src      q0 row inside the transposed buffer; p7..p0 are read from
 *                 the 8 rows before it and q0..q7 from the 8 rows starting
 *                 at it
 * @param src_org  position in the original picture; only written on the
 *                 early-exit path
 * @param pitch    byte stride of the original picture
 * @param filter48 rows p2, p1, p0, q0, q1, q2 produced by the filter4/filter8
 *                 stage, followed by the flat mask at offset 6 * 16
 * @return 1 if flat2 was zero everywhere; the six filter48 rows have then
 *         been stored directly at src_org - 3 (6 bytes per row, 16 rows).
 *         0 if the wide filter ran; rows p6..q6 of the transposed buffer
 *         have then been rewritten and the caller has to transpose the
 *         buffer back (see ff_loop_filter_h_16_16_msa).
 */
2175 static int32_t vp9_vt_lpf_t16_16w(uint8_t *src, uint8_t *src_org, ptrdiff_t pitch,
2176  uint8_t *filter48)
2177 {
2178  v16u8 flat, flat2, filter8;
2179  v16i8 zero = { 0 };
2180  v16u8 p7, p6, p5, p4, p3, p2, p1, p0, q0, q1, q2, q3, q4, q5, q6, q7;
2181  v8u16 p7_r_in, p6_r_in, p5_r_in, p4_r_in;
2182  v8u16 p3_r_in, p2_r_in, p1_r_in, p0_r_in;
2183  v8u16 q7_r_in, q6_r_in, q5_r_in, q4_r_in;
2184  v8u16 q3_r_in, q2_r_in, q1_r_in, q0_r_in;
2185  v8u16 p7_l_in, p6_l_in, p5_l_in, p4_l_in;
2186  v8u16 p3_l_in, p2_l_in, p1_l_in, p0_l_in;
2187  v8u16 q7_l_in, q6_l_in, q5_l_in, q4_l_in;
2188  v8u16 q3_l_in, q2_l_in, q1_l_in, q0_l_in;
2189  v8u16 tmp0_r, tmp1_r, tmp0_l, tmp1_l;
2190  v8i16 l_out, r_out;
2191 
      /* flat mask saved by the filter4/filter8 stage */
2192  flat = LD_UB(filter48 + 6 * 16);
2193 
2194  LD_UB8((src - 8 * 16), 16, p7, p6, p5, p4, p3, p2, p1, p0);
2195  LD_UB8(src, 16, q0, q1, q2, q3, q4, q5, q6, q7);
2196 
2197  VP9_FLAT5(p7, p6, p5, p4, p0, q0, q4, q5, q6, q7, flat, flat2);
2198 
2199  /* if flat2 is zero for all pixels, then no need to calculate other filter */
2200  if (__msa_test_bz_v(flat2)) {
2201  v8i16 vec0, vec1, vec2, vec3, vec4, vec5, vec6, vec7;
2202 
2203  LD_UB4(filter48, 16, p2, p1, p0, q0);
2204  LD_UB2(filter48 + 4 * 16, 16, q1, q2);
2205 
      /* interleave back to picture order: vec3/vec4/vec6/vec7 carry
       * p2 p1 p0 q0 as one word per row, vec2/vec5 carry q1 q2 as one
       * halfword per row */
2206  ILVR_B2_SH(p1, p2, q0, p0, vec0, vec1);
2207  ILVRL_H2_SH(vec1, vec0, vec3, vec4);
2208  ILVL_B2_SH(p1, p2, q0, p0, vec0, vec1);
2209  ILVRL_H2_SH(vec1, vec0, vec6, vec7);
2210  ILVRL_B2_SH(q2, q1, vec2, vec5);
2211 
      /* 4 + 2 bytes per row, starting 3 pixels before src_org, 4 x 4 rows */
2212  src_org -= 3;
2213  ST_W4(vec3, 0, 1, 2, 3, src_org, pitch);
2214  ST_H4(vec2, 0, 1, 2, 3, (src_org + 4), pitch);
2215  src_org += (4 * pitch);
2216  ST_W4(vec4, 0, 1, 2, 3, src_org, pitch);
2217  ST_H4(vec2, 4, 5, 6, 7, (src_org + 4), pitch);
2218  src_org += (4 * pitch);
2219  ST_W4(vec6, 0, 1, 2, 3, src_org, pitch);
2220  ST_H4(vec5, 0, 1, 2, 3, (src_org + 4), pitch);
2221  src_org += (4 * pitch);
2222  ST_W4(vec7, 0, 1, 2, 3, src_org, pitch);
2223  ST_H4(vec5, 4, 5, 6, 7, (src_org + 4), pitch);
2224 
2225  return 1;
2226  } else {
      /* step back to the p6 row; every ST_UB below writes one full row and
       * src then advances by one row (16 bytes) */
2227  src -= 7 * 16;
2228 
2229  ILVR_B8_UH(zero, p7, zero, p6, zero, p5, zero, p4, zero, p3, zero, p2,
2230  zero, p1, zero, p0, p7_r_in, p6_r_in, p5_r_in, p4_r_in,
2231  p3_r_in, p2_r_in, p1_r_in, p0_r_in);
2232  q0_r_in = (v8u16) __msa_ilvr_b(zero, (v16i8) q0);
2233 
      /* p6: initial 16-weight sum
       *   7 * p7 + 2 * p6 + p5 + p4 + p3 + p2 + p1 + p0 + q0
       * kept in tmp1_r (right half) and tmp1_l (left half). Each following
       * tap updates the running sum by adding two samples and subtracting
       * two (tmp0_r / tmp0_l hold that delta); the output is the sum
       * shifted right by 4 with rounding. */
2234  tmp0_r = p7_r_in << 3;
2235  tmp0_r -= p7_r_in;
2236  tmp0_r += p6_r_in;
2237  tmp0_r += q0_r_in;
2238  tmp1_r = p6_r_in + p5_r_in;
2239  tmp1_r += p4_r_in;
2240  tmp1_r += p3_r_in;
2241  tmp1_r += p2_r_in;
2242  tmp1_r += p1_r_in;
2243  tmp1_r += p0_r_in;
2244  tmp1_r += tmp0_r;
2245  r_out = __msa_srari_h((v8i16) tmp1_r, 4);
2246 
2247  ILVL_B4_UH(zero, p7, zero, p6, zero, p5, zero, p4, p7_l_in, p6_l_in,
2248  p5_l_in, p4_l_in);
2249  ILVL_B4_UH(zero, p3, zero, p2, zero, p1, zero, p0, p3_l_in, p2_l_in,
2250  p1_l_in, p0_l_in);
2251  q0_l_in = (v8u16) __msa_ilvl_b(zero, (v16i8) q0);
2252 
2253  tmp0_l = p7_l_in << 3;
2254  tmp0_l -= p7_l_in;
2255  tmp0_l += p6_l_in;
2256  tmp0_l += q0_l_in;
2257  tmp1_l = p6_l_in + p5_l_in;
2258  tmp1_l += p4_l_in;
2259  tmp1_l += p3_l_in;
2260  tmp1_l += p2_l_in;
2261  tmp1_l += p1_l_in;
2262  tmp1_l += p0_l_in;
2263  tmp1_l += tmp0_l;
2264  l_out = __msa_srari_h((v8i16) tmp1_l, 4);
2265 
      /* pack both halves to bytes and take the wide-filter value only where
       * flat2 is set */
2266  r_out = (v8i16) __msa_pckev_b((v16i8) l_out, (v16i8) r_out);
2267  p6 = __msa_bmnz_v(p6, (v16u8) r_out, flat2);
2268  ST_UB(p6, src);
2269  src += 16;
2270 
2271  /* p5 */
2272  q1_r_in = (v8u16) __msa_ilvr_b(zero, (v16i8) q1);
2273  tmp0_r = p5_r_in - p6_r_in;
2274  tmp0_r += q1_r_in;
2275  tmp0_r -= p7_r_in;
2276  tmp1_r += tmp0_r;
2277  r_out = __msa_srari_h((v8i16) tmp1_r, 4);
2278  q1_l_in = (v8u16) __msa_ilvl_b(zero, (v16i8) q1);
2279  tmp0_l = p5_l_in - p6_l_in;
2280  tmp0_l += q1_l_in;
2281  tmp0_l -= p7_l_in;
2282  tmp1_l += tmp0_l;
2283  l_out = __msa_srari_h((v8i16) tmp1_l, 4);
2284  r_out = (v8i16) __msa_pckev_b((v16i8) l_out, (v16i8) r_out);
2285  p5 = __msa_bmnz_v(p5, (v16u8) r_out, flat2);
2286  ST_UB(p5, src);
2287  src += 16;
2288 
2289  /* p4 */
2290  q2_r_in = (v8u16) __msa_ilvr_b(zero, (v16i8) q2);
2291  tmp0_r = p4_r_in - p5_r_in;
2292  tmp0_r += q2_r_in;
2293  tmp0_r -= p7_r_in;
2294  tmp1_r += tmp0_r;
2295  r_out = __msa_srari_h((v8i16) tmp1_r, 4);
2296  q2_l_in = (v8u16) __msa_ilvl_b(zero, (v16i8) q2);
2297  tmp0_l = p4_l_in - p5_l_in;
2298  tmp0_l += q2_l_in;
2299  tmp0_l -= p7_l_in;
2300  tmp1_l += tmp0_l;
2301  l_out = __msa_srari_h((v8i16) tmp1_l, 4);
2302  r_out = (v8i16) __msa_pckev_b((v16i8) l_out, (v16i8) r_out);
2303  p4 = __msa_bmnz_v(p4, (v16u8) r_out, flat2);
2304  ST_UB(p4, src);
2305  src += 16;
2306 
2307  /* p3 */
2308  q3_r_in = (v8u16) __msa_ilvr_b(zero, (v16i8) q3);
2309  tmp0_r = p3_r_in - p4_r_in;
2310  tmp0_r += q3_r_in;
2311  tmp0_r -= p7_r_in;
2312  tmp1_r += tmp0_r;
2313  r_out = __msa_srari_h((v8i16) tmp1_r, 4);
2314  q3_l_in = (v8u16) __msa_ilvl_b(zero, (v16i8) q3);
2315  tmp0_l = p3_l_in - p4_l_in;
2316  tmp0_l += q3_l_in;
2317  tmp0_l -= p7_l_in;
2318  tmp1_l += tmp0_l;
2319  l_out = __msa_srari_h((v8i16) tmp1_l, 4);
2320  r_out = (v8i16) __msa_pckev_b((v16i8) l_out, (v16i8) r_out);
2321  p3 = __msa_bmnz_v(p3, (v16u8) r_out, flat2);
2322  ST_UB(p3, src);
2323  src += 16;
2324 
2325  /* p2 */
      /* from p2 through q2 the fallback value (where flat2 is clear) is the
       * matching filter48 row instead of the unfiltered pixel */
2326  q4_r_in = (v8u16) __msa_ilvr_b(zero, (v16i8) q4);
2327  filter8 = LD_UB(filter48);
2328  tmp0_r = p2_r_in - p3_r_in;
2329  tmp0_r += q4_r_in;
2330  tmp0_r -= p7_r_in;
2331  tmp1_r += tmp0_r;
2332  r_out = __msa_srari_h((v8i16) tmp1_r, 4);
2333  q4_l_in = (v8u16) __msa_ilvl_b(zero, (v16i8) q4);
2334  tmp0_l = p2_l_in - p3_l_in;
2335  tmp0_l += q4_l_in;
2336  tmp0_l -= p7_l_in;
2337  tmp1_l += tmp0_l;
2338  l_out = __msa_srari_h((v8i16) tmp1_l, 4);
2339  r_out = (v8i16) __msa_pckev_b((v16i8) l_out, (v16i8) r_out);
2340  filter8 = __msa_bmnz_v(filter8, (v16u8) r_out, flat2);
2341  ST_UB(filter8, src);
2342  src += 16;
2343 
2344  /* p1 */
2345  q5_r_in = (v8u16) __msa_ilvr_b(zero, (v16i8) q5);
2346  filter8 = LD_UB(filter48 + 16);
2347  tmp0_r = p1_r_in - p2_r_in;
2348  tmp0_r += q5_r_in;
2349  tmp0_r -= p7_r_in;
2350  tmp1_r += tmp0_r;
2351  r_out = __msa_srari_h((v8i16) tmp1_r, 4);
2352  q5_l_in = (v8u16) __msa_ilvl_b(zero, (v16i8) q5);
2353  tmp0_l = p1_l_in - p2_l_in;
2354  tmp0_l += q5_l_in;
2355  tmp0_l -= p7_l_in;
2356  tmp1_l += tmp0_l;
2357  l_out = __msa_srari_h((v8i16) (tmp1_l), 4);
2358  r_out = (v8i16) __msa_pckev_b((v16i8) l_out, (v16i8) r_out);
2359  filter8 = __msa_bmnz_v(filter8, (v16u8) r_out, flat2);
2360  ST_UB(filter8, src);
2361  src += 16;
2362 
2363  /* p0 */
2364  q6_r_in = (v8u16) __msa_ilvr_b(zero, (v16i8) q6);
2365  filter8 = LD_UB(filter48 + 32);
2366  tmp0_r = p0_r_in - p1_r_in;
2367  tmp0_r += q6_r_in;
2368  tmp0_r -= p7_r_in;
2369  tmp1_r += tmp0_r;
2370  r_out = __msa_srari_h((v8i16) tmp1_r, 4);
2371  q6_l_in = (v8u16) __msa_ilvl_b(zero, (v16i8) q6);
2372  tmp0_l = p0_l_in - p1_l_in;
2373  tmp0_l += q6_l_in;
2374  tmp0_l -= p7_l_in;
2375  tmp1_l += tmp0_l;
2376  l_out = __msa_srari_h((v8i16) tmp1_l, 4);
2377  r_out = (v8i16) __msa_pckev_b((v16i8) l_out, (v16i8) r_out);
2378  filter8 = __msa_bmnz_v(filter8, (v16u8) r_out, flat2);
2379  ST_UB(filter8, src);
2380  src += 16;
2381 
2382  /* q0 */
2383  q7_r_in = (v8u16) __msa_ilvr_b(zero, (v16i8) q7);
2384  filter8 = LD_UB(filter48 + 48);
2385  tmp0_r = q7_r_in - p0_r_in;
2386  tmp0_r += q0_r_in;
2387  tmp0_r -= p7_r_in;
2388  tmp1_r += tmp0_r;
2389  r_out = __msa_srari_h((v8i16) tmp1_r, 4);
2390  q7_l_in = (v8u16) __msa_ilvl_b(zero, (v16i8) q7);
2391  tmp0_l = q7_l_in - p0_l_in;
2392  tmp0_l += q0_l_in;
2393  tmp0_l -= p7_l_in;
2394  tmp1_l += tmp0_l;
2395  l_out = __msa_srari_h((v8i16) tmp1_l, 4);
2396  r_out = (v8i16) __msa_pckev_b((v16i8) l_out, (v16i8) r_out);
2397  filter8 = __msa_bmnz_v(filter8, (v16u8) r_out, flat2);
2398  ST_UB(filter8, src);
2399  src += 16;
2400 
2401  /* q1 */
      /* from here on the sample added on the far side is always q7 and the
       * sample dropped moves from p6 towards p1 */
2402  filter8 = LD_UB(filter48 + 64);
2403  tmp0_r = q7_r_in - q0_r_in;
2404  tmp0_r += q1_r_in;
2405  tmp0_r -= p6_r_in;
2406  tmp1_r += tmp0_r;
2407  r_out = __msa_srari_h((v8i16) tmp1_r, 4);
2408  tmp0_l = q7_l_in - q0_l_in;
2409  tmp0_l += q1_l_in;
2410  tmp0_l -= p6_l_in;
2411  tmp1_l += tmp0_l;
2412  l_out = __msa_srari_h((v8i16) tmp1_l, 4);
2413  r_out = (v8i16) __msa_pckev_b((v16i8) l_out, (v16i8) r_out);
2414  filter8 = __msa_bmnz_v(filter8, (v16u8) r_out, flat2);
2415  ST_UB(filter8, src);
2416  src += 16;
2417 
2418  /* q2 */
2419  filter8 = LD_UB(filter48 + 80);
2420  tmp0_r = q7_r_in - q1_r_in;
2421  tmp0_r += q2_r_in;
2422  tmp0_r -= p5_r_in;
2423  tmp1_r += tmp0_r;
2424  r_out = __msa_srari_h((v8i16) tmp1_r, 4);
2425  tmp0_l = q7_l_in - q1_l_in;
2426  tmp0_l += q2_l_in;
2427  tmp0_l -= p5_l_in;
2428  tmp1_l += tmp0_l;
2429  l_out = __msa_srari_h((v8i16) tmp1_l, 4);
2430  r_out = (v8i16) __msa_pckev_b((v16i8) l_out, (v16i8) r_out);
2431  filter8 = __msa_bmnz_v(filter8, (v16u8) r_out, flat2);
2432  ST_UB(filter8, src);
2433  src += 16;
2434 
2435  /* q3 */
2436  tmp0_r = q7_r_in - q2_r_in;
2437  tmp0_r += q3_r_in;
2438  tmp0_r -= p4_r_in;
2439  tmp1_r += tmp0_r;
2440  r_out = __msa_srari_h((v8i16) tmp1_r, 4);
2441  tmp0_l = q7_l_in - q2_l_in;
2442  tmp0_l += q3_l_in;
2443  tmp0_l -= p4_l_in;
2444  tmp1_l += tmp0_l;
2445  l_out = __msa_srari_h((v8i16) tmp1_l, 4);
2446  r_out = (v8i16) __msa_pckev_b((v16i8) l_out, (v16i8) r_out);
2447  q3 = __msa_bmnz_v(q3, (v16u8) r_out, flat2);
2448  ST_UB(q3, src);
2449  src += 16;
2450 
2451  /* q4 */
2452  tmp0_r = q7_r_in - q3_r_in;
2453  tmp0_r += q4_r_in;
2454  tmp0_r -= p3_r_in;
2455  tmp1_r += tmp0_r;
2456  r_out = __msa_srari_h((v8i16) tmp1_r, 4);
2457  tmp0_l = q7_l_in - q3_l_in;
2458  tmp0_l += q4_l_in;
2459  tmp0_l -= p3_l_in;
2460  tmp1_l += tmp0_l;
2461  l_out = __msa_srari_h((v8i16) tmp1_l, 4);
2462  r_out = (v8i16) __msa_pckev_b((v16i8) l_out, (v16i8) r_out);
2463  q4 = __msa_bmnz_v(q4, (v16u8) r_out, flat2);
2464  ST_UB(q4, src);
2465  src += 16;
2466 
2467  /* q5 */
2468  tmp0_r = q7_r_in - q4_r_in;
2469  tmp0_r += q5_r_in;
2470  tmp0_r -= p2_r_in;
2471  tmp1_r += tmp0_r;
2472  r_out = __msa_srari_h((v8i16) tmp1_r, 4);
2473  tmp0_l = q7_l_in - q4_l_in;
2474  tmp0_l += q5_l_in;
2475  tmp0_l -= p2_l_in;
2476  tmp1_l += tmp0_l;
2477  l_out = __msa_srari_h((v8i16) tmp1_l, 4);
2478  r_out = (v8i16) __msa_pckev_b((v16i8) l_out, (v16i8) r_out);
2479  q5 = __msa_bmnz_v(q5, (v16u8) r_out, flat2);
2480  ST_UB(q5, src);
2481  src += 16;
2482 
2483  /* q6 */
2484  tmp0_r = q7_r_in - q5_r_in;
2485  tmp0_r += q6_r_in;
2486  tmp0_r -= p1_r_in;
2487  tmp1_r += tmp0_r;
2488  r_out = __msa_srari_h((v8i16) tmp1_r, 4);
2489  tmp0_l = q7_l_in - q5_l_in;
2490  tmp0_l += q6_l_in;
2491  tmp0_l -= p1_l_in;
2492  tmp1_l += tmp0_l;
2493  l_out = __msa_srari_h((v8i16) tmp1_l, 4);
2494  r_out = (v8i16) __msa_pckev_b((v16i8) l_out, (v16i8) r_out);
2495  q6 = __msa_bmnz_v(q6, (v16u8) r_out, flat2);
2496  ST_UB(q6, src);
2497 
2498  return 0;
2499  }
2500 }
2501 
2502 void ff_loop_filter_h_16_16_msa(uint8_t *src, ptrdiff_t pitch,
2503  int32_t b_limit_ptr,
2504  int32_t limit_ptr,
2505  int32_t thresh_ptr)
2506 {
2507  uint8_t early_exit = 0;
2508  uint8_t transposed_input[16 * 24] ALLOC_ALIGNED(ALIGNMENT);
2509  uint8_t *filter48 = &transposed_input[16 * 16];
2510 
2511  vp9_transpose_16x16((src - 8), pitch, &transposed_input[0], 16);
2512 
2513  early_exit = vp9_vt_lpf_t4_and_t8_16w((transposed_input + 16 * 8),
2514  &filter48[0], src, pitch,
2515  b_limit_ptr, limit_ptr, thresh_ptr);
2516 
2517  if (0 == early_exit) {
2518  early_exit = vp9_vt_lpf_t16_16w((transposed_input + 16 * 8), src, pitch,
2519  &filter48[0]);
2520 
2521  if (0 == early_exit) {
2522  vp9_transpose_16x16(transposed_input, 16, (src - 8), pitch);
2523  }
2524  }
2525 }
ff_loop_filter_h_88_16_msa
void ff_loop_filter_h_88_16_msa(uint8_t *src, ptrdiff_t pitch, int32_t b_limit_ptr, int32_t limit_ptr, int32_t thresh_ptr)
Definition: vp9_lpf_msa.c:1349
vp9_transpose_8x16_to_16x8
static void vp9_transpose_8x16_to_16x8(uint8_t *input, int32_t in_pitch, uint8_t *output, int32_t out_pitch)
Definition: vp9_lpf_msa.c:1697
q1
static const uint8_t q1[256]
Definition: twofish.c:96
LD_UB8
#define LD_UB8(...)
Definition: generic_macros_msa.h:335
ff_loop_filter_v_44_16_msa
void ff_loop_filter_v_44_16_msa(uint8_t *src, ptrdiff_t pitch, int32_t b_limit_ptr, int32_t limit_ptr, int32_t thresh_ptr)
Definition: vp9_lpf_msa.c:225
ff_loop_filter_h_8_8_msa
void ff_loop_filter_h_8_8_msa(uint8_t *src, ptrdiff_t pitch, int32_t b_limit_ptr, int32_t limit_ptr, int32_t thresh_ptr)
Definition: vp9_lpf_msa.c:1271
ff_loop_filter_v_4_8_msa
void ff_loop_filter_v_4_8_msa(uint8_t *src, ptrdiff_t pitch, int32_t b_limit_ptr, int32_t limit_ptr, int32_t thresh_ptr)
Definition: vp9_lpf_msa.c:196
ST_UB2
#define ST_UB2(...)
Definition: generic_macros_msa.h:363
PCKEV_B4_SH
#define PCKEV_B4_SH(...)
Definition: generic_macros_msa.h:1740
output
filter_frame For filters that do not use the this method is called when a frame is pushed to the filter s input It can be called at any time except in a reentrant way If the input frame is enough to produce output
Definition: filter_design.txt:225
ff_loop_filter_h_48_16_msa
void ff_loop_filter_h_48_16_msa(uint8_t *src, ptrdiff_t pitch, int32_t b_limit_ptr, int32_t limit_ptr, int32_t thresh_ptr)
Definition: vp9_lpf_msa.c:1565
tmp
static uint8_t tmp[11]
Definition: aes_ctr.c:26
ST_D1
#define ST_D1(in, idx, pdst)
Definition: generic_macros_msa.h:485
vp9_transpose_16x16
static void vp9_transpose_16x16(uint8_t *input, int32_t in_pitch, uint8_t *output, int32_t out_pitch)
Definition: vp9_lpf_msa.c:1710
ST_UB8
#define ST_UB8(...)
Definition: generic_macros_msa.h:391
ST_UB4
#define ST_UB4(...)
Definition: generic_macros_msa.h:374
generic_macros_msa.h
ST_W4
#define ST_W4(in, idx0, idx1, idx2, idx3, pdst, stride)
Definition: vp8_lpf_lsx.c:234
VP9_FLAT5
#define VP9_FLAT5(p7_in, p6_in, p5_in, p4_in, p0_in, q0_in, q4_in, q5_in, q6_in, q7_in, flat_in, flat2_out)
Definition: vp9_lpf_msa.c:89
ILVR_W2_UB
#define ILVR_W2_UB(...)
Definition: generic_macros_msa.h:1416
LD_UB
#define LD_UB(...)
Definition: generic_macros_msa.h:32
VP9_FLAT4
#define VP9_FLAT4(p3_in, p2_in, p0_in, q0_in, q2_in, q3_in, flat_out)
Definition: vp9_lpf_msa.c:68
ILVEV_B2_SH
#define ILVEV_B2_SH(...)
Definition: generic_macros_msa.h:1190
vp9_hz_lpf_t4_and_t8_16w
static int32_t vp9_hz_lpf_t4_and_t8_16w(uint8_t *src, ptrdiff_t pitch, uint8_t *filter48, int32_t b_limit_ptr, int32_t limit_ptr, int32_t thresh_ptr)
Definition: vp9_lpf_msa.c:551
ILVEV_B2_UB
#define ILVEV_B2_UB(...)
Definition: generic_macros_msa.h:1188
ILVR_B8_UH
#define ILVR_B8_UH(...)
Definition: generic_macros_msa.h:1374
mask
static const uint16_t mask[17]
Definition: lzw.c:38
SLDI_B4_UB
#define SLDI_B4_UB(...)
Definition: generic_macros_msa.h:643
vp9_vt_lpf_t4_and_t8_8w
static int32_t vp9_vt_lpf_t4_and_t8_8w(uint8_t *src, uint8_t *filter48, uint8_t *src_org, int32_t pitch_org, int32_t b_limit_ptr, int32_t limit_ptr, int32_t thresh_ptr)
Definition: vp9_lpf_msa.c:1770
ff_loop_filter_v_48_16_msa
void ff_loop_filter_v_48_16_msa(uint8_t *src, ptrdiff_t pitch, int32_t b_limit_ptr, int32_t limit_ptr, int32_t thresh_ptr)
Definition: vp9_lpf_msa.c:479
q0
static const uint8_t q0[256]
Definition: twofish.c:77
vp9dsp_mips.h
vp9_hz_lpf_t16_16w
static void vp9_hz_lpf_t16_16w(uint8_t *src, ptrdiff_t pitch, uint8_t *filter48)
Definition: vp9_lpf_msa.c:626
ILVL_B2_SH
#define ILVL_B2_SH(...)
Definition: generic_macros_msa.h:1265
PCKEV_B2_UB
#define PCKEV_B2_UB(...)
Definition: generic_macros_msa.h:1720
ILVRL_H2_SH
#define ILVRL_H2_SH(...)
Definition: generic_macros_msa.h:1508
ff_loop_filter_h_84_16_msa
void ff_loop_filter_h_84_16_msa(uint8_t *src, ptrdiff_t pitch, int32_t b_limit_ptr, int32_t limit_ptr, int32_t thresh_ptr)
Definition: vp9_lpf_msa.c:1462
ILVEV_H2_SW
#define ILVEV_H2_SW(...)
Definition: generic_macros_msa.h:1209
src
#define src
Definition: vp8dsp.c:255
ff_loop_filter_v_16_16_msa
void ff_loop_filter_v_16_16_msa(uint8_t *src, ptrdiff_t pitch, int32_t b_limit_ptr, int32_t limit_ptr, int32_t thresh_ptr)
Definition: vp9_lpf_msa.c:958
ff_loop_filter_h_4_8_msa
void ff_loop_filter_h_4_8_msa(uint8_t *src, ptrdiff_t pitch, int32_t b_limit_ptr, int32_t limit_ptr, int32_t thresh_ptr)
Definition: vp9_lpf_msa.c:1198
ALLOC_ALIGNED
#define ALLOC_ALIGNED(align)
Definition: vp9_idct_lsx.c:28
TRANSPOSE8x8_UB_UB
#define TRANSPOSE8x8_UB_UB(...)
Definition: generic_macros_msa.h:2375
vp9_vt_lpf_t16_16w
static int32_t vp9_vt_lpf_t16_16w(uint8_t *src, uint8_t *src_org, ptrdiff_t pitch, uint8_t *filter48)
Definition: vp9_lpf_msa.c:2175
vp9dsp.h
ILVL_W2_UB
#define ILVL_W2_UB(...)
Definition: generic_macros_msa.h:1318
vp9_vt_lpf_t16_8w
static int32_t vp9_vt_lpf_t16_8w(uint8_t *src, uint8_t *src_org, ptrdiff_t pitch, uint8_t *filter48)
Definition: vp9_lpf_msa.c:1842
SD4
#define SD4(in0, in1, in2, in3, pdst, stride)
Definition: generic_macros_msa.h:256
LD_UB4
#define LD_UB4(...)
Definition: generic_macros_msa.h:296
ILVR_B2_SB
#define ILVR_B2_SB(...)
Definition: generic_macros_msa.h:1338
ff_loop_filter_h_16_8_msa
void ff_loop_filter_h_16_8_msa(uint8_t *src, ptrdiff_t pitch, int32_t b_limit_ptr, int32_t limit_ptr, int32_t thresh_ptr)
Definition: vp9_lpf_msa.c:2066
TRANSPOSE16x8_UB_UB
#define TRANSPOSE16x8_UB_UB(in0, in1, in2, in3, in4, in5, in6, in7, in8, in9, in10, in11, in12, in13, in14, in15, out0, out1, out2, out3, out4, out5, out6, out7)
Definition: generic_macros_msa.h:2420
ff_loop_filter_v_84_16_msa
void ff_loop_filter_v_84_16_msa(uint8_t *src, ptrdiff_t pitch, int32_t b_limit_ptr, int32_t limit_ptr, int32_t thresh_ptr)
Definition: vp9_lpf_msa.c:408
input
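... and forward the result (frame or status change) to the corresponding input. If nothing is possible, test the status of outputs and forward it to the corresponding input(s). If still not possible, return FFERROR_NOT_READY. If the filter stores internally one or a few frames for some input, ...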
Definition: filter_design.txt:172
vp9_transpose_16x8_to_8x16
static void vp9_transpose_16x8_to_8x16(uint8_t *input, int32_t in_pitch, uint8_t *output, int32_t out_pitch)
Definition: vp9_lpf_msa.c:1670
PCKEV_B2_SH
#define PCKEV_B2_SH(...)
Definition: generic_macros_msa.h:1721
ST_H4
#define ST_H4(in, idx0, idx1, idx2, idx3, pdst, stride)
Definition: generic_macros_msa.h:417
ff_loop_filter_v_16_8_msa
void ff_loop_filter_v_16_8_msa(uint8_t *src, ptrdiff_t pitch, int32_t b_limit_ptr, int32_t limit_ptr, int32_t thresh_ptr)
Definition: vp9_lpf_msa.c:974
ST_W8
#define ST_W8(in0, in1, idx0, idx1, idx2, idx3, idx4, idx5, idx6, idx7, pdst, stride)
Definition: generic_macros_msa.h:470
ff_loop_filter_h_44_16_msa
void ff_loop_filter_h_44_16_msa(uint8_t *src, ptrdiff_t pitch, int32_t b_limit_ptr, int32_t limit_ptr, int32_t thresh_ptr)
Definition: vp9_lpf_msa.c:1225
ILVL_B2_SB
#define ILVL_B2_SB(...)
Definition: generic_macros_msa.h:1263
limit
static double limit(double x)
Definition: vf_pseudocolor.c:128
ST_UB
#define ST_UB(...)
Definition: generic_macros_msa.h:40
LD_UB2
#define LD_UB2(...)
Definition: generic_macros_msa.h:277
ff_loop_filter_v_8_8_msa
void ff_loop_filter_v_8_8_msa(uint8_t *src, ptrdiff_t pitch, int32_t b_limit_ptr, int32_t limit_ptr, int32_t thresh_ptr)
Definition: vp9_lpf_msa.c:255
ff_loop_filter_h_16_16_msa
void ff_loop_filter_h_16_16_msa(uint8_t *src, ptrdiff_t pitch, int32_t b_limit_ptr, int32_t limit_ptr, int32_t thresh_ptr)
Definition: vp9_lpf_msa.c:2502
ILVL_B4_SB
#define ILVL_B4_SB(...)
Definition: generic_macros_msa.h:1274
zero
#define zero
Definition: regdef.h:64
hev
static av_always_inline int hev(uint8_t *p, ptrdiff_t stride, int thresh)
Definition: vp8dsp_mmi.c:731
VP9_FILTER8
#define VP9_FILTER8(p3_in, p2_in, p1_in, p0_in, q0_in, q1_in, q2_in, q3_in, p2_filt8_out, p1_filt8_out, p0_filt8_out, q0_filt8_out, q1_filt8_out, q2_filt8_out)
Definition: vp9_lpf_msa.c:119
ILVRL_B2_SH
#define ILVRL_B2_SH(...)
Definition: generic_macros_msa.h:1498
ff_loop_filter_v_88_16_msa
void ff_loop_filter_v_88_16_msa(uint8_t *src, ptrdiff_t pitch, int32_t b_limit_ptr, int32_t limit_ptr, int32_t thresh_ptr)
Definition: vp9_lpf_msa.c:329
vp9_vt_lpf_t4_and_t8_16w
static int32_t vp9_vt_lpf_t4_and_t8_16w(uint8_t *src, uint8_t *filter48, uint8_t *src_org, ptrdiff_t pitch, int32_t b_limit_ptr, int32_t limit_ptr, int32_t thresh_ptr)
Definition: vp9_lpf_msa.c:2091
int32_t
int32_t
Definition: audioconvert.c:56
LPF_MASK_HEV
#define LPF_MASK_HEV(p3_in, p2_in, p1_in, p0_in, q0_in, q1_in, q2_in, q3_in, limit_in, b_limit_in, thresh_in, hev_out, mask_out, flat_out)
Definition: vp9_lpf_msa.c:158
flat
static av_always_inline void flat(WaveformContext *s, AVFrame *in, AVFrame *out, int component, int intensity, int offset_y, int offset_x, int column, int mirror, int jobnr, int nb_jobs)
Definition: vf_waveform.c:1099
ALIGNMENT
#define ALIGNMENT
Definition: generic_macros_msa.h:28
ILVR_B2_SH
#define ILVR_B2_SH(...)
Definition: generic_macros_msa.h:1340
ILVL_B4_UH
#define ILVL_B4_UH(...)
Definition: generic_macros_msa.h:1275
SD
#define SD
Definition: ccaption_dec.c:928
VP9_LPF_FILTER4_4W
#define VP9_LPF_FILTER4_4W(p1_in, p0_in, q0_in, q1_in, mask_in, hev_in, p1_out, p0_out, q0_out, q1_out)
Definition: vp9_lpf_msa.c:25