25 int16_t qadd, int8_t n_coeffs,
28 int16_t *block_dup =
block;
30 v8i16 block_vec, qmul_vec, qadd_vec, sub;
31 v8i16 add,
mask, mul, zero_mask;
33 qmul_vec = __msa_fill_h(qmul);
34 qadd_vec = __msa_fill_h(qadd);
35 for (cnt = 0; cnt < (n_coeffs >> 3); cnt++) {
36 block_vec =
LD_SH(block_dup + loop_start);
37 mask = __msa_clti_s_h(block_vec, 0);
38 zero_mask = __msa_ceqi_h(block_vec, 0);
39 mul = block_vec * qmul_vec;
42 add = (v8i16) __msa_bmnz_v((v16u8) add, (v16u8) sub, (v16u8)
mask);
43 block_vec = (v8i16) __msa_bmnz_v((v16u8) add, (v16u8) block_vec,
45 ST_SH(block_vec, block_dup + loop_start);
49 cnt = ((n_coeffs >> 3) * 8) + loop_start;
51 for (; cnt <= n_coeffs; cnt++) {
55 level = level * qmul - qadd;
57 level = level * qmul + qadd;
66 const int16_t *quant_matrix)
69 v8i16 block_vec, block_neg, qscale_vec,
mask;
70 v8i16 block_org0, block_org1, block_org2, block_org3;
71 v8i16 quant_m0, quant_m1, quant_m2, quant_m3;
72 v8i16 sum, mul, zero_mask;
73 v4i32 mul_vec, qscale_l, qscale_r, quant_m_r, quant_m_l;
74 v4i32 block_l, block_r, sad;
76 qscale_vec = __msa_fill_h(qscale);
77 for (cnt = 0; cnt < 2; cnt++) {
78 LD_SH4(block, 8, block_org0, block_org1, block_org2, block_org3);
79 LD_SH4(quant_matrix, 8, quant_m0, quant_m1, quant_m2, quant_m3);
80 mask = __msa_clti_s_h(block_org0, 0);
81 zero_mask = __msa_ceqi_h(block_org0, 0);
82 block_neg = -block_org0;
83 block_vec = (v8i16) __msa_bmnz_v((v16u8) block_org0, (v16u8) block_neg,
90 mul_vec = block_l * qscale_l;
92 block_l = mul_vec >> 4;
93 mul_vec = block_r * qscale_r;
95 block_r = mul_vec >> 4;
96 mul = (v8i16) __msa_pckev_h((v8i16) block_l, (v8i16) block_r);
98 sum = (v8i16) __msa_bmnz_v((v16u8) mul, (v16u8) block_neg,
100 sum = (v8i16) __msa_bmnz_v((v16u8) sum, (v16u8) block_org0,
105 sad = __msa_hadd_s_w(sum, sum);
107 mask = __msa_clti_s_h(block_org1, 0);
108 zero_mask = __msa_ceqi_h(block_org1, 0);
109 block_neg = - block_org1;
110 block_vec = (v8i16) __msa_bmnz_v((v16u8) block_org1, (v16u8) block_neg,
117 mul_vec = block_l * qscale_l;
118 mul_vec *= quant_m_l;
119 block_l = mul_vec >> 4;
120 mul_vec = block_r * qscale_r;
121 mul_vec *= quant_m_r;
122 block_r = mul_vec >> 4;
123 mul = __msa_pckev_h((v8i16) block_l, (v8i16) block_r);
125 sum = (v8i16) __msa_bmnz_v((v16u8) mul, (v16u8) block_neg,
127 sum = (v8i16) __msa_bmnz_v((v16u8) sum, (v16u8) block_org1,
133 sad = __msa_hadd_s_w(sum, sum);
135 mask = __msa_clti_s_h(block_org2, 0);
136 zero_mask = __msa_ceqi_h(block_org2, 0);
137 block_neg = - block_org2;
138 block_vec = (v8i16) __msa_bmnz_v((v16u8) block_org2, (v16u8) block_neg,
145 mul_vec = block_l * qscale_l;
146 mul_vec *= quant_m_l;
147 block_l = mul_vec >> 4;
148 mul_vec = block_r * qscale_r;
149 mul_vec *= quant_m_r;
150 block_r = mul_vec >> 4;
151 mul = __msa_pckev_h((v8i16) block_l, (v8i16) block_r);
153 sum = (v8i16) __msa_bmnz_v((v16u8) mul, (v16u8) block_neg,
155 sum = (v8i16) __msa_bmnz_v((v16u8) sum, (v16u8) block_org2,
161 sad = __msa_hadd_s_w(sum, sum);
163 mask = __msa_clti_s_h(block_org3, 0);
164 zero_mask = __msa_ceqi_h(block_org3, 0);
165 block_neg = - block_org3;
166 block_vec = (v8i16) __msa_bmnz_v((v16u8) block_org3, (v16u8) block_neg,
173 mul_vec = block_l * qscale_l;
174 mul_vec *= quant_m_l;
175 block_l = mul_vec >> 4;
176 mul_vec = block_r * qscale_r;
177 mul_vec *= quant_m_r;
178 block_r = mul_vec >> 4;
179 mul = __msa_pckev_h((v8i16) block_l, (v8i16) block_r);
181 sum = (v8i16) __msa_bmnz_v((v16u8) mul, (v16u8) block_neg,
183 sum = (v8i16) __msa_bmnz_v((v16u8) sum, (v16u8) block_org3,
189 sad = __msa_hadd_s_w(sum, sum);
209 qadd = (qscale - 1) | 1;
230 qadd = (qscale - 1) | 1;
242 const uint16_t *quant_matrix;
249 block[63] ^= sum & 1;
void ff_dct_unquantize_h263_intra_msa(MpegEncContext *s, int16_t *block, int32_t index, int32_t qscale)
int h263_aic
Advanced INTRA Coding (AIC)
#define av_assert2(cond)
assert() equivalent that is intended for use inside speed-critical code.
static const uint16_t mask[17]
static int32_t mpeg2_dct_unquantize_inter_msa(int16_t *block, int32_t qscale, const int16_t *quant_matrix)
int block_last_index[12]
last non-zero coefficient in block
void ff_dct_unquantize_mpeg2_inter_msa(MpegEncContext *s, int16_t *block, int32_t index, int32_t qscale)
#define UNPCK_SH_SW(in, out0, out1)
uint16_t inter_matrix[64]
void ff_dct_unquantize_h263_inter_msa(MpegEncContext *s, int16_t *block, int32_t index, int32_t qscale)
ScanTable inter_scantable
If inter == intra, then intra should be used to reduce cache usage.
static void h263_dct_unquantize_msa(int16_t *block, int16_t qmul, int16_t qadd, int8_t n_coeffs, uint8_t loop_start)