FFmpeg
All Data Structures Namespaces Files Functions Variables Typedefs Enumerations Enumerator Macros Modules Pages
postprocess_template.c
Go to the documentation of this file.
1 /*
2  * Copyright (C) 2001-2002 Michael Niedermayer (michaelni@gmx.at)
3  *
4  * This file is part of FFmpeg.
5  *
6  * FFmpeg is free software; you can redistribute it and/or modify
7  * it under the terms of the GNU General Public License as published by
8  * the Free Software Foundation; either version 2 of the License, or
9  * (at your option) any later version.
10  *
11  * FFmpeg is distributed in the hope that it will be useful,
12  * but WITHOUT ANY WARRANTY; without even the implied warranty of
13  * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
14  * GNU General Public License for more details.
15  *
16  * You should have received a copy of the GNU General Public License
17  * along with FFmpeg; if not, write to the Free Software
18  * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
19  */
20 
21 /**
22  * @file
23  * mmx/mmx2/sse2 postprocess code.
24  */
25 #include "config.h"
26 
27 #include "libavutil/mem_internal.h"
28 #if ARCH_X86
29 #include "libavutil/x86/asm.h"
30 #endif
31 
32 /* A single TEMPLATE_PP_* should be defined (to 1) when this template is
33  * included. The following macros will define its dependencies to 1 as well
34  * (like MMX2 depending on MMX), and will define to 0 all the others. Every
35  * TEMPLATE_PP_* need to be undef at the end. */
36 
/* RENAME() appends a per-variant suffix to every function defined in this
 * template, so one source file can be compiled once per instruction set. */
37 #ifdef TEMPLATE_PP_C
38 # define RENAME(a) a ## _C
39 #else
40 # define TEMPLATE_PP_C 0
41 #endif
42 
43 #ifdef TEMPLATE_PP_ALTIVEC
44 # define RENAME(a) a ## _altivec
45 #else
46 # define TEMPLATE_PP_ALTIVEC 0
47 #endif
48 
49 #ifdef TEMPLATE_PP_MMX
50 # define RENAME(a) a ## _MMX
51 #else
52 # define TEMPLATE_PP_MMX 0
53 #endif
54 
/* Dependency chain: MMXEXT implies MMX ... */
55 #ifdef TEMPLATE_PP_MMXEXT
56 # undef TEMPLATE_PP_MMX
57 # define TEMPLATE_PP_MMX 1
58 # define RENAME(a) a ## _MMX2
59 #else
60 # define TEMPLATE_PP_MMXEXT 0
61 #endif
62 
/* ... and SSE2 implies both MMXEXT and MMX. */
63 #ifdef TEMPLATE_PP_SSE2
64 # undef TEMPLATE_PP_MMX
65 # define TEMPLATE_PP_MMX 1
66 # undef TEMPLATE_PP_MMXEXT
67 # define TEMPLATE_PP_MMXEXT 1
68 # define RENAME(a) a ## _SSE2
69 #else
70 # define TEMPLATE_PP_SSE2 0
71 #endif
72 
/* Helper macros expanding to inline-asm instruction strings; they are
 * redefined on every inclusion of this template, hence the #undefs. */
73 #undef REAL_PAVGB
74 #undef PAVGB
75 #undef PMINUB
76 #undef PMAXUB
77 
78 #if TEMPLATE_PP_MMXEXT
79 #define REAL_PAVGB(a,b) "pavgb " #a ", " #b " \n\t"
80 #endif
81 #define PAVGB(a,b) REAL_PAVGB(a,b) /* indirection so arguments expand before stringification */
82 
83 #if TEMPLATE_PP_MMXEXT
84 #define PMINUB(a,b,t) "pminub " #a ", " #b " \n\t" /* t: scratch reg, unused on MMXEXT */
85 #endif
86 
87 #if TEMPLATE_PP_MMXEXT
88 #define PMAXUB(a,b) "pmaxub " #a ", " #b " \n\t"
89 #endif
90 
91 //FIXME? |255-0| = 1 (should not be a problem ...)
92 #if TEMPLATE_PP_MMXEXT
93 /**
94  * Check if the middle 8x8 Block in the given 8x16 block is flat
95  */
96 static inline int RENAME(vertClassify)(const uint8_t src[], int stride, PPContext *c){
/* Classifies the middle 8x8 block of an 8x16 region for deblocking:
 * counts (via psadbw of per-row compare masks) how many vertical neighbor
 * pairs differ by less than the DC threshold, and tests max-min of the
 * block against 2*QP. Returns 2 when the block is judged non-flat
 * (numEq below flatnessThreshold); otherwise 0 or 1 depending on the
 * max-min vs 2*QP test result held in dcOk. */
97  int numEq= 0, dcOk;
98  src+= stride*4; // src points to begin of the 8x8 Block
99  __asm__ volatile(
100  "movq %0, %%mm7 \n\t" // mm7 = per-QP DC offset (broadcast to all 8 bytes)
101  "movq %1, %%mm6 \n\t" // mm6 = per-QP DC threshold
102  : : "m" (c->mmxDcOffset[c->nonBQP]), "m" (c->mmxDcThreshold[c->nonBQP])
103  );
104 
105  __asm__ volatile(
106  "lea (%2, %3), %%"FF_REG_a" \n\t"
107 // 0 1 2 3 4 5 6 7 8 9
108 // %1 eax eax+%2 eax+2%2 %1+4%2 ecx ecx+%2 ecx+2%2 %1+8%2 ecx+4%2
109 
// Each of the 7 row-pair stages below: track running min (mm3) / max (mm4)
// of the pixels, and accumulate in mm0 a bytewise count of "nearly equal"
// neighbor pairs (diff + offset compared against the threshold).
110  "movq (%2), %%mm0 \n\t"
111  "movq (%%"FF_REG_a"), %%mm1 \n\t"
112  "movq %%mm0, %%mm3 \n\t"
113  "movq %%mm0, %%mm4 \n\t"
114  PMAXUB(%%mm1, %%mm4)
115  PMINUB(%%mm1, %%mm3, %%mm5)
116  "psubb %%mm1, %%mm0 \n\t" // mm0 = difference
117  "paddb %%mm7, %%mm0 \n\t"
118  "pcmpgtb %%mm6, %%mm0 \n\t"
119 
120  "movq (%%"FF_REG_a",%3), %%mm2 \n\t"
121  PMAXUB(%%mm2, %%mm4)
122  PMINUB(%%mm2, %%mm3, %%mm5)
123  "psubb %%mm2, %%mm1 \n\t"
124  "paddb %%mm7, %%mm1 \n\t"
125  "pcmpgtb %%mm6, %%mm1 \n\t"
126  "paddb %%mm1, %%mm0 \n\t"
127 
128  "movq (%%"FF_REG_a", %3, 2), %%mm1 \n\t"
129  PMAXUB(%%mm1, %%mm4)
130  PMINUB(%%mm1, %%mm3, %%mm5)
131  "psubb %%mm1, %%mm2 \n\t"
132  "paddb %%mm7, %%mm2 \n\t"
133  "pcmpgtb %%mm6, %%mm2 \n\t"
134  "paddb %%mm2, %%mm0 \n\t"
135 
136  "lea (%%"FF_REG_a", %3, 4), %%"FF_REG_a"\n\t"
137 
138  "movq (%2, %3, 4), %%mm2 \n\t"
139  PMAXUB(%%mm2, %%mm4)
140  PMINUB(%%mm2, %%mm3, %%mm5)
141  "psubb %%mm2, %%mm1 \n\t"
142  "paddb %%mm7, %%mm1 \n\t"
143  "pcmpgtb %%mm6, %%mm1 \n\t"
144  "paddb %%mm1, %%mm0 \n\t"
145 
146  "movq (%%"FF_REG_a"), %%mm1 \n\t"
147  PMAXUB(%%mm1, %%mm4)
148  PMINUB(%%mm1, %%mm3, %%mm5)
149  "psubb %%mm1, %%mm2 \n\t"
150  "paddb %%mm7, %%mm2 \n\t"
151  "pcmpgtb %%mm6, %%mm2 \n\t"
152  "paddb %%mm2, %%mm0 \n\t"
153 
154  "movq (%%"FF_REG_a", %3), %%mm2 \n\t"
155  PMAXUB(%%mm2, %%mm4)
156  PMINUB(%%mm2, %%mm3, %%mm5)
157  "psubb %%mm2, %%mm1 \n\t"
158  "paddb %%mm7, %%mm1 \n\t"
159  "pcmpgtb %%mm6, %%mm1 \n\t"
160  "paddb %%mm1, %%mm0 \n\t"
161 
162  "movq (%%"FF_REG_a", %3, 2), %%mm1 \n\t"
163  PMAXUB(%%mm1, %%mm4)
164  PMINUB(%%mm1, %%mm3, %%mm5)
165  "psubb %%mm1, %%mm2 \n\t"
166  "paddb %%mm7, %%mm2 \n\t"
167  "pcmpgtb %%mm6, %%mm2 \n\t"
168  "paddb %%mm2, %%mm0 \n\t"
169  "psubusb %%mm3, %%mm4 \n\t" // mm4 = max - min (per byte, saturating)
170 
171  " \n\t"
172  "pxor %%mm7, %%mm7 \n\t"
173  "psadbw %%mm7, %%mm0 \n\t" // horizontal sum of the (negative) pair counts
174  "movq %4, %%mm7 \n\t" // QP,..., QP
175  "paddusb %%mm7, %%mm7 \n\t" // 2QP ... 2QP
176  "psubusb %%mm7, %%mm4 \n\t" // Diff <= 2QP -> 0
177  "packssdw %%mm4, %%mm4 \n\t"
178  "movd %%mm0, %0 \n\t"
179  "movd %%mm4, %1 \n\t"
180 
181  : "=r" (numEq), "=r" (dcOk)
182  : "r" (src), "r" ((x86_reg)stride), "m" (c->pQPb)
183  : "%"FF_REG_a
184  );
185 
186  numEq= (-numEq) &0xFF; // pcmpgtb yields 0/-1 per byte, so negate the psadbw sum
187  if(numEq > c->ppMode.flatnessThreshold){
188  if(dcOk) return 0;
189  else return 1;
190  }else{
191  return 2;
192  }
193 }
194 #endif //TEMPLATE_PP_MMXEXT
195 
196 /**
197  * Do a vertical low pass filter on the 8x16 block (only write to the 8x8 block in the middle)
198  * using the 9-Tap Filter (1,1,2,2,4,2,2,1,1)/16
199  */
200 #if !TEMPLATE_PP_ALTIVEC
201 static inline void RENAME(doVertLowPass)(uint8_t *src, int stride, PPContext *c)
202 {
/* Vertical (1,1,2,2,4,2,2,1,1)/16 low-pass over the middle 8 rows of an
 * 8x16 block; the boundary rows are conditionally replaced (clipped to QP)
 * before filtering so strong edges outside the block do not bleed in.
 * The MMXEXT path builds each output row from chains of pavgb averages;
 * the C fallback computes the same taps with running column sums. */
203 #if TEMPLATE_PP_MMXEXT
204  src+= stride*3;
205  __asm__ volatile( //"movv %0 %1 %2\n\t"
206  "movq %2, %%mm0 \n\t" // QP,..., QP
207  "pxor %%mm4, %%mm4 \n\t"
208 
// Select the top "virtual" row: use row 0 only if it is within QP of row 1,
// otherwise substitute row 1 (avoids smoothing across a real edge).
209  "movq (%0), %%mm6 \n\t"
210  "movq (%0, %1), %%mm5 \n\t"
211  "movq %%mm5, %%mm1 \n\t"
212  "movq %%mm6, %%mm2 \n\t"
213  "psubusb %%mm6, %%mm5 \n\t"
214  "psubusb %%mm1, %%mm2 \n\t"
215  "por %%mm5, %%mm2 \n\t" // ABS Diff of lines
216  "psubusb %%mm0, %%mm2 \n\t" // diff <= QP -> 0
217  "pcmpeqb %%mm4, %%mm2 \n\t" // diff <= QP -> FF
218 
219  "pand %%mm2, %%mm6 \n\t"
220  "pandn %%mm1, %%mm2 \n\t"
221  "por %%mm2, %%mm6 \n\t"// First Line to Filter
222 
// Same selection for the bottom "virtual" row (rows 8/9).
223  "movq (%0, %1, 8), %%mm5 \n\t"
224  "lea (%0, %1, 4), %%"FF_REG_a" \n\t"
225  "lea (%0, %1, 8), %%"FF_REG_c" \n\t"
226  "sub %1, %%"FF_REG_c" \n\t"
227  "add %1, %0 \n\t" // %0 points to line 1 not 0
228  "movq (%0, %1, 8), %%mm7 \n\t"
229  "movq %%mm5, %%mm1 \n\t"
230  "movq %%mm7, %%mm2 \n\t"
231  "psubusb %%mm7, %%mm5 \n\t"
232  "psubusb %%mm1, %%mm2 \n\t"
233  "por %%mm5, %%mm2 \n\t" // ABS Diff of lines
234  "psubusb %%mm0, %%mm2 \n\t" // diff <= QP -> 0
235  "pcmpeqb %%mm4, %%mm2 \n\t" // diff <= QP -> FF
236 
237  "pand %%mm2, %%mm7 \n\t"
238  "pandn %%mm1, %%mm2 \n\t"
239  "por %%mm2, %%mm7 \n\t" // First Line to Filter
240 
241 
242  // 1 2 3 4 5 6 7 8
243  // %0 %0+%1 %0+2%1 eax %0+4%1 eax+2%1 ecx eax+4%1
244  // 6 4 2 2 1 1
245  // 6 4 4 2
246  // 6 8 2
247 
// The digit strings in the comments below track which source rows (and with
// what weight) have been folded into the register so far; /N is the divisor.
248  "movq (%0, %1), %%mm0 \n\t" // 1
249  "movq %%mm0, %%mm1 \n\t" // 1
250  PAVGB(%%mm6, %%mm0) //1 1 /2
251  PAVGB(%%mm6, %%mm0) //3 1 /4
252 
253  "movq (%0, %1, 4), %%mm2 \n\t" // 1
254  "movq %%mm2, %%mm5 \n\t" // 1
255  PAVGB((%%FF_REGa), %%mm2) // 11 /2
256  PAVGB((%0, %1, 2), %%mm2) // 211 /4
257  "movq %%mm2, %%mm3 \n\t" // 211 /4
258  "movq (%0), %%mm4 \n\t" // 1
259  PAVGB(%%mm4, %%mm3) // 4 211 /8
260  PAVGB(%%mm0, %%mm3) //642211 /16
261  "movq %%mm3, (%0) \n\t" // X
262  // mm1=2 mm2=3(211) mm4=1 mm5=5 mm6=0 mm7=9
263  "movq %%mm1, %%mm0 \n\t" // 1
264  PAVGB(%%mm6, %%mm0) //1 1 /2
265  "movq %%mm4, %%mm3 \n\t" // 1
266  PAVGB((%0,%1,2), %%mm3) // 1 1 /2
267  PAVGB((%%FF_REGa,%1,2), %%mm5) // 11 /2
268  PAVGB((%%FF_REGa), %%mm5) // 211 /4
269  PAVGB(%%mm5, %%mm3) // 2 2211 /8
270  PAVGB(%%mm0, %%mm3) //4242211 /16
271  "movq %%mm3, (%0,%1) \n\t" // X
272  // mm1=2 mm2=3(211) mm4=1 mm5=4(211) mm6=0 mm7=9
273  PAVGB(%%mm4, %%mm6) //11 /2
274  "movq (%%"FF_REG_c"), %%mm0 \n\t" // 1
275  PAVGB((%%FF_REGa, %1, 2), %%mm0) // 11/2
276  "movq %%mm0, %%mm3 \n\t" // 11/2
277  PAVGB(%%mm1, %%mm0) // 2 11/4
278  PAVGB(%%mm6, %%mm0) //222 11/8
279  PAVGB(%%mm2, %%mm0) //22242211/16
280  "movq (%0, %1, 2), %%mm2 \n\t" // 1
281  "movq %%mm0, (%0, %1, 2) \n\t" // X
282  // mm1=2 mm2=3 mm3=6(11) mm4=1 mm5=4(211) mm6=0(11) mm7=9
283  "movq (%%"FF_REG_a", %1, 4), %%mm0 \n\t" // 1
284  PAVGB((%%FF_REGc), %%mm0) // 11 /2
285  PAVGB(%%mm0, %%mm6) //11 11 /4
286  PAVGB(%%mm1, %%mm4) // 11 /2
287  PAVGB(%%mm2, %%mm1) // 11 /2
288  PAVGB(%%mm1, %%mm6) //1122 11 /8
289  PAVGB(%%mm5, %%mm6) //112242211 /16
290  "movq (%%"FF_REG_a"), %%mm5 \n\t" // 1
291  "movq %%mm6, (%%"FF_REG_a") \n\t" // X
292  // mm0=7(11) mm1=2(11) mm2=3 mm3=6(11) mm4=1(11) mm5=4 mm7=9
293  "movq (%%"FF_REG_a", %1, 4), %%mm6 \n\t" // 1
294  PAVGB(%%mm7, %%mm6) // 11 /2
295  PAVGB(%%mm4, %%mm6) // 11 11 /4
296  PAVGB(%%mm3, %%mm6) // 11 2211 /8
297  PAVGB(%%mm5, %%mm2) // 11 /2
298  "movq (%0, %1, 4), %%mm4 \n\t" // 1
299  PAVGB(%%mm4, %%mm2) // 112 /4
300  PAVGB(%%mm2, %%mm6) // 112242211 /16
301  "movq %%mm6, (%0, %1, 4) \n\t" // X
302  // mm0=7(11) mm1=2(11) mm2=3(112) mm3=6(11) mm4=5 mm5=4 mm7=9
303  PAVGB(%%mm7, %%mm1) // 11 2 /4
304  PAVGB(%%mm4, %%mm5) // 11 /2
305  PAVGB(%%mm5, %%mm0) // 11 11 /4
306  "movq (%%"FF_REG_a", %1, 2), %%mm6 \n\t" // 1
307  PAVGB(%%mm6, %%mm1) // 11 4 2 /8
308  PAVGB(%%mm0, %%mm1) // 11224222 /16
309  "movq %%mm1, (%%"FF_REG_a", %1, 2) \n\t" // X
310  // mm2=3(112) mm3=6(11) mm4=5 mm5=4(11) mm6=6 mm7=9
311  PAVGB((%%FF_REGc), %%mm2) // 112 4 /8
312  "movq (%%"FF_REG_a", %1, 4), %%mm0 \n\t" // 1
313  PAVGB(%%mm0, %%mm6) // 1 1 /2
314  PAVGB(%%mm7, %%mm6) // 1 12 /4
315  PAVGB(%%mm2, %%mm6) // 1122424 /4
316  "movq %%mm6, (%%"FF_REG_c") \n\t" // X
317  // mm0=8 mm3=6(11) mm4=5 mm5=4(11) mm7=9
318  PAVGB(%%mm7, %%mm5) // 11 2 /4
319  PAVGB(%%mm7, %%mm5) // 11 6 /8
320 
321  PAVGB(%%mm3, %%mm0) // 112 /4
322  PAVGB(%%mm0, %%mm5) // 112246 /16
323  "movq %%mm5, (%%"FF_REG_a", %1, 4) \n\t" // X
324  "sub %1, %0 \n\t" // undo the "add %1, %0" above so src is unchanged on exit
325 
326  :
327  : "r" (src), "r" ((x86_reg)stride), "m" (c->pQPb)
328  : "%"FF_REG_a, "%"FF_REG_c
329  );
330 #else //TEMPLATE_PP_MMXEXT
331  const int l1= stride;
332  const int l2= stride + l1;
333  const int l3= stride + l2;
334  const int l4= stride + l3;
335  const int l5= stride + l4;
336  const int l6= stride + l5;
337  const int l7= stride + l6;
338  const int l8= stride + l7;
339  const int l9= stride + l8;
340  int x;
341  src+= stride*3;
342  for(x=0; x<BLOCK_SIZE; x++){
// first/last replicate the asm path's edge handling: clip the outer row
// to its neighbor unless the two differ by less than QP.
343  const int first= FFABS(src[0] - src[l1]) < c->QP ? src[0] : src[l1];
344  const int last= FFABS(src[l8] - src[l9]) < c->QP ? src[l9] : src[l8];
345 
// sums[i] is a sliding 5-row sum; pairs of them plus the center pixel
// reproduce the 9-tap (1,1,2,2,4,2,2,1,1)/16 kernel per output row.
346  int sums[10];
347  sums[0] = 4*first + src[l1] + src[l2] + src[l3] + 4;
348  sums[1] = sums[0] - first + src[l4];
349  sums[2] = sums[1] - first + src[l5];
350  sums[3] = sums[2] - first + src[l6];
351  sums[4] = sums[3] - first + src[l7];
352  sums[5] = sums[4] - src[l1] + src[l8];
353  sums[6] = sums[5] - src[l2] + last;
354  sums[7] = sums[6] - src[l3] + last;
355  sums[8] = sums[7] - src[l4] + last;
356  sums[9] = sums[8] - src[l5] + last;
357 
358  src[l1]= (sums[0] + sums[2] + 2*src[l1])>>4;
359  src[l2]= (sums[1] + sums[3] + 2*src[l2])>>4;
360  src[l3]= (sums[2] + sums[4] + 2*src[l3])>>4;
361  src[l4]= (sums[3] + sums[5] + 2*src[l4])>>4;
362  src[l5]= (sums[4] + sums[6] + 2*src[l5])>>4;
363  src[l6]= (sums[5] + sums[7] + 2*src[l6])>>4;
364  src[l7]= (sums[6] + sums[8] + 2*src[l7])>>4;
365  src[l8]= (sums[7] + sums[9] + 2*src[l8])>>4;
366 
367  src++;
368  }
369 #endif //TEMPLATE_PP_MMXEXT
370 }
371 #endif //TEMPLATE_PP_ALTIVEC
372 
373 /**
374  * Experimental Filter 1
375  * will not damage linear gradients
376  * Flat blocks should look like they were passed through the (1,1,2,2,4,2,2,1,1) 9-Tap filter
377  * can only smooth blocks at the expected locations (it cannot smooth them if they did move)
378  * MMX2 version does correct clipping C version does not
379  */
380 static inline void RENAME(vertX1Filter)(uint8_t *src, int stride, PPContext *co)
381 {
/* Experimental deblocking filter (see the comment above this function):
 * computes d = max(0, |l4-l5| - (|l2-l3| + |l5-l6|)/2), and when d is small
 * enough (below 2*QP in the asm path) distributes a correction of up to
 * 3d/8 across rows l2..l7, signed so the l4/l5 step is reduced. */
382 #if TEMPLATE_PP_MMXEXT
383  src+= stride*3;
384 
385  __asm__ volatile(
386  "pxor %%mm7, %%mm7 \n\t" // 0
387  "lea (%0, %1), %%"FF_REG_a" \n\t"
388  "lea (%%"FF_REG_a", %1, 4), %%"FF_REG_c"\n\t"
389 // 0 1 2 3 4 5 6 7 8 9
390 // %0 eax eax+%1 eax+2%1 %0+4%1 ecx ecx+%1 ecx+2%1 %0+8%1 ecx+4%1
391  "movq (%%"FF_REG_a", %1, 2), %%mm0 \n\t" // line 3
392  "movq (%0, %1, 4), %%mm1 \n\t" // line 4
393  "movq %%mm1, %%mm2 \n\t" // line 4
394  "psubusb %%mm0, %%mm1 \n\t"
395  "psubusb %%mm2, %%mm0 \n\t"
396  "por %%mm1, %%mm0 \n\t" // |l2 - l3|
397  "movq (%%"FF_REG_c"), %%mm3 \n\t" // line 5
398  "movq (%%"FF_REG_c", %1), %%mm4 \n\t" // line 6
399  "movq %%mm3, %%mm5 \n\t" // line 5
400  "psubusb %%mm4, %%mm3 \n\t"
401  "psubusb %%mm5, %%mm4 \n\t"
402  "por %%mm4, %%mm3 \n\t" // |l5 - l6|
403  PAVGB(%%mm3, %%mm0) // (|l2 - l3| + |l5 - l6|)/2
404  "movq %%mm2, %%mm1 \n\t" // line 4
405  "psubusb %%mm5, %%mm2 \n\t"
406  "movq %%mm2, %%mm4 \n\t"
407  "pcmpeqb %%mm7, %%mm2 \n\t" // (l4 - l5) <= 0 ? -1 : 0
408  "psubusb %%mm1, %%mm5 \n\t"
409  "por %%mm5, %%mm4 \n\t" // |l4 - l5|
410  "psubusb %%mm0, %%mm4 \n\t" //d = MAX(0, |l4-l5| - (|l2-l3| + |l5-l6|)/2)
411  "movq %%mm4, %%mm3 \n\t" // d
412  "movq %2, %%mm0 \n\t" // QP (bytewise)
413  "paddusb %%mm0, %%mm0 \n\t" // 2*QP
414  "psubusb %%mm0, %%mm4 \n\t"
415  "pcmpeqb %%mm7, %%mm4 \n\t" // d <= QP ? -1 : 0
416  "psubusb "MANGLE(b01)", %%mm3 \n\t"
417  "pand %%mm4, %%mm3 \n\t" // d <= QP ? d : 0
418 
419  PAVGB(%%mm7, %%mm3) // d/2
420  "movq %%mm3, %%mm1 \n\t" // d/2
421  PAVGB(%%mm7, %%mm3) // d/4
422  PAVGB(%%mm1, %%mm3) // 3*d/8
423 
// mm2 holds the sign mask of (l4-l5); the pxor/op/pxor sandwiches below
// apply the correction with the correct sign using only unsigned saturation.
424  "movq (%0, %1, 4), %%mm0 \n\t" // line 4
425  "pxor %%mm2, %%mm0 \n\t" //(l4 - l5) <= 0 ? -l4-1 : l4
426  "psubusb %%mm3, %%mm0 \n\t"
427  "pxor %%mm2, %%mm0 \n\t"
428  "movq %%mm0, (%0, %1, 4) \n\t" // line 4
429 
430  "movq (%%"FF_REG_c"), %%mm0 \n\t" // line 5
431  "pxor %%mm2, %%mm0 \n\t" //(l4 - l5) <= 0 ? -l5-1 : l5
432  "paddusb %%mm3, %%mm0 \n\t"
433  "pxor %%mm2, %%mm0 \n\t"
434  "movq %%mm0, (%%"FF_REG_c") \n\t" // line 5
435 
436  PAVGB(%%mm7, %%mm1) // d/4
437 
438  "movq (%%"FF_REG_a", %1, 2), %%mm0 \n\t" // line 3
439  "pxor %%mm2, %%mm0 \n\t" //(l4 - l5) <= 0 ? -l4-1 : l4
440  "psubusb %%mm1, %%mm0 \n\t"
441  "pxor %%mm2, %%mm0 \n\t"
442  "movq %%mm0, (%%"FF_REG_a", %1, 2) \n\t" // line 3
443 
444  "movq (%%"FF_REG_c", %1), %%mm0 \n\t" // line 6
445  "pxor %%mm2, %%mm0 \n\t" //(l4 - l5) <= 0 ? -l5-1 : l5
446  "paddusb %%mm1, %%mm0 \n\t"
447  "pxor %%mm2, %%mm0 \n\t"
448  "movq %%mm0, (%%"FF_REG_c", %1) \n\t" // line 6
449 
450  PAVGB(%%mm7, %%mm1) // d/8
451 
452  "movq (%%"FF_REG_a", %1), %%mm0 \n\t" // line 2
453  "pxor %%mm2, %%mm0 \n\t" //(l4 - l5) <= 0 ? -l2-1 : l2
454  "psubusb %%mm1, %%mm0 \n\t"
455  "pxor %%mm2, %%mm0 \n\t"
456  "movq %%mm0, (%%"FF_REG_a", %1) \n\t" // line 2
457 
458  "movq (%%"FF_REG_c", %1, 2), %%mm0 \n\t" // line 7
459  "pxor %%mm2, %%mm0 \n\t" //(l4 - l5) <= 0 ? -l7-1 : l7
460  "paddusb %%mm1, %%mm0 \n\t"
461  "pxor %%mm2, %%mm0 \n\t"
462  "movq %%mm0, (%%"FF_REG_c", %1, 2) \n\t" // line 7
463 
464  :
465  : "r" (src), "r" ((x86_reg)stride), "m" (co->pQPb)
467  : "%"FF_REG_a, "%"FF_REG_c
468  );
469 #else //TEMPLATE_PP_MMXEXT
470 
471  const int l1= stride;
472  const int l2= stride + l1;
473  const int l3= stride + l2;
474  const int l4= stride + l3;
475  const int l5= stride + l4;
476  const int l6= stride + l5;
477  const int l7= stride + l6;
478 // const int l8= stride + l7;
479 // const int l9= stride + l8;
480  int x;
481 
482  src+= stride*3;
483  for(x=0; x<BLOCK_SIZE; x++){
484  int a= src[l3] - src[l4];
485  int b= src[l4] - src[l5];
486  int c= src[l5] - src[l6];
487 
488  int d= FFABS(b) - ((FFABS(a) + FFABS(c))>>1);
489  d= FFMAX(d, 0);
490 
491  if(d < co->QP*2){
492  int v = d * FFSIGN(-b);
493 
// Same 1/8, 1/4, 3/8 distribution as the asm path, but without its
// saturating clip (noted in the comment above this function).
494  src[l2] +=v>>3;
495  src[l3] +=v>>2;
496  src[l4] +=(3*v)>>3;
497  src[l5] -=(3*v)>>3;
498  src[l6] -=v>>2;
499  src[l7] -=v>>3;
500  }
501  src++;
502  }
503 #endif //TEMPLATE_PP_MMXEXT
504 }
505 
506 #if !TEMPLATE_PP_ALTIVEC
507 static inline void RENAME(doVertDefFilter)(uint8_t src[], int stride, PPContext *c)
508 {
/* Default vertical deblocking filter: estimates the "energy" of the block
 * boundary between rows l4 and l5 (middleEnergy) plus the energies of the
 * neighboring pixel pairs (left/right), and corrects l4/l5 by a value
 * derived from their difference, clipped to q = (l4-l5)/2. The C reference
 * implementation is in the #else branch (and duplicated, commented out,
 * after the asm statement). */
509 #if TEMPLATE_PP_MMXEXT
510 /*
511  uint8_t tmp[16];
512  const int l1= stride;
513  const int l2= stride + l1;
514  const int l3= stride + l2;
515  const int l4= (int)tmp - (int)src - stride*3;
516  const int l5= (int)tmp - (int)src - stride*3 + 8;
517  const int l6= stride*3 + l3;
518  const int l7= stride + l6;
519  const int l8= stride + l7;
520 
521  memcpy(tmp, src+stride*7, 8);
522  memcpy(tmp+8, src+stride*8, 8);
523 */
524  src+= stride*4;
525  __asm__ volatile(
526 
527 #if 0 //slightly more accurate and slightly slower
528  "pxor %%mm7, %%mm7 \n\t" // 0
529  "lea (%0, %1), %%"FF_REG_a" \n\t"
530  "lea (%%"FF_REG_a", %1, 4), %%"FF_REG_c"\n\t"
531 // 0 1 2 3 4 5 6 7
532 // %0 %0+%1 %0+2%1 eax+2%1 %0+4%1 eax+4%1 ecx+%1 ecx+2%1
533 // %0 eax eax+%1 eax+2%1 %0+4%1 ecx ecx+%1 ecx+2%1
534 
535 
536  "movq (%0, %1, 2), %%mm0 \n\t" // l2
537  "movq (%0), %%mm1 \n\t" // l0
538  "movq %%mm0, %%mm2 \n\t" // l2
539  PAVGB(%%mm7, %%mm0) // ~l2/2
540  PAVGB(%%mm1, %%mm0) // ~(l2 + 2l0)/4
541  PAVGB(%%mm2, %%mm0) // ~(5l2 + 2l0)/8
542 
543  "movq (%%"FF_REG_a"), %%mm1 \n\t" // l1
544  "movq (%%"FF_REG_a", %1, 2), %%mm3 \n\t" // l3
545  "movq %%mm1, %%mm4 \n\t" // l1
546  PAVGB(%%mm7, %%mm1) // ~l1/2
547  PAVGB(%%mm3, %%mm1) // ~(l1 + 2l3)/4
548  PAVGB(%%mm4, %%mm1) // ~(5l1 + 2l3)/8
549 
550  "movq %%mm0, %%mm4 \n\t" // ~(5l2 + 2l0)/8
551  "psubusb %%mm1, %%mm0 \n\t"
552  "psubusb %%mm4, %%mm1 \n\t"
553  "por %%mm0, %%mm1 \n\t" // ~|2l0 - 5l1 + 5l2 - 2l3|/8
554 // mm1= |lenergy|, mm2= l2, mm3= l3, mm7=0
555 
556  "movq (%0, %1, 4), %%mm0 \n\t" // l4
557  "movq %%mm0, %%mm4 \n\t" // l4
558  PAVGB(%%mm7, %%mm0) // ~l4/2
559  PAVGB(%%mm2, %%mm0) // ~(l4 + 2l2)/4
560  PAVGB(%%mm4, %%mm0) // ~(5l4 + 2l2)/8
561 
562  "movq (%%"FF_REG_c"), %%mm2 \n\t" // l5
563  "movq %%mm3, %%mm5 \n\t" // l3
564  PAVGB(%%mm7, %%mm3) // ~l3/2
565  PAVGB(%%mm2, %%mm3) // ~(l3 + 2l5)/4
566  PAVGB(%%mm5, %%mm3) // ~(5l3 + 2l5)/8
567 
568  "movq %%mm0, %%mm6 \n\t" // ~(5l4 + 2l2)/8
569  "psubusb %%mm3, %%mm0 \n\t"
570  "psubusb %%mm6, %%mm3 \n\t"
571  "por %%mm0, %%mm3 \n\t" // ~|2l2 - 5l3 + 5l4 - 2l5|/8
572  "pcmpeqb %%mm7, %%mm0 \n\t" // SIGN(2l2 - 5l3 + 5l4 - 2l5)
573 // mm0= SIGN(menergy), mm1= |lenergy|, mm2= l5, mm3= |menergy|, mm4=l4, mm5= l3, mm7=0
574 
575  "movq (%%"FF_REG_c", %1), %%mm6 \n\t" // l6
576  "movq %%mm6, %%mm5 \n\t" // l6
577  PAVGB(%%mm7, %%mm6) // ~l6/2
578  PAVGB(%%mm4, %%mm6) // ~(l6 + 2l4)/4
579  PAVGB(%%mm5, %%mm6) // ~(5l6 + 2l4)/8
580 
581  "movq (%%"FF_REG_c", %1, 2), %%mm5 \n\t" // l7
582  "movq %%mm2, %%mm4 \n\t" // l5
583  PAVGB(%%mm7, %%mm2) // ~l5/2
584  PAVGB(%%mm5, %%mm2) // ~(l5 + 2l7)/4
585  PAVGB(%%mm4, %%mm2) // ~(5l5 + 2l7)/8
586 
587  "movq %%mm6, %%mm4 \n\t" // ~(5l6 + 2l4)/8
588  "psubusb %%mm2, %%mm6 \n\t"
589  "psubusb %%mm4, %%mm2 \n\t"
590  "por %%mm6, %%mm2 \n\t" // ~|2l4 - 5l5 + 5l6 - 2l7|/8
591 // mm0= SIGN(menergy), mm1= |lenergy|/8, mm2= |renergy|/8, mm3= |menergy|/8, mm7=0
592 
593 
594  PMINUB(%%mm2, %%mm1, %%mm4) // MIN(|lenergy|,|renergy|)/8
595  "movq %2, %%mm4 \n\t" // QP //FIXME QP+1 ?
596  "paddusb "MANGLE(b01)", %%mm4 \n\t"
597  "pcmpgtb %%mm3, %%mm4 \n\t" // |menergy|/8 < QP
598  "psubusb %%mm1, %%mm3 \n\t" // d=|menergy|/8-MIN(|lenergy|,|renergy|)/8
599  "pand %%mm4, %%mm3 \n\t"
600 
601  "movq %%mm3, %%mm1 \n\t"
602 // "psubusb "MANGLE(b01)", %%mm3 \n\t"
603  PAVGB(%%mm7, %%mm3)
604  PAVGB(%%mm7, %%mm3)
605  "paddusb %%mm1, %%mm3 \n\t"
606 // "paddusb "MANGLE(b01)", %%mm3 \n\t"
607 
608  "movq (%%"FF_REG_a", %1, 2), %%mm6 \n\t" //l3
609  "movq (%0, %1, 4), %%mm5 \n\t" //l4
610  "movq (%0, %1, 4), %%mm4 \n\t" //l4
611  "psubusb %%mm6, %%mm5 \n\t"
612  "psubusb %%mm4, %%mm6 \n\t"
613  "por %%mm6, %%mm5 \n\t" // |l3-l4|
614  "pcmpeqb %%mm7, %%mm6 \n\t" // SIGN(l3-l4)
615  "pxor %%mm6, %%mm0 \n\t"
616  "pand %%mm0, %%mm3 \n\t"
617  PMINUB(%%mm5, %%mm3, %%mm0)
618 
619  "psubusb "MANGLE(b01)", %%mm3 \n\t"
620  PAVGB(%%mm7, %%mm3)
621 
622  "movq (%%"FF_REG_a", %1, 2), %%mm0 \n\t"
623  "movq (%0, %1, 4), %%mm2 \n\t"
624  "pxor %%mm6, %%mm0 \n\t"
625  "pxor %%mm6, %%mm2 \n\t"
626  "psubb %%mm3, %%mm0 \n\t"
627  "paddb %%mm3, %%mm2 \n\t"
628  "pxor %%mm6, %%mm0 \n\t"
629  "pxor %%mm6, %%mm2 \n\t"
630  "movq %%mm0, (%%"FF_REG_a", %1, 2) \n\t"
631  "movq %%mm2, (%0, %1, 4) \n\t"
632 #endif //0
633 
// Active variant: works on values biased by 128 so signed differences fit
// in unsigned byte averages (pavgb); the running comments track the bias.
634  "lea (%0, %1), %%"FF_REG_a" \n\t"
635  "pcmpeqb %%mm6, %%mm6 \n\t" // -1
636 // 0 1 2 3 4 5 6 7
637 // %0 %0+%1 %0+2%1 eax+2%1 %0+4%1 eax+4%1 ecx+%1 ecx+2%1
638 // %0 eax eax+%1 eax+2%1 %0+4%1 ecx ecx+%1 ecx+2%1
639 
640 
641  "movq (%%"FF_REG_a", %1, 2), %%mm1 \n\t" // l3
642  "movq (%0, %1, 4), %%mm0 \n\t" // l4
643  "pxor %%mm6, %%mm1 \n\t" // -l3-1
644  PAVGB(%%mm1, %%mm0) // -q+128 = (l4-l3+256)/2
645 // mm1=-l3-1, mm0=128-q
646 
647  "movq (%%"FF_REG_a", %1, 4), %%mm2 \n\t" // l5
648  "movq (%%"FF_REG_a", %1), %%mm3 \n\t" // l2
649  "pxor %%mm6, %%mm2 \n\t" // -l5-1
650  "movq %%mm2, %%mm5 \n\t" // -l5-1
651  "movq "MANGLE(b80)", %%mm4 \n\t" // 128
652  "lea (%%"FF_REG_a", %1, 4), %%"FF_REG_c"\n\t"
653  PAVGB(%%mm3, %%mm2) // (l2-l5+256)/2
654  PAVGB(%%mm0, %%mm4) // ~(l4-l3)/4 + 128
655  PAVGB(%%mm2, %%mm4) // ~(l2-l5)/4 +(l4-l3)/8 + 128
656  PAVGB(%%mm0, %%mm4) // ~(l2-l5)/8 +5(l4-l3)/16 + 128
657 // mm1=-l3-1, mm0=128-q, mm3=l2, mm4=menergy/16 + 128, mm5= -l5-1
658 
659  "movq (%%"FF_REG_a"), %%mm2 \n\t" // l1
660  "pxor %%mm6, %%mm2 \n\t" // -l1-1
661  PAVGB(%%mm3, %%mm2) // (l2-l1+256)/2
662  PAVGB((%0), %%mm1) // (l0-l3+256)/2
663  "movq "MANGLE(b80)", %%mm3 \n\t" // 128
664  PAVGB(%%mm2, %%mm3) // ~(l2-l1)/4 + 128
665  PAVGB(%%mm1, %%mm3) // ~(l0-l3)/4 +(l2-l1)/8 + 128
666  PAVGB(%%mm2, %%mm3) // ~(l0-l3)/8 +5(l2-l1)/16 + 128
667 // mm0=128-q, mm3=lenergy/16 + 128, mm4= menergy/16 + 128, mm5= -l5-1
668 
669  PAVGB((%%FF_REGc, %1), %%mm5) // (l6-l5+256)/2
670  "movq (%%"FF_REG_c", %1, 2), %%mm1 \n\t" // l7
671  "pxor %%mm6, %%mm1 \n\t" // -l7-1
672  PAVGB((%0, %1, 4), %%mm1) // (l4-l7+256)/2
673  "movq "MANGLE(b80)", %%mm2 \n\t" // 128
674  PAVGB(%%mm5, %%mm2) // ~(l6-l5)/4 + 128
675  PAVGB(%%mm1, %%mm2) // ~(l4-l7)/4 +(l6-l5)/8 + 128
676  PAVGB(%%mm5, %%mm2) // ~(l4-l7)/8 +5(l6-l5)/16 + 128
677 // mm0=128-q, mm2=renergy/16 + 128, mm3=lenergy/16 + 128, mm4= menergy/16 + 128
678 
679  "movq "MANGLE(b00)", %%mm1 \n\t" // 0
680  "movq "MANGLE(b00)", %%mm5 \n\t" // 0
681  "psubb %%mm2, %%mm1 \n\t" // 128 - renergy/16
682  "psubb %%mm3, %%mm5 \n\t" // 128 - lenergy/16
683  PMAXUB(%%mm1, %%mm2) // 128 + |renergy/16|
684  PMAXUB(%%mm5, %%mm3) // 128 + |lenergy/16|
685  PMINUB(%%mm2, %%mm3, %%mm1) // 128 + MIN(|lenergy|,|renergy|)/16
686 
687 // mm0=128-q, mm3=128 + MIN(|lenergy|,|renergy|)/16, mm4= menergy/16 + 128
688 
689  "movq "MANGLE(b00)", %%mm7 \n\t" // 0
690  "movq %2, %%mm2 \n\t" // QP
691  PAVGB(%%mm6, %%mm2) // 128 + QP/2
692  "psubb %%mm6, %%mm2 \n\t"
693 
694  "movq %%mm4, %%mm1 \n\t"
695  "pcmpgtb %%mm7, %%mm1 \n\t" // SIGN(menergy)
696  "pxor %%mm1, %%mm4 \n\t"
697  "psubb %%mm1, %%mm4 \n\t" // 128 + |menergy|/16
698  "pcmpgtb %%mm4, %%mm2 \n\t" // |menergy|/16 < QP/2
699  "psubusb %%mm3, %%mm4 \n\t" //d=|menergy|/16 - MIN(|lenergy|,|renergy|)/16
700 // mm0=128-q, mm1= SIGN(menergy), mm2= |menergy|/16 < QP/2, mm4= d/16
701 
702  "movq %%mm4, %%mm3 \n\t" // d
703  "psubusb "MANGLE(b01)", %%mm4 \n\t"
704  PAVGB(%%mm7, %%mm4) // d/32
705  PAVGB(%%mm7, %%mm4) // (d + 32)/64
706  "paddb %%mm3, %%mm4 \n\t" // 5d/64
707  "pand %%mm2, %%mm4 \n\t"
708 
709  "movq "MANGLE(b80)", %%mm5 \n\t" // 128
710  "psubb %%mm0, %%mm5 \n\t" // q
711  "paddsb %%mm6, %%mm5 \n\t" // fix bad rounding
712  "pcmpgtb %%mm5, %%mm7 \n\t" // SIGN(q)
713  "pxor %%mm7, %%mm5 \n\t"
714 
715  PMINUB(%%mm5, %%mm4, %%mm3) // MIN(|q|, 5d/64)
716  "pxor %%mm1, %%mm7 \n\t" // SIGN(d*q)
717 
// Apply the clipped, correctly-signed correction to rows l4 and l5.
718  "pand %%mm7, %%mm4 \n\t"
719  "movq (%%"FF_REG_a", %1, 2), %%mm0 \n\t"
720  "movq (%0, %1, 4), %%mm2 \n\t"
721  "pxor %%mm1, %%mm0 \n\t"
722  "pxor %%mm1, %%mm2 \n\t"
723  "paddb %%mm4, %%mm0 \n\t"
724  "psubb %%mm4, %%mm2 \n\t"
725  "pxor %%mm1, %%mm0 \n\t"
726  "pxor %%mm1, %%mm2 \n\t"
727  "movq %%mm0, (%%"FF_REG_a", %1, 2) \n\t"
728  "movq %%mm2, (%0, %1, 4) \n\t"
729 
730  :
731  : "r" (src), "r" ((x86_reg)stride), "m" (c->pQPb)
732  NAMED_CONSTRAINTS_ADD(b80,b00,b01)
733  : "%"FF_REG_a, "%"FF_REG_c
734  );
735 
// NOTE: the block below is disabled debug/reference code kept for
// comparison against the asm path; it is not compiled.
736 /*
737  {
738  int x;
739  src-= stride;
740  for(x=0; x<BLOCK_SIZE; x++){
741  const int middleEnergy= 5*(src[l5] - src[l4]) + 2*(src[l3] - src[l6]);
742  if(FFABS(middleEnergy)< 8*QP){
743  const int q=(src[l4] - src[l5])/2;
744  const int leftEnergy= 5*(src[l3] - src[l2]) + 2*(src[l1] - src[l4]);
745  const int rightEnergy= 5*(src[l7] - src[l6]) + 2*(src[l5] - src[l8]);
746 
747  int d= FFABS(middleEnergy) - FFMIN( FFABS(leftEnergy), FFABS(rightEnergy) );
748  d= FFMAX(d, 0);
749 
750  d= (5*d + 32) >> 6;
751  d*= FFSIGN(-middleEnergy);
752 
753  if(q>0){
754  d= d<0 ? 0 : d;
755  d= d>q ? q : d;
756  }else{
757  d= d>0 ? 0 : d;
758  d= d<q ? q : d;
759  }
760 
761  src[l4]-= d;
762  src[l5]+= d;
763  }
764  src++;
765  }
766  src-=8;
767  for(x=0; x<8; x++){
768  int y;
769  for(y=4; y<6; y++){
770  int d= src[x+y*stride] - tmp[x+(y-4)*8];
771  int ad= FFABS(d);
772  static int max=0;
773  static int sum=0;
774  static int num=0;
775  static int bias=0;
776 
777  if(max<ad) max=ad;
778  sum+= ad>3 ? 1 : 0;
779  if(ad>3){
780  src[0] = src[7] = src[stride*7] = src[(stride+1)*7]=255;
781  }
782  if(y==4) bias+=d;
783  num++;
784  if(num%1000000 == 0){
785  av_log(c, AV_LOG_INFO, " %d %d %d %d\n", num, sum, max, bias);
786  }
787  }
788  }
789 }
790 */
791 #else //TEMPLATE_PP_MMXEXT
792  const int l1= stride;
793  const int l2= stride + l1;
794  const int l3= stride + l2;
795  const int l4= stride + l3;
796  const int l5= stride + l4;
797  const int l6= stride + l5;
798  const int l7= stride + l6;
799  const int l8= stride + l7;
800 // const int l9= stride + l8;
801  int x;
802  src+= stride*3;
803  for(x=0; x<BLOCK_SIZE; x++){
804  const int middleEnergy= 5*(src[l5] - src[l4]) + 2*(src[l3] - src[l6]);
805  if(FFABS(middleEnergy) < 8*c->QP){
806  const int q=(src[l4] - src[l5])/2;
807  const int leftEnergy= 5*(src[l3] - src[l2]) + 2*(src[l1] - src[l4]);
808  const int rightEnergy= 5*(src[l7] - src[l6]) + 2*(src[l5] - src[l8]);
809 
810  int d= FFABS(middleEnergy) - FFMIN( FFABS(leftEnergy), FFABS(rightEnergy) );
811  d= FFMAX(d, 0);
812 
813  d= (5*d + 32) >> 6;
814  d*= FFSIGN(-middleEnergy);
815 
// Clip the correction to [0,q] or [q,0] so the l4/l5 step is never inverted.
816  if(q>0){
817  d = FFMAX(d, 0);
818  d = FFMIN(d, q);
819  }else{
820  d = FFMIN(d, 0);
821  d = FFMAX(d, q);
822  }
823 
824  src[l4]-= d;
825  src[l5]+= d;
826  }
827  src++;
828  }
829 #endif //TEMPLATE_PP_MMXEXT
830 }
831 #endif //TEMPLATE_PP_ALTIVEC
832 
833 #if !TEMPLATE_PP_ALTIVEC
834 static inline void RENAME(dering)(uint8_t src[], int stride, PPContext *c, int leftborder, int rightborder, int topborder)
835 {
836 #if TEMPLATE_PP_MMXEXT && HAVE_7REGS
837  if (topborder)
838  return;
839  DECLARE_ALIGNED(8, uint64_t, tmp)[3];
840  __asm__ volatile(
841  "pxor %%mm6, %%mm6 \n\t"
842  "pcmpeqb %%mm7, %%mm7 \n\t"
843  "movq %2, %%mm0 \n\t"
844  "punpcklbw %%mm6, %%mm0 \n\t"
845  "psrlw $1, %%mm0 \n\t"
846  "psubw %%mm7, %%mm0 \n\t"
847  "packuswb %%mm0, %%mm0 \n\t"
848  "movq %%mm0, %3 \n\t"
849 
850  "lea (%0, %1), %%"FF_REG_a" \n\t"
851  "lea (%%"FF_REG_a", %1, 4), %%"FF_REG_d"\n\t"
852 
853 // 0 1 2 3 4 5 6 7 8 9
854 // %0 eax eax+%1 eax+2%1 %0+4%1 edx edx+%1 edx+2%1 %0+8%1 edx+4%1
855 
856 #undef REAL_FIND_MIN_MAX
857 #undef FIND_MIN_MAX
858 #define REAL_FIND_MIN_MAX(addr)\
859  "movq " #addr ", %%mm0 \n\t"\
860  "pminub %%mm0, %%mm7 \n\t"\
861  "pmaxub %%mm0, %%mm6 \n\t"
862 #define FIND_MIN_MAX(addr) REAL_FIND_MIN_MAX(addr)
863 
864 FIND_MIN_MAX((%%FF_REGa))
865 FIND_MIN_MAX((%%FF_REGa, %1))
866 FIND_MIN_MAX((%%FF_REGa, %1, 2))
867 FIND_MIN_MAX((%0, %1, 4))
868 FIND_MIN_MAX((%%FF_REGd))
869 FIND_MIN_MAX((%%FF_REGd, %1))
870 FIND_MIN_MAX((%%FF_REGd, %1, 2))
871 FIND_MIN_MAX((%0, %1, 8))
872 
873  "movq %%mm7, %%mm4 \n\t"
874  "psrlq $8, %%mm7 \n\t"
875  "pminub %%mm4, %%mm7 \n\t" // min of pixels
876  "pshufw $0xF9, %%mm7, %%mm4 \n\t"
877  "pminub %%mm4, %%mm7 \n\t" // min of pixels
878  "pshufw $0xFE, %%mm7, %%mm4 \n\t"
879  "pminub %%mm4, %%mm7 \n\t"
880 
881 
882  "movq %%mm6, %%mm4 \n\t"
883  "psrlq $8, %%mm6 \n\t"
884  "pmaxub %%mm4, %%mm6 \n\t" // max of pixels
885  "pshufw $0xF9, %%mm6, %%mm4 \n\t"
886  "pmaxub %%mm4, %%mm6 \n\t"
887  "pshufw $0xFE, %%mm6, %%mm4 \n\t"
888  "pmaxub %%mm4, %%mm6 \n\t"
889  "movq %%mm6, %%mm0 \n\t" // max
890  "psubb %%mm7, %%mm6 \n\t" // max - min
891  "push %%"FF_REG_a" \n\t"
892  "movd %%mm6, %%eax \n\t"
893  "cmpb $"AV_STRINGIFY(DERING_THRESHOLD)", %%al \n\t"
894  "pop %%"FF_REG_a" \n\t"
895  " jb 1f \n\t"
896  PAVGB(%%mm0, %%mm7) // a=(max + min)/2
897  "punpcklbw %%mm7, %%mm7 \n\t"
898  "punpcklbw %%mm7, %%mm7 \n\t"
899  "punpcklbw %%mm7, %%mm7 \n\t"
900  "movq %%mm7, (%4) \n\t"
901 
902  "movq (%0), %%mm0 \n\t" // L10
903  "movq %%mm0, %%mm1 \n\t" // L10
904  "movq %%mm0, %%mm2 \n\t" // L10
905  "psllq $8, %%mm1 \n\t"
906  "psrlq $8, %%mm2 \n\t"
907  "movd -4(%0), %%mm3 \n\t"
908  "movd 8(%0), %%mm4 \n\t"
909  "psrlq $24, %%mm3 \n\t"
910  "psllq $56, %%mm4 \n\t"
911  "por %%mm3, %%mm1 \n\t" // L00
912  "por %%mm4, %%mm2 \n\t" // L20
913  "movq %%mm1, %%mm3 \n\t" // L00
914  PAVGB(%%mm2, %%mm1) // (L20 + L00)/2
915  PAVGB(%%mm0, %%mm1) // (L20 + L00 + 2L10)/4
916  "psubusb %%mm7, %%mm0 \n\t"
917  "psubusb %%mm7, %%mm2 \n\t"
918  "psubusb %%mm7, %%mm3 \n\t"
919  "pcmpeqb "MANGLE(b00)", %%mm0 \n\t" // L10 > a ? 0 : -1
920  "pcmpeqb "MANGLE(b00)", %%mm2 \n\t" // L20 > a ? 0 : -1
921  "pcmpeqb "MANGLE(b00)", %%mm3 \n\t" // L00 > a ? 0 : -1
922  "paddb %%mm2, %%mm0 \n\t"
923  "paddb %%mm3, %%mm0 \n\t"
924 
925  "movq (%%"FF_REG_a"), %%mm2 \n\t" // L11
926  "movq %%mm2, %%mm3 \n\t" // L11
927  "movq %%mm2, %%mm4 \n\t" // L11
928  "psllq $8, %%mm3 \n\t"
929  "psrlq $8, %%mm4 \n\t"
930  "movd -4(%%"FF_REG_a"), %%mm5 \n\t"
931  "movd 8(%%"FF_REG_a"), %%mm6 \n\t"
932  "psrlq $24, %%mm5 \n\t"
933  "psllq $56, %%mm6 \n\t"
934  "por %%mm5, %%mm3 \n\t" // L01
935  "por %%mm6, %%mm4 \n\t" // L21
936  "movq %%mm3, %%mm5 \n\t" // L01
937  PAVGB(%%mm4, %%mm3) // (L21 + L01)/2
938  PAVGB(%%mm2, %%mm3) // (L21 + L01 + 2L11)/4
939  "psubusb %%mm7, %%mm2 \n\t"
940  "psubusb %%mm7, %%mm4 \n\t"
941  "psubusb %%mm7, %%mm5 \n\t"
942  "pcmpeqb "MANGLE(b00)", %%mm2 \n\t" // L11 > a ? 0 : -1
943  "pcmpeqb "MANGLE(b00)", %%mm4 \n\t" // L21 > a ? 0 : -1
944  "pcmpeqb "MANGLE(b00)", %%mm5 \n\t" // L01 > a ? 0 : -1
945  "paddb %%mm4, %%mm2 \n\t"
946  "paddb %%mm5, %%mm2 \n\t"
947 // 0, 2, 3, 1
948 #define REAL_DERING_CORE(dst,src,ppsx,psx,sx,pplx,plx,lx,t0,t1) \
949  "movq " #src ", " #sx " \n\t" /* src[0] */\
950  "movq " #sx ", " #lx " \n\t" /* src[0] */\
951  "movq " #sx ", " #t0 " \n\t" /* src[0] */\
952  "psllq $8, " #lx " \n\t"\
953  "psrlq $8, " #t0 " \n\t"\
954  "movd -4" #src ", " #t1 " \n\t"\
955  "psrlq $24, " #t1 " \n\t"\
956  "por " #t1 ", " #lx " \n\t" /* src[-1] */\
957  "movd 8" #src ", " #t1 " \n\t"\
958  "psllq $56, " #t1 " \n\t"\
959  "por " #t1 ", " #t0 " \n\t" /* src[+1] */\
960  "movq " #lx ", " #t1 " \n\t" /* src[-1] */\
961  PAVGB(t0, lx) /* (src[-1] + src[+1])/2 */\
962  PAVGB(sx, lx) /* (src[-1] + 2src[0] + src[+1])/4 */\
963  PAVGB(lx, pplx) \
964  "movq " #lx ", 8(%4) \n\t"\
965  "movq (%4), " #lx " \n\t"\
966  "psubusb " #lx ", " #t1 " \n\t"\
967  "psubusb " #lx ", " #t0 " \n\t"\
968  "psubusb " #lx ", " #sx " \n\t"\
969  "movq "MANGLE(b00)", " #lx " \n\t"\
970  "pcmpeqb " #lx ", " #t1 " \n\t" /* src[-1] > a ? 0 : -1*/\
971  "pcmpeqb " #lx ", " #t0 " \n\t" /* src[+1] > a ? 0 : -1*/\
972  "pcmpeqb " #lx ", " #sx " \n\t" /* src[0] > a ? 0 : -1*/\
973  "paddb " #t1 ", " #t0 " \n\t"\
974  "paddb " #t0 ", " #sx " \n\t"\
975 \
976  PAVGB(plx, pplx) /* filtered */\
977  "movq " #dst ", " #t0 " \n\t" /* dst */\
978  "movq " #t0 ", " #t1 " \n\t" /* dst */\
979  "psubusb %3, " #t0 " \n\t"\
980  "paddusb %3, " #t1 " \n\t"\
981  PMAXUB(t0, pplx)\
982  PMINUB(t1, pplx, t0)\
983  "paddb " #sx ", " #ppsx " \n\t"\
984  "paddb " #psx ", " #ppsx " \n\t"\
985  "pand "MANGLE(b08)", " #ppsx " \n\t"\
986  "pcmpeqb " #lx ", " #ppsx " \n\t"\
987  "pand " #ppsx ", " #pplx " \n\t"\
988  "pandn " #dst ", " #ppsx " \n\t"\
989  "por " #pplx ", " #ppsx " \n\t"\
990  "movq " #ppsx ", " #dst " \n\t"\
991  "movq 8(%4), " #lx " \n\t"
992 
993 #define DERING_CORE(dst,src,ppsx,psx,sx,pplx,plx,lx,t0,t1) \
994  REAL_DERING_CORE(dst,src,ppsx,psx,sx,pplx,plx,lx,t0,t1)
995 /*
996 0000000
997 1111111
998 
999 1111110
1000 1111101
1001 1111100
1002 1111011
1003 1111010
1004 1111001
1005 
1006 1111000
1007 1110111
1008 
1009 */
1010 //DERING_CORE(dst ,src ,ppsx ,psx ,sx ,pplx ,plx ,lx ,t0 ,t1)
1011 DERING_CORE((%%FF_REGa) ,(%%FF_REGa, %1) ,%%mm0,%%mm2,%%mm4,%%mm1,%%mm3,%%mm5,%%mm6,%%mm7)
1012 DERING_CORE((%%FF_REGa, %1) ,(%%FF_REGa, %1, 2),%%mm2,%%mm4,%%mm0,%%mm3,%%mm5,%%mm1,%%mm6,%%mm7)
1013 DERING_CORE((%%FF_REGa, %1, 2),(%0, %1, 4) ,%%mm4,%%mm0,%%mm2,%%mm5,%%mm1,%%mm3,%%mm6,%%mm7)
1014 DERING_CORE((%0, %1, 4) ,(%%FF_REGd) ,%%mm0,%%mm2,%%mm4,%%mm1,%%mm3,%%mm5,%%mm6,%%mm7)
1015 DERING_CORE((%%FF_REGd) ,(%%FF_REGd, %1) ,%%mm2,%%mm4,%%mm0,%%mm3,%%mm5,%%mm1,%%mm6,%%mm7)
1016 DERING_CORE((%%FF_REGd, %1) ,(%%FF_REGd, %1, 2),%%mm4,%%mm0,%%mm2,%%mm5,%%mm1,%%mm3,%%mm6,%%mm7)
1017 DERING_CORE((%%FF_REGd, %1, 2),(%0, %1, 8) ,%%mm0,%%mm2,%%mm4,%%mm1,%%mm3,%%mm5,%%mm6,%%mm7)
1018 DERING_CORE((%0, %1, 8) ,(%%FF_REGd, %1, 4),%%mm2,%%mm4,%%mm0,%%mm3,%%mm5,%%mm1,%%mm6,%%mm7)
1019 
1020  "1: \n\t"
1021  : : "r" (src), "r" ((x86_reg)stride), "m" (c->pQPb), "m"(c->pQPb2), "q"(tmp)
1022  NAMED_CONSTRAINTS_ADD(b00,b08)
1023  : "%"FF_REG_a, "%"FF_REG_d
1024  );
1025 #else // HAVE_7REGS && TEMPLATE_PP_MMXEXT
1026  int y;
1027  int min=255;
1028  int max=0;
1029  int avg;
1030  uint8_t *p;
1031  int s[10];
1032  const int QP2= c->QP/2 + 1;
1033 
1034  src --;
1035  for(y=1; y<9; y++){
1036  int x;
1037  p= src + stride*y;
1038  for(x=1; x<9; x++){
1039  p++;
1040  if(*p > max) max= *p;
1041  if(*p < min) min= *p;
1042  }
1043  }
1044  avg= (min + max + 1)>>1;
1045 
1046  if (max - min < DERING_THRESHOLD) return;
1047 
1048  s[0] = 0;
1049  for(y=topborder; y<10; y++){
1050  int t = 0;
1051 
1052  if(!leftborder && src[stride*y + 0] > avg) t+= 1;
1053  if(src[stride*y + 1] > avg) t+= 2;
1054  if(src[stride*y + 2] > avg) t+= 4;
1055  if(src[stride*y + 3] > avg) t+= 8;
1056  if(src[stride*y + 4] > avg) t+= 16;
1057  if(src[stride*y + 5] > avg) t+= 32;
1058  if(src[stride*y + 6] > avg) t+= 64;
1059  if(src[stride*y + 7] > avg) t+= 128;
1060  if(src[stride*y + 8] > avg) t+= 256;
1061  if(!rightborder && src[stride*y + 9] > avg) t+= 512;
1062 
1063  t |= (~t)<<16;
1064  t &= (t<<1) & (t>>1);
1065  s[y] = t;
1066  }
1067 
1068  for(y=1; y<9; y++){
1069  int t = s[y-1] & s[y] & s[y+1];
1070  t|= t>>16;
1071  s[y-1]= t;
1072  }
1073 
1074  for(y=1; y<9; y++){
1075  int x;
1076  int t = s[y-1];
1077 
1078  p= src + stride*y + leftborder;
1079  for(x=1+leftborder; x<9-rightborder; x++){
1080  p++;
1081  if(t & (1<<x)){
1082  int f= (*(p-stride-1)) + 2*(*(p-stride)) + (*(p-stride+1))
1083  +2*(*(p -1)) + 4*(*p ) + 2*(*(p +1))
1084  +(*(p+stride-1)) + 2*(*(p+stride)) + (*(p+stride+1));
1085  f= (f + 8)>>4;
1086 
1087 #ifdef DEBUG_DERING_THRESHOLD
1088  __asm__ volatile("emms\n\t":);
1089  {
1090  static uint64_t numPixels=0;
1091  if(x!=1 && x!=8 && y!=1 && y!=8) numPixels++;
1092 // if((max-min)<20 || (max-min)*QP<200)
1093 // if((max-min)*QP < 500)
1094 // if(max-min<QP/2)
1095  if(max-min < 20){
1096  static int numSkipped=0;
1097  static int errorSum=0;
1098  static int worstQP=0;
1099  static int worstRange=0;
1100  static int worstDiff=0;
1101  int diff= (f - *p);
1102  int absDiff= FFABS(diff);
1103  int error= diff*diff;
1104 
1105  if(x==1 || x==8 || y==1 || y==8) continue;
1106 
1107  numSkipped++;
1108  if(absDiff > worstDiff){
1109  worstDiff= absDiff;
1110  worstQP= QP;
1111  worstRange= max-min;
1112  }
1113  errorSum+= error;
1114 
1115  if(1024LL*1024LL*1024LL % numSkipped == 0){
1116  av_log(c, AV_LOG_INFO, "sum:%1.3f, skip:%d, wQP:%d, "
1117  "wRange:%d, wDiff:%d, relSkip:%1.3f\n",
1118  (float)errorSum/numSkipped, numSkipped, worstQP, worstRange,
1119  worstDiff, (float)numSkipped/numPixels);
1120  }
1121  }
1122  }
1123 #endif
1124  if (*p + QP2 < f) *p= *p + QP2;
1125  else if(*p - QP2 > f) *p= *p - QP2;
1126  else *p=f;
1127  }
1128  }
1129  }
1130 #ifdef DEBUG_DERING_THRESHOLD
1131  if(max-min < 20){
1132  for(y=1; y<9; y++){
1133  int x;
1134  int t = 0;
1135  p= src + stride*y;
1136  for(x=1; x<9; x++){
1137  p++;
1138  *p = FFMIN(*p + 20, 255);
1139  }
1140  }
1141 // src[0] = src[7]=src[stride*7]=src[stride*7 + 7]=255;
1142  }
1143 #endif
1144 #endif //TEMPLATE_PP_MMXEXT
1145 }
1146 #endif //TEMPLATE_PP_ALTIVEC
1147 
/**
 * Deinterlace the given block by linearly interpolating every second line.
 * Will be called for every 8x8 block and can read & write from line 4-15.
 * Lines 0-3 have been passed through the deblock / dering filters already, but can be read, too.
 * Lines 4-12 will be read into the deblocking filter and should be deinterlaced.
 *
 * Each odd line is replaced by the byte-wise average of its two even
 * neighbours: line1 = (line0+line2)/2, line3 = (line2+line4)/2, ...
 */
static inline void RENAME(deInterlaceInterpolateLinear)(uint8_t src[], int stride)
{
#if TEMPLATE_PP_MMXEXT
    src+= 4*stride;                                  // skip the already-filtered lines 0-3
    __asm__ volatile(
        "lea (%0, %1), %%"FF_REG_a"                \n\t" // eax = src + 1*stride
        "lea (%%"FF_REG_a", %1, 4), %%"FF_REG_c"   \n\t" // ecx = src + 5*stride
//      0       1       2       3       4       5       6       7       8       9
//      %0      eax     eax+%1  eax+2%1 %0+4%1  ecx     ecx+%1  ecx+2%1 %0+8%1  ecx+4%1

        // PAVGB gives the rounding-up byte average; the even lines (0,2,4,6,8)
        // are read, the odd lines (1,3,5,7) are overwritten with the averages
        "movq (%0), %%mm0                          \n\t" // line 0
        "movq (%%"FF_REG_a", %1), %%mm1            \n\t" // line 2
        PAVGB(%%mm1, %%mm0)                              // (line0 + line2)/2
        "movq %%mm0, (%%"FF_REG_a")                \n\t" // -> line 1
        "movq (%0, %1, 4), %%mm0                   \n\t" // line 4
        PAVGB(%%mm0, %%mm1)                              // (line2 + line4)/2
        "movq %%mm1, (%%"FF_REG_a", %1, 2)         \n\t" // -> line 3
        "movq (%%"FF_REG_c", %1), %%mm1            \n\t" // line 6
        PAVGB(%%mm1, %%mm0)                              // (line4 + line6)/2
        "movq %%mm0, (%%"FF_REG_c")                \n\t" // -> line 5
        "movq (%0, %1, 8), %%mm0                   \n\t" // line 8
        PAVGB(%%mm0, %%mm1)                              // (line6 + line8)/2
        "movq %%mm1, (%%"FF_REG_c", %1, 2)         \n\t" // -> line 7

        : : "r" (src), "r" ((x86_reg)stride)
        : "%"FF_REG_a, "%"FF_REG_c
    );
#else
    int a, b, x;
    src+= 4*stride;                                  // skip the already-filtered lines 0-3

    /* SWAR fallback: 4 pixels per 32-bit word, two passes over the 8 columns.
     * (a|b) - (((a^b)&0xFEFEFEFE)>>1) is the per-byte rounding-up average
     * (a+b+1)>>1; the 0xFE mask keeps carries from crossing byte lanes.
     * NOTE(review): the uint32_t casts assume rows are safely accessible as
     * 32-bit words — longstanding convention in this file. */
    for(x=0; x<2; x++){
        a= *(uint32_t*)&src[stride*0];
        b= *(uint32_t*)&src[stride*2];
        *(uint32_t*)&src[stride*1]= (a|b) - (((a^b)&0xFEFEFEFEUL)>>1);
        a= *(uint32_t*)&src[stride*4];
        *(uint32_t*)&src[stride*3]= (a|b) - (((a^b)&0xFEFEFEFEUL)>>1);
        b= *(uint32_t*)&src[stride*6];
        *(uint32_t*)&src[stride*5]= (a|b) - (((a^b)&0xFEFEFEFEUL)>>1);
        a= *(uint32_t*)&src[stride*8];
        *(uint32_t*)&src[stride*7]= (a|b) - (((a^b)&0xFEFEFEFEUL)>>1);
        src += 4;                                    // next group of 4 columns
    }
#endif
}
1199 
/**
 * Deinterlace the given block by cubic interpolating every second line.
 * Will be called for every 8x8 block and can read & write from line 4-15.
 * Lines 0-3 have been passed through the deblock / dering filters already, but can be read, too.
 * Lines 4-12 will be read into the deblocking filter and should be deinterlaced.
 * This filter will read lines 3-15 and write 7-13.
 *
 * The interpolated line is the cubic filter (-1 9 9 -1)/16 applied vertically.
 */
static inline void RENAME(deInterlaceInterpolateCubic)(uint8_t src[], int stride)
{
#if TEMPLATE_PP_SSE2
    src+= stride*3;                                  // one line of context above line 4
    __asm__ volatile(
        "lea (%0, %1), %%"FF_REG_a"                \n\t" // eax = src + 1*stride
        "lea (%%"FF_REG_a", %1, 4), %%"FF_REG_d"   \n\t" // edx = src + 5*stride
        "lea (%%"FF_REG_d", %1, 4), %%"FF_REG_c"   \n\t"
        "add %1, %%"FF_REG_c"                      \n\t" // ecx = src + 10*stride
        "pxor %%xmm7, %%xmm7                       \n\t" // zero reg for byte->word unpack
/* c = interpolation of b,d with a,e as outer taps; using rounding byte
 * averages m1 = avg(b,d), m0 = avg(a,e), the result is m1 - (m0-m1)/8,
 * i.e. approximately (9*(b+d) - (a+e))/16. */
#define REAL_DEINT_CUBIC(a,b,c,d,e)\
        "movq " #a ", %%xmm0                       \n\t"\
        "movq " #b ", %%xmm1                       \n\t"\
        "movq " #d ", %%xmm2                       \n\t"\
        "movq " #e ", %%xmm3                       \n\t"\
        "pavgb %%xmm2, %%xmm1                      \n\t"\
        "pavgb %%xmm3, %%xmm0                      \n\t"\
        "punpcklbw %%xmm7, %%xmm0                  \n\t"\
        "punpcklbw %%xmm7, %%xmm1                  \n\t"\
        "psubw %%xmm1, %%xmm0                      \n\t"\
        "psraw $3, %%xmm0                          \n\t"\
        "psubw %%xmm0, %%xmm1                      \n\t"\
        "packuswb %%xmm1, %%xmm1                   \n\t"\
        "movlps %%xmm1, " #c "                     \n\t"
/* indirection so the address-expression arguments are expanded before stringization */
#define DEINT_CUBIC(a,b,c,d,e)  REAL_DEINT_CUBIC(a,b,c,d,e)

DEINT_CUBIC((%0)           , (%%FF_REGa, %1), (%%FF_REGa, %1, 2), (%0, %1, 4)    , (%%FF_REGd, %1))
DEINT_CUBIC((%%FF_REGa, %1), (%0, %1, 4)    , (%%FF_REGd)       , (%%FF_REGd, %1), (%0, %1, 8))
DEINT_CUBIC((%0, %1, 4)    , (%%FF_REGd, %1), (%%FF_REGd, %1, 2), (%0, %1, 8)    , (%%FF_REGc))
DEINT_CUBIC((%%FF_REGd, %1), (%0, %1, 8)    , (%%FF_REGd, %1, 4), (%%FF_REGc)    , (%%FF_REGc, %1, 2))

        : : "r" (src), "r" ((x86_reg)stride)
        :
            XMM_CLOBBERS("%xmm0", "%xmm1", "%xmm2", "%xmm3", "%xmm7",)
            "%"FF_REG_a, "%"FF_REG_d, "%"FF_REG_c
    );
#undef REAL_DEINT_CUBIC
#else //TEMPLATE_PP_SSE2
    int x;
    src+= stride*3;                                  // one line of context above line 4
    /* exact (-1 9 9 -1)>>4 cubic, one column at a time */
    for(x=0; x<8; x++){
        src[stride*3] = av_clip_uint8((-src[0]         + 9*src[stride*2] + 9*src[stride*4]  - src[stride*6])>>4);
        src[stride*5] = av_clip_uint8((-src[stride*2]  + 9*src[stride*4] + 9*src[stride*6]  - src[stride*8])>>4);
        src[stride*7] = av_clip_uint8((-src[stride*4]  + 9*src[stride*6] + 9*src[stride*8]  - src[stride*10])>>4);
        src[stride*9] = av_clip_uint8((-src[stride*6]  + 9*src[stride*8] + 9*src[stride*10] - src[stride*12])>>4);
        src++;
    }
#endif //TEMPLATE_PP_SSE2
}
1256 
/**
 * Deinterlace the given block by filtering every second line with a (-1 4 2 4 -1) filter.
 * Will be called for every 8x8 block and can read & write from line 4-15.
 * Lines 0-3 have been passed through the deblock / dering filters already, but can be read, too.
 * Lines 4-12 will be read into the deblocking filter and should be deinterlaced.
 * This filter will read lines 4-13 and write 5-11.
 *
 * @param tmp 8-byte scratch row; on entry it holds the last even line of the
 *            block above (the tap above line 4), on exit it holds this block's
 *            line 12 for the block below.
 */
static inline void RENAME(deInterlaceFF)(uint8_t src[], int stride, uint8_t *tmp)
{
#if TEMPLATE_PP_MMXEXT
    src+= stride*4;                                  // skip the already-filtered lines 0-3
    __asm__ volatile(
        "lea (%0, %1), %%"FF_REG_a"                \n\t" // eax = src + 1*stride
        "lea (%%"FF_REG_a", %1, 4), %%"FF_REG_d"   \n\t" // edx = src + 5*stride
        "pxor %%mm7, %%mm7                         \n\t" // zero reg for byte->word unpack
        "movq (%2), %%mm0                          \n\t" // carried line from block above
//      0       1       2       3       4       5       6       7       8       9       10
//      %0      eax     eax+%1  eax+2%1 %0+4%1  edx     edx+%1  edx+2%1 %0+8%1  edx+4%1 ecx

/* b = (-mm0 + 4*a + 2*b + 4*c - d + 4)/8 ; on exit mm0 holds the original d
 * (the next iteration's "two lines above" tap) */
#define REAL_DEINT_FF(a,b,c,d)\
        "movq " #a ", %%mm1                        \n\t"\
        "movq " #b ", %%mm2                        \n\t"\
        "movq " #c ", %%mm3                        \n\t"\
        "movq " #d ", %%mm4                        \n\t"\
        PAVGB(%%mm3, %%mm1)                             \
        PAVGB(%%mm4, %%mm0)                             \
        "movq %%mm0, %%mm3                         \n\t"\
        "punpcklbw %%mm7, %%mm0                    \n\t"\
        "punpckhbw %%mm7, %%mm3                    \n\t"\
        "movq %%mm1, %%mm4                         \n\t"\
        "punpcklbw %%mm7, %%mm1                    \n\t"\
        "punpckhbw %%mm7, %%mm4                    \n\t"\
        "psllw $2, %%mm1                           \n\t"\
        "psllw $2, %%mm4                           \n\t"\
        "psubw %%mm0, %%mm1                        \n\t"\
        "psubw %%mm3, %%mm4                        \n\t"\
        "movq %%mm2, %%mm5                         \n\t"\
        "movq %%mm2, %%mm0                         \n\t"\
        "punpcklbw %%mm7, %%mm2                    \n\t"\
        "punpckhbw %%mm7, %%mm5                    \n\t"\
        "paddw %%mm2, %%mm1                        \n\t"\
        "paddw %%mm5, %%mm4                        \n\t"\
        "psraw $2, %%mm1                           \n\t"\
        "psraw $2, %%mm4                           \n\t"\
        "packuswb %%mm4, %%mm1                     \n\t"\
        "movq %%mm1, " #b "                        \n\t"\

/* indirection so the address-expression arguments are expanded before stringization */
#define DEINT_FF(a,b,c,d)  REAL_DEINT_FF(a,b,c,d)

DEINT_FF((%0)           , (%%FF_REGa)       , (%%FF_REGa, %1), (%%FF_REGa, %1, 2))
DEINT_FF((%%FF_REGa, %1), (%%FF_REGa, %1, 2), (%0, %1, 4)    , (%%FF_REGd)       )
DEINT_FF((%0, %1, 4)    , (%%FF_REGd)       , (%%FF_REGd, %1), (%%FF_REGd, %1, 2))
DEINT_FF((%%FF_REGd, %1), (%%FF_REGd, %1, 2), (%0, %1, 8)    , (%%FF_REGd, %1, 4))

        "movq %%mm0, (%2)                          \n\t" // save carried line for next block
        : : "r" (src), "r" ((x86_reg)stride), "r"(tmp)
        : "%"FF_REG_a, "%"FF_REG_d
    );
#else //TEMPLATE_PP_MMXEXT
    int x;
    src+= stride*4;                                  // skip the already-filtered lines 0-3
    for(x=0; x<8; x++){
        int t1= tmp[x];                              // two lines above the one being written
        int t2= src[stride*1];                       // original value of the line being written

        src[stride*1]= av_clip_uint8((-t1 + 4*src[stride*0] + 2*t2 + 4*src[stride*2] - src[stride*3] + 4)>>3);
        t1= src[stride*4];
        src[stride*3]= av_clip_uint8((-t2 + 4*src[stride*2] + 2*t1 + 4*src[stride*4] - src[stride*5] + 4)>>3);
        t2= src[stride*6];
        src[stride*5]= av_clip_uint8((-t1 + 4*src[stride*4] + 2*t2 + 4*src[stride*6] - src[stride*7] + 4)>>3);
        t1= src[stride*8];
        src[stride*7]= av_clip_uint8((-t2 + 4*src[stride*6] + 2*t1 + 4*src[stride*8] - src[stride*9] + 4)>>3);
        tmp[x]= t1;                                  // carry line 8 down to the next block

        src++;
    }
#endif //TEMPLATE_PP_MMXEXT
}
1335 
/**
 * Deinterlace the given block by filtering every line with a (-1 2 6 2 -1) filter.
 * Will be called for every 8x8 block and can read & write from line 4-15.
 * Lines 0-3 have been passed through the deblock / dering filters already, but can be read, too.
 * Lines 4-12 will be read into the deblocking filter and should be deinterlaced.
 * This filter will read lines 4-13 and write 4-11.
 *
 * @param tmp  8-byte scratch row holding the unfiltered line two above the block
 *             (updated on exit for the block below)
 * @param tmp2 8-byte scratch row holding the unfiltered line one above the block
 *             (updated on exit for the block below)
 */
static inline void RENAME(deInterlaceL5)(uint8_t src[], int stride, uint8_t *tmp, uint8_t *tmp2)
{
#if TEMPLATE_PP_MMXEXT && HAVE_6REGS
    src+= stride*4;                                  // skip the already-filtered lines 0-3
    __asm__ volatile(
        "lea (%0, %1), %%"FF_REG_a"                \n\t" // eax = src + 1*stride
        "lea (%%"FF_REG_a", %1, 4), %%"FF_REG_d"   \n\t" // edx = src + 5*stride
        "pxor %%mm7, %%mm7                         \n\t" // zero reg for byte->word unpack
        "movq (%2), %%mm0                          \n\t" // line -2 (carried history)
        "movq (%3), %%mm1                          \n\t" // line -1 (carried history)
//      0       1       2       3       4       5       6       7       8       9       10
//      %0      eax     eax+%1  eax+2%1 %0+4%1  edx     edx+%1  edx+2%1 %0+8%1  edx+4%1 ecx

/* a = (6*a + 2*(t2+b) - (t1+c) + 4)/8, with the +/-1 taps approximated by
 * rounding byte averages (PAVGB).  t1/t2 are the two previous original lines;
 * the original value of a is saved into t1, rotating the history. */
#define REAL_DEINT_L5(t1,t2,a,b,c)\
        "movq " #a ", %%mm2                        \n\t"\
        "movq " #b ", %%mm3                        \n\t"\
        "movq " #c ", %%mm4                        \n\t"\
        PAVGB(t2, %%mm3)                                \
        PAVGB(t1, %%mm4)                                \
        "movq %%mm2, %%mm5                         \n\t"\
        "movq %%mm2, " #t1 "                       \n\t"\
        "punpcklbw %%mm7, %%mm2                    \n\t"\
        "punpckhbw %%mm7, %%mm5                    \n\t"\
        "movq %%mm2, %%mm6                         \n\t"\
        "paddw %%mm2, %%mm2                        \n\t"\
        "paddw %%mm6, %%mm2                        \n\t"\
        "movq %%mm5, %%mm6                         \n\t"\
        "paddw %%mm5, %%mm5                        \n\t"\
        "paddw %%mm6, %%mm5                        \n\t"\
        "movq %%mm3, %%mm6                         \n\t"\
        "punpcklbw %%mm7, %%mm3                    \n\t"\
        "punpckhbw %%mm7, %%mm6                    \n\t"\
        "paddw %%mm3, %%mm3                        \n\t"\
        "paddw %%mm6, %%mm6                        \n\t"\
        "paddw %%mm3, %%mm2                        \n\t"\
        "paddw %%mm6, %%mm5                        \n\t"\
        "movq %%mm4, %%mm6                         \n\t"\
        "punpcklbw %%mm7, %%mm4                    \n\t"\
        "punpckhbw %%mm7, %%mm6                    \n\t"\
        "psubw %%mm4, %%mm2                        \n\t"\
        "psubw %%mm6, %%mm5                        \n\t"\
        "psraw $2, %%mm2                           \n\t"\
        "psraw $2, %%mm5                           \n\t"\
        "packuswb %%mm5, %%mm2                     \n\t"\
        "movq %%mm2, " #a "                        \n\t"\

/* indirection so the address-expression arguments are expanded before stringization */
#define DEINT_L5(t1,t2,a,b,c)  REAL_DEINT_L5(t1,t2,a,b,c)

DEINT_L5(%%mm0, %%mm1, (%0)              , (%%FF_REGa)       , (%%FF_REGa, %1)   )
DEINT_L5(%%mm1, %%mm0, (%%FF_REGa)       , (%%FF_REGa, %1)   , (%%FF_REGa, %1, 2))
DEINT_L5(%%mm0, %%mm1, (%%FF_REGa, %1)   , (%%FF_REGa, %1, 2), (%0, %1, 4)       )
DEINT_L5(%%mm1, %%mm0, (%%FF_REGa, %1, 2), (%0, %1, 4)       , (%%FF_REGd)       )
DEINT_L5(%%mm0, %%mm1, (%0, %1, 4)       , (%%FF_REGd)       , (%%FF_REGd, %1)   )
DEINT_L5(%%mm1, %%mm0, (%%FF_REGd)       , (%%FF_REGd, %1)   , (%%FF_REGd, %1, 2))
DEINT_L5(%%mm0, %%mm1, (%%FF_REGd, %1)   , (%%FF_REGd, %1, 2), (%0, %1, 8)       )
DEINT_L5(%%mm1, %%mm0, (%%FF_REGd, %1, 2), (%0, %1, 8)       , (%%FF_REGd, %1, 4))

        "movq %%mm0, (%2)                          \n\t" // save history for next block
        "movq %%mm1, (%3)                          \n\t"
        : : "r" (src), "r" ((x86_reg)stride), "r"(tmp), "r"(tmp2)
        : "%"FF_REG_a, "%"FF_REG_d
    );
#else //TEMPLATE_PP_MMXEXT && HAVE_6REGS
    int x;
    src+= stride*4;                                  // skip the already-filtered lines 0-3
    for(x=0; x<8; x++){
        int t1= tmp[x];                              // original line -2
        int t2= tmp2[x];                             // original line -1
        int t3= src[0];                              // original current line

        // exact (-1 2 6 2 -1 + 4)>>3 with t1..t3 rotating through the
        // unfiltered values of the three most recent lines
        src[stride*0]= av_clip_uint8((-(t1 + src[stride*2]) + 2*(t2 + src[stride*1]) + 6*t3 + 4)>>3);
        t1= src[stride*1];
        src[stride*1]= av_clip_uint8((-(t2 + src[stride*3]) + 2*(t3 + src[stride*2]) + 6*t1 + 4)>>3);
        t2= src[stride*2];
        src[stride*2]= av_clip_uint8((-(t3 + src[stride*4]) + 2*(t1 + src[stride*3]) + 6*t2 + 4)>>3);
        t3= src[stride*3];
        src[stride*3]= av_clip_uint8((-(t1 + src[stride*5]) + 2*(t2 + src[stride*4]) + 6*t3 + 4)>>3);
        t1= src[stride*4];
        src[stride*4]= av_clip_uint8((-(t2 + src[stride*6]) + 2*(t3 + src[stride*5]) + 6*t1 + 4)>>3);
        t2= src[stride*5];
        src[stride*5]= av_clip_uint8((-(t3 + src[stride*7]) + 2*(t1 + src[stride*6]) + 6*t2 + 4)>>3);
        t3= src[stride*6];
        src[stride*6]= av_clip_uint8((-(t1 + src[stride*8]) + 2*(t2 + src[stride*7]) + 6*t3 + 4)>>3);
        t1= src[stride*7];
        src[stride*7]= av_clip_uint8((-(t2 + src[stride*9]) + 2*(t3 + src[stride*8]) + 6*t1 + 4)>>3);

        tmp[x]= t3;                                  // carry unfiltered lines 6 and 7
        tmp2[x]= t1;                                 // down to the next block

        src++;
    }
#endif // TEMPLATE_PP_MMXEXT && HAVE_6REGS
}
1436 
/**
 * Deinterlace the given block by filtering all lines with a (1 2 1) filter.
 * Will be called for every 8x8 block and can read & write from line 4-15.
 * Lines 0-3 have been passed through the deblock / dering filters already, but can be read, too.
 * Lines 4-12 will be read into the deblocking filter and should be deinterlaced.
 * This filter will read lines 4-13 and write 4-11.
 *
 * @param tmp 8-byte scratch row holding the unfiltered line above the block;
 *            on exit it holds this block's unfiltered last-but-one line for
 *            the block below.
 */
static inline void RENAME(deInterlaceBlendLinear)(uint8_t src[], int stride, uint8_t *tmp)
{
#if TEMPLATE_PP_MMXEXT
    src+= 4*stride;                                  // skip the already-filtered lines 0-3
    __asm__ volatile(
        "lea (%0, %1), %%"FF_REG_a"                \n\t" // eax = src + 1*stride
        "lea (%%"FF_REG_a", %1, 4), %%"FF_REG_d"   \n\t" // edx = src + 5*stride
//      0       1       2       3       4       5       6       7       8       9
//      %0      eax     eax+%1  eax+2%1 %0+4%1  edx     edx+%1  edx+2%1 %0+8%1  edx+4%1

        // each line becomes (above + 2*self + below)/4, built from two PAVGBs;
        // the register rotation keeps the unfiltered neighbours around
        "movq (%2), %%mm0                          \n\t" // L0 (carried from above)
        "movq (%%"FF_REG_a"), %%mm1                \n\t" // L2
        PAVGB(%%mm1, %%mm0)                              // L0+L2
        "movq (%0), %%mm2                          \n\t" // L1
        PAVGB(%%mm2, %%mm0)                              // (L0 + 2L1 + L2)/4
        "movq %%mm0, (%0)                          \n\t"
        "movq (%%"FF_REG_a", %1), %%mm0            \n\t" // L3
        PAVGB(%%mm0, %%mm2)                              // L1+L3
        PAVGB(%%mm1, %%mm2)                              // 2L2 + L1 + L3
        "movq %%mm2, (%%"FF_REG_a")                \n\t"
        "movq (%%"FF_REG_a", %1, 2), %%mm2         \n\t" // L4
        PAVGB(%%mm2, %%mm1)                              // L2+L4
        PAVGB(%%mm0, %%mm1)                              // 2L3 + L2 + L4
        "movq %%mm1, (%%"FF_REG_a", %1)            \n\t"
        "movq (%0, %1, 4), %%mm1                   \n\t" // L5
        PAVGB(%%mm1, %%mm0)                              // L3+L5
        PAVGB(%%mm2, %%mm0)                              // 2L4 + L3 + L5
        "movq %%mm0, (%%"FF_REG_a", %1, 2)         \n\t"
        "movq (%%"FF_REG_d"), %%mm0                \n\t" // L6
        PAVGB(%%mm0, %%mm2)                              // L4+L6
        PAVGB(%%mm1, %%mm2)                              // 2L5 + L4 + L6
        "movq %%mm2, (%0, %1, 4)                   \n\t"
        "movq (%%"FF_REG_d", %1), %%mm2            \n\t" // L7
        PAVGB(%%mm2, %%mm1)                              // L5+L7
        PAVGB(%%mm0, %%mm1)                              // 2L6 + L5 + L7
        "movq %%mm1, (%%"FF_REG_d")                \n\t"
        "movq (%%"FF_REG_d", %1, 2), %%mm1         \n\t" // L8
        PAVGB(%%mm1, %%mm0)                              // L6+L8
        PAVGB(%%mm2, %%mm0)                              // 2L7 + L6 + L8
        "movq %%mm0, (%%"FF_REG_d", %1)            \n\t"
        "movq (%0, %1, 8), %%mm0                   \n\t" // L9
        PAVGB(%%mm0, %%mm2)                              // L7+L9
        PAVGB(%%mm1, %%mm2)                              // 2L8 + L7 + L9
        "movq %%mm2, (%%"FF_REG_d", %1, 2)         \n\t"
        "movq %%mm1, (%2)                          \n\t" // carry unfiltered L8 down

        : : "r" (src), "r" ((x86_reg)stride), "r" (tmp)
        : "%"FF_REG_a, "%"FF_REG_d
    );
#else //TEMPLATE_PP_MMXEXT
    int a, b, c, x;
    src+= 4*stride;                                  // skip the already-filtered lines 0-3

    /* SWAR fallback, 4 columns per 32-bit word.
     * (x&y) + (((x^y)&0xFEFEFEFE)>>1) is the per-byte rounding-down average;
     * (x|y) - (((x^y)&0xFEFEFEFE)>>1) is the rounding-up one.  a/b/c rotate
     * through the unfiltered values of the three lines around the one written.
     * NOTE(review): assumes rows are safely accessible as 32-bit words —
     * longstanding convention in this file. */
    for(x=0; x<2; x++){
        a= *(uint32_t*)&tmp[stride*0];               // unfiltered line above
        b= *(uint32_t*)&src[stride*0];
        c= *(uint32_t*)&src[stride*1];
        a= (a&c) + (((a^c)&0xFEFEFEFEUL)>>1);
        *(uint32_t*)&src[stride*0]= (a|b) - (((a^b)&0xFEFEFEFEUL)>>1);

        a= *(uint32_t*)&src[stride*2];
        b= (a&b) + (((a^b)&0xFEFEFEFEUL)>>1);
        *(uint32_t*)&src[stride*1]= (c|b) - (((c^b)&0xFEFEFEFEUL)>>1);

        b= *(uint32_t*)&src[stride*3];
        c= (b&c) + (((b^c)&0xFEFEFEFEUL)>>1);
        *(uint32_t*)&src[stride*2]= (c|a) - (((c^a)&0xFEFEFEFEUL)>>1);

        c= *(uint32_t*)&src[stride*4];
        a= (a&c) + (((a^c)&0xFEFEFEFEUL)>>1);
        *(uint32_t*)&src[stride*3]= (a|b) - (((a^b)&0xFEFEFEFEUL)>>1);

        a= *(uint32_t*)&src[stride*5];
        b= (a&b) + (((a^b)&0xFEFEFEFEUL)>>1);
        *(uint32_t*)&src[stride*4]= (c|b) - (((c^b)&0xFEFEFEFEUL)>>1);

        b= *(uint32_t*)&src[stride*6];
        c= (b&c) + (((b^c)&0xFEFEFEFEUL)>>1);
        *(uint32_t*)&src[stride*5]= (c|a) - (((c^a)&0xFEFEFEFEUL)>>1);

        c= *(uint32_t*)&src[stride*7];
        a= (a&c) + (((a^c)&0xFEFEFEFEUL)>>1);
        *(uint32_t*)&src[stride*6]= (a|b) - (((a^b)&0xFEFEFEFEUL)>>1);

        a= *(uint32_t*)&src[stride*8];
        b= (a&b) + (((a^b)&0xFEFEFEFEUL)>>1);
        *(uint32_t*)&src[stride*7]= (c|b) - (((c^b)&0xFEFEFEFEUL)>>1);

        *(uint32_t*)&tmp[stride*0]= c;               // unfiltered line 7 for next block
        src += 4;
        tmp += 4;
    }
#endif //TEMPLATE_PP_MMXEXT
}
1538 
/**
 * Deinterlace the given block by applying a median filter to every second line.
 * Will be called for every 8x8 block and can read & write from line 4-15,
 * lines 0-3 have been passed through the deblock / dering filters already, but can be read, too.
 * Lines 4-12 will be read into the deblocking filter and should be deinterlaced.
 *
 * Each odd line is replaced per pixel by the median of itself and its two
 * vertical neighbours.
 */
static inline void RENAME(deInterlaceMedian)(uint8_t src[], int stride)
{
#if TEMPLATE_PP_MMXEXT
    src+= 4*stride;                                  // skip the already-filtered lines 0-3
    __asm__ volatile(
        "lea (%0, %1), %%"FF_REG_a"                \n\t" // eax = src + 1*stride
        "lea (%%"FF_REG_a", %1, 4), %%"FF_REG_d"   \n\t" // edx = src + 5*stride
//      0       1       2       3       4       5       6       7       8       9
//      %0      eax     eax+%1  eax+2%1 %0+4%1  edx     edx+%1  edx+2%1 %0+8%1  edx+4%1

        // median(a,b,c) = min(max(a,b), max(c, min(a,b))), byte-wise via
        // pmaxub/pminub (MMXEXT); result overwrites the middle (odd) line
        "movq (%0), %%mm0                          \n\t"
        "movq (%%"FF_REG_a", %1), %%mm2            \n\t"
        "movq (%%"FF_REG_a"), %%mm1                \n\t"
        "movq %%mm0, %%mm3                         \n\t"
        "pmaxub %%mm1, %%mm0                       \n\t"
        "pminub %%mm3, %%mm1                       \n\t"
        "pmaxub %%mm2, %%mm1                       \n\t"
        "pminub %%mm1, %%mm0                       \n\t"
        "movq %%mm0, (%%"FF_REG_a")                \n\t" // line 1 = median(0,1,2)

        "movq (%0, %1, 4), %%mm0                   \n\t"
        "movq (%%"FF_REG_a", %1, 2), %%mm1         \n\t"
        "movq %%mm2, %%mm3                         \n\t"
        "pmaxub %%mm1, %%mm2                       \n\t"
        "pminub %%mm3, %%mm1                       \n\t"
        "pmaxub %%mm0, %%mm1                       \n\t"
        "pminub %%mm1, %%mm2                       \n\t"
        "movq %%mm2, (%%"FF_REG_a", %1, 2)         \n\t" // line 3 = median(2,3,4)

        "movq (%%"FF_REG_d"), %%mm2                \n\t"
        "movq (%%"FF_REG_d", %1), %%mm1            \n\t"
        "movq %%mm2, %%mm3                         \n\t"
        "pmaxub %%mm0, %%mm2                       \n\t"
        "pminub %%mm3, %%mm0                       \n\t"
        "pmaxub %%mm1, %%mm0                       \n\t"
        "pminub %%mm0, %%mm2                       \n\t"
        "movq %%mm2, (%%"FF_REG_d")                \n\t" // line 5 = median(4,5,6)

        "movq (%%"FF_REG_d", %1, 2), %%mm2         \n\t"
        "movq (%0, %1, 8), %%mm0                   \n\t"
        "movq %%mm2, %%mm3                         \n\t"
        "pmaxub %%mm0, %%mm2                       \n\t"
        "pminub %%mm3, %%mm0                       \n\t"
        "pmaxub %%mm1, %%mm0                       \n\t"
        "pminub %%mm0, %%mm2                       \n\t"
        "movq %%mm2, (%%"FF_REG_d", %1, 2)         \n\t" // line 7 = median(6,7,8)


        : : "r" (src), "r" ((x86_reg)stride)
        : "%"FF_REG_a, "%"FF_REG_d
    );

#else //TEMPLATE_PP_MMXEXT
    int x, y;
    src+= 4*stride;                                  // skip the already-filtered lines 0-3
    // FIXME - there should be a way to do a few columns in parallel like w/mmx
    for(x=0; x<8; x++){
        uint8_t *colsrc = src;
        for (y=0; y<4; y++){
            int a, b, c, d, e, f;
            a = colsrc[0       ];
            b = colsrc[stride  ];
            c = colsrc[stride*2];
            // branchless median via sign masks: d/e/f are all-ones when the
            // difference is negative (relies on arithmetic right shift of
            // negative ints, as the surrounding code does)
            d = (a-b)>>31;
            e = (b-c)>>31;
            f = (c-a)>>31;
            colsrc[stride  ] = (a|(d^f)) & (b|(d^e)) & (c|(e^f));
            colsrc += stride*2;
        }
        src++;
    }
#endif //TEMPLATE_PP_MMXEXT
}
1618 
1619 #if TEMPLATE_PP_MMX
/**
 * Transpose and shift the given 8x8 block into dst1 and dst2.
 * The transposed rows are stored with a 16-byte pitch: into dst1 starting at
 * byte offset 128, and (partially, rows 3-7) into dst2 starting at offset 48.
 * The first asm half handles source rows 0-3 (transposed columns 0-3), the
 * second half rows 4-7 (byte offset +4 in every destination row).
 */
static inline void RENAME(transpose1)(uint8_t *dst1, uint8_t *dst2, const uint8_t *src, int srcStride)
{
    __asm__(
        "lea (%0, %1), %%"FF_REG_a"                \n\t" // eax = src + 1*srcStride
//      0       1       2       3       4       5       6       7       8       9
//      %0      eax     eax+%1  eax+2%1 %0+4%1  edx     edx+%1  edx+2%1 %0+8%1  edx+4%1
        // classic punpck transpose: interleave bytes, then words, so each mm
        // register ends up holding two transposed 4-byte columns
        "movq (%0), %%mm0                          \n\t" // 12345678
        "movq (%%"FF_REG_a"), %%mm1                \n\t" // abcdefgh
        "movq %%mm0, %%mm2                         \n\t" // 12345678
        "punpcklbw %%mm1, %%mm0                    \n\t" // 1a2b3c4d
        "punpckhbw %%mm1, %%mm2                    \n\t" // 5e6f7g8h

        "movq (%%"FF_REG_a", %1), %%mm1            \n\t"
        "movq (%%"FF_REG_a", %1, 2), %%mm3         \n\t"
        "movq %%mm1, %%mm4                         \n\t"
        "punpcklbw %%mm3, %%mm1                    \n\t"
        "punpckhbw %%mm3, %%mm4                    \n\t"

        "movq %%mm0, %%mm3                         \n\t"
        "punpcklwd %%mm1, %%mm0                    \n\t"
        "punpckhwd %%mm1, %%mm3                    \n\t"
        "movq %%mm2, %%mm1                         \n\t"
        "punpcklwd %%mm4, %%mm2                    \n\t"
        "punpckhwd %%mm4, %%mm1                    \n\t"

        // scatter the transposed 4-byte column pieces (16-byte row pitch);
        // rows 3-7 additionally go to dst2
        "movd %%mm0, 128(%2)                       \n\t"
        "psrlq $32, %%mm0                          \n\t"
        "movd %%mm0, 144(%2)                       \n\t"
        "movd %%mm3, 160(%2)                       \n\t"
        "psrlq $32, %%mm3                          \n\t"
        "movd %%mm3, 176(%2)                       \n\t"
        "movd %%mm3, 48(%3)                        \n\t"
        "movd %%mm2, 192(%2)                       \n\t"
        "movd %%mm2, 64(%3)                        \n\t"
        "psrlq $32, %%mm2                          \n\t"
        "movd %%mm2, 80(%3)                        \n\t"
        "movd %%mm1, 96(%3)                        \n\t"
        "psrlq $32, %%mm1                          \n\t"
        "movd %%mm1, 112(%3)                       \n\t"

        "lea (%%"FF_REG_a", %1, 4), %%"FF_REG_a"   \n\t" // advance to rows 4-7

        "movq (%0, %1, 4), %%mm0                   \n\t" // 12345678
        "movq (%%"FF_REG_a"), %%mm1                \n\t" // abcdefgh
        "movq %%mm0, %%mm2                         \n\t" // 12345678
        "punpcklbw %%mm1, %%mm0                    \n\t" // 1a2b3c4d
        "punpckhbw %%mm1, %%mm2                    \n\t" // 5e6f7g8h

        "movq (%%"FF_REG_a", %1), %%mm1            \n\t"
        "movq (%%"FF_REG_a", %1, 2), %%mm3         \n\t"
        "movq %%mm1, %%mm4                         \n\t"
        "punpcklbw %%mm3, %%mm1                    \n\t"
        "punpckhbw %%mm3, %%mm4                    \n\t"

        "movq %%mm0, %%mm3                         \n\t"
        "punpcklwd %%mm1, %%mm0                    \n\t"
        "punpckhwd %%mm1, %%mm3                    \n\t"
        "movq %%mm2, %%mm1                         \n\t"
        "punpcklwd %%mm4, %%mm2                    \n\t"
        "punpckhwd %%mm4, %%mm1                    \n\t"

        // same scatter, shifted 4 bytes right within each destination row
        "movd %%mm0, 132(%2)                       \n\t"
        "psrlq $32, %%mm0                          \n\t"
        "movd %%mm0, 148(%2)                       \n\t"
        "movd %%mm3, 164(%2)                       \n\t"
        "psrlq $32, %%mm3                          \n\t"
        "movd %%mm3, 180(%2)                       \n\t"
        "movd %%mm3, 52(%3)                        \n\t"
        "movd %%mm2, 196(%2)                       \n\t"
        "movd %%mm2, 68(%3)                        \n\t"
        "psrlq $32, %%mm2                          \n\t"
        "movd %%mm2, 84(%3)                        \n\t"
        "movd %%mm1, 100(%3)                       \n\t"
        "psrlq $32, %%mm1                          \n\t"
        "movd %%mm1, 116(%3)                       \n\t"


        :: "r" (src), "r" ((x86_reg)srcStride), "r" (dst1), "r" (dst2)
        : "%"FF_REG_a
    );
}
1704 
/**
 * Transpose the given 8x8 block.
 * Reads the block from src, which stores rows with a fixed 16-byte pitch
 * (offsets 0, 16, ..., 112 — the layout transpose1 writes), and stores the
 * transposed block to dst with the given dstStride.
 */
static inline void RENAME(transpose2)(uint8_t *dst, int dstStride, const uint8_t *src)
{
    __asm__(
        "lea (%0, %1), %%"FF_REG_a"                \n\t" // eax = dst + 1*dstStride
        "lea (%%"FF_REG_a",%1,4), %%"FF_REG_d"     \n\t" // edx = dst + 5*dstStride
//      0       1       2       3       4       5       6       7       8       9
//      %0      eax     eax+%1  eax+2%1 %0+4%1  edx     edx+%1  edx+2%1 %0+8%1  edx+4%1
        // punpck transpose of source rows 0-3 -> destination columns 0-3
        "movq (%2), %%mm0                          \n\t" // 12345678
        "movq 16(%2), %%mm1                        \n\t" // abcdefgh
        "movq %%mm0, %%mm2                         \n\t" // 12345678
        "punpcklbw %%mm1, %%mm0                    \n\t" // 1a2b3c4d
        "punpckhbw %%mm1, %%mm2                    \n\t" // 5e6f7g8h

        "movq 32(%2), %%mm1                        \n\t"
        "movq 48(%2), %%mm3                        \n\t"
        "movq %%mm1, %%mm4                         \n\t"
        "punpcklbw %%mm3, %%mm1                    \n\t"
        "punpckhbw %%mm3, %%mm4                    \n\t"

        "movq %%mm0, %%mm3                         \n\t"
        "punpcklwd %%mm1, %%mm0                    \n\t"
        "punpckhwd %%mm1, %%mm3                    \n\t"
        "movq %%mm2, %%mm1                         \n\t"
        "punpcklwd %%mm4, %%mm2                    \n\t"
        "punpckhwd %%mm4, %%mm1                    \n\t"

        // scatter 4-byte pieces to dst rows 0-7, left half of each row
        "movd %%mm0, (%0)                          \n\t"
        "psrlq $32, %%mm0                          \n\t"
        "movd %%mm0, (%%"FF_REG_a")                \n\t"
        "movd %%mm3, (%%"FF_REG_a", %1)            \n\t"
        "psrlq $32, %%mm3                          \n\t"
        "movd %%mm3, (%%"FF_REG_a", %1, 2)         \n\t"
        "movd %%mm2, (%0, %1, 4)                   \n\t"
        "psrlq $32, %%mm2                          \n\t"
        "movd %%mm2, (%%"FF_REG_d")                \n\t"
        "movd %%mm1, (%%"FF_REG_d", %1)            \n\t"
        "psrlq $32, %%mm1                          \n\t"
        "movd %%mm1, (%%"FF_REG_d", %1, 2)         \n\t"

        // transpose of source rows 4-7 -> right half (byte offset +4) of dst
        "movq 64(%2), %%mm0                        \n\t" // 12345678
        "movq 80(%2), %%mm1                        \n\t" // abcdefgh
        "movq %%mm0, %%mm2                         \n\t" // 12345678
        "punpcklbw %%mm1, %%mm0                    \n\t" // 1a2b3c4d
        "punpckhbw %%mm1, %%mm2                    \n\t" // 5e6f7g8h

        "movq 96(%2), %%mm1                        \n\t"
        "movq 112(%2), %%mm3                       \n\t"
        "movq %%mm1, %%mm4                         \n\t"
        "punpcklbw %%mm3, %%mm1                    \n\t"
        "punpckhbw %%mm3, %%mm4                    \n\t"

        "movq %%mm0, %%mm3                         \n\t"
        "punpcklwd %%mm1, %%mm0                    \n\t"
        "punpckhwd %%mm1, %%mm3                    \n\t"
        "movq %%mm2, %%mm1                         \n\t"
        "punpcklwd %%mm4, %%mm2                    \n\t"
        "punpckhwd %%mm4, %%mm1                    \n\t"

        "movd %%mm0, 4(%0)                         \n\t"
        "psrlq $32, %%mm0                          \n\t"
        "movd %%mm0, 4(%%"FF_REG_a")               \n\t"
        "movd %%mm3, 4(%%"FF_REG_a", %1)           \n\t"
        "psrlq $32, %%mm3                          \n\t"
        "movd %%mm3, 4(%%"FF_REG_a", %1, 2)        \n\t"
        "movd %%mm2, 4(%0, %1, 4)                  \n\t"
        "psrlq $32, %%mm2                          \n\t"
        "movd %%mm2, 4(%%"FF_REG_d")               \n\t"
        "movd %%mm1, 4(%%"FF_REG_d", %1)           \n\t"
        "psrlq $32, %%mm1                          \n\t"
        "movd %%mm1, 4(%%"FF_REG_d", %1, 2)        \n\t"

        :: "r" (dst), "r" ((x86_reg)dstStride), "r" (src)
        : "%"FF_REG_a, "%"FF_REG_d
    );
}
1784 #endif //TEMPLATE_PP_MMX
1785 //static long test=0;
1786 
1787 #if !TEMPLATE_PP_ALTIVEC
1788 static inline void RENAME(tempNoiseReducer)(uint8_t *src, int stride,
1789  uint8_t *tempBlurred, uint32_t *tempBlurredPast, const int *maxNoise)
1790 {
1791  // to save a register (FIXME do this outside of the loops)
1792  tempBlurredPast[127]= maxNoise[0];
1793  tempBlurredPast[128]= maxNoise[1];
1794  tempBlurredPast[129]= maxNoise[2];
1795 
1796 #define FAST_L2_DIFF
1797 //#define L1_DIFF //u should change the thresholds too if u try that one
1798 #if TEMPLATE_PP_MMXEXT && HAVE_6REGS
1799  __asm__ volatile(
1800  "lea (%2, %2, 2), %%"FF_REG_a" \n\t" // 3*stride
1801  "lea (%2, %2, 4), %%"FF_REG_d" \n\t" // 5*stride
1802  "lea (%%"FF_REG_d", %2, 2), %%"FF_REG_c"\n\t" // 7*stride
1803 // 0 1 2 3 4 5 6 7 8 9
1804 // %x %x+%2 %x+2%2 %x+eax %x+4%2 %x+edx %x+2eax %x+ecx %x+8%2
1805 //FIXME reorder?
1806 #ifdef L1_DIFF //needs mmx2
1807  "movq (%0), %%mm0 \n\t" // L0
1808  "psadbw (%1), %%mm0 \n\t" // |L0-R0|
1809  "movq (%0, %2), %%mm1 \n\t" // L1
1810  "psadbw (%1, %2), %%mm1 \n\t" // |L1-R1|
1811  "movq (%0, %2, 2), %%mm2 \n\t" // L2
1812  "psadbw (%1, %2, 2), %%mm2 \n\t" // |L2-R2|
1813  "movq (%0, %%"FF_REG_a"), %%mm3 \n\t" // L3
1814  "psadbw (%1, %%"FF_REG_a"), %%mm3 \n\t" // |L3-R3|
1815 
1816  "movq (%0, %2, 4), %%mm4 \n\t" // L4
1817  "paddw %%mm1, %%mm0 \n\t"
1818  "psadbw (%1, %2, 4), %%mm4 \n\t" // |L4-R4|
1819  "movq (%0, %%"FF_REG_d"), %%mm5 \n\t" // L5
1820  "paddw %%mm2, %%mm0 \n\t"
1821  "psadbw (%1, %%"FF_REG_d"), %%mm5 \n\t" // |L5-R5|
1822  "movq (%0, %%"FF_REG_a", 2), %%mm6 \n\t" // L6
1823  "paddw %%mm3, %%mm0 \n\t"
1824  "psadbw (%1, %%"FF_REG_a", 2), %%mm6 \n\t" // |L6-R6|
1825  "movq (%0, %%"FF_REG_c"), %%mm7 \n\t" // L7
1826  "paddw %%mm4, %%mm0 \n\t"
1827  "psadbw (%1, %%"FF_REG_c"), %%mm7 \n\t" // |L7-R7|
1828  "paddw %%mm5, %%mm6 \n\t"
1829  "paddw %%mm7, %%mm6 \n\t"
1830  "paddw %%mm6, %%mm0 \n\t"
1831 #else //L1_DIFF
1832 #if defined (FAST_L2_DIFF)
1833  "pcmpeqb %%mm7, %%mm7 \n\t"
1834  "movq "MANGLE(b80)", %%mm6 \n\t"
1835  "pxor %%mm0, %%mm0 \n\t"
1836 #define REAL_L2_DIFF_CORE(a, b)\
1837  "movq " #a ", %%mm5 \n\t"\
1838  "movq " #b ", %%mm2 \n\t"\
1839  "pxor %%mm7, %%mm2 \n\t"\
1840  PAVGB(%%mm2, %%mm5)\
1841  "paddb %%mm6, %%mm5 \n\t"\
1842  "movq %%mm5, %%mm2 \n\t"\
1843  "psllw $8, %%mm5 \n\t"\
1844  "pmaddwd %%mm5, %%mm5 \n\t"\
1845  "pmaddwd %%mm2, %%mm2 \n\t"\
1846  "paddd %%mm2, %%mm5 \n\t"\
1847  "psrld $14, %%mm5 \n\t"\
1848  "paddd %%mm5, %%mm0 \n\t"
1849 
1850 #else //defined (FAST_L2_DIFF)
1851  "pxor %%mm7, %%mm7 \n\t"
1852  "pxor %%mm0, %%mm0 \n\t"
1853 #define REAL_L2_DIFF_CORE(a, b)\
1854  "movq " #a ", %%mm5 \n\t"\
1855  "movq " #b ", %%mm2 \n\t"\
1856  "movq %%mm5, %%mm1 \n\t"\
1857  "movq %%mm2, %%mm3 \n\t"\
1858  "punpcklbw %%mm7, %%mm5 \n\t"\
1859  "punpckhbw %%mm7, %%mm1 \n\t"\
1860  "punpcklbw %%mm7, %%mm2 \n\t"\
1861  "punpckhbw %%mm7, %%mm3 \n\t"\
1862  "psubw %%mm2, %%mm5 \n\t"\
1863  "psubw %%mm3, %%mm1 \n\t"\
1864  "pmaddwd %%mm5, %%mm5 \n\t"\
1865  "pmaddwd %%mm1, %%mm1 \n\t"\
1866  "paddd %%mm1, %%mm5 \n\t"\
1867  "paddd %%mm5, %%mm0 \n\t"
1868 
1869 #endif //defined (FAST_L2_DIFF)
1870 
1871 #define L2_DIFF_CORE(a, b) REAL_L2_DIFF_CORE(a, b)
1872 
1873 L2_DIFF_CORE((%0) , (%1))
1874 L2_DIFF_CORE((%0, %2) , (%1, %2))
1875 L2_DIFF_CORE((%0, %2, 2) , (%1, %2, 2))
1876 L2_DIFF_CORE((%0, %%FF_REGa) , (%1, %%FF_REGa))
1877 L2_DIFF_CORE((%0, %2, 4) , (%1, %2, 4))
1878 L2_DIFF_CORE((%0, %%FF_REGd) , (%1, %%FF_REGd))
1879 L2_DIFF_CORE((%0, %%FF_REGa,2), (%1, %%FF_REGa,2))
1880 L2_DIFF_CORE((%0, %%FF_REGc) , (%1, %%FF_REGc))
1881 
1882 #endif //L1_DIFF
1883 
1884  "movq %%mm0, %%mm4 \n\t"
1885  "psrlq $32, %%mm0 \n\t"
1886  "paddd %%mm0, %%mm4 \n\t"
1887  "movd %%mm4, %%ecx \n\t"
1888  "shll $2, %%ecx \n\t"
1889  "mov %3, %%"FF_REG_d" \n\t"
1890  "addl -4(%%"FF_REG_d"), %%ecx \n\t"
1891  "addl 4(%%"FF_REG_d"), %%ecx \n\t"
1892  "addl -1024(%%"FF_REG_d"), %%ecx \n\t"
1893  "addl $4, %%ecx \n\t"
1894  "addl 1024(%%"FF_REG_d"), %%ecx \n\t"
1895  "shrl $3, %%ecx \n\t"
1896  "movl %%ecx, (%%"FF_REG_d") \n\t"
1897 
1898 // "mov %3, %%"FF_REG_c" \n\t"
1899 // "mov %%"FF_REG_c", test \n\t"
1900 // "jmp 4f \n\t"
1901  "cmpl 512(%%"FF_REG_d"), %%ecx \n\t"
1902  " jb 2f \n\t"
1903  "cmpl 516(%%"FF_REG_d"), %%ecx \n\t"
1904  " jb 1f \n\t"
1905 
1906  "lea (%%"FF_REG_a", %2, 2), %%"FF_REG_d"\n\t" // 5*stride
1907  "lea (%%"FF_REG_d", %2, 2), %%"FF_REG_c"\n\t" // 7*stride
1908  "movq (%0), %%mm0 \n\t" // L0
1909  "movq (%0, %2), %%mm1 \n\t" // L1
1910  "movq (%0, %2, 2), %%mm2 \n\t" // L2
1911  "movq (%0, %%"FF_REG_a"), %%mm3 \n\t" // L3
1912  "movq (%0, %2, 4), %%mm4 \n\t" // L4
1913  "movq (%0, %%"FF_REG_d"), %%mm5 \n\t" // L5
1914  "movq (%0, %%"FF_REG_a", 2), %%mm6 \n\t" // L6
1915  "movq (%0, %%"FF_REG_c"), %%mm7 \n\t" // L7
1916  "movq %%mm0, (%1) \n\t" // L0
1917  "movq %%mm1, (%1, %2) \n\t" // L1
1918  "movq %%mm2, (%1, %2, 2) \n\t" // L2
1919  "movq %%mm3, (%1, %%"FF_REG_a") \n\t" // L3
1920  "movq %%mm4, (%1, %2, 4) \n\t" // L4
1921  "movq %%mm5, (%1, %%"FF_REG_d") \n\t" // L5
1922  "movq %%mm6, (%1, %%"FF_REG_a", 2) \n\t" // L6
1923  "movq %%mm7, (%1, %%"FF_REG_c") \n\t" // L7
1924  "jmp 4f \n\t"
1925 
1926  "1: \n\t"
1927  "lea (%%"FF_REG_a", %2, 2), %%"FF_REG_d"\n\t" // 5*stride
1928  "lea (%%"FF_REG_d", %2, 2), %%"FF_REG_c"\n\t" // 7*stride
1929  "movq (%0), %%mm0 \n\t" // L0
1930  PAVGB((%1), %%mm0) // L0
1931  "movq (%0, %2), %%mm1 \n\t" // L1
1932  PAVGB((%1, %2), %%mm1) // L1
1933  "movq (%0, %2, 2), %%mm2 \n\t" // L2
1934  PAVGB((%1, %2, 2), %%mm2) // L2
1935  "movq (%0, %%"FF_REG_a"), %%mm3 \n\t" // L3
1936  PAVGB((%1, %%FF_REGa), %%mm3) // L3
1937  "movq (%0, %2, 4), %%mm4 \n\t" // L4
1938  PAVGB((%1, %2, 4), %%mm4) // L4
1939  "movq (%0, %%"FF_REG_d"), %%mm5 \n\t" // L5
1940  PAVGB((%1, %%FF_REGd), %%mm5) // L5
1941  "movq (%0, %%"FF_REG_a", 2), %%mm6 \n\t" // L6
1942  PAVGB((%1, %%FF_REGa, 2), %%mm6) // L6
1943  "movq (%0, %%"FF_REG_c"), %%mm7 \n\t" // L7
1944  PAVGB((%1, %%FF_REGc), %%mm7) // L7
1945  "movq %%mm0, (%1) \n\t" // R0
1946  "movq %%mm1, (%1, %2) \n\t" // R1
1947  "movq %%mm2, (%1, %2, 2) \n\t" // R2
1948  "movq %%mm3, (%1, %%"FF_REG_a") \n\t" // R3
1949  "movq %%mm4, (%1, %2, 4) \n\t" // R4
1950  "movq %%mm5, (%1, %%"FF_REG_d") \n\t" // R5
1951  "movq %%mm6, (%1, %%"FF_REG_a", 2) \n\t" // R6
1952  "movq %%mm7, (%1, %%"FF_REG_c") \n\t" // R7
1953  "movq %%mm0, (%0) \n\t" // L0
1954  "movq %%mm1, (%0, %2) \n\t" // L1
1955  "movq %%mm2, (%0, %2, 2) \n\t" // L2
1956  "movq %%mm3, (%0, %%"FF_REG_a") \n\t" // L3
1957  "movq %%mm4, (%0, %2, 4) \n\t" // L4
1958  "movq %%mm5, (%0, %%"FF_REG_d") \n\t" // L5
1959  "movq %%mm6, (%0, %%"FF_REG_a", 2) \n\t" // L6
1960  "movq %%mm7, (%0, %%"FF_REG_c") \n\t" // L7
1961  "jmp 4f \n\t"
1962 
1963  "2: \n\t"
1964  "cmpl 508(%%"FF_REG_d"), %%ecx \n\t"
1965  " jb 3f \n\t"
1966 
1967  "lea (%%"FF_REG_a", %2, 2), %%"FF_REG_d"\n\t" // 5*stride
1968  "lea (%%"FF_REG_d", %2, 2), %%"FF_REG_c"\n\t" // 7*stride
1969  "movq (%0), %%mm0 \n\t" // L0
1970  "movq (%0, %2), %%mm1 \n\t" // L1
1971  "movq (%0, %2, 2), %%mm2 \n\t" // L2
1972  "movq (%0, %%"FF_REG_a"), %%mm3 \n\t" // L3
1973  "movq (%1), %%mm4 \n\t" // R0
1974  "movq (%1, %2), %%mm5 \n\t" // R1
1975  "movq (%1, %2, 2), %%mm6 \n\t" // R2
1976  "movq (%1, %%"FF_REG_a"), %%mm7 \n\t" // R3
1977  PAVGB(%%mm4, %%mm0)
1978  PAVGB(%%mm5, %%mm1)
1979  PAVGB(%%mm6, %%mm2)
1980  PAVGB(%%mm7, %%mm3)
1981  PAVGB(%%mm4, %%mm0)
1982  PAVGB(%%mm5, %%mm1)
1983  PAVGB(%%mm6, %%mm2)
1984  PAVGB(%%mm7, %%mm3)
1985  "movq %%mm0, (%1) \n\t" // R0
1986  "movq %%mm1, (%1, %2) \n\t" // R1
1987  "movq %%mm2, (%1, %2, 2) \n\t" // R2
1988  "movq %%mm3, (%1, %%"FF_REG_a") \n\t" // R3
1989  "movq %%mm0, (%0) \n\t" // L0
1990  "movq %%mm1, (%0, %2) \n\t" // L1
1991  "movq %%mm2, (%0, %2, 2) \n\t" // L2
1992  "movq %%mm3, (%0, %%"FF_REG_a") \n\t" // L3
1993 
1994  "movq (%0, %2, 4), %%mm0 \n\t" // L4
1995  "movq (%0, %%"FF_REG_d"), %%mm1 \n\t" // L5
1996  "movq (%0, %%"FF_REG_a", 2), %%mm2 \n\t" // L6
1997  "movq (%0, %%"FF_REG_c"), %%mm3 \n\t" // L7
1998  "movq (%1, %2, 4), %%mm4 \n\t" // R4
1999  "movq (%1, %%"FF_REG_d"), %%mm5 \n\t" // R5
2000  "movq (%1, %%"FF_REG_a", 2), %%mm6 \n\t" // R6
2001  "movq (%1, %%"FF_REG_c"), %%mm7 \n\t" // R7
2002  PAVGB(%%mm4, %%mm0)
2003  PAVGB(%%mm5, %%mm1)
2004  PAVGB(%%mm6, %%mm2)
2005  PAVGB(%%mm7, %%mm3)
2006  PAVGB(%%mm4, %%mm0)
2007  PAVGB(%%mm5, %%mm1)
2008  PAVGB(%%mm6, %%mm2)
2009  PAVGB(%%mm7, %%mm3)
2010  "movq %%mm0, (%1, %2, 4) \n\t" // R4
2011  "movq %%mm1, (%1, %%"FF_REG_d") \n\t" // R5
2012  "movq %%mm2, (%1, %%"FF_REG_a", 2) \n\t" // R6
2013  "movq %%mm3, (%1, %%"FF_REG_c") \n\t" // R7
2014  "movq %%mm0, (%0, %2, 4) \n\t" // L4
2015  "movq %%mm1, (%0, %%"FF_REG_d") \n\t" // L5
2016  "movq %%mm2, (%0, %%"FF_REG_a", 2) \n\t" // L6
2017  "movq %%mm3, (%0, %%"FF_REG_c") \n\t" // L7
2018  "jmp 4f \n\t"
2019 
2020  "3: \n\t"
2021  "lea (%%"FF_REG_a", %2, 2), %%"FF_REG_d"\n\t" // 5*stride
2022  "lea (%%"FF_REG_d", %2, 2), %%"FF_REG_c"\n\t" // 7*stride
2023  "movq (%0), %%mm0 \n\t" // L0
2024  "movq (%0, %2), %%mm1 \n\t" // L1
2025  "movq (%0, %2, 2), %%mm2 \n\t" // L2
2026  "movq (%0, %%"FF_REG_a"), %%mm3 \n\t" // L3
2027  "movq (%1), %%mm4 \n\t" // R0
2028  "movq (%1, %2), %%mm5 \n\t" // R1
2029  "movq (%1, %2, 2), %%mm6 \n\t" // R2
2030  "movq (%1, %%"FF_REG_a"), %%mm7 \n\t" // R3
2031  PAVGB(%%mm4, %%mm0)
2032  PAVGB(%%mm5, %%mm1)
2033  PAVGB(%%mm6, %%mm2)
2034  PAVGB(%%mm7, %%mm3)
2035  PAVGB(%%mm4, %%mm0)
2036  PAVGB(%%mm5, %%mm1)
2037  PAVGB(%%mm6, %%mm2)
2038  PAVGB(%%mm7, %%mm3)
2039  PAVGB(%%mm4, %%mm0)
2040  PAVGB(%%mm5, %%mm1)
2041  PAVGB(%%mm6, %%mm2)
2042  PAVGB(%%mm7, %%mm3)
2043  "movq %%mm0, (%1) \n\t" // R0
2044  "movq %%mm1, (%1, %2) \n\t" // R1
2045  "movq %%mm2, (%1, %2, 2) \n\t" // R2
2046  "movq %%mm3, (%1, %%"FF_REG_a") \n\t" // R3
2047  "movq %%mm0, (%0) \n\t" // L0
2048  "movq %%mm1, (%0, %2) \n\t" // L1
2049  "movq %%mm2, (%0, %2, 2) \n\t" // L2
2050  "movq %%mm3, (%0, %%"FF_REG_a") \n\t" // L3
2051 
2052  "movq (%0, %2, 4), %%mm0 \n\t" // L4
2053  "movq (%0, %%"FF_REG_d"), %%mm1 \n\t" // L5
2054  "movq (%0, %%"FF_REG_a", 2), %%mm2 \n\t" // L6
2055  "movq (%0, %%"FF_REG_c"), %%mm3 \n\t" // L7
2056  "movq (%1, %2, 4), %%mm4 \n\t" // R4
2057  "movq (%1, %%"FF_REG_d"), %%mm5 \n\t" // R5
2058  "movq (%1, %%"FF_REG_a", 2), %%mm6 \n\t" // R6
2059  "movq (%1, %%"FF_REG_c"), %%mm7 \n\t" // R7
2060  PAVGB(%%mm4, %%mm0)
2061  PAVGB(%%mm5, %%mm1)
2062  PAVGB(%%mm6, %%mm2)
2063  PAVGB(%%mm7, %%mm3)
2064  PAVGB(%%mm4, %%mm0)
2065  PAVGB(%%mm5, %%mm1)
2066  PAVGB(%%mm6, %%mm2)
2067  PAVGB(%%mm7, %%mm3)
2068  PAVGB(%%mm4, %%mm0)
2069  PAVGB(%%mm5, %%mm1)
2070  PAVGB(%%mm6, %%mm2)
2071  PAVGB(%%mm7, %%mm3)
2072  "movq %%mm0, (%1, %2, 4) \n\t" // R4
2073  "movq %%mm1, (%1, %%"FF_REG_d") \n\t" // R5
2074  "movq %%mm2, (%1, %%"FF_REG_a", 2) \n\t" // R6
2075  "movq %%mm3, (%1, %%"FF_REG_c") \n\t" // R7
2076  "movq %%mm0, (%0, %2, 4) \n\t" // L4
2077  "movq %%mm1, (%0, %%"FF_REG_d") \n\t" // L5
2078  "movq %%mm2, (%0, %%"FF_REG_a", 2) \n\t" // L6
2079  "movq %%mm3, (%0, %%"FF_REG_c") \n\t" // L7
2080 
2081  "4: \n\t"
2082 
2083  :: "r" (src), "r" (tempBlurred), "r"((x86_reg)stride), "m" (tempBlurredPast)
2085  : "%"FF_REG_a, "%"FF_REG_d, "%"FF_REG_c, "memory"
2086  );
2087 #else //TEMPLATE_PP_MMXEXT && HAVE_6REGS
2088 {
2089  int y;
2090  int d=0;
2091 // int sysd=0;
2092  int i;
2093 
2094  for(y=0; y<8; y++){
2095  int x;
2096  for(x=0; x<8; x++){
2097  int ref= tempBlurred[ x + y*stride ];
2098  int cur= src[ x + y*stride ];
2099  int d1=ref - cur;
2100 // if(x==0 || x==7) d1+= d1>>1;
2101 // if(y==0 || y==7) d1+= d1>>1;
2102 // d+= FFABS(d1);
2103  d+= d1*d1;
2104 // sysd+= d1;
2105  }
2106  }
2107  i=d;
2108  d= (
2109  4*d
2110  +(*(tempBlurredPast-256))
2111  +(*(tempBlurredPast-1))+ (*(tempBlurredPast+1))
2112  +(*(tempBlurredPast+256))
2113  +4)>>3;
2114  *tempBlurredPast=i;
2115 // ((*tempBlurredPast)*3 + d + 2)>>2;
2116 
2117 /*
2118 Switch between
2119  1 0 0 0 0 0 0 (0)
2120 64 32 16 8 4 2 1 (1)
2121 64 48 36 27 20 15 11 (33) (approx)
2122 64 56 49 43 37 33 29 (200) (approx)
2123 */
2124  if(d > maxNoise[1]){
2125  if(d < maxNoise[2]){
2126  for(y=0; y<8; y++){
2127  int x;
2128  for(x=0; x<8; x++){
2129  int ref= tempBlurred[ x + y*stride ];
2130  int cur= src[ x + y*stride ];
2131  tempBlurred[ x + y*stride ]=
2132  src[ x + y*stride ]=
2133  (ref + cur + 1)>>1;
2134  }
2135  }
2136  }else{
2137  for(y=0; y<8; y++){
2138  int x;
2139  for(x=0; x<8; x++){
2140  tempBlurred[ x + y*stride ]= src[ x + y*stride ];
2141  }
2142  }
2143  }
2144  }else{
2145  if(d < maxNoise[0]){
2146  for(y=0; y<8; y++){
2147  int x;
2148  for(x=0; x<8; x++){
2149  int ref= tempBlurred[ x + y*stride ];
2150  int cur= src[ x + y*stride ];
2151  tempBlurred[ x + y*stride ]=
2152  src[ x + y*stride ]=
2153  (ref*7 + cur + 4)>>3;
2154  }
2155  }
2156  }else{
2157  for(y=0; y<8; y++){
2158  int x;
2159  for(x=0; x<8; x++){
2160  int ref= tempBlurred[ x + y*stride ];
2161  int cur= src[ x + y*stride ];
2162  tempBlurred[ x + y*stride ]=
2163  src[ x + y*stride ]=
2164  (ref*3 + cur + 2)>>2;
2165  }
2166  }
2167  }
2168  }
2169 }
2170 #endif //TEMPLATE_PP_MMXEXT && HAVE_6REGS
2171 }
2172 #endif //TEMPLATE_PP_ALTIVEC
2173 
2174 #if TEMPLATE_PP_MMXEXT
/**
 * Accurate deblocking filter.
 */
2178 static av_always_inline void RENAME(do_a_deblock)(uint8_t *src, int step, int stride, const PPContext *c, int mode){
2179  int64_t dc_mask, eq_mask, both_masks;
2180  int64_t sums[10*8*2];
2181  src+= step*3; // src points to begin of the 8x8 Block
2182 
2183  __asm__ volatile(
2184  "movq %0, %%mm7 \n\t"
2185  "movq %1, %%mm6 \n\t"
2186  : : "m" (c->mmxDcOffset[c->nonBQP]), "m" (c->mmxDcThreshold[c->nonBQP])
2187  );
2188 
2189  __asm__ volatile(
2190  "lea (%2, %3), %%"FF_REG_a" \n\t"
2191 // 0 1 2 3 4 5 6 7 8 9
2192 // %1 eax eax+%2 eax+2%2 %1+4%2 ecx ecx+%2 ecx+2%2 %1+8%2 ecx+4%2
2193 
2194  "movq (%2), %%mm0 \n\t"
2195  "movq (%%"FF_REG_a"), %%mm1 \n\t"
2196  "movq %%mm1, %%mm3 \n\t"
2197  "movq %%mm1, %%mm4 \n\t"
2198  "psubb %%mm1, %%mm0 \n\t" // mm0 = difference
2199  "paddb %%mm7, %%mm0 \n\t"
2200  "pcmpgtb %%mm6, %%mm0 \n\t"
2201 
2202  "movq (%%"FF_REG_a",%3), %%mm2 \n\t"
2203  PMAXUB(%%mm2, %%mm4)
2204  PMINUB(%%mm2, %%mm3, %%mm5)
2205  "psubb %%mm2, %%mm1 \n\t"
2206  "paddb %%mm7, %%mm1 \n\t"
2207  "pcmpgtb %%mm6, %%mm1 \n\t"
2208  "paddb %%mm1, %%mm0 \n\t"
2209 
2210  "movq (%%"FF_REG_a", %3, 2), %%mm1 \n\t"
2211  PMAXUB(%%mm1, %%mm4)
2212  PMINUB(%%mm1, %%mm3, %%mm5)
2213  "psubb %%mm1, %%mm2 \n\t"
2214  "paddb %%mm7, %%mm2 \n\t"
2215  "pcmpgtb %%mm6, %%mm2 \n\t"
2216  "paddb %%mm2, %%mm0 \n\t"
2217 
2218  "lea (%%"FF_REG_a", %3, 4), %%"FF_REG_a"\n\t"
2219 
2220  "movq (%2, %3, 4), %%mm2 \n\t"
2221  PMAXUB(%%mm2, %%mm4)
2222  PMINUB(%%mm2, %%mm3, %%mm5)
2223  "psubb %%mm2, %%mm1 \n\t"
2224  "paddb %%mm7, %%mm1 \n\t"
2225  "pcmpgtb %%mm6, %%mm1 \n\t"
2226  "paddb %%mm1, %%mm0 \n\t"
2227 
2228  "movq (%%"FF_REG_a"), %%mm1 \n\t"
2229  PMAXUB(%%mm1, %%mm4)
2230  PMINUB(%%mm1, %%mm3, %%mm5)
2231  "psubb %%mm1, %%mm2 \n\t"
2232  "paddb %%mm7, %%mm2 \n\t"
2233  "pcmpgtb %%mm6, %%mm2 \n\t"
2234  "paddb %%mm2, %%mm0 \n\t"
2235 
2236  "movq (%%"FF_REG_a", %3), %%mm2 \n\t"
2237  PMAXUB(%%mm2, %%mm4)
2238  PMINUB(%%mm2, %%mm3, %%mm5)
2239  "psubb %%mm2, %%mm1 \n\t"
2240  "paddb %%mm7, %%mm1 \n\t"
2241  "pcmpgtb %%mm6, %%mm1 \n\t"
2242  "paddb %%mm1, %%mm0 \n\t"
2243 
2244  "movq (%%"FF_REG_a", %3, 2), %%mm1 \n\t"
2245  PMAXUB(%%mm1, %%mm4)
2246  PMINUB(%%mm1, %%mm3, %%mm5)
2247  "psubb %%mm1, %%mm2 \n\t"
2248  "paddb %%mm7, %%mm2 \n\t"
2249  "pcmpgtb %%mm6, %%mm2 \n\t"
2250  "paddb %%mm2, %%mm0 \n\t"
2251 
2252  "movq (%2, %3, 8), %%mm2 \n\t"
2253  PMAXUB(%%mm2, %%mm4)
2254  PMINUB(%%mm2, %%mm3, %%mm5)
2255  "psubb %%mm2, %%mm1 \n\t"
2256  "paddb %%mm7, %%mm1 \n\t"
2257  "pcmpgtb %%mm6, %%mm1 \n\t"
2258  "paddb %%mm1, %%mm0 \n\t"
2259 
2260  "movq (%%"FF_REG_a", %3, 4), %%mm1 \n\t"
2261  "psubb %%mm1, %%mm2 \n\t"
2262  "paddb %%mm7, %%mm2 \n\t"
2263  "pcmpgtb %%mm6, %%mm2 \n\t"
2264  "paddb %%mm2, %%mm0 \n\t"
2265  "psubusb %%mm3, %%mm4 \n\t"
2266 
2267  "pxor %%mm6, %%mm6 \n\t"
2268  "movq %4, %%mm7 \n\t" // QP,..., QP
2269  "paddusb %%mm7, %%mm7 \n\t" // 2QP ... 2QP
2270  "psubusb %%mm4, %%mm7 \n\t" // Diff >=2QP -> 0
2271  "pcmpeqb %%mm6, %%mm7 \n\t" // Diff < 2QP -> 0
2272  "pcmpeqb %%mm6, %%mm7 \n\t" // Diff < 2QP -> 0
2273  "movq %%mm7, %1 \n\t"
2274 
2275  "movq %5, %%mm7 \n\t"
2276  "punpcklbw %%mm7, %%mm7 \n\t"
2277  "punpcklbw %%mm7, %%mm7 \n\t"
2278  "punpcklbw %%mm7, %%mm7 \n\t"
2279  "psubb %%mm0, %%mm6 \n\t"
2280  "pcmpgtb %%mm7, %%mm6 \n\t"
2281  "movq %%mm6, %0 \n\t"
2282 
2283  : "=m" (eq_mask), "=m" (dc_mask)
2284  : "r" (src), "r" ((x86_reg)step), "m" (c->pQPb), "m"(c->ppMode.flatnessThreshold)
2285  : "%"FF_REG_a
2286  );
2287 
2288  both_masks = dc_mask & eq_mask;
2289 
2290  if(both_masks){
2291  x86_reg offset= -8*step;
2292  int64_t *temp_sums= sums;
2293 
2294  __asm__ volatile(
2295  "movq %2, %%mm0 \n\t" // QP,..., QP
2296  "pxor %%mm4, %%mm4 \n\t"
2297 
2298  "movq (%0), %%mm6 \n\t"
2299  "movq (%0, %1), %%mm5 \n\t"
2300  "movq %%mm5, %%mm1 \n\t"
2301  "movq %%mm6, %%mm2 \n\t"
2302  "psubusb %%mm6, %%mm5 \n\t"
2303  "psubusb %%mm1, %%mm2 \n\t"
2304  "por %%mm5, %%mm2 \n\t" // ABS Diff of lines
2305  "psubusb %%mm2, %%mm0 \n\t" // diff >= QP -> 0
2306  "pcmpeqb %%mm4, %%mm0 \n\t" // diff >= QP -> FF
2307 
2308  "pxor %%mm6, %%mm1 \n\t"
2309  "pand %%mm0, %%mm1 \n\t"
2310  "pxor %%mm1, %%mm6 \n\t"
2311  // 0:QP 6:First
2312 
2313  "movq (%0, %1, 8), %%mm5 \n\t"
2314  "add %1, %0 \n\t" // %0 points to line 1 not 0
2315  "movq (%0, %1, 8), %%mm7 \n\t"
2316  "movq %%mm5, %%mm1 \n\t"
2317  "movq %%mm7, %%mm2 \n\t"
2318  "psubusb %%mm7, %%mm5 \n\t"
2319  "psubusb %%mm1, %%mm2 \n\t"
2320  "por %%mm5, %%mm2 \n\t" // ABS Diff of lines
2321  "movq %2, %%mm0 \n\t" // QP,..., QP
2322  "psubusb %%mm2, %%mm0 \n\t" // diff >= QP -> 0
2323  "pcmpeqb %%mm4, %%mm0 \n\t" // diff >= QP -> FF
2324 
2325  "pxor %%mm7, %%mm1 \n\t"
2326  "pand %%mm0, %%mm1 \n\t"
2327  "pxor %%mm1, %%mm7 \n\t"
2328 
2329  "movq %%mm6, %%mm5 \n\t"
2330  "punpckhbw %%mm4, %%mm6 \n\t"
2331  "punpcklbw %%mm4, %%mm5 \n\t"
2332  // 4:0 5/6:First 7:Last
2333 
2334  "movq %%mm5, %%mm0 \n\t"
2335  "movq %%mm6, %%mm1 \n\t"
2336  "psllw $2, %%mm0 \n\t"
2337  "psllw $2, %%mm1 \n\t"
2338  "paddw "MANGLE(w04)", %%mm0 \n\t"
2339  "paddw "MANGLE(w04)", %%mm1 \n\t"
2340 
2341 #define NEXT\
2342  "movq (%0), %%mm2 \n\t"\
2343  "movq (%0), %%mm3 \n\t"\
2344  "add %1, %0 \n\t"\
2345  "punpcklbw %%mm4, %%mm2 \n\t"\
2346  "punpckhbw %%mm4, %%mm3 \n\t"\
2347  "paddw %%mm2, %%mm0 \n\t"\
2348  "paddw %%mm3, %%mm1 \n\t"
2349 
2350 #define PREV\
2351  "movq (%0), %%mm2 \n\t"\
2352  "movq (%0), %%mm3 \n\t"\
2353  "add %1, %0 \n\t"\
2354  "punpcklbw %%mm4, %%mm2 \n\t"\
2355  "punpckhbw %%mm4, %%mm3 \n\t"\
2356  "psubw %%mm2, %%mm0 \n\t"\
2357  "psubw %%mm3, %%mm1 \n\t"
2358 
2359 
2360  NEXT //0
2361  NEXT //1
2362  NEXT //2
2363  "movq %%mm0, (%3) \n\t"
2364  "movq %%mm1, 8(%3) \n\t"
2365 
2366  NEXT //3
2367  "psubw %%mm5, %%mm0 \n\t"
2368  "psubw %%mm6, %%mm1 \n\t"
2369  "movq %%mm0, 16(%3) \n\t"
2370  "movq %%mm1, 24(%3) \n\t"
2371 
2372  NEXT //4
2373  "psubw %%mm5, %%mm0 \n\t"
2374  "psubw %%mm6, %%mm1 \n\t"
2375  "movq %%mm0, 32(%3) \n\t"
2376  "movq %%mm1, 40(%3) \n\t"
2377 
2378  NEXT //5
2379  "psubw %%mm5, %%mm0 \n\t"
2380  "psubw %%mm6, %%mm1 \n\t"
2381  "movq %%mm0, 48(%3) \n\t"
2382  "movq %%mm1, 56(%3) \n\t"
2383 
2384  NEXT //6
2385  "psubw %%mm5, %%mm0 \n\t"
2386  "psubw %%mm6, %%mm1 \n\t"
2387  "movq %%mm0, 64(%3) \n\t"
2388  "movq %%mm1, 72(%3) \n\t"
2389 
2390  "movq %%mm7, %%mm6 \n\t"
2391  "punpckhbw %%mm4, %%mm7 \n\t"
2392  "punpcklbw %%mm4, %%mm6 \n\t"
2393 
2394  NEXT //7
2395  "mov %4, %0 \n\t"
2396  "add %1, %0 \n\t"
2397  PREV //0
2398  "movq %%mm0, 80(%3) \n\t"
2399  "movq %%mm1, 88(%3) \n\t"
2400 
2401  PREV //1
2402  "paddw %%mm6, %%mm0 \n\t"
2403  "paddw %%mm7, %%mm1 \n\t"
2404  "movq %%mm0, 96(%3) \n\t"
2405  "movq %%mm1, 104(%3) \n\t"
2406 
2407  PREV //2
2408  "paddw %%mm6, %%mm0 \n\t"
2409  "paddw %%mm7, %%mm1 \n\t"
2410  "movq %%mm0, 112(%3) \n\t"
2411  "movq %%mm1, 120(%3) \n\t"
2412 
2413  PREV //3
2414  "paddw %%mm6, %%mm0 \n\t"
2415  "paddw %%mm7, %%mm1 \n\t"
2416  "movq %%mm0, 128(%3) \n\t"
2417  "movq %%mm1, 136(%3) \n\t"
2418 
2419  PREV //4
2420  "paddw %%mm6, %%mm0 \n\t"
2421  "paddw %%mm7, %%mm1 \n\t"
2422  "movq %%mm0, 144(%3) \n\t"
2423  "movq %%mm1, 152(%3) \n\t"
2424 
2425  "mov %4, %0 \n\t" //FIXME
2426 
2427  : "+&r"(src)
2428  : "r" ((x86_reg)step), "m" (c->pQPb), "r"(sums), "g"(src)
2430  );
2431 
2432  src+= step; // src points to begin of the 8x8 Block
2433 
2434  __asm__ volatile(
2435  "movq %4, %%mm6 \n\t"
2436  "pcmpeqb %%mm5, %%mm5 \n\t"
2437  "pxor %%mm6, %%mm5 \n\t"
2438  "pxor %%mm7, %%mm7 \n\t"
2439 
2440  "1: \n\t"
2441  "movq (%1), %%mm0 \n\t"
2442  "movq 8(%1), %%mm1 \n\t"
2443  "paddw 32(%1), %%mm0 \n\t"
2444  "paddw 40(%1), %%mm1 \n\t"
2445  "movq (%0, %3), %%mm2 \n\t"
2446  "movq %%mm2, %%mm3 \n\t"
2447  "movq %%mm2, %%mm4 \n\t"
2448  "punpcklbw %%mm7, %%mm2 \n\t"
2449  "punpckhbw %%mm7, %%mm3 \n\t"
2450  "paddw %%mm2, %%mm0 \n\t"
2451  "paddw %%mm3, %%mm1 \n\t"
2452  "paddw %%mm2, %%mm0 \n\t"
2453  "paddw %%mm3, %%mm1 \n\t"
2454  "psrlw $4, %%mm0 \n\t"
2455  "psrlw $4, %%mm1 \n\t"
2456  "packuswb %%mm1, %%mm0 \n\t"
2457  "pand %%mm6, %%mm0 \n\t"
2458  "pand %%mm5, %%mm4 \n\t"
2459  "por %%mm4, %%mm0 \n\t"
2460  "movq %%mm0, (%0, %3) \n\t"
2461  "add $16, %1 \n\t"
2462  "add %2, %0 \n\t"
2463  " js 1b \n\t"
2464 
2465  : "+r"(offset), "+r"(temp_sums)
2466  : "r" ((x86_reg)step), "r"(src - offset), "m"(both_masks)
2467  );
2468  }else
2469  src+= step; // src points to begin of the 8x8 Block
2470 
2471  if(eq_mask != -1LL){
2472  uint8_t *temp_src= src;
2473  DECLARE_ALIGNED(8, uint64_t, tmp)[4]; // make space for 4 8-byte vars
2474  __asm__ volatile(
2475  "pxor %%mm7, %%mm7 \n\t"
2476 // 0 1 2 3 4 5 6 7 8 9
2477 // %0 eax eax+%1 eax+2%1 %0+4%1 ecx ecx+%1 ecx+2%1 %1+8%1 ecx+4%1
2478 
2479  "movq (%0), %%mm0 \n\t"
2480  "movq %%mm0, %%mm1 \n\t"
2481  "punpcklbw %%mm7, %%mm0 \n\t" // low part of line 0
2482  "punpckhbw %%mm7, %%mm1 \n\t" // high part of line 0
2483 
2484  "movq (%0, %1), %%mm2 \n\t"
2485  "lea (%0, %1, 2), %%"FF_REG_a" \n\t"
2486  "movq %%mm2, %%mm3 \n\t"
2487  "punpcklbw %%mm7, %%mm2 \n\t" // low part of line 1
2488  "punpckhbw %%mm7, %%mm3 \n\t" // high part of line 1
2489 
2490  "movq (%%"FF_REG_a"), %%mm4 \n\t"
2491  "movq %%mm4, %%mm5 \n\t"
2492  "punpcklbw %%mm7, %%mm4 \n\t" // low part of line 2
2493  "punpckhbw %%mm7, %%mm5 \n\t" // high part of line 2
2494 
2495  "paddw %%mm0, %%mm0 \n\t" // 2L0
2496  "paddw %%mm1, %%mm1 \n\t" // 2H0
2497  "psubw %%mm4, %%mm2 \n\t" // L1 - L2
2498  "psubw %%mm5, %%mm3 \n\t" // H1 - H2
2499  "psubw %%mm2, %%mm0 \n\t" // 2L0 - L1 + L2
2500  "psubw %%mm3, %%mm1 \n\t" // 2H0 - H1 + H2
2501 
2502  "psllw $2, %%mm2 \n\t" // 4L1 - 4L2
2503  "psllw $2, %%mm3 \n\t" // 4H1 - 4H2
2504  "psubw %%mm2, %%mm0 \n\t" // 2L0 - 5L1 + 5L2
2505  "psubw %%mm3, %%mm1 \n\t" // 2H0 - 5H1 + 5H2
2506 
2507  "movq (%%"FF_REG_a", %1), %%mm2 \n\t"
2508  "movq %%mm2, %%mm3 \n\t"
2509  "punpcklbw %%mm7, %%mm2 \n\t" // L3
2510  "punpckhbw %%mm7, %%mm3 \n\t" // H3
2511 
2512  "psubw %%mm2, %%mm0 \n\t" // 2L0 - 5L1 + 5L2 - L3
2513  "psubw %%mm3, %%mm1 \n\t" // 2H0 - 5H1 + 5H2 - H3
2514  "psubw %%mm2, %%mm0 \n\t" // 2L0 - 5L1 + 5L2 - 2L3
2515  "psubw %%mm3, %%mm1 \n\t" // 2H0 - 5H1 + 5H2 - 2H3
2516  "movq %%mm0, (%4) \n\t" // 2L0 - 5L1 + 5L2 - 2L3
2517  "movq %%mm1, 8(%4) \n\t" // 2H0 - 5H1 + 5H2 - 2H3
2518 
2519  "movq (%%"FF_REG_a", %1, 2), %%mm0 \n\t"
2520  "movq %%mm0, %%mm1 \n\t"
2521  "punpcklbw %%mm7, %%mm0 \n\t" // L4
2522  "punpckhbw %%mm7, %%mm1 \n\t" // H4
2523 
2524  "psubw %%mm0, %%mm2 \n\t" // L3 - L4
2525  "psubw %%mm1, %%mm3 \n\t" // H3 - H4
2526  "movq %%mm2, 16(%4) \n\t" // L3 - L4
2527  "movq %%mm3, 24(%4) \n\t" // H3 - H4
2528  "paddw %%mm4, %%mm4 \n\t" // 2L2
2529  "paddw %%mm5, %%mm5 \n\t" // 2H2
2530  "psubw %%mm2, %%mm4 \n\t" // 2L2 - L3 + L4
2531  "psubw %%mm3, %%mm5 \n\t" // 2H2 - H3 + H4
2532 
2533  "lea (%%"FF_REG_a", %1), %0 \n\t"
2534  "psllw $2, %%mm2 \n\t" // 4L3 - 4L4
2535  "psllw $2, %%mm3 \n\t" // 4H3 - 4H4
2536  "psubw %%mm2, %%mm4 \n\t" // 2L2 - 5L3 + 5L4
2537  "psubw %%mm3, %%mm5 \n\t" // 2H2 - 5H3 + 5H4
2538 //50 opcodes so far
2539  "movq (%0, %1, 2), %%mm2 \n\t"
2540  "movq %%mm2, %%mm3 \n\t"
2541  "punpcklbw %%mm7, %%mm2 \n\t" // L5
2542  "punpckhbw %%mm7, %%mm3 \n\t" // H5
2543  "psubw %%mm2, %%mm4 \n\t" // 2L2 - 5L3 + 5L4 - L5
2544  "psubw %%mm3, %%mm5 \n\t" // 2H2 - 5H3 + 5H4 - H5
2545  "psubw %%mm2, %%mm4 \n\t" // 2L2 - 5L3 + 5L4 - 2L5
2546  "psubw %%mm3, %%mm5 \n\t" // 2H2 - 5H3 + 5H4 - 2H5
2547 
2548  "movq (%%"FF_REG_a", %1, 4), %%mm6 \n\t"
2549  "punpcklbw %%mm7, %%mm6 \n\t" // L6
2550  "psubw %%mm6, %%mm2 \n\t" // L5 - L6
2551  "movq (%%"FF_REG_a", %1, 4), %%mm6 \n\t"
2552  "punpckhbw %%mm7, %%mm6 \n\t" // H6
2553  "psubw %%mm6, %%mm3 \n\t" // H5 - H6
2554 
2555  "paddw %%mm0, %%mm0 \n\t" // 2L4
2556  "paddw %%mm1, %%mm1 \n\t" // 2H4
2557  "psubw %%mm2, %%mm0 \n\t" // 2L4 - L5 + L6
2558  "psubw %%mm3, %%mm1 \n\t" // 2H4 - H5 + H6
2559 
2560  "psllw $2, %%mm2 \n\t" // 4L5 - 4L6
2561  "psllw $2, %%mm3 \n\t" // 4H5 - 4H6
2562  "psubw %%mm2, %%mm0 \n\t" // 2L4 - 5L5 + 5L6
2563  "psubw %%mm3, %%mm1 \n\t" // 2H4 - 5H5 + 5H6
2564 
2565  "movq (%0, %1, 4), %%mm2 \n\t"
2566  "movq %%mm2, %%mm3 \n\t"
2567  "punpcklbw %%mm7, %%mm2 \n\t" // L7
2568  "punpckhbw %%mm7, %%mm3 \n\t" // H7
2569 
2570  "paddw %%mm2, %%mm2 \n\t" // 2L7
2571  "paddw %%mm3, %%mm3 \n\t" // 2H7
2572  "psubw %%mm2, %%mm0 \n\t" // 2L4 - 5L5 + 5L6 - 2L7
2573  "psubw %%mm3, %%mm1 \n\t" // 2H4 - 5H5 + 5H6 - 2H7
2574 
2575  "movq (%4), %%mm2 \n\t" // 2L0 - 5L1 + 5L2 - 2L3
2576  "movq 8(%4), %%mm3 \n\t" // 2H0 - 5H1 + 5H2 - 2H3
2577 
2578  "movq %%mm7, %%mm6 \n\t" // 0
2579  "psubw %%mm0, %%mm6 \n\t"
2580  "pmaxsw %%mm6, %%mm0 \n\t" // |2L4 - 5L5 + 5L6 - 2L7|
2581  "movq %%mm7, %%mm6 \n\t" // 0
2582  "psubw %%mm1, %%mm6 \n\t"
2583  "pmaxsw %%mm6, %%mm1 \n\t" // |2H4 - 5H5 + 5H6 - 2H7|
2584  "movq %%mm7, %%mm6 \n\t" // 0
2585  "psubw %%mm2, %%mm6 \n\t"
2586  "pmaxsw %%mm6, %%mm2 \n\t" // |2L0 - 5L1 + 5L2 - 2L3|
2587  "movq %%mm7, %%mm6 \n\t" // 0
2588  "psubw %%mm3, %%mm6 \n\t"
2589  "pmaxsw %%mm6, %%mm3 \n\t" // |2H0 - 5H1 + 5H2 - 2H3|
2590 
2591  "pminsw %%mm2, %%mm0 \n\t"
2592  "pminsw %%mm3, %%mm1 \n\t"
2593 
2594  "movd %2, %%mm2 \n\t" // QP
2595  "punpcklbw %%mm7, %%mm2 \n\t"
2596 
2597  "movq %%mm7, %%mm6 \n\t" // 0
2598  "pcmpgtw %%mm4, %%mm6 \n\t" // sign(2L2 - 5L3 + 5L4 - 2L5)
2599  "pxor %%mm6, %%mm4 \n\t"
2600  "psubw %%mm6, %%mm4 \n\t" // |2L2 - 5L3 + 5L4 - 2L5|
2601  "pcmpgtw %%mm5, %%mm7 \n\t" // sign(2H2 - 5H3 + 5H4 - 2H5)
2602  "pxor %%mm7, %%mm5 \n\t"
2603  "psubw %%mm7, %%mm5 \n\t" // |2H2 - 5H3 + 5H4 - 2H5|
2604 // 100 opcodes
2605  "psllw $3, %%mm2 \n\t" // 8QP
2606  "movq %%mm2, %%mm3 \n\t" // 8QP
2607  "pcmpgtw %%mm4, %%mm2 \n\t"
2608  "pcmpgtw %%mm5, %%mm3 \n\t"
2609  "pand %%mm2, %%mm4 \n\t"
2610  "pand %%mm3, %%mm5 \n\t"
2611 
2612 
2613  "psubusw %%mm0, %%mm4 \n\t" // hd
2614  "psubusw %%mm1, %%mm5 \n\t" // ld
2615 
2616 
2617  "movq "MANGLE(w05)", %%mm2 \n\t" // 5
2618  "pmullw %%mm2, %%mm4 \n\t"
2619  "pmullw %%mm2, %%mm5 \n\t"
2620  "movq "MANGLE(w20)", %%mm2 \n\t" // 32
2621  "paddw %%mm2, %%mm4 \n\t"
2622  "paddw %%mm2, %%mm5 \n\t"
2623  "psrlw $6, %%mm4 \n\t"
2624  "psrlw $6, %%mm5 \n\t"
2625 
2626  "movq 16(%4), %%mm0 \n\t" // L3 - L4
2627  "movq 24(%4), %%mm1 \n\t" // H3 - H4
2628 
2629  "pxor %%mm2, %%mm2 \n\t"
2630  "pxor %%mm3, %%mm3 \n\t"
2631 
2632  "pcmpgtw %%mm0, %%mm2 \n\t" // sign (L3-L4)
2633  "pcmpgtw %%mm1, %%mm3 \n\t" // sign (H3-H4)
2634  "pxor %%mm2, %%mm0 \n\t"
2635  "pxor %%mm3, %%mm1 \n\t"
2636  "psubw %%mm2, %%mm0 \n\t" // |L3-L4|
2637  "psubw %%mm3, %%mm1 \n\t" // |H3-H4|
2638  "psrlw $1, %%mm0 \n\t" // |L3 - L4|/2
2639  "psrlw $1, %%mm1 \n\t" // |H3 - H4|/2
2640 
2641  "pxor %%mm6, %%mm2 \n\t"
2642  "pxor %%mm7, %%mm3 \n\t"
2643  "pand %%mm2, %%mm4 \n\t"
2644  "pand %%mm3, %%mm5 \n\t"
2645 
2646  "pminsw %%mm0, %%mm4 \n\t"
2647  "pminsw %%mm1, %%mm5 \n\t"
2648  "pxor %%mm6, %%mm4 \n\t"
2649  "pxor %%mm7, %%mm5 \n\t"
2650  "psubw %%mm6, %%mm4 \n\t"
2651  "psubw %%mm7, %%mm5 \n\t"
2652  "packsswb %%mm5, %%mm4 \n\t"
2653  "movq %3, %%mm1 \n\t"
2654  "pandn %%mm4, %%mm1 \n\t"
2655  "movq (%0), %%mm0 \n\t"
2656  "paddb %%mm1, %%mm0 \n\t"
2657  "movq %%mm0, (%0) \n\t"
2658  "movq (%0, %1), %%mm0 \n\t"
2659  "psubb %%mm1, %%mm0 \n\t"
2660  "movq %%mm0, (%0, %1) \n\t"
2661 
2662  : "+r" (temp_src)
2663  : "r" ((x86_reg)step), "m" (c->pQPb), "m"(eq_mask), "r"(tmp)
2664  NAMED_CONSTRAINTS_ADD(w05,w20)
2665  : "%"FF_REG_a
2666  );
2667  }
2668 }
2669 #endif //TEMPLATE_PP_MMX
2670 
2671 static void RENAME(postProcess)(const uint8_t src[], int srcStride, uint8_t dst[], int dstStride, int width, int height,
2672  const int8_t QPs[], int QPStride, int isColor, PPContext *c);
2673 
/**
 * Copy a block from src to dst and fix the black level.
 * levelFix == 0 -> do not touch the brightness & contrast
 */
2678 #undef REAL_SCALED_CPY
2679 #undef SCALED_CPY
2680 
static inline void RENAME(blockCopy)(uint8_t dst[], int dstStride, const uint8_t src[], int srcStride,
                                     int levelFix, int64_t *packedOffsetAndScale)
{
    // Copy one 8-line block from src to dst. When levelFix is set, the SIMD
    // path also applies the packed brightness/contrast correction
    // (offset + scale pair pointed to by packedOffsetAndScale).
    if(levelFix){
#if TEMPLATE_PP_MMXEXT && HAVE_6REGS
        __asm__ volatile(
            "movq (%%"FF_REG_a"), %%mm2     \n\t" // packedYOffset
            "movq 8(%%"FF_REG_a"), %%mm3    \n\t" // packedYScale
            "lea (%2,%4), %%"FF_REG_a"      \n\t"
            "lea (%3,%5), %%"FF_REG_d"      \n\t"
            "pxor %%mm4, %%mm4              \n\t"
// Scale two 8-byte lines (src1/src2) by packedYScale, subtract
// packedYOffset, and store them to dst1/dst2.
#define REAL_SCALED_CPY(src1, src2, dst1, dst2) \
            "movq " #src1 ", %%mm0          \n\t"\
            "movq " #src1 ", %%mm5          \n\t"\
            "movq " #src2 ", %%mm1          \n\t"\
            "movq " #src2 ", %%mm6          \n\t"\
            "punpcklbw %%mm0, %%mm0         \n\t"\
            "punpckhbw %%mm5, %%mm5         \n\t"\
            "punpcklbw %%mm1, %%mm1         \n\t"\
            "punpckhbw %%mm6, %%mm6         \n\t"\
            "pmulhuw %%mm3, %%mm0           \n\t"\
            "pmulhuw %%mm3, %%mm5           \n\t"\
            "pmulhuw %%mm3, %%mm1           \n\t"\
            "pmulhuw %%mm3, %%mm6           \n\t"\
            "psubw %%mm2, %%mm0             \n\t"\
            "psubw %%mm2, %%mm5             \n\t"\
            "psubw %%mm2, %%mm1             \n\t"\
            "psubw %%mm2, %%mm6             \n\t"\
            "packuswb %%mm5, %%mm0          \n\t"\
            "packuswb %%mm6, %%mm1          \n\t"\
            "movq %%mm0, " #dst1 "          \n\t"\
            "movq %%mm1, " #dst2 "          \n\t"\

#define SCALED_CPY(src1, src2, dst1, dst2)\
   REAL_SCALED_CPY(src1, src2, dst1, dst2)

SCALED_CPY((%2)       , (%2, %4)       , (%3)       , (%3, %5))
SCALED_CPY((%2, %4, 2), (%%FF_REGa, %4, 2), (%3, %5, 2), (%%FF_REGd, %5, 2))
SCALED_CPY((%2, %4, 4), (%%FF_REGa, %4, 4), (%3, %5, 4), (%%FF_REGd, %5, 4))
            "lea (%%"FF_REG_a",%4,4), %%"FF_REG_a"  \n\t"
            "lea (%%"FF_REG_d",%5,4), %%"FF_REG_d"  \n\t"
SCALED_CPY((%%FF_REGa, %4), (%%FF_REGa, %4, 2), (%%FF_REGd, %5), (%%FF_REGd, %5, 2))


            : "=&a" (packedOffsetAndScale)
            : "0" (packedOffsetAndScale),
              "r"(src),
              "r"(dst),
              "r" ((x86_reg)srcStride),
              "r" ((x86_reg)dstStride)
            : "%"FF_REG_d
        );
#else //TEMPLATE_PP_MMXEXT && HAVE_6REGS
        // NOTE(review): the C fallback is a plain copy -- the black-level
        // correction is only implemented in the SIMD path above.
        for (int i = 0; i < 8; i++)
            memcpy( &(dst[dstStride*i]),
                    &(src[srcStride*i]), BLOCK_SIZE);
#endif //TEMPLATE_PP_MMXEXT && HAVE_6REGS
    }else{
#if TEMPLATE_PP_MMX && HAVE_6REGS
        __asm__ volatile(
            "lea (%0,%2), %%"FF_REG_a"      \n\t"
            "lea (%1,%3), %%"FF_REG_d"      \n\t"

// Copy two 8-byte lines from src1/src2 to dst1/dst2 unchanged.
#define REAL_SIMPLE_CPY(src1, src2, dst1, dst2) \
            "movq " #src1 ", %%mm0          \n\t"\
            "movq " #src2 ", %%mm1          \n\t"\
            "movq %%mm0, " #dst1 "          \n\t"\
            "movq %%mm1, " #dst2 "          \n\t"\

#define SIMPLE_CPY(src1, src2, dst1, dst2)\
   REAL_SIMPLE_CPY(src1, src2, dst1, dst2)

SIMPLE_CPY((%0)       , (%0, %2)       , (%1)       , (%1, %3))
SIMPLE_CPY((%0, %2, 2), (%%FF_REGa, %2, 2), (%1, %3, 2), (%%FF_REGd, %3, 2))
SIMPLE_CPY((%0, %2, 4), (%%FF_REGa, %2, 4), (%1, %3, 4), (%%FF_REGd, %3, 4))
            "lea (%%"FF_REG_a",%2,4), %%"FF_REG_a"  \n\t"
            "lea (%%"FF_REG_d",%3,4), %%"FF_REG_d"  \n\t"
SIMPLE_CPY((%%FF_REGa, %2), (%%FF_REGa, %2, 2), (%%FF_REGd, %3), (%%FF_REGd, %3, 2))

            : : "r" (src),
                "r" (dst),
                "r" ((x86_reg)srcStride),
                "r" ((x86_reg)dstStride)
            : "%"FF_REG_a, "%"FF_REG_d
        );
#else //TEMPLATE_PP_MMX && HAVE_6REGS
        for (int i = 0; i < 8; i++)
            memcpy( &(dst[dstStride*i]),
                    &(src[srcStride*i]), BLOCK_SIZE);
#endif //TEMPLATE_PP_MMX && HAVE_6REGS
    }
}
2773 
/**
 * Duplicate the given 8 src pixels into the 5 lines above them.
 */
static inline void RENAME(duplicate)(uint8_t src[], int stride)
{
    // Replicate the 8 bytes at src into the 5 lines directly above it
    // (presumably border padding for the temporary buffers -- see callers).
#if TEMPLATE_PP_MMX
    __asm__ volatile(
        "movq (%0), %%mm0               \n\t"
        "movq %%mm0, (%0, %1, 4)        \n\t" // %1 is -stride, so src - 4*stride
        "add %1, %0                     \n\t"
        "movq %%mm0, (%0)               \n\t" // src - 1..3 and -5 strides below
        "movq %%mm0, (%0, %1)           \n\t"
        "movq %%mm0, (%0, %1, 2)        \n\t"
        "movq %%mm0, (%0, %1, 4)        \n\t"
        : "+r" (src)
        : "r" ((x86_reg)-stride)
    );
#else
    int i;
    uint8_t *p=src;
    for(i=0; i<5; i++){
        p-= stride;
        memcpy(p, src, 8);
    }
#endif
}
2800 
/* Cache-prefetch helpers, three variants selected at compile time:
 * real SSE prefetch instructions on x86 MMXEXT, __builtin_prefetch on
 * other GCC >= 3.2 targets, and no-ops everywhere else. */
#if ARCH_X86 && TEMPLATE_PP_MMXEXT
static inline void RENAME(prefetchnta)(const void *p)
{
    // Non-temporal prefetch: fetch without polluting higher cache levels.
    __asm__ volatile( "prefetchnta (%0)\n\t"
        : : "r" (p)
    );
}

static inline void RENAME(prefetcht0)(const void *p)
{
    // Prefetch into all cache levels.
    __asm__ volatile( "prefetcht0 (%0)\n\t"
        : : "r" (p)
    );
}

static inline void RENAME(prefetcht1)(const void *p)
{
    // Prefetch into L2 and up.
    __asm__ volatile( "prefetcht1 (%0)\n\t"
        : : "r" (p)
    );
}

static inline void RENAME(prefetcht2)(const void *p)
{
    // Prefetch into L3 and up (hint semantics are CPU-dependent).
    __asm__ volatile( "prefetcht2 (%0)\n\t"
        : : "r" (p)
    );
}
#elif !ARCH_X86 && AV_GCC_VERSION_AT_LEAST(3,2)
/* Portable GCC builtin: (addr, rw=0 read, locality 0..3). */
static inline void RENAME(prefetchnta)(const void *p)
{
    __builtin_prefetch(p,0,0);
}
static inline void RENAME(prefetcht0)(const void *p)
{
    __builtin_prefetch(p,0,1);
}
static inline void RENAME(prefetcht1)(const void *p)
{
    __builtin_prefetch(p,0,2);
}
static inline void RENAME(prefetcht2)(const void *p)
{
    __builtin_prefetch(p,0,3);
}
#else
/* No prefetch support available: keep the callers compiling with no-ops. */
static inline void RENAME(prefetchnta)(const void *p)
{
    return;
}
static inline void RENAME(prefetcht0)(const void *p)
{
    return;
}
static inline void RENAME(prefetcht1)(const void *p)
{
    return;
}
static inline void RENAME(prefetcht2)(const void *p)
{
    return;
}
#endif
2864 /**
2865  * Filter array of bytes (Y or U or V values)
2866  */
2867 static void RENAME(postProcess)(const uint8_t src[], int srcStride, uint8_t dst[], int dstStride, int width, int height,
2868  const int8_t QPs[], int QPStride, int isColor, PPContext *c)
2869 {
2870  int x,y;
2871 #ifdef TEMPLATE_PP_TIME_MODE
2872  const int mode= TEMPLATE_PP_TIME_MODE;
2873 #else
2874  const int mode = isColor ? c->ppMode.chromMode : c->ppMode.lumMode;
2875 #endif
2876  int black=0, white=255; // blackest black and whitest white in the picture
2877  int QPCorrecture= 256*256;
2878 
2879  int copyAhead;
2880 #if TEMPLATE_PP_MMX
2881  int i;
2882 #endif
2883 
2884  const int qpHShift = isColor ? 4 - c->hChromaSubSample : 4;
2885  const int qpVShift = isColor ? 4 - c->vChromaSubSample : 4;
2886 
2887  //FIXME remove
2888  uint64_t * const yHistogram= c->yHistogram;
2889  uint8_t * const tempSrc = srcStride > 0 ? c->tempSrc : c->tempSrc - 23*srcStride;
2890  uint8_t * const tempDst = (dstStride > 0 ? c->tempDst : c->tempDst - 23*dstStride) + 32;
2891  //const int mbWidth= isColor ? (width+7)>>3 : (width+15)>>4;
2892 
2893  if (mode & VISUALIZE){
2894  if(!(mode & (V_A_DEBLOCK | H_A_DEBLOCK)) || TEMPLATE_PP_MMX) {
2895  av_log(c, AV_LOG_WARNING, "Visualization is currently only supported with the accurate deblock filter without SIMD\n");
2896  }
2897  }
2898 
2899 #if TEMPLATE_PP_MMX
2900  for(i=0; i<57; i++){
2901  int offset = ((i * c->ppMode.baseDcDiff) >> 8) + 1;
2902  int threshold= offset*2 + 1;
2903  c->mmxDcOffset[i] = 0x7F - offset;
2904  c->mmxDcThreshold[i] = 0x7F - threshold;
2905  c->mmxDcOffset[i] *= 0x0101010101010101LL;
2906  c->mmxDcThreshold[i] *= 0x0101010101010101LL;
2907  }
2908 #endif
2909 
2910  if(mode & CUBIC_IPOL_DEINT_FILTER) copyAhead=16;
2911  else if( (mode & LINEAR_BLEND_DEINT_FILTER)
2912  || (mode & FFMPEG_DEINT_FILTER)
2913  || (mode & LOWPASS5_DEINT_FILTER)) copyAhead=14;
2914  else if( (mode & V_DEBLOCK)
2916  || (mode & MEDIAN_DEINT_FILTER)
2917  || (mode & V_A_DEBLOCK)) copyAhead=13;
2918  else if(mode & V_X1_FILTER) copyAhead=11;
2919 // else if(mode & V_RK1_FILTER) copyAhead=10;
2920  else if(mode & DERING) copyAhead=9;
2921  else copyAhead=8;
2922 
2923  copyAhead-= 8;
2924 
2925  if(!isColor){
2926  uint64_t sum= 0;
2927  int i;
2928  uint64_t maxClipped;
2929  uint64_t clipped;
2930  AVRational scale;
2931 
2932  c->frameNum++;
2933  // first frame is fscked so we ignore it
2934  if (c->frameNum == 1)
2935  yHistogram[0] = width * (uint64_t)height/64*15/256;
2936 
2937  for(i=0; i<256; i++){
2938  sum+= yHistogram[i];
2939  }
2940 
2941  /* We always get a completely black picture first. */
2942  maxClipped = av_rescale(sum, c->ppMode.maxClippedThreshold.num,
2943  c->ppMode.maxClippedThreshold.den);
2944 
2945  clipped= sum;
2946  for(black=255; black>0; black--){
2947  if(clipped < maxClipped) break;
2948  clipped-= yHistogram[black];
2949  }
2950 
2951  clipped= sum;
2952  for(white=0; white<256; white++){
2953  if(clipped < maxClipped) break;
2954  clipped-= yHistogram[white];
2955  }
2956 
2957  scale = (AVRational){c->ppMode.maxAllowedY - c->ppMode.minAllowedY, white - black};
2958 
2959 #if TEMPLATE_PP_MMXEXT
2960  c->packedYScale = (uint16_t)av_rescale(scale.num, 256, scale.den);
2961  c->packedYOffset = (((black*c->packedYScale)>>8) - c->ppMode.minAllowedY) & 0xFFFF;
2962 #else
2963  c->packedYScale = (uint16_t)av_rescale(scale.num, 1024, scale.den);
2964  c->packedYOffset = (black - c->ppMode.minAllowedY) & 0xFFFF;
2965 #endif
2966 
2967  c->packedYOffset |= c->packedYOffset<<32;
2968  c->packedYOffset |= c->packedYOffset<<16;
2969 
2970  c->packedYScale |= c->packedYScale<<32;
2971  c->packedYScale |= c->packedYScale<<16;
2972 
2973  if(mode & LEVEL_FIX) QPCorrecture= (int)av_rescale(scale.num, 256*256, scale.den);
2974  else QPCorrecture= 256*256;
2975  }else{
2976  c->packedYScale = 0x0100010001000100LL;
2977  c->packedYOffset = 0;
2978  QPCorrecture= 256*256;
2979  }
2980 
2981  /* copy & deinterlace first row of blocks */
2982  y=-BLOCK_SIZE;
2983  {
2984  const uint8_t *srcBlock= &(src[y*srcStride]);
2985  uint8_t *dstBlock= tempDst + dstStride;
2986 
2987  // From this point on it is guaranteed that we can read and write 16 lines downward
2988  // finish 1 block before the next otherwise we might have a problem
2989  // with the L1 Cache of the P4 ... or only a few blocks at a time or something
2990  for(x=0; x<width; x+=BLOCK_SIZE){
2991  RENAME(prefetchnta)(srcBlock + (((x>>2)&6) + copyAhead)*srcStride + 32);
2992  RENAME(prefetchnta)(srcBlock + (((x>>2)&6) + copyAhead+1)*srcStride + 32);
2993  RENAME(prefetcht0)(dstBlock + (((x>>2)&6) + copyAhead)*dstStride + 32);
2994  RENAME(prefetcht0)(dstBlock + (((x>>2)&6) + copyAhead+1)*dstStride + 32);
2995 
2996  RENAME(blockCopy)(dstBlock + dstStride*8, dstStride,
2997  srcBlock + srcStride*8, srcStride, mode & LEVEL_FIX, &c->packedYOffset);
2998 
2999  RENAME(duplicate)(dstBlock + dstStride*8, dstStride);
3000 
3002  RENAME(deInterlaceInterpolateLinear)(dstBlock, dstStride);
3003  else if(mode & LINEAR_BLEND_DEINT_FILTER)
3004  RENAME(deInterlaceBlendLinear)(dstBlock, dstStride, c->deintTemp + x);
3005  else if(mode & MEDIAN_DEINT_FILTER)
3006  RENAME(deInterlaceMedian)(dstBlock, dstStride);
3007  else if(mode & CUBIC_IPOL_DEINT_FILTER)
3008  RENAME(deInterlaceInterpolateCubic)(dstBlock, dstStride);
3009  else if(mode & FFMPEG_DEINT_FILTER)
3010  RENAME(deInterlaceFF)(dstBlock, dstStride, c->deintTemp + x);
3011  else if(mode & LOWPASS5_DEINT_FILTER)
3012  RENAME(deInterlaceL5)(dstBlock, dstStride, c->deintTemp + x, c->deintTemp + width + x);
3013 /* else if(mode & CUBIC_BLEND_DEINT_FILTER)
3014  RENAME(deInterlaceBlendCubic)(dstBlock, dstStride);
3015 */
3016  dstBlock+=8;
3017  srcBlock+=8;
3018  }
3019  if(width==FFABS(dstStride))
3020  linecpy(dst, tempDst + 9*dstStride, copyAhead, dstStride);
3021  else{
3022  int i;
3023  for(i=0; i<copyAhead; i++){
3024  memcpy(dst + i*dstStride, tempDst + (9+i)*dstStride, width);
3025  }
3026  }
3027  }
3028 
3029  for(y=0; y<height; y+=BLOCK_SIZE){
3030  //1% speedup if these are here instead of the inner loop
3031  const uint8_t *srcBlock= &(src[y*srcStride]);
3032  uint8_t *dstBlock= &(dst[y*dstStride]);
3033 #if TEMPLATE_PP_MMX
3034  uint8_t *tempBlock1 = c->tempBlocks;
3035  uint8_t *tempBlock2 = c->tempBlocks + 8;
3036 #endif
3037  const int8_t *QPptr= &QPs[(y>>qpVShift)*QPStride];
3038  int8_t *nonBQPptr = &c->nonBQPTable[(y>>qpVShift)*FFABS(QPStride)];
3039  int QP=0, nonBQP=0;
3040  /* can we mess with a 8x16 block from srcBlock/dstBlock downwards and 1 line upwards
3041  if not than use a temporary buffer */
3042  if(y+15 >= height){
3043  int i;
3044  /* copy from line (copyAhead) to (copyAhead+7) of src, these will be copied with
3045  blockcopy to dst later */
3046  linecpy(tempSrc + srcStride*copyAhead, srcBlock + srcStride*copyAhead,
3047  FFMAX(height-y-copyAhead, 0), srcStride);
3048 
3049  /* duplicate last line of src to fill the void up to line (copyAhead+7) */
3050  for(i=FFMAX(height-y, 8); i<copyAhead+8; i++)
3051  memcpy(tempSrc + srcStride*i, src + srcStride*(height-1), FFABS(srcStride));
3052 
3053  /* copy up to (copyAhead+1) lines of dst (line -1 to (copyAhead-1))*/
3054  linecpy(tempDst, dstBlock - dstStride, FFMIN(height-y+1, copyAhead+1), dstStride);
3055 
3056  /* duplicate last line of dst to fill the void up to line (copyAhead) */
3057  for(i=height-y+1; i<=copyAhead; i++)
3058  memcpy(tempDst + dstStride*i, dst + dstStride*(height-1), FFABS(dstStride));
3059 
3060  dstBlock= tempDst + dstStride;
3061  srcBlock= tempSrc;
3062  }
3063 
3064  // From this point on it is guaranteed that we can read and write 16 lines downward
3065  // finish 1 block before the next otherwise we might have a problem
3066  // with the L1 Cache of the P4 ... or only a few blocks at a time or something
3067  for(x=0; x<width; ){
3068  int startx = x;
3069  int endx = FFMIN(width, x+32);
3070  uint8_t *dstBlockStart = dstBlock;
3071  const uint8_t *srcBlockStart = srcBlock;
3072  int qp_index = 0;
3073  for(qp_index=0; qp_index < (endx-startx)/BLOCK_SIZE; qp_index++){
3074  QP = QPptr[(x+qp_index*BLOCK_SIZE)>>qpHShift];
3075  nonBQP = nonBQPptr[(x+qp_index*BLOCK_SIZE)>>qpHShift];
3076  if(!isColor){
3077  QP= (QP* QPCorrecture + 256*128)>>16;
3078  nonBQP= (nonBQP* QPCorrecture + 256*128)>>16;
3079  yHistogram[(srcBlock+qp_index*8)[srcStride*12 + 4]]++;
3080  }
3081  c->QP_block[qp_index] = QP;
3082  c->nonBQP_block[qp_index] = nonBQP;
3083 #if TEMPLATE_PP_MMX
3084  __asm__ volatile(
3085  "movd %1, %%mm7 \n\t"
3086  "packuswb %%mm7, %%mm7 \n\t" // 0, 0, 0, QP, 0, 0, 0, QP
3087  "packuswb %%mm7, %%mm7 \n\t" // 0,QP, 0, QP, 0,QP, 0, QP
3088  "packuswb %%mm7, %%mm7 \n\t" // QP,..., QP
3089  "movq %%mm7, %0 \n\t"
3090  : "=m" (c->pQPb_block[qp_index])
3091  : "r" (QP)
3092  );
3093 #endif
3094  }
3095  for(; x < endx; x+=BLOCK_SIZE){
3096  RENAME(prefetchnta)(srcBlock + (((x>>2)&6) + copyAhead)*srcStride + 32);
3097  RENAME(prefetchnta)(srcBlock + (((x>>2)&6) + copyAhead+1)*srcStride + 32);
3098  RENAME(prefetcht0)(dstBlock + (((x>>2)&6) + copyAhead)*dstStride + 32);
3099  RENAME(prefetcht0)(dstBlock + (((x>>2)&6) + copyAhead+1)*dstStride + 32);
3100 
3101  RENAME(blockCopy)(dstBlock + dstStride*copyAhead, dstStride,
3102  srcBlock + srcStride*copyAhead, srcStride, mode & LEVEL_FIX, &c->packedYOffset);
3103 
3105  RENAME(deInterlaceInterpolateLinear)(dstBlock, dstStride);
3106  else if(mode & LINEAR_BLEND_DEINT_FILTER)
3107  RENAME(deInterlaceBlendLinear)(dstBlock, dstStride, c->deintTemp + x);
3108  else if(mode & MEDIAN_DEINT_FILTER)
3109  RENAME(deInterlaceMedian)(dstBlock, dstStride);
3110  else if(mode & CUBIC_IPOL_DEINT_FILTER)
3111  RENAME(deInterlaceInterpolateCubic)(dstBlock, dstStride);
3112  else if(mode & FFMPEG_DEINT_FILTER)
3113  RENAME(deInterlaceFF)(dstBlock, dstStride, c->deintTemp + x);
3114  else if(mode & LOWPASS5_DEINT_FILTER)
3115  RENAME(deInterlaceL5)(dstBlock, dstStride, c->deintTemp + x, c->deintTemp + width + x);
3116 /* else if(mode & CUBIC_BLEND_DEINT_FILTER)
3117  RENAME(deInterlaceBlendCubic)(dstBlock, dstStride);
3118 */
3119  dstBlock+=8;
3120  srcBlock+=8;
3121  }
3122 
3123  dstBlock = dstBlockStart;
3124  srcBlock = srcBlockStart;
3125 
3126  for(x = startx, qp_index = 0; x < endx; x+=BLOCK_SIZE, qp_index++){
3127  const int stride= dstStride;
3128  //temporary while changing QP stuff to make things continue to work
3129  //eventually QP,nonBQP,etc will be arrays and this will be unnecessary
3130  c->QP = c->QP_block[qp_index];
3131  c->nonBQP = c->nonBQP_block[qp_index];
3132  c->pQPb = c->pQPb_block[qp_index];
3133  c->pQPb2 = c->pQPb2_block[qp_index];
3134 
3135  /* only deblock if we have 2 blocks */
3136  if(y + 8 < height){
3137  if(mode & V_X1_FILTER)
3138  RENAME(vertX1Filter)(dstBlock, stride, c);
3139  else if(mode & V_DEBLOCK){
3140  const int t = RENAME(vertClassify)(dstBlock, stride, c);
3141 
3142  if(t==1)
3143  RENAME(doVertLowPass)(dstBlock, stride, c);
3144  else if(t==2)
3145  RENAME(doVertDefFilter)(dstBlock, stride, c);
3146  }else if(mode & V_A_DEBLOCK){
3147  RENAME(do_a_deblock)(dstBlock, stride, 1, c, mode);
3148  }
3149  }
3150 
3151  dstBlock+=8;
3152  srcBlock+=8;
3153  }
3154 
3155  dstBlock = dstBlockStart;
3156  srcBlock = srcBlockStart;
3157 
3158  for(x = startx, qp_index=0; x < endx; x+=BLOCK_SIZE, qp_index++){
3159  const int stride= dstStride;
3160  c->QP = c->QP_block[qp_index];
3161  c->nonBQP = c->nonBQP_block[qp_index];
3162  c->pQPb = c->pQPb_block[qp_index];
3163  c->pQPb2 = c->pQPb2_block[qp_index];
3164 #if TEMPLATE_PP_MMX
3165  RENAME(transpose1)(tempBlock1, tempBlock2, dstBlock, dstStride);
3166 #endif
3167  /* check if we have a previous block to deblock it with dstBlock */
3168  if(x - 8 >= 0){
3169 #if TEMPLATE_PP_MMX
3170  if(mode & H_X1_FILTER)
3171  RENAME(vertX1Filter)(tempBlock1, 16, c);
3172  else if(mode & H_DEBLOCK){
3173  const int t= RENAME(vertClassify)(tempBlock1, 16, c);
3174  if(t==1)
3175  RENAME(doVertLowPass)(tempBlock1, 16, c);
3176  else if(t==2)
3177  RENAME(doVertDefFilter)(tempBlock1, 16, c);
3178  }else if(mode & H_A_DEBLOCK){
3179  RENAME(do_a_deblock)(tempBlock1, 16, 1, c, mode);
3180  }
3181 
3182  RENAME(transpose2)(dstBlock-4, dstStride, tempBlock1 + 4*16);
3183 
3184 #else
3185  if(mode & H_X1_FILTER)
3186  horizX1Filter(dstBlock-4, stride, c->QP);
3187  else if(mode & H_DEBLOCK){
3188 #if TEMPLATE_PP_ALTIVEC
3189  DECLARE_ALIGNED(16, unsigned char, tempBlock)[272];
3190  int t;
3191  transpose_16x8_char_toPackedAlign_altivec(tempBlock, dstBlock - (4 + 1), stride);
3192 
3193  t = vertClassify_altivec(tempBlock-48, 16, c);
3194  if(t==1) {
3195  doVertLowPass_altivec(tempBlock-48, 16, c);
3196  transpose_8x16_char_fromPackedAlign_altivec(dstBlock - (4 + 1), tempBlock, stride);
3197  }
3198  else if(t==2) {
3199  doVertDefFilter_altivec(tempBlock-48, 16, c);
3200  transpose_8x16_char_fromPackedAlign_altivec(dstBlock - (4 + 1), tempBlock, stride);
3201  }
3202 #else
3203  const int t= RENAME(horizClassify)(dstBlock-4, stride, c);
3204 
3205  if(t==1)
3206  RENAME(doHorizLowPass)(dstBlock-4, stride, c);
3207  else if(t==2)
3208  RENAME(doHorizDefFilter)(dstBlock-4, stride, c);
3209 #endif
3210  }else if(mode & H_A_DEBLOCK){
3211  RENAME(do_a_deblock)(dstBlock-8, 1, stride, c, mode);
3212  }
3213 #endif //TEMPLATE_PP_MMX
3214  if(mode & DERING){
3215  RENAME(dering)(dstBlock - stride - 8, stride, c, x<=8, 0, y<=0);
3216  }
3217 
3218  if(mode & TEMP_NOISE_FILTER)
3219  {
3220  RENAME(tempNoiseReducer)(dstBlock-8, stride,
3221  c->tempBlurred[isColor] + y*dstStride + x,
3222  c->tempBlurredPast[isColor] + (y>>3)*256 + (x>>3) + 256,
3223  c->ppMode.maxTmpNoise);
3224  }
3225  }
3226 
3227  dstBlock+=8;
3228  srcBlock+=8;
3229 
3230 #if TEMPLATE_PP_MMX
3231  FFSWAP(uint8_t *, tempBlock1, tempBlock2);
3232 #endif
3233  }
3234  }
3235 
3236  if(mode & DERING){
3237  RENAME(dering)(dstBlock - dstStride - 8, dstStride, c, 0, 1, y<=0);
3238  }
3239 
3240  if((mode & TEMP_NOISE_FILTER)){
3241  RENAME(tempNoiseReducer)(dstBlock-8, dstStride,
3242  c->tempBlurred[isColor] + y*dstStride + x,
3243  c->tempBlurredPast[isColor] + (y>>3)*256 + (x>>3) + 256,
3244  c->ppMode.maxTmpNoise);
3245  }
3246 
3247  /* did we use a tmp buffer for the last lines*/
3248  if(y+15 >= height){
3249  uint8_t *dstBlock= &(dst[y*dstStride]);
3250  if(width==FFABS(dstStride))
3251  linecpy(dstBlock, tempDst + dstStride, height-y, dstStride);
3252  else{
3253  int i;
3254  for(i=0; i<height-y; i++){
3255  memcpy(dstBlock + i*dstStride, tempDst + (i+1)*dstStride, width);
3256  }
3257  }
3258  }
3259  }
3260 #if TEMPLATE_PP_MMX
3261  __asm__ volatile("emms");
3262 #endif
3263 
3264 #ifdef DEBUG_BRIGHTNESS
3265  if(!isColor){
3266  int max=1;
3267  int i;
3268  for(i=0; i<256; i++)
3269  if(yHistogram[i] > max) max=yHistogram[i];
3270 
3271  for(i=1; i<256; i++){
3272  int x;
3273  int start=yHistogram[i-1]/(max/256+1);
3274  int end=yHistogram[i]/(max/256+1);
3275  int inc= end > start ? 1 : -1;
3276  for(x=start; x!=end+inc; x+=inc)
3277  dst[ i*dstStride + x]+=128;
3278  }
3279 
3280  for(i=0; i<100; i+=2){
3281  dst[ (white)*dstStride + i]+=128;
3282  dst[ (black)*dstStride + i]+=128;
3283  }
3284  }
3285 #endif
3286 }
3287 
3288 #undef RENAME
3289 #undef TEMPLATE_PP_C
3290 #undef TEMPLATE_PP_ALTIVEC
3291 #undef TEMPLATE_PP_MMX
3292 #undef TEMPLATE_PP_MMXEXT
3293 #undef TEMPLATE_PP_SSE2
error
static void error(const char *err)
Definition: target_bsf_fuzzer.c:32
FFMPEG_DEINT_FILTER
#define FFMPEG_DEINT_FILTER
Definition: postprocess_internal.h:67
AV_LOG_WARNING
#define AV_LOG_WARNING
Something somehow does not look correct.
Definition: log.h:216
mem_internal.h
PPContext
postprocess context.
Definition: postprocess_internal.h:116
x86_reg
int x86_reg
Definition: asm.h:72
int64_t
long long int64_t
Definition: coverity.c:34
mode
Definition: swscale.c:56
step
trying all byte sequences megabyte in length and selecting the best looking sequence will yield cases to try But a word about which is also called distortion Distortion can be quantified by almost any quality measurement one chooses the sum of squared differences is used but more complex methods that consider psychovisual effects can be used as well It makes no difference in this discussion First step
Definition: rate_distortion.txt:58
LOWPASS5_DEINT_FILTER
#define LOWPASS5_DEINT_FILTER
Definition: postprocess_internal.h:68
b
#define b
Definition: input.c:42
horizX1Filter
static void horizX1Filter(uint8_t *src, int stride, int QP)
Experimental Filter 1 (Horizontal) will not damage linear gradients Flat blocks should look like they...
Definition: postprocess.c:322
doVertLowPass_altivec
static void doVertLowPass_altivec(uint8_t *src, int stride, PPContext *c)
Definition: postprocess_altivec_template.c:214
max
#define max(a, b)
Definition: cuda_runtime.h:33
FFMAX
#define FFMAX(a, b)
Definition: macros.h:47
H_A_DEBLOCK
#define H_A_DEBLOCK
Definition: postprocess_internal.h:56
FFSIGN
#define FFSIGN(a)
Definition: common.h:75
QP
#define QP(qP, depth)
Definition: h264data.c:190
MANGLE
#define MANGLE(a)
Definition: asm.h:127
first
trying all byte sequences megabyte in length and selecting the best looking sequence will yield cases to try But first
Definition: rate_distortion.txt:12
postProcess
static void postProcess(const uint8_t src[], int srcStride, uint8_t dst[], int dstStride, int width, int height, const int8_t QPs[], int QPStride, int isColor, pp_mode *vm, pp_context *vc)
Definition: postprocess.c:521
NAMED_CONSTRAINTS_ADD
#define NAMED_CONSTRAINTS_ADD(...)
Definition: asm.h:145
s
#define s(width, name)
Definition: cbs_vp9.c:198
V_A_DEBLOCK
#define V_A_DEBLOCK
Definition: postprocess_internal.h:52
DERING_THRESHOLD
#define DERING_THRESHOLD
Definition: postprocess.c:98
V_DEBLOCK
#define V_DEBLOCK
Definition: postprocess_internal.h:36
TEMP_NOISE_FILTER
#define TEMP_NOISE_FILTER
Definition: postprocess_internal.h:70
FFABS
#define FFABS(a)
Absolute value, Note, INT_MIN / INT64_MIN result in undefined behavior as they are not representable ...
Definition: common.h:74
asm.h
tmp
static uint8_t tmp[20]
Definition: aes_ctr.c:47
AVRational
Rational number (pair of numerator and denominator).
Definition: rational.h:58
MEDIAN_DEINT_FILTER
#define MEDIAN_DEINT_FILTER
Definition: postprocess_internal.h:66
linecpy
static void linecpy(void *dest, const void *src, int lines, int stride)
Definition: postprocess_internal.h:177
V_X1_FILTER
#define V_X1_FILTER
Definition: postprocess_internal.h:51
c
Undefined Behavior In the C some operations are like signed integer dereferencing freed accessing outside allocated Undefined Behavior must not occur in a C it is not safe even if the output of undefined operations is unused The unsafety may seem nit picking but Optimizing compilers have in fact optimized code on the assumption that no undefined Behavior occurs Optimizing code based on wrong assumptions can and has in some cases lead to effects beyond the output of computations The signed integer overflow problem in speed critical code Code which is highly optimized and works with signed integers sometimes has the problem that often the output of the computation does not c
Definition: undefined.txt:32
inc
static int inc(int num, int period)
Definition: perlin.c:34
transpose_16x8_char_toPackedAlign_altivec
static void transpose_16x8_char_toPackedAlign_altivec(unsigned char *dst, unsigned char *src, int stride)
Definition: postprocess_altivec_template.c:1019
PAVGB
#define PAVGB(a, b)
Definition: postprocess_template.c:81
f
f
Definition: af_crystalizer.c:122
height
#define height
Definition: dsp.h:89
DECLARE_ALIGNED
#define DECLARE_ALIGNED(n, t, v)
Definition: mem_internal.h:104
dst
uint8_t ptrdiff_t const uint8_t ptrdiff_t int intptr_t intptr_t int int16_t * dst
Definition: dsp.h:87
transpose_8x16_char_fromPackedAlign_altivec
static void transpose_8x16_char_fromPackedAlign_altivec(unsigned char *dst, unsigned char *src, int stride)
Definition: postprocess_altivec_template.c:1124
avg
#define avg(a, b, c, d)
Definition: colorspacedsp_template.c:28
PREV
@ PREV
Definition: vf_fftdnoiz.c:37
diff
static av_always_inline int diff(const struct color_info *a, const struct color_info *b, const int trans_thresh)
Definition: vf_paletteuse.c:166
a
The reader does not expect b to be semantically here and if the code is changed by maybe adding a a division or other the signedness will almost certainly be mistaken To avoid this confusion a new type was SUINT is the C unsigned type but it holds a signed int to use the same example SUINT a
Definition: undefined.txt:41
offset
it s the only field you need to keep assuming you have a context There is some magic you don t need to care about around this just let it vf offset
Definition: writing_filters.txt:86
H_DEBLOCK
#define H_DEBLOCK
Definition: postprocess_internal.h:37
AV_LOG_INFO
#define AV_LOG_INFO
Standard information.
Definition: log.h:221
DERING
#define DERING
Definition: postprocess_internal.h:38
i
#define i(width, name, range_min, range_max)
Definition: cbs_h2645.c:256
VISUALIZE
#define VISUALIZE
Definition: postprocess_internal.h:73
av_always_inline
#define av_always_inline
Definition: attributes.h:49
FFMIN
#define FFMIN(a, b)
Definition: macros.h:49
AV_STRINGIFY
#define AV_STRINGIFY(s)
Definition: macros.h:66
av_rescale
int64_t av_rescale(int64_t a, int64_t b, int64_t c)
Rescale a 64-bit integer with rounding to nearest.
Definition: mathematics.c:129
stride
#define stride
Definition: h264pred_template.c:536
NEXT
@ NEXT
Definition: vf_fftdnoiz.c:38
CUBIC_IPOL_DEINT_FILTER
#define CUBIC_IPOL_DEINT_FILTER
Definition: postprocess_internal.h:65
__asm__
__asm__(".macro parse_r var r\n\t" "\\var = -1\n\t" _IFC_REG(0) _IFC_REG(1) _IFC_REG(2) _IFC_REG(3) _IFC_REG(4) _IFC_REG(5) _IFC_REG(6) _IFC_REG(7) _IFC_REG(8) _IFC_REG(9) _IFC_REG(10) _IFC_REG(11) _IFC_REG(12) _IFC_REG(13) _IFC_REG(14) _IFC_REG(15) _IFC_REG(16) _IFC_REG(17) _IFC_REG(18) _IFC_REG(19) _IFC_REG(20) _IFC_REG(21) _IFC_REG(22) _IFC_REG(23) _IFC_REG(24) _IFC_REG(25) _IFC_REG(26) _IFC_REG(27) _IFC_REG(28) _IFC_REG(29) _IFC_REG(30) _IFC_REG(31) ".iflt \\var\n\t" ".error \"Unable to parse register name \\r\"\n\t" ".endif\n\t" ".endm")
FFSWAP
#define FFSWAP(type, a, b)
Definition: macros.h:52
vertClassify_altivec
static int vertClassify_altivec(uint8_t src[], int stride, PPContext *c)
Definition: postprocess_altivec_template.c:59
XMM_CLOBBERS
#define XMM_CLOBBERS(...)
Definition: asm.h:98
TEMPLATE_PP_MMX
#define TEMPLATE_PP_MMX
Definition: postprocess_template.c:52
RENAME
#define RENAME(element)
Definition: ac3enc_template.c:44
doVertDefFilter_altivec
static void doVertDefFilter_altivec(uint8_t src[], int stride, PPContext *c)
Definition: postprocess_altivec_template.c:412
mode
mode
Definition: ebur128.h:83
ref
static int ref[MAX_W *MAX_W]
Definition: jpeg2000dwt.c:117
LINEAR_BLEND_DEINT_FILTER
#define LINEAR_BLEND_DEINT_FILTER
Definition: postprocess_internal.h:63
av_clip_uint8
#define av_clip_uint8
Definition: common.h:106
LINEAR_IPOL_DEINT_FILTER
#define LINEAR_IPOL_DEINT_FILTER
Definition: postprocess_internal.h:62
scale
static void scale(int *out, const int *in, const int w, const int h, const int shift)
Definition: intra.c:291
H_X1_FILTER
#define H_X1_FILTER
Definition: postprocess_internal.h:55
av_log
#define av_log(a,...)
Definition: tableprint_vlc.h:27
BLOCK_SIZE
#define BLOCK_SIZE
Definition: adx.h:51
width
#define width
Definition: dsp.h:89
LEVEL_FIX
#define LEVEL_FIX
Brightness & Contrast.
Definition: postprocess_internal.h:39
src
#define src
Definition: vp8dsp.c:248
min
float min
Definition: vorbis_enc_data.h:429