FFmpeg
vp3dsp_altivec.c
Go to the documentation of this file.
1 /*
2  * Copyright (C) 2009 David Conrad
3  *
4  * This file is part of FFmpeg.
5  *
6  * FFmpeg is free software; you can redistribute it and/or
7  * modify it under the terms of the GNU Lesser General Public
8  * License as published by the Free Software Foundation; either
9  * version 2.1 of the License, or (at your option) any later version.
10  *
11  * FFmpeg is distributed in the hope that it will be useful,
12  * but WITHOUT ANY WARRANTY; without even the implied warranty of
13  * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
14  * Lesser General Public License for more details.
15  *
16  * You should have received a copy of the GNU Lesser General Public
17  * License along with FFmpeg; if not, write to the Free Software
18  * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
19  */
20 
21 #include <string.h>
22 
23 #include "config.h"
24 
25 #include "libavutil/attributes.h"
26 #include "libavutil/cpu.h"
27 #include "libavutil/ppc/cpu.h"
28 #include "libavutil/ppc/util_altivec.h"
29 
30 #include "libavcodec/vp3dsp.h"
31 
32 #if HAVE_ALTIVEC
33 
/*
 * Fixed-point IDCT multipliers, one per 16-bit lane. Lane k (k = 1..7)
 * numerically matches round(65536 * cos(k * pi / 16)). IDCT_START splats
 * lanes 1..7 into C1..C7; lane 0 is never read in this file.
 * Lanes 1..5 are larger than 32767, so (assuming vec_s16 is a signed 16-bit
 * vector type, as its name suggests) they do not fit a lane as written;
 * M16() is the multiply helper intended for those lanes.
 */
34 static const vec_s16 constants =
35  {0, 64277, 60547, 54491, 46341, 36410, 25080, 12785};
/*
 * Byte-permute control passed to vec_perm() by M15(). Indices 0..15 select
 * from the first vec_perm operand and 16..31 from the second, so each table
 * alternates one 2-byte pair from each operand.
 * Big-endian picks bytes 0-1 of every 4-byte group, little-endian picks
 * bytes 2-3; in both layouts that is the most significant half of a 32-bit
 * element, which is how M15() obtains (a*C)>>16.
 */
36 #if HAVE_BIGENDIAN
37 static const vec_u8 interleave_high =
38  {0, 1, 16, 17, 4, 5, 20, 21, 8, 9, 24, 25, 12, 13, 28, 29};
39 #else
40 static const vec_u8 interleave_high =
41  {2, 3, 18, 19, 6, 7, 22, 23, 10, 11, 26, 27, 14, 15, 30, 31};
42 #endif
43 
/*
 * IDCT_START: declaration prologue shared by both IDCT entry points.
 * Must be expanded inside a function that has an `int16_t *block` in scope.
 * It declares:
 *   - the butterfly temporaries used by IDCT_1D (A..H, Ad..Dd, Ed, Gd, ...);
 *   - `eight` (every lane = 8) and `four` (every lane = 4), consumed by the
 *     ADD8 and SHIFT4 macros;
 *   - C1..C7, each a splat of the matching lane of `constants`;
 *   - b0..b7, eight vectors loaded from `block` at byte offsets 0x00..0x70
 *     (16 bytes apart, i.e. 8 int16_t coefficients per vector).
 * NOTE(review): vec_ld is the aligned AltiVec load, so `block` is presumably
 * 16-byte aligned -- confirm against callers.
 */
44 #define IDCT_START \
45  vec_s16 A, B, C, D, Ad, Bd, Cd, Dd, E, F, G, H;\
46  vec_s16 Ed, Gd, Add, Bdd, Fd, Hd;\
47  vec_s16 eight = vec_splat_s16(8);\
48  vec_u16 four = vec_splat_u16(4);\
49 \
50  vec_s16 C1 = vec_splat(constants, 1);\
51  vec_s16 C2 = vec_splat(constants, 2);\
52  vec_s16 C3 = vec_splat(constants, 3);\
53  vec_s16 C4 = vec_splat(constants, 4);\
54  vec_s16 C5 = vec_splat(constants, 5);\
55  vec_s16 C6 = vec_splat(constants, 6);\
56  vec_s16 C7 = vec_splat(constants, 7);\
57 \
58  vec_s16 b0 = vec_ld(0x00, block);\
59  vec_s16 b1 = vec_ld(0x10, block);\
60  vec_s16 b2 = vec_ld(0x20, block);\
61  vec_s16 b3 = vec_ld(0x30, block);\
62  vec_s16 b4 = vec_ld(0x40, block);\
63  vec_s16 b5 = vec_ld(0x50, block);\
64  vec_s16 b6 = vec_ld(0x60, block);\
65  vec_s16 b7 = vec_ld(0x70, block);
66 
67 // these functions do (a*C)>>16
68 // things are tricky because a is signed, but C unsigned.
69 // M15 is used if C fits in 15 bit unsigned (C6,C7)
70 // M16 is used if C requires 16 bits unsigned
/*
 * M15: per-lane (a * C) >> 16 for multipliers that fit in 15 bits.
 * vec_mule() and vec_mulo() produce the products of the even and odd lanes
 * respectively; vec_perm() with interleave_high then gathers two bytes from
 * each product, alternating between the two product vectors, and the result
 * is reinterpreted as eight 16-bit lanes.
 */
71 static inline vec_s16 M15(vec_s16 a, vec_s16 C)
72 {
73  return (vec_s16)vec_perm(vec_mule(a,C), vec_mulo(a,C), interleave_high);
74 }
/*
 * M16: per-lane (a * C) >> 16 for multipliers that need all 16 bits.
 * Returns a + M15(a, C). Rationale (assuming the lanes of C are signed
 * 16-bit): a multiplier >= 32768 is stored as C - 65536, so M15() yields
 * ((a * C) >> 16) - a, and adding `a` back gives the intended product.
 */
75 static inline vec_s16 M16(vec_s16 a, vec_s16 C)
76 {
77  return vec_add(a, M15(a, C));
78 }
79 
/*
 * IDCT_1D(ADD, SHIFT): one 1-D IDCT pass over b0..b7, updated in place.
 * Requires the variables declared by IDCT_START to be in scope.
 *   ADD   - macro applied to the two terms built from b0 +/- b4 (E and F);
 *           this file passes NOP or ADD8.
 *   SHIFT - macro applied to each of the eight outputs before it is stored
 *           back into b0..b7; this file passes NOP or SHIFT4.
 * Odd inputs (b1, b3, b5, b7) feed A..D, even inputs (b0, b2, b4, b6) feed
 * E..H; the final sums/differences are written as the pairs (b0, b7),
 * (b1, b2), (b3, b4) and (b5, b6).
 * M16 is used with C1..C5 and M15 with C6/C7, matching the multiplier sizes.
 */
80 #define IDCT_1D(ADD, SHIFT)\
81  A = vec_add(M16(b1, C1), M15(b7, C7));\
82  B = vec_sub(M15(b1, C7), M16(b7, C1));\
83  C = vec_add(M16(b3, C3), M16(b5, C5));\
84  D = vec_sub(M16(b5, C3), M16(b3, C5));\
85 \
86  Ad = M16(vec_sub(A, C), C4);\
87  Bd = M16(vec_sub(B, D), C4);\
88 \
89  Cd = vec_add(A, C);\
90  Dd = vec_add(B, D);\
91 \
92  E = ADD(M16(vec_add(b0, b4), C4));\
93  F = ADD(M16(vec_sub(b0, b4), C4));\
94 \
95  G = vec_add(M16(b2, C2), M15(b6, C6));\
96  H = vec_sub(M15(b2, C6), M16(b6, C2));\
97 \
98  Ed = vec_sub(E, G);\
99  Gd = vec_add(E, G);\
100 \
101  Add = vec_add(F, Ad);\
102  Bdd = vec_sub(Bd, H);\
103 \
104  Fd = vec_sub(F, Ad);\
105  Hd = vec_add(Bd, H);\
106 \
107  b0 = SHIFT(vec_add(Gd, Cd));\
108  b7 = SHIFT(vec_sub(Gd, Cd));\
109 \
110  b1 = SHIFT(vec_add(Add, Hd));\
111  b2 = SHIFT(vec_sub(Add, Hd));\
112 \
113  b3 = SHIFT(vec_add(Ed, Dd));\
114  b4 = SHIFT(vec_sub(Ed, Dd));\
115 \
116  b5 = SHIFT(vec_add(Fd, Bdd));\
117  b6 = SHIFT(vec_sub(Fd, Bdd));
118 
/*
 * Operand macros handed to IDCT_1D as its ADD / SHIFT parameters.
 *   NOP    - passes the value through unchanged (used for the first pass).
 *   ADD8   - adds the local vector `eight` declared by IDCT_START; callers
 *            may fold an extra bias into `eight` before the second pass.
 *   SHIFT4 - arithmetic right shift (vec_sra) by the local vector `four`.
 * ADD8 and SHIFT4 only work where IDCT_START has been expanded.
 */
119 #define NOP(a) a
120 #define ADD8(a) vec_add(a, eight)
121 #define SHIFT4(a) vec_sra(a, four)
122 
/**
 * Inverse-transform an 8x8 coefficient block and store the result as pixels.
 *
 * Runs IDCT_1D twice with a TRANSPOSE8 in between, writes eight rows of
 * eight bytes starting at dst (advancing by stride after each row), then
 * zeroes all 64 coefficients in block.
 *
 * @param dst    destination of the first output row
 * @param stride byte distance between consecutive output rows
 * @param block  64 input coefficients; cleared on return
 */
123 static void vp3_idct_put_altivec(uint8_t *dst, ptrdiff_t stride, int16_t block[64])
124 {
125  vec_u8 t;
126  IDCT_START
127 
128  // pixels are signed; so add 128*16 in addition to the normal 8
 // v2048 = 1 << 11 in every lane; folding it into `eight` makes ADD8 add
 // 8 + 2048, which becomes a +128 offset after the final SHIFT4 (>> 4).
129  vec_s16 v2048 = vec_sl(vec_splat_s16(1), vec_splat_u16(11));
130  eight = vec_add(eight, v2048);
131 
 // First pass: no rounding and no shift. Second pass (after the
 // transpose): rounding/bias via ADD8, then scale down via SHIFT4.
132  IDCT_1D(NOP, NOP)
133  TRANSPOSE8(b0, b1, b2, b3, b4, b5, b6, b7);
134  IDCT_1D(ADD8, SHIFT4)
135 
 // PUT(a): saturate the eight 16-bit lanes of `a` to unsigned bytes
 // (vec_packsu) and emit them with two 32-bit element stores at byte
 // offsets 0 and 4 from dst, i.e. 8 bytes per row.
 // The macro is defined inside the function and is not #undef'd.
136 #define PUT(a)\
137  t = vec_packsu(a, a);\
138  vec_ste((vec_u32)t, 0, (unsigned int *)dst);\
139  vec_ste((vec_u32)t, 4, (unsigned int *)dst);
140 
141  PUT(b0) dst += stride;
142  PUT(b1) dst += stride;
143  PUT(b2) dst += stride;
144  PUT(b3) dst += stride;
145  PUT(b4) dst += stride;
146  PUT(b5) dst += stride;
147  PUT(b6) dst += stride;
148  PUT(b7)
 // Leave the coefficient block zeroed for the caller.
149  memset(block, 0, sizeof(*block) * 64);
150 }
151 
/**
 * Inverse-transform an 8x8 coefficient block and add it to existing pixels.
 *
 * Runs IDCT_1D twice with a TRANSPOSE8 in between. For each of the eight
 * rows it loads the current bytes at dst, widens them to 16 bit, adds the
 * IDCT row with saturation, packs back to unsigned bytes and stores 8 bytes.
 * dst advances by stride after each row. Finally all 64 coefficients in
 * block are zeroed.
 *
 * @param dst    first row of the pixels to be updated in place
 * @param stride byte distance between consecutive rows
 * @param block  64 input coefficients; cleared on return
 */
152 static void vp3_idct_add_altivec(uint8_t *dst, ptrdiff_t stride, int16_t block[64])
153 {
 // LOAD_ZERO comes from util_altivec.h; it presumably provides the
 // zero_u8v vector referenced by GET_VDST16 below -- confirm in that header.
154  LOAD_ZERO;
155  vec_u8 t, vdst;
156  vec_s16 vdst_16;
 // Permute control for the big-endian GET_VDST16: interleaves 0xFF bytes
 // with the vec_lvsl() alignment indices for dst. It is computed once from
 // the initial dst and is not referenced by the little-endian GET_VDST16.
157  vec_u8 vdst_mask = vec_mergeh(vec_splat_u8(-1), vec_lvsl(0, dst));
158 
159  IDCT_START
160 
 // First pass unscaled; second pass adds `eight` and shifts right by 4.
161  IDCT_1D(NOP, NOP)
162  TRANSPOSE8(b0, b1, b2, b3, b4, b5, b6, b7);
163  IDCT_1D(ADD8, SHIFT4)
164 
 // GET_VDST16: load the destination row into vdst and widen it into the
 // 16-bit lanes of vdst_16. Big-endian uses an aligned vec_ld plus a
 // vec_perm against zero_u8v with vdst_mask; little-endian uses vec_vsx_ld
 // and merges the high half of vdst with zero_u8v.
165 #if HAVE_BIGENDIAN
166 #define GET_VDST16\
167  vdst = vec_ld(0, dst);\
168  vdst_16 = (vec_s16)vec_perm(vdst, zero_u8v, vdst_mask);
169 #else
170 #define GET_VDST16\
171  vdst = vec_vsx_ld(0,dst);\
172  vdst_16 = (vec_s16)vec_mergeh(vdst, zero_u8v);
173 #endif
174 
 // ADD(a): vdst_16 = saturating a + widened destination row (vec_adds),
 // packed to unsigned bytes (vec_packsu) and written back with two 32-bit
 // element stores at byte offsets 0 and 4 from dst.
 // Defined inside the function and not #undef'd afterwards.
175 #define ADD(a)\
176  GET_VDST16;\
177  vdst_16 = vec_adds(a, vdst_16);\
178  t = vec_packsu(vdst_16, vdst_16);\
179  vec_ste((vec_u32)t, 0, (unsigned int *)dst);\
180  vec_ste((vec_u32)t, 4, (unsigned int *)dst);
181 
182  ADD(b0) dst += stride;
183  ADD(b1) dst += stride;
184  ADD(b2) dst += stride;
185  ADD(b3) dst += stride;
186  ADD(b4) dst += stride;
187  ADD(b5) dst += stride;
188  ADD(b6) dst += stride;
189  ADD(b7)
 // Leave the coefficient block zeroed for the caller.
190  memset(block, 0, sizeof(*block) * 64);
191 }
192 
193 #endif /* HAVE_ALTIVEC */
194 
/**
 * Install the AltiVec IDCT routines into a VP3 DSP context.
 *
 * When built with HAVE_ALTIVEC and the runtime CPU flags report AltiVec
 * support, c->idct_put and c->idct_add are pointed at the AltiVec
 * implementations in this file. Otherwise c is left untouched.
 *
 * @param c     DSP context whose idct_put / idct_add pointers may be replaced
 * @param flags not used by this function
 */
195 av_cold void ff_vp3dsp_init_ppc(VP3DSPContext *c, int flags)
196 {
197 #if HAVE_ALTIVEC
 // Runtime check: without it the early return is unconditional and the
 // two assignments below can never execute.
198  if (!PPC_ALTIVEC(av_get_cpu_flags()))
199  return;
200 
201  c->idct_put = vp3_idct_put_altivec;
202  c->idct_add = vp3_idct_add_altivec;
203 #endif
204 }
VP3DSPContext
Definition: vp3dsp.h:25
LOAD_ZERO
#define LOAD_ZERO
Definition: util_altivec.h:45
vp3dsp.h
av_get_cpu_flags
int av_get_cpu_flags(void)
Return the flags which specify extensions supported by the CPU.
Definition: cpu.c:109
PUT
#define PUT(v)
b1
static double b1(void *priv, double x, double y)
Definition: vf_xfade.c:2034
C
s EdgeDetect Foobar g libavfilter vf_edgedetect c libavfilter vf_foobar c edit libavfilter and add an entry for foobar following the pattern of the other filters edit libavfilter allfilters and add an entry for foobar following the pattern of the other filters configure make j< whatever > ffmpeg ffmpeg i you should get a foobar png with Lena edge detected That s your new playground is ready Some little details about what s going which in turn will define variables for the build system and the C
Definition: writing_filters.txt:58
vec_s16
#define vec_s16
Definition: util_altivec.h:37
av_cold
#define av_cold
Definition: attributes.h:90
b3
static double b3(void *priv, double x, double y)
Definition: vf_xfade.c:2036
ADD
#define ADD(a, b)
Definition: dct32_template.c:123
c
Undefined Behavior In the C some operations are like signed integer dereferencing freed accessing outside allocated Undefined Behavior must not occur in a C it is not safe even if the output of undefined operations is unused The unsafety may seem nit picking but Optimizing compilers have in fact optimized code on the assumption that no undefined Behavior occurs Optimizing code based on wrong assumptions can and has in some cases lead to effects beyond the output of computations The signed integer overflow problem in speed critical code Code which is highly optimized and works with signed integers sometimes has the problem that often the output of the computation does not c
Definition: undefined.txt:32
PPC_ALTIVEC
#define PPC_ALTIVEC(flags)
Definition: cpu.h:25
dst
uint8_t ptrdiff_t const uint8_t ptrdiff_t int intptr_t intptr_t int int16_t * dst
Definition: dsp.h:83
cpu.h
constants
static const struct @436 constants[]
ff_vp3dsp_init_ppc
av_cold void ff_vp3dsp_init_ppc(VP3DSPContext *c, int flags)
Definition: vp3dsp_altivec.c:195
b2
static double b2(void *priv, double x, double y)
Definition: vf_xfade.c:2035
a
The reader does not expect b to be semantically here and if the code is changed by maybe adding a a division or other the signedness will almost certainly be mistaken To avoid this confusion a new type was SUINT is the C unsigned type but it holds a signed int to use the same example SUINT a
Definition: undefined.txt:41
vec_u8
#define vec_u8
Definition: util_altivec.h:34
attributes.h
stride
#define stride
Definition: h264pred_template.c:536
util_altivec.h
NOP
#define NOP(x)
Definition: colorchannelmixer_template.c:35
cpu.h
flags
#define flags(name, subs,...)
Definition: cbs_av1.c:482
block
The exact code depends on how similar the blocks are and how related they are to the block
Definition: filter_design.txt:207
b0
static double b0(void *priv, double x, double y)
Definition: vf_xfade.c:2033