ReactOS 0.4.15-dev-7942-gd23573b
dct64_altivec.c File Reference
#include "mpg123lib_intern.h"
#include <altivec.h>
Include dependency graph for dct64_altivec.c:

Go to the source code of this file.

Functions

void dct64_altivec (real *out0, real *out1, real *samples)
 

Function Documentation

◆ dct64_altivec()

void dct64_altivec ( real * out0,
real * out1,
real * samples 
)

Definition at line 27 of file dct64_altivec.c.

28{ /* Body of dct64_altivec: loads 32 input values from samples, runs five vector stages driven by pnts[0]..pnts[4], stores the 32 intermediate results in bufs, finishes with scalar additions, and writes out0/out1 at a stride of 0x10 elements. */
29 ALIGNED(16) real bufs[32]; /* intermediate results; filled by the vec_st calls that end the vector block */
30
31 {
32 register real *b1,*costab; /* b1: input pointer; costab: coefficient table of the current stage */
33
34 vector unsigned char vinvert,vperm1,vperm2,vperm3,vperm4;
35 vector float v1,v2,v3,v4,v5,v6,v7,v8;
36 vector float vbs1,vbs2,vbs3,vbs4,vbs5,vbs6,vbs7,vbs8;
37 vector float vbs9,vbs10,vbs11,vbs12,vbs13,vbs14,vbs15,vbs16;
38 vector float vzero;
39 b1 = samples;
40 costab = pnts[0]; /* first stage */
41
42 vzero = vec_xor(vzero,vzero); /* x^x clears every bit; NOTE(review): vzero is read here before it has been assigned */
43#ifdef __APPLE__
44 vinvert = (vector unsigned char)(12,13,14,15,8,9,10,11,4,5,6,7,0,1,2,3);
45#else
46 vinvert = (vector unsigned char){12,13,14,15,8,9,10,11,4,5,6,7,0,1,2,3}; /* byte indices that list the four 4-byte elements in reverse order */
47#endif
48 vperm1 = vec_lvsl(0,b1); /* permute vector for assembling possibly unaligned data at b1 */
49 vperm2 = vec_perm(vperm1,vperm1,vinvert); /* vperm1 rearranged through vinvert; used for the reversed loads below */
50
51 v1 = vec_ld(0,b1);
52 v2 = vec_ld(16,b1);
53 v3 = vec_ld(112,b1);
54 v4 = vec_ld(127,b1); /* NOTE(review): offset 127 rather than 128; vec_ld drops the low four address bits, so presumably this keeps an aligned b1 from loading beyond b1[31] - confirm */
55 v5 = vec_perm(v1,v2,vperm1); /* b1[0,1,2,3] */
56 v6 = vec_perm(v3,v4,vperm2); /* b1[31,30,29,28] */
57
58 vbs1 = vec_add(v5,v6); /* sums of the mirrored input pairs */
59 vbs8 = vec_sub(v5,v6); /* differences of the mirrored input pairs */
60
61 v1 = vec_ld(32,b1);
62 v4 = vec_ld(96,b1);
63 v5 = vec_perm(v2,v1,vperm1); /* b1[4,5,6,7] */
64 v6 = vec_perm(v4,v3,vperm2); /* b1[27,26,25,24] */
65
66 vbs2 = vec_add(v5,v6);
67 vbs7 = vec_sub(v5,v6);
68
69 v2 = vec_ld(48,b1);
70 v3 = vec_ld(80,b1);
71 v5 = vec_perm(v1,v2,vperm1); /* b1[8,9,10,11] */
72 v6 = vec_perm(v3,v4,vperm2); /* b1[23,22,21,20] */
73
74 vbs3 = vec_add(v5,v6);
75 vbs6 = vec_sub(v5,v6);
76
77 v1 = vec_ld(64,b1);
78 v5 = vec_perm(v2,v1,vperm1); /* b1[12,13,14,15] */
79 v6 = vec_perm(v1,v3,vperm2); /* b1[19,18,17,16] */
80
81 vbs4 = vec_add(v5,v6);
82 vbs5 = vec_sub(v5,v6);
83
84 v1 = vec_ld(0,costab); /* the four difference vectors are scaled by the 16 values loaded from pnts[0] */
85 vbs8 = vec_madd(vbs8,v1,vzero); /* plain multiply: the addend is vzero */
86 v2 = vec_ld(16,costab);
87 vbs7 = vec_madd(vbs7,v2,vzero);
88 v3 = vec_ld(32,costab);
89 vbs6 = vec_madd(vbs6,v3,vzero);
90 v4 = vec_ld(48,costab);
91 vbs5 = vec_madd(vbs5,v4,vzero);
92 vbs6 = vec_perm(vbs6,vbs6,vinvert); /* reverse element order before the next stage */
93 vbs5 = vec_perm(vbs5,vbs5,vinvert);
94
95
96 costab = pnts[1]; /* second stage */
97
98 v1 = vec_perm(vbs4,vbs4,vinvert);
99 vbs9 = vec_add(vbs1,v1);
100 v3 = vec_sub(vbs1,v1);
101 v5 = vec_ld(0,costab);
102 v2 = vec_perm(vbs3,vbs3,vinvert);
103 vbs10 = vec_add(vbs2,v2);
104 v4 = vec_sub(vbs2,v2);
105 v6 = vec_ld(16,costab);
106 vbs12 = vec_madd(v3,v5,vzero);
107 vbs11 = vec_madd(v4,v6,vzero);
108
109 v7 = vec_sub(vbs7,vbs6);
110 v8 = vec_sub(vbs8,vbs5);
111 vbs13 = vec_add(vbs5,vbs8);
112 vbs14 = vec_add(vbs6,vbs7);
113 vbs15 = vec_madd(v7,v6,vzero);
114 vbs16 = vec_madd(v8,v5,vzero);
115
116
117 costab = pnts[2]; /* third stage */
118
119 v1 = vec_perm(vbs10,vbs10,vinvert);
120 v5 = vec_perm(vbs14,vbs14,vinvert);
121 vbs1 = vec_add(v1,vbs9);
122 vbs5 = vec_add(v5,vbs13);
123 v2 = vec_sub(vbs9,v1);
124 v6 = vec_sub(vbs13,v5);
125 v3 = vec_ld(0,costab); /* one vector of coefficients is shared by all four multiplies of this stage */
126 vbs11 = vec_perm(vbs11,vbs11,vinvert);
127 vbs15 = vec_perm(vbs15,vbs15,vinvert);
128 vbs3 = vec_add(vbs11,vbs12);
129 vbs7 = vec_add(vbs15,vbs16);
130 v4 = vec_sub(vbs12,vbs11);
131 v7 = vec_sub(vbs16,vbs15);
132 vbs2 = vec_madd(v2,v3,vzero);
133 vbs4 = vec_madd(v4,v3,vzero);
134 vbs6 = vec_madd(v6,v3,vzero);
135 vbs8 = vec_madd(v7,v3,vzero);
136
137 vbs2 = vec_perm(vbs2,vbs2,vinvert);
138 vbs4 = vec_perm(vbs4,vbs4,vinvert);
139 vbs6 = vec_perm(vbs6,vbs6,vinvert);
140 vbs8 = vec_perm(vbs8,vbs8,vinvert);
141
142
143 costab = pnts[3]; /* fourth stage */
144
145#ifdef __APPLE__
146 vperm1 = (vector unsigned char)(0,1,2,3,4,5,6,7,16,17,18,19,20,21,22,23);
147 vperm2 = (vector unsigned char)(12,13,14,15,8,9,10,11,28,29,30,31,24,25,26,27);
148 vperm3 = (vector unsigned char)(0,1,2,3,4,5,6,7,20,21,22,23,16,17,18,19);
149#else
150 vperm1 = (vector unsigned char){0,1,2,3,4,5,6,7,16,17,18,19,20,21,22,23};
151 vperm2 = (vector unsigned char){12,13,14,15,8,9,10,11,28,29,30,31,24,25,26,27};
152 vperm3 = (vector unsigned char){0,1,2,3,4,5,6,7,20,21,22,23,16,17,18,19};
153#endif
154 vperm4 = vec_add(vperm3,vec_splat_u8(8)); /* vperm3 with every byte index raised by 8 */
155
156 v1 = vec_ld(0,costab);
157 v2 = vec_splat(v1,0);
158 v3 = vec_splat(v1,1);
159 v1 = vec_mergeh(v2,v3); /* v1 = {c0,c1,c0,c1}, built from the first two loaded table entries */
160
161 v2 = vec_perm(vbs1,vbs3,vperm1);
162 v3 = vec_perm(vbs2,vbs4,vperm1);
163 v4 = vec_perm(vbs1,vbs3,vperm2);
164 v5 = vec_perm(vbs2,vbs4,vperm2);
165 v6 = vec_sub(v2,v4);
166 v7 = vec_sub(v3,v5);
167 v2 = vec_add(v2,v4);
168 v3 = vec_add(v3,v5);
169 v4 = vec_madd(v6,v1,vzero);
170 v5 = vec_nmsub(v7,v1,vzero); /* negated product: -(v7*v1 - vzero) */
171 vbs9 = vec_perm(v2,v4,vperm3);
172 vbs11 = vec_perm(v2,v4,vperm4);
173 vbs10 = vec_perm(v3,v5,vperm3);
174 vbs12 = vec_perm(v3,v5,vperm4);
175
176 v2 = vec_perm(vbs5,vbs7,vperm1); /* same sequence as above, applied to vbs5..vbs8 */
177 v3 = vec_perm(vbs6,vbs8,vperm1);
178 v4 = vec_perm(vbs5,vbs7,vperm2);
179 v5 = vec_perm(vbs6,vbs8,vperm2);
180 v6 = vec_sub(v2,v4);
181 v7 = vec_sub(v3,v5);
182 v2 = vec_add(v2,v4);
183 v3 = vec_add(v3,v5);
184 v4 = vec_madd(v6,v1,vzero);
185 v5 = vec_nmsub(v7,v1,vzero);
186 vbs13 = vec_perm(v2,v4,vperm3);
187 vbs15 = vec_perm(v2,v4,vperm4);
188 vbs14 = vec_perm(v3,v5,vperm3);
189 vbs16 = vec_perm(v3,v5,vperm4);
190
191
192 costab = pnts[4]; /* fifth stage */
193
194 v1 = vec_lde(0,costab);
195#ifdef __APPLE__
196 v2 = (vector float)(1.0f,-1.0f,1.0f,-1.0f);
197#else
198 v2 = (vector float){1.0f,-1.0f,1.0f,-1.0f};
199#endif
200 v3 = vec_splat(v1,0);
201 v1 = vec_madd(v2,v3,vzero); /* v1 = {c,-c,c,-c}, with c the splatted table entry */
202
203 v2 = vec_mergeh(vbs9,vbs10);
204 v3 = vec_mergel(vbs9,vbs10);
205 v4 = vec_mergeh(vbs11,vbs12);
206 v5 = vec_mergel(vbs11,vbs12);
207 v6 = vec_mergeh(v2,v3);
208 v7 = vec_mergel(v2,v3);
209 v2 = vec_mergeh(v4,v5);
210 v3 = vec_mergel(v4,v5);
211 v4 = vec_sub(v6,v7);
212 v5 = vec_sub(v2,v3);
213 v6 = vec_add(v6,v7);
214 v7 = vec_add(v2,v3);
215 v2 = vec_madd(v4,v1,vzero);
216 v3 = vec_madd(v5,v1,vzero);
217 vbs1 = vec_mergeh(v6,v2);
218 vbs2 = vec_mergel(v6,v2);
219 vbs3 = vec_mergeh(v7,v3);
220 vbs4 = vec_mergel(v7,v3);
221
222 v2 = vec_mergeh(vbs13,vbs14); /* same sequence as above, applied to vbs13..vbs16 */
223 v3 = vec_mergel(vbs13,vbs14);
224 v4 = vec_mergeh(vbs15,vbs16);
225 v5 = vec_mergel(vbs15,vbs16);
226 v6 = vec_mergeh(v2,v3);
227 v7 = vec_mergel(v2,v3);
228 v2 = vec_mergeh(v4,v5);
229 v3 = vec_mergel(v4,v5);
230 v4 = vec_sub(v6,v7);
231 v5 = vec_sub(v2,v3);
232 v6 = vec_add(v6,v7);
233 v7 = vec_add(v2,v3);
234 v2 = vec_madd(v4,v1,vzero);
235 v3 = vec_madd(v5,v1,vzero);
236 vbs5 = vec_mergeh(v6,v2);
237 vbs6 = vec_mergel(v6,v2);
238 vbs7 = vec_mergeh(v7,v3);
239 vbs8 = vec_mergel(v7,v3);
240
241 vec_st(vbs1,0,bufs); /* store the eight result vectors to bufs at 16-byte steps */
242 vec_st(vbs2,16,bufs);
243 vec_st(vbs3,32,bufs);
244 vec_st(vbs4,48,bufs);
245 vec_st(vbs5,64,bufs);
246 vec_st(vbs6,80,bufs);
247 vec_st(vbs7,96,bufs);
248 vec_st(vbs8,112,bufs);
249 }
250
251 { /* scalar in-place additions on bufs; statement order matters because later sums read earlier results */
252 register real *b1;
253 register int i;
254
255 for(b1=bufs,i=8;i;i--,b1+=4) /* 8 groups of 4 elements */
256 b1[2] += b1[3];
257
258 for(b1=bufs,i=4;i;i--,b1+=8) /* 4 groups of 8 elements */
259 {
260 b1[4] += b1[6];
261 b1[6] += b1[5];
262 b1[5] += b1[7];
263 }
264
265 for(b1=bufs,i=2;i;i--,b1+=16) /* 2 groups of 16 elements */
266 {
267 b1[8] += b1[12];
268 b1[12] += b1[10];
269 b1[10] += b1[14];
270 b1[14] += b1[9];
271 b1[9] += b1[13];
272 b1[13] += b1[11];
273 b1[11] += b1[15];
274 }
275 }
276
277
278 out0[0x10*16] = bufs[0]; /* out0 is written from index 0x10*16 down to 0 in steps of 0x10 (17 values) */
279 out0[0x10*15] = bufs[16+0] + bufs[16+8];
280 out0[0x10*14] = bufs[8];
281 out0[0x10*13] = bufs[16+8] + bufs[16+4];
282 out0[0x10*12] = bufs[4];
283 out0[0x10*11] = bufs[16+4] + bufs[16+12];
284 out0[0x10*10] = bufs[12];
285 out0[0x10* 9] = bufs[16+12] + bufs[16+2];
286 out0[0x10* 8] = bufs[2];
287 out0[0x10* 7] = bufs[16+2] + bufs[16+10];
288 out0[0x10* 6] = bufs[10];
289 out0[0x10* 5] = bufs[16+10] + bufs[16+6];
290 out0[0x10* 4] = bufs[6];
291 out0[0x10* 3] = bufs[16+6] + bufs[16+14];
292 out0[0x10* 2] = bufs[14];
293 out0[0x10* 1] = bufs[16+14] + bufs[16+1];
294 out0[0x10* 0] = bufs[1];
295
296 out1[0x10* 0] = bufs[1]; /* out1 is written from index 0 up to 0x10*15 in steps of 0x10 (16 values); element 0 repeats bufs[1] */
297 out1[0x10* 1] = bufs[16+1] + bufs[16+9];
298 out1[0x10* 2] = bufs[9];
299 out1[0x10* 3] = bufs[16+9] + bufs[16+5];
300 out1[0x10* 4] = bufs[5];
301 out1[0x10* 5] = bufs[16+5] + bufs[16+13];
302 out1[0x10* 6] = bufs[13];
303 out1[0x10* 7] = bufs[16+13] + bufs[16+3];
304 out1[0x10* 8] = bufs[3];
305 out1[0x10* 9] = bufs[16+3] + bufs[16+11];
306 out1[0x10*10] = bufs[11];
307 out1[0x10*11] = bufs[16+11] + bufs[16+7];
308 out1[0x10*12] = bufs[7];
309 out1[0x10*13] = bufs[16+7] + bufs[16+15];
310 out1[0x10*14] = bufs[15];
311 out1[0x10*15] = bufs[16+15];
312
313}
unsigned char
Definition: typeof.h:29
GLsizei samples
Definition: glext.h:7006
const GLenum * bufs
Definition: glext.h:6026
GLfloat GLfloat GLfloat GLfloat v3
Definition: glext.h:6064
GLfloat GLfloat v1
Definition: glext.h:6062
GLfloat GLfloat GLfloat v2
Definition: glext.h:6063
GLsizei GLenum const GLvoid GLsizei GLenum GLbyte GLbyte GLbyte GLdouble GLdouble GLdouble GLfloat GLfloat GLfloat GLint GLint GLint GLshort GLshort GLshort GLubyte GLubyte GLubyte GLuint GLuint GLuint GLushort GLushort GLushort GLbyte GLbyte GLbyte GLbyte GLdouble GLdouble GLdouble GLdouble GLfloat GLfloat GLfloat GLfloat GLint GLint GLint GLint GLshort GLshort GLshort GLshort GLubyte GLubyte GLubyte GLubyte GLuint GLuint GLuint GLuint GLushort GLushort GLushort GLushort GLboolean const GLdouble const GLfloat const GLint const GLshort const GLbyte const GLdouble const GLfloat const GLint const GLshort const GLdouble const GLfloat const GLint const GLshort const GLdouble const GLfloat const GLint const GLshort const GLdouble const GLfloat const GLint const GLshort const GLdouble const GLdouble const GLfloat const GLfloat const GLint const GLint const GLshort const GLshort const GLdouble const GLfloat const GLint const GLshort const GLdouble const GLfloat const GLint const GLshort const GLdouble const GLfloat const GLint const GLshort const GLdouble const GLfloat const GLint const GLshort const GLdouble const GLfloat const GLint const GLshort const GLdouble const GLfloat const GLint const GLshort const GLdouble const GLfloat const GLint const GLshort GLenum GLenum GLenum GLfloat GLenum GLint GLenum GLenum GLenum GLfloat GLenum GLenum GLint GLenum GLfloat GLenum GLint GLint GLushort GLenum GLenum GLfloat GLenum GLenum GLint GLfloat const GLubyte GLenum GLenum GLenum const GLfloat GLenum GLenum const GLint GLenum GLint GLint GLsizei GLsizei GLint GLenum GLenum const GLvoid GLenum GLenum const GLfloat GLenum GLenum const GLint GLenum GLenum const GLdouble GLenum GLenum const GLfloat GLenum GLenum const GLint GLsizei GLuint GLfloat GLuint GLbitfield GLfloat GLint GLuint GLboolean GLenum GLfloat GLenum GLbitfield GLenum GLfloat GLfloat GLint GLint const GLfloat GLenum GLfloat GLfloat GLint GLint GLfloat GLfloat GLint GLint const GLfloat GLint GLfloat GLfloat GLint 
GLfloat GLfloat GLint GLfloat GLfloat const GLdouble const GLfloat const GLdouble const GLfloat GLint i
Definition: glfuncs.h:248
#define pnts
Definition: intsym.h:8
static CRYPT_DATA_BLOB b1[]
Definition: msg.c:573
static float(__cdecl *square_half_float)(float x
#define real
#define ALIGNED(a)
Definition: optimize.h:190