dct64_altivec.c
/*
 dct64_altivec.c: Discrete Cosine Transform (DCT) for AltiVec

 copyright ?-2006 by the mpg123 project - free software under the terms of the LGPL 2.1
 see COPYING and AUTHORS files in distribution or http://mpg123.org
 initially written by Michael Hipp
 altivec optimization by tmkk
*/

/*
 * Discrete Cosine Transform (DCT) for subband synthesis
 *
 * -funroll-loops (for gcc) will remove the loops for better performance
 * using loops in the source code enhances readability
 *
 *
 * TODO: write an optimized version for the down-sampling modes
 * (in these modes the bands 16-31 (2:1) or 8-31 (4:1) are zero)
 */

#include "mpg123lib_intern.h"

#ifndef __APPLE__
#include <altivec.h>
#endif
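/*
 * dct64_altivec() computes the 32-point fast DCT used by the MPEG audio
 * subband synthesis filterbank, with each butterfly stage vectorized four
 * floats at a time via AltiVec intrinsics.  As a rough orientation, the
 * sketch below shows what the first butterfly stage amounts to in plain
 * scalar C.  It is an illustrative sketch only and is kept under #if 0:
 * the function name dct64_stage1_sketch and its parameter names are made
 * up for this comment, while real, pnts[0] and the mirrored-sum /
 * weighted-difference pattern follow the scalar dct64 in mpg123.
 */
#if 0
static void dct64_stage1_sketch(real *bs, const real *in)
{
  int i;
  for(i = 0; i < 16; ++i)
  {
    bs[i]    = in[i] + in[31-i];                  /* mirrored sums */
    bs[16+i] = (in[i] - in[31-i]) * pnts[0][i];   /* cosine-weighted differences */
  }
}
#endif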
void dct64_altivec(real *out0,real *out1,real *samples)
{
  ALIGNED(16) real bufs[32];

  {
    register real *b1,*costab;

    vector unsigned char vinvert,vperm1,vperm2,vperm3,vperm4;
    vector float v1,v2,v3,v4,v5,v6,v7,v8;
    vector float vbs1,vbs2,vbs3,vbs4,vbs5,vbs6,vbs7,vbs8;
    vector float vbs9,vbs10,vbs11,vbs12,vbs13,vbs14,vbs15,vbs16;
    vector float vzero;
    b1 = samples;
    costab = pnts[0];

    vzero = vec_xor(vzero,vzero); /* XORing a register with itself yields the all-zero vector */
#ifdef __APPLE__
    vinvert = (vector unsigned char)(12,13,14,15,8,9,10,11,4,5,6,7,0,1,2,3);
#else
    vinvert = (vector unsigned char){12,13,14,15,8,9,10,11,4,5,6,7,0,1,2,3};
#endif
    vperm1 = vec_lvsl(0,b1);
    vperm2 = vec_perm(vperm1,vperm1,vinvert);
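    /*
     * Stage 1 (costab = pnts[0]): load the 32 input samples (vperm1 handles
     * possibly unaligned loads, vperm2/vinvert fetch the same data in
     * reversed order), then build the mirrored sums samples[i]+samples[31-i]
     * in vbs1..vbs4 and the cosine-weighted differences
     * (samples[i]-samples[31-i])*costab[i] in vbs5..vbs8.
     */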
    v1 = vec_ld(0,b1);
    v2 = vec_ld(16,b1);
    v3 = vec_ld(112,b1);
    v4 = vec_ld(127,b1);
    v5 = vec_perm(v1,v2,vperm1); /* b1[0,1,2,3] */
    v6 = vec_perm(v3,v4,vperm2); /* b1[31,30,29,28] */

    vbs1 = vec_add(v5,v6);
    vbs8 = vec_sub(v5,v6);

    v1 = vec_ld(32,b1);
    v4 = vec_ld(96,b1);
    v5 = vec_perm(v2,v1,vperm1); /* b1[4,5,6,7] */
    v6 = vec_perm(v4,v3,vperm2); /* b1[27,26,25,24] */

    vbs2 = vec_add(v5,v6);
    vbs7 = vec_sub(v5,v6);

    v2 = vec_ld(48,b1);
    v3 = vec_ld(80,b1);
    v5 = vec_perm(v1,v2,vperm1); /* b1[8,9,10,11] */
    v6 = vec_perm(v3,v4,vperm2); /* b1[23,22,21,20] */

    vbs3 = vec_add(v5,v6);
    vbs6 = vec_sub(v5,v6);

    v1 = vec_ld(64,b1);
    v5 = vec_perm(v2,v1,vperm1); /* b1[12,13,14,15] */
    v6 = vec_perm(v1,v3,vperm2); /* b1[19,18,17,16] */

    vbs4 = vec_add(v5,v6);
    vbs5 = vec_sub(v5,v6);

    v1 = vec_ld(0,costab);
    vbs8 = vec_madd(vbs8,v1,vzero);
    v2 = vec_ld(16,costab);
    vbs7 = vec_madd(vbs7,v2,vzero);
    v3 = vec_ld(32,costab);
    vbs6 = vec_madd(vbs6,v3,vzero);
    v4 = vec_ld(48,costab);
    vbs5 = vec_madd(vbs5,v4,vzero);
    vbs6 = vec_perm(vbs6,vbs6,vinvert);
    vbs5 = vec_perm(vbs5,vbs5,vinvert);


    costab = pnts[1];
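    /*
     * Stage 2 (pnts[1]): butterflies within each 16-element half; the sums
     * land in vbs9/vbs10 and vbs13/vbs14, the differences are scaled by the
     * pnts[1] coefficients into vbs11/vbs12 and vbs15/vbs16.
     */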
    v1 = vec_perm(vbs4,vbs4,vinvert);
    vbs9 = vec_add(vbs1,v1);
    v3 = vec_sub(vbs1,v1);
    v5 = vec_ld(0,costab);
    v2 = vec_perm(vbs3,vbs3,vinvert);
    vbs10 = vec_add(vbs2,v2);
    v4 = vec_sub(vbs2,v2);
    v6 = vec_ld(16,costab);
    vbs12 = vec_madd(v3,v5,vzero);
    vbs11 = vec_madd(v4,v6,vzero);

    v7 = vec_sub(vbs7,vbs6);
    v8 = vec_sub(vbs8,vbs5);
    vbs13 = vec_add(vbs5,vbs8);
    vbs14 = vec_add(vbs6,vbs7);
    vbs15 = vec_madd(v7,v6,vzero);
    vbs16 = vec_madd(v8,v5,vzero);


    costab = pnts[2];
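    /*
     * Stage 3 (pnts[2]): butterflies over 8-element groups.  All difference
     * results are scaled by the single pnts[2] vector and word-reversed
     * afterwards to match the layout the next stage expects.
     */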
    v1 = vec_perm(vbs10,vbs10,vinvert);
    v5 = vec_perm(vbs14,vbs14,vinvert);
    vbs1 = vec_add(v1,vbs9);
    vbs5 = vec_add(v5,vbs13);
    v2 = vec_sub(vbs9,v1);
    v6 = vec_sub(vbs13,v5);
    v3 = vec_ld(0,costab);
    vbs11 = vec_perm(vbs11,vbs11,vinvert);
    vbs15 = vec_perm(vbs15,vbs15,vinvert);
    vbs3 = vec_add(vbs11,vbs12);
    vbs7 = vec_add(vbs15,vbs16);
    v4 = vec_sub(vbs12,vbs11);
    v7 = vec_sub(vbs16,vbs15);
    vbs2 = vec_madd(v2,v3,vzero);
    vbs4 = vec_madd(v4,v3,vzero);
    vbs6 = vec_madd(v6,v3,vzero);
    vbs8 = vec_madd(v7,v3,vzero);

    vbs2 = vec_perm(vbs2,vbs2,vinvert);
    vbs4 = vec_perm(vbs4,vbs4,vinvert);
    vbs6 = vec_perm(vbs6,vbs6,vinvert);
    vbs8 = vec_perm(vbs8,vbs8,vinvert);


    costab = pnts[3];
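    /*
     * Stage 4 (pnts[3]): butterflies within 4-element groups, done with
     * permutes; vperm1 gathers the leading pair of each group and vperm2 the
     * trailing pair in reversed order, while the two pnts[3] coefficients
     * are splatted and merged into v1 so one multiply covers both pairs.
     */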
#ifdef __APPLE__
    vperm1 = (vector unsigned char)(0,1,2,3,4,5,6,7,16,17,18,19,20,21,22,23);
    vperm2 = (vector unsigned char)(12,13,14,15,8,9,10,11,28,29,30,31,24,25,26,27);
    vperm3 = (vector unsigned char)(0,1,2,3,4,5,6,7,20,21,22,23,16,17,18,19);
#else
    vperm1 = (vector unsigned char){0,1,2,3,4,5,6,7,16,17,18,19,20,21,22,23};
    vperm2 = (vector unsigned char){12,13,14,15,8,9,10,11,28,29,30,31,24,25,26,27};
    vperm3 = (vector unsigned char){0,1,2,3,4,5,6,7,20,21,22,23,16,17,18,19};
#endif
    vperm4 = vec_add(vperm3,vec_splat_u8(8));

    v1 = vec_ld(0,costab);
    v2 = vec_splat(v1,0);
    v3 = vec_splat(v1,1);
    v1 = vec_mergeh(v2,v3);

    v2 = vec_perm(vbs1,vbs3,vperm1);
    v3 = vec_perm(vbs2,vbs4,vperm1);
    v4 = vec_perm(vbs1,vbs3,vperm2);
    v5 = vec_perm(vbs2,vbs4,vperm2);
    v6 = vec_sub(v2,v4);
    v7 = vec_sub(v3,v5);
    v2 = vec_add(v2,v4);
    v3 = vec_add(v3,v5);
    v4 = vec_madd(v6,v1,vzero);
    v5 = vec_nmsub(v7,v1,vzero);
    vbs9 = vec_perm(v2,v4,vperm3);
    vbs11 = vec_perm(v2,v4,vperm4);
    vbs10 = vec_perm(v3,v5,vperm3);
    vbs12 = vec_perm(v3,v5,vperm4);

    v2 = vec_perm(vbs5,vbs7,vperm1);
    v3 = vec_perm(vbs6,vbs8,vperm1);
    v4 = vec_perm(vbs5,vbs7,vperm2);
    v5 = vec_perm(vbs6,vbs8,vperm2);
    v6 = vec_sub(v2,v4);
    v7 = vec_sub(v3,v5);
    v2 = vec_add(v2,v4);
    v3 = vec_add(v3,v5);
    v4 = vec_madd(v6,v1,vzero);
    v5 = vec_nmsub(v7,v1,vzero);
    vbs13 = vec_perm(v2,v4,vperm3);
    vbs15 = vec_perm(v2,v4,vperm4);
    vbs14 = vec_perm(v3,v5,vperm3);
    vbs16 = vec_perm(v3,v5,vperm4);


    costab = pnts[4];
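    /*
     * Stage 5 (pnts[4]): the final butterflies use a single coefficient,
     * splatted and multiplied by {1,-1,1,-1} to alternate its sign per
     * element; the vec_mergeh/vec_mergel sequences also transpose the
     * results back into the scalar bufs[] order used below.
     */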
    v1 = vec_lde(0,costab);
#ifdef __APPLE__
    v2 = (vector float)(1.0f,-1.0f,1.0f,-1.0f);
#else
    v2 = (vector float){1.0f,-1.0f,1.0f,-1.0f};
#endif
    v3 = vec_splat(v1,0);
    v1 = vec_madd(v2,v3,vzero);

    v2 = vec_mergeh(vbs9,vbs10);
    v3 = vec_mergel(vbs9,vbs10);
    v4 = vec_mergeh(vbs11,vbs12);
    v5 = vec_mergel(vbs11,vbs12);
    v6 = vec_mergeh(v2,v3);
    v7 = vec_mergel(v2,v3);
    v2 = vec_mergeh(v4,v5);
    v3 = vec_mergel(v4,v5);
    v4 = vec_sub(v6,v7);
    v5 = vec_sub(v2,v3);
    v6 = vec_add(v6,v7);
    v7 = vec_add(v2,v3);
    v2 = vec_madd(v4,v1,vzero);
    v3 = vec_madd(v5,v1,vzero);
    vbs1 = vec_mergeh(v6,v2);
    vbs2 = vec_mergel(v6,v2);
    vbs3 = vec_mergeh(v7,v3);
    vbs4 = vec_mergel(v7,v3);

    v2 = vec_mergeh(vbs13,vbs14);
    v3 = vec_mergel(vbs13,vbs14);
    v4 = vec_mergeh(vbs15,vbs16);
    v5 = vec_mergel(vbs15,vbs16);
    v6 = vec_mergeh(v2,v3);
    v7 = vec_mergel(v2,v3);
    v2 = vec_mergeh(v4,v5);
    v3 = vec_mergel(v4,v5);
    v4 = vec_sub(v6,v7);
    v5 = vec_sub(v2,v3);
    v6 = vec_add(v6,v7);
    v7 = vec_add(v2,v3);
    v2 = vec_madd(v4,v1,vzero);
    v3 = vec_madd(v5,v1,vzero);
    vbs5 = vec_mergeh(v6,v2);
    vbs6 = vec_mergel(v6,v2);
    vbs7 = vec_mergeh(v7,v3);
    vbs8 = vec_mergel(v7,v3);
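    /* Store the 32 intermediate values to the aligned scratch buffer. */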
    vec_st(vbs1,0,bufs);
    vec_st(vbs2,16,bufs);
    vec_st(vbs3,32,bufs);
    vec_st(vbs4,48,bufs);
    vec_st(vbs5,64,bufs);
    vec_st(vbs6,80,bufs);
    vec_st(vbs7,96,bufs);
    vec_st(vbs8,112,bufs);
  }

  {
    register real *b1;
    register int i;
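    /*
     * Scalar clean-up: the last recombination passes of the DCT are done in
     * plain C; each loop folds neighbouring partial results together over
     * groups of 4, 8 and 16 entries of bufs[].
     */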
    for(b1=bufs,i=8;i;i--,b1+=4)
      b1[2] += b1[3];

    for(b1=bufs,i=4;i;i--,b1+=8)
    {
      b1[4] += b1[6];
      b1[6] += b1[5];
      b1[5] += b1[7];
    }

    for(b1=bufs,i=2;i;i--,b1+=16)
    {
      b1[8] += b1[12];
      b1[12] += b1[10];
      b1[10] += b1[14];
      b1[14] += b1[9];
      b1[9] += b1[13];
      b1[13] += b1[11];
      b1[11] += b1[15];
    }
  }
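  /*
   * Write-out: scatter the results into the two synthesis buffers with a
   * stride of 0x10 reals, alternating between single values from the lower
   * half of bufs[] and pairwise sums from the upper half, as in the scalar
   * dct64 write-out.
   */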
  out0[0x10*16] = bufs[0];
  out0[0x10*15] = bufs[16+0] + bufs[16+8];
  out0[0x10*14] = bufs[8];
  out0[0x10*13] = bufs[16+8] + bufs[16+4];
  out0[0x10*12] = bufs[4];
  out0[0x10*11] = bufs[16+4] + bufs[16+12];
  out0[0x10*10] = bufs[12];
  out0[0x10* 9] = bufs[16+12] + bufs[16+2];
  out0[0x10* 8] = bufs[2];
  out0[0x10* 7] = bufs[16+2] + bufs[16+10];
  out0[0x10* 6] = bufs[10];
  out0[0x10* 5] = bufs[16+10] + bufs[16+6];
  out0[0x10* 4] = bufs[6];
  out0[0x10* 3] = bufs[16+6] + bufs[16+14];
  out0[0x10* 2] = bufs[14];
  out0[0x10* 1] = bufs[16+14] + bufs[16+1];
  out0[0x10* 0] = bufs[1];

  out1[0x10* 0] = bufs[1];
  out1[0x10* 1] = bufs[16+1] + bufs[16+9];
  out1[0x10* 2] = bufs[9];
  out1[0x10* 3] = bufs[16+9] + bufs[16+5];
  out1[0x10* 4] = bufs[5];
  out1[0x10* 5] = bufs[16+5] + bufs[16+13];
  out1[0x10* 6] = bufs[13];
  out1[0x10* 7] = bufs[16+13] + bufs[16+3];
  out1[0x10* 8] = bufs[3];
  out1[0x10* 9] = bufs[16+3] + bufs[16+11];
  out1[0x10*10] = bufs[11];
  out1[0x10*11] = bufs[16+11] + bufs[16+7];
  out1[0x10*12] = bufs[7];
  out1[0x10*13] = bufs[16+7] + bufs[16+15];
  out1[0x10*14] = bufs[15];
  out1[0x10*15] = bufs[16+15];
}