Home | Info | Community | Development | myReactOS | Contact Us
ReactOS Development > Doxygen > dct64_altivec.c
Go to the documentation of this file.
00001 /* 00002 dct64_altivec.c: Discrete Cosine Tansform (DCT) for Altivec 00003 00004 copyright ?-2006 by the mpg123 project - free software under the terms of the LGPL 2.1 00005 see COPYING and AUTHORS files in distribution or http://mpg123.org 00006 initially written by Michael Hipp 00007 altivec optimization by tmkk 00008 */ 00009 00010 /* 00011 * Discrete Cosine Tansform (DCT) for subband synthesis 00012 * 00013 * -funroll-loops (for gcc) will remove the loops for better performance 00014 * using loops in the source-code enhances readabillity 00015 * 00016 * 00017 * TODO: write an optimized version for the down-sampling modes 00018 * (in these modes the bands 16-31 (2:1) or 8-31 (4:1) are zero 00019 */ 00020 00021 #include "mpg123lib_intern.h" 00022 00023 #ifndef __APPLE__ 00024 #include <altivec.h> 00025 #endif 00026 00027 void dct64_altivec(real *out0,real *out1,real *samples) 00028 { 00029 ALIGNED(16) real bufs[32]; 00030 00031 { 00032 register real *b1,*costab; 00033 00034 vector unsigned char vinvert,vperm1,vperm2,vperm3,vperm4; 00035 vector float v1,v2,v3,v4,v5,v6,v7,v8; 00036 vector float vbs1,vbs2,vbs3,vbs4,vbs5,vbs6,vbs7,vbs8; 00037 vector float vbs9,vbs10,vbs11,vbs12,vbs13,vbs14,vbs15,vbs16; 00038 vector float vzero; 00039 b1 = samples; 00040 costab = pnts[0]; 00041 00042 vzero = vec_xor(vzero,vzero); 00043 #ifdef __APPLE__ 00044 vinvert = (vector unsigned char)(12,13,14,15,8,9,10,11,4,5,6,7,0,1,2,3); 00045 #else 00046 vinvert = (vector unsigned char){12,13,14,15,8,9,10,11,4,5,6,7,0,1,2,3}; 00047 #endif 00048 vperm1 = vec_lvsl(0,b1); 00049 vperm2 = vec_perm(vperm1,vperm1,vinvert); 00050 00051 v1 = vec_ld(0,b1); 00052 v2 = vec_ld(16,b1); 00053 v3 = vec_ld(112,b1); 00054 v4 = vec_ld(127,b1); 00055 v5 = vec_perm(v1,v2,vperm1); /* b1[0,1,2,3] */ 00056 v6 = vec_perm(v3,v4,vperm2); /* b1[31,30,29,28] */ 00057 00058 vbs1 = vec_add(v5,v6); 00059 vbs8 = vec_sub(v5,v6); 00060 00061 v1 = vec_ld(32,b1); 00062 v4 = vec_ld(96,b1); 00063 v5 = 
vec_perm(v2,v1,vperm1); /* b1[4,5,6,7] */ 00064 v6 = vec_perm(v4,v3,vperm2); /* b1[27,26,25,24] */ 00065 00066 vbs2 = vec_add(v5,v6); 00067 vbs7 = vec_sub(v5,v6); 00068 00069 v2 = vec_ld(48,b1); 00070 v3 = vec_ld(80,b1); 00071 v5 = vec_perm(v1,v2,vperm1); /* b1[8,9,10,11] */ 00072 v6 = vec_perm(v3,v4,vperm2); /* b1[23,22,21,20] */ 00073 00074 vbs3 = vec_add(v5,v6); 00075 vbs6 = vec_sub(v5,v6); 00076 00077 v1 = vec_ld(64,b1); 00078 v5 = vec_perm(v2,v1,vperm1); /* b1[12,13,14,15] */ 00079 v6 = vec_perm(v1,v3,vperm2); /* b1[19,18,17,16] */ 00080 00081 vbs4 = vec_add(v5,v6); 00082 vbs5 = vec_sub(v5,v6); 00083 00084 v1 = vec_ld(0,costab); 00085 vbs8 = vec_madd(vbs8,v1,vzero); 00086 v2 = vec_ld(16,costab); 00087 vbs7 = vec_madd(vbs7,v2,vzero); 00088 v3 = vec_ld(32,costab); 00089 vbs6 = vec_madd(vbs6,v3,vzero); 00090 v4 = vec_ld(48,costab); 00091 vbs5 = vec_madd(vbs5,v4,vzero); 00092 vbs6 = vec_perm(vbs6,vbs6,vinvert); 00093 vbs5 = vec_perm(vbs5,vbs5,vinvert); 00094 00095 00096 costab = pnts[1]; 00097 00098 v1 = vec_perm(vbs4,vbs4,vinvert); 00099 vbs9 = vec_add(vbs1,v1); 00100 v3 = vec_sub(vbs1,v1); 00101 v5 = vec_ld(0,costab); 00102 v2 = vec_perm(vbs3,vbs3,vinvert); 00103 vbs10 = vec_add(vbs2,v2); 00104 v4 = vec_sub(vbs2,v2); 00105 v6 = vec_ld(16,costab); 00106 vbs12 = vec_madd(v3,v5,vzero); 00107 vbs11 = vec_madd(v4,v6,vzero); 00108 00109 v7 = vec_sub(vbs7,vbs6); 00110 v8 = vec_sub(vbs8,vbs5); 00111 vbs13 = vec_add(vbs5,vbs8); 00112 vbs14 = vec_add(vbs6,vbs7); 00113 vbs15 = vec_madd(v7,v6,vzero); 00114 vbs16 = vec_madd(v8,v5,vzero); 00115 00116 00117 costab = pnts[2]; 00118 00119 v1 = vec_perm(vbs10,vbs10,vinvert); 00120 v5 = vec_perm(vbs14,vbs14,vinvert); 00121 vbs1 = vec_add(v1,vbs9); 00122 vbs5 = vec_add(v5,vbs13); 00123 v2 = vec_sub(vbs9,v1); 00124 v6 = vec_sub(vbs13,v5); 00125 v3 = vec_ld(0,costab); 00126 vbs11 = vec_perm(vbs11,vbs11,vinvert); 00127 vbs15 = vec_perm(vbs15,vbs15,vinvert); 00128 vbs3 = vec_add(vbs11,vbs12); 00129 vbs7 = vec_add(vbs15,vbs16); 00130 v4 
= vec_sub(vbs12,vbs11); 00131 v7 = vec_sub(vbs16,vbs15); 00132 vbs2 = vec_madd(v2,v3,vzero); 00133 vbs4 = vec_madd(v4,v3,vzero); 00134 vbs6 = vec_madd(v6,v3,vzero); 00135 vbs8 = vec_madd(v7,v3,vzero); 00136 00137 vbs2 = vec_perm(vbs2,vbs2,vinvert); 00138 vbs4 = vec_perm(vbs4,vbs4,vinvert); 00139 vbs6 = vec_perm(vbs6,vbs6,vinvert); 00140 vbs8 = vec_perm(vbs8,vbs8,vinvert); 00141 00142 00143 costab = pnts[3]; 00144 00145 #ifdef __APPLE__ 00146 vperm1 = (vector unsigned char)(0,1,2,3,4,5,6,7,16,17,18,19,20,21,22,23); 00147 vperm2 = (vector unsigned char)(12,13,14,15,8,9,10,11,28,29,30,31,24,25,26,27); 00148 vperm3 = (vector unsigned char)(0,1,2,3,4,5,6,7,20,21,22,23,16,17,18,19); 00149 #else 00150 vperm1 = (vector unsigned char){0,1,2,3,4,5,6,7,16,17,18,19,20,21,22,23}; 00151 vperm2 = (vector unsigned char){12,13,14,15,8,9,10,11,28,29,30,31,24,25,26,27}; 00152 vperm3 = (vector unsigned char){0,1,2,3,4,5,6,7,20,21,22,23,16,17,18,19}; 00153 #endif 00154 vperm4 = vec_add(vperm3,vec_splat_u8(8)); 00155 00156 v1 = vec_ld(0,costab); 00157 v2 = vec_splat(v1,0); 00158 v3 = vec_splat(v1,1); 00159 v1 = vec_mergeh(v2,v3); 00160 00161 v2 = vec_perm(vbs1,vbs3,vperm1); 00162 v3 = vec_perm(vbs2,vbs4,vperm1); 00163 v4 = vec_perm(vbs1,vbs3,vperm2); 00164 v5 = vec_perm(vbs2,vbs4,vperm2); 00165 v6 = vec_sub(v2,v4); 00166 v7 = vec_sub(v3,v5); 00167 v2 = vec_add(v2,v4); 00168 v3 = vec_add(v3,v5); 00169 v4 = vec_madd(v6,v1,vzero); 00170 v5 = vec_nmsub(v7,v1,vzero); 00171 vbs9 = vec_perm(v2,v4,vperm3); 00172 vbs11 = vec_perm(v2,v4,vperm4); 00173 vbs10 = vec_perm(v3,v5,vperm3); 00174 vbs12 = vec_perm(v3,v5,vperm4); 00175 00176 v2 = vec_perm(vbs5,vbs7,vperm1); 00177 v3 = vec_perm(vbs6,vbs8,vperm1); 00178 v4 = vec_perm(vbs5,vbs7,vperm2); 00179 v5 = vec_perm(vbs6,vbs8,vperm2); 00180 v6 = vec_sub(v2,v4); 00181 v7 = vec_sub(v3,v5); 00182 v2 = vec_add(v2,v4); 00183 v3 = vec_add(v3,v5); 00184 v4 = vec_madd(v6,v1,vzero); 00185 v5 = vec_nmsub(v7,v1,vzero); 00186 vbs13 = vec_perm(v2,v4,vperm3); 00187 
vbs15 = vec_perm(v2,v4,vperm4); 00188 vbs14 = vec_perm(v3,v5,vperm3); 00189 vbs16 = vec_perm(v3,v5,vperm4); 00190 00191 00192 costab = pnts[4]; 00193 00194 v1 = vec_lde(0,costab); 00195 #ifdef __APPLE__ 00196 v2 = (vector float)(1.0f,-1.0f,1.0f,-1.0f); 00197 #else 00198 v2 = (vector float){1.0f,-1.0f,1.0f,-1.0f}; 00199 #endif 00200 v3 = vec_splat(v1,0); 00201 v1 = vec_madd(v2,v3,vzero); 00202 00203 v2 = vec_mergeh(vbs9,vbs10); 00204 v3 = vec_mergel(vbs9,vbs10); 00205 v4 = vec_mergeh(vbs11,vbs12); 00206 v5 = vec_mergel(vbs11,vbs12); 00207 v6 = vec_mergeh(v2,v3); 00208 v7 = vec_mergel(v2,v3); 00209 v2 = vec_mergeh(v4,v5); 00210 v3 = vec_mergel(v4,v5); 00211 v4 = vec_sub(v6,v7); 00212 v5 = vec_sub(v2,v3); 00213 v6 = vec_add(v6,v7); 00214 v7 = vec_add(v2,v3); 00215 v2 = vec_madd(v4,v1,vzero); 00216 v3 = vec_madd(v5,v1,vzero); 00217 vbs1 = vec_mergeh(v6,v2); 00218 vbs2 = vec_mergel(v6,v2); 00219 vbs3 = vec_mergeh(v7,v3); 00220 vbs4 = vec_mergel(v7,v3); 00221 00222 v2 = vec_mergeh(vbs13,vbs14); 00223 v3 = vec_mergel(vbs13,vbs14); 00224 v4 = vec_mergeh(vbs15,vbs16); 00225 v5 = vec_mergel(vbs15,vbs16); 00226 v6 = vec_mergeh(v2,v3); 00227 v7 = vec_mergel(v2,v3); 00228 v2 = vec_mergeh(v4,v5); 00229 v3 = vec_mergel(v4,v5); 00230 v4 = vec_sub(v6,v7); 00231 v5 = vec_sub(v2,v3); 00232 v6 = vec_add(v6,v7); 00233 v7 = vec_add(v2,v3); 00234 v2 = vec_madd(v4,v1,vzero); 00235 v3 = vec_madd(v5,v1,vzero); 00236 vbs5 = vec_mergeh(v6,v2); 00237 vbs6 = vec_mergel(v6,v2); 00238 vbs7 = vec_mergeh(v7,v3); 00239 vbs8 = vec_mergel(v7,v3); 00240 00241 vec_st(vbs1,0,bufs); 00242 vec_st(vbs2,16,bufs); 00243 vec_st(vbs3,32,bufs); 00244 vec_st(vbs4,48,bufs); 00245 vec_st(vbs5,64,bufs); 00246 vec_st(vbs6,80,bufs); 00247 vec_st(vbs7,96,bufs); 00248 vec_st(vbs8,112,bufs); 00249 } 00250 00251 { 00252 register real *b1; 00253 register int i; 00254 00255 for(b1=bufs,i=8;i;i--,b1+=4) 00256 b1[2] += b1[3]; 00257 00258 for(b1=bufs,i=4;i;i--,b1+=8) 00259 { 00260 b1[4] += b1[6]; 00261 b1[6] += b1[5]; 00262 
b1[5] += b1[7]; 00263 } 00264 00265 for(b1=bufs,i=2;i;i--,b1+=16) 00266 { 00267 b1[8] += b1[12]; 00268 b1[12] += b1[10]; 00269 b1[10] += b1[14]; 00270 b1[14] += b1[9]; 00271 b1[9] += b1[13]; 00272 b1[13] += b1[11]; 00273 b1[11] += b1[15]; 00274 } 00275 } 00276 00277 00278 out0[0x10*16] = bufs[0]; 00279 out0[0x10*15] = bufs[16+0] + bufs[16+8]; 00280 out0[0x10*14] = bufs[8]; 00281 out0[0x10*13] = bufs[16+8] + bufs[16+4]; 00282 out0[0x10*12] = bufs[4]; 00283 out0[0x10*11] = bufs[16+4] + bufs[16+12]; 00284 out0[0x10*10] = bufs[12]; 00285 out0[0x10* 9] = bufs[16+12] + bufs[16+2]; 00286 out0[0x10* 8] = bufs[2]; 00287 out0[0x10* 7] = bufs[16+2] + bufs[16+10]; 00288 out0[0x10* 6] = bufs[10]; 00289 out0[0x10* 5] = bufs[16+10] + bufs[16+6]; 00290 out0[0x10* 4] = bufs[6]; 00291 out0[0x10* 3] = bufs[16+6] + bufs[16+14]; 00292 out0[0x10* 2] = bufs[14]; 00293 out0[0x10* 1] = bufs[16+14] + bufs[16+1]; 00294 out0[0x10* 0] = bufs[1]; 00295 00296 out1[0x10* 0] = bufs[1]; 00297 out1[0x10* 1] = bufs[16+1] + bufs[16+9]; 00298 out1[0x10* 2] = bufs[9]; 00299 out1[0x10* 3] = bufs[16+9] + bufs[16+5]; 00300 out1[0x10* 4] = bufs[5]; 00301 out1[0x10* 5] = bufs[16+5] + bufs[16+13]; 00302 out1[0x10* 6] = bufs[13]; 00303 out1[0x10* 7] = bufs[16+13] + bufs[16+3]; 00304 out1[0x10* 8] = bufs[3]; 00305 out1[0x10* 9] = bufs[16+3] + bufs[16+11]; 00306 out1[0x10*10] = bufs[11]; 00307 out1[0x10*11] = bufs[16+11] + bufs[16+7]; 00308 out1[0x10*12] = bufs[7]; 00309 out1[0x10*13] = bufs[16+7] + bufs[16+15]; 00310 out1[0x10*14] = bufs[15]; 00311 out1[0x10*15] = bufs[16+15]; 00312 00313 } 00314 00315 Generated on Fri May 25 2012 04:32:36 for ReactOS by
1.7.6.1
|