ReactOS Fundraising Campaign 2012
 
€ 4,410 / € 30,000

Information | Donate

Home | Info | Community | Development | myReactOS | Contact Us

  1. Home
  2. Community
  3. Development
  4. myReactOS
  5. Fundraiser 2012

  1. Main Page
  2. Alphabetical List
  3. Data Structures
  4. Directories
  5. File List
  6. Data Fields
  7. Globals
  8. Related Pages

ReactOS Development > Doxygen

dct64_altivec.c
Go to the documentation of this file.
00001 /*
00002     dct64_altivec.c: Discrete Cosine Transform (DCT) for Altivec
00003 
00004     copyright ?-2006 by the mpg123 project - free software under the terms of the LGPL 2.1
00005     see COPYING and AUTHORS files in distribution or http://mpg123.org
00006     initially written by Michael Hipp
00007     altivec optimization by tmkk
00008 */
00009 
00010 /*
00011  * Discrete Cosine Transform (DCT) for subband synthesis
00012  *
00013  * -funroll-loops (for gcc) will remove the loops for better performance
00014  * using loops in the source-code enhances readability
00015  *
00016  *
00017  * TODO: write an optimized version for the down-sampling modes
00018  *       (in these modes the bands 16-31 (2:1) or 8-31 (4:1) are zero)
00019  */
00020 
00021 #include "mpg123lib_intern.h"
00022 
00023 #ifndef __APPLE__
00024 #include <altivec.h>
00025 #endif
00026 
00027 void dct64_altivec(real *out0,real *out1,real *samples)
00028 {
00029   ALIGNED(16) real bufs[32];
00030 
00031     {
00032         register real *b1,*costab;
00033         
00034         vector unsigned char vinvert,vperm1,vperm2,vperm3,vperm4;
00035         vector float v1,v2,v3,v4,v5,v6,v7,v8;
00036         vector float vbs1,vbs2,vbs3,vbs4,vbs5,vbs6,vbs7,vbs8;
00037         vector float vbs9,vbs10,vbs11,vbs12,vbs13,vbs14,vbs15,vbs16;
00038         vector float vzero;
00039         b1 = samples;
00040         costab = pnts[0];
00041         
00042         vzero = vec_xor(vzero,vzero);
00043 #ifdef __APPLE__
00044         vinvert = (vector unsigned char)(12,13,14,15,8,9,10,11,4,5,6,7,0,1,2,3);
00045 #else
00046         vinvert = (vector unsigned char){12,13,14,15,8,9,10,11,4,5,6,7,0,1,2,3};
00047 #endif
00048         vperm1 = vec_lvsl(0,b1);
00049         vperm2 = vec_perm(vperm1,vperm1,vinvert);
00050         
00051         v1 = vec_ld(0,b1);
00052         v2 = vec_ld(16,b1);
00053         v3 = vec_ld(112,b1);
00054         v4 = vec_ld(127,b1);
00055         v5 = vec_perm(v1,v2,vperm1); /* b1[0,1,2,3] */
00056         v6 = vec_perm(v3,v4,vperm2); /* b1[31,30,29,28] */
00057         
00058         vbs1 = vec_add(v5,v6);
00059         vbs8 = vec_sub(v5,v6);
00060         
00061         v1 = vec_ld(32,b1);
00062         v4 = vec_ld(96,b1);
00063         v5 = vec_perm(v2,v1,vperm1); /* b1[4,5,6,7] */
00064         v6 = vec_perm(v4,v3,vperm2); /* b1[27,26,25,24] */
00065         
00066         vbs2 = vec_add(v5,v6);
00067         vbs7 = vec_sub(v5,v6);
00068         
00069         v2 = vec_ld(48,b1);
00070         v3 = vec_ld(80,b1);
00071         v5 = vec_perm(v1,v2,vperm1); /* b1[8,9,10,11] */
00072         v6 = vec_perm(v3,v4,vperm2); /* b1[23,22,21,20] */
00073         
00074         vbs3 = vec_add(v5,v6);
00075         vbs6 = vec_sub(v5,v6);
00076         
00077         v1 = vec_ld(64,b1);
00078         v5 = vec_perm(v2,v1,vperm1); /* b1[12,13,14,15] */
00079         v6 = vec_perm(v1,v3,vperm2); /* b1[19,18,17,16] */
00080         
00081         vbs4 = vec_add(v5,v6);
00082         vbs5 = vec_sub(v5,v6);
00083         
00084         v1 = vec_ld(0,costab);
00085         vbs8 = vec_madd(vbs8,v1,vzero);
00086         v2 = vec_ld(16,costab);
00087         vbs7 = vec_madd(vbs7,v2,vzero);
00088         v3 = vec_ld(32,costab);
00089         vbs6 = vec_madd(vbs6,v3,vzero);
00090         v4 = vec_ld(48,costab);
00091         vbs5 = vec_madd(vbs5,v4,vzero);
00092         vbs6 = vec_perm(vbs6,vbs6,vinvert);
00093         vbs5 = vec_perm(vbs5,vbs5,vinvert);
00094         
00095         
00096         costab = pnts[1];
00097         
00098         v1 = vec_perm(vbs4,vbs4,vinvert);
00099         vbs9 = vec_add(vbs1,v1);
00100         v3 = vec_sub(vbs1,v1);
00101         v5 = vec_ld(0,costab);
00102         v2 = vec_perm(vbs3,vbs3,vinvert);
00103         vbs10 = vec_add(vbs2,v2);
00104         v4 = vec_sub(vbs2,v2);
00105         v6 = vec_ld(16,costab);
00106         vbs12 = vec_madd(v3,v5,vzero);
00107         vbs11 = vec_madd(v4,v6,vzero);
00108         
00109         v7 = vec_sub(vbs7,vbs6);
00110         v8 = vec_sub(vbs8,vbs5);
00111         vbs13 = vec_add(vbs5,vbs8);
00112         vbs14 = vec_add(vbs6,vbs7);
00113         vbs15 = vec_madd(v7,v6,vzero);
00114         vbs16 = vec_madd(v8,v5,vzero);
00115         
00116         
00117         costab = pnts[2];
00118         
00119         v1 = vec_perm(vbs10,vbs10,vinvert);
00120         v5 = vec_perm(vbs14,vbs14,vinvert);
00121         vbs1 = vec_add(v1,vbs9);
00122         vbs5 = vec_add(v5,vbs13);
00123         v2 = vec_sub(vbs9,v1);
00124         v6 = vec_sub(vbs13,v5);
00125         v3 = vec_ld(0,costab);
00126         vbs11 = vec_perm(vbs11,vbs11,vinvert);
00127         vbs15 = vec_perm(vbs15,vbs15,vinvert);
00128         vbs3 = vec_add(vbs11,vbs12);
00129         vbs7 = vec_add(vbs15,vbs16);
00130         v4 = vec_sub(vbs12,vbs11);
00131         v7 = vec_sub(vbs16,vbs15);
00132         vbs2 = vec_madd(v2,v3,vzero);
00133         vbs4 = vec_madd(v4,v3,vzero);
00134         vbs6 = vec_madd(v6,v3,vzero);
00135         vbs8 = vec_madd(v7,v3,vzero);
00136         
00137         vbs2 = vec_perm(vbs2,vbs2,vinvert);
00138         vbs4 = vec_perm(vbs4,vbs4,vinvert);
00139         vbs6 = vec_perm(vbs6,vbs6,vinvert);
00140         vbs8 = vec_perm(vbs8,vbs8,vinvert);
00141         
00142         
00143         costab = pnts[3];
00144         
00145 #ifdef __APPLE__
00146         vperm1 = (vector unsigned char)(0,1,2,3,4,5,6,7,16,17,18,19,20,21,22,23);
00147         vperm2 = (vector unsigned char)(12,13,14,15,8,9,10,11,28,29,30,31,24,25,26,27);
00148         vperm3 = (vector unsigned char)(0,1,2,3,4,5,6,7,20,21,22,23,16,17,18,19);
00149 #else
00150         vperm1 = (vector unsigned char){0,1,2,3,4,5,6,7,16,17,18,19,20,21,22,23};
00151         vperm2 = (vector unsigned char){12,13,14,15,8,9,10,11,28,29,30,31,24,25,26,27};
00152         vperm3 = (vector unsigned char){0,1,2,3,4,5,6,7,20,21,22,23,16,17,18,19};
00153 #endif
00154         vperm4 = vec_add(vperm3,vec_splat_u8(8));
00155         
00156         v1 = vec_ld(0,costab);
00157         v2 = vec_splat(v1,0);
00158         v3 = vec_splat(v1,1);
00159         v1 = vec_mergeh(v2,v3);
00160         
00161         v2 = vec_perm(vbs1,vbs3,vperm1);
00162         v3 = vec_perm(vbs2,vbs4,vperm1);
00163         v4 = vec_perm(vbs1,vbs3,vperm2);
00164         v5 = vec_perm(vbs2,vbs4,vperm2);
00165         v6 = vec_sub(v2,v4);
00166         v7 = vec_sub(v3,v5);
00167         v2 = vec_add(v2,v4);
00168         v3 = vec_add(v3,v5);
00169         v4 = vec_madd(v6,v1,vzero);
00170         v5 = vec_nmsub(v7,v1,vzero);
00171         vbs9 = vec_perm(v2,v4,vperm3);
00172         vbs11 = vec_perm(v2,v4,vperm4);
00173         vbs10 = vec_perm(v3,v5,vperm3);
00174         vbs12 = vec_perm(v3,v5,vperm4);
00175         
00176         v2 = vec_perm(vbs5,vbs7,vperm1);
00177         v3 = vec_perm(vbs6,vbs8,vperm1);
00178         v4 = vec_perm(vbs5,vbs7,vperm2);
00179         v5 = vec_perm(vbs6,vbs8,vperm2);
00180         v6 = vec_sub(v2,v4);
00181         v7 = vec_sub(v3,v5);
00182         v2 = vec_add(v2,v4);
00183         v3 = vec_add(v3,v5);
00184         v4 = vec_madd(v6,v1,vzero);
00185         v5 = vec_nmsub(v7,v1,vzero);
00186         vbs13 = vec_perm(v2,v4,vperm3);
00187         vbs15 = vec_perm(v2,v4,vperm4);
00188         vbs14 = vec_perm(v3,v5,vperm3);
00189         vbs16 = vec_perm(v3,v5,vperm4);
00190         
00191         
00192         costab = pnts[4];
00193         
00194         v1 = vec_lde(0,costab);
00195 #ifdef __APPLE__
00196         v2 = (vector float)(1.0f,-1.0f,1.0f,-1.0f);
00197 #else
00198         v2 = (vector float){1.0f,-1.0f,1.0f,-1.0f};
00199 #endif
00200         v3 = vec_splat(v1,0);
00201         v1 = vec_madd(v2,v3,vzero);
00202         
00203         v2 = vec_mergeh(vbs9,vbs10);
00204         v3 = vec_mergel(vbs9,vbs10);
00205         v4 = vec_mergeh(vbs11,vbs12);
00206         v5 = vec_mergel(vbs11,vbs12);
00207         v6 = vec_mergeh(v2,v3);
00208         v7 = vec_mergel(v2,v3);
00209         v2 = vec_mergeh(v4,v5);
00210         v3 = vec_mergel(v4,v5); 
00211         v4 = vec_sub(v6,v7);
00212         v5 = vec_sub(v2,v3);
00213         v6 = vec_add(v6,v7);
00214         v7 = vec_add(v2,v3);
00215         v2 = vec_madd(v4,v1,vzero);
00216         v3 = vec_madd(v5,v1,vzero);
00217         vbs1 = vec_mergeh(v6,v2);
00218         vbs2 = vec_mergel(v6,v2);
00219         vbs3 = vec_mergeh(v7,v3);
00220         vbs4 = vec_mergel(v7,v3);
00221         
00222         v2 = vec_mergeh(vbs13,vbs14);
00223         v3 = vec_mergel(vbs13,vbs14);
00224         v4 = vec_mergeh(vbs15,vbs16);
00225         v5 = vec_mergel(vbs15,vbs16);
00226         v6 = vec_mergeh(v2,v3);
00227         v7 = vec_mergel(v2,v3);
00228         v2 = vec_mergeh(v4,v5);
00229         v3 = vec_mergel(v4,v5); 
00230         v4 = vec_sub(v6,v7);
00231         v5 = vec_sub(v2,v3);
00232         v6 = vec_add(v6,v7);
00233         v7 = vec_add(v2,v3);
00234         v2 = vec_madd(v4,v1,vzero);
00235         v3 = vec_madd(v5,v1,vzero);
00236         vbs5 = vec_mergeh(v6,v2);
00237         vbs6 = vec_mergel(v6,v2);
00238         vbs7 = vec_mergeh(v7,v3);
00239         vbs8 = vec_mergel(v7,v3);
00240         
00241         vec_st(vbs1,0,bufs);
00242         vec_st(vbs2,16,bufs);
00243         vec_st(vbs3,32,bufs);
00244         vec_st(vbs4,48,bufs);
00245         vec_st(vbs5,64,bufs);
00246         vec_st(vbs6,80,bufs);
00247         vec_st(vbs7,96,bufs);
00248         vec_st(vbs8,112,bufs);
00249     }
00250 
00251  {
00252   register real *b1;
00253   register int i;
00254 
00255   for(b1=bufs,i=8;i;i--,b1+=4)
00256     b1[2] += b1[3];
00257 
00258   for(b1=bufs,i=4;i;i--,b1+=8)
00259   {
00260     b1[4] += b1[6];
00261     b1[6] += b1[5];
00262     b1[5] += b1[7];
00263   }
00264 
00265   for(b1=bufs,i=2;i;i--,b1+=16)
00266   {
00267     b1[8]  += b1[12];
00268     b1[12] += b1[10];
00269     b1[10] += b1[14];
00270     b1[14] += b1[9];
00271     b1[9]  += b1[13];
00272     b1[13] += b1[11];
00273     b1[11] += b1[15];
00274   }
00275  }
00276 
00277 
00278   out0[0x10*16] = bufs[0];
00279   out0[0x10*15] = bufs[16+0]  + bufs[16+8];
00280   out0[0x10*14] = bufs[8];
00281   out0[0x10*13] = bufs[16+8]  + bufs[16+4];
00282   out0[0x10*12] = bufs[4];
00283   out0[0x10*11] = bufs[16+4]  + bufs[16+12];
00284   out0[0x10*10] = bufs[12];
00285   out0[0x10* 9] = bufs[16+12] + bufs[16+2];
00286   out0[0x10* 8] = bufs[2];
00287   out0[0x10* 7] = bufs[16+2]  + bufs[16+10];
00288   out0[0x10* 6] = bufs[10];
00289   out0[0x10* 5] = bufs[16+10] + bufs[16+6];
00290   out0[0x10* 4] = bufs[6];
00291   out0[0x10* 3] = bufs[16+6]  + bufs[16+14];
00292   out0[0x10* 2] = bufs[14];
00293   out0[0x10* 1] = bufs[16+14] + bufs[16+1];
00294   out0[0x10* 0] = bufs[1];
00295 
00296   out1[0x10* 0] = bufs[1];
00297   out1[0x10* 1] = bufs[16+1]  + bufs[16+9];
00298   out1[0x10* 2] = bufs[9];
00299   out1[0x10* 3] = bufs[16+9]  + bufs[16+5];
00300   out1[0x10* 4] = bufs[5];
00301   out1[0x10* 5] = bufs[16+5]  + bufs[16+13];
00302   out1[0x10* 6] = bufs[13];
00303   out1[0x10* 7] = bufs[16+13] + bufs[16+3];
00304   out1[0x10* 8] = bufs[3];
00305   out1[0x10* 9] = bufs[16+3]  + bufs[16+11];
00306   out1[0x10*10] = bufs[11];
00307   out1[0x10*11] = bufs[16+11] + bufs[16+7];
00308   out1[0x10*12] = bufs[7];
00309   out1[0x10*13] = bufs[16+7]  + bufs[16+15];
00310   out1[0x10*14] = bufs[15];
00311   out1[0x10*15] = bufs[16+15];
00312 
00313 }
00314 
00315 

Generated on Fri May 25 2012 04:32:36 for ReactOS by doxygen 1.7.6.1

ReactOS is a registered trademark or a trademark of ReactOS Foundation in the United States and other countries.