ReactOS Fundraising Campaign 2012
 
€ 4,410 / € 30,000

Information | Donate

Home | Info | Community | Development | myReactOS | Contact Us

  1. Home
  2. Community
  3. Development
  4. myReactOS
  5. Fundraiser 2012

  1. Main Page
  2. Alphabetical List
  3. Data Structures
  4. Directories
  5. File List
  6. Data Fields
  7. Globals
  8. Related Pages

ReactOS Development > Doxygen

t_vertex_sse.c
Go to the documentation of this file.
00001 /*
00002  * Copyright 2003 Tungsten Graphics, inc.
00003  * All Rights Reserved.
00004  *
00005  * Permission is hereby granted, free of charge, to any person obtaining a
00006  * copy of this software and associated documentation files (the "Software"),
00007  * to deal in the Software without restriction, including without limitation
00008  * on the rights to use, copy, modify, merge, publish, distribute, sub
00009  * license, and/or sell copies of the Software, and to permit persons to whom
00010  * the Software is furnished to do so, subject to the following conditions:
00011  *
00012  * The above copyright notice and this permission notice (including the next
00013  * paragraph) shall be included in all copies or substantial portions of the
00014  * Software.
00015  *
00016  * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
00017  * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
00018  * FITNESS FOR A PARTICULAR PURPOSE AND NON-INFRINGEMENT.  IN NO EVENT SHALL
00019  * TUNGSTEN GRAPHICS AND/OR THEIR SUPPLIERS BE LIABLE FOR ANY CLAIM,
00020  * DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR
00021  * OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE
00022  * USE OR OTHER DEALINGS IN THE SOFTWARE.
00023  *
00024  * Authors:
00025  *    Keith Whitwell <keithw@tungstengraphics.com>
00026  */
00027 
00028 #include "main/glheader.h"
00029 #include "main/context.h"
00030 #include "main/colormac.h"
00031 #include "main/simple_list.h"
00032 #include "main/enums.h"
00033 #include "t_context.h"
00034 #include "t_vertex.h"
00035 
00036 #if defined(USE_SSE_ASM)
00037 
00038 #include "x86/rtasm/x86sse.h"
00039 #include "x86/common_x86_asm.h"
00040 
00041 
/* Size limit for one generated emit function.  It is handed to
 * x86_init_func_size() when the code buffer is created and compared
 * against the emitted byte count by the assert that closes
 * build_vertex_emit().
 */
#define MAX_SSE_CODE_SIZE 1024


/* Component selectors passed to SHUF(). */
#define X 0
#define Y 1
#define Z 2
#define W 3
00052 
00053 
00054 struct x86_program {
00055    struct x86_function func;
00056 
00057    GLcontext *ctx;
00058    GLboolean inputs_safe;
00059    GLboolean outputs_safe;
00060    GLboolean have_sse2;
00061    
00062    struct x86_reg identity;
00063    struct x86_reg chan0;
00064 };
00065 
00066 
00067 static struct x86_reg get_identity( struct x86_program *p )
00068 {
00069    return p->identity;
00070 }
00071 
00072 static void emit_load4f_4( struct x86_program *p,              
00073                struct x86_reg dest,
00074                struct x86_reg arg0 )
00075 {
00076    sse_movups(&p->func, dest, arg0);
00077 }
00078 
00079 static void emit_load4f_3( struct x86_program *p, 
00080                struct x86_reg dest,
00081                struct x86_reg arg0 )
00082 {
00083    /* Have to jump through some hoops:
00084     *
00085     * c 0 0 0
00086     * c 0 0 1
00087     * 0 0 c 1
00088     * a b c 1
00089     */
00090    sse_movss(&p->func, dest, x86_make_disp(arg0, 8));
00091    sse_shufps(&p->func, dest, get_identity(p), SHUF(X,Y,Z,W) );
00092    sse_shufps(&p->func, dest, dest, SHUF(Y,Z,X,W) );
00093    sse_movlps(&p->func, dest, arg0);
00094 }
00095 
00096 static void emit_load4f_2( struct x86_program *p, 
00097                struct x86_reg dest,
00098                struct x86_reg arg0 )
00099 {
00100    /* Initialize from identity, then pull in low two words:
00101     */
00102    sse_movups(&p->func, dest, get_identity(p));
00103    sse_movlps(&p->func, dest, arg0);
00104 }
00105 
00106 static void emit_load4f_1( struct x86_program *p, 
00107                struct x86_reg dest,
00108                struct x86_reg arg0 )
00109 {
00110    /* Pull in low word, then swizzle in identity */
00111    sse_movss(&p->func, dest, arg0);
00112    sse_shufps(&p->func, dest, get_identity(p), SHUF(X,Y,Z,W) );
00113 }
00114 
00115 
00116 
00117 static void emit_load3f_3( struct x86_program *p,              
00118                struct x86_reg dest,
00119                struct x86_reg arg0 )
00120 {
00121    /* Over-reads by 1 dword - potential SEGV if input is a vertex
00122     * array.
00123     */
00124    if (p->inputs_safe) {
00125       sse_movups(&p->func, dest, arg0);
00126    } 
00127    else {
00128       /* c 0 0 0
00129        * c c c c
00130        * a b c c 
00131        */
00132       sse_movss(&p->func, dest, x86_make_disp(arg0, 8));
00133       sse_shufps(&p->func, dest, dest, SHUF(X,X,X,X));
00134       sse_movlps(&p->func, dest, arg0);
00135    }
00136 }
00137 
00138 static void emit_load3f_2( struct x86_program *p, 
00139                struct x86_reg dest,
00140                struct x86_reg arg0 )
00141 {
00142    emit_load4f_2(p, dest, arg0);
00143 }
00144 
00145 static void emit_load3f_1( struct x86_program *p, 
00146                struct x86_reg dest,
00147                struct x86_reg arg0 )
00148 {
00149    /* Loading from memory erases the upper bits. */
00150    sse_movss(&p->func, dest, arg0);
00151 }
00152 
00153 static void emit_load2f_2( struct x86_program *p, 
00154                struct x86_reg dest,
00155                struct x86_reg arg0 )
00156 {
00157    sse_movlps(&p->func, dest, arg0);
00158 }
00159 
00160 static void emit_load2f_1( struct x86_program *p, 
00161                struct x86_reg dest,
00162                struct x86_reg arg0 )
00163 {
00164    /* Loading from memory erases the upper bits. */
00165    sse_movss(&p->func, dest, arg0);
00166 }
00167 
00168 static void emit_load1f_1( struct x86_program *p, 
00169                struct x86_reg dest,
00170                struct x86_reg arg0 )
00171 {
00172    sse_movss(&p->func, dest, arg0);
00173 }
00174 
00175 static void (*load[4][4])( struct x86_program *p, 
00176                struct x86_reg dest,
00177                struct x86_reg arg0 ) = {
00178    { emit_load1f_1, 
00179      emit_load1f_1, 
00180      emit_load1f_1, 
00181      emit_load1f_1 },
00182 
00183    { emit_load2f_1, 
00184      emit_load2f_2, 
00185      emit_load2f_2, 
00186      emit_load2f_2 },
00187 
00188    { emit_load3f_1, 
00189      emit_load3f_2, 
00190      emit_load3f_3, 
00191      emit_load3f_3 },
00192 
00193    { emit_load4f_1, 
00194      emit_load4f_2, 
00195      emit_load4f_3, 
00196      emit_load4f_4 } 
00197 };
00198 
00199 static void emit_load( struct x86_program *p,
00200                struct x86_reg dest,
00201                GLuint sz,
00202                struct x86_reg src,
00203                GLuint src_sz)
00204 {
00205    load[sz-1][src_sz-1](p, dest, src);
00206 }
00207 
00208 static void emit_store4f( struct x86_program *p,               
00209               struct x86_reg dest,
00210               struct x86_reg arg0 )
00211 {
00212    sse_movups(&p->func, dest, arg0);
00213 }
00214 
00215 static void emit_store3f( struct x86_program *p, 
00216               struct x86_reg dest,
00217               struct x86_reg arg0 )
00218 {
00219    if (p->outputs_safe) {
00220       /* Emit the extra dword anyway.  This may hurt writecombining,
00221        * may cause other problems.
00222        */
00223       sse_movups(&p->func, dest, arg0);
00224    }
00225    else {
00226       /* Alternate strategy - emit two, shuffle, emit one.
00227        */
00228       sse_movlps(&p->func, dest, arg0);
00229       sse_shufps(&p->func, arg0, arg0, SHUF(Z,Z,Z,Z) ); /* NOTE! destructive */
00230       sse_movss(&p->func, x86_make_disp(dest,8), arg0);
00231    }
00232 }
00233 
00234 static void emit_store2f( struct x86_program *p, 
00235                struct x86_reg dest,
00236                struct x86_reg arg0 )
00237 {
00238    sse_movlps(&p->func, dest, arg0);
00239 }
00240 
00241 static void emit_store1f( struct x86_program *p, 
00242               struct x86_reg dest,
00243               struct x86_reg arg0 )
00244 {
00245    sse_movss(&p->func, dest, arg0);
00246 }
00247 
00248 
00249 static void (*store[4])( struct x86_program *p, 
00250              struct x86_reg dest,
00251              struct x86_reg arg0 ) = 
00252 {
00253    emit_store1f, 
00254    emit_store2f, 
00255    emit_store3f, 
00256    emit_store4f 
00257 };
00258 
00259 static void emit_store( struct x86_program *p,
00260             struct x86_reg dest,
00261             GLuint sz,
00262             struct x86_reg temp )
00263 
00264 {
00265    store[sz-1](p, dest, temp);
00266 }
00267 
00268 static void emit_pack_store_4ub( struct x86_program *p,
00269                  struct x86_reg dest,
00270                  struct x86_reg temp )
00271 {
00272    /* Scale by 255.0
00273     */
00274    sse_mulps(&p->func, temp, p->chan0);
00275 
00276    if (p->have_sse2) {
00277       sse2_cvtps2dq(&p->func, temp, temp);
00278       sse2_packssdw(&p->func, temp, temp);
00279       sse2_packuswb(&p->func, temp, temp);
00280       sse_movss(&p->func, dest, temp);
00281    }
00282    else {
00283       struct x86_reg mmx0 = x86_make_reg(file_MMX, 0);
00284       struct x86_reg mmx1 = x86_make_reg(file_MMX, 1);
00285       sse_cvtps2pi(&p->func, mmx0, temp);
00286       sse_movhlps(&p->func, temp, temp);
00287       sse_cvtps2pi(&p->func, mmx1, temp);
00288       mmx_packssdw(&p->func, mmx0, mmx1);
00289       mmx_packuswb(&p->func, mmx0, mmx0);
00290       mmx_movd(&p->func, dest, mmx0);
00291    }
00292 }
00293 
00294 static GLint get_offset( const void *a, const void *b )
00295 {
00296    return (const char *)b - (const char *)a;
00297 }
00298 
00299 /* Not much happens here.  Eventually use this function to try and
00300  * avoid saving/reloading the source pointers each vertex (if some of
00301  * them can fit in registers).
00302  */
00303 static void get_src_ptr( struct x86_program *p,
00304              struct x86_reg srcREG,
00305              struct x86_reg vtxREG,
00306              struct tnl_clipspace_attr *a )
00307 {
00308    struct tnl_clipspace *vtx = GET_VERTEX_STATE(p->ctx);
00309    struct x86_reg ptr_to_src = x86_make_disp(vtxREG, get_offset(vtx, &a->inputptr));
00310 
00311    /* Load current a[j].inputptr
00312     */
00313    x86_mov(&p->func, srcREG, ptr_to_src);
00314 }
00315 
00316 static void update_src_ptr( struct x86_program *p,
00317              struct x86_reg srcREG,
00318              struct x86_reg vtxREG,
00319              struct tnl_clipspace_attr *a )
00320 {
00321    if (a->inputstride) {
00322       struct tnl_clipspace *vtx = GET_VERTEX_STATE(p->ctx);
00323       struct x86_reg ptr_to_src = x86_make_disp(vtxREG, get_offset(vtx, &a->inputptr));
00324 
00325       /* add a[j].inputstride (hardcoded value - could just as easily
00326        * pull the stride value from memory each time).
00327        */
00328       x86_lea(&p->func, srcREG, x86_make_disp(srcREG, a->inputstride));
00329       
00330       /* save new value of a[j].inputptr 
00331        */
00332       x86_mov(&p->func, ptr_to_src, srcREG);
00333    }
00334 }
00335 
00336 
00337 /* Lots of hardcoding
00338  *
00339  * EAX -- pointer to current output vertex
00340  * ECX -- pointer to current attribute 
00341  * 
00342  */
00343 static GLboolean build_vertex_emit( struct x86_program *p )
00344 {
00345    GLcontext *ctx = p->ctx;
00346    TNLcontext *tnl = TNL_CONTEXT(ctx);
00347    struct tnl_clipspace *vtx = GET_VERTEX_STATE(ctx);
00348    GLuint j = 0;
00349 
00350    struct x86_reg vertexEAX = x86_make_reg(file_REG32, reg_AX);
00351    struct x86_reg srcECX = x86_make_reg(file_REG32, reg_CX);
00352    struct x86_reg countEBP = x86_make_reg(file_REG32, reg_BP);
00353    struct x86_reg vtxESI = x86_make_reg(file_REG32, reg_SI);
00354    struct x86_reg temp = x86_make_reg(file_XMM, 0);
00355    struct x86_reg vp0 = x86_make_reg(file_XMM, 1);
00356    struct x86_reg vp1 = x86_make_reg(file_XMM, 2);
00357    struct x86_reg temp2 = x86_make_reg(file_XMM, 3);
00358    GLubyte *fixup, *label;
00359 
00360    /* Push a few regs?
00361     */
00362    x86_push(&p->func, countEBP);
00363    x86_push(&p->func, vtxESI);
00364 
00365 
00366    /* Get vertex count, compare to zero
00367     */
00368    x86_xor(&p->func, srcECX, srcECX);
00369    x86_mov(&p->func, countEBP, x86_fn_arg(&p->func, 2));
00370    x86_cmp(&p->func, countEBP, srcECX);
00371    fixup = x86_jcc_forward(&p->func, cc_E);
00372 
00373    /* Initialize destination register. 
00374     */
00375    x86_mov(&p->func, vertexEAX, x86_fn_arg(&p->func, 3));
00376 
00377    /* Dereference ctx to get tnl, then vtx:
00378     */
00379    x86_mov(&p->func, vtxESI, x86_fn_arg(&p->func, 1));
00380    x86_mov(&p->func, vtxESI, x86_make_disp(vtxESI, get_offset(ctx, &ctx->swtnl_context)));
00381    vtxESI = x86_make_disp(vtxESI, get_offset(tnl, &tnl->clipspace));
00382 
00383    
00384    /* Possibly load vp0, vp1 for viewport calcs:
00385     */
00386    if (vtx->need_viewport) {
00387       sse_movups(&p->func, vp0, x86_make_disp(vtxESI, get_offset(vtx, &vtx->vp_scale[0])));
00388       sse_movups(&p->func, vp1, x86_make_disp(vtxESI, get_offset(vtx, &vtx->vp_xlate[0])));
00389    }
00390 
00391    /* always load, needed or not:
00392     */
00393    sse_movups(&p->func, p->chan0, x86_make_disp(vtxESI, get_offset(vtx, &vtx->chan_scale[0])));
00394    sse_movups(&p->func, p->identity, x86_make_disp(vtxESI, get_offset(vtx, &vtx->identity[0])));
00395 
00396    /* Note address for loop jump */
00397    label = x86_get_label(&p->func);
00398 
00399    /* Emit code for each of the attributes.  Currently routes
00400     * everything through SSE registers, even when it might be more
00401     * efficient to stick with regular old x86.  No optimization or
00402     * other tricks - enough new ground to cover here just getting
00403     * things working.
00404     */
00405    while (j < vtx->attr_count) {
00406       struct tnl_clipspace_attr *a = &vtx->attr[j];
00407       struct x86_reg dest = x86_make_disp(vertexEAX, a->vertoffset);
00408 
00409       /* Now, load an XMM reg from src, perhaps transform, then save.
00410        * Could be shortcircuited in specific cases:
00411        */
00412       switch (a->format) {
00413       case EMIT_1F:
00414      get_src_ptr(p, srcECX, vtxESI, a);
00415      emit_load(p, temp, 1, x86_deref(srcECX), a->inputsize);
00416      emit_store(p, dest, 1, temp);
00417      update_src_ptr(p, srcECX, vtxESI, a);
00418      break;
00419       case EMIT_2F:
00420      get_src_ptr(p, srcECX, vtxESI, a);
00421      emit_load(p, temp, 2, x86_deref(srcECX), a->inputsize);
00422      emit_store(p, dest, 2, temp);
00423      update_src_ptr(p, srcECX, vtxESI, a);
00424      break;
00425       case EMIT_3F:
00426      /* Potentially the worst case - hardcode 2+1 copying:
00427       */
00428      if (0) {
00429         get_src_ptr(p, srcECX, vtxESI, a);
00430         emit_load(p, temp, 3, x86_deref(srcECX), a->inputsize);
00431         emit_store(p, dest, 3, temp);
00432         update_src_ptr(p, srcECX, vtxESI, a);
00433      }
00434      else {
00435         get_src_ptr(p, srcECX, vtxESI, a);
00436         emit_load(p, temp, 2, x86_deref(srcECX), a->inputsize);
00437         emit_store(p, dest, 2, temp);
00438         if (a->inputsize > 2) {
00439            emit_load(p, temp, 1, x86_make_disp(srcECX, 8), 1);
00440            emit_store(p, x86_make_disp(dest,8), 1, temp);
00441         }
00442         else {
00443            sse_movss(&p->func, x86_make_disp(dest,8), get_identity(p));
00444         }
00445         update_src_ptr(p, srcECX, vtxESI, a);
00446      }
00447      break;
00448       case EMIT_4F:
00449      get_src_ptr(p, srcECX, vtxESI, a);
00450      emit_load(p, temp, 4, x86_deref(srcECX), a->inputsize);
00451      emit_store(p, dest, 4, temp);
00452      update_src_ptr(p, srcECX, vtxESI, a);
00453      break;
00454       case EMIT_2F_VIEWPORT: 
00455      get_src_ptr(p, srcECX, vtxESI, a);
00456      emit_load(p, temp, 2, x86_deref(srcECX), a->inputsize);
00457      sse_mulps(&p->func, temp, vp0);
00458      sse_addps(&p->func, temp, vp1);
00459      emit_store(p, dest, 2, temp);
00460      update_src_ptr(p, srcECX, vtxESI, a);
00461      break;
00462       case EMIT_3F_VIEWPORT: 
00463      get_src_ptr(p, srcECX, vtxESI, a);
00464      emit_load(p, temp, 3, x86_deref(srcECX), a->inputsize);
00465      sse_mulps(&p->func, temp, vp0);
00466      sse_addps(&p->func, temp, vp1);
00467      emit_store(p, dest, 3, temp);
00468      update_src_ptr(p, srcECX, vtxESI, a);
00469      break;
00470       case EMIT_4F_VIEWPORT: 
00471      get_src_ptr(p, srcECX, vtxESI, a);
00472      emit_load(p, temp, 4, x86_deref(srcECX), a->inputsize);
00473      sse_mulps(&p->func, temp, vp0);
00474      sse_addps(&p->func, temp, vp1);
00475      emit_store(p, dest, 4, temp);
00476      update_src_ptr(p, srcECX, vtxESI, a);
00477      break;
00478       case EMIT_3F_XYW:
00479      get_src_ptr(p, srcECX, vtxESI, a);
00480      emit_load(p, temp, 4, x86_deref(srcECX), a->inputsize);
00481      sse_shufps(&p->func, temp, temp, SHUF(X,Y,W,Z));
00482      emit_store(p, dest, 3, temp);
00483      update_src_ptr(p, srcECX, vtxESI, a);
00484      break;
00485 
00486       case EMIT_1UB_1F:  
00487      /* Test for PAD3 + 1UB:
00488       */
00489      if (j > 0 &&
00490          a[-1].vertoffset + a[-1].vertattrsize <= a->vertoffset - 3)
00491      {
00492         get_src_ptr(p, srcECX, vtxESI, a);
00493         emit_load(p, temp, 1, x86_deref(srcECX), a->inputsize);
00494         sse_shufps(&p->func, temp, temp, SHUF(X,X,X,X));
00495         emit_pack_store_4ub(p, x86_make_disp(dest, -3), temp); /* overkill! */
00496         update_src_ptr(p, srcECX, vtxESI, a);
00497      }
00498      else {
00499         _mesa_printf("Can't emit 1ub %x %x %d\n", a->vertoffset, a[-1].vertoffset, a[-1].vertattrsize );
00500         return GL_FALSE;
00501      }
00502      break;
00503       case EMIT_3UB_3F_RGB:
00504       case EMIT_3UB_3F_BGR:
00505      /* Test for 3UB + PAD1:
00506       */
00507      if (j == vtx->attr_count - 1 ||
00508          a[1].vertoffset >= a->vertoffset + 4) {
00509         get_src_ptr(p, srcECX, vtxESI, a);
00510         emit_load(p, temp, 3, x86_deref(srcECX), a->inputsize);
00511         if (a->format == EMIT_3UB_3F_BGR)
00512            sse_shufps(&p->func, temp, temp, SHUF(Z,Y,X,W));
00513         emit_pack_store_4ub(p, dest, temp);
00514         update_src_ptr(p, srcECX, vtxESI, a);
00515      }
00516      /* Test for 3UB + 1UB:
00517       */
00518      else if (j < vtx->attr_count - 1 &&
00519           a[1].format == EMIT_1UB_1F &&
00520           a[1].vertoffset == a->vertoffset + 3) {
00521         get_src_ptr(p, srcECX, vtxESI, a);
00522         emit_load(p, temp, 3, x86_deref(srcECX), a->inputsize);
00523         update_src_ptr(p, srcECX, vtxESI, a);
00524 
00525         /* Make room for incoming value:
00526          */
00527         sse_shufps(&p->func, temp, temp, SHUF(W,X,Y,Z));
00528 
00529         get_src_ptr(p, srcECX, vtxESI, &a[1]);
00530         emit_load(p, temp2, 1, x86_deref(srcECX), a[1].inputsize);
00531         sse_movss(&p->func, temp, temp2);
00532         update_src_ptr(p, srcECX, vtxESI, &a[1]);
00533 
00534         /* Rearrange and possibly do BGR conversion:
00535          */
00536         if (a->format == EMIT_3UB_3F_BGR)
00537            sse_shufps(&p->func, temp, temp, SHUF(W,Z,Y,X));
00538         else
00539            sse_shufps(&p->func, temp, temp, SHUF(Y,Z,W,X));
00540 
00541         emit_pack_store_4ub(p, dest, temp);
00542         j++;        /* NOTE: two attrs consumed */
00543      }
00544      else {
00545         _mesa_printf("Can't emit 3ub\n");
00546         return GL_FALSE;    /* add this later */
00547      }
00548      break;
00549 
00550       case EMIT_4UB_4F_RGBA:
00551      get_src_ptr(p, srcECX, vtxESI, a);
00552      emit_load(p, temp, 4, x86_deref(srcECX), a->inputsize);
00553      emit_pack_store_4ub(p, dest, temp);
00554      update_src_ptr(p, srcECX, vtxESI, a);
00555      break;
00556       case EMIT_4UB_4F_BGRA:
00557      get_src_ptr(p, srcECX, vtxESI, a);
00558      emit_load(p, temp, 4, x86_deref(srcECX), a->inputsize);
00559      sse_shufps(&p->func, temp, temp, SHUF(Z,Y,X,W));
00560      emit_pack_store_4ub(p, dest, temp);
00561      update_src_ptr(p, srcECX, vtxESI, a);
00562      break;
00563       case EMIT_4UB_4F_ARGB:
00564      get_src_ptr(p, srcECX, vtxESI, a);
00565      emit_load(p, temp, 4, x86_deref(srcECX), a->inputsize);
00566      sse_shufps(&p->func, temp, temp, SHUF(W,X,Y,Z));
00567      emit_pack_store_4ub(p, dest, temp);
00568      update_src_ptr(p, srcECX, vtxESI, a);
00569      break;
00570       case EMIT_4UB_4F_ABGR:
00571      get_src_ptr(p, srcECX, vtxESI, a);
00572      emit_load(p, temp, 4, x86_deref(srcECX), a->inputsize);
00573      sse_shufps(&p->func, temp, temp, SHUF(W,Z,Y,X));
00574      emit_pack_store_4ub(p, dest, temp);
00575      update_src_ptr(p, srcECX, vtxESI, a);
00576      break;
00577       case EMIT_4CHAN_4F_RGBA:
00578      switch (CHAN_TYPE) {
00579      case GL_UNSIGNED_BYTE:
00580         get_src_ptr(p, srcECX, vtxESI, a);
00581         emit_load(p, temp, 4, x86_deref(srcECX), a->inputsize);
00582         emit_pack_store_4ub(p, dest, temp);
00583         update_src_ptr(p, srcECX, vtxESI, a);
00584         break;
00585      case GL_FLOAT:
00586         get_src_ptr(p, srcECX, vtxESI, a);
00587         emit_load(p, temp, 4, x86_deref(srcECX), a->inputsize);
00588         emit_store(p, dest, 4, temp);
00589         update_src_ptr(p, srcECX, vtxESI, a);
00590         break;
00591      case GL_UNSIGNED_SHORT:
00592      default:
00593         _mesa_printf("unknown CHAN_TYPE %s\n", _mesa_lookup_enum_by_nr(CHAN_TYPE));
00594         return GL_FALSE;
00595      }
00596      break;
00597       default:
00598      _mesa_printf("unknown a[%d].format %d\n", j, a->format);
00599      return GL_FALSE;   /* catch any new opcodes */
00600       }
00601       
00602       /* Increment j by at least 1 - may have been incremented above also:
00603        */
00604       j++;
00605    }
00606 
00607    /* Next vertex:
00608     */
00609    x86_lea(&p->func, vertexEAX, x86_make_disp(vertexEAX, vtx->vertex_size));
00610 
00611    /* decr count, loop if not zero
00612     */
00613    x86_dec(&p->func, countEBP);
00614    x86_test(&p->func, countEBP, countEBP); 
00615    x86_jcc(&p->func, cc_NZ, label);
00616 
00617    /* Exit mmx state?
00618     */
00619    if (p->func.need_emms)
00620       mmx_emms(&p->func);
00621 
00622    /* Land forward jump here:
00623     */
00624    x86_fixup_fwd_jump(&p->func, fixup);
00625 
00626    /* Pop regs and return
00627     */
00628    x86_pop(&p->func, x86_get_base_reg(vtxESI));
00629    x86_pop(&p->func, countEBP);
00630    x86_ret(&p->func);
00631 
00632    assert(!vtx->emit);
00633    vtx->emit = (tnl_emit_func)x86_get_func(&p->func);
00634 
00635    assert( (char *) p->func.csr - (char *) p->func.store <= MAX_SSE_CODE_SIZE );
00636    return GL_TRUE;
00637 }
00638 
00639 
00640 
00641 void _tnl_generate_sse_emit( GLcontext *ctx )
00642 {
00643    struct tnl_clipspace *vtx = GET_VERTEX_STATE(ctx);
00644    struct x86_program p;   
00645 
00646    if (!cpu_has_xmm) {
00647       vtx->codegen_emit = NULL;
00648       return;
00649    }
00650 
00651    _mesa_memset(&p, 0, sizeof(p));
00652 
00653    p.ctx = ctx;
00654    p.inputs_safe = 0;       /* for now */
00655    p.outputs_safe = 0;      /* for now */
00656    p.have_sse2 = cpu_has_xmm2;
00657    p.identity = x86_make_reg(file_XMM, 6);
00658    p.chan0 = x86_make_reg(file_XMM, 7);
00659 
00660    if (!x86_init_func_size(&p.func, MAX_SSE_CODE_SIZE)) {
00661       vtx->emit = NULL;
00662       return;
00663    }
00664 
00665    if (build_vertex_emit(&p)) {
00666       _tnl_register_fastpath( vtx, GL_TRUE );
00667    }
00668    else {
00669       /* Note the failure so that we don't keep trying to codegen an
00670        * impossible state:
00671        */
00672       _tnl_register_fastpath( vtx, GL_FALSE );
00673       x86_release_func(&p.func);
00674    }
00675 }
00676 
00677 #else
00678 
00679 void _tnl_generate_sse_emit( GLcontext *ctx )
00680 {
00681    /* Dummy version for when USE_SSE_ASM not defined */
00682 }
00683 
00684 #endif

Generated on Sat May 26 2012 04:19:36 for ReactOS by doxygen 1.7.6.1

ReactOS is a registered trademark or a trademark of ReactOS Foundation in the United States and other countries.