Home | Info | Community | Development | myReactOS | Contact Us
ReactOS Development > Doxygen > t_vertex_sse.c
Go to the documentation of this file.
00001 /* 00002 * Copyright 2003 Tungsten Graphics, inc. 00003 * All Rights Reserved. 00004 * 00005 * Permission is hereby granted, free of charge, to any person obtaining a 00006 * copy of this software and associated documentation files (the "Software"), 00007 * to deal in the Software without restriction, including without limitation 00008 * on the rights to use, copy, modify, merge, publish, distribute, sub 00009 * license, and/or sell copies of the Software, and to permit persons to whom 00010 * the Software is furnished to do so, subject to the following conditions: 00011 * 00012 * The above copyright notice and this permission notice (including the next 00013 * paragraph) shall be included in all copies or substantial portions of the 00014 * Software. 00015 * 00016 * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR 00017 * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, 00018 * FITNESS FOR A PARTICULAR PURPOSE AND NON-INFRINGEMENT. IN NO EVENT SHALL 00019 * TUNGSTEN GRAPHICS AND/OR THEIR SUPPLIERS BE LIABLE FOR ANY CLAIM, 00020 * DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR 00021 * OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE 00022 * USE OR OTHER DEALINGS IN THE SOFTWARE. 
00023 * 00024 * Authors: 00025 * Keith Whitwell <keithw@tungstengraphics.com> 00026 */ 00027 00028 #include "main/glheader.h" 00029 #include "main/context.h" 00030 #include "main/colormac.h" 00031 #include "main/simple_list.h" 00032 #include "main/enums.h" 00033 #include "t_context.h" 00034 #include "t_vertex.h" 00035 00036 #if defined(USE_SSE_ASM) 00037 00038 #include "x86/rtasm/x86sse.h" 00039 #include "x86/common_x86_asm.h" 00040 00041 00045 #define MAX_SSE_CODE_SIZE 1024 00046 00047 00048 #define X 0 00049 #define Y 1 00050 #define Z 2 00051 #define W 3 00052 00053 00054 struct x86_program { 00055 struct x86_function func; 00056 00057 GLcontext *ctx; 00058 GLboolean inputs_safe; 00059 GLboolean outputs_safe; 00060 GLboolean have_sse2; 00061 00062 struct x86_reg identity; 00063 struct x86_reg chan0; 00064 }; 00065 00066 00067 static struct x86_reg get_identity( struct x86_program *p ) 00068 { 00069 return p->identity; 00070 } 00071 00072 static void emit_load4f_4( struct x86_program *p, 00073 struct x86_reg dest, 00074 struct x86_reg arg0 ) 00075 { 00076 sse_movups(&p->func, dest, arg0); 00077 } 00078 00079 static void emit_load4f_3( struct x86_program *p, 00080 struct x86_reg dest, 00081 struct x86_reg arg0 ) 00082 { 00083 /* Have to jump through some hoops: 00084 * 00085 * c 0 0 0 00086 * c 0 0 1 00087 * 0 0 c 1 00088 * a b c 1 00089 */ 00090 sse_movss(&p->func, dest, x86_make_disp(arg0, 8)); 00091 sse_shufps(&p->func, dest, get_identity(p), SHUF(X,Y,Z,W) ); 00092 sse_shufps(&p->func, dest, dest, SHUF(Y,Z,X,W) ); 00093 sse_movlps(&p->func, dest, arg0); 00094 } 00095 00096 static void emit_load4f_2( struct x86_program *p, 00097 struct x86_reg dest, 00098 struct x86_reg arg0 ) 00099 { 00100 /* Initialize from identity, then pull in low two words: 00101 */ 00102 sse_movups(&p->func, dest, get_identity(p)); 00103 sse_movlps(&p->func, dest, arg0); 00104 } 00105 00106 static void emit_load4f_1( struct x86_program *p, 00107 struct x86_reg dest, 00108 struct x86_reg 
arg0 ) 00109 { 00110 /* Pull in low word, then swizzle in identity */ 00111 sse_movss(&p->func, dest, arg0); 00112 sse_shufps(&p->func, dest, get_identity(p), SHUF(X,Y,Z,W) ); 00113 } 00114 00115 00116 00117 static void emit_load3f_3( struct x86_program *p, 00118 struct x86_reg dest, 00119 struct x86_reg arg0 ) 00120 { 00121 /* Over-reads by 1 dword - potential SEGV if input is a vertex 00122 * array. 00123 */ 00124 if (p->inputs_safe) { 00125 sse_movups(&p->func, dest, arg0); 00126 } 00127 else { 00128 /* c 0 0 0 00129 * c c c c 00130 * a b c c 00131 */ 00132 sse_movss(&p->func, dest, x86_make_disp(arg0, 8)); 00133 sse_shufps(&p->func, dest, dest, SHUF(X,X,X,X)); 00134 sse_movlps(&p->func, dest, arg0); 00135 } 00136 } 00137 00138 static void emit_load3f_2( struct x86_program *p, 00139 struct x86_reg dest, 00140 struct x86_reg arg0 ) 00141 { 00142 emit_load4f_2(p, dest, arg0); 00143 } 00144 00145 static void emit_load3f_1( struct x86_program *p, 00146 struct x86_reg dest, 00147 struct x86_reg arg0 ) 00148 { 00149 /* Loading from memory erases the upper bits. */ 00150 sse_movss(&p->func, dest, arg0); 00151 } 00152 00153 static void emit_load2f_2( struct x86_program *p, 00154 struct x86_reg dest, 00155 struct x86_reg arg0 ) 00156 { 00157 sse_movlps(&p->func, dest, arg0); 00158 } 00159 00160 static void emit_load2f_1( struct x86_program *p, 00161 struct x86_reg dest, 00162 struct x86_reg arg0 ) 00163 { 00164 /* Loading from memory erases the upper bits. 
*/ 00165 sse_movss(&p->func, dest, arg0); 00166 } 00167 00168 static void emit_load1f_1( struct x86_program *p, 00169 struct x86_reg dest, 00170 struct x86_reg arg0 ) 00171 { 00172 sse_movss(&p->func, dest, arg0); 00173 } 00174 00175 static void (*load[4][4])( struct x86_program *p, 00176 struct x86_reg dest, 00177 struct x86_reg arg0 ) = { 00178 { emit_load1f_1, 00179 emit_load1f_1, 00180 emit_load1f_1, 00181 emit_load1f_1 }, 00182 00183 { emit_load2f_1, 00184 emit_load2f_2, 00185 emit_load2f_2, 00186 emit_load2f_2 }, 00187 00188 { emit_load3f_1, 00189 emit_load3f_2, 00190 emit_load3f_3, 00191 emit_load3f_3 }, 00192 00193 { emit_load4f_1, 00194 emit_load4f_2, 00195 emit_load4f_3, 00196 emit_load4f_4 } 00197 }; 00198 00199 static void emit_load( struct x86_program *p, 00200 struct x86_reg dest, 00201 GLuint sz, 00202 struct x86_reg src, 00203 GLuint src_sz) 00204 { 00205 load[sz-1][src_sz-1](p, dest, src); 00206 } 00207 00208 static void emit_store4f( struct x86_program *p, 00209 struct x86_reg dest, 00210 struct x86_reg arg0 ) 00211 { 00212 sse_movups(&p->func, dest, arg0); 00213 } 00214 00215 static void emit_store3f( struct x86_program *p, 00216 struct x86_reg dest, 00217 struct x86_reg arg0 ) 00218 { 00219 if (p->outputs_safe) { 00220 /* Emit the extra dword anyway. This may hurt writecombining, 00221 * may cause other problems. 00222 */ 00223 sse_movups(&p->func, dest, arg0); 00224 } 00225 else { 00226 /* Alternate strategy - emit two, shuffle, emit one. 00227 */ 00228 sse_movlps(&p->func, dest, arg0); 00229 sse_shufps(&p->func, arg0, arg0, SHUF(Z,Z,Z,Z) ); /* NOTE! 
destructive */ 00230 sse_movss(&p->func, x86_make_disp(dest,8), arg0); 00231 } 00232 } 00233 00234 static void emit_store2f( struct x86_program *p, 00235 struct x86_reg dest, 00236 struct x86_reg arg0 ) 00237 { 00238 sse_movlps(&p->func, dest, arg0); 00239 } 00240 00241 static void emit_store1f( struct x86_program *p, 00242 struct x86_reg dest, 00243 struct x86_reg arg0 ) 00244 { 00245 sse_movss(&p->func, dest, arg0); 00246 } 00247 00248 00249 static void (*store[4])( struct x86_program *p, 00250 struct x86_reg dest, 00251 struct x86_reg arg0 ) = 00252 { 00253 emit_store1f, 00254 emit_store2f, 00255 emit_store3f, 00256 emit_store4f 00257 }; 00258 00259 static void emit_store( struct x86_program *p, 00260 struct x86_reg dest, 00261 GLuint sz, 00262 struct x86_reg temp ) 00263 00264 { 00265 store[sz-1](p, dest, temp); 00266 } 00267 00268 static void emit_pack_store_4ub( struct x86_program *p, 00269 struct x86_reg dest, 00270 struct x86_reg temp ) 00271 { 00272 /* Scale by 255.0 00273 */ 00274 sse_mulps(&p->func, temp, p->chan0); 00275 00276 if (p->have_sse2) { 00277 sse2_cvtps2dq(&p->func, temp, temp); 00278 sse2_packssdw(&p->func, temp, temp); 00279 sse2_packuswb(&p->func, temp, temp); 00280 sse_movss(&p->func, dest, temp); 00281 } 00282 else { 00283 struct x86_reg mmx0 = x86_make_reg(file_MMX, 0); 00284 struct x86_reg mmx1 = x86_make_reg(file_MMX, 1); 00285 sse_cvtps2pi(&p->func, mmx0, temp); 00286 sse_movhlps(&p->func, temp, temp); 00287 sse_cvtps2pi(&p->func, mmx1, temp); 00288 mmx_packssdw(&p->func, mmx0, mmx1); 00289 mmx_packuswb(&p->func, mmx0, mmx0); 00290 mmx_movd(&p->func, dest, mmx0); 00291 } 00292 } 00293 00294 static GLint get_offset( const void *a, const void *b ) 00295 { 00296 return (const char *)b - (const char *)a; 00297 } 00298 00299 /* Not much happens here. Eventually use this function to try and 00300 * avoid saving/reloading the source pointers each vertex (if some of 00301 * them can fit in registers). 
00302 */ 00303 static void get_src_ptr( struct x86_program *p, 00304 struct x86_reg srcREG, 00305 struct x86_reg vtxREG, 00306 struct tnl_clipspace_attr *a ) 00307 { 00308 struct tnl_clipspace *vtx = GET_VERTEX_STATE(p->ctx); 00309 struct x86_reg ptr_to_src = x86_make_disp(vtxREG, get_offset(vtx, &a->inputptr)); 00310 00311 /* Load current a[j].inputptr 00312 */ 00313 x86_mov(&p->func, srcREG, ptr_to_src); 00314 } 00315 00316 static void update_src_ptr( struct x86_program *p, 00317 struct x86_reg srcREG, 00318 struct x86_reg vtxREG, 00319 struct tnl_clipspace_attr *a ) 00320 { 00321 if (a->inputstride) { 00322 struct tnl_clipspace *vtx = GET_VERTEX_STATE(p->ctx); 00323 struct x86_reg ptr_to_src = x86_make_disp(vtxREG, get_offset(vtx, &a->inputptr)); 00324 00325 /* add a[j].inputstride (hardcoded value - could just as easily 00326 * pull the stride value from memory each time). 00327 */ 00328 x86_lea(&p->func, srcREG, x86_make_disp(srcREG, a->inputstride)); 00329 00330 /* save new value of a[j].inputptr 00331 */ 00332 x86_mov(&p->func, ptr_to_src, srcREG); 00333 } 00334 } 00335 00336 00337 /* Lots of hardcoding 00338 * 00339 * EAX -- pointer to current output vertex 00340 * ECX -- pointer to current attribute 00341 * 00342 */ 00343 static GLboolean build_vertex_emit( struct x86_program *p ) 00344 { 00345 GLcontext *ctx = p->ctx; 00346 TNLcontext *tnl = TNL_CONTEXT(ctx); 00347 struct tnl_clipspace *vtx = GET_VERTEX_STATE(ctx); 00348 GLuint j = 0; 00349 00350 struct x86_reg vertexEAX = x86_make_reg(file_REG32, reg_AX); 00351 struct x86_reg srcECX = x86_make_reg(file_REG32, reg_CX); 00352 struct x86_reg countEBP = x86_make_reg(file_REG32, reg_BP); 00353 struct x86_reg vtxESI = x86_make_reg(file_REG32, reg_SI); 00354 struct x86_reg temp = x86_make_reg(file_XMM, 0); 00355 struct x86_reg vp0 = x86_make_reg(file_XMM, 1); 00356 struct x86_reg vp1 = x86_make_reg(file_XMM, 2); 00357 struct x86_reg temp2 = x86_make_reg(file_XMM, 3); 00358 GLubyte *fixup, *label; 00359 00360 /* 
Push a few regs? 00361 */ 00362 x86_push(&p->func, countEBP); 00363 x86_push(&p->func, vtxESI); 00364 00365 00366 /* Get vertex count, compare to zero 00367 */ 00368 x86_xor(&p->func, srcECX, srcECX); 00369 x86_mov(&p->func, countEBP, x86_fn_arg(&p->func, 2)); 00370 x86_cmp(&p->func, countEBP, srcECX); 00371 fixup = x86_jcc_forward(&p->func, cc_E); 00372 00373 /* Initialize destination register. 00374 */ 00375 x86_mov(&p->func, vertexEAX, x86_fn_arg(&p->func, 3)); 00376 00377 /* Dereference ctx to get tnl, then vtx: 00378 */ 00379 x86_mov(&p->func, vtxESI, x86_fn_arg(&p->func, 1)); 00380 x86_mov(&p->func, vtxESI, x86_make_disp(vtxESI, get_offset(ctx, &ctx->swtnl_context))); 00381 vtxESI = x86_make_disp(vtxESI, get_offset(tnl, &tnl->clipspace)); 00382 00383 00384 /* Possibly load vp0, vp1 for viewport calcs: 00385 */ 00386 if (vtx->need_viewport) { 00387 sse_movups(&p->func, vp0, x86_make_disp(vtxESI, get_offset(vtx, &vtx->vp_scale[0]))); 00388 sse_movups(&p->func, vp1, x86_make_disp(vtxESI, get_offset(vtx, &vtx->vp_xlate[0]))); 00389 } 00390 00391 /* always load, needed or not: 00392 */ 00393 sse_movups(&p->func, p->chan0, x86_make_disp(vtxESI, get_offset(vtx, &vtx->chan_scale[0]))); 00394 sse_movups(&p->func, p->identity, x86_make_disp(vtxESI, get_offset(vtx, &vtx->identity[0]))); 00395 00396 /* Note address for loop jump */ 00397 label = x86_get_label(&p->func); 00398 00399 /* Emit code for each of the attributes. Currently routes 00400 * everything through SSE registers, even when it might be more 00401 * efficient to stick with regular old x86. No optimization or 00402 * other tricks - enough new ground to cover here just getting 00403 * things working. 00404 */ 00405 while (j < vtx->attr_count) { 00406 struct tnl_clipspace_attr *a = &vtx->attr[j]; 00407 struct x86_reg dest = x86_make_disp(vertexEAX, a->vertoffset); 00408 00409 /* Now, load an XMM reg from src, perhaps transform, then save. 
00410 * Could be shortcircuited in specific cases: 00411 */ 00412 switch (a->format) { 00413 case EMIT_1F: 00414 get_src_ptr(p, srcECX, vtxESI, a); 00415 emit_load(p, temp, 1, x86_deref(srcECX), a->inputsize); 00416 emit_store(p, dest, 1, temp); 00417 update_src_ptr(p, srcECX, vtxESI, a); 00418 break; 00419 case EMIT_2F: 00420 get_src_ptr(p, srcECX, vtxESI, a); 00421 emit_load(p, temp, 2, x86_deref(srcECX), a->inputsize); 00422 emit_store(p, dest, 2, temp); 00423 update_src_ptr(p, srcECX, vtxESI, a); 00424 break; 00425 case EMIT_3F: 00426 /* Potentially the worst case - hardcode 2+1 copying: 00427 */ 00428 if (0) { 00429 get_src_ptr(p, srcECX, vtxESI, a); 00430 emit_load(p, temp, 3, x86_deref(srcECX), a->inputsize); 00431 emit_store(p, dest, 3, temp); 00432 update_src_ptr(p, srcECX, vtxESI, a); 00433 } 00434 else { 00435 get_src_ptr(p, srcECX, vtxESI, a); 00436 emit_load(p, temp, 2, x86_deref(srcECX), a->inputsize); 00437 emit_store(p, dest, 2, temp); 00438 if (a->inputsize > 2) { 00439 emit_load(p, temp, 1, x86_make_disp(srcECX, 8), 1); 00440 emit_store(p, x86_make_disp(dest,8), 1, temp); 00441 } 00442 else { 00443 sse_movss(&p->func, x86_make_disp(dest,8), get_identity(p)); 00444 } 00445 update_src_ptr(p, srcECX, vtxESI, a); 00446 } 00447 break; 00448 case EMIT_4F: 00449 get_src_ptr(p, srcECX, vtxESI, a); 00450 emit_load(p, temp, 4, x86_deref(srcECX), a->inputsize); 00451 emit_store(p, dest, 4, temp); 00452 update_src_ptr(p, srcECX, vtxESI, a); 00453 break; 00454 case EMIT_2F_VIEWPORT: 00455 get_src_ptr(p, srcECX, vtxESI, a); 00456 emit_load(p, temp, 2, x86_deref(srcECX), a->inputsize); 00457 sse_mulps(&p->func, temp, vp0); 00458 sse_addps(&p->func, temp, vp1); 00459 emit_store(p, dest, 2, temp); 00460 update_src_ptr(p, srcECX, vtxESI, a); 00461 break; 00462 case EMIT_3F_VIEWPORT: 00463 get_src_ptr(p, srcECX, vtxESI, a); 00464 emit_load(p, temp, 3, x86_deref(srcECX), a->inputsize); 00465 sse_mulps(&p->func, temp, vp0); 00466 sse_addps(&p->func, temp, vp1); 00467 
emit_store(p, dest, 3, temp); 00468 update_src_ptr(p, srcECX, vtxESI, a); 00469 break; 00470 case EMIT_4F_VIEWPORT: 00471 get_src_ptr(p, srcECX, vtxESI, a); 00472 emit_load(p, temp, 4, x86_deref(srcECX), a->inputsize); 00473 sse_mulps(&p->func, temp, vp0); 00474 sse_addps(&p->func, temp, vp1); 00475 emit_store(p, dest, 4, temp); 00476 update_src_ptr(p, srcECX, vtxESI, a); 00477 break; 00478 case EMIT_3F_XYW: 00479 get_src_ptr(p, srcECX, vtxESI, a); 00480 emit_load(p, temp, 4, x86_deref(srcECX), a->inputsize); 00481 sse_shufps(&p->func, temp, temp, SHUF(X,Y,W,Z)); 00482 emit_store(p, dest, 3, temp); 00483 update_src_ptr(p, srcECX, vtxESI, a); 00484 break; 00485 00486 case EMIT_1UB_1F: 00487 /* Test for PAD3 + 1UB: 00488 */ 00489 if (j > 0 && 00490 a[-1].vertoffset + a[-1].vertattrsize <= a->vertoffset - 3) 00491 { 00492 get_src_ptr(p, srcECX, vtxESI, a); 00493 emit_load(p, temp, 1, x86_deref(srcECX), a->inputsize); 00494 sse_shufps(&p->func, temp, temp, SHUF(X,X,X,X)); 00495 emit_pack_store_4ub(p, x86_make_disp(dest, -3), temp); /* overkill! 
*/ 00496 update_src_ptr(p, srcECX, vtxESI, a); 00497 } 00498 else { 00499 _mesa_printf("Can't emit 1ub %x %x %d\n", a->vertoffset, a[-1].vertoffset, a[-1].vertattrsize ); 00500 return GL_FALSE; 00501 } 00502 break; 00503 case EMIT_3UB_3F_RGB: 00504 case EMIT_3UB_3F_BGR: 00505 /* Test for 3UB + PAD1: 00506 */ 00507 if (j == vtx->attr_count - 1 || 00508 a[1].vertoffset >= a->vertoffset + 4) { 00509 get_src_ptr(p, srcECX, vtxESI, a); 00510 emit_load(p, temp, 3, x86_deref(srcECX), a->inputsize); 00511 if (a->format == EMIT_3UB_3F_BGR) 00512 sse_shufps(&p->func, temp, temp, SHUF(Z,Y,X,W)); 00513 emit_pack_store_4ub(p, dest, temp); 00514 update_src_ptr(p, srcECX, vtxESI, a); 00515 } 00516 /* Test for 3UB + 1UB: 00517 */ 00518 else if (j < vtx->attr_count - 1 && 00519 a[1].format == EMIT_1UB_1F && 00520 a[1].vertoffset == a->vertoffset + 3) { 00521 get_src_ptr(p, srcECX, vtxESI, a); 00522 emit_load(p, temp, 3, x86_deref(srcECX), a->inputsize); 00523 update_src_ptr(p, srcECX, vtxESI, a); 00524 00525 /* Make room for incoming value: 00526 */ 00527 sse_shufps(&p->func, temp, temp, SHUF(W,X,Y,Z)); 00528 00529 get_src_ptr(p, srcECX, vtxESI, &a[1]); 00530 emit_load(p, temp2, 1, x86_deref(srcECX), a[1].inputsize); 00531 sse_movss(&p->func, temp, temp2); 00532 update_src_ptr(p, srcECX, vtxESI, &a[1]); 00533 00534 /* Rearrange and possibly do BGR conversion: 00535 */ 00536 if (a->format == EMIT_3UB_3F_BGR) 00537 sse_shufps(&p->func, temp, temp, SHUF(W,Z,Y,X)); 00538 else 00539 sse_shufps(&p->func, temp, temp, SHUF(Y,Z,W,X)); 00540 00541 emit_pack_store_4ub(p, dest, temp); 00542 j++; /* NOTE: two attrs consumed */ 00543 } 00544 else { 00545 _mesa_printf("Can't emit 3ub\n"); 00546 return GL_FALSE; /* add this later */ 00547 } 00548 break; 00549 00550 case EMIT_4UB_4F_RGBA: 00551 get_src_ptr(p, srcECX, vtxESI, a); 00552 emit_load(p, temp, 4, x86_deref(srcECX), a->inputsize); 00553 emit_pack_store_4ub(p, dest, temp); 00554 update_src_ptr(p, srcECX, vtxESI, a); 00555 break; 00556 case 
EMIT_4UB_4F_BGRA: 00557 get_src_ptr(p, srcECX, vtxESI, a); 00558 emit_load(p, temp, 4, x86_deref(srcECX), a->inputsize); 00559 sse_shufps(&p->func, temp, temp, SHUF(Z,Y,X,W)); 00560 emit_pack_store_4ub(p, dest, temp); 00561 update_src_ptr(p, srcECX, vtxESI, a); 00562 break; 00563 case EMIT_4UB_4F_ARGB: 00564 get_src_ptr(p, srcECX, vtxESI, a); 00565 emit_load(p, temp, 4, x86_deref(srcECX), a->inputsize); 00566 sse_shufps(&p->func, temp, temp, SHUF(W,X,Y,Z)); 00567 emit_pack_store_4ub(p, dest, temp); 00568 update_src_ptr(p, srcECX, vtxESI, a); 00569 break; 00570 case EMIT_4UB_4F_ABGR: 00571 get_src_ptr(p, srcECX, vtxESI, a); 00572 emit_load(p, temp, 4, x86_deref(srcECX), a->inputsize); 00573 sse_shufps(&p->func, temp, temp, SHUF(W,Z,Y,X)); 00574 emit_pack_store_4ub(p, dest, temp); 00575 update_src_ptr(p, srcECX, vtxESI, a); 00576 break; 00577 case EMIT_4CHAN_4F_RGBA: 00578 switch (CHAN_TYPE) { 00579 case GL_UNSIGNED_BYTE: 00580 get_src_ptr(p, srcECX, vtxESI, a); 00581 emit_load(p, temp, 4, x86_deref(srcECX), a->inputsize); 00582 emit_pack_store_4ub(p, dest, temp); 00583 update_src_ptr(p, srcECX, vtxESI, a); 00584 break; 00585 case GL_FLOAT: 00586 get_src_ptr(p, srcECX, vtxESI, a); 00587 emit_load(p, temp, 4, x86_deref(srcECX), a->inputsize); 00588 emit_store(p, dest, 4, temp); 00589 update_src_ptr(p, srcECX, vtxESI, a); 00590 break; 00591 case GL_UNSIGNED_SHORT: 00592 default: 00593 _mesa_printf("unknown CHAN_TYPE %s\n", _mesa_lookup_enum_by_nr(CHAN_TYPE)); 00594 return GL_FALSE; 00595 } 00596 break; 00597 default: 00598 _mesa_printf("unknown a[%d].format %d\n", j, a->format); 00599 return GL_FALSE; /* catch any new opcodes */ 00600 } 00601 00602 /* Increment j by at least 1 - may have been incremented above also: 00603 */ 00604 j++; 00605 } 00606 00607 /* Next vertex: 00608 */ 00609 x86_lea(&p->func, vertexEAX, x86_make_disp(vertexEAX, vtx->vertex_size)); 00610 00611 /* decr count, loop if not zero 00612 */ 00613 x86_dec(&p->func, countEBP); 00614 x86_test(&p->func, 
countEBP, countEBP); 00615 x86_jcc(&p->func, cc_NZ, label); 00616 00617 /* Exit mmx state? 00618 */ 00619 if (p->func.need_emms) 00620 mmx_emms(&p->func); 00621 00622 /* Land forward jump here: 00623 */ 00624 x86_fixup_fwd_jump(&p->func, fixup); 00625 00626 /* Pop regs and return 00627 */ 00628 x86_pop(&p->func, x86_get_base_reg(vtxESI)); 00629 x86_pop(&p->func, countEBP); 00630 x86_ret(&p->func); 00631 00632 assert(!vtx->emit); 00633 vtx->emit = (tnl_emit_func)x86_get_func(&p->func); 00634 00635 assert( (char *) p->func.csr - (char *) p->func.store <= MAX_SSE_CODE_SIZE ); 00636 return GL_TRUE; 00637 } 00638 00639 00640 00641 void _tnl_generate_sse_emit( GLcontext *ctx ) 00642 { 00643 struct tnl_clipspace *vtx = GET_VERTEX_STATE(ctx); 00644 struct x86_program p; 00645 00646 if (!cpu_has_xmm) { 00647 vtx->codegen_emit = NULL; 00648 return; 00649 } 00650 00651 _mesa_memset(&p, 0, sizeof(p)); 00652 00653 p.ctx = ctx; 00654 p.inputs_safe = 0; /* for now */ 00655 p.outputs_safe = 0; /* for now */ 00656 p.have_sse2 = cpu_has_xmm2; 00657 p.identity = x86_make_reg(file_XMM, 6); 00658 p.chan0 = x86_make_reg(file_XMM, 7); 00659 00660 if (!x86_init_func_size(&p.func, MAX_SSE_CODE_SIZE)) { 00661 vtx->emit = NULL; 00662 return; 00663 } 00664 00665 if (build_vertex_emit(&p)) { 00666 _tnl_register_fastpath( vtx, GL_TRUE ); 00667 } 00668 else { 00669 /* Note the failure so that we don't keep trying to codegen an 00670 * impossible state: 00671 */ 00672 _tnl_register_fastpath( vtx, GL_FALSE ); 00673 x86_release_func(&p.func); 00674 } 00675 } 00676 00677 #else 00678 00679 void _tnl_generate_sse_emit( GLcontext *ctx ) 00680 { 00681 /* Dummy version for when USE_SSE_ASM not defined */ 00682 } 00683 00684 #endif Generated on Sat May 26 2012 04:19:36 for ReactOS by
1.7.6.1
|