先看图,一次drawcall中有多个角色被渲染出来,并且vb中只有一个角色模型,这些角色的动作并不完全一样。
dx9能力有限,只能做静态物体的instancing。从dx10开始,每个constant buffer最多可以容纳4096个float4常量,贴图格式可以是R32G32B32A32_FLOAT,并且每个实例在shader里都有一个整型的实例id(SV_InstanceID)与之对应。借助这些新能力,把动画数据存储在贴图里,把每个实例的位置矩阵存在常量寄存器里,就可以同时实现skinning和instancing。GPU Gems 3 第2章《Animated Crowd Rendering》介绍的就是这种方法。
可以看到第一个贴图data only应该就是动画数据
但是shader常量只有12个float4,而人物有14个,所以估计每个实例的位置并不是通过常量寄存器设置的。
看vs shader
//
// Generated by Microsoft (R) D3D Shader Disassembler
//
//
// Input signature:
//
// Name Index Mask Register SysValue Format Used
// -------------------- ----- ------ -------- -------- ------- ------
// POSITION 0 xyz 0 NONE float xyz
// BLENDWEIGHT 0 xyzw 1 NONE float xyz
// BLENDINDICES 0 xyzw 2 NONE uint xyz
// NORMAL 0 xyz 3 NONE float xyz
// TEXCOORD 0 xy 4 NONE float xy
// BLENDINDICES 1 x 5 NONE uint x
//
//
// Output signature:
//
// Name Index Mask Register SysValue Format Used
// -------------------- ----- ------ -------- -------- ------- ------
// SV_Position 0 xyzw 0 POS float xyzw
// TEXCOORD 0 xyzw 1 NONE float xyzw
// TEXCOORD 1 xyzw 2 NONE float xyzw
// TEXCOORD 2 xyz 3 NONE float xyz
// TEXCOORD 3 xyz 4 NONE float xyz
// InstID 0 x 5 NONE uint x
//
// Vertex shader (shader model 5.0 disassembly).
// Blends up to three bone transforms fetched from buffer t27 using the
// weights in v1.xyz and the bone indices in v2.xyz, then transforms the
// blended position by the four rows cb1[4..7] to produce SV_Position.
vs_5_0
dcl_globalFlags refactoringAllowed
// cb1 is declared with 8 float4 entries; this shader reads cb1[3]..cb1[7] only.
dcl_constantbuffer cb1[8], immediateIndexed
// t27 is a buffer resource; every bone row below is loaded from it.
dcl_resource_buffer (mixed,mixed,mixed,mixed) t27 ----------动画数据贴图
// Inputs, per the input signature: v0 POSITION, v1 BLENDWEIGHT,
// v2 BLENDINDICES0, v3 NORMAL, v4 TEXCOORD0, v5 BLENDINDICES1.
dcl_input v0.xyz
dcl_input v1.xyz
dcl_input v2.xyz
dcl_input v3.xyz
dcl_input v4.xy
dcl_input v5.x -----------------------每个instance相关的数据
dcl_output_siv o0.xyzw, position
dcl_output o1.xyzw
dcl_output o2.xyzw
dcl_output o3.xyz
dcl_output o4.xyz
dcl_output o5.x
dcl_temps 7
// r0.x = v5.x * 98 (low 32 bits; the high half is discarded via null).
// This is the base element offset into t27 shared by all three bones.
imul null, r0.x, v5.x, l(98) ------------------乘98
// r0.y = v2.x * 3 + r0.x: offset for the first bone (3 elements per bone).
imad r0.y, v2.x, l(3), r0.x ------------------加上(骨骼id乘3)
// --- Bone 0: load three float4 rows at r0.y + 2, + 3, + 4 into r1, r4, r5.
iadd r0.z, l(2), r0.y ------------加2
ld_indexable(buffer)(mixed,mixed,mixed,mixed) r1.xyzw, r0.zzzz, t27.xyzw -- 从贴图取数据
// r2 = (position.xyz, 1), the homogeneous position used by every dp4 below.
mov r2.xyz, v0.xyzx
mov r2.w, l(1.000000)
dp4 r3.x, r2.xyzw, r1.xyzw
iadd r0.z, l(3), r0.y -------------加3
ld_indexable(buffer)(mixed,mixed,mixed,mixed) r4.xyzw, r0.zzzz, t27.xyzw -- 再取数据
dp4 r3.y, r2.xyzw, r4.xyzw
iadd r0.y, l(4), r0.y -------------加4
ld_indexable(buffer)(mixed,mixed,mixed,mixed) r5.xyzw, r0.yyyy, t27.xyzw --------------第三次取数据,现在够组成一个骨骼的矩阵了
dp4 r3.z, r2.xyzw, r5.xyzw
// r0.yzw = bone-0 transformed position scaled by weight v1.x
// (running position accumulator).
mul r0.yzw, r3.xxyz, v1.xxxx
// Normal: dot v3 with the xyz part of each row (translation column unused),
// then scale by v1.x. r1.xyz is the running normal accumulator.
dp3 r1.x, v3.xyzx, r1.xyzx
dp3 r1.y, v3.xyzx, r4.xyzx
dp3 r1.z, v3.xyzx, r5.xyzx
mul r1.xyz, r1.xyzx, v1.xxxx
// --- Bone 1: processed only when weight v1.y > 0.
lt r3.x, l(0.000000), v1.y
if_nz r3.x ---------顶点的第二个权重骨骼,这里有缩进的话会好看些
// r3.x = v2.y * 3 + base offset; rows land in r4, r6, r3.
imad r3.x, v2.y, l(3), r0.x
iadd r3.y, l(2), r3.x
ld_indexable(buffer)(mixed,mixed,mixed,mixed) r4.xyzw, r3.yyyy, t27.xyzw
dp4 r5.x, r2.xyzw, r4.xyzw
iadd r3.y, l(3), r3.x
ld_indexable(buffer)(mixed,mixed,mixed,mixed) r6.xyzw, r3.yyyy, t27.xyzw
dp4 r5.y, r2.xyzw, r6.xyzw
iadd r3.x, l(4), r3.x
// r3 is overwritten with the third row; its index was read before the write.
ld_indexable(buffer)(mixed,mixed,mixed,mixed) r3.xyzw, r3.xxxx, t27.xyzw
dp4 r5.z, r2.xyzw, r3.xyzw
// position accumulator += v1.y * bone-1 position
mad r0.yzw, v1.yyyy, r5.xxyz, r0.yyzw
dp3 r4.x, v3.xyzx, r4.xyzx
dp3 r4.y, v3.xyzx, r6.xyzx
dp3 r4.z, v3.xyzx, r3.xyzx
// normal accumulator += v1.y * bone-1 normal
mad r1.xyz, v1.yyyy, r4.xyzx, r1.xyzx
// --- Bone 2: processed only when weight v1.z > 0. This test is nested
// inside the bone-1 branch, so it is skipped whenever v1.y <= 0.
lt r3.x, l(0.000000), v1.z
if_nz r3.x ---------顶点的第三个权重骨骼
// r0.x (the base offset) is overwritten here; it is not read again after
// this branch. Rows land in r3, r5, r6.
imad r0.x, v2.z, l(3), r0.x
iadd r3.x, l(2), r0.x
ld_indexable(buffer)(mixed,mixed,mixed,mixed) r3.xyzw, r3.xxxx, t27.xyzw
dp4 r4.x, r2.xyzw, r3.xyzw
iadd r3.w, l(3), r0.x
ld_indexable(buffer)(mixed,mixed,mixed,mixed) r5.xyzw, r3.wwww, t27.xyzw
dp4 r4.y, r2.xyzw, r5.xyzw
iadd r0.x, l(4), r0.x
ld_indexable(buffer)(mixed,mixed,mixed,mixed) r6.xyzw, r0.xxxx, t27.xyzw
dp4 r4.z, r2.xyzw, r6.xyzw
// position accumulator += v1.z * bone-2 position
mad r0.yzw, v1.zzzz, r4.xxyz, r0.yyzw
// r2.xyz is reused as scratch; the homogeneous position is no longer needed.
dp3 r2.x, v3.xyzx, r3.xyzx
dp3 r2.y, v3.xyzx, r5.xyzx
dp3 r2.z, v3.xyzx, r6.xyzx
// normal accumulator += v1.z * bone-2 normal
mad r1.xyz, v1.zzzz, r2.xyzx, r1.xyzx
endif
endif ---------------三个权重骨骼计算完毕
// r2 = pos.x * cb1[4] + pos.y * cb1[5] + pos.z * cb1[6] + cb1[7], where
// pos = r0.yzw is the blended position.
// NOTE(review): cb1[4..7] is presumably a view-projection matrix — confirm
// against the application's constant buffer layout.
mul r2.xyzw, r0.zzzz, cb1[5].xyzw
mad r2.xyzw, r0.yyyy, cb1[4].xyzw, r2.xyzw
mad r2.xyzw, r0.wwww, cb1[6].xyzw, r2.xyzw
add r2.xyzw, r2.xyzw, cb1[7].xyzw
// o2.xyz = blended position - cb1[3].xyz.
// NOTE(review): cb1[3] looks like a camera/eye position — confirm.
add o2.xyz, r0.yzwy, -cb1[3].xyzx
// o0 (SV_Position) = transformed position.
mov o0.xyzw, r2.xyzw
// o1 = (blended normal xyz, texcoord.x); the normal is not renormalized here.
mov r1.w, v4.x
mov o1.xyzw, r1.xyzw
// o2.w = texcoord.y
mov o2.w, v4.y
// o3.xyz = blended position before the cb1[4..7] transform.
mov o3.xyz, r0.yzwy
// o4.xyz = (x, y, w) of the transformed position.
mov o4.xyz, r2.xywx
// o5.x passes the per-instance value v5.x through unchanged (InstID output).
mov o5.x, v5.x
ret
// Approximately 0 instruction slots used
每个骨骼取数据的方式是 perInstData * 98 + boneId * 3 + 2、perInstData * 98 + boneId * 3 + 3、perInstData * 98 + boneId * 3 + 4,三次各取一个float4,合起来就是一个骨骼的矩阵。猜测perInstData应该是动画的时间(即帧序号),由cpu在每次drawcall时设置。一帧的动画数据用了98个float4:每个骨骼占3个float4,32个骨骼共96个,所以可以表示32个骨骼;从偏移+2可以看出,每帧开头还多出2个float4,会在ps中使用。如果每秒用30帧数据,这张贴图能表示22秒,已经挺多的了。