從2010年起,基於GPGPU的通用目的計算隨著OpenCL以及CUDA的大熱而變得異常火熱。而基於GPU的通用目的計算,其實從本質上來說就是通過GPU內部的Compute Shader來完成的。而OpenCL以及CUDA則是將主機端與GPU端的通信接口做了更為標準化的統一。而在最近這幾年中,除了OpenCL與CUDA之外,還有像微軟發布的C++ AMP,還有最近被融合到OpenMP的OpenACC等工具,這些都是利用GPU的大規模數據級並行計算來做數據級密集通用目的計算的。
而現在在高性能計算領域,用得比較多的仍然是CUDA與OpenCL。但是對應用開發者來說,如果我們要將一個應用上傳到Windows Store,那麼我們只能使用微軟官方出的API;同理,我們如果要將應用上傳到App Store,那麼也只能使用Apple推出的Metal API。由於Metal API在使用上來說非常簡便,並且Apple在編程指南上都有詳細的描述以及demo提供,所以各位要參考基於Metal API的通用目的計算,直接上Apple開發者官網即可。而基於Direct3D的Compute shader構建起來比較繁瑣,而且完整使用的例子也較少,這裡將提供一份完整的,基於純C語言的demo。
以下代碼部分都用到了一些C99標准中所引入的語法特性以及庫文件,所以各位應該至少在Visual Studio 2013上,最好是Visual Studio 2015上編寫以下代碼。筆者用的開發環境是Visual Studio 2015 Express Edition for Desktop,這是微軟免費的IDE,盡管自帶的工具不多,但夠用。
我們首先創建一個名為SimpleCS的Windows Console Application,然後在Application Settings中將復選框裡的鉤子全都去掉,然後勾選上Empty Project。然後我們添加main.c文件。根據這篇博文設置項目選項:http://blog.csdn.net/zenny_chen/article/details/52938512
然後在鏈接庫選項中,把所有的12改成11即可。因為我們這裡要用的是Direct3D 11,而不是12。12用起來非常繁瑣,而且有幾個C API的實現還有bug,等它穩定了之後我會再介紹Direct3D 12中使用Compute Shader的例子。然後,仍然選擇x64進行構建。
以下是main.c的內容:
// compute shader簡單示例
#include <stdbool.h>
#include <stdint.h>
#include <stdio.h>
#include <stdlib.h>
#include <crtdbg.h>
#include <Windows.h>
#include <d3d11.h>
#include <d3dcompiler.h>
/* Number of elements in each host-side array declared below. */
#define NUM_ELEMENTS 2048

/* One buffer element: an integer paired with a float. */
struct BufType
{
    int i;
    float f;
};

/* Host-side source arrays; main() fills them and hands them to the GPU buffers. */
static struct BufType s_vBuf0[NUM_ELEMENTS];
static struct BufType s_vBuf1[NUM_ELEMENTS];
/**
 * Creates a hardware Direct3D 11 device and its immediate context.
 *
 * The debug layer is requested first. Because D3D11_CREATE_DEVICE_DEBUG makes
 * device creation fail on machines where the SDK debug layer is not installed,
 * a failed attempt is retried once without that flag.
 *
 * @param ppDeviceOut  receives the device; set to NULL on entry.
 * @param ppContextOut receives the immediate context; set to NULL on entry.
 * @return true when a device was created, false otherwise.
 */
static bool CreateComputeDevice(ID3D11Device** ppDeviceOut, ID3D11DeviceContext** ppContextOut)
{
    *ppDeviceOut = NULL;
    *ppContextOut = NULL;

    const uint32_t uBaseFlags = D3D11_CREATE_DEVICE_SINGLETHREADED;
    const D3D_FEATURE_LEVEL flvl[] = { D3D_FEATURE_LEVEL_11_1, D3D_FEATURE_LEVEL_11_0 };
    D3D_FEATURE_LEVEL flOut;

    // First attempt: with the debug layer enabled.
    bool result = D3D11CreateDevice(NULL, D3D_DRIVER_TYPE_HARDWARE, NULL,
        uBaseFlags | D3D11_CREATE_DEVICE_DEBUG, flvl, sizeof(flvl) / sizeof(flvl[0]),
        D3D11_SDK_VERSION, ppDeviceOut, &flOut, ppContextOut) >= 0;

    if (!result)
    {
        // Second attempt: same request without the debug layer.
        result = D3D11CreateDevice(NULL, D3D_DRIVER_TYPE_HARDWARE, NULL,
            uBaseFlags, flvl, sizeof(flvl) / sizeof(flvl[0]),
            D3D11_SDK_VERSION, ppDeviceOut, &flOut, ppContextOut) >= 0;
    }

    // The major version is printed from bits 12 and up, the minor from bits 8-11.
    if (result)
        printf("Currently use Direct3D level: %d.%d\n", flOut >> 12, (flOut >> 8) & 0xf);

    return result;
}
/**
 * Creates a structured buffer that can be bound both as a shader resource
 * and as an unordered access view.
 *
 * @param pDevice     device used to create the buffer.
 * @param elementSize size in bytes of one element (also the structure stride).
 * @param uCount      number of elements.
 * @param pInitData   initial contents, or NULL to create the buffer without any.
 * @param ppBufferOut receives the buffer; set to NULL on entry.
 * @return true when CreateBuffer succeeded.
 */
static bool CreateStructureBuffer(ID3D11Device* pDevice, uint32_t elementSize, uint32_t uCount,
    void* pInitData, ID3D11Buffer** ppBufferOut)
{
    *ppBufferOut = NULL;

    D3D11_BUFFER_DESC bufferDesc;
    ZeroMemory(&bufferDesc, sizeof(bufferDesc));
    bufferDesc.ByteWidth = elementSize*uCount;
    bufferDesc.StructureByteStride = elementSize;
    bufferDesc.MiscFlags = D3D11_RESOURCE_MISC_BUFFER_STRUCTURED;
    bufferDesc.BindFlags = D3D11_BIND_UNORDERED_ACCESS | D3D11_BIND_SHADER_RESOURCE;

    // Only hand an initial-data descriptor to D3D when the caller supplied data.
    D3D11_SUBRESOURCE_DATA initialContents = { 0 };
    initialContents.pSysMem = pInitData;
    const D3D11_SUBRESOURCE_DATA* pInitial = (pInitData != NULL) ? &initialContents : NULL;

    return pDevice->lpVtbl->CreateBuffer(pDevice, &bufferDesc, pInitial, ppBufferOut) >= 0;
}
/**
 * Creates a dynamic, CPU-writable constant buffer.
 *
 * @param pDevice     device used to create the buffer.
 * @param nBytes      size of the buffer in bytes. Direct3D 11 requires constant
 *                    buffer sizes to be a multiple of 16; the value is passed
 *                    through unchanged, so the caller must respect that.
 * @param pInitData   initial contents, or NULL to create the buffer without any.
 * @param ppBufferOut receives the buffer; set to NULL on entry.
 * @return true when CreateBuffer succeeded.
 */
static bool CreateConstantBuffer(ID3D11Device* pDevice, uint32_t nBytes, void* pInitData, ID3D11Buffer** ppBufferOut)
{
    *ppBufferOut = NULL;

    D3D11_BUFFER_DESC desc;
    ZeroMemory(&desc, sizeof(desc));
    desc.BindFlags = D3D11_BIND_CONSTANT_BUFFER;
    desc.ByteWidth = nBytes;
    desc.Usage = D3D11_USAGE_DYNAMIC;
    desc.CPUAccessFlags = D3D11_CPU_ACCESS_WRITE;

    // Without initial data, pass NULL rather than a descriptor whose pSysMem is
    // NULL, matching what CreateStructureBuffer does.
    if (pInitData == NULL)
        return pDevice->lpVtbl->CreateBuffer(pDevice, &desc, NULL, ppBufferOut) >= 0;

    D3D11_SUBRESOURCE_DATA initData;
    initData.pSysMem = pInitData;
    initData.SysMemPitch = 0;
    initData.SysMemSlicePitch = 0;
    return pDevice->lpVtbl->CreateBuffer(pDevice, &desc, &initData, ppBufferOut) >= 0;
}
/**
 * Compiles an HLSL file with the cs_5_0 profile and creates a compute shader
 * object from the resulting bytecode.
 *
 * @param pSrcFile      path of the HLSL source file.
 * @param pFunctionName name of the shader entry point.
 * @param pDevice       device used to create the shader object.
 * @param ppShaderOut   receives the shader; set to NULL on entry.
 * @return true when both compilation and shader creation succeeded.
 */
static bool CreateComputeShader(LPCWSTR pSrcFile, LPCSTR pFunctionName,
    ID3D11Device* pDevice, ID3D11ComputeShader** ppShaderOut)
{
    *ppShaderOut = NULL;

    // D3DCOMPILE_DEBUG embeds debug information in the compiled shader.
    const uint32_t dwShaderFlags = D3DCOMPILE_ENABLE_STRICTNESS | D3DCOMPILE_DEBUG;

    // Preprocessor macros for the shader; the list ends with a NULL/NULL entry.
    const D3D_SHADER_MACRO defines[] =
    {
        "USE_STRUCTURED_BUFFERS", "1",
        NULL, NULL
    };

    ID3DBlob* pErrorBlob = NULL;
    ID3DBlob* computeShader = NULL;
    bool result = D3DCompileFromFile(pSrcFile, defines, NULL, pFunctionName, "cs_5_0", dwShaderFlags, 0,
        &computeShader, &pErrorBlob) >= 0;

    if (!result && pErrorBlob != NULL)
    {
        // Send the compiler message to the debugger and also to stderr, so it
        // is visible when the program runs as a plain console application.
        const char* pMessage = (const char*)pErrorBlob->lpVtbl->GetBufferPointer(pErrorBlob);
        OutputDebugStringA(pMessage);
        fputs(pMessage, stderr);
    }

    if (result)
    {
        // Test for failure (negative HRESULT) like the rest of this file,
        // instead of treating every non-zero code as an error.
        result = pDevice->lpVtbl->CreateComputeShader(pDevice,
            computeShader->lpVtbl->GetBufferPointer(computeShader),
            computeShader->lpVtbl->GetBufferSize(computeShader), NULL, ppShaderOut) >= 0;
    }

    // Single release path for both blobs, whatever the outcome.
    if (pErrorBlob != NULL)
        pErrorBlob->lpVtbl->Release(pErrorBlob);
    if (computeShader != NULL)
        computeShader->lpVtbl->Release(computeShader);

    return result;
}
/**
 * Creates a shader resource view covering every element of a structured buffer,
 * using ID3D11Device::CreateShaderResourceView().
 *
 * @param pDevice  device used to create the view.
 * @param pBuffer  buffer to view; it is expected to be a structured buffer.
 * @param ppSRVOut receives the view; set to NULL on entry.
 * @return true when the view was created; false when creation failed or the
 *         buffer reports a zero structure stride.
 */
static bool CreateBufferSRV(ID3D11Device* pDevice, ID3D11Buffer* pBuffer, ID3D11ShaderResourceView** ppSRVOut)
{
    *ppSRVOut = NULL;

    D3D11_BUFFER_DESC descBuf;
    ZeroMemory(&descBuf, sizeof(descBuf));
    pBuffer->lpVtbl->GetDesc(pBuffer, &descBuf);

    // The element count is derived by dividing by the stride, so a zero stride
    // must be rejected before the division.
    if (descBuf.StructureByteStride == 0)
        return false;

    D3D11_SHADER_RESOURCE_VIEW_DESC desc;
    ZeroMemory(&desc, sizeof(desc));
    desc.ViewDimension = D3D11_SRV_DIMENSION_BUFFEREX;
    desc.BufferEx.FirstElement = 0;
    // Treated as a structured buffer, hence the UNKNOWN format.
    desc.Format = DXGI_FORMAT_UNKNOWN;
    desc.BufferEx.NumElements = descBuf.ByteWidth / descBuf.StructureByteStride;

    return pDevice->lpVtbl->CreateShaderResourceView(pDevice, (ID3D11Resource*)pBuffer, &desc, ppSRVOut) >= 0;
}
/**
 * Creates an unordered access view covering every element of a structured buffer.
 *
 * @param pDevice  device used to create the view.
 * @param pBuffer  buffer to view; it is expected to be a structured buffer.
 * @param ppUAVOut receives the view; set to NULL on entry.
 * @return true when the view was created; false when creation failed or the
 *         buffer reports a zero structure stride.
 */
static bool CreateBufferUAV(ID3D11Device* pDevice, ID3D11Buffer* pBuffer, ID3D11UnorderedAccessView** ppUAVOut)
{
    *ppUAVOut = NULL;

    D3D11_BUFFER_DESC descBuf;
    ZeroMemory(&descBuf, sizeof(descBuf));
    pBuffer->lpVtbl->GetDesc(pBuffer, &descBuf);

    // The element count is derived by dividing by the stride, so a zero stride
    // must be rejected before the division.
    if (descBuf.StructureByteStride == 0)
        return false;

    D3D11_UNORDERED_ACCESS_VIEW_DESC desc;
    ZeroMemory(&desc, sizeof(desc));
    desc.ViewDimension = D3D11_UAV_DIMENSION_BUFFER;
    desc.Buffer.FirstElement = 0;
    // Treated as a structured buffer, hence the UNKNOWN format.
    desc.Format = DXGI_FORMAT_UNKNOWN;
    desc.Buffer.NumElements = descBuf.ByteWidth / descBuf.StructureByteStride;

    return pDevice->lpVtbl->CreateUnorderedAccessView(pDevice, (ID3D11Resource*)pBuffer, &desc, ppUAVOut) >= 0;
}
/**
 * Binds a compute shader with its views, dispatches it, then unbinds everything.
 *
 * @param pImmediateContext    context the work is issued on.
 * @param pComputeShader       shader to run.
 * @param nSRVs                number of entries in pShaderResourceViews (bound from slot 0).
 * @param nUAVs                number of entries in pUnorderedViews (bound from slot 0).
 * @param pShaderResourceViews shader resource views to bind.
 * @param pUnorderedViews      unordered access views to bind.
 * @param X, Y, Z              number of thread groups to dispatch in each dimension.
 */
static void RunComputeShader(ID3D11DeviceContext* pImmediateContext, ID3D11ComputeShader* pComputeShader,
    uint32_t nSRVs, uint32_t nUAVs, ID3D11ShaderResourceView* pShaderResourceViews[],
    ID3D11UnorderedAccessView* pUnorderedViews[], uint32_t X, uint32_t Y, uint32_t Z)
{
    pImmediateContext->lpVtbl->CSSetShader(pImmediateContext, pComputeShader, NULL, 0);
    pImmediateContext->lpVtbl->CSSetShaderResources(pImmediateContext, 0, nSRVs, pShaderResourceViews);
    pImmediateContext->lpVtbl->CSSetUnorderedAccessViews(pImmediateContext, 0, nUAVs, pUnorderedViews, NULL);

    // Honour the caller's group counts instead of a hard-coded dispatch size.
    pImmediateContext->lpVtbl->Dispatch(pImmediateContext, X, Y, Z);

    // Unbind the shader, every view slot bound above, and constant buffer slot 0.
    pImmediateContext->lpVtbl->CSSetShader(pImmediateContext, NULL, NULL, 0);

    ID3D11UnorderedAccessView* pNullUAV = NULL;
    for (uint32_t slot = 0; slot < nUAVs; slot++)
        pImmediateContext->lpVtbl->CSSetUnorderedAccessViews(pImmediateContext, slot, 1, &pNullUAV, NULL);

    ID3D11ShaderResourceView* pNullSRV = NULL;
    for (uint32_t slot = 0; slot < nSRVs; slot++)
        pImmediateContext->lpVtbl->CSSetShaderResources(pImmediateContext, slot, 1, &pNullSRV);

    ID3D11Buffer* ppCBNULL[1] = { NULL };
    pImmediateContext->lpVtbl->CSSetConstantBuffers(pImmediateContext, 0, 1, ppCBNULL);
}
/**
 * Creates a CPU-readable staging buffer with the same description as pBuffer
 * and copies pBuffer's contents into it.
 *
 * @param pDevice              device used to create the staging buffer.
 * @param pd3dImmediateContext context used to issue the copy.
 * @param pBuffer              source buffer.
 * @return the staging buffer (to be released by the caller), or NULL when it
 *         could not be created.
 */
static ID3D11Buffer* CreateAndCopyToDebugBuf(ID3D11Device* pDevice, ID3D11DeviceContext* pd3dImmediateContext,
    ID3D11Buffer* pBuffer)
{
    // Start from the source description, then turn it into a staging resource.
    D3D11_BUFFER_DESC stagingDesc;
    ZeroMemory(&stagingDesc, sizeof(stagingDesc));
    pBuffer->lpVtbl->GetDesc(pBuffer, &stagingDesc);
    stagingDesc.Usage = D3D11_USAGE_STAGING;
    stagingDesc.CPUAccessFlags = D3D11_CPU_ACCESS_READ;
    stagingDesc.BindFlags = 0;
    stagingDesc.MiscFlags = 0;

    ID3D11Buffer* pStaging = NULL;
    if (pDevice->lpVtbl->CreateBuffer(pDevice, &stagingDesc, NULL, &pStaging) < 0)
        return pStaging;

    pd3dImmediateContext->lpVtbl->CopyResource(pd3dImmediateContext, (ID3D11Resource*)pStaging,
        (ID3D11Resource*)pBuffer);
    return pStaging;
}
int main(void)
{
_CrtSetDbgFlag(_CRTDBG_ALLOC_MEM_DF | _CRTDBG_LEAK_CHECK_DF);
ID3D11Device *device = NULL;
ID3D11DeviceContext *context = NULL;
ID3D11ComputeShader *computeShader = NULL;
//各個Buffer指針變量
ID3D11Buffer *srcBuffer0 = NULL;
ID3D11Buffer *srcBuffer1 = NULL;
ID3D11Buffer *resultBuffer = NULL;
ID3D11Buffer *srcDstBuffer = NULL;
ID3D11Buffer *constBuffer = NULL;
//讀寫上面buffer的ID3D11ShaderResourceView和UnorderedAccessView接口
ID3D11ShaderResourceView *srcBuf0SRV = NULL;
ID3D11ShaderResourceView *srcBuf1SRV = NULL;
ID3D11UnorderedAccessView *resBufUAV = NULL;
ID3D11UnorderedAccessView *srcdstBufUAV = NULL;
int localBuffer[NUM_ELEMENTS];
for (int i = 0; i < NUM_ELEMENTS; i++)
localBuffer[i] = i + 1;
do
{
if (!CreateComputeDevice(&device, &context))
{
puts("CreateComputeDevice failed!");
break;
}
if (!CreateComputeShader(L"compute.hlsl", "CSMain", device, &computeShader))
{
puts("CreateComputeShader failed!");
break;
}
//初始化計算數據
for (int i = 0; i<num_elements; i++)="" {="" s_vbuf0[i].i="i;" s_vbuf0[i].f="(float)i;" s_vbuf1[i].i="i;" s_vbuf1[i].f="(float)i;" }="" 為cpu中的數組創建gpu中相應buffer="" if="" (!createstructurebuffer(device,="" sizeof(struct="" buftype),="" num_elements,="" s_vbuf0,="" &srcbuffer0))="" puts("create="" srcbuffer0="" failed");="" break;="" s_vbuf1,="" &srcbuffer1))="" srcbuffer1="" null,="" &resultbuffer))="" resultbuffer="" sizeof(localbuffer[0]),="" _countof(localbuffer),="" localbuffer,="" &srcdstbuffer))="" srcdstbuffer="" 在d3d11中,常量緩存至少需要4個int元素="" int="" value[4]="{" 10,="" 20="" };="" (!createconstantbuffer(device,="" sizeof(value),="" value,="" &constbuffer))="" constbuffer="" failed!");="" 綁定常量緩存="" context-="">lpVtbl->CSSetConstantBuffers(context, 0, 1, &constBuffer);
//為buffer創建相應的shader resource view與unordered access view
if (!CreateBufferSRV(device, srcBuffer0, &srcBuf0SRV))
{
puts("create srcBuf0SRV failed");
break;
}
if (!CreateBufferSRV(device, srcBuffer1, &srcBuf1SRV))
{
puts("create srcBuf1SRV failed");
break;
}
if (!CreateBufferUAV(device, resultBuffer, &resBufUAV))
{
puts("create resBufUAV failed");
break;
}
if (!CreateBufferUAV(device, srcDstBuffer, &srcdstBufUAV))
{
puts("create srcdstBufUAV failed!");
break;
}
ID3D11ShaderResourceView* shaderResourceViews[] = { srcBuf0SRV, srcBuf1SRV };
ID3D11UnorderedAccessView* unorderedAccessViews[] = { resBufUAV, srcdstBufUAV };
//運行Shader Compute程序
RunComputeShader(context, computeShader, _countof(shaderResourceViews), _countof(unorderedAccessViews),
shaderResourceViews, unorderedAccessViews, NUM_ELEMENTS, 1, 1);
//將GPU計算的結果寫回CPU
ID3D11Buffer* debugBuf = NULL;
// 先查看resultBuffer中的內容
debugBuf = CreateAndCopyToDebugBuf(device, context, resultBuffer);
if (debugBuf == NULL)
{
puts("debugBuf create failed!");
break;
}
D3D11_MAPPED_SUBRESOURCE mappedResource;
context->lpVtbl->Map(context, (ID3D11Resource*)debugBuf, 0, D3D11_MAP_READ, 0, &mappedResource);
struct BufType *p = mappedResource.pData;
puts("Output GPU resultBuffer results, first ten:");
for (int i = 0; i < 10; i++)
printf("i: %d, f: %.1f\n", p[i].i, p[i].f);
puts("last ten:");
for(int i = NUM_ELEMENTS - 10; i < NUM_ELEMENTS; i++)
printf("i: %d, f: %.1f\n", p[i].i, p[i].f);
context->lpVtbl->Unmap(context, (ID3D11Resource*)debugBuf, 0);
debugBuf->lpVtbl->Release(debugBuf);
// 再查看srcdstBuffer中的內容
debugBuf = CreateAndCopyToDebugBuf(device, context, srcDstBuffer);
if (debugBuf == NULL)
{
puts("debugBuf create failed!");
break;
}
context->lpVtbl->Map(context, (ID3D11Resource*)debugBuf, 0, D3D11_MAP_READ, 0, &mappedResource);
int *q = mappedResource.pData;
puts("Output GPU srcDstBuffer results, first ten:");
for (int i = 0; i < 10; i++)
printf("[%d] = %d\n", i, q[i]);
puts("last ten:");
for (int i = NUM_ELEMENTS - 10; i < NUM_ELEMENTS; i++)
printf("[%d] = %d\n", i, q[i]);
context->lpVtbl->Unmap(context, (ID3D11Resource*)debugBuf, 0);
debugBuf->lpVtbl->Release(debugBuf);
}
while (false);
//釋放資源
if (srcBuf0SRV != NULL)
srcBuf0SRV->lpVtbl->Release(srcBuf0SRV);
if (srcBuf1SRV != NULL)
srcBuf1SRV->lpVtbl->Release(srcBuf1SRV);
if (resBufUAV != NULL)
resBufUAV->lpVtbl->Release(resBufUAV);
if (srcdstBufUAV != NULL)
srcdstBufUAV->lpVtbl->Release(srcdstBufUAV);
if (srcBuffer0 != NULL)
srcBuffer0->lpVtbl->Release(srcBuffer0);
if (srcBuffer1 != NULL)
srcBuffer1->lpVtbl->Release(srcBuffer1);
if (resultBuffer != NULL)
resultBuffer->lpVtbl->Release(resultBuffer);
if (srcDstBuffer != NULL)
srcDstBuffer->lpVtbl->Release(srcDstBuffer);
if (computeShader != NULL)
computeShader->lpVtbl->Release(computeShader);
if (context != NULL)
context->lpVtbl->Release(context);
if (device != NULL)
device->lpVtbl->Release(device);
puts("\nInput enter to exit...");
getchar();
}
完成之後,我們再創建一個名為compute.hlsl的shader文件,將它存放在與main.c相同的目錄下。
// This is a compute shader program.
// Element type of the structured buffers: one int followed by one float,
// the same fields in the same order as struct BufType in main.c.
struct BufType
{
int i;
float f;
};
// Corresponds to the constant buffer bound by the host at slot b0.
// CSMain divides cValue1 by cValue0 to obtain its scale factor.
cbuffer cbNeverChanges : register(b0)
{
int cValue0;
int cValue1;
};
// Correspond to the host-side shader resource views (read-only inputs).
// CSMain reads their .i and .f fields, so the element type is BufType.
StructuredBuffer<BufType> buffer0 : register(t0);
StructuredBuffer<BufType> buffer1 : register(t1);
// Correspond to the host-side unordered access views (read/write).
// bufferOut receives BufType results; srcdstBuffer holds plain ints.
RWStructuredBuffer<BufType> bufferOut : register(u0);
RWStructuredBuffer<int> srcdstBuffer : register(u1);
// In Direct3D a thread group may contain at most 1024 threads.
// Each thread handles the element selected by its dispatch-wide x index:
// it combines buffer0 and buffer1, scales by cValue1 / cValue0, subtracts the
// current srcdstBuffer value from the integer result, and stores the results.
[numthreads(1024, 1, 1)]
void CSMain(uint3 groupID : SV_GroupID, uint3 tid : SV_DispatchThreadID,
    uint3 localTID : SV_GroupThreadID, uint gIdx : SV_GroupIndex)
{
    const int index = tid.x;
    const int scale = cValue1 / cValue0;

    // Load each input element once.
    const BufType lhs = buffer0[index];
    const BufType rhs = buffer1[index];

    BufType outElem;
    outElem.i = (lhs.i + rhs.i) * scale - srcdstBuffer[index];
    outElem.f = (lhs.f + rhs.f) * float(scale);

    bufferOut[index] = outElem;
    srcdstBuffer[index] = outElem.i;
}
我們在保存這兩個文件的時候,可以在菜單欄File下面找到Advanced Save Options...,可以將Encoding改為Unicode(UTF-8 without Signature),這樣我們就可以在所有操作系統以及語言環境上看到正常的中文漢字了。否則系統不支持GBK或GB2312,會導致漢字部分出現亂碼。完成之後我們就可以編譯運行了。