OpenCV环境下CUDA编程示例

oskycar 2014-11-27

展开全文

在CUDA平台上对图像算法进行并行加速是目前并行计算方面比较简单易行的一种方式，而同时利用OpenCV提供的一些库函数的话，那么事情将会变得更加easy。以下是我个人采用的一种模板，这个模板是从OpenCV里的算法CUDA源码挖掘出来的，我感觉这个用起来比较方便，所以经常采用。首先大牛们写的源码都很鲁棒，考虑得比较全面（如大部分算法将1,3,4通道的图像同时搞定），感觉还有一个比较神奇的地方在于CPU端GpuMat和GPU端PtrStepSzb的转换，让我欲罢不能，一个不太理想的地方在于第一帧的初始化时间比较长，应该是CPU到GPU的数据传输。代码中有考虑流，但貌似没有使用。

我使用的是赵开勇的CUDA_VS_Wizard，主函数还是用的cu文件。以下代码是对Vibe背景建模算法的并行，背景建模算法是目前接触到易于并行的一类，如GMM等，而且加速效果不错，因为一个线程执行的数据就是对应一个像素点。

代码如下：

sample.cu

[cpp] view plain copy

/********************************************************************
* sample.cu
* This is a example of the CUDA program.
*********************************************************************/
#include <stdio.h>
#include <stdlib.h>
#include <cutil_inline.h>
#include <iostream>
#include <string>
#include "opencv2/core/core.hpp"
#include "opencv2/gpu/gpu.hpp"
#include "opencv2/highgui/highgui.hpp"
#include "Vibe_M_kernel.cu"
#include "Vibe_M.h"
using namespace std;
using namespace cv;
using namespace cv::gpu;
// Background-subtraction algorithm selected with the --method command-line option.
// main() maps the strings "fgd", "mog", "mog2", "vibe" and "gmg" onto these values;
// only VIBE is actually acted upon in main().
enum Method
{
FGD_STAT,
MOG,
MOG2,
VIBE,
GMG
};
int main(int argc, const char** argv)
{
cv::CommandLineParser cmd(argc, argv,
"{ c | camera | flase | use camera }"
"{ f | file | 768x576.avi | input video file }"
"{ m | method | vibe | method (fgd, mog, mog2, vibe, gmg) }"
"{ h | help | false | print help message }");
if (cmd.get<bool>("help"))
{
cout << "Usage : bgfg_segm [options]" << endl;
cout << "Avaible options:" << endl;
cmd.printParams();
return 0;
}
bool useCamera = cmd.get<bool>("camera");
string file = cmd.get<string>("file");
string method = cmd.get<string>("method");
if (method != "fgd" && method != "mog" && method != "mog2" && method != "vibe" && method != "gmg")
{
cerr << "Incorrect method" << endl;
return -1;
}
Method m = method == "fgd" ? FGD_STAT : method == "mog" ? MOG : method == "mog2" ? MOG2 : method == "vibe" ? VIBE : GMG;
VideoCapture cap;
if (useCamera)
cap.open(0);
else
cap.open(file);
if (!cap.isOpened())
{
cerr << "can not open camera or video file" << endl;
return -1;
}
Mat origin, frame;
cap >> origin;
cvtColor(origin,frame,CV_BGR2GRAY);
GpuMat d_frame(frame);
Vibe_M vibe;
GpuMat d_fgmask;
Mat fgmask;
Mat fgimg;
Mat bgimg;
switch (m)
{
case VIBE:
vibe.initialize(d_frame);
break;
}
namedWindow("image", WINDOW_NORMAL);
namedWindow("foreground mask", WINDOW_NORMAL);
for(;;)
{
cap >> origin;
if (origin.empty())
break;
cvtColor(origin,frame,CV_BGR2GRAY);
d_frame.upload(frame);
//update the model
switch (m)
{
case VIBE:
vibe(d_frame, d_fgmask);
break;
}
d_fgmask.download(fgmask);
imshow("image", frame);
imshow("foreground mask", fgmask);
int key = waitKey(30);
if (key == 27)
break;
else if(key == ' ')
{
cvWaitKey(0);
}
}
exit(0);
}

Vibe_M.cpp

[cpp] view plain copy

#include "Vibe_M.h"
#include "opencv2/gpu/stream_accessor.hpp"
// Forward declarations of the host-side entry points that Vibe_M_kernel.cu
// defines in namespace cv::gpu::device::vibe_m.
namespace cv { namespace gpu { namespace device
{
namespace vibe_m
{
// Copies the four model parameters into __constant__ device variables.
void loadConstants(int nbSamples, int reqMatches, int radius, int subsamplingFactor);
// Fills the sample buffer from the first frame; cn is the frame's channel count.
void init_gpu(PtrStepSzb frame, int cn, PtrStepSzb samples, PtrStepSz<unsigned int> randStates, cudaStream_t stream);
// Classifies every pixel of frame into fgmask and updates the sample buffer.
void update_gpu(PtrStepSzb frame, int cn, PtrStepSzb fgmask, PtrStepSzb samples, PtrStepSz<unsigned int> randStates, cudaStream_t stream);
}
}}}
// File-local default values that the Vibe_M constructor copies into the
// public tuning members of the same names.
namespace
{
const int defaultNbSamples = 20; // samples kept per pixel
const int defaultReqMatches = 2; // matches required to classify a pixel as background
const int defaultRadius = 20; // a sample matches when its distance is below this value
const int defaultSubsamplingFactor = 16; // a background pixel updates the model when rand % factor == 0
}
// Creates a model with the default tuning parameters and an empty frame size.
// rngSeed is stored and used later by initialize() to seed the per-pixel
// random states.
Vibe_M::Vibe_M(unsigned long rngSeed)
    : nbSamples(defaultNbSamples),
      reqMatches(defaultReqMatches),
      radius(defaultRadius),
      subsamplingFactor(defaultSubsamplingFactor),
      frameSize_(0, 0),
      rngSeed_(rngSeed)
{
}
void Vibe_M::initialize(const GpuMat& firstFrame, Stream& s)
{
using namespace cv::gpu::device::vibe_m;
CV_Assert(firstFrame.type() == CV_8UC1 || firstFrame.type() == CV_8UC3 || firstFrame.type() == CV_8UC4);
//cudaStream_t stream = StreamAccessor::getStream(s);
loadConstants(nbSamples, reqMatches, radius, subsamplingFactor);
frameSize_ = firstFrame.size();
if (randStates_.size() != frameSize_)
{
cv::RNG rng(rngSeed_);
cv::Mat h_randStates(frameSize_, CV_8UC4);
rng.fill(h_randStates, cv::RNG::UNIFORM, 0, 255);
randStates_.upload(h_randStates);
}
int ch = firstFrame.channels();
int sample_ch = ch == 1 ? 1 : 4;
samples_.create(nbSamples * frameSize_.height, frameSize_.width, CV_8UC(sample_ch));
init_gpu(firstFrame, ch, samples_, randStates_, 0);
}
// Classifies frame against the model and updates the model.
//
// frame  : non-empty CV_8UC1, CV_8UC3 or CV_8UC4 device image.
// fgmask : output, (re)allocated as CV_8UC1 of the model's frame size.
// s      : stream used for the update kernel and for any re-initialization.
//
// The model is re-initialized from frame when the frame size or the sample
// channel layout differs from the current model.
// Raises through CV_Assert when frame is empty or has an unsupported type.
void Vibe_M::operator()(const GpuMat& frame, GpuMat& fgmask, Stream& s)
{
    using namespace cv::gpu::device::vibe_m;

    // Checking only the depth would let a 2-channel frame reach an empty slot
    // of the kernel dispatch table, so validate the full type here.
    CV_Assert(!frame.empty());
    CV_Assert(frame.type() == CV_8UC1 || frame.type() == CV_8UC3 || frame.type() == CV_8UC4);

    const int ch = frame.channels();
    const int sample_ch = ch == 1 ? 1 : 4;

    if (frame.size() != frameSize_ || sample_ch != samples_.channels())
        initialize(frame, s); // keep the re-initialization on the caller's stream

    fgmask.create(frameSize_, CV_8UC1);

    update_gpu(frame, ch, fgmask, samples_, randStates_, StreamAccessor::getStream(s));
}
// Drops both device buffers and resets the remembered frame size to 0x0.
void Vibe_M::release()
{
    samples_.release();
    randStates_.release();
    frameSize_ = Size(0, 0);
}

Vibe_M.h

[cpp] view plain copy

#ifndef _VIBE_M_H_
#define _VIBE_M_H_
#ifndef SKIP_INCLUDES
#include <vector>
#include <memory>
#include <iosfwd>
#endif
#include "opencv2/core/core.hpp"
#include "opencv2/core/gpumat.hpp"
#include "opencv2/gpu/gpu.hpp"
#include "opencv2/imgproc/imgproc.hpp"
#include "opencv2/objdetect/objdetect.hpp"
#include "opencv2/features2d/features2d.hpp"
using namespace std;
using namespace cv;
using namespace cv::gpu;
// GPU background-subtraction model.
// Typical use: construct, optionally adjust the public tuning members, call
// initialize() with the first frame, then call operator() once per frame.
class Vibe_M
{
public:
//! the default constructor; rngSeed seeds the per-pixel random states created by initialize()
explicit Vibe_M(unsigned long rngSeed = 1234567);
//! re-initialization method: rebuilds the model from firstFrame (CV_8UC1, CV_8UC3 or CV_8UC4)
void initialize(const GpuMat& firstFrame, Stream& stream = Stream::Null());
//! the update operator: writes a CV_8UC1 foreground mask into fgmask and updates the model
void operator()(const GpuMat& frame, GpuMat& fgmask, Stream& stream = Stream::Null());
//! releases all inner buffers
void release();
// Tuning parameters; they are copied to the device by initialize().
int nbSamples; // number of samples per pixel
int reqMatches; // #_min
int radius; // R
int subsamplingFactor; // amount of random subsampling
private:
Size frameSize_; // size of the frame the model was initialized with; (0, 0) when released
unsigned long rngSeed_; // seed given to the constructor
GpuMat randStates_; // one random state per pixel, uploaded as CV_8UC4
GpuMat samples_; // nbSamples sample planes stacked vertically (1 or 4 channels)
};
#endif

Vibe_M.cu

[html] view plain copy

#include "Vibe_M.h"
#include "opencv2/gpu/stream_accessor.hpp"
// Forward declarations of the host-side entry points that Vibe_M_kernel.cu
// defines in namespace cv::gpu::device::vibe_m.
namespace cv { namespace gpu { namespace device
{
namespace vibe_m
{
// Copies the four model parameters into __constant__ device variables.
void loadConstants(int nbSamples, int reqMatches, int radius, int subsamplingFactor);
// Fills the sample buffer from the first frame; cn is the frame's channel count.
void init_gpu(PtrStepSzb frame, int cn, PtrStepSzb samples, PtrStepSz<unsigned int> randStates, cudaStream_t stream);
// Classifies every pixel of frame into fgmask and updates the sample buffer.
void update_gpu(PtrStepSzb frame, int cn, PtrStepSzb fgmask, PtrStepSzb samples, PtrStepSz<unsigned int> randStates, cudaStream_t stream);
}
}}}
// File-local default values that the Vibe_M constructor copies into the
// public tuning members of the same names.
namespace
{
const int defaultNbSamples = 20; // samples kept per pixel
const int defaultReqMatches = 2; // matches required to classify a pixel as background
const int defaultRadius = 20; // a sample matches when its distance is below this value
const int defaultSubsamplingFactor = 16; // a background pixel updates the model when rand % factor == 0
}
// Constructs the model with the file-local default parameters.
// The seed is only stored here; it is consumed when initialize() generates
// the per-pixel random states.
Vibe_M::Vibe_M(unsigned long rngSeed)
    : nbSamples(defaultNbSamples)
    , reqMatches(defaultReqMatches)
    , radius(defaultRadius)
    , subsamplingFactor(defaultSubsamplingFactor)
    , frameSize_(0, 0)
    , rngSeed_(rngSeed)
{
}
void Vibe_M::initialize(const GpuMat& firstFrame, Stream& s)
{
using namespace cv::gpu::device::vibe_m;
CV_Assert(firstFrame.type() == CV_8UC1 || firstFrame.type() == CV_8UC3 || firstFrame.type() == CV_8UC4);
cudaStream_t stream = cv::gpu::StreamAccessor::getStream(s);
loadConstants(nbSamples, reqMatches, radius, subsamplingFactor);
frameSize_ = firstFrame.size();
if (randStates_.size() != frameSize_)
{
cv::RNG rng(rngSeed_);
cv::Mat h_randStates(frameSize_, CV_8UC4);
rng.fill(h_randStates, cv::RNG::UNIFORM, 0, 255);
randStates_.upload(h_randStates);
}
int ch = firstFrame.channels();
int sample_ch = ch == 1 ? 1 : 4;
samples_.create(nbSamples * frameSize_.height, frameSize_.width, CV_8UC(sample_ch));
init_gpu(firstFrame, ch, samples_, randStates_, stream);
}
// Classifies frame against the model and updates the model.
//
// frame  : non-empty CV_8UC1, CV_8UC3 or CV_8UC4 device image.
// fgmask : output, (re)allocated as CV_8UC1 of the model's frame size.
// s      : stream used for the update kernel and for any re-initialization.
//
// The model is re-initialized from frame when the frame size or the sample
// channel layout differs from the current model.
// Raises through CV_Assert when frame is empty or has an unsupported type.
void Vibe_M::operator()(const GpuMat& frame, GpuMat& fgmask, Stream& s)
{
    using namespace cv::gpu::device::vibe_m;

    // Checking only the depth would let a 2-channel frame reach an empty slot
    // of the kernel dispatch table, so validate the full type here.
    CV_Assert(!frame.empty());
    CV_Assert(frame.type() == CV_8UC1 || frame.type() == CV_8UC3 || frame.type() == CV_8UC4);

    const int ch = frame.channels();
    const int sample_ch = ch == 1 ? 1 : 4;

    if (frame.size() != frameSize_ || sample_ch != samples_.channels())
        initialize(frame, s); // keep the re-initialization on the caller's stream

    fgmask.create(frameSize_, CV_8UC1);

    update_gpu(frame, ch, fgmask, samples_, randStates_, cv::gpu::StreamAccessor::getStream(s));
}
// Frees the sample and random-state buffers and forgets the frame size.
void Vibe_M::release()
{
    randStates_.release();
    samples_.release();
    frameSize_ = Size(0, 0);
}

Vibe_M_kernel.cu

[cpp] view plain copy

#include "opencv2/gpu/device/common.hpp"
#include "opencv2/gpu/device/vec_math.hpp"
namespace cv { namespace gpu { namespace device
{
namespace vibe_m
{
// Model parameters in __constant__ memory. They are written from the host by
// loadConstants() and read by the init and update kernels.
__constant__ int c_nbSamples; // number of samples stored per pixel
__constant__ int c_reqMatches; // matches needed for a pixel to count as background
__constant__ int c_radius; // a sample matches when its distance is below this value
__constant__ int c_subsamplingFactor; // modulus applied to the random draw that gates a model update
// Copies the four host-side parameters into their __constant__ counterparts.
// Each copy is wrapped in cudaSafeCall, so a failing copy is reported there.
void loadConstants(int nbSamples, int reqMatches, int radius, int subsamplingFactor)
{
    cudaSafeCall( cudaMemcpyToSymbol(c_nbSamples, &nbSamples, sizeof(nbSamples)) );
    cudaSafeCall( cudaMemcpyToSymbol(c_reqMatches, &reqMatches, sizeof(reqMatches)) );
    cudaSafeCall( cudaMemcpyToSymbol(c_radius, &radius, sizeof(radius)) );
    cudaSafeCall( cudaMemcpyToSymbol(c_subsamplingFactor, &subsamplingFactor, sizeof(subsamplingFactor)) );
}
// Advances the given random state in place and returns the new value.
// CV_RNG_COEFF is not redefined here; per the original author's note it is
// already defined elsewhere (as 4164903690U).
__device__ __forceinline__ uint nextRand(uint& state)
{
    const uint carry = state >> 16; // taken from the state before it is updated
    state = state * CV_RNG_COEFF + carry;
    return state;
}
// Offset tables: entries 0..7 are the eight surrounding pixels, entry 8 is
// (0, 0), i.e. the pixel itself.
__constant__ int c_xoff[9] = {-1, 0, 1, -1, 1, -1, 0, 1, 0};
__constant__ int c_yoff[9] = {-1, -1, -1, 0, 0, 1, 1, 1, 0};

// Draws one random number from randState and returns (x, y) shifted by the
// table entry it selects. count limits the draw to the first count entries:
// the default of 8 never selects the pixel itself, 9 may. The result is not
// clamped to the image; callers do that.
__device__ __forceinline__ int2 chooseRandomNeighbor(int x, int y, uint& randState, int count = 8)
{
    const int pick = nextRand(randState) % count;

    int2 neighbor;
    neighbor.x = x + c_xoff[pick];
    neighbor.y = y + c_yoff[pick];
    return neighbor;
}
// cvt: converts a source pixel to the type stored in the sample buffer.

// Single-channel pixels are stored unchanged.
__device__ __forceinline__ uchar cvt(uchar val)
{
    return val;
}

// Three-channel pixels are widened to four channels with w set to 0.
__device__ __forceinline__ uchar4 cvt(const uchar3& val)
{
    uchar4 widened;
    widened.x = val.x;
    widened.y = val.y;
    widened.z = val.z;
    widened.w = 0;
    return widened;
}

// Four-channel pixels are stored unchanged.
__device__ __forceinline__ uchar4 cvt(const uchar4& val)
{
    return val;
}
// Initialization kernel: one thread per pixel on a 2D grid; threads outside
// the frame do nothing.
// For each of the c_nbSamples planes the thread copies a randomly chosen
// pixel from the clamped 3x3 neighbourhood (the pixel itself included) into
// samples. Plane k of pixel (row, col) lives at samples(k * frame.rows + row, col).
// The thread's random state is read at the start and written back at the end.
template <typename SrcT, typename SampleT>
__global__ void init(const PtrStepSz<SrcT> frame, PtrStep<SampleT> samples, PtrStep<uint> randStates)
{
    const int col = blockIdx.x * blockDim.x + threadIdx.x;
    const int row = blockIdx.y * blockDim.y + threadIdx.y;

    if (col < frame.cols && row < frame.rows)
    {
        uint rng = randStates(row, col);

        for (int plane = 0; plane < c_nbSamples; ++plane)
        {
            int2 src = chooseRandomNeighbor(col, row, rng, 9);

            // clamp the neighbour to the image
            src.x = ::max(0, ::min(src.x, frame.cols - 1));
            src.y = ::max(0, ::min(src.y, frame.rows - 1));

            const SrcT value = frame(src.y, src.x);
            samples(plane * frame.rows + row, col) = cvt(value);
        }

        randStates(row, col) = rng;
    }
}
// Launches init<SrcT, SampleT> with 32x8 thread blocks covering the frame.
// The byte-typed descriptors are reinterpreted as the typed ones the kernel
// expects. When stream is 0 the call also waits for the device to finish.
template <typename SrcT, typename SampleT>
void init_caller(PtrStepSzb frame, PtrStepSzb samples, PtrStepSz<uint> randStates, cudaStream_t stream)
{
    const dim3 threads(32, 8);
    const dim3 blocks(divUp(frame.cols, threads.x), divUp(frame.rows, threads.y));

    cudaSafeCall( cudaFuncSetCacheConfig(init<SrcT, SampleT>, cudaFuncCachePreferL1) );

    init<SrcT, SampleT><<<blocks, threads, 0, stream>>>((PtrStepSz<SrcT>) frame, (PtrStepSz<SampleT>) samples, randStates);
    cudaSafeCall( cudaGetLastError() ); // reports launch errors

    if (stream == 0)
    {
        cudaSafeCall( cudaDeviceSynchronize() );
    }
}
// Dispatches model initialization on the channel count of the frame.
//
// frame      : first frame, passed as a byte-typed descriptor.
// cn         : channel count of frame; 1, 3 and 4 are supported.
// samples    : sample buffer to fill (1 channel for cn == 1, otherwise 4).
// randStates : per-pixel random states, advanced by the kernel.
// stream     : stream the kernel is launched on.
//
// An unsupported cn is reported through cudaSafeCall with
// cudaErrorInvalidValue instead of calling through an empty table slot.
void init_gpu(PtrStepSzb frame, int cn, PtrStepSzb samples, PtrStepSz<uint> randStates, cudaStream_t stream)
{
    typedef void (*func_t)(PtrStepSzb frame, PtrStepSzb samples, PtrStepSz<uint> randStates, cudaStream_t stream);

    // Indexed by channel count; slots 0 and 2 are intentionally empty.
    static const func_t funcs[] =
    {
        0, init_caller<uchar, uchar>, 0, init_caller<uchar3, uchar4>, init_caller<uchar4, uchar4>
    };
    const int tableSize = sizeof(funcs) / sizeof(funcs[0]);

    if (cn < 0 || cn >= tableSize || funcs[cn] == 0)
    {
        cudaSafeCall( cudaErrorInvalidValue );
        return; // safety net in case the error handler returns
    }

    funcs[cn](frame, samples, randStates, stream);
}
// calcDist: distance between a frame pixel and a stored sample.

// Grayscale: absolute difference.
__device__ __forceinline__ int calcDist(uchar a, uchar b)
{
    const int diff = a - b;
    return ::abs(diff);
}

// Three-channel pixel against a four-channel sample: sum of the absolute
// differences of x, y and z, divided by 3 (integer division). The sample's w
// is not used.
__device__ __forceinline__ int calcDist(const uchar3& a, const uchar4& b)
{
    const int dx = ::abs(a.x - b.x);
    const int dy = ::abs(a.y - b.y);
    const int dz = ::abs(a.z - b.z);
    return (dx + dy + dz) / 3;
}

// Four-channel pixel against a four-channel sample: same formula, w is not
// used on either side.
__device__ __forceinline__ int calcDist(const uchar4& a, const uchar4& b)
{
    const int dx = ::abs(a.x - b.x);
    const int dy = ::abs(a.y - b.y);
    const int dz = ::abs(a.z - b.z);
    return (dx + dy + dz) / 3;
}
// Update kernel: one thread per pixel on a 2D grid (update_caller launches it
// with 32x8 blocks); threads outside the frame return immediately.
// Per pixel it
//   1. counts samples whose distance to the frame pixel is below c_radius,
//      stopping early once c_reqMatches matches are found;
//   2. writes 255 to fgmask for foreground (too few matches) and 0 for background;
//   3. for background pixels only, randomly replaces one of its own samples
//      and one sample of a random neighbour with the current pixel value.
// Plane k of pixel (y, x) lives at samples(k * frame.rows + y, x).
// The thread's random state is read at the start and written back at the end.
// NOTE(review): step 3 writes into another pixel's samples with no
// synchronization visible here, so it can overlap with that pixel's own
// thread reading them - presumably tolerated by the algorithm; confirm.
template <typename SrcT, typename SampleT>
__global__ void update(const PtrStepSz<SrcT> frame, PtrStepb fgmask, PtrStep<SampleT> samples, PtrStep<uint> randStates)
{
const int x = blockIdx.x * blockDim.x + threadIdx.x;
const int y = blockIdx.y * blockDim.y + threadIdx.y;
if (x >= frame.cols || y >= frame.rows)
return;
uint localState = randStates(y, x);
SrcT imgPix = frame(y, x);
// comparison with the model
int count = 0;
for (int k = 0; (count < c_reqMatches) && (k < c_nbSamples); ++k)
{
SampleT samplePix = samples(k * frame.rows + y, x);
int distance = calcDist(imgPix, samplePix);
if (distance < c_radius)
++count;
}
// pixel classification according to reqMatches
fgmask(y, x) = (uchar) (-(count < c_reqMatches));// foreground (255) when count < c_reqMatches, background (0) otherwise
if (count >= c_reqMatches)
{
// the pixel belongs to the background
// gets a random number between 0 and subsamplingFactor-1
int randomNumber = nextRand(localState) % c_subsamplingFactor;
// update of the current pixel model
if (randomNumber == 0)
{
// random subsampling
int k = nextRand(localState) % c_nbSamples;
samples(k * frame.rows + y, x) = cvt(imgPix);
}
// update of a neighboring pixel model
randomNumber = nextRand(localState) % c_subsamplingFactor;
if (randomNumber == 0)
{
// random subsampling
// chooses a neighboring pixel randomly
int2 np = chooseRandomNeighbor(x, y, localState);
np.x = ::max(0, ::min(np.x, frame.cols - 1));
np.y = ::max(0, ::min(np.y, frame.rows - 1));
// chooses the value to be replaced randomly
int k = nextRand(localState) % c_nbSamples;
samples(k * frame.rows + np.y, np.x) = cvt(imgPix);
}
}
randStates(y, x) = localState;
}
// Launches update<SrcT, SampleT> with 32x8 thread blocks covering the frame.
// The byte-typed frame and sample descriptors are reinterpreted as the typed
// ones the kernel expects. When stream is 0 the call also waits for the
// device to finish.
template <typename SrcT, typename SampleT>
void update_caller(PtrStepSzb frame, PtrStepSzb fgmask, PtrStepSzb samples, PtrStepSz<uint> randStates, cudaStream_t stream)
{
    const dim3 threads(32, 8);
    const dim3 blocks(divUp(frame.cols, threads.x), divUp(frame.rows, threads.y));

    cudaSafeCall( cudaFuncSetCacheConfig(update<SrcT, SampleT>, cudaFuncCachePreferL1) );

    update<SrcT, SampleT><<<blocks, threads, 0, stream>>>((PtrStepSz<SrcT>) frame, fgmask, (PtrStepSz<SampleT>) samples, randStates);
    cudaSafeCall( cudaGetLastError() ); // reports launch errors

    if (stream == 0)
    {
        cudaSafeCall( cudaDeviceSynchronize() );
    }
}
// Dispatches the per-frame model update on the channel count of the frame.
//
// frame      : current frame, passed as a byte-typed descriptor.
// cn         : channel count of frame; 1, 3 and 4 are supported.
// fgmask     : single-channel output mask written by the kernel.
// samples    : sample buffer (1 channel for cn == 1, otherwise 4).
// randStates : per-pixel random states, advanced by the kernel.
// stream     : stream the kernel is launched on.
//
// An unsupported cn is reported through cudaSafeCall with
// cudaErrorInvalidValue instead of calling through an empty table slot.
void update_gpu(PtrStepSzb frame, int cn, PtrStepSzb fgmask, PtrStepSzb samples, PtrStepSz<uint> randStates, cudaStream_t stream)
{
    typedef void (*func_t)(PtrStepSzb frame, PtrStepSzb fgmask, PtrStepSzb samples, PtrStepSz<uint> randStates, cudaStream_t stream);

    // Indexed by channel count; slots 0 and 2 are intentionally empty.
    static const func_t funcs[] =
    {
        0, update_caller<uchar, uchar>, 0, update_caller<uchar3, uchar4>, update_caller<uchar4, uchar4>
    };
    const int tableSize = sizeof(funcs) / sizeof(funcs[0]);

    if (cn < 0 || cn >= tableSize || funcs[cn] == 0)
    {
        cudaSafeCall( cudaErrorInvalidValue );
        return; // safety net in case the error handler returns
    }

    funcs[cn](frame, fgmask, samples, randStates, stream);
}
}
}}}