是否可以使用CUDA并行化该嵌套for循环？

问题描述：

我想加快这个嵌套的循环，刚开始学习CUDA，我怎么可以使用CUDA来并行这个C++代码？是否可以使用CUDA并行化该嵌套for循环？

#define PI 3.14159265 
using namespace std; 
int main() 
{ 
    int nbint = 2; 
    int hits = 20; 
    int nbinp = 2; 
    float _theta, _phi, _l, _m, _n, _k = 0, delta = 5; 
    float x[20],y[20],z[20],a[20],t[20]; 
    for (int i = 0; i < hits; ++i) 
    { 
     x[i] = rand()/(float)(RAND_MAX/100); 
    } 
    for (int i = 0; i < hits; ++i) 
    { 
     y[i] = rand()/(float)(RAND_MAX/100); 
    } 
    for (int i = 0; i < hits; ++i) 
    { 
     z[i] = rand()/(float)(RAND_MAX/100); 
    } 
    for (int i = 0; i < hits; ++i) 
    { 
     a[i] = rand()/(float)(RAND_MAX/100); 
    } 
    float maxforall = 1e-6; 
    float theta0; 
    float phi0; 
    for (int i = 0; i < nbint; i++) 
    { 
     _theta = (0.5 + i)*delta; 
     for (int j = 0; j < nbinp; j++) 
     { 
      _phi = (0.5 + j)*delta/_theta; 
      _l = sin(_theta* PI/180.0)*cos(_phi* PI/180.0); 
      _m = sin(_theta* PI/180.0)*sin(_phi* PI/180.0); 
      _n = cos(_theta* PI/180.0); 
      for (int k = 0; k < hits; k++) 
      { 
       _k = -(_l*x[k] + _m*y[k] + _n*z[k]); 
       t[k] = a[k] - _k; 
      } 

      qsort(t, 0, hits - 1); 
      float max = t[0]; 
      for (int k = 0; k < hits; k++) 
      { 
       if (max < t[k]) 
        max = t[k]; 
      } 
      if (max > maxforall) 
      { 
       maxforall = max; 
      } 

     } 
    } 
    return 0; 
}

我想把innermost for循环和排序部分（也许整个嵌套循环）并行。对这些数组进行排序后，我发现了所有数组的最大值。我使用最大值来简化代码。我需要排序的原因是，最大代表这里是一个连续的时间信息（所有数组都包含时间信息）。排序部分使这些时间从最低到最高。然后我比较一个特定的时间间隔（不是一个单一的值）。比较过程几乎就像我选择的最大值，但连续的间隔不是一个单一的值。

你在这里计算什么？ 'nbint'，'nbinp'和'hits'有多大？请发布[mcve]，其中包含您的输入数据的小数字样本以及所需的输出。 –

首先，我想计算数组t [k]，然后对这个数组进行排序。我想要的输出是nbint * nbinp排序数组。 – Alex

你想要'20 * 2 = 40'数组还是使用'40'元素的单个数组？为什么在循环内执行排序？算法对我来说还不清楚 –

答

您的3个嵌套循环计算出nbint*nbinp*hits值。由于每个值是独立彼此，所有值可以并行计算。

你在你的评论中说过，你有一个交换和关联的“过滤条件”，它将输出减少到单个标量值。这可以被利用来避免排序和存储临时值。相反，我们可以即时计算这些值，然后应用并行减少来确定最终结果。

这可以在“原始”CUDA中完成，下面我使用推力实现了这个想法。主要想法是并行运行grid_opnbint*nbinp*hits次。为了从传递到grid_op的单标量索引中找出三个原始的“循环索引”，使用来自this SO question的算法。

thrust::transform_reduce执行动态转换和随后的并行还原（这里使用thrust::maximum作为替代）。

#include <cmath> 

#include <thrust/device_vector.h> 
#include <thrust/functional.h> 
#include <thrust/transform_reduce.h> 
#include <thrust/iterator/counting_iterator.h> 
#include <thrust/tuple.h> 

// ### BEGIN utility for demo #### 
#include <iostream> 
#include <thrust/random.h> 

thrust::host_vector<float> random_vector(const size_t N) 
{ 
    thrust::default_random_engine rng; 
    thrust::uniform_real_distribution<float> u01(0.0f, 1.0f); 
    thrust::host_vector<float> temp(N); 
    for(size_t i = 0; i < N; i++) { 
     temp[i] = u01(rng); 
    } 
    return temp; 
} 
// ### END utility for demo #### 

template <typename... Iterators> 
thrust::zip_iterator<thrust::tuple<Iterators...>> zip(Iterators... its) 
{ 
    return thrust::make_zip_iterator(thrust::make_tuple(its...)); 
} 

template <typename ZipIterator> 
class grid_op 
{ 
public: 
    grid_op(ZipIterator zipIt, std::size_t dim1, std::size_t dim2) : zipIt(zipIt), dim1(dim1), dim2(dim2){} 

    __host__ __device__ 
    float operator()(std::size_t index) const 
    { 
     const auto coords = unflatten_3d_index(index, dim1, dim2); 
     const auto values = zipIt[thrust::get<2>(coords)]; 
     const float delta = 5; 
     const float _theta = (0.5f + thrust::get<0>(coords))*delta; 
     const float _phi = (0.5f + thrust::get<1>(coords))*delta/_theta; 
     const float _l = sin(_theta* M_PI/180.0)*cos(_phi* M_PI/180.0); 
     const float _m = sin(_theta* M_PI/180.0)*sin(_phi* M_PI/180.0); 
     const float _n = cos(_theta* M_PI/180.0); 
     const float _k = -(_l*thrust::get<0>(values) + _m*thrust::get<1>(values) + _n*thrust::get<2>(values)); 
     return (thrust::get<3>(values) - _k); 
    } 

private: 
    __host__ __device__ 
    thrust::tuple<std::size_t, std::size_t, std::size_t> 
    unflatten_3d_index(std::size_t index, std::size_t dim1, std::size_t dim2) const 
    { 
     // taken from https://stackoverflow.com/questions/29142417/4d-position-from-1d-index 
     std::size_t x = index % dim1; 
     std::size_t y = ((index - x)/dim1) % dim2; 
     std::size_t z = ((index - y * dim1 - x)/(dim1 * dim2)); 
     return thrust::make_tuple(x,y,z); 
    } 

    ZipIterator zipIt; 
    std::size_t dim1; 
    std::size_t dim2; 
}; 

template <typename ZipIterator> 
grid_op<ZipIterator> make_grid_op(ZipIterator zipIt, std::size_t dim1, std::size_t dim2) 
{ 
    return grid_op<ZipIterator>(zipIt, dim1, dim2); 
} 

int main() 
{ 
    const int nbint = 3; 
    const int nbinp = 4; 
    const int hits = 20; 
    const std::size_t N = nbint * nbinp * hits; 

    thrust::device_vector<float> d_x = random_vector(hits); 
    thrust::device_vector<float> d_y = random_vector(hits); 
    thrust::device_vector<float> d_z = random_vector(hits); 
    thrust::device_vector<float> d_a = random_vector(hits); 

    auto zipIt = zip(d_x.begin(), d_y.begin(), d_z.begin(), d_a.begin()); 
    auto countingIt = thrust::counting_iterator<std::size_t>(0); 
    auto unary_op = make_grid_op(zipIt, nbint, nbinp); 
    auto binary_op = thrust::maximum<float>(); 
    const float init = 0; 

    float max = thrust::transform_reduce(
     countingIt, countingIt+N, 
     unary_op, 
     init, 
     binary_op 
    ); 

    std::cout << "max = " << max << std::endl; 
}

谢谢，伙计。你的回答非常有帮助。我会检查编程指南，了解更多关于推力的信息，索引部分也非常有帮助。也许这是我对交换和联想的误解（对不起），但我真的需要这里的排序部分。我可以做它包括排序？ – Alex

@Alex当然，你可以添加排序功能，但是你将无法进行任何动态缩减，因此性能会低得多。你应该真的编辑你的问题来显示为什么你需要排序。 –

我编辑我的问题显示为什么我需要排序部分。希望你能理解，或者我可以画出一张关于此的图片。谢谢〜 – Alex

是否可以使用CUDA并行化该嵌套for循环？

相关推荐