工作小计- RGB相关算子实现

项目中的模型一直都是直接操作NV12的yuv格式数据，这次的模型只支持RGB格式的输入，正好来自己实现对应的算子。
这里记录一下对应算子的实现过程，主要涉及到NV12到RGB的变换，RGB的crop/resize操作，对于数据的Norm/ToFloat操作，调整Layout等等。

cu文件是要nvcc来进行编译的，但是其头文件可以供外部的cpp文件调用，另外这里的核函数并没有涉及到stream的考虑，因为这个涉及到之后的性能优化环节，要有先来后到。实际stream也就是在核函数调用前的<<<>>>中传入stream而已，然后之后要跟着同步stream的操作。与函数实现逻辑无关。

cuda_transformation.cu

在这里实现真正的核函数，

NV12toRGB

在这里插入图片描述

这里的坑点在于 BT.601/709 FULL/非FULL的yuv格式，如果出了差错会导致图像看起来色度不对，遇到过的问题就是红色很不明显，原因就是转换公式写的有问题。

__global__ void NV12toRGB(uint8_t *yuv, uint8_t *rgb, int width,
                          int height) {
  const int nv_start = width * height;
  int i, j, nv_index = 0;
  uint8_t y, u, v;
  int r, g, b;

  j = blockIdx.x * blockDim.x + threadIdx.x;
  i = blockIdx.y * blockDim.y + threadIdx.y;

  if (i >= height || j >= width)
    return;

  nv_index = i / 2 * width + j - j % 2;

  int rgb_index = i * width + j;

  y = yuv[rgb_index];
  u = yuv[nv_start + nv_index];
  v = yuv[nv_start + nv_index + 1];

  r = y + (140 * (v - 128)) / 100;                         // r
  g = y - (34 * (u - 128)) / 100 - (71 * (v - 128)) / 100; // g
  b = y + (177 * (u - 128)) / 100;                         // b

  if (r > 255)
    r = 255;
  if (g > 255)
    g = 255;
  if (b > 255)
    b = 255;
  if (r < 0)
    r = 0;
  if (g < 0)
    g = 0;
  if (b < 0)
    b = 0;

  rgb[rgb_index * 3 + 0] = b;
  rgb[rgb_index * 3 + 1] = g;
  rgb[rgb_index * 3 + 2] = r;
}

int cudaNV12toRGB(uint8_t *input, uint8_t *output, size_t width,
                  size_t height) {
  if (!input || !output)
    return cudaErrorInvalidDevicePointer;

  const dim3 blockDim(32, 32, 1);
  const dim3 gridDim((width + blockDim.x - 1) / blockDim.x,
                     (height + blockDim.y - 1) / blockDim.y, 1);
  NV12toRGB<<<gridDim, blockDim>>>(input, output, width, height);
  return cudaDeviceSynchronize();
}

RGBBilinearResize

__global__ void RGBBilinearResize(uint8_t *input, uint8_t *output,
                                  int inputWidth, int inputHeight,
                                  int outputWidth, int outputHeight) {
  // 计算线程的全局索引
  int x = blockIdx.x * blockDim.x + threadIdx.x;
  int y = blockIdx.y * blockDim.y + threadIdx.y;

  if (x >= outputWidth || y >= outputHeight)
    return;

  // gx,gy是相对于resize后的图中的点，这里计算对应的原图中的浮点位置，确定要从哪里采样
  float gx = ((float)x) / outputWidth * (inputWidth - 1);
  float gy = ((float)y) / outputHeight * (inputHeight - 1);

  // 对应的整数位置及其偏移量
  int gxi = (int)gx;
  int gyi = (int)gy;
  float dx = gx - gxi;
  float dy = gy - gyi;

  // 读取四个最近的像素值
  uint8_t topLeft[3] = {input[(gyi * inputWidth + gxi) * 3 + 0],
                              input[(gyi * inputWidth + gxi) * 3 + 1],
                              input[(gyi * inputWidth + gxi) * 3 + 2]};
  uint8_t topRight[3] = {input[(gyi * inputWidth + gxi + 1) * 3 + 0],
                               input[(gyi * inputWidth + gxi + 1) * 3 + 1],
                               input[(gyi * inputWidth + gxi + 1) * 3 + 2]};
  uint8_t bottomLeft[3] = {input[((gyi + 1) * inputWidth + gxi) * 3 + 0],
                                 input[((gyi + 1) * inputWidth + gxi) * 3 + 1],
                                 input[((gyi + 1) * inputWidth + gxi) * 3 + 2]};
  uint8_t bottomRight[3] = {
      input[((gyi + 1) * inputWidth + gxi + 1) * 3 + 0],
      input[((gyi + 1) * inputWidth + gxi + 1) * 3 + 1],
      input[((gyi + 1) * inputWidth + gxi + 1) * 3 + 2]};

  // 对每个通道进行双线性插值
  for (int i = 0; i < 3; i++) {
    float top = topLeft[i] * (1 - dx) + topRight[i] * dx;
    float bottom = bottomLeft[i] * (1 - dx) + bottomRight[i] * dx;
    output[(y * outputWidth + x) * 3 + i] = top * (1 - dy) + bottom * dy;
  }
}

int cudaRGBBilinearResize(uint8_t *input, uint8_t *output, size_t width,
                          size_t height, size_t resize_width,
                          size_t resize_height) {
  if (!input || !output)
    return cudaErrorInvalidDevicePointer;

  const dim3 blockDim(32, 32, 1);
  const dim3 gridDim((width + blockDim.x - 1) / blockDim.x,
                     (height + blockDim.y - 1) / blockDim.y, 1);
  RGBBilinearResize<<<gridDim, blockDim>>>(input, output, width, height,
                                           resize_width, resize_height);
  return cudaDeviceSynchronize();
}

RGBToFloat

这里的实现要额外记录下，因为涉及到debug中的opencv-dump所以在传入模型前的数据都是BGR格式的，在转浮点这里重新调整成模型需要的RGB格式。

__global__ void RGBToFloat(uint8_t *input, float *output, int width, int height) {
    int x = blockIdx.x * blockDim.x + threadIdx.x;
    int y = blockIdx.y * blockDim.y + threadIdx.y;

    if (x >= width || y >= height) return;

    int idx = y * width + x;
    output[idx * 3 + 0] = input[idx * 3 + 2] / 255.0f; // R
    output[idx * 3 + 1] = input[idx * 3 + 1] / 255.0f; // G
    output[idx * 3 + 2] = input[idx * 3 + 0] / 255.0f; // B
}

int cudaRGBToFloat(uint8_t *input, float *output, int width, int height) {
    dim3 blockDim(16, 16);
    dim3 gridDim((width + blockDim.x - 1) / blockDim.x, (height + blockDim.y - 1) / blockDim.y);

    RGBToFloat<<<gridDim, blockDim>>>(input, output, width, height);

    return cudaDeviceSynchronize();
}

RGBNormalize

__global__ void RGBNormalize(float *image, int width, int height, float mean[], float std[]) {
    int x = blockIdx.x * blockDim.x + threadIdx.x;
    int y = blockIdx.y * blockDim.y + threadIdx.y;

    if (x >= width || y >= height) {
        return;
    }

    int idx = y * width + x;

    if (std[0] < 1e-6 || std[1] < 1e-6 || std[2] < 1e-6) {
        printf("Error: std values are too small for safe division.\n");
        return;
    }

    image[idx * 3 + 0] = (image[idx * 3 + 0] - mean[0]) / std[0]; // B
    image[idx * 3 + 1] = (image[idx * 3 + 1] - mean[1]) / std[1]; // G
    image[idx * 3 + 2] = (image[idx * 3 + 2] - mean[2]) / std[2]; // R
}

int cudaRGBNormalize(float *d_image, int width, int height, float mean[], float std[]) {
    dim3 blockDim(16, 16);
    dim3 gridDim((width + blockDim.x - 1) / blockDim.x, (height + blockDim.y - 1) / blockDim.y);
    
    RGBNormalize<<<gridDim, blockDim>>>(d_image, width, height, mean, std);
    
    cudaError_t cudaStatus = cudaDeviceSynchronize();
    if (cudaStatus != cudaSuccess) {
        fprintf(stderr, "CUDA error: %s\n", cudaGetErrorString(cudaStatus));
        return -1;
    }
    
    return 0;
}

HWC2CHW

template <typename T>
__global__ void HWC2CHW(const T* input, T* output, int height, int width) {
    int x = blockIdx.x * blockDim.x + threadIdx.x;
    int y = blockIdx.y * blockDim.y + threadIdx.y;

    if (x >= width || y >= height) return;

    int channelSize = width * height;
    int hwcIndex = y * width + x;
    int chwIndex;

    for (int c = 0; c < 3; ++c) {
        chwIndex = c * channelSize + y * width + x;
        output[chwIndex] = input[hwcIndex * 3 + c];
    }
}

template <typename T>
int cudaHWC2CHW(const T* input, T* output, int height, int width) {
    dim3 blockDim(16, 16);
    dim3 gridDim((width + blockDim.x - 1) / blockDim.x, (height + blockDim.y - 1) / blockDim.y);

    HWC2CHW<<<gridDim, blockDim>>>(input, output, height, width);
    return cudaDeviceSynchronize();
}
template int cudaHWC2CHW<float>(const float* input, float* output, int height, int width);

cuda_transformation.h

void convertNV12toYUV444withActions_cuda(uint8_t *src_img, uint8_t *src_imgcuda,
                                         uint8_t *tmpImagecuda,
                                         ImageTransParam &trans_param,
                                         uint8_t *dst_imgcuda, uint8_t *dst_img,
                                         cudaStream_t stream);
void convertNV12toYUV444withActions_cuda1(uint8_t *src_imgcuda,
                                          ImageTransParam &trans_param,
                                          uint8_t *dst_imgcuda);

int cudaNV12toRGB(uint8_t* input, uint8_t* output, size_t width, size_t height);

int cudaRGBBilinearResize(uint8_t *input, uint8_t *output, size_t width,
                          size_t height, size_t resize_width,
                          size_t resize_height);

int cudaRGBToFloat(uint8_t *input, float *output, int width, int height);

int cudaRGBNormalize(float *d_image, int width, int height, float mean[], float std[]);

template <typename T>
int cudaHWC2CHW(const T* input, T* output, int height, int width);

image_transformation.h

这里也是对该变换进行封装，虽然项目是面向对象的抽象出了类似Transformer这个类，但是出于逻辑清晰和方便调试，我这里提供的都是面向过程的代码，另外附上了cpu中算子的实现。实际上一个图像处理算子的实现，一般过程是先生成cpu的，基于NCHW的循环版本，再对其改装成gpu上的算子，毕竟gpu的算子调试相较cpu不是很方便。虽然有cuda-gdb这种东西。可以看到cpu和gpu的版本基本上只在循环方式上有差别，因此核函数也是可以称为 for_each_pixel_func

  void TransformNV12toRGB(uint8_t *input, uint8_t *output,
                         int width, int height) {
    int ret = cudaNV12toRGB(input, output, width, height);
    if (ret != 0){
        HSLOG_E << "cudaNV12toRGB FAILED";
    }
  }

  void CpuTransformNV12toRGB(uint8_t *yuv, uint8_t *rgb,
                             int width, int height) {
    const int nv_start = width * height;
    uint32_t i, j, index = 0, rgb_index = 0;
    uint8_t y, u, v;
    int r, g, b, nv_index = 0;

    for (i = 0; i < height; i++) {
      for (j = 0; j < width; j++) {
        // nv_index = (rgb_index / 2 - width / 2 * ((i + 1) / 2)) * 2;
        nv_index = i / 2 * width + j - j % 2;

        y = yuv[rgb_index];
        u = yuv[nv_start + nv_index];
        v = yuv[nv_start + nv_index + 1];

        r = y + (140 * (v - 128)) / 100;                         // r
        g = y - (34 * (u - 128)) / 100 - (71 * (v - 128)) / 100; // g
        b = y + (177 * (u - 128)) / 100;                         // b

        if (r > 255)
          r = 255;
        if (g > 255)
          g = 255;
        if (b > 255)
          b = 255;
        if (r < 0)
          r = 0;
        if (g < 0)
          g = 0;
        if (b < 0)
          b = 0;

        // index = rgb_index % width + (height - i - 1) * width;
        index = rgb_index % width + i * width;
        rgb[index * 3 + 0] = b;
        rgb[index * 3 + 1] = g;
        rgb[index * 3 + 2] = r;
        rgb_index++;
      }
    }
  }

  void TransformRGBResize(uint8_t *input, uint8_t *output, size_t width,
                          size_t height, size_t resize_width,
                          size_t resize_height) {
    int ret = cudaRGBBilinearResize(input, output, width, height, resize_width, resize_height);
    if (ret != 0){
        HSLOG_E << "cudaRGBBilinearResize FAILED: " << ret;
    }
  }

  void CPURGBBilinearResize(uint8_t *input, uint8_t *output,
                            int inputWidth, int inputHeight, int outputWidth,
                            int outputHeight) {
    for (int y = 0; y < outputHeight; y++) {
      for (int x = 0; x < outputWidth; x++) {
        // 计算对应的原图中的浮点位置
        float gx = ((float)x) / outputWidth * (inputWidth - 1);
        float gy = ((float)y) / outputHeight * (inputHeight - 1);

        // 对应的整数位置及其偏移量
        int gxi = (int)gx;
        int gyi = (int)gy;
        float dx = gx - gxi;
        float dy = gy - gyi;

        // 读取四个最近的像素值
        uint8_t topLeft[3] = {input[(gyi * inputWidth + gxi) * 3 + 0],
                                    input[(gyi * inputWidth + gxi) * 3 + 1],
                                    input[(gyi * inputWidth + gxi) * 3 + 2]};
        uint8_t topRight[3] = {
            input[(gyi * inputWidth + gxi + 1) * 3 + 0],
            input[(gyi * inputWidth + gxi + 1) * 3 + 1],
            input[(gyi * inputWidth + gxi + 1) * 3 + 2]};
        uint8_t bottomLeft[3] = {
            input[((gyi + 1) * inputWidth + gxi) * 3 + 0],
            input[((gyi + 1) * inputWidth + gxi) * 3 + 1],
            input[((gyi + 1) * inputWidth + gxi) * 3 + 2]};
        uint8_t bottomRight[3] = {
            input[((gyi + 1) * inputWidth + gxi + 1) * 3 + 0],
            input[((gyi + 1) * inputWidth + gxi + 1) * 3 + 1],
            input[((gyi + 1) * inputWidth + gxi + 1) * 3 + 2]};

        // 对每个通道进行双线性插值
        for (int i = 0; i < 3; i++) {
          float top = topLeft[i] * (1 - dx) + topRight[i] * dx;
          float bottom = bottomLeft[i] * (1 - dx) + bottomRight[i] * dx;
          output[(y * outputWidth + x) * 3 + i] =
              static_cast<uint8_t>(top * (1 - dy) + bottom * dy);
        }
      }
    }
  }

  void TransfromConvertRGBToFloat(uint8_t *input, float *output, int width, int height){
    int ret = cudaRGBToFloat(input, output, width, height);
    if (ret != 0){
        HSLOG_E << "cudaRGBToFloat FAILED: " << ret;
    }
  }

  void TransfromRGBNormalize(float *input, int width, int height, float* mean, float* std){
    int ret = cudaRGBNormalize(input, width, height, mean, std);
    if (ret != 0){
        HSLOG_E << "cudaRGBNormalize FAILED: " << ret;
    }
  }

  template <typename T>
  int TransfromHWC2CHW(const T* input, T* output, int height, int width){
    int ret = cudaHWC2CHW<T>(input, output, height, width);
    if (ret != 0){
      HSLOG_E << "cudaHWC2CHW FAILED: " << ret;
    }
  }

pre_process_module.cpp

这里额外加入一些dump的操作，以及debuggpu前N个字节的操作，方便调试

void PreProcessModule::Transform21dImage(hobot::dataflow::spMsgResourceProc proc,
                      const hobot::dataflow::MessageLists &msgs){
  UNUSED(proc);
  auto &input_img_batch_msgs = msgs[0];
  std::shared_ptr<ImageBatchMsg<GPUImageMsg>> batch_image_msg =
      std::static_pointer_cast<ImageBatchMsg<GPUImageMsg>>(
          input_img_batch_msgs->at(0));

  for (int i = 0; i < batch_image_msg->batch_size_; ++i) {
    auto image_msg = batch_image_msg->batch_img_msg_[i];
    int height = image_msg->img_trans_param_.src_height;
    int width = image_msg->img_trans_param_.src_width;

    image_transformation_[i].TransformNV12toRGB(image_msg->cuda_nv12_, image_transformation_[i].cuda_image_out_, width, height);

        static int cnt = 0;
    if (true)
    {
        std::string input_file_path= "/home/yuxuan03.zhang/utils_code/lcc/query/" + std::to_string(cnt) + ".jpg";
        cv::Mat bgrImage = cv::imread(input_file_path);
        if (bgrImage.empty()) {
            std::cerr << "Error: Image cannot be loaded!" << std::endl;
        }
        size_t size = bgrImage.total() * bgrImage.elemSize(); // 计算需要复制的内存大小
        HSLOG_E << "height: " << height << "width: " << width << "size: " << size << "file" << input_file_path;
        // 将数据从 cv::Mat 复制到 GPU 内存
        cudaMemcpy(image_transformation_[i].cuda_image_out_, bgrImage.ptr(), size, cudaMemcpyHostToDevice);
        image_msg->SetDoneTimestamp(cnt);
        cnt++;
    }


    // int size = width * height * 3 / 2;
    // uint8_t* cpu_nv12 = new uint8_t[size];
    // cudaMemcpy(cpu_nv12, image_msg->cuda_nv12_, size, cudaMemcpyDeviceToHost);
    // cv::Mat nv12Img(height + height / 2, width, CV_8UC1, cpu_nv12);
    // cv::Mat bgrImg;
    // cv::cvtColor(nv12Img, bgrImg, cv::COLOR_YUV2BGR_NV12);
    // std::string file = std::to_string(image_msg->GetGenTimestamp()) + "_nv12.png";
    // cv::imwrite(file, bgrImg);
    // delete[] cpu_nv12;

    // int dataSize = width * height * 3;
    // uint8_t* cpu_rgb = new uint8_t[dataSize];
    // cudaMemcpy(cpu_rgb, image_transformation_[i].cuda_image_out_, dataSize, cudaMemcpyDeviceToHost);
    // cv::Mat rgb_img(height, width, CV_8UC3, cpu_rgb);
    // std::string file1 = std::to_string(image_msg->GetGenTimestamp()) + "_rgb.png";
    // cv::imwrite(file1, rgb_img);
    // delete[] cpu_rgb;


    image_transformation_[i].TransformRGBResize(image_transformation_[i].cuda_image_out_, image_transformation_[i].cuda_image_trans_buffer_, width, height, 910, 512);
    HSLOG_E <<"Resize: " << PrintFirstNUint8Bytes((uint8_t*)image_transformation_[i].cuda_image_trans_buffer_);
    // uint8_t* cpu_rgb_resize = new uint8_t[910*512*3];
    // cudaMemcpy(cpu_rgb_resize, image_transformation_[i].cuda_image_trans_buffer_, 910*512*3, cudaMemcpyDeviceToHost);
    // cv::Mat rgb_resize_img(512, 910, CV_8UC3, cpu_rgb_resize);
    // std::string file2 = std::to_string(image_msg->GetGenTimestamp()) + "_rgb_resize.png";
    // cv::imwrite(file2, rgb_resize_img);
    // delete[] cpu_rgb_resize;

    image_transformation_[i].TransfromConvertRGBToFloat(image_transformation_[i].cuda_image_trans_buffer_, (float*)image_transformation_[i].cuda_image_out_, 910, 512);
    HSLOG_E <<"BRGToRGBFloat: " << PrintFirstNFloatBytes((float*)image_transformation_[i].cuda_image_out_);

    std::vector<float> mean = {0.485, 0.456, 0.406};
    std::vector<float> std  = {0.229, 0.224, 0.225};
    float* mean_gpu = (float*)image_transformation_[i].cuda_image_trans_buffer_;
    float* std_gpu = mean_gpu+3;
    cudaMemcpy(mean_gpu, mean.data(), 3 * sizeof(float), cudaMemcpyHostToDevice);
    cudaMemcpy(std_gpu, std.data(), 3 * sizeof(float), cudaMemcpyHostToDevice);
    image_transformation_[i].TransfromRGBNormalize((float*)image_transformation_[i].cuda_image_out_, 910, 512, mean_gpu, std_gpu);
    HSLOG_E <<"Norm: " << PrintFirstNFloatBytes((float*)image_transformation_[i].cuda_image_out_);
    image_transformation_[i].TransfromHWC2CHW((float*)image_transformation_[i].cuda_image_out_, (float*)image_msg->cuda_yuv444_, 512, 910);
    HSLOG_E <<"HWC2CHW: " << PrintFirstNFloatBytes((float*)image_msg->cuda_yuv444_);

    if (true) {
        float *cuda_image_out_ = (float*)image_msg->cuda_yuv444_;
        size_t dataSize = 3 * 512 * 910 * sizeof(float);
        float *hostData = new float[dataSize / sizeof(float)];
        cudaMemcpy(hostData, cuda_image_out_, dataSize, cudaMemcpyDeviceToHost);
        std::string input_file_path= "./dump_bin/" + std::to_string(cnt) + ".bin";
        std::ofstream outFile(input_file_path, std::ios::out | std::ios::binary);
        outFile.write(reinterpret_cast<char *>(hostData), dataSize);
        outFile.close();
        delete[] hostData;
    }
  }
  SEND_DATA(SLOT_OUT_BATCH_TRANS_IMAGE, batch_image_msg);
}