Различия между NVCC и NVRTC при компиляции в PTX

Question

Различия между NVCC и NVRTC при компиляции в PTX

Резюме

Я портирую простое приложение для трассировки лучей, основанное на версии Scratchapixel, на несколько библиотек графического процессора. Я успешно перенес его на CUDA, используя API среды выполнения и API драйвера, но он выдает ошибкуSegmentation fault (core dumped)когда я пытаюсь использовать PTX, скомпилированный во время выполнения с помощью NVRTC. Если я раскомментирую#include <math.h> в начале файла ядра (см. ниже), он по-прежнему работает с использованием NVCC (сгенерированный PTX точно такой же), но не выполняется при компиляции с использованием NVRTC.

Я хочу знать, как заставить NVRTC вести себя так же, как NVCC (возможно ли это вообще?), Или, по крайней мере, понять причину этих проблем.

Подробное описание

файл kernel.cu (Источник ядра):

//#include <math.h>

#define MAX_RAY_DEPTH 5

template<typename T>
class Vec3
{
public:
    T x, y, z;
    __device__ Vec3() : x(T(0)), y(T(0)), z(T(0)) {}
    __device__ Vec3(T xx) : x(xx), y(xx), z(xx) {}
    __device__ Vec3(T xx, T yy, T zz) : x(xx), y(yy), z(zz) {}
    __device__ Vec3& normalize()
    {
        T nor2 = length2();
        if (nor2 > 0) {
            T invNor = 1 / sqrt(nor2);
            x *= invNor, y *= invNor, z *= invNor;
        }
        return *this;
    }
    __device__ Vec3<T> operator * (const T &f) const { return Vec3<T>(x * f, y * f, z * f); }
    __device__ Vec3<T> operator * (const Vec3<T> &v) const { return Vec3<T>(x * v.x, y * v.y, z * v.z); }
    __device__ T dot(const Vec3<T> &v) const { return x * v.x + y * v.y + z * v.z; }
    __device__ Vec3<T> operator - (const Vec3<T> &v) const { return Vec3<T>(x - v.x, y - v.y, z - v.z); }
    __device__ Vec3<T> operator + (const Vec3<T> &v) const { return Vec3<T>(x + v.x, y + v.y, z + v.z); }
    __device__ Vec3<T>& operator += (const Vec3<T> &v) { x += v.x, y += v.y, z += v.z; return *this; }
    __device__ Vec3<T>& operator *= (const Vec3<T> &v) { x *= v.x, y *= v.y, z *= v.z; return *this; }
    __device__ Vec3<T> operator - () const { return Vec3<T>(-x, -y, -z); }
    __device__ T length2() const { return x * x + y * y + z * z; }
    __device__ T length() const { return sqrt(length2()); }
};

typedef Vec3<float> Vec3f;
typedef Vec3<bool> Vec3b;

class Sphere
{
public:
    const char* id;
    Vec3f center;                           /// position of the sphere
    float radius, radius2;                  /// sphere radius and radius^2
    Vec3f surfaceColor, emissionColor;      /// surface color and emission (light)
    float transparency, reflection;         /// surface transparency and reflectivity
    int animation_frame;
    Vec3b animation_position_rand;
    Vec3f animation_position;
    Sphere(
        const char* id,
        const Vec3f &c,
        const float &r,
        const Vec3f &sc,
        const float &refl = 0,
        const float &transp = 0,
        const Vec3f &ec = 0) :
        id(id), center(c), radius(r), radius2(r * r), surfaceColor(sc),
        emissionColor(ec), transparency(transp), reflection(refl)
    {
        animation_frame = 0;
    }
    //[comment]
    // Compute a ray-sphere intersection using the geometric solution
    //[/comment]
    __device__ bool intersect(const Vec3f &rayorig, const Vec3f &raydir, float &t0, float &t1) const
    {
        Vec3f l = center - rayorig;
        float tca = l.dot(raydir);
        if (tca < 0) return false;
        float d2 = l.dot(l) - tca * tca;
        if (d2 > radius2) return false;
        float thc = sqrt(radius2 - d2);
        t0 = tca - thc;
        t1 = tca + thc;

        return true;
    }
};

__device__ float mix(const float &a, const float &b, const float &mixval)
{
    return b * mixval + a * (1 - mixval);
}

__device__ Vec3f trace(
    const Vec3f &rayorig,
    const Vec3f &raydir,
    const Sphere *spheres,
    const unsigned int spheres_size,
    const int &depth)
{
    float tnear = INFINITY;
    const Sphere* sphere = NULL;
    // find intersection of this ray with the sphere in the scene
    for (unsigned i = 0; i < spheres_size; ++i) {
        float t0 = INFINITY, t1 = INFINITY;
        if (spheres[i].intersect(rayorig, raydir, t0, t1)) {
            if (t0 < 0) t0 = t1;
            if (t0 < tnear) {
                tnear = t0;
                sphere = &spheres[i];
            }
        }
    }
    // if there's no intersection return black or background color
    if (!sphere) return Vec3f(2);
    Vec3f surfaceColor = 0; // color of the ray/surfaceof the object intersected by the ray
    Vec3f phit = rayorig + raydir * tnear; // point of intersection
    Vec3f nhit = phit - sphere->center; // normal at the intersection point
    nhit.normalize(); // normalize normal direction
    // If the normal and the view direction are not opposite to each other
    // reverse the normal direction. That also means we are inside the sphere so set
    // the inside bool to true. Finally reverse the sign of IdotN which we want
    // positive.
    float bias = 1e-4; // add some bias to the point from which we will be tracing
    bool inside = false;
    if (raydir.dot(nhit) > 0) nhit = -nhit, inside = true;
    if ((sphere->transparency > 0 || sphere->reflection > 0) && depth < MAX_RAY_DEPTH) {
        float facingratio = -raydir.dot(nhit);
        // change the mix value to tweak the effect
        float fresneleffect = mix(pow(1 - facingratio, 3), 1, 0.1);
        // compute reflection direction (not need to normalize because all vectors
        // are already normalized)
        Vec3f refldir = raydir - nhit * 2 * raydir.dot(nhit);
        refldir.normalize();
        Vec3f reflection = trace(phit + nhit * bias, refldir, spheres, spheres_size, depth + 1);
        Vec3f refraction = 0;
        // if the sphere is also transparent compute refraction ray (transmission)
        if (sphere->transparency) {
            float ior = 1.1, eta = (inside) ? ior : 1 / ior; // are we inside or outside the surface?
            float cosi = -nhit.dot(raydir);
            float k = 1 - eta * eta * (1 - cosi * cosi);
            Vec3f refrdir = raydir * eta + nhit * (eta *  cosi - sqrt(k));
            refrdir.normalize();
            refraction = trace(phit - nhit * bias, refrdir, spheres, spheres_size, depth + 1);
        }
        // the result is a mix of reflection and refraction (if the sphere is transparent)
        surfaceColor = (
            reflection * fresneleffect +
            refraction * (1 - fresneleffect) * sphere->transparency) * sphere->surfaceColor;
    }
    else {
        // it's a diffuse object, no need to raytrace any further
        for (unsigned i = 0; i < spheres_size; ++i) {
            if (spheres[i].emissionColor.x > 0) {
                // this is a light
                Vec3f transmission = 1;
                Vec3f lightDirection = spheres[i].center - phit;
                lightDirection.normalize();
                for (unsigned j = 0; j < spheres_size; ++j) {
                    if (i != j) {
                        float t0, t1;
                        if (spheres[j].intersect(phit + nhit * bias, lightDirection, t0, t1)) {
                            transmission = 0;
                            break;
                        }
                    }
                }
                surfaceColor += sphere->surfaceColor * transmission *
                max(float(0), nhit.dot(lightDirection)) * spheres[i].emissionColor;
            }
        }
    }

    return surfaceColor + sphere->emissionColor;
}

extern "C" __global__
void raytrace_kernel(unsigned int width, unsigned int height, Vec3f *image, Sphere *spheres, unsigned int spheres_size, float invWidth, float invHeight, float aspectratio, float angle) {
    int x = blockIdx.x * blockDim.x + threadIdx.x;
    int y = blockIdx.y * blockDim.y + threadIdx.y;

    if (y < height && x < width) {
        float xx = (2 * ((x + 0.5) * invWidth) - 1) * angle * aspectratio;
        float yy = (1 - 2 * ((y + 0.5) * invHeight)) * angle;
        Vec3f raydir(xx, yy, -1);
        raydir.normalize();
        image[y*width+x] = trace(Vec3f(0), raydir, spheres, spheres_size, 0);
    }
}

Я могу успешно скомпилировать его с помощью: nvcc --ptx kernel.cu -o kernel.ptx( полный PTX здесь) и используйте этот PTX в API драйвера сcuModuleLoadDataExиспользуя следующий фрагмент. Работает как положено.

Он отлично работает, даже если я раскомментирую #include <math.h> линия (фактически, генерируемый PTX точно такой же).

CudaSafeCall( cuInit(0) );

CUdevice device;
CudaSafeCall( cuDeviceGet(&device, 0) );

CUcontext context;
CudaSafeCall( cuCtxCreate(&context, 0, device) );

unsigned int error_buffer_size = 1024;
std::vector<CUjit_option> options;
std::vector<void*> values;
char* error_log = new char[error_buffer_size];
options.push_back(CU_JIT_ERROR_LOG_BUFFER); //Pointer to a buffer in which to print any log messages that reflect errors
values.push_back(error_log);
options.push_back(CU_JIT_ERROR_LOG_BUFFER_SIZE_BYTES); //Log buffer size in bytes. Log messages will be capped at this size (including null terminator)
values.push_back(&error_buffer_size);
options.push_back(CU_JIT_TARGET_FROM_CUCONTEXT); //Determines the target based on the current attached context (default)
values.push_back(0); //No option value required for CU_JIT_TARGET_FROM_CUCONTEXT

CUmodule module;
CUresult status = cuModuleLoadDataEx(&module, ptxSource, options.size(), options.data(), values.data());
if (error_log && error_log[0]) { //https://stackru.com/a/7970669/3136474
    std::cout << "Compiler error: " << error_log << std::endl;
}
CudaSafeCall( status );

Однако всякий раз, когда я пытаюсь скомпилировать именно это ядро с помощью NVRTC ( полный PTX здесь), оно компилируется успешно, но дает мнеSegmentation fault (core dumped) по звонку cuModuleLoadDataEx (при попытке использовать полученный PTX).

Если я раскомментирую #include <math.h> линия, он терпит неудачу на nvrtcCompileProgram вызов со следующим выводом:

nvrtcSafeBuild() failed at cuda_raytracer_nvrtc_api.cpp:221 : NVRTC_ERROR_COMPILATION
Build log:
/usr/include/bits/mathcalls.h(177): error: linkage specification is incompatible with previous "isinf"
__nv_nvrtc_builtin_header.h(126689): here

/usr/include/bits/mathcalls.h(211): error: linkage specification is incompatible with previous "isnan"
__nv_nvrtc_builtin_header.h(126686): here

2 errors detected in the compilation of "kernel.cu".

Код, который я использую для его компиляции с NVRTC:

nvrtcProgram prog;
NvrtcSafeCall( nvrtcCreateProgram(&prog, kernelSource, "kernel.cu", 0, NULL, NULL) );

// https://docs.nvidia.com/cuda/nvrtc/index.html#group__options
std::vector<const char*> compilationOpts;
compilationOpts.push_back("--device-as-default-execution-space");
// NvrtcSafeBuild is a macro which automatically prints nvrtcGetProgramLog if the compilation fails
NvrtcSafeBuild( nvrtcCompileProgram(prog, compilationOpts.size(), compilationOpts.data()), prog );

size_t ptxSize;
NvrtcSafeCall( nvrtcGetPTXSize(prog, &ptxSize) );
char* ptxSource = new char[ptxSize];
NvrtcSafeCall( nvrtcGetPTX(prog, ptxSource) );

NvrtcSafeCall( nvrtcDestroyProgram(&prog) );

Затем я просто загружаю ptxSource используя предыдущий фрагмент (обратите внимание: этот блок кода используется как для версии API драйвера, так и для версии NVRTC).

Дополнительные вещи, которые я заметил / пробовал до сих пор

PTX, порожденное NVCC и тот, порожденная NVRTC совершенно разные, но я не в состоянии понять их, чтобы определить возможные проблемы.
Пытался указать компилятору конкретную архитектуру GPU (в моем случае CC 6.1), без разницы.
Пытался отключить любые оптимизации компилятора (параметры --ftz=false --prec-sqrt=true --prec-div=true --fmad=false в nvrtcCompileProgram). Файл PTX стал больше, но все равно Segfaulting.
Пытался добавить --std=c++11 или --std=c++14к параметрам компилятора NVRTC. С любым из них NVRTC генерирует почти пустой (4 строки) PTX, но не выдает ни предупреждений, ни ошибок, пока я не попытаюсь его использовать.

Окружающая среда

ОС: Ubuntu 18.04.4 LTS 64-бит
nvcc --version: Инструменты компиляции Cuda, выпуск 10.1, V10.1.168. Дата постройки Ср_Апр_24_19:10:27_PDT_2019
gcc --version: gcc (Ubuntu 7.5.0-3ubuntu1~18.04) 7.5.0
Аппаратное обеспечение: Intel I7-7700HQ, GeForce GTX 1050 Ti

Редактировать на OP+1 день

Я забыл добавить свое окружение. См. Предыдущий раздел.

Также можно ли скомпилировать вывод nvrtc с помощью ptxas? - комментарий @talonmies

В nvcc-сгенерированный PTX компилируется с предупреждением:

$ ptxas -o /tmp/temp_ptxas_output.o kernel.ptx
ptxas warning : Stack size for entry function 'raytrace_kernel' cannot be statically determined

Это связано с рекурсивной функцией ядра ( подробнее об этом). Это можно смело игнорировать.

В nvrtc-generated PTX не компилируется и выдает ошибку:

$ ptxas -o /tmp/temp_ptxas_output.o nvrtc_kernel.ptx
ptxas fatal   : Unresolved extern function '_Z5powiffi'

На основании этого вопроса я добавил__device__ к Sphere конструктор класса и удален --device-as-default-execution-spaceвариант компилятора. Теперь он генерирует немного другой PTX, но по-прежнему представляет ту же ошибку.

Компиляция с #include <math.h>теперь генерирует много сообщений: "Функция без аннотаций пространства выполнения считается ведущей функцией, а ведущие функции не разрешены в режиме JIT". предупреждения помимо предыдущих ошибок.

Если я пытаюсь использовать принятое решение вопроса, оно вызывает кучу синтаксических ошибок и не компилируется. NVCC по-прежнему работает безупречно.

3

c++ cuda ptx nvrtc

Источник

user3136474 01 апр '20 в 06:37

1 ответ

Решение

Другие вопросы по тегам c++ cuda ptx nvrtc

user3136474 02 апр '20 в 04:37 2020-04-02 04:37 · Accepted Answer · 2020-04-02 04:37

Только что нашел виновника древним методом комментирования и проверки: ошибка исчезнет, если я удалюpow вызов, используемый для расчета эффекта Френеля внутри trace метод.

А пока я только что заменил pow(var, 3) за var*var*var.

Я создал MVCE и отправил в NVIDIA отчет об ошибке: https://developer.nvidia.com/nvidia_bug/2917596.

На что Лиам Чжан ответил и указал мне на проблему:

Проблема в вашем коде заключается в том, что в cuModuleLoadDataEx передается неверное значение параметра. В строках:
options.push_back(CU_JIT_ERROR_LOG_BUFFER_SIZE_BYTES); //Log buffer size in bytes. Log messages will be capped at this size (including null terminator)
values.push_back(&error_buffer_size);
предоставляется опция размера буфера, но вместо передачи значения с размером передается указатель на это значение. Поскольку этот указатель затем читается как число, драйвер предположил, что размер буфера намного больше 1024.
Во время компиляции NVRTC произошла ошибка "Неразрешенная внешняя функция", потому что сигнатура функции pow, как вы можете найти в документации, имеет следующий вид:
__device__ double pow ( double x, double y )
Когда драйвер пытался обнулить буфер при помещении в него сообщения об ошибке, произошел segfault.
Без вызова pow ошибки компиляции не было, поэтому буфер ошибок не использовался и segfault отсутствовал.
Чтобы обеспечить правильный код устройства, значения, используемые для вызова функции pow, а также выходной указатель должны быть двойным числом или функцией, эквивалентной float, powf, может быть использован.

Если я изменю вызов на values.push_back((void*)error_buffer_size); он сообщает ту же ошибку, что и ptxas компиляция сгенерированного PTX:

Compiler error: ptxas fatal   : Unresolved extern function '_Z5powiffi'
cudaSafeCall() failed at file.cpp:74 : CUDA_ERROR_INVALID_PTX - a PTX JIT compilation failed