// automatically generated, don't edit
#pragma once
// Embedded source text of the HIPRT vector-type header, stored verbatim as a
// NUL-terminated string.
//
// The text declares the hiprt::Vector<T, N> specialisations for N = 2, 3, 4
// and the hiprtInt*/hiprtUint*/hiprtFloat* aliases when __KERNELCC__ is not
// defined; otherwise it maps those aliases onto the built-in int2..float4
// types. It finishes with size checks for every alias.
//
// A delimited raw string literal is used so the text reads exactly as it is
// stored: the first character is a newline, tabs are literal, and the text
// ends with a newline right before the closing delimiter.
static const char* hip_hiprt_vec = R"hiprt(
#pragma once

#include <hiprt/hiprt_common.h>

#if !defined( __KERNELCC__ )
namespace hiprt
{
template <typename T, uint32_t N>
struct Vector;

template <typename T>
struct alignas( 2 * sizeof( T ) ) Vector<T, 2>
{
	T x, y;
};

template <typename T>
struct Vector<T, 3>
{
	T x, y, z;
};

template <typename T>
struct alignas( 4 * sizeof( T ) ) Vector<T, 4>
{
	T x, y, z, w;
};
} // namespace hiprt

using hiprtInt2	  = hiprt::Vector<int, 2>;
using hiprtInt3	  = hiprt::Vector<int, 3>;
using hiprtInt4	  = hiprt::Vector<int, 4>;
using hiprtUint2  = hiprt::Vector<unsigned int, 2>;
using hiprtUint3  = hiprt::Vector<unsigned int, 3>;
using hiprtUint4  = hiprt::Vector<unsigned int, 4>;
using hiprtFloat2 = hiprt::Vector<float, 2>;
using hiprtFloat3 = hiprt::Vector<float, 3>;
using hiprtFloat4 = hiprt::Vector<float, 4>;
#if defined( HIPRT_EXPORTS )
using int2	 = hiprtInt2;
using int3	 = hiprtInt3;
using int4	 = hiprtInt4;
using uint2	 = hiprtUint2;
using uint3	 = hiprtUint3;
using uint4	 = hiprtUint4;
using float2 = hiprtFloat2;
using float3 = hiprtFloat3;
using float4 = hiprtFloat4;
#endif
#else
using hiprtInt2	  = int2;
using hiprtInt3	  = int3;
using hiprtInt4	  = int4;
using hiprtUint2  = uint2;
using hiprtUint3  = uint3;
using hiprtUint4  = uint4;
using hiprtFloat2 = float2;
using hiprtFloat3 = float3;
using hiprtFloat4 = float4;
#endif

HIPRT_STATIC_ASSERT( sizeof( hiprtInt2 ) == 8 );
HIPRT_STATIC_ASSERT( sizeof( hiprtInt3 ) == 12 );
HIPRT_STATIC_ASSERT( sizeof( hiprtInt4 ) == 16 );
HIPRT_STATIC_ASSERT( sizeof( hiprtUint2 ) == 8 );
HIPRT_STATIC_ASSERT( sizeof( hiprtUint3 ) == 12 );
HIPRT_STATIC_ASSERT( sizeof( hiprtUint4 ) == 16 );
HIPRT_STATIC_ASSERT( sizeof( hiprtFloat2 ) == 8 );
HIPRT_STATIC_ASSERT( sizeof( hiprtFloat3 ) == 12 );
HIPRT_STATIC_ASSERT( sizeof( hiprtFloat4 ) == 16 );
)hiprt";
static const char* hip_hiprt_math= \
"\n"
"#pragma once\n"
"#include <hiprt/hiprt_common.h>\n"
"#include <hiprt/hiprt_vec.h>\n"
"\n"
"namespace hiprt\n"
"{\n"
"HIPRT_HOST_DEVICE HIPRT_INLINE int2 make_int2( const int c ) { return int2{ c, c }; }\n"
"\n"
"HIPRT_HOST_DEVICE HIPRT_INLINE int3 make_int3( const int c ) { return int3{ c, c, c }; }\n"
"\n"
"HIPRT_HOST_DEVICE HIPRT_INLINE int4 make_int4( const int c ) { return int4{ c, c, c, c }; }\n"
"\n"
"HIPRT_HOST_DEVICE HIPRT_INLINE int2 make_int2( const int3 a ) { return int2{ a.x, a.y }; }\n"
"\n"
"HIPRT_HOST_DEVICE HIPRT_INLINE int2 make_int2( const int4 a ) { return int2{ a.x, a.y }; }\n"
"\n"
"HIPRT_HOST_DEVICE HIPRT_INLINE int3 make_int3( const int4 a ) { return int3{ a.x, a.y, a.z }; }\n"
"\n"
"HIPRT_HOST_DEVICE HIPRT_INLINE int3 make_int3( const int2& a, const int c ) { return int3{ a.x, a.y, c }; }\n"
"\n"
"HIPRT_HOST_DEVICE HIPRT_INLINE int3 make_int3( const int c, const int2& a ) { return int3{ c, a.x, a.y }; }\n"
"\n"
"HIPRT_HOST_DEVICE HIPRT_INLINE int4 make_int4( const int2& a, const int c0, const int c1 ) { return int4{ a.x, a.y, c0, c1 }; }\n"
"\n"
"HIPRT_HOST_DEVICE HIPRT_INLINE int4 make_int4( const int c0, const int2& a, const int c1 ) { return int4{ c0, a.x, a.y, c1 }; }\n"
"\n"
"HIPRT_HOST_DEVICE HIPRT_INLINE int4 make_int4( const int c0, const int c1, const int2& a ) { return int4{ c0, c1, a.x, a.y }; }\n"
"\n"
"HIPRT_HOST_DEVICE HIPRT_INLINE int4 make_int4( const int3& a, const int c ) { return int4{ a.x, a.y, a.z, c }; }\n"
"\n"
"HIPRT_HOST_DEVICE HIPRT_INLINE int4 make_int4( const int c, const int3& a ) { return int4{ c, a.x, a.y, a.z }; }\n"
"\n"
"HIPRT_HOST_DEVICE HIPRT_INLINE int2 make_int2( const uint2& a )\n"
"{\n"
"	return int2{ static_cast<int>( a.x ), static_cast<int>( a.y ) };\n"
"}\n"
"\n"
"HIPRT_HOST_DEVICE HIPRT_INLINE int3 make_int3( const uint3& a )\n"
"{\n"
"	return int3{ static_cast<int>( a.x ), static_cast<int>( a.y ), static_cast<int>( a.z ) };\n"
"}\n"
"\n"
"HIPRT_HOST_DEVICE HIPRT_INLINE int4 make_int4( const uint4& a )\n"
"{\n"
"	return int4{ static_cast<int>( a.x ), static_cast<int>( a.y ), static_cast<int>( a.z ), static_cast<int>( a.w ) };\n"
"}\n"
"\n"
"HIPRT_HOST_DEVICE HIPRT_INLINE int2 make_int2( const float2& a )\n"
"{\n"
"	return int2{ static_cast<int>( a.x ), static_cast<int>( a.y ) };\n"
"}\n"
"\n"
"HIPRT_HOST_DEVICE HIPRT_INLINE int3 make_int3( const float3& a )\n"
"{\n"
"	return int3{ static_cast<int>( a.x ), static_cast<int>( a.y ), static_cast<int>( a.z ) };\n"
"}\n"
"\n"
"HIPRT_HOST_DEVICE HIPRT_INLINE int4 make_int4( const float4& a )\n"
"{\n"
"	return int4{ static_cast<int>( a.x ), static_cast<int>( a.y ), static_cast<int>( a.z ), static_cast<int>( a.w ) };\n"
"}\n"
"\n"
"HIPRT_HOST_DEVICE HIPRT_INLINE uint2 make_uint2( const unsigned int c ) { return uint2{ c, c }; }\n"
"\n"
"HIPRT_HOST_DEVICE HIPRT_INLINE uint3 make_uint3( const unsigned int c ) { return uint3{ c, c, c }; }\n"
"\n"
"HIPRT_HOST_DEVICE HIPRT_INLINE uint4 make_uint4( const unsigned int c ) { return uint4{ c, c, c, c }; }\n"
"\n"
"HIPRT_HOST_DEVICE HIPRT_INLINE uint2 make_uint2( const uint3 a ) { return uint2{ a.x, a.y }; }\n"
"\n"
"HIPRT_HOST_DEVICE HIPRT_INLINE uint2 make_uint2( const uint4 a ) { return uint2{ a.x, a.y }; }\n"
"\n"
"HIPRT_HOST_DEVICE HIPRT_INLINE uint3 make_uint3( const uint4 a ) { return uint3{ a.x, a.y, a.z }; }\n"
"\n"
"HIPRT_HOST_DEVICE HIPRT_INLINE uint3 make_uint3( const uint2& a, const unsigned int c ) { return uint3{ a.x, a.y, c }; }\n"
"\n"
"HIPRT_HOST_DEVICE HIPRT_INLINE uint3 make_uint3( const unsigned int c, const uint2& a ) { return uint3{ c, a.x, a.y }; }\n"
"\n"
"HIPRT_HOST_DEVICE HIPRT_INLINE uint4 make_uint4( const uint2& a, const unsigned int c0, const unsigned int c1 )\n"
"{\n"
"	return uint4{ a.x, a.y, c0, c1 };\n"
"}\n"
"\n"
"HIPRT_HOST_DEVICE HIPRT_INLINE uint4 make_uint4( const unsigned int c0, const uint2& a, const unsigned int c1 )\n"
"{\n"
"	return uint4{ c0, a.x, a.y, c1 };\n"
"}\n"
"\n"
"HIPRT_HOST_DEVICE HIPRT_INLINE uint4 make_uint4( const unsigned int c0, const unsigned int c1, const uint2& a )\n"
"{\n"
"	return uint4{ c0, c1, a.x, a.y };\n"
"}\n"
"\n"
"HIPRT_HOST_DEVICE HIPRT_INLINE uint4 make_uint4( const uint3& a, const unsigned int c ) { return uint4{ a.x, a.y, a.z, c }; }\n"
"\n"
"HIPRT_HOST_DEVICE HIPRT_INLINE uint4 make_uint4( const unsigned int c, const uint3& a ) { return uint4{ c, a.x, a.y, a.z }; }\n"
"\n"
"HIPRT_HOST_DEVICE HIPRT_INLINE uint2 make_uint2( const int2& a )\n"
"{\n"
"	return uint2{ static_cast<unsigned int>( a.x ), static_cast<unsigned int>( a.y ) };\n"
"}\n"
"\n"
"HIPRT_HOST_DEVICE HIPRT_INLINE uint3 make_uint3( const int3& a )\n"
"{\n"
"	return uint3{ static_cast<unsigned int>( a.x ), static_cast<unsigned int>( a.y ), static_cast<unsigned int>( a.z ) };\n"
"}\n"
"\n"
"HIPRT_HOST_DEVICE HIPRT_INLINE uint4 make_uint4( const int4& a )\n"
"{\n"
"	return uint4{\n"
"		static_cast<unsigned int>( a.x ),\n"
"		static_cast<unsigned int>( a.y ),\n"
"		static_cast<unsigned int>( a.z ),\n"
"		static_cast<unsigned int>( a.w ) };\n"
"}\n"
"\n"
"HIPRT_HOST_DEVICE HIPRT_INLINE uint2 make_uint2( const float2& a )\n"
"{\n"
"	return uint2{ static_cast<unsigned int>( a.x ), static_cast<unsigned int>( a.y ) };\n"
"}\n"
"\n"
"HIPRT_HOST_DEVICE HIPRT_INLINE uint3 make_uint3( const float3& a )\n"
"{\n"
"	return uint3{ static_cast<unsigned int>( a.x ), static_cast<unsigned int>( a.y ), static_cast<unsigned int>( a.z ) };\n"
"}\n"
"\n"
"HIPRT_HOST_DEVICE HIPRT_INLINE uint4 make_uint4( const float4& a )\n"
"{\n"
"	return uint4{\n"
"		static_cast<unsigned int>( a.x ),\n"
"		static_cast<unsigned int>( a.y ),\n"
"		static_cast<unsigned int>( a.z ),\n"
"		static_cast<unsigned int>( a.w ) };\n"
"}\n"
"\n"
"HIPRT_HOST_DEVICE HIPRT_INLINE float2 make_float2( const float c ) { return float2{ c, c }; }\n"
"\n"
"HIPRT_HOST_DEVICE HIPRT_INLINE float3 make_float3( const float c ) { return float3{ c, c, c }; }\n"
"\n"
"HIPRT_HOST_DEVICE HIPRT_INLINE float4 make_float4( const float c ) { return float4{ c, c, c, c }; }\n"
"\n"
"HIPRT_HOST_DEVICE HIPRT_INLINE float2 make_float2( const float3 a ) { return float2{ a.x, a.y }; }\n"
"\n"
"HIPRT_HOST_DEVICE HIPRT_INLINE float2 make_float2( const float4 a ) { return float2{ a.x, a.y }; }\n"
"\n"
"HIPRT_HOST_DEVICE HIPRT_INLINE float3 make_float3( const float4 a ) { return float3{ a.x, a.y, a.z }; }\n"
"\n"
"HIPRT_HOST_DEVICE HIPRT_INLINE float3 make_float3( const float2& a, const float c ) { return float3{ a.x, a.y, c }; }\n"
"\n"
"HIPRT_HOST_DEVICE HIPRT_INLINE float3 make_float3( const float c, const float2& a ) { return float3{ c, a.x, a.y }; }\n"
"\n"
"HIPRT_HOST_DEVICE HIPRT_INLINE float4 make_float4( const float2& a, const float c0, const float c1 )\n"
"{\n"
"	return float4{ a.x, a.y, c0, c1 };\n"
"}\n"
"\n"
"HIPRT_HOST_DEVICE HIPRT_INLINE float4 make_float4( const float c0, const float2& a, const float c1 )\n"
"{\n"
"	return float4{ c0, a.x, a.y, c1 };\n"
"}\n"
"\n"
"HIPRT_HOST_DEVICE HIPRT_INLINE float4 make_float4( const float c0, const float c1, const float2& a )\n"
"{\n"
"	return float4{ c0, c1, a.x, a.y };\n"
"}\n"
"\n"
"HIPRT_HOST_DEVICE HIPRT_INLINE float4 make_float4( const float3& a, const float c ) { return float4{ a.x, a.y, a.z, c }; }\n"
"\n"
"HIPRT_HOST_DEVICE HIPRT_INLINE float4 make_float4( const float c, const float3& a ) { return float4{ c, a.x, a.y, a.z }; }\n"
"\n"
"HIPRT_HOST_DEVICE HIPRT_INLINE float2 make_float2( const int2& a )\n"
"{\n"
"	return float2{ static_cast<float>( a.x ), static_cast<float>( a.y ) };\n"
"}\n"
"\n"
"HIPRT_HOST_DEVICE HIPRT_INLINE float3 make_float3( const int3& a )\n"
"{\n"
"	return float3{ static_cast<float>( a.x ), static_cast<float>( a.y ), static_cast<float>( a.z ) };\n"
"}\n"
"\n"
"HIPRT_HOST_DEVICE HIPRT_INLINE float4 make_float4( const int4& a )\n"
"{\n"
"	return float4{ static_cast<float>( a.x ), static_cast<float>( a.y ), static_cast<float>( a.z ), static_cast<float>( a.w ) };\n"
"}\n"
"\n"
"HIPRT_HOST_DEVICE HIPRT_INLINE float2 make_float2( const uint2& a )\n"
"{\n"
"	return float2{ static_cast<float>( a.x ), static_cast<float>( a.y ) };\n"
"}\n"
"\n"
"HIPRT_HOST_DEVICE HIPRT_INLINE float3 make_float3( const uint3& a )\n"
"{\n"
"	return float3{ static_cast<float>( a.x ), static_cast<float>( a.y ), static_cast<float>( a.z ) };\n"
"}\n"
"\n"
"HIPRT_HOST_DEVICE HIPRT_INLINE float4 make_float4( const uint4& a )\n"
"{\n"
"	return float4{ static_cast<float>( a.x ), static_cast<float>( a.y ), static_cast<float>( a.z ), static_cast<float>( a.w ) };\n"
"}\n"
"} // namespace hiprt\n"
"\n"
"#if !defined( __HIPCC__ )\n"
"HIPRT_HOST_DEVICE HIPRT_INLINE int2 operator+( const int2& a, const int2& b ) { return int2{ a.x + b.x, a.y + b.y }; }\n"
"\n"
"HIPRT_HOST_DEVICE HIPRT_INLINE int2 operator-( const int2& a, const int2& b ) { return int2{ a.x - b.x, a.y - b.y }; }\n"
"\n"
"HIPRT_HOST_DEVICE HIPRT_INLINE int2 operator*( const int2& a, const int2& b ) { return int2{ a.x * b.x, a.y * b.y }; }\n"
"\n"
"HIPRT_HOST_DEVICE HIPRT_INLINE int2 operator/( const int2& a, const int2& b ) { return int2{ a.x / b.x, a.y / b.y }; }\n"
"\n"
"HIPRT_HOST_DEVICE HIPRT_INLINE int2& operator+=( int2& a, const int2& b )\n"
"{\n"
"	a.x += b.x;\n"
"	a.y += b.y;\n"
"	return a;\n"
"}\n"
"\n"
"HIPRT_HOST_DEVICE HIPRT_INLINE int2& operator-=( int2& a, const int2& b )\n"
"{\n"
"	a.x -= b.x;\n"
"	a.y -= b.y;\n"
"	return a;\n"
"}\n"
"\n"
"HIPRT_HOST_DEVICE HIPRT_INLINE int2& operator*=( int2& a, const int2& b )\n"
"{\n"
"	a.x *= b.x;\n"
"	a.y *= b.y;\n"
"	return a;\n"
"}\n"
"\n"
"HIPRT_HOST_DEVICE HIPRT_INLINE int2& operator/=( int2& a, const int2& b )\n"
"{\n"
"	a.x /= b.x;\n"
"	a.y /= b.y;\n"
"	return a;\n"
"}\n"
"\n"
"HIPRT_HOST_DEVICE HIPRT_INLINE int2& operator+=( int2& a, const int c )\n"
"{\n"
"	a.x += c;\n"
"	a.y += c;\n"
"	return a;\n"
"}\n"
"\n"
"HIPRT_HOST_DEVICE HIPRT_INLINE int2& operator-=( int2& a, const int c )\n"
"{\n"
"	a.x -= c;\n"
"	a.y -= c;\n"
"	return a;\n"
"}\n"
"\n"
"HIPRT_HOST_DEVICE HIPRT_INLINE int2& operator*=( int2& a, const int c )\n"
"{\n"
"	a.x *= c;\n"
"	a.y *= c;\n"
"	return a;\n"
"}\n"
"\n"
"HIPRT_HOST_DEVICE HIPRT_INLINE int2& operator/=( int2& a, const int c )\n"
"{\n"
"	a.x /= c;\n"
"	a.y /= c;\n"
"	return a;\n"
"}\n"
"\n"
"HIPRT_HOST_DEVICE HIPRT_INLINE int2 operator-( const int2& a ) { return int2{ -a.x, -a.y }; }\n"
"\n"
"HIPRT_HOST_DEVICE HIPRT_INLINE int2 operator+( const int2& a, const int c ) { return int2{ a.x + c, a.y + c }; }\n"
"\n"
"HIPRT_HOST_DEVICE HIPRT_INLINE int2 operator+( const int c, const int2& a ) { return int2{ c + a.x, c + a.y }; }\n"
"\n"
"HIPRT_HOST_DEVICE HIPRT_INLINE int2 operator-( const int2& a, const int c ) { return int2{ a.x - c, a.y - c }; }\n"
"\n"
"HIPRT_HOST_DEVICE HIPRT_INLINE int2 operator-( const int c, const int2& a ) { return int2{ c - a.x, c - a.y }; }\n"
"\n"
"HIPRT_HOST_DEVICE HIPRT_INLINE int2 operator*( const int2& a, const int c ) { return int2{ c * a.x, c * a.y }; }\n"
"\n"
"HIPRT_HOST_DEVICE HIPRT_INLINE int2 operator*( const int c, const int2& a ) { return int2{ c * a.x, c * a.y }; }\n"
"\n"
"HIPRT_HOST_DEVICE HIPRT_INLINE int2 operator/( const int2& a, const int c ) { return int2{ a.x / c, a.y / c }; }\n"
"\n"
"HIPRT_HOST_DEVICE HIPRT_INLINE int2 operator/( const int c, const int2& a ) { return int2{ c / a.x, c / a.y }; }\n"
"\n"
"HIPRT_HOST_DEVICE HIPRT_INLINE int3 operator+( const int3& a, const int3& b )\n"
"{\n"
"	return int3{ a.x + b.x, a.y + b.y, a.z + b.z };\n"
"}\n"
"\n"
"HIPRT_HOST_DEVICE HIPRT_INLINE int3 operator-( const int3& a, const int3& b )\n"
"{\n"
"	return int3{ a.x - b.x, a.y - b.y, a.z - b.z };\n"
"}\n"
"\n"
"HIPRT_HOST_DEVICE HIPRT_INLINE int3 operator*( const int3& a, const int3& b )\n"
"{\n"
"	return int3{ a.x * b.x, a.y * b.y, a.z * b.z };\n"
"}\n"
"\n"
"HIPRT_HOST_DEVICE HIPRT_INLINE int3 operator/( const int3& a, const int3& b )\n"
"{\n"
"	return int3{ a.x / b.x, a.y / b.y, a.z / b.z };\n"
"}\n"
"\n"
"HIPRT_HOST_DEVICE HIPRT_INLINE int3& operator+=( int3& a, const int3& b )\n"
"{\n"
"	a.x += b.x;\n"
"	a.y += b.y;\n"
"	a.z += b.z;\n"
"	return a;\n"
"}\n"
"\n"
"HIPRT_HOST_DEVICE HIPRT_INLINE int3& operator-=( int3& a, const int3& b )\n"
"{\n"
"	a.x -= b.x;\n"
"	a.y -= b.y;\n"
"	a.z -= b.z;\n"
"	return a;\n"
"}\n"
"\n"
"HIPRT_HOST_DEVICE HIPRT_INLINE int3& operator*=( int3& a, const int3& b )\n"
"{\n"
"	a.x *= b.x;\n"
"	a.y *= b.y;\n"
"	a.z *= b.z;\n"
"	return a;\n"
"}\n"
"\n"
"HIPRT_HOST_DEVICE HIPRT_INLINE int3& operator/=( int3& a, const int3& b )\n"
"{\n"
"	a.x /= b.x;\n"
"	a.y /= b.y;\n"
"	a.z /= b.z;\n"
"	return a;\n"
"}\n"
"\n"
"HIPRT_HOST_DEVICE HIPRT_INLINE int3& operator+=( int3& a, const int c )\n"
"{\n"
"	a.x += c;\n"
"	a.y += c;\n"
"	a.z += c;\n"
"	return a;\n"
"}\n"
"\n"
"HIPRT_HOST_DEVICE HIPRT_INLINE int3& operator-=( int3& a, const int c )\n"
"{\n"
"	a.x -= c;\n"
"	a.y -= c;\n"
"	a.z -= c;\n"
"	return a;\n"
"}\n"
"\n"
"HIPRT_HOST_DEVICE HIPRT_INLINE int3& operator*=( int3& a, const int c )\n"
"{\n"
"	a.x *= c;\n"
"	a.y *= c;\n"
"	a.z *= c;\n"
"	return a;\n"
"}\n"
"\n"
"HIPRT_HOST_DEVICE HIPRT_INLINE int3& operator/=( int3& a, const int c )\n"
"{\n"
"	a.x /= c;\n"
"	a.y /= c;\n"
"	a.z /= c;\n"
"	return a;\n"
"}\n"
"\n"
"HIPRT_HOST_DEVICE HIPRT_INLINE int3 operator-( const int3& a ) { return int3{ -a.x, -a.y, -a.z }; }\n"
"\n"
"HIPRT_HOST_DEVICE HIPRT_INLINE int3 operator+( const int3& a, const int c ) { return int3{ c + a.x, c + a.y, c + a.z }; }\n"
"\n"
"HIPRT_HOST_DEVICE HIPRT_INLINE int3 operator+( const int c, const int3& a ) { return int3{ c + a.x, c + a.y, c + a.z }; }\n"
"\n"
"HIPRT_HOST_DEVICE HIPRT_INLINE int3 operator-( const int3& a, const int c ) { return int3{ a.x - c, a.y - c, a.z - c }; }\n"
"\n"
"HIPRT_HOST_DEVICE HIPRT_INLINE int3 operator-( const int c, const int3& a ) { return int3{ c - a.x, c - a.y, c - a.z }; }\n"
"\n"
"HIPRT_HOST_DEVICE HIPRT_INLINE int3 operator*( const int3& a, const int c ) { return int3{ c * a.x, c * a.y, c * a.z }; }\n"
"\n"
"HIPRT_HOST_DEVICE HIPRT_INLINE int3 operator*( const int c, const int3& a ) { return int3{ c * a.x, c * a.y, c * a.z }; }\n"
"\n"
"HIPRT_HOST_DEVICE HIPRT_INLINE int3 operator/( const int3& a, const int c ) { return int3{ a.x / c, a.y / c, a.z / c }; }\n"
"\n"
"HIPRT_HOST_DEVICE HIPRT_INLINE int3 operator/( const int c, const int3& a ) { return int3{ c / a.x, c / a.y, c / a.z }; }\n"
"\n"
"HIPRT_HOST_DEVICE HIPRT_INLINE int4 operator+( const int4& a, const int4& b )\n"
"{\n"
"	return int4{ a.x + b.x, a.y + b.y, a.z + b.z, a.w + b.w };\n"
"}\n"
"\n"
"HIPRT_HOST_DEVICE HIPRT_INLINE int4 operator-( const int4& a, const int4& b )\n"
"{\n"
"	return int4{ a.x - b.x, a.y - b.y, a.z - b.z, a.w - b.w };\n"
"}\n"
"\n"
"HIPRT_HOST_DEVICE HIPRT_INLINE int4 operator*( const int4& a, const int4& b )\n"
"{\n"
"	return int4{ a.x * b.x, a.y * b.y, a.z * b.z, a.w * b.w };\n"
"}\n"
"\n"
"HIPRT_HOST_DEVICE HIPRT_INLINE int4 operator/( const int4& a, const int4& b )\n"
"{\n"
"	return int4{ a.x / b.x, a.y / b.y, a.z / b.z, a.w / b.w };\n"
"}\n"
"\n"
"HIPRT_HOST_DEVICE HIPRT_INLINE int4& operator+=( int4& a, const int4& b )\n"
"{\n"
"	a.x += b.x;\n"
"	a.y += b.y;\n"
"	a.z += b.z;\n"
"	a.w += b.w;\n"
"	return a;\n"
"}\n"
"\n"
"HIPRT_HOST_DEVICE HIPRT_INLINE int4& operator-=( int4& a, const int4& b )\n"
"{\n"
"	a.x -= b.x;\n"
"	a.y -= b.y;\n"
"	a.z -= b.z;\n"
"	a.w -= b.w;\n"
"	return a;\n"
"}\n"
"\n"
"HIPRT_HOST_DEVICE HIPRT_INLINE int4& operator*=( int4& a, const int4& b )\n"
"{\n"
"	a.x *= b.x;\n"
"	a.y *= b.y;\n"
"	a.z *= b.z;\n"
"	a.w *= b.w;\n"
"	return a;\n"
"}\n"
"\n"
"HIPRT_HOST_DEVICE HIPRT_INLINE int4& operator/=( int4& a, const int4& b )\n"
"{\n"
"	a.x /= b.x;\n"
"	a.y /= b.y;\n"
"	a.z /= b.z;\n"
"	a.w /= b.w;\n"
"	return a;\n"
"}\n"
"\n"
"HIPRT_HOST_DEVICE HIPRT_INLINE int4& operator+=( int4& a, const int c )\n"
"{\n"
"	a.x += c;\n"
"	a.y += c;\n"
"	a.z += c;\n"
"	a.w += c;\n"
"	return a;\n"
"}\n"
"\n"
"HIPRT_HOST_DEVICE HIPRT_INLINE int4& operator-=( int4& a, const int c )\n"
"{\n"
"	a.x -= c;\n"
"	a.y -= c;\n"
"	a.z -= c;\n"
"	a.w -= c;\n"
"	return a;\n"
"}\n"
"\n"
"HIPRT_HOST_DEVICE HIPRT_INLINE int4& operator*=( int4& a, const int c )\n"
"{\n"
"	a.x *= c;\n"
"	a.y *= c;\n"
"	a.z *= c;\n"
"	a.w *= c;\n"
"	return a;\n"
"}\n"
"\n"
"HIPRT_HOST_DEVICE HIPRT_INLINE int4& operator/=( int4& a, const int c )\n"
"{\n"
"	a.x /= c;\n"
"	a.y /= c;\n"
"	a.z /= c;\n"
"	a.w /= c;\n"
"	return a;\n"
"}\n"
"\n"
"HIPRT_HOST_DEVICE HIPRT_INLINE int4 operator-( const int4& a ) { return int4{ -a.x, -a.y, -a.z, -a.w }; }\n"
"\n"
"HIPRT_HOST_DEVICE HIPRT_INLINE int4 operator+( const int4& a, const int c )\n"
"{\n"
"	return int4{ c + a.x, c + a.y, c + a.z, c + a.w };\n"
"}\n"
"\n"
"HIPRT_HOST_DEVICE HIPRT_INLINE int4 operator+( const int c, const int4& a )\n"
"{\n"
"	return int4{ c + a.x, c + a.y, c + a.z, c + a.w };\n"
"}\n"
"\n"
"HIPRT_HOST_DEVICE HIPRT_INLINE int4 operator-( const int4& a, const int c )\n"
"{\n"
"	return int4{ a.x - c, a.y - c, a.z - c, a.w - c };\n"
"}\n"
"\n"
"HIPRT_HOST_DEVICE HIPRT_INLINE int4 operator-( const int c, const int4& a )\n"
"{\n"
"	return int4{ c - a.x, c - a.y, c - a.z, c - a.w };\n"
"}\n"
"\n"
"HIPRT_HOST_DEVICE HIPRT_INLINE int4 operator*( const int4& a, const int c )\n"
"{\n"
"	return int4{ c * a.x, c * a.y, c * a.z, c * a.w };\n"
"}\n"
"\n"
"HIPRT_HOST_DEVICE HIPRT_INLINE int4 operator*( const int c, const int4& a )\n"
"{\n"
"	return int4{ c * a.x, c * a.y, c * a.z, c * a.w };\n"
"}\n"
"\n"
"HIPRT_HOST_DEVICE HIPRT_INLINE int4 operator/( const int4& a, const int c )\n"
"{\n"
"	return int4{ a.x / c, a.y / c, a.z / c, a.w / c };\n"
"}\n"
"\n"
"HIPRT_HOST_DEVICE HIPRT_INLINE int4 operator/( const int c, const int4& a )\n"
"{\n"
"	return int4{ c / a.x, c / a.y, c / a.z, c / a.w };\n"
"}\n"
"\n"
"HIPRT_HOST_DEVICE HIPRT_INLINE uint2 operator+( const uint2& a, const uint2& b ) { return uint2{ a.x + b.x, a.y + b.y }; }\n"
"\n"
"HIPRT_HOST_DEVICE HIPRT_INLINE uint2 operator-( const uint2& a, const uint2& b ) { return uint2{ a.x - b.x, a.y - b.y }; }\n"
"\n"
"HIPRT_HOST_DEVICE HIPRT_INLINE uint2 operator*( const uint2& a, const uint2& b ) { return uint2{ a.x * b.x, a.y * b.y }; }\n"
"\n"
"HIPRT_HOST_DEVICE HIPRT_INLINE uint2 operator/( const uint2& a, const uint2& b ) { return uint2{ a.x / b.x, a.y / b.y }; }\n"
"\n"
"HIPRT_HOST_DEVICE HIPRT_INLINE uint2& operator+=( uint2& a, const uint2& b )\n"
"{\n"
"	a.x += b.x;\n"
"	a.y += b.y;\n"
"	return a;\n"
"}\n"
"\n"
"HIPRT_HOST_DEVICE HIPRT_INLINE uint2& operator-=( uint2& a, const uint2& b )\n"
"{\n"
"	a.x -= b.x;\n"
"	a.y -= b.y;\n"
"	return a;\n"
"}\n"
"\n"
"HIPRT_HOST_DEVICE HIPRT_INLINE uint2& operator*=( uint2& a, const uint2& b )\n"
"{\n"
"	a.x *= b.x;\n"
"	a.y *= b.y;\n"
"	return a;\n"
"}\n"
"\n"
"HIPRT_HOST_DEVICE HIPRT_INLINE uint2& operator/=( uint2& a, const uint2& b )\n"
"{\n"
"	a.x /= b.x;\n"
"	a.y /= b.y;\n"
"	return a;\n"
"}\n"
"\n"
"HIPRT_HOST_DEVICE HIPRT_INLINE uint2& operator+=( uint2& a, const unsigned int c )\n"
"{\n"
"	a.x += c;\n"
"	a.y += c;\n"
"	return a;\n"
"}\n"
"\n"
"HIPRT_HOST_DEVICE HIPRT_INLINE uint2& operator-=( uint2& a, const unsigned int c )\n"
"{\n"
"	a.x -= c;\n"
"	a.y -= c;\n"
"	return a;\n"
"}\n"
"\n"
"HIPRT_HOST_DEVICE HIPRT_INLINE uint2& operator*=( uint2& a, const unsigned int c )\n"
"{\n"
"	a.x *= c;\n"
"	a.y *= c;\n"
"	return a;\n"
"}\n"
"\n"
"HIPRT_HOST_DEVICE HIPRT_INLINE uint2& operator/=( uint2& a, const unsigned int c )\n"
"{\n"
"	a.x /= c;\n"
"	a.y /= c;\n"
"	return a;\n"
"}\n"
"\n"
"HIPRT_HOST_DEVICE HIPRT_INLINE uint2 operator+( const uint2& a, const unsigned int c ) { return uint2{ a.x + c, a.y + c }; }\n"
"\n"
"HIPRT_HOST_DEVICE HIPRT_INLINE uint2 operator+( const unsigned int c, const uint2& a ) { return uint2{ c + a.x, c + a.y }; }\n"
"\n"
"HIPRT_HOST_DEVICE HIPRT_INLINE uint2 operator-( const uint2& a, const unsigned int c ) { return uint2{ a.x - c, a.y - c }; }\n"
"\n"
"HIPRT_HOST_DEVICE HIPRT_INLINE uint2 operator-( const unsigned int c, const uint2& a ) { return uint2{ c - a.x, c - a.y }; }\n"
"\n"
"HIPRT_HOST_DEVICE HIPRT_INLINE uint2 operator*( const uint2& a, const unsigned int c ) { return uint2{ c * a.x, c * a.y }; }\n"
"\n"
"HIPRT_HOST_DEVICE HIPRT_INLINE uint2 operator*( const unsigned int c, const uint2& a ) { return uint2{ c * a.x, c * a.y }; }\n"
"\n"
"HIPRT_HOST_DEVICE HIPRT_INLINE uint2 operator/( const uint2& a, const unsigned int c ) { return uint2{ a.x / c, a.y / c }; }\n"
"\n"
"HIPRT_HOST_DEVICE HIPRT_INLINE uint2 operator/( const unsigned int c, const uint2& a ) { return uint2{ c / a.x, c / a.y }; }\n"
"\n"
"HIPRT_HOST_DEVICE HIPRT_INLINE uint3 operator+( const uint3& a, const uint3& b )\n"
"{\n"
"	return uint3{ a.x + b.x, a.y + b.y, a.z + b.z };\n"
"}\n"
"\n"
"HIPRT_HOST_DEVICE HIPRT_INLINE uint3 operator-( const uint3& a, const uint3& b )\n"
"{\n"
"	return uint3{ a.x - b.x, a.y - b.y, a.z - b.z };\n"
"}\n"
"\n"
"HIPRT_HOST_DEVICE HIPRT_INLINE uint3 operator*( const uint3& a, const uint3& b )\n"
"{\n"
"	return uint3{ a.x * b.x, a.y * b.y, a.z * b.z };\n"
"}\n"
"\n"
"HIPRT_HOST_DEVICE HIPRT_INLINE uint3 operator/( const uint3& a, const uint3& b )\n"
"{\n"
"	return uint3{ a.x / b.x, a.y / b.y, a.z / b.z };\n"
"}\n"
"\n"
"HIPRT_HOST_DEVICE HIPRT_INLINE uint3& operator+=( uint3& a, const uint3& b )\n"
"{\n"
"	a.x += b.x;\n"
"	a.y += b.y;\n"
"	a.z += b.z;\n"
"	return a;\n"
"}\n"
"\n"
"HIPRT_HOST_DEVICE HIPRT_INLINE uint3& operator-=( uint3& a, const uint3& b )\n"
"{\n"
"	a.x -= b.x;\n"
"	a.y -= b.y;\n"
"	a.z -= b.z;\n"
"	return a;\n"
"}\n"
"\n"
"HIPRT_HOST_DEVICE HIPRT_INLINE uint3& operator*=( uint3& a, const uint3& b )\n"
"{\n"
"	a.x *= b.x;\n"
"	a.y *= b.y;\n"
"	a.z *= b.z;\n"
"	return a;\n"
"}\n"
"\n"
"HIPRT_HOST_DEVICE HIPRT_INLINE uint3& operator/=( uint3& a, const uint3& b )\n"
"{\n"
"	a.x /= b.x;\n"
"	a.y /= b.y;\n"
"	a.z /= b.z;\n"
"	return a;\n"
"}\n"
"\n"
"HIPRT_HOST_DEVICE HIPRT_INLINE uint3& operator+=( uint3& a, const unsigned int c )\n"
"{\n"
"	a.x += c;\n"
"	a.y += c;\n"
"	a.z += c;\n"
"	return a;\n"
"}\n"
"\n"
"HIPRT_HOST_DEVICE HIPRT_INLINE uint3& operator-=( uint3& a, const unsigned int c )\n"
"{\n"
"	a.x -= c;\n"
"	a.y -= c;\n"
"	a.z -= c;\n"
"	return a;\n"
"}\n"
"\n"
"HIPRT_HOST_DEVICE HIPRT_INLINE uint3& operator*=( uint3& a, const unsigned int c )\n"
"{\n"
"	a.x *= c;\n"
"	a.y *= c;\n"
"	a.z *= c;\n"
"	return a;\n"
"}\n"
"\n"
"HIPRT_HOST_DEVICE HIPRT_INLINE uint3& operator/=( uint3& a, const unsigned int c )\n"
"{\n"
"	a.x /= c;\n"
"	a.y /= c;\n"
"	a.z /= c;\n"
"	return a;\n"
"}\n"
"\n"
"HIPRT_HOST_DEVICE HIPRT_INLINE uint3 operator+( const uint3& a, const unsigned int c )\n"
"{\n"
"	return uint3{ c + a.x, c + a.y, c + a.z };\n"
"}\n"
"\n"
"HIPRT_HOST_DEVICE HIPRT_INLINE uint3 operator+( const unsigned int c, const uint3& a )\n"
"{\n"
"	return uint3{ c + a.x, c + a.y, c + a.z };\n"
"}\n"
"\n"
"HIPRT_HOST_DEVICE HIPRT_INLINE uint3 operator-( const uint3& a, const unsigned int c )\n"
"{\n"
"	return uint3{ a.x - c, a.y - c, a.z - c };\n"
"}\n"
"\n"
"HIPRT_HOST_DEVICE HIPRT_INLINE uint3 operator-( const unsigned int c, const uint3& a )\n"
"{\n"
"	return uint3{ c - a.x, c - a.y, c - a.z };\n"
"}\n"
"\n"
"HIPRT_HOST_DEVICE HIPRT_INLINE uint3 operator*( const uint3& a, const unsigned int c )\n"
"{\n"
"	return uint3{ c * a.x, c * a.y, c * a.z };\n"
"}\n"
"\n"
"HIPRT_HOST_DEVICE HIPRT_INLINE uint3 operator*( const unsigned int c, const uint3& a )\n"
"{\n"
"	return uint3{ c * a.x, c * a.y, c * a.z };\n"
"}\n"
"\n"
"HIPRT_HOST_DEVICE HIPRT_INLINE uint3 operator/( const uint3& a, const unsigned int c )\n"
"{\n"
"	return uint3{ a.x / c, a.y / c, a.z / c };\n"
"}\n"
"\n"
"HIPRT_HOST_DEVICE HIPRT_INLINE uint3 operator/( const unsigned int c, const uint3& a )\n"
"{\n"
"	return uint3{ c / a.x, c / a.y, c / a.z };\n"
"}\n"
"\n"
"HIPRT_HOST_DEVICE HIPRT_INLINE uint4 operator+( const uint4& a, const uint4& b )\n"
"{\n"
"	return uint4{ a.x + b.x, a.y + b.y, a.z + b.z, a.w + b.w };\n"
"}\n"
"\n"
"HIPRT_HOST_DEVICE HIPRT_INLINE uint4 operator-( const uint4& a, const uint4& b )\n"
"{\n"
"	return uint4{ a.x - b.x, a.y - b.y, a.z - b.z, a.w - b.w };\n"
"}\n"
"\n"
"HIPRT_HOST_DEVICE HIPRT_INLINE uint4 operator*( const uint4& a, const uint4& b )\n"
"{\n"
"	return uint4{ a.x * b.x, a.y * b.y, a.z * b.z, a.w * b.w };\n"
"}\n"
"\n"
"HIPRT_HOST_DEVICE HIPRT_INLINE uint4 operator/( const uint4& a, const uint4& b )\n"
"{\n"
"	return uint4{ a.x / b.x, a.y / b.y, a.z / b.z, a.w / b.w };\n"
"}\n"
"\n"
"HIPRT_HOST_DEVICE HIPRT_INLINE uint4& operator+=( uint4& a, const uint4& b )\n"
"{\n"
"	a.x += b.x;\n"
"	a.y += b.y;\n"
"	a.z += b.z;\n"
"	a.w += b.w;\n"
"	return a;\n"
"}\n"
"\n"
"HIPRT_HOST_DEVICE HIPRT_INLINE uint4& operator-=( uint4& a, const uint4& b )\n"
"{\n"
"	a.x -= b.x;\n"
"	a.y -= b.y;\n"
"	a.z -= b.z;\n"
"	a.w -= b.w;\n"
"	return a;\n"
"}\n"
"\n"
"HIPRT_HOST_DEVICE HIPRT_INLINE uint4& operator*=( uint4& a, const uint4& b )\n"
"{\n"
"	a.x *= b.x;\n"
"	a.y *= b.y;\n"
"	a.z *= b.z;\n"
"	a.w *= b.w;\n"
"	return a;\n"
"}\n"
"\n"
"HIPRT_HOST_DEVICE HIPRT_INLINE uint4& operator/=( uint4& a, const uint4& b )\n"
"{\n"
"	a.x /= b.x;\n"
"	a.y /= b.y;\n"
"	a.z /= b.z;\n"
"	a.w /= b.w;\n"
"	return a;\n"
"}\n"
"\n"
"HIPRT_HOST_DEVICE HIPRT_INLINE uint4& operator+=( uint4& a, const unsigned int c )\n"
"{\n"
"	a.x += c;\n"
"	a.y += c;\n"
"	a.z += c;\n"
"	a.w += c;\n"
"	return a;\n"
"}\n"
"\n"
"HIPRT_HOST_DEVICE HIPRT_INLINE uint4& operator-=( uint4& a, const unsigned int c )\n"
"{\n"
"	a.x -= c;\n"
"	a.y -= c;\n"
"	a.z -= c;\n"
"	a.w -= c;\n"
"	return a;\n"
"}\n"
"\n"
"HIPRT_HOST_DEVICE HIPRT_INLINE uint4& operator*=( uint4& a, const unsigned int c )\n"
"{\n"
"	a.x *= c;\n"
"	a.y *= c;\n"
"	a.z *= c;\n"
"	a.w *= c;\n"
"	return a;\n"
"}\n"
"\n"
"HIPRT_HOST_DEVICE HIPRT_INLINE uint4& operator/=( uint4& a, const unsigned int c )\n"
"{\n"
"	a.x /= c;\n"
"	a.y /= c;\n"
"	a.z /= c;\n"
"	a.w /= c;\n"
"	return a;\n"
"}\n"
"\n"
"HIPRT_HOST_DEVICE HIPRT_INLINE uint4 operator+( const uint4& a, const unsigned int c )\n"
"{\n"
"	return uint4{ c + a.x, c + a.y, c + a.z, c + a.w };\n"
"}\n"
"\n"
"HIPRT_HOST_DEVICE HIPRT_INLINE uint4 operator+( const unsigned int c, const uint4& a )\n"
"{\n"
"	return uint4{ c + a.x, c + a.y, c + a.z, c + a.w };\n"
"}\n"
"\n"
"HIPRT_HOST_DEVICE HIPRT_INLINE uint4 operator-( const uint4& a, const unsigned int c )\n"
"{\n"
"	return uint4{ a.x - c, a.y - c, a.z - c, a.w - c };\n"
"}\n"
"\n"
"HIPRT_HOST_DEVICE HIPRT_INLINE uint4 operator-( const unsigned int c, const uint4& a )\n"
"{\n"
"	return uint4{ c - a.x, c - a.y, c - a.z, c - a.w };\n"
"}\n"
"\n"
"HIPRT_HOST_DEVICE HIPRT_INLINE uint4 operator*( const uint4& a, const unsigned int c )\n"
"{\n"
"	return uint4{ c * a.x, c * a.y, c * a.z, c * a.w };\n"
"}\n"
"\n"
"HIPRT_HOST_DEVICE HIPRT_INLINE uint4 operator*( const unsigned int c, const uint4& a )\n"
"{\n"
"	return uint4{ c * a.x, c * a.y, c * a.z, c * a.w };\n"
"}\n"
"\n"
"HIPRT_HOST_DEVICE HIPRT_INLINE uint4 operator/( const uint4& a, const unsigned int c )\n"
"{\n"
"	return uint4{ a.x / c, a.y / c, a.z / c, a.w / c };\n"
"}\n"
"\n"
"HIPRT_HOST_DEVICE HIPRT_INLINE uint4 operator/( const unsigned int c, const uint4& a )\n"
"{\n"
"	return uint4{ c / a.x, c / a.y, c / a.z, c / a.w };\n"
"}\n"
"\n"
"HIPRT_HOST_DEVICE HIPRT_INLINE float2 operator+( const float2& a, const float2& b ) { return float2{ a.x + b.x, a.y + b.y }; }\n"
"\n"
"HIPRT_HOST_DEVICE HIPRT_INLINE float2 operator-( const float2& a, const float2& b ) { return float2{ a.x - b.x, a.y - b.y }; }\n"
"\n"
"HIPRT_HOST_DEVICE HIPRT_INLINE float2 operator*( const float2& a, const float2& b ) { return float2{ a.x * b.x, a.y * b.y }; }\n"
"\n"
"HIPRT_HOST_DEVICE HIPRT_INLINE float2 operator/( const float2& a, const float2& b ) { return float2{ a.x / b.x, a.y / b.y }; }\n"
"\n"
"HIPRT_HOST_DEVICE HIPRT_INLINE float2& operator+=( float2& a, const float2& b )\n"
"{\n"
"	a.x += b.x;\n"
"	a.y += b.y;\n"
"	return a;\n"
"}\n"
"\n"
"HIPRT_HOST_DEVICE HIPRT_INLINE float2& operator-=( float2& a, const float2& b )\n"
"{\n"
"	a.x -= b.x;\n"
"	a.y -= b.y;\n"
"	return a;\n"
"}\n"
"\n"
"HIPRT_HOST_DEVICE HIPRT_INLINE float2& operator*=( float2& a, const float2& b )\n"
"{\n"
"	a.x *= b.x;\n"
"	a.y *= b.y;\n"
"	return a;\n"
"}\n"
"\n"
"HIPRT_HOST_DEVICE HIPRT_INLINE float2& operator/=( float2& a, const float2& b )\n"
"{\n"
"	a.x /= b.x;\n"
"	a.y /= b.y;\n"
"	return a;\n"
"}\n"
"\n"
"HIPRT_HOST_DEVICE HIPRT_INLINE float2& operator+=( float2& a, const float c )\n"
"{\n"
"	a.x += c;\n"
"	a.y += c;\n"
"	return a;\n"
"}\n"
"\n"
"HIPRT_HOST_DEVICE HIPRT_INLINE float2& operator-=( float2& a, const float c )\n"
"{\n"
"	a.x -= c;\n"
"	a.y -= c;\n"
"	return a;\n"
"}\n"
"\n"
"HIPRT_HOST_DEVICE HIPRT_INLINE float2& operator*=( float2& a, const float c )\n"
"{\n"
"	a.x *= c;\n"
"	a.y *= c;\n"
"	return a;\n"
"}\n"
"\n"
"HIPRT_HOST_DEVICE HIPRT_INLINE float2& operator/=( float2& a, const float c )\n"
"{\n"
"	a.x /= c;\n"
"	a.y /= c;\n"
"	return a;\n"
"}\n"
"\n"
"HIPRT_HOST_DEVICE HIPRT_INLINE float2 operator-( const float2& a ) { return float2{ -a.x, -a.y }; }\n"
"\n"
"HIPRT_HOST_DEVICE HIPRT_INLINE float2 operator+( const float2& a, const float c ) { return float2{ a.x + c, a.y + c }; }\n"
"\n"
"HIPRT_HOST_DEVICE HIPRT_INLINE float2 operator+( const float c, const float2& a ) { return float2{ c + a.x, c + a.y }; }\n"
"\n"
"HIPRT_HOST_DEVICE HIPRT_INLINE float2 operator-( const float2& a, const float c ) { return float2{ a.x - c, a.y - c }; }\n"
"\n"
"HIPRT_HOST_DEVICE HIPRT_INLINE float2 operator-( const float c, const float2& a ) { return float2{ c - a.x, c - a.y }; }\n"
"\n"
"HIPRT_HOST_DEVICE HIPRT_INLINE float2 operator*( const float2& a, const float c ) { return float2{ c * a.x, c * a.y }; }\n"
"\n"
"HIPRT_HOST_DEVICE HIPRT_INLINE float2 operator*( const float c, const float2& a ) { return float2{ c * a.x, c * a.y }; }\n"
"\n"
"HIPRT_HOST_DEVICE HIPRT_INLINE float2 operator/( const float2& a, const float c ) { return float2{ a.x / c, a.y / c }; }\n"
"\n"
"HIPRT_HOST_DEVICE HIPRT_INLINE float2 operator/( const float c, const float2& a ) { return float2{ c / a.x, c / a.y }; }\n"
"\n"
"HIPRT_HOST_DEVICE HIPRT_INLINE float3 operator+( const float3& a, const float3& b )\n"
"{\n"
"	return float3{ a.x + b.x, a.y + b.y, a.z + b.z };\n"
"}\n"
"\n"
"HIPRT_HOST_DEVICE HIPRT_INLINE float3 operator-( const float3& a, const float3& b )\n"
"{\n"
"	return float3{ a.x - b.x, a.y - b.y, a.z - b.z };\n"
"}\n"
"\n"
"HIPRT_HOST_DEVICE HIPRT_INLINE float3 operator*( const float3& a, const float3& b )\n"
"{\n"
"	return float3{ a.x * b.x, a.y * b.y, a.z * b.z };\n"
"}\n"
"\n"
"HIPRT_HOST_DEVICE HIPRT_INLINE float3 operator/( const float3& a, const float3& b )\n"
"{\n"
"	return float3{ a.x / b.x, a.y / b.y, a.z / b.z };\n"
"}\n"
"\n"
"HIPRT_HOST_DEVICE HIPRT_INLINE float3& operator+=( float3& a, const float3& b )\n"
"{\n"
"	a.x += b.x;\n"
"	a.y += b.y;\n"
"	a.z += b.z;\n"
"	return a;\n"
"}\n"
"\n"
"HIPRT_HOST_DEVICE HIPRT_INLINE float3& operator-=( float3& a, const float3& b )\n"
"{\n"
"	a.x -= b.x;\n"
"	a.y -= b.y;\n"
"	a.z -= b.z;\n"
"	return a;\n"
"}\n"
"\n"
"HIPRT_HOST_DEVICE HIPRT_INLINE float3& operator*=( float3& a, const float3& b )\n"
"{\n"
"	a.x *= b.x;\n"
"	a.y *= b.y;\n"
"	a.z *= b.z;\n"
"	return a;\n"
"}\n"
"\n"
"HIPRT_HOST_DEVICE HIPRT_INLINE float3& operator/=( float3& a, const float3& b )\n"
"{\n"
"	a.x /= b.x;\n"
"	a.y /= b.y;\n"
"	a.z /= b.z;\n"
"	return a;\n"
"}\n"
"\n"
"HIPRT_HOST_DEVICE HIPRT_INLINE float3& operator+=( float3& a, const float c )\n"
"{\n"
"	a.x += c;\n"
"	a.y += c;\n"
"	a.z += c;\n"
"	return a;\n"
"}\n"
"\n"
"HIPRT_HOST_DEVICE HIPRT_INLINE float3& operator-=( float3& a, const float c )\n"
"{\n"
"	a.x -= c;\n"
"	a.y -= c;\n"
"	a.z -= c;\n"
"	return a;\n"
"}\n"
"\n"
"HIPRT_HOST_DEVICE HIPRT_INLINE float3& operator*=( float3& a, const float c )\n"
"{\n"
"	a.x *= c;\n"
"	a.y *= c;\n"
"	a.z *= c;\n"
"	return a;\n"
"}\n"
"\n"
"HIPRT_HOST_DEVICE HIPRT_INLINE float3& operator/=( float3& a, const float c )\n"
"{\n"
"	a.x /= c;\n"
"	a.y /= c;\n"
"	a.z /= c;\n"
"	return a;\n"
"}\n"
"\n"
"HIPRT_HOST_DEVICE HIPRT_INLINE float3 operator-( const float3& a ) { return float3{ -a.x, -a.y, -a.z }; }\n"
"\n"
"HIPRT_HOST_DEVICE HIPRT_INLINE float3 operator+( const float3& a, const float c )\n"
"{\n"
"	return float3{ c + a.x, c + a.y, c + a.z };\n"
"}\n"
"\n"
"HIPRT_HOST_DEVICE HIPRT_INLINE float3 operator+( const float c, const float3& a )\n"
"{\n"
"	return float3{ c + a.x, c + a.y, c + a.z };\n"
"}\n"
"\n"
"HIPRT_HOST_DEVICE HIPRT_INLINE float3 operator-( const float3& a, const float c )\n"
"{\n"
"	return float3{ a.x - c, a.y - c, a.z - c };\n"
"}\n"
"\n"
"HIPRT_HOST_DEVICE HIPRT_INLINE float3 operator-( const float c, const float3& a )\n"
"{\n"
"	return float3{ c - a.x, c - a.y, c - a.z };\n"
"}\n"
"\n"
"HIPRT_HOST_DEVICE HIPRT_INLINE float3 operator*( const float3& a, const float c )\n"
"{\n"
"	return float3{ c * a.x, c * a.y, c * a.z };\n"
"}\n"
"\n"
"HIPRT_HOST_DEVICE HIPRT_INLINE float3 operator*( const float c, const float3& a )\n"
"{\n"
"	return float3{ c * a.x, c * a.y, c * a.z };\n"
"}\n"
"\n"
"HIPRT_HOST_DEVICE HIPRT_INLINE float3 operator/( const float3& a, const float c )\n"
"{\n"
"	return float3{ a.x / c, a.y / c, a.z / c };\n"
"}\n"
"\n"
"HIPRT_HOST_DEVICE HIPRT_INLINE float3 operator/( const float c, const float3& a )\n"
"{\n"
"	return float3{ c / a.x, c / a.y, c / a.z };\n"
"}\n"
"\n"
"HIPRT_HOST_DEVICE HIPRT_INLINE float4 operator+( const float4& a, const float4& b )\n"
"{\n"
"	return float4{ a.x + b.x, a.y + b.y, a.z + b.z, a.w + b.w };\n"
"}\n"
"\n"
"HIPRT_HOST_DEVICE HIPRT_INLINE float4 operator-( const float4& a, const float4& b )\n"
"{\n"
"	return float4{ a.x - b.x, a.y - b.y, a.z - b.z, a.w - b.w };\n"
"}\n"
"\n"
"HIPRT_HOST_DEVICE HIPRT_INLINE float4 operator*( const float4& a, const float4& b )\n"
"{\n"
"	return float4{ a.x * b.x, a.y * b.y, a.z * b.z, a.w * b.w };\n"
"}\n"
"\n"
"HIPRT_HOST_DEVICE HIPRT_INLINE float4 operator/( const float4& a, const float4& b )\n"
"{\n"
"	return float4{ a.x / b.x, a.y / b.y, a.z / b.z, a.w / b.w };\n"
"}\n"
"\n"
"HIPRT_HOST_DEVICE HIPRT_INLINE float4& operator+=( float4& a, const float4& b )\n"
"{\n"
"	a.x += b.x;\n"
"	a.y += b.y;\n"
"	a.z += b.z;\n"
"	a.w += b.w;\n"
"	return a;\n"
"}\n"
"\n"
"HIPRT_HOST_DEVICE HIPRT_INLINE float4& operator-=( float4& a, const float4& b )\n"
"{\n"
"	a.x -= b.x;\n"
"	a.y -= b.y;\n"
"	a.z -= b.z;\n"
"	a.w -= b.w;\n"
"	return a;\n"
"}\n"
"\n"
"HIPRT_HOST_DEVICE HIPRT_INLINE float4& operator*=( float4& a, const float4& b )\n"
"{\n"
"	a.x *= b.x;\n"
"	a.y *= b.y;\n"
"	a.z *= b.z;\n"
"	a.w *= b.w;\n"
"	return a;\n"
"}\n"
"\n"
"HIPRT_HOST_DEVICE HIPRT_INLINE float4& operator/=( float4& a, const float4& b )\n"
"{\n"
"	a.x /= b.x;\n"
"	a.y /= b.y;\n"
"	a.z /= b.z;\n"
"	a.w /= b.w;\n"
"	return a;\n"
"}\n"
"\n"
"HIPRT_HOST_DEVICE HIPRT_INLINE float4& operator+=( float4& a, const float c )\n"
"{\n"
"	a.x += c;\n"
"	a.y += c;\n"
"	a.z += c;\n"
"	a.w += c;\n"
"	return a;\n"
"}\n"
"\n"
"HIPRT_HOST_DEVICE HIPRT_INLINE float4& operator-=( float4& a, const float c )\n"
"{\n"
"	a.x -= c;\n"
"	a.y -= c;\n"
"	a.z -= c;\n"
"	a.w -= c;\n"
"	return a;\n"
"}\n"
"\n"
"HIPRT_HOST_DEVICE HIPRT_INLINE float4& operator*=( float4& a, const float c )\n"
"{\n"
"	a.x *= c;\n"
"	a.y *= c;\n"
"	a.z *= c;\n"
"	a.w *= c;\n"
"	return a;\n"
"}\n"
"\n"
"HIPRT_HOST_DEVICE HIPRT_INLINE float4& operator/=( float4& a, const float c )\n"
"{\n"
"	a.x /= c;\n"
"	a.y /= c;\n"
"	a.z /= c;\n"
"	a.w /= c;\n"
"	return a;\n"
"}\n"
"\n"
"HIPRT_HOST_DEVICE HIPRT_INLINE float4 operator-( const float4& a ) { return float4{ -a.x, -a.y, -a.z, -a.w }; }\n"
"\n"
"HIPRT_HOST_DEVICE HIPRT_INLINE float4 operator+( const float4& a, const float c )\n"
"{\n"
"	return float4{ c + a.x, c + a.y, c + a.z, c + a.w };\n"
"}\n"
"\n"
"HIPRT_HOST_DEVICE HIPRT_INLINE float4 operator+( const float c, const float4& a )\n"
"{\n"
"	return float4{ c + a.x, c + a.y, c + a.z, c + a.w };\n"
"}\n"
"\n"
"HIPRT_HOST_DEVICE HIPRT_INLINE float4 operator-( const float4& a, const float c )\n"
"{\n"
"	return float4{ a.x - c, a.y - c, a.z - c, a.w - c };\n"
"}\n"
"\n"
"HIPRT_HOST_DEVICE HIPRT_INLINE float4 operator-( const float c, const float4& a )\n"
"{\n"
"	return float4{ c - a.x, c - a.y, c - a.z, c - a.w };\n"
"}\n"
"\n"
"HIPRT_HOST_DEVICE HIPRT_INLINE float4 operator*( const float4& a, const float c )\n"
"{\n"
"	return float4{ c * a.x, c * a.y, c * a.z, c * a.w };\n"
"}\n"
"\n"
"HIPRT_HOST_DEVICE HIPRT_INLINE float4 operator*( const float c, const float4& a )\n"
"{\n"
"	return float4{ c * a.x, c * a.y, c * a.z, c * a.w };\n"
"}\n"
"\n"
"HIPRT_HOST_DEVICE HIPRT_INLINE float4 operator/( const float4& a, const float c )\n"
"{\n"
"	return float4{ a.x / c, a.y / c, a.z / c, a.w / c };\n"
"}\n"
"\n"
"HIPRT_HOST_DEVICE HIPRT_INLINE float4 operator/( const float c, const float4& a )\n"
"{\n"
"	return float4{ c / a.x, c / a.y, c / a.z, c / a.w };\n"
"}\n"
"#endif\n"
"\n"
"namespace hiprt\n"
"{\n"
"HIPRT_HOST_DEVICE HIPRT_INLINE float min( const float a, const float b ) { return fminf( a, b ); }\n"
"\n"
"HIPRT_HOST_DEVICE HIPRT_INLINE float max( const float a, const float b ) { return fmaxf( a, b ); }\n"
"\n"
"HIPRT_HOST_DEVICE HIPRT_INLINE int2 max( const int2& a, const int2& b )\n"
"{\n"
"	int x = max( a.x, b.x );\n"
"	int y = max( a.y, b.y );\n"
"	return int2{ x, y };\n"
"}\n"
"\n"
"HIPRT_HOST_DEVICE HIPRT_INLINE int2 max( const int2& a, const int c )\n"
"{\n"
"	int x = max( a.x, c );\n"
"	int y = max( a.y, c );\n"
"	return int2{ x, y };\n"
"}\n"
"\n"
"HIPRT_HOST_DEVICE HIPRT_INLINE int2 max( const int c, const int2& a ) { return max( a, c ); }\n"
"\n"
"HIPRT_HOST_DEVICE HIPRT_INLINE int2 min( const int2& a, const int2& b )\n"
"{\n"
"	int x = min( a.x, b.x );\n"
"	int y = min( a.y, b.y );\n"
"	return int2{ x, y };\n"
"}\n"
"\n"
"HIPRT_HOST_DEVICE HIPRT_INLINE int2 min( const int2& a, const int c )\n"
"{\n"
"	int x = min( a.x, c );\n"
"	int y = min( a.y, c );\n"
"	return int2{ x, y };\n"
"}\n"
"\n"
"HIPRT_HOST_DEVICE HIPRT_INLINE int2 min( const int c, const int2& a ) { return min( a, c ); }\n"
"\n"
"HIPRT_HOST_DEVICE HIPRT_INLINE int3 max( const int3& a, const int3& b )\n"
"{\n"
"	int x = max( a.x, b.x );\n"
"	int y = max( a.y, b.y );\n"
"	int z = max( a.z, b.z );\n"
"	return int3{ x, y, z };\n"
"}\n"
"\n"
"HIPRT_HOST_DEVICE HIPRT_INLINE int3 max( const int3& a, const int c )\n"
"{\n"
"	int x = max( a.x, c );\n"
"	int y = max( a.y, c );\n"
"	int z = max( a.z, c );\n"
"	return int3{ x, y, z };\n"
"}\n"
"\n"
"HIPRT_HOST_DEVICE HIPRT_INLINE int3 max( const int c, const int3& a ) { return max( a, c ); }\n"
"\n"
"HIPRT_HOST_DEVICE HIPRT_INLINE int3 min( const int3& a, const int3& b )\n"
"{\n"
"	int x = min( a.x, b.x );\n"
"	int y = min( a.y, b.y );\n"
"	int z = min( a.z, b.z );\n"
"	return int3{ x, y, z };\n"
"}\n"
"\n"
"HIPRT_HOST_DEVICE HIPRT_INLINE int3 min( const int3& a, const int c )\n"
"{\n"
"	int x = min( a.x, c );\n"
"	int y = min( a.y, c );\n"
"	int z = min( a.z, c );\n"
"	return int3{ x, y, z };\n"
"}\n"
"\n"
"HIPRT_HOST_DEVICE HIPRT_INLINE int3 min( const int c, const int3& a ) { return min( a, c ); }\n"
"\n"
"HIPRT_HOST_DEVICE HIPRT_INLINE int4 max( const int4& a, const int4& b )\n"
"{\n"
"	int x = max( a.x, b.x );\n"
"	int y = max( a.y, b.y );\n"
"	int z = max( a.z, b.z );\n"
"	int w = max( a.w, b.w );\n"
"	return int4{ x, y, z, w };\n"
"}\n"
"\n"
"HIPRT_HOST_DEVICE HIPRT_INLINE int4 max( const int4& a, const int c )\n"
"{\n"
"	int x = max( a.x, c );\n"
"	int y = max( a.y, c );\n"
"	int z = max( a.z, c );\n"
"	int w = max( a.w, c );\n"
"	return int4{ x, y, z, w };\n"
"}\n"
"\n"
"HIPRT_HOST_DEVICE HIPRT_INLINE int4 max( const int c, const int4& a ) { return max( a, c ); }\n"
"\n"
"HIPRT_HOST_DEVICE HIPRT_INLINE int4 min( const int4& a, const int4& b )\n"
"{\n"
"	int x = min( a.x, b.x );\n"
"	int y = min( a.y, b.y );\n"
"	int z = min( a.z, b.z );\n"
"	int w = min( a.w, b.w );\n"
"	return int4{ x, y, z, w };\n"
"}\n"
"\n"
"HIPRT_HOST_DEVICE HIPRT_INLINE int4 min( const int4& a, const int c )\n"
"{\n"
"	int x = min( a.x, c );\n"
"	int y = min( a.y, c );\n"
"	int z = min( a.z, c );\n"
"	int w = min( a.w, c );\n"
"	return int4{ x, y, z, w };\n"
"}\n"
"\n"
"HIPRT_HOST_DEVICE HIPRT_INLINE int4 min( const int c, const int4& a ) { return min( a, c ); }\n"
"\n"
"HIPRT_HOST_DEVICE HIPRT_INLINE uint2 max( const uint2& a, const uint2& b )\n"
"{\n"
"	unsigned int x = max( a.x, b.x );\n"
"	unsigned int y = max( a.y, b.y );\n"
"	return uint2{ x, y };\n"
"}\n"
"\n"
"HIPRT_HOST_DEVICE HIPRT_INLINE uint2 max( const uint2& a, const unsigned int c )\n"
"{\n"
"	unsigned int x = max( a.x, c );\n"
"	unsigned int y = max( a.y, c );\n"
"	return uint2{ x, y };\n"
"}\n"
"\n"
"HIPRT_HOST_DEVICE HIPRT_INLINE uint2 max( const unsigned int c, const uint2& a ) { return max( a, c ); }\n"
"\n"
"HIPRT_HOST_DEVICE HIPRT_INLINE uint2 min( const uint2& a, const uint2& b )\n"
"{\n"
"	unsigned int x = min( a.x, b.x );\n"
"	unsigned int y = min( a.y, b.y );\n"
"	return uint2{ x, y };\n"
"}\n"
"\n"
"HIPRT_HOST_DEVICE HIPRT_INLINE uint2 min( const uint2& a, const unsigned int c )\n"
"{\n"
"	unsigned int x = min( a.x, c );\n"
"	unsigned int y = min( a.y, c );\n"
"	return uint2{ x, y };\n"
"}\n"
"\n"
"HIPRT_HOST_DEVICE HIPRT_INLINE uint2 min( const unsigned int c, const uint2& a ) { return min( a, c ); }\n"
"\n"
"HIPRT_HOST_DEVICE HIPRT_INLINE uint3 max( const uint3& a, const uint3& b )\n"
"{\n"
"	unsigned int x = max( a.x, b.x );\n"
"	unsigned int y = max( a.y, b.y );\n"
"	unsigned int z = max( a.z, b.z );\n"
"	return uint3{ x, y, z };\n"
"}\n"
"\n"
"HIPRT_HOST_DEVICE HIPRT_INLINE uint3 max( const uint3& a, const unsigned int c )\n"
"{\n"
"	unsigned int x = max( a.x, c );\n"
"	unsigned int y = max( a.y, c );\n"
"	unsigned int z = max( a.z, c );\n"
"	return uint3{ x, y, z };\n"
"}\n"
"\n"
"HIPRT_HOST_DEVICE HIPRT_INLINE uint3 max( const unsigned int c, const uint3& a ) { return max( a, c ); }\n"
"\n"
"HIPRT_HOST_DEVICE HIPRT_INLINE uint3 min( const uint3& a, const uint3& b )\n"
"{\n"
"	unsigned int x = min( a.x, b.x );\n"
"	unsigned int y = min( a.y, b.y );\n"
"	unsigned int z = min( a.z, b.z );\n"
"	return uint3{ x, y, z };\n"
"}\n"
"\n"
"HIPRT_HOST_DEVICE HIPRT_INLINE uint3 min( const uint3& a, const unsigned int c )\n"
"{\n"
"	unsigned int x = min( a.x, c );\n"
"	unsigned int y = min( a.y, c );\n"
"	unsigned int z = min( a.z, c );\n"
"	return uint3{ x, y, z };\n"
"}\n"
"\n"
"HIPRT_HOST_DEVICE HIPRT_INLINE uint3 min( const unsigned int c, const uint3& a ) { return min( a, c ); }\n"
"\n"
"HIPRT_HOST_DEVICE HIPRT_INLINE uint4 max( const uint4& a, const uint4& b )\n"
"{\n"
"	unsigned int x = max( a.x, b.x );\n"
"	unsigned int y = max( a.y, b.y );\n"
"	unsigned int z = max( a.z, b.z );\n"
"	unsigned int w = max( a.w, b.w );\n"
"	return uint4{ x, y, z, w };\n"
"}\n"
"\n"
"HIPRT_HOST_DEVICE HIPRT_INLINE uint4 max( const uint4& a, const unsigned int c )\n"
"{\n"
"	unsigned int x = max( a.x, c );\n"
"	unsigned int y = max( a.y, c );\n"
"	unsigned int z = max( a.z, c );\n"
"	unsigned int w = max( a.w, c );\n"
"	return uint4{ x, y, z, w };\n"
"}\n"
"\n"
"HIPRT_HOST_DEVICE HIPRT_INLINE uint4 max( const unsigned int c, const uint4& a ) { return max( a, c ); }\n"
"\n"
"HIPRT_HOST_DEVICE HIPRT_INLINE uint4 min( const uint4& a, const uint4& b )\n"
"{\n"
"	unsigned int x = min( a.x, b.x );\n"
"	unsigned int y = min( a.y, b.y );\n"
"	unsigned int z = min( a.z, b.z );\n"
"	unsigned int w = min( a.w, b.w );\n"
"	return uint4{ x, y, z, w };\n"
"}\n"
"\n"
"HIPRT_HOST_DEVICE HIPRT_INLINE uint4 min( const uint4& a, const unsigned int c )\n"
"{\n"
"	unsigned int x = min( a.x, c );\n"
"	unsigned int y = min( a.y, c );\n"
"	unsigned int z = min( a.z, c );\n"
"	unsigned int w = min( a.w, c );\n"
"	return uint4{ x, y, z, w };\n"
"}\n"
"\n"
"HIPRT_HOST_DEVICE HIPRT_INLINE uint4 min( const unsigned int c, const uint4& a ) { return min( a, c ); }\n"
"\n"
"HIPRT_HOST_DEVICE HIPRT_INLINE float2 max( const float2& a, const float2& b )\n"
"{\n"
"	float x = fmaxf( a.x, b.x );\n"
"	float y = fmaxf( a.y, b.y );\n"
"	return float2{ x, y };\n"
"}\n"
"\n"
"HIPRT_HOST_DEVICE HIPRT_INLINE float2 max( const float2& a, const float c )\n"
"{\n"
"	float x = fmaxf( a.x, c );\n"
"	float y = fmaxf( a.y, c );\n"
"	return float2{ x, y };\n"
"}\n"
"\n"
"HIPRT_HOST_DEVICE HIPRT_INLINE float2 max( const float c, const float2& a ) { return max( a, c ); }\n"
"\n"
"HIPRT_HOST_DEVICE HIPRT_INLINE float2 min( const float2& a, const float2& b )\n"
"{\n"
"	float x = fminf( a.x, b.x );\n"
"	float y = fminf( a.y, b.y );\n"
"	return float2{ x, y };\n"
"}\n"
"\n"
"HIPRT_HOST_DEVICE HIPRT_INLINE float2 min( const float2& a, const float c )\n"
"{\n"
"	float x = fminf( a.x, c );\n"
"	float y = fminf( a.y, c );\n"
"	return float2{ x, y };\n"
"}\n"
"\n"
"HIPRT_HOST_DEVICE HIPRT_INLINE float2 min( const float c, const float2& a ) { return min( a, c ); }\n"
"\n"
"HIPRT_HOST_DEVICE HIPRT_INLINE float3 max( const float3& a, const float3& b )\n"
"{\n"
"	float x = fmaxf( a.x, b.x );\n"
"	float y = fmaxf( a.y, b.y );\n"
"	float z = fmaxf( a.z, b.z );\n"
"	return float3{ x, y, z };\n"
"}\n"
"\n"
"HIPRT_HOST_DEVICE HIPRT_INLINE float3 max( const float3& a, const float c )\n"
"{\n"
"	float x = fmaxf( a.x, c );\n"
"	float y = fmaxf( a.y, c );\n"
"	float z = fmaxf( a.z, c );\n"
"	return float3{ x, y, z };\n"
"}\n"
"\n"
"HIPRT_HOST_DEVICE HIPRT_INLINE float3 max( const float c, const float3& a ) { return max( a, c ); }\n"
"\n"
"HIPRT_HOST_DEVICE HIPRT_INLINE float3 min( const float3& a, const float3& b )\n"
"{\n"
"	float x = fminf( a.x, b.x );\n"
"	float y = fminf( a.y, b.y );\n"
"	float z = fminf( a.z, b.z );\n"
"	return float3{ x, y, z };\n"
"}\n"
"\n"
"HIPRT_HOST_DEVICE HIPRT_INLINE float3 min( const float3& a, const float c )\n"
"{\n"
"	float x = fminf( a.x, c );\n"
"	float y = fminf( a.y, c );\n"
"	float z = fminf( a.z, c );\n"
"	return float3{ x, y, z };\n"
"}\n"
"\n"
"HIPRT_HOST_DEVICE HIPRT_INLINE float3 min( const float c, const float3& a ) { return min( a, c ); }\n"
"\n"
"HIPRT_HOST_DEVICE HIPRT_INLINE float4 max( const float4& a, const float4& b )\n"
"{\n"
"	float x = fmaxf( a.x, b.x );\n"
"	float y = fmaxf( a.y, b.y );\n"
"	float z = fmaxf( a.z, b.z );\n"
"	float w = fmaxf( a.w, b.w );\n"
"	return float4{ x, y, z, w };\n"
"}\n"
"\n"
"HIPRT_HOST_DEVICE HIPRT_INLINE float4 max( const float4& a, const float c )\n"
"{\n"
"	float x = fmaxf( a.x, c );\n"
"	float y = fmaxf( a.y, c );\n"
"	float z = fmaxf( a.z, c );\n"
"	float w = fmaxf( a.w, c );\n"
"	return float4{ x, y, z, w };\n"
"}\n"
"\n"
"HIPRT_HOST_DEVICE HIPRT_INLINE float4 max( const float c, const float4& a ) { return max( a, c ); }\n"
"\n"
"HIPRT_HOST_DEVICE HIPRT_INLINE float4 min( const float4& a, const float4& b )\n"
"{\n"
"	float x = fminf( a.x, b.x );\n"
"	float y = fminf( a.y, b.y );\n"
"	float z = fminf( a.z, b.z );\n"
"	float w = fminf( a.w, b.w );\n"
"	return float4{ x, y, z, w };\n"
"}\n"
"\n"
"HIPRT_HOST_DEVICE HIPRT_INLINE float4 min( const float4& a, const float c )\n"
"{\n"
"	float x = fminf( a.x, c );\n"
"	float y = fminf( a.y, c );\n"
"	float z = fminf( a.z, c );\n"
"	float w = fminf( a.w, c );\n"
"	return float4{ x, y, z, w };\n"
"}\n"
"\n"
"HIPRT_HOST_DEVICE HIPRT_INLINE float4 min( const float c, const float4& a ) { return min( a, c ); }\n"
"\n"
"HIPRT_HOST_DEVICE HIPRT_INLINE float3 fma( const float3& a, const float3& b, const float3& c )\n"
"{\n"
"	float x = fmaf( a.x, b.x, c.x );\n"
"	float y = fmaf( a.y, b.y, c.y );\n"
"	float z = fmaf( a.z, b.z, c.z );\n"
"	return float3{ x, y, z };\n"
"}\n"
"\n"
"HIPRT_HOST_DEVICE HIPRT_INLINE float sumOfProducts( const float a, const float b, const float c, const float d )\n"
"{\n"
"	const float cd = c * d;\n"
"	return fmaf( a, b, cd ) + fmaf( c, d, -cd );\n"
"}\n"
"\n"
"HIPRT_HOST_DEVICE HIPRT_INLINE float differenceOfProducts( const float a, const float b, const float c, const float d )\n"
"{\n"
"	const float cd = c * d;\n"
"	return fmaf( a, b, -cd ) - fmaf( c, d, -cd );\n"
"}\n"
"\n"
"HIPRT_HOST_DEVICE HIPRT_INLINE float hypot( const float3& a )\n"
"{\n"
"	float x = fabsf( a.x );\n"
"	float y = fabsf( a.y );\n"
"	float z = fabsf( a.z );\n"
"	float d = x < y ? y < z ? z : y : x < z ? z : x;\n"
"	return d != 0.0f ? ( d * sqrtf( ( x / d ) * ( x / d ) + ( y / d ) * ( y / d ) + ( z / d ) * ( z / d ) ) ) : 0.0f;\n"
"}\n"
"\n"
"HIPRT_HOST_DEVICE HIPRT_INLINE float3 cross( const float3& a, const float3& b )\n"
"{\n"
"	return {\n"
"		differenceOfProducts( a.y, b.z, a.z, b.y ),\n"
"		differenceOfProducts( a.z, b.x, a.x, b.z ),\n"
"		differenceOfProducts( a.x, b.y, a.y, b.x ) };\n"
"}\n"
"\n"
"HIPRT_HOST_DEVICE HIPRT_INLINE float dot( const float3& a, const float3& b )\n"
"{\n"
"	return fmaf( a.x, b.x, sumOfProducts( a.y, b.y, a.z, b.z ) );\n"
"}\n"
"\n"
"HIPRT_HOST_DEVICE HIPRT_INLINE float3 normalize( const float3& a ) { return a / hypot( a ); }\n"
"\n"
"template <typename V>\n"
"HIPRT_HOST_DEVICE HIPRT_INLINE auto* ptr( V& a )\n"
"{\n"
"	return &a.x;\n"
"}\n"
"\n"
"template <typename T, typename V>\n"
"HIPRT_HOST_DEVICE HIPRT_INLINE V mix( const V& lo, const V& hi, const T& t )\n"
"{\n"
"	return lo * ( static_cast<T>( 1 ) - t ) + hi * t;\n"
"}\n"
"\n"
"template <typename T, typename V>\n"
"HIPRT_HOST_DEVICE HIPRT_INLINE V clamp( const V& v, const T& lo, const T& hi )\n"
"{\n"
"	return max( min( v, hi ), lo );\n"
"}\n"
"\n"
"template <typename T>\n"
"HIPRT_HOST_DEVICE HIPRT_INLINE T sign( T val )\n"
"{\n"
"	return val < T( 0 ) ? T( -1 ) : ( val == T( 0 ) ? T( 0 ) : T( 1 ) );\n"
"}\n"
"\n"
"#if defined( __KERNELCC__ )\n"
"HIPRT_DEVICE HIPRT_INLINE float atomicMinFloat( float* addr, float value )\n"
"{\n"
"	float old;\n"
"	old = ( __float_as_int( value ) >= 0 )\n"
"			  ? __int_as_float( atomicMin( reinterpret_cast<int*>( addr ), __float_as_int( value ) ) )\n"
"			  : __uint_as_float( atomicMax( reinterpret_cast<unsigned int*>( addr ), __float_as_uint( value ) ) );\n"
"	return old;\n"
"}\n"
"\n"
"HIPRT_DEVICE HIPRT_INLINE float atomicMaxFloat( float* addr, float value )\n"
"{\n"
"	float old;\n"
"	old = ( __float_as_int( value ) >= 0 )\n"
"			  ? __int_as_float( atomicMax( reinterpret_cast<int*>( addr ), __float_as_int( value ) ) )\n"
"			  : __uint_as_float( atomicMin( reinterpret_cast<unsigned int*>( addr ), __float_as_uint( value ) ) );\n"
"	return old;\n"
"}\n"
"#endif\n"
"} // namespace hiprt\n"
;
// hip_Obb: source text stored as one C string, built from adjacent string literals (one literal
// per embedded source line). Every byte inside the quotes is part of the string's value; only
// comments placed outside the literals, like this one, leave the value untouched.
// NOTE(review): presumably this text is compiled at run time as a device header - confirm
// against the code that consumes hip_Obb.
//
// Inside namespace hiprt the embedded text declares:
//   - RotationCount (88) and EncodedRotations[88][9]: nine encoded entries per rotation matrix,
//     stored row-major (read as [3 * row + col]).
//   - IndexToFloat[26]: raw 32-bit patterns selected by the low five bits of an encoded entry.
//   - MatrixIndexToId[RotationCount + 1]: a lookup table that nothing in this string reads.
//   - getRotationMatrixEntry / getRotationMatrix: decode one entry or a whole 3x3 matrix.
//   - class Obb: an Aabb plus a matrix index; points are rotated by that matrix before growing
//     the box when the index is below RotationCount, and used unrotated otherwise.
//   - class Kdop: RotationCount + 1 boxes; slot RotationCount is grown with the unrotated point,
//     slots [0, RotationCount) with the point rotated by the matching matrix.
static const char* hip_Obb= \
"\n"
"#pragma once\n"
"#include <hiprt/hiprt_math.h>\n"
"#include <hiprt/impl/Aabb.h>\n"
"\n"
"namespace hiprt\n"
"{\n"
"static constexpr uint32_t RotationCount = 88;\n"
"\n"
"HIPRT_CONST static uint32_t EncodedRotations[RotationCount][9] = {\n"
"	{ 25, 0, 0, 0, 22, 43, 0, 11, 22 },		{ 25, 0, 0, 0, 22, 11, 0, 43, 22 },		{ 25, 0, 0, 0, 17, 49, 0, 17, 17 },\n"
"	{ 25, 0, 0, 0, 0, 57, 0, 25, 0 },		{ 22, 0, 11, 0, 25, 0, 43, 0, 22 },		{ 22, 0, 43, 0, 25, 0, 11, 0, 22 },\n"
"	{ 17, 0, 17, 0, 25, 0, 49, 0, 17 },		{ 22, 43, 0, 11, 22, 0, 0, 0, 25 },		{ 22, 11, 0, 43, 22, 0, 0, 0, 25 },\n"
"	{ 17, 49, 0, 17, 17, 0, 0, 0, 25 },		{ 22, 38, 6, 6, 24, 1, 38, 1, 24 },		{ 22, 6, 38, 38, 24, 1, 6, 1, 24 },\n"
"	{ 17, 44, 12, 12, 20, 2, 44, 2, 20 },	{ 17, 12, 44, 44, 20, 2, 12, 2, 20 },	{ 11, 47, 15, 15, 16, 7, 47, 7, 16 },\n"
"	{ 11, 15, 47, 47, 16, 7, 15, 7, 16 },	{ 0, 49, 17, 17, 12, 12, 49, 12, 12 },	{ 0, 17, 49, 49, 12, 12, 17, 12, 12 },\n"
"	{ 22, 38, 38, 6, 24, 33, 6, 33, 24 },	{ 22, 6, 6, 38, 24, 33, 38, 33, 24 },	{ 17, 44, 44, 12, 20, 34, 12, 34, 20 },\n"
"	{ 17, 12, 12, 44, 20, 34, 44, 34, 20 }, { 11, 47, 47, 15, 16, 39, 15, 39, 16 }, { 11, 15, 15, 47, 16, 39, 47, 39, 16 },\n"
"	{ 0, 49, 49, 17, 12, 44, 17, 44, 12 },	{ 0, 17, 17, 49, 12, 44, 49, 44, 12 },	{ 24, 38, 1, 6, 22, 38, 1, 6, 24 },\n"
"	{ 24, 6, 1, 38, 22, 6, 1, 38, 24 },		{ 20, 44, 2, 12, 17, 44, 2, 12, 20 },	{ 20, 12, 2, 44, 17, 12, 2, 44, 20 },\n"
"	{ 16, 47, 7, 15, 11, 47, 7, 15, 16 },	{ 16, 15, 7, 47, 11, 15, 7, 47, 16 },	{ 12, 49, 12, 17, 0, 49, 12, 17, 12 },\n"
"	{ 12, 17, 12, 49, 0, 17, 12, 49, 12 },	{ 24, 6, 33, 38, 22, 38, 33, 6, 24 },	{ 24, 38, 33, 6, 22, 6, 33, 38, 24 },\n"
"	{ 20, 12, 34, 44, 17, 44, 34, 12, 20 }, { 20, 44, 34, 12, 17, 12, 34, 44, 20 }, { 16, 15, 39, 47, 11, 47, 39, 15, 16 },\n"
"	{ 16, 47, 39, 15, 11, 15, 39, 47, 16 }, { 12, 17, 44, 49, 0, 49, 44, 17, 12 },	{ 12, 49, 44, 17, 0, 17, 44, 49, 12 },\n"
"	{ 24, 1, 6, 1, 24, 38, 38, 6, 22 },		{ 24, 1, 38, 1, 24, 6, 6, 38, 22 },		{ 20, 2, 12, 2, 20, 44, 44, 12, 17 },\n"
"	{ 20, 2, 44, 2, 20, 12, 12, 44, 17 },	{ 16, 7, 15, 7, 16, 47, 47, 15, 11 },	{ 16, 7, 47, 7, 16, 15, 15, 47, 11 },\n"
"	{ 12, 12, 17, 12, 12, 49, 49, 17, 0 },	{ 24, 33, 6, 33, 24, 6, 38, 38, 22 },	{ 24, 33, 38, 33, 24, 38, 6, 6, 22 },\n"
"	{ 20, 34, 12, 34, 20, 12, 44, 44, 17 }, { 20, 34, 44, 34, 20, 44, 12, 12, 17 }, { 16, 39, 15, 39, 16, 15, 47, 47, 11 },\n"
"	{ 16, 39, 47, 39, 16, 47, 15, 15, 11 }, { 12, 44, 17, 44, 12, 17, 49, 49, 0 },	{ 23, 35, 5, 5, 23, 35, 35, 5, 23 },\n"
"	{ 23, 5, 35, 35, 23, 5, 5, 35, 23 },	{ 19, 40, 13, 13, 19, 40, 40, 13, 19 }, { 19, 13, 40, 40, 19, 13, 13, 40, 19 },\n"
"	{ 14, 41, 18, 18, 14, 41, 41, 18, 14 }, { 14, 18, 41, 41, 14, 18, 18, 41, 14 }, { 10, 36, 21, 21, 10, 36, 36, 21, 10 },\n"
"	{ 10, 21, 36, 36, 10, 21, 21, 36, 10 }, { 23, 37, 3, 3, 23, 5, 37, 35, 23 },	{ 23, 3, 37, 37, 23, 35, 3, 5, 23 },\n"
"	{ 19, 45, 8, 8, 19, 13, 45, 40, 19 },	{ 19, 8, 45, 45, 19, 40, 8, 13, 19 },	{ 14, 50, 9, 9, 14, 18, 50, 41, 14 },\n"
"	{ 14, 9, 50, 50, 14, 41, 9, 18, 14 },	{ 10, 53, 4, 4, 10, 21, 53, 36, 10 },	{ 10, 4, 53, 53, 10, 36, 4, 21, 10 },\n"
"	{ 23, 37, 35, 3, 23, 37, 5, 3, 23 },	{ 23, 3, 5, 37, 23, 3, 35, 37, 23 },	{ 19, 45, 40, 8, 19, 45, 13, 8, 19 },\n"
"	{ 19, 8, 13, 45, 19, 8, 40, 45, 19 },	{ 14, 50, 41, 9, 14, 50, 18, 9, 14 },	{ 14, 9, 18, 50, 14, 9, 41, 50, 14 },\n"
"	{ 10, 53, 36, 4, 10, 53, 21, 4, 10 },	{ 10, 4, 21, 53, 10, 4, 36, 53, 10 },	{ 23, 35, 37, 5, 23, 3, 3, 37, 23 },\n"
"	{ 23, 5, 3, 35, 23, 37, 37, 3, 23 },	{ 19, 40, 45, 13, 19, 8, 8, 45, 19 },	{ 19, 13, 8, 40, 19, 45, 45, 8, 19 },\n"
"	{ 14, 41, 50, 18, 14, 9, 9, 50, 14 },	{ 14, 18, 9, 41, 14, 50, 50, 9, 14 },	{ 10, 36, 53, 21, 10, 4, 4, 53, 10 },\n"
"	{ 10, 21, 4, 36, 10, 53, 53, 4, 10 } };\n"
"\n"
"HIPRT_CONST static uint32_t IndexToFloat[26] = {\n"
"	0,			0x3d1be50c, 0x3e15f61a, 0x3e484336, 0x3e79df93, 0x3e7c3a3a, 0x3e8a8bd4, 0x3e9e0875, 0x3e9f0938,\n"
"	0x3ea7bf1b, 0x3eaaaaab, 0x3ec3ef15, 0x3f000000, 0x3f01814f, 0x3f16a507, 0x3f273d75, 0x3f30fbc5, 0x3f3504f3,\n"
"	0x3f3d3a87, 0x3f4e034d, 0x3f5a827a, 0x3f692290, 0x3f6c835e, 0x3f73023f, 0x3f7641af, 0x3f800000 };\n"
"\n"
"HIPRT_CONST static uint32_t MatrixIndexToId[RotationCount + 1] = {\n"
"	0,	1,	2,	6,	8,	9,	10, 16, 17, 18, 24, 25, 26, 27, 28, 29, 30, 31, 32, 33, 34, 35, 36, 37, 38,	 39,  40,  41,	42, 43,\n"
"	44, 45, 46, 47, 48, 49, 50, 51, 52, 53, 54, 55, 56, 57, 58, 59, 60, 61, 62, 64, 65, 66, 67, 68, 69,	 70,  72,  73,	74, 75,\n"
"	76, 77, 78, 79, 80, 81, 82, 83, 84, 85, 86, 87, 88, 89, 90, 91, 92, 93, 94, 95, 96, 97, 98, 99, 100, 101, 102, 103, 127 };\n"
"\n"
// Decoding of one table entry: the low five bits pick an IndexToFloat pattern, and the bits
// above them are shifted up to bit 31 before the word is handed to as_float.
"HIPRT_HOST_DEVICE HIPRT_INLINE static float\n"
"getRotationMatrixEntry( const uint32_t matrixIndex, const uint32_t row, const uint32_t col )\n"
"{\n"
"	const uint32_t entryIndex = EncodedRotations[matrixIndex][3 * row + col];\n"
"	const uint32_t out		  = IndexToFloat[entryIndex & 0x1f] | ( ( entryIndex >> 5 ) << 31 );\n"
"	return as_float( out );\n"
"}\n"
"\n"
"HIPRT_HOST_DEVICE HIPRT_INLINE static void getRotationMatrix( const uint32_t matrixIndex, float ( &R )[3][3] )\n"
"{\n"
"	for ( uint32_t i = 0; i < 3; ++i )\n"
"	{\n"
"		for ( uint32_t j = 0; j < 3; ++j )\n"
"		{\n"
"			R[i][j] = getRotationMatrixEntry( matrixIndex, i, j );\n"
"		}\n"
"	}\n"
"}\n"
"\n"
"class Obb\n"
"{\n"
"public:\n"
"	HIPRT_HOST_DEVICE Obb( const uint32_t matrixIndex ) : m_matrixIndex( matrixIndex ) { m_box.reset(); }\n"
"\n"
"	HIPRT_HOST_DEVICE Obb& grow( const float3& point )\n"
"	{\n"
"		if ( m_matrixIndex < RotationCount )\n"
"		{\n"
"			float R[3][3];\n"
"			getRotationMatrix( m_matrixIndex, R );\n"
"\n"
"			float3 p{};\n"
"			p.x = dot( { R[0][0], R[0][1], R[0][2] }, point );\n"
"			p.y = dot( { R[1][0], R[1][1], R[1][2] }, point );\n"
"			p.z = dot( { R[2][0], R[2][1], R[2][2] }, point );\n"
"			m_box.grow( p );\n"
"		}\n"
"		else\n"
"		{\n"
"			m_box.grow( point );\n"
"		}\n"
"\n"
"		return *this;\n"
"	}\n"
"\n"
// Growing by a box feeds each of its eight corners through the point overload.
"	HIPRT_HOST_DEVICE Obb& grow( const Aabb& aabb )\n"
"	{\n"
"		grow( aabb.m_min );\n"
"		grow( { aabb.m_min.x, aabb.m_min.y, aabb.m_max.z } );\n"
"		grow( { aabb.m_min.x, aabb.m_max.y, aabb.m_min.z } );\n"
"		grow( { aabb.m_min.x, aabb.m_max.y, aabb.m_max.z } );\n"
"		grow( { aabb.m_max.x, aabb.m_min.y, aabb.m_min.z } );\n"
"		grow( { aabb.m_max.x, aabb.m_min.y, aabb.m_max.z } );\n"
"		grow( { aabb.m_max.x, aabb.m_max.y, aabb.m_min.z } );\n"
"		grow( aabb.m_max );\n"
"		return *this;\n"
"	}\n"
"\n"
"	HIPRT_HOST_DEVICE Aabb aabb() const { return m_box; }\n"
"\n"
"	HIPRT_HOST_DEVICE float area() const { return aabb().area(); }\n"
"\n"
"	HIPRT_HOST_DEVICE bool valid() const { return m_box.valid(); }\n"
"\n"
"	Aabb	 m_box;\n"
"	uint32_t m_matrixIndex;\n"
"};\n"
"\n"
"class Kdop\n"
"{\n"
"public:\n"
"	HIPRT_HOST_DEVICE Kdop() { reset(); }\n"
"\n"
"	HIPRT_HOST_DEVICE Kdop( const Kdop& kdop )\n"
"	{\n"
"		for ( uint32_t i = 0; i <= RotationCount; ++i )\n"
"			m_boxes[i] = kdop.m_boxes[i];\n"
"	}\n"
"\n"
"	HIPRT_HOST_DEVICE Kdop( const Aabb& aabb )\n"
"	{\n"
"		reset();\n"
"		grow( aabb );\n"
"	}\n"
"\n"
"	HIPRT_HOST_DEVICE void reset( void )\n"
"	{\n"
"		for ( uint32_t i = 0; i <= RotationCount; ++i )\n"
"			m_boxes[i].reset();\n"
"	}\n"
"\n"
"	HIPRT_HOST_DEVICE Kdop& grow( const Kdop& kdop )\n"
"	{\n"
"		for ( uint32_t i = 0; i <= RotationCount; ++i )\n"
"			m_boxes[i].grow( kdop.m_boxes[i] );\n"
"		return *this;\n"
"	}\n"
"\n"
"	HIPRT_HOST_DEVICE Kdop& grow( const Aabb& aabb )\n"
"	{\n"
"		grow( aabb.m_min );\n"
"		grow( { aabb.m_min.x, aabb.m_min.y, aabb.m_max.z } );\n"
"		grow( { aabb.m_min.x, aabb.m_max.y, aabb.m_min.z } );\n"
"		grow( { aabb.m_min.x, aabb.m_max.y, aabb.m_max.z } );\n"
"		grow( { aabb.m_max.x, aabb.m_min.y, aabb.m_min.z } );\n"
"		grow( { aabb.m_max.x, aabb.m_min.y, aabb.m_max.z } );\n"
"		grow( { aabb.m_max.x, aabb.m_max.y, aabb.m_min.z } );\n"
"		grow( aabb.m_max );\n"
"		return *this;\n"
"	}\n"
"\n"
// The last slot (index RotationCount) takes the point as-is; every other slot takes the point
// multiplied by that slot's rotation matrix.
"	HIPRT_HOST_DEVICE Kdop& grow( const float3& point )\n"
"	{\n"
"		m_boxes[RotationCount].grow( point );\n"
"		float R[3][3];\n"
"		for ( uint32_t i = 0; i < RotationCount; ++i )\n"
"		{\n"
"			getRotationMatrix( i, R );\n"
"\n"
"			float3 p{};\n"
"			p.x = dot( { R[0][0], R[0][1], R[0][2] }, point );\n"
"			p.y = dot( { R[1][0], R[1][1], R[1][2] }, point );\n"
"			p.z = dot( { R[2][0], R[2][1], R[2][2] }, point );\n"
"\n"
"			m_boxes[i].grow( p );\n"
"		}\n"
"\n"
"		return *this;\n"
"	}\n"
"\n"
"	HIPRT_HOST_DEVICE Obb obb( const uint32_t matrixIndex ) const\n"
"	{\n"
"		Obb obb( matrixIndex );\n"
"		obb.m_box = m_boxes[matrixIndex];\n"
"		return obb;\n"
"	}\n"
"\n"
// Scans from slot RotationCount down to 0 with a strict comparison, so among equal areas the
// highest index wins; InvalidValue is returned when no area is below FltMax.
"	HIPRT_HOST_DEVICE uint32_t minMatrixIndex() const\n"
"	{\n"
"		float	 minArea  = FltMax;\n"
"		uint32_t minIndex = InvalidValue;\n"
"		for ( int32_t i = RotationCount; i >= 0; --i )\n"
"		{\n"
"			const float area = m_boxes[i].area();\n"
"			if ( minArea > area )\n"
"			{\n"
"				minArea	 = area;\n"
"				minIndex = i;\n"
"			}\n"
"		}\n"
"		return minIndex;\n"
"	}\n"
"\n"
"	// aabb is valid => all other frames are valid as well\n"
"	HIPRT_HOST_DEVICE bool valid() const { return m_boxes[RotationCount].valid(); }\n"
"\n"
"public:\n"
"	Aabb m_boxes[RotationCount + 1];\n"
"};\n"
"} // namespace hiprt\n"
;
// hip_Aabb: source text stored as one C string, built from adjacent string literals (one literal
// per embedded source line). Every byte inside the quotes is part of the string's value.
// NOTE(review): presumably this text is compiled at run time as a device header - confirm
// against the code that consumes hip_Aabb.
//
// The embedded text declares hiprt::Aabb, a box described by two float3 corners (m_min, m_max):
//   - reset() sets m_min to +FltMax and m_max to -FltMax, so the first grow() defines the box;
//   - grow / intersect(Aabb) combine boxes with component-wise min/max;
//   - intersect(origin, invDirection, maxT) returns the clamped entry/exit parameters (t0, t1)
//     of the slab test as a float2;
//   - under __KERNELCC__: atomicGrow() updates each component with atomicMinFloat /
//     atomicMaxFloat, and shuffle() rebuilds a box from per-component shfl() calls.
//
// FIX: shuffle() used to fill m_min.y and m_min.z from shfl( m_min.x, ... ), so the returned
// box had its min corner's x value in all three components. Each component now shuffles its
// own source value.
// NOTE(review): this file is marked as automatically generated; apply the same correction to
// the header it is generated from, otherwise regeneration will reintroduce the bug.
static const char* hip_Aabb= \
"\n"
"#pragma once\n"
"#include <hiprt/hiprt_math.h>\n"
"\n"
"namespace hiprt\n"
"{\n"
"class Aabb\n"
"{\n"
"public:\n"
"	HIPRT_HOST_DEVICE Aabb() { reset(); }\n"
"\n"
"	HIPRT_HOST_DEVICE Aabb( const float3& point ) : m_min( point ), m_max( point ) {}\n"
"\n"
"	HIPRT_HOST_DEVICE Aabb( const float3& mn, const float3& mx ) : m_min( mn ), m_max( mx ) {}\n"
"\n"
"	HIPRT_HOST_DEVICE Aabb( const Aabb& aabb0, const Aabb& aabb1 )\n"
"	{\n"
"		m_min = min( aabb0.m_min, aabb1.m_min );\n"
"		m_max = max( aabb0.m_max, aabb1.m_max );\n"
"	}\n"
"\n"
"	HIPRT_HOST_DEVICE Aabb( const Aabb& aabb ) : m_min( aabb.m_min ), m_max( aabb.m_max ) {}\n"
"\n"
"	HIPRT_HOST_DEVICE void reset( void )\n"
"	{\n"
"		m_min = make_float3( FltMax );\n"
"		m_max = make_float3( -FltMax );\n"
"	}\n"
"\n"
"	HIPRT_HOST_DEVICE Aabb& grow( const Aabb& aabb )\n"
"	{\n"
"		m_min = min( m_min, aabb.m_min );\n"
"		m_max = max( m_max, aabb.m_max );\n"
"		return *this;\n"
"	}\n"
"\n"
"	HIPRT_HOST_DEVICE Aabb& grow( const float3& point )\n"
"	{\n"
"		m_min = min( m_min, point );\n"
"		m_max = max( m_max, point );\n"
"		return *this;\n"
"	}\n"
"\n"
"	HIPRT_HOST_DEVICE float3 center() const { return ( m_max + m_min ) * 0.5f; }\n"
"\n"
"	HIPRT_HOST_DEVICE float3 extent() const { return m_max - m_min; }\n"
"\n"
"	HIPRT_HOST_DEVICE float area() const\n"
"	{\n"
"		float3 ext = extent();\n"
"		return 2 * ( ext.x * ext.y + ext.x * ext.z + ext.y * ext.z );\n"
"	}\n"
"\n"
"	HIPRT_HOST_DEVICE bool valid( void ) const { return m_min.x <= m_max.x && m_min.y <= m_max.y && m_min.z <= m_max.z; }\n"
"\n"
"	HIPRT_HOST_DEVICE bool contains( const float3& point ) const\n"
"	{\n"
"		return m_min.x <= point.x && m_min.y <= point.y && m_min.z <= point.z && point.x <= m_max.x && point.y <= m_max.y &&\n"
"			   point.z <= m_max.z;\n"
"	}\n"
"\n"
"	HIPRT_HOST_DEVICE Aabb& intersect( const Aabb& aabb )\n"
"	{\n"
"		m_min = max( m_min, aabb.m_min );\n"
"		m_max = min( m_max, aabb.m_max );\n"
"		return *this;\n"
"	}\n"
"\n"
"	HIPRT_HOST_DEVICE float2 intersect( const float3& origin, const float3& invDirection, float maxT ) const\n"
"	{\n"
"		float3 f	= ( m_max - origin ) * invDirection;\n"
"		float3 n	= ( m_min - origin ) * invDirection;\n"
"		float3 tmax = max( f, n );\n"
"		float3 tmin = min( f, n );\n"
"		float  t1	= fminf( fminf( fminf( tmax.x, tmax.y ), tmax.z ), maxT );\n"
"		float  t0	= fmaxf( fmaxf( fmaxf( tmin.x, tmin.y ), tmin.z ), 0.0f );\n"
"		return float2{ t0, t1 };\n"
"	}\n"
"\n"
"#if defined( __KERNELCC__ )\n"
"	HIPRT_DEVICE void atomicGrow( const Aabb& aabb )\n"
"	{\n"
"		atomicMinFloat( &m_min.x, aabb.m_min.x );\n"
"		atomicMinFloat( &m_min.y, aabb.m_min.y );\n"
"		atomicMinFloat( &m_min.z, aabb.m_min.z );\n"
"		atomicMaxFloat( &m_max.x, aabb.m_max.x );\n"
"		atomicMaxFloat( &m_max.y, aabb.m_max.y );\n"
"		atomicMaxFloat( &m_max.z, aabb.m_max.z );\n"
"	}\n"
"\n"
"	HIPRT_DEVICE void atomicGrow( const float3& p )\n"
"	{\n"
"		atomicMinFloat( &m_min.x, p.x );\n"
"		atomicMinFloat( &m_min.y, p.y );\n"
"		atomicMinFloat( &m_min.z, p.z );\n"
"		atomicMaxFloat( &m_max.x, p.x );\n"
"		atomicMaxFloat( &m_max.y, p.y );\n"
"		atomicMaxFloat( &m_max.z, p.z );\n"
"	}\n"
"\n"
"	HIPRT_DEVICE Aabb shuffle( uint32_t index )\n"
"	{\n"
"		Aabb aabb;\n"
"		aabb.m_min.x = shfl( m_min.x, index );\n"
// Corrected: the next two lines previously shuffled m_min.x into the y and z components.
"		aabb.m_min.y = shfl( m_min.y, index );\n"
"		aabb.m_min.z = shfl( m_min.z, index );\n"
"		aabb.m_max.x = shfl( m_max.x, index );\n"
"		aabb.m_max.y = shfl( m_max.y, index );\n"
"		aabb.m_max.z = shfl( m_max.z, index );\n"
"		return aabb;\n"
"	}\n"
"#endif\n"
"\n"
"public:\n"
"	float3 m_min;\n"
"	float3 m_max;\n"
"};\n"
"} // namespace hiprt\n"
;
// Embedded source text defining hiprt::AabbList, a read-only view over a strided buffer of
// axis-aligned boxes described by a hiprtAABBListPrimitive.
// The comments in this definition sit between the adjacent string literals, so they are not part
// of the embedded text.
// NOTE: the file is generated; the fetchAabb change below must also be made in the header this
// text is generated from.
static const char* hip_AabbList= \
"\n"
"#pragma once\n"
"#include <hiprt/hiprt_types.h>\n"
"#include <hiprt/impl/BvhNode.h>\n"
"\n"
"namespace hiprt\n"
"{\n"
"class AabbList\n"
"{\n"
"public:\n"
// Constructor: copies aabbCount and aabbStride from `list` and keeps list.aabbs as a byte pointer
// so that record offsets can be computed in bytes.
"	HIPRT_HOST_DEVICE AabbList( const hiprtAABBListPrimitive& list )\n"
"		: m_aabbCount( list.aabbCount ), m_aabbStride( list.aabbStride )\n"
"	{\n"
"		m_aabbs = reinterpret_cast<const uint8_t*>( list.aabbs );\n"
"	}\n"
"\n"
// fetchPrimNode: the primitive node for a box is a CustomNode initialised from its index.
"	HIPRT_HOST_DEVICE CustomNode fetchPrimNode( const uint32_t index ) const { return CustomNode{ index }; }\n"
"\n"
// fetchAabb: reads record `index`. The minimum corner is three floats at the start of the record,
// the maximum corner is three floats starting at half the stride.
// Fix: the record offset used to be computed as `index * m_aabbStride` in 32-bit arithmetic, which
// wraps around for buffers of 4 GiB or more; it is now widened to size_t before multiplying.
"	HIPRT_HOST_DEVICE Aabb fetchAabb( const uint32_t index ) const\n"
"	{\n"
"		const uint32_t halfStride = ( m_aabbStride >> 1 );\n"
"		const size_t   baseOffset = static_cast<size_t>( index ) * m_aabbStride;\n"
"		const float*   boxMinPtr  = reinterpret_cast<const float*>( m_aabbs + baseOffset + 0 * halfStride );\n"
"		const float*   boxMaxPtr  = reinterpret_cast<const float*>( m_aabbs + baseOffset + 1 * halfStride );\n"
"		Aabb		   box;\n"
"		box.m_min = { boxMinPtr[0], boxMinPtr[1], boxMinPtr[2] };\n"
"		box.m_max = { boxMaxPtr[0], boxMaxPtr[1], boxMaxPtr[2] };\n"
"		return box;\n"
"	}\n"
"\n"
// fetchCenter: center of the box returned by fetchAabb( index ).
"	HIPRT_HOST_DEVICE float3 fetchCenter( const uint32_t index ) const { return fetchAabb( index ).center(); }\n"
"\n"
// getCount: number of boxes, as copied from the primitive description.
"	HIPRT_HOST_DEVICE uint32_t getCount() const { return m_aabbCount; }\n"
"\n"
"private:\n"
"	const uint8_t* m_aabbs;\n"
"	uint32_t	   m_aabbCount;\n"
"	uint32_t	   m_aabbStride;\n"
"};\n"
"} // namespace hiprt\n"
;
// Embedded source text with sizing helpers for BVH storage: upper bounds for node counts,
// primitive counts and node sizes per build input, total buffer sizes, and the batch-build test.
// The comments in this definition sit between the adjacent string literals, so they are not part
// of the embedded text.
static const char* hip_BvhCommon= \
"\n"
"#pragma once\n"
"#include <hiprt/impl/BvhConfig.h>\n"
"#include <hiprt/impl/BvhNode.h>\n"
"#include <hiprt/impl/Header.h>\n"
"#include <hiprt/impl/Instance.h>\n"
"\n"
"namespace hiprt\n"
"{\n"
// Upper bound on triangle packet nodes for `count` primitives:
// 2 * DivideRoundUp( count, MinTrianglePairsPerPacket + 1 ).
"HIPRT_INLINE HIPRT_HOST_DEVICE size_t getMaxTrianglePacketNodeCount( const size_t count )\n"
"{\n"
"	return 2 * DivideRoundUp( count, MinTrianglePairsPerPacket + 1 );\n"
"}\n"
"\n"
// Upper bound on box nodes from explicit parameters: a leaf bound derived from maxFatLeafSize plus
// an internal-node bound derived from the branching factor (the extra 1 is added unconditionally).
"HIPRT_INLINE HIPRT_HOST_DEVICE size_t\n"
"getMaxBoxNodeCount( const size_t count, const uint32_t branchingFactor, const uint32_t maxFatLeafSize )\n"
"{\n"
"	const size_t maxLeafNodes	  = DivideRoundUp( count, maxFatLeafSize + 1 );\n"
"	const size_t maxInternalNodes = 1 + DivideRoundUp( maxLeafNodes, branchingFactor - 1 );\n"
"	return maxLeafNodes + maxInternalNodes;\n"
"}\n"
"\n"
// Compile-time variant: `count` primitive nodes, unless PrimitiveNode is TrianglePacketNode, in
// which case the packet bound above is used.
"template <typename PrimitiveNode>\n"
"HIPRT_INLINE HIPRT_HOST_DEVICE size_t getMaxPrimNodeCount( const size_t count )\n"
"{\n"
"	size_t primNodeCount = count;\n"
"	if constexpr ( is_same<PrimitiveNode, TrianglePacketNode>::value ) primNodeCount = getMaxTrianglePacketNodeCount( count );\n"
"	return primNodeCount;\n"
"}\n"
"\n"
// Run-time variant: the packet bound is used for triangle meshes when rtip >= 31, otherwise `count`.
"HIPRT_INLINE HIPRT_HOST_DEVICE size_t\n"
"getMaxPrimNodeCount( const hiprtGeometryBuildInput& buildInput, const uint32_t rtip, const size_t count )\n"
"{\n"
"	size_t primNodeCount = count;\n"
"	if ( buildInput.type == hiprtPrimitiveTypeTriangleMesh && rtip >= 31 )\n"
"		primNodeCount = getMaxTrianglePacketNodeCount( count );\n"
"	return primNodeCount;\n"
"}\n"
"\n"
// Compile-time variant of the box-node bound: a single node suffices when `count` does not exceed
// BoxNode::BranchingFactor; the fat-leaf size is MaxFatLeafSize only for TrianglePacketNode.
"template <typename BoxNode, typename PrimitiveNode>\n"
"HIPRT_INLINE HIPRT_HOST_DEVICE size_t getMaxBoxNodeCount( const size_t count )\n"
"{\n"
"	const uint32_t branchingFactor = BoxNode::BranchingFactor;\n"
"	if ( count <= branchingFactor ) return 1;\n"
"\n"
"	uint32_t maxFatLeafSize = 1;\n"
"	if constexpr ( is_same<PrimitiveNode, TrianglePacketNode>::value ) maxFatLeafSize = MaxFatLeafSize;\n"
"\n"
"	return getMaxBoxNodeCount( count, branchingFactor, maxFatLeafSize );\n"
"}\n"
"\n"
// Run-time variant of the box-node bound: branching factor 8 when rtip >= 31, else 4. The fat-leaf
// size is raised to MaxFatLeafSize only for geometry inputs that are triangle meshes with rtip >= 31.
"template <typename BuildInput>\n"
"HIPRT_INLINE HIPRT_HOST_DEVICE size_t\n"
"getMaxBoxNodeCount( const BuildInput& buildInput, const uint32_t rtip, const size_t count )\n"
"{\n"
"	const uint32_t branchingFactor = rtip >= 31 ? 8 : 4;\n"
"	if ( count <= branchingFactor ) return 1;\n"
"\n"
"	uint32_t maxFatLeafSize = 1;\n"
"	if constexpr ( is_same<BuildInput, hiprtGeometryBuildInput>::value )\n"
"		if ( buildInput.type == hiprtPrimitiveTypeTriangleMesh && rtip >= 31 ) maxFatLeafSize = MaxFatLeafSize;\n"
"\n"
"	return getMaxBoxNodeCount( count, branchingFactor, maxFatLeafSize );\n"
"}\n"
"\n"
// Number of primitives in a geometry build input. For triangle meshes trianglePairCount takes
// precedence over triangleCount when it is non-zero. Any other type throws on the host; in kernel
// compilation (__KERNELCC__) there is no default case and the value-initialised 0 is returned.
"HIPRT_INLINE HIPRT_HOST_DEVICE size_t getPrimCount( const hiprtGeometryBuildInput& buildInput )\n"
"{\n"
"	size_t primCount{};\n"
"	switch ( buildInput.type )\n"
"	{\n"
"	case hiprtPrimitiveTypeTriangleMesh: {\n"
"		primCount = buildInput.primitive.triangleMesh.triangleCount;\n"
"		if ( buildInput.primitive.triangleMesh.trianglePairCount > 0 )\n"
"			primCount = buildInput.primitive.triangleMesh.trianglePairCount;\n"
"		break;\n"
"	}\n"
"	case hiprtPrimitiveTypeAABBList: {\n"
"		primCount = buildInput.primitive.aabbList.aabbCount;\n"
"		break;\n"
"	}\n"
"#if !defined( __KERNELCC__ )\n"
"	default:\n"
"		throw std::runtime_error( \"Not supported\" );\n"
"#endif\n"
"	}\n"
"	return primCount;\n"
"}\n"
"\n"
// Size in bytes of one primitive node: the caller-supplied triangleNodeSize for triangle meshes,
// sizeof( CustomNode ) for AABB lists. Unsupported types behave as in getPrimCount.
"HIPRT_INLINE HIPRT_HOST_DEVICE size_t\n"
"getPrimNodeSize( const hiprtGeometryBuildInput& buildInput, const size_t triangleNodeSize )\n"
"{\n"
"	size_t nodeSize{};\n"
"	switch ( buildInput.type )\n"
"	{\n"
"	case hiprtPrimitiveTypeTriangleMesh: {\n"
"		nodeSize = triangleNodeSize;\n"
"		break;\n"
"	}\n"
"	case hiprtPrimitiveTypeAABBList: {\n"
"		nodeSize = sizeof( CustomNode );\n"
"		break;\n"
"	}\n"
"#if !defined( __KERNELCC__ )\n"
"	default:\n"
"		throw std::runtime_error( \"Not supported\" );\n"
"#endif\n"
"	}\n"
"	return nodeSize;\n"
"}\n"
"\n"
// Geometry buffer size: GeomHeader, primitive nodes and box nodes, each section rounded up to
// DefaultAlignment separately.
"HIPRT_INLINE HIPRT_HOST_DEVICE size_t getGeometryStorageBufferSize(\n"
"	const size_t primNodeCount, const size_t boxNodeCount, const size_t primNodeSize, const size_t boxNodeSize )\n"
"{\n"
"	return RoundUp( sizeof( GeomHeader ), DefaultAlignment ) + RoundUp( primNodeCount * primNodeSize, DefaultAlignment ) +\n"
"		   RoundUp( boxNodeCount * boxNodeSize, DefaultAlignment );\n"
"}\n"
"\n"
// Scene buffer size: SceneHeader, box nodes, primitive nodes, one Instance per primitive and one
// Frame per frame, each section rounded up to DefaultAlignment separately.
"HIPRT_INLINE HIPRT_HOST_DEVICE size_t getSceneStorageBufferSize(\n"
"	const size_t primCount,\n"
"	const size_t primNodeCount,\n"
"	const size_t boxNodeCount,\n"
"	const size_t primNodeSize,\n"
"	const size_t boxNodeSize,\n"
"	const size_t frameCount )\n"
"{\n"
"	return RoundUp( sizeof( SceneHeader ), DefaultAlignment ) + RoundUp( boxNodeCount * boxNodeSize, DefaultAlignment ) +\n"
"		   RoundUp( primNodeCount * primNodeSize, DefaultAlignment ) +\n"
"		   RoundUp( primCount * sizeof( Instance ), DefaultAlignment ) +\n"
"		   RoundUp( frameCount * sizeof( Frame ), DefaultAlignment );\n"
"}\n"
"\n"
// Batch-build test for geometries: true when the primitive count is within
// buildOptions.batchBuildMaxPrimCount and the low three bits of buildFlags (mask 7) do not equal
// hiprtBuildFlagBitCustomBvhImport.
"HIPRT_INLINE HIPRT_HOST_DEVICE bool\n"
"batchBuild( const hiprtGeometryBuildInput& buildInput, const hiprtBuildOptions buildOptions )\n"
"{\n"
"	return getPrimCount( buildInput ) <= buildOptions.batchBuildMaxPrimCount &&\n"
"		   ( buildOptions.buildFlags & 7 ) != hiprtBuildFlagBitCustomBvhImport;\n"
"}\n"
"\n"
// Batch-build test for scenes: same rule, using instanceCount as the primitive count.
"HIPRT_INLINE HIPRT_HOST_DEVICE bool batchBuild( const hiprtSceneBuildInput& buildInput, const hiprtBuildOptions buildOptions )\n"
"{\n"
"	return buildInput.instanceCount <= buildOptions.batchBuildMaxPrimCount &&\n"
"		   ( buildOptions.buildFlags & 7 ) != hiprtBuildFlagBitCustomBvhImport;\n"
"}\n"
"} // namespace hiprt\n"
;
static const char* hip_BvhNode= \
"\n"
"#pragma once\n"
"#include <hiprt/hiprt_types.h>\n"
"#include <hiprt/impl/Transform.h>\n"
"#include <hiprt/impl/Triangle.h>\n"
"\n"
"namespace hiprt\n"
"{\n"
// Node type codes stored in the low bits of a node index (see getNodeType in this text).
// Only four codes are named here; triPairIndexToType in this text yields 0..3 and 8..11 for
// triangle pair indices 0..7.
"enum\n"
"{\n"
"	TriangleType = 0,\n"
"	BoxType		 = 5,\n"
"	InstanceType = 6,\n"
"	CustomType	 = 7\n"
"};\n"
"\n"
// Constants used by the node encodings in this text.
// RootIndex equals BoxType. The packet limits (MaxVerticesPerTrianglePacket,
// MaxTrianglePairsPerTrianglePacket) and bit sizes (TrianglePairDescriptorSize,
// TriangleStructHeaderSize) are consumed by TrianglePacketData further down.
"static constexpr uint32_t RootIndex							= BoxType;\n"
"static constexpr uint32_t DefaultTriangleFlags				= ( 2 << 2 ) | ( 1 << 0 );\n"
"static constexpr uint32_t TrianglePairDescriptorSize		= 29;\n"
"static constexpr uint32_t TriangleStructHeaderSize			= 52;\n"
"static constexpr uint32_t MaxVerticesPerTrianglePacket		= 16;\n"
"static constexpr uint32_t MaxTrianglePairsPerTrianglePacket = 8;\n"
"static constexpr uint32_t MinTrianglePairsPerPacket			= 2u;\n"
"\n"
// Note that FatLeafBit and RangeEndBit are the same bit (31); RangeStartBit is bit 30.
"static constexpr uint32_t FatLeafBit	= 1u << 31u;\n"
"static constexpr uint32_t RangeEndBit	= 1u << 31u;\n"
"static constexpr uint32_t RangeStartBit = 1u << 30u;\n"
"\n"
// Presumably the intersection (Ci) and traversal (Ct) cost weights of a build heuristic; their
// use is not visible in this excerpt, confirm against callers.
"static constexpr float Ci = 1.0f;\n"
"static constexpr float Ct = 1.0f;\n"
"\n"
// Forward declarations only; the definitions are not part of this text.
"struct GeomHeader;\n"
"struct SceneHeader;\n"
"\n"
// getNodeType: extracts the type code from a node index.
// Without __KERNELCC__ the function only throws std::runtime_error.
// With __KERNELCC__ the type is the low 4 bits when Rtip >= 31 and the low 3 bits otherwise.
"HIPRT_DEVICE HIPRT_INLINE static uint32_t getNodeType( uint32_t nodeIndex )\n"
"{\n"
"#ifndef __KERNELCC__\n"
"	throw std::runtime_error( \"Function \'getNodeType()\' is not supposed to run on the host.\" );\n"
"#else\n"
"	if constexpr ( Rtip >= 31 )\n"
"		return nodeIndex & 15;\n"
"	else\n"
"		return nodeIndex & 7;\n"
"#endif\n"
"}\n"
"\n"
// getNodeAddr: extracts the address part of a node index.
// Without __KERNELCC__ the function only throws std::runtime_error.
// With __KERNELCC__ the FatLeafBit is cleared first. For Rtip >= 31 the index is shifted right by 4;
// otherwise by 4 for box and instance nodes and by 3 for every other type.
"HIPRT_DEVICE HIPRT_INLINE static uint32_t getNodeAddr( uint32_t nodeIndex )\n"
"{\n"
"#ifndef __KERNELCC__\n"
"	throw std::runtime_error( \"Function \'getNodeAddr()\' is not supposed to run on the host.\" );\n"
"#else\n"
"	nodeIndex &= ~FatLeafBit;\n"
"	if constexpr ( Rtip >= 31 )\n"
"	{\n"
"		return nodeIndex >> 4;\n"
"	}\n"
"	else\n"
"	{\n"
"		const uint32_t nodeType = getNodeType( nodeIndex );\n"
"		return nodeIndex >> ( nodeType == BoxType || nodeType == InstanceType ? 4 : 3 );\n"
"	}\n"
"#endif\n"
"}\n"
"\n"
// encodeNodeIndex: packs an address and a type code into a node index, mirroring getNodeAddr.
// Without __KERNELCC__ the function only throws std::runtime_error.
// For Rtip >= 31 the address is shifted left by 4. Otherwise it is shifted left by 3, after an extra
// doubling for box and instance nodes (4 bits in total for those types).
"HIPRT_DEVICE HIPRT_INLINE static uint32_t encodeNodeIndex( uint32_t nodeAddr, uint32_t nodeType )\n"
"{\n"
"#ifndef __KERNELCC__\n"
"	throw std::runtime_error( \"Function \'encodeNodeIndex()\' is not supposed to run on the host.\" );\n"
"#else\n"
"	if constexpr ( Rtip >= 31 )\n"
"	{\n"
"		return ( nodeAddr << 4 ) | nodeType;\n"
"	}\n"
"	else\n"
"	{\n"
"		if ( nodeType == BoxType || nodeType == InstanceType ) nodeAddr <<= 1;\n"
"		return ( nodeAddr << 3 ) | nodeType;\n"
"	}\n"
"#endif\n"
"}\n"
"\n"
// triPairIndexToType: keeps the low two bits and moves bit 2 up to bit 3, so pair indices 0..7
// map to type codes 0, 1, 2, 3, 8, 9, 10, 11.
"HIPRT_HOST_DEVICE HIPRT_INLINE static uint32_t triPairIndexToType( uint32_t triPairIndex )\n"
"{\n"
"	return ( triPairIndex & 3 ) + ( ( triPairIndex & 4 ) << 1 );\n"
"}\n"
"\n"
// typeToTriPairIndex: inverse of triPairIndexToType; keeps the low two bits and moves bit 3 down
// to bit 2. Bit 2 of the input type is ignored.
"HIPRT_HOST_DEVICE HIPRT_INLINE static uint32_t typeToTriPairIndex( uint32_t nodeType )\n"
"{\n"
"	return ( nodeType & 3 ) + ( ( nodeType & 8 ) >> 1 );\n"
"}\n"
"\n"
// encodeBaseAddr: converts a pointer to a base index by dropping its low three bits (address / 8)
// and adds `nodeIndex` (default 0) to the result.
"HIPRT_HOST_DEVICE HIPRT_INLINE static uint64_t encodeBaseAddr( const void* baseAddr, uint32_t nodeIndex = 0 )\n"
"{\n"
"	uint64_t baseIndex = reinterpret_cast<uint64_t>( baseAddr ) >> 3ull;\n"
"	return baseIndex + nodeIndex;\n"
"}\n"
"\n"
// isFatLeafNode: true when FatLeafBit (bit 31) is set in the node index.
"HIPRT_HOST_DEVICE HIPRT_INLINE static bool isFatLeafNode( uint32_t nodeIndex ) { return nodeIndex & FatLeafBit; }\n"
"\n"
// isLeafNode: true when the index is not InvalidValue and its type is anything other than BoxType.
// NOTE(review): declared HIPRT_HOST_DEVICE, but it calls getNodeType, which throws when compiled
// without __KERNELCC__.
"HIPRT_HOST_DEVICE HIPRT_INLINE static bool isLeafNode( uint32_t nodeIndex )\n"
"{\n"
"	return getNodeType( nodeIndex ) != BoxType && nodeIndex != InvalidValue;\n"
"}\n"
"\n"
// isInternalNode: true when the node type is BoxType.
// NOTE(review): like isLeafNode, this relies on getNodeType, which throws without __KERNELCC__.
"HIPRT_HOST_DEVICE HIPRT_INLINE static bool isInternalNode( uint32_t nodeIndex ) { return getNodeType( nodeIndex ) == BoxType; }\n"
"\n"
// ulp: absolute difference between x and the float whose bit pattern differs from x only in the
// lowest bit (the bit is flipped with XOR 1).
"HIPRT_HOST_DEVICE HIPRT_INLINE float ulp( float x ) { return fabs( x - as_float( as_uint( x ) ^ 1 ) ); }\n"
"\n"
// subDown: computes a - b and then lowers each component by ulp() of that component, giving a
// result biased downwards (used for conservative lower bounds in initChildInfo).
"HIPRT_HOST_DEVICE HIPRT_INLINE float3 subDown( float3 a, float3 b )\n"
"{\n"
"	const float3 d = ( a - b );\n"
"	return d - float3{ ulp( d.x ), ulp( d.y ), ulp( d.z ) };\n"
"}\n"
"\n"
// subUp: computes a - b and then raises each component by ulp() of that component, giving a
// result biased upwards (used for conservative upper bounds and extents in initChildInfo).
"HIPRT_HOST_DEVICE HIPRT_INLINE float3 subUp( float3 a, float3 b )\n"
"{\n"
"	const float3 d = ( a - b );\n"
"	return d + float3{ ulp( d.x ), ulp( d.y ), ulp( d.z ) };\n"
"}\n"
"\n"
// Box4Node: internal node with up to four children, stored as four child indices followed by four
// full-precision boxes, a parent address, an update counter and a child count (128 bytes, asserted).
// Children are addressed as ( &m_childIndex0 )[i] and ( &m_box0 )[i], which relies on the four
// members of each kind being laid out contiguously in declaration order.
"struct alignas( DefaultAlignment ) Box4Node\n"
"{\n"
"	static constexpr uint32_t BranchingFactor = 4;\n"
"\n"
// initBox: writes slot i. Slots below childCount receive the given index and box; the remaining
// slots are reset to InvalidValue and a default-constructed Aabb. nodeBox, childRanges and
// matrixId are accepted but unused here.
"	HIPRT_HOST_DEVICE void initBox(\n"
"		const uint32_t					i,\n"
"		const uint32_t					childCount,\n"
"		const uint32_t					childIndex,\n"
"		const Aabb						childBox,\n"
"		[[maybe_unused]] const Aabb		nodeBox,\n"
"		[[maybe_unused]] const uint32_t childRanges = InvalidValue,\n"
"		[[maybe_unused]] const uint32_t matrixId	= NoRotationIndex )\n"
"	{\n"
"		if ( i < childCount )\n"
"		{\n"
"			( &m_childIndex0 )[i] = childIndex;\n"
"			( &m_box0 )[i]		  = childBox;\n"
"		}\n"
"		else\n"
"		{\n"
"			( &m_childIndex0 )[i] = InvalidValue;\n"
"			( &m_box0 )[i]		  = Aabb();\n"
"		}\n"
"	}\n"
"\n"
// initBoxes: writes all four slots from arrays. Slot 0 is always taken from the arrays; slots 1..3
// are taken only while i < getChildCount(), so m_childCount must already be set when this runs.
"	HIPRT_HOST_DEVICE void initBoxes(\n"
"		const uint32_t*					childIndices,\n"
"		const Aabb*						childBoxes,\n"
"		[[maybe_unused]] const uint32_t childRanges = InvalidValue,\n"
"		[[maybe_unused]] const uint32_t matrixId	= NoRotationIndex )\n"
"	{\n"
"		// at least one child is valid\n"
"		m_childIndex0 = childIndices[0];\n"
"		m_box0		  = childBoxes[0];\n"
"\n"
"		for ( uint32_t i = 1; i < BranchingFactor; ++i )\n"
"		{\n"
"			if ( i < getChildCount() )\n"
"			{\n"
"				( &m_childIndex0 )[i] = childIndices[i];\n"
"				( &m_box0 )[i]		  = childBoxes[i];\n"
"			}\n"
"			else\n"
"			{\n"
"				( &m_childIndex0 )[i] = InvalidValue;\n"
"				( &m_box0 )[i]		  = Aabb();\n"
"			}\n"
"		}\n"
"	}\n"
"\n"
// init (single slot): sets parent address and child count, zeroes the update counter, then fills
// slot i through initBox. boxNodeBase and primNodeBase are unused by this node type.
"	HIPRT_HOST_DEVICE void init(\n"
"		const uint32_t					i,\n"
"		const uint32_t					parentAddr,\n"
"		const uint32_t					childCount,\n"
"		[[maybe_unused]] const uint32_t boxNodeBase,\n"
"		[[maybe_unused]] const uint32_t primNodeBase,\n"
"		const uint32_t					childIndex,\n"
"		const Aabb						childBox,\n"
"		[[maybe_unused]] const Aabb		nodeBox,\n"
"		const uint32_t					childRange = InvalidValue,\n"
"		const uint32_t					matrixId   = NoRotationIndex )\n"
"	{\n"
"		m_parentAddr	= parentAddr;\n"
"		m_updateCounter = 0;\n"
"		m_childCount	= childCount;\n"
"		initBox( i, childCount, childIndex, childBox, nodeBox, childRange, matrixId );\n"
"	}\n"
"\n"
// init (all slots): same header setup, then fills every slot through initBoxes.
"	HIPRT_HOST_DEVICE void init(\n"
"		const uint32_t					parentAddr,\n"
"		const uint32_t					childCount,\n"
"		[[maybe_unused]] const uint32_t boxNodeBase,\n"
"		[[maybe_unused]] const uint32_t primNodeBase,\n"
"		const uint32_t*					childIndices,\n"
"		const Aabb*						childBoxes,\n"
"		const uint32_t					childRanges = InvalidValue,\n"
"		const uint32_t					matrixId	= NoRotationIndex )\n"
"	{\n"
"		m_parentAddr	= parentAddr;\n"
"		m_updateCounter = 0;\n"
"		m_childCount	= childCount;\n"
"		initBoxes( childIndices, childBoxes, childRanges, matrixId );\n"
"	}\n"
"\n"
// The two base setters are no-ops for this node type; Box8Node has setters with the same names
// that store the value.
"	HIPRT_HOST_DEVICE void setBoxNodeBase( [[maybe_unused]] const uint32_t boxNodeBase ) {}\n"
"\n"
"	HIPRT_HOST_DEVICE void setPrimNodeBase( [[maybe_unused]] const uint32_t primNodeBase ) {}\n"
"\n"
// aabb: union of the boxes of all children whose index is not InvalidValue.
"	HIPRT_HOST_DEVICE Aabb aabb() const\n"
"	{\n"
"		Aabb aabb;\n"
"		if ( m_childIndex0 != InvalidValue ) aabb.grow( m_box0 );\n"
"		if ( m_childIndex1 != InvalidValue ) aabb.grow( m_box1 );\n"
"		if ( m_childIndex2 != InvalidValue ) aabb.grow( m_box2 );\n"
"		if ( m_childIndex3 != InvalidValue ) aabb.grow( m_box3 );\n"
"		return aabb;\n"
"	}\n"
"\n"
"	HIPRT_HOST_DEVICE float area() const { return aabb().area(); }\n"
"\n"
"	HIPRT_HOST_DEVICE uint32_t getParentAddr() const { return m_parentAddr; }\n"
"\n"
"	HIPRT_HOST_DEVICE uint32_t getChildCount() const { return m_childCount; }\n"
"\n"
// Per-child accessors. None of them checks i against BranchingFactor.
"	HIPRT_HOST_DEVICE uint32_t getChildIndex( uint32_t i ) const { return ( &m_childIndex0 )[i]; }\n"
"\n"
"	HIPRT_HOST_DEVICE uint32_t getChildType( uint32_t i ) const { return getNodeType( getChildIndex( i ) ); }\n"
"\n"
"	HIPRT_HOST_DEVICE uint32_t getChildAddr( uint32_t i ) const { return getNodeAddr( getChildIndex( i ) ); }\n"
"\n"
// getChildRange: always 1 for this node type, whatever i is.
"	HIPRT_HOST_DEVICE uint32_t getChildRange( uint32_t i ) const { return 1; }\n"
"\n"
"	HIPRT_HOST_DEVICE Aabb getChildBox( uint32_t i ) const { return ( &m_box0 )[i]; }\n"
"\n"
// patchChild: replaces only the stored child index; childRange is ignored and the box is untouched.
"	HIPRT_HOST_DEVICE void patchChild( uint32_t i, uint32_t childIndex, [[maybe_unused]] uint32_t childRange )\n"
"	{\n"
"		( &m_childIndex0 )[i] = childIndex;\n"
"	}\n"
"\n"
// Data members. m_updateCounter and m_childCount have no default initialiser; both init overloads
// assign them.
"	uint32_t m_childIndex0 = InvalidValue;\n"
"	uint32_t m_childIndex1 = InvalidValue;\n"
"	uint32_t m_childIndex2 = InvalidValue;\n"
"	uint32_t m_childIndex3 = InvalidValue;\n"
"	Aabb	 m_box0;\n"
"	Aabb	 m_box1;\n"
"	Aabb	 m_box2;\n"
"	Aabb	 m_box3;\n"
"	uint32_t m_parentAddr = InvalidValue;\n"
"	uint32_t m_updateCounter;\n"
"	uint32_t m_childCount;\n"
"};\n"
"HIPRT_STATIC_ASSERT( sizeof( Box4Node ) == 128 );\n"
"\n"
// ChildInfo: per-child record of a Box8Node, packed into three 32-bit words (12 bytes, asserted).
// The six bounds are 12-bit quantised values; see initChildInfo for how they are produced.
"struct ChildInfo\n"
"{\n"
// Word 0: min X, min Y, 8 unnamed padding bits.
"	uint32_t m_minX : 12;\n"
"	uint32_t m_minY : 12;\n"
"	uint32_t : 8;\n"
"\n"
// Word 1: min Z, max X, instance mask.
"	uint32_t m_minZ : 12;\n"
"	uint32_t m_maxX : 12;\n"
"	uint32_t m_instanceMask : 8;\n"
"\n"
// Word 2: max Y, max Z, 4-bit node type, 4-bit node range.
"	uint32_t m_maxY : 12;\n"
"	uint32_t m_maxZ : 12;\n"
"	uint32_t m_nodeType : 4;\n"
"	uint32_t m_nodeRange : 4;\n"
"};\n"
"HIPRT_STATIC_ASSERT( sizeof( ChildInfo ) == 12 );\n"
"\n"
// initChildInfo: quantises one child box relative to nodeBox and stores it in node.m_childInfos[i].
// It also rewrites the node-wide fields m_origin and m_xExponent/m_yExponent/m_zExponent on every
// call, so all children of a node should be initialised with the same nodeBox.
"template <typename Node>\n"
"HIPRT_HOST_DEVICE void initChildInfo(\n"
"	const uint32_t i,\n"
"	const uint32_t childCount,\n"
"	const uint32_t childIndex,\n"
"	const Aabb	   childBox,\n"
"	const Aabb	   nodeBox,\n"
"	const uint32_t childRange,\n"
"	Node&		   node )\n"
"{\n"
// Extent is rounded upwards so that the quantisation grid covers the whole node box.
"	const float3 extent = subUp( nodeBox.m_max, nodeBox.m_min );\n"
"	const float3 origin = nodeBox.m_min;\n"
"\n"
// Per-axis biased float exponent of the extent: adding 0x7fffff before the shift rounds it up
// whenever any mantissa bit is set. Non-zero exponents are raised to at least 13.
// NOTE(review): this reads the raw bits, so it assumes a non-negative extent (sign bit clear).
"	uint3 exponent;\n"
"	exponent.x = ( as_uint( extent.x ) + 0x7fffff ) >> 23;\n"
"	exponent.y = ( as_uint( extent.y ) + 0x7fffff ) >> 23;\n"
"	exponent.z = ( as_uint( extent.z ) + 0x7fffff ) >> 23;\n"
"	exponent.x = exponent.x == 0 ? 0 : max( 13u, exponent.x );\n"
"	exponent.y = exponent.y == 0 ? 0 : max( 13u, exponent.y );\n"
"	exponent.z = exponent.z == 0 ? 0 : max( 13u, exponent.z );\n"
"\n"
"	node.m_origin	 = origin;\n"
"	node.m_xExponent = exponent.x;\n"
"	node.m_yExponent = exponent.y;\n"
"	node.m_zExponent = exponent.z;\n"
"\n"
// Scale factor built directly as float bits with biased exponent ( 254 - exponent + 12 ), i.e.
// 2^( 139 - exponent ); multiplying an offset by it maps the node extent onto the 12-bit grid.
"	float3 rcp_exponent;\n"
"	rcp_exponent.x = as_float( ( 254 - exponent.x + 12 ) << 23 );\n"
"	rcp_exponent.y = as_float( ( 254 - exponent.y + 12 ) << 23 );\n"
"	rcp_exponent.z = as_float( ( 254 - exponent.z + 12 ) << 23 );\n"
"\n"
"	ChildInfo childInfo{};\n"
"	if ( i < childCount && childBox.valid() )\n"
"	{\n"
// Lower bounds: rounded-down offset, floored, clamped to [0, 4095].
"		float3 qmin = subDown( childBox.m_min, origin ) * rcp_exponent;\n"
"		qmin.x		= floor( qmin.x );\n"
"		qmin.y		= floor( qmin.y );\n"
"		qmin.z		= floor( qmin.z );\n"
"		qmin		= clamp( qmin, 0.0f, 4095.0f );\n"
"\n"
// Upper bounds: rounded-up offset, ceiled, clamped to [1, 4096] and stored minus one
// (Box8Node::getChildBox adds the one back when decoding).
"		float3 qmax = subUp( childBox.m_max, origin ) * rcp_exponent;\n"
"		qmax.x		= ceil( qmax.x );\n"
"		qmax.y		= ceil( qmax.y );\n"
"		qmax.z		= ceil( qmax.z );\n"
"		qmax		= clamp( qmax, 1.0f, 4096.0f ) - make_float3( 1.0f );\n"
"\n"
"		childInfo.m_minX = static_cast<uint32_t>( qmin.x );\n"
"		childInfo.m_minY = static_cast<uint32_t>( qmin.y );\n"
"		childInfo.m_minZ = static_cast<uint32_t>( qmin.z );\n"
"		childInfo.m_maxX = static_cast<uint32_t>( qmax.x );\n"
"		childInfo.m_maxY = static_cast<uint32_t>( qmax.y );\n"
"		childInfo.m_maxZ = static_cast<uint32_t>( qmax.z );\n"
"\n"
// The range defaults to 1 when childRange is InvalidValue; otherwise its low 4 bits are stored.
"		childInfo.m_nodeType	 = getNodeType( childIndex );\n"
"		childInfo.m_nodeRange	 = childRange == InvalidValue ? 1 : childRange & 15;\n"
"		childInfo.m_instanceMask = 0xff;\n"
"	}\n"
"	else\n"
"	{\n"
// Unused or invalid slot: all six bounds are 0xfff; type, range and mask stay zero from the
// value-initialisation above.
"		childInfo.m_minX = 0xfff;\n"
"		childInfo.m_minY = 0xfff;\n"
"		childInfo.m_minZ = 0xfff;\n"
"		childInfo.m_maxX = 0xfff;\n"
"		childInfo.m_maxY = 0xfff;\n"
"		childInfo.m_maxZ = 0xfff;\n"
"	}\n"
"\n"
"	node.m_childInfos[i] = childInfo;\n"
"}\n"
"\n"
// initChildInfos: array version of initChildInfo. It derives the node box itself as the union of
// all Node::BranchingFactor entries of childBoxes, stores origin and exponents in the node, and
// then quantises every slot. Both arrays are read for all BranchingFactor slots, so they must hold
// that many entries. The child count is taken from node.getChildCount().
"template <typename Node>\n"
"HIPRT_HOST_DEVICE void\n"
"initChildInfos( const uint32_t* childIndices, const Aabb* childBoxes, const uint32_t childRanges, Node& node )\n"
"{\n"
"	Aabb nodeBox;\n"
"	for ( uint32_t i = 0; i < Node::BranchingFactor; ++i )\n"
"		nodeBox.grow( childBoxes[i] );\n"
"\n"
"	const float3 extent = subUp( nodeBox.m_max, nodeBox.m_min );\n"
"	const float3 origin = nodeBox.m_min;\n"
"\n"
// Same exponent and scale computation as in initChildInfo: rounded-up biased exponent of the
// extent (at least 13 when non-zero) and the matching scale 2^( 139 - exponent ).
"	uint3 exponent;\n"
"	exponent.x = ( as_uint( extent.x ) + 0x7fffff ) >> 23;\n"
"	exponent.y = ( as_uint( extent.y ) + 0x7fffff ) >> 23;\n"
"	exponent.z = ( as_uint( extent.z ) + 0x7fffff ) >> 23;\n"
"	exponent.x = exponent.x == 0 ? 0 : max( 13u, exponent.x );\n"
"	exponent.y = exponent.y == 0 ? 0 : max( 13u, exponent.y );\n"
"	exponent.z = exponent.z == 0 ? 0 : max( 13u, exponent.z );\n"
"\n"
"	node.m_origin	 = origin;\n"
"	node.m_xExponent = exponent.x;\n"
"	node.m_yExponent = exponent.y;\n"
"	node.m_zExponent = exponent.z;\n"
"\n"
"	float3 rcp_exponent;\n"
"	rcp_exponent.x = as_float( ( 254 - exponent.x + 12 ) << 23 );\n"
"	rcp_exponent.y = as_float( ( 254 - exponent.y + 12 ) << 23 );\n"
"	rcp_exponent.z = as_float( ( 254 - exponent.z + 12 ) << 23 );\n"
"\n"
"	for ( uint32_t i = 0; i < Node::BranchingFactor; ++i )\n"
"	{\n"
"		const Aabb& childBox = childBoxes[i];\n"
"\n"
"		ChildInfo childInfo{};\n"
"		if ( i < node.getChildCount() && childBox.valid() )\n"
"		{\n"
"			float3 qmin = subDown( childBox.m_min, origin ) * rcp_exponent;\n"
"			qmin.x		= floor( qmin.x );\n"
"			qmin.y		= floor( qmin.y );\n"
"			qmin.z		= floor( qmin.z );\n"
"			qmin		= clamp( qmin, 0.0f, 4095.0f );\n"
"\n"
"			float3 qmax = subUp( childBox.m_max, origin ) * rcp_exponent;\n"
"			qmax.x		= ceil( qmax.x );\n"
"			qmax.y		= ceil( qmax.y );\n"
"			qmax.z		= ceil( qmax.z );\n"
"			qmax		= clamp( qmax, 1.0f, 4096.0f ) - make_float3( 1.0f );\n"
"\n"
"			childInfo.m_minX = static_cast<uint32_t>( qmin.x );\n"
"			childInfo.m_minY = static_cast<uint32_t>( qmin.y );\n"
"			childInfo.m_minZ = static_cast<uint32_t>( qmin.z );\n"
"			childInfo.m_maxX = static_cast<uint32_t>( qmax.x );\n"
"			childInfo.m_maxY = static_cast<uint32_t>( qmax.y );\n"
"			childInfo.m_maxZ = static_cast<uint32_t>( qmax.z );\n"
"\n"
// childRanges packs one 4-bit range per child: child i uses bits [4*i, 4*i + 3].
// InvalidValue means every child has range 1.
"			childInfo.m_nodeType	 = getNodeType( childIndices[i] );\n"
"			childInfo.m_nodeRange	 = childRanges == InvalidValue ? 1 : ( childRanges >> ( 4 * i ) ) & 15;\n"
"			childInfo.m_instanceMask = 0xff;\n"
"		}\n"
"		else\n"
"		{\n"
"			childInfo.m_minX = 0xfff;\n"
"			childInfo.m_minY = 0xfff;\n"
"			childInfo.m_minZ = 0xfff;\n"
"			childInfo.m_maxX = 0xfff;\n"
"			childInfo.m_maxY = 0xfff;\n"
"			childInfo.m_maxZ = 0xfff;\n"
"		}\n"
"\n"
"		node.m_childInfos[i] = childInfo;\n"
"	}\n"
"}\n"
"\n"
// Box8Node: internal node with up to eight children stored as quantised ChildInfo records relative
// to a shared origin and per-axis exponents (128 bytes, asserted). Child addresses are not stored
// per child; they are reconstructed from two base addresses and the per-child ranges.
"struct alignas( DefaultAlignment ) Box8Node\n"
"{\n"
"	static constexpr uint32_t BranchingFactor = 8;\n"
"\n"
// initBox: stores matrixId and quantises one child through initChildInfo.
"	HIPRT_HOST_DEVICE void initBox(\n"
"		const uint32_t i,\n"
"		const uint32_t childCount,\n"
"		const uint32_t childIndex,\n"
"		const Aabb	   childBox,\n"
"		const Aabb	   nodeBox,\n"
"		const uint32_t childRange = InvalidValue,\n"
"		const uint32_t matrixId	  = NoRotationIndex )\n"
"	{\n"
"		m_matrixId = matrixId;\n"
"		initChildInfo( i, childCount, childIndex, childBox, nodeBox, childRange, *this );\n"
"	}\n"
"\n"
// initBoxes: stores matrixId and quantises all children through initChildInfos, which reads the
// child count from this node, so m_childCountMinusOne must be set beforehand.
"	HIPRT_HOST_DEVICE void initBoxes(\n"
"		const uint32_t* childIndices,\n"
"		const Aabb*		childBoxes,\n"
"		const uint32_t	childRanges = InvalidValue,\n"
"		const uint32_t	matrixId	= NoRotationIndex )\n"
"	{\n"
"		m_matrixId = matrixId;\n"
"		initChildInfos( childIndices, childBoxes, childRanges, *this );\n"
"	}\n"
"\n"
// init (single slot): stores both base addresses shifted left by 4, the parent address and
// childCount - 1, then fills slot i.
"	HIPRT_HOST_DEVICE void init(\n"
"		const uint32_t i,\n"
"		const uint32_t parentAddr,\n"
"		const uint32_t childCount,\n"
"		const uint32_t boxNodeBase,\n"
"		const uint32_t primNodeBase,\n"
"		const uint32_t childIndex,\n"
"		const Aabb	   childBox,\n"
"		const Aabb	   nodeBox,\n"
"		const uint32_t childRange = InvalidValue,\n"
"		const uint32_t matrixId	  = NoRotationIndex )\n"
"	{\n"
"		m_boxNodeBase		 = boxNodeBase << 4;\n"
"		m_primNodeBase		 = primNodeBase << 4;\n"
"		m_parentAddr		 = parentAddr;\n"
"		m_childCountMinusOne = childCount - 1;\n"
"\n"
"		initBox( i, childCount, childIndex, childBox, nodeBox, childRange, matrixId );\n"
"	}\n"
"\n"
// init (all slots): same header setup, then fills every slot from the arrays.
"	HIPRT_HOST_DEVICE void init(\n"
"		const uint32_t	parentAddr,\n"
"		const uint32_t	childCount,\n"
"		const uint32_t	boxNodeBase,\n"
"		const uint32_t	primNodeBase,\n"
"		const uint32_t* childIndices,\n"
"		const Aabb*		childBoxes,\n"
"		const uint32_t	childRanges = InvalidValue,\n"
"		const uint32_t	matrixId	= NoRotationIndex )\n"
"	{\n"
"		m_boxNodeBase		 = boxNodeBase << 4;\n"
"		m_primNodeBase		 = primNodeBase << 4;\n"
"		m_parentAddr		 = parentAddr;\n"
"		m_childCountMinusOne = childCount - 1;\n"
"\n"
"		initBoxes( childIndices, childBoxes, childRanges, matrixId );\n"
"	}\n"
"\n"
// Base setters: the value is stored shifted left by 4 (getChildAddr shifts it back).
"	HIPRT_HOST_DEVICE void setBoxNodeBase( const uint32_t boxNodeBase ) { m_boxNodeBase = boxNodeBase << 4; }\n"
"\n"
"	HIPRT_HOST_DEVICE void setPrimNodeBase( const uint32_t primNodeBase ) { m_primNodeBase = primNodeBase << 4; }\n"
"\n"
// aabb: union of the decoded boxes of the first getChildCount() children.
"	HIPRT_HOST_DEVICE Aabb aabb() const\n"
"	{\n"
"		Aabb box;\n"
"		for ( uint32_t i = 0; i < getChildCount(); ++i )\n"
"			box.grow( getChildBox( i ) );\n"
"		return box;\n"
"	}\n"
"\n"
"	HIPRT_HOST_DEVICE float area() const { return aabb().area(); }\n"
"\n"
"	HIPRT_HOST_DEVICE uint32_t getParentAddr() const { return m_parentAddr; }\n"
"\n"
// The count is stored minus one in a 4-bit field.
"	HIPRT_HOST_DEVICE uint32_t getChildCount() const { return m_childCountMinusOne + 1; }\n"
"\n"
// getChildIndex: InvalidValue for slots at or beyond the child count; otherwise the index is
// re-encoded from the reconstructed address and the stored type.
"	HIPRT_HOST_DEVICE uint32_t getChildIndex( uint32_t i ) const\n"
"	{\n"
"		if ( i >= getChildCount() ) return InvalidValue;\n"
"		return encodeNodeIndex( getChildAddr( i ), getChildType( i ) );\n"
"	}\n"
"\n"
"	HIPRT_HOST_DEVICE uint32_t getChildType( uint32_t i ) const { return m_childInfos[i].m_nodeType; }\n"
"\n"
// getChildAddr: starts from the box base for box children and from the primitive base for all
// others, then adds the ranges of every preceding child of the same category (box vs. non-box).
"	HIPRT_HOST_DEVICE uint32_t getChildAddr( uint32_t i ) const\n"
"	{\n"
"		uint32_t childType = getChildType( i );\n"
"		uint32_t childAddr = childType == BoxType ? m_boxNodeBase >> 4 : m_primNodeBase >> 4;\n"
"		for ( uint32_t j = 0; j < i; ++j )\n"
"			if ( ( getChildType( j ) == BoxType ) == ( childType == BoxType ) ) childAddr += m_childInfos[j].m_nodeRange;\n"
"		return childAddr;\n"
"	}\n"
"\n"
// getChildBox: decodes the quantised bounds. The scale is rebuilt as in initChildInfo and the
// stored values are divided by it. Max bounds were stored minus one, so one is added back; on an
// axis whose exponent is 0 the max is set to the origin.
"	HIPRT_HOST_DEVICE Aabb getChildBox( uint32_t i ) const\n"
"	{\n"
"		if ( i >= getChildCount() ) return Aabb();\n"
"\n"
"		float3 rcp_exponent;\n"
"		rcp_exponent.x = as_float( ( 254 - m_xExponent + 12 ) << 23 );\n"
"		rcp_exponent.y = as_float( ( 254 - m_yExponent + 12 ) << 23 );\n"
"		rcp_exponent.z = as_float( ( 254 - m_zExponent + 12 ) << 23 );\n"
"\n"
"		Aabb childBox;\n"
"		childBox.m_min.x = m_origin.x + m_childInfos[i].m_minX / rcp_exponent.x;\n"
"		childBox.m_min.y = m_origin.y + m_childInfos[i].m_minY / rcp_exponent.y;\n"
"		childBox.m_min.z = m_origin.z + m_childInfos[i].m_minZ / rcp_exponent.z;\n"
"		childBox.m_max.x = m_origin.x + ( m_xExponent != 0 ? ( m_childInfos[i].m_maxX + 1 ) / rcp_exponent.x : 0.0f );\n"
"		childBox.m_max.y = m_origin.y + ( m_yExponent != 0 ? ( m_childInfos[i].m_maxY + 1 ) / rcp_exponent.y : 0.0f );\n"
"		childBox.m_max.z = m_origin.z + ( m_zExponent != 0 ? ( m_childInfos[i].m_maxZ + 1 ) / rcp_exponent.z : 0.0f );\n"
"\n"
"		return childBox;\n"
"	}\n"
"\n"
"	HIPRT_HOST_DEVICE uint32_t getChildRange( uint32_t i ) const { return m_childInfos[i].m_nodeRange; }\n"
"\n"
// patchChild: overwrites the stored type and range of slot i; the address part of childIndex is
// not stored. NOTE(review): childRange is marked [[maybe_unused]] although it is used here.
"	HIPRT_HOST_DEVICE void patchChild( uint32_t i, uint32_t childIndex, [[maybe_unused]] uint32_t childRange )\n"
"	{\n"
"		// TODO: do not use bit fields\n"
"		m_childInfos[i].m_nodeType	= getNodeType( childIndex );\n"
"		m_childInfos[i].m_nodeRange = childRange;\n"
"	}\n"
"\n"
// Data members. m_origin shares storage with m_updateCounter through the anonymous union.
"	uint32_t m_boxNodeBase;\n"
"	uint32_t m_primNodeBase;\n"
"	uint32_t m_parentAddr;\n"
"	union\n"
"	{\n"
"		float3	 m_origin;\n"
"		uint32_t m_updateCounter;\n"
"	};\n"
"	uint8_t m_xExponent;\n"
"	uint8_t m_yExponent;\n"
"	uint8_t m_zExponent;\n"
"	uint8_t : 4;\n"
"	uint8_t	 m_childCountMinusOne : 4;\n"
"	uint32_t m_matrixId : 7;\n"
"	uint32_t : 25;\n"
"	ChildInfo m_childInfos[8] = {};\n"
"};\n"
"HIPRT_STATIC_ASSERT( sizeof( Box8Node ) == 128 );\n"
"\n"
// TrianglePacketHeader: unpacked header fields of a triangle packet. TrianglePacketData::buildHeader
// fills every field; TrianglePacketNode::writeHeader packs them into the node's data words.
// The size fields are bit counts (buildHeader derives them from 32 - clz( ... ) and sets the
// payload sizes to 32).
"struct TrianglePacketHeader\n"
"{\n"
"	uint32_t m_primIndexAnchorSize;\n"
"	uint32_t m_primIndexPayloadSize;\n"
"	uint32_t m_vertCount;\n"
"	uint32_t m_triPairCount;\n"
"	uint32_t m_payloadSizeX;\n"
"	uint32_t m_payloadSizeY;\n"
"	uint32_t m_payloadSizeZ;\n"
"	uint32_t m_vertexTzBits;\n"
"	uint32_t m_indexSectionMidpoint;\n"
"};\n"
"\n"
// TrianglePacketData: running bookkeeping while triangle pairs are collected into one packet.
// It tracks the first primitive index (anchor), the OR of all XOR differences from the anchor,
// and the pair and vertex counts, which is enough to check the packet's 1024-bit budget.
"struct TrianglePacketData\n"
"{\n"
"	TrianglePacketData() = default;\n"
"\n"
// Starts a packet with one pair; relies on m_triPairCount's default of 0 so that
// tryAddTrianglePair takes its first-pair branch.
"	HIPRT_HOST_DEVICE TrianglePacketData( const uint32_t primIndex0, const uint32_t primIndex1, const uint32_t vertCount )\n"
"	{\n"
"		tryAddTrianglePair( primIndex0, primIndex1, vertCount );\n"
"	}\n"
"\n"
// tryAddTrianglePair: returns true and updates the state when the pair fits, false (state left
// unchanged) otherwise. The first pair is always accepted.
"	HIPRT_HOST_DEVICE bool\n"
"	tryAddTrianglePair( const uint32_t primIndex0, const uint32_t primIndex1, const uint32_t newVertCount )\n"
"	{\n"
"		if ( m_triPairCount == 0 )\n"
"		{\n"
"			m_primIndexAnchor = primIndex0;\n"
"			m_primIndexDiff	  = primIndex0 ^ primIndex1;\n"
"			m_triPairCount	  = 1;\n"
"			m_vertCount		  = newVertCount;\n"
"			return true;\n"
"		}\n"
"		else\n"
"		{\n"
// NOTE(review): both limits are tested with >=, so the pair count stops at
// MaxTrianglePairsPerTrianglePacket - 1 and the vertex count at MaxVerticesPerTrianglePacket - 1.
// Confirm that this is the intended limit and not an off-by-one.
"			if ( m_triPairCount + 1 >= MaxTrianglePairsPerTrianglePacket ||\n"
"				 m_vertCount + newVertCount >= MaxVerticesPerTrianglePacket )\n"
"				return false;\n"
"\n"
// Bit widths needed for the anchor and for each per-primitive payload once the new pair's
// indices are folded into the difference mask.
"			uint32_t newPrimIndexDiff =\n"
"				m_primIndexDiff | ( m_primIndexAnchor ^ primIndex0 ) | ( m_primIndexAnchor ^ primIndex1 );\n"
"			uint32_t primAnchorSize	 = 32 - clz( m_primIndexAnchor );\n"
"			uint32_t primPayloadSize = 32 - clz( newPrimIndexDiff );\n"
"\n"
// Budget check: header + 96 bits per vertex + one descriptor per pair + anchor and
// ( 2 * pairs - 1 ) index payloads must fit in 1024 bits.
"			uint32_t headerBits		= TriangleStructHeaderSize;\n"
"			uint32_t primIndexBits	= primAnchorSize + ( 2 * ( m_triPairCount + 1 ) - 1 ) * primPayloadSize;\n"
"			uint32_t vertexBits		= 96 * ( m_vertCount + newVertCount );\n"
"			uint32_t descriptorBits = TrianglePairDescriptorSize * ( m_triPairCount + 1 );\n"
"			if ( headerBits + vertexBits + descriptorBits + primIndexBits > 1024 ) return false;\n"
"\n"
"			m_primIndexDiff = newPrimIndexDiff;\n"
"			m_vertCount += newVertCount;\n"
"			m_triPairCount++;\n"
"\n"
"			return true;\n"
"		}\n"
"	}\n"
"\n"
// buildHeader: derives the header from the collected state. The index section midpoint is 1024
// minus the descriptor bits, the index payload bits and the anchor bits. Payload sizes are fixed
// at 32 bits per coordinate with no trailing-zero compression (m_vertexTzBits = 0).
"	HIPRT_HOST_DEVICE TrianglePacketHeader buildHeader() const\n"
"	{\n"
"		TrianglePacketHeader hdr;\n"
"		hdr.m_triPairCount		   = m_triPairCount;\n"
"		hdr.m_vertCount			   = m_vertCount;\n"
"		hdr.m_primIndexAnchorSize  = 32 - clz( m_primIndexAnchor );\n"
"		hdr.m_primIndexPayloadSize = 32 - clz( m_primIndexDiff );\n"
"\n"
"		const uint32_t pairDescSize			= hdr.m_triPairCount * TrianglePairDescriptorSize;\n"
"		const uint32_t primIndexPayloadSize = ( ( 2 * hdr.m_triPairCount ) - 1 ) * hdr.m_primIndexPayloadSize;\n"
"		hdr.m_indexSectionMidpoint			= 1024 - pairDescSize - primIndexPayloadSize - hdr.m_primIndexAnchorSize;\n"
"\n"
"		hdr.m_payloadSizeX = 32;\n"
"		hdr.m_payloadSizeY = 32;\n"
"		hdr.m_payloadSizeZ = 32;\n"
"		hdr.m_vertexTzBits = 0;\n"
"		return hdr;\n"
"	}\n"
"\n"
// shuffle (kernel compilation only): builds a copy whose four fields each come from
// shfl( field, index ); presumably a cross-lane read, confirm against shfl's definition.
"#if defined( __KERNELCC__ )\n"
"	HIPRT_DEVICE TrianglePacketData shuffle( uint32_t index )\n"
"	{\n"
"		TrianglePacketData data{};\n"
"		data.m_primIndexAnchor = shfl( m_primIndexAnchor, index );\n"
"		data.m_primIndexDiff   = shfl( m_primIndexDiff, index );\n"
"		data.m_triPairCount	   = shfl( m_triPairCount, index );\n"
"		data.m_vertCount	   = shfl( m_vertCount, index );\n"
"		return data;\n"
"	}\n"
"#endif\n"
"\n"
// The anchor and difference have no default initialiser; they are first written when the first
// pair is added.
"	uint32_t m_primIndexAnchor;\n"
"	uint32_t m_primIndexDiff;\n"
"	uint32_t m_triPairCount = 0;\n"
"	uint32_t m_vertCount	= 0;\n"
"};\n"
"\n"
// TrianglePairNode: leaf node holding one TrianglePair together with the two primitive indices
// and a flags word (64 bytes, asserted).
"struct alignas( DefaultAlignment ) TrianglePairNode\n"
"{\n"
"	HIPRT_HOST_DEVICE Aabb	aabb() const { return m_triPair.aabb(); }\n"
"	HIPRT_HOST_DEVICE float area() const { return aabb().area(); }\n"
"\n"
// getPrimIndex: triangle 0 maps to m_primIndex0, any other triangleIndex to m_primIndex1.
"	HIPRT_HOST_DEVICE uint32_t getPrimIndex( uint32_t triangleIndex ) const\n"
"	{\n"
"		return triangleIndex > 0 ? m_primIndex1 : m_primIndex0;\n"
"	}\n"
"\n"
// getNormal: normal of the selected triangle; the flags passed on are m_flags shifted right by
// 8 bits per triangle index, i.e. each triangle's flags start at a multiple of 8 bits.
"	HIPRT_HOST_DEVICE float3 getNormal( uint32_t triangleIndex ) const\n"
"	{\n"
"		return m_triPair.fetchTriangle( triangleIndex ).normal( m_flags >> ( triangleIndex * 8 ) );\n"
"	}\n"
"\n"
"	TrianglePair m_triPair;\n"
"	uint32_t	 padding;\n"
"	uint32_t	 m_primIndex0 = InvalidValue;\n"
"	uint32_t	 m_primIndex1 = InvalidValue;\n"
"	uint32_t	 m_flags;\n"
"};\n"
"HIPRT_STATIC_ASSERT( sizeof( TrianglePairNode ) == 64 );\n"
"\n"
"struct alignas( DefaultAlignment ) TrianglePacketNode\n"
"{\n"
// writeUnalignedBits: ORs the low `length` bits of `data` into m_data starting at bit `position`,
// spilling into the next 32-bit word when the field crosses a word boundary. Does nothing when
// length is 0.
//   Atomic: in kernel compilation, use atomicOr / atomicAnd instead of plain read-modify-write.
//           On the host path the flag has no effect.
//   Clear:  zero the target bits first; without it the bits are only ORed into existing contents.
// NOTE(review): `data` is not masked to `length` bits before the OR into the low word, so callers
// presumably pass values with no bits set above `length`; confirm.
"	template <bool Atomic = false, bool Clear = false>\n"
"	HIPRT_HOST_DEVICE void writeUnalignedBits( uint32_t position, uint32_t length, uint32_t data )\n"
"	{\n"
"		if ( length )\n"
"		{\n"
// hiOfs is the number of bits that land in the following word; loWord and hiWord are the word
// indices of the first bit and of the bit just past the field.
"			const uint32_t hiOfs  = ( position + length ) % 32;\n"
"			const uint32_t loWord = position / 32;\n"
"			const uint32_t hiWord = ( position + length ) / 32;\n"
"\n"
"			[[maybe_unused]] const uint32_t loMask = ( length == 32 ? ~0u : ( 1u << length ) - 1 ) << ( position % 32 );\n"
"			const uint32_t					hiMask = ( 1u << hiOfs ) - 1;\n"
"\n"
"			const uint32_t loBits = data << ( position % 32 );\n"
"#if defined( __KERNELCC__ )\n"
"			if constexpr ( Atomic )\n"
"			{\n"
"				if constexpr ( Clear )\n"
"				{\n"
// Clear-then-set is two separate atomics with fences in between, not one atomic update.
"					__threadfence();\n"
"					atomicAnd( &m_data[loWord], ~loMask );\n"
"					__threadfence();\n"
"					atomicOr( &m_data[loWord], loBits );\n"
"					__threadfence();\n"
"				}\n"
"				else\n"
"				{\n"
"					atomicOr( &m_data[loWord], loBits );\n"
"				}\n"
"			}\n"
"			else\n"
"			{\n"
"				if constexpr ( Clear ) m_data[loWord] &= ~loMask;\n"
"				m_data[loWord] |= loBits;\n"
"			}\n"
"#else\n"
"			if constexpr ( Clear ) m_data[loWord] &= ~loMask;\n"
"			m_data[loWord] |= loBits;\n"
"#endif\n"
"\n"
// Spill part: only when the field really continues into another word (hiMask > 0) and that word
// index is below 32 (presumably m_data holds 32 words; its declaration is outside this excerpt).
"			if ( hiWord < 32 && hiWord != loWord && hiMask > 0 )\n"
"			{\n"
"				const uint32_t hiBits = data >> ( length - hiOfs );\n"
"#if defined( __KERNELCC__ )\n"
"				if constexpr ( Atomic )\n"
"				{\n"
"					if constexpr ( Clear )\n"
"					{\n"
"						__threadfence();\n"
"						atomicAnd( &m_data[hiWord], ~hiMask );\n"
"						__threadfence();\n"
"						atomicOr( &m_data[hiWord], hiBits );\n"
"						__threadfence();\n"
"					}\n"
"					else\n"
"					{\n"
"						atomicOr( &m_data[hiWord], hiBits );\n"
"					}\n"
"				}\n"
"				else\n"
"				{\n"
"					if constexpr ( Clear ) m_data[hiWord] &= ~hiMask;\n"
"					m_data[hiWord] |= hiBits;\n"
"				}\n"
"#else\n"
"				if constexpr ( Clear ) m_data[hiWord] &= ~hiMask;\n"
"				m_data[hiWord] |= hiBits;\n"
"#endif\n"
"			}\n"
"		}\n"
"	}\n"
"\n"
"	HIPRT_HOST_DEVICE uint32_t readUnalignedBits( uint32_t position, uint32_t length ) const\n"
"	{\n"
"		uint32_t data = 0;\n"
"		if ( length )\n"
"		{\n"
"			const uint32_t hiOfs  = ( position + length ) % 32;\n"
"			const uint32_t loWord = position / 32;\n"
"			const uint32_t hiWord = ( position + length ) / 32;\n"
"			const uint32_t loMask = ( length == 32 ? ~0u : ( 1u << length ) - 1 ) << ( position % 32 );\n"
"			const uint32_t hiMask = ( 1u << hiOfs ) - 1;\n"
"			const uint32_t loBits = ( m_data[loWord] & loMask ) >> ( position % 32 );\n"
"\n"
"			data = loBits;\n"
"			if ( hiWord < 32 && hiWord != loWord && hiMask > 0 )\n"
"			{\n"
"				const uint32_t hiBits = ( m_data[hiWord] & hiMask ) << ( length - hiOfs );\n"
"				data |= hiBits;\n"
"			}\n"
"		}\n"
"		return data;\n"
"	}\n"
"\n"
"	template <bool Atomic = false, bool Clear = false>\n"
"	HIPRT_HOST_DEVICE void writeHeader( const TrianglePacketHeader& hdr )\n"
"	{\n"
"		uint32_t dw0 = ( hdr.m_payloadSizeX - 1 );\n"
"		dw0 += ( ( hdr.m_payloadSizeY - 1 ) << 5 );\n"
"		dw0 += ( ( hdr.m_payloadSizeZ - 1 ) << 10 );\n"
"		dw0 += ( hdr.m_vertexTzBits << 15 );\n"
"		dw0 += ( ( hdr.m_vertCount - 1 ) << 20 );\n"
"		dw0 += ( hdr.m_triPairCount - 1 ) << 28;\n"
"		m_data[0] = dw0;\n"
"\n"
"		uint32_t dw1 = hdr.m_primIndexAnchorSize;\n"
"		dw1 += hdr.m_primIndexPayloadSize << 5;\n"
"		dw1 += hdr.m_indexSectionMidpoint << 10;\n"
"\n"
"		[[maybe_unused]] const uint32_t mask = ( 1u << ( TriangleStructHeaderSize - 32 ) ) - 1;\n"
"#if defined( __KERNELCC__ )\n"
"		if constexpr ( Atomic )\n"
"		{\n"
"			if constexpr ( Clear )\n"
"			{\n"
"				__threadfence();\n"
"				atomicAnd( &m_data[1], ~mask );\n"
"				__threadfence();\n"
"				atomicOr( &m_data[1], dw1 );\n"
"				__threadfence();\n"
"			}\n"
"			else\n"
"			{\n"
"				atomicOr( &m_data[1], dw1 );\n"
"			}\n"
"		}\n"
"		else\n"
"		{\n"
"			if constexpr ( Clear ) m_data[1] &= ~mask;\n"
"			m_data[1] |= dw1;\n"
"		}\n"
"#else\n"
"		if constexpr ( Clear ) m_data[1] &= ~mask;\n"
"		m_data[1] |= dw1;\n"
"#endif\n"
"	}\n"
"\n"
"	HIPRT_HOST_DEVICE TrianglePacketHeader readHeader()\n"
"	{\n"
"		TrianglePacketHeader hdr{};\n"
"\n"
"		hdr.m_payloadSizeX = readUnalignedBits( 0, 5 ) + 1;\n"
"		hdr.m_payloadSizeY = readUnalignedBits( 5, 5 ) + 1;\n"
"		hdr.m_payloadSizeZ = readUnalignedBits( 10, 5 ) + 1;\n"
"		hdr.m_vertexTzBits = readUnalignedBits( 15, 5 );\n"
"\n"
"		hdr.m_vertCount	   = getVertexCount();\n"
"		hdr.m_triPairCount = getTrianglePairCount();\n"
"\n"
"		hdr.m_primIndexAnchorSize  = getPrimIndexAnchorSize();\n"
"		hdr.m_primIndexPayloadSize = getPrimIndexPayloadSize();\n"
"		hdr.m_indexSectionMidpoint = getIndexSectionMidpoint();\n"
"\n"
"		return hdr;\n"
"	}\n"
"\n"
"	HIPRT_HOST_DEVICE uint32_t getVertexCount() const { return readUnalignedBits( 20, 4 ) + 1; }\n"
"\n"
"	HIPRT_HOST_DEVICE uint32_t getTrianglePairCount() const { return readUnalignedBits( 28, 3 ) + 1; }\n"
"\n"
"	HIPRT_HOST_DEVICE uint32_t getIndexSectionMidpoint() const { return readUnalignedBits( 32 + 10, 10 ); }\n"
"\n"
"	HIPRT_HOST_DEVICE uint32_t getPrimIndexAnchorSize() const { return readUnalignedBits( 32 + 0, 5 ); }\n"
"\n"
"	HIPRT_HOST_DEVICE uint32_t getPrimIndexPayloadSize() const { return readUnalignedBits( 32 + 5, 5 ); }\n"
"\n"
"	template <bool Atomic = false, bool Clear = false>\n"
"	HIPRT_HOST_DEVICE void\n"
"	writePrimIndex( uint32_t pairIndex, uint32_t triangleIndex, const TrianglePacketHeader& hdr, uint32_t primIndex )\n"
"	{\n"
"		const uint32_t flatTriIndex = 2 * pairIndex + triangleIndex;\n"
"\n"
"		const uint32_t primIndexPayloadSize = hdr.m_primIndexPayloadSize;\n"
"		const uint32_t primIndexAnchorSize	= hdr.m_primIndexAnchorSize;\n"
"		const uint32_t primIndexAnchorPos	= hdr.m_indexSectionMidpoint;\n"
"		const uint32_t primIndexPayloadPos =\n"
"			primIndexAnchorPos + primIndexAnchorSize + ( flatTriIndex - 1 ) * primIndexPayloadSize;\n"
"\n"
"		const uint32_t primIndexPos	 = ( flatTriIndex == 0 ) ? primIndexAnchorPos : primIndexPayloadPos;\n"
"		const uint32_t primIndexSize = ( flatTriIndex == 0 ) ? primIndexAnchorSize : primIndexPayloadSize;\n"
"		const uint32_t primIndexMask = ( 1 << primIndexSize ) - 1;\n"
"\n"
"		writeUnalignedBits<Atomic, Clear>( primIndexPos, primIndexSize, primIndex & primIndexMask );\n"
"	}\n"
"\n"
"	HIPRT_HOST_DEVICE uint32_t readPrimIndex( uint32_t pairIndex, uint32_t triangleIndex ) const\n"
"	{\n"
"		const uint32_t flatTriIndex = 2 * pairIndex + triangleIndex;\n"
"\n"
"		const uint32_t primIndexPayloadSize = getPrimIndexPayloadSize();\n"
"		const uint32_t primIndexAnchorSize	= getPrimIndexAnchorSize();\n"
"		const uint32_t primIndexAnchorPos	= getIndexSectionMidpoint();\n"
"		const uint32_t primIndexPayloadPos =\n"
"			primIndexAnchorPos + primIndexAnchorSize + ( flatTriIndex - 1 ) * primIndexPayloadSize;\n"
"\n"
"		const uint32_t primIndexAnchor = readUnalignedBits( primIndexAnchorPos, primIndexAnchorSize );\n"
"		if ( flatTriIndex == 0 ) return primIndexAnchor;\n"
"\n"
"		const uint32_t primIndex	 = readUnalignedBits( primIndexPayloadPos, primIndexPayloadSize );\n"
"		const uint32_t primIndexMask = ( 1 << primIndexPayloadSize ) - 1;\n"
"\n"
"		if ( primIndexPayloadSize >= primIndexAnchorSize )\n"
"			return primIndex;\n"
"		else\n"
"			return primIndex | ( primIndexAnchor & ~primIndexMask );\n"
"	}\n"
"\n"
"	template <bool Atomic = false, bool Clear = false>\n"
"	HIPRT_HOST_DEVICE void writeDescriptor( uint32_t pairIndex, uint32_t descriptor )\n"
"	{\n"
"		const uint32_t position = 1024 - ( pairIndex + 1 ) * TrianglePairDescriptorSize;\n"
"		writeUnalignedBits<Atomic, Clear>( position, TrianglePairDescriptorSize, descriptor );\n"
"	}\n"
"\n"
"	template <bool Atomic = false, bool Clear = false>\n"
"	HIPRT_HOST_DEVICE void\n"
"	writeDescriptor( uint32_t pairIndex, const uint3& triIndices0, const uint3& triIndices1, bool rangeEnd )\n"
"	{\n"
"		uint32_t descriptor = 0;\n"
"		descriptor |= rangeEnd ? 1 : 0;\n"
"		descriptor |= ( triIndices0.x + ( triIndices0.y << 4 ) + ( triIndices0.z << 8 ) ) << 17;\n"
"		descriptor |= ( triIndices1.x + ( triIndices1.y << 4 ) + ( triIndices1.z << 8 ) ) << 3;\n"
"		writeDescriptor<Atomic, Clear>( pairIndex, descriptor );\n"
"	}\n"
"\n"
"	HIPRT_HOST_DEVICE uint4 readDescriptor( uint32_t pairIndex, uint32_t triangleIndex ) const\n"
"	{\n"
"		const uint32_t position	  = 1024 - ( pairIndex + 1 ) * TrianglePairDescriptorSize;\n"
"		const uint32_t descriptor = readUnalignedBits( position, TrianglePairDescriptorSize );\n"
"		const uint32_t triIndices = descriptor >> ( triangleIndex > 0 ? 3 : 17 );\n"
"		return uint4{ triIndices & 15, ( triIndices >> 4 ) & 15, ( triIndices >> 8 ) & 15, descriptor & 1 };\n"
"	}\n"
"\n"
"	template <bool Atomic = false, bool Clear = false>\n"
"	HIPRT_HOST_DEVICE void writeVertex( uint32_t vertexIndex, const float3& vertex )\n"
"	{\n"
"		const uint32_t position = TriangleStructHeaderSize + 96 * vertexIndex;\n"
"		writeUnalignedBits<Atomic, Clear>( position + 0 * 32, 32, as_uint( vertex.x ) );\n"
"		writeUnalignedBits<Atomic, Clear>( position + 1 * 32, 32, as_uint( vertex.y ) );\n"
"		writeUnalignedBits<Atomic, Clear>( position + 2 * 32, 32, as_uint( vertex.z ) );\n"
"	}\n"
"\n"
"	HIPRT_HOST_DEVICE float3 readVertex( uint32_t vertexIndex ) const\n"
"	{\n"
"		const uint32_t position = TriangleStructHeaderSize + 96 * vertexIndex;\n"
"\n"
"		float3 vertex;\n"
"		vertex.x = as_float( readUnalignedBits( position + 0 * 32, 32 ) );\n"
"		vertex.y = as_float( readUnalignedBits( position + 1 * 32, 32 ) );\n"
"		vertex.z = as_float( readUnalignedBits( position + 2 * 32, 32 ) );\n"
"		return vertex;\n"
"	}\n"
"\n"
"	HIPRT_HOST_DEVICE Triangle fetchTriangle( uint32_t pairIndex, uint32_t triangleIndex ) const\n"
"	{\n"
"		const uint3 triIndices = make_uint3( readDescriptor( pairIndex, triangleIndex ) );\n"
"\n"
"		Triangle triangle;\n"
"		triangle.m_v0 = readVertex( triIndices.x );\n"
"		triangle.m_v1 = readVertex( triIndices.y );\n"
"		triangle.m_v2 = readVertex( triIndices.z );\n"
"\n"
"		return triangle;\n"
"	}\n"
"\n"
"	HIPRT_HOST_DEVICE uint32_t getPrimIndex( uint32_t pairIndex, uint32_t triangleIndex ) const\n"
"	{\n"
"		return readPrimIndex( pairIndex, triangleIndex );\n"
"	}\n"
"\n"
"	HIPRT_HOST_DEVICE float3 getNormal( uint32_t pairIndex, uint32_t triangleIndex ) const\n"
"	{\n"
"		return fetchTriangle( pairIndex, triangleIndex ).normal();\n"
"	}\n"
"\n"
"	HIPRT_HOST_DEVICE bool isRangeEnd( uint32_t pairIndex ) const { return readDescriptor( pairIndex, 0 ).w != 0; }\n"
"\n"
"	HIPRT_HOST_DEVICE Obb obb( const uint32_t pairIndex, const uint32_t matrixIndex, const Aabb& box ) const\n"
"	{\n"
"		Obb obb( matrixIndex );\n"
"\n"
"		uint32_t				  triPairIndex	= pairIndex;\n"
"		const TrianglePacketNode* triPacketNode = this;\n"
"\n"
"		while ( true )\n"
"		{\n"
"			const Triangle& tri0 = triPacketNode->fetchTriangle( triPairIndex, 0 );\n"
"			obb.m_box.grow( tri0.obb( box, matrixIndex ).m_box );\n"
"\n"
"			if ( triPacketNode->getPrimIndex( triPairIndex, 0 ) != triPacketNode->getPrimIndex( triPairIndex, 1 ) )\n"
"			{\n"
"				const Triangle& tri1 = triPacketNode->fetchTriangle( triPairIndex, 1 );\n"
"				obb.m_box.grow( tri1.obb( box, matrixIndex ).m_box );\n"
"			}\n"
"\n"
"			bool nodeEnd  = triPairIndex + 1 == triPacketNode->getTrianglePairCount();\n"
"			bool rangeEnd = triPacketNode->isRangeEnd( triPairIndex );\n"
"\n"
"			if ( rangeEnd ) break;\n"
"\n"
"			triPairIndex++;\n"
"			if ( nodeEnd )\n"
"			{\n"
"				triPairIndex = 0;\n"
"				triPacketNode++;\n"
"			}\n"
"		}\n"
"\n"
"		return obb;\n"
"	}\n"
"\n"
"	HIPRT_HOST_DEVICE Aabb aabb( const uint32_t pairIndex ) const\n"
"	{\n"
"		Aabb box;\n"
"\n"
"		uint32_t				  triPairIndex	= pairIndex;\n"
"		const TrianglePacketNode* triPacketNode = this;\n"
"\n"
"		while ( true )\n"
"		{\n"
"			const Triangle& tri0 = triPacketNode->fetchTriangle( triPairIndex, 0 );\n"
"			box.grow( tri0.m_v0 );\n"
"			box.grow( tri0.m_v1 );\n"
"			box.grow( tri0.m_v2 );\n"
"\n"
"			if ( triPacketNode->getPrimIndex( triPairIndex, 0 ) != triPacketNode->getPrimIndex( triPairIndex, 1 ) )\n"
"			{\n"
"				const Triangle& tri1 = triPacketNode->fetchTriangle( triPairIndex, 1 );\n"
"				box.grow( tri1.m_v0 );\n"
"				box.grow( tri1.m_v1 );\n"
"				box.grow( tri1.m_v2 );\n"
"			}\n"
"\n"
"			bool nodeEnd  = triPairIndex + 1 == triPacketNode->getTrianglePairCount();\n"
"			bool rangeEnd = triPacketNode->isRangeEnd( triPairIndex );\n"
"\n"
"			if ( rangeEnd ) break;\n"
"\n"
"			triPairIndex++;\n"
"			if ( nodeEnd )\n"
"			{\n"
"				triPairIndex = 0;\n"
"				triPacketNode++;\n"
"			}\n"
"		}\n"
"\n"
"		return box;\n"
"	}\n"
"\n"
"	HIPRT_HOST_DEVICE float area( const uint32_t pairIndex ) const { return aabb( pairIndex ).area(); }\n"
"\n"
"	uint32_t m_data[32];\n"
"};\n"
"HIPRT_STATIC_ASSERT( sizeof( TrianglePacketNode ) == 128 );\n"
"\n"
"struct TrianglePairData\n"
"{\n"
"	HIPRT_HOST_DEVICE TrianglePairData() {}\n"
"\n"
"	HIPRT_HOST_DEVICE\n"
"	TrianglePairData( const uint2& pairIndices, const uint3& triIndices0, const uint3& triIndices1, const bool rangeEnd )\n"
"		: m_pairIndices( pairIndices )\n"
"	{\n"
"		m_descriptor = 0;\n"
"		m_descriptor |= rangeEnd ? 1 : 0;\n"
"		m_descriptor |= ( triIndices0.x + ( triIndices0.y << 4 ) + ( triIndices0.z << 8 ) ) << 17;\n"
"		m_descriptor |= ( triIndices1.x + ( triIndices1.y << 4 ) + ( triIndices1.z << 8 ) ) << 3;\n"
"	}\n"
"\n"
"	uint2	 m_pairIndices;\n"
"	uint32_t m_descriptor;\n"
"};\n"
"\n"
"struct TrianglePairOffsets\n"
"{\n"
"	HIPRT_HOST_DEVICE TrianglePairOffsets() {}\n"
"\n"
"	HIPRT_HOST_DEVICE TrianglePairOffsets( const uint32_t pairOffset, const uint32_t packetOffset )\n"
"	{\n"
"		m_pairOffset   = pairOffset & 7;\n"
"		m_packetOffset = packetOffset & 31;\n"
"	}\n"
"\n"
"	uint8_t m_pairOffset : 3;\n"
"	uint8_t m_packetOffset : 5;\n"
"};\n"
"\n"
"struct alignas( DefaultAlignment ) TrianglePacketCache\n"
"{\n"
"	HIPRT_HOST_DEVICE TrianglePacketCache() {}\n"
"	TrianglePairData  m_triPairData[MaxTrianglePairsPerTrianglePacket];\n"
"	uint32_t		  m_vertexIndices[MaxVerticesPerTrianglePacket];\n"
"};\n"
"HIPRT_STATIC_ASSERT( sizeof( TrianglePacketCache ) >= sizeof( TrianglePacketNode ) );\n"
"\n"
"struct alignas( 4 ) CustomNode\n"
"{\n"
"	uint32_t m_primIndex = InvalidValue;\n"
"};\n"
"HIPRT_STATIC_ASSERT( sizeof( CustomNode ) == 4 );\n"
"\n"
"struct InstanceNodeBase\n"
"{\n"
"	HIPRT_HOST_DEVICE hiprtRay transformRay( const hiprtRay& ray ) const\n"
"	{\n"
"		hiprtRay	 outRay;\n"
"		const float3 o	= ray.origin;\n"
"		const float3 d	= ray.direction;\n"
"		outRay.origin.x = dot( { m_matrix[0][0], m_matrix[0][1], m_matrix[0][2] }, o );\n"
"		outRay.origin.y = dot( { m_matrix[1][0], m_matrix[1][1], m_matrix[1][2] }, o );\n"
"		outRay.origin.z = dot( { m_matrix[2][0], m_matrix[2][1], m_matrix[2][2] }, o );\n"
"		outRay.origin += { m_matrix[0][3], m_matrix[1][3], m_matrix[2][3] };\n"
"		outRay.direction.x = dot( { m_matrix[0][0], m_matrix[0][1], m_matrix[0][2] }, d );\n"
"		outRay.direction.y = dot( { m_matrix[1][0], m_matrix[1][1], m_matrix[1][2] }, d );\n"
"		outRay.direction.z = dot( { m_matrix[2][0], m_matrix[2][1], m_matrix[2][2] }, d );\n"
"		outRay.minT		   = ray.minT;\n"
"		outRay.maxT		   = ray.maxT;\n"
"		return outRay;\n"
"	}\n"
"\n"
"	union\n"
"	{\n"
"		float				 m_matrix[3][4];\n"
"		hiprtTransformHeader m_transform;\n"
"	};\n"
"\n"
"	union\n"
"	{\n"
"		GeomHeader*	 m_geometry;\n"
"		SceneHeader* m_scene;\n"
"	};\n"
"};\n"
"\n"
"struct alignas( 64 ) UserInstanceNode : public InstanceNodeBase\n"
"{\n"
"	HIPRT_HOST_DEVICE void init(\n"
"		const uint32_t					 primIndex,\n"
"		const uint32_t					 mask,\n"
"		const Frame&					 frame,\n"
"		const hiprtInstance&			 instance,\n"
"		const hiprtTransformHeader&		 transform,\n"
"		[[maybe_unused]] const uint32_t	 childCount,\n"
"		[[maybe_unused]] const uint32_t* childIndices,\n"
"		[[maybe_unused]] const Aabb*	 childBoxes )\n"
"	{\n"
"		m_primIndex = primIndex;\n"
"		m_mask		= mask;\n"
"		m_type		= instance.type;\n"
"		m_static	= transform.frameCount == 1 ? 1 : 0;\n"
"\n"
"		if ( instance.type == hiprtInstanceTypeScene )\n"
"			m_scene = reinterpret_cast<SceneHeader*>( instance.scene );\n"
"		else\n"
"			m_geometry = reinterpret_cast<GeomHeader*>( instance.geometry );\n"
"\n"
"		if ( transform.frameCount == 1 )\n"
"		{\n"
"			m_identity = computeInvTransformMatrix( frame, m_matrix ) ? 1 : 0;\n"
"		}\n"
"		else\n"
"		{\n"
"			m_identity	= 0;\n"
"			m_transform = transform;\n"
"		}\n"
"	}\n"
"\n"
"	uint32_t m_mask = FullRayMask;\n"
"	uint32_t m_primIndex : InstanceIDBits;\n"
"	uint32_t m_type : 1;\n"
"	uint32_t m_static : 1;\n"
"	uint32_t m_identity : 1;\n"
"	uint32_t : 5;\n"
"};\n"
"HIPRT_STATIC_ASSERT( sizeof( UserInstanceNode ) == 64 );\n"
"\n"
"struct alignas( 128 ) HwInstanceNode : public InstanceNodeBase\n"
"{\n"
"	static constexpr uint32_t BranchingFactor = 4;\n"
"\n"
"	HIPRT_HOST_DEVICE void initBoxes( const uint32_t* childIndices, const Aabb* childBoxes )\n"
"	{\n"
"		initChildInfos( childIndices, childBoxes, InvalidValue, *this );\n"
"	}\n"
"\n"
"	HIPRT_HOST_DEVICE void init(\n"
"		const uint32_t				primIndex,\n"
"		const uint32_t				mask,\n"
"		const Frame&				frame,\n"
"		const hiprtInstance&		instance,\n"
"		const hiprtTransformHeader& transform,\n"
"		const uint32_t				childCount,\n"
"		const uint32_t*				childIndices,\n"
"		const Aabb*					childBoxes,\n"
"		const uint32_t				childRanges = InvalidValue )\n"
"	{\n"
"		m_primIndex = primIndex;\n"
"		m_mask		= mask;\n"
"		m_hwMask	= 0xff;\n"
"		m_type		= instance.type;\n"
"		m_static	= transform.frameCount == 1 ? 1 : 0;\n"
"\n"
"		if ( instance.type == hiprtInstanceTypeScene )\n"
"			m_scene = reinterpret_cast<SceneHeader*>( instance.scene );\n"
"		else\n"
"			m_geometry = reinterpret_cast<GeomHeader*>( instance.geometry );\n"
"\n"
"		if ( transform.frameCount == 1 )\n"
"		{\n"
"			m_identity = computeInvTransformMatrix( frame, m_matrix ) ? 1 : 0;\n"
"		}\n"
"		else\n"
"		{\n"
"			m_identity	= 0;\n"
"			m_transform = transform;\n"
"		}\n"
"\n"
"		m_disableBoxSort	 = 0;\n"
"		m_childCountMinusOne = childCount - 1;\n"
"		initBoxes( childIndices, childBoxes );\n"
"	}\n"
"\n"
"	HIPRT_HOST_DEVICE uint32_t getChildCount() const { return m_childCountMinusOne + 1; }\n"
"\n"
"	uint32_t m_mask = FullRayMask;\n"
"	uint32_t m_primIndex : InstanceIDBits;\n"
"	uint32_t m_hwMask : 8;\n"
"\n"
"	float3	m_origin;\n"
"	uint8_t m_xExponent;\n"
"	uint8_t m_yExponent;\n"
"	uint8_t m_zExponent;\n"
"\n"
"	uint8_t m_disableBoxSort : 1;\n"
"	uint8_t m_type : 1;\n"
"	uint8_t m_static : 1;\n"
"	uint8_t m_identity : 1;\n"
"	uint8_t m_childCountMinusOne : 2;\n"
"	uint8_t : 2;\n"
"\n"
"	ChildInfo m_childInfos[BranchingFactor];\n"
"};\n"
"HIPRT_STATIC_ASSERT( sizeof( HwInstanceNode ) == 128 );\n"
"\n"
"struct alignas( 32 ) ScratchNode\n"
"{\n"
"	HIPRT_HOST_DEVICE Aabb aabb() const { return m_box; };\n"
"\n"
"	HIPRT_HOST_DEVICE float area() const { return aabb().area(); }\n"
"\n"
"	HIPRT_HOST_DEVICE uint32_t getChildType( uint32_t i ) const { return getNodeType( ( &m_childIndex0 )[i] & ~FatLeafBit ); }\n"
"\n"
"	HIPRT_HOST_DEVICE uint32_t getChildIndex( uint32_t i ) const { return ( &m_childIndex0 )[i] & ~FatLeafBit; }\n"
"\n"
"	HIPRT_HOST_DEVICE uint32_t getChildAddr( uint32_t i ) const { return getNodeAddr( ( &m_childIndex0 )[i] & ~FatLeafBit ); }\n"
"\n"
"	HIPRT_HOST_DEVICE void setChildFatLeafFlag( uint32_t i ) { ( &m_childIndex0 )[i] |= FatLeafBit; }\n"
"\n"
"	HIPRT_HOST_DEVICE uint32_t operator[]( uint32_t i ) const { return ( &m_childIndex0 )[i]; }\n"
"\n"
"	Aabb	 m_box;\n"
"	uint32_t m_childIndex0;\n"
"	uint32_t m_childIndex1;\n"
"};\n"
"HIPRT_STATIC_ASSERT( sizeof( ScratchNode ) == 32 );\n"
"\n"
"struct alignas( 64 ) ApiNode\n"
"{\n"
"	HIPRT_HOST_DEVICE Aabb aabb() const { return m_box; }\n"
"\n"
"	HIPRT_HOST_DEVICE float area() const { return aabb().area(); }\n"
"\n"
"	HIPRT_HOST_DEVICE uint32_t getChildType( uint32_t i ) const { return m_childTypes[i] & ~FatLeafBit; }\n"
"\n"
"	HIPRT_HOST_DEVICE uint32_t getChildIndex( uint32_t i ) const\n"
"	{\n"
"		return encodeNodeIndex( m_childAddrs[i], m_childTypes[i] & ~FatLeafBit );\n"
"	}\n"
"\n"
"	HIPRT_HOST_DEVICE uint32_t getChildAddr( uint32_t i ) const { return m_childAddrs[i]; }\n"
"\n"
"	HIPRT_HOST_DEVICE void setChildFatLeafFlag( uint32_t i ) { m_childTypes[i] |= FatLeafBit; }\n"
"\n"
"	HIPRT_HOST_DEVICE uint32_t operator[]( uint32_t i ) const { return getChildIndex( i ) | ( m_childTypes[i] & FatLeafBit ); }\n"
"\n"
"	Aabb	 m_box;\n"
"	uint32_t m_childAddrs[2];\n"
"	uint32_t m_childTypes[2];\n"
"};\n"
"HIPRT_STATIC_ASSERT( sizeof( ApiNode ) == sizeof( hiprtInternalNode ) );\n"
"HIPRT_STATIC_ASSERT( alignof( ApiNode ) == alignof( hiprtInternalNode ) );\n"
"\n"
"struct alignas( 32 ) ReferenceNode\n"
"{\n"
"	ReferenceNode() = default;\n"
"	HIPRT_HOST_DEVICE	   ReferenceNode( uint32_t primIndex ) : m_primIndex( primIndex ) {}\n"
"	HIPRT_HOST_DEVICE	   ReferenceNode( uint32_t primIndex, const Aabb& box ) : m_primIndex( primIndex ), m_box( box ) {}\n"
"	HIPRT_HOST_DEVICE Aabb aabb() const { return m_box; };\n"
"\n"
"	Aabb	 m_box;\n"
"	uint32_t m_primIndex = InvalidValue;\n"
"};\n"
"HIPRT_STATIC_ASSERT( sizeof( ReferenceNode ) == 32 );\n"
"HIPRT_STATIC_ASSERT( sizeof( ReferenceNode ) == sizeof( hiprtLeafNode ) );\n"
"HIPRT_STATIC_ASSERT( alignof( ReferenceNode ) == alignof( hiprtLeafNode ) );\n"
"\n"
"#if defined( __KERNELCC__ )\n"
"#if HIPRT_RTIP >= 31\n"
"using TriangleNode = TrianglePacketNode;\n"
"using BoxNode	   = Box8Node;\n"
"using InstanceNode = HwInstanceNode;\n"
"#else\n"
"using TriangleNode = TrianglePairNode;\n"
"using BoxNode	   = Box4Node;\n"
"using InstanceNode = UserInstanceNode;\n"
"#endif\n"
"#endif\n"
"} // namespace hiprt\n"
;
// hip_Header: source text of a header, stored as one C string built from the
// adjacent string literals below (one literal per line of the embedded file).
// The embedded text includes <hiprt/impl/BvhNode.h> and <hiprt/impl/Transform.h>
// and declares hiprt::GeomHeader and hiprt::SceneHeader.
// NOTE(review): the consumer of this string is not visible here; presumably it
// is handed to a runtime kernel compiler - confirm against the generator.
// This file is marked as automatically generated, so comments added here will
// presumably be lost when it is regenerated.
static const char* hip_Header= \
"\n"
"#pragma once\n"
"#include <hiprt/impl/BvhNode.h>\n"
"#include <hiprt/impl/Transform.h>\n"
"\n"
"namespace hiprt\n"
"{\n"
"struct Instance;\n"
"\n"
// GeomHeader: m_boxNodes is typed BoxNode* only when __KERNELCC__ is defined
// and is an untyped void* otherwise; m_primNodes is void* in both cases.
"struct GeomHeader\n"
"{\n"
"#if defined( __KERNELCC__ )\n"
"	BoxNode* m_boxNodes;\n"
"#else\n"
"	void* m_boxNodes;\n"
"#endif\n"
"	void*	 m_primNodes;\n"
"	size_t	 m_size;\n"
"	uint32_t m_referenceCount;\n"
"	uint32_t m_boxNodeCount;\n"
"	uint32_t m_primNodeCount;\n"
"	uint32_t m_geomType;\n"
"	uint32_t m_rtip;\n"
"};\n"
// The embedded static assert only bounds the alignment (<= DefaultAlignment);
// it does not pin the struct size.
"HIPRT_STATIC_ASSERT( alignof( GeomHeader ) <= DefaultAlignment );\n"
"\n"
// SceneHeader: both node pointers are typed (BoxNode*, InstanceNode*) when
// __KERNELCC__ is defined and void* otherwise. m_instances and m_frames are
// typed pointers in both configurations.
"struct SceneHeader\n"
"{\n"
"#if defined( __KERNELCC__ )\n"
"	BoxNode*	  m_boxNodes;\n"
"	InstanceNode* m_primNodes;\n"
"#else\n"
"	void* m_boxNodes;\n"
"	void* m_primNodes;\n"
"#endif\n"
"	Instance* m_instances;\n"
"	Frame*	  m_frames;\n"
"	size_t	  m_size;\n"
"	uint32_t  m_referenceCount;\n"
"	uint32_t  m_primCount;\n"
"	uint32_t  m_primNodeCount;\n"
"	uint32_t  m_boxNodeCount;\n"
"	uint32_t  m_frameCount;\n"
"	uint32_t  m_rtip;\n"
"};\n"
"HIPRT_STATIC_ASSERT( alignof( SceneHeader ) <= DefaultAlignment );\n"
"} // namespace hiprt\n"
;
// hip_QrDecomposition: source text of a header, stored as one C string built
// from the adjacent string literals below. The embedded text defines a single
// function, hiprt::qr( a, q, r ), which reads a[0..8] and writes q[0..8] and
// r[0..8].
// NOTE(review): presumably this is the text of hiprt/impl/QrDecomposition.h
// (the hip_Transform string later in this file includes a header of that
// name) - confirm against the generator.
static const char* hip_QrDecomposition= \
"\n"
"#pragma once\n"
"#include <hiprt/hiprt_math.h>\n"
"\n"
"namespace hiprt\n"
"{\n"
"HIPRT_HOST_DEVICE static void qr( const float* a, float* q, float* r )\n"
"{\n"
"	// A and Q may be the same. QR using Modified Gram-Schmidt\n"
"	// method, A must be fullrank.\n"
"	float w;\n"
"\n"
// The copy a -> q is skipped when both pointers are equal, so the routine can
// run in place on its input.
"	if ( a != q )\n"
"	{\n"
"		q[0] = a[0];\n"
"		q[1] = a[1];\n"
"		q[2] = a[2];\n"
"		q[3] = a[3];\n"
"		q[4] = a[4];\n"
"		q[5] = a[5];\n"
"		q[6] = a[6];\n"
"		q[7] = a[7];\n"
"		q[8] = a[8];\n"
"	}\n"
"\n"
// r is cleared first; afterwards only r[0], r[1], r[2], r[4], r[5] and r[8]
// are written, so the remaining entries (3, 6, 7) stay zero.
"	r[0] = 0.0f;\n"
"	r[1] = 0.0f;\n"
"	r[2] = 0.0f;\n"
"	r[3] = 0.0f;\n"
"	r[4] = 0.0f;\n"
"	r[5] = 0.0f;\n"
"	r[6] = 0.0f;\n"
"	r[7] = 0.0f;\n"
"	r[8] = 0.0f;\n"
"\n"
// First vector: elements (0, 3, 6). Its length goes to r[0] and the vector is
// scaled to unit length; a zero length is not divided by, q[0] is set to 1
// instead.
"	w	 = hypot( { q[0], q[3], q[6] } );\n"
"	r[0] = w;\n"
"	if ( w != 0.0f )\n"
"	{\n"
"		w	 = 1.0f / w;\n"
"		q[0] = q[0] * w;\n"
"		q[3] = q[3] * w;\n"
"		q[6] = q[6] * w;\n"
"	}\n"
"	else\n"
"	{\n"
"		q[0] = 1.0f;\n"
"	}\n"
"\n"
// Second vector: elements (1, 4, 7). Its projection onto the first vector is
// stored in r[1] and subtracted, then the remainder is normalized (length in
// r[4]).
"	w	 = q[0] * q[1] + q[3] * q[4] + q[6] * q[7];\n"
"	r[1] = w;\n"
"\n"
"	q[1] = q[1] - w * q[0];\n"
"	q[4] = q[4] - w * q[3];\n"
"	q[7] = q[7] - w * q[6];\n"
"\n"
"	w	 = hypot( { q[1], q[4], q[7] } );\n"
"	r[4] = w;\n"
"	if ( w != 0.0f )\n"
"	{\n"
"		w	 = 1.0f / w;\n"
"		q[1] = q[1] * w;\n"
"		q[4] = q[4] * w;\n"
"		q[7] = q[7] * w;\n"
"	}\n"
"	else\n"
"	{\n"
"		q[1] = 1.0f;\n"
"	}\n"
"\n"
// Third vector: elements (2, 5, 8). The projection onto the first vector
// (r[2]) is removed before the projection onto the second vector (r[5]) is
// computed from the already-updated values, then the remainder is normalized
// (length in r[8]).
"	w	 = q[0] * q[2] + q[3] * q[5] + q[6] * q[8];\n"
"	r[2] = w;\n"
"\n"
"	q[2] = q[2] - w * q[0];\n"
"	q[5] = q[5] - w * q[3];\n"
"	q[8] = q[8] - w * q[6];\n"
"\n"
"	w	 = q[1] * q[2] + q[4] * q[5] + q[7] * q[8];\n"
"	r[5] = w;\n"
"\n"
"	q[2] = q[2] - w * q[1];\n"
"	q[5] = q[5] - w * q[4];\n"
"	q[8] = q[8] - w * q[7];\n"
"\n"
"	w	 = hypot( { q[2], q[5], q[8] } );\n"
"	r[8] = w;\n"
"	if ( w != 0.0f )\n"
"	{\n"
"		w	 = 1.0f / w;\n"
"		q[2] = q[2] * w;\n"
"		q[5] = q[5] * w;\n"
"		q[8] = q[8] * w;\n"
"	}\n"
"	else\n"
"	{\n"
"		q[2] = 1.0f;\n"
"	}\n"
"\n"
// d is the 3x3 determinant of q. When it is negative, the first vector of q
// (elements 0, 3, 6) and the matching entries r[0], r[1], r[2] are negated
// together, which leaves the product of q and r unchanged.
"	float d = q[0] * q[4] * q[8] + q[1] * q[5] * q[6] + q[3] * q[7] * q[2] - q[2] * q[4] * q[6] - q[1] * q[3] * q[8] -\n"
"			  q[5] * q[7] * q[0];\n"
"\n"
"	if ( d < 0.0f )\n"
"	{\n"
"		q[0] = -q[0];\n"
"		q[3] = -q[3];\n"
"		q[6] = -q[6];\n"
"		r[0] = -r[0];\n"
"		r[1] = -r[1];\n"
"		r[2] = -r[2];\n"
"	}\n"
"}\n"
"} // namespace hiprt\n"
;
// hip_Quaternion: source text of a header, stored as one C string built from
// the adjacent string literals below. The embedded text defines quaternion
// helpers in namespace hiprt that operate on float4 values with the vector
// part in x, y, z and the scalar part in w: qtFromAxisAngle, qtToAxisAngle,
// qtFromRotationMatrix, qtToRotationMatrix, qtDot, qtNormalize, qtMul,
// qtInvert, qtRotate, qtInvRotate and qtMix.
// NOTE(review): presumably this is the text of hiprt/impl/Quaternion.h (the
// hip_Transform string later in this file includes a header of that name) -
// confirm against the generator.
static const char* hip_Quaternion= \
"\n"
"#pragma once\n"
"#include <hiprt/hiprt_math.h>\n"
"\n"
"namespace hiprt\n"
"{\n"
// qtFromAxisAngle: input carries the axis in x, y, z and the angle in w. A
// zero angle returns { 0, 0, 0, 1 } before the axis is normalized.
"HIPRT_HOST_DEVICE HIPRT_INLINE float4 qtFromAxisAngle( const float4& axisAngle )\n"
"{\n"
"	if ( axisAngle.w == 0.0f ) return { 0.0f, 0.0f, 0.0f, 1.0f };\n"
"\n"
"	const float3 axis  = normalize( make_float3( axisAngle ) );\n"
"	const float	 angle = axisAngle.w;\n"
"\n"
"	float4 q;\n"
"	q.x = axis.x * sinf( angle / 2.0f );\n"
"	q.y = axis.y * sinf( angle / 2.0f );\n"
"	q.z = axis.z * sinf( angle / 2.0f );\n"
"	q.w = cosf( angle / 2.0f );\n"
"	return q;\n"
"}\n"
"\n"
// qtToAxisAngle: returns the axis in x, y, z and the angle in w. A quaternion
// whose vector part has zero length yields the fixed result { 0, 0, 1, 0 }.
"HIPRT_HOST_DEVICE HIPRT_INLINE float4 qtToAxisAngle( const float4& q )\n"
"{\n"
"	const float3 axis = make_float3( q );\n"
"	const float	 norm = hypot( axis );\n"
"	if ( norm == 0.0f ) return float4{ 0.0f, 0.0f, 1.0f, 0.0f };\n"
"	const float angle = 2.0f * atan2f( norm, q.w );\n"
"	return make_float4( axis / norm, angle );\n"
"}\n"
"\n"
// qtFromRotationMatrix: four branches, chosen by the trace and then by the
// largest diagonal entry; each branch takes one component from a square root
// and the other three from sums or differences of off-diagonal entries
// divided by S.
"HIPRT_HOST_DEVICE HIPRT_INLINE float4 qtFromRotationMatrix( const float ( &R )[3][3] )\n"
"{\n"
"	const float tr = R[0][0] + R[1][1] + R[2][2];\n"
"	float4		q;\n"
"\n"
"	if ( tr > 0.0f )\n"
"	{\n"
"		const float S = sqrtf( tr + 1.0f ) * 2.0f;\n"
"		q.w			  = 0.25f * S;\n"
"		q.x			  = ( R[2][1] - R[1][2] ) / S;\n"
"		q.y			  = ( R[0][2] - R[2][0] ) / S;\n"
"		q.z			  = ( R[1][0] - R[0][1] ) / S;\n"
"	}\n"
"	else if ( ( R[0][0] > R[1][1] ) && ( R[0][0] > R[2][2] ) )\n"
"	{\n"
"		const float S = sqrtf( 1.0f + R[0][0] - R[1][1] - R[2][2] ) * 2.0f;\n"
"		q.w			  = ( R[2][1] - R[1][2] ) / S;\n"
"		q.x			  = 0.25f * S;\n"
"		q.y			  = ( R[0][1] + R[1][0] ) / S;\n"
"		q.z			  = ( R[0][2] + R[2][0] ) / S;\n"
"	}\n"
"	else if ( R[1][1] > R[2][2] )\n"
"	{\n"
"		const float S = sqrtf( 1.0f + R[1][1] - R[0][0] - R[2][2] ) * 2.0f;\n"
"		q.w			  = ( R[0][2] - R[2][0] ) / S;\n"
"		q.x			  = ( R[0][1] + R[1][0] ) / S;\n"
"		q.y			  = 0.25f * S;\n"
"		q.z			  = ( R[1][2] + R[2][1] ) / S;\n"
"	}\n"
"	else\n"
"	{\n"
"		const float S = sqrtf( 1.0f + R[2][2] - R[0][0] - R[1][1] ) * 2.0f;\n"
"		q.w			  = ( R[1][0] - R[0][1] ) / S;\n"
"		q.x			  = ( R[0][2] + R[2][0] ) / S;\n"
"		q.y			  = ( R[1][2] + R[2][1] ) / S;\n"
"		q.z			  = 0.25f * S;\n"
"	}\n"
"	return q;\n"
"}\n"
"\n"
// qtToRotationMatrix: fills all nine entries of R from q. The w component of
// the local q2 is set to 0 and never read.
"HIPRT_HOST_DEVICE HIPRT_INLINE void qtToRotationMatrix( const float4& q, float ( &R )[3][3] )\n"
"{\n"
"	const float4 q2{ q.x * q.x, q.y * q.y, q.z * q.z, 0.0f };\n"
"\n"
"	R[0][0] = 1.0f - 2.0f * q2.y - 2.0f * q2.z;\n"
"	R[0][1] = 2.0f * q.x * q.y - 2.0f * q.w * q.z;\n"
"	R[0][2] = 2.0f * q.x * q.z + 2.0f * q.w * q.y;\n"
"\n"
"	R[1][0] = 2.0f * q.x * q.y + 2.0f * q.w * q.z;\n"
"	R[1][1] = 1.0f - 2.0f * q2.x - 2.0f * q2.z;\n"
"	R[1][2] = 2.0f * q.y * q.z - 2.0f * q.w * q.x;\n"
"\n"
"	R[2][0] = 2.0f * q.x * q.z - 2.0f * q.w * q.y;\n"
"	R[2][1] = 2.0f * q.y * q.z + 2.0f * q.w * q.x;\n"
"	R[2][2] = 1.0f - 2.0f * q2.x - 2.0f * q2.y;\n"
"}\n"
"\n"
// qtDot combines the products of all four components; qtNormalize divides q
// by sqrtf( qtDot( q, q ) ) with no guard for a zero-length input.
"HIPRT_HOST_DEVICE HIPRT_INLINE float qtDot( const float4& q0, const float4& q1 )\n"
"{\n"
"	return fmaf( q0.x, q1.x, fmaf( q0.y, q1.y, sumOfProducts( q0.z, q1.z, q0.w, q1.w ) ) );\n"
"}\n"
"\n"
"HIPRT_HOST_DEVICE HIPRT_INLINE float4 qtNormalize( const float4& q ) { return q / sqrtf( qtDot( q, q ) ); }\n"
"\n"
// qtMul: vector part is cross( a, b ) + a.w * b + b.w * a, scalar part is
// a.w * b.w - dot( a, b ), both taken over the x, y, z parts of a and b.
"HIPRT_HOST_DEVICE HIPRT_INLINE float4 qtMul( const float4& a, const float4& b )\n"
"{\n"
"	const float3 c = cross( make_float3( a ), make_float3( b ) ) + a.w * make_float3( b ) + b.w * make_float3( a );\n"
"	return { c.x, c.y, c.z, a.w * b.w - dot( make_float3( a ), make_float3( b ) ) };\n"
"}\n"
"\n"
// qtInvert negates x, y, z and keeps w; it does not divide by the squared
// length of q.
"HIPRT_HOST_DEVICE HIPRT_INLINE float4 qtInvert( const float4& q ) { return float4{ -q.x, -q.y, -q.z, q.w }; }\n"
"\n"
// qtRotate evaluates q * ( p, 0 ) * qtInvert( q ) and returns the x, y, z part;
// qtInvRotate is the same call with qtInvert( q ) in place of q.
"HIPRT_HOST_DEVICE HIPRT_INLINE float3 qtRotate( const float4& q, const float3& p )\n"
"{\n"
"	const float4 qp	  = make_float4( p, 0.0f );\n"
"	const float4 qInv = qtInvert( q );\n"
"	const float4 out  = qtMul( qtMul( q, qp ), qInv );\n"
"	return make_float3( out );\n"
"}\n"
"\n"
"HIPRT_HOST_DEVICE HIPRT_INLINE float3 qtInvRotate( const float4& q, const float3& p ) { return qtRotate( qtInvert( q ), p ); }\n"
"\n"
// qtMix: interpolates between v0 and v1 by t. Both inputs are taken by value
// and normalized locally. Above a dot product of 0.9995 the result is a
// normalized linear blend; otherwise the two inputs are weighted by s0 and s1
// computed from the angle between them.
"HIPRT_HOST_DEVICE HIPRT_INLINE float4 qtMix( float4 v0, float4 v1, const float t )\n"
"{\n"
"	// Only unit quaternions are valid rotations.\n"
"	// Normalize to avoid undefined behavior.\n"
"	v0 = qtNormalize( v0 );\n"
"	v1 = qtNormalize( v1 );\n"
"\n"
"	// Compute the cosine of the angle between the two vectors.\n"
"	float dot = qtDot( v0, v1 );\n"
"\n"
"	// If the dot product is negative, slerp won\'t take\n"
"	// the shorter path. Note that v1 and -v1 are equivalent when\n"
"	// the negation is applied to all four components. Fix by\n"
"	// reversing one quaternion.\n"
"	if ( dot < 0.0f )\n"
"	{\n"
"		v1	= -v1;\n"
"		dot = -dot;\n"
"	}\n"
"\n"
"	static constexpr float DotThreshold = 0.9995;\n"
"	if ( dot > DotThreshold )\n"
"	{\n"
"		// If the inputs are too close for comfort, linearly interpolate\n"
"		// and normalize the result.\n"
"\n"
"		float4 result = v0 + ( v1 - v0 ) * t;\n"
"		result		  = qtNormalize( result );\n"
"		return result;\n"
"	}\n"
"\n"
"	// Since dot is in range [0, DotThreshold], acos is safe\n"
"	const float theta_0		= acosf( dot );	   // theta_0 = angle between input vectors\n"
"	const float theta		= theta_0 * t;	   // theta = angle between v0 and result\n"
"	const float sin_theta	= sinf( theta );   // compute this value only once\n"
"	const float sin_theta_0 = sinf( theta_0 ); // compute this value only once\n"
"\n"
"	const float s0 = cosf( theta ) - dot * sin_theta / sin_theta_0; // == sin(theta_0 - theta) / sin(theta_0)\n"
"	const float s1 = sin_theta / sin_theta_0;\n"
"\n"
"	return ( v0 * s0 ) + ( v1 * s1 );\n"
"}\n"
"} // namespace hiprt\n"
;
// Embedded copy of the HIPRT transform source, kept as one string literal.
// Contents: identity tests, inverse-matrix helpers, the SRTFrame and MatrixFrame
// frame representations, and the Transform class that interpolates frames over time.
//
// This header is generated: the corrections below (unroll guard macro spelled
// __KERNELCC__, and the shear terms of the SRTFrame inverse/normal transforms)
// must also be applied to the source the generator reads, otherwise they are
// lost on regeneration.
static const char* hip_Transform= \
"\n"
"#pragma once\n"
"#include <hiprt/hiprt_types.h>\n"
"#include <hiprt/impl/Aabb.h>\n"
"#include <hiprt/impl/QrDecomposition.h>\n"
"#include <hiprt/impl/Quaternion.h>\n"
"\n"
"namespace hiprt\n"
"{\n"
"struct SRTFrame;\n"
"struct MatrixFrame;\n"
"\n"
// Frame is MatrixFrame when HIPRT_MATRIX_FRAME is defined, SRTFrame otherwise.
"#if defined( HIPRT_MATRIX_FRAME )\n"
"using Frame = MatrixFrame;\n"
"#else\n"
"using Frame = SRTFrame;\n"
"#endif\n"
"\n"
// identitySRT: true only for unit scale, zero shear, zero translation and rotation.w == 1
// (only the w component of the quaternion is inspected).
"HIPRT_HOST_DEVICE HIPRT_INLINE static bool\n"
"identitySRT( const float3& scale, const float3& shear, const float4& rotation, const float3& translation )\n"
"{\n"
"	if ( scale.x != 1.0f || scale.y != 1.0f || scale.z != 1.0f ) return false;\n"
"	if ( shear.x != 0.0f || shear.y != 0.0f || shear.z != 0.0f ) return false;\n"
"	if ( translation.x != 0.0f || translation.y != 0.0f || translation.z != 0.0f ) return false;\n"
"	if ( rotation.w != 1.0f ) return false;\n"
"	return true;\n"
"}\n"
"\n"
// identityMatrix: exact comparison of all twelve entries against the 3x4 identity.
"HIPRT_HOST_DEVICE HIPRT_INLINE static bool identityMatrix( const float ( &matrix )[3][4] )\n"
"{\n"
"	if ( matrix[0][0] != 1.0f || matrix[1][1] != 1.0f || matrix[2][2] != 1.0f ) return false;\n"
"	if ( matrix[0][1] != 0.0f || matrix[0][2] != 0.0f || matrix[0][3] != 0.0f ) return false;\n"
"	if ( matrix[1][0] != 0.0f || matrix[1][2] != 0.0f || matrix[1][3] != 0.0f ) return false;\n"
"	if ( matrix[2][0] != 0.0f || matrix[2][1] != 0.0f || matrix[2][3] != 0.0f ) return false;\n"
"	return true;\n"
"}\n"
"\n"
// SRTToInvMatrix: Ri is the inverse of the upper-triangular scale/shear matrix
// [[sx, shear.x, shear.y], [0, sy, shear.z], [0, 0, sz]]; the 3x3 part of the result is
// Ri times the transposed rotation matrix, and the last column is minus that 3x3 part
// applied to the translation. No guard against zero scale components is present.
"HIPRT_HOST_DEVICE static void SRTToInvMatrix(\n"
"	const float3& scale, const float3& shear, const float4& rotation, const float3& translation, float ( &matrixInv )[3][4] )\n"
"{\n"
"	float Q[3][3];\n"
"	qtToRotationMatrix( rotation, Q );\n"
"\n"
"	float Ri[3][3];\n"
"	Ri[0][0] = 1.0f / scale.x;\n"
"	Ri[1][1] = 1.0f / scale.y;\n"
"	Ri[2][2] = 1.0f / scale.z;\n"
"	Ri[0][1] = -shear.x / ( scale.x * scale.y );\n"
"	Ri[0][2] = ( shear.x * shear.z - shear.y * scale.y ) / ( scale.x * scale.y * scale.z );\n"
"	Ri[1][2] = -shear.z / ( scale.y * scale.z );\n"
"	Ri[1][0] = 0.0f;\n"
"	Ri[2][0] = 0.0f;\n"
"	Ri[2][1] = 0.0f;\n"
"\n"
"#ifdef __KERNELCC__\n"
"#pragma unroll\n"
"#endif\n"
"	for ( uint32_t i = 0; i < 3; ++i )\n"
"	{\n"
"#ifdef __KERNELCC__\n"
"#pragma unroll\n"
"#endif\n"
"		for ( uint32_t j = 0; j < 3; ++j )\n"
"		{\n"
"			matrixInv[i][j] = dot( { Ri[i][0], Ri[i][1], Ri[i][2] }, { Q[j][0], Q[j][1], Q[j][2] } );\n"
"		}\n"
"	}\n"
"\n"
"	matrixInv[0][3] = -dot( { matrixInv[0][0], matrixInv[0][1], matrixInv[0][2] }, translation );\n"
"	matrixInv[1][3] = -dot( { matrixInv[1][0], matrixInv[1][1], matrixInv[1][2] }, translation );\n"
"	matrixInv[2][3] = -dot( { matrixInv[2][0], matrixInv[2][1], matrixInv[2][2] }, translation );\n"
"}\n"
"\n"
// matrixToInvMatrix: cofactor-based inverse of the 3x3 part, then the inverse translation.
// When |det| is below 1e-10 the output is zero-filled instead of inverted.
"HIPRT_HOST_DEVICE static void matrixToInvMatrix( const float ( &matrix )[3][4], float ( &matrixInv )[3][4] )\n"
"{\n"
"	const auto& m = matrix;\n"
"\n"
"	const float det = m[0][0] * ( m[1][1] * m[2][2] - m[1][2] * m[2][1] ) -\n"
"					  m[0][1] * ( m[1][0] * m[2][2] - m[1][2] * m[2][0] ) + m[0][2] * ( m[1][0] * m[2][1] - m[1][1] * m[2][0] );\n"
"\n"
"	constexpr float Epsilon = 1e-10f;\n"
"	if ( fabs( det ) < Epsilon )\n"
"	{\n"
"		memset( &matrixInv[0][0], 0, 12 * sizeof( float ) );\n"
"		return;\n"
"	}\n"
"\n"
"	const float invDet = 1.0f / det;\n"
"\n"
"	matrixInv[0][0] = ( m[1][1] * m[2][2] - m[1][2] * m[2][1] ) * invDet;\n"
"	matrixInv[0][1] = ( m[0][2] * m[2][1] - m[0][1] * m[2][2] ) * invDet;\n"
"	matrixInv[0][2] = ( m[0][1] * m[1][2] - m[0][2] * m[1][1] ) * invDet;\n"
"	matrixInv[1][0] = ( m[1][2] * m[2][0] - m[1][0] * m[2][2] ) * invDet;\n"
"	matrixInv[1][1] = ( m[0][0] * m[2][2] - m[0][2] * m[2][0] ) * invDet;\n"
"	matrixInv[1][2] = ( m[0][2] * m[1][0] - m[0][0] * m[1][2] ) * invDet;\n"
"	matrixInv[2][0] = ( m[1][0] * m[2][1] - m[1][1] * m[2][0] ) * invDet;\n"
"	matrixInv[2][1] = ( m[0][1] * m[2][0] - m[0][0] * m[2][1] ) * invDet;\n"
"	matrixInv[2][2] = ( m[0][0] * m[1][1] - m[0][1] * m[1][0] ) * invDet;\n"
"\n"
"	const float3 translation{ matrix[0][3], matrix[1][3], matrix[2][3] };\n"
"	matrixInv[0][3] = -dot( { matrixInv[0][0], matrixInv[0][1], matrixInv[0][2] }, translation );\n"
"	matrixInv[1][3] = -dot( { matrixInv[1][0], matrixInv[1][1], matrixInv[1][2] }, translation );\n"
"	matrixInv[2][3] = -dot( { matrixInv[2][0], matrixInv[2][1], matrixInv[2][2] }, translation );\n"
"}\n"
"\n"
// SRTFrame: rotation quaternion + scale + shear + translation + time.
// transform() applies, in order: scale/shear (upper-triangular matrix R), rotation,
// translation. The other three methods are written to be consistent with that order.
"struct alignas( 64 ) SRTFrame\n"
"{\n"
"	SRTFrame() = default;\n"
"\n"
"	HIPRT_HOST_DEVICE SRTFrame( const hiprtFrameSRT& frame )\n"
"	{\n"
"		m_rotation	  = qtFromAxisAngle( frame.rotation );\n"
"		m_scale		  = frame.scale;\n"
"		m_shear		  = make_float3( 0.0f );\n"
"		m_translation = frame.translation;\n"
"		m_time		  = frame.time;\n"
"	}\n"
"\n"
// Matrix input: the 3x3 part is split by qr() into Q (converted to the rotation) and R,
// whose diagonal becomes the scale and whose upper off-diagonal entries become the shear.
"	HIPRT_HOST_DEVICE SRTFrame( const hiprtFrameMatrix& frame )\n"
"	{\n"
"		const bool identity = identityMatrix( frame.matrix );\n"
"		if ( identity )\n"
"		{\n"
"			m_scale		  = make_float3( 1.0f );\n"
"			m_shear		  = make_float3( 0.0f );\n"
"			m_translation = make_float3( 0.0f );\n"
"			m_rotation	  = { 0.0f, 0.0f, 0.0f, 1.0f };\n"
"		}\n"
"		else\n"
"		{\n"
"			float QR[3][3], Q[3][3], R[3][3];\n"
"#ifdef __KERNELCC__\n"
"#pragma unroll\n"
"#endif\n"
"			for ( uint32_t i = 0; i < 3; ++i )\n"
"			{\n"
"#ifdef __KERNELCC__\n"
"#pragma unroll\n"
"#endif\n"
"				for ( uint32_t j = 0; j < 3; ++j )\n"
"				{\n"
"					QR[i][j] = frame.matrix[i][j];\n"
"				}\n"
"			}\n"
"\n"
"			qr( &QR[0][0], &Q[0][0], &R[0][0] );\n"
"			m_translation = { frame.matrix[0][3], frame.matrix[1][3], frame.matrix[2][3] };\n"
"			m_rotation	  = qtFromRotationMatrix( Q );\n"
"			m_scale		  = { R[0][0], R[1][1], R[2][2] };\n"
"			m_shear		  = { R[0][1], R[0][2], R[1][2] };\n"
"		}\n"
"		m_time = frame.time;\n"
"	}\n"
"\n"
// interpolate: component-wise mix of scale/shear/translation and qtMix of the rotations;
// m_time of the returned frame keeps its default value.
"	static HIPRT_HOST_DEVICE SRTFrame interpolate( const SRTFrame& f0, const SRTFrame& f1, const float t )\n"
"	{\n"
"		SRTFrame f{};\n"
"		f.m_scale		= mix( f0.m_scale, f1.m_scale, t );\n"
"		f.m_shear		= mix( f0.m_shear, f1.m_shear, t );\n"
"		f.m_translation = mix( f0.m_translation, f1.m_translation, t );\n"
"		f.m_rotation	= qtMix( f0.m_rotation, f1.m_rotation, t );\n"
"		return f;\n"
"	}\n"
"\n"
"	HIPRT_HOST_DEVICE float3 transform( const float3& p ) const\n"
"	{\n"
"		if ( identity() ) return p;\n"
"		float3 result = p;\n"
"		result *= m_scale;\n"
"		result += float3{ p.y * m_shear.x + p.z * m_shear.y, p.z * m_shear.z, 0.0f };\n"
"		result = qtRotate( m_rotation, result );\n"
"		result += m_translation;\n"
"		return result;\n"
"	}\n"
"\n"
// transformVector: rotation applied to R^-T * v (forward substitution on R transposed).
// The y update must use the already scaled x component (v.x / scale.x), not the raw v.x.
"	HIPRT_HOST_DEVICE float3 transformVector( const float3& v ) const\n"
"	{\n"
"		if ( identity() ) return v;\n"
"		float3 result = v;\n"
"		result /= m_scale;\n"
"		result.y -= result.x * m_shear.x / m_scale.y;\n"
"		result.z -= ( m_shear.y * result.x + m_shear.z * result.y ) / m_scale.z;\n"
"		result = qtRotate( m_rotation, result );\n"
"		return result;\n"
"	}\n"
"\n"
// invTransform: undo translation, undo rotation, then back-substitute through R.
// The y update must use the solved z component of the rotated point, matching
// Ri[1][2] = -shear.z / ( scale.y * scale.z ) in SRTToInvMatrix; the raw input p.z
// is in the wrong space.
"	HIPRT_HOST_DEVICE float3 invTransform( const float3& p ) const\n"
"	{\n"
"		if ( identity() ) return p;\n"
"		float3 result = p;\n"
"		result -= m_translation;\n"
"		result = qtInvRotate( m_rotation, result );\n"
"		result /= m_scale;\n"
"		result.y -= result.z * m_shear.z / m_scale.y;\n"
"		result.x -= ( m_shear.x * result.y + m_shear.y * result.z ) / m_scale.x;\n"
"		return result;\n"
"	}\n"
"\n"
// invTransformVector: R^T applied to the inverse-rotated vector. The shear terms must be
// built from the rotated vector, mirroring MatrixFrame::invTransformVector, which applies
// the transposed matrix.
"	HIPRT_HOST_DEVICE float3 invTransformVector( const float3& v ) const\n"
"	{\n"
"		if ( identity() ) return v;\n"
"		const float3 rotated = qtInvRotate( m_rotation, v );\n"
"		float3		 result	 = rotated;\n"
"		result *= m_scale;\n"
"		result += float3{ 0.0f, rotated.x * m_shear.x, rotated.x * m_shear.y + rotated.y * m_shear.z };\n"
"		return result;\n"
"	}\n"
"\n"
"	HIPRT_HOST_DEVICE bool identity() const { return identitySRT( m_scale, m_shear, m_rotation, m_translation ); }\n"
"\n"
"	float4 m_rotation{ 0.0f, 0.0f, 0.0f, 1.0f };\n"
"	float3 m_scale{ 1.0f, 1.0f, 1.0f };\n"
"	float3 m_shear{};\n"
"	float3 m_translation{};\n"
"	float  m_time{};\n"
"};\n"
"HIPRT_STATIC_ASSERT( sizeof( SRTFrame ) == 64 );\n"
"\n"
// MatrixFrame: a 3x4 row-major matrix plus time.
"struct alignas( 64 ) MatrixFrame\n"
"{\n"
"	MatrixFrame() = default;\n"
"\n"
// SRT input: rotation matrix with its columns multiplied by the scale, translation in the
// last column. An identity SRT is written out as the identity matrix directly.
"	HIPRT_HOST_DEVICE MatrixFrame( const hiprtFrameSRT& frame )\n"
"	{\n"
"		const float4 rotation = qtFromAxisAngle( frame.rotation );\n"
"		const bool	 identity = identitySRT( frame.scale, make_float3( 0.0f ), rotation, frame.translation );\n"
"		if ( identity )\n"
"		{\n"
"#ifdef __KERNELCC__\n"
"#pragma unroll\n"
"#endif\n"
"			for ( uint32_t i = 0; i < 3; ++i )\n"
"			{\n"
"#ifdef __KERNELCC__\n"
"#pragma unroll\n"
"#endif\n"
"				for ( uint32_t j = 0; j < 4; ++j )\n"
"				{\n"
"					if ( i == j )\n"
"						m_matrix[i][j] = 1.0f;\n"
"					else\n"
"						m_matrix[i][j] = 0.0f;\n"
"				}\n"
"			}\n"
"		}\n"
"		else\n"
"		{\n"
"			float Q[3][3];\n"
"			qtToRotationMatrix( rotation, Q );\n"
"\n"
"#ifdef __KERNELCC__\n"
"#pragma unroll\n"
"#endif\n"
"			for ( uint32_t i = 0; i < 3; ++i )\n"
"			{\n"
"				m_matrix[i][0] = Q[i][0] * frame.scale.x;\n"
"				m_matrix[i][1] = Q[i][1] * frame.scale.y;\n"
"				m_matrix[i][2] = Q[i][2] * frame.scale.z;\n"
"			}\n"
"\n"
"			m_matrix[0][3] = frame.translation.x;\n"
"			m_matrix[1][3] = frame.translation.y;\n"
"			m_matrix[2][3] = frame.translation.z;\n"
"		}\n"
"		m_time = frame.time;\n"
"	}\n"
"\n"
"	HIPRT_HOST_DEVICE MatrixFrame( const hiprtFrameMatrix& frame )\n"
"	{\n"
"		m_time = frame.time;\n"
"		memcpy( &m_matrix[0][0], &frame.matrix[0][0], 12 * sizeof( float ) );\n"
"	}\n"
"\n"
// interpolate: entry-wise mix of the two matrices; m_time keeps its default value.
"	HIPRT_HOST_DEVICE static MatrixFrame interpolate( const MatrixFrame& f0, const MatrixFrame& f1, const float t )\n"
"	{\n"
"		MatrixFrame f{};\n"
"#ifdef __KERNELCC__\n"
"#pragma unroll\n"
"#endif\n"
"		for ( uint32_t i = 0; i < 3; ++i )\n"
"		{\n"
"#ifdef __KERNELCC__\n"
"#pragma unroll\n"
"#endif\n"
"			for ( uint32_t j = 0; j < 4; ++j )\n"
"			{\n"
"				f.m_matrix[i][j] = mix( f0.m_matrix[i][j], f1.m_matrix[i][j], t );\n"
"			}\n"
"		}\n"
"		return f;\n"
"	}\n"
"\n"
"	HIPRT_HOST_DEVICE float3 transform( const float3& p ) const\n"
"	{\n"
"		if ( identity() ) return p;\n"
"		float3 result{};\n"
"		result.x = dot( { m_matrix[0][0], m_matrix[0][1], m_matrix[0][2] }, p );\n"
"		result.y = dot( { m_matrix[1][0], m_matrix[1][1], m_matrix[1][2] }, p );\n"
"		result.z = dot( { m_matrix[2][0], m_matrix[2][1], m_matrix[2][2] }, p );\n"
"		result += { m_matrix[0][3], m_matrix[1][3], m_matrix[2][3] };\n"
"		return result;\n"
"	}\n"
"\n"
// transformVector: multiplies v by the transpose of the inverse 3x3 part.
"	HIPRT_HOST_DEVICE float3 transformVector( const float3& v ) const\n"
"	{\n"
"		if ( identity() ) return v;\n"
"		float matrixInv[3][4];\n"
"		matrixToInvMatrix( m_matrix, matrixInv );\n"
"		float3 result{};\n"
"		result.x = dot( { matrixInv[0][0], matrixInv[1][0], matrixInv[2][0] }, v );\n"
"		result.y = dot( { matrixInv[0][1], matrixInv[1][1], matrixInv[2][1] }, v );\n"
"		result.z = dot( { matrixInv[0][2], matrixInv[1][2], matrixInv[2][2] }, v );\n"
"		return result;\n"
"	}\n"
"\n"
"	HIPRT_HOST_DEVICE float3 invTransform( const float3& p ) const\n"
"	{\n"
"		if ( identity() ) return p;\n"
"		float matrixInv[3][4];\n"
"		matrixToInvMatrix( m_matrix, matrixInv );\n"
"		float3 result{};\n"
"		result.x = dot( { matrixInv[0][0], matrixInv[0][1], matrixInv[0][2] }, p );\n"
"		result.y = dot( { matrixInv[1][0], matrixInv[1][1], matrixInv[1][2] }, p );\n"
"		result.z = dot( { matrixInv[2][0], matrixInv[2][1], matrixInv[2][2] }, p );\n"
"		result += { matrixInv[0][3], matrixInv[1][3], matrixInv[2][3] };\n"
"		return result;\n"
"	}\n"
"\n"
// invTransformVector: multiplies v by the transpose of the 3x3 part.
"	HIPRT_HOST_DEVICE float3 invTransformVector( const float3& v ) const\n"
"	{\n"
"		if ( identity() ) return v;\n"
"		float3 result{};\n"
"		result.x = dot( { m_matrix[0][0], m_matrix[1][0], m_matrix[2][0] }, v );\n"
"		result.y = dot( { m_matrix[0][1], m_matrix[1][1], m_matrix[2][1] }, v );\n"
"		result.z = dot( { m_matrix[0][2], m_matrix[1][2], m_matrix[2][2] }, v );\n"
"		return result;\n"
"	}\n"
"\n"
"	HIPRT_HOST_DEVICE bool identity() const { return identityMatrix( m_matrix ); }\n"
"\n"
"	float m_matrix[3][4] = { { 1.0f, 0.0f, 0.0f, 0.0f }, { 0.0f, 1.0f, 0.0f, 0.0f }, { 0.0f, 0.0f, 1.0f, 0.0f } };\n"
"	float m_time{};\n"
"};\n"
"HIPRT_STATIC_ASSERT( sizeof( MatrixFrame ) == 64 );\n"
"\n"
// computeInvTransformMatrix (SRTFrame): writes the inverse 3x4 matrix of the frame and
// returns true when the frame is the identity (the identity matrix is written then).
"HIPRT_HOST_DEVICE static bool computeInvTransformMatrix( const SRTFrame& frame, float ( &matrixInv )[3][4] )\n"
"{\n"
"	if ( identitySRT( frame.m_scale, frame.m_shear, frame.m_rotation, frame.m_translation ) )\n"
"	{\n"
"#ifdef __KERNELCC__\n"
"#pragma unroll\n"
"#endif\n"
"		for ( uint32_t i = 0; i < 3; ++i )\n"
"		{\n"
"#ifdef __KERNELCC__\n"
"#pragma unroll\n"
"#endif\n"
"			for ( uint32_t j = 0; j < 4; ++j )\n"
"			{\n"
"				if ( i == j )\n"
"					matrixInv[i][j] = 1.0f;\n"
"				else\n"
"					matrixInv[i][j] = 0.0f;\n"
"			}\n"
"		}\n"
"		return true;\n"
"	}\n"
"\n"
"	SRTToInvMatrix( frame.m_scale, frame.m_shear, frame.m_rotation, frame.m_translation, matrixInv );\n"
"\n"
"	return false;\n"
"}\n"
"\n"
// computeInvTransformMatrix (MatrixFrame): same contract; an identity frame is copied as is.
"HIPRT_HOST_DEVICE static bool computeInvTransformMatrix( const MatrixFrame& frame, float ( &matrixInv )[3][4] )\n"
"{\n"
"	if ( identityMatrix( frame.m_matrix ) )\n"
"	{\n"
"		memcpy( &matrixInv[0][0], &frame.m_matrix[0][0], 12 * sizeof( float ) );\n"
"		return true;\n"
"	}\n"
"\n"
"	matrixToInvMatrix( frame.m_matrix, matrixInv );\n"
"\n"
"	return false;\n"
"}\n"
"\n"
// Transform: view over frameCount frames starting at frameData + frameIndex
// (m_frames stays nullptr when frameData is nullptr).
"class Transform\n"
"{\n"
"public:\n"
"	HIPRT_HOST_DEVICE Transform( const Frame* frameData, uint32_t frameIndex, uint32_t frameCount )\n"
"		: m_frameCount( frameCount ), m_frames( nullptr )\n"
"	{\n"
"		if ( frameData != nullptr ) m_frames = frameData + frameIndex;\n"
"	}\n"
"\n"
// interpolateFrames: clamps to the first/last frame outside their time range, otherwise
// interpolates inside the first interval [f0.m_time, f1.m_time] that contains `time`.
// No frames yields a default-constructed Frame.
"	HIPRT_HOST_DEVICE Frame interpolateFrames( float time ) const\n"
"	{\n"
"		if ( m_frameCount == 0 || m_frames == nullptr ) return Frame();\n"
"\n"
"		Frame f0 = m_frames[0];\n"
"		if ( m_frameCount == 1 || time == 0.0f || time <= f0.m_time ) return f0;\n"
"\n"
"		Frame f1 = m_frames[m_frameCount - 1];\n"
"		if ( time >= f1.m_time ) return f1;\n"
"\n"
"		for ( uint32_t i = 1; i < m_frameCount; ++i )\n"
"		{\n"
"			f1 = m_frames[i];\n"
"			if ( time >= f0.m_time && time <= f1.m_time ) break;\n"
"			f0 = f1;\n"
"		}\n"
"\n"
"		const float t = ( time - f0.m_time ) / ( f1.m_time - f0.m_time );\n"
"\n"
"		return Frame::interpolate( f0, f1, t );\n"
"	}\n"
"\n"
// transformRay: applies the inverse frame transform to the origin and to origin + direction;
// the new direction is their difference, so it is not re-normalized. minT/maxT are copied.
"	HIPRT_HOST_DEVICE hiprtRay transformRay( const hiprtRay& ray, float time ) const\n"
"	{\n"
"		hiprtRay	outRay;\n"
"		const Frame frame = interpolateFrames( time );\n"
"		if ( frame.identity() ) return ray;\n"
"		outRay.origin	 = frame.invTransform( ray.origin );\n"
"		outRay.direction = frame.invTransform( ray.origin + ray.direction );\n"
"		outRay.direction = outRay.direction - outRay.origin;\n"
"		outRay.minT		 = ray.minT;\n"
"		outRay.maxT		 = ray.maxT;\n"
"		return outRay;\n"
"	}\n"
"\n"
"	HIPRT_HOST_DEVICE float3 transformNormal( const float3& normal, float time ) const\n"
"	{\n"
"		const Frame frame = interpolateFrames( time );\n"
"		return frame.transformVector( normal );\n"
"	}\n"
"\n"
// boundPointMotion: box around p transformed by every frame plus Steps (3) evenly spaced
// interpolated frames between each consecutive pair, i.e. a sampled bound of the motion.
"	HIPRT_HOST_DEVICE Aabb boundPointMotion( const float3& p ) const\n"
"	{\n"
"		Aabb outAabb;\n"
"\n"
"		if ( m_frameCount == 0 || m_frames == nullptr )\n"
"		{\n"
"			outAabb.grow( p );\n"
"			return outAabb;\n"
"		}\n"
"\n"
"		Frame f0 = m_frames[0];\n"
"		outAabb.grow( f0.transform( p ) );\n"
"\n"
"		if ( m_frameCount == 1 ) return outAabb;\n"
"\n"
"		constexpr uint32_t Steps = 3;\n"
"		constexpr float	   Delta = 1.0f / float( Steps + 1 );\n"
"\n"
"		Frame f1;\n"
"		for ( uint32_t i = 1; i < m_frameCount; ++i )\n"
"		{\n"
"			f1		= m_frames[i];\n"
"			float t = Delta;\n"
"			for ( uint32_t j = 1; j <= Steps; ++j )\n"
"			{\n"
"				Frame f = Frame::interpolate( f0, f1, t );\n"
"				outAabb.grow( f.transform( p ) );\n"
"				t += Delta;\n"
"			}\n"
"			f0 = f1;\n"
"			outAabb.grow( f0.transform( p ) );\n"
"		}\n"
"\n"
"		return outAabb;\n"
"	}\n"
"\n"
// motionBounds: union of boundPointMotion over the eight corners of the input box.
"	HIPRT_HOST_DEVICE Aabb motionBounds( const Aabb& aabb ) const\n"
"	{\n"
"		const float3 p0 = aabb.m_min;\n"
"		const float3 p1 = { aabb.m_min.x, aabb.m_min.y, aabb.m_max.z };\n"
"		const float3 p2 = { aabb.m_min.x, aabb.m_max.y, aabb.m_min.z };\n"
"		const float3 p3 = { aabb.m_min.x, aabb.m_max.y, aabb.m_max.z };\n"
"		const float3 p4 = { aabb.m_max.x, aabb.m_min.y, aabb.m_min.z };\n"
"		const float3 p5 = { aabb.m_max.x, aabb.m_min.y, aabb.m_max.z };\n"
"		const float3 p6 = { aabb.m_max.x, aabb.m_max.y, aabb.m_min.z };\n"
"		const float3 p7 = aabb.m_max;\n"
"\n"
"		Aabb outAabb;\n"
"		outAabb.grow( boundPointMotion( p0 ) );\n"
"		outAabb.grow( boundPointMotion( p1 ) );\n"
"		outAabb.grow( boundPointMotion( p2 ) );\n"
"		outAabb.grow( boundPointMotion( p3 ) );\n"
"		outAabb.grow( boundPointMotion( p4 ) );\n"
"		outAabb.grow( boundPointMotion( p5 ) );\n"
"		outAabb.grow( boundPointMotion( p6 ) );\n"
"		outAabb.grow( boundPointMotion( p7 ) );\n"
"		return outAabb;\n"
"	}\n"
"\n"
"private:\n"
"	uint32_t	 m_frameCount;\n"
"	const Frame* m_frames;\n"
"};\n"
"} // namespace hiprt\n"
;
// Embedded copy of the HIPRT Instance source, kept as one string literal.
// It declares hiprt::Instance: a frame index, a 31-bit frame count sharing a 32-bit word
// with a 1-bit type field, and a union holding either a GeomHeader* or a SceneHeader*.
// NOTE(review): m_type presumably selects which union member is valid - confirm against users.
// The embedded static assert pins the struct size to 16 bytes.
static const char* hip_Instance= \
"\n"
"#pragma once\n"
"#include <hiprt/hiprt_types.h>\n"
"#include <hiprt/impl/Header.h>\n"
"\n"
"namespace hiprt\n"
"{\n"
"struct alignas( 16 ) Instance\n"
"{\n"
"	uint32_t m_frameIndex;\n"
"	uint32_t m_frameCount : 31;\n"
"	uint32_t m_type : 1;\n"
"	union\n"
"	{\n"
"		GeomHeader*	 m_geometry;\n"
"		SceneHeader* m_scene;\n"
"	};\n"
"};\n"
"HIPRT_STATIC_ASSERT( sizeof( Instance ) == 16 );\n"
"} // namespace hiprt\n"
;
// Embedded copy of the HIPRT InstanceList source, kept as one string literal.
// InstanceList<ApiFrame> wraps the arrays of a hiprtSceneBuildInput (instances, transform
// headers, API frames, masks) and offers per-instance fetch helpers. The bounding-box
// helpers are compiled only when __KERNELCC__ is defined.
static const char* hip_InstanceList= \
"\n"
"#pragma once\n"
"#include <hiprt/hiprt_types.h>\n"
"#include <hiprt/impl/Header.h>\n"
"\n"
"namespace hiprt\n"
"{\n"
"template <typename ApiFrame>\n"
"class InstanceList\n"
"{\n"
"public:\n"
// The constructor only reinterprets the input pointers; m_frames stays nullptr until
// setFrames() is called.
"	HIPRT_HOST_DEVICE InstanceList( const hiprtSceneBuildInput& input )\n"
"		: m_instanceCount( input.instanceCount ), m_frameCount( input.frameCount )\n"
"	{\n"
"		m_instances		   = reinterpret_cast<hiprtInstance*>( input.instances );\n"
"		m_transformHeaders = reinterpret_cast<hiprtTransformHeader*>( input.instanceTransformHeaders );\n"
"		m_apiFrames		   = reinterpret_cast<ApiFrame*>( input.instanceFrames );\n"
"		m_masks			   = reinterpret_cast<uint32_t*>( input.instanceMasks );\n"
"	}\n"
"\n"
"#if defined( __KERNELCC__ )\n"
// fetchPrimNode: builds the InstanceNode for one instance. When InstanceNode is
// HwInstanceNode, the child boxes of the referenced root node are copied and then merged
// pairwise (always the pair whose combined box has the smallest area) until at most
// HwInstanceNode::BranchingFactor boxes remain; every child index is set to RootIndex.
"	HIPRT_DEVICE InstanceNode fetchPrimNode( const uint32_t index ) const\n"
"	{\n"
"		const hiprtInstance		   instance	 = fetchInstance( index );\n"
"		const hiprtTransformHeader transform = fetchTransformHeader( index );\n"
"		const Frame				   frame	 = fetchFrame( transform.frameIndex );\n"
"		const uint32_t			   mask		 = fetchMask( index );\n"
"\n"
"		InstanceNode instanceNode{};\n"
"		if constexpr ( is_same<InstanceNode, HwInstanceNode>::value )\n"
"		{\n"
"			const BoxNode root = instance.type == hiprtInstanceTypeScene\n"
"									 ? reinterpret_cast<SceneHeader*>( instance.scene )->m_boxNodes[0]\n"
"									 : reinterpret_cast<GeomHeader*>( instance.geometry )->m_boxNodes[0];\n"
"\n"
"			Aabb	 childBoxes[BranchingFactor];\n"
"			uint32_t childIndices[BranchingFactor];\n"
"			uint32_t childCount = root.getChildCount();\n"
"\n"
"			for ( uint32_t i = 0; i < childCount; ++i )\n"
"				childBoxes[i] = root.getChildBox( i );\n"
"\n"
"			while ( childCount > HwInstanceNode::BranchingFactor )\n"
"			{\n"
"				float	 minArea = FltMax;\n"
"				uint32_t min_i	 = InvalidValue;\n"
"				uint32_t min_j	 = InvalidValue;\n"
"\n"
"				for ( uint32_t i = 0; i < childCount; ++i )\n"
"				{\n"
"					for ( uint32_t j = i + 1; j < childCount; ++j )\n"
"					{\n"
"						const Aabb	box( childBoxes[i], childBoxes[j] );\n"
"						const float area = box.area();\n"
"						if ( minArea > area )\n"
"						{\n"
"							minArea = area;\n"
"							min_i	= i;\n"
"							min_j	= j;\n"
"						}\n"
"					}\n"
"				}\n"
"\n"
// The merged box replaces slot min_i; slot min_j is refilled with the last box.
"				childBoxes[min_i] = Aabb( childBoxes[min_i], childBoxes[min_j] );\n"
"				childBoxes[min_j] = childBoxes[--childCount];\n"
"			}\n"
"\n"
"			for ( uint32_t i = 0; i < childCount; ++i )\n"
"				childIndices[i] = RootIndex;\n"
"\n"
"			instanceNode.init( index, mask, frame, instance, transform, childCount, childIndices, childBoxes );\n"
"		}\n"
"		else\n"
"		{\n"
"			instanceNode.init( index, mask, frame, instance, transform, InvalidValue, nullptr, nullptr );\n"
"		}\n"
"\n"
"		return instanceNode;\n"
"	}\n"
"\n"
// fetchAabb: union of Transform::motionBounds over the child boxes of the instance's root node.
"	HIPRT_DEVICE Aabb fetchAabb( const uint32_t index ) const\n"
"	{\n"
"		const hiprtTransformHeader header = fetchTransformHeader( index );\n"
"		const Transform			   t( m_frames, header.frameIndex, header.frameCount );\n"
"		const hiprtInstance		   instance = fetchInstance( index );\n"
"		const BoxNode*			   boxNodes = instance.type == hiprtInstanceTypeScene\n"
"												  ? reinterpret_cast<SceneHeader*>( instance.scene )->m_boxNodes\n"
"												  : reinterpret_cast<GeomHeader*>( instance.geometry )->m_boxNodes;\n"
"		const BoxNode&			   root		= boxNodes[0];\n"
"\n"
"		Aabb aabb;\n"
"		for ( uint32_t i = 0; i < root.getChildCount(); ++i )\n"
"		{\n"
"			const Aabb childBox = root.getChildBox( i );\n"
"			aabb.grow( t.motionBounds( childBox ) );\n"
"		}\n"
"		return aabb;\n"
"	}\n"
"\n"
"	HIPRT_DEVICE float3 fetchCenter( const uint32_t index ) const { return fetchAabb( index ).center(); }\n"
"\n"
// split: distributes the motion-bounded child boxes to the left/right of `position` on
// `axis`; boxes straddling the plane are clipped into both sides. Both results are then
// cut at the plane and intersected with `box`.
"	HIPRT_DEVICE void split(\n"
"		const uint32_t index, const uint32_t axis, const float position, const Aabb& box, Aabb& leftBox, Aabb& rightBox ) const\n"
"	{\n"
"		const hiprtTransformHeader header = fetchTransformHeader( index );\n"
"		const Transform			   t( m_frames, header.frameIndex, header.frameCount );\n"
"		const hiprtInstance		   instance = fetchInstance( index );\n"
"		const BoxNode*			   boxNodes = instance.type == hiprtInstanceTypeScene\n"
"												  ? reinterpret_cast<SceneHeader*>( instance.scene )->m_boxNodes\n"
"												  : reinterpret_cast<GeomHeader*>( instance.geometry )->m_boxNodes;\n"
"		const BoxNode&			   root		= boxNodes[0];\n"
"\n"
"		leftBox = rightBox = Aabb();\n"
"		for ( uint32_t i = 0; i < root.getChildCount(); ++i )\n"
"		{\n"
"			const Aabb	childBox = t.motionBounds( root.getChildBox( i ) );\n"
"			const float mn		 = ( &childBox.m_min.x )[axis];\n"
"			const float mx		 = ( &childBox.m_max.x )[axis];\n"
"			if ( position >= mx )\n"
"			{\n"
"				leftBox.grow( childBox );\n"
"			}\n"
"			else if ( position <= mn )\n"
"			{\n"
"				rightBox.grow( childBox );\n"
"			}\n"
"			else\n"
"			{\n"
"				Aabb leftChildBox				 = childBox;\n"
"				Aabb rightChildBox				 = childBox;\n"
"				( &leftChildBox.m_max.x )[axis]	 = position;\n"
"				( &rightChildBox.m_min.x )[axis] = position;\n"
"				leftBox.grow( leftChildBox );\n"
"				rightBox.grow( rightChildBox );\n"
"			}\n"
"		}\n"
"\n"
"		( &leftBox.m_max.x )[axis]	= position;\n"
"		( &rightBox.m_min.x )[axis] = position;\n"
"		leftBox.intersect( box );\n"
"		rightBox.intersect( box );\n"
"	}\n"
"\n"
// fetchObb: grows an Obb from the child motion bounds clipped to `box`; if nothing valid
// was added, the Obb is grown by `box` itself.
"	HIPRT_DEVICE Obb fetchObb( const uint32_t index, const uint32_t matrixIndex, const Aabb& box ) const\n"
"	{\n"
"		const hiprtTransformHeader header = fetchTransformHeader( index );\n"
"		const Transform			   t( m_frames, header.frameIndex, header.frameCount );\n"
"		const hiprtInstance		   instance = fetchInstance( index );\n"
"		const BoxNode*			   boxNodes = instance.type == hiprtInstanceTypeScene\n"
"												  ? reinterpret_cast<SceneHeader*>( instance.scene )->m_boxNodes\n"
"												  : reinterpret_cast<GeomHeader*>( instance.geometry )->m_boxNodes;\n"
"		const BoxNode&			   root		= boxNodes[0];\n"
"\n"
"		Obb obb( matrixIndex );\n"
"		for ( uint32_t i = 0; i < root.getChildCount(); ++i )\n"
"		{\n"
"			const Aabb childBox = t.motionBounds( root.getChildBox( i ) ).intersect( box );\n"
"			if ( childBox.valid() ) obb.grow( childBox );\n"
"		}\n"
"\n"
"		if ( !obb.valid() ) obb.grow( box );\n"
"\n"
"		return obb;\n"
"	}\n"
"#endif\n"
"\n"
// fetchMask: FullRayMask when no mask array was supplied.
"	HIPRT_HOST_DEVICE uint32_t fetchMask( const uint32_t index ) const\n"
"	{\n"
"		if ( m_masks == nullptr ) return FullRayMask;\n"
"		return m_masks[index];\n"
"	}\n"
"\n"
"	HIPRT_HOST_DEVICE hiprtInstance fetchInstance( const uint32_t index ) const { return m_instances[index]; }\n"
"\n"
// fetchTransformHeader: without a header array, instance i maps to the header { i, 1 }.
"	HIPRT_HOST_DEVICE hiprtTransformHeader fetchTransformHeader( uint32_t index ) const\n"
"	{\n"
"		if ( m_transformHeaders == nullptr ) return hiprtTransformHeader{ index, 1 };\n"
"		return m_transformHeaders[index];\n"
"	}\n"
"\n"
// fetchFrame: default-constructed Frame when there are no frames or no frame storage.
"	HIPRT_HOST_DEVICE Frame fetchFrame( const uint32_t index ) const\n"
"	{\n"
"		if ( m_frameCount == 0 || m_apiFrames == nullptr || m_frames == nullptr ) return Frame();\n"
"		return m_frames[index];\n"
"	}\n"
"\n"
// convertFrame: converts one API frame into the internal Frame array (no-op without storage).
"	HIPRT_HOST_DEVICE void convertFrame( const uint32_t index )\n"
"	{\n"
"		if ( m_frameCount > 0 && m_apiFrames != nullptr && m_frames != nullptr ) m_frames[index] = Frame( m_apiFrames[index] );\n"
"	}\n"
"\n"
"	HIPRT_HOST_DEVICE bool computeInvTransformMatrix( const uint32_t index, float ( &matrix )[3][4] ) const\n"
"	{\n"
"		const Frame frame = fetchFrame( index );\n"
"		return hiprt::computeInvTransformMatrix( frame, matrix );\n"
"	}\n"
"\n"
"	HIPRT_HOST_DEVICE uint32_t getCount() const { return m_instanceCount; }\n"
"\n"
"	HIPRT_HOST_DEVICE uint32_t getFrameCount() const { return m_frameCount; }\n"
"\n"
"	HIPRT_HOST_DEVICE void setFrames( Frame* frames ) { m_frames = frames; }\n"
"\n"
"private:\n"
"	hiprtInstance*		  m_instances;\n"
"	hiprtTransformHeader* m_transformHeaders;\n"
"	Frame*				  m_frames = nullptr;\n"
"	ApiFrame*			  m_apiFrames;\n"
"	uint32_t*			  m_masks;\n"
"	uint32_t			  m_instanceCount;\n"
"	uint32_t			  m_frameCount;\n"
"};\n"
"} // namespace hiprt\n"
;
// Embedded copy of the HIPRT Morton-code source, kept as one string literal.
// Contents: bit-spreading helpers, a fixed 30-bit 3D Morton code, an extended Morton code
// that distributes the 30 bits across axes according to the scene extent, and a helper
// that XORs (key, index) pairs of two sorted entries.
static const char* hip_MortonCode= \
"\n"
"#pragma once\n"
"#include <hiprt/hiprt_common.h>\n"
"\n"
"namespace hiprt\n"
"{\n"
// expandBits2D: keeps the low 16 bits of v and spreads them so that one zero bit separates
// consecutive input bits (final mask 0x55555555).
"HIPRT_DEVICE HIPRT_INLINE uint32_t expandBits2D( uint32_t v )\n"
"{\n"
"	v &= 0x0000ffff;					 /* w = ---- ---- ---- ---- fedc ba98 7654 3210 */\n"
"	v = ( v ^ ( v << 8 ) ) & 0x00ff00ff; /* w = ---- ---- fedc ba98 ---- ---- 7654 3210 */\n"
"	v = ( v ^ ( v << 4 ) ) & 0x0f0f0f0f; /* w = ---- fedc ---- ba98 ---- 7654 ---- 3210 */\n"
"	v = ( v ^ ( v << 2 ) ) & 0x33333333; /* w = --fe --dc --ba --98 --76 --54 --32 --10 */\n"
"	v = ( v ^ ( v << 1 ) ) & 0x55555555; /* w = -f-e -d-c -b-a -9-8 -7-6 -5-4 -3-2 -1-0 */\n"
"	return v;\n"
"}\n"
"\n"
// expandBits3D: spreads the input bits so that two zero bits separate consecutive bits
// (final mask 0x49249249). NOTE(review): presumably expects a value below 1024 - confirm.
"HIPRT_DEVICE HIPRT_INLINE uint32_t expandBits3D( uint32_t v )\n"
"{\n"
"	v = ( v * 0x00010001u ) & 0xFF0000FFu;\n"
"	v = ( v * 0x00000101u ) & 0x0F00F00Fu;\n"
"	v = ( v * 0x00000011u ) & 0xC30C30C3u;\n"
"	v = ( v * 0x00000005u ) & 0x49249249u;\n"
"	return v;\n"
"}\n"
"\n"
// computeMortonCode: each coordinate is scaled by 1024, clamped to [0, 1023], expanded and
// interleaved with x in the most significant position of every bit triple.
"HIPRT_DEVICE HIPRT_INLINE uint32_t computeMortonCode( float3 normalizedPos )\n"
"{\n"
"	float	 x	= min( max( normalizedPos.x * 1024.0f, 0.0f ), 1023.0f );\n"
"	float	 y	= min( max( normalizedPos.y * 1024.0f, 0.0f ), 1023.0f );\n"
"	float	 z	= min( max( normalizedPos.z * 1024.0f, 0.0f ), 1023.0f );\n"
"	uint32_t xx = expandBits3D( uint32_t( x ) );\n"
"	uint32_t yy = expandBits3D( uint32_t( y ) );\n"
"	uint32_t zz = expandBits3D( uint32_t( z ) );\n"
"	return xx * 4 + yy * 2 + zz;\n"
"}\n"
"\n"
// computeExtendedMortonCode: 30-bit code whose per-axis bit counts depend on sceneExtent.
// startAxis.x/.y/.z name the axes in the order chosen by the extent comparisons below, and
// numPrebits hold truncated log2 ratios between those extents.
// NOTE(review): the extent ratios are not guarded against zero extents before log2.
"HIPRT_DEVICE HIPRT_INLINE uint32_t computeExtendedMortonCode( float3 normalizedPos, float3 sceneExtent )\n"
"{\n"
"	const uint32_t numMortonBits = 30;\n"
"	int3		   numBits		 = make_int3( 0 );\n"
"\n"
"	int3 numPrebits;\n"
"	int3 startAxis;\n"
"\n"
"	// Find the largest start axis and how many prebits are needed between largest and two other axes\n"
"	if ( sceneExtent.x < sceneExtent.y )\n"
"	{\n"
"		if ( sceneExtent.x < sceneExtent.z )\n"
"		{\n"
"			if ( sceneExtent.y < sceneExtent.z )\n"
"			{\n"
"				// z, y, x\n"
"				startAxis.x	 = 2;\n"
"				numPrebits.x = log2( sceneExtent.z / sceneExtent.y );\n"
"\n"
"				startAxis.y	 = 1;\n"
"				numPrebits.y = log2( sceneExtent.y / sceneExtent.x );\n"
"\n"
"				startAxis.z	 = 0;\n"
"				numPrebits.z = log2( sceneExtent.z / sceneExtent.x );\n"
"			}\n"
"			else\n"
"			{\n"
"				// y, z, x\n"
"				startAxis.x	 = 1;\n"
"				numPrebits.x = log2( sceneExtent.y / sceneExtent.z );\n"
"\n"
"				startAxis.y	 = 2;\n"
"				numPrebits.y = log2( sceneExtent.z / sceneExtent.x );\n"
"\n"
"				startAxis.z	 = 0;\n"
"				numPrebits.z = log2( sceneExtent.y / sceneExtent.x );\n"
"			}\n"
"		}\n"
"		else\n"
"		{\n"
"			// y, x, z\n"
"			startAxis.x	 = 1;\n"
"			numPrebits.x = log2( sceneExtent.y / sceneExtent.x );\n"
"\n"
"			startAxis.y	 = 0;\n"
"			numPrebits.y = log2( sceneExtent.x / sceneExtent.z );\n"
"\n"
"			startAxis.z	 = 2;\n"
"			numPrebits.z = log2( sceneExtent.y / sceneExtent.z );\n"
"		}\n"
"	}\n"
"	else\n"
"	{\n"
"		if ( sceneExtent.y < sceneExtent.z )\n"
"		{\n"
"			if ( sceneExtent.x < sceneExtent.z )\n"
"			{\n"
"				// z, x, y\n"
"				startAxis.x	 = 2;\n"
"				numPrebits.x = log2( sceneExtent.z / sceneExtent.x );\n"
"\n"
"				startAxis.y	 = 0;\n"
"				numPrebits.y = log2( sceneExtent.x / sceneExtent.y );\n"
"\n"
"				startAxis.z	 = 1;\n"
"				numPrebits.z = log2( sceneExtent.z / sceneExtent.y );\n"
"			}\n"
"			else\n"
"			{\n"
"				// x, z, y\n"
"				startAxis.x	 = 0;\n"
"				numPrebits.x = log2( sceneExtent.x / sceneExtent.z );\n"
"\n"
"				startAxis.y	 = 2;\n"
"				numPrebits.y = log2( sceneExtent.z / sceneExtent.y );\n"
"\n"
"				startAxis.z	 = 1;\n"
"				numPrebits.z = log2( sceneExtent.x / sceneExtent.y );\n"
"			}\n"
"		}\n"
"		else\n"
"		{\n"
"			// x, y, z\n"
"			startAxis.x	 = 0;\n"
"			numPrebits.x = log2( sceneExtent.x / sceneExtent.y );\n"
"\n"
"			startAxis.y	 = 1;\n"
"			numPrebits.y = log2( sceneExtent.y / sceneExtent.z );\n"
"\n"
"			startAxis.z	 = 2;\n"
"			numPrebits.z = log2( sceneExtent.x / sceneExtent.z );\n"
"		}\n"
"	}\n"
"\n"
"	// say x > y > z\n"
"	// prebits[0] = 3\n"
"	// prebits[1] = 2\n"
"	// if swap == 1\n"
"	// xxx xy xy x yxz yxz ...\n"
"	// if swap == 0\n"
"	// xxx xy xy xyz xyz ...\n"
"	int swap = numPrebits.z - ( numPrebits.x + numPrebits.y );\n"
"\n"
// Prebit counts are clamped so that the single-axis and two-axis prefixes fit in 30 bits.
"	numPrebits.x = min( numPrebits.x, numMortonBits );\n"
"	numPrebits.y = min( numPrebits.y * 2, numMortonBits - numPrebits.x ) / 2;\n"
"\n"
"	int numPrebitsSum = numPrebits.x + numPrebits.y * 2;\n"
"\n"
"	if ( numPrebitsSum != numMortonBits )\n"
"		numPrebitsSum += swap;\n"
"	else\n"
"		swap = 0;\n"
"\n"
"	// The scene might be 2D so check for the smallest axis\n"
"	numBits.z = ( ptr( sceneExtent )[startAxis.z] != 0 ) ? max( 0, ( numMortonBits - numPrebitsSum ) / 3 ) : 0;\n"
"\n"
"	if ( swap > 0 )\n"
"	{\n"
"		numBits.x = max( 0, ( numMortonBits - numBits.z - numPrebitsSum ) / 2 + numPrebits.y + numPrebits.x + 1 );\n"
"		numBits.y = numMortonBits - numBits.x - numBits.z;\n"
"	}\n"
"	else\n"
"	{\n"
"		numBits.y = max( 0, ( numMortonBits - numBits.z - numPrebitsSum ) / 2 + numPrebits.y );\n"
"		numBits.x = numMortonBits - numBits.y - numBits.z;\n"
"	}\n"
"\n"
"	uint32_t mortonCode = 0;\n"
"	int3	 axisCode;\n"
"\n"
"	// Based on the number of bits, calculate each code per axis\n"
"	axisCode.x =\n"
"		min( uint32_t( max( ptr( normalizedPos )[startAxis.x] * ( 1u << numBits.x ), 0.0f ) ), ( 1u << numBits.x ) - 1 );\n"
"	axisCode.y =\n"
"		min( uint32_t( max( ptr( normalizedPos )[startAxis.y] * ( 1u << numBits.y ), 0.0f ) ), ( 1u << numBits.y ) - 1 );\n"
"	axisCode.z =\n"
"		min( uint32_t( max( ptr( normalizedPos )[startAxis.z] * ( 1u << numBits.z ), 0.0f ) ), ( 1u << numBits.z ) - 1 );\n"
"\n"
"	uint32_t delta0 = 0;\n"
"	uint32_t delta1 = 0;\n"
"\n"
"	// if there are prebits, set them in the morton code:\n"
"	// if swap == 1\n"
"	// [xxx xy xy x] yxz yxz ...\n"
"	// if swap == 0\n"
"	// [xxx xy xy xyz] xyz ...\n"
"	if ( numPrebitsSum > 0 )\n"
"	{\n"
"		numBits.x -= numPrebits.x;\n"
"		mortonCode = axisCode.x & ( ( ( 1U << numPrebits.x ) - 1 ) << numBits.x );\n"
"		mortonCode >>= numBits.x;\n"
"\n"
"		mortonCode <<= numPrebits.y * 2;\n"
"		numBits.x -= numPrebits.y;\n"
"		numBits.y -= numPrebits.y;\n"
"		uint32_t temp0 = axisCode.x & ( ( ( 1u << numPrebits.y ) - 1 ) << numBits.x );\n"
"		temp0 >>= numBits.x;\n"
"		temp0 = expandBits2D( temp0 );\n"
"\n"
"		uint32_t temp1 = axisCode.y & ( ( ( 1u << numPrebits.y ) - 1 ) << numBits.y );\n"
"		temp1 >>= numBits.y;\n"
"		temp1 = expandBits2D( temp1 );\n"
"\n"
"		mortonCode |= temp0 * 2 + temp1;\n"
"\n"
"		if ( swap > 0 )\n"
"		{\n"
"			mortonCode <<= 1;\n"
"			numBits.x -= 1;\n"
"			uint32_t temp = axisCode.x & ( 1U << numBits.x );\n"
"			temp >>= numBits.x;\n"
"			mortonCode |= temp;\n"
"		}\n"
"\n"
"		mortonCode <<= numBits.x + numBits.y + numBits.z;\n"
"\n"
"		axisCode.x &= ( ( 1u << numBits.x ) - 1 );\n"
"		axisCode.y &= ( ( 1u << numBits.y ) - 1 );\n"
"\n"
// delta0/delta1 left-align the shorter axis codes; the interleaved tail is shifted right
// by their sum further below.
"		if ( swap > 0 )\n"
"		{\n"
"			delta0 = ( numBits.y - numBits.x );\n"
"			axisCode.x <<= delta0;\n"
"\n"
"			delta1 = ( numBits.y - numBits.z );\n"
"			axisCode.z <<= delta1;\n"
"		}\n"
"		else\n"
"		{\n"
"			delta0 = ( numBits.x - numBits.y );\n"
"			axisCode.y <<= delta0;\n"
"\n"
"			delta1 = ( numBits.x - numBits.z );\n"
"			axisCode.z <<= delta1;\n"
"		}\n"
"	}\n"
"\n"
"	// 2D case, just use xy xy xy...\n"
"	if ( numBits.z == 0 )\n"
"	{\n"
"		axisCode.x = expandBits2D( axisCode.x );\n"
"		axisCode.y = expandBits2D( axisCode.y );\n"
"		mortonCode |= axisCode.x * 2 + axisCode.y;\n"
"	}\n"
"	else // 3D case, just use if swap == 0 xyz xyz xyz..., if swap == 1 yxz yxz yxz...\n"
"	{\n"
"		axisCode.x = ( axisCode.x > 0 ) ? expandBits3D( axisCode.x ) : 0;\n"
"		axisCode.y = ( axisCode.y > 0 ) ? expandBits3D( axisCode.y ) : 0;\n"
"		axisCode.z = ( axisCode.z > 0 ) ? expandBits3D( axisCode.z ) : 0;\n"
"\n"
"		if ( swap > 0 )\n"
"			mortonCode |= ( axisCode.y * 4 + axisCode.x * 2 + axisCode.z ) >> ( delta0 + delta1 );\n"
"		else\n"
"			mortonCode |= ( axisCode.x * 4 + axisCode.y * 2 + axisCode.z ) >> ( delta0 + delta1 );\n"
"	}\n"
"\n"
"	return mortonCode;\n"
"}\n"
"\n"
// findHighestDifferentBit: returns the XOR of the 64-bit values (key << 32 | index) for
// entries i and j, or all ones when j is outside [0, n). Despite the name, the XOR itself
// is returned rather than a bit position; only j is range-checked.
"HIPRT_DEVICE uint64_t findHighestDifferentBit( int i, int j, int n, const uint32_t* sortedMortonCodeKeys )\n"
"{\n"
"	if ( j < 0 || j >= n ) return ~0ull;\n"
"	const uint64_t a = ( static_cast<uint64_t>( sortedMortonCodeKeys[i] ) << 32ull ) | i;\n"
"	const uint64_t b = ( static_cast<uint64_t>( sortedMortonCodeKeys[j] ) << 32ull ) | j;\n"
"	return a ^ b;\n"
"}\n"
"} // namespace hiprt\n"
;
// hip_TriangleMesh: source text stored as adjacent string literals (one literal per embedded line).
// Everything inside the quotes is data; editing it changes the text that users of this constant receive.
// The embedded text declares, inside namespace hiprt:
//   - tryPairTriangles( a, b ): for every component of b stores which component of a it equals (0, 1 or 2;
//     the last matching comparison wins) or 3 when none matches. The mapping is returned when at most one
//     component is unmatched, otherwise a uint3 filled with InvalidValue is returned.
//   - shiftLeft / shiftRight: cyclic rotations of the three components of a uint3.
//   - class TriangleMesh: built from a hiprtTriangleMeshPrimitive. When the triangle count is zero or the
//     index pointer is null, the constructor sets the triangle count to vertexCount / 3 and
//     fetchTriangleIndices() returns 3 * index + {0, 1, 2} for a null index pointer. The class fetches
//     vertices, triangle indices, pair indices, TrianglePairNode / TrianglePacketNode values, boxes and
//     per-pair split boxes.
// NOTE(review): the file banner marks this file as generated; presumably the origin is the TriangleMesh
// header of the hiprt sources -- confirm, and apply real fixes there instead of here.
// NOTE(review): the non-template fetchPrimNode overloads guarded by __KERNELCC__ use a type named
// TriangleNode that is not declared in this text; presumably it comes from an included header -- confirm.
static const char* hip_TriangleMesh= \
"\n"
"#pragma once\n"
"#include <hiprt/hiprt_common.h>\n"
"#include <hiprt/hiprt_types.h>\n"
"#include <hiprt/impl/BvhNode.h>\n"
"\n"
"namespace hiprt\n"
"{\n"
"HIPRT_HOST_DEVICE HIPRT_INLINE uint3 tryPairTriangles( const uint3& a, const uint3& b )\n"
"{\n"
"	uint3 lb = uint3{ 3, 3, 3 };\n"
"\n"
"	lb.x = ( b.x == a.x ) ? 0 : lb.x;\n"
"	lb.y = ( b.y == a.x ) ? 0 : lb.y;\n"
"	lb.z = ( b.z == a.x ) ? 0 : lb.z;\n"
"\n"
"	lb.x = ( b.x == a.y ) ? 1 : lb.x;\n"
"	lb.y = ( b.y == a.y ) ? 1 : lb.y;\n"
"	lb.z = ( b.z == a.y ) ? 1 : lb.z;\n"
"\n"
"	lb.x = ( b.x == a.z ) ? 2 : lb.x;\n"
"	lb.y = ( b.y == a.z ) ? 2 : lb.y;\n"
"	lb.z = ( b.z == a.z ) ? 2 : lb.z;\n"
"\n"
"	if ( ( lb.x == 3 ) + ( lb.y == 3 ) + ( lb.z == 3 ) <= 1 ) return lb;\n"
"	return uint3{ InvalidValue, InvalidValue, InvalidValue };\n"
"}\n"
"\n"
"HIPRT_HOST_DEVICE HIPRT_INLINE uint3 shiftLeft( const uint3& a ) { return uint3{ a.y, a.z, a.x }; }\n"
"\n"
"HIPRT_HOST_DEVICE HIPRT_INLINE uint3 shiftRight( const uint3& a ) { return uint3{ a.z, a.x, a.y }; }\n"
"\n"
"class TriangleMesh\n"
"{\n"
"public:\n"
"	HIPRT_HOST_DEVICE TriangleMesh( const hiprtTriangleMeshPrimitive& mesh )\n"
"		: m_vertexCount( mesh.vertexCount ), m_vertexStride( mesh.vertexStride ), m_triangleCount( mesh.triangleCount ),\n"
"		  m_triangleStride( mesh.triangleStride ), m_pairCount( mesh.trianglePairCount )\n"
"	{\n"
"		m_vertices		  = reinterpret_cast<const uint8_t*>( mesh.vertices );\n"
"		m_triangleIndices = reinterpret_cast<const uint8_t*>( mesh.triangleIndices );\n"
"		m_pairIndices	  = reinterpret_cast<const uint2*>( mesh.trianglePairIndices );\n"
"		if ( m_triangleCount == 0 || m_triangleIndices == nullptr ) m_triangleCount = m_vertexCount / 3;\n"
"	}\n"
"\n"
"	HIPRT_HOST_DEVICE float3 fetchVertex( const uint32_t index ) const\n"
"	{\n"
"		const float* vertexPtr = reinterpret_cast<const float*>( m_vertices + index * m_vertexStride );\n"
"		return float3{ vertexPtr[0], vertexPtr[1], vertexPtr[2] };\n"
"	}\n"
"\n"
"	HIPRT_HOST_DEVICE uint3 fetchTriangleIndices( const uint32_t index ) const\n"
"	{\n"
"		if ( m_triangleIndices == nullptr ) return uint3{ 3 * index + 0, 3 * index + 1, 3 * index + 2 };\n"
"		const uint32_t* trianglePtr = reinterpret_cast<const uint32_t*>( m_triangleIndices + index * m_triangleStride );\n"
"		return uint3{ trianglePtr[0], trianglePtr[1], trianglePtr[2] };\n"
"	}\n"
"\n"
"	HIPRT_HOST_DEVICE uint2 fetchTrianglePairIndices( const uint32_t index ) const\n"
"	{\n"
"		uint2 pairIndices = make_uint2( index );\n"
"		if ( m_pairCount > 0 ) pairIndices = m_pairIndices[index];\n"
"		return pairIndices;\n"
"	}\n"
"\n"
"	HIPRT_HOST_DEVICE TrianglePairNode fetchTrianglePairNode( const uint2 pairIndices ) const\n"
"	{\n"
"		uint3 indices0 = fetchTriangleIndices( pairIndices.x );\n"
"		uint3 indices1;\n"
"\n"
"		uint32_t flags = DefaultTriangleFlags;\n"
"		if ( pairIndices.x != pairIndices.y )\n"
"		{\n"
"			indices1			= fetchTriangleIndices( pairIndices.y );\n"
"			uint3 vertexMapping = tryPairTriangles( indices1, indices0 );\n"
"\n"
"			// align the first triangle to [1,2]\n"
"			uint3 flags0 = { 1, 2, 0 };\n"
"			if ( vertexMapping.y == 3 )\n"
"			{\n"
"				vertexMapping = shiftLeft( vertexMapping );\n"
"				indices0	  = shiftLeft( indices0 );\n"
"				flags0		  = shiftRight( flags0 );\n"
"			}\n"
"			else if ( vertexMapping.z == 3 )\n"
"			{\n"
"				vertexMapping = shiftRight( vertexMapping );\n"
"				indices0	  = shiftRight( indices0 );\n"
"				flags0		  = shiftLeft( flags0 );\n"
"			}\n"
"			// vertexMapping.x == 3\n"
"\n"
"			// [2 1 0] -- L --> [0 2 1] -- flip --> [0 1 2]\n"
"			// [0 2 1] -- R --> [1 0 2] -- flip --> [0 1 2]\n"
"			// [1 0 2] -- flip --> [0 1 2]\n"
"			bool flip =\n"
"				!( ( vertexMapping.y == 0 && vertexMapping.z == 2 ) || ( vertexMapping.y == 2 && vertexMapping.z == 1 ) ||\n"
"				   ( vertexMapping.y == 1 && vertexMapping.z == 0 ) );\n"
"\n"
"			// align the second triangle to [1,2]\n"
"			uint3 flags1 = flip ? uint3{ 2, 1, 0 } : uint3{ 0, 1, 2 };\n"
"			if ( ( vertexMapping.y == 2 && vertexMapping.z == 1 ) || ( vertexMapping.y == 1 && vertexMapping.z == 2 ) )\n"
"			{\n"
"				// [2 0 1] -- L --> [0 1 2]\n"
"				// [2 1 0] -- L --> [1 0 2] -- flip --> [0 1 2]\n"
"				indices1 = shiftLeft( indices1 );\n"
"				flags1	 = shiftRight( flags1 );\n"
"			}\n"
"			else if ( ( vertexMapping.y == 2 && vertexMapping.z == 0 ) || ( vertexMapping.y == 0 && vertexMapping.z == 2 ) )\n"
"			{\n"
"				// [1 2 0] -- R --> [0 1 2]\n"
"				// [0 2 1] -- R --> [1 0 2] -- flip --> [0 1 2]\n"
"				indices1 = shiftRight( indices1 );\n"
"				flags1	 = shiftLeft( flags1 );\n"
"			}\n"
"\n"
"			// triangle flags\n"
"			flags = ( flip << 13 ) | ( flags1.y << 10 ) | ( flags1.x << 8 ) | ( flags0.y << 2 ) | ( flags0.x << 0 );\n"
"		}\n"
"\n"
"		TrianglePairNode triPairNode;\n"
"		triPairNode.m_flags		 = flags;\n"
"		triPairNode.m_primIndex0 = pairIndices.x;\n"
"		triPairNode.m_primIndex1 = pairIndices.y;\n"
"\n"
"		triPairNode.m_triPair.m_v0 = fetchVertex( indices0.x );\n"
"		triPairNode.m_triPair.m_v1 = fetchVertex( indices0.y );\n"
"		triPairNode.m_triPair.m_v2 = fetchVertex( indices0.z );\n"
"		triPairNode.m_triPair.m_v3 = triPairNode.m_triPair.m_v2;\n"
"\n"
"		if ( pairIndices.x != pairIndices.y ) triPairNode.m_triPair.m_v3 = fetchVertex( indices1.z );\n"
"\n"
"		return triPairNode;\n"
"	}\n"
"\n"
"	HIPRT_HOST_DEVICE TrianglePairNode fetchTrianglePairNode( const uint32_t index ) const\n"
"	{\n"
"		return fetchTrianglePairNode( fetchTrianglePairIndices( index ) );\n"
"	}\n"
"\n"
"	HIPRT_HOST_DEVICE TrianglePacketNode fetchTrianglePacketNode( const uint2 pairIndices ) const\n"
"	{\n"
"		uint3 indices0 = fetchTriangleIndices( pairIndices.x );\n"
"\n"
"		float3 vertex0 = fetchVertex( indices0.x );\n"
"		float3 vertex1 = fetchVertex( indices0.y );\n"
"		float3 vertex2 = fetchVertex( indices0.z );\n"
"		float3 vertex3 = vertex2;\n"
"\n"
"		uint3 vertexMapping{};\n"
"		if ( pairIndices.x != pairIndices.y )\n"
"		{\n"
"			uint3 indices1 = fetchTriangleIndices( pairIndices.y );\n"
"			vertexMapping  = tryPairTriangles( indices0, indices1 );\n"
"\n"
"			uint32_t vertexIndex = 0;\n"
"			if ( vertexMapping.x == 3 ) vertexIndex = indices1.x;\n"
"			if ( vertexMapping.y == 3 ) vertexIndex = indices1.y;\n"
"			if ( vertexMapping.z == 3 ) vertexIndex = indices1.z;\n"
"\n"
"			vertex3 = fetchVertex( vertexIndex );\n"
"		}\n"
"\n"
"		TrianglePacketData	 data( pairIndices.x, pairIndices.y, 4 );\n"
"		TrianglePacketHeader hdr = data.buildHeader();\n"
"		TrianglePacketNode	 triPacketNode{};\n"
"		triPacketNode.writeHeader( hdr );\n"
"		triPacketNode.writePrimIndex( 0, 0, hdr, pairIndices.x );\n"
"		triPacketNode.writePrimIndex( 0, 1, hdr, pairIndices.y );\n"
"		triPacketNode.writeDescriptor( 0, { 0, 1, 2 }, vertexMapping, true );\n"
"		triPacketNode.writeVertex( 0, vertex0 );\n"
"		triPacketNode.writeVertex( 1, vertex1 );\n"
"		triPacketNode.writeVertex( 2, vertex2 );\n"
"		triPacketNode.writeVertex( 3, vertex3 );\n"
"\n"
"		return triPacketNode;\n"
"	}\n"
"\n"
"	HIPRT_HOST_DEVICE TrianglePacketNode fetchTrianglePacketNode( const uint32_t index ) const\n"
"	{\n"
"		return fetchTrianglePacketNode( fetchTrianglePairIndices( index ) );\n"
"	}\n"
"\n"
"	template <typename TriangleNode>\n"
"	HIPRT_HOST_DEVICE TriangleNode fetchPrimNode( const uint32_t index ) const\n"
"	{\n"
"		if constexpr ( is_same<TriangleNode, TrianglePairNode>::value )\n"
"			return fetchTrianglePairNode( index );\n"
"		else\n"
"			return fetchTrianglePacketNode( index );\n"
"	}\n"
"\n"
"	template <typename TriangleNode>\n"
"	HIPRT_HOST_DEVICE TriangleNode fetchPrimNode( const uint2 pairIndices ) const\n"
"	{\n"
"		if constexpr ( is_same<TriangleNode, TrianglePairNode>::value )\n"
"			return fetchTrianglePairNode( pairIndices );\n"
"		else\n"
"			return fetchTrianglePacketNode( pairIndices );\n"
"	}\n"
"\n"
"#if defined( __KERNELCC__ )\n"
"	HIPRT_HOST_DEVICE TriangleNode fetchPrimNode( const uint32_t index ) const { return fetchPrimNode<TriangleNode>( index ); }\n"
"\n"
"	HIPRT_HOST_DEVICE TriangleNode fetchPrimNode( const uint2 pairIndices ) const\n"
"	{\n"
"		return fetchPrimNode<TriangleNode>( pairIndices );\n"
"	}\n"
"#endif\n"
"\n"
"	HIPRT_HOST_DEVICE Aabb fetchAabb( const uint32_t index ) const { return fetchTrianglePairNode( index ).aabb(); }\n"
"\n"
"	HIPRT_HOST_DEVICE float3 fetchCenter( const uint32_t index ) const { return fetchAabb( index ).center(); }\n"
"\n"
"	HIPRT_HOST_DEVICE uint32_t getCount() const { return m_pairCount > 0 ? m_pairCount : m_triangleCount; }\n"
"\n"
"	HIPRT_HOST_DEVICE void setPairs( const uint32_t pairCount, const uint2* pairIndices )\n"
"	{\n"
"		m_pairCount	  = pairCount;\n"
"		m_pairIndices = pairIndices;\n"
"	}\n"
"\n"
"	HIPRT_HOST_DEVICE bool pairable() const { return m_triangleIndices != nullptr && m_triangleCount > 2 && m_pairCount == 0; }\n"
"\n"
"	HIPRT_HOST_DEVICE void\n"
"	split( const uint32_t index, const uint32_t axis, float position, const Aabb& box, Aabb& leftBox, Aabb& rightBox ) const\n"
"	{\n"
"		uint2 pairIndices = fetchTrianglePairIndices( index );\n"
"\n"
"		Aabb leftBox0, rightBox0;\n"
"		{\n"
"			Triangle	tri;\n"
"			const uint3 indices = fetchTriangleIndices( pairIndices.x );\n"
"			tri.m_v0			= fetchVertex( indices.x );\n"
"			tri.m_v1			= fetchVertex( indices.y );\n"
"			tri.m_v2			= fetchVertex( indices.z );\n"
"			tri.split( axis, position, box, leftBox0, rightBox0 );\n"
"		}\n"
"\n"
"		Aabb leftBox1, rightBox1;\n"
"		{\n"
"			Triangle	tri;\n"
"			const uint3 indices = fetchTriangleIndices( pairIndices.y );\n"
"			tri.m_v0			= fetchVertex( indices.x );\n"
"			tri.m_v1			= fetchVertex( indices.y );\n"
"			tri.m_v2			= fetchVertex( indices.z );\n"
"			tri.split( axis, position, box, leftBox1, rightBox1 );\n"
"		}\n"
"\n"
"		leftBox	 = Aabb( leftBox0, leftBox1 );\n"
"		rightBox = Aabb( rightBox0, rightBox1 );\n"
"	}\n"
"\n"
"private:\n"
"	const uint8_t* m_vertices;\n"
"	uint32_t	   m_vertexCount;\n"
"	uint32_t	   m_vertexStride;\n"
"	const uint8_t* m_triangleIndices;\n"
"	uint32_t	   m_triangleCount;\n"
"	uint32_t	   m_triangleStride;\n"
"	const uint2*   m_pairIndices = nullptr;\n"
"	uint32_t	   m_pairCount	 = 0u;\n"
"};\n"
"} // namespace hiprt\n"
;
// hip_Triangle: source text stored as adjacent string literals (one literal per embedded line).
// Everything inside the quotes is data and must stay byte-identical unless the embedded source itself
// is meant to change.
// The embedded text declares, inside namespace hiprt:
//   - class Triangle (members m_v0, m_v1, m_v2): normal() (operand order of the cross product is chosen
//     by bit 5 of flags), a ray intersection test that writes uv and t on a hit, split() which produces
//     left/right boxes for an axis-aligned plane, obb() which clips the triangle against the six planes
//     of a box using a 16-vertex polygon buffer, and aabb().
//   - class TrianglePair (members m_v0..m_v3): fetchTriangle( 0 ) yields (m_v0, m_v1, m_v2), any other
//     index yields (m_v1, m_v3, m_v2); split() and aabb() combine both triangles.
// NOTE(review): in Triangle::clip the write for the "!prevIn" intersection point is not guarded by the
// MaxVertexCount check that the other two writes have -- confirm the vertex count can never reach 16.
// NOTE(review): Triangle::split indexes &m_v0 as an array of three float3, which relies on m_v0, m_v1
// and m_v2 being laid out contiguously -- confirm this is intended.
// NOTE(review): the local named invDemom in Triangle::intersect is presumably a misspelling of invDenom.
static const char* hip_Triangle= \
"\n"
"#pragma once\n"
"#include <hiprt/hiprt_types.h>\n"
"#include <hiprt/impl/BvhConfig.h>\n"
"#include <hiprt/impl/Obb.h>\n"
"\n"
"namespace hiprt\n"
"{\n"
"class alignas( alignof( float3 ) ) Triangle\n"
"{\n"
"private:\n"
"	template <uint32_t N>\n"
"	struct PolygonN\n"
"	{\n"
"		static constexpr uint32_t MaxVertexCount = N;\n"
"		float3					  m_vertices[MaxVertexCount];\n"
"		uint32_t				  m_count;\n"
"	};\n"
"	using Polygon = PolygonN<16>;\n"
"\n"
"	HIPRT_HOST_DEVICE static bool inside( const float3& p, uint32_t axis, float pos, bool isMin, const float maxExtent )\n"
"	{\n"
"		if ( isMin )\n"
"			return ( &p.x )[axis] >= pos - ObbEpsilon * maxExtent;\n"
"		else\n"
"			return ( &p.x )[axis] <= pos + ObbEpsilon * maxExtent;\n"
"	}\n"
"\n"
"	HIPRT_HOST_DEVICE static float3 intersect( const float3& a, const float3& b, uint32_t axis, float pos )\n"
"	{\n"
"		float da = ( &a.x )[axis] - pos;\n"
"		float db = ( &b.x )[axis] - pos;\n"
"		float t	 = da / ( da - db );\n"
"		return a + t * ( b - a );\n"
"	}\n"
"\n"
"	HIPRT_HOST_DEVICE static void\n"
"	clip( const Polygon& inPoly, uint32_t axis, float pos, bool isMin, Polygon& outPoly, const float maxExtent )\n"
"	{\n"
"		outPoly.m_count = 0;\n"
"		if ( inPoly.m_count == 0 ) return;\n"
"\n"
"		for ( uint32_t i = 0; i < inPoly.m_count; i++ )\n"
"		{\n"
"			const float3& curr	 = inPoly.m_vertices[i];\n"
"			const float3& prev	 = inPoly.m_vertices[( i + inPoly.m_count - 1 ) % inPoly.m_count];\n"
"			bool		  currIn = inside( curr, axis, pos, isMin, maxExtent );\n"
"			bool		  prevIn = inside( prev, axis, pos, isMin, maxExtent );\n"
"\n"
"			if ( currIn )\n"
"			{\n"
"				if ( !prevIn ) outPoly.m_vertices[outPoly.m_count++] = intersect( prev, curr, axis, pos );\n"
"				if ( outPoly.m_count < Polygon::MaxVertexCount ) outPoly.m_vertices[outPoly.m_count++] = curr;\n"
"			}\n"
"			else if ( prevIn )\n"
"			{\n"
"				if ( outPoly.m_count < Polygon::MaxVertexCount )\n"
"					outPoly.m_vertices[outPoly.m_count++] = intersect( prev, curr, axis, pos );\n"
"			}\n"
"		}\n"
"	}\n"
"\n"
"public:\n"
"	Triangle() = default;\n"
"\n"
"	HIPRT_HOST_DEVICE Triangle( const float3& v0, const float3& v1, const float3& v2 ) : m_v0( v0 ), m_v1( v1 ), m_v2( v2 ) {}\n"
"\n"
"	HIPRT_HOST_DEVICE float3 normal( uint32_t flags = 0u ) const\n"
"	{\n"
"		return ( ( flags >> 5 ) & 1 ) ? cross( m_v2 - m_v0, m_v1 - m_v0 ) : cross( m_v1 - m_v0, m_v2 - m_v0 );\n"
"	}\n"
"\n"
"	HIPRT_HOST_DEVICE\n"
"	bool intersect( const hiprtRay& ray, float2& uv, float& t, uint32_t flags ) const\n"
"	{\n"
"		float3 e1 = m_v1 - m_v0;\n"
"		float3 e2 = m_v2 - m_v0;\n"
"		float3 s1 = cross( ray.direction, e2 );\n"
"\n"
"		float denom = dot( s1, e1 );\n"
"\n"
"		if ( denom == 0.f ) return false;\n"
"\n"
"		float  invDemom = 1.0f / denom;\n"
"		float3 d		= ray.origin - m_v0;\n"
"		float3 b;\n"
"		b.y = dot( d, s1 ) * invDemom;\n"
"\n"
"		float3 s2 = cross( d, e1 );\n"
"		b.z		  = dot( ray.direction, s2 ) * invDemom;\n"
"\n"
"		float t0 = dot( e2, s2 ) * invDemom;\n"
"\n"
"		if ( ( b.y < 0.0f ) || ( b.y > 1.0f ) || ( b.z < 0.0f ) || ( b.y + b.z > 1.0f ) || ( t0 < ray.minT ) ||\n"
"			 ( t0 > ray.maxT ) )\n"
"		{\n"
"			return false;\n"
"		}\n"
"		else\n"
"		{\n"
"			b.x	 = 1.0f - b.y - b.z;\n"
"			uv.x = ptr( b )[( flags >> 0 ) & 3];\n"
"			uv.y = ptr( b )[( flags >> 2 ) & 3];\n"
"			t	 = t0;\n"
"			return true;\n"
"		}\n"
"	}\n"
"\n"
"	HIPRT_HOST_DEVICE void\n"
"	split( const uint32_t axis, const float position, const Aabb& box, Aabb& leftBox, Aabb& rightBox ) const\n"
"	{\n"
"		leftBox = rightBox = Aabb();\n"
"\n"
"		const float3* vertices = &m_v0;\n"
"		const float3* v1	   = &vertices[2];\n"
"\n"
"		for ( uint32_t i = 0; i < 3; i++ )\n"
"		{\n"
"			const float3* v0 = v1;\n"
"			v1				 = &vertices[i];\n"
"			const float v0p	 = ( &v0->x )[axis];\n"
"			const float v1p	 = ( &v1->x )[axis];\n"
"\n"
"			if ( v0p <= position ) leftBox.grow( *v0 );\n"
"			if ( v0p >= position ) rightBox.grow( *v0 );\n"
"\n"
"			if ( ( v0p < position && v1p > position ) || ( v0p > position && v1p < position ) )\n"
"			{\n"
"				const float3 t = mix( *v0, *v1, clamp( ( position - v0p ) / ( v1p - v0p ), 0.0f, 1.0f ) );\n"
"				leftBox.grow( t );\n"
"				rightBox.grow( t );\n"
"			}\n"
"		}\n"
"\n"
"		( &leftBox.m_max.x )[axis]	= position;\n"
"		( &rightBox.m_min.x )[axis] = position;\n"
"		leftBox.intersect( box );\n"
"		rightBox.intersect( box );\n"
"	}\n"
"\n"
"	HIPRT_HOST_DEVICE Obb obb( const Aabb& box, uint32_t matrixIndex ) const\n"
"	{\n"
"		Polygon poly0, poly1;\n"
"		poly0.m_vertices[0] = m_v0;\n"
"		poly0.m_vertices[1] = m_v1;\n"
"		poly0.m_vertices[2] = m_v2;\n"
"		poly0.m_count		= 3;\n"
"		poly1.m_count		= 0;\n"
"\n"
"		const float maxExtent = fmaxf( fmaxf( box.extent().x, box.extent().y ), box.extent().z );\n"
"		clip( poly0, 0, box.m_min.x, true, poly1, maxExtent );\n"
"		clip( poly1, 0, box.m_max.x, false, poly0, maxExtent );\n"
"		clip( poly0, 1, box.m_min.y, true, poly1, maxExtent );\n"
"		clip( poly1, 1, box.m_max.y, false, poly0, maxExtent );\n"
"		clip( poly0, 2, box.m_min.z, true, poly1, maxExtent );\n"
"		clip( poly1, 2, box.m_max.z, false, poly0, maxExtent );\n"
"\n"
"		Obb obb( matrixIndex );\n"
"		for ( uint32_t i = 0; i < poly0.m_count; i++ )\n"
"			obb.grow( poly0.m_vertices[i] );\n"
"\n"
"		return obb;\n"
"	}\n"
"\n"
"	HIPRT_HOST_DEVICE Aabb aabb() const\n"
"	{\n"
"		Aabb aabb;\n"
"		return aabb.grow( m_v0 ).grow( m_v1 ).grow( m_v2 );\n"
"	}\n"
"\n"
"public:\n"
"	float3 m_v0;\n"
"	float3 m_v1;\n"
"	float3 m_v2;\n"
"};\n"
"\n"
"class alignas( alignof( float3 ) ) TrianglePair\n"
"{\n"
"public:\n"
"	TrianglePair() = default;\n"
"\n"
"	HIPRT_HOST_DEVICE TrianglePair( const float3& v0, const float3& v1, const float3& v2, const float3& v3 )\n"
"		: m_v0( v0 ), m_v1( v1 ), m_v2( v2 ), m_v3( v3 )\n"
"	{\n"
"	}\n"
"\n"
"	HIPRT_HOST_DEVICE Triangle fetchTriangle( uint32_t index ) const\n"
"	{\n"
"		if ( index > 0 ) return Triangle( m_v1, m_v3, m_v2 );\n"
"		return Triangle( m_v0, m_v1, m_v2 );\n"
"	}\n"
"	HIPRT_HOST_DEVICE void split( uint32_t axis, float position, const Aabb& box, Aabb& leftBox, Aabb& rightBox ) const\n"
"	{\n"
"		Aabb leftBox0, rightBox0;\n"
"		fetchTriangle( 0 ).split( axis, position, box, leftBox0, rightBox0 );\n"
"		Aabb leftBox1, rightBox1;\n"
"		fetchTriangle( 1 ).split( axis, position, box, leftBox1, rightBox1 );\n"
"		leftBox	 = Aabb( leftBox0, leftBox1 );\n"
"		rightBox = Aabb( rightBox0, rightBox1 );\n"
"	}\n"
"\n"
"	HIPRT_HOST_DEVICE Aabb aabb() const\n"
"	{\n"
"		Aabb aabb;\n"
"		return aabb.grow( m_v0 ).grow( m_v1 ).grow( m_v2 ).grow( m_v3 );\n"
"	}\n"
"\n"
"public:\n"
"	float3 m_v0;\n"
"	float3 m_v1;\n"
"	float3 m_v2;\n"
"	float3 m_v3;\n"
"};\n"
"} // namespace hiprt\n"
;
// hip_BvhBuilderUtil: source text stored as adjacent string literals (one literal per embedded line).
// Everything inside the quotes is data and must stay byte-identical unless the embedded source itself
// is meant to change.
// The embedded text declares device-side helpers at global scope (after "using namespace hiprt;"):
//   - shflAabb: shuffles the six floats of an Aabb from lane srcLane.
//   - warpMin / warpMax / warpSum / warpUnion: shfl_xor-based reductions over offsets 1..16 (plus 32 when
//     WarpSize == 64); the result is then read from lane WarpSize - 1.
//   - warpScan: shfl_up-based inclusive prefix sum within a warp.
//   - warpOffset (two overloads): one atomicAdd on *counter per warp, combined with a per-lane offset.
//   - blockMin / blockMax / blockSum / blockUnion / blockScan: combine per-warp results through the
//     caller-provided blockCache array, separated by __syncthreads().
//   - sync_warp: calls __syncwarp() only when __CUDACC__ is defined, otherwise does nothing.
// NOTE(review): the text puts "using namespace hiprt;" at file scope of a header.
// NOTE(review): in warpOffset( bool, T* ) the local warpOffset is uninitialised on every lane except the
// one performing the atomicAdd, and the source lane is computed as __ffsll( ballot ) - 1 -- confirm the
// behaviour when no lane passes true.
// NOTE(review): blockMin/blockMax/blockSum/blockUnion read blockCache[threadIdx.x ^ i]; for a
// warpsPerBlock that is not a power of two this index can be >= warpsPerBlock -- presumably callers
// only use power-of-two warp counts; confirm.
static const char* hip_BvhBuilderUtil= \
"\n"
"#pragma once\n"
"#include <hiprt/hiprt_common.h>\n"
"using namespace hiprt;\n"
"\n"
"HIPRT_DEVICE Aabb shflAabb( const Aabb& box, uint32_t srcLane )\n"
"{\n"
"	Aabb b;\n"
"	b.m_min.x = shfl( box.m_min.x, srcLane );\n"
"	b.m_min.y = shfl( box.m_min.y, srcLane );\n"
"	b.m_min.z = shfl( box.m_min.z, srcLane );\n"
"	b.m_max.x = shfl( box.m_max.x, srcLane );\n"
"	b.m_max.y = shfl( box.m_max.y, srcLane );\n"
"	b.m_max.z = shfl( box.m_max.z, srcLane );\n"
"	return b;\n"
"}\n"
"\n"
"template <typename T>\n"
"HIPRT_DEVICE T warpMin( T warpVal )\n"
"{\n"
"	T warpValue = shfl_xor( warpVal, 1 );\n"
"	warpVal		= hiprt::min( warpVal, warpValue );\n"
"	warpValue	= shfl_xor( warpVal, 2 );\n"
"	warpVal		= hiprt::min( warpVal, warpValue );\n"
"	warpValue	= shfl_xor( warpVal, 4 );\n"
"	warpVal		= hiprt::min( warpVal, warpValue );\n"
"	warpValue	= shfl_xor( warpVal, 8 );\n"
"	warpVal		= hiprt::min( warpVal, warpValue );\n"
"	warpValue	= shfl_xor( warpVal, 16 );\n"
"	warpVal		= hiprt::min( warpVal, warpValue );\n"
"	if constexpr ( WarpSize == 64 )\n"
"	{\n"
"		warpValue = shfl_xor( warpVal, 32 );\n"
"		warpVal	  = hiprt::min( warpVal, warpValue );\n"
"	}\n"
"	warpVal = shfl( warpVal, WarpSize - 1 );\n"
"	return warpVal;\n"
"}\n"
"\n"
"template <typename T>\n"
"HIPRT_DEVICE T warpMax( T warpVal )\n"
"{\n"
"	T warpValue = shfl_xor( warpVal, 1 );\n"
"	warpVal		= hiprt::max( warpVal, warpValue );\n"
"	warpValue	= shfl_xor( warpVal, 2 );\n"
"	warpVal		= hiprt::max( warpVal, warpValue );\n"
"	warpValue	= shfl_xor( warpVal, 4 );\n"
"	warpVal		= hiprt::max( warpVal, warpValue );\n"
"	warpValue	= shfl_xor( warpVal, 8 );\n"
"	warpVal		= hiprt::max( warpVal, warpValue );\n"
"	warpValue	= shfl_xor( warpVal, 16 );\n"
"	warpVal		= hiprt::max( warpVal, warpValue );\n"
"	if constexpr ( WarpSize == 64 )\n"
"	{\n"
"		warpValue = shfl_xor( warpVal, 32 );\n"
"		warpVal	  = hiprt::max( warpVal, warpValue );\n"
"	}\n"
"	warpVal = shfl( warpVal, WarpSize - 1 );\n"
"	return warpVal;\n"
"}\n"
"\n"
"template <typename T>\n"
"HIPRT_DEVICE T warpSum( T warpVal )\n"
"{\n"
"	T warpValue = shfl_xor( warpVal, 1 );\n"
"	warpVal += warpValue;\n"
"	warpValue = shfl_xor( warpVal, 2 );\n"
"	warpVal += warpValue;\n"
"	warpValue = shfl_xor( warpVal, 4 );\n"
"	warpVal += warpValue;\n"
"	warpValue = shfl_xor( warpVal, 8 );\n"
"	warpVal += warpValue;\n"
"	warpValue = shfl_xor( warpVal, 16 );\n"
"	warpVal += warpValue;\n"
"	if constexpr ( WarpSize == 64 )\n"
"	{\n"
"		warpValue = shfl_xor( warpVal, 32 );\n"
"		warpVal += warpValue;\n"
"	}\n"
"	warpVal = shfl( warpVal, WarpSize - 1 );\n"
"	return warpVal;\n"
"}\n"
"\n"
"HIPRT_DEVICE Aabb warpUnion( Aabb warpVal )\n"
"{\n"
"	const uint32_t laneIndex = threadIdx.x & ( WarpSize - 1 );\n"
"	Aabb		   warpValue = shflAabb( warpVal, laneIndex ^ 1 );\n"
"	warpVal.grow( warpValue );\n"
"	warpValue = shflAabb( warpVal, laneIndex ^ 2 );\n"
"	warpVal.grow( warpValue );\n"
"	warpValue = shflAabb( warpVal, laneIndex ^ 4 );\n"
"	warpVal.grow( warpValue );\n"
"	warpValue = shflAabb( warpVal, laneIndex ^ 8 );\n"
"	warpVal.grow( warpValue );\n"
"	warpValue = shflAabb( warpVal, laneIndex ^ 16 );\n"
"	warpVal.grow( warpValue );\n"
"	if constexpr ( WarpSize == 64 )\n"
"	{\n"
"		warpValue = shflAabb( warpVal, laneIndex ^ 32 );\n"
"		warpVal.grow( warpValue );\n"
"	}\n"
"	warpVal = shflAabb( warpVal, WarpSize - 1 );\n"
"	return warpVal;\n"
"}\n"
"\n"
"template <typename T>\n"
"HIPRT_DEVICE T warpScan( T warpVal )\n"
"{\n"
"	const uint32_t laneIndex = threadIdx.x & ( WarpSize - 1 );\n"
"	T			   warpValue = shfl_up( warpVal, 1 );\n"
"	if ( laneIndex >= 1 ) warpVal += warpValue;\n"
"	warpValue = shfl_up( warpVal, 2 );\n"
"	if ( laneIndex >= 2 ) warpVal += warpValue;\n"
"	warpValue = shfl_up( warpVal, 4 );\n"
"	if ( laneIndex >= 4 ) warpVal += warpValue;\n"
"	warpValue = shfl_up( warpVal, 8 );\n"
"	if ( laneIndex >= 8 ) warpVal += warpValue;\n"
"	warpValue = shfl_up( warpVal, 16 );\n"
"	if ( laneIndex >= 16 ) warpVal += warpValue;\n"
"	if constexpr ( WarpSize == 64 )\n"
"	{\n"
"		warpValue = shfl_up( warpVal, 32 );\n"
"		if ( laneIndex >= 32 ) warpVal += warpValue;\n"
"	}\n"
"	return warpVal;\n"
"}\n"
"\n"
"template <typename T>\n"
"HIPRT_DEVICE T warpOffset( T warpVal, T* counter )\n"
"{\n"
"	const uint32_t laneIndex  = threadIdx.x & ( WarpSize - 1 );\n"
"	T			   warpSum	  = warpScan( warpVal );\n"
"	T			   warpOffset = static_cast<T>( 0 );\n"
"	if ( laneIndex == WarpSize - 1 ) warpOffset = atomicAdd( counter, warpSum );\n"
"	warpSum -= warpVal;\n"
"	warpOffset = shfl( warpOffset, WarpSize - 1 );\n"
"	return warpOffset + warpSum;\n"
"}\n"
"\n"
"template <typename T>\n"
"HIPRT_DEVICE T warpOffset( bool warpVal, T* counter )\n"
"{\n"
"	const uint32_t laneIndex  = threadIdx.x & ( WarpSize - 1 );\n"
"	const uint64_t warpBallot = hiprt::ballot( warpVal );\n"
"	const T		   warpCount  = __popcll( warpBallot );\n"
"	const T		   warpSum	  = __popcll( warpBallot & ( ( 1ull << laneIndex ) - 1ull ) );\n"
"	T			   warpOffset;\n"
"	if ( laneIndex == __ffsll( static_cast<unsigned long long>( warpBallot ) ) - 1 )\n"
"		warpOffset = atomicAdd( counter, warpCount );\n"
"	warpOffset = shfl( warpOffset, __ffsll( static_cast<unsigned long long>( warpBallot ) ) - 1 );\n"
"	return warpOffset + warpSum;\n"
"}\n"
"\n"
"template <typename T>\n"
"HIPRT_DEVICE T blockMin( T blockVal, T* blockCache )\n"
"{\n"
"	const uint32_t laneIndex	 = threadIdx.x & ( WarpSize - 1 );\n"
"	const uint32_t warpIndex	 = threadIdx.x >> Log2( WarpSize );\n"
"	const uint32_t warpsPerBlock = DivideRoundUp( static_cast<uint32_t>( blockDim.x ), WarpSize );\n"
"\n"
"	blockVal = warpMin( blockVal );\n"
"	if ( laneIndex == 0 ) blockCache[warpIndex] = blockVal;\n"
"\n"
"	for ( uint32_t i = 1; i < warpsPerBlock; i <<= 1 )\n"
"	{\n"
"		__syncthreads();\n"
"		if ( threadIdx.x < warpsPerBlock )\n"
"			blockCache[threadIdx.x] = hiprt::min( blockCache[threadIdx.x], blockCache[threadIdx.x ^ i] );\n"
"	}\n"
"	__syncthreads();\n"
"	return blockCache[0];\n"
"}\n"
"\n"
"template <typename T>\n"
"HIPRT_DEVICE T blockMax( T blockVal, T* blockCache )\n"
"{\n"
"	const uint32_t laneIndex	 = threadIdx.x & ( WarpSize - 1 );\n"
"	const uint32_t warpIndex	 = threadIdx.x >> Log2( WarpSize );\n"
"	const uint32_t warpsPerBlock = DivideRoundUp( static_cast<uint32_t>( blockDim.x ), WarpSize );\n"
"\n"
"	blockVal = warpMax( blockVal );\n"
"	if ( laneIndex == 0 ) blockCache[warpIndex] = blockVal;\n"
"\n"
"	for ( uint32_t i = 1; i < warpsPerBlock; i <<= 1 )\n"
"	{\n"
"		__syncthreads();\n"
"		if ( threadIdx.x < warpsPerBlock )\n"
"			blockCache[threadIdx.x] = hiprt::max( blockCache[threadIdx.x], blockCache[threadIdx.x ^ i] );\n"
"	}\n"
"	__syncthreads();\n"
"	return blockCache[0];\n"
"}\n"
"\n"
"template <typename T>\n"
"HIPRT_DEVICE T blockSum( T blockVal, T* blockCache )\n"
"{\n"
"	const uint32_t laneIndex	 = threadIdx.x & ( WarpSize - 1 );\n"
"	const uint32_t warpIndex	 = threadIdx.x >> Log2( WarpSize );\n"
"	const uint32_t warpsPerBlock = DivideRoundUp( static_cast<uint32_t>( blockDim.x ), WarpSize );\n"
"\n"
"	blockVal = warpSum( blockVal );\n"
"	if ( laneIndex == 0 ) blockCache[warpIndex] = blockVal;\n"
"\n"
"	for ( uint32_t i = 1; i < warpsPerBlock; i <<= 1 )\n"
"	{\n"
"		__syncthreads();\n"
"		if ( threadIdx.x < warpsPerBlock ) blockCache[threadIdx.x] += blockCache[threadIdx.x ^ i];\n"
"	}\n"
"	__syncthreads();\n"
"	return blockCache[0];\n"
"}\n"
"\n"
"HIPRT_DEVICE Aabb blockUnion( Aabb blockVal, Aabb* blockCache )\n"
"{\n"
"	const uint32_t laneIndex	 = threadIdx.x & ( WarpSize - 1 );\n"
"	const uint32_t warpIndex	 = threadIdx.x >> Log2( WarpSize );\n"
"	const uint32_t warpsPerBlock = DivideRoundUp( static_cast<uint32_t>( blockDim.x ), WarpSize );\n"
"\n"
"	blockVal = warpUnion( blockVal );\n"
"	if ( laneIndex == 0 ) blockCache[warpIndex] = blockVal;\n"
"\n"
"	for ( uint32_t i = 1; i < warpsPerBlock; i <<= 1 )\n"
"	{\n"
"		__syncthreads();\n"
"		if ( threadIdx.x < warpsPerBlock ) blockCache[threadIdx.x].grow( blockCache[threadIdx.x ^ i] );\n"
"	}\n"
"	__syncthreads();\n"
"	return blockCache[0];\n"
"}\n"
"\n"
"template <typename T>\n"
"HIPRT_DEVICE T blockScan( T blockVal, T* blockCache )\n"
"{\n"
"	const uint32_t laneIndex	 = threadIdx.x & ( WarpSize - 1 );\n"
"	const uint32_t warpIndex	 = threadIdx.x >> Log2( WarpSize );\n"
"	const uint32_t warpsPerBlock = DivideRoundUp( static_cast<uint32_t>( blockDim.x ), WarpSize );\n"
"\n"
"	T blockValue = blockVal;\n"
"	T warpSum	 = warpScan( blockValue );\n"
"\n"
"	if ( laneIndex == WarpSize - 1 ) blockCache[warpIndex] = warpSum;\n"
"\n"
"	__syncthreads();\n"
"	if ( threadIdx.x < warpsPerBlock ) blockValue = blockCache[threadIdx.x];\n"
"\n"
"	for ( uint32_t i = 1; i < warpsPerBlock; i <<= 1 )\n"
"	{\n"
"		__syncthreads();\n"
"		if ( threadIdx.x < warpsPerBlock && threadIdx.x >= i ) blockValue += blockCache[threadIdx.x - i];\n"
"		__syncthreads();\n"
"		if ( threadIdx.x < warpsPerBlock ) blockCache[threadIdx.x] = blockValue;\n"
"	}\n"
"\n"
"	__syncthreads();\n"
"	if ( laneIndex == WarpSize - 1 ) blockCache[warpIndex] -= warpSum;\n"
"\n"
"	__syncthreads();\n"
"	return blockCache[warpIndex] + warpSum;\n"
"}\n"
"\n"
"template <typename T>\n"
"HIPRT_DEVICE T blockScan( bool blockVal, T* blockCache )\n"
"{\n"
"	const uint32_t laneIndex	 = threadIdx.x & ( WarpSize - 1 );\n"
"	const uint32_t warpIndex	 = threadIdx.x >> Log2( WarpSize );\n"
"	const uint32_t warpsPerBlock = DivideRoundUp( static_cast<uint32_t>( blockDim.x ), WarpSize );\n"
"\n"
"	T			   blockValue = blockVal;\n"
"	const uint64_t warpBallot = hiprt::ballot( blockVal );\n"
"	const T		   warpCount  = __popcll( warpBallot );\n"
"	const T		   warpSum	  = __popcll( warpBallot & ( ( 1ull << laneIndex ) - 1ull ) );\n"
"\n"
"	if ( laneIndex == 0 ) blockCache[warpIndex] = warpCount;\n"
"\n"
"	__syncthreads();\n"
"	if ( threadIdx.x < warpsPerBlock ) blockValue = blockCache[threadIdx.x];\n"
"\n"
"	for ( uint32_t i = 1; i < warpsPerBlock; i <<= 1 )\n"
"	{\n"
"		__syncthreads();\n"
"		if ( threadIdx.x < warpsPerBlock && threadIdx.x >= i ) blockValue += blockCache[threadIdx.x - i];\n"
"		__syncthreads();\n"
"		if ( threadIdx.x < warpsPerBlock ) blockCache[threadIdx.x] = blockValue;\n"
"	}\n"
"\n"
"	__syncthreads();\n"
"	return blockCache[warpIndex] + warpSum - warpCount + static_cast<T>( blockVal );\n"
"}\n"
"\n"
"HIPRT_DEVICE HIPRT_INLINE void sync_warp()\n"
"{\n"
"#if defined( __CUDACC__ )\n"
"	__syncwarp();\n"
"#endif\n"
"}\n"
;
// hip_SbvhCommon: source text stored as adjacent string literals (one literal per embedded line).
// Everything inside the quotes is data and must stay byte-identical unless the embedded source itself
// is meant to change.
// The embedded text declares, inside namespace hiprt:
//   - struct Bin (alignas 32, asserted to be 32 bytes): an Aabb plus m_enter and a union of
//     m_counter / m_exit; cost helpers multiply the box area by one of the counters, reset() clears the
//     box and both counters, include() merges another bin.
//   - struct Split (asserted to be 4 bytes): bit-fields of 27 (index), 2 (axis) and three 1-bit flags,
//     filled by setSplitInfo().
//   - struct Task (alignas 64, asserted to be 128 bytes): three Aabbs, a Split, a cost and two unions
//     (m_counter0 / m_taskOffset and m_counter1 / m_refOffset).
// NOTE(review): the text includes only <hiprt/impl/Aabb.h>; HIPRT_STATIC_ASSERT and InvalidValue are
// presumably made visible through that include -- confirm.
static const char* hip_SbvhCommon= \
"\n"
"#pragma once\n"
"#include <hiprt/impl/Aabb.h>\n"
"\n"
"namespace hiprt\n"
"{\n"
"struct alignas( 32 ) Bin\n"
"{\n"
"	HIPRT_HOST_DEVICE float cost() const { return m_box.area() * m_counter; }\n"
"\n"
"	HIPRT_HOST_DEVICE float leftCost() const { return m_box.area() * m_enter; }\n"
"\n"
"	HIPRT_HOST_DEVICE float rightCost() const { return m_box.area() * m_exit; }\n"
"\n"
"	HIPRT_HOST_DEVICE void reset()\n"
"	{\n"
"		m_box.reset();\n"
"		m_enter = 0;\n"
"		m_exit	= 0;\n"
"	}\n"
"\n"
"	HIPRT_HOST_DEVICE void include( const Bin& bin )\n"
"	{\n"
"		m_box.grow( bin.m_box );\n"
"		m_enter += bin.m_enter;\n"
"		m_exit += bin.m_exit;\n"
"	}\n"
"\n"
"	Aabb	 m_box;\n"
"	uint32_t m_enter;\n"
"	union\n"
"	{\n"
"		uint32_t m_counter;\n"
"		uint32_t m_exit;\n"
"	};\n"
"};\n"
"HIPRT_STATIC_ASSERT( sizeof( Bin ) == 32 );\n"
"\n"
"struct Split\n"
"{\n"
"	HIPRT_HOST_DEVICE void\n"
"	setSplitInfo( uint8_t splitAxis, uint32_t splitIndex, bool leftLeaf, bool rightLeaf, bool spatialSplit )\n"
"	{\n"
"		m_splitIndex   = splitIndex;\n"
"		m_splitAxis	   = splitAxis;\n"
"		m_spatialSplit = spatialSplit;\n"
"		m_leftLeaf	   = leftLeaf;\n"
"		m_rightLeaf	   = rightLeaf;\n"
"	}\n"
"\n"
"	uint32_t m_splitIndex : 27;\n"
"	uint32_t m_splitAxis : 2;\n"
"	uint32_t m_spatialSplit : 1;\n"
"	uint32_t m_leftLeaf : 1;\n"
"	uint32_t m_rightLeaf : 1;\n"
"};\n"
"HIPRT_STATIC_ASSERT( sizeof( Split ) == 4 );\n"
"\n"
"struct alignas( 64 ) Task\n"
"{\n"
"	Task() = default;\n"
"\n"
"	HIPRT_HOST_DEVICE Task( const Aabb& box, uint32_t refOffset = InvalidValue ) : m_box( box ), m_refOffset( refOffset ) {}\n"
"\n"
"	Aabb m_box;\n"
"	Aabb m_box0;\n"
"	Aabb m_box1;\n"
"\n"
"	Split m_split{};\n"
"	float m_cost{};\n"
"\n"
"	union\n"
"	{\n"
"		uint32_t m_counter0 = 0;\n"
"		uint32_t m_taskOffset;\n"
"	};\n"
"	union\n"
"	{\n"
"		uint32_t m_counter1 = 0;\n"
"		uint32_t m_refOffset;\n"
"	};\n"
"};\n"
"HIPRT_STATIC_ASSERT( sizeof( Task ) == 128 );\n"
"\n"
"} // namespace hiprt\n"
;
// hip_NodeList: source text stored as adjacent string literals (one literal per embedded line).
// Everything inside the quotes is data and must stay byte-identical unless the embedded source itself
// is meant to change.
// The embedded text declares class hiprt::NodeList, a read-only view over a hiprtBvhNodeList:
// list.internalNodes is reinterpreted as ApiNode*, list.leafNodes as ReferenceNode*, and
// list.nodeCount is kept as the reference count; three const getters expose them.
static const char* hip_NodeList= \
"\n"
"#pragma once\n"
"#include <hiprt/hiprt_types.h>\n"
"#include <hiprt/impl/BvhNode.h>\n"
"\n"
"namespace hiprt\n"
"{\n"
"class NodeList\n"
"{\n"
"public:\n"
"	HIPRT_HOST_DEVICE NodeList( const hiprtBvhNodeList& list ) : m_referenceCount( list.nodeCount )\n"
"	{\n"
"		m_apiNodes		 = reinterpret_cast<ApiNode*>( list.internalNodes );\n"
"		m_referenceNodes = reinterpret_cast<ReferenceNode*>( list.leafNodes );\n"
"	}\n"
"\n"
"	HIPRT_HOST_DEVICE const ApiNode* getApiNodes() const { return m_apiNodes; }\n"
"\n"
"	HIPRT_HOST_DEVICE const ReferenceNode* getReferenceNodes() const { return m_referenceNodes; }\n"
"\n"
"	HIPRT_HOST_DEVICE uint32_t getReferenceCount() const { return m_referenceCount; }\n"
"\n"
"private:\n"
"	const ApiNode*		 m_apiNodes;\n"
"	const ReferenceNode* m_referenceNodes;\n"
"	uint32_t			 m_referenceCount;\n"
"};\n"
"} // namespace hiprt\n"
;
// hip_BvhConfig: source text stored as adjacent string literals (one literal per embedded line).
// Everything inside the quotes is data and must stay byte-identical unless the embedded source itself
// is meant to change.
// The embedded text declares compile-time constants in namespace hiprt: a cost-logging switch, block
// sizes for several builder stages, the fat-leaf and leaf-packet lane limits, the Obb epsilon/alpha
// values, the PLOC radius, and the SBVH bin counts and tuning factors.
// NOTE(review): the namespace is closed with "};" in the embedded text; the semicolon is an empty
// declaration and is presumably unintended.
static const char* hip_BvhConfig= \
"\n"
"#pragma once\n"
"#include <hiprt/hiprt_common.h>\n"
"\n"
"namespace hiprt\n"
"{\n"
"static constexpr bool LogBvhCost = false;\n"
"static constexpr uint32_t BvhBuilderReductionBlockSize	= 256u;\n"
"static constexpr uint32_t BvhBuilderCompactionBlockSize = 1024;\n"
"static constexpr uint32_t BatchBuilderMaxBlockSize		= MaxBatchBuildMaxPrimCount;\n"
"static constexpr uint32_t MaxFatLeafSize				= 4u;\n"
"static constexpr uint32_t LanesPerLeafPacketTask		= 4u;\n"
"static constexpr float	  ObbEpsilon					= 1.0e-4f;\n"
"static constexpr float	  ObbSurfaceAreaAlpha			= 1.1f;\n"
"static constexpr uint32_t LbvhEmitBlockSize = 512u;\n"
"static constexpr uint32_t PlocMainBlockSize = 1024u;\n"
"static constexpr uint32_t PlocRadius		= 8u;\n"
"static constexpr uint32_t SbvhMinBinCount = 8u;\n"
"static constexpr uint32_t SbvhMaxBinCount = 32u;\n"
"static constexpr float	  SbvhAlpha		  = 1.5f;\n"
"static constexpr float	  SbvhBeta		  = 1.0e-4f;\n"
"static constexpr float	  SbvhGamma		  = 1.0e-3f;\n"
"static constexpr float	  SbvhEpsilon	  = 1.0e-2f;\n"
"}; // namespace hiprt\n"
;
// hip_MemoryArena: source text stored as adjacent string literals (one literal per embedded line).
// Everything inside the quotes is data and must stay byte-identical unless the embedded source itself
// is meant to change.
// The embedded text declares class hiprt::MemoryArena, which hands out consecutive sub-ranges of one
// pre-existing buffer: allocate<T>( size ) returns nullptr for size 0, otherwise returns
// m_data + m_offset and then advances m_offset by RoundUp( sizeof( T ) * size, m_alignment ).
// HIPRT_ASSERT( m_offset <= m_storageSize ) runs after the offset has been advanced. There is no
// deallocation function in the class.
// NOTE(review): the constructor's initializer list order (m_data, m_storageSize, m_alignment, m_offset)
// differs from the member declaration order (m_data, m_alignment, m_offset, m_storageSize). Members are
// initialised in declaration order; the initializers here do not depend on each other, but compilers
// may emit a reorder warning.
// NOTE(review): the text has no #include lines although it uses hiprtDevicePtr, HIPRT_HOST_DEVICE,
// RoundUp and HIPRT_ASSERT; presumably the includer provides them -- confirm.
static const char* hip_MemoryArena= \
"\n"
"#pragma once\n"
"\n"
"namespace hiprt\n"
"{\n"
"class MemoryArena\n"
"{\n"
"public:\n"
"	HIPRT_HOST_DEVICE MemoryArena( hiprtDevicePtr data, size_t storageSize, uint32_t alignment )\n"
"		: m_data( data ), m_storageSize( storageSize ), m_alignment( alignment ), m_offset( 0 )\n"
"	{\n"
"	}\n"
"\n"
"	template <typename T>\n"
"	HIPRT_HOST_DEVICE T* allocate( size_t size = 1 )\n"
"	{\n"
"		if ( size == 0 ) return nullptr;\n"
"		T* p = reinterpret_cast<T*>( reinterpret_cast<uint8_t*>( m_data ) + m_offset );\n"
"		m_offset += RoundUp( sizeof( T ) * size, m_alignment );\n"
"		HIPRT_ASSERT( m_offset <= m_storageSize );\n"
"		return p;\n"
"	}\n"
"\n"
"	HIPRT_HOST_DEVICE size_t getStorageSize() const { return m_storageSize; }\n"
"\n"
"private:\n"
"	hiprtDevicePtr m_data;\n"
"	uint32_t	   m_alignment;\n"
"	size_t		   m_offset;\n"
"	size_t		   m_storageSize;\n"
"};\n"
"} // namespace hiprt\n"
;
// hip_hiprt_types: string constant holding the text of a C++ header (the text
// starts with "#pragma once" and includes <hiprt/hiprt_vec.h>).
//
// The embedded text declares, in order: the opaque handle aliases
// (hiprtGeometry, hiprtScene, hiprtContext, hiprtFuncTable) and API handle
// aliases, the public enums (traversal type/state/hint, error codes, log
// levels, build operation/flags, primitive/instance/frame/stack/node types,
// device type), and the ray, hit, function-table, stack-buffer, build-input,
// BVH-node and transformation-frame structs, several of which are followed by
// HIPRT_STATIC_ASSERT size checks.
//
// Every character between the quotes is part of the constant's value. This
// file is marked as generated at its top, so corrections presumably belong in
// the header this text was produced from, followed by regeneration.
//
// NOTE(review): things to fix at the origin of this text, not here:
//   - hiprtRay::minT carries the comment "Ray maximum distance", identical to
//     the comment on maxT; "minimum" looks intended.
//   - comment typos: "bult", "Primitve", "instace", "&ded", "Defines defines".
static const char* hip_hiprt_types= \
"\n"
"#pragma once\n"
"\n"
"#include <hiprt/hiprt_vec.h>\n"
"\n"
"struct _hiprtGeometry;\n"
"struct _hiprtScene;\n"
"struct _hiprtContext;\n"
"struct _hiprtFuncTable;\n"
"\n"
"using hiprtDevicePtr  = void*;\n"
"using hiprtGeometry	  = _hiprtGeometry*;\n"
"using hiprtScene	  = _hiprtScene*;\n"
"using hiprtContext	  = _hiprtContext*;\n"
"using hiprtFuncTable  = _hiprtFuncTable*;\n"
"using hiprtLogLevel	  = uint32_t;\n"
"using hiprtBuildFlags = uint32_t;\n"
"using hiprtRayMask	  = uint32_t;\n"
"\n"
"using hiprtApiDevice   = int;	// hipDevice, cuDevice\n"
"using hiprtApiCtx	   = void*; // hipCtx, cuCtx\n"
"using hiprtApiStream   = void*; // hipStream, cuStream\n"
"using hiprtApiFunction = void*; // hipFunction, cuFunction\n"
"using hiprtApiModule   = void*; // hipModule, cuModule\n"
"\n"
"/** \\brief Ray traversal type.\n"
"*\n"
"*/\n"
"enum hiprtTraversalType\n"
"{\n"
"	/*!< 0 or 1 element iterator with any hit along the ray */\n"
"	hiprtTraversalTerminateAtAnyHit = 1,\n"
"	/*!< 0 or 1 element iterator with a closest hit along the ray */\n"
"	hiprtTraversalTerminateAtClosestHit = 2,\n"
"};\n"
"\n"
"/** \\brief Traversal state.\n"
"*\n"
"* On-device traversal can be in either hit state (and can be continued using\n"
"* hiprtNextHit) or finished state.\n"
"*/\n"
"enum hiprtTraversalState\n"
"{\n"
"	hiprtTraversalStateInit,\n"
"	hiprtTraversalStateFinished,\n"
"	hiprtTraversalStateHit,\n"
"	hiprtTraversalStateStackOverflow\n"
"};\n"
"\n"
"/** \\brief Traversal hint.\n"
"*\n"
"* An additional information about the rays for the traversal object.\n"
"* It is taken into account only on AMD Navi3x (RDNA3) and above.\n"
"*/\n"
"enum hiprtTraversalHint\n"
"{\n"
"	hiprtTraversalHintDefault		 = 0,\n"
"	hiprtTraversalHintShadowRays	 = 1,\n"
"	hiprtTraversalHintReflectionRays = 2\n"
"};\n"
"\n"
"/** \\brief Various constants.\n"
"*\n"
"*/\n"
"enum : uint32_t\n"
"{\n"
"	hiprtInvalidValue			   = hiprt::InvalidValue,\n"
"	hiprtFullRayMask			   = hiprt::FullRayMask,\n"
"	hiprtMaxBatchBuildMaxPrimCount = hiprt::MaxBatchBuildMaxPrimCount,\n"
"	hiprtMaxInstanceLevels		   = hiprt::MaxInstanceLevels,\n"
"};\n"
"\n"
"/** \\brief Error codes.\n"
"*\n"
"*/\n"
"enum hiprtError\n"
"{\n"
"	hiprtSuccess				= 0,\n"
"	hiprtErrorNotImplemented	= 1,\n"
"	hiprtErrorInternal			= 2,\n"
"	hiprtErrorOutOfHostMemory	= 3,\n"
"	hiprtErrorOutOfDeviceMemory = 4,\n"
"	hiprtErrorInvalidApiVersion = 5,\n"
"	hiprtErrorInvalidParameter	= 6\n"
"};\n"
"\n"
"/** \\brief Log levels.\n"
"*\n"
"*/\n"
"enum hiprtLogLevelBits\n"
"{\n"
"	hiprtLogLevelNone  = 0,\n"
"	hiprtLogLevelInfo  = 1 << 0,\n"
"	hiprtLogLevelWarn  = 1 << 1,\n"
"	hiprtLogLevelError = 1 << 2\n"
"};\n"
"\n"
"/** \\brief Type of geometry/scene build operation.\n"
"*\n"
"* hiprtBuildGeometry/hiprtBuildScene can either build or update\n"
"* an underlying acceleration structure.\n"
"*/\n"
"enum hiprtBuildOperation\n"
"{\n"
"	hiprtBuildOperationBuild  = 1,\n"
"	hiprtBuildOperationUpdate = 2\n"
"};\n"
"\n"
"/** \\brief Hint flags for geometry/scene build functions.\n"
"*\n"
"* hiprtBuildGeometry/hiprtBuildScene use these flags to choose\n"
"* an appropriate build format/algorithm.\n"
"*/\n"
"enum hiprtBuildFlagBits\n"
"{\n"
"	hiprtBuildFlagBitPreferFastBuild			  = 0,\n"
"	hiprtBuildFlagBitPreferBalancedBuild		  = 1,\n"
"	hiprtBuildFlagBitPreferHighQualityBuild		  = 2,\n"
"	hiprtBuildFlagBitCustomBvhImport			  = 3,\n"
"	hiprtBuildFlagBitDisableSpatialSplits		  = 1 << 2,\n"
"	hiprtBuildFlagBitDisableTrianglePairing		  = 1 << 3,\n"
"	hiprtBuildFlagBitDisableOrientedBoundingBoxes = 1 << 4\n"
"};\n"
"\n"
"/** \\brief Geometric primitive type.\n"
"*\n"
"* hiprtGeometry can be built from multiple primitive types,\n"
"* such as triangle meshes, AABB lists, line lists, etc. This enum\n"
"* defines primitive type for hiprtBuildGeometry function.\n"
"*/\n"
"enum hiprtPrimitiveType\n"
"{\n"
"	hiprtPrimitiveTypeTriangleMesh,\n"
"	hiprtPrimitiveTypeAABBList\n"
"};\n"
"\n"
"/** \\brief Instance type.\n"
"*\n"
"* hiprtScene can be bult from instances either of hiprtGeometry or hiprtScene.\n"
"* This enum defines instance type for hiprtBuildScene function.\n"
"*/\n"
"enum hiprtInstanceType\n"
"{\n"
"	hiprtInstanceTypeGeometry,\n"
"	hiprtInstanceTypeScene\n"
"};\n"
"\n"
"/** \\brief Primitve types\n"
"*\n"
"*/\n"
"enum hiprtPrimitiveNodeType\n"
"{\n"
"	hiprtTriangleNode = 0,\n"
"	hiprtCustomNode	  = 1\n"
"};\n"
"\n"
"/** \\brief Transformation frame type.\n"
"*\n"
"*/\n"
"enum hiprtFrameType\n"
"{\n"
"	hiprtFrameTypeSRT,\n"
"	hiprtFrameTypeMatrix\n"
"};\n"
"\n"
"/** \\brief Stack type.\n"
"*\n"
"*/\n"
"enum hiprtStackType\n"
"{\n"
"	hiprtStackTypeGlobal,\n"
"	hiprtStackTypeDynamic\n"
"};\n"
"\n"
"/** \\brief Stack entry type.\n"
"*\n"
"*/\n"
"enum hiprtStackEntryType\n"
"{\n"
"	hiprtStackEntryTypeInteger,\n"
"	hiprtStackEntryTypeInstance\n"
"};\n"
"\n"
"/** \\brief Bvh node type.\n"
"*\n"
"*/\n"
"enum hiprtBvhNodeType\n"
"{\n"
"	/*!< Internal node */\n"
"	hiprtBvhNodeTypeInternal = 0,\n"
"	/*!< Leaf node */\n"
"	hiprtBvhNodeTypeLeaf = 1,\n"
"};\n"
"\n"
"/** \\brief Ray data structure.\n"
"*\n"
"*/\n"
"struct alignas( 16 ) hiprtRay\n"
"{\n"
"	/*!< Ray origin */\n"
"	hiprtFloat3 origin;\n"
"	/*!< Ray maximum distance */\n"
"	float minT = 0.0f;\n"
"	/*!< Ray direction */\n"
"	hiprtFloat3 direction;\n"
"	/*!< Ray maximum distance */\n"
"	float maxT = hiprt::FltMax;\n"
"};\n"
"HIPRT_STATIC_ASSERT( sizeof( hiprtRay ) == 32 );\n"
"\n"
"/** \\brief Ray hit data structure.\n"
"*\n"
"*/\n"
"struct alignas( 16 ) hiprtHit\n"
"{\n"
"	/*!< Instance IDs */\n"
"	union\n"
"	{\n"
"		/*!< Instance ID (for a single level instancing) */\n"
"		uint32_t instanceID = hiprtInvalidValue;\n"
"		/*!< Instance IDs */\n"
"		uint32_t instanceIDs[hiprtMaxInstanceLevels];\n"
"	};\n"
"	/*!< Primitive ID */\n"
"	uint32_t primID = hiprtInvalidValue;\n"
"	/*!< Texture coordinates */\n"
"	hiprtFloat2 uv;\n"
"	/*!< Geometric normal (not normalized and in the object space) */\n"
"	hiprtFloat3 normal;\n"
"	/*!< Distance */\n"
"	float t = -1.0f;\n"
"\n"
"	HIPRT_DEVICE bool hasHit() const { return primID != hiprtInvalidValue; }\n"
"};\n"
"HIPRT_STATIC_ASSERT( sizeof( hiprtHit ) == 48 );\n"
"\n"
"/** \\brief Set of device data pointers for custom functions.\n"
"*\n"
"*/\n"
"struct hiprtFuncDataSet\n"
"{\n"
"	const void* intersectFuncData = nullptr;\n"
"	const void* filterFuncData	  = nullptr;\n"
"};\n"
"\n"
"/** \\brief A header of the function table.\n"
"*\n"
"*/\n"
"struct hiprtFuncTableHeader\n"
"{\n"
"	uint32_t		  numGeomTypes;\n"
"	uint32_t		  numRayTypes;\n"
"	hiprtFuncDataSet* funcDataSets;\n"
"};\n"
"\n"
"/** \\brief A header of the global stack buffer.\n"
"* Use API functions to create this buffer.\n"
"* - hiprtCreateStackBuffer\n"
"* - hiprtDestroyStackBuffer\n"
"*/\n"
"struct hiprtGlobalStackBuffer\n"
"{\n"
"	uint32_t stackSize;\n"
"	uint32_t stackCount;\n"
"	void*	 stackData;\n"
"};\n"
"\n"
"/** \\brief A header of the shared stack buffer.\n"
"*\n"
"*/\n"
"struct hiprtSharedStackBuffer\n"
"{\n"
"	uint32_t stackSize;\n"
"	void*	 stackData;\n"
"};\n"
"\n"
"/** \\brief Set of function names.\n"
"*\n"
"*/\n"
"struct hiprtFuncNameSet\n"
"{\n"
"	const char* intersectFuncName = nullptr;\n"
"	const char* filterFuncName	  = nullptr;\n"
"};\n"
"\n"
"/** \\brief Device type.\n"
"*\n"
"*/\n"
"enum hiprtDeviceType\n"
"{\n"
"	/*!< AMD device */\n"
"	hiprtDeviceAMD,\n"
"	/*!< Nvidia device */\n"
"	hiprtDeviceNVIDIA,\n"
"};\n"
"\n"
"/** \\brief Context creation input.\n"
"*\n"
"*/\n"
"struct hiprtContextCreationInput\n"
"{\n"
"	/*!< HIPRT API context */\n"
"	hiprtApiCtx ctxt;\n"
"	/*!< HIPRT API device */\n"
"	hiprtApiDevice device;\n"
"	/*!< HIPRT API device type */\n"
"	hiprtDeviceType deviceType;\n"
"};\n"
"\n"
"/** \\brief Various flags controlling scene/geometry build process.\n"
"*\n"
"*/\n"
"struct hiprtBuildOptions\n"
"{\n"
"	/*!< Build flags */\n"
"	hiprtBuildFlags buildFlags;\n"
"	/*!< Batch build max prim count (if 0 then batch build is not used) */\n"
"	uint32_t batchBuildMaxPrimCount = 0u;\n"
"};\n"
"\n"
"/** \\brief Triangle mesh primitive.\n"
"*\n"
"* Triangle mesh primitive is represented as an indexed vertex array.\n"
"* Vertex and index arrays are defined using device pointers and strides.\n"
"* Each vertex has to have 3 components: (x, y, z) coordinates.\n"
"* Indices are organized into triples (i0, i1, i2) - one for each triangle.\n"
"* If the indices are not provided, it assumes (3*t+0, 3*t+1, 3*t+2).\n"
"*/\n"
"struct hiprtTriangleMeshPrimitive\n"
"{\n"
"	/*!< Device pointer to vertex data */\n"
"	hiprtDevicePtr vertices;\n"
"	/*!< Number of vertices in vertex array */\n"
"	uint32_t vertexCount;\n"
"	/*!< Stride in bytes between two vertices */\n"
"	uint32_t vertexStride;\n"
"\n"
"	/*!< Device pointer to triangle index data (optional) */\n"
"	hiprtDevicePtr triangleIndices = nullptr;\n"
"	/*!< Number of triangles in index array */\n"
"	uint32_t triangleCount = 0u;\n"
"	/*!< Stride in bytes between two triangles */\n"
"	uint32_t triangleStride = 0u;\n"
"\n"
"	/*!< Device pointer to triangle pair index data (optional) */\n"
"	hiprtDevicePtr trianglePairIndices = nullptr;\n"
"	/*!< Number of triangle pairs */\n"
"	uint32_t trianglePairCount = 0u;\n"
"};\n"
"\n"
"/** \\brief AABB list primitive.\n"
"*\n"
"* AABB list is an array of axis aligned bounding boxes, represented\n"
"* by device memory pointer and stride between two consecutive boxes.\n"
"* Each AABB is a pair of float3 or float4 values.\n"
"*/\n"
"struct hiprtAABBListPrimitive\n"
"{\n"
"	/*!< Device pointer to AABB data */\n"
"	hiprtDevicePtr aabbs;\n"
"	/*!< Number of AABBs in the array */\n"
"	uint32_t aabbCount;\n"
"	/*!< Stride in bytes between two AABBs (2 * sizeof(float3) or 2 * sizeof(float4)) */\n"
"	uint32_t aabbStride;\n"
"};\n"
"\n"
"/** \\brief Internal Bvh node for custom import Bvh.\n"
"*\n"
"*/\n"
"struct alignas( 64 ) hiprtInternalNode\n"
"{\n"
"	/*!< Node bounding box min */\n"
"	hiprtFloat3 aabbMin;\n"
"	/*!< Node bounding box max */\n"
"	hiprtFloat3 aabbMax;\n"
"	/*!< Child indices */\n"
"	uint32_t childIndices[2];\n"
"	/*!< Child node types */\n"
"	hiprtBvhNodeType childNodeTypes[2];\n"
"};\n"
"HIPRT_STATIC_ASSERT( sizeof( hiprtInternalNode ) == 64 );\n"
"\n"
"/** \\brief Leaf Bvh node for custom import Bvh.\n"
"*\n"
"* Each leaf references a single primitive.\n"
"* Multiple primitives can be references from different leaves.\n"
"*/\n"
"struct alignas( 32 ) hiprtLeafNode\n"
"{\n"
"	/*!< Node bounding box min\'s */\n"
"	hiprtFloat3 aabbMin;\n"
"	/*!< Node bounding box max\'s */\n"
"	hiprtFloat3 aabbMax;\n"
"	/*!< Primitive ID */\n"
"	uint32_t primID;\n"
"};\n"
"HIPRT_STATIC_ASSERT( sizeof( hiprtLeafNode ) == 32 );\n"
"\n"
"/** \\brief Bvh node list.\n"
"*\n"
"* The list with nodes representing a binary BVH to be imported.\n"
"* The root nodes is assumed to be at the first position.\n"
"* The number of internal nodes is equal to the number of leaves minus one\'.\n"
"*/\n"
"struct hiprtBvhNodeList\n"
"{\n"
"	/*!< Array of hiprtInternalNode\'s */\n"
"	hiprtDevicePtr internalNodes;\n"
"	/*!< Array of hiprtLeafNode\'s */\n"
"	hiprtDevicePtr leafNodes;\n"
"	/*!< The number of leaf nodes */\n"
"	uint32_t nodeCount;\n"
"};\n"
"\n"
"/** \\brief Build input for geometry build/update operation.\n"
"*\n"
"* Build input defines concrete primitive type and a pointer to an actual\n"
"* primitive description.\n"
"*/\n"
"struct alignas( 64 ) hiprtGeometryBuildInput\n"
"{\n"
"	/*!< Primitive type */\n"
"	hiprtPrimitiveType type;\n"
"	/*!< Geometry type used for custom function table */\n"
"	uint32_t geomType = hiprtInvalidValue;\n"
"	/*!< Defines the following union */\n"
"	union\n"
"	{\n"
"		/*!< Triangle mesh */\n"
"		hiprtTriangleMeshPrimitive triangleMesh;\n"
"		/*!< Bounding boxes of custom primitives */\n"
"		hiprtAABBListPrimitive aabbList;\n"
"	} primitive{};\n"
"	/*!< Custom Bvh nodes (optional) */\n"
"	hiprtBvhNodeList nodeList;\n"
"};\n"
"HIPRT_STATIC_ASSERT( sizeof( hiprtGeometryBuildInput ) == 128 );\n"
"\n"
"/** \\brief Instance containing a pointer to the actual geometry/scene.\n"
"*\n"
"*/\n"
"struct alignas( 16 ) hiprtInstance\n"
"{\n"
"	/*!< Instance type */\n"
"	hiprtInstanceType type;\n"
"	/*!< Defines the following union */\n"
"	union\n"
"	{\n"
"		/*!< Geometry */\n"
"		hiprtGeometry geometry;\n"
"		/*!< Scene */\n"
"		hiprtScene scene;\n"
"	};\n"
"};\n"
"HIPRT_STATIC_ASSERT( sizeof( hiprtInstance ) == 16 );\n"
"\n"
"/** \\brief Build input for the scene.\n"
"*\n"
"* Scene consists of a set of instances. Each of the instances is defined by:\n"
"*  - Root pointer of the corresponding geometry\n"
"*  - Transformation header\n"
"*  - Mask\n"
"*\n"
"* Instances can refer to the same geometry but with different transformations\n"
"* (essentially implementing instancing). Mask is used to implement ray\n"
"* masking: ray mask is bitwise &ded with an instance mask, and no intersections\n"
"* are evaluated with the primitive of corresponding instance if the result is\n"
"* 0. The transformation header defines the offset and the number of consecutive\n"
"* transformation frames in the frame array for each instance. More than one frame\n"
"* is interpreted as motion blur. If the transformation headers is nullptr, it\n"
"* assumes one frame per instance. Optionally, it is possible to import a custom\n"
"* BVH by setting nodes and the corresponding build flag.\n"
"*/\n"
"struct alignas( 16 ) hiprtSceneBuildInput\n"
"{\n"
"	/*!< Array of instanceCount pointers to instances */\n"
"	hiprtDevicePtr instances;\n"
"	/*!< Array of instanceCount transform headers (optional: per object frame assumed if nullptr) */\n"
"	hiprtDevicePtr instanceTransformHeaders;\n"
"	/*!< Array of frameCount frames (supposed to be ordered according to time) */\n"
"	hiprtDevicePtr instanceFrames;\n"
"	/*!< Per object bit masks for instance masking (optional: if nullptr masks treated as hiprtFullRayMask) */\n"
"	hiprtDevicePtr instanceMasks;\n"
"	/*!< Custom Bvh nodes (optional) */\n"
"	hiprtBvhNodeList nodeList;\n"
"	/*!< Number of instances */\n"
"	uint32_t instanceCount;\n"
"	/*!< Number of frames (such that instanceCount <= frameCount) */\n"
"	uint32_t frameCount;\n"
"	/*!< Frame type (SRT or matrix) */\n"
"	hiprtFrameType frameType = hiprtFrameTypeSRT;\n"
"};\n"
"HIPRT_STATIC_ASSERT( sizeof( hiprtSceneBuildInput ) == 80 );\n"
"\n"
"/** \\brief Input for the global stack buffer allocation\n"
"*\n"
"*/\n"
"struct hiprtGlobalStackBufferInput\n"
"{\n"
"	/*!< Stack type */\n"
"	hiprtStackType type = hiprtStackTypeGlobal;\n"
"	/*!< Stack entry type */\n"
"	hiprtStackEntryType entryType = hiprtStackEntryTypeInteger;\n"
"	/*!< Global stack size (e.g. 64) */\n"
"	uint32_t stackSize;\n"
"	/*!< Total number of threads (for hiprtGlobalStack only) */\n"
"	uint32_t threadCount;\n"
"};\n"
"\n"
"/** \\brief Stack entry for instace stacks\n"
"*\n"
"*/\n"
"struct hiprtInstanceStackEntry\n"
"{\n"
"	/*!< Ray */\n"
"	hiprtRay ray;\n"
"	/*!< Scene */\n"
"	hiprtScene scene;\n"
"};\n"
"\n"
"/** \\brief SRT transformation frame.\n"
"*\n"
"* Represented by scale (S), rotation (R), translation (T), and frame time.\n"
"* Object to world transformation is composed as (T * R * S) * x = y\n"
"*/\n"
"struct alignas( 16 ) hiprtFrameSRT\n"
"{\n"
"	/*!< Rotation (axis and angle) */\n"
"	hiprtFloat4 rotation;\n"
"	/*!< Scale */\n"
"	hiprtFloat3 scale;\n"
"	/*!< Translation */\n"
"	hiprtFloat3 translation;\n"
"	/*!< Frame time */\n"
"	float time;\n"
"};\n"
"HIPRT_STATIC_ASSERT( sizeof( hiprtFrameSRT ) == 48 );\n"
"\n"
"/** \\brief Transformation matrix frame representation.\n"
"*\n"
"* Represented by a 3x4 matrix and frame time.\n"
"*/\n"
"struct alignas( 64 ) hiprtFrameMatrix\n"
"{\n"
"	/*!< Matrix */\n"
"	float matrix[3][4];\n"
"	/*!< Frame time */\n"
"	float time;\n"
"};\n"
"HIPRT_STATIC_ASSERT( sizeof( hiprtFrameMatrix ) == 64 );\n"
"\n"
"/** \\brief Transformation header.\n"
"*\n"
"* Defines defines the index to the array of frames and the number of frames.\n"
"*/\n"
"struct alignas( 8 ) hiprtTransformHeader\n"
"{\n"
"	/*!< Frame index */\n"
"	uint32_t frameIndex;\n"
"	/*!< Number of frames */\n"
"	uint32_t frameCount;\n"
"};\n"
"HIPRT_STATIC_ASSERT( sizeof( hiprtTransformHeader ) == 8 );\n"
;
// hip_hiprt_common: string constant holding the text of a C++ header (the text
// starts with "#pragma once").
//
// The embedded text provides, in order: the __KERNELCC__ / __KERNELCC_RTC__
// detection macros, host-only standard includes, HIPRT_ASSERT and
// HIPRT_STATIC_ASSERT, inline/const/host/device qualifier macros, the
// GET_ARGS / GET_INC / GET_ARG_LIST macros, fixed-width integer aliases used
// when __KERNELCC_RTC__ is defined, size/alignment static asserts for the
// fundamental types, numeric constants in namespace hiprt, selection of
// HIPRT_RTIP / WarpSize / BranchingFactor from __gfx*__ macros, bit
// reinterpretation and clz helpers, wrappers over shuffle/ballot/any/all
// intrinsics, RoundUp / DivideRoundUp / Log2, small type-trait replacements,
// the hiprtPimpl in-place storage wrapper, and the TraversalObjSize /
// TraversalObjAlignment enums.
//
// Every character between the quotes is part of the constant's value. This
// file is marked as generated at its top, so corrections presumably belong in
// the header this text was produced from, followed by regeneration.
//
// NOTE(review): things to confirm at the origin of this text, not here:
//   - hiprtPimpl::PimplPtrCheck asserts
//     alignof( T ) >= hiprt::alignment_of<T>::value, and alignment_of<T> is
//     defined from alignof( T ), so the check is always true; a comparison
//     against the Align template parameter may have been intended.
//   - the host branches of as_float / as_int / as_uint reinterpret the
//     argument through a reinterpret_cast'ed pointer.
//   - HIPRT_ASSERT expands to a bare if-statement rather than a
//     do { } while ( 0 ) wrapper.
static const char* hip_hiprt_common= \
"\n"
"#pragma once\n"
"\n"
"#if ( defined( __CUDACC__ ) || defined( __HIPCC__ ) )\n"
"#define __KERNELCC__\n"
"#endif\n"
"\n"
"#if ( defined( __CUDACC_RTC__ ) || defined( __HIPCC_RTC__ ) )\n"
"#define __KERNELCC_RTC__\n"
"#endif\n"
"\n"
"#if !defined( __KERNELCC__ )\n"
"#include <algorithm>\n"
"#include <cfloat>\n"
"#include <cstring>\n"
"#include <cmath>\n"
"#include <map>\n"
"#include <string>\n"
"#include <vector>\n"
"#include <fstream>\n"
"#include <iostream>\n"
"#include <type_traits>\n"
"#define __host__\n"
"#define __device__\n"
"#endif\n"
"\n"
"#if !defined( __KERNELCC_RTC__ )\n"
"#include <cstdint>\n"
"#endif\n"
"\n"
"#if !defined( __KERNELCC__ )\n"
"#if defined( _MSC_VER )\n"
"#define HIPRT_ASSERT( cond ) \\\n"
"	if ( !( cond ) )         \\\n"
"	{                        \\\n"
"		__debugbreak();      \\\n"
"	}\n"
"#elif defined( __GNUC__ )\n"
"#include <signal.h>\n"
"#define HIPRT_ASSERT( cond ) \\\n"
"	if ( !( cond ) )         \\\n"
"	{                        \\\n"
"		raise( SIGTRAP );    \\\n"
"	}\n"
"#else\n"
"#define HIPRT_ASSERT( cond )\n"
"#endif\n"
"#else\n"
"#define HIPRT_ASSERT( cond )\n"
"#endif\n"
"\n"
"#define HIPRT_STATIC_ASSERT( cond ) static_assert( ( cond ), \"\" )\n"
"\n"
"#ifdef __KERNELCC__\n"
"#define HIPRT_INLINE __forceinline__\n"
"#define HIPRT_CONST __constant__\n"
"#else\n"
"#define HIPRT_INLINE inline\n"
"#define HIPRT_CONST const\n"
"#endif\n"
"\n"
"#define HIPRT_HOST __host__\n"
"#define HIPRT_DEVICE __device__\n"
"#define HIPRT_HOST_DEVICE __host__ __device__\n"
"\n"
"#if defined( HIPRT_BAKE_KERNEL_GENERATED )\n"
"#define GET_ARGS( X ) ( hip::X##Args )\n"
"#define GET_INC( X ) ( hip::X##Includes )\n"
"#else\n"
"#define GET_ARGS( X ) static_cast<const char**>( nullptr )\n"
"#define GET_INC( X ) static_cast<const char**>( nullptr )\n"
"#endif\n"
"\n"
"#if defined( HIPRT_LOAD_FROM_STRING )\n"
"#define GET_ARG_LIST( X ) sizeof( GET_ARGS( X ) ) / sizeof( void* ), GET_ARGS( X ), GET_INC( X )\n"
"#else\n"
"#define GET_ARG_LIST( X ) 0, 0, 0\n"
"#endif\n"
"\n"
"#if defined( __KERNELCC_RTC__ )\n"
"#if defined( __CUDACC_RTC__ ) || HIP_VERSION_MAJOR < 7\n"
"using int8_t   = char;\n"
"using uint8_t  = unsigned char;\n"
"using int16_t  = short;\n"
"using uint16_t = unsigned short;\n"
"using int32_t  = int;\n"
"using uint32_t = unsigned int;\n"
"using int64_t  = long long;\n"
"using uint64_t = unsigned long long;\n"
"#else\n"
"using int8_t					   = __hip_internal::int8_t;\n"
"using uint8_t					   = __hip_internal::uint8_t;\n"
"using int16_t					   = __hip_internal::int16_t;\n"
"using uint16_t					   = __hip_internal::uint16_t;\n"
"using int32_t					   = __hip_internal::int32_t;\n"
"using uint32_t					   = __hip_internal::uint32_t;\n"
"using int64_t					   = __hip_internal::int64_t;\n"
"using uint64_t					   = __hip_internal::uint64_t;\n"
"#endif\n"
"#endif\n"
"\n"
"HIPRT_STATIC_ASSERT( sizeof( int8_t ) == 1 );\n"
"HIPRT_STATIC_ASSERT( sizeof( int16_t ) == 2 );\n"
"HIPRT_STATIC_ASSERT( sizeof( int32_t ) == 4 );\n"
"HIPRT_STATIC_ASSERT( sizeof( int64_t ) == 8 );\n"
"HIPRT_STATIC_ASSERT( sizeof( uint8_t ) == 1 );\n"
"HIPRT_STATIC_ASSERT( sizeof( uint16_t ) == 2 );\n"
"HIPRT_STATIC_ASSERT( sizeof( uint32_t ) == 4 );\n"
"HIPRT_STATIC_ASSERT( sizeof( uint64_t ) == 8 );\n"
"HIPRT_STATIC_ASSERT( alignof( int8_t ) == 1 );\n"
"HIPRT_STATIC_ASSERT( alignof( int16_t ) == 2 );\n"
"HIPRT_STATIC_ASSERT( alignof( int32_t ) == 4 );\n"
"HIPRT_STATIC_ASSERT( alignof( int64_t ) == 8 );\n"
"HIPRT_STATIC_ASSERT( alignof( uint8_t ) == 1 );\n"
"HIPRT_STATIC_ASSERT( alignof( uint16_t ) == 2 );\n"
"HIPRT_STATIC_ASSERT( alignof( uint32_t ) == 4 );\n"
"HIPRT_STATIC_ASSERT( alignof( uint64_t ) == 8 );\n"
"\n"
"HIPRT_STATIC_ASSERT( sizeof( double ) == 8 );\n"
"HIPRT_STATIC_ASSERT( sizeof( float ) == 4 );\n"
"HIPRT_STATIC_ASSERT( alignof( double ) == 8 );\n"
"HIPRT_STATIC_ASSERT( alignof( float ) == 4 );\n"
"\n"
"HIPRT_STATIC_ASSERT( sizeof( unsigned long long int ) == 8 );\n"
"HIPRT_STATIC_ASSERT( sizeof( long long int ) == 8 );\n"
"HIPRT_STATIC_ASSERT( sizeof( unsigned int ) == 4 );\n"
"HIPRT_STATIC_ASSERT( sizeof( int ) == 4 );\n"
"HIPRT_STATIC_ASSERT( sizeof( unsigned short int ) == 2 );\n"
"HIPRT_STATIC_ASSERT( sizeof( short int ) == 2 );\n"
"HIPRT_STATIC_ASSERT( sizeof( unsigned char ) == 1 );\n"
"HIPRT_STATIC_ASSERT( sizeof( char ) == 1 );\n"
"HIPRT_STATIC_ASSERT( alignof( unsigned long long int ) == 8 );\n"
"HIPRT_STATIC_ASSERT( alignof( long long int ) == 8 );\n"
"HIPRT_STATIC_ASSERT( alignof( unsigned int ) == 4 );\n"
"HIPRT_STATIC_ASSERT( alignof( int ) == 4 );\n"
"HIPRT_STATIC_ASSERT( alignof( unsigned short int ) == 2 );\n"
"HIPRT_STATIC_ASSERT( alignof( short int ) == 2 );\n"
"HIPRT_STATIC_ASSERT( alignof( unsigned char ) == 1 );\n"
"HIPRT_STATIC_ASSERT( alignof( char ) == 1 );\n"
"\n"
"namespace hiprt\n"
"{\n"
"constexpr float Pi	   = 3.14159265358979323846f;\n"
"constexpr float TwoPi  = 2.0f * Pi;\n"
"constexpr float FltMin = 1.175494351e-38f;\n"
"constexpr float FltMax = 3.402823466e+38f;\n"
"constexpr int	IntMin = -2147483647 - 1;\n"
"constexpr int	IntMax = 2147483647;\n"
"\n"
"constexpr uint32_t InvalidValue				 = ~0u;\n"
"constexpr uint32_t FullRayMask				 = ~0u;\n"
"constexpr uint32_t MaxBatchBuildMaxPrimCount = 512u;\n"
"constexpr uint32_t MaxInstanceLevels		 = 4u;\n"
"constexpr uint32_t DefaultAlignment			 = 64u;\n"
"constexpr uint32_t NoRotationIndex			 = 127u;\n"
"constexpr uint32_t InstanceIDBits			 = 24u;\n"
"\n"
"#ifdef __KERNELCC__\n"
"#ifndef HIPRT_RTIP\n"
"#if __gfx1200__ || __gfx1201__\n"
"#if ( HIP_VERSION_MAJOR >= 7 ) || \\\n"
"	( defined( HIPCC_OS_WINDOWS ) && ( ( HIP_VERSION_MAJOR > 6 ) || ( HIP_VERSION_MAJOR == 6 && HIP_VERSION_MINOR >= 4 ) ) )\n"
"#define HIPRT_RTIP 31\n"
"#else\n"
"#define HIPRT_RTIP 20\n"
"#warning \\\n"
"	\"HW supports RTIP 3.1 but the compiler is of an older version; build with ROCm 6.4+ (Win) or 7.0+ (Linux) to fully utilize HW ray tracing features\"\n"
"#endif\n"
"#elif __gfx1100__ || __gfx1101__ || __gfx1102__ || __gfx1103__ || __gfx1150__ || __gfx1151__ || __gfx1152__ || __gfx1153__\n"
"#define HIPRT_RTIP 20\n"
"#elif __gfx1030__ || __gfx1031__ || __gfx1032__ || __gfx1033__ || __gfx1034__ || __gfx1035__ || __gfx1036__\n"
"#define HIPRT_RTIP 11\n"
"#else\n"
"#define HIPRT_RTIP 0\n"
"#endif\n"
"#endif\n"
"\n"
"#if __gfx900__ || __gfx902__ || __gfx904__ || __gfx906__ || __gfx908__ || __gfx909__ || __gfx90a__ || __gfx90c__ || \\\n"
"	__gfx940__ || __gfx941__ || __gfx942__\n"
"constexpr uint32_t WarpSize = 64;\n"
"#else\n"
"constexpr uint32_t WarpSize		   = 32;\n"
"#endif\n"
"\n"
"constexpr uint32_t Rtip = HIPRT_RTIP;\n"
"\n"
"#if HIPRT_RTIP >= 31\n"
"constexpr uint32_t BranchingFactor = 8;\n"
"#else\n"
"constexpr uint32_t BranchingFactor = 4;\n"
"#endif\n"
"#endif\n"
"\n"
"HIPRT_HOST_DEVICE HIPRT_INLINE float as_float( uint32_t value )\n"
"{\n"
"#if defined( __KERNELCC__ )\n"
"	return __uint_as_float( value );\n"
"#else\n"
"	return *reinterpret_cast<float*>( &value );\n"
"#endif\n"
"}\n"
"\n"
"HIPRT_HOST_DEVICE HIPRT_INLINE int as_int( float value )\n"
"{\n"
"#if defined( __KERNELCC__ )\n"
"	return __float_as_int( value );\n"
"#else\n"
"	return *reinterpret_cast<int*>( &value );\n"
"#endif\n"
"}\n"
"\n"
"HIPRT_HOST_DEVICE HIPRT_INLINE uint32_t as_uint( float value )\n"
"{\n"
"#if defined( __KERNELCC__ )\n"
"	return __float_as_uint( value );\n"
"#else\n"
"	return *reinterpret_cast<uint32_t*>( &value );\n"
"#endif\n"
"}\n"
"\n"
"HIPRT_HOST_DEVICE HIPRT_INLINE uint32_t clz( uint32_t value )\n"
"{\n"
"#if defined( __KERNELCC__ )\n"
"	return __clz( value );\n"
"#else\n"
"	uint32_t count = 0;\n"
"	for ( uint32_t mask = 1u << 31; mask && !( value & mask ); mask >>= 1 )\n"
"		++count;\n"
"	return value == 0 ? 32 : count;\n"
"#endif\n"
"}\n"
"\n"
"#ifdef __KERNELCC__\n"
"template <typename T>\n"
"HIPRT_INLINE HIPRT_DEVICE T shfl( T var, int srcLane )\n"
"{\n"
"#ifdef __CUDACC__\n"
"	return __shfl_sync( __activemask(), var, srcLane );\n"
"#else\n"
"	return __shfl( var, srcLane );\n"
"#endif\n"
"}\n"
"\n"
"template <typename T>\n"
"HIPRT_INLINE HIPRT_DEVICE T shfl_up( T var, int srcLane )\n"
"{\n"
"#ifdef __CUDACC__\n"
"	return __shfl_up_sync( __activemask(), var, srcLane );\n"
"#else\n"
"	return __shfl_up( var, srcLane );\n"
"#endif\n"
"}\n"
"\n"
"template <typename T>\n"
"HIPRT_INLINE HIPRT_DEVICE T shfl_down( T var, int srcLane )\n"
"{\n"
"#ifdef __CUDACC__\n"
"	return __shfl_down_sync( __activemask(), var, srcLane );\n"
"#else\n"
"	return __shfl_down( var, srcLane );\n"
"#endif\n"
"}\n"
"\n"
"template <typename T>\n"
"HIPRT_INLINE HIPRT_DEVICE T shfl_xor( T var, int srcLane )\n"
"{\n"
"#ifdef __CUDACC__\n"
"	return __shfl_xor_sync( __activemask(), var, srcLane );\n"
"#else\n"
"	return __shfl_xor( var, srcLane );\n"
"#endif\n"
"}\n"
"\n"
"HIPRT_INLINE HIPRT_DEVICE uint64_t ballot( int predicate )\n"
"{\n"
"#ifdef __CUDACC__\n"
"	return static_cast<uint64_t>( __ballot_sync( __activemask(), predicate ) );\n"
"#else\n"
"	return __ballot( predicate );\n"
"#endif\n"
"}\n"
"\n"
"HIPRT_INLINE HIPRT_DEVICE uint32_t any( int predicate )\n"
"{\n"
"#ifdef __CUDACC__\n"
"	return __any_sync( __activemask(), predicate );\n"
"#else\n"
"	return __any( predicate );\n"
"#endif\n"
"}\n"
"HIPRT_INLINE HIPRT_DEVICE uint32_t all( int predicate )\n"
"{\n"
"#ifdef __CUDACC__\n"
"	return __all_sync( __activemask(), predicate );\n"
"#else\n"
"	return __all( predicate );\n"
"#endif\n"
"}\n"
"#endif\n"
"\n"
"template <typename T, typename U>\n"
"constexpr HIPRT_HOST_DEVICE T RoundUp( T value, U factor )\n"
"{\n"
"	return ( value + factor - 1 ) / factor * factor;\n"
"}\n"
"\n"
"template <typename T, typename U>\n"
"constexpr HIPRT_HOST_DEVICE T DivideRoundUp( T value, U factor )\n"
"{\n"
"	return ( value + factor - 1 ) / factor;\n"
"}\n"
"\n"
"template <typename T>\n"
"constexpr HIPRT_HOST_DEVICE T Log2( T n )\n"
"{\n"
"	return n <= 1 ? 0 : 1 + Log2( ( n + 1 ) / 2 );\n"
"}\n"
"\n"
"#if !defined( __KERNELCC_RTC__ ) || defined( __CUDACC_RTC__ ) || HIP_VERSION_MAJOR < 7\n"
"template <class T, class U>\n"
"struct is_same\n"
"{\n"
"	enum\n"
"	{\n"
"		value = 0\n"
"	};\n"
"};\n"
"\n"
"template <class T>\n"
"struct is_same<T, T>\n"
"{\n"
"	enum\n"
"	{\n"
"		value = 1\n"
"	};\n"
"};\n"
"\n"
"template <bool B, class T, class F>\n"
"struct conditional\n"
"{\n"
"	using type = T;\n"
"};\n"
"\n"
"template <class T, class F>\n"
"struct conditional<false, T, F>\n"
"{\n"
"	using type = F;\n"
"};\n"
"#else\n"
"template <class T, class U>\n"
"using is_same = __hip_internal::is_same<T, U>;\n"
"template <bool B, class T, class F>\n"
"using conditional = __hip_internal::conditional<B, T, F>;\n"
"#endif\n"
"\n"
"template <class T>\n"
"struct remove_reference\n"
"{\n"
"	using type = T;\n"
"};\n"
"\n"
"template <class T>\n"
"struct remove_reference<T&>\n"
"{\n"
"	using type = T;\n"
"};\n"
"\n"
"template <class T>\n"
"struct remove_reference<T&&>\n"
"{\n"
"	using type = T;\n"
"};\n"
"\n"
"template <class Ty, Ty Val>\n"
"struct integral_constant\n"
"{\n"
"	static constexpr Ty value = Val;\n"
"	using value_type		  = Ty;\n"
"	using type				  = integral_constant;\n"
"\n"
"	HIPRT_DEVICE constexpr							operator value_type() const noexcept { return value; }\n"
"	[[nodiscard]] HIPRT_DEVICE constexpr value_type operator()() const noexcept { return value; }\n"
"};\n"
"\n"
"template <bool _Val>\n"
"using bool_constant = integral_constant<bool, _Val>;\n"
"\n"
"using true_type	 = bool_constant<true>;\n"
"using false_type = bool_constant<false>;\n"
"\n"
"template <class T>\n"
"struct is_lvalue_reference : false_type\n"
"{\n"
"};\n"
"\n"
"template <class T>\n"
"struct is_lvalue_reference<T&> : true_type\n"
"{\n"
"};\n"
"\n"
"template <class T>\n"
"HIPRT_DEVICE constexpr typename remove_reference<T>::type&& move( T&& t ) noexcept\n"
"{\n"
"	return static_cast<typename remove_reference<T>::type&&>( t );\n"
"}\n"
"\n"
"template <class T>\n"
"HIPRT_DEVICE constexpr T&& forward( typename remove_reference<T>::type& t ) noexcept\n"
"{\n"
"	return static_cast<T&&>( t );\n"
"}\n"
"\n"
"template <class T>\n"
"HIPRT_DEVICE constexpr T&& forward( typename remove_reference<T>::type&& t ) noexcept\n"
"{\n"
"	HIPRT_STATIC_ASSERT( !is_lvalue_reference<T>::value );\n"
"	return static_cast<T&&>( t );\n"
"}\n"
"\n"
"template <class T>\n"
"struct alignment_of : integral_constant<size_t, alignof( T )>\n"
"{\n"
"};\n"
"\n"
"template <size_t Size, uint32_t Align>\n"
"struct aligned_storage\n"
"{\n"
"	struct type\n"
"	{\n"
"		alignas( Align ) uint8_t data[Size];\n"
"	};\n"
"};\n"
"\n"
"#if !defined( __KERNELCC__ )\n"
"template <typename T>\n"
"struct Traits\n"
"{\n"
"	static const std::string TYPE_NAME;\n"
"};\n"
"\n"
"#define DECLARE_TYPE_TRAITS( name ) \\\n"
"	template <>                     \\\n"
"	const std::string Traits<name>::TYPE_NAME = #name;\n"
"#endif\n"
"} // namespace hiprt\n"
"\n"
"template <typename T, size_t Size, uint32_t Align>\n"
"class hiprtPimpl\n"
"{\n"
"	typename hiprt::aligned_storage<Size, Align>::type data;\n"
"\n"
"public:\n"
"	template <size_t T_size>\n"
"	HIPRT_DEVICE static constexpr void PimplSizeCheck()\n"
"	{\n"
"		HIPRT_STATIC_ASSERT( T_size == Size );\n"
"	};\n"
"\n"
"	HIPRT_DEVICE static constexpr void PimplPtrCheck()\n"
"	{\n"
"		PimplSizeCheck<sizeof( T )>();\n"
"		HIPRT_STATIC_ASSERT( alignof( T ) >= hiprt::alignment_of<T>::value );\n"
"	}\n"
"\n"
"	template <typename... Args>\n"
"	HIPRT_DEVICE hiprtPimpl( Args&&... args )\n"
"	{\n"
"		PimplPtrCheck();\n"
"		new ( &data ) T( hiprt::forward<Args>( args )... );\n"
"	}\n"
"\n"
"	HIPRT_DEVICE hiprtPimpl( hiprtPimpl const& o )\n"
"	{\n"
"		PimplPtrCheck();\n"
"		new ( &data ) T( *o );\n"
"	}\n"
"\n"
"	HIPRT_DEVICE hiprtPimpl( hiprtPimpl& o )\n"
"	{\n"
"		PimplPtrCheck();\n"
"		new ( &data ) T( *o );\n"
"	}\n"
"\n"
"	HIPRT_DEVICE hiprtPimpl( hiprtPimpl&& o )\n"
"	{\n"
"		PimplPtrCheck();\n"
"		new ( &data ) T( hiprt::move( *o ) );\n"
"	}\n"
"\n"
"	HIPRT_DEVICE ~hiprtPimpl() {}\n"
"\n"
"	HIPRT_DEVICE hiprtPimpl& operator=( hiprtPimpl const& o )\n"
"	{\n"
"		PimplPtrCheck();\n"
"		**this = *o;\n"
"		return *this;\n"
"	}\n"
"\n"
"	HIPRT_DEVICE hiprtPimpl& operator=( hiprtPimpl&& o )\n"
"	{\n"
"		PimplPtrCheck();\n"
"		**this = hiprt::move( *o );\n"
"		return *this;\n"
"	}\n"
"\n"
"	HIPRT_DEVICE T& operator*()\n"
"	{\n"
"		PimplPtrCheck();\n"
"		return *reinterpret_cast<T*>( &data );\n"
"	}\n"
"\n"
"	HIPRT_DEVICE T const& operator*() const\n"
"	{\n"
"		PimplPtrCheck();\n"
"		return *reinterpret_cast<T const*>( &data );\n"
"	}\n"
"\n"
"	HIPRT_DEVICE T* operator->()\n"
"	{\n"
"		PimplPtrCheck();\n"
"		return &**this;\n"
"	}\n"
"\n"
"	HIPRT_DEVICE T const* operator->() const\n"
"	{\n"
"		PimplPtrCheck();\n"
"		return &**this;\n"
"	}\n"
"};\n"
"\n"
"enum TraversalObjSize\n"
"{\n"
"	SizePrivateStack			   = 260,\n"
"	SizeGlobalStack				   = 48,\n"
"	SizePrivateInstanceStack	   = 160,\n"
"	SizeGlobalInstanceStack		   = 48,\n"
"	SizeGeomTraversalCustomStack   = 128,\n"
"	SizeSceneTraversalCustomStack  = 176,\n"
"	SizeGeomTraversalPrivateStack  = 400,\n"
"	SizeSceneTraversalPrivateStack = 608,\n"
"};\n"
"\n"
"enum TraversalObjAlignment\n"
"{\n"
"	AlignmentPrivateStack				= 4,\n"
"	AlignmentGlobalStack				= 8,\n"
"	AlignmentPrivateInstanceStack		= 16,\n"
"	AlignmentGlobalInstanceStack		= 8,\n"
"	AlignmentGeomTraversalCustomStack	= 16,\n"
"	AlignmentSceneTraversalCustomStack	= 16,\n"
"	AlignmentGeomTraversalPrivateStack	= 16,\n"
"	AlignmentSceneTraversalPrivateStack = 16\n"
"};\n"
;
static const char* hip_hiprt_device_impl= \
"\n"
"#include <hiprt/hiprt_common.h>\n"
"#include <hiprt/hiprt_vec.h>\n"
"#include <hiprt/hiprt_math.h>\n"
"#include <hiprt/impl/Aabb.h>\n"
"#include <hiprt/impl/BvhNode.h>\n"
"#include <hiprt/impl/Header.h>\n"
"#include <hiprt/impl/Instance.h>\n"
"#include <hiprt/impl/QrDecomposition.h>\n"
"#include <hiprt/impl/Quaternion.h>\n"
"#include <hiprt/impl/Transform.h>\n"
"#include <hiprt/impl/Triangle.h>\n"
"#include <hiprt/hiprt_device.h>\n"
"\n"
// Embedded source: declaration of the reciprocal routine called by rcp() and
// testTriangle() below. Only declared when HIPRT_RTIP >= 11; the definition is
// not in this file.
// NOTE(review): presumably the device library's fast approximate reciprocal -
// confirm its precision is acceptable for the hit-distance computation.
"#if HIPRT_RTIP >= 11\n"
"extern \"C\" __device__ float __ocml_native_recip_f32( float );\n"
"#endif\n"
"\n"
// Three-component float vector type; used for the two output variables whose
// addresses are passed to __builtin_amdgcn_image_bvh8_intersect_ray in the
// HIPRT_RTIP >= 31 code paths.
"#if HIPRT_RTIP >= 31\n"
"using hip_float3 = float __attribute__( ( ext_vector_type( 3 ) ) );\n"
"#endif\n"
"\n"
// Embedded source: placement forms of operator new / operator new[] that return
// the supplied pointer unchanged. Emitted only when __HIPCC_RTC__ is defined and
// HIPRT_BITCODE_LINKING is not.
// NOTE(review): presumably needed because <new> is unavailable under runtime
// compilation - confirm. The `;` after each function body is an empty declaration.
"#if !defined( HIPRT_BITCODE_LINKING ) && defined( __HIPCC_RTC__ )\n"
"HIPRT_DEVICE void* operator new( size_t size, void* ptr ) noexcept { return ptr; };\n"
"HIPRT_DEVICE void* operator new[]( size_t size, void* ptr ) noexcept { return ptr; };\n"
"#endif\n"
"\n"
// Embedded source: declaration only; the definition is not in this file.
// GeomTraversal::testLeafNode calls it for non-triangle primitives with
// hit.primID already filled in, and treats a true return as "hit found".
"HIPRT_DEVICE bool intersectFunc(\n"
"	uint32_t					geomType,\n"
"	uint32_t					rayType,\n"
"	const hiprtFuncTableHeader& tableHeader,\n"
"	const hiprtRay&				ray,\n"
"	void*						payload,\n"
"	hiprtHit&					hit );\n"
// Embedded source: declaration only; the definition is not in this file.
// Every caller in this file treats a true return as "reject this hit": the
// corresponding hasHit flag is reset to false when filterFunc returns true.
"HIPRT_DEVICE bool filterFunc(\n"
"	uint32_t					geomType,\n"
"	uint32_t					rayType,\n"
"	const hiprtFuncTableHeader& tableHeader,\n"
"	const hiprtRay&				ray,\n"
"	void*						payload,\n"
"	const hiprtHit&				hit );\n"
"\n"
"namespace hiprt\n"
"{\n"
// Embedded source: bit flags OR-ed into a traversal's triangle mask by
// testTriangleNode to record which triangle of a pair has already been tested
// in any-hit mode.
"enum\n"
"{\n"
"	Triangle0Processed = 1,\n"
"	Triangle1Processed = 2,\n"
"};\n"
"\n"
// Embedded source: component-wise reciprocal of a float3.
// With HIPRT_RTIP >= 11 each component goes through __ocml_native_recip_f32;
// otherwise the result is the expression 1.0f / a (relies on a float / float3
// operator declared elsewhere).
"HIPRT_DEVICE HIPRT_INLINE float3 rcp( const float3 a )\n"
"{\n"
"#if HIPRT_RTIP >= 11\n"
"	return float3{ __ocml_native_recip_f32( a.x ), __ocml_native_recip_f32( a.y ), __ocml_native_recip_f32( a.z ) };\n"
"#else\n"
"	return 1.0f / a;\n"
"#endif\n"
"}\n"
"\n"
// Embedded source: fixed-capacity LIFO stack stored in a member array of
// StackSize entries.
//  - pop() / push() perform no bounds checks; callers are expected to consult
//    empty() / vacancy() themselves (testInternalNode checks vacancy()).
//  - vacancy() is the number of free slots, StackSize - m_top.
//  - The constructor and reset() only zero m_top; the buffer is left untouched.
"template <typename StackEntry, uint32_t StackSize>\n"
"class PrivateStack\n"
"{\n"
"public:\n"
"	HIPRT_DEVICE PrivateStack() : m_top( 0u ) {}\n"
"\n"
"	HIPRT_DEVICE StackEntry pop() { return m_stackBuffer[--m_top]; }\n"
"	HIPRT_DEVICE void		push( StackEntry val ) { m_stackBuffer[m_top++] = val; }\n"
"	HIPRT_DEVICE bool		empty() const { return m_top == 0u; }\n"
"	HIPRT_DEVICE uint32_t	vacancy() const { return StackSize - m_top; }\n"
"	HIPRT_DEVICE void		reset() { m_top = 0u; }\n"
"\n"
"private:\n"
"	StackEntry m_stackBuffer[StackSize];\n"
"	uint32_t   m_top;\n"
"};\n"
"\n"
// Embedded source: two-level stack built on caller-provided buffers, a "shared"
// buffer and a "global" buffer. Entries of one thread are Stride ( = WarpSize )
// elements apart in both buffers (see the `<< LogStride` indexing in pop/push).
//
// DynamicAssignment selects how the global region is chosen in the constructor:
//  - true:  the warp claims a slot through a lock word in the global buffer and
//           releases it in the destructor;
//  - false: the region is derived from the block and warp indices.
"template <typename StackEntry, bool DynamicAssignment>\n"
"class GlobalStack\n"
"{\n"
"public:\n"
"	static constexpr uint32_t Stride	= hiprt::WarpSize;\n"
"	static constexpr uint32_t LogStride = hiprt::Log2( Stride );\n"
"\n"
"	HIPRT_DEVICE\n"
"	GlobalStack( hiprtGlobalStackBuffer globalStackBuffer, hiprtSharedStackBuffer sharedStackBuffer );\n"
"\n"
"	HIPRT_DEVICE ~GlobalStack();\n"
"\n"
"	HIPRT_DEVICE StackEntry pop();\n"
"	HIPRT_DEVICE void		push( StackEntry val );\n"
"	HIPRT_DEVICE uint32_t	vacancy() const;\n"
"	HIPRT_DEVICE bool		empty() const;\n"
"	HIPRT_DEVICE void		reset();\n"
"\n"
"private:\n"
// m_globalStackLock is only assigned and used when DynamicAssignment is true.
"	uint32_t*	m_globalStackLock;\n"
// Per-thread base pointers into the two buffers.
"	StackEntry* m_globalStackBuffer;\n"
"	StackEntry* m_sharedStackBuffer;\n"
// Capacities copied from the stackSize field of the respective buffer descriptor.
"	uint32_t	m_globalStackSize;\n"
"	uint32_t	m_sharedStackSize;\n"
// m_globalIndex: number of entries in the global part (next free slot).
// m_sharedIndex: next write position in the shared part, which wraps around.
// m_sharedCount: number of live entries in the shared part.
"	int32_t		m_globalIndex;\n"
"	int32_t		m_sharedIndex;\n"
"	uint32_t	m_sharedCount;\n"
"};\n"
"\n"
// Embedded source: GlobalStack constructor. Computes this thread's base pointers
// into the shared and global stack buffers.
//
// NOTE(review): m_globalIndex, m_sharedIndex and m_sharedCount are not set here;
// only reset() assigns them. GeomTraversal and SceneTraversal call
// m_stack.reset() in their constructors - confirm every other user does too
// before calling push/pop/empty/vacancy.
"template <typename StackEntry, bool DynamicAssignment>\n"
"HIPRT_DEVICE GlobalStack<StackEntry, DynamicAssignment>::GlobalStack(\n"
"	hiprtGlobalStackBuffer globalStackBuffer, hiprtSharedStackBuffer sharedStackBuffer )\n"
"{\n"
// Linear thread index within the block (x and y dimensions only), split into a
// warp index and a lane index using Stride = WarpSize.
"	const uint32_t threadIndex = threadIdx.x + threadIdx.y * blockDim.x;\n"
"	const uint32_t warpIndex   = threadIndex >> LogStride;\n"
"	const uint32_t laneIndex   = threadIndex & ( Stride - 1 );\n"
"\n"
// Shared part: each warp owns Stride * stackSize consecutive entries and each
// lane starts at its lane index inside the warp's region.
"	const uint32_t sharedStackOffset = laneIndex + warpIndex * Stride * sharedStackBuffer.stackSize;\n"
"	m_sharedStackBuffer				 = reinterpret_cast<StackEntry*>( sharedStackBuffer.stackData ) + sharedStackOffset;\n"
"	m_sharedStackSize				 = sharedStackBuffer.stackSize;\n"
"	if constexpr ( DynamicAssignment )\n"
"	{\n"
// Dynamic assignment: the first activeWarps 32-bit words of the global buffer
// are treated as lock words, one per warp-sized slot.
// NOTE(review): activeWarps is 0 when stackCount < Stride, which would make the
// modulo operations below divide by zero - confirm callers guarantee otherwise.
"		const uint32_t warpsPerBlock	= hiprt::DivideRoundUp( blockDim.x * blockDim.y, Stride );\n"
"		const uint32_t activeWarps		= globalStackBuffer.stackCount >> LogStride;\n"
// Lowest set bit of the ballot, i.e. the first lane that reached this point.
"		const uint32_t firstThreadIndex = __ffsll( static_cast<unsigned long long>( hiprt::ballot( true ) ) ) - 1;\n"
"\n"
// Starting from a candidate derived from the global warp number, the first lane
// tries to flip a lock word from 0 to 1; the candidate advances circularly and
// the outcome is broadcast with shfl until a slot has been claimed.
"		uint32_t  warpHash			= InvalidValue;\n"
"		uint32_t  warpHashCandidate = ( warpIndex + ( blockIdx.x + blockIdx.y * gridDim.x ) * warpsPerBlock ) % activeWarps;\n"
"		uint32_t* globalStackLocks	= reinterpret_cast<uint32_t*>( globalStackBuffer.stackData );\n"
"		while ( warpHash == InvalidValue )\n"
"		{\n"
"			if ( laneIndex == firstThreadIndex )\n"
"			{\n"
"				if ( atomicCAS( &globalStackLocks[warpHashCandidate], 0, 1 ) == 0 ) warpHash = warpHashCandidate;\n"
"			}\n"
"			warpHashCandidate = ( warpHashCandidate + 1 ) % activeWarps;\n"
"			warpHash		  = shfl( warpHash, firstThreadIndex );\n"
"		}\n"
"		__threadfence();\n"
// Remember the claimed lock word so that the destructor can release it.
"		m_globalStackLock = &globalStackLocks[warpHash];\n"
"\n"
// The stack storage starts after the activeWarps lock words; the offset is
// counted in StackEntry elements.
// NOTE(review): this assumes sizeof( StackEntry ) equals the lock word size - confirm.
"		const uint32_t globalStackOffset = activeWarps + laneIndex + ( warpHash << LogStride ) * globalStackBuffer.stackSize;\n"
"		m_globalStackBuffer				 = reinterpret_cast<StackEntry*>( globalStackBuffer.stackData ) + globalStackOffset;\n"
"		m_globalStackSize				 = globalStackBuffer.stackSize;\n"
"	}\n"
"	else\n"
"	{\n"
// Static assignment: the region is determined by the linear block index and the
// warp index; no lock is taken and m_globalStackLock stays unset.
"		const uint32_t globalStackOffset =\n"
"			laneIndex + ( warpIndex * Stride + ( blockIdx.x + blockIdx.y * gridDim.x ) * ( blockDim.x * blockDim.y ) ) *\n"
"							globalStackBuffer.stackSize;\n"
"		m_globalStackBuffer = reinterpret_cast<StackEntry*>( globalStackBuffer.stackData ) + globalStackOffset;\n"
"		m_globalStackSize	= globalStackBuffer.stackSize;\n"
"	}\n"
"}\n"
"\n"
// Embedded source: GlobalStack destructor.
// With DynamicAssignment the first lane that reaches this point (lowest set bit
// of the ballot) writes 0 to the lock word claimed in the constructor, after a
// __threadfence(). Without DynamicAssignment the destructor does nothing.
"template <typename StackEntry, bool DynamicAssignment>\n"
"HIPRT_DEVICE GlobalStack<StackEntry, DynamicAssignment>::~GlobalStack()\n"
"{\n"
"	if constexpr ( DynamicAssignment )\n"
"	{\n"
"		__threadfence();\n"
"		const uint32_t threadIndex		= threadIdx.x + threadIdx.y * blockDim.x;\n"
"		const uint32_t laneIndex		= threadIndex & ( Stride - 1 );\n"
"		const uint32_t firstThreadIndex = __ffsll( static_cast<unsigned long long>( hiprt::ballot( true ) ) ) - 1;\n"
"		if ( laneIndex == firstThreadIndex ) atomicExch( m_globalStackLock, 0 );\n"
"	}\n"
"}\n"
"\n"
// Embedded source: removes and returns the most recently pushed entry.
// Entries are taken from the shared part while it holds any (the index steps
// back and wraps to the end when it goes below zero); otherwise from the global
// part. There is no underflow check: popping with both parts empty reads
// m_globalStackBuffer at a negative index.
"template <typename StackEntry, bool DynamicAssignment>\n"
"HIPRT_DEVICE HIPRT_INLINE StackEntry GlobalStack<StackEntry, DynamicAssignment>::pop()\n"
"{\n"
"	if ( m_sharedCount > 0 )\n"
"	{\n"
"		m_sharedCount--;\n"
"		if ( --m_sharedIndex < 0 ) m_sharedIndex += m_sharedStackSize;\n"
"		return m_sharedStackBuffer[m_sharedIndex << LogStride];\n"
"	}\n"
"	else\n"
"	{\n"
"		return m_globalStackBuffer[--m_globalIndex << LogStride];\n"
"	}\n"
"}\n"
"\n"
// Embedded source: pushes `val`.
//  - Shared part not full: store there and bump the count.
//  - Shared part has zero capacity: store directly in the global part.
//  - Shared part full: the entry at the current write position (the oldest one
//    in the full ring) is moved to the global part and its slot is reused.
// The write position then wraps back to 0 on reaching the shared capacity.
// No overflow check is made against m_globalStackSize; callers use vacancy().
"template <typename StackEntry, bool DynamicAssignment>\n"
"HIPRT_DEVICE HIPRT_INLINE void GlobalStack<StackEntry, DynamicAssignment>::push( StackEntry val )\n"
"{\n"
"	if ( m_sharedCount < m_sharedStackSize )\n"
"	{\n"
"		m_sharedStackBuffer[m_sharedIndex++ << LogStride] = val;\n"
"		m_sharedCount++;\n"
"	}\n"
"	else\n"
"	{\n"
"		if ( m_sharedStackSize == 0 )\n"
"		{\n"
"			m_globalStackBuffer[m_globalIndex++ << LogStride] = val;\n"
"		}\n"
"		else\n"
"		{\n"
"			m_globalStackBuffer[m_globalIndex++ << LogStride] = m_sharedStackBuffer[m_sharedIndex << LogStride];\n"
"			m_sharedStackBuffer[m_sharedIndex++ << LogStride] = val;\n"
"		}\n"
"	}\n"
"	if ( m_sharedIndex >= m_sharedStackSize ) m_sharedIndex -= m_sharedStackSize;\n"
"}\n"
"\n"
// Embedded source: number of entries that can still be pushed, i.e. the free
// slots of the global part plus the free slots of the shared part.
"template <typename StackEntry, bool DynamicAssignment>\n"
"HIPRT_DEVICE HIPRT_INLINE uint32_t GlobalStack<StackEntry, DynamicAssignment>::vacancy() const\n"
"{\n"
"	return m_globalStackSize - m_globalIndex + m_sharedStackSize - m_sharedCount;\n"
"}\n"
"\n"
// Embedded source: true when neither the shared part nor the global part holds
// an entry.
"template <typename StackEntry, bool DynamicAssignment>\n"
"HIPRT_DEVICE HIPRT_INLINE bool GlobalStack<StackEntry, DynamicAssignment>::empty() const\n"
"{\n"
"	return m_sharedCount == 0 && m_globalIndex == 0;\n"
"}\n"
"\n"
// Embedded source: empties the stack by zeroing both indices and the shared
// count. Buffer contents and the buffer pointers are left unchanged. This is
// the only place where the three members receive their initial values.
"template <typename StackEntry, bool DynamicAssignment>\n"
"HIPRT_DEVICE HIPRT_INLINE void GlobalStack<StackEntry, DynamicAssignment>::reset()\n"
"{\n"
"	m_globalIndex = 0;\n"
"	m_sharedIndex = 0;\n"
"	m_sharedCount = 0u;\n"
"}\n"
"\n"
// Embedded source: state and node-test helpers shared by GeomTraversal and
// SceneTraversal. `Stack` is the traversal stack type (held by reference) and
// `TraversalType` selects any-hit versus closest-hit behaviour in the triangle
// tests (compared against hiprtTraversalTerminateAtAnyHit).
"template <typename Stack, hiprtTraversalType TraversalType>\n"
"class TraversalBase\n"
"{\n"
"public:\n"
// Stores the ray, stack reference, payload, ray type and hint, and starts at
// RootIndex. The function table header is copied only when funcTable is
// non-null; otherwise the default member initializer is kept. With
// HIPRT_RTIP >= 11 the descriptor is packed here as well. m_boxNodes is not set
// by this constructor; the derived constructors assign it.
"	HIPRT_DEVICE TraversalBase(\n"
"		const hiprtRay& ray, Stack& stack, hiprtTraversalHint hint, void* payload, hiprtFuncTable funcTable, uint32_t rayType )\n"
"		: m_ray( ray ), m_stack( stack ), m_payload( payload ), m_rayType( rayType ), m_nodeIndex( RootIndex ), m_hint( hint )\n"
"	{\n"
"		if ( funcTable != nullptr ) m_tableHeader = *reinterpret_cast<hiprtFuncTableHeader*>( funcTable );\n"
"#if HIPRT_RTIP >= 11\n"
"		packDescriptor( static_cast<uint32_t>( hint ), Rtip >= 31 );\n"
"#endif\n"
"	}\n"
"\n"
"#if HIPRT_RTIP >= 11\n"
// Fills m_descriptor, which is passed to the intersection builtins.
"	HIPRT_DEVICE void packDescriptor( uint32_t boxSortHeuristic = 0u, bool compressed = false );\n"
"#endif\n"
"\n"
// Returns the current traversal state (init / hit / finished / stack overflow).
"	HIPRT_DEVICE hiprtTraversalState getCurrentState() { return m_state; }\n"
"\n"
// Tests the children of an internal node; see the definition for the contract.
"	HIPRT_DEVICE bool testInternalNode( const hiprtRay& ray, const float3& invD, BoxNode* nodes, uint32_t& nodeIndex );\n"
"\n"
// Tests the triangles referenced by a triangle leaf, applying the filter function.
"	HIPRT_DEVICE bool testTriangleNode(\n"
"		const hiprtRay& ray,\n"
"		const float3&	invD,\n"
"		TriangleNode*	nodes,\n"
"		uint32_t		geomType,\n"
"		uint32_t&		leafIndex,\n"
"		uint32_t&		triangleMask,\n"
"		hiprtHit&		hit );\n"
"\n"
// Tests one triangle of a TrianglePairNode.
"	HIPRT_DEVICE bool testTriangle(\n"
"		const hiprtRay& ray, const float3& invD, TrianglePairNode* nodes, uint32_t leafAddr, uint32_t leafType, hiprtHit& hit );\n"
"\n"
// Tests both triangles of one pair in a TrianglePacketNode; returns a 2-bit hit mask.
"	HIPRT_DEVICE uint32_t testTrianglePair(\n"
"		const hiprtRay&		ray,\n"
"		TrianglePacketNode* nodes,\n"
"		uint32_t			leafAddr,\n"
"		uint32_t			triPairIndex,\n"
"		hiprtHit&			hit0,\n"
"		hiprtHit&			hit1,\n"
"		bool&				nodeEnd,\n"
"		bool&				rangeEnd );\n"
"\n"
"protected:\n"
// Note: the constructor's initializer list names the members in a different
// order than they are declared here; initialization follows declaration order.
"	hiprtRay			 m_ray;\n"
"	hiprtFuncTableHeader m_tableHeader = { 0, 1, nullptr };\n"
"	uint4				 m_descriptor;\n"
"	Stack&				 m_stack;\n"
"	BoxNode*			 m_boxNodes;\n"
"	void*				 m_payload;\n"
"	uint32_t			 m_nodeIndex;\n"
"	uint32_t			 m_rayType;\n"
"	uint32_t			 m_triangleMask = 0;\n"
"	hiprtTraversalState	 m_state		= hiprtTraversalStateInit;\n"
"	hiprtTraversalHint	 m_hint			= hiprtTraversalHintDefault;\n"
"};\n"
"\n"
// Embedded source (HIPRT_RTIP >= 11 only): packs the four 32-bit words of
// m_descriptor, which the intersection builtins receive as their descriptor
// argument.
//
// Layout produced by the shifts below:
//   x : low 32 bits of baseAddress (always 0 here)
//   y : high bits of baseAddress | boxSortEnable << 31 | boxGrowUlp << 23 |
//       boxSortHeuristic << 21 | sortTrianglesFirst << 20
//   z : low 32 bits of size
//   w : high bits of size | type << 28 | triangleReturnMode << 24 |
//       instanceEnable << 22 | wideSortEnable << 21 | compressFormatEnable << 19
//
// boxSortHeuristic is masked to two bits. `compressed` switches on the four
// flags that default to 0. The meaning of the individual fields is defined by
// the hardware and is not derivable from this file.
"#if HIPRT_RTIP >= 11\n"
"template <typename Stack, hiprtTraversalType TraversalType>\n"
"HIPRT_DEVICE void TraversalBase<Stack, TraversalType>::packDescriptor( uint32_t boxSortHeuristic, bool compressed )\n"
"{\n"
"	boxSortHeuristic &= 0x3;\n"
"	uint64_t baseAddress		  = 0x0;\n"
"	uint64_t size				  = 0x3ffffffffffull;\n"
"	uint32_t boxGrowUlp			  = 0x6;\n"
"	uint32_t boxSortEnable		  = 0x1;\n"
"	uint32_t triangleReturnMode	  = 0x1;\n"
"	uint32_t type				  = 0x8;\n"
"	uint32_t compressFormatEnable = 0x0;\n"
"	uint32_t instanceEnable		  = 0x0;\n"
"	uint32_t sortTrianglesFirst	  = 0x0;\n"
"	uint32_t wideSortEnable		  = 0x0;\n"
"	if ( compressed )\n"
"	{\n"
"		compressFormatEnable = 0x1;\n"
"		instanceEnable		 = 0x1;\n"
"		sortTrianglesFirst	 = 0x1;\n"
"		wideSortEnable		 = 0x1;\n"
"	}\n"
"	m_descriptor.x = baseAddress & 0xffffffff;\n"
"	m_descriptor.y = ( baseAddress >> 32ull ) | ( boxSortEnable << 31u ) | ( boxGrowUlp << 23u ) | ( boxSortHeuristic << 21u ) |\n"
"					 ( sortTrianglesFirst << 20u );\n"
"	m_descriptor.z = size & 0xffffffff;\n"
"	m_descriptor.w = ( size >> 32ull ) | ( triangleReturnMode << 24u ) | ( type << 28u ) | ( instanceEnable << 22u ) |\n"
"					 ( wideSortEnable << 21u ) | ( compressFormatEnable << 19u );\n"
"}\n"
"#endif\n"
"\n"
// Embedded source: intersects the ray with the children of internal node
// `nodeIndex` and schedules the children that were hit.
//
// Returns true when traversal should continue with `nodeIndex` as it stands on
// return: either it now holds result[0], or the stack lacked room and m_state
// was set to hiprtTraversalStateStackOverflow (nodeIndex is then unchanged, so
// callers must check m_state). Returns false when result[0] is InvalidValue; the
// caller then pops the next node itself.
//
// `result` holds child indices with InvalidValue marking a miss. Entries
// BranchingFactor-1 .. 1 are pushed in that order, so result[1] ends up on top
// of the stack.
"template <typename Stack, hiprtTraversalType TraversalType>\n"
"HIPRT_DEVICE bool TraversalBase<Stack, TraversalType>::testInternalNode(\n"
"	const hiprtRay& ray, [[maybe_unused]] const float3& invD, BoxNode* nodes, uint32_t& nodeIndex )\n"
"{\n"
// Software path: intersect the four child boxes; a child counts as hit when
// the returned interval satisfies x <= y.
"#if HIPRT_RTIP < 11\n"
"	BoxNode node = nodes[getNodeAddr( nodeIndex )];\n"
"	float2	s0	 = node.m_box0.intersect( ray.origin, invD, ray.maxT );\n"
"	float2	s1	 = node.m_box1.intersect( ray.origin, invD, ray.maxT );\n"
"	float2	s2	 = node.m_box2.intersect( ray.origin, invD, ray.maxT );\n"
"	float2	s3	 = node.m_box3.intersect( ray.origin, invD, ray.maxT );\n"
"\n"
"	uint32_t result[4];\n"
"	result[0] = s0.x <= s0.y ? node.m_childIndex0 : InvalidValue;\n"
"	result[1] = s1.x <= s1.y ? node.m_childIndex1 : InvalidValue;\n"
"	result[2] = s2.x <= s2.y ? node.m_childIndex2 : InvalidValue;\n"
"	result[3] = s3.x <= s3.y ? node.m_childIndex3 : InvalidValue;\n"
"\n"
// SORT is a compare-and-swap: B moves in front of A when B is valid and closer,
// or when A is invalid. The five applications below form a 4-input sorting
// network that orders children by entry distance with misses last.
"#define SORT( childIndexA, childIndexB, distA, distB )                                     \\\n"
"	if ( ( childIndexB != InvalidValue && distB < distA ) || childIndexA == InvalidValue ) \\\n"
"	{                                                                                      \\\n"
"		float	 t0 = distA;                                                               \\\n"
"		uint32_t t1 = childIndexA;                                                         \\\n"
"		childIndexA = childIndexB;                                                         \\\n"
"		distA		= distB;                                                               \\\n"
"		childIndexB = t1;                                                                  \\\n"
"		distB		= t0;                                                                  \\\n"
"	}\n"
"\n"
"	SORT( result[0], result[2], s0.x, s2.x )\n"
"	SORT( result[1], result[3], s1.x, s3.x )\n"
"	SORT( result[0], result[1], s0.x, s1.x )\n"
"	SORT( result[2], result[3], s2.x, s3.x )\n"
"	SORT( result[1], result[2], s1.x, s2.x )\n"
"#undef SORT\n"
// HIPRT_RTIP >= 31: the bvh8 builtin performs the box tests; the two output
// vectors it writes are not used here.
"#elif HIPRT_RTIP >= 31\n"
"	hip_float3 dummy0, dummy1;\n"
"	auto	   result = __builtin_amdgcn_image_bvh8_intersect_ray(\n"
"		  encodeBaseAddr( nodes ),\n"
"		  ray.maxT,\n"
"		  0xff,\n"
"		  { ray.origin.x, ray.origin.y, ray.origin.z },\n"
"		  { ray.direction.x, ray.direction.y, ray.direction.z },\n"
"		  nodeIndex,\n"
"		  { m_descriptor.x, m_descriptor.y, m_descriptor.z, m_descriptor.w },\n"
"		  &dummy0,\n"
"		  &dummy1 );\n"
// 11 <= HIPRT_RTIP < 31: the bvh builtin performs the box tests.
"#else\n"
"	auto result = __builtin_amdgcn_image_bvh_intersect_ray_l(\n"
"		encodeBaseAddr( nodes, nodeIndex ),\n"
"		ray.maxT,\n"
"		{ ray.origin.x, ray.origin.y, ray.origin.z, 0.0f },\n"
"		{ ray.direction.x, ray.direction.y, ray.direction.z, 0.0f },\n"
"		{ invD.x, invD.y, invD.z, 0.0f },\n"
"		{ m_descriptor.x, m_descriptor.y, m_descriptor.z, m_descriptor.w } );\n"
"#endif\n"
"\n"
// Up to BranchingFactor - 1 children may be pushed below; bail out if the stack
// cannot take that many.
"	if ( m_stack.vacancy() < BranchingFactor - 1 )\n"
"	{\n"
"		m_state = hiprtTraversalStateStackOverflow;\n"
"		return true;\n"
"	}\n"
"\n"
"#pragma unroll\n"
"	for ( uint32_t i = BranchingFactor - 1; i >= 1; --i )\n"
"		if ( result[i] != InvalidValue ) m_stack.push( result[i] );\n"
"\n"
"	if ( result[0] != InvalidValue )\n"
"	{\n"
"		nodeIndex = result[0];\n"
"		return true;\n"
"	}\n"
"\n"
"	return false;\n"
"}\n"
"\n"
// Embedded source: tests the triangles referenced by leaf `leafIndex` and
// returns whether an accepted hit was written to `hit`.
//
// The filter function runs only when geomType is not InvalidValue and the
// function table has data sets; a hit for which filterFunc returns true is
// discarded. filterFunc receives geomType >> 1.
// NOTE(review): the meaning of the dropped low bit of geomType is not visible
// here - confirm.
//
// Two leaf layouts are handled, selected at compile time by what TriangleNode is:
//  - TrianglePacketNode: a leaf spans a range of triangle pairs, possibly over
//    several consecutive nodes; testTrianglePair reports node and range ends.
//  - otherwise (TrianglePairNode): a leaf is a single pair of triangles.
//
// In any-hit mode `triangleMask` carries progress between calls
// (Triangle0Processed / Triangle1Processed) and is set to InvalidValue once the
// leaf is exhausted; in the packet layout `leafIndex` is also updated so that
// the next call resumes at the right pair. In closest-hit mode all triangles of
// the leaf are tested and the nearest accepted hit is kept.
"template <typename Stack, hiprtTraversalType TraversalType>\n"
"HIPRT_DEVICE bool TraversalBase<Stack, TraversalType>::testTriangleNode(\n"
"	const hiprtRay&				   ray,\n"
"	[[maybe_unused]] const float3& invD,\n"
"	TriangleNode*				   nodes,\n"
"	uint32_t					   geomType,\n"
"	uint32_t&					   leafIndex,\n"
"	[[maybe_unused]] uint32_t&	   triangleMask,\n"
"	hiprtHit&					   hit )\n"
"{\n"
"	bool	 hasHit	  = false;\n"
"	uint32_t leafAddr = getNodeAddr( leafIndex );\n"
"\n"
"	const bool useFilter = geomType != InvalidValue && m_tableHeader.funcDataSets != nullptr;\n"
"\n"
"	if constexpr ( is_same<TriangleNode, TrianglePacketNode>::value ) // RTIP 3.1\n"
"	{\n"
"		TrianglePacketNode* packetNodes	 = reinterpret_cast<TrianglePacketNode*>( nodes );\n"
"		hiprtHit			secondHit	 = hit;\n"
"		uint32_t			triPairIndex = typeToTriPairIndex( getNodeType( leafIndex ) );\n"
"		if constexpr ( TraversalType == hiprtTraversalTerminateAtAnyHit )\n"
"		{\n"
// Any-hit, packet layout: walk the pairs until a hit is accepted or the range
// ends. The first triangle's result goes straight into `hit`.
"			while ( true )\n"
"			{\n"
"				bool	 nodeEnd  = false;\n"
"				bool	 rangeEnd = false;\n"
"				uint32_t hitMask =\n"
"					this->testTrianglePair( ray, packetNodes, leafAddr, triPairIndex, hit, secondHit, nodeEnd, rangeEnd );\n"
"\n"
"				bool firstHasHit  = hitMask & 1;\n"
"				bool secondHasHit = hitMask & 2;\n"
"				if ( useFilter )\n"
"				{\n"
"					if ( firstHasHit && filterFunc( geomType >> 1, m_rayType, m_tableHeader, ray, m_payload, hit ) )\n"
"						firstHasHit = false;\n"
"					if ( secondHasHit && filterFunc( geomType >> 1, m_rayType, m_tableHeader, ray, m_payload, secondHit ) )\n"
"						secondHasHit = false;\n"
"				}\n"
"\n"
// The first triangle is reported only if it was not already reported by an
// earlier call for this pair.
"				if ( ( triangleMask & Triangle0Processed ) == 0 )\n"
"				{\n"
"					hasHit = firstHasHit;\n"
"					triangleMask |= Triangle0Processed;\n"
"				}\n"
"\n"
// No hit from the first triangle: report the second one, if it was hit.
"				if ( !hasHit )\n"
"				{\n"
"					if ( secondHasHit )\n"
"					{\n"
"						hit.t	   = secondHit.t;\n"
"						hit.normal = secondHit.normal;\n"
"						hit.primID = secondHit.primID;\n"
"						hit.uv	   = secondHit.uv;\n"
"						hasHit	   = true;\n"
"					}\n"
"					triangleMask |= Triangle1Processed;\n"
"				}\n"
"\n"
// Advance to the next pair unless the second triangle still has an unreported
// hit pending for a later call.
"				if ( !secondHasHit || ( triangleMask & Triangle1Processed ) != 0 ) // !( secondHasHit && !secondProcessed )\n"
"				{\n"
"					triPairIndex++;\n"
"					triangleMask = 0;\n"
"					if ( nodeEnd )\n"
"					{\n"
"						triPairIndex = 0;\n"
"						leafAddr++;\n"
"					}\n"
"\n"
"					if ( rangeEnd )\n"
"					{\n"
"						triangleMask = InvalidValue; // indicate range end by \'invalid value\'\n"
"						break;\n"
"					}\n"
"				}\n"
"\n"
// Store the resume position in leafIndex before returning the hit.
"				if ( hasHit )\n"
"				{\n"
"					leafIndex = encodeNodeIndex( leafAddr, triPairIndexToType( triPairIndex ) );\n"
"					break;\n"
"				}\n"
"			}\n"
"		}\n"
"		else\n"
"		{\n"
// Closest-hit, packet layout: visit every pair of the range and keep the
// nearest accepted hit.
"			hit.t			  = ray.maxT;\n"
"			hiprtHit firstHit = hit;\n"
"			while ( true )\n"
"			{\n"
"				bool	 nodeEnd  = false;\n"
"				bool	 rangeEnd = false;\n"
"				uint32_t hitMask =\n"
"					this->testTrianglePair( ray, packetNodes, leafAddr, triPairIndex, firstHit, secondHit, nodeEnd, rangeEnd );\n"
"\n"
"				bool firstHasHit  = hitMask & 1;\n"
"				bool secondHasHit = hitMask & 2;\n"
"				if ( useFilter )\n"
"				{\n"
"					if ( firstHasHit && filterFunc( geomType >> 1, m_rayType, m_tableHeader, ray, m_payload, firstHit ) )\n"
"						firstHasHit = false;\n"
"					if ( secondHasHit && filterFunc( geomType >> 1, m_rayType, m_tableHeader, ray, m_payload, secondHit ) )\n"
"						secondHasHit = false;\n"
"				}\n"
"\n"
"				if ( firstHasHit && ( !hasHit || hit.t > firstHit.t ) )\n"
"				{\n"
"					hit.t	   = firstHit.t;\n"
"					hit.normal = firstHit.normal;\n"
"					hit.primID = firstHit.primID;\n"
"					hit.uv	   = firstHit.uv;\n"
"					hasHit	   = true;\n"
"				}\n"
"\n"
"				if ( secondHasHit && ( !hasHit || hit.t > secondHit.t ) )\n"
"				{\n"
"					hit.t	   = secondHit.t;\n"
"					hit.normal = secondHit.normal;\n"
"					hit.primID = secondHit.primID;\n"
"					hit.uv	   = secondHit.uv;\n"
"					hasHit	   = true;\n"
"				}\n"
"\n"
"				if ( rangeEnd ) break;\n"
"\n"
"				triPairIndex++;\n"
"				if ( nodeEnd )\n"
"				{\n"
"					triPairIndex = 0;\n"
"					leafAddr++;\n"
"				}\n"
"			}\n"
"		}\n"
"	}\n"
"	else\n"
"	{\n"
"		TrianglePairNode* pairNodes = reinterpret_cast<TrianglePairNode*>( nodes );\n"
"\n"
"		if constexpr ( TraversalType == hiprtTraversalTerminateAtAnyHit )\n"
"		{\n"
// Any-hit, pair layout: test triangle 0 unless a previous call already did,
// then triangle 1 if there is still no hit. Once triangle 1 has been tested the
// mask becomes InvalidValue to signal that the leaf is exhausted.
"			if ( ( triangleMask & Triangle0Processed ) == 0 )\n"
"			{\n"
"				hasHit = this->testTriangle( ray, invD, pairNodes, leafAddr, TriangleType, hit );\n"
"				if ( useFilter && hasHit && filterFunc( geomType >> 1, m_rayType, m_tableHeader, ray, m_payload, hit ) )\n"
"					hasHit = false;\n"
"				triangleMask |= Triangle0Processed;\n"
"			}\n"
"\n"
"			if ( !hasHit )\n"
"			{\n"
"				hasHit = this->testTriangle( ray, invD, pairNodes, leafAddr, TriangleType + 1, hit );\n"
"				if ( useFilter && hasHit && filterFunc( geomType >> 1, m_rayType, m_tableHeader, ray, m_payload, hit ) )\n"
"					hasHit = false;\n"
"				triangleMask |= Triangle1Processed;\n"
"			}\n"
"\n"
"			if ( triangleMask & Triangle1Processed ) triangleMask = InvalidValue;\n"
"		}\n"
"		else\n"
"		{\n"
// Closest-hit, pair layout: test both triangles and keep the nearer accepted hit.
"			hasHit = this->testTriangle( ray, invD, pairNodes, leafAddr, TriangleType, hit );\n"
"			if ( useFilter && hasHit && filterFunc( geomType >> 1, m_rayType, m_tableHeader, ray, m_payload, hit ) )\n"
"				hasHit = false;\n"
"\n"
"			hiprtHit secondHit	  = hit;\n"
"			bool	 secondHasHit = this->testTriangle( ray, invD, pairNodes, leafAddr, TriangleType + 1, secondHit );\n"
"			if ( useFilter && secondHasHit && filterFunc( geomType >> 1, m_rayType, m_tableHeader, ray, m_payload, secondHit ) )\n"
"				secondHasHit = false;\n"
"\n"
"			if ( secondHasHit && ( !hasHit || hit.t > secondHit.t ) )\n"
"			{\n"
"				hit.t	   = secondHit.t;\n"
"				hit.normal = secondHit.normal;\n"
"				hit.primID = secondHit.primID;\n"
"				hit.uv	   = secondHit.uv;\n"
"				hasHit	   = true;\n"
"			}\n"
"		}\n"
"	}\n"
"	return hasHit;\n"
"}\n"
"\n"
// Embedded source: intersects the ray with one triangle of the pair node at
// `leafAddr`; the low bit of `leafType` selects triangle 0 or 1. On a hit
// `hit.t`, `hit.uv`, `hit.primID` and `hit.normal` are filled in and true is
// returned; on a miss primID and normal are left untouched.
//
// The second triangle is skipped (returns false) when both primitive indices of
// the node are equal.
// NOTE(review): presumably that is how a node holding a single triangle is
// encoded - confirm.
"template <typename Stack, hiprtTraversalType TraversalType>\n"
"HIPRT_DEVICE bool TraversalBase<Stack, TraversalType>::testTriangle(\n"
"	const hiprtRay&				   ray,\n"
"	[[maybe_unused]] const float3& invD,\n"
"	TrianglePairNode*			   nodes,\n"
"	uint32_t					   leafAddr,\n"
"	uint32_t					   leafType,\n"
"	hiprtHit&					   hit )\n"
"{\n"
"	const TrianglePairNode& node = nodes[leafAddr];\n"
"	if ( node.getPrimIndex( 0 ) == node.getPrimIndex( 1 ) && leafType == TriangleType + 1 ) return false;\n"
"	bool hasHit = false;\n"
// Software path: the triangle's own intersect() writes uv and t directly; the
// node flags are shifted by 8 bits for the second triangle.
"#if HIPRT_RTIP < 11\n"
"	hasHit =\n"
"		node.m_triPair.fetchTriangle( leafType & 1 ).intersect( ray, hit.uv, hit.t, node.m_flags >> ( ( leafType & 1 ) * 8 ) );\n"
"	if ( hasHit )\n"
"	{\n"
"		hit.primID = leafType & 1 ? node.getPrimIndex( 1 ) : node.getPrimIndex( 0 );\n"
"		hit.normal = node.getNormal( leafType & 1 );\n"
"	}\n"
// Builtin path: result[0] and result[1] are used as numerator and denominator
// of t, result[2] and result[3] as numerators of the two uv components; all are
// float bit patterns. A hit requires minT <= t <= maxT.
"#else\n"
"	auto result = __builtin_amdgcn_image_bvh_intersect_ray_l(\n"
"		encodeBaseAddr( nodes, encodeNodeIndex( leafAddr, leafType ) ),\n"
"		ray.maxT,\n"
"		{ ray.origin.x, ray.origin.y, ray.origin.z, 0.0f },\n"
"		{ ray.direction.x, ray.direction.y, ray.direction.z, 0.0f },\n"
"		{ invD.x, invD.y, invD.z, 0.0f },\n"
"		{ m_descriptor.x, m_descriptor.y, m_descriptor.z, m_descriptor.w } );\n"
"	float invDenom = __ocml_native_recip_f32( __int_as_float( result[1] ) );\n"
"	float t		   = __int_as_float( result[0] ) * invDenom;\n"
"	hasHit		   = ray.minT <= t && t <= ray.maxT;\n"
"	if ( hasHit )\n"
"	{\n"
"		hit.t	   = t;\n"
"		hit.uv.x   = __int_as_float( result[2] ) * invDenom;\n"
"		hit.uv.y   = __int_as_float( result[3] ) * invDenom;\n"
"		hit.primID = leafType & 1 ? node.getPrimIndex( 1 ) : node.getPrimIndex( 0 );\n"
"		hit.normal = node.getNormal( leafType & 1 );\n"
"	}\n"
"#endif\n"
"	return hasHit;\n"
"}\n"
"\n"
// Embedded source: intersects the ray with both triangles of pair
// `triPairIndex` in packet node `leafAddr` using the bvh8 builtin.
//
// Returns a bit mask: bit 0 set when the first triangle was hit (hit0 filled),
// bit 1 set when the second was hit (hit1 filled). A hit requires
// minT <= t <= maxT. `nodeEnd` and `rangeEnd` are derived from the low two bits
// of result[8] (value 1 and value 3 respectively).
//
// Result words used: [0..3] = t, u, v, primID << 1 for the first triangle and
// [4..7] likewise for the second (t, u, v are float bit patterns).
//
// NOTE(review): the whole body is inside `#if HIPRT_RTIP >= 31`; for lower
// values the function has no return statement, so it must never be called
// there - confirm the only call sites are the TrianglePacketNode branches.
"template <typename Stack, hiprtTraversalType TraversalType>\n"
"HIPRT_DEVICE uint32_t TraversalBase<Stack, TraversalType>::testTrianglePair(\n"
"	const hiprtRay&		ray,\n"
"	TrianglePacketNode* nodes,\n"
"	uint32_t			leafAddr,\n"
"	uint32_t			triPairIndex,\n"
"	hiprtHit&			hit0,\n"
"	hiprtHit&			hit1,\n"
"	bool&				nodeEnd,\n"
"	bool&				rangeEnd )\n"
"{\n"
"#if HIPRT_RTIP >= 31\n"
"	const TrianglePacketNode& node = nodes[leafAddr];\n"
"\n"
// The two output vectors written by the builtin are not used.
"	hip_float3 dummy0, dummy1;\n"
"	auto	   result = __builtin_amdgcn_image_bvh8_intersect_ray(\n"
"		  encodeBaseAddr( nodes ),\n"
"		  ray.maxT,\n"
"		  0xff,\n"
"		  { ray.origin.x, ray.origin.y, ray.origin.z },\n"
"		  { ray.direction.x, ray.direction.y, ray.direction.z },\n"
"		  encodeNodeIndex( leafAddr, triPairIndexToType( triPairIndex ) ),\n"
"		  { m_descriptor.x, m_descriptor.y, m_descriptor.z, m_descriptor.w },\n"
"		  &dummy0,\n"
"		  &dummy1 );\n"
"\n"
"	uint32_t hitMask = 0;\n"
// First triangle of the pair.
"	{\n"
"		float t		 = __int_as_float( result[0] );\n"
"		bool  hasHit = ray.minT <= t && t <= ray.maxT;\n"
"		hitMask |= hasHit ? 1 : 0;\n"
"		if ( hasHit )\n"
"		{\n"
"			hit0.t		= t;\n"
"			hit0.uv.x	= __int_as_float( result[1] );\n"
"			hit0.uv.y	= __int_as_float( result[2] );\n"
"			hit0.primID = result[3] >> 1;\n"
"			hit0.normal = node.getNormal( triPairIndex, 0 );\n"
"		}\n"
"	}\n"
"\n"
// Second triangle of the pair.
"	{\n"
"		float t		 = __int_as_float( result[4] );\n"
"		bool  hasHit = ray.minT <= t && t <= ray.maxT;\n"
"		hitMask |= hasHit ? 2 : 0;\n"
"		if ( hasHit )\n"
"		{\n"
"			hit1.t		= t;\n"
"			hit1.uv.x	= __int_as_float( result[5] );\n"
"			hit1.uv.y	= __int_as_float( result[6] );\n"
"			hit1.primID = result[7] >> 1;\n"
"			hit1.normal = node.getNormal( triPairIndex, 1 );\n"
"		}\n"
"	}\n"
"\n"
"	nodeEnd	 = ( result[8] & 3 ) == 1;\n"
"	rangeEnd = ( result[8] & 3 ) == 3;\n"
"\n"
"	return hitMask;\n"
"#endif\n"
"}\n"
"\n"
// Embedded source: traversal of a single geometry's BVH.
// `PrimitiveNode` is the leaf node type: when it is TriangleNode the built-in
// triangle tests are used, otherwise leaves are handed to intersectFunc.
// getNextHit() drives the traversal; the state can be queried through
// getCurrentState() inherited from TraversalBase.
"template <typename Stack, typename PrimitiveNode, hiprtTraversalType TraversalType>\n"
"class GeomTraversal : public TraversalBase<Stack, TraversalType>\n"
"{\n"
"public:\n"
"	HIPRT_DEVICE\n"
"	GeomTraversal(\n"
"		hiprtGeometry	   geom,\n"
"		const hiprtRay&	   ray,\n"
"		Stack&			   stack,\n"
"		hiprtTraversalHint hint		 = hiprtTraversalHintDefault,\n"
"		void*			   payload	 = nullptr,\n"
"		hiprtFuncTable	   funcTable = nullptr,\n"
"		uint32_t		   rayType	 = 0u );\n"
"\n"
"	HIPRT_DEVICE bool\n"
"	testLeafNode( const hiprtRay& ray, const float3& invD, uint32_t& leafIndex, uint32_t& triangleMask, hiprtHit& hit );\n"
"\n"
"	HIPRT_DEVICE hiprtHit getNextHit();\n"
"\n"
"protected:\n"
// Bring the dependent base-class members into scope so they can be named
// without `this->`.
"	using TraversalBase<Stack, TraversalType>::m_ray;\n"
"	using TraversalBase<Stack, TraversalType>::m_tableHeader;\n"
"	using TraversalBase<Stack, TraversalType>::m_state;\n"
"	using TraversalBase<Stack, TraversalType>::m_stack;\n"
"	using TraversalBase<Stack, TraversalType>::m_boxNodes;\n"
"	using TraversalBase<Stack, TraversalType>::m_payload;\n"
"	using TraversalBase<Stack, TraversalType>::m_nodeIndex;\n"
"	using TraversalBase<Stack, TraversalType>::m_triangleMask;\n"
"	using TraversalBase<Stack, TraversalType>::m_rayType;\n"
"#if HIPRT_RTIP >= 11\n"
"	using TraversalBase<Stack, TraversalType>::m_descriptor;\n"
"#endif\n"
"\n"
// Leaf nodes and geometry type read from the geometry header; m_leafIndex is
// the leaf currently being processed, or InvalidValue when there is none.
"	PrimitiveNode* m_primNodes;\n"
"	uint32_t	   m_geomType;\n"
"	uint32_t	   m_leafIndex;\n"
"};\n"
"\n"
// Embedded source: GeomTraversal constructor.
// Forwards the common arguments to TraversalBase, marks "no current leaf",
// reads the box nodes, primitive nodes and geometry type from the GeomHeader
// that `geom` points to, and resets the stack.
"template <typename Stack, typename PrimitiveNode, hiprtTraversalType TraversalType>\n"
"HIPRT_DEVICE GeomTraversal<Stack, PrimitiveNode, TraversalType>::GeomTraversal(\n"
"	hiprtGeometry	   geom,\n"
"	const hiprtRay&	   ray,\n"
"	Stack&			   stack,\n"
"	hiprtTraversalHint hint,\n"
"	void*			   payload,\n"
"	hiprtFuncTable	   funcTable,\n"
"	uint32_t		   rayType )\n"
"	: TraversalBase<Stack, TraversalType>( ray, stack, hint, payload, funcTable, rayType ), m_leafIndex( InvalidValue )\n"
"{\n"
"	GeomHeader* geomHeader = reinterpret_cast<GeomHeader*>( geom );\n"
"	m_boxNodes			   = geomHeader->m_boxNodes;\n"
"	m_primNodes			   = reinterpret_cast<PrimitiveNode*>( geomHeader->m_primNodes );\n"
"	m_geomType			   = geomHeader->m_geomType;\n"
"	m_stack.reset();\n"
"}\n"
"\n"
// Embedded source: tests one leaf and returns whether `hit` holds an accepted hit.
//  - Triangle leaves are delegated to TraversalBase::testTriangleNode.
//  - Other leaves: hit.primID is set from the node, intersectFunc decides
//    whether there is a hit, and filterFunc (when enabled) may reject it. On a
//    miss hit.primID is set back to InvalidValue.
// Both user functions receive m_geomType >> 1.
"template <typename Stack, typename PrimitiveNode, hiprtTraversalType TraversalType>\n"
"HIPRT_DEVICE bool GeomTraversal<Stack, PrimitiveNode, TraversalType>::testLeafNode(\n"
"	const hiprtRay&				   ray,\n"
"	[[maybe_unused]] const float3& invD,\n"
"	uint32_t&					   leafIndex,\n"
"	[[maybe_unused]] uint32_t&	   triangleMask,\n"
"	hiprtHit&					   hit )\n"
"{\n"
"	bool hasHit = false;\n"
"	if constexpr ( is_same<PrimitiveNode, TriangleNode>::value )\n"
"	{\n"
"		hasHit = this->testTriangleNode( ray, invD, m_primNodes, m_geomType, leafIndex, triangleMask, hit );\n"
"	}\n"
"	else\n"
"	{\n"
"		const bool useFilter = m_geomType != InvalidValue && m_tableHeader.funcDataSets != nullptr;\n"
"		hit.primID			 = m_primNodes[getNodeAddr( leafIndex )].m_primIndex;\n"
"		hasHit				 = intersectFunc( m_geomType >> 1, m_rayType, m_tableHeader, ray, m_payload, hit );\n"
"		if ( useFilter && hasHit && filterFunc( m_geomType >> 1, m_rayType, m_tableHeader, ray, m_payload, hit ) )\n"
"			hasHit = false;\n"
"		if ( !hasHit ) hit.primID = InvalidValue;\n"
"	}\n"
"	return hasHit;\n"
"}\n"
"\n"
// Embedded source: advances the traversal and returns a hit.
//
//  - Any-hit mode (hiprtTraversalTerminateAtAnyHit): returns at the first
//    accepted hit with m_state = hiprtTraversalStateHit; the position is kept in
//    the members so that the next call resumes where this one stopped.
//  - Otherwise: traverses everything, shrinking ray.maxT to each accepted hit,
//    and returns the last accepted (nearest) hit.
//  - When the traversal runs to completion m_state becomes
//    hiprtTraversalStateFinished and `result` is returned (default-constructed
//    if nothing was hit).
//  - On stack overflow a default-constructed hiprtHit is returned and m_state
//    stays hiprtTraversalStateStackOverflow.
"template <typename Stack, typename PrimitiveNode, hiprtTraversalType TraversalType>\n"
"HIPRT_DEVICE hiprtHit GeomTraversal<Stack, PrimitiveNode, TraversalType>::getNextHit()\n"
"{\n"
// Work on a local copy of the ray; the inverse direction is only computed for
// Rtip < 31 and is otherwise left uninitialised.
"	hiprtRay ray = m_ray;\n"
"	float3	 invD;\n"
"	if constexpr ( Rtip < 31 ) invD = rcp( m_ray.direction );\n"
"\n"
// Resuming in any-hit mode: if the pending node is a leaf and no leaf is in
// progress, make it the current leaf and fetch the next node from the stack.
"	if constexpr ( TraversalType == hiprtTraversalTerminateAtAnyHit )\n"
"	{\n"
"		if ( m_leafIndex == InvalidValue && isLeafNode( m_nodeIndex ) )\n"
"		{\n"
"			m_leafIndex = m_nodeIndex;\n"
"			m_nodeIndex = m_stack.pop();\n"
"		}\n"
"	}\n"
"	hiprtHit result;\n"
"\n"
// InvalidValue at the bottom of the stack acts as the termination sentinel.
"	if ( m_stack.empty() ) m_stack.push( InvalidValue );\n"
"\n"
"	while ( m_nodeIndex != InvalidValue || m_leafIndex != InvalidValue )\n"
"	{\n"
// Internal-node phase: descend until a leaf has been found.
"		while ( isInternalNode( m_nodeIndex ) )\n"
"		{\n"
"			if ( !this->testInternalNode( ray, invD, m_boxNodes, m_nodeIndex ) ) m_nodeIndex = m_stack.pop();\n"
"\n"
"			if ( m_state == hiprtTraversalStateStackOverflow ) return hiprtHit();\n"
"\n"
"			if ( isLeafNode( m_nodeIndex ) && m_leafIndex == InvalidValue )\n"
"			{\n"
"				m_leafIndex = m_nodeIndex;\n"
"				m_nodeIndex = m_stack.pop();\n"
"			}\n"
"\n"
// Leave the loop once hiprt::any reports that no one is still without a leaf.
// NOTE(review): hiprt::any is presumably a warp-wide vote - confirm.
"			if ( !hiprt::any( m_leafIndex == InvalidValue ) ) break;\n"
"		}\n"
"\n"
// Leaf phase: process the current leaf and any leaves that directly follow it.
"		while ( m_leafIndex != InvalidValue )\n"
"		{\n"
"			hiprtHit hit;\n"
"			if ( testLeafNode( ray, invD, m_leafIndex, m_triangleMask, hit ) )\n"
"			{\n"
"				if constexpr ( TraversalType == hiprtTraversalTerminateAtAnyHit )\n"
"				{\n"
// The leaf is finished when it is a custom primitive or when the triangle test
// signalled exhaustion through InvalidValue; otherwise it is kept for resumption.
"					if ( getNodeType( m_leafIndex ) == CustomType || m_triangleMask == InvalidValue )\n"
"					{\n"
"						m_triangleMask = 0;\n"
"						m_leafIndex	   = InvalidValue;\n"
"					}\n"
"					m_state = hiprtTraversalStateHit;\n"
"					return hit;\n"
"				}\n"
"				else\n"
"				{\n"
"					result	 = hit;\n"
"					ray.maxT = hit.t;\n"
"				}\n"
"			}\n"
"\n"
// Done with this leaf; if the pending node is also a leaf, continue with it.
"			m_triangleMask = 0;\n"
"			m_leafIndex	   = InvalidValue;\n"
"			if ( isLeafNode( m_nodeIndex ) )\n"
"			{\n"
"				m_leafIndex = m_nodeIndex;\n"
"				m_nodeIndex = m_stack.pop();\n"
"			}\n"
"		}\n"
"	}\n"
"\n"
"	if ( m_state != hiprtTraversalStateStackOverflow ) m_state = hiprtTraversalStateFinished;\n"
"\n"
"	return result;\n"
"}\n"
"\n"
// Embedded source: traversal of a scene, i.e. a BVH whose leaves are instances.
// `InstanceStack` is the stack type used for instance levels; when it is
// hiprtEmptyInstanceStack a single instance id is tracked, otherwise one id per
// level up to MaxInstanceLevels.
"template <typename Stack, typename InstanceStack, hiprtTraversalType TraversalType>\n"
"class SceneTraversal : public TraversalBase<Stack, TraversalType>\n"
"{\n"
"public:\n"
"	HIPRT_DEVICE SceneTraversal(\n"
"		hiprtScene		   scene,\n"
"		const hiprtRay&	   ray,\n"
"		Stack&			   stack,\n"
"		InstanceStack&	   instanceStack,\n"
"		hiprtRayMask	   mask		 = InvalidValue,\n"
"		hiprtTraversalHint hint		 = hiprtTraversalHintDefault,\n"
"		void*			   payload	 = nullptr,\n"
"		hiprtFuncTable	   funcTable = nullptr,\n"
"		uint32_t		   rayType	 = 0,\n"
"		float			   time		 = 0.0f );\n"
"\n"
// Instance id for the current level (const access).
"	HIPRT_DEVICE const uint32_t& instanceId() const\n"
"	{\n"
"		if constexpr ( !is_same<InstanceStack, hiprtEmptyInstanceStack>::value )\n"
"			return m_instanceIds[m_level];\n"
"		else\n"
"			return m_instanceId;\n"
"	}\n"
"\n"
// Instance id for the current level (mutable access).
"	HIPRT_DEVICE uint32_t& instanceId()\n"
"	{\n"
"		if constexpr ( !is_same<InstanceStack, hiprtEmptyInstanceStack>::value )\n"
"			return m_instanceIds[m_level];\n"
"		else\n"
"			return m_instanceId;\n"
"	}\n"
"\n"
"	HIPRT_DEVICE bool transformRay( uint32_t nodeIndex, hiprtRay& ray, float3& invD );\n"
"\n"
"	HIPRT_DEVICE void restoreRay( hiprtRay& ray, float3& invD ) const;\n"
"\n"
"	HIPRT_DEVICE bool testLeafNode(\n"
"		void*			primNodes,\n"
"		const hiprtRay& ray,\n"
"		const float3&	invD,\n"
"		uint32_t&		leafIndex,\n"
"		uint32_t&		triangleMask,\n"
"		uint32_t		geomType,\n"
"		hiprtHit&		hit );\n"
"\n"
"	HIPRT_DEVICE hiprtHit getNextHit();\n"
"\n"
"protected:\n"
// Bring the dependent base-class members into scope so they can be named
// without `this->`.
"	using TraversalBase<Stack, TraversalType>::m_tableHeader;\n"
"	using TraversalBase<Stack, TraversalType>::m_ray;\n"
"	using TraversalBase<Stack, TraversalType>::m_state;\n"
"	using TraversalBase<Stack, TraversalType>::m_stack;\n"
"	using TraversalBase<Stack, TraversalType>::m_boxNodes;\n"
"	using TraversalBase<Stack, TraversalType>::m_payload;\n"
"	using TraversalBase<Stack, TraversalType>::m_nodeIndex;\n"
"	using TraversalBase<Stack, TraversalType>::m_triangleMask;\n"
"	using TraversalBase<Stack, TraversalType>::m_rayType;\n"
"	using TraversalBase<Stack, TraversalType>::m_hint;\n"
"#if HIPRT_RTIP >= 11\n"
"	using TraversalBase<Stack, TraversalType>::m_descriptor;\n"
"#endif\n"
"\n"
// The single id and the per-level array share storage; instanceId() picks the
// view that matches InstanceStack.
"	union\n"
"	{\n"
"		uint32_t m_instanceId;\n"
"		uint32_t m_instanceIds[MaxInstanceLevels];\n"
"	};\n"
"\n"
// Note: the constructor's initializer list names m_time, m_mask,
// m_instanceStack and m_level in a different order than they are declared here;
// initialization follows declaration order. m_scene is only assigned when
// InstanceStack is not hiprtEmptyInstanceStack.
"	InstanceStack& m_instanceStack;\n"
"	SceneHeader*   m_scene;\n"
"	InstanceNode*  m_instanceNodes;\n"
"	Frame*		   m_frames;\n"
"	hiprtRayMask   m_mask;\n"
"	uint32_t	   m_level;\n"
"	uint32_t	   m_instanceIndex;\n"
"	float		   m_time;\n"
"};\n"
"\n"
"template <typename Stack, typename InstanceStack, hiprtTraversalType TraversalType>\n"
"HIPRT_DEVICE SceneTraversal<Stack, InstanceStack, TraversalType>::SceneTraversal(\n"
"	hiprtScene		   scene,\n"
"	const hiprtRay&	   ray,\n"
"	Stack&			   stack,\n"
"	InstanceStack&	   instanceStack,\n"
"	hiprtRayMask	   mask,\n"
"	hiprtTraversalHint hint,\n"
"	void*			   payload,\n"
"	hiprtFuncTable	   funcTable,\n"
"	uint32_t		   rayType,\n"
"	float			   time )\n"
"	: TraversalBase<Stack, TraversalType>( ray, stack, hint, payload, funcTable, rayType ), m_time( time ), m_mask( mask ),\n"
"	  m_instanceStack( instanceStack ), m_level( 0u )\n"
"{\n"
"	SceneHeader* sceneHeader = reinterpret_cast<SceneHeader*>( scene );\n"
"	m_boxNodes				 = sceneHeader->m_boxNodes;\n"
"	m_instanceNodes			 = sceneHeader->m_primNodes;\n"
"	m_frames				 = sceneHeader->m_frames;\n"
"	m_stack.reset();\n"
"	m_instanceIndex = InvalidValue;\n"
"	instanceId()	= InvalidValue;\n"
"	if constexpr ( !is_same<InstanceStack, hiprtEmptyInstanceStack>::value )\n"
"	{\n"
"		m_instanceStack.reset();\n"
"		m_scene = sceneHeader;\n"
"	}\n"
"}\n"
"\n"
"template <typename Stack, typename InstanceStack, hiprtTraversalType TraversalType>\n"
"HIPRT_DEVICE bool SceneTraversal<Stack, InstanceStack, TraversalType>::transformRay(\n"
"	uint32_t nodeIndex, hiprtRay& ray, [[maybe_unused]] float3& invD )\n"
"{\n"
"	const InstanceNode& instanceNode = m_instanceNodes[getNodeAddr( nodeIndex )];\n"
"	if ( instanceNode.m_identity == 0 )\n"
"	{\n"
"		if ( instanceNode.m_static != 0 )\n"
"		{\n"
"#if HIPRT_RTIP >= 31\n"
"			hip_float3 origin, direction;\n"
"			auto	   result = __builtin_amdgcn_image_bvh8_intersect_ray(\n"
"				  encodeBaseAddr( m_instanceNodes ),\n"
"				  ray.maxT,\n"
"				  0xff,\n"
"				  { ray.origin.x, ray.origin.y, ray.origin.z },\n"
"				  { ray.direction.x, ray.direction.y, ray.direction.z },\n"
"				  nodeIndex,\n"
"				  { m_descriptor.x, m_descriptor.y, m_descriptor.z, m_descriptor.w },\n"
"				  &origin,\n"
"				  &direction );\n"
"\n"
"			if ( result[7] == InvalidValue ) return false;\n"
"\n"
"			ray.origin	  = { origin.x, origin.y, origin.z };\n"
"			ray.direction = { direction.x, direction.y, direction.z };\n"
"#else\n"
"			ray = instanceNode.transformRay( ray );\n"
"#endif\n"
"		}\n"
"		else\n"
"		{\n"
"			Transform tr( m_frames, instanceNode.m_transform.frameIndex, instanceNode.m_transform.frameCount );\n"
"			ray = tr.transformRay( ray, m_time );\n"
"		}\n"
"		if constexpr ( Rtip < 31 ) invD = rcp( ray.direction );\n"
"	}\n"
"\n"
"	return true;\n"
"}\n"
"\n"
"template <typename Stack, typename InstanceStack, hiprtTraversalType TraversalType>\n"
"HIPRT_DEVICE void SceneTraversal<Stack, InstanceStack, TraversalType>::restoreRay( hiprtRay& ray, float3& invD ) const\n"
"{\n"
"	ray.origin	  = m_ray.origin;\n"
"	ray.direction = m_ray.direction;\n"
"	if constexpr ( Rtip < 31 ) invD = rcp( m_ray.direction );\n"
"}\n"
"\n"
"template <typename Stack, typename InstanceStack, hiprtTraversalType TraversalType>\n"
"HIPRT_DEVICE bool SceneTraversal<Stack, InstanceStack, TraversalType>::testLeafNode(\n"
"	void*						   primNodes,\n"
"	const hiprtRay&				   ray,\n"
"	[[maybe_unused]] const float3& invD,\n"
"	uint32_t&					   leafIndex,\n"
"	[[maybe_unused]] uint32_t&	   triangleMask,\n"
"	uint32_t					   geomType,\n"
"	hiprtHit&					   hit )\n"
"{\n"
"	bool hasHit = false;\n"
"	if constexpr ( !is_same<InstanceStack, hiprtEmptyInstanceStack>::value )\n"
"	{\n"
"#pragma unroll\n"
"		for ( uint32_t i = 0; i < MaxInstanceLevels; ++i )\n"
"		{\n"
"			if ( i <= m_level )\n"
"				hit.instanceIDs[i] = m_instanceIds[i];\n"
"			else\n"
"				hit.instanceIDs[i] = InvalidValue;\n"
"		}\n"
"	}\n"
"	else\n"
"	{\n"
"		hit.instanceID = instanceId();\n"
"	}\n"
"\n"
"	if ( geomType & 1 )\n"
"	{\n"
"		TriangleNode* nodes = reinterpret_cast<TriangleNode*>( primNodes );\n"
"		hasHit				= this->testTriangleNode( ray, invD, nodes, geomType, leafIndex, triangleMask, hit );\n"
"	}\n"
"	else\n"
"	{\n"
"		const bool	useFilter = geomType != InvalidValue && m_tableHeader.funcDataSets != nullptr;\n"
"		CustomNode* nodes	  = reinterpret_cast<CustomNode*>( primNodes );\n"
"		hit.primID			  = nodes[getNodeAddr( leafIndex )].m_primIndex;\n"
"		hasHit				  = intersectFunc( geomType >> 1, m_rayType, m_tableHeader, ray, m_payload, hit );\n"
"		if ( useFilter && hasHit && filterFunc( geomType >> 1, m_rayType, m_tableHeader, ray, m_payload, hit ) ) hasHit = false;\n"
"		if ( !hasHit ) hit.primID = InvalidValue;\n"
"	}\n"
"\n"
"	return hasHit;\n"
"}\n"
"\n"
"template <typename Stack, typename InstanceStack, hiprtTraversalType TraversalType>\n"
"HIPRT_DEVICE hiprtHit SceneTraversal<Stack, InstanceStack, TraversalType>::getNextHit()\n"
"{\n"
"	BoxNode* nodes	   = m_boxNodes;\n"
"	void*	 primNodes = nullptr;\n"
"	uint32_t geomType  = InvalidValue;\n"
"\n"
"	hiprtRay ray = m_ray;\n"
"	float3	 invD;\n"
"	if constexpr ( Rtip < 31 )\n"
"	{\n"
"		if ( instanceId() == InvalidValue ) invD = rcp( m_ray.direction );\n"
"	}\n"
"\n"
"	if constexpr ( TraversalType == hiprtTraversalTerminateAtAnyHit )\n"
"	{\n"
"		if ( instanceId() != InvalidValue )\n"
"		{\n"
"			transformRay( m_instanceIndex, ray, invD );\n"
"			nodes	  = m_instanceNodes[getNodeAddr( m_instanceIndex )].m_geometry->m_boxNodes;\n"
"			primNodes = m_instanceNodes[getNodeAddr( m_instanceIndex )].m_geometry->m_primNodes;\n"
"			geomType  = m_instanceNodes[getNodeAddr( m_instanceIndex )].m_geometry->m_geomType;\n"
"		}\n"
"	}\n"
"\n"
"	hiprtHit result;\n"
"\n"
"	if ( m_stack.empty() ) m_stack.push( InvalidValue );\n"
"\n"
"	while ( m_nodeIndex != InvalidValue && m_state != hiprtTraversalStateStackOverflow )\n"
"	{\n"
"		if ( isInternalNode( m_nodeIndex ) )\n"
"		{\n"
"			if ( this->testInternalNode( ray, invD, nodes, m_nodeIndex ) ) continue;\n"
"		}\n"
"		else\n"
"		{\n"
"			if ( instanceId() != InvalidValue )\n"
"			{\n"
"				hiprtHit hit;\n"
"				if ( testLeafNode( primNodes, ray, invD, m_nodeIndex, m_triangleMask, geomType, hit ) )\n"
"				{\n"
"					if constexpr ( TraversalType == hiprtTraversalTerminateAtAnyHit )\n"
"					{\n"
"						m_state = hiprtTraversalStateHit;\n"
"						if ( getNodeType( m_nodeIndex ) == CustomType || m_triangleMask == InvalidValue )\n"
"						{\n"
"							m_triangleMask = 0;\n"
"							m_nodeIndex	   = m_stack.pop();\n"
"\n"
"							while ( m_nodeIndex == InvalidValue && !m_stack.empty() )\n"
"							{\n"
"								if constexpr ( !is_same<InstanceStack, hiprtEmptyInstanceStack>::value )\n"
"								{\n"
"									if ( instanceId() == InvalidValue )\n"
"									{\n"
"										hiprtInstanceStackEntry instanceEntry = m_instanceStack.pop();\n"
"										m_ray								  = instanceEntry.ray;\n"
"										m_scene = reinterpret_cast<SceneHeader*>( instanceEntry.scene );\n"
"										m_level--;\n"
"\n"
"										m_boxNodes		= m_scene->m_boxNodes;\n"
"										m_instanceNodes = m_scene->m_primNodes;\n"
"										m_frames		= m_scene->m_frames;\n"
"									}\n"
"								}\n"
"\n"
"								instanceId() = InvalidValue;\n"
"								m_nodeIndex	 = m_stack.pop();\n"
"							}\n"
"						}\n"
"						return hit;\n"
"					}\n"
"					else\n"
"					{\n"
"						ray.maxT = hit.t;\n"
"						result	 = hit;\n"
"					}\n"
"				}\n"
"			}\n"
"			else\n"
"			{\n"
"				const uint32_t instanceAddr = getNodeAddr( m_nodeIndex );\n"
"				if ( ( m_instanceNodes[instanceAddr].m_mask & m_mask ) && transformRay( m_nodeIndex, ray, invD ) )\n"
"				{\n"
"					if ( m_stack.vacancy() < 1 )\n"
"					{\n"
"						m_state = hiprtTraversalStateStackOverflow;\n"
"						continue;\n"
"					}\n"
"\n"
"					m_instanceIndex = m_nodeIndex;\n"
"					instanceId()	= m_instanceNodes[instanceAddr].m_primIndex;\n"
"\n"
"					m_nodeIndex = RootIndex;\n"
"					m_stack.push( InvalidValue );\n"
"\n"
"					if constexpr ( !is_same<InstanceStack, hiprtEmptyInstanceStack>::value )\n"
"					{\n"
"						if ( m_instanceNodes[instanceAddr].m_type == hiprtInstanceTypeScene )\n"
"						{\n"
"							m_instanceStack.push( { m_ray, reinterpret_cast<hiprtScene>( m_scene ) } );\n"
"							m_ray	= ray;\n"
"							m_scene = m_instanceNodes[instanceAddr].m_scene;\n"
"							m_level++;\n"
"							instanceId() = InvalidValue;\n"
"\n"
"							m_boxNodes		= m_scene->m_boxNodes;\n"
"							m_instanceNodes = m_scene->m_primNodes;\n"
"							m_frames		= m_scene->m_frames;\n"
"\n"
"							nodes = m_boxNodes;\n"
"							continue;\n"
"						}\n"
"					}\n"
"					nodes	  = m_instanceNodes[instanceAddr].m_geometry->m_boxNodes;\n"
"					primNodes = m_instanceNodes[instanceAddr].m_geometry->m_primNodes;\n"
"					geomType  = m_instanceNodes[instanceAddr].m_geometry->m_geomType;\n"
"					continue;\n"
"				}\n"
"			}\n"
"		}\n"
"\n"
"		m_triangleMask = 0;\n"
"		m_nodeIndex	   = m_stack.pop();\n"
"		while ( m_nodeIndex == InvalidValue && !m_stack.empty() )\n"
"		{\n"
"			if constexpr ( !is_same<InstanceStack, hiprtEmptyInstanceStack>::value )\n"
"			{\n"
"				if ( instanceId() == InvalidValue )\n"
"				{\n"
"					hiprtInstanceStackEntry instanceEntry = m_instanceStack.pop();\n"
"					m_ray								  = instanceEntry.ray;\n"
"					m_scene								  = reinterpret_cast<SceneHeader*>( instanceEntry.scene );\n"
"					m_level--;\n"
"\n"
"					m_boxNodes		= m_scene->m_boxNodes;\n"
"					m_instanceNodes = m_scene->m_primNodes;\n"
"					m_frames		= m_scene->m_frames;\n"
"				}\n"
"			}\n"
"			instanceId() = InvalidValue;\n"
"			m_nodeIndex	 = m_stack.pop();\n"
"			nodes		 = m_boxNodes;\n"
"			restoreRay( ray, invD );\n"
"		}\n"
"	}\n"
"\n"
"	if ( m_state != hiprtTraversalStateStackOverflow ) m_state = hiprtTraversalStateFinished;\n"
"\n"
"	return result;\n"
"}\n"
"\n"
"template <typename PrimitiveNode, hiprtTraversalType TraversalType>\n"
"class GeomTraversalPrivateStack\n"
"{\n"
"public:\n"
"	using Stack = hiprtPrivateStack;\n"
"\n"
"	HIPRT_DEVICE GeomTraversalPrivateStack(\n"
"		hiprtGeometry	   geom,\n"
"		const hiprtRay&	   ray,\n"
"		hiprtTraversalHint hint		 = hiprtTraversalHintDefault,\n"
"		void*			   payload	 = nullptr,\n"
"		hiprtFuncTable	   funcTable = nullptr,\n"
"		uint32_t		   rayType	 = 0 )\n"
"		: m_traversal( geom, ray, m_stack, hint, payload, funcTable, rayType )\n"
"	{\n"
"	}\n"
"\n"
"	HIPRT_DEVICE hiprtHit getNextHit() { return m_traversal.getNextHit(); }\n"
"\n"
"	HIPRT_DEVICE hiprtTraversalState getCurrentState() { return m_traversal.getCurrentState(); }\n"
"\n"
"private:\n"
"	Stack											   m_stack;\n"
"	GeomTraversal<Stack, PrimitiveNode, TraversalType> m_traversal;\n"
"};\n"
"\n"
"template <hiprtTraversalType TraversalType>\n"
"class SceneTraversalPrivateStack\n"
"{\n"
"public:\n"
"	using Stack			= hiprtPrivateStack;\n"
"	using InstanceStack = hiprtPrivateInstanceStack;\n"
"\n"
"	HIPRT_DEVICE SceneTraversalPrivateStack(\n"
"		hiprtScene		   scene,\n"
"		const hiprtRay&	   ray,\n"
"		hiprtRayMask	   mask		 = hiprtFullRayMask,\n"
"		hiprtTraversalHint hint		 = hiprtTraversalHintDefault,\n"
"		void*			   payload	 = nullptr,\n"
"		hiprtFuncTable	   funcTable = nullptr,\n"
"		uint32_t		   rayType	 = 0,\n"
"		float			   time		 = 0.0f )\n"
"		: m_traversal( scene, ray, m_stack, m_instanceStack, mask, hint, payload, funcTable, rayType, time )\n"
"	{\n"
"	}\n"
"\n"
"	HIPRT_DEVICE hiprtHit getNextHit() { return m_traversal.getNextHit(); }\n"
"\n"
"	HIPRT_DEVICE hiprtTraversalState getCurrentState() { return m_traversal.getCurrentState(); }\n"
"\n"
"private:\n"
"	Stack												m_stack;\n"
"	InstanceStack										m_instanceStack;\n"
"	SceneTraversal<Stack, InstanceStack, TraversalType> m_traversal;\n"
"};\n"
"} // namespace hiprt\n"
"\n"
"template <typename StackEntry, uint32_t StackSize>\n"
"class hiprtPrivateStack_impl\n"
"{\n"
"public:\n"
"	HIPRT_DEVICE hiprtPrivateStack_impl() : m_stack() {}\n"
"	~hiprtPrivateStack_impl() = default;\n"
"	HIPRT_DEVICE StackEntry pop() { return m_stack.pop(); }\n"
"	HIPRT_DEVICE void		push( StackEntry val ) { m_stack.push( val ); }\n"
"	HIPRT_DEVICE bool		empty() const { return m_stack.empty(); }\n"
"	HIPRT_DEVICE uint32_t	vacancy() const { return m_stack.vacancy(); }\n"
"	HIPRT_DEVICE void		reset() { m_stack.reset(); }\n"
"\n"
"private:\n"
"	hiprt::PrivateStack<StackEntry, StackSize> m_stack;\n"
"};\n"
"\n"
"template <typename StackEntry, bool DynamicAssignment>\n"
"class hiprtGlobalStack_impl\n"
"{\n"
"public:\n"
"	HIPRT_DEVICE hiprtGlobalStack_impl( hiprtGlobalStackBuffer globalStackBuffer, hiprtSharedStackBuffer sharedStackBuffer )\n"
"		: m_stack( globalStackBuffer, sharedStackBuffer )\n"
"	{\n"
"	}\n"
"	~hiprtGlobalStack_impl() = default;\n"
"	HIPRT_DEVICE StackEntry pop() { return m_stack.pop(); }\n"
"	HIPRT_DEVICE void		push( StackEntry val ) { m_stack.push( val ); }\n"
"	HIPRT_DEVICE bool		empty() const { return m_stack.empty(); }\n"
"	HIPRT_DEVICE uint32_t	vacancy() const { return m_stack.vacancy(); }\n"
"	HIPRT_DEVICE void		reset() { m_stack.reset(); }\n"
"\n"
"private:\n"
"	hiprt::GlobalStack<StackEntry, DynamicAssignment> m_stack;\n"
"};\n"
"\n"
"template <hiprtPrimitiveNodeType PrimitiveNodeType, hiprtTraversalType TraversalType>\n"
"class hiprtGeomTraversal_impl\n"
"{\n"
"public:\n"
"	HIPRT_DEVICE hiprtGeomTraversal_impl(\n"
"		hiprtGeometry	   geom,\n"
"		const hiprtRay&	   ray,\n"
"		hiprtTraversalHint hint		 = hiprtTraversalHintDefault,\n"
"		void*			   payload	 = nullptr,\n"
"		hiprtFuncTable	   funcTable = nullptr,\n"
"		uint32_t		   rayType	 = 0 )\n"
"		: m_traversal( geom, ray, hint, payload, funcTable, rayType )\n"
"	{\n"
"	}\n"
"\n"
"	HIPRT_DEVICE hiprtHit getNextHit() { return m_traversal.getNextHit(); }\n"
"\n"
"	HIPRT_DEVICE hiprtTraversalState getCurrentState() { return m_traversal.getCurrentState(); }\n"
"\n"
"private:\n"
"	using NodeType =\n"
"		typename hiprt::conditional<PrimitiveNodeType == hiprtTriangleNode, hiprt::TriangleNode, hiprt::CustomNode>::type;\n"
"	hiprt::GeomTraversalPrivateStack<NodeType, TraversalType> m_traversal;\n"
"};\n"
"\n"
"template <hiprtTraversalType TraversalType>\n"
"class hiprtSceneTraversal_impl\n"
"{\n"
"public:\n"
"	HIPRT_DEVICE hiprtSceneTraversal_impl(\n"
"		hiprtScene		   scene,\n"
"		const hiprtRay&	   ray,\n"
"		hiprtRayMask	   mask		 = hiprtFullRayMask,\n"
"		hiprtTraversalHint hint		 = hiprtTraversalHintDefault,\n"
"		void*			   payload	 = nullptr,\n"
"		hiprtFuncTable	   funcTable = nullptr,\n"
"		uint32_t		   rayType	 = 0,\n"
"		float			   time		 = 0.0f )\n"
"		: m_traversal( scene, ray, mask, hint, payload, funcTable, rayType, time )\n"
"	{\n"
"	}\n"
"\n"
"	HIPRT_DEVICE hiprtHit getNextHit() { return m_traversal.getNextHit(); }\n"
"\n"
"	HIPRT_DEVICE hiprtTraversalState getCurrentState() { return m_traversal.getCurrentState(); }\n"
"\n"
"private:\n"
"	hiprt::SceneTraversalPrivateStack<TraversalType> m_traversal;\n"
"};\n"
"\n"
"template <typename hiprtStack, hiprtPrimitiveNodeType PrimitiveNodeType, hiprtTraversalType TraversalType>\n"
"class hiprtGeomTraversalCustomStack_impl\n"
"{\n"
"public:\n"
"	HIPRT_DEVICE hiprtGeomTraversalCustomStack_impl(\n"
"		hiprtGeometry	   geom,\n"
"		const hiprtRay&	   ray,\n"
"		hiprtStack&		   stack,\n"
"		hiprtTraversalHint hint		 = hiprtTraversalHintDefault,\n"
"		void*			   payload	 = nullptr,\n"
"		hiprtFuncTable	   funcTable = nullptr,\n"
"		uint32_t		   rayType	 = 0 )\n"
"		: m_traversal( geom, ray, stack, hint, payload, funcTable, rayType )\n"
"	{\n"
"	}\n"
"\n"
"	HIPRT_DEVICE hiprtHit getNextHit() { return m_traversal.getNextHit(); }\n"
"\n"
"	HIPRT_DEVICE hiprtTraversalState getCurrentState() { return m_traversal.getCurrentState(); }\n"
"\n"
"private:\n"
"	using NodeType =\n"
"		typename hiprt::conditional<PrimitiveNodeType == hiprtTriangleNode, hiprt::TriangleNode, hiprt::CustomNode>::type;\n"
"	hiprt::GeomTraversal<hiprtStack, NodeType, TraversalType> m_traversal;\n"
"};\n"
"\n"
"template <typename hiprtStack, typename hiprtInstanceStack, hiprtTraversalType TraversalType>\n"
"class hiprtSceneTraversalCustomStack_impl\n"
"{\n"
"public:\n"
"	HIPRT_DEVICE hiprtSceneTraversalCustomStack_impl(\n"
"		hiprtScene			scene,\n"
"		const hiprtRay&		ray,\n"
"		hiprtStack&			stack,\n"
"		hiprtInstanceStack& instanceStack,\n"
"		hiprtRayMask		mask	  = hiprtFullRayMask,\n"
"		hiprtTraversalHint	hint	  = hiprtTraversalHintDefault,\n"
"		void*				payload	  = nullptr,\n"
"		hiprtFuncTable		funcTable = nullptr,\n"
"		uint32_t			rayType	  = 0,\n"
"		float				time	  = 0.0f )\n"
"		: m_traversal( scene, ray, stack, instanceStack, mask, hint, payload, funcTable, rayType, time )\n"
"	{\n"
"	}\n"
"\n"
"	HIPRT_DEVICE hiprtHit getNextHit() { return m_traversal.getNextHit(); }\n"
"\n"
"	HIPRT_DEVICE hiprtTraversalState getCurrentState() { return m_traversal.getCurrentState(); }\n"
"\n"
"private:\n"
"	hiprt::SceneTraversal<hiprtStack, hiprtInstanceStack, TraversalType> m_traversal;\n"
"};\n"
"\n"
"\n"
"HIPRT_DEVICE hiprtPrivateStack::hiprtPrivateStack() : m_impl() {}\n"
"\n"
"HIPRT_DEVICE hiprtPrivateStack::~hiprtPrivateStack() { m_impl->~hiprtPrivateStack_impl(); }\n"
"\n"
"HIPRT_DEVICE uint32_t hiprtPrivateStack::pop() { return m_impl->pop(); }\n"
"\n"
"HIPRT_DEVICE void hiprtPrivateStack::push( uint32_t val ) { m_impl->push( val ); }\n"
"\n"
"HIPRT_DEVICE bool hiprtPrivateStack::empty() const { return m_impl->empty(); }\n"
"\n"
"HIPRT_DEVICE uint32_t hiprtPrivateStack::vacancy() const { return m_impl->vacancy(); }\n"
"\n"
"HIPRT_DEVICE void hiprtPrivateStack::reset() { m_impl->reset(); }\n"
"\n"
"HIPRT_DEVICE hiprtPrivateInstanceStack::hiprtPrivateInstanceStack() : m_impl() {}\n"
"\n"
"HIPRT_DEVICE hiprtPrivateInstanceStack::~hiprtPrivateInstanceStack() { m_impl->~hiprtPrivateStack_impl(); }\n"
"\n"
"HIPRT_DEVICE hiprtInstanceStackEntry hiprtPrivateInstanceStack::pop() { return m_impl->pop(); }\n"
"\n"
"HIPRT_DEVICE void hiprtPrivateInstanceStack::push( hiprtInstanceStackEntry val ) { m_impl->push( val ); }\n"
"\n"
"HIPRT_DEVICE bool hiprtPrivateInstanceStack::empty() const { return m_impl->empty(); }\n"
"\n"
"HIPRT_DEVICE uint32_t hiprtPrivateInstanceStack::vacancy() const { return m_impl->vacancy(); }\n"
"\n"
"HIPRT_DEVICE void hiprtPrivateInstanceStack::reset() { m_impl->reset(); }\n"
"\n"
"HIPRT_DEVICE\n"
"hiprtGlobalStack::hiprtGlobalStack( hiprtGlobalStackBuffer globalStackBuffer, hiprtSharedStackBuffer sharedStackBuffer )\n"
"	: m_impl( globalStackBuffer, sharedStackBuffer )\n"
"{\n"
"}\n"
"\n"
"HIPRT_DEVICE hiprtGlobalStack::~hiprtGlobalStack() { m_impl->~hiprtGlobalStack_impl(); }\n"
"\n"
"HIPRT_DEVICE uint32_t hiprtGlobalStack::pop() { return m_impl->pop(); }\n"
"\n"
"HIPRT_DEVICE void hiprtGlobalStack::push( uint32_t val ) { m_impl->push( val ); }\n"
"\n"
"HIPRT_DEVICE bool hiprtGlobalStack::empty() const { return m_impl->empty(); }\n"
"\n"
"HIPRT_DEVICE uint32_t hiprtGlobalStack::vacancy() const { return m_impl->vacancy(); }\n"
"\n"
"HIPRT_DEVICE void hiprtGlobalStack::reset() { m_impl->reset(); }\n"
"\n"
"HIPRT_DEVICE\n"
"hiprtGlobalInstanceStack::hiprtGlobalInstanceStack(\n"
"	hiprtGlobalStackBuffer globalStackBuffer, hiprtSharedStackBuffer sharedStackBuffer )\n"
"	: m_impl( globalStackBuffer, sharedStackBuffer )\n"
"{\n"
"}\n"
"\n"
"HIPRT_DEVICE hiprtGlobalInstanceStack::~hiprtGlobalInstanceStack() { m_impl->~hiprtGlobalStack_impl(); }\n"
"\n"
"HIPRT_DEVICE hiprtInstanceStackEntry hiprtGlobalInstanceStack::pop() { return m_impl->pop(); }\n"
"\n"
"HIPRT_DEVICE void hiprtGlobalInstanceStack::push( hiprtInstanceStackEntry val ) { m_impl->push( val ); }\n"
"\n"
"HIPRT_DEVICE bool hiprtGlobalInstanceStack::empty() const { return m_impl->empty(); }\n"
"\n"
"HIPRT_DEVICE uint32_t hiprtGlobalInstanceStack::vacancy() const { return m_impl->vacancy(); }\n"
"\n"
"HIPRT_DEVICE void hiprtGlobalInstanceStack::reset() { m_impl->reset(); }\n"
"\n"
"HIPRT_DEVICE\n"
"hiprtDynamicStack::hiprtDynamicStack( hiprtGlobalStackBuffer globalStackBuffer, hiprtSharedStackBuffer sharedStackBuffer )\n"
"	: m_impl( globalStackBuffer, sharedStackBuffer )\n"
"{\n"
"}\n"
"\n"
"HIPRT_DEVICE hiprtDynamicStack::~hiprtDynamicStack() { m_impl->~hiprtGlobalStack_impl(); }\n"
"\n"
"HIPRT_DEVICE uint32_t hiprtDynamicStack::pop() { return m_impl->pop(); }\n"
"\n"
"HIPRT_DEVICE void hiprtDynamicStack::push( uint32_t val ) { m_impl->push( val ); }\n"
"\n"
"HIPRT_DEVICE bool hiprtDynamicStack::empty() const { return m_impl->empty(); }\n"
"\n"
"HIPRT_DEVICE uint32_t hiprtDynamicStack::vacancy() const { return m_impl->vacancy(); }\n"
"\n"
"HIPRT_DEVICE void hiprtDynamicStack::reset() { m_impl->reset(); }\n"
"\n"
"HIPRT_DEVICE\n"
"hiprtDynamicInstanceStack::hiprtDynamicInstanceStack(\n"
"	hiprtGlobalStackBuffer globalStackBuffer, hiprtSharedStackBuffer sharedStackBuffer )\n"
"	: m_impl( globalStackBuffer, sharedStackBuffer )\n"
"{\n"
"}\n"
"\n"
"HIPRT_DEVICE hiprtDynamicInstanceStack::~hiprtDynamicInstanceStack() { m_impl->~hiprtGlobalStack_impl(); }\n"
"\n"
"HIPRT_DEVICE hiprtInstanceStackEntry hiprtDynamicInstanceStack::pop() { return m_impl->pop(); }\n"
"\n"
"HIPRT_DEVICE void hiprtDynamicInstanceStack::push( hiprtInstanceStackEntry val ) { m_impl->push( val ); }\n"
"\n"
"HIPRT_DEVICE bool hiprtDynamicInstanceStack::empty() const { return m_impl->empty(); }\n"
"\n"
"HIPRT_DEVICE uint32_t hiprtDynamicInstanceStack::vacancy() const { return m_impl->vacancy(); }\n"
"\n"
"HIPRT_DEVICE void hiprtDynamicInstanceStack::reset() { m_impl->reset(); }\n"
"\n"
"HIPRT_DEVICE hiprtGeomTraversalClosest::hiprtGeomTraversalClosest(\n"
"	hiprtGeometry	   geom,\n"
"	const hiprtRay&	   ray,\n"
"	hiprtTraversalHint hint,\n"
"	void*			   payload,\n"
"	hiprtFuncTable	   funcTable,\n"
"	uint32_t		   rayType )\n"
"	: m_impl( geom, ray, hint, payload, funcTable, rayType )\n"
"{\n"
"}\n"
"\n"
"HIPRT_DEVICE hiprtHit hiprtGeomTraversalClosest::getNextHit() { return m_impl->getNextHit(); }\n"
"\n"
"HIPRT_DEVICE hiprtTraversalState hiprtGeomTraversalClosest::getCurrentState() { return m_impl->getCurrentState(); }\n"
"\n"
"HIPRT_DEVICE hiprtGeomTraversalAnyHit::hiprtGeomTraversalAnyHit(\n"
"	hiprtGeometry	   geom,\n"
"	const hiprtRay&	   ray,\n"
"	hiprtTraversalHint hint,\n"
"	void*			   payload,\n"
"	hiprtFuncTable	   funcTable,\n"
"	uint32_t		   rayType )\n"
"	: m_impl( geom, ray, hint, payload, funcTable, rayType )\n"
"{\n"
"}\n"
"\n"
"HIPRT_DEVICE hiprtHit hiprtGeomTraversalAnyHit::getNextHit() { return m_impl->getNextHit(); }\n"
"\n"
"HIPRT_DEVICE hiprtTraversalState hiprtGeomTraversalAnyHit::getCurrentState() { return m_impl->getCurrentState(); }\n"
"\n"
"HIPRT_DEVICE hiprtGeomCustomTraversalClosest::hiprtGeomCustomTraversalClosest(\n"
"	hiprtGeometry	   geom,\n"
"	const hiprtRay&	   ray,\n"
"	hiprtTraversalHint hint,\n"
"	void*			   payload,\n"
"	hiprtFuncTable	   funcTable,\n"
"	uint32_t		   rayType )\n"
"	: m_impl( geom, ray, hint, payload, funcTable, rayType )\n"
"{\n"
"}\n"
"\n"
"HIPRT_DEVICE hiprtHit hiprtGeomCustomTraversalClosest::getNextHit() { return m_impl->getNextHit(); }\n"
"\n"
"HIPRT_DEVICE hiprtTraversalState hiprtGeomCustomTraversalClosest::getCurrentState() { return m_impl->getCurrentState(); }\n"
"\n"
"HIPRT_DEVICE hiprtGeomCustomTraversalAnyHit::hiprtGeomCustomTraversalAnyHit(\n"
"	hiprtGeometry	   geom,\n"
"	const hiprtRay&	   ray,\n"
"	hiprtTraversalHint hint,\n"
"	void*			   payload,\n"
"	hiprtFuncTable	   funcTable,\n"
"	uint32_t		   rayType )\n"
"	: m_impl( geom, ray, hint, payload, funcTable, rayType )\n"
"{\n"
"}\n"
"\n"
"HIPRT_DEVICE hiprtHit hiprtGeomCustomTraversalAnyHit::getNextHit() { return m_impl->getNextHit(); }\n"
"\n"
"HIPRT_DEVICE hiprtTraversalState hiprtGeomCustomTraversalAnyHit::getCurrentState() { return m_impl->getCurrentState(); }\n"
"\n"
"HIPRT_DEVICE hiprtSceneTraversalClosest::hiprtSceneTraversalClosest(\n"
"	hiprtScene		   scene,\n"
"	const hiprtRay&	   ray,\n"
"	hiprtRayMask	   mask,\n"
"	hiprtTraversalHint hint,\n"
"	void*			   payload,\n"
"	hiprtFuncTable	   funcTable,\n"
"	uint32_t		   rayType,\n"
"	float			   time )\n"
"	: m_impl( scene, ray, mask, hint, payload, funcTable, rayType, time )\n"
"{\n"
"}\n"
"\n"
"HIPRT_DEVICE hiprtHit hiprtSceneTraversalClosest::getNextHit() { return m_impl->getNextHit(); }\n"
"\n"
"HIPRT_DEVICE hiprtTraversalState hiprtSceneTraversalClosest::getCurrentState() { return m_impl->getCurrentState(); }\n"
"\n"
"HIPRT_DEVICE hiprtSceneTraversalAnyHit::hiprtSceneTraversalAnyHit(\n"
"	hiprtScene		   scene,\n"
"	const hiprtRay&	   ray,\n"
"	hiprtRayMask	   mask,\n"
"	hiprtTraversalHint hint,\n"
"	void*			   payload,\n"
"	hiprtFuncTable	   funcTable,\n"
"	uint32_t		   rayType,\n"
"	float			   time )\n"
"	: m_impl( scene, ray, mask, hint, payload, funcTable, rayType, time )\n"
"{\n"
"}\n"
"\n"
"HIPRT_DEVICE hiprtHit hiprtSceneTraversalAnyHit::getNextHit() { return m_impl->getNextHit(); }\n"
"\n"
"HIPRT_DEVICE hiprtTraversalState hiprtSceneTraversalAnyHit::getCurrentState() { return m_impl->getCurrentState(); }\n"
"\n"
"template <typename hiprtStack>\n"
"HIPRT_DEVICE hiprtGeomTraversalClosestCustomStack<hiprtStack>::hiprtGeomTraversalClosestCustomStack(\n"
"	hiprtGeometry	   geom,\n"
"	const hiprtRay&	   ray,\n"
"	hiprtStack&		   stack,\n"
"	hiprtTraversalHint hint,\n"
"	void*			   payload,\n"
"	hiprtFuncTable	   funcTable,\n"
"	uint32_t		   rayType )\n"
"	: m_impl( geom, ray, stack, hint, payload, funcTable, rayType )\n"
"{\n"
"}\n"
"\n"
"template <typename hiprtStack>\n"
"HIPRT_DEVICE hiprtHit hiprtGeomTraversalClosestCustomStack<hiprtStack>::getNextHit()\n"
"{\n"
"	return m_impl->getNextHit();\n"
"}\n"
"\n"
"template <typename hiprtStack>\n"
"HIPRT_DEVICE hiprtTraversalState hiprtGeomTraversalClosestCustomStack<hiprtStack>::getCurrentState()\n"
"{\n"
"	return m_impl->getCurrentState();\n"
"}\n"
"\n"
"template <typename hiprtStack>\n"
"HIPRT_DEVICE hiprtGeomTraversalAnyHitCustomStack<hiprtStack>::hiprtGeomTraversalAnyHitCustomStack(\n"
"	hiprtGeometry	   geom,\n"
"	const hiprtRay&	   ray,\n"
"	hiprtStack&		   stack,\n"
"	hiprtTraversalHint hint,\n"
"	void*			   payload,\n"
"	hiprtFuncTable	   funcTable,\n"
"	uint32_t		   rayType )\n"
"	: m_impl( geom, ray, stack, hint, payload, funcTable, rayType )\n"
"{\n"
"}\n"
"\n"
"template <typename hiprtStack>\n"
"HIPRT_DEVICE hiprtHit hiprtGeomTraversalAnyHitCustomStack<hiprtStack>::getNextHit()\n"
"{\n"
"	return m_impl->getNextHit();\n"
"}\n"
"\n"
"template <typename hiprtStack>\n"
"HIPRT_DEVICE hiprtTraversalState hiprtGeomTraversalAnyHitCustomStack<hiprtStack>::getCurrentState()\n"
"{\n"
"	return m_impl->getCurrentState();\n"
"}\n"
"\n"
"template <typename hiprtStack>\n"
"HIPRT_DEVICE hiprtGeomCustomTraversalClosestCustomStack<hiprtStack>::hiprtGeomCustomTraversalClosestCustomStack(\n"
"	hiprtGeometry	   geom,\n"
"	const hiprtRay&	   ray,\n"
"	hiprtStack&		   stack,\n"
"	hiprtTraversalHint hint,\n"
"	void*			   payload,\n"
"	hiprtFuncTable	   funcTable,\n"
"	uint32_t		   rayType )\n"
"	: m_impl( geom, ray, stack, hint, payload, funcTable, rayType )\n"
"{\n"
"}\n"
"\n"
"template <typename hiprtStack>\n"
"HIPRT_DEVICE hiprtHit hiprtGeomCustomTraversalClosestCustomStack<hiprtStack>::getNextHit()\n"
"{\n"
"	return m_impl->getNextHit();\n"
"}\n"
"\n"
"template <typename hiprtStack>\n"
"HIPRT_DEVICE hiprtTraversalState hiprtGeomCustomTraversalClosestCustomStack<hiprtStack>::getCurrentState()\n"
"{\n"
"	return m_impl->getCurrentState();\n"
"}\n"
"\n"
"template <typename hiprtStack>\n"
"HIPRT_DEVICE hiprtGeomCustomTraversalAnyHitCustomStack<hiprtStack>::hiprtGeomCustomTraversalAnyHitCustomStack(\n"
"	hiprtGeometry	   geom,\n"
"	const hiprtRay&	   ray,\n"
"	hiprtStack&		   stack,\n"
"	hiprtTraversalHint hint,\n"
"	void*			   payload,\n"
"	hiprtFuncTable	   funcTable,\n"
"	uint32_t		   rayType )\n"
"	: m_impl( geom, ray, stack, hint, payload, funcTable, rayType )\n"
"{\n"
"}\n"
"\n"
"template <typename hiprtStack>\n"
"HIPRT_DEVICE hiprtHit hiprtGeomCustomTraversalAnyHitCustomStack<hiprtStack>::getNextHit()\n"
"{\n"
"	return m_impl->getNextHit();\n"
"}\n"
"\n"
"template <typename hiprtStack>\n"
"HIPRT_DEVICE hiprtTraversalState hiprtGeomCustomTraversalAnyHitCustomStack<hiprtStack>::getCurrentState()\n"
"{\n"
"	return m_impl->getCurrentState();\n"
"}\n"
"\n"
"template <typename hiprtStack, typename hiprtInstanceStack>\n"
"HIPRT_DEVICE hiprtSceneTraversalClosestCustomStack<hiprtStack, hiprtInstanceStack>::hiprtSceneTraversalClosestCustomStack(\n"
"	hiprtScene			scene,\n"
"	const hiprtRay&		ray,\n"
"	hiprtStack&			stack,\n"
"	hiprtInstanceStack& instanceStack,\n"
"	hiprtRayMask		mask,\n"
"	hiprtTraversalHint	hint,\n"
"	void*				payload,\n"
"	hiprtFuncTable		funcTable,\n"
"	uint32_t			rayType,\n"
"	float				time )\n"
"	: m_impl( scene, ray, stack, instanceStack, mask, hint, payload, funcTable, rayType, time )\n"
"{\n"
"}\n"
"\n"
"template <typename hiprtStack, typename hiprtInstanceStack>\n"
"HIPRT_DEVICE hiprtHit hiprtSceneTraversalClosestCustomStack<hiprtStack, hiprtInstanceStack>::getNextHit()\n"
"{\n"
"	return m_impl->getNextHit();\n"
"}\n"
"\n"
"template <typename hiprtStack, typename hiprtInstanceStack>\n"
"HIPRT_DEVICE hiprtTraversalState hiprtSceneTraversalClosestCustomStack<hiprtStack, hiprtInstanceStack>::getCurrentState()\n"
"{\n"
"	return m_impl->getCurrentState();\n"
"}\n"
"\n"
"template <typename hiprtStack, typename hiprtInstanceStack>\n"
"HIPRT_DEVICE hiprtSceneTraversalAnyHitCustomStack<hiprtStack, hiprtInstanceStack>::hiprtSceneTraversalAnyHitCustomStack(\n"
"	hiprtScene			scene,\n"
"	const hiprtRay&		ray,\n"
"	hiprtStack&			stack,\n"
"	hiprtInstanceStack& instanceStack,\n"
"	hiprtRayMask		mask,\n"
"	hiprtTraversalHint	hint,\n"
"	void*				payload,\n"
"	hiprtFuncTable		funcTable,\n"
"	uint32_t			rayType,\n"
"	float				time )\n"
"	: m_impl( scene, ray, stack, instanceStack, mask, hint, payload, funcTable, rayType, time )\n"
"{\n"
"}\n"
"\n"
"template <typename hiprtStack, typename hiprtInstanceStack>\n"
"HIPRT_DEVICE hiprtHit hiprtSceneTraversalAnyHitCustomStack<hiprtStack, hiprtInstanceStack>::getNextHit()\n"
"{\n"
"	return m_impl->getNextHit();\n"
"}\n"
"\n"
"template <typename hiprtStack, typename hiprtInstanceStack>\n"
"HIPRT_DEVICE hiprtTraversalState hiprtSceneTraversalAnyHitCustomStack<hiprtStack, hiprtInstanceStack>::getCurrentState()\n"
"{\n"
"	return m_impl->getCurrentState();\n"
"}\n"
"\n"
"HIPRT_DEVICE float3 hiprtPointObjectToWorld( const float3& point, hiprtScene scene, uint32_t instanceID, float time )\n"
"{\n"
"	const hiprt::SceneHeader* sceneHeader = reinterpret_cast<hiprt::SceneHeader*>( scene );\n"
"	const hiprt::Transform	  tr(\n"
"		   sceneHeader->m_frames,\n"
"		   sceneHeader->m_instances[instanceID].m_frameIndex,\n"
"		   sceneHeader->m_instances[instanceID].m_frameCount );\n"
"	hiprt::Frame frame = tr.interpolateFrames( time );\n"
"	return frame.transform( point );\n"
"}\n"
"\n"
"HIPRT_DEVICE float3 hiprtPointWorldToObject( const float3& point, hiprtScene scene, uint32_t instanceID, float time )\n"
"{\n"
"	const hiprt::SceneHeader* sceneHeader = reinterpret_cast<hiprt::SceneHeader*>( scene );\n"
"	const hiprt::Transform	  tr(\n"
"		   sceneHeader->m_frames,\n"
"		   sceneHeader->m_instances[instanceID].m_frameIndex,\n"
"		   sceneHeader->m_instances[instanceID].m_frameCount );\n"
"	hiprt::Frame frame = tr.interpolateFrames( time );\n"
"	return frame.invTransform( point );\n"
"}\n"
"\n"
"HIPRT_DEVICE float3 hiprtVectorObjectToWorld( const float3& vector, hiprtScene scene, uint32_t instanceID, float time )\n"
"{\n"
"	const hiprt::SceneHeader* sceneHeader = reinterpret_cast<hiprt::SceneHeader*>( scene );\n"
"	const hiprt::Transform	  tr(\n"
"		   sceneHeader->m_frames,\n"
"		   sceneHeader->m_instances[instanceID].m_frameIndex,\n"
"		   sceneHeader->m_instances[instanceID].m_frameCount );\n"
"	hiprt::Frame frame = tr.interpolateFrames( time );\n"
"	return frame.transformVector( vector );\n"
"}\n"
"\n"
"HIPRT_DEVICE float3 hiprtVectorWorldToObject( const float3& vector, hiprtScene scene, uint32_t instanceID, float time )\n"
"{\n"
"	const hiprt::SceneHeader* sceneHeader = reinterpret_cast<hiprt::SceneHeader*>( scene );\n"
"	const hiprt::Transform	  tr(\n"
"		   sceneHeader->m_frames,\n"
"		   sceneHeader->m_instances[instanceID].m_frameIndex,\n"
"		   sceneHeader->m_instances[instanceID].m_frameCount );\n"
"	hiprt::Frame frame = tr.interpolateFrames( time );\n"
"	return frame.invTransformVector( vector );\n"
"}\n"
"\n"
"HIPRT_DEVICE float3 hiprtPointObjectToWorld(\n"
"	const float3& point, hiprtScene scene, const uint32_t ( &instanceIDs )[hiprtMaxInstanceLevels], float time )\n"
"{\n"
"	hiprt::SceneHeader* sceneHeaders[hiprtMaxInstanceLevels];\n"
"	hiprt::SceneHeader* sceneHeader = reinterpret_cast<hiprt::SceneHeader*>( scene );\n"
"	float3				p			= point;\n"
"\n"
"	uint32_t depth;\n"
"#pragma unroll\n"
"	for ( depth = 1; depth < hiprtMaxInstanceLevels; ++depth )\n"
"	{\n"
"		sceneHeaders[depth - 1] = sceneHeader;\n"
"		const auto& instance	= sceneHeader->m_instances[instanceIDs[depth - 1]];\n"
"		if ( instance.m_type != hiprtInstanceTypeScene ) break;\n"
"		sceneHeader = instance.m_scene;\n"
"	}\n"
"\n"
"#pragma unroll\n"
"	for ( int32_t i = depth - 1; i >= 0; --i )\n"
"	{\n"
"		sceneHeader				  = sceneHeaders[i];\n"
"		const auto&		 instance = sceneHeader->m_instances[instanceIDs[i]];\n"
"		hiprt::Transform tr		  = hiprt::Transform( sceneHeader->m_frames, instance.m_frameIndex, instance.m_frameCount );\n"
"		hiprt::Frame	 frame	  = tr.interpolateFrames( time );\n"
"		p						  = frame.transform( p );\n"
"	}\n"
"\n"
"	return p;\n"
"}\n"
"\n"
"HIPRT_DEVICE float3 hiprtPointWorldToObject(\n"
"	const float3& point, hiprtScene scene, const uint32_t ( &instanceIDs )[hiprtMaxInstanceLevels], float time )\n"
"{\n"
"	hiprt::SceneHeader* sceneHeader = reinterpret_cast<hiprt::SceneHeader*>( scene );\n"
"	float3				p			= point;\n"
"\n"
"#pragma unroll\n"
"	for ( int32_t i = 0; i < hiprtMaxInstanceLevels; ++i )\n"
"	{\n"
"		const auto&		 instance = sceneHeader->m_instances[instanceIDs[i]];\n"
"		hiprt::Transform tr		  = hiprt::Transform( sceneHeader->m_frames, instance.m_frameIndex, instance.m_frameCount );\n"
"		hiprt::Frame	 frame	  = tr.interpolateFrames( time );\n"
"		p						  = frame.invTransform( p );\n"
"		if ( instance.m_type != hiprtInstanceTypeScene ) break;\n"
"		sceneHeader = instance.m_scene;\n"
"	}\n"
"\n"
"	return p;\n"
"}\n"
"\n"
"HIPRT_DEVICE float3 hiprtVectorObjectToWorld(\n"
"	const float3& vector, hiprtScene scene, const uint32_t ( &instanceIDs )[hiprtMaxInstanceLevels], float time )\n"
"{\n"
"	hiprt::SceneHeader* sceneHeaders[hiprtMaxInstanceLevels];\n"
"	hiprt::SceneHeader* sceneHeader = reinterpret_cast<hiprt::SceneHeader*>( scene );\n"
"	float3				v			= vector;\n"
"\n"
"	uint32_t depth;\n"
"#pragma unroll\n"
"	for ( depth = 1; depth < hiprtMaxInstanceLevels; ++depth )\n"
"	{\n"
"		sceneHeaders[depth - 1] = sceneHeader;\n"
"		const auto& instance	= sceneHeader->m_instances[instanceIDs[depth - 1]];\n"
"		if ( instance.m_type != hiprtInstanceTypeScene ) break;\n"
"		sceneHeader = instance.m_scene;\n"
"	}\n"
"\n"
"#pragma unroll\n"
"	for ( int32_t i = depth - 1; i >= 0; --i )\n"
"	{\n"
"		sceneHeader				  = sceneHeaders[i];\n"
"		const auto&		 instance = sceneHeader->m_instances[instanceIDs[i]];\n"
"		hiprt::Transform tr		  = hiprt::Transform( sceneHeader->m_frames, instance.m_frameIndex, instance.m_frameCount );\n"
"		hiprt::Frame	 frame	  = tr.interpolateFrames( time );\n"
"		v						  = frame.transformVector( v );\n"
"	}\n"
"\n"
"	return v;\n"
"}\n"
"\n"
"HIPRT_DEVICE float3 hiprtVectorWorldToObject(\n"
"	const float3& vector, hiprtScene scene, const uint32_t ( &instanceIDs )[hiprtMaxInstanceLevels], float time )\n"
"{\n"
"	hiprt::SceneHeader* sceneHeader = reinterpret_cast<hiprt::SceneHeader*>( scene );\n"
"	float3				v			= vector;\n"
"\n"
"#pragma unroll\n"
"	for ( int32_t i = 0; i < hiprtMaxInstanceLevels; ++i )\n"
"	{\n"
"		const auto&		 instance = sceneHeader->m_instances[instanceIDs[i]];\n"
"		hiprt::Transform tr		  = hiprt::Transform( sceneHeader->m_frames, instance.m_frameIndex, instance.m_frameCount );\n"
"		hiprt::Frame	 frame	  = tr.interpolateFrames( time );\n"
"		v						  = frame.invTransformVector( v );\n"
"		if ( instance.m_type != hiprtInstanceTypeScene ) break;\n"
"		sceneHeader = instance.m_scene;\n"
"	}\n"
"\n"
"	return v;\n"
"}\n"
"\n"
"template class hiprtPrivateStack_impl<uint32_t, hiprtPrivateStack::StackSize>;\n"
"template class hiprtGlobalStack_impl<uint32_t, false>;\n"
"template class hiprtGlobalStack_impl<uint32_t, true>;\n"
"\n"
"template class hiprtPrivateStack_impl<hiprtInstanceStackEntry, hiprtPrivateInstanceStack::StackSize>;\n"
"template class hiprtGlobalStack_impl<hiprtInstanceStackEntry, false>;\n"
"template class hiprtGlobalStack_impl<hiprtInstanceStackEntry, true>;\n"
"\n"
"template class hiprtGeomTraversal_impl<hiprtTriangleNode, hiprtTraversalTerminateAtClosestHit>;\n"
"template class hiprtGeomTraversal_impl<hiprtTriangleNode, hiprtTraversalTerminateAtAnyHit>;\n"
"template class hiprtGeomTraversal_impl<hiprtCustomNode, hiprtTraversalTerminateAtClosestHit>;\n"
"template class hiprtGeomTraversal_impl<hiprtCustomNode, hiprtTraversalTerminateAtAnyHit>;\n"
"\n"
"template class hiprtSceneTraversal_impl<hiprtTraversalTerminateAtClosestHit>;\n"
"template class hiprtSceneTraversal_impl<hiprtTraversalTerminateAtAnyHit>;\n"
"\n"
"template class hiprtGeomTraversalCustomStack_impl<hiprtPrivateStack, hiprtTriangleNode, hiprtTraversalTerminateAtClosestHit>;\n"
"template class hiprtGeomTraversalCustomStack_impl<hiprtPrivateStack, hiprtCustomNode, hiprtTraversalTerminateAtClosestHit>;\n"
"template class hiprtGeomTraversalCustomStack_impl<hiprtPrivateStack, hiprtTriangleNode, hiprtTraversalTerminateAtAnyHit>;\n"
"template class hiprtGeomTraversalCustomStack_impl<hiprtPrivateStack, hiprtCustomNode, hiprtTraversalTerminateAtAnyHit>;\n"
"\n"
"template class hiprtGeomTraversalCustomStack_impl<hiprtGlobalStack, hiprtTriangleNode, hiprtTraversalTerminateAtClosestHit>;\n"
"template class hiprtGeomTraversalCustomStack_impl<hiprtGlobalStack, hiprtCustomNode, hiprtTraversalTerminateAtClosestHit>;\n"
"template class hiprtGeomTraversalCustomStack_impl<hiprtGlobalStack, hiprtTriangleNode, hiprtTraversalTerminateAtAnyHit>;\n"
"template class hiprtGeomTraversalCustomStack_impl<hiprtGlobalStack, hiprtCustomNode, hiprtTraversalTerminateAtAnyHit>;\n"
"\n"
"template class hiprtGeomTraversalCustomStack_impl<hiprtDynamicStack, hiprtTriangleNode, hiprtTraversalTerminateAtClosestHit>;\n"
"template class hiprtGeomTraversalCustomStack_impl<hiprtDynamicStack, hiprtCustomNode, hiprtTraversalTerminateAtClosestHit>;\n"
"template class hiprtGeomTraversalCustomStack_impl<hiprtDynamicStack, hiprtTriangleNode, hiprtTraversalTerminateAtAnyHit>;\n"
"template class hiprtGeomTraversalCustomStack_impl<hiprtDynamicStack, hiprtCustomNode, hiprtTraversalTerminateAtAnyHit>;\n"
"\n"
"template class hiprtSceneTraversalCustomStack_impl<\n"
"	hiprtPrivateStack,\n"
"	hiprtEmptyInstanceStack,\n"
"	hiprtTraversalTerminateAtClosestHit>;\n"
"template class hiprtSceneTraversalCustomStack_impl<hiprtPrivateStack, hiprtEmptyInstanceStack, hiprtTraversalTerminateAtAnyHit>;\n"
"template class hiprtSceneTraversalCustomStack_impl<\n"
"	hiprtGlobalStack,\n"
"	hiprtEmptyInstanceStack,\n"
"	hiprtTraversalTerminateAtClosestHit>;\n"
"template class hiprtSceneTraversalCustomStack_impl<hiprtGlobalStack, hiprtEmptyInstanceStack, hiprtTraversalTerminateAtAnyHit>;\n"
"template class hiprtSceneTraversalCustomStack_impl<\n"
"	hiprtDynamicStack,\n"
"	hiprtEmptyInstanceStack,\n"
"	hiprtTraversalTerminateAtClosestHit>;\n"
"template class hiprtSceneTraversalCustomStack_impl<hiprtDynamicStack, hiprtEmptyInstanceStack, hiprtTraversalTerminateAtAnyHit>;\n"
"\n"
"template class hiprtSceneTraversalCustomStack_impl<\n"
"	hiprtPrivateStack,\n"
"	hiprtPrivateInstanceStack,\n"
"	hiprtTraversalTerminateAtClosestHit>;\n"
"template class hiprtSceneTraversalCustomStack_impl<\n"
"	hiprtPrivateStack,\n"
"	hiprtPrivateInstanceStack,\n"
"	hiprtTraversalTerminateAtAnyHit>;\n"
"template class hiprtSceneTraversalCustomStack_impl<\n"
"	hiprtGlobalStack,\n"
"	hiprtPrivateInstanceStack,\n"
"	hiprtTraversalTerminateAtClosestHit>;\n"
"template class hiprtSceneTraversalCustomStack_impl<\n"
"	hiprtGlobalStack,\n"
"	hiprtPrivateInstanceStack,\n"
"	hiprtTraversalTerminateAtAnyHit>;\n"
"template class hiprtSceneTraversalCustomStack_impl<\n"
"	hiprtDynamicStack,\n"
"	hiprtPrivateInstanceStack,\n"
"	hiprtTraversalTerminateAtClosestHit>;\n"
"template class hiprtSceneTraversalCustomStack_impl<\n"
"	hiprtDynamicStack,\n"
"	hiprtPrivateInstanceStack,\n"
"	hiprtTraversalTerminateAtAnyHit>;\n"
"\n"
"template class hiprtSceneTraversalCustomStack_impl<\n"
"	hiprtPrivateStack,\n"
"	hiprtGlobalInstanceStack,\n"
"	hiprtTraversalTerminateAtClosestHit>;\n"
"template class hiprtSceneTraversalCustomStack_impl<\n"
"	hiprtPrivateStack,\n"
"	hiprtGlobalInstanceStack,\n"
"	hiprtTraversalTerminateAtAnyHit>;\n"
"template class hiprtSceneTraversalCustomStack_impl<\n"
"	hiprtGlobalStack,\n"
"	hiprtGlobalInstanceStack,\n"
"	hiprtTraversalTerminateAtClosestHit>;\n"
"template class hiprtSceneTraversalCustomStack_impl<hiprtGlobalStack, hiprtGlobalInstanceStack, hiprtTraversalTerminateAtAnyHit>;\n"
"template class hiprtSceneTraversalCustomStack_impl<\n"
"	hiprtDynamicStack,\n"
"	hiprtGlobalInstanceStack,\n"
"	hiprtTraversalTerminateAtClosestHit>;\n"
"template class hiprtSceneTraversalCustomStack_impl<\n"
"	hiprtDynamicStack,\n"
"	hiprtGlobalInstanceStack,\n"
"	hiprtTraversalTerminateAtAnyHit>;\n"
"\n"
"template class hiprtSceneTraversalCustomStack_impl<\n"
"	hiprtPrivateStack,\n"
"	hiprtDynamicInstanceStack,\n"
"	hiprtTraversalTerminateAtClosestHit>;\n"
"template class hiprtSceneTraversalCustomStack_impl<\n"
"	hiprtPrivateStack,\n"
"	hiprtDynamicInstanceStack,\n"
"	hiprtTraversalTerminateAtAnyHit>;\n"
"template class hiprtSceneTraversalCustomStack_impl<\n"
"	hiprtGlobalStack,\n"
"	hiprtDynamicInstanceStack,\n"
"	hiprtTraversalTerminateAtClosestHit>;\n"
"template class hiprtSceneTraversalCustomStack_impl<\n"
"	hiprtGlobalStack,\n"
"	hiprtDynamicInstanceStack,\n"
"	hiprtTraversalTerminateAtAnyHit>;\n"
"template class hiprtSceneTraversalCustomStack_impl<\n"
"	hiprtDynamicStack,\n"
"	hiprtDynamicInstanceStack,\n"
"	hiprtTraversalTerminateAtClosestHit>;\n"
"template class hiprtSceneTraversalCustomStack_impl<\n"
"	hiprtDynamicStack,\n"
"	hiprtDynamicInstanceStack,\n"
"	hiprtTraversalTerminateAtAnyHit>;\n"
"\n"
"template class hiprtGeomTraversalClosestCustomStack<hiprtPrivateStack>;\n"
"template class hiprtGeomCustomTraversalClosestCustomStack<hiprtPrivateStack>;\n"
"template class hiprtGeomTraversalAnyHitCustomStack<hiprtPrivateStack>;\n"
"template class hiprtGeomCustomTraversalAnyHitCustomStack<hiprtPrivateStack>;\n"
"\n"
"template class hiprtGeomTraversalClosestCustomStack<hiprtGlobalStack>;\n"
"template class hiprtGeomCustomTraversalClosestCustomStack<hiprtGlobalStack>;\n"
"template class hiprtGeomTraversalAnyHitCustomStack<hiprtGlobalStack>;\n"
"template class hiprtGeomCustomTraversalAnyHitCustomStack<hiprtGlobalStack>;\n"
"\n"
"template class hiprtGeomTraversalClosestCustomStack<hiprtDynamicStack>;\n"
"template class hiprtGeomCustomTraversalClosestCustomStack<hiprtDynamicStack>;\n"
"template class hiprtGeomTraversalAnyHitCustomStack<hiprtDynamicStack>;\n"
"template class hiprtGeomCustomTraversalAnyHitCustomStack<hiprtDynamicStack>;\n"
"\n"
"template class hiprtSceneTraversalClosestCustomStack<hiprtPrivateStack, hiprtEmptyInstanceStack>;\n"
"template class hiprtSceneTraversalAnyHitCustomStack<hiprtPrivateStack, hiprtEmptyInstanceStack>;\n"
"template class hiprtSceneTraversalClosestCustomStack<hiprtGlobalStack, hiprtEmptyInstanceStack>;\n"
"template class hiprtSceneTraversalAnyHitCustomStack<hiprtGlobalStack, hiprtEmptyInstanceStack>;\n"
"template class hiprtSceneTraversalClosestCustomStack<hiprtDynamicStack, hiprtEmptyInstanceStack>;\n"
"template class hiprtSceneTraversalAnyHitCustomStack<hiprtDynamicStack, hiprtEmptyInstanceStack>;\n"
"\n"
"template class hiprtSceneTraversalClosestCustomStack<hiprtPrivateStack, hiprtPrivateInstanceStack>;\n"
"template class hiprtSceneTraversalAnyHitCustomStack<hiprtPrivateStack, hiprtPrivateInstanceStack>;\n"
"template class hiprtSceneTraversalClosestCustomStack<hiprtGlobalStack, hiprtPrivateInstanceStack>;\n"
"template class hiprtSceneTraversalAnyHitCustomStack<hiprtGlobalStack, hiprtPrivateInstanceStack>;\n"
"template class hiprtSceneTraversalClosestCustomStack<hiprtDynamicStack, hiprtPrivateInstanceStack>;\n"
"template class hiprtSceneTraversalAnyHitCustomStack<hiprtDynamicStack, hiprtPrivateInstanceStack>;\n"
"\n"
"template class hiprtSceneTraversalClosestCustomStack<hiprtPrivateStack, hiprtGlobalInstanceStack>;\n"
"template class hiprtSceneTraversalAnyHitCustomStack<hiprtPrivateStack, hiprtGlobalInstanceStack>;\n"
"template class hiprtSceneTraversalClosestCustomStack<hiprtGlobalStack, hiprtGlobalInstanceStack>;\n"
"template class hiprtSceneTraversalAnyHitCustomStack<hiprtGlobalStack, hiprtGlobalInstanceStack>;\n"
"template class hiprtSceneTraversalClosestCustomStack<hiprtDynamicStack, hiprtGlobalInstanceStack>;\n"
"template class hiprtSceneTraversalAnyHitCustomStack<hiprtDynamicStack, hiprtGlobalInstanceStack>;\n"
"\n"
"template class hiprtSceneTraversalClosestCustomStack<hiprtPrivateStack, hiprtDynamicInstanceStack>;\n"
"template class hiprtSceneTraversalAnyHitCustomStack<hiprtPrivateStack, hiprtDynamicInstanceStack>;\n"
"template class hiprtSceneTraversalClosestCustomStack<hiprtGlobalStack, hiprtDynamicInstanceStack>;\n"
"template class hiprtSceneTraversalAnyHitCustomStack<hiprtGlobalStack, hiprtDynamicInstanceStack>;\n"
"template class hiprtSceneTraversalClosestCustomStack<hiprtDynamicStack, hiprtDynamicInstanceStack>;\n"
"template class hiprtSceneTraversalAnyHitCustomStack<hiprtDynamicStack, hiprtDynamicInstanceStack>;\n"
;
// hip_hiprt_device: embedded text of the public device-side HIPRT header. The preamble defines
// HIPRT_PUBLIC_DEVICE_H, maps HIPRT_DEVICE to __device__ and includes the common/types/vec headers.
// NOTE(review): presumably handed to a runtime compiler by the host library - confirm against the consumer.
static const char* hip_hiprt_device= \
"\n"
"#pragma once\n"
"\n"
"#define HIPRT_PUBLIC_DEVICE_H\n"
"#define HIPRT_DEVICE __device__\n"
"\n"
"#include <hiprt/hiprt_common.h>\n"
"#include <hiprt/hiprt_types.h>\n"
"#include <hiprt/hiprt_vec.h>\n"
"\n"
"/** \\brief An empty dummy instance stack.\n"
"*\n"
"* Use this stack if you use one level of instancing.\n"
"*/\n"
"class hiprtEmptyInstanceStack\n"
"{\n"
"};\n"
"\n"
"/** \\brief A stack using (slow) local memory internally.\n"
"*\n"
"*/\n"
"template <typename StackEntry, uint32_t StackSize>\n"
"class hiprtPrivateStack_impl;\n"
"\n"
"class hiprtPrivateStack\n"
"{\n"
"public:\n"
"	static constexpr uint32_t StackSize = 64u;\n"
"	HIPRT_DEVICE			  hiprtPrivateStack();\n"
"	HIPRT_DEVICE ~hiprtPrivateStack();\n"
"	HIPRT_DEVICE uint32_t pop();\n"
"	HIPRT_DEVICE void	  push( uint32_t val );\n"
"	HIPRT_DEVICE bool	  empty() const;\n"
"	HIPRT_DEVICE uint32_t vacancy() const;\n"
"	HIPRT_DEVICE void	  reset();\n"
"\n"
"private:\n"
"	hiprtPimpl<hiprtPrivateStack_impl<uint32_t, StackSize>, SizePrivateStack, AlignmentPrivateStack> m_impl;\n"
"};\n"
"\n"
"/** \\brief A instance stack using (slow) local memory internally.\n"
"*\n"
"*/\n"
"class hiprtPrivateInstanceStack\n"
"{\n"
"public:\n"
"	static constexpr uint32_t StackSize = hiprtMaxInstanceLevels - 1;\n"
"	HIPRT_DEVICE			  hiprtPrivateInstanceStack();\n"
"	HIPRT_DEVICE ~hiprtPrivateInstanceStack();\n"
"	HIPRT_DEVICE hiprtInstanceStackEntry pop();\n"
"	HIPRT_DEVICE void					 push( hiprtInstanceStackEntry val );\n"
"	HIPRT_DEVICE bool					 empty() const;\n"
"	HIPRT_DEVICE uint32_t				 vacancy() const;\n"
"	HIPRT_DEVICE void					 reset();\n"
"\n"
"private:\n"
"	hiprtPimpl<\n"
"		hiprtPrivateStack_impl<hiprtInstanceStackEntry, StackSize>,\n"
"		SizePrivateInstanceStack,\n"
"		AlignmentPrivateInstanceStack>\n"
"		m_impl;\n"
"};\n"
"\n"
"/** \\brief A stack using both (fast) shared memory and (slow) global memory.\n"
"*\n"
"* The stack uses shared memory if there is enough space.\n"
"* Otherwise, it uses global memory as a backup.\n"
"*/\n"
"template <typename StackEntry, bool DynamicAssignment>\n"
"class hiprtGlobalStack_impl;\n"
"\n"
"class hiprtGlobalStack\n"
"{\n"
"public:\n"
"	HIPRT_DEVICE\n"
"	hiprtGlobalStack( hiprtGlobalStackBuffer globalStackBuffer, hiprtSharedStackBuffer sharedStackBuffer );\n"
"	HIPRT_DEVICE ~hiprtGlobalStack();\n"
"	HIPRT_DEVICE uint32_t pop();\n"
"	HIPRT_DEVICE void	  push( uint32_t val );\n"
"	HIPRT_DEVICE uint32_t vacancy() const;\n"
"	HIPRT_DEVICE bool	  empty() const;\n"
"	HIPRT_DEVICE void	  reset();\n"
"\n"
"private:\n"
"	hiprtPimpl<hiprtGlobalStack_impl<uint32_t, false>, SizeGlobalStack, AlignmentGlobalStack> m_impl;\n"
"};\n"
"\n"
"/** \\brief An instance stack using both (fast) shared memory and (slow) global memory.\n"
"*\n"
"* The stack uses shared memory if there is enough space.\n"
"* Otherwise, it uses global memory as a backup.\n"
"*/\n"
"class hiprtGlobalInstanceStack\n"
"{\n"
"public:\n"
"	HIPRT_DEVICE\n"
"	hiprtGlobalInstanceStack( hiprtGlobalStackBuffer globalStackBuffer, hiprtSharedStackBuffer sharedStackBuffer );\n"
"	HIPRT_DEVICE ~hiprtGlobalInstanceStack();\n"
"	HIPRT_DEVICE hiprtInstanceStackEntry pop();\n"
"	HIPRT_DEVICE void					 push( hiprtInstanceStackEntry val );\n"
"	HIPRT_DEVICE uint32_t				 vacancy() const;\n"
"	HIPRT_DEVICE bool					 empty() const;\n"
"	HIPRT_DEVICE void					 reset();\n"
"\n"
"private:\n"
"	hiprtPimpl<hiprtGlobalStack_impl<hiprtInstanceStackEntry, false>, SizeGlobalStack, AlignmentGlobalStack> m_impl;\n"
"};\n"
"\n"
"/** \\brief A stack using both (fast) shared memory and (slow) global memory with dynamic assignment.\n"
"*\n"
"* The stack uses shared memory if there is enough space.\n"
"* Otherwise, it uses global memory as a backup.\n"
"*/\n"
"class hiprtDynamicStack\n"
"{\n"
"public:\n"
"	HIPRT_DEVICE\n"
"	hiprtDynamicStack( hiprtGlobalStackBuffer globalStackBuffer, hiprtSharedStackBuffer sharedStackBuffer );\n"
"	HIPRT_DEVICE ~hiprtDynamicStack();\n"
"	HIPRT_DEVICE uint32_t pop();\n"
"	HIPRT_DEVICE void	  push( uint32_t val );\n"
"	HIPRT_DEVICE uint32_t vacancy() const;\n"
"	HIPRT_DEVICE bool	  empty() const;\n"
"	HIPRT_DEVICE void	  reset();\n"
"\n"
"private:\n"
"	hiprtPimpl<hiprtGlobalStack_impl<uint32_t, true>, SizeGlobalInstanceStack, AlignmentGlobalInstanceStack> m_impl;\n"
"};\n"
"\n"
"/** \\brief An instance stack using both (fast) shared memory and (slow) global memory with dynamic assignment.\n"
"*\n"
"* The stack uses shared memory if there is enough space.\n"
"* Otherwise, it uses global memory as a backup.\n"
"*/\n"
"class hiprtDynamicInstanceStack\n"
"{\n"
"public:\n"
"	HIPRT_DEVICE\n"
"	hiprtDynamicInstanceStack( hiprtGlobalStackBuffer globalStackBuffer, hiprtSharedStackBuffer sharedStackBuffer );\n"
"	HIPRT_DEVICE ~hiprtDynamicInstanceStack();\n"
"	HIPRT_DEVICE hiprtInstanceStackEntry pop();\n"
"	HIPRT_DEVICE void					 push( hiprtInstanceStackEntry val );\n"
"	HIPRT_DEVICE uint32_t				 vacancy() const;\n"
"	HIPRT_DEVICE bool					 empty() const;\n"
"	HIPRT_DEVICE void					 reset();\n"
"\n"
"private:\n"
"	hiprtPimpl<hiprtGlobalStack_impl<hiprtInstanceStackEntry, true>, SizeGlobalInstanceStack, AlignmentGlobalInstanceStack>\n"
"		m_impl;\n"
"};\n"
"\n"
"/** \\brief A traversal object for finding the closest hit with hiprtGeometry containing triangles.\n"
"*\n"
"* It uses a private stack with size 64 internally.\n"
"*/\n"
"template <hiprtPrimitiveNodeType PrimitiveNodeType, hiprtTraversalType TraversalType>\n"
"class hiprtGeomTraversal_impl;\n"
"\n"
"class hiprtGeomTraversalClosest\n"
"{\n"
"public:\n"
"	HIPRT_DEVICE hiprtGeomTraversalClosest(\n"
"		hiprtGeometry	   geom,\n"
"		const hiprtRay&	   ray,\n"
"		hiprtTraversalHint hint		 = hiprtTraversalHintDefault,\n"
"		void*			   payload	 = nullptr,\n"
"		hiprtFuncTable	   funcTable = nullptr,\n"
"		uint32_t		   rayType	 = 0 );\n"
"	HIPRT_DEVICE hiprtHit			 getNextHit();\n"
"	HIPRT_DEVICE hiprtTraversalState getCurrentState();\n"
"\n"
"private:\n"
"	hiprtPimpl<\n"
"		hiprtGeomTraversal_impl<hiprtTriangleNode, hiprtTraversalTerminateAtClosestHit>,\n"
"		SizeGeomTraversalPrivateStack,\n"
"		AlignmentGeomTraversalPrivateStack>\n"
"		m_impl;\n"
"};\n"
"\n"
"/** \\brief A traversal object for finding the any hit with hiprtGeometry containing triangles.\n"
"*\n"
"* It uses a private stack with size 64 internally.\n"
"*/\n"
"class hiprtGeomTraversalAnyHit\n"
"{\n"
"public:\n"
"	HIPRT_DEVICE hiprtGeomTraversalAnyHit(\n"
"		hiprtGeometry	   geom,\n"
"		const hiprtRay&	   ray,\n"
"		hiprtTraversalHint hint		 = hiprtTraversalHintDefault,\n"
"		void*			   payload	 = nullptr,\n"
"		hiprtFuncTable	   funcTable = nullptr,\n"
"		uint32_t		   rayType	 = 0 );\n"
"	HIPRT_DEVICE hiprtHit			 getNextHit();\n"
"	HIPRT_DEVICE hiprtTraversalState getCurrentState();\n"
"\n"
"private:\n"
"	hiprtPimpl<\n"
"		hiprtGeomTraversal_impl<hiprtTriangleNode, hiprtTraversalTerminateAtAnyHit>,\n"
"		SizeGeomTraversalPrivateStack,\n"
"		AlignmentGeomTraversalPrivateStack>\n"
"		m_impl;\n"
"};\n"
"\n"
"/** \\brief A traversal object for finding the closest hit with hiprtGeometry containing custom primitives.\n"
"*\n"
"* It uses a private stack with size 64 internally.\n"
"*/\n"
"class hiprtGeomCustomTraversalClosest\n"
"{\n"
"public:\n"
"	HIPRT_DEVICE hiprtGeomCustomTraversalClosest(\n"
"		hiprtGeometry	   geom,\n"
"		const hiprtRay&	   ray,\n"
"		hiprtTraversalHint hint		 = hiprtTraversalHintDefault,\n"
"		void*			   payload	 = nullptr,\n"
"		hiprtFuncTable	   funcTable = nullptr,\n"
"		uint32_t		   rayType	 = 0 );\n"
"	HIPRT_DEVICE hiprtHit			 getNextHit();\n"
"	HIPRT_DEVICE hiprtTraversalState getCurrentState();\n"
"\n"
"private:\n"
"	hiprtPimpl<\n"
"		hiprtGeomTraversal_impl<hiprtCustomNode, hiprtTraversalTerminateAtClosestHit>,\n"
"		SizeGeomTraversalPrivateStack,\n"
"		AlignmentGeomTraversalPrivateStack>\n"
"		m_impl;\n"
"};\n"
"\n"
"/** \\brief A traversal object for finding the any hit with hiprtGeometry containing custom primitives.\n"
"*\n"
"* It uses a private stack with size 64 internally.\n"
"*/\n"
"class hiprtGeomCustomTraversalAnyHit\n"
"{\n"
"public:\n"
"	HIPRT_DEVICE hiprtGeomCustomTraversalAnyHit(\n"
"		hiprtGeometry	   geom,\n"
"		const hiprtRay&	   ray,\n"
"		hiprtTraversalHint hint		 = hiprtTraversalHintDefault,\n"
"		void*			   payload	 = nullptr,\n"
"		hiprtFuncTable	   funcTable = nullptr,\n"
"		uint32_t		   rayType	 = 0 );\n"
"	HIPRT_DEVICE hiprtHit			 getNextHit();\n"
"	HIPRT_DEVICE hiprtTraversalState getCurrentState();\n"
"\n"
"private:\n"
"	hiprtPimpl<\n"
"		hiprtGeomTraversal_impl<hiprtCustomNode, hiprtTraversalTerminateAtAnyHit>,\n"
"		SizeGeomTraversalPrivateStack,\n"
"		AlignmentGeomTraversalPrivateStack>\n"
"		m_impl;\n"
"};\n"
"\n"
"/** \\brief A traversal object for finding the closest hit with hiprtScene.\n"
"*\n"
"* It uses a private stack with size 64 internally.\n"
"*/\n"
"template <hiprtTraversalType TraversalType>\n"
"class hiprtSceneTraversal_impl;\n"
"\n"
"class hiprtSceneTraversalClosest\n"
"{\n"
"public:\n"
"	HIPRT_DEVICE hiprtSceneTraversalClosest(\n"
"		hiprtScene		   scene,\n"
"		const hiprtRay&	   ray,\n"
"		hiprtRayMask	   mask		 = hiprtFullRayMask,\n"
"		hiprtTraversalHint hint		 = hiprtTraversalHintDefault,\n"
"		void*			   payload	 = nullptr,\n"
"		hiprtFuncTable	   funcTable = nullptr,\n"
"		uint32_t		   rayType	 = 0,\n"
"		float			   time		 = 0.0f );\n"
"	HIPRT_DEVICE hiprtHit			 getNextHit();\n"
"	HIPRT_DEVICE hiprtTraversalState getCurrentState();\n"
"\n"
"private:\n"
"	hiprtPimpl<\n"
"		hiprtSceneTraversal_impl<hiprtTraversalTerminateAtClosestHit>,\n"
"		SizeSceneTraversalPrivateStack,\n"
"		AlignmentSceneTraversalPrivateStack>\n"
"		m_impl;\n"
"};\n"
"\n"
"/** \\brief A traversal object for finding the any hit with hiprtScene.\n"
"*\n"
"* It uses a private stack with size 64 internally.\n"
"*/\n"
"class hiprtSceneTraversalAnyHit\n"
"{\n"
"public:\n"
"	HIPRT_DEVICE hiprtSceneTraversalAnyHit(\n"
"		hiprtScene		   scene,\n"
"		const hiprtRay&	   ray,\n"
"		hiprtRayMask	   mask		 = hiprtFullRayMask,\n"
"		hiprtTraversalHint hint		 = hiprtTraversalHintDefault,\n"
"		void*			   payload	 = nullptr,\n"
"		hiprtFuncTable	   funcTable = nullptr,\n"
"		uint32_t		   rayType	 = 0,\n"
"		float			   time		 = 0.0f );\n"
"	HIPRT_DEVICE hiprtHit			 getNextHit();\n"
"	HIPRT_DEVICE hiprtTraversalState getCurrentState();\n"
"\n"
"private:\n"
"	hiprtPimpl<\n"
"		hiprtSceneTraversal_impl<hiprtTraversalTerminateAtAnyHit>,\n"
"		SizeSceneTraversalPrivateStack,\n"
"		AlignmentSceneTraversalPrivateStack>\n"
"		m_impl;\n"
"};\n"
"\n"
"/** \\brief A traversal object for finding the closest hit with hiprtGeometry containing triangles.\n"
"*\n"
"* \\tparam hiprtStack A custom stack.\n"
"*/\n"
"template <typename hiprtStack, hiprtPrimitiveNodeType PrimitiveNodeType, hiprtTraversalType TraversalType>\n"
"class hiprtGeomTraversalCustomStack_impl;\n"
"\n"
"template <typename hiprtStack>\n"
"class hiprtGeomTraversalClosestCustomStack\n"
"{\n"
"public:\n"
"	HIPRT_DEVICE hiprtGeomTraversalClosestCustomStack(\n"
"		hiprtGeometry	   geom,\n"
"		const hiprtRay&	   ray,\n"
"		hiprtStack&		   stack,\n"
"		hiprtTraversalHint hint		 = hiprtTraversalHintDefault,\n"
"		void*			   payload	 = nullptr,\n"
"		hiprtFuncTable	   funcTable = nullptr,\n"
"		uint32_t		   rayType	 = 0 );\n"
"	HIPRT_DEVICE hiprtHit			 getNextHit();\n"
"	HIPRT_DEVICE hiprtTraversalState getCurrentState();\n"
"\n"
"private:\n"
"	hiprtPimpl<\n"
"		hiprtGeomTraversalCustomStack_impl<hiprtStack, hiprtTriangleNode, hiprtTraversalTerminateAtClosestHit>,\n"
"		SizeGeomTraversalCustomStack,\n"
"		AlignmentGeomTraversalCustomStack>\n"
"		m_impl;\n"
"};\n"
"\n"
"/** \\brief A traversal object for finding the any hit with hiprtGeometry containing triangles.\n"
"*\n"
"* \\tparam hiprtStack A custom stack.\n"
"*/\n"
"template <typename hiprtStack>\n"
"class hiprtGeomTraversalAnyHitCustomStack\n"
"{\n"
"public:\n"
"	HIPRT_DEVICE hiprtGeomTraversalAnyHitCustomStack(\n"
"		hiprtGeometry	   geom,\n"
"		const hiprtRay&	   ray,\n"
"		hiprtStack&		   stack,\n"
"		hiprtTraversalHint hint		 = hiprtTraversalHintDefault,\n"
"		void*			   payload	 = nullptr,\n"
"		hiprtFuncTable	   funcTable = nullptr,\n"
"		uint32_t		   rayType	 = 0 );\n"
"	HIPRT_DEVICE hiprtHit			 getNextHit();\n"
"	HIPRT_DEVICE hiprtTraversalState getCurrentState();\n"
"\n"
"private:\n"
"	hiprtPimpl<\n"
"		hiprtGeomTraversalCustomStack_impl<hiprtStack, hiprtTriangleNode, hiprtTraversalTerminateAtAnyHit>,\n"
"		SizeGeomTraversalCustomStack,\n"
"		AlignmentGeomTraversalCustomStack>\n"
"		m_impl;\n"
"};\n"
"\n"
"/** \\brief A traversal object for finding the closest hit with hiprtGeometry containing custom primitives.\n"
"*\n"
"* \\tparam hiprtStack A custom stack.\n"
"*/\n"
"template <typename hiprtStack>\n"
"class hiprtGeomCustomTraversalClosestCustomStack\n"
"{\n"
"public:\n"
"	HIPRT_DEVICE hiprtGeomCustomTraversalClosestCustomStack(\n"
"		hiprtGeometry	   geom,\n"
"		const hiprtRay&	   ray,\n"
"		hiprtStack&		   stack,\n"
"		hiprtTraversalHint hint		 = hiprtTraversalHintDefault,\n"
"		void*			   payload	 = nullptr,\n"
"		hiprtFuncTable	   funcTable = nullptr,\n"
"		uint32_t		   rayType	 = 0 );\n"
"	HIPRT_DEVICE hiprtHit			 getNextHit();\n"
"	HIPRT_DEVICE hiprtTraversalState getCurrentState();\n"
"\n"
"private:\n"
"	hiprtPimpl<\n"
"		hiprtGeomTraversalCustomStack_impl<hiprtStack, hiprtCustomNode, hiprtTraversalTerminateAtClosestHit>,\n"
"		SizeGeomTraversalCustomStack,\n"
"		AlignmentGeomTraversalCustomStack>\n"
"		m_impl;\n"
"};\n"
"\n"
"/** \\brief A traversal object for finding the any hit with hiprtGeometry containing custom primitives.\n"
"*\n"
"* \\tparam hiprtStack A custom stack.\n"
"*/\n"
"template <typename hiprtStack>\n"
"class hiprtGeomCustomTraversalAnyHitCustomStack\n"
"{\n"
"public:\n"
"	HIPRT_DEVICE hiprtGeomCustomTraversalAnyHitCustomStack(\n"
"		hiprtGeometry	   geom,\n"
"		const hiprtRay&	   ray,\n"
"		hiprtStack&		   stack,\n"
"		hiprtTraversalHint hint		 = hiprtTraversalHintDefault,\n"
"		void*			   payload	 = nullptr,\n"
"		hiprtFuncTable	   funcTable = nullptr,\n"
"		uint32_t		   rayType	 = 0 );\n"
"	HIPRT_DEVICE hiprtHit			 getNextHit();\n"
"	HIPRT_DEVICE hiprtTraversalState getCurrentState();\n"
"\n"
"private:\n"
"	hiprtPimpl<\n"
"		hiprtGeomTraversalCustomStack_impl<hiprtStack, hiprtCustomNode, hiprtTraversalTerminateAtAnyHit>,\n"
"		SizeGeomTraversalCustomStack,\n"
"		AlignmentGeomTraversalCustomStack>\n"
"		m_impl;\n"
"};\n"
"\n"
"/** \\brief A traversal object for finding the closest hit with hiprtScene.\n"
"*\n"
"* \\tparam hiprtStack A custom stack.\n"
"*/\n"
"template <typename hiprtStack, typename hiprtInstanceStack, hiprtTraversalType TraversalType>\n"
"class hiprtSceneTraversalCustomStack_impl;\n"
"\n"
"template <typename hiprtStack, typename hiprtInstanceStack>\n"
"class hiprtSceneTraversalClosestCustomStack\n"
"{\n"
"public:\n"
"	HIPRT_DEVICE hiprtSceneTraversalClosestCustomStack(\n"
"		hiprtScene			scene,\n"
"		const hiprtRay&		ray,\n"
"		hiprtStack&			stack,\n"
"		hiprtInstanceStack& instanceStack,\n"
"		hiprtRayMask		mask	  = hiprtFullRayMask,\n"
"		hiprtTraversalHint	hint	  = hiprtTraversalHintDefault,\n"
"		void*				payload	  = nullptr,\n"
"		hiprtFuncTable		funcTable = nullptr,\n"
"		uint32_t			rayType	  = 0,\n"
"		float				time	  = 0.0f );\n"
"	HIPRT_DEVICE hiprtHit			 getNextHit();\n"
"	HIPRT_DEVICE hiprtTraversalState getCurrentState();\n"
"\n"
"private:\n"
"	hiprtPimpl<\n"
"		hiprtSceneTraversalCustomStack_impl<hiprtStack, hiprtInstanceStack, hiprtTraversalTerminateAtClosestHit>,\n"
"		SizeSceneTraversalCustomStack,\n"
"		AlignmentSceneTraversalCustomStack>\n"
"		m_impl;\n"
"};\n"
"\n"
"/** \\brief A traversal object for finding the any hit with hiprtScene.\n"
"*\n"
"* \\tparam hiprtStack A custom stack.\n"
"*/\n"
"template <typename hiprtStack, typename hiprtInstanceStack>\n"
"class hiprtSceneTraversalAnyHitCustomStack\n"
"{\n"
"public:\n"
"	HIPRT_DEVICE hiprtSceneTraversalAnyHitCustomStack(\n"
"		hiprtScene			scene,\n"
"		const hiprtRay&		ray,\n"
"		hiprtStack&			stack,\n"
"		hiprtInstanceStack& instanceStack,\n"
"		hiprtRayMask		mask	  = hiprtFullRayMask,\n"
"		hiprtTraversalHint	hint	  = hiprtTraversalHintDefault,\n"
"		void*				payload	  = nullptr,\n"
"		hiprtFuncTable		funcTable = nullptr,\n"
"		uint32_t			rayType	  = 0,\n"
"		float				time	  = 0.0f );\n"
"	HIPRT_DEVICE hiprtHit			 getNextHit();\n"
"	HIPRT_DEVICE hiprtTraversalState getCurrentState();\n"
"\n"
"private:\n"
"	hiprtPimpl<\n"
"		hiprtSceneTraversalCustomStack_impl<hiprtStack, hiprtInstanceStack, hiprtTraversalTerminateAtAnyHit>,\n"
"		SizeSceneTraversalCustomStack,\n"
"		AlignmentSceneTraversalCustomStack>\n"
"		m_impl;\n"
"};\n"
"\n"
"/** \\brief Transforms a point from the object space to the world space.\n"
"*\n"
"* \\param point A point in the object space.\n"
"* \\param scene A scene.\n"
"* \\param instanceID Instance ID.\n"
"* \\param time The time.\n"
"*/\n"
"HIPRT_DEVICE float3 hiprtPointObjectToWorld( const float3& point, hiprtScene scene, uint32_t instanceID, float time = 0.0f );\n"
"\n"
"/** \\brief Transforms a point from the world space to the object space.\n"
"*\n"
"* \\param point A point in the world space.\n"
"* \\param scene A scene.\n"
"* \\param instanceID Instance ID.\n"
"* \\param time The time.\n"
"*/\n"
"HIPRT_DEVICE float3 hiprtPointWorldToObject( const float3& point, hiprtScene scene, uint32_t instanceID, float time = 0.0f );\n"
"\n"
"/** \\brief Transforms a vector from the object space to the world space.\n"
"*\n"
"* \\param vector A vector in object space.\n"
"* \\param scene A scene.\n"
"* \\param instanceID Instance ID.\n"
"* \\param time The time.\n"
"*/\n"
"HIPRT_DEVICE float3 hiprtVectorObjectToWorld( const float3& vector, hiprtScene scene, uint32_t instanceID, float time = 0.0f );\n"
"\n"
"/** \\brief Transforms a vector from the world space to the object space.\n"
"*\n"
"* \\param vector A vector in the world space.\n"
"* \\param scene A scene.\n"
"* \\param instanceID Instance ID.\n"
"* \\param time The time.\n"
"*/\n"
"HIPRT_DEVICE float3 hiprtVectorWorldToObject( const float3& vector, hiprtScene scene, uint32_t instanceID, float time = 0.0f );\n"
"\n"
"/** \\brief Transforms a point from the object space to the world space.\n"
"*\n"
"* \\param point A point in the object space.\n"
"* \\param scene A scene.\n"
"* \\param instanceIDs Instance IDs (multi-level instancing).\n"
"* \\param time The time.\n"
"*/\n"
"HIPRT_DEVICE float3 hiprtPointObjectToWorld(\n"
"	const float3& point, hiprtScene scene, const uint32_t ( &instanceIDs )[hiprtMaxInstanceLevels], float time = 0.0f );\n"
"\n"
"/** \\brief Transforms a point from the world space to the object space.\n"
"*\n"
"* \\param point A point in the world space.\n"
"* \\param scene A scene.\n"
"* \\param instanceIDs Instance IDs (multi-level instancing).\n"
"* \\param time The time.\n"
"*/\n"
"HIPRT_DEVICE float3 hiprtPointWorldToObject(\n"
"	const float3& point, hiprtScene scene, const uint32_t ( &instanceIDs )[hiprtMaxInstanceLevels], float time = 0.0f );\n"
"\n"
"/** \\brief Transforms a vector from the object space to the world space.\n"
"*\n"
"* \\param vector A vector in object space.\n"
"* \\param scene A scene.\n"
"* \\param instanceIDs Instance IDs (multi-level instancing).\n"
"* \\param time The time.\n"
"*/\n"
"HIPRT_DEVICE float3 hiprtVectorObjectToWorld(\n"
"	const float3& vector, hiprtScene scene, const uint32_t ( &instanceIDs )[hiprtMaxInstanceLevels], float time = 0.0f );\n"
"\n"
"/** \\brief Transforms a vector from the world space to the object space.\n"
"*\n"
"* \\param vector A vector in the world space.\n"
"* \\param scene A scene.\n"
"* \\param instanceIDs Instance IDs (multi-level instancing).\n"
"* \\param time The time.\n"
"*/\n"
"HIPRT_DEVICE float3 hiprtVectorWorldToObject(\n"
"	const float3& vector, hiprtScene scene, const uint32_t ( &instanceIDs )[hiprtMaxInstanceLevels], float time = 0.0f );\n"
;
static const char* hip_BvhBuilderKernels= \
"\n"
"#pragma once\n"
"#include <hiprt/hiprt_common.h>\n"
"#include <hiprt/hiprt_vec.h>\n"
"#include <hiprt/hiprt_math.h>\n"
"#include <hiprt/impl/Obb.h>\n"
"#include <hiprt/impl/Aabb.h>\n"
"#include <hiprt/impl/AabbList.h>\n"
"#include <hiprt/impl/Triangle.h>\n"
"#include <hiprt/impl/BvhNode.h>\n"
"#include <hiprt/impl/BvhBuilderUtil.h>\n"
"#include <hiprt/impl/Header.h>\n"
"#include <hiprt/impl/Instance.h>\n"
"#include <hiprt/impl/InstanceList.h>\n"
"#include <hiprt/impl/MortonCode.h>\n"
"#include <hiprt/impl/QrDecomposition.h>\n"
"#include <hiprt/impl/Quaternion.h>\n"
"#include <hiprt/impl/Transform.h>\n"
"#include <hiprt/impl/TriangleMesh.h>\n"
"#include <hiprt/impl/BvhConfig.h>\n"
"using namespace hiprt;\n"
"\n"
// Embedded kernel source: reads a uint3 from *addr and returns it by value.
// With HIPRT_RTIP >= 31 the x, y and z members are each read by a separate
// __hip_atomic_load (relaxed order, agent scope), so the three reads are
// individually atomic but not atomic as a group; otherwise a plain struct copy is used.
"HIPRT_DEVICE HIPRT_INLINE uint3 atomic_load( const uint3* addr )\n"
"{\n"
"	uint3 value;\n"
"#if HIPRT_RTIP >= 31\n"
"	value.x = __hip_atomic_load( &addr->x, __ATOMIC_RELAXED, __HIP_MEMORY_SCOPE_AGENT );\n"
"	value.y = __hip_atomic_load( &addr->y, __ATOMIC_RELAXED, __HIP_MEMORY_SCOPE_AGENT );\n"
"	value.z = __hip_atomic_load( &addr->z, __ATOMIC_RELAXED, __HIP_MEMORY_SCOPE_AGENT );\n"
"#else\n"
"	value = *addr;\n"
"#endif\n"
"	return value;\n"
"}\n"
"\n"
// Embedded kernel source: writes `value` to *addr (counterpart of atomic_load).
// With HIPRT_RTIP >= 31 the x, y and z members are each written by a separate
// __hip_atomic_store (relaxed order, agent scope); otherwise a plain struct
// assignment is used.
"HIPRT_DEVICE HIPRT_INLINE void atomic_store( uint3* addr, const uint3& value )\n"
"{\n"
"#if HIPRT_RTIP >= 31\n"
"	__hip_atomic_store( &addr->x, value.x, __ATOMIC_RELAXED, __HIP_MEMORY_SCOPE_AGENT );\n"
"	__hip_atomic_store( &addr->y, value.y, __ATOMIC_RELAXED, __HIP_MEMORY_SCOPE_AGENT );\n"
"	__hip_atomic_store( &addr->z, value.z, __ATOMIC_RELAXED, __HIP_MEMORY_SCOPE_AGENT );\n"
"#else\n"
"	*addr = value;\n"
"#endif\n"
"}\n"
"\n"
// Embedded kernel source: returns the AABB of the node encoded in `nodeIndex`.
// The encoded index is split with getNodeType()/getNodeAddr(); a BoxType index
// is looked up in `binaryNodes`, any other type is looked up in `references`.
"template <typename BinaryNode>\n"
"HIPRT_DEVICE HIPRT_INLINE Aabb\n"
"getNodeBox( const uint32_t nodeIndex, const BinaryNode* binaryNodes, const ReferenceNode* references )\n"
"{\n"
"	const uint32_t nodeType = getNodeType( nodeIndex );\n"
"	const uint32_t nodeAddr = getNodeAddr( nodeIndex );\n"
"	if ( nodeType == BoxType )\n"
"		return binaryNodes[nodeAddr].aabb();\n"
"	else\n"
"		return references[nodeAddr].aabb();\n"
"}\n"
"\n"
// Embedded kernel source: returns the AABB of the node encoded in `nodeIndex`
// for the final (wide) BVH layout.
//  - BoxType index: box stored in boxNodes[nodeAddr].
//  - any other type (a leaf), selected at compile time by PrimitiveNode:
//      TrianglePairNode   -> primNodes[nodeAddr].aabb()
//      TrianglePacketNode -> aabb of the pair selected by typeToTriPairIndex( nodeType )
//      otherwise          -> primitives.fetchAabb() of the leaf's m_primIndex
"template <typename PrimitiveContainer, typename PrimitiveNode>\n"
"HIPRT_DEVICE HIPRT_INLINE Aabb\n"
"getNodeBox( const uint32_t nodeIndex, PrimitiveContainer& primitives, BoxNode* boxNodes, PrimitiveNode* primNodes )\n"
"{\n"
"	const uint32_t nodeAddr = getNodeAddr( nodeIndex );\n"
"	const uint32_t nodeType = getNodeType( nodeIndex );\n"
"	if ( nodeType != BoxType )\n"
"	{\n"
"		if constexpr ( is_same<PrimitiveNode, TrianglePairNode>::value )\n"
"			return primNodes[nodeAddr].aabb();\n"
"		else if constexpr ( is_same<PrimitiveNode, TrianglePacketNode>::value )\n"
"			return primNodes[nodeAddr].aabb( typeToTriPairIndex( nodeType ) );\n"
"		else\n"
"			return primitives.fetchAabb( primNodes[nodeAddr].m_primIndex );\n"
"	}\n"
"	else\n"
"	{\n"
"		return boxNodes[nodeAddr].aabb();\n"
"	}\n"
"}\n"
"\n"
// Embedded kernel source: returns the oriented box of the node encoded in
// `nodeIndex` for the orientation selected by `matrixIndex`.
//  - BoxType index: taken from the per-node Kdop, kdops[nodeAddr].obb( matrixIndex ).
//  - leaf, selected at compile time by PrimitiveNode:
//      TrianglePacketNode -> obb of the pair selected by typeToTriPairIndex( nodeType );
//                            if the resulting box is not valid() it falls back to
//                            an Obb( matrixIndex ) grown by `nodeBox`
//      InstanceNode       -> primitives.fetchObb() of the leaf's m_primIndex
//      otherwise          -> an Obb( matrixIndex ) grown by `nodeBox`
"template <typename PrimitiveContainer, typename PrimitiveNode>\n"
"HIPRT_DEVICE HIPRT_INLINE Obb getNodeObb(\n"
"	const uint32_t		matrixIndex,\n"
"	const uint32_t		nodeIndex,\n"
"	const Aabb&			nodeBox,\n"
"	PrimitiveContainer& primitives,\n"
"	PrimitiveNode*		primNodes,\n"
"	Kdop*				kdops )\n"
"{\n"
"	const uint32_t nodeAddr = getNodeAddr( nodeIndex );\n"
"	const uint32_t nodeType = getNodeType( nodeIndex );\n"
"	if ( nodeType != BoxType )\n"
"	{\n"
"		if constexpr ( is_same<PrimitiveNode, TrianglePacketNode>::value )\n"
"		{\n"
"			Obb obb = primNodes[nodeAddr].obb( typeToTriPairIndex( nodeType ), matrixIndex, nodeBox );\n"
"			if ( !obb.aabb().valid() ) obb = Obb( matrixIndex ).grow( nodeBox );\n"
"			return obb;\n"
"		}\n"
"		else if constexpr ( is_same<PrimitiveNode, InstanceNode>::value )\n"
"		{\n"
"			return primitives.fetchObb( primNodes[nodeAddr].m_primIndex, matrixIndex, nodeBox );\n"
"		}\n"
"		else\n"
"		{\n"
"			return Obb( matrixIndex ).grow( nodeBox );\n"
"		}\n"
"	}\n"
"	else\n"
"	{\n"
"		return kdops[nodeAddr].obb( matrixIndex );\n"
"	}\n"
"}\n"
"\n"
// Embedded kernel source: widens a binary node into up to BranchingFactor
// children. Each group of BranchingFactor consecutive lanes (a "subwarp")
// cooperates on one node; lane `sublaneIndex` owns child slot `sublaneIndex`.
//
// In/out (per lane): childCount - number of filled child slots of the node,
//                    childIndex / childBox - the child held in this lane's slot.
//
// Every iteration the subwarp picks the non-leaf child (per IsLeafNode) with
// the largest box area, replaces it by its first binary child and appends its
// second binary child in slot `childCount`. A subwarp stops when it holds
// BranchingFactor children or no child can be opened (all areas negative).
"template <auto IsLeafNode, typename BinaryNode>\n"
"HIPRT_DEVICE HIPRT_INLINE void openNodes(\n"
"	const BinaryNode* binaryNodes, const ReferenceNode* references, uint32_t& childCount, uint32_t& childIndex, Aabb& childBox )\n"
"{\n"
"	const uint32_t laneIndex	= threadIdx.x % WarpSize;\n"
"	const uint32_t sublaneIndex = laneIndex % BranchingFactor;\n"
"	const uint32_t subwarpIndex = laneIndex / BranchingFactor;\n"
// Fix: the left operand is widened to 64 bits *before* shifting. The result
// type of << is the (promoted) type of the left operand, so the former
// `int << uint64_t` was a 32-bit shift: undefined for shift counts >= 32 and
// unable to produce a mask for lanes 32..63 of a 64-wide wavefront.
"	const uint64_t subwarpMask	= static_cast<uint64_t>( ( 1 << BranchingFactor ) - 1 )\n"
"								 << ( BranchingFactor * subwarpIndex );\n"
"\n"
"	bool done = childCount == BranchingFactor;\n"
"	while ( hiprt::ballot( !done ) )\n"
"	{\n"
"		sync_warp();\n"
"\n"
// Only filled slots holding a non-leaf child compete; everything else keeps -FltMax.
"		float area = -FltMax;\n"
"		if ( !done )\n"
"		{\n"
"			if ( sublaneIndex < childCount )\n"
"			{\n"
"				if ( !IsLeafNode( childIndex ) ) area = childBox.area();\n"
"			}\n"
"		}\n"
"\n"
// Max-reduction of the area over the lanes of this subwarp (xor strides < BranchingFactor).
"		float maxArea = area;\n"
"#pragma unroll\n"
"		for ( uint32_t i = 1; i < BranchingFactor; i <<= 1 )\n"
"			maxArea = hiprt::max( maxArea, shfl_xor( maxArea, i ) );\n"
"		if ( maxArea < 0.0f ) done = true;\n"
"\n"
// First lane of this subwarp whose area equals the maximum owns the child to open.
"		const uint32_t maxLaneIndex =\n"
"			__ffsll( static_cast<unsigned long long>( hiprt::ballot( maxArea == area ) ) & subwarpMask ) - 1;\n"
"		const uint32_t maxIndex		 = maxLaneIndex % BranchingFactor;\n"
"		const uint32_t maxChildIndex = shfl( childIndex, maxLaneIndex );\n"
"\n"
"		if ( !done )\n"
"		{\n"
"			BinaryNode binaryChild = binaryNodes[getNodeAddr( maxChildIndex )];\n"
"\n"
// The opened slot receives the first binary child ...
"			if ( sublaneIndex == maxIndex )\n"
"			{\n"
"				childIndex = binaryChild[0];\n"
"				childBox   = getNodeBox( binaryChild[0], binaryNodes, references );\n"
"			}\n"
"\n"
// ... and the first free slot receives the second one.
"			if ( sublaneIndex == childCount )\n"
"			{\n"
"				childIndex = binaryChild[1];\n"
"				childBox   = getNodeBox( binaryChild[1], binaryNodes, references );\n"
"			}\n"
"\n"
"			childCount++;\n"
"\n"
"			if ( childCount == BranchingFactor ) done = true;\n"
"		}\n"
"	}\n"
"}\n"
"\n"
// Embedded kernel source: initialises a GeomHeader. Only the thread with
// index == 0 writes; all other threads do nothing.
// The header records the buffer size, the node pointers, the geometry type and
// Rtip. The box node count starts at 1; the reference and primitive node counts
// start at 1 only when primCount == 1, otherwise at 0.
"__device__ void InitGeomDataImpl(\n"
"	uint32_t	index,\n"
"	uint32_t	primCount,\n"
"	size_t		size,\n"
"	BoxNode*	boxNodes,\n"
"	void*		primNodes,\n"
"	uint32_t	geomType,\n"
"	GeomHeader* geomHeader )\n"
"{\n"
"	if ( index == 0 )\n"
"	{\n"
"		geomHeader->m_size			 = size;\n"
"		geomHeader->m_boxNodes		 = boxNodes;\n"
"		geomHeader->m_primNodes		 = primNodes;\n"
"		geomHeader->m_referenceCount = primCount == 1u ? 1u : 0u;\n"
"		geomHeader->m_boxNodeCount	 = 1u;\n"
"		geomHeader->m_primNodeCount	 = primCount == 1u ? 1u : 0u;\n"
"		geomHeader->m_geomType		 = geomType;\n"
"		geomHeader->m_rtip			 = Rtip;\n"
"	}\n"
"}\n"
"\n"
// Embedded kernel source: kernel entry point for InitGeomDataImpl. Computes the
// global thread index and forwards the arguments (note that `size` and
// `primCount` swap positions between the kernel and the Impl signature).
"extern \"C\" __global__ void\n"
"InitGeomData( size_t size, uint32_t primCount, BoxNode* boxNodes, void* primNodes, uint32_t geomType, GeomHeader* geomHeader )\n"
"{\n"
"	const uint32_t index = blockIdx.x * blockDim.x + threadIdx.x;\n"
"	InitGeomDataImpl( index, primCount, size, boxNodes, primNodes, geomType, geomHeader );\n"
"}\n"
"\n"
// Embedded kernel source: initialises the per-scene data, one element per thread.
//  1. index < instance count: converts the user-facing hiprtInstance and its
//     transform header into an internal Instance (type, frame index/count and
//     either the geometry or the scene pointer) and stores it in instances[index].
//  2. index < frame count: calls instanceList.convertFrame( index ).
//  3. index == 0: fills the SceneHeader. The box node count starts at 1; the
//     reference and primitive node counts start at 1 only for a single instance.
"template <typename InstanceList>\n"
"__device__ void InitSceneData(\n"
"	uint32_t	  index,\n"
"	size_t		  size,\n"
"	InstanceList& instanceList,\n"
"	BoxNode*	  boxNodes,\n"
"	InstanceNode* primNodes,\n"
"	Instance*	  instances,\n"
"	Frame*		  frames,\n"
"	SceneHeader*  sceneHeader )\n"
"{\n"
"	if ( index < instanceList.getCount() )\n"
"	{\n"
"		hiprtInstance		 i = instanceList.fetchInstance( index );\n"
"		hiprtTransformHeader t = instanceList.fetchTransformHeader( index );\n"
"		Instance			 instance;\n"
"		instance.m_type		  = i.type;\n"
"		instance.m_frameIndex = t.frameIndex;\n"
"		instance.m_frameCount = t.frameCount;\n"
// Any type other than hiprtInstanceTypeGeometry is treated as a nested scene.
"		if ( i.type == hiprtInstanceTypeGeometry )\n"
"			instance.m_geometry = reinterpret_cast<GeomHeader*>( i.geometry );\n"
"		else\n"
"			instance.m_scene = reinterpret_cast<SceneHeader*>( i.scene );\n"
"		instances[index] = instance;\n"
"	}\n"
"\n"
"	if ( index < instanceList.getFrameCount() ) instanceList.convertFrame( index );\n"
"\n"
"	if ( index == 0 )\n"
"	{\n"
"		sceneHeader->m_size			  = size;\n"
"		sceneHeader->m_boxNodes		  = boxNodes;\n"
"		sceneHeader->m_primNodes	  = primNodes;\n"
"		sceneHeader->m_instances	  = instances;\n"
"		sceneHeader->m_frames		  = frames;\n"
"		sceneHeader->m_referenceCount = instanceList.getCount() == 1u ? 1u : 0u;\n"
"		sceneHeader->m_primCount	  = instanceList.getCount();\n"
"		sceneHeader->m_primNodeCount  = instanceList.getCount() == 1u ? 1u : 0u;\n"
"		sceneHeader->m_boxNodeCount	  = 1u;\n"
"		sceneHeader->m_frameCount	  = instanceList.getFrameCount();\n"
"		sceneHeader->m_rtip			  = Rtip;\n"
"	}\n"
"}\n"
"\n"
// Embedded kernel source: extern "C" kernel entry points for InitSceneData, one
// per frame representation. Each computes the global thread index and forwards
// all arguments to the matching template instantiation.

// Entry point for instance lists whose frames are hiprtFrameSRT.
"extern \"C\" __global__ void InitSceneData_InstanceList_hiprtFrameSRT(\n"
"	size_t						size,\n"
"	InstanceList<hiprtFrameSRT> instanceList,\n"
"	BoxNode*					boxNodes,\n"
"	InstanceNode*				primNodes,\n"
"	Instance*					instances,\n"
"	Frame*						frames,\n"
"	SceneHeader*				sceneHeader )\n"
"{\n"
"	const uint32_t index = blockIdx.x * blockDim.x + threadIdx.x;\n"
"	InitSceneData<InstanceList<hiprtFrameSRT>>(\n"
"		index, size, instanceList, boxNodes, primNodes, instances, frames, sceneHeader );\n"
"}\n"
"\n"
// Entry point for instance lists whose frames are hiprtFrameMatrix.
"extern \"C\" __global__ void InitSceneData_InstanceList_hiprtFrameMatrix(\n"
"	size_t						   size,\n"
"	InstanceList<hiprtFrameMatrix> instanceList,\n"
"	BoxNode*					   boxNodes,\n"
"	InstanceNode*				   primNodes,\n"
"	Instance*					   instances,\n"
"	Frame*						   frames,\n"
"	SceneHeader*				   sceneHeader )\n"
"{\n"
"	uint32_t index = blockIdx.x * blockDim.x + threadIdx.x;\n"
"	InitSceneData<InstanceList<hiprtFrameMatrix>>(\n"
"		index, size, instanceList, boxNodes, primNodes, instances, frames, sceneHeader );\n"
"}\n"
"\n"
// Embedded kernel source: builds the BVH for a single primitive. Only the
// thread with index == 0 does any work.
// It writes primitive 0 as primNodes[0] and initialises boxNodes[0] with one
// valid child (the leaf, with primitive 0's AABB); the remaining child indices
// are set to InvalidValue.
"template <typename PrimitiveContainer, typename PrimitiveNode>\n"
"__device__ void\n"
"SingletonConstruction( uint32_t index, PrimitiveContainer& primitives, BoxNode* boxNodes, PrimitiveNode* primNodes )\n"
"{\n"
"	if ( index > 0 ) return;\n"
"\n"
// Leaf type is chosen at compile time from PrimitiveNode. Only TriangleNode,
// CustomNode and InstanceNode are handled; any other type yields no return
// value from the lambda and therefore does not compile.
"	const uint32_t leafType = []() {\n"
"		if constexpr ( is_same<PrimitiveNode, TriangleNode>::value )\n"
"			return TriangleType;\n"
"		else if constexpr ( is_same<PrimitiveNode, CustomNode>::value )\n"
"			return CustomType;\n"
"		else if constexpr ( is_same<PrimitiveNode, InstanceNode>::value )\n"
"			return InstanceType;\n"
"	}();\n"
"\n"
"	primNodes[0] = primitives.fetchPrimNode( 0 );\n"
"\n"
"	Aabb	 childBoxes[BranchingFactor];\n"
"	uint32_t childIndices[BranchingFactor];\n"
"\n"
"	childBoxes[0]	= primitives.fetchAabb( 0 );\n"
"	childIndices[0] = encodeNodeIndex( 0, leafType );\n"
"	for ( uint32_t i = 1; i < BranchingFactor; ++i )\n"
"		childIndices[i] = InvalidValue;\n"
"\n"
"	boxNodes[0].init( InvalidValue, 1, 0, 0, childIndices, childBoxes );\n"
"}\n"
"\n"
// Embedded kernel source: extern "C" kernel entry points for
// SingletonConstruction. Each computes the global thread index and forwards to
// the template. The kernel name encodes container and node type, but both
// triangle variants instantiate with TriangleNode and all four instance
// variants instantiate with InstanceNode.

// Triangle mesh, "TrianglePairNode" kernel name (instantiated with TriangleNode).
"extern \"C\" __global__ void\n"
"SingletonConstruction_TriangleMesh_TrianglePairNode( TriangleMesh primitives, BoxNode* boxNodes, TriangleNode* primNodes )\n"
"{\n"
"	const uint32_t index = threadIdx.x + blockIdx.x * blockDim.x;\n"
"	SingletonConstruction<TriangleMesh, TriangleNode>( index, primitives, boxNodes, primNodes );\n"
"}\n"
"\n"
// Triangle mesh, "TrianglePacketNode" kernel name (instantiated with TriangleNode).
"extern \"C\" __global__ void\n"
"SingletonConstruction_TriangleMesh_TrianglePacketNode( TriangleMesh primitives, BoxNode* boxNodes, TriangleNode* primNodes )\n"
"{\n"
"	const uint32_t index = threadIdx.x + blockIdx.x * blockDim.x;\n"
"	SingletonConstruction<TriangleMesh, TriangleNode>( index, primitives, boxNodes, primNodes );\n"
"}\n"
"\n"
// Custom primitives stored in an AabbList.
"extern \"C\" __global__ void\n"
"SingletonConstruction_AabbList_CustomNode( AabbList primitives, BoxNode* boxNodes, CustomNode* primNodes )\n"
"{\n"
"	const uint32_t index = threadIdx.x + blockIdx.x * blockDim.x;\n"
"	SingletonConstruction<AabbList, CustomNode>( index, primitives, boxNodes, primNodes );\n"
"}\n"
"\n"
// Instance lists with hiprtFrameSRT frames.
"extern \"C\" __global__ void SingletonConstruction_InstanceList_hiprtFrameSRT_UserInstanceNode(\n"
"	InstanceList<hiprtFrameSRT> primitives, BoxNode* boxNodes, InstanceNode* primNodes )\n"
"{\n"
"	const uint32_t index = threadIdx.x + blockIdx.x * blockDim.x;\n"
"	SingletonConstruction<InstanceList<hiprtFrameSRT>, InstanceNode>( index, primitives, boxNodes, primNodes );\n"
"}\n"
"\n"
"extern \"C\" __global__ void SingletonConstruction_InstanceList_hiprtFrameSRT_HwInstanceNode(\n"
"	InstanceList<hiprtFrameSRT> primitives, BoxNode* boxNodes, InstanceNode* primNodes )\n"
"{\n"
"	const uint32_t index = threadIdx.x + blockIdx.x * blockDim.x;\n"
"	SingletonConstruction<InstanceList<hiprtFrameSRT>, InstanceNode>( index, primitives, boxNodes, primNodes );\n"
"}\n"
"\n"
// Instance lists with hiprtFrameMatrix frames.
"extern \"C\" __global__ void SingletonConstruction_InstanceList_hiprtFrameMatrix_UserInstanceNode(\n"
"	InstanceList<hiprtFrameMatrix> primitives, BoxNode* boxNodes, InstanceNode* primNodes )\n"
"{\n"
"	const uint32_t index = threadIdx.x + blockIdx.x * blockDim.x;\n"
"	SingletonConstruction<InstanceList<hiprtFrameMatrix>, InstanceNode>( index, primitives, boxNodes, primNodes );\n"
"}\n"
"\n"
"extern \"C\" __global__ void SingletonConstruction_InstanceList_hiprtFrameMatrix_HwInstanceNode(\n"
"	InstanceList<hiprtFrameMatrix> primitives, BoxNode* boxNodes, InstanceNode* primNodes )\n"
"{\n"
"	const uint32_t index = threadIdx.x + blockIdx.x * blockDim.x;\n"
"	SingletonConstruction<InstanceList<hiprtFrameMatrix>, InstanceNode>( index, primitives, boxNodes, primNodes );\n"
"}\n"
"\n"
// Embedded kernel source: greedily pairs triangles within each warp.
//
// mesh        - triangle mesh; one triangle per thread (index < mesh.getCount()).
// pairIndices - output; receives (index, pairedIndex) for every emitted pair.
// pairCounter - global counter used by warpOffset() to allocate output slots.
//
// `activeMask` holds the lanes that are still unpaired. Each iteration takes
// the lowest active lane as the "broadcast" lane, shares its vertex indices
// with the warp, and pairs it with the first still-valid lane for which
// tryPairTriangles() succeeds. If no lane is pairable the broadcast triangle is
// paired with itself. Only the broadcast lane of each pair records a
// pairedIndex, so every pair is written exactly once.
"extern \"C\" __global__ void PairTriangles( TriangleMesh mesh, uint2* pairIndices, uint32_t* pairCounter )\n"
"{\n"
"	const uint32_t index	 = blockIdx.x * blockDim.x + threadIdx.x;\n"
"	const uint32_t laneIndex = threadIdx.x & ( WarpSize - 1 );\n"
"\n"
"	bool	 valid		 = index < mesh.getCount();\n"
"	uint32_t pairedIndex = InvalidValue;\n"
"	uint64_t activeMask	 = hiprt::ballot( valid );\n"
"\n"
"	uint3 triIndices;\n"
"	if ( valid ) triIndices = mesh.fetchTriangleIndices( index );\n"
"\n"
"	while ( activeMask )\n"
"	{\n"
"		activeMask = shfl( activeMask, 0 );\n"
"\n"
"		const uint64_t broadcastLane = __ffsll( static_cast<unsigned long long>( activeMask ) ) - 1;\n"
"		if ( laneIndex == broadcastLane ) valid = false;\n"
"\n"
// Clears the lowest set bit, i.e. retires the broadcast lane.
"		activeMask &= activeMask - 1;\n"
"\n"
"		const uint32_t broadcastIndex	   = shfl( index, broadcastLane );\n"
"		const uint3	   triIndicesBroadcast = {\n"
"			   shfl( triIndices.x, broadcastLane ), shfl( triIndices.y, broadcastLane ), shfl( triIndices.z, broadcastLane ) };\n"
"\n"
"		bool pairable = false;\n"
"		if ( index != broadcastIndex && valid )\n"
"			pairable = tryPairTriangles( triIndicesBroadcast, triIndices ).x != InvalidValue;\n"
"\n"
// With an empty ballot __ffsll() returns 0, so firstPairedLane wraps to
// 0xFFFFFFFF and fails the `< WarpSize` test below.
"		const uint32_t firstPairedLane = __ffsll( static_cast<unsigned long long>( hiprt::ballot( pairable ) ) ) - 1;\n"
"		if ( firstPairedLane < WarpSize )\n"
"		{\n"
// Fix: build the lane bit in 64 bits. `~( 1u << lane )` is a 32-bit value
// that zero-extends when and-ed into the 64-bit mask, which silently dropped
// lanes 32..63 from activeMask, and the shift itself is undefined for lane >= 32.
"			activeMask &= ~( static_cast<uint64_t>( 1 ) << firstPairedLane );\n"
"			if ( laneIndex == firstPairedLane ) valid = false;\n"
"\n"
"			const uint32_t secondIndex = shfl( index, firstPairedLane );\n"
"			if ( laneIndex == broadcastLane ) pairedIndex = secondIndex;\n"
"		}\n"
"		else if ( laneIndex == broadcastLane )\n"
"		{\n"
"			pairedIndex = index;\n"
"		}\n"
"	}\n"
"\n"
"	bool	 pairing   = index < mesh.getCount() && pairedIndex != InvalidValue;\n"
"	uint32_t pairIndex = warpOffset( pairing, pairCounter );\n"
"	if ( pairing ) pairIndices[pairIndex] = make_uint2( index, pairedIndex );\n"
"}\n"
"\n"
// Embedded kernel source: reduces per-primitive boxes into *centroidBox.
// Each thread fetches the AABB of its primitive; threads past the end re-use
// the last primitive's AABB so that every thread contributes a box to the
// block-wide blockUnion(). Thread 0 of the block then merges the block result
// into *centroidBox with atomicGrow().
// NOTE(review): the body is identical to ComputeBox (it calls fetchAabb(), not
// a centre fetch) - confirm this is intended for a "centroid" box.
// NOTE(review): `getCount() - 1` assumes a non-empty container - confirm callers
// never launch this with zero primitives.
"template <typename PrimitiveContainer>\n"
"__device__ void ComputeCentroidBox( PrimitiveContainer& primitives, Aabb* centroidBox )\n"
"{\n"
"	const uint32_t index = blockIdx.x * blockDim.x + threadIdx.x;\n"
"\n"
"	Aabb primBox;\n"
"	if ( index < primitives.getCount() )\n"
"		primBox = primitives.fetchAabb( index );\n"
"	else\n"
"		primBox = primitives.fetchAabb( primitives.getCount() - 1 );\n"
"\n"
// Shared scratch: one Aabb per warp of the block, declared as raw bytes and
// reinterpreted as Aabb*.
"	constexpr uint32_t							  WarpsPerBlock = DivideRoundUp( BvhBuilderReductionBlockSize, WarpSize );\n"
"	alignas( alignof( Aabb ) ) __shared__ uint8_t cache[sizeof( Aabb ) * WarpsPerBlock];\n"
"	Aabb*										  blockBoxes = reinterpret_cast<Aabb*>( cache );\n"
"\n"
"	Aabb blockBox = blockUnion( primBox, blockBoxes );\n"
"	if ( threadIdx.x == 0 ) centroidBox->atomicGrow( blockBox );\n"
"}\n"
"\n"
// Embedded kernel source: extern "C" kernel entry points for
// ComputeCentroidBox, one per primitive container type. All are declared with
// __launch_bounds__( BvhBuilderReductionBlockSize ), the same constant that
// sizes the shared scratch buffer in the template.
"extern \"C\" __global__ void __launch_bounds__( BvhBuilderReductionBlockSize )\n"
"	ComputeCentroidBox_TriangleMesh( TriangleMesh primitives, Aabb* centroidBox )\n"
"{\n"
"	ComputeCentroidBox<TriangleMesh>( primitives, centroidBox );\n"
"}\n"
"\n"
"extern \"C\" __global__ void __launch_bounds__( BvhBuilderReductionBlockSize )\n"
"	ComputeCentroidBox_AabbList( AabbList primitives, Aabb* centroidBox )\n"
"{\n"
"	ComputeCentroidBox<AabbList>( primitives, centroidBox );\n"
"}\n"
"\n"
"extern \"C\" __global__ void __launch_bounds__( BvhBuilderReductionBlockSize )\n"
"	ComputeCentroidBox_InstanceList_hiprtFrameSRT( InstanceList<hiprtFrameSRT> primitives, Aabb* centroidBox )\n"
"{\n"
"	ComputeCentroidBox<InstanceList<hiprtFrameSRT>>( primitives, centroidBox );\n"
"}\n"
"\n"
"extern \"C\" __global__ void __launch_bounds__( BvhBuilderReductionBlockSize )\n"
"	ComputeCentroidBox_InstanceList_hiprtFrameMatrix( InstanceList<hiprtFrameMatrix> primitives, Aabb* centroidBox )\n"
"{\n"
"	ComputeCentroidBox<InstanceList<hiprtFrameMatrix>>( primitives, centroidBox );\n"
"}\n"
"\n"
// Embedded kernel source: reduces the AABBs of all primitives into *box.
// Each thread fetches the AABB of its primitive; threads past the end re-use
// the last primitive's AABB so that every thread contributes a box to the
// block-wide blockUnion(). Thread 0 of the block then merges the block result
// into *box with atomicGrow().
// NOTE(review): `getCount() - 1` assumes a non-empty container - confirm callers
// never launch this with zero primitives.
"template <typename PrimitiveContainer>\n"
"__device__ void ComputeBox( PrimitiveContainer& primitives, Aabb* box )\n"
"{\n"
"	const uint32_t index = blockIdx.x * blockDim.x + threadIdx.x;\n"
"\n"
"	Aabb primBox;\n"
"	if ( index < primitives.getCount() )\n"
"		primBox = primitives.fetchAabb( index );\n"
"	else\n"
"		primBox = primitives.fetchAabb( primitives.getCount() - 1 );\n"
"\n"
// Shared scratch: one Aabb per warp of the block, declared as raw bytes and
// reinterpreted as Aabb*.
"	constexpr uint32_t							  WarpsPerBlock = DivideRoundUp( BvhBuilderReductionBlockSize, WarpSize );\n"
"	alignas( alignof( Aabb ) ) __shared__ uint8_t cache[sizeof( Aabb ) * WarpsPerBlock];\n"
"	Aabb*										  blockBoxes = reinterpret_cast<Aabb*>( cache );\n"
"\n"
"	Aabb blockBox = blockUnion( primBox, blockBoxes );\n"
"	if ( threadIdx.x == 0 ) box->atomicGrow( blockBox );\n"
"}\n"
"\n"
// Embedded kernel source: extern "C" kernel entry points for ComputeBox, one
// per primitive container type. All are declared with
// __launch_bounds__( BvhBuilderReductionBlockSize ), the same constant that
// sizes the shared scratch buffer in the template.
"extern \"C\" __global__ void __launch_bounds__( BvhBuilderReductionBlockSize )\n"
"	ComputeBox_TriangleMesh( TriangleMesh primitives, Aabb* box )\n"
"{\n"
"	ComputeBox<TriangleMesh>( primitives, box );\n"
"}\n"
"\n"
"extern \"C\" __global__ void __launch_bounds__( BvhBuilderReductionBlockSize )\n"
"	ComputeBox_AabbList( AabbList primitives, Aabb* box )\n"
"{\n"
"	ComputeBox<AabbList>( primitives, box );\n"
"}\n"
"\n"
"extern \"C\" __global__ void __launch_bounds__( BvhBuilderReductionBlockSize )\n"
"	ComputeBox_InstanceList_hiprtFrameSRT( InstanceList<hiprtFrameSRT> primitives, Aabb* box )\n"
"{\n"
"	ComputeBox<InstanceList<hiprtFrameSRT>>( primitives, box );\n"
"}\n"
"\n"
"extern \"C\" __global__ void __launch_bounds__( BvhBuilderReductionBlockSize )\n"
"	ComputeBox_InstanceList_hiprtFrameMatrix( InstanceList<hiprtFrameMatrix> primitives, Aabb* box )\n"
"{\n"
"	ComputeBox<InstanceList<hiprtFrameMatrix>>( primitives, box );\n"
"}\n"
"\n"
// Embedded kernel source: computes one Morton-code key/value pair per primitive.
// The primitive centre is normalised against *centroidBox (offset by m_min,
// divided by the box extent) and passed, together with the extent, to
// computeExtendedMortonCode(). The key goes to mortonCodeKeys[index] and the
// primitive index itself to mortonCodeValues[index].
// NOTE(review): the division by boxExtent has no guard against a zero extent on
// an axis - presumably handled inside computeExtendedMortonCode(); confirm.
"template <typename PrimitiveContainer>\n"
"__device__ void\n"
"ComputeMortonCodes( PrimitiveContainer& primitives, Aabb* centroidBox, uint32_t* mortonCodeKeys, uint32_t* mortonCodeValues )\n"
"{\n"
"	const Aabb box = *centroidBox;\n"
"\n"
"	const uint32_t index = blockIdx.x * blockDim.x + threadIdx.x;\n"
"\n"
"	if ( index < primitives.getCount() )\n"
"	{\n"
"		const float3 boxExtent		  = box.extent();\n"
"		const float3 center			  = primitives.fetchCenter( index );\n"
"		const float3 normalizedCenter = ( center - box.m_min ) / boxExtent;\n"
"		mortonCodeKeys[index]		  = computeExtendedMortonCode( normalizedCenter, boxExtent );\n"
"		mortonCodeValues[index]		  = index;\n"
"	}\n"
"}\n"
"\n"
// Embedded kernel source: extern "C" kernel entry points for
// ComputeMortonCodes, one per primitive container type. Each forwards its
// arguments unchanged to the matching template instantiation.
"extern \"C\" __global__ void ComputeMortonCodes_TriangleMesh(\n"
"	TriangleMesh primitives, Aabb* centroidBox, uint32_t* mortonCodeKeys, uint32_t* mortonCodeValues )\n"
"{\n"
"	ComputeMortonCodes<TriangleMesh>( primitives, centroidBox, mortonCodeKeys, mortonCodeValues );\n"
"}\n"
"\n"
"extern \"C\" __global__ void\n"
"ComputeMortonCodes_AabbList( AabbList primitives, Aabb* centroidBox, uint32_t* mortonCodeKeys, uint32_t* mortonCodeValues )\n"
"{\n"
"	ComputeMortonCodes<AabbList>( primitives, centroidBox, mortonCodeKeys, mortonCodeValues );\n"
"}\n"
"\n"
"extern \"C\" __global__ void ComputeMortonCodes_InstanceList_hiprtFrameSRT(\n"
"	InstanceList<hiprtFrameSRT> primitives, Aabb* centroidBox, uint32_t* mortonCodeKeys, uint32_t* mortonCodeValues )\n"
"{\n"
"	ComputeMortonCodes<InstanceList<hiprtFrameSRT>>( primitives, centroidBox, mortonCodeKeys, mortonCodeValues );\n"
"}\n"
"\n"
"extern \"C\" __global__ void ComputeMortonCodes_InstanceList_hiprtFrameMatrix(\n"
"	InstanceList<hiprtFrameMatrix> primitives, Aabb* centroidBox, uint32_t* mortonCodeKeys, uint32_t* mortonCodeValues )\n"
"{\n"
"	ComputeMortonCodes<InstanceList<hiprtFrameMatrix>>( primitives, centroidBox, mortonCodeKeys, mortonCodeValues );\n"
"}\n"
"\n"
// Embedded kernel source: prepares an existing BVH for a bounds update, one
// element per thread.
//  - Every thread with index < m_boxNodeCount zeroes that box node's m_updateCounter.
//  - Every thread with index < m_primNodeCount refreshes its leaf node from the
//    current primitive data. How depends on PrimitiveNode (compile-time branch):
//      TrianglePairNode   - re-fetches the node from its two stored primitive indices.
//      TrianglePacketNode - rebuilds the packet's list of distinct vertex indices
//                           and rewrites the vertex positions.
//      InstanceNode       - also converts frames; refreshes mask, matrix and
//                           identity flag.
//      any other type     - leaves are not touched.
"template <typename PrimitiveContainer, typename PrimitiveNode, typename Header>\n"
"__device__ void ResetCountersAndUpdateLeaves(\n"
"	const Header* header, PrimitiveContainer& primitives, BoxNode* boxNodes, PrimitiveNode* primNodes )\n"
"{\n"
"	const uint32_t index = threadIdx.x + blockIdx.x * blockDim.x;\n"
"\n"
"	if ( index < header->m_boxNodeCount ) boxNodes[index].m_updateCounter = 0;\n"
"\n"
"	if constexpr ( is_same<PrimitiveNode, TrianglePairNode>::value )\n"
"	{\n"
"		if ( index < header->m_primNodeCount )\n"
"		{\n"
"			primNodes[index] =\n"
"				primitives.fetchPrimNode( { primNodes[index].getPrimIndex( 0 ), primNodes[index].getPrimIndex( 1 ) } );\n"
"		}\n"
"	}\n"
"	else if constexpr ( is_same<PrimitiveNode, TrianglePacketNode>::value )\n"
"	{\n"
"		if ( index < header->m_primNodeCount )\n"
"		{\n"
// vertIndexCache collects the distinct mesh vertex indices of the packet in
// first-seen order; slot k here is vertex slot k written back below.
"			uint32_t vertIndexCache[MaxVerticesPerTrianglePacket];\n"
"			uint32_t vertexCount = 0;\n"
"\n"
"			PrimitiveNode  triPacketNode = primNodes[index];\n"
"			const uint32_t triPairCount	 = triPacketNode.getTrianglePairCount();\n"
"			for ( uint32_t triPairIndex = 0; triPairIndex < triPairCount; ++triPairIndex )\n"
"			{\n"
"				const uint2 pairIndices{\n"
"					triPacketNode.getPrimIndex( triPairIndex, 0 ), triPacketNode.getPrimIndex( triPairIndex, 1 ) };\n"
"				const uint3 indices0 = primitives.fetchTriangleIndices( pairIndices.x );\n"
"				uint4		indices	 = make_uint4( indices0, indices0.z );\n"
"\n"
// For a real pair (two different triangles) the fourth index is the vertex of
// the second triangle whose mapping entry equals 3.
"				uint3 vertexMapping{};\n"
"				if ( pairIndices.x != pairIndices.y )\n"
"				{\n"
"					uint3 indices1 = primitives.fetchTriangleIndices( pairIndices.y );\n"
"					vertexMapping  = tryPairTriangles( indices0, indices1 );\n"
"\n"
"					uint32_t vertexIndex = 0;\n"
"					if ( vertexMapping.x == 3 ) vertexIndex = indices1.x;\n"
"					if ( vertexMapping.y == 3 ) vertexIndex = indices1.y;\n"
"					if ( vertexMapping.z == 3 ) vertexIndex = indices1.z;\n"
"					indices.w = vertexIndex;\n"
"				}\n"
"\n"
// Pass 1: bit j of newVertMask is set when vertex j of this pair is not yet in
// the cache. A single triangle (x == y) contributes only three vertices.
"				uint32_t newVertMask = 0;\n"
"				for ( uint32_t j = 0; j < 4; ++j )\n"
"				{\n"
"					if ( j == 3 && pairIndices.x == pairIndices.y ) break;\n"
"\n"
"					bool contains = false;\n"
"					for ( uint32_t k = 0; k < vertexCount; ++k )\n"
"					{\n"
"						if ( vertIndexCache[k] == ( &indices.x )[j] )\n"
"						{\n"
"							contains = true;\n"
"							break;\n"
"						}\n"
"					}\n"
"\n"
"					if ( !contains )\n"
"					{\n"
"						newVertMask |= 1 << j;\n"
"					}\n"
"				}\n"
"\n"
// Pass 2: append the new vertices; the slot of vertex j is the old count plus
// the number of new vertices with a lower j.
"				const uint32_t oldVertCount = vertexCount;\n"
"				const uint32_t newVertCount = __popc( newVertMask );\n"
"				for ( uint32_t j = 0; j < 4; ++j )\n"
"				{\n"
"					if ( j == 3 && pairIndices.x == pairIndices.y ) break;\n"
"\n"
"					bool contains = !( newVertMask & ( 1 << j ) );\n"
"					if ( !contains )\n"
"					{\n"
"						uint32_t vertexMask			= ( 1 << j ) - 1;\n"
"						uint32_t vertexIndex		= oldVertCount + __popc( newVertMask & vertexMask );\n"
"						vertIndexCache[vertexIndex] = ( &indices.x )[j];\n"
"					}\n"
"				}\n"
"				vertexCount += newVertCount;\n"
"			}\n"
"\n"
// Rewrite the vertex positions of the packet from the current mesh data.
"			for ( uint32_t j = 0; j < vertexCount; ++j )\n"
"			{\n"
"				const float3 vertex = primitives.fetchVertex( vertIndexCache[j] );\n"
"				triPacketNode.template writeVertex<false, true>( j, vertex );\n"
"			}\n"
"\n"
"			primNodes[index] = triPacketNode;\n"
"		}\n"
"	}\n"
"	else if constexpr ( is_same<PrimitiveNode, InstanceNode>::value )\n"
"	{\n"
"		if ( index < primitives.getFrameCount() ) primitives.convertFrame( index );\n"
"\n"
"		if ( index < header->m_primNodeCount )\n"
"		{\n"
"			const uint32_t		 primIndex = primNodes[index].m_primIndex;\n"
"			hiprtTransformHeader transform = primitives.fetchTransformHeader( primIndex );\n"
"			primNodes[index].m_mask		   = primitives.fetchMask( primIndex );\n"
// Only single-frame instances get a stored matrix; the result of
// computeInvTransformMatrix() becomes the identity flag. Multi-frame instances
// always get m_identity = 0.
"			if ( transform.frameCount == 1 )\n"
"				primNodes[index].m_identity =\n"
"					primitives.computeInvTransformMatrix( transform.frameIndex, primNodes[index].m_matrix ) ? 1 : 0;\n"
"			else\n"
"				primNodes[index].m_identity = 0;\n"
"		}\n"
"	}\n"
"}\n"
"\n"
// Embedded kernel source: extern "C" kernel entry points for
// ResetCountersAndUpdateLeaves. The template arguments are deduced from the
// argument types, so the branch taken inside the template depends on the
// declared primNodes pointer type (TriangleNode for both triangle variants).

// Geometry variants (GeomHeader).
"extern \"C\" __global__ void ResetCountersAndUpdateLeaves_TriangleMesh_TrianglePairNode(\n"
"	const GeomHeader* header, TriangleMesh primitives, BoxNode* boxNodes, TriangleNode* primNodes )\n"
"{\n"
"	ResetCountersAndUpdateLeaves( header, primitives, boxNodes, primNodes );\n"
"}\n"
"\n"
"extern \"C\" __global__ void ResetCountersAndUpdateLeaves_TriangleMesh_TrianglePacketNode(\n"
"	const GeomHeader* header, TriangleMesh primitives, BoxNode* boxNodes, TriangleNode* primNodes )\n"
"{\n"
"	ResetCountersAndUpdateLeaves( header, primitives, boxNodes, primNodes );\n"
"}\n"
"\n"
"extern \"C\" __global__ void ResetCountersAndUpdateLeaves_AabbList_CustomNode(\n"
"	const GeomHeader* header, AabbList primitives, BoxNode* boxNodes, CustomNode* primNodes )\n"
"{\n"
"	ResetCountersAndUpdateLeaves( header, primitives, boxNodes, primNodes );\n"
"}\n"
"\n"
// Scene variants (SceneHeader), per frame representation and instance node type.
"extern \"C\" __global__ void ResetCountersAndUpdateLeaves_InstanceList_hiprtFrameMatrix_UserInstanceNode(\n"
"	const SceneHeader* header, InstanceList<hiprtFrameMatrix> primitives, BoxNode* boxNodes, UserInstanceNode* primNodes )\n"
"{\n"
"	ResetCountersAndUpdateLeaves( header, primitives, boxNodes, primNodes );\n"
"}\n"
"\n"
"extern \"C\" __global__ void ResetCountersAndUpdateLeaves_InstanceList_hiprtFrameMatrix_HwInstanceNode(\n"
"	const SceneHeader* header, InstanceList<hiprtFrameMatrix> primitives, BoxNode* boxNodes, HwInstanceNode* primNodes )\n"
"{\n"
"	ResetCountersAndUpdateLeaves( header, primitives, boxNodes, primNodes );\n"
"}\n"
"\n"
"extern \"C\" __global__ void ResetCountersAndUpdateLeaves_InstanceList_hiprtFrameSRT_UserInstanceNode(\n"
"	const SceneHeader* header, InstanceList<hiprtFrameSRT> primitives, BoxNode* boxNodes, UserInstanceNode* primNodes )\n"
"{\n"
"	ResetCountersAndUpdateLeaves( header, primitives, boxNodes, primNodes );\n"
"}\n"
"\n"
"extern \"C\" __global__ void ResetCountersAndUpdateLeaves_InstanceList_hiprtFrameSRT_HwInstanceNode(\n"
"	const SceneHeader* header, InstanceList<hiprtFrameSRT> primitives, BoxNode* boxNodes, HwInstanceNode* primNodes )\n"
"{\n"
"	ResetCountersAndUpdateLeaves( header, primitives, boxNodes, primNodes );\n"
"}\n"
"\n"
// Embedded kernel source: refits the boxes of all box nodes bottom-up.
//
// Each group of BranchingFactor consecutive threads (a "subwarp") handles one
// box node (index = threadIndex / BranchingFactor); lane `sublaneIndex` handles
// child slot `sublaneIndex`. Subwarps whose node has at least one BoxType child
// start out as done, so the walk begins at nodes that have only leaf children.
//
// Per iteration a subwarp recomputes the child boxes of its node, unions them
// across the subwarp, writes them with initBox(), and moves to the parent.
// At the parent, sublane 0 increments m_updateCounter; unless the previous
// value was at least (number of BoxType children - 1), the subwarp stops, so
// only the last child to arrive continues upward. The walk ends when the parent
// address is InvalidValue.
"template <typename PrimitiveContainer, typename PrimitiveNode, typename Header>\n"
"__device__ void FitBounds( Header* header, PrimitiveContainer& primitives, BoxNode* boxNodes, PrimitiveNode* primNodes )\n"
"{\n"
"	const uint32_t threadIndex	= threadIdx.x + blockIdx.x * blockDim.x;\n"
"	const uint32_t laneIndex	= threadIdx.x % WarpSize;\n"
"	const uint32_t sublaneIndex = laneIndex % BranchingFactor;\n"
"	const uint32_t subwarpIndex = laneIndex / BranchingFactor;\n"
// Fix: the left operand is widened to 64 bits *before* shifting. The result
// type of << is the (promoted) type of the left operand, so the former
// `int << uint64_t` was a 32-bit shift: undefined for shift counts >= 32 and
// unable to produce a mask for lanes 32..63 of a 64-wide wavefront.
"	const uint64_t subwarpMask	= static_cast<uint64_t>( ( 1 << BranchingFactor ) - 1 )\n"
"								 << ( BranchingFactor * subwarpIndex );\n"
"\n"
"	uint32_t index = threadIndex / BranchingFactor;\n"
"\n"
"	bool done = index >= header->m_boxNodeCount;\n"
"\n"
"	BoxNode	 node;\n"
"	uint32_t childCount{};\n"
"	bool	 internal = false;\n"
"	if ( !done )\n"
"	{\n"
"		node	   = boxNodes[index];\n"
"		childCount = node.getChildCount();\n"
"		internal   = sublaneIndex < childCount && node.getChildType( sublaneIndex ) == BoxType;\n"
"	}\n"
"\n"
// Number of BoxType children of this subwarp's node; nodes that have any are
// not starting points.
"	uint32_t internalCount = __popcll( hiprt::ballot( internal ) & subwarpMask );\n"
"	if ( internalCount > 0 ) done = true;\n"
"\n"
"	while ( hiprt::any( !done ) )\n"
"	{\n"
"		__threadfence();\n"
"\n"
"		Aabb	 childBox;\n"
"		uint32_t childIndex = InvalidValue;\n"
"		uint32_t childRange = InvalidValue;\n"
"		if ( !done && sublaneIndex < childCount )\n"
"		{\n"
"			childIndex = node.getChildIndex( sublaneIndex );\n"
"			childRange = node.getChildRange( sublaneIndex );\n"
"			childBox   = getNodeBox( childIndex, primitives, boxNodes, primNodes );\n"
"		}\n"
"\n"
// Union of the child boxes over the lanes of this subwarp (xor strides < BranchingFactor).
"		Aabb nodeBox = childBox;\n"
"#pragma unroll\n"
"		for ( uint32_t i = 1; i < BranchingFactor; i <<= 1 )\n"
"			nodeBox.grow( shflAabb( nodeBox, laneIndex ^ i ) );\n"
"\n"
"		if ( !done )\n"
"		{\n"
"			if ( sublaneIndex < childCount )\n"
"				boxNodes[index].initBox( sublaneIndex, childCount, childIndex, childBox, nodeBox, childRange );\n"
"			index = node.getParentAddr();\n"
"			if ( index == InvalidValue ) done = true;\n"
"		}\n"
"\n"
"		internal = false;\n"
"		if ( !done )\n"
"		{\n"
"			node	   = boxNodes[index];\n"
"			childCount = node.getChildCount();\n"
"			internal   = sublaneIndex < childCount && node.getChildType( sublaneIndex ) == BoxType;\n"
"		}\n"
"\n"
"		internalCount = __popcll( hiprt::ballot( internal ) & subwarpMask );\n"
"\n"
"		__threadfence();\n"
"\n"
// Sublane 0 registers this subwarp's arrival at the parent; any arrival other
// than the last one stops here.
"		if ( !done && sublaneIndex == 0 && atomicAdd( &boxNodes[index].m_updateCounter, 1 ) < internalCount - 1 ) done = true;\n"
"\n"
// Broadcast sublane 0's decision to the rest of the subwarp.
"		done = shfl( done, subwarpIndex * BranchingFactor );\n"
"	}\n"
"}\n"
"\n"
// Embedded kernel source: extern "C" kernel entry points for FitBounds. The
// template arguments are deduced from the argument types. Both triangle
// variants use their own node type; all four instance variants are declared
// with InstanceNode.

// Geometry variants (GeomHeader).
"extern \"C\" __global__ void FitBounds_TriangleMesh_TrianglePairNode(\n"
"	GeomHeader* header, TriangleMesh primitives, BoxNode* boxNodes, TrianglePairNode* primNodes )\n"
"{\n"
"	FitBounds( header, primitives, boxNodes, primNodes );\n"
"}\n"
"\n"
"extern \"C\" __global__ void FitBounds_TriangleMesh_TrianglePacketNode(\n"
"	GeomHeader* header, TriangleMesh primitives, BoxNode* boxNodes, TrianglePacketNode* primNodes )\n"
"{\n"
"	FitBounds( header, primitives, boxNodes, primNodes );\n"
"}\n"
"\n"
"extern \"C\" __global__ void\n"
"FitBounds_AabbList_CustomNode( GeomHeader* header, AabbList primitives, BoxNode* boxNodes, CustomNode* primNodes )\n"
"{\n"
"	FitBounds( header, primitives, boxNodes, primNodes );\n"
"}\n"
"\n"
// Scene variants (SceneHeader), per frame representation.
"extern \"C\" __global__ void FitBounds_InstanceList_hiprtFrameSRT_UserInstanceNode(\n"
"	SceneHeader* header, InstanceList<hiprtFrameSRT> primitives, BoxNode* boxNodes, InstanceNode* primNodes )\n"
"{\n"
"	FitBounds( header, primitives, boxNodes, primNodes );\n"
"}\n"
"\n"
"extern \"C\" __global__ void FitBounds_InstanceList_hiprtFrameSRT_HwInstanceNode(\n"
"	SceneHeader* header, InstanceList<hiprtFrameSRT> primitives, BoxNode* boxNodes, InstanceNode* primNodes )\n"
"{\n"
"	FitBounds( header, primitives, boxNodes, primNodes );\n"
"}\n"
"\n"
"extern \"C\" __global__ void FitBounds_InstanceList_hiprtFrameMatrix_UserInstanceNode(\n"
"	SceneHeader* header, InstanceList<hiprtFrameMatrix> primitives, BoxNode* boxNodes, InstanceNode* primNodes )\n"
"{\n"
"	FitBounds( header, primitives, boxNodes, primNodes );\n"
"}\n"
"\n"
"extern \"C\" __global__ void FitBounds_InstanceList_hiprtFrameMatrix_HwInstanceNode(\n"
"	SceneHeader* header, InstanceList<hiprtFrameMatrix> primitives, BoxNode* boxNodes, InstanceNode* primNodes )\n"
"{\n"
"	FitBounds( header, primitives, boxNodes, primNodes );\n"
"}\n"
"\n"
"template <typename PrimitiveContainer, typename PrimitiveNode, typename Header>\n"
"__device__ void FitOrientedBounds(\n"
"	Header*				header,\n"
"	PrimitiveContainer& primitives,\n"
"	Box8Node*			boxNodes,\n"
"	PrimitiveNode*		primNodes,\n"
"	Kdop*				kdops,\n"
"	uint32_t*			updateCounters )\n"
"{\n"
"	const uint32_t threadIndex = threadIdx.x + blockIdx.x * blockDim.x;\n"
"	const uint32_t laneIndex   = threadIdx.x % WarpSize;\n"
"\n"
"	uint32_t index = threadIndex / WarpSize;\n"
"\n"
"	if ( index >= header->m_boxNodeCount ) return;\n"
"\n"
"	Box8Node node		= boxNodes[index];\n"
"	uint32_t childCount = node.getChildCount();\n"
"	bool	 internal	= laneIndex < childCount && node.getChildType( laneIndex ) == BoxType;\n"
"\n"
"	uint32_t internalCount = __popcll( hiprt::ballot( internal ) );\n"
"\n"
"	bool done = internalCount > 0;\n"
"\n"
"	while ( hiprt::any( !done ) )\n"
"	{\n"
"		__threadfence();\n"
"\n"
"		if ( index > 0 )\n"
"		{\n"
"			uint32_t minIndexLane = InvalidValue;\n"
"			float	 minAreaLane  = FltMax;\n"
"			for ( uint32_t j = laneIndex; j <= RotationCount; j += WarpSize )\n"
"			{\n"
"				Aabb obb;\n"
"				for ( uint32_t i = 0; i < childCount; ++i )\n"
"				{\n"
"					const Aabb	   childBox	  = node.getChildBox( i );\n"
"					const uint32_t childIndex = node.getChildIndex( i );\n"
"					obb.grow( getNodeObb( j, childIndex, childBox, primitives, primNodes, kdops ).aabb() );\n"
"				}\n"
"\n"
"				kdops[index].m_boxes[j] = obb;\n"
"\n"
"				if ( minAreaLane > obb.area() )\n"
"				{\n"
"					minAreaLane	 = obb.area();\n"
"					minIndexLane = j;\n"
"				}\n"
"			}\n"
"\n"
"			const float	   minArea	= warpMin( minAreaLane );\n"
"			const uint32_t minIndex = __ffsll( static_cast<unsigned long long>( hiprt::ballot( minAreaLane == minArea ) ) ) - 1;\n"
"			const uint32_t matrixIndex = shfl( minIndexLane, minIndex );\n"
"\n"
"			Aabb	 childBox;\n"
"			uint32_t childIndex;\n"
"			uint32_t childRange;\n"
"			if ( laneIndex < childCount )\n"
"			{\n"
"				childIndex = node.getChildIndex( laneIndex );\n"
"				childRange = node.getChildRange( laneIndex );\n"
"				childBox =\n"
"					getNodeObb( matrixIndex, childIndex, node.getChildBox( laneIndex ), primitives, primNodes, kdops ).aabb();\n"
"			}\n"
"\n"
"			const Aabb nodeBox = warpUnion( childBox );\n"
"\n"
"			if ( laneIndex < childCount )\n"
"			{\n"
"				boxNodes[index].initBox(\n"
"					laneIndex, childCount, childIndex, childBox, nodeBox, childRange, MatrixIndexToId[matrixIndex] );\n"
"			}\n"
"\n"
"			// revert aabb if obb is not better\n"
"			if ( laneIndex == 0 )\n"
"			{\n"
"				// reconstructed quantized boxes\n"
"				float aabbArea = 0.0f;\n"
"				float obbArea  = 0.0f;\n"
"				for ( uint32_t j = 0; j < node.getChildCount(); ++j )\n"
"				{\n"
"					aabbArea += node.getChildBox( j ).area();\n"
"					obbArea += boxNodes[index].getChildBox( j ).area();\n"
"				}\n"
"\n"
"				// compare to aabb surface area\n"
"				if ( aabbArea < ObbSurfaceAreaAlpha * obbArea ) boxNodes[index] = node;\n"
"			}\n"
"		}\n"
"\n"
"		index = node.getParentAddr();\n"
"		if ( index == InvalidValue ) break;\n"
"\n"
"		node	   = boxNodes[index];\n"
"		childCount = node.getChildCount();\n"
"		internal   = laneIndex < childCount && node.getChildType( laneIndex ) == BoxType;\n"
"\n"
"		internalCount = __popcll( hiprt::ballot( internal ) );\n"
"\n"
"		__threadfence();\n"
"\n"
"		if ( laneIndex == 0 && atomicAdd( &updateCounters[index], 1 ) < internalCount - 1 ) done = true;\n"
"\n"
"		done = shfl( done, 0 );\n"
"	}\n"
"}\n"
"\n"
"extern \"C\" __global__ void FitOrientedBounds_TriangleMesh_TrianglePacketNode(\n"
"	GeomHeader*			header,\n"
"	TriangleMesh		primitives,\n"
"	Box8Node*			boxNodes,\n"
"	TrianglePacketNode* primNodes,\n"
"	Kdop*				kdops,\n"
"	uint32_t*			updateCounters )\n"
"{\n"
"	FitOrientedBounds( header, primitives, boxNodes, primNodes, kdops, updateCounters );\n"
"}\n"
"\n"
"extern \"C\" __global__ void FitOrientedBounds_AabbList_CustomNode(\n"
"	GeomHeader* header, AabbList primitives, Box8Node* boxNodes, CustomNode* primNodes, Kdop* kdops, uint32_t* updateCounters )\n"
"{\n"
"	FitOrientedBounds( header, primitives, boxNodes, primNodes, kdops, updateCounters );\n"
"}\n"
"\n"
"extern \"C\" __global__ void FitOrientedBounds_InstanceList_hiprtFrameSRT_UserInstanceNode(\n"
"	SceneHeader*				header,\n"
"	InstanceList<hiprtFrameSRT> primitives,\n"
"	Box8Node*					boxNodes,\n"
"	InstanceNode*				primNodes,\n"
"	Kdop*						kdops,\n"
"	uint32_t*					updateCounters )\n"
"{\n"
"	FitOrientedBounds( header, primitives, boxNodes, primNodes, kdops, updateCounters );\n"
"}\n"
"\n"
"extern \"C\" __global__ void FitOrientedBounds_InstanceList_hiprtFrameSRT_HwInstanceNode(\n"
"	SceneHeader*				header,\n"
"	InstanceList<hiprtFrameSRT> primitives,\n"
"	Box8Node*					boxNodes,\n"
"	InstanceNode*				primNodes,\n"
"	Kdop*						kdops,\n"
"	uint32_t*					updateCounters )\n"
"{\n"
"	FitOrientedBounds( header, primitives, boxNodes, primNodes, kdops, updateCounters );\n"
"}\n"
"\n"
"extern \"C\" __global__ void FitOrientedBounds_InstanceList_hiprtFrameMatrix_UserInstanceNode(\n"
"	SceneHeader*				   header,\n"
"	InstanceList<hiprtFrameMatrix> primitives,\n"
"	Box8Node*					   boxNodes,\n"
"	InstanceNode*				   primNodes,\n"
"	Kdop*						   kdops,\n"
"	uint32_t*					   updateCounters )\n"
"{\n"
"	FitOrientedBounds( header, primitives, boxNodes, primNodes, kdops, updateCounters );\n"
"}\n"
"\n"
"extern \"C\" __global__ void FitOrientedBounds_InstanceList_hiprtFrameMatrix_HwInstanceNode(\n"
"	SceneHeader*				   header,\n"
"	InstanceList<hiprtFrameMatrix> primitives,\n"
"	Box8Node*					   boxNodes,\n"
"	InstanceNode*				   primNodes,\n"
"	Kdop*						   kdops,\n"
"	uint32_t*					   updateCounters )\n"
"{\n"
"	FitOrientedBounds( header, primitives, boxNodes, primNodes, kdops, updateCounters );\n"
"}\n"
"\n"
"template <typename BinaryNode>\n"
"__device__ void\n"
"ComputeParentAddrs( uint32_t index, uint32_t leafCount, uint32_t rootAddr, BinaryNode* binaryNodes, uint32_t* parentAddrs )\n"
"{\n"
"	if ( index < leafCount - 1 )\n"
"	{\n"
"		BinaryNode binaryNode = binaryNodes[index];\n"
"		for ( uint32_t i = 0; i < 2; ++i )\n"
"		{\n"
"			if ( binaryNode.getChildType( i ) == BoxType ) parentAddrs[binaryNode.getChildAddr( i )] = index;\n"
"		}\n"
"		if ( index == rootAddr ) parentAddrs[rootAddr] = InvalidValue;\n"
"	}\n"
"}\n"
"\n"
"extern \"C\" __global__ void\n"
"ComputeParentAddrs_ScratchNode( uint32_t leafCount, uint32_t rootAddr, ScratchNode* binaryNodes, uint32_t* parentAddrs )\n"
"{\n"
"	const uint32_t index = threadIdx.x + blockIdx.x * blockDim.x;\n"
"	ComputeParentAddrs( index, leafCount, rootAddr, binaryNodes, parentAddrs );\n"
"}\n"
"\n"
"extern \"C\" __global__ void\n"
"ComputeParentAddrs_ApiNode( uint32_t leafCount, uint32_t rootAddr, ApiNode* binaryNodes, uint32_t* parentAddrs )\n"
"{\n"
"	const uint32_t index = threadIdx.x + blockIdx.x * blockDim.x;\n"
"	ComputeParentAddrs( index, leafCount, rootAddr, binaryNodes, parentAddrs );\n"
"}\n"
"\n"
"template <typename BinaryNode>\n"
"__device__ void ComputeFatLeaves(\n"
"	uint32_t	index,\n"
"	uint32_t	leafCount,\n"
"	BinaryNode* binaryNodes,\n"
"	uint32_t*	parentAddrs,\n"
"	uint32_t*	triangleCounts,\n"
"	uint32_t*	updateCounters )\n"
"{\n"
"	if ( index >= leafCount ) return;\n"
"\n"
"	if ( index >= leafCount - 1 ) return;\n"
"	BinaryNode node			 = binaryNodes[index];\n"
"	uint32_t   internalCount = 0;\n"
"	for ( uint32_t i = 0; i < 2; ++i )\n"
"	{\n"
"		if ( node.getChildType( i ) == BoxType ) internalCount++;\n"
"	}\n"
"\n"
"	if ( internalCount > 0 ) return;\n"
"\n"
"	while ( true )\n"
"	{\n"
"		__threadfence();\n"
"\n"
"		BinaryNode& binaryNode = binaryNodes[index];\n"
"\n"
"		uint32_t triangleCount = 0;\n"
"		for ( uint32_t i = 0; i < 2; ++i )\n"
"		{\n"
"			uint32_t childTriCount = 0;\n"
"			if ( binaryNode.getChildType( i ) == TriangleType )\n"
"				childTriCount = 1;\n"
"			else\n"
"				childTriCount = triangleCounts[binaryNode.getChildAddr( i )];\n"
"\n"
"			if ( childTriCount <= MaxFatLeafSize ) binaryNode.setChildFatLeafFlag( i );\n"
"\n"
"			triangleCount += childTriCount;\n"
"		}\n"
"\n"
"		triangleCounts[index] = triangleCount;\n"
"\n"
"		index = parentAddrs[index];\n"
"		if ( index == InvalidValue ) break;\n"
"		node = binaryNodes[index];\n"
"\n"
"		internalCount = 0;\n"
"		for ( uint32_t i = 0; i < 2; ++i )\n"
"		{\n"
"			if ( node.getChildType( i ) == BoxType ) internalCount++;\n"
"		}\n"
"\n"
"		__threadfence();\n"
"\n"
"		if ( atomicAdd( &updateCounters[index], 1 ) < internalCount - 1 ) break;\n"
"	}\n"
"}\n"
"\n"
"extern \"C\" __global__ void ComputeFatLeaves_ScratchNode(\n"
"	uint32_t leafCount, ScratchNode* binaryNodes, uint32_t* parentAddrs, uint32_t* triangleCounts, uint32_t* updateCounters )\n"
"{\n"
"	const uint32_t index = threadIdx.x + blockIdx.x * blockDim.x;\n"
"	ComputeFatLeaves( index, leafCount, binaryNodes, parentAddrs, triangleCounts, updateCounters );\n"
"}\n"
"\n"
"extern \"C\" __global__ void ComputeFatLeaves_ApiNode(\n"
"	uint32_t leafCount, ApiNode* binaryNodes, uint32_t* parentAddrs, uint32_t* triangleCounts, uint32_t* updateCounters )\n"
"{\n"
"	const uint32_t index = threadIdx.x + blockIdx.x * blockDim.x;\n"
"	ComputeFatLeaves( index, leafCount, binaryNodes, parentAddrs, triangleCounts, updateCounters );\n"
"}\n"
"\n"
"template <typename PrimitiveNode, typename BinaryNode, typename Header>\n"
"__device__ void Collapse(\n"
"	uint32_t	   index,\n"
"	uint32_t	   maxBoxNodeCount,\n"
"	uint32_t	   referenceCount,\n"
"	Header*		   header,\n"
"	BinaryNode*	   binaryNodes,\n"
"	ReferenceNode* references,\n"
"	BoxNode*	   boxNodes,\n"
"	uint3*		   taskQueue,\n"
"	uint32_t*	   referenceIndices )\n"
"{\n"
"	const uint32_t laneIndex	= threadIdx.x % WarpSize;\n"
"	const uint32_t taskIndex	= index / BranchingFactor;\n"
"	const uint32_t sublaneIndex = laneIndex % BranchingFactor;\n"
"	const uint32_t subwarpIndex = laneIndex / BranchingFactor;\n"
"	const uint64_t subwarpMask	= ( ( 1 << BranchingFactor ) - 1 )\n"
"								 << static_cast<uint64_t>( ( BranchingFactor * subwarpIndex ) );\n"
"\n"
"	bool done = taskIndex >= maxBoxNodeCount || taskIndex >= referenceCount;\n"
"\n"
"	while ( hiprt::any( !done ) )\n"
"	{\n"
"		sync_warp();\n"
"		__threadfence();\n"
"\n"
"		if ( atomicAdd( &header->m_referenceCount, 0 ) == referenceCount ) done = true;\n"
"\n"
"		uint32_t nodeIndex	= InvalidValue;\n"
"		uint32_t nodeAddr	= InvalidValue;\n"
"		uint32_t parentAddr = InvalidValue;\n"
"		uint3	 task		= make_uint3( InvalidValue );\n"
"		if ( !done )\n"
"		{\n"
"			task	   = atomic_load( &taskQueue[taskIndex] );\n"
"			nodeIndex  = task.x;\n"
"			nodeAddr   = task.y;\n"
"			parentAddr = task.z;\n"
"		}\n"
"\n"
"		// we need to check all three values\n"
"		const bool valid = nodeIndex != InvalidValue && nodeAddr != InvalidValue && parentAddr != InvalidValue;\n"
"\n"
"		// skip inactive warps\n"
"		if ( hiprt::all( !valid ) ) continue;\n"
"\n"
"		Aabb	 childBox;\n"
"		uint32_t childIndex = InvalidValue;\n"
"		uint32_t childCount = 2;\n"
"\n"
"		if ( nodeAddr == 0 ) parentAddr = InvalidValue;\n"
"\n"
"		// fill inactive lanes with first valid node index\n"
"		const uint32_t firstValidLane = __ffsll( static_cast<unsigned long long>( hiprt::ballot( valid ) ) ) - 1;\n"
"		nodeIndex					  = shfl( nodeIndex, valid ? laneIndex : firstValidLane );\n"
"\n"
"		BinaryNode binaryNode = binaryNodes[getNodeAddr( nodeIndex )];\n"
"		if ( sublaneIndex < 2 )\n"
"		{\n"
"			childIndex = binaryNode[sublaneIndex];\n"
"			childBox   = getNodeBox( binaryNode[sublaneIndex], binaryNodes, references );\n"
"		}\n"
"\n"
"		// open internal nodes first\n"
"		if constexpr ( is_same<PrimitiveNode, TrianglePacketNode>::value )\n"
"			openNodes<isFatLeafNode>( binaryNodes, references, childCount, childIndex, childBox );\n"
"\n"
"		// open fat leaves for the remaining slots\n"
"		openNodes<isLeafNode>( binaryNodes, references, childCount, childIndex, childBox );\n"
"\n"
"		const bool active = valid && sublaneIndex < childCount;\n"
"\n"
"		const bool	   internal		= isInternalNode( childIndex ) && !isFatLeafNode( childIndex );\n"
"		const uint32_t childAddr	= warpOffset( active && internal, &header->m_boxNodeCount );\n"
"		const uint32_t internalBase = shfl( childAddr, subwarpIndex * BranchingFactor );\n"
"		if ( active && internal )\n"
"		{\n"
"			atomic_store( &taskQueue[childAddr], { childIndex, childAddr, nodeAddr } );\n"
"			childIndex = encodeNodeIndex( childAddr, getNodeType( childIndex ) );\n"
"			__threadfence();\n"
"		}\n"
"\n"
"		if ( valid )\n"
"		{\n"
"			if constexpr ( !is_same<PrimitiveNode, TrianglePacketNode>::value )\n"
"			{\n"
"				boxNodes[nodeAddr].init(\n"
"					sublaneIndex, parentAddr, childCount, internalBase, 0, childIndex, childBox, binaryNode.m_box );\n"
"			}\n"
"			else\n"
"			{\n"
"				const bool fatLeaf = isFatLeafNode( childIndex ) && !isLeafNode( childIndex );\n"
"				if ( fatLeaf ) childIndex = encodeNodeIndex( getNodeAddr( childIndex ), TriangleType );\n"
"				boxNodes[nodeAddr].init(\n"
"					sublaneIndex, parentAddr, childCount, internalBase, 0, childIndex, childBox, binaryNode.m_box );\n"
"				if ( fatLeaf ) childIndex = encodeNodeIndex( getNodeAddr( childIndex ), BoxType ) | FatLeafBit;\n"
"			}\n"
"		}\n"
"\n"
"		task = make_uint3( InvalidValue );\n"
"\n"
"		if constexpr ( !is_same<PrimitiveNode, TrianglePacketNode>::value )\n"
"		{\n"
"			const bool	   leaf			 = isLeafNode( childIndex );\n"
"			const uint64_t activeSubmask = hiprt::ballot( active && leaf ) & subwarpMask;\n"
"			const uint32_t rangeSize	 = __popcll( activeSubmask );\n"
"			const uint32_t rangeAddr	 = warpOffset( active && leaf, &header->m_referenceCount );\n"
"			if ( active && leaf ) referenceIndices[rangeAddr] = childIndex;\n"
"			if ( valid && sublaneIndex == 0 && activeSubmask != 0 ) task = { rangeAddr, nodeAddr, rangeSize };\n"
"		}\n"
"		else\n"
"		{\n"
"			uint32_t leafRangeSize = 0;\n"
"			uint32_t subtreeRefIndices[MaxFatLeafSize];\n"
"\n"
"			const bool fatLeaf = isFatLeafNode( childIndex );\n"
"			if ( active && fatLeaf )\n"
"			{\n"
"				uint32_t prevLeafRangeSize = 0;\n"
"				leafRangeSize			   = 1;\n"
"				subtreeRefIndices[0]	   = childIndex;\n"
"				while ( prevLeafRangeSize != leafRangeSize )\n"
"				{\n"
"					prevLeafRangeSize = leafRangeSize;\n"
"					for ( uint32_t j = 0; j < prevLeafRangeSize; ++j )\n"
"					{\n"
"						if ( !isLeafNode( subtreeRefIndices[j] ) )\n"
"						{\n"
"							const uint32_t referenceIndex	   = subtreeRefIndices[j];\n"
"							subtreeRefIndices[j]			   = binaryNodes[getNodeAddr( referenceIndex )][0];\n"
"							subtreeRefIndices[leafRangeSize++] = binaryNodes[getNodeAddr( referenceIndex )][1];\n"
"						}\n"
"					}\n"
"				}\n"
"			}\n"
"\n"
"			const uint32_t rangeBase   = warpOffset( leafRangeSize, &header->m_referenceCount );\n"
"			uint32_t	   rangeOffset = rangeBase;\n"
"\n"
"			if ( active && fatLeaf )\n"
"			{\n"
"				for ( uint32_t j = 0; j < leafRangeSize; ++j )\n"
"				{\n"
"					uint32_t referenceIndex = subtreeRefIndices[j];\n"
"					referenceIndex &= ~FatLeafBit;\n"
"					referenceIndex |= j == 0 ? RangeStartBit : 0;\n"
"					referenceIndex |= j == leafRangeSize - 1 ? RangeEndBit : 0;\n"
"					referenceIndices[rangeOffset++] = referenceIndex;\n"
"				}\n"
"			}\n"
"\n"
"			const uint64_t activeSubmask   = hiprt::ballot( active && fatLeaf ) & subwarpMask;\n"
"			const uint32_t lastActiveLane  = activeSubmask == 0 ? 0 : ( WarpSize - 1 ) - __clzll( activeSubmask );\n"
"			const uint32_t lastRangeOffset = shfl( rangeOffset, lastActiveLane );\n"
"			if ( valid && sublaneIndex == 0 && activeSubmask != 0 )\n"
"			{\n"
"				const uint32_t rangeSize = lastRangeOffset - rangeBase;\n"
"				task					 = { rangeBase, nodeAddr, rangeSize };\n"
"			}\n"
"		}\n"
"		sync_warp();\n"
"\n"
"		if ( valid )\n"
"		{\n"
"			if ( sublaneIndex == 0 ) atomic_store( &taskQueue[taskIndex], task );\n"
"			done = true;\n"
"		}\n"
"\n"
"		__threadfence();\n"
"	}\n"
"}\n"
"\n"
"extern \"C\" __global__ void Collapse_TrianglePairNode_ScratchNode(\n"
"	uint32_t	   maxBoxNodeCount,\n"
"	uint32_t	   referenceCount,\n"
"	GeomHeader*	   header,\n"
"	ScratchNode*   binaryNodes,\n"
"	ReferenceNode* references,\n"
"	BoxNode*	   boxNodes,\n"
"	uint3*		   taskQueue,\n"
"	uint32_t*	   referenceIndices )\n"
"{\n"
"	const uint32_t index = threadIdx.x + blockIdx.x * blockDim.x;\n"
"	Collapse<TrianglePairNode>(\n"
"		index, maxBoxNodeCount, referenceCount, header, binaryNodes, references, boxNodes, taskQueue, referenceIndices );\n"
"}\n"
"\n"
"extern \"C\" __global__ void Collapse_TrianglePacketNode_ScratchNode(\n"
"	uint32_t	   maxBoxNodeCount,\n"
"	uint32_t	   referenceCount,\n"
"	GeomHeader*	   header,\n"
"	ScratchNode*   binaryNodes,\n"
"	ReferenceNode* references,\n"
"	BoxNode*	   boxNodes,\n"
"	uint3*		   taskQueue,\n"
"	uint32_t*	   referenceIndices )\n"
"{\n"
"	const uint32_t index = threadIdx.x + blockIdx.x * blockDim.x;\n"
"	Collapse<TrianglePacketNode>(\n"
"		index, maxBoxNodeCount, referenceCount, header, binaryNodes, references, boxNodes, taskQueue, referenceIndices );\n"
"}\n"
"\n"
"extern \"C\" __global__ void Collapse_CustomNode_ScratchNode(\n"
"	uint32_t	   maxBoxNodeCount,\n"
"	uint32_t	   referenceCount,\n"
"	GeomHeader*	   header,\n"
"	ScratchNode*   binaryNodes,\n"
"	ReferenceNode* references,\n"
"	BoxNode*	   boxNodes,\n"
"	uint3*		   taskQueue,\n"
"	uint32_t*	   referenceIndices )\n"
"{\n"
"	const uint32_t index = threadIdx.x + blockIdx.x * blockDim.x;\n"
"	Collapse<CustomNode>(\n"
"		index, maxBoxNodeCount, referenceCount, header, binaryNodes, references, boxNodes, taskQueue, referenceIndices );\n"
"}\n"
"\n"
"extern \"C\" __global__ void Collapse_UserInstanceNode_ScratchNode(\n"
"	uint32_t	   maxBoxNodeCount,\n"
"	uint32_t	   referenceCount,\n"
"	SceneHeader*   header,\n"
"	ScratchNode*   binaryNodes,\n"
"	ReferenceNode* references,\n"
"	BoxNode*	   boxNodes,\n"
"	uint3*		   taskQueue,\n"
"	uint32_t*	   referenceIndices )\n"
"{\n"
"	const uint32_t index = threadIdx.x + blockIdx.x * blockDim.x;\n"
"	Collapse<UserInstanceNode>(\n"
"		index, maxBoxNodeCount, referenceCount, header, binaryNodes, references, boxNodes, taskQueue, referenceIndices );\n"
"}\n"
"\n"
"extern \"C\" __global__ void Collapse_HwInstanceNode_ScratchNode(\n"
"	uint32_t	   maxBoxNodeCount,\n"
"	uint32_t	   referenceCount,\n"
"	SceneHeader*   header,\n"
"	ScratchNode*   binaryNodes,\n"
"	ReferenceNode* references,\n"
"	BoxNode*	   boxNodes,\n"
"	uint3*		   taskQueue,\n"
"	uint32_t*	   referenceIndices )\n"
"{\n"
"	const uint32_t index = threadIdx.x + blockIdx.x * blockDim.x;\n"
"	Collapse<HwInstanceNode>(\n"
"		index, maxBoxNodeCount, referenceCount, header, binaryNodes, references, boxNodes, taskQueue, referenceIndices );\n"
"}\n"
"\n"
"extern \"C\" __global__ void Collapse_TrianglePairNode_ApiNode(\n"
"	uint32_t	   maxBoxNodeCount,\n"
"	uint32_t	   referenceCount,\n"
"	GeomHeader*	   header,\n"
"	ApiNode*	   binaryNodes,\n"
"	ReferenceNode* references,\n"
"	BoxNode*	   boxNodes,\n"
"	uint3*		   taskQueue,\n"
"	uint32_t*	   referenceIndices )\n"
"{\n"
"	const uint32_t index = threadIdx.x + blockIdx.x * blockDim.x;\n"
"	Collapse<TrianglePairNode>(\n"
"		index, maxBoxNodeCount, referenceCount, header, binaryNodes, references, boxNodes, taskQueue, referenceIndices );\n"
"}\n"
"\n"
"extern \"C\" __global__ void Collapse_TrianglePacketNode_ApiNode(\n"
"	uint32_t	   maxBoxNodeCount,\n"
"	uint32_t	   referenceCount,\n"
"	GeomHeader*	   header,\n"
"	ApiNode*	   binaryNodes,\n"
"	ReferenceNode* references,\n"
"	BoxNode*	   boxNodes,\n"
"	uint3*		   taskQueue,\n"
"	uint32_t*	   referenceIndices )\n"
"{\n"
"	const uint32_t index = threadIdx.x + blockIdx.x * blockDim.x;\n"
"	Collapse<TrianglePacketNode>(\n"
"		index, maxBoxNodeCount, referenceCount, header, binaryNodes, references, boxNodes, taskQueue, referenceIndices );\n"
"}\n"
"\n"
"extern \"C\" __global__ void Collapse_CustomNode_ApiNode(\n"
"	uint32_t	   maxBoxNodeCount,\n"
"	uint32_t	   referenceCount,\n"
"	GeomHeader*	   header,\n"
"	ApiNode*	   binaryNodes,\n"
"	ReferenceNode* references,\n"
"	BoxNode*	   boxNodes,\n"
"	uint3*		   taskQueue,\n"
"	uint32_t*	   referenceIndices )\n"
"{\n"
"	const uint32_t index = threadIdx.x + blockIdx.x * blockDim.x;\n"
"	Collapse<CustomNode>(\n"
"		index, maxBoxNodeCount, referenceCount, header, binaryNodes, references, boxNodes, taskQueue, referenceIndices );\n"
"}\n"
"\n"
"extern \"C\" __global__ void Collapse_UserInstanceNode_ApiNode(\n"
"	uint32_t	   maxBoxNodeCount,\n"
"	uint32_t	   referenceCount,\n"
"	SceneHeader*   header,\n"
"	ApiNode*	   binaryNodes,\n"
"	ReferenceNode* references,\n"
"	BoxNode*	   boxNodes,\n"
"	uint3*		   taskQueue,\n"
"	uint32_t*	   referenceIndices )\n"
"{\n"
"	const uint32_t index = threadIdx.x + blockIdx.x * blockDim.x;\n"
"	Collapse<UserInstanceNode>(\n"
"		index, maxBoxNodeCount, referenceCount, header, binaryNodes, references, boxNodes, taskQueue, referenceIndices );\n"
"}\n"
"\n"
"extern \"C\" __global__ void Collapse_HwInstanceNode_ApiNode(\n"
"	uint32_t	   maxBoxNodeCount,\n"
"	uint32_t	   referenceCount,\n"
"	SceneHeader*   header,\n"
"	ApiNode*	   binaryNodes,\n"
"	ReferenceNode* references,\n"
"	BoxNode*	   boxNodes,\n"
"	uint3*		   taskQueue,\n"
"	uint32_t*	   referenceIndices )\n"
"{\n"
"	const uint32_t index = threadIdx.x + blockIdx.x * blockDim.x;\n"
"	Collapse<HwInstanceNode>(\n"
"		index, maxBoxNodeCount, referenceCount, header, binaryNodes, references, boxNodes, taskQueue, referenceIndices );\n"
"}\n"
"\n"
"__device__ void CompactTasks( uint32_t index, uint32_t taskCount, uint3* taskQueue, uint32_t* taskCounter )\n"
"{\n"
"	uint3 task	= make_uint3( InvalidValue );\n"
"	bool  valid = false;\n"
"	if ( index < taskCount )\n"
"	{\n"
"		task  = taskQueue[index];\n"
"		valid = task.z != InvalidValue && task.z > 0;\n"
"	}\n"
"	__syncthreads();\n"
"\n"
"	const uint32_t newIndex = warpOffset( valid, taskCounter );\n"
"\n"
"	if ( valid ) taskQueue[newIndex] = task;\n"
"}\n"
"\n"
"extern \"C\" __global__ void CompactTasks( uint32_t taskCount, uint3* taskQueue, uint32_t* taskCounter )\n"
"{\n"
"	__shared__ uint32_t newTaskCount;\n"
"	if ( threadIdx.x == 0 ) newTaskCount = 0;\n"
"	__syncthreads();\n"
"\n"
"	for ( uint32_t taskIndex = threadIdx.x; taskIndex < RoundUp( taskCount, blockDim.x ); taskIndex += blockDim.x )\n"
"	{\n"
"		CompactTasks( taskIndex, taskCount, taskQueue, &newTaskCount );\n"
"		__syncthreads();\n"
"	}\n"
"\n"
"	if ( threadIdx.x == 0 && blockIdx.x == 0 ) *taskCounter = newTaskCount;\n"
"}\n"
"\n"
"template <typename PrimitiveContainer, typename PrimitiveNode, typename Header>\n"
"__device__ void PackLeaves(\n"
"	uint32_t			index,\n"
"	uint32_t			taskCount,\n"
"	Header*				header,\n"
"	ReferenceNode*		references,\n"
"	BoxNode*			boxNodes,\n"
"	PrimitiveNode*		primNodes,\n"
"	PrimitiveContainer& primitives,\n"
"	uint3*				taskQueue,\n"
"	uint32_t*			referenceIndices )\n"
"{\n"
"	if ( index >= taskCount ) return;\n"
"\n"
"	uint3	 task			 = taskQueue[index];\n"
"	uint32_t referenceOffset = task.x;\n"
"	uint32_t nodeAddr		 = task.y;\n"
"	uint32_t leafCount		 = task.z;\n"
"	if ( leafCount == InvalidValue || leafCount == 0 ) return;\n"
"\n"
"	const uint32_t primNodeBase	  = atomicAdd( &header->m_primNodeCount, leafCount );\n"
"	uint32_t	   primNodeOffset = primNodeBase;\n"
"\n"
"	BoxNode& node = boxNodes[nodeAddr];\n"
"	node.setPrimNodeBase( primNodeBase );\n"
"	for ( uint32_t i = 0; i < node.getChildCount(); ++i )\n"
"	{\n"
"		uint32_t childIndex = node.getChildIndex( i );\n"
"		if ( isLeafNode( childIndex ) )\n"
"		{\n"
"			const uint32_t		referenceIndex = referenceIndices[referenceOffset++];\n"
"			const ReferenceNode reference	   = references[getNodeAddr( referenceIndex )];\n"
"			primNodes[primNodeOffset]		   = primitives.fetchPrimNode( reference.m_primIndex );\n"
"			childIndex						   = encodeNodeIndex( primNodeOffset, getNodeType( childIndex ) );\n"
"			node.patchChild( i, childIndex, 1 );\n"
"			primNodeOffset++;\n"
"		}\n"
"	}\n"
"}\n"
"\n"
"template <>\n"
"__device__ void PackLeaves<TriangleMesh, TrianglePacketNode, GeomHeader>(\n"
"	uint32_t			index,\n"
"	uint32_t			taskCount,\n"
"	GeomHeader*			header,\n"
"	ReferenceNode*		references,\n"
"	BoxNode*			boxNodes,\n"
"	TrianglePacketNode* primNodes,\n"
"	TriangleMesh&		primitives,\n"
"	uint3*				taskQueue,\n"
"	uint32_t*			referenceIndices )\n"
"{\n"
"	if ( index >= taskCount ) return;\n"
"\n"
"	uint3	 task		 = taskQueue[index];\n"
"	uint32_t rangeOffset = task.x;\n"
"	uint32_t nodeAddr	 = task.y;\n"
"	uint32_t rangeSize	 = task.z;\n"
"	if ( rangeSize == InvalidValue || rangeSize == 0 ) return;\n"
"\n"
"	TrianglePacketCache triPacketCache;\n"
"	TrianglePairOffsets triPairOffsetCache[BranchingFactor];\n"
"\n"
"	uint32_t		   primNodeCount = 1;\n"
"	TrianglePacketData packet{};\n"
"\n"
"	const uint32_t rangeBase = rangeOffset;\n"
"	while ( rangeOffset < rangeBase + rangeSize )\n"
"	{\n"
"		uint32_t referenceIndex = referenceIndices[rangeOffset];\n"
"		referenceIndex &= ~RangeStartBit;\n"
"		referenceIndex &= ~RangeEndBit;\n"
"\n"
"		// form triangle pair\n"
"		const ReferenceNode reference	= references[getNodeAddr( referenceIndex )];\n"
"		const uint2			pairIndices = primitives.fetchTrianglePairIndices( reference.m_primIndex );\n"
"		uint3				indices0	= primitives.fetchTriangleIndices( pairIndices.x );\n"
"		uint4				indices		= make_uint4( indices0, indices0.z );\n"
"\n"
"		if ( pairIndices.x != pairIndices.y )\n"
"		{\n"
"			uint3 indices1		= primitives.fetchTriangleIndices( pairIndices.y );\n"
"			uint3 vertexMapping = tryPairTriangles( indices0, indices1 );\n"
"\n"
"			uint32_t vertexIndex = 0;\n"
"			if ( vertexMapping.x == 3 ) vertexIndex = indices1.x;\n"
"			if ( vertexMapping.y == 3 ) vertexIndex = indices1.y;\n"
"			if ( vertexMapping.z == 3 ) vertexIndex = indices1.z;\n"
"			indices.w = vertexIndex;\n"
"		}\n"
"\n"
"		// find new vertices\n"
"		uint32_t newVertMask = 0;\n"
"		for ( uint32_t j = 0; j < 4; ++j )\n"
"		{\n"
"			if ( j == 3 && pairIndices.x == pairIndices.y ) break;\n"
"\n"
"			bool contains = false;\n"
"			for ( uint32_t k = 0; k < packet.m_vertCount; ++k )\n"
"			{\n"
"				if ( triPacketCache.m_vertexIndices[k] == ( &indices.x )[j] )\n"
"				{\n"
"					contains = true;\n"
"					break;\n"
"				}\n"
"			}\n"
"\n"
"			if ( !contains ) newVertMask |= 1 << j;\n"
"		}\n"
"\n"
"		const uint32_t oldVertCount = packet.m_vertCount;\n"
"		const uint32_t newVertCount = __popc( newVertMask );\n"
"\n"
"		// try to fit\n"
"		if ( !packet.tryAddTrianglePair( pairIndices.x, pairIndices.y, newVertCount ) )\n"
"		{\n"
"			primNodeCount++;\n"
"			packet = TrianglePacketData{};\n"
"			continue;\n"
"		}\n"
"\n"
"		// store new vertices\n"
"		for ( uint32_t j = 0; j < 4; ++j )\n"
"		{\n"
"			if ( j == 3 && pairIndices.x == pairIndices.y ) break;\n"
"\n"
"			bool contains = !( newVertMask & ( 1 << j ) );\n"
"			if ( !contains )\n"
"			{\n"
"				uint32_t vertexMask							= ( 1 << j ) - 1;\n"
"				uint32_t vertexIndex						= oldVertCount + __popc( newVertMask & vertexMask );\n"
"				triPacketCache.m_vertexIndices[vertexIndex] = ( &indices.x )[j];\n"
"			}\n"
"		}\n"
"\n"
"		rangeOffset++;\n"
"	}\n"
"\n"
"	const uint32_t primNodeBase	  = atomicAdd( &header->m_primNodeCount, primNodeCount );\n"
"	uint32_t	   primNodeOffset = primNodeBase;\n"
"\n"
"	uint32_t triPairOffset	 = 0;\n"
"	uint32_t triPacketOffset = 0;\n"
"	uint32_t leafIndex		 = 0;\n"
"\n"
"	packet		= TrianglePacketData{};\n"
"	rangeOffset = rangeBase;\n"
"\n"
"	while ( rangeOffset < rangeBase + rangeSize )\n"
"	{\n"
"		uint32_t   referenceIndex = referenceIndices[rangeOffset];\n"
"		const bool rangeStart	  = referenceIndex & RangeStartBit;\n"
"		const bool rangeEnd		  = referenceIndex & RangeEndBit;\n"
"		referenceIndex &= ~RangeStartBit;\n"
"		referenceIndex &= ~RangeEndBit;\n"
"\n"
"		// form triangle pair\n"
"		const ReferenceNode reference	= references[getNodeAddr( referenceIndex )];\n"
"		const uint2			pairIndices = primitives.fetchTrianglePairIndices( reference.m_primIndex );\n"
"		uint3				indices0	= primitives.fetchTriangleIndices( pairIndices.x );\n"
"		uint4				indices		= make_uint4( indices0, indices0.z );\n"
"\n"
"		uint3 vertexMapping{};\n"
"		if ( pairIndices.x != pairIndices.y )\n"
"		{\n"
"			uint3 indices1 = primitives.fetchTriangleIndices( pairIndices.y );\n"
"			vertexMapping  = tryPairTriangles( indices0, indices1 );\n"
"\n"
"			uint32_t vertexIndex = 0;\n"
"			if ( vertexMapping.x == 3 ) vertexIndex = indices1.x;\n"
"			if ( vertexMapping.y == 3 ) vertexIndex = indices1.y;\n"
"			if ( vertexMapping.z == 3 ) vertexIndex = indices1.z;\n"
"			indices.w = vertexIndex;\n"
"		}\n"
"\n"
"		// find new vertices\n"
"		uint32_t newVertMask = 0;\n"
"		uint4	 vertexIndicesInPacket{};\n"
"		for ( uint32_t j = 0; j < 4; ++j )\n"
"		{\n"
"			if ( j == 3 && pairIndices.x == pairIndices.y ) break;\n"
"\n"
"			bool contains = false;\n"
"			for ( uint32_t k = 0; k < packet.m_vertCount; ++k )\n"
"			{\n"
"				if ( triPacketCache.m_vertexIndices[k] == ( &indices.x )[j] )\n"
"				{\n"
"					( &vertexIndicesInPacket.x )[j] = k;\n"
"					contains						= true;\n"
"					break;\n"
"				}\n"
"			}\n"
"\n"
"			if ( !contains )\n"
"			{\n"
"				( &vertexIndicesInPacket.x )[j] = packet.m_vertCount + __popc( newVertMask );\n"
"				newVertMask |= 1 << j;\n"
"			}\n"
"		}\n"
"\n"
"		const uint32_t newVertCount = __popc( newVertMask );\n"
"\n"
"		// try to fit\n"
"		if ( !packet.tryAddTrianglePair( pairIndices.x, pairIndices.y, newVertCount ) )\n"
"		{\n"
"			// build packet\n"
"			TrianglePacketNode triPacketNode{};\n"
"\n"
"			// write header\n"
"			TrianglePacketHeader hdr = packet.buildHeader();\n"
"			triPacketNode.writeHeader( hdr );\n"
"\n"
"			// write indices & descriptors\n"
"			for ( uint32_t j = 0; j < packet.m_triPairCount; ++j )\n"
"			{\n"
"				// write indices\n"
"				triPacketNode.writePrimIndex( j, 0, hdr, triPacketCache.m_triPairData[j].m_pairIndices.x );\n"
"				triPacketNode.writePrimIndex( j, 1, hdr, triPacketCache.m_triPairData[j].m_pairIndices.y );\n"
"\n"
"				// write descriptor\n"
"				triPacketNode.writeDescriptor( j, triPacketCache.m_triPairData[j].m_descriptor );\n"
"			}\n"
"\n"
"			// write vertices\n"
"			for ( uint32_t j = 0; j < packet.m_vertCount; ++j )\n"
"			{\n"
"				const float3 vertex = primitives.fetchVertex( triPacketCache.m_vertexIndices[j] );\n"
"				triPacketNode.writeVertex( j, vertex );\n"
"			}\n"
"\n"
"			// write packet\n"
"			primNodes[primNodeOffset++] = triPacketNode;\n"
"\n"
"			packet = TrianglePacketData{};\n"
"			continue;\n"
"		}\n"
"\n"
"		if ( rangeStart )\n"
"		{\n"
"			triPairOffset	= packet.m_triPairCount - 1;\n"
"			triPacketOffset = primNodeOffset;\n"
"		}\n"
"\n"
"		if ( rangeEnd )\n"
"		{\n"
"			triPairOffsetCache[leafIndex++] = TrianglePairOffsets( triPairOffset, triPacketOffset );\n"
"		}\n"
"\n"
"		// store new vertices\n"
"		for ( uint32_t j = 0; j < 4; ++j )\n"
"		{\n"
"			if ( j == 3 && pairIndices.x == pairIndices.y ) break;\n"
"\n"
"			bool contains = !( newVertMask & ( 1 << j ) );\n"
"			if ( !contains )\n"
"			{\n"
"				uint32_t vertexIndexInPacket						= ( &vertexIndicesInPacket.x )[j];\n"
"				triPacketCache.m_vertexIndices[vertexIndexInPacket] = ( &indices.x )[j];\n"
"			}\n"
"		}\n"
"\n"
"		uint3 triIndices0 = make_uint3( vertexIndicesInPacket );\n"
"		uint3 triIndices1{};\n"
"		if ( pairIndices.x != pairIndices.y )\n"
"		{\n"
"			triIndices1.x = ( &vertexIndicesInPacket.x )[vertexMapping.x];\n"
"			triIndices1.y = ( &vertexIndicesInPacket.x )[vertexMapping.y];\n"
"			triIndices1.z = ( &vertexIndicesInPacket.x )[vertexMapping.z];\n"
"		}\n"
"\n"
"		// store triangle pair\n"
"		triPacketCache.m_triPairData[packet.m_triPairCount - 1] =\n"
"			TrianglePairData( pairIndices, triIndices0, triIndices1, rangeEnd );\n"
"\n"
"		rangeOffset++;\n"
"	}\n"
"\n"
"	// build packet\n"
"	{\n"
"		TrianglePacketNode triPacketNode{};\n"
"\n"
"		// write header\n"
"		TrianglePacketHeader hdr = packet.buildHeader();\n"
"		triPacketNode.writeHeader( hdr );\n"
"\n"
"		// write indices & descriptors\n"
"		for ( uint32_t j = 0; j < packet.m_triPairCount; ++j )\n"
"		{\n"
"			// write indices\n"
"			triPacketNode.writePrimIndex( j, 0, hdr, triPacketCache.m_triPairData[j].m_pairIndices.x );\n"
"			triPacketNode.writePrimIndex( j, 1, hdr, triPacketCache.m_triPairData[j].m_pairIndices.y );\n"
"\n"
"			// write descriptor\n"
"			triPacketNode.writeDescriptor( j, triPacketCache.m_triPairData[j].m_descriptor );\n"
"		}\n"
"\n"
"		// write vertices\n"
"		for ( uint32_t j = 0; j < packet.m_vertCount; ++j )\n"
"		{\n"
"			const float3 vertex = primitives.fetchVertex( triPacketCache.m_vertexIndices[j] );\n"
"			triPacketNode.writeVertex( j, vertex );\n"
"		}\n"
"\n"
"		// write packet\n"
"		primNodes[primNodeOffset++] = triPacketNode;\n"
"	}\n"
"\n"
"	// patch children\n"
"	BoxNode& node = boxNodes[nodeAddr];\n"
"	node.setPrimNodeBase( primNodeBase );\n"
"	const uint32_t leafCount = leafIndex;\n"
"	leafIndex				 = 0;\n"
"	for ( uint32_t i = 0; i < node.getChildCount(); ++i )\n"
"	{\n"
"		uint32_t childIndex = node.getChildIndex( i );\n"
"		if ( isLeafNode( childIndex ) )\n"
"		{\n"
"			uint32_t childType	   = triPairIndexToType( triPairOffsetCache[leafIndex].m_pairOffset );\n"
"			uint32_t childOffsset0 = triPairOffsetCache[leafIndex].m_packetOffset;\n"
"			uint32_t childOffsset1 = triPairOffsetCache[min( leafIndex + 1, leafCount - 1 )].m_packetOffset;\n"
"			uint32_t childRange	   = childOffsset1 - childOffsset0;\n"
"			node.patchChild( i, childType, childRange );\n"
"			leafIndex++;\n"
"		}\n"
"	}\n"
"}\n"
"\n"
"__device__ void PackLeavesWarp(\n"
"	uint32_t			index,\n"
"	uint32_t			taskCount,\n"
"	GeomHeader*			header,\n"
"	ReferenceNode*		references,\n"
"	BoxNode*			boxNodes,\n"
"	TrianglePacketNode* primNodes,\n"
"	TriangleMesh&		primitives,\n"
"	uint3*				taskQueue,\n"
"	uint32_t*			referenceIndices )\n"
"{\n"
"	if ( threadIdx.x >= WarpSize ) return;\n"
"\n"
"	constexpr uint32_t			   PacketTasksPerWarp = WarpSize / LanesPerLeafPacketTask;\n"
"	__shared__ TrianglePacketCache triPacketCache[PacketTasksPerWarp];\n"
"	__shared__ TrianglePairOffsets triPairOffsetCache[PacketTasksPerWarp][BranchingFactor];\n"
"\n"
"	const uint32_t laneIndex	= threadIdx.x % WarpSize;\n"
"	const uint32_t taskIndex	= index / LanesPerLeafPacketTask;\n"
"	const uint32_t subwarpIndex = laneIndex / LanesPerLeafPacketTask;\n"
"	const uint32_t sublaneIndex = laneIndex % LanesPerLeafPacketTask;\n"
"\n"
"	uint3	 task		 = taskQueue[min( taskIndex, taskCount - 1 )];\n"
"	uint32_t rangeOffset = task.x;\n"
"	uint32_t nodeAddr	 = task.y;\n"
"	uint32_t rangeSize	 = task.z;\n"
"\n"
"	uint32_t primNodeCount = 0;\n"
"\n"
"	const uint32_t rangeBase = rangeOffset;\n"
"	while ( hiprt::ballot( taskIndex < taskCount && rangeOffset < rangeBase + rangeSize ) )\n"
"	{\n"
"		TrianglePacketData packet{};\n"
"\n"
"		while ( rangeOffset < rangeBase + rangeSize && packet.m_triPairCount < MaxTrianglePairsPerTrianglePacket )\n"
"		{\n"
"			uint32_t referenceIndex = referenceIndices[rangeOffset];\n"
"			referenceIndex &= ~RangeStartBit;\n"
"			referenceIndex &= ~RangeEndBit;\n"
"\n"
"			// form triangle pair\n"
"			const ReferenceNode reference	= references[getNodeAddr( referenceIndex )];\n"
"			const uint2			pairIndices = primitives.fetchTrianglePairIndices( reference.m_primIndex );\n"
"			uint3				indices0	= primitives.fetchTriangleIndices( pairIndices.x );\n"
"			uint4				indices		= make_uint4( indices0, indices0.z );\n"
"\n"
"			if ( pairIndices.x != pairIndices.y )\n"
"			{\n"
"				uint3 indices1		= primitives.fetchTriangleIndices( pairIndices.y );\n"
"				uint3 vertexMapping = tryPairTriangles( indices0, indices1 );\n"
"\n"
"				uint32_t vertexIndex = 0;\n"
"				if ( vertexMapping.x == 3 ) vertexIndex = indices1.x;\n"
"				if ( vertexMapping.y == 3 ) vertexIndex = indices1.y;\n"
"				if ( vertexMapping.z == 3 ) vertexIndex = indices1.z;\n"
"				indices.w = vertexIndex;\n"
"			}\n"
"\n"
"			// find new vertices\n"
"			const bool valid = sublaneIndex < 3 || pairIndices.x != pairIndices.y;\n"
"\n"
"			bool	 contains = false;\n"
"			uint32_t vertexIndexInPacket{};\n"
"			for ( uint32_t k = 0; k < packet.m_vertCount; ++k )\n"
"			{\n"
"				if ( triPacketCache[subwarpIndex].m_vertexIndices[k] == ( &indices.x )[sublaneIndex] )\n"
"				{\n"
"					vertexIndexInPacket = k;\n"
"					contains			= true;\n"
"					break;\n"
"				}\n"
"			}\n"
"\n"
"			const uint32_t newVertMask =\n"
"				( hiprt::ballot( !contains && valid ) >> ( LanesPerLeafPacketTask * subwarpIndex ) ) & 0xf;\n"
"			const uint32_t oldVertCount = packet.m_vertCount;\n"
"			const uint32_t newVertCount = __popc( newVertMask );\n"
"\n"
"			// try to fit\n"
"			if ( !packet.tryAddTrianglePair( pairIndices.x, pairIndices.y, newVertCount ) ) break;\n"
"\n"
"			// store new vertices\n"
"			if ( !contains )\n"
"			{\n"
"				const uint32_t vertexMask = ( 1 << sublaneIndex ) - 1;\n"
"				vertexIndexInPacket		  = oldVertCount + __popc( newVertMask & vertexMask );\n"
"				triPacketCache[subwarpIndex].m_vertexIndices[vertexIndexInPacket] = ( &indices.x )[sublaneIndex];\n"
"			}\n"
"\n"
"			rangeOffset++;\n"
"\n"
"			// not sure why but this fence is needed on linux\n"
"			__threadfence_block();\n"
"		}\n"
"		__threadfence_block();\n"
"\n"
"		// count packets\n"
"		if ( taskIndex < taskCount && packet.m_triPairCount > 0 ) primNodeCount++;\n"
"	}\n"
"\n"
"	const uint32_t primNodeBase =\n"
"		warpOffset( sublaneIndex == LanesPerLeafPacketTask - 1 ? primNodeCount : 0u, &header->m_primNodeCount );\n"
"	uint32_t primNodeOffset = primNodeBase;\n"
"\n"
"	uint32_t triPairOffset	 = 0;\n"
"	uint32_t triPacketOffset = 0;\n"
"	uint32_t leafIndex		 = 0;\n"
"\n"
"	rangeOffset = rangeBase;\n"
"\n"
"	while ( hiprt::ballot( taskIndex < taskCount && rangeOffset < rangeBase + rangeSize ) )\n"
"	{\n"
"		TrianglePacketData packet{};\n"
"\n"
"		while ( rangeOffset < rangeBase + rangeSize && packet.m_triPairCount < MaxTrianglePairsPerTrianglePacket )\n"
"		{\n"
"			uint32_t   referenceIndex = referenceIndices[rangeOffset];\n"
"			const bool rangeStart	  = referenceIndex & RangeStartBit;\n"
"			const bool rangeEnd		  = referenceIndex & RangeEndBit;\n"
"			referenceIndex &= ~RangeStartBit;\n"
"			referenceIndex &= ~RangeEndBit;\n"
"\n"
"			// form triangle pair\n"
"			const ReferenceNode reference	= references[getNodeAddr( referenceIndex )];\n"
"			const uint2			pairIndices = primitives.fetchTrianglePairIndices( reference.m_primIndex );\n"
"			uint3				indices0	= primitives.fetchTriangleIndices( pairIndices.x );\n"
"			uint4				indices		= make_uint4( indices0, indices0.z );\n"
"\n"
"			uint3 vertexMapping{};\n"
"			if ( pairIndices.x != pairIndices.y )\n"
"			{\n"
"				uint3 indices1 = primitives.fetchTriangleIndices( pairIndices.y );\n"
"				vertexMapping  = tryPairTriangles( indices0, indices1 );\n"
"\n"
"				uint32_t vertexIndex = 0;\n"
"				if ( vertexMapping.x == 3 ) vertexIndex = indices1.x;\n"
"				if ( vertexMapping.y == 3 ) vertexIndex = indices1.y;\n"
"				if ( vertexMapping.z == 3 ) vertexIndex = indices1.z;\n"
"				indices.w = vertexIndex;\n"
"			}\n"
"\n"
"			// find new vertices\n"
"			const bool valid = sublaneIndex < 3 || pairIndices.x != pairIndices.y;\n"
"\n"
"			bool	 contains = false;\n"
"			uint32_t vertexIndexInPacket{};\n"
"			for ( uint32_t k = 0; k < packet.m_vertCount; ++k )\n"
"			{\n"
"				if ( triPacketCache[subwarpIndex].m_vertexIndices[k] == ( &indices.x )[sublaneIndex] )\n"
"				{\n"
"					vertexIndexInPacket = k;\n"
"					contains			= true;\n"
"					break;\n"
"				}\n"
"			}\n"
"\n"
"			const uint32_t newVertMask =\n"
"				( hiprt::ballot( !contains && valid ) >> ( LanesPerLeafPacketTask * subwarpIndex ) ) & 0xf;\n"
"			const uint32_t oldVertCount = packet.m_vertCount;\n"
"			const uint32_t newVertCount = __popc( newVertMask );\n"
"\n"
"			// try to fit\n"
"			if ( !packet.tryAddTrianglePair( pairIndices.x, pairIndices.y, newVertCount ) ) break;\n"
"\n"
"			if ( rangeStart )\n"
"			{\n"
"				triPairOffset	= packet.m_triPairCount - 1;\n"
"				triPacketOffset = primNodeOffset;\n"
"			}\n"
"\n"
"			if ( rangeEnd )\n"
"			{\n"
"				triPairOffsetCache[subwarpIndex][leafIndex++] = TrianglePairOffsets( triPairOffset, triPacketOffset );\n"
"			}\n"
"\n"
"			// store new vertices\n"
"			if ( !contains )\n"
"			{\n"
"				const uint32_t vertexMask = ( 1 << sublaneIndex ) - 1;\n"
"				vertexIndexInPacket		  = oldVertCount + __popc( newVertMask & vertexMask );\n"
"				triPacketCache[subwarpIndex].m_vertexIndices[vertexIndexInPacket] = ( &indices.x )[sublaneIndex];\n"
"			}\n"
"\n"
"			// shuffle vertex indices in packet\n"
"			uint4 vertexIndicesInPacket;\n"
"			vertexIndicesInPacket.x = shfl( vertexIndexInPacket, subwarpIndex * LanesPerLeafPacketTask + 0 );\n"
"			vertexIndicesInPacket.y = shfl( vertexIndexInPacket, subwarpIndex * LanesPerLeafPacketTask + 1 );\n"
"			vertexIndicesInPacket.z = shfl( vertexIndexInPacket, subwarpIndex * LanesPerLeafPacketTask + 2 );\n"
"			vertexIndicesInPacket.w = shfl( vertexIndexInPacket, subwarpIndex * LanesPerLeafPacketTask + 3 );\n"
"\n"
"			uint3 triIndices0 = make_uint3( vertexIndicesInPacket );\n"
"			uint3 triIndices1{};\n"
"			if ( pairIndices.x != pairIndices.y )\n"
"			{\n"
"				triIndices1.x = ( &vertexIndicesInPacket.x )[vertexMapping.x];\n"
"				triIndices1.y = ( &vertexIndicesInPacket.x )[vertexMapping.y];\n"
"				triIndices1.z = ( &vertexIndicesInPacket.x )[vertexMapping.z];\n"
"			}\n"
"\n"
"			// store triangle pair\n"
"			if ( sublaneIndex == 0 )\n"
"				triPacketCache[subwarpIndex].m_triPairData[packet.m_triPairCount - 1] =\n"
"					TrianglePairData( pairIndices, triIndices0, triIndices1, rangeEnd );\n"
"\n"
"			rangeOffset++;\n"
"			// not sure why but this fixes the issue on linux\n"
"			// otherwise triIndices1 are not correcly written to the final packet\n"
"			__threadfence_block();\n"
"		}\n"
"		__threadfence_block();\n"
"\n"
"		// build packets\n"
"		uint64_t packetMask = hiprt::ballot( taskIndex < taskCount && packet.m_triPairCount > 0 && sublaneIndex == 0 );\n"
"		while ( packetMask )\n"
"		{\n"
"			const uint32_t halfWarpIndex = laneIndex / 16;\n"
"			const uint32_t halfLaneIndex = laneIndex % 16;\n"
"\n"
"			const uint32_t broadcastLane0 = __ffsll( static_cast<unsigned long long>( packetMask ) ) - 1;\n"
"			packetMask ^= 1 << broadcastLane0;\n"
"\n"
"			const uint32_t broadcastLane1 = __ffsll( static_cast<unsigned long long>( packetMask ) ) - 1;\n"
"			const bool	   secondValid	  = packetMask != 0;\n"
"			if ( secondValid ) packetMask ^= 1 << broadcastLane1;\n"
"\n"
"			const uint32_t			 broadcastLane			 = ( halfWarpIndex == 0 ) ? broadcastLane0 : broadcastLane1;\n"
"			const uint32_t			 broadcastSubwarpIndex	 = broadcastLane / LanesPerLeafPacketTask;\n"
"			const uint32_t			 broadcastPrimNodeOffset = shfl( primNodeOffset, broadcastLane );\n"
"			const TrianglePacketData broadcastPacket		 = packet.shuffle( broadcastLane );\n"
"\n"
"			// store current packet data to registers\n"
"			TrianglePairData halfLaneTriPairData;\n"
"			uint32_t		 halfLaneVertexIndex;\n"
"			if ( halfWarpIndex == 0 || secondValid )\n"
"			{\n"
"				if ( halfLaneIndex < 2 * broadcastPacket.m_triPairCount )\n"
"					halfLaneTriPairData = triPacketCache[broadcastSubwarpIndex].m_triPairData[halfLaneIndex / 2];\n"
"				if ( halfLaneIndex < broadcastPacket.m_vertCount )\n"
"					halfLaneVertexIndex = triPacketCache[broadcastSubwarpIndex].m_vertexIndices[halfLaneIndex];\n"
"			}\n"
"\n"
"			// reuse shared memory\n"
"			TrianglePacketNode& triPacketNode =\n"
"				*reinterpret_cast<TrianglePacketNode*>( &triPacketCache[broadcastSubwarpIndex] );\n"
"			if ( halfWarpIndex == 0 || secondValid )\n"
"			{\n"
"				triPacketNode.m_data[halfLaneIndex + 0 * 16] = 0;\n"
"				triPacketNode.m_data[halfLaneIndex + 1 * 16] = 0;\n"
"			}\n"
"\n"
"			// build two packets at once\n"
"			if ( halfWarpIndex == 0 || secondValid )\n"
"			{\n"
"				// write header\n"
"				TrianglePacketHeader hdr = broadcastPacket.buildHeader();\n"
"				if ( halfLaneIndex == 0 ) triPacketNode.writeHeader<true>( hdr );\n"
"\n"
"				// write indices & descriptors\n"
"				if ( halfLaneIndex < 2 * broadcastPacket.m_triPairCount )\n"
"				{\n"
"					// write indices\n"
"					triPacketNode.writePrimIndex<true>(\n"
"						halfLaneIndex / 2,\n"
"						halfLaneIndex % 2,\n"
"						hdr,\n"
"						( &halfLaneTriPairData.m_pairIndices.x )[halfLaneIndex % 2] );\n"
"\n"
"					// write descriptors\n"
"					if ( halfLaneIndex % 2 == 0 )\n"
"						triPacketNode.writeDescriptor<true>( halfLaneIndex / 2, halfLaneTriPairData.m_descriptor );\n"
"				}\n"
"\n"
"				// write vertices\n"
"				if ( halfLaneIndex < broadcastPacket.m_vertCount )\n"
"				{\n"
"					const float3 vertex = primitives.fetchVertex( halfLaneVertexIndex );\n"
"					triPacketNode.writeVertex<true>( halfLaneIndex, vertex );\n"
"				}\n"
"			}\n"
"\n"
"			// write packet\n"
"			if ( ( halfWarpIndex == 0 || secondValid ) )\n"
"			{\n"
"				primNodes[broadcastPrimNodeOffset].m_data[halfLaneIndex + 0 * 16] =\n"
"					triPacketNode.m_data[halfLaneIndex + 0 * 16];\n"
"				primNodes[broadcastPrimNodeOffset].m_data[halfLaneIndex + 1 * 16] =\n"
"					triPacketNode.m_data[halfLaneIndex + 1 * 16];\n"
"			}\n"
"		}\n"
"\n"
"		if ( taskIndex < taskCount && packet.m_triPairCount > 0 ) primNodeOffset++;\n"
"	}\n"
"\n"
"	// patch children\n"
"	if ( taskIndex < taskCount )\n"
"	{\n"
"		BoxNode& node = boxNodes[nodeAddr];\n"
"		node.setPrimNodeBase( primNodeBase );\n"
"		const uint32_t leafCount = leafIndex;\n"
"		leafIndex				 = 0;\n"
"		for ( uint32_t i = 0; i < node.getChildCount(); ++i )\n"
"		{\n"
"			uint32_t childIndex = node.getChildIndex( i );\n"
"			if ( isLeafNode( childIndex ) )\n"
"			{\n"
"				uint32_t childType	   = triPairIndexToType( triPairOffsetCache[subwarpIndex][leafIndex].m_pairOffset );\n"
"				uint32_t childOffsset0 = triPairOffsetCache[subwarpIndex][leafIndex].m_packetOffset;\n"
"				uint32_t childOffsset1 = triPairOffsetCache[subwarpIndex][min( leafIndex + 1, leafCount - 1 )].m_packetOffset;\n"
"				uint32_t childRange	   = childOffsset1 - childOffsset0;\n"
"				node.patchChild( i, childType, childRange );\n"
"				leafIndex++;\n"
"			}\n"
"		}\n"
"	}\n"
"}\n"
"\n"
"extern \"C\" __global__ void __launch_bounds__( WarpSize ) PackLeaves_TriangleMesh_TrianglePacketNode(\n"
"	uint32_t			taskCount,\n"
"	GeomHeader*			header,\n"
"	ReferenceNode*		references,\n"
"	BoxNode*			boxNodes,\n"
"	TrianglePacketNode* primNodes,\n"
"	TriangleMesh		primitives,\n"
"	uint3*				taskQueue,\n"
"	uint32_t*			referenceIndices )\n"
"{\n"
"	const uint32_t index = threadIdx.x + blockIdx.x * blockDim.x;\n"
"	PackLeavesWarp( index, taskCount, header, references, boxNodes, primNodes, primitives, taskQueue, referenceIndices );\n"
"}\n"
"\n"
"extern \"C\" __global__ void PackLeaves_TriangleMesh_TrianglePairNode(\n"
"	uint32_t	   taskCount,\n"
"	GeomHeader*	   header,\n"
"	ReferenceNode* references,\n"
"	BoxNode*	   boxNodes,\n"
"	TriangleNode*  primNodes,\n"
"	TriangleMesh   primitives,\n"
"	uint3*		   taskQueue,\n"
"	uint32_t*	   referenceIndices )\n"
"{\n"
"	const uint32_t index = threadIdx.x + blockIdx.x * blockDim.x;\n"
"	PackLeaves( index, taskCount, header, references, boxNodes, primNodes, primitives, taskQueue, referenceIndices );\n"
"}\n"
"\n"
"extern \"C\" __global__ void PackLeaves_AabbList_CustomNode(\n"
"	uint32_t	   taskCount,\n"
"	GeomHeader*	   header,\n"
"	ReferenceNode* references,\n"
"	BoxNode*	   boxNodes,\n"
"	CustomNode*	   primNodes,\n"
"	AabbList	   primitives,\n"
"	uint3*		   taskQueue,\n"
"	uint32_t*	   referenceIndices )\n"
"{\n"
"	const uint32_t index = threadIdx.x + blockIdx.x * blockDim.x;\n"
"	PackLeaves( index, taskCount, header, references, boxNodes, primNodes, primitives, taskQueue, referenceIndices );\n"
"}\n"
"\n"
"extern \"C\" __global__ void PackLeaves_InstanceList_hiprtFrameSRT_UserInstanceNode(\n"
"	uint32_t					taskCount,\n"
"	SceneHeader*				header,\n"
"	ReferenceNode*				references,\n"
"	BoxNode*					boxNodes,\n"
"	InstanceNode*				primNodes,\n"
"	InstanceList<hiprtFrameSRT> primitives,\n"
"	uint3*						taskQueue,\n"
"	uint32_t*					referenceIndices )\n"
"{\n"
"	const uint32_t index = threadIdx.x + blockIdx.x * blockDim.x;\n"
"	PackLeaves( index, taskCount, header, references, boxNodes, primNodes, primitives, taskQueue, referenceIndices );\n"
"}\n"
"\n"
"extern \"C\" __global__ void PackLeaves_InstanceList_hiprtFrameSRT_HwInstanceNode(\n"
"	uint32_t					taskCount,\n"
"	SceneHeader*				header,\n"
"	ReferenceNode*				references,\n"
"	BoxNode*					boxNodes,\n"
"	InstanceNode*				primNodes,\n"
"	InstanceList<hiprtFrameSRT> primitives,\n"
"	uint3*						taskQueue,\n"
"	uint32_t*					referenceIndices )\n"
"{\n"
"	const uint32_t index = threadIdx.x + blockIdx.x * blockDim.x;\n"
"	PackLeaves( index, taskCount, header, references, boxNodes, primNodes, primitives, taskQueue, referenceIndices );\n"
"}\n"
"\n"
"extern \"C\" __global__ void PackLeaves_InstanceList_hiprtFrameMatrix_UserInstanceNode(\n"
"	uint32_t					   taskCount,\n"
"	SceneHeader*				   header,\n"
"	ReferenceNode*				   references,\n"
"	BoxNode*					   boxNodes,\n"
"	InstanceNode*				   primNodes,\n"
"	InstanceList<hiprtFrameMatrix> primitives,\n"
"	uint3*						   taskQueue,\n"
"	uint32_t*					   referenceIndices )\n"
"{\n"
"	const uint32_t index = threadIdx.x + blockIdx.x * blockDim.x;\n"
"	PackLeaves( index, taskCount, header, references, boxNodes, primNodes, primitives, taskQueue, referenceIndices );\n"
"}\n"
"\n"
"extern \"C\" __global__ void PackLeaves_InstanceList_hiprtFrameMatrix_HwInstanceNode(\n"
"	uint32_t					   taskCount,\n"
"	SceneHeader*				   header,\n"
"	ReferenceNode*				   references,\n"
"	BoxNode*					   boxNodes,\n"
"	InstanceNode*				   primNodes,\n"
"	InstanceList<hiprtFrameMatrix> primitives,\n"
"	uint3*						   taskQueue,\n"
"	uint32_t*					   referenceIndices )\n"
"{\n"
"	const uint32_t index = threadIdx.x + blockIdx.x * blockDim.x;\n"
"	PackLeaves( index, taskCount, header, references, boxNodes, primNodes, primitives, taskQueue, referenceIndices );\n"
"}\n"
"\n"
"template <uint32_t LeafType>\n"
"__device__ void PatchApiNodes( uint32_t index, uint32_t nodeCount, ApiNode* apiNodes )\n"
"{\n"
"	if ( index < nodeCount )\n"
"	{\n"
"		ApiNode& node = apiNodes[index];\n"
"		for ( uint32_t j = 0; j < 2; ++j )\n"
"		{\n"
"			if ( node.m_childTypes[j] == hiprtBvhNodeTypeLeaf )\n"
"				node.m_childTypes[j] = LeafType;\n"
"			else\n"
"				node.m_childTypes[j] = BoxType;\n"
"		}\n"
"	}\n"
"}\n"
"\n"
"extern \"C\" __global__ void PatchApiNodes_TriangleMesh( uint32_t nodeCount, ApiNode* apiNodes )\n"
"{\n"
"	const uint32_t index = threadIdx.x + blockIdx.x * blockDim.x;\n"
"	PatchApiNodes<TriangleType>( index, nodeCount, apiNodes );\n"
"}\n"
"\n"
"extern \"C\" __global__ void PatchApiNodes_AabbList( uint32_t nodeCount, ApiNode* apiNodes )\n"
"{\n"
"	const uint32_t index = threadIdx.x + blockIdx.x * blockDim.x;\n"
"	PatchApiNodes<CustomType>( index, nodeCount, apiNodes );\n"
"}\n"
"\n"
"extern \"C\" __global__ void PatchApiNodes_InstanceList_hiprtFrameSRT( uint32_t nodeCount, ApiNode* apiNodes )\n"
"{\n"
"	const uint32_t index = threadIdx.x + blockIdx.x * blockDim.x;\n"
"	PatchApiNodes<InstanceType>( index, nodeCount, apiNodes );\n"
"}\n"
"\n"
"extern \"C\" __global__ void PatchApiNodes_InstanceList_hiprtFrameMatrix( uint32_t nodeCount, ApiNode* apiNodes )\n"
"{\n"
"	const uint32_t index = threadIdx.x + blockIdx.x * blockDim.x;\n"
"	PatchApiNodes<InstanceType>( index, nodeCount, apiNodes );\n"
"}\n"
"\n"
"extern \"C\" __global__ void __launch_bounds__( BvhBuilderReductionBlockSize )\n"
"	ComputeCost( uint32_t nodeCount, BoxNode* boxNodes, float* costCounter )\n"
"{\n"
"	const uint32_t index = blockIdx.x * blockDim.x + threadIdx.x;\n"
"\n"
"	float cost = 0.0f;\n"
"	if ( index < nodeCount )\n"
"	{\n"
"		float rootAreaInv = 1.0f / boxNodes[0].area();\n"
"\n"
"		for ( uint32_t i = 0; i < boxNodes[index].getChildCount(); ++i )\n"
"			cost += ( boxNodes[index].getChildType( i ) == BoxType ? Ct : Ci ) * boxNodes[index].getChildBox( i ).area() *\n"
"					rootAreaInv;\n"
"\n"
"		if ( index == 0 ) cost += Ct;\n"
"	}\n"
"\n"
"	constexpr uint32_t WarpsPerBlock = DivideRoundUp( BvhBuilderReductionBlockSize, WarpSize );\n"
"	__shared__ float   costCache[WarpsPerBlock];\n"
"\n"
"	float blockCost = blockSum( cost, costCache );\n"
"	if ( threadIdx.x == 0 ) atomicAdd( costCounter, blockCost );\n"
"}\n"
;
static const char* hip_LbvhBuilderKernels= \
"\n"
"#pragma once\n"
"#include <hiprt/hiprt_common.h>\n"
"#include <hiprt/hiprt_vec.h>\n"
"#include <hiprt/hiprt_math.h>\n"
"#include <hiprt/impl/Aabb.h>\n"
"#include <hiprt/impl/AabbList.h>\n"
"#include <hiprt/impl/Triangle.h>\n"
"#include <hiprt/impl/BvhNode.h>\n"
"#include <hiprt/impl/Header.h>\n"
"#include <hiprt/impl/QrDecomposition.h>\n"
"#include <hiprt/impl/Quaternion.h>\n"
"#include <hiprt/impl/Transform.h>\n"
"#include <hiprt/impl/InstanceList.h>\n"
"#include <hiprt/impl/MortonCode.h>\n"
"#include <hiprt/impl/TriangleMesh.h>\n"
"#include <hiprt/impl/BvhConfig.h>\n"
"\n"
"using namespace hiprt;\n"
"\n"
"HIPRT_DEVICE uint32_t findParent(\n"
"	const uint32_t	nodeAddr,\n"
"	const uint32_t	nodeType,\n"
"	const uint32_t	i,\n"
"	const uint32_t	j,\n"
"	const uint32_t	n,\n"
"	ScratchNode*	scratchNodes,\n"
"	const uint32_t* sortedMortonCodeKeys )\n"
"{\n"
"	if ( i == 0 && j == n ) return InvalidValue;\n"
"	if ( i == 0 || ( j != n && findHighestDifferentBit( j - 1, j, n, sortedMortonCodeKeys ) <\n"
"								   findHighestDifferentBit( i - 1, i, n, sortedMortonCodeKeys ) ) )\n"
"	{\n"
"		scratchNodes[j - 1].m_childIndex0 = encodeNodeIndex( nodeAddr, nodeType );\n"
"		return j - 1;\n"
"	}\n"
"	else\n"
"	{\n"
"		scratchNodes[i - 1].m_childIndex1 = encodeNodeIndex( nodeAddr, nodeType );\n"
"		return i - 1;\n"
"	}\n"
"}\n"
"\n"
"template <typename PrimitiveContainer>\n"
"__device__ void EmitTopologyAndFitBounds(\n"
"	uint32_t			index,\n"
"	const uint32_t*		sortedMortonCodeKeys,\n"
"	const uint32_t*		sortedMortonCodeValues,\n"
"	uint32_t*			updateCounters,\n"
"	PrimitiveContainer& primitives,\n"
"	ScratchNode*		scratchNodes,\n"
"	ReferenceNode*		references )\n"
"{\n"
"	const uint32_t primCount = primitives.getCount();\n"
"	uint32_t	   i		 = index;\n"
"	uint32_t	   j		 = i + 1;\n"
"	uint32_t	   k;\n"
"\n"
"	if ( index >= primCount ) return;\n"
"\n"
"	const uint32_t leafType = []() {\n"
"		if constexpr ( is_same<PrimitiveContainer, TriangleMesh>::value )\n"
"			return TriangleType;\n"
"		else if constexpr ( is_same<PrimitiveContainer, AabbList>::value )\n"
"			return CustomType;\n"
"		else if constexpr (\n"
"			is_same<PrimitiveContainer, InstanceList<hiprtFrameSRT>>::value ||\n"
"			is_same<PrimitiveContainer, InstanceList<hiprtFrameMatrix>>::value )\n"
"			return InstanceType;\n"
"	}();\n"
"\n"
"	const uint32_t primIndex = sortedMortonCodeValues[index];\n"
"	references[index]		 = ReferenceNode( primIndex, primitives.fetchAabb( primIndex ) );\n"
"\n"
"	uint32_t parentAddr = findParent( index, leafType, i, j, primCount, scratchNodes, sortedMortonCodeKeys );\n"
"	index				= parentAddr;\n"
"	while ( ( k = atomicExch( &updateCounters[index], parentAddr == i - 1 ? j : i ) ) != InvalidValue )\n"
"	{\n"
"		__threadfence();\n"
"\n"
"		ScratchNode& node = scratchNodes[index];\n"
"		if ( parentAddr == i - 1 )\n"
"			i = k;\n"
"		else\n"
"			j = k;\n"
"\n"
"		Aabb box;\n"
"		if ( node.getChildType( 0 ) != BoxType )\n"
"			box.grow( references[node.getChildAddr( 0 )].aabb() );\n"
"		else\n"
"			box.grow( scratchNodes[node.getChildAddr( 0 )].aabb() );\n"
"\n"
"		if ( node.getChildType( 1 ) != BoxType )\n"
"			box.grow( references[node.getChildAddr( 1 )].aabb() );\n"
"		else\n"
"			box.grow( scratchNodes[node.getChildAddr( 1 )].aabb() );\n"
"\n"
"		parentAddr = findParent( index, BoxType, i, j, primCount, scratchNodes, sortedMortonCodeKeys );\n"
"		node.m_box = box;\n"
"\n"
"		if ( parentAddr == InvalidValue )\n"
"		{\n"
"			updateCounters[primCount - 1] = index; // save root index\n"
"			break;\n"
"		}\n"
"\n"
"		index = parentAddr;\n"
"\n"
"		__threadfence();\n"
"	}\n"
"}\n"
"\n"
"extern \"C\" __global__ void EmitTopologyAndFitBounds_TriangleMesh(\n"
"	const uint32_t* sortedMortonCodeKeys,\n"
"	const uint32_t* sortedMortonCodeValues,\n"
"	uint32_t*		updateCounters,\n"
"	TriangleMesh	primitives,\n"
"	ScratchNode*	scratchNodes,\n"
"	ReferenceNode*	references )\n"
"{\n"
"	const uint32_t index = threadIdx.x + blockIdx.x * blockDim.x;\n"
"	EmitTopologyAndFitBounds<TriangleMesh>(\n"
"		index, sortedMortonCodeKeys, sortedMortonCodeValues, updateCounters, primitives, scratchNodes, references );\n"
"}\n"
"\n"
"extern \"C\" __global__ void EmitTopologyAndFitBounds_AabbList(\n"
"	const uint32_t* sortedMortonCodeKeys,\n"
"	const uint32_t* sortedMortonCodeValues,\n"
"	uint32_t*		updateCounters,\n"
"	AabbList		primitives,\n"
"	ScratchNode*	scratchNodes,\n"
"	ReferenceNode*	references )\n"
"{\n"
"	const uint32_t index = threadIdx.x + blockIdx.x * blockDim.x;\n"
"	EmitTopologyAndFitBounds<AabbList>(\n"
"		index, sortedMortonCodeKeys, sortedMortonCodeValues, updateCounters, primitives, scratchNodes, references );\n"
"}\n"
"\n"
"extern \"C\" __global__ void EmitTopologyAndFitBounds_InstanceList_hiprtFrameSRT(\n"
"	const uint32_t*				sortedMortonCodeKeys,\n"
"	const uint32_t*				sortedMortonCodeValues,\n"
"	uint32_t*					updateCounters,\n"
"	InstanceList<hiprtFrameSRT> primitives,\n"
"	ScratchNode*				scratchNodes,\n"
"	ReferenceNode*				references )\n"
"{\n"
"	const uint32_t index = threadIdx.x + blockIdx.x * blockDim.x;\n"
"	EmitTopologyAndFitBounds<InstanceList<hiprtFrameSRT>>(\n"
"		index, sortedMortonCodeKeys, sortedMortonCodeValues, updateCounters, primitives, scratchNodes, references );\n"
"}\n"
"\n"
"extern \"C\" __global__ void EmitTopologyAndFitBounds_InstanceList_hiprtFrameMatrix(\n"
"	const uint32_t*				   sortedMortonCodeKeys,\n"
"	const uint32_t*				   sortedMortonCodeValues,\n"
"	uint32_t*					   updateCounters,\n"
"	InstanceList<hiprtFrameMatrix> primitives,\n"
"	ScratchNode*				   scratchNodes,\n"
"	ReferenceNode*				   references )\n"
"{\n"
"	const uint32_t index = threadIdx.x + blockIdx.x * blockDim.x;\n"
"	EmitTopologyAndFitBounds<InstanceList<hiprtFrameMatrix>>(\n"
"		index, sortedMortonCodeKeys, sortedMortonCodeValues, updateCounters, primitives, scratchNodes, references );\n"
"}\n"
;
static const char* hip_PlocBuilderKernels= \
"\n"
"#pragma once\n"
"#include <hiprt/hiprt_common.h>\n"
"#include <hiprt/hiprt_vec.h>\n"
"#include <hiprt/hiprt_math.h>\n"
"#include <hiprt/impl/Aabb.h>\n"
"#include <hiprt/impl/AabbList.h>\n"
"#include <hiprt/impl/Triangle.h>\n"
"#include <hiprt/impl/BvhNode.h>\n"
"#include <hiprt/impl/BvhBuilderUtil.h>\n"
"#include <hiprt/impl/Header.h>\n"
"#include <hiprt/impl/QrDecomposition.h>\n"
"#include <hiprt/impl/Quaternion.h>\n"
"#include <hiprt/impl/Transform.h>\n"
"#include <hiprt/impl/InstanceList.h>\n"
"#include <hiprt/impl/MortonCode.h>\n"
"#include <hiprt/impl/TriangleMesh.h>\n"
"#include <hiprt/impl/BvhConfig.h>\n"
"using namespace hiprt;\n"
"\n"
"HIPRT_DEVICE uint32_t findParent( uint32_t i, uint32_t j, uint32_t n, const uint32_t* sortedMortonCodeKeys )\n"
"{\n"
"	if ( i == 0 && j == n ) return InvalidValue;\n"
"	if ( i == 0 || ( j != n && findHighestDifferentBit( j - 1, j, n, sortedMortonCodeKeys ) <\n"
"								   findHighestDifferentBit( i - 1, i, n, sortedMortonCodeKeys ) ) )\n"
"		return j - 1;\n"
"	else\n"
"		return i - 1;\n"
"}\n"
"\n"
"__device__ __forceinline__ uint32_t encodeOffset( const int32_t threadIndex, const int32_t neighbourIndex )\n"
"{\n"
"	const int32_t  sOffset = neighbourIndex - threadIndex;\n"
"	const uint32_t uOffset = abs( sOffset ) - 1;\n"
"	return ( uOffset << 1 ) | ( sOffset < 0 ? ( threadIndex & 1 ) ^ 1 : threadIndex & 1 );\n"
"}\n"
"\n"
"__device__ __forceinline__ int32_t decodeOffset( const int32_t threadIndex, const uint32_t offset )\n"
"{\n"
"	const uint32_t off = ( offset >> 1 ) + 1;\n"
"	return threadIndex + ( ( offset ^ threadIndex ) & 1 ? -static_cast<int32_t>( off ) : static_cast<int32_t>( off ) );\n"
"}\n"
"\n"
"template <typename PrimitiveContainer>\n"
"__device__ void SetupClusters(\n"
"	PrimitiveContainer& primitives, ReferenceNode* references, const uint32_t* sortedMortonCodeValues, uint32_t* nodeIndices )\n"
"{\n"
"	const uint32_t index = threadIdx.x + blockIdx.x * blockDim.x;\n"
"	if ( index >= primitives.getCount() ) return;\n"
"\n"
"	const uint32_t leafType = []() {\n"
"		if constexpr ( is_same<PrimitiveContainer, TriangleMesh>::value )\n"
"			return TriangleType;\n"
"		else if constexpr ( is_same<PrimitiveContainer, AabbList>::value )\n"
"			return CustomType;\n"
"		else if constexpr (\n"
"			is_same<PrimitiveContainer, InstanceList<hiprtFrameSRT>>::value ||\n"
"			is_same<PrimitiveContainer, InstanceList<hiprtFrameMatrix>>::value )\n"
"			return InstanceType;\n"
"	}();\n"
"\n"
"	const uint32_t primIndex = sortedMortonCodeValues[index];\n"
"	references[index]		 = ReferenceNode( primIndex, primitives.fetchAabb( primIndex ) );\n"
"	nodeIndices[index]		 = encodeNodeIndex( index, leafType );\n"
"}\n"
"\n"
"extern \"C\" __global__ void SetupClusters_TriangleMesh(\n"
"	TriangleMesh primitives, ReferenceNode* references, const uint32_t* sortedMortonCodeValues, uint32_t* nodeIndices )\n"
"{\n"
"	SetupClusters<TriangleMesh>( primitives, references, sortedMortonCodeValues, nodeIndices );\n"
"}\n"
"\n"
"extern \"C\" __global__ void SetupClusters_AabbList(\n"
"	AabbList primitives, ReferenceNode* references, const uint32_t* sortedMortonCodeValues, uint32_t* nodeIndices )\n"
"{\n"
"	SetupClusters<AabbList>( primitives, references, sortedMortonCodeValues, nodeIndices );\n"
"}\n"
"\n"
"extern \"C\" __global__ void SetupClusters_InstanceList_hiprtFrameSRT(\n"
"	InstanceList<hiprtFrameSRT> primitives,\n"
"	ReferenceNode*				references,\n"
"	const uint32_t*				sortedMortonCodeValues,\n"
"	uint32_t*					nodeIndices )\n"
"{\n"
"	SetupClusters<InstanceList<hiprtFrameSRT>>( primitives, references, sortedMortonCodeValues, nodeIndices );\n"
"}\n"
"\n"
"extern \"C\" __global__ void SetupClusters_InstanceList_hiprtFrameMatrix(\n"
"	InstanceList<hiprtFrameMatrix> primitives,\n"
"	ReferenceNode*				   references,\n"
"	const uint32_t*				   sortedMortonCodeValues,\n"
"	uint32_t*					   nodeIndices )\n"
"{\n"
"	SetupClusters<InstanceList<hiprtFrameMatrix>>( primitives, references, sortedMortonCodeValues, nodeIndices );\n"
"}\n"
"\n"
"extern \"C\" __global__ void __launch_bounds__( PlocMainBlockSize ) HPloc(\n"
"	uint32_t		primCount,\n"
"	const uint32_t* sortedMortonCodeKeys,\n"
"	uint32_t*		updateCounters,\n"
"	uint32_t*		nodeIndices,\n"
"	ScratchNode*	scratchNodes,\n"
"	ReferenceNode*	references,\n"
"	uint32_t*		nodeCounter )\n"
"{\n"
"	const uint32_t index	 = blockDim.x * blockIdx.x + threadIdx.x;\n"
"	const uint32_t laneIndex = threadIdx.x & ( WarpSize - 1 );\n"
"	const uint32_t warpIndex = threadIdx.x >> Log2( WarpSize );\n"
"\n"
"	alignas( alignof( Aabb ) ) __shared__ uint8_t boxesCache[sizeof( Aabb ) * PlocMainBlockSize];\n"
"	__shared__ uint32_t							  distanceOffsetsBlock[PlocMainBlockSize];\n"
"	__shared__ uint32_t							  nodeIndicesBlock[PlocMainBlockSize];\n"
"\n"
"	Aabb*	  boxesBlock		  = reinterpret_cast<Aabb*>( boxesCache );\n"
"	Aabb*	  boxesWarp			  = &boxesBlock[warpIndex * WarpSize];\n"
"	uint32_t* distanceOffsetsWarp = &distanceOffsetsBlock[warpIndex * WarpSize];\n"
"	uint32_t* nodeIndicesWarp	  = &nodeIndicesBlock[warpIndex * WarpSize];\n"
"\n"
"	uint32_t i = index;\n"
"	uint32_t j = i + 1;\n"
"	uint32_t k, s;\n"
"\n"
"	bool active = index < primCount;\n"
"\n"
"	while ( hiprt::ballot( active ) != 0 )\n"
"	{\n"
"		__threadfence();\n"
"\n"
"		if ( active )\n"
"		{\n"
"			uint32_t parentAddr = findParent( i, j, primCount, sortedMortonCodeKeys );\n"
"			if ( parentAddr == i - 1 )\n"
"			{\n"
"				k = atomicExch( &updateCounters[parentAddr], j );\n"
"				s = i;\n"
"				i = k;\n"
"			}\n"
"			else\n"
"			{\n"
"				k = atomicExch( &updateCounters[parentAddr], i );\n"
"				s = j;\n"
"				j = k;\n"
"			}\n"
"\n"
"			if ( k == InvalidValue ) active = false;\n"
"		}\n"
"\n"
"		__threadfence();\n"
"\n"
"		const uint32_t size	 = j - i;\n"
"		const bool	   last	 = active && size == primCount;\n"
"		uint64_t	   merge = hiprt::ballot( ( active && size > WarpSize / 2 ) || last );\n"
"\n"
"		while ( merge )\n"
"		{\n"
"			const uint32_t currentLane = __ffsll( static_cast<unsigned long long>( merge ) ) - 1;\n"
"			merge &= merge - 1;\n"
"\n"
"			const uint32_t current_i   = shfl( i, currentLane );\n"
"			const uint32_t current_j   = shfl( j, currentLane );\n"
"			const uint32_t current_s   = shfl( s, currentLane );\n"
"			const bool	   currentLast = shfl( last, currentLane );\n"
"\n"
"			uint32_t numLeft  = min( current_s - current_i, WarpSize / 2 );\n"
"			uint32_t numRight = min( current_j - current_s, WarpSize / 2 );\n"
"\n"
"			uint32_t leftIndex = InvalidValue;\n"
"			if ( laneIndex < numLeft ) leftIndex = nodeIndices[current_i + laneIndex];\n"
"			uint32_t numValidLeft = __popcll( hiprt::ballot( leftIndex != InvalidValue ) );\n"
"			numLeft				  = min( numLeft, numValidLeft );\n"
"\n"
"			uint32_t rightIndex = InvalidValue;\n"
"			if ( laneIndex < numRight ) rightIndex = nodeIndices[current_s + laneIndex];\n"
"			uint32_t numValidRight = __popcll( hiprt::ballot( rightIndex != InvalidValue ) );\n"
"			numRight			   = min( numRight, numValidRight );\n"
"\n"
"			if ( laneIndex < numLeft ) nodeIndicesWarp[laneIndex] = leftIndex;\n"
"			if ( laneIndex < numRight ) nodeIndicesWarp[laneIndex + numLeft] = rightIndex;\n"
"\n"
"			__threadfence_block();\n"
"			sync_warp();\n"
"\n"
"			uint32_t	   numberOfClusters = numLeft + numRight;\n"
"			const uint32_t threshold		= currentLast ? 1 : WarpSize / 2;\n"
"			if ( numberOfClusters > threshold )\n"
"			{\n"
"				uint32_t nodeIndex	 = nodeIndicesWarp[min( laneIndex, numberOfClusters - 1 )];\n"
"				Aabb	 box		 = isLeafNode( nodeIndex ) ? references[getNodeAddr( nodeIndex )].aabb()\n"
"															   : scratchNodes[getNodeAddr( nodeIndex )].aabb();\n"
"				boxesWarp[laneIndex] = box;\n"
"			}\n"
"\n"
"			constexpr uint32_t OffsetMask = ( ( 1u << ( Log2( PlocRadius ) + 1 ) ) - 1 );\n"
"\n"
"			while ( numberOfClusters > threshold )\n"
"			{\n"
"				distanceOffsetsWarp[laneIndex] = InvalidValue;\n"
"				sync_warp();\n"
"\n"
"				uint32_t minDistanceOffset = InvalidValue;\n"
"				Aabb	 box			   = boxesWarp[laneIndex];\n"
"\n"
"				for ( uint32_t neighbourIndex = laneIndex + 1;\n"
"					  neighbourIndex <= laneIndex + PlocRadius && neighbourIndex < numberOfClusters;\n"
"					  ++neighbourIndex )\n"
"				{\n"
"					Aabb neighbourBox = boxesWarp[neighbourIndex];\n"
"					neighbourBox.grow( box );\n"
"					uint32_t distance = ( ( __float_as_uint( neighbourBox.area() ) << 1 ) & ~OffsetMask );\n"
"\n"
"					const uint32_t offset0		   = encodeOffset( laneIndex, neighbourIndex );\n"
"					const uint32_t distanceOffset0 = distance | offset0;\n"
"					minDistanceOffset			   = min( minDistanceOffset, distanceOffset0 );\n"
"\n"
"					const uint32_t offset1		   = encodeOffset( neighbourIndex, laneIndex );\n"
"					const uint32_t distanceOffset1 = distance | offset1;\n"
"					atomicMin( &distanceOffsetsWarp[neighbourIndex], distanceOffset1 );\n"
"				}\n"
"				atomicMin( &distanceOffsetsWarp[laneIndex], minDistanceOffset );\n"
"\n"
"				sync_warp();\n"
"\n"
"				uint32_t nodeIndex = InvalidValue;\n"
"				if ( laneIndex < numberOfClusters )\n"
"				{\n"
"					int32_t neighbourIndex = decodeOffset( laneIndex, distanceOffsetsWarp[laneIndex] & OffsetMask );\n"
"					int32_t neighbourNeighbourIndex =\n"
"						decodeOffset( neighbourIndex, distanceOffsetsWarp[neighbourIndex] & OffsetMask );\n"
"\n"
"					uint32_t leftChildIndex	 = nodeIndicesWarp[laneIndex];\n"
"					uint32_t rightChildIndex = nodeIndicesWarp[neighbourIndex];\n"
"\n"
"					bool merging = false;\n"
"					if ( static_cast<int32_t>( laneIndex ) == neighbourNeighbourIndex )\n"
"					{\n"
"						if ( static_cast<int32_t>( laneIndex ) < neighbourIndex ) merging = true;\n"
"					}\n"
"					else\n"
"					{\n"
"						nodeIndex = leftChildIndex;\n"
"					}\n"
"\n"
"					uint32_t nodeAddr = primCount - 2 - warpOffset( merging, nodeCounter );\n"
"					if ( merging )\n"
"					{\n"
"						box.grow( boxesWarp[neighbourIndex] );\n"
"\n"
"						scratchNodes[nodeAddr].m_childIndex0 = leftChildIndex;\n"
"						scratchNodes[nodeAddr].m_childIndex1 = rightChildIndex;\n"
"						scratchNodes[nodeAddr].m_box		 = box;\n"
"\n"
"						nodeIndex = encodeNodeIndex( nodeAddr, BoxType );\n"
"					}\n"
"				}\n"
"\n"
"				const uint64_t warpBallot = hiprt::ballot( nodeIndex != InvalidValue ); // warp sync\'d here\n"
"				const uint32_t newIndex	  = __popcll( warpBallot & ( ( 1ull << laneIndex ) - 1ull ) );\n"
"				numberOfClusters		  = __popcll( warpBallot );\n"
"\n"
"				if ( nodeIndex != InvalidValue )\n"
"				{\n"
"					boxesWarp[newIndex]		  = box;\n"
"					nodeIndicesWarp[newIndex] = nodeIndex;\n"
"				}\n"
"\n"
"				__threadfence_block();\n"
"				sync_warp();\n"
"			}\n"
"\n"
"			if ( laneIndex < WarpSize / 2 )\n"
"				nodeIndices[current_i + laneIndex] =\n"
"					( laneIndex < numberOfClusters ) ? nodeIndicesWarp[laneIndex] : InvalidValue;\n"
"\n"
"			__threadfence();\n"
"		}\n"
"\n"
"		if ( last ) active = false;\n"
"\n"
"		__threadfence();\n"
"	}\n"
"}\n"
;
static const char* hip_SbvhBuilderKernels= \
"\n"
"#pragma once\n"
"#include <hiprt/hiprt_common.h>\n"
"#include <hiprt/hiprt_vec.h>\n"
"#include <hiprt/hiprt_math.h>\n"
"#include <hiprt/impl/Aabb.h>\n"
"#include <hiprt/impl/AabbList.h>\n"
"#include <hiprt/impl/BvhBuilderUtil.h>\n"
"#include <hiprt/impl/Triangle.h>\n"
"#include <hiprt/impl/BvhNode.h>\n"
"#include <hiprt/impl/Header.h>\n"
"#include <hiprt/impl/QrDecomposition.h>\n"
"#include <hiprt/impl/Quaternion.h>\n"
"#include <hiprt/impl/Transform.h>\n"
"#include <hiprt/impl/InstanceList.h>\n"
"#include <hiprt/impl/SbvhCommon.h>\n"
"#include <hiprt/impl/TriangleMesh.h>\n"
"#include <hiprt/impl/BvhConfig.h>\n"
"using namespace hiprt;\n"
"\n"
"template <typename PrimitiveContainer>\n"
"__device__ void SetupLeavesAndReferences(\n"
"	PrimitiveContainer& primitives,\n"
"	ReferenceNode*		references,\n"
"	Task*				taskQueue,\n"
"	Aabb*				box,\n"
"	uint32_t*			referenceIndices,\n"
"	uint32_t*			taskIndices )\n"
"{\n"
"	const uint32_t index = threadIdx.x + blockIdx.x * blockDim.x;\n"
"\n"
"	if ( index < primitives.getCount() )\n"
"	{\n"
"		references[index]		= ReferenceNode( index, primitives.fetchAabb( index ) );\n"
"		referenceIndices[index] = index;\n"
"		taskIndices[index]		= 0;\n"
"	}\n"
"\n"
"	if ( index == 0 ) taskQueue[0] = Task( *box );\n"
"}\n"
"\n"
"extern \"C\" __global__ void SetupLeavesAndReferences_TriangleMesh(\n"
"	TriangleMesh   primitives,\n"
"	ReferenceNode* references,\n"
"	Task*		   taskQueue,\n"
"	Aabb*		   box,\n"
"	uint32_t*	   referenceIndices,\n"
"	uint32_t*	   taskIndices )\n"
"{\n"
"	SetupLeavesAndReferences<TriangleMesh>( primitives, references, taskQueue, box, referenceIndices, taskIndices );\n"
"}\n"
"\n"
"extern \"C\" __global__ void SetupLeavesAndReferences_AabbList(\n"
"	AabbList	   primitives,\n"
"	ReferenceNode* references,\n"
"	Task*		   taskQueue,\n"
"	Aabb*		   box,\n"
"	uint32_t*	   referenceIndices,\n"
"	uint32_t*	   taskIndices )\n"
"{\n"
"	SetupLeavesAndReferences<AabbList>( primitives, references, taskQueue, box, referenceIndices, taskIndices );\n"
"}\n"
"\n"
"extern \"C\" __global__ void SetupLeavesAndReferences_InstanceList_hiprtFrameSRT(\n"
"	InstanceList<hiprtFrameSRT> primitives,\n"
"	ReferenceNode*				references,\n"
"	Task*						taskQueue,\n"
"	Aabb*						box,\n"
"	uint32_t*					referenceIndices,\n"
"	uint32_t*					taskIndices )\n"
"{\n"
"	SetupLeavesAndReferences<InstanceList<hiprtFrameSRT>>(\n"
"		primitives, references, taskQueue, box, referenceIndices, taskIndices );\n"
"}\n"
"\n"
"extern \"C\" __global__ void SetupLeavesAndReferences_InstanceList_hiprtFrameMatrix(\n"
"	InstanceList<hiprtFrameMatrix> primitives,\n"
"	ReferenceNode*				   references,\n"
"	Task*						   taskQueue,\n"
"	Aabb*						   box,\n"
"	uint32_t*					   referenceIndices,\n"
"	uint32_t*					   taskIndices )\n"
"{\n"
"	SetupLeavesAndReferences<InstanceList<hiprtFrameMatrix>>(\n"
"		primitives, references, taskQueue, box, referenceIndices, taskIndices );\n"
"}\n"
"\n"
"template <bool spatialSplits>\n"
"__device__ void ResetBins( uint32_t taskCount, uint32_t binCount, Bin* objectBins, Bin* spatialBins )\n"
"{\n"
"	const uint32_t index = threadIdx.x + blockIdx.x * blockDim.x;\n"
"\n"
"	if ( index < 3 * binCount * taskCount )\n"
"	{\n"
"		objectBins[index].reset();\n"
"		if constexpr ( spatialSplits ) spatialBins[index].reset();\n"
"	}\n"
"}\n"
"\n"
"extern \"C\" __global__ void ResetBins_true( uint32_t taskCount, uint32_t binCount, Bin* objectBins, Bin* spatialBins )\n"
"{\n"
"	ResetBins<true>( taskCount, binCount, objectBins, spatialBins );\n"
"}\n"
"\n"
"extern \"C\" __global__ void ResetBins_false( uint32_t taskCount, uint32_t binCount, Bin* objectBins, Bin* spatialBins )\n"
"{\n"
"	ResetBins<false>( taskCount, binCount, objectBins, spatialBins );\n"
"}\n"
"\n"
"extern \"C\" __global__ void BinReferencesObject(\n"
"	uint32_t	   activeRefCount,\n"
"	uint32_t	   binCount,\n"
"	uint32_t	   taskOffset,\n"
"	uint32_t	   taskCount,\n"
"	uint32_t*	   referenceIndices,\n"
"	uint32_t*	   taskIndices,\n"
"	Task*		   taskQueue,\n"
"	ReferenceNode* references,\n"
"	Bin*		   binsGlobal )\n"
"{\n"
"	const uint32_t indexStart = blockIdx.x * blockDim.x;\n"
"	const uint32_t index	  = indexStart + threadIdx.x;\n"
"	const uint32_t indexEnd	  = min( indexStart + blockDim.x, activeRefCount );\n"
"\n"
"	const uint32_t firstReferenceIndex = referenceIndices[indexStart];\n"
"	const uint32_t lastReferenceIndex  = referenceIndices[indexEnd - 1];\n"
"\n"
"	const uint32_t firstTaskIndex = taskIndices[firstReferenceIndex];\n"
"	const uint32_t lastTaskIndex  = taskIndices[lastReferenceIndex];\n"
"\n"
"	alignas( alignof( Bin ) ) __shared__ uint8_t binBuffer[3 * SbvhMaxBinCount * sizeof( Bin )];\n"
"	Bin*										 binCache = reinterpret_cast<Bin*>( binBuffer );\n"
"	Bin*										 bins	  = binsGlobal;\n"
"\n"
"	if ( firstTaskIndex == lastTaskIndex )\n"
"	{\n"
"		bins = binCache;\n"
"		for ( uint32_t binIndex = threadIdx.x; binIndex < 3 * binCount; binIndex += blockDim.x )\n"
"			binCache[binIndex].reset();\n"
"		__syncthreads();\n"
"	}\n"
"\n"
"	if ( index < activeRefCount )\n"
"	{\n"
"		const uint32_t referenceIndex = referenceIndices[index];\n"
"		const uint32_t taskIndex	  = taskIndices[referenceIndex];\n"
"\n"
"		const Task			task = taskQueue[taskIndex + taskOffset];\n"
"		const ReferenceNode ref	 = references[referenceIndex];\n"
"\n"
"		const float3 k = ( 1.0f - SbvhEpsilon ) * ( static_cast<float>( binCount ) / ( task.m_box.m_max - task.m_box.m_min ) );\n"
"		const uint3	 binIndex =\n"
"			clamp( make_uint3( k * ( ref.m_box.center() - task.m_box.m_min ) ), make_uint3( 0 ), make_uint3( binCount - 1 ) );\n"
"		uint3 binAddr = binIndex + make_uint3( 0, 1, 2 ) * binCount;\n"
"		if ( firstTaskIndex != lastTaskIndex ) binAddr = taskIndex + taskCount * binAddr;\n"
"\n"
"		bins[binAddr.x].m_box.atomicGrow( ref.m_box );\n"
"		bins[binAddr.y].m_box.atomicGrow( ref.m_box );\n"
"		bins[binAddr.z].m_box.atomicGrow( ref.m_box );\n"
"\n"
"		atomicAdd( &bins[binAddr.x].m_counter, 1 );\n"
"		atomicAdd( &bins[binAddr.y].m_counter, 1 );\n"
"		atomicAdd( &bins[binAddr.z].m_counter, 1 );\n"
"	}\n"
"\n"
"	if ( firstTaskIndex == lastTaskIndex )\n"
"	{\n"
"		__syncthreads();\n"
"		for ( uint32_t binIndex = threadIdx.x; binIndex < binCount; binIndex += blockDim.x )\n"
"		{\n"
"			uint3 binOffset = make_uint3( binIndex ) + make_uint3( 0, 1, 2 ) * binCount;\n"
"			uint3 binAddr	= firstTaskIndex + taskCount * binOffset;\n"
"\n"
"			if ( binCache[binOffset.x].m_counter > 0 )\n"
"			{\n"
"				atomicAdd( &binsGlobal[binAddr.x].m_counter, binCache[binOffset.x].m_counter );\n"
"				binsGlobal[binAddr.x].m_box.atomicGrow( binCache[binOffset.x].m_box );\n"
"			}\n"
"\n"
"			if ( binCache[binOffset.y].m_counter > 0 )\n"
"			{\n"
"				atomicAdd( &binsGlobal[binAddr.y].m_counter, binCache[binOffset.y].m_counter );\n"
"				binsGlobal[binAddr.y].m_box.atomicGrow( binCache[binOffset.y].m_box );\n"
"			}\n"
"\n"
"			if ( binCache[binOffset.z].m_counter > 0 )\n"
"			{\n"
"				atomicAdd( &binsGlobal[binAddr.z].m_counter, binCache[binOffset.z].m_counter );\n"
"				binsGlobal[binAddr.z].m_box.atomicGrow( binCache[binOffset.z].m_box );\n"
"			}\n"
"		}\n"
"	}\n"
"}\n"
"\n"
"template <typename PrimitiveContainer>\n"
"__device__ void BinReferencesSpatial(\n"
"	uint32_t			activeRefCount,\n"
"	uint32_t			binCount,\n"
"	uint32_t			taskOffset,\n"
"	uint32_t			taskCount,\n"
"	float				overlapThreshold,\n"
"	float				edgeThreshold,\n"
"	uint32_t*			referenceIndices,\n"
"	uint32_t*			taskIndices,\n"
"	Task*				taskQueue,\n"
"	PrimitiveContainer& primitives,\n"
"	ReferenceNode*		references,\n"
"	Bin*				binsGlobal )\n"
"{\n"
"	const uint32_t indexStart = blockIdx.x * blockDim.x;\n"
"	const uint32_t index	  = indexStart + threadIdx.x;\n"
"	const uint32_t indexEnd	  = min( indexStart + blockDim.x, activeRefCount );\n"
"\n"
"	const uint32_t firstReferenceIndex = referenceIndices[indexStart];\n"
"	const uint32_t lastReferenceIndex  = referenceIndices[indexEnd - 1];\n"
"\n"
"	const uint32_t firstTaskIndex = taskIndices[firstReferenceIndex];\n"
"	const uint32_t lastTaskIndex  = taskIndices[lastReferenceIndex];\n"
"\n"
"	alignas( alignof( Bin ) ) __shared__ uint8_t binBuffer[3 * SbvhMaxBinCount * sizeof( Bin )];\n"
"	Bin*										 binCache = reinterpret_cast<Bin*>( binBuffer );\n"
"	Bin*										 bins	  = binsGlobal;\n"
"\n"
"	if ( firstTaskIndex == lastTaskIndex )\n"
"	{\n"
"		bins = binCache;\n"
"		for ( uint32_t binIndex = threadIdx.x; binIndex < 3 * binCount; binIndex += blockDim.x )\n"
"			binCache[binIndex].reset();\n"
"		__syncthreads();\n"
"	}\n"
"\n"
"	if ( index < activeRefCount )\n"
"	{\n"
"		const uint32_t referenceIndex = referenceIndices[index];\n"
"		const uint32_t taskIndex	  = taskIndices[referenceIndex];\n"
"\n"
"		const Task task = taskQueue[taskIndex + taskOffset];\n"
"\n"
"		Aabb overlap = task.m_box0;\n"
"		overlap.intersect( task.m_box1 );\n"
"\n"
"		if ( overlap.area() >= overlapThreshold )\n"
"		{\n"
"			const ReferenceNode ref = references[referenceIndex];\n"
"\n"
"			for ( uint32_t axisIndex = 0; axisIndex < 3; ++axisIndex )\n"
"			{\n"
"				if ( ptr( task.m_box.m_max )[axisIndex] - ptr( task.m_box.m_min )[axisIndex] < edgeThreshold ) continue;\n"
"\n"
"				uint32_t	firstBin = binCount - 1;\n"
"				uint32_t	lastBin	 = firstBin;\n"
"				const float binSize	 = ( ptr( task.m_box.m_max )[axisIndex] - ptr( task.m_box.m_min )[axisIndex] ) /\n"
"									  static_cast<float>( binCount );\n"
"				for ( uint32_t i = 0; i < binCount; ++i )\n"
"				{\n"
"					float position = ptr( task.m_box.m_min )[axisIndex] + binSize * static_cast<float>( i + 1 );\n"
"					if ( firstBin == binCount - 1 && ptr( ref.m_box.m_min )[axisIndex] < position ) firstBin = i;\n"
"					if ( lastBin == binCount - 1 && ptr( ref.m_box.m_max )[axisIndex] <= position ) lastBin = i;\n"
"				}\n"
"				if ( firstBin > lastBin ) firstBin = lastBin;\n"
"\n"
"				uint32_t	  curBinAddr;\n"
"				ReferenceNode curRef = ref;\n"
"				ReferenceNode leftRef( ref.m_primIndex );\n"
"				ReferenceNode rightRef( ref.m_primIndex );\n"
"\n"
"				for ( uint32_t i = firstBin; i < lastBin; i++ )\n"
"				{\n"
"					const float position = ptr( task.m_box.m_min )[axisIndex] + binSize * static_cast<float>( i + 1 );\n"
"					if constexpr ( is_same<PrimitiveContainer, AabbList>::value )\n"
"					{\n"
"						leftRef.m_box						   = curRef.m_box;\n"
"						rightRef.m_box						   = curRef.m_box;\n"
"						ptr( leftRef.m_box.m_max )[axisIndex]  = position;\n"
"						ptr( rightRef.m_box.m_min )[axisIndex] = position;\n"
"					}\n"
"					else\n"
"					{\n"
"						if constexpr (\n"
"							is_same<PrimitiveContainer, TriangleMesh>::value ||\n"
"							is_same<PrimitiveContainer, InstanceList<hiprtFrameSRT>>::value ||\n"
"							is_same<PrimitiveContainer, InstanceList<hiprtFrameMatrix>>::value )\n"
"						{\n"
"							primitives.split(\n"
"								ref.m_primIndex, axisIndex, position, curRef.m_box, leftRef.m_box, rightRef.m_box );\n"
"						}\n"
"\n"
"						if ( !leftRef.m_box.valid() )\n"
"						{\n"
"							leftRef.m_box						  = curRef.m_box;\n"
"							ptr( leftRef.m_box.m_max )[axisIndex] = position;\n"
"							leftRef.m_box.grow( leftRef.m_box.center() );\n"
"						}\n"
"\n"
"						if ( !rightRef.m_box.valid() )\n"
"						{\n"
"							rightRef.m_box						   = curRef.m_box;\n"
"							ptr( rightRef.m_box.m_min )[axisIndex] = position;\n"
"							rightRef.m_box.grow( rightRef.m_box.center() );\n"
"						}\n"
"					}\n"
"\n"
"					curBinAddr = i + axisIndex * binCount;\n"
"					if ( firstTaskIndex != lastTaskIndex ) curBinAddr = taskIndex + taskCount * curBinAddr;\n"
"					bins[curBinAddr].m_box.atomicGrow( leftRef.m_box );\n"
"\n"
"					curRef = rightRef;\n"
"				}\n"
"\n"
"				curBinAddr = firstBin + axisIndex * binCount;\n"
"				if ( firstTaskIndex != lastTaskIndex ) curBinAddr = taskIndex + taskCount * curBinAddr;\n"
"				atomicAdd( &bins[curBinAddr].m_enter, 1 );\n"
"\n"
"				curBinAddr = lastBin + axisIndex * binCount;\n"
"				if ( firstTaskIndex != lastTaskIndex ) curBinAddr = taskIndex + taskCount * curBinAddr;\n"
"				atomicAdd( &bins[curBinAddr].m_exit, 1 );\n"
"				bins[curBinAddr].m_box.atomicGrow( curRef.m_box );\n"
"			}\n"
"		}\n"
"	}\n"
"\n"
"	if ( firstTaskIndex == lastTaskIndex )\n"
"	{\n"
"		__syncthreads();\n"
"		for ( uint32_t binIndex = threadIdx.x; binIndex < binCount; binIndex += blockDim.x )\n"
"		{\n"
"			uint3 binOffset = make_uint3( binIndex ) + make_uint3( 0, 1, 2 ) * binCount;\n"
"			uint3 binAddr	= firstTaskIndex + taskCount * binOffset;\n"
"\n"
"			if ( binCache[binOffset.x].m_box.valid() ) binsGlobal[binAddr.x].m_box.atomicGrow( binCache[binOffset.x].m_box );\n"
"			if ( binCache[binOffset.y].m_box.valid() ) binsGlobal[binAddr.y].m_box.atomicGrow( binCache[binOffset.y].m_box );\n"
"			if ( binCache[binOffset.z].m_box.valid() ) binsGlobal[binAddr.z].m_box.atomicGrow( binCache[binOffset.z].m_box );\n"
"\n"
"			if ( binCache[binOffset.x].m_enter > 0 ) atomicAdd( &binsGlobal[binAddr.x].m_enter, binCache[binOffset.x].m_enter );\n"
"			if ( binCache[binOffset.y].m_enter > 0 ) atomicAdd( &binsGlobal[binAddr.y].m_enter, binCache[binOffset.y].m_enter );\n"
"			if ( binCache[binOffset.z].m_enter > 0 ) atomicAdd( &binsGlobal[binAddr.z].m_enter, binCache[binOffset.z].m_enter );\n"
"\n"
"			if ( binCache[binOffset.x].m_exit > 0 ) atomicAdd( &binsGlobal[binAddr.x].m_exit, binCache[binOffset.x].m_exit );\n"
"			if ( binCache[binOffset.y].m_exit > 0 ) atomicAdd( &binsGlobal[binAddr.y].m_exit, binCache[binOffset.y].m_exit );\n"
"			if ( binCache[binOffset.z].m_exit > 0 ) atomicAdd( &binsGlobal[binAddr.z].m_exit, binCache[binOffset.z].m_exit );\n"
"		}\n"
"	}\n"
"}\n"
"\n"
"extern \"C\" __global__ void BinReferencesSpatial_TriangleMesh(\n"
"	uint32_t	   activeRefCount,\n"
"	uint32_t	   binCount,\n"
"	uint32_t	   taskOffset,\n"
"	uint32_t	   taskCount,\n"
"	float		   overlapThreshold,\n"
"	float		   edgeThreshold,\n"
"	uint32_t*	   referenceIndices,\n"
"	uint32_t*	   taskIndices,\n"
"	Task*		   taskQueue,\n"
"	TriangleMesh   primitives,\n"
"	ReferenceNode* references,\n"
"	Bin*		   spatialBins )\n"
"{\n"
"	BinReferencesSpatial<TriangleMesh>(\n"
"		activeRefCount,\n"
"		binCount,\n"
"		taskOffset,\n"
"		taskCount,\n"
"		overlapThreshold,\n"
"		edgeThreshold,\n"
"		referenceIndices,\n"
"		taskIndices,\n"
"		taskQueue,\n"
"		primitives,\n"
"		references,\n"
"		spatialBins );\n"
"}\n"
"\n"
"extern \"C\" __global__ void BinReferencesSpatial_AabbList(\n"
"	uint32_t	   activeRefCount,\n"
"	uint32_t	   binCount,\n"
"	uint32_t	   taskOffset,\n"
"	uint32_t	   taskCount,\n"
"	float		   overlapThreshold,\n"
"	float		   edgeThreshold,\n"
"	uint32_t*	   referenceIndices,\n"
"	uint32_t*	   taskIndices,\n"
"	Task*		   taskQueue,\n"
"	AabbList	   primitives,\n"
"	ReferenceNode* references,\n"
"	Bin*		   spatialBins )\n"
"{\n"
"	BinReferencesSpatial<AabbList>(\n"
"		activeRefCount,\n"
"		binCount,\n"
"		taskOffset,\n"
"		taskCount,\n"
"		overlapThreshold,\n"
"		edgeThreshold,\n"
"		referenceIndices,\n"
"		taskIndices,\n"
"		taskQueue,\n"
"		primitives,\n"
"		references,\n"
"		spatialBins );\n"
"}\n"
"\n"
"extern \"C\" __global__ void BinReferencesSpatial_InstanceList_hiprtFrameSRT(\n"
"	uint32_t					activeRefCount,\n"
"	uint32_t					binCount,\n"
"	uint32_t					taskOffset,\n"
"	uint32_t					taskCount,\n"
"	float						overlapThreshold,\n"
"	float						edgeThreshold,\n"
"	uint32_t*					referenceIndices,\n"
"	uint32_t*					taskIndices,\n"
"	Task*						taskQueue,\n"
"	InstanceList<hiprtFrameSRT> primitives,\n"
"	ReferenceNode*				references,\n"
"	Bin*						spatialBins )\n"
"{\n"
"	BinReferencesSpatial<InstanceList<hiprtFrameSRT>>(\n"
"		activeRefCount,\n"
"		binCount,\n"
"		taskOffset,\n"
"		taskCount,\n"
"		overlapThreshold,\n"
"		edgeThreshold,\n"
"		referenceIndices,\n"
"		taskIndices,\n"
"		taskQueue,\n"
"		primitives,\n"
"		references,\n"
"		spatialBins );\n"
"}\n"
"\n"
"extern \"C\" __global__ void BinReferencesSpatial_InstanceList_hiprtFrameMatrix(\n"
"	uint32_t					   activeRefCount,\n"
"	uint32_t					   binCount,\n"
"	uint32_t					   taskOffset,\n"
"	uint32_t					   taskCount,\n"
"	float						   overlapThreshold,\n"
"	float						   edgeThreshold,\n"
"	uint32_t*					   referenceIndices,\n"
"	uint32_t*					   taskIndices,\n"
"	Task*						   taskQueue,\n"
"	InstanceList<hiprtFrameMatrix> primitives,\n"
"	ReferenceNode*				   references,\n"
"	Bin*						   spatialBins )\n"
"{\n"
"	BinReferencesSpatial<InstanceList<hiprtFrameMatrix>>(\n"
"		activeRefCount,\n"
"		binCount,\n"
"		taskOffset,\n"
"		taskCount,\n"
"		overlapThreshold,\n"
"		edgeThreshold,\n"
"		referenceIndices,\n"
"		taskIndices,\n"
"		taskQueue,\n"
"		primitives,\n"
"		references,\n"
"		spatialBins );\n"
"}\n"
"\n"
"extern \"C\" __global__ void\n"
"FindObjectSplit( uint32_t taskCount, uint32_t binCount, uint32_t nodeCount, Bin* bins, Task* taskQueue )\n"
"{\n"
"	const uint32_t taskIndex = blockDim.x * blockIdx.x + threadIdx.x;\n"
"\n"
"	if ( taskIndex < taskCount )\n"
"	{\n"
"		Bin leftBin, rightBin;\n"
"		Bin rightBins[SbvhMaxBinCount];\n"
"\n"
"		float	 bestCost  = FltMax;\n"
"		uint32_t bestAxis  = InvalidValue;\n"
"		uint32_t bestIndex = InvalidValue;\n"
"\n"
"		uint32_t nodeAddr = nodeCount - taskCount + taskIndex;\n"
"		Task	 task	  = taskQueue[nodeAddr];\n"
"\n"
"		uint32_t nodeSize;\n"
"		for ( uint32_t axisIndex = 0; axisIndex < 3; ++axisIndex )\n"
"		{\n"
"			uint32_t binAddr		= taskIndex + taskCount * ( binCount - 1 + axisIndex * binCount );\n"
"			rightBins[binCount - 1] = bins[binAddr];\n"
"			for ( int32_t binIndex = binCount - 2; binIndex >= 0; --binIndex )\n"
"			{\n"
"				binAddr				= taskIndex + taskCount * ( binIndex + axisIndex * binCount );\n"
"				rightBins[binIndex] = rightBins[binIndex + 1];\n"
"				rightBins[binIndex].include( bins[binAddr] );\n"
"			}\n"
"			nodeSize = rightBins[0].m_counter;\n"
"\n"
"			binAddr		   = taskIndex + taskCount * axisIndex * binCount;\n"
"			Bin curLeftBin = bins[binAddr];\n"
"			for ( uint32_t binIndex = 0; binIndex < binCount - 1; ++binIndex )\n"
"			{\n"
"				if ( curLeftBin.m_counter > 0 && rightBins[binIndex + 1].m_counter > 0 )\n"
"				{\n"
"					float cost = curLeftBin.cost() + rightBins[binIndex + 1].cost();\n"
"					if ( bestCost > cost )\n"
"					{\n"
"						bestCost  = cost;\n"
"						bestAxis  = axisIndex;\n"
"						bestIndex = binIndex;\n"
"						leftBin	  = curLeftBin;\n"
"						rightBin  = rightBins[binIndex + 1];\n"
"					}\n"
"				}\n"
"				binAddr = taskIndex + taskCount * ( binIndex + 1 + axisIndex * binCount );\n"
"				curLeftBin.include( bins[binAddr] );\n"
"			}\n"
"		}\n"
"\n"
"		if ( bestIndex == InvalidValue )\n"
"		{\n"
"			bestCost		   = task.m_box.area() * nodeSize;\n"
"			bestAxis		   = 3;\n"
"			bestIndex		   = nodeSize >> 1;\n"
"			leftBin.m_counter  = bestIndex;\n"
"			rightBin.m_counter = nodeSize - bestIndex;\n"
"			leftBin.m_box	   = task.m_box;\n"
"			rightBin.m_box	   = task.m_box;\n"
"		}\n"
"\n"
"		task.m_split.setSplitInfo( bestAxis, bestIndex, leftBin.m_counter == 1, rightBin.m_counter == 1, false );\n"
"		task.m_box0			= leftBin.m_box;\n"
"		task.m_counter0		= leftBin.m_counter;\n"
"		task.m_box1			= rightBin.m_box;\n"
"		task.m_counter1		= rightBin.m_counter;\n"
"		task.m_cost			= bestCost;\n"
"		taskQueue[nodeAddr] = task;\n"
"	}\n"
"}\n"
"\n"
"template <bool spatialSplits>\n"
"__device__ void SplitReferences(\n"
"	uint32_t	 taskCount,\n"
"	uint32_t	 binCount,\n"
"	uint32_t	 nodeCount,\n"
"	uint32_t	 referenceCount,\n"
"	uint32_t	 maxReferenceCount,\n"
"	float		 overlapThreshold,\n"
"	float		 edgeThreshold,\n"
"	Bin*		 bins,\n"
"	ScratchNode* scratchNodes,\n"
"	Task*		 taskQueue,\n"
"	uint32_t*	 taskCounter,\n"
"	uint32_t*	 referenceCounter,\n"
"	uint32_t*	 refOffsetCounter )\n"
"{\n"
"	const uint32_t taskIndex = blockDim.x * blockIdx.x + threadIdx.x;\n"
"	const uint32_t taskEnd	 = RoundUp( taskCount, WarpSize );\n"
"\n"
"	if ( taskIndex < taskEnd )\n"
"	{\n"
"		uint32_t nodeAddr;\n"
"		Task	 task;\n"
"\n"
"		if ( taskIndex < taskCount )\n"
"		{\n"
"			nodeAddr = nodeCount - taskCount + taskIndex;\n"
"			task	 = taskQueue[nodeAddr];\n"
"		}\n"
"\n"
"		if constexpr ( spatialSplits )\n"
"		{\n"
"			uint32_t duplicateCount = 0;\n"
"			Bin		 leftBin, rightBin;\n"
"			Bin		 rightBins[SbvhMaxBinCount];\n"
"\n"
"			float	 bestCost  = FltMax;\n"
"			uint32_t bestAxis  = InvalidValue;\n"
"			uint32_t bestIndex = InvalidValue;\n"
"\n"
"			if ( taskIndex < taskCount )\n"
"			{\n"
"				bestCost	 = task.m_cost;\n"
"				Aabb overlap = task.m_box0;\n"
"				overlap.intersect( task.m_box1 );\n"
"\n"
"				if ( overlap.area() >= overlapThreshold )\n"
"				{\n"
"					for ( uint32_t axisIndex = 0; axisIndex < 3; ++axisIndex )\n"
"					{\n"
"						if ( ptr( task.m_box.m_max )[axisIndex] - ptr( task.m_box.m_min )[axisIndex] < edgeThreshold ) continue;\n"
"\n"
"						uint32_t binAddr		= taskIndex + taskCount * ( binCount - 1 + axisIndex * binCount );\n"
"						rightBins[binCount - 1] = bins[binAddr];\n"
"\n"
"						for ( int32_t binIndex = binCount - 2; binIndex >= 0; --binIndex )\n"
"						{\n"
"							binAddr				= taskIndex + taskCount * ( binIndex + axisIndex * binCount );\n"
"							rightBins[binIndex] = rightBins[binIndex + 1];\n"
"							rightBins[binIndex].include( bins[binAddr] );\n"
"						}\n"
"\n"
"						binAddr		   = taskIndex + taskCount * axisIndex * binCount;\n"
"						Bin curLeftBin = bins[binAddr];\n"
"						for ( uint32_t binIndex = 0; binIndex < binCount - 1; ++binIndex )\n"
"						{\n"
"							if ( curLeftBin.m_enter > 0 && rightBins[binIndex + 1].m_exit > 0 )\n"
"							{\n"
"								float cost = curLeftBin.leftCost() + rightBins[binIndex + 1].rightCost();\n"
"								if ( bestCost > cost )\n"
"								{\n"
"									bestCost		  = cost;\n"
"									bestAxis		  = axisIndex;\n"
"									bestIndex		  = binIndex;\n"
"									leftBin			  = curLeftBin;\n"
"									leftBin.m_counter = leftBin.m_enter;\n"
"									rightBin		  = rightBins[binIndex + 1];\n"
"								}\n"
"							}\n"
"							binAddr = taskIndex + taskCount * ( binIndex + 1 + axisIndex * binCount );\n"
"							curLeftBin.include( bins[binAddr] );\n"
"						}\n"
"					}\n"
"\n"
"					if ( bestIndex != InvalidValue )\n"
"						duplicateCount = leftBin.m_enter + rightBin.m_exit - task.m_counter0 - task.m_counter1;\n"
"				}\n"
"			}\n"
"\n"
"			uint32_t referenceOffset = warpOffset( duplicateCount, referenceCounter );\n"
"\n"
"			if ( taskIndex < taskCount )\n"
"			{\n"
"				if ( bestIndex != InvalidValue )\n"
"				{\n"
"					if ( referenceCount + referenceOffset + duplicateCount <= maxReferenceCount )\n"
"					{\n"
"						task.m_split.setSplitInfo( bestAxis, bestIndex, leftBin.m_enter == 1, rightBin.m_exit == 1, true );\n"
"						task.m_box0		= leftBin.m_box;\n"
"						task.m_counter0 = leftBin.m_enter;\n"
"						task.m_box1		= rightBin.m_box;\n"
"						task.m_counter1 = rightBin.m_exit;\n"
"					}\n"
"				}\n"
"			}\n"
"		}\n"
"\n"
"		uint32_t outputTaskCount = ( task.m_counter0 > 1 ) + ( task.m_counter1 > 1 );\n"
"		uint32_t taskOffset		 = warpOffset( outputTaskCount, taskCounter );\n"
"\n"
"		uint32_t refCount = 0;\n"
"		if ( task.m_counter0 > 1 ) refCount += task.m_counter0;\n"
"		if ( task.m_counter1 > 1 ) refCount += task.m_counter1;\n"
"\n"
"		uint32_t leftRefOffset	= warpOffset( refCount, refOffsetCounter );\n"
"		uint32_t rightRefOffset = leftRefOffset;\n"
"		if ( task.m_counter0 > 1 ) rightRefOffset += task.m_counter0;\n"
"\n"
"		if ( taskIndex < taskCount )\n"
"		{\n"
"			ScratchNode node;\n"
"			node.m_box = task.m_box;\n"
"\n"
"			uint32_t nodeOffset = taskOffset;\n"
"			if ( task.m_counter0 > 1 )\n"
"			{\n"
"				uint32_t leftNodeAddr	= nodeCount + ( nodeOffset++ );\n"
"				node.m_childIndex0		= encodeNodeIndex( leftNodeAddr, BoxType );\n"
"				taskQueue[leftNodeAddr] = Task( task.m_box0, leftRefOffset );\n"
"			}\n"
"\n"
"			if ( task.m_counter1 > 1 )\n"
"			{\n"
"				uint32_t rightNodeAddr	 = nodeCount + nodeOffset;\n"
"				node.m_childIndex1		 = encodeNodeIndex( rightNodeAddr, BoxType );\n"
"				taskQueue[rightNodeAddr] = Task( task.m_box1, rightRefOffset );\n"
"			}\n"
"			scratchNodes[nodeAddr] = node;\n"
"\n"
"			task.m_refOffset	= 0;\n"
"			task.m_taskOffset	= taskOffset;\n"
"			taskQueue[nodeAddr] = task;\n"
"		}\n"
"	}\n"
"}\n"
"\n"
"extern \"C\" __global__ void SplitReferences_true(\n"
"	uint32_t	 taskCount,\n"
"	uint32_t	 binCount,\n"
"	uint32_t	 nodeCount,\n"
"	uint32_t	 referenceCount,\n"
"	uint32_t	 maxReferenceCount,\n"
"	float		 overlapThreshold,\n"
"	float		 edgeThreshold,\n"
"	Bin*		 bins,\n"
"	ScratchNode* scratchNodes,\n"
"	Task*		 taskQueue,\n"
"	uint32_t*	 taskCounter,\n"
"	uint32_t*	 referenceCounter,\n"
"	uint32_t*	 refOffsetCounter )\n"
"{\n"
"	SplitReferences<true>(\n"
"		taskCount,\n"
"		binCount,\n"
"		nodeCount,\n"
"		referenceCount,\n"
"		maxReferenceCount,\n"
"		overlapThreshold,\n"
"		edgeThreshold,\n"
"		bins,\n"
"		scratchNodes,\n"
"		taskQueue,\n"
"		taskCounter,\n"
"		referenceCounter,\n"
"		refOffsetCounter );\n"
"}\n"
"\n"
"extern \"C\" __global__ void SplitReferences_false(\n"
"	uint32_t	 taskCount,\n"
"	uint32_t	 binCount,\n"
"	uint32_t	 nodeCount,\n"
"	uint32_t	 referenceCount,\n"
"	uint32_t	 maxReferenceCount,\n"
"	float		 overlapThreshold,\n"
"	float		 edgeThreshold,\n"
"	Bin*		 bins,\n"
"	ScratchNode* scratchNodes,\n"
"	Task*		 taskQueue,\n"
"	uint32_t*	 taskCounter,\n"
"	uint32_t*	 referenceCounter,\n"
"	uint32_t*	 refOffsetCounter )\n"
"{\n"
"	SplitReferences<false>(\n"
"		taskCount,\n"
"		binCount,\n"
"		nodeCount,\n"
"		referenceCount,\n"
"		maxReferenceCount,\n"
"		overlapThreshold,\n"
"		edgeThreshold,\n"
"		bins,\n"
"		scratchNodes,\n"
"		taskQueue,\n"
"		taskCounter,\n"
"		referenceCounter,\n"
"		refOffsetCounter );\n"
"}\n"
"\n"
// DistributeReferences<PrimitiveContainer> (embedded kernel source; the quoted lines are runtime
// text and must stay byte-identical, only these C++ comments are documentation).
// Each thread with index < activeRefCount takes one reference from referenceIndices0, loads the
// task it belongs to (taskQueue[taskIndices[ref] + taskOffset]) and applies that task's split:
//  - if the side the reference lands on is flagged as a leaf, the encoded reference index is
//    written into m_childIndex0 / m_childIndex1 of scratchNodes[nodeAddr];
//  - otherwise the reference is appended to referenceIndices1 under the child task, using an
//    atomicAdd on that task's m_refOffset to obtain its slot.
// For spatial splits a reference straddling the split plane is duplicated; the copy is stored at
// references[referenceCount + atomicAdd( referenceCounter, 1 )].
// taskCount is accepted but not read in this function.
"template <typename PrimitiveContainer>\n"
"__device__ void DistributeReferences(\n"
"	uint32_t			activeRefCount,\n"
"	uint32_t			referenceCount,\n"
"	uint32_t			binCount,\n"
"	uint32_t			nodeCount,\n"
"	uint32_t			taskCount,\n"
"	uint32_t			taskOffset,\n"
"	uint32_t*			referenceIndices0,\n"
"	uint32_t*			referenceIndices1,\n"
"	uint32_t*			taskIndices,\n"
"	Task*				taskQueue,\n"
"	PrimitiveContainer& primitives,\n"
"	ScratchNode*		scratchNodes,\n"
"	ReferenceNode*		references,\n"
"	uint32_t*			referenceCounter )\n"
"{\n"
"	const uint32_t index = threadIdx.x + blockIdx.x * blockDim.x;\n"
"\n"
// The leaf type tag is chosen at compile time from the container type; it stays uninitialized
// for any container type not listed in the chain below.
"	uint32_t leafType;\n"
"	if constexpr ( is_same<PrimitiveContainer, TriangleMesh>::value )\n"
"		leafType = TriangleType;\n"
"	else if constexpr ( is_same<PrimitiveContainer, AabbList>::value )\n"
"		leafType = CustomType;\n"
"	else if constexpr (\n"
"		is_same<PrimitiveContainer, InstanceList<hiprtFrameSRT>>::value ||\n"
"		is_same<PrimitiveContainer, InstanceList<hiprtFrameMatrix>>::value )\n"
"		leafType = InstanceType;\n"
"\n"
"	if ( index < activeRefCount )\n"
"	{\n"
"		uint32_t referenceIndex = referenceIndices0[index];\n"
"		uint32_t taskIndex		= taskIndices[referenceIndex];\n"
"		uint32_t nodeAddr		= taskIndex + taskOffset;\n"
"\n"
"		Task		  task = taskQueue[nodeAddr];\n"
"		ReferenceNode ref  = references[referenceIndex];\n"
"\n"
"		uint32_t splitAxis	  = task.m_split.m_splitAxis;\n"
"		uint32_t splitIndex	  = task.m_split.m_splitIndex;\n"
"		bool	 leftLeaf	  = task.m_split.m_leftLeaf;\n"
"		bool	 rightLeaf	  = task.m_split.m_rightLeaf;\n"
"		bool	 spatialSplit = task.m_split.m_spatialSplit;\n"
"\n"
// Non-spatial split: the reference goes entirely to one side.
"		if ( !spatialSplit )\n"
"		{\n"
// Bin index of the reference's box center inside the task box, clamped to [0, binCount - 1].
"			float3 k = ( 1.0f - SbvhEpsilon ) * ( static_cast<float>( binCount ) / ( task.m_box.m_max - task.m_box.m_min ) );\n"
"			uint3  binIndex = clamp(\n"
"				 make_uint3( k * ( ref.m_box.center() - task.m_box.m_min ) ), make_uint3( 0 ), make_uint3( binCount - 1 ) );\n"
"\n"
// splitAxis < 3: side decided by the bin along that axis. Otherwise the side is decided by
// arrival order: the first splitIndex threads to bump the task's m_refOffset go left.
"			bool onLeft;\n"
"			if ( splitAxis < 3 )\n"
"				onLeft = ptr( binIndex )[splitAxis] <= splitIndex;\n"
"			else\n"
"				onLeft = atomicAdd( &taskQueue[nodeAddr].m_refOffset, 1 ) < splitIndex;\n"
"\n"
// Child task slot: m_taskOffset for the left child; the right child uses the next slot only
// when the left child is not a leaf.
"			uint32_t newTaskIndex = task.m_taskOffset;\n"
"			if ( !onLeft && !leftLeaf ) ++newTaskIndex;\n"
"\n"
// Condition is true exactly when the chosen side is flagged as a leaf.
"			if ( ( !onLeft || leftLeaf ) && ( onLeft || rightLeaf ) )\n"
"			{\n"
"				if ( onLeft )\n"
"					scratchNodes[nodeAddr].m_childIndex0 = encodeNodeIndex( referenceIndex, leafType );\n"
"				else\n"
"					scratchNodes[nodeAddr].m_childIndex1 = encodeNodeIndex( referenceIndex, leafType );\n"
// The local copy below is never read afterwards.
"				ScratchNode node = scratchNodes[nodeAddr];\n"
"			}\n"
"			else\n"
"			{\n"
// Append the reference to the child task's range in the output index buffer.
"				uint32_t newIndex			= atomicAdd( &taskQueue[newTaskIndex + nodeCount].m_refOffset, 1 );\n"
"				taskIndices[referenceIndex] = newTaskIndex;\n"
"				referenceIndices1[newIndex] = referenceIndex;\n"
"			}\n"
"		}\n"
"\n"
// Spatial split: the reference may be cut by the split plane and duplicated.
"		else\n"
"		{\n"
// Find the first and last bin along splitAxis touched by the reference's box; both default to
// the last bin and are set at most once by the scan.
"			float	 binSize  = ( ptr( task.m_box.m_max )[splitAxis] - ptr( task.m_box.m_min )[splitAxis] ) / float( binCount );\n"
"			uint32_t firstBin = binCount - 1;\n"
"			uint32_t lastBin  = firstBin;\n"
"			for ( uint32_t i = 0; i < binCount; ++i )\n"
"			{\n"
"				float position = ptr( task.m_box.m_min )[splitAxis] + binSize * static_cast<float>( i + 1 );\n"
"				if ( firstBin == binCount - 1 && ptr( ref.m_box.m_min )[splitAxis] < position ) firstBin = i;\n"
"				if ( lastBin == binCount - 1 && ptr( ref.m_box.m_max )[splitAxis] <= position ) lastBin = i;\n"
"			}\n"
"			if ( firstBin > lastBin ) firstBin = lastBin;\n"
"\n"
// Split plane position is the upper boundary of bin splitIndex; the reference is duplicated when
// its bin range spans that boundary.
"			float position	 = ptr( task.m_box.m_min )[splitAxis] + binSize * static_cast<float>( splitIndex + 1 );\n"
"			bool  duplicated = firstBin <= splitIndex && lastBin > splitIndex;\n"
"			if ( duplicated )\n"
"			{\n"
"				ReferenceNode leftRef( ref.m_primIndex );\n"
"				ReferenceNode rightRef( ref.m_primIndex );\n"
// AABB lists: simply clip the reference box at the plane.
"				if constexpr ( is_same<PrimitiveContainer, AabbList>::value )\n"
"				{\n"
"					leftRef.m_box						   = ref.m_box;\n"
"					rightRef.m_box						   = ref.m_box;\n"
"					ptr( leftRef.m_box.m_max )[splitAxis]  = position;\n"
"					ptr( rightRef.m_box.m_min )[splitAxis] = position;\n"
"				}\n"
"				else\n"
"				{\n"
// Triangles and instances: let the container compute the two child boxes.
"					if constexpr (\n"
"						is_same<PrimitiveContainer, TriangleMesh>::value ||\n"
"						is_same<PrimitiveContainer, InstanceList<hiprtFrameSRT>>::value ||\n"
"						is_same<PrimitiveContainer, InstanceList<hiprtFrameMatrix>>::value )\n"
"					{\n"
"						primitives.split( ref.m_primIndex, splitAxis, position, ref.m_box, leftRef.m_box, rightRef.m_box );\n"
"					}\n"
"\n"
// Fallback when a child box is reported invalid: clip the original box at the plane and grow it
// by its own center.
"					if ( !leftRef.m_box.valid() )\n"
"					{\n"
"						leftRef.m_box						  = ref.m_box;\n"
"						ptr( leftRef.m_box.m_max )[splitAxis] = position;\n"
"						leftRef.m_box.grow( leftRef.m_box.center() );\n"
"					}\n"
"\n"
"					if ( !rightRef.m_box.valid() )\n"
"					{\n"
"						rightRef.m_box						   = ref.m_box;\n"
"						ptr( rightRef.m_box.m_min )[splitAxis] = position;\n"
"						rightRef.m_box.grow( rightRef.m_box.center() );\n"
"					}\n"
"				}\n"
"\n"
// The left half reuses the original reference slot; the right half gets a new slot past
// referenceCount, reserved through the global referenceCounter.
"				uint32_t referenceOffset	  = atomicAdd( referenceCounter, 1 );\n"
"				uint32_t newReferenceIndex	  = referenceCount + referenceOffset;\n"
"				references[referenceIndex]	  = leftRef;\n"
"				references[newReferenceIndex] = rightRef;\n"
"\n"
// Left half: leaf child or queued under the left child task.
"				uint32_t newTaskIndex = task.m_taskOffset;\n"
"				if ( leftLeaf )\n"
"				{\n"
"					scratchNodes[nodeAddr].m_childIndex0 = encodeNodeIndex( referenceIndex, leafType );\n"
"					ScratchNode node					 = scratchNodes[nodeAddr];\n"
"				}\n"
"				else\n"
"				{\n"
"					uint32_t newIndex			= atomicAdd( &taskQueue[newTaskIndex + nodeCount].m_refOffset, 1 );\n"
"					taskIndices[referenceIndex] = newTaskIndex;\n"
"					referenceIndices1[newIndex] = referenceIndex;\n"
"				}\n"
"\n"
// Right half: leaf child or queued under the right child task.
"				newTaskIndex = task.m_taskOffset;\n"
"				if ( !leftLeaf ) ++newTaskIndex;\n"
"				if ( rightLeaf )\n"
"				{\n"
"					scratchNodes[nodeAddr].m_childIndex1 = encodeNodeIndex( newReferenceIndex, leafType );\n"
"					ScratchNode node					 = scratchNodes[nodeAddr];\n"
"				}\n"
"				else\n"
"				{\n"
"					uint32_t newIndex			   = atomicAdd( &taskQueue[newTaskIndex + nodeCount].m_refOffset, 1 );\n"
"					taskIndices[newReferenceIndex] = newTaskIndex;\n"
"					referenceIndices1[newIndex]	   = newReferenceIndex;\n"
"				}\n"
"			}\n"
"			else\n"
"			{\n"
// Not straddling: the reference is on the left when its max along the axis does not exceed the
// plane; handling then mirrors the non-spatial case.
"				uint32_t newTaskIndex = task.m_taskOffset;\n"
"				bool	 onLeft		  = ptr( ref.m_box.m_max )[splitAxis] <= position;\n"
"				if ( !onLeft && !leftLeaf ) ++newTaskIndex;\n"
"\n"
"				if ( ( !onLeft || leftLeaf ) && ( onLeft || rightLeaf ) )\n"
"				{\n"
"					if ( onLeft )\n"
"						scratchNodes[nodeAddr].m_childIndex0 = encodeNodeIndex( referenceIndex, leafType );\n"
"					else\n"
"						scratchNodes[nodeAddr].m_childIndex1 = encodeNodeIndex( referenceIndex, leafType );\n"
"					ScratchNode node = scratchNodes[nodeAddr];\n"
"				}\n"
"				else\n"
"				{\n"
"					uint32_t newIndex			= atomicAdd( &taskQueue[newTaskIndex + nodeCount].m_refOffset, 1 );\n"
"					taskIndices[referenceIndex] = newTaskIndex;\n"
"					referenceIndices1[newIndex] = referenceIndex;\n"
"				}\n"
"			}\n"
"		}\n"
"	}\n"
"}\n"
"\n"
// DistributeReferences_TriangleMesh (embedded kernel source): extern "C" __global__ entry point
// that forwards every argument unchanged to DistributeReferences<TriangleMesh>. The primitive
// container is received by value and passed on by reference.
"extern \"C\" __global__ void DistributeReferences_TriangleMesh(\n"
"	uint32_t	   activeRefCount,\n"
"	uint32_t	   referenceCount,\n"
"	uint32_t	   binCount,\n"
"	uint32_t	   nodeCount,\n"
"	uint32_t	   taskCount,\n"
"	uint32_t	   taskOffset,\n"
"	uint32_t*	   referenceIndices0,\n"
"	uint32_t*	   referenceIndices1,\n"
"	uint32_t*	   taskIndices,\n"
"	Task*		   taskQueue,\n"
"	TriangleMesh   primitives,\n"
"	ScratchNode*   scratchNodes,\n"
"	ReferenceNode* references,\n"
"	uint32_t*	   referenceCounter )\n"
"{\n"
"	DistributeReferences<TriangleMesh>(\n"
"		activeRefCount,\n"
"		referenceCount,\n"
"		binCount,\n"
"		nodeCount,\n"
"		taskCount,\n"
"		taskOffset,\n"
"		referenceIndices0,\n"
"		referenceIndices1,\n"
"		taskIndices,\n"
"		taskQueue,\n"
"		primitives,\n"
"		scratchNodes,\n"
"		references,\n"
"		referenceCounter );\n"
"}\n"
"\n"
// DistributeReferences_AabbList (embedded kernel source): extern "C" __global__ entry point that
// forwards every argument unchanged to DistributeReferences<AabbList>. The primitive container
// is received by value and passed on by reference.
"extern \"C\" __global__ void DistributeReferences_AabbList(\n"
"	uint32_t	   activeRefCount,\n"
"	uint32_t	   referenceCount,\n"
"	uint32_t	   binCount,\n"
"	uint32_t	   nodeCount,\n"
"	uint32_t	   taskCount,\n"
"	uint32_t	   taskOffset,\n"
"	uint32_t*	   referenceIndices0,\n"
"	uint32_t*	   referenceIndices1,\n"
"	uint32_t*	   taskIndices,\n"
"	Task*		   taskQueue,\n"
"	AabbList	   primitives,\n"
"	ScratchNode*   scratchNodes,\n"
"	ReferenceNode* references,\n"
"	uint32_t*	   referenceCounter )\n"
"{\n"
"	DistributeReferences<AabbList>(\n"
"		activeRefCount,\n"
"		referenceCount,\n"
"		binCount,\n"
"		nodeCount,\n"
"		taskCount,\n"
"		taskOffset,\n"
"		referenceIndices0,\n"
"		referenceIndices1,\n"
"		taskIndices,\n"
"		taskQueue,\n"
"		primitives,\n"
"		scratchNodes,\n"
"		references,\n"
"		referenceCounter );\n"
"}\n"
"\n"
// DistributeReferences_InstanceList_hiprtFrameSRT (embedded kernel source): extern "C" __global__
// entry point that forwards every argument unchanged to
// DistributeReferences<InstanceList<hiprtFrameSRT>>. The primitive container is received by value
// and passed on by reference.
"extern \"C\" __global__ void DistributeReferences_InstanceList_hiprtFrameSRT(\n"
"	uint32_t					activeRefCount,\n"
"	uint32_t					referenceCount,\n"
"	uint32_t					binCount,\n"
"	uint32_t					nodeCount,\n"
"	uint32_t					taskCount,\n"
"	uint32_t					taskOffset,\n"
"	uint32_t*					referenceIndices0,\n"
"	uint32_t*					referenceIndices1,\n"
"	uint32_t*					taskIndices,\n"
"	Task*						taskQueue,\n"
"	InstanceList<hiprtFrameSRT> primitives,\n"
"	ScratchNode*				scratchNodes,\n"
"	ReferenceNode*				references,\n"
"	uint32_t*					referenceCounter )\n"
"{\n"
"	DistributeReferences<InstanceList<hiprtFrameSRT>>(\n"
"		activeRefCount,\n"
"		referenceCount,\n"
"		binCount,\n"
"		nodeCount,\n"
"		taskCount,\n"
"		taskOffset,\n"
"		referenceIndices0,\n"
"		referenceIndices1,\n"
"		taskIndices,\n"
"		taskQueue,\n"
"		primitives,\n"
"		scratchNodes,\n"
"		references,\n"
"		referenceCounter );\n"
"}\n"
"\n"
// DistributeReferences_InstanceList_hiprtFrameMatrix (embedded kernel source): extern "C"
// __global__ entry point that forwards every argument unchanged to
// DistributeReferences<InstanceList<hiprtFrameMatrix>>. The primitive container is received by
// value and passed on by reference.
"extern \"C\" __global__ void DistributeReferences_InstanceList_hiprtFrameMatrix(\n"
"	uint32_t					   activeRefCount,\n"
"	uint32_t					   referenceCount,\n"
"	uint32_t					   binCount,\n"
"	uint32_t					   nodeCount,\n"
"	uint32_t					   taskCount,\n"
"	uint32_t					   taskOffset,\n"
"	uint32_t*					   referenceIndices0,\n"
"	uint32_t*					   referenceIndices1,\n"
"	uint32_t*					   taskIndices,\n"
"	Task*						   taskQueue,\n"
"	InstanceList<hiprtFrameMatrix> primitives,\n"
"	ScratchNode*				   scratchNodes,\n"
"	ReferenceNode*				   references,\n"
"	uint32_t*					   referenceCounter )\n"
"{\n"
"	DistributeReferences<InstanceList<hiprtFrameMatrix>>(\n"
"		activeRefCount,\n"
"		referenceCount,\n"
"		binCount,\n"
"		nodeCount,\n"
"		taskCount,\n"
"		taskOffset,\n"
"		referenceIndices0,\n"
"		referenceIndices1,\n"
"		taskIndices,\n"
"		taskQueue,\n"
"		primitives,\n"
"		scratchNodes,\n"
"		references,\n"
"		referenceCounter );\n"
"}\n"
;
// hip_BatchBuilderKernels: source text of the batch builder kernels, stored as one string built
// from the adjacent literals below. Every quoted line is runtime text; the C++ comments placed
// between the literals are documentation only and do not become part of the string.
// NOTE(review): presumably this text is handed to a runtime kernel compiler - confirm with the
// code that consumes this variable.
static const char* hip_BatchBuilderKernels= \
"\n"
"#pragma once\n"
"#include <hiprt/hiprt_common.h>\n"
"#include <hiprt/hiprt_vec.h>\n"
"#include <hiprt/hiprt_types.h>\n"
"#include <hiprt/hiprt_math.h>\n"
"#include <hiprt/impl/Aabb.h>\n"
"#include <hiprt/impl/AabbList.h>\n"
"#include <hiprt/impl/BvhCommon.h>\n"
"#include <hiprt/impl/Triangle.h>\n"
"#include <hiprt/impl/BvhNode.h>\n"
"#include <hiprt/impl/Header.h>\n"
"#include <hiprt/impl/QrDecomposition.h>\n"
"#include <hiprt/impl/Quaternion.h>\n"
"#include <hiprt/impl/Transform.h>\n"
"#include <hiprt/impl/InstanceList.h>\n"
"#include <hiprt/impl/MortonCode.h>\n"
"#include <hiprt/impl/TriangleMesh.h>\n"
"#include <hiprt/impl/BvhConfig.h>\n"
"#include <hiprt/impl/MemoryArena.h>\n"
"\n"
"#include <hiprt/impl/BvhBuilderKernels.h>\n"
"#include <hiprt/impl/LbvhBuilderKernels.h>\n"
"\n"
"using namespace hiprt;\n"
"\n"
// BatchBuildCacheAlignment: the larger of alignof( ReferenceNode ) and alignof( ScratchNode ).
// BatchBuildCacheSize: byte size of the per-block shared cache, the sum of
//   ( BatchBuilderMaxBlockSize - 1 ) ScratchNode entries,
//   BatchBuilderMaxBlockSize ReferenceNode entries, and
//   four arrays of BatchBuilderMaxBlockSize uint32_t values,
// each term rounded up to BatchBuildCacheAlignment.
"static constexpr size_t BatchBuildCacheAlignment = alignof( ReferenceNode ) > alignof( ScratchNode ) ? alignof( ReferenceNode )\n"
"																									 : alignof( ScratchNode );\n"
"static constexpr size_t BatchBuildCacheSize =\n"
"	RoundUp( ( BatchBuilderMaxBlockSize - 1 ) * sizeof( ScratchNode ), BatchBuildCacheAlignment ) +\n"
"	RoundUp( ( BatchBuilderMaxBlockSize ) * sizeof( ReferenceNode ), BatchBuildCacheAlignment ) +\n"
"	4 * RoundUp( BatchBuilderMaxBlockSize * sizeof( uint32_t ), BatchBuildCacheAlignment );\n"
"\n"
// getStorageBufferSize( hiprtGeometryBuildInput ) (embedded kernel source): returns the value of
// getGeometryStorageBufferSize for this geometry input. The primitive count, the maximum
// primitive-node and box-node counts (queried with Rtip) and the primitive node size (queried
// with sizeof( TriangleNode )) are obtained from the get* helpers and passed along with
// sizeof( BoxNode ).
"HIPRT_DEVICE size_t getStorageBufferSize( const hiprtGeometryBuildInput& buildInput )\n"
"{\n"
"	const size_t primCount	   = getPrimCount( buildInput );\n"
"	const size_t primNodeCount = getMaxPrimNodeCount( buildInput, Rtip, primCount );\n"
"	const size_t primNodeSize  = getPrimNodeSize( buildInput, sizeof( TriangleNode ) );\n"
"	const size_t boxNodeCount  = getMaxBoxNodeCount( buildInput, Rtip, primCount );\n"
"	return getGeometryStorageBufferSize( primNodeCount, boxNodeCount, primNodeSize, sizeof( BoxNode ) );\n"
"}\n"
"\n"
// getStorageBufferSize( hiprtSceneBuildInput ) (embedded kernel source): returns the value of
// getSceneStorageBufferSize for this scene input. buildInput.instanceCount is used as the
// primitive count (passed twice), the maximum box-node count is queried with Rtip, and
// buildInput.frameCount is passed as the last argument.
"HIPRT_DEVICE size_t getStorageBufferSize( const hiprtSceneBuildInput& buildInput )\n"
"{\n"
"	const size_t frameCount	  = buildInput.frameCount;\n"
"	const size_t primCount	  = buildInput.instanceCount;\n"
"	const size_t boxNodeCount = getMaxBoxNodeCount( buildInput, Rtip, primCount );\n"
"	return getSceneStorageBufferSize(\n"
"		primCount, primCount, boxNodeCount, sizeof( BoxNode ), sizeof( InstanceNode ), frameCount );\n"
"}\n"
"\n"
// build<PrimitiveNode, PrimitiveContainer> (embedded kernel source; the quoted lines are runtime
// text, only these C++ comments are documentation).
// Builds one BVH with a single thread block; the thread index is threadIdx.x. Output memory
// (header, box nodes, primitive nodes, and for scenes frames and instances) is carved from
// storageMemoryArena; temporary arrays are carved from sharedMemoryArena.
// Parameters:
//   primitives          primitive container (provides getCount, fetchAabb, fetchCenter, ...)
//   geomType            geometry type; shifted left by one and tagged with bit 0 for TriangleNode
//                       in the geometry path, unused in the scene path
//   storageMemoryArena  arena over the output buffer
//   sharedMemoryArena   arena over the per-block scratch cache
//
// FIX: mortonCodeKeys / mortonCodeValues used to be computed as
//   reinterpret_cast<uint32_t*>( taskQueue + k * blockDim.x )
// i.e. the offset was applied to the uint3* before the cast. With k == 1 that points one past the
// end of the blockDim.x-element taskQueue allocation (assuming allocate<uint3>( n ) reserves n
// elements), so mortonCodeValues[index] wrote outside the allocation, and outside the shared cache
// when blockDim.x equals BatchBuilderMaxBlockSize. The cast is now applied first, so the values
// occupy the second blockDim.x uint32_t slots inside taskQueue, matching how triangleCounts /
// parentAddrs are derived further down.
// NOTE: this file is generated; the header it is generated from needs the same change or the fix
// will be lost on regeneration.
"template <typename PrimitiveNode, typename PrimitiveContainer>\n"
"HIPRT_DEVICE void\n"
"build( PrimitiveContainer& primitives, uint32_t geomType, MemoryArena& storageMemoryArena, MemoryArena& sharedMemoryArena )\n"
"{\n"
"	using Header = typename conditional<is_same<PrimitiveNode, InstanceNode>::value, SceneHeader, GeomHeader>::type;\n"
"\n"
"	const uint32_t maxBoxNodeCount =\n"
"		static_cast<uint32_t>( getMaxBoxNodeCount<BoxNode, PrimitiveNode>( primitives.getCount() ) );\n"
"	const uint32_t maxPrimNodeCount = static_cast<uint32_t>( getMaxPrimNodeCount<PrimitiveNode>( primitives.getCount() ) );\n"
"\n"
"	Header*		   header	 = storageMemoryArena.allocate<Header>();\n"
"	BoxNode*	   boxNodes	 = storageMemoryArena.allocate<BoxNode>( maxBoxNodeCount );\n"
"	PrimitiveNode* primNodes = storageMemoryArena.allocate<PrimitiveNode>( maxPrimNodeCount );\n"
"\n"
"	const uint32_t index	 = threadIdx.x;\n"
"	const uint32_t primCount = primitives.getCount();\n"
"\n"
"	// STEP 0: Init data\n"
"	if constexpr ( is_same<Header, SceneHeader>::value )\n"
"	{\n"
"		Frame*	  frames	= storageMemoryArena.allocate<Frame>( primitives.getFrameCount() );\n"
"		Instance* instances = storageMemoryArena.allocate<Instance>( primitives.getCount() );\n"
"\n"
"		primitives.setFrames( frames );\n"
"		InitSceneData<>(\n"
"			index, storageMemoryArena.getStorageSize(), primitives, boxNodes, primNodes, instances, frames, header );\n"
"	}\n"
"	else\n"
"	{\n"
"		geomType <<= 1;\n"
"		if constexpr ( is_same<PrimitiveNode, TriangleNode>::value ) geomType |= 1;\n"
"		InitGeomDataImpl( index, primCount, storageMemoryArena.getStorageSize(), boxNodes, primNodes, geomType, header );\n"
"	}\n"
"\n"
"	// A single primitive => special case\n"
"	if ( primCount == 1 )\n"
"	{\n"
"		SingletonConstruction( index, primitives, boxNodes, primNodes );\n"
"		return;\n"
"	}\n"
"\n"
// Threads past the last primitive reuse the last primitive's box so they can still take part in
// the block-wide union below.
// NOTE(review): primCount - 1 wraps around if primCount is 0 - confirm callers never pass an
// empty container.
"	Aabb primBox;\n"
"	if ( index < primCount )\n"
"		primBox = primitives.fetchAabb( index );\n"
"	else\n"
"		primBox = primitives.fetchAabb( primCount - 1 );\n"
"\n"
"	const uint32_t warpsPerBlock = DivideRoundUp( static_cast<uint32_t>( blockDim.x ), WarpSize );\n"
"\n"
"	ScratchNode*   scratchNodes	  = sharedMemoryArena.allocate<ScratchNode>( blockDim.x - 1 );\n"
"	ReferenceNode* references	  = sharedMemoryArena.allocate<ReferenceNode>( blockDim.x );\n"
"	uint32_t*	   updateCounters = sharedMemoryArena.allocate<uint32_t>( blockDim.x );\n"
"	uint3*		   taskQueue	  = sharedMemoryArena.allocate<uint3>( blockDim.x );\n"
"\n"
// Morton keys and values alias the taskQueue storage (not needed until STEP 5/6): keys take the
// first blockDim.x uint32_t slots, values the next blockDim.x. The cast must come before the
// offset so the offset is counted in uint32_t, not uint3, elements.
"	uint32_t* mortonCodeKeys   = reinterpret_cast<uint32_t*>( taskQueue ) + 0 * blockDim.x;\n"
"	uint32_t* mortonCodeValues = reinterpret_cast<uint32_t*>( taskQueue ) + 1 * blockDim.x;\n"
"\n"
"	// STEP 1: Calculate centroid bounding box by reduction\n"
// The scratch node storage is reused as temporary space for the block-wide box union.
"	updateCounters[index] = InvalidValue;\n"
"	Aabb* blockBoxes	  = reinterpret_cast<Aabb*>( scratchNodes );\n"
"	Aabb  centroidBox	  = blockUnion( primBox, blockBoxes );\n"
"	__syncthreads();\n"
"\n"
"	// STEP 2: Calculate Morton codes\n"
// Threads without a primitive store InvalidValue for both key and value.
"	if ( index < primCount )\n"
"	{\n"
"		const float3 boxExtent		  = centroidBox.extent();\n"
"		const float3 center			  = primitives.fetchCenter( index );\n"
"		const float3 normalizedCenter = ( center - centroidBox.m_min ) / boxExtent;\n"
"		mortonCodeKeys[index]		  = computeExtendedMortonCode( normalizedCenter, boxExtent );\n"
"		mortonCodeValues[index]		  = index;\n"
"	}\n"
"	else\n"
"	{\n"
"		mortonCodeKeys[index]	= InvalidValue;\n"
"		mortonCodeValues[index] = InvalidValue;\n"
"	}\n"
"	__syncthreads();\n"
"\n"
"	// STEP 3: Sort Morton codes\n"
// One pass per key bit, lowest bit first: entries whose current bit is 0 are packed to the front
// (position from the block scan), entries with bit 1 follow them. The scratch node storage is
// reused as the scan cache.
"	uint32_t* blockCache = reinterpret_cast<uint32_t*>( scratchNodes );\n"
"	for ( uint32_t i = 0; i < 32; ++i )\n"
"	{\n"
"		const uint32_t mortonCodeKey   = mortonCodeKeys[index];\n"
"		const uint32_t mortonCodeValue = mortonCodeValues[index];\n"
"		const uint32_t bit			   = ( mortonCodeKey >> i ) & 1;\n"
"		const uint32_t blockSum		   = blockScan( bit == 0, blockCache );\n"
"		const uint32_t newIndex		   = bit == 0 ? blockSum - 1 : blockCache[warpsPerBlock - 1] + index - blockSum;\n"
"		__syncthreads();\n"
"		mortonCodeKeys[newIndex]   = mortonCodeKey;\n"
"		mortonCodeValues[newIndex] = mortonCodeValue;\n"
"		__syncthreads();\n"
"	}\n"
"\n"
"	// STEP 4: Emit topology and refit nodes\n"
"	EmitTopologyAndFitBounds( index, mortonCodeKeys, mortonCodeValues, updateCounters, primitives, scratchNodes, references );\n"
"	__syncthreads();\n"
// The root address is read from the last primitive's update counter before that array is reused.
"	const uint32_t rootAddr = updateCounters[primCount - 1];\n"
"	__syncthreads();\n"
"\n"
"	// STEP 5: Compute fat leaves\n"
// Only for TrianglePacketNode; the taskQueue storage (Morton codes are no longer needed) holds
// the triangle counts followed by the parent addresses.
"	if constexpr ( is_same<PrimitiveNode, TrianglePacketNode>::value )\n"
"	{\n"
"		uint32_t* triangleCounts = reinterpret_cast<uint32_t*>( taskQueue );\n"
"		uint32_t* parentAddrs	 = triangleCounts + primCount;\n"
"		updateCounters[index]	 = 0;\n"
"		ComputeParentAddrs( index, primCount, rootAddr, scratchNodes, parentAddrs );\n"
"		__syncthreads();\n"
"		ComputeFatLeaves( index, primCount, scratchNodes, parentAddrs, triangleCounts, updateCounters );\n"
"		__syncthreads();\n"
"	}\n"
"\n"
"	// STEP 6: Collapse\n"
// The task queue starts with the root in slot 0 and invalid entries everywhere else.
"	if ( index == 0 )\n"
"		taskQueue[index] = make_uint3( encodeNodeIndex( rootAddr, BoxType ), 0, 0 );\n"
"	else\n"
"		taskQueue[index] = make_uint3( InvalidValue, InvalidValue, InvalidValue );\n"
"	__syncthreads();\n"
"\n"
// NOTE(review): the loop bound depends on the thread index while the body contains
// __syncthreads(); threads may run a different number of iterations unless
// BranchingFactor * maxBoxNodeCount is a multiple of blockDim.x - confirm this is intended.
"	uint32_t* referenceIndices = updateCounters;\n"
"	for ( uint32_t i = index; i < BranchingFactor * maxBoxNodeCount; i += blockDim.x )\n"
"	{\n"
"		Collapse<PrimitiveNode>(\n"
"			i, maxBoxNodeCount, primCount, header, scratchNodes, references, boxNodes, taskQueue, referenceIndices );\n"
"		__syncthreads();\n"
"	}\n"
"\n"
"	PackLeaves(\n"
"		index, header->m_boxNodeCount, header, references, boxNodes, primNodes, primitives, taskQueue, referenceIndices );\n"
"}\n"
"\n"
// BatchBuild_hiprtGeometryBuildInput (embedded kernel source): extern "C" kernel, launch-bounded
// by BatchBuilderMaxBlockSize, in which each block handles the build input at
// index = blockIdx.x + gridDim.x * blockIdx.y; blocks with index >= count do nothing.
// A shared-memory arena is created over a __shared__ cache of BatchBuildCacheSize bytes and a
// storage arena over buffers[index] sized by getStorageBufferSize( buildInput ). The build is
// dispatched on buildInput.type: triangle meshes use build<TriangleNode>, AABB lists use
// build<CustomNode>; any other type value falls through the switch without building.
"extern \"C\" __global__ void __launch_bounds__( BatchBuilderMaxBlockSize )\n"
"	BatchBuild_hiprtGeometryBuildInput( uint32_t count, const hiprtGeometryBuildInput* buildInputs, hiprtDevicePtr* buffers )\n"
"{\n"
"	const uint32_t index = blockIdx.x + gridDim.x * blockIdx.y;\n"
"	if ( index < count )\n"
"	{\n"
"		alignas( BatchBuildCacheAlignment ) __shared__ uint8_t cache[BatchBuildCacheSize];\n"
"		MemoryArena sharedMemoryArena( cache, BatchBuildCacheSize, BatchBuildCacheAlignment );\n"
"\n"
"		hiprtGeometryBuildInput buildInput = buildInputs[index];\n"
"		MemoryArena				storageMemoryArena( buffers[index], getStorageBufferSize( buildInput ), DefaultAlignment );\n"
"\n"
"		switch ( buildInput.type )\n"
"		{\n"
"		case hiprtPrimitiveTypeTriangleMesh: {\n"
"			TriangleMesh mesh( buildInput.primitive.triangleMesh );\n"
"			build<TriangleNode>( mesh, buildInput.geomType, storageMemoryArena, sharedMemoryArena );\n"
"			break;\n"
"		}\n"
"		case hiprtPrimitiveTypeAABBList: {\n"
"			AabbList list( buildInput.primitive.aabbList );\n"
"			build<CustomNode>( list, buildInput.geomType, storageMemoryArena, sharedMemoryArena );\n"
"			break;\n"
"		}\n"
"		}\n"
"	}\n"
"}\n"
"\n"
// BatchBuild_hiprtSceneBuildInput (embedded kernel source): extern "C" kernel, launch-bounded by
// BatchBuilderMaxBlockSize, in which each block handles the scene build input at
// index = blockIdx.x + gridDim.x * blockIdx.y; blocks with index >= count do nothing.
// A shared-memory arena is created over a __shared__ cache of BatchBuildCacheSize bytes and a
// storage arena over buffers[index] sized by getStorageBufferSize( buildInput ). The build is
// dispatched on buildInput.frameType to an InstanceList<hiprtFrameSRT> or
// InstanceList<hiprtFrameMatrix>; both call build<InstanceNode> with hiprtInvalidValue as the
// geometry type. Any other frame type falls through the switch without building.
"extern \"C\" __global__ void __launch_bounds__( BatchBuilderMaxBlockSize )\n"
"	BatchBuild_hiprtSceneBuildInput( uint32_t count, const hiprtSceneBuildInput* buildInputs, hiprtDevicePtr* buffers )\n"
"{\n"
"	const uint32_t index = blockIdx.x + gridDim.x * blockIdx.y;\n"
"	if ( index < count )\n"
"	{\n"
"		alignas( BatchBuildCacheAlignment ) __shared__ uint8_t cache[BatchBuildCacheSize];\n"
"		MemoryArena sharedMemoryArena( cache, BatchBuildCacheSize, BatchBuildCacheAlignment );\n"
"\n"
"		hiprtSceneBuildInput buildInput = buildInputs[index];\n"
"		MemoryArena			 storageMemoryArena( buffers[index], getStorageBufferSize( buildInput ), DefaultAlignment );\n"
"\n"
"		switch ( buildInput.frameType )\n"
"		{\n"
"		case hiprtFrameTypeSRT: {\n"
"			InstanceList<hiprtFrameSRT> list( buildInput );\n"
"			build<InstanceNode>( list, hiprtInvalidValue, storageMemoryArena, sharedMemoryArena );\n"
"			break;\n"
"		}\n"
"		case hiprtFrameTypeMatrix: {\n"
"			InstanceList<hiprtFrameMatrix> list( buildInput );\n"
"			build<InstanceNode>( list, hiprtInvalidValue, storageMemoryArena, sharedMemoryArena );\n"
"			break;\n"
"		}\n"
"		}\n"
"	}\n"
"}\n"
;
