AMD FSR1 is a super resolution technology to reduce rasterization costs at high resolution.
Although there already exist more advanced technologies like AMD FSR2, NVidia DLSS, Intel XeSS, and Unreal Engine 5's Temporal Super Resolution, they require more GPU resources than FSR1 as their temporal upscaling parts expect you to preserve previous frame's history and generate velocity buffers, which are not free. In addition, some of them might be just unavailable for your use case as they are vendor-specific or not fully open-sourced.
FSR1 is still an easy-to-integrate solution that achieves reasonable visuals. Its homepage says the officially supported APIs are DirectX 12, DirectX 11, and Vulkan, but with little effort FSR1 can also be integrated into OpenGL projects.
The FSR1.0 Documentation contains a clear integration guide, so I'll talk about only OpenGL-related caveats.
FSR1 is designed to utilize half-precision floats (FP16) and additionally includes a full-precision fallback (FP32) for older hardware. You need the following GLSL extensions for FP16 mode:
- GL_EXT_shader_16bit_storage for 16-bit typedefs
- GL_EXT_shader_explicit_arithmetic_types for 16-bit math
But even the latest graphics cards will report that no such extensions exist when queried via the OpenGL API, because the OpenGL spec has simply stopped evolving since 2017. I checked them with an RTX 3080 Ti and a Ryzen 6800U in Oct 2022.
Alternatively, instead of GL_EXT_shader_16bit_storage
you can query for
GL_NV_gpu_shader5
which defines exactly the same 16-bit types. Although it's an NV extension, it's possible that
other vendors also support it, including the AMD Ryzen 6800U.
I also have an Intel Arc A770 but haven't checked it yet.
In the case of GL_EXT_shader_explicit_arithmetic_types
there is no alternative,
so you have to manually define the following 16-bit operations that are required by FSR1.
// Reinterpret the bits of 16-bit floats as 16-bit unsigned integers (scalar and vector forms).
uint16_t halfBitsToUint16(float16_t v);
u16vec2 halfBitsToUint16(f16vec2 v);
u16vec3 halfBitsToUint16(f16vec3 v);
u16vec4 halfBitsToUint16(f16vec4 v);
// Reinterpret the bits of 16-bit unsigned integers as 16-bit floats (scalar and vector forms).
float16_t uint16BitsToHalf(uint16_t v);
f16vec2 uint16BitsToHalf(u16vec2 v);
f16vec3 uint16BitsToHalf(u16vec3 v);
f16vec4 uint16BitsToHalf(u16vec4 v);
// Pack two 16-bit unsigned integers into one 32-bit word, and the reverse.
uint32_t packUint2x16(u16vec2 v);
u16vec2 unpackUint2x16(uint32_t v);
My implementation is provided in the section below, though I can't guarantee it's the best code.
It's possible that even GL_NV_gpu_shader5
is unavailable,
so you have to prepare for 3 variants before compiling FSR1 shaders:
FP32 mode, standard FP16 mode, and alternative FP16 mode.
The following code is an example of dealing with the GLSL extensions discussed above. It's shared by my EASU and RCAS wrappers.
To compile this code you must provide the definition of FP16_CRITERIA.
Its value is determined by which of these extensions are available:
GL_EXT_shader_16bit_storage
GL_EXT_shader_explicit_arithmetic_types
GL_NV_gpu_shader5
- If GL_EXT_shader_16bit_storage and GL_EXT_shader_explicit_arithmetic_types are both available, then provide #define FP16_CRITERIA 1, though it's an unlikely case for OpenGL.
- If GL_EXT_shader_16bit_storage and/or GL_EXT_shader_explicit_arithmetic_types are unavailable but GL_NV_gpu_shader5 is available, then provide #define FP16_CRITERIA 2.
- Otherwise, provide #define FP16_CRITERIA 0.
/* fsr1_wrapper_common.glsl */
// Shared preamble for the FSR1 EASU and RCAS passes.
// FP16_CRITERIA must be defined before this file is processed:
//   0 : no usable FP16 extensions, FSR1 must use its FP32 fallback.
//   1 : the standard extensions required by FSR1 are available.
//   2 : GL_NV_gpu_shader5 is available.
#if FP16_CRITERIA == 2
#extension GL_NV_gpu_shader5 : enable
#define A_HALF 1
// NOTE(review): A_SKIP_EXT presumably stops ffx_a.h from requesting the EXT
// extensions itself; confirm against ffx_a.h.
#define A_SKIP_EXT 1

// ---- float16 bits -> uint16 ------------------------------------------------
uint16_t halfBitsToUint16(float16_t v)
{
    // Pack v next to a zero half, then narrow the 32-bit word to 16 bits.
    uint packedBits = packFloat2x16(f16vec2(v, 0));
    return uint16_t(packedBits);
}
u16vec2 halfBitsToUint16(f16vec2 v)
{
    return u16vec2(halfBitsToUint16(v.x), halfBitsToUint16(v.y));
}
u16vec3 halfBitsToUint16(f16vec3 v)
{
    return u16vec3(halfBitsToUint16(v.x), halfBitsToUint16(v.y), halfBitsToUint16(v.z));
}
u16vec4 halfBitsToUint16(f16vec4 v)
{
    return u16vec4(halfBitsToUint16(v.x), halfBitsToUint16(v.y),
                   halfBitsToUint16(v.z), halfBitsToUint16(v.w));
}

// ---- uint16 bits -> float16 ------------------------------------------------
float16_t uint16BitsToHalf(uint16_t v)
{
    // Widen to 32 bits, unpack as two halves and keep the first one.
    f16vec2 halves = unpackFloat2x16(uint(v));
    return halves.x;
}
f16vec2 uint16BitsToHalf(u16vec2 v)
{
    return f16vec2(uint16BitsToHalf(v.x), uint16BitsToHalf(v.y));
}
f16vec3 uint16BitsToHalf(u16vec3 v)
{
    return f16vec3(uint16BitsToHalf(v.x), uint16BitsToHalf(v.y), uint16BitsToHalf(v.z));
}
f16vec4 uint16BitsToHalf(u16vec4 v)
{
    return f16vec4(uint16BitsToHalf(v.x), uint16BitsToHalf(v.y),
                   uint16BitsToHalf(v.z), uint16BitsToHalf(v.w));
}

// ---- 2 x uint16 <-> uint32 -------------------------------------------------
uint32_t packUint2x16(u16vec2 v)
{
    // v.x occupies bits 0..15, v.y occupies bits 16..31.
    uint lowWord = uint(v.x);
    uint highWord = uint(v.y) << 16;
    return highWord | lowWord;
}
u16vec2 unpackUint2x16(uint32_t v)
{
    // Inverse of packUint2x16: x comes from bits 0..15, y from bits 16..31.
    return u16vec2(v & 0xffff, (v >> 16) & 0xffff);
}
#elif FP16_CRITERIA == 1
#define A_HALF 1
#endif // FP16_CRITERIA
And this is my EASU wrapper. The RCAS wrapper has a similar form.
/* fsr1_easu_wrapper.glsl */
// Compute shader wrapping the EASU pass of FSR1.
// NOTE(review): #include is not core GLSL; presumably the host application's
// shader loader resolves these directives before compilation. Confirm.
#version 460 core
// Evaluates FP16_CRITERIA, which must already be defined, and may define A_HALF / A_SKIP_EXT.
#include "fsr1_wrapper_common.glsl"
// Configuration macros defined ahead of the ffx_a.h include below.
#define A_GPU 1
#define A_GLSL 1
#include "fsr1/ffx_a.h"
// 64 invocations per work group; main() remaps them with ARmp8x8() and
// has each one write four output texels.
layout (local_size_x = 64, local_size_y = 1, local_size_z = 1) in;
// Tone-mapped and downscaled input
layout (binding = 0) uniform sampler2D inImage;
// Upscaled output
layout (binding = 1, rgba16f) writeonly uniform image2D outImage;
// Per-channel gather callbacks over inImage; the last textureGather argument
// selects the channel (0 = red, 1 = green, 2 = blue).

// fp32
AF4 FsrEasuRF(AF2 p)
{
    return textureGather(inImage, p, 0);
}
AF4 FsrEasuGF(AF2 p)
{
    return textureGather(inImage, p, 1);
}
AF4 FsrEasuBF(AF2 p)
{
    return textureGather(inImage, p, 2);
}

// fp16: the same gathers, converted to half precision.
#if defined(A_HALF)
AH4 FsrEasuRH(AF2 p)
{
    return AH4(FsrEasuRF(p));
}
AH4 FsrEasuGH(AF2 p)
{
    return AH4(FsrEasuGF(p));
}
AH4 FsrEasuBH(AF2 p)
{
    return AH4(FsrEasuBF(p));
}
#endif
// Select the EASU precision variant before ffx_fsr1.h is included.
// A_HALF is either defined (FP16 path) or left undefined (FP32 fallback), so it
// is tested with #ifdef, matching the check that guards the FP16 gather
// callbacks. GLSL does not allow an undefined identifier to be evaluated in
// an #if expression, which the previous "#if A_HALF" did on the FP32 path.
#ifdef A_HALF
// FP16 path: enable the half-precision kernel.
#define FSR_EASU_H 1
#define COLOR_TYPE AH3
#define EASU_KERNEL FsrEasuH
#else
// FP32 fallback: enable the full-precision kernel.
#define FSR_EASU_F 1
#define COLOR_TYPE AF3
#define EASU_KERNEL FsrEasuF
#endif
#include "fsr1/ffx_fsr1.h"
// Upscaling parameters, laid out with std140 at uniform-buffer binding 1.
// The three size pairs are converted to float and passed to FsrEasuCon() in main().
layout (std140, binding = 1) uniform UBO_FSR1 {
// Size of the rendered (pre-upscale) viewport.
uvec2 renderViewportSize;
// Size of the texture that contains the rendered viewport.
uvec2 containerTextureSize;
// Size of the upscaled output viewport.
uvec2 upscaledViewportSize;
// Not read by main() in this wrapper.
// NOTE(review): presumably consumed by the RCAS wrapper; confirm.
float sharpness; // [0.0, 2.0], 0.0 is sharpest
// Explicit padding member; never read in this wrapper.
float _padding0;
} ubo;
// Runs the selected EASU kernel for one output texel and stores the result
// in outImage with alpha written as 0.0.
void easuResolveTexel(AU2 texel, AU4 c0, AU4 c1, AU4 c2, AU4 c3)
{
    // Receives the result of EASU_KERNEL().
    COLOR_TYPE upscaled = COLOR_TYPE(0, 0, 0);
    EASU_KERNEL(upscaled, texel, c0, c1, c2, c3);
    imageStore(outImage, ivec2(texel), vec4(vec3(upscaled), 0.0));
}

// Entry point: each invocation upscales four texels of a 16x16 output tile.
void main()
{
    // Build the EASU constants from the sizes in the uniform block.
    AU4 con0, con1, con2, con3;
    FsrEasuCon(con0, con1, con2, con3,
               float(ubo.renderViewportSize.x),
               float(ubo.renderViewportSize.y),
               float(ubo.containerTextureSize.x),
               float(ubo.containerTextureSize.y),
               float(ubo.upscaledViewportSize.x),
               float(ubo.upscaledViewportSize.y));

    // Each work group covers a 16x16 tile; ARmp8x8() places this invocation
    // inside the tile's first 8x8 quadrant.
    AU2 tileOrigin = AU2(gl_WorkGroupID.x << 4u, gl_WorkGroupID.y << 4u);
    AU2 base = ARmp8x8(gl_LocalInvocationID.x) + tileOrigin;

    // Visit the four quadrants in the order (0,0), (8,0), (8,8), (0,8).
    easuResolveTexel(base, con0, con1, con2, con3);
    easuResolveTexel(base + AU2(8u, 0u), con0, con1, con2, con3);
    easuResolveTexel(base + AU2(8u, 8u), con0, con1, con2, con3);
    easuResolveTexel(base + AU2(0u, 8u), con0, con1, con2, con3);
}
You can see the full code in my toy OpenGL project. (NOTE: The PR squashes a few commits unrelated to FSR1.)