#define TYPE_NAME  "vec4"
#define TYPE_ELEMS 4
#define TYPE_SIZE  (TYPE_ELEMS*4)
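
/* The shaders below process TYPE_ELEMS x/y offsets at a time, packed into one
 * GLSL vec4 (DTYPE); TYPE_SIZE is the byte size of one such element in the
 * integral buffer. insert_first() emits the GLSL that seeds the integral
 * image: it samples the source pixel and the four offset-shifted pixels and
 * stores their squared differences. */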
GLSLF(4, s1    = texture(input_img[%i], pos + ivec2(%i + %s, %i + %s))[%i];
         ,plane, horiz ? r : 0, horiz ? off : "0", !horiz ? r : 0, !horiz ? off : "0", comp);
GLSLF(4, s2[0] = texture(input_img[%i], pos + offs[0] + ivec2(%i + %s, %i + %s))[%i];
         ,plane, horiz ? r : 0, horiz ? off : "0", !horiz ? r : 0, !horiz ? off : "0", comp);
GLSLF(4, s2[1] = texture(input_img[%i], pos + offs[1] + ivec2(%i + %s, %i + %s))[%i];
         ,plane, horiz ? r : 0, horiz ? off : "0", !horiz ? r : 0, !horiz ? off : "0", comp);
GLSLF(4, s2[2] = texture(input_img[%i], pos + offs[2] + ivec2(%i + %s, %i + %s))[%i];
         ,plane, horiz ? r : 0, horiz ? off : "0", !horiz ? r : 0, !horiz ? off : "0", comp);
GLSLF(4, s2[3] = texture(input_img[%i], pos + offs[3] + ivec2(%i + %s, %i + %s))[%i];
         ,plane, horiz ? r : 0, horiz ? off : "0", !horiz ? r : 0, !horiz ? off : "0", comp);
GLSLC(4, s2 = (s1 - s2) * (s1 - s2); );
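
/* Horizontal pass: each invocation walks nb_rows rows, accumulating a running
 * prefix sum of the squared differences along x directly in the integral
 * buffer (dst.v[]). */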
GLSLF(1, pos.y = int(gl_GlobalInvocationID.x) * %i; ,nb_rows);
GLSLC(1, barrier(); );
GLSLC(2, #pragma unroll(1) );
GLSLF(2, for (r = 0; r < %i; r++) { ,nb_rows);
GLSLC(3, prefix_sum = DTYPE(0); );
GLSLC(3, offset = int_stride * uint64_t(pos.y + r); );
GLSLC(3, dst = DataBuffer(uint64_t(integral_data) + offset); );
GLSLF(3, for (pos.x = 0; pos.x < width[%i]; pos.x++) { ,plane);
insert_first(shd, 0, "r", 0, plane, comp);
GLSLC(4, s2 = dst.v[pos.x]; );
GLSLC(4, dst.v[pos.x] = s2 + prefix_sum; );
GLSLC(4, prefix_sum += s2; );
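
/* Vertical pass: the same prefix-sum scheme along y, keeping one running sum
 * (psum[r]) per row handled by this invocation. */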
GLSLF(1, pos.x = int(gl_GlobalInvocationID.x) * %i; ,nb_rows);
GLSLC(1, #pragma unroll(1) );
GLSLF(1, for (r = 0; r < %i; r++) ,nb_rows);
GLSLC(2, psum[r] = DTYPE(0); );
GLSLC(1, barrier(); );
GLSLF(2, for (pos.y = 0; pos.y < height[%i]; pos.y++) { ,plane);
GLSLC(3, offset = int_stride * uint64_t(pos.y); );
GLSLC(3, dst = DataBuffer(uint64_t(integral_data) + offset); );
GLSLC(3, #pragma unroll(1) );
GLSLF(3, for (r = 0; r < %i; r++) { ,nb_rows);
insert_first(shd, 0, "r", 1, plane, comp);
GLSLC(4, s2 = dst.v[pos.x + r]; );
GLSLC(4, dst.v[pos.x + r] = s2 + psum[r]; );
GLSLC(4, psum[r] += s2; );
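
/* Weights pass. Once the integral image I of squared differences is complete,
 * the patch distance over a (2p+1)^2 window centred on pos is recovered in
 * O(1) from four corner taps:
 *     I(y+p, x+p) + I(y-p, x-p) - I(y+p, x-p) - I(y-p, x+p)
 * which is the "d + a - b - c" expression below. Each component of DTYPE
 * carries one of the four x/y offsets being processed. */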
                                int t, int dst_comp, int plane, int comp)
GLSLF(1, p = patch_size[%i]; ,dst_comp);
GLSLC(1, barrier(); );
GLSLF(2, if (gl_GlobalInvocationID.x*%i >= width[%i]) ,nb_rows, plane);
GLSLF(2, for (r = 0; r < %i; r++) { ,nb_rows);
GLSLF(3, pos.x = int(gl_GlobalInvocationID.x) * %i + r; ,nb_rows);
GLSLF(1, for (pos.x = 0; pos.x < width[%i]; pos.x++) { ,plane);
GLSLF(2, if (gl_GlobalInvocationID.x*%i >= height[%i]) ,nb_rows, plane);
GLSLF(2, for (r = 0; r < %i; r++) { ,nb_rows);
GLSLF(3, pos.y = int(gl_GlobalInvocationID.x) * %i + r; ,nb_rows);
GLSLC(3, a = DTYPE(0); );
GLSLC(3, b = DTYPE(0); );
GLSLC(3, c = DTYPE(0); );
GLSLC(3, d = DTYPE(0); );
GLSLC(3, lt = ((pos.x - p) < 0) || ((pos.y - p) < 0); );
GLSLF(3, src[0] = texture(input_img[%i], pos + offs[0])[%i]; ,plane, comp);
GLSLF(3, src[1] = texture(input_img[%i], pos + offs[1])[%i]; ,plane, comp);
GLSLF(3, src[2] = texture(input_img[%i], pos + offs[2])[%i]; ,plane, comp);
GLSLF(3, src[3] = texture(input_img[%i], pos + offs[3])[%i]; ,plane, comp);
GLSLC(3, if (lt == false) { );
GLSLC(3,     offset = int_stride * uint64_t(pos.y - p); );
GLSLC(3,     dst = DataBuffer(uint64_t(integral_data) + offset); );
GLSLC(4,     a = dst.v[pos.x - p]; );
GLSLC(4,     c = dst.v[pos.x + p]; );
GLSLC(3,     offset = int_stride * uint64_t(pos.y + p); );
GLSLC(3,     dst = DataBuffer(uint64_t(integral_data) + offset); );
GLSLC(4,     b = dst.v[pos.x - p]; );
GLSLC(4,     d = dst.v[pos.x + p]; );
GLSLC(3, patch_diff = d + a - b - c; );
GLSLF(3, w = exp(patch_diff * strength[%i]); ,dst_comp);
GLSLC(3, w_sum = w[0] + w[1] + w[2] + w[3]; );
GLSLC(3, sum = dot(w, src*255); );
GLSLF(3, atomicAdd(weights_%i[pos.y*ws_stride[%i] + pos.x], w_sum); ,dst_comp, dst_comp);
GLSLF(3, atomicAdd(sums_%i[pos.y*ws_stride[%i] + pos.x], sum); ,dst_comp, dst_comp);
GLSLF(3, weights_%i[pos.y*ws_stride[%i] + pos.x] += w_sum; ,dst_comp, dst_comp);
GLSLF(3, sums_%i[pos.y*ws_stride[%i] + pos.x] += sum; ,dst_comp, dst_comp);
typedef struct HorizontalPushData {
    uint32_t width[4];
    uint32_t height[4];
    uint32_t ws_stride[4];
    int32_t  patch_size[4];
    float    strength[4];
    VkDeviceAddress integral_base;
    uint64_t integral_size;
    uint64_t int_stride;
    uint32_t xyoffs_start;
} HorizontalPushData;
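
/* This struct must match the std430 push-constant block declared in the
 * shader below, member for member, or the shader will read garbage. */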
void *spv_opaque = NULL;
uint32_t max_wg = vkctx->props.properties.limits.maxComputeWorkGroupSize[0];
int wg_size, wg_rows;

if (max_wg > max_dim) {
    wg_size = max_dim;
} else if (max_wg < max_dim) {
    while (wg_size*wg_rows < max_dim)
        wg_rows++;
}
VK_SHADER_STAGE_COMPUTE_BIT,
(const char *[]) { "GL_EXT_buffer_reference",
                   "GL_EXT_buffer_reference2" }, 2,
GLSLC(0, #extension GL_EXT_shader_atomic_float : require );
GLSLC(0, #extension GL_ARB_gpu_shader_int64 : require );
GLSLC(0, layout(buffer_reference, buffer_reference_align = T_ALIGN) buffer DataBuffer { );
GLSLC(1, DTYPE v[]; );
GLSLC(0, layout(push_constant, std430) uniform pushConstants { );
GLSLC(1, uvec4 ws_stride; );
GLSLC(1, ivec4 patch_size; );
GLSLC(1, vec4 strength; );
GLSLC(1, DataBuffer integral_base; );
GLSLC(1, uint64_t integral_size; );
GLSLC(1, uint64_t int_stride; );
GLSLC(1, uint xyoffs_start; );
VK_SHADER_STAGE_COMPUTE_BIT);

{
    .type        = VK_DESCRIPTOR_TYPE_COMBINED_IMAGE_SAMPLER,
    .stages      = VK_SHADER_STAGE_COMPUTE_BIT,
},
{
    .name        = "weights_buffer_0",
    .type        = VK_DESCRIPTOR_TYPE_STORAGE_BUFFER,
    .stages      = VK_SHADER_STAGE_COMPUTE_BIT,
    .buf_content = "float weights_0[];",
},
{
    .name        = "sums_buffer_0",
    .type        = VK_DESCRIPTOR_TYPE_STORAGE_BUFFER,
    .stages      = VK_SHADER_STAGE_COMPUTE_BIT,
    .buf_content = "float sums_0[];",
},
{
    .name        = "weights_buffer_1",
    .type        = VK_DESCRIPTOR_TYPE_STORAGE_BUFFER,
    .stages      = VK_SHADER_STAGE_COMPUTE_BIT,
    .buf_content = "float weights_1[];",
},
{
    .name        = "sums_buffer_1",
    .type        = VK_DESCRIPTOR_TYPE_STORAGE_BUFFER,
    .stages      = VK_SHADER_STAGE_COMPUTE_BIT,
    .buf_content = "float sums_1[];",
},
{
    .name        = "weights_buffer_2",
    .type        = VK_DESCRIPTOR_TYPE_STORAGE_BUFFER,
    .stages      = VK_SHADER_STAGE_COMPUTE_BIT,
    .buf_content = "float weights_2[];",
},
{
    .name        = "sums_buffer_2",
    .type        = VK_DESCRIPTOR_TYPE_STORAGE_BUFFER,
    .stages      = VK_SHADER_STAGE_COMPUTE_BIT,
    .buf_content = "float sums_2[];",
},
{
    .name        = "weights_buffer_3",
    .type        = VK_DESCRIPTOR_TYPE_STORAGE_BUFFER,
    .stages      = VK_SHADER_STAGE_COMPUTE_BIT,
    .buf_content = "float weights_3[];",
},
{
    .name        = "sums_buffer_3",
    .type        = VK_DESCRIPTOR_TYPE_STORAGE_BUFFER,
    .stages      = VK_SHADER_STAGE_COMPUTE_BIT,
    .buf_content = "float sums_3[];",
},
{
    .name        = "xyoffsets_buffer",
    .type        = VK_DESCRIPTOR_TYPE_STORAGE_BUFFER,
    .mem_quali   = "readonly",
    .stages      = VK_SHADER_STAGE_COMPUTE_BIT,
    .buf_content = "ivec2 xyoffsets[];",
},
GLSLC(1, float s1; );
GLSLC(1, DTYPE s2; );
GLSLC(1, DTYPE prefix_sum; );
GLSLF(1, DTYPE psum[%i]; ,*nb_rows);
GLSLC(1, DataBuffer integral_data; );
GLSLC(1, int invoc_idx = int(gl_WorkGroupID.z); );
GLSLC(1, integral_data = DataBuffer(uint64_t(integral_base) + offset); );
GLSLC(1, DTYPE patch_diff; );
GLSLC(1, float w_sum; );
GLSLC(1, float sum; );
for (int i = 0; i < desc->nb_components; i++) {

RET(spv->compile_shader(vkctx, spv, shd, &spv_data, &spv_len, "main", &spv_opaque));
typedef struct DenoisePushData {
    uint32_t ws_stride[4];
} DenoisePushData;
void *spv_opaque = NULL;

VK_SHADER_STAGE_COMPUTE_BIT,
(const char *[]) { "GL_EXT_buffer_reference",
                   "GL_EXT_buffer_reference2" }, 2,

GLSLC(0, layout(push_constant, std430) uniform pushConstants { );
GLSLC(1, uvec4 ws_stride; );

VK_SHADER_STAGE_COMPUTE_BIT);
{
    .type        = VK_DESCRIPTOR_TYPE_COMBINED_IMAGE_SAMPLER,
    .stages      = VK_SHADER_STAGE_COMPUTE_BIT,
},
{
    .name        = "output_img",
    .type        = VK_DESCRIPTOR_TYPE_STORAGE_IMAGE,
    .mem_quali   = "writeonly",
    .stages      = VK_SHADER_STAGE_COMPUTE_BIT,
},
{
    .name        = "weights_buffer_0",
    .type        = VK_DESCRIPTOR_TYPE_STORAGE_BUFFER,
    .mem_quali   = "readonly",
    .stages      = VK_SHADER_STAGE_COMPUTE_BIT,
    .buf_content = "float weights_0[];",
},
{
    .name        = "sums_buffer_0",
    .type        = VK_DESCRIPTOR_TYPE_STORAGE_BUFFER,
    .mem_quali   = "readonly",
    .stages      = VK_SHADER_STAGE_COMPUTE_BIT,
    .buf_content = "float sums_0[];",
},
{
    .name        = "weights_buffer_1",
    .type        = VK_DESCRIPTOR_TYPE_STORAGE_BUFFER,
    .mem_quali   = "readonly",
    .stages      = VK_SHADER_STAGE_COMPUTE_BIT,
    .buf_content = "float weights_1[];",
},
{
    .name        = "sums_buffer_1",
    .type        = VK_DESCRIPTOR_TYPE_STORAGE_BUFFER,
    .mem_quali   = "readonly",
    .stages      = VK_SHADER_STAGE_COMPUTE_BIT,
    .buf_content = "float sums_1[];",
},
{
    .name        = "weights_buffer_2",
    .type        = VK_DESCRIPTOR_TYPE_STORAGE_BUFFER,
    .mem_quali   = "readonly",
    .stages      = VK_SHADER_STAGE_COMPUTE_BIT,
    .buf_content = "float weights_2[];",
},
{
    .name        = "sums_buffer_2",
    .type        = VK_DESCRIPTOR_TYPE_STORAGE_BUFFER,
    .mem_quali   = "readonly",
    .stages      = VK_SHADER_STAGE_COMPUTE_BIT,
    .buf_content = "float sums_2[];",
},
{
    .name        = "weights_buffer_3",
    .type        = VK_DESCRIPTOR_TYPE_STORAGE_BUFFER,
    .mem_quali   = "readonly",
    .stages      = VK_SHADER_STAGE_COMPUTE_BIT,
    .buf_content = "float weights_3[];",
},
{
    .name        = "sums_buffer_3",
    .type        = VK_DESCRIPTOR_TYPE_STORAGE_BUFFER,
    .mem_quali   = "readonly",
    .stages      = VK_SHADER_STAGE_COMPUTE_BIT,
    .buf_content = "float sums_3[];",
},
GLSLC(1, const ivec2 pos = ivec2(gl_GlobalInvocationID.xy); );
GLSLC(1, const uint plane = uint(gl_WorkGroupID.z); );
GLSLC(1, float w_sum; );
GLSLC(1, float sum; );
GLSLC(1, size = imageSize(output_img[plane]); );
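
/* Final normalization, per component: the accumulated weighted sum is merged
 * with the source pixel, which effectively carries weight 1.0, i.e.
 *     out = (sum + src*255) / (1 + w_sum) / 255
 * keeping the result in the normalized [0,1] texel range. */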
for (int c = 0; c < desc->nb_components; c++) {
GLSLF(2, r[%i] = (sum + src[%i]*255) / (1.0 + w_sum) / 255; ,off, off);

GLSLC(1, imageStore(output_img[plane], pos, r); );
RET(spv->compile_shader(vkctx, spv, shd, &spv_data, &spv_len, "main", &spv_opaque));
int xcnt = 0, ycnt = 0;
int offsets_dispatched = 0, nb_dispatches = 0;

if (!(s->opts.r & 1)) {
if (!(s->opts.p & 1)) {
for (int i = 0; i < 4; i++) {
    double str = (s->opts.sc[i] > 1.0) ? s->opts.sc[i] : s->opts.s;
    int ps = (s->opts.pc[i] ? s->opts.pc[i] : s->opts.p);

    str = 255.0*255.0 / str;
    s->strength[i] = str;

    s->patch[i] = ps / 2;
s->nb_offsets = (2*rad + 1)*(2*rad + 1) - 1;
s->xoffsets = av_malloc(s->nb_offsets*sizeof(*s->xoffsets));
s->yoffsets = av_malloc(s->nb_offsets*sizeof(*s->yoffsets));

for (int x = -rad; x <= rad; x++) {
    for (int y = -rad; y <= rad; y++) {
        s->xoffsets[xcnt++] = x;
        s->yoffsets[ycnt++] = y;
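
/* nb_offsets is (2*rad + 1)^2 - 1: every (x, y) offset in the research window
 * except, going by the -1 in the formula, the centre (0, 0), which the
 * denoise pass covers implicitly by folding the source pixel in with weight
 * 1.0. */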
VK_BUFFER_USAGE_SHADER_DEVICE_ADDRESS_BIT |
VK_BUFFER_USAGE_STORAGE_BUFFER_BIT,
VK_MEMORY_PROPERTY_HOST_VISIBLE_BIT |
VK_MEMORY_PROPERTY_DEVICE_LOCAL_BIT));
for (int i = 0; i < 2*s->nb_offsets; i += 2) {
    offsets_buf[i + 0] = s->xoffsets[i >> 1];
    offsets_buf[i + 1] = s->yoffsets[i >> 1];

       "disabling dispatch parallelism\n");
spv = ff_vk_spirv_init();
RET(init_weights_pipeline(vkctx, &s->e, &s->shd_weights, s->sampler,
                          spv, s->vkctx.output_width, s->vkctx.output_height,

RET(init_denoise_pipeline(vkctx, &s->e, &s->shd_denoise, s->sampler,

                                    &s->xyoffsets_buf, 0, s->xyoffsets_buf.size,
                                    VK_FORMAT_UNDEFINED));
do {
    int wg_invoc = FFMIN((s->nb_offsets - offsets_dispatched)/TYPE_ELEMS, s->opts.t);
    wg_invoc = FFMIN(wg_invoc, vkctx->props.properties.limits.maxComputeWorkGroupCount[2]);
    offsets_dispatched += wg_invoc * TYPE_ELEMS;
    nb_dispatches++;
} while (offsets_dispatched < s->nb_offsets);

       s->nb_offsets, nb_dispatches);
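
/* denoise_pass(): pushes the per-plane strides, makes the weights/sums buffer
 * visible to the compute stage with a buffer barrier, then dispatches the
 * final denoise shader. */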
VkBufferMemoryBarrier2 buf_bar[8];

DenoisePushData pd = {
    { ws_stride[0], ws_stride[1], ws_stride[2], ws_stride[3] },

VK_SHADER_STAGE_COMPUTE_BIT,

buf_bar[nb_buf_bar++] = (VkBufferMemoryBarrier2) {
    .sType = VK_STRUCTURE_TYPE_BUFFER_MEMORY_BARRIER_2,
    .srcStageMask = ws_vk->stage,
    .dstStageMask = VK_PIPELINE_STAGE_2_COMPUTE_SHADER_BIT,
    .srcAccessMask = ws_vk->access,
    .dstAccessMask = VK_ACCESS_2_SHADER_STORAGE_READ_BIT,
    .srcQueueFamilyIndex = VK_QUEUE_FAMILY_IGNORED,
    .dstQueueFamilyIndex = VK_QUEUE_FAMILY_IGNORED,
    .buffer = ws_vk->buf,

vk->CmdPipelineBarrier2(exec->buf, &(VkDependencyInfo) {
    .sType = VK_STRUCTURE_TYPE_DEPENDENCY_INFO,
    .pBufferMemoryBarriers = buf_bar,
    .bufferMemoryBarrierCount = nb_buf_bar,

ws_vk->stage = buf_bar[0].dstStageMask;
ws_vk->access = buf_bar[0].dstAccessMask;

vk->CmdDispatch(exec->buf,
int plane_heights[4];
int offsets_dispatched = 0;

VkDeviceSize weights_offs[4];
VkDeviceSize sums_offs[4];
uint32_t ws_stride[4];
size_t ws_total_size = 0;

VkImageMemoryBarrier2 img_bar[8];
VkBufferMemoryBarrier2 buf_bar[8];
int_stride = s->shd_weights.lg_size[0]*s->pl_weights_rows*TYPE_SIZE;
int_size = s->shd_weights.lg_size[0]*s->pl_weights_rows*int_stride;

for (int i = 0; i < desc->nb_components; i++) {
    plane_widths[i] = FFALIGN(plane_widths[i], s->shd_denoise.lg_size[0]);
    plane_heights[i] = FFALIGN(plane_heights[i], s->shd_denoise.lg_size[1]);

    ws_stride[i] = plane_widths[i];
    ws_size[i] = ws_stride[i] * plane_heights[i] * sizeof(float);
    ws_total_size += ws_size[i];
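
/* All per-component weights and sums planes share a single buffer allocation:
 * the weights occupy the first ws_total_size bytes and the sums follow,
 * addressed via the weights_offs[]/sums_offs[] tables computed below. */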
VK_BUFFER_USAGE_STORAGE_BUFFER_BIT |
VK_BUFFER_USAGE_SHADER_DEVICE_ADDRESS_BIT,
s->opts.t * int_size,
VK_MEMORY_PROPERTY_DEVICE_LOCAL_BIT);

VK_BUFFER_USAGE_STORAGE_BUFFER_BIT |
VK_BUFFER_USAGE_TRANSFER_DST_BIT |
VK_BUFFER_USAGE_SHADER_DEVICE_ADDRESS_BIT,
VK_MEMORY_PROPERTY_DEVICE_LOCAL_BIT);
sums_offs[0] = ws_total_size;
for (int i = 1; i < desc->nb_components; i++) {
    weights_offs[i] = weights_offs[i - 1] + ws_size[i - 1];
    sums_offs[i] = sums_offs[i - 1] + ws_size[i - 1];
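
/* Transition the input to shader-read and the output to general layout, then
 * zero the weights/sums buffer with vkCmdFillBuffer; the buffer barriers
 * below order the fill against prior accesses and the upcoming compute
 * reads/writes. */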
VK_PIPELINE_STAGE_2_ALL_COMMANDS_BIT,
VK_PIPELINE_STAGE_2_COMPUTE_SHADER_BIT));
VK_PIPELINE_STAGE_2_ALL_COMMANDS_BIT,
VK_PIPELINE_STAGE_2_COMPUTE_SHADER_BIT));

VK_PIPELINE_STAGE_2_ALL_COMMANDS_BIT,
VK_PIPELINE_STAGE_2_COMPUTE_SHADER_BIT,
VK_ACCESS_SHADER_READ_BIT,
VK_IMAGE_LAYOUT_SHADER_READ_ONLY_OPTIMAL,
VK_QUEUE_FAMILY_IGNORED);

VK_PIPELINE_STAGE_2_ALL_COMMANDS_BIT,
VK_PIPELINE_STAGE_2_COMPUTE_SHADER_BIT,
VK_ACCESS_SHADER_WRITE_BIT,
VK_IMAGE_LAYOUT_GENERAL,
VK_QUEUE_FAMILY_IGNORED);
buf_bar[nb_buf_bar++] = (VkBufferMemoryBarrier2) {
    .sType = VK_STRUCTURE_TYPE_BUFFER_MEMORY_BARRIER_2,
    .srcStageMask = ws_vk->stage,
    .dstStageMask = VK_PIPELINE_STAGE_2_TRANSFER_BIT,
    .srcAccessMask = ws_vk->access,
    .dstAccessMask = VK_ACCESS_2_TRANSFER_WRITE_BIT,
    .srcQueueFamilyIndex = VK_QUEUE_FAMILY_IGNORED,
    .dstQueueFamilyIndex = VK_QUEUE_FAMILY_IGNORED,
    .buffer = ws_vk->buf,

buf_bar[nb_buf_bar++] = (VkBufferMemoryBarrier2) {
    .sType = VK_STRUCTURE_TYPE_BUFFER_MEMORY_BARRIER_2,
    .srcStageMask = integral_vk->stage,
    .dstStageMask = VK_PIPELINE_STAGE_2_COMPUTE_SHADER_BIT,
    .srcAccessMask = integral_vk->access,
    .dstAccessMask = VK_ACCESS_2_SHADER_STORAGE_READ_BIT |
                     VK_ACCESS_2_SHADER_STORAGE_WRITE_BIT,
    .srcQueueFamilyIndex = VK_QUEUE_FAMILY_IGNORED,
    .dstQueueFamilyIndex = VK_QUEUE_FAMILY_IGNORED,
    .buffer = integral_vk->buf,
    .size = integral_vk->size,

vk->CmdPipelineBarrier2(exec->buf, &(VkDependencyInfo) {
    .sType = VK_STRUCTURE_TYPE_DEPENDENCY_INFO,
    .pImageMemoryBarriers = img_bar,
    .imageMemoryBarrierCount = nb_img_bar,
    .pBufferMemoryBarriers = buf_bar,
    .bufferMemoryBarrierCount = nb_buf_bar,

ws_vk->stage = buf_bar[0].dstStageMask;
ws_vk->access = buf_bar[0].dstAccessMask;
integral_vk->stage = buf_bar[1].dstStageMask;
integral_vk->access = buf_bar[1].dstAccessMask;

vk->CmdFillBuffer(exec->buf, ws_vk->buf, 0, ws_vk->size, 0x0);
buf_bar[nb_buf_bar++] = (VkBufferMemoryBarrier2) {
    .sType = VK_STRUCTURE_TYPE_BUFFER_MEMORY_BARRIER_2,
    .srcStageMask = ws_vk->stage,
    .dstStageMask = VK_PIPELINE_STAGE_2_COMPUTE_SHADER_BIT,
    .srcAccessMask = ws_vk->access,
    .dstAccessMask = VK_ACCESS_2_SHADER_STORAGE_READ_BIT |
                     VK_ACCESS_2_SHADER_STORAGE_WRITE_BIT,
    .srcQueueFamilyIndex = VK_QUEUE_FAMILY_IGNORED,
    .dstQueueFamilyIndex = VK_QUEUE_FAMILY_IGNORED,
    .buffer = ws_vk->buf,

vk->CmdPipelineBarrier2(exec->buf, &(VkDependencyInfo) {
    .sType = VK_STRUCTURE_TYPE_DEPENDENCY_INFO,
    .pBufferMemoryBarriers = buf_bar,
    .bufferMemoryBarrierCount = nb_buf_bar,

ws_vk->stage = buf_bar[0].dstStageMask;
ws_vk->access = buf_bar[0].dstAccessMask;
VK_IMAGE_LAYOUT_SHADER_READ_ONLY_OPTIMAL,

for (int i = 0; i < desc->nb_components; i++) {
        ws_vk, weights_offs[i], ws_size[i],
        VK_FORMAT_UNDEFINED));
        ws_vk, sums_offs[i], ws_size[i],
        VK_FORMAT_UNDEFINED));

VK_IMAGE_LAYOUT_SHADER_READ_ONLY_OPTIMAL,
VK_IMAGE_LAYOUT_GENERAL, s->sampler);
for (int i = 0; i < desc->nb_components; i++) {
        ws_vk, weights_offs[i], ws_size[i],
        VK_FORMAT_UNDEFINED));
        ws_vk, sums_offs[i], ws_size[i],
        VK_FORMAT_UNDEFINED));
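
/* Main dispatch loop: each iteration dispatches up to wg_invoc workgroups
 * along z (gl_WorkGroupID.z selects the offset group), each covering
 * TYPE_ELEMS x/y offsets, with a barrier on the integral buffer between
 * iterations so dispatches reusing it do not overlap. */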
HorizontalPushData pd = {
    { plane_widths[0], plane_widths[1], plane_widths[2], plane_widths[3] },
    { plane_heights[0], plane_heights[1], plane_heights[2], plane_heights[3] },
    { ws_stride[0], ws_stride[1], ws_stride[2], ws_stride[3] },
    { s->patch[0], s->patch[1], s->patch[2], s->patch[3] },
    { s->strength[0], s->strength[1], s->strength[2], s->strength[3], },

    (uint64_t)int_stride,

VK_SHADER_STAGE_COMPUTE_BIT,
if (offsets_dispatched) {
    nb_buf_bar = 0;
    buf_bar[nb_buf_bar++] = (VkBufferMemoryBarrier2) {
        .sType = VK_STRUCTURE_TYPE_BUFFER_MEMORY_BARRIER_2,
        .srcStageMask = integral_vk->stage,
        .dstStageMask = VK_PIPELINE_STAGE_2_COMPUTE_SHADER_BIT,
        .srcAccessMask = integral_vk->access,
        .dstAccessMask = VK_ACCESS_2_SHADER_STORAGE_READ_BIT |
                         VK_ACCESS_2_SHADER_STORAGE_WRITE_BIT,
        .srcQueueFamilyIndex = VK_QUEUE_FAMILY_IGNORED,
        .dstQueueFamilyIndex = VK_QUEUE_FAMILY_IGNORED,
        .buffer = integral_vk->buf,
        .size = integral_vk->size,

    vk->CmdPipelineBarrier2(exec->buf, &(VkDependencyInfo) {
        .sType = VK_STRUCTURE_TYPE_DEPENDENCY_INFO,
        .pBufferMemoryBarriers = buf_bar,
        .bufferMemoryBarrierCount = nb_buf_bar,

    integral_vk->stage = buf_bar[0].dstStageMask;
    integral_vk->access = buf_bar[0].dstAccessMask;
wg_invoc = FFMIN(wg_invoc, vkctx->props.properties.limits.maxComputeWorkGroupCount[2]);

vk->CmdDispatch(exec->buf, 1, 1, wg_invoc);

} while (offsets_dispatched < s->nb_offsets);

RET(denoise_pass(s, exec, ws_vk, ws_stride));
#define OFFSET(x) offsetof(NLMeansVulkanContext, x)
#define FLAGS (AV_OPT_FLAG_FILTERING_PARAM | AV_OPT_FLAG_VIDEO_PARAM)
static const AVOption nlmeans_vulkan_options[] = {

static const AVFilterPad nlmeans_vulkan_inputs[] = {
        .filter_frame = &nlmeans_vulkan_filter_frame,

static const AVFilterPad nlmeans_vulkan_outputs[] = {

    .name        = "nlmeans_vulkan",
    .uninit      = &nlmeans_vulkan_uninit,
    .priv_class  = &nlmeans_vulkan_class,