From 6e1517ca3c6759cb41cb33c165a48f11cefc7737 Mon Sep 17 00:00:00 2001
From: =?UTF-8?q?Josef=20S=C3=B6ntgen?=
Date: Tue, 10 Jan 2023 18:16:00 +0100
Subject: [PATCH] libdrm/lima: introduce disjunct contexts

Prior to this change the libdrm Lima implementation supported the
creation of multiple contexts, but every context was treated as the
same client, mirroring the way the Lima driver itself handles them.
With this commit each context becomes its own client: the main context
performs all buffer-object allocations while the other contexts import
each needed BO before submitting.

Fixes #4760.
---
 repos/libports/src/lib/libdrm/ioctl_lima.cc | 592 +++++++++++++++-----
 1 file changed, 444 insertions(+), 148 deletions(-)

diff --git a/repos/libports/src/lib/libdrm/ioctl_lima.cc b/repos/libports/src/lib/libdrm/ioctl_lima.cc
index 2636ba7950..2415914b76 100644
--- a/repos/libports/src/lib/libdrm/ioctl_lima.cc
+++ b/repos/libports/src/lib/libdrm/ioctl_lima.cc
@@ -38,7 +38,7 @@ extern "C" {
 }
 
 
-enum { verbose_ioctl = false };
+static constexpr bool verbose_ioctl = false;
 
 
 /**
@@ -122,6 +122,17 @@ namespace Lima {
 
 	void serialize(drm_lima_gem_submit *submit, char *content);
 
+	template <typename FN>
+	void for_each_submit_bo(drm_lima_gem_submit const &submit, FN const &fn)
+	{
+		auto access_bo = [&] (drm_lima_gem_submit_bo const *bo) {
+			fn(*bo);
+		};
+
+		for_each_object((drm_lima_gem_submit_bo*)submit.bos,
+		                submit.nr_bos, access_bo);
+	}
+
 	size_t get_payload_size(drm_version const &version);
 
 } /* anonymous namespace */
@@ -189,19 +200,16 @@ namespace Lima {
 	using namespace Gpu;
 
 	struct Call;
-	struct Buffer;
 
 } /* namespace Lima */
 
 
-namespace Gpu {
-	using Buffer_id = Vram_id;
-}
-
-
-struct Gpu::Vram { };
-
-struct Lima::Buffer : Gpu::Vram
+/*
+ * Gpu::Vram encapsulates a buffer object allocation
+ */
+struct Gpu::Vram
 {
+	struct Allocation_failed : Genode::Exception { };
+
 	Gpu::Connection &_gpu;
 
 	Vram_id_space::Element const _elem;
@@ -212,22 +220,31 @@ struct Lima::Buffer : Gpu::Vram
 
 	Genode::Constructible<Genode::Attached_dataspace> _attached_buffer { };
 
-	Buffer(Gpu::Connection &gpu,
-	       size_t size,
-	       Gpu::Virtual_address va,
-	       Vram_id_space &space)
+	Vram(Gpu::Connection &gpu,
+	     size_t size,
+	     Gpu::Virtual_address va,
+	     Vram_id_space &space)
 	:
 		_gpu { gpu },
 		_elem { *this, space },
-		size { size }, va { va }
+		size { size },
+		va { va }
 	{
+		/*
+		 * As we cannot easily enforce a GPU virtual address after the
+		 * fact when using the DRM Lima API, we instead use 'map_gpu' to
+		 * create the buffer object.
+		 *
+		 * A valid virtual address needs to be enforced before attempting
+		 * the allocation.
+		 */
 		if (!_gpu.map_gpu(_elem.id(), size, 0, va))
-			throw -1;
+			throw Allocation_failed();
 
 		cap = _gpu.map_cpu(_elem.id(), Gpu::Mapping_attributes());
 	}
 
-	~Buffer()
+	~Vram()
 	{
 		_gpu.unmap_gpu(_elem.id(), 0, Virtual_address());
 	}
@@ -246,7 +263,7 @@ struct Lima::Buffer : Gpu::Vram
 		return reinterpret_cast<void *>(_attached_buffer->local_addr<void>());
 	}
 
-	Gpu::Buffer_id id() const
+	Gpu::Vram_id id() const
 	{
 		return _elem.id();
 	}
@@ -265,12 +282,75 @@ class Lima::Call
 		Genode::Env  &_env  { *vfs_gpu_env() };
 		Genode::Heap  _heap { _env.ram(), _env.rm() };
 
-		Genode::Allocator_avl _va_alloc { &_heap };
+
+		/*
+		 * The virtual-address allocator manages all GPU virtual
+		 * addresses, which are shared by all contexts.
+		 */
+		struct Va_allocator
+		{
+			Genode::Allocator_avl _alloc;
+
+			Va_allocator(Genode::Allocator &alloc)
+			:
+				_alloc { &alloc }
+			{
+				/*
+				 * The end of the range corresponds to LIMA_VA_RESERVE_START
+				 * in Linux minus the page we omit at the start.
+				 */
+				_alloc.add_range(0x1000, 0xfff00000ul - 0x1000);
+			}
+
+			Gpu::Virtual_address alloc(uint32_t size)
+			{
+				return Gpu::Virtual_address { _alloc.alloc_aligned(size, 12).convert<::uint64_t>(
+					[&] (void *ptr) { return (::uint64_t)ptr; },
+					[&] (Range_allocator::Alloc_error) -> ::uint64_t {
+						error("Could not allocate GPU virtual address for size: ", size);
+						return 0;
+					}) };
+			}
+
+			void free(Gpu::Virtual_address va)
+			{
+				_alloc.free((void*)va.value);
+			}
+		};
+
+		Va_allocator _va_alloc { _heap };
 
 
 		/*****************
 		 ** Gpu session **
 		 *****************/
 
+		/*
+		 * A Buffer wraps the actual Vram object and is used
+		 * locally in each Gpu_context.
+		 */
+		struct Buffer
+		{
+			Genode::Id_space<Buffer>::Element const _elem;
+
+			Gpu::Vram const &_vram;
+
+			Buffer(Genode::Id_space<Buffer> &space,
+			       Gpu::Vram const &vram)
+			:
+				_elem { *this, space,
+				        Genode::Id_space<Buffer>::Id { .value = vram.id().value } },
+				_vram { vram },
+				busy { false }
+			{ }
+
+			~Buffer() { }
+
+			bool busy;
+		};
+
+		using Buffer_space = Genode::Id_space<Buffer>;
+
 		struct Gpu_context
 		{
 			private:
@@ -281,12 +361,14 @@
 				/* Noncopyable */
 				Gpu_context(Gpu_context const &) = delete;
 				Gpu_context &operator=(Gpu_context const &) = delete;
 
+				Genode::Allocator &_alloc;
+
 				static int _open_gpu()
 				{
 					int const fd = ::open("/dev/gpu", 0);
 					if (fd < 0) {
-						Genode::error("Failed to open '/dev/gpu': ",
-						              "try configure '<gpu/>' in 'dev' directory of VFS'");
+						error("Failed to open '/dev/gpu': ",
+						      "try configure '<gpu/>' in 'dev' directory of VFS");
 						throw Gpu::Session::Invalid_state();
 					}
 					return fd;
@@ -296,7 +378,7 @@
 				{
 					struct ::stat buf;
 					if (::fstat(fd, &buf) < 0) {
-						Genode::error("Could not stat '/dev/gpu'");
+						error("Could not stat '/dev/gpu'");
 						::close(fd);
 						throw Gpu::Session::Invalid_state();
 					}
@@ -306,33 +388,184 @@
 				int           const _fd;
 				unsigned long const _id;
 
+				Gpu::Connection &_gpu { *vfs_gpu_connection(_id) };
+
 				Genode::Id_space<Gpu_context>::Element const _elem;
 
+				/*
+				 * The virtual-address allocator is solely used for
+				 * the construction of the context-local exec buffer
+				 * as the Gpu session requires a valid Vram object
+				 * for it.
+				 */
+				Va_allocator &_va_alloc;
+
+				/*
+				 * The vram space is used for actual memory allocations.
+				 * Every Gpu_context is able to allocate memory via the
+				 * Gpu session, although normally the main context
+				 * allocates the memory for buffer objects.
+				 */
+				Gpu::Vram_id_space _vram_space { };
+
+				template <typename FN>
+				void _try_apply(Gpu::Vram_id id, FN const &fn)
+				{
+					try {
+						_vram_space.apply<Vram>(id, fn);
+					} catch (Vram_id_space::Unknown_id) { }
+				}
+
+				/*
+				 * The buffer space is used to attach a given buffer
+				 * object - backed by a vram allocation, normally from the
+				 * main context - to the given context, i.e., it is mapped
+				 * when SUBMIT is executed.
+				 */
+				Buffer_space _buffer_space { };
+
+				template <typename FN>
+				void _try_apply(Buffer_space::Id id, FN const &fn)
+				{
+					try {
+						_buffer_space.apply<Buffer>(id, fn);
+					} catch (Buffer_space::Unknown_id) { }
+				}
+
+				/*
+				 * Each context contains its own exec buffer object that is
+				 * required by the Gpu session to pass on the driver-specific
+				 * command buffer.
+				 */
+				Gpu::Vram *_exec_buffer;
+
 			public:
 
+				bool defer_destruction { false };
+
+				static constexpr size_t _exec_buffer_size = { 256u << 10 };
+
-				Gpu_context(Genode::Id_space<Gpu_context> &space)
+				Gpu_context(Genode::Allocator &alloc,
+				            Genode::Id_space<Gpu_context> &space,
+				            Va_allocator &va_alloc)
 				:
+					_alloc { alloc },
 					_fd { _open_gpu() },
 					_id { _stat_gpu(_fd) },
-					_elem { *this, space }
+					_elem { *this, space },
+					_va_alloc { va_alloc },
+					_exec_buffer { new (alloc) Gpu::Vram(_gpu, _exec_buffer_size,
+					                                     _va_alloc.alloc(_exec_buffer_size),
+					                                     _vram_space) }
 				{ }
 
 				~Gpu_context()
 				{
+					while (_buffer_space.apply_any<Buffer>([&] (Buffer &b) {
+						Genode::destroy(_alloc, &b);
+					})) { ; }
+
+					/*
+					 * 'close' below will destroy the Gpu session belonging
+					 * to this context, so free the exec buffer beforehand.
+					 */
+					while (_vram_space.apply_any<Vram>([&] (Vram &v) {
+						Genode::destroy(_alloc, &v);
+					})) { ; }
+
 					::close(_fd);
 				}
 
+				Gpu::Connection& gpu()
+				{
+					return _gpu;
+				}
+
+				Gpu::Vram_capability export_vram(Gpu::Vram_id id)
+				{
+					Gpu::Vram_capability cap { };
+					_try_apply(id, [&] (Gpu::Vram const &b) {
+						cap = _gpu.export_vram(b.id());
+					});
+					return cap;
+				}
+
+				Buffer *import_vram(Gpu::Vram_capability cap, Gpu::Vram const &v)
+				{
+					Buffer *b = nullptr;
+
+					try { _gpu.import_vram(cap, v.id()); }
+					catch (... /* should only throw Invalid_state */) {
+						return nullptr; }
+
+					try { b = new (_alloc) Buffer(_buffer_space, v); }
+					catch (... /* either conflicting id or alloc failure */) {
+						return nullptr; }
+
+					return b;
+				}
+
+				void free_buffer(Buffer_space::Id id)
+				{
+					_try_apply(id, [&] (Buffer &b) {
+
+						/*
+						 * We have to invalidate any imported buffer as
+						 * otherwise the object stays ref-counted in the
+						 * driver and the VA occupied by the object is not
+						 * freed.
+						 *
+						 * For that we repurpose 'unmap_cpu', which is
+						 * otherwise not used.
+						 */
+						_gpu.unmap_cpu(Gpu::Vram_id { .value = id.value });
+						Genode::destroy(_alloc, &b);
+					});
+				}
+
+				bool buffer_space_contains(Buffer_space::Id id)
+				{
+					bool found = false;
+					_try_apply(id, [&] (Buffer const &) { found = true; });
+					return found;
+				}
+
+				Gpu::Vram_id_space &vram_space()
+				{
+					return _vram_space;
+				}
+
+				template <typename FN>
+				void with_vram_space(Gpu::Vram_id id, FN const &fn)
+				{
+					_try_apply(id, fn);
+				}
+
+				template <typename FN>
+				void access_exec_buffer(Genode::Env &env, FN const &fn)
+				{
+					/*
+					 * 'env' is solely needed for mapping the exec buffer
+					 * and is therefore used here locally.
+					 */
+					if (_exec_buffer->mmap(env)) {
+						char *ptr = (char*)_exec_buffer->mmap_addr();
+						if (ptr)
+							fn(ptr, _exec_buffer_size);
+					}
+				}
+
+				Gpu::Vram_id exec_buffer_id() const
+				{
+					return _exec_buffer->id();
+				}
+
 				unsigned long id() const
 				{
 					return _elem.id().value;
 				}
 
-				Gpu::Connection& gpu()
-				{
-					return *vfs_gpu_connection(_id);
-				}
-
 				int fd() const
 				{
 					return _fd;
@@ -351,33 +584,33 @@
 
 				Syncobj &operator=(Syncobj const &) = delete;
 
-				Gpu_context          *_gc    { nullptr };
+				Gpu_context_space::Id _gc_id { .value = ~0u };
+
 				Gpu::Sequence_number _seqno { 0 };
 
 				Genode::Id_space<Syncobj>::Element const _elem;
 
+				bool sync_fd           { false };
+				bool defer_destruction { false };
+
 				Syncobj(Genode::Id_space<Syncobj> &space)
 				:
 					_elem { *this, space }
 				{ }
 
-				unsigned long id() const
+				void adopt(Gpu_context_space::Id id, Gpu::Sequence_number seqno)
 				{
-					return _elem.id().value;
-				}
-
-				void adopt(Gpu_context &gc, Gpu::Sequence_number seqno)
-				{
-					_gc = &gc;
+					_gc_id = id;
 					_seqno = seqno;
 				}
 
-				Gpu_context &gpu_context()
+				Genode::Id_space<Syncobj>::Id id() const
 				{
-					if (!_gc) {
-						struct Invalid_gpu_context { };
-						throw Invalid_gpu_context();
-					}
+					return _elem.id();
+				}
 
-					return *_gc;
+				Gpu_context_space::Id ctx_id() const
+				{
+					return _gc_id;
 				}
 
 				Gpu::Sequence_number seqno() const
@@ -388,46 +621,35 @@
 		using Syncobj_space = Genode::Id_space<Syncobj>;
 		Syncobj_space _syncobj_space { };
 
-		Gpu_context _main_ctx { _gpu_context_space };
+		Gpu_context *_main_ctx {
+			new (_heap) Gpu_context(_heap, _gpu_context_space, _va_alloc) };
 
-		Gpu::Connection &_gpu { _main_ctx.gpu() };
 		Gpu::Info_lima const &_gpu_info {
-			*_gpu.attached_info<Gpu::Info_lima>() };
-
-		Gpu::Vram_id_space _buffer_space { };
+			*_main_ctx->gpu().attached_info<Gpu::Info_lima>() };
 
 		template <typename FN>
 		bool _apply_handle(uint32_t handle, FN const &fn)
 		{
-			Buffer_id const id { .value = handle };
+			Vram_id const id { .value = handle };
 
 			bool found = false;
-			try {
-				_buffer_space.apply<Buffer>(id, [&] (Buffer &b) {
-					fn(b);
-					found = true;
-				});
-			} catch (Genode::Id_space<Gpu::Vram>::Unknown_id) { }
+			_main_ctx->with_vram_space(id, [&] (Vram &b) {
+				fn(b);
+				found = true;
+			});
 
 			return found;
 		}
 
-		/*
-		 * Play it safe, glmark2 apparently submits araound 110 KiB at
-		 * some point.
-		 */
-		enum { EXEC_BUFFER_SIZE = 256u << 10 };
-		Constructible<Buffer> _exec_buffer { };
-
 		void _wait_for_mapping(uint32_t handle, unsigned op)
 		{
-			(void)_apply_handle(handle, [&] (Buffer &b) {
+			(void)_apply_handle(handle, [&] (Vram &b) {
 				do {
-					if (_main_ctx.gpu().set_tiling(b.id(), 0, op))
+					if (_main_ctx->gpu().set_tiling_gpu(b.id(), 0, op))
 						break;
 
 					char buf;
-					(void)::read(_main_ctx.fd(), &buf, sizeof(buf));
+					(void)::read(_main_ctx->fd(), &buf, sizeof(buf));
 				} while (true);
 			});
 		}
@@ -445,18 +667,39 @@
 
 			try {
 				auto wait = [&] (Syncobj &sync_obj) {
-					Gpu_context &gc = sync_obj.gpu_context();
-					do {
-						if (gc.gpu().complete(sync_obj.seqno()))
-							break;
+					auto poll_completion = [&] (Gpu_context &gc) {
+						do {
+							if (gc.gpu().complete(sync_obj.seqno()))
+								break;
 
-						char buf;
-						(void)::read(gc.fd(), &buf, sizeof(buf));
-					} while (true);
+							char buf;
+							(void)::read(gc.fd(), &buf, sizeof(buf));
+						} while (true);
+
+						if (gc.defer_destruction)
+							Genode::destroy(_heap, &gc);
+					};
+					_gpu_context_space.apply<Gpu_context>(sync_obj.ctx_id(),
+					                                      poll_completion);
+
+					if (sync_obj.defer_destruction)
+						Genode::destroy(_heap, &sync_obj);
 				};
 				_syncobj_space.apply<Syncobj>(syncobj_id, wait);
-			} catch (Genode::Id_space<Syncobj>::Unknown_id) {
-				Genode::warning("ignore unknown sync fd: ", fd);
+
+			} catch (Syncobj_space::Unknown_id) {
+				/*
+				 * We could end up here on the last wait when Mesa has
+				 * already destroyed the syncobj. For that reason the
+				 * destruction of the syncobj as well as of the referenced
+				 * ctx is deferred.
+				 */
+				return -1;
+			} catch (Gpu_context_space::Unknown_id) {
+				/*
+				 * We end up here in case the GP or PP job could not be
+				 * submitted, as Mesa does not check for successful
+				 * submission.
+				 */
 				return -1;
 			}
@@ -466,7 +709,7 @@
 		Dataspace_capability _lookup_cap_from_handle(uint32_t handle)
 		{
 			Dataspace_capability cap { };
-			auto lookup_cap = [&] (Buffer const &b) {
+			auto lookup_cap = [&] (Vram const &b) {
 				cap = b.cap;
 			};
 			(void)_apply_handle(handle, lookup_cap);
@@ -480,7 +723,7 @@
 		int _drm_lima_gem_info(drm_lima_gem_info &arg)
 		{
 			int result = -1;
-			(void)_apply_handle(arg.handle, [&] (Buffer &b) {
+			(void)_apply_handle(arg.handle, [&] (Vram &b) {
 				if (!b.mmap(_env))
 					return;
 				arg.offset = reinterpret_cast<::uint64_t>(b.mmap_addr());
@@ -496,41 +739,35 @@
 			return result;
 		}
 
-		::uint64_t _alloc_va(::uint64_t const size)
-		{
-			return _va_alloc.alloc_aligned(size, 12).convert<::uint64_t>(
-				[&] (void *ptr) { return (::uint64_t)ptr; },
-				[&] (Range_allocator::Alloc_error) -> ::uint64_t {
-					error("Could not allocate GPU virtual address for size: ", size);
-					return 0;
-				});
-		}
-
 		template <typename FUNC>
 		void _alloc_buffer(::uint64_t const size, FUNC const &fn)
 		{
 			size_t donate = size;
-			Buffer *buffer = nullptr;
+			Vram *buffer = nullptr;
 
-			::uint64_t va = _alloc_va(size);
-			if (!va) return;
+			Gpu::Virtual_address const va = _va_alloc.alloc(size);
+			if (!va.value)
+				return;
 
-			retry<Gpu::Session::Out_of_ram>(
-				[&] () {
-					retry<Gpu::Session::Out_of_caps>(
-						[&] () {
-							buffer =
-								new (&_heap) Buffer(_gpu, size,
-								                    Gpu::Virtual_address { va },
-								                    _buffer_space);
-						},
-						[&] () {
-							_gpu.upgrade_caps(2);
-						});
-				},
-				[&] () {
-					_gpu.upgrade_ram(donate);
-				});
+			try {
+				retry<Gpu::Session::Out_of_ram>(
+					[&] () {
+						retry<Gpu::Session::Out_of_caps>(
+							[&] () {
+								buffer =
+									new (&_heap) Vram(_main_ctx->gpu(), size, va,
+									                  _main_ctx->vram_space());
+							},
+							[&] () {
+								_main_ctx->gpu().upgrade_caps(2);
+							});
+					},
+					[&] () {
+						_main_ctx->gpu().upgrade_ram(donate);
+					});
+			} catch (Gpu::Vram::Allocation_failed) {
+				return;
+			}
 
 			if (buffer)
 				fn(*buffer);
@@ -541,7 +778,7 @@
 			::uint64_t const size = arg.size;
 
 			try {
-				_alloc_buffer(size, [&](Buffer const &b) {
+				_alloc_buffer(size, [&](Vram const &b) {
 					arg.handle = b.id().value;
 				});
 				return 0;
@@ -561,31 +798,71 @@
 
 			_gpu_context_space.apply<Gpu_context>(ctx_id, [&] (Gpu_context &gc) {
 
-				size_t const payload_size = Lima::get_payload_size(arg);
-				if (payload_size > EXEC_BUFFER_SIZE) {
-					Genode::error(__func__, ": exec buffer too small (",
-					              (unsigned)EXEC_BUFFER_SIZE, ") needed ",
-					              payload_size);
-					return;
-				}
-
 				/*
-				 * Copy each array flat to the exec buffer and adjust the
-				 * addresses in the submit object.
+				 * Check if we have access to all needed buffer objects and,
+				 * if not, import them from the main context that normally
+				 * performs all allocations.
 				 */
-				char *local_exec_buffer = (char*)_exec_buffer->mmap_addr();
-				Genode::memset(local_exec_buffer, 0, EXEC_BUFFER_SIZE);
-				Lima::serialize(&arg, local_exec_buffer);
+				bool all_buffers_mapped = true;
+				Lima::for_each_submit_bo(arg, [&] (drm_lima_gem_submit_bo const &bo) {
+					Buffer_space::Id const id = { .value = bo.handle };
+					if (!gc.buffer_space_contains(id)) {
+
+						bool imported = false;
+						(void)_apply_handle(bo.handle, [&] (Gpu::Vram const &v) {
+							Gpu::Vram_capability cap = _main_ctx->export_vram(v.id());
+							if (gc.import_vram(cap, v) == nullptr)
+								return;
+
+							imported = true;
+						});
+
+						if (!imported)
+							all_buffers_mapped = false;
+					}
+				});
+
+				if (!all_buffers_mapped)
+					return;
+
+				bool serialized = false;
+				gc.access_exec_buffer(_env, [&] (char *ptr, size_t size) {
+
+					size_t const payload_size = Lima::get_payload_size(arg);
+					if (payload_size > size) {
+						error("exec buffer for context ", ctx_id.value,
+						      " too small, got ", size, " but needed ",
+						      payload_size);
+						return;
+					}
+
+					/*
+					 * Copy each array flat to the exec buffer and adjust the
+					 * addresses in the submit object.
+					 */
+					Genode::memset(ptr, 0, size);
+					Lima::serialize(&arg, ptr);
+
+					serialized = true;
+				});
+
+				if (!serialized)
+					return;
 
 				try {
 					Gpu::Connection &gpu = gc.gpu();
 
 					Gpu::Sequence_number const seqno =
-						gpu.execute(_exec_buffer->id(), 0);
+						gpu.execute(gc.exec_buffer_id(), 0);
 
-					sync_obj.adopt(gc, seqno);
+					sync_obj.adopt(Gpu_context_space::Id { .value = gc.id() },
+					               seqno);
 
 					result = true;
-				} catch (Gpu::Session::Invalid_state) { }
+				} catch (Gpu::Session::Invalid_state) {
+					warning("could not execute: ", gc.exec_buffer_id().value);
+				}
 			});
 		});
@@ -617,7 +894,7 @@
 		{
 			try {
 				Gpu_context * ctx =
-					new (_heap) Gpu_context(_gpu_context_space);
+					new (_heap) Gpu_context(_heap, _gpu_context_space, _va_alloc);
 
 				arg.id = ctx->id();
 				return 0;
@@ -633,8 +910,21 @@
 			bool result = false;
 			auto free_ctx = [&] (Gpu_context &ctx) {
-				Genode::destroy(_heap, &ctx);
 				result = true;
+
+				/*
+				 * If the ctx is currently referenced by a syncobj, its
+				 * destruction gets deferred until the final wait-for-syncobj
+				 * call.
+				 */
+				_syncobj_space.for_each<Syncobj>([&] (Syncobj &obj) {
+					if (obj.ctx_id().value == ctx.id())
+						ctx.defer_destruction = true;
+				});
+
+				if (!ctx.defer_destruction)
+					Genode::destroy(_heap, &ctx);
 			};
 			_gpu_context_space.apply<Gpu_context>(id, free_ctx);
@@ -675,9 +965,15 @@
 		int _drm_gem_close(drm_gem_close const &gem_close)
 		{
+			auto free_buffer = [&] (Gpu_context &ctx) {
+				Buffer_space::Id const id { .value = gem_close.handle };
+				ctx.free_buffer(id);
+			};
+			_gpu_context_space.for_each<Gpu_context>(free_buffer);
+
 			return _apply_handle(gem_close.handle,
-				[&] (Lima::Buffer &b) {
-					_va_alloc.free((void *)b.va.value);
+				[&] (Gpu::Vram &b) {
+					_va_alloc.free(b.va);
 					destroy(_heap, &b);
 				}) ? 0 : -1;
 		}
 
@@ -710,7 +1006,7 @@
 		{
 			try {
 				Syncobj *obj = new (_heap) Syncobj(_syncobj_space);
-				arg.handle = (uint32_t)obj->id();
+				arg.handle = (uint32_t)obj->id().value;
 				return 0;
 			} catch (... /* XXX which exceptions can occur? */) { }
 			return -1;
@@ -722,16 +1018,34 @@
 			bool result = false;
 			_syncobj_space.apply<Syncobj>(id, [&] (Syncobj &obj) {
-				Genode::destroy(_heap, &obj);
 				result = true;
+
+				/*
+				 * In case the syncobj was once exported, destruction
+				 * gets deferred until the final wait-for-syncobj call.
+				 */
+				if (obj.sync_fd) {
+					obj.defer_destruction = true;
+					return;
+				}
+
+				Genode::destroy(_heap, &obj);
 			});
 
 			return result ? 0 : -1;
 		}
 
 		int _drm_syncobj_handle_to_fd(drm_syncobj_handle &arg)
 		{
-			arg.fd = arg.handle + SYNC_FD;
-			return 0;
+			arg.fd = -1;
+
+			Syncobj_space::Id id { .value = arg.handle };
+			_syncobj_space.apply<Syncobj>(id, [&] (Syncobj &obj) {
+				obj.sync_fd = true;
+
+				arg.fd = arg.handle + SYNC_FD;
+			});
+
+			return arg.fd != -1 ? 0 : -1;
 		}
 
 		int _generic_ioctl(unsigned cmd, void *arg)
@@ -766,32 +1080,15 @@
 		/* arbitrary start value out of the libc's FD alloc range */
 		static constexpr int const SYNC_FD { 10000 };
 
-		Call()
-		{
-			_va_alloc.add_range(0x1000, 0xfff00000ul - 0x1000);
-
-			::uint64_t va = _alloc_va(EXEC_BUFFER_SIZE);
-			if (!va) throw Gpu::Session::Invalid_state();
-
-			try {
-				_exec_buffer.construct(_gpu,
-				                       (size_t)EXEC_BUFFER_SIZE,
-				                       Gpu::Virtual_address { va },
-				                       _buffer_space);
-			} catch (...) {
-				throw Gpu::Session::Invalid_state();
-			}
-
-			if (!_exec_buffer->mmap(_env))
-				throw Gpu::Session::Invalid_state();
-		}
+		Call() { }
 
 		~Call()
 		{
-			while (_gpu_context_space.apply_any<Gpu_context>([&] (Gpu_context &ctx) {
-				Genode::destroy(_heap, &ctx); })) { ; }
-
 			while (_syncobj_space.apply_any<Syncobj>([&] (Syncobj &obj) {
 				Genode::destroy(_heap, &obj); })) { ; }
+
+			while (_gpu_context_space.apply_any<Gpu_context>([&] (Gpu_context &ctx) {
+				Genode::destroy(_heap, &ctx); })) { ; }
 		}
 
 		int ioctl(unsigned long request, void *arg)
@@ -840,7 +1137,7 @@ void lima_drm_init()
 	struct ::stat buf;
 	if (stat("/dev/gpu", &buf) < 0) {
 		Genode::error("'/dev/gpu' not accessible: ",
-		              "try configure '<gpu/>' in 'dev' directory of VFS'");
+		              "try configure '<gpu/>' in 'dev' directory of VFS");
 		return;
 	}
@@ -879,7 +1176,6 @@ int lima_drm_ioctl(unsigned long request, void *arg)
 		Genode::log("returned ", ret);
 
 	pthread_mutex_unlock(&ioctl_mutex);
-
 	return ret;
 }
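
Note on the resulting submit flow (illustration only, not part of the patch
to be applied): the helper below condenses the import-before-submit logic of
the '_drm_lima_gem_submit' hunk above into one function. The name
'_ensure_buffers_mapped' is hypothetical; 'gc' stands for the submitting
Gpu_context, '_main_ctx' for the allocating main context, and all member
calls are the ones introduced by this patch.

	/* hypothetical private helper of Lima::Call, distilled from the hunk above */
	bool _ensure_buffers_mapped(Gpu_context &gc, drm_lima_gem_submit const &arg)
	{
		bool all_mapped = true;

		Lima::for_each_submit_bo(arg, [&] (drm_lima_gem_submit_bo const &bo) {

			/* nothing to do if the BO was already imported into this context */
			Buffer_space::Id const id { .value = bo.handle };
			if (gc.buffer_space_contains(id))
				return;

			/* the main context exports the BO, the submitting context imports it */
			bool imported = false;
			(void)_apply_handle(bo.handle, [&] (Gpu::Vram const &v) {
				Gpu::Vram_capability cap = _main_ctx->export_vram(v.id());
				imported = (gc.import_vram(cap, v) != nullptr);
			});

			if (!imported)
				all_mapped = false;
		});

		return all_mapped;
	}

Only when this check succeeds is the submit object serialized into the
context-local exec buffer and handed to 'execute' on the context's own Gpu
session.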