From a1f091f8ef2b680a5184db065527612247cb4cae Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Christian=20K=C3=B6nig?= Date: Tue, 6 Oct 2020 17:26:42 +0200 Subject: [PATCH] drm/ttm: switch to per device LRU lock MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Instead of having a global lock for potentially less contention. Signed-off-by: Christian König Tested-by: Nirmoy Das Reviewed-by: Huang Rui Reviewed-by: Matthew Auld Link: https://patchwork.freedesktop.org/patch/424010/ --- drivers/gpu/drm/amd/amdgpu/amdgpu_vm.c | 8 ++--- drivers/gpu/drm/qxl/qxl_release.c | 5 +-- drivers/gpu/drm/ttm/ttm_bo.c | 49 ++++++++++++-------------- drivers/gpu/drm/ttm/ttm_device.c | 12 +++---- drivers/gpu/drm/ttm/ttm_execbuf_util.c | 8 ++--- drivers/gpu/drm/ttm/ttm_resource.c | 9 +++-- include/drm/ttm/ttm_bo_driver.h | 4 +-- include/drm/ttm/ttm_device.h | 4 +-- 8 files changed, 43 insertions(+), 56 deletions(-) diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_vm.c b/drivers/gpu/drm/amd/amdgpu/amdgpu_vm.c index 9d19078246c8..ae18c0e32347 100644 --- a/drivers/gpu/drm/amd/amdgpu/amdgpu_vm.c +++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_vm.c @@ -638,15 +638,15 @@ void amdgpu_vm_move_to_lru_tail(struct amdgpu_device *adev, struct amdgpu_vm_bo_base *bo_base; if (vm->bulk_moveable) { - spin_lock(&ttm_glob.lru_lock); + spin_lock(&adev->mman.bdev.lru_lock); ttm_bo_bulk_move_lru_tail(&vm->lru_bulk_move); - spin_unlock(&ttm_glob.lru_lock); + spin_unlock(&adev->mman.bdev.lru_lock); return; } memset(&vm->lru_bulk_move, 0, sizeof(vm->lru_bulk_move)); - spin_lock(&ttm_glob.lru_lock); + spin_lock(&adev->mman.bdev.lru_lock); list_for_each_entry(bo_base, &vm->idle, vm_status) { struct amdgpu_bo *bo = bo_base->bo; @@ -660,7 +660,7 @@ void amdgpu_vm_move_to_lru_tail(struct amdgpu_device *adev, &bo->shadow->tbo.mem, &vm->lru_bulk_move); } - spin_unlock(&ttm_glob.lru_lock); + spin_unlock(&adev->mman.bdev.lru_lock); vm->bulk_moveable = true; } diff --git a/drivers/gpu/drm/qxl/qxl_release.c b/drivers/gpu/drm/qxl/qxl_release.c index f5845c96d414..b19f2f00b215 100644 --- a/drivers/gpu/drm/qxl/qxl_release.c +++ b/drivers/gpu/drm/qxl/qxl_release.c @@ -426,16 +426,13 @@ void qxl_release_fence_buffer_objects(struct qxl_release *release) release->id | 0xf0000000, release->base.seqno); trace_dma_fence_emit(&release->base); - spin_lock(&ttm_glob.lru_lock); - list_for_each_entry(entry, &release->bos, head) { bo = entry->bo; dma_resv_add_shared_fence(bo->base.resv, &release->base); - ttm_bo_move_to_lru_tail(bo, &bo->mem, NULL); + ttm_bo_move_to_lru_tail_unlocked(bo); dma_resv_unlock(bo->base.resv); } - spin_unlock(&ttm_glob.lru_lock); ww_acquire_fini(&release->ticket); } diff --git a/drivers/gpu/drm/ttm/ttm_bo.c b/drivers/gpu/drm/ttm/ttm_bo.c index 3673157527ff..2d2ac532987e 100644 --- a/drivers/gpu/drm/ttm/ttm_bo.c +++ b/drivers/gpu/drm/ttm/ttm_bo.c @@ -243,9 +243,9 @@ static int ttm_bo_individualize_resv(struct ttm_buffer_object *bo) * reference it any more. The only tricky case is the trylock on * the resv object while holding the lru_lock. */ - spin_lock(&ttm_glob.lru_lock); + spin_lock(&bo->bdev->lru_lock); bo->base.resv = &bo->base._resv; - spin_unlock(&ttm_glob.lru_lock); + spin_unlock(&bo->bdev->lru_lock); } return r; @@ -304,7 +304,7 @@ static int ttm_bo_cleanup_refs(struct ttm_buffer_object *bo, if (unlock_resv) dma_resv_unlock(bo->base.resv); - spin_unlock(&ttm_glob.lru_lock); + spin_unlock(&bo->bdev->lru_lock); lret = dma_resv_wait_timeout_rcu(resv, true, interruptible, 30 * HZ); @@ -314,7 +314,7 @@ static int ttm_bo_cleanup_refs(struct ttm_buffer_object *bo, else if (lret == 0) return -EBUSY; - spin_lock(&ttm_glob.lru_lock); + spin_lock(&bo->bdev->lru_lock); if (unlock_resv && !dma_resv_trylock(bo->base.resv)) { /* * We raced, and lost, someone else holds the reservation now, @@ -324,7 +324,7 @@ static int ttm_bo_cleanup_refs(struct ttm_buffer_object *bo, * delayed destruction would succeed, so just return success * here. */ - spin_unlock(&ttm_glob.lru_lock); + spin_unlock(&bo->bdev->lru_lock); return 0; } ret = 0; @@ -333,13 +333,13 @@ static int ttm_bo_cleanup_refs(struct ttm_buffer_object *bo, if (ret || unlikely(list_empty(&bo->ddestroy))) { if (unlock_resv) dma_resv_unlock(bo->base.resv); - spin_unlock(&ttm_glob.lru_lock); + spin_unlock(&bo->bdev->lru_lock); return ret; } ttm_bo_del_from_lru(bo); list_del_init(&bo->ddestroy); - spin_unlock(&ttm_glob.lru_lock); + spin_unlock(&bo->bdev->lru_lock); ttm_bo_cleanup_memtype_use(bo); if (unlock_resv) @@ -356,13 +356,12 @@ static int ttm_bo_cleanup_refs(struct ttm_buffer_object *bo, */ bool ttm_bo_delayed_delete(struct ttm_device *bdev, bool remove_all) { - struct ttm_global *glob = &ttm_glob; struct list_head removed; bool empty; INIT_LIST_HEAD(&removed); - spin_lock(&glob->lru_lock); + spin_lock(&bdev->lru_lock); while (!list_empty(&bdev->ddestroy)) { struct ttm_buffer_object *bo; @@ -373,24 +372,24 @@ bool ttm_bo_delayed_delete(struct ttm_device *bdev, bool remove_all) continue; if (remove_all || bo->base.resv != &bo->base._resv) { - spin_unlock(&glob->lru_lock); + spin_unlock(&bdev->lru_lock); dma_resv_lock(bo->base.resv, NULL); - spin_lock(&glob->lru_lock); + spin_lock(&bdev->lru_lock); ttm_bo_cleanup_refs(bo, false, !remove_all, true); } else if (dma_resv_trylock(bo->base.resv)) { ttm_bo_cleanup_refs(bo, false, !remove_all, true); } else { - spin_unlock(&glob->lru_lock); + spin_unlock(&bdev->lru_lock); } ttm_bo_put(bo); - spin_lock(&glob->lru_lock); + spin_lock(&bdev->lru_lock); } list_splice_tail(&removed, &bdev->ddestroy); empty = list_empty(&bdev->ddestroy); - spin_unlock(&glob->lru_lock); + spin_unlock(&bdev->lru_lock); return empty; } @@ -425,7 +424,7 @@ static void ttm_bo_release(struct kref *kref) ttm_bo_flush_all_fences(bo); bo->deleted = true; - spin_lock(&ttm_glob.lru_lock); + spin_lock(&bo->bdev->lru_lock); /* * Make pinned bos immediately available to @@ -442,17 +441,17 @@ static void ttm_bo_release(struct kref *kref) kref_init(&bo->kref); list_add_tail(&bo->ddestroy, &bdev->ddestroy); - spin_unlock(&ttm_glob.lru_lock); + spin_unlock(&bo->bdev->lru_lock); schedule_delayed_work(&bdev->wq, ((HZ / 100) < 1) ? 1 : HZ / 100); return; } - spin_lock(&ttm_glob.lru_lock); + spin_lock(&bo->bdev->lru_lock); ttm_bo_del_from_lru(bo); list_del(&bo->ddestroy); - spin_unlock(&ttm_glob.lru_lock); + spin_unlock(&bo->bdev->lru_lock); ttm_bo_cleanup_memtype_use(bo); dma_resv_unlock(bo->base.resv); @@ -626,7 +625,7 @@ int ttm_mem_evict_first(struct ttm_device *bdev, unsigned i; int ret; - spin_lock(&ttm_glob.lru_lock); + spin_lock(&bo->bdev->lru_lock); for (i = 0; i < TTM_MAX_BO_PRIORITY; ++i) { list_for_each_entry(bo, &man->lru[i], lru) { bool busy; @@ -663,7 +662,7 @@ int ttm_mem_evict_first(struct ttm_device *bdev, if (!bo) { if (busy_bo && !ttm_bo_get_unless_zero(busy_bo)) busy_bo = NULL; - spin_unlock(&ttm_glob.lru_lock); + spin_unlock(&bo->bdev->lru_lock); ret = ttm_mem_evict_wait_busy(busy_bo, ctx, ticket); if (busy_bo) ttm_bo_put(busy_bo); @@ -677,7 +676,7 @@ int ttm_mem_evict_first(struct ttm_device *bdev, return ret; } - spin_unlock(&ttm_glob.lru_lock); + spin_unlock(&bo->bdev->lru_lock); ret = ttm_bo_evict(bo, ctx); if (locked) @@ -777,10 +776,9 @@ static int ttm_bo_mem_placement(struct ttm_buffer_object *bo, mem->mem_type = place->mem_type; mem->placement = place->flags; - spin_lock(&ttm_glob.lru_lock); + spin_lock(&bo->bdev->lru_lock); ttm_bo_move_to_lru_tail(bo, mem, NULL); - spin_unlock(&ttm_glob.lru_lock); - + spin_unlock(&bo->bdev->lru_lock); return 0; } @@ -1167,7 +1165,6 @@ EXPORT_SYMBOL(ttm_bo_wait); int ttm_bo_swapout(struct ttm_buffer_object *bo, struct ttm_operation_ctx *ctx, gfp_t gfp_flags) { - struct ttm_global *glob = &ttm_glob; bool locked; int ret; @@ -1188,7 +1185,7 @@ int ttm_bo_swapout(struct ttm_buffer_object *bo, struct ttm_operation_ctx *ctx, ttm_bo_del_from_lru(bo); /* TODO: Cleanup the locking */ - spin_unlock(&glob->lru_lock); + spin_unlock(&bo->bdev->lru_lock); /* * Move to system cached diff --git a/drivers/gpu/drm/ttm/ttm_device.c b/drivers/gpu/drm/ttm/ttm_device.c index b9e3bfb60cd2..8e821cddf81c 100644 --- a/drivers/gpu/drm/ttm/ttm_device.c +++ b/drivers/gpu/drm/ttm/ttm_device.c @@ -81,7 +81,6 @@ static int ttm_global_init(void) ttm_pool_mgr_init(num_pages * 50 / 100); ttm_tt_mgr_init(); - spin_lock_init(&glob->lru_lock); glob->dummy_read_page = alloc_page(__GFP_ZERO | GFP_DMA32); if (unlikely(glob->dummy_read_page == NULL)) { @@ -125,13 +124,12 @@ EXPORT_SYMBOL(ttm_global_swapout); int ttm_device_swapout(struct ttm_device *bdev, struct ttm_operation_ctx *ctx, gfp_t gfp_flags) { - struct ttm_global *glob = &ttm_glob; struct ttm_resource_manager *man; struct ttm_buffer_object *bo; unsigned i, j; int ret; - spin_lock(&glob->lru_lock); + spin_lock(&bdev->lru_lock); for (i = TTM_PL_SYSTEM; i < TTM_NUM_MEM_TYPES; ++i) { man = ttm_manager_type(bdev, i); if (!man || !man->use_tt) @@ -156,7 +154,7 @@ int ttm_device_swapout(struct ttm_device *bdev, struct ttm_operation_ctx *ctx, } } } - spin_unlock(&glob->lru_lock); + spin_unlock(&bdev->lru_lock); return 0; } EXPORT_SYMBOL(ttm_device_swapout); @@ -223,6 +221,7 @@ int ttm_device_init(struct ttm_device *bdev, struct ttm_device_funcs *funcs, bdev->vma_manager = vma_manager; INIT_DELAYED_WORK(&bdev->wq, ttm_device_delayed_workqueue); + spin_lock_init(&bdev->lru_lock); INIT_LIST_HEAD(&bdev->ddestroy); bdev->dev_mapping = mapping; mutex_lock(&ttm_global_mutex); @@ -235,7 +234,6 @@ EXPORT_SYMBOL(ttm_device_init); void ttm_device_fini(struct ttm_device *bdev) { - struct ttm_global *glob = &ttm_glob; struct ttm_resource_manager *man; unsigned i; @@ -252,11 +250,11 @@ void ttm_device_fini(struct ttm_device *bdev) if (ttm_bo_delayed_delete(bdev, true)) pr_debug("Delayed destroy list was clean\n"); - spin_lock(&glob->lru_lock); + spin_lock(&bdev->lru_lock); for (i = 0; i < TTM_MAX_BO_PRIORITY; ++i) if (list_empty(&man->lru[0])) pr_debug("Swap list %d was clean\n", i); - spin_unlock(&glob->lru_lock); + spin_unlock(&bdev->lru_lock); ttm_pool_fini(&bdev->pool); ttm_global_release(); diff --git a/drivers/gpu/drm/ttm/ttm_execbuf_util.c b/drivers/gpu/drm/ttm/ttm_execbuf_util.c index 690ab97d52b7..071c48d672c6 100644 --- a/drivers/gpu/drm/ttm/ttm_execbuf_util.c +++ b/drivers/gpu/drm/ttm/ttm_execbuf_util.c @@ -51,14 +51,12 @@ void ttm_eu_backoff_reservation(struct ww_acquire_ctx *ticket, if (list_empty(list)) return; - spin_lock(&ttm_glob.lru_lock); list_for_each_entry(entry, list, head) { struct ttm_buffer_object *bo = entry->bo; - ttm_bo_move_to_lru_tail(bo, &bo->mem, NULL); + ttm_bo_move_to_lru_tail_unlocked(bo); dma_resv_unlock(bo->base.resv); } - spin_unlock(&ttm_glob.lru_lock); if (ticket) ww_acquire_fini(ticket); @@ -154,7 +152,6 @@ void ttm_eu_fence_buffer_objects(struct ww_acquire_ctx *ticket, if (list_empty(list)) return; - spin_lock(&ttm_glob.lru_lock); list_for_each_entry(entry, list, head) { struct ttm_buffer_object *bo = entry->bo; @@ -162,10 +159,9 @@ void ttm_eu_fence_buffer_objects(struct ww_acquire_ctx *ticket, dma_resv_add_shared_fence(bo->base.resv, fence); else dma_resv_add_excl_fence(bo->base.resv, fence); - ttm_bo_move_to_lru_tail(bo, &bo->mem, NULL); + ttm_bo_move_to_lru_tail_unlocked(bo); dma_resv_unlock(bo->base.resv); } - spin_unlock(&ttm_glob.lru_lock); if (ticket) ww_acquire_fini(ticket); } diff --git a/drivers/gpu/drm/ttm/ttm_resource.c b/drivers/gpu/drm/ttm/ttm_resource.c index ed1672a9f332..04f2eef653ab 100644 --- a/drivers/gpu/drm/ttm/ttm_resource.c +++ b/drivers/gpu/drm/ttm/ttm_resource.c @@ -91,7 +91,6 @@ int ttm_resource_manager_evict_all(struct ttm_device *bdev, .no_wait_gpu = false, .force_alloc = true }; - struct ttm_global *glob = &ttm_glob; struct dma_fence *fence; int ret; unsigned i; @@ -100,18 +99,18 @@ int ttm_resource_manager_evict_all(struct ttm_device *bdev, * Can't use standard list traversal since we're unlocking. */ - spin_lock(&glob->lru_lock); + spin_lock(&bdev->lru_lock); for (i = 0; i < TTM_MAX_BO_PRIORITY; ++i) { while (!list_empty(&man->lru[i])) { - spin_unlock(&glob->lru_lock); + spin_unlock(&bdev->lru_lock); ret = ttm_mem_evict_first(bdev, man, NULL, &ctx, NULL); if (ret) return ret; - spin_lock(&glob->lru_lock); + spin_lock(&bdev->lru_lock); } } - spin_unlock(&glob->lru_lock); + spin_unlock(&bdev->lru_lock); spin_lock(&man->move_lock); fence = dma_fence_get(man->move); diff --git a/include/drm/ttm/ttm_bo_driver.h b/include/drm/ttm/ttm_bo_driver.h index d007feef7676..dbccac957f8f 100644 --- a/include/drm/ttm/ttm_bo_driver.h +++ b/include/drm/ttm/ttm_bo_driver.h @@ -180,9 +180,9 @@ static inline int ttm_bo_reserve_slowpath(struct ttm_buffer_object *bo, static inline void ttm_bo_move_to_lru_tail_unlocked(struct ttm_buffer_object *bo) { - spin_lock(&ttm_glob.lru_lock); + spin_lock(&bo->bdev->lru_lock); ttm_bo_move_to_lru_tail(bo, &bo->mem, NULL); - spin_unlock(&ttm_glob.lru_lock); + spin_unlock(&bo->bdev->lru_lock); } static inline void ttm_bo_assign_mem(struct ttm_buffer_object *bo, diff --git a/include/drm/ttm/ttm_device.h b/include/drm/ttm/ttm_device.h index 93f3b59755ac..7c8f87bd52d3 100644 --- a/include/drm/ttm/ttm_device.h +++ b/include/drm/ttm/ttm_device.h @@ -56,7 +56,6 @@ extern struct ttm_global { */ struct page *dummy_read_page; - spinlock_t lru_lock; /** * Protected by ttm_global_mutex. @@ -277,8 +276,9 @@ struct ttm_device { struct ttm_pool pool; /* - * Protected by the global:lru lock. + * Protection for the per manager LRU and ddestroy lists. */ + spinlock_t lru_lock; struct list_head ddestroy; /*