From c7b98a63328a749d44b7580ee9baafc5d417e48f Mon Sep 17 00:00:00 2001
From: Dave Stevenson <dave.stevenson@raspberrypi.com>
Date: Thu, 31 Aug 2023 11:45:38 +0100
Subject: [PATCH 0635/1085] drm/vc4: Assign LBM memory during atomic_flush.

Avoid double-buffering LBM allocations by making a single
allocation per CRTC at atomic_flush time.

Signed-off-by: Dave Stevenson <dave.stevenson@raspberrypi.com>
---
 drivers/gpu/drm/vc4/tests/vc4_test_lbm_size.c |  2 +-
 drivers/gpu/drm/vc4/vc4_drv.h                 |  8 ++--
 drivers/gpu/drm/vc4/vc4_hvs.c                 | 47 ++++++++++++++++++-
 drivers/gpu/drm/vc4/vc4_plane.c               | 38 +++------------
 4 files changed, 58 insertions(+), 37 deletions(-)

--- a/drivers/gpu/drm/vc4/tests/vc4_test_lbm_size.c
+++ b/drivers/gpu/drm/vc4/tests/vc4_test_lbm_size.c
@@ -248,7 +248,7 @@ static void drm_vc4_test_vc4_lbm_size(st
 	ret = drm_atomic_check_only(state);
 	KUNIT_ASSERT_EQ(test, ret, 0);
 
-	KUNIT_EXPECT_EQ(test, vc4_plane_state->lbm.size, params->expected_lbm_size);
+	KUNIT_EXPECT_EQ(test, vc4_plane_state->lbm_size, params->expected_lbm_size);
 
 	for (i = 0; i < 2; i++) {
 		KUNIT_EXPECT_EQ(test,
--- a/drivers/gpu/drm/vc4/vc4_drv.h
+++ b/drivers/gpu/drm/vc4/vc4_drv.h
@@ -417,6 +417,8 @@ struct vc4_plane_state {
 	u32 dlist_size; /* Number of dwords allocated for the display list */
 	u32 dlist_count; /* Number of used dwords in the display list. */
 
+	u32 lbm_size; /* LBM requirements for this plane */
+
 	/* Offset in the dlist to various words, for pageflip or
 	 * cursor updates.
 	 */
@@ -442,9 +444,6 @@ struct vc4_plane_state {
 	bool is_unity;
 	bool is_yuv;
 
-	/* Our allocation in LBM for temporary storage during scaling. */
-	struct drm_mm_node lbm;
-
 	/* Our allocation in UPM for prefetching. */
 	struct drm_mm_node upm[DRM_FORMAT_MAX_PLANES];
 
@@ -635,6 +634,9 @@ struct vc4_crtc {
 	 * access to that value.
 	 */
 	unsigned int current_hvs_channel;
+
+	/* @lbm: Our allocation in LBM for temporary storage during scaling. */
+	struct drm_mm_node lbm;
 };
 
 #define to_vc4_crtc(_crtc)					\
--- a/drivers/gpu/drm/vc4/vc4_hvs.c
+++ b/drivers/gpu/drm/vc4/vc4_hvs.c
@@ -1103,6 +1103,7 @@ int vc4_hvs_atomic_check(struct drm_crtc
 	struct drm_plane *plane;
 	const struct drm_plane_state *plane_state;
 	u32 dlist_count = 0;
+	u32 lbm_count = 0;
 
 	/* The pixelvalve can only feed one encoder (and encoders are
 	 * 1:1 with connectors.)
@@ -1111,6 +1112,8 @@ int vc4_hvs_atomic_check(struct drm_crtc
 		return -EINVAL;
 
 	drm_atomic_crtc_state_for_each_plane_state(plane, plane_state, crtc_state) {
+		const struct vc4_plane_state *vc4_plane_state =
+						to_vc4_plane_state(plane_state);
 		u32 plane_dlist_count = vc4_plane_dlist_size(plane_state);
 
 		drm_dbg_driver(dev, "[CRTC:%d:%s] Found [PLANE:%d:%s] with DLIST size: %u\n",
@@ -1119,6 +1122,7 @@ int vc4_hvs_atomic_check(struct drm_crtc
 			       plane_dlist_count);
 
 		dlist_count += plane_dlist_count;
+		lbm_count += vc4_plane_state->lbm_size;
 	}
 
 	dlist_count++; /* Account for SCALER_CTL0_END. */
@@ -1132,6 +1136,8 @@ int vc4_hvs_atomic_check(struct drm_crtc
 
 	vc4_state->mm = alloc;
 
+	/* FIXME: Check lbm_count against the total LBM size here */
+
 	return vc4_hvs_gamma_check(crtc, state);
 }
 
@@ -1246,7 +1252,10 @@ void vc4_hvs_atomic_flush(struct drm_crt
 	bool debug_dump_regs = false;
 	bool enable_bg_fill = false;
 	u32 __iomem *dlist_start, *dlist_next;
+	unsigned long irqflags;
 	unsigned int zpos = 0;
+	u32 lbm_offset = 0;
+	u32 lbm_size = 0;
 	bool found = false;
 	int idx;
 
@@ -1265,6 +1274,35 @@ void vc4_hvs_atomic_flush(struct drm_crt
 		vc4_hvs_dump_state(hvs);
 	}
 
+	drm_atomic_crtc_for_each_plane(plane, crtc) {
+		vc4_plane_state = to_vc4_plane_state(plane->state);
+		lbm_size += vc4_plane_state->lbm_size;
+	}
+
+	if (drm_mm_node_allocated(&vc4_crtc->lbm)) {
+		spin_lock_irqsave(&vc4_crtc->irq_lock, irqflags);
+		drm_mm_remove_node(&vc4_crtc->lbm);
+		spin_unlock_irqrestore(&vc4_crtc->irq_lock, irqflags);
+	}
+
+	if (lbm_size) {
+		int ret;
+
+		spin_lock_irqsave(&vc4_crtc->irq_lock, irqflags);
+		ret = drm_mm_insert_node_generic(&vc4->hvs->lbm_mm,
+						 &vc4_crtc->lbm,
+						 lbm_size, 1,
+						 0, 0);
+		spin_unlock_irqrestore(&vc4_crtc->irq_lock, irqflags);
+
+		if (ret) {
+			pr_err("Failed to allocate LBM ret %d\n", ret);
+			return;
+		}
+	}
+
+	lbm_offset = vc4_crtc->lbm.start;
+
 	dlist_start = vc4->hvs->dlist + vc4_state->mm->mm_node.start;
 	dlist_next = dlist_start;
 
@@ -1276,6 +1314,8 @@ void vc4_hvs_atomic_flush(struct drm_crt
 			if (plane->state->normalized_zpos != zpos)
 				continue;
 
+			vc4_plane_state = to_vc4_plane_state(plane->state);
+
 			/* Is this the first active plane? */
 			if (dlist_next == dlist_start) {
 				/* We need to enable background fill when a plane
@@ -1286,10 +1326,15 @@ void vc4_hvs_atomic_flush(struct drm_crt
 				 * already needs it or all planes on top blend from
 				 * the first or a lower plane.
 				 */
-				vc4_plane_state = to_vc4_plane_state(plane->state);
 				enable_bg_fill = vc4_plane_state->needs_bg_fill;
 			}
 
+			if (vc4_plane_state->lbm_size) {
+				vc4_plane_state->dlist[vc4_plane_state->lbm_offset] =
+								lbm_offset;
+				lbm_offset += vc4_plane_state->lbm_size;
+			}
+
 			dlist_next += vc4_plane_write_dlist(plane, dlist_next);
 
 			found = true;
--- a/drivers/gpu/drm/vc4/vc4_plane.c
+++ b/drivers/gpu/drm/vc4/vc4_plane.c
@@ -288,7 +288,6 @@ struct drm_plane_state *vc4_plane_duplic
 	if (!vc4_state)
 		return NULL;
 
-	memset(&vc4_state->lbm, 0, sizeof(vc4_state->lbm));
 	memset(&vc4_state->upm, 0, sizeof(vc4_state->upm));
 
 	for (i = 0; i < DRM_FORMAT_MAX_PLANES; i++)
@@ -320,14 +319,6 @@ void vc4_plane_destroy_state(struct drm_
 	struct vc4_plane_state *vc4_state = to_vc4_plane_state(state);
 	unsigned int i;
 
-	if (drm_mm_node_allocated(&vc4_state->lbm)) {
-		unsigned long irqflags;
-
-		spin_lock_irqsave(&hvs->mm_lock, irqflags);
-		drm_mm_remove_node(&vc4_state->lbm);
-		spin_unlock_irqrestore(&hvs->mm_lock, irqflags);
-	}
-
 	for (i = 0; i < DRM_FORMAT_MAX_PLANES; i++) {
 		unsigned long irqflags;
 
@@ -903,12 +894,13 @@ static int vc4_plane_allocate_lbm(struct
 	struct vc4_dev *vc4 = to_vc4_dev(drm);
 	struct drm_plane *plane = state->plane;
 	struct vc4_plane_state *vc4_state = to_vc4_plane_state(state);
-	unsigned long irqflags;
 	u32 lbm_size;
 
 	lbm_size = vc4_lbm_size(state);
-	if (!lbm_size)
+	if (!lbm_size) {
+		vc4_state->lbm_size = 0;
 		return 0;
+	}
 
 	/*
 	 * NOTE: BCM2712 doesn't need to be aligned, since the size
@@ -925,28 +917,10 @@ static int vc4_plane_allocate_lbm(struct
 	if (WARN_ON(!vc4_state->lbm_offset))
 		return -EINVAL;
 
-	/* Allocate the LBM memory that the HVS will use for temporary
-	 * storage due to our scaling/format conversion.
+	/* FIXME: Add a loop here that ensures that the total LBM assigned in
+	 * this state fits within the total LBM size.
 	 */
-	if (!drm_mm_node_allocated(&vc4_state->lbm)) {
-		int ret;
-
-		spin_lock_irqsave(&vc4->hvs->mm_lock, irqflags);
-		ret = drm_mm_insert_node_generic(&vc4->hvs->lbm_mm,
-						 &vc4_state->lbm,
-						 lbm_size, 1,
-						 0, 0);
-		spin_unlock_irqrestore(&vc4->hvs->mm_lock, irqflags);
-
-		if (ret) {
-			drm_err(drm, "Failed to allocate LBM entry: %d\n", ret);
-			return ret;
-		}
-	} else {
-		WARN_ON_ONCE(lbm_size != vc4_state->lbm.size);
-	}
-
-	vc4_state->dlist[vc4_state->lbm_offset] = vc4_state->lbm.start;
+	vc4_state->lbm_size = lbm_size;
 
 	return 0;
 }