openwrt/target/linux/brcm2708/patches-4.4/0281-drm-vc4-Add-support-for-scaling-of-display-planes.patch

From 8d513c7a67cce0bf0ef312323753eccbd0f3f71a Mon Sep 17 00:00:00 2001
From: Eric Anholt <eric@anholt.net>
Date: Tue, 20 Oct 2015 16:06:57 +0100
Subject: [PATCH 281/423] drm/vc4: Add support for scaling of display planes.

This implements a simple policy for choosing scaling modes
(trapezoidal for decimation, PPF for magnification), and a single PPF
filter (Mitchell/Netravali's recommendation).

Signed-off-by: Eric Anholt <eric@anholt.net>
(cherry picked from commit 21af94cf1a4c2d3450ab7fead58e6e2291ab92a9)
---
 drivers/gpu/drm/vc4/vc4_drv.h   |   4 +
 drivers/gpu/drm/vc4/vc4_hvs.c   |  84 +++++++++++++
 drivers/gpu/drm/vc4/vc4_plane.c | 253 +++++++++++++++++++++++++++++++++++++---
 drivers/gpu/drm/vc4/vc4_regs.h  |  46 ++++++++
 4 files changed, 374 insertions(+), 13 deletions(-)

--- a/drivers/gpu/drm/vc4/vc4_drv.h
+++ b/drivers/gpu/drm/vc4/vc4_drv.h
@@ -156,7 +156,11 @@ struct vc4_hvs {
 	 * list.  Units are dwords.
 	 */
 	struct drm_mm dlist_mm;
+	/* Memory manager for the LBM memory used by HVS scaling. */
+	struct drm_mm lbm_mm;
 	spinlock_t mm_lock;
+
+	struct drm_mm_node mitchell_netravali_filter;
 };
 
 struct vc4_plane {
--- a/drivers/gpu/drm/vc4/vc4_hvs.c
+++ b/drivers/gpu/drm/vc4/vc4_hvs.c
@@ -100,12 +100,76 @@ int vc4_hvs_debugfs_regs(struct seq_file
 }
 #endif
 
+/* The filter kernel is composed of dwords each containing 3 9-bit
+ * signed integers packed next to each other.
+ */
+#define VC4_INT_TO_COEFF(coeff) ((coeff) & 0x1ff)
+#define VC4_PPF_FILTER_WORD(c0, c1, c2)				\
+	((((c0) & 0x1ff) << 0) |				\
+	 (((c1) & 0x1ff) << 9) |				\
+	 (((c2) & 0x1ff) << 18))
+
+/* The whole filter kernel is arranged as the coefficients 0-16 going
+ * up, then a pad, then 17-31 going down and reversed within the
+ * dwords.  This means that a linear phase kernel (where it's
+ * symmetrical at the boundary between 15 and 16) has the last 5
+ * dwords matching the first 5, but reversed.
+ */
+#define VC4_LINEAR_PHASE_KERNEL(c0, c1, c2, c3, c4, c5, c6, c7, c8,	\
+				c9, c10, c11, c12, c13, c14, c15)	\
+	{VC4_PPF_FILTER_WORD(c0, c1, c2),				\
+	 VC4_PPF_FILTER_WORD(c3, c4, c5),				\
+	 VC4_PPF_FILTER_WORD(c6, c7, c8),				\
+	 VC4_PPF_FILTER_WORD(c9, c10, c11),				\
+	 VC4_PPF_FILTER_WORD(c12, c13, c14),				\
+	 VC4_PPF_FILTER_WORD(c15, c15, 0)}
+
+#define VC4_LINEAR_PHASE_KERNEL_DWORDS 6
+#define VC4_KERNEL_DWORDS (VC4_LINEAR_PHASE_KERNEL_DWORDS * 2 - 1)
+
+/* Recommended B=1/3, C=1/3 filter choice from Mitchell/Netravali.
+ * http://www.cs.utexas.edu/~fussell/courses/cs384g/lectures/mitchell/Mitchell.pdf
+ */
+static const u32 mitchell_netravali_1_3_1_3_kernel[] =
+	VC4_LINEAR_PHASE_KERNEL(0, -2, -6, -8, -10, -8, -3, 2, 18,
+				50, 82, 119, 155, 187, 213, 227);
+
+static int vc4_hvs_upload_linear_kernel(struct vc4_hvs *hvs,
+					struct drm_mm_node *space,
+					const u32 *kernel)
+{
+	int ret, i;
+	u32 __iomem *dst_kernel;
+
+	ret = drm_mm_insert_node(&hvs->dlist_mm, space, VC4_KERNEL_DWORDS, 1,
+				 0);
+	if (ret) {
+		DRM_ERROR("Failed to allocate space for filter kernel: %d\n",
+			  ret);
+		return ret;
+	}
+
+	dst_kernel = hvs->dlist + space->start;
+
+	for (i = 0; i < VC4_KERNEL_DWORDS; i++) {
+		if (i < VC4_LINEAR_PHASE_KERNEL_DWORDS)
+			writel(kernel[i], &dst_kernel[i]);
+		else {
+			writel(kernel[VC4_KERNEL_DWORDS - i - 1],
+			       &dst_kernel[i]);
+		}
+	}
+
+	return 0;
+}
+
 static int vc4_hvs_bind(struct device *dev, struct device *master, void *data)
 {
 	struct platform_device *pdev = to_platform_device(dev);
 	struct drm_device *drm = dev_get_drvdata(master);
 	struct vc4_dev *vc4 = drm->dev_private;
 	struct vc4_hvs *hvs = NULL;
+	int ret;
 
 	hvs = devm_kzalloc(&pdev->dev, sizeof(*hvs), GFP_KERNEL);
 	if (!hvs)
@@ -130,6 +194,22 @@ static int vc4_hvs_bind(struct device *d
 		    HVS_BOOTLOADER_DLIST_END,
 		    (SCALER_DLIST_SIZE >> 2) - HVS_BOOTLOADER_DLIST_END);
 
+	/* Set up the HVS LBM memory manager.  We could have some more
+	 * complicated data structure that allowed reuse of LBM areas
+	 * between planes when they don't overlap on the screen, but
+	 * for now we just allocate globally.
+	 */
+	drm_mm_init(&hvs->lbm_mm, 0, 96 * 1024);
+
+	/* Upload filter kernels.  We only have the one for now, so we
+	 * keep it around for the lifetime of the driver.
+	 */
+	ret = vc4_hvs_upload_linear_kernel(hvs,
+					   &hvs->mitchell_netravali_filter,
+					   mitchell_netravali_1_3_1_3_kernel);
+	if (ret)
+		return ret;
+
 	vc4->hvs = hvs;
 	return 0;
 }
@@ -140,7 +220,11 @@ static void vc4_hvs_unbind(struct device
 	struct drm_device *drm = dev_get_drvdata(master);
 	struct vc4_dev *vc4 = drm->dev_private;
 
+	if (vc4->hvs->mitchell_netravali_filter.allocated)
+		drm_mm_remove_node(&vc4->hvs->mitchell_netravali_filter);
+
 	drm_mm_takedown(&vc4->hvs->dlist_mm);
+	drm_mm_takedown(&vc4->hvs->lbm_mm);
 
 	vc4->hvs = NULL;
 }
--- a/drivers/gpu/drm/vc4/vc4_plane.c
+++ b/drivers/gpu/drm/vc4/vc4_plane.c
@@ -24,6 +24,12 @@
 #include "drm_fb_cma_helper.h"
 #include "drm_plane_helper.h"
 
+enum vc4_scaling_mode {
+	VC4_SCALING_NONE,
+	VC4_SCALING_TPZ,
+	VC4_SCALING_PPF,
+};
+
 struct vc4_plane_state {
 	struct drm_plane_state base;
 	/* System memory copy of the display list for this element, computed
@@ -47,13 +53,19 @@ struct vc4_plane_state {
 
 	/* Clipped coordinates of the plane on the display. */
 	int crtc_x, crtc_y, crtc_w, crtc_h;
-	/* Clipped size of the area scanned from in the FB. */
-	u32 src_w, src_h;
+	/* Clipped area being scanned from in the FB. */
+	u32 src_x, src_y, src_w, src_h;
+
+	enum vc4_scaling_mode x_scaling, y_scaling;
+	bool is_unity;
 
 	/* Offset to start scanning out from the start of the plane's
 	 * BO.
 	 */
 	u32 offset;
+
+	/* Our allocation in LBM for temporary storage during scaling. */
+	struct drm_mm_node lbm;
 };
 
 static inline struct vc4_plane_state *
@@ -106,6 +118,16 @@ static const struct hvs_format *vc4_get_
 	return NULL;
 }
 
+static enum vc4_scaling_mode vc4_get_scaling_mode(u32 src, u32 dst)
+{
+	if (dst > src)
+		return VC4_SCALING_PPF;
+	else if (dst < src)
+		return VC4_SCALING_TPZ;
+	else
+		return VC4_SCALING_NONE;
+}
+
 static bool plane_enabled(struct drm_plane_state *state)
 {
 	return state->fb && state->crtc;
@@ -122,6 +144,8 @@ static struct drm_plane_state *vc4_plane
 	if (!vc4_state)
 		return NULL;
 
+	memset(&vc4_state->lbm, 0, sizeof(vc4_state->lbm));
+
 	__drm_atomic_helper_plane_duplicate_state(plane, &vc4_state->base);
 
 	if (vc4_state->dlist) {
@@ -141,8 +165,17 @@ static struct drm_plane_state *vc4_plane
 static void vc4_plane_destroy_state(struct drm_plane *plane,
 				    struct drm_plane_state *state)
 {
+	struct vc4_dev *vc4 = to_vc4_dev(plane->dev);
 	struct vc4_plane_state *vc4_state = to_vc4_plane_state(state);
 
+	if (vc4_state->lbm.allocated) {
+		unsigned long irqflags;
+
+		spin_lock_irqsave(&vc4->hvs->mm_lock, irqflags);
+		drm_mm_remove_node(&vc4_state->lbm);
+		spin_unlock_irqrestore(&vc4->hvs->mm_lock, irqflags);
+	}
+
 	kfree(vc4_state->dlist);
 	__drm_atomic_helper_plane_destroy_state(plane, &vc4_state->base);
 	kfree(state);
@@ -181,23 +214,60 @@ static void vc4_dlist_write(struct vc4_p
 	vc4_state->dlist[vc4_state->dlist_count++] = val;
 }
 
+/* Returns the scl0/scl1 field based on whether the dimensions need to
+ * be up/down/non-scaled.
+ *
+ * This is a replication of a table from the spec.
+ */
+static u32 vc4_get_scl_field(struct drm_plane_state *state)
+{
+	struct vc4_plane_state *vc4_state = to_vc4_plane_state(state);
+
+	switch (vc4_state->x_scaling << 2 | vc4_state->y_scaling) {
+	case VC4_SCALING_PPF << 2 | VC4_SCALING_PPF:
+		return SCALER_CTL0_SCL_H_PPF_V_PPF;
+	case VC4_SCALING_TPZ << 2 | VC4_SCALING_PPF:
+		return SCALER_CTL0_SCL_H_TPZ_V_PPF;
+	case VC4_SCALING_PPF << 2 | VC4_SCALING_TPZ:
+		return SCALER_CTL0_SCL_H_PPF_V_TPZ;
+	case VC4_SCALING_TPZ << 2 | VC4_SCALING_TPZ:
+		return SCALER_CTL0_SCL_H_TPZ_V_TPZ;
+	case VC4_SCALING_PPF << 2 | VC4_SCALING_NONE:
+		return SCALER_CTL0_SCL_H_PPF_V_NONE;
+	case VC4_SCALING_NONE << 2 | VC4_SCALING_PPF:
+		return SCALER_CTL0_SCL_H_NONE_V_PPF;
+	case VC4_SCALING_NONE << 2 | VC4_SCALING_TPZ:
+		return SCALER_CTL0_SCL_H_NONE_V_TPZ;
+	case VC4_SCALING_TPZ << 2 | VC4_SCALING_NONE:
+		return SCALER_CTL0_SCL_H_TPZ_V_NONE;
+	default:
+	case VC4_SCALING_NONE << 2 | VC4_SCALING_NONE:
+		/* The unity case is independently handled by
+		 * SCALER_CTL0_UNITY.
+		 */
+		return 0;
+	}
+}
+
 static int vc4_plane_setup_clipping_and_scaling(struct drm_plane_state *state)
 {
+	struct drm_plane *plane = state->plane;
 	struct vc4_plane_state *vc4_state = to_vc4_plane_state(state);
 	struct drm_framebuffer *fb = state->fb;
+	u32 subpixel_src_mask = (1 << 16) - 1;
 
 	vc4_state->offset = fb->offsets[0];
 
-	if (state->crtc_w << 16 != state->src_w ||
-	    state->crtc_h << 16 != state->src_h) {
-		/* We don't support scaling yet, which involves
-		 * allocating the LBM memory for scaling temporary
-		 * storage, and putting filter kernels in the HVS
-		 * context.
-		 */
+	/* We don't support subpixel source positioning for scaling. */
+	if ((state->src_x & subpixel_src_mask) ||
+	    (state->src_y & subpixel_src_mask) ||
+	    (state->src_w & subpixel_src_mask) ||
+	    (state->src_h & subpixel_src_mask)) {
 		return -EINVAL;
 	}
 
+	vc4_state->src_x = state->src_x >> 16;
+	vc4_state->src_y = state->src_y >> 16;
 	vc4_state->src_w = state->src_w >> 16;
 	vc4_state->src_h = state->src_h >> 16;
 
@@ -206,6 +276,23 @@ static int vc4_plane_setup_clipping_and_
 	vc4_state->crtc_w = state->crtc_w;
 	vc4_state->crtc_h = state->crtc_h;
 
+	vc4_state->x_scaling = vc4_get_scaling_mode(vc4_state->src_w,
+						    vc4_state->crtc_w);
+	vc4_state->y_scaling = vc4_get_scaling_mode(vc4_state->src_h,
+						    vc4_state->crtc_h);
+	vc4_state->is_unity = (vc4_state->x_scaling == VC4_SCALING_NONE &&
+			       vc4_state->y_scaling == VC4_SCALING_NONE);
+
+	/* No configuring scaling on the cursor plane, since it gets
+	 * non-vblank-synced updates, and scaling requires
+	 * LBM changes which have to be vblank-synced.
+	 */
+	if (plane->type == DRM_PLANE_TYPE_CURSOR && !vc4_state->is_unity)
+		return -EINVAL;
+
+	/* Clamp the on-screen start x/y to 0.  The hardware doesn't
+	 * support negative y, and negative x wastes bandwidth.
+	 */
 	if (vc4_state->crtc_x < 0) {
 		vc4_state->offset += (drm_format_plane_cpp(fb->pixel_format,
 							   0) *
@@ -223,6 +310,87 @@ static int vc4_plane_setup_clipping_and_
 	return 0;
 }
 
+static void vc4_write_tpz(struct vc4_plane_state *vc4_state, u32 src, u32 dst)
+{
+	u32 scale, recip;
+
+	scale = (1 << 16) * src / dst;
+
+	/* The specs note that while the reciprocal would be defined
+	 * as (1<<32)/scale, ~0 is close enough.
+	 */
+	recip = ~0 / scale;
+
+	vc4_dlist_write(vc4_state,
+			VC4_SET_FIELD(scale, SCALER_TPZ0_SCALE) |
+			VC4_SET_FIELD(0, SCALER_TPZ0_IPHASE));
+	vc4_dlist_write(vc4_state,
+			VC4_SET_FIELD(recip, SCALER_TPZ1_RECIP));
+}
+
+static void vc4_write_ppf(struct vc4_plane_state *vc4_state, u32 src, u32 dst)
+{
+	u32 scale = (1 << 16) * src / dst;
+
+	vc4_dlist_write(vc4_state,
+			SCALER_PPF_AGC |
+			VC4_SET_FIELD(scale, SCALER_PPF_SCALE) |
+			VC4_SET_FIELD(0, SCALER_PPF_IPHASE));
+}
+
+static u32 vc4_lbm_size(struct drm_plane_state *state)
+{
+	struct vc4_plane_state *vc4_state = to_vc4_plane_state(state);
+	/* This is the worst case number.  One of the two sizes will
+	 * be used depending on the scaling configuration.
+	 */
+	u32 pix_per_line = max(vc4_state->src_w, (u32)vc4_state->crtc_w);
+	u32 lbm;
+
+	if (vc4_state->is_unity)
+		return 0;
+	else if (vc4_state->y_scaling == VC4_SCALING_TPZ)
+		lbm = pix_per_line * 8;
+	else {
+		/* In special cases, this multiplier might be 12. */
+		lbm = pix_per_line * 16;
+	}
+
+	lbm = roundup(lbm, 32);
+
+	return lbm;
+}
+
+static void vc4_write_scaling_parameters(struct drm_plane_state *state)
+{
+	struct vc4_plane_state *vc4_state = to_vc4_plane_state(state);
+
+	/* Ch0 H-PPF Word 0: Scaling Parameters */
+	if (vc4_state->x_scaling == VC4_SCALING_PPF) {
+		vc4_write_ppf(vc4_state,
+			      vc4_state->src_w, vc4_state->crtc_w);
+	}
+
+	/* Ch0 V-PPF Words 0-1: Scaling Parameters, Context */
+	if (vc4_state->y_scaling == VC4_SCALING_PPF) {
+		vc4_write_ppf(vc4_state,
+			      vc4_state->src_h, vc4_state->crtc_h);
+		vc4_dlist_write(vc4_state, 0xc0c0c0c0);
+	}
+
+	/* Ch0 H-TPZ Words 0-1: Scaling Parameters, Recip */
+	if (vc4_state->x_scaling == VC4_SCALING_TPZ) {
+		vc4_write_tpz(vc4_state,
+			      vc4_state->src_w, vc4_state->crtc_w);
+	}
+
+	/* Ch0 V-TPZ Words 0-2: Scaling Parameters, Recip, Context */
+	if (vc4_state->y_scaling == VC4_SCALING_TPZ) {
+		vc4_write_tpz(vc4_state,
+			      vc4_state->src_h, vc4_state->crtc_h);
+		vc4_dlist_write(vc4_state, 0xc0c0c0c0);
+	}
+}
 
 /* Writes out a full display list for an active plane to the plane's
  * private dlist state.
@@ -230,22 +398,50 @@ static int vc4_plane_setup_clipping_and_
 static int vc4_plane_mode_set(struct drm_plane *plane,
 			      struct drm_plane_state *state)
 {
+	struct vc4_dev *vc4 = to_vc4_dev(plane->dev);
 	struct vc4_plane_state *vc4_state = to_vc4_plane_state(state);
 	struct drm_framebuffer *fb = state->fb;
 	struct drm_gem_cma_object *bo = drm_fb_cma_get_gem_obj(fb, 0);
 	u32 ctl0_offset = vc4_state->dlist_count;
 	const struct hvs_format *format = vc4_get_hvs_format(fb->pixel_format);
+	u32 scl;
+	u32 lbm_size;
+	unsigned long irqflags;
 	int ret;
 
 	ret = vc4_plane_setup_clipping_and_scaling(state);
 	if (ret)
 		return ret;
 
+	/* Allocate the LBM memory that the HVS will use for temporary
+	 * storage due to our scaling/format conversion.
+	 */
+	lbm_size = vc4_lbm_size(state);
+	if (lbm_size) {
+		if (!vc4_state->lbm.allocated) {
+			spin_lock_irqsave(&vc4->hvs->mm_lock, irqflags);
+			ret = drm_mm_insert_node(&vc4->hvs->lbm_mm,
+						 &vc4_state->lbm,
+						 lbm_size, 32, 0);
+			spin_unlock_irqrestore(&vc4->hvs->mm_lock, irqflags);
+		} else {
+			WARN_ON_ONCE(lbm_size != vc4_state->lbm.size);
+		}
+	}
+
+	if (ret)
+		return ret;
+
+	scl = vc4_get_scl_field(state);
+
+	/* Control word */
 	vc4_dlist_write(vc4_state,
 			SCALER_CTL0_VALID |
 			(format->pixel_order << SCALER_CTL0_ORDER_SHIFT) |
 			(format->hvs << SCALER_CTL0_PIXEL_FORMAT_SHIFT) |
-			SCALER_CTL0_UNITY);
+			(vc4_state->is_unity ? SCALER_CTL0_UNITY : 0) |
+			VC4_SET_FIELD(scl, SCALER_CTL0_SCL0) |
+			VC4_SET_FIELD(scl, SCALER_CTL0_SCL1));
 
 	/* Position Word 0: Image Positions and Alpha Value */
 	vc4_state->pos0_offset = vc4_state->dlist_count;
@@ -254,9 +450,14 @@ static int vc4_plane_mode_set(struct drm
 			VC4_SET_FIELD(vc4_state->crtc_x, SCALER_POS0_START_X) |
 			VC4_SET_FIELD(vc4_state->crtc_y, SCALER_POS0_START_Y));
 
-	/* Position Word 1: Scaled Image Dimensions.
-	 * Skipped due to SCALER_CTL0_UNITY scaling.
-	 */
+	/* Position Word 1: Scaled Image Dimensions. */
+	if (!vc4_state->is_unity) {
+		vc4_dlist_write(vc4_state,
+				VC4_SET_FIELD(vc4_state->crtc_w,
+					      SCALER_POS1_SCL_WIDTH) |
+				VC4_SET_FIELD(vc4_state->crtc_h,
+					      SCALER_POS1_SCL_HEIGHT));
+	}
 
 	/* Position Word 2: Source Image Size, Alpha Mode */
 	vc4_state->pos2_offset = vc4_state->dlist_count;
@@ -282,6 +483,32 @@ static int vc4_plane_mode_set(struct drm
 	vc4_dlist_write(vc4_state,
 			VC4_SET_FIELD(fb->pitches[0], SCALER_SRC_PITCH));
 
+	if (!vc4_state->is_unity) {
+		/* LBM Base Address. */
+		if (vc4_state->y_scaling != VC4_SCALING_NONE)
+			vc4_dlist_write(vc4_state, vc4_state->lbm.start);
+
+		vc4_write_scaling_parameters(state);
+
+		/* If any PPF setup was done, then all the kernel
+		 * pointers get uploaded.
+		 */
+		if (vc4_state->x_scaling == VC4_SCALING_PPF ||
+		    vc4_state->y_scaling == VC4_SCALING_PPF) {
+			u32 kernel = VC4_SET_FIELD(vc4->hvs->mitchell_netravali_filter.start,
+						   SCALER_PPF_KERNEL_OFFSET);
+
+			/* HPPF plane 0 */
+			vc4_dlist_write(vc4_state, kernel);
+			/* VPPF plane 0 */
+			vc4_dlist_write(vc4_state, kernel);
+			/* HPPF plane 1 */
+			vc4_dlist_write(vc4_state, kernel);
+			/* VPPF plane 1 */
+			vc4_dlist_write(vc4_state, kernel);
+		}
+	}
+
 	vc4_state->dlist[ctl0_offset] |=
 		VC4_SET_FIELD(vc4_state->dlist_count, SCALER_CTL0_SIZE);
 
--- a/drivers/gpu/drm/vc4/vc4_regs.h
+++ b/drivers/gpu/drm/vc4/vc4_regs.h
@@ -536,6 +536,21 @@ enum hvs_pixel_format {
 #define SCALER_CTL0_ORDER_MASK			VC4_MASK(14, 13)
 #define SCALER_CTL0_ORDER_SHIFT			13
 
+#define SCALER_CTL0_SCL1_MASK			VC4_MASK(10, 8)
+#define SCALER_CTL0_SCL1_SHIFT			8
+
+#define SCALER_CTL0_SCL0_MASK			VC4_MASK(7, 5)
+#define SCALER_CTL0_SCL0_SHIFT			5
+
+#define SCALER_CTL0_SCL_H_PPF_V_PPF		0
+#define SCALER_CTL0_SCL_H_TPZ_V_PPF		1
+#define SCALER_CTL0_SCL_H_PPF_V_TPZ		2
+#define SCALER_CTL0_SCL_H_TPZ_V_TPZ		3
+#define SCALER_CTL0_SCL_H_PPF_V_NONE		4
+#define SCALER_CTL0_SCL_H_NONE_V_PPF		5
+#define SCALER_CTL0_SCL_H_NONE_V_TPZ		6
+#define SCALER_CTL0_SCL_H_TPZ_V_NONE		7
+
 /* Set to indicate no scaling. */
 #define SCALER_CTL0_UNITY			BIT(4)
 
@@ -551,6 +566,12 @@ enum hvs_pixel_format {
 #define SCALER_POS0_START_X_MASK		VC4_MASK(11, 0)
 #define SCALER_POS0_START_X_SHIFT		0
 
+#define SCALER_POS1_SCL_HEIGHT_MASK		VC4_MASK(27, 16)
+#define SCALER_POS1_SCL_HEIGHT_SHIFT		16
+
+#define SCALER_POS1_SCL_WIDTH_MASK		VC4_MASK(11, 0)
+#define SCALER_POS1_SCL_WIDTH_SHIFT		0
+
 #define SCALER_POS2_ALPHA_MODE_MASK		VC4_MASK(31, 30)
 #define SCALER_POS2_ALPHA_MODE_SHIFT		30
 #define SCALER_POS2_ALPHA_MODE_PIPELINE		0
@@ -564,6 +585,31 @@ enum hvs_pixel_format {
 #define SCALER_POS2_WIDTH_MASK			VC4_MASK(11, 0)
 #define SCALER_POS2_WIDTH_SHIFT			0
 
+#define SCALER_TPZ0_VERT_RECALC			BIT(31)
+#define SCALER_TPZ0_SCALE_MASK			VC4_MASK(28, 8)
+#define SCALER_TPZ0_SCALE_SHIFT			8
+#define SCALER_TPZ0_IPHASE_MASK			VC4_MASK(7, 0)
+#define SCALER_TPZ0_IPHASE_SHIFT		0
+#define SCALER_TPZ1_RECIP_MASK			VC4_MASK(15, 0)
+#define SCALER_TPZ1_RECIP_SHIFT			0
+
+/* Skips interpolating coefficients to 64 phases, so just 8 are used.
+ * Required for nearest neighbor.
+ */
+#define SCALER_PPF_NOINTERP			BIT(31)
+/* Replaces the highest valued coefficient with one that makes all 4
+ * sum to unity.
+ */
+#define SCALER_PPF_AGC				BIT(30)
+#define SCALER_PPF_SCALE_MASK			VC4_MASK(24, 8)
+#define SCALER_PPF_SCALE_SHIFT			8
+#define SCALER_PPF_IPHASE_MASK			VC4_MASK(6, 0)
+#define SCALER_PPF_IPHASE_SHIFT			0
+
+#define SCALER_PPF_KERNEL_OFFSET_MASK		VC4_MASK(13, 0)
+#define SCALER_PPF_KERNEL_OFFSET_SHIFT		0
+#define SCALER_PPF_KERNEL_UNCACHED		BIT(31)
+
 #define SCALER_SRC_PITCH_MASK			VC4_MASK(15, 0)
 #define SCALER_SRC_PITCH_SHIFT			0
brcm2708: update linux 4.4 patches to latest version As usual these patches were extracted and rebased from the raspberry pi repo: https://github.com/raspberrypi/linux/tree/rpi-4.4.y Signed-off-by: Álvaro Fernández Rojas <noltari@gmail.com> 2016-07-07 07:22:07 +00:00			`From 8d513c7a67cce0bf0ef312323753eccbd0f3f71a Mon Sep 17 00:00:00 2001`
brcm2708: update linux 4.4 patches to latest version As usual these patches were extracted from the raspberry pi repo: https://github.com/raspberrypi/linux/tree/rpi-4.4.y Signed-off-by: Álvaro Fernández Rojas <noltari@gmail.com> 2016-04-24 11:03:39 +00:00			`From: Eric Anholt <eric@anholt.net>`
			`Date: Tue, 20 Oct 2015 16:06:57 +0100`
brcm2708: update linux 4.4 patches to latest version As usual these patches were extracted and rebased from the raspberry pi repo: https://github.com/raspberrypi/linux/tree/rpi-4.4.y Signed-off-by: Álvaro Fernández Rojas <noltari@gmail.com> 2016-07-07 07:22:07 +00:00			`Subject: [PATCH 281/423] drm/vc4: Add support for scaling of display planes.`
brcm2708: update linux 4.4 patches to latest version As usual these patches were extracted from the raspberry pi repo: https://github.com/raspberrypi/linux/tree/rpi-4.4.y Signed-off-by: Álvaro Fernández Rojas <noltari@gmail.com> 2016-04-24 11:03:39 +00:00
			`This implements a simple policy for choosing scaling modes`
			`(trapezoidal for decimation, PPF for magnification), and a single PPF`
			`filter (Mitchell/Netravali's recommendation).`

			`Signed-off-by: Eric Anholt <eric@anholt.net>`
			`(cherry picked from commit 21af94cf1a4c2d3450ab7fead58e6e2291ab92a9)`
			`---`
			`drivers/gpu/drm/vc4/vc4_drv.h \| 4 +`
			`drivers/gpu/drm/vc4/vc4_hvs.c \| 84 +++++++++++++`
			`drivers/gpu/drm/vc4/vc4_plane.c \| 253 +++++++++++++++++++++++++++++++++++++---`
			`drivers/gpu/drm/vc4/vc4_regs.h \| 46 ++++++++`
			`4 files changed, 374 insertions(+), 13 deletions(-)`

			`--- a/drivers/gpu/drm/vc4/vc4_drv.h`
			`+++ b/drivers/gpu/drm/vc4/vc4_drv.h`
			`@@ -156,7 +156,11 @@ struct vc4_hvs {`
			`* list. Units are dwords.`
			`*/`
			`struct drm_mm dlist_mm;`
			`+ /* Memory manager for the LBM memory used by HVS scaling. */`
			`+ struct drm_mm lbm_mm;`
			`spinlock_t mm_lock;`
			`+`
			`+ struct drm_mm_node mitchell_netravali_filter;`
			`};`

			`struct vc4_plane {`
			`--- a/drivers/gpu/drm/vc4/vc4_hvs.c`
			`+++ b/drivers/gpu/drm/vc4/vc4_hvs.c`
			`@@ -100,12 +100,76 @@ int vc4_hvs_debugfs_regs(struct seq_file`
			`}`
			`#endif`

			`+/* The filter kernel is composed of dwords each containing 3 9-bit`
			`+ * signed integers packed next to each other.`
			`+ */`
			`+#define VC4_INT_TO_COEFF(coeff) (coeff & 0x1ff)`
			`+#define VC4_PPF_FILTER_WORD(c0, c1, c2) \`
			`+ ((((c0) & 0x1ff) << 0) \| \`
			`+ (((c1) & 0x1ff) << 9) \| \`
			`+ (((c2) & 0x1ff) << 18))`
			`+`
			`+/* The whole filter kernel is arranged as the coefficients 0-16 going`
			`+ * up, then a pad, then 17-31 going down and reversed within the`
			`+ * dwords. This means that a linear phase kernel (where it's`
			`+ * symmetrical at the boundary between 15 and 16) has the last 5`
			`+ * dwords matching the first 5, but reversed.`
			`+ */`
			`+#define VC4_LINEAR_PHASE_KERNEL(c0, c1, c2, c3, c4, c5, c6, c7, c8, \`
			`+ c9, c10, c11, c12, c13, c14, c15) \`
			`+ {VC4_PPF_FILTER_WORD(c0, c1, c2), \`
			`+ VC4_PPF_FILTER_WORD(c3, c4, c5), \`
			`+ VC4_PPF_FILTER_WORD(c6, c7, c8), \`
			`+ VC4_PPF_FILTER_WORD(c9, c10, c11), \`
			`+ VC4_PPF_FILTER_WORD(c12, c13, c14), \`
			`+ VC4_PPF_FILTER_WORD(c15, c15, 0)}`
			`+`
			`+#define VC4_LINEAR_PHASE_KERNEL_DWORDS 6`
			`+#define VC4_KERNEL_DWORDS (VC4_LINEAR_PHASE_KERNEL_DWORDS * 2 - 1)`
			`+`
			`+/* Recommended B=1/3, C=1/3 filter choice from Mitchell/Netravali.`
			`+ * http://www.cs.utexas.edu/~fussell/courses/cs384g/lectures/mitchell/Mitchell.pdf`
			`+ */`
			`+static const u32 mitchell_netravali_1_3_1_3_kernel[] =`
			`+ VC4_LINEAR_PHASE_KERNEL(0, -2, -6, -8, -10, -8, -3, 2, 18,`
			`+ 50, 82, 119, 155, 187, 213, 227);`
			`+`
			`+static int vc4_hvs_upload_linear_kernel(struct vc4_hvs *hvs,`
			`+ struct drm_mm_node *space,`
			`+ const u32 *kernel)`
			`+{`
			`+ int ret, i;`
			`+ u32 __iomem *dst_kernel;`
			`+`
			`+ ret = drm_mm_insert_node(&hvs->dlist_mm, space, VC4_KERNEL_DWORDS, 1,`
			`+ 0);`
			`+ if (ret) {`
			`+ DRM_ERROR("Failed to allocate space for filter kernel: %d\n",`
			`+ ret);`
			`+ return ret;`
			`+ }`
			`+`
			`+ dst_kernel = hvs->dlist + space->start;`
			`+`
			`+ for (i = 0; i < VC4_KERNEL_DWORDS; i++) {`
			`+ if (i < VC4_LINEAR_PHASE_KERNEL_DWORDS)`
			`+ writel(kernel[i], &dst_kernel[i]);`
			`+ else {`
			`+ writel(kernel[VC4_KERNEL_DWORDS - i - 1],`
			`+ &dst_kernel[i]);`
			`+ }`
			`+ }`
			`+`
			`+ return 0;`
			`+}`
			`+`
			`static int vc4_hvs_bind(struct device dev, struct device master, void *data)`
			`{`
			`struct platform_device *pdev = to_platform_device(dev);`
			`struct drm_device *drm = dev_get_drvdata(master);`
			`struct vc4_dev *vc4 = drm->dev_private;`
			`struct vc4_hvs *hvs = NULL;`
			`+ int ret;`

			`hvs = devm_kzalloc(&pdev->dev, sizeof(*hvs), GFP_KERNEL);`
			`if (!hvs)`
			`@@ -130,6 +194,22 @@ static int vc4_hvs_bind(struct device *d`
			`HVS_BOOTLOADER_DLIST_END,`
			`(SCALER_DLIST_SIZE >> 2) - HVS_BOOTLOADER_DLIST_END);`

			`+ /* Set up the HVS LBM memory manager. We could have some more`
			`+ * complicated data structure that allowed reuse of LBM areas`
			`+ * between planes when they don't overlap on the screen, but`
			`+ * for now we just allocate globally.`
			`+ */`
			`+ drm_mm_init(&hvs->lbm_mm, 0, 96 * 1024);`
			`+`
			`+ /* Upload filter kernels. We only have the one for now, so we`
			`+ * keep it around for the lifetime of the driver.`
			`+ */`
			`+ ret = vc4_hvs_upload_linear_kernel(hvs,`
			`+ &hvs->mitchell_netravali_filter,`
			`+ mitchell_netravali_1_3_1_3_kernel);`
			`+ if (ret)`
			`+ return ret;`
			`+`
			`vc4->hvs = hvs;`
			`return 0;`
			`}`
			`@@ -140,7 +220,11 @@ static void vc4_hvs_unbind(struct device`
			`struct drm_device *drm = dev_get_drvdata(master);`
			`struct vc4_dev *vc4 = drm->dev_private;`

			`+ if (vc4->hvs->mitchell_netravali_filter.allocated)`
			`+ drm_mm_remove_node(&vc4->hvs->mitchell_netravali_filter);`
			`+`
			`drm_mm_takedown(&vc4->hvs->dlist_mm);`
			`+ drm_mm_takedown(&vc4->hvs->lbm_mm);`

			`vc4->hvs = NULL;`
			`}`
			`--- a/drivers/gpu/drm/vc4/vc4_plane.c`
			`+++ b/drivers/gpu/drm/vc4/vc4_plane.c`
			`@@ -24,6 +24,12 @@`
			`#include "drm_fb_cma_helper.h"`
			`#include "drm_plane_helper.h"`

			`+enum vc4_scaling_mode {`
			`+ VC4_SCALING_NONE,`
			`+ VC4_SCALING_TPZ,`
			`+ VC4_SCALING_PPF,`
			`+};`
			`+`
			`struct vc4_plane_state {`
			`struct drm_plane_state base;`
			`/* System memory copy of the display list for this element, computed`
			`@@ -47,13 +53,19 @@ struct vc4_plane_state {`

			`/* Clipped coordinates of the plane on the display. */`
			`int crtc_x, crtc_y, crtc_w, crtc_h;`
			`- /* Clipped size of the area scanned from in the FB. */`
			`- u32 src_w, src_h;`
			`+ /* Clipped area being scanned from in the FB. */`
			`+ u32 src_x, src_y, src_w, src_h;`
			`+`
			`+ enum vc4_scaling_mode x_scaling, y_scaling;`
			`+ bool is_unity;`

			`/* Offset to start scanning out from the start of the plane's`
			`* BO.`
			`*/`
			`u32 offset;`
			`+`
			`+ /* Our allocation in LBM for temporary storage during scaling. */`
			`+ struct drm_mm_node lbm;`
			`};`

			`static inline struct vc4_plane_state *`
			`@@ -106,6 +118,16 @@ static const struct hvs_format *vc4_get_`
			`return NULL;`
			`}`

			`+static enum vc4_scaling_mode vc4_get_scaling_mode(u32 src, u32 dst)`
			`+{`
			`+ if (dst > src)`
			`+ return VC4_SCALING_PPF;`
			`+ else if (dst < src)`
			`+ return VC4_SCALING_TPZ;`
			`+ else`
			`+ return VC4_SCALING_NONE;`
			`+}`
			`+`
			`static bool plane_enabled(struct drm_plane_state *state)`
			`{`
			`return state->fb && state->crtc;`
			`@@ -122,6 +144,8 @@ static struct drm_plane_state *vc4_plane`
			`if (!vc4_state)`
			`return NULL;`

			`+ memset(&vc4_state->lbm, 0, sizeof(vc4_state->lbm));`
			`+`
			`__drm_atomic_helper_plane_duplicate_state(plane, &vc4_state->base);`

			`if (vc4_state->dlist) {`
			`@@ -141,8 +165,17 @@ static struct drm_plane_state *vc4_plane`
			`static void vc4_plane_destroy_state(struct drm_plane *plane,`
			`struct drm_plane_state *state)`
			`{`
			`+ struct vc4_dev *vc4 = to_vc4_dev(plane->dev);`
			`struct vc4_plane_state *vc4_state = to_vc4_plane_state(state);`

			`+ if (vc4_state->lbm.allocated) {`
			`+ unsigned long irqflags;`
			`+`
			`+ spin_lock_irqsave(&vc4->hvs->mm_lock, irqflags);`
			`+ drm_mm_remove_node(&vc4_state->lbm);`
			`+ spin_unlock_irqrestore(&vc4->hvs->mm_lock, irqflags);`
			`+ }`
			`+`
			`kfree(vc4_state->dlist);`
			`__drm_atomic_helper_plane_destroy_state(plane, &vc4_state->base);`
			`kfree(state);`
			`@@ -181,23 +214,60 @@ static void vc4_dlist_write(struct vc4_p`
			`vc4_state->dlist[vc4_state->dlist_count++] = val;`
			`}`

			`+/* Returns the scl0/scl1 field based on whether the dimensions need to`
			`+ * be up/down/non-scaled.`
			`+ *`
			`+ * This is a replication of a table from the spec.`
			`+ */`
			`+static u32 vc4_get_scl_field(struct drm_plane_state *state)`
			`+{`
			`+ struct vc4_plane_state *vc4_state = to_vc4_plane_state(state);`
			`+`
			`+ switch (vc4_state->x_scaling << 2 \| vc4_state->y_scaling) {`
			`+ case VC4_SCALING_PPF << 2 \| VC4_SCALING_PPF:`
			`+ return SCALER_CTL0_SCL_H_PPF_V_PPF;`
			`+ case VC4_SCALING_TPZ << 2 \| VC4_SCALING_PPF:`
			`+ return SCALER_CTL0_SCL_H_TPZ_V_PPF;`
			`+ case VC4_SCALING_PPF << 2 \| VC4_SCALING_TPZ:`
			`+ return SCALER_CTL0_SCL_H_PPF_V_TPZ;`
			`+ case VC4_SCALING_TPZ << 2 \| VC4_SCALING_TPZ:`
			`+ return SCALER_CTL0_SCL_H_TPZ_V_TPZ;`
			`+ case VC4_SCALING_PPF << 2 \| VC4_SCALING_NONE:`
			`+ return SCALER_CTL0_SCL_H_PPF_V_NONE;`
			`+ case VC4_SCALING_NONE << 2 \| VC4_SCALING_PPF:`
			`+ return SCALER_CTL0_SCL_H_NONE_V_PPF;`
			`+ case VC4_SCALING_NONE << 2 \| VC4_SCALING_TPZ:`
			`+ return SCALER_CTL0_SCL_H_NONE_V_TPZ;`
			`+ case VC4_SCALING_TPZ << 2 \| VC4_SCALING_NONE:`
			`+ return SCALER_CTL0_SCL_H_TPZ_V_NONE;`
			`+ default:`
			`+ case VC4_SCALING_NONE << 2 \| VC4_SCALING_NONE:`
			`+ /* The unity case is independently handled by`
			`+ * SCALER_CTL0_UNITY.`
			`+ */`
			`+ return 0;`
			`+ }`
			`+}`
			`+`
			`static int vc4_plane_setup_clipping_and_scaling(struct drm_plane_state *state)`
			`{`
			`+ struct drm_plane *plane = state->plane;`
			`struct vc4_plane_state *vc4_state = to_vc4_plane_state(state);`
			`struct drm_framebuffer *fb = state->fb;`
			`+ u32 subpixel_src_mask = (1 << 16) - 1;`

			`vc4_state->offset = fb->offsets[0];`

			`- if (state->crtc_w << 16 != state->src_w \|\|`
			`- state->crtc_h << 16 != state->src_h) {`
			`- /* We don't support scaling yet, which involves`
			`- * allocating the LBM memory for scaling temporary`
			`- * storage, and putting filter kernels in the HVS`
			`- * context.`
			`- */`
			`+ /* We don't support subpixel source positioning for scaling. */`
			`+ if ((state->src_x & subpixel_src_mask) \|\|`
			`+ (state->src_y & subpixel_src_mask) \|\|`
			`+ (state->src_w & subpixel_src_mask) \|\|`
			`+ (state->src_h & subpixel_src_mask)) {`
			`return -EINVAL;`
			`}`

			`+ vc4_state->src_x = state->src_x >> 16;`
			`+ vc4_state->src_y = state->src_y >> 16;`
			`vc4_state->src_w = state->src_w >> 16;`
			`vc4_state->src_h = state->src_h >> 16;`

			`@@ -206,6 +276,23 @@ static int vc4_plane_setup_clipping_and_`
			`vc4_state->crtc_w = state->crtc_w;`
			`vc4_state->crtc_h = state->crtc_h;`

			`+ vc4_state->x_scaling = vc4_get_scaling_mode(vc4_state->src_w,`
			`+ vc4_state->crtc_w);`
			`+ vc4_state->y_scaling = vc4_get_scaling_mode(vc4_state->src_h,`
			`+ vc4_state->crtc_h);`
			`+ vc4_state->is_unity = (vc4_state->x_scaling == VC4_SCALING_NONE &&`
			`+ vc4_state->y_scaling == VC4_SCALING_NONE);`
			`+`
			`+ /* No configuring scaling on the cursor plane, since it gets`
			`+ non-vblank-synced updates, and scaling requires requires`
			`+ LBM changes which have to be vblank-synced.`
			`+ */`
			`+ if (plane->type == DRM_PLANE_TYPE_CURSOR && !vc4_state->is_unity)`
			`+ return -EINVAL;`
			`+`
			`+ /* Clamp the on-screen start x/y to 0. The hardware doesn't`
			`+ * support negative y, and negative x wastes bandwidth.`
			`+ */`
			`if (vc4_state->crtc_x < 0) {`
			`vc4_state->offset += (drm_format_plane_cpp(fb->pixel_format,`
			`0) *`
			`@@ -223,6 +310,87 @@ static int vc4_plane_setup_clipping_and_`
			`return 0;`
			`}`

			`+static void vc4_write_tpz(struct vc4_plane_state *vc4_state, u32 src, u32 dst)`
			`+{`
			`+ u32 scale, recip;`
			`+`
			`+ scale = (1 << 16) * src / dst;`
			`+`
			`+ /* The specs note that while the reciprocal would be defined`
			`+ * as (1<<32)/scale, ~0 is close enough.`
			`+ */`
			`+ recip = ~0 / scale;`
			`+`
			`+ vc4_dlist_write(vc4_state,`
			`+ VC4_SET_FIELD(scale, SCALER_TPZ0_SCALE) \|`
			`+ VC4_SET_FIELD(0, SCALER_TPZ0_IPHASE));`
			`+ vc4_dlist_write(vc4_state,`
			`+ VC4_SET_FIELD(recip, SCALER_TPZ1_RECIP));`
			`+}`
			`+`
			`+static void vc4_write_ppf(struct vc4_plane_state *vc4_state, u32 src, u32 dst)`
			`+{`
			`+ u32 scale = (1 << 16) * src / dst;`
			`+`
			`+ vc4_dlist_write(vc4_state,`
			`+ SCALER_PPF_AGC \|`
			`+ VC4_SET_FIELD(scale, SCALER_PPF_SCALE) \|`
			`+ VC4_SET_FIELD(0, SCALER_PPF_IPHASE));`
			`+}`
			`+`
			`+static u32 vc4_lbm_size(struct drm_plane_state *state)`
			`+{`
			`+ struct vc4_plane_state *vc4_state = to_vc4_plane_state(state);`
			`+ /* This is the worst case number. One of the two sizes will`
			`+ * be used depending on the scaling configuration.`
			`+ */`
			`+ u32 pix_per_line = max(vc4_state->src_w, (u32)vc4_state->crtc_w);`
			`+ u32 lbm;`
			`+`
			`+ if (vc4_state->is_unity)`
			`+ return 0;`
			`+ else if (vc4_state->y_scaling == VC4_SCALING_TPZ)`
			`+ lbm = pix_per_line * 8;`
			`+ else {`
			`+ /* In special cases, this multiplier might be 12. */`
			`+ lbm = pix_per_line * 16;`
			`+ }`
			`+`
			`+ lbm = roundup(lbm, 32);`
			`+`
			`+ return lbm;`
			`+}`
			`+`
			`+static void vc4_write_scaling_parameters(struct drm_plane_state *state)`
			`+{`
			`+ struct vc4_plane_state *vc4_state = to_vc4_plane_state(state);`
			`+`
			`+ /* Ch0 H-PPF Word 0: Scaling Parameters */`
			`+ if (vc4_state->x_scaling == VC4_SCALING_PPF) {`
			`+ vc4_write_ppf(vc4_state,`
			`+ vc4_state->src_w, vc4_state->crtc_w);`
			`+ }`
			`+`
			`+ /* Ch0 V-PPF Words 0-1: Scaling Parameters, Context */`
			`+ if (vc4_state->y_scaling == VC4_SCALING_PPF) {`
			`+ vc4_write_ppf(vc4_state,`
			`+ vc4_state->src_h, vc4_state->crtc_h);`
			`+ vc4_dlist_write(vc4_state, 0xc0c0c0c0);`
			`+ }`
			`+`
			`+ /* Ch0 H-TPZ Words 0-1: Scaling Parameters, Recip */`
			`+ if (vc4_state->x_scaling == VC4_SCALING_TPZ) {`
			`+ vc4_write_tpz(vc4_state,`
			`+ vc4_state->src_w, vc4_state->crtc_w);`
			`+ }`
			`+`
			`+ /* Ch0 V-TPZ Words 0-2: Scaling Parameters, Recip, Context */`
			`+ if (vc4_state->y_scaling == VC4_SCALING_TPZ) {`
			`+ vc4_write_tpz(vc4_state,`
			`+ vc4_state->src_h, vc4_state->crtc_h);`
			`+ vc4_dlist_write(vc4_state, 0xc0c0c0c0);`
			`+ }`
			`+}`

			`/* Writes out a full display list for an active plane to the plane's`
			`* private dlist state.`
			`@@ -230,22 +398,50 @@ static int vc4_plane_setup_clipping_and_`
			`static int vc4_plane_mode_set(struct drm_plane *plane,`
			`struct drm_plane_state *state)`
			`{`
			`+ struct vc4_dev *vc4 = to_vc4_dev(plane->dev);`
			`struct vc4_plane_state *vc4_state = to_vc4_plane_state(state);`
			`struct drm_framebuffer *fb = state->fb;`
			`struct drm_gem_cma_object *bo = drm_fb_cma_get_gem_obj(fb, 0);`
			`u32 ctl0_offset = vc4_state->dlist_count;`
			`const struct hvs_format *format = vc4_get_hvs_format(fb->pixel_format);`
			`+ u32 scl;`
			`+ u32 lbm_size;`
			`+ unsigned long irqflags;`
			`int ret;`

			`ret = vc4_plane_setup_clipping_and_scaling(state);`
			`if (ret)`
			`return ret;`

			`+ /* Allocate the LBM memory that the HVS will use for temporary`
			`+ * storage due to our scaling/format conversion.`
			`+ */`
			`+ lbm_size = vc4_lbm_size(state);`
			`+ if (lbm_size) {`
			`+ if (!vc4_state->lbm.allocated) {`
			`+ spin_lock_irqsave(&vc4->hvs->mm_lock, irqflags);`
			`+ ret = drm_mm_insert_node(&vc4->hvs->lbm_mm,`
			`+ &vc4_state->lbm,`
			`+ lbm_size, 32, 0);`
			`+ spin_unlock_irqrestore(&vc4->hvs->mm_lock, irqflags);`
			`+ } else {`
			`+ WARN_ON_ONCE(lbm_size != vc4_state->lbm.size);`
			`+ }`
			`+ }`
			`+`
			`+ if (ret)`
			`+ return ret;`
			`+`
			`+ scl = vc4_get_scl_field(state);`
			`+`
			`+ /* Control word */`
			`vc4_dlist_write(vc4_state,`
			`SCALER_CTL0_VALID \|`
			`(format->pixel_order << SCALER_CTL0_ORDER_SHIFT) \|`
			`(format->hvs << SCALER_CTL0_PIXEL_FORMAT_SHIFT) \|`
			`- SCALER_CTL0_UNITY);`
			`+ (vc4_state->is_unity ? SCALER_CTL0_UNITY : 0) \|`
			`+ VC4_SET_FIELD(scl, SCALER_CTL0_SCL0) \|`
			`+ VC4_SET_FIELD(scl, SCALER_CTL0_SCL1));`

			`/* Position Word 0: Image Positions and Alpha Value */`
			`vc4_state->pos0_offset = vc4_state->dlist_count;`
			`@@ -254,9 +450,14 @@ static int vc4_plane_mode_set(struct drm`
			`VC4_SET_FIELD(vc4_state->crtc_x, SCALER_POS0_START_X) \|`
			`VC4_SET_FIELD(vc4_state->crtc_y, SCALER_POS0_START_Y));`

			`- /* Position Word 1: Scaled Image Dimensions.`
			`- * Skipped due to SCALER_CTL0_UNITY scaling.`
			`- */`
			`+ /* Position Word 1: Scaled Image Dimensions. */`
			`+ if (!vc4_state->is_unity) {`
			`+ vc4_dlist_write(vc4_state,`
			`+ VC4_SET_FIELD(vc4_state->crtc_w,`
			`+ SCALER_POS1_SCL_WIDTH) \|`
			`+ VC4_SET_FIELD(vc4_state->crtc_h,`
			`+ SCALER_POS1_SCL_HEIGHT));`
			`+ }`

			`/* Position Word 2: Source Image Size, Alpha Mode */`
			`vc4_state->pos2_offset = vc4_state->dlist_count;`
			`@@ -282,6 +483,32 @@ static int vc4_plane_mode_set(struct drm`
			`vc4_dlist_write(vc4_state,`
			`VC4_SET_FIELD(fb->pitches[0], SCALER_SRC_PITCH));`

			`+ if (!vc4_state->is_unity) {`
			`+ /* LBM Base Address. */`
			`+ if (vc4_state->y_scaling != VC4_SCALING_NONE)`
			`+ vc4_dlist_write(vc4_state, vc4_state->lbm.start);`
			`+`
			`+ vc4_write_scaling_parameters(state);`
			`+`
			`+ /* If any PPF setup was done, then all the kernel`
			`+ * pointers get uploaded.`
			`+ */`
			`+ if (vc4_state->x_scaling == VC4_SCALING_PPF \|\|`
			`+ vc4_state->y_scaling == VC4_SCALING_PPF) {`
			`+ u32 kernel = VC4_SET_FIELD(vc4->hvs->mitchell_netravali_filter.start,`
			`+ SCALER_PPF_KERNEL_OFFSET);`
			`+`
			`+ /* HPPF plane 0 */`
			`+ vc4_dlist_write(vc4_state, kernel);`
			`+ /* VPPF plane 0 */`
			`+ vc4_dlist_write(vc4_state, kernel);`
			`+ /* HPPF plane 1 */`
			`+ vc4_dlist_write(vc4_state, kernel);`
			`+ /* VPPF plane 1 */`
			`+ vc4_dlist_write(vc4_state, kernel);`
			`+ }`
			`+ }`
			`+`
			`vc4_state->dlist[ctl0_offset] \|=`
			`VC4_SET_FIELD(vc4_state->dlist_count, SCALER_CTL0_SIZE);`

			`--- a/drivers/gpu/drm/vc4/vc4_regs.h`
			`+++ b/drivers/gpu/drm/vc4/vc4_regs.h`
			`@@ -536,6 +536,21 @@ enum hvs_pixel_format {`
			`#define SCALER_CTL0_ORDER_MASK VC4_MASK(14, 13)`
			`#define SCALER_CTL0_ORDER_SHIFT 13`

			`+#define SCALER_CTL0_SCL1_MASK VC4_MASK(10, 8)`
			`+#define SCALER_CTL0_SCL1_SHIFT 8`
			`+`
			`+#define SCALER_CTL0_SCL0_MASK VC4_MASK(7, 5)`
			`+#define SCALER_CTL0_SCL0_SHIFT 5`
			`+`
			`+#define SCALER_CTL0_SCL_H_PPF_V_PPF 0`
			`+#define SCALER_CTL0_SCL_H_TPZ_V_PPF 1`
			`+#define SCALER_CTL0_SCL_H_PPF_V_TPZ 2`
			`+#define SCALER_CTL0_SCL_H_TPZ_V_TPZ 3`
			`+#define SCALER_CTL0_SCL_H_PPF_V_NONE 4`
			`+#define SCALER_CTL0_SCL_H_NONE_V_PPF 5`
			`+#define SCALER_CTL0_SCL_H_NONE_V_TPZ 6`
			`+#define SCALER_CTL0_SCL_H_TPZ_V_NONE 7`
			`+`
			`/* Set to indicate no scaling. */`
			`#define SCALER_CTL0_UNITY BIT(4)`

			`@@ -551,6 +566,12 @@ enum hvs_pixel_format {`
			`#define SCALER_POS0_START_X_MASK VC4_MASK(11, 0)`
			`#define SCALER_POS0_START_X_SHIFT 0`

			`+#define SCALER_POS1_SCL_HEIGHT_MASK VC4_MASK(27, 16)`
			`+#define SCALER_POS1_SCL_HEIGHT_SHIFT 16`
			`+`
			`+#define SCALER_POS1_SCL_WIDTH_MASK VC4_MASK(11, 0)`
			`+#define SCALER_POS1_SCL_WIDTH_SHIFT 0`
			`+`
			`#define SCALER_POS2_ALPHA_MODE_MASK VC4_MASK(31, 30)`
			`#define SCALER_POS2_ALPHA_MODE_SHIFT 30`
			`#define SCALER_POS2_ALPHA_MODE_PIPELINE 0`
			`@@ -564,6 +585,31 @@ enum hvs_pixel_format {`
			`#define SCALER_POS2_WIDTH_MASK VC4_MASK(11, 0)`
			`#define SCALER_POS2_WIDTH_SHIFT 0`

			`+#define SCALER_TPZ0_VERT_RECALC BIT(31)`
			`+#define SCALER_TPZ0_SCALE_MASK VC4_MASK(28, 8)`
			`+#define SCALER_TPZ0_SCALE_SHIFT 8`
			`+#define SCALER_TPZ0_IPHASE_MASK VC4_MASK(7, 0)`
			`+#define SCALER_TPZ0_IPHASE_SHIFT 0`
			`+#define SCALER_TPZ1_RECIP_MASK VC4_MASK(15, 0)`
			`+#define SCALER_TPZ1_RECIP_SHIFT 0`
			`+`
			`+/* Skips interpolating coefficients to 64 phases, so just 8 are used.`
			`+ * Required for nearest neighbor.`
			`+ */`
			`+#define SCALER_PPF_NOINTERP BIT(31)`
			`+/* Replaces the highest valued coefficient with one that makes all 4`
			`+ * sum to unity.`
			`+ */`
			`+#define SCALER_PPF_AGC BIT(30)`
			`+#define SCALER_PPF_SCALE_MASK VC4_MASK(24, 8)`
			`+#define SCALER_PPF_SCALE_SHIFT 8`
			`+#define SCALER_PPF_IPHASE_MASK VC4_MASK(6, 0)`
			`+#define SCALER_PPF_IPHASE_SHIFT 0`
			`+`
			`+#define SCALER_PPF_KERNEL_OFFSET_MASK VC4_MASK(13, 0)`
			`+#define SCALER_PPF_KERNEL_OFFSET_SHIFT 0`
			`+#define SCALER_PPF_KERNEL_UNCACHED BIT(31)`
			`+`
			`#define SCALER_SRC_PITCH_MASK VC4_MASK(15, 0)`
			`#define SCALER_SRC_PITCH_SHIFT 0`