From 370c8243ec8e7f3abd8171b7d2dde170f4c5e63a Mon Sep 17 00:00:00 2001
From: Siarhei Siamashka <siarhei.siamashka@gmail.com>
Date: Mon, 17 Jun 2013 16:00:25 +0300
Subject: [PATCH 070/196] bcm2708_fb: DMA acceleration for fb_copyarea

Based on http://www.raspberrypi.org/phpBB3/viewtopic.php?p=62425#p62425
Also used Simon's dmaer_master module as a reference for tweaking DMA
settings for better performance.

For now busylooping only. IRQ support might be added later.
With non-overclocked Raspberry Pi, the performance is ~360 MB/s
for simple copy or ~260 MB/s for two-pass copy (used when dragging
windows to the right).

In the case of using DMA channel 0, the performance improves
to ~440 MB/s.

For comparison, VFP optimized CPU copy can only do ~114 MB/s in
the same conditions (hindered by reading uncached source buffer).

Signed-off-by: Siarhei Siamashka <siarhei.siamashka@gmail.com>
---
 drivers/video/bcm2708_fb.c | 162 ++++++++++++++++++++++++++++++++++++++++++++-
 1 file changed, 159 insertions(+), 3 deletions(-)

diff --git a/drivers/video/bcm2708_fb.c b/drivers/video/bcm2708_fb.c
index 08d9238..c10c5ee 100644
--- a/drivers/video/bcm2708_fb.c
+++ b/drivers/video/bcm2708_fb.c
@@ -28,6 +28,7 @@
 #include <linux/printk.h>
 #include <linux/console.h>
 
+#include <mach/dma.h>
 #include <mach/platform.h>
 #include <mach/vcio.h>
 
@@ -63,6 +64,11 @@ struct bcm2708_fb {
 	struct fbinfo_s *info;
 	dma_addr_t dma;
 	u32 cmap[16];
+	int dma_chan;
+	int dma_irq;
+	void __iomem *dma_chan_base;
+	void *cb_base;		/* DMA control blocks */
+	dma_addr_t cb_handle;
 };
 
 #define to_bcm2708(info)	container_of(info, struct bcm2708_fb, fb)
@@ -312,11 +318,133 @@ static void bcm2708_fb_fillrect(struct fb_info *info,
 	cfb_fillrect(info, rect);
 }
 
+/* A helper function for configuring dma control block */
+static void set_dma_cb(struct bcm2708_dma_cb *cb,
+		       int        burst_size,
+		       dma_addr_t dst,
+		       int        dst_stride,
+		       dma_addr_t src,
+		       int        src_stride,
+		       int        w,
+		       int        h)
+{
+	cb->info = BCM2708_DMA_BURST(burst_size) | BCM2708_DMA_S_WIDTH |
+		   BCM2708_DMA_S_INC | BCM2708_DMA_D_WIDTH |
+		   BCM2708_DMA_D_INC | BCM2708_DMA_TDMODE;
+	cb->dst = dst;
+	cb->src = src;
+	/*
+	 * This is not really obvious from the DMA documentation,
+	 * but the top 16 bits must be programmmed to "height -1"
+	 * and not "height" in 2D mode.
+	 */
+	cb->length = ((h - 1) << 16) | w;
+	cb->stride = ((dst_stride - w) << 16) | (u16)(src_stride - w);
+	cb->pad[0] = 0;
+	cb->pad[1] = 0;
+}
+
 static void bcm2708_fb_copyarea(struct fb_info *info,
 				const struct fb_copyarea *region)
 {
-	/*print_debug("bcm2708_fb_copyarea\n"); */
-	cfb_copyarea(info, region);
+	struct bcm2708_fb *fb = to_bcm2708(info);
+	struct bcm2708_dma_cb *cb = fb->cb_base;
+	int bytes_per_pixel = (info->var.bits_per_pixel + 7) >> 3;
+	/* Channel 0 supports larger bursts and is a bit faster */
+	int burst_size = (fb->dma_chan == 0) ? 8 : 2;
+
+	/* Fallback to cfb_copyarea() if we don't like something */
+	if (bytes_per_pixel > 4 ||
+	    info->var.xres > 1920 || info->var.yres > 1200 ||
+	    region->width <= 0 || region->width > info->var.xres ||
+	    region->height <= 0 || region->height > info->var.yres ||
+	    region->sx < 0 || region->sx >= info->var.xres ||
+	    region->sy < 0 || region->sy >= info->var.yres ||
+	    region->dx < 0 || region->dx >= info->var.xres ||
+	    region->dy < 0 || region->dy >= info->var.yres ||
+	    region->sx + region->width > info->var.xres ||
+	    region->dx + region->width > info->var.xres ||
+	    region->sy + region->height > info->var.yres ||
+	    region->dy + region->height > info->var.yres) {
+		cfb_copyarea(info, region);
+		return;
+	}
+
+	if (region->dy == region->sy && region->dx > region->sx) {
+		/*
+		 * A difficult case of overlapped copy. Because DMA can't
+		 * copy individual scanlines in backwards direction, we need
+		 * two-pass processing. We do it by programming a chain of dma
+		 * control blocks in the first 16K part of the buffer and use
+		 * the remaining 48K as the intermediate temporary scratch
+		 * buffer. The buffer size is sufficient to handle up to
+		 * 1920x1200 resolution at 32bpp pixel depth.
+		 */
+		int y;
+		dma_addr_t control_block_pa = fb->cb_handle;
+		dma_addr_t scratchbuf = fb->cb_handle + 16 * 1024;
+		int scanline_size = bytes_per_pixel * region->width;
+		int scanlines_per_cb = (64 * 1024 - 16 * 1024) / scanline_size;
+
+		for (y = 0; y < region->height; y += scanlines_per_cb) {
+			dma_addr_t src =
+				fb->fb.fix.smem_start +
+				bytes_per_pixel * region->sx +
+				(region->sy + y) * fb->fb.fix.line_length;
+			dma_addr_t dst =
+				fb->fb.fix.smem_start +
+				bytes_per_pixel * region->dx +
+				(region->dy + y) * fb->fb.fix.line_length;
+
+			if (region->height - y < scanlines_per_cb)
+				scanlines_per_cb = region->height - y;
+
+			set_dma_cb(cb, burst_size, scratchbuf, scanline_size,
+				   src, fb->fb.fix.line_length,
+				   scanline_size, scanlines_per_cb);
+			control_block_pa += sizeof(struct bcm2708_dma_cb);
+			cb->next = control_block_pa;
+			cb++;
+
+			set_dma_cb(cb, burst_size, dst, fb->fb.fix.line_length,
+				   scratchbuf, scanline_size,
+				   scanline_size, scanlines_per_cb);
+			control_block_pa += sizeof(struct bcm2708_dma_cb);
+			cb->next = control_block_pa;
+			cb++;
+		}
+		/* move the pointer back to the last dma control block */
+		cb--;
+	} else {
+		/* A single dma control block is enough. */
+		int sy, dy, stride;
+		if (region->dy <= region->sy) {
+			/* processing from top to bottom */
+			dy = region->dy;
+			sy = region->sy;
+			stride = fb->fb.fix.line_length;
+		} else {
+			/* processing from bottom to top */
+			dy = region->dy + region->height - 1;
+			sy = region->sy + region->height - 1;
+			stride = -fb->fb.fix.line_length;
+		}
+		set_dma_cb(cb, burst_size,
+			   fb->fb.fix.smem_start + dy * fb->fb.fix.line_length +
+						   bytes_per_pixel * region->dx,
+			   stride,
+			   fb->fb.fix.smem_start + sy * fb->fb.fix.line_length +
+						   bytes_per_pixel * region->sx,
+			   stride,
+			   region->width * bytes_per_pixel,
+			   region->height);
+	}
+
+	/* end of dma control blocks chain */
+	cb->next = 0;
+
+	bcm_dma_start(fb->dma_chan_base, fb->cb_handle);
+	bcm_dma_wait_idle(fb->dma_chan_base);
 }
 
 static void bcm2708_fb_imageblit(struct fb_info *info,
@@ -359,7 +487,7 @@ static int bcm2708_fb_register(struct bcm2708_fb *fb)
 		fb->dma = dma;
 	}
 	fb->fb.fbops = &bcm2708_fb_ops;
-	fb->fb.flags = FBINFO_FLAG_DEFAULT;
+	fb->fb.flags = FBINFO_FLAG_DEFAULT | FBINFO_HWACCEL_COPYAREA;
 	fb->fb.pseudo_palette = fb->cmap;
 
 	strncpy(fb->fb.fix.id, bcm2708_name, sizeof(fb->fb.fix.id));
@@ -424,6 +552,28 @@ static int bcm2708_fb_probe(struct platform_device *dev)
 	}
 	memset(fb, 0, sizeof(struct bcm2708_fb));
 
+	fb->cb_base = dma_alloc_writecombine(&dev->dev, SZ_64K,
+					     &fb->cb_handle, GFP_KERNEL);
+	if (!fb->cb_base) {
+		dev_err(&dev->dev, "cannot allocate DMA CBs\n");
+		ret = -ENOMEM;
+		goto free_fb;
+	}
+
+	pr_info("BCM2708FB: allocated DMA memory %08x\n",
+	       fb->cb_handle);
+
+	ret = bcm_dma_chan_alloc(BCM_DMA_FEATURE_BULK,
+				 &fb->dma_chan_base, &fb->dma_irq);
+	if (ret < 0) {
+		dev_err(&dev->dev, "couldn't allocate a DMA channel\n");
+		goto free_cb;
+	}
+	fb->dma_chan = ret;
+
+	pr_info("BCM2708FB: allocated DMA channel %d @ %p\n",
+	       fb->dma_chan, fb->dma_chan_base);
+
 	fb->dev = dev;
 
 	ret = bcm2708_fb_register(fb);
@@ -432,6 +582,9 @@ static int bcm2708_fb_probe(struct platform_device *dev)
 		goto out;
 	}
 
+free_cb:
+	dma_free_writecombine(&dev->dev, SZ_64K, fb->cb_base, fb->cb_handle);
+free_fb:
 	kfree(fb);
 free_region:
 	dev_err(&dev->dev, "probe failed, err %d\n", ret);
@@ -449,6 +602,9 @@ static int bcm2708_fb_remove(struct platform_device *dev)
 		iounmap(fb->fb.screen_base);
 	unregister_framebuffer(&fb->fb);
 
+	dma_free_writecombine(&dev->dev, SZ_64K, fb->cb_base, fb->cb_handle);
+	bcm_dma_chan_free(fb->dma_chan);
+
 	dma_free_coherent(NULL, PAGE_ALIGN(sizeof(*fb->info)), (void *)fb->info,
 			  fb->dma);
 	kfree(fb);
-- 
1.9.1