f7a3051b2c621e38062ce1373bd432fa3b45418a
[openwrt/openwrt.git] / target / linux / brcm2708 / patches-3.10 / 0070-bcm2708_fb-DMA-acceleration-for-fb_copyarea.patch
1 From ba65074e39e6aee492bd3c077f640b29a0a89c05 Mon Sep 17 00:00:00 2001
2 From: Siarhei Siamashka <siarhei.siamashka@gmail.com>
3 Date: Mon, 17 Jun 2013 16:00:25 +0300
4 Subject: [PATCH 070/174] bcm2708_fb: DMA acceleration for fb_copyarea
5
6 Based on http://www.raspberrypi.org/phpBB3/viewtopic.php?p=62425#p62425
7 Also used Simon's dmaer_master module as a reference for tweaking DMA
8 settings for better performance.
9
10 For now the driver only busy-waits for DMA completion. IRQ support might be added later.
11 On a non-overclocked Raspberry Pi, the performance is ~360 MB/s
12 for a simple copy or ~260 MB/s for a two-pass copy (used when dragging
13 windows to the right).
14
15 In the case of using DMA channel 0, the performance improves
16 to ~440 MB/s.
17
18 For comparison, a VFP-optimized CPU copy can only do ~114 MB/s under
19 the same conditions (hindered by reading the uncached source buffer).
20
21 Signed-off-by: Siarhei Siamashka <siarhei.siamashka@gmail.com>
22 ---
23 drivers/video/bcm2708_fb.c | 162 ++++++++++++++++++++++++++++++++++++++++++++-
24 1 file changed, 159 insertions(+), 3 deletions(-)
25
26 --- a/drivers/video/bcm2708_fb.c
27 +++ b/drivers/video/bcm2708_fb.c
28 @@ -28,6 +28,7 @@
29 #include <linux/printk.h>
30 #include <linux/console.h>
31
32 +#include <mach/dma.h>
33 #include <mach/platform.h>
34 #include <mach/vcio.h>
35
36 @@ -63,6 +64,11 @@ struct bcm2708_fb {
37 struct fbinfo_s *info;
38 dma_addr_t dma;
39 u32 cmap[16];
40 + int dma_chan;
41 + int dma_irq;
42 + void __iomem *dma_chan_base;
43 + void *cb_base; /* DMA control blocks */
44 + dma_addr_t cb_handle;
45 };
46
47 #define to_bcm2708(info) container_of(info, struct bcm2708_fb, fb)
48 @@ -312,11 +318,133 @@ static void bcm2708_fb_fillrect(struct f
49 cfb_fillrect(info, rect);
50 }
51
52 +/* A helper function for configuring a DMA control block */
53 +static void set_dma_cb(struct bcm2708_dma_cb *cb,
54 + int burst_size,
55 + dma_addr_t dst,
56 + int dst_stride,
57 + dma_addr_t src,
58 + int src_stride,
59 + int w,
60 + int h)
61 +{
62 + cb->info = BCM2708_DMA_BURST(burst_size) | BCM2708_DMA_S_WIDTH |
63 + BCM2708_DMA_S_INC | BCM2708_DMA_D_WIDTH |
64 + BCM2708_DMA_D_INC | BCM2708_DMA_TDMODE;
65 + cb->dst = dst;
66 + cb->src = src;
67 + /*
68 + * This is not really obvious from the DMA documentation,
69 + * but the top 16 bits must be programmed to "height - 1"
70 + * and not "height" in 2D mode.
71 + */
72 + cb->length = ((h - 1) << 16) | w;
73 + cb->stride = ((dst_stride - w) << 16) | (u16)(src_stride - w);
74 + cb->pad[0] = 0;
75 + cb->pad[1] = 0;
76 +}
77 +
78 static void bcm2708_fb_copyarea(struct fb_info *info,
79 const struct fb_copyarea *region)
80 {
81 - /*print_debug("bcm2708_fb_copyarea\n"); */
82 - cfb_copyarea(info, region);
83 + struct bcm2708_fb *fb = to_bcm2708(info);
84 + struct bcm2708_dma_cb *cb = fb->cb_base;
85 + int bytes_per_pixel = (info->var.bits_per_pixel + 7) >> 3;
86 + /* Channel 0 supports larger bursts and is a bit faster */
87 + int burst_size = (fb->dma_chan == 0) ? 8 : 2;
88 +
89 + /* Fallback to cfb_copyarea() if we don't like something */
90 + if (bytes_per_pixel > 4 ||
91 + info->var.xres > 1920 || info->var.yres > 1200 ||
92 + region->width <= 0 || region->width > info->var.xres ||
93 + region->height <= 0 || region->height > info->var.yres ||
94 + region->sx < 0 || region->sx >= info->var.xres ||
95 + region->sy < 0 || region->sy >= info->var.yres ||
96 + region->dx < 0 || region->dx >= info->var.xres ||
97 + region->dy < 0 || region->dy >= info->var.yres ||
98 + region->sx + region->width > info->var.xres ||
99 + region->dx + region->width > info->var.xres ||
100 + region->sy + region->height > info->var.yres ||
101 + region->dy + region->height > info->var.yres) {
102 + cfb_copyarea(info, region);
103 + return;
104 + }
105 +
106 + if (region->dy == region->sy && region->dx > region->sx) {
107 + /*
108 + * A difficult case of overlapped copy. Because DMA can't
109 + * copy individual scanlines in backwards direction, we need
110 + * two-pass processing. We do it by programming a chain of DMA
111 + * control blocks in the first 16K part of the buffer and using
112 + * the remaining 48K as the intermediate temporary scratch
113 + * buffer. The buffer size is sufficient to handle up to
114 + * 1920x1200 resolution at 32bpp pixel depth.
115 + */
116 + int y;
117 + dma_addr_t control_block_pa = fb->cb_handle;
118 + dma_addr_t scratchbuf = fb->cb_handle + 16 * 1024;
119 + int scanline_size = bytes_per_pixel * region->width;
120 + int scanlines_per_cb = (64 * 1024 - 16 * 1024) / scanline_size;
121 +
122 + for (y = 0; y < region->height; y += scanlines_per_cb) {
123 + dma_addr_t src =
124 + fb->fb.fix.smem_start +
125 + bytes_per_pixel * region->sx +
126 + (region->sy + y) * fb->fb.fix.line_length;
127 + dma_addr_t dst =
128 + fb->fb.fix.smem_start +
129 + bytes_per_pixel * region->dx +
130 + (region->dy + y) * fb->fb.fix.line_length;
131 +
132 + if (region->height - y < scanlines_per_cb)
133 + scanlines_per_cb = region->height - y;
134 +
135 + set_dma_cb(cb, burst_size, scratchbuf, scanline_size,
136 + src, fb->fb.fix.line_length,
137 + scanline_size, scanlines_per_cb);
138 + control_block_pa += sizeof(struct bcm2708_dma_cb);
139 + cb->next = control_block_pa;
140 + cb++;
141 +
142 + set_dma_cb(cb, burst_size, dst, fb->fb.fix.line_length,
143 + scratchbuf, scanline_size,
144 + scanline_size, scanlines_per_cb);
145 + control_block_pa += sizeof(struct bcm2708_dma_cb);
146 + cb->next = control_block_pa;
147 + cb++;
148 + }
149 + /* move the pointer back to the last dma control block */
150 + cb--;
151 + } else {
152 + /* A single dma control block is enough. */
153 + int sy, dy, stride;
154 + if (region->dy <= region->sy) {
155 + /* processing from top to bottom */
156 + dy = region->dy;
157 + sy = region->sy;
158 + stride = fb->fb.fix.line_length;
159 + } else {
160 + /* processing from bottom to top */
161 + dy = region->dy + region->height - 1;
162 + sy = region->sy + region->height - 1;
163 + stride = -fb->fb.fix.line_length;
164 + }
165 + set_dma_cb(cb, burst_size,
166 + fb->fb.fix.smem_start + dy * fb->fb.fix.line_length +
167 + bytes_per_pixel * region->dx,
168 + stride,
169 + fb->fb.fix.smem_start + sy * fb->fb.fix.line_length +
170 + bytes_per_pixel * region->sx,
171 + stride,
172 + region->width * bytes_per_pixel,
173 + region->height);
174 + }
175 +
176 + /* end of dma control blocks chain */
177 + cb->next = 0;
178 +
179 + bcm_dma_start(fb->dma_chan_base, fb->cb_handle);
180 + bcm_dma_wait_idle(fb->dma_chan_base);
181 }
182
183 static void bcm2708_fb_imageblit(struct fb_info *info,
184 @@ -359,7 +487,7 @@ static int bcm2708_fb_register(struct bc
185 fb->dma = dma;
186 }
187 fb->fb.fbops = &bcm2708_fb_ops;
188 - fb->fb.flags = FBINFO_FLAG_DEFAULT;
189 + fb->fb.flags = FBINFO_FLAG_DEFAULT | FBINFO_HWACCEL_COPYAREA;
190 fb->fb.pseudo_palette = fb->cmap;
191
192 strncpy(fb->fb.fix.id, bcm2708_name, sizeof(fb->fb.fix.id));
193 @@ -424,6 +552,28 @@ static int bcm2708_fb_probe(struct platf
194 }
195 memset(fb, 0, sizeof(struct bcm2708_fb));
196
197 + fb->cb_base = dma_alloc_writecombine(&dev->dev, SZ_64K,
198 + &fb->cb_handle, GFP_KERNEL);
199 + if (!fb->cb_base) {
200 + dev_err(&dev->dev, "cannot allocate DMA CBs\n");
201 + ret = -ENOMEM;
202 + goto free_fb;
203 + }
204 +
205 + pr_info("BCM2708FB: allocated DMA memory %08x\n",
206 + fb->cb_handle);
207 +
208 + ret = bcm_dma_chan_alloc(BCM_DMA_FEATURE_BULK,
209 + &fb->dma_chan_base, &fb->dma_irq);
210 + if (ret < 0) {
211 + dev_err(&dev->dev, "couldn't allocate a DMA channel\n");
212 + goto free_cb;
213 + }
214 + fb->dma_chan = ret;
215 +
216 + pr_info("BCM2708FB: allocated DMA channel %d @ %p\n",
217 + fb->dma_chan, fb->dma_chan_base);
218 +
219 fb->dev = dev;
220
221 ret = bcm2708_fb_register(fb);
222 @@ -432,6 +582,9 @@ static int bcm2708_fb_probe(struct platf
223 goto out;
224 }
225
226 +free_cb:
227 + dma_free_writecombine(&dev->dev, SZ_64K, fb->cb_base, fb->cb_handle);
228 +free_fb:
229 kfree(fb);
230 free_region:
231 dev_err(&dev->dev, "probe failed, err %d\n", ret);
232 @@ -449,6 +602,9 @@ static int bcm2708_fb_remove(struct plat
233 iounmap(fb->fb.screen_base);
234 unregister_framebuffer(&fb->fb);
235
236 + dma_free_writecombine(&dev->dev, SZ_64K, fb->cb_base, fb->cb_handle);
237 + bcm_dma_chan_free(fb->dma_chan);
238 +
239 dma_free_coherent(NULL, PAGE_ALIGN(sizeof(*fb->info)), (void *)fb->info,
240 fb->dma);
241 kfree(fb);