brcm2708: organize kernel patches
[openwrt/openwrt.git] / target / linux / brcm2708 / patches-4.19 / 950-0562-drm-v3d-Add-support-for-compute-shader-dispatch.patch
1 From 22dbf1420a552d1952d22b92d8c30f8162b026b5 Mon Sep 17 00:00:00 2001
2 From: Eric Anholt <eric@anholt.net>
3 Date: Tue, 16 Apr 2019 15:58:54 -0700
4 Subject: [PATCH] drm/v3d: Add support for compute shader dispatch.
5
6 The compute shader dispatch interface is pretty simple -- just pass in
7 the regs that userspace has passed us, with no CLs to run. However,
8 with no CL to run it means that we need to do manual cache flushing of
9 the L2 after the HW execution completes (for SSBO, atomic, and
10 image_load_store writes that are the output of compute shaders).
11
12 This doesn't yet expose the L2 cache's ability to have a region of the
13 address space not write back to memory (which could be used for
14 shared_var storage).
15
16 So far, the Mesa side has been tested on V3D v4.2 simpenrose (passing
17 the ES31 tests), and on the kernel side on 7278 (failing atomic
18 compswap tests in a way that doesn't reproduce on simpenrose).
19
20 v2: Fix excessive allocation for the clean_job (reported by Dan
21 Carpenter). Keep refs on jobs until clean_job is finished, to
22 avoid spurious MMU errors if the output BOs are freed by userspace
23 before L2 cleaning is finished.
24
25 Signed-off-by: Eric Anholt <eric@anholt.net>
26 Link: https://patchwork.freedesktop.org/patch/msgid/20190416225856.20264-4-eric@anholt.net
27 Acked-by: Rob Clark <robdclark@gmail.com>
28 ---
29 drivers/gpu/drm/v3d/v3d_debugfs.c | 22 +++++
30 drivers/gpu/drm/v3d/v3d_drv.c | 10 +-
31 drivers/gpu/drm/v3d/v3d_drv.h | 28 +++++-
32 drivers/gpu/drm/v3d/v3d_fence.c | 2 +
33 drivers/gpu/drm/v3d/v3d_gem.c | 156 +++++++++++++++++++++++++++++-
34 drivers/gpu/drm/v3d/v3d_irq.c | 16 ++-
35 drivers/gpu/drm/v3d/v3d_regs.h | 73 ++++++++++++++
36 drivers/gpu/drm/v3d/v3d_sched.c | 121 +++++++++++++++++++++--
37 drivers/gpu/drm/v3d/v3d_trace.h | 94 ++++++++++++++++++
38 include/uapi/drm/v3d_drm.h | 28 ++++++
39 10 files changed, 531 insertions(+), 19 deletions(-)
40
41 --- a/drivers/gpu/drm/v3d/v3d_debugfs.c
42 +++ b/drivers/gpu/drm/v3d/v3d_debugfs.c
43 @@ -57,6 +57,17 @@ static const struct v3d_reg_def v3d_core
44 REGDEF(V3D_GMP_VIO_ADDR),
45 };
46
47 +static const struct v3d_reg_def v3d_csd_reg_defs[] = {
48 + REGDEF(V3D_CSD_STATUS),
49 + REGDEF(V3D_CSD_CURRENT_CFG0),
50 + REGDEF(V3D_CSD_CURRENT_CFG1),
51 + REGDEF(V3D_CSD_CURRENT_CFG2),
52 + REGDEF(V3D_CSD_CURRENT_CFG3),
53 + REGDEF(V3D_CSD_CURRENT_CFG4),
54 + REGDEF(V3D_CSD_CURRENT_CFG5),
55 + REGDEF(V3D_CSD_CURRENT_CFG6),
56 +};
57 +
58 static int v3d_v3d_debugfs_regs(struct seq_file *m, void *unused)
59 {
60 struct drm_info_node *node = (struct drm_info_node *)m->private;
61 @@ -88,6 +99,17 @@ static int v3d_v3d_debugfs_regs(struct s
62 V3D_CORE_READ(core,
63 v3d_core_reg_defs[i].reg));
64 }
65 +
66 + if (v3d_has_csd(v3d)) {
67 + for (i = 0; i < ARRAY_SIZE(v3d_csd_reg_defs); i++) {
68 + seq_printf(m, "core %d %s (0x%04x): 0x%08x\n",
69 + core,
70 + v3d_csd_reg_defs[i].name,
71 + v3d_csd_reg_defs[i].reg,
72 + V3D_CORE_READ(core,
73 + v3d_csd_reg_defs[i].reg));
74 + }
75 + }
76 }
77
78 return 0;
79 --- a/drivers/gpu/drm/v3d/v3d_drv.c
80 +++ b/drivers/gpu/drm/v3d/v3d_drv.c
81 @@ -7,9 +7,9 @@
82 * This driver supports the Broadcom V3D 3.3 and 4.1 OpenGL ES GPUs.
83 * For V3D 2.x support, see the VC4 driver.
84 *
85 - * Currently only single-core rendering using the binner and renderer,
86 - * along with TFU (texture formatting unit) rendering is supported.
87 - * V3D 4.x's CSD (compute shader dispatch) is not yet supported.
88 + * The V3D GPU includes a tiled renderer (composed of bin and render
89 + * pipelines), the TFU (texture formatting unit), and the CSD (compute
90 + * shader dispatch).
91 */
92
93 #include <linux/clk.h>
94 @@ -114,6 +114,9 @@ static int v3d_get_param_ioctl(struct dr
95 case DRM_V3D_PARAM_SUPPORTS_TFU:
96 args->value = 1;
97 return 0;
98 + case DRM_V3D_PARAM_SUPPORTS_CSD:
99 + args->value = v3d_has_csd(v3d);
100 + return 0;
101 default:
102 DRM_DEBUG("Unknown parameter %d\n", args->param);
103 return -EINVAL;
104 @@ -183,6 +186,7 @@ static const struct drm_ioctl_desc v3d_d
105 DRM_IOCTL_DEF_DRV(V3D_GET_PARAM, v3d_get_param_ioctl, DRM_RENDER_ALLOW),
106 DRM_IOCTL_DEF_DRV(V3D_GET_BO_OFFSET, v3d_get_bo_offset_ioctl, DRM_RENDER_ALLOW),
107 DRM_IOCTL_DEF_DRV(V3D_SUBMIT_TFU, v3d_submit_tfu_ioctl, DRM_RENDER_ALLOW | DRM_AUTH),
108 + DRM_IOCTL_DEF_DRV(V3D_SUBMIT_CSD, v3d_submit_csd_ioctl, DRM_RENDER_ALLOW | DRM_AUTH),
109 };
110
111 static const struct vm_operations_struct v3d_vm_ops = {
112 --- a/drivers/gpu/drm/v3d/v3d_drv.h
113 +++ b/drivers/gpu/drm/v3d/v3d_drv.h
114 @@ -16,9 +16,11 @@ enum v3d_queue {
115 V3D_BIN,
116 V3D_RENDER,
117 V3D_TFU,
118 + V3D_CSD,
119 + V3D_CACHE_CLEAN,
120 };
121
122 -#define V3D_MAX_QUEUES (V3D_TFU + 1)
123 +#define V3D_MAX_QUEUES (V3D_CACHE_CLEAN + 1)
124
125 struct v3d_queue_state {
126 struct drm_gpu_scheduler sched;
127 @@ -70,6 +72,7 @@ struct v3d_dev {
128 struct v3d_bin_job *bin_job;
129 struct v3d_render_job *render_job;
130 struct v3d_tfu_job *tfu_job;
131 + struct v3d_csd_job *csd_job;
132
133 struct v3d_queue_state queue[V3D_MAX_QUEUES];
134
135 @@ -92,6 +95,12 @@ struct v3d_dev {
136 */
137 struct mutex sched_lock;
138
139 + /* Lock taken during a cache clean and when initiating an L2
140 + * flush, to keep L2 flushes from interfering with the
141 + * synchronous L2 cleans.
142 + */
143 + struct mutex cache_clean_lock;
144 +
145 struct {
146 u32 num_allocated;
147 u32 pages_allocated;
148 @@ -104,6 +113,12 @@ to_v3d_dev(struct drm_device *dev)
149 return (struct v3d_dev *)dev->dev_private;
150 }
151
152 +static inline bool
153 +v3d_has_csd(struct v3d_dev *v3d)
154 +{
155 + return v3d->ver >= 41;
156 +}
157 +
158 /* The per-fd struct, which tracks the MMU mappings. */
159 struct v3d_file_priv {
160 struct v3d_dev *v3d;
161 @@ -237,6 +252,14 @@ struct v3d_tfu_job {
162 struct drm_v3d_submit_tfu args;
163 };
164
165 +struct v3d_csd_job {
166 + struct v3d_job base;
167 +
168 + u32 timedout_batches;
169 +
170 + struct drm_v3d_submit_csd args;
171 +};
172 +
173 /**
174 * _wait_for - magic (register) wait macro
175 *
176 @@ -302,11 +325,14 @@ int v3d_submit_cl_ioctl(struct drm_devic
177 struct drm_file *file_priv);
178 int v3d_submit_tfu_ioctl(struct drm_device *dev, void *data,
179 struct drm_file *file_priv);
180 +int v3d_submit_csd_ioctl(struct drm_device *dev, void *data,
181 + struct drm_file *file_priv);
182 int v3d_wait_bo_ioctl(struct drm_device *dev, void *data,
183 struct drm_file *file_priv);
184 void v3d_job_put(struct v3d_job *job);
185 void v3d_reset(struct v3d_dev *v3d);
186 void v3d_invalidate_caches(struct v3d_dev *v3d);
187 +void v3d_clean_caches(struct v3d_dev *v3d);
188
189 /* v3d_irq.c */
190 int v3d_irq_init(struct v3d_dev *v3d);
191 --- a/drivers/gpu/drm/v3d/v3d_fence.c
192 +++ b/drivers/gpu/drm/v3d/v3d_fence.c
193 @@ -36,6 +36,8 @@ static const char *v3d_fence_get_timelin
194 return "v3d-render";
195 case V3D_TFU:
196 return "v3d-tfu";
197 + case V3D_CSD:
198 + return "v3d-csd";
199 default:
200 return NULL;
201 }
202 --- a/drivers/gpu/drm/v3d/v3d_gem.c
203 +++ b/drivers/gpu/drm/v3d/v3d_gem.c
204 @@ -162,10 +162,52 @@ v3d_flush_l2t(struct v3d_dev *v3d, int c
205 /* While there is a busy bit (V3D_L2TCACTL_L2TFLS), we don't
206 * need to wait for completion before dispatching the job --
207 * L2T accesses will be stalled until the flush has completed.
208 + * However, we do need to make sure we don't try to trigger a
209 + * new flush while the CACHE_CLEAN queue is trying to
210 + * synchronously clean after a job.
211 */
212 + mutex_lock(&v3d->cache_clean_lock);
213 V3D_CORE_WRITE(core, V3D_CTL_L2TCACTL,
214 V3D_L2TCACTL_L2TFLS |
215 V3D_SET_FIELD(V3D_L2TCACTL_FLM_FLUSH, V3D_L2TCACTL_FLM));
216 + mutex_unlock(&v3d->cache_clean_lock);
217 +}
218 +
219 +/* Cleans texture L1 and L2 cachelines (writing back dirty data).
220 + *
221 + * Runs from the CACHE_CLEAN queue after CSD has executed. First the
222 + * TMU write combiner is flushed (polling TMUWCF until it clears), then
223 + * the L2T is cleaned under cache_clean_lock (polling L2TFLS). Both
224 + * waits are synchronous so the clean is done before job completion is
225 + * signaled, and the lock keeps L2 flushes from confusing the check.
226 + */
227 +void
228 +v3d_clean_caches(struct v3d_dev *v3d)
229 +{
230 + struct drm_device *dev = &v3d->drm;
231 + int core = 0;
232 +
233 + trace_v3d_cache_clean_begin(dev);
234 +
235 + V3D_CORE_WRITE(core, V3D_CTL_L2TCACTL, V3D_L2TCACTL_TMUWCF);
236 + if (wait_for(!(V3D_CORE_READ(core, V3D_CTL_L2TCACTL) &
237 + V3D_L2TCACTL_TMUWCF), 100)) {
238 + DRM_ERROR("Timeout waiting for L1T write combiner flush\n");
239 + }
240 +
241 + mutex_lock(&v3d->cache_clean_lock);
242 + V3D_CORE_WRITE(core, V3D_CTL_L2TCACTL,
243 + V3D_L2TCACTL_L2TFLS |
244 + V3D_SET_FIELD(V3D_L2TCACTL_FLM_CLEAN, V3D_L2TCACTL_FLM));
245 +
246 + if (wait_for(!(V3D_CORE_READ(core, V3D_CTL_L2TCACTL) &
247 + V3D_L2TCACTL_L2TFLS), 100)) {
248 + DRM_ERROR("Timeout waiting for L2T clean\n");
249 + }
250 +
251 + mutex_unlock(&v3d->cache_clean_lock);
252 +
253 + trace_v3d_cache_clean_end(dev);
254 }
255
256 /* Invalidates the slice caches. These are read-only caches. */
257 @@ -584,7 +626,8 @@ static void
258 v3d_attach_fences_and_unlock_reservation(struct drm_file *file_priv,
259 struct v3d_job *job,
260 struct ww_acquire_ctx *acquire_ctx,
261 - u32 out_sync)
262 + u32 out_sync,
263 + struct dma_fence *done_fence)
264 {
265 struct drm_syncobj *sync_out;
266
267 @@ -594,7 +637,7 @@ v3d_attach_fences_and_unlock_reservation
268 /* Update the return sync object for the job */
269 sync_out = drm_syncobj_find(file_priv, out_sync);
270 if (sync_out) {
271 - drm_syncobj_replace_fence(sync_out, job->done_fence);
272 + drm_syncobj_replace_fence(sync_out, done_fence);
273 drm_syncobj_put(sync_out);
274 }
275 }
276 @@ -691,8 +734,10 @@ v3d_submit_cl_ioctl(struct drm_device *d
277 mutex_unlock(&v3d->sched_lock);
278
279 v3d_attach_fences_and_unlock_reservation(file_priv,
280 - &render->base, &acquire_ctx,
281 - args->out_sync);
282 + &render->base,
283 + &acquire_ctx,
284 + args->out_sync,
285 + render->base.done_fence);
286
287 if (bin)
288 v3d_job_put(&bin->base);
289 @@ -785,7 +830,8 @@ v3d_submit_tfu_ioctl(struct drm_device *
290
291 v3d_attach_fences_and_unlock_reservation(file_priv,
292 &job->base, &acquire_ctx,
293 - args->out_sync);
294 + args->out_sync,
295 + job->base.done_fence);
296
297 v3d_job_put(&job->base);
298
299 @@ -801,6 +847,105 @@ fail:
300 return ret;
301 }
302
303 +/**
304 + * v3d_submit_csd_ioctl() - Submits a CSD (compute shader dispatch) job to the V3D.
305 + * @dev: DRM device
306 + * @data: ioctl argument
307 + * @file_priv: DRM file for this fd
308 + *
309 + * Userspace provides the register setup for the CSD, which we don't
310 + * need to validate since the CSD is behind the MMU.
311 + */
312 +int
313 +v3d_submit_csd_ioctl(struct drm_device *dev, void *data,
314 + struct drm_file *file_priv)
315 +{
316 + struct v3d_dev *v3d = to_v3d_dev(dev);
317 + struct v3d_file_priv *v3d_priv = file_priv->driver_priv;
318 + struct drm_v3d_submit_csd *args = data;
319 + struct v3d_csd_job *job;
320 + struct v3d_job *clean_job;
321 + struct ww_acquire_ctx acquire_ctx;
322 + int ret;
323 +
324 + trace_v3d_submit_csd_ioctl(&v3d->drm, args->cfg[5], args->cfg[6]);
325 +
326 + if (!v3d_has_csd(v3d)) {
327 + DRM_DEBUG("Attempting CSD submit on non-CSD hardware\n");
328 + return -EINVAL;
329 + }
330 +
331 + job = kcalloc(1, sizeof(*job), GFP_KERNEL);
332 + if (!job)
333 + return -ENOMEM;
334 +
335 + ret = v3d_job_init(v3d, file_priv, &job->base,
336 + v3d_job_free, args->in_sync);
337 + if (ret) {
338 + kfree(job);
339 + return ret;
340 + }
341 +
342 + clean_job = kcalloc(1, sizeof(*clean_job), GFP_KERNEL);
343 + if (!clean_job) {
344 + /* v3d_job_put() releases job (as at fail:), so no kfree(job) here. */
345 + v3d_job_put(&job->base);
346 + return -ENOMEM;
347 + }
348 +
349 + ret = v3d_job_init(v3d, file_priv, clean_job, v3d_job_free, 0);
350 + if (ret) {
351 + v3d_job_put(&job->base);
352 + kfree(clean_job);
353 + return ret;
354 + }
355 +
356 + job->args = *args;
357 +
358 + ret = v3d_lookup_bos(dev, file_priv, clean_job,
359 + args->bo_handles, args->bo_handle_count);
360 + if (ret)
361 + goto fail;
362 +
363 + ret = v3d_lock_bo_reservations(clean_job, &acquire_ctx);
364 + if (ret)
365 + goto fail;
366 +
367 + mutex_lock(&v3d->sched_lock);
368 + ret = v3d_push_job(v3d_priv, &job->base, V3D_CSD);
369 + if (ret)
370 + goto fail_unreserve;
371 +
372 + ret = v3d_add_dep(clean_job, dma_fence_get(job->base.done_fence));
373 + if (ret)
374 + goto fail_unreserve;
375 + ret = v3d_push_job(v3d_priv, clean_job, V3D_CACHE_CLEAN);
376 + if (ret)
377 + goto fail_unreserve;
378 + mutex_unlock(&v3d->sched_lock);
379 +
380 + v3d_attach_fences_and_unlock_reservation(file_priv,
381 + clean_job,
382 + &acquire_ctx,
383 + args->out_sync,
384 + clean_job->done_fence);
385 +
386 + v3d_job_put(&job->base);
387 + v3d_job_put(clean_job);
388 +
389 + return 0;
390 +
391 +fail_unreserve:
392 + mutex_unlock(&v3d->sched_lock);
393 + v3d_unlock_bo_reservations(clean_job->bo, clean_job->bo_count,
394 + &acquire_ctx);
395 +fail:
396 + v3d_job_put(&job->base);
397 + v3d_job_put(clean_job);
398 +
399 + return ret;
400 +}
401 +
402 int
403 v3d_gem_init(struct drm_device *dev)
404 {
405 @@ -816,6 +961,7 @@ v3d_gem_init(struct drm_device *dev)
406 mutex_init(&v3d->bo_lock);
407 mutex_init(&v3d->reset_lock);
408 mutex_init(&v3d->sched_lock);
409 + mutex_init(&v3d->cache_clean_lock);
410
411 /* Note: We don't allocate address 0. Various bits of HW
412 * treat 0 as special, such as the occlusion query counters
413 --- a/drivers/gpu/drm/v3d/v3d_irq.c
414 +++ b/drivers/gpu/drm/v3d/v3d_irq.c
415 @@ -4,9 +4,9 @@
416 /**
417 * DOC: Interrupt management for the V3D engine
418 *
419 - * When we take a bin, render, or TFU done interrupt, we need to
420 - * signal the fence for that job so that the scheduler can queue up
421 - * the next one and unblock any waiters.
422 + * When we take a bin, render, TFU done, or CSD done interrupt, we
423 + * need to signal the fence for that job so that the scheduler can
424 + * queue up the next one and unblock any waiters.
425 *
426 * When we take the binner out of memory interrupt, we need to
427 * allocate some new memory and pass it to the binner so that the
428 @@ -20,6 +20,7 @@
429 #define V3D_CORE_IRQS ((u32)(V3D_INT_OUTOMEM | \
430 V3D_INT_FLDONE | \
431 V3D_INT_FRDONE | \
432 + V3D_INT_CSDDONE | \
433 V3D_INT_GMPV))
434
435 #define V3D_HUB_IRQS ((u32)(V3D_HUB_INT_MMU_WRV | \
436 @@ -108,6 +109,15 @@ v3d_irq(int irq, void *arg)
437 dma_fence_signal(&fence->base);
438 status = IRQ_HANDLED;
439 }
440 +
441 + if (intsts & V3D_INT_CSDDONE) {
442 + struct v3d_fence *fence =
443 + to_v3d_fence(v3d->csd_job->base.irq_fence);
444 +
445 + trace_v3d_csd_irq(&v3d->drm, fence->seqno);
446 + dma_fence_signal(&fence->base);
447 + status = IRQ_HANDLED;
448 + }
449
450 /* We shouldn't be triggering these if we have GMP in
451 * always-allowed mode.
452 --- a/drivers/gpu/drm/v3d/v3d_regs.h
453 +++ b/drivers/gpu/drm/v3d/v3d_regs.h
454 @@ -238,8 +238,11 @@
455 #define V3D_CTL_L2TCACTL 0x00030
456 # define V3D_L2TCACTL_TMUWCF BIT(8)
457 # define V3D_L2TCACTL_L2T_NO_WM BIT(4)
458 +/* Invalidates cache lines. */
459 # define V3D_L2TCACTL_FLM_FLUSH 0
460 +/* Removes cachelines without writing dirty lines back. */
461 # define V3D_L2TCACTL_FLM_CLEAR 1
462 +/* Writes out dirty cachelines and marks them clean, but doesn't invalidate. */
463 # define V3D_L2TCACTL_FLM_CLEAN 2
464 # define V3D_L2TCACTL_FLM_MASK V3D_MASK(2, 1)
465 # define V3D_L2TCACTL_FLM_SHIFT 1
466 @@ -255,6 +258,8 @@
467 #define V3D_CTL_INT_MSK_CLR 0x00064
468 # define V3D_INT_QPU_MASK V3D_MASK(27, 16)
469 # define V3D_INT_QPU_SHIFT 16
470 +# define V3D_INT_CSDDONE BIT(7)
471 +# define V3D_INT_PCTR BIT(6)
472 # define V3D_INT_GMPV BIT(5)
473 # define V3D_INT_TRFB BIT(4)
474 # define V3D_INT_SPILLUSE BIT(3)
475 @@ -374,4 +379,72 @@
476 #define V3D_GMP_PRESERVE_LOAD 0x00818
477 #define V3D_GMP_VALID_LINES 0x00820
478
479 +#define V3D_CSD_STATUS 0x00900
480 +# define V3D_CSD_STATUS_NUM_COMPLETED_MASK V3D_MASK(11, 4)
481 +# define V3D_CSD_STATUS_NUM_COMPLETED_SHIFT 4
482 +# define V3D_CSD_STATUS_NUM_ACTIVE_MASK V3D_MASK(3, 2)
483 +# define V3D_CSD_STATUS_NUM_ACTIVE_SHIFT 2
484 +# define V3D_CSD_STATUS_HAVE_CURRENT_DISPATCH BIT(1)
485 +# define V3D_CSD_STATUS_HAVE_QUEUED_DISPATCH BIT(0)
486 +
487 +#define V3D_CSD_QUEUED_CFG0 0x00904
488 +# define V3D_CSD_QUEUED_CFG0_NUM_WGS_X_MASK V3D_MASK(31, 16)
489 +# define V3D_CSD_QUEUED_CFG0_NUM_WGS_X_SHIFT 16
490 +# define V3D_CSD_QUEUED_CFG0_WG_X_OFFSET_MASK V3D_MASK(15, 0)
491 +# define V3D_CSD_QUEUED_CFG0_WG_X_OFFSET_SHIFT 0
492 +
493 +#define V3D_CSD_QUEUED_CFG1 0x00908
494 +# define V3D_CSD_QUEUED_CFG1_NUM_WGS_Y_MASK V3D_MASK(31, 16)
495 +# define V3D_CSD_QUEUED_CFG1_NUM_WGS_Y_SHIFT 16
496 +# define V3D_CSD_QUEUED_CFG1_WG_Y_OFFSET_MASK V3D_MASK(15, 0)
497 +# define V3D_CSD_QUEUED_CFG1_WG_Y_OFFSET_SHIFT 0
498 +
499 +#define V3D_CSD_QUEUED_CFG2 0x0090c
500 +# define V3D_CSD_QUEUED_CFG2_NUM_WGS_Z_MASK V3D_MASK(31, 16)
501 +# define V3D_CSD_QUEUED_CFG2_NUM_WGS_Z_SHIFT 16
502 +# define V3D_CSD_QUEUED_CFG2_WG_Z_OFFSET_MASK V3D_MASK(15, 0)
503 +# define V3D_CSD_QUEUED_CFG2_WG_Z_OFFSET_SHIFT 0
504 +
505 +#define V3D_CSD_QUEUED_CFG3 0x00910
506 +# define V3D_CSD_QUEUED_CFG3_OVERLAP_WITH_PREV BIT(26)
507 +# define V3D_CSD_QUEUED_CFG3_MAX_SG_ID_MASK V3D_MASK(25, 20)
508 +# define V3D_CSD_QUEUED_CFG3_MAX_SG_ID_SHIFT 20
509 +# define V3D_CSD_QUEUED_CFG3_BATCHES_PER_SG_M1_MASK V3D_MASK(19, 12)
510 +# define V3D_CSD_QUEUED_CFG3_BATCHES_PER_SG_M1_SHIFT 12
511 +# define V3D_CSD_QUEUED_CFG3_WGS_PER_SG_MASK V3D_MASK(11, 8)
512 +# define V3D_CSD_QUEUED_CFG3_WGS_PER_SG_SHIFT 8
513 +# define V3D_CSD_QUEUED_CFG3_WG_SIZE_MASK V3D_MASK(7, 0)
514 +# define V3D_CSD_QUEUED_CFG3_WG_SIZE_SHIFT 0
515 +
516 +/* Number of batches, minus 1 */
517 +#define V3D_CSD_QUEUED_CFG4 0x00914
518 +
519 +/* Shader address, pnan, singleseg, threading, like a shader record. */
520 +#define V3D_CSD_QUEUED_CFG5 0x00918
521 +
522 +/* Uniforms address (4 byte aligned) */
523 +#define V3D_CSD_QUEUED_CFG6 0x0091c
524 +
525 +#define V3D_CSD_CURRENT_CFG0 0x00920
526 +#define V3D_CSD_CURRENT_CFG1 0x00924
527 +#define V3D_CSD_CURRENT_CFG2 0x00928
528 +#define V3D_CSD_CURRENT_CFG3 0x0092c
529 +#define V3D_CSD_CURRENT_CFG4 0x00930
530 +#define V3D_CSD_CURRENT_CFG5 0x00934
531 +#define V3D_CSD_CURRENT_CFG6 0x00938
532 +
533 +#define V3D_CSD_CURRENT_ID0 0x0093c
534 +# define V3D_CSD_CURRENT_ID0_WG_X_MASK V3D_MASK(31, 16)
535 +# define V3D_CSD_CURRENT_ID0_WG_X_SHIFT 16
536 +# define V3D_CSD_CURRENT_ID0_WG_IN_SG_MASK V3D_MASK(11, 8)
537 +# define V3D_CSD_CURRENT_ID0_WG_IN_SG_SHIFT 8
538 +# define V3D_CSD_CURRENT_ID0_L_IDX_MASK V3D_MASK(7, 0)
539 +# define V3D_CSD_CURRENT_ID0_L_IDX_SHIFT 0
540 +
541 +#define V3D_CSD_CURRENT_ID1 0x00940
542 +# define V3D_CSD_CURRENT_ID0_WG_Z_MASK V3D_MASK(31, 16)
543 +# define V3D_CSD_CURRENT_ID0_WG_Z_SHIFT 16
544 +# define V3D_CSD_CURRENT_ID0_WG_Y_MASK V3D_MASK(15, 0)
545 +# define V3D_CSD_CURRENT_ID0_WG_Y_SHIFT 0
546 +
547 #endif /* V3D_REGS_H */
548 --- a/drivers/gpu/drm/v3d/v3d_sched.c
549 +++ b/drivers/gpu/drm/v3d/v3d_sched.c
550 @@ -48,6 +48,12 @@ to_tfu_job(struct drm_sched_job *sched_j
551 return container_of(sched_job, struct v3d_tfu_job, base.base);
552 }
553
554 +static struct v3d_csd_job *
555 +to_csd_job(struct drm_sched_job *sched_job)
556 +{
557 + return container_of(sched_job, struct v3d_csd_job, base.base);
558 +}
559 +
560 static void
561 v3d_job_free(struct drm_sched_job *sched_job)
562 {
563 @@ -205,6 +211,48 @@ v3d_tfu_job_run(struct drm_sched_job *sc
564 return fence;
565 }
566
567 +static struct dma_fence *
568 +v3d_csd_job_run(struct drm_sched_job *sched_job)
569 +{
570 + struct v3d_csd_job *job = to_csd_job(sched_job);
571 + struct v3d_dev *v3d = job->base.v3d;
572 + struct drm_device *dev = &v3d->drm;
573 + struct dma_fence *fence;
574 + int i;
575 +
576 + v3d->csd_job = job;
577 +
578 + v3d_invalidate_caches(v3d);
579 +
580 + fence = v3d_fence_create(v3d, V3D_CSD);
581 + if (IS_ERR(fence))
582 + return NULL;
583 +
584 + if (job->base.irq_fence)
585 + dma_fence_put(job->base.irq_fence);
586 + job->base.irq_fence = dma_fence_get(fence);
587 +
588 + trace_v3d_submit_csd(dev, to_v3d_fence(fence)->seqno);
589 +
590 + for (i = 1; i <= 6; i++)
591 + V3D_CORE_WRITE(0, V3D_CSD_QUEUED_CFG0 + 4 * i, job->args.cfg[i]);
592 + /* CFG0 write kicks off the job. */
593 + V3D_CORE_WRITE(0, V3D_CSD_QUEUED_CFG0, job->args.cfg[0]);
594 +
595 + return fence;
596 +}
597 +
598 +static struct dma_fence *
599 +v3d_cache_clean_job_run(struct drm_sched_job *sched_job)
600 +{
601 + struct v3d_job *job = to_v3d_job(sched_job);
602 + struct v3d_dev *v3d = job->v3d;
603 +
604 + v3d_clean_caches(v3d);
605 +
606 + return NULL;
607 +}
608 +
609 static void
610 v3d_gpu_reset_for_timeout(struct v3d_dev *v3d, struct drm_sched_job *sched_job)
611 {
612 @@ -277,13 +325,31 @@ v3d_render_job_timedout(struct drm_sched
613 }
614
615 static void
616 -v3d_tfu_job_timedout(struct drm_sched_job *sched_job)
617 +v3d_generic_job_timedout(struct drm_sched_job *sched_job)
618 {
619 struct v3d_job *job = to_v3d_job(sched_job);
620
621 v3d_gpu_reset_for_timeout(job->v3d, sched_job);
622 }
623
624 +static void
625 +v3d_csd_job_timedout(struct drm_sched_job *sched_job)
626 +{
627 + struct v3d_csd_job *job = to_csd_job(sched_job);
628 + struct v3d_dev *v3d = job->base.v3d;
629 + u32 batches = V3D_CORE_READ(0, V3D_CSD_CURRENT_CFG4);
630 +
631 + /* If we've made progress, skip reset and let the timer get
632 + * rearmed.
633 + */
634 + if (job->timedout_batches != batches) {
635 + job->timedout_batches = batches;
636 + return;
637 + }
638 +
639 + v3d_gpu_reset_for_timeout(v3d, sched_job);
640 +}
641 +
642 static const struct drm_sched_backend_ops v3d_bin_sched_ops = {
643 .dependency = v3d_job_dependency,
644 .run_job = v3d_bin_job_run,
645 @@ -301,10 +367,24 @@ static const struct drm_sched_backend_op
646 static const struct drm_sched_backend_ops v3d_tfu_sched_ops = {
647 .dependency = v3d_job_dependency,
648 .run_job = v3d_tfu_job_run,
649 - .timedout_job = v3d_tfu_job_timedout,
650 + .timedout_job = v3d_generic_job_timedout,
651 .free_job = v3d_job_free,
652 };
653
654 +static const struct drm_sched_backend_ops v3d_csd_sched_ops = {
655 + .dependency = v3d_job_dependency,
656 + .run_job = v3d_csd_job_run,
657 + .timedout_job = v3d_csd_job_timedout,
658 + .free_job = v3d_job_free
659 +};
660 +
661 +static const struct drm_sched_backend_ops v3d_cache_clean_sched_ops = {
662 + .dependency = v3d_job_dependency,
663 + .run_job = v3d_cache_clean_job_run,
664 + .timedout_job = v3d_generic_job_timedout,
665 + .free_job = v3d_job_free
666 +};
667 +
668 int
669 v3d_sched_init(struct v3d_dev *v3d)
670 {
671 @@ -331,7 +411,7 @@ v3d_sched_init(struct v3d_dev *v3d)
672 if (ret) {
673 dev_err(v3d->dev, "Failed to create render scheduler: %d.",
674 ret);
675 - drm_sched_fini(&v3d->queue[V3D_BIN].sched);
676 + v3d_sched_fini(v3d);
677 return ret;
678 }
679
680 @@ -343,11 +423,36 @@ v3d_sched_init(struct v3d_dev *v3d)
681 if (ret) {
682 dev_err(v3d->dev, "Failed to create TFU scheduler: %d.",
683 ret);
684 - drm_sched_fini(&v3d->queue[V3D_RENDER].sched);
685 - drm_sched_fini(&v3d->queue[V3D_BIN].sched);
686 + v3d_sched_fini(v3d);
687 return ret;
688 }
689
690 + if (v3d_has_csd(v3d)) {
691 + ret = drm_sched_init(&v3d->queue[V3D_CSD].sched,
692 + &v3d_csd_sched_ops,
693 + hw_jobs_limit, job_hang_limit,
694 + msecs_to_jiffies(hang_limit_ms),
695 + "v3d_csd");
696 + if (ret) {
697 + dev_err(v3d->dev, "Failed to create CSD scheduler: %d.",
698 + ret);
699 + v3d_sched_fini(v3d);
700 + return ret;
701 + }
702 +
703 + ret = drm_sched_init(&v3d->queue[V3D_CACHE_CLEAN].sched,
704 + &v3d_cache_clean_sched_ops,
705 + hw_jobs_limit, job_hang_limit,
706 + msecs_to_jiffies(hang_limit_ms),
707 + "v3d_cache_clean");
708 + if (ret) {
709 + dev_err(v3d->dev, "Failed to create CACHE_CLEAN scheduler: %d.",
710 + ret);
711 + v3d_sched_fini(v3d);
712 + return ret;
713 + }
714 + }
715 +
716 return 0;
717 }
718
719 @@ -356,6 +461,8 @@ v3d_sched_fini(struct v3d_dev *v3d)
720 {
721 enum v3d_queue q;
722
723 - for (q = 0; q < V3D_MAX_QUEUES; q++)
724 - drm_sched_fini(&v3d->queue[q].sched);
725 + for (q = 0; q < V3D_MAX_QUEUES; q++) {
726 + if (v3d->queue[q].sched.ops)
727 + drm_sched_fini(&v3d->queue[q].sched);
728 + }
729 }
730 --- a/drivers/gpu/drm/v3d/v3d_trace.h
731 +++ b/drivers/gpu/drm/v3d/v3d_trace.h
732 @@ -124,6 +124,26 @@ TRACE_EVENT(v3d_tfu_irq,
733 __entry->seqno)
734 );
735
736 +TRACE_EVENT(v3d_csd_irq,
737 + TP_PROTO(struct drm_device *dev,
738 + uint64_t seqno),
739 + TP_ARGS(dev, seqno),
740 +
741 + TP_STRUCT__entry(
742 + __field(u32, dev)
743 + __field(u64, seqno)
744 + ),
745 +
746 + TP_fast_assign(
747 + __entry->dev = dev->primary->index;
748 + __entry->seqno = seqno;
749 + ),
750 +
751 + TP_printk("dev=%u, seqno=%llu",
752 + __entry->dev,
753 + __entry->seqno)
754 +);
755 +
756 TRACE_EVENT(v3d_submit_tfu_ioctl,
757 TP_PROTO(struct drm_device *dev, u32 iia),
758 TP_ARGS(dev, iia),
759 @@ -163,6 +183,80 @@ TRACE_EVENT(v3d_submit_tfu,
760 __entry->seqno)
761 );
762
763 +TRACE_EVENT(v3d_submit_csd_ioctl,
764 + TP_PROTO(struct drm_device *dev, u32 cfg5, u32 cfg6),
765 + TP_ARGS(dev, cfg5, cfg6),
766 +
767 + TP_STRUCT__entry(
768 + __field(u32, dev)
769 + __field(u32, cfg5)
770 + __field(u32, cfg6)
771 + ),
772 +
773 + TP_fast_assign(
774 + __entry->dev = dev->primary->index;
775 + __entry->cfg5 = cfg5;
776 + __entry->cfg6 = cfg6;
777 + ),
778 +
779 + TP_printk("dev=%u, CFG5 0x%08x, CFG6 0x%08x",
780 + __entry->dev,
781 + __entry->cfg5,
782 + __entry->cfg6)
783 +);
784 +
785 +TRACE_EVENT(v3d_submit_csd,
786 + TP_PROTO(struct drm_device *dev,
787 + uint64_t seqno),
788 + TP_ARGS(dev, seqno),
789 +
790 + TP_STRUCT__entry(
791 + __field(u32, dev)
792 + __field(u64, seqno)
793 + ),
794 +
795 + TP_fast_assign(
796 + __entry->dev = dev->primary->index;
797 + __entry->seqno = seqno;
798 + ),
799 +
800 + TP_printk("dev=%u, seqno=%llu",
801 + __entry->dev,
802 + __entry->seqno)
803 +);
804 +
805 +TRACE_EVENT(v3d_cache_clean_begin,
806 + TP_PROTO(struct drm_device *dev),
807 + TP_ARGS(dev),
808 +
809 + TP_STRUCT__entry(
810 + __field(u32, dev)
811 + ),
812 +
813 + TP_fast_assign(
814 + __entry->dev = dev->primary->index;
815 + ),
816 +
817 + TP_printk("dev=%u",
818 + __entry->dev)
819 +);
820 +
821 +TRACE_EVENT(v3d_cache_clean_end,
822 + TP_PROTO(struct drm_device *dev),
823 + TP_ARGS(dev),
824 +
825 + TP_STRUCT__entry(
826 + __field(u32, dev)
827 + ),
828 +
829 + TP_fast_assign(
830 + __entry->dev = dev->primary->index;
831 + ),
832 +
833 + TP_printk("dev=%u",
834 + __entry->dev)
835 +);
836 +
837 TRACE_EVENT(v3d_reset_begin,
838 TP_PROTO(struct drm_device *dev),
839 TP_ARGS(dev),
840 --- a/include/uapi/drm/v3d_drm.h
841 +++ b/include/uapi/drm/v3d_drm.h
842 @@ -37,6 +37,7 @@ extern "C" {
843 #define DRM_V3D_GET_PARAM 0x04
844 #define DRM_V3D_GET_BO_OFFSET 0x05
845 #define DRM_V3D_SUBMIT_TFU 0x06
846 +#define DRM_V3D_SUBMIT_CSD 0x07
847
848 #define DRM_IOCTL_V3D_SUBMIT_CL DRM_IOWR(DRM_COMMAND_BASE + DRM_V3D_SUBMIT_CL, struct drm_v3d_submit_cl)
849 #define DRM_IOCTL_V3D_WAIT_BO DRM_IOWR(DRM_COMMAND_BASE + DRM_V3D_WAIT_BO, struct drm_v3d_wait_bo)
850 @@ -45,6 +46,7 @@ extern "C" {
851 #define DRM_IOCTL_V3D_GET_PARAM DRM_IOWR(DRM_COMMAND_BASE + DRM_V3D_GET_PARAM, struct drm_v3d_get_param)
852 #define DRM_IOCTL_V3D_GET_BO_OFFSET DRM_IOWR(DRM_COMMAND_BASE + DRM_V3D_GET_BO_OFFSET, struct drm_v3d_get_bo_offset)
853 #define DRM_IOCTL_V3D_SUBMIT_TFU DRM_IOW(DRM_COMMAND_BASE + DRM_V3D_SUBMIT_TFU, struct drm_v3d_submit_tfu)
854 +#define DRM_IOCTL_V3D_SUBMIT_CSD DRM_IOW(DRM_COMMAND_BASE + DRM_V3D_SUBMIT_CSD, struct drm_v3d_submit_csd)
855
856 /**
857 * struct drm_v3d_submit_cl - ioctl argument for submitting commands to the 3D
858 @@ -172,6 +174,7 @@ enum drm_v3d_param {
859 DRM_V3D_PARAM_V3D_CORE0_IDENT1,
860 DRM_V3D_PARAM_V3D_CORE0_IDENT2,
861 DRM_V3D_PARAM_SUPPORTS_TFU,
862 + DRM_V3D_PARAM_SUPPORTS_CSD,
863 };
864
865 struct drm_v3d_get_param {
866 @@ -212,6 +215,31 @@ struct drm_v3d_submit_tfu {
867 __u32 out_sync;
868 };
869
870 +/* Submits a compute shader for dispatch. This job will block on any
871 + * previous compute shaders submitted on this fd, and any other
872 + * synchronization must be performed with in_sync/out_sync.
873 + */
874 +struct drm_v3d_submit_csd {
875 + __u32 cfg[7];
876 + __u32 coef[4];
877 +
878 + /* Pointer to a u32 array of the BOs that are referenced by the job.
879 + */
880 + __u64 bo_handles;
881 +
882 + /* Number of BO handles passed in (size is that times 4). */
883 + __u32 bo_handle_count;
884 +
885 + /* sync object to block on before running the CSD job. Each
886 + * CSD job will execute in the order submitted to its FD.
887 + * Synchronization against rendering/TFU jobs or CSD from
888 + * other fds requires using sync objects.
889 + */
890 + __u32 in_sync;
891 + /* Sync object to signal when the CSD job is done. */
892 + __u32 out_sync;
893 +};
894 +
895 #if defined(__cplusplus)
896 }
897 #endif