From 387b65c16c0d763ee4610675ce61e1072fa6cd72 Mon Sep 17 00:00:00 2001 From: Dave Stevenson <dave.stevenson@raspberrypi.com> Date: Tue, 27 Apr 2021 14:24:21 +0200 Subject: [PATCH] drm/vc4: Add support for gamma on BCM2711 BCM2711 changes from a 256 entry lookup table to a 16 point piecewise linear function as the pipeline bitdepth has increased to make a LUT unwieldy. Implement a simple conversion from a 256 entry LUT that userspace is likely to expect to 16 evenly spread points in the PWL. This could be improved with curve fitting at a later date. Co-developed-by: Juerg Haefliger <juergh@canonical.com> Signed-off-by: Juerg Haefliger <juergh@canonical.com> Signed-off-by: Dave Stevenson <dave.stevenson@raspberrypi.com> Signed-off-by: Maxime Ripard <maxime@cerno.tech> --- drivers/gpu/drm/vc4/vc4_crtc.c | 35 ++++++++++--- drivers/gpu/drm/vc4/vc4_drv.h | 28 +++++++++-- drivers/gpu/drm/vc4/vc4_hvs.c | 89 ++++++++++++++++++++++++++++++++-- drivers/gpu/drm/vc4/vc4_regs.h | 22 +++++++++ 4 files changed, 162 insertions(+), 12 deletions(-) --- a/drivers/gpu/drm/vc4/vc4_crtc.c +++ b/drivers/gpu/drm/vc4/vc4_crtc.c @@ -1326,19 +1326,42 @@ int vc4_crtc_init(struct drm_device *drm if (!vc4->is_vc5) { drm_mode_crtc_set_gamma_size(crtc, ARRAY_SIZE(vc4_crtc->lut_r)); + } else { + /* This is a lie for hvs5 which uses a 16 point PWL, but it + * allows for something smarter than just 16 linearly spaced + * segments. Conversion is done in vc5_hvs_update_gamma_lut. + */ + drm_mode_crtc_set_gamma_size(crtc, 256); + } - drm_crtc_enable_color_mgmt(crtc, 0, false, crtc->gamma_size); + drm_crtc_enable_color_mgmt(crtc, 0, false, crtc->gamma_size); + if (!vc4->is_vc5) { /* We support CTM, but only for one CRTC at a time. It's therefore * implemented as private driver state in vc4_kms, not here. */ drm_crtc_enable_color_mgmt(crtc, 0, true, crtc->gamma_size); - } - for (i = 0; i < crtc->gamma_size; i++) { - vc4_crtc->lut_r[i] = i; - vc4_crtc->lut_g[i] = i; - vc4_crtc->lut_b[i] = i; + /* Initialize the VC4 gamma LUTs */ + for (i = 0; i < crtc->gamma_size; i++) { + vc4_crtc->lut_r[i] = i; + vc4_crtc->lut_g[i] = i; + vc4_crtc->lut_b[i] = i; + } + } else { + /* Initialize the VC5 gamma PWL entries. Assume 12-bit pipeline, + * evenly spread over full range. + */ + for (i = 0; i < SCALER5_DSPGAMMA_NUM_POINTS; i++) { + vc4_crtc->pwl_r[i] = + VC5_HVS_SET_GAMMA_ENTRY(i << 8, i << 12, 1 << 8); + vc4_crtc->pwl_g[i] = + VC5_HVS_SET_GAMMA_ENTRY(i << 8, i << 12, 1 << 8); + vc4_crtc->pwl_b[i] = + VC5_HVS_SET_GAMMA_ENTRY(i << 8, i << 12, 1 << 8); + vc4_crtc->pwl_a[i] = + VC5_HVS_SET_GAMMA_ENTRY(i << 8, i << 12, 1 << 8); + } } return 0; --- a/drivers/gpu/drm/vc4/vc4_drv.h +++ b/drivers/gpu/drm/vc4/vc4_drv.h @@ -20,6 +20,7 @@ #include <drm/drm_modeset_lock.h> #include "uapi/drm/vc4_drm.h" +#include "vc4_regs.h" struct drm_device; struct drm_gem_object; @@ -481,6 +482,17 @@ struct vc4_pv_data { enum vc4_encoder_type encoder_types[4]; }; +struct vc5_gamma_entry { + u32 x_c_terms; + u32 grad_term; +}; + +#define VC5_HVS_SET_GAMMA_ENTRY(x, c, g) (struct vc5_gamma_entry){ \ + .x_c_terms = VC4_SET_FIELD((x), SCALER5_DSPGAMMA_OFF_X) | \ + VC4_SET_FIELD((c), SCALER5_DSPGAMMA_OFF_C), \ + .grad_term = (g) \ +} + struct vc4_crtc { struct drm_crtc base; struct platform_device *pdev; @@ -490,9 +502,19 @@ struct vc4_crtc { /* Timestamp at start of vblank irq - unaffected by lock delays. */ ktime_t t_vblank; - u8 lut_r[256]; - u8 lut_g[256]; - u8 lut_b[256]; + union { + struct { /* VC4 gamma LUT */ + u8 lut_r[256]; + u8 lut_g[256]; + u8 lut_b[256]; + }; + struct { /* VC5 gamma PWL entries */ + struct vc5_gamma_entry pwl_r[SCALER5_DSPGAMMA_NUM_POINTS]; + struct vc5_gamma_entry pwl_g[SCALER5_DSPGAMMA_NUM_POINTS]; + struct vc5_gamma_entry pwl_b[SCALER5_DSPGAMMA_NUM_POINTS]; + struct vc5_gamma_entry pwl_a[SCALER5_DSPGAMMA_NUM_POINTS]; + }; + }; struct drm_pending_vblank_event *event; --- a/drivers/gpu/drm/vc4/vc4_hvs.c +++ b/drivers/gpu/drm/vc4/vc4_hvs.c @@ -241,7 +241,8 @@ static void vc4_hvs_lut_load(struct vc4_ static void vc4_hvs_update_gamma_lut(struct vc4_hvs *hvs, struct vc4_crtc *vc4_crtc) { - struct drm_crtc_state *crtc_state = vc4_crtc->base.state; + struct drm_crtc *crtc = &vc4_crtc->base; + struct drm_crtc_state *crtc_state = crtc->state; struct drm_color_lut *lut = crtc_state->gamma_lut->data; u32 length = drm_color_lut_size(crtc_state->gamma_lut); u32 i; @@ -255,6 +256,81 @@ static void vc4_hvs_update_gamma_lut(str vc4_hvs_lut_load(hvs, vc4_crtc); } +static void vc5_hvs_write_gamma_entry(struct vc4_hvs *hvs, + u32 offset, + struct vc5_gamma_entry *gamma) +{ + HVS_WRITE(offset, gamma->x_c_terms); + HVS_WRITE(offset + 4, gamma->grad_term); +} + +static void vc5_hvs_lut_load(struct vc4_hvs *hvs, + struct vc4_crtc *vc4_crtc) +{ + struct drm_crtc *crtc = &vc4_crtc->base; + struct drm_crtc_state *crtc_state = crtc->state; + struct vc4_crtc_state *vc4_state = to_vc4_crtc_state(crtc_state); + u32 i; + u32 offset = SCALER5_DSPGAMMA_START + + vc4_state->assigned_channel * SCALER5_DSPGAMMA_CHAN_OFFSET; + + for (i = 0; i < SCALER5_DSPGAMMA_NUM_POINTS; i++, offset += 8) + vc5_hvs_write_gamma_entry(hvs, offset, &vc4_crtc->pwl_r[i]); + for (i = 0; i < SCALER5_DSPGAMMA_NUM_POINTS; i++, offset += 8) + vc5_hvs_write_gamma_entry(hvs, offset, &vc4_crtc->pwl_g[i]); + for (i = 0; i < SCALER5_DSPGAMMA_NUM_POINTS; i++, offset += 8) + vc5_hvs_write_gamma_entry(hvs, offset, &vc4_crtc->pwl_b[i]); + + if (vc4_state->assigned_channel == 2) { + /* Alpha only valid on channel 2 */ + for (i = 0; i < SCALER5_DSPGAMMA_NUM_POINTS; i++, offset += 8) + vc5_hvs_write_gamma_entry(hvs, offset, &vc4_crtc->pwl_a[i]); + } +} + +static void vc5_hvs_update_gamma_lut(struct vc4_hvs *hvs, + struct vc4_crtc *vc4_crtc) +{ + struct drm_crtc *crtc = &vc4_crtc->base; + struct drm_color_lut *lut = crtc->state->gamma_lut->data; + unsigned int step, i; + u32 start, end; + +#define VC5_HVS_UPDATE_GAMMA_ENTRY_FROM_LUT(pwl, chan) \ + start = drm_color_lut_extract(lut[i * step].chan, 12); \ + end = drm_color_lut_extract(lut[(i + 1) * step - 1].chan, 12); \ + \ + /* Negative gradients not permitted by the hardware, so \ + * flatten such points out. \ + */ \ + if (end < start) \ + end = start; \ + \ + /* Assume 12bit pipeline. \ + * X evenly spread over full range (12 bit). \ + * C as U12.4 format. \ + * Gradient as U4.8 format. \ + */ \ + vc4_crtc->pwl[i] = \ + VC5_HVS_SET_GAMMA_ENTRY(i << 8, start << 4, \ + ((end - start) << 4) / (step - 1)) + + /* HVS5 has a 16 point piecewise linear function for each colour + * channel (including alpha on channel 2) on each display channel. + * + * Currently take a crude subsample of the gamma LUT, but this could + * be improved to implement curve fitting. + */ + step = crtc->gamma_size / SCALER5_DSPGAMMA_NUM_POINTS; + for (i = 0; i < SCALER5_DSPGAMMA_NUM_POINTS; i++) { + VC5_HVS_UPDATE_GAMMA_ENTRY_FROM_LUT(pwl_r, red); + VC5_HVS_UPDATE_GAMMA_ENTRY_FROM_LUT(pwl_g, green); + VC5_HVS_UPDATE_GAMMA_ENTRY_FROM_LUT(pwl_b, blue); + } + + vc5_hvs_lut_load(hvs, vc4_crtc); +} + u8 vc4_hvs_get_fifo_frame_count(struct vc4_hvs *hvs, unsigned int fifo) { struct drm_device *drm = &hvs->vc4->base; @@ -398,7 +474,10 @@ static int vc4_hvs_init_channel(struct v /* Reload the LUT, since the SRAMs would have been disabled if * all CRTCs had SCALER_DISPBKGND_GAMMA unset at once. */ - vc4_hvs_lut_load(hvs, vc4_crtc); + if (!vc4->is_vc5) + vc4_hvs_lut_load(hvs, vc4_crtc); + else + vc5_hvs_lut_load(hvs, vc4_crtc); drm_dev_exit(idx); @@ -628,7 +707,11 @@ void vc4_hvs_atomic_flush(struct drm_crt u32 dispbkgndx = HVS_READ(SCALER_DISPBKGNDX(channel)); if (crtc->state->gamma_lut) { - vc4_hvs_update_gamma_lut(hvs, vc4_crtc); + if (!vc4->is_vc5) + vc4_hvs_update_gamma_lut(hvs, vc4_crtc); + else + vc5_hvs_update_gamma_lut(hvs, vc4_crtc); + dispbkgndx |= SCALER_DISPBKGND_GAMMA; } else { /* Unsetting DISPBKGND_GAMMA skips the gamma lut step --- a/drivers/gpu/drm/vc4/vc4_regs.h +++ b/drivers/gpu/drm/vc4/vc4_regs.h @@ -512,6 +512,28 @@ #define SCALER_DLIST_START 0x00002000 #define SCALER_DLIST_SIZE 0x00004000 +/* Gamma PWL for each channel. 16 points for each of 4 colour channels (alpha + * only on channel 2). 8 bytes per entry, offsets first, then gradient: + * Y = GRAD * X + C + * + * Values for X and C are left justified, and vary depending on the width of + * the HVS channel: + * 8-bit pipeline: X uses [31:24], C is U8.8 format, and GRAD is U4.8. + * 12-bit pipeline: X uses [31:20], C is U12.4 format, and GRAD is U4.8. + * + * The 3 HVS channels start at 0x400 offsets (ie chan 1 starts at 0x2400, and + * chan 2 at 0x2800). + */ +#define SCALER5_DSPGAMMA_NUM_POINTS 16 +#define SCALER5_DSPGAMMA_START 0x00002000 +#define SCALER5_DSPGAMMA_CHAN_OFFSET 0x400 +# define SCALER5_DSPGAMMA_OFF_X_MASK VC4_MASK(31, 20) +# define SCALER5_DSPGAMMA_OFF_X_SHIFT 20 +# define SCALER5_DSPGAMMA_OFF_C_MASK VC4_MASK(15, 0) +# define SCALER5_DSPGAMMA_OFF_C_SHIFT 0 +# define SCALER5_DSPGAMMA_GRAD_MASK VC4_MASK(11, 0) +# define SCALER5_DSPGAMMA_GRAD_SHIFT 0 + #define SCALER5_DLIST_START 0x00004000 # define VC4_HDMI_SW_RESET_FORMAT_DETECT BIT(1)