Date: Tue, 13 Jun 2017 09:47:39 +0200
From: Boris Brezillon <boris.brezillon@free-electrons.com>
To: Eric Anholt <eric@anholt.net>
Cc: dri-devel@lists.freedesktop.org, linux-kernel@vger.kernel.org
Subject: Re: [PATCH 1/2] drm/vc4: Add T-format scanout support.
Message-ID: <20170613094739.0365e19f@bbrezillon>
In-Reply-To: <20170608001336.12842-1-eric@anholt.net>
References: <20170608001336.12842-1-eric@anholt.net>
MIME-Version: 1.0
Content-Type: text/plain; charset=US-ASCII
Content-Transfer-Encoding: 7bit
Sender: linux-kernel-owner@vger.kernel.org
Content-Length: 7261
Lines: 185

On Wed,  7 Jun 2017 17:13:35 -0700
Eric Anholt <eric@anholt.net> wrote:

> The T tiling format is what V3D uses for textures, with no raster
> support at all until later revisions of the hardware (and always at a
> large 3D performance penalty).  If we can't scan out V3D's format,
> then we often need to do a relayout at some stage of the pipeline,
> either right before texturing from the scanout buffer (common in X11
> without a compositor) or between a tiled screen buffer right before
> scanout (an option I've considered in trying to resolve this
> inconsistency, but which means needing to use the dirty fb ioctl and
> having some update policy).
> 
> T-format scanout lets us avoid either of those shadow copies, for a
> massive, obvious performance improvement to X11 window dragging
> without a compositor.  Unfortunately, enabling a compositor to work
> around the discrepancy has turned out to be too costly in memory
> consumption for the Raspbian distribution.
> 
> Because the HVS operates a scanline at a time, compositing from T does
> increase the memory bandwidth cost of scanout.  On my 1920x1080@32bpp
> display on a RPi3, we go from about 15% of system memory bandwidth
> with linear to about 20% with tiled.  However, for X11 this still ends
> up being a huge performance win in active usage.
> 
> This patch doesn't yet handle src_x/src_y offsetting within the tiled
> buffer.  However, we fail to do so for untiled buffers already.
> 
> Signed-off-by: Eric Anholt <eric@anholt.net>

Reviewed-by: Boris Brezillon <boris.brezillon@free-electrons.com>

> ---
>  drivers/gpu/drm/vc4/vc4_plane.c | 31 +++++++++++++++++++++++++++----
>  drivers/gpu/drm/vc4/vc4_regs.h  | 19 +++++++++++++++++++
>  include/uapi/drm/drm_fourcc.h   | 23 ++++++++++++++++++++++-
>  3 files changed, 68 insertions(+), 5 deletions(-)
> 
> diff --git a/drivers/gpu/drm/vc4/vc4_plane.c b/drivers/gpu/drm/vc4/vc4_plane.c
> index da18dec21696..fa6809d8b0fe 100644
> --- a/drivers/gpu/drm/vc4/vc4_plane.c
> +++ b/drivers/gpu/drm/vc4/vc4_plane.c
> @@ -500,8 +500,8 @@ static int vc4_plane_mode_set(struct drm_plane *plane,
>  	u32 ctl0_offset = vc4_state->dlist_count;
>  	const struct hvs_format *format = vc4_get_hvs_format(fb->format->format);
>  	int num_planes = drm_format_num_planes(format->drm);
> -	u32 scl0, scl1;
> -	u32 lbm_size;
> +	u32 scl0, scl1, pitch0;
> +	u32 lbm_size, tiling;
>  	unsigned long irqflags;
>  	int ret, i;
>  
> @@ -542,11 +542,31 @@ static int vc4_plane_mode_set(struct drm_plane *plane,
>  		scl1 = vc4_get_scl_field(state, 0);
>  	}
>  
> +	switch (fb->modifier) {
> +	case DRM_FORMAT_MOD_LINEAR:
> +		tiling = SCALER_CTL0_TILING_LINEAR;
> +		pitch0 = VC4_SET_FIELD(fb->pitches[0], SCALER_SRC_PITCH);
> +		break;
> +	case DRM_FORMAT_MOD_BROADCOM_VC4_T_TILED:
> +		tiling = SCALER_CTL0_TILING_256B_OR_T;
> +
> +		pitch0 = (VC4_SET_FIELD(0, SCALER_PITCH0_TILE_Y_OFFSET),
> +			  VC4_SET_FIELD(0, SCALER_PITCH0_TILE_WIDTH_L),
> +			  VC4_SET_FIELD((vc4_state->src_w[0] + 31) >> 5,
> +					SCALER_PITCH0_TILE_WIDTH_R));
> +		break;
> +	default:
> +		DRM_DEBUG_KMS("Unsupported FB tiling flag 0x%16llx",
> +			      (long long)fb->modifier);
> +		return -EINVAL;
> +	}
> +
>  	/* Control word */
>  	vc4_dlist_write(vc4_state,
>  			SCALER_CTL0_VALID |
>  			(format->pixel_order << SCALER_CTL0_ORDER_SHIFT) |
>  			(format->hvs << SCALER_CTL0_PIXEL_FORMAT_SHIFT) |
> +			VC4_SET_FIELD(tiling, SCALER_CTL0_TILING) |
>  			(vc4_state->is_unity ? SCALER_CTL0_UNITY : 0) |
>  			VC4_SET_FIELD(scl0, SCALER_CTL0_SCL0) |
>  			VC4_SET_FIELD(scl1, SCALER_CTL0_SCL1));
> @@ -600,8 +620,11 @@ static int vc4_plane_mode_set(struct drm_plane *plane,
>  	for (i = 0; i < num_planes; i++)
>  		vc4_dlist_write(vc4_state, 0xc0c0c0c0);
>  
> -	/* Pitch word 0/1/2 */
> -	for (i = 0; i < num_planes; i++) {
> +	/* Pitch word 0 */
> +	vc4_dlist_write(vc4_state, pitch0);
> +
> +	/* Pitch word 1/2 */
> +	for (i = 1; i < num_planes; i++) {
>  		vc4_dlist_write(vc4_state,
>  				VC4_SET_FIELD(fb->pitches[i], SCALER_SRC_PITCH));
>  	}
> diff --git a/drivers/gpu/drm/vc4/vc4_regs.h b/drivers/gpu/drm/vc4/vc4_regs.h
> index 932093936178..d382c34c1b9e 100644
> --- a/drivers/gpu/drm/vc4/vc4_regs.h
> +++ b/drivers/gpu/drm/vc4/vc4_regs.h
> @@ -709,6 +709,13 @@ enum hvs_pixel_format {
>  #define SCALER_CTL0_SIZE_MASK			VC4_MASK(29, 24)
>  #define SCALER_CTL0_SIZE_SHIFT			24
>  
> +#define SCALER_CTL0_TILING_MASK			VC4_MASK(21, 20)
> +#define SCALER_CTL0_TILING_SHIFT		20
> +#define SCALER_CTL0_TILING_LINEAR		0
> +#define SCALER_CTL0_TILING_64B			1
> +#define SCALER_CTL0_TILING_128B			2
> +#define SCALER_CTL0_TILING_256B_OR_T		3
> +
>  #define SCALER_CTL0_HFLIP                       BIT(16)
>  #define SCALER_CTL0_VFLIP                       BIT(15)
>  
> @@ -838,7 +845,19 @@ enum hvs_pixel_format {
>  #define SCALER_PPF_KERNEL_OFFSET_SHIFT		0
>  #define SCALER_PPF_KERNEL_UNCACHED		BIT(31)
>  
> +/* PITCH0/1/2 fields for raster. */
>  #define SCALER_SRC_PITCH_MASK			VC4_MASK(15, 0)
>  #define SCALER_SRC_PITCH_SHIFT			0
>  
> +/* PITCH0 fields for T-tiled. */
> +#define SCALER_PITCH0_TILE_WIDTH_L_MASK		VC4_MASK(22, 16)
> +#define SCALER_PITCH0_TILE_WIDTH_L_SHIFT	16
> +#define SCALER_PITCH0_TILE_LINE_DIR		BIT(15)
> +#define SCALER_PITCH0_TILE_INITIAL_LINE_DIR	BIT(14)
> +/* Y offset within a tile. */
> +#define SCALER_PITCH0_TILE_Y_OFFSET_MASK	VC4_MASK(13, 7)
> +#define SCALER_PITCH0_TILE_Y_OFFSET_SHIFT	7
> +#define SCALER_PITCH0_TILE_WIDTH_R_MASK		VC4_MASK(6, 0)
> +#define SCALER_PITCH0_TILE_WIDTH_R_SHIFT	0
> +
>  #endif /* VC4_REGS_H */
> diff --git a/include/uapi/drm/drm_fourcc.h b/include/uapi/drm/drm_fourcc.h
> index 55e301047b3e..7586c46f68bf 100644
> --- a/include/uapi/drm/drm_fourcc.h
> +++ b/include/uapi/drm/drm_fourcc.h
> @@ -182,6 +182,7 @@ extern "C" {
>  #define DRM_FORMAT_MOD_VENDOR_SAMSUNG 0x04
>  #define DRM_FORMAT_MOD_VENDOR_QCOM    0x05
>  #define DRM_FORMAT_MOD_VENDOR_VIVANTE 0x06
> +#define DRM_FORMAT_MOD_VENDOR_BROADCOM 0x07
>  /* add more to the end as needed */
>  
>  #define fourcc_mod_code(vendor, val) \
> @@ -306,7 +307,6 @@ extern "C" {
>   */
>  #define DRM_FORMAT_MOD_VIVANTE_SPLIT_SUPER_TILED fourcc_mod_code(VIVANTE, 4)
>  
> -
>  /* NVIDIA Tegra frame buffer modifiers */
>  
>  /*
> @@ -351,6 +351,27 @@ extern "C" {
>   */
>  #define NV_FORMAT_MOD_TEGRA_16BX2_BLOCK(v) fourcc_mod_tegra_code(2, v)
>  
> +/*
> + * Broadcom VC4 "T" format
> + *
> + * This is the primary layout that the V3D GPU can texture from (it
> + * can't do linear).  The T format has:
> + *
> + * - 64b utiles of pixels in a raster-order grid according to cpp.  It's 4x4
> + *   pixels at 32 bit depth.
> + *
> + * - 1k subtiles made of a 4x4 raster-order grid of 64b utiles (so usually
> + *   16x16 pixels).
> + *
> + * - 4k tiles made of a 2x2 grid of 1k subtiles (so usually 32x32 pixels).  On
> + *   even 4k tile rows, they're arranged as (BL, TL, TR, BR), and on odd rows
> + *   they're (TR, BR, BL, TL), where bottom left is start of memory.
> + *
> + * - an image made of 4k tiles in rows either left-to-right (even rows of 4k
> + *   tiles) or right-to-left (odd rows of 4k tiles).
> + */
> +#define DRM_FORMAT_MOD_BROADCOM_VC4_T_TILED fourcc_mod_code(BROADCOM, 1)
> +
>  #if defined(__cplusplus)
>  }
>  #endif