These two patches are not strictly required, but they do a bit of
cleanup and also fix a false-reporting issue when the kernel is compiled
with CONFIG_DMA_API_DEBUG.
Essentially the patchset modifies 'struct ttm_tt' and 'struct ttm_bo_device'
to contain a pointer to 'struct device'. The 'struct device' is passed in
via 'ttm_bo_device_init' and is then used during the allocation and
deallocation of pages when the TTM_PAGE_FLAG_DMA32 flag is set.
At first it seems that we would only need to keep a copy of the
'struct device' pointer in 'struct ttm_bo_device' and use that. However,
when 'ttm_tt_destroy' is called it sets ttm->be (which contains
the 'struct ttm_bo_device') to NULL, so we cannot use it. Hence
we also copy the 'struct device' pointer into 'struct ttm_tt' and keep
it there.
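Condensed, the flow after both patches looks like this ('priv' stands in for
the driver's private structure; see the full diff below for the real call sites):

  /* the driver (nouveau/radeon/vmwgfx) hands its 'struct device' to TTM: */
  ret = ttm_bo_device_init(&priv->bdev, glob, &driver,
                           DRM_FILE_PAGE_OFFSET, need_dma32,
                           dev->dev);            /* stored as bdev->dev */

  /* ttm_tt_create() copies it (ttm->dev = bdev->dev) and the page pool
   * then uses it for TTM_PAGE_FLAG_DMA32 allocations instead of NULL: */
  addr = dma_alloc_coherent(ttm->dev, PAGE_SIZE, &dma_address[r], gfp_flags);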
I've put the patches on:
git://git.kernel.org/pub/scm/linux/kernel/git/konrad/xen.git devel/ttm.pci-api.v5
The testing was a bit difficult b/c I could not persuade the machine to unload
either the nouveau or radeon driver (I had a serial console hooked up, so I figured
it would fall back to that?). I ended up calling 'unbind' on
/sys/pci/.../$BDF/driver/[radeon,nouveau] to trigger the de-allocation path.
Thomas,
What is the best way to test vmw-gfx*? Does the Workstation (5?) support
this particular frontend?
Konrad Rzeszutek Wilk (2):
ttm: Include the 'struct dev' when using the DMA API.
ttm: Pass in 'struct device' to TTM so it can do DMA API on behalf of device.
drivers/gpu/drm/nouveau/nouveau_mem.c | 3 ++-
drivers/gpu/drm/radeon/radeon_ttm.c | 3 ++-
drivers/gpu/drm/ttm/ttm_bo.c | 4 +++-
drivers/gpu/drm/ttm/ttm_page_alloc.c | 11 ++++++-----
drivers/gpu/drm/ttm/ttm_tt.c | 5 +++--
drivers/gpu/drm/vmwgfx/vmwgfx_drv.c | 4 ++--
include/drm/ttm/ttm_bo_driver.h | 7 ++++++-
include/drm/ttm/ttm_page_alloc.h | 8 ++++++--
8 files changed, 30 insertions(+), 15 deletions(-)
The full diff:
diff --git a/drivers/gpu/drm/nouveau/nouveau_mem.c b/drivers/gpu/drm/nouveau/nouveau_mem.c
index 26347b7..3706156 100644
--- a/drivers/gpu/drm/nouveau/nouveau_mem.c
+++ b/drivers/gpu/drm/nouveau/nouveau_mem.c
@@ -412,7 +412,8 @@ nouveau_mem_vram_init(struct drm_device *dev)
ret = ttm_bo_device_init(&dev_priv->ttm.bdev,
dev_priv->ttm.bo_global_ref.ref.object,
&nouveau_bo_driver, DRM_FILE_PAGE_OFFSET,
- dma_bits <= 32 ? true : false);
+ dma_bits <= 32 ? true : false,
+ dev->dev);
if (ret) {
NV_ERROR(dev, "Error initialising bo driver: %d\n", ret);
return ret;
diff --git a/drivers/gpu/drm/radeon/radeon_ttm.c b/drivers/gpu/drm/radeon/radeon_ttm.c
index 9ead11f..371890c 100644
--- a/drivers/gpu/drm/radeon/radeon_ttm.c
+++ b/drivers/gpu/drm/radeon/radeon_ttm.c
@@ -517,7 +517,8 @@ int radeon_ttm_init(struct radeon_device *rdev)
r = ttm_bo_device_init(&rdev->mman.bdev,
rdev->mman.bo_global_ref.ref.object,
&radeon_bo_driver, DRM_FILE_PAGE_OFFSET,
- rdev->need_dma32);
+ rdev->need_dma32,
+ rdev->dev);
if (r) {
DRM_ERROR("failed initializing buffer object driver(%d).\n", r);
return r;
diff --git a/drivers/gpu/drm/ttm/ttm_bo.c b/drivers/gpu/drm/ttm/ttm_bo.c
index af61fc2..278a2d3 100644
--- a/drivers/gpu/drm/ttm/ttm_bo.c
+++ b/drivers/gpu/drm/ttm/ttm_bo.c
@@ -1526,12 +1526,14 @@ int ttm_bo_device_init(struct ttm_bo_device *bdev,
struct ttm_bo_global *glob,
struct ttm_bo_driver *driver,
uint64_t file_page_offset,
- bool need_dma32)
+ bool need_dma32,
+ struct device *dev)
{
int ret = -EINVAL;
rwlock_init(&bdev->vm_lock);
bdev->driver = driver;
+ bdev->dev = dev;
memset(bdev->man, 0, sizeof(bdev->man));
diff --git a/drivers/gpu/drm/ttm/ttm_page_alloc.c b/drivers/gpu/drm/ttm/ttm_page_alloc.c
index 737a2a2..35849db 100644
--- a/drivers/gpu/drm/ttm/ttm_page_alloc.c
+++ b/drivers/gpu/drm/ttm/ttm_page_alloc.c
@@ -664,7 +664,7 @@ out:
*/
int ttm_get_pages(struct list_head *pages, int flags,
enum ttm_caching_state cstate, unsigned count,
- dma_addr_t *dma_address)
+ dma_addr_t *dma_address, struct device *dev)
{
struct ttm_page_pool *pool = ttm_get_pool(flags, cstate);
struct page *p = NULL;
@@ -685,7 +685,7 @@ int ttm_get_pages(struct list_head *pages, int flags,
for (r = 0; r < count; ++r) {
if ((flags & TTM_PAGE_FLAG_DMA32) && dma_address) {
void *addr;
- addr = dma_alloc_coherent(NULL, PAGE_SIZE,
+ addr = dma_alloc_coherent(dev, PAGE_SIZE,
&dma_address[r],
gfp_flags);
if (addr == NULL)
@@ -730,7 +730,7 @@ int ttm_get_pages(struct list_head *pages, int flags,
printk(KERN_ERR TTM_PFX
"Failed to allocate extra pages "
"for large request.");
- ttm_put_pages(pages, 0, flags, cstate, NULL);
+ ttm_put_pages(pages, 0, flags, cstate, NULL, NULL);
return r;
}
}
@@ -741,7 +741,8 @@ int ttm_get_pages(struct list_head *pages, int flags,
/* Put all pages in pages list to correct pool to wait for reuse */
void ttm_put_pages(struct list_head *pages, unsigned page_count, int flags,
- enum ttm_caching_state cstate, dma_addr_t *dma_address)
+ enum ttm_caching_state cstate, dma_addr_t *dma_address,
+ struct device *dev)
{
unsigned long irq_flags;
struct ttm_page_pool *pool = ttm_get_pool(flags, cstate);
@@ -757,7 +758,7 @@ void ttm_put_pages(struct list_head *pages, unsigned page_count, int flags,
void *addr = page_address(p);
WARN_ON(!addr || !dma_address[r]);
if (addr)
- dma_free_coherent(NULL, PAGE_SIZE,
+ dma_free_coherent(dev, PAGE_SIZE,
addr,
dma_address[r]);
dma_address[r] = 0;
diff --git a/drivers/gpu/drm/ttm/ttm_tt.c b/drivers/gpu/drm/ttm/ttm_tt.c
index 86d5b17..354f9d9 100644
--- a/drivers/gpu/drm/ttm/ttm_tt.c
+++ b/drivers/gpu/drm/ttm/ttm_tt.c
@@ -110,7 +110,7 @@ static struct page *__ttm_tt_get_page(struct ttm_tt *ttm, int index)
INIT_LIST_HEAD(&h);
ret = ttm_get_pages(&h, ttm->page_flags, ttm->caching_state, 1,
- &ttm->dma_address[index]);
+ &ttm->dma_address[index], ttm->dev);
if (ret != 0)
return NULL;
@@ -304,7 +304,7 @@ static void ttm_tt_free_alloced_pages(struct ttm_tt *ttm)
}
}
ttm_put_pages(&h, count, ttm->page_flags, ttm->caching_state,
- ttm->dma_address);
+ ttm->dma_address, ttm->dev);
ttm->state = tt_unpopulated;
ttm->first_himem_page = ttm->num_pages;
ttm->last_lomem_page = -1;
@@ -397,6 +397,7 @@ struct ttm_tt *ttm_tt_create(struct ttm_bo_device *bdev, unsigned long size,
ttm->last_lomem_page = -1;
ttm->caching_state = tt_cached;
ttm->page_flags = page_flags;
+ ttm->dev = bdev->dev;
ttm->dummy_read_page = dummy_read_page;
diff --git a/drivers/gpu/drm/vmwgfx/vmwgfx_drv.c b/drivers/gpu/drm/vmwgfx/vmwgfx_drv.c
index 10ca97e..803d979 100644
--- a/drivers/gpu/drm/vmwgfx/vmwgfx_drv.c
+++ b/drivers/gpu/drm/vmwgfx/vmwgfx_drv.c
@@ -322,11 +322,11 @@ static int vmw_driver_load(struct drm_device *dev, unsigned long chipset)
ttm_lock_set_kill(&dev_priv->fbdev_master.lock, false, SIGTERM);
dev_priv->active_master = &dev_priv->fbdev_master;
-
ret = ttm_bo_device_init(&dev_priv->bdev,
dev_priv->bo_global_ref.ref.object,
&vmw_bo_driver, VMWGFX_FILE_PAGE_OFFSET,
- false);
+ false,
+ dev->dev);
if (unlikely(ret != 0)) {
DRM_ERROR("Failed initializing TTM buffer object driver.\n");
goto out_err1;
diff --git a/include/drm/ttm/ttm_bo_driver.h b/include/drm/ttm/ttm_bo_driver.h
index efed082..2024a74 100644
--- a/include/drm/ttm/ttm_bo_driver.h
+++ b/include/drm/ttm/ttm_bo_driver.h
@@ -177,6 +177,7 @@ struct ttm_tt {
tt_unpopulated,
} state;
dma_addr_t *dma_address;
+ struct device *dev;
};
#define TTM_MEMTYPE_FLAG_FIXED (1 << 0) /* Fixed (on-card) PCI memory */
@@ -551,6 +552,7 @@ struct ttm_bo_device {
struct list_head device_list;
struct ttm_bo_global *glob;
struct ttm_bo_driver *driver;
+ struct device *dev;
rwlock_t vm_lock;
struct ttm_mem_type_manager man[TTM_NUM_MEM_TYPES];
spinlock_t fence_lock;
@@ -791,6 +793,8 @@ extern int ttm_bo_device_release(struct ttm_bo_device *bdev);
* @file_page_offset: Offset into the device address space that is available
* for buffer data. This ensures compatibility with other users of the
* address space.
+ * @need_dma32: Allocate pages under 4GB
+ * @dev: 'struct device' of the PCI device.
*
* Initializes a struct ttm_bo_device:
* Returns:
@@ -799,7 +803,8 @@ extern int ttm_bo_device_release(struct ttm_bo_device *bdev);
extern int ttm_bo_device_init(struct ttm_bo_device *bdev,
struct ttm_bo_global *glob,
struct ttm_bo_driver *driver,
- uint64_t file_page_offset, bool need_dma32);
+ uint64_t file_page_offset, bool need_dma32,
+ struct device *dev);
/**
* ttm_bo_unmap_virtual
diff --git a/include/drm/ttm/ttm_page_alloc.h b/include/drm/ttm/ttm_page_alloc.h
index 8062890..ccb6b7a 100644
--- a/include/drm/ttm/ttm_page_alloc.h
+++ b/include/drm/ttm/ttm_page_alloc.h
@@ -37,12 +37,14 @@
* @cstate: ttm caching state for the page.
* @count: number of pages to allocate.
* @dma_address: The DMA (bus) address of pages (if TTM_PAGE_FLAG_DMA32 set).
+ * @dev: struct device for appropriate DMA accounting.
*/
int ttm_get_pages(struct list_head *pages,
int flags,
enum ttm_caching_state cstate,
unsigned count,
- dma_addr_t *dma_address);
+ dma_addr_t *dma_address,
+ struct device *dev);
/**
* Put linked list of pages to pool.
*
@@ -52,12 +54,14 @@ int ttm_get_pages(struct list_head *pages,
* @flags: ttm flags for page allocation.
* @cstate: ttm caching state.
* @dma_address: The DMA (bus) address of pages (if TTM_PAGE_FLAG_DMA32 set).
+ * @dev: struct device for appropriate DMA accounting.
*/
void ttm_put_pages(struct list_head *pages,
unsigned page_count,
int flags,
enum ttm_caching_state cstate,
- dma_addr_t *dma_address);
+ dma_addr_t *dma_address,
+ struct device *dev);
/**
* Initialize pool allocator.
*/
We want to pass in the 'struct device' to the TTM layer in a nice way.
This is not strictly required, but it does make accounting of
pages and their DMA addresses correct when using CONFIG_DMA_API_DEBUG=y.
This patch builds on top of "ttm: Include the 'struct dev' when using the DMA API."
and moves the mechanism of passing in 'struct device' to the TTM API.
Signed-off-by: Konrad Rzeszutek Wilk <[email protected]>
---
drivers/gpu/drm/nouveau/nouveau_mem.c | 4 ++--
drivers/gpu/drm/radeon/radeon_ttm.c | 4 ++--
drivers/gpu/drm/ttm/ttm_bo.c | 4 +++-
drivers/gpu/drm/vmwgfx/vmwgfx_drv.c | 4 ++--
include/drm/ttm/ttm_bo_driver.h | 5 ++++-
5 files changed, 13 insertions(+), 8 deletions(-)
diff --git a/drivers/gpu/drm/nouveau/nouveau_mem.c b/drivers/gpu/drm/nouveau/nouveau_mem.c
index 7b57067..3706156 100644
--- a/drivers/gpu/drm/nouveau/nouveau_mem.c
+++ b/drivers/gpu/drm/nouveau/nouveau_mem.c
@@ -409,11 +409,11 @@ nouveau_mem_vram_init(struct drm_device *dev)
if (ret)
return ret;
- dev_priv->ttm.bdev.dev = dev->dev;
ret = ttm_bo_device_init(&dev_priv->ttm.bdev,
dev_priv->ttm.bo_global_ref.ref.object,
&nouveau_bo_driver, DRM_FILE_PAGE_OFFSET,
- dma_bits <= 32 ? true : false);
+ dma_bits <= 32 ? true : false,
+ dev->dev);
if (ret) {
NV_ERROR(dev, "Error initialising bo driver: %d\n", ret);
return ret;
diff --git a/drivers/gpu/drm/radeon/radeon_ttm.c b/drivers/gpu/drm/radeon/radeon_ttm.c
index d19bfcf..371890c 100644
--- a/drivers/gpu/drm/radeon/radeon_ttm.c
+++ b/drivers/gpu/drm/radeon/radeon_ttm.c
@@ -513,12 +513,12 @@ int radeon_ttm_init(struct radeon_device *rdev)
if (r) {
return r;
}
- rdev->mman.bdev.dev = rdev->dev;
/* No others user of address space so set it to 0 */
r = ttm_bo_device_init(&rdev->mman.bdev,
rdev->mman.bo_global_ref.ref.object,
&radeon_bo_driver, DRM_FILE_PAGE_OFFSET,
- rdev->need_dma32);
+ rdev->need_dma32,
+ rdev->dev);
if (r) {
DRM_ERROR("failed initializing buffer object driver(%d).\n", r);
return r;
diff --git a/drivers/gpu/drm/ttm/ttm_bo.c b/drivers/gpu/drm/ttm/ttm_bo.c
index af61fc2..278a2d3 100644
--- a/drivers/gpu/drm/ttm/ttm_bo.c
+++ b/drivers/gpu/drm/ttm/ttm_bo.c
@@ -1526,12 +1526,14 @@ int ttm_bo_device_init(struct ttm_bo_device *bdev,
struct ttm_bo_global *glob,
struct ttm_bo_driver *driver,
uint64_t file_page_offset,
- bool need_dma32)
+ bool need_dma32,
+ struct device *dev)
{
int ret = -EINVAL;
rwlock_init(&bdev->vm_lock);
bdev->driver = driver;
+ bdev->dev = dev;
memset(bdev->man, 0, sizeof(bdev->man));
diff --git a/drivers/gpu/drm/vmwgfx/vmwgfx_drv.c b/drivers/gpu/drm/vmwgfx/vmwgfx_drv.c
index 4a8c789..803d979 100644
--- a/drivers/gpu/drm/vmwgfx/vmwgfx_drv.c
+++ b/drivers/gpu/drm/vmwgfx/vmwgfx_drv.c
@@ -322,11 +322,11 @@ static int vmw_driver_load(struct drm_device *dev, unsigned long chipset)
ttm_lock_set_kill(&dev_priv->fbdev_master.lock, false, SIGTERM);
dev_priv->active_master = &dev_priv->fbdev_master;
- dev_priv->bdev.dev = dev->dev;
ret = ttm_bo_device_init(&dev_priv->bdev,
dev_priv->bo_global_ref.ref.object,
&vmw_bo_driver, VMWGFX_FILE_PAGE_OFFSET,
- false);
+ false,
+ dev->dev);
if (unlikely(ret != 0)) {
DRM_ERROR("Failed initializing TTM buffer object driver.\n");
goto out_err1;
diff --git a/include/drm/ttm/ttm_bo_driver.h b/include/drm/ttm/ttm_bo_driver.h
index 7589c0a..2024a74 100644
--- a/include/drm/ttm/ttm_bo_driver.h
+++ b/include/drm/ttm/ttm_bo_driver.h
@@ -793,6 +793,8 @@ extern int ttm_bo_device_release(struct ttm_bo_device *bdev);
* @file_page_offset: Offset into the device address space that is available
* for buffer data. This ensures compatibility with other users of the
* address space.
+ * @need_dma32: Allocate pages under 4GB
+ * @dev: 'struct device' of the PCI device.
*
* Initializes a struct ttm_bo_device:
* Returns:
@@ -801,7 +803,8 @@ extern int ttm_bo_device_release(struct ttm_bo_device *bdev);
extern int ttm_bo_device_init(struct ttm_bo_device *bdev,
struct ttm_bo_global *glob,
struct ttm_bo_driver *driver,
- uint64_t file_page_offset, bool need_dma32);
+ uint64_t file_page_offset, bool need_dma32,
+ struct device *dev);
/**
* ttm_bo_unmap_virtual
--
1.7.1
With this change, the accounting shown by 'debug_dma_dump_mappings()'
when CONFIG_DMA_API_DEBUG=y is attributed to the correct device
instead of to 'fallback'.
No functional change - just cosmetic.
At first it seems that we just need to keep a copy of the
'struct device' pointer in 'struct ttm_bo_device' and use that. However,
when 'ttm_tt_destroy' is called it sets ttm->be (which contains
the 'struct ttm_bo_device') to NULL, so we cannot use it. Hence
we copy the 'struct device' pointer into 'struct ttm_tt' and keep
it there.
[v2: Added 'struct device' in 'struct ttm_tt']
Signed-off-by: Konrad Rzeszutek Wilk <[email protected]>
---
drivers/gpu/drm/nouveau/nouveau_mem.c | 1 +
drivers/gpu/drm/radeon/radeon_ttm.c | 1 +
drivers/gpu/drm/ttm/ttm_page_alloc.c | 11 ++++++-----
drivers/gpu/drm/ttm/ttm_tt.c | 5 +++--
drivers/gpu/drm/vmwgfx/vmwgfx_drv.c | 2 +-
include/drm/ttm/ttm_bo_driver.h | 2 ++
include/drm/ttm/ttm_page_alloc.h | 8 ++++++--
7 files changed, 20 insertions(+), 10 deletions(-)
diff --git a/drivers/gpu/drm/nouveau/nouveau_mem.c b/drivers/gpu/drm/nouveau/nouveau_mem.c
index 26347b7..7b57067 100644
--- a/drivers/gpu/drm/nouveau/nouveau_mem.c
+++ b/drivers/gpu/drm/nouveau/nouveau_mem.c
@@ -409,6 +409,7 @@ nouveau_mem_vram_init(struct drm_device *dev)
if (ret)
return ret;
+ dev_priv->ttm.bdev.dev = dev->dev;
ret = ttm_bo_device_init(&dev_priv->ttm.bdev,
dev_priv->ttm.bo_global_ref.ref.object,
&nouveau_bo_driver, DRM_FILE_PAGE_OFFSET,
diff --git a/drivers/gpu/drm/radeon/radeon_ttm.c b/drivers/gpu/drm/radeon/radeon_ttm.c
index 9ead11f..d19bfcf 100644
--- a/drivers/gpu/drm/radeon/radeon_ttm.c
+++ b/drivers/gpu/drm/radeon/radeon_ttm.c
@@ -513,6 +513,7 @@ int radeon_ttm_init(struct radeon_device *rdev)
if (r) {
return r;
}
+ rdev->mman.bdev.dev = rdev->dev;
/* No others user of address space so set it to 0 */
r = ttm_bo_device_init(&rdev->mman.bdev,
rdev->mman.bo_global_ref.ref.object,
diff --git a/drivers/gpu/drm/ttm/ttm_page_alloc.c b/drivers/gpu/drm/ttm/ttm_page_alloc.c
index 737a2a2..35849db 100644
--- a/drivers/gpu/drm/ttm/ttm_page_alloc.c
+++ b/drivers/gpu/drm/ttm/ttm_page_alloc.c
@@ -664,7 +664,7 @@ out:
*/
int ttm_get_pages(struct list_head *pages, int flags,
enum ttm_caching_state cstate, unsigned count,
- dma_addr_t *dma_address)
+ dma_addr_t *dma_address, struct device *dev)
{
struct ttm_page_pool *pool = ttm_get_pool(flags, cstate);
struct page *p = NULL;
@@ -685,7 +685,7 @@ int ttm_get_pages(struct list_head *pages, int flags,
for (r = 0; r < count; ++r) {
if ((flags & TTM_PAGE_FLAG_DMA32) && dma_address) {
void *addr;
- addr = dma_alloc_coherent(NULL, PAGE_SIZE,
+ addr = dma_alloc_coherent(dev, PAGE_SIZE,
&dma_address[r],
gfp_flags);
if (addr == NULL)
@@ -730,7 +730,7 @@ int ttm_get_pages(struct list_head *pages, int flags,
printk(KERN_ERR TTM_PFX
"Failed to allocate extra pages "
"for large request.");
- ttm_put_pages(pages, 0, flags, cstate, NULL);
+ ttm_put_pages(pages, 0, flags, cstate, NULL, NULL);
return r;
}
}
@@ -741,7 +741,8 @@ int ttm_get_pages(struct list_head *pages, int flags,
/* Put all pages in pages list to correct pool to wait for reuse */
void ttm_put_pages(struct list_head *pages, unsigned page_count, int flags,
- enum ttm_caching_state cstate, dma_addr_t *dma_address)
+ enum ttm_caching_state cstate, dma_addr_t *dma_address,
+ struct device *dev)
{
unsigned long irq_flags;
struct ttm_page_pool *pool = ttm_get_pool(flags, cstate);
@@ -757,7 +758,7 @@ void ttm_put_pages(struct list_head *pages, unsigned page_count, int flags,
void *addr = page_address(p);
WARN_ON(!addr || !dma_address[r]);
if (addr)
- dma_free_coherent(NULL, PAGE_SIZE,
+ dma_free_coherent(dev, PAGE_SIZE,
addr,
dma_address[r]);
dma_address[r] = 0;
diff --git a/drivers/gpu/drm/ttm/ttm_tt.c b/drivers/gpu/drm/ttm/ttm_tt.c
index 86d5b17..354f9d9 100644
--- a/drivers/gpu/drm/ttm/ttm_tt.c
+++ b/drivers/gpu/drm/ttm/ttm_tt.c
@@ -110,7 +110,7 @@ static struct page *__ttm_tt_get_page(struct ttm_tt *ttm, int index)
INIT_LIST_HEAD(&h);
ret = ttm_get_pages(&h, ttm->page_flags, ttm->caching_state, 1,
- &ttm->dma_address[index]);
+ &ttm->dma_address[index], ttm->dev);
if (ret != 0)
return NULL;
@@ -304,7 +304,7 @@ static void ttm_tt_free_alloced_pages(struct ttm_tt *ttm)
}
}
ttm_put_pages(&h, count, ttm->page_flags, ttm->caching_state,
- ttm->dma_address);
+ ttm->dma_address, ttm->dev);
ttm->state = tt_unpopulated;
ttm->first_himem_page = ttm->num_pages;
ttm->last_lomem_page = -1;
@@ -397,6 +397,7 @@ struct ttm_tt *ttm_tt_create(struct ttm_bo_device *bdev, unsigned long size,
ttm->last_lomem_page = -1;
ttm->caching_state = tt_cached;
ttm->page_flags = page_flags;
+ ttm->dev = bdev->dev;
ttm->dummy_read_page = dummy_read_page;
diff --git a/drivers/gpu/drm/vmwgfx/vmwgfx_drv.c b/drivers/gpu/drm/vmwgfx/vmwgfx_drv.c
index 10ca97e..4a8c789 100644
--- a/drivers/gpu/drm/vmwgfx/vmwgfx_drv.c
+++ b/drivers/gpu/drm/vmwgfx/vmwgfx_drv.c
@@ -322,7 +322,7 @@ static int vmw_driver_load(struct drm_device *dev, unsigned long chipset)
ttm_lock_set_kill(&dev_priv->fbdev_master.lock, false, SIGTERM);
dev_priv->active_master = &dev_priv->fbdev_master;
-
+ dev_priv->bdev.dev = dev->dev;
ret = ttm_bo_device_init(&dev_priv->bdev,
dev_priv->bo_global_ref.ref.object,
&vmw_bo_driver, VMWGFX_FILE_PAGE_OFFSET,
diff --git a/include/drm/ttm/ttm_bo_driver.h b/include/drm/ttm/ttm_bo_driver.h
index efed082..7589c0a 100644
--- a/include/drm/ttm/ttm_bo_driver.h
+++ b/include/drm/ttm/ttm_bo_driver.h
@@ -177,6 +177,7 @@ struct ttm_tt {
tt_unpopulated,
} state;
dma_addr_t *dma_address;
+ struct device *dev;
};
#define TTM_MEMTYPE_FLAG_FIXED (1 << 0) /* Fixed (on-card) PCI memory */
@@ -551,6 +552,7 @@ struct ttm_bo_device {
struct list_head device_list;
struct ttm_bo_global *glob;
struct ttm_bo_driver *driver;
+ struct device *dev;
rwlock_t vm_lock;
struct ttm_mem_type_manager man[TTM_NUM_MEM_TYPES];
spinlock_t fence_lock;
diff --git a/include/drm/ttm/ttm_page_alloc.h b/include/drm/ttm/ttm_page_alloc.h
index 8062890..ccb6b7a 100644
--- a/include/drm/ttm/ttm_page_alloc.h
+++ b/include/drm/ttm/ttm_page_alloc.h
@@ -37,12 +37,14 @@
* @cstate: ttm caching state for the page.
* @count: number of pages to allocate.
* @dma_address: The DMA (bus) address of pages (if TTM_PAGE_FLAG_DMA32 set).
+ * @dev: struct device for appropriate DMA accounting.
*/
int ttm_get_pages(struct list_head *pages,
int flags,
enum ttm_caching_state cstate,
unsigned count,
- dma_addr_t *dma_address);
+ dma_addr_t *dma_address,
+ struct device *dev);
/**
* Put linked list of pages to pool.
*
@@ -52,12 +54,14 @@ int ttm_get_pages(struct list_head *pages,
* @flags: ttm flags for page allocation.
* @cstate: ttm caching state.
* @dma_address: The DMA (bus) address of pages (if TTM_PAGE_FLAG_DMA32 set).
+ * @dev: struct device for appropriate DMA accounting.
*/
void ttm_put_pages(struct list_head *pages,
unsigned page_count,
int flags,
enum ttm_caching_state cstate,
- dma_addr_t *dma_address);
+ dma_addr_t *dma_address,
+ struct device *dev);
/**
* Initialize pool allocator.
*/
--
1.7.1
Hi, Konrad,
Is passing a struct device to the DMA api really *strictly* necessary?
I'd like to avoid that at all cost, since we don't want pages that are
backing buffer objects (coherent pages) to be associated with a specific
device.
The reason for this is that we probably soon will want to move ttm
buffer objects between devices, and that should ideally be a simple
operation: If the memory type the buffer object currently resides in is
not shared between two devices, then move it out to system memory and
change its struct bo_device pointer.
If pages are associated with a specific device, this will become much
harder. Basically we need to change backing pages and copy all the data.
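A minimal sketch of that move might look like this (the helpers here are
hypothetical; only the bo->bdev pointer is an existing field):

  static int move_bo_to_device(struct ttm_buffer_object *bo,
                               struct ttm_bo_device *new_bdev)
  {
          int ret = 0;

          /* if the current memory type is not shared by both devices,
           * evict the bo to plain system memory first (hypothetical helpers) */
          if (!memtype_shared(bo, bo->bdev, new_bdev))
                  ret = ttm_bo_move_to_system(bo);
          if (ret)
                  return ret;

          bo->bdev = new_bdev;    /* then simply repoint the bo_device */
          return 0;
  }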
/Thomas
On Tue, Mar 08, 2011 at 09:52:54PM +0100, Thomas Hellstrom wrote:
> Hi, Konrad,
>
> Is passing a struct device to the DMA api really *strictly* necessary?
It is allowed. As said, it is a cleanup patch so it can be dropped.
>
> I'd like to avoid that at all cost, since we don't want pages that
> are backing buffer objects
> (coherent pages) to be associated with a specific device.
OK.
>
> The reason for this is that we probably soon will want to move ttm
> buffer objects between devices, and that should ideally be a simple
> operation: If the memory type the buffer object currently resides in
> is not shared between two devices, then move it out to system memory
> and change its struct bo_device pointer.
>
> If pages are associated with a specific device, this will become
> much harder. Basically we need to change backing pages and copy all
> the data.
<nods> Makes sense.
>
> /Thomas
>
>
On Tue, Mar 08, 2011 at 09:52:54PM +0100, Thomas Hellstrom wrote:
> Hi, Konrad,
>
> Is passing a struct device to the DMA api really *strictly* necessary?
So.. it seems it is: PowerPC, which I sadly didn't check, does require
this.
>
> I'd like to avoid that at all cost, since we don't want pages that
> are backing buffer objects
> (coherent pages) to be associated with a specific device.
>
> The reason for this is that we probably soon will want to move ttm
> buffer objects between devices, and that should ideally be a simple
> operation: If the memory type the buffer object currently resides in
> is not shared between two devices, then move it out to system memory
> and change its struct bo_device pointer.
I was thinking about this a bit after I found that PowerPC requires
the 'struct dev'. But first a question: what do you do with pages
that were allocated for a device that can do 64-bit DMA and are then
moved to a device that can only do 32-bit DMA? Obviously the 32-bit card
would set the TTM_PAGE_FLAG_DMA32 flag, but the 64-bit one would not.
What is the process then? Allocate a new page from the 32-bit device,
copy over the contents from the 64-bit TTM, and put the 64-bit TTM page?
>
> If pages are associated with a specific device, this will become
> much harder. Basically we need to change backing pages and copy all
What if you track it? Right now you need to track two things:
the 'struct page *' and the 'dma_addr_t'. What if you also had to track
the 'struct dev *' along with the page in question? Something like this:
diff --git a/include/drm/ttm/ttm_bo_driver.h b/include/drm/ttm/ttm_bo_driver.h
index efed082..1986761 100644
--- a/include/drm/ttm/ttm_bo_driver.h
+++ b/include/drm/ttm/ttm_bo_driver.h
@@ -158,9 +158,14 @@ enum ttm_caching_state {
* memory.
*/
+struct ttm_tt_page {
+ struct page *page;
+ dma_addr_t *dma_addr;
+ struct dev *dev;
+}
struct ttm_tt {
struct page *dummy_read_page;
- struct page **pages;
+ struct ttm_tt_page **pages;
long first_himem_page;
long last_lomem_page;
uint32_t page_flags;
@@ -176,7 +181,6 @@ struct ttm_tt {
tt_unbound,
tt_unpopulated,
} state;
- dma_addr_t *dma_address;
};
#define TTM_MEMTYPE_FLAG_FIXED (1 << 0) /* Fixed (on-card) PCI memory */
could do it. And when you pass the 'page' to the other TTM, it is just a matter
of passing in the 'struct ttm_tt_page' now.
On Tue, Mar 08, 2011 at 09:52:54PM +0100, Thomas Hellstrom wrote:
> Is passing a struct device to the DMA api really *strictly* necessary?
>
Yes.
> I'd like to avoid that at all cost, since we don't want pages that are
> backing buffer objects
> (coherent pages) to be associated with a specific device.
>
And what do you do when coherent memory for a given device can only be
obtained from that bus or that device alone?
This API permits for cases where system memory is non-coherent or you are
handling transactions across busses lacking coherency protocols between
each other and so on. There are plenty of cases where coherent DMA
buffers for PCI devices need to be allocated within the context of the
PCI bus and no other coherent memory is possible. More complex examples
exist as well. There are MFDs where DMA transactions are only coherent
with regards to local SRAM that in turn is mapped in and out of an 8051
microcontroller's address space, and so on. The kernel deals with all of
these sorts of cases by way of that dev pointer in the DMA API, and any
attempt to move beyond that scope makes non-portable assumptions about
the nature of coherent memory pools.
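For reference, this is the shape of the API in question; the 'dev' argument is
what selects the correct coherent pool / dma_ops on such platforms (plain usage
sketch):

  #include <linux/dma-mapping.h>

  void *cpu_addr;
  dma_addr_t bus_addr;

  /* 'dev' decides where coherent memory may legally come from */
  cpu_addr = dma_alloc_coherent(dev, PAGE_SIZE, &bus_addr, GFP_KERNEL);
  if (!cpu_addr)
          return -ENOMEM;
  /* ... the device DMAs to/from bus_addr while the CPU uses cpu_addr ... */
  dma_free_coherent(dev, PAGE_SIZE, cpu_addr, bus_addr);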
> The reason for this is that we probably soon will want to move ttm
> buffer objects between devices, and that should ideally be a simple
> operation: If the memory type the buffer object currently resides in is
> not shared between two devices, then move it out to system memory and
> change its struct bo_device pointer.
>
> If pages are associated with a specific device, this will become much
> harder. Basically we need to change backing pages and copy all the data.
>
That won't work if a device has consistent buffers locally but system
memory is non-coherent. A not too uncommon situation for embedded
platforms with a PCI bus hanging off of the CPU local bus.
On 03/22/2011 03:31 PM, Konrad Rzeszutek Wilk wrote:
> On Tue, Mar 08, 2011 at 09:52:54PM +0100, Thomas Hellstrom wrote:
>
>> Hi, Konrad,
>>
>> Is passing a struct device to the DMA api really *strictly* necessary?
>>
> Soo.. it seems it is on PowerPC, which I sadly didn't check for, does require
> this.
>
>> I'd like to avoid that at all cost, since we don't want pages that
>> are backing buffer objects
>> (coherent pages) to be associated with a specific device.
>>
>> The reason for this is that we probably soon will want to move ttm
>> buffer objects between devices, and that should ideally be a simple
>> operation: If the memory type the buffer object currently resides in
>> is not shared between two devices, then move it out to system memory
>> and change its struct bo_device pointer.
>>
> I was thinking about this a bit after I found that the PowerPC requires
> the 'struct dev'. But I got a question first, what do you with pages
> that were allocated to a device that can do 64-bit DMA and then
> move it to a device than can 32-bit DMA? Obviously the 32-bit card would
> set the TTM_PAGE_FLAG_DMA32 flag, but the 64-bit would not. What is the
> process then? Allocate a new page from the 32-bit device and then copy over the
> page from the 64-bit TTM and put the 64-bit TTM page?
>
Yes, in certain situations we need to copy, and if it's necessary in
some cases to use coherent memory with a struct device associated
with it, I agree it may be reasonable to do a copy in that case as
well. I'm against, however, making that the default case when
running on bare metal.
However, I've looked a bit deeper into all this, and it looks like we
already have other problems that need to be addressed, and they exist
with the code already in git:
Consider a situation where you allocate a cached DMA32 page from the ttm
page allocator. You'll end up with a coherent page. Then you make it
uncached and finally you return it to the ttm page allocator. Since it's
uncached, it will not be freed by the dma api, but kept in the uncached
pool, and later the incorrect page free function will be called.
I think we might need to take a few steps back and rethink this whole idea:
1) How does this work in the AGP case? Let's say you allocate
write-combined DMA32 pages from the ttm page pool (in this case you
won't get coherent memory) and then use them in an AGP gart? Why is it
that we don't need coherent pages then in the Xen case?
2) http://www.mjmwired.net/kernel/Documentation/DMA-API.txt, line 33
makes me scared.
We should identify what platforms may have problems with this.
3) When hacking on the unichrome DMA engine it wasn't that hard to use
the synchronization functions of the DMA api correctly:
When binding a TTM, the backend calls dma_map_page() on the pages; when
unbinding, the backend calls dma_unmap_page(); and if we need cpu access
while bound, we call dma_sync_single_for_[cpu|device]. If this is
done, it will be harder to implement user-space sub-allocation, but
possible. There will be a performance loss on some platforms, though.
/Thomas
> >I was thinking about this a bit after I found that the PowerPC requires
> >the 'struct dev'. But I got a question first, what do you with pages
> >that were allocated to a device that can do 64-bit DMA and then
> >move it to a device than can 32-bit DMA? Obviously the 32-bit card would
> >set the TTM_PAGE_FLAG_DMA32 flag, but the 64-bit would not. What is the
> >process then? Allocate a new page from the 32-bit device and then copy over the
> >page from the 64-bit TTM and put the 64-bit TTM page?
>
> Yes, in certain situations we need to copy, and if it's necessary in
> some cases to use coherent memory with a struct device assoicated
> with it, I agree it may be reasonable to do a copy in that case as
> well. I'm against, however, to make that the default case when
> running on bare metal.
This situation could occur on native/bare metal. When you say 'default
case', do you mean for every type of page, without consulting whether it
has TTM_PAGE_FLAG_DMA32 set?
>
> However, I've looked a bit deeper into all this, and it looks like
> we already have other problems that need to be addressed, and that
> exists with the code already in git:
>
> Consider a situation where you allocate a cached DMA32 page from the
> ttm page allocator. You'll end up with a coherent page. Then you
> make it uncached and finally you return it to the ttm page
> allocator. Since it's uncached, it will not be freed by the dma api,
> but kept in the uncached pool, and later the incorrect page free
> function will be called.
Let me look at the details in the code, but I thought it would check
TTM_PAGE_FLAG_DMA32 and direct the page to the correct pool?
We could piggyback on the idea of the struct I had and have these
values:
struct ttm_page {
struct page *page;
dma_addr_t *bus_addr;
struct ttm_pool *origin;
};
And the origin would point to the correct pool so that on de-allocation
the page would be diverted back to the original one. Naturally there would
be some thinking to be done on the de-alloc path so that
*origin isn't pointing to something that has already been freed.
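i.e. something along these lines (a purely hypothetical sketch; neither the
'origin' pool type nor the free_page() callback exist as such):

  struct ttm_page {
          struct page *page;
          dma_addr_t bus_addr;
          struct ttm_pool *origin;   /* the pool this page came from */
  };

  static void ttm_page_put_back(struct ttm_page *p)
  {
          /* always return the page to its originating pool, so a page that
           * came from dma_alloc_coherent() is never freed as a plain page */
          p->origin->free_page(p->origin, p);
  }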
>
> I think we might need to take a few steps back and rethink this whole idea:
>
> 1) How does this work in the AGP case? Let's say you allocate
> write-combined DMA32 pages from the ttm page pool (in this case you
> won't get coherent memory) and then use them in an AGP gart? Why is
> it that we don't need coherent pages then in the Xen case?
Hehe.. So I had posted a set of patches to carry the 'dma_addr_t' through
the AGP API and then to its backends to program that, and also through the
frontends (so DRM, TTM). Here is the
patchset I posted some time ago:
http://linux.derkeiler.com/Mailing-Lists/Kernel/2010-12/msg02382.html
and the discussion:
http://linux.derkeiler.com/Mailing-Lists/Kernel/2010-12/msg02411.html
Dave recommended I skip AGP and just concentrate on PCIe since not too many
folks use AGP anymore. Though I realized that server boards use PCI
cards (ATI ES1000), which do utilize the AGP API. So there is breakage there,
and I have a set of patches for this that I was in the process of rebasing
on 2.6.39-rcX.
>
> 2) http://www.mjmwired.net/kernel/Documentation/DMA-API.txt, line 33
> makes me scared.
> We should identify what platforms may have problems with this.
Right.. I think nobody thought much about this in the context of TTM since
it was only used on x86. I can take a look at the DMA APIs of the
other two major platforms, IA64 and PPC, and see what lurks there.
>
> 3) When hacking on the unichrome DMA engine it wasn't that hard to
> use the synchronization functions of the DMA api correctly:
>
> When binding a TTM, the backend calls dma_map_page() on pages, When
> unbinding, the backend calls dma_unmap_page(), If we need cpu access
> when bound, we need to call dma_sync_single_for_[cpu|device]. If
> this is done, it will be harder to implement user-space
> sub-allocation, but possible. There will be a performance loss on
> some platforms, though.
Yup. That was my other suggestion about this. But I had no idea
where to sprinkle those 'dma_sync_single_[*]' calls, as they would
have to be done in the drivers. Probably in their DMA paths, right before
telling the GPU to process the CP, and when receiving the interrupt
once the CP has completed.
On 03/23/2011 01:51 PM, Konrad Rzeszutek Wilk wrote:
>>> I was thinking about this a bit after I found that the PowerPC requires
>>> the 'struct dev'. But I got a question first, what do you with pages
>>> that were allocated to a device that can do 64-bit DMA and then
>>> move it to a device than can 32-bit DMA? Obviously the 32-bit card would
>>> set the TTM_PAGE_FLAG_DMA32 flag, but the 64-bit would not. What is the
>>> process then? Allocate a new page from the 32-bit device and then copy over the
>>> page from the 64-bit TTM and put the 64-bit TTM page?
>>>
>> Yes, in certain situations we need to copy, and if it's necessary in
>> some cases to use coherent memory with a struct device assoicated
>> with it, I agree it may be reasonable to do a copy in that case as
>> well. I'm against, however, to make that the default case when
>> running on bare metal.
>>
> This situation could occur on native/baremetal. When you say 'default
> case' you mean for every type of page without consulting whether it
> had the TTM_PAGE_FLAG_DMA32?
>
No. Basically I mean that a device which runs perfectly fine with
alloc_pages(DMA32) on bare metal shouldn't need to be using
dma_alloc_coherent() on bare metal, because that would mean we'd need
to take the copy path above.
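In other words (a sketch of the distinction):

  /* device-agnostic: just a page below 4GB, usable with any device later */
  struct page *page = alloc_page(GFP_USER | GFP_DMA32);

  /* device-bound: coherent memory allocated on behalf of this particular
   * 'dev'; this is what would force the copy path above on a move */
  dma_addr_t bus;
  void *vaddr = dma_alloc_coherent(dev, PAGE_SIZE, &bus, GFP_KERNEL);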
>> However, I've looked a bit deeper into all this, and it looks like
>> we already have other problems that need to be addressed, and that
>> exists with the code already in git:
>>
>> Consider a situation where you allocate a cached DMA32 page from the
>> ttm page allocator. You'll end up with a coherent page. Then you
>> make it uncached and finally you return it to the ttm page
>> allocator. Since it's uncached, it will not be freed by the dma api,
>> but kept in the uncached pool, and later the incorrect page free
>> function will be called.
>>
> Let me look in details in the code, but I thought it would check the
> TTM_PAGE_FLAG_DMA32 and direct the page to the correct pool?
>
> We could piggyback on the idea of the struct I had and have these
> values:
>
> struct ttm_page {
> struct page *page;
> dma_addr_t *bus_addr;
> struct *ttm_pool *origin;
> }
>
> And the origin would point to the correct pool so that on de-allocate
> it would divert it to the original one. Naturally there would
> be some thinking to be done on the de-alloc path so that
> the *origin isn't pointing to something htat has already been free-d.
>
The idea with these pools is to keep a cache of write-combined pages, so
the pool is determined by the caching status of the page when it is
returned to the page allocator.
If we were to associate a page with a device, we'd basically need one
such pool per device.
>
>> I think we might need to take a few steps back and rethink this whole idea:
>>
>> 1) How does this work in the AGP case? Let's say you allocate
>> write-combined DMA32 pages from the ttm page pool (in this case you
>> won't get coherent memory) and then use them in an AGP gart? Why is
>> it that we don't need coherent pages then in the Xen case?
>>
> Hehe.. So I had posted a set of patches to carry the 'dma_addr_t' through
> the AGP API and then to its backends to program that. And also the frontends
> (so DRM, TTM) Here is the
> patchset I posted some time ago:
>
> http://linux.derkeiler.com/Mailing-Lists/Kernel/2010-12/msg02382.html
> and the discussion:
>
> http://linux.derkeiler.com/Mailing-Lists/Kernel/2010-12/msg02411.html
>
> Dave recommended I skip AGP and just concentrate on PCIe since not to many
> folks use AGP anymore. Thought I realized that server boards use PCI
> cards (ATI ES1000), which do utilize the AGP API. So there is breakage there
> and I have a set of patches for this that I was in process of rebasing
> on 2.6.39-rcX.
>
I see. Well, both in the AGP case and sometimes in the PCIe case (some devices can use a non-cache-coherent PCIe mode for speed), it's not an uncommon use case for TTM to alloc cached pages and transition them to write-combined (see impact below).
>
>> 2) http://www.mjmwired.net/kernel/Documentation/DMA-API.txt, line 33
>> makes me scared.
>> We should identify what platforms may have problems with this.
>>
> Right.. I think nobody much thought about this in context of TTM since
> that was only used on X86. I can take a look at the DMA API's of the
> other two major platforms: IA64 and PPC and see what lurks there.
>
>
>> 3) When hacking on the unichrome DMA engine it wasn't that hard to
>> use the synchronization functions of the DMA api correctly:
>>
>> When binding a TTM, the backend calls dma_map_page() on pages, When
>> unbinding, the backend calls dma_unmap_page(), If we need cpu access
>> when bound, we need to call dma_sync_single_for_[cpu|device]. If
>> this is done, it will be harder to implement user-space
>> sub-allocation, but possible. There will be a performance loss on
>> some platforms, though.
>>
> Yup. That was my other suggestion about this. But I had no idea
> where to sprinkle those 'dma_sync_single_[*]' calls, as they would
> have been done in the drivers. Probably on its DMA paths, right before
> telling the GPU to process the CP, and when receiving an interrupt
> when the CP has been completed.
>
In this case dma_sync_single_for_device() would be needed for a bo when it was validated for a PCI dma engine. At the same time, all user-space virtual mappings of the buffer would need to be killed.
A dma_sync_single_for_cpu() would be needed as part of making a bo idle.
/Thomas
On Wed, Mar 23, 2011 at 02:17:18PM +0100, Thomas Hellstrom wrote:
> On 03/23/2011 01:51 PM, Konrad Rzeszutek Wilk wrote:
> >>>I was thinking about this a bit after I found that the PowerPC requires
> >>>the 'struct dev'. But I got a question first, what do you with pages
> >>>that were allocated to a device that can do 64-bit DMA and then
> >>>move it to a device than can 32-bit DMA? Obviously the 32-bit card would
> >>>set the TTM_PAGE_FLAG_DMA32 flag, but the 64-bit would not. What is the
> >>>process then? Allocate a new page from the 32-bit device and then copy over the
> >>>page from the 64-bit TTM and put the 64-bit TTM page?
> >>Yes, in certain situations we need to copy, and if it's necessary in
> >>some cases to use coherent memory with a struct device assoicated
> >>with it, I agree it may be reasonable to do a copy in that case as
> >>well. I'm against, however, to make that the default case when
> >>running on bare metal.
> >This situation could occur on native/baremetal. When you say 'default
> >case' you mean for every type of page without consulting whether it
> >had the TTM_PAGE_FLAG_DMA32?
>
> No, Basically I mean a device that runs perfectly fine with
> alloc_pages(DMA32) on bare metal shouldn't need to be using
> dma_alloc_coherent() on bare metal, because that would mean we'd need
> to take the copy path above.
I think we got the scenarios confused (or I did at least).
In the scenario I used ("I was thinking.."), the 64-bit device would do
alloc_page(GFP_HIGHUSER), and if you were to move the page to a 32-bit device
it would have to make a copy, as the 32-bit device could not reach a page
allocated with GFP_HIGHUSER.
The other scenario, which I think is the one you are using, is that
we have a 32-bit device allocating a page, so TTM_PAGE_FLAG_DMA32 is set,
and then if we were to move it to a 64-bit device it would need to be
copied. But I don't think that is the case - the page would be
reachable by the 64-bit device. Help me out please if I am misunderstanding this.
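For reference, the allocation-side difference between the two scenarios boils down to the gfp mask (a simplified sketch; ttm_alloc_one_page is a made-up helper, not the actual ttm allocator code):

#include <linux/gfp.h>
#include <linux/types.h>

/* A 64-bit capable device can take pages from anywhere (GFP_HIGHUSER);
 * a device that set TTM_PAGE_FLAG_DMA32 must get memory below 4GB. */
static struct page *ttm_alloc_one_page(bool need_dma32)
{
	gfp_t gfp = GFP_USER;

	if (need_dma32)
		gfp |= GFP_DMA32;
	else
		gfp |= __GFP_HIGHMEM;	/* GFP_USER | __GFP_HIGHMEM == GFP_HIGHUSER */

	return alloc_page(gfp);
}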
>
> >>However, I've looked a bit deeper into all this, and it looks like
> >>we already have other problems that need to be addressed, and that
> >>exists with the code already in git:
> >>
> >>Consider a situation where you allocate a cached DMA32 page from the
> >>ttm page allocator. You'll end up with a coherent page. Then you
> >>make it uncached and finally you return it to the ttm page
> >>allocator. Since it's uncached, it will not be freed by the dma api,
> >>but kept in the uncached pool, and later the incorrect page free
> >>function will be called.
> >Let me look in details in the code, but I thought it would check the
> >TTM_PAGE_FLAG_DMA32 and direct the page to the correct pool?
> >
> >We could piggyback on the idea of the struct I had and have these
> >values:
> >
> >struct ttm_page {
> > struct page *page;
> > dma_addr_t *bus_addr;
> > struct *ttm_pool *origin;
> >}
> >
> >And the origin would point to the correct pool so that on de-allocate
> >it would divert it to the original one. Naturally there would
> >be some thinking to be done on the de-alloc path so that
> >the *origin isn't pointing to something htat has already been free-d.
>
> The idea with these pools is to keep a cache of write-combined
> pages, so the pool
> is determined by the caching status of the page when it is returned
> to the page
> allocator.
> If we were to associate a page with a device, we'd basically need
> one such pool per
> device.
OK, I think I get it. The issue with the struct I mentioned above is that
a page that was initially allocated as coherent, and then made uncached,
would on free go back to the pool that only deals with 'coherent' pages -
which is wrong.
I had an earlier suggestion which was:

struct ttm_page {
	struct page *page;
	dma_addr_t *bus_addr;
	struct device *dev;
};

This way, even if the page was to become uncached, you could still have it
associated with the right device. Granted, at that point the page is not
coherent anymore, it is uncached.
The problem is that on freeing we would be using 'dma_free_coherent'
on a non-coherent page, which we should not do. Would it be possible
then to make it coherent when we free it? Say, as part of freeing
everything it would check which pool the page originally belonged to
(so back to the struct with 'struct ttm_pool *origin') and
convert it back to coherent as part of "moving" it to the original
pool?
> >}
>
>
> >>I think we might need to take a few steps back and rethink this whole idea:
> >>
> >>1) How does this work in the AGP case? Let's say you allocate
> >>write-combined DMA32 pages from the ttm page pool (in this case you
> >>won't get coherent memory) and then use them in an AGP gart? Why is
> >>it that we don't need coherent pages then in the Xen case?
> >Hehe.. So I had posted a set of patches to carry the 'dma_addr_t' through
> >the AGP API and then to its backends to program that. And also the frontends
> >(so DRM, TTM) Here is the
> >patchset I posted some time ago:
> >
> >http://linux.derkeiler.com/Mailing-Lists/Kernel/2010-12/msg02382.html
> >and the discussion:
> >
> >http://linux.derkeiler.com/Mailing-Lists/Kernel/2010-12/msg02411.html
> >
> >Dave recommended I skip AGP and just concentrate on PCIe since not to many
> >folks use AGP anymore. Thought I realized that server boards use PCI
> >cards (ATI ES1000), which do utilize the AGP API. So there is breakage there
> >and I have a set of patches for this that I was in process of rebasing
> >on 2.6.39-rcX.
>
> I see. Well, both in the AGP case and sometimes in the PCIe case,
> (some devices can use a non-cache-coherent PCIe mode for speed)
> it's a not too uncommon use case of TTM to alloc cached pages and
> transition them to write-combined (see impact below).
>
> >>2) http://www.mjmwired.net/kernel/Documentation/DMA-API.txt, line 33
> >>makes me scared.
> >>We should identify what platforms may have problems with this.
> >Right.. I think nobody much thought about this in context of TTM since
> >that was only used on X86. I can take a look at the DMA API's of the
> >other two major platforms: IA64 and PPC and see what lurks there.
> >
> >>3) When hacking on the unichrome DMA engine it wasn't that hard to
> >>use the synchronization functions of the DMA api correctly:
> >>
> >> When binding a TTM, the backend calls dma_map_page() on pages, When
> >>unbinding, the backend calls dma_unmap_page(), If we need cpu access
> >>when bound, we need to call dma_sync_single_for_[cpu|device]. If
> >>this is done, it will be harder to implement user-space
> >>sub-allocation, but possible. There will be a performance loss on
> >>some platforms, though.
> >Yup. That was my other suggestion about this. But I had no idea
> >where to sprinkle those 'dma_sync_single_[*]' calls, as they would
> >have been done in the drivers. Probably on its DMA paths, right before
> >telling the GPU to process the CP, and when receiving an interrupt
> >when the CP has been completed.
>
> In this case dma_sync_single_for_device() would be needed for a bo
> when it was validated for a
> PCI dma engine. At the same time, all user-space virtual mappings of
> the buffer would need to be killed.
We need to kill the user-space virtual mappings b/c they might contain
the wrong caching state? And since they are an alias of the "synced"
page, the "synced" page might be uncached, while the alias is "write-combined".
Ugh. But I thought the WB or WC is done on the pages, not on the
mappings. So it would use the MTRR or PAT, which deal with physical addresses.
So it would not matter what the alias thought it was?
> A dma_sync_single_for_cpu() would be needed as part of making a bo idle.
>
>
> /Thomas
On Wed, Mar 23, 2011 at 8:51 AM, Konrad Rzeszutek Wilk
<[email protected]> wrote:
>> >I was thinking about this a bit after I found that the PowerPC requires
>> >the 'struct dev'. But I got a question first, what do you with pages
>> >that were allocated to a device that can do 64-bit DMA and then
>> >move it to a device than can 32-bit DMA? Obviously the 32-bit card would
>> >set the TTM_PAGE_FLAG_DMA32 flag, but the 64-bit would not. What is the
>> >process then? Allocate a new page from the 32-bit device and then copy over the
>> >page from the 64-bit TTM and put the 64-bit TTM page?
>>
>> Yes, in certain situations we need to copy, and if it's necessary in
>> some cases to use coherent memory with a struct device assoicated
>> with it, I agree it may be reasonable to do a copy in that case as
>> well. I'm against, however, to make that the default case when
>> running on bare metal.
>
> This situation could occur on native/baremetal. When you say 'default
> case' you mean for every type of page without consulting whether it
> had the TTM_PAGE_FLAG_DMA32?
>>
>> However, I've looked a bit deeper into all this, and it looks like
>> we already have other problems that need to be addressed, and that
>> exists with the code already in git:
>>
>> Consider a situation where you allocate a cached DMA32 page from the
>> ttm page allocator. You'll end up with a coherent page. Then you
>> make it uncached and finally you return it to the ttm page
>> allocator. Since it's uncached, it will not be freed by the dma api,
>> but kept in the uncached pool, and later the incorrect page free
>> function will be called.
>
> Let me look in details in the code, but I thought it would check the
> TTM_PAGE_FLAG_DMA32 and direct the page to the correct pool?
>
> We could piggyback on the idea of the struct I had and have these
> values:
>
> struct ttm_page {
>        struct page *page;
>        dma_addr_t      *bus_addr;
>        struct *ttm_pool        *origin;
> }
>
> And the origin would point to the correct pool so that on de-allocate
> it would divert it to the original one. Naturally there would
> be some thinking to be done on the de-alloc path so that
> the *origin isn't pointing to something htat has already been free-d.
>
>>
>> I think we might need to take a few steps back and rethink this whole idea:
>>
>> 1) How does this work in the AGP case? Let's say you allocate
>> write-combined DMA32 pages from the ttm page pool (in this case you
>> won't get coherent memory) and then use them in an AGP gart? Why is
>> it that we don't need coherent pages then in the Xen case?
>
> Hehe.. So I had posted a set of patches to carry the 'dma_addr_t' through
> the AGP API and then to its backends to program that. And also the frontends
> (so DRM, TTM) Here is the
> patchset I posted some time ago:
>
> http://linux.derkeiler.com/Mailing-Lists/Kernel/2010-12/msg02382.html
> and the discussion:
>
> http://linux.derkeiler.com/Mailing-Lists/Kernel/2010-12/msg02411.html
>
> Dave recommended I skip AGP and just concentrate on PCIe since not to many
> folks use AGP anymore. Thought I realized that server boards use PCI
> cards (ATI ES1000), which do utilize the AGP API. So there is breakage there
> and I have a set of patches for this that I was in process of rebasing
> on 2.6.39-rcX.
Actually the PCI gart mechanism uses cached pages rather than uncached
on the pre-pcie asics. On pcie asics, the on-board gart can use both
cached and uncached pages. There is a flag in the gpu's page table to
specify whether the pages need a snooped cycle or not. At the moment
we only use cached pages for the onboard gart, but there are
performance advantages for using uncached pages for certain types of
buffers (pretty much everything that doesn't require CPU reads). That
may be something we want to take advantage of in the future. FWIW,
all AGP radeons can use the onboard gart instead of or in addition to
AGP.
Alex
>
>>
>> 2) http://www.mjmwired.net/kernel/Documentation/DMA-API.txt, line 33
>> makes me scared.
>> We should identify what platforms may have problems with this.
>
> Right.. I think nobody much thought about this in context of TTM since
> that was only used on X86. I can take a look at the DMA API's of the
> other two major platforms: IA64 and PPC and see what lurks there.
>
>>
>> 3) When hacking on the unichrome DMA engine it wasn't that hard to
>> use the synchronization functions of the DMA api correctly:
>>
>> When binding a TTM, the backend calls dma_map_page() on pages, When
>> unbinding, the backend calls dma_unmap_page(), If we need cpu access
>> when bound, we need to call dma_sync_single_for_[cpu|device]. If
>> this is done, it will be harder to implement user-space
>> sub-allocation, but possible. There will be a performance loss on
>> some platforms, though.
>
> Yup. That was my other suggestion about this. But I had no idea
> where to sprinkle those 'dma_sync_single_[*]' calls, as they would
> have been done in the drivers. Probably on its DMA paths, right before
> telling the GPU to process the CP, and when receiving an interrupt
> when the CP has been completed.
>
On 03/23/2011 03:52 PM, Konrad Rzeszutek Wilk wrote:
> On Wed, Mar 23, 2011 at 02:17:18PM +0100, Thomas Hellstrom wrote:
>
>> On 03/23/2011 01:51 PM, Konrad Rzeszutek Wilk wrote:
>>
>>>>> I was thinking about this a bit after I found that the PowerPC requires
>>>>> the 'struct dev'. But I got a question first, what do you with pages
>>>>> that were allocated to a device that can do 64-bit DMA and then
>>>>> move it to a device than can 32-bit DMA? Obviously the 32-bit card would
>>>>> set the TTM_PAGE_FLAG_DMA32 flag, but the 64-bit would not. What is the
>>>>> process then? Allocate a new page from the 32-bit device and then copy over the
>>>>> page from the 64-bit TTM and put the 64-bit TTM page?
>>>>>
>>>> Yes, in certain situations we need to copy, and if it's necessary in
>>>> some cases to use coherent memory with a struct device assoicated
>>>> with it, I agree it may be reasonable to do a copy in that case as
>>>> well. I'm against, however, to make that the default case when
>>>> running on bare metal.
>>>>
>>> This situation could occur on native/baremetal. When you say 'default
>>> case' you mean for every type of page without consulting whether it
>>> had the TTM_PAGE_FLAG_DMA32?
>>>
>> No, Basically I mean a device that runs perfectly fine with
>> alloc_pages(DMA32) on bare metal shouldn't need to be using
>> dma_alloc_coherent() on bare metal, because that would mean we'd need
>> to take the copy path above.
>>
> I think we got the scenarios confused (or I did at least).
> The scenario I used ("I was thinking.."), the 64-bit device would do
> alloc_page(GFP_HIGHUSER) and if you were to move it to a 32-bit device
> it would have to make a copy of the page as it could not reach the page
> from GFP_HIGUSER.
>
> The other scenario, which I think is what you are using, is that
> we have a 32-bit device allocating a page, so TTM_PAGE_FLAG_DMA32 is set
> and then we if we were to move it a 64-bit device it would need to
> copied. But I don't think that is the case - the page would be
> reachable by the 64-bit device. Help me out please if I am misunderstanding this.
>
Yes, this is completely correct.
Now, with a struct dev attached to each page in a 32-bit system (coherent memory), we would need to always copy in the 32-bit case, since you can't hand over pages belonging to other physical devices.
On bare metal you don't need coherent memory, but in this case you would need to copy anyway, because you chose to allocate coherent memory.
I see a sort of a hackish way around these problems.
Let's say ttm were to try to detect a hypervisor dummy virtual device sitting on the pci bus. That device would perhaps provide pci information detailing what GFP masks need coherent memory allocation. The TTM page pool could then grab that device and create a struct dev to use for allocating "anonymous" TTM BO memory.
Could that be a way forward? The struct dev would then be private to the page pool code, bare metal wouldn't need to allocate coherent memory, since the virtual device wouldn't be present. The page pool code would need to be updated to be able to also cache coherent pages.
Xen would need to create such a device in the guest with a suitable PCI ID that it would be explicitly willing to share with other hypervisor suppliers...
It's ugly, I know, but it might work...
Thomas
On Thu, Mar 24, 2011 at 08:52:20AM +0100, Thomas Hellstrom wrote:
> On 03/23/2011 03:52 PM, Konrad Rzeszutek Wilk wrote:
> >On Wed, Mar 23, 2011 at 02:17:18PM +0100, Thomas Hellstrom wrote:
> >>On 03/23/2011 01:51 PM, Konrad Rzeszutek Wilk wrote:
> >>>>>I was thinking about this a bit after I found that the PowerPC requires
> >>>>>the 'struct dev'. But I got a question first, what do you with pages
> >>>>>that were allocated to a device that can do 64-bit DMA and then
> >>>>>move it to a device than can 32-bit DMA? Obviously the 32-bit card would
> >>>>>set the TTM_PAGE_FLAG_DMA32 flag, but the 64-bit would not. What is the
> >>>>>process then? Allocate a new page from the 32-bit device and then copy over the
> >>>>>page from the 64-bit TTM and put the 64-bit TTM page?
> >>>>Yes, in certain situations we need to copy, and if it's necessary in
> >>>>some cases to use coherent memory with a struct device assoicated
> >>>>with it, I agree it may be reasonable to do a copy in that case as
> >>>>well. I'm against, however, to make that the default case when
> >>>>running on bare metal.
> >>>This situation could occur on native/baremetal. When you say 'default
> >>>case' you mean for every type of page without consulting whether it
> >>>had the TTM_PAGE_FLAG_DMA32?
> >>No, Basically I mean a device that runs perfectly fine with
> >>alloc_pages(DMA32) on bare metal shouldn't need to be using
> >>dma_alloc_coherent() on bare metal, because that would mean we'd need
> >>to take the copy path above.
> >I think we got the scenarios confused (or I did at least).
> >The scenario I used ("I was thinking.."), the 64-bit device would do
> >alloc_page(GFP_HIGHUSER) and if you were to move it to a 32-bit device
> >it would have to make a copy of the page as it could not reach the page
> >from GFP_HIGUSER.
> >
> >The other scenario, which I think is what you are using, is that
> >we have a 32-bit device allocating a page, so TTM_PAGE_FLAG_DMA32 is set
> >and then we if we were to move it a 64-bit device it would need to
> >copied. But I don't think that is the case - the page would be
> >reachable by the 64-bit device. Help me out please if I am misunderstanding this.
>
> Yes, this is completely correct.
>
> Now, with a struct dev attached to each page in a 32-bit system
> (coherent memory)
> we would need to always copy in the 32-bit case, since you can't
> hand over pages
> belonging to other physical devices.
> But on bare metal you don't need coherent memory, but in this case you
> need to copy anyway becase you choose to allocate coherent memory.
Ok, can we go back one more step. I am unclear about one other
thing. Let's think about this in terms of 2.6.38 code.
When a page in the TTM pool is being moved back and forth and also changes
its caching model, what happens on the free path? Is the original caching
state put back on it? Say I allocate a DMA32 page (GFP_DMA32) and move it
to another pool for another radeon device. I also do some cache changes:
make it write-back, then un-cached, then write-back again, and when I am done I
return it back to the pool (DMA32). Once that is done I want to unload
the DRM/TTM driver. Does that page get its caching state reverted
back to what it originally had (probably un-cached)? And where is this done?
>
> I see a sort of a hackish way around these problems.
>
> Let's say ttm were trying to detect a hypervisor dummy virtual
> device sitting on the pci bus. That device would perhaps provide pci
> information detailing what GFP masks needing to
> allocate coherent memory. The TTM page pool could then grab that
> device and create a struct dev to use for allocating "anonymous" TTM
> BO memory.
>
> Could that be a way forward? The struct dev would then be private to
> the page pool code, bare metal wouldn't need to allocate coherent
> memory, since the virtual device wouldn't be present. The page pool
> code would need to be updated to be able to cache also coherent
> pages.
>
> Xen would need to create such a device in the guest with a suitable
> PCI ID that it would be explicitly willing to share with other
> hypervisor suppliers....
>
> It's ugly, I know, but it might work...
Or just create a phantom 'struct device' that is used with the TTM layer,
and have it carry the lowest common denominator of the different graphics
cards that exist. But that is a bit strange b/c for some machines (IBM boxes)
you have per-device DMA API specific calls. This depends on what PCI bus
the card sits on, as some of these boxes have multiple IOMMUs. So that
wouldn't work that nicely.
How about a backend TTM alloc API? The calls to 'ttm_get_page'
and 'ttm_put_page' would call into a TTM-alloc API to do the allocation.
The default one is the native one, and it would have those 'dma_alloc_coherent'
calls removed. When booting under a virtualized environment, a
virtualisation-"friendly" backend TTM alloc would register and all calls
to 'put/get/probe' would be diverted to it. 'probe' would obviously check
whether it should use this backend or not.
It would mean two new files: drivers/gpu/drm/ttm/ttm-memory-xen.c and
a ttm-memory-generic.c, and some header work.
It would still need to keep the 'dma_address[i]' around so that
those can be passed to the radeon/nouveau GTT, but for native it
could just contain BAD_DMA_ADDRESS - and the code in the radeon/nouveau
GTT binding would be smart enough to do 'pci_map_single' if the
dma_addr_t is BAD_DMA_ADDRESS (see the sketch below).
The issue here is with the caching I had a question about. We
would need to reset the caching state back to the original one
before freeing it. So does the TTM pool de-alloc code deal with this?
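A rough sketch of that GTT-binding fallback (illustrative only: gart_bind_one is a made-up name, BAD_DMA_ADDRESS is defined here just as a placeholder sentinel, and whether a driver uses pci_map_single or pci_map_page is its own business):

#include <linux/pci.h>
#include <linux/mm.h>

#define BAD_DMA_ADDRESS	((dma_addr_t)~0)	/* placeholder sentinel */

/* If the TTM allocator did not hand us a bus address (the native,
 * non-coherent case), map the page ourselves; otherwise reuse the
 * address the allocator already provided. */
static dma_addr_t gart_bind_one(struct pci_dev *pdev, struct page *page,
				dma_addr_t addr)
{
	if (addr == BAD_DMA_ADDRESS)
		addr = pci_map_page(pdev, page, 0, PAGE_SIZE,
				    PCI_DMA_BIDIRECTIONAL);
	return addr;
}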
On Thu, Mar 24, 2011 at 10:25 AM, Konrad Rzeszutek Wilk
<[email protected]> wrote:
> On Thu, Mar 24, 2011 at 08:52:20AM +0100, Thomas Hellstrom wrote:
>> On 03/23/2011 03:52 PM, Konrad Rzeszutek Wilk wrote:
>> >On Wed, Mar 23, 2011 at 02:17:18PM +0100, Thomas Hellstrom wrote:
>> >>On 03/23/2011 01:51 PM, Konrad Rzeszutek Wilk wrote:
>> >>>>>I was thinking about this a bit after I found that the PowerPC requires
>> >>>>>the 'struct dev'. But I got a question first, what do you with pages
>> >>>>>that were allocated to a device that can do 64-bit DMA and then
>> >>>>>move it to a device than can 32-bit DMA? Obviously the 32-bit card would
>> >>>>>set the TTM_PAGE_FLAG_DMA32 flag, but the 64-bit would not. What is the
>> >>>>>process then? Allocate a new page from the 32-bit device and then copy over the
>> >>>>>page from the 64-bit TTM and put the 64-bit TTM page?
>> >>>>Yes, in certain situations we need to copy, and if it's necessary in
>> >>>>some cases to use coherent memory with a struct device assoicated
>> >>>>with it, I agree it may be reasonable to do a copy in that case as
>> >>>>well. I'm against, however, to make that the default case when
>> >>>>running on bare metal.
>> >>>This situation could occur on native/baremetal. When you say 'default
>> >>>case' you mean for every type of page without consulting whether it
>> >>>had the TTM_PAGE_FLAG_DMA32?
>> >>No, Basically I mean a device that runs perfectly fine with
>> >>alloc_pages(DMA32) on bare metal shouldn't need to be using
>> >>dma_alloc_coherent() on bare metal, because that would mean we'd need
>> >>to take the copy path above.
>> >I think we got the scenarios confused (or I did at least).
>> >The scenario I used ("I was thinking.."), the 64-bit device would do
>> >alloc_page(GFP_HIGHUSER) and if you were to move it to a 32-bit device
>> >it would have to make a copy of the page as it could not reach the page
>> >from GFP_HIGUSER.
>> >
>> >The other scenario, which I think is what you are using, is that
>> >we have a 32-bit device allocating a page, so TTM_PAGE_FLAG_DMA32 is set
>> >and then we if we were to move it a 64-bit device it would need to
>> >copied. But I don't think that is the case - the page would be
>> >reachable by the 64-bit device. Help me out please if I am misunderstanding this.
>>
>> Yes, this is completely correct.
>>
>> Now, with a struct dev attached to each page in a 32-bit system
>> (coherent memory)
>> we would need to always copy in the 32-bit case, since you can't
>> hand over pages
>> belonging to other physical devices.
>> But on bare metal you don't need coherent memory, but in this case you
>> need to copy anyway becase you choose to allocate coherent memory.
>
> Ok, can we go back one more step back. I am unclear about one other
> thing. Lets think on this in terms of 2.6.38 code.
>
> When a page in the TTM pool is being moved back and forth and also changes
> the caching model, what happens on the free part? Is the original caching
> state put back on it? Say I allocated a DMA32 page (GFP_DMA32), and move it
> to another pool for another radeon device. I also do some cache changes:
> make it write-back, then un-cached, then writeback, and when I am done, I
> return it back to the pool (DMA32). Once that is done I want to unload
> the DRM/TTM driver. Does that page get its caching state reverted
> back to what it originally had (probably un-cached)? And where is this done?
When ultimately being freed, all the pages are set to write-back again, as
that is the default for all allocated pages (see ttm_pages_put). ttm_put_pages
will add the page to the correct pool (uc or wc).
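For reference, a simplified sketch of that final free path (modeled on the description above; set_pages_array_wb is the x86 helper that flips a page array back to write-back, but this is not the verbatim ttm_pages_put code):

#include <linux/mm.h>
#include <linux/kernel.h>
#include <asm/cacheflush.h>

/* Restore the default write-back attribute, then give the pages back. */
static void pages_put_sketch(struct page *pages[], unsigned int npages)
{
	unsigned int i;

	if (set_pages_array_wb(pages, npages))
		pr_err("ttm: failed to set %u pages to wb\n", npages);
	for (i = 0; i < npages; ++i)
		__free_page(pages[i]);
}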
>
>>
>> I see a sort of a hackish way around these problems.
>>
>> Let's say ttm were trying to detect a hypervisor dummy virtual
>> device sitting on the pci bus. That device would perhaps provide pci
>> information detailing what GFP masks needing to
>> allocate coherent memory. The TTM page pool could then grab that
>> device and create a struct dev to use for allocating "anonymous" TTM
>> BO memory.
>>
>> Could that be a way forward? The struct dev would then be private to
>> the page pool code, bare metal wouldn't need to allocate coherent
>> memory, since the virtual device wouldn't be present. The page pool
>> code would need to be updated to be able to cache also coherent
>> pages.
>>
>> Xen would need to create such a device in the guest with a suitable
>> PCI ID that it would be explicitly willing to share with other
>> hypervisor suppliers....
>>
>> It's ugly, I know, but it might work...
>
> Or just create a phantom 'struct device' that is used with
> the TTM layer. And it has the lowest common denominator of the
> different graphic cards that exist. But that is a bit strange b/c
> for some machines (IBM boxes), you have per-device DMA API specific
> calls. This depends on what PCI bus you have the card as some of these
> boxes have multiple IOMMUs. So that wouldn't work that nicely..
>
> How about a backend TTM alloc API? So the calls to 'ttm_get_page'
> and 'ttm_put_page' call to a TTM-alloc API to do allocation.
>
> The default one is the native, and it would have those 'dma_alloc_coherent'
> removed. When booting under virtualized
> environment a virtualisation "friendly" backend TTM alloc would
> register and all calls to 'put/get/probe' would be diverted to it.
> 'probe' would obviously check whether it should use this backend or not.
>
> It would mean two new files: drivers/gpu/drm/ttm/ttm-memory-xen.c and
> a ttm-memory-generic.c and some header work.
>
> It would still need to keep the 'dma_address[i]' around so that
> those can be passed to the radeon/nouveau GTT, but for native it
> could just contain BAD_DMA_ADDRESS - and the code in the radeon/nouveau
> GTT binding is smart to figure out to do 'pci_map_single' if the
> dma_addr_t has BAD_DMA_ADDRESS.
>
> The issuer here is with the caching I had a question about. We
> would need to reset the caching state back to the original one
> before free-ing it. So does the TTM pool de-alloc code deal with this?
>
Sounds doable. Though I don't understand why you want a virtualized
guest to be able to use the hw directly. From my point of view all devices
in a virtualized guest should be virtualized devices that talk to the
host system driver.
Cheers,
Jerome
> > When a page in the TTM pool is being moved back and forth and also changes
> > the caching model, what happens on the free part? Is the original caching
> > state put back on it? Say I allocated a DMA32 page (GFP_DMA32), and move it
> > to another pool for another radeon device. I also do some cache changes:
> > make it write-back, then un-cached, then writeback, and when I am done, I
> > return it back to the pool (DMA32). Once that is done I want to unload
> > the DRM/TTM driver. Does that page get its caching state reverted
> > back to what it originally had (probably un-cached)? And where is this done?
>
> When ultimately being free all the page are set to write back again as
> it's default of all allocated page (see ttm_pages_put). ttm_put_pages
> will add page to the correct pool (uc or wc).
OK.
.. snip ..
> > How about a backend TTM alloc API? So the calls to 'ttm_get_page'
> > and 'ttm_put_page' call to a TTM-alloc API to do allocation.
> >
> > The default one is the native, and it would have those 'dma_alloc_coherent'
> > removed. When booting under virtualized
> > environment a virtualisation "friendly" backend TTM alloc would
> > register and all calls to 'put/get/probe' would be diverted to it.
> > 'probe' would obviously check whether it should use this backend or not.
> >
> > It would mean two new files: drivers/gpu/drm/ttm/ttm-memory-xen.c and
> > a ttm-memory-generic.c and some header work.
> >
> > It would still need to keep the 'dma_address[i]' around so that
> > those can be passed to the radeon/nouveau GTT, but for native it
> > could just contain BAD_DMA_ADDRESS - and the code in the radeon/nouveau
> > GTT binding is smart to figure out to do 'pci_map_single' if the
> > dma_addr_t has BAD_DMA_ADDRESS.
> >
> > The issuer here is with the caching I had a question about. We
> > would need to reset the caching state back to the original one
> > before free-ing it. So does the TTM pool de-alloc code deal with this?
> >
>
> Sounds doable. Thought i don't understand why you want virtualized
Thomas, Jerome, Dave,
I can start this next week if you guys are comfortable with this idea.
> guest to be able to use hw directly. From my point of view all device
> in a virtualized guest should be virtualized device that talks to the
> host system driver.
That "virtualized guest" in this case is the first Linux kernel that
is booted under a hypervisor. It serves as the "driver domain"
so that it can drive the network, storage, and graphics. To get the
graphics working right the patchset that introduced using the PCI DMA
in the TTM layer allows us to program the GTT with the bus address
instead of programming the bus address of a bounce buffer. The first
set of patches have a great lengthy explanation of this :-)
https://lkml.org/lkml/2010/12/6/516
>
> Cheers,
> Jerome
On 03/24/2011 05:21 PM, Konrad Rzeszutek Wilk wrote:
>>> When a page in the TTM pool is being moved back and forth and also changes
>>> the caching model, what happens on the free part? Is the original caching
>>> state put back on it? Say I allocated a DMA32 page (GFP_DMA32), and move it
>>> to another pool for another radeon device. I also do some cache changes:
>>> make it write-back, then un-cached, then writeback, and when I am done, I
>>> return it back to the pool (DMA32). Once that is done I want to unload
>>> the DRM/TTM driver. Does that page get its caching state reverted
>>> back to what it originally had (probably un-cached)? And where is this done?
>>>
>> When ultimately being free all the page are set to write back again as
>> it's default of all allocated page (see ttm_pages_put). ttm_put_pages
>> will add page to the correct pool (uc or wc).
>>
> OK.
>
> .. snip ..
>
>>> How about a backend TTM alloc API? So the calls to 'ttm_get_page'
>>> and 'ttm_put_page' call to a TTM-alloc API to do allocation.
>>>
>>> The default one is the native, and it would have those 'dma_alloc_coherent'
>>> removed. When booting under virtualized
>>> environment a virtualisation "friendly" backend TTM alloc would
>>> register and all calls to 'put/get/probe' would be diverted to it.
>>> 'probe' would obviously check whether it should use this backend or not.
>>>
>>> It would mean two new files: drivers/gpu/drm/ttm/ttm-memory-xen.c and
>>> a ttm-memory-generic.c and some header work.
>>>
>>> It would still need to keep the 'dma_address[i]' around so that
>>> those can be passed to the radeon/nouveau GTT, but for native it
>>> could just contain BAD_DMA_ADDRESS - and the code in the radeon/nouveau
>>> GTT binding is smart to figure out to do 'pci_map_single' if the
>>> dma_addr_t has BAD_DMA_ADDRESS.
>>>
>>> The issuer here is with the caching I had a question about. We
>>> would need to reset the caching state back to the original one
>>> before free-ing it. So does the TTM pool de-alloc code deal with this?
>>>
>>>
>> Sounds doable. Thought i don't understand why you want virtualized
>>
> Thomas, Jerome, Dave,
>
> I can start this next week if you guys are comfortable with this idea.
>
>
Konrad,
1) A couple of questions first. Where are the memory pools going to end
up in this design? Could you draft an API? How is page accounting going
to be taken care of? How do we differentiate between running on bare
metal and running on a hypervisor?
2) When a page changes caching policy, I guess that needs to be
propagated to any physical hypervisor mappings of that page. We don't
allow for different maps with different caching policies in the general
case.
3) TTM needs to be able to query the bo whether its pages can be
a) moved between devices (I guess this is the case for coherent pages
with a phony device).
b) inserted into the swap cache (I guess this is in general not the case
for coherent pages).
4) We'll probably need to move ttm page alloc and put, and the query in
3), to the ttm backend. The backend can then hide things like DMA addresses
and device-specific memory from the rest of TTM as long as the query in 3)
is exposed, so the rest of TTM doesn't need to care. The backend can
then plug in whatever memory allocation utility function it wants.
5) For the future we should probably make TTM support non-coherent
memory if needed, with appropriate flushes and sync for device / cpu.
However, that's something we can look at later.
/Thomas
>> guest to be able to use hw directly. From my point of view all device
>> in a virtualized guest should be virtualized device that talks to the
>> host system driver.
>>
> That "virtualized guest" in this case is the first Linux kernel that
> is booted under a hypervisor. It serves as the "driver domain"
> so that it can drive the network, storage, and graphics. To get the
> graphics working right the patchset that introduced using the PCI DMA
> in the TTM layer allows us to program the GTT with the bus address
> instead of programming the bus address of a bounce buffer. The first
> set of patches have a great lengthy explanation of this :-)
>
> https://lkml.org/lkml/2010/12/6/516
>
>
>> Cheers,
>> Jerome
>>
> >I can start this next week if you guys are comfortable with this idea.
> >
> >
>
> Konrad,
>
> 1) A couple of questions first. Where are the memory pools going to
> end up in this design. Could you draft an API? How is page
> accounting going to be taken care of? How do we differentiate
> between running on bare metal and running on a hypervisor?
My thought was that the memory pools wouldn't be affected. Instead,
all of the calls to alloc_page/__free_page (and dma_alloc_coherent/
dma_free_coherent) would go through these API calls.
What I thought of are three phases:
1). Get in the patch that passes the 'struct dev' to dma_alloc_coherent
for 2.6.39, so that PowerPC folks can use it with radeon cards. My
understanding is that the work you plan on isn't going in 2.6.39
but rather in 2.6.40 - and if I get my stuff ready (the other phases)
we can work out the kinks together. This way the 'struct dev'
is also passed into the TTM layer.
2). Encapsulate the 'struct page *' and 'dma_addr_t *dma_address' from
'struct ttm_tt' in their own structure ('struct ttm_tt_page'?) along
with a struct list_head (this is b/c most of the code in that
layer uses 'page->lru' to key off of, but if we are using a tuple
we could get out of sync with which dma_addr_t the page is associated with).
During startup we would allocate this 'struct ttm_tt_page' the
same way we create 'struct page **' and 'dma_addr_t *dma_address'
(meaning in ttm_tt_alloc_page_directory, utilize drm_calloc_large).
3). Provide a 'struct ttm_page_alloc_ops' that has two implementors.
The struct would look like this:

struct ttm_page_alloc_ops {
	bool (*probe)(struct device *dev);
	struct ttm_tt_page (*alloc)(struct device *dev, gfp_t gfp_flags);
	void (*free)(struct ttm_tt_page *page);
};
The ttm_page_alloc.c would have these defined:

struct ttm_page_alloc_ops *alloc_backend = &ttm_page_alloc_generic;

static struct ttm_page_alloc_ops *alloc_backends[] = {
	&ttm_page_alloc_dma,
	&ttm_page_alloc_generic,
	NULL,
};

And the ttm_page_alloc_init function would iterate over it:

for (i = 0; alloc_backends[i]; ++i) {
	if (alloc_backends[i]->probe(dev)) {
		alloc_backend = alloc_backends[i];
		break;
	}
}
3.1). Implement the ttm_page_alloc_generic calls, which would work as
follows: probe() would return 1, alloc() would return a struct ttm_tt_page
with 'page' pointing to what 'alloc_page()' returned and 'dma_address'
set to DMA_ADDRESS_BAD, and free() would just call __free_page.
(A rough sketch of this generic backend follows the list below.)
3.2). The ttm_page_alloc_dma would be more complex.
probe() would squirrel the 'struct dev' away, and if it detected that
it is running under Xen (if (xen_pv_domain())) it would return true.
alloc() would pretty much do what ttm_put_pages() does right now
and save on its own list the 'struct page *' address, as well as
which 'struct dev' it is associated with.
The free() would use that list to figure out which 'struct dev' to use
for the passed-in 'struct ttm_tt_page' and use all of that data
to call 'dma_free_coherent' with the right arguments.
4). All of the calls to alloc_page/__free_page would be
swapped with alloc_backend->alloc or alloc_backend->free.
5). Test, test, test..
OK, that is actually five phases. Please poke at it and see if there is
some other case I have forgotten.
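As mentioned under 3.1, here is a rough sketch of what the generic backend could look like under this draft (assuming the 'struct ttm_page_alloc_ops' definition above; 'struct ttm_tt_page' is re-sketched here without the list_head for brevity, and DMA_ADDRESS_BAD is the sentinel named in the draft, not an existing kernel constant):

#include <linux/device.h>
#include <linux/gfp.h>
#include <linux/mm.h>

#define DMA_ADDRESS_BAD	((dma_addr_t)~0)	/* sentinel from the draft */

struct ttm_tt_page {
	struct page *page;
	dma_addr_t dma_address;
};

static bool generic_probe(struct device *dev)
{
	return true;			/* the fallback backend always matches */
}

static struct ttm_tt_page generic_alloc(struct device *dev, gfp_t gfp_flags)
{
	struct ttm_tt_page p = {
		.page		= alloc_page(gfp_flags),
		.dma_address	= DMA_ADDRESS_BAD,	/* native: no bus address kept */
	};

	return p;
}

static void generic_free(struct ttm_tt_page *p)
{
	__free_page(p->page);
}

static struct ttm_page_alloc_ops ttm_page_alloc_generic = {
	.probe	= generic_probe,
	.alloc	= generic_alloc,
	.free	= generic_free,
};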
>
> 2) When a page changes caching policy, I guess that needs to be
> propagated to any physical hypervisor mappings of that page. We
Right, that is OK. The set_pages_wb call and its friends end up modifying
the PTEs at some point. And right now, if you compile the kernel with
CONFIG_PVOPS (which you need for it to work under a Xen hypervisor
as a PV guest), the PTE modifications end up calling the
right hypervisor's PTE modification code. In Xen it would be xen_make_pte,
which sets the right attributes.
> don't allow for different maps with different caching policies in
> the general case.
>
> 3) TTM needs to be able to query the bo whether its pages can be
> a) moved between devices (I guess this is the case for coherent
> pages with a phony device).
The device wouldn't be phony anymore. It would be the real physical
device, so whatever code does the move ought to work.
> b) inserted into the swap cache (I guess this is in general not the
> case for coherent pages).
It might still be the case. Under what conditions is this triggered?
How can I test this easily?
>
> 4) We'll probably need to move ttm page alloc and put and the query
> in 3) to the ttm backend. The backend can then hide things like DMA
> address and device-specific memory to the rest of TTM as long as the
> query in 3) is exposed, so the rest of TTM doesn't need to care. The
> backend can then plug in whatever memory allocation utility function
> it wants.
<nods> I think the draft I mentioned covers that.
>
> 5) For the future we should probably make TTM support non-coherent
> memory if needed, with appropriate flushes and sync for device /
> cpu. However, that's something we can look at later
OK. I am up for doing that when the time comes for it.
Konrad,
Sorry for waiting so long to answer. Workload is quite heavy ATM.
Please see inline.
On 03/31/2011 05:49 PM, Konrad Rzeszutek Wilk wrote:
>>> I can start this next week if you guys are comfortable with this idea.
>>>
>>>
>>>
>> Konrad,
>>
>> 1) A couple of questions first. Where are the memory pools going to
>> end up in this design. Could you draft an API? How is page
>> accounting going to be taken care of? How do we differentiate
>> between running on bare metal and running on a hypervisor?
>>
> My thought was that the memory pool's wouldn't be affected. Instead
> of all of the calls to alloc_page/__free_page (and dma_alloc_coherent/
> dma_free_coherent) would go through this API calls.
>
> What I thought off are three phases:
>
> 1). Get in the patch that passed in 'struct dev' to the dma_alloc_coherent
> for 2.6.39 so that PowerPC folks can use the it with radeon cards. My
> understanding is that the work you plan on to isn't going in 2.6.39
> but rather in 2.6.40 - and if get my stuff ready (the other phases)
> we can work out the kinks together. This way also the 'struct dev'
> is passed in the TTM layer.
>
I'm not happy with this solution. If something goes in, it should be
complete, otherwise future work needs to worry about not breaking
something that's already broken. Also it adds things to the TTM APIs that
are not really necessary.
I'd like to see a solution that encapsulates all device-dependent stuff
(including the dma addresses) in the ttm backend, so the TTM backend code
is the only code that needs to worry about device-dependent stuff. Core
ttm should only need to worry about whether pages are transferable
to other devices, and whether pages can be inserted into the page cache.
This change should be pretty straightforward. We move the ttm::pages
array into the backend, and add ttm backend functions to allocate pages
and to free pages. The backend is then completely free to keep track of
page types and dma addresses, completely hidden from core ttm, and we
don't need to shuffle those around. This opens up both for completely
device-private coherent memory and for "dummy device" coherent memory.
In the future, when TTM needs to move a ttm to another device, or when
it needs to insert pages into the page cache, pages that are device
specific will be copied and then freed. "Dummy device" pages can be
transferred to other devices, but not inserted into the page cache.
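As a sketch of what that backend change might look like (the hooks below are hypothetical additions made up for illustration; they are not part of the existing ttm_backend_func interface):

#include <linux/types.h>

struct ttm_backend;	/* the per-driver backend from ttm_bo_driver.h */
struct page;

/* Hypothetical additions: the backend owns the page array and any dma
 * addresses, and core TTM only sees opaque struct page pointers. */
struct ttm_backend_page_ops {
	int  (*alloc_pages)(struct ttm_backend *backend,
			    unsigned long num_pages, gfp_t gfp_flags);
	void (*free_pages)(struct ttm_backend *backend);
	struct page *(*get_page)(struct ttm_backend *backend,
				 unsigned long index);
};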
/Thomas
On 04/08/2011 04:57 PM, Thomas Hellstrom wrote:
> Konrad,
>
> Sorry for waiting so long to answer. Workload is quite heavy ATM.
> Please see inline.
>
>
> On 03/31/2011 05:49 PM, Konrad Rzeszutek Wilk wrote:
>>>> I can start this next week if you guys are comfortable with this idea.
>>>>
>>>>
>>> Konrad,
>>>
>>> 1) A couple of questions first. Where are the memory pools going to
>>> end up in this design. Could you draft an API? How is page
>>> accounting going to be taken care of? How do we differentiate
>>> between running on bare metal and running on a hypervisor?
>> My thought was that the memory pool's wouldn't be affected. Instead
>> of all of the calls to alloc_page/__free_page (and dma_alloc_coherent/
>> dma_free_coherent) would go through this API calls.
>>
>> What I thought off are three phases:
>>
>> 1). Get in the patch that passed in 'struct dev' to the
>> dma_alloc_coherent
>> for 2.6.39 so that PowerPC folks can use the it with radeon cards. My
>> understanding is that the work you plan on to isn't going in 2.6.39
>> but rather in 2.6.40 - and if get my stuff ready (the other phases)
>> we can work out the kinks together. This way also the 'struct dev'
>> is passed in the TTM layer.
>
> I'm not happy with this solution. If something goes in, it should be
> complete, otherwise future work need to worry about not breaking
> something that's already broken. Also it adds things to TTM api's that
> are not really necessary.
>
>
> I'd like to see a solution that encapsulates all device-dependent
> stuff (including the dma adresses) in the ttm backend, so the TTM
> backend code is the only code that needs to worry about device
> dependent stuff. Core ttm should only need to worry about whether
> pages can be transferrable to other devices, and whether pages can be
> inserted into the page cache.
>
> This change should be pretty straightforward. We move the ttm::pages
> array into the backend, and add ttm backend functions to allocate
> pages and to free pages. The backend is then completely free to keep
> track of page types and dma addresses completely hidden from core ttm,
> and we don't need to shuffle those around. This opens up both for
> completely device-private coherent memory and for "dummy device"
> coherent memory.
>
> In the future, when TTM needs to move a ttm to another device, or when
> it needs to insert pages into the page cache, pages that are device
> specific will be copied and then freed. "Dummy device" pages can be
> transferred to other devices, but not inserted into the page cache.
>
> /Thomas
>
>
Oh, I forgot, I'll be on vacation for a week with limited possibilities
to read mail, but after that I can prototype the ttm backend api changes
if necessary.
/Thomas
On Fri, Apr 08, 2011 at 04:57:14PM +0200, Thomas Hellstrom wrote:
> Konrad,
>
> Sorry for waiting so long to answer. Workload is quite heavy ATM.
> Please see inline.
OK. Thank you for taking a look... some questions before you
depart on vacation.
> > 1). Get in the patch that passed in 'struct dev' to the dma_alloc_coherent
> > for 2.6.39 so that PowerPC folks can use the it with radeon cards. My
> > understanding is that the work you plan on to isn't going in 2.6.39
> > but rather in 2.6.40 - and if get my stuff ready (the other phases)
> > we can work out the kinks together. This way also the 'struct dev'
> > is passed in the TTM layer.
>
> I'm not happy with this solution. If something goes in, it should be
> complete, otherwise future work need to worry about not breaking
> something that's already broken. Also it adds things to TTM api's
<nods>
> that are not really necessary.
>
>
> I'd like to see a solution that encapsulates all device-dependent
> stuff (including the dma adresses) in the ttm backend, so the TTM
> backend code is the only code that needs to worry about device
I am a bit confused here. The usual "ttm backend" refers to the
device-specific hooks (so the radeon/nouveau/via driver), which
use this structure: ttm_backend_func.
That is not what you are referring to, right?
> dependent stuff. Core ttm should only need to worry about whether
> pages can be transferrable to other devices, and whether pages can
> be inserted into the page cache.
Ok. So the core ttm would need to know the 'struct dev' to figure
out what the criteria are for transferring the page (i.e., it is
OK for a 64-bit card to use a 32-bit card's pages, but not the other
way around).
>
> This change should be pretty straightforward. We move the ttm::pages
> array into the backend, and add ttm backend functions to allocate
> pages and to free pages. The backend is then completely free to keep
> track of page types and dma addresses completely hidden from core
> ttm, and we don't need to shuffle those around. This opens up both
> for completely device-private coherent memory and for "dummy device"
> coherent memory.
The 'dummy device' is a bit of a hack though, isn't it? Why not get rid
of that idea and just squirrel away the 'struct dev' and let the
ttm::backend figure out how to allocate the pages?
>
> In the future, when TTM needs to move a ttm to another device, or
> when it needs to insert pages into the page cache, pages that are
> device specific will be copied and then freed. "Dummy device" pages
> can be transferred to other devices, but not inserted into the page
> cache.
OK. That would require some extra function in the ttm::backend to
say "dont_stick_this_in_page_cache".
On 04/08/2011 05:12 PM, Konrad Rzeszutek Wilk wrote:
> On Fri, Apr 08, 2011 at 04:57:14PM +0200, Thomas Hellstrom wrote:
>
>> Konrad,
>>
>> Sorry for waiting so long to answer. Workload is quite heavy ATM.
>> Please see inline.
>>
> OK. Thank you for taking a look... some questions before you
> depart on vacation.
>
>
>>> 1). Get in the patch that passed in 'struct dev' to the dma_alloc_coherent
>>> for 2.6.39 so that PowerPC folks can use the it with radeon cards. My
>>> understanding is that the work you plan on to isn't going in 2.6.39
>>> but rather in 2.6.40 - and if get my stuff ready (the other phases)
>>> we can work out the kinks together. This way also the 'struct dev'
>>> is passed in the TTM layer.
>>>
>> I'm not happy with this solution. If something goes in, it should be
>> complete, otherwise future work need to worry about not breaking
>> something that's already broken. Also it adds things to TTM api's
>>
> <nods>
>
>> that are not really necessary.
>>
>>
>> I'd like to see a solution that encapsulates all device-dependent
>> stuff (including the dma adresses) in the ttm backend, so the TTM
>> backend code is the only code that needs to worry about device
>>
> I am a bit confused here. The usual "ttm backend" refers to the
> device specific hooks (so the radeon/nouveau/via driver), which
> use this structure: ttm_backend_func
>
> That is not what you are referring to right?
>
Yes, exactly.
>> dependent stuff. Core ttm should only need to worry about whether
>> pages can be transferrable to other devices, and whether pages can
>> be inserted into the page cache.
>>
> Ok. So the core ttm would need to know the 'struct dev' to figure
> out what the criteria are for transferring the page (ie, it is
> Ok for a 64-bit card to use a 32-bit card's pages, but not the other
> way around)..
>
So the idea would be to have "ttm_backend::populate" decide whether the
current pages are compatible with the device or not, and copy them if
they are not.
Usually the pages are allocated by the backend itself and should be
compatible, but the populate check would trigger if pages were
transferred from another device. This case happens when the destination
device has special requirements, and it needs to be implemented in all
backends once we start transferring TTMs between devices. Here we can use
struct dev or something similar as a page compatibility identifier.
The other case is where the source device has special requirements, for
example when the source device's pages can't be inserted into the swap
cache (this is the case you are referring to above). Core TTM only needs
to know whether the pages are "normal pages" or not, and does not
need to know about struct dev. Hence, the backend needs a query
function, but not until we actually implement direct swap cache insertions.
So none of this stuff needs to be implemented now, and we can always
hide struct dev in the backend.
>
>> This change should be pretty straightforward. We move the ttm::pages
>> array into the backend, and add ttm backend functions to allocate
>> pages and to free pages. The backend is then completely free to keep
>> track of page types and dma addresses completely hidden from core
>> ttm, and we don't need to shuffle those around. This opens up both
>> for completely device-private coherent memory and for "dummy device"
>> coherent memory.
>>
> The 'dummy device' is a bit of hack thought? Why not get rid
> of that idea and just squirrel away the the 'struct dev' and let the
> ttm::backend figure out how to allocate the pages?
>
Yes it's a hack. The advantage of a dummy device is that pages will be
movable across backends that share the same dummy device. For example
between a radeon and a nouveau driver on a Xen platform.
>
>> In the future, when TTM needs to move a ttm to another device, or
>> when it needs to insert pages into the page cache, pages that are
>> device specific will be copied and then freed. "Dummy device" pages
>> can be transferred to other devices, but not inserted into the page
>> cache.
>>
> OK. That would require some extra function in the ttm::backend to
> say "dont_stick_this_in_page_cache".
>
Correct. We should use the ttm backend query function discussed above,
and enumerate queries.
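To make the last point concrete, a sketch of what such an enumerated backend query might look like (all names here are hypothetical, purely for illustration):

#include <linux/types.h>

struct ttm_backend;

enum ttm_backend_query {
	TTM_BACKEND_MOVABLE_TO_OTHER_DEVICE,	/* pages may be handed to another device */
	TTM_BACKEND_SWAP_CACHE_OK,		/* pages may go into the swap cache */
};

/* Answered by the backend; core TTM never sees the struct dev behind it. */
typedef bool (*ttm_backend_query_fn)(struct ttm_backend *backend,
				     enum ttm_backend_query query);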
Thanks
Thomas