2009-03-25 21:45:29

by Eric Anholt

Subject: DRM lock ordering fix series

Here's hopefully the final attempt at the lock ordering fix for GEM. The
problem was introduced in .29 with the GTT mapping support. We hashed out
a few potential fixes on the mailing list and at OSTS. Peter's plan was
to use get_user_pages, but it has significant CPU overhead (a 10% cost to
text rendering, though part of that is due to some dumb userland code; then
again, it's dumb userland code we're all running). Linus's plan was to just
try the atomic copy_from_user and see if it succeeds, then fall back to
allocating kernel space and copying into it, since the fallback would "never"
happen. Except that we can't allocate as much kernel space as a user may try
to copy in, so we'd have had to break the write up into multiple passes,
dropping the lock in between, and I wasn't comfortable with that.

So this time I try the optimistic Linus path, and if it faults, fall back to
the slow Peter Zijlstra path. It seems to be working: I forced the slow paths
and tested each case against some regression tests I have, so I think it's
good to go. But I'd love some Reviewed-bys on the patches that are missing
them, because this morning I had to fix 9 bugs in 60 LOC, and that's just
what I caught with testcases.
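
The shape of the fix, in all of the paths touched below, is roughly the
following (a simplified sketch; pwrite_fast() and pwrite_slow() are invented
stand-ins for the per-path implementations in the patches, not code from
them):

	/* Sketch only: pwrite_fast()/pwrite_slow() stand in for the
	 * real per-path functions introduced in the series below.
	 */
	int pwrite_sketch(struct drm_gem_object *obj,
			  struct drm_i915_gem_pwrite *args)
	{
		int ret;

		/* Optimistic path: take struct_mutex and copy with the
		 * _inatomic helpers, which return an error instead of
		 * faulting on the user pages.
		 */
		ret = pwrite_fast(obj, args);
		if (ret == -EFAULT) {
			/* Fallback: pin the user pages with
			 * get_user_pages before taking struct_mutex,
			 * then copy through kmap_atomic so nothing can
			 * fault while the lock is held.
			 */
			ret = pwrite_slow(obj, args);
		}
		return ret;
	}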


2009-03-25 21:45:43

by Eric Anholt

Subject: [PATCH 2/6] drm/i915: Make GEM object's page lists refcounted instead of get/free.

We've wanted this for a few consumers that touch the pages directly (such as
the following commit), which have been doing the refcounting outside of
get/put pages.
---
drivers/gpu/drm/i915/i915_drv.h | 3 +-
drivers/gpu/drm/i915/i915_gem.c | 70 ++++++++++++++++++++-------------------
2 files changed, 38 insertions(+), 35 deletions(-)

diff --git a/drivers/gpu/drm/i915/i915_drv.h b/drivers/gpu/drm/i915/i915_drv.h
index d6cc986..75e3384 100644
--- a/drivers/gpu/drm/i915/i915_drv.h
+++ b/drivers/gpu/drm/i915/i915_drv.h
@@ -404,7 +404,8 @@ struct drm_i915_gem_object {
/** AGP memory structure for our GTT binding. */
DRM_AGP_MEM *agp_mem;

- struct page **page_list;
+ struct page **pages;
+ int pages_refcount;

/**
* Current offset of the object in GTT space.
diff --git a/drivers/gpu/drm/i915/i915_gem.c b/drivers/gpu/drm/i915/i915_gem.c
index 35f8c7b..b998d65 100644
--- a/drivers/gpu/drm/i915/i915_gem.c
+++ b/drivers/gpu/drm/i915/i915_gem.c
@@ -43,8 +43,8 @@ static int i915_gem_object_set_cpu_read_domain_range(struct drm_gem_object *obj,
uint64_t offset,
uint64_t size);
static void i915_gem_object_set_to_full_cpu_read_domain(struct drm_gem_object *obj);
-static int i915_gem_object_get_page_list(struct drm_gem_object *obj);
-static void i915_gem_object_free_page_list(struct drm_gem_object *obj);
+static int i915_gem_object_get_pages(struct drm_gem_object *obj);
+static void i915_gem_object_put_pages(struct drm_gem_object *obj);
static int i915_gem_object_wait_rendering(struct drm_gem_object *obj);
static int i915_gem_object_bind_to_gtt(struct drm_gem_object *obj,
unsigned alignment);
@@ -928,29 +928,30 @@ i915_gem_mmap_gtt_ioctl(struct drm_device *dev, void *data,
}

static void
-i915_gem_object_free_page_list(struct drm_gem_object *obj)
+i915_gem_object_put_pages(struct drm_gem_object *obj)
{
struct drm_i915_gem_object *obj_priv = obj->driver_private;
int page_count = obj->size / PAGE_SIZE;
int i;

- if (obj_priv->page_list == NULL)
- return;
+ BUG_ON(obj_priv->pages_refcount == 0);

+ if (--obj_priv->pages_refcount != 0)
+ return;

for (i = 0; i < page_count; i++)
- if (obj_priv->page_list[i] != NULL) {
+ if (obj_priv->pages[i] != NULL) {
if (obj_priv->dirty)
- set_page_dirty(obj_priv->page_list[i]);
- mark_page_accessed(obj_priv->page_list[i]);
- page_cache_release(obj_priv->page_list[i]);
+ set_page_dirty(obj_priv->pages[i]);
+ mark_page_accessed(obj_priv->pages[i]);
+ page_cache_release(obj_priv->pages[i]);
}
obj_priv->dirty = 0;

- drm_free(obj_priv->page_list,
+ drm_free(obj_priv->pages,
page_count * sizeof(struct page *),
DRM_MEM_DRIVER);
- obj_priv->page_list = NULL;
+ obj_priv->pages = NULL;
}

static void
@@ -1402,7 +1403,7 @@ i915_gem_object_unbind(struct drm_gem_object *obj)
if (obj_priv->fence_reg != I915_FENCE_REG_NONE)
i915_gem_clear_fence_reg(obj);

- i915_gem_object_free_page_list(obj);
+ i915_gem_object_put_pages(obj);

if (obj_priv->gtt_space) {
atomic_dec(&dev->gtt_count);
@@ -1521,7 +1522,7 @@ i915_gem_evict_everything(struct drm_device *dev)
}

static int
-i915_gem_object_get_page_list(struct drm_gem_object *obj)
+i915_gem_object_get_pages(struct drm_gem_object *obj)
{
struct drm_i915_gem_object *obj_priv = obj->driver_private;
int page_count, i;
@@ -1530,18 +1531,19 @@ i915_gem_object_get_page_list(struct drm_gem_object *obj)
struct page *page;
int ret;

- if (obj_priv->page_list)
+ if (obj_priv->pages_refcount++ != 0)
return 0;

/* Get the list of pages out of our struct file. They'll be pinned
* at this point until we release them.
*/
page_count = obj->size / PAGE_SIZE;
- BUG_ON(obj_priv->page_list != NULL);
- obj_priv->page_list = drm_calloc(page_count, sizeof(struct page *),
- DRM_MEM_DRIVER);
- if (obj_priv->page_list == NULL) {
+ BUG_ON(obj_priv->pages != NULL);
+ obj_priv->pages = drm_calloc(page_count, sizeof(struct page *),
+ DRM_MEM_DRIVER);
+ if (obj_priv->pages == NULL) {
DRM_ERROR("Faled to allocate page list\n");
+ obj_priv->pages_refcount--;
return -ENOMEM;
}

@@ -1552,10 +1554,10 @@ i915_gem_object_get_page_list(struct drm_gem_object *obj)
if (IS_ERR(page)) {
ret = PTR_ERR(page);
DRM_ERROR("read_mapping_page failed: %d\n", ret);
- i915_gem_object_free_page_list(obj);
+ i915_gem_object_put_pages(obj);
return ret;
}
- obj_priv->page_list[i] = page;
+ obj_priv->pages[i] = page;
}
return 0;
}
@@ -1878,7 +1880,7 @@ i915_gem_object_bind_to_gtt(struct drm_gem_object *obj, unsigned alignment)
DRM_INFO("Binding object of size %d at 0x%08x\n",
obj->size, obj_priv->gtt_offset);
#endif
- ret = i915_gem_object_get_page_list(obj);
+ ret = i915_gem_object_get_pages(obj);
if (ret) {
drm_mm_put_block(obj_priv->gtt_space);
obj_priv->gtt_space = NULL;
@@ -1890,12 +1892,12 @@ i915_gem_object_bind_to_gtt(struct drm_gem_object *obj, unsigned alignment)
* into the GTT.
*/
obj_priv->agp_mem = drm_agp_bind_pages(dev,
- obj_priv->page_list,
+ obj_priv->pages,
page_count,
obj_priv->gtt_offset,
obj_priv->agp_type);
if (obj_priv->agp_mem == NULL) {
- i915_gem_object_free_page_list(obj);
+ i915_gem_object_put_pages(obj);
drm_mm_put_block(obj_priv->gtt_space);
obj_priv->gtt_space = NULL;
return -ENOMEM;
@@ -1922,10 +1924,10 @@ i915_gem_clflush_object(struct drm_gem_object *obj)
* to GPU, and we can ignore the cache flush because it'll happen
* again at bind time.
*/
- if (obj_priv->page_list == NULL)
+ if (obj_priv->pages == NULL)
return;

- drm_clflush_pages(obj_priv->page_list, obj->size / PAGE_SIZE);
+ drm_clflush_pages(obj_priv->pages, obj->size / PAGE_SIZE);
}

/** Flushes any GPU write domain for the object if it's dirty. */
@@ -2270,7 +2272,7 @@ i915_gem_object_set_to_full_cpu_read_domain(struct drm_gem_object *obj)
for (i = 0; i <= (obj->size - 1) / PAGE_SIZE; i++) {
if (obj_priv->page_cpu_valid[i])
continue;
- drm_clflush_pages(obj_priv->page_list + i, 1);
+ drm_clflush_pages(obj_priv->pages + i, 1);
}
drm_agp_chipset_flush(dev);
}
@@ -2336,7 +2338,7 @@ i915_gem_object_set_cpu_read_domain_range(struct drm_gem_object *obj,
if (obj_priv->page_cpu_valid[i])
continue;

- drm_clflush_pages(obj_priv->page_list + i, 1);
+ drm_clflush_pages(obj_priv->pages + i, 1);

obj_priv->page_cpu_valid[i] = 1;
}
@@ -3304,7 +3306,7 @@ i915_gem_init_hws(struct drm_device *dev)

dev_priv->status_gfx_addr = obj_priv->gtt_offset;

- dev_priv->hw_status_page = kmap(obj_priv->page_list[0]);
+ dev_priv->hw_status_page = kmap(obj_priv->pages[0]);
if (dev_priv->hw_status_page == NULL) {
DRM_ERROR("Failed to map status page.\n");
memset(&dev_priv->hws_map, 0, sizeof(dev_priv->hws_map));
@@ -3334,7 +3336,7 @@ i915_gem_cleanup_hws(struct drm_device *dev)
obj = dev_priv->hws_obj;
obj_priv = obj->driver_private;

- kunmap(obj_priv->page_list[0]);
+ kunmap(obj_priv->pages[0]);
i915_gem_object_unpin(obj);
drm_gem_object_unreference(obj);
dev_priv->hws_obj = NULL;
@@ -3637,20 +3639,20 @@ void i915_gem_detach_phys_object(struct drm_device *dev,
if (!obj_priv->phys_obj)
return;

- ret = i915_gem_object_get_page_list(obj);
+ ret = i915_gem_object_get_pages(obj);
if (ret)
goto out;

page_count = obj->size / PAGE_SIZE;

for (i = 0; i < page_count; i++) {
- char *dst = kmap_atomic(obj_priv->page_list[i], KM_USER0);
+ char *dst = kmap_atomic(obj_priv->pages[i], KM_USER0);
char *src = obj_priv->phys_obj->handle->vaddr + (i * PAGE_SIZE);

memcpy(dst, src, PAGE_SIZE);
kunmap_atomic(dst, KM_USER0);
}
- drm_clflush_pages(obj_priv->page_list, page_count);
+ drm_clflush_pages(obj_priv->pages, page_count);
drm_agp_chipset_flush(dev);
out:
obj_priv->phys_obj->cur_obj = NULL;
@@ -3693,7 +3695,7 @@ i915_gem_attach_phys_object(struct drm_device *dev,
obj_priv->phys_obj = dev_priv->mm.phys_objs[id - 1];
obj_priv->phys_obj->cur_obj = obj;

- ret = i915_gem_object_get_page_list(obj);
+ ret = i915_gem_object_get_pages(obj);
if (ret) {
DRM_ERROR("failed to get page list\n");
goto out;
@@ -3702,7 +3704,7 @@ i915_gem_attach_phys_object(struct drm_device *dev,
page_count = obj->size / PAGE_SIZE;

for (i = 0; i < page_count; i++) {
- char *src = kmap_atomic(obj_priv->page_list[i], KM_USER0);
+ char *src = kmap_atomic(obj_priv->pages[i], KM_USER0);
char *dst = obj_priv->phys_obj->handle->vaddr + (i * PAGE_SIZE);

memcpy(dst, src, PAGE_SIZE);
--
1.6.2.1
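
With the refcount in place, a consumer that needs the backing pages just
brackets its direct access with the get/put pair, and a second get while the
pages are already held (e.g. a pwrite on an object that is bound to the GTT)
is now expressible, which the old page_list code was not. A minimal usage
sketch under struct_mutex, with the locals (page_index, data, len) assumed
rather than taken from the patch:

	char *vaddr;
	int ret;

	ret = i915_gem_object_get_pages(obj);
	if (ret)
		return ret;

	/* obj_priv->pages[] is populated and pinned from here on. */
	vaddr = kmap_atomic(obj_priv->pages[page_index], KM_USER0);
	memcpy(vaddr, data, len);	/* whatever the consumer needs */
	kunmap_atomic(vaddr, KM_USER0);

	i915_gem_object_put_pages(obj);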

2009-03-25 21:46:23

by Eric Anholt

Subject: [PATCH 3/6] drm/i915: Fix lock order reversal in shmem pwrite path.

Like the GTT pwrite path fix, this uses an optimistic path and a
fallback to get_user_pages. Note that this means we have to stop using
vfs_write and roll it ourselves.

Signed-off-by: Eric Anholt <[email protected]>
---
drivers/gpu/drm/i915/i915_gem.c | 225 +++++++++++++++++++++++++++++++++++----
1 files changed, 205 insertions(+), 20 deletions(-)

diff --git a/drivers/gpu/drm/i915/i915_gem.c b/drivers/gpu/drm/i915/i915_gem.c
index b998d65..bdc7326 100644
--- a/drivers/gpu/drm/i915/i915_gem.c
+++ b/drivers/gpu/drm/i915/i915_gem.c
@@ -136,6 +136,33 @@ i915_gem_create_ioctl(struct drm_device *dev, void *data,
return 0;
}

+static inline int
+slow_shmem_copy(struct page *dst_page,
+ int dst_offset,
+ struct page *src_page,
+ int src_offset,
+ int length)
+{
+ char *dst_vaddr, *src_vaddr;
+
+ dst_vaddr = kmap_atomic(dst_page, KM_USER0);
+ if (dst_vaddr == NULL)
+ return -ENOMEM;
+
+ src_vaddr = kmap_atomic(src_page, KM_USER1);
+ if (src_vaddr == NULL) {
+ kunmap_atomic(dst_vaddr, KM_USER0);
+ return -ENOMEM;
+ }
+
+ memcpy(dst_vaddr + dst_offset, src_vaddr + src_offset, length);
+
+ kunmap_atomic(src_vaddr, KM_USER1);
+ kunmap_atomic(dst_vaddr, KM_USER0);
+
+ return 0;
+}
+
/**
* Reads data from the object referenced by handle.
*
@@ -243,6 +270,23 @@ slow_kernel_write(struct io_mapping *mapping,
return 0;
}

+static inline int
+fast_shmem_write(struct page **pages,
+ loff_t page_base, int page_offset,
+ char __user *data,
+ int length)
+{
+ char __iomem *vaddr;
+
+ vaddr = kmap_atomic(pages[page_base >> PAGE_SHIFT], KM_USER0);
+ if (vaddr == NULL)
+ return -ENOMEM;
+ __copy_from_user_inatomic(vaddr + page_offset, data, length);
+ kunmap_atomic(vaddr, KM_USER0);
+
+ return 0;
+}
+
/**
* This is the fast pwrite path, where we copy the data directly from the
* user into the GTT, uncached.
@@ -423,39 +467,175 @@ out_unpin_pages:
return ret;
}

+/**
+ * This is the fast shmem pwrite path, which attempts to directly
+ * copy_from_user into the kmapped pages backing the object.
+ */
static int
-i915_gem_shmem_pwrite(struct drm_device *dev, struct drm_gem_object *obj,
- struct drm_i915_gem_pwrite *args,
- struct drm_file *file_priv)
+i915_gem_shmem_pwrite_fast(struct drm_device *dev, struct drm_gem_object *obj,
+ struct drm_i915_gem_pwrite *args,
+ struct drm_file *file_priv)
{
+ struct drm_i915_gem_object *obj_priv = obj->driver_private;
+ ssize_t remain;
+ loff_t offset, page_base;
+ char __user *user_data;
+ int page_offset, page_length;
int ret;
- loff_t offset;
- ssize_t written;
+
+ user_data = (char __user *) (uintptr_t) args->data_ptr;
+ remain = args->size;

mutex_lock(&dev->struct_mutex);

+ ret = i915_gem_object_get_pages(obj);
+ if (ret != 0)
+ goto fail_unlock;
+
ret = i915_gem_object_set_to_cpu_domain(obj, 1);
- if (ret) {
- mutex_unlock(&dev->struct_mutex);
- return ret;
+ if (ret != 0)
+ goto fail_put_pages;
+
+ obj_priv = obj->driver_private;
+ offset = args->offset;
+ obj_priv->dirty = 1;
+
+ while (remain > 0) {
+ /* Operation in this page
+ *
+ * page_base = page offset within aperture
+ * page_offset = offset within page
+ * page_length = bytes to copy for this page
+ */
+ page_base = (offset & ~(PAGE_SIZE-1));
+ page_offset = offset & (PAGE_SIZE-1);
+ page_length = remain;
+ if ((page_offset + remain) > PAGE_SIZE)
+ page_length = PAGE_SIZE - page_offset;
+
+ ret = fast_shmem_write(obj_priv->pages,
+ page_base, page_offset,
+ user_data, page_length);
+ if (ret)
+ goto fail_put_pages;
+
+ remain -= page_length;
+ user_data += page_length;
+ offset += page_length;
}

+fail_put_pages:
+ i915_gem_object_put_pages(obj);
+fail_unlock:
+ mutex_unlock(&dev->struct_mutex);
+
+ return ret;
+}
+
+/**
+ * This is the fallback shmem pwrite path, which uses get_user_pages to pin
+ * the memory and maps it using kmap_atomic for copying.
+ *
+ * This avoids taking mmap_sem for faulting on the user's address while the
+ * struct_mutex is held.
+ */
+static int
+i915_gem_shmem_pwrite_slow(struct drm_device *dev, struct drm_gem_object *obj,
+ struct drm_i915_gem_pwrite *args,
+ struct drm_file *file_priv)
+{
+ struct drm_i915_gem_object *obj_priv = obj->driver_private;
+ struct mm_struct *mm = current->mm;
+ struct page **user_pages;
+ ssize_t remain;
+ loff_t offset, pinned_pages, i;
+ loff_t first_data_page, last_data_page, num_pages;
+ int shmem_page_index, shmem_page_offset;
+ int data_page_index, data_page_offset;
+ int page_length;
+ int ret;
+ uint64_t data_ptr = args->data_ptr;
+
+ remain = args->size;
+
+ /* Pin the user pages containing the data. We can't fault while
+ * holding the struct mutex, and all of the pwrite implementations
+ * want to hold it while dereferencing the user data.
+ */
+ first_data_page = data_ptr / PAGE_SIZE;
+ last_data_page = (data_ptr + args->size - 1) / PAGE_SIZE;
+ num_pages = last_data_page - first_data_page + 1;
+
+ user_pages = kcalloc(num_pages, sizeof(struct page *), GFP_KERNEL);
+ if (user_pages == NULL)
+ return -ENOMEM;
+
+ down_read(&mm->mmap_sem);
+ pinned_pages = get_user_pages(current, mm, (uintptr_t)args->data_ptr,
+ num_pages, 0, 0, user_pages, NULL);
+ up_read(&mm->mmap_sem);
+ if (pinned_pages < num_pages) {
+ ret = -EFAULT;
+ goto fail_put_user_pages;
+ }
+
+ mutex_lock(&dev->struct_mutex);
+
+ ret = i915_gem_object_get_pages(obj);
+ if (ret != 0)
+ goto fail_unlock;
+
+ ret = i915_gem_object_set_to_cpu_domain(obj, 1);
+ if (ret != 0)
+ goto fail_put_pages;
+
+ obj_priv = obj->driver_private;
offset = args->offset;
+ obj_priv->dirty = 1;

- written = vfs_write(obj->filp,
- (char __user *)(uintptr_t) args->data_ptr,
- args->size, &offset);
- if (written != args->size) {
- mutex_unlock(&dev->struct_mutex);
- if (written < 0)
- return written;
- else
- return -EINVAL;
+ while (remain > 0) {
+ /* Operation in this page
+ *
+ * shmem_page_index = page number within shmem file
+ * shmem_page_offset = offset within page in shmem file
+ * data_page_index = page number in get_user_pages return
+ * data_page_offset = offset with data_page_index page.
+ * page_length = bytes to copy for this page
+ */
+ shmem_page_index = offset / PAGE_SIZE;
+ shmem_page_offset = offset & ~PAGE_MASK;
+ data_page_index = data_ptr / PAGE_SIZE - first_data_page;
+ data_page_offset = data_ptr & ~PAGE_MASK;
+
+ page_length = remain;
+ if ((shmem_page_offset + page_length) > PAGE_SIZE)
+ page_length = PAGE_SIZE - shmem_page_offset;
+ if ((data_page_offset + page_length) > PAGE_SIZE)
+ page_length = PAGE_SIZE - data_page_offset;
+
+ ret = slow_shmem_copy(obj_priv->pages[shmem_page_index],
+ shmem_page_offset,
+ user_pages[data_page_index],
+ data_page_offset,
+ page_length);
+ if (ret)
+ goto fail_put_pages;
+
+ remain -= page_length;
+ data_ptr += page_length;
+ offset += page_length;
}

+fail_put_pages:
+ i915_gem_object_put_pages(obj);
+fail_unlock:
mutex_unlock(&dev->struct_mutex);
+fail_put_user_pages:
+ for (i = 0; i < pinned_pages; i++)
+ page_cache_release(user_pages[i]);
+ kfree(user_pages);

- return 0;
+ return ret;
}

/**
@@ -502,8 +682,13 @@ i915_gem_pwrite_ioctl(struct drm_device *dev, void *data,
ret = i915_gem_gtt_pwrite_slow(dev, obj, args,
file_priv);
}
- } else
- ret = i915_gem_shmem_pwrite(dev, obj, args, file_priv);
+ } else {
+ ret = i915_gem_shmem_pwrite_fast(dev, obj, args, file_priv);
+ if (ret == -EFAULT) {
+ ret = i915_gem_shmem_pwrite_slow(dev, obj, args,
+ file_priv);
+ }
+ }

#if WATCH_PWRITE
if (ret)
--
1.6.2.1

2009-03-25 21:45:58

by Eric Anholt

Subject: [PATCH 4/6] drm/i915: Fix lock order reversal in shmem pread path.

Signed-off-by: Eric Anholt <[email protected]>
---
drivers/gpu/drm/i915/i915_gem.c | 221 ++++++++++++++++++++++++++++++++++-----
1 files changed, 195 insertions(+), 26 deletions(-)

diff --git a/drivers/gpu/drm/i915/i915_gem.c b/drivers/gpu/drm/i915/i915_gem.c
index bdc7326..010af90 100644
--- a/drivers/gpu/drm/i915/i915_gem.c
+++ b/drivers/gpu/drm/i915/i915_gem.c
@@ -137,6 +137,24 @@ i915_gem_create_ioctl(struct drm_device *dev, void *data,
}

static inline int
+fast_shmem_read(struct page **pages,
+ loff_t page_base, int page_offset,
+ char __user *data,
+ int length)
+{
+ char __iomem *vaddr;
+ int ret;
+
+ vaddr = kmap_atomic(pages[page_base >> PAGE_SHIFT], KM_USER0);
+ if (vaddr == NULL)
+ return -ENOMEM;
+ ret = __copy_to_user_inatomic(data, vaddr + page_offset, length);
+ kunmap_atomic(vaddr, KM_USER0);
+
+ return ret;
+}
+
+static inline int
slow_shmem_copy(struct page *dst_page,
int dst_offset,
struct page *src_page,
@@ -164,6 +182,179 @@ slow_shmem_copy(struct page *dst_page,
}

/**
+ * This is the fast shmem pread path, which attempts to copy_from_user directly
+ * from the backing pages of the object to the user's address space. On a
+ * fault, it fails so we can fall back to i915_gem_shmem_pwrite_slow().
+ */
+static int
+i915_gem_shmem_pread_fast(struct drm_device *dev, struct drm_gem_object *obj,
+ struct drm_i915_gem_pread *args,
+ struct drm_file *file_priv)
+{
+ struct drm_i915_gem_object *obj_priv = obj->driver_private;
+ ssize_t remain;
+ loff_t offset, page_base;
+ char __user *user_data;
+ int page_offset, page_length;
+ int ret;
+
+ user_data = (char __user *) (uintptr_t) args->data_ptr;
+ remain = args->size;
+
+ mutex_lock(&dev->struct_mutex);
+
+ ret = i915_gem_object_get_pages(obj);
+ if (ret != 0)
+ goto fail_unlock;
+
+ ret = i915_gem_object_set_cpu_read_domain_range(obj, args->offset,
+ args->size);
+ if (ret != 0)
+ goto fail_put_pages;
+
+ obj_priv = obj->driver_private;
+ offset = args->offset;
+
+ while (remain > 0) {
+ /* Operation in this page
+ *
+ * page_base = page offset within aperture
+ * page_offset = offset within page
+ * page_length = bytes to copy for this page
+ */
+ page_base = (offset & ~(PAGE_SIZE-1));
+ page_offset = offset & (PAGE_SIZE-1);
+ page_length = remain;
+ if ((page_offset + remain) > PAGE_SIZE)
+ page_length = PAGE_SIZE - page_offset;
+
+ ret = fast_shmem_read(obj_priv->pages,
+ page_base, page_offset,
+ user_data, page_length);
+ if (ret)
+ goto fail_put_pages;
+
+ remain -= page_length;
+ user_data += page_length;
+ offset += page_length;
+ }
+
+fail_put_pages:
+ i915_gem_object_put_pages(obj);
+fail_unlock:
+ mutex_unlock(&dev->struct_mutex);
+
+ return ret;
+}
+
+/**
+ * This is the fallback shmem pread path, which allocates temporary storage
+ * in kernel space to copy_to_user into outside of the struct_mutex, so we
+ * can copy out of the object's backing pages while holding the struct mutex
+ * and not take page faults.
+ */
+static int
+i915_gem_shmem_pread_slow(struct drm_device *dev, struct drm_gem_object *obj,
+ struct drm_i915_gem_pread *args,
+ struct drm_file *file_priv)
+{
+ struct drm_i915_gem_object *obj_priv = obj->driver_private;
+ struct mm_struct *mm = current->mm;
+ struct page **user_pages;
+ ssize_t remain;
+ loff_t offset, pinned_pages, i;
+ loff_t first_data_page, last_data_page, num_pages;
+ int shmem_page_index, shmem_page_offset;
+ int data_page_index, data_page_offset;
+ int page_length;
+ int ret;
+ uint64_t data_ptr = args->data_ptr;
+
+ remain = args->size;
+
+ /* Pin the user pages containing the data. We can't fault while
+ * holding the struct mutex, yet we want to hold it while
+ * dereferencing the user data.
+ */
+ first_data_page = data_ptr / PAGE_SIZE;
+ last_data_page = (data_ptr + args->size - 1) / PAGE_SIZE;
+ num_pages = last_data_page - first_data_page + 1;
+
+ user_pages = kcalloc(num_pages, sizeof(struct page *), GFP_KERNEL);
+ if (user_pages == NULL)
+ return -ENOMEM;
+
+ down_read(&mm->mmap_sem);
+ pinned_pages = get_user_pages(current, mm, (uintptr_t)args->data_ptr,
+ num_pages, 0, 0, user_pages, NULL);
+ up_read(&mm->mmap_sem);
+ if (pinned_pages < num_pages) {
+ ret = -EFAULT;
+ goto fail_put_user_pages;
+ }
+
+ mutex_lock(&dev->struct_mutex);
+
+ ret = i915_gem_object_get_pages(obj);
+ if (ret != 0)
+ goto fail_unlock;
+
+ ret = i915_gem_object_set_cpu_read_domain_range(obj, args->offset,
+ args->size);
+ if (ret != 0)
+ goto fail_put_pages;
+
+ obj_priv = obj->driver_private;
+ offset = args->offset;
+
+ while (remain > 0) {
+ /* Operation in this page
+ *
+ * shmem_page_index = page number within shmem file
+ * shmem_page_offset = offset within page in shmem file
+ * data_page_index = page number in get_user_pages return
+ * data_page_offset = offset with data_page_index page.
+ * page_length = bytes to copy for this page
+ */
+ shmem_page_index = offset / PAGE_SIZE;
+ shmem_page_offset = offset & ~PAGE_MASK;
+ data_page_index = data_ptr / PAGE_SIZE - first_data_page;
+ data_page_offset = data_ptr & ~PAGE_MASK;
+
+ page_length = remain;
+ if ((shmem_page_offset + page_length) > PAGE_SIZE)
+ page_length = PAGE_SIZE - shmem_page_offset;
+ if ((data_page_offset + page_length) > PAGE_SIZE)
+ page_length = PAGE_SIZE - data_page_offset;
+
+ ret = slow_shmem_copy(user_pages[data_page_index],
+ data_page_offset,
+ obj_priv->pages[shmem_page_index],
+ shmem_page_offset,
+ page_length);
+ if (ret)
+ goto fail_put_pages;
+
+ remain -= page_length;
+ data_ptr += page_length;
+ offset += page_length;
+ }
+
+fail_put_pages:
+ i915_gem_object_put_pages(obj);
+fail_unlock:
+ mutex_unlock(&dev->struct_mutex);
+fail_put_user_pages:
+ for (i = 0; i < pinned_pages; i++) {
+ SetPageDirty(user_pages[i]);
+ page_cache_release(user_pages[i]);
+ }
+ kfree(user_pages);
+
+ return ret;
+}
+
+/**
* Reads data from the object referenced by handle.
*
* On error, the contents of *data are undefined.
@@ -175,8 +366,6 @@ i915_gem_pread_ioctl(struct drm_device *dev, void *data,
struct drm_i915_gem_pread *args = data;
struct drm_gem_object *obj;
struct drm_i915_gem_object *obj_priv;
- ssize_t read;
- loff_t offset;
int ret;

obj = drm_gem_object_lookup(dev, file_priv, args->handle);
@@ -194,33 +383,13 @@ i915_gem_pread_ioctl(struct drm_device *dev, void *data,
return -EINVAL;
}

- mutex_lock(&dev->struct_mutex);
-
- ret = i915_gem_object_set_cpu_read_domain_range(obj, args->offset,
- args->size);
- if (ret != 0) {
- drm_gem_object_unreference(obj);
- mutex_unlock(&dev->struct_mutex);
- return ret;
- }
-
- offset = args->offset;
-
- read = vfs_read(obj->filp, (char __user *)(uintptr_t)args->data_ptr,
- args->size, &offset);
- if (read != args->size) {
- drm_gem_object_unreference(obj);
- mutex_unlock(&dev->struct_mutex);
- if (read < 0)
- return read;
- else
- return -EINVAL;
- }
+ ret = i915_gem_shmem_pread_fast(dev, obj, args, file_priv);
+ if (ret != 0)
+ ret = i915_gem_shmem_pread_slow(dev, obj, args, file_priv);

drm_gem_object_unreference(obj);
- mutex_unlock(&dev->struct_mutex);

- return 0;
+ return ret;
}

/* This is the fast write path which cannot handle
--
1.6.2.1

2009-03-25 21:46:41

by Eric Anholt

Subject: [PATCH 1/6] drm/i915: Fix lock order reversal in GTT pwrite path.

Since the pagefault path determines that the lock order we use has to be
mmap_sem -> struct_mutex, we can't allow page faults to occur while the
struct_mutex is held. To fix this in pwrite, we first optimistically try to
copy from user without faulting. If that fails, fall back to using
get_user_pages to pin the user's memory, and map those pages atomically
when copying them to the GPU.

Signed-off-by: Eric Anholt <[email protected]>
---
drivers/gpu/drm/i915/i915_gem.c | 166 ++++++++++++++++++++++++++++++++------
1 files changed, 139 insertions(+), 27 deletions(-)

diff --git a/drivers/gpu/drm/i915/i915_gem.c b/drivers/gpu/drm/i915/i915_gem.c
index 37427e4..35f8c7b 100644
--- a/drivers/gpu/drm/i915/i915_gem.c
+++ b/drivers/gpu/drm/i915/i915_gem.c
@@ -223,29 +223,34 @@ fast_user_write(struct io_mapping *mapping,
*/

static inline int
-slow_user_write(struct io_mapping *mapping,
- loff_t page_base, int page_offset,
- char __user *user_data,
- int length)
+slow_kernel_write(struct io_mapping *mapping,
+ loff_t gtt_base, int gtt_offset,
+ struct page *user_page, int user_offset,
+ int length)
{
- char __iomem *vaddr;
+ char *src_vaddr, *dst_vaddr;
unsigned long unwritten;

- vaddr = io_mapping_map_wc(mapping, page_base);
- if (vaddr == NULL)
- return -EFAULT;
- unwritten = __copy_from_user(vaddr + page_offset,
- user_data, length);
- io_mapping_unmap(vaddr);
+ dst_vaddr = io_mapping_map_atomic_wc(mapping, gtt_base);
+ src_vaddr = kmap_atomic(user_page, KM_USER1);
+ unwritten = __copy_from_user_inatomic_nocache(dst_vaddr + gtt_offset,
+ src_vaddr + user_offset,
+ length);
+ kunmap_atomic(src_vaddr, KM_USER1);
+ io_mapping_unmap_atomic(dst_vaddr);
if (unwritten)
return -EFAULT;
return 0;
}

+/**
+ * This is the fast pwrite path, where we copy the data directly from the
+ * user into the GTT, uncached.
+ */
static int
-i915_gem_gtt_pwrite(struct drm_device *dev, struct drm_gem_object *obj,
- struct drm_i915_gem_pwrite *args,
- struct drm_file *file_priv)
+i915_gem_gtt_pwrite_fast(struct drm_device *dev, struct drm_gem_object *obj,
+ struct drm_i915_gem_pwrite *args,
+ struct drm_file *file_priv)
{
struct drm_i915_gem_object *obj_priv = obj->driver_private;
drm_i915_private_t *dev_priv = dev->dev_private;
@@ -273,7 +278,6 @@ i915_gem_gtt_pwrite(struct drm_device *dev, struct drm_gem_object *obj,

obj_priv = obj->driver_private;
offset = obj_priv->gtt_offset + args->offset;
- obj_priv->dirty = 1;

while (remain > 0) {
/* Operation in this page
@@ -292,16 +296,11 @@ i915_gem_gtt_pwrite(struct drm_device *dev, struct drm_gem_object *obj,
page_offset, user_data, page_length);

/* If we get a fault while copying data, then (presumably) our
- * source page isn't available. In this case, use the
- * non-atomic function
+ * source page isn't available. Return the error and we'll
+ * retry in the slow path.
*/
- if (ret) {
- ret = slow_user_write (dev_priv->mm.gtt_mapping,
- page_base, page_offset,
- user_data, page_length);
- if (ret)
- goto fail;
- }
+ if (ret)
+ goto fail;

remain -= page_length;
user_data += page_length;
@@ -315,6 +314,115 @@ fail:
return ret;
}

+/**
+ * This is the fallback GTT pwrite path, which uses get_user_pages to pin
+ * the memory and maps it using kmap_atomic for copying.
+ *
+ * This code resulted in x11perf -rgb10text consuming about 10% more CPU
+ * than using i915_gem_gtt_pwrite_fast on a G45 (32-bit).
+ */
+static int
+i915_gem_gtt_pwrite_slow(struct drm_device *dev, struct drm_gem_object *obj,
+ struct drm_i915_gem_pwrite *args,
+ struct drm_file *file_priv)
+{
+ struct drm_i915_gem_object *obj_priv = obj->driver_private;
+ drm_i915_private_t *dev_priv = dev->dev_private;
+ ssize_t remain;
+ loff_t gtt_page_base, offset;
+ loff_t first_data_page, last_data_page, num_pages;
+ loff_t pinned_pages, i;
+ struct page **user_pages;
+ struct mm_struct *mm = current->mm;
+ int gtt_page_offset, data_page_offset, data_page_index, page_length;
+ int ret;
+ uint64_t data_ptr = args->data_ptr;
+
+ remain = args->size;
+
+ /* Pin the user pages containing the data. We can't fault while
+ * holding the struct mutex, and all of the pwrite implementations
+ * want to hold it while dereferencing the user data.
+ */
+ first_data_page = data_ptr / PAGE_SIZE;
+ last_data_page = (data_ptr + args->size - 1) / PAGE_SIZE;
+ num_pages = last_data_page - first_data_page + 1;
+
+ user_pages = kcalloc(num_pages, sizeof(struct page *), GFP_KERNEL);
+ if (user_pages == NULL)
+ return -ENOMEM;
+
+ down_read(&mm->mmap_sem);
+ pinned_pages = get_user_pages(current, mm, (uintptr_t)args->data_ptr,
+ num_pages, 0, 0, user_pages, NULL);
+ up_read(&mm->mmap_sem);
+ if (pinned_pages < num_pages) {
+ ret = -EFAULT;
+ goto out_unpin_pages;
+ }
+
+ mutex_lock(&dev->struct_mutex);
+ ret = i915_gem_object_pin(obj, 0);
+ if (ret)
+ goto out_unlock;
+
+ ret = i915_gem_object_set_to_gtt_domain(obj, 1);
+ if (ret)
+ goto out_unpin_object;
+
+ obj_priv = obj->driver_private;
+ offset = obj_priv->gtt_offset + args->offset;
+
+ while (remain > 0) {
+ /* Operation in this page
+ *
+ * gtt_page_base = page offset within aperture
+ * gtt_page_offset = offset within page in aperture
+ * data_page_index = page number in get_user_pages return
+ * data_page_offset = offset with data_page_index page.
+ * page_length = bytes to copy for this page
+ */
+ gtt_page_base = offset & PAGE_MASK;
+ gtt_page_offset = offset & ~PAGE_MASK;
+ data_page_index = data_ptr / PAGE_SIZE - first_data_page;
+ data_page_offset = data_ptr & ~PAGE_MASK;
+
+ page_length = remain;
+ if ((gtt_page_offset + page_length) > PAGE_SIZE)
+ page_length = PAGE_SIZE - gtt_page_offset;
+ if ((data_page_offset + page_length) > PAGE_SIZE)
+ page_length = PAGE_SIZE - data_page_offset;
+
+ ret = slow_kernel_write(dev_priv->mm.gtt_mapping,
+ gtt_page_base, gtt_page_offset,
+ user_pages[data_page_index],
+ data_page_offset,
+ page_length);
+
+ /* If we get a fault while copying data, then (presumably) our
+ * source page isn't available. Return the error and we'll
+ * retry in the slow path.
+ */
+ if (ret)
+ goto out_unpin_object;
+
+ remain -= page_length;
+ offset += page_length;
+ data_ptr += page_length;
+ }
+
+out_unpin_object:
+ i915_gem_object_unpin(obj);
+out_unlock:
+ mutex_unlock(&dev->struct_mutex);
+out_unpin_pages:
+ for (i = 0; i < pinned_pages; i++)
+ page_cache_release(user_pages[i]);
+ kfree(user_pages);
+
+ return ret;
+}
+
static int
i915_gem_shmem_pwrite(struct drm_device *dev, struct drm_gem_object *obj,
struct drm_i915_gem_pwrite *args,
@@ -388,9 +496,13 @@ i915_gem_pwrite_ioctl(struct drm_device *dev, void *data,
if (obj_priv->phys_obj)
ret = i915_gem_phys_pwrite(dev, obj, args, file_priv);
else if (obj_priv->tiling_mode == I915_TILING_NONE &&
- dev->gtt_total != 0)
- ret = i915_gem_gtt_pwrite(dev, obj, args, file_priv);
- else
+ dev->gtt_total != 0) {
+ ret = i915_gem_gtt_pwrite_fast(dev, obj, args, file_priv);
+ if (ret == -EFAULT) {
+ ret = i915_gem_gtt_pwrite_slow(dev, obj, args,
+ file_priv);
+ }
+ } else
ret = i915_gem_shmem_pwrite(dev, obj, args, file_priv);

#if WATCH_PWRITE
--
1.6.2.1
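
Spelling out the inversion that the description refers to (a restatement of
the problem, not text from the patch):

	/*
	 * Fault path (GTT mapping, merged in .29):
	 *     page fault: mmap_sem held
	 *         -> GEM fault handler takes struct_mutex
	 *
	 * Old pwrite path:
	 *     ioctl takes struct_mutex
	 *         -> __copy_from_user faults on the user buffer
	 *             -> fault handling needs mmap_sem
	 *
	 * That is mmap_sem -> struct_mutex in one path and
	 * struct_mutex -> mmap_sem in the other: a classic AB/BA
	 * ordering that can deadlock. Hence the rule the series
	 * enforces: never fault while struct_mutex is held.
	 */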

2009-03-25 21:47:28

by Eric Anholt

Subject: [PATCH 5/6] drm/i915: Fix lock order reversal with cliprects and cmdbuf in non-DRI2 paths.

This introduces an allocation in the batch submission path that wasn't there
previously, but these are compatibility paths, so we care about simplicity
more than performance.

kernel.org bug #12419.

Signed-off-by: Eric Anholt <[email protected]>
Reviewed-by: Keith Packard <[email protected]>
---
drivers/gpu/drm/i915/i915_dma.c | 107 ++++++++++++++++++++++++++------------
drivers/gpu/drm/i915/i915_drv.h | 2 +-
drivers/gpu/drm/i915/i915_gem.c | 27 ++++++++--
3 files changed, 97 insertions(+), 39 deletions(-)

diff --git a/drivers/gpu/drm/i915/i915_dma.c b/drivers/gpu/drm/i915/i915_dma.c
index 6d21b9e..ae83fe0 100644
--- a/drivers/gpu/drm/i915/i915_dma.c
+++ b/drivers/gpu/drm/i915/i915_dma.c
@@ -356,7 +356,7 @@ static int validate_cmd(int cmd)
return ret;
}

-static int i915_emit_cmds(struct drm_device * dev, int __user * buffer, int dwords)
+static int i915_emit_cmds(struct drm_device * dev, int *buffer, int dwords)
{
drm_i915_private_t *dev_priv = dev->dev_private;
int i;
@@ -370,8 +370,7 @@ static int i915_emit_cmds(struct drm_device * dev, int __user * buffer, int dwor
for (i = 0; i < dwords;) {
int cmd, sz;

- if (DRM_COPY_FROM_USER_UNCHECKED(&cmd, &buffer[i], sizeof(cmd)))
- return -EINVAL;
+ cmd = buffer[i];

if ((sz = validate_cmd(cmd)) == 0 || i + sz > dwords)
return -EINVAL;
@@ -379,11 +378,7 @@ static int i915_emit_cmds(struct drm_device * dev, int __user * buffer, int dwor
OUT_RING(cmd);

while (++i, --sz) {
- if (DRM_COPY_FROM_USER_UNCHECKED(&cmd, &buffer[i],
- sizeof(cmd))) {
- return -EINVAL;
- }
- OUT_RING(cmd);
+ OUT_RING(buffer[i]);
}
}

@@ -397,17 +392,13 @@ static int i915_emit_cmds(struct drm_device * dev, int __user * buffer, int dwor

int
i915_emit_box(struct drm_device *dev,
- struct drm_clip_rect __user *boxes,
+ struct drm_clip_rect *boxes,
int i, int DR1, int DR4)
{
drm_i915_private_t *dev_priv = dev->dev_private;
- struct drm_clip_rect box;
+ struct drm_clip_rect box = boxes[i];
RING_LOCALS;

- if (DRM_COPY_FROM_USER_UNCHECKED(&box, &boxes[i], sizeof(box))) {
- return -EFAULT;
- }
-
if (box.y2 <= box.y1 || box.x2 <= box.x1 || box.y2 <= 0 || box.x2 <= 0) {
DRM_ERROR("Bad box %d,%d..%d,%d\n",
box.x1, box.y1, box.x2, box.y2);
@@ -460,7 +451,9 @@ static void i915_emit_breadcrumb(struct drm_device *dev)
}

static int i915_dispatch_cmdbuffer(struct drm_device * dev,
- drm_i915_cmdbuffer_t * cmd)
+ drm_i915_cmdbuffer_t *cmd,
+ struct drm_clip_rect *cliprects,
+ void *cmdbuf)
{
int nbox = cmd->num_cliprects;
int i = 0, count, ret;
@@ -476,13 +469,13 @@ static int i915_dispatch_cmdbuffer(struct drm_device * dev,

for (i = 0; i < count; i++) {
if (i < nbox) {
- ret = i915_emit_box(dev, cmd->cliprects, i,
+ ret = i915_emit_box(dev, cliprects, i,
cmd->DR1, cmd->DR4);
if (ret)
return ret;
}

- ret = i915_emit_cmds(dev, (int __user *)cmd->buf, cmd->sz / 4);
+ ret = i915_emit_cmds(dev, cmdbuf, cmd->sz / 4);
if (ret)
return ret;
}
@@ -492,10 +485,10 @@ static int i915_dispatch_cmdbuffer(struct drm_device * dev,
}

static int i915_dispatch_batchbuffer(struct drm_device * dev,
- drm_i915_batchbuffer_t * batch)
+ drm_i915_batchbuffer_t * batch,
+ struct drm_clip_rect *cliprects)
{
drm_i915_private_t *dev_priv = dev->dev_private;
- struct drm_clip_rect __user *boxes = batch->cliprects;
int nbox = batch->num_cliprects;
int i = 0, count;
RING_LOCALS;
@@ -511,7 +504,7 @@ static int i915_dispatch_batchbuffer(struct drm_device * dev,

for (i = 0; i < count; i++) {
if (i < nbox) {
- int ret = i915_emit_box(dev, boxes, i,
+ int ret = i915_emit_box(dev, cliprects, i,
batch->DR1, batch->DR4);
if (ret)
return ret;
@@ -626,6 +619,7 @@ static int i915_batchbuffer(struct drm_device *dev, void *data,
master_priv->sarea_priv;
drm_i915_batchbuffer_t *batch = data;
int ret;
+ struct drm_clip_rect *cliprects = NULL;

if (!dev_priv->allow_batchbuffer) {
DRM_ERROR("Batchbuffer ioctl disabled\n");
@@ -637,17 +631,35 @@ static int i915_batchbuffer(struct drm_device *dev, void *data,

RING_LOCK_TEST_WITH_RETURN(dev, file_priv);

- if (batch->num_cliprects && DRM_VERIFYAREA_READ(batch->cliprects,
- batch->num_cliprects *
- sizeof(struct drm_clip_rect)))
- return -EFAULT;
+ if (batch->num_cliprects < 0)
+ return -EINVAL;
+
+ if (batch->num_cliprects) {
+ cliprects = drm_calloc(batch->num_cliprects,
+ sizeof(struct drm_clip_rect),
+ DRM_MEM_DRIVER);
+ if (cliprects == NULL)
+ return -ENOMEM;
+
+ ret = copy_from_user(cliprects, batch->cliprects,
+ batch->num_cliprects *
+ sizeof(struct drm_clip_rect));
+ if (ret != 0)
+ goto fail_free;
+ }

mutex_lock(&dev->struct_mutex);
- ret = i915_dispatch_batchbuffer(dev, batch);
+ ret = i915_dispatch_batchbuffer(dev, batch, cliprects);
mutex_unlock(&dev->struct_mutex);

if (sarea_priv)
sarea_priv->last_dispatch = READ_BREADCRUMB(dev_priv);
+
+fail_free:
+ drm_free(cliprects,
+ batch->num_cliprects * sizeof(struct drm_clip_rect),
+ DRM_MEM_DRIVER);
+
return ret;
}

@@ -659,6 +671,8 @@ static int i915_cmdbuffer(struct drm_device *dev, void *data,
drm_i915_sarea_t *sarea_priv = (drm_i915_sarea_t *)
master_priv->sarea_priv;
drm_i915_cmdbuffer_t *cmdbuf = data;
+ struct drm_clip_rect *cliprects = NULL;
+ void *batch_data;
int ret;

DRM_DEBUG("i915 cmdbuffer, buf %p sz %d cliprects %d\n",
@@ -666,25 +680,50 @@ static int i915_cmdbuffer(struct drm_device *dev, void *data,

RING_LOCK_TEST_WITH_RETURN(dev, file_priv);

- if (cmdbuf->num_cliprects &&
- DRM_VERIFYAREA_READ(cmdbuf->cliprects,
- cmdbuf->num_cliprects *
- sizeof(struct drm_clip_rect))) {
- DRM_ERROR("Fault accessing cliprects\n");
- return -EFAULT;
+ if (cmdbuf->num_cliprects < 0)
+ return -EINVAL;
+
+ batch_data = drm_alloc(cmdbuf->sz, DRM_MEM_DRIVER);
+ if (batch_data == NULL)
+ return -ENOMEM;
+
+ ret = copy_from_user(batch_data, cmdbuf->buf, cmdbuf->sz);
+ if (ret != 0)
+ goto fail_batch_free;
+
+ if (cmdbuf->num_cliprects) {
+ cliprects = drm_calloc(cmdbuf->num_cliprects,
+ sizeof(struct drm_clip_rect),
+ DRM_MEM_DRIVER);
+ if (cliprects == NULL)
+ goto fail_batch_free;
+
+ ret = copy_from_user(cliprects, cmdbuf->cliprects,
+ cmdbuf->num_cliprects *
+ sizeof(struct drm_clip_rect));
+ if (ret != 0)
+ goto fail_clip_free;
}

mutex_lock(&dev->struct_mutex);
- ret = i915_dispatch_cmdbuffer(dev, cmdbuf);
+ ret = i915_dispatch_cmdbuffer(dev, cmdbuf, cliprects, batch_data);
mutex_unlock(&dev->struct_mutex);
if (ret) {
DRM_ERROR("i915_dispatch_cmdbuffer failed\n");
- return ret;
+ goto fail_batch_free;
}

if (sarea_priv)
sarea_priv->last_dispatch = READ_BREADCRUMB(dev_priv);
- return 0;
+
+fail_batch_free:
+ drm_free(batch_data, cmdbuf->sz, DRM_MEM_DRIVER);
+fail_clip_free:
+ drm_free(cliprects,
+ cmdbuf->num_cliprects * sizeof(struct drm_clip_rect),
+ DRM_MEM_DRIVER);
+
+ return ret;
}

static int i915_flip_bufs(struct drm_device *dev, void *data,
diff --git a/drivers/gpu/drm/i915/i915_drv.h b/drivers/gpu/drm/i915/i915_drv.h
index 75e3384..2c02ce6 100644
--- a/drivers/gpu/drm/i915/i915_drv.h
+++ b/drivers/gpu/drm/i915/i915_drv.h
@@ -520,7 +520,7 @@ extern int i915_driver_device_is_agp(struct drm_device * dev);
extern long i915_compat_ioctl(struct file *filp, unsigned int cmd,
unsigned long arg);
extern int i915_emit_box(struct drm_device *dev,
- struct drm_clip_rect __user *boxes,
+ struct drm_clip_rect *boxes,
int i, int DR1, int DR4);

/* i915_irq.c */
diff --git a/drivers/gpu/drm/i915/i915_gem.c b/drivers/gpu/drm/i915/i915_gem.c
index 010af90..2bda151 100644
--- a/drivers/gpu/drm/i915/i915_gem.c
+++ b/drivers/gpu/drm/i915/i915_gem.c
@@ -2891,11 +2891,10 @@ i915_gem_object_pin_and_relocate(struct drm_gem_object *obj,
static int
i915_dispatch_gem_execbuffer(struct drm_device *dev,
struct drm_i915_gem_execbuffer *exec,
+ struct drm_clip_rect *cliprects,
uint64_t exec_offset)
{
drm_i915_private_t *dev_priv = dev->dev_private;
- struct drm_clip_rect __user *boxes = (struct drm_clip_rect __user *)
- (uintptr_t) exec->cliprects_ptr;
int nbox = exec->num_cliprects;
int i = 0, count;
uint32_t exec_start, exec_len;
@@ -2916,7 +2915,7 @@ i915_dispatch_gem_execbuffer(struct drm_device *dev,

for (i = 0; i < count; i++) {
if (i < nbox) {
- int ret = i915_emit_box(dev, boxes, i,
+ int ret = i915_emit_box(dev, cliprects, i,
exec->DR1, exec->DR4);
if (ret)
return ret;
@@ -2983,6 +2982,7 @@ i915_gem_execbuffer(struct drm_device *dev, void *data,
struct drm_gem_object **object_list = NULL;
struct drm_gem_object *batch_obj;
struct drm_i915_gem_object *obj_priv;
+ struct drm_clip_rect *cliprects = NULL;
int ret, i, pinned = 0;
uint64_t exec_offset;
uint32_t seqno, flush_domains;
@@ -3019,6 +3019,23 @@ i915_gem_execbuffer(struct drm_device *dev, void *data,
goto pre_mutex_err;
}

+ if (args->num_cliprects != 0) {
+ cliprects = drm_calloc(args->num_cliprects, sizeof(*cliprects),
+ DRM_MEM_DRIVER);
+ if (cliprects == NULL)
+ goto pre_mutex_err;
+
+ ret = copy_from_user(cliprects,
+ (struct drm_clip_rect __user *)
+ (uintptr_t) args->cliprects_ptr,
+ sizeof(*cliprects) * args->num_cliprects);
+ if (ret != 0) {
+ DRM_ERROR("copy %d cliprects failed: %d\n",
+ args->num_cliprects, ret);
+ goto pre_mutex_err;
+ }
+ }
+
mutex_lock(&dev->struct_mutex);

i915_verify_inactive(dev, __FILE__, __LINE__);
@@ -3155,7 +3172,7 @@ i915_gem_execbuffer(struct drm_device *dev, void *data,
#endif

/* Exec the batchbuffer */
- ret = i915_dispatch_gem_execbuffer(dev, args, exec_offset);
+ ret = i915_dispatch_gem_execbuffer(dev, args, cliprects, exec_offset);
if (ret) {
DRM_ERROR("dispatch failed %d\n", ret);
goto err;
@@ -3224,6 +3241,8 @@ pre_mutex_err:
DRM_MEM_DRIVER);
drm_free(exec_list, sizeof(*exec_list) * args->buffer_count,
DRM_MEM_DRIVER);
+ drm_free(cliprects, sizeof(*cliprects) * args->num_cliprects,
+ DRM_MEM_DRIVER);

return ret;
}
--
1.6.2.1
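
Both converted ioctls follow the same pattern: snapshot the user-supplied
cliprects (and the cmdbuf contents) into kernel memory while no locks are
held, then take struct_mutex knowing nothing underneath can fault. Condensed
sketch with error handling trimmed; nr, user_cliprects and dispatch_locked
are stand-ins for the real fields and dispatch functions in the patch:

	cliprects = drm_calloc(nr, sizeof(*cliprects), DRM_MEM_DRIVER);
	if (cliprects == NULL)
		return -ENOMEM;

	if (copy_from_user(cliprects, user_cliprects,
			   nr * sizeof(*cliprects))) {
		ret = -EFAULT;		/* still lock-free here */
		goto out_free;
	}

	mutex_lock(&dev->struct_mutex);
	ret = dispatch_locked(dev, cliprects, nr);	/* no user access inside */
	mutex_unlock(&dev->struct_mutex);

out_free:
	drm_free(cliprects, nr * sizeof(*cliprects), DRM_MEM_DRIVER);
	return ret;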

2009-03-25 21:46:59

by Eric Anholt

Subject: [PATCH 6/6] drm/i915: Fix lock order reversal in GEM relocation entry copying.

Signed-off-by: Eric Anholt <[email protected]>
Reviewed-by: Keith Packard <[email protected]>
---
drivers/gpu/drm/i915/i915_gem.c | 187 +++++++++++++++++++++++++++-----------
1 files changed, 133 insertions(+), 54 deletions(-)

diff --git a/drivers/gpu/drm/i915/i915_gem.c b/drivers/gpu/drm/i915/i915_gem.c
index 2bda151..f135c90 100644
--- a/drivers/gpu/drm/i915/i915_gem.c
+++ b/drivers/gpu/drm/i915/i915_gem.c
@@ -2713,12 +2713,11 @@ i915_gem_object_set_cpu_read_domain_range(struct drm_gem_object *obj,
static int
i915_gem_object_pin_and_relocate(struct drm_gem_object *obj,
struct drm_file *file_priv,
- struct drm_i915_gem_exec_object *entry)
+ struct drm_i915_gem_exec_object *entry,
+ struct drm_i915_gem_relocation_entry *relocs)
{
struct drm_device *dev = obj->dev;
drm_i915_private_t *dev_priv = dev->dev_private;
- struct drm_i915_gem_relocation_entry reloc;
- struct drm_i915_gem_relocation_entry __user *relocs;
struct drm_i915_gem_object *obj_priv = obj->driver_private;
int i, ret;
void __iomem *reloc_page;
@@ -2730,25 +2729,18 @@ i915_gem_object_pin_and_relocate(struct drm_gem_object *obj,

entry->offset = obj_priv->gtt_offset;

- relocs = (struct drm_i915_gem_relocation_entry __user *)
- (uintptr_t) entry->relocs_ptr;
/* Apply the relocations, using the GTT aperture to avoid cache
* flushing requirements.
*/
for (i = 0; i < entry->relocation_count; i++) {
+ struct drm_i915_gem_relocation_entry *reloc= &relocs[i];
struct drm_gem_object *target_obj;
struct drm_i915_gem_object *target_obj_priv;
uint32_t reloc_val, reloc_offset;
uint32_t __iomem *reloc_entry;

- ret = copy_from_user(&reloc, relocs + i, sizeof(reloc));
- if (ret != 0) {
- i915_gem_object_unpin(obj);
- return ret;
- }
-
target_obj = drm_gem_object_lookup(obj->dev, file_priv,
- reloc.target_handle);
+ reloc->target_handle);
if (target_obj == NULL) {
i915_gem_object_unpin(obj);
return -EBADF;
@@ -2760,53 +2752,53 @@ i915_gem_object_pin_and_relocate(struct drm_gem_object *obj,
*/
if (target_obj_priv->gtt_space == NULL) {
DRM_ERROR("No GTT space found for object %d\n",
- reloc.target_handle);
+ reloc->target_handle);
drm_gem_object_unreference(target_obj);
i915_gem_object_unpin(obj);
return -EINVAL;
}

- if (reloc.offset > obj->size - 4) {
+ if (reloc->offset > obj->size - 4) {
DRM_ERROR("Relocation beyond object bounds: "
"obj %p target %d offset %d size %d.\n",
- obj, reloc.target_handle,
- (int) reloc.offset, (int) obj->size);
+ obj, reloc->target_handle,
+ (int) reloc->offset, (int) obj->size);
drm_gem_object_unreference(target_obj);
i915_gem_object_unpin(obj);
return -EINVAL;
}
- if (reloc.offset & 3) {
+ if (reloc->offset & 3) {
DRM_ERROR("Relocation not 4-byte aligned: "
"obj %p target %d offset %d.\n",
- obj, reloc.target_handle,
- (int) reloc.offset);
+ obj, reloc->target_handle,
+ (int) reloc->offset);
drm_gem_object_unreference(target_obj);
i915_gem_object_unpin(obj);
return -EINVAL;
}

- if (reloc.write_domain & I915_GEM_DOMAIN_CPU ||
- reloc.read_domains & I915_GEM_DOMAIN_CPU) {
+ if (reloc->write_domain & I915_GEM_DOMAIN_CPU ||
+ reloc->read_domains & I915_GEM_DOMAIN_CPU) {
DRM_ERROR("reloc with read/write CPU domains: "
"obj %p target %d offset %d "
"read %08x write %08x",
- obj, reloc.target_handle,
- (int) reloc.offset,
- reloc.read_domains,
- reloc.write_domain);
+ obj, reloc->target_handle,
+ (int) reloc->offset,
+ reloc->read_domains,
+ reloc->write_domain);
drm_gem_object_unreference(target_obj);
i915_gem_object_unpin(obj);
return -EINVAL;
}

- if (reloc.write_domain && target_obj->pending_write_domain &&
- reloc.write_domain != target_obj->pending_write_domain) {
+ if (reloc->write_domain && target_obj->pending_write_domain &&
+ reloc->write_domain != target_obj->pending_write_domain) {
DRM_ERROR("Write domain conflict: "
"obj %p target %d offset %d "
"new %08x old %08x\n",
- obj, reloc.target_handle,
- (int) reloc.offset,
- reloc.write_domain,
+ obj, reloc->target_handle,
+ (int) reloc->offset,
+ reloc->write_domain,
target_obj->pending_write_domain);
drm_gem_object_unreference(target_obj);
i915_gem_object_unpin(obj);
@@ -2819,22 +2811,22 @@ i915_gem_object_pin_and_relocate(struct drm_gem_object *obj,
"presumed %08x delta %08x\n",
__func__,
obj,
- (int) reloc.offset,
- (int) reloc.target_handle,
- (int) reloc.read_domains,
- (int) reloc.write_domain,
+ (int) reloc->offset,
+ (int) reloc->target_handle,
+ (int) reloc->read_domains,
+ (int) reloc->write_domain,
(int) target_obj_priv->gtt_offset,
- (int) reloc.presumed_offset,
- reloc.delta);
+ (int) reloc->presumed_offset,
+ reloc->delta);
#endif

- target_obj->pending_read_domains |= reloc.read_domains;
- target_obj->pending_write_domain |= reloc.write_domain;
+ target_obj->pending_read_domains |= reloc->read_domains;
+ target_obj->pending_write_domain |= reloc->write_domain;

/* If the relocation already has the right value in it, no
* more work needs to be done.
*/
- if (target_obj_priv->gtt_offset == reloc.presumed_offset) {
+ if (target_obj_priv->gtt_offset == reloc->presumed_offset) {
drm_gem_object_unreference(target_obj);
continue;
}
@@ -2849,32 +2841,26 @@ i915_gem_object_pin_and_relocate(struct drm_gem_object *obj,
/* Map the page containing the relocation we're going to
* perform.
*/
- reloc_offset = obj_priv->gtt_offset + reloc.offset;
+ reloc_offset = obj_priv->gtt_offset + reloc->offset;
reloc_page = io_mapping_map_atomic_wc(dev_priv->mm.gtt_mapping,
(reloc_offset &
~(PAGE_SIZE - 1)));
reloc_entry = (uint32_t __iomem *)(reloc_page +
(reloc_offset & (PAGE_SIZE - 1)));
- reloc_val = target_obj_priv->gtt_offset + reloc.delta;
+ reloc_val = target_obj_priv->gtt_offset + reloc->delta;

#if WATCH_BUF
DRM_INFO("Applied relocation: %p@0x%08x %08x -> %08x\n",
- obj, (unsigned int) reloc.offset,
+ obj, (unsigned int) reloc->offset,
readl(reloc_entry), reloc_val);
#endif
writel(reloc_val, reloc_entry);
io_mapping_unmap_atomic(reloc_page);

- /* Write the updated presumed offset for this entry back out
- * to the user.
+ /* The updated presumed offset for this entry will be
+ * copied back out to the user.
*/
- reloc.presumed_offset = target_obj_priv->gtt_offset;
- ret = copy_to_user(relocs + i, &reloc, sizeof(reloc));
- if (ret != 0) {
- drm_gem_object_unreference(target_obj);
- i915_gem_object_unpin(obj);
- return ret;
- }
+ reloc->presumed_offset = target_obj_priv->gtt_offset;

drm_gem_object_unreference(target_obj);
}
@@ -2971,6 +2957,75 @@ i915_gem_ring_throttle(struct drm_device *dev, struct drm_file *file_priv)
return ret;
}

+static int
+i915_gem_get_relocs_from_user(struct drm_i915_gem_exec_object *exec_list,
+ uint32_t buffer_count,
+ struct drm_i915_gem_relocation_entry **relocs)
+{
+ uint32_t reloc_count = 0, reloc_index = 0, i;
+ int ret;
+
+ *relocs = NULL;
+ for (i = 0; i < buffer_count; i++) {
+ if (reloc_count + exec_list[i].relocation_count < reloc_count)
+ return -EINVAL;
+ reloc_count += exec_list[i].relocation_count;
+ }
+
+ *relocs = drm_calloc(reloc_count, sizeof(**relocs), DRM_MEM_DRIVER);
+ if (*relocs == NULL)
+ return -ENOMEM;
+
+ for (i = 0; i < buffer_count; i++) {
+ struct drm_i915_gem_relocation_entry __user *user_relocs;
+
+ user_relocs = (void __user *)(uintptr_t)exec_list[i].relocs_ptr;
+
+ ret = copy_from_user(&(*relocs)[reloc_index],
+ user_relocs,
+ exec_list[i].relocation_count *
+ sizeof(**relocs));
+ if (ret != 0) {
+ drm_free(*relocs, reloc_count * sizeof(**relocs),
+ DRM_MEM_DRIVER);
+ *relocs = NULL;
+ return ret;
+ }
+
+ reloc_index += exec_list[i].relocation_count;
+ }
+
+ return ret;
+}
+
+static int
+i915_gem_put_relocs_to_user(struct drm_i915_gem_exec_object *exec_list,
+ uint32_t buffer_count,
+ struct drm_i915_gem_relocation_entry *relocs)
+{
+ uint32_t reloc_count = 0, i;
+ int ret;
+
+ for (i = 0; i < buffer_count; i++) {
+ struct drm_i915_gem_relocation_entry __user *user_relocs;
+
+ user_relocs = (void __user *)(uintptr_t)exec_list[i].relocs_ptr;
+
+ if (ret == 0) {
+ ret = copy_to_user(user_relocs,
+ &relocs[reloc_count],
+ exec_list[i].relocation_count *
+ sizeof(*relocs));
+ }
+
+ reloc_count += exec_list[i].relocation_count;
+ }
+
+ drm_free(relocs, reloc_count * sizeof(*relocs), DRM_MEM_DRIVER);
+
+ return ret;
+}
+
int
i915_gem_execbuffer(struct drm_device *dev, void *data,
struct drm_file *file_priv)
@@ -2983,9 +3038,10 @@ i915_gem_execbuffer(struct drm_device *dev, void *data,
struct drm_gem_object *batch_obj;
struct drm_i915_gem_object *obj_priv;
struct drm_clip_rect *cliprects = NULL;
- int ret, i, pinned = 0;
+ struct drm_i915_gem_relocation_entry *relocs;
+ int ret, ret2, i, pinned = 0;
uint64_t exec_offset;
- uint32_t seqno, flush_domains;
+ uint32_t seqno, flush_domains, reloc_index;
int pin_tries;

#if WATCH_EXEC
@@ -3036,6 +3092,11 @@ i915_gem_execbuffer(struct drm_device *dev, void *data,
}
}

+ ret = i915_gem_get_relocs_from_user(exec_list, args->buffer_count,
+ &relocs);
+ if (ret != 0)
+ goto pre_mutex_err;
+
mutex_lock(&dev->struct_mutex);

i915_verify_inactive(dev, __FILE__, __LINE__);
@@ -3078,15 +3139,19 @@ i915_gem_execbuffer(struct drm_device *dev, void *data,
/* Pin and relocate */
for (pin_tries = 0; ; pin_tries++) {
ret = 0;
+ reloc_index = 0;
+
for (i = 0; i < args->buffer_count; i++) {
object_list[i]->pending_read_domains = 0;
object_list[i]->pending_write_domain = 0;
ret = i915_gem_object_pin_and_relocate(object_list[i],
file_priv,
- &exec_list[i]);
+ &exec_list[i],
+ &relocs[reloc_index]);
if (ret)
break;
pinned = i + 1;
+ reloc_index += exec_list[i].relocation_count;
}
/* success */
if (ret == 0)
@@ -3236,6 +3301,20 @@ err:
args->buffer_count, ret);
}

+ /* Copy the updated relocations out regardless of current error
+ * state. Failure to update the relocs would mean that the next
+ * time userland calls execbuf, it would do so with presumed offset
+ * state that didn't match the actual object state.
+ */
+ ret2 = i915_gem_put_relocs_to_user(exec_list, args->buffer_count,
+ relocs);
+ if (ret2 != 0) {
+ DRM_ERROR("Failed to copy relocations back out: %d\n", ret2);
+
+ if (ret == 0)
+ ret = ret2;
+ }
+
pre_mutex_err:
drm_free(object_list, sizeof(*object_list) * args->buffer_count,
DRM_MEM_DRIVER);
--
1.6.2.1

2009-03-25 22:52:30

by Dave Airlie

Subject: Re: [PATCH 2/6] drm/i915: Make GEM object's page lists refcounted instead of get/free.


> We've wanted this for a few consumers that touch the pages directly (such as
> the following commit), which have been doing the refcounting outside of
> get/put pages.

No idea if this is a valid point or not, but whenever I see a refcount that
isn't a kref, my internal "should this be a kref?" o-meter goes off.

Dave.
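
For comparison, the kref version of the counting in patch 2/6 would look
something like this (just an illustration of the suggestion, not proposed
code; the release name is invented and the existing fields are elided):

	struct drm_i915_gem_object {
		/* ... existing fields ... */
		struct page **pages;
		struct kref pages_ref;
	};

	static void i915_gem_object_pages_release(struct kref *ref)
	{
		struct drm_i915_gem_object *obj_priv =
			container_of(ref, struct drm_i915_gem_object,
				     pages_ref);

		/* set_page_dirty()/page_cache_release() each page and
		 * free obj_priv->pages, as put_pages does today.
		 */
	}

	/* get: kref_init() on first use, kref_get() after that;
	 * put: kref_put(&obj_priv->pages_ref,
	 *               i915_gem_object_pages_release);
	 */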

> ---
> drivers/gpu/drm/i915/i915_drv.h | 3 +-
> drivers/gpu/drm/i915/i915_gem.c | 70 ++++++++++++++++++++-------------------
> 2 files changed, 38 insertions(+), 35 deletions(-)
>
> diff --git a/drivers/gpu/drm/i915/i915_drv.h b/drivers/gpu/drm/i915/i915_drv.h
> index d6cc986..75e3384 100644
> --- a/drivers/gpu/drm/i915/i915_drv.h
> +++ b/drivers/gpu/drm/i915/i915_drv.h
> @@ -404,7 +404,8 @@ struct drm_i915_gem_object {
> /** AGP memory structure for our GTT binding. */
> DRM_AGP_MEM *agp_mem;
>
> - struct page **page_list;
> + struct page **pages;
> + int pages_refcount;
>
> /**
> * Current offset of the object in GTT space.
> diff --git a/drivers/gpu/drm/i915/i915_gem.c b/drivers/gpu/drm/i915/i915_gem.c
> index 35f8c7b..b998d65 100644
> --- a/drivers/gpu/drm/i915/i915_gem.c
> +++ b/drivers/gpu/drm/i915/i915_gem.c
> @@ -43,8 +43,8 @@ static int i915_gem_object_set_cpu_read_domain_range(struct drm_gem_object *obj,
> uint64_t offset,
> uint64_t size);
> static void i915_gem_object_set_to_full_cpu_read_domain(struct drm_gem_object *obj);
> -static int i915_gem_object_get_page_list(struct drm_gem_object *obj);
> -static void i915_gem_object_free_page_list(struct drm_gem_object *obj);
> +static int i915_gem_object_get_pages(struct drm_gem_object *obj);
> +static void i915_gem_object_put_pages(struct drm_gem_object *obj);
> static int i915_gem_object_wait_rendering(struct drm_gem_object *obj);
> static int i915_gem_object_bind_to_gtt(struct drm_gem_object *obj,
> unsigned alignment);
> @@ -928,29 +928,30 @@ i915_gem_mmap_gtt_ioctl(struct drm_device *dev, void *data,
> }
>
> static void
> -i915_gem_object_free_page_list(struct drm_gem_object *obj)
> +i915_gem_object_put_pages(struct drm_gem_object *obj)
> {
> struct drm_i915_gem_object *obj_priv = obj->driver_private;
> int page_count = obj->size / PAGE_SIZE;
> int i;
>
> - if (obj_priv->page_list == NULL)
> - return;
> + BUG_ON(obj_priv->pages_refcount == 0);
>
> + if (--obj_priv->pages_refcount != 0)
> + return;
>
> for (i = 0; i < page_count; i++)
> - if (obj_priv->page_list[i] != NULL) {
> + if (obj_priv->pages[i] != NULL) {
> if (obj_priv->dirty)
> - set_page_dirty(obj_priv->page_list[i]);
> - mark_page_accessed(obj_priv->page_list[i]);
> - page_cache_release(obj_priv->page_list[i]);
> + set_page_dirty(obj_priv->pages[i]);
> + mark_page_accessed(obj_priv->pages[i]);
> + page_cache_release(obj_priv->pages[i]);
> }
> obj_priv->dirty = 0;
>
> - drm_free(obj_priv->page_list,
> + drm_free(obj_priv->pages,
> page_count * sizeof(struct page *),
> DRM_MEM_DRIVER);
> - obj_priv->page_list = NULL;
> + obj_priv->pages = NULL;
> }
>
> static void
> @@ -1402,7 +1403,7 @@ i915_gem_object_unbind(struct drm_gem_object *obj)
> if (obj_priv->fence_reg != I915_FENCE_REG_NONE)
> i915_gem_clear_fence_reg(obj);
>
> - i915_gem_object_free_page_list(obj);
> + i915_gem_object_put_pages(obj);
>
> if (obj_priv->gtt_space) {
> atomic_dec(&dev->gtt_count);
> @@ -1521,7 +1522,7 @@ i915_gem_evict_everything(struct drm_device *dev)
> }
>
> static int
> -i915_gem_object_get_page_list(struct drm_gem_object *obj)
> +i915_gem_object_get_pages(struct drm_gem_object *obj)
> {
> struct drm_i915_gem_object *obj_priv = obj->driver_private;
> int page_count, i;
> @@ -1530,18 +1531,19 @@ i915_gem_object_get_page_list(struct drm_gem_object *obj)
> struct page *page;
> int ret;
>
> - if (obj_priv->page_list)
> + if (obj_priv->pages_refcount++ != 0)
> return 0;
>
> /* Get the list of pages out of our struct file. They'll be pinned
> * at this point until we release them.
> */
> page_count = obj->size / PAGE_SIZE;
> - BUG_ON(obj_priv->page_list != NULL);
> - obj_priv->page_list = drm_calloc(page_count, sizeof(struct page *),
> - DRM_MEM_DRIVER);
> - if (obj_priv->page_list == NULL) {
> + BUG_ON(obj_priv->pages != NULL);
> + obj_priv->pages = drm_calloc(page_count, sizeof(struct page *),
> + DRM_MEM_DRIVER);
> + if (obj_priv->pages == NULL) {
> DRM_ERROR("Faled to allocate page list\n");
> + obj_priv->pages_refcount--;
> return -ENOMEM;
> }
>
> @@ -1552,10 +1554,10 @@ i915_gem_object_get_page_list(struct drm_gem_object *obj)
> if (IS_ERR(page)) {
> ret = PTR_ERR(page);
> DRM_ERROR("read_mapping_page failed: %d\n", ret);
> - i915_gem_object_free_page_list(obj);
> + i915_gem_object_put_pages(obj);
> return ret;
> }
> - obj_priv->page_list[i] = page;
> + obj_priv->pages[i] = page;
> }
> return 0;
> }
> @@ -1878,7 +1880,7 @@ i915_gem_object_bind_to_gtt(struct drm_gem_object *obj, unsigned alignment)
> DRM_INFO("Binding object of size %d at 0x%08x\n",
> obj->size, obj_priv->gtt_offset);
> #endif
> - ret = i915_gem_object_get_page_list(obj);
> + ret = i915_gem_object_get_pages(obj);
> if (ret) {
> drm_mm_put_block(obj_priv->gtt_space);
> obj_priv->gtt_space = NULL;
> @@ -1890,12 +1892,12 @@ i915_gem_object_bind_to_gtt(struct drm_gem_object *obj, unsigned alignment)
> * into the GTT.
> */
> obj_priv->agp_mem = drm_agp_bind_pages(dev,
> - obj_priv->page_list,
> + obj_priv->pages,
> page_count,
> obj_priv->gtt_offset,
> obj_priv->agp_type);
> if (obj_priv->agp_mem == NULL) {
> - i915_gem_object_free_page_list(obj);
> + i915_gem_object_put_pages(obj);
> drm_mm_put_block(obj_priv->gtt_space);
> obj_priv->gtt_space = NULL;
> return -ENOMEM;
> @@ -1922,10 +1924,10 @@ i915_gem_clflush_object(struct drm_gem_object *obj)
> * to GPU, and we can ignore the cache flush because it'll happen
> * again at bind time.
> */
> - if (obj_priv->page_list == NULL)
> + if (obj_priv->pages == NULL)
> return;
>
> - drm_clflush_pages(obj_priv->page_list, obj->size / PAGE_SIZE);
> + drm_clflush_pages(obj_priv->pages, obj->size / PAGE_SIZE);
> }
>
> /** Flushes any GPU write domain for the object if it's dirty. */
> @@ -2270,7 +2272,7 @@ i915_gem_object_set_to_full_cpu_read_domain(struct drm_gem_object *obj)
> for (i = 0; i <= (obj->size - 1) / PAGE_SIZE; i++) {
> if (obj_priv->page_cpu_valid[i])
> continue;
> - drm_clflush_pages(obj_priv->page_list + i, 1);
> + drm_clflush_pages(obj_priv->pages + i, 1);
> }
> drm_agp_chipset_flush(dev);
> }
> @@ -2336,7 +2338,7 @@ i915_gem_object_set_cpu_read_domain_range(struct drm_gem_object *obj,
> if (obj_priv->page_cpu_valid[i])
> continue;
>
> - drm_clflush_pages(obj_priv->page_list + i, 1);
> + drm_clflush_pages(obj_priv->pages + i, 1);
>
> obj_priv->page_cpu_valid[i] = 1;
> }
> @@ -3304,7 +3306,7 @@ i915_gem_init_hws(struct drm_device *dev)
>
> dev_priv->status_gfx_addr = obj_priv->gtt_offset;
>
> - dev_priv->hw_status_page = kmap(obj_priv->page_list[0]);
> + dev_priv->hw_status_page = kmap(obj_priv->pages[0]);
> if (dev_priv->hw_status_page == NULL) {
> DRM_ERROR("Failed to map status page.\n");
> memset(&dev_priv->hws_map, 0, sizeof(dev_priv->hws_map));
> @@ -3334,7 +3336,7 @@ i915_gem_cleanup_hws(struct drm_device *dev)
> obj = dev_priv->hws_obj;
> obj_priv = obj->driver_private;
>
> - kunmap(obj_priv->page_list[0]);
> + kunmap(obj_priv->pages[0]);
> i915_gem_object_unpin(obj);
> drm_gem_object_unreference(obj);
> dev_priv->hws_obj = NULL;
> @@ -3637,20 +3639,20 @@ void i915_gem_detach_phys_object(struct drm_device *dev,
> if (!obj_priv->phys_obj)
> return;
>
> - ret = i915_gem_object_get_page_list(obj);
> + ret = i915_gem_object_get_pages(obj);
> if (ret)
> goto out;
>
> page_count = obj->size / PAGE_SIZE;
>
> for (i = 0; i < page_count; i++) {
> - char *dst = kmap_atomic(obj_priv->page_list[i], KM_USER0);
> + char *dst = kmap_atomic(obj_priv->pages[i], KM_USER0);
> char *src = obj_priv->phys_obj->handle->vaddr + (i * PAGE_SIZE);
>
> memcpy(dst, src, PAGE_SIZE);
> kunmap_atomic(dst, KM_USER0);
> }
> - drm_clflush_pages(obj_priv->page_list, page_count);
> + drm_clflush_pages(obj_priv->pages, page_count);
> drm_agp_chipset_flush(dev);
> out:
> obj_priv->phys_obj->cur_obj = NULL;
> @@ -3693,7 +3695,7 @@ i915_gem_attach_phys_object(struct drm_device *dev,
> obj_priv->phys_obj = dev_priv->mm.phys_objs[id - 1];
> obj_priv->phys_obj->cur_obj = obj;
>
> - ret = i915_gem_object_get_page_list(obj);
> + ret = i915_gem_object_get_pages(obj);
> if (ret) {
> DRM_ERROR("failed to get page list\n");
> goto out;
> @@ -3702,7 +3704,7 @@ i915_gem_attach_phys_object(struct drm_device *dev,
> page_count = obj->size / PAGE_SIZE;
>
> for (i = 0; i < page_count; i++) {
> - char *src = kmap_atomic(obj_priv->page_list[i], KM_USER0);
> + char *src = kmap_atomic(obj_priv->pages[i], KM_USER0);
> char *dst = obj_priv->phys_obj->handle->vaddr + (i * PAGE_SIZE);
>
> memcpy(dst, src, PAGE_SIZE);
>

2009-03-25 23:31:04

by Dave Airlie

[permalink] [raw]
Subject: Re: [PATCH 4/6] drm/i915: Fix lock order reversal in shmem pread path.



I've no idea when a fault is likely in the fast case, i.e. whether it will
usually happen on the first page and so on. If it happens on the last page
and you fall back and restart the whole copy, I would think that would be
sub-optimal. Granted, it could get ugly quickly, but this code has already
hit a few branches on the tree.

Dave.

> static inline int
> +fast_shmem_read(struct page **pages,
> + loff_t page_base, int page_offset,
> + char __user *data,
> + int length)
> +{
> + char __iomem *vaddr;
> + int ret;
> +
> + vaddr = kmap_atomic(pages[page_base >> PAGE_SHIFT], KM_USER0);
> + if (vaddr == NULL)
> + return -ENOMEM;
> + ret = __copy_to_user_inatomic(data, vaddr + page_offset, length);
> + kunmap_atomic(vaddr, KM_USER0);
> +
> + return ret;
> +}
> +
> +static inline int
> slow_shmem_copy(struct page *dst_page,
> int dst_offset,
> struct page *src_page,
> @@ -164,6 +182,179 @@ slow_shmem_copy(struct page *dst_page,
> }
>
> /**
> + * This is the fast shmem pread path, which attempts to copy_to_user directly
> + * from the backing pages of the object to the user's address space. On a
> + * fault, it fails so we can fall back to i915_gem_shmem_pread_slow().
> + */
> +static int
> +i915_gem_shmem_pread_fast(struct drm_device *dev, struct drm_gem_object *obj,
> + struct drm_i915_gem_pread *args,
> + struct drm_file *file_priv)
> +{
> + struct drm_i915_gem_object *obj_priv = obj->driver_private;
> + ssize_t remain;
> + loff_t offset, page_base;
> + char __user *user_data;
> + int page_offset, page_length;
> + int ret;
> +
> + user_data = (char __user *) (uintptr_t) args->data_ptr;
> + remain = args->size;
> +
> + mutex_lock(&dev->struct_mutex);
> +
> + ret = i915_gem_object_get_pages(obj);
> + if (ret != 0)
> + goto fail_unlock;
> +
> + ret = i915_gem_object_set_cpu_read_domain_range(obj, args->offset,
> + args->size);
> + if (ret != 0)
> + goto fail_put_pages;
> +
> + obj_priv = obj->driver_private;
> + offset = args->offset;
> +
> + while (remain > 0) {
> + /* Operation in this page
> + *
> + * page_base = page offset within aperture
> + * page_offset = offset within page
> + * page_length = bytes to copy for this page
> + */
> + page_base = (offset & ~(PAGE_SIZE-1));
> + page_offset = offset & (PAGE_SIZE-1);
> + page_length = remain;
> + if ((page_offset + remain) > PAGE_SIZE)
> + page_length = PAGE_SIZE - page_offset;
> +
> + ret = fast_shmem_read(obj_priv->pages,
> + page_base, page_offset,
> + user_data, page_length);
> + if (ret)
> + goto fail_put_pages;
> +
> + remain -= page_length;
> + user_data += page_length;
> + offset += page_length;
> + }
> +
> +fail_put_pages:
> + i915_gem_object_put_pages(obj);
> +fail_unlock:
> + mutex_unlock(&dev->struct_mutex);
> +
> + return ret;
> +}
> +
> +/**
> + * This is the fallback shmem pread path, which uses get_user_pages to pin
> + * the user's pages up front, so we can copy out of the object's backing
> + * pages while holding the struct_mutex without taking page faults.
> + */
> +static int
> +i915_gem_shmem_pread_slow(struct drm_device *dev, struct drm_gem_object *obj,
> + struct drm_i915_gem_pread *args,
> + struct drm_file *file_priv)
> +{
> + struct drm_i915_gem_object *obj_priv = obj->driver_private;
> + struct mm_struct *mm = current->mm;
> + struct page **user_pages;
> + ssize_t remain;
> + loff_t offset, pinned_pages, i;
> + loff_t first_data_page, last_data_page, num_pages;
> + int shmem_page_index, shmem_page_offset;
> + int data_page_index, data_page_offset;
> + int page_length;
> + int ret;
> + uint64_t data_ptr = args->data_ptr;
> +
> + remain = args->size;
> +
> + /* Pin the user pages containing the data. We can't fault while
> + * holding the struct mutex, yet we want to hold it while
> + * dereferencing the user data.
> + */
> + first_data_page = data_ptr / PAGE_SIZE;
> + last_data_page = (data_ptr + args->size - 1) / PAGE_SIZE;
> + num_pages = last_data_page - first_data_page + 1;
> +
> + user_pages = kcalloc(num_pages, sizeof(struct page *), GFP_KERNEL);
> + if (user_pages == NULL)
> + return -ENOMEM;
> +
> + down_read(&mm->mmap_sem);
> + pinned_pages = get_user_pages(current, mm, (uintptr_t)args->data_ptr,
> + num_pages, 0, 0, user_pages, NULL);
> + up_read(&mm->mmap_sem);
> + if (pinned_pages < num_pages) {
> + ret = -EFAULT;
> + goto fail_put_user_pages;
> + }
> +
> + mutex_lock(&dev->struct_mutex);
> +
> + ret = i915_gem_object_get_pages(obj);
> + if (ret != 0)
> + goto fail_unlock;
> +
> + ret = i915_gem_object_set_cpu_read_domain_range(obj, args->offset,
> + args->size);
> + if (ret != 0)
> + goto fail_put_pages;
> +
> + obj_priv = obj->driver_private;
> + offset = args->offset;
> +
> + while (remain > 0) {
> + /* Operation in this page
> + *
> + * shmem_page_index = page number within shmem file
> + * shmem_page_offset = offset within page in shmem file
> + * data_page_index = page number in get_user_pages return
> + * data_page_offset = offset within data_page_index page.
> + * page_length = bytes to copy for this page
> + */
> + shmem_page_index = offset / PAGE_SIZE;
> + shmem_page_offset = offset & ~PAGE_MASK;
> + data_page_index = data_ptr / PAGE_SIZE - first_data_page;
> + data_page_offset = data_ptr & ~PAGE_MASK;
> +
> + page_length = remain;
> + if ((shmem_page_offset + page_length) > PAGE_SIZE)
> + page_length = PAGE_SIZE - shmem_page_offset;
> + if ((data_page_offset + page_length) > PAGE_SIZE)
> + page_length = PAGE_SIZE - data_page_offset;
> +
> + ret = slow_shmem_copy(user_pages[data_page_index],
> + data_page_offset,
> + obj_priv->pages[shmem_page_index],
> + shmem_page_offset,
> + page_length);
> + if (ret)
> + goto fail_put_pages;
> +
> + remain -= page_length;
> + data_ptr += page_length;
> + offset += page_length;
> + }
> +
> +fail_put_pages:
> + i915_gem_object_put_pages(obj);
> +fail_unlock:
> + mutex_unlock(&dev->struct_mutex);
> +fail_put_user_pages:
> + for (i = 0; i < pinned_pages; i++) {
> + SetPageDirty(user_pages[i]);
> + page_cache_release(user_pages[i]);
> + }
> + kfree(user_pages);
> +
> + return ret;
> +}
> +
> +/**
> * Reads data from the object referenced by handle.
> *
> * On error, the contents of *data are undefined.
> @@ -175,8 +366,6 @@ i915_gem_pread_ioctl(struct drm_device *dev, void *data,
> struct drm_i915_gem_pread *args = data;
> struct drm_gem_object *obj;
> struct drm_i915_gem_object *obj_priv;
> - ssize_t read;
> - loff_t offset;
> int ret;
>
> obj = drm_gem_object_lookup(dev, file_priv, args->handle);
> @@ -194,33 +383,13 @@ i915_gem_pread_ioctl(struct drm_device *dev, void *data,
> return -EINVAL;
> }
>
> - mutex_lock(&dev->struct_mutex);
> -
> - ret = i915_gem_object_set_cpu_read_domain_range(obj, args->offset,
> - args->size);
> - if (ret != 0) {
> - drm_gem_object_unreference(obj);
> - mutex_unlock(&dev->struct_mutex);
> - return ret;
> - }
> -
> - offset = args->offset;
> -
> - read = vfs_read(obj->filp, (char __user *)(uintptr_t)args->data_ptr,
> - args->size, &offset);
> - if (read != args->size) {
> - drm_gem_object_unreference(obj);
> - mutex_unlock(&dev->struct_mutex);
> - if (read < 0)
> - return read;
> - else
> - return -EINVAL;
> - }
> + ret = i915_gem_shmem_pread_fast(dev, obj, args, file_priv);
> + if (ret != 0)
> + ret = i915_gem_shmem_pread_slow(dev, obj, args, file_priv);
>
> drm_gem_object_unreference(obj);
> - mutex_unlock(&dev->struct_mutex);
>
> - return 0;
> + return ret;
> }
>
> /* This is the fast write path which cannot handle
>

2009-03-26 04:12:38

by Keith Packard

[permalink] [raw]
Subject: Re: [PATCH 4/6] drm/i915: Fix lock order reversal in shmem pread path.

On Wed, 2009-03-25 at 23:30 +0000, Dave Airlie wrote:
>
> I've no idea when a fault is likely in the fast case, i.e. will it happen
> usually on the first page etc, because if it happens on the last page and
> you fallback and restart the whole copy, I would think that would be
> sub-optimal, granted it could get ugly quick, but this code has already
> hit a few branches on the tree.

The data in question is presumably 'hot', and so unlikely to be swapped
out. Testing the slow paths doing a complete copy is fairly easy, while
testing partial copies would be considerably more difficult. The
combination of these two seems to encourage a careful and simplistic
slow path.

We can always make it 'faster' in the future, but having it wrong from
the first seems sub-optimal.

--
[email protected]



2009-03-26 19:59:39

by Eric Anholt

[permalink] [raw]
Subject: Re: [PATCH 2/6] drm/i915: Make GEM object's page lists refcounted instead of get/free.

On Wed, 2009-03-25 at 22:52 +0000, Dave Airlie wrote:
> > We've wanted this for a few consumers that touch the pages directly (such as
> > the following commit), which have been doing the refcounting outside of
> > get/put pages.
>
> No idea if this is a valid point or not but whenever I see refcount that
> isn't a kref my internal, "should this be a kref" o-meter goes off.

All usage is under the struct mutex, since "get" is the transition from
0 -> 1 refcount, so I don't see how krefs would apply here (they're
suited more to refcounting objects after creation).

Our use of krefs for our GEM object refcounts is itself somewhat dubious
-- people have noted that we're spending a decent bit of CPU on the kref
usage, since the second-hottest path is a loop of reffing objects (10-50
or so) up front and a loop of unreffing at the end. And when we're doing
that reffing, we're already holding a spinlock on the table we're looking
the object up from! So we end up with something like 4 locked bus
transactions per object in exec that could easily be reduced to 2 total
if we had a small "object referencing" lock covering the handle tables
and object refcounts.
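
To illustrate that last idea (a sketch with hypothetical structure and
function names, not existing DRM code), the handle lookup and the reference
bump would share one short locked section:

#include <linux/idr.h>
#include <linux/spinlock.h>

/* Hypothetical sketch: one small lock covers both the handle table and
 * every object's refcount, so looking an object up and referencing it is
 * a single locked section instead of a table spinlock plus an atomic kref.
 */
struct gem_handle_table {
        spinlock_t lock;        /* protects handles and obj->refcount */
        struct idr handles;
};

struct gem_obj {
        int refcount;           /* protected by gem_handle_table.lock */
        /* ... */
};

static struct gem_obj *gem_lookup_and_ref(struct gem_handle_table *t,
                                          int handle)
{
        struct gem_obj *obj;

        spin_lock(&t->lock);
        obj = idr_find(&t->handles, handle);
        if (obj)
                obj->refcount++;        /* one locked transaction, not two */
        spin_unlock(&t->lock);

        return obj;
}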

--
Eric Anholt
[email protected] [email protected]




2009-03-27 00:43:42

by Jesse Barnes

[permalink] [raw]
Subject: Re: [PATCH 1/6] drm/i915: Fix lock order reversal in GTT pwrite path.

On Wed, 25 Mar 2009 14:45:05 -0700
Eric Anholt <[email protected]> wrote:

> Since the pagefault path determines that the lock order we use has to
> be mmap_sem -> struct_mutex, we can't allow page faults to occur
> while the struct_mutex is held. To fix this in pwrite, we first try
> optimistically to see if we can copy from user without faulting. If
> it fails, fall back to using get_user_pages to pin the user's memory,
> and map those pages atomically when copying it to the GPU.
>
> Signed-off-by: Eric Anholt <[email protected]>
> ---
> + /* Pin the user pages containing the data. We can't fault
> while
> + * holding the struct mutex, and all of the pwrite
> implementations
> + * want to hold it while dereferencing the user data.
> + */
> + first_data_page = data_ptr / PAGE_SIZE;
> + last_data_page = (data_ptr + args->size - 1) / PAGE_SIZE;
> + num_pages = last_data_page - first_data_page + 1;
> +
> + user_pages = kcalloc(num_pages, sizeof(struct page *),
> GFP_KERNEL);
> + if (user_pages == NULL)
> + return -ENOMEM;

If kmalloc limits us to a 128k allocation (and maybe less under
pressure), then we'll be limited to 128k/8 page pointers on 64 bit, or
64M per pwrite... Is that ok? Or do we need to handle multiple passes
here?
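
For reference, the numbers work out as follows (a back-of-the-envelope
check, assuming 4K pages and 8-byte pointers; the macro names are made up):

/* Back-of-the-envelope check of the limit under discussion, assuming
 * 4K pages and 8-byte pointers on 64-bit.
 */
#define MAX_KMALLOC     (128 * 1024)                    /* 128K allocation      */
#define PTRS_PER_ALLOC  (MAX_KMALLOC / 8)               /* 16384 page pointers  */
#define MAX_PWRITE      (PTRS_PER_ALLOC * 4096UL)       /* 64MB per pwrite call */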

Looks good other than that.

Reviewed-by: Jesse Barnes <[email protected]>

--
Jesse Barnes, Intel Open Source Technology Center

2009-03-27 00:47:55

by Jesse Barnes

[permalink] [raw]
Subject: Re: [PATCH 2/6] drm/i915: Make GEM object's page lists refcounted instead of get/free.

On Wed, 25 Mar 2009 14:45:06 -0700
Eric Anholt <[email protected]> wrote:

> We've wanted this for a few consumers that touch the pages directly
> (such as the following commit), which have been doing the refcounting
> outside of get/put pages.

Looks good; seems like it should be a small perf win too. Did you
measure it?

Reviewed-by: Jesse Barnes <[email protected]>

--
Jesse Barnes, Intel Open Source Technology Center

2009-03-27 00:51:08

by Jesse Barnes

[permalink] [raw]
Subject: Re: [PATCH 3/6] drm/i915: Fix lock order reversal in shmem pwrite path.

On Wed, 25 Mar 2009 14:45:07 -0700
Eric Anholt <[email protected]> wrote:

> Like the GTT pwrite path fix, this uses an optimistic path and a
> fallback to get_user_pages. Note that this means we have to stop
> using vfs_write and roll it ourselves.
>
> Signed-off-by: Eric Anholt <[email protected]>

A shame that it's so similar but can't share more code...

Reviewed-by: Jesse Barnes <[email protected]>

--
Jesse Barnes, Intel Open Source Technology Center

2009-03-27 00:51:51

by Jesse Barnes

[permalink] [raw]
Subject: Re: [PATCH 4/6] drm/i915: Fix lock order reversal in shmem pread path.

On Wed, 25 Mar 2009 14:45:08 -0700
Eric Anholt <[email protected]> wrote:

> Signed-off-by: Eric Anholt <[email protected]>

Same comments as 3/6...

Reviewed-by: Jesse Barnes <[email protected]>

--
Jesse Barnes, Intel Open Source Technology Center

2009-03-27 00:53:37

by Jesse Barnes

[permalink] [raw]
Subject: Re: [PATCH 5/6] drm/i915: Fix lock order reversal with cliprects and cmdbuf in non-DRI2 paths.

On Wed, 25 Mar 2009 14:45:09 -0700
Eric Anholt <[email protected]> wrote:

> This introduces allocation in the batch submission path that wasn't
> there previously, but these are compatibility paths so we care about
> simplicity more than performance.
>
> kernel.org bug #12419.

Don't care too much here as long as it's been tested.

Acked-by: Jesse Barnes <[email protected]>

--
Jesse Barnes, Intel Open Source Technology Center

2009-03-27 09:35:49

by Andi Kleen

[permalink] [raw]
Subject: Re: DRM lock ordering fix series

Eric Anholt <[email protected]> writes:

> Here's hopefully the final attempt at the lock ordering fix for GEM. The
> problem was introduced in .29 with the GTT mapping support. We hashed out
> a few potential fixes on the mailing list and at OSTS. Peter's plan was
> to use get_user_pages, but it has significant CPU overhead (10% cost to text
> rendering, though part of that is due to some dumb userland code. But it's
> dumb userland code we're all running).


You are aware that there is a fast path now (get_user_pages_fast) which
is significantly faster? (but has some limitations)

-Andi

--
[email protected] -- Speaking for myself only.

2009-03-27 16:19:43

by Eric Anholt

[permalink] [raw]
Subject: Re: DRM lock ordering fix series

On Fri, 2009-03-27 at 10:34 +0100, Andi Kleen wrote:
> Eric Anholt <[email protected]> writes:
>
> > Here's hopefully the final attempt at the lock ordering fix for GEM. The
> > problem was introduced in .29 with the GTT mapping support. We hashed out
> > a few potential fixes on the mailing list and at OSTS. Peter's plan was
> > to use get_user_pages, but it has significant CPU overhead (10% cost to text
> > rendering, though part of that is due to some dumb userland code. But it's
> > dumb userland code we're all running).
>
>
> You are aware that there is a fast path now (get_user_pages_fast) which
> is significantly faster? (but has some limitations)

In the code I have, get_user_pages_fast is just a wrapper that calls the
get_user_pages in the way that I'm calling it from the DRM.

I'm assuming that that's changing. Can you explain what the "some
limitations" are? (Though, of course, a comment in mm/util.c would be
better than email.)

--
Eric Anholt
[email protected] [email protected]




2009-03-27 16:36:57

by Eric Anholt

[permalink] [raw]
Subject: Re: DRM lock ordering fix series

On Fri, 2009-03-27 at 09:19 -0700, Eric Anholt wrote:
> On Fri, 2009-03-27 at 10:34 +0100, Andi Kleen wrote:
> > Eric Anholt <[email protected]> writes:
> >
> > > Here's hopefully the final attempt at the lock ordering fix for GEM. The
> > > problem was introduced in .29 with the GTT mapping support. We hashed out
> > > a few potential fixes on the mailing list and at OSTS. Peter's plan was
> > > to use get_user_pages, but it has significant CPU overhead (10% cost to text
> > > rendering, though part of that is due to some dumb userland code. But it's
> > > dumb userland code we're all running).
> >
> >
> > You are aware that there is a fast path now (get_user_pages_fast) which
> > is significantly faster? (but has some limitations)
>
> In the code I have, get_user_pages_fast is just a wrapper that calls the
> get_user_pages in the way that I'm calling it from the DRM.

Ah, I see: that's a weak stub, and there is a real implementation. I
didn't know we could do weak stubs.
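
For readers following along, the generic fallback in mm/util.c looked
roughly like this at the time (paraphrased; treat it as a sketch rather
than the exact source):

#include <linux/mm.h>
#include <linux/sched.h>

/* Weak generic version: architectures with a real fast path (x86, ppc)
 * provide their own get_user_pages_fast() and override this symbol.
 */
int __attribute__((weak)) get_user_pages_fast(unsigned long start,
                                              int nr_pages, int write,
                                              struct page **pages)
{
        struct mm_struct *mm = current->mm;
        int ret;

        down_read(&mm->mmap_sem);
        ret = get_user_pages(current, mm, start, nr_pages,
                             write, 0, pages, NULL);
        up_read(&mm->mmap_sem);

        return ret;
}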

Still, needs docs badly.

--
Eric Anholt
[email protected] [email protected]




2009-03-27 16:56:19

by Eric Anholt

[permalink] [raw]
Subject: Re: [PATCH 1/6] drm/i915: Fix lock order reversal in GTT pwrite path.

On Thu, 2009-03-26 at 17:43 -0700, Jesse Barnes wrote:
> On Wed, 25 Mar 2009 14:45:05 -0700
> Eric Anholt <[email protected]> wrote:
>
> > Since the pagefault path determines that the lock order we use has to
> > be mmap_sem -> struct_mutex, we can't allow page faults to occur
> > while the struct_mutex is held. To fix this in pwrite, we first try
> > optimistically to see if we can copy from user without faulting. If
> > it fails, fall back to using get_user_pages to pin the user's memory,
> > and map those pages atomically when copying it to the GPU.
> >
> > Signed-off-by: Eric Anholt <[email protected]>
> > ---
> > + /* Pin the user pages containing the data. We can't fault
> > while
> > + * holding the struct mutex, and all of the pwrite
> > implementations
> > + * want to hold it while dereferencing the user data.
> > + */
> > + first_data_page = data_ptr / PAGE_SIZE;
> > + last_data_page = (data_ptr + args->size - 1) / PAGE_SIZE;
> > + num_pages = last_data_page - first_data_page + 1;
> > +
> > + user_pages = kcalloc(num_pages, sizeof(struct page *),
> > GFP_KERNEL);
> > + if (user_pages == NULL)
> > + return -ENOMEM;
>
> If kmalloc limits us to a 128k allocation (and maybe less under
> pressure), then we'll be limited to 128k/8 page pointers on 64 bit, or
> 64M per pwrite... Is that ok? Or do we need to handle multiple passes
> here?

That's a really good point. This hurts. However, we're already in
pain:
obj_priv->page_list = drm_calloc(page_count, sizeof(struct page *),
DRM_MEM_DRIVER);

drm_calloc is kcalloc, so we already fall on our faces with big objects,
before this code. Thinking about potential regressions for big objects
from the change in question:

pixmaps: Can't render with them already. X only limits you to 4GB
pixmaps. Doesn't use pread/pwrite.

textures: Can't render with them already. Largest texture size is
2048*2048*4*6*1.5 or so for a mipmapped cube map, or around 150MB. This
would fail on 32-bit as well. Doesn't use pread/write.

FBOs: Can't render with them. Same size as textures. Software
fallbacks use pread/pwrite, but it's always done a page at a time.

VBOs (965): Can't render with them. No size limitations I know of.

VBOs (915): Not used for rendering, just intermediate storage (this is a
bug). No size limitations I know of. So here we would regress huge
VBOs on 915 when uploaded using BufferData instead of MapBuffer
(unlikely). Of course, it's already a bug that we make real VBOs on 915
before it's strictly necessary.

PBOs: Can't render with them. Normal usage wouldn't be big enough to
trigger the bug, though. Does use pread/pwrite when accessed using
{Get,}Buffer{Sub,}Data.

My summary here would be: Huge objects are already pretty thoroughly
broken, since any acceleration using them fails at the kcalloc of the
page list when binding to the GTT. Doing one more kcalloc of a page list
isn't significantly changing the situation.

I propose going forward with these patches, and I'll go off and build
some small testcases for our various interfaces with big objects so we
can fix them and make sure we stay correct.

--
Eric Anholt
[email protected] [email protected]




2009-03-27 17:07:33

by Jesse Barnes

[permalink] [raw]
Subject: Re: [PATCH 1/6] drm/i915: Fix lock order reversal in GTT pwrite path.

On Fri, 27 Mar 2009 09:56:03 -0700
Eric Anholt <[email protected]> wrote:

> On Thu, 2009-03-26 at 17:43 -0700, Jesse Barnes wrote:
> > On Wed, 25 Mar 2009 14:45:05 -0700
> > Eric Anholt <[email protected]> wrote:
> >
> > > Since the pagefault path determines that the lock order we use
> > > has to be mmap_sem -> struct_mutex, we can't allow page faults to
> > > occur while the struct_mutex is held. To fix this in pwrite, we
> > > first try optimistically to see if we can copy from user without
> > > faulting. If it fails, fall back to using get_user_pages to pin
> > > the user's memory, and map those pages atomically when copying it
> > > to the GPU.
> > >
> > > Signed-off-by: Eric Anholt <[email protected]>
> > > ---
> > > + /* Pin the user pages containing the data. We can't
> > > fault while
> > > + * holding the struct mutex, and all of the pwrite
> > > implementations
> > > + * want to hold it while dereferencing the user data.
> > > + */
> > > + first_data_page = data_ptr / PAGE_SIZE;
> > > + last_data_page = (data_ptr + args->size - 1) / PAGE_SIZE;
> > > + num_pages = last_data_page - first_data_page + 1;
> > > +
> > > + user_pages = kcalloc(num_pages, sizeof(struct page *),
> > > GFP_KERNEL);
> > > + if (user_pages == NULL)
> > > + return -ENOMEM;
> >
> > If kmalloc limits us to a 128k allocation (and maybe less under
> > pressure), then we'll be limited to 128k/8 page pointers on 64 bit,
> > or 64M per pwrite... Is that ok? Or do we need to handle multiple
> > passes here?
>
> That's a really good point. This hurts. However, we're already in
> pain:
> obj_priv->page_list = drm_calloc(page_count, sizeof(struct
> page *), DRM_MEM_DRIVER);
>
> drm_calloc is kcalloc, so we already fall on our faces with big
> objects, before this code. Thinking about potential regressions for
> big objects from the change in question:
>
> pixmaps: Can't render with them already. X only limits you to 4GB
> pixmaps. Doesn't use pread/pwrite.
>
> textures: Can't render with them already. Largest texture size is
> 2048*2048*4*6*1.5 or so for a mipmapped cube map, or around 150MB.
> This would fail on 32-bit as well. Doesn't use pread/write.
>
> FBOs: Can't render with them. Same size as textures. Software
> fallbacks use pread/pwrite, but it's always done a page at a time.
>
> VBOs (965): Can't render with them. No size limitations I know of.
>
> VBOs (915): Not used for rendering, just intermediate storage (this
> is a bug). No size limitations I know of. So here we would regress
> huge VBOs on 915 when uploaded using BufferData instead of MapBuffer
> (unlikely). Of course, it's already a bug that we make real VBOs on
> 915 before it's strictly necessary.
>
> PBOs: Can't render with them. Normal usage wouldn't be big enough to
> trigger the bug, though. Does use pread/pwrite when accessed using
> {Get,}Buffer{Sub,}Data.
>
> My summary here would be: Huge objects are already pretty thoroughly
> broken, since any acceleration using them fails at the kcalloc of the
> page list when binding to the GTT. Doing one more kalloc of a page
> list isn't significantly changing the situation.
>
> I propose going forward with these patches, and I'll go off and build
> some small testcases for our various interfaces with big objects so we
> can fix them and make sure we stay correct.

Great, thanks for looking into it. I figured there was probably
similar breakage elsewhere, so there's no reason to block this
patchset. I agree large stuff should be fixed up in a separate set.

--
Jesse Barnes, Intel Open Source Technology Center

2009-03-27 18:09:20

by Andi Kleen

[permalink] [raw]
Subject: Re: DRM lock ordering fix series

On Fri, Mar 27, 2009 at 09:36:45AM -0700, Eric Anholt wrote:
> > > You are aware that there is a fast path now (get_user_pages_fast) which
> > > is significantly faster? (but has some limitations)
> >
> > In the code I have, get_user_pages_fast is just a wrapper that calls the
> > get_user_pages in the way that I'm calling it from the DRM.
>
> Ah, I see: that's a weak stub, and there is a real implementation. I
> didn't know we could do weak stubs.

The main limitation is that it only works for your current process,
not another one. For more details you can check the git changelog
that added it (8174c430e445a93016ef18f717fe570214fa38bf)

And yes it's only faster for architectures that support it, that's
currently x86 and ppc.
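
In call-site terms the difference is roughly the following (a sketch based
on the calls quoted earlier in this thread; pin_user_buffer() is a made-up
name, not a kernel API):

#include <linux/mm.h>
#include <linux/sched.h>

/* Sketch of the two ways of pinning the user's pages discussed here. */
static int pin_user_buffer(unsigned long user_addr, int num_pages,
                           struct page **user_pages, int use_fast)
{
        struct mm_struct *mm = current->mm;
        int pinned;

        if (use_fast) {
                /* No mmap_sem and current->mm only; a real fast path only
                 * on architectures that implement it (x86 and ppc here).
                 */
                pinned = get_user_pages_fast(user_addr, num_pages,
                                             0 /* write */, user_pages);
        } else {
                down_read(&mm->mmap_sem);
                pinned = get_user_pages(current, mm, user_addr, num_pages,
                                        0 /* write */, 0 /* force */,
                                        user_pages, NULL);
                up_read(&mm->mmap_sem);
        }

        return pinned;
}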

-Andi

--
[email protected] -- Speaking for myself only.

2009-03-27 20:12:59

by Eric Anholt

[permalink] [raw]
Subject: Re: DRM lock ordering fix series

On Fri, 2009-03-27 at 19:10 +0100, Andi Kleen wrote:
> On Fri, Mar 27, 2009 at 09:36:45AM -0700, Eric Anholt wrote:
> > > > You are aware that there is a fast path now (get_user_pages_fast) which
> > > > is significantly faster? (but has some limitations)
> > >
> > > In the code I have, get_user_pages_fast is just a wrapper that calls the
> > > get_user_pages in the way that I'm calling it from the DRM.
> >
> > Ah, I see: that's a weak stub, and there is a real implementation. I
> > didn't know we could do weak stubs.
>
> The main limitation is that it only works for your current process,
> not another one. For more details you can check the git changelog
> that added it (8174c430e445a93016ef18f717fe570214fa38bf)
>
> And yes it's only faster for architectures that support it, that's
> currently x86 and ppc.

OK. I'm not too excited here -- 10% of 2% of the CPU time doesn't get
me to the 10% loss that the slow path added up to. Most of the cost is
in k{un,}map_atomic of the returned pages. If the gup somehow filled in
the user's PTEs, I'd be happy and always use that (since then I'd have
the mapping already in place and just use that). But I think I can see
why that can't be done.

I suppose I could rework this so that we get_user_pages_fast outside the
lock, then walk doing copy_from_user_inatomic, and fall back to
kmap_atomic of the page list if we fault on the user's address. It's
still going to be a cost in our hot path, though, so I'd rather not.
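
Roughly, that rework would turn the per-chunk copy into something like this
(a sketch of the idea only, not a tested patch; the helper is hypothetical):

#include <linux/highmem.h>
#include <linux/string.h>
#include <linux/uaccess.h>

/* Sketch: dst is a kernel mapping of the object's page; user_ptr and
 * user_page reach the same user data two ways -- directly, and via the
 * page pinned up front with get_user_pages_fast().
 */
static void copy_user_chunk(void *dst, const char __user *user_ptr,
                            struct page *user_page, int user_page_offset,
                            int len)
{
        char *src;

        /* Optimistic path: may fail on a fault, but never sleeps. */
        if (__copy_from_user_inatomic(dst, user_ptr, len) == 0)
                return;

        /* Fallback: the pinned page cannot fault. */
        src = kmap_atomic(user_page, KM_USER0);
        memcpy(dst, src + user_page_offset, len);
        kunmap_atomic(src, KM_USER0);
}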

I'm working on a set of tests and microbenchmarks for GEM, so other
people will be able to play with this easily soon.

--
Eric Anholt
[email protected] [email protected]




2009-03-27 21:04:35

by Andi Kleen

[permalink] [raw]
Subject: Re: DRM lock ordering fix series

> OK. I'm not too excited here -- 10% of 2% of the CPU time doesn't get
> me to the 10% loss that the slow path added up to. Most of the cost is
> in k{un,}map_atomic of the returned pages. If the gup somehow filled in
> the user's PTEs, I'd be happy and always use that (since then I'd have

On x86 the user PTEs are already there if it's your current process
context, so you could just use them. And it's even safe to use them as long
as it's locked. But that would be an x86-specific hack, and it wouldn't work
on platforms that have split kernel/user address spaces (that includes UML).

-Andi

--
[email protected] -- Speaking for myself only.

2009-03-28 00:54:47

by Peter Zijlstra

[permalink] [raw]
Subject: Re: [PATCH 1/6] drm/i915: Fix lock order reversal in GTT pwrite path.

On Thu, 2009-03-26 at 17:43 -0700, Jesse Barnes wrote:
> On Wed, 25 Mar 2009 14:45:05 -0700
> Eric Anholt <[email protected]> wrote:
>
> > Since the pagefault path determines that the lock order we use has to
> > be mmap_sem -> struct_mutex, we can't allow page faults to occur
> > while the struct_mutex is held. To fix this in pwrite, we first try
> > optimistically to see if we can copy from user without faulting. If
> > it fails, fall back to using get_user_pages to pin the user's memory,
> > and map those pages atomically when copying it to the GPU.
> >
> > Signed-off-by: Eric Anholt <[email protected]>
> > ---
> > + /* Pin the user pages containing the data. We can't fault
> > while
> > + * holding the struct mutex, and all of the pwrite
> > implementations
> > + * want to hold it while dereferencing the user data.
> > + */
> > + first_data_page = data_ptr / PAGE_SIZE;
> > + last_data_page = (data_ptr + args->size - 1) / PAGE_SIZE;
> > + num_pages = last_data_page - first_data_page + 1;
> > +
> > + user_pages = kcalloc(num_pages, sizeof(struct page *),
> > GFP_KERNEL);
> > + if (user_pages == NULL)
> > + return -ENOMEM;
>
> If kmalloc limits us to a 128k allocation (and maybe less under
> pressure), then we'll be limited to 128k/8 page pointers on 64 bit, or
> 64M per pwrite... Is that ok? Or do we need to handle multiple passes
> here?

While officially supported, a 128k kmalloc is _very_ likely to fail, it
would require an order 5 page allocation to back that, and that is well
outside of comfortable.


2009-03-28 00:58:58

by Peter Zijlstra

[permalink] [raw]
Subject: Re: DRM lock ordering fix series

On Fri, 2009-03-27 at 13:10 -0700, Eric Anholt wrote:
> On Fri, 2009-03-27 at 19:10 +0100, Andi Kleen wrote:
> > On Fri, Mar 27, 2009 at 09:36:45AM -0700, Eric Anholt wrote:
> > > > > You are aware that there is a fast path now (get_user_pages_fast) which
> > > > > is significantly faster? (but has some limitations)
> > > >
> > > > In the code I have, get_user_pages_fast is just a wrapper that calls the
> > > > get_user_pages in the way that I'm calling it from the DRM.
> > >
> > > Ah, I see: that's a weak stub, and there is a real implementation. I
> > > didn't know we could do weak stubs.
> >
> > The main limitation is that it only works for your current process,
> > not another one. For more details you can check the git changelog
> > that added it (8174c430e445a93016ef18f717fe570214fa38bf)
> >
> > And yes it's only faster for architectures that support it, that's
> > currently x86 and ppc.
>
> OK. I'm not too excited here -- 10% of 2% of the CPU time doesn't get
> me to the 10% loss that the slow path added up to. Most of the cost is
> in k{un,}map_atomic of the returned pages.

Also note that doing large gup() with gup_fast() will be undesirable due
to it disabling IRQs. So iterating say several MB worth of pages will
hurt like crazy. Currently all gup_fast() users do a single page lookup.

2009-03-28 01:30:20

by Peter Zijlstra

[permalink] [raw]
Subject: Re: DRM lock ordering fix series

On Sat, 2009-03-28 at 01:58 +0100, Peter Zijlstra wrote:

> > OK. I'm not too excited here -- 10% of 2% of the CPU time doesn't get
> > me to the 10% loss that the slow path added up to. Most of the cost is
> > in k{un,}map_atomic of the returned pages.
>
> Also note that doing large gup() with gup_fast() will be undesirable due
> to it disabling IRQs. So iterating say several MB worth of pages will
> hurt like crazy. Currently all gup_fast() users do a single page lookup.

Also, what's this weird fascination with 32-bit? Can you even buy a
32-bit-only CPU these days?

2009-03-28 02:35:56

by Jesse Barnes

[permalink] [raw]
Subject: Re: [PATCH 1/6] drm/i915: Fix lock order reversal in GTT pwrite path.

On Sat, 28 Mar 2009 01:54:32 +0100
Peter Zijlstra <[email protected]> wrote:

> On Thu, 2009-03-26 at 17:43 -0700, Jesse Barnes wrote:
> > On Wed, 25 Mar 2009 14:45:05 -0700
> > Eric Anholt <[email protected]> wrote:
> >
> > > Since the pagefault path determines that the lock order we use
> > > has to be mmap_sem -> struct_mutex, we can't allow page faults to
> > > occur while the struct_mutex is held. To fix this in pwrite, we
> > > first try optimistically to see if we can copy from user without
> > > faulting. If it fails, fall back to using get_user_pages to pin
> > > the user's memory, and map those pages atomically when copying it
> > > to the GPU.
> > >
> > > Signed-off-by: Eric Anholt <[email protected]>
> > > ---
> > > + /* Pin the user pages containing the data. We can't
> > > fault while
> > > + * holding the struct mutex, and all of the pwrite
> > > implementations
> > > + * want to hold it while dereferencing the user data.
> > > + */
> > > + first_data_page = data_ptr / PAGE_SIZE;
> > > + last_data_page = (data_ptr + args->size - 1) / PAGE_SIZE;
> > > + num_pages = last_data_page - first_data_page + 1;
> > > +
> > > + user_pages = kcalloc(num_pages, sizeof(struct page *),
> > > GFP_KERNEL);
> > > + if (user_pages == NULL)
> > > + return -ENOMEM;
> >
> > If kmalloc limits us to a 128k allocation (and maybe less under
> > pressure), then we'll be limited to 128k/8 page pointers on 64 bit,
> > or 64M per pwrite... Is that ok? Or do we need to handle multiple
> > passes here?
>
> While officially supported, a 128k kmalloc is _very_ likely to fail,
> it would require an order 5 page allocation to back that, and that is
> well outside of comfortable.

Yeah, my "and maybe less" could have been worded a tad more strongly. ;)
Do we have stats on which kmalloc buckets have available allocations
anywhere for machines under various workloads? I know under heavy
pressure even 8k allocations can fail, but since this is a GFP_KERNEL
things should be a *little* better.

--
Jesse Barnes, Intel Open Source Technology Center

2009-03-28 05:22:20

by Dave Airlie

[permalink] [raw]
Subject: Re: [PATCH 1/6] drm/i915: Fix lock order reversal in GTT pwrite path.

On Sat, Mar 28, 2009 at 12:35 PM, Jesse Barnes <[email protected]> wrote:
> On Sat, 28 Mar 2009 01:54:32 +0100
> Peter Zijlstra <[email protected]> wrote:
>
>> On Thu, 2009-03-26 at 17:43 -0700, Jesse Barnes wrote:
>> > On Wed, 25 Mar 2009 14:45:05 -0700
>> > Eric Anholt <[email protected]> wrote:
>> >
>> > > Since the pagefault path determines that the lock order we use
>> > > has to be mmap_sem -> struct_mutex, we can't allow page faults to
>> > > occur while the struct_mutex is held. To fix this in pwrite, we
>> > > first try optimistically to see if we can copy from user without
>> > > faulting. If it fails, fall back to using get_user_pages to pin
>> > > the user's memory, and map those pages atomically when copying it
>> > > to the GPU.
>> > >
>> > > Signed-off-by: Eric Anholt <[email protected]>
>> > > ---
>> > > + /* Pin the user pages containing the data. We can't
>> > > fault while
>> > > + * holding the struct mutex, and all of the pwrite
>> > > implementations
>> > > + * want to hold it while dereferencing the user data.
>> > > + */
>> > > + first_data_page = data_ptr / PAGE_SIZE;
>> > > + last_data_page = (data_ptr + args->size - 1) / PAGE_SIZE;
>> > > + num_pages = last_data_page - first_data_page + 1;
>> > > +
>> > > + user_pages = kcalloc(num_pages, sizeof(struct page *),
>> > > GFP_KERNEL);
>> > > + if (user_pages == NULL)
>> > > + return -ENOMEM;
>> >
>> > If kmalloc limits us to a 128k allocation (and maybe less under
>> > pressure), then we'll be limited to 128k/8 page pointers on 64 bit,
>> > or 64M per pwrite... Is that ok? Or do we need to handle multiple
>> > passes here?
>>
>> While officially supported, a 128k kmalloc is _very_ likely to fail,
>> it would require an order 5 page allocation to back that, and that is
>> well outside of comfortable.
>
> Yeah, my "and maybe less" could have been worded a tad more strongly. ;)
> Do we have stats on which kmalloc buckets have available allocations
> anywhere for machines under various workloads? I know under heavy
> pressure even 8k allocations can fail, but since this is a GFP_KERNEL
> things should be a *little* better.

You might want to check what TTM did, it had a vmalloc fallback.
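
The TTM-style fallback being referred to is roughly this pattern (a sketch
with made-up helper names, not the actual TTM code):

#include <linux/mm.h>
#include <linux/slab.h>
#include <linux/string.h>
#include <linux/vmalloc.h>

/* Allocate a large page-pointer array, falling back to vmalloc when the
 * request is too big for kmalloc to satisfy reliably.
 */
static void *alloc_page_array(size_t count)
{
        size_t size = count * sizeof(struct page *);
        void *ptr;

        ptr = kmalloc(size, GFP_KERNEL | __GFP_NOWARN);
        if (ptr == NULL)
                ptr = vmalloc(size);
        if (ptr != NULL)
                memset(ptr, 0, size);
        return ptr;
}

static void free_page_array(void *ptr)
{
        if (is_vmalloc_addr(ptr))
                vfree(ptr);
        else
                kfree(ptr);
}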

Another issue that might need checking in GEM is whether unprivileged
userspace can cause the kernel to do unbounded small kmallocs and keep
them alive, via object references or anything like that.

Dave.
>
> --
> Jesse Barnes, Intel Open Source Technology Center

2009-03-28 09:06:22

by Brice Goglin

[permalink] [raw]
Subject: Re: DRM lock ordering fix series

Peter Zijlstra wrote:
> Also note that doing large gup() with gup_fast() will be undesirable due
> to it disabling IRQs. So iterating say several MB worth of pages will
> hurt like crazy. Currently all gup_fast() users do a single page lookup.
>

In 2.6.29, fs/bio.c:955, fs/direct-io.c:153 and fs/splice.c:1222 do
multiple-page lookups at once. The latter might be limited to 16 pages
because of the pipe depth; I don't know about the former two.

Is there some sort of reasonable limit? A couple dozen pages at once, maybe?

Brice

2009-03-28 10:49:05

by Peter Zijlstra

[permalink] [raw]
Subject: Re: DRM lock ordering fix series

On Sat, 2009-03-28 at 09:46 +0100, Brice Goglin wrote:
> Peter Zijlstra wrote:
> > Also note that doing large gup() with gup_fast() will be undesirable due
> > to it disabling IRQs. So iterating say several MB worth of pages will
> > hurt like crazy. Currently all gup_fast() users do a single page lookup.
> >
>
> In 2.6.29, fs/bio.c:955, fs/direct-io.c:153 and fs/splice.c:1222 do
> multiple-pages lookup at once. The latter might be limited to 16 pages
> because of the pipe-depth, I don't know about the formers.
>
> Is there some sort of reasonable limit? A couple dozens pages at once maybe?

Depends on your latency requirements, looking at the code I'd say we'd
have to add that batch limit the comment talks about. I'd see preempt-rt
wanting to lower that significantly.

Regular mainline could do with 32-64 I guess, max irq latency is well
over 10ms on mainline anyway.

2009-03-28 12:22:29

by Peter Zijlstra

[permalink] [raw]
Subject: [RFC] x86: gup_fast() batch limit (was: DRM lock ordering fix series)

On Sat, 2009-03-28 at 11:48 +0100, Peter Zijlstra wrote:
> On Sat, 2009-03-28 at 09:46 +0100, Brice Goglin wrote:
> > Peter Zijlstra wrote:
> > > Also note that doing large gup() with gup_fast() will be undesirable due
> > > to it disabling IRQs. So iterating say several MB worth of pages will
> > > hurt like crazy. Currently all gup_fast() users do a single page lookup.
> > >
> >
> > In 2.6.29, fs/bio.c:955, fs/direct-io.c:153 and fs/splice.c:1222 do
> > multiple-pages lookup at once. The latter might be limited to 16 pages
> > because of the pipe-depth, I don't know about the formers.
> >
> > Is there some sort of reasonable limit? A couple dozens pages at once maybe?
>
> Depends on your latency requirements, looking at the code I'd say we'd
> have to add that batch limit the comment talks about. I'd see preempt-rt
> wanting to lower that significantly.
>
> Regular mainline could do with 32-64 I guess, max irq latency is well
> over 10ms on mainline anyway.

I'm not really trusting my brain today, but something like the below
should work I think.

Nick, any thoughts?

Not-Signed-off-by: Peter Zijlstra <[email protected]>
---
arch/x86/mm/gup.c | 24 +++++++++++++++++++++---
1 files changed, 21 insertions(+), 3 deletions(-)

diff --git a/arch/x86/mm/gup.c b/arch/x86/mm/gup.c
index be54176..4ded5c3 100644
--- a/arch/x86/mm/gup.c
+++ b/arch/x86/mm/gup.c
@@ -11,6 +11,8 @@

#include <asm/pgtable.h>

+#define GUP_BATCH 32
+
static inline pte_t gup_get_pte(pte_t *ptep)
{
#ifndef CONFIG_X86_PAE
@@ -91,7 +93,8 @@ static noinline int gup_pte_range(pmd_t pmd, unsigned long addr,
get_page(page);
pages[*nr] = page;
(*nr)++;
-
+ if (*nr > GUP_BATCH)
+ break;
} while (ptep++, addr += PAGE_SIZE, addr != end);
pte_unmap(ptep - 1);

@@ -157,6 +160,8 @@ static int gup_pmd_range(pud_t pud, unsigned long addr, unsigned long end,
if (!gup_pte_range(pmd, addr, next, write, pages, nr))
return 0;
}
+ if (*nr > GUP_BATCH)
+ break;
} while (pmdp++, addr = next, addr != end);

return 1;
@@ -214,6 +219,8 @@ static int gup_pud_range(pgd_t pgd, unsigned long addr, unsigned long end,
if (!gup_pmd_range(pud, addr, next, write, pages, nr))
return 0;
}
+ if (*nr > GUP_BATCH)
+ break;
} while (pudp++, addr = next, addr != end);

return 1;
@@ -226,7 +233,7 @@ int get_user_pages_fast(unsigned long start, int nr_pages, int write,
unsigned long addr, len, end;
unsigned long next;
pgd_t *pgdp;
- int nr = 0;
+ int batch = 0, nr = 0;

start &= PAGE_MASK;
addr = start;
@@ -254,6 +261,7 @@ int get_user_pages_fast(unsigned long start, int nr_pages, int write,
* (which we do on x86, with the above PAE exception), we can follow the
* address down to the the page and take a ref on it.
*/
+again:
local_irq_disable();
pgdp = pgd_offset(mm, addr);
do {
@@ -262,11 +270,21 @@ int get_user_pages_fast(unsigned long start, int nr_pages, int write,
next = pgd_addr_end(addr, end);
if (pgd_none(pgd))
goto slow;
- if (!gup_pud_range(pgd, addr, next, write, pages, &nr))
+ if (!gup_pud_range(pgd, addr, next, write, pages, &batch))
goto slow;
+ if (batch > GUP_BATCH) {
+ local_irq_enable();
+ addr += batch << PAGE_SHIFT;
+ nr += batch;
+ batch = 0;
+ if (addr != end)
+ goto again;
+ }
} while (pgdp++, addr = next, addr != end);
local_irq_enable();

+ nr += batch;
+
VM_BUG_ON(nr != (end - start) >> PAGE_SHIFT);
return nr;


2009-03-28 12:46:31

by Peter Zijlstra

[permalink] [raw]
Subject: Re: [RFC] x86: gup_fast() batch limit (was: DRM lock ordering fix series)

On Sat, 2009-03-28 at 13:22 +0100, Peter Zijlstra wrote:

> I'm not really trusting my brain today, but something like the below
> should work I think.
>
> Nick, any thoughts?
>
> Not-Signed-off-by: Peter Zijlstra <[email protected]>
> ---
> arch/x86/mm/gup.c | 24 +++++++++++++++++++++---
> 1 files changed, 21 insertions(+), 3 deletions(-)
>
> diff --git a/arch/x86/mm/gup.c b/arch/x86/mm/gup.c
> index be54176..4ded5c3 100644
> --- a/arch/x86/mm/gup.c
> +++ b/arch/x86/mm/gup.c
> @@ -11,6 +11,8 @@
>
> #include <asm/pgtable.h>
>
> +#define GUP_BATCH 32
> +
> static inline pte_t gup_get_pte(pte_t *ptep)
> {
> #ifndef CONFIG_X86_PAE
> @@ -91,7 +93,8 @@ static noinline int gup_pte_range(pmd_t pmd, unsigned long addr,
> get_page(page);
> pages[*nr] = page;
> (*nr)++;
> -
> + if (*nr > GUP_BATCH)
> + break;
> } while (ptep++, addr += PAGE_SIZE, addr != end);
> pte_unmap(ptep - 1);
>
> @@ -157,6 +160,8 @@ static int gup_pmd_range(pud_t pud, unsigned long addr, unsigned long end,
> if (!gup_pte_range(pmd, addr, next, write, pages, nr))
> return 0;
> }
> + if (*nr > GUP_BATCH)
> + break;
> } while (pmdp++, addr = next, addr != end);
>
> return 1;
> @@ -214,6 +219,8 @@ static int gup_pud_range(pgd_t pgd, unsigned long addr, unsigned long end,
> if (!gup_pmd_range(pud, addr, next, write, pages, nr))
> return 0;
> }
> + if (*nr > GUP_BATCH)
> + break;
> } while (pudp++, addr = next, addr != end);
>
> return 1;
> @@ -226,7 +233,7 @@ int get_user_pages_fast(unsigned long start, int nr_pages, int write,
> unsigned long addr, len, end;
> unsigned long next;
> pgd_t *pgdp;
> - int nr = 0;
> + int batch = 0, nr = 0;
>
> start &= PAGE_MASK;
> addr = start;
> @@ -254,6 +261,7 @@ int get_user_pages_fast(unsigned long start, int nr_pages, int write,
> * (which we do on x86, with the above PAE exception), we can follow the
> * address down to the the page and take a ref on it.
> */
> +again:
> local_irq_disable();
> pgdp = pgd_offset(mm, addr);
> do {
> @@ -262,11 +270,21 @@ int get_user_pages_fast(unsigned long start, int nr_pages, int write,
> next = pgd_addr_end(addr, end);
> if (pgd_none(pgd))
> goto slow;
> - if (!gup_pud_range(pgd, addr, next, write, pages, &nr))
> + if (!gup_pud_range(pgd, addr, next, write, pages, &batch))
> goto slow;
> + if (batch > GUP_BATCH) {
> + local_irq_enable();
> + addr += batch << PAGE_SHIFT;
> + nr += batch;
> + batch = 0;
> + if (addr != end)
> + goto again;
> + }
> } while (pgdp++, addr = next, addr != end);
> local_irq_enable();
>
> + nr += batch;
> +
> VM_BUG_ON(nr != (end - start) >> PAGE_SHIFT);
> return nr;
>

Would also need the following bit:

@@ -274,6 +292,7 @@ int get_user_pages_fast(unsigned long start, int nr_pages, int write,
int ret;

slow:
+ nr += batch;
local_irq_enable();
slow_irqon:
/* Try to get the remaining pages with get_user_pages */

2009-03-30 06:29:46

by Eric Anholt

[permalink] [raw]
Subject: Re: DRM lock ordering fix series

On Sat, 2009-03-28 at 02:29 +0100, Peter Zijlstra wrote:
> On Sat, 2009-03-28 at 01:58 +0100, Peter Zijlstra wrote:
>
> > > OK. I'm not too excited here -- 10% of 2% of the CPU time doesn't get
> > > me to the 10% loss that the slow path added up to. Most of the cost is
> > > in k{un,}map_atomic of the returned pages.
> >
> > Also note that doing large gup() with gup_fast() will be undesirable due
> > to it disabling IRQs. So iterating say several MB worth of pages will
> > hurt like crazy. Currently all gup_fast() users do a single page lookup.
>
> Also, what's this weird facination with 32bit, can you even buy a 32bit
> only cpu these days?

I work on OpenGL. Many people using OpenGL want to play commercial
games. Commercial games are 32-bit. sysprof doesn't work for 32-on-64,
so I'd lose a critical tool. Thus, 32-only.

keithp runs 32-on-64, and just about every day we're working together,
we lament that he can't run sysprof on his box. Getting ~10% of my CPU
back by going 32-on-64 would be nice, but it's not worth not being able
to usefully profile.

--
Eric Anholt
[email protected] [email protected]




2009-03-30 10:34:58

by Florian Mickler

[permalink] [raw]
Subject: Re: [PATCH 6/6] drm/i915: Fix lock order reversal in GEM relocation entry copying. -- makes X hang

Hi!

On Wed, 25 Mar 2009 14:45:10 -0700
Eric Anholt <[email protected]> wrote:

> Signed-off-by: Eric Anholt <[email protected]>
> Reviewed-by: Keith Packard <[email protected]>
> ---
> drivers/gpu/drm/i915/i915_gem.c | 187
> +++++++++++++++++++++++++++----------- 1 files changed, 133
> insertions(+), 54 deletions(-)
>

I tested Linus' git tree @ 5d80f8e5a (merge
net-2.6) and discovered that X hung after starting up gdm.

When I start gdm the screen is frozen and X hangs.
I was able to bisect it down to

40a5f0decdf050785ebd62b36ad48c869ee4b384 drm/i915: Fix lock order
reversal in GEM relocation entry copying.


when hung /proc/[xpid]/stack did contain:

[<ffffffff8024c13e>] msleep_interruptible+0x2e/0x40
[<ffffffff80557fdd>] i915_wait_ring+0x17d/0x1d0
[<ffffffff8056280d>] i915_gem_execbuffer+0xd2d/0xf70
[<ffffffff80546555>] drm_ioctl+0x1f5/0x320
[<ffffffff802d8ce5>] vfs_ioctl+0x85/0xa0
[<ffffffff802d8f0b>] do_vfs_ioctl+0x20b/0x510
[<ffffffff802d9297>] sys_ioctl+0x87/0xa0
[<ffffffff8020ba8b>] system_call_fastpath+0x16/0x1b
[<ffffffffffffffff>] 0xffffffffffffffff


I have now reverted that commit on top of 5d80f8e5a and all is well.


My X stack is from git from around March 20th. I will update it now,
but since the hang is in a kernel syscall, it shouldn't matter?

Sincerely,

Florian

p.s. no kms



2009-03-31 19:37:20

by Eric Anholt

[permalink] [raw]
Subject: Re: [PATCH 6/6] drm/i915: Fix lock order reversal in GEM relocation entry copying. -- makes X hang

On Mon, 2009-03-30 at 12:00 +0200, Florian Mickler wrote:
> Hi!
>
> On Wed, 25 Mar 2009 14:45:10 -0700
> Eric Anholt <[email protected]> wrote:
>
> > Signed-off-by: Eric Anholt <[email protected]>
> > Reviewed-by: Keith Packard <[email protected]>
> > ---
> > drivers/gpu/drm/i915/i915_gem.c | 187
> > +++++++++++++++++++++++++++----------- 1 files changed, 133
> > insertions(+), 54 deletions(-)
> >
>
> I testet Linus' Git tree @ 5d80f8e5a (merge
> net-2.6) and discovered that X hung after starting up gdm.
>
> When i start gdm the screen is frozen and X hangs.
> I was able to bisect it down to
>
> 40a5f0decdf050785ebd62b36ad48c869ee4b384 drm/i915: Fix lock order
> reversal in GEM relocation entry copying.
>
>
> when hung /proc/[xpid]/stack did contain:
>
> [<ffffffff8024c13e>] msleep_interruptible+0x2e/0x40
> [<ffffffff80557fdd>] i915_wait_ring+0x17d/0x1d0
> [<ffffffff8056280d>] i915_gem_execbuffer+0xd2d/0xf70
> [<ffffffff80546555>] drm_ioctl+0x1f5/0x320
> [<ffffffff802d8ce5>] vfs_ioctl+0x85/0xa0
> [<ffffffff802d8f0b>] do_vfs_ioctl+0x20b/0x510
> [<ffffffff802d9297>] sys_ioctl+0x87/0xa0
> [<ffffffff8020ba8b>] system_call_fastpath+0x16/0x1b
> [<ffffffffffffffff>] 0xffffffffffffffff
>
>
> I reverted that commit now on top of 5d80f8e5a and all 's well .
>
>
> My X-Stack is from Git at around 20th march. I will update that now,
> but as it hangs in a kernel syscall, it shouldn't matter?
>
> Sincerely,
>
> Florian
>
> p.s. no kms

Ouch. Could you file a bug report using:

http://intellinuxgraphics.org/how_to_report_bug.html

so I've got the information I need to try to reproduce this?

--
Eric Anholt
[email protected] [email protected]




2009-04-01 00:12:46

by Florian Mickler

[permalink] [raw]
Subject: Re: [PATCH 6/6] drm/i915: Fix lock order reversal in GEM relocation entry copying. -- makes X hang

On Tue, 31 Mar 2009 12:36:56 -0700
Eric Anholt <[email protected]> wrote:

>
> Ouch. Could you file a bug report using:
>
> http://intellinuxgraphics.org/how_to_report_bug.html
>
> so I've got the information I need to try to reproduce this?
>

Sure, done.

http://bugs.freedesktop.org/show_bug.cgi?id=20985

thx,

Florian

p.s. I hope I didn't miss some vital piece of information?



2009-04-02 11:20:29

by Nick Piggin

[permalink] [raw]
Subject: Re: [RFC] x86: gup_fast() batch limit (was: DRM lock ordering fix series)

On Saturday 28 March 2009 23:46:14 Peter Zijlstra wrote:
> On Sat, 2009-03-28 at 13:22 +0100, Peter Zijlstra wrote:
> > I'm not really trusting my brain today, but something like the below
> > should work I think.
> >
> > Nick, any thoughts?
> >
> > Not-Signed-off-by: Peter Zijlstra <[email protected]>
> > ---
> > arch/x86/mm/gup.c | 24 +++++++++++++++++++++---
> > 1 files changed, 21 insertions(+), 3 deletions(-)
> >
> > diff --git a/arch/x86/mm/gup.c b/arch/x86/mm/gup.c
> > index be54176..4ded5c3 100644
> > --- a/arch/x86/mm/gup.c
> > +++ b/arch/x86/mm/gup.c
> > @@ -11,6 +11,8 @@
> >
> > #include <asm/pgtable.h>
> >
> > +#define GUP_BATCH 32
> > +
> > static inline pte_t gup_get_pte(pte_t *ptep)
> > {
> > #ifndef CONFIG_X86_PAE
> > @@ -91,7 +93,8 @@ static noinline int gup_pte_range(pmd_t pmd, unsigned
> > long addr, get_page(page);
> > pages[*nr] = page;
> > (*nr)++;
> > -
> > + if (*nr > GUP_BATCH)
> > + break;
> > } while (ptep++, addr += PAGE_SIZE, addr != end);
> > pte_unmap(ptep - 1);
> >
> > @@ -157,6 +160,8 @@ static int gup_pmd_range(pud_t pud, unsigned long
> > addr, unsigned long end, if (!gup_pte_range(pmd, addr, next, write,
> > pages, nr))
> > return 0;
> > }
> > + if (*nr > GUP_BATCH)
> > + break;
> > } while (pmdp++, addr = next, addr != end);
> >
> > return 1;
> > @@ -214,6 +219,8 @@ static int gup_pud_range(pgd_t pgd, unsigned long
> > addr, unsigned long end, if (!gup_pmd_range(pud, addr, next, write,
> > pages, nr))
> > return 0;
> > }
> > + if (*nr > GUP_BATCH)
> > + break;
> > } while (pudp++, addr = next, addr != end);
> >
> > return 1;
> > @@ -226,7 +233,7 @@ int get_user_pages_fast(unsigned long start, int
> > nr_pages, int write, unsigned long addr, len, end;
> > unsigned long next;
> > pgd_t *pgdp;
> > - int nr = 0;
> > + int batch = 0, nr = 0;
> >
> > start &= PAGE_MASK;
> > addr = start;
> > @@ -254,6 +261,7 @@ int get_user_pages_fast(unsigned long start, int
> > nr_pages, int write, * (which we do on x86, with the above PAE
> > exception), we can follow the * address down to the the page and take a
> > ref on it.
> > */
> > +again:
> > local_irq_disable();
> > pgdp = pgd_offset(mm, addr);
> > do {
> > @@ -262,11 +270,21 @@ int get_user_pages_fast(unsigned long start, int
> > nr_pages, int write, next = pgd_addr_end(addr, end);
> > if (pgd_none(pgd))
> > goto slow;
> > - if (!gup_pud_range(pgd, addr, next, write, pages, &nr))
> > + if (!gup_pud_range(pgd, addr, next, write, pages, &batch))
> > goto slow;
> > + if (batch > GUP_BATCH) {
> > + local_irq_enable();
> > + addr += batch << PAGE_SHIFT;
> > + nr += batch;
> > + batch = 0;
> > + if (addr != end)
> > + goto again;
> > + }
> > } while (pgdp++, addr = next, addr != end);
> > local_irq_enable();
> >
> > + nr += batch;
> > +
> > VM_BUG_ON(nr != (end - start) >> PAGE_SHIFT);
> > return nr;
>
> Would also need the following bit:
>
> @@ -274,6 +292,7 @@ int get_user_pages_fast(unsigned long start, int
> nr_pages, int write, int ret;
>
> slow:
> + nr += batch;
> local_irq_enable();
> slow_irqon:
> /* Try to get the remaining pages with get_user_pages */


Yeah something like this would be fine (and welcome). And we can
remove the XXX comment in there too. I would suggest 64 being a
reasonable value simply because that's what direct IO does.

Implementation-wise, why not just break "len" into chunks in the
top level function rather than add branches all down the call
chain?
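
That would look roughly like this at the top level (a sketch against the
quoted gup.c, not a tested patch; gup_chunk() is a hypothetical stand-in
for the existing pgd/pud/pmd walk over a sub-range):

#define GUP_BATCH       64      /* matches what direct IO does */

/* Sketch: chunk the range in the top-level function so IRQs are
 * re-enabled every GUP_BATCH pages, instead of threading a batch
 * counter through gup_pud/pmd/pte_range().
 */
static int gup_fast_chunked(unsigned long start, int nr_pages, int write,
                            struct page **pages)
{
        unsigned long addr = start & PAGE_MASK;
        unsigned long end = addr + ((unsigned long)nr_pages << PAGE_SHIFT);
        int nr = 0;

        while (addr != end) {
                unsigned long chunk = min(end - addr,
                                (unsigned long)GUP_BATCH << PAGE_SHIFT);

                local_irq_disable();
                if (!gup_chunk(addr, addr + chunk, write, pages, &nr)) {
                        local_irq_enable();
                        return nr;      /* caller falls back to get_user_pages() */
                }
                local_irq_enable();     /* brief window between chunks */

                addr += chunk;
        }

        return nr;
}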

2009-06-24 13:46:19

by Brice Goglin

[permalink] [raw]
Subject: Re: [RFC] x86: gup_fast() batch limit

Any news about this patch?

Brice



Nick Piggin wrote:
> [...]
>
> Yeah something like this would be fine (and welcome). And we can
> remove the XXX comment in there too. I would suggest 64 being a
> reasonable value simply because that's what direct IO does.
>
> Implementation-wise, why not just break "len" into chunks in the
> top level function rather than add branches all down the call
> chain?
>

2009-06-24 17:12:43

by Peter Zijlstra

[permalink] [raw]
Subject: Re: [RFC] x86: gup_fast() batch limit

On Wed, 2009-06-24 at 15:46 +0200, Brice Goglin wrote:
> Any news about this patch?

I knew I was forgetting something, I'll try and find a hole to
resurrect/finish this ;-)

2009-06-24 19:55:35

by Peter Zijlstra

[permalink] [raw]
Subject: Re: [RFC] x86: gup_fast() batch limit

On Wed, 2009-06-24 at 15:46 +0200, Brice Goglin wrote:
> Any news about this patch?

Compile tested on x86_64 and ppc64.
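
The patch follows Nick's suggestion and chunks at the top level: each IRQs-off walk covers at most CHUNK_SIZE bytes, then interrupts are re-enabled and the walk resumes at the again: label. For orientation, a condensed sketch of the resulting x86 fast path (editorial reconstruction, not the literal post-patch file; the access_ok()/PAE checks and the slow-path fallback are elided):

#define CHUNK_SIZE (64 * PAGE_SIZE)     /* 64 pages per IRQs-off section */

/* condensed sketch of the patched x86 get_user_pages_fast() fast path */
int get_user_pages_fast(unsigned long start, int nr_pages, int write,
                        struct page **pages)
{
        struct mm_struct *mm = current->mm;
        unsigned long addr, end, chunk, next;
        pgd_t *pgdp;
        int nr = 0;

        addr = start & PAGE_MASK;
        end = addr + ((unsigned long)nr_pages << PAGE_SHIFT);

again:
        chunk = min(addr + CHUNK_SIZE, end);    /* never walk more than one chunk */

        local_irq_disable();
        pgdp = pgd_offset(mm, addr);
        do {
                pgd_t pgd = *pgdp;

                next = pgd_addr_end(addr, chunk);
                if (pgd_none(pgd))
                        goto slow;
                if (!gup_pud_range(pgd, addr, next, write, pages, &nr))
                        goto slow;
        } while (pgdp++, addr = next, addr != chunk);
        local_irq_enable();

        if (addr != end)        /* interrupts were briefly on; do the next chunk */
                goto again;

        VM_BUG_ON(nr != (end - start) >> PAGE_SHIFT);
        return nr;

slow:
        local_irq_enable();
        /* fall back to get_user_pages() for the remaining range (elided here) */
        return nr;
}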

---
Implement the batching mentioned in the gup_fast comment.

Signed-off-by: Peter Zijlstra <[email protected]>
---
arch/powerpc/mm/gup.c | 28 +++++++++++++---------------
arch/x86/mm/gup.c | 46 ++++++++++++++++++++--------------------------
2 files changed, 33 insertions(+), 41 deletions(-)

diff --git a/arch/powerpc/mm/gup.c b/arch/powerpc/mm/gup.c
index bc400c7..cf535bf 100644
--- a/arch/powerpc/mm/gup.c
+++ b/arch/powerpc/mm/gup.c
@@ -146,11 +146,13 @@ static int gup_pud_range(pgd_t pgd, unsigned long addr, unsigned long end,
return 1;
}

+#define CHUNK_SIZE (64 * PAGE_SIZE)
+
int get_user_pages_fast(unsigned long start, int nr_pages, int write,
struct page **pages)
{
struct mm_struct *mm = current->mm;
- unsigned long addr, len, end;
+ unsigned long addr, len, end, chunk;
unsigned long next;
pgd_t *pgdp;
int nr = 0;
@@ -191,16 +193,9 @@ int get_user_pages_fast(unsigned long start, int nr_pages, int write,
}
#endif /* CONFIG_HUGETLB_PAGE */

- /*
- * XXX: batch / limit 'nr', to avoid large irq off latency
- * needs some instrumenting to determine the common sizes used by
- * important workloads (eg. DB2), and whether limiting the batch size
- * will decrease performance.
- *
- * It seems like we're in the clear for the moment. Direct-IO is
- * the main guy that batches up lots of get_user_pages, and even
- * they are limited to 64-at-a-time which is not so many.
- */
+again:
+ chunk = min(addr + CHUNK_SIZE, end);
+
/*
* This doesn't prevent pagetable teardown, but does prevent
* the pagetables from being freed on powerpc.
@@ -235,10 +230,10 @@ int get_user_pages_fast(unsigned long start, int nr_pages, int write,
VM_BUG_ON(shift != mmu_psize_defs[get_slice_psize(mm, a)].shift);
ptep = huge_pte_offset(mm, a);
pr_debug(" %016lx: huge ptep %p\n", a, ptep);
- if (!ptep || !gup_huge_pte(ptep, hstate, &a, end, write, pages,
+ if (!ptep || !gup_huge_pte(ptep, hstate, &a, chunk, write, pages,
&nr))
goto slow;
- } while (a != end);
+ } while (a != chunk);
} else
#endif /* CONFIG_HUGETLB_PAGE */
{
@@ -251,15 +246,18 @@ int get_user_pages_fast(unsigned long start, int nr_pages, int write,
#endif
pr_debug(" %016lx: normal pgd %p\n", addr,
(void *)pgd_val(pgd));
- next = pgd_addr_end(addr, end);
+ next = pgd_addr_end(addr, chunk);
if (pgd_none(pgd))
goto slow;
if (!gup_pud_range(pgd, addr, next, write, pages, &nr))
goto slow;
- } while (pgdp++, addr = next, addr != end);
+ } while (pgdp++, addr = next, addr != chunk);
}
local_irq_enable();

+ if (addr != end)
+ goto again;
+
VM_BUG_ON(nr != (end - start) >> PAGE_SHIFT);
return nr;

diff --git a/arch/x86/mm/gup.c b/arch/x86/mm/gup.c
index 71da1bc..9e0552b 100644
--- a/arch/x86/mm/gup.c
+++ b/arch/x86/mm/gup.c
@@ -219,6 +219,8 @@ static int gup_pud_range(pgd_t pgd, unsigned long addr, unsigned long end,
return 1;
}

+#define CHUNK_SIZE (64 * PAGE_SIZE)
+
/*
* Like get_user_pages_fast() except its IRQ-safe in that it won't fall
* back to the regular GUP.
@@ -227,7 +229,7 @@ int __get_user_pages_fast(unsigned long start, int nr_pages, int write,
struct page **pages)
{
struct mm_struct *mm = current->mm;
- unsigned long addr, len, end;
+ unsigned long addr, len, end, chunk;
unsigned long next;
unsigned long flags;
pgd_t *pgdp;
@@ -241,16 +243,9 @@ int __get_user_pages_fast(unsigned long start, int nr_pages, int write,
(void __user *)start, len)))
return 0;

- /*
- * XXX: batch / limit 'nr', to avoid large irq off latency
- * needs some instrumenting to determine the common sizes used by
- * important workloads (eg. DB2), and whether limiting the batch size
- * will decrease performance.
- *
- * It seems like we're in the clear for the moment. Direct-IO is
- * the main guy that batches up lots of get_user_pages, and even
- * they are limited to 64-at-a-time which is not so many.
- */
+again:
+ chunk = min(addr + CHUNK_SIZE, end);
+
/*
* This doesn't prevent pagetable teardown, but does prevent
* the pagetables and pages from being freed on x86.
@@ -264,14 +259,17 @@ int __get_user_pages_fast(unsigned long start, int nr_pages, int write,
do {
pgd_t pgd = *pgdp;

- next = pgd_addr_end(addr, end);
+ next = pgd_addr_end(addr, chunk);
if (pgd_none(pgd))
break;
if (!gup_pud_range(pgd, addr, next, write, pages, &nr))
break;
- } while (pgdp++, addr = next, addr != end);
+ } while (pgdp++, addr = next, addr != chunk);
local_irq_restore(flags);

+ if (addr != end)
+ goto again;
+
return nr;
}

@@ -295,7 +293,7 @@ int get_user_pages_fast(unsigned long start, int nr_pages, int write,
struct page **pages)
{
struct mm_struct *mm = current->mm;
- unsigned long addr, len, end;
+ unsigned long addr, len, end, chunk;
unsigned long next;
pgd_t *pgdp;
int nr = 0;
@@ -313,16 +311,9 @@ int get_user_pages_fast(unsigned long start, int nr_pages, int write,
goto slow_irqon;
#endif

- /*
- * XXX: batch / limit 'nr', to avoid large irq off latency
- * needs some instrumenting to determine the common sizes used by
- * important workloads (eg. DB2), and whether limiting the batch size
- * will decrease performance.
- *
- * It seems like we're in the clear for the moment. Direct-IO is
- * the main guy that batches up lots of get_user_pages, and even
- * they are limited to 64-at-a-time which is not so many.
- */
+again:
+ chunk = min(addr + CHUNK_SIZE, end);
+
/*
* This doesn't prevent pagetable teardown, but does prevent
* the pagetables and pages from being freed on x86.
@@ -336,14 +327,17 @@ int get_user_pages_fast(unsigned long start, int nr_pages, int write,
do {
pgd_t pgd = *pgdp;

- next = pgd_addr_end(addr, end);
+ next = pgd_addr_end(addr, chunk);
if (pgd_none(pgd))
goto slow;
if (!gup_pud_range(pgd, addr, next, write, pages, &nr))
goto slow;
- } while (pgdp++, addr = next, addr != end);
+ } while (pgdp++, addr = next, addr != chunk);
local_irq_enable();

+ if (addr != end)
+ goto again;
+
VM_BUG_ON(nr != (end - start) >> PAGE_SHIFT);
return nr;