The first 5 patches are almost trivial clean-up patches.
The others fix three bugs.
These problems are probably minor, because this code has been in use
for a long time and there have been no bug reports about them.
These patches are based on v3.10.0 and
pass the sanity check of libhugetlbfs.
Thanks.
Joonsoo Kim (9):
mm, hugetlb: move up the code which check availability of free huge
page
mm, hugetlb: trivial commenting fix
mm, hugetlb: clean-up alloc_huge_page()
mm, hugetlb: fix and clean-up node iteration code to alloc or free
mm, hugetlb: remove redundant list_empty check in
gather_surplus_pages()
mm, hugetlb: do not use a page in page cache for cow optimization
mm, hugetlb: add VM_NORESERVE check in vma_has_reserves()
mm, hugetlb: remove decrement_hugepage_resv_vma()
mm, hugetlb: decrement reserve count if VM_NORESERVE alloc page cache
mm/hugetlb.c | 256 +++++++++++++++++++++++++++-------------------------------
1 file changed, 117 insertions(+), 139 deletions(-)
--
1.7.9.5
Currently, we use a page cache page with map count 1 for the cow
optimization: if we find this condition, we don't allocate a new
page and copy the contents, but instead map this page directly.
This can cause a write to a private mapping to overwrite the
hugetlb file directly. You can reproduce this situation with the
following code.
size = 20 * MB;
flag = MAP_SHARED;
p = mmap(NULL, size, PROT_READ|PROT_WRITE, flag, fd, 0);
if (p == MAP_FAILED) {
fprintf(stderr, "mmap() failed: %s\n", strerror(errno));
return -1;
}
p[0] = 's';
fprintf(stdout, "BEFORE STEAL PRIVATE WRITE: %c\n", p[0]);
munmap(p, size);
flag = MAP_PRIVATE;
p = mmap(NULL, size, PROT_READ|PROT_WRITE, flag, fd, 0);
if (p == MAP_FAILED) {
fprintf(stderr, "mmap() failed: %s\n", strerror(errno));
}
p[0] = 'c';
munmap(p, size);
flag = MAP_SHARED;
p = mmap(NULL, size, PROT_READ|PROT_WRITE, flag, fd, 0);
if (p == MAP_FAILED) {
fprintf(stderr, "mmap() failed: %s\n", strerror(errno));
return -1;
}
fprintf(stdout, "AFTER STEAL PRIVATE WRITE: %c\n", p[0]);
munmap(p, size);
We can see "AFTER STEAL PRIVATE WRITE: c", not "AFTER STEAL
PRIVATE WRITE: s". If we turn off this optimization for a page
in the page cache, the problem disappears.
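For reference, a complete, compilable version of the reproducer could
look like the sketch below. It assumes libhugetlbfs is installed
(hugetlbfs_unlinked_fd() comes from <hugetlbfs.h>, as in the other
examples in this series) and that a hugetlbfs mount is available; MB is
defined in the program since the snippet above leaves it out.

/*
 * Complete, compilable sketch of the reproducer above.  Assumptions:
 * libhugetlbfs and a mounted hugetlbfs.
 * Build e.g. with: gcc cow-steal.c -lhugetlbfs
 */
#include <stdio.h>
#include <string.h>
#include <errno.h>
#include <sys/mman.h>
#include <hugetlbfs.h>

#define MB (1024UL * 1024UL)

int main(void)
{
	char *p;
	size_t size = 20 * MB;
	int fd = hugetlbfs_unlinked_fd();

	if (fd < 0)
		return 1;

	/* Write 's' to the file through a shared mapping. */
	p = mmap(NULL, size, PROT_READ|PROT_WRITE, MAP_SHARED, fd, 0);
	if (p == MAP_FAILED) {
		fprintf(stderr, "mmap() failed: %s\n", strerror(errno));
		return -1;
	}
	p[0] = 's';
	fprintf(stdout, "BEFORE STEAL PRIVATE WRITE: %c\n", p[0]);
	munmap(p, size);

	/* A private write must not reach the file... */
	p = mmap(NULL, size, PROT_READ|PROT_WRITE, MAP_PRIVATE, fd, 0);
	if (p == MAP_FAILED) {
		fprintf(stderr, "mmap() failed: %s\n", strerror(errno));
		return -1;
	}
	p[0] = 'c';
	munmap(p, size);

	/* ...so re-reading through a shared mapping should print 's'. */
	p = mmap(NULL, size, PROT_READ|PROT_WRITE, MAP_SHARED, fd, 0);
	if (p == MAP_FAILED) {
		fprintf(stderr, "mmap() failed: %s\n", strerror(errno));
		return -1;
	}
	fprintf(stdout, "AFTER STEAL PRIVATE WRITE: %c\n", p[0]);
	munmap(p, size);
	return 0;
}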
Signed-off-by: Joonsoo Kim <[email protected]>
diff --git a/mm/hugetlb.c b/mm/hugetlb.c
index d4a1695..6c1eb9b 100644
--- a/mm/hugetlb.c
+++ b/mm/hugetlb.c
@@ -2512,7 +2512,6 @@ static int hugetlb_cow(struct mm_struct *mm, struct vm_area_struct *vma,
{
struct hstate *h = hstate_vma(vma);
struct page *old_page, *new_page;
- int avoidcopy;
int outside_reserve = 0;
unsigned long mmun_start; /* For mmu_notifiers */
unsigned long mmun_end; /* For mmu_notifiers */
@@ -2522,10 +2521,8 @@ static int hugetlb_cow(struct mm_struct *mm, struct vm_area_struct *vma,
retry_avoidcopy:
/* If no-one else is actually using this page, avoid the copy
* and just make the page writable */
- avoidcopy = (page_mapcount(old_page) == 1);
- if (avoidcopy) {
- if (PageAnon(old_page))
- page_move_anon_rmap(old_page, vma, address);
+ if (page_mapcount(old_page) == 1 && PageAnon(old_page)) {
+ page_move_anon_rmap(old_page, vma, address);
set_huge_ptep_writable(vma, address, ptep);
return 0;
}
--
1.7.9.5
We can unify the code paths for a successful allocation.
This makes the code more readable.
There is no functional difference.
Signed-off-by: Joonsoo Kim <[email protected]>
diff --git a/mm/hugetlb.c b/mm/hugetlb.c
index d21a33a..0067cf4 100644
--- a/mm/hugetlb.c
+++ b/mm/hugetlb.c
@@ -1144,29 +1144,25 @@ static struct page *alloc_huge_page(struct vm_area_struct *vma,
hugepage_subpool_put_pages(spool, chg);
return ERR_PTR(-ENOSPC);
}
+
spin_lock(&hugetlb_lock);
page = dequeue_huge_page_vma(h, vma, addr, avoid_reserve);
- if (page) {
- /* update page cgroup details */
- hugetlb_cgroup_commit_charge(idx, pages_per_huge_page(h),
- h_cg, page);
- spin_unlock(&hugetlb_lock);
- } else {
+ if (!page) {
spin_unlock(&hugetlb_lock);
page = alloc_buddy_huge_page(h, NUMA_NO_NODE);
if (!page) {
hugetlb_cgroup_uncharge_cgroup(idx,
- pages_per_huge_page(h),
- h_cg);
+ pages_per_huge_page(h), h_cg);
hugepage_subpool_put_pages(spool, chg);
return ERR_PTR(-ENOSPC);
}
+
spin_lock(&hugetlb_lock);
- hugetlb_cgroup_commit_charge(idx, pages_per_huge_page(h),
- h_cg, page);
list_move(&page->lru, &h->hugepage_activelist);
- spin_unlock(&hugetlb_lock);
+ /* Fall through */
}
+ hugetlb_cgroup_commit_charge(idx, pages_per_huge_page(h), h_cg, page);
+ spin_unlock(&hugetlb_lock);
set_page_private(page, (unsigned long)spool);
--
1.7.9.5
The current node iteration code has a minor problem: it does one extra
node rotation if the allocation doesn't succeed. For example,
if we start allocating at node 0, we stop iterating at node 0,
but the next allocation then starts at node 1.
I introduce new macros "for_each_node_mask_to_[alloc|free]" and
fix and clean up the node iteration code used for alloc and free.
This makes the code easier to understand.
Signed-off-by: Joonsoo Kim <[email protected]>
diff --git a/mm/hugetlb.c b/mm/hugetlb.c
index 0067cf4..a838e6b 100644
--- a/mm/hugetlb.c
+++ b/mm/hugetlb.c
@@ -752,33 +752,6 @@ static int hstate_next_node_to_alloc(struct hstate *h,
return nid;
}
-static int alloc_fresh_huge_page(struct hstate *h, nodemask_t *nodes_allowed)
-{
- struct page *page;
- int start_nid;
- int next_nid;
- int ret = 0;
-
- start_nid = hstate_next_node_to_alloc(h, nodes_allowed);
- next_nid = start_nid;
-
- do {
- page = alloc_fresh_huge_page_node(h, next_nid);
- if (page) {
- ret = 1;
- break;
- }
- next_nid = hstate_next_node_to_alloc(h, nodes_allowed);
- } while (next_nid != start_nid);
-
- if (ret)
- count_vm_event(HTLB_BUDDY_PGALLOC);
- else
- count_vm_event(HTLB_BUDDY_PGALLOC_FAIL);
-
- return ret;
-}
-
/*
* helper for free_pool_huge_page() - return the previously saved
* node ["this node"] from which to free a huge page. Advance the
@@ -797,6 +770,42 @@ static int hstate_next_node_to_free(struct hstate *h, nodemask_t *nodes_allowed)
return nid;
}
+#define for_each_node_mask_to_alloc(hs, nr_nodes, node, mask) \
+ for (nr_nodes = nodes_weight(*mask), \
+ node = hstate_next_node_to_alloc(hs, mask); \
+ nr_nodes > 0 && \
+ ((node = hstate_next_node_to_alloc(hs, mask)) || 1); \
+ nr_nodes--)
+
+#define for_each_node_mask_to_free(hs, nr_nodes, node, mask) \
+ for (nr_nodes = nodes_weight(*mask), \
+ node = hstate_next_node_to_free(hs, mask); \
+ nr_nodes > 0 && \
+ ((node = hstate_next_node_to_free(hs, mask)) || 1); \
+ nr_nodes--)
+
+static int alloc_fresh_huge_page(struct hstate *h, nodemask_t *nodes_allowed)
+{
+ struct page *page;
+ int nr_nodes, node;
+ int ret = 0;
+
+ for_each_node_mask_to_alloc(h, nr_nodes, node, nodes_allowed) {
+ page = alloc_fresh_huge_page_node(h, node);
+ if (page) {
+ ret = 1;
+ break;
+ }
+ }
+
+ if (ret)
+ count_vm_event(HTLB_BUDDY_PGALLOC);
+ else
+ count_vm_event(HTLB_BUDDY_PGALLOC_FAIL);
+
+ return ret;
+}
+
/*
* Free huge page from pool from next node to free.
* Attempt to keep persistent huge pages more or less
@@ -806,36 +815,31 @@ static int hstate_next_node_to_free(struct hstate *h, nodemask_t *nodes_allowed)
static int free_pool_huge_page(struct hstate *h, nodemask_t *nodes_allowed,
bool acct_surplus)
{
- int start_nid;
- int next_nid;
+ int nr_nodes, node;
int ret = 0;
- start_nid = hstate_next_node_to_free(h, nodes_allowed);
- next_nid = start_nid;
-
- do {
+ for_each_node_mask_to_free(h, nr_nodes, node, nodes_allowed) {
/*
* If we're returning unused surplus pages, only examine
* nodes with surplus pages.
*/
- if ((!acct_surplus || h->surplus_huge_pages_node[next_nid]) &&
- !list_empty(&h->hugepage_freelists[next_nid])) {
+ if ((!acct_surplus || h->surplus_huge_pages_node[node]) &&
+ !list_empty(&h->hugepage_freelists[node])) {
struct page *page =
- list_entry(h->hugepage_freelists[next_nid].next,
+ list_entry(h->hugepage_freelists[node].next,
struct page, lru);
list_del(&page->lru);
h->free_huge_pages--;
- h->free_huge_pages_node[next_nid]--;
+ h->free_huge_pages_node[node]--;
if (acct_surplus) {
h->surplus_huge_pages--;
- h->surplus_huge_pages_node[next_nid]--;
+ h->surplus_huge_pages_node[node]--;
}
update_and_free_page(h, page);
ret = 1;
break;
}
- next_nid = hstate_next_node_to_free(h, nodes_allowed);
- } while (next_nid != start_nid);
+ }
return ret;
}
@@ -1173,14 +1177,12 @@ static struct page *alloc_huge_page(struct vm_area_struct *vma,
int __weak alloc_bootmem_huge_page(struct hstate *h)
{
struct huge_bootmem_page *m;
- int nr_nodes = nodes_weight(node_states[N_MEMORY]);
+ int nr_nodes, node;
- while (nr_nodes) {
+ for_each_node_mask_to_alloc(h, nr_nodes, node, &node_states[N_MEMORY]) {
void *addr;
- addr = __alloc_bootmem_node_nopanic(
- NODE_DATA(hstate_next_node_to_alloc(h,
- &node_states[N_MEMORY])),
+ addr = __alloc_bootmem_node_nopanic(NODE_DATA(node),
huge_page_size(h), huge_page_size(h), 0);
if (addr) {
@@ -1192,7 +1194,6 @@ int __weak alloc_bootmem_huge_page(struct hstate *h)
m = addr;
goto found;
}
- nr_nodes--;
}
return 0;
@@ -1331,48 +1332,28 @@ static inline void try_to_free_low(struct hstate *h, unsigned long count,
static int adjust_pool_surplus(struct hstate *h, nodemask_t *nodes_allowed,
int delta)
{
- int start_nid, next_nid;
- int ret = 0;
+ int nr_nodes, node;
VM_BUG_ON(delta != -1 && delta != 1);
- if (delta < 0)
- start_nid = hstate_next_node_to_alloc(h, nodes_allowed);
- else
- start_nid = hstate_next_node_to_free(h, nodes_allowed);
- next_nid = start_nid;
-
- do {
- int nid = next_nid;
- if (delta < 0) {
- /*
- * To shrink on this node, there must be a surplus page
- */
- if (!h->surplus_huge_pages_node[nid]) {
- next_nid = hstate_next_node_to_alloc(h,
- nodes_allowed);
- continue;
- }
+ if (delta < 0) {
+ for_each_node_mask_to_alloc(h, nr_nodes, node, nodes_allowed) {
+ if (h->surplus_huge_pages_node[node])
+ goto found;
}
- if (delta > 0) {
- /*
- * Surplus cannot exceed the total number of pages
- */
- if (h->surplus_huge_pages_node[nid] >=
- h->nr_huge_pages_node[nid]) {
- next_nid = hstate_next_node_to_free(h,
- nodes_allowed);
- continue;
- }
+ } else {
+ for_each_node_mask_to_free(h, nr_nodes, node, nodes_allowed) {
+ if (h->surplus_huge_pages_node[node] <
+ h->nr_huge_pages_node[node])
+ goto found;
}
+ }
+ return 0;
- h->surplus_huge_pages += delta;
- h->surplus_huge_pages_node[nid] += delta;
- ret = 1;
- break;
- } while (next_nid != start_nid);
-
- return ret;
+found:
+ h->surplus_huge_pages += delta;
+ h->surplus_huge_pages_node[node] += delta;
+ return 1;
}
#define persistent_huge_pages(h) (h->nr_huge_pages - h->surplus_huge_pages)
--
1.7.9.5
If the list is empty, list_for_each_entry_safe() doesn't do anything,
so this check is redundant. Remove it.
Signed-off-by: Joonsoo Kim <[email protected]>
diff --git a/mm/hugetlb.c b/mm/hugetlb.c
index a838e6b..d4a1695 100644
--- a/mm/hugetlb.c
+++ b/mm/hugetlb.c
@@ -1019,10 +1019,8 @@ free:
spin_unlock(&hugetlb_lock);
/* Free unnecessary surplus pages to the buddy allocator */
- if (!list_empty(&surplus_list)) {
- list_for_each_entry_safe(page, tmp, &surplus_list, lru) {
- put_page(page);
- }
+ list_for_each_entry_safe(page, tmp, &surplus_list, lru) {
+ put_page(page);
}
spin_lock(&hugetlb_lock);
--
1.7.9.5
If a vma with VM_NORESERVE allocates a new page for the page cache, we
should check whether this area is reserved or not. If this address is
already reserved by another process (the chg == 0 case), we should
decrement the reserve count, because this allocated page will go into
the page cache and currently there is no way to know, when releasing the
inode, whether the page came from the reserved pool or not. This can
lead to an over-counted reserve count. With the following example code,
you can easily reproduce this situation.
size = 20 * MB;
flag = MAP_SHARED;
p = mmap(NULL, size, PROT_READ|PROT_WRITE, flag, fd, 0);
if (p == MAP_FAILED) {
fprintf(stderr, "mmap() failed: %s\n", strerror(errno));
return -1;
}
flag = MAP_SHARED | MAP_NORESERVE;
q = mmap(NULL, size, PROT_READ|PROT_WRITE, flag, fd, 0);
if (q == MAP_FAILED) {
fprintf(stderr, "mmap() failed: %s\n", strerror(errno));
}
q[0] = 'c';
This patch solves the problem.
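For reference, a complete, compilable sketch of the reproducer is below.
It assumes libhugetlbfs (hugetlbfs_unlinked_fd() from <hugetlbfs.h>, as
in the other examples in this series) and a mounted hugetlbfs, and it
defines MB itself. Without the fix, the over-counted reserve should
still show up as HugePages_Rsvd in /proc/meminfo after the unlinked
file is released.

/*
 * Complete, compilable sketch of the reproducer above.  Assumptions:
 * libhugetlbfs and a mounted hugetlbfs.
 * Build e.g. with: gcc resv-leak.c -lhugetlbfs
 */
#include <stdio.h>
#include <string.h>
#include <errno.h>
#include <sys/mman.h>
#include <hugetlbfs.h>

#define MB (1024UL * 1024UL)

int main(void)
{
	char *p, *q;
	size_t size = 20 * MB;
	int fd = hugetlbfs_unlinked_fd();

	if (fd < 0)
		return 1;

	/* The first mapping reserves the huge pages for this file. */
	p = mmap(NULL, size, PROT_READ|PROT_WRITE, MAP_SHARED, fd, 0);
	if (p == MAP_FAILED) {
		fprintf(stderr, "mmap() failed: %s\n", strerror(errno));
		return -1;
	}

	/*
	 * Second, MAP_NORESERVE mapping of the same file.  The write faults
	 * a page into the page cache; without the fix the reserve count is
	 * not decremented for it.
	 */
	q = mmap(NULL, size, PROT_READ|PROT_WRITE,
		 MAP_SHARED | MAP_NORESERVE, fd, 0);
	if (q == MAP_FAILED) {
		fprintf(stderr, "mmap() failed: %s\n", strerror(errno));
		return -1;
	}
	q[0] = 'c';

	munmap(q, size);
	munmap(p, size);
	return 0;
}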
Signed-off-by: Joonsoo Kim <[email protected]>
diff --git a/mm/hugetlb.c b/mm/hugetlb.c
index ed2d0af..defb180 100644
--- a/mm/hugetlb.c
+++ b/mm/hugetlb.c
@@ -443,10 +443,23 @@ void reset_vma_resv_huge_pages(struct vm_area_struct *vma)
}
/* Returns true if the VMA has associated reserve pages */
-static int vma_has_reserves(struct vm_area_struct *vma)
+static int vma_has_reserves(struct vm_area_struct *vma, long chg)
{
- if (vma->vm_flags & VM_NORESERVE)
- return 0;
+ if (vma->vm_flags & VM_NORESERVE) {
+ /*
+ * This address is already reserved by other process(chg == 0),
+ * so, we should decreament reserved count. Without
+ * decreamenting, reserve count is remained after releasing
+ * inode, because this allocated page will go into page cache
+ * and is regarded as coming from reserved pool in releasing
+ * step. Currently, we don't have any other solution to deal
+ * with this situation properly, so add work-around here.
+ */
+ if (vma->vm_flags & VM_MAYSHARE && chg == 0)
+ return 1;
+ else
+ return 0;
+ }
/* Shared mappings always use reserves */
if (vma->vm_flags & VM_MAYSHARE)
@@ -520,7 +533,8 @@ static struct page *dequeue_huge_page_node(struct hstate *h, int nid)
static struct page *dequeue_huge_page_vma(struct hstate *h,
struct vm_area_struct *vma,
- unsigned long address, int avoid_reserve)
+ unsigned long address, int avoid_reserve,
+ long chg)
{
struct page *page = NULL;
struct mempolicy *mpol;
@@ -535,7 +549,7 @@ static struct page *dequeue_huge_page_vma(struct hstate *h,
* have no page reserves. This check ensures that reservations are
* not "stolen". The child may still get SIGKILLed
*/
- if (!vma_has_reserves(vma) &&
+ if (!vma_has_reserves(vma, chg) &&
h->free_huge_pages - h->resv_huge_pages == 0)
return NULL;
@@ -553,8 +567,12 @@ retry_cpuset:
if (cpuset_zone_allowed_softwall(zone, htlb_alloc_mask)) {
page = dequeue_huge_page_node(h, zone_to_nid(zone));
if (page) {
- if (!avoid_reserve && vma_has_reserves(vma))
- h->resv_huge_pages--;
+ if (avoid_reserve)
+ break;
+ if (!vma_has_reserves(vma, chg))
+ break;
+
+ h->resv_huge_pages--;
break;
}
}
@@ -1139,7 +1157,7 @@ static struct page *alloc_huge_page(struct vm_area_struct *vma,
}
spin_lock(&hugetlb_lock);
- page = dequeue_huge_page_vma(h, vma, addr, avoid_reserve);
+ page = dequeue_huge_page_vma(h, vma, addr, avoid_reserve, chg);
if (!page) {
spin_unlock(&hugetlb_lock);
page = alloc_buddy_huge_page(h, NUMA_NO_NODE);
--
1.7.9.5
Now the condition checked by decrement_hugepage_resv_vma() and
vma_has_reserves() is the same, so we can clean this up by using
vma_has_reserves(). Additionally, decrement_hugepage_resv_vma() has only
one call site, so we can remove the function and embed it into
dequeue_huge_page_vma() directly. This patch implements that.
Signed-off-by: Joonsoo Kim <[email protected]>
diff --git a/mm/hugetlb.c b/mm/hugetlb.c
index f6a7a4e..ed2d0af 100644
--- a/mm/hugetlb.c
+++ b/mm/hugetlb.c
@@ -434,25 +434,6 @@ static int is_vma_resv_set(struct vm_area_struct *vma, unsigned long flag)
return (get_vma_private_data(vma) & flag) != 0;
}
-/* Decrement the reserved pages in the hugepage pool by one */
-static void decrement_hugepage_resv_vma(struct hstate *h,
- struct vm_area_struct *vma)
-{
- if (vma->vm_flags & VM_NORESERVE)
- return;
-
- if (vma->vm_flags & VM_MAYSHARE) {
- /* Shared mappings always use reserves */
- h->resv_huge_pages--;
- } else if (is_vma_resv_set(vma, HPAGE_RESV_OWNER)) {
- /*
- * Only the process that called mmap() has reserves for
- * private mappings.
- */
- h->resv_huge_pages--;
- }
-}
-
/* Reset counters to 0 and clear all HPAGE_RESV_* flags */
void reset_vma_resv_huge_pages(struct vm_area_struct *vma)
{
@@ -466,10 +447,18 @@ static int vma_has_reserves(struct vm_area_struct *vma)
{
if (vma->vm_flags & VM_NORESERVE)
return 0;
+
+ /* Shared mappings always use reserves */
if (vma->vm_flags & VM_MAYSHARE)
return 1;
+
+ /*
+ * Only the process that called mmap() has reserves for
+ * private mappings.
+ */
if (is_vma_resv_set(vma, HPAGE_RESV_OWNER))
return 1;
+
return 0;
}
@@ -564,8 +553,8 @@ retry_cpuset:
if (cpuset_zone_allowed_softwall(zone, htlb_alloc_mask)) {
page = dequeue_huge_page_node(h, zone_to_nid(zone));
if (page) {
- if (!avoid_reserve)
- decrement_hugepage_resv_vma(h, vma);
+ if (!avoid_reserve && vma_has_reserves(vma))
+ h->resv_huge_pages--;
break;
}
}
--
1.7.9.5
If we map a region with MAP_NORESERVE and MAP_SHARED, the reserve
count check is skipped, and eventually we can no longer guarantee
that a huge page will be available at fault time.
With the following example code, you can easily observe this situation.
Assume 2MB, nr_hugepages = 100
fd = hugetlbfs_unlinked_fd();
if (fd < 0)
return 1;
size = 200 * MB;
flag = MAP_SHARED;
p = mmap(NULL, size, PROT_READ|PROT_WRITE, flag, fd, 0);
if (p == MAP_FAILED) {
fprintf(stderr, "mmap() failed: %s\n", strerror(errno));
return -1;
}
size = 2 * MB;
flag = MAP_ANONYMOUS | MAP_SHARED | MAP_HUGETLB | MAP_NORESERVE;
p = mmap(NULL, size, PROT_READ|PROT_WRITE, flag, -1, 0);
if (p == MAP_FAILED) {
fprintf(stderr, "mmap() failed: %s\n", strerror(errno));
}
p[0] = '0';
sleep(10);
While sleep(10) is executing, run 'cat /proc/meminfo' from another
process. You will see the problem mentioned above.
The solution is simple: we should check VM_NORESERVE in
vma_has_reserves(). This prevents the use of a pre-allocated huge page
when the free count is below the reserve count.
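For reference, a complete, compilable sketch of the example is below,
assuming 2MB huge pages with nr_hugepages = 100, libhugetlbfs
(hugetlbfs_unlinked_fd() from <hugetlbfs.h>) and a mounted hugetlbfs;
MB is defined in the program.

/*
 * Complete, compilable sketch of the example above.  Assumptions:
 * 2MB huge pages, nr_hugepages = 100, libhugetlbfs, mounted hugetlbfs.
 * Build e.g. with: gcc noresv.c -lhugetlbfs
 */
#define _GNU_SOURCE	/* for MAP_ANONYMOUS/MAP_HUGETLB on older glibc */
#include <stdio.h>
#include <string.h>
#include <errno.h>
#include <unistd.h>
#include <sys/mman.h>
#include <hugetlbfs.h>

#define MB (1024UL * 1024UL)

int main(void)
{
	char *p;
	size_t size;
	int fd = hugetlbfs_unlinked_fd();

	if (fd < 0)
		return 1;

	/* Reserve all 100 huge pages through a fully reserved mapping. */
	size = 200 * MB;
	p = mmap(NULL, size, PROT_READ|PROT_WRITE, MAP_SHARED, fd, 0);
	if (p == MAP_FAILED) {
		fprintf(stderr, "mmap() failed: %s\n", strerror(errno));
		return -1;
	}

	/*
	 * The MAP_NORESERVE mapping then takes a page from the reserved
	 * pool at fault time, which this patch prevents when the free
	 * count would dip below the reserve count.
	 */
	size = 2 * MB;
	p = mmap(NULL, size, PROT_READ|PROT_WRITE,
		 MAP_ANONYMOUS | MAP_SHARED | MAP_HUGETLB | MAP_NORESERVE,
		 -1, 0);
	if (p == MAP_FAILED) {
		fprintf(stderr, "mmap() failed: %s\n", strerror(errno));
		return -1;
	}
	p[0] = '0';

	/* Check HugePages_Free/HugePages_Rsvd in /proc/meminfo meanwhile. */
	sleep(10);
	return 0;
}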
Signed-off-by: Joonsoo Kim <[email protected]>
diff --git a/mm/hugetlb.c b/mm/hugetlb.c
index 6c1eb9b..f6a7a4e 100644
--- a/mm/hugetlb.c
+++ b/mm/hugetlb.c
@@ -464,6 +464,8 @@ void reset_vma_resv_huge_pages(struct vm_area_struct *vma)
/* Returns true if the VMA has associated reserve pages */
static int vma_has_reserves(struct vm_area_struct *vma)
{
+ if (vma->vm_flags & VM_NORESERVE)
+ return 0;
if (vma->vm_flags & VM_MAYSHARE)
return 1;
if (is_vma_resv_set(vma, HPAGE_RESV_OWNER))
--
1.7.9.5
The name of the mutex written in the comment is wrong.
Fix it.
Signed-off-by: Joonsoo Kim <[email protected]>
diff --git a/mm/hugetlb.c b/mm/hugetlb.c
index d87f70b..d21a33a 100644
--- a/mm/hugetlb.c
+++ b/mm/hugetlb.c
@@ -135,9 +135,9 @@ static inline struct hugepage_subpool *subpool_vma(struct vm_area_struct *vma)
* across the pages in a mapping.
*
* The region data structures are protected by a combination of the mmap_sem
- * and the hugetlb_instantion_mutex. To access or modify a region the caller
+ * and the hugetlb_instantiation_mutex. To access or modify a region the caller
* must either hold the mmap_sem for write, or the mmap_sem for read and
- * the hugetlb_instantiation mutex:
+ * the hugetlb_instantiation_mutex:
*
* down_write(&mm->mmap_sem);
* or
--
1.7.9.5
We don't need to proceed with the processing if we don't have any
usable free huge pages, so move this code up.
Signed-off-by: Joonsoo Kim <[email protected]>
diff --git a/mm/hugetlb.c b/mm/hugetlb.c
index e2bfbf7..d87f70b 100644
--- a/mm/hugetlb.c
+++ b/mm/hugetlb.c
@@ -539,10 +539,6 @@ static struct page *dequeue_huge_page_vma(struct hstate *h,
struct zoneref *z;
unsigned int cpuset_mems_cookie;
-retry_cpuset:
- cpuset_mems_cookie = get_mems_allowed();
- zonelist = huge_zonelist(vma, address,
- htlb_alloc_mask, &mpol, &nodemask);
/*
* A child process with MAP_PRIVATE mappings created by their parent
* have no page reserves. This check ensures that reservations are
@@ -550,11 +546,16 @@ retry_cpuset:
*/
if (!vma_has_reserves(vma) &&
h->free_huge_pages - h->resv_huge_pages == 0)
- goto err;
+ return NULL;
/* If reserves cannot be used, ensure enough pages are in the pool */
if (avoid_reserve && h->free_huge_pages - h->resv_huge_pages == 0)
- goto err;
+ return NULL;
+
+retry_cpuset:
+ cpuset_mems_cookie = get_mems_allowed();
+ zonelist = huge_zonelist(vma, address,
+ htlb_alloc_mask, &mpol, &nodemask);
for_each_zone_zonelist_nodemask(zone, z, zonelist,
MAX_NR_ZONES - 1, nodemask) {
@@ -572,10 +573,6 @@ retry_cpuset:
if (unlikely(!put_mems_allowed(cpuset_mems_cookie) && !page))
goto retry_cpuset;
return page;
-
-err:
- mpol_cond_put(mpol);
- return NULL;
}
static void update_and_free_page(struct hstate *h, struct page *page)
--
1.7.9.5
On Mon, Jul 15, 2013 at 5:52 PM, Joonsoo Kim <[email protected]> wrote:
> The name of the mutex written in comment is wrong.
> Fix it.
>
> Signed-off-by: Joonsoo Kim <[email protected]>
>
Acked-by: Hillf Danton <[email protected]>
> diff --git a/mm/hugetlb.c b/mm/hugetlb.c
> index d87f70b..d21a33a 100644
> --- a/mm/hugetlb.c
> +++ b/mm/hugetlb.c
> @@ -135,9 +135,9 @@ static inline struct hugepage_subpool *subpool_vma(struct vm_area_struct *vma)
> * across the pages in a mapping.
> *
> * The region data structures are protected by a combination of the mmap_sem
> - * and the hugetlb_instantion_mutex. To access or modify a region the caller
> + * and the hugetlb_instantiation_mutex. To access or modify a region the caller
> * must either hold the mmap_sem for write, or the mmap_sem for read and
> - * the hugetlb_instantiation mutex:
> + * the hugetlb_instantiation_mutex:
> *
> * down_write(&mm->mmap_sem);
> * or
> --
> 1.7.9.5
>
Joonsoo Kim <[email protected]> writes:
> Currently, we use a page with mapped count 1 in page cache for cow
> optimization. If we find this condition, we don't allocate a new
> page and copy contents. Instead, we map this page directly.
> This may introduce a problem that writting to private mapping overwrite
> hugetlb file directly. You can find this situation with following code.
>
> size = 20 * MB;
> flag = MAP_SHARED;
> p = mmap(NULL, size, PROT_READ|PROT_WRITE, flag, fd, 0);
> if (p == MAP_FAILED) {
> fprintf(stderr, "mmap() failed: %s\n", strerror(errno));
> return -1;
> }
> p[0] = 's';
> fprintf(stdout, "BEFORE STEAL PRIVATE WRITE: %c\n", p[0]);
> munmap(p, size);
>
> flag = MAP_PRIVATE;
> p = mmap(NULL, size, PROT_READ|PROT_WRITE, flag, fd, 0);
> if (p == MAP_FAILED) {
> fprintf(stderr, "mmap() failed: %s\n", strerror(errno));
> }
> p[0] = 'c';
> munmap(p, size);
>
> flag = MAP_SHARED;
> p = mmap(NULL, size, PROT_READ|PROT_WRITE, flag, fd, 0);
> if (p == MAP_FAILED) {
> fprintf(stderr, "mmap() failed: %s\n", strerror(errno));
> return -1;
> }
> fprintf(stdout, "AFTER STEAL PRIVATE WRITE: %c\n", p[0]);
> munmap(p, size);
>
> We can see that "AFTER STEAL PRIVATE WRITE: c", not "AFTER STEAL
> PRIVATE WRITE: s". If we turn off this optimization to a page
> in page cache, the problem is disappeared.
Do we need to turn off the optimization for page cache pages completely?
Can't we check for the MAP_PRIVATE case?
Also, we may want to add the above test to libhugetlbfs.
>
> Signed-off-by: Joonsoo Kim <[email protected]>
>
> diff --git a/mm/hugetlb.c b/mm/hugetlb.c
> index d4a1695..6c1eb9b 100644
> --- a/mm/hugetlb.c
> +++ b/mm/hugetlb.c
> @@ -2512,7 +2512,6 @@ static int hugetlb_cow(struct mm_struct *mm, struct vm_area_struct *vma,
> {
> struct hstate *h = hstate_vma(vma);
> struct page *old_page, *new_page;
> - int avoidcopy;
> int outside_reserve = 0;
> unsigned long mmun_start; /* For mmu_notifiers */
> unsigned long mmun_end; /* For mmu_notifiers */
> @@ -2522,10 +2521,8 @@ static int hugetlb_cow(struct mm_struct *mm, struct vm_area_struct *vma,
> retry_avoidcopy:
> /* If no-one else is actually using this page, avoid the copy
> * and just make the page writable */
> - avoidcopy = (page_mapcount(old_page) == 1);
> - if (avoidcopy) {
> - if (PageAnon(old_page))
> - page_move_anon_rmap(old_page, vma, address);
> + if (page_mapcount(old_page) == 1 && PageAnon(old_page)) {
> + page_move_anon_rmap(old_page, vma, address);
> set_huge_ptep_writable(vma, address, ptep);
> return 0;
> }
-aneesh
Joonsoo Kim <[email protected]> writes:
> We don't need to proceede the processing if we don't have any usable
> free huge page. So move this code up.
I guess you can also mention that, since we are holding hugetlb_lock,
the hstate values can't change.
Also:
>
> Signed-off-by: Joonsoo Kim <[email protected]>
>
> diff --git a/mm/hugetlb.c b/mm/hugetlb.c
> index e2bfbf7..d87f70b 100644
> --- a/mm/hugetlb.c
> +++ b/mm/hugetlb.c
> @@ -539,10 +539,6 @@ static struct page *dequeue_huge_page_vma(struct hstate *h,
> struct zoneref *z;
> unsigned int cpuset_mems_cookie;
>
> -retry_cpuset:
> - cpuset_mems_cookie = get_mems_allowed();
> - zonelist = huge_zonelist(vma, address,
> - htlb_alloc_mask, &mpol, &nodemask);
> /*
> * A child process with MAP_PRIVATE mappings created by their parent
> * have no page reserves. This check ensures that reservations are
> @@ -550,11 +546,16 @@ retry_cpuset:
> */
> if (!vma_has_reserves(vma) &&
> h->free_huge_pages - h->resv_huge_pages == 0)
> - goto err;
> + return NULL;
>
If you don't do the above change, the patch will be much simpler.
> /* If reserves cannot be used, ensure enough pages are in the pool */
> if (avoid_reserve && h->free_huge_pages - h->resv_huge_pages == 0)
> - goto err;
> + return NULL;
> +
Same here.
> +retry_cpuset:
> + cpuset_mems_cookie = get_mems_allowed();
> + zonelist = huge_zonelist(vma, address,
> + htlb_alloc_mask, &mpol, &nodemask);
>
> for_each_zone_zonelist_nodemask(zone, z, zonelist,
> MAX_NR_ZONES - 1, nodemask) {
> @@ -572,10 +573,6 @@ retry_cpuset:
> if (unlikely(!put_mems_allowed(cpuset_mems_cookie) && !page))
> goto retry_cpuset;
> return page;
> -
> -err:
> - mpol_cond_put(mpol);
> - return NULL;
> }
>
> static void update_and_free_page(struct hstate *h, struct page *page)
-aneesh
Joonsoo Kim <[email protected]> writes:
> The name of the mutex written in comment is wrong.
> Fix it.
>
> Signed-off-by: Joonsoo Kim <[email protected]>
Reviewed-by: Aneesh Kumar K.V <[email protected]>
>
> diff --git a/mm/hugetlb.c b/mm/hugetlb.c
> index d87f70b..d21a33a 100644
> --- a/mm/hugetlb.c
> +++ b/mm/hugetlb.c
> @@ -135,9 +135,9 @@ static inline struct hugepage_subpool *subpool_vma(struct vm_area_struct *vma)
> * across the pages in a mapping.
> *
> * The region data structures are protected by a combination of the mmap_sem
> - * and the hugetlb_instantion_mutex. To access or modify a region the caller
> + * and the hugetlb_instantiation_mutex. To access or modify a region the caller
> * must either hold the mmap_sem for write, or the mmap_sem for read and
> - * the hugetlb_instantiation mutex:
> + * the hugetlb_instantiation_mutex:
> *
> * down_write(&mm->mmap_sem);
> * or
> --
> 1.7.9.5
Joonsoo Kim <[email protected]> writes:
> First 5 patches are almost trivial clean-up patches.
>
> The others are for fixing three bugs.
> Perhaps, these problems are minor, because this codes are used
> for a long time, and there is no bug reporting for these problems.
>
> These patches are based on v3.10.0 and
> passed sanity check of libhugetlbfs.
does that mean you ran the libhugetlbfs test suite?
-aneesh
Joonsoo Kim <[email protected]> writes:
> If list is empty, list_for_each_entry_safe() doesn't do anything.
> So, this check is redundant. Remove it.
>
> Signed-off-by: Joonsoo Kim <[email protected]>
Reviewed-by: Aneesh Kumar K.V <[email protected]>
>
> diff --git a/mm/hugetlb.c b/mm/hugetlb.c
> index a838e6b..d4a1695 100644
> --- a/mm/hugetlb.c
> +++ b/mm/hugetlb.c
> @@ -1019,10 +1019,8 @@ free:
> spin_unlock(&hugetlb_lock);
>
> /* Free unnecessary surplus pages to the buddy allocator */
> - if (!list_empty(&surplus_list)) {
> - list_for_each_entry_safe(page, tmp, &surplus_list, lru) {
> - put_page(page);
> - }
> + list_for_each_entry_safe(page, tmp, &surplus_list, lru) {
> + put_page(page);
> }
You can now remove '{'
> spin_lock(&hugetlb_lock);
>
> --
> 1.7.9.5
Joonsoo Kim <[email protected]> writes:
> Current node iteration code have a minor problem which do one more
> node rotation if we can't succeed to allocate. For example,
> if we start to allocate at node 0, we stop to iterate at node 0.
> Then we start to allocate at node 1 for next allocation.
Can you explain the problem in a bit more detail?
>
> I introduce new macros "for_each_node_mask_to_[alloc|free]" and
> fix and clean-up node iteration code to alloc or free.
> This makes code more understandable.
>
I found the existing code more readable. Obviously I haven't yet figured
out the problem you have observed with the code.
> Signed-off-by: Joonsoo Kim <[email protected]>
>
> diff --git a/mm/hugetlb.c b/mm/hugetlb.c
> index 0067cf4..a838e6b 100644
> --- a/mm/hugetlb.c
> +++ b/mm/hugetlb.c
> @@ -752,33 +752,6 @@ static int hstate_next_node_to_alloc(struct hstate *h,
> return nid;
> }
>
> -static int alloc_fresh_huge_page(struct hstate *h, nodemask_t *nodes_allowed)
> -{
> - struct page *page;
> - int start_nid;
> - int next_nid;
> - int ret = 0;
> -
> - start_nid = hstate_next_node_to_alloc(h, nodes_allowed);
> - next_nid = start_nid;
> -
> - do {
> - page = alloc_fresh_huge_page_node(h, next_nid);
> - if (page) {
> - ret = 1;
> - break;
> - }
> - next_nid = hstate_next_node_to_alloc(h, nodes_allowed);
> - } while (next_nid != start_nid);
> -
> - if (ret)
> - count_vm_event(HTLB_BUDDY_PGALLOC);
> - else
> - count_vm_event(HTLB_BUDDY_PGALLOC_FAIL);
> -
> - return ret;
> -}
> -
> /*
> * helper for free_pool_huge_page() - return the previously saved
> * node ["this node"] from which to free a huge page. Advance the
> @@ -797,6 +770,42 @@ static int hstate_next_node_to_free(struct hstate *h, nodemask_t *nodes_allowed)
> return nid;
> }
>
> +#define for_each_node_mask_to_alloc(hs, nr_nodes, node, mask) \
> + for (nr_nodes = nodes_weight(*mask), \
> + node = hstate_next_node_to_alloc(hs, mask); \
> + nr_nodes > 0 && \
> + ((node = hstate_next_node_to_alloc(hs, mask)) || 1); \
> + nr_nodes--)
> +
> +#define for_each_node_mask_to_free(hs, nr_nodes, node, mask) \
> + for (nr_nodes = nodes_weight(*mask), \
> + node = hstate_next_node_to_free(hs, mask); \
> + nr_nodes > 0 && \
> + ((node = hstate_next_node_to_free(hs, mask)) || 1); \
> + nr_nodes--)
> +
> +static int alloc_fresh_huge_page(struct hstate *h, nodemask_t *nodes_allowed)
> +{
> + struct page *page;
> + int nr_nodes, node;
> + int ret = 0;
> +
> + for_each_node_mask_to_alloc(h, nr_nodes, node, nodes_allowed) {
This checks nodes_weight() and can fail right away (nr_nodes == 0)?
That is not the case with the existing code, which will still try to
allocate from h->next_nid_to_alloc. Is that ok?
> + page = alloc_fresh_huge_page_node(h, node);
> + if (page) {
> + ret = 1;
> + break;
> + }
> + }
> +
> + if (ret)
> + count_vm_event(HTLB_BUDDY_PGALLOC);
> + else
> + count_vm_event(HTLB_BUDDY_PGALLOC_FAIL);
> +
> + return ret;
> +}
> +
-aneesh
Joonsoo Kim <[email protected]> writes:
> If we map the region with MAP_NORESERVE and MAP_SHARED,
> we can skip to check reserve counting and eventually we cannot be ensured
> to allocate a huge page in fault time.
> With following example code, you can easily find this situation.
>
> Assume 2MB, nr_hugepages = 100
>
> fd = hugetlbfs_unlinked_fd();
> if (fd < 0)
> return 1;
>
> size = 200 * MB;
> flag = MAP_SHARED;
> p = mmap(NULL, size, PROT_READ|PROT_WRITE, flag, fd, 0);
> if (p == MAP_FAILED) {
> fprintf(stderr, "mmap() failed: %s\n", strerror(errno));
> return -1;
> }
>
> size = 2 * MB;
> flag = MAP_ANONYMOUS | MAP_SHARED | MAP_HUGETLB | MAP_NORESERVE;
> p = mmap(NULL, size, PROT_READ|PROT_WRITE, flag, -1, 0);
> if (p == MAP_FAILED) {
> fprintf(stderr, "mmap() failed: %s\n", strerror(errno));
> }
> p[0] = '0';
> sleep(10);
>
> During executing sleep(10), run 'cat /proc/meminfo' on another process.
> You'll find a mentioned problem.
>
> Solution is simple. We should check VM_NORESERVE in vma_has_reserves().
> This prevent to use a pre-allocated huge page if free count is under
> the reserve count.
>
> Signed-off-by: Joonsoo Kim <[email protected]>
Reviewed-by: Aneesh Kumar K.V <[email protected]>
Maybe it is better to say:
"a non-reserved shared mapping should not eat into the reserve space,
so return an error when we don't find enough free space."
>
> diff --git a/mm/hugetlb.c b/mm/hugetlb.c
> index 6c1eb9b..f6a7a4e 100644
> --- a/mm/hugetlb.c
> +++ b/mm/hugetlb.c
> @@ -464,6 +464,8 @@ void reset_vma_resv_huge_pages(struct vm_area_struct *vma)
> /* Returns true if the VMA has associated reserve pages */
> static int vma_has_reserves(struct vm_area_struct *vma)
> {
> + if (vma->vm_flags & VM_NORESERVE)
> + return 0;
> if (vma->vm_flags & VM_MAYSHARE)
> return 1;
> if (is_vma_resv_set(vma, HPAGE_RESV_OWNER))
> --
> 1.7.9.5
Joonsoo Kim <[email protected]> writes:
> Now, Checking condition of decrement_hugepage_resv_vma() and
> vma_has_reserves() is same, so we can clean-up this function with
> vma_has_reserves(). Additionally, decrement_hugepage_resv_vma() has only
> one call site, so we can remove function and embed it into
> dequeue_huge_page_vma() directly. This patch implement it.
>
> Signed-off-by: Joonsoo Kim <[email protected]>
Reviewed-by: Aneesh Kumar K.V <[email protected]>
>
> diff --git a/mm/hugetlb.c b/mm/hugetlb.c
> index f6a7a4e..ed2d0af 100644
> --- a/mm/hugetlb.c
> +++ b/mm/hugetlb.c
> @@ -434,25 +434,6 @@ static int is_vma_resv_set(struct vm_area_struct *vma, unsigned long flag)
> return (get_vma_private_data(vma) & flag) != 0;
> }
>
> -/* Decrement the reserved pages in the hugepage pool by one */
> -static void decrement_hugepage_resv_vma(struct hstate *h,
> - struct vm_area_struct *vma)
> -{
> - if (vma->vm_flags & VM_NORESERVE)
> - return;
> -
> - if (vma->vm_flags & VM_MAYSHARE) {
> - /* Shared mappings always use reserves */
> - h->resv_huge_pages--;
> - } else if (is_vma_resv_set(vma, HPAGE_RESV_OWNER)) {
> - /*
> - * Only the process that called mmap() has reserves for
> - * private mappings.
> - */
> - h->resv_huge_pages--;
> - }
> -}
> -
> /* Reset counters to 0 and clear all HPAGE_RESV_* flags */
> void reset_vma_resv_huge_pages(struct vm_area_struct *vma)
> {
> @@ -466,10 +447,18 @@ static int vma_has_reserves(struct vm_area_struct *vma)
> {
> if (vma->vm_flags & VM_NORESERVE)
> return 0;
> +
> + /* Shared mappings always use reserves */
> if (vma->vm_flags & VM_MAYSHARE)
> return 1;
> +
> + /*
> + * Only the process that called mmap() has reserves for
> + * private mappings.
> + */
> if (is_vma_resv_set(vma, HPAGE_RESV_OWNER))
> return 1;
> +
> return 0;
> }
>
> @@ -564,8 +553,8 @@ retry_cpuset:
> if (cpuset_zone_allowed_softwall(zone, htlb_alloc_mask)) {
> page = dequeue_huge_page_node(h, zone_to_nid(zone));
> if (page) {
> - if (!avoid_reserve)
> - decrement_hugepage_resv_vma(h, vma);
> + if (!avoid_reserve && vma_has_reserves(vma))
> + h->resv_huge_pages--;
> break;
> }
> }
> --
> 1.7.9.5
Joonsoo Kim <[email protected]> writes:
> If we map the region with MAP_NORESERVE and MAP_SHARED,
> we can skip to check reserve counting and eventually we cannot be ensured
> to allocate a huge page in fault time.
> With following example code, you can easily find this situation.
>
> Assume 2MB, nr_hugepages = 100
>
> fd = hugetlbfs_unlinked_fd();
> if (fd < 0)
> return 1;
>
> size = 200 * MB;
> flag = MAP_SHARED;
> p = mmap(NULL, size, PROT_READ|PROT_WRITE, flag, fd, 0);
> if (p == MAP_FAILED) {
> fprintf(stderr, "mmap() failed: %s\n", strerror(errno));
> return -1;
> }
>
> size = 2 * MB;
> flag = MAP_ANONYMOUS | MAP_SHARED | MAP_HUGETLB | MAP_NORESERVE;
> p = mmap(NULL, size, PROT_READ|PROT_WRITE, flag, -1, 0);
> if (p == MAP_FAILED) {
> fprintf(stderr, "mmap() failed: %s\n", strerror(errno));
> }
> p[0] = '0';
> sleep(10);
>
> During executing sleep(10), run 'cat /proc/meminfo' on another process.
> You'll find a mentioned problem.
>
> Solution is simple. We should check VM_NORESERVE in vma_has_reserves().
> This prevent to use a pre-allocated huge page if free count is under
> the reserve count.
You have a problem with this patch, which I guess you are fixing in
patch 9. Consider two processes:
a) MAP_SHARED on fd
b) MAP_SHARED | MAP_NORESERVE on fd
We should allow (b) to access the page even if VM_NORESERVE is set
and we are out of reserve space.
So maybe you should rearrange the patches?
>
> Signed-off-by: Joonsoo Kim <[email protected]>
>
> diff --git a/mm/hugetlb.c b/mm/hugetlb.c
> index 6c1eb9b..f6a7a4e 100644
> --- a/mm/hugetlb.c
> +++ b/mm/hugetlb.c
> @@ -464,6 +464,8 @@ void reset_vma_resv_huge_pages(struct vm_area_struct *vma)
> /* Returns true if the VMA has associated reserve pages */
> static int vma_has_reserves(struct vm_area_struct *vma)
> {
> + if (vma->vm_flags & VM_NORESERVE)
> + return 0;
> if (vma->vm_flags & VM_MAYSHARE)
> return 1;
> if (is_vma_resv_set(vma, HPAGE_RESV_OWNER))
> --
> 1.7.9.5
Joonsoo Kim <[email protected]> writes:
> If a vma with VM_NORESERVE allocate a new page for page cache, we should
> check whether this area is reserved or not. If this address is
> already reserved by other process(in case of chg == 0), we should
> decrement reserve count, because this allocated page will go into page
> cache and currently, there is no way to know that this page comes from
> reserved pool or not when releasing inode. This may introduce
> over-counting problem to reserved count. With following example code,
> you can easily reproduce this situation.
>
> size = 20 * MB;
> flag = MAP_SHARED;
> p = mmap(NULL, size, PROT_READ|PROT_WRITE, flag, fd, 0);
> if (p == MAP_FAILED) {
> fprintf(stderr, "mmap() failed: %s\n", strerror(errno));
> return -1;
> }
>
> flag = MAP_SHARED | MAP_NORESERVE;
> q = mmap(NULL, size, PROT_READ|PROT_WRITE, flag, fd, 0);
> if (q == MAP_FAILED) {
> fprintf(stderr, "mmap() failed: %s\n", strerror(errno));
> }
> q[0] = 'c';
>
> This patch solve this problem.
>
> Signed-off-by: Joonsoo Kim <[email protected]>
Reviewed-by: Aneesh Kumar K.V <[email protected]>
>
> diff --git a/mm/hugetlb.c b/mm/hugetlb.c
> index ed2d0af..defb180 100644
> --- a/mm/hugetlb.c
> +++ b/mm/hugetlb.c
> @@ -443,10 +443,23 @@ void reset_vma_resv_huge_pages(struct vm_area_struct *vma)
> }
>
> /* Returns true if the VMA has associated reserve pages */
> -static int vma_has_reserves(struct vm_area_struct *vma)
> +static int vma_has_reserves(struct vm_area_struct *vma, long chg)
> {
> - if (vma->vm_flags & VM_NORESERVE)
> - return 0;
> + if (vma->vm_flags & VM_NORESERVE) {
> + /*
> + * This address is already reserved by other process(chg == 0),
> + * so, we should decreament reserved count. Without
> + * decreamenting, reserve count is remained after releasing
> + * inode, because this allocated page will go into page cache
> + * and is regarded as coming from reserved pool in releasing
> + * step. Currently, we don't have any other solution to deal
> + * with this situation properly, so add work-around here.
> + */
> + if (vma->vm_flags & VM_MAYSHARE && chg == 0)
> + return 1;
> + else
> + return 0;
> + }
>
> /* Shared mappings always use reserves */
> if (vma->vm_flags & VM_MAYSHARE)
> @@ -520,7 +533,8 @@ static struct page *dequeue_huge_page_node(struct hstate *h, int nid)
>
> static struct page *dequeue_huge_page_vma(struct hstate *h,
> struct vm_area_struct *vma,
> - unsigned long address, int avoid_reserve)
> + unsigned long address, int avoid_reserve,
> + long chg)
> {
> struct page *page = NULL;
> struct mempolicy *mpol;
> @@ -535,7 +549,7 @@ static struct page *dequeue_huge_page_vma(struct hstate *h,
> * have no page reserves. This check ensures that reservations are
> * not "stolen". The child may still get SIGKILLed
> */
> - if (!vma_has_reserves(vma) &&
> + if (!vma_has_reserves(vma, chg) &&
> h->free_huge_pages - h->resv_huge_pages == 0)
> return NULL;
>
> @@ -553,8 +567,12 @@ retry_cpuset:
> if (cpuset_zone_allowed_softwall(zone, htlb_alloc_mask)) {
> page = dequeue_huge_page_node(h, zone_to_nid(zone));
> if (page) {
> - if (!avoid_reserve && vma_has_reserves(vma))
> - h->resv_huge_pages--;
> + if (avoid_reserve)
> + break;
> + if (!vma_has_reserves(vma, chg))
> + break;
> +
> + h->resv_huge_pages--;
> break;
> }
> }
> @@ -1139,7 +1157,7 @@ static struct page *alloc_huge_page(struct vm_area_struct *vma,
> }
>
> spin_lock(&hugetlb_lock);
> - page = dequeue_huge_page_vma(h, vma, addr, avoid_reserve);
> + page = dequeue_huge_page_vma(h, vma, addr, avoid_reserve, chg);
> if (!page) {
> spin_unlock(&hugetlb_lock);
> page = alloc_buddy_huge_page(h, NUMA_NO_NODE);
> --
> 1.7.9.5
On Mon, Jul 15, 2013 at 07:40:16PM +0530, Aneesh Kumar K.V wrote:
> Joonsoo Kim <[email protected]> writes:
>
> > First 5 patches are almost trivial clean-up patches.
> >
> > The others are for fixing three bugs.
> > Perhaps, these problems are minor, because this codes are used
> > for a long time, and there is no bug reporting for these problems.
> >
> > These patches are based on v3.10.0 and
> > passed sanity check of libhugetlbfs.
>
> does that mean you had run with libhugetlbfs test suite ?
Yes! I couldn't find any regression in the libhugetlbfs test suite.
>
> -aneesh
>
On Mon, Jul 15, 2013 at 07:31:33PM +0530, Aneesh Kumar K.V wrote:
> Joonsoo Kim <[email protected]> writes:
>
> > We don't need to proceede the processing if we don't have any usable
> > free huge page. So move this code up.
>
> I guess you can also mention that since we are holding hugetlb_lock
> hstate values can't change.
Okay. I will mention this for v2.
>
>
> Also.
>
> >
> > Signed-off-by: Joonsoo Kim <[email protected]>
> >
> > diff --git a/mm/hugetlb.c b/mm/hugetlb.c
> > index e2bfbf7..d87f70b 100644
> > --- a/mm/hugetlb.c
> > +++ b/mm/hugetlb.c
> > @@ -539,10 +539,6 @@ static struct page *dequeue_huge_page_vma(struct hstate *h,
> > struct zoneref *z;
> > unsigned int cpuset_mems_cookie;
> >
> > -retry_cpuset:
> > - cpuset_mems_cookie = get_mems_allowed();
> > - zonelist = huge_zonelist(vma, address,
> > - htlb_alloc_mask, &mpol, &nodemask);
> > /*
> > * A child process with MAP_PRIVATE mappings created by their parent
> > * have no page reserves. This check ensures that reservations are
> > @@ -550,11 +546,16 @@ retry_cpuset:
> > */
> > if (!vma_has_reserves(vma) &&
> > h->free_huge_pages - h->resv_huge_pages == 0)
> > - goto err;
> > + return NULL;
> >
>
> If you don't do the above change, the patch will be much simpler.
The patch will be, but the resulting code will not.
With this change, we can remove one goto label ('err:'), which
makes the code more readable.
Thanks.
>
>
> > /* If reserves cannot be used, ensure enough pages are in the pool */
> > if (avoid_reserve && h->free_huge_pages - h->resv_huge_pages == 0)
> > - goto err;
> > + return NULL;
> > +
>
> Same here.
>
> > +retry_cpuset:
> > + cpuset_mems_cookie = get_mems_allowed();
> > + zonelist = huge_zonelist(vma, address,
> > + htlb_alloc_mask, &mpol, &nodemask);
> >
> > for_each_zone_zonelist_nodemask(zone, z, zonelist,
> > MAX_NR_ZONES - 1, nodemask) {
> > @@ -572,10 +573,6 @@ retry_cpuset:
> > if (unlikely(!put_mems_allowed(cpuset_mems_cookie) && !page))
> > goto retry_cpuset;
> > return page;
> > -
> > -err:
> > - mpol_cond_put(mpol);
> > - return NULL;
> > }
> >
> > static void update_and_free_page(struct hstate *h, struct page *page)
>
> -aneesh
>
On 07/16/2013 09:10 AM, Joonsoo Kim wrote:
> On Mon, Jul 15, 2013 at 07:40:16PM +0530, Aneesh Kumar K.V wrote:
>> Joonsoo Kim <[email protected]> writes:
>>
>>> First 5 patches are almost trivial clean-up patches.
>>>
>>> The others are for fixing three bugs.
>>> Perhaps, these problems are minor, because this codes are used
>>> for a long time, and there is no bug reporting for these problems.
>>>
>>> These patches are based on v3.10.0 and
>>> passed sanity check of libhugetlbfs.
>> does that mean you had run with libhugetlbfs test suite ?
> Yes! I can't find any reggression on libhugetlbfs test suite.
Where can I get your test case?
>
>>
>> -aneesh
>>
On Mon, Jul 15, 2013 at 07:57:37PM +0530, Aneesh Kumar K.V wrote:
> Joonsoo Kim <[email protected]> writes:
>
> > Current node iteration code have a minor problem which do one more
> > node rotation if we can't succeed to allocate. For example,
> > if we start to allocate at node 0, we stop to iterate at node 0.
> > Then we start to allocate at node 1 for next allocation.
>
> Can you explain the problem in a bit more detail
Yes.
Let me explain with the example below.
Assume that we start with the following condition:
h->next_nid_to_alloc = 0;
nodes_allowed = 0, 1;
and that we are in alloc_fresh_huge_page().
Execution starts with:
start_nid = 0;
next_nid = 0;
h->next_nid_to_alloc = 1;
Then we go into the loop and try to allocate from node 0.
If that fails, we call hstate_next_node_to_alloc() to get the next node,
so:
next_nid = 1;
h->next_nid_to_alloc = 0;
Then we try to allocate from node 1.
If we fail again:
next_nid = 0;
h->next_nid_to_alloc = 1;
Now next_nid is the same as start_nid, so we quit the loop.
The next time we call alloc_fresh_huge_page(), we start allocating
from *node 1*, not *node 0*.
This is the problem I mentioned.
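To make this concrete, here is a minimal userspace model of the old loop
(an illustration only, not kernel code), assuming two allowed nodes and
that every allocation attempt fails:

/*
 * Minimal userspace model of the old alloc_fresh_huge_page() iteration.
 * Two allowed nodes; every allocation attempt is assumed to fail.
 * next_nid_to_alloc stands in for h->next_nid_to_alloc above.
 */
#include <stdio.h>

#define NR_NODES 2

static int next_nid_to_alloc;	/* starts at node 0 */

/* simplified stand-in for hstate_next_node_to_alloc() */
static int next_node(void)
{
	int nid = next_nid_to_alloc;

	next_nid_to_alloc = (next_nid_to_alloc + 1) % NR_NODES;
	return nid;
}

/* the old do/while loop, with every attempt failing */
static void failed_alloc_pass(void)
{
	int start_nid = next_node();
	int next_nid = start_nid;

	do {
		printf("  try node %d\n", next_nid);
		next_nid = next_node();		/* advance on failure */
	} while (next_nid != start_nid);
}

int main(void)
{
	printf("first pass starts at node %d\n", next_nid_to_alloc);
	failed_alloc_pass();
	/*
	 * The pass advanced the saved position three times for two nodes,
	 * so the next pass starts at node 1 instead of node 0.
	 */
	printf("second pass would start at node %d\n", next_nid_to_alloc);
	return 0;
}

Running it shows the second pass would start at node 1, which is exactly
the extra rotation described above.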
>
> >
> > I introduce new macros "for_each_node_mask_to_[alloc|free]" and
> > fix and clean-up node iteration code to alloc or free.
> > This makes code more understandable.
> >
>
> I found the existing code more readable. Obviously I haven't yet figured
> out the problem you have observed with the code.
>
> > Signed-off-by: Joonsoo Kim <[email protected]>
> >
> > diff --git a/mm/hugetlb.c b/mm/hugetlb.c
> > index 0067cf4..a838e6b 100644
> > --- a/mm/hugetlb.c
> > +++ b/mm/hugetlb.c
> > @@ -752,33 +752,6 @@ static int hstate_next_node_to_alloc(struct hstate *h,
> > return nid;
> > }
> >
> > -static int alloc_fresh_huge_page(struct hstate *h, nodemask_t *nodes_allowed)
> > -{
> > - struct page *page;
> > - int start_nid;
> > - int next_nid;
> > - int ret = 0;
> > -
> > - start_nid = hstate_next_node_to_alloc(h, nodes_allowed);
> > - next_nid = start_nid;
> > -
> > - do {
> > - page = alloc_fresh_huge_page_node(h, next_nid);
> > - if (page) {
> > - ret = 1;
> > - break;
> > - }
> > - next_nid = hstate_next_node_to_alloc(h, nodes_allowed);
> > - } while (next_nid != start_nid);
> > -
> > - if (ret)
> > - count_vm_event(HTLB_BUDDY_PGALLOC);
> > - else
> > - count_vm_event(HTLB_BUDDY_PGALLOC_FAIL);
> > -
> > - return ret;
> > -}
> > -
> > /*
> > * helper for free_pool_huge_page() - return the previously saved
> > * node ["this node"] from which to free a huge page. Advance the
> > @@ -797,6 +770,42 @@ static int hstate_next_node_to_free(struct hstate *h, nodemask_t *nodes_allowed)
> > return nid;
> > }
> >
> > +#define for_each_node_mask_to_alloc(hs, nr_nodes, node, mask) \
> > + for (nr_nodes = nodes_weight(*mask), \
> > + node = hstate_next_node_to_alloc(hs, mask); \
> > + nr_nodes > 0 && \
> > + ((node = hstate_next_node_to_alloc(hs, mask)) || 1); \
> > + nr_nodes--)
> > +
> > +#define for_each_node_mask_to_free(hs, nr_nodes, node, mask) \
> > + for (nr_nodes = nodes_weight(*mask), \
> > + node = hstate_next_node_to_free(hs, mask); \
> > + nr_nodes > 0 && \
> > + ((node = hstate_next_node_to_free(hs, mask)) || 1); \
> > + nr_nodes--)
> > +
> > +static int alloc_fresh_huge_page(struct hstate *h, nodemask_t *nodes_allowed)
> > +{
> > + struct page *page;
> > + int nr_nodes, node;
> > + int ret = 0;
> > +
> > + for_each_node_mask_to_alloc(h, nr_nodes, node, nodes_allowed) {
>
>
> This check for nodes_weight and fail right ? (nr_nodes == 0). That is
> not the case with the existing code. It will allocate from
> h->next_nid_to_alloc. Is that ok ?
The above code also tries to allocate from h->next_nid_to_alloc.
nr_nodes is used to determine when we quit the loop:
once all nodes in the node mask have been traversed, we quit.
Thanks.
>
>
> > + page = alloc_fresh_huge_page_node(h, node);
> > + if (page) {
> > + ret = 1;
> > + break;
> > + }
> > + }
> > +
> > + if (ret)
> > + count_vm_event(HTLB_BUDDY_PGALLOC);
> > + else
> > + count_vm_event(HTLB_BUDDY_PGALLOC_FAIL);
> > +
> > + return ret;
> > +}
> > +
>
> -aneesh
>
On Mon, Jul 15, 2013 at 08:01:24PM +0530, Aneesh Kumar K.V wrote:
> Joonsoo Kim <[email protected]> writes:
>
> > If list is empty, list_for_each_entry_safe() doesn't do anything.
> > So, this check is redundant. Remove it.
> >
> > Signed-off-by: Joonsoo Kim <[email protected]>
>
> Reviewed-by: Aneesh Kumar K.V <[email protected]>
>
>
> >
> > diff --git a/mm/hugetlb.c b/mm/hugetlb.c
> > index a838e6b..d4a1695 100644
> > --- a/mm/hugetlb.c
> > +++ b/mm/hugetlb.c
> > @@ -1019,10 +1019,8 @@ free:
> > spin_unlock(&hugetlb_lock);
> >
> > /* Free unnecessary surplus pages to the buddy allocator */
> > - if (!list_empty(&surplus_list)) {
> > - list_for_each_entry_safe(page, tmp, &surplus_list, lru) {
> > - put_page(page);
> > - }
> > + list_for_each_entry_safe(page, tmp, &surplus_list, lru) {
> > + put_page(page);
> > }
>
> You can now remove '{'
Okay.
I will do that.
Thanks.
>
>
> > spin_lock(&hugetlb_lock);
> >
> > --
> > 1.7.9.5
>
On Tue, Jul 16, 2013 at 09:27:29AM +0800, Sam Ben wrote:
> On 07/16/2013 09:10 AM, Joonsoo Kim wrote:
> >On Mon, Jul 15, 2013 at 07:40:16PM +0530, Aneesh Kumar K.V wrote:
> >>Joonsoo Kim <[email protected]> writes:
> >>
> >>>First 5 patches are almost trivial clean-up patches.
> >>>
> >>>The others are for fixing three bugs.
> >>>Perhaps, these problems are minor, because this codes are used
> >>>for a long time, and there is no bug reporting for these problems.
> >>>
> >>>These patches are based on v3.10.0 and
> >>>passed sanity check of libhugetlbfs.
> >>does that mean you had run with libhugetlbfs test suite ?
> >Yes! I can't find any reggression on libhugetlbfs test suite.
>
> Where can get your test case?
These are my own test cases.
I plan to submit them to the libhugetlbfs test suite.
Thanks.
>
> >
> >>-aneesh
> >>
On Mon, Jul 15, 2013 at 07:25:40PM +0530, Aneesh Kumar K.V wrote:
> Joonsoo Kim <[email protected]> writes:
>
> > Currently, we use a page with mapped count 1 in page cache for cow
> > optimization. If we find this condition, we don't allocate a new
> > page and copy contents. Instead, we map this page directly.
> > This may introduce a problem that writting to private mapping overwrite
> > hugetlb file directly. You can find this situation with following code.
> >
> > size = 20 * MB;
> > flag = MAP_SHARED;
> > p = mmap(NULL, size, PROT_READ|PROT_WRITE, flag, fd, 0);
> > if (p == MAP_FAILED) {
> > fprintf(stderr, "mmap() failed: %s\n", strerror(errno));
> > return -1;
> > }
> > p[0] = 's';
> > fprintf(stdout, "BEFORE STEAL PRIVATE WRITE: %c\n", p[0]);
> > munmap(p, size);
> >
> > flag = MAP_PRIVATE;
> > p = mmap(NULL, size, PROT_READ|PROT_WRITE, flag, fd, 0);
> > if (p == MAP_FAILED) {
> > fprintf(stderr, "mmap() failed: %s\n", strerror(errno));
> > }
> > p[0] = 'c';
> > munmap(p, size);
> >
> > flag = MAP_SHARED;
> > p = mmap(NULL, size, PROT_READ|PROT_WRITE, flag, fd, 0);
> > if (p == MAP_FAILED) {
> > fprintf(stderr, "mmap() failed: %s\n", strerror(errno));
> > return -1;
> > }
> > fprintf(stdout, "AFTER STEAL PRIVATE WRITE: %c\n", p[0]);
> > munmap(p, size);
> >
> > We can see that "AFTER STEAL PRIVATE WRITE: c", not "AFTER STEAL
> > PRIVATE WRITE: s". If we turn off this optimization to a page
> > in page cache, the problem is disappeared.
>
> Do we need to trun of the optimization for page cache completely ?
> Can't we check for MAP_PRIVATE case ?
IMO, we need to turn off the optimization for page cache pages completely.
This optimization only applies to the MAP_PRIVATE case:
if we map with MAP_SHARED, we map a page from the page cache directly,
so we don't need this cow optimization there.
>
> Also, we may want to add the above test into libhugetlbfs
I will submit the above test into libhugetlbfs.
Thanks.
>
> >
> > Signed-off-by: Joonsoo Kim <[email protected]>
> >
> > diff --git a/mm/hugetlb.c b/mm/hugetlb.c
> > index d4a1695..6c1eb9b 100644
> > --- a/mm/hugetlb.c
> > +++ b/mm/hugetlb.c
> > @@ -2512,7 +2512,6 @@ static int hugetlb_cow(struct mm_struct *mm, struct vm_area_struct *vma,
> > {
> > struct hstate *h = hstate_vma(vma);
> > struct page *old_page, *new_page;
> > - int avoidcopy;
> > int outside_reserve = 0;
> > unsigned long mmun_start; /* For mmu_notifiers */
> > unsigned long mmun_end; /* For mmu_notifiers */
> > @@ -2522,10 +2521,8 @@ static int hugetlb_cow(struct mm_struct *mm, struct vm_area_struct *vma,
> > retry_avoidcopy:
> > /* If no-one else is actually using this page, avoid the copy
> > * and just make the page writable */
> > - avoidcopy = (page_mapcount(old_page) == 1);
> > - if (avoidcopy) {
> > - if (PageAnon(old_page))
> > - page_move_anon_rmap(old_page, vma, address);
> > + if (page_mapcount(old_page) == 1 && PageAnon(old_page)) {
> > + page_move_anon_rmap(old_page, vma, address);
> > set_huge_ptep_writable(vma, address, ptep);
> > return 0;
> > }
>
> -aneesh
>
On 07/16/2013 09:45 AM, Joonsoo Kim wrote:
> On Tue, Jul 16, 2013 at 09:27:29AM +0800, Sam Ben wrote:
>> On 07/16/2013 09:10 AM, Joonsoo Kim wrote:
>>> On Mon, Jul 15, 2013 at 07:40:16PM +0530, Aneesh Kumar K.V wrote:
>>>> Joonsoo Kim <[email protected]> writes:
>>>>
>>>>> First 5 patches are almost trivial clean-up patches.
>>>>>
>>>>> The others are for fixing three bugs.
>>>>> Perhaps, these problems are minor, because this codes are used
>>>>> for a long time, and there is no bug reporting for these problems.
>>>>>
>>>>> These patches are based on v3.10.0 and
>>>>> passed sanity check of libhugetlbfs.
>>>> does that mean you had run with libhugetlbfs test suite ?
>>> Yes! I can't find any reggression on libhugetlbfs test suite.
>> Where can get your test case?
> These are my own test cases.
> I will plan to submit these test cases to libhugetlbfs test suite.
Could you point out where I can get the libhugetlbfs test suite? ;-)
>
> Thanks.
>
>>>> -aneesh
>>>>
On Mon, Jul 15, 2013 at 08:41:12PM +0530, Aneesh Kumar K.V wrote:
> Joonsoo Kim <[email protected]> writes:
>
> > If we map the region with MAP_NORESERVE and MAP_SHARED,
> > we skip the reserve counting check, and eventually we cannot be ensured
> > to allocate a huge page at fault time.
> > With the following example code, you can easily find this situation.
> >
> > Assume 2MB, nr_hugepages = 100
> >
> > fd = hugetlbfs_unlinked_fd();
> > if (fd < 0)
> > return 1;
> >
> > size = 200 * MB;
> > flag = MAP_SHARED;
> > p = mmap(NULL, size, PROT_READ|PROT_WRITE, flag, fd, 0);
> > if (p == MAP_FAILED) {
> > fprintf(stderr, "mmap() failed: %s\n", strerror(errno));
> > return -1;
> > }
> >
> > size = 2 * MB;
> > flag = MAP_ANONYMOUS | MAP_SHARED | MAP_HUGETLB | MAP_NORESERVE;
> > p = mmap(NULL, size, PROT_READ|PROT_WRITE, flag, -1, 0);
> > if (p == MAP_FAILED) {
> > fprintf(stderr, "mmap() failed: %s\n", strerror(errno));
> > }
> > p[0] = '0';
> > sleep(10);
> >
> > While sleep(10) is executing, run 'cat /proc/meminfo' in another process.
> > You'll find the mentioned problem.
> >
> > The solution is simple. We should check VM_NORESERVE in vma_has_reserves().
> > This prevents using a pre-allocated huge page if the free count is under
> > the reserve count.
>
> You have a problem with this patch, which i guess you are fixing in
> patch 9. Consider two process
>
> a) MAP_SHARED on fd
> b) MAP_SHARED | MAP_NORESERVE on fd
>
> We should allow the (b) to access the page even if VM_NORESERVE is set
> and we are out of reserve space .
I can't get your point.
Please elaborate more on this.
Thanks.
>
> so may be you should rearrange the patch ?
>
> >
> > Signed-off-by: Joonsoo Kim <[email protected]>
> >
> > diff --git a/mm/hugetlb.c b/mm/hugetlb.c
> > index 6c1eb9b..f6a7a4e 100644
> > --- a/mm/hugetlb.c
> > +++ b/mm/hugetlb.c
> > @@ -464,6 +464,8 @@ void reset_vma_resv_huge_pages(struct vm_area_struct *vma)
> > /* Returns true if the VMA has associated reserve pages */
> > static int vma_has_reserves(struct vm_area_struct *vma)
> > {
> > + if (vma->vm_flags & VM_NORESERVE)
> > + return 0;
> > if (vma->vm_flags & VM_MAYSHARE)
> > return 1;
> > if (is_vma_resv_set(vma, HPAGE_RESV_OWNER))
> > --
> > 1.7.9.5
>
On Tue, Jul 16, 2013 at 09:55:48AM +0800, Sam Ben wrote:
> On 07/16/2013 09:45 AM, Joonsoo Kim wrote:
> >On Tue, Jul 16, 2013 at 09:27:29AM +0800, Sam Ben wrote:
> >>On 07/16/2013 09:10 AM, Joonsoo Kim wrote:
> >>>On Mon, Jul 15, 2013 at 07:40:16PM +0530, Aneesh Kumar K.V wrote:
> >>>>Joonsoo Kim <[email protected]> writes:
> >>>>
> >>>>>First 5 patches are almost trivial clean-up patches.
> >>>>>
> >>>>>The others are for fixing three bugs.
> >>>>>Perhaps, these problems are minor, because this codes are used
> >>>>>for a long time, and there is no bug reporting for these problems.
> >>>>>
> >>>>>These patches are based on v3.10.0 and
> >>>>>passed sanity check of libhugetlbfs.
> >>>>does that mean you had run with libhugetlbfs test suite ?
> >>>Yes! I can't find any regression on the libhugetlbfs test suite.
> >>Where can I get your test case?
> >These are my own test cases.
> >I plan to submit these test cases to the libhugetlbfs test suite.
>
> Could you point out where I can get the libhugetlbfs test suite? ;-)
You can get it here :)
http://sourceforge.net/projects/libhugetlbfs/
Thanks.
>
> >
> >Thanks.
> >
> >>>>-aneesh
> >>>>
Joonsoo Kim <[email protected]> writes:
> On Mon, Jul 15, 2013 at 07:31:33PM +0530, Aneesh Kumar K.V wrote:
>> Joonsoo Kim <[email protected]> writes:
>>
>> > We don't need to proceed with the processing if we don't have any usable
>> > free huge page. So move this code up.
>>
>> I guess you can also mention that since we are holding hugetlb_lock
>> hstate values can't change.
>
> Okay. I will mention this for v2.
>
>>
>>
>> Also.
>>
>> >
>> > Signed-off-by: Joonsoo Kim <[email protected]>
>> >
>> > diff --git a/mm/hugetlb.c b/mm/hugetlb.c
>> > index e2bfbf7..d87f70b 100644
>> > --- a/mm/hugetlb.c
>> > +++ b/mm/hugetlb.c
>> > @@ -539,10 +539,6 @@ static struct page *dequeue_huge_page_vma(struct hstate *h,
>> > struct zoneref *z;
>> > unsigned int cpuset_mems_cookie;
>> >
>> > -retry_cpuset:
>> > - cpuset_mems_cookie = get_mems_allowed();
>> > - zonelist = huge_zonelist(vma, address,
>> > - htlb_alloc_mask, &mpol, &nodemask);
>> > /*
>> > * A child process with MAP_PRIVATE mappings created by their parent
>> > * have no page reserves. This check ensures that reservations are
>> > @@ -550,11 +546,16 @@ retry_cpuset:
>> > */
>> > if (!vma_has_reserves(vma) &&
>> > h->free_huge_pages - h->resv_huge_pages == 0)
>> > - goto err;
>> > + return NULL;
>> >
>>
>> If you don't do the above change, the patch will be much simpler.
>
> The patch will be, but the resulting code will not.
> With this change, we can remove one goto label ('err:') and
> this makes the code more readable.
>
If you feel strongly about the cleanup, you can do another patch for the
cleanups. Don't mix things in a single patch. That makes review difficult.
-aneesh
On Tue, Jul 16, 2013 at 09:06:04AM +0530, Aneesh Kumar K.V wrote:
> Joonsoo Kim <[email protected]> writes:
>
> > On Mon, Jul 15, 2013 at 07:31:33PM +0530, Aneesh Kumar K.V wrote:
> >> Joonsoo Kim <[email protected]> writes:
> >>
> >> > We don't need to proceed with the processing if we don't have any usable
> >> > free huge page. So move this code up.
> >>
> >> I guess you can also mention that since we are holding hugetlb_lock
> >> hstate values can't change.
> >
> > Okay. I will mention this for v2.
> >
> >>
> >>
> >> Also.
> >>
> >> >
> >> > Signed-off-by: Joonsoo Kim <[email protected]>
> >> >
> >> > diff --git a/mm/hugetlb.c b/mm/hugetlb.c
> >> > index e2bfbf7..d87f70b 100644
> >> > --- a/mm/hugetlb.c
> >> > +++ b/mm/hugetlb.c
> >> > @@ -539,10 +539,6 @@ static struct page *dequeue_huge_page_vma(struct hstate *h,
> >> > struct zoneref *z;
> >> > unsigned int cpuset_mems_cookie;
> >> >
> >> > -retry_cpuset:
> >> > - cpuset_mems_cookie = get_mems_allowed();
> >> > - zonelist = huge_zonelist(vma, address,
> >> > - htlb_alloc_mask, &mpol, &nodemask);
> >> > /*
> >> > * A child process with MAP_PRIVATE mappings created by their parent
> >> > * have no page reserves. This check ensures that reservations are
> >> > @@ -550,11 +546,16 @@ retry_cpuset:
> >> > */
> >> > if (!vma_has_reserves(vma) &&
> >> > h->free_huge_pages - h->resv_huge_pages == 0)
> >> > - goto err;
> >> > + return NULL;
> >> >
> >>
> >> If you don't do the above change, the patch will be much simpler.
> >
> > The patch will be, but the resulting code will not.
> > With this change, we can remove one goto label ('err:') and
> > this makes the code more readable.
> >
>
> If you feel strongly about the cleanup, you can do another patch for the
> cleanups. Don't mix things in a single patch. That makes review difficult.
Okay! I will make an another patch for clean-up.
Thanks.
>
> -aneesh
>
Joonsoo Kim <[email protected]> writes:
> On Mon, Jul 15, 2013 at 08:41:12PM +0530, Aneesh Kumar K.V wrote:
>> Joonsoo Kim <[email protected]> writes:
>>
>> > If we map the region with MAP_NORESERVE and MAP_SHARED,
>> > we skip the reserve counting check, and eventually we cannot be ensured
>> > to allocate a huge page at fault time.
>> > With the following example code, you can easily find this situation.
>> >
>> > Assume 2MB, nr_hugepages = 100
>> >
>> > fd = hugetlbfs_unlinked_fd();
>> > if (fd < 0)
>> > return 1;
>> >
>> > size = 200 * MB;
>> > flag = MAP_SHARED;
>> > p = mmap(NULL, size, PROT_READ|PROT_WRITE, flag, fd, 0);
>> > if (p == MAP_FAILED) {
>> > fprintf(stderr, "mmap() failed: %s\n", strerror(errno));
>> > return -1;
>> > }
>> >
>> > size = 2 * MB;
>> > flag = MAP_ANONYMOUS | MAP_SHARED | MAP_HUGETLB | MAP_NORESERVE;
>> > p = mmap(NULL, size, PROT_READ|PROT_WRITE, flag, -1, 0);
>> > if (p == MAP_FAILED) {
>> > fprintf(stderr, "mmap() failed: %s\n", strerror(errno));
>> > }
>> > p[0] = '0';
>> > sleep(10);
>> >
>> > While sleep(10) is executing, run 'cat /proc/meminfo' in another process.
>> > You'll find the mentioned problem.
>> >
>> > The solution is simple. We should check VM_NORESERVE in vma_has_reserves().
>> > This prevents using a pre-allocated huge page if the free count is under
>> > the reserve count.
>>
>> You have a problem with this patch, which i guess you are fixing in
>> patch 9. Consider two process
>>
>> a) MAP_SHARED on fd
>> b) MAP_SHARED | MAP_NORESERVE on fd
>>
>> We should allow the (b) to access the page even if VM_NORESERVE is set
>> and we are out of reserve space .
>
> I can't get your point.
> Please elaborate more on this.
One process mmaps with MAP_SHARED and another one with MAP_SHARED | MAP_NORESERVE.
Now the first process will result in reserving the pages from the hugetlb
pool. Now if the second process tries to dequeue a huge page and we don't
have free space, we will fail, because
vma_has_reserves() will now return zero because VM_NORESERVE is set
and we can have (h->free_huge_pages - h->resv_huge_pages) == 0;
The below hunk in your patch 9 handles that
+ if (vma->vm_flags & VM_NORESERVE) {
+ /*
+ * This address is already reserved by other process(chg == 0),
+ * so, we should decreament reserved count. Without
+ * decreamenting, reserve count is remained after releasing
+ * inode, because this allocated page will go into page cache
+ * and is regarded as coming from reserved pool in releasing
+ * step. Currently, we don't have any other solution to deal
+ * with this situation properly, so add work-around here.
+ */
+ if (vma->vm_flags & VM_MAYSHARE && chg == 0)
+ return 1;
+ else
+ return 0;
+ }
so maybe both of these should be folded?
-aneesh
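For reference, below is a minimal, self-contained sketch of the two-mapping scenario
described above. It is illustrative only: the 2MB size and the hugetlbfs_unlinked_fd()
helper are borrowed from the test snippets earlier in this thread, it assumes a 2MB
default huge page size, and it must be linked against libhugetlbfs.

	/*
	 * Sketch of the scenario: the parent maps the hugetlbfs file with
	 * MAP_SHARED, which reserves the huge page; the child maps the same
	 * file with MAP_SHARED | MAP_NORESERVE and then faults a page in.
	 */
	#include <stdio.h>
	#include <stdlib.h>
	#include <string.h>
	#include <errno.h>
	#include <unistd.h>
	#include <sys/mman.h>
	#include <sys/wait.h>
	#include <hugetlbfs.h>

	#define MB (1024UL * 1024UL)

	int main(void)
	{
		size_t size = 2 * MB;
		int fd = hugetlbfs_unlinked_fd();
		char *p;

		if (fd < 0)
			return 1;

		/* (a) MAP_SHARED: this mapping reserves the huge page. */
		p = mmap(NULL, size, PROT_READ | PROT_WRITE, MAP_SHARED, fd, 0);
		if (p == MAP_FAILED) {
			fprintf(stderr, "mmap() failed: %s\n", strerror(errno));
			return 1;
		}

		if (fork() == 0) {
			/* (b) MAP_SHARED | MAP_NORESERVE on the same fd. */
			char *q = mmap(NULL, size, PROT_READ | PROT_WRITE,
				       MAP_SHARED | MAP_NORESERVE, fd, 0);
			if (q == MAP_FAILED) {
				fprintf(stderr, "mmap() failed: %s\n", strerror(errno));
				exit(1);
			}
			/* Fault a page; (b) wants to use the reservation made by (a). */
			q[0] = 'b';
			exit(0);
		}
		wait(NULL);

		munmap(p, size);
		return 0;
	}

With the patch 7 check alone, the fault in case (b) can no longer dequeue from the
reserved pool once h->free_huge_pages - h->resv_huge_pages reaches 0; the
VM_MAYSHARE && chg == 0 case added in patch 9 is what lets (b) still consume the
reservation made by (a).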
On Tue, Jul 16, 2013 at 11:17:23AM +0530, Aneesh Kumar K.V wrote:
> Joonsoo Kim <[email protected]> writes:
>
> > On Mon, Jul 15, 2013 at 08:41:12PM +0530, Aneesh Kumar K.V wrote:
> >> Joonsoo Kim <[email protected]> writes:
> >>
> >> > If we map the region with MAP_NORESERVE and MAP_SHARED,
> >> > we skip the reserve counting check, and eventually we cannot be ensured
> >> > to allocate a huge page at fault time.
> >> > With the following example code, you can easily find this situation.
> >> >
> >> > Assume 2MB, nr_hugepages = 100
> >> >
> >> > fd = hugetlbfs_unlinked_fd();
> >> > if (fd < 0)
> >> > return 1;
> >> >
> >> > size = 200 * MB;
> >> > flag = MAP_SHARED;
> >> > p = mmap(NULL, size, PROT_READ|PROT_WRITE, flag, fd, 0);
> >> > if (p == MAP_FAILED) {
> >> > fprintf(stderr, "mmap() failed: %s\n", strerror(errno));
> >> > return -1;
> >> > }
> >> >
> >> > size = 2 * MB;
> >> > flag = MAP_ANONYMOUS | MAP_SHARED | MAP_HUGETLB | MAP_NORESERVE;
> >> > p = mmap(NULL, size, PROT_READ|PROT_WRITE, flag, -1, 0);
> >> > if (p == MAP_FAILED) {
> >> > fprintf(stderr, "mmap() failed: %s\n", strerror(errno));
> >> > }
> >> > p[0] = '0';
> >> > sleep(10);
> >> >
> >> > While sleep(10) is executing, run 'cat /proc/meminfo' in another process.
> >> > You'll find the mentioned problem.
> >> >
> >> > The solution is simple. We should check VM_NORESERVE in vma_has_reserves().
> >> > This prevents using a pre-allocated huge page if the free count is under
> >> > the reserve count.
> >>
> >> You have a problem with this patch, which i guess you are fixing in
> >> patch 9. Consider two process
> >>
> >> a) MAP_SHARED on fd
> >> b) MAP_SHARED | MAP_NORESERVE on fd
> >>
> >> We should allow the (b) to access the page even if VM_NORESERVE is set
> >> and we are out of reserve space .
> >
> > I can't get your point.
> > Please elaborate more on this.
>
>
> One process mmaps with MAP_SHARED and another one with MAP_SHARED | MAP_NORESERVE.
> Now the first process will result in reserving the pages from the hugetlb
> pool. Now if the second process tries to dequeue a huge page and we don't
> have free space, we will fail, because
>
> vma_has_reserves() will now return zero because VM_NORESERVE is set
> and we can have (h->free_huge_pages - h->resv_huge_pages) == 0;
I think that this behavior is correct, because a user who mapped with
VM_NORESERVE should not expect their allocation to always succeed. With patch 9,
they can be ensured to succeed, but I think that is a side-effect.
> The below hunk in your patch 9 handles that
>
> + if (vma->vm_flags & VM_NORESERVE) {
> + /*
> + * This address is already reserved by other process(chg == 0),
> + * so, we should decreament reserved count. Without
> + * decreamenting, reserve count is remained after releasing
> + * inode, because this allocated page will go into page cache
> + * and is regarded as coming from reserved pool in releasing
> + * step. Currently, we don't have any other solution to deal
> + * with this situation properly, so add work-around here.
> + */
> + if (vma->vm_flags & VM_MAYSHARE && chg == 0)
> + return 1;
> + else
> + return 0;
> + }
>
> so maybe both of these should be folded?
I think that these patches should not be folded, because they handle
two separate issues. The reserve count mismatch issue mentioned in patch 9
is not introduced by patch 7.
Thanks.
>
> -aneesh
>
On 2013/7/15 17:52, Joonsoo Kim wrote:
> The current node iteration code has a minor problem: it does one more
> node rotation if we can't succeed in allocating. For example,
> if we start to allocate at node 0, we stop iterating at node 0.
> Then we start to allocate at node 1 for the next allocation.
>
> I introduce new macros "for_each_node_mask_to_[alloc|free]" and
> fix and clean up the node iteration code to alloc or free.
> This makes the code more understandable.
>
> Signed-off-by: Joonsoo Kim <[email protected]>
>
> diff --git a/mm/hugetlb.c b/mm/hugetlb.c
> index 0067cf4..a838e6b 100644
> --- a/mm/hugetlb.c
> +++ b/mm/hugetlb.c
> @@ -752,33 +752,6 @@ static int hstate_next_node_to_alloc(struct hstate *h,
> return nid;
> }
>
> -static int alloc_fresh_huge_page(struct hstate *h, nodemask_t *nodes_allowed)
> -{
> - struct page *page;
> - int start_nid;
> - int next_nid;
> - int ret = 0;
> -
> - start_nid = hstate_next_node_to_alloc(h, nodes_allowed);
> - next_nid = start_nid;
> -
> - do {
> - page = alloc_fresh_huge_page_node(h, next_nid);
> - if (page) {
> - ret = 1;
> - break;
> - }
> - next_nid = hstate_next_node_to_alloc(h, nodes_allowed);
> - } while (next_nid != start_nid);
> -
> - if (ret)
> - count_vm_event(HTLB_BUDDY_PGALLOC);
> - else
> - count_vm_event(HTLB_BUDDY_PGALLOC_FAIL);
> -
> - return ret;
> -}
> -
> /*
> * helper for free_pool_huge_page() - return the previously saved
> * node ["this node"] from which to free a huge page. Advance the
> @@ -797,6 +770,42 @@ static int hstate_next_node_to_free(struct hstate *h, nodemask_t *nodes_allowed)
> return nid;
> }
>
> +#define for_each_node_mask_to_alloc(hs, nr_nodes, node, mask) \
> + for (nr_nodes = nodes_weight(*mask), \
> + node = hstate_next_node_to_alloc(hs, mask); \
> + nr_nodes > 0 && \
> + ((node = hstate_next_node_to_alloc(hs, mask)) || 1); \
Hi Joonsoo,
For the first loop, node = hstate_next_node_to_alloc() will be executed two times,
so it will skip the first node in the mask, right?
Thanks,
Jianguo Wu.
> + nr_nodes--)
> +
> +#define for_each_node_mask_to_free(hs, nr_nodes, node, mask) \
> + for (nr_nodes = nodes_weight(*mask), \
> + node = hstate_next_node_to_free(hs, mask); \
> + nr_nodes > 0 && \
> + ((node = hstate_next_node_to_free(hs, mask)) || 1); \
> + nr_nodes--)
> +
> +static int alloc_fresh_huge_page(struct hstate *h, nodemask_t *nodes_allowed)
> +{
> + struct page *page;
> + int nr_nodes, node;
> + int ret = 0;
> +
> + for_each_node_mask_to_alloc(h, nr_nodes, node, nodes_allowed) {
> + page = alloc_fresh_huge_page_node(h, node);
> + if (page) {
> + ret = 1;
> + break;
> + }
> + }
> +
> + if (ret)
> + count_vm_event(HTLB_BUDDY_PGALLOC);
> + else
> + count_vm_event(HTLB_BUDDY_PGALLOC_FAIL);
> +
> + return ret;
> +}
> +
> /*
> * Free huge page from pool from next node to free.
> * Attempt to keep persistent huge pages more or less
> @@ -806,36 +815,31 @@ static int hstate_next_node_to_free(struct hstate *h, nodemask_t *nodes_allowed)
> static int free_pool_huge_page(struct hstate *h, nodemask_t *nodes_allowed,
> bool acct_surplus)
> {
> - int start_nid;
> - int next_nid;
> + int nr_nodes, node;
> int ret = 0;
>
> - start_nid = hstate_next_node_to_free(h, nodes_allowed);
> - next_nid = start_nid;
> -
> - do {
> + for_each_node_mask_to_free(h, nr_nodes, node, nodes_allowed) {
> /*
> * If we're returning unused surplus pages, only examine
> * nodes with surplus pages.
> */
> - if ((!acct_surplus || h->surplus_huge_pages_node[next_nid]) &&
> - !list_empty(&h->hugepage_freelists[next_nid])) {
> + if ((!acct_surplus || h->surplus_huge_pages_node[node]) &&
> + !list_empty(&h->hugepage_freelists[node])) {
> struct page *page =
> - list_entry(h->hugepage_freelists[next_nid].next,
> + list_entry(h->hugepage_freelists[node].next,
> struct page, lru);
> list_del(&page->lru);
> h->free_huge_pages--;
> - h->free_huge_pages_node[next_nid]--;
> + h->free_huge_pages_node[node]--;
> if (acct_surplus) {
> h->surplus_huge_pages--;
> - h->surplus_huge_pages_node[next_nid]--;
> + h->surplus_huge_pages_node[node]--;
> }
> update_and_free_page(h, page);
> ret = 1;
> break;
> }
> - next_nid = hstate_next_node_to_free(h, nodes_allowed);
> - } while (next_nid != start_nid);
> + }
>
> return ret;
> }
> @@ -1173,14 +1177,12 @@ static struct page *alloc_huge_page(struct vm_area_struct *vma,
> int __weak alloc_bootmem_huge_page(struct hstate *h)
> {
> struct huge_bootmem_page *m;
> - int nr_nodes = nodes_weight(node_states[N_MEMORY]);
> + int nr_nodes, node;
>
> - while (nr_nodes) {
> + for_each_node_mask_to_alloc(h, nr_nodes, node, &node_states[N_MEMORY]) {
> void *addr;
>
> - addr = __alloc_bootmem_node_nopanic(
> - NODE_DATA(hstate_next_node_to_alloc(h,
> - &node_states[N_MEMORY])),
> + addr = __alloc_bootmem_node_nopanic(NODE_DATA(node),
> huge_page_size(h), huge_page_size(h), 0);
>
> if (addr) {
> @@ -1192,7 +1194,6 @@ int __weak alloc_bootmem_huge_page(struct hstate *h)
> m = addr;
> goto found;
> }
> - nr_nodes--;
> }
> return 0;
>
> @@ -1331,48 +1332,28 @@ static inline void try_to_free_low(struct hstate *h, unsigned long count,
> static int adjust_pool_surplus(struct hstate *h, nodemask_t *nodes_allowed,
> int delta)
> {
> - int start_nid, next_nid;
> - int ret = 0;
> + int nr_nodes, node;
>
> VM_BUG_ON(delta != -1 && delta != 1);
>
> - if (delta < 0)
> - start_nid = hstate_next_node_to_alloc(h, nodes_allowed);
> - else
> - start_nid = hstate_next_node_to_free(h, nodes_allowed);
> - next_nid = start_nid;
> -
> - do {
> - int nid = next_nid;
> - if (delta < 0) {
> - /*
> - * To shrink on this node, there must be a surplus page
> - */
> - if (!h->surplus_huge_pages_node[nid]) {
> - next_nid = hstate_next_node_to_alloc(h,
> - nodes_allowed);
> - continue;
> - }
> + if (delta < 0) {
> + for_each_node_mask_to_alloc(h, nr_nodes, node, nodes_allowed) {
> + if (h->surplus_huge_pages_node[node])
> + goto found;
> }
> - if (delta > 0) {
> - /*
> - * Surplus cannot exceed the total number of pages
> - */
> - if (h->surplus_huge_pages_node[nid] >=
> - h->nr_huge_pages_node[nid]) {
> - next_nid = hstate_next_node_to_free(h,
> - nodes_allowed);
> - continue;
> - }
> + } else {
> + for_each_node_mask_to_free(h, nr_nodes, node, nodes_allowed) {
> + if (h->surplus_huge_pages_node[node] <
> + h->nr_huge_pages_node[node])
> + goto found;
> }
> + }
> + return 0;
>
> - h->surplus_huge_pages += delta;
> - h->surplus_huge_pages_node[nid] += delta;
> - ret = 1;
> - break;
> - } while (next_nid != start_nid);
> -
> - return ret;
> +found:
> + h->surplus_huge_pages += delta;
> + h->surplus_huge_pages_node[node] += delta;
> + return 1;
> }
>
> #define persistent_huge_pages(h) (h->nr_huge_pages - h->surplus_huge_pages)
On Wed, Jul 17, 2013 at 10:00:48AM +0800, Jianguo Wu wrote:
> On 2013/7/15 17:52, Joonsoo Kim wrote:
>
> > The current node iteration code has a minor problem: it does one more
> > node rotation if we can't succeed in allocating. For example,
> > if we start to allocate at node 0, we stop iterating at node 0.
> > Then we start to allocate at node 1 for the next allocation.
> >
> > I introduce new macros "for_each_node_mask_to_[alloc|free]" and
> > fix and clean up the node iteration code to alloc or free.
> > This makes the code more understandable.
> >
> > Signed-off-by: Joonsoo Kim <[email protected]>
> >
> > diff --git a/mm/hugetlb.c b/mm/hugetlb.c
> > index 0067cf4..a838e6b 100644
> > --- a/mm/hugetlb.c
> > +++ b/mm/hugetlb.c
> > @@ -752,33 +752,6 @@ static int hstate_next_node_to_alloc(struct hstate *h,
> > return nid;
> > }
> >
> > -static int alloc_fresh_huge_page(struct hstate *h, nodemask_t *nodes_allowed)
> > -{
> > - struct page *page;
> > - int start_nid;
> > - int next_nid;
> > - int ret = 0;
> > -
> > - start_nid = hstate_next_node_to_alloc(h, nodes_allowed);
> > - next_nid = start_nid;
> > -
> > - do {
> > - page = alloc_fresh_huge_page_node(h, next_nid);
> > - if (page) {
> > - ret = 1;
> > - break;
> > - }
> > - next_nid = hstate_next_node_to_alloc(h, nodes_allowed);
> > - } while (next_nid != start_nid);
> > -
> > - if (ret)
> > - count_vm_event(HTLB_BUDDY_PGALLOC);
> > - else
> > - count_vm_event(HTLB_BUDDY_PGALLOC_FAIL);
> > -
> > - return ret;
> > -}
> > -
> > /*
> > * helper for free_pool_huge_page() - return the previously saved
> > * node ["this node"] from which to free a huge page. Advance the
> > @@ -797,6 +770,42 @@ static int hstate_next_node_to_free(struct hstate *h, nodemask_t *nodes_allowed)
> > return nid;
> > }
> >
> > +#define for_each_node_mask_to_alloc(hs, nr_nodes, node, mask) \
> > + for (nr_nodes = nodes_weight(*mask), \
> > + node = hstate_next_node_to_alloc(hs, mask); \
> > + nr_nodes > 0 && \
> > + ((node = hstate_next_node_to_alloc(hs, mask)) || 1); \
>
> Hi Joonsoo,
Hello, Jianguo.
>
> For the first loop, node = hstate_next_node_to_alloc() will be executed two times,
> so it will skip the first node in the mask, right?
Yes, it's my fault.
I will fix it in v2.
Thanks.
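For what it's worth, one possible way to fix this (a sketch only, not necessarily the
final v2) is to drop the extra call from the for-loop initializer, so that
hstate_next_node_to_alloc()/hstate_next_node_to_free() is evaluated exactly once per
iteration and the first node is no longer skipped:

	/*
	 * Sketch of a fixed iterator: only compute the node count in the
	 * initializer and advance the node inside the loop condition.
	 */
	#define for_each_node_mask_to_alloc(hs, nr_nodes, node, mask)		\
		for (nr_nodes = nodes_weight(*mask);				\
			nr_nodes > 0 &&						\
			((node = hstate_next_node_to_alloc(hs, mask)) || 1);	\
			nr_nodes--)

	#define for_each_node_mask_to_free(hs, nr_nodes, node, mask)		\
		for (nr_nodes = nodes_weight(*mask);				\
			nr_nodes > 0 &&						\
			((node = hstate_next_node_to_free(hs, mask)) || 1);	\
			nr_nodes--)

Only the for-loop initializer changes relative to the macros in the patch above; the
callers would not need to change.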
>
> Thanks,
> Jianguo Wu.
>
> > + nr_nodes--)
> > +
> > +#define for_each_node_mask_to_free(hs, nr_nodes, node, mask) \
> > + for (nr_nodes = nodes_weight(*mask), \
> > + node = hstate_next_node_to_free(hs, mask); \
> > + nr_nodes > 0 && \
> > + ((node = hstate_next_node_to_free(hs, mask)) || 1); \
> > + nr_nodes--)
> > +
> > +static int alloc_fresh_huge_page(struct hstate *h, nodemask_t *nodes_allowed)
> > +{
> > + struct page *page;
> > + int nr_nodes, node;
> > + int ret = 0;
> > +
> > + for_each_node_mask_to_alloc(h, nr_nodes, node, nodes_allowed) {
> > + page = alloc_fresh_huge_page_node(h, node);
> > + if (page) {
> > + ret = 1;
> > + break;
> > + }
> > + }
> > +
> > + if (ret)
> > + count_vm_event(HTLB_BUDDY_PGALLOC);
> > + else
> > + count_vm_event(HTLB_BUDDY_PGALLOC_FAIL);
> > +
> > + return ret;
> > +}
> > +
> > /*
> > * Free huge page from pool from next node to free.
> > * Attempt to keep persistent huge pages more or less
> > @@ -806,36 +815,31 @@ static int hstate_next_node_to_free(struct hstate *h, nodemask_t *nodes_allowed)
> > static int free_pool_huge_page(struct hstate *h, nodemask_t *nodes_allowed,
> > bool acct_surplus)
> > {
> > - int start_nid;
> > - int next_nid;
> > + int nr_nodes, node;
> > int ret = 0;
> >
> > - start_nid = hstate_next_node_to_free(h, nodes_allowed);
> > - next_nid = start_nid;
> > -
> > - do {
> > + for_each_node_mask_to_free(h, nr_nodes, node, nodes_allowed) {
> > /*
> > * If we're returning unused surplus pages, only examine
> > * nodes with surplus pages.
> > */
> > - if ((!acct_surplus || h->surplus_huge_pages_node[next_nid]) &&
> > - !list_empty(&h->hugepage_freelists[next_nid])) {
> > + if ((!acct_surplus || h->surplus_huge_pages_node[node]) &&
> > + !list_empty(&h->hugepage_freelists[node])) {
> > struct page *page =
> > - list_entry(h->hugepage_freelists[next_nid].next,
> > + list_entry(h->hugepage_freelists[node].next,
> > struct page, lru);
> > list_del(&page->lru);
> > h->free_huge_pages--;
> > - h->free_huge_pages_node[next_nid]--;
> > + h->free_huge_pages_node[node]--;
> > if (acct_surplus) {
> > h->surplus_huge_pages--;
> > - h->surplus_huge_pages_node[next_nid]--;
> > + h->surplus_huge_pages_node[node]--;
> > }
> > update_and_free_page(h, page);
> > ret = 1;
> > break;
> > }
> > - next_nid = hstate_next_node_to_free(h, nodes_allowed);
> > - } while (next_nid != start_nid);
> > + }
> >
> > return ret;
> > }
> > @@ -1173,14 +1177,12 @@ static struct page *alloc_huge_page(struct vm_area_struct *vma,
> > int __weak alloc_bootmem_huge_page(struct hstate *h)
> > {
> > struct huge_bootmem_page *m;
> > - int nr_nodes = nodes_weight(node_states[N_MEMORY]);
> > + int nr_nodes, node;
> >
> > - while (nr_nodes) {
> > + for_each_node_mask_to_alloc(h, nr_nodes, node, &node_states[N_MEMORY]) {
> > void *addr;
> >
> > - addr = __alloc_bootmem_node_nopanic(
> > - NODE_DATA(hstate_next_node_to_alloc(h,
> > - &node_states[N_MEMORY])),
> > + addr = __alloc_bootmem_node_nopanic(NODE_DATA(node),
> > huge_page_size(h), huge_page_size(h), 0);
> >
> > if (addr) {
> > @@ -1192,7 +1194,6 @@ int __weak alloc_bootmem_huge_page(struct hstate *h)
> > m = addr;
> > goto found;
> > }
> > - nr_nodes--;
> > }
> > return 0;
> >
> > @@ -1331,48 +1332,28 @@ static inline void try_to_free_low(struct hstate *h, unsigned long count,
> > static int adjust_pool_surplus(struct hstate *h, nodemask_t *nodes_allowed,
> > int delta)
> > {
> > - int start_nid, next_nid;
> > - int ret = 0;
> > + int nr_nodes, node;
> >
> > VM_BUG_ON(delta != -1 && delta != 1);
> >
> > - if (delta < 0)
> > - start_nid = hstate_next_node_to_alloc(h, nodes_allowed);
> > - else
> > - start_nid = hstate_next_node_to_free(h, nodes_allowed);
> > - next_nid = start_nid;
> > -
> > - do {
> > - int nid = next_nid;
> > - if (delta < 0) {
> > - /*
> > - * To shrink on this node, there must be a surplus page
> > - */
> > - if (!h->surplus_huge_pages_node[nid]) {
> > - next_nid = hstate_next_node_to_alloc(h,
> > - nodes_allowed);
> > - continue;
> > - }
> > + if (delta < 0) {
> > + for_each_node_mask_to_alloc(h, nr_nodes, node, nodes_allowed) {
> > + if (h->surplus_huge_pages_node[node])
> > + goto found;
> > }
> > - if (delta > 0) {
> > - /*
> > - * Surplus cannot exceed the total number of pages
> > - */
> > - if (h->surplus_huge_pages_node[nid] >=
> > - h->nr_huge_pages_node[nid]) {
> > - next_nid = hstate_next_node_to_free(h,
> > - nodes_allowed);
> > - continue;
> > - }
> > + } else {
> > + for_each_node_mask_to_free(h, nr_nodes, node, nodes_allowed) {
> > + if (h->surplus_huge_pages_node[node] <
> > + h->nr_huge_pages_node[node])
> > + goto found;
> > }
> > + }
> > + return 0;
> >
> > - h->surplus_huge_pages += delta;
> > - h->surplus_huge_pages_node[nid] += delta;
> > - ret = 1;
> > - break;
> > - } while (next_nid != start_nid);
> > -
> > - return ret;
> > +found:
> > + h->surplus_huge_pages += delta;
> > + h->surplus_huge_pages_node[node] += delta;
> > + return 1;
> > }
> >
> > #define persistent_huge_pages(h) (h->nr_huge_pages - h->surplus_huge_pages)
>
>
>
On Mon 15-07-13 18:52:39, Joonsoo Kim wrote:
> We don't need to proceed with the processing if we don't have any usable
> free huge page. So move this code up.
>
> Signed-off-by: Joonsoo Kim <[email protected]>
Acked-by: Michal Hocko <[email protected]>
after you add a note about hugetlb_lock which stabilizes hstate so the
retry doesn't have to re-check reserves and other stuff as suggested by
Aneesh.
> diff --git a/mm/hugetlb.c b/mm/hugetlb.c
> index e2bfbf7..d87f70b 100644
> --- a/mm/hugetlb.c
> +++ b/mm/hugetlb.c
> @@ -539,10 +539,6 @@ static struct page *dequeue_huge_page_vma(struct hstate *h,
> struct zoneref *z;
> unsigned int cpuset_mems_cookie;
>
> -retry_cpuset:
> - cpuset_mems_cookie = get_mems_allowed();
> - zonelist = huge_zonelist(vma, address,
> - htlb_alloc_mask, &mpol, &nodemask);
> /*
> * A child process with MAP_PRIVATE mappings created by their parent
> * have no page reserves. This check ensures that reservations are
> @@ -550,11 +546,16 @@ retry_cpuset:
> */
> if (!vma_has_reserves(vma) &&
> h->free_huge_pages - h->resv_huge_pages == 0)
> - goto err;
> + return NULL;
>
> /* If reserves cannot be used, ensure enough pages are in the pool */
> if (avoid_reserve && h->free_huge_pages - h->resv_huge_pages == 0)
> - goto err;
> + return NULL;
> +
> +retry_cpuset:
> + cpuset_mems_cookie = get_mems_allowed();
> + zonelist = huge_zonelist(vma, address,
> + htlb_alloc_mask, &mpol, &nodemask);
>
> for_each_zone_zonelist_nodemask(zone, z, zonelist,
> MAX_NR_ZONES - 1, nodemask) {
> @@ -572,10 +573,6 @@ retry_cpuset:
> if (unlikely(!put_mems_allowed(cpuset_mems_cookie) && !page))
> goto retry_cpuset;
> return page;
> -
> -err:
> - mpol_cond_put(mpol);
> - return NULL;
> }
>
> static void update_and_free_page(struct hstate *h, struct page *page)
> --
> 1.7.9.5
>
--
Michal Hocko
SUSE Labs
On Mon 15-07-13 18:52:40, Joonsoo Kim wrote:
> The name of the mutex written in the comment is wrong.
> Fix it.
>
> Signed-off-by: Joonsoo Kim <[email protected]>
Acked-by: Michal Hocko <[email protected]>
>
> diff --git a/mm/hugetlb.c b/mm/hugetlb.c
> index d87f70b..d21a33a 100644
> --- a/mm/hugetlb.c
> +++ b/mm/hugetlb.c
> @@ -135,9 +135,9 @@ static inline struct hugepage_subpool *subpool_vma(struct vm_area_struct *vma)
> * across the pages in a mapping.
> *
> * The region data structures are protected by a combination of the mmap_sem
> - * and the hugetlb_instantion_mutex. To access or modify a region the caller
> + * and the hugetlb_instantiation_mutex. To access or modify a region the caller
> * must either hold the mmap_sem for write, or the mmap_sem for read and
> - * the hugetlb_instantiation mutex:
> + * the hugetlb_instantiation_mutex:
> *
> * down_write(&mm->mmap_sem);
> * or
> --
> 1.7.9.5
>
--
Michal Hocko
SUSE Labs
On Mon 15-07-13 18:52:41, Joonsoo Kim wrote:
> We can unify some codes for succeed allocation.
> This makes code more readable.
> There is no functional difference.
"This patch unifies successful allocation paths to make the code more
readable. There are no functional changes."
Better?
> Signed-off-by: Joonsoo Kim <[email protected]>
Acked-by: Michal Hocko <[email protected]>
>
> diff --git a/mm/hugetlb.c b/mm/hugetlb.c
> index d21a33a..0067cf4 100644
> --- a/mm/hugetlb.c
> +++ b/mm/hugetlb.c
> @@ -1144,29 +1144,25 @@ static struct page *alloc_huge_page(struct vm_area_struct *vma,
> hugepage_subpool_put_pages(spool, chg);
> return ERR_PTR(-ENOSPC);
> }
> +
> spin_lock(&hugetlb_lock);
> page = dequeue_huge_page_vma(h, vma, addr, avoid_reserve);
> - if (page) {
> - /* update page cgroup details */
> - hugetlb_cgroup_commit_charge(idx, pages_per_huge_page(h),
> - h_cg, page);
> - spin_unlock(&hugetlb_lock);
> - } else {
> + if (!page) {
> spin_unlock(&hugetlb_lock);
> page = alloc_buddy_huge_page(h, NUMA_NO_NODE);
> if (!page) {
> hugetlb_cgroup_uncharge_cgroup(idx,
> - pages_per_huge_page(h),
> - h_cg);
> + pages_per_huge_page(h), h_cg);
> hugepage_subpool_put_pages(spool, chg);
> return ERR_PTR(-ENOSPC);
> }
> +
> spin_lock(&hugetlb_lock);
> - hugetlb_cgroup_commit_charge(idx, pages_per_huge_page(h),
> - h_cg, page);
> list_move(&page->lru, &h->hugepage_activelist);
> - spin_unlock(&hugetlb_lock);
> + /* Fall through */
> }
> + hugetlb_cgroup_commit_charge(idx, pages_per_huge_page(h), h_cg, page);
> + spin_unlock(&hugetlb_lock);
>
> set_page_private(page, (unsigned long)spool);
>
> --
> 1.7.9.5
>
--
Michal Hocko
SUSE Labs
On Mon 15-07-13 18:52:43, Joonsoo Kim wrote:
> If the list is empty, list_for_each_entry_safe() doesn't do anything.
> So, this check is redundant. Remove it.
>
> Signed-off-by: Joonsoo Kim <[email protected]>
Acked-by: Michal Hocko <[email protected]>
>
> diff --git a/mm/hugetlb.c b/mm/hugetlb.c
> index a838e6b..d4a1695 100644
> --- a/mm/hugetlb.c
> +++ b/mm/hugetlb.c
> @@ -1019,10 +1019,8 @@ free:
> spin_unlock(&hugetlb_lock);
>
> /* Free unnecessary surplus pages to the buddy allocator */
> - if (!list_empty(&surplus_list)) {
> - list_for_each_entry_safe(page, tmp, &surplus_list, lru) {
> - put_page(page);
> - }
> + list_for_each_entry_safe(page, tmp, &surplus_list, lru) {
> + put_page(page);
> }
> spin_lock(&hugetlb_lock);
>
> --
> 1.7.9.5
>
--
Michal Hocko
SUSE Labs
On Mon, Jul 22, 2013 at 04:51:50PM +0200, Michal Hocko wrote:
> On Mon 15-07-13 18:52:41, Joonsoo Kim wrote:
> > We can unify some codes for succeed allocation.
> > This makes code more readable.
> > There is no functional difference.
>
> "This patch unifies successful allocation paths to make the code more
> readable. There are no functional changes."
>
> Better?
Better :)
Thanks.
>
> > Signed-off-by: Joonsoo Kim <[email protected]>
>
> Acked-by: Michal Hocko <[email protected]>
>
> >
> > diff --git a/mm/hugetlb.c b/mm/hugetlb.c
> > index d21a33a..0067cf4 100644
> > --- a/mm/hugetlb.c
> > +++ b/mm/hugetlb.c
> > @@ -1144,29 +1144,25 @@ static struct page *alloc_huge_page(struct vm_area_struct *vma,
> > hugepage_subpool_put_pages(spool, chg);
> > return ERR_PTR(-ENOSPC);
> > }
> > +
> > spin_lock(&hugetlb_lock);
> > page = dequeue_huge_page_vma(h, vma, addr, avoid_reserve);
> > - if (page) {
> > - /* update page cgroup details */
> > - hugetlb_cgroup_commit_charge(idx, pages_per_huge_page(h),
> > - h_cg, page);
> > - spin_unlock(&hugetlb_lock);
> > - } else {
> > + if (!page) {
> > spin_unlock(&hugetlb_lock);
> > page = alloc_buddy_huge_page(h, NUMA_NO_NODE);
> > if (!page) {
> > hugetlb_cgroup_uncharge_cgroup(idx,
> > - pages_per_huge_page(h),
> > - h_cg);
> > + pages_per_huge_page(h), h_cg);
> > hugepage_subpool_put_pages(spool, chg);
> > return ERR_PTR(-ENOSPC);
> > }
> > +
> > spin_lock(&hugetlb_lock);
> > - hugetlb_cgroup_commit_charge(idx, pages_per_huge_page(h),
> > - h_cg, page);
> > list_move(&page->lru, &h->hugepage_activelist);
> > - spin_unlock(&hugetlb_lock);
> > + /* Fall through */
> > }
> > + hugetlb_cgroup_commit_charge(idx, pages_per_huge_page(h), h_cg, page);
> > + spin_unlock(&hugetlb_lock);
> >
> > set_page_private(page, (unsigned long)spool);
> >
> > --
> > 1.7.9.5
> >
>
> --
> Michal Hocko
> SUSE Labs
>