Message-Id: <20081106153135.790621895@redhat.com>
References: <20081106153022.215696930@redhat.com>
User-Agent: quilt/0.46-1
Date: Thu, 06 Nov 2008 10:30:24 -0500
From: vgoyal@redhat.com
To: linux-kernel@vger.kernel.org, containers@lists.linux-foundation.org,
	virtualization@lists.linux-foundation.org, jens.axboe@oracle.com,
	Hirokazu Takahashi, Ryo Tsuruta, Andrea Righi, Satoshi UCHIDA
Cc: fernando@oss.ntt.co.jp, balbir@linux.vnet.ibm.com, Andrew Morton,
	menage@google.com, ngupta@google.com, Rik van Riel, Jeff Moyer,
	Peter Zijlstra
Subject: [patch 2/4] io controller: biocgroup implementation
Content-Disposition: inline; filename=bio-cgroup-implementation

o biocgroup functionality.

o Implemented new controller "bio".

o Most of it picked from dm-ioband biocgroup implementation patches.

Signed-off-by: Vivek Goyal
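For illustration, here is a minimal userspace sketch of how the controller
could be configured once this patch is applied. The bio.shares file
corresponds to the "shares" cftype defined below (default 1024); the mount
point /cgroup/bio and the group name grp1 are hypothetical, not part of
this patch:

	/* set_bio_shares.c - hypothetical usage sketch, assumes:
	 *   mount -t cgroup -o bio none /cgroup/bio
	 *   mkdir /cgroup/bio/grp1
	 */
	#include <stdio.h>

	int main(void)
	{
		FILE *f = fopen("/cgroup/bio/grp1/bio.shares", "w");

		if (!f) {
			perror("fopen");
			return 1;
		}
		/* Give grp1 half of the default weight of 1024. */
		fprintf(f, "512\n");
		return fclose(f) ? 1 : 0;
	}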
Index: linux17/include/linux/cgroup_subsys.h
===================================================================
--- linux17.orig/include/linux/cgroup_subsys.h	2008-10-09 18:13:53.000000000 -0400
+++ linux17/include/linux/cgroup_subsys.h	2008-11-05 18:12:32.000000000 -0500
@@ -43,6 +43,12 @@ SUBSYS(mem_cgroup)
 
 /* */
 
+#ifdef CONFIG_CGROUP_BIO
+SUBSYS(bio_cgroup)
+#endif
+
+/* */
+
 #ifdef CONFIG_CGROUP_DEVICE
 SUBSYS(devices)
 #endif
Index: linux17/init/Kconfig
===================================================================
--- linux17.orig/init/Kconfig	2008-10-09 18:13:53.000000000 -0400
+++ linux17/init/Kconfig	2008-11-05 18:12:32.000000000 -0500
@@ -408,6 +408,13 @@ config CGROUP_MEM_RES_CTLR
 	  This config option also selects MM_OWNER config option, which
 	  could in turn add some fork/exit overhead.
 
+config CGROUP_BIO
+	bool "Block I/O cgroup subsystem"
+	depends on CGROUP_MEM_RES_CTLR
+	select MM_OWNER
+	help
+	  A generic proportional weight IO controller.
+
 config SYSFS_DEPRECATED
 	bool
Index: linux17/mm/biocontrol.c
===================================================================
--- /dev/null	1970-01-01 00:00:00.000000000 +0000
+++ linux17/mm/biocontrol.c	2008-11-05 18:12:44.000000000 -0500
@@ -0,0 +1,409 @@
+/* biocontrol.c - Block I/O Controller
+ *
+ * Copyright IBM Corporation, 2007
+ * Author Balbir Singh
+ *
+ * Copyright 2007 OpenVZ SWsoft Inc
+ * Author: Pavel Emelianov
+ *
+ * Copyright VA Linux Systems Japan, 2008
+ * Author Hirokazu Takahashi
+ *
+ * Copyright RedHat Inc, 2008
+ * Author Vivek Goyal
+ *
+ * This program is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU General Public License as published by
+ * the Free Software Foundation; either version 2 of the License, or
+ * (at your option) any later version.
+ *
+ * This program is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
+ * GNU General Public License for more details.
+ */
+
+#include
+#include
+#include
+#include
+#include
+#include
+#include
+#include
+#include
+
+/* return the corresponding bio_cgroup object of a cgroup */
+static inline struct bio_cgroup *cgroup_bio(struct cgroup *cgrp)
+{
+	return container_of(cgroup_subsys_state(cgrp, bio_cgroup_subsys_id),
+				struct bio_cgroup, css);
+}
+
+static inline void bio_list_add_head(struct bio_list *bl, struct bio *bio)
+{
+	bio->bi_next = NULL;
+
+	if (bl->head)
+		bio->bi_next = bl->head;
+	else
+		bl->tail = bio;
+
+	bl->head = bio;
+}
+
+void __bio_group_queue_bio_head(struct bio_group *biog, struct bio *bio)
+{
+	bio_list_add_head(&biog->bio_queue, bio);
+}
+
+void bio_group_queue_bio_head(struct bio_group *biog, struct bio *bio)
+{
+	unsigned long flags;
+
+	spin_lock_irqsave(&biog->bio_group_lock, flags);
+	__bio_group_queue_bio_head(biog, bio);
+	spin_unlock_irqrestore(&biog->bio_group_lock, flags);
+}
+
+void __bio_group_queue_bio_tail(struct bio_group *biog, struct bio *bio)
+{
+	bio_list_add(&biog->bio_queue, bio);
+}
+
+void bio_group_queue_bio_tail(struct bio_group *biog, struct bio *bio)
+{
+	unsigned long flags;
+
+	spin_lock_irqsave(&biog->bio_group_lock, flags);
+	__bio_group_queue_bio_tail(biog, bio);
+	spin_unlock_irqrestore(&biog->bio_group_lock, flags);
+}
+
+/* Removes the first bio from the bio-cgroup request list */
+struct bio *__bio_group_dequeue_bio(struct bio_group *biog)
+{
+	struct bio *bio = NULL;
+
+	if (bio_list_empty(&biog->bio_queue))
+		return NULL;
+	bio = bio_list_pop(&biog->bio_queue);
+	return bio;
+}
+
+struct bio *bio_group_dequeue_bio(struct bio_group *biog)
+{
+	unsigned long flags;
+	struct bio *bio;
+
+	spin_lock_irqsave(&biog->bio_group_lock, flags);
+	bio = __bio_group_dequeue_bio(biog);
+	spin_unlock_irqrestore(&biog->bio_group_lock, flags);
+	return bio;
+}
+
+/* Traverse the list of active bio_groups of this cgroup and see if there
+ * is an active bio_group for the given request queue.
+ */
+struct bio_group *bio_group_from_cgroup(struct bio_cgroup *biocg,
+						struct request_queue *q)
+{
+	unsigned long flags;
+	struct bio_group *biog = NULL;
+
+	spin_lock_irqsave(&biocg->biog_list_lock, flags);
+	if (list_empty(&biocg->bio_group_list))
+		goto out;
+	list_for_each_entry(biog, &biocg->bio_group_list, next) {
+		if (biog->q == q) {
+			bio_group_get(biog);
+			goto out;
+		}
+	}
+
+	/* did not find biog */
+	spin_unlock_irqrestore(&biocg->biog_list_lock, flags);
+	return NULL;
+out:
+	spin_unlock_irqrestore(&biocg->biog_list_lock, flags);
+	return biog;
+}
+
+struct bio_cgroup *bio_cgroup_from_bio(struct bio *bio)
+{
+	struct page_cgroup *pc;
+	struct bio_cgroup *biocg = NULL;
+	struct page *page = bio_iovec_idx(bio, 0)->bv_page;
+
+	lock_page_cgroup(page);
+	pc = page_get_page_cgroup(page);
+	if (pc)
+		biocg = pc->bio_cgroup;
+	if (!biocg)
+		biocg = bio_cgroup_from_task(rcu_dereference(init_mm.owner));
+	unlock_page_cgroup(page);
+	return biocg;
+}
+
+static struct cgroup_subsys_state *bio_cgroup_create(struct cgroup_subsys *ss,
+						struct cgroup *cgrp)
+{
+	struct bio_cgroup *biocg;
+	int error;
+
+	if (!cgrp->parent) {
+		static struct bio_cgroup default_bio_cgroup;
+
+		biocg = &default_bio_cgroup;
+	} else {
+		biocg = kzalloc(sizeof(*biocg), GFP_KERNEL);
+		if (!biocg) {
+			error = -ENOMEM;
+			goto out;
+		}
+	}
+
+	/* Bind the cgroup to the bio_cgroup object we just created */
+	biocg->css.cgroup = cgrp;
+	spin_lock_init(&biocg->biog_list_lock);
+	spin_lock_init(&biocg->page_list_lock);
+	/* Assign default shares */
+	biocg->shares = 1024;
+	INIT_LIST_HEAD(&biocg->bio_group_list);
+	INIT_LIST_HEAD(&biocg->page_list);
+
+	return &biocg->css;
+out:
+	kfree(biocg);
+	return ERR_PTR(error);
+}
+
+void free_biog_elements(struct bio_cgroup *biocg)
+{
+	unsigned long flags, flags1;
+	struct bio_group *biog = NULL;
+
+	spin_lock_irqsave(&biocg->biog_list_lock, flags);
+	while (1) {
+		if (list_empty(&biocg->bio_group_list))
+			goto out;
+
+		list_for_each_entry(biog, &biocg->bio_group_list, next) {
+			spin_lock_irqsave(&biog->bio_group_lock, flags1);
+			if (!atomic_read(&biog->refcnt)) {
+				list_del(&biog->next);
+				BUG_ON(bio_group_on_queue(biog));
+				spin_unlock_irqrestore(&biog->bio_group_lock,
+							flags1);
+				kfree(biog);
+				break;
+			} else {
+				/* Drop the locks and schedule out. */
+				spin_unlock_irqrestore(&biog->bio_group_lock,
+							flags1);
+				spin_unlock_irqrestore(&biocg->biog_list_lock,
+							flags);
+				msleep(1);
+
+				/* Re-acquire the lock */
+				spin_lock_irqsave(&biocg->biog_list_lock,
+							flags);
+				break;
+			}
+		}
+	}
+
+out:
+	spin_unlock_irqrestore(&biocg->biog_list_lock, flags);
+	return;
+}
+
+void free_bio_cgroup(struct bio_cgroup *biocg)
+{
+	free_biog_elements(biocg);
+}
+
+static void __clear_bio_cgroup(struct page_cgroup *pc)
+{
+	struct bio_cgroup *biocg = pc->bio_cgroup;
+
+	pc->bio_cgroup = NULL;
+	/* The respective bio group got deleted, hence the reference to the
+	 * bio cgroup was removed from the page during force empty. But the
+	 * page is being freed now. Ignore it.
+	 */
+	if (!biocg)
+		return;
+	put_bio_cgroup(biocg);
+}
+
+void clear_bio_cgroup(struct page_cgroup *pc)
+{
+	__clear_bio_cgroup(pc);
+}
+
+#define FORCE_UNCHARGE_BATCH	(128)
+void bio_cgroup_force_empty(struct bio_cgroup *biocg)
+{
+	struct page_cgroup *pc;
+	struct page *page;
+	int count = FORCE_UNCHARGE_BATCH;
+	struct list_head *list = &biocg->page_list;
+	unsigned long flags;
+
+	spin_lock_irqsave(&biocg->page_list_lock, flags);
+	while (!list_empty(list)) {
+		pc = list_entry(list->prev, struct page_cgroup, blist);
+		page = pc->page;
+		get_page(page);
+		__bio_cgroup_remove_page(pc);
+		__clear_bio_cgroup(pc);
+		spin_unlock_irqrestore(&biocg->page_list_lock, flags);
+		put_page(page);
+		if (--count <= 0) {
+			count = FORCE_UNCHARGE_BATCH;
+			cond_resched();
+		}
+		spin_lock_irqsave(&biocg->page_list_lock, flags);
+	}
+	spin_unlock_irqrestore(&biocg->page_list_lock, flags);
+	/* Now free up all the bio groups related to this cgroup */
+	free_bio_cgroup(biocg);
+	return;
+}
+
+static void bio_cgroup_pre_destroy(struct cgroup_subsys *ss,
+					struct cgroup *cgrp)
+{
+	struct bio_cgroup *biocg = cgroup_bio(cgrp);
+
+	bio_cgroup_force_empty(biocg);
+}
+
+static void bio_cgroup_destroy(struct cgroup_subsys *ss, struct cgroup *cgrp)
+{
+	struct bio_cgroup *biocg = cgroup_bio(cgrp);
+
+	kfree(biocg);
+}
+
+static u64 bio_shares_read(struct cgroup *cgrp, struct cftype *cft)
+{
+	struct bio_cgroup *biog = cgroup_bio(cgrp);
+
+	return (u64) biog->shares;
+}
+
+static int bio_shares_write(struct cgroup *cgrp, struct cftype *cft, u64 val)
+{
+	struct bio_cgroup *biog = cgroup_bio(cgrp);
+
+	biog->shares = val;
+	return 0;
+}
+
+static u64 bio_aggregate_tokens_read(struct cgroup *cgrp, struct cftype *cft)
+{
+	struct bio_cgroup *biocg = cgroup_bio(cgrp);
+
+	return (u64) biocg->aggregate_tokens;
+}
+
+static int bio_aggregate_tokens_write(struct cgroup *cgrp, struct cftype *cft,
+					u64 val)
+{
+	struct bio_cgroup *biocg = cgroup_bio(cgrp);
+
+	biocg->aggregate_tokens = val;
+	return 0;
+}
+
+static u64 bio_jiffies_read(struct cgroup *cgrp, struct cftype *cft)
+{
+	struct bio_cgroup *biocg = cgroup_bio(cgrp);
+
+	return (u64) biocg->jiffies;
+}
+
+static u64 bio_nr_off_the_tree_read(struct cgroup *cgrp, struct cftype *cft)
+{
+	struct bio_cgroup *biocg = cgroup_bio(cgrp);
+
+	return (u64) biocg->nr_off_the_tree;
+}
+
+static int bio_nr_off_the_tree_write(struct cgroup *cgrp, struct cftype *cft,
+					u64 val)
+{
+	struct bio_cgroup *biocg = cgroup_bio(cgrp);
+
+	biocg->nr_off_the_tree = val;
+	return 0;
+}
+
+static u64 bio_nr_token_slices_read(struct cgroup *cgrp, struct cftype *cft)
+{
+	struct bio_cgroup *biocg = cgroup_bio(cgrp);
+
+	return (u64) biocg->nr_token_slices;
+}
+
+static int bio_nr_token_slices_write(struct cgroup *cgrp,
+					struct cftype *cft, u64 val)
+{
+	struct bio_cgroup *biocg = cgroup_bio(cgrp);
+
+	biocg->nr_token_slices = val;
+	return 0;
+}
+
+static struct cftype bio_files[] = {
+	{
+		.name = "shares",
+		.read_u64 = bio_shares_read,
+		.write_u64 = bio_shares_write,
+	},
+	{
+		.name = "aggregate_tokens",
+		.read_u64 = bio_aggregate_tokens_read,
+		.write_u64 = bio_aggregate_tokens_write,
+	},
+	{
+		.name = "jiffies",
+		.read_u64 = bio_jiffies_read,
+	},
+	{
+		.name = "nr_off_the_tree",
+		.read_u64 = bio_nr_off_the_tree_read,
+		.write_u64 = bio_nr_off_the_tree_write,
+	},
+	{
+		.name = "nr_token_slices",
+		.read_u64 = bio_nr_token_slices_read,
+		.write_u64 = bio_nr_token_slices_write,
+	},
+};
+
+static int bio_cgroup_populate(struct cgroup_subsys *ss, struct cgroup *cont)
+{
+	if (bio_cgroup_disabled())
+		return 0;
+	return cgroup_add_files(cont, ss, bio_files, ARRAY_SIZE(bio_files));
+}
+
+static void bio_cgroup_move_task(struct cgroup_subsys *ss,
+					struct cgroup *cont,
+					struct cgroup *old_cont,
+					struct task_struct *p)
+{
+	/* do nothing */
+}
+
+struct cgroup_subsys bio_cgroup_subsys = {
+	.name = "bio",
+	.subsys_id = bio_cgroup_subsys_id,
+	.create = bio_cgroup_create,
+	.destroy = bio_cgroup_destroy,
+	.pre_destroy = bio_cgroup_pre_destroy,
+	.populate = bio_cgroup_populate,
+	.attach = bio_cgroup_move_task,
+	.early_init = 0,
+};
Index: linux17/include/linux/biocontrol.h
===================================================================
--- /dev/null	1970-01-01 00:00:00.000000000 +0000
+++ linux17/include/linux/biocontrol.h	2008-11-05 18:12:44.000000000 -0500
@@ -0,0 +1,174 @@
+#include
+#include
+#include
+#include
+#include "../../drivers/md/dm-bio-list.h"
+
+#ifndef _LINUX_BIOCONTROL_H
+#define _LINUX_BIOCONTROL_H
+
+#ifdef CONFIG_CGROUP_BIO
+
+struct io_context;
+struct block_device;
+
+struct bio_cgroup {
+	struct cgroup_subsys_state css;
+	/* Share/weight of the cgroup */
+	unsigned long shares;
+
+	/* list of bio-groups associated with this cgroup. */
+	struct list_head bio_group_list;
+	spinlock_t biog_list_lock;
+
+	/* list of pages associated with this bio cgroup */
+	spinlock_t page_list_lock;
+	struct list_head page_list;
+
+	/* Debug Aid */
+	unsigned long aggregate_tokens;
+	unsigned long jiffies;
+	unsigned long nr_off_the_tree;
+	unsigned long nr_token_slices;
+};
+
+static inline int bio_cgroup_disabled(void)
+{
+	return bio_cgroup_subsys.disabled;
+}
+
+static inline struct bio_cgroup *bio_cgroup_from_task(struct task_struct *p)
+{
+	return container_of(task_subsys_state(p, bio_cgroup_subsys_id),
+				struct bio_cgroup, css);
+}
+
+static inline void get_bio_cgroup(struct bio_cgroup *biocg)
+{
+	css_get(&biocg->css);
+}
+
+static inline void put_bio_cgroup(struct bio_cgroup *biocg)
+{
+	css_put(&biocg->css);
+}
+
+static inline void set_bio_cgroup(struct page_cgroup *pc,
+					struct bio_cgroup *biog)
+{
+	pc->bio_cgroup = biog;
+}
+
+static inline struct bio_cgroup *get_bio_page_cgroup(struct page_cgroup *pc)
+{
+	struct bio_cgroup *biog = pc->bio_cgroup;
+
+	get_bio_cgroup(biog);
+	return biog;
+}
+
+/* This should be called in an RCU-protected section. */
+static inline struct bio_cgroup *mm_get_bio_cgroup(struct mm_struct *mm)
+{
+	struct bio_cgroup *biog;
+
+	biog = bio_cgroup_from_task(rcu_dereference(mm->owner));
+	get_bio_cgroup(biog);
+	return biog;
+}
+
+static inline void __bio_cgroup_add_page(struct page_cgroup *pc)
+{
+	struct bio_cgroup *biocg = pc->bio_cgroup;
+
+	list_add(&pc->blist, &biocg->page_list);
+}
+
+static inline void bio_cgroup_add_page(struct page_cgroup *pc)
+{
+	struct bio_cgroup *biocg = pc->bio_cgroup;
+	unsigned long flags;
+
+	spin_lock_irqsave(&biocg->page_list_lock, flags);
+	__bio_cgroup_add_page(pc);
+	spin_unlock_irqrestore(&biocg->page_list_lock, flags);
+}
+
+static inline void __bio_cgroup_remove_page(struct page_cgroup *pc)
+{
+	list_del_init(&pc->blist);
+}
+
+static inline void bio_cgroup_remove_page(struct page_cgroup *pc)
+{
+	struct bio_cgroup *biocg = pc->bio_cgroup;
+	unsigned long flags;
+
+	/* The respective bio group got deleted, hence the reference to the
+	 * bio cgroup was removed from the page during force empty. But the
+	 * page is being freed now. Ignore it.
+	 */
+	if (!biocg)
+		return;
+	spin_lock_irqsave(&biocg->page_list_lock, flags);
+	__bio_cgroup_remove_page(pc);
+	spin_unlock_irqrestore(&biocg->page_list_lock, flags);
+}
+
+extern void clear_bio_cgroup(struct page_cgroup *pc);
+
+extern int bio_group_controller(struct request_queue *q, struct bio *bio);
+extern void blk_biogroup_work(struct work_struct *work);
+
+#else /* CONFIG_CGROUP_BIO */
+
+struct bio_cgroup;
+
+static inline int bio_cgroup_disabled(void)
+{
+	return 1;
+}
+
+static inline void get_bio_cgroup(struct bio_cgroup *biocg)
+{
+}
+
+static inline void put_bio_cgroup(struct bio_cgroup *biocg)
+{
+}
+
+static inline void set_bio_cgroup(struct page_cgroup *pc,
+					struct bio_cgroup *biog)
+{
+}
+
+static inline void clear_bio_cgroup(struct page_cgroup *pc)
+{
+}
+
+static inline struct bio_cgroup *get_bio_page_cgroup(struct page_cgroup *pc)
+{
+	return NULL;
+}
+
+static inline struct bio_cgroup *mm_get_bio_cgroup(struct mm_struct *mm)
+{
+	return NULL;
+}
+
+static inline void bio_cgroup_add_page(struct page_cgroup *pc)
+{
+}
+
+static inline void bio_cgroup_remove_page(struct page_cgroup *pc)
+{
+}
+
+static inline int bio_group_controller(struct request_queue *q, struct bio *bio)
+{
+	return 0;
+}
+
+static inline void blk_biogroup_work(struct work_struct *work)
+{
+}
+
+#endif /* CONFIG_CGROUP_BIO */
+
+#endif /* _LINUX_BIOCONTROL_H */
Index: linux17/mm/Makefile
===================================================================
--- linux17.orig/mm/Makefile	2008-10-09 18:13:53.000000000 -0400
+++ linux17/mm/Makefile	2008-11-05 18:12:32.000000000 -0500
@@ -34,4 +34,5 @@ obj-$(CONFIG_MIGRATION) += migrate.o
 obj-$(CONFIG_SMP) += allocpercpu.o
 obj-$(CONFIG_QUICKLIST) += quicklist.o
 obj-$(CONFIG_CGROUP_MEM_RES_CTLR) += memcontrol.o
+obj-$(CONFIG_CGROUP_BIO) += biocontrol.o
Index: linux17/mm/memcontrol.c
===================================================================
--- linux17.orig/mm/memcontrol.c	2008-10-09 18:13:53.000000000 -0400
+++ linux17/mm/memcontrol.c	2008-11-05 18:12:32.000000000 -0500
@@ -32,6 +32,7 @@
 #include
 #include
 #include
+#include
 #include
 
@@ -144,30 +145,6 @@ struct mem_cgroup {
 };
 static struct mem_cgroup init_mem_cgroup;
 
-/*
- * We use the lower bit of the page->page_cgroup pointer as a bit spin
- * lock. We need to ensure that page->page_cgroup is at least two
- * byte aligned (based on comments from Nick Piggin). But since
- * bit_spin_lock doesn't actually set that lock bit in a non-debug
- * uniprocessor kernel, we should avoid setting it here too.
- */
-#define PAGE_CGROUP_LOCK_BIT	0x0
-#if defined(CONFIG_SMP) || defined(CONFIG_DEBUG_SPINLOCK)
-#define PAGE_CGROUP_LOCK	(1 << PAGE_CGROUP_LOCK_BIT)
-#else
-#define PAGE_CGROUP_LOCK	0x0
-#endif
-
-/*
- * A page_cgroup page is associated with every page descriptor. The
- * page_cgroup helps us identify information about the cgroup
- */
-struct page_cgroup {
-	struct list_head lru;		/* per cgroup LRU list */
-	struct page *page;
-	struct mem_cgroup *mem_cgroup;
-	int flags;
-};
 
 #define PAGE_CGROUP_FLAG_CACHE	(0x1)	/* charged as cache */
 #define PAGE_CGROUP_FLAG_ACTIVE (0x2)	/* page is active in this cgroup */
@@ -278,21 +255,6 @@ struct page_cgroup *page_get_page_cgroup
 	return (struct page_cgroup *) (page->page_cgroup & ~PAGE_CGROUP_LOCK);
 }
 
-static void lock_page_cgroup(struct page *page)
-{
-	bit_spin_lock(PAGE_CGROUP_LOCK_BIT, &page->page_cgroup);
-}
-
-static int try_lock_page_cgroup(struct page *page)
-{
-	return bit_spin_trylock(PAGE_CGROUP_LOCK_BIT, &page->page_cgroup);
-}
-
-static void unlock_page_cgroup(struct page *page)
-{
-	bit_spin_unlock(PAGE_CGROUP_LOCK_BIT, &page->page_cgroup);
-}
-
 static void __mem_cgroup_remove_list(struct mem_cgroup_per_zone *mz,
 					struct page_cgroup *pc)
 {
@@ -535,14 +497,15 @@ unsigned long mem_cgroup_isolate_pages(u
  * < 0 if the cgroup is over its limit
  */
 static int mem_cgroup_charge_common(struct page *page, struct mm_struct *mm,
-				gfp_t gfp_mask, enum charge_type ctype,
-				struct mem_cgroup *memcg)
+				gfp_t gfp_mask, enum charge_type ctype,
+				struct mem_cgroup *memcg,
+				struct bio_cgroup *biocg)
 {
 	struct mem_cgroup *mem;
 	struct page_cgroup *pc;
 	unsigned long flags;
 	unsigned long nr_retries = MEM_CGROUP_RECLAIM_RETRIES;
 	struct mem_cgroup_per_zone *mz;
+	struct bio_cgroup *biocg_temp;
 
 	pc = kmem_cache_alloc(page_cgroup_cache, gfp_mask);
 	if (unlikely(pc == NULL))
@@ -572,6 +535,10 @@ static int mem_cgroup_charge_common(stru
 		css_get(&memcg->css);
 	}
 
+	rcu_read_lock();
+	biocg_temp = biocg ? biocg : mm_get_bio_cgroup(mm);
+	rcu_read_unlock();
+
 	while (res_counter_charge(&mem->res, PAGE_SIZE)) {
 		if (!(gfp_mask & __GFP_WAIT))
 			goto out;
@@ -597,6 +564,7 @@ static int mem_cgroup_charge_common(stru
 
 	pc->mem_cgroup = mem;
 	pc->page = page;
+	set_bio_cgroup(pc, biocg_temp);
 	/*
 	 * If a page is accounted as a page cache, insert to inactive list.
 	 * If anon, insert to active list.
@@ -611,21 +579,22 @@ static int mem_cgroup_charge_common(stru
 		unlock_page_cgroup(page);
 		res_counter_uncharge(&mem->res, PAGE_SIZE);
 		css_put(&mem->css);
+		clear_bio_cgroup(pc);
 		kmem_cache_free(page_cgroup_cache, pc);
 		goto done;
 	}
 	page_assign_page_cgroup(page, pc);
-
 	mz = page_cgroup_zoneinfo(pc);
 	spin_lock_irqsave(&mz->lru_lock, flags);
 	__mem_cgroup_add_list(mz, pc);
 	spin_unlock_irqrestore(&mz->lru_lock, flags);
-
+	bio_cgroup_add_page(pc);
 	unlock_page_cgroup(page);
 
 done:
 	return 0;
 out:
 	css_put(&mem->css);
+	put_bio_cgroup(biocg_temp);
 	kmem_cache_free(page_cgroup_cache, pc);
 err:
 	return -ENOMEM;
@@ -648,7 +617,7 @@ int mem_cgroup_charge(struct page *page,
 	if (unlikely(!mm))
 		mm = &init_mm;
 	return mem_cgroup_charge_common(page, mm, gfp_mask,
-				MEM_CGROUP_CHARGE_TYPE_MAPPED, NULL);
+				MEM_CGROUP_CHARGE_TYPE_MAPPED, NULL, NULL);
 }
 
 int mem_cgroup_cache_charge(struct page *page, struct mm_struct *mm,
@@ -684,7 +653,7 @@ int mem_cgroup_cache_charge(struct page
 		mm = &init_mm;
 
 	return mem_cgroup_charge_common(page, mm, gfp_mask,
-				MEM_CGROUP_CHARGE_TYPE_CACHE, NULL);
+				MEM_CGROUP_CHARGE_TYPE_CACHE, NULL, NULL);
 }
 
 /*
@@ -720,14 +689,14 @@ __mem_cgroup_uncharge_common(struct page
 	spin_lock_irqsave(&mz->lru_lock, flags);
 	__mem_cgroup_remove_list(mz, pc);
 	spin_unlock_irqrestore(&mz->lru_lock, flags);
-
+	bio_cgroup_remove_page(pc);
 	page_assign_page_cgroup(page, NULL);
 	unlock_page_cgroup(page);
 
 	mem = pc->mem_cgroup;
 	res_counter_uncharge(&mem->res, PAGE_SIZE);
 	css_put(&mem->css);
-
+	clear_bio_cgroup(pc);
 	kmem_cache_free(page_cgroup_cache, pc);
 	return;
 unlock:
@@ -754,6 +723,7 @@ int mem_cgroup_prepare_migration(struct
 	struct mem_cgroup *mem = NULL;
 	enum charge_type ctype = MEM_CGROUP_CHARGE_TYPE_MAPPED;
 	int ret = 0;
+	struct bio_cgroup *biocg = NULL;
 
 	if (mem_cgroup_subsys.disabled)
 		return 0;
@@ -765,12 +735,15 @@ int mem_cgroup_prepare_migration(struct
 		css_get(&mem->css);
 		if (pc->flags & PAGE_CGROUP_FLAG_CACHE)
 			ctype = MEM_CGROUP_CHARGE_TYPE_CACHE;
+		biocg = get_bio_page_cgroup(pc);
 	}
 	unlock_page_cgroup(page);
 	if (mem) {
 		ret = mem_cgroup_charge_common(newpage, NULL, GFP_KERNEL,
-						ctype, mem);
+						ctype, mem, biocg);
 		css_put(&mem->css);
+		if (biocg)
+			put_bio_cgroup(biocg);
 	}
 	return ret;
 }
Index: linux17/include/linux/memcontrol.h
===================================================================
--- linux17.orig/include/linux/memcontrol.h	2008-10-09 18:13:53.000000000 -0400
+++ linux17/include/linux/memcontrol.h	2008-11-05 18:12:32.000000000 -0500
@@ -17,16 +17,47 @@
  * GNU General Public License for more details.
  */
 
+#include
+#include
+
 #ifndef _LINUX_MEMCONTROL_H
 #define _LINUX_MEMCONTROL_H
 
 struct mem_cgroup;
-struct page_cgroup;
 struct page;
 struct mm_struct;
 
 #ifdef CONFIG_CGROUP_MEM_RES_CTLR
 
+/*
+ * We use the lower bit of the page->page_cgroup pointer as a bit spin
+ * lock. We need to ensure that page->page_cgroup is at least two
+ * byte aligned (based on comments from Nick Piggin). But since
+ * bit_spin_lock doesn't actually set that lock bit in a non-debug
+ * uniprocessor kernel, we should avoid setting it here too.
+ */
+#define PAGE_CGROUP_LOCK_BIT	0x0
+#if defined(CONFIG_SMP) || defined(CONFIG_DEBUG_SPINLOCK)
+#define PAGE_CGROUP_LOCK	(1 << PAGE_CGROUP_LOCK_BIT)
+#else
+#define PAGE_CGROUP_LOCK	0x0
+#endif
+
+/*
+ * A page_cgroup page is associated with every page descriptor. The
+ * page_cgroup helps us identify information about the cgroup
+ */
+struct page_cgroup {
+	struct list_head lru;		/* per cgroup LRU list */
+	struct page *page;
+	struct mem_cgroup *mem_cgroup;
+	int flags;
+#ifdef CONFIG_CGROUP_BIO
+	struct list_head blist;		/* for bio_cgroup page list */
+	struct bio_cgroup *bio_cgroup;
+#endif
+};
+
 #define page_reset_bad_cgroup(page)	((page)->page_cgroup = 0)
 
 extern struct page_cgroup *page_get_page_cgroup(struct page *page);
@@ -74,6 +105,20 @@ extern long mem_cgroup_calc_reclaim_acti
 extern long mem_cgroup_calc_reclaim_inactive(struct mem_cgroup *mem,
 					struct zone *zone, int priority);
 
+static inline void lock_page_cgroup(struct page *page)
+{
+	bit_spin_lock(PAGE_CGROUP_LOCK_BIT, &page->page_cgroup);
+}
+
+static inline int try_lock_page_cgroup(struct page *page)
+{
+	return bit_spin_trylock(PAGE_CGROUP_LOCK_BIT, &page->page_cgroup);
+}
+
+static inline void unlock_page_cgroup(struct page *page)
+{
+	bit_spin_unlock(PAGE_CGROUP_LOCK_BIT, &page->page_cgroup);
+}
+
 #else /* CONFIG_CGROUP_MEM_RES_CTLR */
 static inline void page_reset_bad_cgroup(struct page *page)
 {
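As a rough sketch of the consumer side (the actual elevator hooks live in
the other patches of this series; classify_bio() is a hypothetical helper
name, not code from this patch), an I/O scheduler could map an incoming
bio to its per-queue bio_group with the helpers introduced above:

	/* Hypothetical sketch, not part of this patch. */
	static struct bio_group *classify_bio(struct request_queue *q,
						struct bio *bio)
	{
		/* Resolve the owning cgroup from the bio's first page. */
		struct bio_cgroup *biocg = bio_cgroup_from_bio(bio);

		/* Look up the active per-queue group, taking a reference
		 * via bio_group_get(); returns NULL if none is active. */
		return bio_group_from_cgroup(biocg, q);
	}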