From: Kristen Carlson Accardi <[email protected]>
Previous patches have implemented all the infrastructure needed for
per-cgroup EPC page tracking and reclaiming. However, all reclaimable EPC
pages are still tracked in the global LRU because sgx_epc_page_lru() always
returns a reference to the global LRU.
Change sgx_epc_page_lru() to return the LRU of the cgroup in which the
given EPC page is allocated.
With this change all EPC pages are tracked in per-cgroup LRUs, and the
global reclaimer (ksgxd) can no longer reclaim any pages from the global
LRU. However, in case of over-commitment, i.e., when the sum of the cgroup
limits is greater than the total capacity, individual cgroups may never hit
their limits and thus never reclaim, while the total usage can still
approach the capacity. Global reclamation is therefore still needed in
those cases, and it should be performed from the root cgroup.
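For example (numbers purely illustrative): with a total capacity of 100 EPC
pages and two cgroups each limited to 60 pages, neither cgroup ever exceeds
its own limit, yet their combined usage can reach the full 100 pages; in
that situation only a reclamation rooted at the root cgroup can free pages.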
Modify sgx_reclaim_pages_global() to reclaim from the root EPC cgroup when
the cgroup is enabled, and from the global LRU otherwise. Export
sgx_cgroup_reclaim_pages() in the header file so it can be reused for
this purpose.
Similarly, modify sgx_can_reclaim_global() to check the emptiness of the
LRUs of all cgroups when the EPC cgroup is enabled, and only the global
LRU otherwise. Export sgx_cgroup_lru_empty() so it can be reused for this
purpose.
Finally, change sgx_reclaim_direct() to ensure there are free pages at the
cgroup level so the caller can make forward progress. Export
sgx_cgroup_should_reclaim() for reuse.
With these changes, the global reclamation and per-cgroup reclamation
both work properly with all pages tracked in per-cgroup LRUs.
Co-developed-by: Sean Christopherson <[email protected]>
Signed-off-by: Sean Christopherson <[email protected]>
Signed-off-by: Kristen Carlson Accardi <[email protected]>
Co-developed-by: Haitao Huang <[email protected]>
Signed-off-by: Haitao Huang <[email protected]>
Tested-by: Mikko Ylinen <[email protected]>
Tested-by: Jarkko Sakkinen <[email protected]>
---
V13:
- Use IS_ENABLED(CONFIG_CGROUP_MISC) in sgx_can_reclaim_global(). (Kai)
V12:
- Remove CONFIG_CGROUP_SGX_EPC; conditionally compile the SGX cgroup code
  for CONFIG_CGROUP_MISC. (Jarkko)
V11:
- Reword the comments for global reclamation for allocation failure
after passing cgroup charging. (Kai)
- Add stub functions to remove #ifdefs in the .c file. (Kai)
- Add more detailed comments to clarify each page belongs to one cgroup, or the
root. (Kai)
V10:
- Add comment to clarify each page belongs to one cgroup, or the root by
default. (Kai)
- Merge the changes that expose sgx_cgroup_* functions to this patch.
- Add changes for sgx_reclaim_direct() that was missed previously.
V7:
- Split this out from the big patch, #10 in V6. (Dave, Kai)
---
arch/x86/kernel/cpu/sgx/epc_cgroup.c | 6 ++--
arch/x86/kernel/cpu/sgx/epc_cgroup.h | 27 ++++++++++++++++
arch/x86/kernel/cpu/sgx/main.c | 46 ++++++++++++++++++++++++++--
3 files changed, 74 insertions(+), 5 deletions(-)
diff --git a/arch/x86/kernel/cpu/sgx/epc_cgroup.c b/arch/x86/kernel/cpu/sgx/epc_cgroup.c
index c237bc3330ee..fe2fd6034023 100644
--- a/arch/x86/kernel/cpu/sgx/epc_cgroup.c
+++ b/arch/x86/kernel/cpu/sgx/epc_cgroup.c
@@ -67,7 +67,7 @@ static inline u64 sgx_cgroup_max_pages_to_root(struct sgx_cgroup *sgx_cg)
*
* Return: %true if all cgroups under the specified root have empty LRU lists.
*/
-static bool sgx_cgroup_lru_empty(struct misc_cg *root)
+bool sgx_cgroup_lru_empty(struct misc_cg *root)
{
struct cgroup_subsys_state *css_root;
struct cgroup_subsys_state *pos;
@@ -115,7 +115,7 @@ static bool sgx_cgroup_lru_empty(struct misc_cg *root)
* the LRUs are recently accessed, i.e., considered "too young" to reclaim, no
* page will actually be reclaimed after walking the whole tree.
*/
-static void sgx_cgroup_reclaim_pages(struct misc_cg *root, struct mm_struct *charge_mm)
+void sgx_cgroup_reclaim_pages(struct misc_cg *root, struct mm_struct *charge_mm)
{
struct cgroup_subsys_state *css_root;
struct cgroup_subsys_state *pos;
@@ -156,7 +156,7 @@ static void sgx_cgroup_reclaim_pages(struct misc_cg *root, struct mm_struct *cha
* threshold (%SGX_CG_MIN_FREE_PAGE) and there are reclaimable pages within the
* cgroup.
*/
-static bool sgx_cgroup_should_reclaim(struct sgx_cgroup *sgx_cg)
+bool sgx_cgroup_should_reclaim(struct sgx_cgroup *sgx_cg)
{
u64 cur, max;
diff --git a/arch/x86/kernel/cpu/sgx/epc_cgroup.h b/arch/x86/kernel/cpu/sgx/epc_cgroup.h
index 2044e0d64076..9d69608eadf6 100644
--- a/arch/x86/kernel/cpu/sgx/epc_cgroup.h
+++ b/arch/x86/kernel/cpu/sgx/epc_cgroup.h
@@ -13,6 +13,11 @@
#define MISC_CG_RES_SGX_EPC MISC_CG_RES_TYPES
struct sgx_cgroup;
+static inline struct misc_cg *misc_from_sgx(struct sgx_cgroup *sgx_cg)
+{
+ return NULL;
+}
+
static inline struct sgx_cgroup *sgx_get_current_cg(void)
{
return NULL;
@@ -27,8 +32,22 @@ static inline int sgx_cgroup_try_charge(struct sgx_cgroup *sgx_cg, enum sgx_recl
static inline void sgx_cgroup_uncharge(struct sgx_cgroup *sgx_cg) { }
+static inline bool sgx_cgroup_lru_empty(struct misc_cg *root)
+{
+ return true;
+}
+
+static inline bool sgx_cgroup_should_reclaim(struct sgx_cgroup *sgx_cg)
+{
+ return false;
+}
+
static inline void sgx_cgroup_init(void) { }
+static inline void sgx_cgroup_reclaim_pages(struct misc_cg *root, struct mm_struct *charge_mm)
+{
+}
+
#else /* CONFIG_CGROUP_MISC */
struct sgx_cgroup {
@@ -37,6 +56,11 @@ struct sgx_cgroup {
struct work_struct reclaim_work;
};
+static inline struct misc_cg *misc_from_sgx(struct sgx_cgroup *sgx_cg)
+{
+ return sgx_cg->cg;
+}
+
static inline struct sgx_cgroup *sgx_cgroup_from_misc_cg(struct misc_cg *cg)
{
return (struct sgx_cgroup *)(cg->res[MISC_CG_RES_SGX_EPC].priv);
@@ -67,6 +91,9 @@ static inline void sgx_put_cg(struct sgx_cgroup *sgx_cg)
int sgx_cgroup_try_charge(struct sgx_cgroup *sgx_cg, enum sgx_reclaim reclaim);
void sgx_cgroup_uncharge(struct sgx_cgroup *sgx_cg);
+bool sgx_cgroup_lru_empty(struct misc_cg *root);
+bool sgx_cgroup_should_reclaim(struct sgx_cgroup *sgx_cg);
+void sgx_cgroup_reclaim_pages(struct misc_cg *root, struct mm_struct *charge_mm);
void sgx_cgroup_init(void);
#endif /* CONFIG_CGROUP_MISC */
diff --git a/arch/x86/kernel/cpu/sgx/main.c b/arch/x86/kernel/cpu/sgx/main.c
index 92bd3151a589..8748358fd00f 100644
--- a/arch/x86/kernel/cpu/sgx/main.c
+++ b/arch/x86/kernel/cpu/sgx/main.c
@@ -31,9 +31,30 @@ static DEFINE_XARRAY(sgx_epc_address_space);
*/
static struct sgx_epc_lru_list sgx_global_lru;
+/*
+ * Get the per-cgroup or global LRU list that tracks the given reclaimable page.
+ */
static inline struct sgx_epc_lru_list *sgx_epc_page_lru(struct sgx_epc_page *epc_page)
{
+#ifdef CONFIG_CGROUP_MISC
+ /*
+ * epc_page->sgx_cg here is never NULL during a reclaimable epc_page's
+ * life between sgx_alloc_epc_page() and sgx_free_epc_page():
+ *
+ * In sgx_alloc_epc_page(), epc_page->sgx_cg is set to the return from
+ * sgx_get_current_cg() which is the misc cgroup of the current task, or
+ * the root by default even if the misc cgroup is disabled by kernel
+ * command line.
+ *
+ * epc_page->sgx_cg is only unset by sgx_free_epc_page().
+ *
+ * This function is never used before sgx_alloc_epc_page() or after
+ * sgx_free_epc_page().
+ */
+ return &epc_page->sgx_cg->lru;
+#else
return &sgx_global_lru;
+#endif
}
/*
@@ -41,7 +62,10 @@ static inline struct sgx_epc_lru_list *sgx_epc_page_lru(struct sgx_epc_page *epc
*/
static inline bool sgx_can_reclaim_global(void)
{
- return !list_empty(&sgx_global_lru.reclaimable);
+ if (IS_ENABLED(CONFIG_CGROUP_MISC))
+ return !sgx_cgroup_lru_empty(misc_cg_root());
+ else
+ return !list_empty(&sgx_global_lru.reclaimable);
}
static atomic_long_t sgx_nr_free_pages = ATOMIC_LONG_INIT(0);
@@ -403,7 +427,10 @@ static bool sgx_should_reclaim_global(unsigned long watermark)
static void sgx_reclaim_pages_global(struct mm_struct *charge_mm)
{
- sgx_reclaim_pages(&sgx_global_lru, charge_mm);
+ if (IS_ENABLED(CONFIG_CGROUP_MISC))
+ sgx_cgroup_reclaim_pages(misc_cg_root(), charge_mm);
+ else
+ sgx_reclaim_pages(&sgx_global_lru, charge_mm);
}
/*
@@ -413,6 +440,15 @@ static void sgx_reclaim_pages_global(struct mm_struct *charge_mm)
*/
void sgx_reclaim_direct(void)
{
+ struct sgx_cgroup *sgx_cg = sgx_get_current_cg();
+
+ /* Make sure there are some free pages at cgroup level */
+ if (sgx_cg && sgx_cgroup_should_reclaim(sgx_cg)) {
+ sgx_cgroup_reclaim_pages(misc_from_sgx(sgx_cg), current->mm);
+ sgx_put_cg(sgx_cg);
+ }
+
+ /* Make sure there are some free pages at global level */
if (sgx_should_reclaim_global(SGX_NR_LOW_PAGES))
sgx_reclaim_pages_global(current->mm);
}
@@ -615,6 +651,12 @@ struct sgx_epc_page *sgx_alloc_epc_page(void *owner, enum sgx_reclaim reclaim)
break;
}
+ /*
+ * At this point, the usage within this cgroup is under its
+ * limit but there is no physical page left for allocation.
+ * Perform a global reclaim to get some pages released from any
+ * cgroup with reclaimable pages.
+ */
sgx_reclaim_pages_global(current->mm);
cond_resched();
}
--
2.25.1
On 1/05/2024 7:51 am, Haitao Huang wrote:
>
> static void sgx_reclaim_pages_global(struct mm_struct *charge_mm)
> {
> - sgx_reclaim_pages(&sgx_global_lru, charge_mm);
> + if (IS_ENABLED(CONFIG_CGROUP_MISC))
> + sgx_cgroup_reclaim_pages(misc_cg_root(), charge_mm);
> + else
> + sgx_reclaim_pages(&sgx_global_lru, charge_mm);
> }
>
I think we have a problem here when we do global reclaim starting from
the ROOT cgroup:
This function will mostly only try to reclaim from the ROOT cgroup and
won't reclaim from its descendants.
The reason is that sgx_cgroup_reclaim_pages() will simply return after
"scanning" SGX_NR_TO_SCAN (16) pages w/o going into the descendants, and
the "scanning" here simply means "removing the EPC page from the
cgroup's LRU list".
So as long as the ROOT cgroup's LRU contains more than SGX_NR_TO_SCAN (16)
pages, sgx_cgroup_reclaim_pages() will effectively just scan and return
w/o going into the descendants. Having 16 EPC pages there should be an
"almost always true" case, I suppose.
When sgx_reclaim_pages_global() is called again, we will start from
the ROOT again.
That means this doesn't truly reclaim "from global" at all.
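To make this concrete, below is a small stand-alone toy model of the
walk-with-scan-budget behaviour described above (this is not the kernel
code; the tree layout, page counts and names are made up purely for
illustration). With at least SGX_NR_TO_SCAN pages on the root's own LRU,
the budget is exhausted before any descendant is visited:

/* Toy model only -- not kernel code. */
#include <stdio.h>

#define SGX_NR_TO_SCAN 16

struct toy_cg {
	const char *name;
	int lru_pages;		/* pages on this cgroup's LRU */
};

/* Root first, then descendants, mimicking a pre-order walk. */
static struct toy_cg tree[] = {
	{ "root",    32 },	/* >= SGX_NR_TO_SCAN */
	{ "child-a", 100 },
	{ "child-b", 100 },
};

static void toy_reclaim_pages(void)
{
	int scanned = 0;

	for (unsigned int i = 0; i < sizeof(tree) / sizeof(tree[0]); i++) {
		int n = tree[i].lru_pages;

		if (n > SGX_NR_TO_SCAN - scanned)
			n = SGX_NR_TO_SCAN - scanned;

		scanned += n;
		printf("scanned %2d page(s) from %s\n", n, tree[i].name);

		/* Budget exhausted: descendants are never visited. */
		if (scanned >= SGX_NR_TO_SCAN)
			return;
	}
}

int main(void)
{
	toy_reclaim_pages();	/* prints only: "scanned 16 page(s) from root" */
	return 0;
}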
IMHO the behaviour of sgx_cgroup_reclaim_pages() is OK for per-cgroup
reclaim, because in that case our intention is to try our best to reclaim
from the given cgroup, i.e., whether we can reclaim from the descendants
doesn't matter.
But for global reclaim this doesn't work.
Am I missing anything?
On Mon, 06 May 2024 19:10:42 -0500, Huang, Kai <[email protected]> wrote:
>
> [...]
>
> That means this doesn't truly reclaim "from global" at all.
>
> IMHO the behaviour of sgx_cgroup_reclaim_pages() is OK for per-cgroup
> reclaim, because in that case our intention is to try our best to
> reclaim from the given cgroup, i.e., whether we can reclaim from the
> descendants doesn't matter.
>
> But for global reclaim this doesn't work.
>
> Am I missing anything?
>
Good catch. This is indeed a problem if pages in a higher-level cgroup are
always busy (being 'young'). The reclamation loop starting from this cgroup
may get stuck just shifting pages from the front to the tail of its LRU and
never try to scan & reclaim pages in its descendants.
Though this may not happen often, I think it does require a fix. Will do
it in v14 :-)
Thanks
Haitao