When a system is under memory pressure (high usage with fragments),
the original 256KB ICM chunk allocations will likely trigger kernel
memory management to enter slow path doing memory compact/migration
ops in order to complete high order memory allocations.
When that happens, user processes calling uverb APIs may get stuck
for more than 120s easily even though there are a lot of free pages
in smaller chunks available in the system.
Syslog:
...
Dec 10 09:04:51 slcc03db02 kernel: [397078.572732] INFO: task
oracle_205573_e:205573 blocked for more than 120 seconds.
...
With 4KB ICM chunk size on x86_64 arch, the above issue is fixed.
However in order to support smaller ICM chunk size, we need to fix
another issue in large size kcalloc allocations.
E.g.
Setting log_num_mtt=30 requires 1G mtt entries. With the 4KB ICM chunk
size, each ICM chunk can only hold 512 mtt entries (8 bytes for each mtt
entry). So we need a 16MB allocation for a table->icm pointer array to
hold 2M pointers which can easily cause kcalloc to fail.
The solution is to use vzalloc to replace kcalloc. There is no need
for contiguous memory pages for a driver metadata structure (no need
of DMA ops).
Signed-off-by: Qing Huang <[email protected]>
Acked-by: Daniel Jurgens <[email protected]>
Reviewed-by: Zhu Yanjun <[email protected]>
---
v2 -> v1: adjusted chunk size to reflect different architectures.
drivers/net/ethernet/mellanox/mlx4/icm.c | 14 +++++++-------
1 file changed, 7 insertions(+), 7 deletions(-)
diff --git a/drivers/net/ethernet/mellanox/mlx4/icm.c b/drivers/net/ethernet/mellanox/mlx4/icm.c
index a822f7a..ccb62b8 100644
--- a/drivers/net/ethernet/mellanox/mlx4/icm.c
+++ b/drivers/net/ethernet/mellanox/mlx4/icm.c
@@ -43,12 +43,12 @@
#include "fw.h"
/*
- * We allocate in as big chunks as we can, up to a maximum of 256 KB
- * per chunk.
+ * We allocate in page size (default 4KB on many archs) chunks to avoid high
+ * order memory allocations in fragmented/high usage memory situation.
*/
enum {
- MLX4_ICM_ALLOC_SIZE = 1 << 18,
- MLX4_TABLE_CHUNK_SIZE = 1 << 18
+ MLX4_ICM_ALLOC_SIZE = 1 << PAGE_SHIFT,
+ MLX4_TABLE_CHUNK_SIZE = 1 << PAGE_SHIFT
};
static void mlx4_free_icm_pages(struct mlx4_dev *dev, struct mlx4_icm_chunk *chunk)
@@ -400,7 +400,7 @@ int mlx4_init_icm_table(struct mlx4_dev *dev, struct mlx4_icm_table *table,
obj_per_chunk = MLX4_TABLE_CHUNK_SIZE / obj_size;
num_icm = (nobj + obj_per_chunk - 1) / obj_per_chunk;
- table->icm = kcalloc(num_icm, sizeof(*table->icm), GFP_KERNEL);
+ table->icm = vzalloc(num_icm * sizeof(*table->icm));
if (!table->icm)
return -ENOMEM;
table->virt = virt;
@@ -446,7 +446,7 @@ int mlx4_init_icm_table(struct mlx4_dev *dev, struct mlx4_icm_table *table,
mlx4_free_icm(dev, table->icm[i], use_coherent);
}
- kfree(table->icm);
+ vfree(table->icm);
return -ENOMEM;
}
@@ -462,5 +462,5 @@ void mlx4_cleanup_icm_table(struct mlx4_dev *dev, struct mlx4_icm_table *table)
mlx4_free_icm(dev, table->icm[i], table->coherent);
}
- kfree(table->icm);
+ vfree(table->icm);
}
--
2.9.3
On 11/05/2018 10:23 PM, Qing Huang wrote:
> When a system is under memory presure (high usage with fragments),
> the original 256KB ICM chunk allocations will likely trigger kernel
> memory management to enter slow path doing memory compact/migration
> ops in order to complete high order memory allocations.
>
> When that happens, user processes calling uverb APIs may get stuck
> for more than 120s easily even though there are a lot of free pages
> in smaller chunks available in the system.
>
> Syslog:
> ...
> Dec 10 09:04:51 slcc03db02 kernel: [397078.572732] INFO: task
> oracle_205573_e:205573 blocked for more than 120 seconds.
> ...
>
> With 4KB ICM chunk size on x86_64 arch, the above issue is fixed.
>
> However in order to support smaller ICM chunk size, we need to fix
> another issue in large size kcalloc allocations.
>
> E.g.
> Setting log_num_mtt=30 requires 1G mtt entries. With the 4KB ICM chunk
> size, each ICM chunk can only hold 512 mtt entries (8 bytes for each mtt
> entry). So we need a 16MB allocation for a table->icm pointer array to
> hold 2M pointers which can easily cause kcalloc to fail.
>
> The solution is to use vzalloc to replace kcalloc. There is no need
> for contiguous memory pages for a driver meta data structure (no need
> of DMA ops).
>
> Signed-off-by: Qing Huang <[email protected]>
> Acked-by: Daniel Jurgens <[email protected]>
> Reviewed-by: Zhu Yanjun <[email protected]>
> ---
> v2 -> v1: adjusted chunk size to reflect different architectures.
>
> drivers/net/ethernet/mellanox/mlx4/icm.c | 14 +++++++-------
> 1 file changed, 7 insertions(+), 7 deletions(-)
>
> diff --git a/drivers/net/ethernet/mellanox/mlx4/icm.c b/drivers/net/ethernet/mellanox/mlx4/icm.c
> index a822f7a..ccb62b8 100644
> --- a/drivers/net/ethernet/mellanox/mlx4/icm.c
> +++ b/drivers/net/ethernet/mellanox/mlx4/icm.c
> @@ -43,12 +43,12 @@
> #include "fw.h"
>
> /*
> - * We allocate in as big chunks as we can, up to a maximum of 256 KB
> - * per chunk.
> + * We allocate in page size (default 4KB on many archs) chunks to avoid high
> + * order memory allocations in fragmented/high usage memory situation.
> */
> enum {
> - MLX4_ICM_ALLOC_SIZE = 1 << 18,
> - MLX4_TABLE_CHUNK_SIZE = 1 << 18
> + MLX4_ICM_ALLOC_SIZE = 1 << PAGE_SHIFT,
> + MLX4_TABLE_CHUNK_SIZE = 1 << PAGE_SHIFT
Which is actually PAGE_SIZE.
Also, please add a comma at the end of the last entry.
> };
>
> static void mlx4_free_icm_pages(struct mlx4_dev *dev, struct mlx4_icm_chunk *chunk)
> @@ -400,7 +400,7 @@ int mlx4_init_icm_table(struct mlx4_dev *dev, struct mlx4_icm_table *table,
> obj_per_chunk = MLX4_TABLE_CHUNK_SIZE / obj_size;
> num_icm = (nobj + obj_per_chunk - 1) / obj_per_chunk;
>
> - table->icm = kcalloc(num_icm, sizeof(*table->icm), GFP_KERNEL);
> + table->icm = vzalloc(num_icm * sizeof(*table->icm));
Why not kvzalloc ?
> if (!table->icm)
> return -ENOMEM;
> table->virt = virt;
> @@ -446,7 +446,7 @@ int mlx4_init_icm_table(struct mlx4_dev *dev, struct mlx4_icm_table *table,
> mlx4_free_icm(dev, table->icm[i], use_coherent);
> }
>
> - kfree(table->icm);
> + vfree(table->icm);
>
> return -ENOMEM;
> }
> @@ -462,5 +462,5 @@ void mlx4_cleanup_icm_table(struct mlx4_dev *dev, struct mlx4_icm_table *table)
> mlx4_free_icm(dev, table->icm[i], table->coherent);
> }
>
> - kfree(table->icm);
> + vfree(table->icm);
> }
>
Thanks for your patch.
I need to verify there is no dramatic performance degradation here.
You can prepare and send a v3 in the meanwhile.
Thanks,
Tariq
On 5/13/2018 2:00 AM, Tariq Toukan wrote:
>
>
> On 11/05/2018 10:23 PM, Qing Huang wrote:
>> When a system is under memory presure (high usage with fragments),
>> the original 256KB ICM chunk allocations will likely trigger kernel
>> memory management to enter slow path doing memory compact/migration
>> ops in order to complete high order memory allocations.
>>
>> When that happens, user processes calling uverb APIs may get stuck
>> for more than 120s easily even though there are a lot of free pages
>> in smaller chunks available in the system.
>>
>> Syslog:
>> ...
>> Dec 10 09:04:51 slcc03db02 kernel: [397078.572732] INFO: task
>> oracle_205573_e:205573 blocked for more than 120 seconds.
>> ...
>>
>> With 4KB ICM chunk size on x86_64 arch, the above issue is fixed.
>>
>> However in order to support smaller ICM chunk size, we need to fix
>> another issue in large size kcalloc allocations.
>>
>> E.g.
>> Setting log_num_mtt=30 requires 1G mtt entries. With the 4KB ICM chunk
>> size, each ICM chunk can only hold 512 mtt entries (8 bytes for each mtt
>> entry). So we need a 16MB allocation for a table->icm pointer array to
>> hold 2M pointers which can easily cause kcalloc to fail.
>>
>> The solution is to use vzalloc to replace kcalloc. There is no need
>> for contiguous memory pages for a driver meta data structure (no need
>> of DMA ops).
>>
>> Signed-off-by: Qing Huang <[email protected]>
>> Acked-by: Daniel Jurgens <[email protected]>
>> Reviewed-by: Zhu Yanjun <[email protected]>
>> ---
>> v2 -> v1: adjusted chunk size to reflect different architectures.
>>
>> drivers/net/ethernet/mellanox/mlx4/icm.c | 14 +++++++-------
>> 1 file changed, 7 insertions(+), 7 deletions(-)
>>
>> diff --git a/drivers/net/ethernet/mellanox/mlx4/icm.c
>> b/drivers/net/ethernet/mellanox/mlx4/icm.c
>> index a822f7a..ccb62b8 100644
>> --- a/drivers/net/ethernet/mellanox/mlx4/icm.c
>> +++ b/drivers/net/ethernet/mellanox/mlx4/icm.c
>> @@ -43,12 +43,12 @@
>> #include "fw.h"
>> /*
>> - * We allocate in as big chunks as we can, up to a maximum of 256 KB
>> - * per chunk.
>> + * We allocate in page size (default 4KB on many archs) chunks to
>> avoid high
>> + * order memory allocations in fragmented/high usage memory situation.
>> */
>> enum {
>> - MLX4_ICM_ALLOC_SIZE = 1 << 18,
>> - MLX4_TABLE_CHUNK_SIZE = 1 << 18
>> + MLX4_ICM_ALLOC_SIZE = 1 << PAGE_SHIFT,
>> + MLX4_TABLE_CHUNK_SIZE = 1 << PAGE_SHIFT
>
> Which is actually PAGE_SIZE.
Yes, we wanted to avoid high order memory allocations.
> Also, please add a comma at the end of the last entry.
Hmm..., followed the existing code style and checkpatch.pl didn't
complain about the comma.
>
>> };
>> static void mlx4_free_icm_pages(struct mlx4_dev *dev, struct
>> mlx4_icm_chunk *chunk)
>> @@ -400,7 +400,7 @@ int mlx4_init_icm_table(struct mlx4_dev *dev,
>> struct mlx4_icm_table *table,
>> obj_per_chunk = MLX4_TABLE_CHUNK_SIZE / obj_size;
>> num_icm = (nobj + obj_per_chunk - 1) / obj_per_chunk;
>> - table->icm = kcalloc(num_icm, sizeof(*table->icm),
>> GFP_KERNEL);
>> + table->icm = vzalloc(num_icm * sizeof(*table->icm));
>
> Why not kvzalloc ?
I think table->icm pointer array doesn't really need physically
contiguous memory. Sometimes high order
memory allocation by kmalloc variants may trigger slow path and cause
tasks to be blocked.
Thanks,
Qing
>
>> if (!table->icm)
>> return -ENOMEM;
>> table->virt = virt;
>> @@ -446,7 +446,7 @@ int mlx4_init_icm_table(struct mlx4_dev *dev,
>> struct mlx4_icm_table *table,
>> mlx4_free_icm(dev, table->icm[i], use_coherent);
>> }
>> - kfree(table->icm);
>> + vfree(table->icm);
>> return -ENOMEM;
>> }
>> @@ -462,5 +462,5 @@ void mlx4_cleanup_icm_table(struct mlx4_dev *dev,
>> struct mlx4_icm_table *table)
>> mlx4_free_icm(dev, table->icm[i], table->coherent);
>> }
>> - kfree(table->icm);
>> + vfree(table->icm);
>> }
>>
>
> Thanks for your patch.
>
> I need to verify there is no dramatic performance degradation here.
> You can prepare and send a v3 in the meanwhile.
>
> Thanks,
> Tariq
> --
> To unsubscribe from this list: send the line "unsubscribe linux-rdma" in
> the body of a message to [email protected]
> More majordomo info at http://vger.kernel.org/majordomo-info.html
On 14/05/2018 7:41 PM, Qing Huang wrote:
>
>
> On 5/13/2018 2:00 AM, Tariq Toukan wrote:
>>
>>
>> On 11/05/2018 10:23 PM, Qing Huang wrote:
>>> When a system is under memory presure (high usage with fragments),
>>> the original 256KB ICM chunk allocations will likely trigger kernel
>>> memory management to enter slow path doing memory compact/migration
>>> ops in order to complete high order memory allocations.
>>>
>>> When that happens, user processes calling uverb APIs may get stuck
>>> for more than 120s easily even though there are a lot of free pages
>>> in smaller chunks available in the system.
>>>
>>> Syslog:
>>> ...
>>> Dec 10 09:04:51 slcc03db02 kernel: [397078.572732] INFO: task
>>> oracle_205573_e:205573 blocked for more than 120 seconds.
>>> ...
>>>
>>> With 4KB ICM chunk size on x86_64 arch, the above issue is fixed.
>>>
>>> However in order to support smaller ICM chunk size, we need to fix
>>> another issue in large size kcalloc allocations.
>>>
>>> E.g.
>>> Setting log_num_mtt=30 requires 1G mtt entries. With the 4KB ICM chunk
>>> size, each ICM chunk can only hold 512 mtt entries (8 bytes for each mtt
>>> entry). So we need a 16MB allocation for a table->icm pointer array to
>>> hold 2M pointers which can easily cause kcalloc to fail.
>>>
>>> The solution is to use vzalloc to replace kcalloc. There is no need
>>> for contiguous memory pages for a driver meta data structure (no need
>>> of DMA ops).
>>>
>>> Signed-off-by: Qing Huang <[email protected]>
>>> Acked-by: Daniel Jurgens <[email protected]>
>>> Reviewed-by: Zhu Yanjun <[email protected]>
>>> ---
>>> v2 -> v1: adjusted chunk size to reflect different architectures.
>>>
>>> drivers/net/ethernet/mellanox/mlx4/icm.c | 14 +++++++-------
>>> 1 file changed, 7 insertions(+), 7 deletions(-)
>>>
>>> diff --git a/drivers/net/ethernet/mellanox/mlx4/icm.c
>>> b/drivers/net/ethernet/mellanox/mlx4/icm.c
>>> index a822f7a..ccb62b8 100644
>>> --- a/drivers/net/ethernet/mellanox/mlx4/icm.c
>>> +++ b/drivers/net/ethernet/mellanox/mlx4/icm.c
>>> @@ -43,12 +43,12 @@
>>> #include "fw.h"
>>> /*
>>> - * We allocate in as big chunks as we can, up to a maximum of 256 KB
>>> - * per chunk.
>>> + * We allocate in page size (default 4KB on many archs) chunks to
>>> avoid high
>>> + * order memory allocations in fragmented/high usage memory situation.
>>> */
>>> enum {
>>> - MLX4_ICM_ALLOC_SIZE = 1 << 18,
>>> - MLX4_TABLE_CHUNK_SIZE = 1 << 18
>>> + MLX4_ICM_ALLOC_SIZE = 1 << PAGE_SHIFT,
>>> + MLX4_TABLE_CHUNK_SIZE = 1 << PAGE_SHIFT
>>
>> Which is actually PAGE_SIZE.
>
> Yes, we wanted to avoid high order memory allocations.
>
Then please use PAGE_SIZE instead.
>> Also, please add a comma at the end of the last entry.
>
> Hmm..., followed the existing code style and checkpatch.pl didn't
> complain about the comma.
>
I am in favor of having a comma also after the last element, so that
when another enum element is added we do not modify this line again,
which would falsely affect git blame.
I know it didn't exist before your patch, but once we're here, let's do it.
>>
>>> };
>>> static void mlx4_free_icm_pages(struct mlx4_dev *dev, struct
>>> mlx4_icm_chunk *chunk)
>>> @@ -400,7 +400,7 @@ int mlx4_init_icm_table(struct mlx4_dev *dev,
>>> struct mlx4_icm_table *table,
>>> obj_per_chunk = MLX4_TABLE_CHUNK_SIZE / obj_size;
>>> num_icm = (nobj + obj_per_chunk - 1) / obj_per_chunk;
>>> - table->icm = kcalloc(num_icm, sizeof(*table->icm),
>>> GFP_KERNEL);
>>> + table->icm = vzalloc(num_icm * sizeof(*table->icm));
>>
>> Why not kvzalloc ?
>
> I think table->icm pointer array doesn't really need physically
> contiguous memory. Sometimes high order
> memory allocation by kmalloc variants may trigger slow path and cause
> tasks to be blocked.
>
This is control path so it is less latency-sensitive.
Let's not produce unnecessary degradation here, please call kvzalloc so
we maintain a similar behavior when contiguous memory is available, and
a fallback for resiliency.
> Thanks,
> Qing
>
>>
>>> if (!table->icm)
>>> return -ENOMEM;
>>> table->virt = virt;
>>> @@ -446,7 +446,7 @@ int mlx4_init_icm_table(struct mlx4_dev *dev,
>>> struct mlx4_icm_table *table,
>>> mlx4_free_icm(dev, table->icm[i], use_coherent);
>>> }
>>> - kfree(table->icm);
>>> + vfree(table->icm);
>>> return -ENOMEM;
>>> }
>>> @@ -462,5 +462,5 @@ void mlx4_cleanup_icm_table(struct mlx4_dev *dev,
>>> struct mlx4_icm_table *table)
>>> mlx4_free_icm(dev, table->icm[i], table->coherent);
>>> }
>>> - kfree(table->icm);
>>> + vfree(table->icm);
>>> }
>>>
>>
>> Thanks for your patch.
>>
>> I need to verify there is no dramatic performance degradation here.
>> You can prepare and send a v3 in the meanwhile.
>>
>> Thanks,
>> Tariq
>> --
>> To unsubscribe from this list: send the line "unsubscribe linux-rdma" in
>> the body of a message to [email protected]
>> More majordomo info at http://vger.kernel.org/majordomo-info.html
>
On 5/15/2018 2:19 AM, Tariq Toukan wrote:
>
>
> On 14/05/2018 7:41 PM, Qing Huang wrote:
>>
>>
>> On 5/13/2018 2:00 AM, Tariq Toukan wrote:
>>>
>>>
>>> On 11/05/2018 10:23 PM, Qing Huang wrote:
>>>> When a system is under memory presure (high usage with fragments),
>>>> the original 256KB ICM chunk allocations will likely trigger kernel
>>>> memory management to enter slow path doing memory compact/migration
>>>> ops in order to complete high order memory allocations.
>>>>
>>>> When that happens, user processes calling uverb APIs may get stuck
>>>> for more than 120s easily even though there are a lot of free pages
>>>> in smaller chunks available in the system.
>>>>
>>>> Syslog:
>>>> ...
>>>> Dec 10 09:04:51 slcc03db02 kernel: [397078.572732] INFO: task
>>>> oracle_205573_e:205573 blocked for more than 120 seconds.
>>>> ...
>>>>
>>>> With 4KB ICM chunk size on x86_64 arch, the above issue is fixed.
>>>>
>>>> However in order to support smaller ICM chunk size, we need to fix
>>>> another issue in large size kcalloc allocations.
>>>>
>>>> E.g.
>>>> Setting log_num_mtt=30 requires 1G mtt entries. With the 4KB ICM chunk
>>>> size, each ICM chunk can only hold 512 mtt entries (8 bytes for
>>>> each mtt
>>>> entry). So we need a 16MB allocation for a table->icm pointer array to
>>>> hold 2M pointers which can easily cause kcalloc to fail.
>>>>
>>>> The solution is to use vzalloc to replace kcalloc. There is no need
>>>> for contiguous memory pages for a driver meta data structure (no need
>>>> of DMA ops).
>>>>
>>>> Signed-off-by: Qing Huang <[email protected]>
>>>> Acked-by: Daniel Jurgens <[email protected]>
>>>> Reviewed-by: Zhu Yanjun <[email protected]>
>>>> ---
>>>> v2 -> v1: adjusted chunk size to reflect different architectures.
>>>>
>>>> drivers/net/ethernet/mellanox/mlx4/icm.c | 14 +++++++-------
>>>> 1 file changed, 7 insertions(+), 7 deletions(-)
>>>>
>>>> diff --git a/drivers/net/ethernet/mellanox/mlx4/icm.c
>>>> b/drivers/net/ethernet/mellanox/mlx4/icm.c
>>>> index a822f7a..ccb62b8 100644
>>>> --- a/drivers/net/ethernet/mellanox/mlx4/icm.c
>>>> +++ b/drivers/net/ethernet/mellanox/mlx4/icm.c
>>>> @@ -43,12 +43,12 @@
>>>> #include "fw.h"
>>>> /*
>>>> - * We allocate in as big chunks as we can, up to a maximum of 256 KB
>>>> - * per chunk.
>>>> + * We allocate in page size (default 4KB on many archs) chunks to
>>>> avoid high
>>>> + * order memory allocations in fragmented/high usage memory
>>>> situation.
>>>> */
>>>> enum {
>>>> - MLX4_ICM_ALLOC_SIZE = 1 << 18,
>>>> - MLX4_TABLE_CHUNK_SIZE = 1 << 18
>>>> + MLX4_ICM_ALLOC_SIZE = 1 << PAGE_SHIFT,
>>>> + MLX4_TABLE_CHUNK_SIZE = 1 << PAGE_SHIFT
>>>
>>> Which is actually PAGE_SIZE.
>>
>> Yes, we wanted to avoid high order memory allocations.
>>
>
> Then please use PAGE_SIZE instead.
PAGE_SIZE is usually defined as 1 << PAGE_SHIFT. So I think PAGE_SHIFT
is actually more appropriate here.
>
>>> Also, please add a comma at the end of the last entry.
>>
>> Hmm..., followed the existing code style and checkpatch.pl didn't
>> complain about the comma.
>>
>
> I am in favor of having a comma also after the last element, so that
> when another enum element is added we do not modify this line again,
> which would falsely affect git blame.
>
> I know it didn't exist before your patch, but once we're here, let's
> do it.
I'm okay either way. If adding an extra comma is preferred by many
people, someone should update checkpatch.pl to enforce it. :)
>
>>>
>>>> };
>>>> static void mlx4_free_icm_pages(struct mlx4_dev *dev, struct
>>>> mlx4_icm_chunk *chunk)
>>>> @@ -400,7 +400,7 @@ int mlx4_init_icm_table(struct mlx4_dev *dev,
>>>> struct mlx4_icm_table *table,
>>>> obj_per_chunk = MLX4_TABLE_CHUNK_SIZE / obj_size;
>>>> num_icm = (nobj + obj_per_chunk - 1) / obj_per_chunk;
>>>> - table->icm = kcalloc(num_icm, sizeof(*table->icm),
>>>> GFP_KERNEL);
>>>> + table->icm = vzalloc(num_icm * sizeof(*table->icm));
>>>
>>> Why not kvzalloc ?
>>
>> I think table->icm pointer array doesn't really need physically
>> contiguous memory. Sometimes high order
>> memory allocation by kmalloc variants may trigger slow path and cause
>> tasks to be blocked.
>>
>
> This is control path so it is less latency-sensitive.
> Let's not produce unnecessary degradation here, please call kvzalloc
> so we maintain a similar behavior when contiguous memory is available,
> and a fallback for resiliency.
Not sure what degradation exactly is caused by vzalloc here. I think it's
better to leave physically contiguous pages
for other requests which really need them. Besides, the slow path/mem
compacting can be really expensive.
>
>> Thanks,
>> Qing
>>
>>>
>>>> if (!table->icm)
>>>> return -ENOMEM;
>>>> table->virt = virt;
>>>> @@ -446,7 +446,7 @@ int mlx4_init_icm_table(struct mlx4_dev *dev,
>>>> struct mlx4_icm_table *table,
>>>> mlx4_free_icm(dev, table->icm[i], use_coherent);
>>>> }
>>>> - kfree(table->icm);
>>>> + vfree(table->icm);
>>>> return -ENOMEM;
>>>> }
>>>> @@ -462,5 +462,5 @@ void mlx4_cleanup_icm_table(struct mlx4_dev
>>>> *dev, struct mlx4_icm_table *table)
>>>> mlx4_free_icm(dev, table->icm[i], table->coherent);
>>>> }
>>>> - kfree(table->icm);
>>>> + vfree(table->icm);
>>>> }
>>>>
>>>
>>> Thanks for your patch.
>>>
>>> I need to verify there is no dramatic performance degradation here.
>>> You can prepare and send a v3 in the meanwhile.
>>>
>>> Thanks,
>>> Tariq
>>> --
>>> To unsubscribe from this list: send the line "unsubscribe
>>> linux-rdma" in
>>> the body of a message to [email protected]
>>> More majordomo info at http://vger.kernel.org/majordomo-info.html
>>
> --
> To unsubscribe from this list: send the line "unsubscribe linux-rdma" in
> the body of a message to [email protected]
> More majordomo info at http://vger.kernel.org/majordomo-info.html
On 05/15/2018 11:53 AM, Qing Huang wrote:
>
>> This is control path so it is less latency-sensitive.
>> Let's not produce unnecessary degradation here, please call kvzalloc so we maintain a similar behavior when contiguous memory is available, and a fallback for resiliency.
>
> No sure what exactly degradation is caused by vzalloc here. I think it's better to keep physically contiguous pages
> to other requests which really need them. Besides slow path/mem compacting can be really expensive.
>
Just use kvzalloc(), and you get the benefit of having contiguous memory if available,
without expensive compact phase.
This thing _automatically_ falls back to vmalloc(), thus your problem will be solved.
If you are not sure, trust others.
On 5/15/2018 12:08 PM, Eric Dumazet wrote:
>
> On 05/15/2018 11:53 AM, Qing Huang wrote:
>>> This is control path so it is less latency-sensitive.
>>> Let's not produce unnecessary degradation here, please call kvzalloc so we maintain a similar behavior when contiguous memory is available, and a fallback for resiliency.
>> No sure what exactly degradation is caused by vzalloc here. I think it's better to keep physically contiguous pages
>> to other requests which really need them. Besides slow path/mem compacting can be really expensive.
>>
> Just use kvzalloc(), and you get the benefit of having contiguous memory if available,
> without expensive compact phase.
>
> This thing _automatically_ falls back to vmalloc(), thus your problem will be solved.
>
> If you are not sure, trust others.
Thanks for the review. There are many places in the kernel and in applications
where physically contiguous pages are needed.
We saw quite a few issues when there was not enough contiguous physical memory
available. My main concern here is: why
use physically contiguous pages when they are not really needed?
On 15/05/2018 9:53 PM, Qing Huang wrote:
>
>
> On 5/15/2018 2:19 AM, Tariq Toukan wrote:
>>
>>
>> On 14/05/2018 7:41 PM, Qing Huang wrote:
>>>
>>>
>>> On 5/13/2018 2:00 AM, Tariq Toukan wrote:
>>>>
>>>>
>>>> On 11/05/2018 10:23 PM, Qing Huang wrote:
>>>>> When a system is under memory presure (high usage with fragments),
>>>>> the original 256KB ICM chunk allocations will likely trigger kernel
>>>>> memory management to enter slow path doing memory compact/migration
>>>>> ops in order to complete high order memory allocations.
>>>>>
>>>>> When that happens, user processes calling uverb APIs may get stuck
>>>>> for more than 120s easily even though there are a lot of free pages
>>>>> in smaller chunks available in the system.
>>>>>
>>>>> Syslog:
>>>>> ...
>>>>> Dec 10 09:04:51 slcc03db02 kernel: [397078.572732] INFO: task
>>>>> oracle_205573_e:205573 blocked for more than 120 seconds.
>>>>> ...
>>>>>
>>>>> With 4KB ICM chunk size on x86_64 arch, the above issue is fixed.
>>>>>
>>>>> However in order to support smaller ICM chunk size, we need to fix
>>>>> another issue in large size kcalloc allocations.
>>>>>
>>>>> E.g.
>>>>> Setting log_num_mtt=30 requires 1G mtt entries. With the 4KB ICM chunk
>>>>> size, each ICM chunk can only hold 512 mtt entries (8 bytes for
>>>>> each mtt
>>>>> entry). So we need a 16MB allocation for a table->icm pointer array to
>>>>> hold 2M pointers which can easily cause kcalloc to fail.
>>>>>
>>>>> The solution is to use vzalloc to replace kcalloc. There is no need
>>>>> for contiguous memory pages for a driver meta data structure (no need
>>>>> of DMA ops).
>>>>>
>>>>> Signed-off-by: Qing Huang <[email protected]>
>>>>> Acked-by: Daniel Jurgens <[email protected]>
>>>>> Reviewed-by: Zhu Yanjun <[email protected]>
>>>>> ---
>>>>> v2 -> v1: adjusted chunk size to reflect different architectures.
>>>>>
>>>>> drivers/net/ethernet/mellanox/mlx4/icm.c | 14 +++++++-------
>>>>> 1 file changed, 7 insertions(+), 7 deletions(-)
>>>>>
>>>>> diff --git a/drivers/net/ethernet/mellanox/mlx4/icm.c
>>>>> b/drivers/net/ethernet/mellanox/mlx4/icm.c
>>>>> index a822f7a..ccb62b8 100644
>>>>> --- a/drivers/net/ethernet/mellanox/mlx4/icm.c
>>>>> +++ b/drivers/net/ethernet/mellanox/mlx4/icm.c
>>>>> @@ -43,12 +43,12 @@
>>>>> #include "fw.h"
>>>>> /*
>>>>> - * We allocate in as big chunks as we can, up to a maximum of 256 KB
>>>>> - * per chunk.
>>>>> + * We allocate in page size (default 4KB on many archs) chunks to
>>>>> avoid high
>>>>> + * order memory allocations in fragmented/high usage memory
>>>>> situation.
>>>>> */
>>>>> enum {
>>>>> - MLX4_ICM_ALLOC_SIZE = 1 << 18,
>>>>> - MLX4_TABLE_CHUNK_SIZE = 1 << 18
>>>>> + MLX4_ICM_ALLOC_SIZE = 1 << PAGE_SHIFT,
>>>>> + MLX4_TABLE_CHUNK_SIZE = 1 << PAGE_SHIFT
>>>>
>>>> Which is actually PAGE_SIZE.
>>>
>>> Yes, we wanted to avoid high order memory allocations.
>>>
>>
>> Then please use PAGE_SIZE instead.
>
> PAGE_SIZE is usually defined as 1 << PAGE_SHIFT. So I think PAGE_SHIFT
> is actually more appropriate here.
>
Definition of PAGE_SIZE varies among different archs.
It is not always as simple as 1 << PAGE_SHIFT.
It might be:
PAGE_SIZE (1UL << PAGE_SHIFT)
PAGE_SIZE (_AC(1, UL) << PAGE_SHIFT)
etc...
Please replace 1 << PAGE_SHIFT with PAGE_SIZE.
>
>>
>>>> Also, please add a comma at the end of the last entry.
>>>
>>> Hmm..., followed the existing code style and checkpatch.pl didn't
>>> complain about the comma.
>>>
>>
>> I am in favor of having a comma also after the last element, so that
>> when another enum element is added we do not modify this line again,
>> which would falsely affect git blame.
>>
>> I know it didn't exist before your patch, but once we're here, let's
>> do it.
>
> I'm okay either way. If adding an extra comma is preferred by many
> people, someone should update checkpatch.pl to enforce it. :)
>
I agree.
Until then, please use an extra comma in this patch.
>>
>>>>
>>>>> };
>>>>> static void mlx4_free_icm_pages(struct mlx4_dev *dev, struct
>>>>> mlx4_icm_chunk *chunk)
>>>>> @@ -400,7 +400,7 @@ int mlx4_init_icm_table(struct mlx4_dev *dev,
>>>>> struct mlx4_icm_table *table,
>>>>> obj_per_chunk = MLX4_TABLE_CHUNK_SIZE / obj_size;
>>>>> num_icm = (nobj + obj_per_chunk - 1) / obj_per_chunk;
>>>>> - table->icm = kcalloc(num_icm, sizeof(*table->icm),
>>>>> GFP_KERNEL);
>>>>> + table->icm = vzalloc(num_icm * sizeof(*table->icm));
>>>>
>>>> Why not kvzalloc ?
>>>
>>> I think table->icm pointer array doesn't really need physically
>>> contiguous memory. Sometimes high order
>>> memory allocation by kmalloc variants may trigger slow path and cause
>>> tasks to be blocked.
>>>
>>
>> This is control path so it is less latency-sensitive.
>> Let's not produce unnecessary degradation here, please call kvzalloc
>> so we maintain a similar behavior when contiguous memory is available,
>> and a fallback for resiliency.
>
> No sure what exactly degradation is caused by vzalloc here. I think it's
> better to keep physically contiguous pages
> to other requests which really need them. Besides slow path/mem
> compacting can be really expensive.
>
Degradation is expected when you replace a contig memory with non-contig
memory, without any perf test.
We agree that when contig memory is not available, we should use
non-contig instead of simply failing, and for this you can call kvzalloc.
>>
>>> Thanks,
>>> Qing
>>>
>>>>
>>>>> if (!table->icm)
>>>>> return -ENOMEM;
>>>>> table->virt = virt;
>>>>> @@ -446,7 +446,7 @@ int mlx4_init_icm_table(struct mlx4_dev *dev,
>>>>> struct mlx4_icm_table *table,
>>>>> mlx4_free_icm(dev, table->icm[i], use_coherent);
>>>>> }
>>>>> - kfree(table->icm);
>>>>> + vfree(table->icm);
>>>>> return -ENOMEM;
>>>>> }
>>>>> @@ -462,5 +462,5 @@ void mlx4_cleanup_icm_table(struct mlx4_dev
>>>>> *dev, struct mlx4_icm_table *table)
>>>>> mlx4_free_icm(dev, table->icm[i], table->coherent);
>>>>> }
>>>>> - kfree(table->icm);
>>>>> + vfree(table->icm);
>>>>> }
>>>>>
>>>>
>>>> Thanks for your patch.
>>>>
>>>> I need to verify there is no dramatic performance degradation here.
>>>> You can prepare and send a v3 in the meanwhile.
>>>>
>>>> Thanks,
>>>> Tariq
>>>> --
>>>> To unsubscribe from this list: send the line "unsubscribe
>>>> linux-rdma" in
>>>> the body of a message to [email protected]
>>>> More majordomo info at http://vger.kernel.org/majordomo-info.html
>>>
>> --
>> To unsubscribe from this list: send the line "unsubscribe linux-rdma" in
>> the body of a message to [email protected]
>> More majordomo info at http://vger.kernel.org/majordomo-info.html
>
On Wed, May 16, 2018 at 9:04 AM, Tariq Toukan <[email protected]> wrote:
>
>
> On 15/05/2018 9:53 PM, Qing Huang wrote:
>>
>>
>>
>> On 5/15/2018 2:19 AM, Tariq Toukan wrote:
>>>
>>>
>>>
>>> On 14/05/2018 7:41 PM, Qing Huang wrote:
>>>>
>>>>
>>>>
>>>> On 5/13/2018 2:00 AM, Tariq Toukan wrote:
>>>>>
>>>>>
>>>>>
>>>>> On 11/05/2018 10:23 PM, Qing Huang wrote:
>>>>>>
>>>>>> When a system is under memory presure (high usage with fragments),
>>>>>> the original 256KB ICM chunk allocations will likely trigger kernel
>>>>>> memory management to enter slow path doing memory compact/migration
>>>>>> ops in order to complete high order memory allocations.
>>>>>>
>>>>>> When that happens, user processes calling uverb APIs may get stuck
>>>>>> for more than 120s easily even though there are a lot of free pages
>>>>>> in smaller chunks available in the system.
>>>>>>
>>>>>> Syslog:
>>>>>> ...
>>>>>> Dec 10 09:04:51 slcc03db02 kernel: [397078.572732] INFO: task
>>>>>> oracle_205573_e:205573 blocked for more than 120 seconds.
>>>>>> ...
>>>>>>
>>>>>> With 4KB ICM chunk size on x86_64 arch, the above issue is fixed.
>>>>>>
>>>>>> However in order to support smaller ICM chunk size, we need to fix
>>>>>> another issue in large size kcalloc allocations.
>>>>>>
>>>>>> E.g.
>>>>>> Setting log_num_mtt=30 requires 1G mtt entries. With the 4KB ICM chunk
>>>>>> size, each ICM chunk can only hold 512 mtt entries (8 bytes for each
>>>>>> mtt
>>>>>> entry). So we need a 16MB allocation for a table->icm pointer array to
>>>>>> hold 2M pointers which can easily cause kcalloc to fail.
>>>>>>
>>>>>> The solution is to use vzalloc to replace kcalloc. There is no need
>>>>>> for contiguous memory pages for a driver meta data structure (no need
>>>>>> of DMA ops).
>>>>>>
>>>>>> Signed-off-by: Qing Huang <[email protected]>
>>>>>> Acked-by: Daniel Jurgens <[email protected]>
>>>>>> Reviewed-by: Zhu Yanjun <[email protected]>
>>>>>> ---
>>>>>> v2 -> v1: adjusted chunk size to reflect different architectures.
>>>>>>
>>>>>> drivers/net/ethernet/mellanox/mlx4/icm.c | 14 +++++++-------
>>>>>> 1 file changed, 7 insertions(+), 7 deletions(-)
>>>>>>
>>>>>> diff --git a/drivers/net/ethernet/mellanox/mlx4/icm.c
>>>>>> b/drivers/net/ethernet/mellanox/mlx4/icm.c
>>>>>> index a822f7a..ccb62b8 100644
>>>>>> --- a/drivers/net/ethernet/mellanox/mlx4/icm.c
>>>>>> +++ b/drivers/net/ethernet/mellanox/mlx4/icm.c
>>>>>> @@ -43,12 +43,12 @@
>>>>>> #include "fw.h"
>>>>>> /*
>>>>>> - * We allocate in as big chunks as we can, up to a maximum of 256 KB
>>>>>> - * per chunk.
>>>>>> + * We allocate in page size (default 4KB on many archs) chunks to
>>>>>> avoid high
>>>>>> + * order memory allocations in fragmented/high usage memory
>>>>>> situations.
>>>>>> */
>>>>>> enum {
>>>>>> - MLX4_ICM_ALLOC_SIZE = 1 << 18,
>>>>>> - MLX4_TABLE_CHUNK_SIZE = 1 << 18
>>>>>> + MLX4_ICM_ALLOC_SIZE = 1 << PAGE_SHIFT,
>>>>>> + MLX4_TABLE_CHUNK_SIZE = 1 << PAGE_SHIFT
>>>>>
>>>>>
>>>>> Which is actually PAGE_SIZE.
>>>>
>>>>
>>>> Yes, we wanted to avoid high order memory allocations.
>>>>
>>>
>>> Then please use PAGE_SIZE instead.
>>
>>
>> PAGE_SIZE is usually defined as 1 << PAGE_SHIFT. So I think PAGE_SHIFT is
>> actually more appropriate here.
>>
>
> Definition of PAGE_SIZE varies among different archs.
> It is not always as simple as 1 << PAGE_SHIFT.
> It might be:
> PAGE_SIZE (1UL << PAGE_SHIFT)
> PAGE_SIZE (_AC(1, UL) << PAGE_SHIFT)
> etc...
>
> Please replace 1 << PAGE_SHIFT with PAGE_SIZE.
>
>>
>>>
>>>>> Also, please add a comma at the end of the last entry.
>>>>
>>>>
>>>> Hmm..., followed the existing code style and checkpatch.pl didn't
>>>> complain about the comma.
>>>>
>>>
>>> I am in favor of having a comma also after the last element, so that when
>>> another enum element is added we do not modify this line again, which would
>>> falsely affect git blame.
>>>
>>> I know it didn't exist before your patch, but once we're here, let's do
>>> it.
>>
>>
>> I'm okay either way. If adding an extra comma is preferred by many people,
>> someone should update checkpatch.pl to enforce it. :)
>>
> I agree.
> Until then, please use an extra comma in this patch.
>
>>>
>>>>>
>>>>>> };
>>>>>> static void mlx4_free_icm_pages(struct mlx4_dev *dev, struct
>>>>>> mlx4_icm_chunk *chunk)
>>>>>> @@ -400,7 +400,7 @@ int mlx4_init_icm_table(struct mlx4_dev *dev,
>>>>>> struct mlx4_icm_table *table,
>>>>>> obj_per_chunk = MLX4_TABLE_CHUNK_SIZE / obj_size;
>>>>>> num_icm = (nobj + obj_per_chunk - 1) / obj_per_chunk;
>>>>>> - table->icm = kcalloc(num_icm, sizeof(*table->icm),
>>>>>> GFP_KERNEL);
>>>>>> + table->icm = vzalloc(num_icm * sizeof(*table->icm));
>>>>>
>>>>>
>>>>> Why not kvzalloc ?
>>>>
>>>>
>>>> I think table->icm pointer array doesn't really need physically
>>>> contiguous memory. Sometimes high order
>>>> memory allocation by kmalloc variants may trigger slow path and cause
>>>> tasks to be blocked.
>>>>
>>>
>>> This is control path so it is less latency-sensitive.
>>> Let's not produce unnecessary degradation here, please call kvzalloc so
>>> we maintain a similar behavior when contiguous memory is available, and a
>>> fallback for resiliency.
>>
>>
>> Not sure exactly what degradation is caused by vzalloc here. I think it's
>> better to leave physically contiguous pages
>> for other requests which really need them. Besides, the slow path/memory
>> compaction can be really expensive.
>>
>
> Degradation is expected when you replace a contig memory with non-contig
> memory, without any perf test.
> We agree that when contig memory is not available, we should use non-contig
> instead of simply failing, and for this you can call kvzalloc.
The expected degradation would be small if the data is not very
performance sensitive.
I think vmalloc would be better in general case.
Even if the server has hundreds of gigabytes of memory, even a 1MB
contiguous region is often rare.
For example, I attached /proc/pagetypeinfo of my server running for 153 days.
The largest contiguous memory is 2^7=512KB.
Node 0, zone Normal, type Unmovable 4499 9418 4817
732 747 567 18 3 0 0 0
Node 0, zone Normal, type Movable 38179 40839 10546
1888 491 51 1 0 0 0 0
Node 0, zone Normal, type Reclaimable 117 98 1279
35 21 10 8 0 0 0 0
Node 0, zone Normal, type HighAtomic 0 0 0
0 0 0 0 0 0 0 0
Node 0, zone Normal, type Isolate 0 0 0
0 0 0 0 0 0 0 0
So I think vmalloc would be good if it is not very performance critical data.
--
GIOH KIM
Linux Kernel Entwickler
ProfitBricks GmbH
Greifswalder Str. 207
D - 10405 Berlin
Tel: +49 176 2697 8962
Fax: +49 30 577 008 299
Email: [email protected]
URL: https://www.profitbricks.de
Sitz der Gesellschaft: Berlin
Registergericht: Amtsgericht Charlottenburg, HRB 125506 B
Geschäftsführer: Achim Weiss, Matthias Steinberg, Christoph Steffens
Hi Qing,
Thank you for the patch! Yet something to improve:
[auto build test ERROR on net-next/master]
[also build test ERROR on v4.17-rc5 next-20180517]
[if your patch is applied to the wrong git tree, please drop us a note to help improve the system]
url: https://github.com/0day-ci/linux/commits/Qing-Huang/mlx4_core-allocate-ICM-memory-in-page-size-chunks/20180512-090438
config: sparc64-allyesconfig (attached as .config)
compiler: sparc64-linux-gnu-gcc (Debian 7.2.0-11) 7.2.0
reproduce:
wget https://raw.githubusercontent.com/intel/lkp-tests/master/sbin/make.cross -O ~/bin/make.cross
chmod +x ~/bin/make.cross
# save the attached .config to linux build tree
make.cross ARCH=sparc64
All error/warnings (new ones prefixed by >>):
drivers/net//ethernet/mellanox/mlx4/icm.c: In function 'mlx4_init_icm_table':
>> drivers/net//ethernet/mellanox/mlx4/icm.c:403:20: error: implicit declaration of function 'vzalloc'; did you mean 'kzalloc'? [-Werror=implicit-function-declaration]
table->icm = vzalloc(num_icm * sizeof(*table->icm));
^~~~~~~
kzalloc
>> drivers/net//ethernet/mellanox/mlx4/icm.c:403:18: warning: assignment makes pointer from integer without a cast [-Wint-conversion]
table->icm = vzalloc(num_icm * sizeof(*table->icm));
^
>> drivers/net//ethernet/mellanox/mlx4/icm.c:449:2: error: implicit declaration of function 'vfree'; did you mean 'kfree'? [-Werror=implicit-function-declaration]
vfree(table->icm);
^~~~~
kfree
cc1: some warnings being treated as errors
vim +403 drivers/net//ethernet/mellanox/mlx4/icm.c
389
390 int mlx4_init_icm_table(struct mlx4_dev *dev, struct mlx4_icm_table *table,
391 u64 virt, int obj_size, u32 nobj, int reserved,
392 int use_lowmem, int use_coherent)
393 {
394 int obj_per_chunk;
395 int num_icm;
396 unsigned chunk_size;
397 int i;
398 u64 size;
399
400 obj_per_chunk = MLX4_TABLE_CHUNK_SIZE / obj_size;
401 num_icm = (nobj + obj_per_chunk - 1) / obj_per_chunk;
402
> 403 table->icm = vzalloc(num_icm * sizeof(*table->icm));
404 if (!table->icm)
405 return -ENOMEM;
406 table->virt = virt;
407 table->num_icm = num_icm;
408 table->num_obj = nobj;
409 table->obj_size = obj_size;
410 table->lowmem = use_lowmem;
411 table->coherent = use_coherent;
412 mutex_init(&table->mutex);
413
414 size = (u64) nobj * obj_size;
415 for (i = 0; i * MLX4_TABLE_CHUNK_SIZE < reserved * obj_size; ++i) {
416 chunk_size = MLX4_TABLE_CHUNK_SIZE;
417 if ((i + 1) * MLX4_TABLE_CHUNK_SIZE > size)
418 chunk_size = PAGE_ALIGN(size -
419 i * MLX4_TABLE_CHUNK_SIZE);
420
421 table->icm[i] = mlx4_alloc_icm(dev, chunk_size >> PAGE_SHIFT,
422 (use_lowmem ? GFP_KERNEL : GFP_HIGHUSER) |
423 __GFP_NOWARN, use_coherent);
424 if (!table->icm[i])
425 goto err;
426 if (mlx4_MAP_ICM(dev, table->icm[i], virt + i * MLX4_TABLE_CHUNK_SIZE)) {
427 mlx4_free_icm(dev, table->icm[i], use_coherent);
428 table->icm[i] = NULL;
429 goto err;
430 }
431
432 /*
433 * Add a reference to this ICM chunk so that it never
434 * gets freed (since it contains reserved firmware objects).
435 */
436 ++table->icm[i]->refcount;
437 }
438
439 return 0;
440
441 err:
442 for (i = 0; i < num_icm; ++i)
443 if (table->icm[i]) {
444 mlx4_UNMAP_ICM(dev, virt + i * MLX4_TABLE_CHUNK_SIZE,
445 MLX4_TABLE_CHUNK_SIZE / MLX4_ICM_PAGE_SIZE);
446 mlx4_free_icm(dev, table->icm[i], use_coherent);
447 }
448
> 449 vfree(table->icm);
450
451 return -ENOMEM;
452 }
453
---
0-DAY kernel test infrastructure Open Source Technology Center
https://lists.01.org/pipermail/kbuild-all Intel Corporation