mirror of
https://gitee.com/bianbu-linux/linux-6.6
synced 2025-04-26 14:17:26 -04:00
mm/vmalloc: rework the drain logic
A current "lazy drain" model suffers from at least two issues. First one is related to the unsorted list of vmap areas, thus in order to identify the [min:max] range of areas to be drained, it requires a full list scan. What is a time consuming if the list is too long. Second one and as a next step is about merging all fragments with a free space. What is also a time consuming because it has to iterate over entire list which holds outstanding lazy areas. See below the "preemptirqsoff" tracer that illustrates a high latency. It is ~24676us. Our workloads like audio and video are effected by such long latency: <snip> tracer: preemptirqsoff preemptirqsoff latency trace v1.1.5 on 4.9.186-perf+ -------------------------------------------------------------------- latency: 24676 us, #4/4, CPU#1 | (M:preempt VP:0, KP:0, SP:0 HP:0 P:8) ----------------- | task: crtc_commit:112-261 (uid:0 nice:0 policy:1 rt_prio:16) ----------------- => started at: __purge_vmap_area_lazy => ended at: __purge_vmap_area_lazy _------=> CPU# / _-----=> irqs-off | / _----=> need-resched || / _---=> hardirq/softirq ||| / _--=> preempt-depth |||| / delay cmd pid ||||| time | caller \ / ||||| \ | / crtc_com-261 1...1 1us*: _raw_spin_lock <-__purge_vmap_area_lazy [...] crtc_com-261 1...1 24675us : _raw_spin_unlock <-__purge_vmap_area_lazy crtc_com-261 1...1 24677us : trace_preempt_on <-__purge_vmap_area_lazy crtc_com-261 1...1 24683us : <stack trace> => free_vmap_area_noflush => remove_vm_area => __vunmap => vfree => drm_property_free_blob => drm_mode_object_unreference => drm_property_unreference_blob => __drm_atomic_helper_crtc_destroy_state => sde_crtc_destroy_state => drm_atomic_state_default_clear => drm_atomic_state_clear => drm_atomic_state_free => complete_commit => _msm_drm_commit_work_cb => kthread_worker_fn => kthread => ret_from_fork <snip> To address those two issues we can redesign a purging of the outstanding lazy areas. Instead of queuing vmap areas to the list, we replace it by the separate rb-tree. In hat case an area is located in the tree/list in ascending order. It will give us below advantages: a) Outstanding vmap areas are merged creating bigger coalesced blocks, thus it becomes less fragmented. b) It is possible to calculate a flush range [min:max] without scanning all elements. It is O(1) access time or complexity; c) The final merge of areas with the rb-tree that represents a free space is faster because of (a). As a result the lock contention is also reduced. Link: https://lkml.kernel.org/r/20201116220033.1837-2-urezki@gmail.com Signed-off-by: Uladzislau Rezki (Sony) <urezki@gmail.com> Cc: Hillf Danton <hdanton@sina.com> Cc: Michal Hocko <mhocko@suse.com> Cc: Matthew Wilcox <willy@infradead.org> Cc: Oleksiy Avramchenko <oleksiy.avramchenko@sonymobile.com> Cc: Steven Rostedt <rostedt@goodmis.org> Cc: Minchan Kim <minchan@kernel.org> Cc: huang ying <huang.ying.caritas@gmail.com> Signed-off-by: Andrew Morton <akpm@linux-foundation.org> Signed-off-by: Linus Torvalds <torvalds@linux-foundation.org>
This commit is contained in:
parent
8945a72306
commit
96e2db4561
2 changed files with 53 additions and 45 deletions
|
@ -72,16 +72,14 @@ struct vmap_area {
|
||||||
struct list_head list; /* address sorted list */
|
struct list_head list; /* address sorted list */
|
||||||
|
|
||||||
/*
|
/*
|
||||||
* The following three variables can be packed, because
|
* The following two variables can be packed, because
|
||||||
* a vmap_area object is always one of the three states:
|
* a vmap_area object can be either:
|
||||||
* 1) in "free" tree (root is vmap_area_root)
|
* 1) in "free" tree (root is vmap_area_root)
|
||||||
* 2) in "busy" tree (root is free_vmap_area_root)
|
* 2) or "busy" tree (root is free_vmap_area_root)
|
||||||
* 3) in purge list (head is vmap_purge_list)
|
|
||||||
*/
|
*/
|
||||||
union {
|
union {
|
||||||
unsigned long subtree_max_size; /* in "free" tree */
|
unsigned long subtree_max_size; /* in "free" tree */
|
||||||
struct vm_struct *vm; /* in "busy" tree */
|
struct vm_struct *vm; /* in "busy" tree */
|
||||||
struct llist_node purge_list; /* in purge list */
|
|
||||||
};
|
};
|
||||||
};
|
};
|
||||||
|
|
||||||
|
|
90
mm/vmalloc.c
90
mm/vmalloc.c
|
@ -413,10 +413,13 @@ static DEFINE_SPINLOCK(vmap_area_lock);
|
||||||
static DEFINE_SPINLOCK(free_vmap_area_lock);
|
static DEFINE_SPINLOCK(free_vmap_area_lock);
|
||||||
/* Export for kexec only */
|
/* Export for kexec only */
|
||||||
LIST_HEAD(vmap_area_list);
|
LIST_HEAD(vmap_area_list);
|
||||||
static LLIST_HEAD(vmap_purge_list);
|
|
||||||
static struct rb_root vmap_area_root = RB_ROOT;
|
static struct rb_root vmap_area_root = RB_ROOT;
|
||||||
static bool vmap_initialized __read_mostly;
|
static bool vmap_initialized __read_mostly;
|
||||||
|
|
||||||
|
static struct rb_root purge_vmap_area_root = RB_ROOT;
|
||||||
|
static LIST_HEAD(purge_vmap_area_list);
|
||||||
|
static DEFINE_SPINLOCK(purge_vmap_area_lock);
|
||||||
|
|
||||||
/*
|
/*
|
||||||
* This kmem_cache is used for vmap_area objects. Instead of
|
* This kmem_cache is used for vmap_area objects. Instead of
|
||||||
* allocating from slab we reuse an object from this cache to
|
* allocating from slab we reuse an object from this cache to
|
||||||
|
@ -820,10 +823,17 @@ insert:
|
||||||
if (!merged)
|
if (!merged)
|
||||||
link_va(va, root, parent, link, head);
|
link_va(va, root, parent, link, head);
|
||||||
|
|
||||||
/*
|
return va;
|
||||||
* Last step is to check and update the tree.
|
}
|
||||||
*/
|
|
||||||
augment_tree_propagate_from(va);
|
static __always_inline struct vmap_area *
|
||||||
|
merge_or_add_vmap_area_augment(struct vmap_area *va,
|
||||||
|
struct rb_root *root, struct list_head *head)
|
||||||
|
{
|
||||||
|
va = merge_or_add_vmap_area(va, root, head);
|
||||||
|
if (va)
|
||||||
|
augment_tree_propagate_from(va);
|
||||||
|
|
||||||
return va;
|
return va;
|
||||||
}
|
}
|
||||||
|
|
||||||
|
@ -1138,7 +1148,7 @@ static void free_vmap_area(struct vmap_area *va)
|
||||||
* Insert/Merge it back to the free tree/list.
|
* Insert/Merge it back to the free tree/list.
|
||||||
*/
|
*/
|
||||||
spin_lock(&free_vmap_area_lock);
|
spin_lock(&free_vmap_area_lock);
|
||||||
merge_or_add_vmap_area(va, &free_vmap_area_root, &free_vmap_area_list);
|
merge_or_add_vmap_area_augment(va, &free_vmap_area_root, &free_vmap_area_list);
|
||||||
spin_unlock(&free_vmap_area_lock);
|
spin_unlock(&free_vmap_area_lock);
|
||||||
}
|
}
|
||||||
|
|
||||||
|
@ -1326,32 +1336,32 @@ void set_iounmap_nonlazy(void)
|
||||||
static bool __purge_vmap_area_lazy(unsigned long start, unsigned long end)
|
static bool __purge_vmap_area_lazy(unsigned long start, unsigned long end)
|
||||||
{
|
{
|
||||||
unsigned long resched_threshold;
|
unsigned long resched_threshold;
|
||||||
struct llist_node *valist;
|
struct list_head local_pure_list;
|
||||||
struct vmap_area *va;
|
struct vmap_area *va, *n_va;
|
||||||
struct vmap_area *n_va;
|
|
||||||
|
|
||||||
lockdep_assert_held(&vmap_purge_lock);
|
lockdep_assert_held(&vmap_purge_lock);
|
||||||
|
|
||||||
valist = llist_del_all(&vmap_purge_list);
|
spin_lock(&purge_vmap_area_lock);
|
||||||
if (unlikely(valist == NULL))
|
purge_vmap_area_root = RB_ROOT;
|
||||||
|
list_replace_init(&purge_vmap_area_list, &local_pure_list);
|
||||||
|
spin_unlock(&purge_vmap_area_lock);
|
||||||
|
|
||||||
|
if (unlikely(list_empty(&local_pure_list)))
|
||||||
return false;
|
return false;
|
||||||
|
|
||||||
/*
|
start = min(start,
|
||||||
* TODO: to calculate a flush range without looping.
|
list_first_entry(&local_pure_list,
|
||||||
* The list can be up to lazy_max_pages() elements.
|
struct vmap_area, list)->va_start);
|
||||||
*/
|
|
||||||
llist_for_each_entry(va, valist, purge_list) {
|
end = max(end,
|
||||||
if (va->va_start < start)
|
list_last_entry(&local_pure_list,
|
||||||
start = va->va_start;
|
struct vmap_area, list)->va_end);
|
||||||
if (va->va_end > end)
|
|
||||||
end = va->va_end;
|
|
||||||
}
|
|
||||||
|
|
||||||
flush_tlb_kernel_range(start, end);
|
flush_tlb_kernel_range(start, end);
|
||||||
resched_threshold = lazy_max_pages() << 1;
|
resched_threshold = lazy_max_pages() << 1;
|
||||||
|
|
||||||
spin_lock(&free_vmap_area_lock);
|
spin_lock(&free_vmap_area_lock);
|
||||||
llist_for_each_entry_safe(va, n_va, valist, purge_list) {
|
list_for_each_entry_safe(va, n_va, &local_pure_list, list) {
|
||||||
unsigned long nr = (va->va_end - va->va_start) >> PAGE_SHIFT;
|
unsigned long nr = (va->va_end - va->va_start) >> PAGE_SHIFT;
|
||||||
unsigned long orig_start = va->va_start;
|
unsigned long orig_start = va->va_start;
|
||||||
unsigned long orig_end = va->va_end;
|
unsigned long orig_end = va->va_end;
|
||||||
|
@ -1361,8 +1371,8 @@ static bool __purge_vmap_area_lazy(unsigned long start, unsigned long end)
|
||||||
* detached and there is no need to "unlink" it from
|
* detached and there is no need to "unlink" it from
|
||||||
* anything.
|
* anything.
|
||||||
*/
|
*/
|
||||||
va = merge_or_add_vmap_area(va, &free_vmap_area_root,
|
va = merge_or_add_vmap_area_augment(va, &free_vmap_area_root,
|
||||||
&free_vmap_area_list);
|
&free_vmap_area_list);
|
||||||
|
|
||||||
if (!va)
|
if (!va)
|
||||||
continue;
|
continue;
|
||||||
|
@ -1419,9 +1429,15 @@ static void free_vmap_area_noflush(struct vmap_area *va)
|
||||||
nr_lazy = atomic_long_add_return((va->va_end - va->va_start) >>
|
nr_lazy = atomic_long_add_return((va->va_end - va->va_start) >>
|
||||||
PAGE_SHIFT, &vmap_lazy_nr);
|
PAGE_SHIFT, &vmap_lazy_nr);
|
||||||
|
|
||||||
/* After this point, we may free va at any time */
|
/*
|
||||||
llist_add(&va->purge_list, &vmap_purge_list);
|
* Merge or place it to the purge tree/list.
|
||||||
|
*/
|
||||||
|
spin_lock(&purge_vmap_area_lock);
|
||||||
|
merge_or_add_vmap_area(va,
|
||||||
|
&purge_vmap_area_root, &purge_vmap_area_list);
|
||||||
|
spin_unlock(&purge_vmap_area_lock);
|
||||||
|
|
||||||
|
/* After this point, we may free va at any time */
|
||||||
if (unlikely(nr_lazy > lazy_max_pages()))
|
if (unlikely(nr_lazy > lazy_max_pages()))
|
||||||
try_purge_vmap_area_lazy();
|
try_purge_vmap_area_lazy();
|
||||||
}
|
}
|
||||||
|
@ -3351,8 +3367,8 @@ recovery:
|
||||||
while (area--) {
|
while (area--) {
|
||||||
orig_start = vas[area]->va_start;
|
orig_start = vas[area]->va_start;
|
||||||
orig_end = vas[area]->va_end;
|
orig_end = vas[area]->va_end;
|
||||||
va = merge_or_add_vmap_area(vas[area], &free_vmap_area_root,
|
va = merge_or_add_vmap_area_augment(vas[area], &free_vmap_area_root,
|
||||||
&free_vmap_area_list);
|
&free_vmap_area_list);
|
||||||
if (va)
|
if (va)
|
||||||
kasan_release_vmalloc(orig_start, orig_end,
|
kasan_release_vmalloc(orig_start, orig_end,
|
||||||
va->va_start, va->va_end);
|
va->va_start, va->va_end);
|
||||||
|
@ -3401,8 +3417,8 @@ err_free_shadow:
|
||||||
for (area = 0; area < nr_vms; area++) {
|
for (area = 0; area < nr_vms; area++) {
|
||||||
orig_start = vas[area]->va_start;
|
orig_start = vas[area]->va_start;
|
||||||
orig_end = vas[area]->va_end;
|
orig_end = vas[area]->va_end;
|
||||||
va = merge_or_add_vmap_area(vas[area], &free_vmap_area_root,
|
va = merge_or_add_vmap_area_augment(vas[area], &free_vmap_area_root,
|
||||||
&free_vmap_area_list);
|
&free_vmap_area_list);
|
||||||
if (va)
|
if (va)
|
||||||
kasan_release_vmalloc(orig_start, orig_end,
|
kasan_release_vmalloc(orig_start, orig_end,
|
||||||
va->va_start, va->va_end);
|
va->va_start, va->va_end);
|
||||||
|
@ -3482,18 +3498,15 @@ static void show_numa_info(struct seq_file *m, struct vm_struct *v)
|
||||||
|
|
||||||
static void show_purge_info(struct seq_file *m)
|
static void show_purge_info(struct seq_file *m)
|
||||||
{
|
{
|
||||||
struct llist_node *head;
|
|
||||||
struct vmap_area *va;
|
struct vmap_area *va;
|
||||||
|
|
||||||
head = READ_ONCE(vmap_purge_list.first);
|
spin_lock(&purge_vmap_area_lock);
|
||||||
if (head == NULL)
|
list_for_each_entry(va, &purge_vmap_area_list, list) {
|
||||||
return;
|
|
||||||
|
|
||||||
llist_for_each_entry(va, head, purge_list) {
|
|
||||||
seq_printf(m, "0x%pK-0x%pK %7ld unpurged vm_area\n",
|
seq_printf(m, "0x%pK-0x%pK %7ld unpurged vm_area\n",
|
||||||
(void *)va->va_start, (void *)va->va_end,
|
(void *)va->va_start, (void *)va->va_end,
|
||||||
va->va_end - va->va_start);
|
va->va_end - va->va_start);
|
||||||
}
|
}
|
||||||
|
spin_unlock(&purge_vmap_area_lock);
|
||||||
}
|
}
|
||||||
|
|
||||||
static int s_show(struct seq_file *m, void *p)
|
static int s_show(struct seq_file *m, void *p)
|
||||||
|
@ -3551,10 +3564,7 @@ static int s_show(struct seq_file *m, void *p)
|
||||||
seq_putc(m, '\n');
|
seq_putc(m, '\n');
|
||||||
|
|
||||||
/*
|
/*
|
||||||
* As a final step, dump "unpurged" areas. Note,
|
* As a final step, dump "unpurged" areas.
|
||||||
* that entire "/proc/vmallocinfo" output will not
|
|
||||||
* be address sorted, because the purge list is not
|
|
||||||
* sorted.
|
|
||||||
*/
|
*/
|
||||||
if (list_is_last(&va->list, &vmap_area_list))
|
if (list_is_last(&va->list, &vmap_area_list))
|
||||||
show_purge_info(m);
|
show_purge_info(m);
|
||||||
|
|
Loading…
Add table
Add a link
Reference in a new issue