导读
本文适合有基本Linux内存管理概念的新手阅读,且本文旨在从工作流程和设计思想上介绍KSM,在涉及到源代码的地方,进行了部分删减,如果想详细了解KSM,推荐阅读源代码及源代码中的注释。
作者也是初次接触Linux内核源码,所以文章中难免出现纰漏,欢迎在评论中纠正。
一、KSM概述
KSM的全称是 Kernel Samepage Merging,主要应用在虚拟化环境中,它允许内核通过合并内容相同的内存页面来节省内存,从而可以增加虚拟机的并发数量。
KSM核心设计思想是基于写时复制机制COW,也就是将内容相同的页面合并成一个只读页面,从而释放出空闲物理页面。
二、核心数据结构
KSM的核心数据结构有三个:
- struct rmap_item
- struct mm_slot
- struct ksm_scan
rmap_item
从虚拟地址到一个 mm 的反向映射
同时,对于 stable tree,它负责组织其结构;对于 unstable tree,保存其校验和
/*
 * Reverse-mapping item: ties one user virtual address in one mm to the
 * KSM trees.  While in the stable tree it carries the anon_vma linkage;
 * while in the unstable tree it carries the rb-tree node and checksum.
 */
struct rmap_item {
struct rmap_item *rmap_list; // all rmap_items of an mm form a singly linked list, headed at ksm_scan.rmap_list
union {
struct anon_vma *anon_vma; // when in the stable tree: the page's anon_vma
#ifdef CONFIG_NUMA
int nid; // when a node of the unstable tree: NUMA node id
#endif
};
struct mm_struct *mm; // the owning process's mm_struct
unsigned long address; // user virtual address tracked by this rmap_item
unsigned int oldchecksum; // checksum of the page at this address from the previous scan pass
union {
struct rb_node node; // rb-tree linkage while in the unstable tree
struct { /* valid only while in the stable tree */
struct stable_node *head; // the stable-tree node this item is attached to
struct hlist_node hlist; // link in that stable node's hlist of rmap_items
};
};
};
struct mm_slot
描述添加到KSM系统中将来要被扫描的进程mm_struct数据结构
/*
 * One registered address space: an mm_struct that was added to KSM
 * (via madvise MADV_MERGEABLE) and will be scanned by ksmd.
 */
struct mm_slot {
struct hlist_node link; // link into the mm_slots hash table
struct list_head mm_list; // link into the mm_slot list, headed at ksm_mm_head
struct rmap_item *rmap_list; // head of this mm's rmap_item list
struct mm_struct *mm; // the process's mm_struct
};
struct ksm_scan
当前的扫描状态
/* Cursor describing where the ksmd scan currently stands. */
struct ksm_scan {
struct mm_slot *mm_slot; // the mm_slot currently being scanned
unsigned long address; // next user address to scan (page-granular cursor)
struct rmap_item **rmap_list; // pointer to the next rmap_item link to be filled/advanced
unsigned long seqnr; // incremented once per full pass over all mm_slots; used to expire stale unstable-tree nodes
};
三、工作流程
上层的应用通过 madvise() 给某内存区域增加 MADV_MERGEABLE
或者 MADV_UNMERGEABLE
标记,造成对系统调用的访问,该系统调用由 SYSCALL_DEFINE3(madvise, unsigned long, start, size_t, len_in, int, behavior)
定义。
SYSCALL_DEFINE3
在这里会进行一个预处理,如找到该内存区域的所有VMA,并调用 madvise_vma
进行进一步处理。
/*
 * The madvise(2) system call entry point.
 *
 * Walks every VMA overlapping [start, start + len_in) and forwards the
 * advice value (behavior) to madvise_vma() one VMA at a time.  For KSM
 * the interesting behaviors are MADV_MERGEABLE / MADV_UNMERGEABLE.
 */
SYSCALL_DEFINE3(madvise, unsigned long, start, size_t, len_in, int, behavior)
{
unsigned long end, tmp;
struct vm_area_struct *vma, *prev;
int unmapped_error = 0;
int error = -EINVAL;
int write; /* set in the elided prologue: whether mmap_sem was taken for write */
size_t len;
struct blk_plug plug;
...
vma = find_vma_prev(current->mm, start, &prev);
if (vma && start > vma->vm_start)
prev = vma;
blk_start_plug(&plug);
for (;;) { // iterate over all VMAs covering the requested range
error = -ENOMEM;
if (!vma)
goto out;
/* Here start < (end|vma->vm_end). */
if (start < vma->vm_start) {
unmapped_error = -ENOMEM;
start = vma->vm_start;
if (start >= end)
goto out;
}
// clamp the sub-range to the end of the current VMA
tmp = vma->vm_end;
if (end < tmp)
tmp = end;
// hand this VMA's sub-range [start, tmp) to madvise_vma, carrying behavior along
error = madvise_vma(vma, &prev, start, tmp, behavior);
if (error)
goto out;
start = tmp;
if (prev && start < prev->vm_end)
start = prev->vm_end;
error = unmapped_error;
if (start >= end)
goto out; // normal termination exit
if (prev)
vma = prev->vm_next;
else
vma = find_vma(current->mm, start);
}
out:
blk_finish_plug(&plug);
if (write)
up_write(&current->mm->mmap_sem); /* fixed: was garbled "¤t" (HTML-mangled "&current") */
else
up_read(&current->mm->mmap_sem);
return error;
}
madvise_behavior
如果在某个VMA上发现 MADV_MERGEABLE 或者 MADV_UNMERGEABLE 标记,则触发 ksm_madvise
/*
 * Apply one madvise() behavior to a single vma.  For the KSM advice
 * values (MADV_MERGEABLE / MADV_UNMERGEABLE) this forwards to
 * ksm_madvise(), which may set or clear VM_MERGEABLE in new_flags;
 * on success the vma's flags are updated at the "success" label.
 */
static long madvise_behavior(struct vm_area_struct *vma,
struct vm_area_struct **prev,
unsigned long start, unsigned long end, int behavior)
{
struct mm_struct *mm = vma->vm_mm;
int error = 0;
pgoff_t pgoff;
unsigned long new_flags = vma->vm_flags;
switch (behavior) {
...
case MADV_MERGEABLE:
case MADV_UNMERGEABLE:
error = ksm_madvise(vma, start, end, behavior, &new_flags);
if (error) {
/*
 * madvise() returns EAGAIN if kernel resources, such as
 * slab, are temporarily unavailable.
 */
if (error == -ENOMEM)
error = -EAGAIN;
goto out;
}
break;
case MADV_HUGEPAGE:
case MADV_NOHUGEPAGE:
error = hugepage_madvise(vma, &new_flags, behavior);
if (error) {
/*
 * madvise() returns EAGAIN if kernel resources, such as
 * slab, are temporarily unavailable.
 */
if (error == -ENOMEM)
error = -EAGAIN;
goto out;
}
break;
}
...
success:
vma->vm_flags = new_flags;
out:
return error;
}
ksm_madvise
在这一步会找到 vma 所属进程(mm),并判断标记决定是否对页面进行共享
如果需要共享,调用 __ksm_enter()
并传递当前 vma 所属的 mm 地址。
/*
 * Decide from the advice whether this vma becomes mergeable.
 * MADV_MERGEABLE registers the vma's mm with KSM via __ksm_enter() the
 * first time, then sets VM_MERGEABLE; MADV_UNMERGEABLE un-merges any
 * KSM pages in [start, end) and clears VM_MERGEABLE.
 */
int ksm_madvise(struct vm_area_struct *vma, unsigned long start,
unsigned long end, int advice, unsigned long *vm_flags)
{
struct mm_struct *mm = vma->vm_mm;
int err;
switch (advice) {
case MADV_MERGEABLE:
...
if (!test_bit(MMF_VM_MERGEABLE, &mm->flags)) {
err = __ksm_enter(mm); // the real entry point into KSM: register this mm
if (err)
return err;
}
*vm_flags |= VM_MERGEABLE;
break;
case MADV_UNMERGEABLE:
if (!(*vm_flags & VM_MERGEABLE))
return 0; /* just ignore the advice */
if (vma->anon_vma) {
err = unmerge_ksm_pages(vma, start, end);
if (err)
return err;
}
*vm_flags &= ~VM_MERGEABLE;
break;
}
return 0;
}
__ksm_enter
将进程的 mm 加入到 ksm_mm_head 链表中
/*
 * Register an mm with KSM: allocate an mm_slot for it, hash it, link it
 * into the ksm_mm_head list, mark it MMF_VM_MERGEABLE, and wake ksmd if
 * it was idle.  Returns 0 on success or -ENOMEM.
 */
int __ksm_enter(struct mm_struct *mm)
{
struct mm_slot *mm_slot;
int needs_wakeup;
mm_slot = alloc_mm_slot(); // allocate the mm_slot representing this process's mm_struct
if (!mm_slot)
return -ENOMEM;
/* Check ksm_run too? Would need tighter locking */
needs_wakeup = list_empty(&ksm_mm_head.mm_list); // empty list means ksmd currently has nothing to scan
spin_lock(&ksm_mmlist_lock);
insert_to_mm_slots_hash(mm, mm_slot); // hash mm_slot by mm (sets mm_slot->mm = mm)
/*
 * When KSM_RUN_MERGE (or KSM_RUN_STOP),
 * insert just behind the scanning cursor, to let the area settle
 * down a little; when fork is followed by immediate exec, we don't
 * want ksmd to waste time setting up and tearing down an rmap_list.
 *
 * But when KSM_RUN_UNMERGE, it's important to insert ahead of its
 * scanning cursor, otherwise KSM pages in newly forked mms will be
 * missed: then we might as well insert at the end of the list.
 */
if (ksm_run & KSM_RUN_UNMERGE)
list_add_tail(&mm_slot->mm_list, &ksm_mm_head.mm_list);
else
list_add_tail(&mm_slot->mm_list, &ksm_scan.mm_slot->mm_list); // insert just behind the scanning cursor
spin_unlock(&ksm_mmlist_lock);
set_bit(MMF_VM_MERGEABLE, &mm->flags); // mark: this process is now registered with KSM
mmgrab(mm);
if (needs_wakeup)
wake_up_interruptible(&ksm_thread_wait); // wake up the ksmd kernel thread
return 0;
}
ksm_init
创建内核线程ksmd
/*
 * KSM module init: set up the slab caches, start the ksmd kernel
 * thread, and (when CONFIG_SYSFS) create the sysfs control files.
 */
static int __init ksm_init(void)
{
struct task_struct *ksm_thread;
int err;
/* The correct value depends on page size and endianness */
zero_checksum = calc_checksum(ZERO_PAGE(0));
/* Default to false for backwards compatibility */
ksm_use_zero_pages = false;
err = ksm_slab_init(); // create the ksm_rmap_item, ksm_stable_node and ksm_mm_slot slab caches
if (err)
goto out;
ksm_thread = kthread_run(ksm_scan_thread, NULL, "ksmd"); // create the ksmd kernel thread; its function is ksm_scan_thread
if (IS_ERR(ksm_thread)) {
pr_err("ksm: creating kthread failed\n");
err = PTR_ERR(ksm_thread);
goto out_free;
}
#ifdef CONFIG_SYSFS
err = sysfs_create_group(mm_kobj, &ksm_attr_group); // create the ksm control files under /sys/kernel/mm
if (err) {
pr_err("ksm: register sysfs failed\n");
kthread_stop(ksm_thread);
goto out_free;
}
#else
ksm_run = KSM_RUN_MERGE; /* no way for user to start it */
#endif /* CONFIG_SYSFS */
#ifdef CONFIG_MEMORY_HOTREMOVE
/* There is no significance to this priority 100 */
hotplug_memory_notifier(ksm_memory_callback, 100);
#endif
return 0;
out_free:
ksm_slab_free();
out:
return err;
}
ksm_scan_thread
只需要知道这里进入ksm_do_scan()
进行实际的操作就可以了
/*
 * Main loop of the ksmd kernel thread: while allowed to run, scan a
 * batch of pages via ksm_do_scan(), then sleep for the configured
 * interval; otherwise block until woken (new mm registered) or stopped.
 */
static int ksm_scan_thread(void *nothing)
{
set_freezable();
set_user_nice(current, 5);
while (!kthread_should_stop()) {
mutex_lock(&ksm_thread_mutex);
wait_while_offlining();
if (ksmd_should_run())
ksm_do_scan(ksm_thread_pages_to_scan); // the actual worker; argument is the number of pages to scan per batch
mutex_unlock(&ksm_thread_mutex);
try_to_freeze();
if (ksmd_should_run()) {
schedule_timeout_interruptible(
msecs_to_jiffies(ksm_thread_sleep_millisecs));
} else {
wait_event_freezable(ksm_thread_wait,
ksmd_should_run() || kthread_should_stop());
}
}
return 0;
}
ksm_do_scan
/*
 * Scan one batch: examine up to scan_npages candidate pages, trying to
 * merge each via cmp_and_merge_page().
 */
static void ksm_do_scan(unsigned int scan_npages)
{
struct rmap_item *rmap_item;
struct page *uninitialized_var(page);
while (scan_npages-- && likely(!freezing(current))) { // scan scan_npages pages per invocation
cond_resched();
rmap_item = scan_get_next_rmap_item(&page); // find the next suitable anonymous page and its rmap_item (physical-to-virtual reverse mapping)
if (!rmap_item)
return;
cmp_and_merge_page(page, rmap_item); // consult the two trees to see whether this page can be merged
put_page(page);
}
}
cmp_and_merge_page
首先查看这个 page 是不是可以和 stable tree 中的 page 合并,不可以的话,检查下它的检验值和之前是否有变化,如果变化则认为是写频繁的页面,跳过它;否则,决定是将它插入到 unstable tree 中,或与其中节点合并加入到 stable tree 中。
/*
 * cmp_and_merge_page - first see whether the page can be merged into
 * the stable tree; if not, compare its checksum with the previous pass
 * (a changed checksum means the page is written frequently — skip it);
 * otherwise either insert it into the unstable tree, or merge it with a
 * match found there and promote the pair into the stable tree.
 */
static void cmp_and_merge_page(struct page *page, struct rmap_item *rmap_item)
{
struct rmap_item *tree_rmap_item;
struct page *tree_page = NULL;
struct stable_node *stable_node;
struct page *kpage;
unsigned int checksum;
int err;
stable_node = page_stable_node(page);
if (stable_node) {
// stable node on the wrong NUMA node: detach it onto the migrate_nodes list
if (stable_node->head != &migrate_nodes &&
get_kpfn_nid(stable_node->kpfn) != NUMA(stable_node->nid)) {
rb_erase(&stable_node->node,
root_stable_tree + NUMA(stable_node->nid));
stable_node->head = &migrate_nodes;
list_add(&stable_node->list, stable_node->head);
}
if (stable_node->head != &migrate_nodes &&
rmap_item->head == stable_node)
return;
}
kpage = stable_tree_search(page); // search the stable tree for an identical page
if (kpage == page && rmap_item->head == stable_node) { // the match is this page itself: nothing to do
put_page(kpage);
return;
}
remove_rmap_item_from_tree(rmap_item); // drop any stale stable/unstable-tree linkage from a previous pass before re-placing this item — NOTE(review): confirm rationale against upstream mm/ksm.c
if (kpage) { // kpage has identical content and is a different page
err = try_to_merge_with_ksm_page(rmap_item, page, kpage); // try to merge page into the existing KSM page
if (!err) {
lock_page(kpage);
stable_tree_append(rmap_item, page_stable_node(kpage)); // merged: hang rmap_item on stable_node->hlist
unlock_page(kpage);
}
put_page(kpage);
return;
}
/******************************** unstable tree handling below ***********************/
// recompute the checksum: if it differs from the stored one, the page is written frequently — not a KSM candidate
checksum = calc_checksum(page);
if (rmap_item->oldchecksum != checksum) {
rmap_item->oldchecksum = checksum;
return;
}
/*
 * Same checksum as an empty page. We attempt to merge it with the
 * appropriate zero page if the user enabled this via sysfs.
 */
if (ksm_use_zero_pages && (checksum == zero_checksum)) {
struct vm_area_struct *vma;
vma = find_mergeable_vma(rmap_item->mm, rmap_item->address);
err = try_to_merge_one_page(vma, page,
ZERO_PAGE(rmap_item->address));
/*
 * In case of failure, the page was not really empty, so we
 * need to continue. Otherwise we're done.
 */
if (!err)
return;
}
// search the unstable tree for a page with identical content
tree_rmap_item =
unstable_tree_search_insert(rmap_item, page, &tree_page);
if (tree_rmap_item) {
// try to merge page and tree_page into a single kpage
kpage = try_to_merge_two_pages(rmap_item, page,
tree_rmap_item, tree_page);
put_page(tree_page);
if (kpage) {
/*
 * Merge succeeded: insert kpage into the stable tree and
 * hang both rmap_items off its stable node.
 */
lock_page(kpage);
stable_node = stable_tree_insert(kpage);
if (stable_node) {
stable_tree_append(tree_rmap_item, stable_node);
stable_tree_append(rmap_item, stable_node);
}
unlock_page(kpage);
/*
 * If inserting into the stable tree failed, call
 * break_cow() to trigger a page fault and un-share
 * the freshly made KSM page again.
 */
if (!stable_node) {
break_cow(tree_rmap_item);
break_cow(rmap_item);
}
}
}
}
try_to_merge_one_page
将两个 page 进行合并
/*
 * try_to_merge_one_page - take two pages and merge them into one
 * @vma: the vma that holds the pte pointing to page
 * @page: the PageAnon page that we want to replace with kpage
 * @kpage: the PageKsm page that we want to map instead of page,
 * or NULL the first time when we want to use page as kpage.
 *
 * Returns 0 on success, -EFAULT on failure.
 */
static int try_to_merge_one_page(struct vm_area_struct *vma,
struct page *page, struct page *kpage)
{
pte_t orig_pte = __pte(0);
int err = -EFAULT;
if (page == kpage) /* ksm page forked */
return 0;
if (!PageAnon(page))
goto out;
/*
 * We need the page lock to read a stable PageSwapCache in
 * write_protect_page().
 */
if (!trylock_page(page)) // don't want to block here: try-lock instead of lock_page
goto out; // skip this page and move on to others
if (PageTransCompound(page)) { // true if page belongs to a transparent huge page
if (split_huge_page(page)) // split the huge page into normal pages; returns 0 on success
goto out_unlock;
}
/*
 * If this anonymous page is mapped only here, its pte may need
 * to be write-protected. If it's mapped elsewhere, all of its
 * ptes are necessarily already write-protected. But in either
 * case, we need to lock and check page_count is not raised.
 */
if (write_protect_page(vma, page, &orig_pte) == 0) {
if (!kpage) {
/*
 * While we hold page lock, upgrade page from
 * PageAnon+anon_vma to PageKsm+NULL stable_node:
 * stable_tree_insert() will update stable_node.
 */
set_page_stable_node(page, NULL);
mark_page_accessed(page);
/*
 * Page reclaim just frees a clean page with no dirty
 * ptes: make sure that the ksm page would be swapped.
 */
if (!PageDirty(page))
SetPageDirty(page);
err = 0;
} else if (pages_identical(page, kpage))
err = replace_page(vma, page, kpage, orig_pte); // contents identical: repoint the pte from page to kpage
}
if ((vma->vm_flags & VM_LOCKED) && kpage && !err) {
munlock_vma_page(page);
if (!PageMlocked(kpage)) {
unlock_page(page);
lock_page(kpage);
mlock_vma_page(kpage);
page = kpage; /* for final unlock */
}
}
out_unlock:
unlock_page(page);
out:
return err;
}
附:
scan_get_next_rmap_item()
遍历ksm_mm_head链表(即进程的mm),然后再遍历进程地址空间的每个VMA,然后通过get_next_rmap_item()返回rmap_item
/*
 * Walk the ksm_mm_head list of registered mm_slots, and within the
 * current mm walk every VM_MERGEABLE VMA page by page.  Returns the
 * rmap_item for the next anonymous page to examine (allocating one via
 * get_next_rmap_item() if needed), with *page set to that page and a
 * reference held.  Returns NULL once a full pass over all slots ends.
 */
static struct rmap_item *scan_get_next_rmap_item(struct page **page)
{
struct mm_struct *mm;
struct mm_slot *slot;
struct vm_area_struct *vma;
struct rmap_item *rmap_item;
int nid;
if (list_empty(&ksm_mm_head.mm_list)) // no mm_slot registered: nothing to scan
return NULL;
slot = ksm_scan.mm_slot;
if (slot == &ksm_mm_head) { // first run of ksmd: initialize the scan cursor (elided; the next_mm label used below lives in this elided code)
...
}
mm = slot->mm;
down_read(&mm->mmap_sem);
if (ksm_test_exit(mm))
vma = NULL;
else
vma = find_vma(mm, ksm_scan.address); // find the VMA covering the scan address
for (; vma; vma = vma->vm_next) { // iterate over all VMAs of this mm
if (!(vma->vm_flags & VM_MERGEABLE))
continue;
if (ksm_scan.address < vma->vm_start)
ksm_scan.address = vma->vm_start;
if (!vma->anon_vma)
ksm_scan.address = vma->vm_end;
while (ksm_scan.address < vma->vm_end) { // iterate over the pages in this VMA
if (ksm_test_exit(mm))
break;
*page = follow_page(vma, ksm_scan.address, FOLL_GET); // resolve the virtual address to its page, taking a reference
if (IS_ERR_OR_NULL(*page)) {
ksm_scan.address += PAGE_SIZE;
cond_resched();
continue;
}
if (PageAnon(*page)) { // only anonymous pages are KSM candidates
flush_anon_page(vma, *page, ksm_scan.address);
flush_dcache_page(*page);
rmap_item = get_next_rmap_item(slot,
ksm_scan.rmap_list, ksm_scan.address); // find an existing rmap_item for this address on mm_slot->rmap_list, or allocate a new one
if (rmap_item) {
ksm_scan.rmap_list =
&rmap_item->rmap_list;
ksm_scan.address += PAGE_SIZE;
} else
put_page(*page);
up_read(&mm->mmap_sem);
return rmap_item;
}
put_page(*page);
ksm_scan.address += PAGE_SIZE;
cond_resched();
}
}
if (ksm_test_exit(mm)) { // the mm is being torn down: reset the cursor to the start of this slot
ksm_scan.address = 0;
ksm_scan.rmap_list = &slot->rmap_list;
}
/*
 * Nuke all the rmap_items that are above this current rmap:
 * because there were no VM_MERGEABLE vmas with such addresses.
 */
remove_trailing_rmap_items(slot, ksm_scan.rmap_list); // rmap_items past the cursor are stale — free them now
spin_lock(&ksm_mmlist_lock);
ksm_scan.mm_slot = list_entry(slot->mm_list.next,
struct mm_slot, mm_list); // advance to the next mm_slot
if (ksm_scan.address == 0) { // nothing mergeable left in this mm (or it exited): unhash and free the slot, clear MMF_VM_MERGEABLE
/*
 * We've completed a full scan of all vmas, holding mmap_sem
 * throughout, and found no VM_MERGEABLE: so do the same as
 * __ksm_exit does to remove this mm from all our lists now.
 * This applies either when cleaning up after __ksm_exit
 * (but beware: we can reach here even before __ksm_exit),
 * or when all VM_MERGEABLE areas have been unmapped (and
 * mmap_sem then protects against race with MADV_MERGEABLE).
 */
hash_del(&slot->link);
list_del(&slot->mm_list);
spin_unlock(&ksm_mmlist_lock);
free_mm_slot(slot);
clear_bit(MMF_VM_MERGEABLE, &mm->flags);
up_read(&mm->mmap_sem);
mmdrop(mm);
} else {
up_read(&mm->mmap_sem);
/*
 * up_read(&mm->mmap_sem) first because after
 * spin_unlock(&ksm_mmlist_lock) run, the "mm" may
 * already have been freed under us by __ksm_exit()
 * because the "mm_slot" is still hashed and
 * ksm_scan.mm_slot doesn't point to it anymore.
 */
spin_unlock(&ksm_mmlist_lock);
}
/* Repeat until we've completed scanning the whole list */
slot = ksm_scan.mm_slot;
if (slot != &ksm_mm_head)
goto next_mm; // continue with the next mm_slot (the next_mm: label is in the elided initialization above)
ksm_scan.seqnr++; // one full pass over all mm_slots completed: bump the sequence number
return NULL;
}