前情提要
上一篇文章对VFS做了一下简单介绍,基本理解到VFS中的基本数据结构为super_block,dentry,inode,address_space及file;
其中super_block是在文件系统初始化时建立起来的,其中存储着inode信息、文件系统类型等信息;
dentry:管理着文件名filename 到 inode的映射;
inode:管理着文件的元数据信息,主要包括对应的存储设备,在数据存储设备的位置,文件的大小,访问信息等等元数据
address_space: 是内存cache的核心数据结构,主要负责内存与磁盘数据的同步,包括内存页的写入写出等。
file:是与进程关联的数据结构,其中关联着address_space,而address_space才是真正与磁盘等设备进行交互的地方,所以多个进程可以同时操作同一个文件;当然,如果多个进程在没有同步的情况下同时打开同一个文件进行写入,文件内容可能会被破坏(corrupted)。
基本介绍
在之前的简介实战中我们讲到,一个磁盘设备格式化为对应的文件系统并初始化完super_block后需要进行挂载,而在操作文件之前都需要通过vfs_path_lookup来找到对应的inode,以便真正地读写数据。那么文件系统挂载做了哪些事情,文件系统又是以什么样的方式找到对应的inode的呢?下面结合EXT2文件系统的具体实现来彻底探寻一下vfsmount和path_lookup。
mount
注册文件系统
fs/filesystems.c中的register_filesystem用来向内核注册文件系统。内核中所有的文件系统都保存在一个单链表中,各个文件系统的名称存储为字符串,所以该函数会扫描文件系统链表,直至找到同名的文件系统或到达链表尾部:如果是前一种情况,说明该文件系统已经注册过,返回-EBUSY;如果是后一种情况,就将该文件系统放到链表末尾完成注册。相关代码如下:
/*
 * register_filesystem - add a filesystem type to the list of known types.
 * @fs: the filesystem type to register.
 *
 * Returns 0 on success. Returns -EBUSY if @fs already has its next pointer
 * set, or if the slot returned by find_filesystem() for fs->name is already
 * occupied.
 */
int register_filesystem(struct file_system_type * fs)
{
int res = 0;
struct file_system_type ** p;
/* A '.' in the filesystem name is treated as a fatal error. */
BUG_ON(strchr(fs->name, '.'));
/* A non-NULL next pointer is rejected before the lock is taken. */
if (fs->next)
return -EBUSY;
write_lock(&file_systems_lock);
p = find_filesystem(fs->name, strlen(fs->name)); // walk the filesystem list
/* *p non-NULL: slot already taken; otherwise store fs in the slot. */
if (*p)
res = -EBUSY;
else
*p = fs;
write_unlock(&file_systems_lock);
return res;
}
/*
 * struct file_system_type - descriptor for one filesystem type.
 * Instances are chained through the next pointer (see register_filesystem()).
 * This excerpt is abridged; the trailing "..." stands for elided members.
 */
struct file_system_type {
const char *name;// filesystem name
int fs_flags; /* bitmask of the FS_* values defined below */
#define FS_REQUIRES_DEV 1
#define FS_BINARY_MOUNTDATA 2
#define FS_HAS_SUBTYPE 4
#define FS_USERNS_MOUNT 8 /* Can be mounted by userns root */
#define FS_RENAME_DOES_D_MOVE 32768 /* FS will handle d_move() during rename() internally. */
struct dentry *(*mount) (struct file_system_type *, int,
const char *, void *);// mount this filesystem onto the target directory
void (*kill_sb) (struct super_block *);
struct module *owner;
struct file_system_type * next; /* link to the next registered type */
...
};
mount结构
目录树的装载和卸载要比文件系统注册复杂的多,其需要对内核的数据结构执行很多操作,比较复杂,后面结合结构图详细说明一下。
在Unix中文件系统层次结构如下图:其中每个方框中的是一个dentry目录项,其可以挂载为不同的文件系统,但是其path_lookup都需要从根目录开始索引,并当其是一个挂载点(图中的/, /mnt, /mnt/cdrom)时,会更新其dentry中的super_block数据结构。因此文件系统的挂载是可以嵌套的,其主要体现在父文件系统中的dentry中存储的super_block信息来区分其对应的具体是哪一个文件系统。
vfsmount数据结构如下:
/*
 * struct vfsmount - the per-mount data visible to filesystem code:
 * the root dentry of the mounted tree, its superblock and the mount flags.
 */
struct vfsmount {
struct dentry *mnt_root; /* root of the mounted tree */
struct super_block *mnt_sb; /* pointer to superblock */
int mnt_flags;
} __randomize_layout;
/*
 * struct mount - one mount instance.
 *
 * Embeds a struct vfsmount as member mnt and adds the linkage fields
 * (parent, children list, namespace, propagation lists, ...) around it.
 */
struct mount {
struct hlist_node mnt_hash;
struct mount *mnt_parent; /* presumably the mount this one is attached under */
struct dentry *mnt_mountpoint; /* presumably the dentry this mount covers */
struct vfsmount mnt; /* embedded vfsmount; container_of() target */
union {
struct rcu_head mnt_rcu;
struct llist_node mnt_llist;
};
#ifdef CONFIG_SMP
struct mnt_pcp __percpu *mnt_pcp;
#else
int mnt_count;
int mnt_writers;
#endif
struct list_head mnt_mounts; /* list of children, anchored here */
struct list_head mnt_child; /* and going through their mnt_child */
struct list_head mnt_instance; /* mount instance on sb->s_mounts */
const char *mnt_devname; /* Name of device e.g. /dev/dsk/hda1 */
struct list_head mnt_list;
struct list_head mnt_expire; /* link in fs-specific expiry list */
struct list_head mnt_share; /* circular list of shared mounts */
struct list_head mnt_slave_list;/* list of slave mounts */
struct list_head mnt_slave; /* slave list entry */
struct mount *mnt_master; /* slave is on master->mnt_slave_list */
struct mnt_namespace *mnt_ns; /* containing namespace */
struct mountpoint *mnt_mp; /* where is it mounted */
struct hlist_node mnt_mp_list; /* list mounts with the same mountpoint */
struct list_head mnt_umounting; /* list entry for umount propagation */
#ifdef CONFIG_FSNOTIFY
struct fsnotify_mark_connector __rcu *mnt_fsnotify_marks;
__u32 mnt_fsnotify_mask;
#endif
int mnt_id; /* mount identifier */
int mnt_group_id; /* peer group identifier */
int mnt_expiry_mark; /* true if marked for expiry */
struct hlist_head mnt_pins;
struct fs_pin mnt_umount;
struct dentry *mnt_ex_mountpoint;
} __randomize_layout;
struct mount *mnt = real_mount(vfsmount); // real_mount maps a vfsmount to the struct mount that embeds it
/*
 * real_mount - map a struct vfsmount pointer to the struct mount whose
 * mnt member it is.
 * @mnt: pointer to the vfsmount member of some struct mount.
 */
static inline struct mount *real_mount(struct vfsmount *mnt)
{
return container_of(mnt, struct mount, mnt);
}
/*
 * container_of - get a pointer to the structure that contains a member.
 * @ptr: pointer to the member.
 * @type: type of the containing structure.
 * @member: name of the member within @type.
 *
 * Emits a build-time error when *ptr is neither the member's type nor void,
 * then subtracts offsetof(type, member) from the member's address.
 */
#define container_of(ptr, type, member) ({ \
void *__mptr = (void *)(ptr); \
BUILD_BUG_ON_MSG(!__same_type(*(ptr), ((type *)0)->member) && \
!__same_type(*(ptr), void), \
"pointer type mismatch in container_of()"); \
((type *)(__mptr - offsetof(type, member))); })
/*
 * In other words, struct vfsmount is embedded inside struct mount as its
 * mnt member, so the enclosing mount is found by subtracting
 * offsetof(struct mount, mnt) from the vfsmount pointer.
 * struct mount carries the mount-tree linkage (mnt_parent, mnt_mounts,
 * mnt_child, ...), while struct vfsmount only holds mnt_root, mnt_sb and
 * mnt_flags for the mounted filesystem.
 */
mount系统调用
mount系统调用的入口点是sys_mount函数,其定义在fs/namespace.c中,主要是委托给do_mount函数进行挂载操作
调用流程图如下:
数据结构关系图如下:
mount的最主要的目标是初始化vfsmount.mnt_root dentry到内存,其包括以下几个步骤:
- 根据被挂载的设备名称查找对应设备的inode,从而获得其block_device,也就是简介中提到的/dev/sda硬盘设备,这一块具体到设备管理相关再介绍,这也体现了“万物皆文件”的思想。
- 从对应的block_device中读取对应数据到内存中生成super_block对象。一般各个具体的文件系统会将自己相关的信息写入到super_block->s_fs_info中,有兴趣的可以看一下ext2_fill_super相关的具体实现。
- 将vfsmount的mnt_root=super_block->s_root, mnt_sb= super_block
- 将当前mountpoint添加到父文件系统命名空间,并将mountpoint加入到父挂载点的子列表中,以便当前挂载点卸载时能回到之前的挂载点。
代码如下:
/**
 * do_mount acts as a demultiplexer: it inspects the flags passed in and
 * hands the request to the function that does the real work:
 * do_reconfigure_mnt - MS_REMOUNT and MS_BIND both set: change the options
 *                      of an existing mount
 * do_remount         - MS_REMOUNT alone
 * do_loopback        - MS_BIND: mount through the loopback (bind) path
 * do_change_type     - MS_SHARED / MS_PRIVATE / MS_SLAVE / MS_UNBINDABLE:
 *                      change the mount's propagation type (not the
 *                      filesystem type)
 * do_move_mount      - MS_MOVE: move an already mounted filesystem
 * do_new_mount       - otherwise: mount a new filesystem (the case this
 *                      article focuses on)
 *
 * The excerpt is abridged; the declarations of path, retval, sb_flags and
 * mnt_flags are in the elided part.
 **/
long do_mount(const char *dev_name, const char __user *dir_name,
const char *type_page, unsigned long flags, void *data_page)
{
... // flag validation and handling
if ((flags & (MS_REMOUNT | MS_BIND)) == (MS_REMOUNT | MS_BIND))
retval = do_reconfigure_mnt(&path, mnt_flags);
else if (flags & MS_REMOUNT)
retval = do_remount(&path, flags, sb_flags, mnt_flags,
data_page);
else if (flags & MS_BIND)
retval = do_loopback(&path, dev_name, flags & MS_REC);
else if (flags & (MS_SHARED | MS_PRIVATE | MS_SLAVE | MS_UNBINDABLE))
retval = do_change_type(&path, flags);
else if (flags & MS_MOVE)
retval = do_move_mount(&path, dev_name);
else
retval = do_new_mount(&path, type_page, sb_flags, mnt_flags,
dev_name, data_page);
dput_out:
/* Common exit: release the path reference before returning. */
path_put(&path);
return retval;
}
/*
 * do_new_mount - create a new mount of filesystem type @fstype and add it
 * at @path.
 * @path: where to attach the new mount.
 * @fstype: name of the filesystem type; NULL yields -EINVAL.
 * @sb_flags: passed through to vfs_kern_mount().
 * @mnt_flags: passed to mount_too_revealing() (by pointer) and do_add_mount().
 * @name: passed through to vfs_kern_mount().
 * @data: passed through to vfs_kern_mount().
 *
 * Returns 0 on success, -EINVAL without a type name, -ENODEV when
 * get_fs_type() finds nothing, -EPERM when mount_too_revealing() objects,
 * or the error from vfs_kern_mount() / fs_set_subtype() / do_add_mount().
 */
static int do_new_mount(struct path *path, const char *fstype, int sb_flags,
int mnt_flags, const char *name, void *data)
{
struct file_system_type *type;
struct vfsmount *mnt;
int err;
if (!fstype)
return -EINVAL;
type = get_fs_type(fstype);
if (!type)
return -ENODEV;
mnt = vfs_kern_mount(type, sb_flags, name, data); // obtain a vfsmount (or ERR_PTR) for this type
/* Only set a subtype if the fs supports one and none is set yet. */
if (!IS_ERR(mnt) && (type->fs_flags & FS_HAS_SUBTYPE) &&
!mnt->mnt_sb->s_subtype)
mnt = fs_set_subtype(mnt, fstype);
/* The type reference from get_fs_type() is dropped on every path. */
put_filesystem(type);
if (IS_ERR(mnt))
return PTR_ERR(mnt);
if (mount_too_revealing(mnt, &mnt_flags)) {
mntput(mnt);
return -EPERM;
}
err = do_add_mount(real_mount(mnt), path, mnt_flags);
/* On failure the mount was not attached, so drop it here. */
if (err)
mntput(mnt);
return err;
}
挂载的基本流程如上;其基本思路就是通过设备初始化mountpoint的dentry中的superblock。
umount
既然介绍了mount,那也简单介绍一下umount吧。umount的系统调用入口为ksys_umount,最终会调用到do_umount函数:ksys_umount接收用户传入的文件名,根据文件名找到对应的path,也就得到了对应的vfsmount数据结构,然后根据vfsmount在mount中的字节偏移得到mount,后续do_umount就可以直接操作mount数据结构进行相关的umount操作了。do_umount会先通过特定于超级块的umount_begin函数来释放与特定文件系统相关的内存或者nfs中的网络连接,mount相关的树数据结构则交由umount_tree处理。
具体代码如下:
/*
 * ksys_umount - unmount entry point: resolve @name to a path, map its
 * vfsmount to the enclosing struct mount and call do_umount().
 * @name: user-space pointer to the pathname.
 * @flags: passed through to do_umount().
 *
 * Abridged excerpt: declarations, checks and the jumps to the labels are
 * in the elided "..." parts.
 */
int ksys_umount(char __user *name, int flags)
{
...
retval = user_path_mountpoint_at(AT_FDCWD, name, lookup_flags, &path); // resolve the user-supplied name to a path
...
mnt = real_mount(path.mnt); // vfsmount -> enclosing struct mount, by member offset
...
retval = do_umount(mnt, flags);
dput_and_out:
/* we mustn't call path_put() as that would clear mnt_expiry_mark */
dput(path.dentry); // drop the dentry reference held in path
mntput_no_expire(mnt);
out:
return retval;
}
/*
 * do_umount - perform the unmount of @mnt according to @flags.
 * @mnt: the mount to unmount.
 * @flags: MNT_EXPIRE / MNT_FORCE / MNT_DETACH bits are examined here.
 *
 * Returns 0 on success, the security_sb_umount() error if that hook
 * refuses, -EINVAL for an invalid MNT_EXPIRE combination or a MNT_LOCKED
 * mount, -EBUSY when the mount is still in use, -EAGAIN when the expiry
 * mark was only just set, and -EPERM when a caller lacking CAP_SYS_ADMIN
 * tries to "unmount" the current root.
 */
static int do_umount(struct mount *mnt, int flags)
{
struct super_block *sb = mnt->mnt.mnt_sb;
int retval;
retval = security_sb_umount(&mnt->mnt, flags);
if (retval)
return retval;
/*
* Allow userspace to request a mountpoint be expired rather than
* unmounting unconditionally. Unmount only happens if:
* (1) the mark is already set (the mark is cleared by mntput())
* (2) the usage count == 1 [parent vfsmount] + 1 [sys_umount]
*/
if (flags & MNT_EXPIRE) {
if (&mnt->mnt == current->fs->root.mnt ||
flags & (MNT_FORCE | MNT_DETACH))
return -EINVAL;
/*
* probably don't strictly need the lock here if we examined
* all race cases, but it's a slowpath.
*/
lock_mount_hash();
if (mnt_get_count(mnt) != 2) {
unlock_mount_hash();
return -EBUSY;
}
unlock_mount_hash();
/* First request only sets the mark; a later one proceeds. */
if (!xchg(&mnt->mnt_expiry_mark, 1))
return -EAGAIN;
}
/*
* If we may have to abort operations to get out of this
* mount, and they will themselves hold resources we must
* allow the fs to do things. In the Unix tradition of
* 'Gee thats tricky lets do it in userspace' the umount_begin
* might fail to complete on the first run through as other tasks
* must return, and the like. Thats for the mount program to worry
* about for the moment.
*/
if (flags & MNT_FORCE && sb->s_op->umount_begin) {
sb->s_op->umount_begin(sb);
}
/*
* No sense to grab the lock for this test, but test itself looks
* somewhat bogus. Suggestions for better replacement?
* Ho-hum... In principle, we might treat that as umount + switch
* to rootfs. GC would eventually take care of the old vfsmount.
* Actually it makes sense, especially if rootfs would contain a
* /reboot - static binary that would close all descriptors and
* call reboot(9). Then init(8) could umount root and exec /reboot.
*/
if (&mnt->mnt == current->fs->root.mnt && !(flags & MNT_DETACH)) {
/*
* Special case for "unmounting" root ...
* we just try to remount it readonly.
*/
if (!ns_capable(sb->s_user_ns, CAP_SYS_ADMIN))
return -EPERM;
down_write(&sb->s_umount);
if (!sb_rdonly(sb))
retval = do_remount_sb(sb, SB_RDONLY, NULL, 0);
up_write(&sb->s_umount);
return retval;
}
namespace_lock();
lock_mount_hash();
/* Recheck MNT_LOCKED with the locks held */
retval = -EINVAL;
if (mnt->mnt.mnt_flags & MNT_LOCKED)
goto out;
event++;
if (flags & MNT_DETACH) {
/* Detach: tear down the tree without the busy check below. */
if (!list_empty(&mnt->mnt_list))
umount_tree(mnt, UMOUNT_PROPAGATE);
retval = 0;
} else {
shrink_submounts(mnt);
retval = -EBUSY;
/* Only unmount when propagate_mount_busy() reports not busy. */
if (!propagate_mount_busy(mnt, 2)) {
if (!list_empty(&mnt->mnt_list))
umount_tree(mnt, UMOUNT_PROPAGATE|UMOUNT_SYNC);
retval = 0;
}
}
out:
/* Release in the reverse order of acquisition. */
unlock_mount_hash();
namespace_unlock();
return retval;
}
lookup
前面对于mount挂载相关做了一些简单介绍,接下来结合Ext2文件系统简单分析一下linux是如何通过filename查找对应的inode。
vfs_path_lookup是lookup的前端函数,最终会调用到filename_lookup,所以下面主要分析filename_lookup中的调用流程和逻辑。调用图如下:
其中path_init负责初始化nameidata中的root及path数据结构至superblock中的s_root,然后后续就是基于当前path中的dentry依次往后查找。
link_path_walk依次遍历一个一个目录项,以'/'作为分隔,并去掉重复的'/',并将真正的工作委托给walk_component进行。
walk_component首先从缓存中查找对应的inode,找不到则委托给文件系统去查找
可以看到最终会交给inode->i_op->lookup,这个operation我们在之前的章节中可以找到,它是一个与具体文件系统相关的操作。
相关代码如下:
/*
 * filename_lookup - resolve @name to @path.
 * @dfd: passed to set_nameidata().
 * @name: the name to look up; an ERR_PTR is returned as its error code.
 * @flags: LOOKUP_* flags passed to path_lookupat().
 * @path: output, handed to path_lookupat().
 * @root: optional starting root; when non-NULL it is copied into nd.root
 *        and LOOKUP_ROOT is added to @flags.
 *
 * Tries path_lookupat() with LOOKUP_RCU first, retries without it on
 * -ECHILD, and retries with LOOKUP_REVAL on -ESTALE. Once past the
 * IS_ERR() check, putname(@name) is always called before returning.
 * Returns the final path_lookupat() result.
 */
static int filename_lookup(int dfd, struct filename *name, unsigned flags,
struct path *path, struct path *root)
{
int retval;
struct nameidata nd;
if (IS_ERR(name))
return PTR_ERR(name);
if (unlikely(root)) {
nd.root = *root;
flags |= LOOKUP_ROOT;
}
set_nameidata(&nd, dfd, name);
retval = path_lookupat(&nd, flags | LOOKUP_RCU, path);
if (unlikely(retval == -ECHILD))
retval = path_lookupat(&nd, flags, path);
if (unlikely(retval == -ESTALE))
retval = path_lookupat(&nd, flags | LOOKUP_REVAL, path);
if (likely(!retval))
audit_inode(name, path->dentry, flags & LOOKUP_PARENT);
restore_nameidata();
putname(name);
return retval;
}
/*
 * Returns 0 and nd will be valid on success; Returns error, otherwise.
 *
 * Abridged excerpt: the declaration of err and the handling before and
 * after the loop are elided ("// ...").
 */
static int path_lookupat(struct nameidata *nd, unsigned flags, struct path *path)
{
const char *s = path_init(nd, flags); // initialise root and path in nd
// ...
/*
 * Keep walking while link_path_walk() succeeds and lookup_last() returns
 * a positive value; each extra round continues from trailing_symlink().
 */
while (!(err = link_path_walk(s, nd)) // walk each path component in turn
&& ((err = lookup_last(nd)) > 0)) {
s = trailing_symlink(nd);
}
// ...
terminate_walk(nd);
return err;
}
/*
 * link_path_walk - walk the components of @name one at a time, handing
 * each to walk_component().
 * @name: the pathname (or an ERR_PTR, which is returned as an error code).
 * @nd: lookup state.
 *
 * Returns 0 when the name is empty after leading slashes, or when the
 * pathname body is finished; an ERR_PTR name yields its error code.
 * Abridged excerpt: component hashing, the declaration of hash_len and
 * the handling of err after walk_component() are elided ("// ...").
 */
static int link_path_walk(const char *name, struct nameidata *nd)
{
int err;
if (IS_ERR(name))
return PTR_ERR(name);
while (*name=='/') // skip leading slashes
name++;
if (!*name)
return 0;
/* At this point we know we have a real path component. */
for(;;) {
// ... // argument checks and nd flag/state setup
name += hashlen_len(hash_len);
if (!*name)
goto OK;
/*
* If it wasn't NUL, we know it was '/'. Skip that
* slash, and continue until no more slashes.
*/
do {
name++;
} while (unlikely(*name == '/'));
if (unlikely(!*name)) {
OK:
/* pathname body, done */
if (!nd->depth)
return 0;
name = nd->stack[nd->depth - 1].name;
/* trailing symlink, done */
if (!name)
return 0;
/* last component of nested symlink */
err = walk_component(nd, WALK_FOLLOW);
} else {
/* not the last component */
err = walk_component(nd, WALK_FOLLOW | WALK_MORE);
}
// ...
}
}
/*
 * walk_component - resolve the current component (nd->last) and step
 * into it.
 * @nd: lookup state.
 * @flags: WALK_* flags; WALK_MORE is tested here, the rest is passed to
 *         step_into().
 *
 * Tries lookup_fast() first and falls back to lookup_slow() when that
 * returns 0. Returns a negative error, -ENOENT for a negative dentry, or
 * the result of step_into().
 */
static int walk_component(struct nameidata *nd, int flags)
{
struct path path;
struct inode *inode;
unsigned seq;
int err;
/*
* "." and ".." are special - ".." especially so because it has
* to be able to know about the current root directory and
* parent relationships.
*/
if (unlikely(nd->last_type != LAST_NORM)) {
err = handle_dots(nd, nd->last_type);
if (!(flags & WALK_MORE) && nd->depth)
put_link(nd);
return err;
}
err = lookup_fast(nd, &path, &inode, &seq); // fast path: look in the cache
if (unlikely(err <= 0)) {
if (err < 0)
return err;
/* err == 0: the fast path found nothing usable. */
path.dentry = lookup_slow(&nd->last, nd->path.dentry,
nd->flags); // slow path: ask the filesystem
if (IS_ERR(path.dentry))
return PTR_ERR(path.dentry);
path.mnt = nd->path.mnt;
err = follow_managed(&path, nd);
if (unlikely(err < 0))
return err;
if (unlikely(d_is_negative(path.dentry))) {
path_to_nameidata(&path, nd);
return -ENOENT;
}
seq = 0; /* we are already out of RCU mode */
inode = d_backing_inode(path.dentry);
}
return step_into(nd, &path, flags, inode, seq);
}
/*
 * __lookup_slow - look up @name in directory @dir, calling the
 * filesystem's lookup method if needed.
 * @name: the component name.
 * @dir: dentry of the directory to search.
 * @flags: LOOKUP_* flags; LOOKUP_NO_REVAL skips revalidation.
 *
 * Returns the resulting dentry, or an ERR_PTR when d_revalidate() fails
 * with a negative error.
 * Abridged excerpt: the "again" label and the code that obtains dentry are
 * in the elided part ("// ...").
 */
static struct dentry *__lookup_slow(const struct qstr *name,
struct dentry *dir,
unsigned int flags)
{
struct dentry *dentry, *old;
struct inode *inode = dir->d_inode;
// ...
if (unlikely(!d_in_lookup(dentry))) {
/* Not marked in-lookup: revalidate it unless told not to. */
if (!(flags & LOOKUP_NO_REVAL)) {
int error = d_revalidate(dentry, flags);
if (unlikely(error <= 0)) {
if (!error) {
/* Revalidation returned 0: invalidate and retry. */
d_invalidate(dentry);
dput(dentry);
goto again;
}
dput(dentry);
dentry = ERR_PTR(error);
}
}
} else {
/* In-lookup dentry: call the filesystem's lookup method. */
old = inode->i_op->lookup(inode, dentry, flags);
d_lookup_done(dentry);
/* A non-NULL return replaces the dentry we passed in. */
if (unlikely(old)) {
dput(dentry);
dentry = old;
}
}
return dentry;
}
所以我们结合Ext2中的相关实现来探究一下inode的查找,以及inode在block_device中的存储详情。
i_op->lookup在Ext2中的实现为ext2_lookup,我们来看下相关的调用流程:
ext2_inode_by_name(dir, &dentry->d_name);在dir inode中查找对应filename所在的ino
ext2_find_entry (dir, child, &page);根据dir inode中的信息(比如目录数据的大小),通过address_space从磁盘依次读取该目录下的所有目录项到内存page中。Ext2中的目录项数据结构为ext2_dirent,而磁盘中存储的是该数据结构的数组链表,所以会遍历内存page中所有的ext2_dirent,直至找到名字匹配的目录项并返回,再由调用者取出其中对应的ino编号。
ext2_iget(dir->i_sb, ino);会委托ext2_get_inode获取对应的raw_inode信息,并最终会进行一些小端转存到真正的inode返回
ext2_get_inode(inode->i_sb, ino, &bh);从super_block中根据ino获取对应的inode信息
相关代码如下:
/*
* ext2_find_entry()
*
* finds an entry in the specified directory with the wanted name. It
* returns the page in which the entry was found (as a parameter - res_page),
* and the entry itself. Page is returned mapped and unlocked.
* Entry is guaranteed to be valid.
*
* Returns NULL when no matching entry is found or an error is hit.
* The scan starts at the page index cached in i_dir_start_lookup and wraps
* around until it is back at the starting page.
*
* NOTE(review): *res_page is assigned only after the npages == 0 check, so
* on that path the caller's value is left untouched.
*/
struct ext2_dir_entry_2 *ext2_find_entry (struct inode *dir,
const struct qstr *child, struct page **res_page)
{
const char *name = child->name;
int namelen = child->len;
unsigned reclen = EXT2_DIR_REC_LEN(namelen);
unsigned long start, n;
unsigned long npages = dir_pages(dir); // number of pages in the directory (presumably derived from its size)
struct page *page = NULL;
struct ext2_inode_info *ei = EXT2_I(dir); // ext2-specific inode info for dir
ext2_dirent * de;
int dir_has_error = 0;
if (npages == 0) // no pages: nothing to search
goto out;
/* OFFSET_CACHE */
*res_page = NULL;
start = ei->i_dir_start_lookup; // page index to start from (stored on a successful lookup)
if (start >= npages)
start = 0;
n = start;
do {
char *kaddr;
page = ext2_get_page(dir, n, dir_has_error); // get page n of the directory
if (!IS_ERR(page)) {
kaddr = page_address(page);// address of the page contents
de = (ext2_dirent *) kaddr; // first entry in the page
kaddr += ext2_last_byte(dir, n) - reclen; // last address where an entry of reclen bytes can still start
while ((char *) de <= kaddr) {
if (de->rec_len == 0) {
ext2_error(dir->i_sb, __func__,
"zero-length directory entry");
ext2_put_page(page);
goto out;
}
if (ext2_match (namelen, name, de)) // found the matching entry
goto found;
de = ext2_next_entry(de);
}
ext2_put_page(page);
} else
dir_has_error = 1;
/* Advance to the next page, wrapping to page 0 at the end. */
if (++n >= npages)
n = 0;
/* next page is past the blocks we've got */
if (unlikely(n > (dir->i_blocks >> (PAGE_SHIFT - 9)))) {
ext2_error(dir->i_sb, __func__,
"dir %lu size %lld exceeds block count %llu",
dir->i_ino, dir->i_size,
(unsigned long long)dir->i_blocks);
goto out;
}
} while (n != start);
out:
return NULL;
found:
/* Hand the page to the caller and record the page for next time. */
*res_page = page;
ei->i_dir_start_lookup = n;
return de;
}
/*
 * ext2_get_inode - locate the raw inode for inode number @ino.
 * @sb: the superblock.
 * @ino: inode number to look up.
 * @p: output; set to the buffer_head holding the inode's block on success,
 *     NULL otherwise.
 *
 * Returns a pointer into the buffer's data on success, ERR_PTR(-EINVAL)
 * for an out-of-range inode number, or ERR_PTR(-EIO) when the group
 * descriptor or the inode block cannot be obtained.
 */
static struct ext2_inode *ext2_get_inode(struct super_block *sb, ino_t ino, struct buffer_head **p)
{
struct buffer_head * bh;
unsigned long block_group;
unsigned long block;
unsigned long offset;
struct ext2_group_desc * gdp;
*p = NULL;
/* Reject numbers below the first usable inode (except root) or past the inode count. */
if ((ino != EXT2_ROOT_INO && ino < EXT2_FIRST_INO(sb)) ||
ino > le32_to_cpu(EXT2_SB(sb)->s_es->s_inodes_count))
goto Einval;
block_group = (ino - 1) / EXT2_INODES_PER_GROUP(sb);// inodes are stored in fixed-size groups; derive the block group from ino
gdp = ext2_get_group_desc(sb, block_group, NULL); // fetch the descriptor of that block group
if (!gdp)
goto Egdp;
/*
* Figure out the offset within the block group inode table
*/
offset = ((ino - 1) % EXT2_INODES_PER_GROUP(sb)) * EXT2_INODE_SIZE(sb); // byte offset of this inode within the group's inode table
block = le32_to_cpu(gdp->bg_inode_table) +
(offset >> EXT2_BLOCK_SIZE_BITS(sb));
if (!(bh = sb_bread(sb, block))) // read that block into a buffer_head
goto Eio;
*p = bh;
/* Reduce the table offset to an offset within the block just read. */
offset &= (EXT2_BLOCK_SIZE(sb) - 1);
return (struct ext2_inode *) (bh->b_data + offset); // pointer to the raw inode inside the block buffer
Einval:
ext2_error(sb, "ext2_get_inode", "bad inode number: %lu",
(unsigned long) ino);
return ERR_PTR(-EINVAL);
Eio:
ext2_error(sb, "ext2_get_inode",
"unable to read inode block - inode=%lu, block=%lu",
(unsigned long) ino, block);
/* Falls through: both Eio and Egdp return -EIO. */
Egdp:
return ERR_PTR(-EIO);
}
/*
 * struct ext2_group_desc - descriptor of one ext2 block group.
 * Fields are little-endian (__le32 / __le16); ext2_get_inode() reads
 * bg_inode_table through le32_to_cpu() to find the group's inode table.
 */
struct ext2_group_desc
{
__le32 bg_block_bitmap; /* Blocks bitmap block */
__le32 bg_inode_bitmap; /* Inodes bitmap block */
__le32 bg_inode_table; /* Inodes table block */
__le16 bg_free_blocks_count; /* Free blocks count */
__le16 bg_free_inodes_count; /* Free inodes count */
__le16 bg_used_dirs_count; /* Directories count */
__le16 bg_pad;
__le32 bg_reserved[3];
};
EXT2文件系统基本的存储结构图如下: