日韩性视频-久久久蜜桃-www中文字幕-在线中文字幕av-亚洲欧美一区二区三区四区-撸久久-香蕉视频一区-久久无码精品丰满人妻-国产高潮av-激情福利社-日韩av网址大全-国产精品久久999-日本五十路在线-性欧美在线-久久99精品波多结衣一区-男女午夜免费视频-黑人极品ⅴideos精品欧美棵-人人妻人人澡人人爽精品欧美一区-日韩一区在线看-欧美a级在线免费观看

歡迎訪問 生活随笔!

生活随笔

當前位置: 首頁 > 运维知识 > linux >内容正文

linux

Linux Kernel File IO Syscall Kernel-Source-Code Analysis(undone)

發布時間:2025/3/15 linux 33 豆豆
生活随笔 收集整理的這篇文章主要介紹了 Linux Kernel File IO Syscall Kernel-Source-Code Analysis(undone),小編覺得挺不錯的,現在分享給大家,幫大家做個參考。

目錄

0. 引言 1. open() syscall 2. close() syscall

?

0. 引言

在linux的哲學中,所有的磁盤文件、目錄、外設設備、驅動設備全部被抽象為了"文件"這個概念,所以本文提到的"File IO"適用于linux下所有的IO操作。需要明白的是,本文分析的是linux下的IO系統調用對應的內核源代碼:linux下每一個系統調用都有對應的內核源代碼,而我們在ring3常用的glibc編程中所使用的C庫API,只是對系統調用的一層封裝,最終還是要通過系統調用實現功能

0x1: SYSCALL_DEFINE宏定義

我們在學習內核源代碼的時候經常會遇到一個宏定義: SYSCALL_DEFINE,所有的系統調用的聲明都通過它來實現

\linux-2.6.32.63\include\linux\syscalls.h

/*
 * SYSCALL_DEFINE0..SYSCALL_DEFINE6 - declare the entry point of a system
 * call that takes 0..6 arguments.
 *
 * SYSCALL_DEFINE0: with CONFIG_FTRACE_SYSCALLS it emits the enter/exit trace
 * events and a struct syscall_metadata record (placed in the
 * "__syscalls_metadata" section) before the prototype; without it the macro
 * is just the prototype "asmlinkage long sys_<name>(void)".
 *
 * SYSCALL_DEFINE1..6: forward to SYSCALL_DEFINEx() with the argument count,
 * the name prefixed by '_', and the (type, name) argument pairs.
 *
 * The opening #ifdef was missing from this excerpt, which left the
 * #else/#endif below unbalanced; it is restored here.
 */
#ifdef CONFIG_FTRACE_SYSCALLS
#define SYSCALL_DEFINE0(sname)					\
	SYSCALL_TRACE_ENTER_EVENT(_##sname);			\
	SYSCALL_TRACE_EXIT_EVENT(_##sname);			\
	static const struct syscall_metadata __used		\
	  __attribute__((__aligned__(4)))			\
	  __attribute__((section("__syscalls_metadata")))	\
	  __syscall_meta_##sname = {				\
		.name		= "sys_"#sname,			\
		.nb_args	= 0,				\
		.enter_event	= &event_enter__##sname,	\
		.exit_event	= &event_exit__##sname,		\
	};							\
	asmlinkage long sys_##sname(void)
#else
#define SYSCALL_DEFINE0(name)	   asmlinkage long sys_##name(void)
#endif

#define SYSCALL_DEFINE1(name, ...) SYSCALL_DEFINEx(1, _##name, __VA_ARGS__)
#define SYSCALL_DEFINE2(name, ...) SYSCALL_DEFINEx(2, _##name, __VA_ARGS__)
#define SYSCALL_DEFINE3(name, ...) SYSCALL_DEFINEx(3, _##name, __VA_ARGS__)
#define SYSCALL_DEFINE4(name, ...) SYSCALL_DEFINEx(4, _##name, __VA_ARGS__)
#define SYSCALL_DEFINE5(name, ...) SYSCALL_DEFINEx(5, _##name, __VA_ARGS__)
#define SYSCALL_DEFINE6(name, ...) SYSCALL_DEFINEx(6, _##name, __VA_ARGS__)

...

/*
 * SYSCALL_DEFINEx - common back end of SYSCALL_DEFINE1..6.
 * @x:     number of syscall arguments
 * @sname: syscall name with a leading '_' (e.g. _open)
 * @...:   alternating type/name pairs of the arguments
 *
 * With CONFIG_FTRACE_SYSCALLS it first emits string tables of the argument
 * types and names plus SYSCALL_METADATA(); in both configurations it ends
 * with __SYSCALL_DEFINEx(), which produces the function header.
 */
#ifdef CONFIG_FTRACE_SYSCALLS
#define SYSCALL_DEFINEx(x, sname, ...)				\
	static const char *types_##sname[] = {			\
		__SC_STR_TDECL##x(__VA_ARGS__)			\
	};							\
	static const char *args_##sname[] = {			\
		__SC_STR_ADECL##x(__VA_ARGS__)			\
	};							\
	SYSCALL_METADATA(sname, x);				\
	__SYSCALL_DEFINEx(x, sname, __VA_ARGS__)
#else
#define SYSCALL_DEFINEx(x, sname, ...)				\
	__SYSCALL_DEFINEx(x, sname, __VA_ARGS__)
#endif

/*
 * __SYSCALL_DEFINEx - emit the header of the syscall function.
 *
 * With CONFIG_HAVE_SYSCALL_WRAPPERS the body written after the macro becomes
 * the static inline SYSC<name>(); a generated SyS<name>() takes the
 * __SC_LONG-declared arguments, runs __SC_TEST, casts them with __SC_CAST and
 * calls SYSC<name>(); sys<name> is made an alias of SyS<name>.
 * Without wrappers the macro is simply "asmlinkage long sys<name>(args)".
 */
#ifdef CONFIG_HAVE_SYSCALL_WRAPPERS

#define SYSCALL_DEFINE(name) static inline long SYSC_##name

#define __SYSCALL_DEFINEx(x, name, ...)					\
	asmlinkage long sys##name(__SC_DECL##x(__VA_ARGS__));		\
	static inline long SYSC##name(__SC_DECL##x(__VA_ARGS__));	\
	asmlinkage long SyS##name(__SC_LONG##x(__VA_ARGS__))		\
	{								\
		__SC_TEST##x(__VA_ARGS__);				\
		return (long) SYSC##name(__SC_CAST##x(__VA_ARGS__));	\
	}								\
	SYSCALL_ALIAS(sys##name, SyS##name);				\
	static inline long SYSC##name(__SC_DECL##x(__VA_ARGS__))

#else /* CONFIG_HAVE_SYSCALL_WRAPPERS */

#define SYSCALL_DEFINE(name) asmlinkage long sys_##name
#define __SYSCALL_DEFINEx(x, name, ...)					\
	asmlinkage long sys##name(__SC_DECL##x(__VA_ARGS__))

#endif /* CONFIG_HAVE_SYSCALL_WRAPPERS */

所以對函數定義

SYSCALL_DEFINE3(socket, int, family, int, type, int, protocol)就等于
asmlinkage long sys_socket(int family, int type, int protocol)

Relevant Link:

http://blog.csdn.net/p_panyuch/article/details/5648007

?

1. open() syscall

open()系統調用在kernel中對應的是sys_open()

\linux-2.6.32.63\fs\open.c

/*
 * sys_open - kernel entry point of the open() system call.
 * @filename: user-space pointer to the path name
 * @flags:    open flags; O_LARGEFILE is OR-ed in when force_o_largefile()
 *            returns true
 * @mode:     creation mode, passed through unchanged
 *
 * Delegates to do_sys_open() relative to AT_FDCWD and returns its result.
 */
SYSCALL_DEFINE3(open, const char __user *, filename, int, flags, int, mode)
{
	long ret;

	if (force_o_largefile()) {
		flags |= O_LARGEFILE;
	}

	/* do_sys_open() performs the actual work */
	ret = do_sys_open(AT_FDCWD, filename, flags, mode);
	/* avoid REGPARM breakage on x86: */
	asmlinkage_protect(3, ret, filename, flags, mode);
	return ret;
}

繼續跟進do_sys_open()函數

/*
 * do_sys_open - open @filename relative to @dfd and bind it to a new fd.
 * @dfd:      directory file descriptor the lookup is relative to
 * @filename: user-space pointer to the path name
 * @flags:    open flags
 * @mode:     creation mode
 *
 * Returns the new file descriptor on success. On failure returns the error
 * taken from getname(), get_unused_fd_flags() or do_filp_open().
 */
long do_sys_open(int dfd, const char __user *filename, int flags, int mode)
{
	/*
	 * getname() yields a kernel-side copy of the user-space path name
	 * (or an ERR_PTR); it is released with putname() below.
	 */
	char *tmp = getname(filename);
	int fd = PTR_ERR(tmp);

	if (!IS_ERR(tmp)) {
		/* Reserve a free descriptor number. */
		fd = get_unused_fd_flags(flags);
		if (fd >= 0) {
			/* The descriptor is reserved; now open the file itself. */
			struct file *f = do_filp_open(dfd, tmp, flags, mode, 0);
			if (IS_ERR(f)) {
				/* Open failed: give the descriptor back. */
				put_unused_fd(fd);
				fd = PTR_ERR(f);
			} else {
				/* Open succeeded: send the fsnotify open event ... */
				fsnotify_open(f->f_path.dentry);
				/* ... and install the file in the fd table slot. */
				fd_install(fd, f);
			}
		}
		/* Release the kernel copy of the path name. */
		putname(tmp);
	}
	return fd;
}

繼續跟進do_filp_open()函數

/** Note that the low bits of the passed in "open_flag"* are not the same as in the local variable "flag". See* open_to_namei_flags() for more details.*/ struct file *do_filp_open(int dfd, const char *pathname, int open_flag, int mode, int acc_mode) {/* 若干變量聲明 */struct file *filp;struct nameidata nd;int error;struct path path;struct dentry *dir;int count = 0;int will_write;/*改變參數(shù)flag的值,具體做法是flag+1*/int flag = open_to_namei_flags(open_flag);/*設置訪問權限*/if (!acc_mode){acc_mode = MAY_OPEN | ACC_MODE(flag);} /* O_TRUNC implies we need access checks for write permissions *//* 根據(jù)O_TRUNC標志設置寫權限 */if (flag & O_TRUNC){acc_mode |= MAY_WRITE;} /* Allow the LSM permission hook to distinguish append access from general write access. *//* 設置O_APPEND標志 */if (flag & O_APPEND){acc_mode |= MAY_APPEND;} /* The simplest case - just a plain lookup. *//* 如果不是創(chuàng)建文件 */if (!(flag & O_CREAT)) { /*當內(nèi)核要訪問一個文件的時候,第一步要做的是找到這個文件,而查找文件的過程在vfs里面是由path_lookup或者path_lookup_open函數(shù)來完成的這兩個函數(shù)將用戶傳進來的字符串表示的文件路徑轉(zhuǎn)換成一個dentry結構,并建立好相應的inode和file結構,將指向file的描述符返回用戶用戶隨后通過文件描述符,來訪問這些數(shù)據(jù)結構*/error = path_lookup_open(dfd, pathname, lookup_flags(flag), &nd, flag);if (error){return ERR_PTR(error);} goto ok;}/** Create - we need to know the parent.*///path-init為查找作準備工作,path_walk真正上路查找,這兩個函數(shù)聯(lián)合起來根據(jù)一段路徑名找到對應的dentry error = path_init(dfd, pathname, LOOKUP_PARENT, &nd);if (error){return ERR_PTR(error);} /*這個函數(shù)相當重要,是整個NFS的名字解析函數(shù),其實也是NFS得以構筑的函數(shù)該函數(shù)采用一個for循環(huán),對name路徑根據(jù)目錄的層次,一層一層推進,直到終點或失敗。在推進的過程中,一步步建立了目錄樹的dentry和對應的inode*/error = path_walk(pathname, &nd);if (error) {if (nd.root.mnt){/*減少dentry和vsmount得計數(shù)*/path_put(&nd.root);} return ERR_PTR(error);}if (unlikely(!audit_dummy_context())){/*保存inode節(jié)點信息*/audit_inode(pathname, nd.path.dentry);} /** We have the parent and last component. 
First of all, check* that we are not asked to creat(2) an obvious directory - that* will not do.*/error = -EISDIR;/*父節(jié)點信息*/if (nd.last_type != LAST_NORM || nd.last.name[nd.last.len]){goto exit_parent;} error = -ENFILE;/* 返回特定的file結構體指針 */filp = get_empty_filp();if (filp == NULL){goto exit_parent;} /* 填充nameidata結構 */nd.intent.open.file = filp;nd.intent.open.flags = flag;nd.intent.open.create_mode = mode;dir = nd.path.dentry;nd.flags &= ~LOOKUP_PARENT;nd.flags |= LOOKUP_CREATE | LOOKUP_OPEN;if (flag & O_EXCL){nd.flags |= LOOKUP_EXCL;} mutex_lock(&dir->d_inode->i_mutex);/*從哈希表中查找nd對應的dentry*/path.dentry = lookup_hash(&nd);path.mnt = nd.path.mnt;do_last:error = PTR_ERR(path.dentry);if (IS_ERR(path.dentry)) {mutex_unlock(&dir->d_inode->i_mutex);goto exit;}if (IS_ERR(nd.intent.open.file)) {error = PTR_ERR(nd.intent.open.file);goto exit_mutex_unlock;}/* Negative dentry, just create the file *//*如果此dentry結構沒有對應的inode節(jié)點,說明是無效的,應該創(chuàng)建文件節(jié)點 */if (!path.dentry->d_inode) {/** This write is needed to ensure that a* ro->rw transition does not occur between* the time when the file is created and when* a permanent write count is taken through* the 'struct file' in nameidata_to_filp().*//*write權限是必需的*/error = mnt_want_write(nd.path.mnt);if (error){goto exit_mutex_unlock;} /*按照namei格式的flag open*/error = __open_namei_create(&nd, &path, flag, mode);if (error) {mnt_drop_write(nd.path.mnt);goto exit;}/*根據(jù)nameidata 得到相應的file結構*/filp = nameidata_to_filp(&nd, open_flag);if (IS_ERR(filp)){ima_counts_put(&nd.path, acc_mode & (MAY_READ | MAY_WRITE | MAY_EXEC));} /*放棄寫權限*/mnt_drop_write(nd.path.mnt);if (nd.root.mnt){/*計數(shù)減一*/path_put(&nd.root);} return filp;}/** It already exists.*//*要打開的文件已經(jīng)存在*/mutex_unlock(&dir->d_inode->i_mutex);/*保存inode節(jié)點*/audit_inode(pathname, path.dentry);error = -EEXIST;/*flag標志檢查代碼*/if (flag & O_EXCL){goto exit_dput;} if (__follow_mount(&path)){error = -ELOOP;if (flag & O_NOFOLLOW){goto exit_dput;} }error = -ENOENT;if 
(!path.dentry->d_inode){goto exit_dput;} if (path.dentry->d_inode->i_op->follow_link){goto do_link;} /*路徑裝化為相應的nameidata結構*/path_to_nameidata(&path, &nd);error = -EISDIR;/*如果是文件夾*/if (path.dentry->d_inode && S_ISDIR(path.dentry->d_inode->i_mode)){goto exit;} ok:/** Consider:* 1. may_open() truncates a file* 2. a rw->ro mount transition occurs* 3. nameidata_to_filp() fails due to* the ro mount.* That would be inconsistent, and should* be avoided. Taking this mnt write here* ensures that (2) can not occur.*//*檢測是否截斷文件標志*/will_write = open_will_write_to_fs(flag, nd.path.dentry->d_inode);if (will_write) {/*要截斷的話就要獲取寫權限*/error = mnt_want_write(nd.path.mnt);if (error){goto exit;} }//may_open執(zhí)行權限檢測、文件打開和truncate的操作error = may_open(&nd.path, acc_mode, flag);if (error) {if (will_write){mnt_drop_write(nd.path.mnt);} goto exit;}filp = nameidata_to_filp(&nd, open_flag);if (IS_ERR(filp)){ima_counts_put(&nd.path, acc_mode & (MAY_READ | MAY_WRITE | MAY_EXEC));}/** It is now safe to drop the mnt write* because the filp has had a write taken* on its behalf.*///安全的放棄寫權限if (will_write){mnt_drop_write(nd.path.mnt);} if (nd.root.mnt){path_put(&nd.root);} return filp;exit_mutex_unlock:mutex_unlock(&dir->d_inode->i_mutex); exit_dput:path_put_conditional(&path, &nd); exit:if (!IS_ERR(nd.intent.open.file)){release_open_intent(&nd);}exit_parent:if (nd.root.mnt){path_put(&nd.root);} path_put(&nd.path);return ERR_PTR(error);do_link: //允許遍歷連接文件,則手工找到連接文件對應的文件error = -ELOOP;if (flag & O_NOFOLLOW){//不允許遍歷連接文件,返回錯誤goto exit_dput;} /** This is subtle. Instead of calling do_follow_link() we do the* thing by hands. The reason is that this way we have zero link_count* and path_walk() (called from ->follow_link) honoring LOOKUP_PARENT.* After that we have the parent and last component, i.e.* we are in the same situation as after the first path_walk().* Well, almost - if the last component is normal we get its copy* stored in nd->last.name and we will have to putname() it when we* are done. 
Procfs-like symlinks just set LAST_BIND.*//* 以下是手工找到鏈接文件對應的文件dentry結構代碼 *///設置查找LOOKUP_PARENT標志nd.flags |= LOOKUP_PARENT;//判斷操作是否安全error = security_inode_follow_link(path.dentry, &nd);if (error){goto exit_dput;} //處理符號鏈接error = __do_follow_link(&path, &nd);if (error) {/* Does someone understand code flow here? Or it is only* me so stupid? Anathema to whoever designed this non-sense* with "intent.open".*/release_open_intent(&nd);if (nd.root.mnt){path_put(&nd.root);} return ERR_PTR(error);}nd.flags &= ~LOOKUP_PARENT;//檢查最后一段文件或目錄名的屬性情況if (nd.last_type == LAST_BIND){goto ok;} error = -EISDIR;if (nd.last_type != LAST_NORM){goto exit;} if (nd.last.name[nd.last.len]) {__putname(nd.last.name);goto exit;}error = -ELOOP;//出現(xiàn)回環(huán)標志: 循環(huán)超過32次if (count++==32) {__putname(nd.last.name);goto exit;}dir = nd.path.dentry;mutex_lock(&dir->d_inode->i_mutex);//更新路徑的掛接點和dentrypath.dentry = lookup_hash(&nd);path.mnt = nd.path.mnt;__putname(nd.last.name);goto do_last; }

總結一下流程

1. open系統調用的入口是由SYSCALL_DEFINE3(open, ...)定義的sys_open()函數
2. 在open系統調用中,調用do_sys_open函數完成主要功能
3. 在do_sys_open函數中,調用函數do_filp_open完成主要的打開功能
4. 在內核中要打開一個文件,首先應該找到這個文件,而查找文件的過程在vfs里面是由do_path_lookup或者path_lookup_open函數來完成的
4.1 設置nd->root=根路徑(絕對地址)或者當前工作目錄(相對地址)
4.2 這一步做完了后,內核會建立一些數據結構(dentry,inode)來初始化查找的起點
if(!retval){ retval = path_walk(name,nd);}
4.3 path_walk會遍歷路徑的每一節點分量,也就是用"/"分隔開的每一部分,最終找到name指向的文件
int path_walk(const char *name,struct nameidata *nd){
    return link_path_walk(name,nd); //path_walk其實相當于直接調用link_path_walk來完成工作
}
4.4 link_path_walk的主要工作是由其內部函數__link_path_walk來完成的
result = __link_path_walk(name,nd)
4.5 __link_path_walk,該函數把傳進來的字符串name,也就是用戶指定的路徑,按路徑分隔符分解成一系列小的component。比如用戶說,我要找"/path/to/dest"這個文件,那么我們的文件系統就會按path、to、dest一個一個來找,直到最后一個分量是文件或者查找完成。它找的時候,會先用path_init初始化過的根路徑去找第一個分量,也就是path。然后用path的dentry->d_inode去找to,這樣循環到最后一個。注意,內核會緩存找到的路徑分量,所以往往只有第一次訪問一個路徑的時候,才會去訪問磁盤,后面的訪問會直接從緩存里找,下面會看到,很多與頁高速緩存打交道的代碼。但不管怎樣,第一遍查找總是會訪問磁盤的
static int __link_path_walk(const char *name,struct nameidata *nd){..}
至此,按照每一個component查找完成之后,就會找到相應的文件,然后相應的打開工作就基本完成了

Relevant Link:

http://oss.org.cn/kernel-book/ http://blog.csdn.net/f413933206/article/details/5701913

?

2. close() syscall

close()系統調用對應內核中的函數為: sys_close()

\linux-2.6.32.63\fs\open.c

/*
 * sys_close - kernel entry point of the close() system call.
 * @fd: descriptor number to close
 *
 * Returns the result of filp_close(), with the restart codes mapped to
 * -EINTR, or -EBADF when @fd is out of range or has no file installed.
 *
 * Careful here! We test whether the file pointer is NULL before
 * releasing the fd. This ensures that one clone task can't release
 * an fd while another clone is opening it.
 */
SYSCALL_DEFINE1(close, unsigned int, fd)
{
	struct file * filp;
	struct files_struct *files = current->files;
	struct fdtable *fdt;
	int retval;

	spin_lock(&files->file_lock);
	/*
	 * Fetch the fd table. In include/linux/fdtable.h:
	 *   #define files_fdtable(files) (rcu_dereference((files)->fdt))
	 */
	fdt = files_fdtable(files);
	if (fd >= fdt->max_fds) {
		goto out_unlock;
	}
	/* The struct file currently installed at this descriptor, if any. */
	filp = fdt->fd[fd];
	if (!filp) {
		goto out_unlock;
	}
	/* Clear the table slot so the descriptor no longer refers to the file. */
	rcu_assign_pointer(fdt->fd[fd], NULL);
	FD_CLR(fd, fdt->close_on_exec);
	/*
	 * Recycle the descriptor number: __put_unused_fd() clears fd's bit in
	 * fdt->open_fds and, when fd is below files->next_fd, lowers
	 * files->next_fd to fd so that a later open can reuse it.
	 */
	__put_unused_fd(files, fd);
	spin_unlock(&files->file_lock);
	retval = filp_close(filp, files);

	/* can't restart close syscall because file table entry was cleared */
	if (unlikely(retval == -ERESTARTSYS ||
		     retval == -ERESTARTNOINTR ||
		     retval == -ERESTARTNOHAND ||
		     retval == -ERESTART_RESTARTBLOCK)) {
		retval = -EINTR;
	}

	return retval;

out_unlock:
	spin_unlock(&files->file_lock);
	return -EBADF;
}
EXPORT_SYMBOL(sys_close);

對于sys_close(),我們需要重點跟進2個函數: rcu_assign_pointer(fdt->fd[fd], NULL);、retval = filp_close(filp, files);

\linux-2.6.32.63\include\linux\rcupdate.h

/**
 * rcu_assign_pointer - assign (publicize) a pointer to a newly
 * initialized structure that will be dereferenced by RCU read-side
 * critical sections.  Returns the value assigned.
 *
 * Inserts memory barriers on architectures that require them
 * (pretty much all of them other than x86), and also prevents
 * the compiler from reordering the code that initializes the
 * structure after the pointer assignment.  More importantly, this
 * call documents which pointers will be dereferenced by RCU read-side
 * code.
 *
 * The write barrier is skipped only when @v is a compile-time constant
 * equal to NULL.
 */
#define rcu_assign_pointer(p, v) \
	({ \
		if (!__builtin_constant_p(v) || \
		    ((v) != NULL)) \
			smp_wmb(); \
		(p) = (v); \
	})

我們知道,每個進程在kernel中都有一個task_struct與之對應,而通過task_struct可以間接地獲得一個fd_array[]數組,表示當前進程已經打開的文件,數組的下標就是文件描述符的值,每一個元素則是對應的struct file*指針,只有通過這個fd_array[x]才能獲取當前進程打開的文件的struct file*,而rcu_assign_pointer(fdt->fd[fd], NULL)的作用就在于將這個數組的指定元素置空,即斷開了這個引用的關系,至于之后內核中的那個struct file是否釋放,那是內存回收的事,至少現在進程想通過task_struct是無法再引用到之前打開過的文件了,這里面的關系圖可以參閱:

http://www.cnblogs.com/LittleHann/p/3865490.html //搜索: 用一張圖表示task_struct、fs_struct、files_struct、fdtable、file的關系

我們繼續分析retval = filp_close(filp, files);

\linux-2.6.32.63\fs\open.c

/*
 * filp_close - close one reference to an open file.
 * @filp: the open file
 * @id:   owner id passed to ->flush(), dnotify_flush() and
 *        locks_remove_posix()
 *
 * "id" is the POSIX thread ID. We use the
 * files pointer for this..
 *
 * Returns the result of the file's ->flush() operation, or 0 when there is
 * none or when the file count was already 0 (which is logged as an error).
 */
int filp_close(struct file *filp, fl_owner_t id)
{
	int retval = 0;

	/* Report a file whose count is already 0 and leave it untouched. */
	if (!file_count(filp)) {
		printk(KERN_ERR "VFS: Close: file count is 0\n");
		return 0;
	}

	if (filp->f_op && filp->f_op->flush) {
		retval = filp->f_op->flush(filp, id);
	}

	dnotify_flush(filp, id);
	locks_remove_posix(filp, id);
	fput(filp);
	return retval;
}

filp_close()在調用flush、清理dnotify和POSIX鎖之后,通過fput()釋放對struct file的一次引用;當引用計數降為0時,表示打開的文件的struct file內存空間才會被真正釋放。至此,當前進程中就再也沒有之前打開過的文件的任何痕跡了

Relevant Link:

http://blog.csdn.net/ce123_zhouwei/article/details/8459794

?

Copyright (c) 2014 LittleHann All rights reserved

?

轉(zhuǎn)載于:https://www.cnblogs.com/LittleHann/p/3932624.html

總結

以上是生活随笔為你收集整理的Linux Kernel File IO Syscall Kernel-Source-Code Analysis(undone)的全部內容,希望文章能夠幫你解決所遇到的問題。

如果覺得生活随笔網站內容還不錯,歡迎將生活随笔推薦給好友。