In this article, we will track a read/write operation from user space all the way down to the XFS source code.
From user space, there are many read and write APIs. Take printf as an example:
1 #include <stdio.h> 2 int printf(const char *format, ...);
printf() uses putc(), which in turn calls int write(int,void*,int) declared in user.h.
But we cannot find the implementation of int write(int,void*,int) in the library source code.
However, when a call to int write(int,void*,int) is compiled, it is translated to:
1 push a 2 push b 3 push c 4 call write
Then we can find the definition of write, written in assembly, in usys.S:
1 .global write; 2 write: 3 movl $SYS_write, %eax; 4 int $T_SYSCALL; 5 ret
The virtual file system (VFS) is a kernel software layer that handles all system calls related to file system operations. It offers a unified interface to all the different types of file systems.
In this section we will see how a read/write request reaches the VFS-level interface.
During a system call, the system call number is used to identify which kernel function is to be called.
1 cat /usr/include/asm/unistd_64.h 2 #ifndef __SYSCALL 3 #define __SYSCALL(a, b) 4 #define __NR_read 0 5 __SYSCALL(__NR_read, sys_read) 6 #define __NR_write 1
8 __SYSCALL(__NR_write, sys_write)
9 ...
So, to keep tracking down, the next functions are sys_read and sys_write. However, in the kernel source code, we cannot find a direct implementation of either one. The only thing we find is the declarations of the read and write functions (in include/linux/syscalls.h):
1 asmlinkage long sys_read(unsigned int fd, char __user *buf, size_t count); 2 ...
6 asmlinkage long sys_write(unsigned int fd, const char __user *buf, 7 size_t count);
Where is the implementation?
The crux is macro replacement.
In fs/read_write.c, we can find the implementations of the sys_read and sys_write functions:
1 SYSCALL_DEFINE3(read, unsigned int, fd, char __user *, buf, size_t, count) 2 { 3 struct fd f = fdget_pos(fd); 4 ssize_t ret = -EBADF; 5 6 if (f.file) { 7 loff_t pos = file_pos_read(f.file); 8 ret = vfs_read(f.file, buf, count, &pos); 9 if (ret >= 0) 10 file_pos_write(f.file, pos); 11 fdput_pos(f); 12 } 13 return ret; 14 } 15 16 SYSCALL_DEFINE3(write, unsigned int, fd, const char __user *, buf, 17 size_t, count) 18 { 19 struct fd f = fdget_pos(fd); 20 ssize_t ret = -EBADF; 21 22 if (f.file) { 23 loff_t pos = file_pos_read(f.file); 24 ret = vfs_write(f.file, buf, count, &pos); 25 if (ret >= 0) 26 file_pos_write(f.file, pos); 27 fdput_pos(f); 28 } 29 30 return ret; 31 }
If you trace the macro SYSCALL_DEFINE3, you will find that it expands each definition above into a function whose signature matches the corresponding declaration shown earlier.
We can see that sys_read and sys_write first retrieve the fd struct and look up the current file position, and then call the virtual file system interfaces vfs_read and vfs_write.
How does the operating system know which file system provides the actual implementation of read and write?
In fs/read_write.c, we can find the implementations of vfs_read and vfs_write:
1 ssize_t vfs_read(struct file *file, char __user *buf, size_t count, loff_t *pos) 2 { 3 ssize_t ret; 4 5 if (!(file->f_mode & FMODE_READ)) 6 return -EBADF; 7 if (!file->f_op->read && !file->f_op->aio_read) 8 return -EINVAL; 9 if (unlikely(!access_ok(VERIFY_WRITE, buf, count))) 10 return -EFAULT; 11 12 ret = rw_verify_area(READ, file, pos, count); 13 if (ret >= 0) { 14 count = ret; 15 if (file->f_op->read) 16 ret = file->f_op->read(file, buf, count, pos); 17 else 18 ret = do_sync_read(file, buf, count, pos); 19 if (ret > 0) { 20 fsnotify_access(file); 21 add_rchar(current, ret); 22 } 23 inc_syscr(current); 24 } 25 26 return ret; 27 }
1 ssize_t vfs_write(struct file *file, const char __user *buf, size_t count, loff_t *pos) 2 { 3 ssize_t ret; 4 5 if (!(file->f_mode & FMODE_WRITE)) 6 return -EBADF; 7 if (!file->f_op->write && !file->f_op->aio_write) 8 return -EINVAL; 9 if (unlikely(!access_ok(VERIFY_READ, buf, count))) 10 return -EFAULT; 11 12 ret = rw_verify_area(WRITE, file, pos, count); 13 if (ret >= 0) { 14 count = ret; 15 file_start_write(file); 16 if (file->f_op->write) 17 ret = file->f_op->write(file, buf, count, pos); 18 else 19 ret = do_sync_write(file, buf, count, pos); 20 if (ret > 0) { 21 fsnotify_modify(file); 22 add_wchar(current, ret); 23 } 24 inc_syscw(current); 25 file_end_write(file); 26 } 27 28 return ret; 29 }
The actual read and write are done by ret = file->f_op->read(file, buf, count, pos); and ret = file->f_op->write(file, buf, count, pos); respectively.
The file-manipulating function pointers are reached through struct file (via its f_op member), so let us take a look at the definition of that struct (include/linux/fs.h).
1 struct file { 2 union { 3 struct llist_node fu_llist; 4 struct rcu_head fu_rcuhead; 5 } f_u; 6 struct path f_path; 7 #define f_dentry f_path.dentry 8 struct inode *f_inode; /* cached value */ 9 const struct file_operations *f_op; 10 11 /* 12 * Protects f_ep_links, f_flags. 13 * Must not be taken from IRQ context. 14 */ 15 spinlock_t f_lock; 16 atomic_long_t f_count; 17 unsigned int f_flags; 18 fmode_t f_mode; 19 struct mutex f_pos_lock; 20 loff_t f_pos; 21 struct fown_struct f_owner; 22 const struct cred *f_cred; 23 struct file_ra_state f_ra; 24 25 u64 f_version; 26 #ifdef CONFIG_SECURITY 27 void *f_security; 28 #endif 29 /* needed for tty driver, and maybe others */ 30 void *private_data; 31 32 #ifdef CONFIG_EPOLL 33 /* Used by fs/eventpoll.c to link all the hooks to this file */ 34 struct list_head f_ep_links; 35 struct list_head f_tfile_llink; 36 #endif /* #ifdef CONFIG_EPOLL */ 37 struct address_space *f_mapping; 38 #ifdef CONFIG_DEBUG_WRITECOUNT 39 unsigned long f_mnt_write_state; 40 #endif 41 } __attribute__((aligned(4))); /* lest something weird decides that 2 is OK */
In fs/xfs/xfs_file.c, the XFS versions of the read and write functions are registered in the file_operations structures:
const struct file_operations xfs_file_operations = { .llseek = xfs_file_llseek, .read = do_sync_read, .write = do_sync_write, .aio_read = xfs_file_aio_read, .aio_write = xfs_file_aio_write, .splice_read = xfs_file_splice_read, .splice_write = xfs_file_splice_write, .unlocked_ioctl = xfs_file_ioctl, #ifdef CONFIG_COMPAT .compat_ioctl = xfs_file_compat_ioctl, #endif .mmap = xfs_file_mmap, .open = xfs_file_open, .release = xfs_file_release, .fsync = xfs_file_fsync, .fallocate = xfs_file_fallocate, }; const struct file_operations xfs_dir_file_operations = { .open = xfs_dir_open, .read = generic_read_dir, .iterate = xfs_file_readdir, .llseek = generic_file_llseek, .unlocked_ioctl = xfs_file_ioctl, #ifdef CONFIG_COMPAT .compat_ioctl = xfs_file_compat_ioctl, #endif .fsync = xfs_dir_fsync, };
I have not yet tracked down how the file system links a file's file_operations to xfs_file_operations. My guess is that something is done at mount time to record the file_operations, or that it happens when the file struct is created. I will add this part soon to complete the whole picture.
For Part (2/2), I will track the exact operations in the XFS implementation.
Reference:
http://blog.chinaunix.net/uid-28362602-id-3424404.html
http://blog.csdn.net/xzyiverson/article/details/12676847
Daniel P. Bovet and Marco Cesati, Understanding the Linux Kernel, 1st Edition, O'Reilly, 2001.