XFS Source Code Reading -- read/write operation (Part 1/2)

In this article, we will trace a read/write operation from user space all the way down to the XFS source code.

  • From user space to system call

In user space, there are many read and write APIs. Take printf as an example:

1 #include <stdio.h>
2 int printf(const char *format, ...); 

 printf() uses putc(), which in turn calls int write(int,void*,int) from user.h.

 

But we cannot find the implementation of int write(int,void*,int) in the library source code.

However, when compiled, a call to int write(int,void*,int) is translated to:

1 push a
2 push b
3 push c
4 call write

Then we can find the definition of write, written in assembly, in usys.S:

1 .global write;
2 write:
3 movl $SYS_write, %eax;
4 int $T_SYSCALL;
5 ret
  • From system call to the virtual file system

 The virtual file system (VFS) is a kernel software layer that handles all system calls related to file system operations. It offers a unified interface to all the different types of file systems.

In this section we will see how a read/write request reaches the VFS-level interface.

When handling a system call, the kernel uses the system call number to identify which kernel function is to be called.

 1 cat /usr/include/asm/unistd_64.h
 2 #ifndef __SYSCALL
 3 #define __SYSCALL(a, b)
 4 #define __NR_read                               0
 5 __SYSCALL(__NR_read, sys_read)
 6 #define __NR_write                              1 
 7
 8 __SYSCALL(__NR_write, sys_write)
 9 ...

So, tracing further down, the next functions are sys_read and sys_write. However, in the kernel source code we cannot find a direct implementation of either one. The only thing we find is the declaration of the read and write functions (in include/linux/syscalls.h):

 1 asmlinkage long sys_read(unsigned int fd, char __user *buf, size_t count);
 2 ... 
 6 asmlinkage long sys_write(unsigned int fd, const char __user *buf,
 7                           size_t count);

 Where is the implementation?

 The crux is macro replacement. 

In fs/read_write.c, we can find the implementation of the sys_read and sys_write functions:

 1 SYSCALL_DEFINE3(read, unsigned int, fd, char __user *, buf, size_t, count)
 2 {
 3     struct fd f = fdget_pos(fd);
 4     ssize_t ret = -EBADF;
 5 
 6     if (f.file) {
 7         loff_t pos = file_pos_read(f.file);
 8         ret = vfs_read(f.file, buf, count, &pos);
 9         if (ret >= 0)
10             file_pos_write(f.file, pos);
11         fdput_pos(f);
12     }
13     return ret;
14 }
15 
16 SYSCALL_DEFINE3(write, unsigned int, fd, const char __user *, buf,
17         size_t, count)
18 {
19     struct fd f = fdget_pos(fd);
20     ssize_t ret = -EBADF;
21 
22     if (f.file) {
23         loff_t pos = file_pos_read(f.file);
24         ret = vfs_write(f.file, buf, count, &pos);
25         if (ret >= 0)
26             file_pos_write(f.file, pos);
27         fdput_pos(f);
28     }
29 
30     return ret;
31 }

 

 If you trace the macro SYSCALL_DEFINE3, you will find that it expands each function definition above into a function whose signature matches the declaration shown earlier.

We can see that sys_read and sys_write first retrieve the fd struct and read the current file position, and then call the virtual file system interfaces vfs_read and vfs_write.

 

  • From VFS to XFS

 How does the operating system know which file system provides the actual implementation of read and write?

In fs/read_write.c, we can find the implementation of vfs_read and vfs_write:

 1 ssize_t vfs_read(struct file *file, char __user *buf, size_t count, loff_t *pos)
 2 {
 3     ssize_t ret;
 4 
 5     if (!(file->f_mode & FMODE_READ))
 6         return -EBADF;
 7     if (!file->f_op->read && !file->f_op->aio_read)
 8         return -EINVAL;
 9     if (unlikely(!access_ok(VERIFY_WRITE, buf, count)))
10         return -EFAULT;
11 
12     ret = rw_verify_area(READ, file, pos, count);
13     if (ret >= 0) {
14         count = ret;
15         if (file->f_op->read)
16             ret = file->f_op->read(file, buf, count, pos);
17         else
18             ret = do_sync_read(file, buf, count, pos);
19         if (ret > 0) {
20             fsnotify_access(file);
21             add_rchar(current, ret);
22         }
23         inc_syscr(current);
24     }
25 
26     return ret;
27 }

 

 

 1 ssize_t vfs_write(struct file *file, const char __user *buf, size_t count, loff_t *pos)
 2 {
 3     ssize_t ret;
 4 
 5     if (!(file->f_mode & FMODE_WRITE))
 6         return -EBADF;
 7     if (!file->f_op->write && !file->f_op->aio_write)
 8         return -EINVAL;
 9     if (unlikely(!access_ok(VERIFY_READ, buf, count)))
10         return -EFAULT;
11 
12     ret = rw_verify_area(WRITE, file, pos, count);
13     if (ret >= 0) {
14         count = ret;
15         file_start_write(file);
16         if (file->f_op->write)
17             ret = file->f_op->write(file, buf, count, pos);
18         else
19             ret = do_sync_write(file, buf, count, pos);
20         if (ret > 0) {
21             fsnotify_modify(file);
22             add_wchar(current, ret);
23         }
24         inc_syscw(current);
25         file_end_write(file);
26     }
27 
28     return ret;
29 }

 The actual read and write are done by ret = file->f_op->read(file, buf, count, pos); and ret = file->f_op->write(file, buf, count, pos); .

 The file-manipulating function pointers are stored in struct file (through its f_op pointer). So we can take a look at the definition of the struct (include/linux/fs.h).

 1 struct file {
 2     union {
 3         struct llist_node    fu_llist;
 4         struct rcu_head     fu_rcuhead;
 5     } f_u;
 6     struct path        f_path;
 7 #define f_dentry    f_path.dentry
 8     struct inode        *f_inode;    /* cached value */
 9     const struct file_operations    *f_op;
10 
11     /*
12      * Protects f_ep_links, f_flags.
13      * Must not be taken from IRQ context.
14      */
15     spinlock_t        f_lock;
16     atomic_long_t        f_count;
17     unsigned int         f_flags;
18     fmode_t            f_mode;
19     struct mutex        f_pos_lock;
20     loff_t            f_pos;
21     struct fown_struct    f_owner;
22     const struct cred    *f_cred;
23     struct file_ra_state    f_ra;
24 
25     u64            f_version;
26 #ifdef CONFIG_SECURITY
27     void            *f_security;
28 #endif
29     /* needed for tty driver, and maybe others */
30     void            *private_data;
31 
32 #ifdef CONFIG_EPOLL
33     /* Used by fs/eventpoll.c to link all the hooks to this file */
34     struct list_head    f_ep_links;
35     struct list_head    f_tfile_llink;
36 #endif /* #ifdef CONFIG_EPOLL */
37     struct address_space    *f_mapping;
38 #ifdef CONFIG_DEBUG_WRITECOUNT
39     unsigned long f_mnt_write_state;
40 #endif
41 } __attribute__((aligned(4)));    /* lest something weird decides that 2 is OK */

 In fs/xfs/xfs_file.c, the XFS versions of the read and write functions are registered in the file_operations tables:

const struct file_operations xfs_file_operations = {
    .llseek        = xfs_file_llseek,
    .read = do_sync_read,
    .write        = do_sync_write,
    .aio_read    = xfs_file_aio_read,
    .aio_write    = xfs_file_aio_write,
    .splice_read    = xfs_file_splice_read,
    .splice_write    = xfs_file_splice_write,
    .unlocked_ioctl    = xfs_file_ioctl,
#ifdef CONFIG_COMPAT
    .compat_ioctl    = xfs_file_compat_ioctl,
#endif
    .mmap        = xfs_file_mmap,
    .open        = xfs_file_open,
    .release    = xfs_file_release,
    .fsync        = xfs_file_fsync,
    .fallocate    = xfs_file_fallocate,
};

const struct file_operations xfs_dir_file_operations = {
    .open        = xfs_dir_open,
    .read        = generic_read_dir,
    .iterate    = xfs_file_readdir,
    .llseek        = generic_file_llseek,
    .unlocked_ioctl    = xfs_file_ioctl,
#ifdef CONFIG_COMPAT
    .compat_ioctl    = xfs_file_compat_ioctl,
#endif
    .fsync        = xfs_dir_fsync,
};

 

I have not yet tracked down how a file's file_operations gets set to xfs_file_operations. My guess is that something is done at mount time to record the file_operations, or else at the time the fd struct is created. I will add this part soon to complete the whole process.

For Part (2/2), I will track the exact operations in the XFS implementation.

 

Reference:

http://blog.chinaunix.net/uid-28362602-id-3424404.html

http://blog.csdn.net/xzyiverson/article/details/12676847

Daniel P. Bovet and Marco Cesati, Understanding the Linux Kernel, 1st Edition, O'Reilly, 2001. 

相關文章
相關標籤/搜索