這裏僅考慮 buffered IO,因此不少 direct IO 的處理可以直接略過。先上調用路徑:
2)               |  vfs_write() {
2)               |    do_sync_write() {
2)               |      ocfs2_file_aio_write() {
2)   0.441 us    |        ocfs2_rw_lock();
2)               |        ocfs2_prepare_inode_for_write() {
2)   0.286 us    |          ocfs2_inode_unlock();
2)   1.521 us    |        }
2)               |        generic_file_buffered_write() {
2) + 10.056 us   |          ocfs2_inode_unlock();
2) ! 11929.80 us |        }
2) ! 11933.97 us |      }
2) ! 11934.52 us |    }
2) ! 11938.59 us |  }

//看出來 generic_file_buffered_write() 很耗時間,其中有個異步等待函數也耗時:

2)  ocfs2dc-7642  =>  jbd2/sd-7649
------------------------------------------
2) ! 714.081 us  |  filemap_fdatawait_range();
2) ! 1708.134 us |  filemap_fdatawait_range();
...
2231 static ssize_t ocfs2_file_aio_write(struct kiocb *iocb, 2232 const struct iovec *iov, 2233 unsigned long nr_segs, 2234 loff_t pos) 2235 { 2236 int ret, direct_io, appending, rw_level, have_alloc_sem = 0; 2237 int can_do_direct, has_refcount = 0; 2238 ssize_t written = 0; 2239 size_t ocount; /* original count */ 2240 size_t count; /* after file limit checks */ 2241 loff_t old_size, *ppos = &iocb->ki_pos; 2242 u32 old_clusters; 2243 struct file *file = iocb->ki_filp; 2244 struct inode *inode = file->f_path.dentry->d_inode; 2245 struct ocfs2_super *osb = OCFS2_SB(inode->i_sb); 2246 int full_coherency = !(osb->s_mount_opt & 2247 OCFS2_MOUNT_COHERENCY_BUFFERED); //一直等於0 2248 int unaligned_dio = 0; 2249 2250 trace_ocfs2_file_aio_write(inode, file, file->f_path.dentry, 2251 (unsigned long long)OCFS2_I(inode)->ip_blkno, 2252 file->f_path.dentry->d_name.len, 2253 file->f_path.dentry->d_name.name, 2254 (unsigned int)nr_segs); 2255 2256 if (iocb->ki_left == 0) 2257 return 0; 2258 2259 vfs_check_frozen(inode->i_sb, SB_FREEZE_WRITE); 2260 2261 appending = kiocb_is_append(iocb) ? 1 : 0; 2262 direct_io = kiocb_is_direct(iocb) ? 1 : 0; 2263 2264 mutex_lock(&inode->i_mutex); 2265 2266 ocfs2_iocb_clear_sem_locked(iocb); 2267 2268 relock: 2269 /* to match setattr's i_mutex -> i_alloc_sem -> rw_lock ordering */ 2270 if (direct_io) { 2271 down_read(&inode->i_alloc_sem); 2272 have_alloc_sem = 1; 2273 /* communicate with ocfs2_dio_end_io */ 2274 ocfs2_iocb_set_sem_locked(iocb); 2275 } 2276 2277 /* 2278 * Concurrent O_DIRECT writes are allowed with 2279 * mount_option "coherency=buffered". 2280 */ 2281 rw_level = (!direct_io || full_coherency); 2282 //rw_level=1 2283 ret = ocfs2_rw_lock(inode, rw_level); 2284 if (ret < 0) { 2285 mlog_errno(ret); 2286 goto out_sems; 2287 } 2288 2289 /* 2290 * O_DIRECT writes with "coherency=full" need to take EX cluster 2291 * inode_lock to guarantee coherency. 
2292 */ 2293 if (direct_io && full_coherency) { 2294 /* 2295 * We need to take and drop the inode lock to force 2296 * other nodes to drop their caches. Buffered I/O 2297 * already does this in write_begin(). 2298 */ 2299 ret = ocfs2_inode_lock(inode, NULL, 1); 2300 if (ret < 0) { 2301 mlog_errno(ret); 2302 goto out_sems; 2303 } 2304 2305 ocfs2_inode_unlock(inode, 1); 2306 } 2307 2308 can_do_direct = direct_io; //計算文件指針,針對-->setattr,refcount, direct io等特殊狀況進行處理; //通常狀況,只對inode_lock加讀鎖 2309 ret = ocfs2_prepare_inode_for_write(file, ppos, 2310 iocb->ki_left, appending, 2311 &can_do_direct, &has_refcount); 2312 if (ret < 0) { 2313 mlog_errno(ret); 2314 goto out; 2315 }
割...
這段,咱們只需要關心 generic_file_buffered_write(),拿出來單獨分析。
2317 if (direct_io && !is_sync_kiocb(iocb)) 2318 unaligned_dio = ocfs2_is_io_unaligned(inode, iocb->ki_left, 2319 *ppos); 2320 2321 /* 2322 * We can't complete the direct I/O as requested, fall back to 2323 * buffered I/O. 2324 */ 2325 if (direct_io && !can_do_direct) { 2326 ocfs2_rw_unlock(inode, rw_level); 2327 up_read(&inode->i_alloc_sem); 2328 2329 have_alloc_sem = 0; 2330 rw_level = -1; 2331 2332 direct_io = 0; 2333 goto relock; 2334 } 2335 2336 if (unaligned_dio) { 2337 /* 2338 * Wait on previous unaligned aio to complete before 2339 * proceeding. 2340 */ 2341 ocfs2_aiodio_wait(inode); 2342 2343 /* Mark the iocb as needing a decrement in ocfs2_dio_end_io */ 2344 atomic_inc(&OCFS2_I(inode)->ip_unaligned_aio); 2345 ocfs2_iocb_set_unaligned_aio(iocb); 2346 } 2347 2348 /* 2349 * To later detect whether a journal commit for sync writes is 2350 * necessary, we sample i_size, and cluster count here. 2351 */ 2352 old_size = i_size_read(inode); 2353 old_clusters = OCFS2_I(inode)->ip_clusters; 2354 2355 /* communicate with ocfs2_dio_end_io */ 2356 ocfs2_iocb_set_rw_locked(iocb, rw_level); 2357 2358 ret = generic_segment_checks(iov, &nr_segs, &ocount, 2359 VERIFY_READ); 2360 if (ret) 2361 goto out_dio; 2362 2363 count = ocount; 2364 ret = generic_write_checks2(iocb, ppos, &count, 2365 S_ISBLK(inode->i_mode)); 2366 if (ret) 2367 goto out_dio; 2368 2369 if (direct_io) { 2370 written = generic_file_direct_write(iocb, iov, &nr_segs, *ppos, 2371 ppos, count, ocount); 2372 if (written < 0) { 2373 ret = written; 2374 goto out_dio; 2375 } 2376 } else { 2377 current->backing_dev_info = file->f_mapping->backing_dev_info; 2378 written = generic_file_buffered_write(iocb, iov, nr_segs, *ppos, 2379 ppos, count, 0); 2380 current->backing_dev_info = NULL; 2381 } 2382
割...
2383 out_dio: //不支持buffered aio啦 2384 /* buffered aio wouldn't have proper lock coverage today */ 2385 BUG_ON(ret == -EIOCBQUEUED && !kiocb_is_direct(iocb)); 2386 2387 if (((file->f_flags & O_DSYNC) && !direct_io) || IS_SYNC(inode) || 2388 ((file->f_flags & O_DIRECT) && !direct_io)) { //除非inode的I_SYNC置位,即正在回寫inode,咱們才能進來; //why? don't know yet; //測了下,沒有進來 2389 ret = filemap_fdatawrite_range(file->f_mapping, pos, 2390 pos + count - 1); 2391 if (ret < 0) 2392 written = ret; 2393 2394 if (!ret) { 2395 ret = jbd2_journal_force_commit(osb->journal->j_journal); 2396 if (ret < 0) 2397 written = ret; 2398 } 2399 2400 if (!ret) 2401 ret = filemap_fdatawait_range(file->f_mapping, pos, 2402 pos + count - 1); 2403 } 2404 2405 /* 2406 * deep in g_f_a_w_n()->ocfs2_direct_IO we pass in a ocfs2_dio_end_io 2407 * function pointer which is called when o_direct io completes so that 2408 * it can unlock our rw lock. (it's the clustered equivalent of 2409 * i_alloc_sem; protects truncate from racing with pending ios). 2410 * Unfortunately there are error cases which call end_io and others 2411 * that don't. so we don't have to unlock the rw_lock if either an 2412 * async dio is going to do it in the future or an end_io after an 2413 * error has already done it. 2414 */ 2415 if ((ret == -EIOCBQUEUED) || (!ocfs2_iocb_is_rw_locked(iocb))) { 2416 rw_level = -1; 2417 have_alloc_sem = 0; 2418 unaligned_dio = 0; 2419 } 2420 2421 if (unaligned_dio) 2422 atomic_dec(&OCFS2_I(inode)->ip_unaligned_aio); 2423 2424 out: 2425 if (rw_level != -1) 2426 ocfs2_rw_unlock(inode, rw_level); 2427 2428 out_sems: 2429 if (have_alloc_sem) { 2430 up_read(&inode->i_alloc_sem); 2431 ocfs2_iocb_clear_sem_locked(iocb); 2432 } 2433 2434 mutex_unlock(&inode->i_mutex); 2435 2436 if (written) 2437 ret = written; 2438 return ret; 2439 }