RAID5 IO Handling: A Detailed Look at the Stripe Read Code

A stripe read is entered not only when a read in the aligned-read path fails and is retried through the stripe, but also when an IO's range spans more than one chunk (for example, an IO of size 4K starting at offset 1K issued to a RAID array with a 4K chunk size). The rest of this article analyzes that logic.
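To make the boundary condition concrete, here is a minimal userspace sketch, not kernel code; io_crosses_chunk and its parameters are names invented for this illustration:

```c
#include <stdbool.h>
#include <stdint.h>
#include <stdio.h>

/*
 * Hypothetical helper: returns true when an IO spans more than one
 * chunk and therefore cannot take the aligned-read fast path.
 */
static bool io_crosses_chunk(uint64_t start_bytes, uint64_t len_bytes,
                             uint64_t chunk_bytes)
{
    uint64_t first_chunk = start_bytes / chunk_bytes;
    uint64_t last_chunk = (start_bytes + len_bytes - 1) / chunk_bytes;

    return first_chunk != last_chunk;
}

int main(void)
{
    /* The example from the text: 4K chunk, IO covering bytes [1K, 5K).
     * It touches chunks 0 and 1, so it takes the stripe-read path. */
    printf("%d\n", io_crosses_chunk(1024, 4096, 4096)); /* prints 1 */
    return 0;
}
```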
1 Adding the IO to a List

First, the bio is attached by add_stripe_bio() to the toread list of the member-disk device referenced by the stripe head. The code is shown below:
```c
/* Only the logic relevant to read requests is kept */
static int add_stripe_bio(struct stripe_head *sh, struct bio *bi,
                          int dd_idx, int forwrite)
{
    struct bio **bip;
    struct r5conf *conf = sh->raid_conf;

    spin_lock_irq(&sh->stripe_lock);
    /*
     * Take the address of this dev's toread pointer. Bios are kept
     * sorted by start sector, and a double pointer makes the sorted
     * insertion below straightforward.
     */
    bip = &sh->dev[dd_idx].toread;
    /*
     * Walk the pending read bios and check whether the new bio's range
     * overlaps an existing one. On overlap, jump to the overlap label,
     * set the flag and return 0; the caller must wait for the
     * conflicting bio to complete before trying again.
     */
    while (*bip && (*bip)->bi_sector < bi->bi_sector) {
        if (bio_end_sector(*bip) > bi->bi_sector)
            goto overlap;
        bip = &(*bip)->bi_next;
    }
    if (*bip && (*bip)->bi_sector < bio_end_sector(bi))
        goto overlap;

    BUG_ON(*bip && bi->bi_next && (*bip) != bi->bi_next);
    /* Insert the bio into the toread list, ordered by start sector */
    if (*bip)
        bi->bi_next = *bip;
    *bip = bi;
    /* Bump the bio's bi_phys_segments counter */
    raid5_inc_bi_active_stripes(bi);
    spin_unlock_irq(&sh->stripe_lock);
    return 1;

overlap:
    set_bit(R5_Overlap, &sh->dev[dd_idx].flags);
    spin_unlock_irq(&sh->stripe_lock);
    return 0;
}
```
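The pointer-to-pointer idiom above avoids special-casing the list head. The following standalone userspace sketch shows the same sorted-insertion technique; all names are invented for the illustration:

```c
#include <stdio.h>

struct node {
    int key;           /* stands in for bi->bi_sector */
    struct node *next; /* stands in for bi->bi_next   */
};

/* Insert n into a singly linked list kept sorted by key */
static void insert_sorted(struct node **head, struct node *n)
{
    struct node **pp = head;

    /* Advance until *pp is the first node with a key >= n->key */
    while (*pp && (*pp)->key < n->key)
        pp = &(*pp)->next;

    /* Splice n in; identical code whether pp is the head or a ->next */
    n->next = *pp;
    *pp = n;
}

int main(void)
{
    struct node a = { 30, NULL }, b = { 10, NULL }, c = { 20, NULL };
    struct node *head = NULL;

    insert_sorted(&head, &a);
    insert_sorted(&head, &b);
    insert_sorted(&head, &c);

    for (struct node *p = head; p; p = p->next)
        printf("%d\n", p->key); /* prints 10 20 30 */
    return 0;
}
```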
2 Stripe Handling

The entry point for stripe handling is handle_active_stripes(), shown below:

```c
#define MAX_STRIPE_BATCH 8
static int handle_active_stripes(struct r5conf *conf)
{
    struct stripe_head *batch[MAX_STRIPE_BATCH], *sh;
    int i, batch_size = 0;

    while (batch_size < MAX_STRIPE_BATCH &&
           /* Fetch one pending stripe according to its priority */
           (sh = __get_priority_stripe(conf)) != NULL)
        batch[batch_size++] = sh;
    if (batch_size == 0)
        return batch_size;

    spin_unlock_irq(&conf->device_lock);

    /* Process each stripe in the batch with handle_stripe() */
    for (i = 0; i < batch_size; i++)
        handle_stripe(batch[i]);

    cond_resched();

    spin_lock_irq(&conf->device_lock);
    for (i = 0; i < batch_size; i++)
        __release_stripe(conf, batch[i]);
    return batch_size;
}
```

Note that the stripes are collected while device_lock is held, but the expensive handle_stripe() calls run with the lock dropped; a standalone sketch of this batching pattern follows the scenario list below. handle_stripe() is the main stripe-handling function: a stripe needs several calls to handle_stripe() and related functions from start to finish. This article discusses the following four scenarios:

  • Successful read
  • The disk holding the IO is faulty
  • The read IO returns an error
  • The array has lost more disks than its redundancy can tolerate
The sections below analyze the code for each round of processing in each scenario; the quoted code contains only the parts relevant to the step under discussion.
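As promised above, here is a userspace sketch of the lock-drop batching pattern used by handle_active_stripes(); this is not kernel code, and every name in it is invented for the illustration:

```c
#include <pthread.h>
#include <stdio.h>

#define MAX_BATCH 8

struct item {
    int id;
    struct item *next;
};

static struct item *pending;  /* protected by queue_lock */
static pthread_mutex_t queue_lock = PTHREAD_MUTEX_INITIALIZER;

static int handle_batch(void)
{
    struct item *batch[MAX_BATCH];
    int i, n = 0;

    /* Drain up to MAX_BATCH items while holding the lock,
     * like the __get_priority_stripe() loop */
    pthread_mutex_lock(&queue_lock);
    while (n < MAX_BATCH && pending) {
        batch[n++] = pending;
        pending = pending->next;
    }
    pthread_mutex_unlock(&queue_lock);

    if (n == 0)
        return 0;

    /* The expensive per-item work runs without the lock,
     * like the handle_stripe() loop */
    for (i = 0; i < n; i++)
        printf("processing item %d\n", batch[i]->id);

    return n;
}

int main(void)
{
    struct item items[3] = { { 1, NULL }, { 2, NULL }, { 3, NULL } };

    items[0].next = &items[1];
    items[1].next = &items[2];
    pending = &items[0];
    handle_batch();
    return 0;
}
```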
2.1 Successful Read

A normal stripe read goes through the following three rounds of stripe handling; once the read succeeds, the data is returned to the caller.
2.1.1 Issuing the Read Request

The function call chain is as follows:
```
handle_stripe()
 \_ analyse_stripe()
 \_ handle_stripe_fill()
     \_ fetch_block()
 \_ ops_run_io()
```

The logic executed by each function is as follows:
```c
static void handle_stripe(struct stripe_head *sh)
{
    /* Analyse the current state of the stripe */
    analyse_stripe(sh, &s);

    /* s.to_read is non-zero, so handle_stripe_fill() is entered */
    if (s.to_read || s.non_overwrite
        || (conf->level == 6 && s.to_write && s.failed)
        || (s.syncing && (s.uptodate + s.compute < disks))
        || s.replacing
        || s.expanding)
        handle_stripe_fill(sh, &s, disks);

    /* Check whether there are requests that need to be issued */
    ops_run_io(sh, &s);
}

static void analyse_stripe(struct stripe_head *sh, struct stripe_head_state *s)
{
    rcu_read_lock();
    for (i = disks; i--; ) {
        /* Count pending read requests */
        if (dev->toread)
            s->to_read++;
        /* The stripe/device state is normal */
        if (test_bit(In_sync, &rdev->flags))
            set_bit(R5_Insync, &dev->flags);
    }
    rcu_read_unlock();
}

static void handle_stripe_fill(struct stripe_head *sh,
                               struct stripe_head_state *s,
                               int disks)
{
    int i;

    /* None of these state flags is set on the stripe yet,
     * so the condition holds and we enter the if */
    if (!test_bit(STRIPE_COMPUTE_RUN, &sh->state) && !sh->check_state &&
        !sh->reconstruct_state)
        for (i = disks; i--; )
            if (fetch_block(sh, s, i, disks))
                break;
    set_bit(STRIPE_HANDLE, &sh->state);
}

static int fetch_block(struct stripe_head *sh, struct stripe_head_state *s,
                       int disk_idx, int disks)
{
    struct r5dev *dev = &sh->dev[disk_idx];
    struct r5dev *fdev[2] = { &sh->dev[s->failed_num[0]],
                              &sh->dev[s->failed_num[1]] };

    /* No IO has been issued to this dev yet, so neither R5_LOCKED
     * nor R5_UPTODATE is set */
    if (!test_bit(R5_LOCKED, &dev->flags) &&
        !test_bit(R5_UPTODATE, &dev->flags) &&
        /* dev->toread is non-NULL, so the outer if is entered */
        dev->toread) {
        /* R5_Insync was set in analyse_stripe() */
        if (test_bit(R5_Insync, &dev->flags)) {
            /* R5_LOCKED marks the disk as having IO in flight */
            set_bit(R5_LOCKED, &dev->flags);
            /* R5_Wantread marks that a read must be issued */
            set_bit(R5_Wantread, &dev->flags);
            /* Count the devs with IO in flight */
            s->locked++;
        }
    }
    return 0;
}

static void ops_run_io(struct stripe_head *sh, struct stripe_head_state *s)
{
    struct r5conf *conf = sh->raid_conf;
    int i, disks = sh->disks;

    might_sleep();

    for (i = disks; i--; ) {
        int rw;
        int replace_only = 0;
        struct bio *bi, *rbi;
        struct md_rdev *rdev, *rrdev;

        if (test_and_clear_bit(R5_Wantwrite, &sh->dev[i].flags)) {
            if (test_and_clear_bit(R5_WantFUA, &sh->dev[i].flags))
                rw = WRITE_FUA;
            else
                rw = WRITE;
            if (test_bit(R5_Discard, &sh->dev[i].flags))
                rw |= REQ_DISCARD;
        } else if (test_and_clear_bit(R5_Wantread, &sh->dev[i].flags))
            /* Set the request type to read */
            rw = READ;
        else if (test_and_clear_bit(R5_WantReplace,
                                    &sh->dev[i].flags)) {
            rw = WRITE;
            replace_only = 1;
        } else
            /* Nothing pending on this dev; skip it */
            continue;

        bi = &sh->dev[i].req;
        rbi = &sh->dev[i].rreq; /* For writing to replacement */

        rcu_read_lock();
        rrdev = rcu_dereference(conf->disks[i].replacement);
        smp_mb(); /* Ensure that if rrdev is NULL, rdev won't be */
        rdev = rcu_dereference(conf->disks[i].rdev);
        rcu_read_unlock(); /* (reference counting trimmed from this excerpt) */

        if (rdev) {
            set_bit(STRIPE_IO_STARTED, &sh->state);

            /*
             * Set up the bio: point it at the member block device and
             * set the start sector and the IO completion callback.
             */
            bio_reset(bi);
            bi->bi_bdev = rdev->bdev;
            bi->bi_rw = rw;
            bi->bi_end_io = (rw & WRITE)
                ? raid5_end_write_request
                : raid5_end_read_request;
            bi->bi_private = sh;

            atomic_inc(&sh->count);
            if (use_new_offset(conf, sh))
                bi->bi_sector = (sh->sector + rdev->new_data_offset);
            else
                bi->bi_sector = (sh->sector + rdev->data_offset);
            if (test_bit(R5_ReadNoMerge, &sh->dev[i].flags))
                bi->bi_rw |= REQ_NOMERGE;

            bi->bi_vcnt = 1;
            bi->bi_io_vec[0].bv_len = STRIPE_SIZE;
            bi->bi_io_vec[0].bv_offset = 0;
            bi->bi_size = STRIPE_SIZE;
            if (rrdev)
                set_bit(R5_DOUBLE_LOCKED, &sh->dev[i].flags);
            /* Submit the request to the underlying block device */
            generic_make_request(bi);
        }
    }
}
```
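To recap this first round: analyse_stripe() counts dev->toread and marks in-sync devs, fetch_block() flags the devs that need reading, and ops_run_io() turns those flags into bios. The toy userspace model below walks the same flag handshake; only the R5_* flag names come from the kernel, everything else is invented for the illustration:

```c
#include <stdio.h>

/* Flag names mirror the kernel's; values are arbitrary here */
enum {
    R5_LOCKED   = 1 << 0,
    R5_Wantread = 1 << 1,
    R5_Insync   = 1 << 2,
};

struct toy_dev {
    unsigned flags;
    int has_toread; /* stands in for dev->toread != NULL */
};

/* Round 1, fetch_block(): mark an in-sync dev with a pending read */
static void toy_fetch_block(struct toy_dev *dev)
{
    if (dev->has_toread && (dev->flags & R5_Insync))
        dev->flags |= R5_LOCKED | R5_Wantread;
}

/* Round 1, ops_run_io(): consume R5_Wantread and issue the READ;
 * R5_LOCKED stays set until the completion callback runs */
static void toy_ops_run_io(struct toy_dev *dev)
{
    if (dev->flags & R5_Wantread) {
        dev->flags &= ~R5_Wantread; /* like test_and_clear_bit() */
        printf("issue READ, R5_LOCKED still set: %d\n",
               !!(dev->flags & R5_LOCKED));
    }
}

int main(void)
{
    struct toy_dev dev = { .flags = R5_Insync, .has_toread = 1 };

    toy_fetch_block(&dev);
    toy_ops_run_io(&dev); /* prints: issue READ, R5_LOCKED still set: 1 */
    return 0;
}
```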
