

UBIFS File System Analysis


Part 1: Analyzing the code that mounts UBIFS

I have been reading the UBIFS source on and off for a long time without writing anything down, because I wanted to wait until I understood UBIFS systematically before writing. But work has been busy, my reading keeps getting interrupted, and I constantly have to go back and re-learn what I had already covered, which wastes a lot of time, so I decided to write something up as a record. Before reading the UBIFS source it is worth reading two documents on the design: 《UBI - Unsorted Block Images》 (ubidesign.pdf) and 《A Brief Introduction to the Design of UBIFS》 (A Brief Introduction to the Design of UBIFS.pdf). These two concisely introduce the structures and considerations behind the UBIFS design.

We will analyze the code following the sequence of steps used to mount ubifs:

(1) ubiattach /dev/ubi_ctrl -m 0

(2) ubimkvol /dev/ubi0 -N ubifs -s 15MiB

(3) mount -t ubifs ubi0:ubifs /mnt

Let's start with step (1). The corresponding code is the ubi_attach_mtd_dev() function; below we follow the code to see exactly what it does.

1. ubi_attach_mtd_dev

int ubi_attach_mtd_dev(struct mtd_info *mtd, int ubi_num, int vid_hdr_offset)

{

// ubi_num and vid_hdr_offset are the parameters passed in from the command

       struct ubi_device *ubi;

       int i, err, do_free = 1;

 

       /*

        * Check if we already have the same MTD device attached.

        *

        * Note, this function assumes that UBI device creations and deletions

        * are serialized, so it does not take the &ubi_devices_lock.

        */

       for(i = 0; i < UBI_MAX_DEVICES; i++) {

              ubi = ubi_devices[i];

              if (ubi && mtd->index == ubi->mtd->index) {

                     dbg_err("mtd%d is already attached to ubi%d",

                            mtd->index, i);

                     return -EEXIST;

              }

       }

// As the English comment above says, an MTD device (a partition) cannot be attached twice unless it has been detached first, so the function begins by checking whether the MTD device being attached is already attached.

 

       if (mtd->type == MTD_UBIVOLUME) {

              ubi_err("refuse attaching mtd%d - it is already emulated on "

                     "top of UBI", mtd->index);

              return -EINVAL;

       }

The code then checks whether the MTD device being attached is itself an MTD volume, i.e. already emulated on top of UBI; such a device cannot be attached again.

       if (ubi_num == UBI_DEV_NUM_AUTO) {

              /*Search for an empty slot in the @ubi_devices array */

              for (ubi_num = 0; ubi_num < UBI_MAX_DEVICES; ubi_num++)

                     if (!ubi_devices[ubi_num])

                            break;

If no ubi_num was given on the command line, the number is assigned automatically: the code searches the ubi_devices[] array for an unused slot.

              if (ubi_num == UBI_MAX_DEVICES) {

                     dbg_err("only %d UBI devices may be created",

                            UBI_MAX_DEVICES);

                     return -ENFILE;

              }

       } else {

              if (ubi_num >= UBI_MAX_DEVICES)

                     return -EINVAL;

If ubi_num >= UBI_MAX_DEVICES, the requested device number is out of range, so return an error.

              /* Make sure ubi_num is not busy */

              if (ubi_devices[ubi_num]) {

                     dbg_err("ubi%d already exists",ubi_num);

                     return -EEXIST;

              }

       }

 

       ubi = kzalloc(sizeof(struct ubi_device), GFP_KERNEL);

       if (!ubi)

              return -ENOMEM;

 

       ubi->mtd = mtd;

       ubi->ubi_num = ubi_num;

       ubi->vid_hdr_offset = vid_hdr_offset;

       ubi->autoresize_vol_id = -1;

 

       mutex_init(&ubi->buf_mutex);

       mutex_init(&ubi->ckvol_mutex);

       mutex_init(&ubi->mult_mutex);

       mutex_init(&ubi->volumes_mutex);

       spin_lock_init(&ubi->volumes_lock);

Initialize the mutexes and the spinlock.

       ubi_msg("attaching mtd%d to ubi%d", mtd->index, ubi_num);

 

       err = io_init(ubi);

       if (err)

              goto out_free;

Now let's follow io_init() and analyze it:

static int io_init(struct ubi_device *ubi)

{

       if (ubi->mtd->numeraseregions != 0) {

              ubi_err("multiple regions, not implemented");

              return -EINVAL;

       }

numeraseregions comes from the MTD layer's scan of the NAND flash. A non-zero value means the device has multiple erase regions with different erase block sizes, which UBI does not support, hence the error above.

       if (ubi->vid_hdr_offset < 0)

              return -EINVAL;

ubi->vid_hdr_offset obviously must not be negative; it is normally one NAND page. On our 4020 board the NAND page size is 512 bytes, so ubi->vid_hdr_offset is 512. A few more words about the EC header and the VID header: they record UBI's management information. The EC header normally sits in the first page of an erase block, so its offset is 0; the VID header sits in the second page, so its offset is 512. On the 4020's NAND flash an erase block is 16 KiB, i.e. 32 pages.
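To make the geometry concrete, here is a small standalone sketch (plain C, not kernel code) of the header-placement arithmetic io_init() performs, assuming the 4020 NAND geometry just described; on flash, UBI_EC_HDR_SIZE and UBI_VID_HDR_SIZE are both 64 bytes:

#include <stdio.h>

#define ALIGN(x, a)      (((x) + (a) - 1) & ~((a) - 1))
#define UBI_EC_HDR_SIZE  64
#define UBI_VID_HDR_SIZE 64

int main(void)
{
       int min_io_size = 512;            /* NAND page size on the 4020 */
       int peb_size = 16 * 1024;         /* one erase block = 32 pages */

       /* each header is padded out to a whole minimal I/O unit */
       int ec_hdr_alsize  = ALIGN(UBI_EC_HDR_SIZE, min_io_size);   /* 512 */
       int vid_hdr_alsize = ALIGN(UBI_VID_HDR_SIZE, min_io_size);  /* 512 */

       /* default layout: EC header in page 0, VID header in page 1 */
       int vid_hdr_offset = ec_hdr_alsize;                         /* 512 */

       printf("EC header at offset 0 (%d bytes reserved)\n", ec_hdr_alsize);
       printf("VID header at offset %d (%d bytes reserved)\n",
              vid_hdr_offset, vid_hdr_alsize);
       printf("LEB data: bytes %d..%d of each PEB\n",
              2 * min_io_size, peb_size - 1);
       return 0;
}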

Next the geometry obtained from the mtd structure is copied into the ubi structure:

       ubi->peb_size   = ubi->mtd->erasesize;

       ubi->peb_count  = ubi->mtd->size / ubi->mtd->erasesize;

peb_count is the number of physical eraseblocks: the total flash size divided by the erase block size.

       ubi->flash_size = ubi->mtd->size;

 

       if (ubi->mtd->block_isbad && ubi->mtd->block_markbad)

              ubi->bad_allowed = 1;

 

       ubi->min_io_size = ubi->mtd->writesize;

       ubi->hdrs_min_io_size = ubi->mtd->writesize >> ubi->mtd->subpage_sft;

       if (!is_power_of_2(ubi->min_io_size)) {

              ubi_err("min. I/O unit (%d) is not power of 2",

                     ubi->min_io_size);

              return -EINVAL;

       }

 

       ubi_assert(ubi->hdrs_min_io_size > 0);

       ubi_assert(ubi->hdrs_min_io_size <= ubi->min_io_size);

       ubi_assert(ubi->min_io_size % ubi->hdrs_min_io_size == 0);

 

       /* Calculate default aligned sizes of EC and VID headers */

       ubi->ec_hdr_alsize = ALIGN(UBI_EC_HDR_SIZE, ubi->hdrs_min_io_size);

       ubi->vid_hdr_alsize = ALIGN(UBI_VID_HDR_SIZE, ubi->hdrs_min_io_size);

 

       dbg_msg("min_io_size      %d", ubi->min_io_size);

       dbg_msg("hdrs_min_io_size %d", ubi->hdrs_min_io_size);

       dbg_msg("ec_hdr_alsize    %d", ubi->ec_hdr_alsize);

       dbg_msg("vid_hdr_alsize   %d", ubi->vid_hdr_alsize);

       if (ubi->vid_hdr_offset == 0)

              /* Default offset */

              ubi->vid_hdr_offset = ubi->vid_hdr_aloffset =

                                 ubi->ec_hdr_alsize;

       else {

              ubi->vid_hdr_aloffset = ubi->vid_hdr_offset &

                                          ~(ubi->hdrs_min_io_size - 1);

              ubi->vid_hdr_shift = ubi->vid_hdr_offset -

                                          ubi->vid_hdr_aloffset;

       }

The rest of io_init() is straightforward, so we will not analyze it.

Back in ubi_attach_mtd_dev(), we continue:

       ubi->peb_buf1 = vmalloc(ubi->peb_size);

       if (!ubi->peb_buf1)

              goto out_free;

 

       ubi->peb_buf2 = vmalloc(ubi->peb_size);

       if (!ubi->peb_buf2)

               goto out_free;

Allocate two buffers, each the size of one physical eraseblock; their exact uses will come up later.

       err = attach_by_scanning(ubi);

       if (err) {

              dbg_err("failed to attach by scanning, error %d", err);

              goto out_free;

       }

Let's follow attach_by_scanning(ubi) in detail.

static int attach_by_scanning(struct ubi_device *ubi)

{

       int err;

       struct ubi_scan_info *si;

 

       si = ubi_scan(ubi);

**********************************************************************************

Here ubi_scan() scans every block of the MTD partition. Concretely it calls static int process_eb(struct ubi_device *ubi, struct ubi_scan_info *si, int pnum) to read the EC and VID headers (i.e. the first two pages of each block). While reading each page it calls check_pattern() to decide whether the page is empty; if every page is empty, the MTD partition as a whole is found to be empty. A sketch of that helper follows.
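For reference, the emptiness test boils down to something like the following sketch of check_pattern() (modeled on the helper in drivers/mtd/ubi/io.c; the real function's exact signature may differ by kernel version):

#include <stdint.h>

/* return 1 if the whole buffer consists of byte 'patt' (0xFF for erased
   NAND), 0 as soon as any byte differs */
static int check_pattern(const void *buf, uint8_t patt, int size)
{
       int i;

       for (i = 0; i < size; i++)
              if (((const uint8_t *)buf)[i] != patt)
                     return 0;
       return 1;
}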

**********************************************************************************

       if (IS_ERR(si))

              return PTR_ERR(si);

 

       ubi->bad_peb_count = si->bad_peb_count;

       ubi->good_peb_count = ubi->peb_count - ubi->bad_peb_count;

       ubi->max_ec = si->max_ec;

       ubi->mean_ec = si->mean_ec;

 

       err = ubi_read_volume_table(ubi, si);

       if (err)

              goto out_si;

 

       err = ubi_wl_init_scan(ubi, si);

**********************************************************************************

Excerpt from ubi_wl_init_scan(ubi, si):

list_for_each_entry_safe(seb, tmp, &si->erase, u.list) {

              cond_resched();

 

              e = kmem_cache_alloc(ubi_wl_entry_slab, GFP_KERNEL);

              if (!e)

                     goto out_free;

 

              e->pnum = seb->pnum;

              e->ec = seb->ec;

              ubi->lookuptbl[e->pnum] = e;

              if (schedule_erase(ubi, e, 0)) {

                     kmem_cache_free(ubi_wl_entry_slab, e);

                     goto out_free;

              }

       }

When the wear-leveling subsystem is initialized, a struct ubi_work is created for every PEB on the erase list (its handler function is erase_worker, which erases the block and writes an EC header) and appended to the ubi->works queue via list_add_tail(&wrk->list, &ubi->works). Here we start to see the role of the ubi->works queue: the background thread ubi_thread loops over it and processes its work items.

At the first attach, the ubi_thread process has not been woken up yet at this point, so these work items can only be processed once the thread is woken. The standalone sketch below models the queue mechanics.
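The following plain C model is illustrative only, simplified from the real schedule_erase()/ubi_thread() pair: scanning queues one work item per to-be-erased PEB, and a background loop later pops each item and runs its handler:

#include <stdio.h>
#include <stdlib.h>

struct work {
       struct work *next;
       int pnum;                          /* PEB the work item refers to */
       void (*func)(struct work *w);      /* handler, cf. erase_worker */
};

static struct work *works;                /* cf. ubi->works */

static void erase_worker(struct work *w)
{
       printf("erase PEB %d and write a fresh EC header\n", w->pnum);
}

static void schedule_erase(int pnum)      /* cf. schedule_erase() */
{
       struct work *w = malloc(sizeof(*w));

       w->pnum = pnum;
       w->func = erase_worker;
       w->next = works;                   /* queue for the background thread */
       works = w;
}

int main(void)
{
       schedule_erase(7);
       schedule_erase(42);
       while (works) {                    /* cf. ubi_thread() draining works */
              struct work *w = works;

              works = w->next;
              w->func(w);
              free(w);
       }
       return 0;
}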

**********************************************************************************

       if (err)

              goto out_vtbl;

 

       err =ubi_eba_init_scan(ubi, si);

**********************************************************************************

We saw ubi_scan above; that pass is the foundation for building ubifs, because all the basic in-memory information about ubi and ubifs is established during the scan. Now ubi_eba_init_scan() uses that scan information to set up the EBA (eraseblock association) subsystem, building each volume's LEB-to-PEB mapping table.

**********************************************************************************

       if (err)

              goto out_wl;

 

       ubi_scan_destroy_si(si);

       return 0;

 

out_wl:

       ubi_wl_close(ubi);

out_vtbl:

       free_internal_volumes(ubi);

       vfree(ubi->vtbl);

out_si:

       ubi_scan_destroy_si(si);

       return err;

}

 

1.1. ubi_scan

struct ubi_scan_info *ubi_scan(struct ubi_device *ubi)

{

       int err, pnum;

       struct rb_node *rb1, *rb2;

       struct ubi_scan_volume *sv;

       struct ubi_scan_leb *seb;

       struct ubi_scan_info *si;

 

       si = kzalloc(sizeof(struct ubi_scan_info), GFP_KERNEL);

       if (!si)

              return ERR_PTR(-ENOMEM);

       INIT_LIST_HEAD(&si->corr);  // initialize si's corrupted-PEB list

       INIT_LIST_HEAD(&si->free);  // initialize si's free-PEB list

       INIT_LIST_HEAD(&si->erase); // initialize si's to-be-erased list

       INIT_LIST_HEAD(&si->alien); // initialize si's alien-PEB list

       si->volumes = RB_ROOT;

#define RB_ROOT (struct rb_root) { NULL, }, i.e. just an empty root, nothing more.

       si->is_empty = 1;

       err = -ENOMEM;

       ech = kzalloc(ubi->ec_hdr_alsize, GFP_KERNEL); // allocate space for the EC header, used to hold each PEB's EC header as it is read later

       if (!ech)

              goto out_si;

 

       vidh = ubi_zalloc_vid_hdr(ubi, GFP_KERNEL); // allocate space for the VID header, used to hold each PEB's VID header as it is read; note that the whole point of scanning is to collect the information in the EC and VID headers and build the corresponding structures in memory

       if (!vidh)

              goto out_ech;

 

       for (pnum = 0; pnum < ubi->peb_count; pnum++) {

              cond_resched();

 

              dbg_gen("process PEB %d", pnum);

              err = process_eb(ubi, si, pnum); // scan each physical eraseblock

              if (err < 0)

                     goto out_vidh;

       }

 

       dbg_msg("scanning is finished");

 

       /* Calculate mean erase counter */

       if (si->ec_count) // compute the mean erase counter

              si->mean_ec = div_u64(si->ec_sum, si->ec_count);

 

       if (si->is_empty) // check whether this is an empty MTD device; if so, the later mount will call create_default_filesystem to lay down the initial ubifs data

              ubi_msg("empty MTD device detected");

 

       /*

        * Few corrupted PEBs are not a problem and may be just a result of

        * unclean reboots. However, many of them may indicate some problems

        * with the flash HW or driver. Print a warning in this case.

        */

       if (si->corr_count >= 8 || si->corr_count >= ubi->peb_count / 4) {

              ubi_warn("%d PEBs are corrupted", si->corr_count);

              printk(KERN_WARNING "corrupted PEBs are:");

              list_for_each_entry(seb, &si->corr, u.list)

                     printk(KERN_CONT " %d", seb->pnum);

              printk(KERN_CONT "\n");

       }

 

       /*

        * In case of unknown erase counter we use the mean erase counter

        * value.

        */

       ubi_rb_for_each_entry(rb1, sv, &si->volumes, rb) {

              ubi_rb_for_each_entry(rb2, seb, &sv->root, u.rb)

                     if (seb->ec == UBI_SCAN_UNKNOWN_EC)

                            seb->ec = si->mean_ec;

       }

 

       list_for_each_entry(seb, &si->free, u.list) {

              if (seb->ec == UBI_SCAN_UNKNOWN_EC)

                     seb->ec = si->mean_ec;

       }

 

       list_for_each_entry(seb, &si->corr, u.list)

              if (seb->ec == UBI_SCAN_UNKNOWN_EC)

                     seb->ec = si->mean_ec;

 

       list_for_each_entry(seb, &si->erase, u.list)

              if (seb->ec == UBI_SCAN_UNKNOWN_EC)

                     seb->ec = si->mean_ec;

 

       err = paranoid_check_si(ubi, si);

       if (err) {

              if (err > 0)

                     err = -EINVAL;

              goto out_vidh;

       }

 

       ubi_free_vid_hdr(ubi, vidh);

       kfree(ech);

 

       return si;

 

out_vidh:

       ubi_free_vid_hdr(ubi, vidh);

out_ech:

       kfree(ech);

out_si:

       ubi_scan_destroy_si(si);

       return ERR_PTR(err);

}

1.2. process_eb

static int process_eb(struct ubi_device *ubi, struct ubi_scan_info *si, int pnum)

{

       long long uninitialized_var(ec);

       int err, bitflips = 0, vol_id, ec_corr = 0;

 

       dbg_bld("scan PEB %d", pnum);

 

       /* Skip bad physical eraseblocks */

       err = ubi_io_is_bad(ubi, pnum);

Check whether the block is a bad block; this directly calls the MTD layer's mtd->block_isbad.

       if (err < 0)

              return err;

       else if (err) {

              /*

               * FIXME: this is actually duty of the I/O sub-system to

               * initialize this, but MTD does not provide enough

               * information.

               */

              si->bad_peb_count += 1;

              return 0;

       }

       err = ubi_io_read_ec_hdr(ubi, pnum, ech, 0); // read the EC header, normally the first page of a block

       if (err < 0)

              return err;

       else if (err == UBI_IO_BITFLIPS)

              bitflips = 1;

       else if (err == UBI_IO_PEB_EMPTY)

              return add_to_list(si, pnum, UBI_SCAN_UNKNOWN_EC, &si->erase);

// Note: why is a block that is empty (all 0xff) still put on the si->erase list? Because what MTD calls empty and what UBI calls empty are not the same thing. In UBI an empty block is one that contains only an EC header, so these all-0xff blocks must be erased and then have an EC header written into them.

       else if (err == UBI_IO_BAD_EC_HDR) {

              /*

               * We have to also look at the VID header, possibly it is not

               * corrupted. Set %bitflips flag in order to make this PEB be

               * moved and EC be re-created.

               */

              ec_corr = 1;

              ec = UBI_SCAN_UNKNOWN_EC;

              bitflips = 1;

       }

 

       si->is_empty = 0;

 

       if (!ec_corr) {

              int image_seq;

 

              /* Make sure UBI version is OK */

              if (ech->version != UBI_VERSION) {

                     ubi_err("this UBI version is %d, image version is %d",

                            UBI_VERSION, (int)ech->version);

                     return -EINVAL;

              }

 

              ec = be64_to_cpu(ech->ec);

              if (ec > UBI_MAX_ERASECOUNTER) {

                     /*

                      * Erase counter overflow. The EC headers have 64 bits

                      * reserved, but we anyway make use of only 31 bit

                      * values, as this seems to be enough for any existing

                      * flash. Upgrade UBI and use 64-bit erase counters

                      * internally.

                      */

                     ubi_err("erase counter overflow, max is %d",

                            UBI_MAX_ERASECOUNTER);

                     ubi_dbg_dump_ec_hdr(ech);

                     return -EINVAL;

              }

 

              /*

               * Make sure that all PEBs have the same image sequence number.

               * This allows us to detect situations when users flash UBI

               * images incorrectly, so that the flash has the new UBI image

               * and leftovers from the old one. This feature was added

               * relatively recently, and the sequence number was always

               * zero, because old UBI implementations always set it to zero.

               * For this reason, we do not panic if some PEBs have zero

               * sequence number, while other PEBs have non-zero sequence

               * number.

               */

              image_seq = be32_to_cpu(ech->image_seq);

              if (!ubi->image_seq && image_seq)

                     ubi->image_seq = image_seq;

              if (ubi->image_seq && image_seq &&

                  ubi->image_seq != image_seq) {

                     ubi_err("bad image sequence number %d in PEB %d, "

                            "expected %d", image_seq, pnum, ubi->image_seq);

                     ubi_dbg_dump_ec_hdr(ech);

                     return -EINVAL;

              }

       }

 

       /* OK, we've done with the EC header, let's look at the VID header */

 

       err = ubi_io_read_vid_hdr(ubi, pnum, vidh, 0);

       if (err < 0)

              return err;

       else if (err == UBI_IO_BITFLIPS)

              bitflips = 1;

       else if (err == UBI_IO_BAD_VID_HDR ||

               (err == UBI_IO_PEB_FREE && ec_corr)) {

// if a PEB's VID header is corrupted, add it to the corr list

              /* VID header is corrupted */

              err = add_to_list(si, pnum, ec, &si->corr);

              if (err)

                     return err;

              goto adjust_mean_ec;

       } else if (err == UBI_IO_PEB_FREE) {

// if the VID header is empty, i.e. the PEB contains only an EC header, add it to the free list so it can be used for later allocations

              /* No VID header - the physical eraseblock is free */

              err = add_to_list(si, pnum, ec, &si->free);

              if (err)

                     return err;

              goto adjust_mean_ec;

       }

 

       vol_id = be32_to_cpu(vidh->vol_id);

       if (vol_id > UBI_MAX_VOLUMES && vol_id != UBI_LAYOUT_VOLUME_ID) {

// check whether vol_id is valid; UBI keeps an internal layout volume dedicated to storing information about the user volumes:

// "UBI maintains internal volumes to store UBI related information, e.g. volume information, flash based erase block assignment tables"

              int lnum = be32_to_cpu(vidh->lnum);

 

              /* Unsupported internal volume */

              switch (vidh->compat) {

              case UBI_COMPAT_DELETE:

                     ubi_msg("\"delete\" compatible internal volume %d:%d"

                            " found, remove it", vol_id, lnum);

                     err = add_to_list(si, pnum, ec, &si->corr);

                     if (err)

                            return err;

                     break;

 

              case UBI_COMPAT_RO:

                     ubi_msg("read-only compatible internal volume %d:%d"

                            " found, switch to read-only mode",

                            vol_id, lnum);

                     ubi->ro_mode = 1;

                     break;

 

              case UBI_COMPAT_PRESERVE:

                     ubi_msg("\"preserve\" compatible internal volume %d:%d"

                            " found", vol_id, lnum);

                     err = add_to_list(si, pnum, ec, &si->alien);

                     if (err)

                            return err;

                     si->alien_peb_count += 1;

                     return 0;

 

              case UBI_COMPAT_REJECT:

                     ubi_err("incompatible internal volume %d:%d found",

                            vol_id, lnum);

                     return -EINVAL;

              }

       }

 

       if (ec_corr)

              ubi_warn("valid VID header but corrupted EC header at PEB %d",

                      pnum);

// At this point we can conclude that this PEB is a valid UBI block, containing a valid EC header and a valid VID header.

       err = ubi_scan_add_used(ubi, si, pnum, ec, vidh, bitflips);

       if (err)

              return err;

 

adjust_mean_ec:

       if (!ec_corr) {

              si->ec_sum += ec;

              si->ec_count += 1;

              if (ec > si->max_ec)

                     si->max_ec = ec;

              if (ec < si->min_ec)

                     si->min_ec = ec;

       }

 

       return 0;

}

1.3. ubi_scan_add_used

int ubi_scan_add_used(struct ubi_device *ubi, struct ubi_scan_info *si, int pnum, int ec, const struct ubi_vid_hdr *vid_hdr, int bitflips)

{

       int err, vol_id, lnum;

       unsigned long long sqnum;

       struct ubi_scan_volume *sv;

       struct ubi_scan_leb *seb;

       struct rb_node **p, *parent = NULL;

 

       vol_id = be32_to_cpu(vid_hdr->vol_id);

       lnum = be32_to_cpu(vid_hdr->lnum);

       sqnum = be64_to_cpu(vid_hdr->sqnum);

 

       dbg_bld("PEB %d, LEB %d:%d, EC %d, sqnum %llu, bitflips %d",

              pnum, vol_id, lnum, ec, sqnum, bitflips);

       sv = add_volume(si, vol_id, pnum, vid_hdr);

add_volume() checks the volume ID read from this PEB and builds the in-memory red-black tree of volumes.

 

       if (IS_ERR(sv))

              return PTR_ERR(sv);

 

       if (si->max_sqnum < sqnum)

              si->max_sqnum = sqnum;

 

       /*

        * Walk the RB-tree of logical eraseblocks of volume @vol_id to look

        * if this is the first instance of this logical eraseblock or not.

        */

       p = &sv->root.rb_node;

       while (*p) {

              int cmp_res;

 

              parent = *p;

              seb = rb_entry(parent, struct ubi_scan_leb, u.rb);

              if (lnum != seb->lnum) {

                     if (lnum < seb->lnum)

                            p = &(*p)->rb_left;

                     else

                            p = &(*p)->rb_right;

                     continue;

              }

Build the in-memory red-black tree of ubi_scan_leb structures.

              /*

               * There is already a physical eraseblock describing the same

               * logical eraseblock present.

               */

 

              dbg_bld("this LEB already exists: PEB %d, sqnum %llu, "

                     "EC %d", seb->pnum, seb->sqnum, seb->ec);

 

              /*

               * Make sure that the logical eraseblocks have different

               * sequence numbers. Otherwise the image is bad.

               *

               * However, if the sequence number is zero, we assume it must

               * be an ancient UBI image from the era when UBI did not have

               * sequence numbers. We still can attach these images, unless

               * there is a need to distinguish between old and new

               * eraseblocks, in which case we'll refuse the image in

               * 'compare_lebs()'. In other words, we attach old clean

               * images, but refuse attaching old images with duplicated

               * logical eraseblocks because there was an unclean reboot.

               */

              // Note the scope of the while (*p) loop above: reaching this point means we found a ubi_scan_leb in the red-black tree that already describes the same LEB. What does that tell us? That multiple PEBs in UBI point to the same LEB.

              // sqnum is a monotonically increasing 64-bit global counter that we assume never overflows, so seb->sqnum == sqnum would clearly be invalid.

              if (seb->sqnum == sqnum && sqnum != 0) {

                     ubi_err("two LEBs with same sequence number %llu",

                            sqnum);

                     ubi_dbg_dump_seb(seb, 0);

                     ubi_dbg_dump_vid_hdr(vid_hdr);

                     return -EINVAL;

              }

 

              /*

               * Now we have to drop the older one and preserve the newer

               * one.

               */

// @copy_flag: set if this logical eraseblock was copied from another physical eraseblock (for wear-leveling reasons)

// If multiple PEBs point to the same LEB, it usually means an unclean reboot happened during wear-leveling or while a file was being modified, so we must work out which of these PEBs is the newest. compare_lebs() does exactly that.

              cmp_res = compare_lebs(ubi, seb, pnum, vid_hdr);

              if (cmp_res < 0)

                     return cmp_res;

 

              if (cmp_res & 1) {

                     /*

                      * This logical eraseblock is newer than the one

                      * found earlier.

                      */

                     err = validate_vid_hdr(vid_hdr, sv, pnum);

                     if (err)

                            return err;

 

                     if (cmp_res & 4)

                            err = add_to_list(si, seb->pnum, seb->ec,

                                            &si->corr);

                     else

                            err = add_to_list(si, seb->pnum, seb->ec,

                                            &si->erase);

                     if (err)

                            return err;

 

                     seb->ec = ec;

                     seb->pnum = pnum;

                     seb->scrub = ((cmp_res & 2) || bitflips);

                     seb->sqnum = sqnum;

 

                     if (sv->highest_lnum == lnum)

                            sv->last_data_size =

                                   be32_to_cpu(vid_hdr->data_size);

 

                     return 0;

              } else {

                     /*

                      * This logical eraseblock is older than the one found

                      * previously.

                      */

                     if (cmp_res & 4)

                            return add_to_list(si, pnum, ec, &si->corr);

                     else

                            return add_to_list(si, pnum, ec, &si->erase);

              }

       }

       /*

        * We've met this logical eraseblock for the first time, add it to the

        * scanning information.

        */

       // If we get here, this is the first time we have encountered this LEB, so it is simple: validate it and insert it into the tree.

       err = validate_vid_hdr(vid_hdr, sv, pnum);

       if (err)

              return err;

       seb = kmalloc(sizeof(struct ubi_scan_leb), GFP_KERNEL);

       if (!seb)

              return -ENOMEM;

       seb->ec = ec;

       seb->pnum = pnum;

       seb->lnum = lnum;

       seb->sqnum = sqnum;

       seb->scrub = bitflips;

       if (sv->highest_lnum <= lnum) {

              sv->highest_lnum = lnum;

              sv->last_data_size = be32_to_cpu(vid_hdr->data_size);

       }

       sv->leb_count += 1;

       rb_link_node(&seb->u.rb, parent, p);

       rb_insert_color(&seb->u.rb, &sv->root);

       return 0;

}

 

1.4. compare_lebs

static int compare_lebs(struct ubi_device *ubi, const struct ubi_scan_leb *seb, int pnum, const struct ubi_vid_hdr *vid_hdr)

{

       void *buf;

       int len, err, second_is_newer, bitflips = 0, corrupted = 0;

       uint32_t data_crc, crc;

       struct ubi_vid_hdr *vh = NULL;

       unsigned long long sqnum2 = be64_to_cpu(vid_hdr->sqnum);

       // check once more whether the two sqnums happen to be equal

       if (sqnum2 == seb->sqnum) {

              /*

               * This must be a really ancient UBI image which has been

               * created before sequence numbers support has been added. At

               * that time we used 32-bit LEB versions stored in logical

               * eraseblocks. That was before UBI got into mainline. We do not

               * support these images anymore. Well, those images will still

               * work, but only if no unclean reboots happened.

               */

              ubi_err("unsupported on-flash UBI format\n");

              return -EINVAL;

       }

 

       /* Obviously the LEB with lower sequence counter is older */

       // Because sqnum increases monotonically and never overflows, the PEB with the larger sqnum is taken to be the newer one.

       second_is_newer = !!(sqnum2 > seb->sqnum);

       /*

        * Now we know which copy is newer. If the copy flag of the PEB with

        * newer version is not set, then we just return, otherwise we have to

        * check data CRC. For the second PEB we already have the VID header,

        * for the first one - we'll need to re-read it from flash.

        *

        * Note: this may be optimized so that we wouldn't read twice.

        */

 

       if (second_is_newer) {

              if (!vid_hdr->copy_flag) {

                     /* It is not a copy, so it is newer */

                     dbg_bld("second PEB %d is newer, copy_flag is unset",

                            pnum);

                     return 1;

              }

       } else {

       // If the copy_flag bit is set, we can assume something went wrong during wear-leveling: an unclean reboot occurred, so we must check whether the data in this newest PEB is complete (the copy may have been interrupted by the unclean reboot).

              pnum = seb->pnum;

 

              vh = ubi_zalloc_vid_hdr(ubi, GFP_KERNEL);

              if (!vh)

                     return -ENOMEM;

 

              err = ubi_io_read_vid_hdr(ubi, pnum, vh, 0);

              if (err) {

                     if (err == UBI_IO_BITFLIPS)

                            bitflips = 1;

                     else {

                            dbg_err("VID of PEB %d header is bad, but it "

                                   "was OK earlier", pnum);

                            if (err > 0)

                                   err = -EIO;

 

                            goto out_free_vidh;

                     }

              }

 

              if (!vh->copy_flag) {

                     /* It is not a copy, so it is newer */

                     dbg_bld("first PEB %d is newer, copy_flag is unset",

                            pnum);

                     err = bitflips << 1;

                     goto out_free_vidh;

              }

 

              vid_hdr = vh;

       }

 

       /* Read the data of the copy and check the CRC */

 

       len = be32_to_cpu(vid_hdr->data_size);

       buf = vmalloc(len);

       if (!buf) {

              err = -ENOMEM;

              goto out_free_vidh;

       }

// OK, read out the data and verify the CRC

       err = ubi_io_read_data(ubi, buf, pnum, 0, len);

       if (err && err != UBI_IO_BITFLIPS && err != -EBADMSG)

              goto out_free_buf;

 

       data_crc = be32_to_cpu(vid_hdr->data_crc);

       crc = crc32(UBI_CRC32_INIT, buf, len);

       if (crc != data_crc) {

              dbg_bld("PEB %d CRC error: calculated %#08x, must be %#08x",

                     pnum, crc, data_crc);

              corrupted = 1;

              bitflips = 0;

              // if the CRC check fails, we fall back to the old PEB

              second_is_newer = !second_is_newer;

       } else {

              dbg_bld("PEB %d CRC is OK", pnum);

              bitflips = !!err;

       }

 

       vfree(buf);

       ubi_free_vid_hdr(ubi, vh);

 

       if (second_is_newer)

              dbg_bld("second PEB %d is newer, copy_flag is set", pnum);

       else

              dbg_bld("first PEB %d is newer, copy_flag is set", pnum);

 

       return second_is_newer | (bitflips << 1) | (corrupted<< 2);

 

out_free_buf:

       vfree(buf);

out_free_vidh:

       ubi_free_vid_hdr(ubi, vh);

       return err;

}
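As a reading aid, the value packed by the final return statement, second_is_newer | (bitflips << 1) | (corrupted << 2), is exactly what ubi_scan_add_used() picks apart with cmp_res & 1 and cmp_res & 4 in section 1.3:

       cmp_res & 1 - the second (newly scanned) PEB holds the newer copy

       cmp_res & 2 - the data was read with bit-flips, so the surviving copy should be scrubbed

       cmp_res & 4 - the losing copy's data failed its CRC, so it goes to si->corr rather than si->erase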

 

Part 2: Creating a volume

ubimkvol /dev/ubi0 -N ubifs -s 15MiB

The command above creates a 15 MiB volume named ubifs on UBI device 0.

The command is implemented via ioctl; let's look at the relevant code:

       /* Create volume command */

       case UBI_IOCMKVOL:

       {

              struct ubi_mkvol_req req;

              dbg_gen("create volume");

              err = copy_from_user(&req, argp, sizeof(struct ubi_mkvol_req));

              if (err) {

                     err = -EFAULT;

                     break;

              }

              req.name[req.name_len] = '\0';

              err = verify_mkvol_req(ubi, &req);

              if (err)

                     break;

              mutex_lock(&ubi->device_mutex);

              err = ubi_create_volume(ubi, &req);

              mutex_unlock(&ubi->device_mutex);

              if (err)

                     break;

              err = put_user(req.vol_id, (__user int32_t *)argp);

              if (err)

                     err = -EFAULT;

              break;

       }

The core of this is ubi_create_volume(), which is passed a structure of type ubi_mkvol_req.

struct ubi_mkvol_req {

       __s32 vol_id; // ID of the volume to create; may be left unspecified

       __s32 alignment; // The @alignment field specifies the required alignment of the volume logical eraseblock. This means that the size of logical eraseblocks will be aligned to this number, i.e.,

(UBI device logical eraseblock size) mod (@alignment) = 0.

       __s64 bytes; // size of the volume in bytes

       __s8 vol_type; // volume type, static or dynamic

       __s8 padding1;

       __s16 name_len; // length of the volume name

       __s8 padding2[4];

       char name[UBI_MAX_VOLUME_NAME + 1];

} __attribute__((packed));
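As an illustration, here is roughly what the ubimkvol command above reduces to in userspace. This is a hedged sketch; the real mtd-utils tool adds option parsing and error handling on top. UBI_IOCMKVOL, UBI_VOL_NUM_AUTO and UBI_DYNAMIC_VOLUME come from <mtd/ubi-user.h>:

#include <fcntl.h>
#include <stdio.h>
#include <string.h>
#include <sys/ioctl.h>
#include <mtd/ubi-user.h>    /* struct ubi_mkvol_req, UBI_IOCMKVOL */

int main(void)
{
       struct ubi_mkvol_req req;
       int fd = open("/dev/ubi0", O_RDWR);

       if (fd < 0) {
              perror("open /dev/ubi0");
              return 1;
       }

       memset(&req, 0, sizeof(req));
       req.vol_id = UBI_VOL_NUM_AUTO;       /* let UBI pick a free ID */
       req.alignment = 1;
       req.bytes = 15 * 1024 * 1024;        /* -s 15MiB */
       req.vol_type = UBI_DYNAMIC_VOLUME;
       strcpy(req.name, "ubifs");           /* -N ubifs */
       req.name_len = strlen(req.name);

       if (ioctl(fd, UBI_IOCMKVOL, &req) < 0) {
              perror("UBI_IOCMKVOL");
              return 1;
       }
       printf("created volume %d\n", req.vol_id);
       return 0;
}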

int ubi_create_volume(struct ubi_device *ubi, struct ubi_mkvol_req *req)

{

       int i, err, vol_id = req->vol_id, do_free = 1;

       struct ubi_volume *vol;

       struct ubi_vtbl_record vtbl_rec;

       dev_t dev;

       if (ubi->ro_mode)

              return -EROFS;

       vol = kzalloc(sizeof(struct ubi_volume), GFP_KERNEL);

       if (!vol)

              return -ENOMEM;

       spin_lock(&ubi->volumes_lock);

       // if no vol_id was specified, obtain an ID automatically

       if (vol_id == UBI_VOL_NUM_AUTO) {

              /* Find unused volume ID */

              dbg_gen("search for vacant volume ID");

              for (i = 0; i < ubi->vtbl_slots; i++)

                     if (!ubi->volumes[i]) {

                            vol_id = i;

                            break;

                     }

              if (vol_id == UBI_VOL_NUM_AUTO) {

                     dbg_err("out of volume IDs");

                     err = -ENFILE;

                     goto out_unlock;

              }

              req->vol_id = vol_id;

       }

 

       dbg_gen("create device %d, volume %d, %llu bytes, type %d, name %s",

              ubi->ubi_num, vol_id, (unsigned long long)req->bytes,

              (int)req->vol_type, req->name);

 

       /* Ensure that this volume does not exist */

       err = -EEXIST;

       if (ubi->volumes[vol_id]) {

              dbg_err("volume %d already exists", vol_id);

              goto out_unlock;

       }

 

       /* Ensure that the name is unique */

       // make sure the new volume's name is unique, comparing it against the volumes that already exist

       for (i = 0; i < ubi->vtbl_slots; i++)

              if (ubi->volumes[i] &&

                 ubi->volumes[i]->name_len == req->name_len &&

                  !strcmp(ubi->volumes[i]->name, req->name)) {

                     dbg_err("volume \"%s\" exists (ID %d)", req->name, i);

                     goto out_unlock;

              }

       // compute from req->bytes how many physical eraseblocks are needed; the physical eraseblock is UBI's basic unit of operation

       /* Calculate how many eraseblocks are requested */

       vol->usable_leb_size = ubi->leb_size - ubi->leb_size % req->alignment;

       vol->reserved_pebs += div_u64(req->bytes + vol->usable_leb_size - 1,

                                 vol->usable_leb_size);
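Plugging in the 4020 geometry from Part 1 as a worked example (16 KiB PEBs whose first two 512-byte pages hold the EC and VID headers give leb_size = 16384 - 2*512 = 15360 bytes) and ubimkvol's default alignment of 1:

       usable_leb_size = 15360 - (15360 mod 1)  = 15360 bytes

       reserved_pebs   = ceil(15 MiB / 15360)

                       = ceil(15728640 / 15360) = 1024 PEBs

so the 15 MiB request happens to consume exactly 1024 physical eraseblocks.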

 

       /* Reserve physical eraseblocks */

       if (vol->reserved_pebs > ubi->avail_pebs) {

              dbg_err("not enough PEBs, only %d available", ubi->avail_pebs);

              err = -ENOSPC;

              goto out_unlock;

       }

       // reduce the UBI device's count of available PEBs, since they are now reserved for the newly created volume

       ubi->avail_pebs -= vol->reserved_pebs;

       ubi->rsvd_pebs += vol->reserved_pebs;

       spin_unlock(&ubi->volumes_lock);

       // initialize the newly created volume's fields

       vol->vol_id     = vol_id;

       vol->alignment  = req->alignment;

       vol->data_pad   = ubi->leb_size % vol->alignment;

       vol->vol_type   = req->vol_type;

       vol->name_len   = req->name_len;

       memcpy(vol->name, req->name, vol->name_len);

       vol->ubi = ubi;

 

       /*

        * Finish all pending erases because there may be some LEBs belonging

        * to the same volume ID.

        */

       // flush the pending work items in the UBI background queue

       err = ubi_wl_flush(ubi);

       if (err)

              goto out_acc;

       // create the eba_tbl and initialize every entry to UBI_LEB_UNMAPPED; an entry is only really filled in when its LEB is actually written (a small standalone model follows the loop below)

       vol->eba_tbl = kmalloc(vol->reserved_pebs * sizeof(int), GFP_KERNEL);

       if (!vol->eba_tbl) {

              err = -ENOMEM;

              goto out_acc;

       }

 

       for (i = 0; i < vol->reserved_pebs; i++)

              vol->eba_tbl[i] = UBI_LEB_UNMAPPED;
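A tiny standalone model (plain C, not kernel code; -1 is a stand-in for the kernel's UBI_LEB_UNMAPPED value) of the table just built, showing the lazy mapping:

#include <stdio.h>

#define UBI_LEB_UNMAPPED -1
#define RESERVED_PEBS     4

int main(void)
{
       int eba_tbl[RESERVED_PEBS];
       int i;

       /* a freshly created volume: no LEB is mapped to any PEB yet */
       for (i = 0; i < RESERVED_PEBS; i++)
              eba_tbl[i] = UBI_LEB_UNMAPPED;

       /* first write to LEB 2: the EBA subsystem picks a free PEB
          (say PEB 17) and only then records the association */
       eba_tbl[2] = 17;

       for (i = 0; i < RESERVED_PEBS; i++)
              if (eba_tbl[i] == UBI_LEB_UNMAPPED)
                     printf("LEB %d -> unmapped\n", i);
              else
                     printf("LEB %d -> PEB %d\n", i, eba_tbl[i]);
       return 0;
}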

 

       if (vol->vol_type == UBI_DYNAMIC_VOLUME) {

              vol->used_ebs = vol->reserved_pebs;

              vol->last_eb_bytes = vol->usable_leb_size;

              vol->used_bytes =

                     (long long)vol->used_ebs * vol->usable_leb_size;

       } else {

              vol->used_ebs = div_u64_rem(vol->used_bytes,

                                      vol->usable_leb_size,

                                      &vol->last_eb_bytes);

              if (vol->last_eb_bytes != 0)

                     vol->used_ebs += 1;

              else

                     vol->last_eb_bytes = vol->usable_leb_size;

       }

 

       /* Register character device for the volume */

       // register a character-device interface for the UBI volume

       cdev_init(&vol->cdev, &ubi_vol_cdev_operations);

       vol->cdev.owner = THIS_MODULE;

       dev = MKDEV(MAJOR(ubi->cdev.dev), vol_id + 1);

       err = cdev_add(&vol->cdev, dev, 1);

       if (err) {

              ubi_err("cannot add character device");

              goto out_mapping;

       }

 

       vol->dev.release = vol_release;

       vol->dev.parent = &ubi->dev;

       vol->dev.devt = dev;

       vol->dev.class = ubi_class;

       dev_set_name(&vol->dev, "%s_%d", ubi->ubi_name, vol->vol_id);

       err = device_register(&vol->dev);

       if (err) {

              ubi_err("cannot register device");

              goto out_cdev;

       }

       err = volume_sysfs_init(ubi, vol);

       if (err)

              goto out_sysfs;

 

       /* Fill volume table record */

       // UBI has an internal volume holding a record for every volume; now that a new volume has been created, this internal (layout) volume must be updated accordingly

       memset(&vtbl_rec, 0, sizeof(struct ubi_vtbl_record));

       vtbl_rec.reserved_pebs = cpu_to_be32(vol->reserved_pebs);

       vtbl_rec.alignment     = cpu_to_be32(vol->alignment);

       vtbl_rec.data_pad      = cpu_to_be32(vol->data_pad);

       vtbl_rec.name_len      = cpu_to_be16(vol->name_len);

       if (vol->vol_type == UBI_DYNAMIC_VOLUME)

              vtbl_rec.vol_type = UBI_VID_DYNAMIC;

       else

              vtbl_rec.vol_type = UBI_VID_STATIC;

       memcpy(vtbl_rec.name, vol->name, vol->name_len);

       err = ubi_change_vtbl_record(ubi, vol_id, &vtbl_rec);

ubi_change_vtbl_record() performs the update as one ubi_eba_unmap_leb() operation plus one ubi_eba_write_leb() operation, which keeps the volume-table data safe across interruptions.

 

       if (err)

              goto out_sysfs;

 

       spin_lock(&ubi->volumes_lock);

       ubi->volumes[vol_id] = vol;

       ubi->vol_count += 1;

       spin_unlock(&ubi->volumes_lock);

       // notify interested modules that UBI has created a new volume so they can react accordingly; apparently the only notifier on this chain is gluebi_notifier

       ubi_volume_notify(ubi, vol, UBI_VOLUME_ADDED);

       if (paranoid_check_volumes(ubi))

              dbg_err("check failed while creating volume %d", vol_id);

       return err;

 

out_sysfs:

       /*

        * We have registered our device, we should not free the volume

        * description object in this function in case of an error - it is

        * freed by the release function.

        *

        * Get device reference to prevent the release function from being

        * called just after sysfs has been closed.

        */

       do_free = 0;

       get_device(&vol->dev);

       volume_sysfs_close(vol);

out_cdev:

       cdev_del(&vol->cdev);

out_mapping:

       if (do_free)

              kfree(vol->eba_tbl);

out_acc:

       spin_lock(&ubi->volumes_lock);

       ubi->rsvd_pebs -= vol->reserved_pebs;

       ubi->avail_pebs += vol->reserved_pebs;

out_unlock:

       spin_unlock(&ubi->volumes_lock);

       if (do_free)

              kfree(vol);

       else

              put_device(&vol->dev);

       ubi_err("cannot create volume %d, error %d", vol_id, err);

       return err;

}

 

Part 3: The mount process

static int mount_ubifs(struct ubifs_info *c)

{

       struct super_block *sb = c->vfs_sb;

       int err, mounted_read_only = (sb->s_flags & MS_RDONLY);

       long long x;

       size_t sz;

       err = init_constants_early(c);

       if (err)

              return err;

       err = ubifs_debugging_init(c);

       if (err)

              return err;

       // determine whether the volume is empty by checking whether its LEBs are mapped

       err = check_volume_empty(c);

       if (err)

              goto out_free;

       // if the volume is empty but read-only, we obviously cannot write

       // anything into it, so it cannot be mounted either

       if (c->empty && (mounted_read_only || c->ro_media)) {

              /*

               * This UBI volume is empty, and read-only, or the file system

               * is mounted read-only - we cannot format it.

               */

              ubifs_err("can't format empty UBI volume: read-only %s",

                       c->ro_media ? "UBI volume" : "mount");

              err = -EROFS;

              goto out_free;

       }

 

       if (c->ro_media && !mounted_read_only) {

              ubifs_err("cannot mount read-write - read-only media");

              err = -EROFS;

              goto out_free;

       }

 

       /*

        * The requirement for the buffer is that it should fit indexing B-tree

        * height amount of integers. We assume the height of the TNC tree will

        * never exceed 64.

        */

       err = -ENOMEM;

// bottom_up_buf: a buffer which is used by 'dirty_cow_bottom_up()' in tnc.c. Later we will see that dirty_cow_bottom_up() marks all of a znode's ancestors (parent, grandparent, and so on up to the root) dirty, so before marking them it must record all of those ancestor znodes; bottom_up_buf exists for exactly that purpose.

       c->bottom_up_buf = kmalloc(BOTTOM_UP_HEIGHT * sizeof(int), GFP_KERNEL);

       if (!c->bottom_up_buf)

              goto out_free;

       // sbuf: LEB-sized buffer to use

       c->sbuf = vmalloc(c->leb_size);

       if (!c->sbuf)

              goto out_free;

 

       if (!mounted_read_only) {

              // ileb_buf: buffer for the commit in-the-gaps method

              c->ileb_buf = vmalloc(c->leb_size);

              if (!c->ileb_buf)

                     goto out_free;

       }

 

       if (c->bulk_read == 1)

              // initialize the bulk-read state; bulk-read is explained in detail in the VFS read path

              bu_init(c);

 

       /*

        * We have to check all CRCs, even for data nodes, when we mount the FS

        * (specifically, when we are replaying).

        */

       c->always_chk_crc = 1;

       // read the superblock; if the volume is empty there is obviously no superblock yet, and an initial file system must be created first

       err = ubifs_read_superblock(c);

       if (err)

              goto out_free;

 

       /*

        * Make sure the compressor which is set as default in the superblock

        * or overridden by mount options is actually compiled in.

        */

       if (!ubifs_compr_present(c->default_compr)) {

              ubifs_err("'%s' compressor is not compiled in",

                       ubifs_compr_name(c->default_compr));

              err = -ENOTSUPP;

              goto out_free;

       }

// initialize some ubifs constants from the superblock

       err = init_constants_sb(c);

       if (err)

              goto out_free;

       sz = ALIGN(c->max_idx_node_sz, c->min_io_size);

       sz = ALIGN(sz + c->max_idx_node_sz, c->min_io_size);

       c->cbuf = kmalloc(sz, GFP_NOFS);

       if (!c->cbuf) {

              err = -ENOMEM;

              goto out_free;

       }

       sprintf(c->bgt_name, BGT_NAME_PATTERN, c->vi.ubi_num, c->vi.vol_id);

       if (!mounted_read_only) {

              err = alloc_wbufs(c);

              if (err)

                     goto out_cbuf;

 

              /* Create background thread */

              // create the UBIFS background thread, which mainly services wbuf-based writes

              c->bgt = kthread_create(ubifs_bg_thread, c, "%s", c->bgt_name);

              if (IS_ERR(c->bgt)) {

                     err = PTR_ERR(c->bgt);

                     c->bgt = NULL;

                     ubifs_err("cannot spawn \"%s\", error %d",

                              c->bgt_name, err);

                     goto out_wbufs;

              }

              // wake the thread up

              wake_up_process(c->bgt);

       }

       err = ubifs_read_master(c);

       // see the detailed description below

       if (err)

              goto out_master;


 

       init_constants_master(c);

 

       if ((c->mst_node->flags & cpu_to_le32(UBIFS_MST_DIRTY)) != 0) {

              ubifs_msg("recovery needed");

              c->need_recovery = 1;

              if (!mounted_read_only) {

                     err = ubifs_recover_inl_heads(c, c->sbuf);

                     if (err)

                            goto out_master;

              }

       } else if (!mounted_read_only) {

              /*

               * Set the "dirty" flag so that if we reboot uncleanly we

               * will notice this immediately on the next mount.

               */

              c->mst_node->flags |= cpu_to_le32(UBIFS_MST_DIRTY);

              err = ubifs_write_master(c);

              if (err)

                     goto out_master;

       }

 

       err = ubifs_lpt_init(c, 1, !mounted_read_only);

       if (err)

              goto out_lpt;

 

       err = dbg_check_idx_size(c, c->old_idx_sz);

       if (err)

              goto out_lpt;

 

       err = ubifs_replay_journal(c);

       if (err)

              goto out_journal;

 

       /* Calculate 'min_idx_lebs' after journal replay */

       c->min_idx_lebs = ubifs_calc_min_idx_lebs(c);

 

       err = ubifs_mount_orphans(c, c->need_recovery, mounted_read_only);

       if (err)

              goto out_orphans;

 

       if (!mounted_read_only) {

              int lnum;

 

              err = check_free_space(c);

              if (err)

                     goto out_orphans;

 

              /* Check for enough log space */

              lnum = c->lhead_lnum + 1;

              if (lnum >= UBIFS_LOG_LNUM + c->log_lebs)

                     lnum = UBIFS_LOG_LNUM;

              if (lnum == c->ltail_lnum) {

                     err = ubifs_consolidate_log(c);

                     if (err)

                            goto out_orphans;

              }

 

              if (c->need_recovery) {

                     err = ubifs_recover_size(c);

                     if (err)

                            goto out_orphans;

                     err = ubifs_rcvry_gc_commit(c);

              } else {

                     err = take_gc_lnum(c);

                     if (err)

                            goto out_orphans;

 

                     /*

                      * GC LEB may contain garbage if there was an unclean

                      * reboot, and it should be un-mapped.

                      */

                     err = ubifs_leb_unmap(c, c->gc_lnum);

                     if (err)

                            return err;

              }

 

              err = dbg_check_lprops(c);

              if (err)

                     goto out_orphans;

       } else if (c->need_recovery) {

              err = ubifs_recover_size(c);

              if (err)

                     goto out_orphans;

       } else {

              /*

               * Even if we mount read-only, we have to set space in GC LEB

               * to proper value because this affects UBIFS free space

               * reporting. We do not want to have a situation when

               * re-mounting from R/O to R/W changes amount of free space.

               */

              err = take_gc_lnum(c);

              if (err)

                     goto out_orphans;

       }

 

       spin_lock(&ubifs_infos_lock);

       list_add_tail(&c->infos_list, &ubifs_infos);

       spin_unlock(&ubifs_infos_lock);

 

       if (c->need_recovery) {

              if (mounted_read_only)

                     ubifs_msg("recovery deferred");

              else {

                     c->need_recovery = 0;

                     ubifs_msg("recovery completed");

                     /*

                      * GC LEB has to be empty and taken at this point. But

                      * the journal head LEBs may also be accounted as

                      * "empty taken" if they are empty.

                      */

                     ubifs_assert(c->lst.taken_empty_lebs > 0);

              }

       } else

              ubifs_assert(c->lst.taken_empty_lebs > 0);

 

       err = dbg_check_filesystem(c);

       if (err)

              goto out_infos;

 

       err = dbg_debugfs_init_fs(c);

       if (err)

              goto out_infos;

 

       c->always_chk_crc = 0;

 

       ubifs_msg("mounted UBI device %d, volume %d, name \"%s\"",

                c->vi.ubi_num, c->vi.vol_id, c->vi.name);

       if (mounted_read_only)

              ubifs_msg("mounted read-only");

       x = (long long)c->main_lebs * c->leb_size;

       ubifs_msg("file system size:   %lld bytes (%lld KiB, %lld MiB, %d "

                "LEBs)", x, x >> 10, x >> 20, c->main_lebs);

       x = (long long)c->log_lebs * c->leb_size + c->max_bud_bytes;

       ubifs_msg("journal size:       %lld bytes (%lld KiB, %lld MiB, %d "

                "LEBs)", x, x >> 10, x >> 20, c->log_lebs + c->max_bud_cnt);

       ubifs_msg("media format:       w%d/r%d (latest is w%d/r%d)",

                c->fmt_version, c->ro_compat_version,

                UBIFS_FORMAT_VERSION, UBIFS_RO_COMPAT_VERSION);

       ubifs_msg("default compressor: %s", ubifs_compr_name(c->default_compr));

       ubifs_msg("reserved for root:  %llu bytes (%llu KiB)",

              c->report_rp_size, c->report_rp_size >> 10);

 

       dbg_msg("compiled on:         " __DATE__ " at " __TIME__);

       dbg_msg("min. I/O unit size:  %d bytes", c->min_io_size);

       dbg_msg("LEB size:            %d bytes (%d KiB)",

              c->leb_size, c->leb_size >> 10);

       dbg_msg("data journal heads:  %d",

              c->jhead_cnt - NONDATA_JHEADS_CNT);

       dbg_msg("UUID:                %02X%02X%02X%02X-%02X%02X"

             "-%02X%02X-%02X%02X-%02X%02X%02X%02X%02X%02X",

              c->uuid[0], c->uuid[1], c->uuid[2], c->uuid[3],

              c->uuid[4], c->uuid[5], c->uuid[6], c->uuid[7],

              c->uuid[8], c->uuid[9], c->uuid[10], c->uuid[11],

              c->uuid[12], c->uuid[13], c->uuid[14], c->uuid[15]);

       dbg_msg("big_lpt              %d", c->big_lpt);

       dbg_msg("log LEBs:            %d (%d - %d)",

              c->log_lebs, UBIFS_LOG_LNUM, c->log_last);

       dbg_msg("LPT area LEBs:       %d (%d - %d)",

              c->lpt_lebs, c->lpt_first, c->lpt_last);

       dbg_msg("orphan area LEBs:    %d (%d - %d)",

              c->orph_lebs, c->orph_first, c->orph_last);

       dbg_msg("main area LEBs:      %d (%d - %d)",

              c->main_lebs, c->main_first, c->leb_cnt - 1);

       dbg_msg("index LEBs:          %d", c->lst.idx_lebs);

       dbg_msg("total index bytes:   %lld (%lld KiB, %lld MiB)",

              c->old_idx_sz, c->old_idx_sz >> 10, c->old_idx_sz >> 20);

       dbg_msg("key hash type:       %d", c->key_hash_type);

       dbg_msg("tree fanout:         %d", c->fanout);

       dbg_msg("reserved GC LEB:     %d", c->gc_lnum);

       dbg_msg("first main LEB:      %d", c->main_first);

       dbg_msg("max. znode size      %d", c->max_znode_sz);

       dbg_msg("max. index node size %d", c->max_idx_node_sz);

       dbg_msg("node sizes:          data %zu, inode %zu, dentry %zu",

              UBIFS_DATA_NODE_SZ, UBIFS_INO_NODE_SZ, UBIFS_DENT_NODE_SZ);

       dbg_msg("node sizes:          trun %zu, sb %zu, master %zu",

              UBIFS_TRUN_NODE_SZ, UBIFS_SB_NODE_SZ, UBIFS_MST_NODE_SZ);

       dbg_msg("node sizes:          ref %zu, cmt. start %zu, orph %zu",

              UBIFS_REF_NODE_SZ, UBIFS_CS_NODE_SZ, UBIFS_ORPH_NODE_SZ);

       dbg_msg("max. node sizes:     data %zu, inode %zu dentry %zu",

              UBIFS_MAX_DATA_NODE_SZ, UBIFS_MAX_INO_NODE_SZ,

              UBIFS_MAX_DENT_NODE_SZ);

       dbg_msg("dead watermark:      %d", c->dead_wm);

       dbg_msg("dark watermark:      %d", c->dark_wm);

       dbg_msg("LEB overhead:        %d", c->leb_overhead);

       x = (long long)c->main_lebs * c->dark_wm;

       dbg_msg("max. dark space:     %lld (%lld KiB, %lld MiB)",

              x, x >> 10, x >> 20);

       dbg_msg("maximum bud bytes:   %lld (%lld KiB, %lld MiB)",

              c->max_bud_bytes, c->max_bud_bytes >> 10,

              c->max_bud_bytes >> 20);

       dbg_msg("BG commit bud bytes: %lld (%lld KiB, %lld MiB)",

              c->bg_bud_bytes, c->bg_bud_bytes >> 10,

              c->bg_bud_bytes >> 20);

       dbg_msg("current bud bytes    %lld (%lld KiB, %lld MiB)",

              c->bud_bytes, c->bud_bytes >> 10, c->bud_bytes >> 20);

       dbg_msg("max. seq. number:    %llu", c->max_sqnum);

       dbg_msg("commit number:       %llu", c->cmt_no);

 

       return 0;

 

out_infos:

       spin_lock(&ubifs_infos_lock);

       list_del(&c->infos_list);

       spin_unlock(&ubifs_infos_lock);

out_orphans:

       free_orphans(c);

out_journal:

       destroy_journal(c);

out_lpt:

       ubifs_lpt_free(c, 0);

out_master:

       kfree(c->mst_node);

       kfree(c->rcvrd_mst_node);

       if (c->bgt)

              kthread_stop(c->bgt);

out_wbufs:

       free_wbufs(c);

out_cbuf:

       kfree(c->cbuf);

out_free:

       kfree(c->bu.buf);

       vfree(c->ileb_buf);

       vfree(c->sbuf);

       kfree(c->bottom_up_buf);

       ubifs_debugging_exit(c);

       return err;

}

 

3.1 ubifs_read_superblock

int ubifs_read_superblock(struct ubifs_info *c)

{

       int err, sup_flags;

       struct ubifs_sb_node *sup;

       // If the earlier scan found that none of the volume's LEBs are mapped, the volume is empty and contains no information at all, so a pristine file system must be created. In effect this writes a superblock node (LEB 0), master nodes (LEB 1 and LEB 2), a commit-start node (LEB 3), an inode node (main_first + 1) and an index node (main_first + 0).

       // These nodes deserve a closer look. Every file system has a superblock holding its basic parameters; here ubifs writes the superblock onto the flash media as a node of superblock type.

       // As the document 《a brief introduce of ubi and ubifs》 shows, files are organized in a node structure for the sake of garbage collection. In jffs2 the corresponding data structures are built at mount time, which costs a lot of time and memory; in ubifs this data is kept on the flash media itself, and the master node is the root of that tree of information. The master node is kept in duplicate, one copy in LEB 1 and one in LEB 2. Why two copies?

Because updating a file changes the data in the B+tree, the master node must be updated accordingly; to prevent an unclean reboot during a master update from destroying the data, two copies are kept and used for recovery after an unclean reboot.

       if (c->empty) {

              err = create_default_filesystem(c);

              if (err)

                     return err;

       }

       // read the superblock; it may of course be the one create_default_filesystem just wrote above

       sup = ubifs_read_sb_node(c);

       if (IS_ERR(sup))

              return PTR_ERR(sup);

 

       c->fmt_version = le32_to_cpu(sup->fmt_version);

       c->ro_compat_version = le32_to_cpu(sup->ro_compat_version);

 

       /*

        * The software supports all previous versions but not future versions,

        * due to the unavailability of time-travelling equipment.

        */

       if (c->fmt_version > UBIFS_FORMAT_VERSION) {

              struct super_block *sb = c->vfs_sb;

              int mounting_ro = sb->s_flags & MS_RDONLY;

 

              ubifs_assert(!c->ro_media || mounting_ro);

              if (!mounting_ro ||

                  c->ro_compat_version > UBIFS_RO_COMPAT_VERSION) {

                     ubifs_err("on-flash format version is w%d/r%d, but "

                              "software only supports up to version "

                              "w%d/r%d", c->fmt_version,

                              c->ro_compat_version, UBIFS_FORMAT_VERSION,

                              UBIFS_RO_COMPAT_VERSION);

                     if (c->ro_compat_version <= UBIFS_RO_COMPAT_VERSION) {

                            ubifs_msg("only R/O mounting is possible");

                            err = -EROFS;

                     } else

                            err = -EINVAL;

                     goto out;

              }

 

              /*

               * The FS is mounted R/O, and the media format is

               * R/O-compatible with the UBIFS implementation, so we can

               * mount.

               */

              c->rw_incompat = 1;

       }

 

       if (c->fmt_version < 3) {

              ubifs_err("on-flash format version %d is not supported",

                       c->fmt_version);

              err = -EINVAL;

              goto out;

       }

 

// which key hash function to use

       switch (sup->key_hash) {

       case UBIFS_KEY_HASH_R5:

              c->key_hash = key_r5_hash;

              c->key_hash_type = UBIFS_KEY_HASH_R5;

              break;

 

       case UBIFS_KEY_HASH_TEST:

              c->key_hash = key_test_hash;

              c->key_hash_type = UBIFS_KEY_HASH_TEST;

              break;

       }

 

       c->key_fmt =sup->key_fmt;

 

       switch (c->key_fmt) {

       case UBIFS_SIMPLE_KEY_FMT:

              c->key_len = UBIFS_SK_LEN;

              break;

       default:

              ubifs_err("unsupported key format");

              err = -EINVAL;

              goto out;

       }

       // initialize the in-memory ubifs_info structure with the values read from the superblock

       c->leb_cnt       = le32_to_cpu(sup->leb_cnt);

       c->max_leb_cnt   = le32_to_cpu(sup->max_leb_cnt);

       c->max_bud_bytes =le64_to_cpu(sup->max_bud_bytes);

       c->log_lebs      = le32_to_cpu(sup->log_lebs);

       c->lpt_lebs      = le32_to_cpu(sup->lpt_lebs);

       c->orph_lebs     = le32_to_cpu(sup->orph_lebs);

       c->jhead_cnt     = le32_to_cpu(sup->jhead_cnt) +NONDATA_JHEADS_CNT;

       c->fanout        = le32_to_cpu(sup->fanout);

       c->lsave_cnt     = le32_to_cpu(sup->lsave_cnt);

       c->rp_size       = le64_to_cpu(sup->rp_size);

       c->rp_uid        = le32_to_cpu(sup->rp_uid);

       c->rp_gid        = le32_to_cpu(sup->rp_gid);

       sup_flags        = le32_to_cpu(sup->flags);

       if(!c->mount_opts.override_compr)

              c->default_compr= le16_to_cpu(sup->default_compr);

 

       c->vfs_sb->s_time_gran= le32_to_cpu(sup->time_gran);

       memcpy(&c->uuid,&sup->uuid, 16);

       c->big_lpt =!!(sup_flags & UBIFS_FLG_BIGLPT);

 

       /* Automatically increase file system size to the maximum size */
       //A UBI volume can be resized; if the volume has grown, the superblock has to be rewritten with the new LEB count.
       c->old_leb_cnt = c->leb_cnt;
       if (c->leb_cnt < c->vi.size && c->leb_cnt < c->max_leb_cnt) {
              c->leb_cnt = min_t(int, c->max_leb_cnt, c->vi.size);
              if (c->vfs_sb->s_flags & MS_RDONLY)
                     dbg_mnt("Autoresizing (ro) from %d LEBs to %d LEBs",
                            c->old_leb_cnt, c->leb_cnt);
              else {
                     dbg_mnt("Autoresizing (sb) from %d LEBs to %d LEBs",
                            c->old_leb_cnt, c->leb_cnt);
                     sup->leb_cnt = cpu_to_le32(c->leb_cnt);
                     err = ubifs_write_sb_node(c, sup);
                     if (err)
                            goto out;
                     c->old_leb_cnt = c->leb_cnt;
              }
       }

 

       c->log_bytes = (long long)c->log_lebs * c->leb_size;
       c->log_last = UBIFS_LOG_LNUM + c->log_lebs - 1;
       c->lpt_first = UBIFS_LOG_LNUM + c->log_lebs;
       c->lpt_last = c->lpt_first + c->lpt_lebs - 1;
       c->orph_first = c->lpt_last + 1;
       c->orph_last = c->orph_first + c->orph_lebs - 1;
       c->main_lebs = c->leb_cnt - UBIFS_SB_LEBS - UBIFS_MST_LEBS;
       c->main_lebs -= c->log_lebs + c->lpt_lebs + c->orph_lebs;
       c->main_first = c->leb_cnt - c->main_lebs;

       err = validate_sb(c, sup);
out:
       kfree(sup);
       return err;

}
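The lines above fix the on-flash layout: the log, LPT, orphan and main areas follow each other immediately after the superblock and the two master LEBs. A small standalone sketch of the same computation (plain C with arbitrarily chosen geometry; the three macros mirror the kernel constants 1, 2 and 3, everything else is invented):

#include <stdio.h>

/* Mirrors of the kernel constants: 1 superblock LEB and 2 master LEBs,
 * so the log starts at LEB 3. */
#define UBIFS_SB_LEBS  1
#define UBIFS_MST_LEBS 2
#define UBIFS_LOG_LNUM (UBIFS_SB_LEBS + UBIFS_MST_LEBS)

int main(void)
{
    /* Example geometry, chosen arbitrarily for illustration. */
    int leb_cnt = 128, log_lebs = 5, lpt_lebs = 2, orph_lebs = 1;

    int lpt_first  = UBIFS_LOG_LNUM + log_lebs;
    int lpt_last   = lpt_first + lpt_lebs - 1;
    int orph_first = lpt_last + 1;
    int orph_last  = orph_first + orph_lebs - 1;
    int main_lebs  = leb_cnt - UBIFS_SB_LEBS - UBIFS_MST_LEBS
                             - log_lebs - lpt_lebs - orph_lebs;
    int main_first = leb_cnt - main_lebs;

    printf("log:    LEB %d..%d\n", UBIFS_LOG_LNUM, lpt_first - 1);
    printf("lpt:    LEB %d..%d\n", lpt_first, lpt_last);
    printf("orphan: LEB %d..%d\n", orph_first, orph_last);
    printf("main:   LEB %d..%d\n", main_first, leb_cnt - 1);
    return 0;
}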

 

3.2 create_default_filesystem

static int create_default_filesystem(struct ubifs_info *c)

{

       struct ubifs_sb_node *sup;
       struct ubifs_mst_node *mst;
       struct ubifs_idx_node *idx;
       struct ubifs_branch *br;
       struct ubifs_ino_node *ino;
       struct ubifs_cs_node *cs;
       union ubifs_key key;
       int err, tmp, jnl_lebs, log_lebs, max_buds, main_lebs, main_first;
       int lpt_lebs, lpt_first, orph_lebs, big_lpt, ino_waste, sup_flags = 0;
       int min_leb_cnt = UBIFS_MIN_LEB_CNT;
       long long tmp64, main_bytes;
       __le64 tmp_le64;

       /* Some functions called from here depend on the @c->key_len field */
       c->key_len = UBIFS_SK_LEN;

 

       /*

        * First of all, we have to calculate defaultfile-system geometry -

        * log size, journal size, etc.

        */

       //First, compute the sizes of the journal and the log area from the size of the file system. The purpose of the journal was touched on earlier: UBIFS keeps the file B+tree data on the flash medium, so every file update would mean updating the related B+tree information, which would cause frequent flash I/O and hurt performance. With a journal, an update first records the affected inode information in the log, and only when the journal fills up is the on-flash B+tree updated in one go. That lowers the update frequency and improves file-system performance.

       if (c->leb_cnt < 0x7FFFFFFF / DEFAULT_JNL_PERCENT)
              /* We can first multiply then divide and have no overflow */
              jnl_lebs = c->leb_cnt * DEFAULT_JNL_PERCENT / 100;
       else
              jnl_lebs = (c->leb_cnt / 100) * DEFAULT_JNL_PERCENT;

       if (jnl_lebs < UBIFS_MIN_JNL_LEBS)
              jnl_lebs = UBIFS_MIN_JNL_LEBS;
       if (jnl_lebs * c->leb_size > DEFAULT_MAX_JNL)
              jnl_lebs = DEFAULT_MAX_JNL / c->leb_size;

 

       /*
        * The log should be large enough to fit reference nodes for all bud
        * LEBs. Because buds do not have to start from the beginning of LEBs
        * (half of the LEB may contain committed data), the log should
        * generally be larger, make it twice as large.
        */
       tmp = 2 * (c->ref_node_alsz * jnl_lebs) + c->leb_size - 1;
       log_lebs = tmp / c->leb_size;
       /* Plus one LEB reserved for commit */
       log_lebs += 1;
       if (c->leb_cnt - min_leb_cnt > 8) {
              /* And some extra space to allow writes while committing */
              log_lebs += 1;
              min_leb_cnt += 1;
       }

 

       max_buds = jnl_lebs - log_lebs;
       if (max_buds < UBIFS_MIN_BUD_LEBS)
              max_buds = UBIFS_MIN_BUD_LEBS;

 

       /*
        * Orphan nodes are stored in a separate area. One node can store a lot
        * of orphan inode numbers, but when a new orphan comes we just add a new
        * orphan node. At some point the nodes are consolidated into one
        * orphan node.
        */
       //An orphan is an inode number whose inode node has been committed to the index with a link count of zero. That happens when an open file is deleted (unlinked) and then a commit is run.
       //In UBIFS, when an inode's link count drops to zero the file has to be deleted. To guard against an unclean reboot in the middle of the deletion, UBIFS records the files to be deleted in the orphan area; after an unclean reboot the file system then knows exactly which files still need deleting, instead of scanning the whole partition. When space runs out, the GC subsystem can also reclaim it. The information about orphans is kept in the orphan area, which is a fixed number of LEBs situated between the LPT area and the main area.

       orph_lebs = UBIFS_MIN_ORPH_LEBS;
#ifdef CONFIG_UBIFS_FS_DEBUG
       if (c->leb_cnt - min_leb_cnt > 1)
              /*
               * For debugging purposes it is better to have at least 2
               * orphan LEBs, because the orphan subsystem would need to do
               * consolidations and would be stressed more.
               */
              orph_lebs += 1;
#endif

 

       main_lebs = c->leb_cnt - UBIFS_SB_LEBS - UBIFS_MST_LEBS - log_lebs;
       main_lebs -= orph_lebs;
       //As mentioned above, the orphan area lies between the LPT area and the main area. What is the LPT? LPT = LEB Properties Tree.
       lpt_first = UBIFS_LOG_LNUM + log_lebs;
       c->lsave_cnt = DEFAULT_LSAVE_CNT;
       c->max_leb_cnt = c->leb_cnt;
       err = ubifs_create_dflt_lpt(c, &main_lebs, lpt_first, &lpt_lebs,
                                &big_lpt);

*********************************************************************************

ubifs_create_dflt_lpt works out how many LEBs the LPT needs. The LPT describes, for every LEB in UBIFS, its free bytes and its dirty bytes. ("Dirty" here does not seem to mean modified: judging from the code pnode->lprops[0].dirty = iopos - node_sz;, it is roughly space that has not been written but that nobody else can use either. Since the basic unit of a flash operation is a page, if only half a page is written the other half is dirty: nothing is stored there, yet it cannot be used. In the documentation's words, dirty space is the number of bytes taken up by obsolete nodes and padding, which can potentially be reclaimed by garbage collection.) Because the LPT area itself occupies LEBs, the LPT also needs a table for its own LEBs, much like the kernel building page tables for itself during boot.

a)  create LEB properties for the LEBs occupied by the root index node and the root inode node;

b)  build the information for all remaining pnodes, and write it to the flash medium.

**********************************************************************************
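To make the free/dirty distinction concrete, a tiny sketch follows (all values invented; ALIGN_UP plays the role of the kernel's ALIGN macro): writing one node whose size is not a multiple of the minimum I/O unit leaves padding behind that counts as dirty space.

#include <stdio.h>

/* Round x up to a multiple of a (a power of two), like the kernel's ALIGN(). */
#define ALIGN_UP(x, a) (((x) + (a) - 1) & ~((a) - 1))

int main(void)
{
    int leb_size    = 131072; /* example LEB size        */
    int min_io_size = 2048;   /* example NAND page size  */
    int node_sz     = 160;    /* example node size       */

    /* The write position can only advance in min_io_size units. */
    int iopos = ALIGN_UP(node_sz, min_io_size);

    int dirty = iopos - node_sz;  /* padding: unusable until GC */
    int free_ = leb_size - iopos; /* still writable             */

    printf("used=%d dirty=%d free=%d\n", node_sz, dirty, free_);
    return 0;
}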

if (err)
              return err;
       dbg_gen("LEB Properties Tree created (LEBs %d-%d)", lpt_first,
              lpt_first + lpt_lebs - 1);

       main_first = c->leb_cnt - main_lebs;

       /* Create default superblock */
       tmp = ALIGN(UBIFS_SB_NODE_SZ, c->min_io_size);
       sup = kzalloc(tmp, GFP_KERNEL);
       if (!sup)
              return -ENOMEM;

 

       tmp64 = (long long)max_buds * c->leb_size;
       if (big_lpt)
              sup_flags |= UBIFS_FLG_BIGLPT;
       //Initialize the superblock node
       sup->ch.node_type  = UBIFS_SB_NODE;
       sup->key_hash      = UBIFS_KEY_HASH_R5;
       sup->flags         = cpu_to_le32(sup_flags);
       sup->min_io_size   = cpu_to_le32(c->min_io_size);
       sup->leb_size      = cpu_to_le32(c->leb_size);
       sup->leb_cnt       = cpu_to_le32(c->leb_cnt);
       sup->max_leb_cnt   = cpu_to_le32(c->max_leb_cnt);
       sup->max_bud_bytes = cpu_to_le64(tmp64);
       sup->log_lebs      = cpu_to_le32(log_lebs);
       sup->lpt_lebs      = cpu_to_le32(lpt_lebs);
       sup->orph_lebs     = cpu_to_le32(orph_lebs);
       sup->jhead_cnt     = cpu_to_le32(DEFAULT_JHEADS_CNT);
       sup->fanout        = cpu_to_le32(DEFAULT_FANOUT);
       sup->lsave_cnt     = cpu_to_le32(c->lsave_cnt);
       sup->fmt_version   = cpu_to_le32(UBIFS_FORMAT_VERSION);
       sup->time_gran     = cpu_to_le32(DEFAULT_TIME_GRAN);
       if (c->mount_opts.override_compr)
              sup->default_compr = cpu_to_le16(c->mount_opts.compr_type);
       else
              sup->default_compr = cpu_to_le16(UBIFS_COMPR_LZO);

       generate_random_uuid(sup->uuid);

       main_bytes = (long long)main_lebs * c->leb_size;
       tmp64 = div_u64(main_bytes * DEFAULT_RP_PERCENT, 100);
       if (tmp64 > DEFAULT_MAX_RP_SIZE)
              tmp64 = DEFAULT_MAX_RP_SIZE;
       sup->rp_size = cpu_to_le64(tmp64);
       sup->ro_compat_version = cpu_to_le32(UBIFS_RO_COMPAT_VERSION);

       //Write the superblock node to LEB 0
       err = ubifs_write_node(c, sup, UBIFS_SB_NODE_SZ, 0, 0, UBI_LONGTERM);
       kfree(sup);
       if (err)
              return err;

       dbg_gen("default superblock created at LEB 0:0");

 

       /* Create default master node */
       mst = kzalloc(c->mst_node_alsz, GFP_KERNEL);
       if (!mst)
              return -ENOMEM;

       //Initialize the master node
       mst->ch.node_type = UBIFS_MST_NODE;
       mst->log_lnum     = cpu_to_le32(UBIFS_LOG_LNUM);
       mst->highest_inum = cpu_to_le64(UBIFS_FIRST_INO);
       mst->cmt_no       = 0;
       mst->root_lnum    = cpu_to_le32(main_first + DEFAULT_IDX_LEB);
       mst->root_offs    = 0;
       tmp = ubifs_idx_node_sz(c, 1);
       mst->root_len     = cpu_to_le32(tmp);
       mst->gc_lnum      = cpu_to_le32(main_first + DEFAULT_GC_LEB);
       mst->ihead_lnum   = cpu_to_le32(main_first + DEFAULT_IDX_LEB);
       mst->ihead_offs   = cpu_to_le32(ALIGN(tmp, c->min_io_size));

       mst->index_size   = cpu_to_le64(ALIGN(tmp, 8));

       mst->lpt_lnum     = cpu_to_le32(c->lpt_lnum);

       mst->lpt_offs     = cpu_to_le32(c->lpt_offs);

       mst->nhead_lnum   = cpu_to_le32(c->nhead_lnum);

       mst->nhead_offs   = cpu_to_le32(c->nhead_offs);

       mst->ltab_lnum    = cpu_to_le32(c->ltab_lnum);

       mst->ltab_offs    = cpu_to_le32(c->ltab_offs);

       mst->lsave_lnum   = cpu_to_le32(c->lsave_lnum);

       mst->lsave_offs   = cpu_to_le32(c->lsave_offs);

       mst->lscan_lnum   = cpu_to_le32(main_first);

       mst->empty_lebs   = cpu_to_le32(main_lebs - 2);

       mst->idx_lebs     = cpu_to_le32(1);

       mst->leb_cnt      = cpu_to_le32(c->leb_cnt);

 

       /* Calculate lprops statistics */
       tmp64 = main_bytes;
       tmp64 -= ALIGN(ubifs_idx_node_sz(c, 1), c->min_io_size);
       tmp64 -= ALIGN(UBIFS_INO_NODE_SZ, c->min_io_size);
       mst->total_free = cpu_to_le64(tmp64);

       tmp64 = ALIGN(ubifs_idx_node_sz(c, 1), c->min_io_size);
       ino_waste = ALIGN(UBIFS_INO_NODE_SZ, c->min_io_size) -
                       UBIFS_INO_NODE_SZ;
       tmp64 += ino_waste;
       tmp64 -= ALIGN(ubifs_idx_node_sz(c, 1), 8);
       mst->total_dirty = cpu_to_le64(tmp64);

       /* The indexing LEB does not contribute to dark space */
       tmp64 = (c->main_lebs - 1) * c->dark_wm;
       mst->total_dark = cpu_to_le64(tmp64);

       mst->total_used = cpu_to_le64(UBIFS_INO_NODE_SZ);

       //The master node is written in duplicate
       err = ubifs_write_node(c, mst, UBIFS_MST_NODE_SZ, UBIFS_MST_LNUM, 0,
                            UBI_UNKNOWN);
       if (err) {
              kfree(mst);
              return err;
       }
       err = ubifs_write_node(c, mst, UBIFS_MST_NODE_SZ, UBIFS_MST_LNUM + 1, 0,
                            UBI_UNKNOWN);
       kfree(mst);
       if (err)
              return err;

 

       dbg_gen("defaultmaster node created at LEB %d:0", UBIFS_MST_LNUM);

 

       /* Create the root indexing node */
       tmp = ubifs_idx_node_sz(c, 1);
       //The idx node: judging from the description in tnc.c, the zbranch member and the make_idx_node function, idx nodes are how the TNC tree is stored on the flash medium.
The kernel represents an on-flash idx node with struct ubifs_znode. The children of an idx node point at the real data, which may itself be another idx node or a leaf holding the actual data.
What is initialized here is the root node of the TNC.
       //《A Brief Introduction to the Design of UBIFS》 says an inode node and its data are stored separately, and the idx node above really stores index data. So is a node of type struct ubifs_ino_node what stores the inode? (yes)
       //In UBIFS, inodes have a corresponding inode node which records the number of directory entry links, more simply known as the link count.
       //An inode node is a node that holds the metadata for an inode. Every inode has exactly one (non-obsolete) inode node.

       idx = kzalloc(ALIGN(tmp, c->min_io_size), GFP_KERNEL);
       if (!idx)
              return -ENOMEM;

       c->key_fmt = UBIFS_SIMPLE_KEY_FMT;
       c->key_hash = key_r5_hash;

       idx->ch.node_type = UBIFS_IDX_NODE;
       idx->child_cnt = cpu_to_le16(1);
       ino_key_init(c, &key, UBIFS_ROOT_INO);
       br = ubifs_idx_branch(c, idx, 0);
       key_write_idx(c, &key, &br->key);
       br->lnum = cpu_to_le32(main_first + DEFAULT_DATA_LEB);
       br->len  = cpu_to_le32(UBIFS_INO_NODE_SZ);
       err = ubifs_write_node(c, idx, tmp, main_first + DEFAULT_IDX_LEB, 0,
                            UBI_UNKNOWN);
       kfree(idx);
       if (err)
              return err;

 

       dbg_gen("defaultroot indexing node created LEB %d:0",

              main_first +DEFAULT_IDX_LEB);

 

       /* Create default rootinode */

       tmp =ALIGN(UBIFS_INO_NODE_SZ, c->min_io_size);

       ino = kzalloc(tmp,GFP_KERNEL);

       if (!ino)

              return -ENOMEM;

 

       ino_key_init_flash(c, &ino->key, UBIFS_ROOT_INO);
       ino->ch.node_type = UBIFS_INO_NODE;
       ino->creat_sqnum = cpu_to_le64(++c->max_sqnum);
       ino->nlink = cpu_to_le32(2);
       tmp_le64 = cpu_to_le64(CURRENT_TIME_SEC.tv_sec);

       ino->atime_sec   = tmp_le64;

       ino->ctime_sec   = tmp_le64;

       ino->mtime_sec   = tmp_le64;

       ino->atime_nsec  = 0;

       ino->ctime_nsec  = 0;

       ino->mtime_nsec  = 0;

       ino->mode = cpu_to_le32(S_IFDIR | S_IRUGO | S_IWUSR | S_IXUGO);
       ino->size = cpu_to_le64(UBIFS_INO_NODE_SZ);

 

       /* Set compression enabled by default */
       ino->flags = cpu_to_le32(UBIFS_COMPR_FL);

       err = ubifs_write_node(c, ino, UBIFS_INO_NODE_SZ,
                                   main_first + DEFAULT_DATA_LEB, 0,
                            UBI_UNKNOWN);
       kfree(ino);
       if (err)
              return err;

 

       dbg_gen("rootinode created at LEB %d:0",

              main_first +DEFAULT_DATA_LEB);

 

       /*
        * The first node in the log has to be the commit start node. This is
        * always the case during normal file-system operation. Write a fake
        * commit start node to the log.
        */
       tmp = ALIGN(UBIFS_CS_NODE_SZ, c->min_io_size);
       cs = kzalloc(tmp, GFP_KERNEL);
       if (!cs)
              return -ENOMEM;

 

       cs->ch.node_type = UBIFS_CS_NODE;
       //Write a commit-start node into the log area. Every commit writes two kinds of node into the log: a commit-start node marking the beginning of the commit, and reference nodes recording the LEB and offset that the corresponding journal entries live at.
       err = ubifs_write_node(c, cs, UBIFS_CS_NODE_SZ, UBIFS_LOG_LNUM,
                            0, UBI_UNKNOWN);
       kfree(cs);
       ubifs_msg("default file-system created");

       return 0;

}

 

3.3 ubifs_read_master

This reads the master node of the UBIFS file system. As mentioned earlier, the master node is kept in duplicate because it holds the most fundamental index information, which must not be lost. Also, the two master copies are never written at the same time, so an unclean reboot cannot corrupt both at once.

int ubifs_read_master(struct ubifs_info *c)

{

       int err, old_leb_cnt;

       c->mst_node = kzalloc(c->mst_node_alsz, GFP_KERNEL);
       if (!c->mst_node)
              return -ENOMEM;

       //Check the two master node copies to see whether the master data has been corrupted.
       err = scan_for_master(c);
       if (err) {
              if (err == -EUCLEAN)
                     //If it has, it needs recovering
                     err = ubifs_recover_master_node(c);
              if (err)
                     /*
                      * Note, we do not free 'c->mst_node' here because the
                      * unmount routine will take care of this.
                      */
                     return err;

       }

 

       /* Make sure that the recovery flag is clear */
       //Initialize the in-memory ubifs_info fields from the master node
       c->mst_node->flags &= cpu_to_le32(~UBIFS_MST_RCVRY);
       c->max_sqnum       = le64_to_cpu(c->mst_node->ch.sqnum);
       c->highest_inum    = le64_to_cpu(c->mst_node->highest_inum);
       c->cmt_no          = le64_to_cpu(c->mst_node->cmt_no);
       c->zroot.lnum      = le32_to_cpu(c->mst_node->root_lnum);
       c->zroot.offs      = le32_to_cpu(c->mst_node->root_offs);
       c->zroot.len       = le32_to_cpu(c->mst_node->root_len);
       c->lhead_lnum      = le32_to_cpu(c->mst_node->log_lnum);
       c->gc_lnum         = le32_to_cpu(c->mst_node->gc_lnum);
       c->ihead_lnum      = le32_to_cpu(c->mst_node->ihead_lnum);
       c->ihead_offs      = le32_to_cpu(c->mst_node->ihead_offs);
       c->old_idx_sz      = le64_to_cpu(c->mst_node->index_size);
       c->lpt_lnum        = le32_to_cpu(c->mst_node->lpt_lnum);
       c->lpt_offs        = le32_to_cpu(c->mst_node->lpt_offs);
       c->nhead_lnum      = le32_to_cpu(c->mst_node->nhead_lnum);
       c->nhead_offs      = le32_to_cpu(c->mst_node->nhead_offs);
       c->ltab_lnum       = le32_to_cpu(c->mst_node->ltab_lnum);
       c->ltab_offs       = le32_to_cpu(c->mst_node->ltab_offs);
       c->lsave_lnum      = le32_to_cpu(c->mst_node->lsave_lnum);
       c->lsave_offs      = le32_to_cpu(c->mst_node->lsave_offs);
       c->lscan_lnum      = le32_to_cpu(c->mst_node->lscan_lnum);
       c->lst.empty_lebs  = le32_to_cpu(c->mst_node->empty_lebs);
       c->lst.idx_lebs    = le32_to_cpu(c->mst_node->idx_lebs);
       old_leb_cnt        = le32_to_cpu(c->mst_node->leb_cnt);
       c->lst.total_free  = le64_to_cpu(c->mst_node->total_free);
       c->lst.total_dirty = le64_to_cpu(c->mst_node->total_dirty);
       c->lst.total_used  = le64_to_cpu(c->mst_node->total_used);
       c->lst.total_dead  = le64_to_cpu(c->mst_node->total_dead);
       c->lst.total_dark  = le64_to_cpu(c->mst_node->total_dark);

 

       c->calc_idx_sz = c->old_idx_sz;

       if (c->mst_node->flags & cpu_to_le32(UBIFS_MST_NO_ORPHS))
              c->no_orphs = 1;

 

       if (old_leb_cnt != c->leb_cnt) {
              /* The file system has been resized */
              int growth = c->leb_cnt - old_leb_cnt;

              if (c->leb_cnt < old_leb_cnt ||
                  c->leb_cnt < UBIFS_MIN_LEB_CNT) {
                     ubifs_err("bad leb_cnt on master node");
                     dbg_dump_node(c, c->mst_node);
                     return -EINVAL;
              }
              dbg_mnt("Autoresizing (master) from %d LEBs to %d LEBs",
                     old_leb_cnt, c->leb_cnt);
              c->lst.empty_lebs += growth;
              c->lst.total_free += growth * (long long)c->leb_size;
              c->lst.total_dark += growth * (long long)c->dark_wm;
              /*
               * Reflect changes back onto the master node. N.B. the master
               * node gets written immediately whenever mounting (or
               * remounting) in read-write mode, so we do not need to write it
               * here.
               */
              c->mst_node->leb_cnt = cpu_to_le32(c->leb_cnt);
              c->mst_node->empty_lebs = cpu_to_le32(c->lst.empty_lebs);
              c->mst_node->total_free = cpu_to_le64(c->lst.total_free);
              c->mst_node->total_dark = cpu_to_le64(c->lst.total_dark);
       }

       err = validate_master(c);
       if (err)
              return err;
       err = dbg_old_index_check_init(c, &c->zroot);
       return err;

}


(II): The read and write flow through the VFS

1. The write flow through the VFS

       Having read UBIFS on and off for this long, it only seems to get more tangled, so let us start from the VFS read/write entry points and slowly work outward from there.

       const struct file_operations ubifs_file_operations = {
       .llseek         = generic_file_llseek,
       .read           = do_sync_read,
       .write          = do_sync_write,
       .aio_read       = generic_file_aio_read,
       .aio_write      = ubifs_aio_write,
       .mmap           = ubifs_file_mmap,
       .fsync          = ubifs_fsync,
       .unlocked_ioctl = ubifs_ioctl,
       .splice_read    = generic_file_splice_read,
       .splice_write   = generic_file_splice_write,
#ifdef CONFIG_COMPAT
       .compat_ioctl   = ubifs_compat_ioctl,
#endif
};

The code of ubifs_aio_write is quite short:

static ssize_t ubifs_aio_write(struct kiocb *iocb, const struct iovec *iov,
                            unsigned long nr_segs, loff_t pos)
{
       int err;
       ssize_t ret;
       struct inode *inode = iocb->ki_filp->f_mapping->host;
       struct ubifs_info *c = inode->i_sb->s_fs_info;
       err = update_mctime(c, inode);
       if (err)
              return err;
       ret = generic_file_aio_write(iocb, iov, nr_segs, pos);
       if (ret < 0)
              return ret;
       if (ret > 0 && (IS_SYNC(inode) || iocb->ki_filp->f_flags & O_SYNC)) {
              err = ubifs_sync_wbufs_by_inode(c, inode);
              if (err)
                     return err;
       }
       return ret;
}

In the asynchronous case, generic_file_aio_write is called directly; it writes the data into the cache, and a background thread later pushes it out to the flash medium. When mounted in sync mode the data is not left to the background thread: ubifs_sync_wbufs_by_inode is called to write it to the flash medium immediately.

Let us look at that function in detail:

int ubifs_sync_wbufs_by_inode(struct ubifs_info *c, struct inode *inode)
{
       int i, err = 0;
       for (i = 0; i < c->jhead_cnt; i++) {
              struct ubifs_wbuf *wbuf = &c->jheads[i].wbuf;
              if (i == GCHD)
                     /*
                      * GC head is special, do not look at it. Even if the
                      * head contains something related to this inode, it is
                      * a _copy_ of corresponding on-flash node which sits
                      * somewhere else.
                      */
                     continue;
              if (!wbuf_has_ino(wbuf, inode->i_ino))
                     continue;
              mutex_lock_nested(&wbuf->io_mutex, wbuf->jhead);
              if (wbuf_has_ino(wbuf, inode->i_ino))
                     err = ubifs_wbuf_sync_nolock(wbuf);
              mutex_unlock(&wbuf->io_mutex);
              if (err) {
                     ubifs_ro_mode(c, err);
                     return err;
              }
       }
       return 0;
}

This function uses wbuf_has_ino to check whether a write-buffer holds data belonging to this inode; after taking the wbuf's mutex it calls ubifs_wbuf_sync_nolock to sync the data.

struct ubifs_info contains these members:

       int jhead_cnt;
       struct ubifs_jhead *jheads;

They are used for managing the journal. In /fs/ubifs/ubifs-media.h there are these macro definitions:

/* Garbage collector journal head number */

#define UBIFS_GC_HEAD   0

/* Base journal head number */

#define UBIFS_BASE_HEAD 1

/* Data journal head number */

#define UBIFS_DATA_HEAD 2

These are the three buffers UBIFS manages for different purposes: one for garbage collection, one (the base head) used for non-data nodes, and one for data I/O.
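A hedged sketch of the idea (the classification enum and pick_jhead are invented for illustration; only the three head numbers mirror the macros quoted above): every node is routed to one of the heads, which keeps data and metadata in separate buds so GC can treat them differently.

/* Mirrors of the macros quoted above. */
#define UBIFS_GC_HEAD   0
#define UBIFS_BASE_HEAD 1
#define UBIFS_DATA_HEAD 2

/* Illustrative node classification, not the kernel's. */
enum node_kind { NODE_DATA, NODE_INODE, NODE_DENT, NODE_GC_COPY };

/* Pick which journal head (and hence which write-buffer) a node goes
 * to: data nodes to the data head, nodes moved by garbage collection
 * to the GC head, and all other metadata to the base head. */
static int pick_jhead(enum node_kind kind)
{
    switch (kind) {
    case NODE_DATA:    return UBIFS_DATA_HEAD;
    case NODE_GC_COPY: return UBIFS_GC_HEAD;
    default:           return UBIFS_BASE_HEAD;
    }
}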

struct ubifs_jhead {
       struct ubifs_wbuf wbuf;
       struct list_head buds_list;
};

struct ubifs_wbuf {
       struct ubifs_info *c;
       void *buf;   // the memory actually allocated for buffering data
       int lnum;    // which LEB is being buffered
       int offs;    // offset of the buffered data within that LEB
       int avail;   // bytes still available in the buffer
       int used;    // bytes already used in the buffer
       int dtype;   // type of data stored in this LEB (%UBI_LONGTERM, %UBI_SHORTTERM, %UBI_UNKNOWN)
       int jhead;
       int (*sync_callback)(struct ubifs_info *c, int lnum, int free, int pad);
       struct mutex io_mutex;
       spinlock_t lock;
       ktime_t softlimit;
       unsigned long long delta;
       struct hrtimer timer;
       unsigned int no_timer:1;
       unsigned int need_sync:1;
       int next_ino;
       ino_t *inodes; // the inodes that own the buffered data
};

As its kernel comment says, struct ubifs_wbuf is the UBIFS write-buffer. The wbuf is a buffer at the UBIFS level; let us see step by step how it is implemented.
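Before the real code, here is a heavily simplified userspace sketch of the write-buffer idea (all names invented; no locking, timers or inode tracking): bytes accumulate in memory and are pushed to the medium only in whole minimum-I/O units, padded when necessary.

#include <string.h>
#include <stdio.h>

#define MIN_IO 8 /* toy minimum I/O unit */

struct toy_wbuf {
    char buf[MIN_IO];
    int  used;    /* bytes buffered so far    */
    int  flushed; /* bytes already "on flash" */
};

/* Stand-in for the real flash write. */
static void flash_write(const char *data, int len, int offs)
{
    printf("flash write %d bytes at offset %d\n", len, offs);
    (void)data;
}

/* Pad the unused tail and force the buffer out, the way
 * ubifs_wbuf_sync_nolock uses ubifs_pad + ubi_leb_write. */
static void toy_sync(struct toy_wbuf *w)
{
    if (!w->used)
        return;
    memset(w->buf + w->used, 0xFF, MIN_IO - w->used); /* padding */
    flash_write(w->buf, MIN_IO, w->flushed);
    w->flushed += MIN_IO;
    w->used = 0;
}

/* Buffer data, flushing whenever a full I/O unit accumulates. */
static void toy_write(struct toy_wbuf *w, const char *data, int len)
{
    while (len > 0) {
        int n = MIN_IO - w->used;
        if (n > len)
            n = len;
        memcpy(w->buf + w->used, data, n);
        w->used += n;
        data += n;
        len -= n;
        if (w->used == MIN_IO)
            toy_sync(w);
    }
}

int main(void)
{
    struct toy_wbuf w = { .used = 0, .flushed = 0 };
    toy_write(&w, "hello flash!", 12); /* flushes one full unit      */
    toy_sync(&w);                      /* pads and flushes the tail  */
    return 0;
}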

int ubifs_wbuf_sync_nolock(struct ubifs_wbuf *wbuf)
{
       struct ubifs_info *c = wbuf->c;
       int err, dirt;
       cancel_wbuf_timer_nolock(wbuf);
//Cancel the wbuf timer; the background thread normally uses this timer to flush the data periodically.
       if (!wbuf->used || wbuf->lnum == -1)
              /* Write-buffer is empty or not seeked */
              return 0;
       dbg_io("LEB %d:%d, %d bytes, jhead %s",
              wbuf->lnum, wbuf->offs, wbuf->used, dbg_jhead(wbuf->jhead));
       ubifs_assert(!(c->vfs_sb->s_flags & MS_RDONLY));
       ubifs_assert(!(wbuf->avail & 7));
       ubifs_assert(wbuf->offs + c->min_io_size <= c->leb_size);
       if (c->ro_media)
              return -EROFS;
       ubifs_pad(c, wbuf->buf + wbuf->used, wbuf->avail);
       //Call ubi_leb_write to write to the LEB (logical eraseblock)
       err = ubi_leb_write(c->ubi, wbuf->lnum, wbuf->buf, wbuf->offs,
                         c->min_io_size, wbuf->dtype);
       if (err) {
              ubifs_err("cannot write %d bytes to LEB %d:%d",
                       c->min_io_size, wbuf->lnum, wbuf->offs);
              dbg_dump_stack();
              return err;
       }
       dirt = wbuf->avail;
       spin_lock(&wbuf->lock);
       wbuf->offs += c->min_io_size;
       wbuf->avail = c->min_io_size;
       wbuf->used = 0;
       wbuf->next_ino = 0;
       spin_unlock(&wbuf->lock);
       if (wbuf->sync_callback)
              err = wbuf->sync_callback(c, wbuf->lnum,
                                     c->leb_size - wbuf->offs, dirt);
       return err;
}

The core of this function is the call into the UBI layer's ubi_leb_write, which writes the data to flash.

The call chain of ubi_leb_write:

->ubi_leb_write (read/write a logical eraseblock)

->ubi_eba_write_leb (in the kernel, every volume maintains an array vol->eba_tbl holding the mapping between logical and physical eraseblocks; the same mapping is kept on the physical medium in the VID headers, and the eba_tbl is built at ubiattach time)

->ubi_io_write_data

->ubi_io_write

->ubi->mtd->write(ubi->mtd, addr, len, &written, buf);

This shows that UBI is built on top of the MTD layer: a UBI read or write ends up calling the MTD layer's read or write.
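A sketch of what this layering amounts to for one write (assumed toy geometry; leb_to_addr and the constants are invented, but the arithmetic mirrors the idea that the PEB number from eba_tbl, plus the header space at the front of each PEB, plus the LEB offset yields the address handed to MTD):

#include <stdint.h>
#include <stdio.h>

/* Toy geometry: 64 KiB PEBs, two 2 KiB header pages (EC + VID). */
#define PEB_SIZE  (64 * 1024)
#define LEB_START (2 * 2048)

/* Toy per-volume LEB -> PEB table. */
static int eba_tbl[] = { 7, 3, 12 };

/* Translate (lnum, offset) into the absolute flash address an MTD
 * write would receive. */
static long long leb_to_addr(int lnum, int offset)
{
    int pnum = eba_tbl[lnum];          /* EBA lookup            */
    return (long long)pnum * PEB_SIZE  /* start of the PEB      */
           + LEB_START                 /* skip EC + VID headers */
           + offset;                   /* offset inside the LEB */
}

int main(void)
{
    printf("LEB 1, offset 512 -> flash address %lld\n",
           leb_to_addr(1, 512));
    return 0;
}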

We skipped over generic_file_aio_write above because it is fairly involved. Before tackling it, look at the address_space_operations structure UBIFS registers for kernel page-cache operations.

const struct address_space_operations ubifs_file_address_operations = {
       .readpage       = ubifs_readpage,
       .writepage      = ubifs_writepage,
       .write_begin    = ubifs_write_begin,
       .write_end      = ubifs_write_end,
       .invalidatepage = ubifs_invalidatepage,
       .set_page_dirty = ubifs_set_page_dirty,
       .releasepage    = ubifs_releasepage,
};

->generic_file_aio_write

->__generic_file_aio_write

->generic_file_buffered_write

->generic_perform_write

->ubifs_write_begin

->ubifs_write_end

Here is some background material on write_begin and write_end:

http://lwn.net/Articles/254856/

When generic_file_aio_write returns, the whole write path is done: the data has been written into the buffer pages, waiting for the pdflush kernel thread to find the dirty pages on the radix tree and eventually call ubifs_writepage.

The author left a comment on ubifs_writepage, roughly as follows. In the VFS, the data belonging to an inode is written first and the inode node itself last. For a journaling file system like UBIFS this can go wrong. Imagine an inode of length 0 that we now write data to: UBIFS commits the journal and the write completes. If an unclean reboot happens before the inode node itself is written, the next mount finds an inode that still says 0 bytes, while its data has already been written and occupies flash space that can never be freed. To avoid this, UBIFS writes the inode node first and then journals the data; then even if an unclean reboot occurs, the data can still be recovered because the journal was committed.

static int ubifs_writepage(struct page *page, struct writeback_control *wbc)
{
       struct inode *inode = page->mapping->host;
       struct ubifs_inode *ui = ubifs_inode(inode);
       loff_t i_size = i_size_read(inode), synced_i_size;
       pgoff_t end_index = i_size >> PAGE_CACHE_SHIFT;
       int err, len = i_size & (PAGE_CACHE_SIZE - 1);
       void *kaddr;
       dbg_gen("ino %lu, pg %lu, pg flags %#lx",
              inode->i_ino, page->index, page->flags);
       ubifs_assert(PagePrivate(page));
       /* Is the page fully outside @i_size? (truncate in progress) */
       if (page->index > end_index || (page->index == end_index && !len)) {
              err = 0;
              goto out_unlock;
       }
       spin_lock(&ui->ui_lock);
       synced_i_size = ui->synced_i_size;
       spin_unlock(&ui->ui_lock);
       /* Is the page fully inside @i_size? */
       if (page->index < end_index) {
              if (page->index >= synced_i_size >> PAGE_CACHE_SHIFT) {
                     err = inode->i_sb->s_op->write_inode(inode, 1);
                     if (err)
                            goto out_unlock;
                     /*
                      * The inode has been written, but the write-buffer has
                      * not been synchronized, so in case of an unclean
                      * reboot we may end up with some pages beyond inode
                      * size, but they would be in the journal (because
                      * commit flushes write buffers) and recovery would deal
                      * with this.
                      */
              }
              return do_writepage(page, PAGE_CACHE_SIZE);
       }
       /*
        * The page straddles @i_size. It must be zeroed out on each and every
        * writepage invocation because it may be mmapped. "A file is mapped
        * in multiples of the page size. For a file that is not a multiple of
        * the page size, the remaining memory is zeroed when mapped, and
        * writes to that region are not written out to the file."
        */
       kaddr = kmap_atomic(page, KM_USER0);
       memset(kaddr + len, 0, PAGE_CACHE_SIZE - len);
       flush_dcache_page(page); //flush the data in the D-cache back to memory
       kunmap_atomic(kaddr, KM_USER0);

       if (i_size > synced_i_size) {
              err = inode->i_sb->s_op->write_inode(inode, 1);
              if (err)
                     goto out_unlock;
       }
       return do_writepage(page, len);
out_unlock:
       unlock_page(page);
       return err;
}

First, inode->i_sb->s_op->write_inode(inode, 1) writes the inode node to the flash medium; then do_writepage writes the page's data to the flash medium, calling ubifs_jnl_write_data to perform the journalled write. That function first copies the data into a wbuf and leaves the actual flushing to the background thread. But how does the journal guarantee recovery after an unclean reboot?

Looking at ubifs_add_bud_to_log (a bud is an eraseblock used by the journal): every time a usable block is found and attached to a wbuf, a REF node is written into the log (UBIFS keeps a fixed number of flash blocks for the log). One ref node stands for one LEB of the journal, called a bud. So the log records exactly the LEBs touched by write operations after the previous commit and before the next one. This also explains why struct ubifs_jhead has a struct ubifs_wbuf member:

struct ubifs_jhead {
struct ubifs_wbuf wbuf;
struct list_head buds_list;
}; the wbuf is more or less part of the journal: what it buffers is data on its way into the journal.

UBIFS is divided into six areas: superblock, master node, the log area, the LPT area, the orphan area and the main area. The master node records the root of the index tree. An index node records the on-flash position of its child nodes; the UBIFS wandering tree can be viewed as having two parts, a top part of index nodes that create the structure of the tree and a bottom part of leaf nodes that hold the actual file data. Because the wandering tree is kept on flash, updating data means updating the wandering tree as well, and such frequent updates would clearly hurt performance; hence the journal. The log, part of the journal, exists to avoid frequent updates of the index tree on the flash medium: UBIFS writes the pending update information into the log and applies it all at commit time, lowering the update frequency of the wandering tree.

When UBIFS modifies a file, it writes the modified data into a new block, points the LNUM at that new block, and erases the PEB the LNUM pointed to before. So if an unclean reboot happens during a modification, at the next start two PEBs will be found pointing at the same LNUM, which signals that an error occurred; and since the data in the old PEB has not been erased yet, it is easy to recover.
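A minimal sketch of that out-of-place update, assuming a toy PEB descriptor (struct toy_peb and update_leb are invented; sqnum stands in for the VID header's sequence number that lets attach pick the newer of two PEBs claiming the same LEB):

#include <stdint.h>

/* Toy model of one PEB's VID information. */
struct toy_peb {
    int      lnum;  /* which LEB this PEB claims, -1 if none */
    uint64_t sqnum; /* global sequence number when written   */
};

/* Out-of-place update: write the new version of LEB @lnum into a
 * fresh PEB with a newer sqnum, then unmap the old PEB. If we crash
 * between the two steps, attach sees two PEBs with the same lnum and
 * keeps the one with the higher sqnum. */
static void update_leb(struct toy_peb *old, struct toy_peb *fresh,
                       int lnum, uint64_t *global_sqnum)
{
    fresh->lnum  = lnum;
    fresh->sqnum = ++(*global_sqnum); /* step 1: write the new copy */
    old->lnum    = -1;                /* step 2: unmap the old PEB  */
    /* (the real code now queues the old PEB for background erase) */
}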

 

2. The read flow through the VFS

The file operations structure above shows that the UBIFS entry point for VFS reads is generic_file_aio_read.

The call chain:

generic_file_aio_read:

->do_generic_file_read

->readpage

Here the readpage function pointer points to ubifs_readpage.

The listing of ubifs_readpage:

static int ubifs_readpage(struct file *file, struct page *page)
{
       if (ubifs_bulk_read(page))
              return 0;
       do_readpage(page);
       unlock_page(page);
       return 0;
}

The code above is very short.

static int ubifs_bulk_read(struct page *page)
{
       struct inode *inode = page->mapping->host;
       struct ubifs_info *c = inode->i_sb->s_fs_info;
       struct ubifs_inode *ui = ubifs_inode(inode);
       pgoff_t index = page->index, last_page_read = ui->last_page_read;
       struct bu_info *bu;
       int err = 0, allocated = 0;
       ui->last_page_read = index;
       if (!c->bulk_read)
              return 0;
       /*
        * Bulk-read is protected by @ui->ui_mutex, but it is an optimization,
        * so don't bother if we cannot lock the mutex.
        */
       if (!mutex_trylock(&ui->ui_mutex))
              return 0;
       if (index != last_page_read + 1) {
              /* Turn off bulk-read if we stop reading sequentially */
              ui->read_in_a_row = 1;
              if (ui->bulk_read)
                     ui->bulk_read = 0;
              goto out_unlock;
       }
       if (!ui->bulk_read) {
              ui->read_in_a_row += 1;
              if (ui->read_in_a_row < 3)
                     goto out_unlock;
              /* Three reads in a row, so switch on bulk-read */
              ui->bulk_read = 1;
       }
       /*
        * If possible, try to use pre-allocated bulk-read information, which
        * is protected by @c->bu_mutex.
        */
       if (mutex_trylock(&c->bu_mutex))
              bu = &c->bu;
       else {
              bu = kmalloc(sizeof(struct bu_info), GFP_NOFS | __GFP_NOWARN);
              if (!bu)
                     goto out_unlock;
              bu->buf = NULL;
              allocated = 1;
       }
       bu->buf_len = c->max_bu_buf_len;
       data_key_init(c, &bu->key, inode->i_ino,
                    page->index << UBIFS_BLOCKS_PER_PAGE_SHIFT);
       err = ubifs_do_bulk_read(c, bu, page);
       if (!allocated)
              mutex_unlock(&c->bu_mutex);
       else
              kfree(bu);
out_unlock:
       mutex_unlock(&ui->ui_mutex);
       return err;
}

In UBIFS all data is organized in LEBs, and the basic UBIFS-level I/O is block I/O, so many files inevitably end with a tail that occupies only a small part of a block. When content is appended to such a file, it is attached as another DATA_NODE, which means a block can contain several physically consecutive UBIFS nodes of DATA_NODE type that all belong to the same inode.

ubifs_bulk_read checks whether bulk_read is supported and, if so, performs the bulk-read. UBIFS keeps a TNC (tree node cache) in memory, the in-memory copy of the on-flash index tree; ubifs_bulk_read looks up the znode in the TNC by key, and the bulk-read related information is kept in the znode's zbranch array.

Why bulk operations rather than appending data directly after the file's tail? Because UBIFS compresses data: new data cannot simply be appended in place, it has to be compressed and attached as another DATA_NODE.

Both paths above eventually call down to ubi_leb_read. Note the leb in the name: a LEB, logical erase block, is the virtual block of the UBI layer, mapped one-to-one to a physical block. How does UBI implement that mapping between virtual and physical blocks?
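A hedged sketch of the mapping on the read side (invented names; in real UBI the table is vol->eba_tbl, and an unmapped LEB reads back as erased flash, i.e. 0xFF bytes):

#include <string.h>

#define LEB_UNMAPPED (-1)

/* Stand-in for reading from a physical eraseblock; in real UBI this
 * becomes an MTD read at the PEB's flash address. */
static void peb_read(int pnum, char *buf, int offset, int len)
{
    memset(buf, 0, len); /* placeholder payload */
    (void)pnum; (void)offset;
}

/* Toy per-volume mapping table, built at attach time from VID headers. */
static int eba_tbl[] = { 4, LEB_UNMAPPED, 9 };

/* Read from a logical eraseblock: look the PEB up in the table; an
 * unmapped LEB has no PEB behind it and reads as erased flash (0xFF). */
static void leb_read(int lnum, char *buf, int offset, int len)
{
    int pnum = eba_tbl[lnum];
    if (pnum == LEB_UNMAPPED)
        memset(buf, 0xFF, len);
    else
        peb_read(pnum, buf, offset, len);
}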

 

3. The EBA subsystem

3.1 The erase worker

UBI uses two headers for its own management: the EC (erase counter) header and the VID (volume identifier) header. The names give away their purpose: the EC header counts erase cycles, used for wear leveling, while the VID header implements the mapping between LEBs and PEBs.

struct ubi_vid_hdr {
ubi32_t magic;
uint8_t version;
uint8_t vol_type;
uint8_t copy_flag;
uint8_t compat;
ubi32_t vol_id; // which volume this PEB belongs to
ubi32_t lnum;   // which LEB this PEB backs
ubi32_t leb_ver;
ubi32_t data_size;
ubi32_t used_ebs;
ubi32_t data_pad;
ubi32_t data_crc;
uint8_t padding1[12];
uint8_t ivol_data[UBI_VID_HDR_IVOL_DATA_SIZE];
ubi32_t hdr_crc;
} __attribute__((packed));

The VID header has a member lnum, the number of the LEB corresponding to this PEB. When UBI attaches an MTD partition it scans every block, collects this information, and builds the volume information as well as each volume's eba_tbl.

The UBIFS design documents describe UBIFS as an out-of-place-updates file system: to modify a file, it reads the data out, overwrites it in a buffer, and writes it into a new block. Why? NAND flash must be erased before it can be rewritten (the reasons are beyond this article), so if only a small part of a block changes, the read-erase-write cycle costs far more than simply writing a new block.

So when modifying a file, UBIFS writes the modified data straight into a new block, sets the new block's VID header lnum to the original LEB, and unmaps the original LEB, handing its block to the UBI background thread for erasure. The erase reads out the EC header, updates the erase count, and writes it back into the freshly erased physical block, which then becomes a free block. Hence a free block in UBI has an EC header but no VID header.

Having mentioned unmapping an eraseblock, let us see how the EBA subsystem actually unmaps one.

int ubi_eba_unmap_leb(struct ubi_device *ubi, struct ubi_volume *vol, int lnum)
{
       int err, pnum, vol_id = vol->vol_id;

       if (ubi->ro_mode)
              return -EROFS;

       err = leb_write_lock(ubi, vol_id, lnum);
       if (err)
              return err;

       pnum = vol->eba_tbl[lnum];
       if (pnum < 0)
              /* This logical eraseblock is already unmapped */
              goto out_unlock;

       dbg_eba("erase LEB %d:%d, PEB %d", vol_id, lnum, pnum);

       vol->eba_tbl[lnum] = UBI_LEB_UNMAPPED;
       err = ubi_wl_put_peb(ubi, pnum, 0);

out_unlock:
       leb_write_unlock(ubi, vol_id, lnum);
       return err;
}

The function first looks the eraseblock up in the volume's eba_tbl to see whether it is mapped at all; if not, it just returns.

ubi_wl_put_peb eventually calls schedule_erase(ubi, e, torture) to perform the erase.

static int schedule_erase(struct ubi_device *ubi, struct ubi_wl_entry *e,
                       int torture)
{
       struct ubi_work *wl_wrk;
       dbg_wl("schedule erasure of PEB %d, EC %d, torture %d",
              e->pnum, e->ec, torture);
       wl_wrk = kmalloc(sizeof(struct ubi_work), GFP_NOFS);
       if (!wl_wrk)
              return -ENOMEM;
       wl_wrk->func = &erase_worker;
       wl_wrk->e = e;
       wl_wrk->torture = torture;
       schedule_ubi_work(ubi, wl_wrk);
       return 0;
}

This function is also simple: it allocates a ubi_work structure and initializes it.

static void schedule_ubi_work(struct ubi_device *ubi, struct ubi_work *wrk)
{
       spin_lock(&ubi->wl_lock);
       list_add_tail(&wrk->list, &ubi->works);
       ubi_assert(ubi->works_count >= 0);
       ubi->works_count += 1;
       if (ubi->thread_enabled)
              wake_up_process(ubi->bgt_thread);
       spin_unlock(&ubi->wl_lock);
}

The ubi_work structure is added to the &ubi->works list and the UBI background thread is woken up.

That background thread is ubi_thread, which mainly calls do_work to carry out the specific operations. Before going on we need a closer look at struct ubi_work.

struct ubi_work {
       struct list_head list; // links the work into the queue
       int (*func)(struct ubi_device *ubi, struct ubi_work *wrk, int cancel);
       // function pointer that performs the actual work
       /* The below fields are only relevant to erasure works */
       struct ubi_wl_entry *e; // a red-black tree entry, used to find the eraseblock
       int torture;
};

For a background erase, the concrete function behind func is

static int erase_worker(struct ubi_device *ubi, struct ubi_work *wl_wrk, int cancel)

->sync_erase (reads the PEB's EC header to obtain the erase counter)

->do_sync_erase (performs the MTD-layer erase and checks that the block reads back as all 0xff afterwards)

->ubi_io_write_ec_hdr (writes the EC header back)

With that, the unmapping of a block is complete.
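The mechanism is an ordinary single-consumer work queue. A userspace sketch of the pattern (invented names; no locking or thread wakeup):

#include <stdlib.h>
#include <stdio.h>

struct toy_work {
    struct toy_work *next;
    void (*func)(struct toy_work *w); /* what to do */
    int pnum;                         /* which PEB  */
};

static struct toy_work *queue;

/* Enqueue: in UBI this is schedule_ubi_work, which also wakes the
 * background thread. */
static void schedule_toy_work(struct toy_work *w)
{
    w->next = queue;
    queue = w;
}

/* Background loop body: in UBI this is do_work called from ubi_thread. */
static void run_pending(void)
{
    while (queue) {
        struct toy_work *w = queue;
        queue = w->next;
        w->func(w);
        free(w);
    }
}

static void toy_erase(struct toy_work *w)
{
    printf("erasing PEB %d, then rewriting its EC header\n", w->pnum);
}

int main(void)
{
    struct toy_work *w = malloc(sizeof(*w));
    w->func = toy_erase;
    w->pnum = 42;
    schedule_toy_work(w);
    run_pending();
    return 0;
}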

3.2 Bit flip

int ubi_eba_read_leb(struct ubi_device *ubi, struct ubi_volume *vol, int lnum, void *buf, int offset, int len, int check)

is the EBA-layer read function, called from the UBIFS layer. When a bit flip occurs, UBI decides the block is no longer fit for storing data and performs a scrub operation.

The scrub itself is done by ubi_wl_scrub_peb. As with the erase above, a ubi_work is created, only this time the worker callback is wear_leveling_worker,

which copies the data from the PEB currently mapped to the LEB into another physical block and then erases the original one.

 

4. The init process

Nand_scan is used, while attaching an MTD partition, to scan every block of the partition, reading the EC and VID headers to determine the number of volumes and to classify the blocks (free, corrupted and so on). During this scan a large red-black tree is built in which every node represents an eraseblock.

This tree is used later, when ubi_eba_init_scan builds each volume's table.

       err = register_filesystem(&ubifs_fs_type);
       if (err) {
              ubifs_err("cannot register file system, error %d", err);
              return err;
       }

This is the registration of the ubifs file-system type in ubifs_init (super.c).

static struct file_system_type ubifs_fs_type = {

       .name    = "ubifs",

       .owner   = THIS_MODULE,

       .get_sb  = ubifs_get_sb,

       .kill_sb = kill_anon_super,

};

The most important part of a file-system type is how its superblock is read. Every file-system type has its own superblock format, but to hook into the VFS this information has to be attached to the superblock the VFS defines, so every file-system type provides a superblock read function like this one.


(III): The six areas of UBIFS

UBIFS is divided into six areas:

superblock area

master node area

journal (or log) area

LPT (LEB properties tree) area

orphan area

the main area

I will not introduce the first one, since every file system has a superblock.

MASTER AREA: for the sake of garbage collection, UBIFS manages files with a node structure. What is a node? Let me take a UBIFS inode node as an example.

struct ubifs_ino_node {
       struct ubifs_ch ch;
       __u8 key[UBIFS_MAX_KEY_LEN];
       __le64 creat_sqnum;
       __le64 size;
       __le64 atime_sec;
       __le64 ctime_sec;
       __le64 mtime_sec;
       __le32 atime_nsec;
       __le32 ctime_nsec;
       __le32 mtime_nsec;
       __le32 nlink;
       __le32 uid;
       __le32 gid;
       __le32 mode;
       __le32 flags;
       __le32 data_len;
       __le32 xattr_cnt;
       __le32 xattr_size;
       __u8 padding1[4]; /* Watch 'zero_ino_node_unused()' if changing! */
       __le32 xattr_names;
       __le16 compr_type;
       __u8 padding2[26]; /* Watch 'zero_ino_node_unused()' if changing! */
       __u8 data[];
} __attribute__ ((packed));

A node is the combination of file information and file data. Everything in the structure above except __u8 data[] counts as file information, and __u8 data[] is the file data. To make garbage collection possible, the file system has to organize all files into such a tree for management.

To cut the scan time at mount and the memory consumption at runtime, UBIFS keeps this tree structure on flash rather than in memory. But that raises a question: how do we know where the root of the tree is?

That is what the master area is for (not only that, of course): the root of this tree is stored in the master area.

Journal area: we said above that the UBIFS tree is kept on flash, and that brings a problem. Every file update (write, modify or delete) changes the file's information and data, so nodes of the tree change too; and NAND flash must be erased before it can be rewritten, so such frequent in-place updates would be hopelessly inefficient. To reduce the frequent updating of on-flash nodes, UBIFS created the journal area: changes to nodes are accumulated there and written to NAND flash in one go, lowering the update frequency. UBIFS builds a TNC tree (tree node cache) in memory as a cache of the on-flash tree, so that node information does not have to be read from flash every time; an index node on flash appears in the TNC as a znode.

LPT AREA: the journal area lowers the update frequency, but where should freshly written data actually go? The file system needs to know how much of every block is in use, and that is the point of the LPT (LEB properties tree). LEB properties contain three important parameters: free space, dirty space, and whether the eraseblock is an index eraseblock or not. Free space is the number of unused bytes in an eraseblock. Dirty space is the number of bytes taken up by obsolete (truncated) nodes and by padding (UBIFS has a minimum I/O size; writes smaller than it must be padded), which can potentially be reclaimed by garbage collection. The master area holds the root of the node tree, and its branches are stored as index nodes in ordinary eraseblocks, so it must also be recorded whether a main-area eraseblock stores index nodes or not. The size of the LPT area is computed from the partition size. The LPT even has an LPT of its own: internally it builds the ltab (LEB properties table; the LPT area occupies only a few eraseblocks, so a table suffices), holding the LEB properties of the LEBs occupied by the LPT itself. The LPT is likewise updated at commit time.
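A hedged sketch of the kind of question the LPT answers (invented names, and a flat array instead of the real tree): find a LEB with enough free space for an allocation, and account space as dirty when a node becomes obsolete.

struct toy_lprops {
    int free;  /* bytes never written        */
    int dirty; /* obsolete nodes + padding   */
    int index; /* non-zero if an index LEB   */
};

/* Pick a data LEB with at least @need free bytes; returns the LEB
 * number or -1. The real LPT answers this with a tree lookup rather
 * than a linear scan. */
static int find_leb_with_free(const struct toy_lprops *t, int n, int need)
{
    for (int i = 0; i < n; i++)
        if (!t[i].index && t[i].free >= need)
            return i;
    return -1;
}

/* When a node of @len bytes becomes obsolete, its bytes turn dirty
 * and the LEB becomes a garbage-collection candidate. */
static void obsolete_node(struct toy_lprops *p, int len)
{
    p->dirty += len;
}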

ORPHAN AREA: to understand this area you must first be clear about what an inode node does in UBIFS. In the words of the design paper, it is a node that holds the metadata for an inode, and every inode has exactly one (non-obsolete) inode node. The orphan area is an area for storing the inode numbers of deleted but still open inodes, needed for recovery from unclean unmounts.

MAIN AREA: little needs saying here; it stores the file data and the index nodes.


(IV): Important data structures

       In leeming's words, at the very core of a large project are the data-structure definitions. So do not rush into the .c files; read the documents and header files first to understand the designers' thinking. That is the right way in.

1. struct ubi_device

       UBI abstracts a UBI device as a struct ubi_device, which gathers all of the device's information.

struct ubi_device {
       struct cdev cdev;
       struct device dev;
       int ubi_num; // number of the UBI device, passed to ubiattach with the -d option
       char ubi_name[sizeof(UBI_NAME_STR)+5]; // name of the UBI device
       int vol_count; // how many volumes this UBI device holds
       struct ubi_volume *volumes[UBI_MAX_VOLUMES+UBI_INT_VOL_COUNT];
       spinlock_t volumes_lock;
       int ref_count;
       int image_seq;
       int rsvd_pebs; // number of reserved PEBs
       int avail_pebs; // number of available PEBs
       int beb_rsvd_pebs; // PEBs reserved for bad-block handling
       int beb_rsvd_level; // normal level of PEBs reserved for bad-block handling
       int autoresize_vol_id;
       int vtbl_slots;
       int vtbl_size; // size of the volume table in bytes
       struct ubi_vtbl_record *vtbl; // in-memory copy of the volume table
       struct mutex device_mutex;
       int max_ec; // highest erase counter
       /* Note, mean_ec is not updated run-time - should be fixed */
       int mean_ec; // mean erase counter
       /* EBA sub-system's stuff */
       unsigned long long global_sqnum;
       spinlock_t ltree_lock;
       struct rb_root ltree;
       struct mutex alc_mutex;
       /* Wear-leveling sub-system's stuff */
       struct rb_root used; // RB-tree of blocks in use
       struct rb_root erroneous; // RB-tree of erroneous used physical eraseblocks
       struct rb_root free; // root of the RB-tree of unused blocks
       struct rb_root scrub; // RB-tree of blocks that need scrubbing
       struct list_head pq[UBI_PROT_QUEUE_LEN];
       int pq_head;
       spinlock_t wl_lock;
       struct mutex move_mutex;
       struct rw_semaphore work_sem;
       int wl_scheduled;
       struct ubi_wl_entry **lookuptbl; // a table to quickly find the &struct ubi_wl_entry object for any physical eraseblock; an array indexed by pnum, covering every block of the UBI device
       struct ubi_wl_entry *move_from; // physical eraseblock from where the data is being moved
       struct ubi_wl_entry *move_to; // physical eraseblock where the data is being moved to
       int move_to_put; // flag marking whether the destination PEB has been put
       struct list_head works; // list of pending works
       int works_count; // count of pending works
       struct task_struct *bgt_thread; // the UBI background thread
       int thread_enabled;
       char bgt_name[sizeof(UBI_BGT_NAME_PATTERN)+2]; // name of the background thread
       struct notifier_block reboot_notifier; // kernel notifier chain
       /* I/O sub-system's stuff */
       long long flash_size; // size of the MTD partition
       int peb_count; // number of PEBs
       int peb_size; // size of a PEB (one physical block)
       int bad_peb_count; // number of bad blocks
       int good_peb_count; // number of usable PEBs
       int erroneous_peb_count;
       int max_erroneous;
       int min_io_size; // size of the smallest write unit, i.e. one page
       int hdrs_min_io_size;
       int ro_mode;
       int leb_size; // size of a logical eraseblock
       int leb_start; // offset within the physical block where the LEB data starts; the space before it is reserved for other purposes
       int ec_hdr_alsize; // size of the EC header aligned to @hdrs_min_io_size
       int vid_hdr_alsize; // size of the VID header aligned to @hdrs_min_io_size
       int vid_hdr_offset; // offset of the VID header within a block, usually one page
       int vid_hdr_aloffset; // starting offset of the VID header aligned to @hdrs_min_io_size
       int vid_hdr_shift; // contains @vid_hdr_offset - @vid_hdr_aloffset
       unsigned int bad_allowed:1;
       unsigned int nor_flash:1; // non-zero if working on top of NOR flash
       struct mtd_info *mtd; // the underlying MTD partition information; the UBI layer is built on top of MTD
       void *peb_buf1; // a buffer, one block in size
       void *peb_buf2; // a buffer, one block in size
       struct mutex buf_mutex;
       struct mutex ckvol_mutex;
#ifdef CONFIG_MTD_UBI_DEBUG_PARANOID
       void *dbg_peb_buf;
       struct mutex dbg_buf_mutex;
#endif
};

2. struct ubi_vtbl_record

The next important structure is struct ubi_vtbl_record. Before looking at it, consider the console output printed while attaching a device (shown as a screenshot in the original article); the part to note there is the mention of an internal volume.

What is an internal volume? It is distinguished from the user volumes below it.

An internal volume is used by the kernel to keep its own information. What does it keep? The volume table, stored in the form of struct ubi_vtbl_record records.

struct ubi_vtbl_record {
       __be32 reserved_pebs; // how many physical eraseblocks are reserved for this volume
       __be32 alignment; // volume alignment
       __be32 data_pad; // how many bytes are unused at the end of each physical eraseblock to satisfy the requested alignment
       __u8 vol_type; // volume type, dynamic or static; a dynamic volume can change its size
       __u8 upd_marker;
       __be16 name_len; // volume name length
       __u8 name[UBI_VOL_NAME_MAX+1]; // volume name
       __u8 flags;
       __u8 padding[23];
       __be32 crc;
} __attribute__ ((packed));

3. struct ubi_volume

struct ubi_volume is the abstraction of a single volume on a UBI device.

struct ubi_volume {
       struct device dev;
       struct cdev cdev;
       struct ubi_device *ubi; // the UBI device this volume lives on
       int vol_id; // volume number
       int ref_count; // reference count (I am not sure what it is for)
       int readers; // number of users holding this volume in read-only mode
       int writers; // number of users holding this volume in read-write mode
       int exclusive; // whether somebody holds this volume in exclusive mode
       int reserved_pebs; // number of PEBs reserved for this volume
       int vol_type; // volume type
       int usable_leb_size; // logical eraseblock size without padding
       int used_ebs; // number of eraseblocks in use
       int last_eb_bytes; // how many bytes are stored in the last logical eraseblock
       long long used_bytes; // amount of space in use
       int alignment;
       int data_pad;
       int name_len; // length of the volume name
       char name[UBI_VOL_NAME_MAX + 1];
       int upd_ebs;
       int ch_lnum; // LEB number being changed by the atomic LEB change operation (this shows up later when modifying LEB data)
       int ch_dtype;
       long long upd_bytes;
       long long upd_received;
       void *upd_buf;
       int *eba_tbl; // EBA table of this volume; crucially important, the LEB-to-PEB mapping is obtained by looking it up
       unsigned int checked:1;
       unsigned int corrupted:1;
       unsigned int upd_marker:1;
       unsigned int updating:1;
       unsigned int changing_leb:1;
       unsigned int direct_writes:1;
};

4. struct ubi_scan_info

This structure is used during attach. While attaching, UBIFS needs to learn the state of every PEB on the device, in preparation for mounting the file system again.

struct ubi_scan_info {
       struct rb_root volumes; // root node of the red-black tree of volumes
       // The next four lists classify the blocks during the scan; every scanned block is linked onto one of them.
       struct list_head corr;
       struct list_head free;
       struct list_head erase;
       struct list_head alien;
       int bad_peb_count; // number of bad blocks
       int vols_found; // number of volumes found
       int highest_vol_id; // highest volume number
       int alien_peb_count;
       int is_empty; // flag telling whether the UBI device is empty; set during the scan described above
       int min_ec; // lowest erase counter
       int max_ec; // highest erase counter
       unsigned long long max_sqnum; // highest 64-bit sqnum
       int mean_ec; // mean erase counter
       uint64_t ec_sum;
       int ec_count;
       int corr_count;
};

5. struct ubi_scan_leb

In struct ubi_scan_info above we mentioned the scan performed during attach and its four lists: the information of every scanned block is abstracted into a structure and hung on one of those lists. Here, briefly, is that per-block abstraction.

struct ubi_scan_leb {
       int ec; // erase counter, used for wear leveling; described in detail later
       // A volume's eba_table is built from the two members below.
       int pnum; // physical block number
       int lnum; // logical block number
       int scrub;
       unsigned long long sqnum;
       union {
              struct rb_node rb;
              struct list_head list;
       } u;
};

6. struct ubi_ec_hdr

UBIFS (through UBI) is a wear-leveling file system, meaning it evens out erase wear. Let us begin the introduction of the important structures with struct ubi_ec_hdr.

struct ubi_ec_hdr {

       __be32  magic;

       __u8    version;

       __u8    padding1[3];

       __be64  ec; /* Warning: the current limit is 31-bit anyway! */

       __be32  vid_hdr_offset;

       __be32  data_offset;

       __be32  image_seq;

       __u8    padding2[32];

       __be32  hdr_crc;

} __attribute__ ((packed));

Note the member __be64 ec. What is ec? ec is the erase counter. NAND flash blocks can only be erased a limited number of times; erase one too often and it becomes a bad block. Wear leveling means that, under the file system's management, no single block takes a disproportionate number of erase operations.

Consider the function ensure_wear_leveling. It decides whether the UBI device needs wear-leveling treatment, which raises two questions: 1. what does it base the decision on? 2. what does it do to keep any one eraseblock from being erased too often?

To answer the first question: in the WL subsystem all eraseblocks are managed by WL, in red-black trees. Each node of such a tree is:

struct ubi_wl_entry {
       union {
              struct rb_node rb;
              struct list_head list;
       } u;
       int ec;
       int pnum;
};

Put bluntly, WL cares about only one thing: the ec value. Here is a piece of the core code of wear_leveling_worker:

              e1 = rb_entry(rb_first(&ubi->used), struct ubi_wl_entry, u.rb);
              e2 = find_wl_entry(&ubi->free, WL_FREE_MAX_DIFF);
if (!(e2->ec - e1->ec >= UBI_WL_THRESHOLD))

It takes from the used tree the LEB with the smallest EC value (each erase increments EC by one), and from the free tree the one with the largest EC value.

If the EC difference between the two exceeds UBI_WL_THRESHOLD, a WL operation is needed.

And what is that operation?

err = ubi_eba_copy_leb(ubi, e1->pnum, e2->pnum, vid_hdr);

Moving the contents from one eraseblock to the other.
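A minimal sketch of the decision rule (the threshold value is invented; in the kernel UBI_WL_THRESHOLD plays this role):

#define WL_THRESHOLD 4096 /* illustrative; the kernel has UBI_WL_THRESHOLD */

struct toy_wl_entry { int ec; int pnum; };

/* Decide whether to wear-level: move the coldest used block's data
 * onto the most-worn free block once their erase counts diverge by
 * the threshold or more. */
static int needs_wl(const struct toy_wl_entry *coldest_used,
                    const struct toy_wl_entry *most_worn_free)
{
    return most_worn_free->ec - coldest_used->ec >= WL_THRESHOLD;
}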

7. struct ubi_vid_hdr

The EC header above has a member vid_hdr_offset, the offset of the VID header on flash. Next, then, the second most important data structure, struct ubi_vid_hdr.

struct ubi_vid_hdr {

       __be32  magic;

       __u8    version;

       __u8    vol_type;

       __u8    copy_flag;

       __u8    compat;

       __be32  vol_id;

       __be32  lnum;

       __u8    padding1[4];

       __be32  data_size;

       __be32  used_ebs;

       __be32  data_pad;

       __be32  data_crc;

       __u8    padding2[4];

       __be64  sqnum;

       __u8    padding3[12];

       __be32  hdr_crc;

} __attribute__ ((packed));

Its most important members are __be32 vol_id and __be32 lnum: vol_id identifies which volume this LEB belongs to, and lnum is the LEB corresponding to this PEB. The upper layers operate on logical blocks, that is on lnum, but in the end the data has to be written into a pnum. In ubi_eba_write_leb there is this line:

pnum = vol->eba_tbl[lnum];

Every volume has an eba_tbl, built during the scan. If the lnum is not yet mapped, ubi_wl_get_peb is called to obtain a pnum and the volume's eba_tbl is modified accordingly.
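A hedged sketch of this map-on-demand write path (all helpers are invented stand-ins for ubi_wl_get_peb, the VID-header write and the data write):

#include <stdio.h>

#define LEB_UNMAPPED (-1)

static int next_free = 10;
static int wl_get_free_peb(void) { return next_free++; } /* like ubi_wl_get_peb */
static void write_vid_hdr(int pnum, int lnum)
{ printf("VID: PEB %d <- LEB %d\n", pnum, lnum); }
static void peb_write(int pnum, const void *b, int off, int len)
{ printf("write %d bytes to PEB %d at %d\n", len, pnum, off); (void)b; }

/* Write to a LEB, mapping it to a physical block first if needed. */
static void leb_write(int *eba_tbl, int lnum,
                      const void *buf, int off, int len)
{
    int pnum = eba_tbl[lnum];
    if (pnum == LEB_UNMAPPED) {
        pnum = wl_get_free_peb();  /* take a PEB from the WL trees   */
        write_vid_hdr(pnum, lnum); /* persist the LEB->PEB relation  */
        eba_tbl[lnum] = pnum;      /* update the in-memory table     */
    }
    peb_write(pnum, buf, off, len);
}

int main(void)
{
    int eba_tbl[4] = { LEB_UNMAPPED, LEB_UNMAPPED, LEB_UNMAPPED, LEB_UNMAPPED };
    leb_write(eba_tbl, 2, "data", 0, 4); /* maps LEB 2, then writes */
    return 0;
}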

8. struct ubi_scan_volume

struct ubi_scan_volume {
       int vol_id; // volume number
       int highest_lnum; // highest logical block number in this volume
       int leb_count; // number of LEBs
       int vol_type; // volume type
       int used_ebs; // number of eraseblocks in use
       int last_data_size;
       int data_pad;
       int compat;
       struct rb_node rb; // purpose unclear; my guess is that it caches the most recently visited node
       struct rb_root root;
};

Note the struct rb_root root member above: it is the root of a red-black tree that links the PEBs found during the scan to belong to this volume.

This structure holds the temporary per-volume information built from the VID headers during the scan.

 

 

ubifs-media.h defines many structures; here is a brief gloss.

As 《A Brief Introduction to the Design of UBIFS》 explains, UBIFS uses a node structure: all of its data is handled in the form of nodes.

struct ubifs_ch: ch means common header, the part shared by all the structures below.

struct ubifs_ino_node: holds the information of an inode node.

struct ubifs_dent_node: holds directory entry (dent) information. For dentries and inodes see the VFS material.

struct ubifs_data_node: the node that carries actual data.

struct ubifs_trun_node: written into the journal area on truncation; exists only in the journal area.

struct ubifs_pad_node: used for data padding.

struct ubifs_sb_node: the superblock node, recording superblock information; exists only in the superblock area.

struct ubifs_mst_node: the master node, recording the root of the node tree and other information; exists only in the master area.

struct ubifs_ref_node: written into the journal area on data updates and used at commit time to update the index tree and the LPT tree; exists only in the journal area.

struct ubifs_idx_node: the header of an idx node (see 《A Brief Introduction to the Design of UBIFS》); exists only in the main area.

struct ubifs_branch: a branch within an idx node.

struct ubifs_cs_node: cs = commit start, marks the beginning of a commit in the journal; exists only in the journal area. One commit consists of one ubifs_cs_node and a number of ubifs_ref_nodes.

struct ubifs_orph_node: records the relevant information in the orphan area (for orphans see 《A Brief Introduction to the Design of UBIFS》).


(V): wear-leveling

       Before this part's main topic, a word about EBA. What is EBA? Eraseblock Association.

It came up briefly with the eba_tbl member of struct ubi_volume: whenever the file system needs to operate on a logical eraseblock (LEB), it looks up in that volume's eba_tbl which physical eraseblock (PEB) the logical eraseblock corresponds to.

       The two most important EBA operations are map and unmap. The UBI kernel source has no dedicated map function, however; mapping is folded into ubi_eba_write_leb. Here is the code of that function:

int ubi_eba_write_leb(struct ubi_device *ubi, struct ubi_volume *vol, int lnum,
                    const void *buf, int offset, int len, int dtype)
{
       int err, pnum, tries = 0, vol_id = vol->vol_id;
       struct ubi_vid_hdr *vid_hdr;
       if (ubi->ro_mode)
              return -EROFS;
       err = leb_write_lock(ubi, vol_id, lnum);
       if (err)
              return err;

Look up the LEB-to-PEB relation in the volume's eba_tbl; a pnum greater than or equal to 0 means the LEB is already mapped:

 

       pnum = vol->eba_tbl[lnum];
       if (pnum >= 0) {
              dbg_eba("write %d bytes at offset %d of LEB %d:%d, PEB %d",
                     len, offset, vol_id, lnum, pnum);

              err = ubi_io_write_data(ubi, buf, pnum, offset, len);
              if (err) {
                     ubi_warn("failed to write data to PEB %d", pnum);
                     if (err == -EIO && ubi->bad_allowed)
                            err = recover_peb(ubi, pnum, vol_id, lnum, buf,
                                            offset, len);
                     if (err)
                            ubi_ro_mode(ubi);
              }
              leb_write_unlock(ubi, vol_id, lnum);
              return err;
       }

 

       /*
        * The logical eraseblock is not mapped. We have to get a free physical
        * eraseblock and write the volume identifier header there first.
        */
       vid_hdr = ubi_zalloc_vid_hdr(ubi, GFP_NOFS);
       if (!vid_hdr) {
              leb_write_unlock(ubi, vol_id, lnum);
              return -ENOMEM;
       }

       vid_hdr->vol_type = UBI_VID_DYNAMIC;
       vid_hdr->sqnum = cpu_to_be64(next_sqnum(ubi));
       vid_hdr->vol_id = cpu_to_be32(vol_id);
       vid_hdr->lnum = cpu_to_be32(lnum);
       vid_hdr->compat = ubi_get_compat(ubi, vol_id);
       vid_hdr->data_pad = cpu_to_be32(vol->data_pad);

retry:

The code above is straightforward and not the focus here.

ubi_wl_get_peb obtains a free PEB from the WL subsystem, and the volume's eba_tbl is then updated; that completes a map operation. Easy!

 

       pnum = ubi_wl_get_peb(ubi, dtype);
       if (pnum < 0) {
              ubi_free_vid_hdr(ubi, vid_hdr);
              leb_write_unlock(ubi, vol_id, lnum);
              return pnum;
       }
       dbg_eba("write VID hdr and %d bytes at offset %d of LEB %d:%d, PEB %d",
              len, offset, vol_id, lnum, pnum);
       err = ubi_io_write_vid_hdr(ubi, pnum, vid_hdr);
       if (err) {
              ubi_warn("failed to write VID header to LEB %d:%d, PEB %d",
                      vol_id, lnum, pnum);
              goto write_error;
       }
       if (len) {
              err = ubi_io_write_data(ubi, buf, pnum, offset, len);
              if (err) {
                     ubi_warn("failed to write %d bytes at offset %d of "
                             "LEB %d:%d, PEB %d", len, offset, vol_id,
                             lnum, pnum);
                     goto write_error;
              }
       }
       vol->eba_tbl[lnum] = pnum;
       leb_write_unlock(ubi, vol_id, lnum);
       ubi_free_vid_hdr(ubi, vid_hdr);
       return 0;

 

write_error:
       if (err != -EIO || !ubi->bad_allowed) {
              ubi_ro_mode(ubi);
              leb_write_unlock(ubi, vol_id, lnum);
              ubi_free_vid_hdr(ubi, vid_hdr);
              return err;
       }
       /*
        * Fortunately, this is the first write operation to this physical
        * eraseblock, so just put it and request a new one. We assume that if
        * this physical eraseblock went bad, the erase code will handle that.
        */
       err = ubi_wl_put_peb(ubi, pnum, 1);
       if (err || ++tries > UBI_IO_RETRIES) {
              ubi_ro_mode(ubi);
              leb_write_unlock(ubi, vol_id, lnum);
              ubi_free_vid_hdr(ubi, vid_hdr);
              return err;
       }
       vid_hdr->sqnum = cpu_to_be64(next_sqnum(ubi));
       ubi_msg("try another PEB");
       goto retry;

}

Next, the unmap path:

int ubi_eba_unmap_leb(struct ubi_device *ubi, struct ubi_volume *vol,
                    int lnum)
{
       int err, pnum, vol_id = vol->vol_id;
       if (ubi->ro_mode)
              return -EROFS;
       err = leb_write_lock(ubi, vol_id, lnum);
       if (err)
              return err;

First the vol->eba_tbl table is consulted; if the corresponding entry is -1, the block we want to unmap was never mapped in the first place, so nothing needs doing:

       pnum = vol->eba_tbl[lnum];
       if (pnum < 0)
              /* This logical eraseblock is already unmapped */
              goto out_unlock;
       dbg_eba("erase LEB %d:%d, PEB %d", vol_id, lnum, pnum);

If it is not negative, the value obtained is certainly a PEB number; set the corresponding eba_tbl entry back to -1 (UBI_LEB_UNMAPPED):

       vol->eba_tbl[lnum] = UBI_LEB_UNMAPPED;

As mentioned above, the map path obtains its PEB from the WL subsystem. Now that the LEB is unmapped, the PEB has to be returned to the WL subsystem and erased, which is done by ubi_wl_put_peb:

       err = ubi_wl_put_peb(ubi, pnum, 0);
out_unlock:
       leb_write_unlock(ubi, vol_id, lnum);
       return err;

}

This example shows that in UBI every PEB is obtained from the WL subsystem and every released PEB is returned to it: WL is everywhere, and every operation involving the use of an eraseblock touches the WL subsystem.

       Now the main WL data structure:

struct ubi_wl_entry {
       union {
              struct rb_node rb;
              struct list_head list;
       } u;
       int ec;
       int pnum;
};

从这个结构体中可以看出WL子系统操作的是实实在在的物理可擦除块,另外一个关注的就是EC头部的erase counter,这也是WL进行操作的依据。

从联合u中可以看出wl子系统中是采用红黑树来管理的。关于红黑的一些操作下面稍微掠过,并不以源码的形式详细阐述。

static void wl_tree_add(struct ubi_wl_entry *e, struct rb_root *root): adds e to the RB tree rooted at root.

static int in_wl_tree(struct ubi_wl_entry *e, struct rb_root *root): tests whether e is in the RB tree rooted at root.

static struct ubi_wl_entry *find_wl_entry(struct rb_root *root, int max): finds, in the RB tree rooted at root, the PEB whose erase counter comes as close to the bound max as possible from below (see the sketch below).
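Since find_wl_entry() is central to everything that follows, here is a reconstruction of its body, based on the description above and on drivers/mtd/ubi/wl.c; treat it as a sketch of the idea rather than a verbatim copy of any particular kernel version. Note that the max parameter is actually treated as a difference added on top of the lowest EC in the tree:

static struct ubi_wl_entry *find_wl_entry(struct rb_root *root, int max)
{
       struct rb_node *p;
       struct ubi_wl_entry *e;

       /* Start from the entry with the lowest EC and raise the bound. */
       e = rb_entry(rb_first(root), struct ubi_wl_entry, u.rb);
       max += e->ec;

       p = root->rb_node;
       while (p) {
              struct ubi_wl_entry *e1;

              e1 = rb_entry(p, struct ubi_wl_entry, u.rb);
              if (e1->ec >= max)
                     p = p->rb_left;    /* too worn, look at smaller ECs */
              else {
                     p = p->rb_right;   /* candidate; try a larger EC */
                     e = e1;
              }
       }
       return e;
}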

So what exactly is WL for? As mentioned above, it manages the physical eraseblocks based on their EC values, so that no individual block is erased so often that it turns into a bad block. Suppose that during operation a block's EC value becomes abnormal, i.e. far too large (the EC value grows with every erase). Can such a heavily worn block still be used? Yes.

include/mtd/ubi-user.h contains the following enum:

enum {
       UBI_LONGTERM  = 1,
       UBI_SHORTTERM = 2,
       UBI_UNKNOWN   = 3,
};

It defines three flags for specifying the data type; as the names suggest, they indicate whether the data is meant to be stored long-term or short-term.

ubi_wl_get_peb() contains the following case:

       case UBI_LONGTERM:
              e = find_wl_entry(&ubi->free, WL_FREE_MAX_DIFF);
              break;

When a PEB is requested for long-lived data, a PEB with a relatively high EC value, one that has already been erased many times, is chosen: long-term data is rarely rewritten, so the worn block gets put to good use.
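For contrast, short-term data should go to the least worn block. Below is a sketch of the complementary branches of the same switch, where first, last and e are locals of ubi_wl_get_peb(); it is reconstructed from the same file, so treat the exact UBI_UNKNOWN heuristic as approximate rather than authoritative:

       case UBI_SHORTTERM:
              /*
               * Short-term data is expected to be rewritten or erased
               * soon, so pick the free PEB with the lowest erase counter.
               */
              e = rb_entry(rb_first(&ubi->free), struct ubi_wl_entry, u.rb);
              break;
       case UBI_UNKNOWN:
              /*
               * For data of unknown lifetime, aim at a medium erase
               * counter between the two extremes.
               */
              first = rb_entry(rb_first(&ubi->free),
                               struct ubi_wl_entry, u.rb);
              last = rb_entry(rb_last(&ubi->free),
                              struct ubi_wl_entry, u.rb);
              if (last->ec - first->ec < WL_FREE_MAX_DIFF)
                     e = rb_entry(ubi->free.rb_node,
                                  struct ubi_wl_entry, u.rb);
              else
                     e = find_wl_entry(&ubi->free,
                                       (first->ec + WL_FREE_MAX_DIFF) / 2);
              break;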

According to the ubidesign document: "UBI selects a long term storage block with a low erase count and copies the block contents to the block with the high erase count using the block moving function." However, I could not find this long-term consideration anywhere in the source below. (Or did I miss it somewhere?)

The function ensure_wear_leveling() is what decides whether the situation described above exists:

static int ensure_wear_leveling(struct ubi_device *ubi)
{
       int err = 0;
       struct ubi_wl_entry *e1;
       struct ubi_wl_entry *e2;
       struct ubi_work *wrk;

       spin_lock(&ubi->wl_lock);
       /* If a wear-leveling work item is already pending, there is no
        * point in checking again: the pending work will do whatever is
        * needed, and scheduling a second one would only interfere with
        * it. So do nothing. */
       if (ubi->wl_scheduled)
              /* Wear-leveling is already in the work queue */
              goto out_unlock;

       /*
        * If the ubi->scrub tree is not empty, scrubbing is needed, and
        * the WL worker has to be scheduled anyway.
        */

@Case 1: there are no used eraseblocks, i.e. the UBI device has just been attached and contains no data yet.

@Case 2: there are no free eraseblocks. WL, as described above, moves the data of one eraseblock into a free one; with no free block available, the work cannot proceed.

       /* Where do the nodes in the scrub tree come from, i.e. when are
        * they added? This is answered at the end of this section. */
       if (!ubi->scrub.rb_node) {
              if (!ubi->used.rb_node || !ubi->free.rb_node)
                     /* No physical eraseblocks - no deal */
                     goto out_unlock;

              /*
               * We schedule wear-leveling only if the difference between
               * the lowest erase counter of used physical eraseblocks and
               * a high erase counter of free physical eraseblocks is
               * greater than %UBI_WL_THRESHOLD.
               */

As noted above, WL moves the data of a used eraseblock into an unused one, so it takes the block with the smallest EC value from the used tree (although according to the design document it should really look for a UBI_LONGTERM block with a small EC), and then takes a block with a large EC value from the free tree.

              e1 = rb_entry(rb_first(&ubi->used), struct ubi_wl_entry, u.rb);
              e2 = find_wl_entry(&ubi->free, WL_FREE_MAX_DIFF);

              if (!(e2->ec - e1->ec >= UBI_WL_THRESHOLD))
                     goto out_unlock;
              dbg_wl("schedule wear-leveling");
       } else
              dbg_wl("schedule scrubbing");

       ubi->wl_scheduled = 1; /* note: the wl_scheduled flag is set here */
       spin_unlock(&ubi->wl_lock);

       wrk = kmalloc(sizeof(struct ubi_work), GFP_NOFS);
       if (!wrk) {
              err = -ENOMEM;
              goto out_cancel;
       }

       /* Build a work item and queue it for the background thread; the
        * actual work is done by wear_leveling_worker(). */
       wrk->func = &wear_leveling_worker;
       schedule_ubi_work(ubi, wrk);
       return err;

out_cancel:
       spin_lock(&ubi->wl_lock);
       ubi->wl_scheduled = 0;
out_unlock:
       spin_unlock(&ubi->wl_lock);
       return err;
}
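A quick numeric example of the threshold check above: UBI_WL_THRESHOLD comes from Kconfig (CONFIG_MTD_UBI_WL_THRESHOLD, typically 4096). If the least worn used block has EC 100 and find_wl_entry() returns a free block with EC 5000, then 5000 - 100 = 4900 >= 4096, and wear-leveling is scheduled. Had the free block's EC been 1000, the difference of 900 would not justify a move.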

Now let's look at what wear_leveling_worker() actually does:

static int wear_leveling_worker(struct ubi_device *ubi, struct ubi_work *wrk,
                            int cancel)
{
       int err, scrubbing = 0, torture = 0, protect = 0, erroneous = 0;
       int vol_id = -1, uninitialized_var(lnum);
       struct ubi_wl_entry *e1, *e2;
       struct ubi_vid_hdr *vid_hdr;

       kfree(wrk);
       if (cancel)
              return 0;

       /* Allocate a VID header: while the data is being copied, the VID
        * header has to be rewritten to the target PEB. */
       vid_hdr = ubi_zalloc_vid_hdr(ubi, GFP_NOFS);
       if (!vid_hdr)
              return -ENOMEM;

       mutex_lock(&ubi->move_mutex);
       spin_lock(&ubi->wl_lock);
       ubi_assert(!ubi->move_from && !ubi->move_to);
       ubi_assert(!ubi->move_to_put);

@The English comment below already explains it clearly: if there is no free PEB, that is fine, we can simply wait for a pending erase worker to finish. But if the scrub tree is empty as well, there is nothing to work on, and this WL pass is cancelled.

@Why might there be no used PEB? In ubi_wl_get_peb() we find

       rb_erase(&e->u.rb, &ubi->free);
       prot_queue_add(ubi, e);

while ubi_wl_put_peb() contains

       prot_queue_del(ubi, e->pnum);

and similar operations presumably appear elsewhere, e.g. in erase_worker. In other words, UBI temporarily removes a PEB that is currently being operated on from the corresponding tree and parks it in the protection queue ubi->pq.
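The protection queue ubi->pq is a small array of list heads indexed by a rotating head pointer. A sketch of prot_queue_add(), reconstructed from drivers/mtd/ubi/wl.c (details such as UBI_PROT_QUEUE_LEN may differ between kernel versions, so treat this as illustrative):

static void prot_queue_add(struct ubi_device *ubi, struct ubi_wl_entry *e)
{
       int pq_tail = ubi->pq_head - 1;

       if (pq_tail < 0)
              pq_tail = UBI_PROT_QUEUE_LEN - 1;
       /* Park the entry on the tail list; once the head pointer has
        * rotated through the whole array, the entry is released again
        * and moved to the used tree. */
       list_add_tail(&e->u.list, &ubi->pq[pq_tail]);
       dbg_wl("added PEB %d EC %d to the protection queue", e->pnum, e->ec);
}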

       if (!ubi->free.rb_node ||
           (!ubi->used.rb_node && !ubi->scrub.rb_node)) {
              /*
               * No free physical eraseblocks? Well, they must be waiting
               * in the queue to be erased. Cancel movement - it will be
               * triggered again when a free physical eraseblock appears.
               *
               * No used physical eraseblocks? They must be temporarily
               * protected from being moved. They will be moved to the
               * @ubi->used tree later and the wear-leveling will be
               * triggered again.
               */
              dbg_wl("cancel WL, a list is empty: free %d, used %d",
                     !ubi->free.rb_node, !ubi->used.rb_node);
              goto out_cancel;
       }

       if (!ubi->scrub.rb_node) {
              /*
               * Now pick the least worn-out used physical eraseblock and
               * a highly worn-out free physical eraseblock. If the erase
               * counters differ much enough, start wear-leveling.
               */
              e1 = rb_entry(rb_first(&ubi->used), struct ubi_wl_entry, u.rb);
              /* The scrub tree is empty, so take the WL target PEB from
               * the free tree (the entry whose EC gets as close as
               * possible, from below, to the bound set by
               * WL_FREE_MAX_DIFF). */
              e2 = find_wl_entry(&ubi->free, WL_FREE_MAX_DIFF);

              if (!(e2->ec - e1->ec >= UBI_WL_THRESHOLD)) {
                     dbg_wl("no WL needed: min used EC %d, max free EC %d",
                            e1->ec, e2->ec);
                     goto out_cancel;
              }
              paranoid_check_in_wl_tree(e1, &ubi->used);
              /* rb_erase() is the basic red-black tree removal helper
               * from lib/rbtree.c. The data in e1 is about to be moved
               * away, so e1 must be removed from the ubi->used tree. */
              rb_erase(&e1->u.rb, &ubi->used);
              dbg_wl("move PEB %d EC %d to PEB %d EC %d",
                     e1->pnum, e1->ec, e2->pnum, e2->ec);
       } else {
              /* Perform scrubbing */
              scrubbing = 1;
              /* Note that when e1 is taken from the scrub tree, there is
               * no EC comparison like the
               * if (!(e2->ec - e1->ec >= UBI_WL_THRESHOLD)) check above.
               * Why? Because PEBs land in the scrub tree when a bit-flip
               * is detected while reading them, so they must be moved
               * unconditionally. */
              e1 = rb_entry(rb_first(&ubi->scrub), struct ubi_wl_entry, u.rb);
              e2 = find_wl_entry(&ubi->free, WL_FREE_MAX_DIFF);
              paranoid_check_in_wl_tree(e1, &ubi->scrub);
              rb_erase(&e1->u.rb, &ubi->scrub);
              dbg_wl("scrub PEB %d to PEB %d", e1->pnum, e2->pnum);
       }

       paranoid_check_in_wl_tree(e2, &ubi->free);
       rb_erase(&e2->u.rb, &ubi->free);

       /* Note: these two pointers are cleared again once the data move
        * has completed. */
       ubi->move_from = e1;
       ubi->move_to = e2;
       spin_unlock(&ubi->wl_lock);

       /*
        * Now we are going to copy physical eraseblock @e1->pnum to
        * @e2->pnum. We so far do not know which logical eraseblock our
        * physical eraseblock (@e1) belongs to. We have to read the
        * volume identifier header first.
        *
        * Note, we are protected from this PEB being unmapped and erased.
        * The 'ubi_wl_put_peb()' would wait for moving to be finished if
        * the PEB which is being moved was unmapped.
        */
       err = ubi_io_read_vid_hdr(ubi, e1->pnum, vid_hdr, 0);
       if (err && err != UBI_IO_BITFLIPS) {
              if (err == UBI_IO_PEB_FREE) {
                     /*
                      * We are trying to move a PEB without a VID header.
                      * UBI always writes VID headers shortly after the
                      * PEB was given out, so we have a situation when it
                      * has not yet had a chance to write it, because it
                      * was preempted. So add this PEB to the protection
                      * queue so far, because presumably more data will be
                      * written there (including the missing VID header),
                      * and then we'll move it.
                      */
                     /* This extra check of the VID header matters: just
                      * because the PEB came from the used tree does not
                      * mean its data can be copied blindly - something
                      * may have gone wrong earlier. If the PEB to be
                      * moved turns out to be empty, there is nothing to
                      * move. */
                     dbg_wl("PEB %d has no VID header", e1->pnum);
                     protect = 1;
                     goto out_not_moved;
              }

              ubi_err("error %d while reading VID header from PEB %d",
                     err, e1->pnum);
              goto out_error;
       }

       vol_id = be32_to_cpu(vid_hdr->vol_id);
       lnum = be32_to_cpu(vid_hdr->lnum);

       /* The actual data move is done by ubi_eba_copy_leb(); its
        * implementation is straightforward and not covered here. */
       err = ubi_eba_copy_leb(ubi, e1->pnum, e2->pnum, vid_hdr);

       if (err) {
              if (err == MOVE_CANCEL_RACE) {
                     /*
                      * The LEB has not been moved because the volume is
                      * being deleted or the PEB has been put meanwhile.
                      * We should prevent this PEB from being selected for
                      * wear-leveling movement again, so put it to the
                      * protection queue.
                      */
                     protect = 1;
                     goto out_not_moved;
              }
              if (err == MOVE_CANCEL_BITFLIPS || err == MOVE_TARGET_WR_ERR ||
                  err == MOVE_TARGET_RD_ERR) {
                     /*
                      * Target PEB had bit-flips or write error - torture
                      * it.
                      */
                     torture = 1;
                     goto out_not_moved;
              }
              if (err == MOVE_SOURCE_RD_ERR) {
                     /*
                      * An error happened while reading the source PEB. Do
                      * not switch to R/O mode in this case, and give the
                      * upper layers a possibility to recover from this,
                      * e.g. by unmapping corresponding LEB. Instead, just
                      * put this PEB to the @ubi->erroneous list to
                      * prevent UBI from trying to move it over and over
                      * again.
                      */
                     if (ubi->erroneous_peb_count > ubi->max_erroneous) {
                            ubi_err("too many erroneous eraseblocks (%d)",
                                   ubi->erroneous_peb_count);
                            goto out_error;
                     }
                     erroneous = 1;
                     goto out_not_moved;
              }
              if (err < 0)
                     goto out_error;
              ubi_assert(0);
       }

       /* The PEB has been successfully moved */
       if (scrubbing)
              ubi_msg("scrubbed PEB %d (LEB %d:%d), data moved to PEB %d",
                     e1->pnum, vol_id, lnum, e2->pnum);
       ubi_free_vid_hdr(ubi, vid_hdr);

       spin_lock(&ubi->wl_lock);
       if (!ubi->move_to_put) {
              wl_tree_add(e2, &ubi->used);
              e2 = NULL;
       }
       ubi->move_from = ubi->move_to = NULL;
       ubi->move_to_put = ubi->wl_scheduled = 0;
       spin_unlock(&ubi->wl_lock);

       /* e1 is erased asynchronously by the background thread, in
        * erase_worker(). */
       err = schedule_erase(ubi, e1, 0);
       if (err) {
              kmem_cache_free(ubi_wl_entry_slab, e1);
              if (e2)
                     kmem_cache_free(ubi_wl_entry_slab, e2);
              goto out_ro;
       }

       if (e2) {
              /*
               * Well, the target PEB was put meanwhile, schedule it for
               * erasure.
               */
              dbg_wl("PEB %d (LEB %d:%d) was put meanwhile, erase",
                     e2->pnum, vol_id, lnum);
              err = schedule_erase(ubi, e2, 0);
              if (err) {
                     kmem_cache_free(ubi_wl_entry_slab, e2);
                     goto out_ro;
              }
       }

       dbg_wl("done");
       mutex_unlock(&ubi->move_mutex);
       return 0;

 

       /*
        * For some reasons the LEB was not moved, might be an error, might
        * be something else. @e1 was not changed, so return it back. @e2
        * might have been changed, schedule it for erasure.
        */
out_not_moved:
       if (vol_id != -1)
              dbg_wl("cancel moving PEB %d (LEB %d:%d) to PEB %d (%d)",
                     e1->pnum, vol_id, lnum, e2->pnum, err);
       else
              dbg_wl("cancel moving PEB %d to PEB %d (%d)",
                     e1->pnum, e2->pnum, err);
       spin_lock(&ubi->wl_lock);
       if (protect)
              prot_queue_add(ubi, e1);
       else if (erroneous) {
              wl_tree_add(e1, &ubi->erroneous);
              ubi->erroneous_peb_count += 1;
       } else if (scrubbing)
              wl_tree_add(e1, &ubi->scrub);
       else
              wl_tree_add(e1, &ubi->used);
       ubi_assert(!ubi->move_to_put);
       ubi->move_from = ubi->move_to = NULL;
       ubi->wl_scheduled = 0;
       spin_unlock(&ubi->wl_lock);

       ubi_free_vid_hdr(ubi, vid_hdr);
       err = schedule_erase(ubi, e2, torture);
       if (err) {
              kmem_cache_free(ubi_wl_entry_slab, e2);
              goto out_ro;
       }
       mutex_unlock(&ubi->move_mutex);
       return 0;

out_error:
       if (vol_id != -1)
              ubi_err("error %d while moving PEB %d (LEB %d:%d) to PEB %d",
                     err, e1->pnum, vol_id, lnum, e2->pnum);
       else
              ubi_err("error %d while moving PEB %d to PEB %d",
                     err, e1->pnum, e2->pnum);
       spin_lock(&ubi->wl_lock);
       ubi->move_from = ubi->move_to = NULL;
       ubi->move_to_put = ubi->wl_scheduled = 0;
       spin_unlock(&ubi->wl_lock);

       ubi_free_vid_hdr(ubi, vid_hdr);
       kmem_cache_free(ubi_wl_entry_slab, e1);
       kmem_cache_free(ubi_wl_entry_slab, e2);

out_ro:
       ubi_ro_mode(ubi);
       mutex_unlock(&ubi->move_mutex);
       ubi_assert(err != 0);
       return err < 0 ? err : -EIO;

out_cancel:
       ubi->wl_scheduled = 0;
       spin_unlock(&ubi->wl_lock);
       mutex_unlock(&ubi->move_mutex);
       ubi_free_vid_hdr(ubi, vid_hdr);
       return 0;
}

That essentially covers WL. The main code lives in drivers/mtd/ubi/wl.c.

So under what circumstances does UBI call ensure_wear_leveling() to decide whether WL should run?

1. erase_worker

2. ubi_wl_scrub_peb

3. ubi_wl_init_scan

One point about WL deserves a closer look, the question raised above: where do the nodes in ubi->scrub come from?

In ubi_eba_read_leb(), when a bit-flip occurs during the read, ubi_wl_scrub_peb() is called to schedule the affected PEB for scrubbing.

And as seen in ensure_wear_leveling() above, WL looks at the ubi->scrub tree first.
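For completeness, here is a condensed sketch of how ubi_wl_scrub_peb() moves a bit-flipped PEB into the scrub tree, simplified from drivers/mtd/ubi/wl.c; error handling and the races with an in-flight move are omitted, so treat it as an outline rather than the real function:

int ubi_wl_scrub_peb(struct ubi_device *ubi, int pnum)
{
       struct ubi_wl_entry *e;

       spin_lock(&ubi->wl_lock);
       e = ubi->lookuptbl[pnum];            /* pnum -> wl entry lookup */
       if (in_wl_tree(e, &ubi->scrub)) {    /* already scheduled */
              spin_unlock(&ubi->wl_lock);
              return 0;
       }
       if (in_wl_tree(e, &ubi->used))
              rb_erase(&e->u.rb, &ubi->used);  /* pull it out of used... */
       else
              prot_queue_del(ubi, e->pnum);    /* ...or out of the
                                                * protection queue (return
                                                * value ignored here) */
       wl_tree_add(e, &ubi->scrub);            /* park it in scrub */
       spin_unlock(&ubi->wl_lock);

       /*
        * Scrubbing is technically the same as wear-leveling, so let
        * ensure_wear_leveling() decide whether to schedule the worker.
        */
       return ensure_wear_leveling(ubi);
}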