CephFS file location in Luminous

Requirement

I have CephFS deployed on Ceph Luminous and want to see how a file on it maps to RADOS objects and then to OSDs.

In the Ceph Jewel release there was a cephfs utility that could show a file's location information, like this:

# cephfs /mnt/tstfs2/mike512K/tstfile show_location
WARNING: This tool is deprecated. Use the layout.* xattrs to query and modify layouts.
location.file_offset: 0 // offset within the file
location.object_offset:0 // offset within the object
location.object_no: 0 // object number
location.object_size: 4194304 // object size is 4 MB
location.object_name: 10000002356.00000000 // object name
location.block_offset: 0 // offset within the block
location.block_size: 524288 // block size is 512 KB
location.osd: 0 // stored on osd 0

But as the WARNING above says, this tool has been deprecated, and I could not find a replacement command in Ceph Luminous...

The official Ceph documentation doesn't say anything about it either: http://docs.ceph.com/docs/master/cephfs/file-layouts/
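
The WARNING points at the layout.* virtual xattrs as the replacement, but those only report a file's layout (stripe unit, stripe count, object size, pool), not the object or OSD behind a given offset. Reading the virtual xattr from a program is simple enough; a minimal sketch, with a made-up path:

// Minimal sketch: read the ceph.file.layout virtual xattr of a file on a
// mounted CephFS. The path below is hypothetical.
#include <sys/xattr.h>
#include <cstdio>

int main()
{
  const char *path = "/mnt/cephfs/somefile";  // hypothetical mount point/file
  char buf[256] = {0};

  ssize_t n = getxattr(path, "ceph.file.layout", buf, sizeof(buf) - 1);
  if (n < 0) {
    perror("getxattr");
    return 1;
  }
  // Prints something like:
  //   stripe_unit=4194304 stripe_count=1 object_size=4194304 pool=cephfs_data
  printf("%.*s\n", (int)n, buf);
  return 0;
}

That still leaves the file-offset-to-object mapping unanswered.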

So the only option left is to read the code and work it out myself ;(

Code Analysis

Jewel

Jewel does ship the cephfs tool, so let's first look at how it works.

File: src/cephfs.cc
int main (int argc, char **argv)
{
  ...
  if (CMD_SHOW_LAYOUT == cmd) {
    ...
  } else if (CMD_SHOW_LOC == cmd) {
    struct ceph_ioctl_dataloc location;
    location.file_offset = file_offset;
    err = ioctl(fd, CEPH_IOC_GET_DATALOC, (unsigned long)&location);
    if (err) {
      cerr << "Error getting location: " << cpp_strerror(err) << endl;
      return 1;
    }
    cout << "location.file_offset: " << location.file_offset << endl;
    cout << "location.object_offset:" << location.object_offset << endl;
    cout << "location.object_no: " << location.object_no << endl;
    cout << "location.object_size: " << location.object_size << endl;
    cout << "location.object_name: " << location.object_name << endl;
    cout << "location.block_offset: " << location.block_offset << endl;
    cout << "location.block_size: " << location.block_size << endl;
    cout << "location.osd: " << location.osd << endl;
    // cout << "osd address: " << location.osd_addr << endl;
  }

The snippet above is the code behind the cephfs <file_path> show_location command.

File: src/client/ioctl.h
#define CEPH_IOC_GET_DATALOC _IOWR(CEPH_IOCTL_MAGIC, 3, \
                                   struct ceph_ioctl_dataloc)

Looking at how the Ceph code itself handles ioctls, there is only the ceph-fuse client implementation:

File: src/client/fuse_ll.cc
#ifdef FUSE_IOCTL_COMPAT
static void fuse_ll_ioctl(fuse_req_t req, fuse_ino_t ino, int cmd, void *arg,
                          struct fuse_file_info *fi, unsigned flags,
                          const void *in_buf, size_t in_bufsz, size_t out_bufsz)
{
  ...
  switch (cmd) {
  case CEPH_IOC_GET_LAYOUT: {
      file_layout_t layout;
      struct ceph_ioctl_layout l;
      Fh *fh = (Fh*)fi->fh;
      cfuse->client->ll_file_layout(fh, &layout);
      l.stripe_unit = layout.stripe_unit;
      l.stripe_count = layout.stripe_count;
      l.object_size = layout.object_size;
      l.data_pool = layout.pool_id;
      fuse_reply_ioctl(req, 0, &l, sizeof(struct ceph_ioctl_layout));
    }
    break;
  default:
    fuse_reply_err(req, EINVAL);
  }
}

So ceph-fuse only supports the CEPH_IOC_GET_LAYOUT ioctl command.

Now look at the Linux kernel code, at the CephFS ioctl handling:

File: fs/ceph/ioctl.c
/*
 * Return object name, size/offset information, and location (OSD
 * number, network address) for a given file offset.
 */
static long ceph_ioctl_get_dataloc(struct file *file, void __user *arg)
{
  struct ceph_ioctl_dataloc dl;
  struct inode *inode = file_inode(file);
  ...
  dl.file_offset -= dl.object_offset;
  dl.object_size = ci->i_layout.object_size;
  dl.block_size = ci->i_layout.stripe_unit;

  /* block_offset = object_offset % block_size */
  tmp = dl.object_offset;
  dl.block_offset = do_div(tmp, dl.block_size);

  snprintf(dl.object_name, sizeof(dl.object_name), "%llx.%08llx",
           ceph_ino(inode), dl.object_no);
  ...
}

So the file's location information is generated from its layout and an inode-based object naming rule.
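
The deprecated cephfs tool was essentially a thin wrapper around this ioctl, and the kernel client still implements CEPH_IOC_GET_DATALOC, so a file on a kernel-mounted CephFS can in principle still be queried this way. A rough sketch; the struct and magic number are assumed to mirror fs/ceph/ioctl.h (the same definitions as src/client/ioctl.h), and the file path is made up:

// Rough sketch: issue CEPH_IOC_GET_DATALOC directly against a kernel-mounted
// CephFS file; this is essentially what the deprecated cephfs tool did.
// The struct and CEPH_IOCTL_MAGIC below are assumed to mirror fs/ceph/ioctl.h.
#include <sys/ioctl.h>
#include <sys/socket.h>
#include <fcntl.h>
#include <unistd.h>
#include <cstdint>
#include <cstdio>

#define CEPH_IOCTL_MAGIC 0x97

struct ceph_ioctl_dataloc {
  uint64_t file_offset;              // in+out: file offset
  uint64_t object_offset;            // out: offset within the object
  uint64_t object_no;                // out: object number
  uint64_t object_size;              // out: object size
  char     object_name[64];          // out: object name
  uint64_t block_offset;             // out: offset within the block
  uint64_t block_size;               // out: block (stripe unit) size
  int64_t  osd;                      // out: OSD number
  struct sockaddr_storage osd_addr;  // out: OSD address
};

#define CEPH_IOC_GET_DATALOC _IOWR(CEPH_IOCTL_MAGIC, 3, \
                                   struct ceph_ioctl_dataloc)

int main()
{
  int fd = open("/mnt/cephfs/somefile", O_RDONLY);  // hypothetical path
  if (fd < 0) {
    perror("open");
    return 1;
  }

  struct ceph_ioctl_dataloc dl = {};
  dl.file_offset = 0;  // ask about the object backing offset 0
  if (ioctl(fd, CEPH_IOC_GET_DATALOC, &dl) < 0) {
    perror("ioctl(CEPH_IOC_GET_DATALOC)");
    close(fd);
    return 1;
  }

  printf("object_name: %s object_no: %llu osd: %lld\n",
         dl.object_name,
         (unsigned long long)dl.object_no,
         (long long)dl.osd);
  close(fd);
  return 0;
}

This only helps with the kernel client, though; with ceph-fuse (or without the tool at all) the mapping has to be worked out from the layout, which is what Luminous leaves us with.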

Luminous

In the Luminous tree there is no src/cephfs.cc anymore, so let's look at other related code.

Which operation on a file needs the mapping to RADOS? read/write comes to mind first, so let's look at the relevant code in Ceph Luminous.

File: src/client/Client.cc
int Client::_write(Fh *f, int64_t offset, uint64_t size, const char *buf,
                   const struct iovec *iov, int iovcnt)
{
  ...
  // async, caching, non-blocking.
  r = objectcacher->file_write(&in->oset, &in->layout,
                               in->snaprealm->get_snap_context(),
                               offset, size, bl, ceph::real_clock::now(),
                               0);
  ...
}

File: src/osdc/ObjectCacher.h
class ObjectCacher {
  ...
  int file_write(ObjectSet *oset, file_layout_t *layout,
                 const SnapContext& snapc, loff_t offset, uint64_t len,
                 bufferlist& bl, ceph::real_time mtime, int flags) {
    OSDWrite *wr = prepare_write(snapc, bl, mtime, flags, 0);
    Striper::file_to_extents(cct, oset->ino, layout, offset, len,
                             oset->truncate_size, wr->extents);
    return writex(wr, oset, NULL);
  }
  ...
};

File: src/osdc/Striper.h
class Striper {
  ...
  static void file_to_extents(CephContext *cct, inodeno_t ino,
                              const file_layout_t *layout,
                              uint64_t offset, uint64_t len,
                              uint64_t trunc_size,
                              vector<ObjectExtent>& extents) {
    // generate prefix/format
    char buf[32];
    snprintf(buf, sizeof(buf), "%llx.%%08llx", (long long unsigned)ino);

    file_to_extents(cct, buf, layout, offset, len, trunc_size, extents);
  }
  ...
};

From this we can see the format used to turn a file into object extents: the prefix is the file's inode number in hex, and the extent index is appended via %08llx as an 8-digit hex number.

In other words, the mapping from a CephFS file to RADOS objects is as follows.

Object naming rule: <file inode number, in hex>.<slice number, 8-digit hex>
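
Given the inode number and the layout, the backing object name for any file offset can be computed by hand. A minimal sketch, assuming the common non-striped layout (stripe_count = 1, stripe_unit = object_size), which is the CephFS default; the inode and offset values are just examples:

// Minimal sketch: compute the RADOS object name backing a given file offset,
// assuming a non-striped layout (stripe_count == 1, stripe_unit == object_size).
#include <cstdint>
#include <cstdio>

int main()
{
  uint64_t ino         = 1099511628901ULL;  // inode number, e.g. from stat
  uint64_t object_size = 4194304;           // from ceph.file.layout
  uint64_t file_offset = 6 * 1024 * 1024;   // any offset within the file

  uint64_t object_no = file_offset / object_size;  // the slice number

  char object_name[64];
  // Same format Striper::file_to_extents generates: <ino hex>.<8-digit hex slice>
  snprintf(object_name, sizeof(object_name), "%llx.%08llx",
           (unsigned long long)ino, (unsigned long long)object_no);

  // For inode 1099511628901 (0x10000000465) and offset 6 MiB this prints
  // 10000000465.00000001
  printf("%s\n", object_name);
  return 0;
}

The resulting name is exactly what rados stat and ceph osd map expect, as the verification below shows.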

Verification

root@ceph0:/mnt/cephfs# dd if=/dev/zero of=4Mfile bs=4M count=1
1+0 records in
1+0 records out
4194304 bytes (4.2 MB, 4.0 MiB) copied, 0.00866722 s, 484 MB/s
root@ceph0:/mnt/cephfs# ll -ih
total 4.1M
1 drwxr-xr-x 1 root root 40G Jun 7 17:33 ./
15466497 drwxr-xr-x 3 root root 4.0K Jun 4 15:19 ../
1099511628901 -rw-r--r-- 1 root root 4.0M Jun 7 17:33 4Mfile

root@ceph0:/mnt/cephfs# stat 4Mfile
File: '4Mfile'
Size: 4194304 Blocks: 8192 IO Block: 4194304 regular file
Device: 10006bh/1048683d Inode: 1099511628901 Links: 1
Access: (0644/-rw-r--r--) Uid: ( 0/ root) Gid: ( 0/ root)
Access: 2018-06-07 17:33:12.451473976 +0800
Modify: 2018-06-07 17:44:11.141674057 +0800
Change: 2018-06-07 17:44:11.141674057 +0800
Birth: -

1099511628901 in hexadecimal is 0x10000000465.

Check the file's layout information:

root@ceph0:/mnt/cephfs# getfattr -n ceph.file.layout 4Mfile
# file: 4Mfile
ceph.file.layout="stripe_unit=4194304 stripe_count=1 object_size=4194304 pool=cephfs_data"

Check the object in RADOS and its mapping:

root@ceph0:/mnt/cephfs# rados ls -p cephfs_data | grep -i 10000000465
10000000465.00000000
root@ceph0:/mnt/cephfs# rados -p cephfs_data stat 10000000465.00000000
cephfs_data/10000000465.00000000 mtime 2018-06-07 17:33:12.000000, size 4194304

root@ceph0:/mnt/cephfs# ceph osd map cephfs_data 10000000465.00000000
osdmap e5770 pool 'cephfs_data' (2) object '10000000465.00000000' -> pg 2.3137aa5e (2.5e) -> up ([2,6], p2) acting ([2,6], p2)