文件系统隔离之 – 深入 prjquota，源码剖析

内容隐藏

ext4 prjquota 实现原理，参考了 xfs prjquota，并且复用了linux 内核的磁盘配额管理机制的大部分实现，所以源码上分析起来还是非常简单的

linux内核本身就已经支持user、group级别的磁盘配额管理，用法可以参考：https://access.redhat.com/documentation/en-us/red_hat_enterprise_linux/7/html/storage_administration_guide/ch-disk-quotas

从文件系统实现层面来看，文件系统本身并不了解什么是uid，gid，因此disk quota的实现一定是在raw file system 之上的。正因为如此，prjquota 得以复用原有 disk quota 的大量实现，只需要在原有基础之上，扩展一个新的 quota 类型而已

具体内核提交的 patch：https://lore.kernel.org/patchwork/patch/541891/

4.14 内核时，已经进入主干，因此可以参考：https://lxr.missinglinkelectronics.com/linux+v4.14/fs/ext4/

简述一下其基本设计：

在 super block 中，有一块专门用来存储 project id 用量的元数据区
每个文件，属于哪个 project id，是记录在文件 inode 的扩展字段 i_projid 里面的（正是因为 ext4 的 inode 支持扩展字段，所以才能很方便地移植这个特性，见下文 ext4_inode 的改动）
文件写入的时候，先查找这个文件的 project id，然后判断当前 project 的 usage + 文件的增量大小，是否超过 project 的 hardlimit，如果超过，返回 EDQUOT，文件写入失败

1）prjquota 元数据管理

prjquota 并没有从 superblock 上分配一块特殊的空间，来管理自己的元数据。因为不管是usrquota，还是grpquota，还是prjquota，对raw filesystem 来说，都是一样的

所以只需要扩展一下原来的 *quota 定义即可

-#define MAXQUOTAS 2
+#define MAXQUOTAS 3
 #define USRQUOTA  0		/* element used for user quotas */
 #define GRPQUOTA  1		/* element used for group quotas */
+#define PRJQUOTA  2		/* element used for project quotas */
 
 /*
  * Definitions for the default names of the quotas files.
@@ -48,6 +49,7 @@
 #define INITQFNAMES { \
 	"user",    /* USRQUOTA */ \
 	"group",   /* GRPQUOTA */ \
+	"project", /* PRJQUOTA */ \
 	"undefined", \
 };

然后在 ext4_inode 中，增加一个字段用来存储 project id 即可

--- a/fs/ext4/ext4.h
+++ b/fs/ext4/ext4.h
@@ -683,6 +683,7 @@ struct ext4_inode {
 	__le32  i_crtime;       /* File Creation time */
 	__le32  i_crtime_extra; /* extra FileCreationtime (nsec << 2 | epoch) */
 	__le32  i_version_hi;	/* high 32 bits for 64-bit version */
+	__le32  i_projid;	/* Project ID */
 };
 
 struct move_extent {
@@ -938,6 +939,7 @@ struct ext4_inode_info {
 
 	/* Precomputed uuid+inum+igen checksum for seeding inode checksums */
 	__u32 i_csum_seed;
+	kprojid_t i_projid;
 };

由于复用了 usrquota 的实现，所以 prjquota 和 usrquota、grpquota 一样，都有 65535 个 id 的限制，也就是说，一个文件系统内，最多只能有 65535 个 project id

每个 project id 当前已经使用了多少磁盘空间，以及多少个 inode，是记录在 superblock 上某个特殊的元数据区里面的，其数据结构是 struct qc_dqblk {}

/* Structure for communicating via ->get_dqblk() & ->set_dqblk() */
struct qc_dqblk {
	int d_fieldmask;	/* mask of fields to change in ->set_dqblk() */
	u64 d_spc_hardlimit;	/* absolute limit on used space */
	u64 d_spc_softlimit;	/* preferred limit on used space */
	u64 d_ino_hardlimit;	/* maximum # allocated inodes */
	u64 d_ino_softlimit;	/* preferred inode limit */
	u64 d_space;		/* Space owned by the user */
	u64 d_ino_count;	/* # inodes owned by the user */
	s64 d_ino_timer;	/* zero if within inode limits */
        ......
}

2）prjquota 使能和关闭

前面我们说过，打开 prjquota 有两种方式，一种是 mount 设备的时候，直接加上 prjquota 参数，另外一种就是 tune2fs，从内核源码来看，这 2 种方式做的事情是一样的，最终就是在 superblock 上加上 enable 的开关，然后把管理 prjquota 的元数据内容，从磁盘加载到内存里

int dquot_enable(struct inode *inode, int type, int format_id,
		 unsigned int flags)
{
	struct super_block *sb = inode->i_sb;
        ......
	/* Just updating flags needed? */
	if (sb_has_quota_loaded(sb, type)) {
		if (flags & DQUOT_USAGE_ENABLED &&
		    sb_has_quota_usage_enabled(sb, type))
			return -EBUSY;
		if (flags & DQUOT_LIMITS_ENABLED &&
		    sb_has_quota_limits_enabled(sb, type))
			return -EBUSY;
		spin_lock(&dq_state_lock);
		sb_dqopt(sb)->flags |= dquot_state_flag(flags, type);
		spin_unlock(&dq_state_lock);
		return 0;
	}

	return vfs_load_quota_inode(inode, type, format_id, flags);
}

3）disk quota 统计 & 写入校验

所有带 project id 的文件，在读写的时候，都会把当前文件的写入内容的大小统计到当前 project id 的总用量上

主要分两种情况：

更新文件的 project id：包括以前没设置，现在设置一个新的，以及以前有 project id，现在更新成其他的
写文件：包括写一个新的文件，以及 append 文件。当前都是同一个实现

首先来看下，更新 project id 是怎么实现的，其内核关键函数是 __dquot_transfer，这个函数会把 inode 对应的文件大小，从原来的 project id 里减掉，然后加到新的 project id 下，如果增加到目标 project id 失败（比如 quota 超限了），则回滚

/*
 * Transfer the number of inode and blocks from one diskquota to an other.
 * On success, dquot references in transfer_to are consumed and references
 * to original dquots that need to be released are placed there. On failure,
 * references are kept untouched.
 *
 * This operation can block, but only after everything is updated
 * A transaction must be started when entering this function.
 *
 * We are holding reference on transfer_from & transfer_to, no need to
 * protect them by srcu_read_lock().
 */
int __dquot_transfer(struct inode *inode, struct dquot **transfer_to)
{
......

	cur_space = __inode_get_bytes(inode);
	rsv_space = __inode_get_rsv_space(inode);
	/*
	 * Build the transfer_from list, check limits, and update usage in
	 * the target structures.
	 */
	for (cnt = 0; cnt < MAXQUOTAS; cnt++) {
		/*
		 * Skip changes for same uid or gid or for turned off quota-type.
		 */
		if (!transfer_to[cnt])
			continue;
		/* Avoid races with quotaoff() */
		if (!sb_has_quota_active(inode->i_sb, cnt))
			continue;
		is_valid[cnt] = 1;
		transfer_from[cnt] = i_dquot(inode)[cnt];
		ret = dquot_add_inodes(transfer_to[cnt], inode_usage,
				       &warn_to[cnt]);
		if (ret)
			goto over_quota;
		ret = dquot_add_space(transfer_to[cnt], cur_space, rsv_space, 0,
				      &warn_to[cnt]);
		if (ret) {
			spin_lock(&transfer_to[cnt]->dq_dqb_lock);
			dquot_decr_inodes(transfer_to[cnt], inode_usage);
			spin_unlock(&transfer_to[cnt]->dq_dqb_lock);
			goto over_quota;
		}
	}

	/* Decrease usage for source structures and update quota pointers */
	for (cnt = 0; cnt < MAXQUOTAS; cnt++) {
		if (!is_valid[cnt])
			continue;
		/* Due to IO error we might not have transfer_from[] structure */
		if (transfer_from[cnt]) {
			int wtype;

			spin_lock(&transfer_from[cnt]->dq_dqb_lock);
			wtype = info_idq_free(transfer_from[cnt], inode_usage);
			if (wtype != QUOTA_NL_NOWARN)
				prepare_warning(&warn_from_inodes[cnt],
						transfer_from[cnt], wtype);
			wtype = info_bdq_free(transfer_from[cnt],
					      cur_space + rsv_space);
			if (wtype != QUOTA_NL_NOWARN)
				prepare_warning(&warn_from_space[cnt],
						transfer_from[cnt], wtype);
			dquot_decr_inodes(transfer_from[cnt], inode_usage);
			dquot_decr_space(transfer_from[cnt], cur_space);
			dquot_free_reserved_space(transfer_from[cnt],
						  rsv_space);
			spin_unlock(&transfer_from[cnt]->dq_dqb_lock);
		}
		i_dquot(inode)[cnt] = transfer_to[cnt];
	}
......
}
EXPORT_SYMBOL(__dquot_transfer);

我们再来看一下，写文件的过程，其实现的关键函数是 __dquot_alloc_space，这个函数会把写入的内容大小，统计到对应 project id 下面。同理，如果文件是被 ftruncate 了，则 number 是个负数，表示是需要释放这部分的大小

/*
 * This functions updates i_blocks+i_bytes fields and quota information
 * (together with appropriate checks).
 *
 * NOTE: We absolutely rely on the fact that caller dirties the inode
 * (usually helpers in quotaops.h care about this) and holds a handle for
 * the current transaction so that dquot write and inode write go into the
 * same transaction.
 */
int __dquot_alloc_space(struct inode *inode, qsize_t number, int flags)
{
......

	dquots = i_dquot(inode);
	index = srcu_read_lock(&dquot_srcu);
	spin_lock(&inode->i_lock);
	for (cnt = 0; cnt < MAXQUOTAS; cnt++) {
		if (!dquots[cnt])
			continue;
		if (flags & DQUOT_SPACE_RESERVE) {
			ret = dquot_add_space(dquots[cnt], 0, number, flags,
					      &warn[cnt]);
		} else {
			ret = dquot_add_space(dquots[cnt], number, 0, flags,
					      &warn[cnt]);
		}
		if (ret) {
			/* 任意一种 disk quota 设置失败，这里都全部回滚 */
			for (cnt--; cnt >= 0; cnt--) {
				if (!dquots[cnt])
					continue;
				spin_lock(&dquots[cnt]->dq_dqb_lock);
				if (flags & DQUOT_SPACE_RESERVE) {
					dquots[cnt]->dq_dqb.dqb_rsvspace -=
									number;
				} else {
					dquots[cnt]->dq_dqb.dqb_curspace -=
									number;
				}
				spin_unlock(&dquots[cnt]->dq_dqb_lock);
			}
			spin_unlock(&inode->i_lock);
			goto out_flush_warn;
		}
	}
......
	return ret;
}
EXPORT_SYMBOL(__dquot_alloc_space);

上面所有过程实现的分析，都是以 space 为例的，project quota 中对于 inode 的处理是同理的，这里不展开分析了

一	二	三	四	五	六	日
				1	2	3
4	5	6	7	8	9	10
11	12	13	14	15	16	17
18	19	20	21	22	23	24
25	26	27	28	29	30	31

成功，源于对美学的执著追求

文件系统隔离之 – 深入 prjquota，源码剖析

1）prjquota 元数据管理

2）prjquota 使能和关闭

3）disk quota 统计 & 写入校验

发表回复取消回复

成功，源于对美学的执著追求

1）prjquota 元数据管理

2）prjquota 使能和关闭

3）disk quota 统计 & 写入校验

发表回复 取消回复

发表回复取消回复