zonefs: Dynamically create file inodes when needed

Allocating and initializing all inodes and dentries for all files results in very large memory usage with high-capacity zoned block devices. For instance, with a 26 TB SMR HDD with over 96000 zones, mounting the disk with zonefs results in about 130 MB of memory used, the vast majority of this space being used for vfs inodes and dentries. However, since a user will rarely access all zones at the same time, dynamically creating file inodes and dentries on demand, similarly to regular file systems, can significantly reduce memory usage.

This patch modifies mount processing to not create the inodes and dentries for zone files. Instead, the directory inode operation zonefs_lookup() and directory file operation zonefs_readdir() are introduced to allocate and initialize inodes on demand using the helper functions zonefs_get_dir_inode() and zonefs_get_zgroup_inode(). Implementation of these functions is simple, relying on the static nature of zonefs directories and files. Directory inodes are linked to the volume zone groups (struct zonefs_zone_group) they represent by using the directory inode i_private field. This simplifies the implementation of the lookup and readdir operations.

Unreferenced zone file inodes can be evicted from the inode cache at any time. In such a case, the only inode information that cannot be recreated from the zone information that is saved in the zone group data structures attached to the volume super block is the inode uid, gid and access rights. These values may have been changed by the user. To keep these attributes for the lifetime of the mount, as before, the inode mode, uid and gid are saved in the inode zone information and the saved values are used to initialize regular file inodes when an inode lookup happens. The zone information mode, uid and gid are initialized in zonefs_init_zgroup() using the default values.

With these changes, the static minimal memory usage of a zonefs volume is mostly reduced to the array of zone information for each zone group. For the 26 TB SMR hard-disk mentioned above, the memory usage after mount becomes about 5.4 MB, a reduction by a factor of 24 from the initial 130 MB memory use.

Co-developed-by: Jorgen Hansen <Jorgen.Hansen@wdc.com>
Signed-off-by: Damien Le Moal <damien.lemoal@opensource.wdc.com>
Reviewed-by: Johannes Thumshirn <johannes.thumshirn@wdc.com>
author: Damien Le Moal <damien.lemoal@opensource.wdc.com> 2022-11-30 11:01:09 +0900
committer: Damien Le Moal <damien.lemoal@opensource.wdc.com> 2023-01-23 09:25:51 +0900
commit: d207794ababe5c3ad72e965c5e1023cfaf4ab1bb (patch)
tree: a0485995788cf4f36b097ba07fe966c9f4c28434 /fs/zonefs
parent: aa7f243f32e1d18036ee00d71d3ccfad70ae2121 (diff)
2 files changed, 257 insertions, 99 deletions
diff --git a/fs/zonefs/super.c b/fs/zonefs/super.c
index 270ded209dde..7d70c327883e 100644
--- a/fs/zonefs/super.c
+++ b/fs/zonefs/super.c
@@ -243,6 +243,7 @@ static void zonefs_inode_update_mode(struct inode *inode)
 	}
 
 	z->z_flags &= ~ZONEFS_ZONE_INIT_MODE;
+	z->z_mode = inode->i_mode;
 }
 
 struct zonefs_ioerr_data {
@@ -578,144 +579,283 @@ static int zonefs_inode_setattr(struct user_namespace *mnt_userns,
 
 	setattr_copy(&init_user_ns, inode, iattr);
 
+	if (S_ISREG(inode->i_mode)) {
+		struct zonefs_zone *z = zonefs_inode_zone(inode);
+
+		z->z_mode = inode->i_mode;
+		z->z_uid = inode->i_uid;
+		z->z_gid = inode->i_gid;
+	}
+
 	return 0;
 }
 
-static const struct inode_operations zonefs_dir_inode_operations = {
-	.lookup		= simple_lookup,
+static const struct inode_operations zonefs_file_inode_operations = {
 	.setattr	= zonefs_inode_setattr,
 };
 
-static void zonefs_init_dir_inode(struct inode *parent, struct inode *inode,
-				  enum zonefs_ztype ztype)
+static long zonefs_fname_to_fno(const struct qstr *fname)
 {
-	struct super_block *sb = parent->i_sb;
+	const char *name = fname->name;
+	unsigned int len = fname->len;
+	long fno = 0, shift = 1;
+	const char *rname;
+	char c = *name;
+	unsigned int i;
 
-	inode->i_ino = bdev_nr_zones(sb->s_bdev) + ztype + 1;
-	inode_init_owner(&init_user_ns, inode, parent, S_IFDIR | 0555);
-	inode->i_op = &zonefs_dir_inode_operations;
-	inode->i_fop = &simple_dir_operations;
-	set_nlink(inode, 2);
-	inc_nlink(parent);
-}
+	/*
+	 * File names are always a base-10 number string without any
+	 * leading 0s.
+	 */
+	if (!isdigit(c))
+		return -ENOENT;
 
-static const struct inode_operations zonefs_file_inode_operations = {
-	.setattr	= zonefs_inode_setattr,
-};
+	if (len > 1 && c == '0')
+		return -ENOENT;
 
-static void zonefs_init_file_inode(struct inode *inode,
-				   struct zonefs_zone *z)
+	if (len == 1)
+		return c - '0';
+
+	for (i = 0, rname = name + len - 1; i < len; i++, rname--) {
+		c = *rname;
+		if (!isdigit(c))
+			return -ENOENT;
+		fno += (c - '0') * shift;
+		shift *= 10;
+	}
+
+	return fno;
+}
+
+static struct inode *zonefs_get_file_inode(struct inode *dir,
+					   struct dentry *dentry)
 {
-	struct super_block *sb = inode->i_sb;
+	struct zonefs_zone_group *zgroup = dir->i_private;
+	struct super_block *sb = dir->i_sb;
 	struct zonefs_sb_info *sbi = ZONEFS_SB(sb);
+	struct zonefs_zone *z;
+	struct inode *inode;
+	ino_t ino;
+	long fno;
 
-	inode->i_private = z;
+	/* Get the file number from the file name */
+	fno = zonefs_fname_to_fno(&dentry->d_name);
+	if (fno < 0)
+		return ERR_PTR(fno);
+
+	if (!zgroup->g_nr_zones || fno >= zgroup->g_nr_zones)
+		return ERR_PTR(-ENOENT);
 
-	inode->i_ino = z->z_sector >> sbi->s_zone_sectors_shift;
-	inode->i_mode = S_IFREG | sbi->s_perm;
-	inode->i_uid = sbi->s_uid;
-	inode->i_gid = sbi->s_gid;
+	z = &zgroup->g_zones[fno];
+	ino = z->z_sector >> sbi->s_zone_sectors_shift;
+	inode = iget_locked(sb, ino);
+	if (!inode)
+		return ERR_PTR(-ENOMEM);
+	if (!(inode->i_state & I_NEW)) {
+		WARN_ON_ONCE(inode->i_private != z);
+		return inode;
+	}
+
+	inode->i_ino = ino;
+	inode->i_mode = z->z_mode;
+	inode->i_ctime = inode->i_mtime = inode->i_atime = dir->i_ctime;
+	inode->i_uid = z->z_uid;
+	inode->i_gid = z->z_gid;
 	inode->i_size = z->z_wpoffset;
 	inode->i_blocks = z->z_capacity >> SECTOR_SHIFT;
+	inode->i_private = z;
 
 	inode->i_op = &zonefs_file_inode_operations;
 	inode->i_fop = &zonefs_file_operations;
 	inode->i_mapping->a_ops = &zonefs_file_aops;
 
 	/* Update the inode access rights depending on the zone condition */
-	z->z_flags |= ZONEFS_ZONE_INIT_MODE;
 	zonefs_inode_update_mode(inode);
+
+	unlock_new_inode(inode);
+
+	return inode;
 }
 
-static struct dentry *zonefs_create_inode(struct dentry *parent,
-					  const char *name,
-					  struct zonefs_zone *z,
-					  enum zonefs_ztype ztype)
+static struct inode *zonefs_get_zgroup_inode(struct super_block *sb,
+					     enum zonefs_ztype ztype)
 {
-	struct inode *dir = d_inode(parent);
-	struct dentry *dentry;
+	struct inode *root = d_inode(sb->s_root);
+	struct zonefs_sb_info *sbi = ZONEFS_SB(sb);
 	struct inode *inode;
-	int ret = -ENOMEM;
+	ino_t ino = bdev_nr_zones(sb->s_bdev) + ztype + 1;
 
-	dentry = d_alloc_name(parent, name);
-	if (!dentry)
-		return ERR_PTR(ret);
-
-	inode = new_inode(parent->d_sb);
+	inode = iget_locked(sb, ino);
 	if (!inode)
-		goto dput;
+		return ERR_PTR(-ENOMEM);
+	if (!(inode->i_state & I_NEW))
+		return inode;
+
+	inode->i_ino = ino;
+	inode_init_owner(&init_user_ns, inode, root, S_IFDIR | 0555);
+	inode->i_size = sbi->s_zgroup[ztype].g_nr_zones;
+	inode->i_ctime = inode->i_mtime = inode->i_atime = root->i_ctime;
+	inode->i_private = &sbi->s_zgroup[ztype];
+	set_nlink(inode, 2);
 
-	inode->i_ctime = inode->i_mtime = inode->i_atime = dir->i_ctime;
-	if (z)
-		zonefs_init_file_inode(inode, z);
-	else
-		zonefs_init_dir_inode(dir, inode, ztype);
+	inode->i_op = &zonefs_dir_inode_operations;
+	inode->i_fop = &zonefs_dir_operations;
+
+	unlock_new_inode(inode);
 
-	d_add(dentry, inode);
-	dir->i_size++;
+	return inode;
+}
 
-	return dentry;
 
-dput:
-	dput(dentry);
+static struct inode *zonefs_get_dir_inode(struct inode *dir,
+					  struct dentry *dentry)
+{
+	struct super_block *sb = dir->i_sb;
+	struct zonefs_sb_info *sbi = ZONEFS_SB(sb);
+	const char *name = dentry->d_name.name;
+	enum zonefs_ztype ztype;
+
+	/*
+	 * We only need to check for the "seq" directory and
+	 * the "cnv" directory if we have conventional zones.
+	 */
+	if (dentry->d_name.len != 3)
+		return ERR_PTR(-ENOENT);
+
+	for (ztype = 0; ztype < ZONEFS_ZTYPE_MAX; ztype++) {
+		if (sbi->s_zgroup[ztype].g_nr_zones &&
+		    memcmp(name, zonefs_zgroup_name(ztype), 3) == 0)
+			break;
+	}
+	if (ztype == ZONEFS_ZTYPE_MAX)
+		return ERR_PTR(-ENOENT);
 
-	return ERR_PTR(ret);
+	return zonefs_get_zgroup_inode(sb, ztype);
 }
 
-struct zonefs_zone_data {
-	struct super_block	*sb;
-	unsigned int		nr_zones[ZONEFS_ZTYPE_MAX];
-	sector_t		cnv_zone_start;
-	struct blk_zone		*zones;
-};
+static struct dentry *zonefs_lookup(struct inode *dir, struct dentry *dentry,
+				    unsigned int flags)
+{
+	struct inode *inode;
 
-/*
- * Create the inodes for a zone group.
- */
-static int zonefs_create_zgroup_inodes(struct super_block *sb,
-				       enum zonefs_ztype ztype)
+	if (dentry->d_name.len > ZONEFS_NAME_MAX)
+		return ERR_PTR(-ENAMETOOLONG);
+
+	if (dir == d_inode(dir->i_sb->s_root))
+		inode = zonefs_get_dir_inode(dir, dentry);
+	else
+		inode = zonefs_get_file_inode(dir, dentry);
+	if (IS_ERR(inode))
+		return ERR_CAST(inode);
+
+	return d_splice_alias(inode, dentry);
+}
+
+static int zonefs_readdir_root(struct file *file, struct dir_context *ctx)
 {
+	struct inode *inode = file_inode(file);
+	struct super_block *sb = inode->i_sb;
 	struct zonefs_sb_info *sbi = ZONEFS_SB(sb);
-	struct zonefs_zone_group *zgroup = &sbi->s_zgroup[ztype];
-	struct dentry *dir, *dent;
-	char *file_name;
-	int i, ret = 0;
+	enum zonefs_ztype ztype = ZONEFS_ZTYPE_CNV;
+	ino_t base_ino = bdev_nr_zones(sb->s_bdev) + 1;
 
-	if (!zgroup)
-		return -ENOMEM;
+	if (ctx->pos >= inode->i_size)
+		return 0;
 
-	/* If the group is empty, there is nothing to do */
-	if (!zgroup->g_nr_zones)
+	if (!dir_emit_dots(file, ctx))
 		return 0;
 
-	file_name = kmalloc(ZONEFS_NAME_MAX, GFP_KERNEL);
-	if (!file_name)
-		return -ENOMEM;
+	if (ctx->pos == 2) {
+		if (!sbi->s_zgroup[ZONEFS_ZTYPE_CNV].g_nr_zones)
+			ztype = ZONEFS_ZTYPE_SEQ;
 
-	dir = zonefs_create_inode(sb->s_root, zonefs_zgroup_name(ztype),
-				  NULL, ztype);
-	if (IS_ERR(dir)) {
-		ret = PTR_ERR(dir);
-		goto free;
+		if (!dir_emit(ctx, zonefs_zgroup_name(ztype), 3,
+			      base_ino + ztype, DT_DIR))
+			return 0;
+		ctx->pos++;
 	}
 
-	for (i = 0; i < zgroup->g_nr_zones; i++) {
-		/* Use the zone number within its group as the file name */
-		snprintf(file_name, ZONEFS_NAME_MAX - 1, "%u", i);
-		dent = zonefs_create_inode(dir, file_name,
-					   &zgroup->g_zones[i], ztype);
-		if (IS_ERR(dent)) {
-			ret = PTR_ERR(dent);
+	if (ctx->pos == 3 && ztype != ZONEFS_ZTYPE_SEQ) {
+		ztype = ZONEFS_ZTYPE_SEQ;
+		if (!dir_emit(ctx, zonefs_zgroup_name(ztype), 3,
+			      base_ino + ztype, DT_DIR))
+			return 0;
+		ctx->pos++;
+	}
+
+	return 0;
+}
+
+static int zonefs_readdir_zgroup(struct file *file,
+				 struct dir_context *ctx)
+{
+	struct inode *inode = file_inode(file);
+	struct zonefs_zone_group *zgroup = inode->i_private;
+	struct super_block *sb = inode->i_sb;
+	struct zonefs_sb_info *sbi = ZONEFS_SB(sb);
+	struct zonefs_zone *z;
+	int fname_len;
+	char *fname;
+	ino_t ino;
+	int f;
+
+	/*
+	 * The size of zone group directories is equal to the number
+	 * of zone files in the group and does not include the "." and
+	 * ".." entries. Hence the "+ 2" here.
+	 */
+	if (ctx->pos >= inode->i_size + 2)
+		return 0;
+
+	if (!dir_emit_dots(file, ctx))
+		return 0;
+
+	fname = kmalloc(ZONEFS_NAME_MAX, GFP_KERNEL);
+	if (!fname)
+		return -ENOMEM;
+
+	for (f = ctx->pos - 2; f < zgroup->g_nr_zones; f++) {
+		z = &zgroup->g_zones[f];
+		ino = z->z_sector >> sbi->s_zone_sectors_shift;
+		fname_len = snprintf(fname, ZONEFS_NAME_MAX - 1, "%u", f);
+		if (!dir_emit(ctx, fname, fname_len, ino, DT_REG))
 			break;
-		}
+		ctx->pos++;
 	}
 
-free:
-	kfree(file_name);
+	kfree(fname);
 
-	return ret;
+	return 0;
 }
 
+static int zonefs_readdir(struct file *file, struct dir_context *ctx)
+{
+	struct inode *inode = file_inode(file);
+
+	if (inode == d_inode(inode->i_sb->s_root))
+		return zonefs_readdir_root(file, ctx);
+
+	return zonefs_readdir_zgroup(file, ctx);
+}
+
+const struct inode_operations zonefs_dir_inode_operations = {
+	.lookup		= zonefs_lookup,
+	.setattr	= zonefs_inode_setattr,
+};
+
+const struct file_operations zonefs_dir_operations = {
+	.llseek		= generic_file_llseek,
+	.read		= generic_read_dir,
+	.iterate_shared	= zonefs_readdir,
+};
+
+struct zonefs_zone_data {
+	struct super_block	*sb;
+	unsigned int		nr_zones[ZONEFS_ZTYPE_MAX];
+	sector_t		cnv_zone_start;
+	struct blk_zone		*zones;
+};
+
 static int zonefs_get_zone_info_cb(struct blk_zone *zone, unsigned int idx,
 				   void *data)
 {
@@ -875,6 +1015,17 @@ static int zonefs_init_zgroup(struct super_block *sb,
 				      zone->capacity << SECTOR_SHIFT);
 		z->z_wpoffset = zonefs_check_zone_condition(sb, z, zone);
 
+		z->z_mode = S_IFREG | sbi->s_perm;
+		z->z_uid = sbi->s_uid;
+		z->z_gid = sbi->s_gid;
+
+		/*
+		 * Let zonefs_inode_update_mode() know that we will need
+		 * special initialization of the inode mode the first time
+		 * it is accessed.
+		 */
+		z->z_flags |= ZONEFS_ZONE_INIT_MODE;
+
 		sb->s_maxbytes = max(z->z_capacity, sb->s_maxbytes);
 		sbi->s_blocks += z->z_capacity >> sb->s_blocksize_bits;
 		sbi->s_used_blocks += z->z_wpoffset >> sb->s_blocksize_bits;
@@ -1057,7 +1208,7 @@ static int zonefs_fill_super(struct super_block *sb, void *data, int silent)
 {
 	struct zonefs_sb_info *sbi;
 	struct inode *inode;
-	enum zonefs_ztype t;
+	enum zonefs_ztype ztype;
 	int ret;
 
 	if (!bdev_is_zoned(sb->s_bdev)) {
@@ -1122,7 +1273,7 @@ static int zonefs_fill_super(struct super_block *sb, void *data, int silent)
 	if (ret)
 		goto cleanup;
 
-	/* Create root directory inode */
+	/* Create the root directory inode */
 	ret = -ENOMEM;
 	inode = new_inode(sb);
 	if (!inode)
@@ -1132,20 +1283,20 @@ static int zonefs_fill_super(struct super_block *sb, void *data, int silent)
 	inode->i_mode = S_IFDIR | 0555;
 	inode->i_ctime = inode->i_mtime = inode->i_atime = current_time(inode);
 	inode->i_op = &zonefs_dir_inode_operations;
-	inode->i_fop = &simple_dir_operations;
+	inode->i_fop = &zonefs_dir_operations;
+	inode->i_size = 2;
 	set_nlink(inode, 2);
+	for (ztype = 0; ztype < ZONEFS_ZTYPE_MAX; ztype++) {
+		if (sbi->s_zgroup[ztype].g_nr_zones) {
+			inc_nlink(inode);
+			inode->i_size++;
+		}
+	}
 
 	sb->s_root = d_make_root(inode);
 	if (!sb->s_root)
 		goto cleanup;
 
-	/* Create and populate files in zone groups directories */
-	for (t = 0; t < ZONEFS_ZTYPE_MAX; t++) {
-		ret = zonefs_create_zgroup_inodes(sb, t);
-		if (ret)
-			goto cleanup;
-	}
-
 	ret = zonefs_sysfs_register(sb);
 	if (ret)
 		goto cleanup;
@@ -1168,12 +1319,10 @@ static void zonefs_kill_super(struct super_block *sb)
 {
 	struct zonefs_sb_info *sbi = ZONEFS_SB(sb);
 
-	if (sb->s_root)
-		d_genocide(sb->s_root);
+	kill_block_super(sb);
 
 	zonefs_sysfs_unregister(sb);
 	zonefs_free_zgroups(sb);
-	kill_block_super(sb);
 	kfree(sbi);
 }
 
diff --git a/fs/zonefs/zonefs.h b/fs/zonefs/zonefs.h
index 2d626e18b141..f88466a4158b 100644
--- a/fs/zonefs/zonefs.h
+++ b/fs/zonefs/zonefs.h
@@ -64,6 +64,11 @@ struct zonefs_zone {
 
 	/* Write pointer offset in the zone (sequential zones only, bytes) */
 	loff_t			z_wpoffset;
+
+	/* Saved inode uid, gid and access rights */
+	umode_t			z_mode;
+	kuid_t			z_uid;
+	kgid_t			z_gid;
 };
 
 /*
@@ -265,6 +270,10 @@ static inline void zonefs_io_error(struct inode *inode, bool write)
 	mutex_unlock(&zi->i_truncate_mutex);
 }
 
+/* In super.c */
+extern const struct inode_operations zonefs_dir_inode_operations;
+extern const struct file_operations zonefs_dir_operations;
+
 /* In file.c */
 extern const struct address_space_operations zonefs_file_aops;
 extern const struct file_operations zonefs_file_operations;
author	Damien Le Moal <damien.lemoal@opensource.wdc.com>	2022-11-30 11:01:09 +0900
committer	Damien Le Moal <damien.lemoal@opensource.wdc.com>	2023-01-23 09:25:51 +0900
commit	d207794ababe5c3ad72e965c5e1023cfaf4ab1bb (patch)
tree	a0485995788cf4f36b097ba07fe966c9f4c28434 /fs/zonefs
parent	aa7f243f32e1d18036ee00d71d3ccfad70ae2121 (diff)