--- /dev/null
+From 5b6e7e120e716231a0bf9ad201438d72473e396d Mon Sep 17 00:00:00 2001
+From: Yue Hu <huyue2@yulong.com>
+Date: Thu, 14 Oct 2021 14:57:44 +0800
+Subject: [PATCH 1000/1012] erofs: remove the fast path of per-CPU buffer
+ decompression
+
+As Xiang mentioned, such path has no real impact to our current
+decompression strategy, remove it directly. Also, update the return
+value of z_erofs_lz4_decompress() to 0 if success to keep consistent
+with LZMA which will return 0 as well for that case.
+
+Link: https://lore.kernel.org/r/20211014065744.1787-1-zbestahu@gmail.com
+Reviewed-by: Gao Xiang <hsiangkao@linux.alibaba.com>
+Signed-off-by: Yue Hu <huyue2@yulong.com>
+Signed-off-by: Gao Xiang <hsiangkao@linux.alibaba.com>
+---
+ fs/erofs/decompressor.c | 63 ++++++-----------------------------------
+ 1 file changed, 8 insertions(+), 55 deletions(-)
+
+diff --git a/fs/erofs/decompressor.c b/fs/erofs/decompressor.c
+index a5bc4b1b7..dce06ac61 100644
+--- a/fs/erofs/decompressor.c
++++ b/fs/erofs/decompressor.c
+@@ -242,6 +242,8 @@ static int z_erofs_lz4_decompress(struct z_erofs_decompress_req *rq, u8 *out)
+ if (ret >= 0)
+ memset(out + ret, 0, rq->outputsize - ret);
+ ret = -EIO;
++ } else {
++ ret = 0;
+ }
+
+ if (maptype == 0) {
+@@ -268,33 +270,6 @@ static struct z_erofs_decompressor decompressors[] = {
+ },
+ };
+
+-static void copy_from_pcpubuf(struct page **out, const char *dst,
+- unsigned short pageofs_out,
+- unsigned int outputsize)
+-{
+- const char *end = dst + outputsize;
+- const unsigned int righthalf = PAGE_SIZE - pageofs_out;
+- const char *cur = dst - pageofs_out;
+-
+- while (cur < end) {
+- struct page *const page = *out++;
+-
+- if (page) {
+- char *buf = kmap_atomic(page);
+-
+- if (cur >= dst) {
+- memcpy(buf, cur, min_t(uint, PAGE_SIZE,
+- end - cur));
+- } else {
+- memcpy(buf + pageofs_out, cur + pageofs_out,
+- min_t(uint, righthalf, end - cur));
+- }
+- kunmap_atomic(buf);
+- }
+- cur += PAGE_SIZE;
+- }
+-}
+-
+ static int z_erofs_decompress_generic(struct z_erofs_decompress_req *rq,
+ struct list_head *pagepool)
+ {
+@@ -305,34 +280,12 @@ static int z_erofs_decompress_generic(struct z_erofs_decompress_req *rq,
+ void *dst;
+ int ret;
+
+- /* two optimized fast paths only for non bigpcluster cases yet */
+- if (rq->inputsize <= PAGE_SIZE) {
+- if (nrpages_out == 1 && !rq->inplace_io) {
+- DBG_BUGON(!*rq->out);
+- dst = kmap_atomic(*rq->out);
+- dst_maptype = 0;
+- goto dstmap_out;
+- }
+-
+- /*
+- * For the case of small output size (especially much less
+- * than PAGE_SIZE), memcpy the decompressed data rather than
+- * compressed data is preferred.
+- */
+- if (rq->outputsize <= PAGE_SIZE * 7 / 8) {
+- dst = erofs_get_pcpubuf(1);
+- if (IS_ERR(dst))
+- return PTR_ERR(dst);
+-
+- rq->inplace_io = false;
+- ret = alg->decompress(rq, dst);
+- if (!ret)
+- copy_from_pcpubuf(rq->out, dst, rq->pageofs_out,
+- rq->outputsize);
+-
+- erofs_put_pcpubuf(dst);
+- return ret;
+- }
++ /* one optimized fast path only for non bigpcluster cases yet */
++ if (rq->inputsize <= PAGE_SIZE && nrpages_out == 1 && !rq->inplace_io) {
++ DBG_BUGON(!*rq->out);
++ dst = kmap_atomic(*rq->out);
++ dst_maptype = 0;
++ goto dstmap_out;
+ }
+
+ /* general decoding path which can be used for all cases */
+--
+2.32.0
+
--- /dev/null
+From e62424651f43cb37e17ca26a7ee9ee42675f24bd Mon Sep 17 00:00:00 2001
+From: Gao Xiang <hsiangkao@linux.alibaba.com>
+Date: Thu, 7 Oct 2021 15:02:23 +0800
+Subject: [PATCH 1001/1012] erofs: decouple basic mount options from fs_context
+
+Previously, EROFS mount options are all in the basic types, so
+erofs_fs_context can be directly copied with assignment. However,
+when the multiple device feature is introduced, it's hard to handle
+multiple device information like the other basic mount options.
+
+Let's separate basic mount option usage from fs_context, thus
+multiple device information can be handled gracefully then.
+
+No logic changes.
+
+Link: https://lore.kernel.org/r/20211007070224.12833-1-hsiangkao@linux.alibaba.com
+Reviewed-by: Chao Yu <chao@kernel.org>
+Reviewed-by: Liu Bo <bo.liu@linux.alibaba.com>
+Signed-off-by: Gao Xiang <hsiangkao@linux.alibaba.com>
+---
+ fs/erofs/inode.c | 2 +-
+ fs/erofs/internal.h | 16 ++++++++-----
+ fs/erofs/super.c | 58 ++++++++++++++++++++++-----------------------
+ fs/erofs/xattr.c | 4 ++--
+ fs/erofs/zdata.c | 8 +++----
+ 5 files changed, 45 insertions(+), 43 deletions(-)
+
+diff --git a/fs/erofs/inode.c b/fs/erofs/inode.c
+index a552399e2..2345f1de4 100644
+--- a/fs/erofs/inode.c
++++ b/fs/erofs/inode.c
+@@ -192,7 +192,7 @@ static struct page *erofs_read_inode(struct inode *inode,
+ inode->i_atime.tv_nsec = inode->i_ctime.tv_nsec;
+
+ inode->i_flags &= ~S_DAX;
+- if (test_opt(&sbi->ctx, DAX_ALWAYS) && S_ISREG(inode->i_mode) &&
++ if (test_opt(&sbi->opt, DAX_ALWAYS) && S_ISREG(inode->i_mode) &&
+ vi->datalayout == EROFS_INODE_FLAT_PLAIN)
+ inode->i_flags |= S_DAX;
+ if (!nblks)
+diff --git a/fs/erofs/internal.h b/fs/erofs/internal.h
+index 9524e155b..b1b9d1b5c 100644
+--- a/fs/erofs/internal.h
++++ b/fs/erofs/internal.h
+@@ -47,7 +47,7 @@ typedef u64 erofs_off_t;
+ /* data type for filesystem-wide blocks number */
+ typedef u32 erofs_blk_t;
+
+-struct erofs_fs_context {
++struct erofs_mount_opts {
+ #ifdef CONFIG_EROFS_FS_ZIP
+ /* current strategy of how to use managed cache */
+ unsigned char cache_strategy;
+@@ -60,6 +60,10 @@ struct erofs_fs_context {
+ unsigned int mount_opt;
+ };
+
++struct erofs_fs_context {
++ struct erofs_mount_opts opt;
++};
++
+ /* all filesystem-wide lz4 configurations */
+ struct erofs_sb_lz4_info {
+ /* # of pages needed for EROFS lz4 rolling decompression */
+@@ -69,6 +73,8 @@ struct erofs_sb_lz4_info {
+ };
+
+ struct erofs_sb_info {
++ struct erofs_mount_opts opt; /* options */
++
+ #ifdef CONFIG_EROFS_FS_ZIP
+ /* list for all registered superblocks, mainly for shrinker */
+ struct list_head list;
+@@ -108,8 +114,6 @@ struct erofs_sb_info {
+ u8 volume_name[16]; /* volume name */
+ u32 feature_compat;
+ u32 feature_incompat;
+-
+- struct erofs_fs_context ctx; /* options */
+ };
+
+ #define EROFS_SB(sb) ((struct erofs_sb_info *)(sb)->s_fs_info)
+@@ -121,9 +125,9 @@ struct erofs_sb_info {
+ #define EROFS_MOUNT_DAX_ALWAYS 0x00000040
+ #define EROFS_MOUNT_DAX_NEVER 0x00000080
+
+-#define clear_opt(ctx, option) ((ctx)->mount_opt &= ~EROFS_MOUNT_##option)
+-#define set_opt(ctx, option) ((ctx)->mount_opt |= EROFS_MOUNT_##option)
+-#define test_opt(ctx, option) ((ctx)->mount_opt & EROFS_MOUNT_##option)
++#define clear_opt(opt, option) ((opt)->mount_opt &= ~EROFS_MOUNT_##option)
++#define set_opt(opt, option) ((opt)->mount_opt |= EROFS_MOUNT_##option)
++#define test_opt(opt, option) ((opt)->mount_opt & EROFS_MOUNT_##option)
+
+ enum {
+ EROFS_ZIP_CACHE_DISABLED,
+diff --git a/fs/erofs/super.c b/fs/erofs/super.c
+index 11b88559f..25f6b8b37 100644
+--- a/fs/erofs/super.c
++++ b/fs/erofs/super.c
+@@ -340,15 +340,15 @@ static int erofs_read_superblock(struct super_block *sb)
+ static void erofs_default_options(struct erofs_fs_context *ctx)
+ {
+ #ifdef CONFIG_EROFS_FS_ZIP
+- ctx->cache_strategy = EROFS_ZIP_CACHE_READAROUND;
+- ctx->max_sync_decompress_pages = 3;
+- ctx->readahead_sync_decompress = false;
++ ctx->opt.cache_strategy = EROFS_ZIP_CACHE_READAROUND;
++ ctx->opt.max_sync_decompress_pages = 3;
++ ctx->opt.readahead_sync_decompress = false;
+ #endif
+ #ifdef CONFIG_EROFS_FS_XATTR
+- set_opt(ctx, XATTR_USER);
++ set_opt(&ctx->opt, XATTR_USER);
+ #endif
+ #ifdef CONFIG_EROFS_FS_POSIX_ACL
+- set_opt(ctx, POSIX_ACL);
++ set_opt(&ctx->opt, POSIX_ACL);
+ #endif
+ }
+
+@@ -392,12 +392,12 @@ static bool erofs_fc_set_dax_mode(struct fs_context *fc, unsigned int mode)
+ switch (mode) {
+ case EROFS_MOUNT_DAX_ALWAYS:
+ warnfc(fc, "DAX enabled. Warning: EXPERIMENTAL, use at your own risk");
+- set_opt(ctx, DAX_ALWAYS);
+- clear_opt(ctx, DAX_NEVER);
++ set_opt(&ctx->opt, DAX_ALWAYS);
++ clear_opt(&ctx->opt, DAX_NEVER);
+ return true;
+ case EROFS_MOUNT_DAX_NEVER:
+- set_opt(ctx, DAX_NEVER);
+- clear_opt(ctx, DAX_ALWAYS);
++ set_opt(&ctx->opt, DAX_NEVER);
++ clear_opt(&ctx->opt, DAX_ALWAYS);
+ return true;
+ default:
+ DBG_BUGON(1);
+@@ -424,9 +424,9 @@ static int erofs_fc_parse_param(struct fs_context *fc,
+ case Opt_user_xattr:
+ #ifdef CONFIG_EROFS_FS_XATTR
+ if (result.boolean)
+- set_opt(ctx, XATTR_USER);
++ set_opt(&ctx->opt, XATTR_USER);
+ else
+- clear_opt(ctx, XATTR_USER);
++ clear_opt(&ctx->opt, XATTR_USER);
+ #else
+ errorfc(fc, "{,no}user_xattr options not supported");
+ #endif
+@@ -434,16 +434,16 @@ static int erofs_fc_parse_param(struct fs_context *fc,
+ case Opt_acl:
+ #ifdef CONFIG_EROFS_FS_POSIX_ACL
+ if (result.boolean)
+- set_opt(ctx, POSIX_ACL);
++ set_opt(&ctx->opt, POSIX_ACL);
+ else
+- clear_opt(ctx, POSIX_ACL);
++ clear_opt(&ctx->opt, POSIX_ACL);
+ #else
+ errorfc(fc, "{,no}acl options not supported");
+ #endif
+ break;
+ case Opt_cache_strategy:
+ #ifdef CONFIG_EROFS_FS_ZIP
+- ctx->cache_strategy = result.uint_32;
++ ctx->opt.cache_strategy = result.uint_32;
+ #else
+ errorfc(fc, "compression not supported, cache_strategy ignored");
+ #endif
+@@ -540,15 +540,16 @@ static int erofs_fc_fill_super(struct super_block *sb, struct fs_context *fc)
+ return -ENOMEM;
+
+ sb->s_fs_info = sbi;
++ sbi->opt = ctx->opt;
+ sbi->dax_dev = fs_dax_get_by_bdev(sb->s_bdev);
+ err = erofs_read_superblock(sb);
+ if (err)
+ return err;
+
+- if (test_opt(ctx, DAX_ALWAYS) &&
++ if (test_opt(&sbi->opt, DAX_ALWAYS) &&
+ !dax_supported(sbi->dax_dev, sb->s_bdev, EROFS_BLKSIZ, 0, bdev_nr_sectors(sb->s_bdev))) {
+ errorfc(fc, "DAX unsupported by block device. Turning off DAX.");
+- clear_opt(ctx, DAX_ALWAYS);
++ clear_opt(&sbi->opt, DAX_ALWAYS);
+ }
+ sb->s_flags |= SB_RDONLY | SB_NOATIME;
+ sb->s_maxbytes = MAX_LFS_FILESIZE;
+@@ -557,13 +558,11 @@ static int erofs_fc_fill_super(struct super_block *sb, struct fs_context *fc)
+ sb->s_op = &erofs_sops;
+ sb->s_xattr = erofs_xattr_handlers;
+
+- if (test_opt(ctx, POSIX_ACL))
++ if (test_opt(&sbi->opt, POSIX_ACL))
+ sb->s_flags |= SB_POSIXACL;
+ else
+ sb->s_flags &= ~SB_POSIXACL;
+
+- sbi->ctx = *ctx;
+-
+ #ifdef CONFIG_EROFS_FS_ZIP
+ xa_init(&sbi->managed_pslots);
+ #endif
+@@ -607,12 +606,12 @@ static int erofs_fc_reconfigure(struct fs_context *fc)
+
+ DBG_BUGON(!sb_rdonly(sb));
+
+- if (test_opt(ctx, POSIX_ACL))
++ if (test_opt(&ctx->opt, POSIX_ACL))
+ fc->sb_flags |= SB_POSIXACL;
+ else
+ fc->sb_flags &= ~SB_POSIXACL;
+
+- sbi->ctx = *ctx;
++ sbi->opt = ctx->opt;
+
+ fc->sb_flags |= SB_RDONLY;
+ return 0;
+@@ -640,7 +639,6 @@ static int erofs_init_fs_context(struct fs_context *fc)
+ erofs_default_options(fc->fs_private);
+
+ fc->ops = &erofs_context_ops;
+-
+ return 0;
+ }
+
+@@ -763,31 +761,31 @@ static int erofs_statfs(struct dentry *dentry, struct kstatfs *buf)
+ static int erofs_show_options(struct seq_file *seq, struct dentry *root)
+ {
+ struct erofs_sb_info *sbi = EROFS_SB(root->d_sb);
+- struct erofs_fs_context *ctx = &sbi->ctx;
++ struct erofs_mount_opts *opt = &sbi->opt;
+
+ #ifdef CONFIG_EROFS_FS_XATTR
+- if (test_opt(ctx, XATTR_USER))
++ if (test_opt(opt, XATTR_USER))
+ seq_puts(seq, ",user_xattr");
+ else
+ seq_puts(seq, ",nouser_xattr");
+ #endif
+ #ifdef CONFIG_EROFS_FS_POSIX_ACL
+- if (test_opt(ctx, POSIX_ACL))
++ if (test_opt(opt, POSIX_ACL))
+ seq_puts(seq, ",acl");
+ else
+ seq_puts(seq, ",noacl");
+ #endif
+ #ifdef CONFIG_EROFS_FS_ZIP
+- if (ctx->cache_strategy == EROFS_ZIP_CACHE_DISABLED)
++ if (opt->cache_strategy == EROFS_ZIP_CACHE_DISABLED)
+ seq_puts(seq, ",cache_strategy=disabled");
+- else if (ctx->cache_strategy == EROFS_ZIP_CACHE_READAHEAD)
++ else if (opt->cache_strategy == EROFS_ZIP_CACHE_READAHEAD)
+ seq_puts(seq, ",cache_strategy=readahead");
+- else if (ctx->cache_strategy == EROFS_ZIP_CACHE_READAROUND)
++ else if (opt->cache_strategy == EROFS_ZIP_CACHE_READAROUND)
+ seq_puts(seq, ",cache_strategy=readaround");
+ #endif
+- if (test_opt(ctx, DAX_ALWAYS))
++ if (test_opt(opt, DAX_ALWAYS))
+ seq_puts(seq, ",dax=always");
+- if (test_opt(ctx, DAX_NEVER))
++ if (test_opt(opt, DAX_NEVER))
+ seq_puts(seq, ",dax=never");
+ return 0;
+ }
+diff --git a/fs/erofs/xattr.c b/fs/erofs/xattr.c
+index 778f2c522..01c581e93 100644
+--- a/fs/erofs/xattr.c
++++ b/fs/erofs/xattr.c
+@@ -429,7 +429,7 @@ static int shared_getxattr(struct inode *inode, struct getxattr_iter *it)
+
+ static bool erofs_xattr_user_list(struct dentry *dentry)
+ {
+- return test_opt(&EROFS_SB(dentry->d_sb)->ctx, XATTR_USER);
++ return test_opt(&EROFS_SB(dentry->d_sb)->opt, XATTR_USER);
+ }
+
+ static bool erofs_xattr_trusted_list(struct dentry *dentry)
+@@ -476,7 +476,7 @@ static int erofs_xattr_generic_get(const struct xattr_handler *handler,
+
+ switch (handler->flags) {
+ case EROFS_XATTR_INDEX_USER:
+- if (!test_opt(&sbi->ctx, XATTR_USER))
++ if (!test_opt(&sbi->opt, XATTR_USER))
+ return -EOPNOTSUPP;
+ break;
+ case EROFS_XATTR_INDEX_TRUSTED:
+diff --git a/fs/erofs/zdata.c b/fs/erofs/zdata.c
+index 11c7a1aae..e59e22852 100644
+--- a/fs/erofs/zdata.c
++++ b/fs/erofs/zdata.c
+@@ -695,7 +695,7 @@ static int z_erofs_do_read_page(struct z_erofs_decompress_frontend *fe,
+ goto err_out;
+
+ /* preload all compressed pages (maybe downgrade role if necessary) */
+- if (should_alloc_managed_pages(fe, sbi->ctx.cache_strategy, map->m_la))
++ if (should_alloc_managed_pages(fe, sbi->opt.cache_strategy, map->m_la))
+ cache_strategy = TRYALLOC;
+ else
+ cache_strategy = DONTALLOC;
+@@ -796,7 +796,7 @@ static void z_erofs_decompress_kickoff(struct z_erofs_decompressqueue *io,
+ /* Use workqueue and sync decompression for atomic contexts only */
+ if (in_atomic() || irqs_disabled()) {
+ queue_work(z_erofs_workqueue, &io->u.work);
+- sbi->ctx.readahead_sync_decompress = true;
++ sbi->opt.readahead_sync_decompress = true;
+ return;
+ }
+ z_erofs_decompressqueue_work(&io->u.work);
+@@ -1411,8 +1411,8 @@ static void z_erofs_readahead(struct readahead_control *rac)
+ struct erofs_sb_info *const sbi = EROFS_I_SB(inode);
+
+ unsigned int nr_pages = readahead_count(rac);
+- bool sync = (sbi->ctx.readahead_sync_decompress &&
+- nr_pages <= sbi->ctx.max_sync_decompress_pages);
++ bool sync = (sbi->opt.readahead_sync_decompress &&
++ nr_pages <= sbi->opt.max_sync_decompress_pages);
+ struct z_erofs_decompress_frontend f = DECOMPRESS_FRONTEND_INIT(inode);
+ struct page *page, *head = NULL;
+ LIST_HEAD(pagepool);
+--
+2.32.0
+
--- /dev/null
+From dfeab2e95a75a424adf39992ac62dcb9e9517d4a Mon Sep 17 00:00:00 2001
+From: Gao Xiang <hsiangkao@linux.alibaba.com>
+Date: Thu, 14 Oct 2021 16:10:10 +0800
+Subject: [PATCH 1002/1012] erofs: add multiple device support
+
+In order to support multi-layer container images, add multiple
+device feature to EROFS. Two ways are available to use for now:
+
+ - Devices can be mapped into 32-bit global block address space;
+ - Device ID can be specified with the chunk indexes format.
+
+Note that it assumes no extent would cross device boundary and mkfs
+should take care of it seriously.
+
+In the future, a dedicated device manager could be introduced then
+thus extra devices can be automatically scanned by UUID as well.
+
+Link: https://lore.kernel.org/r/20211014081010.43485-1-hsiangkao@linux.alibaba.com
+Reviewed-by: Chao Yu <chao@kernel.org>
+Reviewed-by: Liu Bo <bo.liu@linux.alibaba.com>
+Signed-off-by: Gao Xiang <hsiangkao@linux.alibaba.com>
+---
+ Documentation/filesystems/erofs.rst | 12 ++-
+ fs/erofs/Kconfig | 24 +++--
+ fs/erofs/data.c | 73 ++++++++++---
+ fs/erofs/erofs_fs.h | 22 +++-
+ fs/erofs/internal.h | 35 ++++++-
+ fs/erofs/super.c | 156 ++++++++++++++++++++++++++--
+ fs/erofs/zdata.c | 20 +++-
+ 7 files changed, 296 insertions(+), 46 deletions(-)
+
+diff --git a/Documentation/filesystems/erofs.rst b/Documentation/filesystems/erofs.rst
+index b97579b7d..01df283c7 100644
+--- a/Documentation/filesystems/erofs.rst
++++ b/Documentation/filesystems/erofs.rst
+@@ -19,9 +19,10 @@ It is designed as a better filesystem solution for the following scenarios:
+ immutable and bit-for-bit identical to the official golden image for
+ their releases due to security and other considerations and
+
+- - hope to save some extra storage space with guaranteed end-to-end performance
+- by using reduced metadata and transparent file compression, especially
+- for those embedded devices with limited memory (ex, smartphone);
++ - hope to minimize extra storage space with guaranteed end-to-end performance
++ by using compact layout, transparent file compression and direct access,
++ especially for those embedded devices with limited memory and high-density
++ hosts with numerous containers;
+
+ Here is the main features of EROFS:
+
+@@ -51,7 +52,9 @@ Here is the main features of EROFS:
+ - Support POSIX.1e ACLs by using xattrs;
+
+ - Support transparent data compression as an option:
+- LZ4 algorithm with the fixed-sized output compression for high performance.
++ LZ4 algorithm with the fixed-sized output compression for high performance;
++
++ - Multiple device support for multi-layer container images.
+
+ The following git tree provides the file system user-space tools under
+ development (ex, formatting tool mkfs.erofs):
+@@ -87,6 +90,7 @@ cache_strategy=%s Select a strategy for cached decompression from now on:
+ dax={always,never} Use direct access (no page cache). See
+ Documentation/filesystems/dax.rst.
+ dax A legacy option which is an alias for ``dax=always``.
++device=%s Specify a path to an extra device to be used together.
+ =================== =========================================================
+
+ On-disk details
+diff --git a/fs/erofs/Kconfig b/fs/erofs/Kconfig
+index 14b747026..addfe608d 100644
+--- a/fs/erofs/Kconfig
++++ b/fs/erofs/Kconfig
+@@ -6,16 +6,22 @@ config EROFS_FS
+ select FS_IOMAP
+ select LIBCRC32C
+ help
+- EROFS (Enhanced Read-Only File System) is a lightweight
+- read-only file system with modern designs (eg. page-sized
+- blocks, inline xattrs/data, etc.) for scenarios which need
+- high-performance read-only requirements, e.g. Android OS
+- for mobile phones and LIVECDs.
++ EROFS (Enhanced Read-Only File System) is a lightweight read-only
++ file system with modern designs (e.g. no buffer heads, inline
++ xattrs/data, chunk-based deduplication, multiple devices, etc.) for
++ scenarios which need high-performance read-only solutions, e.g.
++ smartphones with Android OS, LiveCDs and high-density hosts with
++ numerous containers;
+
+- It also provides fixed-sized output compression support,
+- which improves storage density, keeps relatively higher
+- compression ratios, which is more useful to achieve high
+- performance for embedded devices with limited memory.
++ It also provides fixed-sized output compression support in order to
++ improve storage density as well as keep relatively higher compression
++ ratios and implements in-place decompression to reuse the file page
++ for compressed data temporarily with proper strategies, which is
++ quite useful to ensure guaranteed end-to-end runtime decompression
++ performance under extremely memory pressure without extra cost.
++
++ See the documentation at <file:Documentation/filesystems/erofs.rst>
++ for more details.
+
+ If unsure, say N.
+
+diff --git a/fs/erofs/data.c b/fs/erofs/data.c
+index 9db829715..808234d91 100644
+--- a/fs/erofs/data.c
++++ b/fs/erofs/data.c
+@@ -89,6 +89,7 @@ static int erofs_map_blocks(struct inode *inode,
+ erofs_off_t pos;
+ int err = 0;
+
++ map->m_deviceid = 0;
+ if (map->m_la >= inode->i_size) {
+ /* leave out-of-bound access unmapped */
+ map->m_flags = 0;
+@@ -135,14 +136,8 @@ static int erofs_map_blocks(struct inode *inode,
+ map->m_flags = 0;
+ break;
+ default:
+- /* only one device is supported for now */
+- if (idx->device_id) {
+- erofs_err(sb, "invalid device id %u @ %llu for nid %llu",
+- le16_to_cpu(idx->device_id),
+- chunknr, vi->nid);
+- err = -EFSCORRUPTED;
+- goto out_unlock;
+- }
++ map->m_deviceid = le16_to_cpu(idx->device_id) &
++ EROFS_SB(sb)->device_id_mask;
+ map->m_pa = blknr_to_addr(le32_to_cpu(idx->blkaddr));
+ map->m_flags = EROFS_MAP_MAPPED;
+ break;
+@@ -155,11 +150,55 @@ static int erofs_map_blocks(struct inode *inode,
+ return err;
+ }
+
++int erofs_map_dev(struct super_block *sb, struct erofs_map_dev *map)
++{
++ struct erofs_dev_context *devs = EROFS_SB(sb)->devs;
++ struct erofs_device_info *dif;
++ int id;
++
++ /* primary device by default */
++ map->m_bdev = sb->s_bdev;
++ map->m_daxdev = EROFS_SB(sb)->dax_dev;
++
++ if (map->m_deviceid) {
++ down_read(&devs->rwsem);
++ dif = idr_find(&devs->tree, map->m_deviceid - 1);
++ if (!dif) {
++ up_read(&devs->rwsem);
++ return -ENODEV;
++ }
++ map->m_bdev = dif->bdev;
++ map->m_daxdev = dif->dax_dev;
++ up_read(&devs->rwsem);
++ } else if (devs->extra_devices) {
++ down_read(&devs->rwsem);
++ idr_for_each_entry(&devs->tree, dif, id) {
++ erofs_off_t startoff, length;
++
++ if (!dif->mapped_blkaddr)
++ continue;
++ startoff = blknr_to_addr(dif->mapped_blkaddr);
++ length = blknr_to_addr(dif->blocks);
++
++ if (map->m_pa >= startoff &&
++ map->m_pa < startoff + length) {
++ map->m_pa -= startoff;
++ map->m_bdev = dif->bdev;
++ map->m_daxdev = dif->dax_dev;
++ break;
++ }
++ }
++ up_read(&devs->rwsem);
++ }
++ return 0;
++}
++
+ static int erofs_iomap_begin(struct inode *inode, loff_t offset, loff_t length,
+ unsigned int flags, struct iomap *iomap, struct iomap *srcmap)
+ {
+ int ret;
+ struct erofs_map_blocks map;
++ struct erofs_map_dev mdev;
+
+ map.m_la = offset;
+ map.m_llen = length;
+@@ -168,8 +207,16 @@ static int erofs_iomap_begin(struct inode *inode, loff_t offset, loff_t length,
+ if (ret < 0)
+ return ret;
+
+- iomap->bdev = inode->i_sb->s_bdev;
+- iomap->dax_dev = EROFS_I_SB(inode)->dax_dev;
++ mdev = (struct erofs_map_dev) {
++ .m_deviceid = map.m_deviceid,
++ .m_pa = map.m_pa,
++ };
++ ret = erofs_map_dev(inode->i_sb, &mdev);
++ if (ret)
++ return ret;
++
++ iomap->bdev = mdev.m_bdev;
++ iomap->dax_dev = mdev.m_daxdev;
+ iomap->offset = map.m_la;
+ iomap->length = map.m_llen;
+ iomap->flags = 0;
+@@ -188,15 +235,15 @@ static int erofs_iomap_begin(struct inode *inode, loff_t offset, loff_t length,
+
+ iomap->type = IOMAP_INLINE;
+ ipage = erofs_get_meta_page(inode->i_sb,
+- erofs_blknr(map.m_pa));
++ erofs_blknr(mdev.m_pa));
+ if (IS_ERR(ipage))
+ return PTR_ERR(ipage);
+ iomap->inline_data = page_address(ipage) +
+- erofs_blkoff(map.m_pa);
++ erofs_blkoff(mdev.m_pa);
+ iomap->private = ipage;
+ } else {
+ iomap->type = IOMAP_MAPPED;
+- iomap->addr = map.m_pa;
++ iomap->addr = mdev.m_pa;
+ }
+ return 0;
+ }
+diff --git a/fs/erofs/erofs_fs.h b/fs/erofs/erofs_fs.h
+index b0b23f41a..e480b3854 100644
+--- a/fs/erofs/erofs_fs.h
++++ b/fs/erofs/erofs_fs.h
+@@ -21,14 +21,27 @@
+ #define EROFS_FEATURE_INCOMPAT_COMPR_CFGS 0x00000002
+ #define EROFS_FEATURE_INCOMPAT_BIG_PCLUSTER 0x00000002
+ #define EROFS_FEATURE_INCOMPAT_CHUNKED_FILE 0x00000004
++#define EROFS_FEATURE_INCOMPAT_DEVICE_TABLE 0x00000008
+ #define EROFS_ALL_FEATURE_INCOMPAT \
+ (EROFS_FEATURE_INCOMPAT_LZ4_0PADDING | \
+ EROFS_FEATURE_INCOMPAT_COMPR_CFGS | \
+ EROFS_FEATURE_INCOMPAT_BIG_PCLUSTER | \
+- EROFS_FEATURE_INCOMPAT_CHUNKED_FILE)
++ EROFS_FEATURE_INCOMPAT_CHUNKED_FILE | \
++ EROFS_FEATURE_INCOMPAT_DEVICE_TABLE)
+
+ #define EROFS_SB_EXTSLOT_SIZE 16
+
++struct erofs_deviceslot {
++ union {
++ u8 uuid[16]; /* used for device manager later */
++ u8 userdata[64]; /* digest(sha256), etc. */
++ } u;
++ __le32 blocks; /* total fs blocks of this device */
++ __le32 mapped_blkaddr; /* map starting at mapped_blkaddr */
++ u8 reserved[56];
++};
++#define EROFS_DEVT_SLOT_SIZE sizeof(struct erofs_deviceslot)
++
+ /* erofs on-disk super block (currently 128 bytes) */
+ struct erofs_super_block {
+ __le32 magic; /* file system magic number */
+@@ -54,7 +67,9 @@ struct erofs_super_block {
+ /* customized sliding window size instead of 64k by default */
+ __le16 lz4_max_distance;
+ } __packed u1;
+- __u8 reserved2[42];
++ __le16 extra_devices; /* # of devices besides the primary device */
++ __le16 devt_slotoff; /* startoff = devt_slotoff * devt_slotsize */
++ __u8 reserved2[38];
+ };
+
+ /*
+@@ -238,7 +253,7 @@ static inline unsigned int erofs_xattr_entry_size(struct erofs_xattr_entry *e)
+ /* 8-byte inode chunk indexes */
+ struct erofs_inode_chunk_index {
+ __le16 advise; /* always 0, don't care for now */
+- __le16 device_id; /* back-end storage id, always 0 for now */
++ __le16 device_id; /* back-end storage id (with bits masked) */
+ __le32 blkaddr; /* start block address of this inode chunk */
+ };
+
+@@ -384,6 +399,7 @@ static inline void erofs_check_ondisk_layout_definitions(void)
+ /* keep in sync between 2 index structures for better extendibility */
+ BUILD_BUG_ON(sizeof(struct erofs_inode_chunk_index) !=
+ sizeof(struct z_erofs_vle_decompressed_index));
++ BUILD_BUG_ON(sizeof(struct erofs_deviceslot) != 128);
+
+ BUILD_BUG_ON(BIT(Z_EROFS_VLE_DI_CLUSTER_TYPE_BITS) <
+ Z_EROFS_VLE_CLUSTER_TYPE_MAX - 1);
+diff --git a/fs/erofs/internal.h b/fs/erofs/internal.h
+index b1b9d1b5c..0661d7d69 100644
+--- a/fs/erofs/internal.h
++++ b/fs/erofs/internal.h
+@@ -47,6 +47,15 @@ typedef u64 erofs_off_t;
+ /* data type for filesystem-wide blocks number */
+ typedef u32 erofs_blk_t;
+
++struct erofs_device_info {
++ char *path;
++ struct block_device *bdev;
++ struct dax_device *dax_dev;
++
++ u32 blocks;
++ u32 mapped_blkaddr;
++};
++
+ struct erofs_mount_opts {
+ #ifdef CONFIG_EROFS_FS_ZIP
+ /* current strategy of how to use managed cache */
+@@ -60,8 +69,16 @@ struct erofs_mount_opts {
+ unsigned int mount_opt;
+ };
+
++struct erofs_dev_context {
++ struct idr tree;
++ struct rw_semaphore rwsem;
++
++ unsigned int extra_devices;
++};
++
+ struct erofs_fs_context {
+ struct erofs_mount_opts opt;
++ struct erofs_dev_context *devs;
+ };
+
+ /* all filesystem-wide lz4 configurations */
+@@ -74,7 +91,6 @@ struct erofs_sb_lz4_info {
+
+ struct erofs_sb_info {
+ struct erofs_mount_opts opt; /* options */
+-
+ #ifdef CONFIG_EROFS_FS_ZIP
+ /* list for all registered superblocks, mainly for shrinker */
+ struct list_head list;
+@@ -91,12 +107,16 @@ struct erofs_sb_info {
+
+ struct erofs_sb_lz4_info lz4;
+ #endif /* CONFIG_EROFS_FS_ZIP */
++ struct erofs_dev_context *devs;
+ struct dax_device *dax_dev;
+- u32 blocks;
++ u64 total_blocks;
++ u32 primarydevice_blocks;
++
+ u32 meta_blkaddr;
+ #ifdef CONFIG_EROFS_FS_XATTR
+ u32 xattr_blkaddr;
+ #endif
++ u16 device_id_mask; /* valid bits of device id to be used */
+
+ /* inode slot unit size in bit shift */
+ unsigned char islotbits;
+@@ -241,6 +261,7 @@ static inline bool erofs_sb_has_##name(struct erofs_sb_info *sbi) \
+ EROFS_FEATURE_FUNCS(lz4_0padding, incompat, INCOMPAT_LZ4_0PADDING)
+ EROFS_FEATURE_FUNCS(compr_cfgs, incompat, INCOMPAT_COMPR_CFGS)
+ EROFS_FEATURE_FUNCS(big_pcluster, incompat, INCOMPAT_BIG_PCLUSTER)
++EROFS_FEATURE_FUNCS(device_table, incompat, INCOMPAT_DEVICE_TABLE)
+ EROFS_FEATURE_FUNCS(sb_chksum, compat, COMPAT_SB_CHKSUM)
+
+ /* atomic flag definitions */
+@@ -359,6 +380,7 @@ struct erofs_map_blocks {
+ erofs_off_t m_pa, m_la;
+ u64 m_plen, m_llen;
+
++ unsigned short m_deviceid;
+ unsigned int m_flags;
+
+ struct page *mpage;
+@@ -390,9 +412,18 @@ static inline int z_erofs_map_blocks_iter(struct inode *inode,
+ }
+ #endif /* !CONFIG_EROFS_FS_ZIP */
+
++struct erofs_map_dev {
++ struct block_device *m_bdev;
++ struct dax_device *m_daxdev;
++
++ erofs_off_t m_pa;
++ unsigned int m_deviceid;
++};
++
+ /* data.c */
+ extern const struct file_operations erofs_file_fops;
+ struct page *erofs_get_meta_page(struct super_block *sb, erofs_blk_t blkaddr);
++int erofs_map_dev(struct super_block *sb, struct erofs_map_dev *dev);
+ int erofs_fiemap(struct inode *inode, struct fiemap_extent_info *fieinfo,
+ u64 start, u64 len);
+
+diff --git a/fs/erofs/super.c b/fs/erofs/super.c
+index 25f6b8b37..2cfe1ce0f 100644
+--- a/fs/erofs/super.c
++++ b/fs/erofs/super.c
+@@ -252,6 +252,79 @@ static int erofs_load_compr_cfgs(struct super_block *sb,
+ }
+ #endif
+
++static int erofs_init_devices(struct super_block *sb,
++ struct erofs_super_block *dsb)
++{
++ struct erofs_sb_info *sbi = EROFS_SB(sb);
++ unsigned int ondisk_extradevs;
++ erofs_off_t pos;
++ struct page *page = NULL;
++ struct erofs_device_info *dif;
++ struct erofs_deviceslot *dis;
++ void *ptr;
++ int id, err = 0;
++
++ sbi->total_blocks = sbi->primarydevice_blocks;
++ if (!erofs_sb_has_device_table(sbi))
++ ondisk_extradevs = 0;
++ else
++ ondisk_extradevs = le16_to_cpu(dsb->extra_devices);
++
++ if (ondisk_extradevs != sbi->devs->extra_devices) {
++ erofs_err(sb, "extra devices don't match (ondisk %u, given %u)",
++ ondisk_extradevs, sbi->devs->extra_devices);
++ return -EINVAL;
++ }
++ if (!ondisk_extradevs)
++ return 0;
++
++ sbi->device_id_mask = roundup_pow_of_two(ondisk_extradevs + 1) - 1;
++ pos = le16_to_cpu(dsb->devt_slotoff) * EROFS_DEVT_SLOT_SIZE;
++ down_read(&sbi->devs->rwsem);
++ idr_for_each_entry(&sbi->devs->tree, dif, id) {
++ erofs_blk_t blk = erofs_blknr(pos);
++ struct block_device *bdev;
++
++ if (!page || page->index != blk) {
++ if (page) {
++ kunmap(page);
++ unlock_page(page);
++ put_page(page);
++ }
++
++ page = erofs_get_meta_page(sb, blk);
++ if (IS_ERR(page)) {
++ up_read(&sbi->devs->rwsem);
++ return PTR_ERR(page);
++ }
++ ptr = kmap(page);
++ }
++ dis = ptr + erofs_blkoff(pos);
++
++ bdev = blkdev_get_by_path(dif->path,
++ FMODE_READ | FMODE_EXCL,
++ sb->s_type);
++ if (IS_ERR(bdev)) {
++ err = PTR_ERR(bdev);
++ goto err_out;
++ }
++ dif->bdev = bdev;
++ dif->dax_dev = fs_dax_get_by_bdev(bdev);
++ dif->blocks = le32_to_cpu(dis->blocks);
++ dif->mapped_blkaddr = le32_to_cpu(dis->mapped_blkaddr);
++ sbi->total_blocks += dif->blocks;
++ pos += EROFS_DEVT_SLOT_SIZE;
++ }
++err_out:
++ up_read(&sbi->devs->rwsem);
++ if (page) {
++ kunmap(page);
++ unlock_page(page);
++ put_page(page);
++ }
++ return err;
++}
++
+ static int erofs_read_superblock(struct super_block *sb)
+ {
+ struct erofs_sb_info *sbi;
+@@ -303,7 +376,7 @@ static int erofs_read_superblock(struct super_block *sb)
+ sbi->sb_size);
+ goto out;
+ }
+- sbi->blocks = le32_to_cpu(dsb->blocks);
++ sbi->primarydevice_blocks = le32_to_cpu(dsb->blocks);
+ sbi->meta_blkaddr = le32_to_cpu(dsb->meta_blkaddr);
+ #ifdef CONFIG_EROFS_FS_XATTR
+ sbi->xattr_blkaddr = le32_to_cpu(dsb->xattr_blkaddr);
+@@ -330,6 +403,11 @@ static int erofs_read_superblock(struct super_block *sb)
+ ret = erofs_load_compr_cfgs(sb, dsb);
+ else
+ ret = z_erofs_load_lz4_config(sb, dsb, NULL, 0);
++ if (ret < 0)
++ goto out;
++
++ /* handle multiple devices */
++ ret = erofs_init_devices(sb, dsb);
+ out:
+ kunmap(page);
+ put_page(page);
+@@ -358,6 +436,7 @@ enum {
+ Opt_cache_strategy,
+ Opt_dax,
+ Opt_dax_enum,
++ Opt_device,
+ Opt_err
+ };
+
+@@ -381,6 +460,7 @@ static const struct fs_parameter_spec erofs_fs_parameters[] = {
+ erofs_param_cache_strategy),
+ fsparam_flag("dax", Opt_dax),
+ fsparam_enum("dax", Opt_dax_enum, erofs_dax_param_enums),
++ fsparam_string("device", Opt_device),
+ {}
+ };
+
+@@ -412,9 +492,10 @@ static bool erofs_fc_set_dax_mode(struct fs_context *fc, unsigned int mode)
+ static int erofs_fc_parse_param(struct fs_context *fc,
+ struct fs_parameter *param)
+ {
+- struct erofs_fs_context *ctx __maybe_unused = fc->fs_private;
++ struct erofs_fs_context *ctx = fc->fs_private;
+ struct fs_parse_result result;
+- int opt;
++ struct erofs_device_info *dif;
++ int opt, ret;
+
+ opt = fs_parse(fc, erofs_fs_parameters, param, &result);
+ if (opt < 0)
+@@ -456,6 +537,25 @@ static int erofs_fc_parse_param(struct fs_context *fc,
+ if (!erofs_fc_set_dax_mode(fc, result.uint_32))
+ return -EINVAL;
+ break;
++ case Opt_device:
++ dif = kzalloc(sizeof(*dif), GFP_KERNEL);
++ if (!dif)
++ return -ENOMEM;
++ dif->path = kstrdup(param->string, GFP_KERNEL);
++ if (!dif->path) {
++ kfree(dif);
++ return -ENOMEM;
++ }
++ down_write(&ctx->devs->rwsem);
++ ret = idr_alloc(&ctx->devs->tree, dif, 0, 0, GFP_KERNEL);
++ up_write(&ctx->devs->rwsem);
++ if (ret < 0) {
++ kfree(dif->path);
++ kfree(dif);
++ return ret;
++ }
++ ++ctx->devs->extra_devices;
++ break;
+ default:
+ return -ENOPARAM;
+ }
+@@ -542,6 +642,9 @@ static int erofs_fc_fill_super(struct super_block *sb, struct fs_context *fc)
+ sb->s_fs_info = sbi;
+ sbi->opt = ctx->opt;
+ sbi->dax_dev = fs_dax_get_by_bdev(sb->s_bdev);
++ sbi->devs = ctx->devs;
++ ctx->devs = NULL;
++
+ err = erofs_read_superblock(sb);
+ if (err)
+ return err;
+@@ -617,9 +720,33 @@ static int erofs_fc_reconfigure(struct fs_context *fc)
+ return 0;
+ }
+
++static int erofs_release_device_info(int id, void *ptr, void *data)
++{
++ struct erofs_device_info *dif = ptr;
++
++ fs_put_dax(dif->dax_dev);
++ if (dif->bdev)
++ blkdev_put(dif->bdev, FMODE_READ | FMODE_EXCL);
++ kfree(dif->path);
++ kfree(dif);
++ return 0;
++}
++
++static void erofs_free_dev_context(struct erofs_dev_context *devs)
++{
++ if (!devs)
++ return;
++ idr_for_each(&devs->tree, &erofs_release_device_info, NULL);
++ idr_destroy(&devs->tree);
++ kfree(devs);
++}
++
+ static void erofs_fc_free(struct fs_context *fc)
+ {
+- kfree(fc->fs_private);
++ struct erofs_fs_context *ctx = fc->fs_private;
++
++ erofs_free_dev_context(ctx->devs);
++ kfree(ctx);
+ }
+
+ static const struct fs_context_operations erofs_context_ops = {
+@@ -631,13 +758,20 @@ static const struct fs_context_operations erofs_context_ops = {
+
+ static int erofs_init_fs_context(struct fs_context *fc)
+ {
+- fc->fs_private = kzalloc(sizeof(struct erofs_fs_context), GFP_KERNEL);
+- if (!fc->fs_private)
+- return -ENOMEM;
++ struct erofs_fs_context *ctx = kzalloc(sizeof(*ctx), GFP_KERNEL);
+
+- /* set default mount options */
+- erofs_default_options(fc->fs_private);
++ if (!ctx)
++ return -ENOMEM;
++ ctx->devs = kzalloc(sizeof(struct erofs_dev_context), GFP_KERNEL);
++ if (!ctx->devs) {
++ kfree(ctx);
++ return -ENOMEM;
++ }
++ fc->fs_private = ctx;
+
++ idr_init(&ctx->devs->tree);
++ init_rwsem(&ctx->devs->rwsem);
++ erofs_default_options(ctx);
+ fc->ops = &erofs_context_ops;
+ return 0;
+ }
+@@ -657,6 +791,8 @@ static void erofs_kill_sb(struct super_block *sb)
+ sbi = EROFS_SB(sb);
+ if (!sbi)
+ return;
++
++ erofs_free_dev_context(sbi->devs);
+ fs_put_dax(sbi->dax_dev);
+ kfree(sbi);
+ sb->s_fs_info = NULL;
+@@ -746,7 +882,7 @@ static int erofs_statfs(struct dentry *dentry, struct kstatfs *buf)
+
+ buf->f_type = sb->s_magic;
+ buf->f_bsize = EROFS_BLKSIZ;
+- buf->f_blocks = sbi->blocks;
++ buf->f_blocks = sbi->total_blocks;
+ buf->f_bfree = buf->f_bavail = 0;
+
+ buf->f_files = ULLONG_MAX;
+diff --git a/fs/erofs/zdata.c b/fs/erofs/zdata.c
+index e59e22852..8c947ed49 100644
+--- a/fs/erofs/zdata.c
++++ b/fs/erofs/zdata.c
+@@ -1266,8 +1266,9 @@ static void z_erofs_submit_queue(struct super_block *sb,
+ struct z_erofs_decompressqueue *q[NR_JOBQUEUES];
+ void *bi_private;
+ z_erofs_next_pcluster_t owned_head = f->clt.owned_head;
+- /* since bio will be NULL, no need to initialize last_index */
++ /* bio is NULL initially, so no need to initialize last_{index,bdev} */
+ pgoff_t last_index;
++ struct block_device *last_bdev;
+ unsigned int nr_bios = 0;
+ struct bio *bio = NULL;
+
+@@ -1279,6 +1280,7 @@ static void z_erofs_submit_queue(struct super_block *sb,
+ q[JQ_SUBMIT]->head = owned_head;
+
+ do {
++ struct erofs_map_dev mdev;
+ struct z_erofs_pcluster *pcl;
+ pgoff_t cur, end;
+ unsigned int i = 0;
+@@ -1290,7 +1292,13 @@ static void z_erofs_submit_queue(struct super_block *sb,
+
+ pcl = container_of(owned_head, struct z_erofs_pcluster, next);
+
+- cur = pcl->obj.index;
++ /* no device id here, thus it will always succeed */
++ mdev = (struct erofs_map_dev) {
++ .m_pa = blknr_to_addr(pcl->obj.index),
++ };
++ (void)erofs_map_dev(sb, &mdev);
++
++ cur = erofs_blknr(mdev.m_pa);
+ end = cur + pcl->pclusterpages;
+
+ /* close the main owned chain at first */
+@@ -1306,7 +1314,8 @@ static void z_erofs_submit_queue(struct super_block *sb,
+ if (!page)
+ continue;
+
+- if (bio && cur != last_index + 1) {
++ if (bio && (cur != last_index + 1 ||
++ last_bdev != mdev.m_bdev)) {
+ submit_bio_retry:
+ submit_bio(bio);
+ bio = NULL;
+@@ -1314,9 +1323,10 @@ static void z_erofs_submit_queue(struct super_block *sb,
+
+ if (!bio) {
+ bio = bio_alloc(GFP_NOIO, BIO_MAX_VECS);
+-
+ bio->bi_end_io = z_erofs_decompressqueue_endio;
+- bio_set_dev(bio, sb->s_bdev);
++
++ bio_set_dev(bio, mdev.m_bdev);
++ last_bdev = mdev.m_bdev;
+ bio->bi_iter.bi_sector = (sector_t)cur <<
+ LOG_SECTORS_PER_BLOCK;
+ bio->bi_private = bi_private;
+--
+2.32.0
+
--- /dev/null
+From 8f89926290c4b3d31748d5089b27952243be0693 Mon Sep 17 00:00:00 2001
+From: Gao Xiang <hsiangkao@linux.alibaba.com>
+Date: Sat, 9 Oct 2021 04:08:37 +0800
+Subject: [PATCH 1003/1012] erofs: get compression algorithms directly on
+ mapping
+
+Currently, z_erofs_map_blocks_iter() returns whether extents are
+compressed or not, and the decompression frontend gets the specific
+algorithms then.
+
+It works but not quite well in many aspects, for example:
+ - The decompression frontend has to deal with whether extents are
+ compressed or not again and lookup the algorithms if compressed.
+ It's duplicated and too detailed about the on-disk mapping.
+
+ - A new secondary compression head will be introduced later so that
+ each file can have 2 compression algorithms at most for different
+ type of data. It could increase the complexity of the decompression
+ frontend if still handled in this way;
+
+ - A new readmore decompression strategy will be introduced to get
+ better performance for much bigger pcluster and lzma, which needs
+ the specific algorithm in advance as well.
+
+Let's look up compression algorithms in z_erofs_map_blocks_iter()
+directly instead.
+
+Link: https://lore.kernel.org/r/20211008200839.24541-2-xiang@kernel.org
+Reviewed-by: Chao Yu <chao@kernel.org>
+Reviewed-by: Yue Hu <huyue2@yulong.com>
+Signed-off-by: Gao Xiang <hsiangkao@linux.alibaba.com>
+---
+ fs/erofs/compress.h | 5 -----
+ fs/erofs/internal.h | 12 +++++++++---
+ fs/erofs/zdata.c | 12 ++++++------
+ fs/erofs/zmap.c | 19 ++++++++++---------
+ include/trace/events/erofs.h | 2 +-
+ 5 files changed, 26 insertions(+), 24 deletions(-)
+
+diff --git a/fs/erofs/compress.h b/fs/erofs/compress.h
+index 3701c72ba..ad62d1b4d 100644
+--- a/fs/erofs/compress.h
++++ b/fs/erofs/compress.h
+@@ -8,11 +8,6 @@
+
+ #include "internal.h"
+
+-enum {
+- Z_EROFS_COMPRESSION_SHIFTED = Z_EROFS_COMPRESSION_MAX,
+- Z_EROFS_COMPRESSION_RUNTIME_MAX
+-};
+-
+ struct z_erofs_decompress_req {
+ struct super_block *sb;
+ struct page **in, **out;
+diff --git a/fs/erofs/internal.h b/fs/erofs/internal.h
+index 0661d7d69..f8537ffde 100644
+--- a/fs/erofs/internal.h
++++ b/fs/erofs/internal.h
+@@ -363,7 +363,7 @@ extern const struct address_space_operations z_erofs_aops;
+ * of the corresponding uncompressed data in the file.
+ */
+ enum {
+- BH_Zipped = BH_PrivateStart,
++ BH_Encoded = BH_PrivateStart,
+ BH_FullMapped,
+ };
+
+@@ -371,8 +371,8 @@ enum {
+ #define EROFS_MAP_MAPPED (1 << BH_Mapped)
+ /* Located in metadata (could be copied from bd_inode) */
+ #define EROFS_MAP_META (1 << BH_Meta)
+-/* The extent has been compressed */
+-#define EROFS_MAP_ZIPPED (1 << BH_Zipped)
++/* The extent is encoded */
++#define EROFS_MAP_ENCODED (1 << BH_Encoded)
+ /* The length of extent is full */
+ #define EROFS_MAP_FULL_MAPPED (1 << BH_FullMapped)
+
+@@ -381,6 +381,7 @@ struct erofs_map_blocks {
+ u64 m_plen, m_llen;
+
+ unsigned short m_deviceid;
++ char m_algorithmformat;
+ unsigned int m_flags;
+
+ struct page *mpage;
+@@ -394,6 +395,11 @@ struct erofs_map_blocks {
+ */
+ #define EROFS_GET_BLOCKS_FIEMAP 0x0002
+
++enum {
++ Z_EROFS_COMPRESSION_SHIFTED = Z_EROFS_COMPRESSION_MAX,
++ Z_EROFS_COMPRESSION_RUNTIME_MAX
++};
++
+ /* zmap.c */
+ extern const struct iomap_ops z_erofs_iomap_report_ops;
+
+diff --git a/fs/erofs/zdata.c b/fs/erofs/zdata.c
+index 8c947ed49..a9dced07c 100644
+--- a/fs/erofs/zdata.c
++++ b/fs/erofs/zdata.c
+@@ -476,6 +476,11 @@ static int z_erofs_register_collection(struct z_erofs_collector *clt,
+ struct erofs_workgroup *grp;
+ int err;
+
++ if (!(map->m_flags & EROFS_MAP_ENCODED)) {
++ DBG_BUGON(1);
++ return -EFSCORRUPTED;
++ }
++
+ /* no available pcluster, let's allocate one */
+ pcl = z_erofs_alloc_pcluster(map->m_plen >> PAGE_SHIFT);
+ if (IS_ERR(pcl))
+@@ -483,16 +488,11 @@ static int z_erofs_register_collection(struct z_erofs_collector *clt,
+
+ atomic_set(&pcl->obj.refcount, 1);
+ pcl->obj.index = map->m_pa >> PAGE_SHIFT;
+-
++ pcl->algorithmformat = map->m_algorithmformat;
+ pcl->length = (map->m_llen << Z_EROFS_PCLUSTER_LENGTH_BIT) |
+ (map->m_flags & EROFS_MAP_FULL_MAPPED ?
+ Z_EROFS_PCLUSTER_FULL_LENGTH : 0);
+
+- if (map->m_flags & EROFS_MAP_ZIPPED)
+- pcl->algorithmformat = Z_EROFS_COMPRESSION_LZ4;
+- else
+- pcl->algorithmformat = Z_EROFS_COMPRESSION_SHIFTED;
+-
+ /* new pclusters should be claimed as type 1, primary and followed */
+ pcl->next = clt->owned_head;
+ clt->mode = COLLECT_PRIMARY_FOLLOWED;
+diff --git a/fs/erofs/zmap.c b/fs/erofs/zmap.c
+index 7a6df35fd..1c3b068e5 100644
+--- a/fs/erofs/zmap.c
++++ b/fs/erofs/zmap.c
+@@ -111,7 +111,7 @@ struct z_erofs_maprecorder {
+
+ unsigned long lcn;
+ /* compression extent information gathered */
+- u8 type;
++ u8 type, headtype;
+ u16 clusterofs;
+ u16 delta[2];
+ erofs_blk_t pblk, compressedlcs;
+@@ -446,9 +446,8 @@ static int z_erofs_extent_lookback(struct z_erofs_maprecorder *m,
+ }
+ return z_erofs_extent_lookback(m, m->delta[0]);
+ case Z_EROFS_VLE_CLUSTER_TYPE_PLAIN:
+- map->m_flags &= ~EROFS_MAP_ZIPPED;
+- fallthrough;
+ case Z_EROFS_VLE_CLUSTER_TYPE_HEAD:
++ m->headtype = m->type;
+ map->m_la = (lcn << lclusterbits) | m->clusterofs;
+ break;
+ default:
+@@ -472,7 +471,7 @@ static int z_erofs_get_extent_compressedlen(struct z_erofs_maprecorder *m,
+
+ DBG_BUGON(m->type != Z_EROFS_VLE_CLUSTER_TYPE_PLAIN &&
+ m->type != Z_EROFS_VLE_CLUSTER_TYPE_HEAD);
+- if (!(map->m_flags & EROFS_MAP_ZIPPED) ||
++ if (m->headtype == Z_EROFS_VLE_CLUSTER_TYPE_PLAIN ||
+ !(vi->z_advise & Z_EROFS_ADVISE_BIG_PCLUSTER_1)) {
+ map->m_plen = 1 << lclusterbits;
+ return 0;
+@@ -609,16 +608,14 @@ int z_erofs_map_blocks_iter(struct inode *inode,
+ if (err)
+ goto unmap_out;
+
+- map->m_flags = EROFS_MAP_ZIPPED; /* by default, compressed */
++ map->m_flags = EROFS_MAP_MAPPED | EROFS_MAP_ENCODED;
+ end = (m.lcn + 1ULL) << lclusterbits;
+
+ switch (m.type) {
+ case Z_EROFS_VLE_CLUSTER_TYPE_PLAIN:
+- if (endoff >= m.clusterofs)
+- map->m_flags &= ~EROFS_MAP_ZIPPED;
+- fallthrough;
+ case Z_EROFS_VLE_CLUSTER_TYPE_HEAD:
+ if (endoff >= m.clusterofs) {
++ m.headtype = m.type;
+ map->m_la = (m.lcn << lclusterbits) | m.clusterofs;
+ break;
+ }
+@@ -650,12 +647,16 @@ int z_erofs_map_blocks_iter(struct inode *inode,
+
+ map->m_llen = end - map->m_la;
+ map->m_pa = blknr_to_addr(m.pblk);
+- map->m_flags |= EROFS_MAP_MAPPED;
+
+ err = z_erofs_get_extent_compressedlen(&m, initial_lcn);
+ if (err)
+ goto out;
+
++ if (m.headtype == Z_EROFS_VLE_CLUSTER_TYPE_PLAIN)
++ map->m_algorithmformat = Z_EROFS_COMPRESSION_SHIFTED;
++ else
++ map->m_algorithmformat = vi->z_algorithmtype[0];
++
+ if (flags & EROFS_GET_BLOCKS_FIEMAP) {
+ err = z_erofs_get_extent_decompressedlen(&m);
+ if (!err)
+diff --git a/include/trace/events/erofs.h b/include/trace/events/erofs.h
+index db4f2cec8..16ae7b666 100644
+--- a/include/trace/events/erofs.h
++++ b/include/trace/events/erofs.h
+@@ -24,7 +24,7 @@ struct erofs_map_blocks;
+ #define show_mflags(flags) __print_flags(flags, "", \
+ { EROFS_MAP_MAPPED, "M" }, \
+ { EROFS_MAP_META, "I" }, \
+- { EROFS_MAP_ZIPPED, "Z" })
++ { EROFS_MAP_ENCODED, "E" })
+
+ TRACE_EVENT(erofs_lookup,
+
+--
+2.32.0
+
--- /dev/null
+From eebb297cf4fa658f157b584476ebc5692c860bca Mon Sep 17 00:00:00 2001
+From: Gao Xiang <hsiangkao@linux.alibaba.com>
+Date: Mon, 18 Oct 2021 00:57:21 +0800
+Subject: [PATCH 1004/1012] erofs: introduce the secondary compression head
+
+Previously, for each HEAD lcluster, it can be either HEAD or PLAIN
+lcluster to indicate whether the whole pcluster is compressed or not.
+
+In this patch, a new HEAD2 head type is introduced to specify another
+compression algorithm other than the primary algorithm for each
+compressed file, which can be used for upcoming LZMA compression and
+LZ4 range dictionary compression for various data patterns.
+
+It has been stayed in the EROFS roadmap for years. Complete it now!
+
+Link: https://lore.kernel.org/r/20211017165721.2442-1-xiang@kernel.org
+Reviewed-by: Yue Hu <huyue2@yulong.com>
+Signed-off-by: Gao Xiang <hsiangkao@linux.alibaba.com>
+---
+ fs/erofs/erofs_fs.h | 39 ++++++++++++++++++++-------------------
+ fs/erofs/zmap.c | 41 ++++++++++++++++++++++++++++-------------
+ 2 files changed, 48 insertions(+), 32 deletions(-)
+
+diff --git a/fs/erofs/erofs_fs.h b/fs/erofs/erofs_fs.h
+index e480b3854..87736cbf1 100644
+--- a/fs/erofs/erofs_fs.h
++++ b/fs/erofs/erofs_fs.h
+@@ -22,12 +22,14 @@
+ #define EROFS_FEATURE_INCOMPAT_BIG_PCLUSTER 0x00000002
+ #define EROFS_FEATURE_INCOMPAT_CHUNKED_FILE 0x00000004
+ #define EROFS_FEATURE_INCOMPAT_DEVICE_TABLE 0x00000008
+#define EROFS_FEATURE_INCOMPAT_COMPR_HEAD2 0x00000010
+ #define EROFS_ALL_FEATURE_INCOMPAT \
+ (EROFS_FEATURE_INCOMPAT_LZ4_0PADDING | \
+ EROFS_FEATURE_INCOMPAT_COMPR_CFGS | \
+ EROFS_FEATURE_INCOMPAT_BIG_PCLUSTER | \
+ EROFS_FEATURE_INCOMPAT_CHUNKED_FILE | \
+- EROFS_FEATURE_INCOMPAT_DEVICE_TABLE)
++ EROFS_FEATURE_INCOMPAT_DEVICE_TABLE | \
++ EROFS_FEATURE_INCOMPAT_COMPR_HEAD2)
+
+ #define EROFS_SB_EXTSLOT_SIZE 16
+
+@@ -303,35 +305,34 @@ struct z_erofs_map_header {
+ #define Z_EROFS_VLE_LEGACY_HEADER_PADDING 8
+
+ /*
+- * Fixed-sized output compression ondisk Logical Extent cluster type:
+- * 0 - literal (uncompressed) cluster
+- * 1 - compressed cluster (for the head logical cluster)
+- * 2 - compressed cluster (for the other logical clusters)
++ * Fixed-sized output compression on-disk logical cluster type:
++ * 0 - literal (uncompressed) lcluster
++ * 1,3 - compressed lcluster (for HEAD lclusters)
++ * 2 - compressed lcluster (for NONHEAD lclusters)
+ *
+ * In detail,
+- * 0 - literal (uncompressed) cluster,
++ * 0 - literal (uncompressed) lcluster,
+ * di_advise = 0
+- * di_clusterofs = the literal data offset of the cluster
+- * di_blkaddr = the blkaddr of the literal cluster
++ * di_clusterofs = the literal data offset of the lcluster
++ * di_blkaddr = the blkaddr of the literal pcluster
+ *
+- * 1 - compressed cluster (for the head logical cluster)
+- * di_advise = 1
+- * di_clusterofs = the decompressed data offset of the cluster
+- * di_blkaddr = the blkaddr of the compressed cluster
++ * 1,3 - compressed lcluster (for HEAD lclusters)
++ * di_advise = 1 or 3
++ * di_clusterofs = the decompressed data offset of the lcluster
++ * di_blkaddr = the blkaddr of the compressed pcluster
+ *
+- * 2 - compressed cluster (for the other logical clusters)
++ * 2 - compressed cluster (for NONHEAD lclusters)
+ * di_advise = 2
+ * di_clusterofs =
+- * the decompressed data offset in its own head cluster
+- * di_u.delta[0] = distance to its corresponding head cluster
+- * di_u.delta[1] = distance to its corresponding tail cluster
+- * (di_advise could be 0, 1 or 2)
++ * the decompressed data offset in its own HEAD lcluster
++ * di_u.delta[0] = distance to this HEAD lcluster
++ * di_u.delta[1] = distance to the next HEAD lcluster
+ */
+ enum {
+ Z_EROFS_VLE_CLUSTER_TYPE_PLAIN = 0,
+- Z_EROFS_VLE_CLUSTER_TYPE_HEAD = 1,
++ Z_EROFS_VLE_CLUSTER_TYPE_HEAD1 = 1,
+ Z_EROFS_VLE_CLUSTER_TYPE_NONHEAD = 2,
+- Z_EROFS_VLE_CLUSTER_TYPE_RESERVED = 3,
++ Z_EROFS_VLE_CLUSTER_TYPE_HEAD2 = 3,
+ Z_EROFS_VLE_CLUSTER_TYPE_MAX
+ };
+
+diff --git a/fs/erofs/zmap.c b/fs/erofs/zmap.c
+index 1c3b068e5..85d028942 100644
+--- a/fs/erofs/zmap.c
++++ b/fs/erofs/zmap.c
+@@ -28,7 +28,7 @@ static int z_erofs_fill_inode_lazy(struct inode *inode)
+ {
+ struct erofs_inode *const vi = EROFS_I(inode);
+ struct super_block *const sb = inode->i_sb;
+- int err;
++ int err, headnr;
+ erofs_off_t pos;
+ struct page *page;
+ void *kaddr;
+@@ -68,9 +68,11 @@ static int z_erofs_fill_inode_lazy(struct inode *inode)
+ vi->z_algorithmtype[0] = h->h_algorithmtype & 15;
+ vi->z_algorithmtype[1] = h->h_algorithmtype >> 4;
+
+- if (vi->z_algorithmtype[0] >= Z_EROFS_COMPRESSION_MAX) {
+- erofs_err(sb, "unknown compression format %u for nid %llu, please upgrade kernel",
+- vi->z_algorithmtype[0], vi->nid);
++ headnr = 0;
++ if (vi->z_algorithmtype[0] >= Z_EROFS_COMPRESSION_MAX ||
++ vi->z_algorithmtype[++headnr] >= Z_EROFS_COMPRESSION_MAX) {
++ erofs_err(sb, "unknown HEAD%u format %u for nid %llu, please upgrade kernel",
++ headnr + 1, vi->z_algorithmtype[headnr], vi->nid);
+ err = -EOPNOTSUPP;
+ goto unmap_done;
+ }
+@@ -178,7 +180,8 @@ static int legacy_load_cluster_from_disk(struct z_erofs_maprecorder *m,
+ m->clusterofs = 1 << vi->z_logical_clusterbits;
+ m->delta[0] = le16_to_cpu(di->di_u.delta[0]);
+ if (m->delta[0] & Z_EROFS_VLE_DI_D0_CBLKCNT) {
+- if (!(vi->z_advise & Z_EROFS_ADVISE_BIG_PCLUSTER_1)) {
++ if (!(vi->z_advise & (Z_EROFS_ADVISE_BIG_PCLUSTER_1 |
++ Z_EROFS_ADVISE_BIG_PCLUSTER_2))) {
+ DBG_BUGON(1);
+ return -EFSCORRUPTED;
+ }
+@@ -189,7 +192,8 @@ static int legacy_load_cluster_from_disk(struct z_erofs_maprecorder *m,
+ m->delta[1] = le16_to_cpu(di->di_u.delta[1]);
+ break;
+ case Z_EROFS_VLE_CLUSTER_TYPE_PLAIN:
+- case Z_EROFS_VLE_CLUSTER_TYPE_HEAD:
++ case Z_EROFS_VLE_CLUSTER_TYPE_HEAD1:
++ case Z_EROFS_VLE_CLUSTER_TYPE_HEAD2:
+ m->clusterofs = le16_to_cpu(di->di_clusterofs);
+ m->pblk = le32_to_cpu(di->di_u.blkaddr);
+ break;
+@@ -446,7 +450,8 @@ static int z_erofs_extent_lookback(struct z_erofs_maprecorder *m,
+ }
+ return z_erofs_extent_lookback(m, m->delta[0]);
+ case Z_EROFS_VLE_CLUSTER_TYPE_PLAIN:
+- case Z_EROFS_VLE_CLUSTER_TYPE_HEAD:
++ case Z_EROFS_VLE_CLUSTER_TYPE_HEAD1:
++ case Z_EROFS_VLE_CLUSTER_TYPE_HEAD2:
+ m->headtype = m->type;
+ map->m_la = (lcn << lclusterbits) | m->clusterofs;
+ break;
+@@ -470,13 +475,18 @@ static int z_erofs_get_extent_compressedlen(struct z_erofs_maprecorder *m,
+ int err;
+
+ DBG_BUGON(m->type != Z_EROFS_VLE_CLUSTER_TYPE_PLAIN &&
+- m->type != Z_EROFS_VLE_CLUSTER_TYPE_HEAD);
++ m->type != Z_EROFS_VLE_CLUSTER_TYPE_HEAD1 &&
++ m->type != Z_EROFS_VLE_CLUSTER_TYPE_HEAD2);
++ DBG_BUGON(m->type != m->headtype);
++
+ if (m->headtype == Z_EROFS_VLE_CLUSTER_TYPE_PLAIN ||
+- !(vi->z_advise & Z_EROFS_ADVISE_BIG_PCLUSTER_1)) {
++ ((m->headtype == Z_EROFS_VLE_CLUSTER_TYPE_HEAD1) &&
++ !(vi->z_advise & Z_EROFS_ADVISE_BIG_PCLUSTER_1)) ||
++ ((m->headtype == Z_EROFS_VLE_CLUSTER_TYPE_HEAD2) &&
++ !(vi->z_advise & Z_EROFS_ADVISE_BIG_PCLUSTER_2))) {
+ map->m_plen = 1 << lclusterbits;
+ return 0;
+ }
+-
+ lcn = m->lcn + 1;
+ if (m->compressedlcs)
+ goto out;
+@@ -498,7 +508,8 @@ static int z_erofs_get_extent_compressedlen(struct z_erofs_maprecorder *m,
+
+ switch (m->type) {
+ case Z_EROFS_VLE_CLUSTER_TYPE_PLAIN:
+- case Z_EROFS_VLE_CLUSTER_TYPE_HEAD:
++ case Z_EROFS_VLE_CLUSTER_TYPE_HEAD1:
++ case Z_EROFS_VLE_CLUSTER_TYPE_HEAD2:
+ /*
+ * if the 1st NONHEAD lcluster is actually PLAIN or HEAD type
+ * rather than CBLKCNT, it's a 1 lcluster-sized pcluster.
+@@ -553,7 +564,8 @@ static int z_erofs_get_extent_decompressedlen(struct z_erofs_maprecorder *m)
+ DBG_BUGON(!m->delta[1] &&
+ m->clusterofs != 1 << lclusterbits);
+ } else if (m->type == Z_EROFS_VLE_CLUSTER_TYPE_PLAIN ||
+- m->type == Z_EROFS_VLE_CLUSTER_TYPE_HEAD) {
++ m->type == Z_EROFS_VLE_CLUSTER_TYPE_HEAD1 ||
++ m->type == Z_EROFS_VLE_CLUSTER_TYPE_HEAD2) {
+ /* go on until the next HEAD lcluster */
+ if (lcn != headlcn)
+ break;
+@@ -613,7 +625,8 @@ int z_erofs_map_blocks_iter(struct inode *inode,
+
+ switch (m.type) {
+ case Z_EROFS_VLE_CLUSTER_TYPE_PLAIN:
+- case Z_EROFS_VLE_CLUSTER_TYPE_HEAD:
++ case Z_EROFS_VLE_CLUSTER_TYPE_HEAD1:
++ case Z_EROFS_VLE_CLUSTER_TYPE_HEAD2:
+ if (endoff >= m.clusterofs) {
+ m.headtype = m.type;
+ map->m_la = (m.lcn << lclusterbits) | m.clusterofs;
+@@ -654,6 +667,8 @@ int z_erofs_map_blocks_iter(struct inode *inode,
+
+ if (m.headtype == Z_EROFS_VLE_CLUSTER_TYPE_PLAIN)
+ map->m_algorithmformat = Z_EROFS_COMPRESSION_SHIFTED;
++ else if (m.headtype == Z_EROFS_VLE_CLUSTER_TYPE_HEAD2)
++ map->m_algorithmformat = vi->z_algorithmtype[1];
+ else
+ map->m_algorithmformat = vi->z_algorithmtype[0];
+
+--
+2.32.0
+
--- /dev/null
+From da52243991d28aaa697bd93fc2b6f89ad06c37d6 Mon Sep 17 00:00:00 2001
+From: Gao Xiang <hsiangkao@linux.alibaba.com>
+Date: Sat, 9 Oct 2021 04:08:39 +0800
+Subject: [PATCH 1005/1012] erofs: introduce readmore decompression strategy
+
+Previously, the readahead window was strictly followed by EROFS
+decompression strategy in order to minimize extra memory footprint.
+However, it could become inefficient if just reading the partial
+requested data for much big LZ4 pclusters and the upcoming LZMA
+implementation.
+
+Let's try to request the leading data in a pcluster without
+triggering memory reclaiming instead for the LZ4 approach first
+to boost up 100% randread of large big pclusters, and it has no real
+impact on low memory scenarios.
+
+It also introduces a way to expand read lengths in order to decompress
+the whole pcluster, which is useful for LZMA since the algorithm
+itself is relatively slow and causes CPU bound, but LZ4 is not.
+
+Link: https://lore.kernel.org/r/20211008200839.24541-4-xiang@kernel.org
+Signed-off-by: Gao Xiang <hsiangkao@linux.alibaba.com>
+---
+ fs/erofs/internal.h | 13 ++++++
+ fs/erofs/zdata.c | 99 ++++++++++++++++++++++++++++++++++++---------
+ 2 files changed, 93 insertions(+), 19 deletions(-)
+
+diff --git a/fs/erofs/internal.h b/fs/erofs/internal.h
+index f8537ffde..354ce3cb2 100644
+--- a/fs/erofs/internal.h
++++ b/fs/erofs/internal.h
+@@ -332,6 +332,19 @@ static inline unsigned int erofs_inode_datalayout(unsigned int value)
+ EROFS_I_DATALAYOUT_BITS);
+ }
+
++/*
++ * Different from grab_cache_page_nowait(), reclaiming is never triggered
++ * when allocating new pages.
++ */
++static inline
++struct page *erofs_grab_cache_page_nowait(struct address_space *mapping,
++ pgoff_t index)
++{
++ return pagecache_get_page(mapping, index,
++ FGP_LOCK|FGP_CREAT|FGP_NOFS|FGP_NOWAIT,
++ readahead_gfp_mask(mapping) & ~__GFP_RECLAIM);
++}
++
+ extern const struct super_operations erofs_sops;
+
+ extern const struct address_space_operations erofs_raw_access_aops;
+diff --git a/fs/erofs/zdata.c b/fs/erofs/zdata.c
+index a9dced07c..98d3bd25d 100644
+--- a/fs/erofs/zdata.c
++++ b/fs/erofs/zdata.c
+@@ -1387,6 +1387,72 @@ static void z_erofs_runqueue(struct super_block *sb,
+ z_erofs_decompress_queue(&io[JQ_SUBMIT], pagepool);
+ }
+
++/*
++ * Since partial uptodate is still unimplemented for now, we have to use
++ * approximate readmore strategies as a start.
++ */
++static void z_erofs_pcluster_readmore(struct z_erofs_decompress_frontend *f,
++ struct readahead_control *rac,
++ erofs_off_t end,
++ struct list_head *pagepool,
++ bool backmost)
++{
++ struct inode *inode = f->inode;
++ struct erofs_map_blocks *map = &f->map;
++ erofs_off_t cur;
++ int err;
++
++ if (backmost) {
++ map->m_la = end;
++ /* TODO: pass in EROFS_GET_BLOCKS_READMORE for LZMA later */
++ err = z_erofs_map_blocks_iter(inode, map, 0);
++ if (err)
++ return;
++
++ /* expend ra for the trailing edge if readahead */
++ if (rac) {
++ loff_t newstart = readahead_pos(rac);
++
++ cur = round_up(map->m_la + map->m_llen, PAGE_SIZE);
++ readahead_expand(rac, newstart, cur - newstart);
++ return;
++ }
++ end = round_up(end, PAGE_SIZE);
++ } else {
++ end = round_up(map->m_la, PAGE_SIZE);
++
++ if (!map->m_llen)
++ return;
++ }
++
++ cur = map->m_la + map->m_llen - 1;
++ while (cur >= end) {
++ pgoff_t index = cur >> PAGE_SHIFT;
++ struct page *page;
++
++ page = erofs_grab_cache_page_nowait(inode->i_mapping, index);
++ if (!page)
++ goto skip;
++
++ if (PageUptodate(page)) {
++ unlock_page(page);
++ put_page(page);
++ goto skip;
++ }
++
++ err = z_erofs_do_read_page(f, page, pagepool);
++ if (err)
++ erofs_err(inode->i_sb,
++ "readmore error at page %lu @ nid %llu",
++ index, EROFS_I(inode)->nid);
++ put_page(page);
++skip:
++ if (cur < PAGE_SIZE)
++ break;
++ cur = (index << PAGE_SHIFT) - 1;
++ }
++}
++
+ static int z_erofs_readpage(struct file *file, struct page *page)
+ {
+ struct inode *const inode = page->mapping->host;
+@@ -1395,10 +1461,13 @@ static int z_erofs_readpage(struct file *file, struct page *page)
+ LIST_HEAD(pagepool);
+
+ trace_erofs_readpage(page, false);
+-
+ f.headoffset = (erofs_off_t)page->index << PAGE_SHIFT;
+
++ z_erofs_pcluster_readmore(&f, NULL, f.headoffset + PAGE_SIZE - 1,
++ &pagepool, true);
+ err = z_erofs_do_read_page(&f, page, &pagepool);
++ z_erofs_pcluster_readmore(&f, NULL, 0, &pagepool, false);
++
+ (void)z_erofs_collector_end(&f.clt);
+
+ /* if some compressed cluster ready, need submit them anyway */
+@@ -1419,29 +1488,20 @@ static void z_erofs_readahead(struct readahead_control *rac)
+ {
+ struct inode *const inode = rac->mapping->host;
+ struct erofs_sb_info *const sbi = EROFS_I_SB(inode);
+-
+- unsigned int nr_pages = readahead_count(rac);
+- bool sync = (sbi->opt.readahead_sync_decompress &&
+- nr_pages <= sbi->opt.max_sync_decompress_pages);
+ struct z_erofs_decompress_frontend f = DECOMPRESS_FRONTEND_INIT(inode);
+ struct page *page, *head = NULL;
++ unsigned int nr_pages;
+ LIST_HEAD(pagepool);
+
+- trace_erofs_readpages(inode, readahead_index(rac), nr_pages, false);
+-
+ f.readahead = true;
+ f.headoffset = readahead_pos(rac);
+
+- while ((page = readahead_page(rac))) {
+- prefetchw(&page->flags);
+-
+- /*
+- * A pure asynchronous readahead is indicated if
+- * a PG_readahead marked page is hitted at first.
+- * Let's also do asynchronous decompression for this case.
+- */
+- sync &= !(PageReadahead(page) && !head);
++ z_erofs_pcluster_readmore(&f, rac, f.headoffset +
++ readahead_length(rac) - 1, &pagepool, true);
++ nr_pages = readahead_count(rac);
++ trace_erofs_readpages(inode, readahead_index(rac), nr_pages, false);
+
++ while ((page = readahead_page(rac))) {
+ set_page_private(page, (unsigned long)head);
+ head = page;
+ }
+@@ -1460,11 +1520,12 @@ static void z_erofs_readahead(struct readahead_control *rac)
+ page->index, EROFS_I(inode)->nid);
+ put_page(page);
+ }
+-
++ z_erofs_pcluster_readmore(&f, rac, 0, &pagepool, false);
+ (void)z_erofs_collector_end(&f.clt);
+
+- z_erofs_runqueue(inode->i_sb, &f, &pagepool, sync);
+-
++ z_erofs_runqueue(inode->i_sb, &f, &pagepool,
++ sbi->opt.readahead_sync_decompress &&
++ nr_pages <= sbi->opt.max_sync_decompress_pages);
+ if (f.map.mpage)
+ put_page(f.map.mpage);
+
+--
+2.32.0
+
--- /dev/null
+From e10daad6d7da1417cdaf67757f605178c41b44d2 Mon Sep 17 00:00:00 2001
+From: Lasse Collin <lasse.collin@tukaani.org>
+Date: Wed, 22 Sep 2021 22:59:58 +0800
+Subject: [PATCH 1008/1012] lib/xz: Move s->lzma.len = 0 initialization to
+ lzma_reset()
+
+It's a more logical place even if the resetting needs to be done
+only once per LZMA2 stream (if lzma_reset() called in the middle
+of an LZMA2 stream, .len will already be 0).
+
+Signed-off-by: Lasse Collin <lasse.collin@tukaani.org>
+Signed-off-by: Gao Xiang <hsiangkao@linux.alibaba.com>
+---
+ lib/xz/xz_dec_lzma2.c | 3 +--
+ 1 file changed, 1 insertion(+), 2 deletions(-)
+
+diff --git a/lib/xz/xz_dec_lzma2.c b/lib/xz/xz_dec_lzma2.c
+index d548cf0e5..22b789645 100644
+--- a/lib/xz/xz_dec_lzma2.c
++++ b/lib/xz/xz_dec_lzma2.c
+@@ -791,6 +791,7 @@ static void lzma_reset(struct xz_dec_lzma2 *s)
+ s->lzma.rep1 = 0;
+ s->lzma.rep2 = 0;
+ s->lzma.rep3 = 0;
++ s->lzma.len = 0;
+
+ /*
+ * All probabilities are initialized to the same value. This hack
+@@ -1174,8 +1175,6 @@ XZ_EXTERN enum xz_ret xz_dec_lzma2_reset(struct xz_dec_lzma2 *s, uint8_t props)
+ }
+ }
+
+- s->lzma.len = 0;
+-
+ s->lzma2.sequence = SEQ_CONTROL;
+ s->lzma2.need_dict_reset = true;
+
+--
+2.32.0
+
--- /dev/null
+From 9fc39dcb1378f54ee5be60efe8a0260377a2807b Mon Sep 17 00:00:00 2001
+From: Lasse Collin <lasse.collin@tukaani.org>
+Date: Wed, 22 Sep 2021 22:59:58 +0800
+Subject: [PATCH 1009/1012] lib/xz: Add MicroLZMA decoder
+
+MicroLZMA is a yet another header format variant where the first
+byte of a raw LZMA stream (without the end of stream marker) has
+been replaced with a bitwise-negation of the lc/lp/pb properties
+byte. MicroLZMA was created to be used in EROFS but can be used
+by other things too where wasting minimal amount of space for
+headers is important.
+
+This is implemented using most of the LZMA2 code as is so the
+amount of new code is small. The API has a few extra features
+compared to the XZ decoder. On the other hand, the API lacks
+XZ_BUF_ERROR support which is important to take into account
+when using this API.
+
+MicroLZMA doesn't support BCJ filters. In theory they could be
+added later as there are many unused/reserved values for the
+first byte of the compressed stream but in practice it is
+somewhat unlikely to happen due to a few implementation reasons.
+
+Signed-off-by: Lasse Collin <lasse.collin@tukaani.org>
+Signed-off-by: Gao Xiang <hsiangkao@linux.alibaba.com>
+---
+ include/linux/xz.h | 106 ++++++++++++++++++++++++++++
+ lib/xz/Kconfig | 13 ++++
+ lib/xz/xz_dec_lzma2.c | 156 +++++++++++++++++++++++++++++++++++++++++-
+ lib/xz/xz_dec_syms.c | 9 ++-
+ lib/xz/xz_private.h | 3 +
+ 5 files changed, 284 insertions(+), 3 deletions(-)
+
+diff --git a/include/linux/xz.h b/include/linux/xz.h
+index 9884c8440..7285ca5d5 100644
+--- a/include/linux/xz.h
++++ b/include/linux/xz.h
+@@ -233,6 +233,112 @@ XZ_EXTERN void xz_dec_reset(struct xz_dec *s);
+ */
+ XZ_EXTERN void xz_dec_end(struct xz_dec *s);
+
++/*
++ * Decompressor for MicroLZMA, an LZMA variant with a very minimal header.
++ * See xz_dec_microlzma_alloc() below for details.
++ *
++ * These functions aren't used or available in preboot code and thus aren't
++ * marked with XZ_EXTERN. This avoids warnings about static functions that
++ * are never defined.
++ */
++/**
++ * struct xz_dec_microlzma - Opaque type to hold the MicroLZMA decoder state
++ */
++struct xz_dec_microlzma;
++
++/**
++ * xz_dec_microlzma_alloc() - Allocate memory for the MicroLZMA decoder
++ * @mode XZ_SINGLE or XZ_PREALLOC
++ * @dict_size LZMA dictionary size. This must be at least 4 KiB and
++ * at most 3 GiB.
++ *
++ * In contrast to xz_dec_init(), this function only allocates the memory
++ * and remembers the dictionary size. xz_dec_microlzma_reset() must be used
++ * before calling xz_dec_microlzma_run().
++ *
++ * The amount of allocated memory is a little less than 30 KiB with XZ_SINGLE.
++ * With XZ_PREALLOC also a dictionary buffer of dict_size bytes is allocated.
++ *
++ * On success, xz_dec_microlzma_alloc() returns a pointer to
++ * struct xz_dec_microlzma. If memory allocation fails or
++ * dict_size is invalid, NULL is returned.
++ *
++ * The compressed format supported by this decoder is a raw LZMA stream
++ * whose first byte (always 0x00) has been replaced with bitwise-negation
++ * of the LZMA properties (lc/lp/pb) byte. For example, if lc/lp/pb is
++ * 3/0/2, the first byte is 0xA2. This way the first byte can never be 0x00.
++ * Just like with LZMA2, lc + lp <= 4 must be true. The LZMA end-of-stream
++ * marker must not be used. The unused values are reserved for future use.
++ * This MicroLZMA header format was created for use in EROFS but may be used
++ * by others too.
++ */
++extern struct xz_dec_microlzma *xz_dec_microlzma_alloc(enum xz_mode mode,
++ uint32_t dict_size);
++
++/**
++ * xz_dec_microlzma_reset() - Reset the MicroLZMA decoder state
++ * @s Decoder state allocated using xz_dec_microlzma_alloc()
++ * @comp_size Compressed size of the input stream
++ * @uncomp_size Uncompressed size of the input stream. A value smaller
++ * than the real uncompressed size of the input stream can
++ * be specified if uncomp_size_is_exact is set to false.
++ * uncomp_size can never be set to a value larger than the
++ * expected real uncompressed size because it would eventually
++ * result in XZ_DATA_ERROR.
++ * @uncomp_size_is_exact This is an int instead of bool to avoid
++ * requiring stdbool.h. This should normally be set to true.
++ * When this is set to false, error detection is weaker.
++ */
++extern void xz_dec_microlzma_reset(struct xz_dec_microlzma *s,
++ uint32_t comp_size, uint32_t uncomp_size,
++ int uncomp_size_is_exact);
++
++/**
++ * xz_dec_microlzma_run() - Run the MicroLZMA decoder
++ * @s Decoder state initialized using xz_dec_microlzma_reset()
++ * @b: Input and output buffers
++ *
++ * This works similarly to xz_dec_run() with a few important differences.
++ * Only the differences are documented here.
++ *
++ * The only possible return values are XZ_OK, XZ_STREAM_END, and
++ * XZ_DATA_ERROR. This function cannot return XZ_BUF_ERROR: if no progress
++ * is possible due to lack of input data or output space, this function will
++ * keep returning XZ_OK. Thus, the calling code must be written so that it
++ * will eventually provide input and output space matching (or exceeding)
++ * comp_size and uncomp_size arguments given to xz_dec_microlzma_reset().
++ * If the caller cannot do this (for example, if the input file is truncated
++ * or otherwise corrupt), the caller must detect this error by itself to
++ * avoid an infinite loop.
++ *
++ * If the compressed data seems to be corrupt, XZ_DATA_ERROR is returned.
++ * This can happen also when incorrect dictionary, uncompressed, or
++ * compressed sizes have been specified.
++ *
++ * With XZ_PREALLOC only: As an extra feature, b->out may be NULL to skip over
++ * uncompressed data. This way the caller doesn't need to provide a temporary
++ * output buffer for the bytes that will be ignored.
++ *
++ * With XZ_SINGLE only: In contrast to xz_dec_run(), the return value XZ_OK
++ * is also possible and thus XZ_SINGLE is actually a limited multi-call mode.
++ * After XZ_OK the bytes decoded so far may be read from the output buffer.
++ * It is possible to continue decoding but the variables b->out and b->out_pos
++ * MUST NOT be changed by the caller. Increasing the value of b->out_size is
++ * allowed to make more output space available; one doesn't need to provide
++ * space for the whole uncompressed data on the first call. The input buffer
++ * may be changed normally like with XZ_PREALLOC. This way input data can be
++ * provided from non-contiguous memory.
++ */
++extern enum xz_ret xz_dec_microlzma_run(struct xz_dec_microlzma *s,
++ struct xz_buf *b);
++
++/**
++ * xz_dec_microlzma_end() - Free the memory allocated for the decoder state
++ * @s: Decoder state allocated using xz_dec_microlzma_alloc().
++ * If s is NULL, this function does nothing.
++ */
++extern void xz_dec_microlzma_end(struct xz_dec_microlzma *s);
++
+ /*
+ * Standalone build (userspace build or in-kernel build for boot time use)
+ * needs a CRC32 implementation. For normal in-kernel use, kernel's own
+diff --git a/lib/xz/Kconfig b/lib/xz/Kconfig
+index 5cb50245a..adce22ac1 100644
+--- a/lib/xz/Kconfig
++++ b/lib/xz/Kconfig
+@@ -39,6 +39,19 @@ config XZ_DEC_SPARC
+ default y
+ select XZ_DEC_BCJ
+
++config XZ_DEC_MICROLZMA
++ bool "MicroLZMA decoder"
++ default n
++ help
++ MicroLZMA is a header format variant where the first byte
++ of a raw LZMA stream (without the end of stream marker) has
++ been replaced with a bitwise-negation of the lc/lp/pb
++ properties byte. MicroLZMA was created to be used in EROFS
++ but can be used by other things too where wasting minimal
++ amount of space for headers is important.
++
++ Unless you know that you need this, say N.
++
+ endif
+
+ config XZ_DEC_BCJ
+diff --git a/lib/xz/xz_dec_lzma2.c b/lib/xz/xz_dec_lzma2.c
+index 22b789645..46b186d7e 100644
+--- a/lib/xz/xz_dec_lzma2.c
++++ b/lib/xz/xz_dec_lzma2.c
+@@ -248,6 +248,10 @@ struct lzma2_dec {
+ * before the first LZMA chunk.
+ */
+ bool need_props;
++
++#ifdef XZ_DEC_MICROLZMA
++ bool pedantic_microlzma;
++#endif
+ };
+
+ struct xz_dec_lzma2 {
+@@ -419,6 +423,12 @@ static void dict_uncompressed(struct dictionary *dict, struct xz_buf *b,
+ }
+ }
+
++#ifdef XZ_DEC_MICROLZMA
++# define DICT_FLUSH_SUPPORTS_SKIPPING true
++#else
++# define DICT_FLUSH_SUPPORTS_SKIPPING false
++#endif
++
+ /*
+ * Flush pending data from dictionary to b->out. It is assumed that there is
+ * enough space in b->out. This is guaranteed because caller uses dict_limit()
+@@ -437,9 +447,14 @@ static uint32_t dict_flush(struct dictionary *dict, struct xz_buf *b)
+ * decompression because in multi-call mode dict->buf
+ * has been allocated by us in this file; it's not
+ * provided by the caller like in single-call mode.
++ *
++ * With MicroLZMA, b->out can be NULL to skip bytes that
++ * the caller doesn't need. This cannot be done with XZ
++ * because it would break BCJ filters.
+ */
+- memcpy(b->out + b->out_pos, dict->buf + dict->start,
+- copy_size);
++ if (!DICT_FLUSH_SUPPORTS_SKIPPING || b->out != NULL)
++ memcpy(b->out + b->out_pos, dict->buf + dict->start,
++ copy_size);
+ }
+
+ dict->start = dict->pos;
+@@ -1190,3 +1205,140 @@ XZ_EXTERN void xz_dec_lzma2_end(struct xz_dec_lzma2 *s)
+
+ kfree(s);
+ }
++
++#ifdef XZ_DEC_MICROLZMA
++/* This is a wrapper struct to have a nice struct name in the public API. */
++struct xz_dec_microlzma {
++ struct xz_dec_lzma2 s;
++};
++
++enum xz_ret xz_dec_microlzma_run(struct xz_dec_microlzma *s_ptr,
++ struct xz_buf *b)
++{
++ struct xz_dec_lzma2 *s = &s_ptr->s;
++
++ /*
++ * sequence is SEQ_PROPERTIES before the first input byte,
++ * SEQ_LZMA_PREPARE until a total of five bytes have been read,
++ * and SEQ_LZMA_RUN for the rest of the input stream.
++ */
++ if (s->lzma2.sequence != SEQ_LZMA_RUN) {
++ if (s->lzma2.sequence == SEQ_PROPERTIES) {
++ /* One byte is needed for the props. */
++ if (b->in_pos >= b->in_size)
++ return XZ_OK;
++
++ /*
++ * Don't increment b->in_pos here. The same byte is
++ * also passed to rc_read_init() which will ignore it.
++ */
++ if (!lzma_props(s, ~b->in[b->in_pos]))
++ return XZ_DATA_ERROR;
++
++ s->lzma2.sequence = SEQ_LZMA_PREPARE;
++ }
++
++ /*
++ * xz_dec_microlzma_reset() doesn't validate the compressed
++ * size so we do it here. We have to limit the maximum size
++ * to avoid integer overflows in lzma2_lzma(). 3 GiB is a nice
++ * round number and much more than users of this code should
++ * ever need.
++ */
++ if (s->lzma2.compressed < RC_INIT_BYTES
++ || s->lzma2.compressed > (3U << 30))
++ return XZ_DATA_ERROR;
++
++ if (!rc_read_init(&s->rc, b))
++ return XZ_OK;
++
++ s->lzma2.compressed -= RC_INIT_BYTES;
++ s->lzma2.sequence = SEQ_LZMA_RUN;
++
++ dict_reset(&s->dict, b);
++ }
++
++ /* This is to allow increasing b->out_size between calls. */
++ if (DEC_IS_SINGLE(s->dict.mode))
++ s->dict.end = b->out_size - b->out_pos;
++
++ while (true) {
++ dict_limit(&s->dict, min_t(size_t, b->out_size - b->out_pos,
++ s->lzma2.uncompressed));
++
++ if (!lzma2_lzma(s, b))
++ return XZ_DATA_ERROR;
++
++ s->lzma2.uncompressed -= dict_flush(&s->dict, b);
++
++ if (s->lzma2.uncompressed == 0) {
++ if (s->lzma2.pedantic_microlzma) {
++ if (s->lzma2.compressed > 0 || s->lzma.len > 0
++ || !rc_is_finished(&s->rc))
++ return XZ_DATA_ERROR;
++ }
++
++ return XZ_STREAM_END;
++ }
++
++ if (b->out_pos == b->out_size)
++ return XZ_OK;
++
++ if (b->in_pos == b->in_size
++ && s->temp.size < s->lzma2.compressed)
++ return XZ_OK;
++ }
++}
++
++struct xz_dec_microlzma *xz_dec_microlzma_alloc(enum xz_mode mode,
++ uint32_t dict_size)
++{
++ struct xz_dec_microlzma *s;
++
++ /* Restrict dict_size to the same range as in the LZMA2 code. */
++ if (dict_size < 4096 || dict_size > (3U << 30))
++ return NULL;
++
++ s = kmalloc(sizeof(*s), GFP_KERNEL);
++ if (s == NULL)
++ return NULL;
++
++ s->s.dict.mode = mode;
++ s->s.dict.size = dict_size;
++
++ if (DEC_IS_MULTI(mode)) {
++ s->s.dict.end = dict_size;
++
++ s->s.dict.buf = vmalloc(dict_size);
++ if (s->s.dict.buf == NULL) {
++ kfree(s);
++ return NULL;
++ }
++ }
++
++ return s;
++}
++
++void xz_dec_microlzma_reset(struct xz_dec_microlzma *s, uint32_t comp_size,
++ uint32_t uncomp_size, int uncomp_size_is_exact)
++{
++ /*
++ * comp_size is validated in xz_dec_microlzma_run().
++ * uncomp_size can safely be anything.
++ */
++ s->s.lzma2.compressed = comp_size;
++ s->s.lzma2.uncompressed = uncomp_size;
++ s->s.lzma2.pedantic_microlzma = uncomp_size_is_exact;
++
++ s->s.lzma2.sequence = SEQ_PROPERTIES;
++ s->s.temp.size = 0;
++}
++
++void xz_dec_microlzma_end(struct xz_dec_microlzma *s)
++{
++ if (DEC_IS_MULTI(s->s.dict.mode))
++ vfree(s->s.dict.buf);
++
++ kfree(s);
++}
++#endif
+diff --git a/lib/xz/xz_dec_syms.c b/lib/xz/xz_dec_syms.c
+index 32eb3c03a..61098c67a 100644
+--- a/lib/xz/xz_dec_syms.c
++++ b/lib/xz/xz_dec_syms.c
+@@ -15,8 +15,15 @@ EXPORT_SYMBOL(xz_dec_reset);
+ EXPORT_SYMBOL(xz_dec_run);
+ EXPORT_SYMBOL(xz_dec_end);
+
++#ifdef CONFIG_XZ_DEC_MICROLZMA
++EXPORT_SYMBOL(xz_dec_microlzma_alloc);
++EXPORT_SYMBOL(xz_dec_microlzma_reset);
++EXPORT_SYMBOL(xz_dec_microlzma_run);
++EXPORT_SYMBOL(xz_dec_microlzma_end);
++#endif
++
+ MODULE_DESCRIPTION("XZ decompressor");
+-MODULE_VERSION("1.0");
++MODULE_VERSION("1.1");
+ MODULE_AUTHOR("Lasse Collin <lasse.collin@tukaani.org> and Igor Pavlov");
+
+ /*
+diff --git a/lib/xz/xz_private.h b/lib/xz/xz_private.h
+index 09360ebb5..bf1e94ec7 100644
+--- a/lib/xz/xz_private.h
++++ b/lib/xz/xz_private.h
+@@ -37,6 +37,9 @@
+ # ifdef CONFIG_XZ_DEC_SPARC
+ # define XZ_DEC_SPARC
+ # endif
++# ifdef CONFIG_XZ_DEC_MICROLZMA
++# define XZ_DEC_MICROLZMA
++# endif
+ # define memeq(a, b, size) (memcmp(a, b, size) == 0)
+ # define memzero(buf, size) memset(buf, 0, size)
+ # endif
+--
+2.32.0
+
--- /dev/null
+From 502b5b1841dfd177f43949df60b591f0e429e5be Mon Sep 17 00:00:00 2001
+From: Lasse Collin <lasse.collin@tukaani.org>
+Date: Wed, 22 Sep 2021 22:59:58 +0800
+Subject: [PATCH 1010/1012] lib/xz, lib/decompress_unxz.c: Fix spelling in
+ comments
+
+uncompressible -> incompressible
+non-splitted -> non-split
+
+Signed-off-by: Lasse Collin <lasse.collin@tukaani.org>
+Signed-off-by: Gao Xiang <hsiangkao@linux.alibaba.com>
+---
+ lib/decompress_unxz.c | 10 +++++-----
+ lib/xz/xz_dec_lzma2.c | 2 +-
+ 2 files changed, 6 insertions(+), 6 deletions(-)
+
+diff --git a/lib/decompress_unxz.c b/lib/decompress_unxz.c
+index f7a3dc133..9f4262ee3 100644
+--- a/lib/decompress_unxz.c
++++ b/lib/decompress_unxz.c
+@@ -20,8 +20,8 @@
+ *
+ * The worst case for in-place decompression is that the beginning of
+ * the file is compressed extremely well, and the rest of the file is
+- * uncompressible. Thus, we must look for worst-case expansion when the
+- * compressor is encoding uncompressible data.
++ * incompressible. Thus, we must look for worst-case expansion when the
++ * compressor is encoding incompressible data.
+ *
+ * The structure of the .xz file in case of a compressed kernel is as follows.
+ * Sizes (as bytes) of the fields are in parenthesis.
+@@ -58,7 +58,7 @@
+ * uncompressed size of the payload is in practice never less than the
+ * payload size itself. The LZMA2 format would allow uncompressed size
+ * to be less than the payload size, but no sane compressor creates such
+- * files. LZMA2 supports storing uncompressible data in uncompressed form,
++ * files. LZMA2 supports storing incompressible data in uncompressed form,
+ * so there's never a need to create payloads whose uncompressed size is
+ * smaller than the compressed size.
+ *
+@@ -167,8 +167,8 @@
+ * memeq and memzero are not used much and any remotely sane implementation
+ * is fast enough. memcpy/memmove speed matters in multi-call mode, but
+ * the kernel image is decompressed in single-call mode, in which only
+- * memmove speed can matter and only if there is a lot of uncompressible data
+- * (LZMA2 stores uncompressible chunks in uncompressed form). Thus, the
++ * memmove speed can matter and only if there is a lot of incompressible data
++ * (LZMA2 stores incompressible chunks in uncompressed form). Thus, the
+ * functions below should just be kept small; it's probably not worth
+ * optimizing for speed.
+ */
+diff --git a/lib/xz/xz_dec_lzma2.c b/lib/xz/xz_dec_lzma2.c
+index 46b186d7e..27ce34520 100644
+--- a/lib/xz/xz_dec_lzma2.c
++++ b/lib/xz/xz_dec_lzma2.c
+@@ -520,7 +520,7 @@ static __always_inline void rc_normalize(struct rc_dec *rc)
+ * functions so that the compiler is supposed to be able to more easily avoid
+ * an extra branch. In this particular version of the LZMA decoder, this
+ * doesn't seem to be a good idea (tested with GCC 3.3.6, 3.4.6, and 4.3.3
+- * on x86). Using a non-splitted version results in nicer looking code too.
++ * on x86). Using a non-split version results in nicer looking code too.
+ *
+ * NOTE: This must return an int. Do not make it return a bool or the speed
+ * of the code generated by GCC 3.x decreases 10-15 %. (GCC 4.3 doesn't care,
+--
+2.32.0
+
--- /dev/null
+From 7a6d16b5e3a16262873dd30dbf81944f53466dc9 Mon Sep 17 00:00:00 2001
+From: Gao Xiang <hsiangkao@linux.alibaba.com>
+Date: Mon, 11 Oct 2021 05:31:44 +0800
+Subject: [PATCH 1011/1012] erofs: rename some generic methods in decompressor
+
+Previously, some LZ4 methods were named with `generic'. However, while
+evaluating the effective LZMA approach, it seems they aren't quite
+generic at all (e.g. no need preparing dstpages for most LZMA cases.)
+
+Avoid such naming instead.
+
+Signed-off-by: Gao Xiang <hsiangkao@linux.alibaba.com>
+---
+ fs/erofs/decompressor.c | 63 ++++++++++++++++++++---------------------
+ 1 file changed, 30 insertions(+), 33 deletions(-)
+
+diff --git a/fs/erofs/decompressor.c b/fs/erofs/decompressor.c
+index dce06ac61..8fd7af9d6 100644
+--- a/fs/erofs/decompressor.c
++++ b/fs/erofs/decompressor.c
+@@ -17,13 +17,8 @@
+ #endif
+
+ struct z_erofs_decompressor {
+- /*
+- * if destpages have sparsed pages, fill them with bounce pages.
+- * it also check whether destpages indicate continuous physical memory.
+- */
+- int (*prepare_destpages)(struct z_erofs_decompress_req *rq,
+- struct list_head *pagepool);
+- int (*decompress)(struct z_erofs_decompress_req *rq, u8 *out);
++ int (*decompress)(struct z_erofs_decompress_req *rq,
++ struct list_head *pagepool);
+ char *name;
+ };
+
+@@ -63,8 +58,12 @@ int z_erofs_load_lz4_config(struct super_block *sb,
+ return erofs_pcpubuf_growsize(sbi->lz4.max_pclusterblks);
+ }
+
+-static int z_erofs_lz4_prepare_destpages(struct z_erofs_decompress_req *rq,
+- struct list_head *pagepool)
++/*
++ * Fill all gaps with bounce pages if it's a sparse page list. Also check if
++ * all physical pages are consecutive, which can be seen for moderate CR.
++ */
++static int z_erofs_lz4_prepare_dstpages(struct z_erofs_decompress_req *rq,
++ struct list_head *pagepool)
+ {
+ const unsigned int nr =
+ PAGE_ALIGN(rq->pageofs_out + rq->outputsize) >> PAGE_SHIFT;
+@@ -119,7 +118,7 @@ static int z_erofs_lz4_prepare_destpages(struct z_erofs_decompress_req *rq,
+ return kaddr ? 1 : 0;
+ }
+
+-static void *z_erofs_handle_inplace_io(struct z_erofs_decompress_req *rq,
++static void *z_erofs_lz4_handle_inplace_io(struct z_erofs_decompress_req *rq,
+ void *inpage, unsigned int *inputmargin, int *maptype,
+ bool support_0padding)
+ {
+@@ -189,7 +188,8 @@ static void *z_erofs_handle_inplace_io(struct z_erofs_decompress_req *rq,
+ return src;
+ }
+
+-static int z_erofs_lz4_decompress(struct z_erofs_decompress_req *rq, u8 *out)
++static int z_erofs_lz4_decompress_mem(struct z_erofs_decompress_req *rq,
++ u8 *out)
+ {
+ unsigned int inputmargin;
+ u8 *headpage, *src;
+@@ -216,8 +216,8 @@ static int z_erofs_lz4_decompress(struct z_erofs_decompress_req *rq, u8 *out)
+ }
+
+ rq->inputsize -= inputmargin;
+- src = z_erofs_handle_inplace_io(rq, headpage, &inputmargin, &maptype,
+- support_0padding);
++ src = z_erofs_lz4_handle_inplace_io(rq, headpage, &inputmargin,
++ &maptype, support_0padding);
+ if (IS_ERR(src))
+ return PTR_ERR(src);
+
+@@ -259,23 +259,11 @@ static int z_erofs_lz4_decompress(struct z_erofs_decompress_req *rq, u8 *out)
+ return ret;
+ }
+
+-static struct z_erofs_decompressor decompressors[] = {
+- [Z_EROFS_COMPRESSION_SHIFTED] = {
+- .name = "shifted"
+- },
+- [Z_EROFS_COMPRESSION_LZ4] = {
+- .prepare_destpages = z_erofs_lz4_prepare_destpages,
+- .decompress = z_erofs_lz4_decompress,
+- .name = "lz4"
+- },
+-};
+-
+-static int z_erofs_decompress_generic(struct z_erofs_decompress_req *rq,
+- struct list_head *pagepool)
++static int z_erofs_lz4_decompress(struct z_erofs_decompress_req *rq,
++ struct list_head *pagepool)
+ {
+ const unsigned int nrpages_out =
+ PAGE_ALIGN(rq->pageofs_out + rq->outputsize) >> PAGE_SHIFT;
+- const struct z_erofs_decompressor *alg = decompressors + rq->alg;
+ unsigned int dst_maptype;
+ void *dst;
+ int ret;
+@@ -289,7 +277,7 @@ static int z_erofs_decompress_generic(struct z_erofs_decompress_req *rq,
+ }
+
+ /* general decoding path which can be used for all cases */
+- ret = alg->prepare_destpages(rq, pagepool);
++ ret = z_erofs_lz4_prepare_dstpages(rq, pagepool);
+ if (ret < 0)
+ return ret;
+ if (ret) {
+@@ -304,7 +292,7 @@ static int z_erofs_decompress_generic(struct z_erofs_decompress_req *rq,
+ dst_maptype = 2;
+
+ dstmap_out:
+- ret = alg->decompress(rq, dst + rq->pageofs_out);
++ ret = z_erofs_lz4_decompress_mem(rq, dst + rq->pageofs_out);
+
+ if (!dst_maptype)
+ kunmap_atomic(dst);
+@@ -313,7 +301,7 @@ static int z_erofs_decompress_generic(struct z_erofs_decompress_req *rq,
+ return ret;
+ }
+
+-static int z_erofs_shifted_transform(const struct z_erofs_decompress_req *rq,
++static int z_erofs_shifted_transform(struct z_erofs_decompress_req *rq,
+ struct list_head *pagepool)
+ {
+ const unsigned int nrpages_out =
+@@ -352,10 +340,19 @@ static int z_erofs_shifted_transform(const struct z_erofs_decompress_req *rq,
+ return 0;
+ }
+
++static struct z_erofs_decompressor decompressors[] = {
++ [Z_EROFS_COMPRESSION_SHIFTED] = {
++ .decompress = z_erofs_shifted_transform,
++ .name = "shifted"
++ },
++ [Z_EROFS_COMPRESSION_LZ4] = {
++ .decompress = z_erofs_lz4_decompress,
++ .name = "lz4"
++ },
++};
++
+ int z_erofs_decompress(struct z_erofs_decompress_req *rq,
+ struct list_head *pagepool)
+ {
+- if (rq->alg == Z_EROFS_COMPRESSION_SHIFTED)
+- return z_erofs_shifted_transform(rq, pagepool);
+- return z_erofs_decompress_generic(rq, pagepool);
++ return decompressors[rq->alg].decompress(rq, pagepool);
+ }
+--
+2.32.0
+
--- /dev/null
+From 8f760def65c931128f681b61d9c3fdb8469246d5 Mon Sep 17 00:00:00 2001
+From: Gao Xiang <hsiangkao@linux.alibaba.com>
+Date: Mon, 11 Oct 2021 05:31:45 +0800
+Subject: [PATCH 1012/1012] erofs: lzma compression support
+
+Add MicroLZMA support in order to maximize compression ratios for
+specific scenarios. For example, it's useful for low-end embedded
+boards and as a secondary algorithm in a file for specific access
+patterns.
+
+MicroLZMA is a new container format for raw LZMA1, which was created
+by Lasse Collin aiming to minimize old LZMA headers and get rid of
+unnecessary EOPM (end of payload marker) as well as to enable
+fixed-sized output compression, especially for 4KiB pclusters.
+
+Similar to LZ4, inplace I/O approach is used to minimize runtime
+memory footprint when dealing with I/O. Overlapped decompression is
+handled with 1) bounced buffer for data under processing or 2) extra
+short-lived pages from the on-stack pagepool which will be shared in
+the same read request (128KiB for example).
+
+Signed-off-by: Gao Xiang <hsiangkao@linux.alibaba.com>
+---
+ fs/erofs/Kconfig | 16 ++
+ fs/erofs/Makefile | 1 +
+ fs/erofs/compress.h | 16 ++
+ fs/erofs/decompressor.c | 12 +-
+ fs/erofs/decompressor_lzma.c | 290 +++++++++++++++++++++++++++++++++++
+ fs/erofs/erofs_fs.h | 14 +-
+ fs/erofs/internal.h | 22 +++
+ fs/erofs/super.c | 17 +-
+ fs/erofs/zdata.c | 4 +-
+ fs/erofs/zdata.h | 7 -
+ fs/erofs/zmap.c | 5 +-
+ 11 files changed, 383 insertions(+), 21 deletions(-)
+ create mode 100644 fs/erofs/decompressor_lzma.c
+
+diff --git a/fs/erofs/Kconfig b/fs/erofs/Kconfig
+index addfe608d..f57255ab8 100644
+--- a/fs/erofs/Kconfig
++++ b/fs/erofs/Kconfig
+@@ -82,3 +82,19 @@ config EROFS_FS_ZIP
+ Enable fixed-sized output compression for EROFS.
+
+ If you don't want to enable compression feature, say N.
++
++config EROFS_FS_ZIP_LZMA
++ bool "EROFS LZMA compressed data support"
++ depends on EROFS_FS_ZIP
++ select XZ_DEC
++ select XZ_DEC_MICROLZMA
++ help
++ Saying Y here includes support for reading EROFS file systems
++ containing LZMA compressed data, specifically called microLZMA. It
++ gives better compression ratios than the LZ4 algorithm, at the
++ expense of more CPU overhead.
++
++ LZMA support is an experimental feature for now and so most file
++ systems will be readable without selecting this option.
++
++ If unsure, say N.
+diff --git a/fs/erofs/Makefile b/fs/erofs/Makefile
+index 1f9aced49..756fe2d65 100644
+--- a/fs/erofs/Makefile
++++ b/fs/erofs/Makefile
+@@ -4,3 +4,4 @@ obj-$(CONFIG_EROFS_FS) += erofs.o
+ erofs-objs := super.o inode.o data.o namei.o dir.o utils.o pcpubuf.o
+ erofs-$(CONFIG_EROFS_FS_XATTR) += xattr.o
+ erofs-$(CONFIG_EROFS_FS_ZIP) += decompressor.o zmap.o zdata.o
++erofs-$(CONFIG_EROFS_FS_ZIP_LZMA) += decompressor_lzma.o
+diff --git a/fs/erofs/compress.h b/fs/erofs/compress.h
+index ad62d1b4d..8ea6a9b14 100644
+--- a/fs/erofs/compress.h
++++ b/fs/erofs/compress.h
+@@ -20,6 +20,12 @@ struct z_erofs_decompress_req {
+ bool inplace_io, partial_decoding;
+ };
+
++struct z_erofs_decompressor {
++ int (*decompress)(struct z_erofs_decompress_req *rq,
++ struct list_head *pagepool);
++ char *name;
++};
++
+ /* some special page->private (unsigned long, see below) */
+ #define Z_EROFS_SHORTLIVED_PAGE (-1UL << 2)
+ #define Z_EROFS_PREALLOCATED_PAGE (-2UL << 2)
+@@ -75,7 +81,17 @@ static inline bool z_erofs_put_shortlivedpage(struct list_head *pagepool,
+ return true;
+ }
+
++#define MNGD_MAPPING(sbi) ((sbi)->managed_cache->i_mapping)
++static inline bool erofs_page_is_managed(const struct erofs_sb_info *sbi,
++ struct page *page)
++{
++ return page->mapping == MNGD_MAPPING(sbi);
++}
++
+ int z_erofs_decompress(struct z_erofs_decompress_req *rq,
+ struct list_head *pagepool);
+
++/* prototypes for specific algorithms */
++int z_erofs_lzma_decompress(struct z_erofs_decompress_req *rq,
++ struct list_head *pagepool);
+ #endif
+diff --git a/fs/erofs/decompressor.c b/fs/erofs/decompressor.c
+index 8fd7af9d6..8a624d73c 100644
+--- a/fs/erofs/decompressor.c
++++ b/fs/erofs/decompressor.c
+@@ -16,12 +16,6 @@
+ #define LZ4_DECOMPRESS_INPLACE_MARGIN(srcsize) (((srcsize) >> 8) + 32)
+ #endif
+
+-struct z_erofs_decompressor {
+- int (*decompress)(struct z_erofs_decompress_req *rq,
+- struct list_head *pagepool);
+- char *name;
+-};
+-
+ int z_erofs_load_lz4_config(struct super_block *sb,
+ struct erofs_super_block *dsb,
+ struct z_erofs_lz4_cfgs *lz4, int size)
+@@ -349,6 +343,12 @@ static struct z_erofs_decompressor decompressors[] = {
+ .decompress = z_erofs_lz4_decompress,
+ .name = "lz4"
+ },
++#ifdef CONFIG_EROFS_FS_ZIP_LZMA
++ [Z_EROFS_COMPRESSION_LZMA] = {
++ .decompress = z_erofs_lzma_decompress,
++ .name = "lzma"
++ },
++#endif
+ };
+
+ int z_erofs_decompress(struct z_erofs_decompress_req *rq,
+diff --git a/fs/erofs/decompressor_lzma.c b/fs/erofs/decompressor_lzma.c
+new file mode 100644
+index 000000000..bd7d9809e
+--- /dev/null
++++ b/fs/erofs/decompressor_lzma.c
+@@ -0,0 +1,290 @@
++// SPDX-License-Identifier: GPL-2.0-or-later
++#include <linux/xz.h>
++#include <linux/module.h>
++#include "compress.h"
++
++struct z_erofs_lzma {
++ struct z_erofs_lzma *next;
++ struct xz_dec_microlzma *state;
++ struct xz_buf buf;
++ u8 bounce[PAGE_SIZE];
++};
++
++/* considering the LZMA performance, no need to use a lockless list for now */
++static DEFINE_SPINLOCK(z_erofs_lzma_lock);
++static unsigned int z_erofs_lzma_max_dictsize;
++static unsigned int z_erofs_lzma_nstrms, z_erofs_lzma_avail_strms;
++static struct z_erofs_lzma *z_erofs_lzma_head;
++static DECLARE_WAIT_QUEUE_HEAD(z_erofs_lzma_wq);
++
++module_param_named(lzma_streams, z_erofs_lzma_nstrms, uint, 0444);
++
++void z_erofs_lzma_exit(void)
++{
++ /* there should be no running fs instance */
++ while (z_erofs_lzma_avail_strms) {
++ struct z_erofs_lzma *strm;
++
++ spin_lock(&z_erofs_lzma_lock);
++ strm = z_erofs_lzma_head;
++ if (!strm) {
++ spin_unlock(&z_erofs_lzma_lock);
++ DBG_BUGON(1);
++ return;
++ }
++ z_erofs_lzma_head = NULL;
++ spin_unlock(&z_erofs_lzma_lock);
++
++ while (strm) {
++ struct z_erofs_lzma *n = strm->next;
++
++ if (strm->state)
++ xz_dec_microlzma_end(strm->state);
++ kfree(strm);
++ --z_erofs_lzma_avail_strms;
++ strm = n;
++ }
++ }
++}
++
++int z_erofs_lzma_init(void)
++{
++ unsigned int i;
++
++ /* by default, use # of possible CPUs instead */
++ if (!z_erofs_lzma_nstrms)
++ z_erofs_lzma_nstrms = num_possible_cpus();
++
++ for (i = 0; i < z_erofs_lzma_nstrms; ++i) {
++ struct z_erofs_lzma *strm = kzalloc(sizeof(*strm), GFP_KERNEL);
++
++ if (!strm) {
++ z_erofs_lzma_exit();
++ return -ENOMEM;
++ }
++ spin_lock(&z_erofs_lzma_lock);
++ strm->next = z_erofs_lzma_head;
++ z_erofs_lzma_head = strm;
++ spin_unlock(&z_erofs_lzma_lock);
++ ++z_erofs_lzma_avail_strms;
++ }
++ return 0;
++}
++
++int z_erofs_load_lzma_config(struct super_block *sb,
++ struct erofs_super_block *dsb,
++ struct z_erofs_lzma_cfgs *lzma, int size)
++{
++ static DEFINE_MUTEX(lzma_resize_mutex);
++ unsigned int dict_size, i;
++ struct z_erofs_lzma *strm, *head = NULL;
++ int err;
++
++ if (!lzma || size < sizeof(struct z_erofs_lzma_cfgs)) {
++ erofs_err(sb, "invalid lzma cfgs, size=%u", size);
++ return -EINVAL;
++ }
++ if (lzma->format) {
++ erofs_err(sb, "unidentified lzma format %x, please check kernel version",
++ le16_to_cpu(lzma->format));
++ return -EINVAL;
++ }
++ dict_size = le32_to_cpu(lzma->dict_size);
++ if (dict_size > Z_EROFS_LZMA_MAX_DICT_SIZE || dict_size < 4096) {
++ erofs_err(sb, "unsupported lzma dictionary size %u",
++ dict_size);
++ return -EINVAL;
++ }
++
++ erofs_info(sb, "EXPERIMENTAL MicroLZMA in use. Use at your own risk!");
++
++ /* in case 2 z_erofs_load_lzma_config() race to avoid deadlock */
++ mutex_lock(&lzma_resize_mutex);
++
++ if (z_erofs_lzma_max_dictsize >= dict_size) {
++ mutex_unlock(&lzma_resize_mutex);
++ return 0;
++ }
++
++ /* 1. collect/isolate all streams for the following check */
++ for (i = 0; i < z_erofs_lzma_avail_strms; ++i) {
++ struct z_erofs_lzma *last;
++
++again:
++ spin_lock(&z_erofs_lzma_lock);
++ strm = z_erofs_lzma_head;
++ if (!strm) {
++ spin_unlock(&z_erofs_lzma_lock);
++ wait_event(z_erofs_lzma_wq,
++ READ_ONCE(z_erofs_lzma_head));
++ goto again;
++ }
++ z_erofs_lzma_head = NULL;
++ spin_unlock(&z_erofs_lzma_lock);
++
++ for (last = strm; last->next; last = last->next)
++ ++i;
++ last->next = head;
++ head = strm;
++ }
++
++ err = 0;
++ /* 2. walk each isolated stream and grow max dict_size if needed */
++ for (strm = head; strm; strm = strm->next) {
++ if (strm->state)
++ xz_dec_microlzma_end(strm->state);
++ strm->state = xz_dec_microlzma_alloc(XZ_PREALLOC, dict_size);
++ if (!strm->state)
++ err = -ENOMEM;
++ }
++
++ /* 3. push back all to the global list and update max dict_size */
++ spin_lock(&z_erofs_lzma_lock);
++ DBG_BUGON(z_erofs_lzma_head);
++ z_erofs_lzma_head = head;
++ spin_unlock(&z_erofs_lzma_lock);
++
++ z_erofs_lzma_max_dictsize = dict_size;
++ mutex_unlock(&lzma_resize_mutex);
++ return err;
++}
++
++int z_erofs_lzma_decompress(struct z_erofs_decompress_req *rq,
++ struct list_head *pagepool)
++{
++ const unsigned int nrpages_out =
++ PAGE_ALIGN(rq->pageofs_out + rq->outputsize) >> PAGE_SHIFT;
++ const unsigned int nrpages_in =
++ PAGE_ALIGN(rq->inputsize) >> PAGE_SHIFT;
++ unsigned int inputmargin, inlen, outlen, pageofs;
++ struct z_erofs_lzma *strm;
++ u8 *kin;
++ bool bounced = false;
++ int no, ni, j, err = 0;
++
++ /* 1. get the exact LZMA compressed size */
++ kin = kmap(*rq->in);
++ inputmargin = 0;
++ while (!kin[inputmargin & ~PAGE_MASK])
++ if (!(++inputmargin & ~PAGE_MASK))
++ break;
++
++ if (inputmargin >= PAGE_SIZE) {
++ kunmap(*rq->in);
++ return -EFSCORRUPTED;
++ }
++ rq->inputsize -= inputmargin;
++
++ /* 2. get an available lzma context */
++again:
++ spin_lock(&z_erofs_lzma_lock);
++ strm = z_erofs_lzma_head;
++ if (!strm) {
++ spin_unlock(&z_erofs_lzma_lock);
++ wait_event(z_erofs_lzma_wq, READ_ONCE(z_erofs_lzma_head));
++ goto again;
++ }
++ z_erofs_lzma_head = strm->next;
++ spin_unlock(&z_erofs_lzma_lock);
++
++ /* 3. multi-call decompress */
++ inlen = rq->inputsize;
++ outlen = rq->outputsize;
++ xz_dec_microlzma_reset(strm->state, inlen, outlen,
++ !rq->partial_decoding);
++ pageofs = rq->pageofs_out;
++ strm->buf.in = kin + inputmargin;
++ strm->buf.in_pos = 0;
++ strm->buf.in_size = min_t(u32, inlen, PAGE_SIZE - inputmargin);
++ inlen -= strm->buf.in_size;
++ strm->buf.out = NULL;
++ strm->buf.out_pos = 0;
++ strm->buf.out_size = 0;
++
++ for (ni = 0, no = -1;;) {
++ enum xz_ret xz_err;
++
++ if (strm->buf.out_pos == strm->buf.out_size) {
++ if (strm->buf.out) {
++ kunmap(rq->out[no]);
++ strm->buf.out = NULL;
++ }
++
++ if (++no >= nrpages_out || !outlen) {
++ erofs_err(rq->sb, "decompressed buf out of bound");
++ err = -EFSCORRUPTED;
++ break;
++ }
++ strm->buf.out_pos = 0;
++ strm->buf.out_size = min_t(u32, outlen,
++ PAGE_SIZE - pageofs);
++ outlen -= strm->buf.out_size;
++ if (rq->out[no])
++ strm->buf.out = kmap(rq->out[no]) + pageofs;
++ pageofs = 0;
++ } else if (strm->buf.in_pos == strm->buf.in_size) {
++ kunmap(rq->in[ni]);
++
++ if (++ni >= nrpages_in || !inlen) {
++ erofs_err(rq->sb, "compressed buf out of bound");
++ err = -EFSCORRUPTED;
++ break;
++ }
++ strm->buf.in_pos = 0;
++ strm->buf.in_size = min_t(u32, inlen, PAGE_SIZE);
++ inlen -= strm->buf.in_size;
++ kin = kmap(rq->in[ni]);
++ strm->buf.in = kin;
++ bounced = false;
++ }
++
++ /*
++ * Handle overlapping: Use bounced buffer if the compressed
++ * data is under processing; Otherwise, Use short-lived pages
++ * from the on-stack pagepool where pages share with the same
++ * request.
++ */
++ if (!bounced && rq->out[no] == rq->in[ni]) {
++ memcpy(strm->bounce, strm->buf.in, strm->buf.in_size);
++ strm->buf.in = strm->bounce;
++ bounced = true;
++ }
++ for (j = ni + 1; j < nrpages_in; ++j) {
++ struct page *tmppage;
++
++ if (rq->out[no] != rq->in[j])
++ continue;
++
++ DBG_BUGON(erofs_page_is_managed(EROFS_SB(rq->sb),
++ rq->in[j]));
++ tmppage = erofs_allocpage(pagepool,
++ GFP_KERNEL | __GFP_NOFAIL);
++ set_page_private(tmppage, Z_EROFS_SHORTLIVED_PAGE);
++ copy_highpage(tmppage, rq->in[j]);
++ rq->in[j] = tmppage;
++ }
++ xz_err = xz_dec_microlzma_run(strm->state, &strm->buf);
++ DBG_BUGON(strm->buf.out_pos > strm->buf.out_size);
++ DBG_BUGON(strm->buf.in_pos > strm->buf.in_size);
++
++ if (xz_err != XZ_OK) {
++ if (xz_err == XZ_STREAM_END && !outlen)
++ break;
++ erofs_err(rq->sb, "failed to decompress %d in[%u] out[%u]",
++ xz_err, rq->inputsize, rq->outputsize);
++ err = -EFSCORRUPTED;
++ break;
++ }
++ }
++ if (no < nrpages_out && strm->buf.out)
++ kunmap(rq->out[no]);
++ if (ni < nrpages_in)
++ kunmap(rq->in[ni]);
++ /* 4. push back LZMA stream context to the global list */
++ spin_lock(&z_erofs_lzma_lock);
++ strm->next = z_erofs_lzma_head;
++ z_erofs_lzma_head = strm;
++ spin_unlock(&z_erofs_lzma_lock);
++ wake_up(&z_erofs_lzma_wq);
++ return err;
++}
+diff --git a/fs/erofs/erofs_fs.h b/fs/erofs/erofs_fs.h
+index 87736cbf1..6ff1e515c 100644
+--- a/fs/erofs/erofs_fs.h
++++ b/fs/erofs/erofs_fs.h
+@@ -264,10 +264,11 @@ struct erofs_inode_chunk_index {
+
+ /* available compression algorithm types (for h_algorithmtype) */
+ enum {
+- Z_EROFS_COMPRESSION_LZ4 = 0,
++ Z_EROFS_COMPRESSION_LZ4 = 0,
++ Z_EROFS_COMPRESSION_LZMA = 1,
+ Z_EROFS_COMPRESSION_MAX
+ };
+-#define Z_EROFS_ALL_COMPR_ALGS (1 << (Z_EROFS_COMPRESSION_MAX - 1))
++#define Z_EROFS_ALL_COMPR_ALGS ((1 << Z_EROFS_COMPRESSION_MAX) - 1)
+
+ /* 14 bytes (+ length field = 16 bytes) */
+ struct z_erofs_lz4_cfgs {
+@@ -276,6 +277,15 @@ struct z_erofs_lz4_cfgs {
+ u8 reserved[10];
+ } __packed;
+
++/* 14 bytes (+ length field = 16 bytes) */
++struct z_erofs_lzma_cfgs {
++ __le32 dict_size;
++ __le16 format;
++ u8 reserved[8];
++} __packed;
++
++#define Z_EROFS_LZMA_MAX_DICT_SIZE (8 * Z_EROFS_PCLUSTER_MAX_SIZE)
++
+ /*
+ * bit 0 : COMPACTED_2B indexes (0 - off; 1 - on)
+ * e.g. for 4k logical cluster size, 4B if compacted 2B is off;
+diff --git a/fs/erofs/internal.h b/fs/erofs/internal.h
+index 354ce3cb2..a6a53d22d 100644
+--- a/fs/erofs/internal.h
++++ b/fs/erofs/internal.h
+@@ -407,6 +407,8 @@ struct erofs_map_blocks {
+ * approach instead if possible since it's more metadata lightweight.)
+ */
+ #define EROFS_GET_BLOCKS_FIEMAP 0x0002
++/* Used to map the whole extent if non-negligible data is requested for LZMA */
++#define EROFS_GET_BLOCKS_READMORE 0x0004
+
+ enum {
+ Z_EROFS_COMPRESSION_SHIFTED = Z_EROFS_COMPRESSION_MAX,
+@@ -537,6 +539,26 @@ static inline int z_erofs_load_lz4_config(struct super_block *sb,
+ }
+ #endif /* !CONFIG_EROFS_FS_ZIP */
+
++#ifdef CONFIG_EROFS_FS_ZIP_LZMA
++int z_erofs_lzma_init(void);
++void z_erofs_lzma_exit(void);
++int z_erofs_load_lzma_config(struct super_block *sb,
++ struct erofs_super_block *dsb,
++ struct z_erofs_lzma_cfgs *lzma, int size);
++#else
++static inline int z_erofs_lzma_init(void) { return 0; }
++static inline int z_erofs_lzma_exit(void) { return 0; }
++static inline int z_erofs_load_lzma_config(struct super_block *sb,
++ struct erofs_super_block *dsb,
++ struct z_erofs_lzma_cfgs *lzma, int size) {
++ if (lzma) {
++ erofs_err(sb, "lzma algorithm isn't enabled");
++ return -EINVAL;
++ }
++ return 0;
++}
++#endif	/* !CONFIG_EROFS_FS_ZIP_LZMA */
++
+ #define EFSCORRUPTED EUCLEAN /* Filesystem is corrupted */
+
+ #endif /* __EROFS_INTERNAL_H */
+diff --git a/fs/erofs/super.c b/fs/erofs/super.c
+index 2cfe1ce0f..6a969b1e0 100644
+--- a/fs/erofs/super.c
++++ b/fs/erofs/super.c
+@@ -225,6 +225,9 @@ static int erofs_load_compr_cfgs(struct super_block *sb,
+ case Z_EROFS_COMPRESSION_LZ4:
+ ret = z_erofs_load_lz4_config(sb, dsb, data, size);
+ break;
++ case Z_EROFS_COMPRESSION_LZMA:
++ ret = z_erofs_load_lzma_config(sb, dsb, data, size);
++ break;
+ default:
+ DBG_BUGON(1);
+ ret = -EFAULT;
+@@ -840,6 +843,10 @@ static int __init erofs_module_init(void)
+ if (err)
+ goto shrinker_err;
+
++ err = z_erofs_lzma_init();
++ if (err)
++ goto lzma_err;
++
+ erofs_pcpubuf_init();
+ err = z_erofs_init_zip_subsystem();
+ if (err)
+@@ -854,6 +861,8 @@ static int __init erofs_module_init(void)
+ fs_err:
+ z_erofs_exit_zip_subsystem();
+ zip_err:
++ z_erofs_lzma_exit();
++lzma_err:
+ erofs_exit_shrinker();
+ shrinker_err:
+ kmem_cache_destroy(erofs_inode_cachep);
+@@ -864,11 +873,13 @@ static int __init erofs_module_init(void)
+ static void __exit erofs_module_exit(void)
+ {
+ unregister_filesystem(&erofs_fs_type);
+- z_erofs_exit_zip_subsystem();
+- erofs_exit_shrinker();
+
+- /* Ensure all RCU free inodes are safe before cache is destroyed. */
++ /* Ensure all RCU free inodes / pclusters are safe to be destroyed. */
+ rcu_barrier();
++
++ z_erofs_exit_zip_subsystem();
++ z_erofs_lzma_exit();
++ erofs_exit_shrinker();
+ kmem_cache_destroy(erofs_inode_cachep);
+ erofs_pcpubuf_exit();
+ }
+diff --git a/fs/erofs/zdata.c b/fs/erofs/zdata.c
+index 98d3bd25d..d55e6215c 100644
+--- a/fs/erofs/zdata.c
++++ b/fs/erofs/zdata.c
+@@ -1404,8 +1404,8 @@ static void z_erofs_pcluster_readmore(struct z_erofs_decompress_frontend *f,
+
+ if (backmost) {
+ map->m_la = end;
+- /* TODO: pass in EROFS_GET_BLOCKS_READMORE for LZMA later */
+- err = z_erofs_map_blocks_iter(inode, map, 0);
++ err = z_erofs_map_blocks_iter(inode, map,
++ EROFS_GET_BLOCKS_READMORE);
+ if (err)
+ return;
+
+diff --git a/fs/erofs/zdata.h b/fs/erofs/zdata.h
+index 3a008f1b9..879df5362 100644
+--- a/fs/erofs/zdata.h
++++ b/fs/erofs/zdata.h
+@@ -94,13 +94,6 @@ struct z_erofs_decompressqueue {
+ } u;
+ };
+
+-#define MNGD_MAPPING(sbi) ((sbi)->managed_cache->i_mapping)
+-static inline bool erofs_page_is_managed(const struct erofs_sb_info *sbi,
+- struct page *page)
+-{
+- return page->mapping == MNGD_MAPPING(sbi);
+-}
+-
+ #define Z_EROFS_ONLINEPAGE_COUNT_BITS 2
+ #define Z_EROFS_ONLINEPAGE_COUNT_MASK ((1 << Z_EROFS_ONLINEPAGE_COUNT_BITS) - 1)
+ #define Z_EROFS_ONLINEPAGE_INDEX_SHIFT (Z_EROFS_ONLINEPAGE_COUNT_BITS)
+diff --git a/fs/erofs/zmap.c b/fs/erofs/zmap.c
+index 85d028942..660489a7f 100644
+--- a/fs/erofs/zmap.c
++++ b/fs/erofs/zmap.c
+@@ -672,7 +672,10 @@ int z_erofs_map_blocks_iter(struct inode *inode,
+ else
+ map->m_algorithmformat = vi->z_algorithmtype[0];
+
+- if (flags & EROFS_GET_BLOCKS_FIEMAP) {
++ if ((flags & EROFS_GET_BLOCKS_FIEMAP) ||
++ ((flags & EROFS_GET_BLOCKS_READMORE) &&
++ map->m_algorithmformat == Z_EROFS_COMPRESSION_LZMA &&
++ map->m_llen >= EROFS_BLKSIZ)) {
+ err = z_erofs_get_extent_decompressedlen(&m);
+ if (!err)
+ map->m_flags |= EROFS_MAP_FULL_MAPPED;
+--
+2.32.0
+