// SPDX-License-Identifier: GPL-2.0-only
/*
 * Module for pnfs flexfile layout driver.
 *
 * Copyright (c) 2014, Primary Data, Inc. All rights reserved.
 *
 * Tao Peng <bergwolf@primarydata.com>
 */

#include <linux/nfs_fs.h>
#include <linux/nfs_mount.h>
#include <linux/nfs_page.h>
#include <linux/module.h>
#include <linux/sched/mm.h>

#include <linux/sunrpc/metrics.h>

#include "flexfilelayout.h"
#include "../nfs4session.h"
#include "../nfs4idmap.h"
#include "../internal.h"
#include "../delegation.h"
#include "../nfs4trace.h"
#include "../iostat.h"
#include "../nfs.h"
#include "../nfs42.h"

#define NFSDBG_FACILITY         NFSDBG_PNFS_LD

#define FF_LAYOUT_POLL_RETRY_MAX     (15*HZ)
#define FF_LAYOUTRETURN_MAXERR 20

enum nfs4_ff_op_type {
	NFS4_FF_OP_LAYOUTSTATS,
	NFS4_FF_OP_LAYOUTRETURN,
};

static unsigned short io_maxretrans;

static const struct pnfs_commit_ops ff_layout_commit_ops;
static void ff_layout_read_record_layoutstats_done(struct rpc_task *task,
		struct nfs_pgio_header *hdr);
static int
ff_layout_mirror_prepare_stats(struct pnfs_layout_hdr *lo,
			       struct nfs42_layoutstat_devinfo *devinfo,
			       int dev_limit, enum nfs4_ff_op_type type);
static void ff_layout_encode_ff_layoutupdate(struct xdr_stream *xdr,
			      const struct nfs42_layoutstat_devinfo *devinfo,
			      struct nfs4_ff_layout_mirror *mirror);

static struct pnfs_layout_hdr *
ff_layout_alloc_layout_hdr(struct inode *inode, gfp_t gfp_flags)
{
	struct nfs4_flexfile_layout *ffl;

	ffl = kzalloc(sizeof(*ffl), gfp_flags);
	if (ffl) {
		pnfs_init_ds_commit_info(&ffl->commit_info);
		INIT_LIST_HEAD(&ffl->error_list);
		INIT_LIST_HEAD(&ffl->mirrors);
		ffl->last_report_time = ktime_get();
		ffl->commit_info.ops = &ff_layout_commit_ops;
		return &ffl->generic_hdr;
	} else
		return NULL;
}

static void
ff_layout_free_layout_hdr(struct pnfs_layout_hdr *lo)
{
	struct nfs4_flexfile_layout *ffl = FF_LAYOUT_FROM_HDR(lo);
	struct nfs4_ff_layout_ds_err *err, *n;

	list_for_each_entry_safe(err, n, &ffl->error_list, list) {
		list_del(&err->list);
		kfree(err);
	}
	kfree_rcu(ffl, generic_hdr.plh_rcu);
}

static int decode_pnfs_stateid(struct xdr_stream *xdr, nfs4_stateid *stateid)
{
	__be32 *p;

	p = xdr_inline_decode(xdr, NFS4_STATEID_SIZE);
	if (unlikely(p == NULL))
		return -ENOBUFS;
	stateid->type = NFS4_PNFS_DS_STATEID_TYPE;
	memcpy(stateid->data, p, NFS4_STATEID_SIZE);
	dprintk("%s: stateid id= [%x%x%x%x]\n", __func__,
		p[0], p[1], p[2], p[3]);
	return 0;
}

static int decode_deviceid(struct xdr_stream *xdr, struct nfs4_deviceid *devid)
{
	__be32 *p;

	p = xdr_inline_decode(xdr, NFS4_DEVICEID4_SIZE);
	if (unlikely(!p))
		return -ENOBUFS;
	memcpy(devid, p, NFS4_DEVICEID4_SIZE);
	nfs4_print_deviceid(devid);
	return 0;
}

static int decode_nfs_fh(struct xdr_stream *xdr, struct nfs_fh *fh)
{
	__be32 *p;

	p = xdr_inline_decode(xdr, 4);
	if (unlikely(!p))
		return -ENOBUFS;
	fh->size = be32_to_cpup(p++);
	if (fh->size > NFS_MAXFHSIZE) {
		printk(KERN_ERR "NFS flexfiles: Too big fh received %d\n",
		       fh->size);
		return -EOVERFLOW;
	}
	/* fh.data */
	p = xdr_inline_decode(xdr, fh->size);
	if (unlikely(!p))
		return -ENOBUFS;
	memcpy(&fh->data, p, fh->size);
	dprintk("%s: fh len %d\n", __func__, fh->size);

	return 0;
}

/*
 * Currently only stringified uids and gids are accepted.
 * I.e., kerberos is not supported to the DSes, so no pricipals.
 *
 * That means that one common function will suffice, but when
 * principals are added, this should be split to accomodate
 * calls to both nfs_map_name_to_uid() and nfs_map_group_to_gid().
 */
static int
decode_name(struct xdr_stream *xdr, u32 *id)
{
	__be32 *p;
	int len;

	/* opaque_length(4)*/
	p = xdr_inline_decode(xdr, 4);
	if (unlikely(!p))
		return -ENOBUFS;
	len = be32_to_cpup(p++);
	if (len < 0)
		return -EINVAL;

	dprintk("%s: len %u\n", __func__, len);

	/* opaque body */
	p = xdr_inline_decode(xdr, len);
	if (unlikely(!p))
		return -ENOBUFS;

	if (!nfs_map_string_to_numeric((char *)p, len, id))
		return -EINVAL;

	return 0;
}

static bool ff_mirror_match_fh(const struct nfs4_ff_layout_mirror *m1,
		const struct nfs4_ff_layout_mirror *m2)
{
	int i, j;

	if (m1->fh_versions_cnt != m2->fh_versions_cnt)
		return false;
	for (i = 0; i < m1->fh_versions_cnt; i++) {
		bool found_fh = false;
		for (j = 0; j < m2->fh_versions_cnt; j++) {
			if (nfs_compare_fh(&m1->fh_versions[i],
					&m2->fh_versions[j]) == 0) {
				found_fh = true;
				break;
			}
		}
		if (!found_fh)
			return false;
	}
	return true;
}

static struct nfs4_ff_layout_mirror *
ff_layout_add_mirror(struct pnfs_layout_hdr *lo,
		struct nfs4_ff_layout_mirror *mirror)
{
	struct nfs4_flexfile_layout *ff_layout = FF_LAYOUT_FROM_HDR(lo);
	struct nfs4_ff_layout_mirror *pos;
	struct inode *inode = lo->plh_inode;

	spin_lock(&inode->i_lock);
	list_for_each_entry(pos, &ff_layout->mirrors, mirrors) {
		if (memcmp(&mirror->devid, &pos->devid, sizeof(pos->devid)) != 0)
			continue;
		if (!ff_mirror_match_fh(mirror, pos))
			continue;
		if (refcount_inc_not_zero(&pos->ref)) {
			spin_unlock(&inode->i_lock);
			return pos;
		}
	}
	list_add(&mirror->mirrors, &ff_layout->mirrors);
	mirror->layout = lo;
	spin_unlock(&inode->i_lock);
	return mirror;
}

static void
ff_layout_remove_mirror(struct nfs4_ff_layout_mirror *mirror)
{
	struct inode *inode;
	if (mirror->layout == NULL)
		return;
	inode = mirror->layout->plh_inode;
	spin_lock(&inode->i_lock);
	list_del(&mirror->mirrors);
	spin_unlock(&inode->i_lock);
	mirror->layout = NULL;
}

static struct nfs4_ff_layout_mirror *ff_layout_alloc_mirror(gfp_t gfp_flags)
{
	struct nfs4_ff_layout_mirror *mirror;

	mirror = kzalloc(sizeof(*mirror), gfp_flags);
	if (mirror != NULL) {
		spin_lock_init(&mirror->lock);
		refcount_set(&mirror->ref, 1);
		INIT_LIST_HEAD(&mirror->mirrors);
	}
	return mirror;
}

static void ff_layout_free_mirror(struct nfs4_ff_layout_mirror *mirror)
{
	const struct cred	*cred;

	ff_layout_remove_mirror(mirror);
	kfree(mirror->fh_versions);
	cred = rcu_access_pointer(mirror->ro_cred);
	put_cred(cred);
	cred = rcu_access_pointer(mirror->rw_cred);
	put_cred(cred);
	nfs4_ff_layout_put_deviceid(mirror->mirror_ds);
	kfree(mirror);
}

static void ff_layout_put_mirror(struct nfs4_ff_layout_mirror *mirror)
{
	if (mirror != NULL && refcount_dec_and_test(&mirror->ref))
		ff_layout_free_mirror(mirror);
}

static void ff_layout_free_mirror_array(struct nfs4_ff_layout_segment *fls)
{
	u32 i;

	for (i = 0; i < fls->mirror_array_cnt; i++)
		ff_layout_put_mirror(fls->mirror_array[i]);
}

static void _ff_layout_free_lseg(struct nfs4_ff_layout_segment *fls)
{
	if (fls) {
		ff_layout_free_mirror_array(fls);
		kfree(fls);
	}
}

static bool
ff_lseg_match_mirrors(struct pnfs_layout_segment *l1,
		struct pnfs_layout_segment *l2)
{
	const struct nfs4_ff_layout_segment *fl1 = FF_LAYOUT_LSEG(l1);
	const struct nfs4_ff_layout_segment *fl2 = FF_LAYOUT_LSEG(l1);
	u32 i;

	if (fl1->mirror_array_cnt != fl2->mirror_array_cnt)
		return false;
	for (i = 0; i < fl1->mirror_array_cnt; i++) {
		if (fl1->mirror_array[i] != fl2->mirror_array[i])
			return false;
	}
	return true;
}

static bool
ff_lseg_range_is_after(const struct pnfs_layout_range *l1,
		const struct pnfs_layout_range *l2)
{
	u64 end1, end2;

	if (l1->iomode != l2->iomode)
		return l1->iomode != IOMODE_READ;
	end1 = pnfs_calc_offset_end(l1->offset, l1->length);
	end2 = pnfs_calc_offset_end(l2->offset, l2->length);
	if (end1 < l2->offset)
		return false;
	if (end2 < l1->offset)
		return true;
	return l2->offset <= l1->offset;
}

static bool
ff_lseg_merge(struct pnfs_layout_segment *new,
		struct pnfs_layout_segment *old)
{
	u64 new_end, old_end;

	if (test_bit(NFS_LSEG_LAYOUTRETURN, &old->pls_flags))
		return false;
	if (new->pls_range.iomode != old->pls_range.iomode)
		return false;
	old_end = pnfs_calc_offset_end(old->pls_range.offset,
			old->pls_range.length);
	if (old_end < new->pls_range.offset)
		return false;
	new_end = pnfs_calc_offset_end(new->pls_range.offset,
			new->pls_range.length);
	if (new_end < old->pls_range.offset)
		return false;
	if (!ff_lseg_match_mirrors(new, old))
		return false;

	/* Mergeable: copy info from 'old' to 'new' */
	if (new_end < old_end)
		new_end = old_end;
	if (new->pls_range.offset < old->pls_range.offset)
		new->pls_range.offset = old->pls_range.offset;
	new->pls_range.length = pnfs_calc_offset_length(new->pls_range.offset,
			new_end);
	if (test_bit(NFS_LSEG_ROC, &old->pls_flags))
		set_bit(NFS_LSEG_ROC, &new->pls_flags);
	return true;
}

static void
ff_layout_add_lseg(struct pnfs_layout_hdr *lo,
		struct pnfs_layout_segment *lseg,
		struct list_head *free_me)
{
	pnfs_generic_layout_insert_lseg(lo, lseg,
			ff_lseg_range_is_after,
			ff_lseg_merge,
			free_me);
}

static void ff_layout_sort_mirrors(struct nfs4_ff_layout_segment *fls)
{
	int i, j;

	for (i = 0; i < fls->mirror_array_cnt - 1; i++) {
		for (j = i + 1; j < fls->mirror_array_cnt; j++)
			if (fls->mirror_array[i]->efficiency <
			    fls->mirror_array[j]->efficiency)
				swap(fls->mirror_array[i],
				     fls->mirror_array[j]);
	}
}

static struct pnfs_layout_segment *
ff_layout_alloc_lseg(struct pnfs_layout_hdr *lh,
		     struct nfs4_layoutget_res *lgr,
		     gfp_t gfp_flags)
{
	struct pnfs_layout_segment *ret;
	struct nfs4_ff_layout_segment *fls = NULL;
	struct xdr_stream stream;
	struct xdr_buf buf;
	struct page *scratch;
	u64 stripe_unit;
	u32 mirror_array_cnt;
	__be32 *p;
	int i, rc;

	dprintk("--> %s\n", __func__);
	scratch = alloc_page(gfp_flags);
	if (!scratch)
		return ERR_PTR(-ENOMEM);

	xdr_init_decode_pages(&stream, &buf, lgr->layoutp->pages,
			      lgr->layoutp->len);
	xdr_set_scratch_page(&stream, scratch);

	/* stripe unit and mirror_array_cnt */
	rc = -EIO;
	p = xdr_inline_decode(&stream, 8 + 4);
	if (!p)
		goto out_err_free;

	p = xdr_decode_hyper(p, &stripe_unit);
	mirror_array_cnt = be32_to_cpup(p++);
	dprintk("%s: stripe_unit=%llu mirror_array_cnt=%u\n", __func__,
		stripe_unit, mirror_array_cnt);

	if (mirror_array_cnt > NFS4_FLEXFILE_LAYOUT_MAX_MIRROR_CNT ||
	    mirror_array_cnt == 0)
		goto out_err_free;

	rc = -ENOMEM;
	fls = kzalloc(struct_size(fls, mirror_array, mirror_array_cnt),
			gfp_flags);
	if (!fls)
		goto out_err_free;

	fls->mirror_array_cnt = mirror_array_cnt;
	fls->stripe_unit = stripe_unit;

	for (i = 0; i < fls->mirror_array_cnt; i++) {
		struct nfs4_ff_layout_mirror *mirror;
		struct cred *kcred;
		const struct cred __rcu *cred;
		kuid_t uid;
		kgid_t gid;
		u32 ds_count, fh_count, id;
		int j;

		rc = -EIO;
		p = xdr_inline_decode(&stream, 4);
		if (!p)
			goto out_err_free;
		ds_count = be32_to_cpup(p);

		/* FIXME: allow for striping? */
		if (ds_count != 1)
			goto out_err_free;

		fls->mirror_array[i] = ff_layout_alloc_mirror(gfp_flags);
		if (fls->mirror_array[i] == NULL) {
			rc = -ENOMEM;
			goto out_err_free;
		}

		fls->mirror_array[i]->ds_count = ds_count;

		/* deviceid */
		rc = decode_deviceid(&stream, &fls->mirror_array[i]->devid);
		if (rc)
			goto out_err_free;

		/* efficiency */
		rc = -EIO;
		p = xdr_inline_decode(&stream, 4);
		if (!p)
			goto out_err_free;
		fls->mirror_array[i]->efficiency = be32_to_cpup(p);

		/* stateid */
		rc = decode_pnfs_stateid(&stream, &fls->mirror_array[i]->stateid);
		if (rc)
			goto out_err_free;

		/* fh */
		rc = -EIO;
		p = xdr_inline_decode(&stream, 4);
		if (!p)
			goto out_err_free;
		fh_count = be32_to_cpup(p);

		fls->mirror_array[i]->fh_versions =
			kcalloc(fh_count, sizeof(struct nfs_fh),
				gfp_flags);
		if (fls->mirror_array[i]->fh_versions == NULL) {
			rc = -ENOMEM;
			goto out_err_free;
		}

		for (j = 0; j < fh_count; j++) {
			rc = decode_nfs_fh(&stream,
					   &fls->mirror_array[i]->fh_versions[j]);
			if (rc)
				goto out_err_free;
		}

		fls->mirror_array[i]->fh_versions_cnt = fh_count;

		/* user */
		rc = decode_name(&stream, &id);
		if (rc)
			goto out_err_free;

		uid = make_kuid(&init_user_ns, id);

		/* group */
		rc = decode_name(&stream, &id);
		if (rc)
			goto out_err_free;

		gid = make_kgid(&init_user_ns, id);

		if (gfp_flags & __GFP_FS)
			kcred = prepare_kernel_cred(&init_task);
		else {
			unsigned int nofs_flags = memalloc_nofs_save();
			kcred = prepare_kernel_cred(&init_task);
			memalloc_nofs_restore(nofs_flags);
		}
		rc = -ENOMEM;
		if (!kcred)
			goto out_err_free;
		kcred->fsuid = uid;
		kcred->fsgid = gid;
		cred = RCU_INITIALIZER(kcred);

		if (lgr->range.iomode == IOMODE_READ)
			rcu_assign_pointer(fls->mirror_array[i]->ro_cred, cred);
		else
			rcu_assign_pointer(fls->mirror_array[i]->rw_cred, cred);

		mirror = ff_layout_add_mirror(lh, fls->mirror_array[i]);
		if (mirror != fls->mirror_array[i]) {
			/* swap cred ptrs so free_mirror will clean up old */
			if (lgr->range.iomode == IOMODE_READ) {
				cred = xchg(&mirror->ro_cred, cred);
				rcu_assign_pointer(fls->mirror_array[i]->ro_cred, cred);
			} else {
				cred = xchg(&mirror->rw_cred, cred);
				rcu_assign_pointer(fls->mirror_array[i]->rw_cred, cred);
			}
			ff_layout_free_mirror(fls->mirror_array[i]);
			fls->mirror_array[i] = mirror;
		}

		dprintk("%s: iomode %s uid %u gid %u\n", __func__,
			lgr->range.iomode == IOMODE_READ ? "READ" : "RW",
			from_kuid(&init_user_ns, uid),
			from_kgid(&init_user_ns, gid));
	}

	p = xdr_inline_decode(&stream, 4);
	if (!p)
		goto out_sort_mirrors;
	fls->flags = be32_to_cpup(p);

	p = xdr_inline_decode(&stream, 4);
	if (!p)
		goto out_sort_mirrors;
	for (i=0; i < fls->mirror_array_cnt; i++)
		fls->mirror_array[i]->report_interval = be32_to_cpup(p);

out_sort_mirrors:
	ff_layout_sort_mirrors(fls);
	ret = &fls->generic_hdr;
	dprintk("<-- %s (success)\n", __func__);
out_free_page:
	__free_page(scratch);
	return ret;
out_err_free:
	_ff_layout_free_lseg(fls);
	ret = ERR_PTR(rc);
	dprintk("<-- %s (%d)\n", __func__, rc);
	goto out_free_page;
}

static void
ff_layout_free_lseg(struct pnfs_layout_segment *lseg)
{
	struct nfs4_ff_layout_segment *fls = FF_LAYOUT_LSEG(lseg);

	dprintk("--> %s\n", __func__);

	if (lseg->pls_range.iomode == IOMODE_RW) {
		struct nfs4_flexfile_layout *ffl;
		struct inode *inode;

		ffl = FF_LAYOUT_FROM_HDR(lseg->pls_layout);
		inode = ffl->generic_hdr.plh_inode;
		spin_lock(&inode->i_lock);
		pnfs_generic_ds_cinfo_release_lseg(&ffl->commit_info, lseg);
		spin_unlock(&inode->i_lock);
	}
	_ff_layout_free_lseg(fls);
}

static void
nfs4_ff_start_busy_timer(struct nfs4_ff_busy_timer *timer, ktime_t now)
{
	/* first IO request? */
	if (atomic_inc_return(&timer->n_ops) == 1) {
		timer->start_time = now;
	}
}

static ktime_t
nfs4_ff_end_busy_timer(struct nfs4_ff_busy_timer *timer, ktime_t now)
{
	ktime_t start;

	if (atomic_dec_return(&timer->n_ops) < 0)
		WARN_ON_ONCE(1);

	start = timer->start_time;
	timer->start_time = now;
	return ktime_sub(now, start);
}

static bool
nfs4_ff_layoutstat_start_io(struct nfs4_ff_layout_mirror *mirror,
			    struct nfs4_ff_layoutstat *layoutstat,
			    ktime_t now)
{
	s64 report_interval = FF_LAYOUTSTATS_REPORT_INTERVAL;
	struct nfs4_flexfile_layout *ffl = FF_LAYOUT_FROM_HDR(mirror->layout);

	nfs4_ff_start_busy_timer(&layoutstat->busy_timer, now);
	if (!mirror->start_time)
		mirror->start_time = now;
	if (mirror->report_interval != 0)
		report_interval = (s64)mirror->report_interval * 1000LL;
	else if (layoutstats_timer != 0)
		report_interval = (s64)layoutstats_timer * 1000LL;
	if (ktime_to_ms(ktime_sub(now, ffl->last_report_time)) >=
			report_interval) {
		ffl->last_report_time = now;
		return true;
	}

	return false;
}

static void
nfs4_ff_layout_stat_io_update_requested(struct nfs4_ff_layoutstat *layoutstat,
		__u64 requested)
{
	struct nfs4_ff_io_stat *iostat = &layoutstat->io_stat;

	iostat->ops_requested++;
	iostat->bytes_requested += requested;
}

static void
nfs4_ff_layout_stat_io_update_completed(struct nfs4_ff_layoutstat *layoutstat,
		__u64 requested,
		__u64 completed,
		ktime_t time_completed,
		ktime_t time_started)
{
	struct nfs4_ff_io_stat *iostat = &layoutstat->io_stat;
	ktime_t completion_time = ktime_sub(time_completed, time_started);
	ktime_t timer;

	iostat->ops_completed++;
	iostat->bytes_completed += completed;
	iostat->bytes_not_delivered += requested - completed;

	timer = nfs4_ff_end_busy_timer(&layoutstat->busy_timer, time_completed);
	iostat->total_busy_time =
			ktime_add(iostat->total_busy_time, timer);
	iostat->aggregate_completion_time =
			ktime_add(iostat->aggregate_completion_time,
					completion_time);
}

static void
nfs4_ff_layout_stat_io_start_read(struct inode *inode,
		struct nfs4_ff_layout_mirror *mirror,
		__u64 requested, ktime_t now)
{
	bool report;

	spin_lock(&mirror->lock);
	report = nfs4_ff_layoutstat_start_io(mirror, &mirror->read_stat, now);
	nfs4_ff_layout_stat_io_update_requested(&mirror->read_stat, requested);
	set_bit(NFS4_FF_MIRROR_STAT_AVAIL, &mirror->flags);
	spin_unlock(&mirror->lock);

	if (report)
		pnfs_report_layoutstat(inode, nfs_io_gfp_mask());
}

static void
nfs4_ff_layout_stat_io_end_read(struct rpc_task *task,
		struct nfs4_ff_layout_mirror *mirror,
		__u64 requested,
		__u64 completed)
{
	spin_lock(&mirror->lock);
	nfs4_ff_layout_stat_io_update_completed(&mirror->read_stat,
			requested, completed,
			ktime_get(), task->tk_start);
	set_bit(NFS4_FF_MIRROR_STAT_AVAIL, &mirror->flags);
	spin_unlock(&mirror->lock);
}

static void
nfs4_ff_layout_stat_io_start_write(struct inode *inode,
		struct nfs4_ff_layout_mirror *mirror,
		__u64 requested, ktime_t now)
{
	bool report;

	spin_lock(&mirror->lock);
	report = nfs4_ff_layoutstat_start_io(mirror , &mirror->write_stat, now);
	nfs4_ff_layout_stat_io_update_requested(&mirror->write_stat, requested);
	set_bit(NFS4_FF_MIRROR_STAT_AVAIL, &mirror->flags);
	spin_unlock(&mirror->lock);

	if (report)
		pnfs_report_layoutstat(inode, nfs_io_gfp_mask());
}

static void
nfs4_ff_layout_stat_io_end_write(struct rpc_task *task,
		struct nfs4_ff_layout_mirror *mirror,
		__u64 requested,
		__u64 completed,
		enum nfs3_stable_how committed)
{
	if (committed == NFS_UNSTABLE)
		requested = completed = 0;

	spin_lock(&mirror->lock);
	nfs4_ff_layout_stat_io_update_completed(&mirror->write_stat,
			requested, completed, ktime_get(), task->tk_start);
	set_bit(NFS4_FF_MIRROR_STAT_AVAIL, &mirror->flags);
	spin_unlock(&mirror->lock);
}

static void
ff_layout_mark_ds_unreachable(struct pnfs_layout_segment *lseg, u32 idx)
{
	struct nfs4_deviceid_node *devid = FF_LAYOUT_DEVID_NODE(lseg, idx);

	if (devid)
		nfs4_mark_deviceid_unavailable(devid);
}

static void
ff_layout_mark_ds_reachable(struct pnfs_layout_segment *lseg, u32 idx)
{
	struct nfs4_deviceid_node *devid = FF_LAYOUT_DEVID_NODE(lseg, idx);

	if (devid)
		nfs4_mark_deviceid_available(devid);
}

static struct nfs4_pnfs_ds *
ff_layout_choose_ds_for_read(struct pnfs_layout_segment *lseg,
			     u32 start_idx, u32 *best_idx,
			     bool check_device)
{
	struct nfs4_ff_layout_segment *fls = FF_LAYOUT_LSEG(lseg);
	struct nfs4_ff_layout_mirror *mirror;
	struct nfs4_pnfs_ds *ds;
	u32 idx;

	/* mirrors are initially sorted by efficiency */
	for (idx = start_idx; idx < fls->mirror_array_cnt; idx++) {
		mirror = FF_LAYOUT_COMP(lseg, idx);
		ds = nfs4_ff_layout_prepare_ds(lseg, mirror, false);
		if (!ds)
			continue;

		if (check_device &&
		    nfs4_test_deviceid_unavailable(&mirror->mirror_ds->id_node))
			continue;

		*best_idx = idx;
		return ds;
	}

	return NULL;
}

static struct nfs4_pnfs_ds *
ff_layout_choose_any_ds_for_read(struct pnfs_layout_segment *lseg,
				 u32 start_idx, u32 *best_idx)
{
	return ff_layout_choose_ds_for_read(lseg, start_idx, best_idx, false);
}

static struct nfs4_pnfs_ds *
ff_layout_choose_valid_ds_for_read(struct pnfs_layout_segment *lseg,
				   u32 start_idx, u32 *best_idx)
{
	return ff_layout_choose_ds_for_read(lseg, start_idx, best_idx, true);
}

static struct nfs4_pnfs_ds *
ff_layout_choose_best_ds_for_read(struct pnfs_layout_segment *lseg,
				  u32 start_idx, u32 *best_idx)
{
	struct nfs4_pnfs_ds *ds;

	ds = ff_layout_choose_valid_ds_for_read(lseg, start_idx, best_idx);
	if (ds)
		return ds;
	return ff_layout_choose_any_ds_for_read(lseg, start_idx, best_idx);
}

static struct nfs4_pnfs_ds *
ff_layout_get_ds_for_read(struct nfs_pageio_descriptor *pgio,
			  u32 *best_idx)
{
	struct pnfs_layout_segment *lseg = pgio->pg_lseg;
	struct nfs4_pnfs_ds *ds;

	ds = ff_layout_choose_best_ds_for_read(lseg, pgio->pg_mirror_idx,
					       best_idx);
	if (ds || !pgio->pg_mirror_idx)
		return ds;
	return ff_layout_choose_best_ds_for_read(lseg, 0, best_idx);
}

static void
ff_layout_pg_get_read(struct nfs_pageio_descriptor *pgio,
		      struct nfs_page *req,
		      bool strict_iomode)
{
	pnfs_put_lseg(pgio->pg_lseg);
	pgio->pg_lseg =
		pnfs_update_layout(pgio->pg_inode, nfs_req_openctx(req),
				   req_offset(req), req->wb_bytes, IOMODE_READ,
				   strict_iomode, nfs_io_gfp_mask());
	if (IS_ERR(pgio->pg_lseg)) {
		pgio->pg_error = PTR_ERR(pgio->pg_lseg);
		pgio->pg_lseg = NULL;
	}
}

static void
ff_layout_pg_check_layout(struct nfs_pageio_descriptor *pgio,
			  struct nfs_page *req)
{
	pnfs_generic_pg_check_layout(pgio);
	pnfs_generic_pg_check_range(pgio, req);
}

static void
ff_layout_pg_init_read(struct nfs_pageio_descriptor *pgio,
			struct nfs_page *req)
{
	struct nfs_pgio_mirror *pgm;
	struct nfs4_ff_layout_mirror *mirror;
	struct nfs4_pnfs_ds *ds;
	u32 ds_idx;

retry:
	ff_layout_pg_check_layout(pgio, req);
	/* Use full layout for now */
	if (!pgio->pg_lseg) {
		ff_layout_pg_get_read(pgio, req, false);
		if (!pgio->pg_lseg)
			goto out_nolseg;
	}
	if (ff_layout_avoid_read_on_rw(pgio->pg_lseg)) {
		ff_layout_pg_get_read(pgio, req, true);
		if (!pgio->pg_lseg)
			goto out_nolseg;
	}

	ds = ff_layout_get_ds_for_read(pgio, &ds_idx);
	if (!ds) {
		if (!ff_layout_no_fallback_to_mds(pgio->pg_lseg))
			goto out_mds;
		pnfs_generic_pg_cleanup(pgio);
		/* Sleep for 1 second before retrying */
		ssleep(1);
		goto retry;
	}

	mirror = FF_LAYOUT_COMP(pgio->pg_lseg, ds_idx);
	pgm = &pgio->pg_mirrors[0];
	pgm->pg_bsize = mirror->mirror_ds->ds_versions[0].rsize;

	pgio->pg_mirror_idx = ds_idx;

	if (NFS_SERVER(pgio->pg_inode)->flags &
			(NFS_MOUNT_SOFT|NFS_MOUNT_SOFTERR))
		pgio->pg_maxretrans = io_maxretrans;
	return;
out_nolseg:
	if (pgio->pg_error < 0)
		return;
out_mds:
	trace_pnfs_mds_fallback_pg_init_read(pgio->pg_inode,
			0, NFS4_MAX_UINT64, IOMODE_READ,
			NFS_I(pgio->pg_inode)->layout,
			pgio->pg_lseg);
	pgio->pg_maxretrans = 0;
	nfs_pageio_reset_read_mds(pgio);
}
