NFSD: Enforce timeout on layout recall and integrate lease manager fencing

When a layout conflict triggers a recall, a timeout must be enforced to
prevent an excessive number of nfsd threads from blocking in
__break_lease, so that the server can continue servicing incoming
requests efficiently.

This patch introduces a new function to lease_manager_operations:

lm_breaker_timedout: Invoked when a lease break times out and the
lease is about to be disposed of. This function enables the lease
manager to inform the caller whether the file_lease should remain on
the flc_list or be disposed of.

For the NFSD lease manager, this function now handles layout recall
timeouts. If the layout type supports fencing and the client has not
been fenced, a fence operation is triggered to prevent the client
from accessing the block device.

While the fencing operation is in progress, the conflicting file_lease
remains on the flc_list until fencing is complete. This guarantees
that no other clients can access the file, and the client with
exclusive access is properly blocked before disposal.

Signed-off-by: Dai Ngo <dai.ngo@oracle.com>
Signed-off-by: Chuck Lever <chuck.lever@oracle.com>
master
Dai Ngo 2026-02-13 10:36:30 -08:00 committed by Chuck Lever
parent b48f44f36e
commit f52792f484
10 changed files with 279 additions and 17 deletions

View File

@ -40,3 +40,33 @@ how to translate the device into a serial number from SCSI EVPD 0x80::
echo "fencing client ${CLIENT} serial ${EVPD}" >> /var/log/pnfsd-fence.log
EOF
If the nfsd server needs to fence a non-responding client and the
fencing operation fails, the server logs a warning message in the
system log with the following format:
FENCE failed client[IP_address] clid[#n] device[dev_name]
Where:
IP_address: refers to the IP address of the affected client.
#n: indicates the unique client identifier.
dev_name: specifies the name of the block device related
to the fencing attempt.
The server retries the fencing operation indefinitely. During
this time, access to the affected file is restricted for all other
clients. This is to prevent potential data corruption if multiple
clients access the same file simultaneously.
To restore access to the affected file for other clients, the admin
needs to take the following actions:
. shut down or power off the client being fenced.
. manually expire the client to release all its state on the server:
echo 'expire' > /proc/fs/nfsd/clients/clid/ctl
Where:
clid: is the unique client identifier displayed in the system log.

View File

@ -22,3 +22,34 @@ option and the underlying SCSI device support persistent reservations.
On the client make sure the kernel has the CONFIG_PNFS_BLOCK option
enabled, and the file system is mounted using the NFSv4.1 protocol
version (mount -o vers=4.1).
If the nfsd server needs to fence a non-responding client and the
fencing operation fails, the server logs a warning message in the
system log with the following format:
FENCE failed client[IP_address] clid[#n] device[dev_name]
Where:
IP_address: refers to the IP address of the affected client.
#n: indicates the unique client identifier.
dev_name: specifies the name of the block device related
to the fencing attempt.
The server retries the fencing operation indefinitely. During
this time, access to the affected file is restricted for all other
clients. This is to prevent potential data corruption if multiple
clients access the same file simultaneously.
To restore access to the affected file for other clients, the admin
needs to take the following actions:
. shut down or power off the client being fenced.
. manually expire the client to release all its state on the server:
echo 'expire' > /proc/fs/nfsd/clients/clid/ctl
Where:
clid: is the unique client identifier displayed in the system log.

View File

@ -398,6 +398,7 @@ prototypes::
bool (*lm_breaker_owns_lease)(struct file_lock *);
bool (*lm_lock_expirable)(struct file_lock *);
void (*lm_expire_lock)(void);
bool (*lm_breaker_timedout)(struct file_lease *);
locking rules:
@ -412,6 +413,7 @@ lm_breaker_owns_lease: yes no no
lm_lock_expirable yes no no
lm_expire_lock no no yes
lm_open_conflict yes no no
lm_breaker_timedout yes no no
====================== ============= ================= =========
buffer_head

View File

@ -1534,6 +1534,7 @@ static void time_out_leases(struct inode *inode, struct list_head *dispose)
{
struct file_lock_context *ctx = inode->i_flctx;
struct file_lease *fl, *tmp;
bool remove;
lockdep_assert_held(&ctx->flc_lock);
@ -1541,8 +1542,19 @@ static void time_out_leases(struct inode *inode, struct list_head *dispose)
trace_time_out_leases(inode, fl);
if (past_time(fl->fl_downgrade_time))
lease_modify(fl, F_RDLCK, dispose);
if (past_time(fl->fl_break_time))
lease_modify(fl, F_UNLCK, dispose);
remove = true;
if (past_time(fl->fl_break_time)) {
/*
* Consult the lease manager when a lease break times
* out to determine whether the lease should be disposed
* of.
*/
if (fl->fl_lmops && fl->fl_lmops->lm_breaker_timedout)
remove = fl->fl_lmops->lm_breaker_timedout(fl);
if (remove)
lease_modify(fl, F_UNLCK, dispose);
}
}
}
@ -1670,9 +1682,13 @@ int __break_lease(struct inode *inode, unsigned int flags)
restart:
fl = list_first_entry(&ctx->flc_lease, struct file_lease, c.flc_list);
break_time = fl->fl_break_time;
if (break_time != 0)
break_time -= jiffies;
if (break_time == 0)
if (break_time != 0) {
if (time_after(jiffies, break_time)) {
fl->fl_break_time = jiffies + lease_break_time * HZ;
break_time = lease_break_time * HZ;
} else
break_time -= jiffies;
} else
break_time++;
locks_insert_block(&fl->c, &new_fl->c, leases_conflict);
trace_break_lease_block(inode, new_fl);

View File

@ -297,6 +297,7 @@ static inline int nfsd4_scsi_fence_insert(struct nfs4_client *clp,
ret = 0;
}
xa_unlock(xa);
clp->cl_fence_retry_warn = false;
return ret;
}
@ -443,15 +444,33 @@ nfsd4_scsi_proc_layoutcommit(struct inode *inode, struct svc_rqst *rqstp,
return nfsd4_block_commit_blocks(inode, lcp, iomaps, nr_iomaps);
}
static void
/*
* Perform the fence operation to prevent the client from accessing the
* block device. If a fence operation is already in progress, wait for
* it to complete before checking the NFSD_MDS_PR_FENCED flag. Once the
* operation is complete, check the flag. If NFSD_MDS_PR_FENCED is set,
* update the layout stateid by setting the ls_fenced flag to indicate
* that the client has been fenced.
*
* The cl_fence_mutex ensures that the fence operation has been fully
* completed, rather than just in progress, when returning from this
* function.
*
* Return true if client was fenced otherwise return false.
*/
static bool
nfsd4_scsi_fence_client(struct nfs4_layout_stateid *ls, struct nfsd_file *file)
{
struct nfs4_client *clp = ls->ls_stid.sc_client;
struct block_device *bdev = file->nf_file->f_path.mnt->mnt_sb->s_bdev;
int status;
bool ret;
if (nfsd4_scsi_fence_set(clp, bdev->bd_dev))
return;
mutex_lock(&clp->cl_fence_mutex);
if (nfsd4_scsi_fence_set(clp, bdev->bd_dev)) {
mutex_unlock(&clp->cl_fence_mutex);
return true;
}
status = bdev->bd_disk->fops->pr_ops->pr_preempt(bdev, NFSD_MDS_PR_KEY,
nfsd4_scsi_pr_key(clp),
@ -470,13 +489,22 @@ nfsd4_scsi_fence_client(struct nfs4_layout_stateid *ls, struct nfsd_file *file)
* PR_STS_RESERVATION_CONFLICT, which would cause an infinite
* retry loop.
*/
if (status < 0 ||
status == PR_STS_PATH_FAILED ||
status == PR_STS_PATH_FAST_FAILED ||
status == PR_STS_RETRY_PATH_FAILURE)
switch (status) {
case 0:
case PR_STS_IOERR:
case PR_STS_RESERVATION_CONFLICT:
ret = true;
break;
default:
/* retry-able and other errors */
ret = false;
nfsd4_scsi_fence_clear(clp, bdev->bd_dev);
break;
}
mutex_unlock(&clp->cl_fence_mutex);
trace_nfsd_pnfs_fence(clp, bdev->bd_disk->disk_name, status);
return ret;
}
const struct nfsd4_layout_ops scsi_layout_ops = {

View File

@ -27,6 +27,8 @@ static struct kmem_cache *nfs4_layout_stateid_cache;
static const struct nfsd4_callback_ops nfsd4_cb_layout_ops;
static const struct lease_manager_operations nfsd4_layouts_lm_ops;
static void nfsd4_layout_fence_worker(struct work_struct *work);
const struct nfsd4_layout_ops *nfsd4_layout_ops[LAYOUT_TYPE_MAX] = {
#ifdef CONFIG_NFSD_FLEXFILELAYOUT
[LAYOUT_FLEX_FILES] = &ff_layout_ops,
@ -177,6 +179,13 @@ nfsd4_free_layout_stateid(struct nfs4_stid *stid)
trace_nfsd_layoutstate_free(&ls->ls_stid.sc_stateid);
spin_lock(&ls->ls_lock);
if (delayed_work_pending(&ls->ls_fence_work)) {
spin_unlock(&ls->ls_lock);
cancel_delayed_work_sync(&ls->ls_fence_work);
} else
spin_unlock(&ls->ls_lock);
spin_lock(&clp->cl_lock);
list_del_init(&ls->ls_perclnt);
spin_unlock(&clp->cl_lock);
@ -271,6 +280,10 @@ nfsd4_alloc_layout_stateid(struct nfsd4_compound_state *cstate,
list_add(&ls->ls_perfile, &fp->fi_lo_states);
spin_unlock(&fp->fi_lock);
ls->ls_fenced = false;
ls->ls_fence_delay = 0;
INIT_DELAYED_WORK(&ls->ls_fence_work, nfsd4_layout_fence_worker);
trace_nfsd_layoutstate_alloc(&ls->ls_stid.sc_stateid);
return ls;
}
@ -747,11 +760,9 @@ static bool
nfsd4_layout_lm_break(struct file_lease *fl)
{
/*
* We don't want the locks code to timeout the lease for us;
* we'll remove it ourself if a layout isn't returned
* in time:
* Enforce break lease timeout to prevent NFSD
* thread from hanging in __break_lease.
*/
fl->fl_break_time = 0;
nfsd4_recall_file_layout(fl->c.flc_owner);
return false;
}
@ -782,10 +793,143 @@ nfsd4_layout_lm_open_conflict(struct file *filp, int arg)
return 0;
}
/*
 * nfsd4_layout_fence_worker - delayed-work handler that fences the client
 * holding a layout whose recall has timed out.
 *
 * @work: the work_struct embedded in the ls_fence_work delayed_work of a
 *        struct nfs4_layout_stateid.
 *
 * The handler finishes ("dispose") when the layout list is already empty,
 * when no nfsd_file can be obtained for the layout, or when the layout
 * type's fence_client() operation reports success. Disposing cancels the
 * re-armed work, closes the layout, sets ls_fenced and drops one reference
 * on the layout stateid.
 *
 * When fence_client() reports failure the handler logs a warning once per
 * client (guarded by cl_fence_retry_warn) and re-queues itself. The retry
 * delay starts at HZ and doubles on each failure, capped at
 * MAX_FENCE_DELAY.
 */
static void
nfsd4_layout_fence_worker(struct work_struct *work)
{
struct delayed_work *dwork = to_delayed_work(work);
struct nfs4_layout_stateid *ls = container_of(dwork,
struct nfs4_layout_stateid, ls_fence_work);
struct nfsd_file *nf;
struct block_device *bdev;
struct nfs4_client *clp;
struct nfsd_net *nn;
/*
* The workqueue clears WORK_STRUCT_PENDING before invoking
* this callback. Re-arm immediately so that
* delayed_work_pending() returns true while the fence
* operation is in progress, preventing
* lm_breaker_timedout() from taking a duplicate reference.
*/
mod_delayed_work(system_dfl_wq, &ls->ls_fence_work, 0);
/* No outstanding layouts left: nothing to fence, just clean up. */
spin_lock(&ls->ls_lock);
if (list_empty(&ls->ls_layouts)) {
spin_unlock(&ls->ls_lock);
dispose:
/* Undo the re-arm done at the top of this function. */
cancel_delayed_work(&ls->ls_fence_work);
/* unlock the lease so that tasks waiting on it can proceed */
nfsd4_close_layout(ls);
/*
* Set on every dispose path, including the ones where no fence
* operation was issued; lm_breaker_timedout() returns true once
* this flag is set.
*/
ls->ls_fenced = true;
/* Drop the reference on the layout stateid held for this worker. */
nfs4_put_stid(&ls->ls_stid);
return;
}
spin_unlock(&ls->ls_lock);
rcu_read_lock();
nf = nfsd_file_get(ls->ls_file);
rcu_read_unlock();
if (!nf)
goto dispose;
clp = ls->ls_stid.sc_client;
nn = net_generic(clp->net, nfsd_net_id);
bdev = nf->nf_file->f_path.mnt->mnt_sb->s_bdev;
if (nfsd4_layout_ops[ls->ls_layout_type]->fence_client(ls, nf)) {
/* fenced ok */
/*
* NOTE(review): bdev was reached through nf and is still
* dereferenced by the pr_warn() below after this put; confirm
* that something else keeps it valid here.
*/
nfsd_file_put(nf);
pr_warn("%s: FENCED client[%pISpc] clid[%d] to device[%s]\n",
__func__, (struct sockaddr *)&clp->cl_addr,
clp->cl_clientid.cl_id - nn->clientid_base,
bdev->bd_disk->disk_name);
goto dispose;
}
/* fence failed */
nfsd_file_put(nf);
/* Log the failure only once until cl_fence_retry_warn is cleared. */
if (!clp->cl_fence_retry_warn) {
pr_warn("%s: FENCE failed client[%pISpc] clid[%d] device[%s]\n",
__func__, (struct sockaddr *)&clp->cl_addr,
clp->cl_clientid.cl_id - nn->clientid_base,
bdev->bd_disk->disk_name);
clp->cl_fence_retry_warn = true;
}
/*
* The fence worker retries the fencing operation indefinitely to
* prevent data corruption. The admin needs to take the following
* actions to restore access to the file for other clients:
*
* . shut down or power off the client being fenced.
* . manually expire the client to release all its state on the server:
* echo 'expire' > /proc/fs/nfsd/clients/clid/ctl
*
* Where:
*
* clid: is the unique client identifier displayed in
* the warning message above.
*/
/* Exponential backoff: HZ on the first failure, then doubling up to the cap. */
if (!ls->ls_fence_delay)
ls->ls_fence_delay = HZ;
else
ls->ls_fence_delay = min(ls->ls_fence_delay << 1,
MAX_FENCE_DELAY);
/* Replace the zero-delay re-arm from the top with the backoff delay. */
mod_delayed_work(system_dfl_wq, &ls->ls_fence_work, ls->ls_fence_delay);
}
/**
 * nfsd4_layout_lm_breaker_timedout - handle an expired layout recall
 * @fl: file_lease whose break time has passed; its flc_owner is the
 *      layout stateid the lease belongs to
 *
 * Called with the flc_lock spinlock held, while @fl is still on the
 * flc_list.
 *
 * When the layout type provides a fence_client operation and the client
 * has not been fenced yet, a fence worker is queued to cut the client
 * off from the block device. Before queueing, a reference is taken on
 * the layout stateid so that it cannot be freed while the worker runs;
 * the worker drops that reference when it is done.
 *
 * The worker keeps running until the client has been fenced or the
 * layout is no longer valid, which happens after a LAYOUTRETURN or once
 * the CB_LAYOUT recall callback has completed.
 *
 * Return: true if the caller should dispose of @fl, false if @fl must
 * stay on the flc_list.
 */
static bool
nfsd4_layout_lm_breaker_timedout(struct file_lease *fl)
{
	struct nfs4_layout_stateid *ls = fl->c.flc_owner;
	bool pinned;

	/* No way to fence this layout type: let the lease be disposed of. */
	if (!nfsd4_layout_ops[ls->ls_layout_type]->fence_client)
		return true;

	/* The fence worker has already finished with this stateid. */
	if (ls->ls_fenced)
		return true;

	/* A fence worker is already queued; keep the lease where it is. */
	if (delayed_work_pending(&ls->ls_fence_work))
		return false;

	/*
	 * Pin the layout stateid for the worker, but only if the layout
	 * has not been returned in the meantime. The list check comes
	 * first so the refcount is untouched for a returned layout.
	 */
	spin_lock(&ls->ls_lock);
	pinned = !list_empty(&ls->ls_layouts) &&
		 refcount_inc_not_zero(&ls->ls_stid.sc_count);
	spin_unlock(&ls->ls_lock);

	if (!pinned)
		return true;

	mod_delayed_work(system_dfl_wq, &ls->ls_fence_work, 0);
	return false;
}
/*
 * Lease manager callbacks used for layout leases. lm_breaker_timedout is
 * consulted by time_out_leases() once a lease's break time has passed, to
 * decide whether the lease is disposed of or kept on the flc_list.
 */
static const struct lease_manager_operations nfsd4_layouts_lm_ops = {
.lm_break = nfsd4_layout_lm_break,
.lm_change = nfsd4_layout_lm_change,
.lm_open_conflict = nfsd4_layout_lm_open_conflict,
.lm_breaker_timedout = nfsd4_layout_lm_breaker_timedout,
};
int

View File

@ -2386,6 +2386,7 @@ static struct nfs4_client *alloc_client(struct xdr_netobj name,
#endif
#ifdef CONFIG_NFSD_SCSILAYOUT
xa_init(&clp->cl_dev_fences);
mutex_init(&clp->cl_fence_mutex);
#endif
INIT_LIST_HEAD(&clp->async_copies);
spin_lock_init(&clp->async_lock);

View File

@ -11,6 +11,9 @@
struct xdr_stream;
/* Cap exponential backoff between fence retries at 3 minutes */
#define MAX_FENCE_DELAY ((unsigned int)(3 * 60 * HZ))
struct nfsd4_deviceid_map {
struct list_head hash;
u64 idx;
@ -38,7 +41,7 @@ struct nfsd4_layout_ops {
struct svc_rqst *rqstp,
struct nfsd4_layoutcommit *lcp);
void (*fence_client)(struct nfs4_layout_stateid *ls,
bool (*fence_client)(struct nfs4_layout_stateid *ls,
struct nfsd_file *file);
};

View File

@ -456,6 +456,7 @@ struct nfs4_client {
struct list_head cl_lru; /* tail queue */
#ifdef CONFIG_NFSD_PNFS
struct list_head cl_lo_states; /* outstanding layout states */
bool cl_fence_retry_warn;
#endif
struct xdr_netobj cl_name; /* id generated by client */
nfs4_verifier cl_verifier; /* generated by client */
@ -529,6 +530,7 @@ struct nfs4_client {
time64_t cl_ra_time;
#ifdef CONFIG_NFSD_SCSILAYOUT
struct xarray cl_dev_fences;
struct mutex cl_fence_mutex;
#endif
};
@ -745,6 +747,10 @@ struct nfs4_layout_stateid {
stateid_t ls_recall_sid;
bool ls_recalled;
struct mutex ls_mutex;
struct delayed_work ls_fence_work;
unsigned int ls_fence_delay;
bool ls_fenced;
};
static inline struct nfs4_layout_stateid *layoutstateid(struct nfs4_stid *s)

View File

@ -50,6 +50,7 @@ struct lease_manager_operations {
void (*lm_setup)(struct file_lease *, void **);
bool (*lm_breaker_owns_lease)(struct file_lease *);
int (*lm_open_conflict)(struct file *, int);
bool (*lm_breaker_timedout)(struct file_lease *fl);
};
struct lock_manager {