NFSD: Enforce timeout on layout recall and integrate lease manager fencing
When a layout conflict triggers a recall, enforcing a timeout is necessary to prevent excessive nfsd threads from being blocked in __break_lease, ensuring the server continues servicing incoming requests efficiently. This patch adds a new callback to lease_manager_operations, lm_breaker_timedout, which is invoked when a lease recall times out and the lease is about to be disposed of. This callback enables the lease manager to inform the caller whether the file_lease should remain on the flc_list or be disposed of. For the NFSD lease manager, this callback now handles layout recall timeouts. If the layout type supports fencing and the client has not been fenced, a fence operation is triggered to prevent the client from accessing the block device. While the fencing operation is in progress, the conflicting file_lease remains on the flc_list until fencing is complete. This guarantees that no other clients can access the file, and the client with exclusive access is properly blocked before disposal. Signed-off-by: Dai Ngo <dai.ngo@oracle.com> Signed-off-by: Chuck Lever <chuck.lever@oracle.com>
master
parent
b48f44f36e
commit
f52792f484
|
|
@ -40,3 +40,33 @@ how to translate the device into a serial number from SCSI EVPD 0x80::
|
|||
|
||||
echo "fencing client ${CLIENT} serial ${EVPD}" >> /var/log/pnfsd-fence.log
|
||||
EOF
|
||||
|
||||
If the nfsd server needs to fence a non-responding client and the
|
||||
fencing operation fails, the server logs a warning message in the
|
||||
system log with the following format:
|
||||
|
||||
FENCE failed client[IP_address] clid[#n] device[dev_name]
|
||||
|
||||
Where:
|
||||
|
||||
IP_address: refers to the IP address of the affected client.
|
||||
#n: indicates the unique client identifier.
|
||||
dev_name: specifies the name of the block device related
|
||||
to the fencing attempt.
|
||||
|
||||
The server retries the operation indefinitely, with exponential backoff. During
|
||||
this time, access to the affected file is restricted for all other
|
||||
clients. This is to prevent potential data corruption if multiple
|
||||
clients access the same file simultaneously.
|
||||
|
||||
To restore access to the affected file for other clients, the admin
|
||||
needs to take the following actions:
|
||||
|
||||
. shut down or power off the client being fenced.
|
||||
. manually expire the client to release all its state on the server:
|
||||
|
||||
echo 'expire' > /proc/fs/nfsd/clients/clid/ctl
|
||||
|
||||
Where:
|
||||
|
||||
clid: is the unique client identifier displayed in the system log.
|
||||
|
|
|
|||
|
|
@ -22,3 +22,34 @@ option and the underlying SCSI device support persistent reservations.
|
|||
On the client make sure the kernel has the CONFIG_PNFS_BLOCK option
|
||||
enabled, and the file system is mounted using the NFSv4.1 protocol
|
||||
version (mount -o vers=4.1).
|
||||
|
||||
If the nfsd server needs to fence a non-responding client and the
|
||||
fencing operation fails, the server logs a warning message in the
|
||||
system log with the following format:
|
||||
|
||||
FENCE failed client[IP_address] clid[#n] device[dev_name]
|
||||
|
||||
Where:
|
||||
|
||||
IP_address: refers to the IP address of the affected client.
|
||||
#n: indicates the unique client identifier.
|
||||
dev_name: specifies the name of the block device related
|
||||
to the fencing attempt.
|
||||
|
||||
The server retries the operation indefinitely, with exponential backoff. During
|
||||
this time, access to the affected file is restricted for all other
|
||||
clients. This is to prevent potential data corruption if multiple
|
||||
clients access the same file simultaneously.
|
||||
|
||||
To restore access to the affected file for other clients, the admin
|
||||
needs to take the following actions:
|
||||
|
||||
. shut down or power off the client being fenced.
|
||||
. manually expire the client to release all its state on the server:
|
||||
|
||||
echo 'expire' > /proc/fs/nfsd/clients/clid/ctl
|
||||
|
||||
Where:
|
||||
|
||||
clid: is the unique client identifier displayed in the system log.
|
||||
|
||||
|
|
|
|||
|
|
@ -398,6 +398,7 @@ prototypes::
|
|||
bool (*lm_breaker_owns_lease)(struct file_lock *);
|
||||
bool (*lm_lock_expirable)(struct file_lock *);
|
||||
void (*lm_expire_lock)(void);
|
||||
bool (*lm_breaker_timedout)(struct file_lease *);
|
||||
|
||||
locking rules:
|
||||
|
||||
|
|
@ -412,6 +413,7 @@ lm_breaker_owns_lease: yes no no
|
|||
lm_lock_expirable yes no no
|
||||
lm_expire_lock no no yes
|
||||
lm_open_conflict yes no no
|
||||
lm_breaker_timedout yes no no
|
||||
====================== ============= ================= =========
|
||||
|
||||
buffer_head
|
||||
|
|
|
|||
26
fs/locks.c
26
fs/locks.c
|
|
@ -1534,6 +1534,7 @@ static void time_out_leases(struct inode *inode, struct list_head *dispose)
|
|||
{
|
||||
struct file_lock_context *ctx = inode->i_flctx;
|
||||
struct file_lease *fl, *tmp;
|
||||
bool remove;
|
||||
|
||||
lockdep_assert_held(&ctx->flc_lock);
|
||||
|
||||
|
|
@ -1541,8 +1542,19 @@ static void time_out_leases(struct inode *inode, struct list_head *dispose)
|
|||
trace_time_out_leases(inode, fl);
|
||||
if (past_time(fl->fl_downgrade_time))
|
||||
lease_modify(fl, F_RDLCK, dispose);
|
||||
if (past_time(fl->fl_break_time))
|
||||
lease_modify(fl, F_UNLCK, dispose);
|
||||
|
||||
remove = true;
|
||||
if (past_time(fl->fl_break_time)) {
|
||||
/*
|
||||
* Consult the lease manager when a lease break times
|
||||
* out to determine whether the lease should be disposed
|
||||
* of.
|
||||
*/
|
||||
if (fl->fl_lmops && fl->fl_lmops->lm_breaker_timedout)
|
||||
remove = fl->fl_lmops->lm_breaker_timedout(fl);
|
||||
if (remove)
|
||||
lease_modify(fl, F_UNLCK, dispose);
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
|
|
@ -1670,9 +1682,13 @@ int __break_lease(struct inode *inode, unsigned int flags)
|
|||
restart:
|
||||
fl = list_first_entry(&ctx->flc_lease, struct file_lease, c.flc_list);
|
||||
break_time = fl->fl_break_time;
|
||||
if (break_time != 0)
|
||||
break_time -= jiffies;
|
||||
if (break_time == 0)
|
||||
if (break_time != 0) {
|
||||
if (time_after(jiffies, break_time)) {
|
||||
fl->fl_break_time = jiffies + lease_break_time * HZ;
|
||||
break_time = lease_break_time * HZ;
|
||||
} else
|
||||
break_time -= jiffies;
|
||||
} else
|
||||
break_time++;
|
||||
locks_insert_block(&fl->c, &new_fl->c, leases_conflict);
|
||||
trace_break_lease_block(inode, new_fl);
|
||||
|
|
|
|||
|
|
@ -297,6 +297,7 @@ static inline int nfsd4_scsi_fence_insert(struct nfs4_client *clp,
|
|||
ret = 0;
|
||||
}
|
||||
xa_unlock(xa);
|
||||
clp->cl_fence_retry_warn = false;
|
||||
return ret;
|
||||
}
|
||||
|
||||
|
|
@ -443,15 +444,33 @@ nfsd4_scsi_proc_layoutcommit(struct inode *inode, struct svc_rqst *rqstp,
|
|||
return nfsd4_block_commit_blocks(inode, lcp, iomaps, nr_iomaps);
|
||||
}
|
||||
|
||||
static void
|
||||
/*
|
||||
* Perform the fence operation to prevent the client from accessing the
|
||||
* block device. If a fence operation is already in progress, wait for
|
||||
* it to complete before checking the NFSD_MDS_PR_FENCED flag. Once the
|
||||
* operation is complete, check the flag. If NFSD_MDS_PR_FENCED is set,
|
||||
* update the layout stateid by setting the ls_fenced flag to indicate
|
||||
* that the client has been fenced.
|
||||
*
|
||||
* The cl_fence_mutex ensures that the fence operation has been fully
|
||||
* completed, rather than just in progress, when returning from this
|
||||
* function.
|
||||
*
|
||||
* Return true if client was fenced otherwise return false.
|
||||
*/
|
||||
static bool
|
||||
nfsd4_scsi_fence_client(struct nfs4_layout_stateid *ls, struct nfsd_file *file)
|
||||
{
|
||||
struct nfs4_client *clp = ls->ls_stid.sc_client;
|
||||
struct block_device *bdev = file->nf_file->f_path.mnt->mnt_sb->s_bdev;
|
||||
int status;
|
||||
bool ret;
|
||||
|
||||
if (nfsd4_scsi_fence_set(clp, bdev->bd_dev))
|
||||
return;
|
||||
mutex_lock(&clp->cl_fence_mutex);
|
||||
if (nfsd4_scsi_fence_set(clp, bdev->bd_dev)) {
|
||||
mutex_unlock(&clp->cl_fence_mutex);
|
||||
return true;
|
||||
}
|
||||
|
||||
status = bdev->bd_disk->fops->pr_ops->pr_preempt(bdev, NFSD_MDS_PR_KEY,
|
||||
nfsd4_scsi_pr_key(clp),
|
||||
|
|
@ -470,13 +489,22 @@ nfsd4_scsi_fence_client(struct nfs4_layout_stateid *ls, struct nfsd_file *file)
|
|||
* PR_STS_RESERVATION_CONFLICT, which would cause an infinite
|
||||
* retry loop.
|
||||
*/
|
||||
if (status < 0 ||
|
||||
status == PR_STS_PATH_FAILED ||
|
||||
status == PR_STS_PATH_FAST_FAILED ||
|
||||
status == PR_STS_RETRY_PATH_FAILURE)
|
||||
switch (status) {
|
||||
case 0:
|
||||
case PR_STS_IOERR:
|
||||
case PR_STS_RESERVATION_CONFLICT:
|
||||
ret = true;
|
||||
break;
|
||||
default:
|
||||
/* retry-able and other errors */
|
||||
ret = false;
|
||||
nfsd4_scsi_fence_clear(clp, bdev->bd_dev);
|
||||
break;
|
||||
}
|
||||
mutex_unlock(&clp->cl_fence_mutex);
|
||||
|
||||
trace_nfsd_pnfs_fence(clp, bdev->bd_disk->disk_name, status);
|
||||
return ret;
|
||||
}
|
||||
|
||||
const struct nfsd4_layout_ops scsi_layout_ops = {
|
||||
|
|
|
|||
|
|
@ -27,6 +27,8 @@ static struct kmem_cache *nfs4_layout_stateid_cache;
|
|||
static const struct nfsd4_callback_ops nfsd4_cb_layout_ops;
|
||||
static const struct lease_manager_operations nfsd4_layouts_lm_ops;
|
||||
|
||||
static void nfsd4_layout_fence_worker(struct work_struct *work);
|
||||
|
||||
const struct nfsd4_layout_ops *nfsd4_layout_ops[LAYOUT_TYPE_MAX] = {
|
||||
#ifdef CONFIG_NFSD_FLEXFILELAYOUT
|
||||
[LAYOUT_FLEX_FILES] = &ff_layout_ops,
|
||||
|
|
@ -177,6 +179,13 @@ nfsd4_free_layout_stateid(struct nfs4_stid *stid)
|
|||
|
||||
trace_nfsd_layoutstate_free(&ls->ls_stid.sc_stateid);
|
||||
|
||||
spin_lock(&ls->ls_lock);
|
||||
if (delayed_work_pending(&ls->ls_fence_work)) {
|
||||
spin_unlock(&ls->ls_lock);
|
||||
cancel_delayed_work_sync(&ls->ls_fence_work);
|
||||
} else
|
||||
spin_unlock(&ls->ls_lock);
|
||||
|
||||
spin_lock(&clp->cl_lock);
|
||||
list_del_init(&ls->ls_perclnt);
|
||||
spin_unlock(&clp->cl_lock);
|
||||
|
|
@ -271,6 +280,10 @@ nfsd4_alloc_layout_stateid(struct nfsd4_compound_state *cstate,
|
|||
list_add(&ls->ls_perfile, &fp->fi_lo_states);
|
||||
spin_unlock(&fp->fi_lock);
|
||||
|
||||
ls->ls_fenced = false;
|
||||
ls->ls_fence_delay = 0;
|
||||
INIT_DELAYED_WORK(&ls->ls_fence_work, nfsd4_layout_fence_worker);
|
||||
|
||||
trace_nfsd_layoutstate_alloc(&ls->ls_stid.sc_stateid);
|
||||
return ls;
|
||||
}
|
||||
|
|
@ -747,11 +760,9 @@ static bool
|
|||
nfsd4_layout_lm_break(struct file_lease *fl)
|
||||
{
|
||||
/*
|
||||
* We don't want the locks code to timeout the lease for us;
|
||||
* we'll remove it ourself if a layout isn't returned
|
||||
* in time:
|
||||
* Enforce break lease timeout to prevent NFSD
|
||||
* thread from hanging in __break_lease.
|
||||
*/
|
||||
fl->fl_break_time = 0;
|
||||
nfsd4_recall_file_layout(fl->c.flc_owner);
|
||||
return false;
|
||||
}
|
||||
|
|
@ -782,10 +793,143 @@ nfsd4_layout_lm_open_conflict(struct file *filp, int arg)
|
|||
return 0;
|
||||
}
|
||||
|
||||
static void
|
||||
nfsd4_layout_fence_worker(struct work_struct *work)
|
||||
{
|
||||
struct delayed_work *dwork = to_delayed_work(work);
|
||||
struct nfs4_layout_stateid *ls = container_of(dwork,
|
||||
struct nfs4_layout_stateid, ls_fence_work);
|
||||
struct nfsd_file *nf;
|
||||
struct block_device *bdev;
|
||||
struct nfs4_client *clp;
|
||||
struct nfsd_net *nn;
|
||||
|
||||
/*
|
||||
* The workqueue clears WORK_STRUCT_PENDING before invoking
|
||||
* this callback. Re-arm immediately so that
|
||||
* delayed_work_pending() returns true while the fence
|
||||
* operation is in progress, preventing
|
||||
* lm_breaker_timedout() from taking a duplicate reference.
|
||||
*/
|
||||
mod_delayed_work(system_dfl_wq, &ls->ls_fence_work, 0);
|
||||
|
||||
spin_lock(&ls->ls_lock);
|
||||
if (list_empty(&ls->ls_layouts)) {
|
||||
spin_unlock(&ls->ls_lock);
|
||||
dispose:
|
||||
cancel_delayed_work(&ls->ls_fence_work);
|
||||
/* unlock the lease so that tasks waiting on it can proceed */
|
||||
nfsd4_close_layout(ls);
|
||||
|
||||
ls->ls_fenced = true;
|
||||
nfs4_put_stid(&ls->ls_stid);
|
||||
return;
|
||||
}
|
||||
spin_unlock(&ls->ls_lock);
|
||||
|
||||
rcu_read_lock();
|
||||
nf = nfsd_file_get(ls->ls_file);
|
||||
rcu_read_unlock();
|
||||
if (!nf)
|
||||
goto dispose;
|
||||
|
||||
clp = ls->ls_stid.sc_client;
|
||||
nn = net_generic(clp->net, nfsd_net_id);
|
||||
bdev = nf->nf_file->f_path.mnt->mnt_sb->s_bdev;
|
||||
if (nfsd4_layout_ops[ls->ls_layout_type]->fence_client(ls, nf)) {
|
||||
/* fenced ok */
|
||||
nfsd_file_put(nf);
|
||||
pr_warn("%s: FENCED client[%pISpc] clid[%d] to device[%s]\n",
|
||||
__func__, (struct sockaddr *)&clp->cl_addr,
|
||||
clp->cl_clientid.cl_id - nn->clientid_base,
|
||||
bdev->bd_disk->disk_name);
|
||||
goto dispose;
|
||||
}
|
||||
/* fence failed */
|
||||
nfsd_file_put(nf);
|
||||
|
||||
if (!clp->cl_fence_retry_warn) {
|
||||
pr_warn("%s: FENCE failed client[%pISpc] clid[%d] device[%s]\n",
|
||||
__func__, (struct sockaddr *)&clp->cl_addr,
|
||||
clp->cl_clientid.cl_id - nn->clientid_base,
|
||||
bdev->bd_disk->disk_name);
|
||||
clp->cl_fence_retry_warn = true;
|
||||
}
|
||||
/*
|
||||
* The fence worker retries the fencing operation indefinitely to
|
||||
* prevent data corruption. The admin needs to take the following
|
||||
* actions to restore access to the file for other clients:
|
||||
*
|
||||
* . shut down or power off the client being fenced.
|
||||
* . manually expire the client to release all its state on the server;
|
||||
* echo 'expire' > /proc/fs/nfsd/clients/clid/ctl
|
||||
*
|
||||
* Where:
|
||||
*
|
||||
* clid: is the unique client identifier displayed in
|
||||
* the warning message above.
|
||||
*/
|
||||
if (!ls->ls_fence_delay)
|
||||
ls->ls_fence_delay = HZ;
|
||||
else
|
||||
ls->ls_fence_delay = min(ls->ls_fence_delay << 1,
|
||||
MAX_FENCE_DELAY);
|
||||
mod_delayed_work(system_dfl_wq, &ls->ls_fence_work, ls->ls_fence_delay);
|
||||
}
|
||||
|
||||
/**
|
||||
* nfsd4_layout_lm_breaker_timedout - The layout recall has timed out.
|
||||
* @fl: file_lease whose break (layout recall) has timed out
|
||||
*
|
||||
* If the layout type supports a fence operation, schedule a worker to
|
||||
* fence the client from accessing the block device.
|
||||
*
|
||||
* This function runs under the protection of the spin_lock flc_lock.
|
||||
* At this time, the file_lease associated with the layout stateid is
|
||||
* on the flc_list. A reference count is incremented on the layout
|
||||
* stateid to prevent it from being freed while the fence worker is
|
||||
* executing. Once the fence worker finishes its operation, it releases
|
||||
* this reference.
|
||||
*
|
||||
* The fence worker continues to run until either the client has been
|
||||
* fenced or the layout becomes invalid. The layout can become invalid
|
||||
* as a result of a LAYOUTRETURN or when the CB_LAYOUT recall callback
|
||||
* has completed.
|
||||
*
|
||||
* Return true if the file_lease should be disposed of by the caller;
|
||||
* otherwise, return false.
|
||||
*/
|
||||
static bool
|
||||
nfsd4_layout_lm_breaker_timedout(struct file_lease *fl)
|
||||
{
|
||||
struct nfs4_layout_stateid *ls = fl->c.flc_owner;
|
||||
|
||||
if ((!nfsd4_layout_ops[ls->ls_layout_type]->fence_client) ||
|
||||
ls->ls_fenced)
|
||||
return true;
|
||||
if (delayed_work_pending(&ls->ls_fence_work))
|
||||
return false;
|
||||
/*
|
||||
* Make sure layout has not been returned yet before
|
||||
* taking a reference count on the layout stateid.
|
||||
*/
|
||||
spin_lock(&ls->ls_lock);
|
||||
if (list_empty(&ls->ls_layouts) ||
|
||||
!refcount_inc_not_zero(&ls->ls_stid.sc_count)) {
|
||||
spin_unlock(&ls->ls_lock);
|
||||
return true;
|
||||
}
|
||||
spin_unlock(&ls->ls_lock);
|
||||
|
||||
mod_delayed_work(system_dfl_wq, &ls->ls_fence_work, 0);
|
||||
return false;
|
||||
}
|
||||
|
||||
static const struct lease_manager_operations nfsd4_layouts_lm_ops = {
|
||||
.lm_break = nfsd4_layout_lm_break,
|
||||
.lm_change = nfsd4_layout_lm_change,
|
||||
.lm_open_conflict = nfsd4_layout_lm_open_conflict,
|
||||
.lm_breaker_timedout = nfsd4_layout_lm_breaker_timedout,
|
||||
};
|
||||
|
||||
int
|
||||
|
|
|
|||
|
|
@ -2386,6 +2386,7 @@ static struct nfs4_client *alloc_client(struct xdr_netobj name,
|
|||
#endif
|
||||
#ifdef CONFIG_NFSD_SCSILAYOUT
|
||||
xa_init(&clp->cl_dev_fences);
|
||||
mutex_init(&clp->cl_fence_mutex);
|
||||
#endif
|
||||
INIT_LIST_HEAD(&clp->async_copies);
|
||||
spin_lock_init(&clp->async_lock);
|
||||
|
|
|
|||
|
|
@ -11,6 +11,9 @@
|
|||
|
||||
struct xdr_stream;
|
||||
|
||||
/* Cap exponential backoff between fence retries at 3 minutes */
|
||||
#define MAX_FENCE_DELAY ((unsigned int)(3 * 60 * HZ))
|
||||
|
||||
struct nfsd4_deviceid_map {
|
||||
struct list_head hash;
|
||||
u64 idx;
|
||||
|
|
@ -38,7 +41,7 @@ struct nfsd4_layout_ops {
|
|||
struct svc_rqst *rqstp,
|
||||
struct nfsd4_layoutcommit *lcp);
|
||||
|
||||
void (*fence_client)(struct nfs4_layout_stateid *ls,
|
||||
bool (*fence_client)(struct nfs4_layout_stateid *ls,
|
||||
struct nfsd_file *file);
|
||||
};
|
||||
|
||||
|
|
|
|||
|
|
@ -456,6 +456,7 @@ struct nfs4_client {
|
|||
struct list_head cl_lru; /* tail queue */
|
||||
#ifdef CONFIG_NFSD_PNFS
|
||||
struct list_head cl_lo_states; /* outstanding layout states */
|
||||
bool cl_fence_retry_warn;
|
||||
#endif
|
||||
struct xdr_netobj cl_name; /* id generated by client */
|
||||
nfs4_verifier cl_verifier; /* generated by client */
|
||||
|
|
@ -529,6 +530,7 @@ struct nfs4_client {
|
|||
time64_t cl_ra_time;
|
||||
#ifdef CONFIG_NFSD_SCSILAYOUT
|
||||
struct xarray cl_dev_fences;
|
||||
struct mutex cl_fence_mutex;
|
||||
#endif
|
||||
};
|
||||
|
||||
|
|
@ -745,6 +747,10 @@ struct nfs4_layout_stateid {
|
|||
stateid_t ls_recall_sid;
|
||||
bool ls_recalled;
|
||||
struct mutex ls_mutex;
|
||||
|
||||
struct delayed_work ls_fence_work;
|
||||
unsigned int ls_fence_delay;
|
||||
bool ls_fenced;
|
||||
};
|
||||
|
||||
static inline struct nfs4_layout_stateid *layoutstateid(struct nfs4_stid *s)
|
||||
|
|
|
|||
|
|
@ -50,6 +50,7 @@ struct lease_manager_operations {
|
|||
void (*lm_setup)(struct file_lease *, void **);
|
||||
bool (*lm_breaker_owns_lease)(struct file_lease *);
|
||||
int (*lm_open_conflict)(struct file *, int);
|
||||
bool (*lm_breaker_timedout)(struct file_lease *fl);
|
||||
};
|
||||
|
||||
struct lock_manager {
|
||||
|
|
|
|||
Loading…
Reference in New Issue