diff --git a/include/uapi/linux/io_uring.h b/include/uapi/linux/io_uring.h
index 7af8d10b3aba..5135e1be0390 100644
--- a/include/uapi/linux/io_uring.h
+++ b/include/uapi/linux/io_uring.h
@@ -225,6 +225,12 @@ enum io_uring_sqe_flags_bit {
 /* Use hybrid poll in iopoll process */
 #define IORING_SETUP_HYBRID_IOPOLL	(1U << 17)
 
+/*
+ * Allow both 16b and 32b CQEs. If a 32b CQE is posted, it will have
+ * IORING_CQE_F_32 set in cqe->flags.
+ */
+#define IORING_SETUP_CQE_MIXED		(1U << 18)
+
 enum io_uring_op {
 	IORING_OP_NOP,
 	IORING_OP_READV,
diff --git a/io_uring/io_uring.c b/io_uring/io_uring.c
index 5166f11f07c7..6c07efac977c 100644
--- a/io_uring/io_uring.c
+++ b/io_uring/io_uring.c
@@ -620,27 +620,29 @@ static void io_cq_unlock_post(struct io_ring_ctx *ctx)
 
 static void __io_cqring_overflow_flush(struct io_ring_ctx *ctx, bool dying)
 {
-	size_t cqe_size = sizeof(struct io_uring_cqe);
-
 	lockdep_assert_held(&ctx->uring_lock);
 
 	/* don't abort if we're dying, entries must get freed */
 	if (!dying && __io_cqring_events(ctx) == ctx->cq_entries)
 		return;
 
-	if (ctx->flags & IORING_SETUP_CQE32)
-		cqe_size <<= 1;
-
 	io_cq_lock(ctx);
 	while (!list_empty(&ctx->cq_overflow_list)) {
+		size_t cqe_size = sizeof(struct io_uring_cqe);
 		struct io_uring_cqe *cqe;
 		struct io_overflow_cqe *ocqe;
+		bool is_cqe32 = false;
 
 		ocqe = list_first_entry(&ctx->cq_overflow_list,
 					struct io_overflow_cqe, list);
+		if (ocqe->cqe.flags & IORING_CQE_F_32 ||
+		    ctx->flags & IORING_SETUP_CQE32) {
+			is_cqe32 = true;
+			cqe_size <<= 1;
+		}
 		if (!dying) {
-			if (!io_get_cqe_overflow(ctx, &cqe, true))
+			if (!io_get_cqe_overflow(ctx, &cqe, true, is_cqe32))
 				break;
 			memcpy(cqe, &ocqe->cqe, cqe_size);
 		}
@@ -752,10 +754,12 @@ static struct io_overflow_cqe *io_alloc_ocqe(struct io_ring_ctx *ctx,
 {
 	struct io_overflow_cqe *ocqe;
 	size_t ocq_size = sizeof(struct io_overflow_cqe);
-	bool is_cqe32 = (ctx->flags & IORING_SETUP_CQE32);
+	bool is_cqe32 = false;
 
-	if (is_cqe32)
-		ocq_size += sizeof(struct io_uring_cqe);
+	if (cqe->flags & IORING_CQE_F_32 || ctx->flags & IORING_SETUP_CQE32) {
+		is_cqe32 = true;
+		ocq_size <<= 1;
+	}
 
 	ocqe = kzalloc(ocq_size, gfp | __GFP_ACCOUNT);
 	trace_io_uring_cqe_overflow(ctx, cqe->user_data, cqe->res, cqe->flags, ocqe);
@@ -773,12 +777,30 @@ static struct io_overflow_cqe *io_alloc_ocqe(struct io_ring_ctx *ctx,
 	return ocqe;
 }
 
+/*
+ * Fill an empty dummy CQE, in case alignment is off for posting a 32b CQE
+ * because the ring is a single 16b entry away from wrapping.
+ */
+static bool io_fill_nop_cqe(struct io_ring_ctx *ctx, unsigned int off)
+{
+	if (__io_cqring_events(ctx) < ctx->cq_entries) {
+		struct io_uring_cqe *cqe = &ctx->rings->cqes[off];
+
+		cqe->user_data = 0;
+		cqe->res = 0;
+		cqe->flags = IORING_CQE_F_SKIP;
+		ctx->cached_cq_tail++;
+		return true;
+	}
+	return false;
+}
+
 /*
  * writes to the cq entry need to come after reading head; the
  * control dependency is enough as we're using WRITE_ONCE to
  * fill the cq entry
  */
-bool io_cqe_cache_refill(struct io_ring_ctx *ctx, bool overflow)
+bool io_cqe_cache_refill(struct io_ring_ctx *ctx, bool overflow, bool cqe32)
 {
 	struct io_rings *rings = ctx->rings;
 	unsigned int off = ctx->cached_cq_tail & (ctx->cq_entries - 1);
@@ -792,12 +814,22 @@ bool io_cqe_cache_refill(struct io_ring_ctx *ctx, bool overflow)
 	if (!overflow && (ctx->check_cq & BIT(IO_CHECK_CQ_OVERFLOW_BIT)))
 		return false;
 
+	/*
+	 * Post dummy CQE if a 32b CQE is needed and there's only room for a
+	 * 16b CQE before the ring wraps.
+	 */
+	if (cqe32 && off + 1 == ctx->cq_entries) {
+		if (!io_fill_nop_cqe(ctx, off))
+			return false;
+		off = 0;
+	}
+
 	/* userspace may cheat modifying the tail, be safe and do min */
 	queued = min(__io_cqring_events(ctx), ctx->cq_entries);
 	free = ctx->cq_entries - queued;
 	/* we need a contiguous range, limit based on the current array offset */
 	len = min(free, ctx->cq_entries - off);
-	if (!len)
+	if (len < (cqe32 + 1))
 		return false;
 
 	if (ctx->flags & IORING_SETUP_CQE32) {
@@ -815,9 +847,9 @@ static bool io_fill_cqe_aux32(struct io_ring_ctx *ctx,
 {
 	struct io_uring_cqe *cqe;
 
-	if (WARN_ON_ONCE(!(ctx->flags & IORING_SETUP_CQE32)))
+	if (WARN_ON_ONCE(!(ctx->flags & (IORING_SETUP_CQE32|IORING_SETUP_CQE_MIXED))))
 		return false;
-	if (unlikely(!io_get_cqe(ctx, &cqe)))
+	if (unlikely(!io_get_cqe(ctx, &cqe, true)))
 		return false;
 
 	memcpy(cqe, src_cqe, 2 * sizeof(*cqe));
@@ -828,14 +860,15 @@ static bool io_fill_cqe_aux(struct io_ring_ctx *ctx, u64 user_data, s32 res,
 			    u32 cflags)
 {
+	bool cqe32 = cflags & IORING_CQE_F_32;
 	struct io_uring_cqe *cqe;
 
-	if (likely(io_get_cqe(ctx, &cqe))) {
+	if (likely(io_get_cqe(ctx, &cqe, cqe32))) {
 		WRITE_ONCE(cqe->user_data, user_data);
 		WRITE_ONCE(cqe->res, res);
 		WRITE_ONCE(cqe->flags, cflags);
 
-		if (ctx->flags & IORING_SETUP_CQE32) {
+		if (cqe32) {
 			WRITE_ONCE(cqe->big_cqe[0], 0);
 			WRITE_ONCE(cqe->big_cqe[1], 0);
 		}
@@ -2756,6 +2789,10 @@ unsigned long rings_size(unsigned int flags, unsigned int sq_entries,
 		if (check_shl_overflow(off, 1, &off))
 			return SIZE_MAX;
 	}
+	if (flags & IORING_SETUP_CQE_MIXED) {
+		if (cq_entries < 2)
+			return SIZE_MAX;
+	}
 
 #ifdef CONFIG_SMP
 	off = ALIGN(off, SMP_CACHE_BYTES);
@@ -3680,6 +3717,14 @@ static int io_uring_sanitise_params(struct io_uring_params *p)
 	    !(flags & IORING_SETUP_SINGLE_ISSUER))
 		return -EINVAL;
 
+	/*
+	 * Nonsensical to ask for CQE32 and mixed CQE support, it's not
+	 * supported to post 16b CQEs on a ring setup with CQE32.
+	 */
+	if ((flags & (IORING_SETUP_CQE32|IORING_SETUP_CQE_MIXED)) ==
+			(IORING_SETUP_CQE32|IORING_SETUP_CQE_MIXED))
+		return -EINVAL;
+
 	return 0;
 }
 
@@ -3906,7 +3951,8 @@ static long io_uring_setup(u32 entries, struct io_uring_params __user *params)
 			IORING_SETUP_SQE128 | IORING_SETUP_CQE32 |
 			IORING_SETUP_SINGLE_ISSUER | IORING_SETUP_DEFER_TASKRUN |
 			IORING_SETUP_NO_MMAP | IORING_SETUP_REGISTERED_FD_ONLY |
-			IORING_SETUP_NO_SQARRAY | IORING_SETUP_HYBRID_IOPOLL))
+			IORING_SETUP_NO_SQARRAY | IORING_SETUP_HYBRID_IOPOLL |
+			IORING_SETUP_CQE_MIXED))
 		return -EINVAL;
 
 	return io_uring_create(entries, &p, params);
diff --git a/io_uring/io_uring.h b/io_uring/io_uring.h
index abc6de227f74..2bcb565d9de6 100644
--- a/io_uring/io_uring.h
+++ b/io_uring/io_uring.h
@@ -75,7 +75,7 @@ static inline bool io_should_wake(struct io_wait_queue *iowq)
 unsigned long rings_size(unsigned int flags, unsigned int sq_entries,
 			 unsigned int cq_entries, size_t *sq_offset);
 int io_uring_fill_params(unsigned entries, struct io_uring_params *p);
-bool io_cqe_cache_refill(struct io_ring_ctx *ctx, bool overflow);
+bool io_cqe_cache_refill(struct io_ring_ctx *ctx, bool overflow, bool cqe32);
 int io_run_task_work_sig(struct io_ring_ctx *ctx);
 void io_req_defer_failed(struct io_kiocb *req, s32 res);
 bool io_post_aux_cqe(struct io_ring_ctx *ctx, u64 user_data, s32 res, u32 cflags);
@@ -169,25 +169,31 @@ static inline void io_submit_flush_completions(struct io_ring_ctx *ctx)
 static inline bool io_get_cqe_overflow(struct io_ring_ctx *ctx,
 				       struct io_uring_cqe **ret,
-				       bool overflow)
+				       bool overflow, bool cqe32)
 {
 	io_lockdep_assert_cq_locked(ctx);
 
-	if (unlikely(ctx->cqe_cached >= ctx->cqe_sentinel)) {
-		if (unlikely(!io_cqe_cache_refill(ctx, overflow)))
+	if (unlikely(ctx->cqe_sentinel - ctx->cqe_cached < (cqe32 + 1))) {
+		if (unlikely(!io_cqe_cache_refill(ctx, overflow, cqe32)))
 			return false;
 	}
 
 	*ret = ctx->cqe_cached;
 	ctx->cached_cq_tail++;
 	ctx->cqe_cached++;
-	if (ctx->flags & IORING_SETUP_CQE32)
+	if (ctx->flags & IORING_SETUP_CQE32) {
 		ctx->cqe_cached++;
+	} else if (cqe32 && ctx->flags & IORING_SETUP_CQE_MIXED) {
+		ctx->cqe_cached++;
+		ctx->cached_cq_tail++;
+	}
+	WARN_ON_ONCE(ctx->cqe_cached > ctx->cqe_sentinel);
 	return true;
 }
 
-static inline bool io_get_cqe(struct io_ring_ctx *ctx, struct io_uring_cqe **ret)
+static inline bool io_get_cqe(struct io_ring_ctx *ctx, struct io_uring_cqe **ret,
+			      bool cqe32)
 {
-	return io_get_cqe_overflow(ctx, ret, false);
+	return io_get_cqe_overflow(ctx, ret, false, cqe32);
 }
 
 static inline bool io_defer_get_uncommited_cqe(struct io_ring_ctx *ctx,
@@ -196,25 +202,24 @@ static inline bool io_defer_get_uncommited_cqe(struct io_ring_ctx *ctx,
 	io_lockdep_assert_cq_locked(ctx);
 
 	ctx->submit_state.cq_flush = true;
-	return io_get_cqe(ctx, cqe_ret);
+	return io_get_cqe(ctx, cqe_ret, ctx->flags & IORING_SETUP_CQE_MIXED);
 }
 
 static __always_inline bool io_fill_cqe_req(struct io_ring_ctx *ctx,
 					    struct io_kiocb *req)
 {
+	bool is_cqe32 = req->cqe.flags & IORING_CQE_F_32;
 	struct io_uring_cqe *cqe;
 
 	/*
-	 * If we can't get a cq entry, userspace overflowed the
-	 * submission (by quite a lot). Increment the overflow count in
-	 * the ring.
+	 * If we can't get a cq entry, userspace overflowed the submission
+	 * (by quite a lot).
*/ - if (unlikely(!io_get_cqe(ctx, &cqe))) + if (unlikely(!io_get_cqe(ctx, &cqe, is_cqe32))) return false; - memcpy(cqe, &req->cqe, sizeof(*cqe)); - if (ctx->flags & IORING_SETUP_CQE32) { + if (is_cqe32) { memcpy(cqe->big_cqe, &req->big_cqe, sizeof(*cqe)); memset(&req->big_cqe, 0, sizeof(req->big_cqe)); } @@ -239,6 +244,22 @@ static inline void io_req_set_res(struct io_kiocb *req, s32 res, u32 cflags) req->cqe.flags = cflags; } +static inline u32 ctx_cqe32_flags(struct io_ring_ctx *ctx) +{ + if (ctx->flags & IORING_SETUP_CQE_MIXED) + return IORING_CQE_F_32; + return 0; +} + +static inline void io_req_set_res32(struct io_kiocb *req, s32 res, u32 cflags, + __u64 extra1, __u64 extra2) +{ + req->cqe.res = res; + req->cqe.flags = cflags | ctx_cqe32_flags(req->ctx); + req->big_cqe.extra1 = extra1; + req->big_cqe.extra2 = extra2; +} + static inline void *io_uring_alloc_async_data(struct io_alloc_cache *cache, struct io_kiocb *req) { diff --git a/io_uring/register.c b/io_uring/register.c index a59589249fce..a1a9b2884eae 100644 --- a/io_uring/register.c +++ b/io_uring/register.c @@ -396,7 +396,8 @@ static void io_register_free_rings(struct io_ring_ctx *ctx, #define RESIZE_FLAGS (IORING_SETUP_CQSIZE | IORING_SETUP_CLAMP) #define COPY_FLAGS (IORING_SETUP_NO_SQARRAY | IORING_SETUP_SQE128 | \ - IORING_SETUP_CQE32 | IORING_SETUP_NO_MMAP) + IORING_SETUP_CQE32 | IORING_SETUP_NO_MMAP | \ + IORING_SETUP_CQE_MIXED) static int io_register_resize_rings(struct io_ring_ctx *ctx, void __user *arg) {