// SPDX-License-Identifier: GPL-2.0

/*
 * Copyright 2024 HabanaLabs, Ltd.
 * All Rights Reserved.
 */

#include "habanalabs.h"
#include "hldio.h"
#include <generated/uapi/linux/version.h>
#include <linux/pci-p2pdma.h>
#include <linux/blkdev.h>
#include <linux/vmalloc.h>

/*
 * NVMe Direct I/O implementation for habanalabs driver
 *
 * ASSUMPTIONS
 * ===========
 * 1. No IOMMU (technically it can work with an IOMMU, but then it is almost
 *    useless).
 * 2. Only READ operations (can be extended in the future).
 * 3. No sparse files (can be overcome in the future).
 * 4. Kernel version >= 6.9.
 * 5. Requiring page alignment is OK (no solution to this one right now;
 *    how would we read partial pages?).
 * 6. Kernel compiled with CONFIG_PCI_P2PDMA. This requires a CUSTOM kernel.
 *    There may be a way around this, but it is probably unacceptable for
 *    upstream, and may not work in the end.
 * 7. Either make sure our cards and disks are under the same PCI bridge, or
 *    compile a custom kernel to hack around this.
 */

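/*
 * A minimal build-time guard for assumption 6 above (illustrative sketch
 * only; the error text is an example of how the dependency could be made
 * explicit):
 *
 *	#if !IS_ENABLED(CONFIG_PCI_P2PDMA)
 *	#error "hldio requires a kernel built with CONFIG_PCI_P2PDMA"
 *	#endif
 */
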
#define IO_STABILIZE_TIMEOUT 10000000 /* 10 seconds in microseconds */

/*
 * This struct contains all the useful data extracted from the file handle
 * provided by the user.
 * @TODO: right now it is retrieved on each IO, but this can be done once with
 * a dedicated IOCTL, e.g. HL_REGISTER_HANDLE.
 */
struct hl_dio_fd {
	/* Back pointer in case we need it in async completion */
	struct hl_ctx *ctx;
	/* Associated fd struct */
	struct file *filp;
};

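/*
 * One possible shape of the HL_REGISTER_HANDLE flow mentioned in the TODO
 * above (a sketch only; the ioctl helper name and the ctx field are
 * hypothetical and do not exist in this driver):
 *
 *	// ioctl handler: resolve the fd once and cache it in the context
 *	static int hl_dio_register_handle(struct hl_ctx *ctx, int fd)
 *	{
 *		struct hl_dio_fd *f;
 *		int rc;
 *
 *		f = kzalloc(sizeof(*f), GFP_KERNEL);
 *		if (!f)
 *			return -ENOMEM;
 *
 *		rc = hl_dio_fd_register(ctx, fd, f);
 *		if (rc) {
 *			kfree(f);
 *			return rc;
 *		}
 *
 *		ctx->dio_fd = f;	// hypothetical field
 *		return 0;
 *	}
 *
 * The IO path would then use ctx->dio_fd directly instead of calling
 * hl_dio_fd_register()/hl_dio_fd_unregister() on every request.
 */
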
/*
 * This is a single IO descriptor
 */
struct hl_direct_io {
	struct hl_dio_fd f;
	struct kiocb kio;
	struct bio_vec *bv;
	struct iov_iter iter;
	u64 device_va;
	u64 off_bytes;
	u64 len_bytes;
	u32 type;
};

bool hl_device_supports_nvme(struct hl_device *hdev)
{
	return hdev->asic_prop.supports_nvme;
}

static int hl_dio_fd_register(struct hl_ctx *ctx, int fd, struct hl_dio_fd *f)
{
	struct hl_device *hdev = ctx->hdev;
	struct block_device *bd;
	struct super_block *sb;
	struct inode *inode;
	struct gendisk *gd;
	struct device *disk_dev;
	int rc;

	f->filp = fget(fd);
	if (!f->filp) {
		rc = -ENOENT;
		goto out;
	}

	if (!(f->filp->f_flags & O_DIRECT)) {
		dev_err(hdev->dev, "file is not open in O_DIRECT mode\n");
		rc = -EINVAL;
		goto fput;
	}

	if (!f->filp->f_op->read_iter) {
		dev_err(hdev->dev, "read_iter is not supported, need to fall back to legacy IO\n");
		rc = -EINVAL;
		goto fput;
	}

	inode = file_inode(f->filp);
	sb = inode->i_sb;
	bd = sb->s_bdev;

	/* Validate the block device before dereferencing it */
	if (!bd || !bd->bd_disk) {
		dev_err(hdev->dev, "invalid block device\n");
		rc = -ENODEV;
		goto fput;
	}
	gd = bd->bd_disk;

	if ((inode->i_blocks << sb->s_blocksize_bits) < i_size_read(inode)) {
		dev_err(hdev->dev, "sparse files are not currently supported\n");
		rc = -EINVAL;
		goto fput;
	}

	/* Get the underlying device from the block device */
	disk_dev = disk_to_dev(gd);
	if (!dma_pci_p2pdma_supported(disk_dev)) {
		dev_err(hdev->dev, "device does not support PCI P2P DMA\n");
		rc = -EOPNOTSUPP;
		goto fput;
	}

	/*
	 * @TODO: Maybe we need additional checks here
	 */

	f->ctx = ctx;
	rc = 0;

	goto out;
fput:
	fput(f->filp);
out:
	return rc;
}

static void hl_dio_fd_unregister(struct hl_dio_fd *f)
{
	fput(f->filp);
}

static long hl_dio_count_io(struct hl_device *hdev)
{
	s64 sum = 0;
	int i;

	for_each_possible_cpu(i)
		sum += per_cpu(*hdev->hldio.inflight_ios, i);

	return sum;
}

/*
 * Take a reference on the IO path. Returns false if IO is currently disabled
 * and no new IO may be scheduled.
 */
static bool hl_dio_get_iopath(struct hl_ctx *ctx)
{
	struct hl_device *hdev = ctx->hdev;

	if (hdev->hldio.io_enabled) {
		this_cpu_inc(*hdev->hldio.inflight_ios);

		/* Avoid racing with hl_dio_stop() disabling IO concurrently */
		if (!hdev->hldio.io_enabled) {
			this_cpu_dec(*hdev->hldio.inflight_ios);
			return false;
		}

		hl_ctx_get(ctx);

		return true;
	}

	return false;
}

static void hl_dio_put_iopath(struct hl_ctx *ctx)
{
	struct hl_device *hdev = ctx->hdev;

	hl_ctx_put(ctx);
	this_cpu_dec(*hdev->hldio.inflight_ios);
}

static void hl_dio_set_io_enabled(struct hl_device *hdev, bool enabled)
{
	hdev->hldio.io_enabled = enabled;
}

static bool hl_dio_validate_io(struct hl_device *hdev, struct hl_direct_io *io)
{
	if ((u64)io->device_va & ~PAGE_MASK) {
		dev_dbg(hdev->dev, "device address must be page aligned\n");
		return false;
	}

	if (io->len_bytes & ~PAGE_MASK) {
		dev_dbg(hdev->dev, "IO length must be page aligned\n");
		return false;
	}

	if (io->off_bytes & ~PAGE_MASK) {
		dev_dbg(hdev->dev, "IO offset must be page aligned\n");
		return false;
	}

	return true;
}

static struct page *hl_dio_va2page(struct hl_device *hdev, struct hl_ctx *ctx, u64 device_va)
{
	struct hl_dio *hldio = &hdev->hldio;
	u64 device_pa;
	int rc, i;

	rc = hl_mmu_va_to_pa(ctx, device_va, &device_pa);
	if (rc) {
		dev_err(hdev->dev, "device virtual address translation error: %#llx (%d)\n",
			device_va, rc);
		return NULL;
	}

	for (i = 0 ; i < hldio->np2prs ; ++i) {
		if (device_pa >= hldio->p2prs[i].device_pa &&
		    device_pa < hldio->p2prs[i].device_pa + hldio->p2prs[i].size)
			return hldio->p2prs[i].p2ppages[(device_pa - hldio->p2prs[i].device_pa) >>
							PAGE_SHIFT];
	}

	return NULL;
}

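/*
 * Example of the translation above, assuming a single P2P region with
 * device_pa = 0x10_0000_0000, size = 2 MiB and 4 KiB pages: a device_va that
 * hl_mmu_va_to_pa() resolves to device_pa 0x10_0000_3000 falls inside the
 * region, its offset is 0x3000, so the page index is 0x3000 >> 12 = 3 and the
 * function returns p2ppages[3].
 */
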
static ssize_t hl_direct_io(struct hl_device *hdev, struct hl_direct_io *io)
{
	u64 npages, device_va;
	ssize_t rc;
	int i;

	if (!hl_dio_validate_io(hdev, io))
		return -EINVAL;

	if (!hl_dio_get_iopath(io->f.ctx)) {
		dev_info(hdev->dev, "can't schedule a new IO, IO is disabled\n");
		return -ESHUTDOWN;
	}

	init_sync_kiocb(&io->kio, io->f.filp);
	io->kio.ki_pos = io->off_bytes;

	npages = (io->len_bytes >> PAGE_SHIFT);

	/* @TODO: this can be implemented smarter, vmalloc in the iopath is not
	 * ideal. Maybe some variation of genpool. The number of pages may differ
	 * greatly, so maybe even use pools of different sizes and choose the
	 * closest one.
	 */
	io->bv = vzalloc(npages * sizeof(struct bio_vec));
	if (!io->bv) {
		/* vfree(NULL) is a no-op, so the common cleanup path is safe */
		rc = -ENOMEM;
		goto cleanup;
	}

	for (i = 0, device_va = io->device_va ; i < npages ; ++i, device_va += PAGE_SIZE) {
		io->bv[i].bv_page = hl_dio_va2page(hdev, io->f.ctx, device_va);
		if (!io->bv[i].bv_page) {
			dev_err(hdev->dev, "error getting page struct for device va %#llx\n",
				device_va);
			rc = -EFAULT;
			goto cleanup;
		}
		io->bv[i].bv_offset = 0;
		io->bv[i].bv_len = PAGE_SIZE;
	}

	iov_iter_bvec(&io->iter, io->type, io->bv, npages, io->len_bytes);
	if (io->f.filp->f_op && io->f.filp->f_op->read_iter)
		rc = io->f.filp->f_op->read_iter(&io->kio, &io->iter);
	else
		rc = -EINVAL;

cleanup:
	vfree(io->bv);
	hl_dio_put_iopath(io->f.ctx);

	dev_dbg(hdev->dev, "IO ended with %zd\n", rc);

	return rc;
}

/*
 * @TODO: This function can be used as a callback for IO completion under
 * kio->ki_complete in order to implement async IO.
 * Note that on more recent kernels there is no ret2.
 */
__maybe_unused static void hl_direct_io_complete(struct kiocb *kio, long ret, long ret2)
{
	struct hl_direct_io *io = container_of(kio, struct hl_direct_io, kio);

	dev_dbg(io->f.ctx->hdev->dev, "IO completed with %ld\n", ret);

	/* Do something to copy result to user / notify completion */

	hl_dio_put_iopath(io->f.ctx);

	hl_dio_fd_unregister(&io->f);
}

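/*
 * A rough sketch of how the async path could be wired up, per the TODO above
 * (illustrative only; assumes a kernel where ki_complete takes two arguments
 * and that the caller does not free the descriptor on submission):
 *
 *	io->kio.ki_complete = hl_direct_io_complete;
 *	rc = io->f.filp->f_op->read_iter(&io->kio, &io->iter);
 *	if (rc == -EIOCBQUEUED)
 *		return 0;	// completion callback will run later
 *	// otherwise the IO completed (or failed) synchronously
 *
 * Everything hl_direct_io() currently does after read_iter() would then move
 * into hl_direct_io_complete().
 */
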
/*
 * DMA disk to ASIC, wait for results. Must be invoked from the user context.
 */
int hl_dio_ssd2hl(struct hl_device *hdev, struct hl_ctx *ctx, int fd,
		  u64 device_va, off_t off_bytes, size_t len_bytes,
		  size_t *len_read)
{
	struct hl_direct_io *io;
	ssize_t rc;

	dev_dbg(hdev->dev, "SSD2HL fd=%d va=%#llx len=%#zx\n", fd, device_va, len_bytes);

	io = kzalloc(sizeof(*io), GFP_KERNEL);
	if (!io) {
		rc = -ENOMEM;
		goto out;
	}

	*io = (struct hl_direct_io){
		.device_va = device_va,
		.len_bytes = len_bytes,
		.off_bytes = off_bytes,
		.type = READ,
	};

	rc = hl_dio_fd_register(ctx, fd, &io->f);
	if (rc)
		goto kfree_io;

	rc = hl_direct_io(hdev, io);
	if (rc >= 0) {
		*len_read = rc;
		rc = 0;
	}

	/* This shall be called only in the case of a sync IO */
	hl_dio_fd_unregister(&io->f);
kfree_io:
	kfree(io);
out:
	return rc;
}

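/*
 * Hypothetical call site, e.g. from an IOCTL handler (the args structure and
 * its field names are made up for illustration; only hl_dio_ssd2hl() itself
 * exists in this file):
 *
 *	size_t len_read = 0;
 *	int rc;
 *
 *	rc = hl_dio_ssd2hl(hdev, ctx, args->fd, args->device_va,
 *			   args->file_offset, args->len_bytes, &len_read);
 *	if (rc)
 *		return rc;
 *	args->len_read = len_read;
 *
 * The file referenced by args->fd must be opened with O_DIRECT, and
 * device_va, file_offset and len_bytes must all be page aligned.
 */
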
static void hl_p2p_region_fini(struct hl_device *hdev, struct hl_p2p_region *p2pr)
{
	if (p2pr->p2ppages) {
		vfree(p2pr->p2ppages);
		p2pr->p2ppages = NULL;
	}

	if (p2pr->p2pmem) {
		dev_dbg(hdev->dev, "freeing P2P mem from %p, size=%#llx\n",
			p2pr->p2pmem, p2pr->size);
		pci_free_p2pmem(hdev->pdev, p2pr->p2pmem, p2pr->size);
		p2pr->p2pmem = NULL;
	}
}

void hl_p2p_region_fini_all(struct hl_device *hdev)
{
	int i;

	for (i = 0 ; i < hdev->hldio.np2prs ; ++i)
		hl_p2p_region_fini(hdev, &hdev->hldio.p2prs[i]);

	kvfree(hdev->hldio.p2prs);
	hdev->hldio.p2prs = NULL;
	hdev->hldio.np2prs = 0;
}

int hl_p2p_region_init(struct hl_device *hdev, struct hl_p2p_region *p2pr)
{
	void *addr;
	int rc, i;

	/* Start by publishing our p2p memory */
	rc = pci_p2pdma_add_resource(hdev->pdev, p2pr->bar, p2pr->size, p2pr->bar_offset);
	if (rc) {
		dev_err(hdev->dev, "error adding p2p resource: %d\n", rc);
		goto err;
	}

	/* Alloc all p2p mem */
	p2pr->p2pmem = pci_alloc_p2pmem(hdev->pdev, p2pr->size);
	if (!p2pr->p2pmem) {
		dev_err(hdev->dev, "error allocating p2p memory\n");
		rc = -ENOMEM;
		goto err;
	}

	p2pr->p2ppages = vmalloc((p2pr->size >> PAGE_SHIFT) * sizeof(struct page *));
	if (!p2pr->p2ppages) {
		rc = -ENOMEM;
		goto err;
	}

	for (i = 0, addr = p2pr->p2pmem ; i < (p2pr->size >> PAGE_SHIFT) ; ++i, addr += PAGE_SIZE) {
		p2pr->p2ppages[i] = virt_to_page(addr);
		if (!p2pr->p2ppages[i]) {
			rc = -EFAULT;
			goto err;
		}
	}

	return 0;
err:
	hl_p2p_region_fini(hdev, p2pr);
	return rc;
}

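/*
 * Sketch of how a region could be set up before calling hl_p2p_region_init()
 * (the BAR number, offset and size below are placeholders; the real values
 * come from the ASIC properties):
 *
 *	struct hl_p2p_region *p2pr = &hdev->hldio.p2prs[0];
 *
 *	p2pr->bar = 4;			// exposing BAR
 *	p2pr->bar_offset = 0;		// offset of the region within the BAR
 *	p2pr->size = SZ_2M;		// must be a multiple of PAGE_SIZE
 *
 *	rc = hl_p2p_region_init(hdev, p2pr);
 *
 * On teardown, hl_p2p_region_fini_all() releases every registered region.
 */
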
int hl_dio_start(struct hl_device *hdev)
{
	dev_dbg(hdev->dev, "initializing HLDIO\n");

	/* Initialize the IO counter and enable IO */
	hdev->hldio.inflight_ios = alloc_percpu(s64);
	if (!hdev->hldio.inflight_ios)
		return -ENOMEM;

	hl_dio_set_io_enabled(hdev, true);

	return 0;
}

void hl_dio_stop(struct hl_device *hdev)
{
	dev_dbg(hdev->dev, "deinitializing HLDIO\n");

	if (hdev->hldio.io_enabled) {
		/* Wait for all the IO to finish */
		hl_dio_set_io_enabled(hdev, false);
		hl_poll_timeout_condition(hdev, !hl_dio_count_io(hdev), 1000, IO_STABILIZE_TIMEOUT);
	}

	if (hdev->hldio.inflight_ios) {
		free_percpu(hdev->hldio.inflight_ios);
		hdev->hldio.inflight_ios = NULL;
	}
}