GPU-Resident Queue API#

The uPCIe CUDA backend supports GPU-resident NVMe queue pairs via this API. A GPU-resident queue pair lives entirely in CUDA device memory and can be passed as a kernel argument so that CUDA threads submit and reap NVMe commands without any host involvement after launch.

Note

This API is experimental and may change without notice.

API Overview#

Symbol

Where

Description

xnvme_cuda_queue_create()

Host (C/C++)

Allocate a GPU-resident queue pair and register it with the NVMe controller

xnvme_cuda_queue_destroy()

Host (C/C++)

Deregister the queue pair and free device memory

xnvme_cuda_cmd_io()

Device (CUDA kernel)

Submit one NVMe command per thread and reap its completion cooperatively

struct xnvme_cuda_queue is an opaque type — its internals are not part of the public API. xnvme_cuda_queue_create() allocates and initializes the queue pair in CUDA device memory, so the returned pointer can be passed directly to a kernel without any additional copy. This is in contrast to the command array, which is prepared on the host and must be copied to device memory with cudaMemcpy() before the kernel launch.

Host-Side Setup#

Open a device on the upcie-cuda backend, allocate a data buffer, and prepare one xnvme_spec_cmd per thread on the host. PRP entries require physical addresses, so use xnvme_buf_vtophys to resolve them before copying the commands to device memory and launching the kernel:

/**
 * Example: GPU-resident NVMe queue usage on the 'upcie-cuda' backend.
 *
 * Opens the device named by argv[1], prepares 'qdepth' single-LBA read
 * commands on the host (resolving PRP entries to physical addresses),
 * copies them to CUDA device memory, and launches one block of 'qdepth'
 * threads that each submit and reap one command via a GPU-resident queue.
 *
 * Returns 0 on success, a negative errno value on failure.
 */
int
main(int argc, char **argv)
{
	struct xnvme_opts opts = xnvme_opts_default();
	struct xnvme_dev *dev = NULL;
	struct xnvme_spec_cmd *h_cmds = NULL, *d_cmds = NULL;
	struct xnvme_cuda_queue *gpu_queue = NULL;
	uint64_t phys;
	uint32_t nsid;
	size_t qdepth, lba_nbytes;
	char *buf = NULL;
	int *h_errors = NULL, *d_errors = NULL;
	cudaError_t cerr;
	int err = 0;

	if (argc < 2) {
		fprintf(stderr, "Usage: %s <pci-id>\n", argv[0]);
		return -EINVAL;
	}

	opts.be = "upcie-cuda";
	dev = xnvme_dev_open(argv[1], &opts);
	if (!dev) {
		err = -errno;
		xnvme_cli_perr("xnvme_dev_open()", err);
		return err;
	}

	/*
	 * qdepth is both the CUDA block dimension and the NVMe queue depth.
	 * Pass it as batch_size to the kernel to make all threads active.
	 * To submit a partial batch, pass a smaller value as batch_size;
	 * threads with tid >= batch_size participate in barriers but do not
	 * submit or reap a command.
	 */
	qdepth = 64;
	lba_nbytes = xnvme_dev_get_geo(dev)->lba_nbytes;
	nsid = xnvme_dev_get_nsid(dev);

	buf = (char *)xnvme_buf_alloc(dev, qdepth * lba_nbytes);
	if (!buf) {
		err = -errno;
		xnvme_cli_perr("xnvme_buf_alloc()", err);
		goto exit;
	}

	/* Prepare one command per thread with physical PRP addresses */
	h_cmds = (struct xnvme_spec_cmd *)calloc(qdepth, sizeof(*h_cmds));
	if (!h_cmds) {
		err = -errno;
		xnvme_cli_perr("calloc(h_cmds)", err);
		goto exit;
	}

	for (size_t i = 0; i < qdepth; i++) {
		/* PRP1 must hold a physical address; fail early on a bad resolve */
		err = xnvme_buf_vtophys(dev, buf + i * lba_nbytes, &phys);
		if (err) {
			xnvme_cli_perr("xnvme_buf_vtophys()", err);
			goto exit;
		}

		h_cmds[i].common.opcode = XNVME_SPEC_NVM_OPC_READ;
		h_cmds[i].nvm.slba = i;
		h_cmds[i].common.nsid = nsid;
		h_cmds[i].common.dptr.prp.prp1 = phys;
		h_cmds[i].nvm.nlb = 0; /* nlb is zero-based; 0 means 1 LBA */
	}

	/* Copy commands to device memory before passing to the kernel */
	cerr = cudaMalloc((void **)&d_cmds, qdepth * sizeof(*d_cmds));
	if (cerr != cudaSuccess) {
		xnvme_cli_perr("cudaMalloc(d_cmds)", -ENOMEM);
		err = -ENOMEM;
		goto exit;
	}
	cerr = cudaMemcpy(d_cmds, h_cmds, qdepth * sizeof(*d_cmds), cudaMemcpyHostToDevice);
	if (cerr != cudaSuccess) {
		xnvme_cli_perr("cudaMemcpy(d_cmds)", -EIO);
		err = -EIO;
		goto exit;
	}

	/* Allocate a per-thread error buffer so the kernel can report failures */
	cerr = cudaMalloc((void **)&d_errors, qdepth * sizeof(int));
	if (cerr != cudaSuccess) {
		xnvme_cli_perr("cudaMalloc(d_errors)", -ENOMEM);
		err = -ENOMEM;
		goto exit;
	}
	cerr = cudaMemset(d_errors, 0, qdepth * sizeof(int));
	if (cerr != cudaSuccess) {
		xnvme_cli_perr("cudaMemset(d_errors)", -EIO);
		err = -EIO;
		goto exit;
	}

	err = xnvme_cuda_queue_create(dev, qdepth, &gpu_queue);
	if (err) {
		xnvme_cli_perr("xnvme_cuda_queue_create()", err);
		goto exit;
	}

	/* One block of 'qdepth' threads; each submits and reaps one command */
	xnvme_cuda_io_kernel<<<1, qdepth>>>(gpu_queue, d_cmds, qdepth, d_errors);

	/* Launch-configuration errors surface via cudaGetLastError() ... */
	cerr = cudaGetLastError();
	if (cerr != cudaSuccess) {
		xnvme_cli_perr("xnvme_cuda_io_kernel launch", -EIO);
		err = -EIO;
		goto exit;
	}
	/* ... and in-kernel execution faults at the next synchronizing call */
	cerr = cudaDeviceSynchronize();
	if (cerr != cudaSuccess) {
		xnvme_cli_perr("cudaDeviceSynchronize()", -EIO);
		err = -EIO;
		goto exit;
	}

	h_errors = (int *)calloc(qdepth, sizeof(int));
	if (!h_errors) {
		err = -errno;
		xnvme_cli_perr("calloc(h_errors)", err);
		goto exit;
	}
	cerr = cudaMemcpy(h_errors, d_errors, qdepth * sizeof(int), cudaMemcpyDeviceToHost);
	if (cerr != cudaSuccess) {
		xnvme_cli_perr("cudaMemcpy(h_errors)", -EIO);
		err = -EIO;
		goto exit;
	}

	/* Report every failing thread; the last error becomes the exit status */
	for (size_t i = 0; i < qdepth; i++) {
		if (h_errors[i]) {
			fprintf(stderr, "thread %zu failed: %d\n", i, h_errors[i]);
			err = h_errors[i];
		}
	}
	if (!err) {
		printf("OK: %zu GPU-submitted NVMe reads completed successfully\n", qdepth);
	}

exit:
	if (gpu_queue) {
		xnvme_cuda_queue_destroy(dev, gpu_queue);
	}
	cudaFree(d_errors);
	cudaFree(d_cmds);
	free(h_errors);
	free(h_cmds);
	xnvme_buf_free(dev, buf);
	xnvme_dev_close(dev);

	return err;
}

Device-Side Dispatch#

Inside a CUDA kernel, use xnvme_cuda_cmd_io() to submit one NVMe command per active thread. The commands are fully prepared on the host, so the kernel only needs to index into the command array and call the helper.

/**
 * Per-thread NVMe dispatch: each CUDA thread submits the command at its own
 * index and reaps the completion via xnvme_cuda_cmd_io(). Expects a single
 * block whose dimension equals the queue depth; 'errors' must hold one int
 * per thread.
 */
__global__ static void
xnvme_cuda_io_kernel(struct xnvme_cuda_queue *qp, struct xnvme_spec_cmd *cmds, size_t batch_size,
		     int *errors)
{
	const size_t thread_id = threadIdx.x;
	struct xnvme_spec_cmd *cmd = cmds + thread_id;

	/* No return channel from a kernel; record per-thread status instead */
	errors[thread_id] = xnvme_cuda_cmd_io(qp, cmd, thread_id, batch_size);
}

The queue depth passed to xnvme_cuda_queue_create() must equal the CUDA block dimension. The batch_size argument to xnvme_cuda_cmd_io() controls how many threads actively submit commands and can be anything up to the queue depth. Threads with tid >= batch_size participate in the required barriers but do not submit or reap a command.