/*-
 * SPDX-License-Identifier: BSD-2-Clause
 *
 * Copyright (c) 2023-2024 Chelsio Communications, Inc.
 * Written by: John Baldwin <jhb@FreeBSD.org>
 */

#include <sys/param.h>
#include <sys/dnv.h>
#include <sys/jail.h>
#include <sys/kernel.h>
#include <sys/limits.h>
#include <sys/lock.h>
#include <sys/malloc.h>
#include <sys/mbuf.h>
#include <sys/memdesc.h>
#include <sys/module.h>
#include <sys/proc.h>
#include <sys/queue.h>
#include <sys/refcount.h>
#include <sys/sbuf.h>
#include <sys/smp.h>
#include <sys/sx.h>
#include <sys/taskqueue.h>

#include <machine/bus.h>
#include <machine/bus_dma.h>

#include <dev/nvmf/nvmf.h>
#include <dev/nvmf/nvmf_transport.h>
#include <dev/nvmf/controller/nvmft_subr.h>
#include <dev/nvmf/controller/nvmft_var.h>

#include <cam/ctl/ctl.h>
#include <cam/ctl/ctl_error.h>
#include <cam/ctl/ctl_ha.h>
#include <cam/ctl/ctl_io.h>
#include <cam/ctl/ctl_frontend.h>
#include <cam/ctl/ctl_private.h>

/*
 * Store pointers to the capsule and qpair in the two pointer members
 * of CTL_PRIV_FRONTEND.
 *
 * NVMFT_NC(io) names slot 0 (the capsule) and NVMFT_QP(io) names slot 1
 * (the queue pair).  Both expand to lvalues, so they are used for
 * assignment as well as for reads.
 */
#define	NVMFT_NC(io)	((io)->io_hdr.ctl_private[CTL_PRIV_FRONTEND].ptrs[0])
#define	NVMFT_QP(io)	((io)->io_hdr.ctl_private[CTL_PRIV_FRONTEND].ptrs[1])

/* Forward declarations for functions referenced before their definitions. */
static void	nvmft_done(union ctl_io *io);
static int	nvmft_init(void);
static int	nvmft_ioctl(struct cdev *cdev, u_long cmd, caddr_t data,
    int flag, struct thread *td);
static int	nvmft_shutdown(void);

/* Task queue created in nvmft_init() and freed in nvmft_shutdown(). */
static struct taskqueue *nvmft_taskq;
/* List of ports created by this frontend, linked through their 'link' field. */
static TAILQ_HEAD(, nvmft_port) nvmft_ports;
/* sx lock taken around lookups in and changes to nvmft_ports. */
static struct sx nvmft_ports_lock;

/* malloc(9) type used for allocations made by this file. */
MALLOC_DEFINE(M_NVMFT, "nvmft", "NVMe over Fabrics controller");

/*
 * CTL frontend descriptor for this driver.  It is also stored in the
 * 'frontend' field of every port created by nvmft_port_create().
 */
static struct ctl_frontend nvmft_frontend = {
	.name = "nvmf",
	.init = nvmft_init,
	.ioctl = nvmft_ioctl,
	.fe_dump = NULL,
	.shutdown = nvmft_shutdown,
};

/*
 * CTL port_online callback: flag the port as online.
 *
 * 'arg' is the struct nvmft_port installed as onoff_arg when the port
 * was created.
 */
static void
nvmft_online(void *arg)
{
	struct nvmft_port *port;

	port = arg;
	mtx_lock(&port->lock);
	port->online = true;
	mtx_unlock(&port->lock);
}

/*
 * CTL port_offline callback: flag the port as offline, report ENODEV to
 * every attached controller, and then sleep until the port's controller
 * list is empty.
 */
static void
nvmft_offline(void *arg)
{
	struct nvmft_port *port;
	struct nvmft_controller *c;

	port = arg;
	mtx_lock(&port->lock);
	port->online = false;

	/* Tell each controller on this port to shut down. */
	TAILQ_FOREACH(c, &port->controllers, link) {
		nvmft_printf(c, "shutting down due to port going offline\n");
		nvmft_controller_error(c, NULL, ENODEV);
	}

	/*
	 * Sleep on the port (the port lock is passed to mtx_sleep) until
	 * no controllers remain.
	 * NOTE(review): the matching wakeup is not in this file --
	 * presumably issued when a controller is unlinked; confirm.
	 */
	for (;;) {
		if (TAILQ_EMPTY(&port->controllers))
			break;
		mtx_sleep(port, &port->lock, 0, "nvmfoff", 0);
	}
	mtx_unlock(&port->lock);
}

static int
nvmft_info(void *arg, struct sbuf *sb)
{
	struct nvmft_port *np = arg;
	struct nvmft_controller *ctrlr;
	int retval;

	mtx_lock(&np->lock);
	retval = sbuf_printf(sb, "\t<port>%s,p,%u</port>\n", np->cdata.subnqn,
	    np->portid);
	if (retval != 0)
		goto out;

	TAILQ_FOREACH(ctrlr, &np->controllers, link) {
		retval = sbuf_printf(sb, "\t<host id=\"%u\">%s</host>\n",
		    ctrlr->cntlid, ctrlr->hostnqn);
		if (retval != 0)
			break;
	}
out:
	mtx_unlock(&np->lock);
	return (retval);
}

/*
 * CTL lun_enable callback: add the namespace for 'lun_id' to the port's
 * active namespace array and notify every attached controller.
 *
 * The namespace ID is lun_id + 1.  The insertion below keeps the array
 * in ascending order.
 *
 * Returns 0 on success, EOPNOTSUPP if lun_id is not below the port's
 * namespace count (cdata.nn), or EINVAL if the namespace is already in
 * the array.
 */
static int
nvmft_lun_enable(void *arg, int lun_id)
{
	struct nvmft_port *np = arg;
	struct nvmft_controller *ctrlr;
	uint32_t *old_ns, *new_ns;
	uint32_t nsid;
	u_int i, new_count;

	if (lun_id >= le32toh(np->cdata.nn)) {
		printf("NVMFT: %s lun %d larger than maximum nsid %u\n",
		    np->cdata.subnqn, lun_id, le32toh(np->cdata.nn));
		return (EOPNOTSUPP);
	}
	nsid = lun_id + 1;

	mtx_lock(&np->lock);
	/*
	 * Allocate the replacement array with the port lock dropped, then
	 * recheck the required size after reacquiring it.  If num_ns grew
	 * in the meantime so that the array is too small, free it and try
	 * again.
	 */
	for (;;) {
		new_count = np->num_ns + 1;
		mtx_unlock(&np->lock);
		new_ns = mallocarray(new_count, sizeof(*new_ns), M_NVMFT,
		    M_WAITOK);
		mtx_lock(&np->lock);
		if (np->num_ns + 1 <= new_count)
			break;
		free(new_ns, M_NVMFT);
	}
	/*
	 * Find the insertion index: the first entry that is not smaller
	 * than nsid, or num_ns if there is none.  An equal entry means
	 * the namespace is already enabled.
	 */
	for (i = 0; i < np->num_ns; i++) {
		if (np->active_ns[i] < nsid)
			continue;
		if (np->active_ns[i] == nsid) {
			mtx_unlock(&np->lock);
			free(new_ns, M_NVMFT);
			printf("NVMFT: %s duplicate lun %d\n",
			    np->cdata.subnqn, lun_id);
			return (EINVAL);
		}
		break;
	}

	/* Copy over IDs smaller than nsid. */
	memcpy(new_ns, np->active_ns, i * sizeof(*np->active_ns));

	/* Insert nsid. */
	new_ns[i] = nsid;

	/* Copy over IDs greater than nsid. */
	memcpy(new_ns + i + 1, np->active_ns + i, (np->num_ns - i) *
	    sizeof(*np->active_ns));

	/* Publish the new array; the old one is freed after unlocking. */
	np->num_ns++;
	old_ns = np->active_ns;
	np->active_ns = new_ns;

	TAILQ_FOREACH(ctrlr, &np->controllers, link) {
		nvmft_controller_lun_changed(ctrlr, lun_id);
	}

	mtx_unlock(&np->lock);
	free(old_ns, M_NVMFT);

	return (0);
}

/*
 * CTL lun_disable callback: drop the namespace for 'lun_id' from the
 * port's active namespace array and notify every attached controller.
 *
 * Returns 0 on success or when lun_id is not below the port's namespace
 * count (cdata.nn), and EINVAL if the namespace is not in the array.
 */
static int
nvmft_lun_disable(void *arg, int lun_id)
{
	struct nvmft_port *np = arg;
	struct nvmft_controller *ctrlr;
	uint32_t nsid;
	u_int idx;

	if (lun_id >= le32toh(np->cdata.nn))
		return (0);
	nsid = lun_id + 1;

	mtx_lock(&np->lock);

	/* Locate nsid; idx ends up at num_ns when it is absent. */
	idx = 0;
	while (idx < np->num_ns && np->active_ns[idx] != nsid)
		idx++;
	if (idx == np->num_ns) {
		mtx_unlock(&np->lock);
		printf("NVMFT: %s request to disable nonexistent lun %d\n",
		    np->cdata.subnqn, lun_id);
		return (EINVAL);
	}

	/* Close the gap by sliding the larger IDs down one slot. */
	memmove(np->active_ns + idx, np->active_ns + idx + 1,
	    (np->num_ns - (idx + 1)) * sizeof(*np->active_ns));
	np->num_ns--;

	/* The array is left at its current allocation size. */

	TAILQ_FOREACH(ctrlr, &np->controllers, link) {
		nvmft_controller_lun_changed(ctrlr, lun_id);
	}

	mtx_unlock(&np->lock);

	return (0);
}

/*
 * Copy the port's active namespace IDs that are greater than 'nsid'
 * into 'nslist' in little-endian form, stopping once the list is full.
 * Entries of 'nslist' beyond those written are not touched.
 */
void
nvmft_populate_active_nslist(struct nvmft_port *np, uint32_t nsid,
    struct nvme_ns_list *nslist)
{
	u_int idx, filled;

	filled = 0;
	mtx_lock(&np->lock);
	for (idx = 0; idx < np->num_ns; idx++) {
		if (np->active_ns[idx] > nsid) {
			nslist->ns[filled++] = htole32(np->active_ns[idx]);
			if (filled == nitems(nslist->ns))
				break;
		}
	}
	mtx_unlock(&np->lock);
}

/*
 * Hand a received command capsule to CTL.
 *
 * A command with NSID 0 is completed immediately with an
 * invalid-namespace error and its capsule is freed.  Otherwise a CTL
 * I/O is allocated, tagged with the capsule and queue pair, and passed
 * to ctl_run().  'admin' selects CTL_IO_NVME_ADMIN over CTL_IO_NVME.
 */
void
nvmft_dispatch_command(struct nvmft_qpair *qp, struct nvmf_capsule *nc,
    bool admin)
{
	struct nvmft_controller *ctrlr;
	const struct nvme_command *sqe;
	struct nvmft_port *np;
	union ctl_io *io;
	int error;

	ctrlr = nvmft_qpair_ctrlr(qp);
	sqe = nvmf_capsule_sqe(nc);
	np = ctrlr->np;

	if (sqe->nsid == htole32(0)) {
		nvmft_send_generic_error(qp, nc,
		    NVME_SC_INVALID_NAMESPACE_OR_FORMAT);
		nvmf_free_capsule(nc);
		return;
	}

	/* Busy-time accounting: note when the controller leaves idle. */
	mtx_lock(&ctrlr->lock);
	if (ctrlr->pending_commands++ == 0)
		ctrlr->start_busy = sbinuptime();
	mtx_unlock(&ctrlr->lock);

	io = ctl_alloc_io(np->port.ctl_pool_ref);
	ctl_zero_io(io);
	NVMFT_NC(io) = nc;
	NVMFT_QP(io) = qp;
	if (admin)
		io->io_hdr.io_type = CTL_IO_NVME_ADMIN;
	else
		io->io_hdr.io_type = CTL_IO_NVME;
	io->io_hdr.nexus.initid = ctrlr->cntlid;
	io->io_hdr.nexus.targ_port = np->port.targ_port;
	/* The CTL LUN is the namespace ID minus one. */
	io->io_hdr.nexus.targ_lun = le32toh(sqe->nsid) - 1;
	io->nvmeio.cmd = *sqe;

	error = ctl_run(io);
	if (error == 0)
		return;

	/* Complete the command with an error, then fail the queue pair. */
	nvmft_printf(ctrlr, "ctl_run failed for command on %s: %d\n",
	    nvmft_qpair_name(qp), error);
	ctl_nvme_set_generic_error(&io->nvmeio,
	    NVME_SC_INTERNAL_DEVICE_ERROR);
	nvmft_done(io);

	nvmft_controller_error(ctrlr, qp, ENXIO);
}

void
nvmft_terminate_commands(struct nvmft_controller *ctrlr)
{
	struct nvmft_port *np = ctrlr->np;
	union ctl_io *io;
	int error;

	mtx_lock(&ctrlr->lock);
	if (ctrlr->pending_commands == 0)
		ctrlr->start_busy = sbinuptime();
	ctrlr->pending_commands++;
	mtx_unlock(&ctrlr->lock);
	io = ctl_alloc_io(np->port.ctl_pool_ref);
	ctl_zero_io(io);
	NVMFT_QP(io) = ctrlr->admin;
	io->io_hdr.io_type = CTL_IO_TASK;
	io->io_hdr.nexus.initid = ctrlr->cntlid;
	io->io_hdr.nexus.targ_port = np->port.targ_port;
	io->io_hdr.nexus.targ_lun = 0;
	io->taskio.tag_type = CTL_TAG_SIMPLE; /* XXX: unused? */
	io->taskio.task_action = CTL_TASK_I_T_NEXUS_RESET;
	error = ctl_run(io);
	if (error != CTL_RETVAL_COMPLETE) {
		nvmft_printf(ctrlr, "failed to terminate tasks: %d\n", error);
#ifdef INVARIANTS
		io->io_hdr.status = CTL_SUCCESS;
#endif
		nvmft_done(io);
	}
}

/*
 * Completion callback passed to nvmf_receive_controller_data().
 *
 * On success the transferred byte count is subtracted from the
 * residual; on failure a data transfer error is recorded.  The
 * temporary segment list from nvmft_datamove_out() is freed before CTL
 * is told the data move is done.
 */
static void
nvmft_datamove_out_cb(void *arg, size_t xfered, int error)
{
	struct ctl_nvmeio *nvmeio = arg;

	if (error == 0) {
		MPASS(xfered == nvmeio->kern_data_len);
		nvmeio->kern_data_resid -= xfered;
	} else {
		ctl_nvme_set_data_transfer_error(nvmeio);
	}

	if (nvmeio->kern_sg_entries != 0) {
		free(nvmeio->ext_data_ptr, M_NVMFT);
		nvmeio->ext_data_ptr = NULL;
	} else {
		MPASS(nvmeio->ext_data_ptr == NULL);
	}
	ctl_datamove_done((union ctl_io *)nvmeio, false);
}

static void
nvmft_datamove_out(struct ctl_nvmeio *ctnio, struct nvmft_qpair *qp,
    struct nvmf_capsule *nc)
{
	struct memdesc mem;
	int error;

	MPASS(ctnio->ext_data_ptr == NULL);
	if (ctnio->kern_sg_entries > 0) {
		struct ctl_sg_entry *sgl;
		struct bus_dma_segment *vlist;

		vlist = mallocarray(ctnio->kern_sg_entries, sizeof(*vlist),
		    M_NVMFT, M_WAITOK);
		ctnio->ext_data_ptr = (void *)vlist;
		sgl = (struct ctl_sg_entry *)ctnio->kern_data_ptr;
		for (u_int i = 0; i < ctnio->kern_sg_entries; i++) {
			vlist[i].ds_addr = (uintptr_t)sgl[i].addr;
			vlist[i].ds_len = sgl[i].len;
		}
		mem = memdesc_vlist(vlist, ctnio->kern_sg_entries);
	} else
		mem = memdesc_vaddr(ctnio->kern_data_ptr, ctnio->kern_data_len);

	error = nvmf_receive_controller_data(nc, ctnio->kern_rel_offset, &mem,
	    ctnio->kern_data_len, nvmft_datamove_out_cb, ctnio);
	if (error == 0)
		return;

	nvmft_printf(nvmft_qpair_ctrlr(qp),
	    "Failed to request capsule data: %d\n", error);
	ctl_nvme_set_data_transfer_error(ctnio);

	if (ctnio->kern_sg_entries) {
		free(ctnio->ext_data_ptr, M_NVMFT);
		ctnio->ext_data_ptr = NULL;
	} else
		MPASS(ctnio->ext_data_ptr == NULL);
	ctl_datamove_done((union ctl_io *)ctnio, true);
}

/*
 * Build a new mbuf chain holding a copy of the kern_data_len bytes
 * described by 'ctnio' and return its head.
 *
 * kern_data_ptr is either a flat buffer (kern_sg_entries == 0) or an
 * array of struct ctl_sg_entry.
 */
static struct mbuf *
nvmft_copy_data(struct ctl_nvmeio *ctnio)
{
	struct ctl_sg_entry *sgl;
	struct mbuf *m0, *m;
	uint32_t resid, off, todo;
	int mlen;

	MPASS(ctnio->kern_data_len != 0);

	/* Request an mbuf chain for kern_data_len bytes. */
	m0 = m_getm2(NULL, ctnio->kern_data_len, M_WAITOK, MT_DATA, 0);

	if (ctnio->kern_sg_entries == 0) {
		m_copyback(m0, 0, ctnio->kern_data_len, ctnio->kern_data_ptr);
		return (m0);
	}

	/*
	 * Walk the S/G list and the mbuf chain in lockstep.  'off' is the
	 * position within the current S/G entry, 'mlen' is the space left
	 * in the current mbuf, and 'resid' is the number of bytes still to
	 * copy.
	 */
	resid = ctnio->kern_data_len;
	sgl = (struct ctl_sg_entry *)ctnio->kern_data_ptr;
	off = 0;
	m = m0;
	mlen = M_TRAILINGSPACE(m);
	for (;;) {
		/* Copy as much as both the entry and the mbuf allow. */
		todo = MIN(mlen, sgl->len - off);
		memcpy(mtod(m, char *) + m->m_len, (char *)sgl->addr + off,
		    todo);
		m->m_len += todo;
		resid -= todo;
		if (resid == 0) {
			MPASS(m->m_next == NULL);
			break;
		}

		/* Step to the next S/G entry once this one is consumed. */
		off += todo;
		if (off == sgl->len) {
			sgl++;
			off = 0;
		}
		/* Step to the next mbuf once this one is full. */
		mlen -= todo;
		if (mlen == 0) {
			m = m->m_next;
			mlen = M_TRAILINGSPACE(m);
		}
	}

	return (m0);
}

static void
m_free_ref_data(struct mbuf *m)
{
	ctl_ref kern_data_ref = m->m_ext.ext_arg1;

	kern_data_ref(m->m_ext.ext_arg2, -1);
}

/*
 * Return an mbuf that points at 'size' bytes of 'buf' as read-only
 * external storage instead of copying them.
 *
 * The buffer is pinned by calling kern_data_ref with +1; the matching
 * -1 is issued by m_free_ref_data().
 */
static struct mbuf *
m_get_ref_data(struct ctl_nvmeio *ctnio, void *buf, u_int size)
{
	struct mbuf *mb;

	mb = m_get(M_WAITOK, MT_DATA);

	/* Stash the ref callback and its argument for m_free_ref_data(). */
	m_extadd(mb, buf, size, m_free_ref_data, ctnio->kern_data_ref,
	    ctnio->kern_data_arg, M_RDONLY, EXT_CTL);
	mb->m_len = size;

	ctnio->kern_data_ref(ctnio->kern_data_arg, 1);
	return (mb);
}

static struct mbuf *
nvmft_ref_data(struct ctl_nvmeio *ctnio)
{
	struct ctl_sg_entry *sgl;
	struct mbuf *m0, *m;

	MPASS(ctnio->kern_data_len != 0);

	if (ctnio->kern_sg_entries == 0)
		return (m_get_ref_data(ctnio, ctnio->kern_data_ptr,
		    ctnio->kern_data_len));

	sgl = (struct ctl_sg_entry *)ctnio->kern_data_ptr;
	m0 = m_get_ref_data(ctnio, sgl[0].addr, sgl[0].len);
	m = m0;
	for (u_int i = 1; i < ctnio->kern_sg_entries; i++) {
		m->m_next = m_get_ref_data(ctnio, sgl[i].addr, sgl[i].len);
		m = m->m_next;
	}
	return (m0);
}

/*
 * Send a command's data to the host.
 *
 * The data is referenced in place when the I/O supplies a
 * kern_data_ref callback and copied into fresh mbufs otherwise.
 */
static void
nvmft_datamove_in(struct ctl_nvmeio *ctnio, struct nvmft_qpair *qp,
    struct nvmf_capsule *nc)
{
	struct mbuf *chain;
	u_int status;

	if (ctnio->kern_data_ref == NULL)
		chain = nvmft_copy_data(ctnio);
	else
		chain = nvmft_ref_data(ctnio);

	status = nvmf_send_controller_data(nc, ctnio->kern_rel_offset, chain,
	    ctnio->kern_data_len);
	if (status == NVMF_SUCCESS_SENT) {
		/*
		 * Record that success was already sent; nvmft_done() then
		 * skips sending a response for this command.
		 */
		ctnio->success_sent = true;
		nvmft_command_completed(qp, nc);
	} else if (status != NVMF_MORE && status != NVME_SC_SUCCESS) {
		/* Any other value is recorded as the command's error. */
		ctl_nvme_set_generic_error(ctnio, status);
	}
	ctl_datamove_done((union ctl_io *)ctnio, true);
}

void
nvmft_handle_datamove(union ctl_io *io)
{
	struct nvmf_capsule *nc;
	struct nvmft_qpair *qp;

	/* Some CTL commands preemptively set a success status. */
	MPASS(io->io_hdr.status == CTL_STATUS_NONE ||
	    io->io_hdr.status == CTL_SUCCESS);
	MPASS(!io->nvmeio.success_sent);

	nc = NVMFT_NC(io);
	qp = NVMFT_QP(io);

	if ((io->io_hdr.flags & CTL_FLAG_DATA_MASK) == CTL_FLAG_DATA_IN)
		nvmft_datamove_in(&io->nvmeio, qp, nc);
	else
		nvmft_datamove_out(&io->nvmeio, qp, nc);
}

/*
 * Complete a data move without transferring anything: set the abort
 * flag and a non-zero port status on the I/O, then report the data
 * move as done.
 */
void
nvmft_abort_datamove(union ctl_io *io)
{
	io->io_hdr.flags |= CTL_FLAG_ABORT;
	io->io_hdr.port_status = 1;
	ctl_datamove_done(io, true);
}

/*
 * CTL fe_datamove callback: pass the I/O to the queue pair stored in
 * its frontend-private data.
 */
static void
nvmft_datamove(union ctl_io *io)
{
	nvmft_qpair_datamove(NVMFT_QP(io), io);
}

/*
 * Queue 'task' on this frontend's task queue.  The return value of
 * taskqueue_enqueue() is deliberately discarded.
 */
void
nvmft_enqueue_task(struct task *task)
{
	(void)taskqueue_enqueue(nvmft_taskq, task);
}

/*
 * Drain 'task' from this frontend's task queue via taskqueue_drain().
 */
void
nvmft_drain_task(struct task *task)
{
	taskqueue_drain(nvmft_taskq, task);
}

/*
 * Add 'addend' to a 128-bit counter stored as two little-endian 64-bit
 * words, with pair[0] as the low word and pair[1] as the high word.
 *
 * Both words are converted to host order before arithmetic and back
 * afterwards.  Adding htole64(1) directly to the stored high word is
 * only correct on little-endian hosts; on big-endian hosts it adds to
 * the wrong byte and drops carries between bytes.
 */
static void
hip_add(uint64_t pair[2], uint64_t addend)
{
	uint64_t lo_old, lo_new;

	lo_old = le64toh(pair[0]);
	lo_new = lo_old + addend;
	pair[0] = htole64(lo_new);

	/* Unsigned wraparound of the low word means a carry is due. */
	if (lo_new < lo_old)
		pair[1] = htole64(le64toh(pair[1]) + 1);
}

/*
 * CTL fe_done callback, also called directly when ctl_run() fails.
 *
 * For a command I/O (capsule present) this updates the controller's
 * host-information counters, sends the completion unless success was
 * already sent along with the data, and frees the capsule.  An I/O
 * without a capsule is the completion of nvmft_terminate_commands().
 * In both cases the CTL I/O is freed and the controller's
 * pending-command and busy-time accounting is updated.
 */
static void
nvmft_done(union ctl_io *io)
{
	struct nvmft_controller *ctrlr;
	const struct nvme_command *cmd;
	struct nvmft_qpair *qp;
	struct nvmf_capsule *nc;
	size_t len;

	KASSERT(io->io_hdr.status == CTL_SUCCESS ||
	    io->io_hdr.status == CTL_NVME_ERROR,
	    ("%s: bad status %u", __func__, io->io_hdr.status));

	nc = NVMFT_NC(io);
	qp = NVMFT_QP(io);
	ctrlr = nvmft_qpair_ctrlr(qp);

	if (nc == NULL) {
		/* Completion of nvmft_terminate_commands. */
		goto end;
	}

	cmd = nvmf_capsule_sqe(nc);

	/*
	 * 'len' counts 512-byte units of capsule data; failed commands
	 * contribute nothing.  The data unit counters advance once per
	 * 1000 such units, with the remainder carried between commands
	 * in partial_duw / partial_dur.
	 */
	if (io->io_hdr.status == CTL_SUCCESS)
		len = nvmf_capsule_data_len(nc) / 512;
	else
		len = 0;
	switch (cmd->opc) {
	case NVME_OPC_WRITE:
		mtx_lock(&ctrlr->lock);
		hip_add(ctrlr->hip.host_write_commands, 1);
		len += ctrlr->partial_duw;
		/*
		 * Use >= so that reaching exactly 1000 units is counted;
		 * otherwise the modulo below would discard that unit.
		 */
		if (len >= 1000)
			hip_add(ctrlr->hip.data_units_written, len / 1000);
		ctrlr->partial_duw = len % 1000;
		mtx_unlock(&ctrlr->lock);
		break;
	case NVME_OPC_READ:
	case NVME_OPC_COMPARE:
	case NVME_OPC_VERIFY:
		mtx_lock(&ctrlr->lock);
		/* VERIFY adds to data units read but not to read commands. */
		if (cmd->opc != NVME_OPC_VERIFY)
			hip_add(ctrlr->hip.host_read_commands, 1);
		len += ctrlr->partial_dur;
		/* Same boundary rule as for writes. */
		if (len >= 1000)
			hip_add(ctrlr->hip.data_units_read, len / 1000);
		ctrlr->partial_dur = len % 1000;
		mtx_unlock(&ctrlr->lock);
		break;
	}

	if (io->nvmeio.success_sent) {
		/* The completion already went out with the data. */
		MPASS(io->io_hdr.status == CTL_SUCCESS);
	} else {
		io->nvmeio.cpl.cid = cmd->cid;
		nvmft_send_response(qp, &io->nvmeio.cpl);
	}
	nvmf_free_capsule(nc);
end:
	ctl_free_io(io);
	mtx_lock(&ctrlr->lock);
	ctrlr->pending_commands--;
	/* Close out the busy interval when the last command finishes. */
	if (ctrlr->pending_commands == 0)
		ctrlr->busy_total += sbinuptime() - ctrlr->start_busy;
	mtx_unlock(&ctrlr->lock);
}

/*
 * Frontend init hook: create the task queue and start one thread per
 * CPU inside the CTL process, then initialize the port list and its
 * lock.
 *
 * Returns 0 on success.  If the threads cannot be started the task
 * queue is freed and that error is returned.
 */
static int
nvmft_init(void)
{
	int error;

	nvmft_taskq = taskqueue_create("nvmft", M_WAITOK,
	    taskqueue_thread_enqueue, &nvmft_taskq);
	error = taskqueue_start_threads_in_proc(&nvmft_taskq, mp_ncpus, PWAIT,
	    control_softc->ctl_proc, "nvmft");
	if (error == 0) {
		TAILQ_INIT(&nvmft_ports);
		sx_init(&nvmft_ports_lock, "nvmft ports");
	} else {
		taskqueue_free(nvmft_taskq);
	}
	return (error);
}

/*
 * Destroy a port that has no controllers left: deregister it from CTL
 * if it holds a target port number, then release the namespace array,
 * the controller ID allocator, the lock, and the port itself.
 */
void
nvmft_port_free(struct nvmft_port *np)
{
	KASSERT(TAILQ_EMPTY(&np->controllers),
	    ("%s(%p): active controllers", __func__, np));

	/* targ_port stays at -1 if the port was never registered. */
	if (np->port.targ_port != -1 &&
	    ctl_port_deregister(&np->port) != 0)
		printf("%s: ctl_port_deregister() failed\n", __func__);

	free(np->active_ns, M_NVMFT);
	clean_unrhdr(np->ids);
	delete_unrhdr(np->ids);
	mtx_destroy(&np->lock);
	free(np, M_NVMFT);
}

/*
 * Find the port whose subsystem NQN equals 'subnqn'.  The caller must
 * hold nvmft_ports_lock.  Returns NULL if there is no such port.
 */
static struct nvmft_port *
nvmft_port_find(const char *subnqn)
{
	struct nvmft_port *cur;

	KASSERT(nvmf_nqn_valid(subnqn), ("%s: invalid nqn", __func__));

	sx_assert(&nvmft_ports_lock, SA_LOCKED);
	TAILQ_FOREACH(cur, &nvmft_ports, link) {
		if (strcmp(cur->cdata.subnqn, subnqn) == 0)
			return (cur);
	}
	return (NULL);
}

/*
 * Find the port whose CTL target port number equals 'port_id'.  The
 * caller must hold nvmft_ports_lock.  Returns NULL if there is no such
 * port.
 */
static struct nvmft_port *
nvmft_port_find_by_id(int port_id)
{
	struct nvmft_port *cur;

	sx_assert(&nvmft_ports_lock, SA_LOCKED);
	TAILQ_FOREACH(cur, &nvmft_ports, link) {
		if (cur->port.targ_port == port_id)
			return (cur);
	}
	return (NULL);
}

/*
 * Fetch a number that is stored as a string in an nvlist.
 *
 * If 'name' is absent, *value is set to default_value and true is
 * returned.  Otherwise the string is parsed with strtoul() using base
 * 0; false is returned for an empty string (leaving *value untouched)
 * or when characters remain after the number.
 */
static bool
dnvlist_get_strnum(nvlist_t *nvl, const char *name, u_long default_value,
	u_long *value)
{
	const char *text;
	char *end;

	text = dnvlist_get_string(nvl, name, NULL);
	if (text == NULL) {
		*value = default_value;
		return (true);
	}
	if (text[0] == '\0')
		return (false);

	*value = strtoul(text, &end, 0);
	return (*end == '\0');
}

/*
 * Handle a CTL_REQ_CREATE request: validate the arguments, allocate a
 * port, and register it with CTL.  The outcome is reported through
 * req->status and req->error_str; on success req->result_nvl carries
 * the new CTL "port_id".
 *
 * NVMeoF ports support the following parameters:
 *
 * Mandatory:
 *
 * subnqn: subsystem NVMe Qualified Name
 * portid: integer port ID from Discovery Log Page entry
 *
 * Optional:
 * serial: Serial Number string
 * max_io_qsize: Maximum number of I/O queue entries
 * enable_timeout: Timeout for controller enable in milliseconds
 * ioccsz: Maximum command capsule size
 * iorcsz: Maximum response capsule size
 * nn: Number of namespaces
 */
static void
nvmft_port_create(struct ctl_req *req)
{
	struct nvmft_port *np;
	struct ctl_port *port;
	const char *serial, *subnqn;
	char serial_buf[NVME_SERIAL_NUMBER_LENGTH];
	u_long enable_timeout, hostid, ioccsz, iorcsz, max_io_qsize, nn, portid;
	int error;

	/* Required parameters. */
	subnqn = dnvlist_get_string(req->args_nvl, "subnqn", NULL);
	if (subnqn == NULL || !nvlist_exists_string(req->args_nvl, "portid")) {
		req->status = CTL_LUN_ERROR;
		snprintf(req->error_str, sizeof(req->error_str),
		    "Missing required argument");
		return;
	}
	if (!nvmf_nqn_valid(subnqn)) {
		req->status = CTL_LUN_ERROR;
		snprintf(req->error_str, sizeof(req->error_str),
		    "Invalid SubNQN");
		return;
	}
	if (!dnvlist_get_strnum(req->args_nvl, "portid", UINT16_MAX, &portid) ||
	    portid > UINT16_MAX) {
		req->status = CTL_LUN_ERROR;
		snprintf(req->error_str, sizeof(req->error_str),
		    "Invalid port ID");
		return;
	}

	/* Optional parameters. */
	if (!dnvlist_get_strnum(req->args_nvl, "max_io_qsize",
	    NVMF_MAX_IO_ENTRIES, &max_io_qsize) ||
	    max_io_qsize < NVME_MIN_IO_ENTRIES ||
	    max_io_qsize > NVME_MAX_IO_ENTRIES) {
		req->status = CTL_LUN_ERROR;
		snprintf(req->error_str, sizeof(req->error_str),
		    "Invalid maximum I/O queue size");
		return;
	}

	/*
	 * The timeout must be a multiple of 500 whose quotient is at most
	 * 255; the quotient is what is passed to _nvmf_controller_cap()
	 * below.
	 */
	if (!dnvlist_get_strnum(req->args_nvl, "enable_timeout",
	    NVMF_CC_EN_TIMEOUT * 500, &enable_timeout) ||
	    (enable_timeout % 500) != 0 || (enable_timeout / 500) > 255) {
		req->status = CTL_LUN_ERROR;
		snprintf(req->error_str, sizeof(req->error_str),
		    "Invalid enable timeout");
		return;
	}

	/* Capsule sizes must hold at least one entry and be 16-byte multiples. */
	if (!dnvlist_get_strnum(req->args_nvl, "ioccsz", NVMF_IOCCSZ,
	    &ioccsz) || ioccsz < sizeof(struct nvme_command) ||
	    (ioccsz % 16) != 0) {
		req->status = CTL_LUN_ERROR;
		snprintf(req->error_str, sizeof(req->error_str),
		    "Invalid Command Capsule size");
		return;
	}

	if (!dnvlist_get_strnum(req->args_nvl, "iorcsz", NVMF_IORCSZ,
	    &iorcsz) || iorcsz < sizeof(struct nvme_completion) ||
	    (iorcsz % 16) != 0) {
		req->status = CTL_LUN_ERROR;
		snprintf(req->error_str, sizeof(req->error_str),
		    "Invalid Response Capsule size");
		return;
	}

	if (!dnvlist_get_strnum(req->args_nvl, "nn", NVMF_NN, &nn) ||
	    nn < 1 || nn > UINT32_MAX) {
		req->status = CTL_LUN_ERROR;
		snprintf(req->error_str, sizeof(req->error_str),
		    "Invalid number of namespaces");
		return;
	}

	/* Without an explicit serial, derive one from the caller's host ID. */
	serial = dnvlist_get_string(req->args_nvl, "serial", NULL);
	if (serial == NULL) {
		getcredhostid(curthread->td_ucred, &hostid);
		nvmf_controller_serial(serial_buf, sizeof(serial_buf), hostid);
		serial = serial_buf;
	}

	/*
	 * Hold the ports lock exclusively from the duplicate check until
	 * the new port is on the list.
	 */
	sx_xlock(&nvmft_ports_lock);

	np = nvmft_port_find(subnqn);
	if (np != NULL) {
		req->status = CTL_LUN_ERROR;
		snprintf(req->error_str, sizeof(req->error_str),
		    "SubNQN \"%s\" already exists", subnqn);
		sx_xunlock(&nvmft_ports_lock);
		return;
	}

	np = malloc(sizeof(*np), M_NVMFT, M_WAITOK | M_ZERO);
	refcount_init(&np->refs, 1);
	np->portid = portid;
	np->max_io_qsize = max_io_qsize;
	np->cap = _nvmf_controller_cap(max_io_qsize, enable_timeout / 500);
	mtx_init(&np->lock, "nvmft port", NULL, MTX_DEF);
	np->ids = new_unrhdr(0, MIN(CTL_MAX_INIT_PER_PORT - 1,
	    NVMF_CNTLID_STATIC_MAX), UNR_NO_MTX);
	TAILQ_INIT(&np->controllers);

	/* The controller ID is set later for individual controllers. */
	_nvmf_init_io_controller_data(0, max_io_qsize, serial, ostype,
	    osrelease, subnqn, nn, ioccsz, iorcsz, &np->cdata);
	np->cdata.aerl = NVMFT_NUM_AER - 1;
	np->cdata.oaes = htole32(NVME_ASYNC_EVENT_NS_ATTRIBUTE);
	np->cdata.oncs = htole16(NVMEF(NVME_CTRLR_DATA_ONCS_VERIFY, 1) |
	    NVMEF(NVME_CTRLR_DATA_ONCS_WRZERO, 1) |
	    NVMEF(NVME_CTRLR_DATA_ONCS_DSM, 1) |
	    NVMEF(NVME_CTRLR_DATA_ONCS_COMPARE, 1));
	np->cdata.fuses = NVMEF(NVME_CTRLR_DATA_FUSES_CNW, 1);

	/* Firmware page: slot 1 active, revision copied from cdata.fr. */
	np->fp.afi = NVMEF(NVME_FIRMWARE_PAGE_AFI_SLOT, 1);
	memcpy(np->fp.revision[0], np->cdata.fr, sizeof(np->cdata.fr));

	port = &np->port;

	port->frontend = &nvmft_frontend;
	port->port_type = CTL_PORT_NVMF;
	port->num_requested_ctl_io = max_io_qsize;
	port->port_name = "nvmf";
	port->physical_port = portid;
	port->virtual_port = 0;
	port->port_online = nvmft_online;
	port->port_offline = nvmft_offline;
	port->port_info = nvmft_info;
	port->onoff_arg = np;
	port->lun_enable = nvmft_lun_enable;
	port->lun_disable = nvmft_lun_disable;
	port->targ_lun_arg = np;
	port->fe_datamove = nvmft_datamove;
	port->fe_done = nvmft_done;
	/* -1 tells nvmft_port_free() that there is nothing to deregister. */
	port->targ_port = -1;
	port->options = nvlist_clone(req->args_nvl);

	error = ctl_port_register(port);
	if (error != 0) {
		sx_xunlock(&nvmft_ports_lock);
		nvlist_destroy(port->options);
		nvmft_port_rele(np);
		req->status = CTL_LUN_ERROR;
		snprintf(req->error_str, sizeof(req->error_str),
		    "Failed to register CTL port with error %d", error);
		return;
	}

	TAILQ_INSERT_TAIL(&nvmft_ports, np, link);
	sx_xunlock(&nvmft_ports_lock);

	req->status = CTL_LUN_OK;
	req->result_nvl = nvlist_create(0);
	nvlist_add_number(req->result_nvl, "port_id", port->targ_port);
}

static void
nvmft_port_remove(struct ctl_req *req)
{
	struct nvmft_port *np;
	const char *subnqn;
	u_long port_id;

	/*
	 * ctladm port -r just provides the port_id, so permit looking
	 * up a port either by "subnqn" or "port_id".
	 */
	port_id = ULONG_MAX;
	subnqn = dnvlist_get_string(req->args_nvl, "subnqn", NULL);
	if (subnqn == NULL) {
		if (!nvlist_exists_string(req->args_nvl, "port_id")) {
			req->status = CTL_LUN_ERROR;
			snprintf(req->error_str, sizeof(req->error_str),
			    "Missing required argument");
			return;
		}
		if (!dnvlist_get_strnum(req->args_nvl, "port_id", ULONG_MAX,
		    &port_id)) {
			req->status = CTL_LUN_ERROR;
			snprintf(req->error_str, sizeof(req->error_str),
			    "Invalid CTL port ID");
			return;
		}
	} else {
		if (nvlist_exists_string(req->args_nvl, "port_id")) {
			req->status = CTL_LUN_ERROR;
			snprintf(req->error_str, sizeof(req->error_str),
			    "Ambiguous port removal request");
			return;
		}
	}

	sx_xlock(&nvmft_ports_lock);

	if (subnqn != NULL) {
		np = nvmft_port_find(subnqn);
		if (np == NULL) {
			req->status = CTL_LUN_ERROR;
			snprintf(req->error_str, sizeof(req->error_str),
			    "SubNQN \"%s\" does not exist", subnqn);
			sx_xunlock(&nvmft_ports_lock);
			return;
		}
	} else {
		np = nvmft_port_find_by_id(port_id);
		if (np == NULL) {
			req->status = CTL_LUN_ERROR;
			snprintf(req->error_str, sizeof(req->error_str),
			    "CTL port %lu is not a NVMF port", port_id);
			sx_xunlock(&nvmft_ports_lock);
			return;
		}
	}

	TAILQ_REMOVE(&nvmft_ports, np, link);
	sx_xunlock(&nvmft_ports_lock);

	mtx_lock(&np->lock);
	if (np->online) {
		mtx_unlock(&np->lock);
		ctl_port_offline(&np->port);
	} else
		mtx_unlock(&np->lock);

	nvmft_port_rele(np);
	req->status = CTL_LUN_OK;
}

/*
 * Handle CTL_NVMF_HANDOFF: unpack the nvlist describing a connected
 * queue pair, validate it, find the port named by the CONNECT data's
 * SubNQN, and hand the queue to the admin or I/O handoff routine
 * according to the "admin" parameter.  The outcome is reported through
 * cn->status and cn->error_str.
 */
static void
nvmft_handoff(struct ctl_nvmf *cn)
{
	const struct nvmf_fabric_connect_cmd *cmd;
	const struct nvmf_fabric_connect_data *data;
	const nvlist_t *params;
	struct nvmft_port *np;
	nvlist_t *nvl;
	size_t len;
	enum nvmf_trtype trtype;
	int error;

	np = NULL;
	error = nvmf_unpack_ioc_nvlist(&cn->data.handoff, &nvl);
	if (error != 0) {
		cn->status = CTL_NVMF_ERROR;
		snprintf(cn->error_str, sizeof(cn->error_str),
		    "Failed to copyin and unpack handoff arguments");
		return;
	}

	/* All four top-level keys must be present with the expected types. */
	if (!nvlist_exists_number(nvl, "trtype") ||
	    !nvlist_exists_nvlist(nvl, "params") ||
	    !nvlist_exists_binary(nvl, "cmd") ||
	    !nvlist_exists_binary(nvl, "data")) {
		cn->status = CTL_NVMF_ERROR;
		snprintf(cn->error_str, sizeof(cn->error_str),
		    "Handoff arguments missing required value");
		goto out;
	}

	params = nvlist_get_nvlist(nvl, "params");
	if (!nvmf_validate_qpair_nvlist(params, true)) {
		cn->status = CTL_NVMF_ERROR;
		snprintf(cn->error_str, sizeof(cn->error_str),
		    "Invalid queue pair parameters");
		goto out;
	}

	/* The binary blobs must be exactly the size of their structures. */
	cmd = nvlist_get_binary(nvl, "cmd", &len);
	if (len != sizeof(*cmd)) {
		cn->status = CTL_NVMF_ERROR;
		snprintf(cn->error_str, sizeof(cn->error_str),
		    "Wrong size for CONNECT SQE");
		goto out;
	}

	data = nvlist_get_binary(nvl, "data", &len);
	if (len != sizeof(*data)) {
		cn->status = CTL_NVMF_ERROR;
		snprintf(cn->error_str, sizeof(cn->error_str),
		    "Wrong size for CONNECT data");
		goto out;
	}

	if (!nvmf_nqn_valid(data->subnqn)) {
		cn->status = CTL_NVMF_ERROR;
		snprintf(cn->error_str, sizeof(cn->error_str),
		    "Invalid SubNQN");
		goto out;
	}

	/*
	 * Look the port up and take a reference while the ports lock is
	 * held, so that the port can be used after the lock is dropped.
	 */
	sx_slock(&nvmft_ports_lock);
	np = nvmft_port_find(data->subnqn);
	if (np == NULL) {
		sx_sunlock(&nvmft_ports_lock);
		cn->status = CTL_NVMF_ERROR;
		snprintf(cn->error_str, sizeof(cn->error_str),
		    "Unknown SubNQN");
		goto out;
	}
	if (!np->online) {
		sx_sunlock(&nvmft_ports_lock);
		cn->status = CTL_NVMF_ERROR;
		snprintf(cn->error_str, sizeof(cn->error_str),
		    "CTL port offline");
		/* No reference was taken, so keep 'out' from releasing one. */
		np = NULL;
		goto out;
	}
	nvmft_port_ref(np);
	sx_sunlock(&nvmft_ports_lock);

	trtype = nvlist_get_number(nvl, "trtype");
	if (nvlist_get_bool(params, "admin")) {
		error = nvmft_handoff_admin_queue(np, trtype, params, cmd,
		    data);
		if (error != 0) {
			cn->status = CTL_NVMF_ERROR;
			snprintf(cn->error_str, sizeof(cn->error_str),
			    "Failed to handoff admin queue: %d", error);
			goto out;
		}
	} else {
		error = nvmft_handoff_io_queue(np, trtype, params, cmd, data);
		if (error != 0) {
			cn->status = CTL_NVMF_ERROR;
			snprintf(cn->error_str, sizeof(cn->error_str),
			    "Failed to handoff I/O queue: %d", error);
			goto out;
		}
	}

	cn->status = CTL_NVMF_OK;
out:
	/* Drop the reference taken above (if any) and the unpacked nvlist. */
	if (np != NULL)
		nvmft_port_rele(np);
	nvlist_destroy(nvl);
}

static void
nvmft_list(struct ctl_nvmf *cn)
{
	struct ctl_nvmf_list_params *lp;
	struct nvmft_controller *ctrlr;
	struct nvmft_port *np;
	struct sbuf *sb;
	int error;

	lp = &cn->data.list;

	sb = sbuf_new(NULL, NULL, lp->alloc_len, SBUF_FIXEDLEN |
	    SBUF_INCLUDENUL);
	if (sb == NULL) {
		cn->status = CTL_NVMF_ERROR;
		snprintf(cn->error_str, sizeof(cn->error_str),
		    "Failed to allocate NVMeoF session list");
		return;
	}

	sbuf_printf(sb, "<ctlnvmflist>\n");
	sx_slock(&nvmft_ports_lock);
	TAILQ_FOREACH(np, &nvmft_ports, link) {
		mtx_lock(&np->lock);
		TAILQ_FOREACH(ctrlr, &np->controllers, link) {
			sbuf_printf(sb, "<connection id=\"%d\">"
			    "<hostnqn>%s</hostnqn>"
			    "<subnqn>%s</subnqn>"
			    "<trtype>%u</trtype>"
			    "</connection>\n",
			    ctrlr->cntlid,
			    ctrlr->hostnqn,
			    np->cdata.subnqn,
			    ctrlr->trtype);
		}
		mtx_unlock(&np->lock);
	}
	sx_sunlock(&nvmft_ports_lock);
	sbuf_printf(sb, "</ctlnvmflist>\n");
	if (sbuf_finish(sb) != 0) {
		sbuf_delete(sb);
		cn->status = CTL_NVMF_LIST_NEED_MORE_SPACE;
		snprintf(cn->error_str, sizeof(cn->error_str),
		    "Out of space, %d bytes is too small", lp->alloc_len);
		return;
	}

	error = copyout(sbuf_data(sb), lp->conn_xml, sbuf_len(sb));
	if (error != 0) {
		sbuf_delete(sb);
		cn->status = CTL_NVMF_ERROR;
		snprintf(cn->error_str, sizeof(cn->error_str),
		    "Failed to copyout session list: %d", error);
		return;
	}
	lp->fill_len = sbuf_len(sb);
	cn->status = CTL_NVMF_OK;
	sbuf_delete(sb);
}

/*
 * Handle CTL_NVMF_TERMINATE: report ECONNABORTED to every controller
 * selected by the request.
 *
 * Selection precedence: 'all' if non-zero, else 'cntlid' if it is not
 * -1, else 'hostnqn' if it is non-empty; otherwise nothing matches.
 * cn->status is CTL_NVMF_ASSOCIATION_NOT_FOUND when no controller
 * matched and CTL_NVMF_OK otherwise.
 */
static void
nvmft_terminate(struct ctl_nvmf *cn)
{
	struct ctl_nvmf_terminate_params *tp = &cn->data.terminate;
	struct nvmft_controller *ctrlr;
	struct nvmft_port *np;
	bool found, match;

	found = false;
	sx_slock(&nvmft_ports_lock);
	TAILQ_FOREACH(np, &nvmft_ports, link) {
		mtx_lock(&np->lock);
		TAILQ_FOREACH(ctrlr, &np->controllers, link) {
			if (tp->all != 0) {
				match = true;
			} else if (tp->cntlid != -1) {
				match = tp->cntlid == ctrlr->cntlid;
			} else if (tp->hostnqn[0] != '\0') {
				match = strncmp(tp->hostnqn, ctrlr->hostnqn,
				    sizeof(tp->hostnqn)) == 0;
			} else {
				match = false;
			}

			if (match) {
				nvmft_printf(ctrlr,
				    "disconnecting due to administrative request\n");
				nvmft_controller_error(ctrlr, NULL,
				    ECONNABORTED);
				found = true;
			}
		}
		mtx_unlock(&np->lock);
	}
	sx_sunlock(&nvmft_ports_lock);

	if (found) {
		cn->status = CTL_NVMF_OK;
		return;
	}
	cn->status = CTL_NVMF_ASSOCIATION_NOT_FOUND;
	snprintf(cn->error_str, sizeof(cn->error_str),
	    "No matching associations found");
}

/*
 * Frontend ioctl hook.
 *
 * CTL_PORT_REQ dispatches port create/remove requests and CTL_NVMF
 * dispatches handoff/list/terminate requests.  For both, failures are
 * reported in the request structure and the ioctl itself returns 0.
 * Any other command returns ENOTTY.
 */
static int
nvmft_ioctl(struct cdev *cdev, u_long cmd, caddr_t data, int flag,
    struct thread *td)
{
	struct ctl_nvmf *cn;
	struct ctl_req *req;

	if (cmd == CTL_PORT_REQ) {
		req = (struct ctl_req *)data;
		switch (req->reqtype) {
		case CTL_REQ_CREATE:
			nvmft_port_create(req);
			break;
		case CTL_REQ_REMOVE:
			nvmft_port_remove(req);
			break;
		default:
			req->status = CTL_LUN_ERROR;
			snprintf(req->error_str, sizeof(req->error_str),
			    "Unsupported request type %d", req->reqtype);
			break;
		}
		return (0);
	}

	if (cmd == CTL_NVMF) {
		cn = (struct ctl_nvmf *)data;
		switch (cn->type) {
		case CTL_NVMF_HANDOFF:
			nvmft_handoff(cn);
			break;
		case CTL_NVMF_LIST:
			nvmft_list(cn);
			break;
		case CTL_NVMF_TERMINATE:
			nvmft_terminate(cn);
			break;
		default:
			cn->status = CTL_NVMF_ERROR;
			snprintf(cn->error_str, sizeof(cn->error_str),
			    "Invalid NVMeoF request type %d", cn->type);
			break;
		}
		return (0);
	}

	return (ENOTTY);
}

/*
 * Frontend shutdown hook: refuse with EBUSY while any port exists;
 * otherwise free the task queue and destroy the ports lock.
 */
static int
nvmft_shutdown(void)
{
	/* TODO: Need to check for active controllers. */
	if (TAILQ_EMPTY(&nvmft_ports)) {
		taskqueue_free(nvmft_taskq);
		sx_destroy(&nvmft_ports_lock);
		return (0);
	}
	return (EBUSY);
}

/*
 * Declare the frontend to CTL under the name "nvmft" and record the
 * module's dependency on nvmf_transport (version 1).
 */
CTL_FRONTEND_DECLARE(nvmft, nvmft_frontend);
MODULE_DEPEND(nvmft, nvmf_transport, 1, 1, 1);
