/*-
 * SPDX-License-Identifier: BSD-2-Clause
 *
 * Copyright (c) 2011 NetApp, Inc.
 * All rights reserved.
 */

#include <sys/param.h>
#include <sys/kernel.h>
#include <sys/lock.h>
#include <sys/mutex.h>
#include <sys/proc.h>
#include <sys/sx.h>
#include <sys/sysctl.h>

#include <machine/smp.h>

#include <dev/vmm/vmm_vm.h>

/* Root node of the hw.vmm sysctl subtree. */
SYSCTL_NODE(_hw, OID_AUTO, vmm, CTLFLAG_RW | CTLFLAG_MPSAFE, NULL, NULL);

/*
 * IPI vector handed to ipi_cpu() when a vcpu running on another host CPU has
 * to be notified.  Exported read-only as hw.vmm.ipinum.
 */
int vmm_ipinum;
SYSCTL_INT(_hw_vmm, OID_AUTO, ipinum, CTLFLAG_RD, &vmm_ipinum, 0,
    "IPI vector used for vcpu notifications");

/*
 * Run the pending rendezvous callback on behalf of 'vcpu' if it is one of the
 * requested vcpus and has not run the callback yet.  The caller must hold the
 * VM's rendezvous mutex and a rendezvous must be pending.
 *
 * Returns true once the set of requested vcpus equals the set of vcpus that
 * are done; in that case the rendezvous is cleared and threads sleeping on
 * 'rendezvous_func' are woken.  Returns false otherwise.
 */
static bool
vm_rendezvous(struct vcpu *vcpu)
{
	struct vm *vm;
	int id;

	vm = vcpu->vm;
	id = vcpu->vcpuid;

	mtx_assert(&vm->rendezvous_mtx, MA_OWNED);
	KASSERT(vm->rendezvous_func != NULL,
	    ("vm_rendezvous: no rendezvous pending"));

	/* Keep 'rendezvous_req_cpus' a subset of 'active_cpus'. */
	CPU_AND(&vm->rendezvous_req_cpus, &vm->rendezvous_req_cpus,
	    &vm->active_cpus);

	/* The callback runs at most once per requested vcpu. */
	if (CPU_ISSET(id, &vm->rendezvous_req_cpus) &&
	    !CPU_ISSET(id, &vm->rendezvous_done_cpus)) {
		(*vm->rendezvous_func)(vcpu, vm->rendezvous_arg);
		CPU_SET(id, &vm->rendezvous_done_cpus);
	}

	/* At least one requested vcpu has not run the callback yet. */
	if (CPU_CMP(&vm->rendezvous_req_cpus, &vm->rendezvous_done_cpus) != 0)
		return (false);

	/* Everyone is done: tear the rendezvous down and wake any waiters. */
	CPU_ZERO(&vm->rendezvous_req_cpus);
	vm->rendezvous_func = NULL;
	wakeup(&vm->rendezvous_func);
	return (true);
}

int
vm_handle_rendezvous(struct vcpu *vcpu)
{
	struct vm *vm;
	struct thread *td;

	td = curthread;
	vm = vcpu->vm;

	mtx_lock(&vm->rendezvous_mtx);
	while (vm->rendezvous_func != NULL) {
		if (vm_rendezvous(vcpu))
			break;

		mtx_sleep(&vm->rendezvous_func, &vm->rendezvous_mtx, 0,
		    "vmrndv", hz);
		if (td_ast_pending(td, TDA_SUSPEND)) {
			int error;

			mtx_unlock(&vm->rendezvous_mtx);
			error = thread_check_susp(td, true);
			if (error != 0)
				return (error);
			mtx_lock(&vm->rendezvous_mtx);
		}
	}
	mtx_unlock(&vm->rendezvous_mtx);
	return (0);
}

/*
 * Request that a non-idle vcpu become idle, then sleep until its state is
 * signalled via wakeup(&vcpu->state) or hz ticks elapse.  The vcpu is not
 * guaranteed to be idle on return, so callers re-check vcpu->state in a loop.
 *
 * vcpu->mtx is passed to msleep_spin() as the interlock, so it must be held
 * by the caller.
 */
static void
vcpu_wait_idle(struct vcpu *vcpu)
{
	KASSERT(vcpu->state != VCPU_IDLE, ("vcpu already idle"));

	/* Record the request, then kick the vcpu so that it notices it. */
	vcpu->reqidle = 1;
	vcpu_notify_event_locked(vcpu);
	msleep_spin(&vcpu->state, &vcpu->mtx, "vmstat", hz);
}

/*
 * Move 'vcpu' to 'newstate'.  The vcpu lock must be held by the caller.
 *
 * When 'from_idle' is true the function first waits for the vcpu to reach
 * VCPU_IDLE; otherwise the vcpu must already be in a non-idle state.
 *
 * Returns 0 on success and EBUSY if the requested transition is not allowed.
 */
int
vcpu_set_state_locked(struct vcpu *vcpu, enum vcpu_state newstate,
    bool from_idle)
{
	vcpu_assert_locked(vcpu);

	/*
	 * State transitions from the vmmdev_ioctl() must always begin from
	 * the VCPU_IDLE state. This guarantees that there is only a single
	 * ioctl() operating on a vcpu at any point.
	 */
	if (!from_idle) {
		KASSERT(vcpu->state != VCPU_IDLE, ("invalid transition from "
		    "vcpu idle state"));
	} else {
		while (vcpu->state != VCPU_IDLE)
			vcpu_wait_idle(vcpu);
	}

	/* Only a running vcpu may have a host CPU recorded. */
	if (vcpu->state != VCPU_RUNNING) {
		KASSERT(vcpu->hostcpu == NOCPU, ("Invalid hostcpu %d for a "
		    "vcpu that is not running", vcpu->hostcpu));
	} else {
		KASSERT(vcpu->hostcpu == curcpu, ("curcpu %d and hostcpu %d "
		    "mismatch for running vcpu", curcpu, vcpu->hostcpu));
	}

	/*
	 * The following state transitions are allowed:
	 * IDLE -> FROZEN -> IDLE
	 * FROZEN -> RUNNING -> FROZEN
	 * FROZEN -> SLEEPING -> FROZEN
	 */
	switch (vcpu->state) {
	case VCPU_FROZEN:
		/* FROZEN may move to any state other than FROZEN itself. */
		if (newstate == VCPU_FROZEN)
			return (EBUSY);
		break;
	case VCPU_IDLE:
	case VCPU_RUNNING:
	case VCPU_SLEEPING:
		/* Every other state may only move to FROZEN. */
		if (newstate != VCPU_FROZEN)
			return (EBUSY);
		break;
	default:
		return (EBUSY);
	}

	vcpu->state = newstate;
	if (newstate != VCPU_RUNNING)
		vcpu->hostcpu = NOCPU;
	else
		vcpu->hostcpu = curcpu;

	/* Wake threads sleeping in vcpu_wait_idle(). */
	if (newstate == VCPU_IDLE)
		wakeup(&vcpu->state);

	return (0);
}

/*
 * Try to lock all of the vCPUs in the VM while taking care to avoid deadlocks
 * with vm_smp_rendezvous().
 *
 * Every existing vCPU is moved from VCPU_IDLE to 'newstate' (which must not be
 * VCPU_IDLE).  While doing so, any pending rendezvous callback is executed on
 * behalf of vCPUs this function has already taken over, so that a rendezvous
 * initiator is never left waiting on them.
 *
 * Returns 0 on success.  If a state transition fails, the vCPUs that were
 * already taken over are returned to VCPU_IDLE and the error is returned.
 *
 * The complexity here suggests that the rendezvous mechanism needs a rethink.
 */
int
vcpu_set_state_all(struct vm *vm, enum vcpu_state newstate)
{
	cpuset_t locked;
	struct vcpu *vcpu;
	int error, i;
	uint16_t maxcpus;

	KASSERT(newstate != VCPU_IDLE,
	    ("vcpu_set_state_all: invalid target state %d", newstate));

	error = 0;
	/* 'locked' tracks the vCPUs whose state this function has changed. */
	CPU_ZERO(&locked);
	maxcpus = vm->maxcpus;

	mtx_lock(&vm->rendezvous_mtx);
restart:
	if (vm->rendezvous_func != NULL) {
		/*
		 * If we have a pending rendezvous, then the initiator may be
		 * blocked waiting for other vCPUs to execute the callback.  The
		 * current thread may be a vCPU thread so we must not block
		 * waiting for the initiator, otherwise we get a deadlock.
		 * Thus, execute the callback on behalf of any idle vCPUs.
		 */
		for (i = 0; i < maxcpus; i++) {
			vcpu = vm_vcpu(vm, i);
			if (vcpu == NULL)
				continue;
			vcpu_lock(vcpu);
			if (vcpu->state == VCPU_IDLE) {
				(void)vcpu_set_state_locked(vcpu, VCPU_FROZEN,
				    true);
				CPU_SET(i, &locked);
			}
			if (CPU_ISSET(i, &locked)) {
				/*
				 * We can safely execute the callback on this
				 * vCPU's behalf.
				 */
				vcpu_unlock(vcpu);
				(void)vm_rendezvous(vcpu);
				vcpu_lock(vcpu);
			}
			vcpu_unlock(vcpu);
		}
	}

	/*
	 * Now wait for remaining vCPUs to become idle.  This may include the
	 * initiator of a rendezvous that is currently blocked on the rendezvous
	 * mutex.
	 */
	CPU_FOREACH_ISCLR(i, &locked) {
		if (i >= maxcpus)
			break;
		vcpu = vm_vcpu(vm, i);
		if (vcpu == NULL)
			continue;
		vcpu_lock(vcpu);
		while (vcpu->state != VCPU_IDLE) {
			/*
			 * Release the rendezvous mutex while sleeping so that
			 * a rendezvous can make progress; if one is pending
			 * when we wake up, go back and help it along first.
			 */
			mtx_unlock(&vm->rendezvous_mtx);
			vcpu_wait_idle(vcpu);
			vcpu_unlock(vcpu);
			mtx_lock(&vm->rendezvous_mtx);
			if (vm->rendezvous_func != NULL)
				goto restart;
			vcpu_lock(vcpu);
		}
		error = vcpu_set_state_locked(vcpu, newstate, true);
		vcpu_unlock(vcpu);
		if (error != 0) {
			/*
			 * Roll back state changes.  Only the vCPUs recorded in
			 * 'locked' were modified; the vCPU whose transition
			 * just failed is still idle and must not be touched.
			 */
			CPU_FOREACH_ISSET(i, &locked)
				(void)vcpu_set_state(vm_vcpu(vm, i), VCPU_IDLE,
				    false);
			break;
		}
		CPU_SET(i, &locked);
	}
	mtx_unlock(&vm->rendezvous_mtx);
	return (error);
}


/*
 * Locking wrapper around vcpu_set_state_locked(): takes the vcpu lock,
 * performs the state transition and releases the lock again.
 *
 * Returns whatever vcpu_set_state_locked() returns (0 or EBUSY).
 */
int
vcpu_set_state(struct vcpu *vcpu, enum vcpu_state newstate, bool from_idle)
{
	int error;

	vcpu_lock(vcpu);
	error = vcpu_set_state_locked(vcpu, newstate, from_idle);
	vcpu_unlock(vcpu);

	return (error);
}

enum vcpu_state
vcpu_get_state(struct vcpu *vcpu, int *hostcpu)
{
	enum vcpu_state state;

	vcpu_lock(vcpu);
	state = vcpu->state;
	if (hostcpu != NULL)
		*hostcpu = vcpu->hostcpu;
	vcpu_unlock(vcpu);

	return (state);
}

/*
 * Make sure a vcpu notices a pending event as soon as possible.  The caller
 * holds the vcpu lock.
 * - A sleeping vcpu thread is woken up.
 * - A vcpu running on a different host CPU is sent an IPI so that it traps
 *   into the hypervisor.
 * - In every other case nothing needs to be done here.
 */
void
vcpu_notify_event_locked(struct vcpu *vcpu)
{
	int cpu;

	cpu = vcpu->hostcpu;
	if (vcpu->state != VCPU_RUNNING) {
		KASSERT(cpu == NOCPU, ("vcpu state %d not consistent "
		    "with hostcpu %d", vcpu->state, cpu));
		if (vcpu->state == VCPU_SLEEPING)
			wakeup_one(vcpu);
		return;
	}

	KASSERT(cpu != NOCPU, ("vcpu running on invalid hostcpu"));

	/*
	 * If the 'vcpu' is running on 'curcpu' then it must be sending a
	 * notification to itself (e.g. SELF_IPI).  The pending event will be
	 * picked up when the vcpu transitions back to guest context, so an
	 * IPI is only needed for a remote host CPU.
	 */
	if (cpu != curcpu)
		ipi_cpu(cpu, vmm_ipinum);
}

/*
 * Locking wrapper around vcpu_notify_event_locked(): notify 'vcpu' of a
 * pending event while holding its vcpu lock.
 */
void
vcpu_notify_event(struct vcpu *vcpu)
{
	vcpu_lock(vcpu);
	vcpu_notify_event_locked(vcpu);
	vcpu_unlock(vcpu);
}

/*
 * Return non-zero if 'vcpu' is a member of its VM's 'debug_cpus' set (see
 * vm_suspend_cpu() and vm_resume_cpu()), zero otherwise.
 */
int
vcpu_debugged(struct vcpu *vcpu)
{
	return (CPU_ISSET(vcpu->vcpuid, &vcpu->vm->debug_cpus));
}

/*
 * Acquire the VM's 'vcpus_init_lock' in exclusive mode.  Paired with
 * vm_unlock_vcpus().
 */
void
vm_lock_vcpus(struct vm *vm)
{
	sx_xlock(&vm->vcpus_init_lock);
}

/*
 * Release the VM's 'vcpus_init_lock'.  sx_unlock() is the generic release
 * that works for both shared and exclusive holds, so this also undoes the
 * exclusive acquisition made by vm_lock_vcpus().
 */
void
vm_unlock_vcpus(struct vm *vm)
{
	sx_unlock(&vm->vcpus_init_lock);
}

/*
 * Mark the VM as dying.  The flag is set while holding 'vcpus_init_lock'
 * exclusively.
 *
 * NOTE(review): presumably the vcpu creation path checks 'dying' under the
 * same lock and refuses to create new vcpus; that code is not visible here.
 */
void
vm_disable_vcpu_creation(struct vm *vm)
{
	sx_xlock(&vm->vcpus_init_lock);
	vm->dying = true;
	sx_xunlock(&vm->vcpus_init_lock);
}

/* Return the VM's 'maxcpus' value. */
uint16_t
vm_get_maxcpus(struct vm *vm)
{
	return (vm->maxcpus);
}

/*
 * Copy the VM's CPU topology (sockets, cores, threads) and its maxcpus value
 * into the caller-supplied locations.  All four output pointers are
 * dereferenced unconditionally and therefore must not be NULL.
 */
void
vm_get_topology(struct vm *vm, uint16_t *sockets, uint16_t *cores,
    uint16_t *threads, uint16_t *maxcpus)
{
	*sockets = vm->sockets;
	*cores = vm->cores;
	*threads = vm->threads;
	*maxcpus = vm->maxcpus;
}

/*
 * Set the VM's CPU topology.
 *
 * The product sockets * cores * threads must not exceed the VM's maxcpus;
 * otherwise EINVAL is returned and the topology is left unchanged.  The
 * 'maxcpus' argument is ignored.
 *
 * Returns 0 on success.
 */
int
vm_set_topology(struct vm *vm, uint16_t sockets, uint16_t cores,
    uint16_t threads, uint16_t maxcpus __unused)
{
	uint64_t ncpus;

	/*
	 * Compute the product in 64 bits.  The uint16_t operands would
	 * otherwise be promoted to int, where the product of three 16-bit
	 * values can overflow (undefined behaviour) and, in practice, wrap to
	 * a small value that slips past the limit check below.
	 */
	ncpus = (uint64_t)sockets * cores * threads;

	/* Ignore maxcpus. */
	if (ncpus > vm->maxcpus)
		return (EINVAL);
	vm->sockets = sockets;
	vm->cores = cores;
	vm->threads = threads;
	return (0);
}

/*
 * Request suspension of the VM for the reason 'how'.
 *
 * Returns EINVAL if 'how' is not strictly between VM_SUSPEND_NONE and
 * VM_SUSPEND_LAST, EALREADY if a suspend reason has already been recorded,
 * and 0 otherwise.
 */
int
vm_suspend(struct vm *vm, enum vm_suspend_how how)
{
	if (how <= VM_SUSPEND_NONE || how >= VM_SUSPEND_LAST)
		return (EINVAL);

	/* Only the request that changes 'suspend' from 0 proceeds. */
	if (!atomic_cmpset_int(&vm->suspend, 0, how))
		return (EALREADY);

	/*
	 * Notify all active vcpus that they are now suspended.
	 */
	for (int cpu = 0; cpu < vm->maxcpus; cpu++) {
		if (!CPU_ISSET(cpu, &vm->active_cpus))
			continue;
		vcpu_notify_event(vm_vcpu(vm, cpu));
	}

	return (0);
}

/*
 * Reset the virtual machine via vm_reset().
 *
 * This is permitted only when the set of suspended vcpus equals the set of
 * active vcpus; otherwise EBUSY is returned and nothing is reset.  Returns 0
 * on success.
 */
int
vm_reinit(struct vm *vm)
{
	/* Refuse while any active vcpu has not been suspended. */
	if (CPU_CMP(&vm->suspended_cpus, &vm->active_cpus) != 0)
		return (EBUSY);

	vm_reset(vm);
	return (0);
}

/*
 * Add 'vcpu' to its VM's set of active vcpus.
 *
 * Returns EBUSY if the vcpu is already a member of the set, 0 otherwise.
 */
int
vm_activate_cpu(struct vcpu *vcpu)
{
	struct vm *vm;
	int id;

	vm = vcpu->vm;
	id = vcpu->vcpuid;

	if (!CPU_ISSET(id, &vm->active_cpus)) {
		CPU_SET_ATOMIC(id, &vm->active_cpus);
		return (0);
	}
	return (EBUSY);
}

/*
 * Add vcpus to the VM's 'debug_cpus' set and notify them.
 *
 * With a non-NULL 'vcpu' only that vcpu is added; EINVAL is returned if it is
 * not active.  With a NULL 'vcpu' the set becomes a copy of 'active_cpus' and
 * every active vcpu is notified.  Returns 0 on success.
 */
int
vm_suspend_cpu(struct vm *vm, struct vcpu *vcpu)
{
	int cpu;

	/* Single-vcpu request. */
	if (vcpu != NULL) {
		if (!CPU_ISSET(vcpu->vcpuid, &vm->active_cpus))
			return (EINVAL);

		CPU_SET_ATOMIC(vcpu->vcpuid, &vm->debug_cpus);
		vcpu_notify_event(vcpu);
		return (0);
	}

	/* Request covering every active vcpu. */
	vm->debug_cpus = vm->active_cpus;
	for (cpu = 0; cpu < vm->maxcpus; cpu++) {
		if (CPU_ISSET(cpu, &vm->active_cpus))
			vcpu_notify_event(vm_vcpu(vm, cpu));
	}
	return (0);
}

/*
 * Remove vcpus from the VM's 'debug_cpus' set.
 *
 * With a NULL 'vcpu' the whole set is cleared.  Otherwise only that vcpu is
 * removed, and EINVAL is returned if it was not in the set.  Returns 0 on
 * success.
 */
int
vm_resume_cpu(struct vm *vm, struct vcpu *vcpu)
{
	if (vcpu == NULL) {
		CPU_ZERO(&vm->debug_cpus);
		return (0);
	}

	if (!CPU_ISSET(vcpu->vcpuid, &vm->debug_cpus))
		return (EINVAL);

	CPU_CLR_ATOMIC(vcpu->vcpuid, &vm->debug_cpus);
	return (0);
}

/* Return a by-value copy of the VM's 'active_cpus' set. */
cpuset_t
vm_active_cpus(struct vm *vm)
{
	return (vm->active_cpus);
}

/* Return a by-value copy of the VM's 'debug_cpus' set. */
cpuset_t
vm_debug_cpus(struct vm *vm)
{
	return (vm->debug_cpus);
}

/* Return a by-value copy of the VM's 'suspended_cpus' set. */
cpuset_t
vm_suspended_cpus(struct vm *vm)
{
	return (vm->suspended_cpus);
}