Diffstat (limited to 'sys/amd64/vmm')
-rw-r--r-- sys/amd64/vmm/vmm.c | 177
1 file changed, 148 insertions(+), 29 deletions(-)
diff --git a/sys/amd64/vmm/vmm.c b/sys/amd64/vmm/vmm.c
index 2ac076551165..f7c59847140b 100644
--- a/sys/amd64/vmm/vmm.c
+++ b/sys/amd64/vmm/vmm.c
@@ -562,9 +562,9 @@ vm_alloc_vcpu(struct vm *vm, int vcpuid)
}
void
-vm_slock_vcpus(struct vm *vm)
+vm_lock_vcpus(struct vm *vm)
{
- sx_slock(&vm->vcpus_init_lock);
+ sx_xlock(&vm->vcpus_init_lock);
}
void
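[Note: vm_lock_vcpus() now takes vcpus_init_lock exclusively rather than shared, so callers serialize against concurrent vCPU creation in vm_alloc_vcpu(). A minimal usage sketch, assuming the existing vm_unlock_vcpus() counterpart releases the same lock:

/*
 * Illustrative caller, not part of the patch: keep the vCPU list
 * stable while iterating over it.  vm_unlock_vcpus() is assumed to do
 * sx_unlock(&vm->vcpus_init_lock), mirroring vm_lock_vcpus().
 */
vm_lock_vcpus(vm);
for (int i = 0; i < vm->maxcpus; i++) {
	struct vcpu *vcpu = vm_vcpu(vm, i);
	if (vcpu == NULL)
		continue;
	/* ... operate on vcpu while no new vCPUs can appear ... */
}
vm_unlock_vcpus(vm);
]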
@@ -990,6 +990,54 @@ save_guest_fpustate(struct vcpu *vcpu)
static VMM_STAT(VCPU_IDLE_TICKS, "number of ticks vcpu was idle");
+/*
+ * Invoke the rendezvous function on the specified vcpu if applicable. Return
+ * true if the rendezvous is finished, false otherwise.
+ */
+static bool
+vm_rendezvous(struct vcpu *vcpu)
+{
+ struct vm *vm = vcpu->vm;
+ int vcpuid;
+
+ mtx_assert(&vm->rendezvous_mtx, MA_OWNED);
+ KASSERT(vm->rendezvous_func != NULL,
+ ("vm_rendezvous: no rendezvous pending"));
+
+ /* 'rendezvous_req_cpus' must be a subset of 'active_cpus' */
+ CPU_AND(&vm->rendezvous_req_cpus, &vm->rendezvous_req_cpus,
+ &vm->active_cpus);
+
+ vcpuid = vcpu->vcpuid;
+ if (CPU_ISSET(vcpuid, &vm->rendezvous_req_cpus) &&
+ !CPU_ISSET(vcpuid, &vm->rendezvous_done_cpus)) {
+ VMM_CTR0(vcpu, "Calling rendezvous func");
+ (*vm->rendezvous_func)(vcpu, vm->rendezvous_arg);
+ CPU_SET(vcpuid, &vm->rendezvous_done_cpus);
+ }
+ if (CPU_CMP(&vm->rendezvous_req_cpus,
+ &vm->rendezvous_done_cpus) == 0) {
+ VMM_CTR0(vcpu, "Rendezvous completed");
+ CPU_ZERO(&vm->rendezvous_req_cpus);
+ vm->rendezvous_func = NULL;
+ wakeup(&vm->rendezvous_func);
+ return (true);
+ }
+ return (false);
+}
+
+static void
+vcpu_wait_idle(struct vcpu *vcpu)
+{
+ KASSERT(vcpu->state != VCPU_IDLE, ("vcpu already idle"));
+
+ vcpu->reqidle = 1;
+ vcpu_notify_event_locked(vcpu, false);
+ VMM_CTR1(vcpu, "vcpu state change from %s to "
+ "idle requested", vcpu_state2str(vcpu->state));
+ msleep_spin(&vcpu->state, &vcpu->mtx, "vmstat", hz);
+}
+
static int
vcpu_set_state_locked(struct vcpu *vcpu, enum vcpu_state newstate,
bool from_idle)
@@ -1004,13 +1052,8 @@ vcpu_set_state_locked(struct vcpu *vcpu, enum vcpu_state newstate,
* ioctl() operating on a vcpu at any point.
*/
if (from_idle) {
- while (vcpu->state != VCPU_IDLE) {
- vcpu->reqidle = 1;
- vcpu_notify_event_locked(vcpu, false);
- VMM_CTR1(vcpu, "vcpu state change from %s to "
- "idle requested", vcpu_state2str(vcpu->state));
- msleep_spin(&vcpu->state, &vcpu->mtx, "vmstat", hz);
- }
+ while (vcpu->state != VCPU_IDLE)
+ vcpu_wait_idle(vcpu);
} else {
KASSERT(vcpu->state != VCPU_IDLE, ("invalid transition from "
"vcpu idle state"));
@@ -1062,6 +1105,95 @@ vcpu_set_state_locked(struct vcpu *vcpu, enum vcpu_state newstate,
return (0);
}
+/*
+ * Try to lock all of the vCPUs in the VM while taking care to avoid deadlocks
+ * with vm_smp_rendezvous().
+ *
+ * The complexity here suggests that the rendezvous mechanism needs a rethink.
+ */
+int
+vcpu_set_state_all(struct vm *vm, enum vcpu_state newstate)
+{
+ cpuset_t locked;
+ struct vcpu *vcpu;
+ int error, i;
+ uint16_t maxcpus;
+
+ KASSERT(newstate != VCPU_IDLE,
+ ("vcpu_set_state_all: invalid target state %d", newstate));
+
+ error = 0;
+ CPU_ZERO(&locked);
+ maxcpus = vm->maxcpus;
+
+ mtx_lock(&vm->rendezvous_mtx);
+restart:
+ if (vm->rendezvous_func != NULL) {
+ /*
+ * If we have a pending rendezvous, then the initiator may be
+ * blocked waiting for other vCPUs to execute the callback. The
+ * current thread may be a vCPU thread, so we must not block
+ * waiting for the initiator, otherwise we get a deadlock.
+ * Thus, execute the callback on behalf of any idle vCPUs.
+ */
+ for (i = 0; i < maxcpus; i++) {
+ vcpu = vm_vcpu(vm, i);
+ if (vcpu == NULL)
+ continue;
+ vcpu_lock(vcpu);
+ if (vcpu->state == VCPU_IDLE) {
+ (void)vcpu_set_state_locked(vcpu, VCPU_FROZEN,
+ true);
+ CPU_SET(i, &locked);
+ }
+ if (CPU_ISSET(i, &locked)) {
+ /*
+ * We can safely execute the callback on this
+ * vCPU's behalf.
+ */
+ vcpu_unlock(vcpu);
+ (void)vm_rendezvous(vcpu);
+ vcpu_lock(vcpu);
+ }
+ vcpu_unlock(vcpu);
+ }
+ }
+
+ /*
+ * Now wait for remaining vCPUs to become idle. This may include the
+ * initiator of a rendezvous that is currently blocked on the rendezvous
+ * mutex.
+ */
+ CPU_FOREACH_ISCLR(i, &locked) {
+ if (i >= maxcpus)
+ break;
+ vcpu = vm_vcpu(vm, i);
+ if (vcpu == NULL)
+ continue;
+ vcpu_lock(vcpu);
+ while (vcpu->state != VCPU_IDLE) {
+ mtx_unlock(&vm->rendezvous_mtx);
+ vcpu_wait_idle(vcpu);
+ vcpu_unlock(vcpu);
+ mtx_lock(&vm->rendezvous_mtx);
+ if (vm->rendezvous_func != NULL)
+ goto restart;
+ vcpu_lock(vcpu);
+ }
+ error = vcpu_set_state_locked(vcpu, newstate, true);
+ vcpu_unlock(vcpu);
+ if (error != 0) {
+ /* Roll back the state of every vCPU locked so far. */
+ CPU_FOREACH_ISSET(i, &locked)
+ (void)vcpu_set_state(vm_vcpu(vm, i), VCPU_IDLE, false);
+ break;
+ }
+ CPU_SET(i, &locked);
+ }
+ mtx_unlock(&vm->rendezvous_mtx);
+ return (error);
+}
+
static void
vcpu_require_state(struct vcpu *vcpu, enum vcpu_state newstate)
{
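[Note: to make the intended use of vcpu_set_state_all() concrete, a hypothetical caller might freeze the whole VM around a VM-wide operation and then release each vCPU, mirroring the rollback loop above. Names and error handling here are illustrative:

/*
 * Hypothetical caller, not part of the patch: freeze every vCPU, do
 * some VM-wide work, then return the vCPUs to idle one by one.
 */
int error = vcpu_set_state_all(vm, VCPU_FROZEN);
if (error != 0)
	return (error);

/* ... no vCPU can enter the guest here ... */

for (int i = 0; i < vm->maxcpus; i++) {
	struct vcpu *vcpu = vm_vcpu(vm, i);
	if (vcpu != NULL)
		(void)vcpu_set_state(vcpu, VCPU_IDLE, false);
}
]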
@@ -1083,36 +1215,23 @@ vcpu_require_state_locked(struct vcpu *vcpu, enum vcpu_state newstate)
static int
vm_handle_rendezvous(struct vcpu *vcpu)
{
- struct vm *vm = vcpu->vm;
+ struct vm *vm;
struct thread *td;
- int error, vcpuid;
- error = 0;
- vcpuid = vcpu->vcpuid;
td = curthread;
+ vm = vcpu->vm;
+
mtx_lock(&vm->rendezvous_mtx);
while (vm->rendezvous_func != NULL) {
- /* 'rendezvous_req_cpus' must be a subset of 'active_cpus' */
- CPU_AND(&vm->rendezvous_req_cpus, &vm->rendezvous_req_cpus, &vm->active_cpus);
-
- if (CPU_ISSET(vcpuid, &vm->rendezvous_req_cpus) &&
- !CPU_ISSET(vcpuid, &vm->rendezvous_done_cpus)) {
- VMM_CTR0(vcpu, "Calling rendezvous func");
- (*vm->rendezvous_func)(vcpu, vm->rendezvous_arg);
- CPU_SET(vcpuid, &vm->rendezvous_done_cpus);
- }
- if (CPU_CMP(&vm->rendezvous_req_cpus,
- &vm->rendezvous_done_cpus) == 0) {
- VMM_CTR0(vcpu, "Rendezvous completed");
- CPU_ZERO(&vm->rendezvous_req_cpus);
- vm->rendezvous_func = NULL;
- wakeup(&vm->rendezvous_func);
+ if (vm_rendezvous(vcpu))
break;
- }
+
VMM_CTR0(vcpu, "Wait for rendezvous completion");
mtx_sleep(&vm->rendezvous_func, &vm->rendezvous_mtx, 0,
"vmrndv", hz);
if (td_ast_pending(td, TDA_SUSPEND)) {
+ int error;
+
mtx_unlock(&vm->rendezvous_mtx);
error = thread_check_susp(td, true);
if (error != 0)