fix linux repro extensions (#2415)

* fix OMS Linux repro extension config

* Fixing lost Node state updates

* fix bug in ReproVmss

* rewrite ssh auth

* win azure function ssh-keygen fix

* more logs

* try -P

* use empty string for password

* use argument list

* addressing comments

Co-authored-by: stas <statis@microsoft.com>
Co-authored-by: George Pollard <gpollard@microsoft.com>
This commit is contained in:
Stas 2022-09-20 08:15:26 -07:00 committed by GitHub
parent b647e4a1fe
commit f40a69a37f
No known key found for this signature in database
GPG Key ID: 4AEE18F83AFDEB23
11 changed files with 157 additions and 97 deletions

View File

@ -73,13 +73,13 @@ public class AgentEvents {
if (ev.State == NodeState.Free) {
if (node.ReimageRequested || node.DeleteRequested) {
_log.Info($"stopping free node with reset flags: {machineId}");
await _context.NodeOperations.Stop(node);
_ = await _context.NodeOperations.Stop(node);
return null;
}
if (await _context.NodeOperations.CouldShrinkScaleset(node)) {
_log.Info($"stopping free node to resize scaleset: {machineId}");
await _context.NodeOperations.SetHalt(node);
_ = await _context.NodeOperations.SetHalt(node);
return null;
}
}
@ -87,7 +87,7 @@ public class AgentEvents {
if (ev.State == NodeState.Init) {
if (node.DeleteRequested) {
_log.Info($"stopping node (init and delete_requested): {machineId}");
await _context.NodeOperations.Stop(node);
_ = await _context.NodeOperations.Stop(node);
return null;
}
@ -95,12 +95,12 @@ public class AgentEvents {
// they send 'init' with reimage_requested, it's because the node was reimaged
// successfully.
node = node with { ReimageRequested = false, InitializedAt = DateTimeOffset.UtcNow };
await _context.NodeOperations.SetState(node, ev.State);
_ = await _context.NodeOperations.SetState(node, ev.State);
return null;
}
_log.Info($"node state update: {machineId} from {node.State} to {ev.State}");
await _context.NodeOperations.SetState(node, ev.State);
node = await _context.NodeOperations.SetState(node, ev.State);
if (ev.State == NodeState.Free) {
_log.Info($"node now available for work: {machineId}");
@ -193,7 +193,8 @@ public class AgentEvents {
}
if (!node.State.ReadyForReset()) {
await _context.NodeOperations.SetState(node, NodeState.Busy);
_ = await _context.NodeOperations.SetState(node, NodeState.Busy);
// node unused after this point
}
var nodeTask = new NodeTasks(

View File

@ -100,9 +100,12 @@ public class Node {
context: patch.MachineId.ToString());
}
await _context.NodeOperations.Stop(node, done: true);
node = await _context.NodeOperations.Stop(node, done: true);
if (node.DebugKeepNode) {
await _context.NodeOperations.Replace(node with { DebugKeepNode = false });
var r = await _context.NodeOperations.Replace(node with { DebugKeepNode = false });
if (!r.IsOk) {
_log.Error($"Failed to replace node {node.MachineId} due to {r.ErrorV}");
}
}
return await RequestHandling.Ok(req, true);
@ -137,7 +140,10 @@ public class Node {
node = node with { DebugKeepNode = value };
}
await _context.NodeOperations.Replace(node);
var r = await _context.NodeOperations.Replace(node);
if (!r.IsOk) {
_log.Error($"Failed to replace node {node.MachineId} due to {r.ErrorV}");
}
return await RequestHandling.Ok(req, true);
}
@ -166,9 +172,12 @@ public class Node {
context: delete.MachineId.ToString());
}
await _context.NodeOperations.SetHalt(node);
node = await _context.NodeOperations.SetHalt(node);
if (node.DebugKeepNode) {
await _context.NodeOperations.Replace(node with { DebugKeepNode = false });
var r = await _context.NodeOperations.Replace(node with { DebugKeepNode = false });
if (!r.IsOk) {
_log.Error($"Failed to replace node {node.MachineId} due to {r.ErrorV}");
}
}
return await RequestHandling.Ok(req, true);

View File

@ -102,7 +102,7 @@ public class ReproVmss {
context: "repro delete");
}
var vm = await _context.ReproOperations.SearchByPartitionKeys(new[] { $"request.OkV.VmId" }).FirstOrDefaultAsync();
var vm = await _context.ReproOperations.SearchByPartitionKeys(new[] { request.OkV.VmId.ToString()! }).FirstOrDefaultAsync();
if (vm == null) {
return await _context.RequestHandling.NotOk(
@ -112,7 +112,10 @@ public class ReproVmss {
}
var updatedRepro = vm with { State = VmState.Stopping };
await _context.ReproOperations.Replace(updatedRepro);
var r = await _context.ReproOperations.Replace(updatedRepro);
if (!r.IsOk) {
_log.Error($"Failed to replace repro {updatedRepro.VmId} due to {r.ErrorV}");
}
var response = req.CreateResponse(HttpStatusCode.OK);
await response.WriteAsJsonAsync(updatedRepro);

View File

@ -114,7 +114,7 @@ public class Scaleset {
ScalesetId: Guid.NewGuid(),
State: ScalesetState.Init,
NeedsConfigUpdate: false,
Auth: Auth.BuildAuth(),
Auth: await Auth.BuildAuth(),
PoolName: create.PoolName,
VmSku: create.VmSku,
Image: create.Image,

View File

@ -1,45 +1,65 @@
namespace Microsoft.OneFuzz.Service;
using System.Buffers.Binary;
using System.Diagnostics;
using System.Security.Cryptography;
using System.IO;
public class Auth {
private static ReadOnlySpan<byte> SSHRSABytes => new byte[] { (byte)'s', (byte)'s', (byte)'h', (byte)'-', (byte)'r', (byte)'s', (byte)'a' };
private static ProcessStartInfo SshKeyGenProcConfig(string tempFile) {
private static byte[] BuildPublicKey(RSA rsa) {
static Span<byte> WriteLengthPrefixedBytes(ReadOnlySpan<byte> src, Span<byte> dest) {
BinaryPrimitives.WriteInt32BigEndian(dest, src.Length);
dest = dest[sizeof(int)..];
src.CopyTo(dest);
return dest[src.Length..];
string keyGen = "ssh-keygen";
var winAzureFunctionPath = "C:\\Program Files\\Git\\usr\\bin\\ssh-keygen.exe";
if (File.Exists(winAzureFunctionPath)) {
keyGen = winAzureFunctionPath;
}
var parameters = rsa.ExportParameters(includePrivateParameters: false);
// public key format is "ssh-rsa", exponent, modulus, all written
// as (big-endian) length-prefixed bytes
var result = new byte[sizeof(int) + SSHRSABytes.Length + sizeof(int) + parameters.Modulus!.Length + sizeof(int) + parameters.Exponent!.Length];
var spanResult = result.AsSpan();
spanResult = WriteLengthPrefixedBytes(SSHRSABytes, spanResult);
spanResult = WriteLengthPrefixedBytes(parameters.Exponent, spanResult);
spanResult = WriteLengthPrefixedBytes(parameters.Modulus, spanResult);
Debug.Assert(spanResult.Length == 0);
return result;
var p = new ProcessStartInfo() {
FileName = keyGen,
CreateNoWindow = false,
UseShellExecute = false,
RedirectStandardOutput = false,
RedirectStandardError = true,
ArgumentList = {
"-t",
"rsa",
"-f",
tempFile,
"-P",
"",
"-b",
"2048"
}
};
return p;
}
public static Authentication BuildAuth() {
using var rsa = RSA.Create(2048);
var privateKey = rsa.ExportRSAPrivateKey();
var formattedPrivateKey = $"-----BEGIN RSA PRIVATE KEY-----\n{Convert.ToBase64String(privateKey)}\n-----END RSA PRIVATE KEY-----\n";
var publicKey = BuildPublicKey(rsa);
var formattedPublicKey = $"ssh-rsa {Convert.ToBase64String(publicKey)} onefuzz-generated-key";
// This works both on Windows and Linux azure function hosts
private static async Async.Task<(string, string)> GenerateKeyValuePair() {
var tmpFile = Path.GetTempFileName();
File.Delete(tmpFile);
tmpFile = tmpFile + ".key";
var startInfo = SshKeyGenProcConfig(tmpFile);
using var proc = new Process() { StartInfo = startInfo };
if (proc.Start()) {
var stdErr = await proc.StandardError.ReadToEndAsync();
await proc.WaitForExitAsync();
if (proc.ExitCode != 0) {
throw new Exception($"ssh-keygen failed due to {stdErr}");
}
var priv = File.ReadAllText(tmpFile);
var pub = File.ReadAllText(tmpFile + ".pub");
File.Delete(tmpFile);
return (priv, pub.Trim());
} else {
throw new Exception("failed to start new ssh-keygen");
}
}
public static async Async.Task<Authentication> BuildAuth() {
var (priv, pub) = await GenerateKeyValuePair();
return new Authentication(
Password: Guid.NewGuid().ToString(),
PublicKey: formattedPublicKey,
PrivateKey: formattedPrivateKey);
PublicKey: pub,
PrivateKey: priv);
}
}

View File

@ -341,10 +341,10 @@ public class Extensions : IExtensions {
} else if (vmOs == Os.Linux) {
return new VMExtensionWrapper {
Location = region,
Name = "OMSExtension",
Name = "OmsAgentForLinux",
TypePropertiesType = "OmsAgentForLinux",
Publisher = "Microsoft.EnterpriseCloud.Monitoring",
TypeHandlerVersion = "1.12",
TypeHandlerVersion = "1.0",
AutoUpgradeMinorVersion = true,
Settings = new BinaryData(extensionSettings),
ProtectedSettings = new BinaryData(protectedExtensionSettings),

View File

@ -13,12 +13,12 @@ public interface INodeOperations : IStatefulOrm<Node, NodeState> {
Task<OneFuzzResultVoid> ReleaseScaleInProtection(Node node);
bool IsOutdated(Node node);
Async.Task Stop(Node node, bool done = false);
Async.Task<Node> Stop(Node node, bool done = false);
bool IsTooOld(Node node);
Task<bool> CouldShrinkScaleset(Node node);
Async.Task SetHalt(Node node);
Async.Task SetState(Node node, NodeState state);
Async.Task ToReimage(Node node, bool done = false);
Async.Task<Node> SetHalt(Node node);
Async.Task<Node> SetState(Node node, NodeState state);
Async.Task<Node> ToReimage(Node node, bool done = false);
Async.Task SendStopIfFree(Node node);
IAsyncEnumerable<Node> SearchStates(Guid? poolId = default,
Guid? scalesetId = default,
@ -54,7 +54,7 @@ public interface INodeOperations : IStatefulOrm<Node, NodeState> {
IAsyncEnumerable<Node> SearchByPoolName(PoolName poolName);
Async.Task SetShutdown(Node node);
Async.Task<Node> SetShutdown(Node node);
// state transitions:
Async.Task<Node> Init(Node node);
@ -118,13 +118,13 @@ public class NodeOperations : StatefulOrm<Node, NodeState, NodeOperations>, INod
public async Task<bool> CanProcessNewWork(Node node) {
if (IsOutdated(node) && _context.ServiceConfiguration.OneFuzzAllowOutdatedAgent != "true") {
_logTracer.Info($"can_process_new_work agent and service versions differ, stopping node. machine_id:{node.MachineId} agent_version:{node.Version} service_version:{_context.ServiceConfiguration.OneFuzzVersion}");
await Stop(node, done: true);
_ = await Stop(node, done: true);
return false;
}
if (IsTooOld(node)) {
_logTracer.Info($"can_process_new_work node is too old. machine_id:{node.MachineId}");
await Stop(node, done: true);
_ = await Stop(node, done: true);
return false;
}
@ -140,19 +140,19 @@ public class NodeOperations : StatefulOrm<Node, NodeState, NodeOperations>, INod
if (node.DeleteRequested) {
_logTracer.Info($"can_process_new_work is set to be deleted. machine_id:{node.MachineId}");
await Stop(node, done: true);
_ = await Stop(node, done: true);
return false;
}
if (node.ReimageRequested) {
_logTracer.Info($"can_process_new_work is set to be reimaged. machine_id:{node.MachineId}");
await Stop(node, done: true);
_ = await Stop(node, done: true);
return false;
}
if (await CouldShrinkScaleset(node)) {
_logTracer.Info($"can_process_new_work node scheduled to shrink. machine_id:{node.MachineId}");
await SetHalt(node);
_ = await SetHalt(node);
return false;
}
@ -197,27 +197,26 @@ public class NodeOperations : StatefulOrm<Node, NodeState, NodeOperations>, INod
if (node.DebugKeepNode) {
_logTracer.Info($"removing debug_keep_node for expired node. scaleset_id:{node.ScalesetId} machine_id:{node.MachineId}");
}
await ToReimage(node with { DebugKeepNode = false });
_ = await ToReimage(node with { DebugKeepNode = false });
}
}
public async Async.Task MarkOutdatedNodes() {
//#if outdated agents are allowed, do not attempt to update
bool allowOutdatedAgent;
var parsed = bool.TryParse(_context.ServiceConfiguration.OneFuzzAllowOutdatedAgent ?? "false", out allowOutdatedAgent);
//if outdated agents are allowed, do not attempt to update
var parsed = bool.TryParse(_context.ServiceConfiguration.OneFuzzAllowOutdatedAgent ?? "false", out var allowOutdatedAgent);
if (parsed && allowOutdatedAgent) {
return;
}
var outdated = this.SearchOutdated(excludeUpdateScheduled: true);
var outdated = SearchOutdated(excludeUpdateScheduled: true);
await foreach (var node in outdated) {
_logTracer.Info($"node is outdated: {node.MachineId} - node_version:{node.Version} api_version:");
if (node.Version == "1.0.0") {
await ToReimage(node, done: true);
_ = await ToReimage(node, done: true);
} else {
await ToReimage(node);
_ = await ToReimage(node);
}
}
}
@ -291,7 +290,7 @@ public class NodeOperations : StatefulOrm<Node, NodeState, NodeOperations>, INod
}
}
public async Async.Task ToReimage(Node node, bool done = false) {
public async Async.Task<Node> ToReimage(Node node, bool done = false) {
var nodeState = node.State;
if (done) {
@ -314,6 +313,8 @@ public class NodeOperations : StatefulOrm<Node, NodeState, NodeOperations>, INod
if (!r.IsOk) {
_logTracer.WithHttpStatus(r.ErrorV).Error("Failed to save Node record");
}
return updatedNode;
}
public IAsyncEnumerable<Node> GetDeadNodes(Guid scaleSetId, TimeSpan expirationPeriod) {
@ -356,9 +357,10 @@ public class NodeOperations : StatefulOrm<Node, NodeState, NodeOperations>, INod
return node;
}
public async Async.Task Stop(Node node, bool done = false) {
await ToReimage(node, done);
public async Async.Task<Node> Stop(Node node, bool done = false) {
node = await ToReimage(node, done);
await SendMessage(node, new NodeCommand(Stop: new StopNodeCommand()));
return node;
}
/// <summary>
@ -366,14 +368,15 @@ public class NodeOperations : StatefulOrm<Node, NodeState, NodeOperations>, INod
/// </summary>
/// <param name="node"></param>
/// <returns></returns>
public async Async.Task SetHalt(Node node) {
public async Async.Task<Node> SetHalt(Node node) {
_logTracer.Info($"setting halt: {node.MachineId}");
var updatedNode = node with { DeleteRequested = true };
await Stop(updatedNode, true);
await SendStopIfFree(updatedNode);
node = node with { DeleteRequested = true };
node = await Stop(node, true);
await SendStopIfFree(node);
return node;
}
public async Async.Task SetShutdown(Node node) {
public async Async.Task<Node> SetShutdown(Node node) {
//don't give out more work to the node, but let it finish existing work
_logTracer.Info($"setting delete_requested: {node.MachineId}");
node = node with { DeleteRequested = true };
@ -383,6 +386,7 @@ public class NodeOperations : StatefulOrm<Node, NodeState, NodeOperations>, INod
}
await SendStopIfFree(node);
return node;
}
@ -435,22 +439,26 @@ public class NodeOperations : StatefulOrm<Node, NodeState, NodeOperations>, INod
return false;
}
public async Async.Task SetState(Node node, NodeState state) {
var newNode = node;
if (node.State != state) {
newNode = newNode with { State = state };
await _context.Events.SendEvent(new EventNodeStateUpdated(
newNode.MachineId,
newNode.ScalesetId,
newNode.PoolName,
state
));
public async Async.Task<Node> SetState(Node node, NodeState state) {
if (node.State == state) {
return node;
}
var r = await Replace(newNode);
node = node with { State = state };
await _context.Events.SendEvent(new EventNodeStateUpdated(
node.MachineId,
node.ScalesetId,
node.PoolName,
state
));
var r = await Update(node);
if (!r.IsOk) {
_logTracer.Error($"Failed to update node for machine: {newNode.MachineId} to state {state} due to {r.ErrorV}");
_logTracer.Error($"Failed to update node for machine: {node.MachineId} to state {state} due to {r.ErrorV}");
// TODO: this should error out
}
return node;
}
public static string SearchStatesQuery(
@ -558,7 +566,7 @@ public class NodeOperations : StatefulOrm<Node, NodeState, NodeOperations>, INod
await foreach (var node in nodes) {
await _context.NodeMessageOperations.SendMessage(node.MachineId, new NodeCommand(StopTask: new StopTaskNodeCommand(task_id)));
if (!(await StopIfComplete(node))) {
if (!await StopIfComplete(node)) {
_logTracer.Info($"nodes: stopped task on node, but not reimaging due to other tasks: task_id:{task_id} machine_id:{node.MachineId}");
}
}
@ -592,7 +600,7 @@ public class NodeOperations : StatefulOrm<Node, NodeState, NodeOperations>, INod
}
_logTracer.Info($"node: stopping busy node with all tasks complete: {node.MachineId}");
await Stop(node, done: done);
_ = await Stop(node, done: done);
return true;
}

View File

@ -58,7 +58,7 @@ public class ProxyOperations : StatefulOrm<Proxy, VmState, ProxyOperations>, IPr
}
_logTracer.Info($"creating proxy: region:{region}");
var newProxy = new Proxy(region, Guid.NewGuid(), DateTimeOffset.UtcNow, VmState.Init, Auth.BuildAuth(), null, null, _context.ServiceConfiguration.OneFuzzVersion, null, false);
var newProxy = new Proxy(region, Guid.NewGuid(), DateTimeOffset.UtcNow, VmState.Init, await Auth.BuildAuth(), null, null, _context.ServiceConfiguration.OneFuzzVersion, null, false);
var r = await Replace(newProxy);
if (!r.IsOk) {

View File

@ -281,7 +281,10 @@ public class ReproOperations : StatefulOrm<Repro, VmState, ReproOperations>, IRe
State = VmState.Stopping
};
await Replace(repro);
var r = await Replace(repro);
if (!r.IsOk) {
_logTracer.Error($"failed to replace repro record for {repro.VmId} due to {r.ErrorV}");
}
return repro;
}
@ -306,12 +309,15 @@ public class ReproOperations : StatefulOrm<Repro, VmState, ReproOperations>, IRe
Config: config,
TaskId: task.TaskId,
Os: task.Os,
Auth: Auth.BuildAuth(),
Auth: await Auth.BuildAuth(),
EndTime: DateTimeOffset.UtcNow + TimeSpan.FromHours(config.Duration),
UserInfo: userInfo
);
await _context.ReproOperations.Insert(vm);
var r = await _context.ReproOperations.Insert(vm);
if (!r.IsOk) {
_logTracer.Error($"failed to insert repro record for {vm.VmId} due to {r.ErrorV}");
}
return OneFuzzResult<Repro>.Ok(vm);
} else {
return OneFuzzResult<Repro>.Error(ErrorCode.UNABLE_TO_FIND, "unable to find report");

View File

@ -562,11 +562,9 @@ public class ScalesetOperations : StatefulOrm<Scaleset, ScalesetState, ScalesetO
toDelete[node.MachineId] = node;
} else {
if (await new ShrinkQueue(scaleSet.ScalesetId, _context.Queue, _log).ShouldShrink()) {
await _context.NodeOperations.SetHalt(node);
toDelete[node.MachineId] = node;
toDelete[node.MachineId] = await _context.NodeOperations.SetHalt(node);
} else if (await new ShrinkQueue(pool.OkV!.PoolId, _context.Queue, _log).ShouldShrink()) {
await _context.NodeOperations.SetHalt(node);
toDelete[node.MachineId] = node;
toDelete[node.MachineId] = await _context.NodeOperations.SetHalt(node);
} else {
toReimage[node.MachineId] = node;
}
@ -585,8 +583,7 @@ public class ScalesetOperations : StatefulOrm<Scaleset, ScalesetState, ScalesetO
var error = new Error(ErrorCode.TASK_FAILED, new[] { $"{errorMessage} scaleset_id {deadNode.ScalesetId} last heartbeat:{deadNode.Heartbeat}" });
await _context.NodeOperations.MarkTasksStoppedEarly(deadNode, error);
await _context.NodeOperations.ToReimage(deadNode, true);
toReimage[deadNode.MachineId] = deadNode;
toReimage[deadNode.MachineId] = await _context.NodeOperations.ToReimage(deadNode, true);
}
// Perform operations until they fail due to scaleset getting locked:
@ -671,7 +668,7 @@ public class ScalesetOperations : StatefulOrm<Scaleset, ScalesetState, ScalesetO
}
// TODO: try to do this as one atomic operation:
await Async.Task.WhenAll(nodes.Select(node => _context.NodeOperations.SetHalt(node)));
nodes = await Async.Task.WhenAll(nodes.Select(node => _context.NodeOperations.SetHalt(node)));
if (scaleset.State == ScalesetState.Halt) {
_log.Info($"{SCALESET_LOG_PREFIX} scaleset halting, ignoring deletion {scaleset.ScalesetId}");

View File

@ -0,0 +1,16 @@
using FluentAssertions;
using Xunit;
namespace Tests {
public class AuthTests {
[Fact]
public async System.Threading.Tasks.Task TestAuth() {
var auth = await Microsoft.OneFuzz.Service.Auth.BuildAuth();
auth.Should().NotBeNull();
auth.PrivateKey.StartsWith("-----BEGIN OPENSSH PRIVATE KEY-----").Should().BeTrue();
auth.PrivateKey.EndsWith("-----END OPENSSH PRIVATE KEY-----\n").Should().BeTrue();
auth.PublicKey.StartsWith("ssh-rsa").Should().BeTrue();
}
}
}