From f40a69a37f9bb18eb2b36e421c9521a14021cdcf Mon Sep 17 00:00:00 2001 From: Stas Date: Tue, 20 Sep 2022 08:15:26 -0700 Subject: [PATCH] fix linux repro extensions (#2415) * fix OMS Linux repro extension config * Fixing lost Node state updates * fix bug in ReproVmss * rewrite ssh auth * win azure function ssh-keygen fix * more logs * try -P * use empty string for password * use argument list * addressing comments Co-authored-by: stas Co-authored-by: George Pollard --- .../ApiService/Functions/AgentEvents.cs | 13 +-- src/ApiService/ApiService/Functions/Node.cs | 19 ++-- .../ApiService/Functions/ReproVmss.cs | 7 +- .../ApiService/Functions/Scaleset.cs | 2 +- src/ApiService/ApiService/onefuzzlib/Auth.cs | 82 +++++++++++------- .../ApiService/onefuzzlib/Extension.cs | 4 +- .../ApiService/onefuzzlib/NodeOperations.cs | 86 ++++++++++--------- .../ApiService/onefuzzlib/ProxyOperations.cs | 2 +- .../ApiService/onefuzzlib/ReproOperations.cs | 12 ++- .../onefuzzlib/ScalesetOperations.cs | 11 +-- src/ApiService/Tests/AuthTests.cs | 16 ++++ 11 files changed, 157 insertions(+), 97 deletions(-) create mode 100644 src/ApiService/Tests/AuthTests.cs diff --git a/src/ApiService/ApiService/Functions/AgentEvents.cs b/src/ApiService/ApiService/Functions/AgentEvents.cs index f33a50b7e..21b1b74f5 100644 --- a/src/ApiService/ApiService/Functions/AgentEvents.cs +++ b/src/ApiService/ApiService/Functions/AgentEvents.cs @@ -73,13 +73,13 @@ public class AgentEvents { if (ev.State == NodeState.Free) { if (node.ReimageRequested || node.DeleteRequested) { _log.Info($"stopping free node with reset flags: {machineId}"); - await _context.NodeOperations.Stop(node); + _ = await _context.NodeOperations.Stop(node); return null; } if (await _context.NodeOperations.CouldShrinkScaleset(node)) { _log.Info($"stopping free node to resize scaleset: {machineId}"); - await _context.NodeOperations.SetHalt(node); + _ = await _context.NodeOperations.SetHalt(node); return null; } } @@ -87,7 +87,7 @@ public class AgentEvents { if (ev.State == NodeState.Init) { if (node.DeleteRequested) { _log.Info($"stopping node (init and delete_requested): {machineId}"); - await _context.NodeOperations.Stop(node); + _ = await _context.NodeOperations.Stop(node); return null; } @@ -95,12 +95,12 @@ public class AgentEvents { // they send 'init' with reimage_requested, it's because the node was reimaged // successfully. node = node with { ReimageRequested = false, InitializedAt = DateTimeOffset.UtcNow }; - await _context.NodeOperations.SetState(node, ev.State); + _ = await _context.NodeOperations.SetState(node, ev.State); return null; } _log.Info($"node state update: {machineId} from {node.State} to {ev.State}"); - await _context.NodeOperations.SetState(node, ev.State); + node = await _context.NodeOperations.SetState(node, ev.State); if (ev.State == NodeState.Free) { _log.Info($"node now available for work: {machineId}"); @@ -193,7 +193,8 @@ public class AgentEvents { } if (!node.State.ReadyForReset()) { - await _context.NodeOperations.SetState(node, NodeState.Busy); + _ = await _context.NodeOperations.SetState(node, NodeState.Busy); + // node unused after this point } var nodeTask = new NodeTasks( diff --git a/src/ApiService/ApiService/Functions/Node.cs b/src/ApiService/ApiService/Functions/Node.cs index 21867367e..c900e01bb 100644 --- a/src/ApiService/ApiService/Functions/Node.cs +++ b/src/ApiService/ApiService/Functions/Node.cs @@ -100,9 +100,12 @@ public class Node { context: patch.MachineId.ToString()); } - await _context.NodeOperations.Stop(node, done: true); + node = await _context.NodeOperations.Stop(node, done: true); if (node.DebugKeepNode) { - await _context.NodeOperations.Replace(node with { DebugKeepNode = false }); + var r = await _context.NodeOperations.Replace(node with { DebugKeepNode = false }); + if (!r.IsOk) { + _log.Error($"Failed to replace node {node.MachineId} due to {r.ErrorV}"); + } } return await RequestHandling.Ok(req, true); @@ -137,7 +140,10 @@ public class Node { node = node with { DebugKeepNode = value }; } - await _context.NodeOperations.Replace(node); + var r = await _context.NodeOperations.Replace(node); + if (!r.IsOk) { + _log.Error($"Failed to replace node {node.MachineId} due to {r.ErrorV}"); + } return await RequestHandling.Ok(req, true); } @@ -166,9 +172,12 @@ public class Node { context: delete.MachineId.ToString()); } - await _context.NodeOperations.SetHalt(node); + node = await _context.NodeOperations.SetHalt(node); if (node.DebugKeepNode) { - await _context.NodeOperations.Replace(node with { DebugKeepNode = false }); + var r = await _context.NodeOperations.Replace(node with { DebugKeepNode = false }); + if (!r.IsOk) { + _log.Error($"Failed to replace node {node.MachineId} due to {r.ErrorV}"); + } } return await RequestHandling.Ok(req, true); diff --git a/src/ApiService/ApiService/Functions/ReproVmss.cs b/src/ApiService/ApiService/Functions/ReproVmss.cs index 4b99877a7..9512dd0e8 100644 --- a/src/ApiService/ApiService/Functions/ReproVmss.cs +++ b/src/ApiService/ApiService/Functions/ReproVmss.cs @@ -102,7 +102,7 @@ public class ReproVmss { context: "repro delete"); } - var vm = await _context.ReproOperations.SearchByPartitionKeys(new[] { $"request.OkV.VmId" }).FirstOrDefaultAsync(); + var vm = await _context.ReproOperations.SearchByPartitionKeys(new[] { request.OkV.VmId.ToString()! }).FirstOrDefaultAsync(); if (vm == null) { return await _context.RequestHandling.NotOk( @@ -112,7 +112,10 @@ public class ReproVmss { } var updatedRepro = vm with { State = VmState.Stopping }; - await _context.ReproOperations.Replace(updatedRepro); + var r = await _context.ReproOperations.Replace(updatedRepro); + if (!r.IsOk) { + _log.Error($"Failed to replace repro {updatedRepro.VmId} due to {r.ErrorV}"); + } var response = req.CreateResponse(HttpStatusCode.OK); await response.WriteAsJsonAsync(updatedRepro); diff --git a/src/ApiService/ApiService/Functions/Scaleset.cs b/src/ApiService/ApiService/Functions/Scaleset.cs index df2f08e70..08cf50d64 100644 --- a/src/ApiService/ApiService/Functions/Scaleset.cs +++ b/src/ApiService/ApiService/Functions/Scaleset.cs @@ -114,7 +114,7 @@ public class Scaleset { ScalesetId: Guid.NewGuid(), State: ScalesetState.Init, NeedsConfigUpdate: false, - Auth: Auth.BuildAuth(), + Auth: await Auth.BuildAuth(), PoolName: create.PoolName, VmSku: create.VmSku, Image: create.Image, diff --git a/src/ApiService/ApiService/onefuzzlib/Auth.cs b/src/ApiService/ApiService/onefuzzlib/Auth.cs index 517835caf..c62041c6e 100644 --- a/src/ApiService/ApiService/onefuzzlib/Auth.cs +++ b/src/ApiService/ApiService/onefuzzlib/Auth.cs @@ -1,45 +1,65 @@ namespace Microsoft.OneFuzz.Service; - -using System.Buffers.Binary; using System.Diagnostics; -using System.Security.Cryptography; +using System.IO; public class Auth { - private static ReadOnlySpan SSHRSABytes => new byte[] { (byte)'s', (byte)'s', (byte)'h', (byte)'-', (byte)'r', (byte)'s', (byte)'a' }; + private static ProcessStartInfo SshKeyGenProcConfig(string tempFile) { - private static byte[] BuildPublicKey(RSA rsa) { - static Span WriteLengthPrefixedBytes(ReadOnlySpan src, Span dest) { - BinaryPrimitives.WriteInt32BigEndian(dest, src.Length); - dest = dest[sizeof(int)..]; - src.CopyTo(dest); - return dest[src.Length..]; + string keyGen = "ssh-keygen"; + var winAzureFunctionPath = "C:\\Program Files\\Git\\usr\\bin\\ssh-keygen.exe"; + if (File.Exists(winAzureFunctionPath)) { + keyGen = winAzureFunctionPath; } - - var parameters = rsa.ExportParameters(includePrivateParameters: false); - - // public key format is "ssh-rsa", exponent, modulus, all written - // as (big-endian) length-prefixed bytes - var result = new byte[sizeof(int) + SSHRSABytes.Length + sizeof(int) + parameters.Modulus!.Length + sizeof(int) + parameters.Exponent!.Length]; - var spanResult = result.AsSpan(); - spanResult = WriteLengthPrefixedBytes(SSHRSABytes, spanResult); - spanResult = WriteLengthPrefixedBytes(parameters.Exponent, spanResult); - spanResult = WriteLengthPrefixedBytes(parameters.Modulus, spanResult); - Debug.Assert(spanResult.Length == 0); - - return result; + var p = new ProcessStartInfo() { + FileName = keyGen, + CreateNoWindow = false, + UseShellExecute = false, + RedirectStandardOutput = false, + RedirectStandardError = true, + ArgumentList = { + "-t", + "rsa", + "-f", + tempFile, + "-P", + "", + "-b", + "2048" + } + }; + return p; } - public static Authentication BuildAuth() { - using var rsa = RSA.Create(2048); - var privateKey = rsa.ExportRSAPrivateKey(); - var formattedPrivateKey = $"-----BEGIN RSA PRIVATE KEY-----\n{Convert.ToBase64String(privateKey)}\n-----END RSA PRIVATE KEY-----\n"; - var publicKey = BuildPublicKey(rsa); - var formattedPublicKey = $"ssh-rsa {Convert.ToBase64String(publicKey)} onefuzz-generated-key"; + // This works both on Windows and Linux azure function hosts + private static async Async.Task<(string, string)> GenerateKeyValuePair() { + var tmpFile = Path.GetTempFileName(); + File.Delete(tmpFile); + tmpFile = tmpFile + ".key"; + var startInfo = SshKeyGenProcConfig(tmpFile); + using var proc = new Process() { StartInfo = startInfo }; + if (proc.Start()) { + var stdErr = await proc.StandardError.ReadToEndAsync(); + await proc.WaitForExitAsync(); + if (proc.ExitCode != 0) { + throw new Exception($"ssh-keygen failed due to {stdErr}"); + } + var priv = File.ReadAllText(tmpFile); + var pub = File.ReadAllText(tmpFile + ".pub"); + File.Delete(tmpFile); + return (priv, pub.Trim()); + } else { + throw new Exception("failed to start new ssh-keygen"); + } + } + + + public static async Async.Task BuildAuth() { + var (priv, pub) = await GenerateKeyValuePair(); return new Authentication( Password: Guid.NewGuid().ToString(), - PublicKey: formattedPublicKey, - PrivateKey: formattedPrivateKey); + PublicKey: pub, + PrivateKey: priv); } } diff --git a/src/ApiService/ApiService/onefuzzlib/Extension.cs b/src/ApiService/ApiService/onefuzzlib/Extension.cs index 97f433cc9..683b28eb1 100644 --- a/src/ApiService/ApiService/onefuzzlib/Extension.cs +++ b/src/ApiService/ApiService/onefuzzlib/Extension.cs @@ -341,10 +341,10 @@ public class Extensions : IExtensions { } else if (vmOs == Os.Linux) { return new VMExtensionWrapper { Location = region, - Name = "OMSExtension", + Name = "OmsAgentForLinux", TypePropertiesType = "OmsAgentForLinux", Publisher = "Microsoft.EnterpriseCloud.Monitoring", - TypeHandlerVersion = "1.12", + TypeHandlerVersion = "1.0", AutoUpgradeMinorVersion = true, Settings = new BinaryData(extensionSettings), ProtectedSettings = new BinaryData(protectedExtensionSettings), diff --git a/src/ApiService/ApiService/onefuzzlib/NodeOperations.cs b/src/ApiService/ApiService/onefuzzlib/NodeOperations.cs index cbe3ca3b6..9a449641a 100644 --- a/src/ApiService/ApiService/onefuzzlib/NodeOperations.cs +++ b/src/ApiService/ApiService/onefuzzlib/NodeOperations.cs @@ -13,12 +13,12 @@ public interface INodeOperations : IStatefulOrm { Task ReleaseScaleInProtection(Node node); bool IsOutdated(Node node); - Async.Task Stop(Node node, bool done = false); + Async.Task Stop(Node node, bool done = false); bool IsTooOld(Node node); Task CouldShrinkScaleset(Node node); - Async.Task SetHalt(Node node); - Async.Task SetState(Node node, NodeState state); - Async.Task ToReimage(Node node, bool done = false); + Async.Task SetHalt(Node node); + Async.Task SetState(Node node, NodeState state); + Async.Task ToReimage(Node node, bool done = false); Async.Task SendStopIfFree(Node node); IAsyncEnumerable SearchStates(Guid? poolId = default, Guid? scalesetId = default, @@ -54,7 +54,7 @@ public interface INodeOperations : IStatefulOrm { IAsyncEnumerable SearchByPoolName(PoolName poolName); - Async.Task SetShutdown(Node node); + Async.Task SetShutdown(Node node); // state transitions: Async.Task Init(Node node); @@ -118,13 +118,13 @@ public class NodeOperations : StatefulOrm, INod public async Task CanProcessNewWork(Node node) { if (IsOutdated(node) && _context.ServiceConfiguration.OneFuzzAllowOutdatedAgent != "true") { _logTracer.Info($"can_process_new_work agent and service versions differ, stopping node. machine_id:{node.MachineId} agent_version:{node.Version} service_version:{_context.ServiceConfiguration.OneFuzzVersion}"); - await Stop(node, done: true); + _ = await Stop(node, done: true); return false; } if (IsTooOld(node)) { _logTracer.Info($"can_process_new_work node is too old. machine_id:{node.MachineId}"); - await Stop(node, done: true); + _ = await Stop(node, done: true); return false; } @@ -140,19 +140,19 @@ public class NodeOperations : StatefulOrm, INod if (node.DeleteRequested) { _logTracer.Info($"can_process_new_work is set to be deleted. machine_id:{node.MachineId}"); - await Stop(node, done: true); + _ = await Stop(node, done: true); return false; } if (node.ReimageRequested) { _logTracer.Info($"can_process_new_work is set to be reimaged. machine_id:{node.MachineId}"); - await Stop(node, done: true); + _ = await Stop(node, done: true); return false; } if (await CouldShrinkScaleset(node)) { _logTracer.Info($"can_process_new_work node scheduled to shrink. machine_id:{node.MachineId}"); - await SetHalt(node); + _ = await SetHalt(node); return false; } @@ -197,27 +197,26 @@ public class NodeOperations : StatefulOrm, INod if (node.DebugKeepNode) { _logTracer.Info($"removing debug_keep_node for expired node. scaleset_id:{node.ScalesetId} machine_id:{node.MachineId}"); } - await ToReimage(node with { DebugKeepNode = false }); + _ = await ToReimage(node with { DebugKeepNode = false }); } } public async Async.Task MarkOutdatedNodes() { - //#if outdated agents are allowed, do not attempt to update - bool allowOutdatedAgent; - var parsed = bool.TryParse(_context.ServiceConfiguration.OneFuzzAllowOutdatedAgent ?? "false", out allowOutdatedAgent); + //if outdated agents are allowed, do not attempt to update + var parsed = bool.TryParse(_context.ServiceConfiguration.OneFuzzAllowOutdatedAgent ?? "false", out var allowOutdatedAgent); if (parsed && allowOutdatedAgent) { return; } - var outdated = this.SearchOutdated(excludeUpdateScheduled: true); + var outdated = SearchOutdated(excludeUpdateScheduled: true); await foreach (var node in outdated) { _logTracer.Info($"node is outdated: {node.MachineId} - node_version:{node.Version} api_version:"); if (node.Version == "1.0.0") { - await ToReimage(node, done: true); + _ = await ToReimage(node, done: true); } else { - await ToReimage(node); + _ = await ToReimage(node); } } } @@ -291,7 +290,7 @@ public class NodeOperations : StatefulOrm, INod } } - public async Async.Task ToReimage(Node node, bool done = false) { + public async Async.Task ToReimage(Node node, bool done = false) { var nodeState = node.State; if (done) { @@ -314,6 +313,8 @@ public class NodeOperations : StatefulOrm, INod if (!r.IsOk) { _logTracer.WithHttpStatus(r.ErrorV).Error("Failed to save Node record"); } + + return updatedNode; } public IAsyncEnumerable GetDeadNodes(Guid scaleSetId, TimeSpan expirationPeriod) { @@ -356,9 +357,10 @@ public class NodeOperations : StatefulOrm, INod return node; } - public async Async.Task Stop(Node node, bool done = false) { - await ToReimage(node, done); + public async Async.Task Stop(Node node, bool done = false) { + node = await ToReimage(node, done); await SendMessage(node, new NodeCommand(Stop: new StopNodeCommand())); + return node; } /// @@ -366,14 +368,15 @@ public class NodeOperations : StatefulOrm, INod /// /// /// - public async Async.Task SetHalt(Node node) { + public async Async.Task SetHalt(Node node) { _logTracer.Info($"setting halt: {node.MachineId}"); - var updatedNode = node with { DeleteRequested = true }; - await Stop(updatedNode, true); - await SendStopIfFree(updatedNode); + node = node with { DeleteRequested = true }; + node = await Stop(node, true); + await SendStopIfFree(node); + return node; } - public async Async.Task SetShutdown(Node node) { + public async Async.Task SetShutdown(Node node) { //don't give out more work to the node, but let it finish existing work _logTracer.Info($"setting delete_requested: {node.MachineId}"); node = node with { DeleteRequested = true }; @@ -383,6 +386,7 @@ public class NodeOperations : StatefulOrm, INod } await SendStopIfFree(node); + return node; } @@ -435,22 +439,26 @@ public class NodeOperations : StatefulOrm, INod return false; } - public async Async.Task SetState(Node node, NodeState state) { - var newNode = node; - if (node.State != state) { - newNode = newNode with { State = state }; - await _context.Events.SendEvent(new EventNodeStateUpdated( - newNode.MachineId, - newNode.ScalesetId, - newNode.PoolName, - state - )); + public async Async.Task SetState(Node node, NodeState state) { + if (node.State == state) { + return node; } - var r = await Replace(newNode); + node = node with { State = state }; + await _context.Events.SendEvent(new EventNodeStateUpdated( + node.MachineId, + node.ScalesetId, + node.PoolName, + state + )); + + var r = await Update(node); if (!r.IsOk) { - _logTracer.Error($"Failed to update node for machine: {newNode.MachineId} to state {state} due to {r.ErrorV}"); + _logTracer.Error($"Failed to update node for machine: {node.MachineId} to state {state} due to {r.ErrorV}"); + // TODO: this should error out } + + return node; } public static string SearchStatesQuery( @@ -558,7 +566,7 @@ public class NodeOperations : StatefulOrm, INod await foreach (var node in nodes) { await _context.NodeMessageOperations.SendMessage(node.MachineId, new NodeCommand(StopTask: new StopTaskNodeCommand(task_id))); - if (!(await StopIfComplete(node))) { + if (!await StopIfComplete(node)) { _logTracer.Info($"nodes: stopped task on node, but not reimaging due to other tasks: task_id:{task_id} machine_id:{node.MachineId}"); } } @@ -592,7 +600,7 @@ public class NodeOperations : StatefulOrm, INod } _logTracer.Info($"node: stopping busy node with all tasks complete: {node.MachineId}"); - await Stop(node, done: done); + _ = await Stop(node, done: done); return true; } diff --git a/src/ApiService/ApiService/onefuzzlib/ProxyOperations.cs b/src/ApiService/ApiService/onefuzzlib/ProxyOperations.cs index c6cfe37be..0cde95d4b 100644 --- a/src/ApiService/ApiService/onefuzzlib/ProxyOperations.cs +++ b/src/ApiService/ApiService/onefuzzlib/ProxyOperations.cs @@ -58,7 +58,7 @@ public class ProxyOperations : StatefulOrm, IPr } _logTracer.Info($"creating proxy: region:{region}"); - var newProxy = new Proxy(region, Guid.NewGuid(), DateTimeOffset.UtcNow, VmState.Init, Auth.BuildAuth(), null, null, _context.ServiceConfiguration.OneFuzzVersion, null, false); + var newProxy = new Proxy(region, Guid.NewGuid(), DateTimeOffset.UtcNow, VmState.Init, await Auth.BuildAuth(), null, null, _context.ServiceConfiguration.OneFuzzVersion, null, false); var r = await Replace(newProxy); if (!r.IsOk) { diff --git a/src/ApiService/ApiService/onefuzzlib/ReproOperations.cs b/src/ApiService/ApiService/onefuzzlib/ReproOperations.cs index 6dd594910..54bfa209c 100644 --- a/src/ApiService/ApiService/onefuzzlib/ReproOperations.cs +++ b/src/ApiService/ApiService/onefuzzlib/ReproOperations.cs @@ -281,7 +281,10 @@ public class ReproOperations : StatefulOrm, IRe State = VmState.Stopping }; - await Replace(repro); + var r = await Replace(repro); + if (!r.IsOk) { + _logTracer.Error($"failed to replace repro record for {repro.VmId} due to {r.ErrorV}"); + } return repro; } @@ -306,12 +309,15 @@ public class ReproOperations : StatefulOrm, IRe Config: config, TaskId: task.TaskId, Os: task.Os, - Auth: Auth.BuildAuth(), + Auth: await Auth.BuildAuth(), EndTime: DateTimeOffset.UtcNow + TimeSpan.FromHours(config.Duration), UserInfo: userInfo ); - await _context.ReproOperations.Insert(vm); + var r = await _context.ReproOperations.Insert(vm); + if (!r.IsOk) { + _logTracer.Error($"failed to insert repro record for {vm.VmId} due to {r.ErrorV}"); + } return OneFuzzResult.Ok(vm); } else { return OneFuzzResult.Error(ErrorCode.UNABLE_TO_FIND, "unable to find report"); diff --git a/src/ApiService/ApiService/onefuzzlib/ScalesetOperations.cs b/src/ApiService/ApiService/onefuzzlib/ScalesetOperations.cs index 465bf49e2..806cacc24 100644 --- a/src/ApiService/ApiService/onefuzzlib/ScalesetOperations.cs +++ b/src/ApiService/ApiService/onefuzzlib/ScalesetOperations.cs @@ -562,11 +562,9 @@ public class ScalesetOperations : StatefulOrm _context.NodeOperations.SetHalt(node))); + nodes = await Async.Task.WhenAll(nodes.Select(node => _context.NodeOperations.SetHalt(node))); if (scaleset.State == ScalesetState.Halt) { _log.Info($"{SCALESET_LOG_PREFIX} scaleset halting, ignoring deletion {scaleset.ScalesetId}"); diff --git a/src/ApiService/Tests/AuthTests.cs b/src/ApiService/Tests/AuthTests.cs new file mode 100644 index 000000000..c7476c22c --- /dev/null +++ b/src/ApiService/Tests/AuthTests.cs @@ -0,0 +1,16 @@ +using FluentAssertions; +using Xunit; + +namespace Tests { + public class AuthTests { + [Fact] + public async System.Threading.Tasks.Task TestAuth() { + var auth = await Microsoft.OneFuzz.Service.Auth.BuildAuth(); + + auth.Should().NotBeNull(); + auth.PrivateKey.StartsWith("-----BEGIN OPENSSH PRIVATE KEY-----").Should().BeTrue(); + auth.PrivateKey.EndsWith("-----END OPENSSH PRIVATE KEY-----\n").Should().BeTrue(); + auth.PublicKey.StartsWith("ssh-rsa").Should().BeTrue(); + } + } +}