Add unmanaged nodes integration tests (#2780)

* Add docker file to the runtime tools

* fixes

* bug fixes

* more bug fixes and added doc

* don't overwrite the RUST_LOG env var

* integration test for unmanaged nodes

* add unmanaged parameters to launch()

* adding object_id

* more bug fixes

* bug fixes

* chmod on the linux files in docker

* format

* cleanup merge

* added test_unmanaged command

* cleanup

* use a single image for the docker compose
remove images after the test

* docs and formatting

* format

* format

* format and bug fixes

* using windows server

* fix linux container
make the base image a parameter on windows
use the windows server base image on windows server

* format

* bug fix

* more fixes

* allow reboot

* more fixes

* added more logging around the service principal creation

* format

* more logging

* change restart policy

* fix multi tenant domain

* more fixes

* exit instead of reboot when running inside docker

* remove comment

* build fix

* try_exist instead of exist

* save the docker logs

* bug_fix

* adding timeout

* fix timeout logic

* adding a build profile

* make all agents depend on the first one

* remove profile

* another fix

* restart agent 1

* Update docs/unmnaged-nodes.md

Co-authored-by: Teo Voinea <58236992+tevoinea@users.noreply.github.com>

---------

Co-authored-by: Teo Voinea <58236992+tevoinea@users.noreply.github.com>
This commit is contained in:
Cheick Keita
2023-02-08 11:07:19 -08:00
committed by GitHub
parent f93c75556d
commit d732028201
9 changed files with 404 additions and 51 deletions

View File

@ -88,3 +88,33 @@ onefuzz nodes get <machine_guid>
This should return one entry. Verify that the `pool_name` matched the pool name created earlier.
From here you will be able to schedule jobs on that pool and they will be running.
## Troubleshooting
### increase the verbosity of the logs
It can help when investigating issues to increase the log verbosity. You will need to set the [RUST_LOG](https://docs.rs/env_logger/latest/env_logger/#enabling-logging) environment variable when starting docker
```
docker run --rm --env RUST_LOG=<log_level> <image_name> --machine_id <machine_id>
```
log_level can be any of
- error
- warn
- info
- debug
- trace
### use the container interactively
You can use the container interactively with the following command
windows
```
docker run -it --rm --entrypoint powershell <image_name>
```
linux
```
docker run -it --rm --entrypoint bash <image_name>
```

View File

@ -320,10 +320,6 @@ public class Config : IConfig {
return ResultVoid<TaskConfigError>.Error(new TaskConfigError($"invalid pool: {config.Pool.PoolName}"));
}
if ((config.Task.RebootAfterSetup ?? false) && !pool.OkV.Managed) {
return ResultVoid<TaskConfigError>.Error(new TaskConfigError("reboot_after_setup is not supported for unmanaged pools"));
}
var checkTarget = await CheckTargetExe(config, definition);
if (!checkTarget.IsOk) {
return checkTarget;

View File

@ -90,17 +90,22 @@ impl Reboot {
#[cfg(target_family = "unix")]
pub fn invoke(&self) -> Result<()> {
match std::path::Path::new("/.dockerenv").try_exists() {
Ok(true) => {
info!("running inside docker, exiting instead of rebooting");
std::process::exit(0);
}
_ => {
info!("invoking local reboot command");
Command::new("reboot").arg("-f").status()?;
self.wait_for_reboot()
}
}
}
#[cfg(target_family = "windows")]
pub fn invoke(&self) -> Result<()> {
info!("invoking local reboot command");
Command::new("powershell.exe")
.arg("-Command")
.arg("Restart-Computer")

View File

@ -1281,7 +1281,7 @@ class Pool(Endpoint):
client_secret="<client_secret>",
resource=self.onefuzz._backend.config.endpoint,
tenant=urlparse(self.onefuzz._backend.config.authority).path.strip("/"),
multi_tenant_domain=self.onefuzz._backend.config.tenant_domain,
multi_tenant_domain=self.onefuzz._backend.config.get_multi_tenant_domain(),
)
return pool.config

View File

@ -98,6 +98,12 @@ class BackendConfig(BaseModel):
features: Set[str] = Field(default_factory=set)
tenant_domain: str
def get_multi_tenant_domain(self) -> Optional[str]:
if "https://login.microsoftonline.com/common" in self.authority:
return self.tenant_domain
else:
return None
class Backend:
def __init__(
@ -182,10 +188,11 @@ class Backend:
if not self.config.endpoint:
raise Exception("endpoint not configured")
if "https://login.microsoftonline.com/common" in self.config.authority:
multi_tenant_domain = self.config.get_multi_tenant_domain()
if multi_tenant_domain is not None:
endpoint = urlparse(self.config.endpoint).netloc.split(".")[0]
scopes = [
f"api://{self.config.tenant_domain}/{endpoint}/.default",
f"api://{multi_tenant_domain}/{endpoint}/.default",
]
else:
netloc = urlparse(self.config.endpoint).netloc

View File

@ -307,12 +307,13 @@ def create_application_registration(
error: Optional[Exception] = None
for _ in range(10):
try:
query_microsoft_graph(
service_principal = query_microsoft_graph(
method="POST",
resource="servicePrincipals",
body=service_principal_params,
subscription=subscription_id,
)
logger.info(f"created service principal:\n {service_principal}")
return
except GraphQueryError as err:
# work around timing issue when creating service principal
@ -654,8 +655,11 @@ def assign_instance_app_role(
if len(onefuzz_service_principals) == 0:
raise Exception("onefuzz app service principal not found")
onefuzz_service_principal = onefuzz_service_principals[0]
onefuzz_service_principal = onefuzz_service_principals[0]
logger.info(
f"Assigning app role instance service principal {onefuzz_service_principal['id']}"
)
if isinstance(application_name, str):
application_service_principals = query_microsoft_graph_list(
method="GET",

View File

@ -18,25 +18,32 @@
# allows testing multiple components concurrently.
import datetime
import json
import logging
import os
import re
import shutil
import subprocess
import sys
from textwrap import TextWrapper
import time
import zipfile
from enum import Enum
from shutil import which
from typing import Any, Callable, Dict, List, Optional, Set, Tuple, TypeVar
from uuid import UUID, uuid4
import requests
from onefuzz.api import Command, Onefuzz
from onefuzz.backend import ContainerWrapper, wait
from onefuzz.cli import execute_api
from onefuzztypes.enums import OS, ContainerType, TaskState, VmState, ScalesetState
import yaml
from onefuzztypes.enums import OS, ContainerType, ScalesetState, TaskState, VmState
from onefuzztypes.models import Job, Pool, Repro, Scaleset, Task
from onefuzztypes.primitives import Container, Directory, File, PoolName, Region
from pydantic import BaseModel, Field
from onefuzz.api import Command, Onefuzz
from onefuzz.backend import ContainerWrapper, wait
from onefuzz.cli import execute_api
LINUX_POOL = "linux-test"
WINDOWS_POOL = "linux-test"
BUILD = "0"
@ -299,7 +306,14 @@ def retry(
class TestOnefuzz:
def __init__(
self, onefuzz: Onefuzz, logger: logging.Logger, test_id: UUID, polling_period=30
self,
onefuzz: Onefuzz,
logger: logging.Logger,
test_id: UUID,
polling_period=30,
unmanaged_client_id: Optional[UUID] = None,
unmanaged_client_secret: Optional[str] = None,
unmanaged_principal_id: Optional[UUID] = None,
) -> None:
self.of = onefuzz
self.logger = logger
@ -308,13 +322,13 @@ class TestOnefuzz:
self.start_log_marker = f"integration-test-injection-error-start-{self.test_id}"
self.stop_log_marker = f"integration-test-injection-error-stop-{self.test_id}"
self.polling_period = polling_period
self.tools_dir = f"{self.test_id}/tools"
self.unmanaged_client_id = unmanaged_client_id
self.unmanaged_client_secret = unmanaged_client_secret
self.unmanaged_principal_id = unmanaged_principal_id
def setup(
self,
*,
region: Optional[Region] = None,
pool_size: int,
os_list: List[OS],
self, *, region: Optional[Region] = None, pool_size: int, os_list: List[OS]
) -> None:
def try_info_get(data: Any) -> None:
self.of.info.get()
@ -331,12 +345,211 @@ class TestOnefuzz:
name, pool_size, region=region, initial_size=pool_size
)
class UnmanagedPool:
def __init__(
self,
onefuzz: Onefuzz,
logger: logging.Logger,
test_id: UUID,
pool_name: PoolName,
the_os: OS,
pool_size: int,
unmanaged_client_id: UUID,
unmanaged_client_secret: str,
unmanaged_principal_id: UUID,
save_logs: bool = False,
) -> None:
self.of = onefuzz
self.logger = logger
self.test_id = test_id
self.project = f"test-{self.test_id}"
self.tools_dir = f"{self.test_id}/tools"
self.unmanaged_client_id = unmanaged_client_id
self.unmanaged_client_secret = unmanaged_client_secret
self.pool_name = pool_name
if pool_size < 1:
raise Exception("pool_size must be >= 1")
self.pool_size = pool_size
self.the_os = the_os
self.unmanaged_principal_id = unmanaged_principal_id
self.image_tag = f"unmanaged_agent:{self.test_id}"
self.log_file_path: Optional[str] = None
self.process: Optional[subprocess.Popen[bytes]] = None
self.save_logs = save_logs
def __enter__(self):
self.start_unmanaged_pool()
def __exit__(self, *args):
self.stop_unmanaged_pool()
def get_tools_path(self, the_os: OS):
if the_os == OS.linux:
return os.path.join(self.tools_dir, "linux")
elif the_os == OS.windows:
return os.path.join(self.tools_dir, "win64")
else:
raise Exception(f"unsupported os: {the_os}")
def start_unmanaged_pool(self):
self.logger.info("creating pool: %s:%s", self.the_os.name, self.pool_name)
self.of.pools.create(
self.pool_name,
self.the_os,
unmanaged=True,
object_id=self.unmanaged_principal_id,
)
os.makedirs(self.tools_dir, exist_ok=True)
self.logger.info("starting unmanaged pools docker containers")
if self.unmanaged_client_id is None or self.unmanaged_client_secret is None:
raise Exception(
"unmanaged_client_id and unmanaged_client_secret must be set to test the unmanaged scenario"
)
self.logger.info("downloading tools")
self.of.tools.get(self.tools_dir)
self.logger.info("extracting tools")
with zipfile.ZipFile(
os.path.join(self.tools_dir, "tools.zip"), "r"
) as zip_ref:
zip_ref.extractall(self.tools_dir)
tools_path = self.get_tools_path(self.the_os)
self.logger.info("creating docker compose file")
services = list(
map(
lambda x: {
f"agent{x+1}": {
"depends_on": ["agent1"],
"image": self.image_tag,
"command": f"--machine_id {uuid4()}",
"restart": "unless-stopped",
}
},
range(1, self.pool_size - 1),
)
)
build = {"context": "."}
if self.the_os == OS.windows:
windows_type = subprocess.check_output(
"powershell -c (Get-ComputerInfo).OsProductType", shell=True
)
if windows_type.strip() == b"Workstation":
self.logger.info("using windows workstation image")
build = {
"context": ".",
"args": {"BASE_IMAGE": "mcr.microsoft.com/windows:ltsc2019"},
}
else:
self.logger.info("using windows server image")
build = {
"context": ".",
"args": {
"BASE_IMAGE": "mcr.microsoft.com/windows/server:ltsc2022"
},
}
# create docker compose file
compose = {
"version": "3",
"services": {
"agent1": {
"image": self.image_tag,
"build": build,
"command": f"--machine_id {uuid4()}",
"restart": "unless-stopped",
}
},
}
for service in services:
key = next(iter(service.keys()))
compose["services"][key] = service[key]
docker_compose_path = os.path.join(tools_path, "docker-compose.yml")
self.logger.info(
f"writing docker-compose.yml to {docker_compose_path}:\n{yaml.dump(compose)}"
)
with open(docker_compose_path, "w") as f:
yaml.dump(compose, f)
self.logger.info(f"retrieving base config.json from {self.pool_name}")
config = self.of.pools.get_config(self.pool_name)
self.logger.info(f"updating config.json with unmanaged credentials")
config.client_credentials.client_id = self.unmanaged_client_id
config.client_credentials.client_secret = self.unmanaged_client_secret
config_path = os.path.join(tools_path, "config.json")
self.logger.info(f"writing config.json to {config_path}")
with open(config_path, "w") as f:
f.write(config.json())
self.logger.info(f"starting docker compose")
log_file_name = "docker-logs.txt"
self.log_file_path = os.path.join(tools_path, log_file_name)
subprocess.check_call(
"docker compose up -d --force-recreate --build",
shell=True,
cwd=tools_path,
)
if self.save_logs:
self.process = subprocess.Popen(
f"docker compose logs -f > {log_file_name} 2>&1",
shell=True,
cwd=tools_path,
)
def stop_unmanaged_pool(self):
tools_path = self.get_tools_path(self.the_os)
subprocess.check_call(
"docker compose rm --stop --force", shell=True, cwd=tools_path
)
subprocess.check_call(
f"docker image rm {self.image_tag}", shell=True, cwd=tools_path
)
def create_unmanaged_pool(
self, pool_size: int, the_os: OS, save_logs: bool = False
) -> "UnmanagedPool":
if (
self.unmanaged_client_id is None
or self.unmanaged_client_secret is None
or self.unmanaged_principal_id is None
):
raise Exception(
"unmanaged_client_id, unmanaged_client_secret and unmanaged_principal_id must be set to test the unmanaged scenario"
)
return self.UnmanagedPool(
self.of,
self.logger,
self.test_id,
PoolName(f"unmanaged-testpool-{self.test_id}"),
the_os,
pool_size,
unmanaged_client_id=self.unmanaged_client_id,
unmanaged_client_secret=self.unmanaged_client_secret,
unmanaged_principal_id=self.unmanaged_principal_id,
save_logs=save_logs,
)
def launch(
self, path: Directory, *, os_list: List[OS], targets: List[str], duration=int
self,
path: Directory,
*,
os_list: List[OS],
targets: List[str],
duration=int,
unmanaged_pool: Optional[UnmanagedPool] = None,
) -> List[UUID]:
"""Launch all of the fuzzing templates"""
pools = {}
if unmanaged_pool is not None:
pools[unmanaged_pool.the_os] = self.of.pools.get(unmanaged_pool.pool_name)
else:
for pool in self.of.pools.list():
pools[pool.os] = pool
@ -355,7 +568,9 @@ class TestOnefuzz:
self.logger.info("launching: %s", target)
if config.setup_dir is None:
setup = Directory(os.path.join(path, target)) if config.use_setup else None
setup = (
Directory(os.path.join(path, target)) if config.use_setup else None
)
else:
setup = config.setup_dir
@ -521,6 +736,7 @@ class TestOnefuzz:
poll: bool = False,
stop_on_complete_check: bool = False,
job_ids: List[UUID] = [],
timeout: datetime.timedelta = datetime.timedelta(hours=1),
) -> bool:
"""Check all of the integration jobs"""
jobs: Dict[UUID, Job] = {
@ -561,16 +777,19 @@ class TestOnefuzz:
if poll:
print("")
start = datetime.datetime.utcnow()
def check_jobs_impl() -> Tuple[bool, str, bool]:
self.cleared = False
failed_jobs: Set[UUID] = set()
job_task_states: Dict[UUID, Set[TaskTestState]] = {}
if datetime.datetime.utcnow() - start > timeout:
return (True, "timed out while checking jobs", False)
for job_id in check_containers:
finished_containers: Set[Container] = set()
for (container_name, container_impl) in check_containers[
job_id
].items():
for container_name, container_impl in check_containers[job_id].items():
container_client, count = container_impl
if len(container_client.list_blobs()) >= count:
clear()
@ -737,10 +956,10 @@ class TestOnefuzz:
OS.linux: ("info reg rip", r"^rip\s+0x[a-f0-9]+\s+0x[a-f0-9]+"),
}
for (job, repro) in list(repros.values()):
for job, repro in list(repros.values()):
repros[job.job_id] = (job, self.of.repro.get(repro.vm_id))
for (job, repro) in list(repros.values()):
for job, repro in list(repros.values()):
if repro.error:
clear()
self.logger.error(
@ -782,7 +1001,7 @@ class TestOnefuzz:
del repros[job.job_id]
repro_states: Dict[str, List[str]] = {}
for (job, repro) in repros.values():
for job, repro in repros.values():
if repro.state.name not in repro_states:
repro_states[repro.state.name] = []
repro_states[repro.state.name].append(job.config.name)
@ -836,10 +1055,12 @@ class TestOnefuzz:
)
try:
self.of.pools.shutdown(pool.name, now=True)
except Exception as e:
self.logger.error("cleanup of pool failed: %s - %s", pool.name, e)
errors.append(e)
shutil.rmtree(self.tools_dir)
container_names = set()
for job in jobs:
for task in self.of.tasks.list(job_id=job.job_id, state=None):
@ -935,10 +1156,7 @@ class TestOnefuzz:
# ignore warnings coming from the rust code, only be concerned
# about errors
if (
entry.get("severityLevel") == 2
and "rust" in entry.get("sdkVersion")
):
if entry.get("severityLevel") == 2 and "rust" in entry.get("sdkVersion"):
continue
# ignore resource not found warnings from azure-functions layer,
@ -997,7 +1215,9 @@ class Run(Command):
)
tester = TestOnefuzz(self.onefuzz, self.logger, test_id)
result = tester.check_jobs(
poll=poll, stop_on_complete_check=stop_on_complete_check, job_ids=job_ids
poll=poll,
stop_on_complete_check=stop_on_complete_check,
job_ids=job_ids,
)
if not result:
raise Exception("jobs failed")
@ -1050,8 +1270,16 @@ class Run(Command):
retry(self.logger, try_setup, "trying to configure")
tester = TestOnefuzz(self.onefuzz, self.logger, test_id)
tester.setup(region=region, pool_size=pool_size, os_list=os_list)
tester = TestOnefuzz(
self.onefuzz,
self.logger,
test_id,
)
tester.setup(
region=region,
pool_size=pool_size,
os_list=os_list,
)
def launch(
self,
@ -1081,7 +1309,6 @@ class Run(Command):
retry(self.logger, try_setup, "trying to configure")
tester = TestOnefuzz(self.onefuzz, self.logger, test_id)
job_ids = tester.launch(
samples, os_list=os_list, targets=targets, duration=duration
)
@ -1136,7 +1363,6 @@ class Run(Command):
test_id: UUID,
job_ids: List[UUID] = [],
) -> None:
self.check_jobs(
test_id,
endpoint=endpoint,
@ -1160,6 +1386,76 @@ class Run(Command):
job_ids=job_ids,
)
def test_unmanaged(
self,
samples: Directory,
os: OS,
*,
test_id: Optional[UUID] = None,
endpoint: Optional[str] = None,
authority: Optional[str] = None,
client_id: Optional[str] = None,
client_secret: Optional[str] = None,
pool_size: int = 4,
targets: List[str] = list(TARGETS.keys()),
duration: int = 1,
unmanaged_client_id: Optional[UUID] = None,
unmanaged_client_secret: Optional[str] = None,
unmanaged_principal_id: Optional[UUID] = None,
save_logs: bool = False,
timeout_in_minutes: int = 60,
) -> None:
if test_id is None:
test_id = uuid4()
self.logger.info("test_unmanaged test_id: %s", test_id)
try:
def try_setup(data: Any) -> None:
self.onefuzz.__setup__(
endpoint=endpoint,
client_id=client_id,
client_secret=client_secret,
authority=authority,
)
retry(self.logger, try_setup, "trying to configure")
tester = TestOnefuzz(
self.onefuzz,
self.logger,
test_id,
unmanaged_client_id=unmanaged_client_id,
unmanaged_client_secret=unmanaged_client_secret,
unmanaged_principal_id=unmanaged_principal_id,
)
unmanaged_pool = tester.create_unmanaged_pool(
pool_size, os, save_logs=save_logs
)
with unmanaged_pool:
tester.launch(
samples,
os_list=[os],
targets=targets,
duration=duration,
unmanaged_pool=unmanaged_pool,
)
result = tester.check_jobs(
poll=True,
stop_on_complete_check=True,
timeout=datetime.timedelta(minutes=timeout_in_minutes),
)
if not result:
raise Exception("jobs failed")
else:
self.logger.info("****** testing succeeded")
except Exception as e:
self.logger.error("testing failed: %s", repr(e))
sys.exit(1)
except KeyboardInterrupt:
self.logger.error("interrupted testing")
sys.exit(1)
def test(
self,
samples: Directory,
@ -1174,6 +1470,8 @@ class Run(Command):
targets: List[str] = list(TARGETS.keys()),
skip_repro: bool = False,
duration: int = 1,
unmanaged_client_id: Optional[UUID] = None,
unmanaged_client_secret: Optional[str] = None,
) -> None:
success = True
@ -1190,8 +1488,18 @@ class Run(Command):
)
retry(self.logger, try_setup, "trying to configure")
tester = TestOnefuzz(self.onefuzz, self.logger, test_id)
tester.setup(region=region, pool_size=pool_size, os_list=os_list)
tester = TestOnefuzz(
self.onefuzz,
self.logger,
test_id,
unmanaged_client_id=unmanaged_client_id,
unmanaged_client_secret=unmanaged_client_secret,
)
tester.setup(
region=region,
pool_size=pool_size,
os_list=os_list,
)
tester.launch(samples, os_list=os_list, targets=targets, duration=duration)
result = tester.check_jobs(poll=True, stop_on_complete_check=True)
if not result:

View File

@ -7,14 +7,16 @@ RUN echo "#!/bin/bash\n\$@" > /usr/bin/sudo && chmod +x /usr/bin/sudo
RUN mkdir /onefuzz
COPY . /onefuzz
RUN chmod +x /onefuzz/setup.sh
RUN chmod +x /onefuzz/tools/linux/run.sh
RUN export DEBIAN_FRONTEND=noninteractive && export DOCKER_BUILD=1 && cd /onefuzz && ./setup.sh
RUN export DEBIAN_FRONTEND=noninteractive && export DOCKER_BUILD=1 && cd /onefuzz && ./setup.sh fuzz
RUN export DEBIAN_FRONTEND=noninteractive \
&& apt-get -y install --no-install-recommends \
libunwind-dev build-essential libssl-dev \
pkg-config lldb ca-certificates
RUN mkdir -p /onefuzz/tools/linux \
&& mv /onefuzz/downloaded/* /onefuzz/tools/linux
RUN chmod +x /onefuzz/tools/linux/run.sh
RUN chmod +x /onefuzz/tools/linux/onefuzz-agent
RUN chmod +x /onefuzz/tools/linux/onefuzz-task
WORKDIR /onefuzz
ENTRYPOINT ["./tools/linux/run.sh"]

View File

@ -2,7 +2,8 @@
# Copyright (c) Microsoft Corporation.
# Licensed under the MIT License.
FROM mcr.microsoft.com/windows:ltsc2019 as base_onefuzz
ARG BASE_IMAGE=mcr.microsoft.com/windows:ltsc2019
FROM ${BASE_IMAGE}
SHELL ["powershell.exe", "-ExecutionPolicy", "Unrestricted", "-Command"]