add log checking to refactored integration check (#700)

In practice, Application Insights can take up to 3 minutes before something sent to it is available via KQL. This PR logs a start and stop marker such that the integration tests only search for logs during the integration tests. This reduces the complexity when using the integration tests during the development process. Note: this migrated the new functionality from #356 into the latest integration test tools.
2025-06-14 11:08:06 +00:00 · 2021-04-02 17:49:19 -04:00
parent 9c1540aca8
commit ca12904684
7 changed files with 147 additions and 24 deletions
--- a/src/integration-tests/integration-test.py
+++ b/src/integration-tests/integration-test.py
@ -17,6 +17,7 @@
 #    checks on each of the created items for the stage.  This batch processing
 #    allows testing multiple components concurrently.

+import datetime
 import logging
 import os
 import re
@ -26,6 +27,7 @@ from shutil import which
 from typing import Dict, List, Optional, Set, Tuple
 from uuid import UUID, uuid4

+import requests
 from onefuzz.api import Command, Onefuzz
 from onefuzz.backend import ContainerWrapper, wait
 from onefuzz.cli import execute_api
@ -207,6 +209,8 @@ class TestOnefuzz:
        self.pools: Dict[OS, Pool] = {}
        self.test_id = test_id
        self.project = f"test-{self.test_id}"
+        self.start_log_marker = f"integration-test-injection-error-start-{self.test_id}"
+        self.stop_log_marker = f"integration-test-injection-error-stop-{self.test_id}"

    def setup(
        self,
@ -215,6 +219,7 @@ class TestOnefuzz:
        pool_size: int,
        os_list: List[OS],
    ) -> None:
+        self.inject_log(self.start_log_marker)
        for entry in os_list:
            name = PoolName(f"testpool-{entry.name}-{self.test_id}")
            self.logger.info("creating pool: %s:%s", entry.name, name)
@ -686,6 +691,101 @@ class TestOnefuzz:
        if errors:
            raise Exception("cleanup failed")

+    def inject_log(self, message: str) -> None:
+        # This is an *extremely* minimal implementation of the Application Insights rest
+        # API, as discussed here:
+        #
+        # https://apmtips.com/posts/2017-10-27-send-metric-to-application-insights/
+
+        key = self.of.info.get().insights_instrumentation_key
+        assert key is not None, "instrumentation key required for integration testing"
+
+        data = {
+            "data": {
+                "baseData": {
+                    "message": message,
+                    "severityLevel": "Information",
+                    "ver": 2,
+                },
+                "baseType": "MessageData",
+            },
+            "iKey": key,
+            "name": "Microsoft.ApplicationInsights.Message",
+            "time": datetime.datetime.now(datetime.timezone.utc)
+            .astimezone()
+            .isoformat(),
+        }
+
+        requests.post(
+            "https://dc.services.visualstudio.com/v2/track", json=data
+        ).raise_for_status()
+
+    def check_log_end_marker(
+        self,
+    ) -> Tuple[bool, str, bool]:
+        logs = self.of.debug.logs.keyword(
+            self.stop_log_marker, limit=1, timespan="PT1H"
+        )
+        return (
+            len(logs) > 0,
+            "waiting for application insight logs to flush",
+            True,
+        )
+
+    def check_logs_for_errors(self) -> None:
+        # only check for errors that exist between the start and stop markers
+        # also, only check for the most recent 100 errors within the last 3
+        # hours. The records are scanned through in reverse chronological
+        # order.
+
+        self.inject_log(self.stop_log_marker)
+        wait(self.check_log_end_marker, frequency=5.0)
+        self.logger.info("application insights log flushed")
+
+        logs = self.of.debug.logs.keyword("error", limit=100000, timespan="PT3H")
+
+        seen_errors = False
+        seen_stop = False
+        for entry in logs:
+            message = entry.get("message", "")
+            if not seen_stop:
+                if self.stop_log_marker in message:
+                    seen_stop = True
+                continue
+
+            if self.start_log_marker in message:
+                break
+
+            # ignore logging.info coming from Azure Functions
+            if entry.get("customDimensions", {}).get("LogLevel") == "Information":
+                continue
+
+            # ignore warnings coming from the rust code, only be concerned
+            # about errors
+            if (
+                entry.get("severityLevel") == 2
+                and entry.get("sdkVersion") == "rust:0.1.5"
+            ):
+                continue
+
+            # ignore resource not found warnings from azure-functions layer,
+            # which relate to azure-retry issues
+            if (
+                message.startswith("Client-Request-ID=")
+                and "ResourceNotFound" in message
+                and entry.get("sdkVersion", "").startswith("azurefunctions")
+            ):
+                continue
+
+            if message is None:
+                self.logger.error("error log: %s", entry)
+            else:
+                self.logger.error("error log: %s", message)
+            seen_errors = True
+
+        if seen_errors:
+            raise Exception("logs included errors")
+

 class Run(Command):
    def check_jobs(
@ -739,6 +839,11 @@ class Run(Command):
        tester = TestOnefuzz(self.onefuzz, self.logger, test_id=test_id)
        tester.cleanup()

+    def check_logs(self, test_id: UUID, *, endpoint: Optional[str]) -> None:
+        self.onefuzz.__setup__(endpoint=endpoint)
+        tester = TestOnefuzz(self.onefuzz, self.logger, test_id=test_id)
+        tester.check_logs_for_errors()
+
    def test(
        self,
        samples: Directory,
@ -774,6 +879,9 @@ class Run(Command):
                self.logger.warning("not testing crash repro")
            else:
                self.check_repros(test_id, endpoint=endpoint)
+
+            self.check_logs(test_id, endpoint=endpoint)
+
        except Exception as e:
            self.logger.error("testing failed: %s", repr(e))
            error = e