Enable test retry in distributed testing (#5702)

* TM-88 only write the test to the file if it passes; otherwise retry

* TM-88 1 failing test for testing purposes

* TM-88 fail an integration test instead of unit test

* TM-88 include failed test for retry

* TM-88 new logic for retrying failed tests

* TM-88 refactored to use retry class when dealing with test failures + copy results even after pod failure

* TM-88 store remaining pods as a set to prevent duplicates, limit retries to prevent a crashed build, and add a bit more logging

* TM-88 atomic int instead of int array

* TM-88 moving atomic int outside retry loops

* TM-88 removing second retry as it is not needed

* TM-88 small fix to final copy

* TM-88 while loop to prevent destruction of results on test retry

* TM-88 removing shutdown hook

* TM-88 reverting to previous implementation

* TM-88 copying xml files before retry

* TM-88 removing fail
This commit is contained in:
Razvan Codreanu 2019-11-15 15:07:23 +00:00 committed by Stefano Franz
parent b48a714aaa
commit deed4e9763
3 changed files with 32 additions and 12 deletions

View File

@ -7,6 +7,8 @@ import org.gradle.api.Plugin
import org.gradle.api.Project
import org.gradle.api.Task
import org.gradle.api.tasks.testing.Test
import org.gradle.api.tasks.testing.TestResult
import org.gradle.internal.impldep.junit.framework.TestFailure
import java.util.stream.Collectors
@ -252,8 +254,10 @@ class DistributedTesting implements Plugin<Project> {
}
afterTest { desc, result ->
executedTestsFile.withWriterAppend { writer ->
writer.writeLine(desc.getClassName() + "." + desc.getName())
if (result.getResultType() == TestResult.ResultType.SUCCESS ) {
executedTestsFile.withWriterAppend { writer ->
writer.writeLine(desc.getClassName() + "." + desc.getName())
}
}
}
}

View File

@ -18,7 +18,6 @@ import io.fabric8.kubernetes.client.dsl.PodResource;
import io.fabric8.kubernetes.client.utils.Serialization;
import net.corda.testing.retry.Retry;
import okhttp3.Response;
import org.apache.commons.compress.utils.IOUtils;
import org.gradle.api.DefaultTask;
import org.gradle.api.tasks.TaskAction;
import org.jetbrains.annotations.NotNull;
@ -27,8 +26,6 @@ import java.io.BufferedReader;
import java.io.BufferedWriter;
import java.io.ByteArrayOutputStream;
import java.io.File;
import java.io.FileInputStream;
import java.io.FileOutputStream;
import java.io.FileWriter;
import java.io.IOException;
import java.io.InputStream;
@ -36,23 +33,25 @@ import java.io.InputStreamReader;
import java.io.PipedInputStream;
import java.io.PipedOutputStream;
import java.math.BigInteger;
import java.nio.file.Files;
import java.nio.file.Path;
import java.util.ArrayList;
import java.util.Arrays;
import java.util.Collection;
import java.util.Collections;
import java.util.HashSet;
import java.util.LinkedList;
import java.util.List;
import java.util.Optional;
import java.util.Queue;
import java.util.Random;
import java.util.Set;
import java.util.concurrent.CompletableFuture;
import java.util.concurrent.ExecutionException;
import java.util.concurrent.ExecutorService;
import java.util.concurrent.Executors;
import java.util.concurrent.Future;
import java.util.concurrent.TimeUnit;
import java.util.concurrent.atomic.AtomicInteger;
import java.util.stream.Collectors;
import java.util.stream.IntStream;
@ -76,7 +75,7 @@ public class KubesTest extends DefaultTask {
Integer memoryGbPerFork = 6;
public volatile List<File> testOutput = Collections.emptyList();
public volatile List<KubePodResult> containerResults = Collections.emptyList();
private final List<String> remainingPods = Collections.synchronizedList(new ArrayList());
private final Set<String> remainingPods = Collections.synchronizedSet(new HashSet());
public static String NAMESPACE = "thisisatest";
int k8sTimeout = 50 * 1_000;
@ -214,7 +213,7 @@ public class KubesTest extends DefaultTask {
});
int podNumber = podIdx + 1;
final AtomicInteger testRetries = new AtomicInteger(0);
try {
// pods might die, so we retry
return Retry.fixed(numberOfRetries).run(() -> {
@ -248,11 +247,20 @@ public class KubesTest extends DefaultTask {
if (!podLogsDirectory.exists()) {
podLogsDirectory.mkdirs();
}
File podOutput = executeBuild(namespace, numberOfPods, podIdx, podName, podLogsDirectory, printOutput, stdOutOs, stdOutIs, errChannelStream, waiter);
File podOutput = executeBuild(namespace, numberOfPods, podIdx, podName, podLogsDirectory, printOutput, stdOutOs, stdOutIs, errChannelStream, waiter);
int resCode = waiter.join();
getProject().getLogger().lifecycle("build has ended on on pod " + podName + " (" + podNumber + "/" + numberOfPods + ") with result " + resCode + " , gathering results");
Collection<File> binaryResults = downloadTestXmlFromPod(namespace, createdPod);
Collection<File> binaryResults;
//we don't retry on the final attempt as this will crash the build and some pods might not get to finish
if (resCode != 0 && testRetries.getAndIncrement() < numberOfRetries - 1) {
downloadTestXmlFromPod(namespace, createdPod);
getProject().getLogger().lifecycle("There are test failures in this pod. Retrying failed tests!!!");
throw new RuntimeException("There are test failures in this pod");
} else {
binaryResults = downloadTestXmlFromPod(namespace, createdPod);
}
getLogger().lifecycle("removing pod " + podName + " (" + podNumber + "/" + numberOfPods + ") after completed build");
try (KubernetesClient client = getKubernetesClient()) {
@ -267,6 +275,8 @@ public class KubesTest extends DefaultTask {
return new KubePodResult(podIdx, resCode, podOutput, binaryResults);
});
} catch (Retry.RetryException e) {
Pod pod = getKubernetesClient().pods().inNamespace(namespace).create(buildPodRequest(podName, pvc));
downloadTestXmlFromPod(namespace, pod);
throw new RuntimeException("Failed to build in pod " + podName + " (" + podNumber + "/" + numberOfPods + ") in " + numberOfRetries + " attempts", e);
}
}

View File

@ -2,10 +2,14 @@ package net.corda.testing.driver
import net.corda.core.concurrent.CordaFuture
import net.corda.core.identity.CordaX500Name
import net.corda.core.internal.*
import net.corda.core.internal.CertRole
import net.corda.core.internal.concurrent.fork
import net.corda.core.internal.concurrent.openFuture
import net.corda.core.internal.concurrent.transpose
import net.corda.core.internal.div
import net.corda.core.internal.isRegularFile
import net.corda.core.internal.list
import net.corda.core.internal.readLines
import net.corda.core.utilities.getOrThrow
import net.corda.node.internal.NodeStartup
import net.corda.testing.common.internal.ProjectStructure.projectRootDir
@ -15,7 +19,9 @@ import net.corda.testing.core.DUMMY_BANK_B_NAME
import net.corda.testing.http.HttpApi
import net.corda.testing.node.internal.addressMustBeBound
import net.corda.testing.node.internal.addressMustNotBeBound
import org.assertj.core.api.Assertions.*
import org.assertj.core.api.Assertions.assertThat
import org.assertj.core.api.Assertions.assertThatCode
import org.assertj.core.api.Assertions.assertThatIllegalArgumentException
import org.json.simple.JSONObject
import org.junit.Test
import java.util.*