From 36cee898d24701e9fdbbf345b16af6af6cb77689 Mon Sep 17 00:00:00 2001
From: Yaroslav Lobankov <y.lobankov@tarantool.org>
Date: Mon, 14 Mar 2022 17:04:07 +0400
Subject: [PATCH] Rerun all failed tests, not only marked as fragile

test-run supports functionality to rerun failed tests in place, but
these tests have to be on the so-called fragile list. To add a test to
the fragile list we need to add a special configuration to the suite.ini
file of a test suite. Configuration example:

    fragile = {
        "retries": 5,
        "tests": {
            "tarantoolctl.test.lua": {
                "issues": [ "gh-5059", "gh-5346" ]
            },
            "debug.test.lua": {
                "issues": [ "gh-5346" ]
            },
            ...
        }
    }

Rerunning failed tests in place is quite convenient because it allows us
to avoid rerunning all tests again and thus save time.

But to make it work as expected we should keep the list of fragile tests
always up-to-date. Flaky tests may be introduced every day and keeping
the list of fragile tests always up-to-date becomes extremely difficult
to do.

So our solution is quite simple: just rerun all failed tests.
By default, the number of retries for regular and fragile tests is 3.
But for fragile tests this number can be overridden in the suite.ini
file.

Closes #328
---
 lib/test_suite.py |  9 ++++++---
 lib/worker.py     | 19 ++++++++++++-------
 2 files changed, 18 insertions(+), 10 deletions(-)

diff --git a/lib/test_suite.py b/lib/test_suite.py
index 5c8831a3..44b51301 100644
--- a/lib/test_suite.py
+++ b/lib/test_suite.py
@@ -45,6 +45,9 @@ class TestSuite:
     server for this suite, the client program to execute individual
     tests and other suite properties. The server is started once per
     suite."""
+
+    RETRIES_COUNT = 3
+
     def get_multirun_conf(self, suite_path):
         conf_name = self.ini.get('config', None)
         if conf_name is None:
@@ -91,7 +94,7 @@ def __init__(self, suite_path, args):
         self.args = args
         self.tests = []
         self.ini = {}
-        self.fragile = {'retries': 0, 'tests': {}}
+        self.fragile = {'retries': self.RETRIES_COUNT, 'tests': {}}
         self.suite_path = suite_path
         self.ini["core"] = "tarantool"
 
@@ -128,7 +131,7 @@ def __init__(self, suite_path, args):
         if config.has_option("default", "fragile"):
             fragiles = config.get("default", "fragile")
             try:
-                self.fragile = json.loads(fragiles)
+                self.fragile.update(json.loads(fragiles))
                 if 'tests' not in self.fragile:
                     raise RuntimeError(
                         "Key 'tests' absent in 'fragile' json: {}"
@@ -288,7 +291,7 @@ def is_parallel(self):
         return self.ini['is_parallel']
 
     def fragile_retries(self):
-        return self.fragile.get('retries', 0)
+        return self.fragile['retries']
 
     def show_reproduce_content(self):
         return self.ini['show_reproduce_content']
diff --git a/lib/worker.py b/lib/worker.py
index 57c95e0c..eb47efd5 100644
--- a/lib/worker.py
+++ b/lib/worker.py
@@ -350,20 +350,25 @@ def run_loop(self, task_queue, result_queue):
                                   'defined in suite.ini but this functionality '
                                   'is dropped' % testname)
                 )
-            retries_left = self.suite.fragile_retries()
+            retries_left = self.suite.RETRIES_COUNT
+            if testname in self.suite.fragile['tests']:
+                retries_left = self.suite.fragile_retries()
             # let's run till short_status became 'pass'
             while short_status in (None, 'fail') and retries_left >= 0:
                 self.restart_server()
                 # print message only after some fails occurred
                 if short_status == 'fail':
-                    color_stdout(
-                        'Test "%s", conf: "%s"\n'
-                        '\tfrom "fragile" list failed, rerunning ...\n'
-                        % (task_id[0], task_id[1]), schema='error')
+                    if testname not in self.suite.fragile['tests']:
+                        color_stdout(
+                            'Test "%s", conf: "%s"\n\tfailed, rerunning ...\n'
+                            % (task_id[0], task_id[1]), schema='error')
+                    else:
+                        color_stdout(
+                            'Test "%s", conf: "%s"\n'
+                            '\tfrom "fragile" list failed, rerunning ...\n'
+                            % (task_id[0], task_id[1]), schema='error')
                 # run task and save the result to short_status
                 short_status, duration = self.run_task(task_id)
-                if testname not in self.suite.fragile['tests']:
-                    break
                 retries_left = retries_left - 1
 
             result_queue.put(self.wrap_result(task_id, short_status, duration))