Explorar o código

Expand bazel retries in tests. (#3983)

Trying to better address ephemeral issues such as:

https://github.com/carbon-language/carbon-lang/actions/runs/9226025647/job/25384877352

`bazelisk test` currently has some retries; this migrates that retry logic
into a script (`scripts/run_bazel.py`) and uses the script in the other
places where we invoke bazelisk.

---------

Co-authored-by: Chandler Carruth <chandlerc@gmail.com>
Jon Ross-Perkins hai 1 ano
pai
achega
4d38c7b1e4
Modificáronse 2 ficheiros con 73 adicións e 48 borrados
  1. 19 45
      .github/workflows/tests.yaml
  2. 54 3
      scripts/run_bazel.py

+ 19 - 45
.github/workflows/tests.yaml

@@ -212,6 +212,8 @@ jobs:
           echo '*** bazelisk'
           which bazelisk
           bazelisk --version
+          echo '*** run_bazel.py'
+          ./scripts/run_bazel.py --version
           echo '*** python'
           which python
           python --version
@@ -296,7 +298,7 @@ jobs:
           build --verbose_failures
           test --test_output=errors
           EOF
-          bazelisk info
+          ./scripts/run_bazel.py info
 
       # Just for visibility, print space before and after the build.
       - name: Disk space before build
@@ -307,9 +309,13 @@ jobs:
         if: steps.filter.outputs.has_code == 'true'
         run: |
           exit_code=0
-          bazelisk mod deps --lockfile_mode=error || exit_code=$?
+          ./scripts/run_bazel.py \
+            --attempts=5 \
+            mod deps --lockfile_mode=error || exit_code=$?
           if (( $exit_code != 0 )); then
-            bazelisk mod deps --lockfile_mode=update
+            ./scripts/run_bazel.py \
+              --attempts=5 \
+              mod deps --lockfile_mode=update
             echo "MODULE.bazel.lock is out of date! Use below file for update."
             echo "Platforms may require merging output, for example by applying"
             echo "an update, re-running triggers, and applying the next update."
@@ -367,47 +373,13 @@ jobs:
           BAZEL_USE_CPP_ONLY_TOOLCHAIN: 1
           TARGETS_FILE: ${{ runner.temp }}/targets
         run: |
-          for i in {1..5}; do
-            if (( $i == 4 )); then
-              # Decrease the jobs sharply if we see repeated failures to try to
-              # work around transient network errors even if it makes things
-              # slower.
-              echo "build --jobs=4" >>user.bazelrc
-            fi
-
-            bazel_exit=0
-            bazelisk test -c ${{ matrix.build_mode }} \
-              --target_pattern_file=$TARGETS_FILE || bazel_exit=$?
-
-            # If we succeed, we're done.
-            if (( $bazel_exit == 0 )); then
-              break
-            fi
-
-            # Several error codes are reliably permanent, break immediately.
-            # `1`  -- The build failed.
-            # `2`  -- Command line or environment problem.
-            # `3`  -- Tests failed or timed out, we don't retry at this layer
-            #         on execution timeout.
-            # `4`  -- No tests found, which should be impossible here.
-            # `8`  -- Explicitly interrupted build.
-            #
-            # Note that `36` is documented as "likely permanent", but we retry
-            # it as most of our transient failures actually produce that error
-            # code.
-            if (( $bazel_exit == 1 || $bazel_exit == 2 || $bazel_exit == 3 || \
-                  $bazel_exit == 4 || $bazel_exit == 8 || $bazel_exit == 8 ))
-            then
-              break
-            fi
-
-            echo "Retrying a failed build as it may be transient..."
-            # Also sleep a bit to try to skip over transient machine load.
-            sleep $i
-          done
-
-          # Propagate the Bazel exit code.
-          exit $bazel_exit
+          # Decrease the jobs sharply if we see repeated failures to try to
+          # work around transient network errors even if it makes things
+          # slower.
+          ./scripts/run_bazel.py \
+            --attempts=5 --jobs-on-last-attempt=4 \
+            test -c ${{ matrix.build_mode }} \
+            --target_pattern_file=$TARGETS_FILE
 
       # Run in the clang-tidy config. This is done as part of tests so that we
       # aren't duplicating bazel/llvm setup.
@@ -420,7 +392,9 @@ jobs:
         env:
           TARGETS_FILE: ${{ runner.temp }}/targets
         run: |
-          bazelisk build --config=clang-tidy -k \
+          ./scripts/run_bazel.py \
+            --attempts=5 \
+            build --config=clang-tidy -k \
             --target_pattern_file=$TARGETS_FILE
 
       # See "Disk space before build".

+ 54 - 3
scripts/run_bazel.py

@@ -12,15 +12,66 @@ Exceptions. See /LICENSE for license information.
 SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
 """
 
-import os
-import sys
+import argparse
+import subprocess
+import time
 
 import scripts_utils
 
 
 def main() -> None:
+    parser = argparse.ArgumentParser(description="Runs bazel.")
+    parser.add_argument(
+        "--attempts",
+        metavar="COUNT",
+        type=int,
+        default=1,
+        help="The number of attempts to execute the command, automatically "
+        "retrying errors that may be transient.",
+    )
+    parser.add_argument(
+        "--jobs-on-last-attempt",
+        metavar="COUNT",
+        type=int,
+        help="Sets the number of jobs in user.bazelrc on the last attempt. If "
+        "there is only one attempt, this will be set immediately.",
+    )
+    script_args, bazel_args = parser.parse_known_args()
+
     bazel = scripts_utils.locate_bazel()
-    os.execv(bazel, [bazel] + sys.argv[1:])
+    attempt = 0
+    while True:
+        attempt += 1
+        if attempt == script_args.attempts and script_args.jobs_on_last_attempt:
+            with open("user.bazelrc", "a") as bazelrc:
+                bazelrc.write(
+                    f"build --jobs={script_args.jobs_on_last_attempt}\n"
+                )
+
+        p = subprocess.run([bazel] + bazel_args)
+
+        # If this was the last attempt, we're done.
+        if attempt == script_args.attempts:
+            exit(p.returncode)
+
+        # Several error codes are reliably permanent, break immediately.
+        # `0`  -- Success.
+        # `1`  -- The build failed.
+        # `2`  -- Command line or environment problem.
+        # `3`  -- Tests failed or timed out, we don't retry at this layer
+        #         on execution timeout.
+        # `4`  -- Test command but no tests found.
+        # `8`  -- Explicitly interrupted build.
+        #
+        # Note that `36` is documented as "likely permanent", but we retry
+        # it as most of our transient failures actually produce that error
+        # code.
+        if p.returncode in (0, 1, 2, 3, 4, 8):
+            exit(p.returncode)
+
+        print("Retrying a failure because it may be transient...")
+        # Also sleep a bit to try to skip over transient machine load.
+        time.sleep(attempt)
 
 
 if __name__ == "__main__":