hai 1 ano · 4d38c7b1e4
--- a/.github/workflows/tests.yaml
+++ b/.github/workflows/tests.yaml
@@ -212,6 +212,8 @@ jobs:
 
				           echo '*** bazelisk'
			
 
				           which bazelisk
			
 
				           bazelisk --version
			
 
				+          echo '*** run_bazel.py'
			
 
				+          ./scripts/run_bazel.py --version
			
 
				           echo '*** python'
			
 
				           which python
			
 
				           python --version
			
@@ -296,7 +298,7 @@ jobs:
 
				           build --verbose_failures
			
 
				           test --test_output=errors
			
 
				           EOF
			
 
				-          bazelisk info
			
 
				+          ./scripts/run_bazel.py info
			
 
				 
			
 
				       # Just for visibility, print space before and after the build.
			
 
				       - name: Disk space before build
			
@@ -307,9 +309,13 @@ jobs:
 
				         if: steps.filter.outputs.has_code == 'true'
			
 
				         run: |
			
 
				           exit_code=0
			
 
				-          bazelisk mod deps --lockfile_mode=error || exit_code=$?
			
 
				+          ./scripts/run_bazel.py \
			
 
				+            --attempts=5 \
			
 
				+            mod deps --lockfile_mode=error || exit_code=$?
			
 
				           if (( $exit_code != 0 )); then
			
 
				-            bazelisk mod deps --lockfile_mode=update
			
 
				+            ./scripts/run_bazel.py \
			
 
				+              --attempts=5 \
			
 
				+              mod deps --lockfile_mode=update
			
 
				             echo "MODULE.bazel.lock is out of date! Use below file for update."
			
 
				             echo "Platforms may require merging output, for example by applying"
			
 
				             echo "an update, re-running triggers, and applying the next update."
			
@@ -367,47 +373,13 @@ jobs:
 
				           BAZEL_USE_CPP_ONLY_TOOLCHAIN: 1
			
 
				           TARGETS_FILE: ${{ runner.temp }}/targets
			
 
				         run: |
			
 
				-          for i in {1..5}; do
			
 
				-            if (( $i == 4 )); then
			
 
				-              # Decrease the jobs sharply if we see repeated failures to try to
			
 
				-              # work around transient network errors even if it makes things
			
 
				-              # slower.
			
 
				-              echo "build --jobs=4" >>user.bazelrc
			
 
				-            fi
			
 
				-
			
 
				-            bazel_exit=0
			
 
				-            bazelisk test -c ${{ matrix.build_mode }} \
			
 
				-              --target_pattern_file=$TARGETS_FILE || bazel_exit=$?
			
 
				-
			
 
				-            # If we succeed, we're done.
			
 
				-            if (( $bazel_exit == 0 )); then
			
 
				-              break
			
 
				-            fi
			
 
				-
			
 
				-            # Several error codes are reliably permanent, break immediately.
			
 
				-            # `1`  -- The build failed.
			
 
				-            # `2`  -- Command line or environment problem.
			
 
				-            # `3`  -- Tests failed or timed out, we don't retry at this layer
			
 
				-            #         on execution timeout.
			
 
				-            # `4`  -- No tests found, which should be impossible here.
			
 
				-            # `8`  -- Explicitly interrupted build.
			
 
				-            #
			
 
				-            # Note that `36` is documented as "likely permanent", but we retry
			
 
				-            # it as most of our transient failures actually produce that error
			
 
				-            # code.
			
 
				-            if (( $bazel_exit == 1 || $bazel_exit == 2 || $bazel_exit == 3 || \
			
 
				-                  $bazel_exit == 4 || $bazel_exit == 8 || $bazel_exit == 8 ))
			
 
				-            then
			
 
				-              break
			
 
				-            fi
			
 
				-
			
 
				-            echo "Retrying a failed build as it may be transient..."
			
 
				-            # Also sleep a bit to try to skip over transient machine load.
			
 
				-            sleep $i
			
 
				-          done
			
 
				-
			
 
				-          # Propagate the Bazel exit code.
			
 
				-          exit $bazel_exit
			
 
				+          # Decrease the jobs sharply if we see repeated failures to try to
			
 
				+          # work around transient network errors even if it makes things
			
 
				+          # slower.
			
 
				+          ./scripts/run_bazel.py \
			
 
				+            --attempts=5 --jobs-on-last-attempt=4 \
			
 
				+            test -c ${{ matrix.build_mode }} \
			
 
				+            --target_pattern_file=$TARGETS_FILE
			
 
				 
			
 
				       # Run in the clang-tidy config. This is done as part of tests so that we
			
 
				       # aren't duplicating bazel/llvm setup.
			
@@ -420,7 +392,9 @@ jobs:
 
				         env:
			
 
				           TARGETS_FILE: ${{ runner.temp }}/targets
			
 
				         run: |
			
 
				-          bazelisk build --config=clang-tidy -k \
			
 
				+          ./scripts/run_bazel.py \
			
 
				+            --attempts=5 \
			
 
				+            build --config=clang-tidy -k \
			
 
				             --target_pattern_file=$TARGETS_FILE
			
 
				 
			
 
				       # See "Disk space before build".
			
--- a/scripts/run_bazel.py
+++ b/scripts/run_bazel.py
@@ -12,15 +12,66 @@ Exceptions. See /LICENSE for license information.
 
				 SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
			
 
				 """
			
 
				 
			
 
				-import os
			
 
				-import sys
			
 
				+import argparse
			
 
				+import subprocess
			
 
				+import time
			
 
				 
			
 
				 import scripts_utils
			
 
				 
			
 
				 
			
 
				 def main() -> None:
			
 
				+    parser = argparse.ArgumentParser(description="Runs bazel.")
			
 
				+    parser.add_argument(
			
 
				+        "--attempts",
			
 
				+        metavar="COUNT",
			
 
				+        type=int,
			
 
				+        default=1,
			
 
				+        help="The number of attempts to execute the command, automatically "
			
 
				+        "retrying errors that may be transient.",
			
 
				+    )
			
 
				+    parser.add_argument(
			
 
				+        "--jobs-on-last-attempt",
			
 
				+        metavar="COUNT",
			
 
				+        type=int,
			
 
				+        help="Sets the number of jobs in user.bazelrc on the last attempt. If "
			
 
				+        "there is only one attempt, this will be set immediately.",
			
 
				+    )
			
 
				+    script_args, bazel_args = parser.parse_known_args()
			
 
				+
			
 
				     bazel = scripts_utils.locate_bazel()
			
 
				-    os.execv(bazel, [bazel] + sys.argv[1:])
			
 
				+    attempt = 0
			
 
				+    while True:
			
 
				+        attempt += 1
			
 
				+        if attempt == script_args.attempts and script_args.jobs_on_last_attempt:
			
 
				+            with open("user.bazelrc", "a") as bazelrc:
			
 
				+                bazelrc.write(
			
 
				+                    f"build --jobs={script_args.jobs_on_last_attempt}\n"
			
 
				+                )
			
 
				+
			
 
				+        p = subprocess.run([bazel] + bazel_args)
			
 
				+
			
 
				+        # If this was the last attempt, we're done.
			
 
				+        if attempt == script_args.attempts:
			
 
				+            exit(p.returncode)
			
 
				+
			
 
				+        # Several error codes are reliably permanent, break immediately.
			
 
				+        # `0`  -- Success.
			
 
				+        # `1`  -- The build failed.
			
 
				+        # `2`  -- Command line or environment problem.
			
 
				+        # `3`  -- Tests failed or timed out, we don't retry at this layer
			
 
				+        #         on execution timeout.
			
 
				+        # `4`  -- Test command but no tests found.
			
 
				+        # `8`  -- Explicitly interrupted build.
			
 
				+        #
			
 
				+        # Note that `36` is documented as "likely permanent", but we retry
			
 
				+        # it as most of our transient failures actually produce that error
			
 
				+        # code.
			
 
				+        if p.returncode in (0, 1, 2, 3, 4, 8):
			
 
				+            exit(p.returncode)
			
 
				+
			
 
				+        print("Retrying a failure because it may be transient...")
			
 
				+        # Also sleep a bit to try to skip over transient machine load.
			
 
				+        time.sleep(attempt)
			
 
				 
			
 
				 
			
 
				 if __name__ == "__main__":