#!/usr/bin/env bash
#
# Automated A/B macrobenchmark runner.
#
# Installs a baseline APK and a candidate APK on the connected
# device/emulator, runs the instrumented benchmark APK against each, and
# pulls the resulting JSON reports into OUTPUT_DIR/{baseline,candidate}.
# Requires `adb` and `apkanalyzer` on PATH and one connected device.

set -euo pipefail

# ---- Configuration & defaults -----------------------------------------------
NUMBER_OF_RUNS=1
PATH_APK_BASELINE=""
PATH_APK_CANDIDATE=""
PATH_APK_BENCHMARK=""
INSTRUMENT_PASSTHROUGH_ARGS=()
OUTPUT_DIR="./macrobenchmark_results"
TEST_RUNNER="androidx.test.runner.AndroidJUnitRunner"
EMULATOR_BENCHMARK_RESULT_DIR="/sdcard/Download"

# ---- Cleanup ----------------------------------------------------------------
TEMP_DIR="$(mktemp -d)"
trap 'rm -rf "${TEMP_DIR}"' EXIT

# Print an error message to stderr and abort the script.
die() {
  echo "err: $*" 1>&2
  exit 1
}

print_usage() {
  # NOTE: option names in the synopsis and example must stay in sync with the
  # parser below (hyphenated, not underscored), and the example must include
  # every mandatory option so it is copy-paste runnable.
  cat << EOF
Usage: $(basename "$0") [OPTIONS] --baseline-apk <apk> --candidate-apk <apk> --benchmark-apk <apk> [-- INSTRUMENT_ARGS]

Automated benchmark script for APKs.

Options:
  -o, --output-dir    Directory where benchmark results will be saved. (Default: "${OUTPUT_DIR}")
  --baseline-apk      Path to the baseline APK file.
  --candidate-apk     Path to the candidate APK file.
  --benchmark-apk     Path to the benchmark APK. Must contain instrumented tests.
  -n, --runs          Set number of runs per benchmark. (Default: 1)
  -h, --help          Display this help message and exit.

Additional Arguments:
  --                  Everything after '--' is passed directly to
                      the adb instrumentation command.

Example:
  $(basename "$0") -o ./macrobenchmark_results --baseline-apk base.apk --candidate-apk candidate.apk --benchmark-apk bench.apk -- -e androidx.benchmark.profiling.mode none
EOF
}

# Resolve the applicationId baked into an APK's manifest.
get_pkg_name() {
  local apk="${1}"
  apkanalyzer manifest application-id "${apk}"
}

# Install an APK, clear app/benchmark state, and reset the on-device result dir.
install_apk() {
  local apk="${1}"
  echo "Installing APK: ${apk}"
  # -r: replace an already-installed package (required on every install after
  #     the first, since baseline/candidate share one application id).
  # -d: allow a versionCode downgrade (the baseline build may be older).
  adb install -r -d "${apk}" > /dev/null || die "failed to install apk '${apk}'"
  # Best-effort state reset; the packages may not be installed yet.
  adb shell pm clear "${APP_PKG_NAME}" > /dev/null 2>&1 || true
  adb shell pm clear "${BENCHMARK_PKG_NAME}" > /dev/null 2>&1 || true
  adb shell "rm -rf ${EMULATOR_BENCHMARK_RESULT_DIR} && mkdir -p ${EMULATOR_BENCHMARK_RESULT_DIR}" > /dev/null || true
}

# Run the instrumented benchmark suite on the device.
run_benchmark() {
  echo "Running benchmarks..."
  # ${arr[@]+...} keeps 'set -u' happy on bash < 4.4 when no passthrough
  # arguments were supplied after '--'.
  adb shell am instrument -w \
    -e androidx.benchmark.suppressErrors EMULATOR \
    -e androidx.benchmark.profiling.mode none \
    -e no-isolated-storage true \
    -e additionalTestOutputDir "${EMULATOR_BENCHMARK_RESULT_DIR}" \
    ${INSTRUMENT_PASSTHROUGH_ARGS[@]+"${INSTRUMENT_PASSTHROUGH_ARGS[@]}"} \
    "${BENCHMARK_PKG_NAME}/${TEST_RUNNER}"
}

# Pull the benchmark JSON report from the device and move it to ${1}.
write_benchmark_result() {
  local dest_path="${1}"
  local pull_temp
  # Split declaration from assignment so a failed substitution is not masked.
  pull_temp="${TEMP_DIR}/pull_$(date +%s)"
  adb pull "${EMULATOR_BENCHMARK_RESULT_DIR}/." "${pull_temp}" > /dev/null
  # Quote the dirname substitution so destination paths with spaces work.
  mkdir -p "$(dirname "${dest_path}")" && mv "${pull_temp}/"*.json "${dest_path}"
}

# ---- Argument parsing --------------------------------------------------------
while [[ $# -gt 0 ]]; do
  case "$1" in
    -h|--help)
      print_usage
      exit 0
      ;;
    -o|--output-dir)
      OUTPUT_DIR="$2"
      shift 2
      ;;
    --baseline-apk)
      PATH_APK_BASELINE="$2"
      shift 2
      ;;
    --candidate-apk)
      PATH_APK_CANDIDATE="$2"
      shift 2
      ;;
    --benchmark-apk)
      PATH_APK_BENCHMARK="$2"
      shift 2
      ;;
    -n|--runs)
      NUMBER_OF_RUNS="$2"
      # Accept positive integers only.
      if ! [[ "${NUMBER_OF_RUNS}" =~ ^[1-9][0-9]*$ ]]; then
        print_usage
        exit 1
      fi
      shift 2
      ;;
    --)
      # Everything after '--' is forwarded verbatim to 'am instrument'.
      shift
      INSTRUMENT_PASSTHROUGH_ARGS+=("$@")
      break
      ;;
    *)
      echo "$(basename "$0"): invalid option -- '$1'"
      echo "Try '$(basename "$0") --help' for more information"
      exit 1
      ;;
  esac
done

# All three APKs are mandatory.
if [[ -z "${PATH_APK_BASELINE}" || -z "${PATH_APK_CANDIDATE}" || -z "${PATH_APK_BENCHMARK}" ]]; then
  print_usage
  exit 1
fi

APP_PKG_NAME=$(get_pkg_name "${PATH_APK_BASELINE}")
BENCHMARK_PKG_NAME=$(get_pkg_name "${PATH_APK_BENCHMARK}")

# The benchmark (test) APK only needs to be installed once.
install_apk "${PATH_APK_BENCHMARK}"
for ((i = 1; i <= NUMBER_OF_RUNS; i++)); do
  echo "--- Starting benchmark run (${i} / ${NUMBER_OF_RUNS}) ---"

  start_time=$SECONDS
  # Baseline and candidate share one filename so downstream tooling can pair
  # the two reports of the same iteration.
  output_filename="${BENCHMARK_PKG_NAME}_$(date +"%Y-%m-%dT%H-%M-%S").json"

  # Baseline
  install_apk "${PATH_APK_BASELINE}"
  run_benchmark
  write_benchmark_result "${OUTPUT_DIR}/baseline/${output_filename}"

  # Candidate
  install_apk "${PATH_APK_CANDIDATE}"
  run_benchmark
  write_benchmark_result "${OUTPUT_DIR}/candidate/${output_filename}"

  duration=$((SECONDS - start_time))
  echo "--- Ending benchmark run (${i} / ${NUMBER_OF_RUNS}) took ${duration}s ---"
done

echo "Benchmark completed. Results in '$OUTPUT_DIR'"
-for ((i=1; i<=${NUMBER_OF_RUNS}; i++)); do - start_time=$(date +%s) - - timestamp=$(date +"%Y-%m-%dT%H-%M-%S") - output_filename="${BENCHMARK_PKG}_${timestamp}.json" - baseline_output_path="${OUTPUT_DIR}/baseline/${output_filename}" - candidate_output_path="${OUTPUT_DIR}/candidate/${output_filename}" - - echo "==============================" - echo "Start iteration (${i} / ${NUMBER_OF_RUNS})" - echo "==============================" - - echo "Starting Baseline Benchmark:" - echo " >> APK file : ${PATH_APK_BASELINE}" - echo " >> Output file path: ${baseline_output_path}" - - install_apk "${PATH_APK_BASELINE}" - run_benchmark - write_benchmark_result "${baseline_output_path}" - - echo "Starting Candidate Benchmark:" - echo " >> APK file : ${PATH_APK_CANDIDATE}" - echo " >> Output file path: ${candidate_output_path}" - - install_apk "${PATH_APK_CANDIDATE}" - run_benchmark - write_benchmark_result "${candidate_output_path}" - - end_time=$(date +%s) - duration=$((end_time - start_time)) - - echo "==============================" - echo "End iteration (${i} / ${NUMBER_OF_RUNS}) took ${duration}s" - echo "==============================" -done diff --git a/.github/scripts/step_fit.py b/.github/scripts/step_fit.py deleted file mode 100644 index ef1691ece..000000000 --- a/.github/scripts/step_fit.py +++ /dev/null @@ -1,111 +0,0 @@ -import argparse -import json -import math -import sys -from pathlib import Path - -# ----------- CONFIG ----------- -BENCHMARK_NAME = "startupPrecompiledWithBaselineProfile" -METRIC_KEY = "timeToInitialDisplayMs" -# ------------------------------ - -def step_fit(a, b): - def sum_squared_error(values): - avg = sum(values) / len(values) - return sum((v - avg) ** 2 for v in values) - - if not a or not b: - return 0.0 - - total_squared_error = sum_squared_error(a) + sum_squared_error(b) - step_error = math.sqrt(total_squared_error) / (len(a) + len(b)) - if step_error == 0.0: - return 0.0 - - return (sum(a) / len(a) - sum(b) / len(b)) / step_error - 
-def extract_median_from_files(paths): - medians = [] - - for path in paths: - with open(path, "r") as f: - data = json.load(f) - - found = False - for bench in data.get("benchmarks", []): - if bench.get("name") == BENCHMARK_NAME: - metrics = bench.get("metrics", {}) - metric = metrics.get(METRIC_KEY, {}) - medians.append(metric.get("median")) - found = True - - if not found: - raise ValueError(f"Metric not found in {path}") - - return medians - -def main(): - parser = argparse.ArgumentParser(prog='Comperator', description='Compare between multiple macrobenchmark test results') - parser.add_argument('baseline_dir', help='Baseline macrobenchmark reports directory') - parser.add_argument('candidate_dir', help='Candidate macrobenchmark reports directory') - args = parser.parse_args() - - baseline_dir = Path(args.baseline_dir) - candidate_dir = Path(args.candidate_dir) - baseline_files = sorted(baseline_dir.glob("*.json")) - candidate_files = sorted(candidate_dir.glob("*.json")) - - if len(baseline_files) <= 0: - print('ERR: baseline has no macrobenchmark results', file=sys.stderr) - exit(1) - - if len(candidate_files) <= 0: - print('ERR: candidate has no macrobenchmark results', file=sys.stderr) - exit(1) - - min_len = min(len(baseline_files), len(candidate_files)) - if len(baseline_files) != len(candidate_files): - print(f"WARN: Length mismatch, using first {min_len} samples. 
baseline: {len(baseline_files)}, candidate: {len(candidate_files)}") - - print('Macrobenchmark Result Mapping:') - print('| Index | Baseline | Candidate |') - print('--------------------------------') - - mismatch_count = 0 - for i in range(min_len): - baseline_filename = baseline_files[i].name.upper() - candidate_filename = candidate_files[i].name.upper() - if baseline_filename != candidate_filename: - mismatch_count += 1 - print('* ', end='') - print(f'{i + 1} {baseline_files[i]} <-> {candidate_files[i]}') - - print('--------------------------------') - print(f'# Match : {min_len - mismatch_count}') - print(f'# Mismatch: {mismatch_count}') - if mismatch_count > 0: - print("WARN: filename mapping mismatch detected. Output prediction may be incorrect") - print() - - baseline_medians = extract_median_from_files(baseline_files[:min_len]) - candidate_medians = extract_median_from_files(candidate_files[:min_len]) - assert (len(baseline_medians) == len(candidate_medians)) - - print(f"Benchmark : {BENCHMARK_NAME}") - print(f"Metric : {METRIC_KEY}") - print(f"Baseline medians : {baseline_medians}") - print(f"Candidate medians: {candidate_medians}") - print("-----------------------------") - print("Result: ", end="") - - result = step_fit(baseline_medians, candidate_medians) - if abs(result) <= 25: - print("Within noise range", end="") - elif result < 0: - print("POSSIBLE REGRESSION", end="") - else: - print("POSSIBLE IMPROVEMENT", end="") - print(f" (Step fit: {result:.4})") - -if __name__ == "__main__": - main() diff --git a/.github/workflows/Build.yaml b/.github/workflows/Build.yaml index edd2c70cf..54f643493 100644 --- a/.github/workflows/Build.yaml +++ b/.github/workflows/Build.yaml @@ -5,7 +5,6 @@ on: env: BASELINE_BRANCH: main - GRADLE_BUILD_CACHE_DIR: ${{ github.workspace }}/.common-gradle-cache jobs: benchmark-android: @@ -49,6 +48,11 @@ jobs: # Jobs on other branches will read entries from the cache but will not write updated entries. 
cache-read-only: false + - name: Setup Python + uses: actions/setup-python@v6 + with: + python-version: '3.13' + - name: Build APKs (Baseline) working-directory: ./baseline run: | @@ -71,15 +75,13 @@ jobs: emulator-options: -no-snapshot -no-window -gpu swiftshader_indirect -noaudio -no-boot-anim -memory 4096 disable-animations: true script: | - adb install ./candidate/benchmarks/build/outputs/apk/demo/benchmarkRelease/benchmarks-demo-benchmarkRelease.apk - chmod +x ./candidate/.github/scripts/run_macrobenchmarks.sh - ./candidate/.github/scripts/run_macrobenchmarks.sh "./baseline/app/build/outputs/apk/demo/benchmarkRelease/app-demo-benchmarkRelease.apk" "./candidate/app/build/outputs/apk/demo/benchmarkRelease/app-demo-benchmarkRelease.apk" + chmod +x ./candidate/.github/scripts/benchmark.sh + ./candidate/.github/scripts/benchmark.sh --baseline-apk "./baseline/app/build/outputs/apk/demo/benchmarkRelease/app-demo-benchmarkRelease.apk" --candidate-apk "./candidate/app/build/outputs/apk/demo/benchmarkRelease/app-demo-benchmarkRelease.apk" --benchmark-apk "./candidate/benchmarks/build/outputs/apk/demo/benchmarkRelease/benchmarks-demo-benchmarkRelease.apk" - name: Compare macrobenchmark results run: | - python3 ./candidate/.github/scripts/step_fit.py \ - "./macrobenchmark_results/baseline" \ - "./macrobenchmark_results/candidate" + pip install git+https://github.com/Frozen-Bytes/benchcomp.git@v1.0.0 + benchcomp --verbose "./macrobenchmark_results/baseline" "./macrobenchmark_results/candidate" - name: Upload Artifacts if: always()