feat: testing ways to mitigate variability

These mitigations are: alternating the benchmark runs between the two versions, and the step fit algorithm for scoring the difference. I also removed the commented-out emulator screen-recording steps, and the workflow now stores only the JSON result files we care about, without the other Perfetto traces and text files.
pull/2098/head^2
Ahmed Khaled 4 months ago
parent 8134706afc
commit 9e0861d7c2
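For context on the first mitigation: running all v1 iterations and then all v2 iterations lets slow environment drift (emulator warm-up, thermal state, host load) land almost entirely in one group and masquerade as a version difference, while interleaving v1/v2 spreads the drift roughly evenly across both groups. A minimal standalone sketch of that effect, with made-up drift and noise numbers, not part of the commit itself:

import random

random.seed(42)

def run(slot):
    # 100 ms baseline + 2 ms of drift per benchmark slot + random noise.
    # The two "versions" are identical; any gap is environment drift.
    return 100.0 + 2.0 * slot + random.gauss(0, 1.5)

# Sequential: v1 occupies slots 0-4, v2 occupies slots 5-9
seq_v1 = [run(s) for s in range(5)]
seq_v2 = [run(s) for s in range(5, 10)]

# Alternating: v1 takes even slots, v2 takes odd slots
alt_v1 = [run(s) for s in range(0, 10, 2)]
alt_v2 = [run(s) for s in range(1, 10, 2)]

mean = lambda xs: sum(xs) / len(xs)
print(f"sequential gap:  {mean(seq_v2) - mean(seq_v1):+.1f} ms")  # ~ +10 ms of pure drift
print(f"alternating gap: {mean(alt_v2) - mean(alt_v1):+.1f} ms")  # ~ +2 ms (one slot apart)

With identical builds, the sequential split reports a gap of roughly 10 ms that is pure drift, while the alternating split shrinks it to the one-slot difference of about 2 ms.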

.github/scripts/step_fit.py
@@ -0,0 +1,72 @@
import json
import math
import glob
import os

# ----------- CONFIG -----------
RESULTS_DIR = "benchmarks/build/outputs/connected_android_test_additional_output/collected"
BENCHMARK_NAME = "startupPrecompiledWithBaselineProfile"
METRIC_KEY = "timeToInitialDisplayMs"
# ------------------------------


def sum_squared_error(values):
    """Sum of squared deviations from the mean (within-group noise)."""
    avg = sum(values) / len(values)
    return sum((v - avg) ** 2 for v in values)


def step_fit(before, after):
    """Score the gap between the group means relative to the measurement noise.

    The magnitude grows with the difference between the means and shrinks
    with the within-group variance, so a large |result| means the step is
    unlikely to be noise.
    """
    total_squared_error = sum_squared_error(before) + sum_squared_error(after)
    step_error = math.sqrt(total_squared_error) / (len(before) + len(after))
    if step_error == 0.0:
        return 0.0
    return (sum(before) / len(before) - sum(after) / len(after)) / step_error


def extract_median_from_file(path):
    with open(path, "r") as f:
        data = json.load(f)
    for bench in data.get("benchmarks", []):
        if bench.get("name") == BENCHMARK_NAME:
            metrics = bench.get("metrics", {})
            metric = metrics.get(METRIC_KEY, {})
            median = metric.get("median")
            if median is None:
                raise ValueError(f"{METRIC_KEY} median missing in {path}")
            return median
    raise ValueError(f"Benchmark {BENCHMARK_NAME} not found in {path}")


def main():
    before = []
    after = []
    json_files = sorted(glob.glob(os.path.join(RESULTS_DIR, "*.json")))
    if len(json_files) == 0:
        raise RuntimeError("No JSON files found.")
    for path in json_files:
        median = extract_median_from_file(path)
        filename = os.path.basename(path).lower()
        if "v1" in filename:
            before.append(median)
        elif "v2" in filename:
            after.append(median)
        else:
            print(f"Skipping file with unknown label: {filename}")
            continue
        print(f"{filename}: median={median:.3f} ms")
    if len(before) != 5 or len(after) != 5:
        raise RuntimeError(f"Expected 5 runs each, got v1={len(before)}, v2={len(after)}")
    result = step_fit(before, after)
    print("\n-----------------------------")
    print(f"v1 medians: {before}")
    print(f"v2 medians: {after}")
    print(f"Step Fit Result: {result:.4f}")
    print("-----------------------------")
    # A positive result means the v1 mean exceeds the v2 mean; since the
    # metric is a time (lower is better), positive = improvement and
    # negative = regression.
    if abs(result) <= 25:
        print("➡️ Difference is within noise range (low confidence of real regression)")
    elif result > 0:
        print("🚀 v2 is faster than v1 (possible improvement)")
    else:
        print("⚠️ v2 is slower than v1 (possible regression)")


if __name__ == "__main__":
    main()
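As a quick sanity check of the scoring, step_fit can be exercised standalone with made-up medians; the ±25 cutoff below mirrors the threshold in main. The two functions are copied verbatim from the script above so the snippet runs on its own:

import math

def sum_squared_error(values):
    avg = sum(values) / len(values)
    return sum((v - avg) ** 2 for v in values)

def step_fit(before, after):
    total_squared_error = sum_squared_error(before) + sum_squared_error(after)
    step_error = math.sqrt(total_squared_error) / (len(before) + len(after))
    if step_error == 0.0:
        return 0.0
    return (sum(before) / len(before) - sum(after) / len(after)) / step_error

# Same build measured twice: small gap, normal noise -> |result| well under 25.
noisy_same = step_fit([250.1, 252.3, 249.8, 251.0, 250.5],
                      [251.2, 250.0, 252.1, 249.9, 250.8])
# A clear ~30 ms slowdown in v2 -> large negative result.
clear_jump = step_fit([250.1, 252.3, 249.8, 251.0, 250.5],
                      [280.4, 282.1, 279.9, 281.3, 280.6])
print(f"same build twice: {noisy_same:+.2f}  (|result| <= 25 -> noise)")
print(f"30 ms regression: {clear_jump:+.2f}  (negative -> v2 slower)")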

@@ -38,7 +38,7 @@ jobs:
       - name: Build Benchmark APKs
         run: ./gradlew :app:assembleDemoBenchmark :benchmarks:assembleDemoBenchmark
-      - name: Run Benchmarks & Record Video
+      - name: Run Benchmarks
         uses: reactivecircus/android-emulator-runner@v2
         with:
           api-level: 30
@@ -48,23 +48,54 @@ jobs:
           emulator-options: -no-window -gpu swiftshader_indirect -noaudio -no-boot-anim -memory 4096
           disable-animations: true
           script: |
-            # 1. Start recording
-            # adb shell screenrecord --time-limit 180 /sdcard/benchmark_video.mp4 &
-            # 2. Run ONLY the Startup tests
-            ./gradlew :benchmarks:connectedDemoBenchmarkAndroidTest -Pandroid.testInstrumentationRunnerArguments.class=com.google.samples.apps.nowinandroid.startup.StartupBenchmark -Pandroid.testInstrumentationRunnerArguments.androidx.benchmark.suppressErrors=EMULATOR || true
-            # 3. Pull the video (This will run even if the tests above fail)
-            # echo "Pulling video file..."
-            # adb shell pkill -2 screenrecord || true
-            # sleep 5
-            # adb pull /sdcard/benchmark_video.mp4 benchmark_video.mp4
+            set -e
+            OUTPUT_DIR="benchmarks/build/outputs/connected_android_test_additional_output"
+            COLLECTED_DIR="$OUTPUT_DIR/collected"
+            mkdir -p "$COLLECTED_DIR"
+
+            run_benchmark () {
+              VERSION_LABEL=$1  # v1 or v2
+              RUN_NUMBER=$2     # 1..5
+              echo "=============================="
+              echo "Running benchmark for $VERSION_LABEL run $RUN_NUMBER"
+              echo "=============================="
+
+              # Clear app data to keep runs consistent
+              adb shell pm clear com.google.samples.apps.nowinandroid || true
+
+              # Run only the Startup benchmark
+              ./gradlew :benchmarks:connectedDemoBenchmarkAndroidTest \
+                -Pandroid.testInstrumentationRunnerArguments.class=com.google.samples.apps.nowinandroid.startup.StartupBenchmark \
+                -Pandroid.testInstrumentationRunnerArguments.androidx.benchmark.suppressErrors=EMULATOR
+
+              # Find the newest JSON result file
+              LATEST_JSON=$(find "$OUTPUT_DIR" -name "*.json" -type f -printf "%T@ %p\n" | sort -nr | head -n1 | cut -d' ' -f2-)
+              if [ -z "$LATEST_JSON" ]; then
+                echo "Error: No benchmark JSON file found"
+                exit 1
+              fi
+
+              NEW_NAME="$COLLECTED_DIR/benchmark_${VERSION_LABEL}_run${RUN_NUMBER}.json"
+              cp "$LATEST_JSON" "$NEW_NAME"
+              echo "Saved result to $NEW_NAME"
+            }
+
+            # Alternate runs: v1, v2, v1, v2 ...
+            for i in 1 2 3 4 5; do
+              run_benchmark "v1" "$i"
+              run_benchmark "v2" "$i"
+            done
+      - name: Run step fit
+        run: python3 .github/scripts/step_fit.py
       - name: Upload Artifacts
         if: always()
         uses: actions/upload-artifact@v4
         with:
           name: benchmark-results
-          path: |
-            # benchmark_video.mp4
-            benchmarks/build/outputs/connected_android_test_additional_output/
+          path: benchmarks/build/outputs/connected_android_test_additional_output/collected/
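Before the step fit runs, the collected directory is expected to hold exactly five benchmark_v1_run<N>.json and five benchmark_v2_run<N>.json files, named by the cp in run_benchmark above and grouped by step_fit.py via the "v1"/"v2" substring. A hypothetical local helper, not part of the commit, for verifying that naming contract on downloaded artifacts:

import os
import re

# Path matches RESULTS_DIR in step_fit.py; adjust for a local artifact copy.
COLLECTED_DIR = "benchmarks/build/outputs/connected_android_test_additional_output/collected"
# Mirrors benchmark_${VERSION_LABEL}_run${RUN_NUMBER}.json from the workflow.
PATTERN = re.compile(r"benchmark_(v[12])_run(\d+)\.json$")

counts = {"v1": 0, "v2": 0}
for name in sorted(os.listdir(COLLECTED_DIR)):
    m = PATTERN.match(name)
    if not m:
        print(f"unexpected file: {name}")
        continue
    counts[m.group(1)] += 1

print(counts)
assert counts == {"v1": 5, "v2": 5}, "expected 5 runs per version"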