Skip to content

Commit 2203195

Browse files
authored
Merge pull request #86 from sierra-research/leaderboard/add-gemini-3
Leaderboard/Add gemini 3
2 parents 0ed2fd8 + 7bdce0b commit 2203195

File tree

2 files changed

+54
-2
lines changed

2 files changed

+54
-2
lines changed
Lines changed: 51 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,51 @@
1+
{
2+
"model_name": "Gemini 3.0 Pro",
3+
"model_organization": "Google",
4+
"submitting_organization": "Sierra",
5+
"submission_date": "2025-11-18",
6+
"contact_info": {},
7+
"is_new": true,
8+
"trajectories_available": false,
9+
"references": [
10+
{
11+
"title": "Gemini Eval Documentation",
12+
"url": "https://storage.googleapis.com/deepmind-media/gemini/gemini_3_pro_model_evaluation.pdf",
13+
"type": "documentation"
14+
}
15+
],
16+
"results": {
17+
"retail": {
18+
"pass_1": 73.0,
19+
"pass_2": null,
20+
"pass_3": null,
21+
"pass_4": null,
22+
"cost": null
23+
},
24+
"airline": {
25+
"pass_1": 85.3,
26+
"pass_2": null,
27+
"pass_3": null,
28+
"pass_4": null,
29+
"cost": null
30+
},
31+
"telecom": {
32+
"pass_1": 98.0,
33+
"pass_2": null,
34+
"pass_3": null,
35+
"pass_4": null,
36+
"cost": null
37+
}
38+
},
39+
"methodology": {
40+
"evaluation_date": "2025-11-18",
41+
"tau2_bench_version": "v0.1.3",
42+
"user_simulator": "Gemini-3.0-Pro",
43+
"notes": "τ2-bench results for Gemini use the standard Sierra framework with a prompt adjustment that provides instructions relevant to each environment. The user model uses Gemini with a system instruction.",
44+
"verification": {
45+
"modified_prompts": true,
46+
"omitted_questions": false,
47+
"details": "Prompt adjustments were made to provide instructions relevant to each environment. The user model uses Gemini with a system instruction."
48+
}
49+
}
50+
}
51+

web/leaderboard/public/submissions/manifest.json

Lines changed: 3 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -11,7 +11,8 @@
1111
"o3_openai_2025-01-15",
1212
"o4-mini_openai_2024-06-20",
1313
"gpt-5_sierra_2025-08-09",
14-
"qwen3-max_qwen_2024_09_23"
14+
"qwen3-max_qwen_2024_09_23",
15+
"gemini-3-pro_google_2025-11-18"
1516
],
16-
"last_updated": "2025-10-11T01:30:00Z"
17+
"last_updated": "2025-11-18T00:00:00Z"
1718
}

0 commit comments

Comments
 (0)