MTSAIR SBS Leaderboard

MTSAIR leaderboard evaluating LLM performance as a judge in side-by-side assessments, in terms of:

  1. correlation with manual judgement,
  2. robustness to positional bias.
{
  • "headers": [
    • "Model",
  • "Avg. Correlation ⬆️",
    • "APCC",
    • "MPCC",
    • "PCon@AB",
    • "MPCC Consistency",
    • "MPCC Swap Delta",
    • "Architecture",
    • "Precision",
    • "Hub License",
    • "#Params (B)",
  • "Hub ❤️",
    • "Model sha"
    ],
  • "data": [
    • [
      • "<a target="_blank" href="https://huggingface.co/anthropic/Claude-3-opus" style="color: var(--link-text-color); text-decoration: underline;text-decoration-style: dotted;">anthropic/Claude-3-opus</a>",
      • 64.16,
      • 68.1,
      • 60.22,
      • 43.12,
      • 59.8,
      • -12.52,
      • "?",
      • "?",
      • "?",
      • 0,
      • 0,
      • "main"
      ],
    • [
      • "<a target="_blank" href="https://huggingface.co/deepseek-ai/DeepSeek-V3" style="color: var(--link-text-color); text-decoration: underline;text-decoration-style: dotted;">deepseek-ai/DeepSeek-V3</a>",
      • 59.73,
      • 62.44,
      • 57.02,
      • 27.06,
      • 16.43,
      • 7.74,
      • "DeepseekV3ForCausalLM",
      • "?",
      • "?",
      • 0,
      • 0,
      • "main"
      ],
    • [
      • "<a target="_blank" href="https://huggingface.co/deepseek-ai/deepseek-r1-distill-llama-70b-awq" style="color: var(--link-text-color); text-decoration: underline;text-decoration-style: dotted;">deepseek-ai/deepseek-r1-distill-llama-70b-awq</a>",
      • 43.32,
      • 22.68,
      • 63.97,
      • 49.97,
      • 84.58,
      • -1.19,
      • "?",
      • "?",
      • "?",
      • 0,
      • 0,
      • "main"
      ],
    • [
      • "<a target="_blank" href="https://huggingface.co/yandex/YandexGPT-4-pro" style="color: var(--link-text-color); text-decoration: underline;text-decoration-style: dotted;">yandex/YandexGPT-4-pro</a>",
      • 40.14,
      • 23.05,
      • 57.23,
      • 40.95,
      • 31.87,
      • 9.25,
      • "?",
      • "?",
      • "?",
      • 0,
      • 0,
      • "main"
      ],
    • [
      • "<a target="_blank" href="https://huggingface.co/meta-llama/llama-3.3-70b-instruct-awq" style="color: var(--link-text-color); text-decoration: underline;text-decoration-style: dotted;">meta-llama/llama-3.3-70b-instruct-awq</a>",
      • 31.8,
      • 4.08,
      • 59.53,
      • 47.58,
      • 59.86,
      • -24.86,
      • "?",
      • "?",
      • "?",
      • 0,
      • 0,
      • "main"
      ],
    • [
      • "<a target="_blank" href="https://huggingface.co/t-tech/T-pro-it-1.0" style="color: var(--link-text-color); text-decoration: underline;text-decoration-style: dotted;">t-tech/T-pro-it-1.0</a>",
      • 28.24,
      • 6.96,
      • 49.51,
      • 39.96,
      • 36.27,
      • 17.95,
      • "Qwen2ForCausalLM",
      • "?",
      • "?",
      • 0,
      • 0,
      • "main"
      ],
    • [
      • "<a target="_blank" href="https://huggingface.co/openai/GPT-4" style="color: var(--link-text-color); text-decoration: underline;text-decoration-style: dotted;">openai/GPT-4</a>",
      • 27.99,
      • -8.22,
      • 64.2,
      • 33.86,
      • 67.44,
      • -25.98,
      • "?",
      • "?",
      • "?",
      • 0,
      • 0,
      • "main"
      ],
    • [
      • "<a target="_blank" href="https://huggingface.co/miqudev/miqu-1-70b" style="color: var(--link-text-color); text-decoration: underline;text-decoration-style: dotted;">miqudev/miqu-1-70b</a>",
      • 27.16,
      • 2.18,
      • 52.13,
      • 23.3,
      • 18.72,
      • -33.8,
      • "?",
      • "?",
      • "?",
      • 0,
      • 0,
      • "main"
      ],
    • [
      • "<a target="_blank" href="https://huggingface.co/t-tech/T-lite-it-1.0" style="color: var(--link-text-color); text-decoration: underline;text-decoration-style: dotted;">t-tech/T-lite-it-1.0</a>",
      • 18.83,
      • 23.83,
      • 13.82,
      • 12.24,
      • -37.03,
      • -9.28,
      • "Qwen2ForCausalLM",
      • "?",
      • "?",
      • 0,
      • 0,
      • "main"
      ],
    • [
      • "<a target="_blank" href="https://huggingface.co/openai/GPT-4o" style="color: var(--link-text-color); text-decoration: underline;text-decoration-style: dotted;">openai/GPT-4o</a>",
      • 17.55,
      • -14.38,
      • 49.47,
      • 32.88,
      • 66.57,
      • -3.45,
      • "?",
      • "?",
      • "?",
      • 0,
      • 0,
      • "main"
      ],
    • [
      • "<a target="_blank" href="https://huggingface.co/anthropic/Claude-3-5-sonnet" style="color: var(--link-text-color); text-decoration: underline;text-decoration-style: dotted;">anthropic/Claude-3-5-sonnet</a>",
      • 15.29,
      • -12.17,
      • 42.74,
      • 10.65,
      • -27.48,
      • -24.1,
      • "?",
      • "?",
      • "?",
      • 0,
      • 0,
      • "main"
      ],
    • [
      • "<a target="_blank" href="https://huggingface.co/meta-llama/Llama-3.1-405B-Instruct" style="color: var(--link-text-color); text-decoration: underline;text-decoration-style: dotted;">meta-llama/Llama-3.1-405B-Instruct</a>",
      • 13.92,
      • -22.87,
      • 50.7,
      • 33.58,
      • 52.57,
      • -5.84,
      • "LlamaForCausalLM",
      • "?",
      • "?",
      • 0,
      • 0,
      • "main"
      ]
    ],
  • "metadata": null
}