arxiv:2406.04770

WildBench: Benchmarking LLMs with Challenging Tasks from Real Users in the Wild

Published on Jun 7, 2024 · Submitted by AK on Jun 10, 2024

AI-generated summary

WildBench evaluates large language models using human-chatbot queries, providing tasks and metrics that correlate well with human-voted evaluations.

Abstract

We introduce WildBench, an automated evaluation framework designed to benchmark large language models (LLMs) using challenging, real-world user queries. WildBench consists of 1,024 tasks carefully selected from over one million human-chatbot conversation logs. For automated evaluation with WildBench, we have developed two metrics, WB-Reward and WB-Score, which are computable using advanced LLMs such as GPT-4-turbo. WildBench evaluation uses task-specific checklists to evaluate model outputs systematically and provides structured explanations that justify the scores and comparisons, resulting in more reliable and interpretable automatic judgments. WB-Reward employs fine-grained pairwise comparisons between model responses, generating five potential outcomes: much better, slightly better, slightly worse, much worse, or a tie. Unlike previous evaluations that employed a single baseline model, we selected three baseline models at varying performance levels to ensure a comprehensive pairwise evaluation. Additionally, we propose a simple method to mitigate length bias: converting outcomes of "slightly better/worse" to "tie" if the winning response is longer than the losing one by more than K characters. WB-Score evaluates the quality of model outputs individually, making it a fast and cost-efficient evaluation metric. WildBench results demonstrate a strong correlation with the human-voted Elo ratings from Chatbot Arena on hard tasks. Specifically, WB-Reward achieves a Pearson correlation of 0.98 with top-ranking models. Additionally, WB-Score reaches 0.95, surpassing both ArenaHard's 0.91 and AlpacaEval2.0's 0.89 for length-controlled win rates, as well as AlpacaEval2.0's 0.87 for regular win rates.
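To make the scoring concrete, here is a minimal Python sketch of the WB-Reward aggregation described in the abstract: mapping the five pairwise outcomes to scalar rewards and applying the K-character length-bias rule. The reward values, the K default, and the helper names are illustrative assumptions, not the paper's official implementation (see the WildBench GitHub repository for that).

# Minimal sketch (not the official implementation) of WB-Reward's
# five-way outcomes and the K-character length-bias rule.
# Reward values, K default, and helper names are assumptions.

# Assumed mapping from judge outcome (from the evaluated model's
# perspective) to a scalar reward.
OUTCOME_REWARD = {
    "much better": 1.0,
    "slightly better": 0.5,
    "tie": 0.0,
    "slightly worse": -0.5,
    "much worse": -1.0,
}

def apply_length_penalty(outcome: str, winner_len: int, loser_len: int,
                         k: int = 500) -> str:
    """Demote a 'slightly better/worse' outcome to 'tie' when the winning
    response is more than K characters longer than the losing one.
    K = 500 is an illustrative default, not necessarily the paper's setting."""
    if outcome in ("slightly better", "slightly worse") and winner_len - loser_len > k:
        return "tie"
    return outcome

def wb_reward(judgments: list[dict], k: int = 500) -> float:
    """Mean reward over pairwise judgments against the baseline models.

    Each judgment is assumed to carry the judge's `outcome` plus the
    character lengths of the winning and losing responses."""
    total = 0.0
    for j in judgments:
        outcome = apply_length_penalty(j["outcome"], j["winner_len"],
                                       j["loser_len"], k)
        total += OUTCOME_REWARD[outcome]
    return total / len(judgments)

# Example: a model that slightly beats a baseline with a much longer
# answer is credited only with a tie after the length penalty.
print(wb_reward([
    {"outcome": "slightly better", "winner_len": 2400, "loser_len": 800},
    {"outcome": "much better", "winner_len": 900, "loser_len": 850},
    {"outcome": "slightly worse", "winner_len": 700, "loser_len": 650},
]))  # -> (0.0 + 1.0 - 0.5) / 3 ≈ 0.167

Note how the penalty only softens "slightly" verdicts; a "much better/worse" judgment is taken at face value regardless of response length, which matches the rule as stated in the abstract.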

Community



@article{yuchen2024wildbench,
  title={WildBench: Benchmarking LLMs with Challenging Tasks from Real Users in the Wild},
  author={Yuchen Lin, Bill and Deng, Yuntian and Chandu, Khyathi and Brahman, Faeze and Ravichander, Abhilasha and Pyatkin, Valentina and Dziri, Nouha and Le Bras, Ronan and Choi, Yejin},
  journal={arXiv e-prints},
  pages={arXiv--2406},
  year={2024}
}


Models citing this paper 0


Datasets citing this paper 7


Spaces citing this paper 27

Collections including this paper 5