We design and use model-based evaluators to both evaluate and autonomously refine the performance of digital agents. Experiments show that domain-general automated evaluators can significantly improve the performance of digital agents, without any extra supervision.
\nJiayi Pan, Yichi Zhang, Nicholas Tomlin, Yifei Zhou, Sergey Levine, Alane Suhr
\nUC Berkeley, University of Michigan
\n","classNames":"hf-sanitized hf-sanitized-k6J7Pqk_OCAbCwixxQwE_"},"users":[{"_id":"61568f37272f2d87a99ba884","avatarUrl":"https://cdn-avatars.huggingface.co/v1/production/uploads/61568f37272f2d87a99ba884/lgvkl5f0rEyiQRVU5FE32.png","isPro":false,"fullname":"Jiayi Pan","user":"Jiayi-Pan","type":"user"},{"_id":"6369b1d456d1f93498130a8a","avatarUrl":"/avatars/8ec228aa6f171715652511f948765db9.svg","isPro":false,"fullname":"Yichi Zhang","user":"594zyc","type":"user"}],"userCount":2,"collections":[],"datasets":[{"author":"Agent-Eval-Refine","downloads":33,"gated":false,"id":"Agent-Eval-Refine/Agent-Trajectories","lastModified":"2024-04-12T22:26:52.000Z","private":false,"repoType":"dataset","likes":5,"isLikedByUser":false,"isBenchmark":false},{"author":"Agent-Eval-Refine","downloads":19,"gated":false,"id":"Agent-Eval-Refine/GUI-Dense-Descriptions","lastModified":"2024-04-02T22:01:26.000Z","datasetsServerInfo":{"viewer":"viewer","numRows":1263,"libraries":["datasets","pandas","mlcroissant","polars"],"formats":["parquet"],"modalities":["image","text"]},"private":false,"repoType":"dataset","likes":5,"isLikedByUser":false,"isBenchmark":false}],"models":[{"author":"Agent-Eval-Refine","authorData":{"_id":"660f3cc1d5671fdffc7a7fe0","avatarUrl":"https://cdn-avatars.huggingface.co/v1/production/uploads/61568f37272f2d87a99ba884/RIWxfVjG_TaAKa-1uWAxf.jpeg","fullname":"Agent-Eval-Refine","name":"Agent-Eval-Refine","type":"org","isHf":false,"isHfAdmin":false,"isMod":false,"followerCount":8,"isUserFollowing":false},"downloads":4,"gated":false,"id":"Agent-Eval-Refine/Captioner","availableInferenceProviders":[],"lastModified":"2024-04-08T00:32:06.000Z","likes":2,"pipeline_tag":"text-generation","private":false,"repoType":"model","isLikedByUser":false,"widgetOutputUrls":[]},{"author":"Agent-Eval-Refine","authorData":{"_id":"660f3cc1d5671fdffc7a7fe0","avatarUrl":"https://cdn-avatars.huggingface.co/v1/production/uploads/61568f37272f2d87a99ba884/RIWxfVjG_TaAKa-1uWAxf.jpeg","fullname":"Agent-
Eval-Refine","name":"Agent-Eval-Refine","type":"org","isHf":false,"isHfAdmin":false,"isMod":false,"followerCount":8,"isUserFollowing":false},"downloads":0,"gated":false,"id":"Agent-Eval-Refine/CogAgent-iOS-SelfTrain","availableInferenceProviders":[],"lastModified":"2024-04-05T05:53:24.000Z","likes":0,"private":false,"repoType":"model","isLikedByUser":false},{"author":"Agent-Eval-Refine","authorData":{"_id":"660f3cc1d5671fdffc7a7fe0","avatarUrl":"https://cdn-avatars.huggingface.co/v1/production/uploads/61568f37272f2d87a99ba884/RIWxfVjG_TaAKa-1uWAxf.jpeg","fullname":"Agent-Eval-Refine","name":"Agent-Eval-Refine","type":"org","isHf":false,"isHfAdmin":false,"isMod":false,"followerCount":8,"isUserFollowing":false},"downloads":0,"gated":false,"id":"Agent-Eval-Refine/CogAgent-iOS-FilteredBC","availableInferenceProviders":[],"lastModified":"2024-04-05T05:06:55.000Z","likes":0,"private":false,"repoType":"model","isLikedByUser":false}],"paperPreviews":[],"spaces":[{"author":"Agent-Eval-Refine","authorData":{"_id":"660f3cc1d5671fdffc7a7fe0","avatarUrl":"https://cdn-avatars.huggingface.co/v1/production/uploads/61568f37272f2d87a99ba884/RIWxfVjG_TaAKa-1uWAxf.jpeg","fullname":"Agent-Eval-Refine","name":"Agent-Eval-Refine","type":"org","isHf":false,"isHfAdmin":false,"isMod":false,"followerCount":8,"isUserFollowing":false},"colorFrom":"gray","colorTo":"blue","createdAt":"2024-04-04T23:12:06.000Z","emoji":"🏢","id":"Agent-Eval-Refine/Captioner","lastModified":"2024-04-10T04:04:56.000Z","likes":16,"pinned":false,"private":false,"sdk":"gradio","repoType":"space","runtime":{"stage":"RUNTIME_ERROR","hardware":{"current":null,"requested":"zero-a10g"},"storage":null,"gcTimeout":172800,"errorMessage":"()\n File \"/usr/local/lib/python3.10/site-packages/gradio/helpers.py\", line 306, in create\n self._start_caching()\n File \"/usr/local/lib/python3.10/site-packages/gradio/helpers.py\", line 357, in _start_caching\n client_utils.synchronize_async(self.cache)\n File 
\"/usr/local/lib/python3.10/site-packages/gradio_client/utils.py\", line 858, in synchronize_async\n return fsspec.asyn.sync(fsspec.asyn.get_loop(), func, *args, **kwargs) # type: ignore\n File \"/usr/local/lib/python3.10/site-packages/fsspec/asyn.py\", line 103, in sync\n raise return_result\n File \"/usr/local/lib/python3.10/site-packages/fsspec/asyn.py\", line 56, in _runner\n result[0] = await coro\n File \"/usr/local/lib/python3.10/site-packages/gradio/helpers.py\", line 478, in cache\n prediction = await Context.root_block.process_api(\n File \"/usr/local/lib/python3.10/site-packages/gradio/blocks.py\", line 1741, in process_api\n result = await self.call_function(\n File \"/usr/local/lib/python3.10/site-packages/gradio/blocks.py\", line 1296, in call_function\n prediction = await anyio.to_thread.run_sync(\n File \"/usr/local/lib/python3.10/site-packages/anyio/to_thread.py\", line 56, in run_sync\n return await get_async_backend().run_sync_in_worker_thread(\n File \"/usr/local/lib/python3.10/site-packages/anyio/_backends/_asyncio.py\", line 2144, in run_sync_in_worker_thread\n return await future\n File \"/usr/local/lib/python3.10/site-packages/anyio/_backends/_asyncio.py\", line 851, in run\n result = context.run(func, *args)\n File \"/usr/local/lib/python3.10/site-packages/gradio/utils.py\", line 751, in wrapper\n response = f(*args, **kwargs)\n File \"/usr/local/lib/python3.10/site-packages/spaces/zero/wrappers.py\", line 160, in gradio_handler\n raise res.value\nRuntimeError: CUDA error: unknown error\nCUDA kernel errors might be asynchronously reported at some other API call, so the stacktrace below might be incorrect.\nFor debugging consider passing CUDA_LAUNCH_BLOCKING=1.\nCompile with `TORCH_USE_CUDA_DSA` to enable device-side assertions.\n\n","replicas":{"requested":1},"devMode":false,"domains":[{"domain":"agent-eval-refine-captioner.hf.space","stage":"READY"}]},"title":"Captioner 
Demo","isLikedByUser":false,"trendingScore":0,"tags":["gradio","region:us"],"featured":false}],"buckets":[],"numBuckets":0,"numDatasets":2,"numModels":3,"numSpaces":2,"lastOrgActivities":[],"acceptLanguages":["*"],"canReadRepos":false,"canReadSpaces":false,"blogPosts":[],"currentRepoPage":0,"filters":{},"paperView":false}">AI & ML interests
None defined yet.
Models and data associated with the research project Autonomous Evaluation and Refinement of Digital Agents.
Paper | Code
We design and use model-based evaluators to both evaluate and autonomously refine the performance of digital agents. Experiments show that domain-general automated evaluators can significantly improve the performance of digital agents, without any extra supervision.
Jiayi Pan, Yichi Zhang, Nicholas Tomlin, Yifei Zhou, Sergey Levine, Alane Suhr
UC Berkeley, University of Michigan