Deprecated: The each() function is deprecated. This message will be suppressed on further calls in /home/zhenxiangba/zhenxiangba.com/public_html/phproxy-improved-master/index.php on line 456 Paper page - VIBEVOICE-ASR Technical Report
Please give a thumbs up to this comment if you found it helpful!
\n
If you want recommendations for any Paper on Hugging Face, check out this Space
\n
You can directly ask Librarian Bot for paper recommendations by tagging it in a comment: @librarian-bot recommend
\n","updatedAt":"2026-01-28T01:39:56.289Z","author":{"_id":"63d3e0e8ff1384ce6c5dd17d","avatarUrl":"https://cdn-avatars.huggingface.co/v1/production/uploads/1674830754237-63d3e0e8ff1384ce6c5dd17d.jpeg","fullname":"Librarian Bot (Bot)","name":"librarian-bot","type":"user","isPro":false,"isHf":false,"isHfAdmin":false,"isMod":false,"followerCount":318,"isUserFollowing":false}},"numEdits":0,"identifiedLanguage":{"language":"en","probability":0.6883174777030945},"editors":["librarian-bot"],"editorAvatarUrls":["https://cdn-avatars.huggingface.co/v1/production/uploads/1674830754237-63d3e0e8ff1384ce6c5dd17d.jpeg"],"reactions":[],"isReport":false}},{"id":"69797a50649d2b5c8d77a869","author":{"_id":"65243980050781c16f234f1f","avatarUrl":"/avatars/743a009681d5d554c27e04300db9f267.svg","fullname":"Avi","name":"avahal","type":"user","isPro":false,"isHf":false,"isHfAdmin":false,"isMod":false,"followerCount":3,"isUserFollowing":false},"createdAt":"2026-01-28T02:54:08.000Z","type":"comment","data":{"edited":false,"hidden":false,"latest":{"raw":"arXivlens breakdown of this paper ๐ https://arxivlens.com/PaperView/Details/vibevoice-asr-technical-report-4449-77afe093\n\n- Executive Summary\n- Detailed Breakdown\n- Practical Applications","html":"
\n","updatedAt":"2026-01-28T02:54:08.761Z","author":{"_id":"65243980050781c16f234f1f","avatarUrl":"/avatars/743a009681d5d554c27e04300db9f267.svg","fullname":"Avi","name":"avahal","type":"user","isPro":false,"isHf":false,"isHfAdmin":false,"isMod":false,"followerCount":3,"isUserFollowing":false}},"numEdits":0,"identifiedLanguage":{"language":"en","probability":0.6774253845214844},"editors":["avahal"],"editorAvatarUrls":["/avatars/743a009681d5d554c27e04300db9f267.svg"],"reactions":[],"isReport":false}}],"primaryEmailConfirmed":false,"paper":{"id":"2601.18184","authors":[{"_id":"697832bd026bdf0473116e89","user":{"_id":"68e3ba91fc9f7cbda9a2be02","avatarUrl":"https://cdn-avatars.huggingface.co/v1/production/uploads/no-auth/5h166wHkdQSG7IusNV5wT.png","isPro":false,"fullname":"pengzhiliang","user":"zhiliang2","type":"user"},"name":"Zhiliang Peng","status":"admin_assigned","statusLastChangedAt":"2026-01-27T14:29:35.447Z","hidden":false},{"_id":"697832bd026bdf0473116e8a","name":"Jianwei Yu","hidden":false},{"_id":"697832bd026bdf0473116e8b","user":{"_id":"65e552cb4dbf9514fb0c3110","avatarUrl":"/avatars/655dd9a36e85c1290855fb2c296472f8.svg","isPro":false,"fullname":"Yaoyao Chang","user":"YaoyaoChang","type":"user"},"name":"Yaoyao Chang","status":"admin_assigned","statusLastChangedAt":"2026-01-27T14:30:15.096Z","hidden":false},{"_id":"697832bd026bdf0473116e8c","name":"Zilong Wang","hidden":false},{"_id":"697832bd026bdf0473116e8d","name":"Li Dong","hidden":false},{"_id":"697832bd026bdf0473116e8e","user":{"_id":"672b6da69380700b60c92367","avatarUrl":"/avatars/651b8bd4d1d6bbd047c6f0d6010a0ea3.svg","isPro":false,"fullname":"Yingbo Hao","user":"YingboHao","type":"user"},"name":"Yingbo Hao","status":"admin_assigned","statusLastChangedAt":"2026-01-27T14:30:33.802Z","hidden":false},{"_id":"697832bd026bdf0473116e8f","name":"Yujie Tu","hidden":false},{"_id":"697832bd026bdf0473116e90","name":"Chenyu 
Yang","hidden":false},{"_id":"697832bd026bdf0473116e91","user":{"_id":"661396d69ef83c1509d41c3f","avatarUrl":"/avatars/9760ac72f2d44320e6033d30e2ce7bd5.svg","isPro":false,"fullname":"Wenhui Wang","user":"stonewh1","type":"user"},"name":"Wenhui Wang","status":"admin_assigned","statusLastChangedAt":"2026-01-27T14:30:44.557Z","hidden":false},{"_id":"697832bd026bdf0473116e92","name":"Songchen Xu","hidden":false},{"_id":"697832bd026bdf0473116e93","name":"Yutao Sun","hidden":false},{"_id":"697832bd026bdf0473116e94","name":"Hangbo Bao","hidden":false},{"_id":"697832bd026bdf0473116e95","user":{"_id":"66f4cc8dcb62f781535a27ac","avatarUrl":"/avatars/2b6cf6513a0c93f932115634092f7d30.svg","isPro":false,"fullname":"Weijiang Xu","user":"WeijiangXU","type":"user"},"name":"Weijiang Xu","status":"admin_assigned","statusLastChangedAt":"2026-01-27T14:30:57.860Z","hidden":false},{"_id":"697832bd026bdf0473116e96","name":"Yi Zhu","hidden":false},{"_id":"697832bd026bdf0473116e97","user":{"_id":"67b358e504fec7fd742ae8c1","avatarUrl":"/avatars/3f404b20df769bcf87e8d8d7d5fc9ed6.svg","isPro":false,"fullname":"Zehua Wang","user":"zehuawang","type":"user"},"name":"Zehua Wang","status":"admin_assigned","statusLastChangedAt":"2026-01-27T14:31:04.270Z","hidden":false},{"_id":"697832bd026bdf0473116e98","name":"Ting Song","hidden":false},{"_id":"697832bd026bdf0473116e99","name":"Yan Xia","hidden":false},{"_id":"697832bd026bdf0473116e9a","user":{"_id":"60f6d61f89b21b8fd2d471c6","avatarUrl":"https://cdn-avatars.huggingface.co/v1/production/uploads/60f6d61f89b21b8fd2d471c6/RmLFf97vUoXMoCT3rWbhm.jpeg","isPro":false,"fullname":"Zewen Chi","user":"CZWin32768","type":"user"},"name":"Zewen Chi","status":"admin_assigned","statusLastChangedAt":"2026-01-27T14:31:10.007Z","hidden":false},{"_id":"697832bd026bdf0473116e9b","name":"Shaohan Huang","hidden":false},{"_id":"697832bd026bdf0473116e9c","name":"Liang Wang","hidden":false},{"_id":"697832bd026bdf0473116e9d","name":"Chuang 
Ding","hidden":false},{"_id":"697832bd026bdf0473116e9e","name":"Shuai Wang","hidden":false},{"_id":"697832bd026bdf0473116e9f","name":"Xie Chen","hidden":false},{"_id":"697832bd026bdf0473116ea0","user":{"_id":"6368c512fbfe97c16a40baba","avatarUrl":"/avatars/1c23bc7c0b6d9225699ce27647623d7a.svg","isPro":false,"fullname":"Furu Wei","user":"thegenerality","type":"user"},"name":"Furu Wei","status":"admin_assigned","statusLastChangedAt":"2026-01-27T14:31:19.998Z","hidden":false}],"publishedAt":"2026-01-26T06:11:51.000Z","submittedOnDailyAt":"2026-01-27T01:18:16.537Z","title":"VIBEVOICE-ASR Technical Report","submittedOnDailyBy":{"_id":"67ecd6178647cfa1775f75ed","avatarUrl":"/avatars/98882cc58dc0a5de94df765d523d92c9.svg","isPro":false,"fullname":"Furu Wei","user":"frontierai","type":"user"},"summary":"This report presents VibeVoice-ASR, a general-purpose speech understanding framework built upon VibeVoice, designed to address the persistent challenges of context fragmentation and multi-speaker complexity in long-form audio (e.g., meetings, podcasts) that remain despite recent advancements in short-form speech recognition. Unlike traditional pipelined approaches that rely on audio chunking, VibeVoice-ASRsupports single-pass processing for up to 60 minutes of audio. It unifies Automatic Speech Recognition, Speaker Diarization, and Timestamping into a single end-to-end generation task. In addition, VibeVoice-ASR supports over 50 languages, requires no explicit language setting, and natively handles code-switching within and across utterances. 
Furthermore, we introduce a prompt-based context injection mechanism that allows users to supply customized conetxt, significantly improving accuracy on domain-specific terminology and polyphonic character disambiguation.","upvotes":20,"discussionId":"697832bd026bdf0473116ea1","ai_summary":"VibeVoice-ASR is a unified end-to-end speech understanding framework that processes long-form audio in a single pass while supporting multilingual, code-switching, and domain-specific context injection.","ai_keywords":["speech understanding framework","VibeVoice","Automatic Speech Recognition","Speaker Diarization","Timestamping","end-to-end generation","long-form audio","single-pass processing","multilingual support","code-switching","prompt-based context injection"],"organization":{"_id":"5e6485f787403103f9f1055e","name":"microsoft","fullname":"Microsoft","avatar":"https://cdn-uploads.huggingface.co/production/uploads/1583646260758-5e64858c87403103f9f1055d.png"}},"canReadDatabase":false,"canManagePapers":false,"canSubmit":false,"hasHfLevelAccess":false,"upvoted":false,"upvoters":[{"_id":"646c408336505117e22f4b36","avatarUrl":"/avatars/8201da3c4ec51c8e113446f9578d44f6.svg","isPro":false,"fullname":"zhiliang","user":"zzliang","type":"user"},{"_id":"6039478ab3ecf716b1a5fd4d","avatarUrl":"https://cdn-avatars.huggingface.co/v1/production/uploads/6039478ab3ecf716b1a5fd4d/_Thy4E7taiSYBLKxEKJbT.jpeg","isPro":true,"fullname":"taesiri","user":"taesiri","type":"user"},{"_id":"66a72d5f25bcf18873ea202e","avatarUrl":"/avatars/80c458feabfd4c1040d50d254fb23159.svg","isPro":false,"fullname":"TTTTT","user":"TTTT-TTTT","type":"user"},{"_id":"6540abf239948eb634ce1760","avatarUrl":"/avatars/3c9c73ec6f758ebd411b509a06746bc2.svg","isPro":false,"fullname":"Ling 
Zhenqing","user":"lingzhq11","type":"user"},{"_id":"6683b1082ca1c52c2756c458","avatarUrl":"/avatars/04c060438a54dcc658b7584b93369461.svg","isPro":false,"fullname":"Zhang","user":"WestZhang","type":"user"},{"_id":"66c453199f88f7346c15daf2","avatarUrl":"/avatars/6f3dc33d0eb3a989638c0f3c6d92918f.svg","isPro":false,"fullname":"Xun Wu","user":"YUSHUIWX2","type":"user"},{"_id":"6384db7fb2906edaf835a91d","avatarUrl":"https://cdn-avatars.huggingface.co/v1/production/uploads/6384db7fb2906edaf835a91d/MOTXxaOmjlTZ8wONYifnD.jpeg","isPro":true,"fullname":"Eric Bezzam","user":"bezzam","type":"user"},{"_id":"67ecd6178647cfa1775f75ed","avatarUrl":"/avatars/98882cc58dc0a5de94df765d523d92c9.svg","isPro":false,"fullname":"Furu Wei","user":"frontierai","type":"user"},{"_id":"6978b81ae1217778b3d50da9","avatarUrl":"/avatars/ddb6da35f903cb65be396d31e75421b5.svg","isPro":false,"fullname":"Cheng Zhi","user":"Chengong2023","type":"user"},{"_id":"5df85abada6d0311fd3d5408","avatarUrl":"/avatars/2331cf703c1b5d3a62e2050b1a6eb108.svg","isPro":false,"fullname":"Li Dong","user":"unilm","type":"user"},{"_id":"6978d2da286d9d2ac38ab89f","avatarUrl":"/avatars/aaf728e6ca668ce6c35b84c2ae3457af.svg","isPro":false,"fullname":"Purple Liu","user":"purpleteam33","type":"user"},{"_id":"64092a1ab6a334f53e278b3b","avatarUrl":"https://cdn-avatars.huggingface.co/v1/production/uploads/64092a1ab6a334f53e278b3b/tcueLWyyDL6WMUTw3Or4t.jpeg","isPro":false,"fullname":"Tiantian Feng","user":"tiantiaf","type":"user"}],"acceptLanguages":["*"],"dailyPaperRank":0,"organization":{"_id":"5e6485f787403103f9f1055e","name":"microsoft","fullname":"Microsoft","avatar":"https://cdn-uploads.huggingface.co/production/uploads/1583646260758-5e64858c87403103f9f1055d.png"}}">
VibeVoice-ASR is a unified end-to-end speech understanding framework that processes long-form audio in a single pass while supporting multilingual, code-switching, and domain-specific context injection.
AI-generated summary
This report presents VibeVoice-ASR, a general-purpose speech understanding framework built upon VibeVoice, designed to address the persistent challenges of context fragmentation and multi-speaker complexity in long-form audio (e.g., meetings, podcasts) that remain despite recent advancements in short-form speech recognition. Unlike traditional pipelined approaches that rely on audio chunking, VibeVoice-ASR supports single-pass processing for up to 60 minutes of audio. It unifies Automatic Speech Recognition, Speaker Diarization, and Timestamping into a single end-to-end generation task. In addition, VibeVoice-ASR supports over 50 languages, requires no explicit language setting, and natively handles code-switching within and across utterances. Furthermore, we introduce a prompt-based context injection mechanism that allows users to supply customized context, significantly improving accuracy on domain-specific terminology and polyphonic character disambiguation.
VibeVoice-ASR is a unified speech-to-text model designed to handle 60-minute long-form audio in a single pass, generating structured transcriptions containing Who (Speaker), When (Timestamps), and What (Content), with support for User-Customized Context.