Paper page - PLaD: Preference-based Large Language Model Distillation with
Pseudo-Preference Pairs
\n","updatedAt":"2024-06-09T20:16:44.556Z","author":{"_id":"6186ddf6a7717cb375090c01","avatarUrl":"/avatars/716b6a7d1094c8036b2a8a7b9063e8aa.svg","fullname":"Julien BLANCHON","name":"blanchon","type":"user","isPro":true,"isHf":false,"isHfAdmin":false,"isMod":false,"followerCount":176,"isUserFollowing":false}},"numEdits":0,"identifiedLanguage":{"language":"en","probability":0.4720543622970581},"editors":["blanchon"],"editorAvatarUrls":["/avatars/716b6a7d1094c8036b2a8a7b9063e8aa.svg"],"reactions":[],"isReport":false}}],"primaryEmailConfirmed":false,"paper":{"id":"2406.02886","authors":[{"_id":"66612600fb3dfd49f05b2215","user":{"_id":"644a2c4e9a1c5faef7a5dbd8","avatarUrl":"/avatars/fbbbc1347f8e423b2477e2506fdb43d9.svg","isPro":false,"fullname":"Rongzhi Zhang","user":"Solute","type":"user"},"name":"Rongzhi Zhang","status":"extracted_confirmed","statusLastChangedAt":"2025-05-16T06:19:21.355Z","hidden":false},{"_id":"66612600fb3dfd49f05b2216","user":{"_id":"5f7d1ecdb1a525442ff96ee6","avatarUrl":"/avatars/cbc7840a6fb318dd37ce91347a7f96e4.svg","isPro":false,"fullname":"Jiaming Shen","user":"jmshen","type":"user"},"name":"Jiaming Shen","status":"admin_assigned","statusLastChangedAt":"2024-06-06T10:08:16.564Z","hidden":false},{"_id":"66612600fb3dfd49f05b2217","user":{"_id":"646784af696e7355f5d5001f","avatarUrl":"/avatars/39ce8999c296584ee0bdeb7848eee6d9.svg","isPro":false,"fullname":"Tianqi Liu","user":"TianqiLiuAI","type":"user"},"name":"Tianqi Liu","status":"admin_assigned","statusLastChangedAt":"2024-06-06T10:08:45.814Z","hidden":false},{"_id":"66612600fb3dfd49f05b2218","user":{"_id":"63476afcdbe3f8b009e2f758","avatarUrl":"https://cdn-avatars.huggingface.co/v1/production/uploads/1665624822880-noauth.jpeg","isPro":false,"fullname":"Wang Haorui","user":"JerrySkywalker","type":"user"},"name":"Haorui 
Wang","status":"admin_assigned","statusLastChangedAt":"2024-06-06T10:08:58.413Z","hidden":false},{"_id":"66612600fb3dfd49f05b2219","user":{"_id":"64a2bd9245dde1d6ac7b4d81","avatarUrl":"/avatars/8a238a832bfcf14e7704a2152b254a0e.svg","isPro":false,"fullname":"Zhen Qin","user":"paulqin","type":"user"},"name":"Zhen Qin","status":"claimed_verified","statusLastChangedAt":"2024-06-18T08:29:42.402Z","hidden":false},{"_id":"66612600fb3dfd49f05b221a","name":"Feng Han","hidden":false},{"_id":"66612600fb3dfd49f05b221b","user":{"_id":"64acb90a5d48838462da09ae","avatarUrl":"/avatars/dc87416dcb2806a6e0d60bd636024978.svg","isPro":false,"fullname":"liujialu","user":"Jialuliu","type":"user"},"name":"Jialu Liu","status":"admin_assigned","statusLastChangedAt":"2024-06-06T10:10:23.991Z","hidden":false},{"_id":"66612600fb3dfd49f05b221c","user":{"_id":"6560f697e0a7720b6ae377bc","avatarUrl":"https://cdn-avatars.huggingface.co/v1/production/uploads/6560f697e0a7720b6ae377bc/8b-p-lQ7KV_VygBJ6Vf3_.jpeg","isPro":false,"fullname":"Simon Baumgartner","user":"sens3","type":"user"},"name":"Simon Baumgartner","status":"admin_assigned","statusLastChangedAt":"2024-06-06T10:10:32.982Z","hidden":false},{"_id":"66612600fb3dfd49f05b221d","name":"Michael Bendersky","hidden":false},{"_id":"66612600fb3dfd49f05b221e","name":"Chao Zhang","hidden":false}],"publishedAt":"2024-06-05T03:08:25.000Z","submittedOnDailyAt":"2024-06-06T01:29:13.459Z","title":"PLaD: Preference-based Large Language Model Distillation with\n Pseudo-Preference Pairs","submittedOnDailyBy":{"_id":"60f1abe7544c2adfd699860c","avatarUrl":"https://cdn-avatars.huggingface.co/v1/production/uploads/1674929746905-60f1abe7544c2adfd699860c.jpeg","isPro":false,"fullname":"AK","user":"akhaliq","type":"user"},"summary":"Large Language Models (LLMs) have exhibited impressive capabilities in\nvarious tasks, yet their vast parameter sizes restrict their applicability in\nresource-constrained settings. 
Knowledge distillation (KD) offers a viable\nsolution by transferring expertise from large teacher models to compact student\nmodels. However, traditional KD techniques face specific challenges when\napplied to LLMs, including restricted access to LLM outputs, significant\nteacher-student capacity gaps, and the inherited mis-calibration issue. In this\nwork, we present PLaD, a novel preference-based LLM distillation framework.\nPLaD exploits the teacher-student capacity discrepancy to generate\npseudo-preference pairs where teacher outputs are preferred over student\noutputs. Then, PLaD leverages a ranking loss to re-calibrate student's\nestimation of sequence likelihood, which steers the student's focus towards\nunderstanding the relative quality of outputs instead of simply imitating the\nteacher. PLaD bypasses the need for access to teacher LLM's internal states,\ntackles the student's expressivity limitations, and mitigates the student\nmis-calibration issue. Through extensive experiments on two sequence generation\ntasks and with various LLMs, we demonstrate the effectiveness of our proposed\nPLaD framework.","upvotes":10,"discussionId":"66612601fb3dfd49f05b2247","ai_summary":"PLaD is a preference-based distillation framework that re-calibrates compact student LLMs by comparing them to larger teacher models without direct access to internal states.","ai_keywords":["Large Language Models (LLMs)","Knowledge distillation (KD)","pseudo-preference pairs","ranking loss","sequence likelihood","sequence generation tasks"]},"canReadDatabase":false,"canManagePapers":false,"canSubmit":false,"hasHfLevelAccess":false,"upvoted":false,"upvoters":[{"_id":"64a2bd9245dde1d6ac7b4d81","avatarUrl":"/avatars/8a238a832bfcf14e7704a2152b254a0e.svg","isPro":false,"fullname":"Zhen 
Qin","user":"paulqin","type":"user"},{"_id":"620783f24e28382272337ba4","avatarUrl":"https://cdn-avatars.huggingface.co/v1/production/uploads/620783f24e28382272337ba4/zkUveQPNiDfYjgGhuFErj.jpeg","isPro":false,"fullname":"GuoLiangTang","user":"Tommy930","type":"user"},{"_id":"648eb1eb59c4e5c87dc116e0","avatarUrl":"/avatars/c636cea39c2c0937f01398c94ead5dad.svg","isPro":false,"fullname":"fdsqefsgergd","user":"T-representer","type":"user"},{"_id":"643b19f8a856622f978df30f","avatarUrl":"/avatars/c82779fdf94f80cdb5020504f83c818b.svg","isPro":false,"fullname":"Yatharth Sharma","user":"YaTharThShaRma999","type":"user"},{"_id":"646784af696e7355f5d5001f","avatarUrl":"/avatars/39ce8999c296584ee0bdeb7848eee6d9.svg","isPro":false,"fullname":"Tianqi Liu","user":"TianqiLiuAI","type":"user"},{"_id":"6538119803519fddb4a17e10","avatarUrl":"https://cdn-avatars.huggingface.co/v1/production/uploads/6538119803519fddb4a17e10/ffJMkdx-rM7VvLTCM6ri_.jpeg","isPro":false,"fullname":"samusenps","user":"samusenps","type":"user"},{"_id":"60078446e55258e41786a959","avatarUrl":"https://cdn-avatars.huggingface.co/v1/production/uploads/60078446e55258e41786a959/UGPCE4YqG9BVMSf0YauxL.png","isPro":false,"fullname":"Motoki Wu","user":"tokestermw","type":"user"},{"_id":"641b754d1911d3be6745cce9","avatarUrl":"https://cdn-avatars.huggingface.co/v1/production/uploads/641b754d1911d3be6745cce9/Ydjcjd4VuNUGj5Cd4QHdB.png","isPro":false,"fullname":"atayloraerospace","user":"Taylor658","type":"user"},{"_id":"663ccbff3a74a20189d4aa2e","avatarUrl":"/avatars/83a54455e0157480f65c498cd9057cf2.svg","isPro":false,"fullname":"Nguyen Van Thanh","user":"NguyenVanThanhHust","type":"user"},{"_id":"67c71bb3ab668e09ae450451","avatarUrl":"https://cdn-avatars.huggingface.co/v1/production/uploads/67c71bb3ab668e09ae450451/_DDWX38p4JaLsoXKe7Tv_.jpeg","isPro":false,"fullname":"FelixWeber","user":"FM000001","type":"user"}],"acceptLanguages":["*"],"dailyPaperRank":0}">
PLaD is a preference-based distillation framework that re-calibrates compact student LLMs by comparing them to larger teacher models without direct access to internal states.
AI-generated summary
Large Language Models (LLMs) have exhibited impressive capabilities in
various tasks, yet their vast parameter sizes restrict their applicability in
resource-constrained settings. Knowledge distillation (KD) offers a viable
solution by transferring expertise from large teacher models to compact student
models. However, traditional KD techniques face specific challenges when
applied to LLMs, including restricted access to LLM outputs, significant
teacher-student capacity gaps, and the inherited mis-calibration issue. In this
work, we present PLaD, a novel preference-based LLM distillation framework.
PLaD exploits the teacher-student capacity discrepancy to generate
pseudo-preference pairs where teacher outputs are preferred over student
outputs. Then, PLaD leverages a ranking loss to re-calibrate student's
estimation of sequence likelihood, which steers the student's focus towards
understanding the relative quality of outputs instead of simply imitating the
teacher. PLaD bypasses the need for access to teacher LLM's internal states,
tackles the student's expressivity limitations, and mitigates the student
mis-calibration issue. Through extensive experiments on two sequence generation
tasks and with various LLMs, we demonstrate the effectiveness of our proposed
PLaD framework.