Deprecated: The each() function is deprecated. This message will be suppressed on further calls in /home/zhenxiangba/zhenxiangba.com/public_html/phproxy-improved-master/index.php on line 456 pb09204048 (Hejian Sang)
@sseymens\n\t Thank you for your comments. I can help answer your question about the MoE on-policy part. \n\n
Yes, forcing old_log_prob = log_prob.detach() does not solve the on-policy issue: the probability is computed with the current policy, but the sampling distribution can still differ because of expert selection.
\n
When we explored the agentic issues in gpt-oss training, we did not identify the root cause at the beginning. One hypothesis was that they were due to inference-training inconsistency. Applying importance sampling did not help, so we tested whether forcing old_log_prob = log_prob.detach() would alleviate the issue, as it should if that inconsistency were the root cause. This was done purely for hypothesis testing.
\n
When we explored the agentic issues in gpt-oss training, verl did not yet support expert router replay (https://arxiv.org/pdf/2510.11370v1), so we could not test this idea. We have since tested the replay, but it is not the root cause either. The root cause is the attention sink.
\n\n"}},{"time":"2026-01-27T02:04:30.717Z","user":"pb09204048","userAvatarUrl":"/avatars/fdee8313785f592ee11b1c879f3df775.svg","type":"article","blog":{"_id":"6972aad7ca7ddee41cb2b84a","isUpvotedByUser":false,"numCoauthors":4,"publishedAt":"2026-01-27T01:53:15.932Z","slug":"gpt-oss-agentic-rl","title":"Unlocking Agentic RL Training for GPT-OSS: A Practical Retrospective","upvotes":56,"upvotes7d":4,"thumbnail":"https://cdn-uploads.huggingface.co/production/uploads/64efbd469e7770db74cb72f5/rNoYY6yaQSfXYMBn3M1xb.png","authorsData":[{"_id":"6697e878d8b5b78e6e7485b7","avatarUrl":"https://cdn-avatars.huggingface.co/v1/production/uploads/6697e834fd52271e0b9ce8d8/VSBDJkmYgk4-LeXgTKThN.png","fullname":"LinkedIn","name":"LinkedIn","type":"org","isHf":false,"isHfAdmin":false,"isMod":false,"plan":"team","followerCount":107,"isUserFollowing":false}],"url":"/blog/LinkedIn/gpt-oss-agentic-rl","apiPrefix":"/api/blog/LinkedIn/gpt-oss-agentic-rl","canonical":false,"status":"published","isHFChangelog":false}}],"blogPosts":[{"_id":"6972aad7ca7ddee41cb2b84a","isUpvotedByUser":false,"numCoauthors":4,"publishedAt":"2026-01-27T01:53:15.932Z","slug":"gpt-oss-agentic-rl","title":"Unlocking Agentic RL Training for GPT-OSS: A Practical 
Retrospective","upvotes":56,"upvotes7d":4,"thumbnail":"https://cdn-uploads.huggingface.co/production/uploads/64efbd469e7770db74cb72f5/rNoYY6yaQSfXYMBn3M1xb.png","authorsData":[{"_id":"6697e878d8b5b78e6e7485b7","avatarUrl":"https://cdn-avatars.huggingface.co/v1/production/uploads/6697e834fd52271e0b9ce8d8/VSBDJkmYgk4-LeXgTKThN.png","fullname":"LinkedIn","name":"LinkedIn","type":"org","isHf":false,"isHfAdmin":false,"isMod":false,"plan":"team","followerCount":107,"isUserFollowing":false}],"url":"/blog/LinkedIn/gpt-oss-agentic-rl","apiPrefix":"/api/blog/LinkedIn/gpt-oss-agentic-rl","canonical":false,"status":"published","isHFChangelog":false,"orgPlan":"team"}],"totalBlogPosts":1,"canReadDatabase":false,"canManageEntities":false,"canReadEntities":false,"canImpersonate":false,"canManageBilling":false,"canReadRepos":false,"canReadSpaces":false,"communityScore":1,"collections":[],"datasets":[],"models":[],"buckets":[],"numBuckets":0,"numberLikes":7,"papers":[{"id":"2510.00237","title":"Debunk the Myth of SFT Generalization","thumbnailUrl":"https://cdn-thumbnails.huggingface.co/social-thumbnails/papers/2510.00237.png","upvotes":2,"publishedAt":"2025-09-30T20:01:09.000Z","isUpvotedByUser":false}],"posts":[],"totalPosts":0,"spaces":[],"u":{"avatarUrl":"/avatars/fdee8313785f592ee11b1c879f3df775.svg","isPro":false,"fullname":"Hejian 
Sang","user":"pb09204048","orgs":[{"avatarUrl":"https://cdn-avatars.huggingface.co/v1/production/uploads/6697e834fd52271e0b9ce8d8/VSBDJkmYgk4-LeXgTKThN.png","fullname":"LinkedIn","name":"LinkedIn","userRole":"read","type":"org","isHf":false,"plan":"team"}],"signup":{"bluesky":"","details":"","github":"","homepage":"","linkedin":"","twitter":""},"isHf":false,"isMod":false,"type":"user","theme":"light"},"upvotes":3,"numFollowers":7,"numFollowingUsers":1,"numFollowingOrgs":5,"numModels":0,"numDatasets":0,"numSpaces":0,"isFollowing":false,"isFollower":false,"sampleFollowers":[{"user":"webxos","fullname":"webXOS","type":"user","_id":"694b63991453114f60919026","isPro":false,"avatarUrl":"https://cdn-avatars.huggingface.co/v1/production/uploads/694b63991453114f60919026/uletdNpOiRkEjEp3QvxCH.jpeg"},{"user":"JasonZhu13","fullname":"Jason Zhu","type":"user","_id":"64efbd469e7770db74cb72f5","isPro":false,"avatarUrl":"https://cdn-avatars.huggingface.co/v1/production/uploads/64efbd469e7770db74cb72f5/yeYzDPziD-5KIHPH0dlJJ.png"},{"user":"Jibbscript","fullname":"Gibran Iqbal","type":"user","_id":"65dba1f1b62d242ed88b2d2a","isPro":false,"avatarUrl":"/avatars/e35ef7687e217e6ab71ad76cef59ea21.svg"},{"user":"m0m0chen","fullname":"Yanning Chen","type":"user","_id":"64cfeba8e9cac0020b9fc89b","isPro":false,"avatarUrl":"/avatars/94ba6256461336fae9f955010f9430a0.svg"}],"isWatching":false,"isIgnored":false,"acceptLanguages":["*"],"filters":{},"currentRepoPage":0}">