S-LoRA: Serving Thousands of Concurrent LoRA Adapters
arXiv: 2311.03285 · Code: https://github.com/s-lora/s-lora
AI-generated summary
S-LoRA is a system that allows for efficient and scalable serving of numerous LoRA adapters using a unified memory pool, tensor parallelism, and custom CUDA kernels.

Abstract
The "pretrain-then-finetune" paradigm is commonly adopted in the deployment
of large language models. Low-Rank Adaptation (LoRA), a parameter-efficient
fine-tuning method, is often employed to adapt a base model to a multitude of
tasks, resulting in a substantial collection of LoRA adapters derived from one
base model. We observe that this paradigm presents significant opportunities
for batched inference during serving. To capitalize on these opportunities, we
present S-LoRA, a system designed for the scalable serving of many LoRA
adapters. S-LoRA stores all adapters in the main memory and fetches the
adapters used by the currently running queries to the GPU memory. To
efficiently use the GPU memory and reduce fragmentation, S-LoRA proposes
Unified Paging. Unified Paging uses a unified memory pool to manage dynamic
adapter weights with different ranks and KV cache tensors with varying sequence
lengths. Additionally, S-LoRA employs a novel tensor parallelism strategy and
highly optimized custom CUDA kernels for heterogeneous batching of LoRA
computation. Collectively, these features enable S-LoRA to serve thousands of
LoRA adapters on a single GPU or across multiple GPUs with a small overhead.
Compared to state-of-the-art libraries such as HuggingFace PEFT and vLLM (with
naive support of LoRA serving), S-LoRA can improve the throughput by up to 4
times and increase the number of served adapters by several orders of
magnitude. As a result, S-LoRA enables scalable serving of many task-specific
fine-tuned models and offers the potential for large-scale customized
fine-tuning services.
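
To make the mechanisms named in the abstract concrete, the short sketches below illustrate them in plain PyTorch. They are simplified illustrations under assumed names and shapes, not S-LoRA's actual implementation. The first sketch shows the adapter placement policy: all LoRA adapters stay in host memory, and only those referenced by the currently running queries are copied to GPU memory. The adapter names, shapes, and the prepare_batch helper are hypothetical.

```python
# Minimal sketch of the adapter-fetching policy described in the abstract
# (hypothetical names; not S-LoRA's actual code): all LoRA adapters live in
# host (CPU) memory, and only the adapters used by the requests in the
# current batch are copied to the GPU.
import torch

hidden, rank = 4096, 16
device = "cuda" if torch.cuda.is_available() else "cpu"

# All adapters stay in main memory (a real system would use pinned memory
# to speed up host-to-GPU transfers).
cpu_adapters = {
    name: (torch.randn(hidden, rank), torch.randn(rank, hidden))
    for name in ["sql", "chat", "summarize"]
}
gpu_adapters: dict[str, tuple[torch.Tensor, torch.Tensor]] = {}

def prepare_batch(requested: list[str]) -> None:
    """Fetch adapters needed by the running queries; evict unused ones."""
    for name in list(gpu_adapters):
        if name not in requested:
            del gpu_adapters[name]          # free GPU memory for unused adapters
    for name in requested:
        if name not in gpu_adapters:
            a, b = cpu_adapters[name]
            gpu_adapters[name] = (a.to(device), b.to(device))

prepare_batch(["sql", "chat"])
print(sorted(gpu_adapters))  # ['chat', 'sql']
```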
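Unified Paging can be pictured as a single pool of fixed-size GPU pages shared by KV-cache entries and LoRA adapter weights, so tensors with different ranks and sequence lengths reuse the same slots and fragmentation is reduced. The sketch below is a minimal, hypothetical illustration of that idea; UnifiedPagePool and its sizing are assumptions, not the paper's data structure.

```python
# Minimal sketch (not S-LoRA's implementation) of the Unified Paging idea:
# one pool of fixed-size pages holds both KV-cache blocks and LoRA adapter
# weight slices, so freed pages can be reused by either kind of tensor.
import torch


class UnifiedPagePool:
    def __init__(self, num_pages: int, page_size: int, hidden_dim: int, device: str = "cpu"):
        # One page holds `page_size` vectors of width `hidden_dim`.
        # KV cache: a page stores the K (or V) vectors of `page_size` tokens.
        # LoRA weights: a rank-r adapter matrix needs about ceil(r / page_size)
        # pages, since each of its r rows is also a `hidden_dim`-wide vector.
        self.pages = torch.empty(num_pages, page_size, hidden_dim, device=device)
        self.free = list(range(num_pages))

    def alloc(self, n: int) -> list[int]:
        if n > len(self.free):
            raise MemoryError("unified pool exhausted")
        ids, self.free = self.free[:n], self.free[n:]
        return ids

    def release(self, ids: list[int]) -> None:
        self.free.extend(ids)


pool = UnifiedPagePool(num_pages=64, page_size=16, hidden_dim=4096)
kv_pages = pool.alloc(8)      # KV cache for a running request
lora_pages = pool.alloc(2)    # weights of a rank-16 adapter matrix
pool.release(kv_pages)        # freed pages can later hold either kind of tensor
```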
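Finally, heterogeneous batching means requests in one batch may use different adapters with different ranks: the shared base projection is computed as one batched GEMM, while each request's low-rank delta is added using its own A and B matrices. S-LoRA does this with custom CUDA kernels; the loop below is only a plain-PyTorch reference for the underlying math, with hypothetical names.

```python
# Reference (unoptimized) version of heterogeneous batched LoRA computation:
# a single GEMM against the shared base weight, plus a per-request low-rank
# correction. S-LoRA fuses the per-request path into custom CUDA kernels.
import torch

hidden = 4096
W = torch.randn(hidden, hidden) * 0.01            # shared base-model projection

def lora_pair(rank: int):
    return torch.randn(hidden, rank) * 0.01, torch.randn(rank, hidden) * 0.01

adapters = {"sql": lora_pair(8), "chat": lora_pair(32)}   # different ranks in one batch

# One token per request for simplicity; each request names its adapter.
x = torch.randn(3, hidden)
adapter_of = ["sql", "chat", "sql"]

base = x @ W                                      # one batched GEMM for all requests
delta = torch.zeros_like(base)
for i, name in enumerate(adapter_of):             # per-request low-rank path
    A, B = adapters[name]
    delta[i] = (x[i] @ A) @ B
y = base + delta
print(y.shape)  # torch.Size([3, 4096])
```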