Deprecated: The each() function is deprecated. This message will be suppressed on further calls in /home/zhenxiangba/zhenxiangba.com/public_html/phproxy-improved-master/index.php on line 456
HuggingFaceFW (FineData)
[go: Go Back, main page]

FineData

Team
community
Activity Feed
Science Team releasing large scale pre-training datasets to accelerate open LLM development.

\n\n","classNames":"hf-sanitized hf-sanitized-1NFePSVFl4rFvLXQbZUM-"},"users":[{"_id":"5df7e9e5da6d0311fd3d53f9","avatarUrl":"https://cdn-avatars.huggingface.co/v1/production/uploads/1583857746553-5df7e9e5da6d0311fd3d53f9.jpeg","isPro":true,"fullname":"Thomas Wolf","user":"thomwolf","type":"user"},{"_id":"5e48005437cb5b49818287a5","avatarUrl":"https://cdn-avatars.huggingface.co/v1/production/uploads/5e48005437cb5b49818287a5/4uCXGGui-9QifAT4qelxU.png","isPro":false,"fullname":"Leandro von Werra","user":"lvwerra","type":"user"},{"_id":"602e6dee60e3dd96631c906e","avatarUrl":"https://cdn-avatars.huggingface.co/v1/production/uploads/1613655355830-noauth.png","isPro":false,"fullname":"Anton Lozhkov","user":"anton-l","type":"user"},{"_id":"61c141342aac764ce1654e43","avatarUrl":"https://cdn-avatars.huggingface.co/v1/production/uploads/61c141342aac764ce1654e43/81AwoT5IQ_Xdw0OVw7TKu.jpeg","isPro":false,"fullname":"Loubna Ben Allal","user":"loubnabnl","type":"user"},{"_id":"64622c4093f702673bf9b953","avatarUrl":"https://cdn-avatars.huggingface.co/v1/production/uploads/noauth/iB42uASF1DoQth23cizSh.png","isPro":true,"fullname":"Ferdinand Mom","user":"3outeille","type":"user"},{"_id":"651e96991b97c9f33d26bde6","avatarUrl":"https://cdn-avatars.huggingface.co/v1/production/uploads/651e96991b97c9f33d26bde6/-Bqs6qrmz0yCfwtB2e-6q.jpeg","isPro":true,"fullname":"Elie Bakouch","user":"eliebak","type":"user"},{"_id":"6079c29765b9d0165cb18392","avatarUrl":"https://cdn-avatars.huggingface.co/v1/production/uploads/1618592397610-noauth.jpeg","isPro":false,"fullname":"Colin Raffel","user":"craffel","type":"user"},{"_id":"60c757ea5f9a76ab3f844f12","avatarUrl":"https://cdn-avatars.huggingface.co/v1/production/uploads/1626214544196-60c757ea5f9a76ab3f844f12.png","isPro":false,"fullname":"Margaret 
Mitchell","user":"meg","type":"user"},{"_id":"63e0eea7af523c37e5a77966","avatarUrl":"https://cdn-avatars.huggingface.co/v1/production/uploads/1678663263366-63e0eea7af523c37e5a77966.jpeg","isPro":true,"fullname":"Nathan Habib","user":"SaylorTwift","type":"user"},{"_id":"6202a599216215a22221dea9","avatarUrl":"https://cdn-avatars.huggingface.co/v1/production/uploads/1644340617257-noauth.png","isPro":false,"fullname":"ClΓ©mentine Fourrier","user":"clefourrier","type":"user"},{"_id":"65ba0b22eb364bc793936fc3","avatarUrl":"/avatars/1ca6827f5800620085746966b5f4c91f.svg","isPro":false,"fullname":"Vinko Sabolcec","user":"vsabolcec","type":"user"},{"_id":"65b960eeeedfbaf37f8f6869","avatarUrl":"https://cdn-avatars.huggingface.co/v1/production/uploads/65b960eeeedfbaf37f8f6869/aVIaa_roB-j157KhhDaf2.jpeg","isPro":false,"fullname":"Negar Foroutan","user":"negar-foroutan","type":"user"},{"_id":"65524059f71a0f5860d0aeaa","avatarUrl":"/avatars/c4297407e40dbe6088c2ae9d311f3f30.svg","isPro":false,"fullname":"Messmer","user":"NXz64Fdf8Y","type":"user"},{"_id":"60107b385ac3e86b3ea4fc34","avatarUrl":"https://cdn-avatars.huggingface.co/v1/production/uploads/1627505688463-60107b385ac3e86b3ea4fc34.jpeg","isPro":true,"fullname":"Daniel van Strien","user":"davanstrien","type":"user"},{"_id":"5e9ecfc04957053f60648a3e","avatarUrl":"https://cdn-avatars.huggingface.co/v1/production/uploads/1594214747713-5e9ecfc04957053f60648a3e.png","isPro":true,"fullname":"Quentin Lhoest","user":"lhoestq","type":"user"},{"_id":"648a374f00f7a3374ee64b99","avatarUrl":"https://cdn-avatars.huggingface.co/v1/production/uploads/648a374f00f7a3374ee64b99/YPwSOrronoozwHbJchPn3.jpeg","isPro":true,"fullname":"Caleb Fahlgren","user":"cfahlgren1","type":"user"},{"_id":"5fae5f68b8423e1d80b8a988","avatarUrl":"https://cdn-avatars.huggingface.co/v1/production/uploads/1656078368144-5fae5f68b8423e1d80b8a988.jpeg","isPro":false,"fullname":"Joel 
Niklaus","user":"joelniklaus","type":"user"}],"userCount":17,"collections":[{"slug":"HuggingFaceFW/smol-data-699244eb3a18b4874222cfe8","title":"Smol-Data","description":"Tried and tested mixes for strong pretraining","gating":false,"lastUpdated":"2026-02-15T23:13:26.065Z","owner":{"_id":"657882cd90df9d85fb6be8df","avatarUrl":"https://cdn-avatars.huggingface.co/v1/production/uploads/62596f9e1c0a084224b93e00/1NVyAAh-WfULT4i_LWZNB.png","fullname":"FineData","name":"HuggingFaceFW","type":"org","isHf":false,"isHfAdmin":false,"isMod":false,"plan":"team","followerCount":1383,"isUserFollowing":false},"items":[{"_id":"69924ae6b5d3127126aefee3","position":0,"type":"dataset","author":"HuggingFaceFW","downloads":292,"gated":false,"id":"HuggingFaceFW/dclm_100BT","lastModified":"2026-02-11T19:08:33.000Z","datasetsServerInfo":{"viewer":"viewer","numRows":89269902,"libraries":["datasets","dask","polars","mlcroissant"],"formats":["parquet"],"modalities":["tabular","text"]},"private":false,"repoType":"dataset","likes":0,"isLikedByUser":false,"isBenchmark":false},{"_id":"69924af03883cdc4e0fa70db","position":1,"type":"dataset","author":"HuggingFaceFW","downloads":12,"gated":false,"id":"HuggingFaceFW/dclm_100BT-shuffled","lastModified":"2026-02-15T08:52:11.000Z","datasetsServerInfo":{"viewer":"viewer","numRows":89269902,"libraries":["datasets","dask","polars","mlcroissant"],"formats":["parquet","optimized-parquet"],"modalities":["tabular","text"]},"private":false,"repoType":"dataset","likes":0,"isLikedByUser":false,"isBenchmark":false},{"_id":"69924b1c9ce0e2fe4946ed06","position":2,"type":"dataset","author":"HuggingFaceFW","downloads":823,"gated":false,"id":"HuggingFaceFW/finepdfs_100BT","lastModified":"2026-02-12T15:42:28.000Z","datasetsServerInfo":{"viewer":"viewer","numRows":29904625,"libraries":["datasets","dask","polars","mlcroissant"],"formats":["parquet"],"modalities":["tabular","text"]},"private":false,"repoType":"dataset","likes":0,"isLikedByUser":false,"isBenchmark":false},{"_
id":"69924b25b5d3127126af02c5","position":3,"type":"dataset","author":"HuggingFaceFW","downloads":28,"gated":false,"id":"HuggingFaceFW/finepdfs_100BT-shuffled","lastModified":"2026-02-15T09:02:43.000Z","datasetsServerInfo":{"viewer":"viewer","numRows":14640807,"libraries":["datasets","dask","polars","mlcroissant"],"formats":["parquet","optimized-parquet"],"modalities":["tabular","text"]},"private":false,"repoType":"dataset","likes":0,"isLikedByUser":false,"isBenchmark":false}],"position":0,"theme":"indigo","private":false,"shareUrl":"https://hf.co/collections/HuggingFaceFW/smol-data","upvotes":1,"isUpvotedByUser":false},{"slug":"HuggingFaceFW/finewiki-68f6615c6bb86563dcd5e846","title":"🌐 FineWiki","description":"","gating":false,"lastUpdated":"2025-10-21T15:17:59.339Z","owner":{"_id":"657882cd90df9d85fb6be8df","avatarUrl":"https://cdn-avatars.huggingface.co/v1/production/uploads/62596f9e1c0a084224b93e00/1NVyAAh-WfULT4i_LWZNB.png","fullname":"FineData","name":"HuggingFaceFW","type":"org","isHf":false,"isHfAdmin":false,"isMod":false,"plan":"team","followerCount":1383,"isUserFollowing":false},"items":[{"_id":"68f6616b1a6fbb49f08bf9c0","position":0,"type":"dataset","author":"HuggingFaceFW","downloads":6116,"gated":false,"id":"HuggingFaceFW/finewiki","lastModified":"2025-10-22T11:02:22.000Z","datasetsServerInfo":{"viewer":"viewer","numRows":61550610,"libraries":["datasets","dask","polars","mlcroissant"],"formats":["parquet"],"modalities":["tabular","text"]},"private":false,"repoType":"dataset","likes":284,"isLikedByUser":false,"isBenchmark":false},{"_id":"68f661e26354ef24c523c386","position":1,"type":"space","author":"HuggingFaceFW","authorData":{"_id":"657882cd90df9d85fb6be8df","avatarUrl":"https://cdn-avatars.huggingface.co/v1/production/uploads/62596f9e1c0a084224b93e00/1NVyAAh-WfULT4i_LWZNB.png","fullname":"FineData","name":"HuggingFaceFW","type":"org","isHf":false,"isHfAdmin":false,"isMod":false,"plan":"team","followerCount":1383,"isUserFollowing":false},"colorFrom":
"blue","colorTo":"gray","createdAt":"2025-10-16T13:28:52.000Z","emoji":"🌐","id":"HuggingFaceFW/finewiki-viewer","lastModified":"2025-10-16T15:37:04.000Z","likes":11,"pinned":false,"private":false,"sdk":"gradio","repoType":"space","runtime":{"stage":"RUNNING","hardware":{"current":"cpu-basic","requested":"cpu-basic"},"storage":null,"gcTimeout":172800,"errorMessage":null,"replicas":{"current":1,"requested":1},"devMode":false,"domains":[{"domain":"huggingfacefw-finewiki-viewer.hf.space","stage":"READY"}],"sha":"cb8932ed5d5285333862787816b64be8bd0dce84"},"shortDescription":"Viewer to explore the finewiki dataset","title":"FineWiki Viewer","isLikedByUser":false,"ai_short_description":"Browse Wikipedia articles in multiple languages","ai_category":"Document Analysis","trendingScore":0,"tags":["gradio","region:us"],"featured":false}],"position":2,"theme":"blue","private":false,"shareUrl":"https://hf.co/collections/HuggingFaceFW/finewiki","upvotes":5,"isUpvotedByUser":false},{"slug":"HuggingFaceFW/finepdfs-68bd02d20928419c1dc12296","title":"πŸ“„ 
FinePDFs","description":"","gating":false,"lastUpdated":"2026-01-09T22:18:36.716Z","owner":{"_id":"657882cd90df9d85fb6be8df","avatarUrl":"https://cdn-avatars.huggingface.co/v1/production/uploads/62596f9e1c0a084224b93e00/1NVyAAh-WfULT4i_LWZNB.png","fullname":"FineData","name":"HuggingFaceFW","type":"org","isHf":false,"isHfAdmin":false,"isMod":false,"plan":"team","followerCount":1383,"isUserFollowing":false},"items":[{"_id":"68bd02df919b8f31c7f11f7a","position":0,"type":"dataset","author":"HuggingFaceFW","downloads":35903,"gated":false,"id":"HuggingFaceFW/finepdfs","lastModified":"2026-01-09T10:37:26.000Z","datasetsServerInfo":{"viewer":"viewer","numRows":476178356,"libraries":["datasets","dask","polars","mlcroissant"],"formats":["parquet"],"modalities":["tabular","text"]},"private":false,"repoType":"dataset","likes":818,"isLikedByUser":false,"isBenchmark":false},{"_id":"6913892e472fc4f02b6ba31e","position":1,"type":"dataset","author":"HuggingFaceFW","downloads":6693,"gated":false,"id":"HuggingFaceFW/finepdfs-edu","lastModified":"2025-11-11T18:49:02.000Z","datasetsServerInfo":{"viewer":"viewer","numRows":49526501,"libraries":["datasets","dask","polars","mlcroissant"],"formats":["parquet"],"modalities":["tabular","text"]},"private":false,"repoType":"dataset","likes":79,"isLikedByUser":false,"isBenchmark":false},{"_id":"68f670ce6079930856f53fa3","position":2,"type":"dataset","author":"HuggingFaceFW","downloads":159,"gated":false,"id":"HuggingFaceFW/ocr-annotations","lastModified":"2025-10-20T17:29:22.000Z","datasetsServerInfo":{"viewer":"viewer","numRows":1620,"libraries":["datasets","dask","polars","mlcroissant"],"formats":["parquet"],"modalities":["document","text"]},"private":false,"repoType":"dataset","likes":17,"isLikedByUser":false,"isBenchmark":false},{"_id":"68f670e0068057313eae9119","position":3,"type":"dataset","author":"HuggingFaceFW","downloads":4966,"gated":false,"id":"HuggingFaceFW/finepdfs_lang_classification","lastModified":"2025-10-17T11:54:25.000Z","da
tasetsServerInfo":{"viewer":"viewer","numRows":3075541,"libraries":["datasets","pandas","mlcroissant","polars"],"formats":["parquet"],"modalities":["tabular"]},"private":false,"repoType":"dataset","likes":4,"isLikedByUser":false,"isBenchmark":false}],"position":3,"theme":"green","private":false,"shareUrl":"https://hf.co/collections/HuggingFaceFW/finepdfs","upvotes":27,"isUpvotedByUser":false},{"slug":"HuggingFaceFW/fineweb2-6755657a481dae41e8fbba4d","title":"πŸ₯‚ FineWeb2","description":"","gating":false,"lastUpdated":"2025-06-27T00:19:47.207Z","owner":{"_id":"657882cd90df9d85fb6be8df","avatarUrl":"https://cdn-avatars.huggingface.co/v1/production/uploads/62596f9e1c0a084224b93e00/1NVyAAh-WfULT4i_LWZNB.png","fullname":"FineData","name":"HuggingFaceFW","type":"org","isHf":false,"isHfAdmin":false,"isMod":false,"plan":"team","followerCount":1383,"isUserFollowing":false},"items":[{"_id":"685de381de4c986d3fb8de33","position":0,"type":"paper","id":"2506.20920","title":"FineWeb2: One Pipeline to Scale Them All -- Adapting Pre-Training Data\n Processing to Every 
Language","thumbnailUrl":"https://cdn-thumbnails.huggingface.co/social-thumbnails/papers/2506.20920.png","upvotes":77,"publishedAt":"2025-06-26T01:01:47.000Z","isUpvotedByUser":false},{"_id":"6755659464f2ebb9dff28aa8","position":1,"type":"dataset","author":"HuggingFaceFW","downloads":74224,"gated":false,"id":"HuggingFaceFW/fineweb-2","lastModified":"2025-10-27T18:32:07.000Z","datasetsServerInfo":{"viewer":"viewer","numRows":4484929995,"libraries":[],"formats":[],"modalities":["tabular","text"]},"private":false,"repoType":"dataset","likes":753,"isLikedByUser":false,"isBenchmark":false},{"_id":"675565a29b57b4baf0b0de29","position":2,"type":"space","author":"HuggingFaceFW","authorData":{"_id":"657882cd90df9d85fb6be8df","avatarUrl":"https://cdn-avatars.huggingface.co/v1/production/uploads/62596f9e1c0a084224b93e00/1NVyAAh-WfULT4i_LWZNB.png","fullname":"FineData","name":"HuggingFaceFW","type":"org","isHf":false,"isHfAdmin":false,"isMod":false,"plan":"team","followerCount":1383,"isUserFollowing":false},"colorFrom":"blue","colorTo":"indigo","createdAt":"2024-10-14T19:38:04.000Z","emoji":"πŸ“","id":"HuggingFaceFW/blogpost-fine-tasks","lastModified":"2024-12-04T16:42:54.000Z","likes":88,"pinned":false,"private":false,"sdk":"docker","repoType":"space","runtime":{"stage":"RUNNING","hardware":{"current":"cpu-basic","requested":"cpu-basic"},"storage":null,"gcTimeout":172800,"errorMessage":null,"replicas":{"current":1,"requested":1},"devMode":false,"domains":[{"domain":"huggingfacefw-blogpost-fine-tasks.hf.space","stage":"READY"}],"sha":"33e1a4e3c2f0701362621709d9d2448cbddc4d6c"},"title":"Scaling FineWeb to 1000+ languages: Step 1: finding signal in 100s of evaluation tasks","isLikedByUser":false,"ai_short_description":"Evaluate multilingual models using FineTasks","ai_category":"Text 
Analysis","trendingScore":0,"tags":["docker","region:us"],"featured":false}],"position":4,"theme":"orange","private":false,"shareUrl":"https://hf.co/collections/HuggingFaceFW/fineweb2","upvotes":24,"isUpvotedByUser":false},{"slug":"HuggingFaceFW/fineweb-662458592d61edba3d2f245d","title":"🍷 FineWeb","description":"","gating":false,"lastUpdated":"2025-06-20T10:26:25.990Z","owner":{"_id":"657882cd90df9d85fb6be8df","avatarUrl":"https://cdn-avatars.huggingface.co/v1/production/uploads/62596f9e1c0a084224b93e00/1NVyAAh-WfULT4i_LWZNB.png","fullname":"FineData","name":"HuggingFaceFW","type":"org","isHf":false,"isHfAdmin":false,"isMod":false,"plan":"team","followerCount":1383,"isUserFollowing":false},"items":[{"_id":"665edf276bb2f5f35035e51f","position":0,"type":"space","author":"HuggingFaceFW","authorData":{"_id":"657882cd90df9d85fb6be8df","avatarUrl":"https://cdn-avatars.huggingface.co/v1/production/uploads/62596f9e1c0a084224b93e00/1NVyAAh-WfULT4i_LWZNB.png","fullname":"FineData","name":"HuggingFaceFW","type":"org","isHf":false,"isHfAdmin":false,"isMod":false,"plan":"team","followerCount":1383,"isUserFollowing":false},"colorFrom":"pink","colorTo":"red","createdAt":"2024-05-23T16:35:21.000Z","emoji":"🍷","id":"HuggingFaceFW/blogpost-fineweb-v1","lastModified":"2024-12-18T18:32:39.000Z","likes":1295,"pinned":false,"private":false,"sdk":"static","repoType":"space","runtime":{"stage":"RUNNING","hardware":{"current":null,"requested":null},"storage":null,"replicas":{"requested":1,"current":1}},"title":"FineWeb: decanting the web for the finest text data at scale","isLikedByUser":false,"ai_short_description":"Explore the FineWeb dataset and its creation 
process","ai_category":"Other","trendingScore":9,"tags":["static","region:us"],"featured":true},{"_id":"6624586f9999cbbc782646e4","position":1,"type":"dataset","author":"HuggingFaceFW","downloads":197213,"gated":false,"id":"HuggingFaceFW/fineweb","lastModified":"2025-07-11T20:16:53.000Z","datasetsServerInfo":{"viewer":"viewer","numRows":52453695892,"libraries":[],"formats":[],"modalities":["tabular","text"]},"private":false,"repoType":"dataset","likes":2666,"isLikedByUser":false,"isBenchmark":false},{"_id":"6658630db7d3130100209ee4","position":2,"type":"dataset","author":"HuggingFaceFW","downloads":262345,"gated":false,"id":"HuggingFaceFW/fineweb-edu","lastModified":"2025-07-11T20:16:53.000Z","datasetsServerInfo":{"viewer":"viewer","numRows":3496736741,"libraries":["datasets","dask","polars","mlcroissant"],"formats":["parquet"],"modalities":["tabular","text"]},"private":false,"repoType":"dataset","likes":953,"isLikedByUser":false,"isBenchmark":false},{"_id":"66584e2de3e6949bb93aa539","position":3,"type":"dataset","author":"HuggingFaceFW","downloads":9253,"gated":false,"id":"HuggingFaceFW/fineweb-edu-score-2","lastModified":"2025-07-11T20:16:52.000Z","datasetsServerInfo":{"viewer":"viewer","numRows":13892422290,"libraries":["datasets","dask","mlcroissant","polars"],"formats":["parquet"],"modalities":["tabular","text"]},"private":false,"repoType":"dataset","likes":84,"isLikedByUser":false,"isBenchmark":false}],"position":6,"theme":"pink","private":false,"shareUrl":"https://hf.co/collections/HuggingFaceFW/fineweb","upvotes":30,"isUpvotedByUser":false},{"slug":"HuggingFaceFW/fineweb-edu-6659c3f3d399d0e1d648adfd","title":"πŸ“š FineWeb-Edu","description":"FineWeb-Edu datasets, classifier and ablation 
model","gating":false,"lastUpdated":"2024-06-12T09:41:59.930Z","owner":{"_id":"657882cd90df9d85fb6be8df","avatarUrl":"https://cdn-avatars.huggingface.co/v1/production/uploads/62596f9e1c0a084224b93e00/1NVyAAh-WfULT4i_LWZNB.png","fullname":"FineData","name":"HuggingFaceFW","type":"org","isHf":false,"isHfAdmin":false,"isMod":false,"plan":"team","followerCount":1383,"isUserFollowing":false},"items":[{"_id":"6659c4415d5cb57cbb7bdbd0","position":0,"type":"dataset","author":"HuggingFaceFW","downloads":262345,"gated":false,"id":"HuggingFaceFW/fineweb-edu","lastModified":"2025-07-11T20:16:53.000Z","datasetsServerInfo":{"viewer":"viewer","numRows":3496736741,"libraries":["datasets","dask","polars","mlcroissant"],"formats":["parquet"],"modalities":["tabular","text"]},"private":false,"repoType":"dataset","likes":953,"isLikedByUser":false,"isBenchmark":false},{"_id":"6659c44bc1a1c789c9963b07","position":1,"type":"dataset","author":"HuggingFaceFW","downloads":9253,"gated":false,"id":"HuggingFaceFW/fineweb-edu-score-2","lastModified":"2025-07-11T20:16:52.000Z","datasetsServerInfo":{"viewer":"viewer","numRows":13892422290,"libraries":["datasets","dask","mlcroissant","polars"],"formats":["parquet"],"modalities":["tabular","text"]},"private":false,"repoType":"dataset","likes":84,"isLikedByUser":false,"isBenchmark":false},{"_id":"6659c42a9a9fd35b5cd170ec","position":2,"type":"model","author":"HuggingFaceFW","authorData":{"_id":"657882cd90df9d85fb6be8df","avatarUrl":"https://cdn-avatars.huggingface.co/v1/production/uploads/62596f9e1c0a084224b93e00/1NVyAAh-WfULT4i_LWZNB.png","fullname":"FineData","name":"HuggingFaceFW","type":"org","isHf":false,"isHfAdmin":false,"isMod":false,"plan":"team","followerCount":1383,"isUserFollowing":false},"downloads":4890,"gated":false,"id":"HuggingFaceFW/fineweb-edu-classifier","availableInferenceProviders":[{"provider":"hf-inference","modelStatus":"error","providerStatus":"live","providerId":"HuggingFaceFW/fineweb-edu-classifier","task":"text-classificati
on","adapterWeightsPath":"model.safetensors","isCheapestPricingOutput":false,"isFastestThroughput":false,"isModelAuthor":false}],"lastModified":"2024-11-17T15:00:11.000Z","likes":206,"pipeline_tag":"text-classification","private":false,"repoType":"model","isLikedByUser":false,"widgetOutputUrls":[],"numParameters":109483009},{"_id":"6659c43072a584ee6d7b77af","position":3,"type":"model","author":"HuggingFaceFW","authorData":{"_id":"657882cd90df9d85fb6be8df","avatarUrl":"https://cdn-avatars.huggingface.co/v1/production/uploads/62596f9e1c0a084224b93e00/1NVyAAh-WfULT4i_LWZNB.png","fullname":"FineData","name":"HuggingFaceFW","type":"org","isHf":false,"isHfAdmin":false,"isMod":false,"plan":"team","followerCount":1383,"isUserFollowing":false},"downloads":410,"gated":false,"id":"HuggingFaceFW/ablation-model-fineweb-edu","availableInferenceProviders":[],"lastModified":"2024-06-11T12:00:27.000Z","likes":20,"pipeline_tag":"text-generation","private":false,"repoType":"model","isLikedByUser":false,"widgetOutputUrls":[],"numParameters":1713670144}],"position":7,"theme":"blue","private":false,"shareUrl":"https://hf.co/collections/HuggingFaceFW/fineweb-edu","upvotes":20,"isUpvotedByUser":false},{"slug":"HuggingFaceFW/dataset-comparison-models-662457b0d213e8c14fe47f32","title":"πŸ“€ Dataset comparison models","description":"1.8B models trained on 350BT to compare different pretraining 
datasets","gating":false,"lastUpdated":"2024-06-12T09:41:54.615Z","owner":{"_id":"657882cd90df9d85fb6be8df","avatarUrl":"https://cdn-avatars.huggingface.co/v1/production/uploads/62596f9e1c0a084224b93e00/1NVyAAh-WfULT4i_LWZNB.png","fullname":"FineData","name":"HuggingFaceFW","type":"org","isHf":false,"isHfAdmin":false,"isMod":false,"plan":"team","followerCount":1383,"isUserFollowing":false},"items":[{"_id":"66584dc1c6e364c8ecca279f","position":0,"type":"model","author":"HuggingFaceFW","authorData":{"_id":"657882cd90df9d85fb6be8df","avatarUrl":"https://cdn-avatars.huggingface.co/v1/production/uploads/62596f9e1c0a084224b93e00/1NVyAAh-WfULT4i_LWZNB.png","fullname":"FineData","name":"HuggingFaceFW","type":"org","isHf":false,"isHfAdmin":false,"isMod":false,"plan":"team","followerCount":1383,"isUserFollowing":false},"downloads":410,"gated":false,"id":"HuggingFaceFW/ablation-model-fineweb-edu","availableInferenceProviders":[],"lastModified":"2024-06-11T12:00:27.000Z","likes":20,"pipeline_tag":"text-generation","private":false,"repoType":"model","isLikedByUser":false,"widgetOutputUrls":[],"numParameters":1713670144},{"_id":"662457d184db70bddd3a398c","position":1,"type":"model","author":"HuggingFaceFW","authorData":{"_id":"657882cd90df9d85fb6be8df","avatarUrl":"https://cdn-avatars.huggingface.co/v1/production/uploads/62596f9e1c0a084224b93e00/1NVyAAh-WfULT4i_LWZNB.png","fullname":"FineData","name":"HuggingFaceFW","type":"org","isHf":false,"isHfAdmin":false,"isMod":false,"plan":"team","followerCount":1383,"isUserFollowing":false},"downloads":24,"gated":false,"id":"HuggingFaceFW/ablation-model-fineweb-v1","availableInferenceProviders":[],"lastModified":"2024-04-25T08:32:46.000Z","likes":14,"pipeline_tag":"text-generation","private":false,"repoType":"model","isLikedByUser":false,"widgetOutputUrls":[],"numParameters":1713670144},{"_id":"662457f0ff3bcf1c6dd937ad","position":2,"type":"model","author":"HuggingFaceFW","authorData":{"_id":"657882cd90df9d85fb6be8df","avatarUrl":"https:/
/cdn-avatars.huggingface.co/v1/production/uploads/62596f9e1c0a084224b93e00/1NVyAAh-WfULT4i_LWZNB.png","fullname":"FineData","name":"HuggingFaceFW","type":"org","isHf":false,"isHfAdmin":false,"isMod":false,"plan":"team","followerCount":1383,"isUserFollowing":false},"downloads":4,"gated":false,"id":"HuggingFaceFW/ablation-model-refinedweb","availableInferenceProviders":[],"lastModified":"2024-04-25T08:33:53.000Z","likes":3,"pipeline_tag":"text-generation","private":false,"repoType":"model","isLikedByUser":false,"widgetOutputUrls":[],"numParameters":1713670144},{"_id":"662457dad323727f8100ddc9","position":3,"type":"model","author":"HuggingFaceFW","authorData":{"_id":"657882cd90df9d85fb6be8df","avatarUrl":"https://cdn-avatars.huggingface.co/v1/production/uploads/62596f9e1c0a084224b93e00/1NVyAAh-WfULT4i_LWZNB.png","fullname":"FineData","name":"HuggingFaceFW","type":"org","isHf":false,"isHfAdmin":false,"isMod":false,"plan":"team","followerCount":1383,"isUserFollowing":false},"downloads":1,"gated":false,"id":"HuggingFaceFW/ablation-model-c4","availableInferenceProviders":[],"lastModified":"2024-04-25T08:34:07.000Z","likes":4,"pipeline_tag":"text-generation","private":false,"repoType":"model","isLikedByUser":false,"widgetOutputUrls":[],"numParameters":1713670144}],"position":8,"theme":"orange","private":false,"shareUrl":"https://hf.co/collections/HuggingFaceFW/dataset-comparison-models","upvotes":42,"isUpvotedByUser":false},{"slug":"HuggingFaceFW/fineweb-v1-data-experiments-665ed849020d8b66a5d9896f","title":"πŸ§ͺ FineWeb v1 data experiments","description":"Ablation models trained for our data 
experiments.","gating":false,"lastUpdated":"2024-06-12T09:41:49.383Z","owner":{"_id":"657882cd90df9d85fb6be8df","avatarUrl":"https://cdn-avatars.huggingface.co/v1/production/uploads/62596f9e1c0a084224b93e00/1NVyAAh-WfULT4i_LWZNB.png","fullname":"FineData","name":"HuggingFaceFW","type":"org","isHf":false,"isHfAdmin":false,"isMod":false,"plan":"team","followerCount":1383,"isUserFollowing":false},"items":[{"_id":"665ed88c96b66b86685e64c8","position":0,"type":"model","note":{"html":"Ablation trained to compare warc+trafilatura text extraction with the default wet extraction from CommonCrawl [28BT]","text":"Ablation trained to compare warc+trafilatura text extraction with the default wet extraction from CommonCrawl [28BT]"},"author":"HuggingFaceFW","authorData":{"_id":"657882cd90df9d85fb6be8df","avatarUrl":"https://cdn-avatars.huggingface.co/v1/production/uploads/62596f9e1c0a084224b93e00/1NVyAAh-WfULT4i_LWZNB.png","fullname":"FineData","name":"HuggingFaceFW","type":"org","isHf":false,"isHfAdmin":false,"isMod":false,"plan":"team","followerCount":1383,"isUserFollowing":false},"downloads":1,"gated":false,"id":"HuggingFaceFW/ablation-exp-textext-warc_trafilatura-28BT","availableInferenceProviders":[],"lastModified":"2024-06-04T20:44:35.000Z","likes":1,"pipeline_tag":"text-generation","private":false,"repoType":"model","isLikedByUser":false,"widgetOutputUrls":[],"numParameters":1713670144},{"_id":"665ed8860eb022e5366cbdcf","position":1,"type":"model","note":{"html":"Ablation trained to compare warc+trafilatura text extraction with the default wet extraction from CommonCrawl [28BT]","text":"Ablation trained to compare warc+trafilatura text extraction with the default wet extraction from CommonCrawl 
[28BT]"},"author":"HuggingFaceFW","authorData":{"_id":"657882cd90df9d85fb6be8df","avatarUrl":"https://cdn-avatars.huggingface.co/v1/production/uploads/62596f9e1c0a084224b93e00/1NVyAAh-WfULT4i_LWZNB.png","fullname":"FineData","name":"HuggingFaceFW","type":"org","isHf":false,"isHfAdmin":false,"isMod":false,"plan":"team","followerCount":1383,"isUserFollowing":false},"downloads":1,"gated":false,"id":"HuggingFaceFW/ablation-exp-textext-wet-28BT","availableInferenceProviders":[],"lastModified":"2024-06-04T20:46:00.000Z","likes":0,"pipeline_tag":"text-generation","private":false,"repoType":"model","isLikedByUser":false,"widgetOutputUrls":[],"numParameters":1713670144},{"_id":"665ed8942d0161db305c12af","position":2,"type":"model","note":{"html":"Trained on all CommonCrawl dumps after text extraction and our base filtering [350BT]","text":"Trained on all CommonCrawl dumps after text extraction and our base filtering [350BT]"},"author":"HuggingFaceFW","authorData":{"_id":"657882cd90df9d85fb6be8df","avatarUrl":"https://cdn-avatars.huggingface.co/v1/production/uploads/62596f9e1c0a084224b93e00/1NVyAAh-WfULT4i_LWZNB.png","fullname":"FineData","name":"HuggingFaceFW","type":"org","isHf":false,"isHfAdmin":false,"isMod":false,"plan":"team","followerCount":1383,"isUserFollowing":false},"downloads":1,"gated":false,"id":"HuggingFaceFW/ablation-exp-fw-base_filtering-350BT","availableInferenceProviders":[],"lastModified":"2024-06-04T06:26:34.000Z","likes":0,"pipeline_tag":"text-generation","private":false,"repoType":"model","isLikedByUser":false,"widgetOutputUrls":[],"numParameters":1713670144},{"_id":"665ed88addb96a4d72827506","position":3,"type":"model","note":{"html":"Trained on all CommonCrawl dumps after global MinHash deduplication [350BT]","text":"Trained on all CommonCrawl dumps after global MinHash deduplication 
[350BT]"},"author":"HuggingFaceFW","authorData":{"_id":"657882cd90df9d85fb6be8df","avatarUrl":"https://cdn-avatars.huggingface.co/v1/production/uploads/62596f9e1c0a084224b93e00/1NVyAAh-WfULT4i_LWZNB.png","fullname":"FineData","name":"HuggingFaceFW","type":"org","isHf":false,"isHfAdmin":false,"isMod":false,"plan":"team","followerCount":1383,"isUserFollowing":false},"downloads":1,"gated":false,"id":"HuggingFaceFW/ablation-exp-dedup-global_minhash-350BT","availableInferenceProviders":[],"lastModified":"2024-06-04T03:10:19.000Z","likes":0,"pipeline_tag":"text-generation","private":false,"repoType":"model","isLikedByUser":false,"widgetOutputUrls":[],"numParameters":1713670144}],"position":9,"theme":"green","private":false,"shareUrl":"https://hf.co/collections/HuggingFaceFW/fineweb-v1-data-experiments","upvotes":8,"isUpvotedByUser":false}],"datasets":[{"author":"HuggingFaceFW","downloads":15,"gated":false,"id":"HuggingFaceFW/fineweb_100BT-shuffled","lastModified":"2026-02-15T20:31:26.000Z","datasetsServerInfo":{"viewer":"viewer","numRows":160677091,"libraries":["datasets","dask","polars","mlcroissant"],"formats":["parquet","optimized-parquet"],"modalities":["tabular","text"]},"private":false,"repoType":"dataset","likes":0,"isLikedByUser":false,"isBenchmark":false},{"author":"HuggingFaceFW","downloads":159,"gated":false,"id":"HuggingFaceFW/fineweb_edu_100BT-shuffled","lastModified":"2026-02-15T20:23:53.000Z","datasetsServerInfo":{"viewer":"viewer","numRows":102063987,"libraries":["datasets","dask","polars","mlcroissant"],"formats":["parquet","optimized-parquet"],"modalities":["tabular","text"]},"private":false,"repoType":"dataset","likes":0,"isLikedByUser":false,"isBenchmark":false},{"author":"HuggingFaceFW","downloads":28,"gated":false,"id":"HuggingFaceFW/finepdfs_100BT-shuffled","lastModified":"2026-02-15T09:02:43.000Z","datasetsServerInfo":{"viewer":"viewer","numRows":14640807,"libraries":["datasets","dask","polars","mlcroissant"],"formats":["parquet","optimized-parquet
"],"modalities":["tabular","text"]},"private":false,"repoType":"dataset","likes":0,"isLikedByUser":false,"isBenchmark":false},{"author":"HuggingFaceFW","downloads":12,"gated":false,"id":"HuggingFaceFW/dclm_100BT-shuffled","lastModified":"2026-02-15T08:52:11.000Z","datasetsServerInfo":{"viewer":"viewer","numRows":89269902,"libraries":["datasets","dask","polars","mlcroissant"],"formats":["parquet","optimized-parquet"],"modalities":["tabular","text"]},"private":false,"repoType":"dataset","likes":0,"isLikedByUser":false,"isBenchmark":false},{"author":"HuggingFaceFW","downloads":24,"gated":false,"id":"HuggingFaceFW/finepdfs_50BT-dclm_30BT-fineweb_edu_20BT-shuffled","lastModified":"2026-02-15T08:49:36.000Z","datasetsServerInfo":{"viewer":"viewer","numRows":62119279,"libraries":["datasets","dask","polars","mlcroissant"],"formats":["parquet","optimized-parquet"],"modalities":["text"]},"private":false,"repoType":"dataset","likes":2,"isLikedByUser":false,"isBenchmark":false},{"author":"HuggingFaceFW","downloads":43,"gated":false,"id":"HuggingFaceFW/finepdfs_edu_50BT-dclm_30BT-fineweb_edu_20BT-shuffled","lastModified":"2026-02-15T08:46:58.000Z","datasetsServerInfo":{"viewer":"viewer","numRows":56051218,"libraries":["datasets","dask","polars","mlcroissant"],"formats":["parquet","optimized-parquet"],"modalities":["text"]},"private":false,"repoType":"dataset","likes":0,"isLikedByUser":false,"isBenchmark":false},{"author":"HuggingFaceFW","downloads":25,"gated":false,"id":"HuggingFaceFW/finepdfs_edu_100BT-shuffled","lastModified":"2026-02-15T08:45:50.000Z","datasetsServerInfo":{"viewer":"viewer","numRows":17781952,"libraries":["datasets","dask","polars","mlcroissant"],"formats":["parquet","optimized-parquet"],"modalities":["tabular","text"]},"private":false,"repoType":"dataset","likes":0,"isLikedByUser":false,"isBenchmark":false},{"author":"HuggingFaceFW","downloads":776,"gated":false,"id":"HuggingFaceFW/finepdfs_50BT-dclm_30BT-fineweb_edu_20BT","lastModified":"2026-02-13T17:26:11.
000Z","datasetsServerInfo":{"viewer":"viewer","numRows":62119279,"libraries":["datasets","dask","polars","mlcroissant"],"formats":["parquet"],"modalities":["text"]},"private":false,"repoType":"dataset","likes":0,"isLikedByUser":false,"isBenchmark":false},{"author":"HuggingFaceFW","downloads":4411,"gated":false,"id":"HuggingFaceFW/finepdfs_edu_50BT-dclm_30BT-fineweb_edu_20BT","lastModified":"2026-02-13T14:14:41.000Z","datasetsServerInfo":{"viewer":"viewer","numRows":56051218,"libraries":["datasets","dask","polars","mlcroissant"],"formats":["parquet"],"modalities":["text"]},"private":false,"repoType":"dataset","likes":0,"isLikedByUser":false,"isBenchmark":false},{"author":"HuggingFaceFW","downloads":823,"gated":false,"id":"HuggingFaceFW/finepdfs_100BT","lastModified":"2026-02-12T15:42:28.000Z","datasetsServerInfo":{"viewer":"viewer","numRows":29904625,"libraries":["datasets","dask","polars","mlcroissant"],"formats":["parquet"],"modalities":["tabular","text"]},"private":false,"repoType":"dataset","likes":0,"isLikedByUser":false,"isBenchmark":false}],"models":[{"author":"HuggingFaceFW","authorData":{"_id":"657882cd90df9d85fb6be8df","avatarUrl":"https://cdn-avatars.huggingface.co/v1/production/uploads/62596f9e1c0a084224b93e00/1NVyAAh-WfULT4i_LWZNB.png","fullname":"FineData","name":"HuggingFaceFW","type":"org","isHf":false,"isHfAdmin":false,"isMod":false,"plan":"team","followerCount":1383,"isUserFollowing":false},"downloads":1647,"gated":false,"id":"HuggingFaceFW/finepdfs_edu_classifier_eng_Latn","availableInferenceProviders":[],"lastModified":"2025-11-11T17:58:18.000Z","likes":2,"private":false,"repoType":"model","isLikedByUser":false,"numParameters":395832321},{"author":"HuggingFaceFW","authorData":{"_id":"657882cd90df9d85fb6be8df","avatarUrl":"https://cdn-avatars.huggingface.co/v1/production/uploads/62596f9e1c0a084224b93e00/1NVyAAh-WfULT4i_LWZNB.png","fullname":"FineData","name":"HuggingFaceFW","type":"org","isHf":false,"isHfAdmin":false,"isMod":false,"plan":"team","fo
llowerCount":1383,"isUserFollowing":false},"downloads":21,"gated":false,"id":"HuggingFaceFW/finepdfs_dclm_classifier_eng_Latn","availableInferenceProviders":[],"lastModified":"2025-10-06T21:25:57.000Z","likes":0,"private":false,"repoType":"model","isLikedByUser":false,"numParameters":395832321},{"author":"HuggingFaceFW","authorData":{"_id":"657882cd90df9d85fb6be8df","avatarUrl":"https://cdn-avatars.huggingface.co/v1/production/uploads/62596f9e1c0a084224b93e00/1NVyAAh-WfULT4i_LWZNB.png","fullname":"FineData","name":"HuggingFaceFW","type":"org","isHf":false,"isHfAdmin":false,"isMod":false,"plan":"team","followerCount":1383,"isUserFollowing":false},"downloads":13,"gated":false,"id":"HuggingFaceFW/finepdfs_edu_classifier_v2_eng_Latn","availableInferenceProviders":[],"lastModified":"2025-10-06T21:03:48.000Z","likes":0,"private":false,"repoType":"model","isLikedByUser":false,"numParameters":395832321},{"author":"HuggingFaceFW","authorData":{"_id":"657882cd90df9d85fb6be8df","avatarUrl":"https://cdn-avatars.huggingface.co/v1/production/uploads/62596f9e1c0a084224b93e00/1NVyAAh-WfULT4i_LWZNB.png","fullname":"FineData","name":"HuggingFaceFW","type":"org","isHf":false,"isHfAdmin":false,"isMod":false,"plan":"team","followerCount":1383,"isUserFollowing":false},"downloads":8,"gated":false,"id":"HuggingFaceFW/finepdfs_ocr_quality_classifier_eng_Latn","availableInferenceProviders":[],"lastModified":"2025-10-06T20:50:57.000Z","likes":0,"private":false,"repoType":"model","isLikedByUser":false,"numParameters":395832321},{"author":"HuggingFaceFW","authorData":{"_id":"657882cd90df9d85fb6be8df","avatarUrl":"https://cdn-avatars.huggingface.co/v1/production/uploads/62596f9e1c0a084224b93e00/1NVyAAh-WfULT4i_LWZNB.png","fullname":"FineData","name":"HuggingFaceFW","type":"org","isHf":false,"isHfAdmin":false,"isMod":false,"plan":"team","followerCount":1383,"isUserFollowing":false},"downloads":0,"gated":false,"id":"HuggingFaceFW/finepdfs_edu_classifier_guj_Gujr","availableInferenceProviders":[],"
lastModified":"2025-10-06T05:59:01.000Z","likes":0,"private":false,"repoType":"model","isLikedByUser":false,"numParameters":307531009},{"author":"HuggingFaceFW","authorData":{"_id":"657882cd90df9d85fb6be8df","avatarUrl":"https://cdn-avatars.huggingface.co/v1/production/uploads/62596f9e1c0a084224b93e00/1NVyAAh-WfULT4i_LWZNB.png","fullname":"FineData","name":"HuggingFaceFW","type":"org","isHf":false,"isHfAdmin":false,"isMod":false,"plan":"team","followerCount":1383,"isUserFollowing":false},"downloads":2,"gated":false,"id":"HuggingFaceFW/finepdfs_edu_classifier_nno_Latn","availableInferenceProviders":[],"lastModified":"2025-10-06T05:58:42.000Z","likes":0,"private":false,"repoType":"model","isLikedByUser":false,"numParameters":307531009},{"author":"HuggingFaceFW","authorData":{"_id":"657882cd90df9d85fb6be8df","avatarUrl":"https://cdn-avatars.huggingface.co/v1/production/uploads/62596f9e1c0a084224b93e00/1NVyAAh-WfULT4i_LWZNB.png","fullname":"FineData","name":"HuggingFaceFW","type":"org","isHf":false,"isHfAdmin":false,"isMod":false,"plan":"team","followerCount":1383,"isUserFollowing":false},"downloads":3,"gated":false,"id":"HuggingFaceFW/finepdfs_edu_classifier_kaz_Cyrl","availableInferenceProviders":[],"lastModified":"2025-10-06T05:58:24.000Z","likes":0,"private":false,"repoType":"model","isLikedByUser":false,"numParameters":307531009},{"author":"HuggingFaceFW","authorData":{"_id":"657882cd90df9d85fb6be8df","avatarUrl":"https://cdn-avatars.huggingface.co/v1/production/uploads/62596f9e1c0a084224b93e00/1NVyAAh-WfULT4i_LWZNB.png","fullname":"FineData","name":"HuggingFaceFW","type":"org","isHf":false,"isHfAdmin":false,"isMod":false,"plan":"team","followerCount":1383,"isUserFollowing":false},"downloads":1,"gated":false,"id":"HuggingFaceFW/finepdfs_edu_classifier_tam_Taml","availableInferenceProviders":[],"lastModified":"2025-10-06T05:58:05.000Z","likes":0,"private":false,"repoType":"model","isLikedByUser":false,"numParameters":307531009},{"author":"HuggingFaceFW","authorData"
:{"_id":"657882cd90df9d85fb6be8df","avatarUrl":"https://cdn-avatars.huggingface.co/v1/production/uploads/62596f9e1c0a084224b93e00/1NVyAAh-WfULT4i_LWZNB.png","fullname":"FineData","name":"HuggingFaceFW","type":"org","isHf":false,"isHfAdmin":false,"isMod":false,"plan":"team","followerCount":1383,"isUserFollowing":false},"downloads":2,"gated":false,"id":"HuggingFaceFW/finepdfs_edu_classifier_azj_Latn","availableInferenceProviders":[],"lastModified":"2025-10-06T05:57:46.000Z","likes":0,"private":false,"repoType":"model","isLikedByUser":false,"numParameters":307531009},{"author":"HuggingFaceFW","authorData":{"_id":"657882cd90df9d85fb6be8df","avatarUrl":"https://cdn-avatars.huggingface.co/v1/production/uploads/62596f9e1c0a084224b93e00/1NVyAAh-WfULT4i_LWZNB.png","fullname":"FineData","name":"HuggingFaceFW","type":"org","isHf":false,"isHfAdmin":false,"isMod":false,"plan":"team","followerCount":1383,"isUserFollowing":false},"downloads":3,"gated":false,"id":"HuggingFaceFW/finepdfs_edu_classifier_afr_Latn","availableInferenceProviders":[],"lastModified":"2025-10-06T05:57:28.000Z","likes":0,"private":false,"repoType":"model","isLikedByUser":false,"numParameters":307531009}],"paperPreviews":[{"_id":"2506.20920","title":"FineWeb2: One Pipeline to Scale Them All -- Adapting Pre-Training Data\n Processing to Every Language","id":"2506.20920","thumbnailUrl":"https://cdn-thumbnails.huggingface.co/social-thumbnails/papers/2506.20920.png"},{"_id":"2406.17557","title":"The FineWeb Datasets: Decanting the Web for the Finest Text Data at\n 
Scale","id":"2406.17557","thumbnailUrl":"https://cdn-thumbnails.huggingface.co/social-thumbnails/papers/2406.17557.png"}],"spaces":[{"author":"HuggingFaceFW","authorData":{"_id":"657882cd90df9d85fb6be8df","avatarUrl":"https://cdn-avatars.huggingface.co/v1/production/uploads/62596f9e1c0a084224b93e00/1NVyAAh-WfULT4i_LWZNB.png","fullname":"FineData","name":"HuggingFaceFW","type":"org","isHf":false,"isHfAdmin":false,"isMod":false,"plan":"team","followerCount":1383,"isUserFollowing":false},"colorFrom":"blue","colorTo":"indigo","createdAt":"2026-01-05T01:17:09.000Z","emoji":"πŸ“„","id":"HuggingFaceFW/FinePDFsBlog","lastModified":"2026-01-07T17:29:13.000Z","likes":66,"pinned":false,"private":false,"sdk":"docker","repoType":"space","runtime":{"stage":"RUNNING","hardware":{"current":"cpu-basic","requested":"cpu-basic"},"storage":null,"gcTimeout":172800,"errorMessage":null,"replicas":{"current":1,"requested":1},"devMode":false,"domains":[{"domain":"huggingfacefw-finepdfsblog.hf.space","stage":"READY"}],"sha":"a6fd303de18471af4fee3f017748ebe41d7dd722"},"title":"FinePDFs: Liberating 3T of the finest tokens from 
PDFs","isLikedByUser":false,"trendingScore":0,"tags":["docker","research","dataset","pdf","region:us"],"featured":true},{"author":"HuggingFaceFW","authorData":{"_id":"657882cd90df9d85fb6be8df","avatarUrl":"https://cdn-avatars.huggingface.co/v1/production/uploads/62596f9e1c0a084224b93e00/1NVyAAh-WfULT4i_LWZNB.png","fullname":"FineData","name":"HuggingFaceFW","type":"org","isHf":false,"isHfAdmin":false,"isMod":false,"plan":"team","followerCount":1383,"isUserFollowing":false},"colorFrom":"blue","colorTo":"gray","createdAt":"2025-10-16T13:28:52.000Z","emoji":"🌐","id":"HuggingFaceFW/finewiki-viewer","lastModified":"2025-10-16T15:37:04.000Z","likes":11,"pinned":false,"private":false,"sdk":"gradio","repoType":"space","runtime":{"stage":"RUNNING","hardware":{"current":"cpu-basic","requested":"cpu-basic"},"storage":null,"gcTimeout":172800,"errorMessage":null,"replicas":{"current":1,"requested":1},"devMode":false,"domains":[{"domain":"huggingfacefw-finewiki-viewer.hf.space","stage":"READY"}],"sha":"cb8932ed5d5285333862787816b64be8bd0dce84"},"shortDescription":"Viewer to explore the finewiki dataset","title":"FineWiki Viewer","isLikedByUser":false,"ai_short_description":"Browse Wikipedia articles in multiple languages","ai_category":"Document 
Analysis","trendingScore":0,"tags":["gradio","region:us"],"featured":false},{"author":"HuggingFaceFW","authorData":{"_id":"657882cd90df9d85fb6be8df","avatarUrl":"https://cdn-avatars.huggingface.co/v1/production/uploads/62596f9e1c0a084224b93e00/1NVyAAh-WfULT4i_LWZNB.png","fullname":"FineData","name":"HuggingFaceFW","type":"org","isHf":false,"isHfAdmin":false,"isMod":false,"plan":"team","followerCount":1383,"isUserFollowing":false},"colorFrom":"pink","colorTo":"red","createdAt":"2024-05-23T16:35:21.000Z","emoji":"🍷","id":"HuggingFaceFW/blogpost-fineweb-v1","lastModified":"2024-12-18T18:32:39.000Z","likes":1295,"pinned":false,"private":false,"sdk":"static","repoType":"space","runtime":{"stage":"RUNNING","hardware":{"current":null,"requested":null},"storage":null,"replicas":{"requested":1,"current":1}},"title":"FineWeb: decanting the web for the finest text data at scale","isLikedByUser":false,"ai_short_description":"Explore the FineWeb dataset and its creation process","ai_category":"Other","trendingScore":9,"tags":["static","region:us"],"featured":true},{"author":"HuggingFaceFW","authorData":{"_id":"657882cd90df9d85fb6be8df","avatarUrl":"https://cdn-avatars.huggingface.co/v1/production/uploads/62596f9e1c0a084224b93e00/1NVyAAh-WfULT4i_LWZNB.png","fullname":"FineData","name":"HuggingFaceFW","type":"org","isHf":false,"isHfAdmin":false,"isMod":false,"plan":"team","followerCount":1383,"isUserFollowing":false},"colorFrom":"blue","colorTo":"indigo","createdAt":"2024-10-14T19:38:04.000Z","emoji":"πŸ“","id":"HuggingFaceFW/blogpost-fine-tasks","lastModified":"2024-12-04T16:42:54.000Z","likes":88,"pinned":false,"private":false,"sdk":"docker","repoType":"space","runtime":{"stage":"RUNNING","hardware":{"current":"cpu-basic","requested":"cpu-basic"},"storage":null,"gcTimeout":172800,"errorMessage":null,"replicas":{"current":1,"requested":1},"devMode":false,"domains":[{"domain":"huggingfacefw-blogpost-fine-tasks.hf.space","stage":"READY"}],"sha":"33e1a4e3c2f0701362621709d9d2448cbdd
c4d6c"},"title":"Scaling FineWeb to 1000+ languages: Step 1: finding signal in 100s of evaluation tasks","isLikedByUser":false,"ai_short_description":"Evaluate multilingual models using FineTasks","ai_category":"Text Analysis","trendingScore":0,"tags":["docker","region:us"],"featured":false},{"author":"HuggingFaceFW","authorData":{"_id":"657882cd90df9d85fb6be8df","avatarUrl":"https://cdn-avatars.huggingface.co/v1/production/uploads/62596f9e1c0a084224b93e00/1NVyAAh-WfULT4i_LWZNB.png","fullname":"FineData","name":"HuggingFaceFW","type":"org","isHf":false,"isHfAdmin":false,"isMod":false,"plan":"team","followerCount":1383,"isUserFollowing":false},"colorFrom":"blue","colorTo":"green","createdAt":"2024-11-28T22:24:05.000Z","emoji":"🏒","id":"HuggingFaceFW/Tasks-Explorer","lastModified":"2024-11-29T12:05:17.000Z","likes":1,"pinned":false,"private":false,"sdk":"gradio","repoType":"space","runtime":{"stage":"BUILD_ERROR","hardware":{"current":null,"requested":"cpu-basic"},"storage":null,"gcTimeout":172800,"errorMessage":"Job failed with exit code: 1. 
Reason: cache miss: [run 1/3] LINK COPY --link ./ /app\ncache miss: [base 6/6] RUN pip install --no-cache-dir \tgradio[oauth]==4.38.1 \t\"uvicorn>=0.14.0\" \tspaces \"fastapi<0.113.0\"\ncache miss: [run 3/3] COPY --from=pipfreeze --link /pipfreeze/ /pipfreeze/\ncache miss: [run 1/3] COPY --link ./ /app\ncache miss: [base 5/6] RUN --mount=target=/tmp/requirements.txt,source=requirements.txt pip install --no-cache-dir -r /tmp/requirements.txt\ncache miss: [pipfreeze 1/2] RUN mkdir -p /pipfreeze\ncache miss: [run 2/3] RUN mkdir -p /home/user && ( [ -e /home/user/app ] || ln -s /app/ /home/user/app ) || true\ncache miss: [run 3/3] LINK COPY --from=pipfreeze --link /pipfreeze/ /pipfreeze/\ncache miss: [pipfreeze 2/2] RUN pip freeze > /pipfreeze/freeze.txt\n{\"total\":25,\"completed\":18,\"user_total\":14,\"user_cached\":4,\"user_completed\":7,\"user_cacheable\":13,\"from\":1,\"miss\":9,\"client_duration_ms\":25349}\n","replicas":{"requested":1},"devMode":false,"domains":[{"domain":"huggingfacefw-tasks-explorer.hf.space","stage":"READY"}]},"title":"Tasks Explorer","isLikedByUser":false,"ai_short_description":"Explore and analyze experiment results","ai_category":"Data Visualization","trendingScore":0,"tags":["gradio","region:us"],"featured":false}],"buckets":[],"numBuckets":0,"numDatasets":34,"numModels":105,"numSpaces":7,"lastOrgActivities":[{"time":"2026-02-15T22:43:37.383Z","user":"joelniklaus","userAvatarUrl":"https://cdn-avatars.huggingface.co/v1/production/uploads/1656078368144-5fae5f68b8423e1d80b8a988.jpeg","orgAvatarUrl":"https://cdn-avatars.huggingface.co/v1/production/uploads/62596f9e1c0a084224b93e00/1NVyAAh-WfULT4i_LWZNB.png","type":"collection","collection":{"id":"699244eb3a18b4874222cfe8","slug":"HuggingFaceFW/smol-data-699244eb3a18b4874222cfe8","title":"Smol-Data","description":"Tried and tested mixes for strong 
pretraining","lastUpdated":"2026-02-15T23:13:26.065Z","numberItems":14,"owner":{"_id":"657882cd90df9d85fb6be8df","avatarUrl":"https://cdn-avatars.huggingface.co/v1/production/uploads/62596f9e1c0a084224b93e00/1NVyAAh-WfULT4i_LWZNB.png","fullname":"FineData","name":"HuggingFaceFW","type":"org","isHf":false,"isHfAdmin":false,"isMod":false,"plan":"team","followerCount":1383,"isUserFollowing":false},"theme":"indigo","shareUrl":"https://hf.co/collections/HuggingFaceFW/smol-data","upvotes":1,"isUpvotedByUser":false},"org":"HuggingFaceFW"},{"time":"2026-02-15T22:42:59.653Z","user":"joelniklaus","userAvatarUrl":"https://cdn-avatars.huggingface.co/v1/production/uploads/1656078368144-5fae5f68b8423e1d80b8a988.jpeg","orgAvatarUrl":"https://cdn-avatars.huggingface.co/v1/production/uploads/62596f9e1c0a084224b93e00/1NVyAAh-WfULT4i_LWZNB.png","type":"collection","collection":{"id":"699244eb3a18b4874222cfe8","slug":"HuggingFaceFW/smol-data-699244eb3a18b4874222cfe8","title":"Smol-Data","description":"Tried and tested mixes for strong 
pretraining","lastUpdated":"2026-02-15T23:13:26.065Z","numberItems":14,"owner":{"_id":"657882cd90df9d85fb6be8df","avatarUrl":"https://cdn-avatars.huggingface.co/v1/production/uploads/62596f9e1c0a084224b93e00/1NVyAAh-WfULT4i_LWZNB.png","fullname":"FineData","name":"HuggingFaceFW","type":"org","isHf":false,"isHfAdmin":false,"isMod":false,"plan":"team","followerCount":1383,"isUserFollowing":false},"theme":"indigo","shareUrl":"https://hf.co/collections/HuggingFaceFW/smol-data","upvotes":1,"isUpvotedByUser":false},"org":"HuggingFaceFW"},{"time":"2026-02-15T22:42:49.803Z","user":"joelniklaus","userAvatarUrl":"https://cdn-avatars.huggingface.co/v1/production/uploads/1656078368144-5fae5f68b8423e1d80b8a988.jpeg","orgAvatarUrl":"https://cdn-avatars.huggingface.co/v1/production/uploads/62596f9e1c0a084224b93e00/1NVyAAh-WfULT4i_LWZNB.png","type":"collection","collection":{"id":"699244eb3a18b4874222cfe8","slug":"HuggingFaceFW/smol-data-699244eb3a18b4874222cfe8","title":"Smol-Data","description":"Tried and tested mixes for strong pretraining","lastUpdated":"2026-02-15T23:13:26.065Z","numberItems":14,"owner":{"_id":"657882cd90df9d85fb6be8df","avatarUrl":"https://cdn-avatars.huggingface.co/v1/production/uploads/62596f9e1c0a084224b93e00/1NVyAAh-WfULT4i_LWZNB.png","fullname":"FineData","name":"HuggingFaceFW","type":"org","isHf":false,"isHfAdmin":false,"isMod":false,"plan":"team","followerCount":1383,"isUserFollowing":false},"theme":"indigo","shareUrl":"https://hf.co/collections/HuggingFaceFW/smol-data","upvotes":1,"isUpvotedByUser":false},"org":"HuggingFaceFW"}],"acceptLanguages":["*"],"canReadRepos":false,"canReadSpaces":false,"blogPosts":[],"currentRepoPage":0,"filters":{},"paperView":false}">

AI & ML interests

We release large pre-training datasets to accelerate open LLM development. Part of the Hugging Face Science team (hf.co/science)

Recent Activity

joelniklaus updated a collection 4 days ago
Smol-Data
joelniklaus updated a collection 4 days ago
Smol-Data
joelniklaus updated a collection 4 days ago
Smol-Data
View all activity

🍷 FineData

This is the home of the 🍷 FineData team, a branch of the 🤗 Hugging Face Science Team releasing large scale pre-training datasets to accelerate open LLM development.