Files
fineweb-eda/nb.ipynb
2025-11-28 16:11:49 +05:30

358 lines
54 KiB
Plaintext
Raw Permalink Blame History

This file contains ambiguous Unicode characters
This file contains Unicode characters that might be confused with other characters. If you think that this is intentional, you can safely ignore this warning. Use the Escape button to reveal them.
{
"cells": [
{
"cell_type": "code",
"execution_count": 3,
"id": "ccfdbb24",
"metadata": {},
"outputs": [],
"source": [
"import os\n",
"from pprint import pprint\n",
"\n",
"import matplotlib.pyplot as plt\n",
"import pandas as pd\n",
"from datasets import load_dataset\n",
"from transformers import AutoTokenizer"
]
},
{
"cell_type": "code",
"execution_count": 4,
"id": "e21fae62",
"metadata": {},
"outputs": [],
"source": [
"# Directory that holds the dataset file (relative to the notebook).\n",
"OUTPUT_DIR = \"./dataset\"\n",
"\n",
"# Parquet file that is loaded as the `train` split.\n",
"FILE_PATH = os.path.join(\n",
"    OUTPUT_DIR,\n",
"    \"fineweb-6b.parquet\",\n",
")"
]
},
{
"cell_type": "code",
"execution_count": 5,
"id": "bb9c37b2",
"metadata": {},
"outputs": [
{
"name": "stderr",
"output_type": "stream",
"text": [
"Generating train split: 8548517 examples [00:20, 417303.96 examples/s]\n"
]
},
{
"data": {
"text/plain": [
"Dataset({\n",
" features: ['text', 'meta', 'id'],\n",
" num_rows: 8548517\n",
"})"
]
},
"execution_count": 5,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"# Read the local parquet file with the generic `parquet` builder and expose it\n",
"# directly as the `train` split.\n",
"dataset = load_dataset(\n",
"    \"parquet\",\n",
"    data_files={\"train\": FILE_PATH},\n",
"    split=\"train\",\n",
")\n",
"dataset"
]
},
{
"cell_type": "code",
"execution_count": 6,
"id": "917999bb",
"metadata": {},
"outputs": [
{
"data": {
"text/plain": [
"{'text': Value('string'),\n",
" 'meta': {'url': Value('string'),\n",
" 'dump': Value('string'),\n",
" 's_cluster': Value('int64'),\n",
" 'token_count': Value('int64')},\n",
" 'id': Value('string')}"
]
},
"execution_count": 6,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"# Show the column schema (names and value types) of the loaded split.\n",
"dataset.features"
]
},
{
"cell_type": "code",
"execution_count": 11,
"id": "1f6addf9",
"metadata": {},
"outputs": [
{
"name": "stdout",
"output_type": "stream",
"text": [
"{'id': '<urn:uuid:d66bc6fe-8477-4adf-b430-f6a558ccc8ff>',\n",
" 'meta': {'dump': None, 's_cluster': None, 'token_count': None, 'url': None},\n",
" 'text': 'How AP reported in all formats from tornado-stricken regionsMarch 8, '\n",
" '2012\\n'\n",
" 'When the first serious bout of tornadoes of 2012 blew through middle '\n",
" 'America in the middle of the night, they touched down in places '\n",
" 'hours from any AP bureau. Our closest video journalist was '\n",
" 'Chicago-based Robert Ray, who dropped his plans to travel to Georgia '\n",
" 'for Super Tuesday, booked several flights to the cities closest to '\n",
" 'the strikes and headed for the airport. Hed decide once there which '\n",
" 'flight to take.\\n'\n",
" 'He never got on board a plane. Instead, he ended up driving toward '\n",
" 'Harrisburg, Ill., where initial reports suggested a town was '\n",
" 'destroyed. That decision turned out to be a lucky break for the AP. '\n",
" 'Twice.\\n'\n",
" 'Ray was among the first journalists to arrive and he confirmed those '\n",
" 'reports -- in all formats. He shot powerful video, put victims on '\n",
" 'the phone with AP Radio and played back sound to an editor who '\n",
" 'transcribed the interviews and put the material on text wires. He '\n",
" 'then walked around the devastation with the Central Regional Desk on '\n",
" 'the line, talking to victims with the phone held so close that '\n",
" 'editors could transcribe his interviews in real time.\\n'\n",
" 'Ray also made a dramatic image of a young girl who found a mans '\n",
" 'prosthetic leg in the rubble, propped it up next to her destroyed '\n",
" 'home and spray-painted an impromptu sign: “Found leg. Seriously.”\\n'\n",
" 'The following day, he was back on the road and headed for Georgia '\n",
" 'and a Super Tuesday date with Newt Gingrichs campaign. The drive '\n",
" 'would take him through a stretch of the South that forecasters '\n",
" 'expected would suffer another wave of tornadoes.\\n'\n",
" 'To prevent running into THAT storm, Ray used his iPhone to monitor '\n",
" 'Doppler radar, zooming in on extreme cells and using Google maps to '\n",
" 'direct himself to safe routes. And then the journalist took over '\n",
" 'again.\\n'\n",
" '“When weather like that occurs, a reporter must seize the '\n",
" 'opportunity to get the news out and allow people to see, hear and '\n",
" 'read the power of nature so that they can take proper shelter,” Ray '\n",
" 'says.\\n'\n",
" 'So Ray now started to use his phone to follow the storms. He '\n",
" 'attached a small GoPro camera to his steering wheel in case a '\n",
" 'tornado dropped down in front of the car somewhere, and took video '\n",
" 'of heavy rain and hail with his iPhone. Soon, he spotted a tornado '\n",
" 'and the chase was on. He followed an unmarked emergency vehicle to '\n",
" \"Cleveland, Tenn., where he was first on the scene of the storm's \"\n",
" 'aftermath.\\n'\n",
" 'Again, the tornadoes had struck in locations that were hours from '\n",
" 'the nearest AP bureau. Damage and debris, as well as a wickedly '\n",
" 'violent storm that made travel dangerous, slowed our efforts to get '\n",
" 'to the news. That wasnt a problem in Tennessee, where our customers '\n",
" 'were well served by an all-formats report that included this text '\n",
" 'story.\\n'\n",
" '“CLEVELAND, Tenn. (AP) _ Fierce wind, hail and rain lashed Tennessee '\n",
" 'for the second time in three days, and at least 15 people were '\n",
" 'hospitalized Friday in the Chattanooga area.”\\n'\n",
" 'The byline? Robert Ray.\\n'\n",
" 'For being adept with technology, chasing after news as it literally '\n",
" 'dropped from the sky and setting a standard for all-formats '\n",
" 'reporting that put the AP ahead on the most competitive news story '\n",
" 'of the day, Ray wins this weeks $300 Best of the States prize.\\n'\n",
" '© 2013 The Associated Press. All rights reserved. Terms and '\n",
" 'conditions apply. See AP.org for details.'}\n"
]
}
],
"source": [
"# Pretty-print the first record to eyeball the raw text and its metadata.\n",
"# `pprint` is imported in the notebook's top import cell.\n",
"pprint(dataset[0])"
]
},
{
"cell_type": "code",
"execution_count": 23,
"id": "f3dbfcf5",
"metadata": {},
"outputs": [
{
"data": {
"text/plain": [
"728"
]
},
"execution_count": 23,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"# Sanity check: token count of the first document.\n",
"# The tokenizer is built here so that this cell works on a fresh kernel with\n",
"# top-to-bottom execution; previously it relied on `tokenizer` being created by\n",
"# a cell further down the notebook (NameError on Restart & Run All).\n",
"TOKENIZER_NAME = \"HuggingFaceTB/SmolLM2-1.7B\"\n",
"tokenizer = AutoTokenizer.from_pretrained(TOKENIZER_NAME)\n",
"\n",
"len(tokenizer(dataset[0]['text'])['input_ids'])"
]
},
{
"cell_type": "markdown",
"id": "18bc69d9",
"metadata": {},
"source": [
"# **Document Length Analysis**\n",
"> Understanding the distribution of document lengths is crucial for choosing effective padding/truncation strategies during training."
]
},
{
"cell_type": "markdown",
"id": "1ef0cdc6",
"metadata": {},
"source": [
"**A. Apply tokenizer and map**\n",
"- We'll use the `map` function to calculate the token count for every document\n",
"- We'll store it in a new `length` column\n",
"  - Hugging Face's `map` is highly optimized and works well with large datasets"
]
},
{
"cell_type": "code",
"execution_count": 7,
"id": "d83540a8",
"metadata": {},
"outputs": [],
"source": [
"from transformers import AutoTokenizer"
]
},
{
"cell_type": "code",
"execution_count": 13,
"id": "ea3aaf18",
"metadata": {},
"outputs": [
{
"name": "stderr",
"output_type": "stream",
"text": [
"Map (num_proc=24): 0%| | 57/8548517 [00:03<126:46:39, 18.73 examples/s]Token indices sequence length is longer than the specified maximum sequence length for this model (8502 > 8192). Running this sequence through the model will result in indexing errors\n",
"Map (num_proc=24): 0%| | 238/8548517 [00:03<24:01:39, 98.82 examples/s]Token indices sequence length is longer than the specified maximum sequence length for this model (9849 > 8192). Running this sequence through the model will result in indexing errors\n",
"Map (num_proc=24): 0%| | 1598/8548517 [00:03<2:03:15, 1155.63 examples/s]Token indices sequence length is longer than the specified maximum sequence length for this model (9183 > 8192). Running this sequence through the model will result in indexing errors\n",
"Map (num_proc=24): 0%| | 7328/8548517 [00:05<43:17, 3287.81 examples/s] Token indices sequence length is longer than the specified maximum sequence length for this model (11389 > 8192). Running this sequence through the model will result in indexing errors\n",
"Map (num_proc=24): 0%| | 7680/8548517 [00:05<47:08, 3020.02 examples/s]Token indices sequence length is longer than the specified maximum sequence length for this model (30422 > 8192). Running this sequence through the model will result in indexing errors\n",
"Map (num_proc=24): 0%| | 9837/8548517 [00:06<34:08, 4169.10 examples/s]Token indices sequence length is longer than the specified maximum sequence length for this model (10627 > 8192). Running this sequence through the model will result in indexing errors\n",
"Map (num_proc=24): 0%| | 17071/8548517 [00:07<23:41, 6002.42 examples/s]Token indices sequence length is longer than the specified maximum sequence length for this model (11265 > 8192). Running this sequence through the model will result in indexing errors\n",
"Map (num_proc=24): 0%| | 17697/8548517 [00:07<23:45, 5986.02 examples/s]Token indices sequence length is longer than the specified maximum sequence length for this model (9853 > 8192). Running this sequence through the model will result in indexing errors\n",
"Map (num_proc=24): 0%| | 21964/8548517 [00:08<19:57, 7118.45 examples/s]Token indices sequence length is longer than the specified maximum sequence length for this model (8480 > 8192). Running this sequence through the model will result in indexing errors\n",
"Token indices sequence length is longer than the specified maximum sequence length for this model (10483 > 8192). Running this sequence through the model will result in indexing errors\n",
"Map (num_proc=24): 0%| | 23469/8548517 [00:08<19:39, 7228.23 examples/s]Token indices sequence length is longer than the specified maximum sequence length for this model (12505 > 8192). Running this sequence through the model will result in indexing errors\n",
"Token indices sequence length is longer than the specified maximum sequence length for this model (8226 > 8192). Running this sequence through the model will result in indexing errors\n",
"Map (num_proc=24): 0%| | 24209/8548517 [00:08<20:37, 6888.89 examples/s]Token indices sequence length is longer than the specified maximum sequence length for this model (10808 > 8192). Running this sequence through the model will result in indexing errors\n",
"Token indices sequence length is longer than the specified maximum sequence length for this model (18963 > 8192). Running this sequence through the model will result in indexing errors\n",
"Token indices sequence length is longer than the specified maximum sequence length for this model (9103 > 8192). Running this sequence through the model will result in indexing errors\n",
"Map (num_proc=24): 0%| | 24928/8548517 [00:08<20:34, 6906.48 examples/s]Token indices sequence length is longer than the specified maximum sequence length for this model (8232 > 8192). Running this sequence through the model will result in indexing errors\n",
"Token indices sequence length is longer than the specified maximum sequence length for this model (18374 > 8192). Running this sequence through the model will result in indexing errors\n",
"Map (num_proc=24): 0%| | 26669/8548517 [00:08<18:12, 7801.39 examples/s]Token indices sequence length is longer than the specified maximum sequence length for this model (8793 > 8192). Running this sequence through the model will result in indexing errors\n",
"Token indices sequence length is longer than the specified maximum sequence length for this model (8551 > 8192). Running this sequence through the model will result in indexing errors\n",
"Map (num_proc=24): 0%| | 28799/8548517 [00:08<15:50, 8964.52 examples/s]Token indices sequence length is longer than the specified maximum sequence length for this model (11499 > 8192). Running this sequence through the model will result in indexing errors\n",
"Map (num_proc=24): 0%| | 29717/8548517 [00:09<16:03, 8843.98 examples/s]Token indices sequence length is longer than the specified maximum sequence length for this model (9902 > 8192). Running this sequence through the model will result in indexing errors\n",
"Map (num_proc=24): 0%| | 30618/8548517 [00:09<16:39, 8519.71 examples/s]Token indices sequence length is longer than the specified maximum sequence length for this model (11594 > 8192). Running this sequence through the model will result in indexing errors\n",
"Map (num_proc=24): 0%| | 33358/8548517 [00:09<17:05, 8304.62 examples/s]Token indices sequence length is longer than the specified maximum sequence length for this model (18832 > 8192). Running this sequence through the model will result in indexing errors\n",
"Token indices sequence length is longer than the specified maximum sequence length for this model (43572 > 8192). Running this sequence through the model will result in indexing errors\n",
"Map (num_proc=24): 100%|██████████| 8548517/8548517 [17:47<00:00, 8004.38 examples/s]\n"
]
}
],
"source": [
"TOKENIZER_NAME = \"HuggingFaceTB/SmolLM2-1.7B\"\n",
"# Never request more workers than the machine has cores (24 was the original setting).\n",
"NUM_PROC = min(24, os.cpu_count() or 1)\n",
"\n",
"tokenizer = AutoTokenizer.from_pretrained(TOKENIZER_NAME)\n",
"\n",
"\n",
"def calculate_token_length(example) -> dict:\n",
"    \"\"\"Count the tokens in `example[\"text\"]` using the module-level `tokenizer`.\n",
"\n",
"    Works for a single example (`text` is a `str`) and for a batch as passed by\n",
"    `Dataset.map(..., batched=True)` (`text` is a list of `str`). No truncation\n",
"    or padding is applied, so the full document is counted.\n",
"\n",
"    Returns:\n",
"        `{\"length\": int}` for a single example, or `{\"length\": list[int]}`\n",
"        for a batch.\n",
"    \"\"\"\n",
"    text = example[\"text\"]\n",
"    # verbose=False silences the per-document 'longer than the specified maximum\n",
"    # sequence length' warning: only lengths are measured, nothing is fed to a model.\n",
"    encoded = tokenizer(text, truncation=False, padding=False, verbose=False)\n",
"    input_ids = encoded[\"input_ids\"]\n",
"    if isinstance(text, str):\n",
"        return {\"length\": len(input_ids)}\n",
"    return {\"length\": [len(ids) for ids in input_ids]}\n",
"\n",
"\n",
"# Batched mapping hands the tokenizer many documents per call instead of one.\n",
"dataset_with_lengths = dataset.map(calculate_token_length, batched=True, num_proc=NUM_PROC)"
]
},
{
"cell_type": "markdown",
"id": "8e9d98bd",
"metadata": {},
"source": [
"**B. Analyze distribution**"
]
},
{
"cell_type": "code",
"execution_count": 15,
"id": "38094868",
"metadata": {},
"outputs": [
{
"name": "stdout",
"output_type": "stream",
"text": [
"\n",
"---- Document Length Stats (Tokens) ---\n",
"count 8.548517e+06\n",
"mean 7.018761e+02\n",
"std 1.515926e+03\n",
"min 3.000000e+01\n",
"25% 2.010000e+02\n",
"50% 3.880000e+02\n",
"75% 7.560000e+02\n",
"max 1.534070e+05\n",
"dtype: float64\n"
]
}
],
"source": [
"# Materialise the per-document token counts as a pandas Series.\n",
"lengths = pd.Series(dataset_with_lengths[\"length\"])\n",
"\n",
"# Summary statistics: count, mean, std, quartiles and extremes.\n",
"print(\"\\n---- Document Length Stats (Tokens) ---\", lengths.describe(), sep=\"\\n\")"
]
},
{
"cell_type": "code",
"execution_count": 16,
"id": "03c77507",
"metadata": {},
"outputs": [
{
"data": {
"image/png": "",
"text/plain": [
"<Figure size 2000x1200 with 1 Axes>"
]
},
"metadata": {},
"output_type": "display_data"
}
],
"source": [
"# Histogram of token counts; the frequency axis is log-scaled because the\n",
"# distribution is heavily right-skewed.\n",
"fig, ax = plt.subplots(figsize=(20, 12))\n",
"\n",
"# Keep only documents shorter than 10K tokens so the bulk of the data is visible.\n",
"bulk_lengths = lengths[lengths < 10000]\n",
"bulk_lengths.hist(bins=100, log=True, ax=ax)\n",
"\n",
"ax.set_title('Token Length Distribution (Truncated at 10k Tokens)')\n",
"ax.set_xlabel('Token Count (SmolLM2-1.7B)')\n",
"ax.set_ylabel('Frequency (Log Scale)')\n",
"plt.show()"
]
}
],
"metadata": {
"kernelspec": {
"display_name": "fineweb",
"language": "python",
"name": "python3"
},
"language_info": {
"codemirror_mode": {
"name": "ipython",
"version": 3
},
"file_extension": ".py",
"mimetype": "text/x-python",
"name": "python",
"nbconvert_exporter": "python",
"pygments_lexer": "ipython3",
"version": "3.12.12"
}
},
"nbformat": 4,
"nbformat_minor": 5
}