358 lines
54 KiB
Plaintext
358 lines
54 KiB
Plaintext
{
|
||
"cells": [
|
||
{
|
||
"cell_type": "code",
|
||
"execution_count": 3,
|
||
"id": "ccfdbb24",
|
||
"metadata": {},
|
||
"outputs": [],
|
||
"source": [
|
||
"import os\n",
|
||
"import pandas as pd\n",
|
||
"import matplotlib.pyplot as plt\n",
|
||
"\n",
|
||
"from datasets import load_dataset"
|
||
]
|
||
},
|
||
{
|
||
"cell_type": "code",
|
||
"execution_count": 4,
|
||
"id": "e21fae62",
|
||
"metadata": {},
|
||
"outputs": [],
|
||
"source": [
|
||
"OUTPUT_DIR = \"./dataset\"\n",
|
||
"FILE_PATH = os.path.join(OUTPUT_DIR, \"fineweb-6b.parquet\")"
|
||
]
|
||
},
|
||
{
|
||
"cell_type": "code",
|
||
"execution_count": 5,
|
||
"id": "bb9c37b2",
|
||
"metadata": {},
|
||
"outputs": [
|
||
{
|
||
"name": "stderr",
|
||
"output_type": "stream",
|
||
"text": [
|
||
"Generating train split: 8548517 examples [00:20, 417303.96 examples/s]\n"
|
||
]
|
||
},
|
||
{
|
||
"data": {
|
||
"text/plain": [
|
||
"Dataset({\n",
|
||
" features: ['text', 'meta', 'id'],\n",
|
||
" num_rows: 8548517\n",
|
||
"})"
|
||
]
|
||
},
|
||
"execution_count": 5,
|
||
"metadata": {},
|
||
"output_type": "execute_result"
|
||
}
|
||
],
|
||
"source": [
|
||
"dataset = load_dataset(\"parquet\", data_files={\n",
|
||
" \"train\": FILE_PATH\n",
|
||
"}, split=\"train\")\n",
|
||
"dataset"
|
||
]
|
||
},
|
||
{
|
||
"cell_type": "code",
|
||
"execution_count": 6,
|
||
"id": "917999bb",
|
||
"metadata": {},
|
||
"outputs": [
|
||
{
|
||
"data": {
|
||
"text/plain": [
|
||
"{'text': Value('string'),\n",
|
||
" 'meta': {'url': Value('string'),\n",
|
||
" 'dump': Value('string'),\n",
|
||
" 's_cluster': Value('int64'),\n",
|
||
" 'token_count': Value('int64')},\n",
|
||
" 'id': Value('string')}"
|
||
]
|
||
},
|
||
"execution_count": 6,
|
||
"metadata": {},
|
||
"output_type": "execute_result"
|
||
}
|
||
],
|
||
"source": [
|
||
"dataset.features"
|
||
]
|
||
},
|
||
{
|
||
"cell_type": "code",
|
||
"execution_count": 11,
|
||
"id": "1f6addf9",
|
||
"metadata": {},
|
||
"outputs": [
|
||
{
|
||
"name": "stdout",
|
||
"output_type": "stream",
|
||
"text": [
|
||
"{'id': '<urn:uuid:d66bc6fe-8477-4adf-b430-f6a558ccc8ff>',\n",
|
||
" 'meta': {'dump': None, 's_cluster': None, 'token_count': None, 'url': None},\n",
|
||
" 'text': 'How AP reported in all formats from tornado-stricken regionsMarch 8, '\n",
|
||
" '2012\\n'\n",
|
||
" 'When the first serious bout of tornadoes of 2012 blew through middle '\n",
|
||
" 'America in the middle of the night, they touched down in places '\n",
|
||
" 'hours from any AP bureau. Our closest video journalist was '\n",
|
||
" 'Chicago-based Robert Ray, who dropped his plans to travel to Georgia '\n",
|
||
" 'for Super Tuesday, booked several flights to the cities closest to '\n",
|
||
" 'the strikes and headed for the airport. He’d decide once there which '\n",
|
||
" 'flight to take.\\n'\n",
|
||
" 'He never got on board a plane. Instead, he ended up driving toward '\n",
|
||
" 'Harrisburg, Ill., where initial reports suggested a town was '\n",
|
||
" 'destroyed. That decision turned out to be a lucky break for the AP. '\n",
|
||
" 'Twice.\\n'\n",
|
||
" 'Ray was among the first journalists to arrive and he confirmed those '\n",
|
||
" 'reports -- in all formats. He shot powerful video, put victims on '\n",
|
||
" 'the phone with AP Radio and played back sound to an editor who '\n",
|
||
" 'transcribed the interviews and put the material on text wires. He '\n",
|
||
" 'then walked around the devastation with the Central Regional Desk on '\n",
|
||
" 'the line, talking to victims with the phone held so close that '\n",
|
||
" 'editors could transcribe his interviews in real time.\\n'\n",
|
||
" 'Ray also made a dramatic image of a young girl who found a man’s '\n",
|
||
" 'prosthetic leg in the rubble, propped it up next to her destroyed '\n",
|
||
" 'home and spray-painted an impromptu sign: “Found leg. Seriously.”\\n'\n",
|
||
" 'The following day, he was back on the road and headed for Georgia '\n",
|
||
" 'and a Super Tuesday date with Newt Gingrich’s campaign. The drive '\n",
|
||
" 'would take him through a stretch of the South that forecasters '\n",
|
||
" 'expected would suffer another wave of tornadoes.\\n'\n",
|
||
" 'To prevent running into THAT storm, Ray used his iPhone to monitor '\n",
|
||
" 'Doppler radar, zooming in on extreme cells and using Google maps to '\n",
|
||
" 'direct himself to safe routes. And then the journalist took over '\n",
|
||
" 'again.\\n'\n",
|
||
" '“When weather like that occurs, a reporter must seize the '\n",
|
||
" 'opportunity to get the news out and allow people to see, hear and '\n",
|
||
" 'read the power of nature so that they can take proper shelter,” Ray '\n",
|
||
" 'says.\\n'\n",
|
||
" 'So Ray now started to use his phone to follow the storms. He '\n",
|
||
" 'attached a small GoPro camera to his steering wheel in case a '\n",
|
||
" 'tornado dropped down in front of the car somewhere, and took video '\n",
|
||
" 'of heavy rain and hail with his iPhone. Soon, he spotted a tornado '\n",
|
||
" 'and the chase was on. He followed an unmarked emergency vehicle to '\n",
|
||
" \"Cleveland, Tenn., where he was first on the scene of the storm's \"\n",
|
||
" 'aftermath.\\n'\n",
|
||
" 'Again, the tornadoes had struck in locations that were hours from '\n",
|
||
" 'the nearest AP bureau. Damage and debris, as well as a wickedly '\n",
|
||
" 'violent storm that made travel dangerous, slowed our efforts to get '\n",
|
||
" 'to the news. That wasn’t a problem in Tennessee, where our customers '\n",
|
||
" 'were well served by an all-formats report that included this text '\n",
|
||
" 'story.\\n'\n",
|
||
" '“CLEVELAND, Tenn. (AP) _ Fierce wind, hail and rain lashed Tennessee '\n",
|
||
" 'for the second time in three days, and at least 15 people were '\n",
|
||
" 'hospitalized Friday in the Chattanooga area.”\\n'\n",
|
||
" 'The byline? Robert Ray.\\n'\n",
|
||
" 'For being adept with technology, chasing after news as it literally '\n",
|
||
" 'dropped from the sky and setting a standard for all-formats '\n",
|
||
" 'reporting that put the AP ahead on the most competitive news story '\n",
|
||
" 'of the day, Ray wins this week’s $300 Best of the States prize.\\n'\n",
|
||
" '© 2013 The Associated Press. All rights reserved. Terms and '\n",
|
||
" 'conditions apply. See AP.org for details.'}\n"
|
||
]
|
||
}
|
||
],
|
||
"source": [
|
||
"from pprint import pprint\n",
|
||
"pprint(dataset[0])"
|
||
]
|
||
},
|
||
{
|
||
"cell_type": "code",
|
||
"execution_count": 23,
|
||
"id": "f3dbfcf5",
|
||
"metadata": {},
|
||
"outputs": [
|
||
{
|
||
"data": {
|
||
"text/plain": [
|
||
"728"
|
||
]
|
||
},
|
||
"execution_count": 23,
|
||
"metadata": {},
|
||
"output_type": "execute_result"
|
||
}
|
||
],
|
||
"source": [
|
||
"len(tokenizer(dataset[0]['text'])['input_ids'])"
|
||
]
|
||
},
|
||
{
|
||
"cell_type": "markdown",
|
||
"id": "18bc69d9",
|
||
"metadata": {},
|
||
"source": [
|
||
"# **Document Length Analysis**\n",
|
||
 Understanding">
"> Understanding the distribution of document lengths is crucial for setting effective padding/truncation strategies during training."
|
||
]
|
||
},
|
||
{
|
||
"cell_type": "markdown",
|
||
"id": "1ef0cdc6",
|
||
"metadata": {},
|
||
"source": [
|
||
"**A. Apply tokenizer and map**\n",
|
||
"- We'll use `map` function to calculate the token count for every document\n",
|
||
"- We'll store it in a new column\n",
|
||
" - hf's `map` is highly optimized and works well with large datasets"
|
||
]
|
||
},
|
||
{
|
||
"cell_type": "code",
|
||
"execution_count": 7,
|
||
"id": "d83540a8",
|
||
"metadata": {},
|
||
"outputs": [],
|
||
"source": [
|
||
"from transformers import AutoTokenizer"
|
||
]
|
||
},
|
||
{
|
||
"cell_type": "code",
|
||
"execution_count": 13,
|
||
"id": "ea3aaf18",
|
||
"metadata": {},
|
||
"outputs": [
|
||
{
|
||
"name": "stderr",
|
||
"output_type": "stream",
|
||
"text": [
|
||
"Map (num_proc=24): 0%| | 57/8548517 [00:03<126:46:39, 18.73 examples/s]Token indices sequence length is longer than the specified maximum sequence length for this model (8502 > 8192). Running this sequence through the model will result in indexing errors\n",
|
||
"Map (num_proc=24): 0%| | 238/8548517 [00:03<24:01:39, 98.82 examples/s]Token indices sequence length is longer than the specified maximum sequence length for this model (9849 > 8192). Running this sequence through the model will result in indexing errors\n",
|
||
"Map (num_proc=24): 0%| | 1598/8548517 [00:03<2:03:15, 1155.63 examples/s]Token indices sequence length is longer than the specified maximum sequence length for this model (9183 > 8192). Running this sequence through the model will result in indexing errors\n",
|
||
"Map (num_proc=24): 0%| | 7328/8548517 [00:05<43:17, 3287.81 examples/s] Token indices sequence length is longer than the specified maximum sequence length for this model (11389 > 8192). Running this sequence through the model will result in indexing errors\n",
|
||
"Map (num_proc=24): 0%| | 7680/8548517 [00:05<47:08, 3020.02 examples/s]Token indices sequence length is longer than the specified maximum sequence length for this model (30422 > 8192). Running this sequence through the model will result in indexing errors\n",
|
||
"Map (num_proc=24): 0%| | 9837/8548517 [00:06<34:08, 4169.10 examples/s]Token indices sequence length is longer than the specified maximum sequence length for this model (10627 > 8192). Running this sequence through the model will result in indexing errors\n",
|
||
"Map (num_proc=24): 0%| | 17071/8548517 [00:07<23:41, 6002.42 examples/s]Token indices sequence length is longer than the specified maximum sequence length for this model (11265 > 8192). Running this sequence through the model will result in indexing errors\n",
|
||
"Map (num_proc=24): 0%| | 17697/8548517 [00:07<23:45, 5986.02 examples/s]Token indices sequence length is longer than the specified maximum sequence length for this model (9853 > 8192). Running this sequence through the model will result in indexing errors\n",
|
||
"Map (num_proc=24): 0%| | 21964/8548517 [00:08<19:57, 7118.45 examples/s]Token indices sequence length is longer than the specified maximum sequence length for this model (8480 > 8192). Running this sequence through the model will result in indexing errors\n",
|
||
"Token indices sequence length is longer than the specified maximum sequence length for this model (10483 > 8192). Running this sequence through the model will result in indexing errors\n",
|
||
"Map (num_proc=24): 0%| | 23469/8548517 [00:08<19:39, 7228.23 examples/s]Token indices sequence length is longer than the specified maximum sequence length for this model (12505 > 8192). Running this sequence through the model will result in indexing errors\n",
|
||
"Token indices sequence length is longer than the specified maximum sequence length for this model (8226 > 8192). Running this sequence through the model will result in indexing errors\n",
|
||
"Map (num_proc=24): 0%| | 24209/8548517 [00:08<20:37, 6888.89 examples/s]Token indices sequence length is longer than the specified maximum sequence length for this model (10808 > 8192). Running this sequence through the model will result in indexing errors\n",
|
||
"Token indices sequence length is longer than the specified maximum sequence length for this model (18963 > 8192). Running this sequence through the model will result in indexing errors\n",
|
||
"Token indices sequence length is longer than the specified maximum sequence length for this model (9103 > 8192). Running this sequence through the model will result in indexing errors\n",
|
||
"Map (num_proc=24): 0%| | 24928/8548517 [00:08<20:34, 6906.48 examples/s]Token indices sequence length is longer than the specified maximum sequence length for this model (8232 > 8192). Running this sequence through the model will result in indexing errors\n",
|
||
"Token indices sequence length is longer than the specified maximum sequence length for this model (18374 > 8192). Running this sequence through the model will result in indexing errors\n",
|
||
"Map (num_proc=24): 0%| | 26669/8548517 [00:08<18:12, 7801.39 examples/s]Token indices sequence length is longer than the specified maximum sequence length for this model (8793 > 8192). Running this sequence through the model will result in indexing errors\n",
|
||
"Token indices sequence length is longer than the specified maximum sequence length for this model (8551 > 8192). Running this sequence through the model will result in indexing errors\n",
|
||
"Map (num_proc=24): 0%| | 28799/8548517 [00:08<15:50, 8964.52 examples/s]Token indices sequence length is longer than the specified maximum sequence length for this model (11499 > 8192). Running this sequence through the model will result in indexing errors\n",
|
||
"Map (num_proc=24): 0%| | 29717/8548517 [00:09<16:03, 8843.98 examples/s]Token indices sequence length is longer than the specified maximum sequence length for this model (9902 > 8192). Running this sequence through the model will result in indexing errors\n",
|
||
"Map (num_proc=24): 0%| | 30618/8548517 [00:09<16:39, 8519.71 examples/s]Token indices sequence length is longer than the specified maximum sequence length for this model (11594 > 8192). Running this sequence through the model will result in indexing errors\n",
|
||
"Map (num_proc=24): 0%| | 33358/8548517 [00:09<17:05, 8304.62 examples/s]Token indices sequence length is longer than the specified maximum sequence length for this model (18832 > 8192). Running this sequence through the model will result in indexing errors\n",
|
||
"Token indices sequence length is longer than the specified maximum sequence length for this model (43572 > 8192). Running this sequence through the model will result in indexing errors\n",
|
||
"Map (num_proc=24): 100%|██████████| 8548517/8548517 [17:47<00:00, 8004.38 examples/s]\n"
|
||
]
|
||
}
|
||
],
|
||
"source": [
|
||
"TOKENIZER_NAME = \"HuggingFaceTB/SmolLM2-1.7B\"\n",
|
||
"tokenizer = AutoTokenizer.from_pretrained(TOKENIZER_NAME)\n",
|
||
"\n",
|
||
"def calculate_token_length(example) -> dict:\n",
|
||
" tokens = tokenizer(example[\"text\"], truncation=False, padding=False)[\"input_ids\"]\n",
|
||
" return { \"length\": len(tokens) }\n",
|
||
"\n",
|
||
"dataset_with_lengths = dataset.map(calculate_token_length, num_proc=24)"
|
||
]
|
||
},
|
||
{
|
||
"cell_type": "markdown",
|
||
"id": "8e9d98bd",
|
||
"metadata": {},
|
||
"source": [
|
||
"**B. Analyze distribution**"
|
||
]
|
||
},
|
||
{
|
||
"cell_type": "code",
|
||
"execution_count": 15,
|
||
"id": "38094868",
|
||
"metadata": {},
|
||
"outputs": [
|
||
{
|
||
"name": "stdout",
|
||
"output_type": "stream",
|
||
"text": [
|
||
"\n",
|
||
"---- Document Length Stats (Tokens) ---\n",
|
||
"count 8.548517e+06\n",
|
||
"mean 7.018761e+02\n",
|
||
"std 1.515926e+03\n",
|
||
"min 3.000000e+01\n",
|
||
"25% 2.010000e+02\n",
|
||
"50% 3.880000e+02\n",
|
||
"75% 7.560000e+02\n",
|
||
"max 1.534070e+05\n",
|
||
"dtype: float64\n"
|
||
]
|
||
}
|
||
],
|
||
"source": [
|
||
"lengths = pd.Series(dataset_with_lengths[\"length\"])\n",
|
||
"\n",
|
||
"# descriptive stats\n",
|
||
"print(\"\\n---- Document Length Stats (Tokens) ---\")\n",
|
||
"print(lengths.describe())"
|
||
]
|
||
},
|
||
{
|
||
"cell_type": "code",
|
||
"execution_count": 16,
|
||
"id": "03c77507",
|
||
"metadata": {},
|
||
"outputs": [
|
||
{
|
||
"data": {
|
||
"image/png": "",
|
||
"text/plain": [
|
||
"<Figure size 2000x1200 with 1 Axes>"
|
||
]
|
||
},
|
||
"metadata": {},
|
||
"output_type": "display_data"
|
||
}
|
||
],
|
||
"source": [
|
||
"# plot distribution (use log scale for better viz of skew)\n",
|
||
"plt.figure(figsize=(20, 12))\n",
|
||
"\n",
|
||
"# zoom into bulk of the data eg. docs < 10K tokens\n",
|
||
"lengths[lengths < 10000].hist(bins=100, log=True)\n",
|
||
"plt.title('Token Length Distribution (Truncated at 10k Tokens)')\n",
|
||
"plt.xlabel('Token Count (SmolLM2-1.7B)')\n",
|
||
"plt.ylabel('Frequency (Log Scale)')\n",
|
||
"plt.show()"
|
||
]
|
||
}
|
||
],
|
||
"metadata": {
|
||
"kernelspec": {
|
||
"display_name": "fineweb",
|
||
"language": "python",
|
||
"name": "python3"
|
||
},
|
||
"language_info": {
|
||
"codemirror_mode": {
|
||
"name": "ipython",
|
||
"version": 3
|
||
},
|
||
"file_extension": ".py",
|
||
"mimetype": "text/x-python",
|
||
"name": "python",
|
||
"nbconvert_exporter": "python",
|
||
"pygments_lexer": "ipython3",
|
||
"version": "3.12.12"
|
||
}
|
||
},
|
||
"nbformat": 4,
|
||
"nbformat_minor": 5
|
||
}
|