| """ |
| Tokenize Dataset Script: Prepare Tool Calling Dataset for Training |
| |
| This script tokenizes the nvidia/Nemotron-Agentic-v1 tool_calling dataset |
| and uploads it to HuggingFace Hub for reuse. |
| |
| Usage: |
| uv run tokenize_dataset.py |
| |
| Can run on CPU - no GPU required! |
| """ |

import json

from datasets import load_dataset, Dataset
from transformers import AutoTokenizer
from huggingface_hub import HfApi, create_repo, hf_hub_download, whoami
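
# Base model whose tokenizer (and chat template) defines the training format.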
BASE_MODEL = "Tesslate/Synthia-S1-27b"
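
# Source dataset and split to tokenize.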
DATASET_NAME = "nvidia/Nemotron-Agentic-v1"
DATASET_SPLIT = "tool_calling"
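
# Destination Hub repo for the tokenized dataset.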
TOKENIZED_DATASET_REPO = "Codyfederer/synthia-tool-calling-tokenized"
TOKENIZED_DATASET_PRIVATE = True
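
# Conversations longer than this many tokens are truncated.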
MAX_SEQ_LENGTH = 4096


def tokenize_conversation(example, tokenizer, max_length):
    """
    Tokenize a conversation using the model's chat template.
    Returns input_ids, attention_mask, and labels for causal LM training.
    """
    messages = example["messages"]
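
    # Render the conversation into a single training string with the model's
    # chat template; no generation prompt since targets are included.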
    text = tokenizer.apply_chat_template(
        messages,
        tokenize=False,
        add_generation_prompt=False
    )
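
    # Tokenize without padding; truncation caps overly long conversations.
    # Caveat: if the chat template already emits special tokens (e.g. BOS),
    # passing add_special_tokens=False here would avoid duplicating them;
    # worth verifying for the chosen base model.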
    tokenized = tokenizer(
        text,
        truncation=True,
        max_length=max_length,
        padding=False,
        return_tensors=None,
    )
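
    # Labels mirror input_ids, so loss is computed over the entire
    # conversation (user turns included), not just assistant responses.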
| tokenized["labels"] = tokenized["input_ids"].copy() |
|
|
| return tokenized |
|
|
|
|
def main():
    print("=" * 60)
    print("Tokenize Dataset for Tool Calling Training")
    print("=" * 60)
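
    # Confirm Hub credentials before doing any heavy work.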
    try:
        username = whoami()["name"]
        print(f"Logged in as: {username}")
    except Exception as e:
        print(f"ERROR: Not logged in to HF Hub ({e})")
        print("Run 'huggingface-cli login' first")
        return
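
    # Load the tokenizer that supplies the chat template.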
| print(f"\nLoading tokenizer from {BASE_MODEL}...") |
|
|
| tokenizer = AutoTokenizer.from_pretrained( |
| BASE_MODEL, |
| trust_remote_code=True, |
| padding_side="right", |
| ) |
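
    # Fall back to EOS as the pad token if the tokenizer defines none.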
    if tokenizer.pad_token is None:
        tokenizer.pad_token = tokenizer.eos_token
        tokenizer.pad_token_id = tokenizer.eos_token_id

    print(f"Vocab size: {len(tokenizer):,}")
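
    # Download the raw JSONL file for the split; the script expects it at
    # data/<split>.jsonl inside the dataset repo.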
| print(f"\nLoading dataset: {DATASET_NAME} ({DATASET_SPLIT} split)...") |
|
|
| |
| jsonl_file = f"data/{DATASET_SPLIT}.jsonl" |
| print(f"Downloading {jsonl_file}...") |
|
|
| local_path = hf_hub_download( |
| repo_id=DATASET_NAME, |
| filename=jsonl_file, |
| repo_type="dataset" |
| ) |
| print(f"Downloaded to: {local_path}") |
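
    # Parse the file line by line, normalizing every message into a plain
    # {"role": ..., "content": <str>} dict.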
| print("Loading and processing JSONL file...") |
| processed_examples = [] |
| skipped = 0 |
|
|
| with open(local_path, 'r', encoding='utf-8') as f: |
| for line_num, line in enumerate(f): |
| if line_num % 50000 == 0: |
| print(f" Processed {line_num:,} lines...") |
| try: |
| example = json.loads(line.strip()) |
| messages = example.get("messages", []) |
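
                # Content may be a string, a dict, or a list of parts;
                # flatten everything to a single string.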
                formatted_messages = []
                for msg in messages:
                    role = msg.get("role", "user")
                    content = msg.get("content", "")

                    if isinstance(content, list):
                        parts = []
                        for item in content:
                            if isinstance(item, dict):
                                if "text" in item:
                                    parts.append(item["text"])
                                else:
                                    parts.append(json.dumps(item))
                            else:
                                parts.append(str(item))
                        content = "\n".join(parts) if parts else ""
                    elif isinstance(content, dict):
                        content = json.dumps(content)
                    elif content is None:
                        content = ""
                    else:
                        content = str(content)

                    formatted_messages.append({
                        "role": role,
                        "content": content
                    })
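
                # Many chat templates require strictly alternating user and
                # assistant turns, so re-map tool results to user turns and
                # merge consecutive same-role messages.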
                if formatted_messages:
                    merged_messages = []
                    for msg in formatted_messages:
                        role = msg["role"]
                        content = msg["content"]
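
                        # Surface tool output as a tagged user turn.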
| if role == "tool": |
| role = "user" |
| content = f"[Tool Result]\n{content}" |
|
|
| if merged_messages and merged_messages[-1]["role"] == role: |
| merged_messages[-1]["content"] += f"\n\n{content}" |
| else: |
| merged_messages.append({"role": role, "content": content}) |
|
|
| |
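
                    # Most templates expect the conversation to open with a
                    # user turn; prepend a stub turn if it does not.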
                    if merged_messages and merged_messages[0]["role"] != "user":
                        merged_messages.insert(0, {"role": "user", "content": "[Start]"})

                    processed_examples.append({"messages": merged_messages})
            except Exception as e:
                skipped += 1
                if skipped < 5:
                    print(f"  Warning: Skipped line {line_num}: {e}")

    print(f"Loaded {len(processed_examples):,} examples (skipped {skipped})")
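
    # Materialize a Dataset and carve out a small held-out eval split.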
    dataset = Dataset.from_list(processed_examples)
    print(f"Dataset size: {len(dataset):,} examples")

    split_dataset = dataset.train_test_split(test_size=0.02, seed=42)
    train_dataset = split_dataset["train"]
    eval_dataset = split_dataset["test"]

    print(f"Train samples: {len(train_dataset):,}")
    print(f"Eval samples: {len(eval_dataset):,}")
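
    # Map the tokenizer over both splits. num_proc=1 keeps memory use
    # predictable; raising it should speed things up on multi-core machines.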
| print(f"\nTokenizing dataset with max_length={MAX_SEQ_LENGTH}...") |
| print("This may take a while for large datasets...") |
|
|
| train_dataset = train_dataset.map( |
| lambda x: tokenize_conversation(x, tokenizer, MAX_SEQ_LENGTH), |
| remove_columns=["messages"], |
| num_proc=1, |
| desc="Tokenizing train", |
| ) |

    eval_dataset = eval_dataset.map(
        lambda x: tokenize_conversation(x, tokenizer, MAX_SEQ_LENGTH),
        remove_columns=["messages"],
        num_proc=1,
        desc="Tokenizing eval",
    )

    print("Tokenization complete!")
    print(f"Train dataset columns: {train_dataset.column_names}")
    print(f"Sample input_ids length: {len(train_dataset[0]['input_ids'])}")
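
    # Create (or reuse) the destination repo, then push both splits.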
| print(f"\nUploading TOKENIZED dataset to Hub: {TOKENIZED_DATASET_REPO}") |
|
|
| |
| api = HfApi() |
| try: |
| create_repo( |
| TOKENIZED_DATASET_REPO, |
| repo_type="dataset", |
| private=TOKENIZED_DATASET_PRIVATE, |
| exist_ok=True |
| ) |
| print(f" Created/verified repo (private={TOKENIZED_DATASET_PRIVATE})") |
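
        # exist_ok=True leaves an existing repo's visibility untouched, so
        # enforce privacy explicitly; failures here are non-fatal. (Newer
        # huggingface_hub releases deprecate update_repo_visibility in
        # favor of update_repo_settings.)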
        if TOKENIZED_DATASET_PRIVATE:
            try:
                api.update_repo_visibility(
                    TOKENIZED_DATASET_REPO,
                    repo_type="dataset",
                    private=True
                )
            except Exception:
                pass
    except Exception as e:
        print(f"  Repo note: {e}")
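
    # Reset any output format overrides so the splits upload as plain columns.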
    train_dataset.reset_format()
    eval_dataset.reset_format()
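
    # Quick sanity check of the tokenized data before uploading.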
| print(f" Verifying tokenized data...") |
| print(f" Train columns: {train_dataset.column_names}") |
| print(f" Sample input_ids type: {type(train_dataset[0]['input_ids'])}") |
| print(f" Sample input_ids length: {len(train_dataset[0]['input_ids'])}") |
| print(f" First 10 tokens: {train_dataset[0]['input_ids'][:10]}") |
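
    # Upload each split to the same repo.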
| print(f" Pushing train split ({len(train_dataset):,} examples)...") |
| train_dataset.push_to_hub( |
| TOKENIZED_DATASET_REPO, |
| split="train", |
| ) |
|
|
| print(f" Pushing test split ({len(eval_dataset):,} examples)...") |
| eval_dataset.push_to_hub( |
| TOKENIZED_DATASET_REPO, |
| split="test", |
| ) |
|
|
| print(f"\n" + "=" * 60) |
| print(f"SUCCESS! Tokenized dataset saved to:") |
| print(f" https://huggingface.co/datasets/{TOKENIZED_DATASET_REPO}") |
| print(f"=" * 60) |
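
    # Stream one sample back from the Hub to confirm the upload round-trips.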
| print("\nVerifying upload...") |
| try: |
| from datasets import load_dataset as verify_load |
| verify_ds = verify_load(TOKENIZED_DATASET_REPO, split="train", streaming=True) |
| sample = next(iter(verify_ds)) |
| if "input_ids" in sample: |
| print(f" VERIFIED: Dataset contains input_ids with {len(sample['input_ids'])} tokens") |
| else: |
| print(f" WARNING: input_ids not found in columns: {list(sample.keys())}") |
| except Exception as ve: |
| print(f" Could not verify: {ve}") |


if __name__ == "__main__":
    main()