import torch.nn as nn
import torch.optim as optim
from torch.utils.data import DataLoader, Dataset

from datasets import load_dataset
from transformers import AutoModel, AutoModelForCausalLM, AutoTokenizer
|
|
class ShellcodeDataset(Dataset):
    """Map (intent, snippet) records to fixed-length token-id tensors.

    Each record in ``data`` must expose an ``'intent'`` key (natural-language
    description) and a ``'snippet'`` key (the target code) — the field names
    used by the SoLID/shellcode_i_a32 dataset loaded below.
    """

    def __init__(self, data, tokenizer):
        # data: indexable sequence of dicts with 'intent' and 'snippet' keys
        # tokenizer: HF-style callable returning a mapping with 'input_ids'
        self.data = data
        self.tokenizer = tokenizer

    def __len__(self) -> int:
        return len(self.data)

    def __getitem__(self, idx):
        """Tokenize one example.

        Returns a dict with 1-D ``input_ids`` and ``labels`` tensors of
        length 1024 (padded/truncated).
        """
        intent = self.data[idx]['intent']
        snippet = self.data[idx]['snippet']
        encoding = self.tokenizer(intent, return_tensors="pt",
                                  padding="max_length", truncation=True,
                                  max_length=1024)
        label = self.tokenizer(snippet, return_tensors="pt",
                               padding="max_length", truncation=True,
                               max_length=1024)
        # BUG FIX: return_tensors="pt" produces a (1, max_length) tensor per
        # sample; without squeezing the batch dim, the DataLoader's default
        # collate yields (batch, 1, max_length) batches instead of
        # (batch, max_length).
        return {
            'input_ids': encoding['input_ids'].squeeze(0),
            'labels': label['input_ids'].squeeze(0),
        }
|
|
| |
model_name = "openai-community/gpt2"
tokenizer = AutoTokenizer.from_pretrained(model_name)
# BUG FIX: AutoModel loads the bare GPT2Model (no language-modeling head),
# whose forward output has no .logits attribute — the training loop below
# would crash. AutoModelForCausalLM attaches the LM head and exposes
# per-token vocabulary logits.
model = AutoModelForCausalLM.from_pretrained(model_name)

# GPT-2 ships without a pad token; reuse EOS so padding="max_length" works.
tokenizer.pad_token = tokenizer.eos_token
|
|
| |
# SoLID/shellcode_i_a32: pairs of natural-language intent and IA-32 snippets
# (field names 'intent' and 'snippet', as consumed by ShellcodeDataset).
dataset = load_dataset('SoLID/shellcode_i_a32')

train_dataset = ShellcodeDataset(dataset['train'], tokenizer)
# FIX: a training DataLoader should reshuffle every epoch — without
# shuffle=True the model sees the identical ordering in all 3 epochs,
# which degrades SGD convergence.
train_dataloader = DataLoader(train_dataset, batch_size=16, shuffle=True)
|
|
| |
# FIX: Adam's default lr (1e-3) is far too aggressive for fine-tuning a
# pretrained transformer and typically wipes out the pretrained weights;
# 5e-5 is the conventional fine-tuning learning rate.
optimizer = optim.Adam(model.parameters(), lr=5e-5)
criterion = nn.CrossEntropyLoss()
|
|
| |
# Fine-tune for 3 epochs: predict snippet token ids from intent token ids.
model.train()
for epoch in range(3):
    for batch in train_dataloader:
        optimizer.zero_grad()
        # .squeeze(1) drops the spurious length-1 dim that per-sample
        # return_tensors="pt" tokenization introduces (no-op when the
        # batch is already 2-D).
        input_ids = batch['input_ids'].squeeze(1)
        labels = batch['labels'].squeeze(1)
        outputs = model(input_ids)
        # BUG FIX: CrossEntropyLoss requires the class dimension second;
        # passing (batch, seq, vocab) logits against (batch, seq) targets
        # raises a shape error. Transpose logits to (batch, vocab, seq).
        # NOTE(review): intent and snippet are tokenized as two independent
        # sequences, so position t of the intent logits is trained against
        # snippet token t. Standard decoder-only fine-tuning instead
        # concatenates prompt + target into one sequence and masks the
        # prompt/padding label positions with -100 — confirm which training
        # scheme is intended here.
        loss = criterion(outputs.logits.transpose(1, 2), labels)
        loss.backward()
        optimizer.step()
|
|