| |
| |
| |
| |
| |
| |
| |
| |
| |
| |
| |
| |
| |
| |
| |
| """ |
| Fine-tune a RoBERTa-based Seq2Seq model for code completion on JSON-lines datasets. |
| The --task option selects the "statement_level" or "next_statement" data and checkpoint sub-directory. |
| Evaluation reports perplexity, exact-match accuracy and fuzzy edit similarity of the generated text. |
| """ |
|
|
| from __future__ import absolute_import |
| import os |
| import sys |
| import pickle |
| import torch |
| import json |
|
|
| import random |
| import logging |
| import argparse |
| import numpy as np |
| from io import open |
| from itertools import cycle |
| import torch.nn as nn |
| from model import Seq2Seq |
| from tqdm import tqdm, trange |
| from torch.utils.data import DataLoader, Dataset, SequentialSampler, RandomSampler,TensorDataset |
| from torch.utils.data.distributed import DistributedSampler |
| from tqdm import tqdm |
| from fuzzywuzzy import fuzz |
| import re |
| import multiprocessing |
| from transformers import (WEIGHTS_NAME, AdamW, get_linear_schedule_with_warmup, |
| RobertaConfig, RobertaModel, RobertaTokenizer) |
|
|
# Module-level constants; neither is referenced in the visible part of this file.
divide_number, cpu_cont = 2, 16

# Configure the root logger once at import time: timestamped, INFO and above.
logging.basicConfig(
    level=logging.INFO,
    datefmt='%m/%d/%Y %H:%M:%S',
    format='%(asctime)s - %(levelname)s - %(name)s - %(message)s',
)
logger = logging.getLogger(__name__)
|
|
|
|
class Example(object):
    """One raw source/target pair, together with length statistics.

    The constructor stores every argument unchanged on the instance; no
    validation or conversion is performed.
    """

    def __init__(self, idx, source, target, max_src_len, max_tar_len):
        self.idx, self.source, self.target = idx, source, target
        self.max_src_len, self.max_tar_len = max_src_len, max_tar_len
|
|
def read_examples(filename):
    """Read examples from a JSON-lines file.

    Every non-blank line must be a JSON object containing a
    ``"Template_token"`` list. The first element of that list is dropped and
    the remainder is joined with single spaces to form the source text.
    When a ``"ground_truth"`` list is present it is space-joined to form the
    target; otherwise the target is a copy of the source. An ``"Idx"`` field,
    when present, replaces the line number as the example id.

    Args:
        filename: Path of the UTF-8 encoded ``.jsonl`` file.

    Returns:
        A list of ``Example`` objects in file order. ``max_src_len`` and
        ``max_tar_len`` on each example hold the running maxima of the
        ``"Template_token"`` and ``"ground_truth"`` list lengths seen up to
        and including that line.

    Raises:
        KeyError: If a line has no ``"Template_token"`` field.
        ValueError: If a non-blank line is not valid JSON.
    """
    examples = []
    max_src_len = 0
    max_tar_len = 0

    with open(filename, encoding="utf-8") as f:
        for idx, line in enumerate(f):
            # Tolerate blank lines (e.g. a trailing newline at end of file)
            # instead of failing inside json.loads.
            if not line.strip():
                continue

            js = json.loads(line)
            template_tokens = js["Template_token"]
            inputs = " ".join(template_tokens[1:])
            # The length counts the full list, including the dropped first token.
            max_src_len = max(max_src_len, len(template_tokens))

            if "ground_truth" in js:
                ground_truth = js["ground_truth"]
                outputs = " ".join(ground_truth)
                # Fixed: the running target maximum used to be computed from
                # max_src_len, which made it track the source length instead.
                max_tar_len = max(max_tar_len, len(ground_truth))
            else:
                outputs = inputs

            if 'Idx' in js:
                idx = js['Idx']

            examples.append(
                Example(
                    idx=idx,
                    source=inputs,
                    target=outputs,
                    max_src_len=max_src_len,
                    max_tar_len=max_tar_len,
                )
            )
    return examples
|
|
|
|
class InputFeatures(object):
    """Token-id features produced for a single example.

    Holds the example id plus the source and target id sequences exactly as
    they were passed in.
    """

    def __init__(self, example_id, source_ids, target_ids):
        fields = (
            ("example_id", example_id),
            ("source_ids", source_ids),
            ("target_ids", target_ids),
        )
        for attribute, value in fields:
            setattr(self, attribute, value)
| |
def convert_examples_to_features(examples, tokenizer, args,stage=None):
    """Turn examples into fixed-width source/target token-id features.

    Source layout: ``cls, sep, <source tokens>, "<mask>", sep``, with the
    source tokens truncated to ``args.max_source_length - 5`` and the ids
    right-padded with the pad id up to ``args.max_source_length``.

    Target layout: when ``stage == "test"`` the tokens of the literal string
    ``"None"`` followed by ``sep``; otherwise ``"<mask>"``, the target tokens
    truncated to ``args.max_target_length - 2``, then ``sep``. The ids are
    right-padded up to ``args.max_target_length``.

    Args:
        examples: Iterable of objects exposing ``source`` and ``target``.
        tokenizer: Object providing ``tokenize``, ``convert_tokens_to_ids``,
            ``cls_token``, ``sep_token`` and ``pad_token_id``.
        args: Object providing ``max_source_length`` and ``max_target_length``.
        stage: ``"test"`` selects the placeholder target; any other value
            encodes the real target.

    Returns:
        A list of ``InputFeatures`` whose ``example_id`` is the position of
        the example in ``examples``.
    """

    def pad_ids(ids, width):
        # A non-positive difference yields an empty padding list.
        return ids + [tokenizer.pad_token_id] * (width - len(ids))

    def encode_source(example):
        body = tokenizer.tokenize(example.source)[:args.max_source_length-5]
        tokens = [tokenizer.cls_token, tokenizer.sep_token] + body + ["<mask>", tokenizer.sep_token]
        return pad_ids(tokenizer.convert_tokens_to_ids(tokens), args.max_source_length)

    def encode_target(example):
        if stage != "test":
            tokens = ["<mask>"] + tokenizer.tokenize(example.target)[:args.max_target_length-2]
        else:
            # At test time the real target is not encoded at all.
            tokens = tokenizer.tokenize("None")
        tokens = tokens + [tokenizer.sep_token]
        return pad_ids(tokenizer.convert_tokens_to_ids(tokens), args.max_target_length)

    return [
        InputFeatures(position, encode_source(example), encode_target(example))
        for position, example in enumerate(examples)
    ]
|
|
|
|
|
|
def set_seed(seed=20240124):
    """Seed the random number generators used by this script.

    Seeds Python's ``random``, NumPy and PyTorch (CPU and current CUDA
    device), exports ``PYTHONHASHSEED`` and switches cuDNN to deterministic
    mode.

    Args:
        seed: Integer seed applied to every generator.
    """
    random.seed(seed)
    # Fixed typo: the variable was spelled 'PYHTONHASHSEED', which Python
    # never reads. NOTE: the interpreter reads PYTHONHASHSEED at start-up, so
    # this only influences child processes, not hashing in the running one.
    os.environ['PYTHONHASHSEED'] = str(seed)
    np.random.seed(seed)
    torch.manual_seed(seed)
    torch.cuda.manual_seed(seed)
    torch.backends.cudnn.deterministic = True
| |
| |
def _task_subdir(args):
    """Return the per-task sub-directory name used for data and checkpoints."""
    # Any task value other than "statement_level" is handled as
    # "next_statement", matching the script's original branching.
    return "statement_level" if args.task == "statement_level" else "next_statement"


def _safe_ratio(numerator, denominator):
    """Return ``numerator / denominator``, or 0.0 when the denominator is zero."""
    return numerator / denominator if denominator else 0.0


def _load_task_checkpoint(model, args):
    """Load ``<load_model_path>/<task>/pytorch_model.bin`` into ``model``."""
    checkpoint = args.load_model_path + "/" + _task_subdir(args) + "/pytorch_model.bin"
    logger.info("reload model from {}".format(checkpoint))
    # Unwrap DataParallel so the state-dict keys match the bare model.
    target = model.module if hasattr(model, 'module') else model
    # map_location="cpu" lets a checkpoint saved on a GPU load on a CPU-only
    # host; load_state_dict copies the values into the model's parameters.
    target.load_state_dict(torch.load(checkpoint, map_location="cpu"))


def _paired_dataset(examples, tokenizer, args, stage):
    """Build a TensorDataset of (source_ids, target_ids) for ``examples``."""
    features = convert_examples_to_features(examples, tokenizer, args, stage=stage)
    all_source_ids = torch.tensor([f.source_ids for f in features], dtype=torch.long)
    all_target_ids = torch.tensor([f.target_ids for f in features], dtype=torch.long)
    return TensorDataset(all_source_ids, all_target_ids)


def _source_only_dataset(examples, tokenizer, args):
    """Build a TensorDataset holding only source ids, for generation."""
    features = convert_examples_to_features(examples, tokenizer, args, stage='test')
    all_source_ids = torch.tensor([f.source_ids for f in features], dtype=torch.long)
    return TensorDataset(all_source_ids)


def _sequential_loader(dataset, batch_size):
    """Return a DataLoader that iterates ``dataset`` in order."""
    return DataLoader(dataset, sampler=SequentialSampler(dataset), batch_size=batch_size)


def _generate_texts(model, dataloader, tokenizer, device, show_progress=False):
    """Run generation over ``dataloader`` and decode the first candidate of each item.

    The model is put in eval mode for the duration and switched back to train
    mode before returning.
    """
    model.eval()
    texts = []
    batches = tqdm(dataloader, total=len(dataloader)) if show_progress else dataloader
    for batch in batches:
        source_ids = batch[0].to(device)
        with torch.no_grad():
            preds = model(source_ids)
        for pred in preds:
            token_ids = list(pred[0].cpu().numpy())
            # Cut the sequence at the first id 0, if any.
            if 0 in token_ids:
                token_ids = token_ids[:token_ids.index(0)]
            texts.append(tokenizer.decode(token_ids, clean_up_tokenization_spaces=False))
    model.train()
    return texts


def _write_prediction_pairs(output_dir, filename, pairs):
    """Write ``[pred, gt]`` pairs as JSON lines to ``output_dir/filename``."""
    # The directory may not exist yet (no checkpoint saved, or a test-only run).
    os.makedirs(output_dir, exist_ok=True)
    with open(os.path.join(output_dir, filename), 'w') as wf:
        for pred, gt in pairs:
            wf.write(json.dumps({"Pred": pred, "GT": gt}))
            wf.write("\n")


def main():
    """Command-line entry point: train, validate and/or test the Seq2Seq model.

    Parses the command line, builds a RoBERTa-backed ``Seq2Seq`` model
    (optionally restoring a checkpoint from ``--load_model_path``) and then:

    * ``--do_train``: trains on ``train.jsonl``. With ``--do_eval`` each epoch
      is followed by a perplexity pass and a generation pass on
      ``valid.jsonl``; the checkpoint with the best exact-match score is
      saved, training stops after three epochs without improvement, and the
      predictions of the last evaluated epoch are written to
      ``last_training_result.jsonl``.
    * ``--do_test``: generates on ``test.jsonl``, logs the average fuzzy ratio
      and exact-match rate, and writes ``test_result.jsonl``.
    """
    parser = argparse.ArgumentParser()

    parser.add_argument("--model_name_or_path", default=None, type=str, required=True,
                        help="Path to pre-trained model: e.g. roberta-base" )
    parser.add_argument("--output_dir", default=None, type=str, required=True,
                        help="The output directory where the model predictions and checkpoints will be written.")
    parser.add_argument("--load_model_path", default=None, type=str,
                        help="Path to trained model: Should contain the .bin files" )

    parser.add_argument("--task", default=None, type=str, required=True,
                        help="Task Type: statement_level, next_statement" )

    parser.add_argument("--train_filename", default="../../Dataset/", type=str,
                        help="The train filename. Should contain the .jsonl files for this task.")
    parser.add_argument("--dev_filename", default="../../Dataset/", type=str,
                        help="The dev filename. Should contain the .jsonl files for this task.")
    parser.add_argument("--test_filename", default="../../Dataset/", type=str,
                        help="The test filename. Should contain the .jsonl files for this task.")

    parser.add_argument("--config_name", default="", type=str,
                        help="Pretrained config name or path if not the same as model_name")
    parser.add_argument("--tokenizer_name", default="", type=str,
                        help="Pretrained tokenizer name or path if not the same as model_name")

    parser.add_argument("--do_train", action='store_true',
                        help="Whether to run training.")
    parser.add_argument("--do_eval", action='store_true',
                        help="Whether to run eval on the dev set.")
    parser.add_argument("--do_test", action='store_true',
                        help="Whether to run eval on the test set.")
    parser.add_argument("--test_org", action='store_true',
                        help="Whether to run eval on org model.")
    parser.add_argument("--do_lower_case", action='store_true',
                        help="Set this flag if you are using an uncased model.")
    parser.add_argument("--no_cuda", action='store_true',
                        help="Avoid using CUDA when available")

    parser.add_argument("--train_batch_size", default=8, type=int,
                        help="Batch size per GPU/CPU for training.")
    parser.add_argument("--eval_batch_size", default=8, type=int,
                        help="Batch size per GPU/CPU for evaluation.")
    parser.add_argument('--gradient_accumulation_steps', type=int, default=1,
                        help="Number of updates steps to accumulate before performing a backward/update pass.")
    parser.add_argument("--learning_rate", default=5e-5, type=float,
                        help="The initial learning rate for Adam.")
    parser.add_argument("--beam_size", default=10, type=int,
                        help="beam size for beam search")
    parser.add_argument("--weight_decay", default=0.0, type=float,
                        help="Weight decay if we apply some.")
    parser.add_argument("--adam_epsilon", default=1e-8, type=float,
                        help="Epsilon for Adam optimizer.")
    parser.add_argument("--max_grad_norm", default=1.0, type=float,
                        help="Max gradient norm.")
    parser.add_argument("--num_train_epochs", default=3, type=int,
                        help="Total number of training epochs to perform.")
    parser.add_argument("--max_steps", default=-1, type=int,
                        help="If > 0: set total number of training steps to perform. Override num_train_epochs.")
    parser.add_argument("--eval_steps", default=-1, type=int,
                        help="")
    parser.add_argument("--max_target_length", default=128, type=int,
                        help="")
    parser.add_argument("--max_source_length", default=384, type=int,
                        help="")
    parser.add_argument("--train_steps", default=-1, type=int,
                        help="")
    parser.add_argument("--warmup_steps", default=0, type=int,
                        help="Linear warmup over warmup_steps.")
    parser.add_argument("--local_rank", type=int, default=-1,
                        help="For distributed training: local_rank")
    parser.add_argument('--seed', type=int, default=20240124,
                        help="random seed for initialization")

    args = parser.parse_args()

    logging.basicConfig(format='%(asctime)s - %(levelname)s - %(name)s - %(message)s',
                        datefmt='%m/%d/%Y %H:%M:%S',level=logging.INFO )

    # Fixed: --no_cuda used to be parsed but ignored.
    use_cuda = torch.cuda.is_available() and not args.no_cuda
    device = torch.device("cuda" if use_cuda else "cpu")
    args.n_gpu = torch.cuda.device_count() if use_cuda else 0
    args.device = device
    logger.info("device: %s, n_gpu: %s",device, args.n_gpu)

    set_seed(args.seed)

    os.makedirs(args.output_dir, exist_ok=True)

    tokenizer = RobertaTokenizer.from_pretrained(args.model_name_or_path)
    config = RobertaConfig.from_pretrained(args.model_name_or_path)

    config.is_decoder = True
    encoder = RobertaModel.from_pretrained(args.model_name_or_path,config=config)

    # The same RoBERTa instance is passed as both encoder and decoder.
    model = Seq2Seq(encoder=encoder,decoder=encoder,config=config,
                    beam_size=args.beam_size,max_length=args.max_target_length,
                    sos_id=tokenizer.convert_tokens_to_ids(["<mask0>"])[0],eos_id=tokenizer.sep_token_id)

    logger.info("Training/evaluation parameters %s", args)

    if args.load_model_path is not None:
        _load_task_checkpoint(model, args)

    model.to(args.device)

    if args.n_gpu > 1:
        model = torch.nn.DataParallel(model)

    task_dir = _task_subdir(args)

    if args.do_train:
        train_examples = read_examples(args.train_filename + "/Code_Completion/" + task_dir + "/train.jsonl")
        train_data = _paired_dataset(train_examples, tokenizer, args, stage='train')
        per_step_batch = args.train_batch_size // args.gradient_accumulation_steps
        train_dataloader = DataLoader(train_data, sampler=RandomSampler(train_data),
                                      batch_size=per_step_batch)

        # Biases and LayerNorm weights are excluded from weight decay.
        no_decay = ['bias', 'LayerNorm.weight']
        optimizer_grouped_parameters = [
            {'params': [p for n, p in model.named_parameters() if not any(nd in n for nd in no_decay)],
             'weight_decay': args.weight_decay},
            {'params': [p for n, p in model.named_parameters() if any(nd in n for nd in no_decay)], 'weight_decay': 0.0}
        ]
        optimizer = AdamW(optimizer_grouped_parameters, lr=args.learning_rate, eps=args.adam_epsilon)
        # Fixed: the scheduler advances once per optimizer update, i.e. once
        # every gradient_accumulation_steps batches, so the schedule length
        # must be counted in updates rather than batches.
        total_updates = len(train_dataloader) * args.num_train_epochs // args.gradient_accumulation_steps
        scheduler = get_linear_schedule_with_warmup(optimizer,
                                                    num_warmup_steps=int(total_updates*0.1),
                                                    num_training_steps=total_updates)

        logger.info("***** Running training *****")
        logger.info(" Num examples = %d", len(train_examples))
        # Fixed: report the effective batch per optimizer update; the old
        # message multiplied the full batch size by the accumulation steps.
        logger.info(" Batch size = %d", per_step_batch * args.gradient_accumulation_steps)
        logger.info(" Num epoch = %d", args.num_train_epochs)

        if args.do_eval:
            # Load and featurize the dev set once, up front, so that a missing
            # file is reported before any training time is spent.
            dev_examples = read_examples(args.dev_filename + "/Code_Completion/" + task_dir + "/valid.jsonl")
            dev_loss_data = _paired_dataset(dev_examples, tokenizer, args, stage='dev')
            dev_gen_data = _source_only_dataset(dev_examples, tokenizer, args)

        model.train()
        patience, best_score, losses = 0, 0, []
        # Fixed: defined up front so it exists even if no evaluation ever runs.
        res_list = []
        for epoch in range(args.num_train_epochs):
            for batch in train_dataloader:
                source_ids, target_ids = tuple(t.to(device) for t in batch)
                loss,_,_ = model(source_ids=source_ids,target_ids=target_ids)

                if args.n_gpu > 1:
                    # DataParallel returns one loss per replica.
                    loss = loss.mean()
                if args.gradient_accumulation_steps > 1:
                    loss = loss / args.gradient_accumulation_steps

                losses.append(loss.item())
                loss.backward()
                if len(losses) % args.gradient_accumulation_steps == 0:
                    optimizer.step()
                    optimizer.zero_grad()
                    scheduler.step()
                    if len(losses) // args.gradient_accumulation_steps % 100 == 0:
                        logger.info("epoch {} step {} loss {}".format(epoch,
                                    len(losses)//args.gradient_accumulation_steps,
                                    round(np.mean(losses[-100*args.gradient_accumulation_steps:]),4)))

            if args.do_eval:
                # Pass 1: teacher-forced loss on the dev set -> perplexity.
                logger.info("\n***** Running evaluation *****")
                logger.info(" Num examples = %d", len(dev_examples))
                logger.info(" Batch size = %d", args.eval_batch_size)

                model.eval()
                eval_loss,tokens_num = 0,0
                for batch in _sequential_loader(dev_loss_data, args.eval_batch_size):
                    source_ids, target_ids = tuple(t.to(device) for t in batch)
                    with torch.no_grad():
                        _,loss,num = model(source_ids=source_ids,target_ids=target_ids)
                    eval_loss += loss.sum().item()
                    tokens_num += num.sum().item()
                model.train()

                # Guarded against an empty dev set (previously ZeroDivisionError).
                eval_loss = _safe_ratio(eval_loss, tokens_num)
                result = {'eval_ppl': round(np.exp(eval_loss),5)}
                for key in sorted(result.keys()):
                    logger.info(" %s = %s", key, str(result[key]))
                logger.info(" "+"*"*20)

                # Pass 2: free generation, scored by exact match and fuzzy ratio.
                predictions = _generate_texts(
                    model, _sequential_loader(dev_gen_data, args.eval_batch_size), tokenizer, device)

                EM = 0.0
                edit_sim = 0.0
                total = len(predictions)
                res_list = []
                for ref,gold in zip(predictions, dev_examples):
                    pred = ref.strip()
                    gt = gold.target
                    edit_sim += fuzz.ratio(pred, gt)
                    # Exact match ignores differences in whitespace.
                    if pred.split() == gt.split():
                        EM += 1
                    res_list.append([pred,gt])
                dev_acc = round(_safe_ratio(EM, total)*100, 2)

                logger.info(" %s = %s "%("Epoch",str(epoch)))
                logger.info(" %s = %s "%("EM Acc",str(dev_acc)))
                logger.info(" %s = %s "%("Edit Distance",str(round(_safe_ratio(edit_sim, total), 2))))
                logger.info(" "+"*"*20)

                if dev_acc > best_score:
                    best_score = dev_acc
                    output_dir = os.path.join(args.output_dir, task_dir + '/')
                    os.makedirs(output_dir, exist_ok=True)
                    model_to_save = model.module if hasattr(model, 'module') else model
                    output_model_file = os.path.join(output_dir, "pytorch_model.bin")
                    torch.save(model_to_save.state_dict(), output_model_file)
                    patience = 0
                else:
                    patience += 1
                    # Early stopping: three evaluations in a row without improvement.
                    if patience == 3:
                        break
                logger.info(" Best score:%s",best_score)
                logger.info(" "+"*"*20)

        if args.do_eval:
            # Fixed: previously ran unconditionally, raising NameError without
            # --do_eval and failing when no checkpoint directory had been made.
            _write_prediction_pairs(os.path.join(args.output_dir, task_dir + '/'),
                                    "last_training_result.jsonl", res_list)

    if args.do_test:
        if args.load_model_path is not None:
            _load_task_checkpoint(model, args)

        args.test_filename = os.path.join(args.test_filename, 'Code_Completion/' + task_dir + '/test.jsonl')
        eval_examples = read_examples(args.test_filename)
        eval_data = _source_only_dataset(eval_examples, tokenizer, args)

        predictions = _generate_texts(
            model, _sequential_loader(eval_data, args.eval_batch_size), tokenizer, device,
            show_progress=True)

        res_list = []
        avg_acc = 0.0
        avg_EM = 0.0
        total = 0
        for ref,gold in zip(predictions, eval_examples):
            pred = ref.strip()
            gt = gold.target.strip()
            if pred == gt:
                avg_EM += 1
            avg_acc += fuzz.ratio(pred, gt)
            res_list.append([pred, gt])
            total += 1
        # Guarded against an empty test set (previously ZeroDivisionError).
        dev_acc = round(_safe_ratio(avg_acc, total), 2)
        dev_em = round(_safe_ratio(avg_EM, total), 4)

        logger.info(" %s = %s "%("Test Token Avg Edit Distance",str(dev_acc)))
        logger.info(" %s = %s "%("Test Token Avg Exact Match Rate",str(dev_em)))
        logger.info(" "+"*"*20)

        if args.test_org:
            output_dir = args.output_dir
        else:
            output_dir = os.path.join(args.output_dir, task_dir + '/')
        _write_prediction_pairs(output_dir, "test_result.jsonl", res_list)
|
|
| |
| |
| |
# Run the command-line entry point only when this file is executed as a
# script, not when it is imported as a module.
if __name__ == "__main__":
    main()
|
|
|
|
|
|
|
|