| """ |
| Fine-tuning the library models for language modeling on a text file (GPT, GPT-2, BERT, RoBERTa). |
| GPT and GPT-2 are fine-tuned using a causal language modeling (CLM) loss while BERT and RoBERTa are fine-tuned |
| using a masked language modeling (MLM) loss. |
| """ |
|
|
from __future__ import absolute_import
import os
import sys
from bleu import _bleu
import pickle
import torch
import json
import random
import logging
import argparse
import numpy as np
from io import open
from itertools import cycle
import torch.nn as nn
from model import Seq2Seq
from tqdm import tqdm, trange
from fuzzywuzzy import fuzz
from torch.utils.data import DataLoader, Dataset, SequentialSampler, RandomSampler, TensorDataset
from torch.utils.data.distributed import DistributedSampler

from transformers import (WEIGHTS_NAME, AdamW, get_linear_schedule_with_warmup,
                          RobertaConfig, RobertaModel, RobertaTokenizer)

logging.basicConfig(format='%(asctime)s - %(levelname)s - %(name)s - %(message)s',
                    datefmt='%m/%d/%Y %H:%M:%S',
                    level=logging.INFO)
logger = logging.getLogger(__name__)

divide_number = 3
|
|
class Example(object):
    """A single training/test example."""
    def __init__(self,
                 idx,
                 source,
                 ts_v,
                 target,
                 ):
        self.idx = idx
        self.source = source
        self.ts_v = ts_v
        self.target = target
|
|
def read_examples(filename):
    """Read examples from filename."""
    examples = []
    with open(filename, encoding="utf-8") as f:
        for idx, line in enumerate(f):
            line = line.strip()
            js = json.loads(line)
            examples.append(
                Example(
                    idx=idx,
                    source=" ".join(js['natrual_language']),
                    ts_v=",".join(js['TS_V_token']),
                    target=" ".join(js["ground_truth"][1:-1]),
                )
            )
    return examples
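# Each input line is expected to be a JSON object with (at least) the keys looked up
# above; the field values shown here are made-up placeholders, not real data:
#   {"natrual_language": ["sort", "the", "list"],
#    "TS_V_token": ["t1", "t2"],
#    "ground_truth": ["<s>", "lst", ".", "sort", "(", ")", "</s>"]}
# Note that 'ground_truth' is sliced with [1:-1], i.e. its first and last tokens
# (presumably boundary markers) are dropped before joining.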
|
|
|
|
class InputFeatures(object):
    """A single set of features for one example."""
    def __init__(self,
                 example_id,
                 source_ids,
                 target_ids,
                 ):
        self.example_id = example_id
        self.source_ids = source_ids
        self.target_ids = target_ids
def convert_examples_to_features(examples, tokenizer, args, stage=None):
    features = []
    for example_index, example in enumerate(examples):

        source_tokens = tokenizer.tokenize(example.source)
        ts_v_tokens = tokenizer.tokenize(example.ts_v)
        source_tokens = [tokenizer.cls_token] + source_tokens + [tokenizer.sep_token] + ts_v_tokens + [tokenizer.sep_token]

        source_ids = tokenizer.convert_tokens_to_ids(source_tokens[:args.max_source_length - 5])
        padding_length = args.max_source_length - len(source_ids)
        source_ids += [tokenizer.pad_token_id] * padding_length

        if stage == "test":
            target_tokens = tokenizer.tokenize("None")
        else:
            target_tokens = tokenizer.tokenize(example.target)[:args.max_target_length - 2]
        target_tokens = [tokenizer.cls_token] + target_tokens + [tokenizer.sep_token]
        target_ids = tokenizer.convert_tokens_to_ids(target_tokens)
        padding_length = args.max_target_length - len(target_ids)
        target_ids += [tokenizer.pad_token_id] * padding_length

        features.append(
            InputFeatures(
                example_index,
                source_ids,
                target_ids,
            )
        )
    return features
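# A sketch of the encodings produced above (lengths are fixed by truncation and padding):
#   source_ids: [CLS] <source tokens> [SEP] <ts_v tokens> [SEP] <pad> ...  -> max_source_length ids
#   target_ids: [CLS] <target tokens> [SEP] <pad> ...                      -> max_target_length ids
# At stage == "test" the target is the dummy string "None": gold targets are not needed
# for decoding, and test references are written to the .gold files separately.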
|
|
|
|
|
|
def set_seed(seed=20240124):
    random.seed(seed)
    os.environ['PYTHONHASHSEED'] = str(seed)
    np.random.seed(seed)
    torch.manual_seed(seed)
    torch.cuda.manual_seed(seed)
    torch.backends.cudnn.deterministic = True
def main():
    parser = argparse.ArgumentParser()

    parser.add_argument("--model_name_or_path", default=None, type=str, required=True,
                        help="Path to pre-trained model: e.g. roberta-base")
    parser.add_argument("--load_model_path", default=None, type=str,
                        help="Path to trained model")
    parser.add_argument("--output_dir", default=None, type=str, required=True,
                        help="The output directory where the model predictions and checkpoints will be written.")

    parser.add_argument("--train_filename", default=None, type=str,
                        help="The train filename. Should contain the .jsonl files for this task.")
    parser.add_argument("--dev_filename", default=None, type=str,
                        help="The dev filename. Should contain the .jsonl files for this task.")
    parser.add_argument("--test_filename", default=None, type=str,
                        help="The test filename. Should contain the .jsonl files for this task.")
    parser.add_argument("--max_source_length", default=256, type=int,
                        help="The maximum total source sequence length after tokenization. Sequences longer "
                             "than this will be truncated, sequences shorter will be padded.")
    parser.add_argument("--max_target_length", default=256, type=int,
                        help="The maximum total target sequence length after tokenization. Sequences longer "
                             "than this will be truncated, sequences shorter will be padded.")
    parser.add_argument("--do_train", action='store_true',
                        help="Whether to run training.")
    parser.add_argument("--do_eval", action='store_true',
                        help="Whether to run eval on the dev set.")
    parser.add_argument("--do_test", action='store_true',
                        help="Whether to run eval on the test set.")
    parser.add_argument("--no_cuda", action='store_true',
                        help="Avoid using CUDA when available.")

    parser.add_argument("--train_batch_size", default=8, type=int,
                        help="Batch size per GPU/CPU for training.")
    parser.add_argument("--eval_batch_size", default=8, type=int,
                        help="Batch size per GPU/CPU for evaluation.")
    parser.add_argument('--gradient_accumulation_steps', type=int, default=1,
                        help="Number of update steps to accumulate before performing a backward/update pass.")
    parser.add_argument("--learning_rate", default=5e-5, type=float,
                        help="The initial learning rate for Adam.")
    parser.add_argument("--beam_size", default=10, type=int,
                        help="Beam size for beam search.")
    parser.add_argument("--weight_decay", default=0.0, type=float,
                        help="Weight decay if we apply some.")
    parser.add_argument("--adam_epsilon", default=1e-8, type=float,
                        help="Epsilon for Adam optimizer.")
    parser.add_argument("--max_grad_norm", default=1.0, type=float,
                        help="Max gradient norm.")
    parser.add_argument("--num_train_epochs", default=3, type=int,
                        help="Total number of training epochs to perform.")
    parser.add_argument('--seed', type=int, default=20240124,
                        help="Random seed for initialization.")

    args = parser.parse_args()
    logging.basicConfig(format='%(asctime)s - %(levelname)s - %(name)s - %(message)s',
                        datefmt='%m/%d/%Y %H:%M:%S', level=logging.INFO)

    # Setup device (respecting --no_cuda) and seed
    device = torch.device("cuda" if torch.cuda.is_available() and not args.no_cuda else "cpu")
    args.n_gpu = torch.cuda.device_count() if device.type == "cuda" else 0
    args.device = device
    logger.info("device: %s, n_gpu: %s", device, args.n_gpu)

    set_seed(args.seed)

    if os.path.exists(args.output_dir) is False:
        os.makedirs(args.output_dir)

    # Build the model: a RoBERTa encoder reused (with is_decoder=True) as the decoder of Seq2Seq
    tokenizer = RobertaTokenizer.from_pretrained(args.model_name_or_path)
    config = RobertaConfig.from_pretrained(args.model_name_or_path)
    config.is_decoder = True
    encoder = RobertaModel.from_pretrained(args.model_name_or_path, config=config)

    model = Seq2Seq(encoder=encoder, decoder=encoder, config=config,
                    beam_size=args.beam_size, max_length=args.max_target_length,
                    sos_id=tokenizer.convert_tokens_to_ids(["<mask0>"])[0], eos_id=tokenizer.sep_token_id)

    logger.info("Training/evaluation parameters %s", args)
    if args.load_model_path is not None:
        logger.info("reload model from {}".format(args.load_model_path + "/pytorch_model.bin"))
        model.load_state_dict(torch.load(args.load_model_path + "/pytorch_model.bin"))
    model.to(args.device)

    if args.n_gpu > 1:
        model = torch.nn.DataParallel(model)
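    # Note: with more than one visible GPU the model is wrapped in DataParallel, so the
    # loss returned during training is per-device and is averaged with loss.mean()
    # before backprop (see the do_train block below).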
|
|
    if args.do_train:
        # Prepare training data loader
        train_examples = read_examples(args.train_filename)
        train_features = convert_examples_to_features(train_examples, tokenizer, args, stage='train')
        all_source_ids = torch.tensor([f.source_ids for f in train_features], dtype=torch.long)
        all_target_ids = torch.tensor([f.target_ids for f in train_features], dtype=torch.long)
        train_data = TensorDataset(all_source_ids, all_target_ids)
        train_sampler = RandomSampler(train_data)
        train_dataloader = DataLoader(train_data, sampler=train_sampler,
                                      batch_size=args.train_batch_size // args.gradient_accumulation_steps)

        # Prepare optimizer and schedule (linear warmup over the first 10% of steps, then linear decay)
        no_decay = ['bias', 'LayerNorm.weight']
        optimizer_grouped_parameters = [
            {'params': [p for n, p in model.named_parameters() if not any(nd in n for nd in no_decay)],
             'weight_decay': args.weight_decay},
            {'params': [p for n, p in model.named_parameters() if any(nd in n for nd in no_decay)],
             'weight_decay': 0.0}
        ]
        optimizer = AdamW(optimizer_grouped_parameters, lr=args.learning_rate, eps=args.adam_epsilon)
        scheduler = get_linear_schedule_with_warmup(optimizer,
                                                    num_warmup_steps=int(len(train_dataloader) * args.num_train_epochs * 0.1),
                                                    num_training_steps=len(train_dataloader) * args.num_train_epochs)

        # Start training
        logger.info("***** Running training *****")
        logger.info("  Num examples = %d", len(train_examples))
        logger.info("  Batch size = %d", args.train_batch_size * args.gradient_accumulation_steps)
        logger.info("  Num epoch = %d", args.num_train_epochs)

        model.train()
        patience, best_score, losses, dev_dataset = 0, 0, [], {}
        for epoch in range(args.num_train_epochs):
            for idx, batch in enumerate(train_dataloader):
                batch = tuple(t.to(device) for t in batch)
                source_ids, target_ids = batch
                loss, _, _ = model(source_ids=source_ids, target_ids=target_ids)

                if args.n_gpu > 1:
                    loss = loss.mean()  # average over DataParallel replicas
                if args.gradient_accumulation_steps > 1:
                    loss = loss / args.gradient_accumulation_steps

                losses.append(loss.item())
                loss.backward()
                if len(losses) % args.gradient_accumulation_steps == 0:
                    # Update parameters
                    optimizer.step()
                    optimizer.zero_grad()
                    scheduler.step()
                    if len(losses) // args.gradient_accumulation_steps % 100 == 0:
                        logger.info("epoch {} step {} loss {}".format(epoch,
                                    len(losses) // args.gradient_accumulation_steps,
                                    round(np.mean(losses[-100 * args.gradient_accumulation_steps:]), 4)))
            if args.do_eval:
                # Evaluate model on the dev set: token-level loss / perplexity first
                if 'dev_loss' in dev_dataset:
                    eval_examples, eval_data = dev_dataset['dev_loss']
                else:
                    eval_examples = read_examples(args.dev_filename)
                    eval_features = convert_examples_to_features(eval_examples, tokenizer, args, stage='dev')
                    all_source_ids = torch.tensor([f.source_ids for f in eval_features], dtype=torch.long)
                    all_target_ids = torch.tensor([f.target_ids for f in eval_features], dtype=torch.long)
                    eval_data = TensorDataset(all_source_ids, all_target_ids)
                    dev_dataset['dev_loss'] = eval_examples, eval_data
                eval_sampler = SequentialSampler(eval_data)
                eval_dataloader = DataLoader(eval_data, sampler=eval_sampler, batch_size=args.eval_batch_size)

                logger.info("\n***** Running evaluation *****")
                logger.info("  Num examples = %d", len(eval_examples))
                logger.info("  Batch size = %d", args.eval_batch_size)

                model.eval()
                eval_loss, tokens_num = 0, 0
                for batch in eval_dataloader:
                    batch = tuple(t.to(device) for t in batch)
                    source_ids, target_ids = batch

                    with torch.no_grad():
                        _, loss, num = model(source_ids=source_ids, target_ids=target_ids)
                    eval_loss += loss.sum().item()
                    tokens_num += num.sum().item()

                model.train()
                eval_loss = eval_loss / tokens_num
                result = {'eval_ppl': round(np.exp(eval_loss), 5)}
                for key in sorted(result.keys()):
                    logger.info("  %s = %s", key, str(result[key]))
                logger.info("  " + "*" * 20)

                # Then generate predictions and compute BLEU / EM / edit distance on the dev set
                if 'dev_bleu' in dev_dataset:
                    eval_examples, eval_data = dev_dataset['dev_bleu']
                else:
                    eval_examples = read_examples(args.dev_filename)
                    eval_features = convert_examples_to_features(eval_examples, tokenizer, args, stage='test')
                    all_source_ids = torch.tensor([f.source_ids for f in eval_features], dtype=torch.long)
                    eval_data = TensorDataset(all_source_ids)
                    dev_dataset['dev_bleu'] = eval_examples, eval_data

                eval_sampler = SequentialSampler(eval_data)
                eval_dataloader = DataLoader(eval_data, sampler=eval_sampler, batch_size=args.eval_batch_size)

                model.eval()
                p = []
                for batch in eval_dataloader:
                    batch = tuple(t.to(device) for t in batch)
                    source_ids = batch[0]
                    with torch.no_grad():
                        preds = model(source_ids=source_ids)
                        # Take the top beam and cut the prediction at the first 0 id
                        for pred in preds:
                            t = pred[0].cpu().numpy()
                            t = list(t)
                            if 0 in t:
                                t = t[:t.index(0)]
                            text = tokenizer.decode(t, clean_up_tokenization_spaces=False)
                            p.append(text)

                model.train()
                predictions = []
                edit_dis = 0
                cnt_all = 0
                res_list = []
                EM = []
                is_gened = False
                with open(args.output_dir + "/dev.output", 'w') as f, open(args.output_dir + "/dev.gold", 'w') as f1:
                    for ref, gold in zip(p, eval_examples):
                        predictions.append(ref)
                        if len(ref) > 0:
                            is_gened = True
                        f.write(ref + '\n')
                        f1.write(gold.target + '\n')
                        EM.append(ref.split() == gold.target.split())
                        edit_dis += fuzz.ratio(ref, gold.target)
                        res_list.append([ref, gold.target])
                        cnt_all += 1
                if is_gened:
                    dev_bleu = _bleu(args.output_dir + "/dev.gold", args.output_dir + "/dev.output")
                else:
                    dev_bleu = 0
                avg_edit_dis = float(edit_dis) / cnt_all
                logger.info("  %s = %s " % ("Epoch", str(epoch)))
                logger.info("  %s = %s " % ("bleu-4", str(dev_bleu)))
                logger.info("  %s = %s " % ("Edit Distance", str(round(avg_edit_dis, 2))))
                logger.info("  %s = %s " % ("EM", str(round(np.mean(EM) * 100, 2))))
                logger.info("  " + "*" * 20)

                # Save the checkpoint with the best combined (BLEU + edit distance) score
                dev_score = (dev_bleu + avg_edit_dis) / 2.0
                if dev_score > best_score:
                    best_score = dev_score
                    output_dir = args.output_dir
                    if not os.path.exists(output_dir):
                        os.makedirs(output_dir)
                    model_to_save = model.module if hasattr(model, 'module') else model
                    output_model_file = os.path.join(output_dir, "pytorch_model.bin")
                    torch.save(model_to_save.state_dict(), output_model_file)
                    patience = 0
                else:
                    patience += 1
                    if patience == 3:
                        break

                # Dump this epoch's dev predictions alongside the gold targets
                output_dir = args.output_dir
                with open(output_dir + "/last_training_result.jsonl", 'w') as wf:
                    for line in res_list:
                        dic = {}
                        dic["Pred"] = line[0]
                        dic["GT"] = line[1]
                        wf.write(json.dumps(dic))
                        wf.write("\n")

        logger.info("  Best score:%s", best_score)
        logger.info("  " + "*" * 20)
    if args.do_test:
        res_list = []
        if args.load_model_path is not None:
            # Note: this reloads the checkpoint saved under output_dir during training;
            # load_model_path only gates the reload, it is not the path that is read here.
            checkpoint_prefix = 'pytorch_model.bin'
            output_dir = os.path.join(args.output_dir, checkpoint_prefix)
            model_to_load = model.module if hasattr(model, 'module') else model
            model_to_load.load_state_dict(torch.load(output_dir))

        eval_examples = read_examples(args.test_filename)
        eval_features = convert_examples_to_features(eval_examples, tokenizer, args, stage='test')
        all_source_ids = torch.tensor([f.source_ids for f in eval_features], dtype=torch.long)
        eval_data = TensorDataset(all_source_ids)

        eval_sampler = SequentialSampler(eval_data)
        eval_dataloader = DataLoader(eval_data, sampler=eval_sampler, batch_size=args.eval_batch_size)

        model.eval()
        p = []
        for batch in tqdm(eval_dataloader, total=len(eval_dataloader)):
            batch = tuple(t.to(device) for t in batch)
            source_ids = batch[0]
            with torch.no_grad():
                preds = model(source_ids)
                for pred in preds:
                    t = pred[0].cpu().numpy()
                    t = list(t)
                    if 0 in t:
                        t = t[:t.index(0)]
                    text = tokenizer.decode(t, clean_up_tokenization_spaces=False)
                    p.append(text)

        predictions = []
        EM = []
        edit_dis = 0
        cnt = 0
        with open(args.output_dir + "/test.output", 'w') as f, open(args.output_dir + "/test.gold", 'w') as f1:
            for ref, gold in zip(p, eval_examples):
                res_list.append([ref, gold.target])
                predictions.append(ref)
                f.write(ref + '\n')
                f1.write(gold.target + '\n')
                EM.append(ref.split() == gold.target.split())
                edit_dis += fuzz.ratio(ref, gold.target)
                cnt += 1

        dev_bleu = _bleu(args.output_dir + "/test.gold", args.output_dir + "/test.output")
        logger.info("  %s = %s " % ("bleu-4", str(dev_bleu)))
        logger.info("  %s = %s " % ("EM", str(round(np.mean(EM) * 100, 2))))
        logger.info("  %s = %s " % ("Edit Distance", str(round(float(edit_dis) / cnt, 2))))
        logger.info("  " + "*" * 20)

        with open(args.output_dir + "/last_training_result.jsonl", 'w') as wf:
            for line in res_list:
                dic = {}
                dic["Pred"] = line[0]
                dic["GT"] = line[1]
                wf.write(json.dumps(dic))
                wf.write("\n")
if __name__ == "__main__":
    main()
|
|
|
|
|
|