| | import argparse |
| | import numpy as np |
| | import os |
| | import pandas as pd |
| | from tqdm import tqdm |
| | import torch |
| | import utils |
| | from metrics import * |
| | import sys |
| | sys.path.insert(0,"./") |
| | from Models.SentenceTransformersModel import SentenceTransformerModels |
| | from Models.llm_embeddings import LLMEmbeddings |
| | from main_args import get_args |
| | from metrics import CosineMetric |
| |
|
| |
|
def read_pertubed_data(filename, task, lang="en"):
    """Load a perturbed-dataset CSV into a DataFrame.

    Parameters
    ----------
    filename : str
        Path to the perturbed CSV file.
    task : str
        Perturbation task name (currently unused here; kept for interface
        compatibility with callers).  # NOTE(review): confirm before removing
    lang : str
        Language code (currently unused here).

    Returns
    -------
    pandas.DataFrame with the file contents.

    Raises
    ------
    FileNotFoundError
        If `filename` does not exist on disk.
    """
    if not os.path.exists(filename):
        # Bug fix: the message previously printed the literal "(unknown)"
        # instead of the missing path.
        raise FileNotFoundError(f"File {filename} not found.")
    return pd.read_csv(filename)
| |
|
| |
|
def run(args_model, dataset_name, target_lang, args_task, default_gpu="cuda", metric="cosine", save=False, batch_size=2):
    """Encode perturbed sentence sets with an embedding model and score
    cosine-style similarity between originals and their perturbations.

    Parameters
    ----------
    args_model : str
        Embedding model identifier passed to LLMEmbeddings (e.g. "chatgpt", "llama3").
    dataset_name : str
        Base name of the perturbed dataset (e.g. "mrpc").
    target_lang : str
        Language code used to locate the perturbed CSV and the results path.
    args_task : str
        Perturbation criterion; accepts aliases ("Anto"/"Antonym",
        "Jumbling"/"jumb", "Syn"/"Synonym", "Paraphrase"/"para").
    default_gpu : str
        Torch device string handed to the model.
    metric : str
        Currently unused.  # NOTE(review): kept for interface compatibility — confirm
    save : bool
        When True, write the per-row similarity table under ./Results/.
    batch_size : int
        Batch size for `encode_batch`.

    Returns
    -------
    pandas.DataFrame
        The loaded dataset with per-row similarity (and diff) columns added.

    Raises
    ------
    ValueError
        If `args_task` is not a recognized task alias.
    """
    # Canonicalize the task name once so that column selection, the metric
    # branches, and the output paths all agree.  (Previously aliases such as
    # "Antonym" built the sentence list but matched no metric branch, so the
    # function silently returned the data unchanged.)
    aliases = {
        "anto": "anto", "Anto": "anto", "Antonym": "anto",
        "jumbling": "jumbling", "Jumbling": "jumbling", "jumb": "jumbling",
        "syn": "syn", "Syn": "syn", "Synonym": "syn",
        "paraphrase": "paraphrase", "Paraphrase": "paraphrase", "para": "paraphrase",
    }
    task = aliases.get(args_task)
    if task is None:
        raise ValueError(f"Unknown task: {args_task!r}")

    model = LLMEmbeddings(args_model, device=default_gpu)

    pertubed_data_path = f"./data/perturbed_dataset/{target_lang}/{task}/{dataset_name}_{task}_perturbed_{target_lang}.csv"
    data = read_pertubed_data(pertubed_data_path, task)

    print(f"\n*** Model {args_model} on {dataset_name} dataset for {task} task ***\n")

    # Columns to encode, interleaved row-by-row: each DataFrame row
    # contributes len(cols) consecutive sentences, recovered further down
    # by striding through the embedding matrix.
    task_cols = {
        "anto": ["original_sentence", "paraphrased_sentence", "perturb_n1"],
        "jumbling": ["original_sentence", "paraphrased_sentence", "perturb_n1", "perturb_n2", "perturb_n3"],
        "syn": ["original_sentence", "perturb_n1", "perturb_n2", "perturb_n3"],
        "paraphrase": ["original_sentence", "paraphrased_sentence"],
    }
    cols = task_cols[task]
    sentences = []
    for _, row in data[cols].iterrows():
        sentences.extend(row.values)

    embeddings = model.encode_batch(sentences, batch_size=batch_size)

    if args_model == "chatgpt":
        # API-backed embeddings may come back as a mixed list of tensors and
        # arrays; normalize everything to a single numpy matrix.
        embeddings = [emb.cpu().numpy() if isinstance(emb, torch.Tensor) else emb for emb in embeddings]
        embeddings = np.array(embeddings)
    elif isinstance(embeddings, torch.Tensor):
        embeddings = embeddings.cpu().numpy()

    if task == "anto":
        emb_org = embeddings[0::3]
        emb_para = embeddings[1::3]
        emb_anto = embeddings[2::3]

        _, sim_para = utils.similarity_between_sent(emb_org, emb_para)
        _, sim_anto = utils.similarity_between_sent(emb_org, emb_anto)
        data["sim_org_para"] = sim_para
        data["sim_org_anto"] = sim_anto
        data["diff_org_para"] = np.array(sim_para) - np.array(sim_anto)

        print(f"""The summary for Antonym Criteria for {args_model} \n {data.describe()} """)

    elif task == "jumbling":
        emb_org = embeddings[0::5]
        emb_para = embeddings[1::5]
        emb_n1 = embeddings[2::5]
        emb_n2 = embeddings[3::5]
        emb_n3 = embeddings[4::5]

        _, sim_para = utils.similarity_between_sent(emb_org, emb_para)
        _, sim_n1 = utils.similarity_between_sent(emb_org, emb_n1)
        _, sim_n2 = utils.similarity_between_sent(emb_org, emb_n2)
        _, sim_n3 = utils.similarity_between_sent(emb_org, emb_n3)

        data["sim_org_para"] = sim_para
        data["sim_org_n1"] = sim_n1
        data["sim_org_n2"] = sim_n2
        data["sim_org_n3"] = sim_n3

        # Vectorize before subtracting (the anto branch already did this;
        # plain Python lists would raise a TypeError on `-`).
        sim_para = np.array(sim_para)
        # NOTE(review): diff_org_para is always zero by construction —
        # presumably a baseline column; confirm this is intentional.
        data["diff_org_para"] = sim_para - sim_para
        data["diff_org_n1"] = sim_para - np.array(sim_n1)
        data["diff_org_n2"] = sim_para - np.array(sim_n2)
        data["diff_org_n3"] = sim_para - np.array(sim_n3)

        print(f"""The summary for Jumbling Criteria for {args_model} \n {data.describe()} """)

    elif task == "syn":
        emb_org = embeddings[0::4]
        emb_s1 = embeddings[1::4]
        emb_s2 = embeddings[2::4]
        emb_s3 = embeddings[3::4]

        _, sim_s1 = utils.similarity_between_sent(emb_org, emb_s1)
        _, sim_s2 = utils.similarity_between_sent(emb_org, emb_s2)
        _, sim_s3 = utils.similarity_between_sent(emb_org, emb_s3)

        data["sim_org_s1"] = sim_s1
        data["sim_org_s2"] = sim_s2
        data["sim_org_s3"] = sim_s3

        print(f"""The summary for Synonym Criteria for {args_model} \n {data.describe()} """)

    elif task == "paraphrase":
        emb_s1 = embeddings[0::2]
        emb_s2 = embeddings[1::2]
        # Bug fix: similarity_between_sent returns (mean, per-row sims);
        # previously the whole tuple was assigned to the column.
        _, sims = utils.similarity_between_sent(emb_s1, emb_s2)
        data["sim"] = sims

        print(f"""The summary for Paraphrase Criteria for {args_model} \n {data.describe()} """)

    if save:
        path = f"./Results/{target_lang}/{task}/{dataset_name}_{args_model}_{task}_metric.csv"
        # Create the results directory if needed so to_csv cannot fail on a
        # missing folder.
        os.makedirs(os.path.dirname(path), exist_ok=True)
        data.to_csv(path)
        # Bug fix: the f-prefix was missing, printing the literal "{path}".
        print(f"Data saved at path : {path} ")
    return data
| |
|
if __name__ == "__main__":
    # When running under a debugger (sys.gettrace() is set), bypass argparse
    # and use a fixed debug configuration; otherwise read everything from
    # the command line via get_args().
    if sys.gettrace() is not None:
        config = {
            "args_model": "llama3",
            "dataset_name": "mrpc",
            "args_task": "syn",
            "default_gpu": "cuda:2",
            "save": False,
            "target_lang": "en",
        }
    else:
        parsed = get_args()
        config = {
            "args_model": parsed.model_name,
            "dataset_name": parsed.perturbed_dataset,
            "args_task": parsed.task,
            "default_gpu": parsed.gpu,
            "save": parsed.save,
            "target_lang": parsed.target_lang,
            "metric": parsed.metric,
            "batch_size": 2,
        }
    run(**config)
| |
|