import pandas as pd
import os
import sys
from tqdm import tqdm

sys.path.insert(0, "/home/yash/EMNLP-2024/ALIGN-Multilingual/")
from Models.MultilingualTranslationModel import NLLBTranslator
from args_parser import get_args
from src.utils import read_data

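# This script batch-translates sentence-pair datasets (sentence1 / sentence2 / label)
# from English into several target languages with an NLLB model, caching each
# translated split to CSV so repeated runs can reuse earlier translations.
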
def translate_dataset(dataset_name, model_name, target_lang, batch_size=16, sample_size=1000, save=False):
    """
    Translates a dataset in batches using the NLLB model.

    Args:
        dataset_name (str): Name of the dataset.
        model_name (str): Model name used for translation.
        target_lang (str): Target language for translation.
        batch_size (int): Number of sentences to process in each batch.
        sample_size (int): Number of rows to process.
        save (bool): Whether to save the translated dataset to CSV.

    Returns:
        pd.DataFrame: Translated dataset.
    """

    # Path where the translated split is cached.
    translated_file_path = f"/home/yash/EMNLP-2024/ALIGN-Multilingual/data/{dataset_name}_{target_lang}.csv"

    data = read_data(dataset_name)
    print(f"Size of dataset: {len(data)}")

    # Limit translation to the first sample_size rows.
    if sample_size is not None:
        data = data.head(sample_size)

    print("Original dataset loaded successfully")

    # If a cached translation already exists, return it without loading the model.
    if os.path.exists(translated_file_path):
        translated_dataset = pd.read_csv(translated_file_path)
        print("Translated dataset already exists and was loaded successfully")
        return translated_dataset

    model = NLLBTranslator(model_name=model_name)
    print("NLLB model loaded successfully")

    print("Creating the dataset ...")
    translated_dataset = pd.DataFrame(columns=['sentence1', 'sentence2', 'label'])

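    # model.translate is assumed to take a list of sentences and return the translations
    # in the same order (batch interface of NLLBTranslator).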
    for i in tqdm(range(0, len(data), batch_size)):
        # Slice the current batch by position so a non-contiguous index cannot skew the batches.
        batch_sentences1 = data['sentence1'].iloc[i:i + batch_size].tolist()
        batch_sentences2 = data['sentence2'].iloc[i:i + batch_size].tolist()
        batch_labels = data['label'].iloc[i:i + batch_size].tolist()

        translated_batch1 = model.translate(batch_sentences1, source_lang="en", target_lang=target_lang)
        translated_batch2 = model.translate(batch_sentences2, source_lang="en", target_lang=target_lang)

        batch_df = pd.DataFrame({
            'sentence1': translated_batch1,
            'sentence2': translated_batch2,
            'label': batch_labels
        })

        translated_dataset = pd.concat([translated_dataset, batch_df], ignore_index=True)

    if save:
        translated_dataset.to_csv(translated_file_path, index=False)
        print(f"Translated dataset saved to {translated_file_path}")
    return translated_dataset
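
# Example call (hypothetical dataset and model names; accepted language codes depend on NLLBTranslator):
# translate_dataset("snli", "facebook/nllb-200-distilled-600M", "fr", batch_size=32, save=True)
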
if __name__ == "__main__":
    languages = ['fr', 'es', 'de', 'zh-CN', 'ja', 'ko']
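    # These are googletrans-style codes; if NLLBTranslator expects FLORES-200 codes
    # (e.g. "fra_Latn"), map them before calling translate_dataset (assumption).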

    args = get_args()

    for language in languages:
        print(f"Translating to {language} ...")
        config = {
            "dataset_name": args.dataset_name,
            "model_name": args.model_name,
            "target_lang": language,
            "batch_size": args.batch_size,
            "save": args.save
        }
        translated_dataset_lang = translate_dataset(**config)

| | print("Done") |
| |
|