import tempfile
from pathlib import Path
from typing import Dict

import librosa
import torch
import torchaudio


# Presumably a sample rate in Hz. NOTE(review): nothing in the visible handler
# reads this constant -- it resamples to and saves at the loaded model's own
# rate (``self.mars5.sr``). Confirm whether other code depends on it.
SAMPLE_RATE = 16000


class EndpointHandler:
    """Callable handler that synthesizes speech with the MARS5 TTS model.

    The model and its config class are loaded once in ``__init__`` through
    ``torch.hub`` and reused for every call.
    """

    def __init__(self, path=""):
        """Load the MARS5 English model from torch.hub.

        Args:
            path: Accepted for interface compatibility; not used by this
                handler.
        """
        # trust_repo=True loads the hub repository's code without asking for
        # interactive confirmation.
        self.mars5, self.config_class = torch.hub.load(
            'Camb-ai/mars5-tts', 'mars5_english', trust_repo=True
        )

    def __call__(self, data: Dict[str, bytes]) -> Dict[str, str]:
        """Synthesize ``data["text"]`` using the supplied reference audio.

        Args:
            data: Request payload with three required keys:
                ``"text"`` -- the text to synthesize;
                ``"audio_file"`` -- the reference audio, passed unchanged to
                ``librosa.load``;
                ``"transcript"`` -- passed to the model alongside the
                reference audio (presumably the transcript of that audio).

        Returns:
            ``{"synthesized_audio": <path>}`` where ``<path>`` is a newly
            created ``.wav`` file in the temp directory. The handler does not
            delete this file; cleaning it up is the caller's responsibility.

        Raises:
            KeyError: If any of the required keys is missing from ``data``.
        """
        text = data["text"]
        audio_file = data["audio_file"]
        transcript = data["transcript"]

        # Load the reference as mono, resampled to the model's sample rate.
        # The returned rate equals the requested one, so it is discarded.
        ref_wav, _ = librosa.load(audio_file, sr=self.mars5.sr, mono=True)
        ref_wav = torch.from_numpy(ref_wav)

        cfg = self.config_class(
            deep_clone=True,
            rep_penalty_window=100,
            top_k=100,
            temperature=0.7,
            freq_penalty=3,
        )

        # Only the waveform is needed; the first return value is unused.
        _, wav_out = self.mars5.tts(text, ref_wav, transcript, cfg=cfg)

        # tempfile.mktemp() only invents a name, leaving a window in which
        # another process can create that path first. NamedTemporaryFile
        # creates the file itself; delete=False keeps it for the caller, and
        # the handle is closed before torchaudio reopens the path for writing.
        with tempfile.NamedTemporaryFile(suffix=".wav", delete=False) as tmp:
            output_path = Path(tmp.name)

        try:
            # unsqueeze(0) adds the leading channel dimension before saving.
            torchaudio.save(output_path, wav_out.unsqueeze(0), self.mars5.sr)
        except Exception:
            # Do not leave an empty or partial temp file behind on failure.
            if output_path.exists():
                output_path.unlink()
            raise

        return {"synthesized_audio": str(output_path)}