tiny random models
Collection
96 items • Updated • 8
This tiny model is intended for debugging. It is randomly initialized using the configuration adapted from k2-fsa/OmniVoice.
| File path | Size |
|---|---|
| model.safetensors | 5.5MB |
| audio_tokenizer/model.safetensors | 6.7MB |
from omnivoice import OmniVoice
import torch
import torchaudio
# Usage example: load the tiny checkpoint and synthesize two clips.
model_id = "tiny-random/omnivoice"
model = OmniVoice.from_pretrained(model_id, dtype=torch.bfloat16)

# Value passed as the sample-rate argument of torchaudio.save for both clips.
sample_rate = 24000
first_clip_path = "/tmp/example1.wav"
first_clip_text = "Hello, this is test example 1"

# Clip 1: generation steered by a free-text style instruction.
audio = model.generate(
    text=first_clip_text,
    instruct="low pitch, british accent",
)
torchaudio.save(first_clip_path, audio[0], sample_rate)

# Clip 2: generation conditioned on clip 1 (audio file plus its transcript).
audio2 = model.generate(
    text="Hello, this is test example 2",
    ref_audio=first_clip_path,
    ref_text=first_clip_text,
)
torchaudio.save("/tmp/example2.wav", audio2[0], sample_rate)
import torch
import os
from transformers import (
set_seed,
AutoConfig,
AutoTokenizer,
HiggsAudioV2TokenizerModel,
AutoFeatureExtractor,
)
from huggingface_hub import hf_hub_download
import json
from omnivoice import OmniVoice, OmniVoiceConfig
# Script that builds the tiny random checkpoint from the source model's
# configuration and writes it (plus tokenizer, audio tokenizer and feature
# extractor) under ``save_folder``.
source_model_id = "k2-fsa/OmniVoice"
save_folder = "/tmp/tiny-random/omnivoice"
audio_tokenizer_folder = os.path.join(save_folder, "audio_tokenizer")


def load_hub_json(repo_id, filename):
    """Download ``filename`` from the model repo ``repo_id`` and parse it as JSON."""
    local_path = hf_hub_download(repo_id, filename=filename, repo_type="model")
    with open(local_path, "r", encoding="utf-8") as f:
        return json.load(f)


def randomize_parameters(module, seed=42, std=0.2):
    """Overwrite every parameter of ``module`` in place with N(0, std) samples.

    The RNG is re-seeded first and parameters are visited in sorted-name
    order, so the drawn values do not depend on registration order.
    Each parameter's name and shape is printed as it is initialised.
    """
    set_seed(seed)
    with torch.no_grad():
        for name, p in sorted(module.named_parameters()):
            torch.nn.init.normal_(p, 0, std)
            print(name, p.shape)


set_seed(42)

# The text tokenizer is copied from the source repo unchanged.
tokenizer = AutoTokenizer.from_pretrained(source_model_id, trust_remote_code=True)
tokenizer.save_pretrained(save_folder)

# --- Audio tokenizer: source config with small size overrides ---------------
config_dict = load_hub_json(source_model_id, "audio_tokenizer/config.json")
config_dict["acoustic_model_config"].update(
    {
        "decoder_hidden_size": 32,
        "encoder_hidden_size": 4,
        "hidden_size": 4,
        "codebook_dim": 8,
    }
)
config_dict["semantic_model_config"].update(
    {
        "conv_dim": [8] * 7,
        "hidden_size": 16 * 4,
        "intermediate_size": 64,
        "num_attention_heads": 4,
        "num_hidden_layers": 2,
    }
)
os.makedirs(audio_tokenizer_folder, exist_ok=True)
with open(os.path.join(audio_tokenizer_folder, "config.json"), "w", encoding="utf-8") as f:
    json.dump(config_dict, f, ensure_ascii=False, indent=2)

# Round-trip through the written config.json so the model is built from
# exactly what is stored on disk.
audio_tokenizer = HiggsAudioV2TokenizerModel(
    AutoConfig.from_pretrained(audio_tokenizer_folder)
)
print(audio_tokenizer)
# Re-initialise BEFORE saving. Previously the checkpoint was written first,
# so the seeded re-initialisation never reached the saved weights.
randomize_parameters(audio_tokenizer)
audio_tokenizer.save_pretrained(audio_tokenizer_folder)

# The feature extractor is copied from the source repo unchanged.
feature_extractor = AutoFeatureExtractor.from_pretrained(source_model_id, subfolder="audio_tokenizer")
feature_extractor.save_pretrained(audio_tokenizer_folder)

# --- Main model: source config with small LLM size overrides ----------------
config_dict = load_hub_json(source_model_id, "config.json")
config_dict["llm_config"].update(
    {
        "hidden_size": 8,
        "head_dim": 32,
        "intermediate_size": 32,
        "num_attention_heads": 8,
        "num_key_value_heads": 4,
        "num_hidden_layers": 4,
        "max_window_layers": 2,
        # One entry per hidden layer (num_hidden_layers is 4 above).
        "layer_types": ["full_attention"] * 4,
    }
)
config = OmniVoiceConfig.from_dict(config_dict)
model = OmniVoice(config).eval()
randomize_parameters(model)
model.save_pretrained(save_folder)