import os
import sys
import time

import torch
from transformers import AutoConfig, AutoModelForCausalLM, AutoTokenizer

# Make the local Dolphin modules importable when this script is run directly.
sys.path.append(os.path.dirname(os.path.abspath(__file__)))
from configuration_dolphin import DolphinConfig
from modeling_dolphin import DolphinForCausalLM
|
|
def inference_instruct(mycontext, question, device="cuda:0"):
    """Answer `question` against `mycontext` using Dolphin's memory tokens.

    Relies on the module-level `tokenizer` and `model` created in __main__.
    """
    MEMORY_SIZE = 32  # number of memory slots; must match the model's [memory_i] tokens
    EOS_TOKEN_ID = 151643  # <|endoftext|> in the Qwen tokenizer family Dolphin builds on
    start_time = time.time()
    generated_token_ids = []

    # Split the prompt around the <context> marker and reserve MEMORY_SIZE
    # placeholder positions (-1) where the compressed context will be injected.
    prompt = f" <context>{question}"
    text_chunks = [tokenizer(chunk).input_ids for chunk in prompt.split("<context>")]
    input_ids = (
        torch.tensor(
            text_chunks[0] + [-1] * MEMORY_SIZE + text_chunks[1], dtype=torch.long
        )
        .unsqueeze(0)
        .to(device)
    )

    # Tokenize the context followed by the [memory_i] tokens the model
    # compresses it into.
    context_tokenized = tokenizer(
        mycontext + "".join([f"[memory_{i}]" for i in range(MEMORY_SIZE)]),
        return_tensors="pt",
    )
    context_tokenized = {k: v.to(device) for k, v in context_tokenized.items()}
    context_token_count = context_tokenized["input_ids"].shape[1] - MEMORY_SIZE

    # Greedy decoding, one full forward pass per token; generation length is
    # capped at the number of context tokens.
    for _ in range(context_token_count):
        next_token = (
            model(
                input_ids,
                context_input_ids=context_tokenized["input_ids"],
                context_attention_mask=context_tokenized["attention_mask"],
            )
            .logits[:, -1]
            .argmax(-1)
        )
        if next_token.item() == EOS_TOKEN_ID:
            break
        generated_token_ids.append(next_token.item())
        input_ids = torch.cat([input_ids, next_token.unsqueeze(1)], dim=-1)

    result = tokenizer.decode(generated_token_ids)
    print(f"Time taken: {time.time() - start_time:.2f}s")
    return result
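

# A small optional wrapper (not part of the original example): the loop above
# runs every forward pass with autograd enabled, which tracks state needlessly
# during pure inference. A minimal sketch, assuming nothing else in the
# process needs gradients:
def inference_instruct_no_grad(mycontext, question, device="cuda:0"):
    with torch.inference_mode():
        return inference_instruct(mycontext, question, device=device)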


if __name__ == "__main__":
    # Register the custom Dolphin architecture with the Auto* factories.
    AutoConfig.register("dolphin", DolphinConfig)
    AutoModelForCausalLM.register(DolphinConfig, DolphinForCausalLM)
    device_name = "cuda:0" if torch.cuda.is_available() else "cpu"

    tokenizer = AutoTokenizer.from_pretrained("NexaAIDev/Dolphin", trust_remote_code=True)
    model = AutoModelForCausalLM.from_pretrained(
        "NexaAIDev/Dolphin",
        trust_remote_code=True,
        torch_dtype=torch.bfloat16,
        device_map=device_name,
    )

    mycontext = (
        "Nexa AI is a Cupertino-based company founded in May 2023 that researches "
        "and develops models and tools for on-device AI applications. The company "
        "was founded by Alex and Zack. The company is known for its Octopus-series "
        "models, which rival large-scale language models in capabilities such as "
        "function-calling, multimodality, and action-planning, while remaining "
        "efficient and compact for edge device deployment. Nexa AI's mission is to "
        "advance on-device AI in collaboration with the global developer community. "
        "To this end, the company has created an on-device model hub for users to "
        "find, share, and collaborate on open-source AI models optimized for edge "
        "devices, as well as an SDK for developers to run and deploy AI models "
        "locally."
    )
    question = "Who founded Nexa AI?"

    result = inference_instruct(mycontext, question, device=device_name)
    print("Result:", result)
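
    # Optional follow-up (hypothetical extra questions, not in the original
    # example): the loaded model and tokenizer can be queried repeatedly
    # against the same context without reloading anything.
    for q in ["When was Nexa AI founded?", "Where is Nexa AI based?"]:
        print(q, "->", inference_instruct(mycontext, q, device=device_name))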