# NOTE: the original paste included the Hugging Face Spaces status banner
# ("Spaces: Runtime error") above the code; kept here as a comment so the
# file remains valid Python.
# Standard library
import os

# Third-party
import gradio as gr
from dotenv import load_dotenv
from huggingface_hub import login
from transformers import (
    AutoModelForCausalLM,
    AutoTokenizer,
    BitsAndBytesConfig,
    pipeline,
)
# --- Authentication -------------------------------------------------------
# Load the Hugging Face token from a local .env file (or the environment) and
# authenticate: the Llama-2 checkpoints are gated and require an accepted
# license tied to the account.
load_dotenv()
API_TOKEN = os.environ.get("HUGGINGFACE_TOKEN")
if not API_TOKEN:
    # Fail fast with a clear message instead of passing None to login(),
    # which surfaces as a confusing error deep inside huggingface_hub.
    raise RuntimeError(
        "HUGGINGFACE_TOKEN is not set; add it to your environment or .env file."
    )
login(API_TOKEN)

# --- Model loading --------------------------------------------------------
model_id = "meta-llama/Llama-2-7b-chat-hf"

# 8-bit quantization with fp32 CPU offload so the 7B model can be loaded on
# hardware without (enough) GPU memory.
quantization_config = BitsAndBytesConfig(llm_int8_enable_fp32_cpu_offload=True)

# Pin every module to the CPU. Both the legacy "transformer.*" names and the
# current "model.*" names are listed so the map covers either checkpoint
# layout — assumes one of the two naming schemes matches; unmatched keys are
# ignored by accelerate.
device_map = {
    "transformer.word_embeddings": "cpu",
    "transformer.word_embeddings_layernorm": "cpu",
    "lm_head": "cpu",
    "transformer.h": "cpu",
    "transformer.ln_f": "cpu",
    "model.embed_tokens": "cpu",
    "model.layers": "cpu",
    "model.norm": "cpu",
}

model = AutoModelForCausalLM.from_pretrained(
    model_id,
    device_map=device_map,
    quantization_config=quantization_config,
)
tokenizer = AutoTokenizer.from_pretrained(model_id)
# Text-generation pipeline used by the Gradio handler below.
# NOTE: do_sample=True is required for `temperature` to take effect — with the
# default (greedy) decoding, transformers warns and silently ignores it.
generate_text_pipeline = pipeline(
    task="text-generation",
    model=model,
    tokenizer=tokenizer,
    return_full_text=True,  # echo the prompt back together with the completion
    do_sample=True,
    temperature=0.1,
    max_new_tokens=512,
    repetition_penalty=1.1,  # without this the output begins repeating
)
def get_results(text):
    """Generate a completion for *text* and return it as a string.

    Args:
        text: The prompt entered in the Gradio textbox.

    Returns:
        The generated text. Because the pipeline was built with
        return_full_text=True, the returned string includes the prompt.
    """
    res = generate_text_pipeline(text)
    return res[0]["generated_text"]
# Minimal Gradio UI: a single text box in, a single text box out. launch() is
# called unconditionally (no __main__ guard) because Spaces executes this file
# as the app entry point.
iface = gr.Interface(fn=get_results, inputs="text", outputs="text")
iface.launch()