import warnings

import gradio as gr
from langchain.llms import CTransformers

warnings.filterwarnings("ignore")

# Load the quantized Llama-2-7B-Chat GGML model on CPU via ctransformers.
# A CTransformers LLM is not a transformers model and cannot be passed to
# transformers.pipeline(), so all decoding parameters (max tokens, sampling)
# go in the `config` dict here instead.
llm = CTransformers(
    model="TheBloke/Llama-2-7B-Chat-GGML",
    model_file="llama-2-7b-chat.ggmlv3.q3_K_S.bin",
    config={
        "max_new_tokens": 500,
        "temperature": 0,
        "top_k": 10,
    },
)


def greet(prompt):
    # Run the prompt through the local LLM and return the generated text.
    return llm(prompt)


iface = gr.Interface(fn=greet, inputs="text", outputs="text")
iface.launch()
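# --- Alternative: HuggingFacePipeline with full-precision weights (sketch) ---
# The original script tried to feed the GGML model into transformers.pipeline,
# which only accepts genuine transformers models. If you instead have GPU
# memory and access to the fp16 checkpoint (the gated
# "meta-llama/Llama-2-7b-chat-hf" repo is assumed here, not something the
# script above requires), the HuggingFacePipeline path would look roughly
# like this:
#
# import torch
# from transformers import AutoModelForCausalLM, AutoTokenizer, pipeline
# from langchain.llms import HuggingFacePipeline
#
# model_id = "meta-llama/Llama-2-7b-chat-hf"  # assumption: gated repo access
# tokenizer = AutoTokenizer.from_pretrained(model_id)
# hf_model = AutoModelForCausalLM.from_pretrained(
#     model_id, torch_dtype=torch.bfloat16, device_map="auto"
# )
# pipe = pipeline(
#     "text-generation",
#     model=hf_model,
#     tokenizer=tokenizer,
#     max_length=500,
#     do_sample=True,
#     top_k=10,
#     num_return_sequences=1,
#     eos_token_id=tokenizer.eos_token_id,
# )
# llm = HuggingFacePipeline(pipeline=pipe)
#
# Either way, `llm` exposes the same LangChain interface, so the Gradio
# wiring above is unchanged.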