import streamlit as st
import torch
from langchain.chains import LLMChain
from langchain.llms import HuggingFacePipeline
from langchain.prompts import PromptTemplate
from transformers import (
    AutoModelForCausalLM,
    AutoTokenizer,
    BitsAndBytesConfig,
    pipeline,
)

model_name_or_path = "meta-llama/Llama-2-13b-chat-hf"

# Count the number of GPUs available.
gpu_count = torch.cuda.device_count()

# Determine the device placement based on GPU availability and count:
# - more than one GPU: 'auto' lets accelerate shard the model across devices
# - exactly one GPU:   'cuda:0' pins the whole model to the first GPU
# - no GPU:            fall back to the CPU
if torch.cuda.is_available() and gpu_count > 1:
    device = "auto"
elif torch.cuda.is_available():
    device = "cuda:0"
else:
    device = "cpu"

tokenizer = AutoTokenizer.from_pretrained(model_name_or_path)
model = AutoModelForCausalLM.from_pretrained(
    model_name_or_path,
    # quantization_config=bnb_config,  # optional 4-bit loading, see sketch at the bottom
    torch_dtype=torch.float16,
    device_map=device,
)
# Show how the layers were distributed across the available devices.
print(model.hf_device_map)

pipe = pipeline(
    "text-generation",
    model=model,
    tokenizer=tokenizer,
    max_length=2500,
    return_full_text=True,
    do_sample=True,
    repetition_penalty=1.15,
    num_return_sequences=1,
    pad_token_id=2,
    model_kwargs={
        "temperature": 0.3,
        "top_p": 0.95,
        "top_k": 40,
        "max_new_tokens": 2500,
    },
)

llm = HuggingFacePipeline(pipeline=pipe)

template = """Prompt: {query}

Answer: """

prompt_template = PromptTemplate(input_variables=["query"], template=template)

# Instantiate the chain.
llm_chain = LLMChain(prompt=prompt_template, llm=llm)

st.title("Test Multi GPU")
md = st.text_area("Type in your markdown string (without outer quotes)")

if st.button("Enter"):
    with st.spinner(text="In progress..."):
        resp = llm_chain.invoke(md)["text"]
        st.write(resp)
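
# --- Optional: 4-bit quantization sketch -------------------------------------
# The from_pretrained call above has a commented-out quantization_config=bnb_config
# argument, but bnb_config is never defined in the script. Below is a minimal,
# hypothetical definition using the already-imported BitsAndBytesConfig; the
# specific settings (nf4, double quantization, float16 compute dtype) are
# assumptions, not something the original script specifies.
#
# bnb_config = BitsAndBytesConfig(
#     load_in_4bit=True,
#     bnb_4bit_quant_type="nf4",
#     bnb_4bit_use_double_quant=True,
#     bnb_4bit_compute_dtype=torch.float16,
# )
#
# Passing it to from_pretrained reduces the memory footprint of the 13B
# checkpoint; torch_dtype=torch.float16 can then be dropped:
#
# model = AutoModelForCausalLM.from_pretrained(
#     model_name_or_path,
#     quantization_config=bnb_config,
#     device_map=device,
# )
#
# To try the app, launch it with Streamlit (assuming the file is saved as app.py):
#   streamlit run app.py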