# testMultiGPU / app.py
import streamlit as st
import torch
from langchain.chains import LLMChain
from langchain.prompts import PromptTemplate
from langchain.llms import HuggingFacePipeline
from transformers import AutoTokenizer, AutoModelForCausalLM, pipeline, BitsAndBytesConfig

model_name_or_path = "meta-llama/Llama-2-13b-chat-hf"

# Pick a device map based on GPU availability:
#   - more than one GPU: let Accelerate shard the model across them ('auto')
#   - exactly one GPU:   place everything on the first GPU
#   - no GPU:            fall back to CPU
gpu_count = torch.cuda.device_count()
if torch.cuda.is_available() and gpu_count > 1:
    device_map = 'auto'
elif torch.cuda.is_available():
    device_map = 'cuda:0'  # plain device strings need a recent transformers/accelerate
else:
    device_map = 'cpu'
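
# Optional 4-bit quantization config (a sketch, not part of the original app):
# define bnb_config and pass quantization_config=bnb_config below to cut GPU
# memory use. Requires the bitsandbytes package.
# bnb_config = BitsAndBytesConfig(
#     load_in_4bit=True,
#     bnb_4bit_quant_type="nf4",
#     bnb_4bit_compute_dtype=torch.float16,
# )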

tokenizer = AutoTokenizer.from_pretrained(model_name_or_path)
model = AutoModelForCausalLM.from_pretrained(
    model_name_or_path,
    # quantization_config=bnb_config,
    torch_dtype=torch.float16,
    device_map=device_map,
)
print(model.hf_device_map)  # log how the layers were spread across devices

pipe = pipeline(
    "text-generation",
    model=model,
    tokenizer=tokenizer,
    return_full_text=True,
    do_sample=True,
    # sampling parameters are passed directly so they reach generate();
    # model_kwargs would route them to from_pretrained instead
    temperature=0.3,
    top_p=0.95,
    top_k=40,
    # use max_new_tokens (not max_length) so the limit applies to generated tokens only
    max_new_tokens=2500,
    repetition_penalty=1.15,
    num_return_sequences=1,
    pad_token_id=tokenizer.eos_token_id,  # Llama 2 has no pad token; reuse EOS (id 2)
)
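
# Wrap the transformers pipeline so LangChain can call it like any other LLM.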
llm = HuggingFacePipeline(pipeline=pipe)

template = """Prompt: {query}
Answer: """
prompt_template = PromptTemplate(
    input_variables=["query"],
    template=template,
)
# Instantiate the chain that feeds the prompt template into the local pipeline
llm_chain = LLMChain(prompt=prompt_template, llm=llm)
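# Example call outside the Streamlit UI (hypothetical prompt):
# llm_chain.invoke("What does device_map='auto' do?")['text']
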
st.title('Test Multi GPU')
query = st.text_area('Enter a prompt for the model')
if st.button("Generate", type="primary"):
    resp = llm_chain.invoke(query)['text']
    st.write(resp)
else:
    st.write("Enter a prompt and press Generate.")