import torch
from transformers import AutoTokenizer, AutoModelForCausalLM
from sentence_transformers import SentenceTransformer
import chromadb
import gradio as gr

# Determine the device (CPU or GPU)
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")

# Initialize the tokenizer and model
tokenizer = AutoTokenizer.from_pretrained("./SMOL")
model = AutoModelForCausalLM.from_pretrained("./SMOL").to(device)

# Initialize the sentence-transformer embedding model
smodel = SentenceTransformer('./embed')

# Initialize the chromadb client and collection
client = chromadb.PersistentClient(path="vectordb")
collection = client.get_or_create_collection("dogdb")

# Last line of the user prompt. It is also used to locate and strip the
# echoed prompt from the decoded model output, so the two must match exactly.
PROMPT_SUFFIX = "Respond in a friendly manner; you are an informational assistant about dogs."


def clean_text_block(text):
    """Extract the retrieved documents from the string form of a chromadb
    query result (the substring between 'documents': [[ and ]], 'uris':)."""
    start_keyword = "'documents': [["
    end_keyword = "]], 'uris':"
    start_index = text.find(start_keyword)
    end_index = text.find(end_keyword)
    # find() returns -1 on a miss, so check both indices before slicing
    if start_index != -1 and end_index != -1:
        return text[start_index + len(start_keyword):end_index]
    return "Keywords not found in the text."


def remove_unwanted_parts(text):
    """Strip the echoed chat-template prompt (everything from the 'system'
    marker through the 'assistant' marker) so only the answer remains."""
    start_keyword = "system"
    end_keyword = PROMPT_SUFFIX + "\nassistant"
    start_idx = text.find(start_keyword)
    end_idx = text.find(end_keyword)
    if start_idx != -1 and end_idx != -1:
        cleaned_text = text[:start_idx] + text[end_idx + len(end_keyword):]
        return cleaned_text.strip()
    return text


def generate_response(question):
    # Embed the question and retrieve the three closest documents
    query_embeddings = smodel.encode([f"{question}?"])
    results = collection.query(
        query_embeddings=query_embeddings.tolist(),
        n_results=3,  # how many results to return
    )
    context = clean_text_block(str(results))

    messages = [{
        "role": "user",
        "content": f"""After the colon is a set of text with information about dogs, then a question about the given text. Please answer the question based on the text, and do not talk about the documentation:
text - {context}
question - {question}
{PROMPT_SUFFIX}""",
    }]

    # The chat template already inserts the special tokens, so do not add
    # them a second time when tokenizing.
    input_text = tokenizer.apply_chat_template(messages, tokenize=False, add_generation_prompt=True)
    encoded_inputs = tokenizer(input_text, return_tensors="pt", add_special_tokens=False)
    inputs = encoded_inputs["input_ids"].to(device)
    attention_mask = encoded_inputs["attention_mask"].to(device)

    outputs = model.generate(
        inputs,
        attention_mask=attention_mask,
        max_new_tokens=150,
        temperature=0.4,
        top_p=0.6,
        do_sample=True,
    )
    output_text = tokenizer.decode(outputs[0], skip_special_tokens=True)
    return remove_unwanted_parts(output_text)


# Create the Gradio interface
iface = gr.Interface(
    fn=generate_response,
    inputs="text",
    outputs="text",
    title="Dog Breed Q&A",
    description=("Ask a question about your dog's breed! Over 70 breeds are covered; "
                 "the full list is in this Space's files under 'Dog_List'. All done on a CPU!"),
)

# Launch the interface
iface.launch()
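
# ---------------------------------------------------------------------------
# Building the "dogdb" collection: a minimal sketch, kept commented out so it
# does not run with the app. The app above assumes vectordb/ already holds a
# populated collection; if it ever needs rebuilding, something along these
# lines should work. The source file "dog_docs.txt" and its one-document-
# per-line format are hypothetical, not part of the original Space.
#
# import chromadb
# from sentence_transformers import SentenceTransformer
#
# smodel = SentenceTransformer("./embed")
# client = chromadb.PersistentClient(path="vectordb")
# collection = client.get_or_create_collection("dogdb")
#
# # One document per line; hypothetical file name
# with open("dog_docs.txt", encoding="utf-8") as f:
#     docs = [line.strip() for line in f if line.strip()]
#
# # Embed with the same model the app uses for queries, so the two
# # embedding spaces match
# collection.add(
#     documents=docs,
#     embeddings=smodel.encode(docs).tolist(),
#     ids=[f"doc-{i}" for i in range(len(docs))],
# )
# ---------------------------------------------------------------------------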