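"""Streamlit document-query app: extracts text from uploaded PDF/DOCX files,
indexes it in a Chroma vector store, and answers questions with a
Hugging Face LLM endpoint via LangChain.

Run locally with (assuming this file is saved as app.py):
    streamlit run app.py
"""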
import io
import os
import re
from typing import List

import docx2txt
import streamlit as st
from pdfminer.high_level import extract_text
from sentence_transformers import SentenceTransformer
from sklearn.metrics.pairwise import cosine_similarity

from langchain.chains import LLMChain
from langchain.prompts import PromptTemplate
from langchain.text_splitter import RecursiveCharacterTextSplitter
from langchain_community.embeddings import SentenceTransformerEmbeddings
from langchain_community.llms import HuggingFaceEndpoint
from langchain_community.vectorstores import Chroma
# Hugging Face Inference API token, read from the environment.
HUGGINGFACEHUB_API_TOKEN = os.getenv("HUGGINGFACEHUB_API_TOKEN")

# browser.gatherUsageStats = false is a Streamlit option; set it in
# .streamlit/config.toml rather than in code.
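# A minimal sketch of supplying the token before launching (placeholder value):
#   export HUGGINGFACEHUB_API_TOKEN=hf_your_token_here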

def extract_text_from_pdf(pdf_content):
    """Extract raw text from PDF bytes using pdfminer."""
    return extract_text(io.BytesIO(pdf_content))


def extract_text_from_doc(doc_content):
    """Extract raw text from DOCX bytes using docx2txt."""
    return docx2txt.process(io.BytesIO(doc_content))

def preprocess_text(text):
    """Normalize extracted text: flatten newlines, strip non-ASCII,
    lowercase, drop punctuation, and collapse whitespace."""
    text = text.replace('\n', ' ').replace('\r', ' ')
    text = re.sub(r'[^\x00-\x7F]+', ' ', text)  # drop non-ASCII characters
    text = text.lower()
    text = re.sub(r'[^\w\s]', '', text)         # drop punctuation
    text = re.sub(r'\s+', ' ', text).strip()    # collapse repeated whitespace
    return text
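
# Illustrative example of the normalization above:
#   preprocess_text("Hello,\nWorld!")  ->  "hello world"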

def process_files(file_contents: List[bytes]):
    """Extract and normalize text from each uploaded file, concatenated."""
    all_text = ""
    for file_content in file_contents:
        # Sniff the file type from its magic bytes: PDFs start with %PDF.
        if file_content.startswith(b'%PDF'):
            extracted_text = extract_text_from_pdf(file_content)
        else:
            extracted_text = extract_text_from_doc(file_content)
        preprocessed_text = preprocess_text(extracted_text)
        all_text += preprocessed_text + " "
    return all_text
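
# Assumption inherited from the uploader: anything that does not start with
# the %PDF magic bytes is treated as a DOCX; other formats (e.g. plain .txt)
# would make docx2txt fail.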

def compute_cosine_similarity_scores(query, retrieved_docs):
    """Score each retrieved chunk against the query with cosine similarity."""
    model = SentenceTransformer("sentence-transformers/all-mpnet-base-v2")
    query_embedding = model.encode([query])          # shape (1, dim)
    doc_embeddings = model.encode(retrieved_docs)    # shape (n_docs, dim)
    # cosine_similarity normalizes internally, unlike a raw dot product.
    cosine_scores = cosine_similarity(query_embedding, doc_embeddings)[0]
    readable_scores = [{"doc": doc, "score": float(score)}
                       for doc, score in zip(retrieved_docs, cosine_scores)]
    return readable_scores
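
# Illustrative result shape (scores are made-up values):
#   [{"doc": "first retrieved chunk ...", "score": 0.71}, ...]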

def answer_query_with_similarity(query, file_contents):
    try:
        all_text = process_files(file_contents)

        # Chunk the combined text and index it in a cosine-space Chroma store.
        embeddings = SentenceTransformerEmbeddings(model_name="sentence-transformers/all-mpnet-base-v2")
        text_splitter = RecursiveCharacterTextSplitter(chunk_size=1000, chunk_overlap=200)
        texts = text_splitter.split_text(all_text)
        vector_store = Chroma.from_texts(texts, embeddings,
                                         collection_metadata={"hnsw:space": "cosine"},
                                         persist_directory="stores/insurance_cosine")
        print("Vector DB successfully created!")

        # Retrieve the chunks most similar to the query.
        docs = vector_store.similarity_search(query)
        print(f"\n\nDocuments retrieved: {len(docs)}")
        if not docs:
            print("No documents match the query.")
            return None

        docs_content = [doc.page_content for doc in docs]
        for i, content in enumerate(docs_content, start=1):
            print(f"\nDocument {i}: {content}...")
        cosine_similarity_scores = compute_cosine_similarity_scores(query, docs_content)
        for score in cosine_similarity_scores:
            print(f"\nDocument score: {score['score']}")

        all_docs_content = " ".join(docs_content)
        template = """
        ### [INST] Instruction: You are an AI assistant named Goose. Your purpose is to provide accurate, relevant, and helpful information to users in a friendly, warm, and supportive manner, similar to ChatGPT. When responding to queries, please keep the following guidelines in mind:
        Retrieve relevant information from your knowledge base to formulate accurate and informative responses.
        Always maintain a positive, friendly, and encouraging tone in your interactions with users.
        Write crisp, clear answers; do not include unnecessary content.
        Answer only the question asked; do not hallucinate or print any preamble.
        After providing the answer, always ask in a new paragraph whether any further help is needed.
        Bullet-point formatting is our top preference.
        Remember, your goal is to be a reliable, friendly, and supportive AI assistant that provides accurate information while creating a positive user experience, just like ChatGPT. Adapt your communication style to best suit each user's needs and preferences.
        ### Docs: {docs}
        ### Question: {question}
        """
        # Keep {docs} and {question} as template variables so the chain can
        # fill them in; pre-formatting the template would strip both slots.
        prompt = PromptTemplate.from_template(template)
        repo_id = "meta-llama/Meta-Llama-3-8B-Instruct"
        llm = HuggingFaceEndpoint(repo_id=repo_id,
                                  temperature=0.1,
                                  huggingfacehub_api_token=HUGGINGFACEHUB_API_TOKEN,
                                  top_p=0.15,
                                  max_new_tokens=256,
                                  repetition_penalty=1.1)
        llm_chain = LLMChain(prompt=prompt, llm=llm)
        answer = llm_chain.run(docs=all_docs_content, question=query)
        cleaned_answer = answer.split("Answer:")[-1].strip()
        print(f"\n\nAnswer: {cleaned_answer}")
        return cleaned_answer
    except Exception as e:
        print("An error occurred while getting the answer:", str(e))
        return None
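
# Design note: this sketch rebuilds the Chroma index on every query. A
# longer-lived deployment would typically index documents once at upload
# time and reuse the persisted store for each subsequent query.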

def main():
    st.title("Document Query App")
    uploaded_files = st.file_uploader("Upload files", accept_multiple_files=True)
    # file_uploader returns an empty list until files are uploaded.
    file_contents = [file.read() for file in uploaded_files]
    query = st.text_input("Enter your query:")
    if st.button("Get Answer"):
        if file_contents and query:
            response = answer_query_with_similarity(query, file_contents)
            if response:
                st.write("Answer:", response)
            else:
                st.write("No answer found.")
        else:
            st.write("Please provide files and a query.")


if __name__ == "__main__":
    main()