import io
import os
import re
from typing import List

import docx2txt
import streamlit as st
from langchain.chains import LLMChain
from langchain.prompts import PromptTemplate
from langchain.text_splitter import RecursiveCharacterTextSplitter
from langchain_community.embeddings import SentenceTransformerEmbeddings
from langchain_community.llms import HuggingFaceEndpoint
from langchain_community.vectorstores import Chroma
from pdfminer.high_level import extract_text
from sentence_transformers import SentenceTransformer
from sklearn.metrics.pairwise import cosine_similarity

# Hugging Face API token, read from the environment.
HUGGINGFACEHUB_API_TOKEN = os.getenv("HUGGINGFACEHUB_API_TOKEN")
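
# Optional guard (not in the original code): fail fast when the token is missing,
# rather than surfacing an authentication error later inside the endpoint call.
if not HUGGINGFACEHUB_API_TOKEN:
    print("Warning: HUGGINGFACEHUB_API_TOKEN is not set; LLM calls will fail.")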

def extract_text_from_pdf(pdf_content):
    """Extract raw text from PDF bytes using pdfminer."""
    return extract_text(io.BytesIO(pdf_content))


def extract_text_from_doc(doc_content):
    """Extract raw text from .docx bytes using docx2txt."""
    return docx2txt.process(io.BytesIO(doc_content))

def preprocess_text(text):
    """Normalize extracted text: flatten line breaks, drop non-ASCII and punctuation, lowercase."""
    text = text.replace('\n', ' ').replace('\r', ' ')
    text = re.sub(r'[^\x00-\x7F]+', ' ', text)
    text = text.lower()
    text = re.sub(r'[^\w\s]', '', text)
    text = re.sub(r'\s+', ' ', text).strip()
    return text
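
# Illustrative behavior of preprocess_text (the output is what the regexes above produce):
#   preprocess_text("Héllo,\nWorld!")  ->  "h llo world"
# The non-ASCII character is replaced by a space, punctuation is dropped,
# and runs of whitespace collapse to single spaces.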

def process_files(file_contents: List[bytes]):
    """Concatenate the preprocessed text of every uploaded file into one string."""
    all_text = ""
    for file_content in file_contents:
        # PDF files start with the magic bytes '%PDF'; anything else is treated as .docx.
        if file_content.startswith(b'%PDF'):
            extracted_text = extract_text_from_pdf(file_content)
        else:
            extracted_text = extract_text_from_doc(file_content)
        preprocessed_text = preprocess_text(extracted_text)
        all_text += preprocessed_text + " "
    return all_text
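
# Illustrative standalone usage outside Streamlit (the file path is hypothetical):
#   with open("policy.pdf", "rb") as f:
#       combined_text = process_files([f.read()])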

def compute_cosine_similarity_scores(query, retrieved_docs):
    """Score each retrieved chunk against the query with cosine similarity."""
    model = SentenceTransformer("sentence-transformers/all-mpnet-base-v2")
    # Encode to numpy arrays so sklearn's cosine_similarity can consume them directly;
    # a raw dot product over unnormalized embeddings would not be a cosine score.
    query_embedding = model.encode([query])
    doc_embeddings = model.encode(retrieved_docs)
    cosine_scores = cosine_similarity(query_embedding, doc_embeddings).flatten()
    readable_scores = [{"doc": doc, "score": float(score)} for doc, score in zip(retrieved_docs, cosine_scores)]
    return readable_scores
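
# Illustrative output shape (the scores shown are made up):
#   compute_cosine_similarity_scores("what is covered?", ["chunk one", "chunk two"])
#   -> [{"doc": "chunk one", "score": 0.41}, {"doc": "chunk two", "score": 0.18}]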

def answer_query_with_similarity(query, file_contents):
    try:
        all_text = process_files(file_contents)
        embeddings = SentenceTransformerEmbeddings(model_name="sentence-transformers/all-mpnet-base-v2")
        text_splitter = RecursiveCharacterTextSplitter(chunk_size=1000, chunk_overlap=200)
        texts = text_splitter.split_text(all_text)

        # Build (and persist) the Chroma vector store, then query it for the most similar chunks.
        vector_store = Chroma.from_texts(
            texts,
            embeddings,
            collection_metadata={"hnsw:space": "cosine"},
            persist_directory="stores/insurance_cosine",
        )
        print("Vector DB Successfully Created!")

        docs = vector_store.similarity_search(query)
        print(f"\n\nDocuments retrieved: {len(docs)}")
        if not docs:
            print("No documents match the query.")
            return None

        docs_content = [doc.page_content for doc in docs]
        for i, content in enumerate(docs_content, start=1):
            print(f"\nDocument {i}: {content[:200]}...")

        cosine_similarity_scores = compute_cosine_similarity_scores(query, docs_content)
        for score in cosine_similarity_scores:
            print(f"\nDocument Score: {score['score']}")

        all_docs_content = " ".join(docs_content)
template = """
### [INST] Instruction:You are an AI assistant named HelpfulRAG. Your purpose is to provide accurate, relevant, and helpful information to users in a friendly, warm, and supportive manner, similar to ChatGPT. When responding to queries, please keep the following guidelines in mind:
Retrieve relevant information from your knowledge base to formulate accurate and informative responses.
Always maintain a positive, friendly, and encouraging tone in your interactions with users.
Show empathy and understanding when addressing sensitive topics or user concerns.
Break down complex concepts into easily understandable explanations.
Offer step-by-step guidance or examples when appropriate to help users grasp the information better.
Encourage users to ask follow-up questions and provide additional context if needed for a more accurate response.
If you are uncertain about an answer or lack sufficient information, honestly communicate this to the user and suggest alternative resources or ways they can find the information they need.
Maintain a conversational and engaging style, using simple and clear language.
Personalize your responses when appropriate to create a more meaningful connection with the user.
Always prioritize the user's well-being, and avoid providing any information that could potentially cause harm or be misleading.
Remember, your goal is to be a reliable, friendly, and supportive AI assistant that provides accurate information while creating a positive user experience, just like ChatGPT. Adapt your communication style to best suit each user's needs and preferences.
### Docs : {docs}
### Question : {question}
"""

        # Keep {docs} and {question} as template variables so the chain fills them in at
        # run time; calling str.format() on the template here would leave the prompt with
        # no input variables and break on any literal braces in the documents.
        prompt = PromptTemplate.from_template(template)

        repo_id = "meta-llama/Meta-Llama-3-8B-Instruct"
        llm = HuggingFaceEndpoint(
            repo_id=repo_id,
            temperature=0.1,
            huggingfacehub_api_token=HUGGINGFACEHUB_API_TOKEN,
            top_p=0.15,
            max_new_tokens=512,
            repetition_penalty=1.1,
        )

        llm_chain = LLMChain(prompt=prompt, llm=llm)
        answer = llm_chain.run(docs=all_docs_content, question=query)
        cleaned_answer = answer.split("Answer:")[-1].strip()
        print(f"\n\nAnswer: {cleaned_answer}")
        return cleaned_answer
    except Exception as e:
        print("An error occurred while getting the answer: ", str(e))
        return None
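
# Note: Chroma.from_texts above rebuilds the "stores/insurance_cosine" collection on
# every call. An already-persisted store could instead be reopened with, e.g.:
#   Chroma(persist_directory="stores/insurance_cosine", embedding_function=embeddings)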

def main():
    st.title("Document Query App")
    # Restrict uploads to the two formats process_files can actually handle.
    uploaded_files = st.file_uploader("Upload files", type=["pdf", "docx"], accept_multiple_files=True)
    file_contents = [file.read() for file in uploaded_files] if uploaded_files else []
    query = st.text_input("Enter your query:")
    if st.button("Get Answer"):
        if file_contents and query:
            response = answer_query_with_similarity(query, file_contents)
            if response:
                st.write("Answer:", response)
            else:
                st.write("No answer found.")
        else:
            st.write("Please provide files and a query.")


if __name__ == "__main__":
    main()
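
# To run locally (assuming the dependencies imported above are installed):
#   export HUGGINGFACEHUB_API_TOKEN=<your token>
#   streamlit run app.py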