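"""Streamlit document-query app: extracts text from uploaded PDF/DOCX files,
indexes it in a Chroma vector store, and answers questions with a
Hugging Face LLM endpoint via LangChain.

Run locally with (assuming this file is saved as app.py):
    streamlit run app.py
"""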
import io
import os
import re
from typing import List

import docx2txt
import streamlit as st
from pdfminer.high_level import extract_text
from sentence_transformers import SentenceTransformer
from sklearn.metrics.pairwise import cosine_similarity

from langchain.chains import LLMChain
from langchain.prompts import PromptTemplate
from langchain.text_splitter import RecursiveCharacterTextSplitter
from langchain_community.embeddings import SentenceTransformerEmbeddings
from langchain_community.llms import HuggingFaceEndpoint
from langchain_community.vectorstores import Chroma
# Hugging Face Inference API token, read from the environment.
HUGGINGFACEHUB_API_TOKEN = os.getenv("HUGGINGFACEHUB_API_TOKEN")

# browser.gatherUsageStats = false is a Streamlit option; set it in
# .streamlit/config.toml rather than in code.
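# A minimal sketch of supplying the token before launching (placeholder value):
#   export HUGGINGFACEHUB_API_TOKEN=hf_your_token_here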

def extract_text_from_pdf(pdf_content):
    """Extract raw text from PDF bytes using pdfminer."""
    return extract_text(io.BytesIO(pdf_content))


def extract_text_from_doc(doc_content):
    """Extract raw text from DOCX bytes using docx2txt."""
    return docx2txt.process(io.BytesIO(doc_content))

def preprocess_text(text):
    """Normalize extracted text: flatten newlines, strip non-ASCII,
    lowercase, drop punctuation, and collapse whitespace."""
    text = text.replace('\n', ' ').replace('\r', ' ')
    text = re.sub(r'[^\x00-\x7F]+', ' ', text)  # drop non-ASCII characters
    text = text.lower()
    text = re.sub(r'[^\w\s]', '', text)         # drop punctuation
    text = re.sub(r'\s+', ' ', text).strip()    # collapse repeated whitespace
    return text
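
# Illustrative example of the normalization above:
#   preprocess_text("Hello,\nWorld!")  ->  "hello world"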

def process_files(file_contents: List[bytes]):
    """Extract and normalize text from each uploaded file, concatenated."""
    all_text = ""
    for file_content in file_contents:
        # Sniff the file type from its magic bytes: PDFs start with %PDF.
        if file_content.startswith(b'%PDF'):
            extracted_text = extract_text_from_pdf(file_content)
        else:
            extracted_text = extract_text_from_doc(file_content)
        preprocessed_text = preprocess_text(extracted_text)
        all_text += preprocessed_text + " "
    return all_text
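
# Assumption inherited from the uploader: anything that does not start with
# the %PDF magic bytes is treated as a DOCX; other formats (e.g. plain .txt)
# would make docx2txt fail.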

def compute_cosine_similarity_scores(query, retrieved_docs):
    """Score each retrieved chunk against the query with cosine similarity."""
    model = SentenceTransformer("sentence-transformers/all-mpnet-base-v2")
    query_embedding = model.encode([query])          # shape (1, dim)
    doc_embeddings = model.encode(retrieved_docs)    # shape (n_docs, dim)
    # cosine_similarity normalizes internally, unlike a raw dot product.
    cosine_scores = cosine_similarity(query_embedding, doc_embeddings)[0]
    readable_scores = [{"doc": doc, "score": float(score)}
                       for doc, score in zip(retrieved_docs, cosine_scores)]
    return readable_scores
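
# Illustrative result shape (scores are made-up values):
#   [{"doc": "first retrieved chunk ...", "score": 0.71}, ...]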

def answer_query_with_similarity(query, file_contents):
    try:
        all_text = process_files(file_contents)

        # Chunk the combined text and index it in a cosine-space Chroma store.
        embeddings = SentenceTransformerEmbeddings(model_name="sentence-transformers/all-mpnet-base-v2")
        text_splitter = RecursiveCharacterTextSplitter(chunk_size=1000, chunk_overlap=200)
        texts = text_splitter.split_text(all_text)
        vector_store = Chroma.from_texts(texts, embeddings,
                                         collection_metadata={"hnsw:space": "cosine"},
                                         persist_directory="stores/insurance_cosine")
        print("Vector DB successfully created!")

        # Retrieve the chunks most similar to the query.
        docs = vector_store.similarity_search(query)
        print(f"\n\nDocuments retrieved: {len(docs)}")
        if not docs:
            print("No documents match the query.")
            return None

        docs_content = [doc.page_content for doc in docs]
        for i, content in enumerate(docs_content, start=1):
            print(f"\nDocument {i}: {content}...")
        cosine_similarity_scores = compute_cosine_similarity_scores(query, docs_content)
        for score in cosine_similarity_scores:
            print(f"\nDocument score: {score['score']}")

        all_docs_content = " ".join(docs_content)
        template = """
        ### [INST] Instruction: You are an AI assistant named Goose. Your purpose is to provide accurate, relevant, and helpful information to users in a friendly, warm, and supportive manner, similar to ChatGPT. When responding to queries, please keep the following guidelines in mind:
        Retrieve relevant information from your knowledge base to formulate accurate and informative responses.
        Always maintain a positive, friendly, and encouraging tone in your interactions with users.
        Write crisp, clear answers; do not include unnecessary content.
        Answer only the question asked; do not hallucinate or print any preamble.
        After providing the answer, always ask in a new paragraph whether any further help is needed.
        Bullet-point formatting is our top preference.
        Remember, your goal is to be a reliable, friendly, and supportive AI assistant that provides accurate information while creating a positive user experience, just like ChatGPT. Adapt your communication style to best suit each user's needs and preferences.
        ### Docs: {docs}
        ### Question: {question}
        """
        # Keep {docs} and {question} as template variables so the chain can
        # fill them in; pre-formatting the template would strip both slots.
        prompt = PromptTemplate.from_template(template)
        repo_id = "meta-llama/Meta-Llama-3-8B-Instruct"
        llm = HuggingFaceEndpoint(repo_id=repo_id,
                                  temperature=0.1,
                                  huggingfacehub_api_token=HUGGINGFACEHUB_API_TOKEN,
                                  top_p=0.15,
                                  max_new_tokens=256,
                                  repetition_penalty=1.1)
        llm_chain = LLMChain(prompt=prompt, llm=llm)
        answer = llm_chain.run(docs=all_docs_content, question=query)
        cleaned_answer = answer.split("Answer:")[-1].strip()
        print(f"\n\nAnswer: {cleaned_answer}")
        return cleaned_answer
    except Exception as e:
        print("An error occurred while getting the answer:", str(e))
        return None
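
# Design note: this sketch rebuilds the Chroma index on every query. A
# longer-lived deployment would typically index documents once at upload
# time and reuse the persisted store for each subsequent query.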

def main():
    st.title("Document Query App")
    uploaded_files = st.file_uploader("Upload files", accept_multiple_files=True)
    # file_uploader returns an empty list until files are uploaded.
    file_contents = [file.read() for file in uploaded_files]
    query = st.text_input("Enter your query:")
    if st.button("Get Answer"):
        if file_contents and query:
            response = answer_query_with_similarity(query, file_contents)
            if response:
                st.write("Answer:", response)
            else:
                st.write("No answer found.")
        else:
            st.write("Please provide files and a query.")


if __name__ == "__main__":
    main()