# Streamlit app: chat with uploaded PDF documents using Groq-hosted LLMs,
# HuggingFace embeddings, and a FAISS vector store (with OCR fallback for
# image-only PDF pages).
import os
import pytesseract
import streamlit as st
import time
from dotenv import load_dotenv
from langchain.chains import create_retrieval_chain
from langchain.chains.combine_documents import create_stuff_documents_chain
from langchain.embeddings.huggingface import HuggingFaceEmbeddings
from langchain.text_splitter import RecursiveCharacterTextSplitter
from langchain.schema import Document
from langchain_community.vectorstores import FAISS
from langchain_core.prompts import ChatPromptTemplate
from langchain_groq import ChatGroq
from pdf2image import convert_from_path
from PyPDF2 import PdfReader
from io import BytesIO
# Load API keys and tracing settings from the local .env file.
load_dotenv()
# Groq API key; raises KeyError at startup if the "groq" variable is absent.
groq_api_key = os.environ["groq"]
# Enable LangSmith tracing for every chain invocation below.
os.environ["LANGCHAIN_TRACING_V2"] = "true"
# NOTE(review): read but never used in this file — presumably picked up by
# LangChain's tracing machinery via the environment; confirm before removing.
LANGCHAIN_API_KEY = os.environ["LANGCHAIN_API_KEY"]
st.title("Talk to your Documents!!!")
st.sidebar.title("Model fine-tuning")
# Sidebar controls: model choice plus sampling parameters for the Groq LLM.
model = st.sidebar.selectbox("Select the model that you want to use",
('llama-3.1-8b-instant', 'llama-3.1-70b-versatile', 'llama3-70b-8192',
'llama3-8b-8192', 'mixtral-8x7b-32768', 'gemma2-9b-it', 'gemma-7b-it'))
temprature = st.sidebar.slider("Temperature", min_value=0., max_value=1., value=0.7)
tokens = st.sidebar.slider("Tokens", min_value=256, max_value=4096, value=1024)
# The chat model is rebuilt on every Streamlit rerun with the current sidebar values.
llm = ChatGroq(groq_api_key=groq_api_key, model_name=model, temperature=temprature, max_tokens=tokens)
# Retrieval prompt: the model must answer strictly from the retrieved context.
# BUG FIX: the context block used to be "closed" with a second opening tag
# ("<context>"); it now uses the intended closing tag ("</context>").
prompt = ChatPromptTemplate.from_template(
"""
Answer the questions based on the provided context only.
Please provide the most accurate response based on the question.
Be respectful and friendly, and you can use emojis too.
You do not know anything out of context, and if the question
is out of context simply say that you do not know.
Do not provide output based on your general knowledge.
The response provided must be more than 256 tokens.
<context>
{context}
</context>
Questions:{input}
"""
)
def ocr_pdf_page(file_path, page_number):
    """Render one PDF page to an image and return its OCR'd text.

    `page_number` is 1-based; returns "" when the page cannot be rendered.
    """
    rendered = convert_from_path(file_path, first_page=page_number, last_page=page_number)
    if not rendered:
        return ""
    return pytesseract.image_to_string(rendered[0])
def process_uploaded_pdfs(uploaded_files):
    """Extract per-page text from uploaded PDFs, with OCR fallback.

    Returns a list of Document objects (one per page) whose metadata holds
    the source filename and 1-based page number.
    """
    import tempfile

    documents = []
    for uploaded_file in uploaded_files:
        data = uploaded_file.read()
        pdf_reader = PdfReader(BytesIO(data))
        ocr_path = None  # on-disk copy, created lazily only if OCR is needed
        try:
            for i, page in enumerate(pdf_reader.pages):
                text = page.extract_text() or ""
                if not text.strip():
                    # BUG FIX: pdf2image needs a real file path, but Streamlit
                    # uploads are in-memory only — `uploaded_file.name` is just
                    # the original filename, not a path on this machine. Spill
                    # the bytes to a temp file once per document and OCR that.
                    if ocr_path is None:
                        with tempfile.NamedTemporaryFile(suffix=".pdf", delete=False) as tmp:
                            tmp.write(data)
                            ocr_path = tmp.name
                    text = ocr_pdf_page(ocr_path, i + 1)
                documents.append(Document(page_content=text,
                                          metadata={"source": uploaded_file.name, "page": i + 1}))
        finally:
            if ocr_path is not None:
                os.remove(ocr_path)
    return documents
# (Removed commented-out duplicate of process_uploaded_pdfs that returned plain dicts.)
def vector_embedding_pdfs(uploaded_files):
    """Build the FAISS index over the uploaded PDFs, once per session.

    Stores the embedding model, the split documents, and the index in
    st.session_state, and persists the index to ./faissDsRagGroq.
    No-op if an index already exists in the session.
    """
    # BUG FIX: the guard used to test "vector" while the attribute actually
    # stored is "vectors", so the expensive embedding ran on every invocation.
    if "vectors" not in st.session_state:
        st.session_state.embeddings = HuggingFaceEmbeddings(
            model_name="sentence-transformers/multi-qa-MiniLM-L6-cos-v1")
        st.session_state.pdf_docs = process_uploaded_pdfs(uploaded_files)
        st.session_state.text_splitter = RecursiveCharacterTextSplitter(chunk_size=1000, chunk_overlap=200)
        st.session_state.final_documents = st.session_state.text_splitter.split_documents(st.session_state.pdf_docs)
        st.session_state.vectors = FAISS.from_documents(st.session_state.final_documents, st.session_state.embeddings)
        # Persist so a later run could reload the index instead of rebuilding.
        st.session_state.vectors.save_local("./faissDsRagGroq", "index_hf")
uploaded_files = st.file_uploader("Upload PDF documents", type="pdf", accept_multiple_files=True)
if uploaded_files and st.button("Documents Embedding"):
    # BUG FIX: perf_counter measures wall-clock time; process_time ignores the
    # I/O and model-loading waits that dominate this step, so the reported
    # duration was close to meaningless.
    start = time.perf_counter()
    vector_embedding_pdfs(uploaded_files)
    # The index built above is already in memory — reuse it instead of
    # re-instantiating the embedding model and reloading the index from disk.
    st.session_state.vector_db = st.session_state.vectors
    elapsed = time.perf_counter() - start
    st.write("Embeddings completed!!!")
    st.write(f"Time taken for generating embeddings: {elapsed:.2f} seconds...")
if "chat_history" not in st.session_state:
    st.session_state.chat_history = []
input_prompt = st.text_input("Enter Your Question From Documents")
if input_prompt:
    # BUG FIX: asking a question before embedding crashed with an
    # AttributeError on st.session_state.vector_db; fail gracefully instead.
    if "vector_db" not in st.session_state:
        st.error("Please upload documents and run 'Documents Embedding' first.")
        st.stop()
    document_chain = create_stuff_documents_chain(llm, prompt)
    retriever = st.session_state.vector_db.as_retriever()
    retrieval_chain = create_retrieval_chain(retriever, document_chain)
    # perf_counter, not process_time: the call below is network-bound (Groq API),
    # and process_time would not count the time spent waiting on it.
    start = time.perf_counter()
    response = retrieval_chain.invoke({"input": input_prompt})
    response_time = time.perf_counter() - start
    st.session_state.chat_history.append({"question": input_prompt, "response": response['answer']})
    st.write(f"Response time: {response_time:.2f} seconds")
    st.write(response['answer'])
    with st.expander("Document Similarity Search"):
        # Show the retrieved chunks that grounded the answer.
        for doc in response["context"]:
            st.write(doc.page_content)
            st.write("--------------------------------")
    with st.expander("Chat History"):
        for chat in st.session_state.chat_history:
            st.write(f"**Question:** {chat['question']}")
            st.write(f"**Response:** {chat['response']}")
            st.write("---")