File size: 5,610 Bytes
409f81b
eda6735
4f4ccbd
eda6735
409f81b
eda6735
 
409f81b
eda6735
 
 
 
ccfc904
eda6735
 
 
 
2c02a9e
d382509
 
 
 
 
eda6735
ccfc904
d382509
 
 
 
2c02a9e
eda6735
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
2c02a9e
f7133fb
 
 
 
 
 
 
 
4f4ccbd
 
 
 
 
 
 
 
 
 
 
eda6735
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
import os
import io
import fitz  # PyMuPDF
import PyPDF2
from docx import Document
from dotenv import load_dotenv
import streamlit as st
from sentence_transformers import SentenceTransformer
from langchain.prompts import PromptTemplate
from langchain.chains.question_answering import load_qa_chain
from langchain.text_splitter import RecursiveCharacterTextSplitter
from langchain_community.vectorstores.faiss import FAISS
from langchain_community.embeddings import HuggingFaceEmbeddings
from langchain_community.llms import HuggingFaceEndpoint

# Load environment variables from .env file
load_dotenv()

# Initialize the embedding model
# NOTE(review): nothing in the visible part of this script reads
# `embedding_model`; retrieval below goes through HuggingFaceEmbeddings.
# Confirm it is still needed before paying for the model load.
embedding_model = SentenceTransformer('all-MiniLM-L6-v2')

# Initialize the HuggingFace LLM
# The access token is handed to the endpoint through its dedicated
# `huggingfacehub_api_token` field. `model_kwargs` is reserved for model
# call parameters, so a token placed there is not used for authentication.
# NOTE(review): "gpt-3.5-turbo" is an OpenAI model name -- confirm that this
# endpoint URL points at a model actually served by the Hugging Face
# Inference API.
llm = HuggingFaceEndpoint(
    endpoint_url="https://api-inference.huggingface.co/models/gpt-3.5-turbo",
    huggingfacehub_api_token=os.getenv('HUGGINGFACEHUB_API_TOKEN'),
)

# Initialize the HuggingFace embeddings
embedding = HuggingFaceEmbeddings()

# Streamlit setup
st.set_page_config(layout="centered")
st.markdown("<h1 style='font-size:24px;'>PDF and DOCX ChatBot</h1>", unsafe_allow_html=True)

# Retrieve API key from environment variable
# NOTE(review): the visible script never reads `google_api_key` after this
# gate (the LLM and embeddings are Hugging Face based) -- confirm the check
# is still wanted.
google_api_key = os.getenv("GOOGLE_API_KEY")

# Check if the API key is available
if google_api_key is None:
    # Name the variable exactly as it is looked up above; environment
    # variable names are case-sensitive on POSIX systems.
    st.warning("API key not found. Please set the GOOGLE_API_KEY environment variable.")
    st.stop()

# File Upload
uploaded_file = st.file_uploader("Upload your PDF or DOCX file", type=["pdf", "docx"])

# Prompt text for the question-answering chain. It carries two placeholders,
# {context} and {question}, each appearing exactly once: the instructions and
# the optional analysis suggestions come first, and the single
# Context / Question / Answer section closes the prompt so the model's
# completion starts right after "Answer:".
prompt_template = """
Answer the question as detailed as possible from the provided context,
make sure to provide all the details, if the answer is not in
provided context just say, "answer is not available in the context",
don't provide the wrong answer

--------------------------------------------------
Prompt Suggestions:
1. Summarize the primary theme of the context.
2. Elaborate on the crucial concepts highlighted in the context.
3. Pinpoint any supporting details or examples pertinent to the question.
4. Examine any recurring themes or patterns relevant to the question within the context.
5. Contrast differing viewpoints or elements mentioned in the context.
6. Explore the potential implications or outcomes of the information provided.
7. Assess the trustworthiness and validity of the information given.
8. Propose recommendations or advice based on the presented information.
9. Forecast likely future events or results stemming from the context.
10. Expand on the context or background information pertinent to the question.
11. Define any specialized terms or technical language used within the context.
12. Analyze any visual representations like charts or graphs in the context.
13. Highlight any restrictions or important considerations when responding to the question.
14. Examine any presuppositions or biases evident within the context.
15. Present alternate interpretations or viewpoints regarding the information provided.
16. Reflect on any moral or ethical issues raised by the context.
17. Investigate any cause-and-effect relationships identified in the context.
18. Uncover any questions or areas requiring further exploration.
19. Resolve any vague or conflicting information in the context.
20. Cite case studies or examples that demonstrate the concepts discussed in the context.
--------------------------------------------------
Context:
{context}

Question:
{question}

Answer:
"""

def extract_text_from_docx(docx_path):
    """Return the paragraph text of a DOCX document, one paragraph per line.

    Args:
        docx_path: Whatever ``Document`` accepts as its source; the caller in
            this script passes an in-memory ``io.BytesIO`` buffer.

    Returns:
        The text of all paragraphs joined with newlines, or ``""`` if the
        document could not be read (the error is printed, not raised).
    """
    try:
        document = Document(docx_path)
        paragraph_texts = [paragraph.text for paragraph in document.paragraphs]
        return "\n".join(paragraph_texts)
    except Exception as e:
        # Best-effort extraction: report the problem and fall back to no text.
        print(f"Error extracting text from DOCX: {e}")
        return ""

def extract_text_from_pdf(pdf_path):
    """Return the concatenated text of every page of a PDF.

    Args:
        pdf_path: Filesystem path (``str`` or ``os.PathLike``) of the PDF, or
            the document content itself as ``bytes``/``bytearray`` or a
            binary file-like object exposing ``read()``.

    Returns:
        The page texts joined in page order with no separator. If opening or
        parsing fails, the error is printed and whatever text was collected
        before the failure is returned (``""`` if none).
    """
    page_texts = []
    try:
        if isinstance(pdf_path, (str, os.PathLike)):
            pdf_document = fitz.open(pdf_path)
        else:
            # In-memory input: PyMuPDF needs the raw bytes plus an explicit
            # file type because there is no file name to infer it from.
            pdf_bytes = pdf_path.read() if hasattr(pdf_path, "read") else pdf_path
            pdf_document = fitz.open(stream=pdf_bytes, filetype="pdf")
        # The context manager closes the document even if a page fails.
        with pdf_document:
            # Collected page by page (not in one expression) so that text
            # gathered before a mid-document failure is still returned.
            for page in pdf_document:
                page_texts.append(page.get_text())
    except Exception as e:
        print(f"Error extracting text from PDF: {e}")
    return "".join(page_texts)

# Main flow: once a file is uploaded, extract its text, index it for
# similarity search, and answer the user's question from the retrieved chunks.
if uploaded_file is not None:
    st.text("File Uploaded Successfully!")

    context = ""

    # Compare the extension case-insensitively so a name such as "REPORT.PDF"
    # is parsed instead of silently leaving the context empty.
    file_name = uploaded_file.name.lower()

    # Process the uploaded file
    if file_name.endswith('.pdf'):
        pdf_data = uploaded_file.read()
        pdf_reader = PyPDF2.PdfReader(io.BytesIO(pdf_data))
        pdf_pages = pdf_reader.pages
        # A page without a text layer may yield nothing; substitute "" so the
        # join only ever sees strings.
        context = "\n\n".join((page.extract_text() or "") for page in pdf_pages)
    elif file_name.endswith('.docx'):
        docx_data = uploaded_file.read()
        context = extract_text_from_docx(io.BytesIO(docx_data))

    # With no text (e.g. a scanned PDF or a failed DOCX parse) there is
    # nothing to index; end this run with a message instead of attempting to
    # build a vector index from an empty list of chunks.
    if not context.strip():
        st.warning("No text could be extracted from the uploaded file.")
        st.stop()

    text_splitter = RecursiveCharacterTextSplitter(chunk_size=10000, chunk_overlap=200)
    texts = text_splitter.split_text(context)
    # Reuse the module-level `embedding` object rather than constructing a
    # second HuggingFaceEmbeddings instance on every script run.
    vector_index = FAISS.from_texts(texts, embedding).as_retriever()

    user_question = st.text_input("Ask Anything from the Document:", "")

    if st.button("Get Answer"):
        if user_question:
            with st.spinner("Processing..."):
                # Retrieve the chunks most similar to the question and pass
                # them, with the question, through the "stuff" QA chain.
                docs = vector_index.get_relevant_documents(user_question)
                prompt = PromptTemplate(template=prompt_template, input_variables=['context', 'question'])
                chain = load_qa_chain(llm, chain_type="stuff", prompt=prompt)
                response = chain({"input_documents": docs, "question": user_question}, return_only_outputs=True)
                st.subheader("Answer:")
                st.write(response['output_text'])
        else:
            st.warning("Please enter a question.")