Upload app.py
Browse files
app.py
ADDED
@@ -0,0 +1,107 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
1 |
+
import streamlit as st
|
2 |
+
from transformers import pipeline
|
3 |
+
import fitz # PyMuPDF for PDF handling
|
4 |
+
|
5 |
+
# Function to extract text from PDF
def extract_text_from_pdf(uploaded_file):
    """Extract the plain text of every page of an uploaded PDF.

    Args:
        uploaded_file: A Streamlit ``UploadedFile`` (in-memory file-like
            object) holding the PDF bytes.

    Returns:
        str: The concatenated text of all pages. Empty string when the PDF
        has no extractable text (e.g. a scanned/image-only document).
    """
    # fitz.open(path) expects a filesystem path or bytes stream; a Streamlit
    # upload is an in-memory object, so pass its bytes via the ``stream``
    # keyword (the original ``fitz.open(uploaded_file)`` fails on uploads).
    doc = fitz.open(stream=uploaded_file.read(), filetype="pdf")
    try:
        return "".join(page.get_text() for page in doc)
    finally:
        # Release the document's native resources explicitly.
        doc.close()
|
12 |
+
|
13 |
+
# Function to chunk large text for context
def chunk_text(text, max_length=1000):
    """Split *text* into consecutive pieces of at most *max_length* characters.

    Args:
        text: The string to partition.
        max_length: Maximum characters per piece (default 1000).

    Returns:
        list[str]: The pieces in order; an empty input yields an empty list.
    """
    pieces = []
    start = 0
    while start < len(text):
        pieces.append(text[start:start + max_length])
        start += max_length
    return pieces
|
18 |
+
|
19 |
+
# Initialize Hugging Face model pipeline
@st.cache_resource
def load_model():
    """Build the text-generation pipeline once per session.

    Uses the publicly available GPT-2 model; ``st.cache_resource`` keeps a
    single pipeline instance alive across Streamlit reruns.
    """
    generator = pipeline("text-generation", model="gpt2")
    return generator
|
24 |
+
|
25 |
+
# Module-level pipeline shared by get_response(); cached via st.cache_resource.
model_pipeline = load_model()
|
26 |
+
|
27 |
+
# Generate a response
def get_response(prompt, context):
    """Answer *prompt* grounded in *context* using the GPT-2 pipeline.

    Args:
        prompt: The user's question.
        context: Document text (one chunk) to ground the answer in.

    Returns:
        str: Only the newly generated text (the echoed prompt is stripped).
    """
    combined_prompt = f"{context}\n\nQuestion: {prompt}"
    # max_new_tokens bounds the *generated* length only; the original
    # max_length=200 counted the prompt tokens too, so a ~1000-char chunk
    # left no room to generate (or errored). truncation=True keeps long
    # prompts inside GPT-2's context window, and return_full_text=False
    # stops the pipeline from echoing the whole prompt back to the user.
    response = model_pipeline(
        combined_prompt,
        max_new_tokens=200,
        num_return_sequences=1,
        truncation=True,
        return_full_text=False,
    )
    return response[0]["generated_text"]
|
32 |
+
|
33 |
+
# Streamlit App UI
st.title("Chat with PDF!!!")

# Detailed description and instructions.
# NOTE: the original placed a slider here whose value was never read — a dead
# widget that suggested interactivity it did not have — so it was removed.
st.subheader("How to Use This Application:")

st.markdown("""
### Welcome to the 'Chat with PDF' Application!

**Description**:
This web application allows you to interact with the contents of a PDF document by uploading a file and asking questions about it. The application processes the uploaded PDF, extracts the text, and uses a powerful Large Language Model (LLM) to respond to your questions in real time.

**Model Used**:
The application leverages the **GPT-2 Model**, a publicly available language model that can understand the text from the PDF and provide answers. GPT-2 works well with both short and long texts, making it ideal for this use case.

**How It Works**:
1. **Upload a PDF File**:
   Use the file uploader to select and upload the PDF file you wish to analyze. The file should be in `.pdf` format.

2. **Text Extraction**:
   The application extracts the text from the uploaded PDF using the `PyMuPDF` library (imported as `fitz`). This library enables the reading and extraction of text from each page in the PDF.

3. **Text Chunking**:
   The extracted text may be very large, so it is divided into smaller chunks to facilitate better processing. By default, each chunk contains up to 1000 characters. These chunks serve as context for answering questions.

4. **Ask Questions**:
   After the text is processed and chunked, you can ask questions related to the content of the PDF. Simply type your question in the text area provided on the app.

5. **Model Response**:
   When you ask a question, the app sends the prompt (your question) along with the relevant chunk of text to the Hugging Face model. The model then generates a response based on the content it was provided. The response is displayed in the app.

6. **Receive Insights**:
   The answers are tailored to the content of the PDF, providing detailed, context-specific insights to help you better understand the document.

**Features**:
- Upload any PDF document for analysis.
- Ask natural language questions based on the document's content.
- Get accurate and context-aware responses generated by a state-of-the-art LLM.
- Split large documents into manageable chunks for optimal performance.

**Why Use This App?**
- If you're reading a long PDF and need quick answers, this tool can assist by summarizing sections of the document or directly answering your specific questions.
- Useful for academic research, legal documents, technical papers, or any lengthy PDF content that needs to be understood quickly.

**Try it now** and start chatting with your PDF to gain insights faster and more efficiently!
""")

uploaded_file = st.file_uploader("Choose a PDF file", type=["pdf"])

if uploaded_file is not None:
    # Extract text from the uploaded PDF
    pdf_text = extract_text_from_pdf(uploaded_file)

    # Chunk the extracted text
    text_chunks = chunk_text(pdf_text)

    if not text_chunks:
        # Image-only or empty PDFs produce no text; the original code crashed
        # here with an IndexError on text_chunks[0].
        st.warning("No extractable text was found in this PDF.")
    else:
        # Display the first chunk as a summary
        st.subheader("PDF Content Summary:")
        st.write(text_chunks[0])

        # Input for user prompt
        prompt = st.text_area(label="Ask a question based on the PDF content")
        button = st.button("Submit")

        if button:
            if prompt:
                # Send the chunk sharing the most words with the question
                # (the original always sent chunk 0 regardless of the
                # question, despite its comment claiming relevance).
                question_words = set(prompt.lower().split())
                chunk_to_send = max(
                    text_chunks,
                    key=lambda c: len(question_words & set(c.lower().split())),
                )
                response = get_response(prompt, chunk_to_send)
                st.markdown(f"**Response:**\n\n{response}")
|