Anuj02003 committed
Commit db806d0 · verified · 1 Parent(s): b61cf60

Upload app.py

Files changed (1)
  1. app.py +114 -0
app.py ADDED
@@ -0,0 +1,114 @@
+ import streamlit as st
+ from transformers import pipeline
+ import fitz  # PyMuPDF for PDF handling
+ import re
+ import tempfile
+
+ # Function to clean extracted text
+ def clean_text(text):
+     # Replace multiple spaces or newlines with a single space
+     text = re.sub(r'\s+', ' ', text)
+     return text.strip()
+
+ # Function to extract text from PDF and clean it
+ def extract_text_from_pdf(uploaded_file):
+     with tempfile.NamedTemporaryFile(delete=False) as tmp_file:
+         tmp_file.write(uploaded_file.read())
+         tmp_file_path = tmp_file.name
+
+     doc = fitz.open(tmp_file_path)
+     text = ""
+     for page in doc:
+         text += page.get_text()
+     doc.close()
+
+     return clean_text(text)
+
+ # Function to chunk large text for context
+ def chunk_text(text, max_length=1000):
+     # Split the text into chunks with a maximum character length
+     chunks = [text[i:i+max_length] for i in range(0, len(text), max_length)]
+     return chunks
+
+ # Initialize Hugging Face model pipeline
+ @st.cache_resource
+ def load_model():
+     return pipeline("text-generation", model="gpt2", clean_up_tokenization_spaces=True)  # Use GPT-2 for simplicity
+
+ model_pipeline = load_model()
+
+ # Generate a response
+ def get_response(prompt, context):
+     combined_prompt = f"Context: {context}\n\nQuestion: {prompt}\nAnswer:"
+     response = model_pipeline(
+         combined_prompt,
+         max_new_tokens=150,  # Ensure concise answers
+         num_return_sequences=1
+     )
+     # Extract only the answer part after the "Answer:" in the generated response
+     raw_response = response[0]["generated_text"]
+     answer_start = raw_response.find("Answer:") + len("Answer:")
+     answer = raw_response[answer_start:].strip()
+     return clean_text(answer)
+
+
+ # Streamlit App UI
+ st.title("Chat with PDF!!!")
+
+ # Sidebar for description
+ st.sidebar.title("Instructions")
+ st.sidebar.markdown("""
+ ### How to Use This Application:
+ 1. **Upload a PDF File**:
+    Use the file uploader to select and upload the PDF file you wish to analyze. The file should be in `.pdf` format.
+
+ 2. **Text Extraction**:
+    The application extracts the text from the uploaded PDF using the `PyMuPDF` library (imported as `fitz`).
+
+ 3. **Text Chunking**:
+    Large documents are divided into smaller chunks for better processing. Each chunk contains up to 1000 characters.
+
+ 4. **Ask Questions**:
+    After the text is processed, type your question about the document.
+
+ 5. **Model Response**:
+    The app sends the prompt and the relevant chunk to the model, which generates a response based on the content.
+
+ 6. **Receive Insights**:
+    Get detailed insights and answers related to the PDF content.
+ """)
+
+ uploaded_file = st.file_uploader("Choose a PDF file", type=["pdf"])
+
+ if uploaded_file is not None:
+     # Extract and clean text from the uploaded PDF
+     pdf_text = extract_text_from_pdf(uploaded_file)
+
+     # Chunk the extracted text
+     text_chunks = chunk_text(pdf_text)
+
+     # Display the first chunk as a summary
+     st.subheader("PDF Content Summary:")
+     st.write(text_chunks[0])  # Display first chunk
+
+     # Input for user prompt
+     prompt = st.text_area(label="Ask a question based on the PDF content")
+     button = st.button("Ok")
+
+     if button:
+         if prompt:
+             # Select relevant chunk based on the question
+             relevant_chunk = None
+             for chunk in text_chunks:
+                 if any(keyword.lower() in chunk.lower() for keyword in prompt.split()):
+                     relevant_chunk = chunk
+                     break
+
+             # If no relevant chunk was found, use the first chunk as a fallback
+             if not relevant_chunk:
+                 relevant_chunk = text_chunks[0]
+
+             # Get response from the model
+             response = get_response(prompt, relevant_chunk)
+             st.markdown(f"**Answer:** {response}")
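
For anyone who wants to try the retrieval logic without launching Streamlit, below is a minimal sketch (not part of the commit) that exercises the same chunking, keyword-based chunk selection, and gpt2 prompting as app.py. The sample document text and question are made-up placeholders, and the helper names are illustrative only.

```python
# Standalone sketch of the chunk-and-ask flow from app.py (placeholders, not committed code).
from transformers import pipeline
import re

def clean_text(text):
    # Collapse whitespace, mirroring clean_text() in app.py
    return re.sub(r'\s+', ' ', text).strip()

def chunk_text(text, max_length=1000):
    # Fixed-size character chunks, as in app.py
    return [text[i:i + max_length] for i in range(0, len(text), max_length)]

def pick_chunk(question, chunks):
    # Same heuristic as app.py: first chunk containing any word from the question,
    # falling back to the first chunk when nothing matches
    for chunk in chunks:
        if any(word.lower() in chunk.lower() for word in question.split()):
            return chunk
    return chunks[0]

if __name__ == "__main__":
    document = clean_text("...some extracted PDF text...")   # placeholder text
    question = "What is this document about?"                # placeholder question
    context = pick_chunk(question, chunk_text(document))

    generator = pipeline("text-generation", model="gpt2")
    prompt = f"Context: {context}\n\nQuestion: {question}\nAnswer:"
    output = generator(prompt, max_new_tokens=150, num_return_sequences=1)
    answer = output[0]["generated_text"].split("Answer:", 1)[-1].strip()
    print(clean_text(answer))
```

The app itself is typically launched with `streamlit run app.py` after installing streamlit, transformers (with a backend such as torch), and PyMuPDF.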