Spaces:

ivyblossom
/

question-answering

Running

App Files Files Community

ivyblossom committed on Aug 3, 2023

Commit

7b208e8

1 Parent(s): 0f897d9

Update app.py

Browse files

Files changed (1) hide show

app.py +25 -33

app.py CHANGED Viewed

@@ -1,13 +1,9 @@
 import os
 import fitz  # PyMuPDF for parsing PDF
 import streamlit as st
-from sentence_transformers import SentenceTransformer, util
 import re
-# Load a pre-trained SentenceTransformer model
-model_name = "paraphrase-MiniLM-L6-v2"
-model = SentenceTransformer(model_name)
 # Function to extract text from a PDF file
 def extract_text_from_pdf(pdf_path):
     text = ""
@@ -19,38 +15,28 @@ def extract_text_from_pdf(pdf_path):
             yield page_num + 1, page_text  # Return the page number (1-based) and the extracted text
 # Function to truncate text to the nearest word boundary
-def truncate_to_word_boundary(text, max_words=500):
     words = re.findall(r'\w+', text)
     truncated_text = ' '.join(words[:max_words])
     return truncated_text
-# Function to perform semantic search
-def semantic_search(query, documents, top_k=5, max_words=500):
-    query_embedding = model.encode(query, convert_to_tensor=True)
-    # Convert the list of documents to embeddings
-    document_embeddings = model.encode([text for _, text in documents], convert_to_tensor=True)
-    # Compute cosine similarity scores of query with documents
-    cosine_scores = util.pytorch_cos_sim(query_embedding.unsqueeze(0), document_embeddings)[0]
-    # Sort the results in decreasing order
-    results = []
-    for idx in range(len(cosine_scores)):
-        page_num, text = documents[idx]
-        truncated_text = truncate_to_word_boundary(text, max_words)
-        results.append((page_num, truncated_text, cosine_scores[idx].item()))
-    results = sorted(results, key=lambda x: x[2], reverse=True)
-    return results[:top_k]
 def main():
-    st.title("Semantic Search on PDF Documents")
     pdf_file = st.file_uploader("Upload a PDF file:", type=["pdf"])
-    query = st.text_input("Enter your query:")
-    if st.button("Search"):
         if pdf_file:
             pdf_path = os.path.join(os.getcwd(), pdf_file.name)
             with open(pdf_path, "wb") as f:
@@ -59,15 +45,21 @@ def main():
             # Extract text from the PDF along with page numbers
             pdf_text_with_pages = list(extract_text_from_pdf(pdf_path))
-            search_results = semantic_search(query, pdf_text_with_pages)
             os.remove(pdf_path)  # Delete the uploaded file after processing
-            st.write(f"Search results for query: '{query}'")
-            for i, (page_num, result_text, score) in enumerate(search_results, start=1):
-                with st.container():
-                    st.write(f"Result {i} - Page {page_num}")
-                    st.write(f"Score: {score:.2f}")
-                    st.write(result_text)
 if __name__ == "__main__":
     main()

 import os
 import fitz  # PyMuPDF for parsing PDF
 import streamlit as st
+from transformers import pipeline
 import re
 # Function to extract text from a PDF file
 def extract_text_from_pdf(pdf_path):
     text = ""
             yield page_num + 1, page_text  # Return the page number (1-based) and the extracted text
 # Function to truncate text to the nearest word boundary
def truncate_to_word_boundary(text, max_words=100):
    """Return at most the first *max_words* words of *text*.

    Words are whitespace-delimited, so punctuation attached to a word
    ("It's", "3.5", "end.") is kept intact; runs of whitespace (including
    newlines) between the kept words are collapsed to single spaces.

    Args:
        text: The text to shorten.
        max_words: Maximum number of words to keep. ``None`` keeps every
            word; zero or a negative value yields an empty string.

    Returns:
        The shortened text, always ending on a complete word.
    """
    # A negative limit would otherwise act as a "drop the last N words"
    # slice, which is never what a word cap means.
    if max_words is not None and max_words <= 0:
        return ""
    # str.split() with no separator splits on any whitespace run and never
    # produces empty strings, so "" simply maps to "".
    return " ".join(text.split()[:max_words])
+# Function to perform question-answering
def question_answering(question, pdf_text_with_pages, question_answerer=None):
    """Answer *question* from the text of a PDF with an extractive QA model.

    Args:
        question: The question to ask.
        pdf_text_with_pages: Iterable of ``(page_number, page_text)`` pairs
            in reading order.
        question_answerer: Optional callable invoked as
            ``question_answerer(question=..., context=...)`` that returns a
            dict with ``answer``, ``score``, ``start`` and ``end`` keys.
            When omitted, a ``distilbert-base-cased-distilled-squad``
            pipeline is built on every call, which is slow; pass an
            already-built pipeline to reuse it across calls.

    Returns:
        A copy of the model's result dict with two extra keys:

        * ``page_index`` - 0-based position, within ``pdf_text_with_pages``,
          of the page on which the answer starts.
        * ``page_num`` - the page number stored alongside that page.

        ``start`` and ``end`` are left as the model reports them: character
        offsets into the newline-joined text of all pages, NOT page indexes.
        Use ``page_index`` / ``page_num`` to locate the page.

        If the question or the extracted text is blank, the model is not
        run and an empty answer with score ``0.0`` is returned
        (``page_index`` and ``page_num`` are ``None``).
    """
    # Materialise once: the pages are walked twice (join, then page lookup).
    pages = list(pdf_text_with_pages)
    pdf_text = "\n".join([text for _, text in pages])

    # A blank question or a PDF with no extractable text (e.g. scanned
    # images) cannot be answered; report "no answer" instead of handing the
    # model input it does not accept.
    if not question.strip() or not pdf_text.strip():
        return {
            "score": 0.0,
            "start": 0,
            "end": 0,
            "answer": "",
            "page_index": None,
            "page_num": None,
        }

    if question_answerer is None:
        # Perform question-answering using Hugging Face's Transformers
        question_answerer = pipeline(
            "question-answering",
            model="distilbert-base-cased-distilled-squad",
            tokenizer="distilbert-base-cased-distilled-squad",
        )

    answer = dict(question_answerer(question=question, context=pdf_text))
    page_index, page_num = _locate_answer_page(answer["start"], pages)
    answer["page_index"] = page_index
    answer["page_num"] = page_num
    return answer


def _locate_answer_page(char_offset, pages):
    """Map a character offset in the newline-joined text back to its page.

    Returns ``(index, page_number)`` for the page containing *char_offset*,
    or ``(None, None)`` when *pages* is empty.
    """
    page_start = 0
    for index, (page_num, text) in enumerate(pages):
        page_end = page_start + len(text)
        # `<=` attributes the joining newline that follows a page to it.
        if char_offset <= page_end:
            return index, page_num
        page_start = page_end + 1  # step over the "\n" separator
    if pages:
        # Offset past the end of the text: clamp to the last page.
        return len(pages) - 1, pages[-1][0]
    return None, None
 def main():
+    st.title("Question Answering using a PDF Document")
     pdf_file = st.file_uploader("Upload a PDF file:", type=["pdf"])
+    question = st.text_input("Ask your question:")
+    if st.button("Answer"):
         if pdf_file:
             pdf_path = os.path.join(os.getcwd(), pdf_file.name)
             with open(pdf_path, "wb") as f:
             # Extract text from the PDF along with page numbers
             pdf_text_with_pages = list(extract_text_from_pdf(pdf_path))
+            # Perform question-answering
+            answer = question_answering(question, pdf_text_with_pages)
             os.remove(pdf_path)  # Delete the uploaded file after processing
+            st.write(f"Question: '{question}'")
+            st.write("Answer:", answer['answer'])
+            st.write("Score:", answer['score'])
+            st.write("Page Number:", answer['start'] + 1)  # Add 1 to convert 0-based index to 1-based page number
+            # Display truncated context
+            start_page = answer['start']
+            context = pdf_text_with_pages[start_page][1]
+            truncated_context = truncate_to_word_boundary(context)
+            st.write("Context:", truncated_context)
 if __name__ == "__main__":
     main()