Spaces:
Sleeping
Sleeping
File size: 5,346 Bytes
8ca60f2 b07d002 8ca60f2 c494e5a 8ca60f2 c494e5a 8ca60f2 c494e5a 8ca60f2 b07d002 8ca60f2 b07d002 8ca60f2 b07d002 8ca60f2 b07d002 8ca60f2 c494e5a 8ca60f2 c494e5a 8ca60f2 c494e5a 8ca60f2 c494e5a 8ca60f2 c494e5a 8ca60f2 c494e5a 8ca60f2 |
1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120 121 122 123 124 125 126 127 128 129 130 131 132 133 134 135 136 |
import streamlit as st
from transformers import AutoTokenizer, AutoModelForCausalLM
from sentence_transformers import SentenceTransformer, util
import PyPDF2
from docx import Document
import numpy as np
@st.cache_resource
def _load_models_cached():
    """Build the tokenizer, causal LM and sentence-embedding model once.

    Decorated with ``st.cache_resource`` so the objects are reused across
    Streamlit reruns. Exceptions are deliberately NOT caught here: only a
    successful return value should end up in the cache, so a transient
    failure (for example a network error while downloading weights) can be
    retried on the next rerun instead of being remembered.
    """
    checkpoint = "rakeshkiriyath/gpt2Medium_text_to_sql"
    tokenizer = AutoTokenizer.from_pretrained(checkpoint)
    model = AutoModelForCausalLM.from_pretrained(checkpoint)
    # Smaller, faster sentence embeddings model
    sentence_model = SentenceTransformer('all-MiniLM-L6-v2')
    return tokenizer, model, sentence_model


def load_model():
    """Return ``(tokenizer, model, sentence_model)``, reporting status in the UI.

    Returns:
        A 3-tuple of the text-to-SQL tokenizer, the text-to-SQL causal
        language model and the sentence-embedding model. If anything fails
        to load, the error is shown with ``st.error`` and
        ``(None, None, None)`` is returned.

    NOTE(review): within the visible part of this file only the sentence
    model is used by the caller; the tokenizer and causal LM are still
    loaded so the return shape stays unchanged.
    """
    try:
        tokenizer, model, sentence_model = _load_models_cached()
    except Exception as e:  # UI boundary: report the failure instead of crashing
        st.error(f"Error loading models: {e}")
        return None, None, None
    st.success("Model loaded successfully!")
    return tokenizer, model, sentence_model
def extract_text_from_pdf(pdf_file):
    """Return the text of every page of *pdf_file*, concatenated in order.

    Page texts are joined with no separator. If reading or extraction
    raises, the error is shown with ``st.error`` and an empty string is
    returned.
    """
    try:
        reader = PyPDF2.PdfReader(pdf_file)
        return "".join(page.extract_text() for page in reader.pages)
    except Exception as e:
        st.error(f"Error reading PDF: {e}")
        return ""
def extract_text_from_word(docx_file):
    """Return the paragraph text of a Word document, one paragraph per line.

    Every paragraph's text is followed by a newline, including the last
    one. If the document cannot be read, the error is shown with
    ``st.error`` and an empty string is returned.
    """
    try:
        paragraphs = Document(docx_file).paragraphs
        return "".join(para.text + "\n" for para in paragraphs)
    except Exception as e:
        st.error(f"Error reading Word document: {e}")
        return ""
def compare_sentences(doc1_sentences, doc2_sentences, sentence_model, threshold=0.6):
    """Find sentence pairs across two documents whose embeddings are similar.

    Args:
        doc1_sentences: List of sentences from the first document.
        doc2_sentences: List of sentences from the second document.
        sentence_model: Object exposing ``encode(sentences,
            convert_to_tensor=True, batch_size=...)``.
        threshold: Minimum cosine similarity for a pair to be reported.
            Raise it for stricter matching, lower it for looser matching.

    Returns:
        A list of ``(i, j, score, sentence1, sentence2)`` tuples, where
        ``i`` and ``j`` are 0-based indices into the two input lists and
        ``score`` is the cosine similarity as a Python float. Pairs are
        ordered by ``i`` and then ``j``. Empty if either input is empty.
    """
    # Nothing to compare; also avoids encoding an empty batch.
    if not doc1_sentences or not doc2_sentences:
        return []

    # Encode all sentences in batches to get embeddings
    doc1_embeddings = sentence_model.encode(
        doc1_sentences, convert_to_tensor=True, batch_size=16
    )
    doc2_embeddings = sentence_model.encode(
        doc2_sentences, convert_to_tensor=True, batch_size=16
    )

    # Cosine similarity between every (doc1, doc2) sentence pair
    similarity_matrix = util.pytorch_cos_sim(doc1_embeddings, doc2_embeddings)

    # Convert to nested Python floats once instead of calling .item() per cell
    score_rows = similarity_matrix.tolist()

    similar_sentences = []
    for i, row in enumerate(score_rows):
        for j, score in enumerate(row):
            if score >= threshold:
                similar_sentences.append(
                    (i, j, score, doc1_sentences[i], doc2_sentences[j])
                )
    return similar_sentences
# Streamlit UI

# Upper bound on sentences compared per document (limits the pairwise work).
MAX_SENTENCES_PER_DOC = 50


def _extract_uploaded_text(uploaded_file):
    """Return the text of an uploaded file, choosing the reader by extension."""
    # Case-insensitive so that e.g. "REPORT.PDF" is not sent to the Word reader.
    if uploaded_file.name.lower().endswith(".pdf"):
        return extract_text_from_pdf(uploaded_file)
    return extract_text_from_word(uploaded_file)


def _split_sentences(text, limit=MAX_SENTENCES_PER_DOC):
    """Split *text* on '. ' and return at most *limit* non-empty, stripped parts."""
    stripped_parts = (part.strip() for part in text.split('. '))
    # Blank fragments carry no meaning and would otherwise match each other.
    sentences = [part for part in stripped_parts if part]
    return sentences[:limit]


def main():
    """Render the document-comparison page.

    Flow: read two uploaded PDF/Word files, preview the first 500
    characters of each, split both into sentences, embed and compare them,
    then list every sentence pair returned by ``compare_sentences``.
    Stops early with a message when a file is missing, a document has no
    usable text, or the sentence model cannot be loaded.
    """
    st.title("Optimized Comparative Analysis of Two Documents")
    st.sidebar.header("Upload Files")

    # Upload files
    uploaded_file1 = st.sidebar.file_uploader(
        "Upload the First Document (PDF/Word)", type=["pdf", "docx"]
    )
    uploaded_file2 = st.sidebar.file_uploader(
        "Upload the Second Document (PDF/Word)", type=["pdf", "docx"]
    )

    if not (uploaded_file1 and uploaded_file2):
        st.warning("Please upload two documents to compare.")
        return

    # Extract text and split it into sentences
    text1 = _extract_uploaded_text(uploaded_file1)
    text2 = _extract_uploaded_text(uploaded_file2)
    doc1_sentences = _split_sentences(text1)
    doc2_sentences = _split_sentences(text2)

    # An empty sentence list covers both unreadable files and blank text.
    if not doc1_sentences:
        st.error("The first document is empty or could not be read.")
        return
    if not doc2_sentences:
        st.error("The second document is empty or could not be read.")
        return

    st.write("### Preview of Document 1:")
    st.text(text1[:500])  # Display a preview of Document 1
    st.write("### Preview of Document 2:")
    st.text(text2[:500])  # Display a preview of Document 2

    # Load models; only the sentence-embedding model is needed here.
    _tokenizer, _model, sentence_model = load_model()
    if sentence_model is None:
        st.error("Failed to load the sentence embedding model.")
        return

    # Perform sentence comparison
    st.info("Comparing sentences, this may take a moment...")
    similar_sentences = compare_sentences(doc1_sentences, doc2_sentences, sentence_model)

    # Display results
    st.header("Comparative Analysis Results")
    st.write(f"Number of sentences in Document 1: {len(doc1_sentences)}")
    st.write(f"Number of sentences in Document 2: {len(doc2_sentences)}")

    if not similar_sentences:
        st.info("No significantly similar sentences found.")
        return

    st.success(f"Found {len(similar_sentences)} similar sentences!")
    for doc1_index, doc2_index, score, sent1, sent2 in similar_sentences:
        # Indices are 0-based internally; show them 1-based to the user.
        st.markdown(f"**Document 1 Sentence {doc1_index + 1}:** {sent1}")
        st.markdown(f"**Document 2 Sentence {doc2_index + 1}:** {sent2}")
        st.markdown(f"**Similarity Score:** {score:.2f}")
        st.markdown("---")
# Run the app only when this file is executed as the main script,
# not when it is imported as a module.
if __name__ == "__main__":
    main()
|