File size: 5,346 Bytes
8ca60f2
 
 
 
 
b07d002
8ca60f2
 
 
 
c494e5a
 
 
 
 
 
 
 
 
8ca60f2
 
 
c494e5a
 
 
 
 
 
 
 
 
8ca60f2
 
 
c494e5a
 
 
 
 
 
 
 
 
8ca60f2
b07d002
8ca60f2
b07d002
 
 
 
 
 
 
 
 
8ca60f2
b07d002
 
 
 
 
 
8ca60f2
 
 
 
b07d002
8ca60f2
 
 
 
 
 
 
 
c494e5a
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
8ca60f2
 
 
 
 
c494e5a
 
 
 
 
8ca60f2
c494e5a
 
 
8ca60f2
 
c494e5a
8ca60f2
 
 
 
c494e5a
 
 
8ca60f2
c494e5a
8ca60f2
 
 
 
 
 
 
 
 
 
 
 
 
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
import streamlit as st
from transformers import AutoTokenizer, AutoModelForCausalLM
from sentence_transformers import SentenceTransformer, util
import PyPDF2
from docx import Document
import numpy as np

# Load the text-to-SQL tokenizer and causal LM, plus the sentence-embedding model
@st.cache_resource
def _load_models_cached():
    """Load all three models; raise on failure so nothing bad gets cached.

    Only a successful return value is stored by ``st.cache_resource``; an
    exception propagates to the caller and the next call retries the load.
    """
    checkpoint = "rakeshkiriyath/gpt2Medium_text_to_sql"
    tokenizer = AutoTokenizer.from_pretrained(checkpoint)
    model = AutoModelForCausalLM.from_pretrained(checkpoint)
    # Smaller, faster sentence embeddings model
    sentence_model = SentenceTransformer('all-MiniLM-L6-v2')
    return tokenizer, model, sentence_model


def load_model():
    """Return ``(tokenizer, model, sentence_model)``, loading them once.

    Returns:
        A 3-tuple of the GPT-2 text-to-SQL tokenizer, the matching causal LM,
        and the ``all-MiniLM-L6-v2`` sentence-embedding model. On any loading
        error an error message is shown in the UI and
        ``(None, None, None)`` is returned.

    The failure tuple is produced outside the cached helper on purpose:
    caching it would make a transient error (e.g. a failed download) stick
    for the lifetime of the process.
    """
    try:
        loaded = _load_models_cached()
    except Exception as e:
        st.error(f"Error loading models: {e}")
        return None, None, None
    st.success("Model loaded successfully!")
    return loaded

# Extract text from a PDF file
def extract_text_from_pdf(pdf_file):
    """Return the text of every page of a PDF, one page per line group.

    Args:
        pdf_file: A path or binary file-like object accepted by
            ``PyPDF2.PdfReader``.

    Returns:
        The page texts joined with newlines, or ``""`` if the file cannot be
        read (an error message is shown in the UI in that case).
    """
    try:
        pdf_reader = PyPDF2.PdfReader(pdf_file)
        # Join with a newline so the last word of one page does not fuse with
        # the first word of the next; ``or ""`` tolerates pages that yield no
        # text at all.
        return "\n".join(page.extract_text() or "" for page in pdf_reader.pages)
    except Exception as e:
        st.error(f"Error reading PDF: {e}")
        return ""

# Extract text from a Word document
def extract_text_from_word(docx_file):
    """Return the paragraph text of a Word document, one paragraph per line.

    Every paragraph is followed by a newline. If the document cannot be
    opened or parsed, an error is shown in the UI and ``""`` is returned.
    """
    try:
        paragraphs = Document(docx_file).paragraphs
        return "".join(f"{para.text}\n" for para in paragraphs)
    except Exception as e:
        st.error(f"Error reading Word document: {e}")
        return ""

# Optimized comparison using embeddings and matrix operations
def compare_sentences(doc1_sentences, doc2_sentences, sentence_model, *, threshold=0.6):
    """Find sentence pairs across two documents whose embeddings are similar.

    Args:
        doc1_sentences: Sentences of the first document.
        doc2_sentences: Sentences of the second document.
        sentence_model: Object whose ``encode`` method turns a list of
            sentences into a tensor of embeddings.
        threshold: Minimum cosine similarity for a pair to be reported.
            Raise it for stricter matching, lower it for looser matching.

    Returns:
        A list of ``(i, j, score, sentence1, sentence2)`` tuples, ordered by
        ``i`` then ``j``, where ``i``/``j`` are 0-based indices into the two
        inputs and ``score`` is a Python float. Empty if either input is
        empty.
    """
    # Nothing to compare; also avoids encoding an empty batch.
    if not doc1_sentences or not doc2_sentences:
        return []

    # Encode all sentences in batches to get embeddings
    doc1_embeddings = sentence_model.encode(doc1_sentences, convert_to_tensor=True, batch_size=16)
    doc2_embeddings = sentence_model.encode(doc2_sentences, convert_to_tensor=True, batch_size=16)

    # Compute cosine similarity matrix between all pairs
    similarity_matrix = util.pytorch_cos_sim(doc1_embeddings, doc2_embeddings)

    # Convert to nested Python floats once, rather than doing a tensor
    # comparison and an .item() call for every pair.
    score_rows = similarity_matrix.tolist()

    similar_sentences = []
    for i, row in enumerate(score_rows):
        for j, score in enumerate(row):
            if score >= threshold:
                similar_sentences.append((i, j, score, doc1_sentences[i], doc2_sentences[j]))

    return similar_sentences

# Streamlit UI
# Cap on how many sentences per document are compared; keeps the pairwise
# similarity matrix small enough for interactive use.
_MAX_SENTENCES = 50


def _read_uploaded_text(uploaded_file):
    """Extract text from an uploaded file, choosing the parser by extension."""
    # Case-insensitive so an upper-case ".PDF" is still routed to the PDF parser.
    if uploaded_file.name.lower().endswith(".pdf"):
        return extract_text_from_pdf(uploaded_file)
    return extract_text_from_word(uploaded_file)


def _split_sentences(text):
    """Split text on '. ' and drop empty or whitespace-only fragments."""
    fragments = (fragment.strip() for fragment in text.split('. '))
    return [fragment for fragment in fragments if fragment]


def main():
    """Render the Streamlit page that compares two uploaded documents.

    Flow: upload two PDF/Word files, extract their text, show a preview of
    each, split the text into sentences, embed and compare them, then list
    every sentence pair that ``compare_sentences`` reports as similar.
    """
    st.title("Optimized Comparative Analysis of Two Documents")
    st.sidebar.header("Upload Files")

    # Upload files
    uploaded_file1 = st.sidebar.file_uploader("Upload the First Document (PDF/Word)", type=["pdf", "docx"])
    uploaded_file2 = st.sidebar.file_uploader("Upload the Second Document (PDF/Word)", type=["pdf", "docx"])

    if not (uploaded_file1 and uploaded_file2):
        st.warning("Please upload two documents to compare.")
        return

    # Extract text from the uploaded documents
    text1 = _read_uploaded_text(uploaded_file1)
    text2 = _read_uploaded_text(uploaded_file2)

    if not text1.strip():
        st.error("The first document is empty or could not be read.")
        return
    if not text2.strip():
        st.error("The second document is empty or could not be read.")
        return

    st.write("### Preview of Document 1:")
    st.text(text1[:500])  # Display a preview of Document 1
    st.write("### Preview of Document 2:")
    st.text(text2[:500])  # Display a preview of Document 2

    # Split text into sentences, discarding blank fragments so they are not
    # embedded and matched against each other.
    all_sentences1 = _split_sentences(text1)
    all_sentences2 = _split_sentences(text2)
    if not all_sentences1 or not all_sentences2:
        st.error("No sentences could be extracted from one of the documents.")
        return

    # Limit the amount of work and tell the user when the cap applies.
    doc1_sentences = all_sentences1[:_MAX_SENTENCES]
    doc2_sentences = all_sentences2[:_MAX_SENTENCES]
    if len(all_sentences1) > _MAX_SENTENCES or len(all_sentences2) > _MAX_SENTENCES:
        st.info(f"Only the first {_MAX_SENTENCES} sentences of each document are compared.")

    # Load models (only the sentence-embedding model is used below)
    tokenizer, model, sentence_model = load_model()
    if not sentence_model:
        st.error("Failed to load the sentence embedding model.")
        return

    # Perform sentence comparison
    st.info("Comparing sentences, this may take a moment...")
    similar_sentences = compare_sentences(doc1_sentences, doc2_sentences, sentence_model)

    # Display results
    st.header("Comparative Analysis Results")
    st.write(f"Number of sentences in Document 1: {len(doc1_sentences)}")
    st.write(f"Number of sentences in Document 2: {len(doc2_sentences)}")

    if not similar_sentences:
        st.info("No significantly similar sentences found.")
        return

    st.success(f"Found {len(similar_sentences)} similar sentences!")
    for doc1_index, doc2_index, score, sent1, sent2 in similar_sentences:
        st.markdown(f"**Document 1 Sentence {doc1_index + 1}:** {sent1}")
        st.markdown(f"**Document 2 Sentence {doc2_index + 1}:** {sent2}")
        st.markdown(f"**Similarity Score:** {score:.2f}")
        st.markdown("---")

# Script entry point: build the UI only when this file is executed as the main
# module, not when it is imported.
if __name__ == "__main__":
    main()