File size: 8,185 Bytes
180a0cc
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
3073f86
180a0cc
748d430
 
 
 
180a0cc
3073f86
89d3f24
180a0cc
748d430
f6df22c
 
748d430
180a0cc
 
 
748d430
180a0cc
748d430
180a0cc
 
 
 
 
f6df22c
 
be9c181
 
f6df22c
 
 
 
eb91470
 
180a0cc
 
 
 
 
 
eb91470
180a0cc
 
eb91470
f6df22c
89d3f24
180a0cc
f6df22c
180a0cc
f6df22c
 
059a4c9
 
 
 
 
be9c181
180a0cc
eb91470
 
180a0cc
748d430
180a0cc
 
748d430
f6df22c
059a4c9
180a0cc
 
be9c181
af0863a
 
 
 
be9c181
af0863a
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
eb91470
af0863a
 
 
 
 
 
 
 
be9c181
180a0cc
 
 
be9c181
 
 
180a0cc
 
748d430
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
import streamlit as st
from PyPDF2 import PdfReader
from anthropic import Anthropic
from prompts import DIFFERENTIATE_PROMPT

def extract_differences(input_text):
    """Parse difference blocks out of tagged text into a list of dicts.

    The stripped input is split on ``</difference>``. Every resulting chunk
    that contains both the opening and closing tag of ``text1_section``,
    ``text2_section`` and ``explanation`` produces one dict with the keys
    ``'text1'``, ``'text2'`` and ``'explanation'``; each value is the text
    between the first opening tag and the following closing tag, with
    surrounding whitespace removed. Chunks missing any of the six tags are
    skipped.

    Args:
        input_text: Text containing zero or more tagged difference blocks.

    Returns:
        A list of dicts, one per complete chunk, in order of appearance.
    """
    # (output key, tag name) for each field pulled out of a chunk.
    fields = (
        ('text1', 'text1_section'),
        ('text2', 'text2_section'),
        ('explanation', 'explanation'),
    )

    def has_all_tags(chunk):
        # A chunk qualifies only when every opening and closing tag is present.
        return all(
            f'<{tag}>' in chunk and f'</{tag}>' in chunk
            for _, tag in fields
        )

    def inner_text(chunk, tag):
        # Text after the first opening tag, cut at the next closing tag.
        after_open = chunk.split(f'<{tag}>')[1]
        return after_open.split(f'</{tag}>')[0].strip()

    chunks = input_text.strip().split('</difference>')
    return [
        {key: inner_text(chunk, tag) for key, tag in fields}
        for chunk in chunks
        if has_all_tags(chunk)
    ]

def make_llm_api_call(prompt):
    """Send ``prompt`` as a single user message and return the API response.

    A fresh ``Anthropic`` client is created on every call. The response object
    from ``client.messages.create`` is returned unchanged.

    Args:
        prompt: The full prompt text to send as the only user message.

    Returns:
        The message object returned by ``client.messages.create``.
    """
    user_turn = {
        "role": "user",
        "content": [{"type": "text", "text": prompt}],
    }
    request = {
        "model": "claude-3-haiku-20240307",
        "max_tokens": 4096,
        "temperature": 0,  # temperature fixed at 0, as in the original request
        "messages": [user_turn],
    }
    return Anthropic().messages.create(**request)

def get_llm_response(extractedtext1, extractedtext2):
    """Ask the LLM for the differences between two texts and parse its reply.

    The two texts are substituted into ``DIFFERENTIATE_PROMPT`` and sent via
    ``make_llm_api_call``. The text of the first content block of the reply is
    then parsed with ``extract_differences``. When the reply contains a
    ``<differences>`` wrapper only its contents are parsed; otherwise the whole
    reply is parsed.

    Args:
        extractedtext1: Text from the first document.
        extractedtext2: Text from the second document.

    Returns:
        A list of dicts as produced by ``extract_differences`` (possibly
        empty). The return type is always a list, so callers can iterate the
        result directly.

    Raises:
        Whatever ``make_llm_api_call`` raises, and ``IndexError`` if the reply
        has no content blocks; these are left for the caller to report.
    """
    prompt = DIFFERENTIATE_PROMPT.format(text1=extractedtext1, text2=extractedtext2)
    message = make_llm_api_call(prompt)
    message_text = message.content[0].text

    # Test for the wrapper explicitly instead of relying on an IndexError from
    # split() to trigger the "parse the whole reply" fallback.
    if "<differences>" in message_text:
        body = message_text.split("<differences>")[1].split("</differences>")[0].strip()
    else:
        body = message_text

    return extract_differences(body)



def extract_text_with_pypdf(pdf_path):
    """Return the extracted text of each page of a PDF, in page order.

    Args:
        pdf_path: Passed straight to ``PdfReader``. Elsewhere in this file it
            is called with the objects returned by Streamlit's file uploader.

    Returns:
        A list with one ``page.extract_text()`` result per page.
    """
    return [page.extract_text() for page in PdfReader(pdf_path).pages]


# Number of additional page pairs compared per "Find Differences" click.
_PAGES_PER_CLICK = 5


def _wrap_in_span(snippet):
    """Wrap every line of ``snippet`` in a grey-background ``<span>``."""
    return "\n".join(
        f"<span style='background-color: grey;'>{line}</span>"
        for line in snippet.splitlines()
    )


def _highlight_differences(page_text1, page_text2, differences):
    """Return both page texts with each reported difference snippet highlighted.

    A difference is skipped entirely when either of its two snippets is empty
    after stripping. Snippets that do not occur verbatim in the page text leave
    that text unchanged.
    """
    for diff in differences:
        snippet1 = diff['text1'].strip()
        snippet2 = diff['text2'].strip()
        if not snippet1 or not snippet2:
            continue
        page_text1 = page_text1.replace(snippet1, _wrap_in_span(snippet1))
        page_text2 = page_text2.replace(snippet2, _wrap_in_span(snippet2))
    return page_text1, page_text2


def main():
    """Render the Streamlit page: upload two PDFs, show their text, compare pages.

    Flow on every script run:
      1. Initialise missing ``st.session_state`` entries.
      2. Show two side-by-side PDF uploaders.
      3. Once both files are present, extract their per-page text (only when
         the uploaded files changed) and display it in expanders.
      4. Compare the first ``display_data["i"]`` page pairs with
         ``get_llm_response``, reusing results stored in
         ``st.session_state.differences_data`` when the same page pair was
         already compared.
      5. Each click on "Find Differences" raises ``display_data["i"]`` by
         ``_PAGES_PER_CLICK`` and reruns the script.

    Exceptions raised while extracting text from the PDFs propagate to
    Streamlit; display and comparison errors are reported with ``st.error``.
    """
    st.set_page_config(layout="wide")  # Enable wide layout

    state = st.session_state
    if "differences_data" not in state:
        state.differences_data = []
    if "display_data" not in state:
        state.display_data = {"file1": None, "file2": None, "i": 0}
    if "file1" not in state:
        state.file1 = None
    if "file2" not in state:
        state.file2 = None
    if "extracted_texts" not in state:
        state.extracted_texts = {"file1": None, "file2": None, "extracted_text_1": [], "extracted_text_2": []}
    st.markdown('<div style="text-align: center;">' + '<h1>PDF Upload and Compare App</h1>' + '</div>', unsafe_allow_html=True)

    # Create columns for side-by-side upload widgets
    col1, col2 = st.columns([2, 2])
    state.file1 = col1.file_uploader("**PDF 1**", type="pdf")
    state.file2 = col2.file_uploader("**PDF 2**", type="pdf")

    # Nothing more to do until both files are uploaded
    if state.file1 and state.file2:
        filename1 = state.file1.name
        filename2 = state.file2.name

        with st.spinner("Extracting text from PDFs"):
            files_changed = (
                state.display_data["file1"] != state.file1
                or state.display_data["file2"] != state.file2
            )
            if files_changed:
                # Extract first and commit to session state afterwards: if
                # extraction raises, the state must not already claim these
                # files were processed, otherwise the next rerun would skip
                # extraction and show text from the previous upload.
                pages1 = extract_text_with_pypdf(state.file1)
                pages2 = extract_text_with_pypdf(state.file2)
                state.display_data = {"file1": state.file1, "file2": state.file2, "i": 0}
                state.extracted_texts = {"file1": state.file1, "file2": state.file2, "extracted_text_1": pages1, "extracted_text_2": pages2}

        # Bound outside the try below so the comparison step can always use them.
        extracted_text1 = state.extracted_texts["extracted_text_1"]
        extracted_text2 = state.extracted_texts["extracted_text_2"]

        try:
            with col1.expander(filename1):
                st.write("\n\n".join("\n\n------------------------\n\n".join(extracted_text1).splitlines()))

            with col2.expander(filename2):
                st.write("\n\n".join("\n\n------------------------\n\n".join(extracted_text2).splitlines()))

            st.success("PDF text extraction complete")
        except Exception as e:
            st.error(f"Error displaying extracted text: {str(e)}")

        with st.spinner("Processing Pages within the PDFS"):
            try:
                # zip() stops at the shorter document, so surplus pages of the
                # longer PDF are not compared.
                for i, (etext1, etext2) in enumerate(zip(extracted_text1, extracted_text2)):
                    # Only the first display_data["i"] page pairs are processed.
                    if i >= state.display_data["i"]:
                        break

                    cached = next((d for d in state.differences_data if d['etext1'] == etext1 and d['etext2'] == etext2), None)
                    if cached:
                        pdata = cached['pdata']
                        dext1 = cached['dext1']
                        dext2 = cached['dext2']
                    else:
                        pdata = get_llm_response(etext1, etext2)
                        dext1, dext2 = _highlight_differences(etext1, etext2, pdata)

                        state.differences_data.append({"etext1": etext1, "etext2": etext2, "pdata": pdata, "dext1": dext1, "dext2": dext2})
                        # Also store the mirrored result so the same two pages
                        # in swapped order hit the cache.
                        reverse_pdata = [{'text1': d['text2'], 'text2': d['text1'], 'explanation': d['explanation']} for d in pdata]
                        state.differences_data.append({"etext1": etext2, "etext2": etext1, "pdata": reverse_pdata, "dext1": dext2, "dext2": dext1})

                    display_text = "\n\n\n".join([f"**PDF 1:**\n\n{d['text1']}\n\n**PDF 2:**\n\n{d['text2']}\n\n**Explanation:**\n\n{d['explanation']}\n\n----------------------\n" for d in pdata])

                    # NOTE(review): page text from the PDFs is rendered with
                    # unsafe_allow_html=True and is not escaped, so any markup
                    # contained in a PDF is passed through to the page.
                    # Consider html.escape on page text and snippets.
                    with st.expander(f"**Page {i+1}** - {filename1}"):
                        st.markdown("\n\n".join(dext1.splitlines()), unsafe_allow_html=True)
                    with st.expander(f"**Page {i+1}** - {filename2}"):
                        st.markdown("\n\n".join(dext2.splitlines()), unsafe_allow_html=True)
                    st.markdown(display_text)
            except Exception as e:
                st.error(f"Error finding differences: {str(e)}")

        # Button at the bottom: each click compares a further batch of pages
        if st.button("Find Differences"):
            state.display_data["i"] += _PAGES_PER_CLICK
            st.rerun()


# Run the app only when this file is executed as a script, not when it is imported.
if __name__ == "__main__":
    main()