File size: 7,516 Bytes
180a0cc
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
3073f86
180a0cc
748d430
 
 
 
180a0cc
3073f86
89d3f24
180a0cc
748d430
f6df22c
 
748d430
180a0cc
 
 
748d430
180a0cc
748d430
180a0cc
 
 
 
 
f6df22c
 
be9c181
 
f6df22c
 
 
 
180a0cc
 
 
 
 
 
f6df22c
180a0cc
 
f6df22c
 
89d3f24
180a0cc
f6df22c
180a0cc
f6df22c
 
180a0cc
be9c181
 
 
180a0cc
f6df22c
 
180a0cc
748d430
180a0cc
 
748d430
f6df22c
180a0cc
 
 
be9c181
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
180a0cc
 
 
be9c181
 
 
180a0cc
 
748d430
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
import streamlit as st
from PyPDF2 import PdfReader
from anthropic import Anthropic
from prompts import DIFFERENTIATE_PROMPT

def extract_differences(input_text):
    """Parse difference records out of tagged text.

    The stripped input is cut at every ``</difference>`` closing tag. A chunk
    is kept only when it contains both the opening and the closing tag of all
    three fields (``text1_section``, ``text2_section``, ``explanation``);
    any other chunk (for example the trailing remainder) is skipped.

    Returns:
        A list of dicts, in order of appearance, each with the keys
        ``'text1'``, ``'text2'`` and ``'explanation'`` holding the
        whitespace-stripped text found inside the corresponding tags.
    """
    # (result key, opening tag, closing tag) for every field of a record.
    fields = (
        ('text1', '<text1_section>', '</text1_section>'),
        ('text2', '<text2_section>', '</text2_section>'),
        ('explanation', '<explanation>', '</explanation>'),
    )

    def is_complete(chunk):
        # A record is usable only if every tag of every field is present.
        return all(
            opening in chunk and closing in chunk
            for _, opening, closing in fields
        )

    def inner_text(chunk, opening, closing):
        # Text following the first opening tag, cut at the first closing tag.
        return chunk.split(opening)[1].split(closing)[0].strip()

    chunks = input_text.strip().split('</difference>')
    return [
        {key: inner_text(chunk, opening, closing) for key, opening, closing in fields}
        for chunk in chunks
        if is_complete(chunk)
    ]

def make_llm_api_call(prompt, *, model="claude-3-haiku-20240307", max_tokens=4096, temperature=0):
    """Send *prompt* as a single user message through the Anthropic client.

    Args:
        prompt: Text placed in the one user message of the request.
        model: Model identifier passed to ``messages.create``.
        max_tokens: Upper bound on generated tokens passed to ``messages.create``.
        temperature: Sampling temperature passed to ``messages.create``.

    Returns:
        The object returned by ``client.messages.create``; callers in this
        file read the reply text from ``message.content[0].text``.
    """
    # The client is built with no arguments, so credentials are presumably
    # taken from the environment (e.g. ANTHROPIC_API_KEY) — confirm for deployment.
    client = Anthropic()
    return client.messages.create(
        model=model,
        max_tokens=max_tokens,
        temperature=temperature,
        messages=[{"role": "user", "content": [{"type": "text", "text": prompt}]}],
    )

def get_llm_response(extractedtext1, extractedtext2):
    """Ask the LLM for the differences between two texts and parse its reply.

    The two texts are substituted into ``DIFFERENTIATE_PROMPT`` and sent via
    ``make_llm_api_call``. The content of the ``<differences>`` wrapper is
    parsed when present; otherwise the whole reply is parsed.

    Args:
        extractedtext1: First text, inserted as ``text1`` in the prompt.
        extractedtext2: Second text, inserted as ``text2`` in the prompt.

    Returns:
        A list of dicts as produced by ``extract_differences``. An empty list
        is returned when the reply cannot be parsed, so the result is always
        safe to iterate as a list of difference records.
    """
    prompt = DIFFERENTIATE_PROMPT.format(text1=extractedtext1, text2=extractedtext2)
    message = make_llm_api_call(prompt)
    message_text = message.content[0].text

    try:
        try:
            # Preferred path: only the text inside the <differences> wrapper.
            wrapped = message_text.split("<differences>")[1].split("</differences>")[0]
            return extract_differences(wrapped.strip())
        except Exception:
            # No wrapper (IndexError) or unparsable section: try the full reply.
            return extract_differences(message_text)
    except Exception as e:
        print("Error:", e)
        # Keep the return type consistent with the success path; the caller
        # iterates the result as a list of dicts.
        return []



def extract_text_with_pypdf(pdf_path):
    """Return the extracted text of every page of a PDF, one list entry per page.

    ``pdf_path`` is handed to ``PdfReader`` unchanged; in this file it is
    called with the objects returned by Streamlit's file uploader.
    """
    return [page.extract_text() for page in PdfReader(pdf_path).pages]


# Number of additional page pairs unlocked by each "Find Differences" click.
_PAGES_PER_CLICK = 5


def _highlight_snippet(page_text, snippet):
    """Return *page_text* with every occurrence of *snippet* wrapped in grey spans.

    Each line of the snippet gets its own ``<span>`` so the highlight survives
    the per-line re-joining done when the page is rendered.
    """
    if not snippet:
        # str.replace with an empty needle would insert markup between every character.
        return page_text
    marked = "\n".join(
        [f"<span style='background-color: grey;'>{line}</span>" for line in snippet.splitlines()]
    )
    return page_text.replace(snippet, marked)


def _render_page_differences(pages1, pages2, filename1, filename2):
    """Render highlighted pages and explanations for the unlocked page pairs.

    Results are cached in ``st.session_state.differences_data`` keyed by the
    pair of page texts, so a rerun does not call the LLM again for a pair that
    was already compared (in either order).
    """
    unlocked_pages = st.session_state.display_data["i"]
    # zip() stops at the shorter document, so surplus pages of the longer
    # PDF are not compared.
    for i, (etext1, etext2) in enumerate(zip(pages1, pages2)):
        if i >= unlocked_pages:
            break
        data = next(
            (d for d in st.session_state.differences_data
             if d['etext1'] == etext1 and d['etext2'] == etext2),
            None,
        )
        if data:
            pdata = data['pdata']
            dext1 = data['dext1']
            dext2 = data['dext2']
        else:
            pdata = get_llm_response(etext1, etext2)
            dext1 = etext1
            dext2 = etext2

            for diff in pdata:
                diff_text1 = diff['text1'].strip()
                diff_text2 = diff['text2'].strip()
                # A difference is highlighted only when both sides are non-empty.
                if diff_text1 == "" or diff_text2 == "":
                    continue
                dext1 = _highlight_snippet(dext1, diff_text1)
                dext2 = _highlight_snippet(dext2, diff_text2)

            st.session_state.differences_data.append({"etext1": etext1, "etext2": etext2, "pdata": pdata, "dext1": dext1, "dext2": dext2})
            # Also cache the mirrored comparison so swapping the uploads reuses it.
            reverse_pdata = [{'text1': d['text2'], 'text2': d['text1'], 'explanation': d['explanation']} for d in pdata]
            st.session_state.differences_data.append({"etext1": etext2, "etext2": etext1, "pdata": reverse_pdata, "dext1": dext2, "dext2": dext1})

        display_text = "\n\n\n".join([f"**Text1:**\n\n{d['text1']}\n\n**Text2:**\n\n{d['text2']}\n\n**Explanation:**\n\n{d['explanation']}\n\n----------------------\n" for d in pdata])

        # NOTE(review): page text extracted from the PDFs is rendered with
        # unsafe_allow_html=True without escaping — consider escaping it.
        with st.expander(f"**Page {i+1}** - {filename1}"):
            st.markdown("\n\n".join(dext1.splitlines()), unsafe_allow_html=True)
        with st.expander(f"**Page {i+1}** - {filename2}"):
            st.markdown("\n\n".join(dext2.splitlines()), unsafe_allow_html=True)
        st.markdown(display_text)


def main():
    """Streamlit entry point: upload two PDFs and show their page-wise differences.

    Session state used:
        differences_data: cache of per-page comparison results.
        display_data: the currently compared files and ``"i"``, the number of
            page pairs unlocked so far (reset to 0 when either file changes).
        file1 / file2: the current uploads.
    """
    st.set_page_config(layout="wide")  # Enable wide layout
    if "differences_data" not in st.session_state:
        st.session_state.differences_data = []
    if "display_data" not in st.session_state:
        st.session_state.display_data = {"file1": None, "file2": None, "i": 0}
    if "file1" not in st.session_state:
        st.session_state.file1 = None
    if "file2" not in st.session_state:
        st.session_state.file2 = None
    st.markdown('<div style="text-align: center;">' + '<h1>PDF Upload and Compare App</h1>' + '</div>', unsafe_allow_html=True)

    # Create columns for side-by-side upload widgets
    col1, col2 = st.columns([2, 2])

    # Add upload widget to left column
    st.session_state.file1 = col1.file_uploader("**Text 1**", type="pdf")

    # Add upload widget to right column
    st.session_state.file2 = col2.file_uploader("**Text 2**", type="pdf")

    # Nothing more to do until both files are uploaded
    if not (st.session_state.file1 and st.session_state.file2):
        return

    # Get filenames from uploaded files
    filename1 = st.session_state.file1.name
    filename2 = st.session_state.file2.name

    # A changed upload restarts the page counter.
    if st.session_state.display_data["file1"] != st.session_state.file1 or st.session_state.display_data["file2"] != st.session_state.file2:
        st.session_state.display_data = {"file1": st.session_state.file1, "file2": st.session_state.file2, "i": 0}

    # Pre-bind so a failed extraction is detectable instead of leaving the
    # names undefined for the comparison step below.
    extracted_text1 = None
    extracted_text2 = None
    try:
        extracted_text1 = extract_text_with_pypdf(st.session_state.file1)
        extracted_text2 = extract_text_with_pypdf(st.session_state.file2)
        with col1.expander(filename1):
            st.write("\n\n".join("\n\n------------------------\n\n".join(extracted_text1).splitlines()))

        with col2.expander(filename2):
            st.write("\n\n".join("\n\n------------------------\n\n".join(extracted_text2).splitlines()))

        st.success(f"Content of files **{filename1}** and **{filename2}** have been extracted successfully.")
    except Exception as e:
        st.error(f"Error extracting text from files: {str(e)}")

    # Compare only when both extractions produced page lists.
    if extracted_text1 is not None and extracted_text2 is not None:
        try:
            _render_page_differences(extracted_text1, extracted_text2, filename1, filename2)
        except Exception as e:
            st.error(f"Error finding differences: {str(e)}")

    # Button at the bottom unlocks the next batch of pages and reruns the script
    if st.button("Find Differences"):
        st.session_state.display_data["i"] = st.session_state.display_data["i"] + _PAGES_PER_CLICK
        st.rerun()


if __name__ == "__main__":
    main()