Spaces:
Runtime error
Runtime error
File size: 8,185 Bytes
180a0cc 3073f86 180a0cc 748d430 180a0cc 3073f86 89d3f24 180a0cc 748d430 f6df22c 748d430 180a0cc 748d430 180a0cc 748d430 180a0cc f6df22c be9c181 f6df22c eb91470 180a0cc eb91470 180a0cc eb91470 f6df22c 89d3f24 180a0cc f6df22c 180a0cc f6df22c 059a4c9 be9c181 180a0cc eb91470 180a0cc 748d430 180a0cc 748d430 f6df22c 059a4c9 180a0cc be9c181 af0863a be9c181 af0863a eb91470 af0863a be9c181 180a0cc be9c181 180a0cc 748d430 |
1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120 121 122 123 124 125 126 127 128 129 130 131 132 133 134 135 136 137 138 139 140 141 142 143 144 145 146 147 148 149 150 151 152 153 154 155 156 157 158 159 160 161 162 163 |
import streamlit as st
from PyPDF2 import PdfReader
from anthropic import Anthropic
from prompts import DIFFERENTIATE_PROMPT
def extract_differences(input_text):
    """Parse tagged difference entries out of an LLM reply.

    The text is cut at every ``</difference>`` marker.  A chunk is kept only
    when it contains the opening and closing tag of all three sections
    (``text1_section``, ``text2_section``, ``explanation``); incomplete
    chunks, including whatever trails the last marker, are dropped silently.

    Args:
        input_text: Raw text holding zero or more difference entries.

    Returns:
        A list of dicts with the keys ``'text1'``, ``'text2'`` and
        ``'explanation'``, each value stripped of surrounding whitespace,
        in the order the entries appear in ``input_text``.
    """
    # (result key, opening tag, closing tag) for every section of an entry.
    sections = (
        ('text1', '<text1_section>', '</text1_section>'),
        ('text2', '<text2_section>', '</text2_section>'),
        ('explanation', '<explanation>', '</explanation>'),
    )

    entries = []
    for chunk in input_text.strip().split('</difference>'):
        is_complete = all(
            opening in chunk and closing in chunk
            for _, opening, closing in sections
        )
        if not is_complete:
            continue
        # Section body = text after the first opening tag, up to the next
        # closing tag.
        entries.append({
            key: chunk.split(opening)[1].split(closing)[0].strip()
            for key, opening, closing in sections
        })
    return entries
def make_llm_api_call(prompt):
    """Send ``prompt`` as a single user turn and return the raw API response.

    A new ``Anthropic`` client is constructed with no arguments on every
    call.  The model name, the 4096-token output cap and ``temperature=0``
    are fixed here.

    Args:
        prompt: Full prompt text to send as the only user message.

    Returns:
        The message object returned by ``client.messages.create``.
    """
    user_turn = {"role": "user", "content": [{"type": "text", "text": prompt}]}
    return Anthropic().messages.create(
        model="claude-3-haiku-20240307",
        max_tokens=4096,
        temperature=0,
        messages=[user_turn],
    )
def get_llm_response(extractedtext1, extractedtext2):
    """Ask the LLM to compare two texts and return the parsed differences.

    The two texts are substituted into ``DIFFERENTIATE_PROMPT``, sent through
    ``make_llm_api_call``, and the text of the first content item of the
    reply is parsed with ``extract_differences``.

    Args:
        extractedtext1: Text from the first document.
        extractedtext2: Text from the second document.

    Returns:
        A list of difference dicts as produced by ``extract_differences``.
        An empty list is returned when the reply cannot be parsed, so the
        result can always be iterated as a list of dicts.
    """
    prompt = DIFFERENTIATE_PROMPT.format(text1=extractedtext1, text2=extractedtext2)
    message = make_llm_api_call(prompt)
    message_text = message.content[0].text
    try:
        try:
            # Preferred path: parse only the body of the <differences> wrapper.
            wrapped = message_text.split("<differences>")[1].split("</differences>")[0]
            differences_list = extract_differences(wrapped.strip())
        except Exception:
            # No wrapper (IndexError) or the wrapped body failed to parse:
            # fall back to scanning the whole reply.
            differences_list = extract_differences(message_text)
    except Exception as e:
        print("Error:", e)
        # Keep the return type consistent with the success path; the previous
        # ``(message_text, [])`` tuple could not be consumed as a list of
        # difference dicts.
        return []
    return differences_list
def extract_text_with_pypdf(pdf_path):
    """Extract the text of every page of a PDF.

    Args:
        pdf_path: Whatever ``PdfReader`` accepts as its source; the caller in
            this file passes the object returned by Streamlit's file uploader.

    Returns:
        A list with one entry per page, in page order, holding the result of
        ``page.extract_text()`` for that page.
    """
    return [page.extract_text() for page in PdfReader(pdf_path).pages]
def _init_session_state():
    """Create every ``st.session_state`` entry that ``main`` reads, if missing."""
    defaults = {
        "differences_data": [],
        "display_data": {"file1": None, "file2": None, "i": 0},
        "file1": None,
        "file2": None,
        "extracted_texts": {"file1": None, "file2": None, "extracted_text_1": [], "extracted_text_2": []},
    }
    for key, value in defaults.items():
        if key not in st.session_state:
            st.session_state[key] = value


def _highlight(page_text, diff_text):
    """Wrap every occurrence of ``diff_text`` in ``page_text`` in a grey span."""
    if not diff_text:
        # Nothing to mark on this side (the difference exists only in the
        # other document); also avoids replacing on an empty pattern.
        return page_text
    # Each line gets its own <span>: the page is later rendered by joining
    # its lines with blank lines, so one span covering several lines would be
    # split apart.
    marked = "\n".join(
        [f"<span style='background-color: grey;'>{line}</span>" for line in diff_text.splitlines()]
    )
    return page_text.replace(diff_text, marked)


def _diff_page(etext1, etext2):
    """Return ``(pdata, dext1, dext2)`` for one pair of page texts.

    ``pdata`` is the list of difference dicts, ``dext1``/``dext2`` are the
    page texts with the differing passages wrapped in highlight markup.
    Results are cached in ``st.session_state.differences_data`` keyed by the
    two page texts, so a rerun does not call the LLM again for the same pair.
    """
    cache = st.session_state.differences_data
    cached = next((d for d in cache if d['etext1'] == etext1 and d['etext2'] == etext2), None)
    if cached:
        return cached['pdata'], cached['dext1'], cached['dext2']

    pdata = get_llm_response(etext1, etext2)
    dext1, dext2 = etext1, etext2
    for diff in pdata:
        # The two sides are highlighted independently, so a passage that
        # exists in only one document is still marked in that document.
        dext1 = _highlight(dext1, diff['text1'].strip())
        dext2 = _highlight(dext2, diff['text2'].strip())

    cache.append({"etext1": etext1, "etext2": etext2, "pdata": pdata, "dext1": dext1, "dext2": dext2})
    # Also store the mirrored entry so the same pages compared in the
    # opposite order hit the cache.
    reverse_pdata = [{'text1': d['text2'], 'text2': d['text1'], 'explanation': d['explanation']} for d in pdata]
    cache.append({"etext1": etext2, "etext2": etext1, "pdata": reverse_pdata, "dext1": dext2, "dext2": dext1})
    return pdata, dext1, dext2


def main():
    """Render the PDF comparison page.

    Flow of one script run:
      1. Show two PDF uploaders side by side.
      2. Once both files are present, extract their per-page text (only when
         the uploaded files changed) and show it in expanders.
      3. Compare the first ``display_data["i"]`` page pairs via the LLM and
         render the highlighted pages plus the explanations.
      4. Show a "Find Differences" button that raises the page limit by 5
         and reruns the script.
    """
    st.set_page_config(layout="wide")  # Enable wide layout
    _init_session_state()

    st.markdown('<div style="text-align: center;">' + '<h1>PDF Upload and Compare App</h1>' + '</div>', unsafe_allow_html=True)

    # Side-by-side upload widgets.
    col1, col2 = st.columns([2, 2])
    st.session_state.file1 = col1.file_uploader("**PDF 1**", type="pdf")
    st.session_state.file2 = col2.file_uploader("**PDF 2**", type="pdf")

    file1 = st.session_state.file1
    file2 = st.session_state.file2
    if not (file1 and file2):
        # Nothing to compare until both files are uploaded.
        return

    filename1 = file1.name
    filename2 = file2.name

    with st.spinner("Extracting text from PDFs"):
        try:
            shown = st.session_state.display_data
            if shown["file1"] != file1 or shown["file2"] != file2:
                # Extract first and record the new files only afterwards, so a
                # failed extraction is retried on the next run instead of
                # leaving stale text associated with the new files.
                pages1 = extract_text_with_pypdf(file1)
                pages2 = extract_text_with_pypdf(file2)
                st.session_state.extracted_texts = {"file1": file1, "file2": file2, "extracted_text_1": pages1, "extracted_text_2": pages2}
                # A new pair of files resets the number of pages to compare.
                st.session_state.display_data = {"file1": file1, "file2": file2, "i": 0}

            extracted_text1 = st.session_state.extracted_texts["extracted_text_1"]
            extracted_text2 = st.session_state.extracted_texts["extracted_text_2"]
            with col1.expander(filename1):
                st.write("\n\n".join("\n\n------------------------\n\n".join(extracted_text1).splitlines()))
            with col2.expander(filename2):
                st.write("\n\n".join("\n\n------------------------\n\n".join(extracted_text2).splitlines()))
            st.success("PDF text extraction complete")
        except Exception as e:
            st.error(f"Error extracting text from PDFs: {str(e)}")
            # Without page text there is nothing to compare.
            return

    with st.spinner("Processing Pages within the PDFS"):
        try:
            page_limit = st.session_state.display_data["i"]
            # zip stops at the shorter document, so surplus pages of the
            # longer PDF are not compared.
            for i, (etext1, etext2) in enumerate(zip(extracted_text1, extracted_text2)):
                if i >= page_limit:
                    break
                pdata, dext1, dext2 = _diff_page(etext1, etext2)
                display_text = "\n\n\n".join([f"**PDF 1:**\n\n{d['text1']}\n\n**PDF 2:**\n\n{d['text2']}\n\n**Explanation:**\n\n{d['explanation']}\n\n----------------------\n" for d in pdata])
                # NOTE(review): the page text comes straight from the uploaded
                # PDF and is rendered with unsafe_allow_html=True without
                # escaping; consider escaping it before adding the spans.
                with st.expander(f"**Page {i+1}** - {filename1}"):
                    st.markdown("\n\n".join(dext1.splitlines()), unsafe_allow_html=True)
                with st.expander(f"**Page {i+1}** - {filename2}"):
                    st.markdown("\n\n".join(dext2.splitlines()), unsafe_allow_html=True)
                st.markdown(display_text)
        except Exception as e:
            st.error(f"Error finding differences: {str(e)}")

    # Button at the bottom: each click allows five more page pairs to be
    # compared on the next run.
    if st.button("Find Differences"):
        st.session_state.display_data["i"] = st.session_state.display_data["i"] + 5
        st.rerun()
# Start the app only when this file is executed as a script, not on import.
if __name__ == "__main__":
    main()
|