ajoy0071998 committed on
Commit
dbce58a
·
verified ·
1 Parent(s): 3363b66

Delete app.py

Browse files
Files changed (1) hide show
  1. app.py +0 -78
app.py DELETED
@@ -1,78 +0,0 @@
1
- import streamlit as st
2
- import pdfplumber
3
- from sentence_transformers import SentenceTransformer, util
4
- import torch
5
- from typing import List
6
- from difflib import ndiff
7
-
8
# Configure the page before anything else: Streamlit expects set_page_config
# to be the first Streamlit command executed by the script.
st.set_page_config(page_title="PDF Difference Viewer", layout="wide")


@st.cache_resource
def _load_model():
    """Build the SBERT paraphrase model once and reuse it across script reruns."""
    return SentenceTransformer('paraphrase-mpnet-base-v2')


# Streamlit re-executes this script on every user interaction; going through the
# cached loader avoids re-instantiating the model on each rerun.
model = _load_model()

st.title("📄 PDF Semantic Difference Viewer")
13
-
14
- # Function to extract text from PDF
15
def extract_text(pdf_file) -> List[str]:
    """Return the non-empty, stripped text lines of a PDF, in page order.

    Args:
        pdf_file: Whatever ``pdfplumber.open`` accepts (the script passes the
            object returned by ``st.file_uploader``).

    Returns:
        One string per non-blank line of extracted text; pages that yield no
        text contribute nothing.
    """
    page_texts = []
    with pdfplumber.open(pdf_file) as pdf:
        for page in pdf.pages:
            # A page without a text layer (e.g. a scanned image) can yield
            # None or "" depending on the pdfplumber version; normalise to ""
            # so the concatenation below cannot raise TypeError.
            page_texts.append(page.extract_text() or "")

    lines = "\n".join(page_texts).split("\n")
    return [line.strip() for line in lines if line.strip()]
21
-
22
- # Function to compare text semantically
23
def compare_texts(text_a: List[str], text_b: List[str], threshold_mod=0.85, threshold_add_del=0.6):
    """Classify paragraph-level differences between two texts by embedding similarity.

    Each paragraph of ``text_a`` is paired with its most similar paragraph of
    ``text_b`` (cosine similarity of the module-level ``model`` embeddings):

    * best score >= ``threshold_mod``: the pair is a match. Byte-identical
      pairs are unchanged and are not reported; differing pairs are reported
      as ``"modified"``.
    * best score < ``threshold_add_del``: the paragraph is ``"removed"``.
    * anything in between is not reported for ``text_a``.
      NOTE(review): the ``text_b`` counterpart of such a paragraph is never
      marked as matched, so it is still reported as "added" - confirm this
      middle band is intentional.

    Every paragraph of ``text_b`` that was never chosen as a match is reported
    as ``"added"``.

    Args:
        text_a: Paragraphs of the old document.
        text_b: Paragraphs of the new document.
        threshold_mod: Minimum similarity for two paragraphs to be paired.
        threshold_add_del: Similarity below which an old paragraph counts as removed.

    Returns:
        ``(results, add_count, del_count, mod_count)`` where ``results`` is a
        list of ``(tag, old_text, new_text)`` tuples with ``tag`` one of
        ``"modified"``, ``"removed"`` or ``"added"``.
    """
    # With nothing to compare against there is no best match to look up
    # (argmax over an empty row fails), so classify directly: everything in
    # text_a was removed and everything in text_b was added.
    if not text_a or not text_b:
        results = [("removed", para_a, "") for para_a in text_a]
        results += [("added", "", para_b) for para_b in text_b]
        return results, len(text_b), len(text_a), 0

    results = []
    emb_a = model.encode(text_a, convert_to_tensor=True)
    emb_b = model.encode(text_b, convert_to_tensor=True)

    # One similarity matrix (rows: text_a, columns: text_b) instead of a
    # separate cos_sim call per paragraph.
    similarity = util.cos_sim(emb_a, emb_b)
    best_indices = torch.argmax(similarity, dim=1).tolist()

    matched_b = set()
    add_count = del_count = mod_count = 0

    for idx_a, best_match_idx in enumerate(best_indices):
        best_score = similarity[idx_a, best_match_idx].item()

        if best_score >= threshold_mod:
            matched_b.add(best_match_idx)
            # Identical text is not a modification; keep it out of the report
            # but still mark the text_b paragraph as accounted for.
            if text_a[idx_a] == text_b[best_match_idx]:
                continue
            results.append(("modified", text_a[idx_a], text_b[best_match_idx]))
            mod_count += 1
        elif best_score < threshold_add_del:
            results.append(("removed", text_a[idx_a], ""))
            del_count += 1

    # Find additions: paragraphs of text_b that no text_a paragraph matched.
    for idx_b, para_b in enumerate(text_b):
        if idx_b not in matched_b:
            results.append(("added", "", para_b))
            add_count += 1

    return results, add_count, del_count, mod_count
51
-
52
# Streamlit file uploaders, one per column.
# Local import: the text shown below comes from user-supplied PDFs and is
# embedded in raw HTML, so it must be escaped first.
from html import escape

col1, col2 = st.columns(2)
with col1:
    pdf1 = st.file_uploader("Upload First PDF", type="pdf")
with col2:
    pdf2 = st.file_uploader("Upload Second PDF", type="pdf")

if pdf1 and pdf2:
    text_a = extract_text(pdf1)
    text_b = extract_text(pdf2)

    st.success("PDFs uploaded and processed. Comparing...")
    results, add_count, del_count, mod_count = compare_texts(text_a, text_b)

    st.subheader("📊 Summary Report")
    st.markdown(f"- ✅ **Added**: {add_count}\n- ❌ **Removed**: {del_count}\n- ✏️ **Modified**: {mod_count}")

    st.subheader("📝 Detailed Comparison")
    for tag, old, new in results:
        # unsafe_allow_html=True renders the string as raw HTML; escaping keeps
        # '<', '>' and '&' in the PDF text from being interpreted as markup.
        old_html = escape(old)
        new_html = escape(new)
        if tag == "added":
            st.markdown(f"<div style='background-color:#d4edda;padding:10px;border-radius:5px;'>✅ <b>Added:</b> {new_html}</div>", unsafe_allow_html=True)
        elif tag == "removed":
            st.markdown(f"<div style='background-color:#f8d7da;padding:10px;border-radius:5px;'>❌ <b>Removed:</b> {old_html}</div>", unsafe_allow_html=True)
        elif tag == "modified":
            st.markdown(f"<div style='background-color:#fff3cd;padding:10px;border-radius:5px;'>✏️ <b>Modified:</b><br><i>Old:</i> {old_html}<br><i>New:</i> {new_html}</div>", unsafe_allow_html=True)
else:
    st.info("Please upload two PDF files to begin comparison.")