Spaces:
Sleeping
Sleeping
Update app.py
Browse files
app.py
CHANGED
@@ -1,6 +1,6 @@
|
|
1 |
import os
|
2 |
import gradio as gr
|
3 |
-
from transformers import pipeline
|
4 |
import spacy
|
5 |
import lib.read_pdf
|
6 |
import pandas as pd
|
@@ -12,6 +12,56 @@ import io
|
|
12 |
# Load spaCy's small English pipeline once at import time; split_in_sentences
# calls this shared object for every text it processes.
nlp = spacy.load('en_core_web_sm')
# Append the 'sentencizer' component to the loaded pipeline.
nlp.add_pipe('sentencizer')
|
14 |
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
15 |
def split_in_sentences(text):
    """Return the sentences of ``text`` as whitespace-stripped strings.

    The text is run through the module-level spaCy pipeline ``nlp`` and the
    resulting sentence spans are converted to plain strings.
    """
    return [str(span).strip() for span in nlp(text).sents]
|
@@ -230,7 +280,8 @@ with gr.Blocks() as demo:
|
|
230 |
with gr.Column():
|
231 |
gr.Markdown("### PDF 1 Analysis")
|
232 |
selected_paragraph_1 = gr.Textbox(label="Selected Paragraph 1 Content", lines=4)
|
233 |
-
|
|
|
234 |
summarize_btn1 = gr.Button("Summarize Text from PDF 1")
|
235 |
summary_textbox_1 = gr.Textbox(label="Summary for PDF 1", lines=2)
|
236 |
summarize_btn1.click(fn=lambda p: process_paragraph_1_sum(p), inputs=paragraph_1_dropdown, outputs=summary_textbox_1)
|
|
|
1 |
import os
|
2 |
import gradio as gr
|
3 |
+
from transformers import pipeline, BertTokenizer, BertModel
|
4 |
import spacy
|
5 |
import lib.read_pdf
|
6 |
import pandas as pd
|
|
|
12 |
# Load spaCy's small English pipeline once at import time; split_in_sentences
# calls this shared object for every text it processes.
nlp = spacy.load('en_core_web_sm')
# Append the 'sentencizer' component to the loaded pipeline.
nlp.add_pipe('sentencizer')
|
14 |
|
15 |
+
# Checkpoint name shared by the tokenizer and the encoder loaded below.
bert_model_name = "bert-base-uncased"
tokenizer = BertTokenizer.from_pretrained(bert_model_name)
# eval() puts the module in inference mode and returns the module itself,
# so it can be chained directly onto the load call.
model = BertModel.from_pretrained(bert_model_name).eval()
|
19 |
+
|
20 |
+
import torch
|
21 |
+
import numpy as np
|
22 |
+
from sklearn.metrics.pairwise import cosine_similarity
|
23 |
+
|
24 |
+
def get_bert_embeddings(texts):
    """Obtain BERT embeddings for a list of texts.

    Each text is tokenized and encoded on its own with the module-level
    ``tokenizer`` and ``model``; the sentence embedding is the mean of the
    token embeddings in ``last_hidden_state``.

    Args:
        texts: Sequence of strings to embed. May be empty.

    Returns:
        A NumPy array with one row per input text. For empty input the array
        has shape ``(0, hidden_size)`` so that it is still two-dimensional
        for callers that compute pairwise similarities.
    """
    if len(texts) == 0:
        # np.array([]) would be 1-D; keep the (n_texts, hidden_size) layout.
        return np.empty((0, model.config.hidden_size), dtype=np.float32)

    embeddings = []
    # Inference only: skip gradient tracking.
    with torch.no_grad():
        for text in texts:
            inputs = tokenizer(text, return_tensors='pt', truncation=True, padding=True)
            outputs = model(**inputs)
            # Take the mean of token embeddings as the sentence embedding.
            # squeeze(0) removes only the batch axis added by return_tensors='pt'.
            embedding = outputs.last_hidden_state.mean(dim=1).squeeze(0).numpy()
            embeddings.append(embedding)
    return np.array(embeddings)
|
35 |
+
|
36 |
+
def compute_similarity(embedding1, embeddings2):
    """Return the cosine similarity of ``embedding1`` to each row of ``embeddings2``.

    ``embedding1`` is wrapped in a list so it is treated as a single-row
    matrix; the first (and only) row of the resulting similarity matrix is
    returned.
    """
    similarity_matrix = cosine_similarity([embedding1], embeddings2)
    return similarity_matrix[0]
|
39 |
+
|
40 |
+
def compare_paragraph_to_list(paragraph, paragraph_list, top_n=3):
    """Compare a paragraph to a list of paragraphs and return the most similar ones.

    Args:
        paragraph: Text to compare against every candidate.
        paragraph_list: Candidate paragraphs. May be empty or ``None``.
        top_n: Maximum number of results to return.

    Returns:
        A list of at most ``top_n`` dicts with the keys
        ``'compared_paragraph'`` and ``'similarity_score'``, ordered from the
        highest to the lowest similarity score. An empty list is returned
        when there are no candidates.
    """
    # Nothing to rank: return early rather than pushing an empty embedding
    # array into the similarity computation.
    if paragraph_list is None or len(paragraph_list) == 0:
        return []

    # Embed the target on its own, then all candidates.
    target_embedding = get_bert_embeddings([paragraph])[0]
    list_embeddings = get_bert_embeddings(paragraph_list)

    similarity_scores = compute_similarity(target_embedding, list_embeddings)

    # Pair each candidate with its score; float() turns the NumPy scalar into
    # a plain Python number so results print and serialize cleanly.
    results = [
        {'compared_paragraph': candidate, 'similarity_score': float(score)}
        for candidate, score in zip(paragraph_list, similarity_scores)
    ]

    # Highest similarity first, then keep the top N.
    results.sort(key=lambda item: item['similarity_score'], reverse=True)
    return results[:top_n]
|
62 |
+
|
63 |
+
|
64 |
+
|
65 |
def split_in_sentences(text):
    """Return the sentences of ``text`` as whitespace-stripped strings.

    The text is run through the module-level spaCy pipeline ``nlp`` and the
    resulting sentence spans are converted to plain strings.
    """
    return [str(span).strip() for span in nlp(text).sents]
|
|
|
280 |
with gr.Column():
|
281 |
gr.Markdown("### PDF 1 Analysis")
|
282 |
selected_paragraph_1 = gr.Textbox(label="Selected Paragraph 1 Content", lines=4)
|
283 |
+
selected_paragraph_btn1 = gr.Button("Nearest paragraph content from pdf 2")
|
284 |
+
selected_paragraph_btn1.click(fn=lambda p: compare_paragraph_to_list(p, stored_paragraphs_2), inputs=paragraph_1_dropdown, outputs=selected_paragraph_1)
|
285 |
summarize_btn1 = gr.Button("Summarize Text from PDF 1")
|
286 |
summary_textbox_1 = gr.Textbox(label="Summary for PDF 1", lines=2)
|
287 |
summarize_btn1.click(fn=lambda p: process_paragraph_1_sum(p), inputs=paragraph_1_dropdown, outputs=summary_textbox_1)
|