Cachoups committed on
Commit
6058808
·
verified ·
1 Parent(s): 8ed5868

Update app.py

Browse files
Files changed (1) hide show
  1. app.py +53 -2
app.py CHANGED
@@ -1,6 +1,6 @@
1
  import os
2
  import gradio as gr
3
- from transformers import pipeline
4
  import spacy
5
  import lib.read_pdf
6
  import pandas as pd
@@ -12,6 +12,56 @@ import io
12
  nlp = spacy.load('en_core_web_sm')
13
  nlp.add_pipe('sentencizer')
14
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
15
  def split_in_sentences(text):
16
  doc = nlp(text)
17
  return [str(sent).strip() for sent in doc.sents]
@@ -230,7 +280,8 @@ with gr.Blocks() as demo:
230
  with gr.Column():
231
  gr.Markdown("### PDF 1 Analysis")
232
  selected_paragraph_1 = gr.Textbox(label="Selected Paragraph 1 Content", lines=4)
233
- selected_paragraph_1.change(show, paragraph_1_dropdown, selected_paragraph_1)
 
234
  summarize_btn1 = gr.Button("Summarize Text from PDF 1")
235
  summary_textbox_1 = gr.Textbox(label="Summary for PDF 1", lines=2)
236
  summarize_btn1.click(fn=lambda p: process_paragraph_1_sum(p), inputs=paragraph_1_dropdown, outputs=summary_textbox_1)
 
1
  import os
2
  import gradio as gr
3
+ from transformers import pipeline, BertTokenizer, BertModel
4
  import spacy
5
  import lib.read_pdf
6
  import pandas as pd
 
12
  nlp = spacy.load('en_core_web_sm')
13
  nlp.add_pipe('sentencizer')
14
 
15
+ bert_model_name = "bert-base-uncased"
16
+ tokenizer = BertTokenizer.from_pretrained(bert_model_name)
17
+ model = BertModel.from_pretrained(bert_model_name)
18
+ model.eval()
19
+
20
+ import torch
21
+ import numpy as np
22
+ from sklearn.metrics.pairwise import cosine_similarity
23
+
24
def get_bert_embeddings(texts):
    """Return mean-pooled BERT embeddings for a collection of texts.

    Each text is tokenized and encoded on its own with the module-level
    ``tokenizer`` and ``model``; its vector is the mean of the final-layer
    token embeddings.

    Args:
        texts: Iterable of strings. A single bare string is treated as a
            one-element list rather than being iterated character by
            character.

    Returns:
        A 2-D ``numpy.ndarray`` with one row per input text. Empty input
        yields an array of shape ``(0, hidden_size)`` so callers that expect
        a matrix (e.g. a pairwise similarity routine) still receive one.
    """
    if isinstance(texts, str):
        texts = [texts]

    embeddings = []
    with torch.no_grad():
        for text in texts:
            inputs = tokenizer(text, return_tensors='pt', truncation=True, padding=True)
            outputs = model(**inputs)
            # Take the mean of token embeddings as the sentence embedding;
            # drop the leading batch axis (size 1 for a single text).
            embedding = outputs.last_hidden_state.mean(dim=1).squeeze(0).numpy()
            embeddings.append(embedding)

    if not embeddings:
        # np.array([]) would be 1-D; keep the result a matrix with zero rows.
        return np.empty((0, model.config.hidden_size), dtype=np.float32)
    return np.array(embeddings)
35
+
36
def compute_similarity(embedding1, embeddings2):
    """Score one embedding against every embedding in a collection.

    Args:
        embedding1: A single embedding vector.
        embeddings2: A collection of embedding vectors to compare against.

    Returns:
        The first (and only) row of the pairwise cosine-similarity result,
        i.e. one score per entry of ``embeddings2``.
    """
    # cosine_similarity works on matrices, so wrap the lone vector as one row.
    query_rows = [embedding1]
    pairwise_scores = cosine_similarity(query_rows, embeddings2)
    return pairwise_scores[0]
39
+
40
def compare_paragraph_to_list(paragraph, paragraph_list, top_n=3):
    """Rank the paragraphs in ``paragraph_list`` by similarity to ``paragraph``.

    Args:
        paragraph: The text to compare against.
        paragraph_list: Candidate paragraphs; may be empty or ``None``.
        top_n: Maximum number of results to return.

    Returns:
        Up to ``top_n`` dicts, most similar first, each with the keys
        ``'compared_paragraph'`` (the candidate text) and
        ``'similarity_score'`` (a plain ``float``). An empty list is returned
        when there are no candidates or ``top_n`` is not positive.
    """
    # Bail out before any model work: with no candidates the similarity
    # computation has nothing to compare, and a negative top_n would
    # otherwise slice from the wrong end of the ranking.
    if not paragraph_list or top_n <= 0:
        return []

    # Embed the target on its own, then all candidates.
    target_embedding = get_bert_embeddings([paragraph])[0]
    list_embeddings = get_bert_embeddings(paragraph_list)

    similarity_scores = compute_similarity(target_embedding, list_embeddings)

    # Pair each candidate with its score; float() strips the numpy scalar
    # type so the result displays and serializes cleanly.
    results = [
        {'compared_paragraph': candidate, 'similarity_score': float(score)}
        for candidate, score in zip(paragraph_list, similarity_scores)
    ]

    # Stable descending sort keeps the input order among equal scores.
    results.sort(key=lambda item: item['similarity_score'], reverse=True)
    return results[:top_n]
62
+
63
+
64
+
65
  def split_in_sentences(text):
66
  doc = nlp(text)
67
  return [str(sent).strip() for sent in doc.sents]
 
280
  with gr.Column():
281
  gr.Markdown("### PDF 1 Analysis")
282
  selected_paragraph_1 = gr.Textbox(label="Selected Paragraph 1 Content", lines=4)
283
+ selected_paragraph_btn1 = gr.Button("Nearest paragraph content from pdf 2")
284
+ selected_paragraph_btn1.click(fn=lambda p: compare_paragraph_to_list(p, stored_paragraphs_2), inputs=paragraph_1_dropdown, outputs=selected_paragraph_1)
285
  summarize_btn1 = gr.Button("Summarize Text from PDF 1")
286
  summary_textbox_1 = gr.Textbox(label="Summary for PDF 1", lines=2)
287
  summarize_btn1.click(fn=lambda p: process_paragraph_1_sum(p), inputs=paragraph_1_dropdown, outputs=summary_textbox_1)