|
from transformers import AutoTokenizer |
|
from transformers import AutoModelForSeq2SeqLM |
|
import plotly.graph_objs as go |
|
import textwrap |
|
from transformers import pipeline |
|
import re |
|
import time |
|
import requests |
|
from PIL import Image |
|
import itertools |
|
import numpy as np |
|
import matplotlib.pyplot as plt |
|
import matplotlib |
|
from matplotlib.colors import ListedColormap, rgb2hex |
|
import ipywidgets as widgets |
|
from IPython.display import display, HTML |
|
import pandas as pd |
|
from pprint import pprint |
|
from tenacity import retry |
|
from tqdm import tqdm |
|
import scipy.stats |
|
import torch |
|
from transformers import GPT2LMHeadModel |
|
import seaborn as sns |
|
from transformers import AutoTokenizer, AutoModelForSeq2SeqLM, AutoModelForMaskedLM |
|
import random |
|
from nltk.corpus import stopwords |
|
from termcolor import colored |
|
import nltk |
|
from nltk.translate.bleu_score import sentence_bleu |
|
from transformers import BertTokenizer, BertModel |
|
import graphviz |
|
import gradio as gr |
|
from tree import generate_plot |
|
from paraphraser import generate_paraphrase |
|
|
|
# Make sure the NLTK stop-word corpus is available. Look for a local copy
# first so that start-up does not need a network round-trip (and still works
# offline) once the corpus has been installed; download only when missing.
try:
    nltk.data.find('corpora/stopwords')
except LookupError:
    nltk.download('stopwords')
|
|
|
|
|
|
|
def longest_common_subss(original_sentence, paraphrased_sentences):
    """Report the non-stop-words each paraphrase shares with the original.

    Both the original and every paraphrase are lower-cased and split into
    word tokens with the same regular expression; English stop words are
    dropped from the paraphrases before comparing.

    Args:
        original_sentence: The reference sentence.
        paraphrased_sentences: Iterable of paraphrase strings.

    Returns:
        A list with one dict per paraphrase, in input order, holding:
        "Original Sentence" (the lower-cased original),
        "Paraphrased Sentence" (the stop-word-free paraphrase, with every
        shared word wrapped by ``colored(word, 'green')``) and
        "Substrings Word Pair" (the set of shared words).
    """
    stop_words = set(stopwords.words('english'))
    original_sentence_lower = original_sentence.lower()
    # Tokenise the original exactly like the paraphrases; a plain split()
    # would keep punctuation glued to words ("dog.") and hide real matches.
    original_words = set(re.findall(r'\b\w+\b', original_sentence_lower))

    results = []
    for paraphrase in paraphrased_sentences:
        kept_words = [
            word
            for word in re.findall(r'\b\w+\b', paraphrase.lower())
            if word not in stop_words
        ]
        common_words = original_words.intersection(kept_words)
        # Colour whole tokens only. Chained str.replace() would also colour
        # substrings of longer words and could rewrite the escape sequences
        # inserted for a previous word.
        highlighted = ' '.join(
            colored(word, 'green') if word in common_words else word
            for word in kept_words
        )
        results.append({
            "Original Sentence": original_sentence_lower,
            "Paraphrased Sentence": highlighted,
            "Substrings Word Pair": common_words
        })
    return results
|
|
|
|
|
def common_substring_word(original_sentence, paraphrased_sentences):
    """List, per paraphrase, the non-stop-words it shares with the original.

    The original and the paraphrases are lower-cased and tokenised with the
    same word regular expression; English stop words are removed from the
    paraphrases before the comparison.

    Args:
        original_sentence: The reference sentence.
        paraphrased_sentences: Iterable of paraphrase strings.

    Returns:
        A list with one dict per paraphrase, in input order. The dict for
        the n-th paraphrase (1-based) has the key "Paraphrased Sentence n"
        (stop-word-free text with shared words wrapped by
        ``colored(word, 'green')``) and the key "Common Substrings" (the
        shared words, sorted and joined with ", ").
    """
    stop_words = set(stopwords.words('english'))
    # Same tokeniser on both sides so trailing punctuation in the original
    # ("dog.") cannot prevent a match with the paraphrase token ("dog").
    original_words = set(re.findall(r'\b\w+\b', original_sentence.lower()))

    results = []
    for number, paraphrase in enumerate(paraphrased_sentences, start=1):
        kept_words = [
            word
            for word in re.findall(r'\b\w+\b', paraphrase.lower())
            if word not in stop_words
        ]
        common_words = original_words.intersection(kept_words)
        common_substrings = ', '.join(sorted(common_words))
        # Token-wise colouring: str.replace() would also hit substrings of
        # other words and the escape codes added for earlier words.
        highlighted = ' '.join(
            colored(word, 'green') if word in common_words else word
            for word in kept_words
        )
        results.append({
            f"Paraphrased Sentence {number}": highlighted,
            "Common Substrings": common_substrings
        })
    return results
|
|
|
|
|
import re |
|
from nltk.corpus import stopwords |
|
|
|
def find_common_subsequences(sentence, str_list):
    """Return the word n-grams of ``sentence`` found in every string of ``str_list``.

    All texts are lower-cased, stripped of punctuation (every character that
    is neither a word character nor whitespace is deleted, so "don't"
    becomes "dont") and of English stop words. N-grams of 5, 4, 3, 2 and 1
    words are then tried in that order, left to right within each size. An
    n-gram is kept when it occurs, on word boundaries, in every cleaned
    string and is not already contained in a previously kept n-gram.

    Args:
        sentence: The reference sentence.
        str_list: Strings (e.g. paraphrases) to compare against. When this
            is empty every n-gram trivially qualifies.

    Returns:
        The kept n-grams as space-joined lower-case strings, longest sizes
        first. An empty list when ``sentence`` has no content words.
    """
    stop_words = set(stopwords.words('english'))

    def content_words(text):
        # Lower-case, delete punctuation, drop stop words.
        cleaned = re.sub(r'[^\w\s]', '', text.lower())
        return [word for word in cleaned.split() if word not in stop_words]

    words = content_words(sentence)
    if not words:
        # Without this guard an empty string would be reported as a common
        # "phrase" (it is a substring of everything).
        return []

    # Surround every text with spaces so that a plain substring test only
    # succeeds on whole words ("art" must not be found inside "start").
    padded_candidates = [f" {' '.join(content_words(s))} " for s in str_list]

    common_grams = []
    for size in range(min(5, len(words)), 0, -1):
        for start in range(len(words) - size + 1):
            gram = ' '.join(words[start:start + size])
            needle = f" {gram} "
            # Skip n-grams that are part of (or equal to) one already kept.
            if any(needle in f" {kept} " for kept in common_grams):
                continue
            if all(needle in candidate for candidate in padded_candidates):
                common_grams.append(gram)
    return common_grams
|
|
|
def llm_output(prompt):
    """Stand-in for a language-model call: hand the prompt back unchanged.

    No model is invoked here. The prompt is returned twice, once as the
    "generated" text and once as the sentence to analyse.

    Args:
        prompt: The user's input text.

    Returns:
        A ``(generated_text, sentence)`` tuple whose items are both ``prompt``.
    """
    generated_text = prompt
    sentence = prompt
    return generated_text, sentence
|
|
|
def highlight_phrases_with_colors(sentences, phrases):
    """Render numbered sentences as HTML with the given phrases highlighted.

    Each distinct phrase gets a background colour (hue advances by 60 degrees
    per phrase and wraps after six). Matching is case-insensitive and bound
    to word boundaries. Inside one sentence, every highlighted phrase
    carries a small numeric badge: the phrases that occur in that sentence
    are numbered 1, 2, ... following their order in ``phrases``.

    All phrases are matched in a single pass over the raw sentence, so the
    markup inserted for one phrase can never be matched by another phrase
    (words such as "black", "span" or "2" appear in that markup), and the
    "N. " sentence prefix is never highlighted. Sentence text is
    HTML-escaped before it is embedded.

    Args:
        sentences: Iterable of sentence strings.
        phrases: Iterable of phrase strings; empty strings and repeats are
            ignored.

    Returns:
        An HTML snippet (string) containing all sentences, prefixed with
        their 1-based number and separated by ``<br><br>``.
    """
    # Function-scope import: the module's import block does not provide it.
    import html

    # Colour slots are handed out per distinct phrase in first-seen order.
    color_map = {
        phrase: f'hsl({slot * 60 % 360}, 70%, 80%)'
        for slot, phrase in enumerate(dict.fromkeys(phrases))
    }
    # An empty phrase would match at every word boundary; leave it out.
    active_phrases = [phrase for phrase in color_map if phrase]

    matcher = None
    if active_phrases:
        # One capture group per phrase: Match.lastindex then tells which
        # phrase produced a match.
        alternatives = '|'.join(f'({re.escape(phrase)})' for phrase in active_phrases)
        matcher = re.compile(rf'\b(?:{alternatives})\b', flags=re.IGNORECASE)

    def render(sentence):
        matches = list(matcher.finditer(sentence)) if matcher else []
        # Badge numbers count only the phrases present in this sentence.
        badge_for_group = {
            group: number
            for number, group in enumerate(sorted({m.lastindex for m in matches}), start=1)
        }
        pieces = []
        cursor = 0
        for m in matches:
            pieces.append(html.escape(sentence[cursor:m.start()], quote=False))
            color = color_map[active_phrases[m.lastindex - 1]]
            index = badge_for_group[m.lastindex]
            pieces.append(
                f'<span style="background-color: {color}; font-weight: bold;'
                f' padding: 2px 4px; border-radius: 2px; position: relative;">'
                f'<span style="background-color: black; color: white; border-radius: 50%;'
                f' padding: 2px 5px; margin-right: 5px;">{index}</span>'
                f'{html.escape(m.group(0), quote=False)}'
                f'</span>'
            )
            cursor = m.end()
        pieces.append(html.escape(sentence[cursor:], quote=False))
        return ''.join(pieces)

    highlighted_html = [
        f"{idx}. {render(sentence)}"
        for idx, sentence in enumerate(sentences, start=1)
    ]
    final_html = "<br><br>".join(highlighted_html)
    # NOTE(review): "border: solid 1px #" has an empty colour value, which
    # browsers will most likely discard; kept unchanged, confirm the intended
    # border colour.
    return f'''
    <div style="border: solid 1px #; padding: 16px; background-color: #FFFFFF; color: #374151; box-shadow: 0 4px 8px rgba(0, 0, 0, 0.1); border-radius: 2px;">
        <h3 style="margin-top: 0; font-size: 1em; color: #111827;">Paraphrased And Highlighted Text</h3>
        <div style="background-color: #F5F5F5; line-height: 1.6; padding: 15px; border-radius: 2px;">{final_html}</div>
    </div>
    '''
|
|
|
import re |
|
|
|
def highlight_phrases_with_colors_single_sentence(sentence, phrases):
    """Render one sentence as HTML with the given phrases highlighted.

    Each distinct phrase gets a background colour (hue advances by 60 degrees
    per phrase and wraps after six). Matching is case-insensitive and bound
    to word boundaries. Every highlighted phrase carries a numeric badge:
    the phrases that occur in the sentence are numbered 1, 2, ... following
    their order in ``phrases``.

    All phrases are matched in one pass over the raw sentence, so markup
    inserted for one phrase can never be matched by another (words such as
    "white", "span" or "2" appear in that markup). The sentence text is
    HTML-escaped before it is embedded.

    Args:
        sentence: The sentence to render.
        phrases: Iterable of phrase strings; empty strings and repeats are
            ignored.

    Returns:
        An HTML snippet (string) titled "Selected Sentence".
    """
    # Function-scope import: the module's import block does not provide it.
    import html

    # Colour slots are handed out per distinct phrase in first-seen order.
    color_map = {
        phrase: f'hsl({slot * 60 % 360}, 70%, 80%)'
        for slot, phrase in enumerate(dict.fromkeys(phrases))
    }
    # An empty phrase would match at every word boundary; leave it out.
    active_phrases = [phrase for phrase in color_map if phrase]

    matches = []
    if active_phrases:
        # One capture group per phrase: Match.lastindex identifies the phrase.
        alternatives = '|'.join(f'({re.escape(phrase)})' for phrase in active_phrases)
        matches = list(
            re.finditer(rf'\b(?:{alternatives})\b', sentence, flags=re.IGNORECASE)
        )

    # Badge numbers count only the phrases that actually occur.
    badge_for_group = {
        group: number
        for number, group in enumerate(sorted({m.lastindex for m in matches}), start=1)
    }

    pieces = []
    cursor = 0
    for m in matches:
        pieces.append(html.escape(sentence[cursor:m.start()], quote=False))
        color = color_map[active_phrases[m.lastindex - 1]]
        index = badge_for_group[m.lastindex]
        pieces.append(
            f'<span style="background-color: {color}; font-weight: bold;'
            f' padding: 2px 4px; border-radius: 2px; position: relative;">'
            f'<span style="background-color: black; color: white; border-radius: 50%;'
            f' padding: 2px 5px; margin-right: 5px;">{index}</span>'
            f'{html.escape(m.group(0), quote=False)}'
            f'</span>'
        )
        cursor = m.end()
    pieces.append(html.escape(sentence[cursor:], quote=False))

    final_html = ''.join(pieces)
    # NOTE(review): "border: solid 1px #" has an empty colour value, which
    # browsers will most likely discard; kept unchanged, confirm the intended
    # border colour.
    return f'''
    <div style="border: solid 1px #; padding: 16px; background-color: #FFFFFF; color: #374151; box-shadow: 0 4px 8px rgba(0, 0, 0, 0.1); border-radius: 2px;">
        <h3 style="margin-top: 0; font-size: 1em; color: #111827;">Selected Sentence</h3>
        <div style="background-color: #F5F5F5; line-height: 1.6; padding: 15px; border-radius: 2px;">{final_html}</div>
    </div>
    '''
|
|
|
|
|
|
|
def model(prompt):
    """Run the paraphrase-and-highlight pipeline for one prompt.

    Steps: obtain the text via ``llm_output``, paraphrase it with
    ``generate_paraphrase``, find the n-grams shared by the text and all
    paraphrases, render both as highlighted HTML and build the plot with
    ``generate_plot``.

    Args:
        prompt: The user's input text.

    Returns:
        A 4-tuple: the generated text, the HTML for the highlighted
        generated text, the HTML for the highlighted paraphrases, and the
        object returned by ``generate_plot``.
    """
    generated, sentence = llm_output(prompt)
    paraphrases = generate_paraphrase(sentence)
    # The previous per-word overlap computation (longest_common_subss) was
    # dropped here: its result was never used or returned.
    common_grams = find_common_subsequences(sentence, paraphrases)
    generated_highlighted = highlight_phrases_with_colors_single_sentence(generated, common_grams)
    paraphrases_highlighted = highlight_phrases_with_colors(paraphrases, common_grams)
    plot = generate_plot(sentence)
    return generated, generated_highlighted, paraphrases_highlighted, plot
|
|
|
# Gradio UI: a prompt box, Submit/Clear buttons, and four output areas
# (plain text, two HTML panels and a plot) that are filled by `model`.
with gr.Blocks(theme = gr.themes.Monochrome()) as demo:
    gr.Markdown("# Paraphrases the Text and Highlights the Non-melting Points")

    with gr.Row():
        user_input = gr.Textbox(label="User Prompt")

    with gr.Row():
        submit_button = gr.Button("Submit")
        clear_button = gr.Button("Clear")

    with gr.Row():
        ai_output = gr.Textbox(label="AI-generated Text (Llama3)")

    with gr.Row():
        selected_sentence = gr.HTML()

    with gr.Row():
        html_output = gr.HTML()

    with gr.Row():
        tree = gr.Plot()

    submit_button.click(model, inputs=user_input, outputs=[ai_output, selected_sentence, html_output, tree])

    # A handler must return one value per output component. The former
    # second clear handler returned a single "" for four outputs, which
    # Gradio rejects. Reset everything from one handler instead; the plot
    # is cleared with None.
    clear_button.click(
        lambda: ("", "", "", "", None),
        inputs=None,
        outputs=[user_input, ai_output, selected_sentence, html_output, tree],
    )


# share=True additionally requests a public Gradio share link.
demo.launch(share=True)