# Hugging Face Space: Amitontheweb/Text_Paraphraser_Title_Generator
# Space status when this copy was captured: Sleeping
# File size: 4,502 bytes

import html
import re

import gradio as gr
import numpy as np
import pandas as pd
import torch
from transformers import AutoTokenizer, AutoModelForSeq2SeqLM

# Hugging Face Hub checkpoint ids used by this app.
_PARAPHRASER_CHECKPOINT = "humarin/chatgpt_paraphraser_on_T5_base"
_TITLE_CHECKPOINT = "Ateeqq/news-title-generator"

# Paraphraser: tokenizer and seq2seq model, loaded once at import time.
tokenizer = AutoTokenizer.from_pretrained(_PARAPHRASER_CHECKPOINT)
model = AutoModelForSeq2SeqLM.from_pretrained(_PARAPHRASER_CHECKPOINT)

# Title generator: tokenizer and seq2seq model, loaded once at import time.
tokenizer_gen_title = AutoTokenizer.from_pretrained(_TITLE_CHECKPOINT)
model_gen_title = AutoModelForSeq2SeqLM.from_pretrained(_TITLE_CHECKPOINT)

def generate_title(input_text):
    """Generate one title for ``input_text`` with the title-generator model.

    The text is tokenized with ``tokenizer_gen_title`` and passed to
    ``model_gen_title.generate`` with sampling enabled (temperature 0.8,
    top-k 20, at most 100 new tokens). The first returned sequence is
    decoded with special tokens skipped and returned as a string.

    Sampling is on, so repeated calls with the same text may return
    different titles.
    """
    # Inputs are left on the default device; nothing is moved to a GPU here.
    encoded_input = tokenizer_gen_title.encode(input_text, return_tensors="pt")

    sampling_options = {
        "max_new_tokens": 100,
        "do_sample": True,
        "temperature": 0.8,
        "top_k": 20,
    }
    generated = model_gen_title.generate(encoded_input, **sampling_options)

    first_sequence = generated[0]
    return tokenizer_gen_title.decode(first_sequence, skip_special_tokens=True)



# One whitespace character that follows ".", "?" or "!", unless the text
# before it looks like a dotted abbreviation ("e.g.") or a capitalised
# two-letter abbreviation ("Mr."). Compiled once so calls do not rebuild it.
_SENTENCE_BOUNDARY = re.compile(r'(?<!\w\.\w.)(?<![A-Z][a-z]\.)(?<=\.|\?|\!)\s')


def split_into_sentences(paragraph):
    """Split ``paragraph`` into a list of sentences for the paraphraser.

    The split happens on a single whitespace character that comes right
    after ``.``, ``?`` or ``!``; that whitespace character is consumed.
    Any further whitespace (for example a second newline) stays attached
    to the start of the following sentence.

    Fragments that are empty or contain only whitespace are dropped, so a
    trailing space or newline after the last sentence does not produce a
    blank entry.

    Args:
        paragraph: The text to split.

    Returns:
        A list of non-blank sentence strings; empty if ``paragraph`` has
        no non-whitespace content.
    """
    fragments = _SENTENCE_BOUNDARY.split(paragraph)
    # re.split yields "" (or leftover whitespace) when the text ends with a
    # boundary; such fragments carry nothing to paraphrase.
    return [fragment for fragment in fragments if fragment.strip()]

def paraphrase(
    text,
    beam_search,
    temperature=0.8,
    max_length=128
):
    """Reword ``text`` sentence by sentence and generate three titles for it.

    Each sentence returned by ``split_into_sentences`` is sent to the
    paraphraser model on its own with a ``paraphrase: `` prefix, and the
    first decoded sequence is kept. The reworded sentences are joined with
    single spaces, and ``generate_title`` is called three times on the
    joined text.

    Args:
        text: Input paragraph(s).
        beam_search: When true, generate with 20 beams; otherwise 1.
        temperature: Sampling temperature passed to ``model.generate``.
        max_length: ``max_length`` passed to ``model.generate``.

    Returns:
        A ``(titles_html, paraphrased_text)`` tuple. ``titles_html`` holds
        three ``Title N: ...<br>`` entries with the generated titles
        HTML-escaped. For empty or whitespace-only input both items are
        empty strings.
    """
    # Always return two values: the interface has two output components,
    # and there is nothing to send to the models for blank input.
    if not text or not text.strip():
        return ("", "")

    reworded_sentences = []
    for sentence in split_into_sentences(text):
        # Skip blank fragments (e.g. after a trailing space) rather than
        # asking the model to paraphrase nothing.
        if not sentence.strip():
            continue

        input_ids = tokenizer(
            f'paraphrase: {sentence}',
            return_tensors="pt", padding="longest",
        ).input_ids

        outputs = model.generate(
            input_ids,
            do_sample=True,
            num_beams=20 if beam_search else 1,
            temperature=temperature,
            max_length=max_length,
            no_repeat_ngram_size=4,
        )

        decoded = tokenizer.batch_decode(outputs, skip_special_tokens=True)
        reworded_sentences.append(decoded[0])

    paraphrased_text = " ".join(reworded_sentences)

    # Titles are model output shown in an HTML component, so escape them
    # before inserting them into markup.
    titles_list = "".join(
        f"Title {number}: {html.escape(generate_title(paraphrased_text))}<br>"
        for number in range(1, 4)
    )

    return (titles_list, paraphrased_text)


# Help text shown with the interface. List items are wrapped in <ul> and
# every <p> is closed so the markup is well-formed.
_ARTICLE_HTML = (
    "<div align=left>"
    "<h1>AI Paraphraser and Title Generator</h1>"
    "<ul>"
    "<li>Each sentence is rephrased separately without context.</li>"
    "<li>Temperature: Increase value for more creative rewordings. "
    "Higher values may corrupt the sentence. "
    "Reset value after pressing 'Clear'</li>"
    "<li>Beam search: Try for safer and conservative rephrasing.</li>"
    "</ul>"
    "<p>Models:</p>"
    "<ul>"
    "<li>Training set derived by using Chat-GPT3.5. No competition intended.</li>"
    "<li>Original models: humarin/chatgpt_paraphraser_on_T5_base and "
    "Ateeqq/news-title-generator. "
    "Deployment code modified for long text inputs.</li>"
    "</ul>"
    "<p>Parameter details:</p>"
    "<ul>"
    "<li>For rephraser: Beam search: No. of beams = 20, "
    "no_repeat_ngram_size=4, do_sample=True.</li>"
    "<li>For title generator: do_sample=True, temperature=0.8, top_k = 20</li>"
    "</ul>"
    "</div>"
)

# Inputs map positionally onto paraphrase(text, beam_search, temperature);
# outputs map onto its (titles_html, paraphrased_text) return value.
iface = gr.Interface(
    fn=paraphrase,
    inputs=[
        gr.Textbox(label="Paste text in the input box and press 'Submit'.", lines=10),
        gr.Checkbox(label="Beam search"),
        gr.Slider(minimum=0.1, maximum=2, value=0.8, label="Temperature"),
    ],
    outputs=[
        gr.HTML(label="Titles:"),
        gr.Textbox(label="Rephrased text:", lines=15),
    ],
    title="AI Paraphraser with Title Generator",
    description="Sentence-to-sentence rewording backed with GPT-3.5 training set",
    article=_ARTICLE_HTML,
    flagging_mode='never',
)

iface.launch()