import gradio as gr
from transformers import pipeline, AutoTokenizer
import nltk
from nltk.tokenize import sent_tokenize
import time
# Download NLTK sentence-tokenizer data ('punkt_tab' may also be required on newer NLTK releases)
nltk.download('punkt')
# Translation models
translation_models = {
    'Vietnamese': "Helsinki-NLP/opus-mt-en-vi",
    'Japanese': "Helsinki-NLP/opus-mt-en-jap",
    'Thai': "Helsinki-NLP/opus-mt-en-tha",
    'Spanish': "Helsinki-NLP/opus-mt-en-es"
}
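# Note: the Helsinki-NLP model IDs above are taken as-is; availability and
# output quality vary per language pair, so verify each ID on the Hugging
# Face Hub before deploying.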
# Summarization models
summarization_models = {
    'Scientific': "facebook/bart-large-cnn",
    'Literature': "google/pegasus-xsum"
}
# Tokenizer used to count tokens when splitting input text into chunks
tokenizer = AutoTokenizer.from_pretrained("facebook/bart-large-cnn")
# Helper function to initialize summarization pipeline
def get_summarizer(model_name):
    # Map the UI label ('Scientific'/'Literature') to its Hub model ID
    model_id = summarization_models.get(model_name, model_name)
    return pipeline("summarization", model=model_id)
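# Note: a new pipeline is built on every request. A minimal caching variant
# (a sketch, not part of the original design) could memoize the loader with
# functools.lru_cache so each model is only loaded once:
#
#   from functools import lru_cache
#
#   @lru_cache(maxsize=None)
#   def get_summarizer(model_name):
#       model_id = summarization_models.get(model_name, model_name)
#       return pipeline("summarization", model=model_id)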
# Initialize translation pipeline
def get_translator(language):
    model_name = translation_models.get(language)
    if model_name:
        return pipeline("translation", model=model_name)
    return None
# Helper function to split text into chunks
def split_text(text, max_tokens=1024):
    sentences = sent_tokenize(text)
    chunks = []
    current_chunk = []
    current_length = 0
    for sentence in sentences:
        sentence_length = len(tokenizer.tokenize(sentence))
        if current_length + sentence_length <= max_tokens:
            current_chunk.append(sentence)
            current_length += sentence_length
        else:
            if current_chunk:  # Avoid emitting an empty chunk when the first sentence overflows
                chunks.append(" ".join(current_chunk))
            current_chunk = [sentence]
            current_length = sentence_length
    if current_chunk:
        chunks.append(" ".join(current_chunk))
    return chunks
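# Note: token counts come from the BART tokenizer even when Pegasus is
# selected, so chunk sizes are only approximate for the 'Literature' model;
# a single sentence longer than max_tokens still ends up as one oversized
# chunk. For example, a ~3,000-token article yields roughly three chunks,
# each aligned on sentence boundaries.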
# Helper function to summarize text
def summarize_text(text, model_name):
    if len(text) < 200:  # Adjust the threshold as needed
        print("Input text is too short for summarization. Please provide longer text.")
        return ""
    summarizer = get_summarizer(model_name)
    chunks = split_text(text)
    summaries = []
    for chunk in chunks:
        try:
            summary = summarizer(chunk, max_length=150, min_length=20, do_sample=False)[0]['summary_text']
            summaries.append(summary)
        except Exception as e:
            print(f"Error summarizing chunk: {chunk}\nError: {e}")
    return " ".join(summaries)
# Helper function to translate text
def translate_text(text, language):
    translator = get_translator(language)
    if translator:
        try:
            translated_text = translator(text)[0]['translation_text']
            return translated_text
        except Exception as e:
            print(f"Error translating text: {text}\nError: {e}")
            return text
    return text
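# Note: the joined bullet block is translated in one call, so the "- "
# markers may not survive translation. A per-line variant (a sketch, not the
# original behavior) would preserve them:
#
#   lines = [l[2:] if l.startswith("- ") else l for l in text.split("\n")]
#   translated = [translator(l)[0]['translation_text'] for l in lines]
#   return "\n".join(f"- {t}" for t in translated)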
# Main pipeline: summarize, extract bullet points, then translate them
def process_text(input_text, model, language):
    start_time = time.time()
    print(f"Input text: {input_text[:500]}...")  # Show only the first 500 characters for brevity
    summary = summarize_text(input_text, model)
    if not summary:
        print("Summarization failed. Please provide longer text or try a different model.")
        return "", ""
    print(f"Summary: {summary[:500]}...")  # Show only the first 500 characters for brevity
    bullet_points = generate_bullet_points(summary)
    if not bullet_points:
        print("Bullet points generation failed.")
        return "", ""
    print(f"Bullet Points: {bullet_points}")
    translated_text = translate_text(bullet_points, language)
    print(f"Translated Text: {translated_text}")
    end_time = time.time()
    print(f"Processing time: {end_time - start_time:.2f} seconds")
    return bullet_points, translated_text
def generate_bullet_points(summary):
    print("Summary Text:", summary)
    # Extract key sentences
    sentences = sent_tokenize(summary)
    if not sentences:
        return ""
    key_sentences = sentences[:3]  # Extract the first three sentences as key points
    bullet_points = "\n".join(f"- {sentence}" for sentence in key_sentences)
    print("Bullet Points:", bullet_points)
    return bullet_points
# Create Gradio interface
iface = gr.Interface(
    fn=process_text,
    inputs=[
        gr.Textbox(label="Input Text", placeholder="Paste your text here...", lines=10),
        gr.Radio(choices=["Scientific", "Literature"], label="Summarization Model", value="Scientific"),
        gr.Dropdown(choices=["Vietnamese", "Japanese", "Thai", "Spanish"], label="Translate to", value="Vietnamese")
    ],
    outputs=[
        gr.Textbox(label="Bullet Points", lines=10),
        gr.Textbox(label="Translated Bullet Points", lines=10)
    ],
    title="Text to Bullet Points and Translation",
    description="Paste any text, choose the summarization model, and optionally translate the bullet points into Vietnamese, Japanese, Thai, or Spanish."
)
iface.launch()
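# When running locally (outside Spaces), iface.launch(share=True) also serves
# a temporary public URL in addition to the local one.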