File size: 3,295 Bytes
2e727c2 c81d8ab 2e727c2 b451def 2e727c2 fb658f7 c81d8ab b588b2c 2e727c2 b451def c81d8ab b451def c81d8ab b451def b588b2c b451def b588b2c 2e727c2 b588b2c 2e727c2 c81d8ab 2e727c2 c81d8ab 2e727c2 b588b2c 2e727c2 b588b2c fb658f7 b451def c81d8ab |
1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 |
import functools
import math

import gradio as gr
from sentence_transformers import SentenceTransformer, util
from transformers import pipeline, AutoTokenizer
# Translation model checkpoints, keyed by the target-language name that the
# UI offers in its "Translate to" dropdown.
# NOTE(review): confirm that every checkpoint id below resolves on the
# Hugging Face Hub before relying on it.
translation_models = dict(
    Vietnamese="Helsinki-NLP/opus-mt-en-vi",
    Japanese="Helsinki-NLP/opus-mt-en-jap",
    Thai="Helsinki-NLP/opus-mt-en-tha",
    Spanish="Helsinki-NLP/opus-mt-en-es",
)
# Checkpoint shared by the summarization pipeline and its tokenizer.
model_name = "sshleifer/distilbart-cnn-12-6"
# Stand-alone tokenizer for the same checkpoint; split_text() uses it to
# count tokens and cut long inputs into chunks.
tokenizer = AutoTokenizer.from_pretrained(model_name)
# Summarization pipeline used by summarize_text().
summarizer = pipeline(task="summarization", model=model_name)
# Initialize translation pipeline
@functools.lru_cache(maxsize=None)
def get_translator(language):
    """Return the translation pipeline for ``language``, or ``None``.

    Constructing a pipeline loads the underlying model, so the result is
    memoized per language: the first call for a language builds the
    pipeline and later calls reuse it instead of reloading the model.

    Args:
        language: Key into ``translation_models`` (for example
            ``"Spanish"``). Must be hashable because results are cached.

    Returns:
        A ``transformers`` translation pipeline, or ``None`` when no model
        is registered for ``language``.
    """
    checkpoint = translation_models.get(language)
    if not checkpoint:
        return None
    return pipeline("translation", model=checkpoint)
@functools.lru_cache(maxsize=1)
def _get_sentence_encoder():
    """Load the sentence-embedding model once and reuse it on later calls."""
    return SentenceTransformer('paraphrase-MiniLM-L6-v2')


# Helper function to generate bullet points
def generate_bullet_points(text):
    """Condense ``text`` into a bullet list, one bullet per sentence cluster.

    The text is split on ``'. '``, the sentences are embedded, and
    semantically similar sentences (cosine similarity >= 0.75) are grouped
    with ``util.community_detection``. The first sentence of each group is
    emitted as a bullet.

    Args:
        text: Text to convert, typically a summary.

    Returns:
        A string with one ``"- sentence"`` line per cluster, in the order
        ``community_detection`` returns the clusters. Returns ``""`` when
        ``text`` contains no non-empty sentence.
    """
    if not text:
        return ""

    # Drop empty fragments so blank "sentences" never become bullets.
    sentences = [part.strip() for part in text.split('. ')]
    sentences = [sentence for sentence in sentences if sentence]
    if not sentences:
        return ""

    embeddings = _get_sentence_encoder().encode(sentences, convert_to_tensor=True)
    # community_detection defaults to min_community_size=10, which a summary
    # of a handful of sentences can practically never satisfy; allowing
    # singleton communities keeps every distinct sentence as its own bullet.
    clusters = util.community_detection(
        embeddings, threshold=0.75, min_community_size=1
    )

    bullet_points = [sentences[cluster[0]] for cluster in clusters if cluster]
    return "\n".join(f"- {point}" for point in bullet_points)
# Helper function to split text into chunks
def split_text(text, max_tokens=1024):
    """Split ``text`` into pieces that each fit within ``max_tokens`` tokens.

    The text is tokenized once without special tokens, cut into consecutive
    windows of token ids, and each window is decoded back to a string.

    Args:
        text: The text to split.
        max_tokens: Maximum number of tokens per chunk, counting the special
            tokens the tokenizer adds when a chunk is encoded again.

    Returns:
        A list of chunk strings in document order; empty when ``text`` is
        empty or whitespace only.

    Raises:
        ValueError: If ``max_tokens`` leaves no room for content tokens.
    """
    if not text or not text.strip():
        return []

    # Every chunk is tokenized again when it is summarized, and that pass
    # adds the special tokens back, so reserve room for them here.
    budget = max_tokens - tokenizer.num_special_tokens_to_add()
    if budget <= 0:
        raise ValueError(
            f"max_tokens={max_tokens} leaves no room for content tokens"
        )

    token_ids = tokenizer(text, add_special_tokens=False, truncation=False)['input_ids']

    chunks = []
    for start in range(0, len(token_ids), budget):
        chunk_text = tokenizer.decode(
            token_ids[start:start + budget], skip_special_tokens=True
        )
        # Skip windows that decode to nothing but whitespace.
        if chunk_text.strip():
            chunks.append(chunk_text)
    return chunks
# Helper function to summarize text
def summarize_text(text):
    """Summarize ``text``, chunk by chunk, and join the partial summaries.

    Args:
        text: The text to summarize; it may exceed the model's input limit,
            in which case it is split with ``split_text`` first.

    Returns:
        The chunk summaries joined with single spaces, or ``""`` when
        ``text`` is empty or whitespace only.
    """
    # Nothing to summarize: avoid running the model on an empty prompt.
    if not text or not text.strip():
        return ""

    summaries = [
        summarizer(
            chunk,
            max_length=150,
            min_length=40,
            do_sample=False,
            # Safety net: a chunk may re-tokenize to slightly more tokens
            # than the model accepts; truncate instead of failing.
            truncation=True,
        )[0]['summary_text'].strip()
        for chunk in split_text(text)
    ]
    return " ".join(summaries)
# Helper function to translate text
def translate_text(text, language):
    """Translate ``text`` into ``language`` one line at a time.

    Each line is translated separately so the line structure of the input
    (for example a bullet list) survives translation. A leading ``"- "``
    bullet marker is kept as-is and only the content after it is sent to
    the translator.

    Args:
        text: Text to translate; may contain several lines.
        language: Target language name, looked up via ``get_translator``.

    Returns:
        The translated text, or ``text`` unchanged when it is blank or when
        no translator exists for ``language``.
    """
    # Blank input needs no model at all.
    if not text or not text.strip():
        return text

    translator = get_translator(language)
    if translator is None:
        return text

    translated_lines = []
    for line in text.split("\n"):
        marker = "- " if line.startswith("- ") else ""
        content = line[len(marker):].strip()
        if not content:
            # Preserve blank lines and bare markers untouched.
            translated_lines.append(line)
            continue
        translated = translator(content)[0]['translation_text']
        translated_lines.append(f"{marker}{translated}")
    return "\n".join(translated_lines)
def process_text(input_text, language):
    """Run the full pipeline: summarize, bullet-point, then translate.

    Args:
        input_text: Raw text supplied by the user.
        language: Target language name for the translated bullet points.

    Returns:
        A ``(bullet_points, translated_bullet_points)`` tuple of strings.
        Both are ``""`` when ``input_text`` is empty, ``None`` or
        whitespace only.
    """
    # An empty textbox should produce empty outputs, not a model error.
    if not input_text or not input_text.strip():
        return "", ""

    summary = summarize_text(input_text)
    bullet_points = generate_bullet_points(summary)
    translated_text = translate_text(bullet_points, language)
    return bullet_points, translated_text
# Create Gradio interface
iface = gr.Interface(
    fn=process_text,
    inputs=[
        gr.Textbox(label="Input Text", placeholder="Paste your text here...", lines=10),
        # Offer exactly the languages that have a registered translation
        # model, so the dropdown cannot drift from translation_models.
        gr.Dropdown(choices=list(translation_models), label="Translate to", value="Vietnamese"),
    ],
    outputs=[
        gr.Textbox(label="Bullet Points", lines=10),
        gr.Textbox(label="Translated Bullet Points", lines=10),
    ],
    title="Text to Bullet Points and Translation",
    description="Paste any text, and the program will summarize it into bullet points. Optionally, translate the bullet points into Vietnamese, Japanese, Thai, or Spanish.",
)

# Start the web server only when executed as a script, so importing this
# module (for example from a test) does not launch the app.
if __name__ == "__main__":
    iface.launch()
|