"""Gradio app that generates synthetic sentences and translates them.

For every sentence found in an uploaded Excel sheet, the app asks a hosted
Mixtral model for text, splits the completion into sentences, translates the
picked sentences to English and the original sentence to Azerbaijani with
M2M100, and writes the pairs to a CSV file offered for download.
"""

import csv
import functools
import io
import logging
import os
import random
import re
import tempfile
import time
from io import BytesIO

import gradio as gr
import pandas as pd
import streamlit as st
import torch
from huggingface_hub import InferenceClient
from transformers import M2M100ForConditionalGeneration, M2M100Tokenizer

if torch.cuda.is_available():
    device = torch.device("cuda:0")
else:
    device = torch.device("cpu")
    logging.warning("GPU not found, using CPU, translation will be very slow.")

client = InferenceClient("mistralai/Mixtral-8x7B-Instruct-v0.1")

# Display name -> M2M100 language code. The keys are shown in the UI dropdown.
lang_id = {
    "Afrikaans": "af",
    "Amharic": "am",
    "Arabic": "ar",
    "Asturian": "ast",
    "Azerbaijani": "az",
    "Bashkir": "ba",
    "Belarusian": "be",
    "Bulgarian": "bg",
    "Bengali": "bn",
    "Breton": "br",
    "Bosnian": "bs",
    "Catalan": "ca",
    "Cebuano": "ceb",
    "Czech": "cs",
    "Welsh": "cy",
    "Danish": "da",
    "German": "de",
    "Greek": "el",
    "English": "en",
    "Spanish": "es",
    "Estonian": "et",
    "Persian": "fa",
    "Fulah": "ff",
    "Finnish": "fi",
    "French": "fr",
    "Western Frisian": "fy",
    "Irish": "ga",
    "Gaelic": "gd",
    "Galician": "gl",
    "Gujarati": "gu",
    "Hausa": "ha",
    "Hebrew": "he",
    "Hindi": "hi",
    "Croatian": "hr",
    "Haitian": "ht",
    "Hungarian": "hu",
    "Armenian": "hy",
    "Indonesian": "id",
}

# Splits a completion after '.', '!', '?' or ':' followed by whitespace.
_SENTENCE_BOUNDARY = re.compile(r'(?<=[\.\!\?:])[\s\n]+')


# This app is served by Gradio, not Streamlit, so a plain in-process cache is
# used instead of the deprecated ``st.cache`` decorator.
@functools.lru_cache(maxsize=None)
def load_model(pretrained_model: str = "facebook/m2m100_1.2B",
               cache_dir: str = "models/"):
    """Load the M2M100 tokenizer and model once per argument combination.

    Args:
        pretrained_model: Model identifier passed to ``from_pretrained``.
        cache_dir: Directory in which the downloaded weights are stored.

    Returns:
        A ``(tokenizer, model)`` tuple; the model is moved to ``device`` and
        switched to evaluation mode.
    """
    tokenizer = M2M100Tokenizer.from_pretrained(pretrained_model,
                                                cache_dir=cache_dir)
    model = M2M100ForConditionalGeneration.from_pretrained(
        pretrained_model, cache_dir=cache_dir).to(device)
    model.eval()
    return tokenizer, model


def extract_text_from_excel(file):
    """Return the text of the ``'Unnamed: 1'`` column joined with spaces.

    Empty cells are skipped so that they do not end up as the literal
    string ``"nan"`` in the text that is later split into sentences.
    """
    df = pd.read_excel(file)
    return ' '.join(df['Unnamed: 1'].dropna().astype(str))


def save_to_csv(sentence, output, filename="synthetic_data.csv"):
    """Append one ``[sentence, output]`` row to ``filename`` (UTF-8 CSV)."""
    with open(filename, mode='a', newline='', encoding='utf-8') as file:
        writer = csv.writer(file)
        writer.writerow([sentence, output])


def _translate(text, src_code, trg_code):
    """Translate ``text`` between two M2M100 language codes."""
    tokenizer, model = load_model()
    tokenizer.src_lang = src_code
    with torch.no_grad():
        encoded_input = tokenizer(text, return_tensors="pt").to(device)
        generated_tokens = model.generate(
            **encoded_input,
            forced_bos_token_id=tokenizer.get_lang_id(trg_code))
    return tokenizer.batch_decode(generated_tokens,
                                  skip_special_tokens=True)[0]


def generate(file, temperature, max_new_tokens, top_p, repetition_penalty,
             num_similar_sentences, language="English"):
    """Generate synthetic sentence pairs from an Excel file into a CSV.

    Args:
        file: Excel file accepted by ``pandas.read_excel``.
        temperature: Sampling temperature forwarded to the text generator.
        max_new_tokens: Token budget forwarded to the text generator.
        top_p: Nucleus-sampling threshold forwarded to the text generator.
        repetition_penalty: Repetition penalty forwarded to the generator.
        num_similar_sentences: Maximum number of generated sentences kept
            per original sentence.
        language: Key of ``lang_id`` naming the language the generated
            sentences are translated from.

    Returns:
        Path of a temporary CSV file with the columns ``Original Sentence``
        and ``Generated Sentence``.

    Raises:
        ValueError: If ``language`` is not a key of ``lang_id``.
    """
    if language not in lang_id:
        raise ValueError(f"Unsupported language: {language!r}")
    src_code = lang_id[language]
    english_code = lang_id["English"]
    azerbaijani_code = lang_id["Azerbaijani"]
    # Sliders may deliver floats; random.sample needs an integer count.
    wanted = int(num_similar_sentences)

    text = extract_text_from_excel(file)
    sentences = text.split('.')
    random.shuffle(sentences)

    generate_kwargs = {
        "temperature": temperature,
        "max_new_tokens": max_new_tokens,
        "top_p": top_p,
        "repetition_penalty": repetition_penalty,
        "do_sample": True,
        "seed": 42,
    }

    # Explicit UTF-8 so non-ASCII translations are written correctly
    # regardless of the platform default encoding.
    with tempfile.NamedTemporaryFile(mode='w', newline='', delete=False,
                                     suffix='.csv', encoding='utf-8') as tmp:
        fieldnames = ['Original Sentence', 'Generated Sentence']
        writer = csv.DictWriter(tmp, fieldnames=fieldnames)
        writer.writeheader()

        for sentence in sentences:
            sentence = sentence.strip()
            if not sentence:
                continue
            # Best effort: a failure on one sentence must not abort the run.
            try:
                stream = client.text_generation(
                    sentence, **generate_kwargs, stream=True, details=True,
                    return_full_text=False)
                output = "".join(response.token.text for response in stream)

                pieces = map(str.strip, _SENTENCE_BOUNDARY.split(output))
                candidates = [p for p in pieces if p and p != '.']
                picked = random.sample(candidates,
                                       min(wanted, len(candidates)))
                if not picked:
                    continue

                # The original sentence is the same for every picked
                # sentence, so it is translated only once.
                # NOTE(review): the original sentence is treated as English
                # regardless of ``language`` - confirm this is intended.
                translated_sentence_az = _translate(
                    sentence, english_code, azerbaijani_code)

                for generated_sentence in picked:
                    translated_sentence = _translate(
                        generated_sentence, src_code, english_code)
                    writer.writerow({
                        'Original Sentence': translated_sentence_az,
                        'Generated Sentence': translated_sentence,
                    })
            except Exception:
                logging.exception(
                    "Error generating data for sentence %r", sentence)

    return tmp.name


gr.Interface(
    fn=generate,
    inputs=[
        gr.File(label="Upload Excel File", file_count="single",
                file_types=[".xlsx"]),
        gr.Slider(label="Temperature", value=0.9, minimum=0.0, maximum=1.0,
                  step=0.05, interactive=True,
                  info="Higher values produce more diverse outputs"),
        gr.Slider(label="Max new tokens", value=256, minimum=0, maximum=5120,
                  step=64, interactive=True,
                  info="The maximum numbers of new tokens"),
        gr.Slider(label="Top-p (nucleus sampling)", value=0.95, minimum=0.0,
                  maximum=1, step=0.05, interactive=True,
                  info="Higher values sample more low-probability tokens"),
        gr.Slider(label="Repetition penalty", value=1.0, minimum=1.0,
                  maximum=2.0, step=0.1, interactive=True,
                  info="Penalize repeated tokens"),
        gr.Slider(label="Number of similar sentences", value=10, minimum=1,
                  maximum=20, step=1, interactive=True,
                  info="Number of similar sentences to generate for each original sentence"),
        gr.Dropdown(label="Language of the input data",
                    choices=list(lang_id.keys()), value="English"),
    ],
    outputs=gr.File(label="Synthetic Data "),
    title="SDG",
    description="AYE QABIL.",
    allow_flagging="never",
).launch()