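"""Synthetic data generator (SDG).

Pulls sentences from an uploaded Excel file, prompts Mixtral-8x7B-Instruct
(via the Hugging Face Inference API) for similar sentences, translates the
pairs with facebook/m2m100_1.2B, and serves the resulting CSV through a
Gradio interface.
"""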
import csv
import logging
import random
import re
import tempfile

import gradio as gr
import pandas as pd
import streamlit as st
import torch
from huggingface_hub import InferenceClient
from transformers import M2M100ForConditionalGeneration, M2M100Tokenizer
# Prefer GPU when available; M2M100 translation on CPU is very slow.
if torch.cuda.is_available():
    device = torch.device("cuda:0")
else:
    device = torch.device("cpu")
    logging.warning("GPU not found, using CPU, translation will be very slow.")

# Text-generation backend: Mixtral-8x7B-Instruct served by the HF Inference API.
client = InferenceClient("mistralai/Mixtral-8x7B-Instruct-v0.1")
lang_id = {
    "Afrikaans": "af",
    "Amharic": "am",
    "Arabic": "ar",
    "Asturian": "ast",
    "Azerbaijani": "az",
    "Bashkir": "ba",
    "Belarusian": "be",
    "Bulgarian": "bg",
    "Bengali": "bn",
    "Breton": "br",
    "Bosnian": "bs",
    "Catalan": "ca",
    "Cebuano": "ceb",
    "Czech": "cs",
    "Welsh": "cy",
    "Danish": "da",
    "German": "de",
    "Greek": "el",
    "English": "en",
    "Spanish": "es",
    "Estonian": "et",
    "Persian": "fa",
    "Fulah": "ff",
    "Finnish": "fi",
    "French": "fr",
    "Western Frisian": "fy",
    "Irish": "ga",
    "Gaelic": "gd",
    "Galician": "gl",
    "Gujarati": "gu",
    "Hausa": "ha",
    "Hebrew": "he",
    "Hindi": "hi",
    "Croatian": "hr",
    "Haitian": "ht",
    "Hungarian": "hu",
    "Armenian": "hy",
    "Indonesian": "id",
}
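# lang_id maps display names to M2M100 language codes (a subset of the ~100
# languages the checkpoint supports); the codes feed tokenizer.src_lang and
# tokenizer.get_lang_id() in generate() below.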
@st.cache(suppress_st_warning=True, allow_output_mutation=True)
def load_model(pretrained_model: str = "facebook/m2m100_1.2B", cache_dir: str = "models/"):
    tokenizer = M2M100Tokenizer.from_pretrained(pretrained_model, cache_dir=cache_dir)
    model = M2M100ForConditionalGeneration.from_pretrained(pretrained_model, cache_dir=cache_dir).to(device)
    model.eval()
    return tokenizer, model
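# Note: st.cache is Streamlit's legacy memoization decorator; it keeps the
# 1.2B-parameter checkpoint in memory across calls, so repeated load_model()
# invocations are cheap. Outside a Streamlit runtime it logs a warning but
# still caches; a framework-agnostic alternative would be
# functools.lru_cache(maxsize=None) on the same function.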
def extract_text_from_excel(file):
    """Concatenate the second column of the uploaded sheet into one string."""
    df = pd.read_excel(file)
    # 'Unnamed: 1' is the name pandas assigns to the second column when the
    # sheet's header row leaves that cell blank.
    text = ' '.join(df['Unnamed: 1'].astype(str))
    return text
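# A more defensive variant (hypothetical, not wired into the app): fall back
# to the second column by position if the 'Unnamed: 1' header is absent.
#
# def extract_text_from_excel_safe(file):
#     df = pd.read_excel(file)
#     col = 'Unnamed: 1' if 'Unnamed: 1' in df.columns else df.columns[1]
#     return ' '.join(df[col].astype(str))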
def save_to_csv(sentence, output, filename="synthetic_data.csv"):
    with open(filename, mode='a', newline='', encoding='utf-8') as file:
        writer = csv.writer(file)
        writer.writerow([sentence, output])
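# Note: save_to_csv is a standalone helper for appending pairs to a fixed
# file; the Gradio flow in generate() writes through csv.DictWriter to a
# temporary file instead and never calls it.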
def generate(file, temperature, max_new_tokens, top_p, repetition_penalty, num_similar_sentences, language):
    text = extract_text_from_excel(file)
    sentences = text.split('.')
    random.shuffle(sentences)  # Shuffle sentences

    # Guard against temperature == 0.0, which is invalid when do_sample=True.
    temperature = max(float(temperature), 1e-2)

    # Load the shared M2M100 tokenizer/model once for all translations.
    tokenizer, model = load_model()

    with tempfile.NamedTemporaryFile(mode='w', newline='', delete=False, suffix='.csv') as tmp:
        fieldnames = ['Original Sentence', 'Generated Sentence']
        writer = csv.DictWriter(tmp, fieldnames=fieldnames)
        writer.writeheader()
        for sentence in sentences:
            sentence = sentence.strip()
            if not sentence:
                continue
            generate_kwargs = {
                "temperature": temperature,
                "max_new_tokens": max_new_tokens,
                "top_p": top_p,
                "repetition_penalty": repetition_penalty,
                "do_sample": True,
                "seed": 42,
            }
            try:
                # Stream similar sentences from Mixtral token by token.
                stream = client.text_generation(sentence, **generate_kwargs, stream=True, details=True, return_full_text=False)
                output = ""
                for response in stream:
                    output += response.token.text
                # Split the generation into individual candidate sentences.
                generated_sentences = re.split(r'(?<=[\.\!\?:])[\s\n]+', output)
                generated_sentences = [s.strip() for s in generated_sentences if s.strip() and s != '.']
                for _ in range(num_similar_sentences):
                    if not generated_sentences:
                        break
                    generated_sentence = generated_sentences.pop(random.randrange(len(generated_sentences)))
                    # Translate the generated sentence from the input language to English.
                    src_lang = lang_id[language]
                    trg_lang = lang_id["English"]
                    tokenizer.src_lang = src_lang
                    with torch.no_grad():
                        encoded_input = tokenizer(generated_sentence, return_tensors="pt").to(device)
                        generated_tokens = model.generate(**encoded_input, forced_bos_token_id=tokenizer.get_lang_id(trg_lang))
                        translated_sentence = tokenizer.batch_decode(generated_tokens, skip_special_tokens=True)[0]
                    # Translate the original sentence from English to Azerbaijani.
                    src_lang = lang_id["English"]
                    trg_lang = lang_id["Azerbaijani"]
                    tokenizer.src_lang = src_lang
                    with torch.no_grad():
                        encoded_input = tokenizer(sentence, return_tensors="pt").to(device)
                        generated_tokens = model.generate(**encoded_input, forced_bos_token_id=tokenizer.get_lang_id(trg_lang))
                        translated_sentence_az = tokenizer.batch_decode(generated_tokens, skip_special_tokens=True)[0]
                    writer.writerow({'Original Sentence': translated_sentence_az, 'Generated Sentence': translated_sentence})
            except Exception as e:
                print(f"Error generating data for sentence '{sentence}': {e}")
        tmp_path = tmp.name
    return tmp_path
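# Minimal local smoke test (hypothetical file name; bypasses the UI). Kept
# commented out because gr.Interface(...).launch() below blocks on import.
#
# csv_path = generate("input.xlsx", temperature=0.9, max_new_tokens=256,
#                     top_p=0.95, repetition_penalty=1.0,
#                     num_similar_sentences=2, language="English")
# print(csv_path)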
gr.Interface(
    fn=generate,
    inputs=[
        gr.File(label="Upload Excel File", file_count="single", file_types=[".xlsx"]),
        gr.Slider(label="Temperature", value=0.9, minimum=0.0, maximum=1.0, step=0.05, interactive=True, info="Higher values produce more diverse outputs"),
        gr.Slider(label="Max new tokens", value=256, minimum=0, maximum=5120, step=64, interactive=True, info="The maximum number of new tokens"),
        gr.Slider(label="Top-p (nucleus sampling)", value=0.95, minimum=0.0, maximum=1.0, step=0.05, interactive=True, info="Higher values sample more low-probability tokens"),
        gr.Slider(label="Repetition penalty", value=1.0, minimum=1.0, maximum=2.0, step=0.1, interactive=True, info="Penalize repeated tokens"),
        gr.Slider(label="Number of similar sentences", value=10, minimum=1, maximum=20, step=1, interactive=True, info="Number of similar sentences to generate for each original sentence"),
        gr.Dropdown(label="Language of the input data", choices=list(lang_id.keys()), value="English"),
    ],
    outputs=gr.File(label="Synthetic Data"),
    title="SDG",
    description="AYE QABIL.",
    allow_flagging="never",
).launch()