Lenylvt committed on
Commit
41ec54b
·
verified ·
1 Parent(s): 40038ad

Update app.py

Browse files
Files changed (1) hide show
  1. app.py +62 -54
app.py CHANGED
@@ -1,62 +1,70 @@
1
- from huggingface_hub import InferenceClient
 
2
  import gradio as gr
 
 
3
 
4
- client = InferenceClient(
5
- "mistralai/Mixtral-8x7B-Instruct-v0.1"
6
- )
 
 
 
7
 
8
- def format_prompt(message, history, target_language):
9
- prompt = "<s>"
10
- for user_prompt, bot_response in history:
11
- prompt += f"[INST] {user_prompt} [/INST]"
12
- prompt += f" {bot_response}</s> "
13
- prompt += f"[INST] {message} [/INST] [Translate to: {target_language}]"
14
- return prompt
15
-
16
- def generate_from_srt(file_content, target_language, temperature=0.9, max_new_tokens=256, top_p=0.95, repetition_penalty=1.0):
17
- # Process the SRT file content as needed before using it as a prompt
18
- # For example, extracting text and removing timestamps if necessary
19
- # Directly using the file content for simplicity here
20
-
21
- temperature = float(temperature)
22
- if temperature < 1e-2:
23
- temperature = 1e-2
24
- top_p = float(top_p)
25
-
26
- generate_kwargs = dict(
27
- temperature=temperature,
28
- max_new_tokens=max_new_tokens,
29
- top_p=top_p,
30
- repetition_penalty=repetition_penalty,
31
- do_sample=True,
32
- seed=42,
33
- )
34
-
35
- formatted_prompt = format_prompt(file_content, [], target_language)
36
- stream = client.text_generation(formatted_prompt, **generate_kwargs, stream=True, details=True, return_full_text=False)
37
- output = ""
38
-
39
- for response in stream:
40
- output += response.token.text
41
- return output
42
-
43
- def handle_file(file_info, target_language):
44
- # Directly use the file content if it's a text file
45
- if isinstance(file_info, str):
46
- file_content = file_info
47
- else:
48
- # If file_info is not a string, it might be a binary file
49
- file_content = file_info.decode('utf-8')
50
 
51
- return generate_from_srt(file_content, target_language)
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
52
 
53
  iface = gr.Interface(
54
- fn=handle_file,
55
- inputs=[gr.File(label="Upload SRT File"), gr.Textbox(label="Target Language", placeholder="Enter target language (e.g., Spanish, French)")],
56
- outputs="text",
57
- title="SRT File Translation",
58
- description="Translate the content of SRT files to the specified language.",
59
- concurrency_limit=20,
 
 
 
60
  )
61
 
62
- iface.launch()
 
import functools
import io
import os
import tempfile

import requests
import pandas as pd
import gradio as gr
from transformers import MarianMTModel, MarianTokenizer
 
7
# Fetch and parse language options from a markdown table of ISO codes.
url = "https://huggingface.co/Lenylvt/LanguageISO/resolve/main/iso.md"
response = requests.get(url, timeout=10)  # bounded wait instead of hanging forever
response.raise_for_status()  # fail loudly instead of parsing an HTML error page as a table

# The markdown table is pipe-delimited; skiprows=2 drops the header and the
# "---" separator rows, and dropna removes the empty edge columns produced by
# the table's leading/trailing pipes.
df = pd.read_csv(io.StringIO(response.text), delimiter="|", skiprows=2, header=None).dropna(axis=1, how='all')
df.columns = ['ISO 639-1', 'ISO 639-2', 'Language Name', 'Native Name']
df['ISO 639-1'] = df['ISO 639-1'].str.strip()

# Prepare (code, "code - Language Name") pairs for the dropdowns.
language_options = [(row['ISO 639-1'], f"{row['ISO 639-1']} - {row['Language Name'].strip()}") for index, row in df.iterrows()]
+
17
@functools.lru_cache(maxsize=8)
def _load_translation_model(model_name):
    """Load and cache the MarianMT tokenizer/model pair for *model_name*.

    Caching matters: translate_srt calls translate_text once per subtitle
    line, and the original code re-loaded the model from disk on every call.
    """
    tokenizer = MarianTokenizer.from_pretrained(model_name)
    model = MarianMTModel.from_pretrained(model_name)
    return tokenizer, model


def translate_text(text, source_language_code, target_language_code):
    """Translate *text* between two languages using a Helsinki-NLP OPUS model.

    Parameters are ISO 639-1 codes (e.g. "en", "fr"). Returns the translated
    string, or a human-readable error message string when the language pair is
    unsupported or the model cannot be loaded (the caller embeds the result
    directly in the output file).
    """
    # Check if source and target languages are the same
    if source_language_code == target_language_code:
        return "Translation between the same languages is not supported."

    # Construct model name using ISO 639-1 codes
    model_name = f"Helsinki-NLP/opus-mt-{source_language_code}-{target_language_code}"

    # Load tokenizer and model (cached across calls)
    try:
        tokenizer, model = _load_translation_model(model_name)
    except Exception as e:
        return f"Failed to load model for {source_language_code} to {target_language_code}: {str(e)}"

    # Translate text; truncation guards against lines longer than the model's window.
    batch = tokenizer(text, return_tensors="pt", padding=True, truncation=True, max_length=512)
    translated = model.generate(**batch)
    translated_text = tokenizer.decode(translated[0], skip_special_tokens=True)

    return translated_text
37
+
38
def translate_srt(file_info, source_language_code, target_language_code):
    """Translate the text lines of an SRT subtitle file.

    Cue indices, "-->" timestamp lines and blank separators are copied through
    unchanged; every other line is translated via translate_text. Returns the
    path of the translated .srt file for Gradio to offer as a download.
    """
    # Accept either a {"content": bytes} mapping (as the original expected) or
    # a temp-file object with a .name attribute — Gradio versions differ in
    # what gr.File delivers, and file_info["content"] crashes on the latter.
    if isinstance(file_info, dict) and "content" in file_info:
        srt_content = file_info["content"].decode("utf-8")
    elif hasattr(file_info, "name"):
        with open(file_info.name, "r", encoding="utf-8") as f:
            srt_content = f.read()
    elif isinstance(file_info, (bytes, bytearray)):
        srt_content = bytes(file_info).decode("utf-8")
    else:
        srt_content = str(file_info)

    translated_lines = []
    for line in srt_content.split("\n"):
        # Pass through structural lines: cue numbers, timestamps, separators.
        if line.isdigit() or "-->" in line or line.strip() == "":
            translated_lines.append(line)
        else:
            translated_lines.append(translate_text(line, source_language_code, target_language_code))

    # Write to the system temp directory: the original hard-coded
    # "/mnt/data/translated_srt.srt", a sandbox-notebook path that does not
    # exist on a deployed Space and raises FileNotFoundError.
    output_path = os.path.join(tempfile.gettempdir(), "translated_srt.srt")
    with open(output_path, "w", encoding="utf-8") as file:
        file.write("\n".join(translated_lines))
    return output_path
54
+
55
+ source_language_dropdown = gr.Dropdown(choices=language_options, label="Source Language")
56
+ target_language_dropdown = gr.Dropdown(choices=language_options, label="Target Language")
57
 
58
  iface = gr.Interface(
59
+ fn=translate_srt,
60
+ inputs=[
61
+ gr.File(label="Upload SRT File", type="file"),
62
+ source_language_dropdown,
63
+ target_language_dropdown
64
+ ],
65
+ outputs=gr.File(label="Download Translated SRT File"),
66
+ title="SRT Translator",
67
+ description="Translate SubRip Text (SRT) subtitle files. This tool uses models from the Language Technology Research Group at the University of Helsinki."
68
  )
69
 
70
+ iface.launch()