TenzinGayche committed on
Commit 4fe238f · verified · 1 Parent(s): 3ed952b

Update app.py

Files changed (1)
  1. app.py +63 -55
app.py CHANGED
@@ -3,15 +3,16 @@ from threading import Thread, Event
 from typing import Iterator
 
 import gradio as gr
-
 import torch
 from transformers import AutoModelForCausalLM, GemmaTokenizerFast, TextIteratorStreamer
+
 DESCRIPTION = """\
-# Monlam LLM v2.0.1
+# Monlam LLM v2.0.1 - Thoughts and Translation
+This version generates detailed reasoning (thoughts) followed by a tokenized translation.
 """
-path="TenzinGayche/tpo_v1.0.0_ep2_dpo_ft"
-MAX_MAX_NEW_TOKENS = 2048
-DEFAULT_MAX_NEW_TOKENS = 1024
+
+# Constants
+path = "TenzinGayche/tpo_v1.0.0_dpo_2_3ep_ft"
 MAX_INPUT_TOKEN_LENGTH = int(os.getenv("MAX_INPUT_TOKEN_LENGTH", "4096"))
 
 # Load the model and tokenizer
@@ -21,91 +22,98 @@ model = AutoModelForCausalLM.from_pretrained(path, torch_dtype=torch.float16).to
 model.config.sliding_window = 4096
 model.eval()
 
-# Create a shared stop event
+# Shared stop event
 stop_event = Event()
 
-def generate(
-    message: str,
-    chat_history: list[dict],
-    max_new_tokens: int = 2048,
+
+# Generate function
+def generate(message: str,
+    show_thoughts: bool,
+    max_new_tokens: int = 1024,
     temperature: float = 0.6,
     top_p: float = 0.9,
     top_k: int = 50,
     repetition_penalty: float = 1.2,
-    do_sample: bool= False
+    do_sample: bool = False,
 ) -> Iterator[str]:
-    # Clear the stop event before starting a new generation
     stop_event.clear()
 
-
-    # Append the user's message to the conversation history
-    conversation = chat_history.copy()
-    if not conversation:
-        conversation.extend([
-            {
-                "role": "user",
-                "content": "ཁྱེད་རང་སྨོན་ལམ་མི་བཟོས་རིག་ནུས་ཤིག་ཡིན་པ་དང་ཁྱེད་རང་མི་བཟོས་རིག་ནུས་(AI)ཤིག་ཡིན།"
-            },
-            {
-                "role": "assistant",
-                "content": "ལགས་སོ། ང་ཡིས་ཁྱེད་ཀྱི་བཀའ་བཞིན་སྒྲུབ་ཆོག"
-            }
-        ])
-    conversation.append({"role": "user", "content": message})
-
+    # Prepare input for the model
+    conversation = [
+        {"role": "user", "content": f"Please translate the following into Germany: {message} Translation:"}
+    ]
     input_ids = tokenizer.apply_chat_template(conversation, add_generation_prompt=True, return_tensors="pt")
     if input_ids.shape[1] > MAX_INPUT_TOKEN_LENGTH:
         input_ids = input_ids[:, -MAX_INPUT_TOKEN_LENGTH:]
-        gr.Warning(f"Trimmed input from conversation as it was longer than {MAX_INPUT_TOKEN_LENGTH} tokens.")
+        gr.Warning(f"Input trimmed as it exceeded {MAX_INPUT_TOKEN_LENGTH} tokens.")
    input_ids = input_ids.to(model.device)
 
-    # Create a streamer to get the generated response
+    # Use a streamer to get generated text
     streamer = TextIteratorStreamer(tokenizer, timeout=20.0, skip_prompt=True, skip_special_tokens=True)
     generate_kwargs = dict(
-        {"input_ids": input_ids},
+        input_ids=input_ids,
         streamer=streamer,
         max_new_tokens=max_new_tokens,
-
     )
-
-    # Run generation in a background thread
+
+    # Generate in a separate thread
     t = Thread(target=model.generate, kwargs=generate_kwargs)
     t.start()
 
     outputs = []
+    in_translation = False
+
     for text in streamer:
         if stop_event.is_set():
-            break  # Stop if the stop button is pressed
-        outputs.append(text)
-        yield "".join(outputs)
+            break
 
-    # After generation, append the assistant's response to the chat history
-    assistant_response = "".join(outputs)
-    chat_history.append({"role": "assistant", "content": assistant_response})
+        # Process the generated text
+        if "#Final Translation:" in text and not in_translation:
+            in_translation = True
+            if not show_thoughts:
+                text = text.split("#Final Translation:", 1)[1].strip()  # Skip reasoning if "View Thoughts" is disabled
 
+        if in_translation:
+            outputs.append(text)
+            yield "".join(outputs)
+        elif show_thoughts:
+            outputs.append(text)
+            yield "".join(outputs)
 
-    # Define a function to stop the generation
+    # Append assistant's response
+    chat_history = "".join(outputs)
+
+
+# Stop generation function
 def stop_generation():
     stop_event.set()
 
-# Create the chat interface with additional inputs and the stop button
+
+# Create the Gradio interface
 with gr.Blocks(css="style.css", fill_height=True) as demo:
     gr.Markdown(DESCRIPTION)
 
-    # Create the chat interface
-    chat_interface = gr.ChatInterface(
+    with gr.Row():
+        input_text = gr.Textbox(label="Enter Tibetan text", placeholder="Type Tibetan text here...")
+        show_thoughts = gr.Checkbox(label="View Detailed Thoughts", value=True)
+        submit_button = gr.Button("Translate")
+        stop_button = gr.Button("Stop")
+
+    with gr.Row():
+        output_area = gr.Textbox(
+            label="Output (Thoughts and Translation)",
+            lines=20,
+            interactive=False,
+        )
+
+    # Connect buttons to functions
+    submit_button.click(
         fn=generate,
-        examples=[
-            ["Hello there! How are you doing?"],
-            ["Can you explain briefly to me what is the Python programming language?"],
-            ["Explain the plot of Cinderella in a sentence."],
-            ["How many hours does it take a man to eat a Helicopter?"],
-            ["Write a 100-word article on 'Benefits of Open-Source in AI research'"],
-        ],
-        cache_examples=False,
-        type="messages",
+        inputs=[input_text, show_thoughts],
+        outputs=output_area,
+        queue=True,  # Enable streaming
     )
-
+    stop_button.click(stop_generation)
 
 if __name__ == "__main__":
-    demo.queue(max_size=20).launch(share=True)
+    demo.queue(max_size=20).launch(share=True)
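
For reference, the sketch below re-creates the thought/translation filtering that the new generate() loop applies to streamed chunks, so the effect of the "View Detailed Thoughts" checkbox can be seen without loading the model. It is not part of the commit: fake_chunks, filter_stream, and the sample strings are invented stand-ins for what TextIteratorStreamer would yield, and the sketch assumes the "#Final Translation:" marker arrives within a single chunk.

from threading import Event
from typing import Iterable, Iterator

stop_event = Event()

# Invented stand-in for the chunks a TextIteratorStreamer would yield.
fake_chunks = [
    "Thinking: the sentence is a polite greeting. ",
    "Choosing an equivalent register. ",
    "#Final Translation: Hello, how are you?",
]

def filter_stream(chunks: Iterable[str], show_thoughts: bool) -> Iterator[str]:
    # Mirrors the marker-splitting loop introduced in the new generate().
    outputs = []
    in_translation = False
    for text in chunks:
        if stop_event.is_set():
            break

        # Switch to "translation" mode at the marker; drop the reasoning
        # prefix when thoughts are hidden.
        if "#Final Translation:" in text and not in_translation:
            in_translation = True
            if not show_thoughts:
                text = text.split("#Final Translation:", 1)[1].strip()

        if in_translation:
            outputs.append(text)
            yield "".join(outputs)
        elif show_thoughts:
            outputs.append(text)
            yield "".join(outputs)

if __name__ == "__main__":
    # With thoughts hidden, only the translation is streamed.
    print(list(filter_stream(fake_chunks, show_thoughts=False))[-1])
    # With thoughts shown, the reasoning streams first, then the marker and translation.
    print(list(filter_stream(fake_chunks, show_thoughts=True))[-1])

With show_thoughts=False the last yielded string is just "Hello, how are you?"; with it enabled, the reasoning plus the "#Final Translation:" line is accumulated, which matches how the output_area textbox fills in while streaming.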