rajj0 committed on
Commit d3770a3 · verified · 1 Parent(s): 7e95097

Update app.py

Files changed (1)
  1. app.py +90 -59
app.py CHANGED
@@ -1,74 +1,105 @@
  import gradio as gr
- from transformers import AutoModelForCausalLM, AutoTokenizer
  import torch
  import os

- # Model and tokenizer paths
- model_path = "rajj0/autotrain-phi3-midium-4k-godsent-orpo-6"
- hf_token = os.getenv("HF_TOKEN") # Get the token from environment variables

- # Debugging: print the token to ensure it's being set
- print(f"HF_TOKEN: {hf_token}")

- if hf_token is None:
-     raise ValueError("HF_TOKEN environment variable not set")

- try:
-     # Load the tokenizer and model with trust_remote_code=True
-     print("Loading tokenizer...")
-     tokenizer = AutoTokenizer.from_pretrained(model_path, use_auth_token=hf_token, trust_remote_code=True)
-     print("Tokenizer loaded successfully.")

-     print("Loading model...")
-     model = AutoModelForCausalLM.from_pretrained(
-         model_path,
-         device_map="auto",
-         torch_dtype='auto',
-         use_auth_token=hf_token,
-         trust_remote_code=True
-     ).eval()
-     print("Model loaded successfully.")
- except Exception as e:
-     print(f"Error loading model or tokenizer: {e}")
-     raise

- # Function to generate a response from the model
- def generate_response(user_input):
-     try:
-         print(f"User input: {user_input}")
-         messages = [{"role": "user", "content": user_input}]
-         print(f"Messages: {messages}")

-         input_ids = tokenizer.apply_chat_template(
-             conversation=messages,
-             tokenize=True,
-             add_generation_prompt=True,
-             return_tensors='pt'
-         )
-         print(f"Input IDs: {input_ids}")

-         output_ids = model.generate(input_ids.to('cuda'))
-         print(f"Output IDs: {output_ids}")

-         response = tokenizer.decode(output_ids[0][input_ids.shape[1]:], skip_special_tokens=True)
-         print(f"Response: {response}")

-         return response
-     except Exception as e:
-         print(f"Error generating response: {e}")
-         return "An error occurred while generating the response."

- # Create the Gradio interface
- iface = gr.Interface(
-     fn=generate_response,
-     inputs="text",
-     outputs="text",
-     title="PHI Model Chatbot",
-     description="A chatbot powered by the PHI model."
- )

- # Launch the Gradio interface
- if __name__ == "__main__":
-     print("Launching Gradio interface...")
-     iface.launch()
-     print("Gradio interface launched.")

  import gradio as gr
  import torch
+ from transformers import (
+     AutoModelForCausalLM,
+     AutoTokenizer,
+     TextIteratorStreamer,
+ )
  import os
+ from threading import Thread
+ import spaces
+ import time
+ import subprocess

+ subprocess.run(
+     "pip install flash-attn --no-build-isolation",
+     env={"FLASH_ATTENTION_SKIP_CUDA_BUILD": "TRUE"},
+     shell=True,
+ )

+ token = os.environ["HF_TOKEN"]

+ model = AutoModelForCausalLM.from_pretrained(
+     "rajj0/autotrain-phi3-midium-4k-godsent-orpo-6",
+     token=token,
+     trust_remote_code=True,
+ )
+ tok = AutoTokenizer.from_pretrained("microsoft/Phi-3-mini-128k-instruct", token=token)
+ terminators = [
+     tok.eos_token_id,
+ ]

+ if torch.cuda.is_available():
+     device = torch.device("cuda")
+     print(f"Using GPU: {torch.cuda.get_device_name(device)}")
+ else:
+     device = torch.device("cpu")
+     print("Using CPU")

+ model = model.to(device)
+ # Dispatch Errors


+ @spaces.GPU(duration=60)
+ def chat(message, history, temperature, do_sample, max_tokens):
+     chat = []
+     for item in history:
+         chat.append({"role": "user", "content": item[0]})
+         if item[1] is not None:
+             chat.append({"role": "assistant", "content": item[1]})
+     chat.append({"role": "user", "content": message})
+     messages = tok.apply_chat_template(chat, tokenize=False, add_generation_prompt=True)
+     model_inputs = tok([messages], return_tensors="pt").to(device)
+     streamer = TextIteratorStreamer(
+         tok, timeout=20.0, skip_prompt=True, skip_special_tokens=True
+     )
+     generate_kwargs = dict(
+         model_inputs,
+         streamer=streamer,
+         max_new_tokens=max_tokens,
+         do_sample=True,
+         temperature=temperature,
+         eos_token_id=terminators,
+     )

+     if temperature == 0:
+         generate_kwargs["do_sample"] = False

+     t = Thread(target=model.generate, kwargs=generate_kwargs)
+     t.start()

+     partial_text = ""
+     for new_text in streamer:
+         partial_text += new_text
+         yield partial_text

+     yield partial_text


+ demo = gr.ChatInterface(
+     fn=chat,
+     examples=[["Write me a poem about Machine Learning."]],
+     # multimodal=False,
+     additional_inputs_accordion=gr.Accordion(
+         label="⚙️ Parameters", open=False, render=False
+     ),
+     additional_inputs=[
+         gr.Slider(
+             minimum=0, maximum=1, step=0.1, value=0.9, label="Temperature", render=False
+         ),
+         gr.Checkbox(label="Sampling", value=True),
+         gr.Slider(
+             minimum=128,
+             maximum=4096,
+             step=1,
+             value=512,
+             label="Max new tokens",
+             render=False,
+         ),
+     ],
+     stop_btn="Stop Generation",
+     title="Chat With LLMs",
+     description="Now Running [microsoft/Phi-3-mini-128k-instruct](https://huggingface.co/microsoft/Phi-3-mini-128k-instruct)",
+ )
+ demo.launch()
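
The main pattern this commit introduces is streaming generation: model.generate runs in a background Thread and writes tokens into a TextIteratorStreamer, which the Gradio chat callback consumes and yields incrementally. The following is a minimal standalone sketch of that pattern, not part of the commit: it assumes a tiny stand-in checkpoint (sshleifer/tiny-gpt2, chosen only so the sketch runs quickly on CPU) instead of the Phi-3 finetune, and it omits the spaces.GPU decorator and the Gradio interface.

# Minimal sketch of the streaming pattern used in the updated app.py.
# Assumptions: a tiny stand-in checkpoint instead of the Phi-3 finetune,
# CPU execution, and no Gradio/Spaces wrapper.
from threading import Thread

from transformers import AutoModelForCausalLM, AutoTokenizer, TextIteratorStreamer

model_id = "sshleifer/tiny-gpt2"  # stand-in for illustration, not the model from the commit
tok = AutoTokenizer.from_pretrained(model_id)
model = AutoModelForCausalLM.from_pretrained(model_id)

prompt = "Write me a poem about Machine Learning."
inputs = tok(prompt, return_tensors="pt")

# skip_prompt=True so only newly generated tokens are streamed back
streamer = TextIteratorStreamer(tok, skip_prompt=True, skip_special_tokens=True)
generate_kwargs = dict(inputs, streamer=streamer, max_new_tokens=32, do_sample=False)

# generate() blocks until completion, so it runs in a worker thread while the
# main thread iterates over the streamer (what the Gradio callback yields from)
thread = Thread(target=model.generate, kwargs=generate_kwargs)
thread.start()

partial_text = ""
for new_text in streamer:
    partial_text += new_text
    print(partial_text)

thread.join()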