import gradio as gr
import torch
from transformers import AutoModelForCausalLM, AutoTokenizer
from peft import PeftModel
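
# A note on hardware (an assumption, not from the original file): float16 with
# device_map="auto" expects an accelerator; on a CPU-only machine a guard such
# as `dtype = torch.float16 if torch.cuda.is_available() else torch.float32`
# is the usual workaround.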

# Load model and tokenizer
def load_model(model_id):
    # The fine-tuned weights are a LoRA adapter trained on top of microsoft/phi-2
    base_model_id = "microsoft/phi-2"
    # Load the tokenizer from the adapter repo so any tokenizer tweaks saved
    # with the fine-tune are picked up
    tokenizer = AutoTokenizer.from_pretrained(model_id, trust_remote_code=True)
    
    # Ensure tokenizer has a padding token
    if tokenizer.pad_token is None:
        tokenizer.pad_token = tokenizer.eos_token
    
    base_model = AutoModelForCausalLM.from_pretrained(
        base_model_id,
        torch_dtype=torch.float16,
        device_map="auto",
        trust_remote_code=True
    )
    
    # Attach the LoRA adapter to the base model; note that PeftModel does not
    # merge the weights (see the merge_and_unload() sketch below for that)
    model = PeftModel.from_pretrained(base_model, model_id)
    model.eval()  # inference only
    return model, tokenizer
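
# Optional variant (a sketch, not part of the original app): peft's
# merge_and_unload() folds the LoRA weights into the base model, removing the
# adapter indirection at inference time. The helper name is hypothetical.
def load_merged_model(model_id):
    model, tokenizer = load_model(model_id)
    merged = model.merge_and_unload()  # returns the base model with adapter weights merged in
    return merged, tokenizer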

def generate_response(instruction, model, tokenizer, max_length=200, temperature=0.7, top_p=0.9):
    # Format the input text
    input_text = instruction.strip()
    
    # Tokenize input
    inputs = tokenizer(input_text, return_tensors="pt").to(model.device)
    
    # Generate a completion; max_length caps newly generated tokens, not the
    # total sequence length
    with torch.no_grad():
        outputs = model.generate(
            **inputs,
            max_new_tokens=max_length,
            temperature=temperature,
            top_p=top_p,
            num_return_sequences=1,
            pad_token_id=tokenizer.eos_token_id,
            do_sample=True
        )
    
    # Decode only the newly generated tokens; slicing at the token level is
    # safer than slicing the decoded string, since the prompt is not
    # guaranteed to round-trip through the tokenizer character-for-character
    prompt_length = inputs["input_ids"].shape[1]
    response = tokenizer.decode(outputs[0][prompt_length:], skip_special_tokens=True)
    return response.strip()
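
# Direct-call example (illustrative; assumes the adapter repo downloads
# cleanly and a GPU is available for float16):
#   model, tokenizer = load_model("jatingocodeo/phi2-grpo")
#   print(generate_response("Summarize: the meeting ran long because ...",
#                           model, tokenizer, max_length=60))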

def create_demo(model_id):
    # Load model and tokenizer
    model, tokenizer = load_model(model_id)
    
    # Define the interface
    def process_input(instruction, max_length, temperature, top_p):
        try:
            return generate_response(
                instruction,
                model,
                tokenizer,
                max_length=max_length,
                temperature=temperature,
                top_p=top_p
            )
        except Exception as e:
            return f"Error generating response: {str(e)}"
    
    # Create the interface
    demo = gr.Interface(
        fn=process_input,
        inputs=[
            gr.Textbox(
                label="Input Text",
                placeholder="Enter your text here...",
                lines=4
            ),
            gr.Slider(
                minimum=50,
                maximum=500,
                value=150,
                step=10,
                label="Maximum Length"
            ),
            gr.Slider(
                minimum=0.1,
                maximum=1.0,
                value=0.7,
                step=0.1,
                label="Temperature"
            ),
            gr.Slider(
                minimum=0.1,
                maximum=1.0,
                value=0.9,
                step=0.1,
                label="Top P"
            )
        ],
        outputs=gr.Textbox(label="Completion", lines=8),
        title="Phi-2 GRPO Model Demo",
        description="""This is a generative model trained using GRPO (Group Relative Policy Optimization)
        on the TLDR dataset. The model was trained to generate completions of around 150 characters.
        
        You can adjust the generation parameters:
        - **Maximum Length**: Caps the number of new tokens generated
        - **Temperature**: Higher values make the output more random, lower values make it more focused
        - **Top P**: Controls the cumulative probability threshold for token sampling
        """,
        # Each example row must supply a value for every input component
        examples=[
            ["The quick brown fox jumps over the lazy dog.", 150, 0.7, 0.9],
            ["In this tutorial, we will explore how to build a neural network for image classification.", 150, 0.7, 0.9],
            ["The best way to prepare for an interview is to", 150, 0.7, 0.9],
            ["Python is a popular programming language because", 150, 0.7, 0.9]
        ]
    )
    return demo

if __name__ == "__main__":
    # Use your model ID
    model_id = "jatingocodeo/phi2-grpo"
    demo = create_demo(model_id)
    demo.launch()
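
# Local usage (an assumption about the environment, not part of the file):
# install the imports above with `pip install gradio torch transformers peft`
# and run `python app.py`; launch(share=True) would additionally create a
# temporary public Gradio URL.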