nafisneehal committed · verified
Commit cc5a84d · 1 Parent(s): 260f4bd

Update app.py

Files changed (1):
  app.py  +13 -15
app.py CHANGED
@@ -1,16 +1,17 @@
 import gradio as gr
 import os
 import torch
-from unsloth import FastLanguageModel
+from peft import AutoPeftModelForCausalLM
+from transformers import AutoTokenizer
 from huggingface_hub import spaces
 
-# Get Hugging Face token from environment variables
-HF_TOKEN = os.environ.get('HF_TOKEN')
-
 # Check if we're running in a Hugging Face Space with GPU constraints
 IS_SPACES_ZERO = os.environ.get("SPACES_ZERO_GPU", "0") == "1"
 IS_SPACE = os.environ.get("SPACE_ID", None) is not None
 
+# Get Hugging Face token from environment variables
+HF_TOKEN = os.environ.get('HF_TOKEN')
+
 # Determine device (use GPU if available)
 device = "cuda" if torch.cuda.is_available() else "cpu"
 LOW_MEMORY = os.getenv("LOW_MEMORY", "0") == "1"
@@ -19,31 +20,28 @@ print(f"Using device: {device}")
 print(f"Low memory mode: {LOW_MEMORY}")
 
 # Model configuration
-max_seq_length = 2048 # Max sequence length for RoPE scaling
-dtype = torch.float16 if device == "cuda" else torch.float32
-load_in_4bit = True # Enable 4-bit quantization if memory is limited
+load_in_4bit = True # Use 4-bit quantization if memory is constrained
 
 # Load model and tokenizer with device mapping
+# Replace with the name of your trained model
 model_name = "nafisneehal/chandler_bot"
-model, tokenizer = FastLanguageModel.from_pretrained(
-    model_name=model_name,
-    max_seq_length=max_seq_length,
-    dtype=dtype,
+model = AutoPeftModelForCausalLM.from_pretrained(
+    model_name,
     load_in_4bit=load_in_4bit,
     device_map="auto" if device == "cuda" else None # Automatic GPU mapping
 )
-FastLanguageModel.for_inference(model) # Optimize model for faster inference
+tokenizer = AutoTokenizer.from_pretrained(model_name)
 
 # Define prompt structure (update if necessary for your model)
 alpaca_prompt = "{instruction} {input} {output}"
 
-instruction_text = "Learn how to talk like Chandler - a popular character from FRIENDS TV Show. Input is someone saying something, Output is what Chandler saying in response."
+instruction = "Chat with me like Chandler"
 
 
 @spaces.GPU # Use GPU provided by Hugging Face Spaces if available
 def generate_response(user_input, chat_history):
-    instruction = user_input # Treats user input as instruction
-    input_text = "" # Any additional input if needed; empty otherwise
+    instruction = instruction # Treats user input as the instruction
+    input_text = user_input # Any additional input if needed; leave blank otherwise
 
     # Prepare inputs for model inference on the correct device
     inputs = tokenizer(
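
For context, this commit swaps Unsloth's FastLanguageModel loader for PEFT's AutoPeftModelForCausalLM, which reads the adapter's config, loads the referenced base model, and attaches the trained adapter weights in one call. Below is a minimal sketch of that load-and-generate path. The repository name comes from the diff; the prompt text, the 4-bit guard, and the generation settings are illustrative assumptions, not part of the commit.

import torch
from peft import AutoPeftModelForCausalLM
from transformers import AutoTokenizer

model_name = "nafisneehal/chandler_bot"  # adapter repo named in the diff
device = "cuda" if torch.cuda.is_available() else "cpu"

# Load the base model referenced by the adapter config and apply the PEFT weights.
# 4-bit loading requires bitsandbytes and a CUDA device, so it is gated on `device` here (assumption).
model = AutoPeftModelForCausalLM.from_pretrained(
    model_name,
    load_in_4bit=(device == "cuda"),
    device_map="auto" if device == "cuda" else None,
)
tokenizer = AutoTokenizer.from_pretrained(model_name)

# Illustrative prompt; the app itself builds it from alpaca_prompt = "{instruction} {input} {output}".
prompt = "Chat with me like Chandler How are you doing? "
inputs = tokenizer(prompt, return_tensors="pt").to(device)
with torch.no_grad():
    output_ids = model.generate(**inputs, max_new_tokens=64)
print(tokenizer.decode(output_ids[0], skip_special_tokens=True))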