macadeliccc committed on
Commit 19a1870 · 1 Parent(s): 3a44bb8

changed space to use LiquidAI/LFM2-1.2B

Files changed (3):
  1. README.md +5 -5
  2. app.py +58 -60
  3. requirements.txt +1 -5
README.md CHANGED
@@ -1,10 +1,10 @@
 ---
-title: SOLAR-math-2x10.7b MoE-Chat
-emoji: 🏆
-colorFrom: yellow
-colorTo: red
+title: LiquidAI - LFM2-1.2B Chat
+emoji: 🌊
+colorFrom: orange
+colorTo: pink
 sdk: gradio
-sdk_version: 4.1.2
+sdk_version: 5.7.3
 app_file: app.py
 pinned: true
 license: apache-2.0
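A side effect of bumping sdk_version to 5.7.3 is that the Space now runs on Gradio 5.x, where gr.ChatInterface can deliver history either as legacy [user, assistant] pairs or, with type="messages", as role/content dicts; the predict() in app.py below assumes the pair format. The following is only an illustrative sketch of handling both shapes (the helper name normalize_history is not part of the repo):

```python
# Sketch only: normalize Gradio chat history into chat-template messages.
# Handles both the legacy [user, assistant] pair format and the Gradio 5
# "messages" format ({"role": ..., "content": ...} dicts).
def normalize_history(history, new_message):
    messages = []
    for turn in history:
        if isinstance(turn, dict):  # messages format
            messages.append({"role": turn["role"], "content": turn["content"]})
        else:  # (user, assistant) pair; skip empty slots
            user, assistant = turn
            if user:
                messages.append({"role": "user", "content": user})
            if assistant:
                messages.append({"role": "assistant", "content": assistant})
    messages.append({"role": "user", "content": new_message})
    return messages
```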
app.py CHANGED
@@ -1,82 +1,80 @@
 import spaces
 import gradio as gr
 import torch
-from transformers import AutoModelForCausalLM, AutoTokenizer
-from transformers import StoppingCriteria, StoppingCriteriaList, TextIteratorStreamer
+from transformers import AutoModelForCausalLM, AutoTokenizer, TextIteratorStreamer
 from threading import Thread
 
-# Lazy loading the model to meet huggingface stateless GPU requirements
-
-# Defining a custom stopping criteria class for the model's text generation.
-class StopOnTokens(StoppingCriteria):
-    def __call__(self, input_ids: torch.LongTensor, scores: torch.FloatTensor, **kwargs) -> bool:
-        stop_ids = [50256, 50295]  # IDs of tokens where the generation should stop.
-        for stop_id in stop_ids:
-            if input_ids[0][-1] == stop_id:  # Checking if the last generated token is a stop token.
-                return True
-        return False
-
-
-# Function to generate model predictions.
 @spaces.GPU
 def predict(message, history):
     torch.set_default_device("cuda")
-
-    # Loading the tokenizer and model from Hugging Face's model hub.
-    tokenizer = AutoTokenizer.from_pretrained(
-        "macadeliccc/SOLAR-math-2x10.7b",
-        trust_remote_code=True
-    )
+
+    # Load model and tokenizer
+    model_id = "LiquidAI/LFM2-1.2B"
+    tokenizer = AutoTokenizer.from_pretrained(model_id)
     model = AutoModelForCausalLM.from_pretrained(
-        "macadeliccc/SOLAR-math-2x10.7b",
-        torch_dtype="auto",
-        load_in_4bit=True,
-        trust_remote_code=True
+        model_id,
+        device_map="auto",
+        torch_dtype=torch.bfloat16,
+        trust_remote_code=True,
+        load_in_4bit=True,  # Keeping 4-bit quantization for efficiency
+        # attn_implementation="flash_attention_2"  # Uncomment on compatible GPU
     )
-    history_transformer_format = history + [[message, ""]]
-    stop = StopOnTokens()
-
-    # Formatting the input for the model.
-    system_prompt = "<|im_start|>system\nYou are Solar, a helpful AI assistant.<|im_end|>"
-    messages = system_prompt + "".join(["".join(["\n<|im_start|>user\n" + item[0], "<|im_end|>\n<|im_start|>assistant\n" + item[1]]) for item in history_transformer_format])
-    input_ids = tokenizer([messages], return_tensors="pt").to('cuda')
+
+    # Format conversation history for chat template
+    messages = [{"role": "user" if i % 2 == 0 else "assistant", "content": msg}
+                for conv in history for i, msg in enumerate(conv) if msg]
+    messages.append({"role": "user", "content": message})
+
+    # Apply chat template
+    input_ids = tokenizer.apply_chat_template(
+        messages,
+        add_generation_prompt=True,
+        return_tensors="pt",
+        tokenize=True
+    ).to('cuda')
+
+    # Setup streamer for real-time output
     streamer = TextIteratorStreamer(tokenizer, timeout=10., skip_prompt=True, skip_special_tokens=True)
+
+    # Generation parameters
     generate_kwargs = dict(
-        input_ids,
+        input_ids=input_ids,
         streamer=streamer,
         max_new_tokens=256,
        do_sample=True,
-        top_p=0.95,
-        top_k=50,
-        temperature=0.7,
-        num_beams=1,
-        stopping_criteria=StoppingCriteriaList([stop])
+        temperature=0.3,
+        min_p=0.15,
+        repetition_penalty=1.05,
+        pad_token_id=tokenizer.eos_token_id
     )
+
+    # Start generation in separate thread
     t = Thread(target=model.generate, kwargs=generate_kwargs)
-    t.start()  # Starting the generation in a separate thread.
+    t.start()
+
+    # Stream tokens
     partial_message = ""
     for new_token in streamer:
         partial_message += new_token
-        if '<|im_end|>' in partial_message:  # Breaking the loop if the stop token is generated.
-            break
         yield partial_message
 
-
-# Setting up the Gradio chat interface.
-gr.ChatInterface(predict,
-    description="""
-    <center><img src="https://huggingface.co/macadeliccc/SOLAR-math-2x10.7b-v0.2/resolve/main/solar.png" width="33%"></center>\n\n
-    Chat with [macadeliccc/SOLAR-math-2x10.7b-v0.2](https://huggingface.co/macadeliccc/SOLAR-math-2x10.7b-v0.2), the first Mixture of Experts made by merging two fine-tuned [upstage/SOLAR-10.7B-v1.0](https://huggingface.co/upstage/SOLAR-10.7B-v1.0) models.
-    This model (19.2B param) scores top 5 on several evaluations. Output is considered experimental.\n\n
-    ❤️ If you like this work, please follow me on [Hugging Face](https://huggingface.co/macadeliccc) and [LinkedIn](https://www.linkedin.com/in/tim-dolan-python-dev/).
-    """,
-    examples=[
-        'Can you solve the equation 2x + 3 = 11 for x?',
-        'How does Fermats last theorem impact number theory?',
-        'What is a vector in the scope of computer science rather than physics?',
-        'Use a list comprehension to create a list of squares for numbers from 1 to 10.',
-        'Recommend some popular science fiction books.',
-        'Can you write a short story about a time-traveling detective?'
-    ],
-    theme=gr.themes.Soft(primary_hue="purple"),
-).launch()
+# Setup Gradio interface
+gr.ChatInterface(
+    predict,
+    description="""
+    <center><h2>LiquidAI LFM2-1.2B Chat</h2></center>
+
+    Chat with [LiquidAI/LFM2-1.2B](https://huggingface.co/LiquidAI/LFM2-1.2B), a compact and efficient language model.
+
+    This model provides high-quality responses while maintaining a small footprint, making it ideal for fast inference.
+    """,
+    examples=[
+        'Can you solve the equation 2x + 3 = 11 for x?',
+        'What is C. elegans?',
+        'Explain quantum computing in simple terms',
+        'Write a Python function to find prime numbers',
+        'What are the key differences between RNA and DNA?',
+        'Can you write a haiku about artificial intelligence?'
+    ],
+    theme=gr.themes.Soft(primary_hue="blue"),
+).launch()
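For reference, the threaded streaming pattern the new predict() relies on (a background model.generate() feeding a TextIteratorStreamer that the caller iterates) can be exercised outside of Gradio. This is only a sketch under the same assumptions as app.py (bfloat16 weights, the model's built-in chat template); the sampling values simply mirror the diff:

```python
# Minimal sketch of the threaded streaming pattern used in app.py, outside Gradio.
from threading import Thread

import torch
from transformers import AutoModelForCausalLM, AutoTokenizer, TextIteratorStreamer

model_id = "LiquidAI/LFM2-1.2B"
tokenizer = AutoTokenizer.from_pretrained(model_id)
model = AutoModelForCausalLM.from_pretrained(
    model_id, torch_dtype=torch.bfloat16, device_map="auto"
)

messages = [{"role": "user", "content": "Can you solve the equation 2x + 3 = 11 for x?"}]
input_ids = tokenizer.apply_chat_template(
    messages, add_generation_prompt=True, return_tensors="pt", tokenize=True
).to(model.device)

# generate() blocks, so it runs in a worker thread while the streamer is
# consumed on the main thread, chunk by chunk.
streamer = TextIteratorStreamer(tokenizer, skip_prompt=True, skip_special_tokens=True)
Thread(target=model.generate, kwargs=dict(
    input_ids=input_ids,
    streamer=streamer,
    max_new_tokens=256,
    do_sample=True,
    temperature=0.3,
    min_p=0.15,
    repetition_penalty=1.05,
)).start()

for chunk in streamer:
    print(chunk, end="", flush=True)
```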
requirements.txt CHANGED
@@ -1,12 +1,8 @@
-git+https://github.com/huggingface/diffusers.git
 git+https://github.com/huggingface/transformers.git
 git+https://github.com/huggingface/peft.git
---extra-index-url https://download.pytorch.org/whl/cu113
+--extra-index-url https://download.pytorch.org/whl/cu126
 torch
 pydantic
-Pillow
 accelerate
-bitsandbytes
 spaces
-invisible_watermark
 
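One detail worth double-checking: requirements.txt drops bitsandbytes, but the new app.py still passes load_in_4bit=True, which needs bitsandbytes installed at runtime. If 4-bit loading is meant to stay, a sketch of the explicit BitsAndBytesConfig path (assuming bitsandbytes is kept in requirements.txt) would look roughly like this:

```python
# Sketch only: explicit 4-bit loading via BitsAndBytesConfig.
# Assumes bitsandbytes remains installed, which this commit removes from requirements.txt.
import torch
from transformers import AutoModelForCausalLM, BitsAndBytesConfig

bnb_config = BitsAndBytesConfig(
    load_in_4bit=True,
    bnb_4bit_compute_dtype=torch.bfloat16,
    bnb_4bit_quant_type="nf4",
)

model = AutoModelForCausalLM.from_pretrained(
    "LiquidAI/LFM2-1.2B",
    device_map="auto",
    quantization_config=bnb_config,
)
```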