Spaces:

SagarKeshave
/

math_app

Sleeping

App Files Community

SagarKeshave committed on Feb 9, 2024

Commit

6d6c51d

verified ·

1 Parent(s): f1753a9

Update app.py

Browse files

Files changed (1) hide show

app.py +31 -31

app.py CHANGED Viewed

@@ -2,52 +2,52 @@ import streamlit as st
 import transformers
-import torch
 import json
 import os
-from transformers import AutoTokenizer, TextStreamer , pipeline
-model_id = "WizardLM/WizardMath-7B-V1.1"
-# Configuration
-runtimeFlag = "cuda:0" #Run on GPU (you can't run GPTQ on cpu)
-cache_dir = None # by default, don't set a cache directory. This is automatically updated if you connect Google Drive.
-scaling_factor = 1.0 # allows for a max sequence length of 16384*6 = 98304! Unfortunately, requires Colab Pro and a V100 or A100 to have sufficient RAM.
-from transformers import AutoTokenizer, AutoModelForCausalLM
-tokenizer = AutoTokenizer.from_pretrained(model_id)
-model = AutoModelForCausalLM.from_pretrained(
-    model_id,
-    device_map="auto",
-    offload_folder="offload",
-    pad_token_id=tokenizer.eos_token_id,
-    offload_state_dict = True,
-    torch_dtype=torch.float16,
-    # rope_scaling = {"type": "dynamic", "factor": scaling_factor}
-    )
-pipe = pipeline(
-    "text-generation",
-    model=model,
-    tokenizer=tokenizer,
-    max_new_tokens=512,
-    temperature=0.7,
-    top_p=0.95,
-    repetition_penalty=1.15
-)
-question = st.text_area("Enter questoin")
 if text:
-    out = pipe(question)[0]['generated_text']
-    st.write(out)

 import transformers
+# import torch
 import json
 import os
+# from transformers import AutoTokenizer, TextStreamer , pipeline
+# model_id = "WizardLM/WizardMath-7B-V1.1"
+# # Configuration
+# runtimeFlag = "cuda:0" #Run on GPU (you can't run GPTQ on cpu)
+# cache_dir = None # by default, don't set a cache directory. This is automatically updated if you connect Google Drive.
+# scaling_factor = 1.0 # allows for a max sequence length of 16384*6 = 98304! Unfortunately, requires Colab Pro and a V100 or A100 to have sufficient RAM.
+import torch
+from transformers import AutoTokenizer, AutoModelForCausalLM, BitsAndBytesConfig
+model_id = "WizardLM/WizardMath-7B-V1.1"
+bnb_config = BitsAndBytesConfig(
+    load_in_4bit=True,
+    bnb_4bit_use_double_quant=True,
+    bnb_4bit_quant_type="nf4",
+    bnb_4bit_compute_dtype=torch.bfloat16
+)
+tokenizer = AutoTokenizer.from_pretrained(model_id)
+model_4bit = AutoModelForCausalLM.from_pretrained(model_id, quantization_config=bnb_config, device_map="auto")
+question = st.text_area("Enter questoin")
+# text = "Sum of two numbers is 20 and difference is 4. What are the numbers?"
+text = st.text_area("Enter questoin")
+# print(tokenizer.decode(outputs[0], skip_special_tokens=True))
 if text:
+    device = "cuda:0"
+    inputs = tokenizer(str(text), return_tensors="pt").to(device)
+    outputs = model_4bit.generate(**inputs, max_new_tokens=512)
+    # out = pipe(question)[0]['generated_text']
+    st.write(tokenizer.decode(outputs[0], skip_special_tokens=True))