import streamlit as st
from transformers import AutoTokenizer, TextStreamer, pipeline
from auto_gptq import AutoGPTQForCausalLM
from huggingface_hub import snapshot_download
import os
import gc
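# This file is a Streamlit app. Locally it would typically be launched with
# `streamlit run <this_file>.py`; on a Hugging Face Space the runtime usually
# expects the entry point to be named app.py (the actual filename is not shown here).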
# Define pretrained and quantized model directories
pretrained_model_dir = "FPHam/Jackson_The_Formalizer_V2_13b_GPTQ"
#cwd = os.getcwd()
#quantized_model_dir = cwd + "/Jackson2-4bit-128g-GPTQ"
# Check if the model directory is empty (i.e., model not downloaded yet)
#if not os.path.exists(quantized_model_dir) or not os.listdir(quantized_model_dir):
# Create the cache directory if it doesn't exist
# os.makedirs(quantized_model_dir, exist_ok=True)
# snapshot_download(repo_id=pretrained_model_dir, local_dir=quantized_model_dir, local_dir_use_symlinks=True)
#st.write(f'{os.listdir(quantized_model_dir)}')
#model_name_or_path = quantized_model_dir
#model_basename = "Jackson2-4bit-128g-GPTQ"
#os.environ['CUDA_VISIBLE_DEVICES'] = '0'
# Before allocating or loading the model, clear up memory
#gc.collect()
#torch.cuda.empty_cache()
use_triton = False
#tokenizer = AutoTokenizer.from_pretrained(model_name_or_path, use_fast=True, legacy=False)
tokenizer = AutoTokenizer.from_pretrained(pretrained_model_dir, use_fast=True)
model = AutoGPTQForCausalLM.from_quantized(
    pretrained_model_dir,
    #model_basename=model_basename,
    use_safetensors=True,
    device="cuda:0",
    #use_triton=use_triton,
    #quantize_config=None
)
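# Note: Streamlit reruns this script from top to bottom on every interaction,
# so the tokenizer and the quantized model above are reloaded on each rerun.
# A minimal sketch of loading them only once, assuming a Streamlit version that
# provides st.cache_resource (1.18+); kept commented out so it does not change
# the behaviour of this app:
#
# @st.cache_resource
# def load_model_and_tokenizer():
#     tok = AutoTokenizer.from_pretrained(pretrained_model_dir, use_fast=True)
#     mdl = AutoGPTQForCausalLM.from_quantized(
#         pretrained_model_dir,
#         use_safetensors=True,
#         device="cuda:0",
#     )
#     return tok, mdl
#
# tokenizer, model = load_model_and_tokenizer()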
user_input = st.text_input("Input a phrase")
prompt_template = f'USER: {user_input}\nASSISTANT:'
if st.button("Generate the prompt"):
    # Tokenize the prompt and move the input tensors onto the same GPU as the model
    inputs = tokenizer(prompt_template, return_tensors='pt').to("cuda:0")
    #streamer = TextStreamer(tokenizer)
    #pipe = pipeline(
    #    "text-generation",
    #    model=model,
    #    tokenizer=tokenizer,
    #    streamer=streamer,
    #    max_new_tokens=512,
    #    temperature=0.2,
    #    top_p=0.95,
    #    repetition_penalty=1.15
    #)
    # Generate a completion; max_new_tokens mirrors the commented-out pipeline settings
    output = model.generate(**inputs, max_new_tokens=512)
    # Decode the generated token IDs back into text and render the result
    st.markdown(tokenizer.decode(output[0], skip_special_tokens=True))
    #st.write(output[0]['generated_text'])