Spaces:

rodrigomasini
/

rephrase

Paused

File size: 2,038 Bytes

e008ae1
7843ac8
55ed521
f5ab0cc
 
fd4c28d
2995eda
f5ab0cc
 
 
 
 
 
55ed521
9e7be30
55ed521
9e7be30
55ed521
f5ab0cc
cd04c80
f5ab0cc
900693a
2995eda
9da18aa
4305b4c
fd4c28d
 
 
 
2995eda
 
 
 
55ed521
 
 
 
 
 
 
 
 
2995eda
9ebdc85
 
 
 
 
55ed521
9ebdc85
 
 
 
 
 
 
 
 
 
55ed521

import gc
import os

import streamlit as st
import torch
from auto_gptq import AutoGPTQForCausalLM
from huggingface_hub import snapshot_download
from transformers import AutoTokenizer, TextStreamer, pipeline

# Hub repository that hosts the quantized weights, and the local folder
# (under the current working directory) they are downloaded into.
pretrained_model_dir = "FPHam/Jackson_The_Formalizer_V2_13b_GPTQ"
cwd = os.getcwd()
quantized_model_dir = f"{cwd}/Jackson2-4bit-128g-GPTQ"

# Download the snapshot only when the local folder is missing or empty.
if not (os.path.exists(quantized_model_dir) and os.listdir(quantized_model_dir)):
    # Make sure the target folder exists before downloading into it.
    os.makedirs(quantized_model_dir, exist_ok=True)
    snapshot_download(
        repo_id=pretrained_model_dir,
        local_dir=quantized_model_dir,
        local_dir_use_symlinks=True,
    )

# Render the folder listing on the page.
st.write(f'{os.listdir(quantized_model_dir)}')

# Names consumed by the model-loading step below.
model_basename = "Jackson2-4bit-128g-GPTQ"
model_name_or_path = quantized_model_dir

#os.environ['CUDA_VISIBLE_DEVICES'] = '0'

use_triton = False


@st.cache_resource
def _load_tokenizer_and_model(model_dir, basename, triton):
    """Load the tokenizer and the GPTQ-quantized model once per server process.

    Streamlit re-executes the whole script on every widget interaction.
    Caching the loaded objects avoids re-reading the checkpoint and
    re-allocating GPU memory on each rerun.

    Args:
        model_dir: Local directory containing the tokenizer and weights.
        basename: Value forwarded as ``model_basename`` to
            ``AutoGPTQForCausalLM.from_quantized``.
        triton: Value forwarded as ``use_triton``.

    Returns:
        A ``(tokenizer, model)`` tuple; the model is placed on ``cuda:0``.
    """
    # Before allocating or loading the model, clear up memory.
    gc.collect()
    torch.cuda.empty_cache()

    loaded_tokenizer = AutoTokenizer.from_pretrained(model_dir, use_fast=True, legacy=False)
    loaded_model = AutoGPTQForCausalLM.from_quantized(
        model_dir,
        model_basename=basename,
        use_safetensors=True,
        trust_remote_code=True,
        device="cuda:0",
        use_triton=triton,
        quantize_config=None,
    )
    return loaded_tokenizer, loaded_model


tokenizer, model = _load_tokenizer_and_model(model_name_or_path, model_basename, use_triton)

user_input = st.text_input("Input a phrase")

# Prompt layout: one USER turn followed by an open ASSISTANT turn for the
# model to complete.
prompt_template = f'USER: {user_input}\nASSISTANT:'

if st.button("Generate the prompt"):
    if not user_input.strip():
        # Nothing to rephrase; skip the generation pass.
        st.warning("Please enter a phrase first.")
    else:
        # The pipeline tokenizes the prompt itself, so no manual input_ids
        # tensor is needed here.
        streamer = TextStreamer(tokenizer)
        # NOTE(review): temperature/top_p normally only take effect when
        # sampling is enabled; do_sample is not passed here - confirm whether
        # sampling is intended.
        pipe = pipeline(
            "text-generation",
            model=model,
            tokenizer=tokenizer,
            streamer=streamer,
            max_new_tokens=512,
            temperature=0.2,
            top_p=0.95,
            repetition_penalty=1.15,
        )
        # Run generation once and display the first returned sequence.
        output = pipe(prompt_template)
        st.write(output[0]['generated_text'])