import spaces
import gradio as gr
import torch

# Use a pipeline as a high-level helper
from transformers import pipeline

# Free any cached GPU memory before loading the model.
torch.cuda.empty_cache()

print("RUNNING PIPE")
pipe = pipeline("text-generation", model="NousResearch/Hermes-3-Llama-3.1-8B", max_new_tokens=200, device=0)
print("PIPE DONE") 

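# On ZeroGPU Spaces, @spaces.GPU requests GPU hardware for up to `duration`
# seconds while this function runs; CUDA work must happen inside it.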
@spaces.GPU(duration=120)
def llama3_1_8B(question):
    # Wrap the raw question in the chat format the pipeline expects.
    messages = [
        {"role": "user", "content": question},
    ]
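
    # Log the visible CUDA devices to help debug GPU allocation.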
    if torch.cuda.is_available():
        num_devices = torch.cuda.device_count()
        print(f"Number of CUDA devices: {num_devices}")
        
        for i in range(num_devices):
            print(f"Device {i}: {torch.cuda.get_device_name(i)}")
                    
    else:
        print("CUDA is not available.")

    print("GATHERING RESPONSES")
    responses = pipe(messages)

    return str(responses)

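# Unused demo handler (not wired into the Interface below).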
def greet(name):
    return "Hello " + name + "!!???"

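# Simple text-in/text-out UI backed by the Llama handler above.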
demo = gr.Interface(fn=llama3_1_8B, inputs="text", outputs="text")
demo.launch()