theosaurus committed
Commit fb79cf3 · 1 Parent(s): b27ab96

Add initial implementation of LLM model with Hugging Face integration

Files changed (2)
  1. .gitignore +0 -0
  2. app.py +124 -0
.gitignore ADDED
File without changes
app.py ADDED
@@ -0,0 +1,124 @@
+ import gradio as gr
+ import spaces
+ from huggingface_hub import InferenceClient, login
+ import accelerate
+ from transformers import pipeline, AutoModelForCausalLM, AutoTokenizer, BitsAndBytesConfig, TextIteratorStreamer
+ import numpy as np
+ import tempfile
+ import os
+ from threading import Thread
+ import torch
+ from rdflib import Graph, Namespace, URIRef, Literal
+ from rdflib.namespace import RDF, RDFS, OWL
+ from time import time
+ from typing import Optional
+
+
+ # Log device information; guard the device-name lookup so CPU-only hosts don't crash
+ print(f"Is CUDA available: {torch.cuda.is_available()}")
+ if torch.cuda.is_available():
+     print(f"CUDA device: {torch.cuda.get_device_name(torch.cuda.current_device())}")
+
+ class HuggingFaceLogin:
+     """Handles authentication to the Hugging Face Hub using environment variables or explicit tokens."""
+
+     def __init__(self, env_token_key: str = "HF_TOKEN"):
+         """Initialize the login handler.
+
+         Args:
+             env_token_key (str): Environment variable key containing the token. Defaults to "HF_TOKEN".
+         """
+         self.token = os.getenv(env_token_key)
+
+     def login(self, token: Optional[str] = None) -> bool:
+         """Authenticate with the Hugging Face Hub.
+
+         Args:
+             token (Optional[str]): Optional explicit token. If not provided, uses the token from the environment.
+
+         Returns:
+             bool: True if login succeeded, False otherwise.
+
+         Raises:
+             ValueError: If no token is available (neither in the environment nor passed explicitly).
+         """
+         if token:
+             self.token = token  # An explicit token overrides the environment token
+         if not self.token:
+             raise ValueError("No authentication token provided. Set HF_TOKEN environment variable or pass token explicitly.")
+         try:
+             print("Logging in to the Hugging Face Hub...")
+             login(token=self.token)
+             return True
+         except Exception as e:
+             print(f"Login failed: {str(e)}")
+             return False
+
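+ # A minimal usage sketch (comments only, not executed here; assumes HF_TOKEN is
+ # set as a Space secret or environment variable):
+ #
+ #     auth = HuggingFaceLogin()
+ #     if auth.login():
+ #         ...  # gated models such as meta-llama/Llama-3.1-8B-Instruct can now be downloaded
+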
+ device = "cuda" if torch.cuda.is_available() else "cpu"
+ model_id = "meta-llama/Llama-3.1-8B-Instruct"
+ # 4-bit NF4 quantization with double quantization to cut GPU memory for the 8B model
+ model_config = BitsAndBytesConfig(
+     load_in_4bit=True,
+     bnb_4bit_compute_dtype=torch.float16,
+     bnb_4bit_use_double_quant=True,
+     bnb_4bit_quant_type="nf4",
+ )
+ # The model itself is loaded lazily in initialize_llm() so the load runs under
+ # @spaces.GPU, rather than at import time when no GPU may be attached yet.
+
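+ # Back-of-the-envelope memory check (approximate): 8B parameters at 4 bits is
+ # roughly 4 GB of weights, versus roughly 16 GB in float16, which is why the
+ # quantized load fits on a single mid-range GPU.
+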
+ @spaces.GPU
+ def initialize_llm():
+     """
+     Initialize the LLM with careful memory management.
+     Returns the model and tokenizer configured for efficient memory use.
+     """
+     print("Loading tokenizer...")
+     tokenizer = AutoTokenizer.from_pretrained(model_id)
+
+     print("Loading model with memory optimizations...")
+     model = AutoModelForCausalLM.from_pretrained(
+         model_id,
+         quantization_config=model_config,
+         device_map="auto",
+         torch_dtype=torch.float16,
+         low_cpu_mem_usage=True,
+         use_cache=False,  # Disable KV cache to save memory
+     )
+
+     return model, tokenizer
+
+ @spaces.GPU  # request a GPU per call on ZeroGPU Spaces; a no-op elsewhere
+ def generate_response(prompt: str, history: list, max_length: int = 100) -> str:
+     """
+     Generate a response from the LLM model given a prompt.
+     gr.ChatInterface calls this with (message, history) only, so the model and
+     tokenizer come from the module-level globals set in __main__.
+     """
+     messages = [
+         {"role": "system", "content": "You are a pirate."},
+         {"role": "user", "content": f"What do you think I should do about {prompt}?"},
+     ]
+     tokenized_chat = llm_tokenizer.apply_chat_template(
+         messages,
+         tokenize=True,
+         add_generation_prompt=True,
+         return_tensors="pt",
+         return_dict=True,  # return input_ids and attention_mask so they can be unpacked into generate()
+     ).to(llm_model.device)
+     with torch.no_grad():
+         output = llm_model.generate(
+             **tokenized_chat,
+             do_sample=True,
+             max_new_tokens=max_length,  # cap the reply length without truncating the prompt
+             pad_token_id=llm_tokenizer.eos_token_id,
+             num_return_sequences=1,
+         )
+
+     # Decode only the newly generated tokens, skipping the echoed prompt
+     return llm_tokenizer.decode(output[0][tokenized_chat["input_ids"].shape[-1]:], skip_special_tokens=True)
+
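+ # Example invocation (hypothetical, mirroring how gr.ChatInterface calls the
+ # function once the model has been initialized in __main__):
+ #
+ #     reply = generate_response("buried treasure", history=[])
+ #     print(reply)  # a pirate-voiced suggestion, at most max_length new tokens
+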
+ demo = gr.ChatInterface(
+     fn=generate_response,
+     type="messages",
+ )
+
+
+ if __name__ == "__main__":
+     auth = HuggingFaceLogin()
+     auth.login()
+
+     # Initialize the model and tokenizer used by generate_response
+     llm_model, llm_tokenizer = initialize_llm()
+     demo.launch()