Spaces:

Somekindofathing
/

ontology-individuals-filler

Paused

App Files Files Community

theosaurus committed on Jan 23

Commit

fb79cf3

1 Parent(s): b27ab96

Add initial implementation of LLM model with Hugging Face integration

Browse files

Files changed (2) hide show

.gitignore +0 -0
app.py +124 -0

.gitignore ADDED Viewed

File without changes

app.py ADDED Viewed

	@@ -0,0 +1,124 @@

+import gradio as gr
+import spaces
+from huggingface_hub import InferenceClient, login
+import accelerate
+from transformers import pipeline, AutoModelForCausalLM, AutoTokenizer, BitsAndBytesConfig, TextIteratorStreamer
+import numpy as np
+import tempfile
+import os
+from threading import Thread
+import torch
+from rdflib import Graph, Namespace, URIRef, Literal
+from rdflib.namespace import RDF, RDFS, OWL
+from time import time
+from typing import Optional
+# Initialize logging and device information
+print(f"Is CUDA available: {torch.cuda.is_available()}")
+print(f"CUDA device: {torch.cuda.get_device_name(torch.cuda.current_device())}")
+class HuggingFaceLogin:
+    """Handles authentication to the Hugging Face Hub using environment variables or explicit tokens."""
+    def __init__(self, env_token_key: str = "HF_TOKEN"):
+        """Initialize the login handler.
+        Args:
+            env_token_key (str): Environment variable key containing the token. Defaults to "HF_TOKEN".
+        """
+        self.token = os.getenv(env_token_key)
+    def login(self, token: str = None) -> bool:
+        """Authenticate with the Hugging Face Hub.
+        Args:
+            token (Optional[str]): Optional explicit token. If not provided, uses token from environment.
+        Returns:
+            bool: True if login successful, False otherwise.
+        Raises:
+            ValueError: If no token is available (neither in env nor passed explicitly).
+        """
+        if not self.token:
+            raise ValueError("No authentication token provided. Set HF_TOKEN environment variable or pass token explicitly.")
+        try:
+            print("Logging in to the Hugging Face Hub...")
+            login(token=self.token)
+            return True
+        except Exception as e:
+            print(f"Login failed: {str(e)}")
+            return False
+# Prefer the GPU when torch reports CUDA as available, otherwise use the CPU.
+device = "cuda" if torch.cuda.is_available() else "cpu"
+# Hub repository id of the checkpoint loaded below and in initialize_llm().
+model_id = "meta-llama/Llama-3.1-8B-Instruct"
+# Quantization settings: 4-bit NF4 weights with double quantization, using
+# float16 as the compute dtype.
+model_config = BitsAndBytesConfig(
+    load_in_4bit=True,
+    bnb_4bit_compute_dtype=torch.float16,
+    bnb_4bit_use_double_quant=True,
+    bnb_4bit_quant_type="nf4"
+)
+# Loads the quantized model eagerly, at import time.
+# NOTE(review): initialize_llm() loads the same checkpoint again and the
+# __main__ block rebinds llm_model to that result -- confirm whether this
+# eager load is still needed.
+llm_model = AutoModelForCausalLM.from_pretrained(
+    model_id,
+    quantization_config=model_config,
+    device_map="auto")
+@spaces.GPU
+def initialize_llm():
+    """
+    Initialize the LLM with careful memory management.
+    Returns the model and tokenizer configured for efficient memory use.
+    """
+    print("Loading tokenizer...")
+    tokenizer = AutoTokenizer.from_pretrained(model_id)
+    print("Loading model with memory optimizations...")
+    model = AutoModelForCausalLM.from_pretrained(
+        model_id,
+        quantization_config=model_config,
+        device_map="auto",
+        torch_dtype=torch.float16,
+        low_cpu_mem_usage=True,
+        use_cache=False  # Disable KV cache to save memory
+    )
+    return model, tokenizer
+def generate_response(prompt:str, history: Optional[list], llm: Optional[AutoModelForCausalLM], tokenizer, max_length: int = 100) -> str:
+    """
+    Generate a response from the LLM model given a prompt.
+    """
+    messages = [
+        {"role": "system", "content": "You are a pirate."},
+        {"role": "user", "content": f"What do you think I should do about {prompt}?"},
+    ]
+    tokenized_chat = tokenizer.apply_chat_template(messages,
+                                                   tokenize=True,
+                                                   add_generation_prompt=True,
+                                                   return_tensors="pt",
+                                                   max_length= max_length)
+    with torch.no_grad():
+        output = llm.generate(
+            **tokenized_chat,
+            do_sample=True,
+            max_length=max_length,
+            pad_token_id=tokenizer.eos_token_id,
+            num_return_sequences=1
+        )
+    return output
+# Chat UI whose callback is generate_response, using the "messages" format.
+demo =  gr.ChatInterface(
+    fn=generate_response,
+    type="messages"
+    )
+if __name__ == "__main__":
+    # Authenticate with the Hub; login() raises ValueError when no token is
+    # configured.
+    auth = HuggingFaceLogin()
+    # NOTE(review): the boolean result is ignored, so a failed login does not
+    # stop startup -- confirm this is intended.
+    auth.login()
+    # Initialize the model and tokenizer
+    llm_model, llm_tokenizer = initialize_llm()
+    # Start the Gradio app.
+    demo.launch()