Shawn732 committed
Commit df8bb52 · 1 Parent(s): 5bc5847

1st Init Commit

Files changed (7)
  1. Dockerfile +59 -0
  2. doc_reader.py +53 -0
  3. main.py +232 -0
  4. model.py +83 -0
  5. requirements.txt +15 -0
  6. start_server.sh +22 -0
  7. streamlit_app.py +23 -0
Dockerfile ADDED
@@ -0,0 +1,59 @@
+ # Use an NVIDIA CUDA base image
+ ARG CUDA_IMAGE="12.1.1-devel-ubuntu22.04"
+ FROM nvidia/cuda:${CUDA_IMAGE}
+
+ ENV HOST=0.0.0.0
+
+ # Set the working directory in the container to /app
+ #WORKDIR /app
+
+ RUN mkdir -p /app/cache && chmod -R 777 /app/cache
+
+ ENV HF_HOME=/app/cache
+
+ # Install Python and pip
+ RUN apt-get update && apt-get upgrade -y \
+     && apt-get install -y git build-essential \
+     python3 python3-pip gcc wget \
+     ocl-icd-opencl-dev opencl-headers clinfo \
+     libclblast-dev libopenblas-dev \
+     && mkdir -p /etc/OpenCL/vendors && echo "libnvidia-opencl.so.1" > /etc/OpenCL/vendors/nvidia.icd
+
+ ENV CUDA_DOCKER_ARCH=all
+ ENV LLAMA_CUBLAS=1
+
+ # Copy the current directory contents into the container at /app
+ COPY . /app
+
+ # Install required packages from requirements.txt
+ COPY ./requirements.txt /app/requirements.txt
+ RUN pip3 install --no-cache-dir -r /app/requirements.txt
+
+ # Expose the ports for FastAPI and Streamlit
+ EXPOSE 8000
+ EXPOSE 8501
+
+ RUN useradd -m -u 1000 user
+ # Switch to the "user" user
+ USER user
+ WORKDIR /home/user/app
+ # Set home to the user's home directory
+ ENV HOME=/home/user \
+     PATH=/home/user/.local/bin:$PATH \
+     PYTHONPATH=$HOME/app \
+     PYTHONUNBUFFERED=1 \
+     GRADIO_ALLOW_FLAGGING=never \
+     GRADIO_NUM_PORTS=1 \
+     GRADIO_SERVER_NAME=0.0.0.0 \
+     GRADIO_THEME=huggingface \
+     SYSTEM=spaces
+
+ # Copy the current directory contents into the container at $HOME/app, setting the owner to the user
+ COPY --chown=user . /home/user/app
+
+ # Copy and give execute permissions to the start script
+ COPY start_server.sh /app/start_server.sh
+ RUN chmod +x /app/start_server.sh
+
+ # Run the start script
+ CMD ["/app/start_server.sh"]
doc_reader.py ADDED
@@ -0,0 +1,53 @@
+ import glob
+ from langchain.text_splitter import RecursiveCharacterTextSplitter
+ from langchain_community.embeddings import HuggingFaceEmbeddings
+ from langchain_community.vectorstores import Qdrant
+ from langchain_community.document_loaders import PyPDFLoader
+ from langchain_core.documents.base import Document
+
+ class DocReader:
+     def __init__(self, pdf_path, model_path="sentence-transformers/all-mpnet-base-v2", persist_directory="db"):
+         self.pdfs = glob.glob(f"{pdf_path}/*.pdf")  # Collect all PDF files in the folder
+         self.model_path = model_path
+         self.persist_directory = persist_directory
+
+     def load_pdfs(self):
+         all_pages = []
+         for pdf_file in self.pdfs:
+             loader = PyPDFLoader(pdf_file)
+             pages = loader.load()
+             all_pages.extend(pages)
+         return all_pages
+
+     def convert_to_markdown(self, documents):
+         markdown_text = ""
+         for doc in documents:
+             page_text = doc.page_content.replace('\n', '\n\n')  # Add extra newline for Markdown
+             markdown_text += page_text + "\n\n---\n\n"
+         return markdown_text
+
+     def split_text(self, pages):
+         text_splitter = RecursiveCharacterTextSplitter(
+             chunk_size=128,
+             chunk_overlap=24)
+         documents = [Document(page_content=page) for page in pages]
+         split_documents = text_splitter.split_documents(documents)
+         texts = [doc.page_content for doc in split_documents]
+
+         return texts
+
+     def generate_embeddings(self, texts):
+         embeddings = HuggingFaceEmbeddings(
+             model_name=self.model_path,
+             model_kwargs={"device": "cuda:0"},
+             encode_kwargs={"normalize_embeddings": True},
+         )
+         documents = [Document(page_content=text) for text in texts]
+
+         # Keep the vector store on the instance so search_similar() can use it
+         self.db = Qdrant.from_documents(documents, embeddings, location=":memory:", collection_name="pdf_collection")
+         return self.db
+
+     def search_similar(self, input_text, k=3):
+         results = self.db.similarity_search(input_text, k=k)
+         return results
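A minimal usage sketch for DocReader, mirroring the flow that load_pdfs() in main.py follows; the "./docs" folder and the query string are illustrative assumptions, and the embedding step expects a CUDA device since the class pins HuggingFaceEmbeddings to cuda:0:

# Hypothetical standalone use of DocReader; "./docs" and the query are illustrative.
from doc_reader import DocReader

reader = DocReader("./docs")                        # collects ./docs/*.pdf
pages = reader.load_pdfs()                          # one Document per PDF page
markdown_text = reader.convert_to_markdown(pages)   # join pages into a single Markdown string
chunks = reader.split_text([markdown_text])         # 128-character chunks with 24-character overlap
db = reader.generate_embeddings(chunks)             # builds an in-memory Qdrant collection
for doc in reader.search_similar("how do I restart sbox-admin?", k=3):
    print(doc.page_content)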
main.py ADDED
@@ -0,0 +1,232 @@
+ # main.py
+ import logging
+ from fastapi import FastAPI, Request
+ from fastapi.middleware.cors import CORSMiddleware
+ import nest_asyncio
+ from pyngrok import ngrok
+ import uvicorn
+ import json
+ from model import Model
+ from doc_reader import DocReader
+ from transformers import GenerationConfig, pipeline
+ from langchain_community.llms.huggingface_pipeline import HuggingFacePipeline
+ from langchain.prompts import PromptTemplate
+ from langchain.chains import RetrievalQA
+ from langchain.schema.runnable import RunnableBranch
+ from langchain_core.runnables import RunnableLambda
+
+
+ # Logger configuration
+ logging.basicConfig(level=logging.INFO,
+                     format='%(asctime)s [%(levelname)s] %(message)s',
+                     datefmt='%Y-%m-%d %H:%M:%S')
+ logger = logging.getLogger(__name__)
+
+ # Add path to sys
+ # sys.path.insert(0,'/opt/accelerate')
+ # sys.path.insert(0,'/opt/uvicorn')
+ # sys.path.insert(0,'/opt/pyngrok')
+ # sys.path.insert(0,'/opt/huggingface_hub')
+ # sys.path.insert(0,'/opt/nest_asyncio')
+ # sys.path.insert(0,'/opt/transformers')
+ # sys.path.insert(0,'/opt/pytorch')
+
+ # Initialize FastAPI app
+ app = FastAPI()
+ NGROK_TOKEN = "2aQUM6MDkhjcPEBbIFTiu4cZBBr_sMMei8h5yejFbxFeMFuQ"  # Replace with your NGROK token
+ #MODEL_NAME = "/opt/Llama-2-13B-chat-GPTQ"
+ #MODEL_NAME = "MediaTek-Research/Breeze-7B-Instruct-64k-v0.1"
+ MODEL_NAME = "codellama/CodeLlama-7b-Instruct-hf"
+ PDF_PATH = "/opt/docs"
+ CLASSIFIER_MODEL_NAME = "roberta-large-mnli"
+
+ # Add CORS middleware
+ app.add_middleware(
+     CORSMiddleware,
+     allow_origins=['*'],
+     allow_credentials=True,
+     allow_methods=['*'],
+     allow_headers=['*'],
+ )
+
+ model_instance = Model(MODEL_NAME)
+ model_instance.load()
+ #model_instance.load(model_name_or_path=GGUF_HUGGINGFACE_REPO, model_basename=GGUF_HUGGINGFACE_BIN_FILE)
+
+ # classifier_model = pipeline("zero-shot-classification",
+ #                             model=CLASSIFIER_MODEL_NAME)
+
+
+ @app.post("/predict")
+ async def predict_text(request: Request):
+     try:
+         # Parse request body as JSON
+         request_body = await request.json()
+
+         prompt = request_body.get("prompt", "")
+         # TODO: handle additional parameters like 'temperature' or 'max_tokens' if needed
+         result = general_chain.invoke({"question": prompt})
+         logger.info(f"Result: {result}")
+         formatted_response = {
+             "choices": [
+                 {
+                     "message": {
+                         "content": result['result']
+                     }
+                 }
+             ]
+         }
+         return formatted_response
+     except json.JSONDecodeError:
+         return {"error": "Invalid JSON format"}
+
+ def load_pdfs():
+     global db
+     doc_reader = DocReader(PDF_PATH)
+     # Load PDFs and convert to Markdown
+     pages = doc_reader.load_pdfs()
+     markdown_text = doc_reader.convert_to_markdown(pages)
+     texts = doc_reader.split_text([markdown_text])  # split_text takes a list of Markdown texts
+     # Generate embeddings
+     db = doc_reader.generate_embeddings(texts)
+
+ # def classify_sequence(input_data):
+ #     sequence_to_classify = input_data["question"]
+ #     candidate_labels = ['LinuxCommand', 'TechnicalSupport', 'GeneralResponse']
+ #     classification = classifier_model(sequence_to_classify, candidate_labels)
+ #     # Extract the label with the highest score
+ #     return {"topic": classification['labels'][0], "question": sequence_to_classify}
+
+ def format_output(output):
+     return {"result": output}
+
+ def setup_chain():
+     #global full_chain
+     #global classifier_chain
+     global command_chain
+     #global support_chain
+     global general_chain
+     generation_config = GenerationConfig.from_pretrained(MODEL_NAME)
+     generation_config.max_new_tokens = 1024
+     generation_config.temperature = 0.3
+     generation_config.top_p = 0.9
+     generation_config.do_sample = True
+     generation_config.repetition_penalty = 1.15
+
+     text_pipeline = pipeline(
+         "text-generation",
+         model=model_instance.model,
+         tokenizer=model_instance.tokenizer,
+         return_full_text=True,
+         generation_config=generation_config,
+     )
+
+     llm = HuggingFacePipeline(pipeline=text_pipeline)
+
+     # Classifier
+     #classifier_runnable = RunnableLambda(classify_sequence)
+     # Formatter
+     output_runnable = RunnableLambda(format_output)
+
+     # System Commands
+     command_template = """
+ [INST] <<SYS>>
+ As a Gemini Central engineer specializing in Linux, evaluate the user's input and choose the most likely command they want to execute from these options:
+ - 'systemctl stop sbox-admin'
+ - 'systemctl start sbox-admin'
+ - 'systemctl restart sbox-admin'
+ Respond with the chosen command. If uncertain, reply with 'No command will be executed'.
+ <</SYS>>
+ question:
+ {question}
+ answer:
+ [/INST]"""
+     command_chain = (PromptTemplate(template=command_template, input_variables=["question"]) | llm | output_runnable)
+
+     # Support
+     # support_template = """
+     # [INST] <<SYS>>
+     # Act as a Gemini support engineer who is good at reading technical data. Use the following information to answer the question at the end.
+     # <</SYS>>
+     # {context}
+     # {question}
+     # answer:
+     # [/INST]
+     # """
+
+
+     # General
+     general_template = """
+ [INST] <<SYS>>
+ You are an advanced AI assistant designed to provide assistance with a wide range of queries.
+ Users may request you to assume various roles or perform diverse tasks.
+ <</SYS>>
+ question:
+ {question}
+ answer:
+ [/INST]"""
+     general_chain = (PromptTemplate(template=general_template, input_variables=["question"]) | llm | output_runnable)
+
+     #support_prompt = PromptTemplate(template=support_template, input_variables=["context","question"])
+
+     #support_chain = RetrievalQA.from_llm(llm=llm, retriever=db.as_retriever(), prompt=support_prompt, input_key="question", return_source_documents=True, verbose=True)
+
+     # support_chain = RetrievalQA.from_chain_type(
+     #     llm=llm,
+     #     chain_type="stuff",
+     #     #retriever=db.as_retriever(search_kwargs={"k": 3}),
+     #     retriever=db.as_retriever(),
+     #     input_key="question",
+     #     return_source_documents=True,
+     #     chain_type_kwargs={"prompt": support_prompt},
+     #     verbose=False
+     # )
+     # logger.info("support chain loaded successfully.")
+
+     # branch = RunnableBranch(
+     #     (lambda x: x == "command", command_chain),
+     #     (lambda x: x == "support", support_chain),
+     #     general_chain,  # Default chain
+     # )
+
+     # def route_classification(output):
+     #     if output['topic'] == 'LinuxCommand':
+     #         logger.info("Routing to command chain")
+     #         return command_chain
+     #     elif output['topic'] == 'TechnicalSupport':
+     #         logger.info("Routing to support chain")
+     #         return support_chain
+     #     else:
+     #         logger.info("Routing to general chain")
+     #         return general_chain
+
+     # routing_runnable = RunnableLambda(route_classification)
+
+     # Full chain integration
+     #full_chain = classifier_runnable | routing_runnable
+
+     #logger.info("Full chain loaded successfully.")
+     return general_chain
+
+
+ ###############
+ # launch once at startup
+ #load_pdfs()
+ setup_chain()
+ ###############
+
+ if __name__ == "__main__":
+
+     if NGROK_TOKEN is not None:
+         ngrok.set_auth_token(NGROK_TOKEN)
+
+     ngrok_tunnel = ngrok.connect(8000)
+     public_url = ngrok_tunnel.public_url
+
+     print('Public URL:', public_url)
+     print("You can use {}/predict to get the assistant result.".format(public_url))
+     logger.info("You can use {}/predict to get the assistant result.".format(public_url))
+
+     nest_asyncio.apply()
+     uvicorn.run(app, host="0.0.0.0", port=8000)
+
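For reference, a small client sketch for the /predict route defined above; it assumes the API is reachable at http://localhost:8000 (as in start_server.sh), and the prompt text is only an example:

# Hypothetical client for the /predict endpoint; the URL and prompt are illustrative.
import requests

resp = requests.post(
    "http://localhost:8000/predict",
    json={"prompt": "How do I restart the sbox-admin service?"},
)
resp.raise_for_status()
# predict_text() wraps the chain output in an OpenAI-style "choices" structure.
print(resp.json()["choices"][0]["message"]["content"])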
model.py ADDED
@@ -0,0 +1,83 @@
+ # model.py
+ import logging
+ import torch
+ from transformers import AutoTokenizer, AutoModelForCausalLM, AutoConfig
+
+ # Logger configuration
+ logging.basicConfig(level=logging.INFO,
+                     format='%(asctime)s [%(levelname)s] %(message)s',
+                     datefmt='%Y-%m-%d %H:%M:%S')
+ logger = logging.getLogger(__name__)
+
+ #model_path = "/opt/Llama-2-13B-chat-GPTQ"
+
+ class Model:
+     def __init__(self, model_path):
+         self.model_name = model_path
+         self.model = None
+         self.tokenizer = None
+         self.loaded = False
+
+     def load(self, precision='fp16'):
+         try:
+             # Check if CUDA is available
+             if not torch.cuda.is_available():
+                 raise EnvironmentError("CUDA not available.")
+             # Set precision settings
+             if precision == 'fp16':
+                 torch_dtype = torch.float16
+             else:
+                 torch_dtype = torch.float32
+
+             # Initialize tokenizer
+             self.tokenizer = AutoTokenizer.from_pretrained(self.model_name)
+
+             # Set up model configuration
+             config = AutoConfig.from_pretrained(self.model_name)
+
+             #config.quantization_config["disable_exllama"] = False
+             #config.quantization_config["use_exllama"] = True
+             #config.quantization_config["exllama_config"] = {"version": 2}
+
+             # Load model with configuration and precision
+             self.model = AutoModelForCausalLM.from_pretrained(
+                 self.model_name,
+                 config=config,
+                 device_map="cuda:0",  # Set to GPU 0
+                 torch_dtype=torch_dtype
+             )
+
+             self.loaded = True
+             logger.info(f"Model loaded successfully on GPU with {precision} precision.")
+         except Exception as e:
+             logger.error(f"Error loading model: {e}")
+
+     def predict(self, input_text, max_length=50):
+         if not self.loaded:
+             logger.error("Model not loaded. Please load the model before prediction.")
+             return None
+
+         logger.info("========== Start Prediction ==========")
+         try:
+             # Ensure the input_text is a string
+             if not isinstance(input_text, str):
+                 raise ValueError("Input text must be a string.")
+
+             # Encode the input text
+             input_ids = self.tokenizer.encode(input_text, return_tensors='pt')
+
+             # Move input to the same device as the model
+             input_ids = input_ids.to(next(self.model.parameters()).device)
+
+             # Generate output using the model
+             outputs = self.model.generate(input_ids, max_length=max_length)
+
+             # Decode and return the generated text
+             response = self.tokenizer.decode(outputs[0], skip_special_tokens=True)
+             logger.info("Response: {}".format(response))
+         except Exception as e:
+             logger.error(f"Error during prediction: {e}")
+             response = None
+
+         logger.info("========== End Prediction ==========")
+         return response
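A short sketch of driving Model directly, outside the LangChain pipeline in main.py; it assumes a CUDA-capable GPU (load() only logs an error and leaves loaded as False otherwise), reuses the MODEL_NAME from main.py, and the prompt is illustrative:

# Hypothetical standalone use of Model; requires a CUDA-capable GPU.
from model import Model

m = Model("codellama/CodeLlama-7b-Instruct-hf")
m.load(precision="fp16")
if m.loaded:
    print(m.predict("Write a one-line bash command that counts files in /opt/docs.", max_length=128))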
requirements.txt ADDED
@@ -0,0 +1,15 @@
+ fastapi
+ nest-asyncio
+ pyngrok
+ uvicorn
+ accelerate
+ transformers
+ sentence-transformers
+ torch
+ auto-gptq
+ optimum
+ huggingface_hub
+ langchain
+ pypdf
+ qdrant-client
+ streamlit
start_server.sh ADDED
@@ -0,0 +1,22 @@
+ #!/bin/bash
+
+ # Start FastAPI app
+ echo "Starting FastAPI app..."
+ #uvicorn main:app --reload &
+ python3 main.py &
+ # Store FastAPI process ID
+ FASTAPI_PID=$!
+
+ # Start Streamlit app
+ echo "Starting Streamlit app..."
+ streamlit run streamlit_app.py &
+
+ # Store Streamlit process ID
+ STREAMLIT_PID=$!
+
+ # Wait for any process to exit
+ wait -n
+
+ # Kill the other process when one exits
+ kill -TERM $FASTAPI_PID
+ kill -TERM $STREAMLIT_PID
streamlit_app.py ADDED
@@ -0,0 +1,23 @@
+ import streamlit as st
+ import requests
+
+ # Run this with: streamlit run streamlit_app.py
+ # Streamlit interface
+ st.title("Gemini Central Console Bot")
+ user_input = st.text_input("Enter your text here")
+ url = "http://localhost:8000/predict"  # URL of your FastAPI predict endpoint
+
+ if st.button("Submit"):
+     # Prepare the payload
+     payload = {"prompt": user_input}
+
+     # Send the request to the FastAPI endpoint
+     response = requests.post(url, json=payload)
+
+     # Display the response
+     if response.status_code == 200:
+         result = response.json()
+         content = result["choices"][0]["message"]["content"]
+         st.write(content)
+     else:
+         st.write("Failed to get response")