Spaces:

0504ankitsharma
/

thapargpt_openai

Sleeping

App Files Community

0504ankitsharma committed on Nov 27, 2024

Commit

8c44cf7

verified ·

1 Parent(s): c4e6640

Update app/main.py

Browse files

Files changed (1) hide show

app/main.py +97 -93

app/main.py CHANGED Viewed

@@ -1,10 +1,6 @@
 import os
 import re
-import time
-import nltk
-from fastapi import FastAPI, HTTPException
-from fastapi.middleware.cors import CORSMiddleware
-from pydantic import BaseModel
 from langchain_openai import ChatOpenAI
 from langchain.text_splitter import RecursiveCharacterTextSplitter
 from langchain.chains.combine_documents import create_stuff_documents_chain
@@ -12,30 +8,44 @@ from langchain_core.prompts import ChatPromptTemplate
 from langchain.chains import create_retrieval_chain
 from langchain_community.vectorstores import FAISS
 from langchain_community.document_loaders import UnstructuredWordDocumentLoader as DocxLoader
 from langchain_community.embeddings import HuggingFaceBgeEmbeddings
-# Configure NLTK custom download directory
-NLTK_DATA_PATH = os.getenv("NLTK_DATA_PATH", os.path.join(os.getcwd(), "nltk_data"))
-os.makedirs(NLTK_DATA_PATH, exist_ok=True)
-nltk.data.path.append(NLTK_DATA_PATH)
-# Download necessary NLTK resources
-nltk.download("punkt", download_dir=NLTK_DATA_PATH)
-# Utility function to clean the response
 def clean_response(response):
-    if not response:
-        return "Sorry, I couldn't generate a response."
     cleaned = response.strip()
     cleaned = re.sub(r'^["\']+|["\']+$', '', cleaned)
     cleaned = re.sub(r'\n+', '\n', cleaned)
     cleaned = cleaned.replace('\\n', '')
     return cleaned
-# Initialize FastAPI app
 app = FastAPI()
-# CORS Middleware setup
 app.add_middleware(
     CORSMiddleware,
     allow_origins=["*"],
@@ -44,109 +54,103 @@ app.add_middleware(
     allow_headers=["*"],
 )
-# Global Variables
-openai_api_key = os.getenv('OPENAI_API_KEY')  # Ensure this is set in your environment
-VECTOR_DB_PATH = "./vectors_db"
-DATA_FILE_PATH = "./data/Data.docx"
-MODEL_NAME = "BAAI/bge-base-en"
-# Initialize OpenAI LLM
 llm = ChatOpenAI(
     api_key=openai_api_key,
-    model_name="gpt-4-turbo-preview",  # Use "gpt-3.5-turbo" for cost efficiency if required
-    temperature=0.7,
-)
-# Prompt template
-prompt = ChatPromptTemplate.from_template(
-    """
-    You are a helpful assistant designed specifically for the Thapar Institute of Engineering and Technology (TIET), a renowned technical college. Your task is to answer all queries related to TIET. Every response you provide should be relevant to the context of TIET. If a question falls outside of this context, please decline by stating, 'Sorry, I cannot help with that.' If you do not know the answer to a question, do not attempt to fabricate a response; instead, politely decline.
-    If the query is not related to TIET or falls outside the context of education, respond with:
-            "Sorry, I cannot help with that. I'm specifically designed to answer questions about the Thapar Institute of Engineering and Technology.
-            For more information, please contact at our toll-free number: 18002024100 or E-mail us at [email protected]
-    <context>
-    {context}
-    </context>
-    Question: {input}
-    """
 )
-# Route: Home
 @app.get("/")
 def read_root():
-    return {"message": "Welcome to the ThaparGPT API!"}
-# Route: Chat Endpoint
 class Query(BaseModel):
     query_text: str
-@app.post("/chat")
-def chat(query: Query):
-    try:
-        # Load the vector store
-        embeddings = get_embeddings()
-        vectors = FAISS.load_local(VECTOR_DB_PATH, embeddings, allow_dangerous_deserialization=True)
-    except Exception as e:
-        print(f"Error loading vector store: {str(e)}")
-        raise HTTPException(status_code=500, detail="Vector Store not found or loading failed. Please run /setup first.")
-    # Retrieve and process the query
-    query_text = query.query_text
-    if query_text:
-        start_time = time.process_time()
-        document_chain = create_stuff_documents_chain(llm, prompt)
-        retriever = vectors.as_retriever()
-        retrieval_chain = create_retrieval_chain(retriever, document_chain)
-        try:
-            response = retrieval_chain.invoke({'input': query_text})
-        except Exception as e:
-            print(f"Error during query processing: {str(e)}")
-            raise HTTPException(status_code=500, detail="Error processing the query.")
-        print("Response time:", time.process_time() - start_time)
-        cleaned_response = clean_response(response.get('answer', ''))
-        return {"response": cleaned_response}
-    else:
-        raise HTTPException(status_code=400, detail="No query found in the request.")
-# Route: Setup Endpoint
-@app.get("/setup")
-def setup():
-    return vector_embedding()
-# Utility: Create Vector Embeddings
 def vector_embedding():
     try:
-        if not os.path.exists(DATA_FILE_PATH):
-            print(f"The file {DATA_FILE_PATH} does not exist.")
-            raise HTTPException(status_code=404, detail="Data file not found.")
-        # Load and split document
-        loader = DocxLoader(DATA_FILE_PATH)
         documents = loader.load()
-        print(f"Loaded document: {DATA_FILE_PATH}")
         text_splitter = RecursiveCharacterTextSplitter(chunk_size=500, chunk_overlap=100)
         chunks = text_splitter.split_documents(documents)
         print(f"Created {len(chunks)} chunks.")
-        # Create vector store
-        embeddings = get_embeddings()
-        db = FAISS.from_documents(chunks, embeddings)
-        db.save_local(VECTOR_DB_PATH)
         print("Vector store created and saved successfully.")
-        return {"response": "Vector Store DB is ready."}
     except Exception as e:
-        print(f"Error during setup: {str(e)}")
-        raise HTTPException(status_code=500, detail=f"Error during setup: {str(e)}")
-# Utility: Load Embedding Model
 def get_embeddings():
     encode_kwargs = {'normalize_embeddings': True}
-    return HuggingFaceBgeEmbeddings(model_name=MODEL_NAME, encode_kwargs=encode_kwargs)
-# Main entry point
 if __name__ == "__main__":
     import uvicorn
     uvicorn.run(app, host="0.0.0.0", port=8000)

 import os
 import re
+from openai import OpenAI
 from langchain_openai import ChatOpenAI
 from langchain.text_splitter import RecursiveCharacterTextSplitter
 from langchain.chains.combine_documents import create_stuff_documents_chain
 from langchain.chains import create_retrieval_chain
 from langchain_community.vectorstores import FAISS
 from langchain_community.document_loaders import UnstructuredWordDocumentLoader as DocxLoader
+from fastapi.middleware.cors import CORSMiddleware
+from fastapi import FastAPI
+from pydantic import BaseModel
 from langchain_community.embeddings import HuggingFaceBgeEmbeddings
+import nltk  # Importing NLTK
+import time
+# Configure NLTK data directory
+nltk_data_path = os.path.join(os.getcwd(), 'nltk_data')  # Use a writable directory
+nltk.data.path.append(nltk_data_path)
+# Ensure the directory exists
+if not os.path.exists(nltk_data_path):
+    os.makedirs(nltk_data_path)
+# Download required NLTK resources
+try:
+    nltk.download('punkt', download_dir=nltk_data_path)
+except Exception as e:
+    print(f"Error downloading NLTK resources: {e}")
 def clean_response(response):
+    # Remove any leading/trailing whitespace, including newlines
     cleaned = response.strip()
+    # Remove any enclosing quotation marks
     cleaned = re.sub(r'^["\']+|["\']+$', '', cleaned)
+    # Replace multiple newlines with a single newline
     cleaned = re.sub(r'\n+', '\n', cleaned)
+    # Remove any literal backslash-n sequences (the two characters backslash and 'n'), not real newlines
     cleaned = cleaned.replace('\\n', '')
     return cleaned
 app = FastAPI()
 app.add_middleware(
     CORSMiddleware,
     allow_origins=["*"],
     allow_headers=["*"],
 )
+openai_api_key = os.environ.get('OPENAI_API_KEY')
 llm = ChatOpenAI(
     api_key=openai_api_key,
+    model_name="gpt-4-turbo-preview",  # or "gpt-3.5-turbo" for a more economical option
+    temperature=0.7
 )
 @app.get("/")
 def read_root():
+    return {"Hello": "World"}
 class Query(BaseModel):
     query_text: str
+prompt = ChatPromptTemplate.from_template(
+"""
+You are a helpful assistant designed specifically for the Thapar Institute of Engineering and Technology (TIET), a renowned technical college. Your task is to answer all queries related to TIET. Every response you provide should be relevant to the context of TIET. If a question falls outside of this context, please decline by stating, 'Sorry, I cannot help with that.' If you do not know the answer to a question, do not attempt to fabricate a response; instead, politely decline.
+You may elaborate on your answers slightly to provide more information, but avoid sounding boastful or exaggerating. Stay focused on the context provided.
+If the query is not related to TIET or falls outside the context of education, respond with:
+        "Sorry, I cannot help with that. I'm specifically designed to answer questions about the Thapar Institute of Engineering and Technology.
+        For more information, please contact at our toll-free number: 18002024100 or E-mail us at [email protected]
+<context>
+{context}
+</context>
+Question: {input}
+"""
+)
 def vector_embedding():
     try:
+        file_path = "./data/Data.docx"
+        if not os.path.exists(file_path):
+            print(f"The file {file_path} does not exist.")
+            return {"response": "Error: Data file not found"}
+        loader = DocxLoader(file_path)
         documents = loader.load()
+        print(f"Loaded document: {file_path}")
         text_splitter = RecursiveCharacterTextSplitter(chunk_size=500, chunk_overlap=100)
         chunks = text_splitter.split_documents(documents)
         print(f"Created {len(chunks)} chunks.")
+        model_name = "BAAI/bge-base-en"
+        encode_kwargs = {'normalize_embeddings': True}
+        model_norm = HuggingFaceBgeEmbeddings(model_name=model_name, encode_kwargs=encode_kwargs)
+        db = FAISS.from_documents(chunks, model_norm)
+        db.save_local("./vectors_db")
         print("Vector store created and saved successfully.")
+        return {"response": "Vector Store DB Is Ready"}
     except Exception as e:
+        print(f"An error occurred: {str(e)}")
+        return {"response": f"Error: {str(e)}"}
 def get_embeddings():
+    model_name = "BAAI/bge-base-en"
     encode_kwargs = {'normalize_embeddings': True}
+    model_norm = HuggingFaceBgeEmbeddings(model_name=model_name, encode_kwargs=encode_kwargs)
+    return model_norm
+@app.post("/chat")  # Changed from /anthropic to /chat
+def read_item(query: Query):
+    try:
+        embeddings = get_embeddings()
+        vectors = FAISS.load_local("./vectors_db", embeddings, allow_dangerous_deserialization=True)
+    except Exception as e:
+        print(f"Error loading vector store: {str(e)}")
+        return {"response": "Vector Store Not Found or Error Loading. Please run /setup first."}
+    prompt1 = query.query_text
+    if prompt1:
+        start = time.process_time()
+        document_chain = create_stuff_documents_chain(llm, prompt)
+        retriever = vectors.as_retriever()
+        retrieval_chain = create_retrieval_chain(retriever, document_chain)
+        response = retrieval_chain.invoke({'input': prompt1})
+        print("Response time:", time.process_time() - start)
+        # Apply the cleaning function to the response
+        cleaned_response = clean_response(response['answer'])
+        # For debugging, print the cleaned response
+        print("Cleaned response:", repr(cleaned_response))
+        return cleaned_response
+    else:
+        return "No Query Found"
+@app.get("/setup")
+def setup():
+    return vector_embedding()
 if __name__ == "__main__":
     import uvicorn
     uvicorn.run(app, host="0.0.0.0", port=8000)