Spaces:

NaimaAqeel
/

Chatbot

Build error

File size: 4,437 Bytes

d382509
 
409f81b
 
 
 
 
 
47ecda0
 
 
2c02a9e
d382509
 
 
 
 
 
 
 
 
 
 
2c02a9e
3a0b46d
2c02a9e
f7133fb
 
 
 
 
 
 
 
4d0c42b
409f81b
261cad3
3a0b46d
ba470cd
3a0b46d
4d0c42b
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
f812db9
9afffa7
 
 
 
2c02a9e
8ceb607
6e6d28c
cd89674
 
d382509
f7133fb
cd89674
d382509
 
 
 
 
 
3a0b46d
 
 
 
 
 
 
 
 
6e6d28c
ba470cd
8ab4823
3a0b46d
 
 
 
 
 
 
 
 
 
 
 
 
9599ad9
9afffa7
d382509
773aab2
d382509
773aab2
 
 
 
3a0b46d
 
 
 
d382509
 
 
 
 
 
 
 
 
 
 
 
 
 
 
f812db9
 
2c02a9e
261cad3
ba470cd
6e6d28c
70fd172
fa02121
1d0faab
47ecda0
 
 
0385c04
 
84f3457
 
 
409f81b
d7100c1



import os
from docx import Document
from sentence_transformers import SentenceTransformer
import faiss
import numpy as np
import pickle
from langchain_community.llms import HuggingFaceEndpoint
from langchain_community.vectorstores import FAISS
from langchain_community.embeddings import HuggingFaceEmbeddings

# Initialize the embedding model
# This SentenceTransformer instance is what the functions below use both to
# embed document sentences for the FAISS index and to embed incoming queries,
# so indexed vectors and query vectors come from the same model.
embedding_model = SentenceTransformer('all-MiniLM-L6-v2')

# Initialize the HuggingFace LLM
# Client for the hosted gpt2 inference endpoint. The token is read from the
# HUGGINGFACEHUB_API_TOKEN environment variable (os.getenv yields None if unset).
# NOTE(review): the token is passed inside model_kwargs under "api_key" --
# confirm HuggingFaceEndpoint actually treats that entry as a credential.
llm = HuggingFaceEndpoint(
    endpoint_url="https://api-inference.huggingface.co/models/gpt2",
    model_kwargs={"api_key": os.getenv('HUGGINGFACEHUB_API_TOKEN')}
)

# Initialize the HuggingFace embeddings
# NOTE(review): `embedding` is not referenced by any other code visible in
# this file -- TODO confirm whether it is still needed.
embedding = HuggingFaceEmbeddings()

# Function to extract text from a Word document
def extract_text_from_docx(docx_path):
    """Return the text of a Word document, one paragraph per line.

    Args:
        docx_path: Path of the ``.docx`` file to read.

    Returns:
        The paragraph texts joined with newlines. If anything goes wrong
        while opening or reading the document, the error is printed and an
        empty string is returned instead (best-effort extraction).
    """
    try:
        paragraphs = Document(docx_path).paragraphs
        return "\n".join(paragraph.text for paragraph in paragraphs)
    except Exception as err:
        print(f"Error extracting text from DOCX: {err}")
        return ""

# Load or create FAISS index
# Both files live in the current working directory.
index_path = "faiss_index.pkl"
document_texts_path = "document_texts.pkl"

# document_texts[i] is the sentence whose embedding was added to the index
# i-th (upload_files appends to both in the same order), so the two must
# always be loaded, or reset, together.
document_texts = []
index = None
load_failed = False

if os.path.exists(index_path) and os.path.exists(document_texts_path):
    try:
        # SECURITY: pickle.load can execute arbitrary code embedded in the
        # file. These paths must only ever hold data written by this app.
        with open(index_path, "rb") as f:
            loaded_index = pickle.load(f)
        with open(document_texts_path, "rb") as f:
            loaded_texts = pickle.load(f)
    except Exception as e:
        load_failed = True
        print(f"Error loading FAISS index or document texts: {e}")
    else:
        # Adopt the pair only when BOTH loaded, so a half-successful load
        # can never leave the index and the sentence list out of step.
        index = loaded_index
        document_texts = loaded_texts
        print("Loaded FAISS index from faiss_index.pkl")
        print("Loaded document texts from document_texts.pkl")

if index is None:
    # Either nothing was persisted yet or loading failed. In both cases the
    # rest of the module needs a usable index, so build an empty one.
    index = faiss.IndexFlatL2(embedding_model.get_sentence_embedding_dimension())
    if load_failed:
        # Leave the unreadable files on disk untouched for inspection.
        print("Falling back to a new, empty in-memory FAISS index")
    else:
        with open(index_path, "wb") as f:
            pickle.dump(index, f)
        print("Created new FAISS index and saved to faiss_index.pkl")

def preprocess_text(text):
    """Normalise one sentence before it is embedded.

    Currently this only trims leading and trailing whitespace; further
    clean-up steps can be added here.
    """
    cleaned = text.strip()
    return cleaned

def upload_files(files):
    """Add the contents of uploaded ``.docx`` files to the index and save it.

    Each document is split on newlines; every non-blank line is cleaned with
    ``preprocess_text``, embedded with ``embedding_model`` and appended to the
    module-level ``index``, while the line itself is appended to
    ``document_texts`` at the matching position. Afterwards both are pickled
    to ``index_path`` / ``document_texts_path``.

    Args:
        files: Iterable of uploaded files. Each item may be an object
            exposing the path as ``.name`` or a plain path string. ``None``
            is treated as "no files". Items whose path does not end in
            ``.docx`` (case-insensitive) are ignored.

    Returns:
        ``"Files processed successfully"`` on success, otherwise a string
        starting with ``"Error processing files: "`` describing the failure.
        Exceptions are reported through the return value, not raised.
    """
    global index, document_texts
    try:
        for file in files or []:
            # Work with both upload objects (path in ``.name``) and bare
            # path strings, instead of assuming ``.name`` always exists.
            file_path = getattr(file, "name", file)
            if not str(file_path).lower().endswith('.docx'):
                continue

            text = extract_text_from_docx(file_path)

            # Process the text and update FAISS index
            sentences = [
                preprocess_text(sentence)
                for sentence in text.split("\n")
                if sentence.strip()
            ]
            if not sentences:
                # Empty or unreadable document: embedding an empty list gives
                # an array index.add() rejects, which used to abort the whole
                # batch before anything was saved. Skip it instead.
                print(f"No text found in {file_path}; skipping")
                continue

            # FAISS expects float32 input; make that explicit.
            embeddings = np.asarray(
                embedding_model.encode(sentences), dtype=np.float32
            )
            index.add(embeddings)
            document_texts.extend(sentences)  # Store sentences for retrieval

        # Save the updated index and documents
        with open(index_path, "wb") as f:
            pickle.dump(index, f)
        print("Saved updated FAISS index to faiss_index.pkl")
        with open(document_texts_path, "wb") as f:
            pickle.dump(document_texts, f)
        print("Saved updated document texts to document_texts.pkl")

        return "Files processed successfully"
    except Exception as e:
        print(f"Error processing files: {e}")
        return f"Error processing files: {e}"

def query_text(text):
    """Answer a question using the most similar indexed sentences as context.

    The query is embedded, the five nearest entries are looked up in the
    FAISS index, and the matching sentences are placed in a prompt that is
    sent to the LLM.

    Args:
        text: The user's question.

    Returns:
        The LLM's response, or a string starting with
        ``"Error querying text: "`` if any step raised an exception.
    """
    try:
        # Embed the question and fetch its nearest neighbours.
        encoded_query = embedding_model.encode([text])
        _distances, neighbour_ids = index.search(np.array(encoded_query), k=5)

        # Keep only hits that map to a stored sentence; FAISS pads missing
        # results with -1.
        known = len(document_texts)
        retrieved = [
            document_texts[position]
            for position in neighbour_ids[0]
            if position != -1 and position < known
        ]

        # Assemble the prompt and hand it to the LLM.
        context = "\n".join(retrieved)
        prompt = f"Context:\n{context}\n\nQuestion:\n{text}\n\nAnswer:\n"
        return llm(prompt)
    except Exception as err:
        print(f"Error querying text: {err}")
        return f"Error querying text: {err}"

# Sample Gradio integration (for illustration)
import gradio as gr

def main():
    """Build and launch the Gradio UI.

    ``gr.Interface`` wraps a single callable, and ``upload_files`` and
    ``query_text`` take different inputs (a list of files vs. a text
    question). Each therefore gets its own Interface, and the two are shown
    as tabs of one app. This call blocks while the server is running.
    """
    description = "Upload DOCX files to build an index, then query for answers based on uploaded documents."

    upload_tab = gr.Interface(
        fn=upload_files,
        inputs=gr.File(file_count="multiple", label="DOCX files"),
        outputs="text",
        description=description,
    )
    query_tab = gr.Interface(
        fn=query_text,
        inputs="text",
        outputs="text",
        description=description,
    )

    gr.TabbedInterface(
        [upload_tab, query_tab],
        tab_names=["Upload", "Query"],
        title="Document Upload and Query System",
    ).launch()

# Start the UI only when this file is run as a script, not when it is imported.
if __name__ == "__main__":
    main()