import os
from dotenv import load_dotenv
import fitz  # PyMuPDF
from docx import Document
from sentence_transformers import SentenceTransformer
import faiss
import numpy as np
import pickle
from langchain_community.llms import HuggingFaceEndpoint  # Deprecated here in newer LangChain; langchain_huggingface ships the maintained version
from langchain_community.vectorstores import FAISS
from langchain_community.embeddings import HuggingFaceEmbeddings
import gradio as gr

# Load environment variables from .env
load_dotenv()

# Initialize the embedding model
embedding_model = SentenceTransformer('all-MiniLM-L6-v2')

# Hugging Face API token
api_token = os.getenv('HUGGINGFACEHUB_API_TOKEN')
if not api_token:
    raise ValueError("HUGGINGFACEHUB_API_TOKEN environment variable is not set")
print(f"API Token: {api_token[:5]}...")

# Initialize the HuggingFace LLM (optional; comment out if not used)
llm = HuggingFaceEndpoint(
    endpoint_url="https://api-inference.huggingface.co/models/gpt2",
    huggingfacehub_api_token=api_token,  # the token is a named parameter, not a model kwarg
)
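
# Illustrative only: through LangChain's standard Runnable interface, the
# endpoint above would be called like this (prompt text is a made-up example):
#   completion = llm.invoke("Answer based on this context: ...")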

# Initialize the LangChain embedding wrapper, pinned to the same model as
# above so both paths produce vectors of the same dimensionality
embedding = HuggingFaceEmbeddings(model_name='all-MiniLM-L6-v2')

# Load or create the FAISS index. faiss.write_index/read_index is the
# library's own serialization; pickling the SWIG-wrapped index object is
# fragile and fails on many FAISS builds.
index_path = "faiss_index.bin"
if os.path.exists(index_path):
    index = faiss.read_index(index_path)
else:
    # Create a new flat L2 index sized to the embedding model's output
    index = faiss.IndexFlatL2(embedding_model.get_sentence_embedding_dimension())
    faiss.write_index(index, index_path)
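

# A minimal sketch of how extracted document text could be chunked, embedded,
# and added to the index above. The fixed-size character chunking and the
# index_document_text name are illustrative choices, not part of the original
# pipeline.
def index_document_text(text, chunk_size=500):
    # Split the raw text into fixed-size character chunks
    chunks = [text[i:i + chunk_size] for i in range(0, len(text), chunk_size)]
    # Encode all chunks in one batch; FAISS expects float32 row vectors
    vectors = np.asarray(embedding_model.encode(chunks), dtype="float32")
    index.add(vectors)
    # Persist the updated index next to the script
    faiss.write_index(index, index_path)
    return chunks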


# Function to extract text from a PDF file
def extract_text_from_pdf(pdf_path):
    text = ""
    # Context manager ensures the PyMuPDF document handle is closed
    with fitz.open(pdf_path) as doc:
        for page in doc:
            text += page.get_text()
    return text


# Function to extract text from a Word document
def extract_text_from_docx(docx_path):
    doc = Document(docx_path)
    text = "\n".join([para.text for para in doc.paragraphs])
    return text
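

# A minimal sketch of the retrieval step that process_and_query below leaves
# as a placeholder. It assumes chunks were added to `index` in the same order
# as the `chunks` list (e.g. via index_document_text above); nothing here
# wires that in, so treat this as illustrative.
def search_chunks(question, chunks, k=3):
    # Embed the question with the same model used for the document chunks
    query_vec = np.asarray(embedding_model.encode([question]), dtype="float32")
    # FAISS returns distances and positional indices of the k nearest chunks
    distances, indices = index.search(query_vec, k)
    # Map positions back to text, skipping the -1 padding FAISS emits when
    # the index holds fewer than k vectors
    return [chunks[i] for i in indices[0] if i != -1]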


def process_and_query(state, text, file=None):
    # Initialize state on first run
    if state is None:
        state = {"processed_text": None, "conversation": []}

    # Check if a file is uploaded. This assumes a gr.File(type="filepath")
    # input, where Gradio passes the uploaded file's path as a string, so it
    # can be read in place instead of being copied to a temp file first.
    if file:
        if file.endswith('.pdf'):
            state["processed_text"] = extract_text_from_pdf(file)
        elif file.endswith('.docx'):
            state["processed_text"] = extract_text_from_docx(file)
        else:
            return {"error": "Unsupported file format"}

    # Handle user question
    if state["processed_text"] and text:
        # Process the question and potentially use LLM for answering (optional)
        question_embedding = embedding_model.encode([text])
        # ... (logic to search the index and potentially use LLM for answering)
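        # (see the search_chunks sketch above for one possible shape of this step)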
        answer = "Answer retrieved from the document based on your question."  # Placeholder answer

        # Update conversation history
        state["conversation"].append({"question": text,