NaimaAqeel committed on
Commit
2c02a9e
·
verified ·
1 Parent(s): 0385c04

Update app.py

Browse files
Files changed (1) hide show
  1. app.py +74 -47
app.py CHANGED
@@ -1,18 +1,33 @@
1
  import os
2
- from dotenv import load_dotenv
3
- import fitz # PyMuPDF
4
  from docx import Document
5
  from sentence_transformers import SentenceTransformer
6
  import faiss
7
  import numpy as np
8
  import pickle
9
- from langchain_community.llms import HuggingFaceEndpoint # Might need update
10
  from langchain_community.vectorstores import FAISS
11
  from langchain_community.embeddings import HuggingFaceEmbeddings
12
  import gradio as gr
 
13
 
14
- # Load environment variables from .env
15
- load_dotenv()
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
16
 
17
  # Initialize the embedding model
18
  embedding_model = SentenceTransformer('all-MiniLM-L6-v2')
@@ -21,15 +36,16 @@ embedding_model = SentenceTransformer('all-MiniLM-L6-v2')
21
  api_token = os.getenv('HUGGINGFACEHUB_API_TOKEN')
22
  if not api_token:
23
  raise ValueError("HUGGINGFACEHUB_API_TOKEN environment variable is not set")
 
24
  print(f"API Token: {api_token[:5]}...")
25
 
26
- # Initialize the HuggingFace LLM (Optional, comment out if not used)
27
  llm = HuggingFaceEndpoint(
28
  endpoint_url="https://api-inference.huggingface.co/models/gpt2",
29
  model_kwargs={"api_key": api_token}
30
  )
31
 
32
- # Initialize the HuggingFace embedding
33
  embedding = HuggingFaceEmbeddings()
34
 
35
  # Load or create FAISS index
@@ -43,56 +59,67 @@ else:
43
  with open(index_path, "wb") as f:
44
  pickle.dump(index, f)
45
 
46
-
47
- # Function to extract text from a PDF file
48
- def extract_text_from_pdf(pdf_path):
49
- text = ""
50
- doc = fitz.open(pdf_path)
51
- for page_num in range(len(doc)):
52
- page = doc.load_page(page_num)
53
- text += page.get_text()
54
- return text
55
-
56
-
57
- # Function to extract text from a Word document
58
- def extract_text_from_docx(docx_path):
59
- doc = Document(docx_path)
60
- text = "\n".join([para.text for para in doc.paragraphs])
61
- return text
62
-
63
- def process_and_query(state, text, file=None):
64
- # Initialize state on first run
65
- if state is None:
66
- state = {"processed_text": None, "conversation": []}
67
-
68
- # Check if a file is uploaded
69
- if file:
70
- # Get the uploaded file content
71
  content = file.read()
72
- if file.filename.endswith('.pdf'):
73
  with open("temp.pdf", "wb") as f:
74
  f.write(content)
75
- state["processed_text"] = extract_text_from_pdf("temp.pdf")
76
- elif file.filename.endswith('.docx'):
77
  with open("temp.docx", "wb") as f:
78
  f.write(content)
79
- state["processed_text"] = extract_text_from_docx("temp.docx")
80
  else:
81
  return {"error": "Unsupported file format"}
82
 
83
- # Handle user question
84
- if state["processed_text"] and text:
85
- # Process the question and potentially use LLM for answering (optional)
86
- question_embedding = embedding_model.encode([text])
87
- # ... (logic to search the index and potentially use LLM for answering)
88
- answer = "Answer retrieved from the document based on your question." # Placeholder answer
 
 
89
 
90
- # Update conversation history
91
- state["conversation"].append({"question": text,
92
- "answer": answer})
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
93
 
94
- # Return the conversation history and potentially answer
95
- return {"conversation": state["conversation"]}
96
 
97
 
98
 
 
1
  import os
2
+ import fitz
 
3
  from docx import Document
4
  from sentence_transformers import SentenceTransformer
5
  import faiss
6
  import numpy as np
7
  import pickle
8
+ from langchain_community.llms import HuggingFaceEndpoint
9
  from langchain_community.vectorstores import FAISS
10
  from langchain_community.embeddings import HuggingFaceEmbeddings
11
  import gradio as gr
12
+ from fastapi import FastAPI
13
 
14
# Initialize FastAPI
# NOTE(review): no routes are ever registered on this app in the code
# visible here, and the UI is served by Gradio via demo.launch() below —
# confirm whether this FastAPI instance is still needed.
app = FastAPI()
17
# Function to extract text from a PDF file
def extract_text_from_pdf(pdf_path):
    """Return the concatenated text of all pages in *pdf_path*.

    Args:
        pdf_path: Filesystem path to a PDF readable by PyMuPDF (fitz).

    Returns:
        A single string with each page's text appended in page order.
    """
    text = ""
    doc = fitz.open(pdf_path)
    try:
        for page_num in range(len(doc)):
            page = doc.load_page(page_num)
            text += page.get_text()
    finally:
        # Bug fix: the document handle was never closed, leaking the
        # underlying file resource on every upload.
        doc.close()
    return text
25
+
26
# Function to extract text from a Word (.docx) document
def extract_text_from_docx(docx_path):
    """Return the document's paragraph texts joined with newlines."""
    paragraphs = Document(docx_path).paragraphs
    return "\n".join(para.text for para in paragraphs)
31
 
32
# Sentence-transformer used to embed both documents and queries.
embedding_model = SentenceTransformer('all-MiniLM-L6-v2')

# The HF Inference API token must be supplied via the environment.
api_token = os.getenv('HUGGINGFACEHUB_API_TOKEN')
if not api_token:
    raise ValueError("HUGGINGFACEHUB_API_TOKEN environment variable is not set")

print(f"API Token: {api_token[:5]}...")

# LangChain embedding wrapper, kept alongside the raw model above.
embedding = HuggingFaceEmbeddings()

# LLM endpoint (gpt2 via the HF Inference API).
# NOTE(review): recent langchain versions pass the token as
# `huggingfacehub_api_token=` — confirm `model_kwargs["api_key"]` is honoured.
llm = HuggingFaceEndpoint(
    endpoint_url="https://api-inference.huggingface.co/models/gpt2",
    model_kwargs={"api_key": api_token},
)
 
51
  # Load or create FAISS index
 
59
  with open(index_path, "wb") as f:
60
  pickle.dump(index, f)
61
 
62
def upload_files(files):
    """Ingest uploaded PDF/DOCX files into the global FAISS index.

    Args:
        files: Iterable of uploaded file objects (Gradio ``gr.File``
            values); each must expose ``read()`` and a ``name`` attribute.
            NOTE(review): depending on the Gradio version, ``gr.File``
            may hand back plain temp-file paths instead of file objects —
            confirm ``read()`` exists on these values.

    Returns:
        A success message string, or ``{"error": ...}`` for an
        unsupported extension (dict return kept for backward
        compatibility with the original code).
    """
    for file in files:
        content = file.read()
        # Route by extension through a temp file the extractors can open.
        if file.name.endswith('.pdf'):
            with open("temp.pdf", "wb") as f:
                f.write(content)
            text = extract_text_from_pdf("temp.pdf")
        elif file.name.endswith('.docx'):
            with open("temp.docx", "wb") as f:
                f.write(content)
            text = extract_text_from_docx("temp.docx")
        else:
            return {"error": "Unsupported file format"}

        # Robustness fix: skip blank lines so empty-string embeddings
        # don't pollute the index.
        sentences = [line for line in text.split("\n") if line.strip()]
        if sentences:
            embeddings = embedding_model.encode(sentences)
            index.add(np.array(embeddings))

    # Fix: persist the index once after all files are processed instead
    # of re-pickling the whole index for every file.
    with open(index_path, "wb") as f:
        pickle.dump(index, f)

    return "Files processed successfully"
86
+
87
def query_text(text):
    """Return labels of the top-5 nearest index entries for *text*."""
    # Embed the query the same way the documents were embedded.
    query_embedding = embedding_model.encode([text])

    # Nearest-neighbour lookup in the shared FAISS index.
    distances, neighbour_ids = index.search(np.array(query_embedding), k=5)

    # FAISS pads the result with -1 when fewer than k vectors exist.
    return [f"Document {idx}" for idx in neighbour_ids[0] if idx != -1]
100
+
101
# Create the Gradio interface: one tab to ingest files, one to query them.
with gr.Blocks() as demo:
    gr.Markdown("## Document Upload and Query System")

    with gr.Tab("Upload Files"):
        upload = gr.File(file_count="multiple", label="Upload PDF or DOCX files")
        upload_button = gr.Button("Upload")
        upload_output = gr.Textbox()
        upload_button.click(fn=upload_files, inputs=upload, outputs=upload_output)

    with gr.Tab("Query"):
        query = gr.Textbox(label="Enter your query")
        query_button = gr.Button("Search")
        query_output = gr.Textbox()
        query_button.click(fn=query_text, inputs=query, outputs=query_output)

# Bug fix: demo.launch() previously ran unconditionally at import time and
# blocked, so the `uvicorn.run(app, ...)` inside the __main__ guard was dead
# code serving a route-less FastAPI app; it has been removed and the launch
# moved under the guard so importing this module doesn't start a server.
if __name__ == "__main__":
    demo.launch()
122
 
 
 
123
 
124
 
125