NaimaAqeel committed on
Commit
84f3457
·
verified ·
1 Parent(s): 9a4226e

Update app.py

Browse files
Files changed (1) hide show
  1. app.py +47 -56
app.py CHANGED
@@ -9,27 +9,10 @@ import pickle
9
  from langchain_community.llms import HuggingFaceEndpoint
10
  from langchain_community.vectorstores import FAISS
11
  from langchain_community.embeddings import HuggingFaceEmbeddings
12
- from fastapi import FastAPI, UploadFile, File
13
- from typing import List
14
 
15
- load_dotenv() # Load environment variables from .env
16
-
17
- app = FastAPI()
18
-
19
- # Function to extract text from a PDF file
20
- def extract_text_from_pdf(pdf_path):
21
- text = ""
22
- doc = fitz.open(pdf_path)
23
- for page_num in range(len(doc)):
24
- page = doc.load_page(page_num)
25
- text += page.get_text()
26
- return text
27
-
28
- # Function to extract text from a Word document
29
- def extract_text_from_docx(docx_path):
30
- doc = Document(docx_path)
31
- text = "\n".join([para.text for para in doc.paragraphs])
32
- return text
33
 
34
  # Initialize the embedding model
35
  embedding_model = SentenceTransformer('all-MiniLM-L6-v2')
@@ -38,7 +21,6 @@ embedding_model = SentenceTransformer('all-MiniLM-L6-v2')
38
  api_token = os.getenv('HUGGINGFACEHUB_API_TOKEN')
39
  if not api_token:
40
  raise ValueError("HUGGINGFACEHUB_API_TOKEN environment variable is not set")
41
-
42
  print(f"API Token: {api_token[:5]}...")
43
 
44
  # Initialize the HuggingFace LLM
@@ -47,7 +29,7 @@ llm = HuggingFaceEndpoint(
47
  model_kwargs={"api_key": api_token}
48
  )
49
 
50
- # Initialize the HuggingFace embeddings
51
  embedding = HuggingFaceEmbeddings()
52
 
53
  # Load or create FAISS index
@@ -61,47 +43,56 @@ else:
61
  with open(index_path, "wb") as f:
62
  pickle.dump(index, f)
63
 
64
- @app.post("/upload/")
65
- async def upload_file(files: List[UploadFile] = File(...)):
66
- for file in files:
67
- content = await file.read()
68
- if file.filename.endswith('.pdf'):
69
- with open("temp.pdf", "wb") as f:
70
- f.write(content)
71
- text = extract_text_from_pdf("temp.pdf")
72
- elif file.filename.endswith('.docx'):
73
- with open("temp.docx", "wb") as f:
74
- f.write(content)
75
- text = extract_text_from_docx("temp.docx")
76
- else:
77
- return {"error": "Unsupported file format"}
78
-
79
- # Process the text and update FAISS index
80
- sentences = text.split("\n")
81
- embeddings = embedding_model.encode(sentences)
82
- index.add(np.array(embeddings))
83
-
84
- # Save the updated index
85
- with open(index_path, "wb") as f:
86
- pickle.dump(index, f)
87
 
88
- return {"message": "Files processed successfully"}
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
89
 
90
- @app.post("/query/")
91
- async def query(text: str):
92
- # Encode the query text
93
- query_embedding = embedding_model.encode([text])
94
-
95
  # Search the FAISS index
 
96
  D, I = index.search(np.array(query_embedding), k=5)
97
-
98
  top_documents = []
99
  for idx in I[0]:
100
  if idx != -1: # Ensure that a valid index is found
101
  top_documents.append(f"Document {idx}")
102
 
103
- return {"top_documents": top_documents}
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
104
 
105
- if __name__ == "__main__":
106
- import uvicorn
107
- uvicorn.run(app, host="0.0.0.0", port=8001)
 
9
  from langchain_community.llms import HuggingFaceEndpoint
10
  from langchain_community.vectorstores import FAISS
11
  from langchain_community.embeddings import HuggingFaceEmbeddings
12
+ import gradio as gr
 
13
 
14
+ # Load environment variables from .env
15
+ load_dotenv()
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
16
 
17
  # Initialize the embedding model
18
  embedding_model = SentenceTransformer('all-MiniLM-L6-v2')
 
21
  api_token = os.getenv('HUGGINGFACEHUB_API_TOKEN')
22
  if not api_token:
23
  raise ValueError("HUGGINGFACEHUB_API_TOKEN environment variable is not set")
 
24
  print(f"API Token: {api_token[:5]}...")
25
 
26
  # Initialize the HuggingFace LLM
 
29
  model_kwargs={"api_key": api_token}
30
  )
31
 
32
+ # Initialize the HuggingFace embedding
33
  embedding = HuggingFaceEmbeddings()
34
 
35
  # Load or create FAISS index
 
43
  with open(index_path, "wb") as f:
44
  pickle.dump(index, f)
45
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
46
 
47
# Function to extract text from a PDF file
def extract_text_from_pdf(pdf_path):
    """Return the concatenated text of every page of the PDF at *pdf_path*."""
    text = ""
    # Context manager guarantees the document handle is closed even if a
    # page raises — the original left the file open (resource leak).
    with fitz.open(pdf_path) as doc:
        # A fitz Document is directly iterable over its pages; no need to
        # index via load_page(page_num).
        for page in doc:
            text += page.get_text()
    return text
55
+
56
+
57
# Function to extract text from a Word document
def extract_text_from_docx(docx_path):
    """Return the text of all paragraphs in *docx_path*, newline-separated."""
    paragraphs = Document(docx_path).paragraphs
    return "\n".join(paragraph.text for paragraph in paragraphs)
62
+
63
+
64
def process_and_query(text):
    """Index *text* into the FAISS store, retrieve the nearest documents,
    and generate an LLM response.

    Returns a ``(top_documents, response)`` tuple so that each of the two
    Gradio output components ("list" and "text") receives one value; the
    original returned a single dict, which Gradio cannot split across
    multiple outputs.
    """
    # Split the input on newlines and add the sentence embeddings to the
    # index (mirrors the previous upload endpoint's behavior).
    # NOTE(review): the query text itself is indexed *before* searching, so
    # the nearest hit is usually the text just added — confirm intended.
    sentences = text.split("\n")
    embeddings = embedding_model.encode(sentences)
    index.add(np.array(embeddings))

    # Search the FAISS index for the 5 nearest neighbours of the query
    query_embedding = embedding_model.encode([text])
    D, I = index.search(np.array(query_embedding), k=5)

    top_documents = []
    for idx in I[0]:
        if idx != -1:  # Ensure that a valid index is found
            top_documents.append(f"Document {idx}")

    # Generate a response with the LLM. HuggingFaceEndpoint.invoke() returns
    # the generated string directly; the original called
    # llm.run(inputs=..., ...)["generated_text"], which raises (no such
    # method signature, and the result is not a dict). Generation
    # parameters (max_length, temperature) belong on the endpoint itself.
    response = llm.invoke(text)

    return top_documents, response
84
+
85
+
86
# Assemble the Gradio UI: a single textbox in, a document list and a
# generated-text response out.
interface = gr.Interface(
    title="Chatbot with Text Processing and Retrieval",
    description="Upload a document (PDF or Word) or enter text to process. The chatbot will retrieve relevant documents and generate a response (optional).",
    fn=process_and_query,
    inputs="textbox",
    outputs=["list", "text"],
)

# Start the web app
interface.launch()
97
+
98