NaimaAqeel committed on
Commit
409f81b
·
verified ·
1 Parent(s): c52cd8e

Update app.py

Browse files
Files changed (1) hide show
  1. app.py +104 -103
app.py CHANGED
@@ -1,104 +1,105 @@
1
- import os
2
- import fitz # PyMuPDF
3
- from docx import Document
4
- from sentence_transformers import SentenceTransformer
5
- import faiss
6
- import numpy as np
7
- import pickle
8
- from langchain_community.llms import HuggingFaceEndpoint
9
- from langchain_community.vectorstores import FAISS
10
- from langchain_community.embeddings import HuggingFaceEmbeddings
11
- from fastapi import FastAPI, UploadFile, File
12
- from typing import List
13
-
14
- app = FastAPI()
15
-
16
- # Function to extract text from a PDF file
17
- def extract_text_from_pdf(pdf_path):
18
- text = ""
19
- doc = fitz.open(pdf_path)
20
- for page_num in range(len(doc)):
21
- page = doc.load_page(page_num)
22
- text += page.get_text()
23
- return text
24
-
25
- # Function to extract text from a Word document
26
- def extract_text_from_docx(docx_path):
27
- doc = Document(docx_path)
28
- text = "\n".join([para.text for para in doc.paragraphs])
29
- return text
30
-
31
- # Initialize the embedding model
32
- embedding_model = SentenceTransformer('all-MiniLM-L6-v2')
33
-
34
- # Hugging Face API token
35
- api_token = os.getenv('HUGGINGFACEHUB_API_TOKEN')
36
- if not api_token:
37
- raise ValueError("HUGGINGFACEHUB_API_TOKEN environment variable is not set")
38
-
39
- print(f"API Token: {api_token[:5]}...")
40
-
41
- # Initialize the HuggingFace LLM
42
- llm = HuggingFaceEndpoint(
43
- endpoint_url="https://api-inference.huggingface.co/models/gpt2",
44
- model_kwargs={"api_key": api_token}
45
- )
46
-
47
- # Initialize the HuggingFace embeddings
48
- embedding = HuggingFaceEmbeddings()
49
-
50
- # Load or create FAISS index
51
- index_path = "faiss_index.pkl"
52
- if os.path.exists(index_path):
53
- with open(index_path, "rb") as f:
54
- index = pickle.load(f)
55
- else:
56
- # Create a new FAISS index if it doesn't exist
57
- index = faiss.IndexFlatL2(embedding_model.get_sentence_embedding_dimension())
58
- with open(index_path, "wb") as f:
59
- pickle.dump(index, f)
60
-
61
- @app.post("/upload/")
62
- async def upload_file(files: List[UploadFile] = File(...)):
63
- for file in files:
64
- content = await file.read()
65
- if file.filename.endswith('.pdf'):
66
- with open("temp.pdf", "wb") as f:
67
- f.write(content)
68
- text = extract_text_from_pdf("temp.pdf")
69
- elif file.filename.endswith('.docx'):
70
- with open("temp.docx", "wb") as f:
71
- f.write(content)
72
- text = extract_text_from_docx("temp.docx")
73
- else:
74
- return {"error": "Unsupported file format"}
75
-
76
- # Process the text and update FAISS index
77
- sentences = text.split("\n")
78
- embeddings = embedding_model.encode(sentences)
79
- index.add(np.array(embeddings))
80
-
81
- # Save the updated index
82
- with open(index_path, "wb") as f:
83
- pickle.dump(index, f)
84
-
85
- return {"message": "Files processed successfully"}
86
-
87
- @app.post("/query/")
88
- async def query(text: str):
89
- # Encode the query text
90
- query_embedding = embedding_model.encode([text])
91
-
92
- # Search the FAISS index
93
- D, I = index.search(np.array(query_embedding), k=5)
94
-
95
- top_documents = []
96
- for idx in I[0]:
97
- if idx != -1: # Ensure that a valid index is found
98
- top_documents.append(f"Document {idx}")
99
-
100
- return {"top_documents": top_documents}
101
-
102
- if __name__ == "__main__":
103
- import uvicorn
 
104
  uvicorn.run(app, host="0.0.0.0", port=8000)
 
1
+ import os
2
+ import fitz
3
+ import PyMuPDF
4
+ from docx import Document
5
+ from sentence_transformers import SentenceTransformer
6
+ import faiss
7
+ import numpy as np
8
+ import pickle
9
+ from langchain_community.llms import HuggingFaceEndpoint
10
+ from langchain_community.vectorstores import FAISS
11
+ from langchain_community.embeddings import HuggingFaceEmbeddings
12
+ from fastapi import FastAPI, UploadFile, File
13
+ from typing import List
14
+
15
+ app = FastAPI()
16
+
17
+ # Function to extract text from a PDF file
18
+ def extract_text_from_pdf(pdf_path):
19
+ text = ""
20
+ doc = fitz.open(pdf_path)
21
+ for page_num in range(len(doc)):
22
+ page = doc.load_page(page_num)
23
+ text += page.get_text()
24
+ return text
25
+
26
+ # Function to extract text from a Word document
27
+ def extract_text_from_docx(docx_path):
28
+ doc = Document(docx_path)
29
+ text = "\n".join([para.text for para in doc.paragraphs])
30
+ return text
31
+
32
+ # Initialize the embedding model
33
+ embedding_model = SentenceTransformer('all-MiniLM-L6-v2')
34
+
35
+ # Hugging Face API token
36
+ api_token = os.getenv('HUGGINGFACEHUB_API_TOKEN')
37
+ if not api_token:
38
+ raise ValueError("HUGGINGFACEHUB_API_TOKEN environment variable is not set")
39
+
40
+ print(f"API Token: {api_token[:5]}...")
41
+
42
+ # Initialize the HuggingFace LLM
43
+ llm = HuggingFaceEndpoint(
44
+ endpoint_url="https://api-inference.huggingface.co/models/gpt2",
45
+ model_kwargs={"api_key": api_token}
46
+ )
47
+
48
+ # Initialize the HuggingFace embeddings
49
+ embedding = HuggingFaceEmbeddings()
50
+
51
+ # Load or create FAISS index
52
+ index_path = "faiss_index.pkl"
53
+ if os.path.exists(index_path):
54
+ with open(index_path, "rb") as f:
55
+ index = pickle.load(f)
56
+ else:
57
+ # Create a new FAISS index if it doesn't exist
58
+ index = faiss.IndexFlatL2(embedding_model.get_sentence_embedding_dimension())
59
+ with open(index_path, "wb") as f:
60
+ pickle.dump(index, f)
61
+
62
+ @app.post("/upload/")
63
+ async def upload_file(files: List[UploadFile] = File(...)):
64
+ for file in files:
65
+ content = await file.read()
66
+ if file.filename.endswith('.pdf'):
67
+ with open("temp.pdf", "wb") as f:
68
+ f.write(content)
69
+ text = extract_text_from_pdf("temp.pdf")
70
+ elif file.filename.endswith('.docx'):
71
+ with open("temp.docx", "wb") as f:
72
+ f.write(content)
73
+ text = extract_text_from_docx("temp.docx")
74
+ else:
75
+ return {"error": "Unsupported file format"}
76
+
77
+ # Process the text and update FAISS index
78
+ sentences = text.split("\n")
79
+ embeddings = embedding_model.encode(sentences)
80
+ index.add(np.array(embeddings))
81
+
82
+ # Save the updated index
83
+ with open(index_path, "wb") as f:
84
+ pickle.dump(index, f)
85
+
86
+ return {"message": "Files processed successfully"}
87
+
88
+ @app.post("/query/")
89
+ async def query(text: str):
90
+ # Encode the query text
91
+ query_embedding = embedding_model.encode([text])
92
+
93
+ # Search the FAISS index
94
+ D, I = index.search(np.array(query_embedding), k=5)
95
+
96
+ top_documents = []
97
+ for idx in I[0]:
98
+ if idx != -1: # Ensure that a valid index is found
99
+ top_documents.append(f"Document {idx}")
100
+
101
+ return {"top_documents": top_documents}
102
+
103
+ if __name__ == "__main__":
104
+ import uvicorn
105
  uvicorn.run(app, host="0.0.0.0", port=8000)