NaimaAqeel committed (verified)
Commit ae2ee6c
1 Parent(s): 86e3baa

Upload create_faiss_index.py

Files changed (1)
  1. create_faiss_index.py +86 -0
create_faiss_index.py ADDED
@@ -0,0 +1,86 @@
import os
import fitz  # PyMuPDF
from docx import Document
from sentence_transformers import SentenceTransformer
import faiss
import numpy as np
import pickle

# Function to extract text from a PDF file
def extract_text_from_pdf(pdf_path):
    text = ""
    doc = fitz.open(pdf_path)
    for page_num in range(len(doc)):
        page = doc.load_page(page_num)
        text += page.get_text()
    return text

# Function to extract text from a Word document
def extract_text_from_docx(docx_path):
    doc = Document(docx_path)
    text = "\n".join([para.text for para in doc.paragraphs])
    return text

# Initialize the embedding model
embedding_model = SentenceTransformer('all-MiniLM-L6-v2')

# Path to the document (can be either a single file or a directory)
docs_path = "C:\\Users\\MOD\\chatbot\\Should companies implement a four.docx"

documents = []
doc_texts = []

if os.path.isdir(docs_path):
    # Iterate through all files in the directory
    for filename in os.listdir(docs_path):
        file_path = os.path.join(docs_path, filename)
        if filename.endswith(".pdf"):
            text = extract_text_from_pdf(file_path)
            documents.append(filename)
            doc_texts.append(text)
        elif filename.endswith(".docx"):
            text = extract_text_from_docx(file_path)
            documents.append(filename)
            doc_texts.append(text)
elif os.path.isfile(docs_path):
    # Process a single file
    if docs_path.endswith(".pdf"):
        text = extract_text_from_pdf(docs_path)
        documents.append(os.path.basename(docs_path))
        doc_texts.append(text)
    elif docs_path.endswith(".docx"):
        text = extract_text_from_docx(docs_path)
        documents.append(os.path.basename(docs_path))
        doc_texts.append(text)
else:
    print("Invalid path specified. Please provide a valid file or directory path.")

# Generate embeddings for the document texts
embeddings = embedding_model.encode(doc_texts)

# Create a FAISS index
d = embeddings.shape[1]  # Dimension of the embeddings
index = faiss.IndexFlatL2(d)  # L2 distance metric
index.add(np.array(embeddings))  # Add embeddings to the index

# Save the FAISS index and metadata
index_path = "faiss_index"
if not os.path.exists(index_path):
    os.makedirs(index_path)

faiss.write_index(index, os.path.join(index_path, "index.faiss"))

# Save the document metadata to a file for retrieval purposes
with open(os.path.join(index_path, "documents.txt"), "w") as f:
    for doc in documents:
        f.write("%s\n" % doc)

# Save additional metadata
metadata = {
    "documents": documents,
    "embeddings": embeddings
}
with open(os.path.join(index_path, "index.pkl"), "wb") as f:
    pickle.dump(metadata, f)

print("FAISS index and documents saved.")