NaimaAqeel committed on
Commit
e21b098
·
verified ·
1 Parent(s): c2a1366

Delete create_faiss_index.py

Browse files
Files changed (1) hide show
  1. create_faiss_index.py +0 -86
create_faiss_index.py DELETED
@@ -1,86 +0,0 @@
1
- import os
2
- import fitz # PyMuPDF
3
- from docx import Document
4
- from sentence_transformers import SentenceTransformer
5
- import faiss
6
- import numpy as np
7
- import pickle
8
-
9
- # Function to extract text from a PDF file
10
def extract_text_from_pdf(pdf_path):
    """Return the text of every page of a PDF, concatenated in page order.

    Args:
        pdf_path: Path of the PDF file to read.

    Returns:
        A single string holding the text of all pages, with nothing
        inserted between pages.
    """
    # Open the document in a ``with`` block so the file handle is released
    # even if text extraction fails part-way through.
    with fitz.open(pdf_path) as doc:
        # Pages are consumed inside the ``with`` block, while the document
        # is still open.
        return "".join(page.get_text() for page in doc)
17
-
18
- # Function to extract text from a Word document
19
def extract_text_from_docx(docx_path):
    """Return the paragraph text of a Word document, one paragraph per line.

    Args:
        docx_path: Path of the ``.docx`` file to read.

    Returns:
        The text of each paragraph in ``Document(docx_path).paragraphs``,
        joined with newline characters.
    """
    paragraphs = Document(docx_path).paragraphs
    return "\n".join(paragraph.text for paragraph in paragraphs)
23
-
24
# Initialize the embedding model
embedding_model = SentenceTransformer('all-MiniLM-L6-v2')

# Path to the document (can be either a single file or a directory)
docs_path = "C:\\Users\\MOD\\chatbot\\Should companies implement a four.docx"

# Maps a lower-cased file extension to the function that extracts its text.
# One table serves both the directory and the single-file case.
_EXTRACTORS = {
    ".pdf": extract_text_from_pdf,
    ".docx": extract_text_from_docx,
}

documents = []  # file names, kept parallel to doc_texts
doc_texts = []  # extracted text, one entry per file name in documents

if os.path.isdir(docs_path):
    # Sort the listing: os.listdir returns entries in arbitrary order, and
    # the order here decides which index row belongs to which document.
    candidate_paths = [
        os.path.join(docs_path, filename)
        for filename in sorted(os.listdir(docs_path))
    ]
elif os.path.isfile(docs_path):
    candidate_paths = [docs_path]
else:
    # Stop here rather than continuing with nothing to embed.
    raise SystemExit(
        "Invalid path specified. Please provide a valid file or directory path."
    )

for file_path in candidate_paths:
    # Compare extensions case-insensitively so "REPORT.PDF" is not skipped.
    extension = os.path.splitext(file_path)[1].lower()
    extractor = _EXTRACTORS.get(extension)
    if extractor is None:
        continue  # unsupported file type
    text = extractor(file_path)
    documents.append(os.path.basename(file_path))
    doc_texts.append(text)
57
-
58
# Generate embeddings for the document texts
if not doc_texts:
    # With no texts there is no embedding dimension to build the index from,
    # so stop with a clear message instead of failing further down.
    raise SystemExit("No supported documents were found; nothing to index.")

# FAISS works on contiguous float32 matrices; normalise once here so the
# same array can be added to the index and stored alongside it.
embeddings = np.ascontiguousarray(
    embedding_model.encode(doc_texts), dtype=np.float32
)

# Create a FAISS index
d = embeddings.shape[1]  # Dimension of the embeddings
index = faiss.IndexFlatL2(d)  # L2 distance metric
index.add(embeddings)  # Add embeddings to the index
65
-
66
# Save the FAISS index and metadata
index_path = "faiss_index"
# exist_ok avoids the check-then-create race of testing os.path.exists first.
os.makedirs(index_path, exist_ok=True)

faiss.write_index(index, os.path.join(index_path, "index.faiss"))

# Save the document metadata to a file for retrieval purposes.
# An explicit encoding keeps non-ASCII file names writable regardless of the
# platform's default code page.
with open(
    os.path.join(index_path, "documents.txt"), "w", encoding="utf-8"
) as f:
    f.writelines("%s\n" % name for name in documents)

# Save additional metadata: the document names together with their embeddings
metadata = {
    "documents": documents,
    "embeddings": embeddings,
}
with open(os.path.join(index_path, "index.pkl"), "wb") as f:
    pickle.dump(metadata, f)

print("FAISS index and documents saved.")