Ajeet001 committed on
Commit
06b3af5
·
verified ·
1 Parent(s): 98ecc34

Upload 5 files

Browse files
src/__init__.py ADDED
File without changes
src/chunking_embedding.py ADDED
@@ -0,0 +1,30 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ from langchain_huggingface import HuggingFaceEmbeddings
2
+ from langchain.text_splitter import RecursiveCharacterTextSplitter, CharacterTextSplitter, TokenTextSplitter
3
+
4
def setup_chunking_and_embedding(documents, chunking_strategy, chunk_size, chunk_overlap, embedding_model):
    """Split documents into chunks and build the matching embedding function.

    Args:
        documents: Sequence of LangChain documents to split.
        chunking_strategy: One of 'Recursive', 'Character', or 'Token'.
        chunk_size: Maximum size of each chunk, passed to the splitter.
        chunk_overlap: Overlap between consecutive chunks.
        embedding_model: Name of the embedding backend; only 'HuggingFace'
            is currently supported.

    Returns:
        Tuple of (doc_chunks, embeddings).

    Raises:
        ValueError: If the embedding model or chunking strategy is
            unsupported, or if splitting produced no chunks.
    """
    # Map names to *factories* (classes), not instances: constructing
    # HuggingFaceEmbeddings loads a model and is expensive, so defer it
    # until all validation has passed.
    embedding_factories = {
        'HuggingFace': HuggingFaceEmbeddings,
    }

    embedding_factory = embedding_factories.get(embedding_model)
    if not embedding_factory:
        raise ValueError("Unsupported embedding model.")

    chunking_strategies = {
        'Recursive': RecursiveCharacterTextSplitter,
        'Character': CharacterTextSplitter,
        'Token': TokenTextSplitter
    }

    text_splitter = chunking_strategies.get(chunking_strategy)
    if not text_splitter:
        raise ValueError("Unsupported chunking strategy.")

    splitter_instance = text_splitter(chunk_size=chunk_size, chunk_overlap=chunk_overlap)
    doc_chunks = splitter_instance.split_documents(documents)

    if not doc_chunks:
        raise ValueError("No document chunks created.")

    # Instantiate the embedding model only after we know the request is valid.
    embeddings = embedding_factory()
    return doc_chunks, embeddings
src/document_loader.py ADDED
@@ -0,0 +1,26 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ import streamlit as st
2
+ from langchain_community.document_loaders import (
3
+ PyPDFLoader,
4
+ PyMuPDFLoader,
5
+ PDFPlumberLoader,
6
+ PDFMinerLoader,
7
+ UnstructuredWordDocumentLoader,
8
+ )
9
+
10
def load_document(file_path, loader_type):
    """Load a document from *file_path* with the chosen loader backend.

    Args:
        file_path: Path to the file to load.
        loader_type: One of 'PyPDF', 'PyMuPDF', 'PDFPlumber', 'PDFMiner',
            or 'Unstructured Word'.

    Returns:
        The list of documents produced by the loader.

    Raises:
        ValueError: If *loader_type* is not a supported backend.
    """
    loader_classes = {
        'PyPDF': PyPDFLoader,
        'PyMuPDF': PyMuPDFLoader,
        'PDFPlumber': PDFPlumberLoader,
        'PDFMiner': PDFMinerLoader,
        'Unstructured Word': UnstructuredWordDocumentLoader
    }

    # Guard clause: reject unknown loader names before touching the file.
    loader_cls = loader_classes.get(loader_type)
    if loader_cls is None:
        raise ValueError("Unsupported loader type.")

    documents = loader_cls(file_path).load()
    # Surface the result count in the Streamlit UI for the user.
    st.success(f"Number of documents loaded: {len(documents)}")
    return documents
src/utils.py ADDED
@@ -0,0 +1,21 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ import json
2
+ import os
3
+
4
def log_interaction(question, response, params):
    """Append one interaction record to ``interaction_log.json``.

    The log file holds a JSON array of entries; each entry records the
    question, the response, and the parameters used. A corrupted or
    unreadable log file is discarded and a fresh list is started rather
    than crashing the caller.

    Args:
        question: The user's question.
        response: The generated answer.
        params: Dict of parameters used to produce the response.
    """
    log_entry = {
        "question": question,
        "response": response,
        "params": params
    }
    log_file_path = "interaction_log.json"

    logs = []
    if os.path.exists(log_file_path):
        try:
            with open(log_file_path, "r", encoding="utf-8") as log_file:
                existing = json.load(log_file)
        except (json.JSONDecodeError, OSError):
            # Corrupted or unreadable log: start over instead of raising,
            # so a bad log file never breaks the interaction itself.
            existing = []
        # Only reuse the prior content if it is the expected list shape.
        if isinstance(existing, list):
            logs = existing

    logs.append(log_entry)

    with open(log_file_path, "w", encoding="utf-8") as log_file:
        json.dump(logs, log_file, indent=4)
src/vector_store.py ADDED
@@ -0,0 +1,6 @@
 
 
 
 
 
 
 
1
+ from langchain_community.vectorstores import FAISS
2
+
3
def create_vectorstore(doc_chunks, embeddings):
    """Build and return a FAISS vector store indexing *doc_chunks*.

    Args:
        doc_chunks: Document chunks to index.
        embeddings: Embedding function used to vectorize the chunks.

    Returns:
        The populated FAISS vector store.
    """
    return FAISS.from_documents(doc_chunks, embeddings)