Spaces:
Sleeping
Sleeping
Upload 5 files
Browse files- src/__init__.py +0 -0
- src/chunking_embedding.py +30 -0
- src/document_loader.py +26 -0
- src/utils.py +21 -0
- src/vector_store.py +6 -0
src/__init__.py
ADDED
File without changes
|
src/chunking_embedding.py
ADDED
@@ -0,0 +1,30 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
1 |
+
from langchain_huggingface import HuggingFaceEmbeddings
|
2 |
+
from langchain.text_splitter import RecursiveCharacterTextSplitter, CharacterTextSplitter, TokenTextSplitter
|
3 |
+
|
4 |
+
def setup_chunking_and_embedding(documents, chunking_strategy, chunk_size, chunk_overlap, embedding_model):
    """Split documents into chunks and build the requested embedding model.

    Parameters
    ----------
    documents : list
        LangChain documents to split.
    chunking_strategy : str
        One of 'Recursive', 'Character' or 'Token'.
    chunk_size : int
        Maximum size of each chunk (characters or tokens, per splitter).
    chunk_overlap : int
        Overlap between consecutive chunks.
    embedding_model : str
        Name of the embedding backend; currently only 'HuggingFace'.

    Returns
    -------
    tuple
        (doc_chunks, embeddings) — the split documents and the embedding
        model instance.

    Raises
    ------
    ValueError
        If the embedding model or chunking strategy is unsupported, or if
        splitting produced no chunks.
    """
    # Map names to factories (classes), NOT instances: constructing
    # HuggingFaceEmbeddings downloads model weights, so defer creation
    # until we know the requested model is actually supported.
    embedding_factories = {
        'HuggingFace': HuggingFaceEmbeddings
    }

    embedding_factory = embedding_factories.get(embedding_model)
    if not embedding_factory:
        raise ValueError("Unsupported embedding model.")
    embeddings = embedding_factory()

    chunking_strategies = {
        'Recursive': RecursiveCharacterTextSplitter,
        'Character': CharacterTextSplitter,
        'Token': TokenTextSplitter
    }

    text_splitter = chunking_strategies.get(chunking_strategy)
    if not text_splitter:
        raise ValueError("Unsupported chunking strategy.")

    splitter_instance = text_splitter(chunk_size=chunk_size, chunk_overlap=chunk_overlap)
    doc_chunks = splitter_instance.split_documents(documents)

    if not doc_chunks:
        raise ValueError("No document chunks created.")

    return doc_chunks, embeddings
|
src/document_loader.py
ADDED
@@ -0,0 +1,26 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
1 |
+
import streamlit as st
|
2 |
+
from langchain_community.document_loaders import (
|
3 |
+
PyPDFLoader,
|
4 |
+
PyMuPDFLoader,
|
5 |
+
PDFPlumberLoader,
|
6 |
+
PDFMinerLoader,
|
7 |
+
UnstructuredWordDocumentLoader,
|
8 |
+
)
|
9 |
+
|
10 |
+
def load_document(file_path, loader_type):
    """Load a document from *file_path* with the loader named by *loader_type*.

    Supported loader types are 'PyPDF', 'PyMuPDF', 'PDFPlumber', 'PDFMiner'
    and 'Unstructured Word'; any other value raises ValueError. Reports the
    number of loaded documents via a Streamlit success message and returns
    the list of documents.
    """
    loader_registry = {
        'PyPDF': PyPDFLoader,
        'PyMuPDF': PyMuPDFLoader,
        'PDFPlumber': PDFPlumberLoader,
        'PDFMiner': PDFMinerLoader,
        'Unstructured Word': UnstructuredWordDocumentLoader
    }

    loader_cls = loader_registry.get(loader_type)
    if loader_cls is None:
        raise ValueError("Unsupported loader type.")

    docs = loader_cls(file_path).load()
    st.success(f"Number of documents loaded: {len(docs)}")
    return docs
|
src/utils.py
ADDED
@@ -0,0 +1,21 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
1 |
+
import json
|
2 |
+
import os
|
3 |
+
|
4 |
+
def log_interaction(question, response, params):
    """Append one question/response interaction to the JSON log file.

    Parameters
    ----------
    question : str
        The user's question.
    response : str
        The generated response.
    params : dict
        The pipeline parameters used to produce the response.

    Side effects: reads and rewrites ``interaction_log.json`` in the
    current working directory.
    """
    log_entry = {
        "question": question,
        "response": response,
        "params": params
    }
    log_file_path = "interaction_log.json"

    logs = []
    if os.path.exists(log_file_path):
        # An empty, corrupted, or unreadable log file must not crash the
        # app: fall back to a fresh log instead of propagating
        # JSONDecodeError / OSError. Also ignore a payload that is not a
        # list, since we can only append entries to a JSON array.
        try:
            with open(log_file_path, "r", encoding="utf-8") as log_file:
                loaded = json.load(log_file)
            if isinstance(loaded, list):
                logs = loaded
        except (json.JSONDecodeError, OSError):
            pass

    logs.append(log_entry)

    with open(log_file_path, "w", encoding="utf-8") as log_file:
        json.dump(logs, log_file, indent=4)
|
src/vector_store.py
ADDED
@@ -0,0 +1,6 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
1 |
+
from langchain_community.vectorstores import FAISS
|
2 |
+
|
3 |
+
def create_vectorstore(doc_chunks, embeddings):
    """Build and return a FAISS vector store indexing *doc_chunks* with *embeddings*."""
    return FAISS.from_documents(doc_chunks, embeddings)
|