Spaces:
Sleeping
Sleeping
Update app.py
Browse files
app.py
CHANGED
@@ -40,6 +40,15 @@ def get_text_splitter(strategy: str, chunk_size: int = 1024, chunk_overlap: int
|
|
40 |
}
|
41 |
return splitters.get(strategy)
|
42 |
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
43 |
def load_doc(list_file_path: List[str], splitting_strategy: str, chunk_size: str):
|
44 |
chunk_size_value = CHUNK_SIZES[chunk_size][splitting_strategy]
|
45 |
loaders = [PyPDFLoader(x) for x in list_file_path]
|
@@ -59,12 +68,13 @@ def create_db(splits, db_choice: str = "faiss"):
|
|
59 |
"qdrant": lambda: Qdrant.from_documents(
|
60 |
splits,
|
61 |
embeddings,
|
62 |
-
location=":memory:",
|
63 |
collection_name="pdf_docs"
|
64 |
)
|
65 |
}
|
66 |
return db_creators[db_choice]()
|
67 |
|
|
|
68 |
def initialize_database(list_file_obj, splitting_strategy, chunk_size, db_choice, progress=gr.Progress()):
|
69 |
"""Initialize vector database with error handling"""
|
70 |
try:
|
@@ -77,7 +87,7 @@ def initialize_database(list_file_obj, splitting_strategy, chunk_size, db_choice
|
|
77 |
|
78 |
doc_splits = load_doc(list_file_path, splitting_strategy, chunk_size)
|
79 |
if not doc_splits:
|
80 |
-
return None, "No content extracted from documents."
|
81 |
|
82 |
vector_db = create_db(doc_splits, db_choice)
|
83 |
return vector_db, f"Database created successfully using {splitting_strategy} splitting and {db_choice} vector database!"
|
@@ -100,7 +110,7 @@ def initialize_llmchain(llm_choice, temperature, max_tokens, top_k, vector_db, p
|
|
100 |
max_new_tokens=max_tokens,
|
101 |
top_k=top_k
|
102 |
)
|
103 |
-
|
104 |
memory = ConversationBufferMemory(
|
105 |
memory_key="chat_history",
|
106 |
output_key='answer',
|
|
|
40 |
}
|
41 |
return splitters.get(strategy)
|
42 |
|
43 |
+
# def get_text_splitter(strategy, chunk_size=1024, chunk_overlap=64):
|
44 |
+
# if strategy == "recursive":
|
45 |
+
# return RecursiveCharacterTextSplitter(chunk_size=chunk_size, chunk_overlap=chunk_overlap)
|
46 |
+
# elif strategy == "fixed":
|
47 |
+
# return CharacterTextSplitter(chunk_size=chunk_size, chunk_overlap=chunk_overlap)
|
48 |
+
# elif strategy == "token":
|
49 |
+
# return TokenTextSplitter(chunk_size=chunk_size, chunk_overlap=chunk_overlap)
|
50 |
+
# return None
|
51 |
+
|
52 |
def load_doc(list_file_path: List[str], splitting_strategy: str, chunk_size: str):
|
53 |
chunk_size_value = CHUNK_SIZES[chunk_size][splitting_strategy]
|
54 |
loaders = [PyPDFLoader(x) for x in list_file_path]
|
|
|
68 |
"qdrant": lambda: Qdrant.from_documents(
|
69 |
splits,
|
70 |
embeddings,
|
71 |
+
location=":memory:", # In memory database for qdrant
|
72 |
collection_name="pdf_docs"
|
73 |
)
|
74 |
}
|
75 |
return db_creators[db_choice]()
|
76 |
|
77 |
+
# Initialize Vector DB
|
78 |
def initialize_database(list_file_obj, splitting_strategy, chunk_size, db_choice, progress=gr.Progress()):
|
79 |
"""Initialize vector database with error handling"""
|
80 |
try:
|
|
|
87 |
|
88 |
doc_splits = load_doc(list_file_path, splitting_strategy, chunk_size)
|
89 |
if not doc_splits:
|
90 |
+
return None, "No content extracted from documents."
|
91 |
|
92 |
vector_db = create_db(doc_splits, db_choice)
|
93 |
return vector_db, f"Database created successfully using {splitting_strategy} splitting and {db_choice} vector database!"
|
|
|
110 |
max_new_tokens=max_tokens,
|
111 |
top_k=top_k
|
112 |
)
|
113 |
+
# Temporary memory
|
114 |
memory = ConversationBufferMemory(
|
115 |
memory_key="chat_history",
|
116 |
output_key='answer',
|