umarbalak committed on
Commit 4ff3463 · 1 Parent(s): e226092
Files changed (3)
  1. .gitignore +183 -0
  2. app.py +235 -60
  3. data_extraction.py +264 -0
.gitignore ADDED
@@ -0,0 +1,183 @@
+
+ QueryMind
+
+ *.log
+ *.json
+ *.docx
+ *.pdf
+ *.md
+
+ # Byte-compiled / optimized / DLL files
+ __pycache__/
+ *.py[cod]
+ *$py.class
+
+ # C extensions
+ *.so
+
+ # Distribution / packaging
+ .Python
+ build/
+ develop-eggs/
+ dist/
+ downloads/
+ eggs/
+ .eggs/
+ lib/
+ lib64/
+ parts/
+ sdist/
+ var/
+ wheels/
+ share/python-wheels/
+ *.egg-info/
+ .installed.cfg
+ *.egg
+ MANIFEST
+
+ # PyInstaller
+ # Usually these files are written by a python script from a template
+ # before PyInstaller builds the exe, so as to inject date/other infos into it.
+ *.manifest
+ *.spec
+
+ # Installer logs
+ pip-log.txt
+ pip-delete-this-directory.txt
+
+ # Unit test / coverage reports
+ htmlcov/
+ .tox/
+ .nox/
+ .coverage
+ .coverage.*
+ .cache
+ nosetests.xml
+ coverage.xml
+ *.cover
+ *.py,cover
+ .hypothesis/
+ .pytest_cache/
+ cover/
+
+ # Translations
+ *.mo
+ *.pot
+
+ # Django stuff:
+ *.log
+ local_settings.py
+ db.sqlite3
+ db.sqlite3-journal
+
+ # Flask stuff:
+ instance/
+ .webassets-cache
+
+ # Scrapy stuff:
+ .scrapy
+
+ # Sphinx documentation
+ docs/_build/
+
+ # PyBuilder
+ .pybuilder/
+ target/
+
+ # Jupyter Notebook
+ .ipynb_checkpoints
+
+ # IPython
+ profile_default/
+ ipython_config.py
+
+ # pyenv
+ # For a library or package, you might want to ignore these files since the code is
+ # intended to run in multiple environments; otherwise, check them in:
+ # .python-version
+
+ # pipenv
+ # According to pypa/pipenv#598, it is recommended to include Pipfile.lock in version control.
+ # However, in case of collaboration, if having platform-specific dependencies or dependencies
+ # having no cross-platform support, pipenv may install dependencies that don't work, or not
+ # install all needed dependencies.
+ #Pipfile.lock
+
+ # UV
+ # Similar to Pipfile.lock, it is generally recommended to include uv.lock in version control.
+ # This is especially recommended for binary packages to ensure reproducibility, and is more
+ # commonly ignored for libraries.
+ #uv.lock
+
+ # poetry
+ # Similar to Pipfile.lock, it is generally recommended to include poetry.lock in version control.
+ # This is especially recommended for binary packages to ensure reproducibility, and is more
+ # commonly ignored for libraries.
+ # https://python-poetry.org/docs/basic-usage/#commit-your-poetrylock-file-to-version-control
+ #poetry.lock
+
+ # pdm
+ # Similar to Pipfile.lock, it is generally recommended to include pdm.lock in version control.
+ #pdm.lock
+ # pdm stores project-wide configurations in .pdm.toml, but it is recommended to not include it
+ # in version control.
+ # https://pdm.fming.dev/latest/usage/project/#working-with-version-control
+ .pdm.toml
+ .pdm-python
+ .pdm-build/
+
+ # PEP 582; used by e.g. github.com/David-OConnor/pyflow and github.com/pdm-project/pdm
+ __pypackages__/
+
+ # Celery stuff
+ celerybeat-schedule
+ celerybeat.pid
+
+ # SageMath parsed files
+ *.sage.py
+
+ # Environments
+ .env
+ .venv
+ env/
+ venv/
+ ENV/
+ env.bak/
+ venv.bak/
+
+ # Spyder project settings
+ .spyderproject
+ .spyproject
+
+ # Rope project settings
+ .ropeproject
+
+ # mkdocs documentation
+ /site
+
+ # mypy
+ .mypy_cache/
+ .dmypy.json
+ dmypy.json
+
+ # Pyre type checker
+ .pyre/
+
+ # pytype static type analyzer
+ .pytype/
+
+ # Cython debug symbols
+ cython_debug/
+
+ # PyCharm
+ # JetBrains specific template is maintained in a separate JetBrains.gitignore that can
+ # be found at https://github.com/github/gitignore/blob/main/Global/JetBrains.gitignore
+ # and can be added to the global gitignore or merged into this file. For a more nuclear
+ # option (not recommended) you can uncomment the following to ignore the entire idea folder.
+ #.idea/
+
+ # Ruff stuff:
+ .ruff_cache/
+
+ # PyPI configuration file
+ .pypirc
app.py CHANGED
@@ -1,64 +1,239 @@
  import gradio as gr
- from huggingface_hub import InferenceClient
-
- """
- For more information on `huggingface_hub` Inference API support, please check the docs: https://huggingface.co/docs/huggingface_hub/v0.22.2/en/guides/inference
- """
- client = InferenceClient("HuggingFaceH4/zephyr-7b-beta")
-
-
- def respond(
-     message,
-     history: list[tuple[str, str]],
-     system_message,
-     max_tokens,
-     temperature,
-     top_p,
- ):
-     messages = [{"role": "system", "content": system_message}]
-
-     for val in history:
-         if val[0]:
-             messages.append({"role": "user", "content": val[0]})
-         if val[1]:
-             messages.append({"role": "assistant", "content": val[1]})
-
-     messages.append({"role": "user", "content": message})
-
-     response = ""
-
-     for message in client.chat_completion(
-         messages,
-         max_tokens=max_tokens,
-         stream=True,
-         temperature=temperature,
-         top_p=top_p,
-     ):
-         token = message.choices[0].delta.content
-
-         response += token
-         yield response
-
-
- """
- For information on how to customize the ChatInterface, peruse the gradio docs: https://www.gradio.app/docs/chatinterface
- """
- demo = gr.ChatInterface(
-     respond,
-     additional_inputs=[
-         gr.Textbox(value="You are a friendly Chatbot.", label="System message"),
-         gr.Slider(minimum=1, maximum=2048, value=512, step=1, label="Max new tokens"),
-         gr.Slider(minimum=0.1, maximum=4.0, value=0.7, step=0.1, label="Temperature"),
-         gr.Slider(
-             minimum=0.1,
-             maximum=1.0,
-             value=0.95,
-             step=0.05,
-             label="Top-p (nucleus sampling)",
-         ),
-     ],
- )


  if __name__ == "__main__":
-     demo.launch()

  import gradio as gr
+ from pathlib import Path
+ from tempfile import mkdtemp

+ # LangChain & Embedding/LLM
+ from langchain_community.vectorstores import FAISS
+ from langchain_huggingface.embeddings import HuggingFaceEmbeddings
+ from langchain_huggingface import HuggingFaceEndpoint
+ from langchain.chains import create_retrieval_chain
+ from langchain.chains.combine_documents import create_stuff_documents_chain
+ from langchain_core.prompts import PromptTemplate
+ from langchain_text_splitters import RecursiveCharacterTextSplitter
+
+ # Custom Data Extractor
+ from data_extraction import DataExtractor
+
+ # Constants
+ EMBED_MODEL_ID = "BAAI/bge-large-en-v1.5"
+ GEN_MODEL_ID = "mistralai/Mixtral-8x7B-Instruct-v0.1"
+ TOP_K = 5
+ TEMP_DIR = Path(mkdtemp())
+ milvus_uri = str(TEMP_DIR / "docling.db")
+
+ # Initialize
+ extractor = DataExtractor()
+ embedding = HuggingFaceEmbeddings(model_name=EMBED_MODEL_ID)
+ llm = HuggingFaceEndpoint(repo_id=GEN_MODEL_ID, task="text-generation")
+
+ template = """You are a helpful assistant. Based on the following context, answer the user's query.
+ If you don't know the answer, just say that you don't know. Don't try to make up an answer.
+ Use the given context strictly to answer the question.
+
+ <context>
+ {context}
+ </context>
+
+ Question: {input}
+ Helpful Answer:"""
+
+ rag_prompt = PromptTemplate.from_template(template)
+
+ # App State
+ vectorstore = None
+ retriever = None  # Store the retriever separately
+ rag_chain = None
+
+ # Function to process using direct extraction methods
+ def process_direct(file_path):
+     text = ""
+     if file_path.suffix.lower() == ".pdf":
+         text = extractor.extract_text_from_pdf(file_path)
+     elif file_path.suffix.lower() == ".docx":
+         text = extractor.extract_text_from_doc(file_path)
+     elif file_path.suffix.lower() == ".txt":
+         text = extractor.extract_text_from_txt(file_path)
+     else:
+         return None, "Unsupported file format. Please use PDF, DOCX, or TXT."
+
+     # Clean text
+     text = extractor.clean_text(text)
+
+     # Split text using RecursiveCharacterTextSplitter
+     text_splitter = RecursiveCharacterTextSplitter(
+         chunk_size=1000,
+         chunk_overlap=200,
+         length_function=len
+     )
+
+     from langchain_core.documents import Document
+     doc = Document(
+         page_content=text,
+         metadata={
+             "source": str(file_path),
+             "filename": file_path.name
+         }
+     )
+     return text_splitter.split_documents([doc]), "Direct extraction completed successfully."
+
+ # Function to process using docling
+ def process_docling(file_path):
+     try:
+         if file_path.suffix.lower() == ".pdf":
+             # Convert PDF to markdown
+             markdown_path = TEMP_DIR / "temp.md"
+             extractor.pdf_to_markdown(file_path, markdown_path)
+             # Load markdown
+             markdown_raw = extractor.load_markdown_file(markdown_path)
+         elif file_path.suffix.lower() == ".md":
+             markdown_raw = extractor.load_markdown_file(file_path)
+         else:
+             return None, "Only PDF or Markdown files are supported for docling extraction."
+
+         # Update metadata with filename
+         for doc in markdown_raw:
+             doc.metadata.update({
+                 "filename": file_path.name
+             })
+
+         # Split markdown
+         splits = extractor.spit_markdown(markdown_raw)
+
+         # Update metadata for all splits
+         for split in splits:
+             split.metadata.update({
+                 "filename": file_path.name
+             })
+
+         return splits, "Docling extraction completed successfully."
+     except Exception as e:
+         return None, f"Error during docling extraction: {str(e)}"
+
+ # Step 1: File Upload & Index
+ def process_file(file, extraction_method):
+     global vectorstore, retriever, rag_chain
+
+     if file is None:
+         return "Please upload a file first."
+
+     file_path = Path(file.name)
+
+     # Process based on selected extraction method
+     if extraction_method == "Direct Extraction":
+         data_splits, message = process_direct(file_path)
+     else:  # Docling Extraction
+         data_splits, message = process_docling(file_path)
+
+     if data_splits is None:
+         return message
+
+     # Create vector store
+     vectorstore = FAISS.from_documents(documents=data_splits, embedding=embedding)
+
+     # Create retriever and chain
+     retriever = vectorstore.as_retriever(search_kwargs={"k": TOP_K})
+     question_answer_chain = create_stuff_documents_chain(llm, prompt=rag_prompt)
+     rag_chain = create_retrieval_chain(retriever, question_answer_chain)
+
+     # Count chunks created
+     chunk_count = len(data_splits)
+
+     return f"File processed using {extraction_method}. {message} Created {chunk_count} document chunks. You can now ask questions."
+
+ # Format source information
+ def format_sources(docs):
+     source_info = []
+     seen_sources = set()
+
+     for i, doc in enumerate(docs, 1):
+         # Extract source information
+         source = doc.metadata.get("source", "Unknown")
+         filename = doc.metadata.get("filename", Path(source).name if source != "Unknown" else "Unknown")
+
+         # Get header information if available (from docling extraction)
+         header_info = []
+         for level in range(1, 4):
+             header = doc.metadata.get(f"Header_{level}", "")
+             if header:
+                 header_info.append(header)
+
+         # Create a unique identifier for this source
+         source_id = f"{filename}{'_' + '_'.join(header_info) if header_info else ''}"
+
+         if source_id not in seen_sources:
+             seen_sources.add(source_id)
+
+             # Format the source information
+             source_text = f"**Source {i}: {filename}**"
+             if header_info:
+                 source_text += f"\n - Section: {' > '.join(header_info)}"
+
+             # Add the full text content without truncation
+             source_text += f"\n - Content: \"{doc.page_content}\""
+
+             source_info.append(source_text)
+
+     return "\n\n".join(source_info)
+
+ # Step 2: Chat with RAG
+ def chat_with_doc(query):
+     if not rag_chain:
+         return "Upload and process a document first.", ""
+
+     # Use the stored retriever directly to get documents
+     if not retriever:
+         return "Retriever not initialized. Please process a document first.", ""
+
+     # Get documents directly from retriever
+     retrieved_docs = retriever.get_relevant_documents(query)
+
+     # Then invoke the chain with the query
+     result = rag_chain.invoke({"input": query})
+     answer = result.get("answer", "No answer found.")
+
+     # Format source information
+     source_text = format_sources(retrieved_docs)
+
+     return answer, source_text
+
+ # Gradio Interface
+ with gr.Blocks() as app:
+     gr.Markdown("# Document Q&A System")
+
+     with gr.Tab("Upload Document"):
+         file_input = gr.File(label="Upload File", file_types=[".pdf", ".md", ".docx", ".txt"])
+         extraction_method = gr.Radio(
+             ["Direct Extraction", "Docling Extraction"],
+             label="Extraction Method",
+             value="Direct Extraction",
+             info="Direct extraction is faster but simpler. Docling extraction preserves document structure better."
+         )
+         process_btn = gr.Button("Process Document")
+         file_output = gr.Textbox(label="Processing Status", interactive=False)
+
+         process_btn.click(
+             fn=process_file,
+             inputs=[file_input, extraction_method],
+             outputs=file_output
+         )
+
+     with gr.Tab("Chat with Document"):
+         chat_input = gr.Textbox(label="Ask a Question", placeholder="Type your question...")
+         ask_btn = gr.Button("Ask")
+         chat_output = gr.Textbox(label="Answer", interactive=False)
+         sources_output = gr.Textbox(label="Sources", interactive=False)
+
+         ask_btn.click(
+             fn=chat_with_doc,
+             inputs=chat_input,
+             outputs=[chat_output, sources_output]
+         )
+
+         chat_input.submit(
+             fn=chat_with_doc,
+             inputs=chat_input,
+             outputs=[chat_output, sources_output]
+         )

  if __name__ == "__main__":
+     app.launch()
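
Note: the following is a minimal sketch, not part of the commit, showing how the new RAG flow in app.py could be exercised without the Gradio UI. It assumes the same dependencies as app.py and a valid Hugging Face token for HuggingFaceEndpoint; importing app reuses the embedding model, LLM endpoint and prompt it builds at import time, and "sample.pdf" plus the question string are placeholders.

    from pathlib import Path

    from langchain_community.vectorstores import FAISS
    from langchain.chains import create_retrieval_chain
    from langchain.chains.combine_documents import create_stuff_documents_chain

    # Reuse the objects app.py creates at import time (extractor, embedding, llm, prompt).
    from app import process_direct, embedding, llm, rag_prompt, TOP_K

    # 1. Extract and chunk a document with the same helper the UI calls.
    splits, status = process_direct(Path("sample.pdf"))  # placeholder path
    print(status)

    # 2. Index the chunks and assemble the retrieval chain, mirroring process_file().
    vectorstore = FAISS.from_documents(documents=splits, embedding=embedding)
    retriever = vectorstore.as_retriever(search_kwargs={"k": TOP_K})
    rag_chain = create_retrieval_chain(
        retriever, create_stuff_documents_chain(llm, prompt=rag_prompt)
    )

    # 3. Ask a question, mirroring chat_with_doc().
    result = rag_chain.invoke({"input": "What is this document about?"})
    print(result.get("answer", "No answer found."))
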
data_extraction.py ADDED
@@ -0,0 +1,264 @@
+ import json
+ import re
+ from pathlib import Path
+
+ class DataExtractor:
+     def __init__(self):
+         """
+         Initialize DataExtractor with robust logging and error handling
+         """
+         pass
+
+     def extract_text_from_pdf(self, pdf_path):
+         """
+         Robust PDF text extraction with enhanced error handling
+
+         Args:
+             pdf_path (str): Path to PDF file
+
+         Returns:
+             str: Extracted text or empty string
+         """
+         try:
+             import pymupdf
+             doc = pymupdf.open(pdf_path)
+
+             # Extract text with page-level error handling
+             full_text = []
+             for page_num, page in enumerate(doc, 1):
+                 try:
+                     page_text = page.get_text()
+                     full_text.append(page_text)
+                 except Exception as page_error:
+                     print(f"Text extraction failed for page {page_num} in {pdf_path}: {page_error}")
+
+             return ''.join(full_text)
+
+         except Exception as e:
+             print(f"PDF Extraction failed for {pdf_path}: {e}")
+             return ""
+
+     def extract_text_from_doc(self, doc_path):
+         """
+         Robust DOCX text extraction with comprehensive error handling
+
+         Args:
+             doc_path (str): Path to DOCX file
+
+         Returns:
+             str: Extracted text or empty string
+         """
+         try:
+             import docx2txt
+             return docx2txt.process(doc_path)
+         except Exception as e:
+             print(f"DOCX Extraction failed for {doc_path}: {e}")
+             return ""
+
+     def extract_text_from_txt(self, txt_path):
+         """
+         Robust TXT file text extraction
+
+         Args:
+             txt_path (str): Path to TXT file
+
+         Returns:
+             str: Extracted text or empty string
+         """
+         try:
+             with open(txt_path, "r", encoding="utf-8") as f:
+                 return f.read()
+         except UnicodeDecodeError:
+             # Try alternative encodings
+             for encoding in ['latin-1', 'iso-8859-1', 'cp1252']:
+                 try:
+                     with open(txt_path, "r", encoding=encoding) as f:
+                         return f.read()
+                 except:
+                     continue
+         except Exception as e:
+             print(f"TXT Extraction failed for {txt_path}: {e}")
+             return ""
+
+     def clean_text(self, text):
+         """
+         Robust cleaner for government FAQ chatbot.
+         - Removes boilerplate (page numbers, headers/footers)
+         - Preserves layout for LLM tokenization
+         - Normalizes structure and character encoding
+         """
+
+         # Normalize line endings
+         text = text.replace('\r\n', '\n').replace('\r', '\n')
+
+         # Remove page headers/footers (common boilerplate)
+         text = re.sub(r'\n?Page\s*\d+(\s*of\s*\d+)?\n?', '\n', text, flags=re.IGNORECASE)
+         text = re.sub(r'\n?Copyright.*?\d{4}.*?\n?', '\n', text, flags=re.IGNORECASE)
+         text = re.sub(r'\n?All rights reserved.*?\n?', '\n', text, flags=re.IGNORECASE)
+
+         # Remove decorative dividers
+         text = re.sub(r'^\s*[-=_*]{3,}\s*$', '', text, flags=re.MULTILINE)
+
+         # Strip non-printable characters but keep structure
+         text = re.sub(r'[^\x09\x0A\x0D\x20-\x7E]', '', text)
+
+         # Normalize whitespace
+         text = re.sub(r'[ \t]+', ' ', text)  # inline
+         text = re.sub(r'[ \t]+\n', '\n', text)  # line ends
+         text = re.sub(r'\n{3,}', '\n\n', text)  # excessive newlines
+
+         return text.strip()
+
+
+     def extract_text_from_files(self, doc_paths):
+         """
+         Extract and process text from multiple document types
+
+         Returns:
+             list: Extracted document data with metadata
+         """
+         doc_paths = [
+             str(Path(path).resolve())
+             for path in doc_paths
+             if Path(path).exists()
+         ]
+         data = []
+
+         for path in doc_paths:
+             try:
+                 ext = Path(path).suffix.lower()
+
+                 # Text extraction based on file type
+                 if ext == '.pdf':
+                     text = self.extract_text_from_pdf(path)
+                 elif ext == '.docx':
+                     text = self.extract_text_from_doc(path)
+                 elif ext == '.txt':
+                     text = self.extract_text_from_txt(path)
+                 else:
+                     print(f"Unsupported format: {path}")
+                     continue
+
+                 # Skip empty documents
+                 if not text.strip():
+                     print(f"No text extracted from {path}")
+                     continue
+
+                 # Clean and structure extracted text
+                 cleaned_text = self.clean_text(text)
+
+                 # Add document metadata
+                 doc_data = {
+                     "title": Path(path).name,
+                     "path": path,
+                     "text": cleaned_text,
+                     "text_length": len(cleaned_text)
+                 }
+
+                 data.append(doc_data)
+
+             except Exception as e:
+                 print(f"Unexpected error processing {path}: {e}")
+
+         return data
+
+     def pdf_to_markdown(self, pdf_path, markdown_path):
+         """
+         Convert PDF to Markdown using docling
+
+         Args:
+             pdf_path (Path): Path to PDF file
+             markdown_path (Path): Output path for markdown file
+
+         Returns:
+             str: Markdown content
+         """
+         try:
+             from docling.document_converter import DocumentConverter
+
+             # Define the source PDF file
+             source = pdf_path
+             converter = DocumentConverter()
+
+             # Convert the PDF to Markdown
+             result = converter.convert(source)
+             markdown = result.document.export_to_markdown()
+
+             # Write the Markdown output to a file
+             with open(markdown_path, "w", encoding="utf-8") as file:
+                 file.write(markdown)
+
+             return markdown
+         except ImportError:
+             print("Docling is not installed. Please install it with: pip install docling")
+             return ""
+         except Exception as e:
+             print(f"Error converting PDF to Markdown: {e}")
+             return ""
+
+     def load_markdown_file(self, markdown_path):
+         """
+         Load markdown file using langchain_docling
+
+         Args:
+             markdown_path (Path): Path to markdown file
+
+         Returns:
+             list: List of documents
+         """
+         try:
+             from langchain_docling import DoclingLoader
+             from langchain_docling.loader import ExportType
+
+             loader = DoclingLoader(
+                 file_path=markdown_path,
+                 export_type=ExportType.MARKDOWN
+             )
+             data = loader.load()
+             return data
+         except ImportError:
+             print("langchain_docling is not installed. Please install it with: pip install langchain_docling")
+             return []
+         except Exception as e:
+             print(f"Error loading markdown file: {e}")
+             return []
+
+     def spit_markdown(self, markdown):
+         """
+         Split markdown using MarkdownHeaderTextSplitter
+
+         Args:
+             markdown (list): List of markdown documents
+
+         Returns:
+             list: List of split documents
+         """
+         try:
+             from langchain_text_splitters import MarkdownHeaderTextSplitter
+
+             splitter = MarkdownHeaderTextSplitter(
+                 headers_to_split_on=[
+                     ("#", "Header_1"),
+                     ("##", "Header_2"),
+                     ("###", "Header_3"),
+                 ],
+             )
+
+             splits = []
+             for doc in markdown:
+                 for split in splitter.split_text(doc.page_content):
+                     # Include metadata and page_content from the original document
+                     split.metadata.update({
+                         "Header_1": split.metadata.get("Header_1", ""),
+                         "Header_2": split.metadata.get("Header_2", ""),
+                         "Header_3": split.metadata.get("Header_3", "")
+                     })
+                     splits.append(split)
+
+             return splits
+         except ImportError:
+             print("langchain_text_splitters is not installed or has compatibility issues.")
+             return []
+         except Exception as e:
+             print(f"Error splitting markdown: {e}")
+             return []
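
Note: a short usage sketch, not part of the commit, for the DataExtractor API added above. It assumes pymupdf, docx2txt, docling and langchain-docling are installed; the file names are placeholders.

    from pathlib import Path
    from data_extraction import DataExtractor

    extractor = DataExtractor()

    # Plain-text route: extract a single PDF, then clean the text.
    raw = extractor.extract_text_from_pdf("report.pdf")
    clean = extractor.clean_text(raw)
    print(len(clean), "characters after cleaning")

    # Batch route: returns a list of dicts with title, path, text, text_length.
    docs = extractor.extract_text_from_files(["report.pdf", "notes.docx", "faq.txt"])
    print([d["title"] for d in docs])

    # Structure-preserving route: PDF -> Markdown -> header-aware splits.
    extractor.pdf_to_markdown(Path("report.pdf"), Path("report.md"))
    loaded = extractor.load_markdown_file(Path("report.md"))
    splits = extractor.spit_markdown(loaded)
    print(f"{len(splits)} header-delimited chunks")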