hugging2021 committed on
Commit
65cfd8a
·
verified ·
1 Parent(s): a88526d

Update rag_system.py

Files changed (1)
  1. rag_system.py +427 -214
rag_system.py CHANGED
@@ -1,227 +1,440 @@
import os
- import argparse
- import sys
- from langchain.chains import RetrievalQA
- from langchain_core.prompts import PromptTemplate
- from vector_store import get_embeddings, load_vector_store
- from llm_loader import load_llama_model
-
- def create_refine_prompts_with_pages(language="ko"):
-     if language == "ko":
-         question_prompt = PromptTemplate(
-             input_variables=["context_str", "question"],
-             template="""
- The following are the retrieved document fragments:
-
- {context_str}
-
- Please answer the question based on the documents above.
-
- **Important rules:**
- - If you drew on a document for your answer, cite that information
- - Only use information explicitly stated in the documents; do not guess
- - Only mention page numbers or sources confirmed in the documents above
- - For uncertain information, state "not confirmed in the documents"
-
- Question: {question}
- Answer:"""
-         )
-
-         refine_prompt = PromptTemplate(
-             input_variables=["question", "existing_answer", "context_str"],
-             template="""
- Existing answer:
- {existing_answer}
-
- Additional documents:
- {context_str}
-
- Supplement or revise the existing answer based on the additional documents above.
-
- **Rules:**
- - If the new information differs from the existing answer, revise it
- - Only use information explicitly stated in the additional documents
- - Write a single, complete answer
- - Do not mention uncertain sources or pages
-
- Question: {question}
- Answer:"""
-         )
    else:
-         question_prompt = PromptTemplate(
-             input_variables=["context_str", "question"],
-             template="""
- Here are the retrieved document fragments:
-
- {context_str}
-
- Please answer the question based on the above documents.
-
- **Important rules:**
- - Only use information explicitly stated in the documents
- - If citing sources, only mention what is clearly indicated in the documents above
- - Do not guess or infer page numbers not shown in the context
- - If unsure, state "not confirmed in the provided documents"
-
- Question: {question}
- Answer:"""
-         )
-
-         refine_prompt = PromptTemplate(
-             input_variables=["question", "existing_answer", "context_str"],
-             template="""
- Existing answer:
- {existing_answer}
-
- Additional documents:
- {context_str}
-
- Refine the existing answer using the additional documents.
-
- **Rules:**
- - Only use information explicitly stated in the additional documents
- - Create one coherent final answer
- - Do not mention uncertain sources or page numbers
-
- Question: {question}
- Answer:"""
-         )
-
-     return question_prompt, refine_prompt
-
- def build_rag_chain(llm, vectorstore, language="ko", k=7):
-     """Build the RAG chain."""
-     question_prompt, refine_prompt = create_refine_prompts_with_pages(language)
-
-     qa_chain = RetrievalQA.from_chain_type(
-         llm=llm,
-         chain_type="refine",
-         retriever=vectorstore.as_retriever(search_kwargs={"k": k}),
-         chain_type_kwargs={
-             "question_prompt": question_prompt,
-             "refine_prompt": refine_prompt
-         },
-         return_source_documents=True
-     )
-
-     return qa_chain
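- # Note: chain_type="refine" works sequentially — question_prompt answers from the
- # first retrieved chunk, then refine_prompt folds in the remaining k-1 chunks one at a time.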
-
- def ask_question_with_pages(qa_chain, question):
-     """Ask a question and print the answer with page references."""
-     result = qa_chain.invoke({"query": question})
-
-     # Keep only the text after "A:", if present
-     answer = result['result']
-     final_answer = answer.split("A:")[-1].strip() if "A:" in answer else answer.strip()
-
-     print(f"\n🧾 Question: {question}")
-     print(f"\n🟢 Final answer: {final_answer}")
-
-     # Metadata debugging output (disabled)
-     # debug_metadata_info(result["source_documents"])
-
-     # Group the source documents by page
-     print("\n📚 Source document summary:")
-     source_info = {}
-
-     for doc in result["source_documents"]:
-         source = doc.metadata.get('source', 'N/A')
-         page = doc.metadata.get('page', 'N/A')
-         doc_type = doc.metadata.get('type', 'N/A')
-         section = doc.metadata.get('section', None)
-         total_pages = doc.metadata.get('total_pages', None)
-
-         filename = doc.metadata.get('filename', 'N/A')
-         if filename == 'N/A':
-             filename = os.path.basename(source) if source != 'N/A' else 'N/A'
-
-         if filename not in source_info:
-             source_info[filename] = {
-                 'pages': set(),
-                 'sections': set(),
-                 'types': set(),
-                 'total_pages': total_pages
            }
-
-         if page != 'N/A':
-             if isinstance(page, str) and page.startswith('Section'):
-                 source_info[filename]['sections'].add(page)
-             else:
-                 source_info[filename]['pages'].add(page)
-
-         if section is not None:
-             source_info[filename]['sections'].add(f"Section {section}")
-
-         source_info[filename]['types'].add(doc_type)
-
-     # Print the summary
-     total_chunks = len(result["source_documents"])
-     print(f"Total chunks used: {total_chunks}")
-
-     for filename, info in source_info.items():
-         print(f"\n- {filename}")
-
-         # Total page count
-         if info['total_pages']:
-             print(f"  Total pages: {info['total_pages']}")
-
-         # Pages
-         if info['pages']:
-             pages_list = list(info['pages'])
-             print(f"  Pages: {', '.join(map(str, pages_list))}")
-
-         # Sections
-         if info['sections']:
-             sections_list = sorted(list(info['sections']))
-             print(f"  Sections: {', '.join(sections_list)}")
-
-         # Neither pages nor sections available
-         if not info['pages'] and not info['sections']:
-             print(f"  Pages: no information")
-
-         # Document types
-         types_str = ', '.join(sorted(info['types']))
-         print(f"  Types: {types_str}")
-
-     return result
-
- # The original ask_question is superseded by ask_question_with_pages
- def ask_question(qa_chain, question):
-     """Compatibility wrapper."""
-     return ask_question_with_pages(qa_chain, question)

if __name__ == "__main__":
-     parser = argparse.ArgumentParser(description="RAG refine system (with page-number support)")
-     parser.add_argument("--vector_store", type=str, default="vector_db", help="Vector store path")
-     parser.add_argument("--model", type=str, default="LGAI-EXAONE/EXAONE-3.5-7.8B-Instruct", help="LLM model ID")
-     parser.add_argument("--device", type=str, default="cuda", choices=["cuda", "cpu"], help="Device to use")
-     parser.add_argument("--k", type=int, default=7, help="Number of documents to retrieve")
-     parser.add_argument("--language", type=str, default="ko", choices=["ko", "en"], help="Prompt language")
-     parser.add_argument("--query", type=str, help="Question (omit to run in interactive mode)")
-
-     args = parser.parse_args()
-
-     embeddings = get_embeddings(device=args.device)
-     vectorstore = load_vector_store(embeddings, load_path=args.vector_store)
-     llm = load_llama_model()
-
-     qa_chain = build_rag_chain(llm, vectorstore, language=args.language, k=args.k)
-
-     print("🟢 RAG system with page-number support is ready!")
-
-     if args.query:
-         ask_question_with_pages(qa_chain, args.query)
-     else:
-         print("💬 Interactive mode started (type 'exit', 'quit', or '종료' to end)")
-         while True:
-             try:
-                 query = input("\nQuestion: ").strip()
-                 if query.lower() in ["exit", "quit", "종료"]:
-                     break
-                 if query:  # Skip empty input
-                     ask_question_with_pages(qa_chain, query)
-             except KeyboardInterrupt:
-                 print("\n\nExiting the program.")
-                 break
-             except Exception as e:
-                 print(f"❗ Error: {e}\nPlease try again.")
import os
+ import re
+ import glob
+ import time
+ from collections import defaultdict
+
+ from langchain.text_splitter import RecursiveCharacterTextSplitter
+ from langchain_core.documents import Document
+ from langchain_community.embeddings import HuggingFaceEmbeddings
+ from langchain_community.vectorstores import FAISS
+
+ # PyMuPDF library
+ try:
+     import fitz  # PyMuPDF
+     PYMUPDF_AVAILABLE = True
+     print("✅ PyMuPDF library available")
+ except ImportError:
+     PYMUPDF_AVAILABLE = False
+     print("⚠️ PyMuPDF library is not installed. Install with: pip install PyMuPDF")
+
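+ # Note: PYMUPDF_AVAILABLE is only reported at import time and is not checked below;
+ # pymupdf4llm (imported next) also requires PyMuPDF at runtime.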
+ # PDF processing utilities
+ import pytesseract
+ from PIL import Image
+ from pdf2image import convert_from_path
+ import pdfplumber
+ from pymupdf4llm import LlamaMarkdownReader
+
+ # --------------------------------
+ # Log Output
+ # --------------------------------
+
+ def log(msg):
+     print(f"[{time.strftime('%H:%M:%S')}] {msg}")
+
+ # --------------------------------
+ # Text Cleaning Function
+ # --------------------------------
+
+ def clean_text(text):
+     return re.sub(r"[^\uAC00-\uD7A3\u1100-\u11FF\u3130-\u318F\w\s.,!?\"'()$:\-]", "", text)
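+ # The whitelist keeps Hangul syllables (AC00-D7A3), Hangul Jamo (1100-11FF),
+ # Hangul compatibility Jamo (3130-318F), word characters, whitespace, and basic punctuation.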
+
+ def apply_corrections(text):
+     corrections = {
+         'º©': 'info', 'Ì': 'of', '½': 'operation', 'Ã': '', '©': '',
+         'â€™': "'", 'â€œ': '"', 'â€': '"'
+     }
+     for k, v in corrections.items():
+         text = text.replace(k, v)
+     return text
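+ # The keys above are cp1252-style mojibake and document-specific extraction artifacts;
+ # e.g. apply_corrections('â€œquoteâ€') returns '"quote"'.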
+
+ # --------------------------------
+ # HWPX Processing (Section-wise Processing Only)
+ # --------------------------------
+
+ def load_hwpx(file_path):
+     """Load an HWPX file (using XML parsing only)."""
+     import zipfile
+     import xml.etree.ElementTree as ET
+     import chardet
+
+     log(f"Starting HWPX section-wise processing: {file_path}")
+     start = time.time()
+     documents = []
+
+     try:
+         with zipfile.ZipFile(file_path, 'r') as zip_ref:
+             file_list = zip_ref.namelist()
+             section_files = [f for f in file_list
+                              if f.startswith('Contents/section') and f.endswith('.xml')]
+             section_files.sort()  # Process in section0.xml, section1.xml, ... order
+
+             log(f"Found section files: {len(section_files)} files")
+
+             for section_idx, section_file in enumerate(section_files):
+                 with zip_ref.open(section_file) as xml_file:
+                     raw = xml_file.read()
+                     encoding = chardet.detect(raw)['encoding'] or 'utf-8'
+                     try:
+                         text = raw.decode(encoding)
+                     except UnicodeDecodeError:
+                         text = raw.decode("cp949", errors="replace")
+
+                     tree = ET.ElementTree(ET.fromstring(text))
+                     root = tree.getroot()
+
+                     # Find text nodes regardless of namespace
+                     t_elements = [elem for elem in root.iter() if elem.tag.endswith('}t') or elem.tag == 't']
+                     body_text = ""
+                     for elem in t_elements:
+                         if elem.text:
+                             body_text += clean_text(elem.text) + " "
+
+                     # HWPX sections carry no page numbers, so page metadata stays empty
+                     page_value = ""
+
+                     if body_text.strip():
+                         documents.append(Document(
+                             page_content=apply_corrections(body_text),
+                             metadata={
+                                 "source": file_path,
+                                 "filename": os.path.basename(file_path),
+                                 "type": "hwpx_body",
+                                 "page": page_value,
+                                 "total_sections": len(section_files)
+                             }
+                         ))
+                         log(f"Section text extraction complete (chars: {len(body_text)})")
+
+                     # Find tables
+                     table_elements = [elem for elem in root.iter() if elem.tag.endswith('}table') or elem.tag == 'table']
+                     if table_elements:
+                         table_text = ""
+                         for table_idx, table in enumerate(table_elements):
+                             table_text += f"[Table {table_idx + 1}]\n"
+                             rows = [elem for elem in table.iter() if elem.tag.endswith('}tr') or elem.tag == 'tr']
+                             for row in rows:
+                                 row_text = []
+                                 cells = [elem for elem in row.iter() if elem.tag.endswith('}tc') or elem.tag == 'tc']
+                                 for cell in cells:
+                                     cell_texts = []
+                                     for t_elem in cell.iter():
+                                         if (t_elem.tag.endswith('}t') or t_elem.tag == 't') and t_elem.text:
+                                             cell_texts.append(clean_text(t_elem.text))
+                                     row_text.append(" ".join(cell_texts))
+                                 if row_text:
+                                     table_text += "\t".join(row_text) + "\n"
+
+                         if table_text.strip():
+                             documents.append(Document(
+                                 page_content=apply_corrections(table_text),
+                                 metadata={
+                                     "source": file_path,
+                                     "filename": os.path.basename(file_path),
+                                     "type": "hwpx_table",
+                                     "page": page_value,
+                                     "total_sections": len(section_files)
+                                 }
+                             ))
+                             log("Table extraction complete")
+
+                     # Find images
+                     if [elem for elem in root.iter() if elem.tag.endswith('}picture') or elem.tag == 'picture']:
+                         documents.append(Document(
+                             page_content="[Image included]",
+                             metadata={
+                                 "source": file_path,
+                                 "filename": os.path.basename(file_path),
+                                 "type": "hwpx_image",
+                                 "page": page_value,
+                                 "total_sections": len(section_files)
+                             }
+                         ))
+                         log("Image found")
+
+     except Exception as e:
+         log(f"HWPX processing error: {e}")
+
+     duration = time.time() - start
+
+     # Summary of extracted documents
+     if documents:
+         log(f"Number of extracted documents: {len(documents)}")
+
+     log(f"HWPX processing complete: {file_path} ⏱️ {duration:.2f}s, total {len(documents)} documents")
+     return documents
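+ # Minimal usage sketch (assumes a local sample.hwpx, for illustration only):
+ #   docs = load_hwpx("sample.hwpx")
+ #   for d in docs:
+ #       print(d.metadata["type"], len(d.page_content))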
+
+ # --------------------------------
+ # PDF Processing Functions (same as before)
+ # --------------------------------
+
+ def run_ocr_on_image(image: Image.Image, lang='kor+eng'):
+     return pytesseract.image_to_string(image, lang=lang)
+
+ def extract_images_with_ocr(pdf_path, lang='kor+eng'):
+     try:
+         images = convert_from_path(pdf_path)
+         page_ocr_data = {}
+         for idx, img in enumerate(images):
+             page_num = idx + 1
+             text = run_ocr_on_image(img, lang=lang)
+             if text.strip():
+                 page_ocr_data[page_num] = text.strip()
+         return page_ocr_data
+     except Exception as e:
+         print(f"Image OCR failed: {e}")
+         return {}
+
+ def extract_tables_with_pdfplumber(pdf_path):
+     page_table_data = {}
+     try:
+         with pdfplumber.open(pdf_path) as pdf:
+             for i, page in enumerate(pdf.pages):
+                 page_num = i + 1
+                 tables = page.extract_tables()
+                 table_text = ""
+                 for t_index, table in enumerate(tables):
+                     if table:
+                         table_text += f"[Table {t_index+1}]\n"
+                         for row in table:
+                             row_text = "\t".join(cell if cell else "" for cell in row)
+                             table_text += row_text + "\n"
+                 if table_text.strip():
+                     page_table_data[page_num] = table_text.strip()
+         return page_table_data
+     except Exception as e:
+         print(f"Table extraction failed: {e}")
+         return {}
+
+ def extract_body_text_with_pages(pdf_path):
+     page_body_data = {}
+     try:
+         pdf_processor = LlamaMarkdownReader()
+         docs = pdf_processor.load_data(file_path=pdf_path)
+
+         combined_text = ""
+         for d in docs:
+             if isinstance(d, dict) and "text" in d:
+                 combined_text += d["text"]
+             elif hasattr(d, "text"):
+                 combined_text += d.text
+
+         if combined_text.strip():
+             chars_per_page = 2000
+             start = 0
+             page_num = 1
+
+             while start < len(combined_text):
+                 end = start + chars_per_page
+                 if end > len(combined_text):
+                     end = len(combined_text)
+
+                 page_text = combined_text[start:end]
+                 if page_text.strip():
+                     page_body_data[page_num] = page_text.strip()
+                     page_num += 1
+
+                 if end == len(combined_text):
+                     break
+                 start = end - 100
+
+     except Exception as e:
+         print(f"Body extraction failed: {e}")
+
+     return page_body_data
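+ # Note: pymupdf4llm returns whole-document markdown, so the "pages" here are
+ # synthetic ~2000-char slices with a 100-char overlap, not true PDF page boundaries.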
+
+ def load_pdf_with_metadata(pdf_path):
+     """Extracts page-specific information from a PDF file."""
+     log(f"Starting PDF page-wise processing: {pdf_path}")
+     start = time.time()
+
+     # First, check the actual number of pages using PyPDFLoader
+     try:
+         from langchain_community.document_loaders import PyPDFLoader
+         loader = PyPDFLoader(pdf_path)
+         pdf_pages = loader.load()
+         actual_total_pages = len(pdf_pages)
+         log(f"Actual page count as verified by PyPDFLoader: {actual_total_pages}")
+     except Exception as e:
+         log(f"PyPDFLoader page count verification failed: {e}")
+         actual_total_pages = 1
+
+     try:
+         page_tables = extract_tables_with_pdfplumber(pdf_path)
+     except Exception as e:
+         page_tables = {}
+         print(f"Table extraction failed: {e}")
+
+     try:
+         page_ocr = extract_images_with_ocr(pdf_path)
+     except Exception as e:
+         page_ocr = {}
+         print(f"Image OCR failed: {e}")
+
+     try:
+         page_body = extract_body_text_with_pages(pdf_path)
+     except Exception as e:
+         page_body = {}
+         print(f"Body extraction failed: {e}")
+
+     duration = time.time() - start
+     log(f"PDF page-wise processing complete: {pdf_path} ⏱️ {duration:.2f}s")
+
+     # Set the total page count based on the actual number of pages
+     all_pages = set(page_tables.keys()) | set(page_ocr.keys()) | set(page_body.keys())
+     if all_pages:
+         max_extracted_page = max(all_pages)
+         # Use the greater of the actual and extracted page numbers
+         total_pages = max(actual_total_pages, max_extracted_page)
    else:
+         total_pages = actual_total_pages
+
+     log(f"Final total page count set to: {total_pages}")
+
+     docs = []
+
+     for page_num in sorted(all_pages):
+         if page_num in page_tables and page_tables[page_num].strip():
+             docs.append(Document(
+                 page_content=clean_text(apply_corrections(page_tables[page_num])),
+                 metadata={
+                     "source": pdf_path,
+                     "filename": os.path.basename(pdf_path),
+                     "type": "table",
+                     "page": page_num,
+                     "total_pages": total_pages
+                 }
+             ))
+             log(f"Page {page_num}: Table extraction complete")
+
+         if page_num in page_body and page_body[page_num].strip():
+             docs.append(Document(
+                 page_content=clean_text(apply_corrections(page_body[page_num])),
+                 metadata={
+                     "source": pdf_path,
+                     "filename": os.path.basename(pdf_path),
+                     "type": "body",
+                     "page": page_num,
+                     "total_pages": total_pages
+                 }
+             ))
+             log(f"Page {page_num}: Body extraction complete")
+
+         if page_num in page_ocr and page_ocr[page_num].strip():
+             docs.append(Document(
+                 page_content=clean_text(apply_corrections(page_ocr[page_num])),
+                 metadata={
+                     "source": pdf_path,
+                     "filename": os.path.basename(pdf_path),
+                     "type": "ocr",
+                     "page": page_num,
+                     "total_pages": total_pages
+                 }
+             ))
+             log(f"Page {page_num}: OCR extraction complete")
+
+     if not docs:
+         docs.append(Document(
+             page_content="[Content extraction failed]",
+             metadata={
+                 "source": pdf_path,
+                 "filename": os.path.basename(pdf_path),
+                 "type": "error",
+                 "page": 1,
+                 "total_pages": total_pages
            }
+         ))
+
+     # Summary of page information
+     if docs:
+         page_numbers = [doc.metadata.get('page', 0) for doc in docs if doc.metadata.get('page')]
+         if page_numbers:
+             log(f"Extracted page range: {min(page_numbers)} ~ {max(page_numbers)}")
+
+     log(f"PDF documents with extracted pages: {len(docs)} documents (total {total_pages} pages)")
+     return docs
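+ # Each page can thus yield up to three Documents (table, body, ocr), each tagged
+ # with its page number and the file's total page count for later citation.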
+
+ # --------------------------------
+ # Document Loading and Splitting
+ # --------------------------------
+
+ def load_documents(folder_path):
+     documents = []
+
+     for file in glob.glob(os.path.join(folder_path, "*.hwpx")):
+         log(f"HWPX file found: {file}")
+         docs = load_hwpx(file)
+         documents.extend(docs)
+
+     for file in glob.glob(os.path.join(folder_path, "*.pdf")):
+         log(f"PDF file found: {file}")
+         documents.extend(load_pdf_with_metadata(file))
+
+     log(f"Document loading complete! Total documents: {len(documents)}")
+     return documents
+
+ def split_documents(documents, chunk_size=800, chunk_overlap=100):
+     log("Starting chunk splitting")
+     splitter = RecursiveCharacterTextSplitter(
+         chunk_size=chunk_size,
+         chunk_overlap=chunk_overlap,
+         length_function=len
+     )
+     chunks = []
+     for doc in documents:
+         split = splitter.split_text(doc.page_content)
+         for i, chunk in enumerate(split):
+             enriched_chunk = f"passage: {chunk}"
+             chunks.append(Document(
+                 page_content=enriched_chunk,
+                 metadata={**doc.metadata, "chunk_index": i}
+             ))
+     log(f"Chunk splitting complete: Created {len(chunks)} chunks")
+     return chunks
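+ # The "passage: " prefix follows the E5 embedding convention: E5 models expect
+ # documents embedded as "passage: ..." and search queries as "query: ...".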
+
+ # --------------------------------
+ # Main Execution
+ # --------------------------------

if __name__ == "__main__":
+     folder = "dataset_test"
+     log("PyMuPDF-based document processing started")
+     docs = load_documents(folder)
+     log("Document loading complete")
+
+     # Page information check
+     log("Page information summary:")
+     page_info = {}
+     for doc in docs:
+         source = doc.metadata.get('source', 'unknown')
+         page = doc.metadata.get('page', 'unknown')
+         doc_type = doc.metadata.get('type', 'unknown')
+
+         if source not in page_info:
+             page_info[source] = {'pages': set(), 'types': set()}
+         page_info[source]['pages'].add(page)
+         page_info[source]['types'].add(doc_type)
+
+     for source, info in page_info.items():
+         # Consider only numeric pages; max() on mixed int/str values would raise TypeError
+         numeric_pages = [p for p in info['pages'] if isinstance(p, int)]
+         max_page = max(numeric_pages) if numeric_pages else 'unknown'
+         log(f"  {os.path.basename(source)}: {max_page} pages, types: {info['types']}")
+
+     chunks = split_documents(docs)
+     log("E5-Large-v2 embedding preparation")
+     embedding_model = HuggingFaceEmbeddings(
+         model_name="intfloat/e5-large-v2",
+         model_kwargs={"device": "cuda"}
+     )
+
+     vectorstore = FAISS.from_documents(chunks, embedding_model)
+     vectorstore.save_local("vector_db")
+
+     log(f"Total number of documents: {len(docs)}")
+     log(f"Total number of chunks: {len(chunks)}")
+     log("FAISS save complete: vector_db")
+
+     # Sample output with page information
+     log("\nSample including actual page information:")
+     for i, chunk in enumerate(chunks[:5]):
+         meta = chunk.metadata
+         log(f"  Chunk {i+1}: {meta.get('type')} | Page {meta.get('page')} | {os.path.basename(meta.get('source', 'unknown'))}")
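+
+     # Load-side sketch (assumes the same embedding model; recent langchain_community
+     # versions require allow_dangerous_deserialization=True for pickled FAISS indexes):
+     #   vs = FAISS.load_local("vector_db", embedding_model, allow_dangerous_deserialization=True)
+     #   hits = vs.similarity_search("query: ...", k=7)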