Chris4K committed · verified
Commit 4920c28 · 1 parent: fd8d102

Create services/pdf_service.py

Files changed (1)
  1. services/pdf_service.py  +117 -55
services/pdf_service.py CHANGED

@@ -1,11 +1,14 @@
 # services/pdf_service.py
 from pathlib import Path
-from typing import List, Dict, Any
+from typing import List, Dict, Any, Optional
 from PyPDF2 import PdfReader
 from langchain.text_splitter import RecursiveCharacterTextSplitter
+import faiss
+import numpy as np
 import asyncio
 from concurrent.futures import ThreadPoolExecutor
 import logging
+from datetime import datetime
 from config.config import settings
 
 logger = logging.getLogger(__name__)
@@ -15,70 +18,129 @@ class PDFService:
         self.embedder = model_service.embedder
         self.text_splitter = RecursiveCharacterTextSplitter(
             chunk_size=settings.CHUNK_SIZE,
-            chunk_overlap=settings.CHUNK_OVERLAP
+            chunk_overlap=settings.CHUNK_OVERLAP,
+            separators=["\n\n", "\n", ".", "!", "?", ",", " ", ""]
         )
-        self.pdf_chunks = []
-        self.faiss_index = None
+        self.index = None
+        self.chunks = []
+        self.last_update = None
+        self.pdf_metadata = {}
 
-    async def index_pdfs(self, pdf_folder: Path = settings.PDF_FOLDER) -> List[Dict[str, Any]]:
-        all_texts = []
-
-        async def process_pdf(pdf_file: Path) -> List[Dict[str, Any]]:
-            try:
-                reader = PdfReader(str(pdf_file))
-                metadata = reader.metadata
-                full_text = " ".join([
-                    page.extract_text()
-                    for page in reader.pages
-                    if page.extract_text()
-                ])
-                chunks = self.text_splitter.split_text(full_text)
-                return [{
-                    'text': chunk,
-                    'source': pdf_file.name,
-                    'metadata': metadata,
-                    'chunk_index': i
-                } for i, chunk in enumerate(chunks)]
-            except Exception as e:
-                logger.error(f"Error processing PDF {pdf_file}: {e}")
-                return []
+    async def process_pdf(self, pdf_path: Path) -> List[Dict[str, Any]]:
+        """Process a single PDF file"""
+        try:
+            reader = PdfReader(str(pdf_path))
+            chunks = []
+
+            # Extract metadata
+            metadata = {
+                'title': reader.metadata.get('/Title', ''),
+                'author': reader.metadata.get('/Author', ''),
+                'creation_date': reader.metadata.get('/CreationDate', ''),
+                'pages': len(reader.pages),
+                'filename': pdf_path.name
+            }
+            self.pdf_metadata[pdf_path.name] = metadata
+
+            # Process each page
+            for page_num, page in enumerate(reader.pages):
+                text = page.extract_text()
+                if not text:
+                    continue
+
+                page_chunks = self.text_splitter.split_text(text)
+                for i, chunk in enumerate(page_chunks):
+                    chunks.append({
+                        'text': chunk,
+                        'source': pdf_path.name,
+                        'page': page_num + 1,
+                        'chunk_index': i,
+                        'metadata': metadata,
+                        'timestamp': datetime.now().isoformat()
+                    })
+
+            return chunks
+
+        except Exception as e:
+            logger.error(f"Error processing PDF {pdf_path}: {e}")
+            return []
 
-        pdf_files = [f for f in pdf_folder.iterdir() if f.suffix.lower() == ".pdf"]
-
-        async with ThreadPoolExecutor() as executor:
-            tasks = [process_pdf(pdf_file) for pdf_file in pdf_files]
-            results = await asyncio.gather(*tasks)
+    async def index_pdfs(self, pdf_folder: Path = settings.PDF_FOLDER) -> None:
+        """Index all PDFs in the specified folder"""
+        try:
+            pdf_files = list(pdf_folder.glob('*.pdf'))
+            if not pdf_files:
+                logger.warning(f"No PDF files found in {pdf_folder}")
+                return
+
+            # Process PDFs in parallel
+            async with ThreadPoolExecutor() as executor:
+                tasks = [
+                    asyncio.create_task(self.process_pdf(pdf_file))
+                    for pdf_file in pdf_files
+                ]
+                chunk_lists = await asyncio.gather(*tasks)
+
+            # Combine all chunks
+            self.chunks = []
+            for chunk_list in chunk_lists:
+                self.chunks.extend(chunk_list)
+
+            # Create FAISS index
+            texts = [chunk['text'] for chunk in self.chunks]
+            embeddings = self.embedder.encode(
+                texts,
+                convert_to_tensor=True,
+                show_progress_bar=True
+            ).cpu().detach().numpy()
+
+            dimension = embeddings.shape[1]
+            self.index = faiss.IndexFlatL2(dimension)
+            self.index.add(embeddings)
 
-        for result in results:
-            all_texts.extend(result)
+            self.last_update = datetime.now()
 
-        self.pdf_chunks = all_texts
-        return all_texts
+            logger.info(f"Indexed {len(self.chunks)} chunks from {len(pdf_files)} PDFs")
+
+        except Exception as e:
+            logger.error(f"Error indexing PDFs: {e}")
+            raise
 
-    async def search_pdfs(self, query: str, top_k: int = 5) -> List[Dict[str, Any]]:
-        if not self.pdf_chunks:
+    async def search(
+        self,
+        query: str,
+        top_k: int = 5,
+        min_score: float = 0.5
+    ) -> List[Dict[str, Any]]:
+        """Search indexed PDFs"""
+        if not self.index or not self.chunks:
             await self.index_pdfs()
-
-        query_embedding = self.embedder.encode([query], convert_to_tensor=True).cpu().detach().numpy()
 
-        # Create embeddings for chunks if not already done
-        if not self.faiss_index:
-            chunk_embeddings = self.embedder.encode(
-                [chunk['text'] for chunk in self.pdf_chunks],
+        try:
+            # Get query embedding
+            query_embedding = self.embedder.encode(
+                [query],
                 convert_to_tensor=True
            ).cpu().detach().numpy()
 
-            d = chunk_embeddings.shape[1]
-            self.faiss_index = faiss.IndexFlatL2(d)
-            self.faiss_index.add(chunk_embeddings)
-
-        distances, indices = self.faiss_index.search(query_embedding, top_k)
-
-        results = []
-        for i, idx in enumerate(indices[0]):
-            chunk = self.pdf_chunks[idx].copy()
-            chunk['score'] = float(distances[0][i])
-            results.append(chunk)
+            # Search
+            distances, indices = self.index.search(query_embedding, top_k * 2)  # Get extra results for filtering
+
+            # Process results
+            results = []
+            for i, idx in enumerate(indices[0]):
+                if idx >= len(self.chunks) or distances[0][i] > min_score:
+                    continue
+
+                chunk = self.chunks[idx].copy()
+                chunk['score'] = float(1 - distances[0][i])  # Convert distance to similarity score
+                results.append(chunk)
 
-        return results
+            # Sort by score and take top_k
+            results.sort(key=lambda x: x['score'], reverse=True)
+            return results[:top_k]
+
+        except Exception as e:
+            logger.error(f"Error searching PDFs: {e}")
+            raise
+
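
A note for anyone wiring this service up: ThreadPoolExecutor implements only the synchronous context-manager protocol, so "async with ThreadPoolExecutor() as executor:" raises a TypeError at runtime, and since process_pdf is a coroutine the executor would go unused anyway. Separately, faiss.IndexFlatL2 returns squared L2 distances, so comparing them against min_score and converting with 1 - distance only behaves like a similarity when the embeddings are unit-normalized, in which case d = 2 * (1 - cosine) and the similarity is 1 - d / 2. The sketch below shows one way to run the same index-and-search flow with those two points handled. It is a minimal stand-in, not the committed code; sentence-transformers is an assumed dependency, and "all-MiniLM-L6-v2" and ./pdfs are placeholders.

# sketch.py - minimal stand-in for PDFService.index_pdfs / search.
# Assumptions (not from the commit): sentence-transformers installed,
# "all-MiniLM-L6-v2" is a placeholder model, ./pdfs a placeholder folder.
import asyncio
from pathlib import Path
from typing import Any, Dict, List

import faiss
import numpy as np
from PyPDF2 import PdfReader
from sentence_transformers import SentenceTransformer

embedder = SentenceTransformer("all-MiniLM-L6-v2")  # placeholder model

def parse_pdf(pdf_path: Path) -> List[Dict[str, Any]]:
    # Blocking PyPDF2 work; one chunk per non-empty page for brevity.
    reader = PdfReader(str(pdf_path))
    return [
        {"text": text, "source": pdf_path.name, "page": n + 1}
        for n, page in enumerate(reader.pages)
        if (text := page.extract_text())
    ]

async def index_folder(folder: Path):
    pdf_files = list(folder.glob("*.pdf"))
    # asyncio.to_thread hands the blocking parse to a worker thread, so no
    # executor context manager is needed; gather() fans the work out.
    chunk_lists = await asyncio.gather(
        *(asyncio.to_thread(parse_pdf, f) for f in pdf_files)
    )
    chunks = [c for cl in chunk_lists for c in cl]
    embeddings = np.ascontiguousarray(
        embedder.encode([c["text"] for c in chunks]), dtype="float32"
    )
    faiss.normalize_L2(embeddings)  # unit norm: L2 distance now tracks cosine
    index = faiss.IndexFlatL2(embeddings.shape[1])
    index.add(embeddings)
    return index, chunks

async def search(index, chunks, query: str, top_k: int = 5):
    q = np.ascontiguousarray(embedder.encode([query]), dtype="float32")
    faiss.normalize_L2(q)
    distances, indices = index.search(q, top_k)
    # Squared L2 on unit vectors: d = 2 * (1 - cos), so cos = 1 - d / 2.
    return [
        {**chunks[i], "score": float(1 - d / 2)}
        for d, i in zip(distances[0], indices[0])
        if i != -1  # FAISS pads with -1 when fewer than top_k hits exist
    ]

if __name__ == "__main__":
    async def main():
        index, chunks = await index_folder(Path("./pdfs"))  # placeholder path
        for hit in await search(index, chunks, "example query"):
            print(f"{hit['score']:.3f}  {hit['source']} p.{hit['page']}")
    asyncio.run(main())

The same two adjustments port directly onto the class in this commit: drive process_pdf with a plain asyncio.gather (or push the parsing through asyncio.to_thread) instead of the "async with ThreadPoolExecutor()" block, and normalize embeddings before index.add so that min_score thresholds a quantity on a single, known scale.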