Gopal2002 committed
Commit a467a2d · verified · 1 Parent(s): d8a7464

Upload 4 files

Files changed (4)
  1. Dockerfile +23 -0
  2. app.py +105 -0
  3. helper.py +295 -0
  4. requirements.txt +6 -0
Dockerfile ADDED
@@ -0,0 +1,23 @@
+ # Use the official Python 3.10.12 image
+ FROM python:3.10.12
+
+ # Copy the current directory contents into the container at .
+ COPY . .
+
+ # Set the working directory to /
+ WORKDIR /
+
+ RUN pip install --no-cache-dir --upgrade -r /requirements.txt
+
+ RUN useradd -m -u 1000 user
+
+ USER user
+
+ ENV HOME=/home/user \
+ PATH=/home/user/.local/bin:$PATH
+
+ WORKDIR $HOME/app
+
+ COPY --chown=user . $HOME/app
+
+ CMD ["uvicorn", "app:app", "--host", "0.0.0.0", "--port", "7860"]
app.py ADDED
@@ -0,0 +1,105 @@
+ from fastapi import FastAPI, HTTPException
+ from dotenv import load_dotenv
+ import boto3
+ import os
+ import uvicorn
+ import logging
+ from uuid import uuid4
+ from pydantic import BaseModel
+ from helper import PdfToSectionConverter
+
+ # Load environment variables
+ load_dotenv()
+
+ # Configure logging
+ logging.basicConfig(level=logging.INFO)
+ logger = logging.getLogger(__name__)
+
+ # Fetch AWS credentials from environment
+ s3_access_key_id = os.getenv("S3_ACCESS_KEY_ID")
+ s3_secret_key = os.getenv("S3_SECRET_KEY")
+ aws_region = os.getenv("AWS_REGION")
+
+ # Validate environment variables
+ if not all([s3_access_key_id, s3_secret_key, aws_region]):
+     logger.error("Missing AWS S3 credentials in environment variables.")
+     raise ValueError("AWS credentials not set properly.")
+
+ # Initialize FastAPI app
+ app = FastAPI()
+
+ # Configure S3 client
+ s3_client = boto3.client(
+     "s3",
+     aws_access_key_id=s3_access_key_id,
+     aws_secret_access_key=s3_secret_key,
+     region_name=aws_region,
+ )
+
+ class PdfRequest(BaseModel):
+     s3_file_path: str
+     file_title: str
+     doc_id: str
+     start_page: int = 0
+     end_page: int = 0
+
+ @app.get("/")
+ async def start():
+     return {"message": "Parser API is Ready"}
+
+ @app.post("/convert_pdf")
+ async def convert_pdf(request: PdfRequest):
+     try:
+         output_dir = "/tmp"
+         output_path = os.path.join(output_dir, "temp_file.pdf")
+         doc_id = request.doc_id
+
+         # Ensure the directory exists
+         if not os.path.exists(output_dir):
+             os.makedirs(output_dir, exist_ok=True)
+
+         # Validate S3 file path
+         if not request.s3_file_path.startswith("s3://"):
+             raise HTTPException(status_code=400, detail="Invalid S3 file path. Must start with 's3://'")
+
+         try:
+             bucket_name, object_key = request.s3_file_path.replace("s3://", "").split("/", 1)
+         except ValueError:
+             raise HTTPException(status_code=400, detail="Invalid S3 file path format.")
+
+         logger.info(f"Downloading {request.s3_file_path} from S3 bucket {bucket_name}...")
+
+         # Download PDF from S3
+         try:
+             s3_client.download_file(bucket_name, object_key, output_path)
+         except Exception as e:
+             logger.error(f"Failed to download file from S3: {str(e)}")
+             raise HTTPException(status_code=500, detail="Error downloading file from S3.")
+
+         # Initialize and run the converter
+         converter = PdfToSectionConverter()
+         output = converter.convert(
+             downloaded_pdf_path=output_path,
+             file_title=request.file_title,
+             doc_id=doc_id,
+             start_page_no=request.start_page,
+             end_page_no=request.end_page
+         )
+
+         # Cleanup the temporary file
+         os.remove(output_path)
+
+         return {"status": "success", "data": output}
+
+     except HTTPException:
+         raise
+     except Exception as e:
+         logger.error(f"Unexpected error: {str(e)}")
+         raise HTTPException(status_code=500, detail="Internal Server Error.")
+
+ def start_server():
+     logger.info("Starting Server...")
+     uvicorn.run("app:app", host="0.0.0.0", port=8000, reload=True)
+
+ if __name__ == "__main__":
+     start_server()
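
For reference, a client call to the /convert_pdf endpoint would look roughly like the sketch below. It is a minimal example, assuming the service is reachable on localhost:7860 (the port exposed in the Dockerfile CMD) and that the `requests` package is available on the client side; the bucket, object key, title, and doc_id values are placeholders.

    import requests

    # Hypothetical payload; the S3 path, title, and doc_id are placeholders.
    payload = {
        "s3_file_path": "s3://my-bucket/reports/sample.pdf",
        "file_title": "Sample Report",
        "doc_id": "doc-001",
        "start_page": 0,
        "end_page": 4,
    }

    resp = requests.post("http://localhost:7860/convert_pdf", json=payload, timeout=600)
    resp.raise_for_status()

    # On success the API returns {"status": "success", "data": [...]}, where each item
    # carries "doc_id", "text", "vector_id", "meta" and "content_type".
    sections = resp.json()["data"]
    print(f"{len(sections)} sections returned")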
helper.py ADDED
@@ -0,0 +1,295 @@
+ from docling.document_converter import DocumentConverter
+ import logging
+ import re
+ from uuid import uuid4
+ from typing import List, Optional, Generator, Set
+ from functools import partial, reduce
+ from itertools import chain
+ from PyPDF2 import PdfReader, PdfWriter
+
+ tag_list = ["Sources:", "Source:", "Tags-", "Tags:", "CONTENTS", "ANNEX", "EXERCISES", "Project/Activity"]
+
+ logger = logging.getLogger(__name__)
+
+ import os
+
+ try:
+     converter = DocumentConverter()
+ except Exception as e:
+     logger.error(f"Error initializing Docling DocumentConverter: {e}")
+
+ def split_pdf(input_pdf, output_pdf, start_page, end_page):
+     reader = PdfReader(input_pdf)
+     writer = PdfWriter()
+     for i in range(start_page, end_page + 1):
+         writer.add_page(reader.pages[i])
+     with open(output_pdf, "wb") as output_file:
+         writer.write(output_file)
+     print(f"PDF split successfully: {output_pdf}")
+
+ def get_texts(res):
+     page_texts = {pg: "" for pg in res['pages'].keys()}
+     texts = res.get('texts')
+     for item in texts:
+         for prov in item['prov']:
+             page_no = prov['page_no']
+             text = item['text']
+             page_key = f'{page_no}'
+             if page_key not in page_texts:
+                 page_texts[page_key] = text
+             else:
+                 page_texts[page_key] += ' ' + text
+     return page_texts
+
+ def clean_the_text(text):
+     """
+     Cleans the extracted text by removing unnecessary characters and formatting issues.
+
+     Args:
+         text (str): The extracted text.
+
+     Returns:
+         str: The cleaned text.
+     """
+     try:
+         text = re.sub(r'\n\s*\n', '\n', text)
+         text = text.replace("\t", " ")
+         text = text.replace("\f", " ")
+         text = re.sub(r'\b(\w+\s*)\1{1,}', '\\1', text)
+         text = re.sub(r'[^a-zA-Z0-9\s@\-/,.\\]', ' ', text)
+         return text.strip()
+     except Exception as e:
+         logger.error(f"Error cleaning text: {e}")
+         return text
+
+ def get_tables(res_json):
+     page_tables = {pg: [] for pg in res_json['pages'].keys()}
+     try:
+         tables = res_json.get('tables', [])
+         if not isinstance(tables, list):
+             raise ValueError("Expected 'tables' to be a list.")
+         for table in tables:
+             try:
+                 # Ensure 'prov' exists and has the necessary structure
+                 prov = table.get('prov', [])
+                 if not prov or not isinstance(prov, list):
+                     raise ValueError("Missing or invalid 'prov' structure in table.")
+                 page_no = str(prov[0].get('page_no'))
+                 if not page_no:
+                     raise ValueError("Missing or invalid 'page_no' in 'prov'.")
+                 # Ensure 'data' and 'grid' exist
+                 data = table.get('data', {})
+                 grid = data.get('grid', [])
+                 if not isinstance(grid, list):
+                     raise ValueError("Missing or invalid 'grid' structure in 'data'.")
+                 # Add the table grid to the page's table list
+                 page_tables[f'{page_no}'].append(grid)
+
+             except Exception as table_error:
+                 print(f"Error processing table: {table_error}")
+
+     except Exception as e:
+         print(f"Error processing tables: {e}")
+
+     return page_tables
+
+ def table_to_text_or_json(table, rtrn_type="text"):
+     """
+     Converts a table grid to a single string (JSON output is not implemented yet).
+
+     Args:
+         table (list): The table grid (rows of cells) to convert.
+         rtrn_type (str): The return type; only "text" is currently supported.
+
+     Returns:
+         str: The table converted to the specified format.
+     """
+     table_text = "Here is a Table : \n"
+     for row in table:
+         for col in row:
+             val = col.get('text')
+             table_text += f'{val} ,'
+         table_text += '\n'
+     return table_text
+
+ def clean_file_name(text: str):
+     """
+     Cleans the file name by removing any special characters.
+
+     Args:
+         text (str): The original file name.
+
+     Returns:
+         str: The cleaned file name.
+     """
+     try:
+         text = re.sub('[^a-zA-Z0-9 \n\.]', ' ', text)
+         return text
+     except Exception as e:
+         logger.error(f"Error cleaning file name: {e}")
+         return text
+
+ def find_and_remove_header_footer(
+     text: str, n_chars: int, n_first_pages_to_ignore: int, n_last_pages_to_ignore: int
+ ) -> str:
+     """
+     Heuristic to find footers and headers across different pages by searching for the longest common string.
+     For headers we only search in the first n_chars characters (for footers: the last n_chars).
+     Note: This heuristic uses exact matches and therefore works well for footers like "Copyright 2019 by XXX",
+     but won't detect "Page 3 of 4" or similar.
+
+     :param n_chars: number of first/last characters where the header/footer shall be searched in
+     :param n_first_pages_to_ignore: number of first pages to ignore (e.g. TOCs often don't contain footer/header)
+     :param n_last_pages_to_ignore: number of last pages to ignore
+     :return: str, the text with any detected header and footer removed
+     """
+
+     pages = text.split("\f")
+     last_index = len(pages) - n_last_pages_to_ignore  # avoids the empty slice a literal -0 index would produce
+
+     # header
+     start_of_pages = [p[:n_chars] for p in pages[n_first_pages_to_ignore:last_index]]
+     found_header = find_longest_common_ngram(start_of_pages)
+     if found_header:
+         pages = [page.replace(found_header, "") for page in pages]
+
+     # footer
+     end_of_pages = [p[-n_chars:] for p in pages[n_first_pages_to_ignore:last_index]]
+     found_footer = find_longest_common_ngram(end_of_pages)
+     if found_footer:
+         pages = [page.replace(found_footer, "") for page in pages]
+     logger.debug(f"Removed header '{found_header}' and footer '{found_footer}' in document")
+     text = "\f".join(pages)
+     return text
+
+ def ngram(seq: str, n: int) -> Generator[str, None, None]:
+     """
+     Return ngrams (of tokens - currently split by whitespace)
+     :param seq: str, string from which the ngrams shall be created
+     :param n: int, n of ngram
+     :return: generator of ngrams as strings
+     """
+
+     # In order to maintain the original whitespace, but still consider \n and \t for n-gram tokenization,
+     # we add a space here and remove it after creation of the ngrams again (see below)
+     seq = seq.replace("\n", " \n")
+     seq = seq.replace("\t", " \t")
+
+     words = seq.split(" ")
+     ngrams = (
+         " ".join(words[i : i + n]).replace(" \n", "\n").replace(" \t", "\t") for i in range(0, len(words) - n + 1)
+     )
+
+     return ngrams
+
+ def allngram(seq: str, min_ngram: int, max_ngram: int) -> Set[str]:
+     lengths = range(min_ngram, max_ngram) if max_ngram else range(min_ngram, len(seq))
+     ngrams = map(partial(ngram, seq), lengths)
+     res = set(chain.from_iterable(ngrams))
+     return res
+
+ def find_longest_common_ngram(
+     sequences: List[str], max_ngram: int = 30, min_ngram: int = 3
+ ) -> Optional[str]:
+     """
+     Find the longest common ngram across different text sequences (e.g. start of pages).
+     Considering all ngrams between the specified range. Helpful for finding footers, headers etc.
+
+     :param sequences: list[str], list of strings that shall be searched for common n_grams
+     :param max_ngram: int, maximum length of ngram to consider
+     :param min_ngram: minimum length of ngram to consider
+     :return: str, common string of all sections
+     """
+     sequences = [s for s in sequences if s]  # filter empty sequences
+     if not sequences:
+         return None
+     seqs_ngrams = map(partial(allngram, min_ngram=min_ngram, max_ngram=max_ngram), sequences)
+     intersection = reduce(set.intersection, seqs_ngrams)
+
+     try:
+         longest = max(intersection, key=len)
+     except ValueError:
+         # no common sequence found
+         longest = ""
+     return longest if longest.strip() else None
+
+
+ class PdfToSectionConverter():
+     def __init__(self):
+         """
+         Initializes the PdfToSectionConverter class.
+         """
+         pass
+
+     def convert(self, downloaded_pdf_path: str, file_title: str, doc_id: str = None, start_page_no: int = 0,
+                 end_page_no: int = 0):
+         """
+         Converts a PDF document to sections with metadata.
+
+         Args:
+             downloaded_pdf_path (str): Path to the downloaded PDF file.
+             file_title (str): The title of the file.
+             doc_id (str, optional): The document ID. Defaults to None.
+             start_page_no (int, optional): The starting page number. Defaults to 0.
+             end_page_no (int, optional): The ending page number. Defaults to 0.
+
+         Returns:
+             list: A list of dictionaries containing sections and metadata.
+         """
+         try:
+             print(f"Splitting pdf from page {start_page_no + 1} to {end_page_no + 1}")
+             output_path = "/tmp/splitted.pdf"
+             split_pdf(downloaded_pdf_path, output_path, start_page_no, end_page_no)
+             print("OCR Started ....")
+             result = converter.convert(output_path)
+             json_objects = result.document.export_to_dict()
+             pages = list(json_objects['pages'].keys())
+             texts = get_texts(json_objects)
+             tables = get_tables(json_objects)
+         except Exception as e:
+             logger.error(f"Error getting JSON result from parser: {e}")
+             return []
+
+         output_doc_lst = []
+         page_no = start_page_no
+         try:
+             for page in pages:
+                 if page_no > end_page_no:
+                     break
+                 page_no += 1
+                 print(f"Page Number to be processed: {page_no}")
+                 meta = {"doc_id": doc_id, "page_no": page_no, "img_count": 0, "img_lst": []}
+                 meta_table = {"doc_id": doc_id, "page_no": page_no, "img_count": 0, "img_lst": "[]"}
+
+                 # Extract text from the page
+                 text_to_append = texts[page]
+                 text_to_append = clean_the_text(text_to_append)
+
+                 # Detect and extract tables
+                 tables_to_append = tables[page]
+                 if tables_to_append:
+                     tables_to_append = [table_to_text_or_json(table=i, rtrn_type="text") for i in tables_to_append]
+
+                 # Add the processed section to the output list
+                 output_doc_lst.append(
+                     {"doc_id": doc_id, "text": text_to_append, "vector_id": str(uuid4()),
+                      "meta": meta, "content_type": 'text'})
+                 for table in tables_to_append:
+                     output_doc_lst.append(
+                         {"doc_id": doc_id, "text": table, "vector_id": str(uuid4()),
+                          "meta": meta_table, "content_type": 'table'})
+
+             # Post-process text to remove headers and footers
+             text_to_append_list = "\f".join([i['text'] for i in output_doc_lst])
+             text_to_append_list = find_and_remove_header_footer(text=text_to_append_list, n_chars=10,
+                                                                 n_first_pages_to_ignore=0,
+                                                                 n_last_pages_to_ignore=0).split("\f")
+
+             for i in range(len(output_doc_lst)):
+                 output_doc_lst[i]['text'] = clean_file_name(file_title) + "\n" + text_to_append_list[i]
+
+         except Exception as e:
+             logger.error(f"Error converting PDF to sections: {e}")
+
+         return output_doc_lst
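
As a quick illustration of the header/footer heuristic in helper.py, the sketch below exercises find_longest_common_ngram and find_and_remove_header_footer on a few invented page snippets. It assumes the project's dependencies (including docling) are installed so that helper imports cleanly; the page strings are made up purely for the example.

    from helper import find_longest_common_ngram, find_and_remove_header_footer

    # Invented page snippets that share a repeated header line.
    pages = [
        "Annual Report 2023 Introduction and scope of the study",
        "Annual Report 2023 Methodology used for data collection",
        "Annual Report 2023 Results, discussion and conclusions",
    ]

    # The longest n-gram (3 to 30 tokens) common to every snippet is taken as the header.
    print(find_longest_common_ngram(pages))  # expected: "Annual Report 2023"

    # find_and_remove_header_footer applies the same idea to the first/last n_chars of each
    # form-feed-separated page and strips the detected string from the whole document.
    cleaned = find_and_remove_header_footer("\f".join(pages), n_chars=25,
                                            n_first_pages_to_ignore=0, n_last_pages_to_ignore=0)
    print(cleaned.split("\f")[0])  # header removed from each page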
requirements.txt ADDED
@@ -0,0 +1,6 @@
+ fastapi==0.115.6
+ uvicorn==0.34.0
+ boto3==1.36.13
+ pydantic==2.10.6
+ PyPDF2==3.0.1
+ docling==2.15.1