VyLala committed
Commit 458ebc8 · verified · 1 Parent(s): 4cc7eed

Update data_preprocess.py

Files changed (1)
  1. data_preprocess.py +660 -624
data_preprocess.py CHANGED
@@ -1,625 +1,661 @@
1
- import re
2
- import os
3
- import streamlit as st
4
- import subprocess
5
- import re
6
- from Bio import Entrez
7
- from docx import Document
8
- import fitz
9
- import spacy
10
- from spacy.cli import download
11
- from NER.PDF import pdf
12
- from NER.WordDoc import wordDoc
13
- from NER.html import extractHTML
14
- from NER.word2Vec import word2vec
15
- from transformers import pipeline
16
- import urllib.parse, requests
17
- from pathlib import Path
18
- import pandas as pd
19
- from iterate3 import model
20
- import nltk
21
- nltk.download('punkt_tab')
22
- def download_excel_file(url, save_path="temp.xlsx"):
23
- if "view.officeapps.live.com" in url:
24
- parsed_url = urllib.parse.parse_qs(urllib.parse.urlparse(url).query)
25
- real_url = urllib.parse.unquote(parsed_url["src"][0])
26
- response = requests.get(real_url)
27
- with open(save_path, "wb") as f:
28
- f.write(response.content)
29
- return save_path
30
- elif url.startswith("http") and (url.endswith(".xls") or url.endswith(".xlsx")):
31
- response = requests.get(url)
32
- response.raise_for_status() # Raises error if download fails
33
- with open(save_path, "wb") as f:
34
- f.write(response.content)
35
- print(len(response.content))
36
- return save_path
37
- else:
38
- print("URL must point directly to an .xls or .xlsx file\n or it already downloaded.")
39
- return url
40
- def extract_text(link,saveFolder):
41
- text = ""
42
- name = link.split("/")[-1]
43
- file_path = Path(saveFolder) / name
44
- # pdf
45
- if link.endswith(".pdf"):
46
- if file_path.is_file():
47
- link = saveFolder + "/" + name
48
- print("File exists.")
49
- p = pdf.PDF(link,saveFolder)
50
- text = p.extractTextWithPDFReader()
51
- #text_exclude_table = p.extract_text_excluding_tables()
52
- # worddoc
53
- elif link.endswith(".doc") or link.endswith(".docx"):
54
- d = wordDoc.wordDoc(link,saveFolder)
55
- text = d.extractTextByPage()
56
- # html
57
- if link.split(".")[-1].lower() not in "xlsx":
58
- if "http" in link or "html" in link:
59
- html = extractHTML.HTML("",link)
60
- text = html.getListSection() # the text is already cleaned
61
- return text
62
- def extract_table(link,saveFolder):
63
- table = []
64
- name = link.split("/")[-1]
65
- file_path = Path(saveFolder) / name
66
- # pdf
67
- if link.endswith(".pdf"):
68
- if file_path.is_file():
69
- link = saveFolder + "/" + name
70
- print("File exists.")
71
- p = pdf.PDF(link,saveFolder)
72
- table = p.extractTable()
73
- # worddoc
74
- elif link.endswith(".doc") or link.endswith(".docx"):
75
- d = wordDoc.wordDoc(link,saveFolder)
76
- table = d.extractTableAsList()
77
- # excel
78
- elif link.split(".")[-1].lower() in "xlsx":
79
- # download the Excel file if it has not been downloaded yet
80
- savePath = saveFolder +"/"+ link.split("/")[-1]
81
- excelPath = download_excel_file(link, savePath)
82
- try:
83
- xls = pd.ExcelFile(excelPath)
84
- table_list = []
85
- for sheet_name in xls.sheet_names:
86
- df = pd.read_excel(xls, sheet_name=sheet_name)
87
- cleaned_table = df.fillna("").astype(str).values.tolist()
88
- table_list.append(cleaned_table)
89
- table = table_list
90
- except Exception as e:
91
- print("❌ Failed to extract tables from Excel:", e)
92
- # html
93
- elif "http" in link or "html" in link:
94
- html = extractHTML.HTML("",link)
95
- table = html.extractTable() # table is a list
96
- table = clean_tables_format(table)
97
- return table
98
-
99
- def clean_tables_format(tables):
100
- """
101
- Ensures all tables are in consistent format: List[List[List[str]]]
102
- Cleans by:
103
- - Removing empty strings and rows
104
- - Converting all cells to strings
105
- - Handling DataFrames and list-of-lists
106
- """
107
- cleaned = []
108
- if tables:
109
- for table in tables:
110
- standardized = []
111
-
112
- # Case 1: Pandas DataFrame
113
- if isinstance(table, pd.DataFrame):
114
- table = table.fillna("").astype(str).values.tolist()
115
-
116
- # Case 2: List of Lists
117
- if isinstance(table, list) and all(isinstance(row, list) for row in table):
118
- for row in table:
119
- filtered_row = [str(cell).strip() for cell in row if str(cell).strip()]
120
- if filtered_row:
121
- standardized.append(filtered_row)
122
-
123
- if standardized:
124
- cleaned.append(standardized)
125
-
126
- return cleaned
127
-
128
- import json
129
- import tiktoken # Optional: for OpenAI token counting
130
- def normalize_text_for_comparison(s: str) -> str:
131
- """
132
- Normalizes text for robust comparison by:
133
- 1. Converting to lowercase.
134
- 2. Replacing all types of newlines with a single consistent newline (\n).
135
- 3. Removing extra spaces (e.g., multiple spaces, leading/trailing spaces on lines).
136
- 4. Stripping leading/trailing whitespace from the entire string.
137
- """
138
- s = s.lower()
139
- s = s.replace('\r\n', '\n') # Handle Windows newlines
140
- s = s.replace('\r', '\n') # Handle Mac classic newlines
141
-
142
- # Replace sequences of whitespace (including multiple newlines) with a single space
143
- # This might be too aggressive if you need to preserve paragraph breaks,
144
- # but good for exact word-sequence matching.
145
- s = re.sub(r'\s+', ' ', s)
146
-
147
- return s.strip()
148
- def merge_text_and_tables(text, tables, max_tokens=12000, keep_tables=True, tokenizer="cl100k_base", accession_id=None, isolate=None):
149
- """
150
- Merge cleaned text and table into one string for LLM input.
151
- - Avoids duplicating tables already in text
152
- - Extracts only relevant rows from large tables
153
- - Skips or saves oversized tables
154
- """
155
- import importlib
156
- json = importlib.import_module("json")
157
-
158
- def estimate_tokens(text_str):
159
- try:
160
- enc = tiktoken.get_encoding(tokenizer)
161
- return len(enc.encode(text_str))
162
- except:
163
- return len(text_str) // 4 # Fallback estimate
164
-
165
- def is_table_relevant(table, keywords, accession_id=None):
166
- flat = " ".join(" ".join(row).lower() for row in table)
167
- if accession_id and accession_id.lower() in flat:
168
- return True
169
- return any(kw.lower() in flat for kw in keywords)
170
- preview, preview1 = "",""
171
- llm_input = "## Document Text\n" + text.strip() + "\n"
172
- clean_text = normalize_text_for_comparison(text)
173
-
174
- if tables:
175
- for idx, table in enumerate(tables):
176
- keywords = ["province","district","region","village","location", "country", "region", "origin", "ancient", "modern"]
177
- if accession_id: keywords += [accession_id.lower()]
178
- if isolate: keywords += [isolate.lower()]
179
- if is_table_relevant(table, keywords, accession_id):
180
- if len(table) > 0:
181
- for tab in table:
182
- preview = " ".join(tab) if tab else ""
183
- preview1 = "\n".join(tab) if tab else ""
184
- clean_preview = normalize_text_for_comparison(preview)
185
- clean_preview1 = normalize_text_for_comparison(preview1)
186
- if clean_preview not in clean_text:
187
- if clean_preview1 not in clean_text:
188
- table_str = json.dumps([tab], indent=2)
189
- llm_input += f"## Table {idx+1}\n{table_str}\n"
190
- return llm_input.strip()
191
-
192
- def preprocess_document(link, saveFolder, accession=None, isolate=None):
193
- try:
194
- text = extract_text(link, saveFolder)
195
- except: text = ""
196
- try:
197
- tables = extract_table(link, saveFolder)
198
- except: tables = []
199
- if accession: accession = accession
200
- if isolate: isolate = isolate
201
- try:
202
- final_input = merge_text_and_tables(text, tables, max_tokens=12000, accession_id=accession, isolate=isolate)
203
- except: final_input = ""
204
- return text, tables, final_input
205
-
206
- def extract_sentences(text):
207
- sentences = re.split(r'(?<=[.!?])\s+', text)
208
- return [s.strip() for s in sentences if s.strip()]
209
-
210
- def is_irrelevant_number_sequence(text):
211
- if re.search(r'\b[A-Z]{2,}\d+\b|\b[A-Za-z]+\s+\d+\b', text, re.IGNORECASE):
212
- return False
213
- word_count = len(re.findall(r'\b[A-Za-z]{2,}\b', text))
214
- number_count = len(re.findall(r'\b\d[\d\.]*\b', text))
215
- total_tokens = len(re.findall(r'\S+', text))
216
- if total_tokens > 0 and (word_count / total_tokens < 0.2) and (number_count / total_tokens > 0.5):
217
- return True
218
- elif re.fullmatch(r'(\d+(\.\d+)?\s*)+', text.strip()):
219
- return True
220
- return False
221
-
222
- def remove_isolated_single_digits(sentence):
223
- tokens = sentence.split()
224
- filtered_tokens = []
225
- for token in tokens:
226
- if token == '0' or token == '1':
227
- pass
228
- else:
229
- filtered_tokens.append(token)
230
- return ' '.join(filtered_tokens).strip()
231
-
232
- def get_contextual_sentences_BFS(text_content, keyword, depth=2):
233
- def extract_codes(sentence):
234
- # Match codes like 'A1YU101', 'KM1', 'MO6' — at least 2 letters + numbers
235
- return [code for code in re.findall(r'\b[A-Z]{2,}[0-9]+\b', sentence, re.IGNORECASE)]
236
- sentences = extract_sentences(text_content)
237
- relevant_sentences = set()
238
- initial_keywords = set()
239
-
240
- # Define a regex to capture codes like A1YU101 or KM1
241
- # This pattern looks for an alphanumeric sequence followed by digits at the end of the string
242
- code_pattern = re.compile(r'([A-Z0-9]+?)(\d+)$', re.IGNORECASE)
243
-
244
- # Attempt to parse the keyword into its prefix and numerical part using re.search
245
- keyword_match = code_pattern.search(keyword)
246
-
247
- keyword_prefix = None
248
- keyword_num = None
249
-
250
- if keyword_match:
251
- keyword_prefix = keyword_match.group(1).lower()
252
- keyword_num = int(keyword_match.group(2))
253
-
254
- for sentence in sentences:
255
- sentence_added = False
256
-
257
- # 1. Check for exact match of the keyword
258
- if re.search(r'\b' + re.escape(keyword) + r'\b', sentence, re.IGNORECASE):
259
- relevant_sentences.add(sentence.strip())
260
- initial_keywords.add(keyword.lower())
261
- sentence_added = True
262
-
263
- # 2. Check for range patterns (e.g., A1YU101-A1YU137)
264
- # The range pattern should be broad enough to capture the full code string within the range.
265
- range_matches = re.finditer(r'([A-Z0-9]+-\d+)', sentence, re.IGNORECASE) # More specific range pattern if needed, or rely on full code pattern below
266
- range_matches = re.finditer(r'([A-Z0-9]+\d+)-([A-Z0-9]+\d+)', sentence, re.IGNORECASE) # This is the more robust range pattern
267
-
268
- for r_match in range_matches:
269
- start_code_str = r_match.group(1)
270
- end_code_str = r_match.group(2)
271
-
272
- # CRITICAL FIX: Use code_pattern.search for start_match and end_match
273
- start_match = code_pattern.search(start_code_str)
274
- end_match = code_pattern.search(end_code_str)
275
-
276
- if keyword_prefix and keyword_num is not None and start_match and end_match:
277
- start_prefix = start_match.group(1).lower()
278
- end_prefix = end_match.group(1).lower()
279
- start_num = int(start_match.group(2))
280
- end_num = int(end_match.group(2))
281
-
282
- # Check if the keyword's prefix matches and its number is within the range
283
- if keyword_prefix == start_prefix and \
284
- keyword_prefix == end_prefix and \
285
- start_num <= keyword_num <= end_num:
286
- relevant_sentences.add(sentence.strip())
287
- initial_keywords.add(start_code_str.lower())
288
- initial_keywords.add(end_code_str.lower())
289
- sentence_added = True
290
- break # Only need to find one matching range per sentence
291
-
292
- # 3. If the sentence was added due to exact match or range, add all its alphanumeric codes
293
- # to initial_keywords to ensure graph traversal from related terms.
294
- if sentence_added:
295
- for word in extract_codes(sentence):
296
- initial_keywords.add(word.lower())
297
-
298
-
299
- # Build word_to_sentences mapping for all sentences
300
- word_to_sentences = {}
301
- for sent in sentences:
302
- codes_in_sent = set(extract_codes(sent))
303
- for code in codes_in_sent:
304
- word_to_sentences.setdefault(code.lower(), set()).add(sent.strip())
305
-
306
-
307
- # Build the graph
308
- graph = {}
309
- for sent in sentences:
310
- codes = set(extract_codes(sent))
311
- for word1 in codes:
312
- word1_lower = word1.lower()
313
- graph.setdefault(word1_lower, set())
314
- for word2 in codes:
315
- word2_lower = word2.lower()
316
- if word1_lower != word2_lower:
317
- graph[word1_lower].add(word2_lower)
318
-
319
-
320
- # Perform BFS/graph traversal
321
- queue = [(k, 0) for k in initial_keywords if k in word_to_sentences]
322
- visited_words = set(initial_keywords)
323
-
324
- while queue:
325
- current_word, level = queue.pop(0)
326
- if level >= depth:
327
- continue
328
-
329
- relevant_sentences.update(word_to_sentences.get(current_word, []))
330
-
331
- for neighbor in graph.get(current_word, []):
332
- if neighbor not in visited_words:
333
- visited_words.add(neighbor)
334
- queue.append((neighbor, level + 1))
335
-
336
- final_sentences = set()
337
- for sentence in relevant_sentences:
338
- if not is_irrelevant_number_sequence(sentence):
339
- processed_sentence = remove_isolated_single_digits(sentence)
340
- if processed_sentence:
341
- final_sentences.add(processed_sentence)
342
-
343
- return "\n".join(sorted(list(final_sentences)))
344
-
345
-
346
-
347
- def get_contextual_sentences_DFS(text_content, keyword, depth=2):
348
- sentences = extract_sentences(text_content)
349
-
350
- # Build word-to-sentences mapping
351
- word_to_sentences = {}
352
- for sent in sentences:
353
- words_in_sent = set(re.findall(r'\b[A-Za-z0-9\-_\/]+\b', sent))
354
- for word in words_in_sent:
355
- word_to_sentences.setdefault(word.lower(), set()).add(sent.strip())
356
-
357
- # Function to extract codes in a sentence
358
- def extract_codes(sentence):
359
- # Only codes like 'KSK1', 'MG272794', not pure numbers
360
- return [code for code in re.findall(r'\b[A-Z]{2,}[0-9]+\b', sentence, re.IGNORECASE)]
361
-
362
- # DFS with priority based on distance to keyword and early stop if country found
363
- def dfs_traverse(current_word, current_depth, max_depth, visited_words, collected_sentences, parent_sentence=None):
364
- country = "unknown"
365
- if current_depth > max_depth:
366
- return country, False
367
-
368
- if current_word not in word_to_sentences:
369
- return country, False
370
-
371
- for sentence in word_to_sentences[current_word]:
372
- if sentence == parent_sentence:
373
- continue # avoid reusing the same sentence
374
-
375
- collected_sentences.add(sentence)
376
-
377
- #print("current_word:", current_word)
378
- small_sen = extract_context(sentence, current_word, int(len(sentence) / 4))
379
- #print(small_sen)
380
- country = model.get_country_from_text(small_sen)
381
- #print("small context country:", country)
382
- if country.lower() != "unknown":
383
- return country, True
384
- else:
385
- country = model.get_country_from_text(sentence)
386
- #print("full sentence country:", country)
387
- if country.lower() != "unknown":
388
- return country, True
389
-
390
- codes_in_sentence = extract_codes(sentence)
391
- idx = next((i for i, code in enumerate(codes_in_sentence) if code.lower() == current_word.lower()), None)
392
- if idx is None:
393
- continue
394
-
395
- sorted_children = sorted(
396
- [code for code in codes_in_sentence if code.lower() not in visited_words],
397
- key=lambda x: (abs(codes_in_sentence.index(x) - idx),
398
- 0 if codes_in_sentence.index(x) > idx else 1)
399
- )
400
-
401
- #print("sorted_children:", sorted_children)
402
- for child in sorted_children:
403
- child_lower = child.lower()
404
- if child_lower not in visited_words:
405
- visited_words.add(child_lower)
406
- country, should_stop = dfs_traverse(
407
- child_lower, current_depth + 1, max_depth,
408
- visited_words, collected_sentences, parent_sentence=sentence
409
- )
410
- if should_stop:
411
- return country, True
412
-
413
- return country, False
414
-
415
- # Begin DFS
416
- collected_sentences = set()
417
- visited_words = set([keyword.lower()])
418
- country, status = dfs_traverse(keyword.lower(), 0, depth, visited_words, collected_sentences)
419
-
420
- # Filter irrelevant sentences
421
- final_sentences = set()
422
- for sentence in collected_sentences:
423
- if not is_irrelevant_number_sequence(sentence):
424
- processed = remove_isolated_single_digits(sentence)
425
- if processed:
426
- final_sentences.add(processed)
427
- if not final_sentences:
428
- return country, text_content
429
- return country, "\n".join(sorted(list(final_sentences)))
430
-
431
- # Helper function for normalizing text for overlap comparison
432
- def normalize_for_overlap(s: str) -> str:
433
- s = re.sub(r'[^a-zA-Z0-9\s]', ' ', s).lower()
434
- s = re.sub(r'\s+', ' ', s).strip()
435
- return s
436
-
437
- def merge_texts_skipping_overlap(text1: str, text2: str) -> str:
438
- if not text1: return text2
439
- if not text2: return text1
440
-
441
- # Case 1: text2 is fully contained in text1 or vice-versa
442
- if text2 in text1:
443
- return text1
444
- if text1 in text2:
445
- return text2
446
-
447
- # --- Option 1: Original behavior (suffix of text1, prefix of text2) ---
448
- # This is what your function was primarily designed for.
449
- # It looks for the overlap at the "junction" of text1 and text2.
450
-
451
- max_junction_overlap = 0
452
- for i in range(min(len(text1), len(text2)), 0, -1):
453
- suffix1 = text1[-i:]
454
- prefix2 = text2[:i]
455
- # Prioritize exact match, then normalized match
456
- if suffix1 == prefix2:
457
- max_junction_overlap = i
458
- break
459
- elif normalize_for_overlap(suffix1) == normalize_for_overlap(prefix2):
460
- max_junction_overlap = i
461
- break # Take the first (longest) normalized match
462
-
463
- if max_junction_overlap > 0:
464
- merged_text = text1 + text2[max_junction_overlap:]
465
- return re.sub(r'\s+', ' ', merged_text).strip()
466
-
467
- # --- Option 2: Longest Common Prefix (for cases like "Hi, I am Vy.") ---
468
- # This addresses your specific test case where the overlap is at the very beginning of both strings.
469
- # This is often used when trying to deduplicate content that shares a common start.
470
-
471
- longest_common_prefix_len = 0
472
- min_len = min(len(text1), len(text2))
473
- for i in range(min_len):
474
- if text1[i] == text2[i]:
475
- longest_common_prefix_len = i + 1
476
- else:
477
- break
478
-
479
- # If a common prefix is found AND it's a significant portion (e.g., more than a few chars)
480
- # AND the remaining parts are distinct, then apply this merge.
481
- # This is a heuristic and might need fine-tuning.
482
- if longest_common_prefix_len > 0 and \
483
- text1[longest_common_prefix_len:].strip() and \
484
- text2[longest_common_prefix_len:].strip():
485
-
486
- # Only merge this way if the remaining parts are not empty (i.e., not exact duplicates)
487
- # For "Hi, I am Vy. Nice to meet you." and "Hi, I am Vy. Goodbye Vy."
488
- # common prefix is "Hi, I am Vy."
489
- # Remaining text1: " Nice to meet you."
490
- # Remaining text2: " Goodbye Vy."
491
- # So we merge common_prefix + remaining_text1 + remaining_text2
492
-
493
- common_prefix_str = text1[:longest_common_prefix_len]
494
- remainder_text1 = text1[longest_common_prefix_len:]
495
- remainder_text2 = text2[longest_common_prefix_len:]
496
-
497
- merged_text = common_prefix_str + remainder_text1 + remainder_text2
498
- return re.sub(r'\s+', ' ', merged_text).strip()
499
-
500
-
501
- # If neither specific overlap type is found, just concatenate
502
- merged_text = text1 + text2
503
- return re.sub(r'\s+', ' ', merged_text).strip()
504
-
505
- def save_text_to_docx(text_content: str, file_path: str):
506
- """
507
- Saves a given text string into a .docx file.
508
-
509
- Args:
510
- text_content (str): The text string to save.
511
- file_path (str): The full path including the filename where the .docx file will be saved.
512
- Example: '/content/drive/MyDrive/CollectData/Examples/test/SEA_1234/merged_document.docx'
513
- """
514
- try:
515
- document = Document()
516
-
517
- # Add the entire text as a single paragraph, or split by newlines for multiple paragraphs
518
- for paragraph_text in text_content.split('\n'):
519
- document.add_paragraph(paragraph_text)
520
-
521
- document.save(file_path)
522
- print(f"Text successfully saved to '{file_path}'")
523
- except Exception as e:
524
- print(f"Error saving text to docx file: {e}")
525
-
526
- '''2 scenarios:
527
- - quick look, keyword found, deep dive and directly get the location, then stop
528
- - quick look, keyword found, deep dive but no location found; hold the related words, then
529
- look through the other files iteratively for each related word until a location is found, then stop'''
530
- def extract_context(text, keyword, window=500):
531
- # firstly try accession number
532
- code_pattern = re.compile(r'([A-Z0-9]+?)(\d+)$', re.IGNORECASE)
533
-
534
- # Attempt to parse the keyword into its prefix and numerical part using re.search
535
- keyword_match = code_pattern.search(keyword)
536
-
537
- keyword_prefix = None
538
- keyword_num = None
539
-
540
- if keyword_match:
541
- keyword_prefix = keyword_match.group(1).lower()
542
- keyword_num = int(keyword_match.group(2))
543
- text = text.lower()
544
- idx = text.find(keyword.lower())
545
- if idx == -1:
546
- if keyword_prefix:
547
- idx = text.find(keyword_prefix)
548
- if idx == -1:
549
- return "Sample ID not found."
550
- return text[max(0, idx-window): idx+window]
551
- return text[max(0, idx-window): idx+window]
552
- def process_inputToken(filePaths, saveLinkFolder,accession=None, isolate=None):
553
- cache = {}
554
- country = "unknown"
555
- output = ""
556
- tem_output, small_output = "",""
557
- keyword_appear = (False,"")
558
- keywords = []
559
- if isolate: keywords.append(isolate)
560
- if accession: keywords.append(accession)
561
- for f in filePaths:
562
- # scenario 1: direct location: truncate the context and then use the QA model?
563
- if keywords:
564
- for keyword in keywords:
565
- text, tables, final_input = preprocess_document(f,saveLinkFolder, isolate=keyword)
566
- if keyword in final_input:
567
- context = extract_context(final_input, keyword)
568
- # quick look if country already in context and if yes then return
569
- country = model.get_country_from_text(context)
570
- if country != "unknown":
571
- return country, context, final_input
572
- else:
573
- country = model.get_country_from_text(final_input)
574
- if country != "unknown":
575
- return country, context, final_input
576
- else: # might be cross-ref
577
- keyword_appear = (True, f)
578
- cache[f] = context
579
- small_output = merge_texts_skipping_overlap(output, context) + "\n"
580
- chunkBFS = get_contextual_sentences_BFS(small_output, keyword)
581
- countryBFS = model.get_country_from_text(chunkBFS)
582
- countryDFS, chunkDFS = get_contextual_sentences_DFS(output, keyword)
583
- output = merge_texts_skipping_overlap(output, final_input)
584
- if countryDFS != "unknown" and countryBFS != "unknown":
585
- if len(chunkDFS) <= len(chunkBFS):
586
- return countryDFS, chunkDFS, output
587
- else:
588
- return countryBFS, chunkBFS, output
589
- else:
590
- if countryDFS != "unknown":
591
- return countryDFS, chunkDFS, output
592
- if countryBFS != "unknown":
593
- return countryBFS, chunkBFS, output
594
- else:
595
- # scenario 2:
596
- '''cross-ref: ex: A1YU101 keyword in file 2 which includes KM1 but KM1 in file 1
597
- but if we look at file 1 first then maybe we can have lookup dict which country
598
- such as Thailand as the key and its re'''
599
- cache[f] = final_input
600
- if keyword_appear[0] == True:
601
- for c in cache:
602
- if c!=keyword_appear[1]:
603
- if cache[c].lower() not in output.lower():
604
- output = merge_texts_skipping_overlap(output, cache[c]) + "\n"
605
- chunkBFS = get_contextual_sentences_BFS(output, keyword)
606
- countryBFS = model.get_country_from_text(chunkBFS)
607
- countryDFS, chunkDFS = get_contextual_sentences_DFS(output, keyword)
608
- if countryDFS != "unknown" and countryBFS != "unknown":
609
- if len(chunkDFS) <= len(chunkBFS):
610
- return countryDFS, chunkDFS, output
611
- else:
612
- return countryBFS, chunkBFS, output
613
- else:
614
- if countryDFS != "unknown":
615
- return countryDFS, chunkDFS, output
616
- if countryBFS != "unknown":
617
- return countryBFS, chunkBFS, output
618
- else:
619
- if cache[f].lower() not in output.lower():
620
- output = merge_texts_skipping_overlap(output, cache[f]) + "\n"
621
- if len(output) == 0 or keyword_appear[0]==False:
622
- for c in cache:
623
- if cache[c].lower() not in output.lower():
624
- output = merge_texts_skipping_overlap(output, cache[c]) + "\n"
625
  return country, "", output
 
1
+ import re
2
+ import os
3
+ #import streamlit as st
4
+ import subprocess
5
+ import re
6
+ from Bio import Entrez
7
+ from docx import Document
8
+ import fitz
9
+ import spacy
10
+ from spacy.cli import download
11
+ from NER.PDF import pdf
12
+ from NER.WordDoc import wordDoc
13
+ from NER.html import extractHTML
14
+ from NER.word2Vec import word2vec
15
+ from transformers import pipeline
16
+ import urllib.parse, requests
17
+ from pathlib import Path
18
+ import pandas as pd
19
+ import model
20
+ import pipeline
21
+ import tempfile
22
+ import nltk
23
+ nltk.download('punkt_tab')
24
+ def download_excel_file(url, save_path="temp.xlsx"):
25
+ if "view.officeapps.live.com" in url:
26
+ parsed_url = urllib.parse.parse_qs(urllib.parse.urlparse(url).query)
27
+ real_url = urllib.parse.unquote(parsed_url["src"][0])
28
+ response = requests.get(real_url)
29
+ with open(save_path, "wb") as f:
30
+ f.write(response.content)
31
+ return save_path
32
+ elif url.startswith("http") and (url.endswith(".xls") or url.endswith(".xlsx")):
33
+ response = requests.get(url)
34
+ response.raise_for_status() # Raises error if download fails
35
+ with open(save_path, "wb") as f:
36
+ f.write(response.content)
37
+ print(len(response.content))
38
+ return save_path
39
+ else:
40
+ print("URL must point directly to an .xls or .xlsx file\n or it already downloaded.")
41
+ return url
42
+ def extract_text(link,saveFolder):
43
+ text = ""
44
+ name = link.split("/")[-1]
45
+ #file_path = Path(saveFolder) / name
46
+ local_temp_path = os.path.join(tempfile.gettempdir(), name)
47
+ pipeline.download_file_from_drive(name, saveFolder, local_temp_path)
48
+
49
+ # pdf
50
+ if link.endswith(".pdf"):
51
+ # if file_path.is_file():
52
+ # link = saveFolder + "/" + name
53
+ # print("File exists.")
54
+ p = pdf.PDF(local_temp_path, saveFolder)
55
+ #p = pdf.PDF(link,saveFolder)
56
+ text = p.extractTextWithPDFReader()
57
+ #text_exclude_table = p.extract_text_excluding_tables()
58
+ # worddoc
59
+ elif link.endswith(".doc") or link.endswith(".docx"):
60
+ d = wordDoc.wordDoc(local_temp_path,saveFolder)
61
+ text = d.extractTextByPage()
62
+ # html
63
+ if link.split(".")[-1].lower() not in "xlsx":
64
+ if "http" in link or "html" in link:
65
+ html = extractHTML.HTML("",link)
66
+ text = html.getListSection() # the text is already cleaned
67
+ return text
68
+ def extract_table(link,saveFolder):
69
+ table = []
70
+ name = link.split("/")[-1]
71
+ #file_path = Path(saveFolder) / name
72
+ local_temp_path = os.path.join(tempfile.gettempdir(), name)
73
+ pipeline.download_file_from_drive(name, saveFolder, local_temp_path)
74
+ # pdf
75
+ if link.endswith(".pdf"):
76
+ # if file_path.is_file():
77
+ # link = saveFolder + "/" + name
78
+ # print("File exists.")
79
+ p = pdf.PDF(local_temp_path,saveFolder)
80
+ table = p.extractTable()
81
+ # worddoc
82
+ elif link.endswith(".doc") or link.endswith(".docx"):
83
+ d = wordDoc.wordDoc(local_temp_path,saveFolder)
84
+ table = d.extractTableAsList()
85
+ # excel
86
+ elif link.split(".")[-1].lower() in "xlsx":
87
+ # download the Excel file if it has not been downloaded yet
88
+ savePath = saveFolder +"/"+ link.split("/")[-1]
89
+ excelPath = download_excel_file(link, savePath)
90
+ try:
91
+ #xls = pd.ExcelFile(excelPath)
92
+ xls = pd.ExcelFile(local_temp_path)
93
+ table_list = []
94
+ for sheet_name in xls.sheet_names:
95
+ df = pd.read_excel(xls, sheet_name=sheet_name)
96
+ cleaned_table = df.fillna("").astype(str).values.tolist()
97
+ table_list.append(cleaned_table)
98
+ table = table_list
99
+ except Exception as e:
100
+ print("❌ Failed to extract tables from Excel:", e)
101
+ # html
102
+ elif "http" in link or "html" in link:
103
+ html = extractHTML.HTML("",link)
104
+ table = html.extractTable() # table is a list
105
+ table = clean_tables_format(table)
106
+ return table
107
+
108
+ def clean_tables_format(tables):
109
+ """
110
+ Ensures all tables are in consistent format: List[List[List[str]]]
111
+ Cleans by:
112
+ - Removing empty strings and rows
113
+ - Converting all cells to strings
114
+ - Handling DataFrames and list-of-lists
115
+ """
116
+ cleaned = []
117
+ if tables:
118
+ for table in tables:
119
+ standardized = []
120
+
121
+ # Case 1: Pandas DataFrame
122
+ if isinstance(table, pd.DataFrame):
123
+ table = table.fillna("").astype(str).values.tolist()
124
+
125
+ # Case 2: List of Lists
126
+ if isinstance(table, list) and all(isinstance(row, list) for row in table):
127
+ for row in table:
128
+ filtered_row = [str(cell).strip() for cell in row if str(cell).strip()]
129
+ if filtered_row:
130
+ standardized.append(filtered_row)
131
+
132
+ if standardized:
133
+ cleaned.append(standardized)
134
+
135
+ return cleaned
136
+
137
+ import json
138
+ import tiktoken # Optional: for OpenAI token counting
139
+ def normalize_text_for_comparison(s: str) -> str:
140
+ """
141
+ Normalizes text for robust comparison by:
142
+ 1. Converting to lowercase.
143
+ 2. Replacing all types of newlines with a single consistent newline (\n).
144
+ 3. Removing extra spaces (e.g., multiple spaces, leading/trailing spaces on lines).
145
+ 4. Stripping leading/trailing whitespace from the entire string.
146
+ """
147
+ s = s.lower()
148
+ s = s.replace('\r\n', '\n') # Handle Windows newlines
149
+ s = s.replace('\r', '\n') # Handle Mac classic newlines
150
+
151
+ # Replace sequences of whitespace (including multiple newlines) with a single space
152
+ # This might be too aggressive if you need to preserve paragraph breaks,
153
+ # but good for exact word-sequence matching.
154
+ s = re.sub(r'\s+', ' ', s)
155
+
156
+ return s.strip()
157
+ def merge_text_and_tables(text, tables, max_tokens=12000, keep_tables=True, tokenizer="cl100k_base", accession_id=None, isolate=None):
158
+ """
159
+ Merge cleaned text and table into one string for LLM input.
160
+ - Avoids duplicating tables already in text
161
+ - Extracts only relevant rows from large tables
162
+ - Skips or saves oversized tables
163
+ """
164
+ import importlib
165
+ json = importlib.import_module("json")
166
+
167
+ def estimate_tokens(text_str):
168
+ try:
169
+ enc = tiktoken.get_encoding(tokenizer)
170
+ return len(enc.encode(text_str))
171
+ except:
172
+ return len(text_str) // 4 # Fallback estimate
173
+
174
+ def is_table_relevant(table, keywords, accession_id=None):
175
+ flat = " ".join(" ".join(row).lower() for row in table)
176
+ if accession_id and accession_id.lower() in flat:
177
+ return True
178
+ return any(kw.lower() in flat for kw in keywords)
179
+ preview, preview1 = "",""
180
+ llm_input = "## Document Text\n" + text.strip() + "\n"
181
+ clean_text = normalize_text_for_comparison(text)
182
+
183
+ if tables:
184
+ for idx, table in enumerate(tables):
185
+ keywords = ["province","district","region","village","location", "country", "region", "origin", "ancient", "modern"]
186
+ if accession_id: keywords += [accession_id.lower()]
187
+ if isolate: keywords += [isolate.lower()]
188
+ if is_table_relevant(table, keywords, accession_id):
189
+ if len(table) > 0:
190
+ for tab in table:
191
+ preview = " ".join(tab) if tab else ""
192
+ preview1 = "\n".join(tab) if tab else ""
193
+ clean_preview = normalize_text_for_comparison(preview)
194
+ clean_preview1 = normalize_text_for_comparison(preview1)
195
+ if clean_preview not in clean_text:
196
+ if clean_preview1 not in clean_text:
197
+ table_str = json.dumps([tab], indent=2)
198
+ llm_input += f"## Table {idx+1}\n{table_str}\n"
199
+ return llm_input.strip()
200
+
201
+ def preprocess_document(link, saveFolder, accession=None, isolate=None):
202
+ try:
203
+ text = extract_text(link, saveFolder)
204
+ except: text = ""
205
+ try:
206
+ tables = extract_table(link, saveFolder)
207
+ except: tables = []
208
+ if accession: accession = accession
209
+ if isolate: isolate = isolate
210
+ try:
211
+ final_input = merge_text_and_tables(text, tables, max_tokens=12000, accession_id=accession, isolate=isolate)
212
+ except: final_input = ""
213
+ return text, tables, final_input
214
+
215
+ def extract_sentences(text):
216
+ sentences = re.split(r'(?<=[.!?])\s+', text)
217
+ return [s.strip() for s in sentences if s.strip()]
218
+
219
+ def is_irrelevant_number_sequence(text):
220
+ if re.search(r'\b[A-Z]{2,}\d+\b|\b[A-Za-z]+\s+\d+\b', text, re.IGNORECASE):
221
+ return False
222
+ word_count = len(re.findall(r'\b[A-Za-z]{2,}\b', text))
223
+ number_count = len(re.findall(r'\b\d[\d\.]*\b', text))
224
+ total_tokens = len(re.findall(r'\S+', text))
225
+ if total_tokens > 0 and (word_count / total_tokens < 0.2) and (number_count / total_tokens > 0.5):
226
+ return True
227
+ elif re.fullmatch(r'(\d+(\.\d+)?\s*)+', text.strip()):
228
+ return True
229
+ return False
230
+
231
+ def remove_isolated_single_digits(sentence):
232
+ tokens = sentence.split()
233
+ filtered_tokens = []
234
+ for token in tokens:
235
+ if token == '0' or token == '1':
236
+ pass
237
+ else:
238
+ filtered_tokens.append(token)
239
+ return ' '.join(filtered_tokens).strip()
240
+
241
+ def get_contextual_sentences_BFS(text_content, keyword, depth=2):
242
+ def extract_codes(sentence):
243
+ # Match codes like 'A1YU101', 'KM1', 'MO6' — at least 2 letters + numbers
244
+ return [code for code in re.findall(r'\b[A-Z]{2,}[0-9]+\b', sentence, re.IGNORECASE)]
245
+ sentences = extract_sentences(text_content)
246
+ relevant_sentences = set()
247
+ initial_keywords = set()
248
+
249
+ # Define a regex to capture codes like A1YU101 or KM1
250
+ # This pattern looks for an alphanumeric sequence followed by digits at the end of the string
251
+ code_pattern = re.compile(r'([A-Z0-9]+?)(\d+)$', re.IGNORECASE)
252
+
253
+ # Attempt to parse the keyword into its prefix and numerical part using re.search
254
+ keyword_match = code_pattern.search(keyword)
255
+
256
+ keyword_prefix = None
257
+ keyword_num = None
258
+
259
+ if keyword_match:
260
+ keyword_prefix = keyword_match.group(1).lower()
261
+ keyword_num = int(keyword_match.group(2))
262
+
263
+ for sentence in sentences:
264
+ sentence_added = False
265
+
266
+ # 1. Check for exact match of the keyword
267
+ if re.search(r'\b' + re.escape(keyword) + r'\b', sentence, re.IGNORECASE):
268
+ relevant_sentences.add(sentence.strip())
269
+ initial_keywords.add(keyword.lower())
270
+ sentence_added = True
271
+
272
+ # 2. Check for range patterns (e.g., A1YU101-A1YU137)
273
+ # The range pattern should be broad enough to capture the full code string within the range.
274
+ range_matches = re.finditer(r'([A-Z0-9]+-\d+)', sentence, re.IGNORECASE) # More specific range pattern if needed, or rely on full code pattern below
275
+ range_matches = re.finditer(r'([A-Z0-9]+\d+)-([A-Z0-9]+\d+)', sentence, re.IGNORECASE) # This is the more robust range pattern
276
+
277
+ for r_match in range_matches:
278
+ start_code_str = r_match.group(1)
279
+ end_code_str = r_match.group(2)
280
+
281
+ # CRITICAL FIX: Use code_pattern.search for start_match and end_match
282
+ start_match = code_pattern.search(start_code_str)
283
+ end_match = code_pattern.search(end_code_str)
284
+
285
+ if keyword_prefix and keyword_num is not None and start_match and end_match:
286
+ start_prefix = start_match.group(1).lower()
287
+ end_prefix = end_match.group(1).lower()
288
+ start_num = int(start_match.group(2))
289
+ end_num = int(end_match.group(2))
290
+
291
+ # Check if the keyword's prefix matches and its number is within the range
292
+ if keyword_prefix == start_prefix and \
293
+ keyword_prefix == end_prefix and \
294
+ start_num <= keyword_num <= end_num:
295
+ relevant_sentences.add(sentence.strip())
296
+ initial_keywords.add(start_code_str.lower())
297
+ initial_keywords.add(end_code_str.lower())
298
+ sentence_added = True
299
+ break # Only need to find one matching range per sentence
300
+
301
+ # 3. If the sentence was added due to exact match or range, add all its alphanumeric codes
302
+ # to initial_keywords to ensure graph traversal from related terms.
303
+ if sentence_added:
304
+ for word in extract_codes(sentence):
305
+ initial_keywords.add(word.lower())
306
+
307
+
308
+ # Build word_to_sentences mapping for all sentences
309
+ word_to_sentences = {}
310
+ for sent in sentences:
311
+ codes_in_sent = set(extract_codes(sent))
312
+ for code in codes_in_sent:
313
+ word_to_sentences.setdefault(code.lower(), set()).add(sent.strip())
314
+
315
+
316
+ # Build the graph
317
+ graph = {}
318
+ for sent in sentences:
319
+ codes = set(extract_codes(sent))
320
+ for word1 in codes:
321
+ word1_lower = word1.lower()
322
+ graph.setdefault(word1_lower, set())
323
+ for word2 in codes:
324
+ word2_lower = word2.lower()
325
+ if word1_lower != word2_lower:
326
+ graph[word1_lower].add(word2_lower)
327
+
328
+
329
+ # Perform BFS/graph traversal
330
+ queue = [(k, 0) for k in initial_keywords if k in word_to_sentences]
331
+ visited_words = set(initial_keywords)
332
+
333
+ while queue:
334
+ current_word, level = queue.pop(0)
335
+ if level >= depth:
336
+ continue
337
+
338
+ relevant_sentences.update(word_to_sentences.get(current_word, []))
339
+
340
+ for neighbor in graph.get(current_word, []):
341
+ if neighbor not in visited_words:
342
+ visited_words.add(neighbor)
343
+ queue.append((neighbor, level + 1))
344
+
345
+ final_sentences = set()
346
+ for sentence in relevant_sentences:
347
+ if not is_irrelevant_number_sequence(sentence):
348
+ processed_sentence = remove_isolated_single_digits(sentence)
349
+ if processed_sentence:
350
+ final_sentences.add(processed_sentence)
351
+
352
+ return "\n".join(sorted(list(final_sentences)))
353
+
354
+
355
+
356
+ def get_contextual_sentences_DFS(text_content, keyword, depth=2):
357
+ sentences = extract_sentences(text_content)
358
+
359
+ # Build word-to-sentences mapping
360
+ word_to_sentences = {}
361
+ for sent in sentences:
362
+ words_in_sent = set(re.findall(r'\b[A-Za-z0-9\-_\/]+\b', sent))
363
+ for word in words_in_sent:
364
+ word_to_sentences.setdefault(word.lower(), set()).add(sent.strip())
365
+
366
+ # Function to extract codes in a sentence
367
+ def extract_codes(sentence):
368
+ # Only codes like 'KSK1', 'MG272794', not pure numbers
369
+ return [code for code in re.findall(r'\b[A-Z]{2,}[0-9]+\b', sentence, re.IGNORECASE)]
370
+
371
+ # DFS with priority based on distance to keyword and early stop if country found
372
+ def dfs_traverse(current_word, current_depth, max_depth, visited_words, collected_sentences, parent_sentence=None):
373
+ country = "unknown"
374
+ if current_depth > max_depth:
375
+ return country, False
376
+
377
+ if current_word not in word_to_sentences:
378
+ return country, False
379
+
380
+ for sentence in word_to_sentences[current_word]:
381
+ if sentence == parent_sentence:
382
+ continue # avoid reusing the same sentence
383
+
384
+ collected_sentences.add(sentence)
385
+
386
+ #print("current_word:", current_word)
387
+ small_sen = extract_context(sentence, current_word, int(len(sentence) / 4))
388
+ #print(small_sen)
389
+ country = model.get_country_from_text(small_sen)
390
+ #print("small context country:", country)
391
+ if country.lower() != "unknown":
392
+ return country, True
393
+ else:
394
+ country = model.get_country_from_text(sentence)
395
+ #print("full sentence country:", country)
396
+ if country.lower() != "unknown":
397
+ return country, True
398
+
399
+ codes_in_sentence = extract_codes(sentence)
400
+ idx = next((i for i, code in enumerate(codes_in_sentence) if code.lower() == current_word.lower()), None)
401
+ if idx is None:
402
+ continue
403
+
404
+ sorted_children = sorted(
405
+ [code for code in codes_in_sentence if code.lower() not in visited_words],
406
+ key=lambda x: (abs(codes_in_sentence.index(x) - idx),
407
+ 0 if codes_in_sentence.index(x) > idx else 1)
408
+ )
409
+
410
+ #print("sorted_children:", sorted_children)
411
+ for child in sorted_children:
412
+ child_lower = child.lower()
413
+ if child_lower not in visited_words:
414
+ visited_words.add(child_lower)
415
+ country, should_stop = dfs_traverse(
416
+ child_lower, current_depth + 1, max_depth,
417
+ visited_words, collected_sentences, parent_sentence=sentence
418
+ )
419
+ if should_stop:
420
+ return country, True
421
+
422
+ return country, False
423
+
424
+ # Begin DFS
425
+ collected_sentences = set()
426
+ visited_words = set([keyword.lower()])
427
+ country, status = dfs_traverse(keyword.lower(), 0, depth, visited_words, collected_sentences)
428
+
429
+ # Filter irrelevant sentences
430
+ final_sentences = set()
431
+ for sentence in collected_sentences:
432
+ if not is_irrelevant_number_sequence(sentence):
433
+ processed = remove_isolated_single_digits(sentence)
434
+ if processed:
435
+ final_sentences.add(processed)
436
+ if not final_sentences:
437
+ return country, text_content
438
+ return country, "\n".join(sorted(list(final_sentences)))
439
+
440
+ # Helper function for normalizing text for overlap comparison
441
+ def normalize_for_overlap(s: str) -> str:
442
+ s = re.sub(r'[^a-zA-Z0-9\s]', ' ', s).lower()
443
+ s = re.sub(r'\s+', ' ', s).strip()
444
+ return s
445
+
446
+ def merge_texts_skipping_overlap(text1: str, text2: str) -> str:
447
+ if not text1: return text2
448
+ if not text2: return text1
449
+
450
+ # Case 1: text2 is fully contained in text1 or vice-versa
451
+ if text2 in text1:
452
+ return text1
453
+ if text1 in text2:
454
+ return text2
455
+
456
+ # --- Option 1: Original behavior (suffix of text1, prefix of text2) ---
457
+ # This is what your function was primarily designed for.
458
+ # It looks for the overlap at the "junction" of text1 and text2.
459
+
460
+ max_junction_overlap = 0
461
+ for i in range(min(len(text1), len(text2)), 0, -1):
462
+ suffix1 = text1[-i:]
463
+ prefix2 = text2[:i]
464
+ # Prioritize exact match, then normalized match
465
+ if suffix1 == prefix2:
466
+ max_junction_overlap = i
467
+ break
468
+ elif normalize_for_overlap(suffix1) == normalize_for_overlap(prefix2):
469
+ max_junction_overlap = i
470
+ break # Take the first (longest) normalized match
471
+
472
+ if max_junction_overlap > 0:
473
+ merged_text = text1 + text2[max_junction_overlap:]
474
+ return re.sub(r'\s+', ' ', merged_text).strip()
475
+
476
+ # --- Option 2: Longest Common Prefix (for cases like "Hi, I am Vy.") ---
477
+ # This addresses your specific test case where the overlap is at the very beginning of both strings.
478
+ # This is often used when trying to deduplicate content that shares a common start.
479
+
480
+ longest_common_prefix_len = 0
481
+ min_len = min(len(text1), len(text2))
482
+ for i in range(min_len):
483
+ if text1[i] == text2[i]:
484
+ longest_common_prefix_len = i + 1
485
+ else:
486
+ break
487
+
488
+ # If a common prefix is found AND it's a significant portion (e.g., more than a few chars)
489
+ # AND the remaining parts are distinct, then apply this merge.
490
+ # This is a heuristic and might need fine-tuning.
491
+ if longest_common_prefix_len > 0 and \
492
+ text1[longest_common_prefix_len:].strip() and \
493
+ text2[longest_common_prefix_len:].strip():
494
+
495
+ # Only merge this way if the remaining parts are not empty (i.e., not exact duplicates)
496
+ # For "Hi, I am Vy. Nice to meet you." and "Hi, I am Vy. Goodbye Vy."
497
+ # common prefix is "Hi, I am Vy."
498
+ # Remaining text1: " Nice to meet you."
499
+ # Remaining text2: " Goodbye Vy."
500
+ # So we merge common_prefix + remaining_text1 + remaining_text2
501
+
502
+ common_prefix_str = text1[:longest_common_prefix_len]
503
+ remainder_text1 = text1[longest_common_prefix_len:]
504
+ remainder_text2 = text2[longest_common_prefix_len:]
505
+
506
+ merged_text = common_prefix_str + remainder_text1 + remainder_text2
507
+ return re.sub(r'\s+', ' ', merged_text).strip()
508
+
509
+
510
+ # If neither specific overlap type is found, just concatenate
511
+ merged_text = text1 + text2
512
+ return re.sub(r'\s+', ' ', merged_text).strip()
513
+
514
+ from docx import Document
515
+ from pipeline import upload_file_to_drive
516
+ # def save_text_to_docx(text_content: str, file_path: str):
517
+ # """
518
+ # Saves a given text string into a .docx file.
519
+
520
+ # Args:
521
+ # text_content (str): The text string to save.
522
+ # file_path (str): The full path including the filename where the .docx file will be saved.
523
+ # Example: '/content/drive/MyDrive/CollectData/Examples/test/SEA_1234/merged_document.docx'
524
+ # """
525
+ # try:
526
+ # document = Document()
527
+
528
+ # # Add the entire text as a single paragraph, or split by newlines for multiple paragraphs
529
+ # for paragraph_text in text_content.split('\n'):
530
+ # document.add_paragraph(paragraph_text)
531
+
532
+ # document.save(file_path)
533
+ # print(f"Text successfully saved to '{file_path}'")
534
+ # except Exception as e:
535
+ # print(f"Error saving text to docx file: {e}")
536
+ def save_text_to_docx(text_content: str, filename: str, drive_folder_id: str):
537
+ """
538
+ Saves a given text string into a .docx file locally, then uploads to Google Drive.
539
+
540
+ Args:
541
+ text_content (str): The text string to save.
542
+ filename (str): The target .docx file name, e.g. 'BRU18_merged_document.docx'.
543
+ drive_folder_id (str): Google Drive folder ID where to upload the file.
544
+ """
545
+ try:
546
+ # ✅ Save to temporary local path first
547
+ local_path = os.path.join(tempfile.gettempdir(), filename)
548
+ document = Document()
549
+ for paragraph_text in text_content.split('\n'):
550
+ document.add_paragraph(paragraph_text)
551
+ document.save(local_path)
552
+ print(f"✅ Text saved locally to: {local_path}")
553
+
554
+ # Upload to Drive
555
+ upload_file_to_drive(local_path, filename, drive_folder_id)
556
+ print(f"✅ Uploaded '{filename}' to Google Drive folder ID: {drive_folder_id}")
557
+
558
+ except Exception as e:
559
+ print(f"❌ Error saving or uploading DOCX: {e}")
560
+
561
+
562
+ '''2 scenarios:
563
+ - quick look, keyword found, deep dive and directly get the location, then stop
564
+ - quick look, keyword found, deep dive but no location found; hold the related words, then
565
+ look through the other files iteratively for each related word until a location is found, then stop'''
566
+ def extract_context(text, keyword, window=500):
567
+ # firstly try accession number
568
+ code_pattern = re.compile(r'([A-Z0-9]+?)(\d+)$', re.IGNORECASE)
569
+
570
+ # Attempt to parse the keyword into its prefix and numerical part using re.search
571
+ keyword_match = code_pattern.search(keyword)
572
+
573
+ keyword_prefix = None
574
+ keyword_num = None
575
+
576
+ if keyword_match:
577
+ keyword_prefix = keyword_match.group(1).lower()
578
+ keyword_num = int(keyword_match.group(2))
579
+ text = text.lower()
580
+ idx = text.find(keyword.lower())
581
+ if idx == -1:
582
+ if keyword_prefix:
583
+ idx = text.find(keyword_prefix)
584
+ if idx == -1:
585
+ return "Sample ID not found."
586
+ return text[max(0, idx-window): idx+window]
587
+ return text[max(0, idx-window): idx+window]
588
+ def process_inputToken(filePaths, saveLinkFolder,accession=None, isolate=None):
589
+ cache = {}
590
+ country = "unknown"
591
+ output = ""
592
+ tem_output, small_output = "",""
593
+ keyword_appear = (False,"")
594
+ keywords = []
595
+ if isolate: keywords.append(isolate)
596
+ if accession: keywords.append(accession)
597
+ for f in filePaths:
598
+ # scenario 1: direct location: truncate the context and then use the QA model?
599
+ if keywords:
600
+ for keyword in keywords:
601
+ text, tables, final_input = preprocess_document(f,saveLinkFolder, isolate=keyword)
602
+ if keyword in final_input:
603
+ context = extract_context(final_input, keyword)
604
+ # quick look if country already in context and if yes then return
605
+ country = model.get_country_from_text(context)
606
+ if country != "unknown":
607
+ return country, context, final_input
608
+ else:
609
+ country = model.get_country_from_text(final_input)
610
+ if country != "unknown":
611
+ return country, context, final_input
612
+ else: # might be cross-ref
613
+ keyword_appear = (True, f)
614
+ cache[f] = context
615
+ small_output = merge_texts_skipping_overlap(output, context) + "\n"
616
+ chunkBFS = get_contextual_sentences_BFS(small_output, keyword)
617
+ countryBFS = model.get_country_from_text(chunkBFS)
618
+ countryDFS, chunkDFS = get_contextual_sentences_DFS(output, keyword)
619
+ output = merge_texts_skipping_overlap(output, final_input)
620
+ if countryDFS != "unknown" and countryBFS != "unknown":
621
+ if len(chunkDFS) <= len(chunkBFS):
622
+ return countryDFS, chunkDFS, output
623
+ else:
624
+ return countryBFS, chunkBFS, output
625
+ else:
626
+ if countryDFS != "unknown":
627
+ return countryDFS, chunkDFS, output
628
+ if countryBFS != "unknown":
629
+ return countryBFS, chunkBFS, output
630
+ else:
631
+ # scenario 2:
632
+ '''cross-ref: ex: A1YU101 keyword in file 2 which includes KM1 but KM1 in file 1
633
+ but if we look at file 1 first then maybe we can have lookup dict which country
634
+ such as Thailand as the key and its re'''
635
+ cache[f] = final_input
636
+ if keyword_appear[0] == True:
637
+ for c in cache:
638
+ if c!=keyword_appear[1]:
639
+ if cache[c].lower() not in output.lower():
640
+ output = merge_texts_skipping_overlap(output, cache[c]) + "\n"
641
+ chunkBFS = get_contextual_sentences_BFS(output, keyword)
642
+ countryBFS = model.get_country_from_text(chunkBFS)
643
+ countryDFS, chunkDFS = get_contextual_sentences_DFS(output, keyword)
644
+ if countryDFS != "unknown" and countryBFS != "unknown":
645
+ if len(chunkDFS) <= len(chunkBFS):
646
+ return countryDFS, chunkDFS, output
647
+ else:
648
+ return countryBFS, chunkBFS, output
649
+ else:
650
+ if countryDFS != "unknown":
651
+ return countryDFS, chunkDFS, output
652
+ if countryBFS != "unknown":
653
+ return countryBFS, chunkBFS, output
654
+ else:
655
+ if cache[f].lower() not in output.lower():
656
+ output = merge_texts_skipping_overlap(output, cache[f]) + "\n"
657
+ if len(output) == 0 or keyword_appear[0]==False:
658
+ for c in cache:
659
+ if cache[c].lower() not in output.lower():
660
+ output = merge_texts_skipping_overlap(output, cache[c]) + "\n"
661
  return country, "", output
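
For reference, a minimal usage sketch of the updated module. This is illustrative only: the document links, the Drive folder ID, and the accession/isolate values are hypothetical placeholders, and it assumes the file is importable as data_preprocess alongside the repo's model and pipeline helpers (download_file_from_drive, upload_file_to_drive) referenced in this commit.

import data_preprocess

# hypothetical inputs: source-document links and the Drive folder ID passed through to the pipeline helpers
links = ["https://example.org/supplement.xlsx", "https://example.org/paper.pdf"]
save_folder = "DRIVE_FOLDER_ID_PLACEHOLDER"

# extract text and tables from one document and build the merged LLM input
text, tables, llm_input = data_preprocess.preprocess_document(
    links[0], save_folder, accession="MG272794", isolate="KSK1"
)

# walk all documents looking for the sample's country of origin
country, context, merged = data_preprocess.process_inputToken(
    links, save_folder, accession="MG272794", isolate="KSK1"
)
print(country)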