oceansweep committed on
Commit 34f604c · verified · 1 Parent(s): 60fa160

Update App_Function_Libraries/Chunk_Lib.py

Files changed (1)
  1. App_Function_Libraries/Chunk_Lib.py +1050 -1050
App_Function_Libraries/Chunk_Lib.py CHANGED
@@ -1,1051 +1,1051 @@
1
- # Chunk_Lib.py
2
- #########################################
3
- # Chunking Library
4
- # This library is used to perform chunking of input files.
5
- # Currently uses naive approaches. Nothing fancy.
6
- #
7
- ####
8
- # Import necessary libraries
9
- import hashlib
10
- import json
11
- import logging
12
- import re
13
- from typing import Any, Dict, List, Optional, Tuple
14
- #
15
- # Import 3rd party
16
- from openai import OpenAI
17
- from tqdm import tqdm
18
- from langdetect import detect
19
- from transformers import GPT2Tokenizer
20
- import nltk
21
- from nltk.tokenize import sent_tokenize, word_tokenize
22
- from sklearn.feature_extraction.text import TfidfVectorizer
23
- from sklearn.metrics.pairwise import cosine_similarity
24
- #
25
- # Import Local
26
- from App_Function_Libraries.Tokenization_Methods_Lib import openai_tokenize
27
- from App_Function_Libraries.Utils.Utils import load_comprehensive_config
28
- #
29
- #######################################################################################################################
30
- # Config Settings
31
- #
32
- #
33
- # FIXME - Make sure it only downloads if it already exists, and does a check first.
34
- # Ensure NLTK data is downloaded
35
- def ensure_nltk_data():
36
- try:
37
- nltk.data.find('tokenizers/punkt')
38
- except LookupError:
39
- nltk.download('punkt')
40
- ensure_nltk_data()
41
-
42
- #
43
- # Load GPT2 tokenizer
44
- tokenizer = GPT2Tokenizer.from_pretrained("gpt2")
45
- #
46
- # Load configuration
47
- config = load_comprehensive_config()
48
- # Embedding Chunking options
49
- chunk_options = {
50
- 'method': config.get('Chunking', 'method', fallback='words'),
51
- 'max_size': config.getint('Chunking', 'max_size', fallback=400),
52
- 'overlap': config.getint('Chunking', 'overlap', fallback=200),
53
- 'adaptive': config.getboolean('Chunking', 'adaptive', fallback=False),
54
- 'multi_level': config.getboolean('Chunking', 'multi_level', fallback=False),
55
- 'language': config.get('Chunking', 'language', fallback='english')
56
- }
57
-
58
- openai_api_key = config.get('API', 'openai_api_key')
59
- #
60
- # End of settings
61
- #######################################################################################################################
62
- #
63
- # Functions:
64
-
65
- # Create a chunking class for refactoring FIXME
66
- # class Chunker:
67
- # def __init__(self, tokenizer: GPT2Tokenizer):
68
- # self.tokenizer = tokenizer
69
- #
70
- # def detect_language(self, text: str) -> str:
71
- # try:
72
- # return detect(text)
73
- # except:
74
- # return 'en'
75
- #
76
- # def chunk_text(self, text: str, method: str, max_size: int, overlap: int, language: str = None) -> List[str]:
77
- # if language is None:
78
- # language = self.detect_language(text)
79
- #
80
- # if method == 'words':
81
- # return self.chunk_text_by_words(text, max_size, overlap, language)
82
- # elif method == 'sentences':
83
- # return self.chunk_text_by_sentences(text, max_size, overlap, language)
84
- # elif method == 'paragraphs':
85
- # return self.chunk_text_by_paragraphs(text, max_size, overlap)
86
- # elif method == 'tokens':
87
- # return self.chunk_text_by_tokens(text, max_size, overlap, language)
88
- # elif method == 'semantic':
89
- # return self.semantic_chunking(text, max_size)
90
- # else:
91
- # return [text]
92
-
93
- def detect_language(text: str) -> str:
94
- try:
95
- return detect(text)
96
- except:
97
- # Default to English if detection fails
98
- return 'en'
99
-
100
-
101
- def load_document(file_path: str) -> str:
102
- with open(file_path, 'r', encoding='utf-8') as file:
103
- text = file.read()
104
- return re.sub(r'\s+', ' ', text).strip()
105
-
106
-
107
- def improved_chunking_process(text: str, chunk_options: Dict[str, Any] = None) -> List[Dict[str, Any]]:
108
- logging.debug("Improved chunking process started...")
109
-
110
- # Extract JSON metadata if present
111
- json_content = {}
112
- try:
113
- json_end = text.index("}\n") + 1
114
- json_content = json.loads(text[:json_end])
115
- text = text[json_end:].strip()
116
- logging.debug(f"Extracted JSON metadata: {json_content}")
117
- except (ValueError, json.JSONDecodeError):
118
- logging.debug("No JSON metadata found at the beginning of the text")
119
-
120
- # Extract any additional header text
121
- header_match = re.match(r"(This text was transcribed using.*?)\n\n", text, re.DOTALL)
122
- header_text = ""
123
- if header_match:
124
- header_text = header_match.group(1)
125
- text = text[len(header_text):].strip()
126
- logging.debug(f"Extracted header text: {header_text}")
127
-
128
- options = chunk_options.copy() if chunk_options else {}
129
- if chunk_options:
130
- options.update(chunk_options)
131
-
132
- chunk_method = options.get('method', 'words')
133
- max_size = options.get('max_size', 2000)
134
- overlap = options.get('overlap', 0)
135
- language = options.get('language', None)
136
-
137
- if language is None:
138
- language = detect_language(text)
139
-
140
- if chunk_method == 'json':
141
- chunks = chunk_text_by_json(text, max_size=max_size, overlap=overlap)
142
- else:
143
- chunks = chunk_text(text, chunk_method, max_size, overlap, language)
144
-
145
- chunks_with_metadata = []
146
- total_chunks = len(chunks)
147
- for i, chunk in enumerate(chunks):
148
- metadata = {
149
- 'chunk_index': i + 1,
150
- 'total_chunks': total_chunks,
151
- 'chunk_method': chunk_method,
152
- 'max_size': max_size,
153
- 'overlap': overlap,
154
- 'language': language,
155
- 'relative_position': (i + 1) / total_chunks
156
- }
157
- metadata.update(json_content) # Add the extracted JSON content to metadata
158
- metadata['header_text'] = header_text # Add the header text to metadata
159
-
160
- if chunk_method == 'json':
161
- chunk_text_content = json.dumps(chunk['json'], ensure_ascii=False)
162
- else:
163
- chunk_text_content = chunk
164
-
165
- chunks_with_metadata.append({
166
- 'text': chunk_text_content,
167
- 'metadata': metadata
168
- })
169
-
170
- return chunks_with_metadata
171
-
172
-
173
- def multi_level_chunking(text: str, method: str, max_size: int, overlap: int, language: str) -> List[str]:
174
- logging.debug("Multi-level chunking process started...")
175
- # First level: chunk by paragraphs
176
- paragraphs = chunk_text_by_paragraphs(text, max_size * 2, overlap)
177
-
178
- # Second level: chunk each paragraph further
179
- chunks = []
180
- for para in paragraphs:
181
- if method == 'words':
182
- chunks.extend(chunk_text_by_words(para, max_words=max_size, overlap=overlap, language=language))
183
- elif method == 'sentences':
184
- chunks.extend(chunk_text_by_sentences(para, max_sentences=max_size, overlap=overlap, language=language))
185
- else:
186
- chunks.append(para)
187
-
188
- return chunks
189
-
190
-
191
- # FIXME - ensure language detection occurs in each chunk function
192
- def chunk_text(text: str, method: str, max_size: int, overlap: int, language: str = None) -> List[str]:
193
- if method == 'words':
194
- logging.debug("Chunking by words...")
195
- return chunk_text_by_words(text, max_words=max_size, overlap=overlap, language=language)
196
- elif method == 'sentences':
197
- logging.debug("Chunking by sentences...")
198
- return chunk_text_by_sentences(text, max_sentences=max_size, overlap=overlap, language=language)
199
- elif method == 'paragraphs':
200
- logging.debug("Chunking by paragraphs...")
201
- return chunk_text_by_paragraphs(text, max_paragraphs=max_size, overlap=overlap)
202
- elif method == 'tokens':
203
- logging.debug("Chunking by tokens...")
204
- return chunk_text_by_tokens(text, max_tokens=max_size, overlap=overlap)
205
- elif method == 'semantic':
206
- logging.debug("Chunking by semantic similarity...")
207
- return semantic_chunking(text, max_chunk_size=max_size)
208
- else:
209
- logging.warning(f"Unknown chunking method '{method}'. Returning full text as a single chunk.")
210
- return [text]
211
-
212
- def determine_chunk_position(relative_position: float) -> str:
213
- if relative_position < 0.33:
214
- return "This chunk is from the beginning of the document"
215
- elif relative_position < 0.66:
216
- return "This chunk is from the middle of the document"
217
- else:
218
- return "This chunk is from the end of the document"
219
-
220
-
221
- def chunk_text_by_words(text: str, max_words: int = 300, overlap: int = 0, language: str = None) -> List[str]:
222
- logging.debug("chunk_text_by_words...")
223
- if language is None:
224
- language = detect_language(text)
225
-
226
- if language.startswith('zh'): # Chinese
227
- import jieba
228
- words = list(jieba.cut(text))
229
- elif language == 'ja': # Japanese
230
- import fugashi
231
- tagger = fugashi.Tagger()
232
- words = [word.surface for word in tagger(text)]
233
- else: # Default to simple splitting for other languages
234
- words = text.split()
235
-
236
- chunks = []
237
- for i in range(0, len(words), max_words - overlap):
238
- chunk = ' '.join(words[i:i + max_words])
239
- chunks.append(chunk)
240
- return post_process_chunks(chunks)
241
-
242
-
243
- def chunk_text_by_sentences(text: str, max_sentences: int = 10, overlap: int = 0, language: str = None) -> List[str]:
244
- logging.debug("chunk_text_by_sentences...")
245
- if language is None:
246
- language = detect_language(text)
247
-
248
- if language.startswith('zh'): # Chinese
249
- import jieba
250
- # Use jieba to perform sentence segmentation
251
- # jieba does not support sentence segmentation out of the box
252
- # Use punctuation as delimiters
253
- sentences = re.split(r'[。!?;]', text)
254
- sentences = [s.strip() for s in sentences if s.strip()]
255
- elif language == 'ja': # Japanese
256
- import fugashi
257
- tagger = fugashi.Tagger()
258
- # Simple sentence segmentation based on punctuation
259
- sentences = re.split(r'[。!?]', text)
260
- sentences = [s.strip() for s in sentences if s.strip()]
261
- else: # Default to NLTK for other languages
262
- try:
263
- sentences = sent_tokenize(text, language=language)
264
- except LookupError:
265
- logging.warning(f"Punkt tokenizer not found for language '{language}'. Using default 'english'.")
266
- sentences = sent_tokenize(text, language='english')
267
-
268
- chunks = []
269
- previous_overlap = []
270
-
271
- for i in range(0, len(sentences), max_sentences - overlap):
272
- current_sentences = sentences[i:i + max_sentences]
273
- if overlap > 0 and previous_overlap:
274
- current_sentences = previous_overlap + current_sentences
275
- chunk = ' '.join(current_sentences)
276
- chunks.append(chunk)
277
- previous_overlap = sentences[i + max_sentences - overlap:i + max_sentences] if overlap > 0 else []
278
-
279
- return post_process_chunks(chunks)
280
-
281
-
282
- def chunk_text_by_paragraphs(text: str, max_paragraphs: int = 5, overlap: int = 0) -> List[str]:
283
- logging.debug("chunk_text_by_paragraphs...")
284
- paragraphs = re.split(r'\n\s*\n', text)
285
- chunks = []
286
- for i in range(0, len(paragraphs), max_paragraphs - overlap):
287
- chunk = '\n\n'.join(paragraphs[i:i + max_paragraphs])
288
- chunks.append(chunk)
289
- return post_process_chunks(chunks)
290
-
291
-
292
- def chunk_text_by_tokens(text: str, max_tokens: int = 1000, overlap: int = 0) -> List[str]:
293
- logging.debug("chunk_text_by_tokens...")
294
- # This is a simplified token-based chunking. For more accurate tokenization,
295
- # consider using a proper tokenizer like GPT-2 TokenizerFast
296
- words = text.split()
297
- chunks = []
298
- current_chunk = []
299
- current_token_count = 0
300
-
301
- for word in words:
302
- word_token_count = len(word) // 4 + 1 # Rough estimate of token count
303
- if current_token_count + word_token_count > max_tokens and current_chunk:
304
- chunks.append(' '.join(current_chunk))
305
- current_chunk = current_chunk[-overlap:] if overlap > 0 else []
306
- current_token_count = sum(len(w) // 4 + 1 for w in current_chunk)
307
-
308
- current_chunk.append(word)
309
- current_token_count += word_token_count
310
-
311
- if current_chunk:
312
- chunks.append(' '.join(current_chunk))
313
-
314
- return post_process_chunks(chunks)
315
- # def chunk_text_by_tokens(text: str, max_tokens: int = 1000, overlap: int = 0) -> List[str]:
316
- # logging.debug("chunk_text_by_tokens...")
317
- # # Use GPT2 tokenizer for tokenization
318
- # tokens = tokenizer.encode(text)
319
- # chunks = []
320
- # for i in range(0, len(tokens), max_tokens - overlap):
321
- # chunk_tokens = tokens[i:i + max_tokens]
322
- # chunk = tokenizer.decode(chunk_tokens)
323
- # chunks.append(chunk)
324
- # return post_process_chunks(chunks)
325
-
326
-
327
- def post_process_chunks(chunks: List[str]) -> List[str]:
328
- return [chunk.strip() for chunk in chunks if chunk.strip()]
329
-
330
-
331
- # FIXME - F
332
- def get_chunk_metadata(chunk: str, full_text: str, chunk_type: str = "generic",
333
- chapter_number: Optional[int] = None,
334
- chapter_pattern: Optional[str] = None,
335
- language: str = None) -> Dict[str, Any]:
336
- """
337
- Generate metadata for a chunk based on its position in the full text.
338
- """
339
- chunk_length = len(chunk)
340
- start_index = full_text.find(chunk)
341
- end_index = start_index + chunk_length if start_index != -1 else None
342
-
343
- # Calculate a hash for the chunk
344
- chunk_hash = hashlib.md5(chunk.encode()).hexdigest()
345
-
346
- metadata = {
347
- 'start_index': start_index,
348
- 'end_index': end_index,
349
- 'word_count': len(chunk.split()),
350
- 'char_count': chunk_length,
351
- 'chunk_type': chunk_type,
352
- 'language': language,
353
- 'chunk_hash': chunk_hash,
354
- 'relative_position': start_index / len(full_text) if len(full_text) > 0 and start_index != -1 else 0
355
- }
356
-
357
- if chunk_type == "chapter":
358
- metadata['chapter_number'] = chapter_number
359
- metadata['chapter_pattern'] = chapter_pattern
360
-
361
- return metadata
362
-
363
-
364
- def process_document_with_metadata(text: str, chunk_options: Dict[str, Any],
365
- document_metadata: Dict[str, Any]) -> Dict[str, Any]:
366
- chunks = improved_chunking_process(text, chunk_options)
367
-
368
- return {
369
- 'document_metadata': document_metadata,
370
- 'chunks': chunks
371
- }
372
-
373
-
374
- # Hybrid approach, chunk each sentence while ensuring total token size does not exceed a maximum number
375
- def chunk_text_hybrid(text: str, max_tokens: int = 1000, overlap: int = 0) -> List[str]:
376
- logging.debug("chunk_text_hybrid...")
377
- sentences = sent_tokenize(text)
378
- chunks = []
379
- current_chunk = []
380
- current_length = 0
381
-
382
- for sentence in sentences:
383
- tokens = tokenizer.encode(sentence)
384
- if current_length + len(tokens) > max_tokens and current_chunk:
385
- chunks.append(' '.join(current_chunk))
386
- # Handle overlap
387
- if overlap > 0:
388
- overlap_tokens = tokenizer.encode(' '.join(current_chunk[-overlap:]))
389
- current_chunk = current_chunk[-overlap:]
390
- current_length = len(overlap_tokens)
391
- else:
392
- current_chunk = []
393
- current_length = 0
394
-
395
- current_chunk.append(sentence)
396
- current_length += len(tokens)
397
-
398
- if current_chunk:
399
- chunks.append(' '.join(current_chunk))
400
-
401
- return post_process_chunks(chunks)
402
-
403
-
404
- # Thanks openai
405
- def chunk_on_delimiter(input_string: str,
406
- max_tokens: int,
407
- delimiter: str) -> List[str]:
408
- logging.debug("chunk_on_delimiter...")
409
- chunks = input_string.split(delimiter)
410
- combined_chunks, _, dropped_chunk_count = combine_chunks_with_no_minimum(
411
- chunks, max_tokens, chunk_delimiter=delimiter, add_ellipsis_for_overflow=True)
412
- if dropped_chunk_count > 0:
413
- logging.warning(f"Warning: {dropped_chunk_count} chunks were dropped due to exceeding the token limit.")
414
- combined_chunks = [f"{chunk}{delimiter}" for chunk in combined_chunks]
415
- return combined_chunks
416
-
417
-
418
-
419
-
420
- # FIXME
421
- def recursive_summarize_chunks(chunks: List[str], summarize_func, custom_prompt: Optional[str] = None,
422
- temp: Optional[float] = None, system_prompt: Optional[str] = None) -> List[str]:
423
- logging.debug("recursive_summarize_chunks...")
424
- summarized_chunks = []
425
- current_summary = ""
426
-
427
- logging.debug(f"Summarizing {len(chunks)} chunks recursively...")
428
- logging.debug(f"Temperature is set to {temp}")
429
- for i, chunk in enumerate(chunks):
430
- if i == 0:
431
- current_summary = summarize_func(chunk, custom_prompt, temp, system_prompt)
432
- else:
433
- combined_text = current_summary + "\n\n" + chunk
434
- current_summary = summarize_func(combined_text, custom_prompt, temp, system_prompt)
435
-
436
- summarized_chunks.append(current_summary)
437
-
438
- return summarized_chunks
439
-
440
-
441
- # Sample text for testing
442
- sample_text = """
443
- Natural language processing (NLP) is a subfield of linguistics, computer science, and artificial intelligence
444
- concerned with the interactions between computers and human language, in particular how to program computers
445
- to process and analyze large amounts of natural language data. The result is a computer capable of "understanding"
446
- the contents of documents, including the contextual nuances of the language within them. The technology can then
447
- accurately extract information and insights contained in the documents as well as categorize and organize the documents themselves.
448
-
449
- Challenges in natural language processing frequently involve speech recognition, natural language understanding,
450
- and natural language generation.
451
-
452
- Natural language processing has its roots in the 1950s. Already in 1950, Alan Turing published an article titled
453
- "Computing Machinery and Intelligence" which proposed what is now called the Turing test as a criterion of intelligence.
454
- """
455
-
456
- # Example usage of different chunking methods
457
- # print("Chunking by words:")
458
- # print(chunk_text_by_words(sample_text, max_words=50))
459
- #
460
- # print("\nChunking by sentences:")
461
- # print(chunk_text_by_sentences(sample_text, max_sentences=2))
462
- #
463
- # print("\nChunking by paragraphs:")
464
- # print(chunk_text_by_paragraphs(sample_text, max_paragraphs=1))
465
- #
466
- # print("\nChunking by tokens:")
467
- # print(chunk_text_by_tokens(sample_text, max_tokens=50))
468
- #
469
- # print("\nHybrid chunking:")
470
- # print(chunk_text_hybrid(sample_text, max_tokens=50))
471
-
472
-
473
-
474
- #######################################################################################################################
475
- #
476
- # Experimental Semantic Chunking
477
- #
478
-
479
- # Chunk text into segments based on semantic similarity
480
- def count_units(text: str, unit: str = 'words') -> int:
481
- if unit == 'words':
482
- return len(text.split())
483
- elif unit == 'tokens':
484
- return len(tokenizer.encode(text))
485
- elif unit == 'characters':
486
- return len(text)
487
- else:
488
- raise ValueError("Invalid unit. Choose 'words', 'tokens', or 'characters'.")
489
-
490
-
491
-
492
- def semantic_chunking(text: str, max_chunk_size: int = 2000, unit: str = 'words') -> List[str]:
493
- logging.debug("semantic_chunking...")
494
- sentences = sent_tokenize(text)
495
- vectorizer = TfidfVectorizer()
496
- sentence_vectors = vectorizer.fit_transform(sentences)
497
-
498
- chunks = []
499
- current_chunk = []
500
- current_size = 0
501
-
502
- for i, sentence in enumerate(sentences):
503
- sentence_size = count_units(sentence, unit)
504
- if current_size + sentence_size > max_chunk_size and current_chunk:
505
- chunks.append(' '.join(current_chunk))
506
- # Use last 3 sentences for overlap
507
- current_chunk = current_chunk[-3:]
508
- current_size = count_units(' '.join(current_chunk), unit)
509
-
510
- current_chunk.append(sentence)
511
- current_size += sentence_size
512
-
513
- if i + 1 < len(sentences):
514
- current_vector = sentence_vectors[i]
515
- next_vector = sentence_vectors[i + 1]
516
- similarity = cosine_similarity(current_vector, next_vector)[0][0]
517
- if similarity < 0.5 and current_size >= max_chunk_size // 2:
518
- chunks.append(' '.join(current_chunk))
519
- current_chunk = current_chunk[-3:]
520
- current_size = count_units(' '.join(current_chunk), unit)
521
-
522
- if current_chunk:
523
- chunks.append(' '.join(current_chunk))
524
-
525
- return chunks
526
-
527
-
528
- def semantic_chunk_long_file(file_path: str, max_chunk_size: int = 1000, overlap: int = 100, unit: str = 'words') -> Optional[List[str]]:
529
- logging.debug("semantic_chunk_long_file...")
530
- try:
531
- with open(file_path, 'r', encoding='utf-8') as file:
532
- content = file.read()
533
-
534
- chunks = semantic_chunking(content, max_chunk_size, unit)
535
- return chunks
536
- except Exception as e:
537
- logging.error(f"Error chunking text file: {str(e)}")
538
- return None
539
-
540
- #
541
- #
542
- #######################################################################################################################
543
-
544
-
545
- #######################################################################################################################
546
- #
547
- # Embedding Chunking
548
-
549
- def chunk_for_embedding(text: str, file_name: str, custom_chunk_options: Dict[str, Any] = None) -> List[Dict[str, Any]]:
550
- options = chunk_options.copy()
551
- if custom_chunk_options:
552
- options.update(custom_chunk_options)
553
-
554
- logging.info(f"Chunking options: {options}")
555
- chunks = improved_chunking_process(text, options)
556
- total_chunks = len(chunks)
557
- logging.info(f"Total chunks created: {total_chunks}")
558
-
559
- chunked_text_with_headers = []
560
- for i, chunk in enumerate(chunks, 1):
561
- chunk_text = chunk['text']
562
- chunk_position = determine_chunk_position(chunk['metadata']['relative_position'])
563
- chunk_header = f"""
564
- Original Document: {file_name}
565
- Chunk: {i} of {total_chunks}
566
- Position: {chunk_position}
567
-
568
- --- Chunk Content ---
569
- """
570
-
571
- full_chunk_text = chunk_header + chunk_text
572
- chunk['text'] = full_chunk_text
573
- chunk['metadata']['file_name'] = file_name
574
- chunked_text_with_headers.append(chunk)
575
-
576
- return chunked_text_with_headers
577
-
578
- #
579
- # End of Embedding Chunking
580
- #######################################################################################################################
581
-
582
-
583
- #######################################################################################################################
584
- #
585
- # JSON Chunking
586
-
587
- # FIXME
588
- def chunk_text_by_json(text: str, max_size: int = 1000, overlap: int = 0) -> List[Dict[str, Any]]:
589
- """
590
- Chunk JSON-formatted text into smaller JSON chunks while preserving structure.
591
-
592
- Parameters:
593
- - text (str): The JSON-formatted text to be chunked.
594
- - max_size (int): Maximum number of items or characters per chunk.
595
- - overlap (int): Number of items or characters to overlap between chunks.
596
-
597
- Returns:
598
- - List[Dict[str, Any]]: A list of chunks with their metadata.
599
- """
600
- logging.debug("chunk_text_by_json started...")
601
- try:
602
- json_data = json.loads(text)
603
- except json.JSONDecodeError as e:
604
- logging.error(f"Invalid JSON data: {e}")
605
- raise ValueError(f"Invalid JSON data: {e}")
606
-
607
- # Determine if JSON data is a list or a dict
608
- if isinstance(json_data, list):
609
- return chunk_json_list(json_data, max_size, overlap)
610
- elif isinstance(json_data, dict):
611
- return chunk_json_dict(json_data, max_size, overlap)
612
- else:
613
- logging.error("Unsupported JSON structure. Only JSON objects and arrays are supported.")
614
- raise ValueError("Unsupported JSON structure. Only JSON objects and arrays are supported.")
615
-
616
-
617
- def chunk_json_list(json_list: List[Any], max_size: int, overlap: int) -> List[Dict[str, Any]]:
618
- """
619
- Chunk a JSON array into smaller chunks.
620
-
621
- Parameters:
622
- - json_list (List[Any]): The JSON array to be chunked.
623
- - max_size (int): Maximum number of items per chunk.
624
- - overlap (int): Number of items to overlap between chunks.
625
-
626
- Returns:
627
- - List[Dict[str, Any]]: A list of JSON chunks with metadata.
628
- """
629
- logging.debug("chunk_json_list started...")
630
- chunks = []
631
- total_items = len(json_list)
632
- step = max_size - overlap
633
- if step <= 0:
634
- raise ValueError("max_size must be greater than overlap.")
635
-
636
- for i in range(0, total_items, step):
637
- chunk = json_list[i:i + max_size]
638
- metadata = {
639
- 'chunk_index': i // step + 1,
640
- 'total_chunks': (total_items + step - 1) // step,
641
- 'chunk_method': 'json_list',
642
- 'max_size': max_size,
643
- 'overlap': overlap,
644
- 'relative_position': i / total_items
645
- }
646
- chunks.append({
647
- 'json': chunk,
648
- 'metadata': metadata
649
- })
650
-
651
- logging.debug(f"chunk_json_list created {len(chunks)} chunks.")
652
- return chunks
653
-
654
-
655
-
656
- def chunk_json_dict(json_dict: Dict[str, Any], max_size: int, overlap: int) -> List[Dict[str, Any]]:
657
- """
658
- Chunk a JSON object into smaller chunks based on its 'data' key while preserving other keys like 'metadata'.
659
-
660
- Parameters:
661
- - json_dict (Dict[str, Any]): The JSON object to be chunked.
662
- - max_size (int): Maximum number of key-value pairs per chunk in the 'data' section.
663
- - overlap (int): Number of key-value pairs to overlap between chunks.
664
-
665
- Returns:
666
- - List[Dict[str, Any]]: A list of JSON chunks with metadata.
667
- """
668
- logging.debug("chunk_json_dict started...")
669
-
670
- # Preserve non-chunked sections
671
- preserved_keys = ['metadata']
672
- preserved_data = {key: value for key, value in json_dict.items() if key in preserved_keys}
673
-
674
- # Identify the chunkable section
675
- chunkable_key = 'data'
676
- if chunkable_key not in json_dict or not isinstance(json_dict[chunkable_key], dict):
677
- logging.error("No chunkable 'data' section found in JSON dictionary.")
678
- raise ValueError("No chunkable 'data' section found in JSON dictionary.")
679
-
680
- chunkable_data = json_dict[chunkable_key]
681
- data_keys = list(chunkable_data.keys())
682
- total_keys = len(data_keys)
683
- chunks = []
684
- step = max_size - overlap
685
- if step <= 0:
686
- raise ValueError("max_size must be greater than overlap.")
687
-
688
- # Adjust the loop to prevent creating an extra chunk
689
- for i in range(0, total_keys, step):
690
- chunk_keys = data_keys[i:i + max_size]
691
-
692
- # Handle overlap
693
- if i != 0 and overlap > 0:
694
- overlap_keys = data_keys[i - overlap:i]
695
- chunk_keys = overlap_keys + chunk_keys
696
-
697
- # Remove duplicate keys caused by overlap
698
- unique_chunk_keys = []
699
- seen_keys = set()
700
- for key in chunk_keys:
701
- if key not in seen_keys:
702
- unique_chunk_keys.append(key)
703
- seen_keys.add(key)
704
-
705
- chunk_data = {key: chunkable_data[key] for key in unique_chunk_keys}
706
-
707
- metadata = {
708
- 'chunk_index': (i // step) + 1,
709
- 'total_chunks': (total_keys + step - 1) // step,
710
- 'chunk_method': 'json_dict',
711
- 'max_size': max_size,
712
- 'overlap': overlap,
713
- 'language': 'english', # Assuming English; modify as needed
714
- 'relative_position': (i // step + 1) / ((total_keys + step - 1) // step)
715
- }
716
-
717
- # Merge preserved data into metadata
718
- metadata.update(preserved_data.get('metadata', {}))
719
-
720
- # Create the chunk with preserved data
721
- chunk = {
722
- 'metadata': preserved_data,
723
- 'data': chunk_data
724
- }
725
-
726
- chunks.append({
727
- 'json': chunk,
728
- 'metadata': metadata
729
- })
730
-
731
- logging.debug(f"chunk_json_dict created {len(chunks)} chunks.")
732
- return chunks
733
-
734
-
735
- #
736
- # End of JSON Chunking
737
- #######################################################################################################################
738
-
739
- #######################################################################################################################
740
- #
741
- # OpenAI Rolling Summarization
742
- #
743
-
744
- client = OpenAI(api_key=openai_api_key)
745
- def get_chat_completion(messages, model='gpt-4-turbo'):
746
- response = client.chat.completions.create(
747
- model=model,
748
- messages=messages,
749
- temperature=0,
750
- )
751
- return response.choices[0].message.content
752
-
753
-
754
- # This function combines text chunks into larger blocks without exceeding a specified token count.
755
- # It returns the combined chunks, their original indices, and the number of dropped chunks due to overflow.
756
- def combine_chunks_with_no_minimum(
757
- chunks: List[str],
758
- max_tokens: int,
759
- chunk_delimiter: str = "\n\n",
760
- header: Optional[str] = None,
761
- add_ellipsis_for_overflow: bool = False,
762
- ) -> Tuple[List[str], List[List[int]], int]:
763
- dropped_chunk_count = 0
764
- output = [] # list to hold the final combined chunks
765
- output_indices = [] # list to hold the indices of the final combined chunks
766
- candidate = [header] if header else [] # list to hold the current combined chunk candidate
767
- candidate_indices = []
768
- for chunk_i, chunk in enumerate(chunks):
769
- chunk_with_header = [chunk] if not header else [header, chunk]
770
- combined_text = chunk_delimiter.join(candidate + chunk_with_header)
771
- token_count = len(tokenizer.encode(combined_text))
772
- if token_count > max_tokens:
773
- if add_ellipsis_for_overflow and len(candidate) > 0:
774
- ellipsis_text = chunk_delimiter.join(candidate + ["..."])
775
- if len(tokenizer.encode(ellipsis_text)) <= max_tokens:
776
- candidate = candidate + ["..."]
777
- dropped_chunk_count += 1
778
- if len(candidate) > 0:
779
- output.append(chunk_delimiter.join(candidate))
780
- output_indices.append(candidate_indices)
781
- candidate = chunk_with_header
782
- candidate_indices = [chunk_i]
783
- else:
784
- logging.warning(f"Single chunk at index {chunk_i} exceeds max_tokens and will be dropped.")
785
- dropped_chunk_count += 1
786
- else:
787
- candidate.extend(chunk_with_header)
788
- candidate_indices.append(chunk_i)
789
-
790
- if candidate:
791
- output.append(chunk_delimiter.join(candidate))
792
- output_indices.append(candidate_indices)
793
- return output, output_indices, dropped_chunk_count
794
-
795
-
796
- def rolling_summarize(text: str,
797
- detail: float = 0,
798
- model: str = 'gpt-4o',
799
- additional_instructions: Optional[str] = None,
800
- minimum_chunk_size: Optional[int] = 500,
801
- chunk_delimiter: str = ".",
802
- summarize_recursively: bool = False,
803
- verbose: bool = False) -> str:
804
- """
805
- Summarizes a given text by splitting it into chunks, each of which is summarized individually.
806
- The level of detail in the summary can be adjusted, and the process can optionally be made recursive.
807
-
808
- Parameters:
809
- - text (str): The text to be summarized.
810
- - detail (float, optional): A value between 0 and 1 indicating the desired level of detail in the summary.
811
- - additional_instructions (Optional[str], optional): Additional instructions for the model.
812
- - minimum_chunk_size (Optional[int], optional): The minimum size for text chunks.
813
- - chunk_delimiter (str, optional): The delimiter used to split the text into chunks.
814
- - summarize_recursively (bool, optional): If True, summaries are generated recursively.
815
- - verbose (bool, optional): If True, prints detailed information about the chunking process.
816
-
817
- Returns:
818
- - str: The final compiled summary of the text.
819
-
820
- The function first determines the number of chunks by interpolating between a minimum and a maximum chunk count
821
- based on the `detail` parameter. It then splits the text into chunks and summarizes each chunk. If
822
- `summarize_recursively` is True, each summary is based on the previous summaries, adding more context to the
823
- summarization process. The function returns a compiled summary of all chunks.
824
- """
825
-
826
- # Check detail is set correctly
827
- assert 0 <= detail <= 1, "Detail must be between 0 and 1."
828
-
829
- # Interpolate the number of chunks based on the detail parameter
830
- text_length = len(tokenizer.encode(text))
831
- max_chunks = text_length // minimum_chunk_size if minimum_chunk_size else 10
832
- min_chunks = 1
833
- num_chunks = int(min_chunks + detail * (max_chunks - min_chunks))
834
-
835
- # Adjust chunk_size based on interpolated number of chunks
836
- chunk_size = max(minimum_chunk_size, text_length // num_chunks) if num_chunks else text_length
837
- text_chunks = chunk_on_delimiter(text, chunk_size, chunk_delimiter)
838
- if verbose:
839
- print(f"Splitting the text into {len(text_chunks)} chunks to be summarized.")
840
- print(f"Chunk lengths are {[len(tokenizer.encode(x)) for x in text_chunks]} tokens.")
841
-
842
- # Set system message
843
- system_message_content = "Rewrite this text in summarized form."
844
- if additional_instructions:
845
- system_message_content += f"\n\n{additional_instructions}"
846
-
847
- accumulated_summaries = []
848
- for i, chunk in enumerate(tqdm(text_chunks, desc="Summarizing chunks")):
849
- if summarize_recursively and accumulated_summaries:
850
- # Combine previous summary with current chunk for recursive summarization
851
- combined_text = accumulated_summaries[-1] + "\n\n" + chunk
852
- user_message_content = f"Previous summary and new content to summarize:\n\n{combined_text}"
853
- else:
854
- user_message_content = chunk
855
-
856
- messages = [
857
- {"role": "system", "content": system_message_content},
858
- {"role": "user", "content": user_message_content}
859
- ]
860
-
861
- response = get_chat_completion(messages, model=model)
862
- accumulated_summaries.append(response)
863
-
864
- final_summary = '\n\n'.join(accumulated_summaries)
865
- return final_summary
866
-
867
- #
868
- #
869
- #######################################################################################################################
870
- #
871
- # Ebook Chapter Chunking
872
-
873
-
874
- def chunk_ebook_by_chapters(text: str, chunk_options: Dict[str, Any]) -> List[Dict[str, Any]]:
875
- logging.debug("chunk_ebook_by_chapters")
876
- max_chunk_size = int(chunk_options.get('max_size', 300))
877
- overlap = int(chunk_options.get('overlap', 0))
878
- custom_pattern = chunk_options.get('custom_chapter_pattern', None)
879
-
880
- # List of chapter heading patterns to try, in order
881
- chapter_patterns = [
882
- custom_pattern,
883
- r'^#{1,2}\s+', # Markdown style: '# ' or '## '
884
- r'^Chapter\s+\d+', # 'Chapter ' followed by numbers
885
- r'^\d+\.\s+', # Numbered chapters: '1. ', '2. ', etc.
886
- r'^[A-Z\s]+$' # All caps headings
887
- ]
888
-
889
- chapter_positions = []
890
- used_pattern = None
891
-
892
- for pattern in chapter_patterns:
893
- if pattern is None:
894
- continue
895
- chapter_regex = re.compile(pattern, re.MULTILINE | re.IGNORECASE)
896
- chapter_positions = [match.start() for match in chapter_regex.finditer(text)]
897
- if chapter_positions:
898
- used_pattern = pattern
899
- break
900
-
901
- # If no chapters found, return the entire content as one chunk
902
- if not chapter_positions:
903
- metadata = get_chunk_metadata(
904
- chunk=text,
905
- full_text=text,
906
- chunk_type="whole_document",
907
- language=chunk_options.get('language', 'english')
908
- )
909
- return [{'text': text, 'metadata': metadata}]
910
-
911
- # Split content into chapters
912
- chunks = []
913
- for i in range(len(chapter_positions)):
914
- start = chapter_positions[i]
915
- end = chapter_positions[i + 1] if i + 1 < len(chapter_positions) else None
916
- chapter = text[start:end]
917
-
918
- # Apply overlap if specified
919
- if overlap > 0 and i > 0:
920
- overlap_start = max(0, chapter_positions[i] - overlap)
921
- chapter = text[overlap_start:end]
922
-
923
- chunks.append(chapter)
924
-
925
- # Post-process chunks
926
- processed_chunks = post_process_chunks(chunks)
927
-
928
- # Add metadata to chunks
929
- chunks_with_metadata = []
930
- for i, chunk in enumerate(processed_chunks):
931
- metadata = get_chunk_metadata(
932
- chunk=chunk,
933
- full_text=text,
934
- chunk_type="chapter",
935
- chapter_number=i + 1,
936
- chapter_pattern=used_pattern,
937
- language=chunk_options.get('language', 'english')
938
- )
939
- chunks_with_metadata.append({'text': chunk, 'metadata': metadata})
940
-
941
- return chunks_with_metadata
942
-
943
- #
944
- # End of ebook chapter chunking
945
- #######################################################################################################################
946
-
947
- #######################################################################################################################
948
- #
949
- # Functions for adaptive chunking:
950
-
951
- # FIXME - punkt
952
-
953
- def adaptive_chunk_size(text: str, base_size: int = 1000, min_size: int = 500, max_size: int = 2000) -> int:
954
- # Tokenize the text into sentences
955
- sentences = sent_tokenize(text)
956
-
957
- if not sentences:
958
- return base_size
959
-
960
- # Calculate average sentence length
961
- avg_sentence_length = sum(len(s.split()) for s in sentences) / len(sentences)
962
-
963
- # Adjust chunk size based on average sentence length
964
- if avg_sentence_length < 10:
965
- size_factor = 1.2 # Increase chunk size for short sentences
966
- elif avg_sentence_length > 20:
967
- size_factor = 0.8 # Decrease chunk size for long sentences
968
- else:
969
- size_factor = 1.0
970
-
971
- # Calculate adaptive chunk size
972
- adaptive_size = int(base_size * size_factor)
973
-
974
- # Ensure chunk size is within bounds
975
- return max(min_size, min(adaptive_size, max_size))
976
-
977
-
978
- def adaptive_chunk_size_non_punkt(text: str, base_size: int, min_size: int = 100, max_size: int = 2000) -> int:
979
- # Adaptive logic: adjust chunk size based on text complexity
980
- words = text.split()
981
- if not words:
982
- return base_size # Return base_size if text is empty
983
-
984
- avg_word_length = sum(len(word) for word in words) / len(words)
985
-
986
- if avg_word_length > 6: # Threshold for "complex" text
987
- adjusted_size = int(base_size * 0.8) # Reduce chunk size for complex text
988
- elif avg_word_length < 4: # Threshold for "simple" text
989
- adjusted_size = int(base_size * 1.2) # Increase chunk size for simple text
990
- else:
991
- adjusted_size = base_size
992
-
993
- # Ensure the chunk size is within the specified range
994
- return max(min_size, min(adjusted_size, max_size))
995
-
996
-
997
- def adaptive_chunking(text: str, base_size: int = 1000, min_size: int = 500, max_size: int = 2000) -> List[str]:
998
- logging.debug("adaptive_chunking...")
999
- chunk_size = adaptive_chunk_size(text, base_size, min_size, max_size)
1000
- words = text.split()
1001
- chunks = []
1002
- current_chunk = []
1003
- current_length = 0
1004
-
1005
- for word in words:
1006
- if current_length + len(word) > chunk_size and current_chunk:
1007
- chunks.append(' '.join(current_chunk))
1008
- current_chunk = []
1009
- current_length = 0
1010
- current_chunk.append(word)
1011
- current_length += len(word) + 1 # +1 for space
1012
-
1013
- if current_chunk:
1014
- chunks.append(' '.join(current_chunk))
1015
-
1016
- return chunks
1017
-
1018
- # FIXME - usage example
1019
- # chunk_options = {
1020
- # 'method': 'words', # or any other method
1021
- # 'base_size': 1000,
1022
- # 'min_size': 100,
1023
- # 'max_size': 2000,
1024
- # 'adaptive': True,
1025
- # 'language': 'en'
1026
- # }
1027
- #chunks = improved_chunking_process(your_text, chunk_options)
1028
-
1029
-
1030
- # Example of chunking a document with metadata
1031
- # document_metadata = {
1032
- # 'title': 'Example Document',
1033
- # 'author': 'John Doe',
1034
- # 'creation_date': '2023-06-14',
1035
- # 'source': 'https://example.com/document',
1036
- # 'document_type': 'article'
1037
- # }
1038
- #
1039
- # chunk_options = {
1040
- # 'method': 'sentences',
1041
- # 'base_size': 1000,
1042
- # 'adaptive': True,
1043
- # 'language': 'en'
1044
- # }
1045
- #
1046
- # processed_document = process_document_with_metadata(your_text, chunk_options, document_metadata)
1047
-
1048
-
1049
- #
1050
- # End of Chunking Library
1051
  #######################################################################################################################
 
35
+ def ensure_nltk_data():
36
+ try:
37
+ nltk.data.find('tokenizers/punkt')
38
+ except LookupError:
39
+ nltk.download('punkt')
40
+ #ensure_nltk_data()
41
+
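With the module-level ensure_nltk_data() call commented out above, the punkt data is no longer fetched when Chunk_Lib is imported. Below is a minimal caller-side sketch, assuming the package import path implied by the file's own imports; it is illustrative, not part of the commit.

# Hypothetical caller-side setup (import path assumed from the repo layout).
# ensure_nltk_data() must now be invoked explicitly before sentence-based chunking.
from App_Function_Libraries.Chunk_Lib import ensure_nltk_data, chunk_text_by_sentences

ensure_nltk_data()  # downloads 'punkt' only if nltk.data.find() cannot locate it
chunks = chunk_text_by_sentences(
    "First sentence. Second sentence. Third sentence.",
    max_sentences=2, overlap=0, language='english'
)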
42
+ #
43
+ # Load GPT2 tokenizer
44
+ tokenizer = GPT2Tokenizer.from_pretrained("gpt2")
45
+ #
46
+ # Load configuration
47
+ config = load_comprehensive_config()
48
+ # Embedding Chunking options
49
+ chunk_options = {
50
+ 'method': config.get('Chunking', 'method', fallback='words'),
51
+ 'max_size': config.getint('Chunking', 'max_size', fallback=400),
52
+ 'overlap': config.getint('Chunking', 'overlap', fallback=200),
53
+ 'adaptive': config.getboolean('Chunking', 'adaptive', fallback=False),
54
+ 'multi_level': config.getboolean('Chunking', 'multi_level', fallback=False),
55
+ 'language': config.get('Chunking', 'language', fallback='english')
56
+ }
57
+
58
+ openai_api_key = config.get('API', 'openai_api_key')
59
+ #
60
+ # End of settings
61
+ #######################################################################################################################
62
+ #
63
+ # Functions:
64
+
65
+ # Create a chunking class for refactoring FIXME
66
+ # class Chunker:
67
+ # def __init__(self, tokenizer: GPT2Tokenizer):
68
+ # self.tokenizer = tokenizer
69
+ #
70
+ # def detect_language(self, text: str) -> str:
71
+ # try:
72
+ # return detect(text)
73
+ # except:
74
+ # return 'en'
75
+ #
76
+ # def chunk_text(self, text: str, method: str, max_size: int, overlap: int, language: str = None) -> List[str]:
77
+ # if language is None:
78
+ # language = self.detect_language(text)
79
+ #
80
+ # if method == 'words':
81
+ # return self.chunk_text_by_words(text, max_size, overlap, language)
82
+ # elif method == 'sentences':
83
+ # return self.chunk_text_by_sentences(text, max_size, overlap, language)
84
+ # elif method == 'paragraphs':
85
+ # return self.chunk_text_by_paragraphs(text, max_size, overlap)
86
+ # elif method == 'tokens':
87
+ # return self.chunk_text_by_tokens(text, max_size, overlap, language)
88
+ # elif method == 'semantic':
89
+ # return self.semantic_chunking(text, max_size)
90
+ # else:
91
+ # return [text]
92
+
93
+ def detect_language(text: str) -> str:
94
+ try:
95
+ return detect(text)
96
+ except:
97
+ # Default to English if detection fails
98
+ return 'en'
99
+
100
+
101
+ def load_document(file_path: str) -> str:
102
+ with open(file_path, 'r', encoding='utf-8') as file:
103
+ text = file.read()
104
+ return re.sub(r'\s+', ' ', text).strip()
105
+
106
+
107
+ def improved_chunking_process(text: str, chunk_options: Dict[str, Any] = None) -> List[Dict[str, Any]]:
108
+ logging.debug("Improved chunking process started...")
109
+
110
+ # Extract JSON metadata if present
111
+ json_content = {}
112
+ try:
113
+ json_end = text.index("}\n") + 1
114
+ json_content = json.loads(text[:json_end])
115
+ text = text[json_end:].strip()
116
+ logging.debug(f"Extracted JSON metadata: {json_content}")
117
+ except (ValueError, json.JSONDecodeError):
118
+ logging.debug("No JSON metadata found at the beginning of the text")
119
+
120
+ # Extract any additional header text
121
+ header_match = re.match(r"(This text was transcribed using.*?)\n\n", text, re.DOTALL)
122
+ header_text = ""
123
+ if header_match:
124
+ header_text = header_match.group(1)
125
+ text = text[len(header_text):].strip()
126
+ logging.debug(f"Extracted header text: {header_text}")
127
+
128
+ options = chunk_options.copy() if chunk_options else {}
129
+ if chunk_options:
130
+ options.update(chunk_options)
131
+
132
+ chunk_method = options.get('method', 'words')
133
+ max_size = options.get('max_size', 2000)
134
+ overlap = options.get('overlap', 0)
135
+ language = options.get('language', None)
136
+
137
+ if language is None:
138
+ language = detect_language(text)
139
+
140
+ if chunk_method == 'json':
141
+ chunks = chunk_text_by_json(text, max_size=max_size, overlap=overlap)
142
+ else:
143
+ chunks = chunk_text(text, chunk_method, max_size, overlap, language)
144
+
145
+ chunks_with_metadata = []
146
+ total_chunks = len(chunks)
147
+ for i, chunk in enumerate(chunks):
148
+ metadata = {
149
+ 'chunk_index': i + 1,
150
+ 'total_chunks': total_chunks,
151
+ 'chunk_method': chunk_method,
152
+ 'max_size': max_size,
153
+ 'overlap': overlap,
154
+ 'language': language,
155
+ 'relative_position': (i + 1) / total_chunks
156
+ }
157
+ metadata.update(json_content) # Add the extracted JSON content to metadata
158
+ metadata['header_text'] = header_text # Add the header text to metadata
159
+
160
+ if chunk_method == 'json':
161
+ chunk_text_content = json.dumps(chunk['json'], ensure_ascii=False)
162
+ else:
163
+ chunk_text_content = chunk
164
+
165
+ chunks_with_metadata.append({
166
+ 'text': chunk_text_content,
167
+ 'metadata': metadata
168
+ })
169
+
170
+ return chunks_with_metadata
171
+
172
+
173
+ def multi_level_chunking(text: str, method: str, max_size: int, overlap: int, language: str) -> List[str]:
174
+ logging.debug("Multi-level chunking process started...")
175
+ # First level: chunk by paragraphs
176
+ paragraphs = chunk_text_by_paragraphs(text, max_size * 2, overlap)
177
+
178
+ # Second level: chunk each paragraph further
179
+ chunks = []
180
+ for para in paragraphs:
181
+ if method == 'words':
182
+ chunks.extend(chunk_text_by_words(para, max_words=max_size, overlap=overlap, language=language))
183
+ elif method == 'sentences':
184
+ chunks.extend(chunk_text_by_sentences(para, max_sentences=max_size, overlap=overlap, language=language))
185
+ else:
186
+ chunks.append(para)
187
+
188
+ return chunks
189
+
190
+
191
+ # FIXME - ensure language detection occurs in each chunk function
192
+ def chunk_text(text: str, method: str, max_size: int, overlap: int, language: str = None) -> List[str]:
193
+ if method == 'words':
194
+ logging.debug("Chunking by words...")
195
+ return chunk_text_by_words(text, max_words=max_size, overlap=overlap, language=language)
196
+ elif method == 'sentences':
197
+ logging.debug("Chunking by sentences...")
198
+ return chunk_text_by_sentences(text, max_sentences=max_size, overlap=overlap, language=language)
199
+ elif method == 'paragraphs':
200
+ logging.debug("Chunking by paragraphs...")
201
+ return chunk_text_by_paragraphs(text, max_paragraphs=max_size, overlap=overlap)
202
+ elif method == 'tokens':
203
+ logging.debug("Chunking by tokens...")
204
+ return chunk_text_by_tokens(text, max_tokens=max_size, overlap=overlap)
205
+ elif method == 'semantic':
206
+ logging.debug("Chunking by semantic similarity...")
207
+ return semantic_chunking(text, max_chunk_size=max_size)
208
+ else:
209
+ logging.warning(f"Unknown chunking method '{method}'. Returning full text as a single chunk.")
210
+ return [text]
211
+
212
+ def determine_chunk_position(relative_position: float) -> str:
213
+ if relative_position < 0.33:
214
+ return "This chunk is from the beginning of the document"
215
+ elif relative_position < 0.66:
216
+ return "This chunk is from the middle of the document"
217
+ else:
218
+ return "This chunk is from the end of the document"
219
+
220
+
221
+ def chunk_text_by_words(text: str, max_words: int = 300, overlap: int = 0, language: str = None) -> List[str]:
222
+ logging.debug("chunk_text_by_words...")
223
+ if language is None:
224
+ language = detect_language(text)
225
+
226
+ if language.startswith('zh'): # Chinese
227
+ import jieba
228
+ words = list(jieba.cut(text))
229
+ elif language == 'ja': # Japanese
230
+ import fugashi
231
+ tagger = fugashi.Tagger()
232
+ words = [word.surface for word in tagger(text)]
233
+ else: # Default to simple splitting for other languages
234
+ words = text.split()
235
+
236
+ chunks = []
237
+ for i in range(0, len(words), max_words - overlap):
238
+ chunk = ' '.join(words[i:i + max_words])
239
+ chunks.append(chunk)
240
+ return post_process_chunks(chunks)
241
+
242
+
243
+ def chunk_text_by_sentences(text: str, max_sentences: int = 10, overlap: int = 0, language: str = None) -> List[str]:
244
+ logging.debug("chunk_text_by_sentences...")
245
+ if language is None:
246
+ language = detect_language(text)
247
+
248
+ if language.startswith('zh'): # Chinese
249
+ import jieba
250
+ # Use jieba to perform sentence segmentation
251
+ # jieba does not support sentence segmentation out of the box
252
+ # Use punctuation as delimiters
253
+ sentences = re.split(r'[。!?;]', text)
254
+ sentences = [s.strip() for s in sentences if s.strip()]
255
+ elif language == 'ja': # Japanese
256
+ import fugashi
257
+ tagger = fugashi.Tagger()
258
+ # Simple sentence segmentation based on punctuation
259
+ sentences = re.split(r'[。!?]', text)
260
+ sentences = [s.strip() for s in sentences if s.strip()]
261
+ else: # Default to NLTK for other languages
262
+ try:
263
+ sentences = sent_tokenize(text, language=language)
264
+ except LookupError:
265
+ logging.warning(f"Punkt tokenizer not found for language '{language}'. Using default 'english'.")
266
+ sentences = sent_tokenize(text, language='english')
267
+
268
+ chunks = []
269
+ previous_overlap = []
270
+
271
+ for i in range(0, len(sentences), max_sentences - overlap):
272
+ current_sentences = sentences[i:i + max_sentences]
273
+ if overlap > 0 and previous_overlap:
274
+ current_sentences = previous_overlap + current_sentences
275
+ chunk = ' '.join(current_sentences)
276
+ chunks.append(chunk)
277
+ previous_overlap = sentences[i + max_sentences - overlap:i + max_sentences] if overlap > 0 else []
278
+
279
+ return post_process_chunks(chunks)
280
+
281
+
282
+ def chunk_text_by_paragraphs(text: str, max_paragraphs: int = 5, overlap: int = 0) -> List[str]:
283
+ logging.debug("chunk_text_by_paragraphs...")
284
+ paragraphs = re.split(r'\n\s*\n', text)
285
+ chunks = []
286
+ for i in range(0, len(paragraphs), max_paragraphs - overlap):
287
+ chunk = '\n\n'.join(paragraphs[i:i + max_paragraphs])
288
+ chunks.append(chunk)
289
+ return post_process_chunks(chunks)
290
+
291
+
292
+ def chunk_text_by_tokens(text: str, max_tokens: int = 1000, overlap: int = 0) -> List[str]:
293
+ logging.debug("chunk_text_by_tokens...")
294
+ # This is a simplified token-based chunking. For more accurate tokenization,
295
+ # consider using a proper tokenizer like GPT-2 TokenizerFast
296
+ words = text.split()
297
+ chunks = []
298
+ current_chunk = []
299
+ current_token_count = 0
300
+
301
+ for word in words:
302
+ word_token_count = len(word) // 4 + 1 # Rough estimate of token count
303
+ if current_token_count + word_token_count > max_tokens and current_chunk:
304
+ chunks.append(' '.join(current_chunk))
305
+ current_chunk = current_chunk[-overlap:] if overlap > 0 else []
306
+ current_token_count = sum(len(w) // 4 + 1 for w in current_chunk)
307
+
308
+ current_chunk.append(word)
309
+ current_token_count += word_token_count
310
+
311
+ if current_chunk:
312
+ chunks.append(' '.join(current_chunk))
313
+
314
+ return post_process_chunks(chunks)
315
+ # def chunk_text_by_tokens(text: str, max_tokens: int = 1000, overlap: int = 0) -> List[str]:
316
+ # logging.debug("chunk_text_by_tokens...")
317
+ # # Use GPT2 tokenizer for tokenization
318
+ # tokens = tokenizer.encode(text)
319
+ # chunks = []
320
+ # for i in range(0, len(tokens), max_tokens - overlap):
321
+ # chunk_tokens = tokens[i:i + max_tokens]
322
+ # chunk = tokenizer.decode(chunk_tokens)
323
+ # chunks.append(chunk)
324
+ # return post_process_chunks(chunks)
325
+
326
+
327
+ def post_process_chunks(chunks: List[str]) -> List[str]:
328
+ return [chunk.strip() for chunk in chunks if chunk.strip()]
329
+
330
+
331
+ # FIXME - F
332
+ def get_chunk_metadata(chunk: str, full_text: str, chunk_type: str = "generic",
333
+ chapter_number: Optional[int] = None,
334
+ chapter_pattern: Optional[str] = None,
335
+ language: str = None) -> Dict[str, Any]:
336
+ """
337
+ Generate metadata for a chunk based on its position in the full text.
338
+ """
339
+ chunk_length = len(chunk)
340
+ start_index = full_text.find(chunk)
341
+ end_index = start_index + chunk_length if start_index != -1 else None
342
+
343
+ # Calculate a hash for the chunk
344
+ chunk_hash = hashlib.md5(chunk.encode()).hexdigest()
345
+
346
+ metadata = {
347
+ 'start_index': start_index,
348
+ 'end_index': end_index,
349
+ 'word_count': len(chunk.split()),
350
+ 'char_count': chunk_length,
351
+ 'chunk_type': chunk_type,
352
+ 'language': language,
353
+ 'chunk_hash': chunk_hash,
354
+ 'relative_position': start_index / len(full_text) if len(full_text) > 0 and start_index != -1 else 0
355
+ }
356
+
357
+ if chunk_type == "chapter":
358
+ metadata['chapter_number'] = chapter_number
359
+ metadata['chapter_pattern'] = chapter_pattern
360
+
361
+ return metadata
362
+
363
+
364
+ def process_document_with_metadata(text: str, chunk_options: Dict[str, Any],
365
+ document_metadata: Dict[str, Any]) -> Dict[str, Any]:
366
+ chunks = improved_chunking_process(text, chunk_options)
367
+
368
+ return {
369
+ 'document_metadata': document_metadata,
370
+ 'chunks': chunks
371
+ }
372
+
373
+
374
+ # Hybrid approach, chunk each sentence while ensuring total token size does not exceed a maximum number
375
+ def chunk_text_hybrid(text: str, max_tokens: int = 1000, overlap: int = 0) -> List[str]:
376
+ logging.debug("chunk_text_hybrid...")
377
+ sentences = sent_tokenize(text)
378
+ chunks = []
379
+ current_chunk = []
380
+ current_length = 0
381
+
382
+ for sentence in sentences:
383
+ tokens = tokenizer.encode(sentence)
384
+ if current_length + len(tokens) > max_tokens and current_chunk:
385
+ chunks.append(' '.join(current_chunk))
386
+ # Handle overlap
387
+ if overlap > 0:
388
+ overlap_tokens = tokenizer.encode(' '.join(current_chunk[-overlap:]))
389
+ current_chunk = current_chunk[-overlap:]
390
+ current_length = len(overlap_tokens)
391
+ else:
392
+ current_chunk = []
393
+ current_length = 0
394
+
395
+ current_chunk.append(sentence)
396
+ current_length += len(tokens)
397
+
398
+ if current_chunk:
399
+ chunks.append(' '.join(current_chunk))
400
+
401
+ return post_process_chunks(chunks)
402
+
403
+
404
+ # Thanks openai
405
+ def chunk_on_delimiter(input_string: str,
406
+ max_tokens: int,
407
+ delimiter: str) -> List[str]:
408
+ logging.debug("chunk_on_delimiter...")
409
+ chunks = input_string.split(delimiter)
410
+ combined_chunks, _, dropped_chunk_count = combine_chunks_with_no_minimum(
411
+ chunks, max_tokens, chunk_delimiter=delimiter, add_ellipsis_for_overflow=True)
412
+ if dropped_chunk_count > 0:
413
+ logging.warning(f"Warning: {dropped_chunk_count} chunks were dropped due to exceeding the token limit.")
414
+ combined_chunks = [f"{chunk}{delimiter}" for chunk in combined_chunks]
415
+ return combined_chunks
416
+
417
+
418
+
419
+
420
+ # FIXME
421
+ def recursive_summarize_chunks(chunks: List[str], summarize_func, custom_prompt: Optional[str] = None,
422
+ temp: Optional[float] = None, system_prompt: Optional[str] = None) -> List[str]:
423
+ logging.debug("recursive_summarize_chunks...")
424
+ summarized_chunks = []
425
+ current_summary = ""
426
+
427
+ logging.debug(f"Summarizing {len(chunks)} chunks recursively...")
428
+ logging.debug(f"Temperature is set to {temp}")
429
+ for i, chunk in enumerate(chunks):
430
+ if i == 0:
431
+ current_summary = summarize_func(chunk, custom_prompt, temp, system_prompt)
432
+ else:
433
+ combined_text = current_summary + "\n\n" + chunk
434
+ current_summary = summarize_func(combined_text, custom_prompt, temp, system_prompt)
435
+
436
+ summarized_chunks.append(current_summary)
437
+
438
+ return summarized_chunks
439
+
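+ # Hedged usage sketch; 'summarize_func' here is a hypothetical callable with the
+ # (text, custom_prompt, temp, system_prompt) signature this helper expects:
+ # def summarize_func(text, custom_prompt, temp, system_prompt):
+ # return my_llm_call(text, custom_prompt, temp, system_prompt)  # placeholder, not a real API
+ # summaries = recursive_summarize_chunks(chunks, summarize_func, custom_prompt="Summarize:", temp=0.3)
+ # summaries[-1] is the summary that has folded in every chunk so far.
+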
440
+
441
+ # Sample text for testing
442
+ sample_text = """
443
+ Natural language processing (NLP) is a subfield of linguistics, computer science, and artificial intelligence
444
+ concerned with the interactions between computers and human language, in particular how to program computers
445
+ to process and analyze large amounts of natural language data. The result is a computer capable of "understanding"
446
+ the contents of documents, including the contextual nuances of the language within them. The technology can then
447
+ accurately extract information and insights contained in the documents as well as categorize and organize the documents themselves.
448
+
449
+ Challenges in natural language processing frequently involve speech recognition, natural language understanding,
450
+ and natural language generation.
451
+
452
+ Natural language processing has its roots in the 1950s. Already in 1950, Alan Turing published an article titled
453
+ "Computing Machinery and Intelligence" which proposed what is now called the Turing test as a criterion of intelligence.
454
+ """
455
+
456
+ # Example usage of different chunking methods
457
+ # print("Chunking by words:")
458
+ # print(chunk_text_by_words(sample_text, max_words=50))
459
+ #
460
+ # print("\nChunking by sentences:")
461
+ # print(chunk_text_by_sentences(sample_text, max_sentences=2))
462
+ #
463
+ # print("\nChunking by paragraphs:")
464
+ # print(chunk_text_by_paragraphs(sample_text, max_paragraphs=1))
465
+ #
466
+ # print("\nChunking by tokens:")
467
+ # print(chunk_text_by_tokens(sample_text, max_tokens=50))
468
+ #
469
+ # print("\nHybrid chunking:")
470
+ # print(chunk_text_hybrid(sample_text, max_tokens=50))
471
+
472
+
473
+
474
+ #######################################################################################################################
475
+ #
476
+ # Experimental Semantic Chunking
477
+ #
478
+
479
+ # Chunk text into segments based on semantic similarity
480
+ def count_units(text: str, unit: str = 'words') -> int:
481
+ if unit == 'words':
482
+ return len(text.split())
483
+ elif unit == 'tokens':
484
+ return len(tokenizer.encode(text))
485
+ elif unit == 'characters':
486
+ return len(text)
487
+ else:
488
+ raise ValueError("Invalid unit. Choose 'words', 'tokens', or 'characters'.")
489
+
490
+
491
+
492
+ def semantic_chunking(text: str, max_chunk_size: int = 2000, unit: str = 'words') -> List[str]:
493
+ logging.debug("semantic_chunking...")
494
+ sentences = sent_tokenize(text)
495
+ vectorizer = TfidfVectorizer()
496
+ sentence_vectors = vectorizer.fit_transform(sentences)
497
+
498
+ chunks = []
499
+ current_chunk = []
500
+ current_size = 0
501
+
502
+ for i, sentence in enumerate(sentences):
503
+ sentence_size = count_units(sentence, unit)
504
+ if current_size + sentence_size > max_chunk_size and current_chunk:
505
+ chunks.append(' '.join(current_chunk))
506
+ # Use last 3 sentences for overlap
507
+ current_chunk = current_chunk[-3:]
508
+ current_size = count_units(' '.join(current_chunk), unit)
509
+
510
+ current_chunk.append(sentence)
511
+ current_size += sentence_size
512
+
513
+ if i + 1 < len(sentences):
514
+ current_vector = sentence_vectors[i]
515
+ next_vector = sentence_vectors[i + 1]
516
+ similarity = cosine_similarity(current_vector, next_vector)[0][0]
517
+ if similarity < 0.5 and current_size >= max_chunk_size // 2:
518
+ chunks.append(' '.join(current_chunk))
519
+ current_chunk = current_chunk[-3:]
520
+ current_size = count_units(' '.join(current_chunk), unit)
521
+
522
+ if current_chunk:
523
+ chunks.append(' '.join(current_chunk))
524
+
525
+ return chunks
526
+
527
+
528
+ # NOTE: 'overlap' is accepted for signature consistency but is not currently used; semantic_chunking applies its own three-sentence overlap
+ def semantic_chunk_long_file(file_path: str, max_chunk_size: int = 1000, overlap: int = 100, unit: str = 'words') -> Optional[List[str]]:
529
+ logging.debug("semantic_chunk_long_file...")
530
+ try:
531
+ with open(file_path, 'r', encoding='utf-8') as file:
532
+ content = file.read()
533
+
534
+ chunks = semantic_chunking(content, max_chunk_size, unit)
535
+ return chunks
536
+ except Exception as e:
537
+ logging.error(f"Error chunking text file: {str(e)}")
538
+ return None
539
+
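+ # Hedged usage sketch for the semantic chunkers (sample_text is defined above;
+ # the file path is purely illustrative):
+ # semantic_chunks = semantic_chunking(sample_text, max_chunk_size=100, unit='words')
+ # file_chunks = semantic_chunk_long_file('/path/to/long_document.txt', max_chunk_size=1000, unit='words')
+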
540
+ #
541
+ #
542
+ #######################################################################################################################
543
+
544
+
545
+ #######################################################################################################################
546
+ #
547
+ # Embedding Chunking
548
+
549
+ def chunk_for_embedding(text: str, file_name: str, custom_chunk_options: Optional[Dict[str, Any]] = None) -> List[Dict[str, Any]]:
550
+ options = chunk_options.copy()
551
+ if custom_chunk_options:
552
+ options.update(custom_chunk_options)
553
+
554
+ logging.info(f"Chunking options: {options}")
555
+ chunks = improved_chunking_process(text, options)
556
+ total_chunks = len(chunks)
557
+ logging.info(f"Total chunks created: {total_chunks}")
558
+
559
+ chunked_text_with_headers = []
560
+ for i, chunk in enumerate(chunks, 1):
561
+ chunk_text = chunk['text']
562
+ chunk_position = determine_chunk_position(chunk['metadata']['relative_position'])
563
+ chunk_header = f"""
564
+ Original Document: {file_name}
565
+ Chunk: {i} of {total_chunks}
566
+ Position: {chunk_position}
567
+
568
+ --- Chunk Content ---
569
+ """
570
+
571
+ full_chunk_text = chunk_header + chunk_text
572
+ chunk['text'] = full_chunk_text
573
+ chunk['metadata']['file_name'] = file_name
574
+ chunked_text_with_headers.append(chunk)
575
+
576
+ return chunked_text_with_headers
577
+
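+ # Hedged usage sketch; the file name and option values are illustrative only:
+ # embedding_chunks = chunk_for_embedding(sample_text, "sample.txt", {'method': 'sentences', 'max_size': 300, 'overlap': 2})
+ # Each element is a dict whose 'text' carries the generated header plus the chunk body,
+ # and whose 'metadata' includes 'file_name' and 'relative_position'.
+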
578
+ #
579
+ # End of Embedding Chunking
580
+ #######################################################################################################################
581
+
582
+
583
+ #######################################################################################################################
584
+ #
585
+ # JSON Chunking
586
+
587
+ # FIXME
588
+ def chunk_text_by_json(text: str, max_size: int = 1000, overlap: int = 0) -> List[Dict[str, Any]]:
589
+ """
590
+ Chunk JSON-formatted text into smaller JSON chunks while preserving structure.
591
+
592
+ Parameters:
593
+ - text (str): The JSON-formatted text to be chunked.
594
+ - max_size (int): Maximum number of items or characters per chunk.
595
+ - overlap (int): Number of items or characters to overlap between chunks.
596
+
597
+ Returns:
598
+ - List[Dict[str, Any]]: A list of chunks with their metadata.
599
+ """
600
+ logging.debug("chunk_text_by_json started...")
601
+ try:
602
+ json_data = json.loads(text)
603
+ except json.JSONDecodeError as e:
604
+ logging.error(f"Invalid JSON data: {e}")
605
+ raise ValueError(f"Invalid JSON data: {e}")
606
+
607
+ # Determine if JSON data is a list or a dict
608
+ if isinstance(json_data, list):
609
+ return chunk_json_list(json_data, max_size, overlap)
610
+ elif isinstance(json_data, dict):
611
+ return chunk_json_dict(json_data, max_size, overlap)
612
+ else:
613
+ logging.error("Unsupported JSON structure. Only JSON objects and arrays are supported.")
614
+ raise ValueError("Unsupported JSON structure. Only JSON objects and arrays are supported.")
615
+
616
+
617
+ def chunk_json_list(json_list: List[Any], max_size: int, overlap: int) -> List[Dict[str, Any]]:
618
+ """
619
+ Chunk a JSON array into smaller chunks.
620
+
621
+ Parameters:
622
+ - json_list (List[Any]): The JSON array to be chunked.
623
+ - max_size (int): Maximum number of items per chunk.
624
+ - overlap (int): Number of items to overlap between chunks.
625
+
626
+ Returns:
627
+ - List[Dict[str, Any]]: A list of JSON chunks with metadata.
628
+ """
629
+ logging.debug("chunk_json_list started...")
630
+ chunks = []
631
+ total_items = len(json_list)
632
+ step = max_size - overlap
633
+ if step <= 0:
634
+ raise ValueError("max_size must be greater than overlap.")
635
+
636
+ for i in range(0, total_items, step):
637
+ chunk = json_list[i:i + max_size]
638
+ metadata = {
639
+ 'chunk_index': i // step + 1,
640
+ 'total_chunks': (total_items + step - 1) // step,
641
+ 'chunk_method': 'json_list',
642
+ 'max_size': max_size,
643
+ 'overlap': overlap,
644
+ 'relative_position': i / total_items
645
+ }
646
+ chunks.append({
647
+ 'json': chunk,
648
+ 'metadata': metadata
649
+ })
650
+
651
+ logging.debug(f"chunk_json_list created {len(chunks)} chunks.")
652
+ return chunks
653
+
654
+
655
+
656
+ def chunk_json_dict(json_dict: Dict[str, Any], max_size: int, overlap: int) -> List[Dict[str, Any]]:
657
+ """
658
+ Chunk a JSON object into smaller chunks based on its 'data' key while preserving other keys like 'metadata'.
659
+
660
+ Parameters:
661
+ - json_dict (Dict[str, Any]): The JSON object to be chunked.
662
+ - max_size (int): Maximum number of key-value pairs per chunk in the 'data' section.
663
+ - overlap (int): Number of key-value pairs to overlap between chunks.
664
+
665
+ Returns:
666
+ - List[Dict[str, Any]]: A list of JSON chunks with metadata.
667
+ """
668
+ logging.debug("chunk_json_dict started...")
669
+
670
+ # Preserve non-chunked sections
671
+ preserved_keys = ['metadata']
672
+ preserved_data = {key: value for key, value in json_dict.items() if key in preserved_keys}
673
+
674
+ # Identify the chunkable section
675
+ chunkable_key = 'data'
676
+ if chunkable_key not in json_dict or not isinstance(json_dict[chunkable_key], dict):
677
+ logging.error("No chunkable 'data' section found in JSON dictionary.")
678
+ raise ValueError("No chunkable 'data' section found in JSON dictionary.")
679
+
680
+ chunkable_data = json_dict[chunkable_key]
681
+ data_keys = list(chunkable_data.keys())
682
+ total_keys = len(data_keys)
683
+ chunks = []
684
+ step = max_size - overlap
685
+ if step <= 0:
686
+ raise ValueError("max_size must be greater than overlap.")
687
+
688
+ # Step through the data keys in strides of (max_size - overlap)
689
+ for i in range(0, total_keys, step):
690
+ chunk_keys = data_keys[i:i + max_size]
691
+
692
+ # Handle overlap (note: the stride above already overlaps consecutive slices by 'overlap' keys, so this prepend can grow a chunk to max_size + overlap keys)
693
+ if i != 0 and overlap > 0:
694
+ overlap_keys = data_keys[i - overlap:i]
695
+ chunk_keys = overlap_keys + chunk_keys
696
+
697
+ # Remove duplicate keys caused by overlap
698
+ unique_chunk_keys = []
699
+ seen_keys = set()
700
+ for key in chunk_keys:
701
+ if key not in seen_keys:
702
+ unique_chunk_keys.append(key)
703
+ seen_keys.add(key)
704
+
705
+ chunk_data = {key: chunkable_data[key] for key in unique_chunk_keys}
706
+
707
+ metadata = {
708
+ 'chunk_index': (i // step) + 1,
709
+ 'total_chunks': (total_keys + step - 1) // step,
710
+ 'chunk_method': 'json_dict',
711
+ 'max_size': max_size,
712
+ 'overlap': overlap,
713
+ 'language': 'english', # Assuming English; modify as needed
714
+ 'relative_position': (i // step + 1) / ((total_keys + step - 1) // step)
715
+ }
716
+
717
+ # Merge preserved data into metadata
718
+ metadata.update(preserved_data.get('metadata', {}))
719
+
720
+ # Create the chunk with preserved data
721
+ chunk = {
722
+ 'metadata': preserved_data,
723
+ 'data': chunk_data
724
+ }
725
+
726
+ chunks.append({
727
+ 'json': chunk,
728
+ 'metadata': metadata
729
+ })
730
+
731
+ logging.debug(f"chunk_json_dict created {len(chunks)} chunks.")
732
+ return chunks
733
+
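+ # Hedged usage sketch with a made-up payload:
+ # json_text = json.dumps({'metadata': {'source': 'example'}, 'data': {f'key{i}': i for i in range(10)}})
+ # json_chunks = chunk_text_by_json(json_text, max_size=4, overlap=1)
+ # Each result pairs a 'json' chunk (preserved 'metadata' plus a slice of 'data') with its own 'metadata' dict.
+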
734
+
735
+ #
736
+ # End of JSON Chunking
737
+ #######################################################################################################################
738
+
739
+ #######################################################################################################################
740
+ #
741
+ # OpenAI Rolling Summarization
742
+ #
743
+
744
+ client = OpenAI(api_key=openai_api_key)
745
+ def get_chat_completion(messages, model='gpt-4-turbo'):
746
+ response = client.chat.completions.create(
747
+ model=model,
748
+ messages=messages,
749
+ temperature=0,
750
+ )
751
+ return response.choices[0].message.content
752
+
753
+
754
+ # This function combines text chunks into larger blocks without exceeding a specified token count.
755
+ # It returns the combined chunks, their original indices, and the number of dropped chunks due to overflow.
756
+ def combine_chunks_with_no_minimum(
757
+ chunks: List[str],
758
+ max_tokens: int,
759
+ chunk_delimiter: str = "\n\n",
760
+ header: Optional[str] = None,
761
+ add_ellipsis_for_overflow: bool = False,
762
+ ) -> Tuple[List[str], List[List[int]], int]:
763
+ dropped_chunk_count = 0
764
+ output = [] # list to hold the final combined chunks
765
+ output_indices = [] # list to hold the indices of the final combined chunks
766
+ candidate = [header] if header else [] # list to hold the current combined chunk candidate
767
+ candidate_indices = []
768
+ for chunk_i, chunk in enumerate(chunks):
769
+ chunk_with_header = [chunk] if not header else [header, chunk]
770
+ combined_text = chunk_delimiter.join(candidate + chunk_with_header)
771
+ token_count = len(tokenizer.encode(combined_text))
772
+ if token_count > max_tokens:
773
+ if add_ellipsis_for_overflow and len(candidate) > 0:
774
+ ellipsis_text = chunk_delimiter.join(candidate + ["..."])
775
+ if len(tokenizer.encode(ellipsis_text)) <= max_tokens:
776
+ candidate = candidate + ["..."]
777
+ # The overflowing combination is flushed below and the chunk itself becomes the next candidate, so it is not counted as dropped here
778
+ if len(candidate) > 0:
779
+ output.append(chunk_delimiter.join(candidate))
780
+ output_indices.append(candidate_indices)
781
+ candidate = chunk_with_header
782
+ candidate_indices = [chunk_i]
783
+ else:
784
+ logging.warning(f"Single chunk at index {chunk_i} exceeds max_tokens and will be dropped.")
785
+ dropped_chunk_count += 1
786
+ else:
787
+ candidate.extend(chunk_with_header)
788
+ candidate_indices.append(chunk_i)
789
+
790
+ if candidate:
791
+ output.append(chunk_delimiter.join(candidate))
792
+ output_indices.append(candidate_indices)
793
+ return output, output_indices, dropped_chunk_count
794
+
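+ # Hedged usage sketch, packing sentence fragments from the sample_text defined above:
+ # parts = sample_text.split(". ")
+ # packed, indices, dropped = combine_chunks_with_no_minimum(parts, max_tokens=128, chunk_delimiter=". ")
+ # 'indices' maps each packed block back to the positions of the source chunks it contains.
+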
795
+
796
+ def rolling_summarize(text: str,
797
+ detail: float = 0,
798
+ model: str = 'gpt-4o',
799
+ additional_instructions: Optional[str] = None,
800
+ minimum_chunk_size: Optional[int] = 500,
801
+ chunk_delimiter: str = ".",
802
+ summarize_recursively: bool = False,
803
+ verbose: bool = False) -> str:
804
+ """
805
+ Summarizes a given text by splitting it into chunks, each of which is summarized individually.
806
+ The level of detail in the summary can be adjusted, and the process can optionally be made recursive.
807
+
808
+ Parameters:
809
+ - text (str): The text to be summarized.
810
+ - detail (float, optional): A value between 0 and 1 indicating the desired level of detail in the summary.
811
+ - model (str, optional): The chat model used to summarize each chunk.
+ - additional_instructions (Optional[str], optional): Additional instructions for the model.
812
+ - minimum_chunk_size (Optional[int], optional): The minimum size for text chunks.
813
+ - chunk_delimiter (str, optional): The delimiter used to split the text into chunks.
814
+ - summarize_recursively (bool, optional): If True, summaries are generated recursively.
815
+ - verbose (bool, optional): If True, prints detailed information about the chunking process.
816
+
817
+ Returns:
818
+ - str: The final compiled summary of the text.
819
+
820
+ The function first determines the number of chunks by interpolating between a minimum and a maximum chunk count
821
+ based on the `detail` parameter. It then splits the text into chunks and summarizes each chunk. If
822
+ `summarize_recursively` is True, each summary is based on the previous summaries, adding more context to the
823
+ summarization process. The function returns a compiled summary of all chunks.
824
+ """
825
+
826
+ # Check detail is set correctly
827
+ assert 0 <= detail <= 1, "Detail must be between 0 and 1."
828
+
829
+ # Interpolate the number of chunks based on the detail parameter
830
+ text_length = len(tokenizer.encode(text))
831
+ max_chunks = text_length // minimum_chunk_size if minimum_chunk_size else 10
832
+ min_chunks = 1
833
+ num_chunks = int(min_chunks + detail * (max_chunks - min_chunks))
834
+
835
+ # Adjust chunk_size based on interpolated number of chunks
836
+ chunk_size = max(minimum_chunk_size, text_length // num_chunks) if num_chunks else text_length
837
+ text_chunks = chunk_on_delimiter(text, chunk_size, chunk_delimiter)
838
+ if verbose:
839
+ print(f"Splitting the text into {len(text_chunks)} chunks to be summarized.")
840
+ print(f"Chunk lengths are {[len(tokenizer.encode(x)) for x in text_chunks]} tokens.")
841
+
842
+ # Set system message
843
+ system_message_content = "Rewrite this text in summarized form."
844
+ if additional_instructions:
845
+ system_message_content += f"\n\n{additional_instructions}"
846
+
847
+ accumulated_summaries = []
848
+ for i, chunk in enumerate(tqdm(text_chunks, desc="Summarizing chunks")):
849
+ if summarize_recursively and accumulated_summaries:
850
+ # Combine previous summary with current chunk for recursive summarization
851
+ combined_text = accumulated_summaries[-1] + "\n\n" + chunk
852
+ user_message_content = f"Previous summary and new content to summarize:\n\n{combined_text}"
853
+ else:
854
+ user_message_content = chunk
855
+
856
+ messages = [
857
+ {"role": "system", "content": system_message_content},
858
+ {"role": "user", "content": user_message_content}
859
+ ]
860
+
861
+ response = get_chat_completion(messages, model=model)
862
+ accumulated_summaries.append(response)
863
+
864
+ final_summary = '\n\n'.join(accumulated_summaries)
865
+ return final_summary
866
+
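+ # Hedged usage sketch (requires a valid OpenAI API key in the config; token cost scales with input length):
+ # summary = rolling_summarize(sample_text, detail=0.5, model='gpt-4o', summarize_recursively=True, verbose=True)
+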
867
+ #
868
+ #
869
+ #######################################################################################################################
870
+ #
871
+ # Ebook Chapter Chunking
872
+
873
+
874
+ def chunk_ebook_by_chapters(text: str, chunk_options: Dict[str, Any]) -> List[Dict[str, Any]]:
875
+ logging.debug("chunk_ebook_by_chapters")
876
+ max_chunk_size = int(chunk_options.get('max_size', 300))  # read for reference only; chapters are not split further by size here
877
+ overlap = int(chunk_options.get('overlap', 0))
878
+ custom_pattern = chunk_options.get('custom_chapter_pattern', None)
879
+
880
+ # List of chapter heading patterns to try, in order
881
+ chapter_patterns = [
882
+ custom_pattern,
883
+ r'^#{1,2}\s+', # Markdown style: '# ' or '## '
884
+ r'^Chapter\s+\d+', # 'Chapter ' followed by numbers
885
+ r'^\d+\.\s+', # Numbered chapters: '1. ', '2. ', etc.
886
+ r'^[A-Z\s]+$' # All caps headings
887
+ ]
888
+
889
+ chapter_positions = []
890
+ used_pattern = None
891
+
892
+ for pattern in chapter_patterns:
893
+ if pattern is None:
894
+ continue
895
+ # Keep the all-caps heading pattern case-sensitive; applying IGNORECASE to it would match any letters-only line
+ chapter_regex = re.compile(pattern, re.MULTILINE if pattern == r'^[A-Z\s]+$' else re.MULTILINE | re.IGNORECASE)
896
+ chapter_positions = [match.start() for match in chapter_regex.finditer(text)]
897
+ if chapter_positions:
898
+ used_pattern = pattern
899
+ break
900
+
901
+ # If no chapters found, return the entire content as one chunk
902
+ if not chapter_positions:
903
+ metadata = get_chunk_metadata(
904
+ chunk=text,
905
+ full_text=text,
906
+ chunk_type="whole_document",
907
+ language=chunk_options.get('language', 'english')
908
+ )
909
+ return [{'text': text, 'metadata': metadata}]
910
+
911
+ # Split content into chapters
912
+ chunks = []
913
+ for i in range(len(chapter_positions)):
914
+ start = chapter_positions[i]
915
+ end = chapter_positions[i + 1] if i + 1 < len(chapter_positions) else None
916
+ chapter = text[start:end]
917
+
918
+ # Apply overlap if specified
919
+ if overlap > 0 and i > 0:
920
+ overlap_start = max(0, chapter_positions[i] - overlap)
921
+ chapter = text[overlap_start:end]
922
+
923
+ chunks.append(chapter)
924
+
925
+ # Post-process chunks
926
+ processed_chunks = post_process_chunks(chunks)
927
+
928
+ # Add metadata to chunks
929
+ chunks_with_metadata = []
930
+ for i, chunk in enumerate(processed_chunks):
931
+ metadata = get_chunk_metadata(
932
+ chunk=chunk,
933
+ full_text=text,
934
+ chunk_type="chapter",
935
+ chapter_number=i + 1,
936
+ chapter_pattern=used_pattern,
937
+ language=chunk_options.get('language', 'english')
938
+ )
939
+ chunks_with_metadata.append({'text': chunk, 'metadata': metadata})
940
+
941
+ return chunks_with_metadata
942
+
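+ # Hedged usage sketch; 'ebook_text' is a hypothetical variable holding the full plain-text ebook
+ # and the custom pattern is illustrative:
+ # chapter_chunks = chunk_ebook_by_chapters(ebook_text, {'max_size': 500, 'overlap': 0, 'custom_chapter_pattern': r'^CHAPTER [IVXLC]+'})
+ # When at least one heading matches, each chunk's metadata carries 'chapter_number' and 'chapter_pattern'.
+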
943
+ #
944
+ # End of ebook chapter chunking
945
+ #######################################################################################################################
946
+
947
+ #######################################################################################################################
948
+ #
949
+ # Functions for adaptive chunking:
950
+
951
+ # FIXME - punkt
952
+
953
+ def adaptive_chunk_size(text: str, base_size: int = 1000, min_size: int = 500, max_size: int = 2000) -> int:
954
+ # Tokenize the text into sentences
955
+ sentences = sent_tokenize(text)
956
+
957
+ if not sentences:
958
+ return base_size
959
+
960
+ # Calculate average sentence length
961
+ avg_sentence_length = sum(len(s.split()) for s in sentences) / len(sentences)
962
+
963
+ # Adjust chunk size based on average sentence length
964
+ if avg_sentence_length < 10:
965
+ size_factor = 1.2 # Increase chunk size for short sentences
966
+ elif avg_sentence_length > 20:
967
+ size_factor = 0.8 # Decrease chunk size for long sentences
968
+ else:
969
+ size_factor = 1.0
970
+
971
+ # Calculate adaptive chunk size
972
+ adaptive_size = int(base_size * size_factor)
973
+
974
+ # Ensure chunk size is within bounds
975
+ return max(min_size, min(adaptive_size, max_size))
976
+
977
+
978
+ def adaptive_chunk_size_non_punkt(text: str, base_size: int, min_size: int = 100, max_size: int = 2000) -> int:
979
+ # Adaptive logic: adjust chunk size based on text complexity
980
+ words = text.split()
981
+ if not words:
982
+ return base_size # Return base_size if text is empty
983
+
984
+ avg_word_length = sum(len(word) for word in words) / len(words)
985
+
986
+ if avg_word_length > 6: # Threshold for "complex" text
987
+ adjusted_size = int(base_size * 0.8) # Reduce chunk size for complex text
988
+ elif avg_word_length < 4: # Threshold for "simple" text
989
+ adjusted_size = int(base_size * 1.2) # Increase chunk size for simple text
990
+ else:
991
+ adjusted_size = base_size
992
+
993
+ # Ensure the chunk size is within the specified range
994
+ return max(min_size, min(adjusted_size, max_size))
995
+
996
+
997
+ def adaptive_chunking(text: str, base_size: int = 1000, min_size: int = 500, max_size: int = 2000) -> List[str]:
998
+ logging.debug("adaptive_chunking...")
999
+ chunk_size = adaptive_chunk_size(text, base_size, min_size, max_size)
1000
+ words = text.split()
1001
+ chunks = []
1002
+ current_chunk = []
1003
+ current_length = 0
1004
+
1005
+ for word in words:
1006
+ if current_length + len(word) > chunk_size and current_chunk:
1007
+ chunks.append(' '.join(current_chunk))
1008
+ current_chunk = []
1009
+ current_length = 0
1010
+ current_chunk.append(word)
1011
+ current_length += len(word) + 1 # +1 for space
1012
+
1013
+ if current_chunk:
1014
+ chunks.append(' '.join(current_chunk))
1015
+
1016
+ return chunks
1017
+
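+ # Hedged direct usage sketch with the sample_text defined above:
+ # adaptive_chunks = adaptive_chunking(sample_text, base_size=1000, min_size=500, max_size=2000)
+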
1018
+ # FIXME - usage example
1019
+ # chunk_options = {
1020
+ # 'method': 'words', # or any other method
1021
+ # 'base_size': 1000,
1022
+ # 'min_size': 100,
1023
+ # 'max_size': 2000,
1024
+ # 'adaptive': True,
1025
+ # 'language': 'en'
1026
+ # }
1027
+ #chunks = improved_chunking_process(your_text, chunk_options)
1028
+
1029
+
1030
+ # Example of chunking a document with metadata
1031
+ # document_metadata = {
1032
+ # 'title': 'Example Document',
1033
+ # 'author': 'John Doe',
1034
+ # 'creation_date': '2023-06-14',
1035
+ # 'source': 'https://example.com/document',
1036
+ # 'document_type': 'article'
1037
+ # }
1038
+ #
1039
+ # chunk_options = {
1040
+ # 'method': 'sentences',
1041
+ # 'base_size': 1000,
1042
+ # 'adaptive': True,
1043
+ # 'language': 'en'
1044
+ # }
1045
+ #
1046
+ # processed_document = process_document_with_metadata(your_text, chunk_options, document_metadata)
1047
+
1048
+
1049
+ #
1050
+ # End of Chunking Library
1051
  #######################################################################################################################