oceansweep committed on
Commit
9f31476
·
verified ·
1 Parent(s): dc349d8

Delete App_Function_Libraries/Old_Chunking_Lib.py

Browse files
App_Function_Libraries/Old_Chunking_Lib.py DELETED
@@ -1,159 +0,0 @@
1
- # Old_Chunking_Lib.py
2
- #########################################
3
- # Old Chunking Library
4
- # This library is used to handle chunking of text for summarization.
5
- #
6
- ####
7
- import logging
8
- ####################
9
- # Function List
10
- #
11
- # 1. chunk_transcript(transcript: str, chunk_duration: int, words_per_second) -> List[str]
12
- # 2. summarize_chunks(api_name: str, api_key: str, transcript: List[dict], chunk_duration: int, words_per_second: int) -> str
13
- # 3. get_chat_completion(messages, model='gpt-4-turbo')
14
- # 4. chunk_on_delimiter(input_string: str, max_tokens: int, delimiter: str) -> List[str]
15
- # 5. combine_chunks_with_no_minimum(chunks: List[str], max_tokens: int, chunk_delimiter="\n\n", header: Optional[str] = None, add_ellipsis_for_overflow=False) -> Tuple[List[str], List[int]]
16
- # 6. rolling_summarize(text: str, detail: float = 0, model: str = 'gpt-4-turbo', additional_instructions: Optional[str] = None, minimum_chunk_size: Optional[int] = 500, chunk_delimiter: str = ".", summarize_recursively=False, verbose=False)
17
- # 7. chunk_transcript(transcript: str, chunk_duration: int, words_per_second) -> List[str]
18
- # 8. summarize_chunks(api_name: str, api_key: str, transcript: List[dict], chunk_duration: int, words_per_second: int) -> str
19
- #
20
- ####################
21
-
22
- # Import necessary libraries
23
- import os
24
- from typing import Optional, List, Tuple
25
- #
26
- # Import 3rd party
27
- from openai import OpenAI
28
- from App_Function_Libraries.Tokenization_Methods_Lib import openai_tokenize
29
- #
30
- # Import Local
31
- #
32
- #######################################################################################################################
33
- # Function Definitions
34
- #
35
-
36
- ######### Words-per-second Chunking #########
37
- def chunk_transcript(transcript: str, chunk_duration: int, words_per_second) -> List[str]:
38
- words = transcript.split()
39
- words_per_chunk = chunk_duration * words_per_second
40
- chunks = [' '.join(words[i:i + words_per_chunk]) for i in range(0, len(words), words_per_chunk)]
41
- return chunks
42
-
43
-
44
- # def summarize_chunks(api_name: str, api_key: str, transcript: List[dict], chunk_duration: int,
45
- # words_per_second: int) -> str:
46
- # if api_name not in summarizers: # See 'summarizers' dict in the main script
47
- # return f"Unsupported API: {api_name}"
48
- #
49
- # summarizer = summarizers[api_name]
50
- # text = extract_text_from_segments(transcript)
51
- # chunks = chunk_transcript(text, chunk_duration, words_per_second)
52
- #
53
- # summaries = []
54
- # for chunk in chunks:
55
- # if api_name == 'openai':
56
- # # Ensure the correct model and prompt are passed
57
- # summaries.append(summarizer(api_key, chunk, custom_prompt))
58
- # else:
59
- # summaries.append(summarizer(api_key, chunk))
60
- #
61
- # return "\n\n".join(summaries)
62
-
63
-
64
- ################## ####################
65
-
66
-
67
- ######### Token-size Chunking ######### FIXME - OpenAI only currently
68
- # This is dirty and shameful and terrible. It should be replaced with a proper implementation.
69
- # anyways lets get to it....
70
- openai_api_key = "Fake_key" # FIXME
71
- client = OpenAI(api_key=openai_api_key)
72
-
73
-
74
-
75
-
76
-
77
- # This function chunks a text into smaller pieces based on a maximum token count and a delimiter
78
- def chunk_on_delimiter(input_string: str,
79
- max_tokens: int,
80
- delimiter: str) -> List[str]:
81
- chunks = input_string.split(delimiter)
82
- combined_chunks, _, dropped_chunk_count = combine_chunks_with_no_minimum(
83
- chunks, max_tokens, chunk_delimiter=delimiter, add_ellipsis_for_overflow=True)
84
- if dropped_chunk_count > 0:
85
- print(f"Warning: {dropped_chunk_count} chunks were dropped due to exceeding the token limit.")
86
- combined_chunks = [f"{chunk}{delimiter}" for chunk in combined_chunks]
87
- return combined_chunks
88
-
89
-
90
-
91
-
92
-
93
- #######################################
94
-
95
-
96
- ######### Words-per-second Chunking #########
97
- # FIXME - Whole section needs to be re-written
98
- def chunk_transcript(transcript: str, chunk_duration: int, words_per_second) -> List[str]:
99
- words = transcript.split()
100
- words_per_chunk = chunk_duration * words_per_second
101
- chunks = [' '.join(words[i:i + words_per_chunk]) for i in range(0, len(words), words_per_chunk)]
102
- return chunks
103
-
104
-
105
- # def summarize_chunks(api_name: str, api_key: str, transcript: List[dict], chunk_duration: int,
106
- # words_per_second: int) -> str:
107
- # if api_name not in summarizers: # See 'summarizers' dict in the main script
108
- # return f"Unsupported API: {api_name}"
109
- #
110
- # if not transcript:
111
- # logging.error("Empty or None transcript provided to summarize_chunks")
112
- # return "Error: Empty or None transcript provided"
113
- #
114
- # text = extract_text_from_segments(transcript)
115
- # chunks = chunk_transcript(text, chunk_duration, words_per_second)
116
- #
117
- # #FIXME
118
- # custom_prompt = args.custom_prompt
119
- #
120
- # summaries = []
121
- # for chunk in chunks:
122
- # if api_name == 'openai':
123
- # # Ensure the correct model and prompt are passed
124
- # summaries.append(summarize_with_openai(api_key, chunk, custom_prompt))
125
- # elif api_name == 'anthropic':
126
- # summaries.append(summarize_with_cohere(api_key, chunk, anthropic_model, custom_prompt))
127
- # elif api_name == 'cohere':
128
- # summaries.append(summarize_with_anthropic(api_key, chunk, cohere_model, custom_prompt))
129
- # elif api_name == 'groq':
130
- # summaries.append(summarize_with_groq(api_key, chunk, groq_model, custom_prompt))
131
- # elif api_name == 'llama':
132
- # summaries.append(summarize_with_llama(llama_api_IP, chunk, api_key, custom_prompt))
133
- # elif api_name == 'kobold':
134
- # summaries.append(summarize_with_kobold(kobold_api_IP, chunk, api_key, custom_prompt))
135
- # elif api_name == 'ooba':
136
- # summaries.append(summarize_with_oobabooga(ooba_api_IP, chunk, api_key, custom_prompt))
137
- # elif api_name == 'tabbyapi':
138
- # summaries.append(summarize_with_vllm(api_key, tabby_api_IP, chunk, summarize.llm_model, custom_prompt))
139
- # elif api_name == 'local-llm':
140
- # summaries.append(summarize_with_local_llm(chunk, custom_prompt))
141
- # else:
142
- # return f"Unsupported API: {api_name}"
143
- #
144
- # return "\n\n".join(summaries)
145
-
146
- # FIXME - Whole section needs to be re-written
147
- def summarize_with_detail_openai(text, detail, verbose=False):
148
- summary_with_detail_variable = rolling_summarize(text, detail=detail, verbose=True)
149
- print(len(openai_tokenize(summary_with_detail_variable)))
150
- return summary_with_detail_variable
151
-
152
-
153
- def summarize_with_detail_recursive_openai(text, detail, verbose=False):
154
- summary_with_recursive_summarization = rolling_summarize(text, detail=detail, summarize_recursively=True)
155
- print(summary_with_recursive_summarization)
156
-
157
- #
158
- #
159
- #################################################################################