Delete App_Function_Libraries/Old_Chunking_Lib.py
App_Function_Libraries/Old_Chunking_Lib.py
DELETED
@@ -1,159 +0,0 @@
-# Old_Chunking_Lib.py
-#########################################
-# Old Chunking Library
-# This library is used to handle chunking of text for summarization.
-#
-####
-import logging
-####################
-# Function List
-#
-# 1. chunk_transcript(transcript: str, chunk_duration: int, words_per_second) -> List[str]
-# 2. summarize_chunks(api_name: str, api_key: str, transcript: List[dict], chunk_duration: int, words_per_second: int) -> str
-# 3. get_chat_completion(messages, model='gpt-4-turbo')
-# 4. chunk_on_delimiter(input_string: str, max_tokens: int, delimiter: str) -> List[str]
-# 5. combine_chunks_with_no_minimum(chunks: List[str], max_tokens: int, chunk_delimiter="\n\n", header: Optional[str] = None, add_ellipsis_for_overflow=False) -> Tuple[List[str], List[int]]
-# 6. rolling_summarize(text: str, detail: float = 0, model: str = 'gpt-4-turbo', additional_instructions: Optional[str] = None, minimum_chunk_size: Optional[int] = 500, chunk_delimiter: str = ".", summarize_recursively=False, verbose=False)
-# 7. chunk_transcript(transcript: str, chunk_duration: int, words_per_second) -> List[str]
-# 8. summarize_chunks(api_name: str, api_key: str, transcript: List[dict], chunk_duration: int, words_per_second: int) -> str
-#
-####################
-
-# Import necessary libraries
-import os
-from typing import Optional, List, Tuple
-#
-# Import 3rd party
-from openai import OpenAI
-from App_Function_Libraries.Tokenization_Methods_Lib import openai_tokenize
-#
-# Import Local
-#
-#######################################################################################################################
-# Function Definitions
-#
-
-######### Words-per-second Chunking #########
-def chunk_transcript(transcript: str, chunk_duration: int, words_per_second) -> List[str]:
-    words = transcript.split()
-    words_per_chunk = chunk_duration * words_per_second
-    chunks = [' '.join(words[i:i + words_per_chunk]) for i in range(0, len(words), words_per_chunk)]
-    return chunks
-
-
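For reference, chunk_transcript packs chunk_duration * words_per_second words into each chunk, so a 60-second chunk at 2 words/second holds 120 words. A minimal usage sketch (the stand-in transcript and speaking rate below are illustrative, not from the repo):

    # Illustrative only: a 300-word stand-in transcript at an assumed 2 words/second.
    transcript = " ".join(f"word{i}" for i in range(300))
    chunks = chunk_transcript(transcript, chunk_duration=60, words_per_second=2)
    # 60 s * 2 words/s = 120 words per chunk -> chunks of 120, 120, and 60 words.
    print([len(c.split()) for c in chunks])  # [120, 120, 60]
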
-# def summarize_chunks(api_name: str, api_key: str, transcript: List[dict], chunk_duration: int,
-#                      words_per_second: int) -> str:
-#     if api_name not in summarizers:  # See 'summarizers' dict in the main script
-#         return f"Unsupported API: {api_name}"
-#
-#     summarizer = summarizers[api_name]
-#     text = extract_text_from_segments(transcript)
-#     chunks = chunk_transcript(text, chunk_duration, words_per_second)
-#
-#     summaries = []
-#     for chunk in chunks:
-#         if api_name == 'openai':
-#             # Ensure the correct model and prompt are passed
-#             summaries.append(summarizer(api_key, chunk, custom_prompt))
-#         else:
-#             summaries.append(summarizer(api_key, chunk))
-#
-#     return "\n\n".join(summaries)
-
-
-################## ####################
-
-
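The commented-out summarize_chunks above dispatches through a summarizers dict that lives in the main script, not in this file (custom_prompt and extract_text_from_segments are likewise defined elsewhere). A minimal sketch of the registry pattern it assumes, with hypothetical stand-in summarizers:

    # Hypothetical stand-ins; the real summarizer functions live in the main script.
    def summarize_with_openai(api_key, text, custom_prompt):
        return f"[openai summary of {len(text)} chars]"

    def summarize_with_anthropic(api_key, text):
        return f"[anthropic summary of {len(text)} chars]"

    # The lookup table the commented code expects: api_name -> summarizer callable.
    summarizers = {
        'openai': summarize_with_openai,
        'anthropic': summarize_with_anthropic,
    }
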
-######### Token-size Chunking ######### FIXME - OpenAI only currently
-# This is dirty and shameful and terrible. It should be replaced with a proper implementation.
-# anyways lets get to it....
-openai_api_key = "Fake_key"  # FIXME
-client = OpenAI(api_key=openai_api_key)
-
-
-# This function chunks a text into smaller pieces based on a maximum token count and a delimiter
-def chunk_on_delimiter(input_string: str,
-                       max_tokens: int,
-                       delimiter: str) -> List[str]:
-    chunks = input_string.split(delimiter)
-    combined_chunks, _, dropped_chunk_count = combine_chunks_with_no_minimum(
-        chunks, max_tokens, chunk_delimiter=delimiter, add_ellipsis_for_overflow=True)
-    if dropped_chunk_count > 0:
-        print(f"Warning: {dropped_chunk_count} chunks were dropped due to exceeding the token limit.")
-    combined_chunks = [f"{chunk}{delimiter}" for chunk in combined_chunks]
-    return combined_chunks
-
-
-#######################################
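chunk_on_delimiter depends on combine_chunks_with_no_minimum, which appears in the function index above but is not defined in this file. Note the index documents a two-element return (Tuple[List[str], List[int]]) while the call site unpacks three values; the sketch below follows the call site and returns the combined chunks, the source indices behind each combined chunk, and the dropped-chunk count. It is a reconstruction under those assumptions (reusing the file's openai_tokenize import as the token counter), not the repo's actual implementation:

    def combine_chunks_with_no_minimum(
            chunks: List[str],
            max_tokens: int,
            chunk_delimiter="\n\n",
            header: Optional[str] = None,
            add_ellipsis_for_overflow=False,
    ) -> Tuple[List[str], List[List[int]], int]:
        dropped_chunk_count = 0
        output = []          # combined chunk strings
        output_indices = []  # indices of the source chunks merged into each output string
        candidate = [] if header is None else [header]
        candidate_indices = []
        for chunk_i, chunk in enumerate(chunks):
            chunk_with_header = [chunk] if header is None else [header, chunk]
            # A chunk too large on its own is marked with an ellipsis (if allowed) and skipped.
            if len(openai_tokenize(chunk_delimiter.join(chunk_with_header))) > max_tokens:
                if add_ellipsis_for_overflow and \
                        len(openai_tokenize(chunk_delimiter.join(candidate + ["..."]))) <= max_tokens:
                    candidate.append("...")
                dropped_chunk_count += 1
                continue
            # If appending this chunk would overflow max_tokens, flush the candidate first.
            extended_token_count = len(openai_tokenize(chunk_delimiter.join(candidate + [chunk])))
            if extended_token_count > max_tokens:
                output.append(chunk_delimiter.join(candidate))
                output_indices.append(candidate_indices)
                candidate = chunk_with_header
                candidate_indices = [chunk_i]
            else:
                candidate.append(chunk)
                candidate_indices.append(chunk_i)
        # Flush whatever remains.
        if (header is not None and len(candidate) > 1) or (header is None and len(candidate) > 0):
            output.append(chunk_delimiter.join(candidate))
            output_indices.append(candidate_indices)
        return output, output_indices, dropped_chunk_count
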
-
-
-######### Words-per-second Chunking #########
-# FIXME - Whole section needs to be re-written
-def chunk_transcript(transcript: str, chunk_duration: int, words_per_second) -> List[str]:
-    words = transcript.split()
-    words_per_chunk = chunk_duration * words_per_second
-    chunks = [' '.join(words[i:i + words_per_chunk]) for i in range(0, len(words), words_per_chunk)]
-    return chunks
-
-
-# def summarize_chunks(api_name: str, api_key: str, transcript: List[dict], chunk_duration: int,
-#                      words_per_second: int) -> str:
-#     if api_name not in summarizers:  # See 'summarizers' dict in the main script
-#         return f"Unsupported API: {api_name}"
-#
-#     if not transcript:
-#         logging.error("Empty or None transcript provided to summarize_chunks")
-#         return "Error: Empty or None transcript provided"
-#
-#     text = extract_text_from_segments(transcript)
-#     chunks = chunk_transcript(text, chunk_duration, words_per_second)
-#
-#     # FIXME
-#     custom_prompt = args.custom_prompt
-#
-#     summaries = []
-#     for chunk in chunks:
-#         if api_name == 'openai':
-#             # Ensure the correct model and prompt are passed
-#             summaries.append(summarize_with_openai(api_key, chunk, custom_prompt))
-#         elif api_name == 'anthropic':
-#             summaries.append(summarize_with_anthropic(api_key, chunk, anthropic_model, custom_prompt))
-#         elif api_name == 'cohere':
-#             summaries.append(summarize_with_cohere(api_key, chunk, cohere_model, custom_prompt))
-#         elif api_name == 'groq':
-#             summaries.append(summarize_with_groq(api_key, chunk, groq_model, custom_prompt))
-#         elif api_name == 'llama':
-#             summaries.append(summarize_with_llama(llama_api_IP, chunk, api_key, custom_prompt))
-#         elif api_name == 'kobold':
-#             summaries.append(summarize_with_kobold(kobold_api_IP, chunk, api_key, custom_prompt))
-#         elif api_name == 'ooba':
-#             summaries.append(summarize_with_oobabooga(ooba_api_IP, chunk, api_key, custom_prompt))
-#         elif api_name == 'tabbyapi':
-#             summaries.append(summarize_with_vllm(api_key, tabby_api_IP, chunk, summarize.llm_model, custom_prompt))
-#         elif api_name == 'local-llm':
-#             summaries.append(summarize_with_local_llm(chunk, custom_prompt))
-#         else:
-#             return f"Unsupported API: {api_name}"
-#
-#     return "\n\n".join(summaries)
-
-# FIXME - Whole section needs to be re-written
-def summarize_with_detail_openai(text, detail, verbose=False):
-    summary_with_detail_variable = rolling_summarize(text, detail=detail, verbose=verbose)
-    print(len(openai_tokenize(summary_with_detail_variable)))
-    return summary_with_detail_variable
-
-
-def summarize_with_detail_recursive_openai(text, detail, verbose=False):
-    summary_with_recursive_summarization = rolling_summarize(text, detail=detail, summarize_recursively=True)
-    print(summary_with_recursive_summarization)
-    return summary_with_recursive_summarization
-
-#
-#
-#################################################################################
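Both summarize_with_detail functions call rolling_summarize, which, like get_chat_completion, appears in the function index but is not defined in this file. The sketch below reconstructs both from the documented signatures: detail in [0, 1] interpolates the chunk count between 1 (detail=0) and the largest number of minimum_chunk_size-token chunks the text supports (detail=1); each chunk is summarized with the chat model, and recursive mode feeds the accumulated summaries back into later prompts. The prompt wording is illustrative, not the repo's:

    def get_chat_completion(messages, model='gpt-4-turbo'):
        # Thin wrapper over the OpenAI chat completions API (uses the module-level client).
        response = client.chat.completions.create(model=model, messages=messages)
        return response.choices[0].message.content


    def rolling_summarize(text: str, detail: float = 0, model: str = 'gpt-4-turbo',
                          additional_instructions: Optional[str] = None,
                          minimum_chunk_size: Optional[int] = 500,
                          chunk_delimiter: str = ".",
                          summarize_recursively=False, verbose=False):
        assert 0 <= detail <= 1, "detail must be in [0, 1]"
        # Interpolate the number of chunks between 1 and the maximum the text supports.
        max_chunks = len(chunk_on_delimiter(text, minimum_chunk_size, chunk_delimiter))
        num_chunks = int(1 + detail * (max_chunks - 1))
        chunk_size = max(minimum_chunk_size, len(openai_tokenize(text)) // num_chunks)
        text_chunks = chunk_on_delimiter(text, chunk_size, chunk_delimiter)
        if verbose:
            print(f"Splitting the text into {len(text_chunks)} chunks.")
        system_message = "Rewrite this text in summarized form."
        if additional_instructions is not None:
            system_message += f"\n\n{additional_instructions}"
        accumulated_summaries = []
        for chunk in text_chunks:
            if summarize_recursively and accumulated_summaries:
                # Give the model the summaries so far, so each chunk is summarized in context.
                user_message = ("Previous summaries:\n\n" + "\n\n".join(accumulated_summaries)
                                + f"\n\nText to summarize next:\n\n{chunk}")
            else:
                user_message = chunk
            messages = [{"role": "system", "content": system_message},
                        {"role": "user", "content": user_message}]
            accumulated_summaries.append(get_chat_completion(messages, model=model))
        return "\n\n".join(accumulated_summaries)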