# tranny/App/Embedding/utils/Summarizer.py
import json
import os

import openai
from langchain import OpenAI, PromptTemplate
from langchain.chains.summarize import load_summarize_chain
from langchain.text_splitter import RecursiveCharacterTextSplitter

# Point the openai client at the configured endpoint and key.
openai.api_base = os.environ.get("api_base")
openai.api_key = os.environ.get("api_key")
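# The endpoint and key are expected in the environment before this module
# runs, e.g. (illustrative values only):
#   export api_base="https://api.openai.com/v1"
#   export api_key="sk-..."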
def read_transcriptions_from_file(file_path):
    """Read a JSONL transcript file: one JSON object per line."""
    transcriptions = []
    with open(file_path, "r") as file:
        for line in file:
            transcriptions.append(json.loads(line))
    return transcriptions
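# Each JSONL line is assumed to be a transcript segment shaped roughly like
# (illustrative; only the `end` and `text` keys are read below):
#   {"start": 0.0, "end": 4.2, "text": "Hello and welcome to the video"}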
def combine_n_segments(transcriptions, n=2000):
    """Merge consecutive transcript segments into blocks of up to n segments."""
    combined_segments = []
    current_segment = None
    for i, transcription in enumerate(transcriptions):
        if current_segment is None:
            current_segment = transcription
        else:
            # Extend the current segment's end time to this transcription's
            current_segment['end'] = transcription['end']
            # Combine the texts with a space between
            current_segment['text'] += " " + transcription['text']
        # Once n segments have been combined, start a new segment. (The
        # original continuity check `transcription['end'] !=
        # current_segment['end']` was always False at this point, because
        # the end time is copied from `transcription` just above, so only
        # the modulo condition ever splits.)
        if i % n == n - 1:
            combined_segments.append(current_segment)
            current_segment = None
    # Append the last, partially filled segment if any
    if current_segment is not None:
        combined_segments.append(current_segment)
    return combined_segments
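# Minimal sketch of the merge behaviour (hypothetical segments, n=2):
#   combine_n_segments(
#       [{"start": 0, "end": 2, "text": "Hi"},
#        {"start": 2, "end": 5, "text": "there"}],
#       n=2,
#   ) == [{"start": 0, "end": 5, "text": "Hi there"}]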
# Deterministic completions (temperature=0) against the configured endpoint
llm = OpenAI(
    temperature=0, openai_api_key=openai.api_key, openai_api_base=openai.api_base
)
# ./sample.txt is a JSONL transcript: parse it and merge its segments
transcript = read_transcriptions_from_file("./sample.txt")
transcript = combine_n_segments(transcript)
def generate_summary(data):
    template = """ {data} """.format(data=data)
    chunk_size = 2500
    inc = 100
    max_tokens = 2000
    min_tokens = 1500
    max_token_doc = 0
    # Search for a chunk size whose largest chunk falls between
    # min_tokens and max_tokens
    while True:
        # initialize text splitter
        text_splitter = RecursiveCharacterTextSplitter(
            separators=["\n\n", "\n"],
            chunk_size=chunk_size,
            # chunk_overlap=int(chunk_size * 0.1),
        )
        docs = text_splitter.create_documents([template])
        # Token count of the largest chunk under the current split
        max_token_doc = max(llm.get_num_tokens(doc.page_content) for doc in docs)
        if max_tokens < max_token_doc or max_token_doc < min_tokens:
            # Shrink the chunks if the largest is too big, grow them if too small
            if max_tokens < max_token_doc:
                chunk_size -= inc
            else:
                chunk_size += inc
            print(max_token_doc, chunk_size)
            continue
        else:
            break
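    # NOTE: if no chunk_size keeps the largest chunk inside
    # [min_tokens, max_tokens], the fixed +/- inc steps can oscillate and
    # this loop will not terminate.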
    map_prompt = """
    ### Write a summary of the following video transcript segment. Return a detailed summary citing the timestamps, which are in seconds.
    ### Cite using seconds ONLY.
    "{text}"
    Detailed SUMMARY:
    """
    map_prompt_template = PromptTemplate(template=map_prompt, input_variables=["text"])
    combine_prompt = """
    ### Write a summary of the following video transcript segment summaries. Return a detailed summary citing the timestamps, which are in seconds.
    ### Cite using seconds ONLY.
    ```{text}```
    SUMMARY:
    """
    combine_prompt_template = PromptTemplate(template=combine_prompt, input_variables=["text"])
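    # map_reduce: map_prompt summarizes each chunk independently, then
    # combine_prompt merges the per-chunk summaries into one final summary.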
    summary_chain = load_summarize_chain(
        llm=llm,
        chain_type="map_reduce",
        map_prompt=map_prompt_template,
        combine_prompt=combine_prompt_template,
        verbose=True,
    )
    summary = summary_chain.run(docs)
    print(summary)
generate_summary(transcript)