from langchain import OpenAI, PromptTemplate
from langchain.chains.summarize import load_summarize_chain
from langchain.text_splitter import RecursiveCharacterTextSplitter
import openai
import os
import json

# API endpoint and key are read from environment variables.
openai.api_base = os.environ.get("api_base")
openai.api_key = os.environ.get("api_key")
openai_api_key = openai.api_key
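# The `api_base` and `api_key` environment variables above are deployment-specific;
# one illustrative way to set them (the endpoint value is an assumption, not taken
# from this script):
#   export api_base="https://api.openai.com/v1"
#   export api_key="sk-..."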

def read_transcriptions_from_file(file_path):
    """Read a transcript file where each line is a JSON object (JSON Lines)."""
    transcriptions = []
    with open(file_path, 'r') as file:
        for line in file.readlines():
            part = json.loads(line)
            transcriptions.append(part)
    return transcriptions
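# Each line of the transcript file is assumed to look roughly like the following
# (only the "end" and "text" keys are used below; "start" is illustrative):
#   {"start": 0.0, "end": 4.2, "text": "Hello and welcome to the video."}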

def combine_n_segments(transcriptions, n=2000):
    """Merge consecutive transcript segments into chunks of up to n segments."""
    combined_segments = []
    current_segment = None

    for i, transcription in enumerate(transcriptions):
        if current_segment is None:
            # Start a new combined segment.
            current_segment = transcription
        else:
            # Extend the current combined segment: carry the latest end time
            # forward and append the text.
            current_segment['end'] = transcription['end']
            current_segment['text'] += transcription['text']

        # Decide whether to close out the current combined segment.
        if (transcription['end'] != current_segment['end']) or (i % n == n - 1):
            combined_segments.append(current_segment)
            current_segment = None

    # Flush any remaining partial segment.
    if current_segment is not None:
        combined_segments.append(current_segment)

    return combined_segments

llm = OpenAI(
    temperature=0,
    openai_api_key=openai.api_key,
    openai_api_base=openai.api_base,
)

# Load the transcript, parse its JSON lines, and merge them into larger segments.
transcript_path = "./sample.txt"
essay = read_transcriptions_from_file(transcript_path)
essay = combine_n_segments(essay)
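# At this point `essay` is a list of merged segment dicts; based on the keys the
# code touches, each entry looks roughly like {"end": <seconds>, "text": "..."},
# plus whatever other fields the original transcript lines carried.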

def generate_summary(data):
    # Render the combined transcript segments into a single string to split.
    template = """ {data} """.format(data=data)

    # Search for a character chunk_size that keeps the largest chunk between
    # min_tokens and max_tokens.
    chunk_size = 2500
    inc = 100
    max_tokens = 2000
    min_tokens = 1500
    max_token_doc = 0

    while True:
        text_splitter = RecursiveCharacterTextSplitter(
            separators=["\n\n", "\n"],
            chunk_size=chunk_size,
        )
        docs = text_splitter.create_documents([template])

        # Measure the largest chunk in tokens.
        temp = []
        for doc in docs:
            temp.append(llm.get_num_tokens(doc.page_content))
        max_token_doc = max(temp)

        if max_tokens < max_token_doc or max_token_doc < min_tokens:
            # Too large: shrink the chunk size; too small: grow it.
            if max_tokens < max_token_doc:
                chunk_size -= inc
            else:
                chunk_size += inc
            print(max_token_doc, chunk_size)
            continue
        else:
            break
    # Map step: summarize each transcript chunk, citing timestamps in seconds.
    map_prompt = """
    ### Write a summary of the following video transcript segment. Return a detailed summary citing the time stamps, which are in seconds.
    ### cite using seconds ONLY
    "{text}"
    Detailed SUMMARY:
    """
    map_prompt_template = PromptTemplate(template=map_prompt, input_variables=["text"])

    # Reduce step: combine the per-chunk summaries into one final summary.
    combine_prompt = """
    ### Write a summary of the following video transcript segment summaries. Return a detailed summary.
    Return your response citing the time stamps, which are in seconds.
    #cite using seconds ONLY
    ```{text}```
    SUMMARY:
    """
    combine_prompt_template = PromptTemplate(template=combine_prompt, input_variables=["text"])

    summary_chain = load_summarize_chain(
        llm=llm,
        chain_type='map_reduce',
        map_prompt=map_prompt_template,
        combine_prompt=combine_prompt_template,
        verbose=True,
    )
    summary = summary_chain.run(docs)
    print(summary)

generate_summary(essay)
|