from langchain import OpenAI, PromptTemplate
from langchain.chains.summarize import load_summarize_chain
from langchain.text_splitter import RecursiveCharacterTextSplitter
import openai
import os
import json

# The OpenAI endpoint and key are read from the `api_base` and `api_key`
# environment variables.
openai.api_base = os.environ.get("api_base")
openai.api_key = os.environ.get("api_key")


def read_transcriptions_from_file(file_path):
    """Read a JSON-lines file containing one transcript segment per line."""
    transcriptions = []
    with open(file_path, "r") as file:
        for line in file:
            transcriptions.append(json.loads(line))
    return transcriptions


def combine_n_segments(transcriptions, n=2000):
    """Merge consecutive transcript segments into groups of at most n.

    Each merged segment keeps the fields of its first piece, adopts the end
    time of its last piece, and concatenates the text.
    """
    combined_segments = []
    current_segment = None
    for i, transcription in enumerate(transcriptions):
        if current_segment is None:
            current_segment = transcription
        else:
            # Extend the current segment: adopt the later end time and
            # append the text with a separating space.
            current_segment["end"] = transcription["end"]
            current_segment["text"] += " " + transcription["text"]
        # Start a new segment once n pieces have been merged. (The original
        # code also tried to split on discontinuous end times, but that check
        # compared the end time it had just synchronized, so it could never
        # fire; it is dropped here without changing behavior.)
        if i % n == n - 1:
            combined_segments.append(current_segment)
            current_segment = None
    # Keep the final, partially filled segment.
    if current_segment is not None:
        combined_segments.append(current_segment)
    return combined_segments


llm = OpenAI(
    temperature=0,
    openai_api_key=openai.api_key,
    openai_api_base=openai.api_base,
)

# Load the transcript segments and merge them into larger chunks.
transcript_path = "./sample.txt"
segments = combine_n_segments(read_transcriptions_from_file(transcript_path))


def generate_summary(data):
    template = """
    {data}
    """.format(data=data)

    chunk_size = 2500
    inc = 100
    max_tokens = 2000
    min_tokens = 1500

    # Search for a chunk size whose largest resulting chunk lands between
    # min_tokens and max_tokens, adjusting by `inc` in the needed direction.
    while True:
        text_splitter = RecursiveCharacterTextSplitter(
            separators=["\n\n", "\n"],
            chunk_size=chunk_size,
            # chunk_overlap=int(chunk_size * 0.1),
        )
        docs = text_splitter.create_documents([template])
        max_token_doc = max(llm.get_num_tokens(doc.page_content) for doc in docs)
        if max_token_doc > max_tokens:
            chunk_size -= inc  # chunks too large: shrink
        elif max_token_doc < min_tokens:
            chunk_size += inc  # chunks too small: grow
        else:
            break
        print(max_token_doc, chunk_size)

    map_prompt = """
    ### Write a detailed summary of the following video transcript segment,
    citing the time stamps, which are in seconds.
    ### Cite using seconds ONLY.
    "{text}"
    DETAILED SUMMARY:
    """
    map_prompt_template = PromptTemplate(template=map_prompt, input_variables=["text"])

    combine_prompt = """
    ### Write a detailed summary of the following video transcript segment
    summaries, citing the time stamps, which are in seconds.
    ### Cite using seconds ONLY.
    ```{text}```
    SUMMARY:
    """
    combine_prompt_template = PromptTemplate(template=combine_prompt, input_variables=["text"])

    # Map-reduce summarization: summarize each chunk, then combine the
    # per-chunk summaries into one final summary.
    summary_chain = load_summarize_chain(
        llm=llm,
        chain_type="map_reduce",
        map_prompt=map_prompt_template,
        combine_prompt=combine_prompt_template,
        verbose=True,
    )
    summary = summary_chain.run(docs)
    print(summary)


generate_summary(segments)
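
# --- Example input (hedged sketch) ---
# A minimal illustration of what ./sample.txt is assumed to look like: one
# JSON object per line, each describing a transcript segment. Only the
# 'end' and 'text' keys are actually read by this script; any other fields
# (such as 'start' below) depend on the transcriber that produced the file
# and are assumptions here, not part of the original source.
#
# {"start": 0.0, "end": 4.5, "text": "Welcome to the video."}
# {"start": 4.5, "end": 9.0, "text": "Today we cover map-reduce summarization."}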