# tranny/App/Embedding/utils/Summarizer.py
import json
import os

import openai
from langchain import OpenAI, PromptTemplate
from langchain.chains.summarize import load_summarize_chain
from langchain.text_splitter import RecursiveCharacterTextSplitter

# Point the openai client at the configured endpoint and key.
openai.api_base = os.environ.get("api_base")
openai.api_key = os.environ.get("api_key")
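# The endpoint and key are expected in the environment before this module
# runs, e.g. (illustrative values only):
#   export api_base="https://api.openai.com/v1"
#   export api_key="sk-..."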
def read_transcriptions_from_file(file_path):
    """Read a JSONL transcript file: one JSON object per line."""
    transcriptions = []
    with open(file_path, "r") as file:
        for line in file:
            transcriptions.append(json.loads(line))
    return transcriptions
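# Each JSONL line is assumed to be a transcript segment shaped roughly like
# (illustrative; only the `end` and `text` keys are read below):
#   {"start": 0.0, "end": 4.2, "text": "Hello and welcome to the video"}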
def combine_n_segments(transcriptions, n=2000):
    """Merge consecutive transcript segments into blocks of up to n segments."""
    combined_segments = []
    current_segment = None
    for i, transcription in enumerate(transcriptions):
        if current_segment is None:
            current_segment = transcription
        else:
            # Extend the current segment's end time to this transcription's
            current_segment['end'] = transcription['end']
            # Combine the texts with a space between
            current_segment['text'] += " " + transcription['text']
        # Once n segments have been combined, start a new segment. (The
        # original continuity check `transcription['end'] !=
        # current_segment['end']` was always False at this point, because
        # the end time is copied from `transcription` just above, so only
        # the modulo condition ever splits.)
        if i % n == n - 1:
            combined_segments.append(current_segment)
            current_segment = None
    # Append the last, partially filled segment if any
    if current_segment is not None:
        combined_segments.append(current_segment)
    return combined_segments
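# Minimal sketch of the merge behaviour (hypothetical segments, n=2):
#   combine_n_segments(
#       [{"start": 0, "end": 2, "text": "Hi"},
#        {"start": 2, "end": 5, "text": "there"}],
#       n=2,
#   ) == [{"start": 0, "end": 5, "text": "Hi there"}]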
# Deterministic completions (temperature=0) against the configured endpoint
llm = OpenAI(
    temperature=0, openai_api_key=openai.api_key, openai_api_base=openai.api_base
)
# ./sample.txt is a JSONL transcript: parse it and merge its segments
transcript = read_transcriptions_from_file("./sample.txt")
transcript = combine_n_segments(transcript)
def generate_summary(data):
    template = """ {data} """.format(data=data)
    chunk_size = 2500
    inc = 100
    max_tokens = 2000
    min_tokens = 1500
    max_token_doc = 0
    # Search for a chunk size whose largest chunk falls between
    # min_tokens and max_tokens
    while True:
        # initialize text splitter
        text_splitter = RecursiveCharacterTextSplitter(
            separators=["\n\n", "\n"],
            chunk_size=chunk_size,
            # chunk_overlap=int(chunk_size * 0.1),
        )
        docs = text_splitter.create_documents([template])
        # Token count of the largest chunk under the current split
        max_token_doc = max(llm.get_num_tokens(doc.page_content) for doc in docs)
        if max_tokens < max_token_doc or max_token_doc < min_tokens:
            # Shrink the chunks if the largest is too big, grow them if too small
            if max_tokens < max_token_doc:
                chunk_size -= inc
            else:
                chunk_size += inc
            print(max_token_doc, chunk_size)
            continue
        else:
            break
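    # NOTE: if no chunk_size keeps the largest chunk inside
    # [min_tokens, max_tokens], the fixed +/- inc steps can oscillate and
    # this loop will not terminate.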
    map_prompt = """
    ### Write a summary of the following video transcript segment. Return a detailed summary citing the timestamps, which are in seconds.
    ### Cite using seconds ONLY.
    "{text}"
    Detailed SUMMARY:
    """
    map_prompt_template = PromptTemplate(template=map_prompt, input_variables=["text"])
    combine_prompt = """
    ### Write a summary of the following video transcript segment summaries. Return a detailed summary citing the timestamps, which are in seconds.
    ### Cite using seconds ONLY.
    ```{text}```
    SUMMARY:
    """
    combine_prompt_template = PromptTemplate(template=combine_prompt, input_variables=["text"])
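    # map_reduce: map_prompt summarizes each chunk independently, then
    # combine_prompt merges the per-chunk summaries into one final summary.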
    summary_chain = load_summarize_chain(
        llm=llm,
        chain_type="map_reduce",
        map_prompt=map_prompt_template,
        combine_prompt=combine_prompt_template,
        verbose=True,
    )
    summary = summary_chain.run(docs)
    print(summary)
generate_summary(transcript)