File size: 3,939 Bytes
d7d0d8e
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
from langchain import OpenAI
from langchain.chains.summarize import load_summarize_chain
from langchain.text_splitter import RecursiveCharacterTextSplitter
from langchain import PromptTemplate
import openai,os

# Route OpenAI traffic through the endpoint/key supplied via environment
# variables "api_base" / "api_key" (each is None if the variable is unset).
openai.api_base = os.environ.get("api_base")
openai.api_key = os.environ.get("api_key")
# Convenience alias; the rest of the file reads openai.api_key directly.
openai_api_key = openai.api_key


import json
def read_transcriptions_from_file(file_path):
    """Load transcription segments from a JSON-lines file.

    Each line of the file must be a standalone JSON document (here, a dict
    with at least 'end' and 'text' keys, as consumed by combine_n_segments).

    Args:
        file_path: Path to the JSON-lines transcript file.

    Returns:
        list: One parsed object per line, in file order.

    Raises:
        json.JSONDecodeError: If any line is not valid JSON.
    """
    # Stream the file line by line instead of materializing everything with
    # readlines(); explicit UTF-8 avoids platform-dependent default decoding.
    with open(file_path, "r", encoding="utf-8") as file:
        return [json.loads(line) for line in file]
def combine_n_segments(transcriptions, n=2000):
    """Merge consecutive transcription segments into groups of at most n.

    Each output segment keeps the fields of the first segment in its group
    (including 'start'), takes the 'end' of the last one, and concatenates
    the 'text' fields with no separator (segments carry their own spacing).

    Args:
        transcriptions: List of dicts with at least 'end' and 'text' keys.
        n: Maximum number of consecutive input segments merged per output
            segment.

    Returns:
        list: The merged segments. Input dicts are never mutated.
    """
    combined_segments = []
    current_segment = None

    for i, transcription in enumerate(transcriptions):
        if current_segment is None:
            # Copy so we never mutate the caller's dicts (the original
            # implementation aliased them and modified them in place).
            current_segment = dict(transcription)
        else:
            # Extend the running segment to cover this transcription.
            current_segment['end'] = transcription['end']
            current_segment['text'] += transcription['text']

        # Close the group once n segments have been merged.  (The original
        # also compared transcription['end'] against current_segment['end']
        # here, but that test could never fire: 'end' is synced just above.)
        if i % n == n - 1:
            combined_segments.append(current_segment)
            current_segment = None

    # Flush the final, possibly short, group.
    if current_segment is not None:
        combined_segments.append(current_segment)

    return combined_segments




# LLM used by generate_summary below; endpoint and key come from the
# environment variables configured at the top of the file.
llm = OpenAI(
    temperature=0, openai_api_key=openai.api_key, openai_api_base=openai.api_base
)


# Transcript to summarize, stored as JSON lines (one segment dict per line).
transcript_path = "./sample.txt"

# NOTE: the original code first read the whole file as plain text into
# `essay` and then immediately overwrote it with the parsed segments; that
# dead read is removed here.
essay = read_transcriptions_from_file(transcript_path)
essay = combine_n_segments(essay)

def generate_summary(data):
    """Summarize combined transcript segments with a map-reduce chain.

    The stringified `data` is split into chunks; the character chunk size
    is tuned by trial until the largest chunk's token count falls inside
    [min_tokens, max_tokens].  The chunks are then summarized with a
    LangChain map_reduce chain and the result is printed.

    Args:
        data: Combined transcript segments (the list produced by
            combine_n_segments); only its str() form is used.

    Returns:
        None. The summary is printed to stdout.
    """
    template = """ {data} """.format(data=data)
    chunk_size = 2500
    inc = 100
    max_tokens = 2000
    min_tokens = 1500

    # Tune chunk_size until the largest chunk is inside the token window.
    # The original `while True` could ping-pong forever between -inc/+inc
    # (and even drive chunk_size nonpositive); the iteration cap and the
    # `chunk_size > inc` guard below guarantee termination.
    docs = []
    for _ in range(100):
        text_splitter = RecursiveCharacterTextSplitter(
            separators=["\n\n", "\n"],
            chunk_size=chunk_size,
            # chunk_overlap=int(chunk_size * 0.1),
        )
        docs = text_splitter.create_documents([template])
        # Token count of the largest chunk under the current split
        # (default=0 guards max() against an empty document list).
        max_token_doc = max(
            (llm.get_num_tokens(doc.page_content) for doc in docs), default=0
        )
        if max_token_doc > max_tokens and chunk_size > inc:
            chunk_size -= inc
        elif max_token_doc < min_tokens:
            chunk_size += inc
        else:
            break
        print(max_token_doc, chunk_size)

    map_prompt = """
    ### Write a summary of the following, video transcript segment, return a detailed summary  citing the time stamps which are in seconds
    ### cite using seconds ONLY
    "{text}"
    Detailed SUMMARY:
"""

    map_prompt_template = PromptTemplate(template=map_prompt, input_variables=["text"])
    combine_prompt = """
    ### Write a summary of the following, video transcript segment summaries, return a detailed summary.
    Return your response citing the time stamps which are in seconds.
    #cite using seconds ONLY
    ```{text}```
    SUMMARY:
    """
    combine_prompt_template = PromptTemplate(template=combine_prompt, input_variables=["text"])
    # map_reduce: summarize each chunk, then summarize the summaries.
    summary_chain = load_summarize_chain(
        llm=llm,
        chain_type='map_reduce',
        map_prompt=map_prompt_template,
        combine_prompt=combine_prompt_template,
        verbose=True,
    )
    summary = summary_chain.run(docs)
    print(summary)


generate_summary(essay)