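"""Gradio app for multi-stage PDF summarization.

Pipeline: extract text from the uploaded PDF, summarize each chunk (Agent 1)
with the Hugging Face Inference API, re-summarize the concatenated chunk
summaries over a sliding token window, then produce a final summary (Agent 2).
"""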
import os
import gradio as gr
from PyPDF2 import PdfReader
import requests
from dotenv import load_dotenv
import tiktoken

# Load environment variables
load_dotenv()

# Get the Hugging Face API token
HUGGINGFACE_TOKEN = os.getenv("HUGGINGFACE_TOKEN")

# Initialize the tokenizer
tokenizer = tiktoken.get_encoding("cl100k_base")

def count_tokens(text):
    return len(tokenizer.encode(text))

def summarize_text(text, instructions, agent_name):
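    """Send one summarization request to the hosted Meta-Llama-3-8B-Instruct model via the Hugging Face Inference API."""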
    print(f"{agent_name}: Starting summarization")
    API_URL = "https://api-inference.huggingface.co/models/meta-llama/Meta-Llama-3-8B-Instruct"
    headers = {"Authorization": f"Bearer {HUGGINGFACE_TOKEN}"}
    
    payload = {
        "inputs": f"{instructions}\n\nText to summarize:\n{text}",
        # max_new_tokens caps only the generated tokens; max_length would also
        # count the long prompt and can leave no room for the summary itself
        "parameters": {"max_new_tokens": 500}
    }
    
    print(f"{agent_name}: Sending request to API")
    response = requests.post(API_URL, headers=headers, json=payload)
    response.raise_for_status()  # fail fast on HTTP errors (bad token, model still loading, etc.)
    print(f"{agent_name}: Received response from API")
    
    # Extracting only the generated summary from the response
    generated_text = response.json()[0]["generated_text"]
    
    # Assuming the model returns the entire input followed by the summary
    # Split the generated text by the delimiter "\n\n" and take the last part as the summary
    summary = generated_text.split("\n\n")[-1]
    
    return summary

def process_pdf(pdf_file, chunk_instructions, window_instructions, final_instructions):
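    """Run the full pipeline: extract text, summarize each chunk (Agent 1),
    re-summarize over sliding token windows, then produce the final summary (Agent 2)."""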
    print("Starting PDF processing")
    # Read PDF
    reader = PdfReader(pdf_file)
    text = ""
    for page in reader.pages:
        # extract_text() can return None for image-only pages
        text += (page.extract_text() or "") + "\n\n"
    
    print(f"Extracted {len(reader.pages)} pages from PDF")
    
    # Chunk the text (simple blank-line splitting for this example), dropping empty chunks
    chunks = [chunk for chunk in text.split("\n\n") if chunk.strip()]
    print(f"Split text into {len(chunks)} chunks")
    
    # Agent 1: Summarize each chunk
    agent1_summaries = []
    for i, chunk in enumerate(chunks):
        print(f"Agent 1: Processing chunk {i+1}/{len(chunks)}")
        summary = summarize_text(chunk, chunk_instructions, "Agent 1")
        agent1_summaries.append(summary)
    
    print("Agent 1: Finished processing all chunks")
    
    # Concatenate Agent 1 summaries
    concatenated_summary = "\n\n".join(agent1_summaries)
    print(f"Concatenated Agent 1 summaries (length: {len(concatenated_summary)})")
    print(f"Concatenated Summary: {concatenated_summary}")
    
    # Sliding window approach: slide over the token sequence so that the window
    # size and the step are both measured in tokens, giving the intended 500-token overlap
    window_size = 3500  # in tokens
    step_size = 3000  # overlap of 500 tokens
    summary_tokens = tokenizer.encode(concatenated_summary)
    windows = []
    for start in range(0, len(summary_tokens), step_size):
        windows.append(tokenizer.decode(summary_tokens[start:start + window_size]))
    
    print(f"Created {len(windows)} windows for intermediate summarization")
    
    # Intermediate summarization
    intermediate_summaries = []
    for i, window in enumerate(windows):
        print(f"Processing window {i+1}/{len(windows)}")
        summary = summarize_text(window, window_instructions, f"Window {i+1}")
        intermediate_summaries.append(summary)
    
    # Final summarization
    final_input = "\n\n".join(intermediate_summaries)
    print(f"Final input length: {count_tokens(final_input)} tokens")
    final_summary = summarize_text(final_input, final_instructions, "Agent 2")
    print("Agent 2: Finished final summarization")
    
    return final_summary

def pdf_summarizer(pdf_file, chunk_instructions, window_instructions, final_instructions):
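    """Gradio callback: validate the uploaded file and run the summarization pipeline."""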
    if pdf_file is None:
        print("Error: No PDF file uploaded")
        return "Please upload a PDF file."
    
    try:
        print(f"Starting summarization process for file: {pdf_file.name}")
        summary = process_pdf(pdf_file.name, chunk_instructions, window_instructions, final_instructions)
        print("Summarization process completed successfully")
        return summary
    except Exception as e:
        print(f"An error occurred: {str(e)}")
        return f"An error occurred: {str(e)}"

# Gradio interface
iface = gr.Interface(
    fn=pdf_summarizer,
    inputs=[
        gr.File(label="Upload PDF"),
        gr.Textbox(label="Chunk Instructions", placeholder="Instructions for summarizing each chunk"),
        gr.Textbox(label="Window Instructions", placeholder="Instructions for summarizing each window"),
        gr.Textbox(label="Final Instructions", placeholder="Instructions for final summarization")
    ],
    outputs=gr.Textbox(label="Summary"),
    title="PDF Earnings Summary Generator",
    description="Upload a PDF of an earnings summary or transcript to generate a concise summary."
)

print("Launching Gradio interface")
iface.launch()