import os
import gradio as gr
from PyPDF2 import PdfReader
import requests
from dotenv import load_dotenv
import tiktoken
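
# Requires a Hugging Face API token. One way to provide it is a .env file next to
# this script (the value below is a placeholder, not a real token):
#   HUGGINGFACE_TOKEN=hf_xxxxxxxxxxxxxxxx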

# Load environment variables
load_dotenv()

# Get the Hugging Face API token
HUGGINGFACE_TOKEN = os.getenv("HUGGINGFACE_TOKEN")

# Initialize the tokenizer
tokenizer = tiktoken.get_encoding("cl100k_base")
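# Note: cl100k_base is an OpenAI encoding rather than the Llama 3 tokenizer, so the
# token counts below are only an approximation used to size the sliding windows.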


def count_tokens(text):
    return len(tokenizer.encode(text))


def summarize_text(text, instructions, agent_name):
    print(f"{agent_name}: Starting summarization")
    API_URL = "https://api-inference.huggingface.co/models/meta-llama/Meta-Llama-3-8B-Instruct"
    headers = {"Authorization": f"Bearer {HUGGINGFACE_TOKEN}"}
    payload = {
        "inputs": f"{instructions}\n\nText to summarize:\n{text}",
        "parameters": {"max_length": 500}
    }
    print(f"{agent_name}: Sending request to API")
    response = requests.post(API_URL, headers=headers, json=payload)
    print(f"{agent_name}: Received response from API")

    # Extract only the generated summary from the response
    generated_text = response.json()[0]["generated_text"]

    # The model returns the entire input followed by the summary, so split the
    # generated text on "\n\n" and take the last part as the summary
    summary = generated_text.split("\n\n")[-1]

    return summary
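
# Note: summarize_text assumes the Inference API's text-generation response shape,
# a JSON list like [{"generated_text": "..."}]; error payloads (for example while the
# model is still loading) are not handled here and will surface through the caller's
# try/except in pdf_summarizer.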


def process_pdf(pdf_file, chunk_instructions, window_instructions, final_instructions):
    print("Starting PDF processing")

    # Read PDF
    reader = PdfReader(pdf_file)
    text = ""
    for page in reader.pages:
        text += page.extract_text() + "\n\n"
    print(f"Extracted {len(reader.pages)} pages from PDF")

    # Chunk the text (simple splitting by pages for this example)
    chunks = text.split("\n\n")
    print(f"Split text into {len(chunks)} chunks")

    # Agent 1: Summarize each chunk
    agent1_summaries = []
    for i, chunk in enumerate(chunks):
        print(f"Agent 1: Processing chunk {i+1}/{len(chunks)}")
        summary = summarize_text(chunk, chunk_instructions, "Agent 1")
        agent1_summaries.append(summary)
    print("Agent 1: Finished processing all chunks")

    # Concatenate Agent 1 summaries
    concatenated_summary = "\n\n".join(agent1_summaries)
    print(f"Concatenated Agent 1 summaries (length: {len(concatenated_summary)})")
    print(f"Concatenated Summary: {concatenated_summary}")
    # Sliding window approach
    window_size = 3500  # per-window budget, in tokens
    step_size = 3000  # next window starts 3000 characters later (intended to approximate a 500-token overlap)
    windows = []
    current_position = 0
    while current_position < len(concatenated_summary):
        window_end = current_position
        window_text = ""
        # Grow the window character by character until it reaches the token budget
        while count_tokens(window_text) < window_size and window_end < len(concatenated_summary):
            window_text += concatenated_summary[window_end]
            window_end += 1
        windows.append(window_text)
        current_position += step_size
    print(f"Created {len(windows)} windows for intermediate summarization")

    # Intermediate summarization
    intermediate_summaries = []
    for i, window in enumerate(windows):
        print(f"Processing window {i+1}/{len(windows)}")
        summary = summarize_text(window, window_instructions, f"Window {i+1}")
        intermediate_summaries.append(summary)

    # Final summarization
    final_input = "\n\n".join(intermediate_summaries)
    print(f"Final input length: {count_tokens(final_input)} tokens")
    final_summary = summarize_text(final_input, final_instructions, "Agent 2")
    print("Agent 2: Finished final summarization")

    return final_summary


def pdf_summarizer(pdf_file, chunk_instructions, window_instructions, final_instructions):
    if pdf_file is None:
        print("Error: No PDF file uploaded")
        return "Please upload a PDF file."

    try:
        print(f"Starting summarization process for file: {pdf_file.name}")
        summary = process_pdf(pdf_file.name, chunk_instructions, window_instructions, final_instructions)
        print("Summarization process completed successfully")
        return summary
    except Exception as e:
        print(f"An error occurred: {str(e)}")
        return f"An error occurred: {str(e)}"


# Gradio interface
iface = gr.Interface(
    fn=pdf_summarizer,
    inputs=[
        gr.File(label="Upload PDF"),
        gr.Textbox(label="Chunk Instructions", placeholder="Instructions for summarizing each chunk"),
        gr.Textbox(label="Window Instructions", placeholder="Instructions for summarizing each window"),
        gr.Textbox(label="Final Instructions", placeholder="Instructions for final summarization")
    ],
    outputs=gr.Textbox(label="Summary"),
    title="PDF Earnings Summary Generator",
    description="Upload a PDF of an earnings summary or transcript to generate a concise summary."
)
print("Launching Gradio interface")
iface.launch() |
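
# To run locally (assuming this file is saved as app.py):
#   pip install gradio PyPDF2 requests python-dotenv tiktoken
#   python app.py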