Spaces:
Sleeping
Sleeping
File size: 1,427 Bytes
20b1f3c |
1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 |
import base64
import pdfplumber
from transformers import pipeline
def get_pdf_text(pdf_file):
    """Extract and concatenate the text of every page in a PDF.

    Args:
        pdf_file: Path (or file-like object) accepted by ``pdfplumber.open``.

    Returns:
        str: The text of all pages concatenated in page order. Pages with
        no extractable text (e.g. scanned images) contribute nothing.
    """
    parts = []
    with pdfplumber.open(pdf_file) as pdf:
        for page in pdf.pages:
            page_text = page.extract_text()
            # extract_text() returns None when a page has no text layer;
            # the original `text += page.extract_text()` crashed on that.
            if page_text:
                parts.append(page_text)
    # join once instead of repeated `+=` (avoids quadratic string build)
    return "".join(parts)
def display_pdf(file_path):
    """Return an HTML ``<iframe>`` snippet that renders the given PDF inline.

    Args:
        file_path: Filesystem path to the PDF to embed.

    Returns:
        str: An iframe whose ``src`` is a base64 data-URI of the file.
    """
    # Slurp the raw bytes of the document.
    with open(file_path, "rb") as pdf_handle:
        raw_bytes = pdf_handle.read()
    # Base64-encode so the PDF can travel inside a data: URI.
    encoded = base64.b64encode(raw_bytes).decode("utf-8")
    return (
        '<iframe src="data:application/pdf;base64,'
        f'{encoded}" width="100%" height="600px"></iframe>'
    )
def split_text(text, max_length):
    """Break *text* into chunks of at most *max_length* whitespace-split words.

    Args:
        text: The input string to split.
        max_length: Maximum number of words per chunk.

    Returns:
        list[str]: Space-joined word chunks, in order; empty list for
        empty/whitespace-only input.
    """
    tokens = text.split()
    pieces = []
    # Walk the word list in strides of max_length, re-joining each window.
    for start in range(0, len(tokens), max_length):
        pieces.append(' '.join(tokens[start:start + max_length]))
    return pieces
def summarize(text, max_length):
    """Summarize *text* with ``facebook/bart-large-cnn``, chunking long input.

    The text is split into chunks of at most *max_length* words (the model
    has a limited input window), each chunk is summarized independently,
    and the per-chunk summaries are joined into one string.

    Args:
        text: The document text to summarize.
        max_length: Maximum words per chunk fed to the model.

    Returns:
        str: The combined summary; empty string for empty input.
    """
    # Build the pipeline once and cache it on the function: loading the
    # BART model is expensive (seconds of startup, hundreds of MB), and the
    # original code re-created it on every call.
    if not hasattr(summarize, "_summarizer"):
        summarize._summarizer = pipeline(
            task="summarization", model='facebook/bart-large-cnn'
        )
    summarizer = summarize._summarizer
    text_chunks = split_text(text, max_length=max_length)
    # Summarize each chunk independently, then stitch the results together.
    summaries = [summarizer(chunk)[0]['summary_text'] for chunk in text_chunks]
    return ' '.join(summaries)