Spaces:

Mohamed-BC
/

DocuBot

Sleeping

File size: 1,427 Bytes

20b1f3c

import base64
import pdfplumber
from transformers import pipeline
# Function to extract the text of every page of a PDF
def get_pdf_text(pdf_file):
    """Return the text of every page of a PDF, concatenated in page order.

    Args:
        pdf_file: Passed unchanged to ``pdfplumber.open``.

    Returns:
        One string made of each page's extracted text appended directly
        after the previous page's; no separator is inserted between pages.
    """
    with pdfplumber.open(pdf_file) as pdf:
        # extract_text() may give None for a page with no extractable text
        # (seen in some pdfplumber releases); treat that as an empty string
        # so one blank page cannot abort the whole extraction with TypeError.
        return "".join(page.extract_text() or "" for page in pdf.pages)

def display_pdf(file_path):
    """Build an HTML ``<iframe>`` snippet that embeds a PDF file inline.

    The file at ``file_path`` is read as raw bytes, base64-encoded, and placed
    in a ``data:application/pdf`` URI that is used as the iframe source.

    Returns:
        The iframe markup as a string.
    """
    with open(file_path, "rb") as pdf_handle:
        encoded = base64.b64encode(pdf_handle.read()).decode("utf-8")
    return f'<iframe src="data:application/pdf;base64,{encoded}" width="100%" height="600px"></iframe>'

def split_text(text, max_length):
    """Break ``text`` into chunks of at most ``max_length`` words each.

    Words are the whitespace-separated tokens of ``text``; inside a chunk they
    are rejoined with single spaces, so the original spacing is not kept.

    Returns:
        A list of chunk strings in their original order (empty when ``text``
        contains no words).
    """
    tokens = text.split()
    pieces = []
    for start in range(0, len(tokens), max_length):
        pieces.append(' '.join(tokens[start:start + max_length]))
    return pieces
  
# Shared summarization pipeline, created on first use by _get_summarizer().
_SUMMARIZER = None


def _get_summarizer():
    """Return the shared BART summarization pipeline, loading it only once."""
    global _SUMMARIZER
    if _SUMMARIZER is None:
        _SUMMARIZER = pipeline(task="summarization", model='facebook/bart-large-cnn')
    return _SUMMARIZER


def summarize(text, max_length):
    """Summarize ``text`` chunk by chunk and return the combined summary.

    Args:
        text: The text to summarize.
        max_length: Maximum number of words per chunk handed to the model
            (this is the chunk size used by ``split_text``, not the length
            of the resulting summary).

    Returns:
        The summaries of all chunks joined with single spaces, or an empty
        string when ``text`` contains no words.
    """
    text_chunks = split_text(text, max_length=max_length)
    if not text_chunks:
        # Nothing to summarize: skip loading the model altogether.
        return ''
    summarizer = _get_summarizer()
    # truncation=True asks the tokenizer to clip a chunk that is longer than
    # the model's input limit instead of letting the model fail on it.
    summaries = [
        summarizer(chunk, truncation=True)[0]['summary_text']
        for chunk in text_chunks
    ]
    return ' '.join(summaries)