|
from pypdf import PdfReader |
|
import streamlit as st |
|
|
|
def get_pdf_text(pdf_docs):
    """Return the extracted text of every page of every given PDF, concatenated.

    Args:
        pdf_docs: Iterable of PDF sources, each passed as-is to ``PdfReader``
            (this script passes the files returned by ``st.file_uploader``).
            ``None`` or an empty iterable is treated as "no documents".

    Returns:
        One string holding the page texts in document order, then page order,
        joined without any separator. Empty string when there is no input.
    """
    page_texts = []
    for pdf in pdf_docs or ():
        reader = PdfReader(pdf)
        # Guard against a falsy extraction result (e.g. None from a page with
        # no extractable text) so the join below only ever sees strings.
        page_texts.extend(page.extract_text() or "" for page in reader.pages)
    # Join once at the end instead of growing a string with += per page.
    return "".join(page_texts)
|
|
|
# Text extracted from the uploaded PDFs. It stays empty unless the submit
# button in the sidebar reports a click during this run of the script.
raw_text = ""

with st.sidebar:
    st.title("Menu:")

    pdf_docs = st.file_uploader(
        label="Upload your PDF Files and Click on the Submit & Process Button",
        accept_multiple_files=True,
    )

    # Extraction is only performed on runs where the button was pressed.
    if st.button("Submit & Process"):
        with st.spinner("Processing..."):
            raw_text = get_pdf_text(pdf_docs)
|
|
|
|
|
|
|
from transformers import AutoTokenizer, AutoModelForSeq2SeqLM |
|
|
|
|
|
@st.cache_resource
def _load_summarizer(model_name="facebook/bart-large-cnn"):
    """Load the tokenizer and seq2seq model for ``model_name``.

    Wrapped in ``st.cache_resource`` so the objects are created once and
    reused, instead of being re-instantiated each time the script body runs.

    Returns:
        A ``(tokenizer, model)`` tuple.
    """
    return (
        AutoTokenizer.from_pretrained(model_name),
        AutoModelForSeq2SeqLM.from_pretrained(model_name),
    )


tokenizer, model = _load_summarizer()

# Always defined so any later code can read it even when nothing was summarized.
summary = ""

# Only summarize when there is actual text: raw_text is empty on every run in
# which the submit button was not pressed, and summarizing an empty string
# would waste a full beam search and display a meaningless result.
if raw_text.strip():
    # Input is truncated to the first 1024 tokens; anything beyond that is
    # not seen by the model.
    inputs = tokenizer(
        raw_text, return_tensors="pt", max_length=1024, truncation=True
    )

    summary_ids = model.generate(
        inputs["input_ids"],
        num_beams=4,
        min_length=30,
        max_length=200,
        early_stopping=True,
    )

    summary = tokenizer.decode(summary_ids[0], skip_special_tokens=True)

    st.write("\n\nSummary:\n", summary)
    st.write("\n\n\nOriginal text:\n", raw_text)
|
|
|
|