import os

import fitz  # PyMuPDF
#!pip install pdfreader
import pdfreader
from pdfreader import PDFDocument, SimplePDFViewer
import requests
#!pip install tabula-py
import tabula
#!pip install bs4
from bs4 import BeautifulSoup

from NER import cleanText


class PDF():
    """Extract text and tables from a PDF given as a local path or a URL.

    Attributes are set verbatim from the constructor arguments:
      pdf        -- local file path or URL of the PDF (or of a page linking to it)
      saveFolder -- directory into which downloaded PDFs are written
      doi        -- optional DOI string used to pick the right ".pdf" link when
                    scraping an HTML page in ``downloadPDF``
    """

    def __init__(self, pdf, saveFolder, doi=None):
        self.pdf = pdf
        self.doi = doi
        self.saveFolder = saveFolder

    def _resolveLocalPath(self):
        """Return the path to read the PDF from, downloading it first if needed.

        When ``self.pdf`` contains "https" the file is downloaded into
        ``self.saveFolder`` and the saved path is returned.  If nothing could
        be downloaded, or ``self.pdf`` is not an https URL, ``self.pdf`` itself
        is returned unchanged.
        """
        if "https" in self.pdf:
            name = self.downloadPDF(self.saveFolder)
            if name != "no pdfLink to download":
                return os.path.join(self.saveFolder, name)
        return self.pdf

    def _localPdfPath(self):
        """Return the resolved PDF path after checking that it can be opened.

        The file is opened and closed immediately: this keeps the "missing
        file raises here" behaviour of ``openPDFFile`` without leaking the
        handle to callers that only need the path.
        """
        with self.openPDFFile() as handle:
            return handle.name

    def openPDFFile(self):
        """Open the PDF in binary mode and return the file object.

        The caller owns the returned handle and must close it.
        Raises whatever ``open`` or ``downloadPDF`` raise (e.g. ``OSError``).
        """
        return open(self._resolveLocalPath(), "rb")

    def downloadPDF(self, saveFolder):
        """Download the PDF into ``saveFolder`` and return the saved file name.

        Returns the literal string "no pdfLink to download" when no link to a
        PDF could be determined.
        """
        pdfLink = ''
        # NOTE(review): this branch only runs when self.pdf contains neither
        # ".pdf" nor "https" (e.g. a plain "http://" landing page).  An https
        # landing page is therefore downloaded as-is by the else branch.
        # Confirm whether `"https" in self.pdf` was intended.
        if ".pdf" not in self.pdf and "https" not in self.pdf:
            r = requests.get(self.pdf)
            soup = BeautifulSoup(r.content, 'html.parser')
            for link in soup.find_all("a"):
                href = link.get("href", "")
                # Without a DOI there is nothing to match against; the old
                # code raised TypeError on `None in href`.
                if ".pdf" in href and self.doi is not None and self.doi in href:
                    pdfLink = href
                    break
        else:
            pdfLink = self.pdf

        if pdfLink == '':
            return "no pdfLink to download"

        response = requests.get(pdfLink)
        name = pdfLink.split("/")[-1]
        print("inside download PDF and name and link are: ", pdfLink, name)
        print("saveFolder is: ", saveFolder)
        with open(os.path.join(saveFolder, name), 'wb') as pdf:
            print("len of response content: ", len(response.content))
            pdf.write(response.content)
        print("pdf downloaded")
        return name

    def extractText(self):
        """Return the full text of the PDF, or "" if it cannot be read at all.

        PyMuPDF is tried first; if it fails or yields fewer than 100
        non-whitespace-trimmed characters, the pdfreader-based extractor is
        used instead.
        """
        try:
            fileToOpen = self._localPdfPath()
            try:
                doc = fitz.open(fileToOpen)
                try:
                    text = "".join(
                        page.get_text("text") + "\n\n" for page in doc
                    )
                finally:
                    # Release the document even when a page fails to parse.
                    doc.close()
                if len(text.strip()) < 100:
                    print("Fallback to PDFReader due to weak text extraction.")
                    text = self.extractTextWithPDFReader()
                return text
            except Exception as e:
                print("Failed with PyMuPDF, fallback to PDFReader:", e)
                return self.extractTextWithPDFReader()
        except Exception:
            # Best-effort API: any failure to obtain the file yields "".
            return ""

    def extract_text_excluding_tables(self):
        """Return text from paragraph-like blocks only, skipping table-like ones.

        A text block is kept when its lines contain more than one span on
        average (the heuristic used here for "paragraph-like").  Falls back to
        the pdfreader-based extractor when PyMuPDF fails or produces fewer
        than 100 characters.  Raises if the PDF cannot be opened/downloaded.
        """
        fileToOpen = self._localPdfPath()
        try:
            pieces = []
            doc = fitz.open(fileToOpen)
            try:
                for page in doc:
                    for block in page.get_text("dict")["blocks"]:
                        if block["type"] != 0:  # 0 == text block
                            continue
                        lines = block.get("lines", [])
                        if not lines:
                            continue
                        avg_spans_per_line = (
                            sum(len(line["spans"]) for line in lines) / len(lines)
                        )
                        if avg_spans_per_line > 1:  # Heuristic: paragraph-like blocks
                            for line in lines:
                                pieces.append(
                                    " ".join(span["text"] for span in line["spans"])
                                    + "\n"
                                )
            finally:
                # Release the document even when a page fails to parse.
                doc.close()
            text = "".join(pieces)
            if len(text.strip()) < 100:
                print("Fallback to PDFReader due to weak text extraction.")
                text = self.extractTextWithPDFReader()
            return text
        except Exception as e:
            print("Failed with PyMuPDF, fallback to PDFReader:", e)
            return self.extractTextWithPDFReader()

    def extractTextWithPDFReader(self):
        """Extract text page by page with pdfreader and return it merged.

        Builds a per-page dict (keys "normalText", "cleanText", "image",
        "form", "content", "inline_image") and passes it to
        ``mergeTextinJson``.  On any error the partial result is discarded
        and the merge of an empty dict (i.e. "") is returned.
        """
        jsonPage = {}
        try:
            with self.openPDFFile() as pdf:
                print("open pdf file")
                print(pdf)
                doc = PDFDocument(pdf)
                viewer = SimplePDFViewer(pdf)
                pageCount = sum(1 for _ in doc.pages())
                cl = cleanText.cleanGenText()
                # Viewer pages are 1-based, so the last page is pageCount
                # itself; the previous range(1, len(pages)) skipped it.
                for page in range(1, pageCount + 1):
                    viewer.navigate(page)
                    viewer.render()
                    text = "".join(viewer.canvas.strings)
                    _, filteredWord = cl.textPreprocessing(text)
                    jsonPage[str(page)] = {
                        "normalText": [text],
                        "cleanText": [' '.join(filteredWord)],
                        "image": [viewer.canvas.images],
                        "form": [viewer.canvas.forms],
                        "content": [viewer.canvas.text_content],
                        "inline_image": [viewer.canvas.inline_images],
                    }
        except Exception:
            jsonPage = {}
        return self.mergeTextinJson(jsonPage)

    def extractTable(self, pages="all", saveFile=None, outputFormat=None):
        """Return the tables found in the PDF via ``tabula.read_pdf``.

        Args:
            pages: pages to extract from, passed straight to tabula; a str,
                an int, or an iterable of ints.  Default: "all".
                Examples: '1-2,3', 'all', [1, 2].
            saveFile: currently unused (reserved for exporting the tables,
                e.g. with ``tabula.convert_into``).
            outputFormat: currently unused (see ``saveFile``).

        Returns:
            Whatever ``tabula.read_pdf`` returns, or ``[]`` if it raises.
        """
        fileToOpen = self._resolveLocalPath()
        try:
            df = tabula.read_pdf(fileToOpen, pages=pages)
        except Exception:
            df = []
            print("No tables found in PDF file")
        return df

    def mergeTextinJson(self, jsonPDF):
        """Merge the per-page "normalText" entries of ``jsonPDF`` into one string.

        Each non-empty text is whitespace-normalised (and written back into
        ``jsonPDF`` in place), terminated with "." if it does not already end
        with one, and every page with text is followed by a blank line.
        Returns "" for an empty/None input or on any error.
        """
        try:
            cl = cleanText.cleanGenText()
            parts = []
            if jsonPDF:
                for page in jsonPDF:
                    texts = jsonPDF[page]["normalText"]
                    if not texts:
                        continue
                    for i, text in enumerate(texts):
                        if text:
                            text = cl.removeTabWhiteSpaceNewLine(text)
                            text = cl.removeExtraSpaceBetweenWords(text)
                        texts[i] = text
                        # NOTE(review): `i - 1 > 0` skips the i == 1 pair;
                        # kept as-is to preserve the produced text.
                        if i - 1 > 0:
                            previous = texts[i - 1]
                            # Guard the empty string: indexing [-1] on it used
                            # to raise and discard the whole document's text.
                            if previous and not previous.endswith("."):
                                parts.append(". ")
                        parts.append(text)
                        if text and not text.endswith("."):
                            parts.append(".")
                    parts.append("\n\n")
            return "".join(parts)
        except Exception:
            return ""

    def getReference(self):
        """Placeholder: not implemented, returns None."""

    def getSupMaterial(self):
        """Placeholder: not implemented, returns None."""

    def removeHeaders(self):
        """Placeholder: not implemented, returns None."""

    def removeFooters(self):
        """Placeholder: not implemented, returns None."""

    def removeReference(self):
        """Placeholder: not implemented, returns None."""