import io
import sys

import PyPDF2
import pytesseract
from PIL import Image

# NOTE: `io`, `sys`, `Image` and `pytesseract` are not referenced by the active
# code visible in this file; `Image` and `pytesseract` are only used by the
# disabled helpers below. The imports are kept in case other code relies on them.

# --- Disabled legacy code (kept for reference; indentation reconstructed) ---
#
# def extract_text(file_path, output_file_path):
#     text = ""
#     try:
#         if file_path.lower().endswith(".pdf"):
#             text = extract_text_from_pdf(file_path)
#         else:
#             print("Unsupported file format")
#         with open(output_file_path, "w") as output_file:
#             print("Run output")
#             for line in text.splitlines():
#                 print(line)
#             output_file.write(text)
#         print(f"Extracted text saved to {output_file_path}")
#     except Exception as e:
#         print("An error occurred:", e)
#
# def extract_text_from_image(file_path):
#     image_path = file_path
#     img = Image.open(image_path)
#     text = pytesseract.image_to_string(img)
#     print(text[:-1])


def extract_text_from_pdf(pdf_file_path, *, marker="ABSTRACT"):
    """Return the text of the first PDF page that contains *marker*.

    Pages are scanned in document order and the scan stops at the first page
    whose extracted text contains ``marker`` (case-sensitive substring match).

    Args:
        pdf_file_path: Path of the PDF file to read; passed to ``open``.
        marker: Substring to look for in each page's text. Keyword-only so
            that a stray second positional argument is rejected instead of
            being silently used as the marker. Defaults to ``"ABSTRACT"``.

    Returns:
        The matching page's text followed by a newline, ``""`` if no page
        contains ``marker``, or ``None`` if any error occurred while opening
        or parsing the file (the error is printed to stdout).
    """
    try:
        with open(pdf_file_path, "rb") as file:
            pdf_reader = PyPDF2.PdfReader(file)
            for page in pdf_reader.pages:
                # Defensive: treat a missing/None extraction result as an
                # empty page so the substring test below cannot raise.
                page_text = page.extract_text() or ""
                if marker in page_text:
                    # Only the first matching page is wanted.
                    return page_text + "\n"
        return ""
    except Exception as e:
        # Deliberate best-effort: report the problem and signal failure with
        # None rather than propagating, as callers may rely on this contract.
        print("An error occurred:", e)
        return None


# --- Disabled script entry point (kept for reference) ---
# NOTE(review): this call passes two positional arguments, which does not
# match extract_text_from_pdf's signature; fix before re-enabling.
#
# if __name__ == "__main__":
#     import PyPDF2
#     file_path = "./report.pdf"
#     output_file_path = "./extracted_text.txt"
#     extract_text_from_pdf(file_path, output_file_path)