GVAmaresh
ini
c87f53a
import PyPDF2
from PIL import Image
import pytesseract
import io
import sys
# def extract_text(file_path, output_file_path):
# text = ""
# try:
# if file_path.lower().endswith(".pdf"):
# text = extract_text_from_pdf(file_path)
# else:
# print("Unsupported file format")
# with open(output_file_path, "w") as output_file:
# print("Run output")
# for line in text.splitlines():
# print(line)
# output_file.write(text)
# print(f"Extracted text saved to {output_file_path}")
# except Exception as e:
# print("An error occurred:", e)
# def extract_text_from_image(file_path):
# image_path = file_path
# img = Image.open(image_path)
# text = pytesseract.image_to_string(img)
# print(text[:-1])
import PyPDF2
def extract_text_from_pdf(pdf_file_path):
    """Return the text of the first PDF page that contains "ABSTRACT".

    Pages are scanned in document order; the scan stops at the first page
    whose extracted text contains the (case-sensitive) marker "ABSTRACT".

    Args:
        pdf_file_path: Path of the PDF file to open (opened in binary mode).

    Returns:
        The matching page's text followed by a newline, an empty string if
        no page contains the marker, or None if opening or parsing the file
        failed (the error is printed to stdout rather than raised).
    """
    try:
        with open(pdf_file_path, "rb") as file:
            pdf_reader = PyPDF2.PdfReader(file)
            for page in pdf_reader.pages:
                # Guard against a page that yields no text (None or ""), so
                # one blank/scanned page does not abort the whole search.
                page_text = page.extract_text() or ""
                if "ABSTRACT" in page_text:
                    return page_text + "\n"
        # Every page was scanned without finding the marker.
        return ""
    except Exception as e:
        # Deliberate best-effort boundary: callers receive None on any
        # failure (missing file, unreadable or corrupt PDF, ...).
        print("An error occurred:", e)
        return None
# if __name__ == "__main__":
# import PyPDF2
# file_path = "./report.pdf"
# output_file_path = "./extracted_text.txt"
# extract_text_from_pdf(file_path)  # NOTE: takes only the PDF path; writing to output_file_path must be done by the caller