NaimaAqeel committed on
Commit
0632240
·
verified ·
1 Parent(s): 3f3bafc

Update app.py

Browse files
Files changed (1) hide show
  1. app.py +11 -3
app.py CHANGED
@@ -13,11 +13,19 @@ from langchain_community.embeddings import HuggingFaceEmbeddings
13
  from nltk.tokenize import sent_tokenize # Import for sentence segmentation
14
  from transformers import AutoModelForSeq2SeqLM, AutoTokenizer
15
 
16
- # Function to extract text from a PDF file (same as before)
17
  def extract_text_from_pdf(pdf_path):
18
- # ...
 
 
 
 
 
 
 
 
19
 
20
- # Function to extract text from a Word document (fixed indentation)
21
  def extract_text_from_docx(docx_path):
22
  """Extracts text from a Word document."""
23
  text = ""
 
13
  from nltk.tokenize import sent_tokenize # Import for sentence segmentation
14
  from transformers import AutoModelForSeq2SeqLM, AutoTokenizer
15
 
16
+ # Function to extract text from a PDF file
17
  def extract_text_from_pdf(pdf_path):
18
+ text = ""
19
+ try:
20
+ doc = fitz.open(pdf_path)
21
+ for page_num in range(len(doc)):
22
+ page = doc.load_page(page_num)
23
+ text += page.get_text()
24
+ except Exception as e:
25
+ print(f"Error extracting text from PDF: {e}")
26
+ return text
27
 
28
+ # Function to extract text from a Word document
29
  def extract_text_from_docx(docx_path):
30
  """Extracts text from a Word document."""
31
  text = ""