Spaces:
Sleeping
Sleeping
Update app.py
Browse files
app.py
CHANGED
@@ -17,10 +17,10 @@ import nltk
|
|
17 |
nltk.download('punkt')
|
18 |
|
19 |
# Function to extract text from a PDF file
|
20 |
-
def extract_text_from_pdf(pdf_file):
|
21 |
text = ""
|
22 |
try:
|
23 |
-
pdf_reader = PyPDF2.PdfReader(io.BytesIO(
|
24 |
for page in pdf_reader.pages:
|
25 |
text += page.extract_text()
|
26 |
except Exception as e:
|
@@ -28,10 +28,10 @@ def extract_text_from_pdf(pdf_file):
|
|
28 |
return text
|
29 |
|
30 |
# Function to extract text from a Word document
|
31 |
-
def extract_text_from_docx(
|
32 |
text = ""
|
33 |
try:
|
34 |
-
doc = Document(io.BytesIO(
|
35 |
text = "\n".join([para.text for para in doc.paragraphs])
|
36 |
except Exception as e:
|
37 |
print(f"Error extracting text from DOCX: {e}")
|
@@ -74,9 +74,9 @@ def upload_files(files):
|
|
74 |
try:
|
75 |
for file in files:
|
76 |
if file.name.endswith('.pdf'):
|
77 |
-
text = extract_text_from_pdf(file.
|
78 |
elif file.name.endswith('.docx'):
|
79 |
-
text = extract_text_from_docx(file.
|
80 |
else:
|
81 |
return {"error": "Unsupported file format"}
|
82 |
|
|
|
17 |
nltk.download('punkt')
|
18 |
|
19 |
# Function to extract text from a PDF file
|
20 |
+
def extract_text_from_pdf(pdf_data):
|
21 |
text = ""
|
22 |
try:
|
23 |
+
pdf_reader = PyPDF2.PdfReader(io.BytesIO(pdf_data))
|
24 |
for page in pdf_reader.pages:
|
25 |
text += page.extract_text()
|
26 |
except Exception as e:
|
|
|
28 |
return text
|
29 |
|
30 |
# Function to extract text from a Word document
|
31 |
+
def extract_text_from_docx(docx_data):
|
32 |
text = ""
|
33 |
try:
|
34 |
+
doc = Document(io.BytesIO(docx_data))
|
35 |
text = "\n".join([para.text for para in doc.paragraphs])
|
36 |
except Exception as e:
|
37 |
print(f"Error extracting text from DOCX: {e}")
|
|
|
74 |
try:
|
75 |
for file in files:
|
76 |
if file.name.endswith('.pdf'):
|
77 |
+
text = extract_text_from_pdf(file.data)
|
78 |
elif file.name.endswith('.docx'):
|
79 |
+
text = extract_text_from_docx(file.data)
|
80 |
else:
|
81 |
return {"error": "Unsupported file format"}
|
82 |
|