Spaces:
Running
Running
Update app.py
Browse files
app.py
CHANGED
@@ -70,9 +70,23 @@ except Exception as e:
|
|
70 |
|
71 |
|
72 |
def Parsing(parsed_text):
|
73 |
-
|
74 |
-
|
75 |
-
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
76 |
|
77 |
|
78 |
#Added more stopwords to avoid irrelevant terms
|
|
|
70 |
|
71 |
|
72 |
def Parsing(parsed_text):
|
73 |
+
'''
|
74 |
+
Process a PDF file and extract its text content
|
75 |
+
parsed_text: Can be a file object with a 'name' attribute or a file path string
|
76 |
+
'''
|
77 |
+
try:
|
78 |
+
# Handle different input types
|
79 |
+
if hasattr(parsed_text, 'name'):
|
80 |
+
file_path = parsed_text.name
|
81 |
+
else:
|
82 |
+
file_path = parsed_text
|
83 |
+
|
84 |
+
# Extract text from PDF
|
85 |
+
raw_party = textract.process(file_path, encoding='ascii', method='pdfminer')
|
86 |
+
return clean(raw_party)
|
87 |
+
except Exception as e:
|
88 |
+
print(f"Error parsing PDF: {str(e)}")
|
89 |
+
return f"Error parsing PDF: {str(e)}"
|
90 |
|
91 |
|
92 |
#Added more stopwords to avoid irrelevant terms
|