Spaces:
Running
Running
import PyPDF2 | |
import spacy | |
import re | |
# Load spaCy's "en_core_web_sm" pipeline once at import time; split_into_clauses
# reuses this shared object for sentence segmentation.
nlp = spacy.load("en_core_web_sm")
def extract_text_from_pdf(pdf_file):
    """Return the text of every page of a PDF, concatenated in page order.

    Args:
        pdf_file: The PDF source, passed unchanged to ``PyPDF2.PdfReader``.

    Returns:
        One string holding each page's extracted text back to back. No
        separator is inserted between pages.
    """
    reader = PyPDF2.PdfReader(pdf_file)
    # Join once instead of growing a string with ``+=`` per page. The
    # ``or ""`` keeps a page that yields no text from breaking the join.
    return "".join(page.extract_text() or "" for page in reader.pages)
# Enumeration markers that close a clause: "1.", "(a)", "(iv)".
_CLAUSE_MARKER_RE = re.compile(r'\d+\.|\([a-z]\)|\([iv]+\)')
# A leading run of digits (optionally followed by a period) and the
# whitespace around it, stripped from the start of each finished clause.
_LEADING_NUMBER_RE = re.compile(r'^\s*\d+\.?\s*')


def split_into_clauses(text, *, max_clause_length=200):
    """Split text into clause-sized chunks built from whole sentences.

    The text is whitespace-normalised, segmented into sentences with the
    module-level spaCy pipeline ``nlp``, and the sentences are accumulated
    into a pending clause. The pending clause is closed when the sentence
    just added contains an enumeration marker (``1.``, ``(a)``, ``(iv)``)
    anywhere in its text, or when the pending clause grows longer than
    ``max_clause_length`` characters.

    Args:
        text: The raw text to split.
        max_clause_length: Character length above which the pending clause
            is closed even without a marker. Defaults to 200.

    Returns:
        A list of non-empty clause strings, each with surrounding whitespace
        and one leading number (e.g. ``"12. "``) removed.
    """
    # Collapsing every whitespace run (newlines included) to a single space
    # makes any further newline clean-up unnecessary.
    text = re.sub(r'\s+', ' ', text)

    clauses = []
    pending = []
    for sent in nlp(text).sents:
        pending.append(sent.text)
        # Join once and reuse it for both the length check and the result.
        joined = ' '.join(pending)
        if _CLAUSE_MARKER_RE.search(sent.text) or len(joined) > max_clause_length:
            clauses.append(joined)
            pending = []

    # Whatever is left after the last boundary forms the final clause.
    if pending:
        clauses.append(' '.join(pending))

    # Drop leading numbering, then discard clauses that became empty.
    stripped = (_LEADING_NUMBER_RE.sub('', clause.strip()) for clause in clauses)
    return [clause for clause in stripped if clause]