TOSRoberta / utils /text_processing.py
CodeHima's picture
feat: Add utility functions for text processing and model prediction
8766819
raw
history blame
1.32 kB
import PyPDF2
import spacy
import re
# Load the `en_core_web_sm` spaCy pipeline once, at import time.
# split_into_clauses() calls this shared object to segment text into sentences.
nlp = spacy.load("en_core_web_sm")
def extract_text_from_pdf(pdf_file):
    """Return the text of every page in a PDF, concatenated in page order.

    Args:
        pdf_file: The PDF source, passed unchanged to ``PyPDF2.PdfReader``
            (presumably a path or a binary file-like object -- confirm
            against callers).

    Returns:
        One string holding the extracted text of all pages. No separator is
        inserted between pages.
    """
    reader = PyPDF2.PdfReader(pdf_file)
    # Build the result with a single join instead of repeated ``+=``.
    # ``or ""`` is defensive: a page whose extraction yields nothing (None)
    # contributes no text rather than raising TypeError.
    return "".join(page.extract_text() or "" for page in reader.pages)
# Any run of whitespace (spaces, tabs, newlines).
_WHITESPACE_RUN_RE = re.compile(r'\s+')
# Enumeration markers such as "3.", "(a)" or "(iv)" appearing anywhere in a
# sentence; a sentence containing one closes the clause being accumulated.
_CLAUSE_MARKER_RE = re.compile(r'\d+\.|\([a-z]\)|\([iv]+\)')
# A leading number, optionally followed by a dot, at the start of a clause.
_LEADING_NUMBER_RE = re.compile(r'^\s*\d+\.?\s*')


def split_into_clauses(text, *, max_clause_length=200):
    """Split a block of text into clause-sized chunks.

    The text is whitespace-normalised, segmented into sentences with the
    module-level spaCy pipeline ``nlp``, and consecutive sentences are grouped
    into one clause until either a sentence contains an enumeration marker
    (``1.``, ``(a)``, ``(iv)`` ...) or the accumulated clause grows longer
    than ``max_clause_length`` characters.

    Args:
        text: The raw text to split.
        max_clause_length: Character length above which the accumulated
            clause is closed even without an enumeration marker. Defaults
            to 200.

    Returns:
        A list of non-empty clause strings, each with surrounding whitespace
        and any leading number (e.g. ``"12. "``) removed.
    """
    # Collapse every whitespace run -- newlines included -- to a single space,
    # so no separate newline-normalisation pass is needed afterwards.
    normalized = _WHITESPACE_RUN_RE.sub(' ', text)

    doc = nlp(normalized)

    clauses = []
    pending = []  # sentences gathered for the clause currently being built
    for sent in doc.sents:
        pending.append(sent.text)
        joined = ' '.join(pending)
        # Close the clause on an enumeration marker or once it is too long.
        if _CLAUSE_MARKER_RE.search(sent.text) or len(joined) > max_clause_length:
            clauses.append(joined)
            pending = []

    # Whatever is left over forms the final clause.
    if pending:
        clauses.append(' '.join(pending))

    # Trim surrounding whitespace, strip a leading number, and drop clauses
    # that end up empty.
    cleaned_clauses = []
    for clause in clauses:
        stripped = _LEADING_NUMBER_RE.sub('', clause.strip())
        if stripped:
            cleaned_clauses.append(stripped)
    return cleaned_clauses