Spaces:

CodeHima
/

TOSRoberta

Running

TOSRoberta / utils /text_processing.py

feat: Add utility functions for text processing and model prediction

8766819 10 months ago

1.32 kB

	import PyPDF2
	import spacy
	import re

	# Load the "en_core_web_sm" spaCy pipeline once at import time;
	# split_into_clauses calls it to segment text into sentences.
	nlp = spacy.load("en_core_web_sm")

	def extract_text_from_pdf(pdf_file):
	    """Return the text of every page of a PDF as one string.

	    Args:
	        pdf_file: The PDF source, passed unchanged to ``PyPDF2.PdfReader``.

	    Returns:
	        The extracted text of all pages, in page order, with no separator
	        inserted between pages.
	    """
	    reader = PyPDF2.PdfReader(pdf_file)
	    # Build the result with a single join rather than ``+=`` per page.
	    # ``or ""`` lets a page that yields no text (None or empty) be skipped
	    # instead of raising TypeError during concatenation.
	    return "".join(page.extract_text() or "" for page in reader.pages)

	def split_into_clauses(text, max_clause_length=200):
	    """Split document text into clauses made of whole sentences.

	    The text is whitespace-normalised, segmented into sentences with the
	    module-level spaCy pipeline ``nlp``, and consecutive sentences are
	    grouped. A group is closed as soon as its latest sentence contains an
	    enumeration marker such as "1.", "(a)" or "(iv)", or when the joined
	    group is longer than ``max_clause_length`` characters.

	    Args:
	        text: Raw document text. Empty or whitespace-only input yields
	            an empty list without invoking spaCy.
	        max_clause_length: Character length above which the current group
	            of sentences is closed. Defaults to 200.

	    Returns:
	        A list of non-empty clause strings, each stripped of surrounding
	        whitespace and of a leading item number (e.g. "3. ").
	    """
	    if not text or not text.strip():
	        return []

	    # Collapse every whitespace run (spaces, tabs, newlines) to one space.
	    # No separate newline pass is needed: none survive this substitution.
	    text = re.sub(r'\s+', ' ', text)

	    # Alternation in Python's ``re`` is a bare ``|``; an escaped ``\|``
	    # would only match a literal pipe character and never a marker.
	    marker_re = re.compile(r'\d+\.|\([a-z]\)|\([iv]+\)')
	    # Applied to an already-stripped clause, so the pattern must start
	    # directly at the digits rather than at a whitespace character.
	    # NOTE(review): the dot is optional, so a bare leading number
	    # ("30 days ...") is removed too - confirm that is wanted.
	    leading_number_re = re.compile(r'^\d+\.?\s+')

	    doc = nlp(text)

	    clauses = []
	    current_clause = []
	    for sent in doc.sents:
	        current_clause.append(sent.text)
	        joined = ' '.join(current_clause)

	        # Close the clause on an enumeration marker or once it grows
	        # past the length limit.
	        if marker_re.search(sent.text) or len(joined) > max_clause_length:
	            clauses.append(joined)
	            current_clause = []

	    # Whatever is left after the last sentence forms the final clause.
	    if current_clause:
	        clauses.append(' '.join(current_clause))

	    # Trim each clause, drop its leading item number, and discard any
	    # clause that ends up empty.
	    trimmed = (leading_number_re.sub('', clause.strip()) for clause in clauses)
	    return [clause for clause in trimmed if clause]