import re

import pandas as pd
from dotenv import load_dotenv
from llama_index.core import SimpleDirectoryReader
from llama_parse import LlamaParse

load_dotenv()

MIN_PARAGRAPH_LENGTH = 50


def extract_paragraphs(markdown_text):
    """
    Extract paragraphs from a markdown text.
    """
    # Split the text into paragraphs on blank lines
    paragraphs = re.split(r"\n\n+", markdown_text)
    # Remove leading and trailing whitespace from each paragraph
    paragraphs = [p.strip() for p in paragraphs if p.strip()]
    # Keep only paragraphs that are long enough and are not headings
    paragraphs = [
        p
        for p in paragraphs
        if len(p) >= MIN_PARAGRAPH_LENGTH and not p.startswith("#")
    ]
    print(f"created {len(paragraphs)} paragraphs\n", paragraphs)
    return paragraphs


def extract_endpoint_llama(file_paths):
    """
    Extract PDFs using LlamaParse.

    Returns a list [extracted_data, df], where extracted_data is a list of
    dicts holding each paper's file name and paragraph chunks, and df is the
    same data as a pandas DataFrame.
    """
    # Set up the parser ("markdown" and "text" result types are available)
    parser = LlamaParse(result_type="markdown")

    # Use SimpleDirectoryReader to load the files, routing PDFs through LlamaParse
    file_extractor = {".pdf": parser}
    documents = SimpleDirectoryReader(
        input_files=file_paths, file_extractor=file_extractor
    ).load_data()

    extracted_data = []
    for doc in documents:
        print(doc.text[:500])
        paragraphs = extract_paragraphs(doc.text)
        data = {
            "paper": doc.metadata["file_name"],
            "chunks": paragraphs,
        }
        extracted_data.append(data)

    df = pd.DataFrame(extracted_data)
    return [extracted_data, df]
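

# --- Usage sketch (illustrative, not part of the original file) ---
# Assumes the LlamaParse/LlamaCloud API key is provided via the .env file
# loaded by load_dotenv() above; the PDF path below is a hypothetical placeholder.
if __name__ == "__main__":
    sample_paths = ["papers/example_paper.pdf"]  # hypothetical input file
    extracted, chunks_df = extract_endpoint_llama(sample_paths)
    # chunks_df has one row per paper: its file name and the list of paragraph chunks
    print(chunks_df.head())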