import os
import time
import re
import ast
import pandas as pd
import gradio as gr
from typing import Type, List, Literal
from langchain.text_splitter import RecursiveCharacterTextSplitter
from pydantic import BaseModel, Field

PandasDataFrame = Type[pd.DataFrame]
|
|
class Document(BaseModel):
    """Class for storing a piece of text and associated metadata. Implementation adapted from Langchain code: https://github.com/langchain-ai/langchain/blob/master/libs/core/langchain_core/documents/base.py"""

    page_content: str
    """String text."""
    metadata: dict = Field(default_factory=dict)
    """Arbitrary metadata about the page content (e.g., source, relationships to other
    documents, etc.).
    """
    type: Literal["Document"] = "Document"
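# For illustration, a Document can be constructed directly (the values here are made up):
#   Document(page_content="Some text", metadata={"source": "example.csv"})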
|
|
# Default parameters for splitting text into chunks
split_strat = ["\n\n", "\n", ". ", "! ", "? "]
chunk_size = 500
chunk_overlap = 0
start_index = True
|
|
def determine_file_type(file_path):
    """
    Determine the file type based on its extension.

    Parameters:
        file_path (str): Path to the file.

    Returns:
        str: File extension (e.g., '.csv', '.xlsx', '.parquet').
    """
    return os.path.splitext(file_path)[1].lower()
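# For example, determine_file_type("data/report.xlsx") returns ".xlsx" (the path is hypothetical).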
|
|
|
def parse_file(file_paths, text_column='text'):
    """
    Accepts a list of file paths, determines each file's type based on its extension,
    and passes it to the relevant parsing function.

    Parameters:
        file_paths (list): List of file objects (e.g. from a Gradio file upload), each with a .name attribute.
        text_column (str): Name of the column in CSV/Excel files that contains the text content.

    Returns:
        tuple: A dictionary with file paths as keys and their parsed content (or an error
        message) as values, and a list of the corresponding file names.
    """

    if not isinstance(file_paths, list):
        raise ValueError("Expected a list of file paths.")

    extension_to_parser = {
        '.csv': lambda file_path: parse_csv_or_excel(file_path, text_column),
        '.xlsx': lambda file_path: parse_csv_or_excel(file_path, text_column),
        '.parquet': lambda file_path: parse_csv_or_excel(file_path, text_column)
    }

    parsed_contents = {}
    file_names = []

    for file_path in file_paths:
        print(file_path.name)

        file_extension = determine_file_type(file_path.name)
        if file_extension in extension_to_parser:
            parsed_contents[file_path.name] = extension_to_parser[file_extension](file_path.name)
        else:
            parsed_contents[file_path.name] = f"Unsupported file type: {file_extension}"

        filename_end = get_file_path_end(file_path.name)
        file_names.append(filename_end)

    return parsed_contents, file_names
|
|
|
def text_regex_clean(text):
    """Apply a series of regex fixes to clean up text extracted from documents."""

    # Merge hyphenated words that were split across line breaks, e.g. "compa-\nny" -> "company"
    text = re.sub(r"(\w+)-\n(\w+)", r"\1\2", text)
    # Add a full stop before a paragraph break that directly follows a letter
    text = re.sub(r'(?<=[a-zA-Z])\n\n', '.\n\n', text)
    # Replace single newlines (not part of a paragraph break) with spaces
    text = re.sub(r"(?<!\n\s)\n(?!\s\n)", " ", text.strip())
    # Normalise runs of blank lines to a single paragraph break
    text = re.sub(r"\n\s*\n", "\n\n", text)
    # Collapse double spaces
    text = re.sub(r"  ", " ", text)
    # Insert a sentence/paragraph break where a lowercase letter runs straight into an uppercase one
    text = re.sub(r'(?<=[a-z])(?=[A-Z])', '. \n\n', text)

    return text
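# Illustrative behaviour on a made-up string:
#   text_regex_clean("A hyphen-\nated word.\nNext line")
#   returns "A hyphenated word. Next line"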
|
|
|
def parse_csv_or_excel(file_path, text_column = "text"):
    """
    Read in a CSV, Excel or Parquet file.

    Parameters:
        file_path: List of file objects (each with a .name attribute) to read from.
        text_column (str): Name of the column in the file that contains the text content.

    Returns:
        tuple: The loaded DataFrame, a list of file names, and a status message.
    """

    file_list = [string.name for string in file_path]

    print(file_list)

    # Ignore tokenised versions of the files if present
    data_file_names = [string for string in file_list if "tokenised" not in string]

    file_extension = determine_file_type(data_file_names[0])
    file_name = get_file_path_end(data_file_names[0])
    file_names = [file_name]

    print(file_extension)

    if file_extension == ".csv":
        df = pd.read_csv(data_file_names[0], low_memory=False)
    elif file_extension == ".xlsx":
        df = pd.read_excel(data_file_names[0], engine='openpyxl')
    elif file_extension == ".parquet":
        df = pd.read_parquet(data_file_names[0])
    else:
        print(f"Unsupported file type: {file_extension}")
        return pd.DataFrame(), file_names, 'Please choose a valid file type'

    if text_column not in df.columns:
        return pd.DataFrame(), file_names, 'Please choose a valid column name'

    df['source'] = file_name
    df['page_section'] = ""

    message = "Loaded in file. Now converting to document format."
    print(message)

    return df, file_names, message
|
|
|
def get_file_path_end(file_path):
    # Strip any directory component, keeping the file name (with extension)
    match = re.search(r'(.*[\/\\])?(.+)$', file_path)
    filename_end = match.group(2) if match else ''
    return filename_end
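# For example, get_file_path_end("folder/data.csv") returns "data.csv"
# (both forward and backslashes are handled).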
|
|
def text_to_docs(text_dict: dict, chunk_size: int = chunk_size) -> List[Document]:
    """
    Converts the output of parse_file (a dictionary of file paths to content)
    to a list of Documents with metadata.
    """

    doc_sections = []

    for file_path, content in text_dict.items():
        ext = os.path.splitext(file_path)[1].lower()

        if ext in ['.csv', '.xlsx', '.parquet']:
            # parse_csv_or_excel returns (df, file_names, message), so unpack the DataFrame first
            if isinstance(content, tuple):
                content = content[0]
            docs, message = csv_excel_text_to_docs(content, chunk_size=chunk_size)
        else:
            print(f"Unsupported file type {ext} for {file_path}. Skipping.")
            continue

        filename_end = get_file_path_end(file_path)

        # Tag each document with the file it came from
        for doc in docs:
            doc.metadata["source"] = filename_end

        doc_sections.extend(docs)

    return doc_sections
|
|
def write_out_metadata_as_string(metadata_in):
    # Accept either a single metadata dict or a list of them
    if isinstance(metadata_in, dict):
        metadata_in = [metadata_in]

    # Render each metadata dict as "key: value" pairs, excluding the page_section field
    metadata_string = [' '.join(f'{k}: {v}' for k, v in d.items() if k != 'page_section') for d in metadata_in]
    return metadata_string
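# For example (made-up dict): write_out_metadata_as_string({"row": 1, "source": "a.csv"})
# returns ['row: 1 source: a.csv']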
|
|
|
def combine_metadata_columns(df, cols):
    """Combine the given columns into a single JSON-like metadata string per row."""

    df['metadatas'] = "{"

    for col in cols:
        # Escape double quotes and strip line breaks so the result parses cleanly later
        df[col] = df[col].astype(str).str.replace('"', "'").str.replace('\n', ' ').str.replace('\r', ' ')
        df['metadatas'] = df['metadatas'] + '"' + col + '": "' + df[col] + '", '

    # Close the brace, dropping the trailing comma
    df['metadatas'] = (df['metadatas'] + "}").str.replace(', }', '}')

    return df['metadatas']
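# Illustrative output (hypothetical column values): for a row with source="report.csv"
# and page=2, combine_metadata_columns produces the string
#   '{"source": "report.csv", "page": "2"}'
# which parse_metadata below can turn back into a dict.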
|
|
|
def csv_excel_text_to_docs(df, text_column='text', chunk_size=None) -> List[Document]:
    """Converts a DataFrame's content to a list of Documents with metadata.

    Note: this definition is shadowed at import time by the function of the same
    name further below, which does not chunk the text.
    """

    print("Converting to documents.")

    doc_sections = []
    df[text_column] = df[text_column].astype(str)

    for idx, row in df.iterrows():
        doc_content = row[text_column]

        # Everything apart from the text column becomes metadata
        metadata = {"row": idx + 1}
        for col, value in row.items():
            if col != text_column:
                metadata[col] = value

        metadata_string = write_out_metadata_as_string(metadata)[0]

        if chunk_size:
            text_splitter = RecursiveCharacterTextSplitter(
                chunk_size=chunk_size,
                chunk_overlap=chunk_overlap,
                separators=split_strat,
                add_start_index=start_index
            )

            sections = text_splitter.split_text(doc_content)

            # Prepend the metadata string to each chunk so it remains searchable
            for i, section in enumerate(sections):
                section = '. '.join([metadata_string, section])
                doc = Document(page_content=section,
                               metadata={**metadata, "section": i, "row_section": f"{metadata['row']}-{i}"})
                doc_sections.append(doc)

        else:
            doc = Document(page_content=doc_content, metadata=metadata)
            doc_sections.append(doc)

    message = "Data converted to document format. Now creating/loading document embeddings."
    print(message)

    return doc_sections, message
|
|
def clean_line_breaks(text):
    # Replace newlines and carriage returns with spaces ('\r\n' is already covered by these two)
    return text.replace('\n', ' ').replace('\r', ' ')
|
|
|
def parse_metadata(row):
    try:
        row_str = str(row) if not isinstance(row, str) else row

        row_str = row_str.replace('\n', ' ').replace('\r', ' ')

        # Interpret the string as a Python dict literal
        metadata = ast.literal_eval(row_str)

        return metadata
    except (SyntaxError, ValueError) as e:
        print(f"Failed to parse metadata: {row_str}")
        print(f"Error: {e}")
        return None
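# Example round trip with the string format built by combine_metadata_columns:
#   parse_metadata('{"source": "report.csv", "page": "2"}')
#   returns {'source': 'report.csv', 'page': '2'}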
|
|
|
def csv_excel_text_to_docs(df, text_column='text', chunk_size=None, progress=gr.Progress()) -> List[Document]:
    """Converts a DataFrame's content to a list of Documents containing page_content and associated metadata. chunk_size is accepted for signature compatibility but is not used here: each row becomes a single document."""

    ingest_tic = time.perf_counter()

    doc_sections = []
    df[text_column] = df[text_column].astype(str).str.strip()

    # All non-text columns are folded into a single metadata string per row
    cols = [col for col in df.columns if col != text_column]
    df["metadata"] = combine_metadata_columns(df, cols)

    df = df.rename(columns={text_column: "page_content"})

    doc_sections = [Document(page_content=row['page_content'],
                             metadata=parse_metadata(row["metadata"]))
                    for index, row in progress.tqdm(df.iterrows(), desc="Splitting up text", unit="rows")]

    ingest_toc = time.perf_counter()

    ingest_time_out = f"Preparing documents took {ingest_toc - ingest_tic:0.1f} seconds"
    print(ingest_time_out)

    return doc_sections, "Finished splitting documents"
|
|
def pull_out_data(series):
    # Each cell holds a stringified tuple; parse it with ast.literal_eval rather than eval,
    # which is safer (this assumes the cells contain plain Python literals)
    to_tuple = lambda x: ast.literal_eval(x)

    series_tup = series.apply(to_tuple)

    # Keep the second element of each tuple (the content of interest)
    series_tup_content = list(zip(*series_tup))[1]

    series = pd.Series(list(series_tup_content))

    return series
|
|
|
def docs_from_csv(df):
    documents = []

    page_content = pull_out_data(df["0"])
    metadatas = pull_out_data(df["1"])

    for x in range(0, len(df)):
        new_doc = Document(page_content=page_content[x], metadata=metadatas[x])
        documents.append(new_doc)

    return documents
|
|
|
def docs_from_lists(docs, metadatas):
    documents = []

    for x, doc in enumerate(docs):
        new_doc = Document(page_content=doc, metadata=metadatas[x])
        documents.append(new_doc)

    return documents
|
|
|
def docs_elements_from_csv_save(docs_path="documents.csv"):
    documents = pd.read_csv(docs_path)

    docs_out = docs_from_csv(documents)

    out_df = pd.DataFrame(docs_out)

    docs_content = pull_out_data(out_df[0].astype(str))
    docs_meta = pull_out_data(out_df[1].astype(str))

    doc_sources = [d['source'] for d in docs_meta]

    return out_df, docs_content, docs_meta, doc_sources
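

# A minimal, self-contained usage sketch (illustrative only: the passages and
# metadata below are made up). It exercises docs_from_lists, which needs no files:
if __name__ == "__main__":
    example_docs = docs_from_lists(
        ["First passage of text.", "Second passage of text."],
        [{"source": "example"}, {"source": "example"}],
    )
    print(f"Built {len(example_docs)} documents; first metadata: {example_docs[0].metadata}")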
|
|