# Spaces:
#
# seanpedrickcase
# /
#
# data_text_search
#
# Sleeping
#
# File size: 14,524 Bytes

# Install/ import stuff we need

import os
import time
import re
import ast
import pandas as pd
import gradio as gr
from typing import Type, List, Literal
from langchain.text_splitter import RecursiveCharacterTextSplitter

from pydantic import BaseModel, Field

# Creating an alias for pandas DataFrame using Type
PandasDataFrame = Type[pd.DataFrame]

# class Document(BaseModel):
#     """Class for storing a piece of text and associated metadata. Implementation adapted from Langchain code: https://github.com/langchain-ai/langchain/blob/master/libs/core/langchain_core/documents/base.py"""

#     page_content: str
#     """String text."""
#     metadata: dict = Field(default_factory=dict)
#     """Arbitrary metadata about the page content (e.g., source, relationships to other
#         documents, etc.).
#     """
#     type: Literal["Document"] = "Document"

class Document(BaseModel):
    """Class for storing a piece of text and associated metadata. Implementation adapted from Langchain code: https://github.com/langchain-ai/langchain/blob/master/libs/core/langchain_core/documents/base.py"""

    page_content: str
    """String text."""
    # Defaults to a fresh empty dict for every instance (default_factory).
    metadata: dict = Field(default_factory=dict)
    """Arbitrary metadata about the page content (e.g., source, relationships to other
        documents, etc.).
    """
    # Fixed tag: the only accepted value (and the default) is the string "Document".
    type: Literal["Document"] = "Document"

# Module-level defaults for chunking text.
# Separator strings forwarded to the text splitter built in the chunking
# version of csv_excel_text_to_docs.
split_strat = ["\n\n", "\n", ". ", "! ", "? "]
# Default value of the chunk_size parameter of text_to_docs
# (presumably measured in characters - TODO confirm against the splitter).
chunk_size = 500
# Overlap between neighbouring chunks, forwarded to the text splitter.
chunk_overlap = 0
# Flag forwarded to the text splitter together with the values above.
start_index = True

## Parse files
def determine_file_type(file_path):
    """
    Return the lower-cased extension of a file path.

    Parameters:
        file_path (str): Path to the file.

    Returns:
        str: The extension including its leading dot (e.g. '.pdf', '.csv'),
            or an empty string when the path has no extension.
    """
    _, extension = os.path.splitext(file_path)
    return extension.lower()

def parse_file(file_paths, text_column='text'):
    """
    Parse every file in a list according to its extension.

    Parameters:
        file_paths (list): File objects; each must expose a ``.name`` attribute holding its path.
        text_column (str): Name of the column in CSV/Excel/Parquet files that contains the text content.

    Returns:
        tuple: ``(parsed_contents, file_names)`` where ``parsed_contents`` maps each path to the
            output of parse_csv_or_excel (or to an "Unsupported file type" message) and
            ``file_names`` lists the final path component of every file, in input order.

    Raises:
        ValueError: If ``file_paths`` is not a list.
    """
    if not isinstance(file_paths, list):
        raise ValueError("Expected a list of file paths.")

    # Only tabular formats are handled; they all go through the same reader.
    # Parsers for .pdf/.docx/.txt/.html/.htm are not wired in.
    tabular_extensions = ('.csv', '.xlsx', '.parquet')

    parsed_contents = {}
    file_names = []

    for uploaded in file_paths:
        path = uploaded.name
        print(path)

        extension = determine_file_type(path)
        if extension in tabular_extensions:
            # NOTE(review): parse_csv_or_excel iterates its first argument and reads
            # `.name` from each item, while a plain path string is passed here - confirm
            # which calling convention is intended.
            parsed_contents[path] = parse_csv_or_excel(path, text_column)
        else:
            parsed_contents[path] = f"Unsupported file type: {extension}"

        file_names.append(get_file_path_end(path))

    return parsed_contents, file_names

def text_regex_clean(text):
    """
    Tidy raw extracted text with a fixed sequence of regex substitutions.

    The first two rules run on the text as given; the text is then stripped of
    leading/trailing whitespace and the remaining rules are applied in order.

    Parameters:
        text (str): Text to clean.

    Returns:
        str: The cleaned text.
    """
    rules_before_strip = (
        # Merge words that were hyphenated across a line break
        (r"(\w+)-\n(\w+)", r"\1\2"),
        # A paragraph break directly after a letter gets a full stop
        (r'(?<=[a-zA-Z])\n\n', '.\n\n'),
    )
    rules_after_strip = (
        # Single newlines in the middle of sentences become spaces
        (r"(?<!\n\s)\n(?!\s\n)", " "),
        # Collapse runs of blank lines into one paragraph break
        (r"\n\s*\n", "\n\n"),
        # Squeeze each pair of spaces into one (single pass)
        (r"  ", " "),
        # Lower-case letter directly followed by a capital: insert a full stop and paragraph break
        (r'(?<=[a-z])(?=[A-Z])', '. \n\n'),
    )

    cleaned = text
    for pattern, replacement in rules_before_strip:
        cleaned = re.sub(pattern, replacement, cleaned)

    cleaned = cleaned.strip()
    for pattern, replacement in rules_after_strip:
        cleaned = re.sub(pattern, replacement, cleaned)

    return cleaned

def parse_csv_or_excel(file_path, text_column = "text"):
    """
    Read the first non-tokenised CSV, Excel or Parquet file from a list of uploaded files.

    Parameters:
        file_path (list): File objects exposing a ``.name`` attribute that holds the path.
            Paths containing "tokenised" are ignored and only the first remaining file is read.
        text_column (str): Name of the column in the file that contains the text content.

    Returns:
        tuple: Always three values, ``(df, file_names, message)``.
            On success ``df`` is the loaded DataFrame with two added columns ('source' set to
            the file name and an empty 'page_section'), ``file_names`` is a one-item list with
            the file name and ``message`` is a status string.
            On failure ``df`` is an empty DataFrame, ``file_names`` is a one-item list holding
            the error text and ``message`` repeats that text, so callers can unpack three
            values on every path.
    """

    def _failure(reason):
        # Same first two values the error paths have always returned, plus the message slot.
        return pd.DataFrame(), [reason], reason

    # One reader per supported extension, replacing three near-identical branches.
    readers = {
        ".csv": lambda path: pd.read_csv(path, low_memory=False),
        ".xlsx": lambda path: pd.read_excel(path, engine='openpyxl'),
        ".parquet": lambda path: pd.read_parquet(path),
    }

    # Treat a missing upload (None) like an empty list instead of failing on iteration.
    file_list = [string.name for string in file_path] if file_path else []

    print(file_list)

    data_file_names = [name for name in file_list if "tokenised" not in name]

    if not data_file_names:
        return _failure('No data file found. Please upload a CSV, Excel or Parquet file')

    data_path = data_file_names[0]
    file_extension = determine_file_type(data_path)
    file_name = get_file_path_end(data_path)

    print(file_extension)

    reader = readers.get(file_extension)
    if reader is None:
        print(f"Unsupported file type: {file_extension}")
        return _failure('Please choose a valid file type')

    df = reader(data_path)
    if text_column not in df.columns:
        return _failure('Please choose a valid column name')

    df['source'] = file_name
    df['page_section'] = ""

    message = "Loaded in file. Now converting to document format."
    print(message)

    return df, [file_name], message

def get_file_path_end(file_path):
    """
    Return the final component of a path written with '/' or '\\' separators.

    Parameters:
        file_path (str): Path to inspect.

    Returns:
        str: Everything after the last separator, or an empty string when nothing matches
            (e.g. for an empty path).
    """
    found = re.search(r'(.*[\/\\])?(.+)$', file_path)
    if not found:
        return ''
    return found.group(2)

# +
# Convert parsed text to docs
# -

def text_to_docs(text_dict: dict, chunk_size: int = chunk_size) -> List[Document]:
    """
    Convert parsed file contents into a flat list of Documents with metadata.

    Parameters:
        text_dict (dict): Maps file paths to their parsed content. Only paths ending in
            .csv or .xlsx are processed; their content is handed to csv_excel_text_to_docs
            (presumably a DataFrame - confirm against the caller). Other entries are
            skipped with a printed notice.
        chunk_size (int): Chunk size forwarded to csv_excel_text_to_docs.

    Returns:
        List[Document]: Documents from every processed file, each with
            ``metadata["source"]`` set to the file's name.
    """
    doc_sections = []

    for file_path, content in text_dict.items():
        ext = os.path.splitext(file_path)[1].lower()

        if ext not in ('.csv', '.xlsx'):
            print(f"Unsupported file type {ext} for {file_path}. Skipping.")
            continue

        # chunk_size has to go by keyword: the second positional parameter of
        # csv_excel_text_to_docs is text_column, so passing it positionally would
        # make the function look for a column named after the chunk size.
        # The second return value is a status message that is not needed here.
        docs, _ = csv_excel_text_to_docs(content, chunk_size=chunk_size)

        filename_end = get_file_path_end(file_path)

        # Add filename as metadata
        for doc in docs:
            doc.metadata["source"] = filename_end

        doc_sections.extend(docs)

    return doc_sections

def write_out_metadata_as_string(metadata_in):
    """
    Render metadata dictionaries as "key: value" strings.

    Parameters:
        metadata_in (dict | list[dict]): One metadata dictionary or a list of them.

    Returns:
        list[str]: One string per dictionary, with entries joined by two spaces.
            The 'page_section' key is left out.
    """
    # A lone dictionary is handled as a one-item list
    records = [metadata_in] if isinstance(metadata_in, dict) else metadata_in

    metadata_string = []
    for record in records:
        parts = [f'{key}: {value}' for key, value in record.items() if key != 'page_section']
        metadata_string.append('  '.join(parts))

    return metadata_string

def combine_metadata_columns(df, cols):
    """
    Build one dictionary-style string per row from the given columns.

    Each row becomes text of the form ``{"col": "value", ...}`` with every value written
    as a double-quoted string.

    Note that ``df`` is modified in place: the listed columns are converted to cleaned
    strings (double quotes turned into single quotes, line breaks into spaces) and the
    helper columns 'metadatas' and 'blank_column' are added.

    Parameters:
        df (pandas.DataFrame): Frame holding the metadata columns.
        cols (list): Names of the columns to combine, in output order.

    Returns:
        pandas.Series: The 'metadatas' column of ``df``.
    """
    df['metadatas'] = "{"
    df['blank_column'] = ""

    # Applied one after another, in this order, to every value
    substitutions = (('"', "'"), ('\n', ' '), ('\r', ' '), ('\r\n', ' '))

    for col in cols:
        cleaned = df[col].astype(str)
        for old, new in substitutions:
            cleaned = cleaned.str.replace(old, new)
        df[col] = cleaned.str.cat(df['blank_column'].astype(str), sep="")

        df['metadatas'] = df['metadatas'] + '"' + col + '": "' + df[col] + '", '

    # Close the braces and drop the separator left behind by the last entry
    closed = df['metadatas'] + "}"
    df['metadatas'] = closed.str.replace(', }', '}')

    return df['metadatas']

def csv_excel_text_to_docs(df, text_column='text', chunk_size=None) -> List[Document]:
    """
    Convert each row of a DataFrame into one or more Documents with metadata.

    NOTE: a later definition of csv_excel_text_to_docs in this module rebinds the name,
    so this chunking version is only reachable if that later definition is removed or renamed.

    Parameters:
        df (pandas.DataFrame): Source rows. The text column is converted to ``str`` in place.
        text_column (str): Name of the column holding the document text.
        chunk_size (int | None): When truthy, each row's text is split into chunks of this
            size and every chunk is prefixed with the row's metadata string; otherwise one
            Document is created per row.

    Returns:
        tuple: ``(doc_sections, message)`` - the list of Documents and a status string
            (the ``List[Document]`` annotation only describes the first element).
    """
    print("Converting to documents.")

    doc_sections = []
    df[text_column] = df[text_column].astype(str) # Ensure column is a string column

    # Build the splitter once rather than per row. The keyword names must be the ones the
    # splitter accepts: `separators` for the separator list and `add_start_index` for the flag.
    text_splitter = None
    if chunk_size:
        text_splitter = RecursiveCharacterTextSplitter(
            chunk_size=chunk_size,
            chunk_overlap=chunk_overlap,
            separators=split_strat,
            add_start_index=start_index,
        )

    # For each row in the dataframe
    for idx, row in df.iterrows():
        # Extract the text content for the document
        doc_content = row[text_column]

        # Metadata is a 1-based row number (assumes an integer index) plus every other column
        metadata = {"row": idx + 1}
        metadata.update({col: value for col, value in row.items() if col != text_column})

        if text_splitter is None:
            # No chunking requested: one Document per row
            doc_sections.append(Document(page_content=doc_content, metadata=metadata))
            continue

        metadata_string = write_out_metadata_as_string(metadata)[0]

        # One Document per chunk, each prefixed with the row's metadata string
        for i, section in enumerate(text_splitter.split_text(doc_content)):
            doc_sections.append(
                Document(
                    page_content='. '.join([metadata_string, section]),
                    metadata={**metadata, "section": i, "row_section": f"{metadata['row']}-{i}"},
                )
            )

    # Bound outside the loop so an empty DataFrame still returns a message
    message = "Data converted to document format. Now creating/loading document embeddings."
    print(message)

    return doc_sections, message

def clean_line_breaks(text):
    """
    Replace every line break in ``text`` with a single space.

    Parameters:
        text (str): Text that may contain '\\n', '\\r' or '\\r\\n' line breaks.

    Returns:
        str: The text with each line break replaced by one space.
    """
    # The two-character '\r\n' must be handled first; replacing '\n' and '\r' separately
    # beforehand would turn it into two spaces and leave nothing for this replace to match.
    return text.replace('\r\n', ' ').replace('\n', ' ').replace('\r', ' ')

def parse_metadata(row):
    """
    Parse a metadata value written as a Python literal (e.g. a dictionary string).

    Parameters:
        row: A string holding a Python literal, or any object whose ``str()`` form is one.

    Returns:
        The parsed Python object, or ``None`` when the text cannot be parsed
        (the failure is printed).
    """
    # Convert the row to a string if it's not already
    row_str = row if isinstance(row, str) else str(row)

    # str.replace returns a new string, so the result has to be assigned back;
    # otherwise raw line breaks stay inside quoted values and break parsing.
    row_str = row_str.replace('\r\n', ' ').replace('\n', ' ').replace('\r', ' ')

    try:
        return ast.literal_eval(row_str)
    except (SyntaxError, ValueError) as e:
        # literal_eval raises ValueError for well-formed text that is not a literal
        print(f"Failed to parse metadata: {row_str}")
        print(f"Error: {e}")
        return None  # or some default value

def csv_excel_text_to_docs(df, text_column='text', chunk_size=None, progress=gr.Progress()) -> List[Document]:
    """
    Turn every row of a DataFrame into a Document holding its text and parsed metadata.

    The input frame is modified in place: the text column is converted to stripped strings,
    and combine_metadata_columns rewrites the remaining columns and adds helper columns
    before the combined string is stored in a new 'metadata' column.

    Parameters:
        df (pandas.DataFrame): Source rows.
        text_column (str): Name of the column holding the document text.
        chunk_size: Accepted but not used by this implementation.
        progress: Progress tracker whose ``tqdm`` method wraps the row iteration.

    Returns:
        tuple: ``(doc_sections, message)`` - the list of Documents (one per row) and the
            fixed status string "Finished splitting documents".
    """
    ingest_tic = time.perf_counter()

    # Ensure the text column holds stripped strings
    df[text_column] = df[text_column].astype(str).str.strip()

    # Every other column contributes to the metadata string
    metadata_columns = [name for name in df.columns if name != text_column]
    df["metadata"] = combine_metadata_columns(df, metadata_columns)

    df = df.rename(columns={text_column:"page_content"})

    # Build one Document per row, reporting progress as rows are consumed
    doc_sections = []
    for _, record in progress.tqdm(df.iterrows(), desc = "Splitting up text", unit = "rows"):
        doc_sections.append(
            Document(
                page_content=record['page_content'],
                metadata=parse_metadata(record["metadata"]),
            )
        )

    ingest_toc = time.perf_counter()

    ingest_time_out = f"Preparing documents took {ingest_toc - ingest_tic:0.1f} seconds"
    print(ingest_time_out)

    return doc_sections, "Finished splitting documents"

# # Functions for working with documents after loading them back in

def pull_out_data(series):
    """
    Evaluate each string in a series and keep the second item of every result.

    Each element is expected to be the text of a Python sequence with at least two items
    (presumably ``(field_name, value)`` pairs saved from Document objects - confirm against
    the code that writes the file).

    Parameters:
        series (pandas.Series): Strings to evaluate.

    Returns:
        pandas.Series: The item at position 1 of every evaluated element, with a fresh
            default index. An empty input gives an empty Series.
    """
    # NOTE(security): eval executes whatever expression the string contains. Only use this
    # on files produced by this application; consider ast.literal_eval if every stored
    # value is a plain literal.
    parsed = [eval(item) for item in series]

    # Nothing to unpack for an empty input (indexing the transposed result would fail)
    if not parsed:
        return pd.Series([], dtype=object)

    return pd.Series([item[1] for item in parsed])

def docs_from_csv(df):
    """
    Rebuild Document objects from a DataFrame of saved documents.

    Parameters:
        df (pandas.DataFrame): Frame with string columns "0" and "1"; pull_out_data extracts
            the page content from column "0" and the metadata from column "1".

    Returns:
        list: One Document per row of ``df``, in row order.
    """
    page_content = pull_out_data(df["0"])
    metadatas = pull_out_data(df["1"])

    # Both series come back with a fresh 0..n-1 index, so they pair up row by row
    return [
        Document(page_content=content, metadata=metadata)
        for content, metadata in zip(page_content, metadatas)
    ]

def docs_from_lists(docs, metadatas):
    """
    Pair texts with their metadata to build Document objects.

    Parameters:
        docs (list): Page-content strings.
        metadatas (list): Metadata for each text, looked up by the same position.

    Returns:
        list: One Document per entry in ``docs``, in the same order.
    """
    return [
        Document(page_content=text, metadata=metadatas[position])
        for position, text in enumerate(docs)
    ]

def docs_elements_from_csv_save(docs_path="documents.csv"):
    """
    Load saved documents from a CSV file and split them into their parts.

    Parameters:
        docs_path (str): Path to the CSV file read with pandas.

    Returns:
        tuple: ``(out_df, docs_content, docs_meta, doc_sources)`` - a DataFrame built from
            the rebuilt Documents, the values pulled from its column 0 (page content,
            judging by the variable names) and column 1 (metadata), and the 'source' entry
            of every metadata item.
    """
    saved_docs = pd.read_csv(docs_path)

    out_df = pd.DataFrame(docs_from_csv(saved_docs))

    def _column_values(position):
        # Columns are stringified first because pull_out_data evaluates text
        return pull_out_data(out_df[position].astype(str))

    docs_content = _column_values(0)
    docs_meta = _column_values(1)

    doc_sources = [meta['source'] for meta in docs_meta]

    return out_df, docs_content, docs_meta, doc_sources