import re
import os
from typing import TypeVar, List
import pandas as pd
# Model packages
import torch.cuda
# Alternative model sources
#from dataclasses import asdict, dataclass
# Langchain functions
from langchain.text_splitter import RecursiveCharacterTextSplitter
from langchain.docstore.document import Document
# For keyword extraction (not currently used)
#import nltk
#nltk.download('wordnet')
from nltk.corpus import stopwords
from nltk.tokenize import RegexpTokenizer
from nltk.stem import WordNetLemmatizer
from keybert import KeyBERT # Needed by keybert_keywords below
# For Name Entity Recognition model
#from span_marker import SpanMarkerModel # Not currently used
import gradio as gr
torch.cuda.empty_cache()
PandasDataFrame = TypeVar('PandasDataFrame', bound=pd.DataFrame) # Type alias for pandas DataFrames in hints
embeddings = None # global variable setup
vectorstore = None # global variable setup
model_type = None # global variable setup
max_memory_length = 0 # Maximum number of previous conversation turns to keep in memory
full_text = "" # Dummy source text so the highlight function can load before a document is chosen
model = [] # Placeholder, populated when a model is loaded
tokenizer = [] # Placeholder, populated when a tokenizer is loaded
## Highlight text constants
hlt_chunk_size = 12
hlt_strat = [" ", ". ", "! ", "? ", ": ", "\n\n", "\n", ", "]
hlt_overlap = 4
## Initialise NER model ##
ner_model = [] # SpanMarkerModel.from_pretrained("tomaarsen/span-marker-mbert-base-multinerd") # Not currently used
# Currently set gpu_layers to 0 even with cuda due to persistent bugs in implementation with cuda
if torch.cuda.is_available():
torch_device = "cuda"
gpu_layers = 0
else:
torch_device = "cpu"
gpu_layers = 0
print("Running on device:", torch_device)
threads = 6 #torch.get_num_threads()
print("CPU threads:", threads)
# Prompt and vectorstore functions
def write_out_metadata_as_string(metadata_in):
    """Return a list with one 'key: value' summary string per metadata dict, skipping 'page_section'."""
    metadata_string = [' '.join(f'{k}: {v}' for k, v in d.items() if k != 'page_section') for d in metadata_in]
    return metadata_string
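# A minimal usage sketch for write_out_metadata_as_string. The metadata dicts
# below are hypothetical examples of what the document loaders attach to each
# passage; real documents supply their own keys.
def _example_metadata_string():
    metadata = [
        {"source": "example.com/report.pdf", "page": 3, "page_section": "2.1"},
        {"source": "example.com/notes.txt", "page": 1},
    ]
    # 'page_section' is skipped, giving e.g. 'source: example.com/report.pdf page: 3'
    return write_out_metadata_as_string(metadata)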
def determine_file_type(file_path):
"""
Determine the file type based on its extension.
Parameters:
file_path (str): Path to the file.
Returns:
str: File extension (e.g., '.pdf', '.docx', '.txt', '.html').
"""
return os.path.splitext(file_path)[1].lower()
def create_doc_df(docs_keep_out):
    """Build a DataFrame of content and metadata from the 'winning' passages."""
    content = []
    meta = []
    meta_url = []
    page_section = []
    score = []
    for item in docs_keep_out:
        content.append(item[0].page_content)
        meta.append(item[0].metadata)
        meta_url.append(item[0].metadata['source'])
        file_extension = determine_file_type(item[0].metadata['source'])
        # Tabular sources (csv/xlsx) carry no page_section in their metadata
        if file_extension not in (".csv", ".xlsx"):
            page_section.append(item[0].metadata['page_section'])
        else:
            page_section.append("")
        score.append(item[1])
    # Create df from 'winning' passages
    doc_df = pd.DataFrame(list(zip(content, meta, page_section, meta_url, score)),
                          columns=['page_content', 'metadata', 'page_section', 'meta_url', 'score'])
    doc_df['full_url'] = "https://" + doc_df['meta_url']
    return doc_df
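# A minimal sketch of calling create_doc_df. The (Document, score) tuples mirror
# what a LangChain vectorstore similarity search returns; the sources and scores
# here are invented for illustration.
def _example_create_doc_df():
    docs_keep_out = [
        (Document(page_content="First matched passage.",
                  metadata={"source": "example.com/guide.pdf", "page_section": "1"}), 0.91),
        (Document(page_content="Second matched passage.",
                  metadata={"source": "example.com/guide.pdf", "page_section": "2"}), 0.87),
    ]
    # Returns columns: page_content, metadata, page_section, meta_url, score, full_url
    return create_doc_df(docs_keep_out)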
def get_expanded_passages(vectorstore, docs, width):
"""
Extracts expanded passages based on given documents and a width for context.
Parameters:
- vectorstore: The primary data source.
- docs: List of documents to be expanded.
- width: Number of documents to expand around a given document for context.
Returns:
- expanded_docs: List of expanded Document objects.
- doc_df: DataFrame representation of expanded_docs.
"""
from collections import defaultdict
def get_docs_from_vstore(vectorstore):
vector = vectorstore.docstore._dict
return list(vector.items())
def extract_details(docs_list):
docs_list_out = [tup[1] for tup in docs_list]
content = [doc.page_content for doc in docs_list_out]
meta = [doc.metadata for doc in docs_list_out]
return ''.join(content), meta[0], meta[-1]
    def get_parent_content_and_meta(vstore_docs, width, target):
        #target_range = range(max(0, target - width), min(len(vstore_docs), target + width + 1))
        target_range = range(max(0, target), min(len(vstore_docs), target + width + 1)) # Now only selects extra passages AFTER the found passage
        parent_vstore_out = [vstore_docs[i] for i in target_range]
        if not parent_vstore_out:
            return [], [], []
        # Extract the window once; the original loop recomputed the same result for every element
        content_str, meta_first, meta_last = extract_details(parent_vstore_out)
        return [content_str], [meta_first], [meta_last]
def merge_dicts_except_source(d1, d2):
merged = {}
for key in d1:
if key != "source":
merged[key] = str(d1[key]) + " to " + str(d2[key])
else:
merged[key] = d1[key] # or d2[key], based on preference
return merged
def merge_two_lists_of_dicts(list1, list2):
return [merge_dicts_except_source(d1, d2) for d1, d2 in zip(list1, list2)]
# Step 1: Filter vstore_docs
vstore_docs = get_docs_from_vstore(vectorstore)
doc_sources = {doc.metadata['source'] for doc, _ in docs}
vstore_docs = [(k, v) for k, v in vstore_docs if v.metadata.get('source') in doc_sources]
# Step 2: Group by source and proceed
vstore_by_source = defaultdict(list)
for k, v in vstore_docs:
vstore_by_source[v.metadata['source']].append((k, v))
expanded_docs = []
for doc, score in docs:
search_source = doc.metadata['source']
#if file_type == ".csv" | file_type == ".xlsx":
# content_str, meta_first, meta_last = get_parent_content_and_meta(vstore_by_source[search_source], 0, search_index)
#else:
search_section = doc.metadata['page_section']
parent_vstore_meta_section = [doc.metadata['page_section'] for _, doc in vstore_by_source[search_source]]
search_index = parent_vstore_meta_section.index(search_section) if search_section in parent_vstore_meta_section else -1
content_str, meta_first, meta_last = get_parent_content_and_meta(vstore_by_source[search_source], width, search_index)
meta_full = merge_two_lists_of_dicts(meta_first, meta_last)
expanded_doc = (Document(page_content=content_str[0], metadata=meta_full[0]), score)
expanded_docs.append(expanded_doc)
    doc_df = create_doc_df(expanded_docs)
return expanded_docs, doc_df
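# A hedged sketch of how get_expanded_passages is typically called. 'vectorstore'
# stands for a FAISS-style LangChain store already built over the source document
# (the function reads its docstore._dict), and the query string is invented.
def _example_expand_passages(vectorstore, query="What is the main conclusion?"):
    docs = vectorstore.similarity_search_with_score(query, k=3)
    # width=1 pulls in one extra passage after each hit for added context
    expanded_docs, doc_df = get_expanded_passages(vectorstore, docs, width=1)
    return expanded_docs, doc_df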
def highlight_found_text(search_text: str, full_text: str, hlt_chunk_size: int = hlt_chunk_size, hlt_strat: List = hlt_strat, hlt_overlap: int = hlt_overlap) -> str:
    """
    Highlights occurrences of search_text within full_text by wrapping the
    matched spans in <mark> tags.

    Parameters:
    - search_text (str): The text to be searched for within full_text.
    - full_text (str): The text within which search_text occurrences will be highlighted.

    Returns:
    - str: A copy of full_text with matched spans wrapped in <mark>...</mark>.
      Merged matches of 15 characters or fewer are left unhighlighted to avoid
      marking incidental single words.
    """
    def extract_text_from_input(text, i=0):
        if isinstance(text, str):
            return text.replace("  ", " ").strip() # Collapse double spaces
        elif isinstance(text, list):
            return text[i][0].replace("  ", " ").strip()
        else:
            return ""

    def extract_search_text_from_input(text):
        if isinstance(text, str):
            return text.replace("  ", " ").strip()
        elif isinstance(text, list):
            return text[-1][1].replace("  ", " ").strip()
        else:
            return ""
full_text = extract_text_from_input(full_text)
search_text = extract_search_text_from_input(search_text)
text_splitter = RecursiveCharacterTextSplitter(
chunk_size=hlt_chunk_size,
separators=hlt_strat,
chunk_overlap=hlt_overlap,
)
sections = text_splitter.split_text(search_text)
found_positions = {}
for x in sections:
text_start_pos = 0
while text_start_pos != -1:
text_start_pos = full_text.find(x, text_start_pos)
if text_start_pos != -1:
found_positions[text_start_pos] = text_start_pos + len(x)
text_start_pos += 1
# Combine overlapping or adjacent positions
sorted_starts = sorted(found_positions.keys())
combined_positions = []
if sorted_starts:
current_start, current_end = sorted_starts[0], found_positions[sorted_starts[0]]
for start in sorted_starts[1:]:
if start <= (current_end + 10):
current_end = max(current_end, found_positions[start])
else:
combined_positions.append((current_start, current_end))
current_start, current_end = start, found_positions[start]
combined_positions.append((current_start, current_end))
# Construct pos_tokens
pos_tokens = []
prev_end = 0
for start, end in combined_positions:
        if end - start > 15: # Only highlight if a significant amount of text matched, to avoid marking single words like 'and'
            pos_tokens.append(full_text[prev_end:start])
            pos_tokens.append('<mark>' + full_text[start:end] + '</mark>') # Wrap the match in HTML <mark> tags
prev_end = end
pos_tokens.append(full_text[prev_end:])
return "".join(pos_tokens)
# Chat history functions
def clear_chat(chat_history_state, sources, chat_message, current_topic):
    """Reset the chat history, sources, message box, and current topic."""
    return [], '', '', ''
# Keyword functions
def remove_q_stopwords(question): # Remove stopwords from question. Not used at the moment
    # Prepare keywords from question by removing stopwords
    text = question.lower()
    # Remove numbers
    text = re.sub('[0-9]', '', text)
    tokenizer = RegexpTokenizer(r'\w+')
    text_tokens = tokenizer.tokenize(text)
    #text_tokens = word_tokenize(text)
    # stopwords is an nltk corpus reader; call .words() to get the actual word list
    stop_words = set(stopwords.words('english'))
    tokens_without_sw = [word for word in text_tokens if word not in stop_words]
# Remove duplicate words while preserving order
ordered_tokens = set()
result = []
for word in tokens_without_sw:
if word not in ordered_tokens:
ordered_tokens.add(word)
result.append(word)
new_question_keywords = ' '.join(result)
return new_question_keywords
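# A quick sketch of remove_q_stopwords. It needs the nltk 'stopwords' corpus
# (nltk.download('stopwords')); the question is invented.
def _example_remove_stopwords():
    # Stopwords and digits are stripped, leaving 'key findings report'
    return remove_q_stopwords("What are the key findings of the 2023 report?")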
def remove_q_ner_extractor(question):
    # Requires ner_model to be a loaded SpanMarker model (currently disabled above)
    predict_out = ner_model.predict(question)
    predict_tokens = [d['span'] for d in predict_out]
# Remove duplicate words while preserving order
ordered_tokens = set()
result = []
for word in predict_tokens:
if word not in ordered_tokens:
ordered_tokens.add(word)
result.append(word)
new_question_keywords = ' '.join(result).lower()
return new_question_keywords
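# A hedged sketch of enabling the NER-based extractor above. SpanMarker is
# commented out at the top of this file; the model name is taken from that
# commented-out line, and this path only works once span_marker is installed.
def _example_ner_keywords(question="Who founded the European Space Agency in Paris?"):
    global ner_model
    from span_marker import SpanMarkerModel
    ner_model = SpanMarkerModel.from_pretrained("tomaarsen/span-marker-mbert-base-multinerd")
    return remove_q_ner_extractor(question)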
def apply_lemmatize(text, wnl=WordNetLemmatizer()):
    def prep_for_lemma(text):
        # Remove numbers
        text = re.sub('[0-9]', '', text)
        tokenizer = RegexpTokenizer(r'\w+')
        text_tokens = tokenizer.tokenize(text)
        #text_tokens = word_tokenize(text)
        return text_tokens

    tokens = prep_for_lemma(text)

    def lem_word(word):
        # Leave very short words alone; lemmatising them rarely helps
        if len(word) > 3:
            return wnl.lemmatize(word)
        return word

    return [lem_word(token) for token in tokens]
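# A small usage sketch for apply_lemmatize. It needs the nltk 'wordnet' corpus
# (nltk.download('wordnet')); the sentence is invented.
def _example_lemmatize():
    # 'studies' becomes 'study'; words of three letters or fewer pass through unchanged
    return apply_lemmatize("new studies of words and meanings")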
def keybert_keywords(text, n, kw_model):
    """Extract the top n single-word keywords from text with KeyBERT, after lemmatising."""
    tokens_lemma = apply_lemmatize(text)
    lemmatised_text = ' '.join(tokens_lemma)
    keywords_text = KeyBERT(model=kw_model).extract_keywords(lemmatised_text, stop_words='english', top_n=n,
                                                             keyphrase_ngram_range=(1, 1))
    keywords_list = [item[0] for item in keywords_text]
    return keywords_list
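# A hedged sketch of calling keybert_keywords. KeyBERT accepts a
# sentence-transformers model name or object; 'all-MiniLM-L6-v2' below is a
# common lightweight choice, not one mandated by this app.
def _example_keybert():
    text = "Renewable energy adoption is accelerating as solar panel costs fall."
    return keybert_keywords(text, n=3, kw_model="all-MiniLM-L6-v2")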
# Gradio functions
def turn_off_interactivity(user_message, history):
return gr.update(value="", interactive=False), history + [[user_message, None]]
def restore_interactivity():
return gr.update(interactive=True)
def update_message(dropdown_value):
    # gr.update works across Gradio versions; gr.Textbox.update was removed in Gradio 4
    return gr.update(value=dropdown_value)

def hide_block():
    return gr.update(visible=False)
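# A hedged sketch of wiring the Gradio helpers above into a Blocks app. The
# component layout and the 'respond' callback are assumptions for illustration;
# the real app defines its own interface elsewhere.
def _example_gradio_wiring(respond):
    with gr.Blocks() as demo:
        chatbot = gr.Chatbot()
        message = gr.Textbox()
        sources = gr.HTML()
        topic = gr.Textbox()
        clear = gr.Button("Clear")
        # Disable the input while a response is generated, then re-enable it
        message.submit(turn_off_interactivity, [message, chatbot], [message, chatbot]).then(
            respond, chatbot, chatbot).then(
            restore_interactivity, None, message)
        # clear_chat resets history, sources, the message box, and the topic
        clear.click(clear_chat, [chatbot, sources, message, topic], [chatbot, sources, message, topic])
    return demo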