__all__ = ['rawtext_to_doc_split', 'files_to_text', 'youtube_to_text', 'save_text', 'get_youtube_transcript',
           'website_to_text_web', 'website_to_text_unstructured', 'get_document_segments', 'create_local_vector_store']

import os
import itertools

from langchain.embeddings import OpenAIEmbeddings
from langchain.text_splitter import RecursiveCharacterTextSplitter
from langchain.document_loaders.unstructured import UnstructuredFileLoader
from langchain.document_loaders.generic import GenericLoader
from langchain.document_loaders.parsers import OpenAIWhisperParser
from langchain.document_loaders.blob_loaders.youtube_audio import YoutubeAudioLoader
from langchain.document_loaders import WebBaseLoader, UnstructuredURLLoader
from langchain.docstore.document import Document

from langchain.vectorstores import Chroma
from langchain.chains import RetrievalQAWithSourcesChain
|
|
def rawtext_to_doc_split(text, chunk_size=1500, chunk_overlap=150):
    """Split raw text (or existing Documents) into overlapping Document chunks."""

    # Accept a single string/Document or a list of them
    if not isinstance(text, list):
        text = [text]

    text_splitter = RecursiveCharacterTextSplitter(chunk_size=chunk_size,
                                                   chunk_overlap=chunk_overlap,
                                                   add_start_index=True)

    # Documents can be split directly; raw strings are first converted to Documents
    if isinstance(text[0], Document):
        doc_segments = text_splitter.split_documents(text)
    else:
        doc_segments = text_splitter.split_documents(text_splitter.create_documents(text))

    # Flatten in case the splitter returned a nested list
    doc_segments = list(itertools.chain(*doc_segments)) if isinstance(doc_segments[0], list) else doc_segments

    return doc_segments
|
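# Usage sketch (illustrative only; the sample string below is hypothetical):
#
#   segments = rawtext_to_doc_split("A long lecture transcript ... " * 100, chunk_size=500)
#   segments[0].page_content        # first ~500-character chunk
#   segments[0].metadata            # includes 'start_index' because add_start_index=True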
|
def _file_to_text(single_file, chunk_size=1000, chunk_overlap=150):
    """Load a single file with Unstructured and split it into Document chunks."""

    loader = UnstructuredFileLoader(single_file)
    doc_segments = loader.load_and_split(RecursiveCharacterTextSplitter(chunk_size=chunk_size,
                                                                        chunk_overlap=chunk_overlap,
                                                                        add_start_index=True))
    return doc_segments
|
|
def files_to_text(files_list, chunk_size=1000, chunk_overlap=150):
    """Load one or more files and return a flat list of Document chunks."""

    if not isinstance(files_list, list):
        files_list = [files_list]

    all_segments = [_file_to_text(single_file, chunk_size=chunk_size, chunk_overlap=chunk_overlap)
                    for single_file in files_list]

    # _file_to_text returns a list per file, so flatten the nested lists
    all_segments = list(itertools.chain(*all_segments)) if isinstance(all_segments[0], list) else all_segments

    return all_segments
|
|
def youtube_to_text(urls, save_dir="content"):
    """Download the audio of one or more YouTube videos and transcribe it with OpenAI Whisper."""

    if not isinstance(urls, list):
        urls = [urls]

    youtube_loader = GenericLoader(YoutubeAudioLoader(urls, save_dir), OpenAIWhisperParser())
    youtube_docs = youtube_loader.load()

    return youtube_docs
|
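# Usage sketch (hypothetical URL; downloading audio typically requires yt-dlp and
# ffmpeg, and OpenAIWhisperParser needs an OpenAI API key):
#
#   docs = youtube_to_text("https://www.youtube.com/watch?v=EXAMPLE_ID", save_dir="content")
#   docs[0].page_content            # transcribed text of the first audio chunk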
|
def save_text(text, text_name=None):
    """Write text to /content/<text_name>.txt and return the file path."""

    if not text_name:
        # Fall back to the first 20 characters of the text as the file name
        text_name = text[:20]
    text_path = os.path.join("/content", text_name + ".txt")

    # Mode "x" raises FileExistsError instead of overwriting an existing file
    with open(text_path, "x") as f:
        f.write(text)

    return text_path
|
|
def get_youtube_transcript(yt_url, save_transcript=False, temp_audio_dir="sample_data"):
    """Transcribe a YouTube video and optionally save the combined transcript to disk."""

    youtube_docs = youtube_to_text(yt_url, save_dir=temp_audio_dir)

    # Join the per-chunk transcriptions into a single transcript string
    combined_docs = [doc.page_content for doc in youtube_docs]
    combined_text = " ".join(combined_docs)

    # Name the transcript after the downloaded audio file (without its extension)
    video_path = youtube_docs[0].metadata["source"]
    youtube_name = os.path.splitext(os.path.basename(video_path))[0]

    save_path = None
    if save_transcript:
        save_path = save_text(combined_text, youtube_name)

    return youtube_docs, save_path
|
|
def website_to_text_web(url, chunk_size=1500, chunk_overlap=100):
    """Scrape a web page with WebBaseLoader and split it into Document chunks."""

    website_loader = WebBaseLoader(url)
    website_raw = website_loader.load()

    website_data = rawtext_to_doc_split(website_raw, chunk_size=chunk_size, chunk_overlap=chunk_overlap)

    return website_data
|
|
def website_to_text_unstructured(web_urls, chunk_size=1500, chunk_overlap=100):
    """Scrape one or more URLs with UnstructuredURLLoader and split them into Document chunks."""

    if not isinstance(web_urls, list):
        web_urls = [web_urls]

    website_loader = UnstructuredURLLoader(web_urls)
    website_raw = website_loader.load()

    website_data = rawtext_to_doc_split(website_raw, chunk_size=chunk_size, chunk_overlap=chunk_overlap)

    return website_data
|
|
def get_document_segments(context_info, data_type, chunk_size=1500, chunk_overlap=100):
    """Dispatch to the loader that matches data_type ('text', 'web_page', 'youtube_video', or a file list)."""

    addtnl_params = {'chunk_size': chunk_size, 'chunk_overlap': chunk_overlap}

    if data_type == 'text':
        load_fcn = rawtext_to_doc_split
    elif data_type == 'web_page':
        load_fcn = website_to_text_unstructured
    elif data_type == 'youtube_video':
        load_fcn = youtube_to_text
        # youtube_to_text does not accept chunking parameters, so don't pass them
        addtnl_params = {}
    else:
        load_fcn = files_to_text

    doc_segments = load_fcn(context_info, **addtnl_params)

    return doc_segments
|
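# Usage sketch (the file names and URL below are hypothetical; any data_type other
# than 'text', 'web_page', or 'youtube_video' falls through to files_to_text):
#
#   get_document_segments("Some pasted course notes ...", data_type='text')
#   get_document_segments("https://example.com/syllabus", data_type='web_page')
#   get_document_segments(["lecture1.pdf", "lecture2.pdf"], data_type='files')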
|
def create_local_vector_store(document_segments, **retriever_kwargs):
    """Embed the document segments into a local Chroma vector store and return (db, retriever)."""

    embeddings = OpenAIEmbeddings()
    db = Chroma.from_documents(document_segments, embeddings)
    retriever = db.as_retriever(**retriever_kwargs)

    return db, retriever
|
|
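# Minimal end-to-end sketch, not part of the library API: it assumes OPENAI_API_KEY
# is set and that the hypothetical file "lecture_notes.pdf" exists locally.
if __name__ == "__main__":
    from langchain.chat_models import ChatOpenAI

    # Chunk a local file, embed the chunks, and build a retriever over them
    segments = get_document_segments("lecture_notes.pdf", data_type='files')
    db, retriever = create_local_vector_store(segments, search_kwargs={"k": 4})

    # Wire the retriever into a question-answering chain that cites its sources
    qa_chain = RetrievalQAWithSourcesChain.from_chain_type(
        ChatOpenAI(temperature=0), chain_type="stuff", retriever=retriever)
    result = qa_chain({"question": "What are the main topics covered in the notes?"})
    print(result["answer"])
    print(result["sources"])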