Spaces:
Sleeping
Sleeping
File size: 6,138 Bytes
dd3611a |
1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120 121 122 123 124 125 126 127 128 129 130 131 132 133 134 135 136 137 138 139 140 141 142 143 144 145 146 147 148 149 150 151 152 153 154 155 156 157 158 159 160 161 162 163 164 165 166 167 168 169 170 171 172 173 174 |
# AUTOGENERATED! DO NOT EDIT! File to edit: ../nbs/media_stores.ipynb.
# %% auto 0
# Names exported by a star-import of this module. The helper `_file_to_text`
# defined further down is not listed here.
__all__ = ['rawtext_to_doc_split', 'files_to_text', 'youtube_to_text', 'save_text', 'get_youtube_transcript',
'website_to_text_web', 'website_to_text_unstructured', 'get_document_segments', 'create_local_vector_store']
# %% ../nbs/media_stores.ipynb 3
# import libraries here
import os
import itertools
from langchain.embeddings import OpenAIEmbeddings
from langchain.text_splitter import RecursiveCharacterTextSplitter
from langchain.document_loaders.unstructured import UnstructuredFileLoader
from langchain.document_loaders.generic import GenericLoader
from langchain.document_loaders.parsers import OpenAIWhisperParser
from langchain.document_loaders.blob_loaders.youtube_audio import YoutubeAudioLoader
from langchain.document_loaders import WebBaseLoader, UnstructuredURLLoader
from langchain.docstore.document import Document
from langchain.vectorstores import Chroma
from langchain.chains import RetrievalQAWithSourcesChain
# %% ../nbs/media_stores.ipynb 8
def rawtext_to_doc_split(text, chunk_size=1500, chunk_overlap=150):
    """Split raw text or documents into a flat list of document segments.

    Args:
        text: A string, a ``Document``, or a list of either. A non-list value
            is wrapped in a single-element list. The type of the first
            element decides whether the whole list is handled as documents
            or as raw strings.
        chunk_size: Forwarded to ``RecursiveCharacterTextSplitter``.
        chunk_overlap: Forwarded to ``RecursiveCharacterTextSplitter``.

    Returns:
        A flat list of document segments, or an empty list when there is
        nothing to split.
    """
    # Quick type checking
    if not isinstance(text, list):
        text = [text]

    # Nothing to split: return early rather than failing on text[0] below.
    if not text:
        return []

    # Create splitter
    text_splitter = RecursiveCharacterTextSplitter(
        chunk_size=chunk_size,
        chunk_overlap=chunk_overlap,
        add_start_index=True,
    )

    # Split into doc segments
    if isinstance(text[0], Document):
        doc_segments = text_splitter.split_documents(text)
    else:
        doc_segments = text_splitter.split_documents(text_splitter.create_documents(text))

    # Make into one big list. The emptiness check avoids an IndexError when
    # the splitter produced no segments at all.
    if doc_segments and isinstance(doc_segments[0], list):
        doc_segments = list(itertools.chain.from_iterable(doc_segments))
    return doc_segments
# %% ../nbs/media_stores.ipynb 16
## A single File
def _file_to_text(single_file, chunk_size=1000, chunk_overlap=150):
    """Load one file with ``UnstructuredFileLoader`` and return its split segments.

    ``chunk_size`` and ``chunk_overlap`` are handed to the
    ``RecursiveCharacterTextSplitter`` used for the split.
    """
    # Build the loader, then the splitter it will be asked to apply
    file_loader = UnstructuredFileLoader(single_file)
    segment_splitter = RecursiveCharacterTextSplitter(
        chunk_size=chunk_size,
        chunk_overlap=chunk_overlap,
        add_start_index=True,
    )
    return file_loader.load_and_split(segment_splitter)
## Multiple files
def files_to_text(files_list, chunk_size=1000, chunk_overlap=150):
    """Load one or more files and return all their segments as one flat list.

    Args:
        files_list: A single file or a list of files. A non-list value is
            wrapped in a single-element list.
        chunk_size: Forwarded to ``_file_to_text`` for every file.
        chunk_overlap: Forwarded to ``_file_to_text`` for every file.

    Returns:
        A flat list with the segments of every file, in input order. An
        empty ``files_list`` yields an empty list.
    """
    # Quick type checking
    if not isinstance(files_list, list):
        files_list = [files_list]

    # Files are loaded one at a time: UnstructuredFileLoader expects a list
    # of files yet can't split them correctly.
    per_file_segments = (
        _file_to_text(single_file, chunk_size=chunk_size, chunk_overlap=chunk_overlap)
        for single_file in files_list
    )
    # chain.from_iterable copes with zero files, where indexing the first
    # per-file result would raise IndexError.
    return list(itertools.chain.from_iterable(per_file_segments))
# %% ../nbs/media_stores.ipynb 20
def youtube_to_text(urls, save_dir="content"):
    """Transcribe the given YouTube video(s) to text documents.

    urls: a single URL or a list of URLs
    save_dir: directory to save audio files
    """
    url_list = urls if isinstance(urls, list) else [urls]
    audio_loader = YoutubeAudioLoader(url_list, save_dir)
    whisper_parser = OpenAIWhisperParser()
    return GenericLoader(audio_loader, whisper_parser).load()
# %% ../nbs/media_stores.ipynb 24
def save_text(text, text_name=None, save_dir="/content"):
    """Write ``text`` to a new ``.txt`` file and return the file's path.

    Args:
        text: The string to write.
        text_name: File name without the ``.txt`` extension. When falsy, a
            name is derived from the first 20 characters of ``text``.
        save_dir: Directory to write into; it is created when missing.
            Defaults to ``/content``, the previously hard-coded location.

    Returns:
        The location at which the text was saved.

    Raises:
        FileExistsError: If the target file already exists. The file is
            opened in exclusive-creation mode, so nothing is overwritten.
    """
    if not text_name:
        # Path separators and non-printable characters (e.g. newlines) taken
        # from the text would produce an invalid or misplaced path.
        text_name = "".join(
            "_" if ch in "/\\" or not ch.isprintable() else ch
            for ch in text[:20]
        ) or "untitled"

    os.makedirs(save_dir, exist_ok=True)
    text_path = os.path.join(save_dir, text_name + ".txt")

    # Explicit encoding keeps the output independent of the platform default.
    with open(text_path, "x", encoding="utf-8") as f:
        f.write(text)

    # Return the location at which the transcript is saved
    return text_path
# %% ../nbs/media_stores.ipynb 25
def get_youtube_transcript(yt_url, save_transcript=False, temp_audio_dir="sample_data"):
    """Transcribe YouTube video(s) and optionally save the combined transcript.

    Args:
        yt_url: Video URL (or list of URLs) passed to ``youtube_to_text``.
        save_transcript: When true, the page contents of all returned
            documents are joined with spaces and written via ``save_text``.
        temp_audio_dir: Passed to ``youtube_to_text`` as ``save_dir``.

    Returns:
        A ``(youtube_docs, save_path)`` tuple. ``save_path`` is ``None``
        when no transcript file was written.
    """
    youtube_docs = youtube_to_text(yt_url, save_dir=temp_audio_dir)

    save_path = None
    # Only inspect the documents when a file is actually requested and there
    # is something to write. This avoids an IndexError on an empty result and
    # skips building the combined text when it would be discarded.
    if save_transcript and youtube_docs:
        combined_text = " ".join(doc.page_content for doc in youtube_docs)

        # Name the file after the first document's source, without its
        # directory or extension.
        video_path = youtube_docs[0].metadata["source"]
        youtube_name = os.path.splitext(os.path.basename(video_path))[0]
        save_path = save_text(combined_text, youtube_name)

    return youtube_docs, save_path
# %% ../nbs/media_stores.ipynb 27
def website_to_text_web(url, chunk_size=1500, chunk_overlap=100):
    """Load web page(s) with ``WebBaseLoader`` and split them into segments.

    ``url`` can be a single string or a list. The loaded pages are chunked
    by ``rawtext_to_doc_split`` using ``chunk_size`` and ``chunk_overlap``.
    """
    raw_pages = WebBaseLoader(url).load()
    return rawtext_to_doc_split(raw_pages, chunk_size=chunk_size, chunk_overlap=chunk_overlap)
# %% ../nbs/media_stores.ipynb 33
def website_to_text_unstructured(web_urls, chunk_size=1500, chunk_overlap=100):
    """Load web page(s) with ``UnstructuredURLLoader`` and split them into segments.

    ``web_urls`` can be a single string or a list; a single value is wrapped
    in a list before loading. The loaded pages are chunked by
    ``rawtext_to_doc_split`` using ``chunk_size`` and ``chunk_overlap``.
    """
    url_list = web_urls if isinstance(web_urls, list) else [web_urls]
    raw_pages = UnstructuredURLLoader(url_list).load()
    return rawtext_to_doc_split(raw_pages, chunk_size=chunk_size, chunk_overlap=chunk_overlap)
# %% ../nbs/media_stores.ipynb 45
def get_document_segments(context_info, data_type, chunk_size=1500, chunk_overlap=100):
    """Load ``context_info`` according to ``data_type`` and return its segments.

    Args:
        context_info: The content to load; what it holds depends on
            ``data_type`` (raw text, URL(s), or file(s)).
        data_type: ``'text'``, ``'web_page'`` or ``'youtube_video'``. Any
            other value is handled as file input via ``files_to_text``.
        chunk_size: Chunking parameter forwarded to the loading function.
        chunk_overlap: Chunking parameter forwarded to the loading function.

    Returns:
        The document segments produced by the selected loading function.
    """
    addtnl_params = {'chunk_size': chunk_size, 'chunk_overlap': chunk_overlap}

    if data_type == 'youtube_video':
        # youtube_to_text accepts only (urls, save_dir); forwarding the
        # chunking parameters to it raises TypeError. Transcribe first, then
        # chunk the resulting documents like every other source.
        youtube_docs = youtube_to_text(context_info)
        if not youtube_docs:
            return []
        return rawtext_to_doc_split(youtube_docs, **addtnl_params)

    # Define function used to do the loading
    if data_type == 'text':
        load_fcn = rawtext_to_doc_split
    elif data_type == 'web_page':
        load_fcn = website_to_text_unstructured
    else:
        load_fcn = files_to_text

    # Get the document segments
    return load_fcn(context_info, **addtnl_params)
# %% ../nbs/media_stores.ipynb 47
def create_local_vector_store(document_segments, **retriever_kwargs):
    """Build a Chroma store from the segments and return it with a retriever.

    The segments are embedded with ``OpenAIEmbeddings``. ``retriever_kwargs``
    are forwarded to the store's ``as_retriever``. Returns ``(db, retriever)``.
    """
    vector_db = Chroma.from_documents(document_segments, OpenAIEmbeddings())
    return vector_db, vector_db.as_retriever(**retriever_kwargs)
|