File size: 6,138 Bytes
dd3611a
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
# AUTOGENERATED! DO NOT EDIT! File to edit: ../nbs/media_stores.ipynb.

# %% auto 0
__all__ = ['rawtext_to_doc_split', 'files_to_text', 'youtube_to_text', 'save_text', 'get_youtube_transcript',
           'website_to_text_web', 'website_to_text_unstructured', 'get_document_segments', 'create_local_vector_store']

# %% ../nbs/media_stores.ipynb 3
# import libraries here
import os
import itertools

from langchain.embeddings import OpenAIEmbeddings

from langchain.text_splitter import RecursiveCharacterTextSplitter
from langchain.document_loaders.unstructured import UnstructuredFileLoader
from langchain.document_loaders.generic import GenericLoader
from langchain.document_loaders.parsers import OpenAIWhisperParser
from langchain.document_loaders.blob_loaders.youtube_audio import YoutubeAudioLoader
from langchain.document_loaders import WebBaseLoader, UnstructuredURLLoader
from langchain.docstore.document import Document

from langchain.vectorstores import Chroma
from langchain.chains import RetrievalQAWithSourcesChain

# %% ../nbs/media_stores.ipynb 8
def rawtext_to_doc_split(text, chunk_size=1500, chunk_overlap=150):
  """Split raw text (or existing Documents) into overlapping Document segments.

  Parameters
  ----------
  text : str | Document | list[str] | list[Document]
      Input text. A bare string or Document is wrapped in a list.
  chunk_size : int
      Maximum characters per segment.
  chunk_overlap : int
      Characters of overlap between consecutive segments.

  Returns
  -------
  list[Document]
      Flat list of split document segments (empty list for empty input).
  """
  # Normalize to a list so both single items and batches are handled uniformly
  if not isinstance(text, list):
    text = [text]

  # Guard: indexing text[0] below would raise IndexError on an empty list
  if not text:
    return []

  # Create splitter; add_start_index records each chunk's offset in metadata
  text_splitter = RecursiveCharacterTextSplitter(chunk_size=chunk_size,
                                                 chunk_overlap=chunk_overlap,
                                                 add_start_index=True)

  # Split into doc segments; raw strings must first be wrapped as Documents
  if isinstance(text[0], Document):
    doc_segments = text_splitter.split_documents(text)
  else:
    doc_segments = text_splitter.split_documents(text_splitter.create_documents(text))

  # Defensive flatten in case the splitter ever returns nested lists
  if doc_segments and isinstance(doc_segments[0], list):
    doc_segments = list(itertools.chain.from_iterable(doc_segments))

  return doc_segments

# %% ../nbs/media_stores.ipynb 16
## A single File
def _file_to_text(single_file, chunk_size = 1000, chunk_overlap=150):
  """Load one file with UnstructuredFileLoader and split it into Document segments."""

  # Build the splitter first, then let the loader load-and-split in one call
  splitter = RecursiveCharacterTextSplitter(chunk_size=chunk_size,
                                            chunk_overlap=chunk_overlap,
                                            add_start_index=True)
  return UnstructuredFileLoader(single_file).load_and_split(splitter)


## Multiple files
def files_to_text(files_list, chunk_size=1000, chunk_overlap=150):
  """Load and split one or more files into a flat list of Document segments.

  Parameters
  ----------
  files_list : str | list[str]
      Path or list of paths to load. A single path is wrapped in a list.
  chunk_size : int
      Maximum characters per segment.
  chunk_overlap : int
      Characters of overlap between consecutive segments.

  Returns
  -------
  list[Document]
      Flat list of segments from all files (empty list for empty input).
  """
  # Quick type checking
  if not isinstance(files_list, list):
    files_list = [files_list]

  # Guard: all_segments[0] below would raise IndexError on an empty list
  if not files_list:
    return []

  # Load each file separately: UnstructuredFileLoader expects one file at a
  # time to split correctly, so we loop and flatten the per-file results
  all_segments = [_file_to_text(single_file, chunk_size=chunk_size, chunk_overlap=chunk_overlap) for single_file in files_list]
  if isinstance(all_segments[0], list):
    all_segments = list(itertools.chain.from_iterable(all_segments))

  return all_segments

# %% ../nbs/media_stores.ipynb 20
def youtube_to_text(urls, save_dir = "content"):
  """Download YouTube audio to save_dir and transcribe it with OpenAI Whisper.

  urls may be a single URL string or a list of URLs; returns the list of
  transcribed Documents produced by the loader.
  """
  url_list = urls if isinstance(urls, list) else [urls]

  loader = GenericLoader(YoutubeAudioLoader(url_list, save_dir),
                         OpenAIWhisperParser())
  return loader.load()

# %% ../nbs/media_stores.ipynb 24
def save_text(text, text_name = None, save_dir = "/content"):
  """Write text to a new .txt file and return its path.

  Parameters
  ----------
  text : str
      Content to write.
  text_name : str, optional
      Base filename (without extension). Defaults to the first 20
      characters of text.
  save_dir : str, optional
      Directory to write into. Defaults to "/content" (the original
      Colab-specific location) for backward compatibility.

  Returns
  -------
  str
      Path of the file that was written.

  Raises
  ------
  FileExistsError
      If the target file already exists ("x" mode never overwrites).
  """
  if not text_name:
    text_name = text[:20]
  text_path = os.path.join(save_dir, text_name + ".txt")

  # "x" mode preserves the original never-overwrite behavior; explicit
  # encoding avoids platform-dependent default encodings
  with open(text_path, "x", encoding="utf-8") as f:
    f.write(text)
  # Return the location at which the transcript is saved
  return text_path

# %% ../nbs/media_stores.ipynb 25
def get_youtube_transcript(yt_url, save_transcript = False, temp_audio_dir = "sample_data"):
  """Transcribe a YouTube video and optionally save the transcript to a file.

  Audio is downloaded to temp_audio_dir; when save_transcript is True the
  combined transcript text is written via save_text (under /content).
  Returns (youtube_docs, save_path) where save_path is None if not saved.
  """
  docs = youtube_to_text(yt_url, save_dir=temp_audio_dir)

  # Join all transcribed segments into one transcript string
  full_text = " ".join(doc.page_content for doc in docs)

  # Derive a filename from the downloaded audio's basename (no extension)
  source_path = docs[0].metadata["source"]
  base_name = os.path.splitext(os.path.basename(source_path))[0]

  save_path = save_text(full_text, base_name) if save_transcript else None

  return docs, save_path

# %% ../nbs/media_stores.ipynb 27
def website_to_text_web(url, chunk_size = 1500, chunk_overlap=100):
    """Fetch one or more web pages with WebBaseLoader and split into Document segments.

    url may be a single URL string or a list of URLs.
    """
    raw_docs = WebBaseLoader(url).load()

    # Delegate chunking to the shared splitter helper
    return rawtext_to_doc_split(raw_docs,
                                chunk_size=chunk_size,
                                chunk_overlap=chunk_overlap)

# %% ../nbs/media_stores.ipynb 33
def website_to_text_unstructured(web_urls, chunk_size = 1500, chunk_overlap=100):
    """Fetch web pages with UnstructuredURLLoader and split into Document segments.

    web_urls may be a single URL string or a list of URLs.
    """
    # UnstructuredURLLoader requires a list of URLs
    url_list = web_urls if isinstance(web_urls, list) else [web_urls]

    raw_docs = UnstructuredURLLoader(url_list).load()

    # Delegate chunking to the shared splitter helper
    return rawtext_to_doc_split(raw_docs,
                                chunk_size=chunk_size,
                                chunk_overlap=chunk_overlap)

# %% ../nbs/media_stores.ipynb 45
def get_document_segments(context_info, data_type, chunk_size = 1500, chunk_overlap=100):
    """Dispatch context_info to the right loader and return Document segments.

    Parameters
    ----------
    context_info : str | list
        Raw text, URL(s), YouTube URL(s), or file path(s) depending on data_type.
    data_type : str
        One of 'text', 'web_page', 'youtube_video'; anything else is treated
        as file path(s).
    chunk_size, chunk_overlap : int
        Splitter parameters, forwarded to loaders that accept them.

    Returns
    -------
    list[Document]
        The loaded and split document segments.
    """
    chunk_params = {'chunk_size': chunk_size, 'chunk_overlap': chunk_overlap}

    if data_type == 'text':
        return rawtext_to_doc_split(context_info, **chunk_params)
    elif data_type == 'web_page':
        return website_to_text_unstructured(context_info, **chunk_params)
    elif data_type == 'youtube_video':
        # BUG FIX: youtube_to_text accepts no chunk_size/chunk_overlap kwargs;
        # passing them (as the old code did) raised TypeError at call time
        return youtube_to_text(context_info)
    else:
        return files_to_text(context_info, **chunk_params)

# %% ../nbs/media_stores.ipynb 47
def create_local_vector_store(document_segments, **retriever_kwargs):
    """Build a Chroma vector store over the segments using OpenAI embeddings.

    retriever_kwargs are forwarded to db.as_retriever. Returns (db, retriever).
    """
    vector_db = Chroma.from_documents(document_segments, OpenAIEmbeddings())
    return vector_db, vector_db.as_retriever(**retriever_kwargs)