from langchain.tools import tool
from langchain_community.retrievers import ArxivRetriever
from langchain_community.tools import WikipediaQueryRun
from langchain_community.utilities import WikipediaAPIWrapper
from langchain_google_community import GoogleSearchAPIWrapper
from langchain_community.embeddings.sentence_transformer import (
    SentenceTransformerEmbeddings,
)
from langchain_community.vectorstores import Chroma
import arxiv
import chromadb
import os

from app.core.config import settings
from app.utils.utils import (
    parse_list_to_dicts,
    format_wiki_summaries,
    format_arxiv_documents,
    format_search_results,
    create_folder_if_not_exists,
)
from app.crud.db_handler import add_many
from app.vector_store.chroma_vector_store import add_pdf_to_vector_store

persist_directory = settings.VECTOR_DATABASE_LOCATION
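
# The vector-store search tools below each construct the same Chroma retriever.
# A small helper along these lines could deduplicate that setup; this is a
# sketch only, not wired into the tools as written, and it assumes the same
# persistent client and embedding model that the tools configure themselves.
def _get_retriever(collection_name: str):
    """Build a retriever over one collection of the persistent Chroma store."""
    client = chromadb.PersistentClient(path=persist_directory)
    embedding_function = SentenceTransformerEmbeddings(
        model_name=settings.EMBEDDING_MODEL,
    )
    vector_db = Chroma(
        client=client,
        collection_name=collection_name,
        embedding_function=embedding_function,
    )
    return vector_db.as_retriever()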

@tool
def memory_search(query: str) -> str:
    """Search the memory vector store for existing knowledge and relevant previous research.
    This is your primary source: check what you have already learned in the past
    before going online."""
    # Since we have more than one collection, this tool should be renamed.
    client = chromadb.PersistentClient(path=persist_directory)

    collection_name = settings.CONVERSATION_COLLECTION_NAME

    embedding_function = SentenceTransformerEmbeddings(
        model_name=settings.EMBEDDING_MODEL,
    )

    vector_db = Chroma(
        client=client,
        collection_name=collection_name,
        embedding_function=embedding_function,
    )

    retriever = vector_db.as_retriever()
    docs = retriever.get_relevant_documents(query)

    return str(docs)
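
# Usage sketch (assumes the Chroma store above is populated; @tool-decorated
# functions become LangChain tools and are called via .invoke, not directly):
#   memory_search.invoke("prior findings on retrieval-augmented generation")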

@tool
def knowledgeBase_search(query: str) -> str:
    """Search the internal knowledge base for research papers and relevant chunks."""
    # Since we have more than one collection, this tool should be renamed.
    client = chromadb.PersistentClient(path=persist_directory)

    collection_name = "ArxivPapers"  # TODO: move into settings / an env var

    embedding_function = SentenceTransformerEmbeddings(
        model_name=settings.EMBEDDING_MODEL,
    )

    vector_db = Chroma(
        client=client,
        collection_name=collection_name,
        embedding_function=embedding_function,
    )

    retriever = vector_db.as_retriever()
    docs = retriever.get_relevant_documents(query)

    return str(docs)

@tool
def arxiv_search(query: str) -> str:
    """Search the arXiv database for scientific research papers and studies. This is your
    primary online information source; always check it first when searching for additional
    information, before using any other online tool."""
    arxiv_retriever = ArxivRetriever(load_max_docs=3)
    data = arxiv_retriever.invoke(query)
    formatted_sources = format_arxiv_documents(data)
    parsed_sources = parse_list_to_dicts(formatted_sources)
    add_many(parsed_sources)

    return str(data)
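
# ArxivRetriever returns LangChain Document objects whose metadata carries the
# paper details (title, authors, publication date). Usage sketch:
#   papers = arxiv_search.invoke("mixture of experts routing")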

@tool
def get_arxiv_paper(paper_id: str) -> None:
    """Download a paper from arXiv. Input only the arXiv id, such as "1605.08386v1"
    or "2312.02813". Do not input a full URL such as "http://arxiv.org/abs/2312.02813";
    that will break the code. Download one paper at a time, and keep the input free of
    any additional information. This tool is named get_arxiv_paper."""
    # Based on https://lukasschwab.me/arxiv.py/arxiv.html
    paper = next(arxiv.Client().results(arxiv.Search(id_list=[paper_id])))

    number_without_period = paper_id.replace('.', '')

    pdf_directory = "./downloaded_papers"
    create_folder_if_not_exists(pdf_directory)

    # Download the PDF to a specified directory with a custom filename.
    paper.download_pdf(dirpath=pdf_directory, filename=f"{number_without_period}.pdf")
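
# Usage sketch; valid inputs are bare arXiv ids, with or without a version suffix:
#   get_arxiv_paper.invoke("1605.08386v1")   # ok
#   get_arxiv_paper.invoke("2312.02813")     # ok
#   get_arxiv_paper.invoke("http://arxiv.org/abs/2312.02813")  # breaks, per the docstring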

@tool
def embed_arvix_paper(paper_id: str) -> None:
    """Download a paper from arXiv and embed it into the knowledge base vector store.
    Input only the arXiv id, such as "1605.08386v1" or "2312.02813". Do not input a full
    URL such as "http://arxiv.org/abs/2312.02813"; that will break the code. Download one
    paper at a time, and keep the input free of any additional information."""
    # Based on https://lukasschwab.me/arxiv.py/arxiv.html
    paper = next(arxiv.Client().results(arxiv.Search(id_list=[paper_id])))

    number_without_period = paper_id.replace('.', '')
    pdf_file_name = f"{number_without_period}.pdf"

    pdf_directory = "./downloaded_papers"
    create_folder_if_not_exists(pdf_directory)

    # Download the PDF to a specified directory with a custom filename.
    paper.download_pdf(dirpath=pdf_directory, filename=pdf_file_name)

    collection_name = "ArxivPapers"  # TODO: move into settings / an env var

    full_path = os.path.join(pdf_directory, pdf_file_name)

    add_pdf_to_vector_store(
        collection_name=collection_name,
        pdf_file_location=full_path,
    )
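
# add_pdf_to_vector_store is an app-internal helper whose implementation is not
# shown here. A minimal sketch of what such a helper typically does (an
# assumption, not the actual code): load the PDF, split it into chunks, and
# embed the chunks into the named Chroma collection, e.g.
#   docs = PyPDFLoader(pdf_file_location).load()
#   chunks = RecursiveCharacterTextSplitter(chunk_size=1000).split_documents(docs)
#   Chroma.from_documents(chunks, embedding_function, collection_name=collection_name,
#                         client=chromadb.PersistentClient(path=persist_directory))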
    
@tool
def wikipedia_search(query: str) -> str:
    """Search Wikipedia for additional information to expand on research papers,
    or when no papers can be found."""
    api_wrapper = WikipediaAPIWrapper()
    query_runner = WikipediaQueryRun(api_wrapper=api_wrapper)
    wikipedia_results = query_runner.run(query)
    formatted_summaries = format_wiki_summaries(wikipedia_results)
    parsed_summaries = parse_list_to_dicts(formatted_summaries)
    add_many(parsed_summaries)
    return wikipedia_results

@tool
def google_search(query: str) -> str:
    """Search Google for additional results when you can't answer questions using
    arxiv_search or wikipedia_search."""
    websearch = GoogleSearchAPIWrapper()
    search_results: list = websearch.results(query, 3)
    cleaner_sources = format_search_results(search_results)
    parsed_sources = parse_list_to_dicts(cleaner_sources)
    add_many(parsed_sources)

    return str(cleaner_sources)
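
# Minimal smoke test, a sketch only: it assumes settings are configured, the
# Chroma store exists, and GOOGLE_API_KEY / GOOGLE_CSE_ID are set for the
# Google tool before it is invoked.
if __name__ == "__main__":
    print(memory_search.invoke("what have we learned so far?"))
    print(arxiv_search.invoke("retrieval augmented generation"))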