from langchain.tools import BaseTool, StructuredTool, tool
from langchain_community.retrievers import ArxivRetriever
#from langchain_community.utilities import SerpAPIWrapper
from langchain_community.tools import WikipediaQueryRun
from langchain_community.utilities import WikipediaAPIWrapper
#from langchain.tools import Tool
from langchain_google_community import GoogleSearchAPIWrapper
from langchain_community.embeddings.sentence_transformer import (
SentenceTransformerEmbeddings,
)
from langchain_community.embeddings import GPT4AllEmbeddings
from app.core.config import settings
from langchain_community.vectorstores import Chroma
import arxiv
#import ast
import chromadb
# hacky and should be replaced with a database
# from app.source_container.container import (
# all_sources
# )
from app.utils.utils import (
    parse_list_to_dicts,
    format_wiki_summaries,
    format_arxiv_documents,
    format_search_results,
    create_wikipedia_urls_from_text,
    create_folder_if_not_exists,
)
from app.crud.db_handler import (
    add_many
)
from app.vector_store.chroma_vector_store import (
    add_pdf_to_vector_store
)
import os
# from app.utils import create_wikipedia_urls_from_text
#persist_directory = os.getenv('VECTOR_DATABASE_LOCATION')
persist_directory = settings.VECTOR_DATABASE_LOCATION
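
# The tools below share one persistent Chroma store rooted at this directory:
# memory_search reads the conversation-memory collection, knowledgeBase_search
# reads the "ArxivPapers" collection, and embed_arvix_paper writes new papers
# into "ArxivPapers" via add_pdf_to_vector_store.
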
@tool
def memory_search(query: str) -> str:
    """Search the memory vector store for existing knowledge and relevant previous research. \
    This is your primary source: check what you have already learned in the past before going online."""
    # Since we have more than one collection, this tool should probably be renamed.
client = chromadb.PersistentClient(
path=persist_directory,
)
    #collection_name = os.getenv('CONVERSATION_COLLECTION_NAME')
    collection_name = settings.CONVERSATION_COLLECTION_NAME
embedding_function = SentenceTransformerEmbeddings(
model_name=settings.EMBEDDING_MODEL
#model_name=os.getenv("EMBEDDING_MODEL"),
)
#embedding_function = GPT4AllEmbeddings()
vector_db = Chroma(
client=client, # client for Chroma
collection_name=collection_name,
embedding_function=embedding_function,
)
    retriever = vector_db.as_retriever()
    docs = retriever.invoke(query)
    return str(docs)
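
# Usage note (illustrative): functions decorated with @tool are LangChain tools,
# so they are called through the tool interface rather than directly, e.g.:
#   memory_search.invoke("previous findings on retrieval-augmented generation")
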
@tool
def knowledgeBase_search(query: str) -> str:
    """Search the internal knowledge base for research papers and relevant chunks."""
    # Since we have more than one collection, this tool should probably be renamed.
client = chromadb.PersistentClient(
path=persist_directory,
)
    collection_name = "ArxivPapers"  # TODO: read the collection name from settings instead of hard-coding it
embedding_function = SentenceTransformerEmbeddings(
#model_name=os.getenv("EMBEDDING_MODEL"),
model_name=settings.EMBEDDING_MODEL
)
#embedding_function = GPT4AllEmbeddings()
vector_db = Chroma(
client=client, # client for Chroma
collection_name=collection_name,
embedding_function=embedding_function,
)
    retriever = vector_db.as_retriever()
    docs = retriever.invoke(query)
    return str(docs)
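
# memory_search and knowledgeBase_search differ only in the collection they query;
# a shared helper along these lines (a sketch, not wired in) could remove the
# duplicated setup:
#
#   def _collection_retriever(collection_name: str):
#       client = chromadb.PersistentClient(path=persist_directory)
#       embedding_function = SentenceTransformerEmbeddings(model_name=settings.EMBEDDING_MODEL)
#       vector_db = Chroma(client=client, collection_name=collection_name,
#                          embedding_function=embedding_function)
#       return vector_db.as_retriever()
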
@tool
def arxiv_search(query: str) -> str:
    """Search the arxiv database for scientific research papers and studies. This is your primary online information source;
    always check it first when searching for additional information, before using any other online tool."""
#global all_sources
arxiv_retriever = ArxivRetriever(load_max_docs=3)
    data = arxiv_retriever.invoke(query)
    formatted_sources = format_arxiv_documents(data)
    #all_sources += formatted_sources
    parsed_sources = parse_list_to_dicts(formatted_sources)
    add_many(parsed_sources)
    return str(data)
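
# Each online search tool (arxiv, wikipedia, google) follows the same pattern:
# fetch results, format them, parse them into dicts, and persist them with
# add_many so the sources can be cited later.
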
@tool
def get_arxiv_paper(paper_id: str) -> None:
    """Download a paper from arxiv. Input only the arxiv id, such as "1605.08386v1"
    or "2312.02813". Do NOT input a full URL such as "http://arxiv.org/abs/2312.02813";
    that will break the code. Download one paper at a time, and keep the input free of
    any additional information beyond the id.
    """
    # code from https://lukasschwab.me/arxiv.py/arxiv.html
    paper = next(arxiv.Client().results(arxiv.Search(id_list=[paper_id])))
    number_without_period = paper_id.replace('.', '')
    pdf_directory = "./downloaded_papers"
    create_folder_if_not_exists(pdf_directory)  # make sure the target directory exists first
    # Download the PDF to the directory with a custom filename.
    paper.download_pdf(dirpath=pdf_directory, filename=f"{number_without_period}.pdf")


@tool
def embed_arvix_paper(paper_id: str) -> None:
    """Download a paper from arxiv and embed it into the knowledge base vector store.
    Input only the arxiv id, such as "1605.08386v1" or "2312.02813". Do NOT input a
    full URL such as "http://arxiv.org/abs/2312.02813"; that will break the code.
    Download one paper at a time, and keep the input free of any additional
    information beyond the id.
    """
# code from https://lukasschwab.me/arxiv.py/arxiv.html
paper = next(arxiv.Client().results(arxiv.Search(id_list=[paper_id])))
number_without_period = paper_id.replace('.', '')
pdf_file_name = f"{number_without_period}.pdf"
pdf_directory = "./downloaded_papers"
create_folder_if_not_exists(pdf_directory)
    # Download the PDF to the directory with a custom filename.
    paper.download_pdf(dirpath=pdf_directory, filename=pdf_file_name)
client = chromadb.PersistentClient(
path=persist_directory,
)
    collection_name = "ArxivPapers"  # TODO: read the collection name from settings instead of hard-coding it
full_path = os.path.join(pdf_directory, pdf_file_name)
add_pdf_to_vector_store(
collection_name=collection_name,
pdf_file_location=full_path,
)
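
# add_pdf_to_vector_store is expected to handle parsing, chunking, and embedding
# the downloaded PDF into the "ArxivPapers" collection (see
# app.vector_store.chroma_vector_store).
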
@tool
def wikipedia_search(query: str) -> str:
"""Search Wikipedia for additional information to expand on research papers or when no papers can be found."""
#global all_sources
    api_wrapper = WikipediaAPIWrapper()
    wikipedia_query = WikipediaQueryRun(api_wrapper=api_wrapper)
    wikipedia_results = wikipedia_query.run(query)
formatted_summaries = format_wiki_summaries(wikipedia_results)
#all_sources += formatted_summaries
parsed_summaries = parse_list_to_dicts(formatted_summaries)
add_many(parsed_summaries)
#all_sources += create_wikipedia_urls_from_text(wikipedia_results)
return wikipedia_results


@tool
def google_search(query: str) -> str:
"""Search Google for additional results when you can't answer questions using arxiv search or wikipedia search."""
#global all_sources
    websearch = GoogleSearchAPIWrapper()
    search_results: list = websearch.results(query, 3)  # returns a list of result dicts
    cleaner_sources = format_search_results(search_results)
    parsed_sources = parse_list_to_dicts(cleaner_sources)
    add_many(parsed_sources)
    #all_sources += cleaner_sources
    return str(cleaner_sources)
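
# A minimal sketch of how these tools might be wired into an agent; `llm` and
# `prompt` are assumptions, not defined in this module:
#
#   from langchain.agents import AgentExecutor, create_react_agent
#   tools = [memory_search, knowledgeBase_search, arxiv_search, get_arxiv_paper,
#            embed_arvix_paper, wikipedia_search, google_search]
#   agent = create_react_agent(llm, tools, prompt)
#   executor = AgentExecutor(agent=agent, tools=tools)
#   executor.invoke({"input": "Find recent papers on instruction tuning."})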