isayahc committed
Commit 2fd3f0a · 1 Parent(s): e1f572c

merging code

innovation_pathfinder_ai/structured_tools/structured_tools.py CHANGED
@@ -5,8 +5,14 @@ from langchain_community.tools import WikipediaQueryRun
 from langchain_community.utilities import WikipediaAPIWrapper
 #from langchain.tools import Tool
 from langchain_community.utilities import GoogleSearchAPIWrapper
+from langchain_community.embeddings.sentence_transformer import (
+    SentenceTransformerEmbeddings,
+)
 import arxiv
 import ast
+
+import chromadb
+
 # hacky and should be replaced with a database
 from innovation_pathfinder_ai.source_container.container import (
     all_sources
@@ -18,6 +24,12 @@ from innovation_pathfinder_ai.database.db_handler import (
     add_many
 )
 
+from innovation_pathfinder_ai.vector_store.chroma_vector_store import (
+    add_pdf_to_vector_store
+)
+
+from innovation_pathfinder_ai.utils import create_wikipedia_urls_from_text
+
 @tool
 def arxiv_search(query: str) -> str:
     """Search arxiv database for scientific research papers and studies. This is your primary information source.
@@ -72,9 +84,41 @@ def wikipedia_search(query: str) -> str:
     api_wrapper = WikipediaAPIWrapper()
     wikipedia_search = WikipediaQueryRun(api_wrapper=api_wrapper)
     wikipedia_results = wikipedia_search.run(query)
-    formatted_summaries = format_wiki_summaries(wikipedia_results)
-    all_sources += formatted_summaries
-    parsed_summaries = parse_list_to_dicts(formatted_summaries)
-    add_many(parsed_summaries)
-
-    return wikipedia_results.__str__()
+    all_sources += create_wikipedia_urls_from_text(wikipedia_results)
+    return wikipedia_results
+
+
+@tool
+def embed_arvix_paper(paper_id: str) -> None:
+    """Download a paper from arxiv and embed it. To download a paper, please input
+    the arxiv id, such as "1605.08386v1". This tool is named embed_arvix_paper.
+    If you input "http://arxiv.org/abs/2312.02813", this will break the code; only pass
+    "2312.02813". In addition, please download one paper at a time. Please keep the inputs/outputs
+    free of additional information; only provide the id.
+    """
+    # code from https://lukasschwab.me/arxiv.py/arxiv.html
+    paper = next(arxiv.Client().results(arxiv.Search(id_list=[paper_id])))
+
+    number_without_period = paper_id.replace('.', '')
+
+    pdf_file_name = f"{number_without_period}.pdf"
+
+    # Download the PDF to a specified directory with a custom filename.
+    paper.download_pdf(dirpath="./downloaded_papers", filename=f"{number_without_period}.pdf")
+
+    client = chromadb.PersistentClient(
+        # path=persist_directory,
+    )
+
+    collection_name = "ArxivPapers"
+    # store using envar
+
+    embedding_function = SentenceTransformerEmbeddings(
+        model_name="all-MiniLM-L6-v2",
+    )
+
+    add_pdf_to_vector_store(
+        collection_name=collection_name,
+        pdf_file_location=pdf_file_name,
+    )
+
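For reference, a minimal usage sketch (not part of this commit) of the new embed_arvix_paper tool, assuming the package is importable from the repo root and that ./downloaded_papers already exists, since paper.download_pdf() does not create the directory itself; the exact invocation style depends on the installed LangChain version:

import os

from innovation_pathfinder_ai.structured_tools.structured_tools import embed_arvix_paper

# The tool writes the PDF into ./downloaded_papers, so make sure it exists first.
os.makedirs("./downloaded_papers", exist_ok=True)

# @tool-decorated functions are LangChain tools; .run() takes the single string
# argument, i.e. the bare arxiv id the docstring asks for (no URL, no extra text).
embed_arvix_paper.run("2312.02813")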