vincentmin committed on
Commit
ee21478
·
1 Parent(s): b6bb841

Update app.py

Browse files
Files changed (1) hide show
  1. app.py +16 -8
app.py CHANGED
@@ -4,16 +4,24 @@ from langchain.document_loaders import ArxivLoader
4
  from langchain.text_splitter import RecursiveCharacterTextSplitter
5
  from langchain.vectorstores import Chroma
6
  from langchain.embeddings import HuggingFaceEmbeddings
 
7
 
8
- def get_data(user_query: str, load_max_docs: int = 5, chunk_size: int=1000):
9
- min_date = (date.today() - timedelta(days=2)).strftime('%Y%m%d')
10
- max_date = date.today().strftime('%Y%m%d')
11
- query = f"cat:hep-th AND submittedDate:[{min_date} TO {max_date}]"
12
- loader = ArxivLoader(query=query, load_max_docs=load_max_docs)
 
 
 
 
 
 
13
  documents = loader.load()
14
- text_splitter = RecursiveCharacterTextSplitter(chunk_size=chunk_size)
15
- texts = text_splitter.split_documents(documents)
16
- embeddings = HuggingFaceEmbeddings()
 
17
  db = Chroma.from_documents(texts, embeddings)
18
  retriever = db.as_retriever()
19
  docs = retriever.get_relevant_documents(user_query)
 
4
  from langchain.text_splitter import RecursiveCharacterTextSplitter
5
  from langchain.vectorstores import Chroma
6
  from langchain.embeddings import HuggingFaceEmbeddings
7
+ # from langchain.document_loaders import Document
8
 
9
+ CHUNK_SIZE = 1000
10
+ text_splitter = RecursiveCharacterTextSplitter(chunk_size=chunk_size)
11
+
12
+ min_date = (date.today() - timedelta(days=2)).strftime('%Y%m%d')
13
+ max_date = date.today().strftime('%Y%m%d')
14
+ query = f"cat:hep-th AND submittedDate:[{min_date} TO {max_date}]"
15
+ loader = ArxivLoader(query=query, load_max_docs=load_max_docs)
16
+
17
+ embeddings = HuggingFaceEmbeddings()
18
+
19
+ def get_data(user_query: str, load_max_docs: int = 5):
20
  documents = loader.load()
21
+ # texts = text_splitter.split_documents(documents)
22
+ texts = documents
23
+ for doc in texts:
24
+ doc.page_content = doc.metadata["Summary"]
25
  db = Chroma.from_documents(texts, embeddings)
26
  retriever = db.as_retriever()
27
  docs = retriever.get_relevant_documents(user_query)