Spaces:

vincentmin
/

ArxivNewsLetter

Sleeping

vincentmin committed on Jun 24, 2023

Commit

ee21478

1 Parent(s): b6bb841

Update app.py

Files changed (1) hide show

app.py CHANGED Viewed

@@ -4,16 +4,24 @@ from langchain.document_loaders import ArxivLoader
 from langchain.text_splitter import RecursiveCharacterTextSplitter
 from langchain.vectorstores import Chroma
 from langchain.embeddings import HuggingFaceEmbeddings
-def get_data(user_query: str, load_max_docs: int = 5, chunk_size: int=1000):
-    min_date = (date.today() - timedelta(days=2)).strftime('%Y%m%d')
-    max_date = date.today().strftime('%Y%m%d')
-    query = f"cat:hep-th AND submittedDate:[{min_date} TO {max_date}]"
-    loader = ArxivLoader(query=query, load_max_docs=load_max_docs)
     documents = loader.load()
-    text_splitter = RecursiveCharacterTextSplitter(chunk_size=chunk_size)
-    texts = text_splitter.split_documents(documents)
-    embeddings = HuggingFaceEmbeddings()
     db = Chroma.from_documents(texts, embeddings)
     retriever = db.as_retriever()
     docs = retriever.get_relevant_documents(user_query)

 from langchain.text_splitter import RecursiveCharacterTextSplitter
 from langchain.vectorstores import Chroma
 from langchain.embeddings import HuggingFaceEmbeddings
+# from langchain.document_loaders import Document
+CHUNK_SIZE = 1000
+text_splitter = RecursiveCharacterTextSplitter(chunk_size=chunk_size)
+min_date = (date.today() - timedelta(days=2)).strftime('%Y%m%d')
+max_date = date.today().strftime('%Y%m%d')
+query = f"cat:hep-th AND submittedDate:[{min_date} TO {max_date}]"
+loader = ArxivLoader(query=query, load_max_docs=load_max_docs)
+embeddings = HuggingFaceEmbeddings()
+def get_data(user_query: str, load_max_docs: int = 5):
     documents = loader.load()
+    # texts = text_splitter.split_documents(documents)
+    texts = documents
+    for doc in texts:
+        doc.page_content = doc.metadata["Summary"]
     db = Chroma.from_documents(texts, embeddings)
     retriever = db.as_retriever()
     docs = retriever.get_relevant_documents(user_query)