llm-wizard commited on
Commit
0c62088
·
0 Parent(s):

Duplicate from c-s-ale/ArxivChainLitDemo

Browse files
Files changed (8) hide show
  1. .env.example +1 -0
  2. .gitattributes +34 -0
  3. .gitignore +4 -0
  4. Dockerfile +11 -0
  5. README.md +10 -0
  6. app.py +103 -0
  7. chainlit.md +11 -0
  8. requirements.txt +7 -0
.env.example ADDED
@@ -0,0 +1 @@
 
 
1
+ OPENAI_API_KEY=
.gitattributes ADDED
@@ -0,0 +1,34 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ *.7z filter=lfs diff=lfs merge=lfs -text
2
+ *.arrow filter=lfs diff=lfs merge=lfs -text
3
+ *.bin filter=lfs diff=lfs merge=lfs -text
4
+ *.bz2 filter=lfs diff=lfs merge=lfs -text
5
+ *.ckpt filter=lfs diff=lfs merge=lfs -text
6
+ *.ftz filter=lfs diff=lfs merge=lfs -text
7
+ *.gz filter=lfs diff=lfs merge=lfs -text
8
+ *.h5 filter=lfs diff=lfs merge=lfs -text
9
+ *.joblib filter=lfs diff=lfs merge=lfs -text
10
+ *.lfs.* filter=lfs diff=lfs merge=lfs -text
11
+ *.mlmodel filter=lfs diff=lfs merge=lfs -text
12
+ *.model filter=lfs diff=lfs merge=lfs -text
13
+ *.msgpack filter=lfs diff=lfs merge=lfs -text
14
+ *.npy filter=lfs diff=lfs merge=lfs -text
15
+ *.npz filter=lfs diff=lfs merge=lfs -text
16
+ *.onnx filter=lfs diff=lfs merge=lfs -text
17
+ *.ot filter=lfs diff=lfs merge=lfs -text
18
+ *.parquet filter=lfs diff=lfs merge=lfs -text
19
+ *.pb filter=lfs diff=lfs merge=lfs -text
20
+ *.pickle filter=lfs diff=lfs merge=lfs -text
21
+ *.pkl filter=lfs diff=lfs merge=lfs -text
22
+ *.pt filter=lfs diff=lfs merge=lfs -text
23
+ *.pth filter=lfs diff=lfs merge=lfs -text
24
+ *.rar filter=lfs diff=lfs merge=lfs -text
25
+ *.safetensors filter=lfs diff=lfs merge=lfs -text
26
+ saved_model/**/* filter=lfs diff=lfs merge=lfs -text
27
+ *.tar.* filter=lfs diff=lfs merge=lfs -text
28
+ *.tflite filter=lfs diff=lfs merge=lfs -text
29
+ *.tgz filter=lfs diff=lfs merge=lfs -text
30
+ *.wasm filter=lfs diff=lfs merge=lfs -text
31
+ *.xz filter=lfs diff=lfs merge=lfs -text
32
+ *.zip filter=lfs diff=lfs merge=lfs -text
33
+ *.zst filter=lfs diff=lfs merge=lfs -text
34
+ *tfevents* filter=lfs diff=lfs merge=lfs -text
.gitignore ADDED
@@ -0,0 +1,4 @@
 
 
 
 
 
1
+ .env
2
+ .vscode
3
+ .chroma
4
+ __pycache__
Dockerfile ADDED
@@ -0,0 +1,11 @@
 
 
 
 
 
 
 
 
 
 
 
 
1
+ FROM python:3.9
2
+ RUN useradd -m -u 1000 user
3
+ USER user
4
+ ENV HOME=/home/user \
5
+ PATH=/home/user/.local/bin:$PATH
6
+ WORKDIR $HOME/app
7
+ COPY --chown=user . $HOME/app
8
+ COPY ./requirements.txt ~/app/requirements.txt
9
+ RUN pip install -r requirements.txt
10
+ COPY . .
11
+ CMD ["chainlit", "run", "app.py", "--port", "7860"]
README.md ADDED
@@ -0,0 +1,10 @@
 
 
 
 
 
 
 
 
 
 
 
1
+ ---
2
+ title: ArxivChainLitDemo
3
+ emoji: 💻
4
+ colorFrom: indigo
5
+ colorTo: gray
6
+ sdk: docker
7
+ pinned: false
8
+ license: openrail
9
+ duplicated_from: c-s-ale/ArxivChainLitDemo
10
+ ---
app.py ADDED
@@ -0,0 +1,103 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ from langchain.embeddings.openai import OpenAIEmbeddings
2
+ from langchain.document_loaders import PyMuPDFLoader
3
+ from langchain.text_splitter import RecursiveCharacterTextSplitter
4
+ from langchain.vectorstores import Chroma
5
+ from langchain.chains import RetrievalQAWithSourcesChain
6
+ from langchain.chat_models import ChatOpenAI
7
+ from langchain.prompts.chat import (
8
+ ChatPromptTemplate,
9
+ SystemMessagePromptTemplate,
10
+ HumanMessagePromptTemplate,
11
+ )
12
+ import os
13
+ import arxiv
14
+ import chainlit as cl
15
+ from chainlit import user_session
16
+
17
+ @cl.langchain_factory(use_async=True)
18
+ async def init():
19
+ arxiv_query = None
20
+
21
+ # Wait for the user to ask an Arxiv question
22
+ while arxiv_query == None:
23
+ arxiv_query = await cl.AskUserMessage(
24
+ content="Please enter a topic to begin!", timeout=15
25
+ ).send()
26
+
27
+ # Obtain the top 30 results from Arxiv for the query
28
+ search = arxiv.Search(
29
+ query=arxiv_query["content"],
30
+ max_results=3,
31
+ sort_by=arxiv.SortCriterion.Relevance,
32
+ )
33
+
34
+ await cl.Message(content="Downloading and chunking articles...").send()
35
+ # download each of the pdfs
36
+ pdf_data = []
37
+ for result in search.results():
38
+ loader = PyMuPDFLoader(result.pdf_url)
39
+ loaded_pdf = loader.load()
40
+
41
+ for document in loaded_pdf:
42
+ document.metadata["source"] = result.entry_id
43
+ document.metadata["file_path"] = result.pdf_url
44
+ document.metadata["title"] = result.title
45
+ pdf_data.append(document)
46
+
47
+ # Create a Chroma vector store
48
+ embeddings = OpenAIEmbeddings(
49
+ disallowed_special=(),
50
+ )
51
+
52
+ # If operation takes too long, make_async allows to run in a thread
53
+ # docsearch = await cl.make_async(Chroma.from_documents)(pdf_data, embeddings)
54
+ docsearch = Chroma.from_documents(pdf_data, embeddings)
55
+
56
+ # Create a chain that uses the Chroma vector store
57
+ chain = RetrievalQAWithSourcesChain.from_chain_type(
58
+ ChatOpenAI(
59
+ model_name="gpt-3.5-turbo-16k",
60
+ temperature=0,
61
+ ),
62
+ chain_type="stuff",
63
+ retriever=docsearch.as_retriever(),
64
+ return_source_documents=True,
65
+ )
66
+
67
+ # Let the user know that the system is ready
68
+ await cl.Message(
69
+ content=f"We found a few papers about `{arxiv_query['content']}` you can now ask questions!"
70
+ ).send()
71
+
72
+ return chain
73
+
74
+
75
@cl.langchain_postprocess
async def process_response(res):
    """Send the chain's answer to the user with inline per-paper citations.

    Args:
        res: Chain result dict with an ``"answer"`` string and a
            ``"source_documents"`` list whose items carry ``title``,
            ``page`` and ``file_path`` metadata.
    """
    answer = res["answer"]

    # Group cited pages by paper title:
    #   title -> {"page_number": [sorted pages], "url": pdf url}
    grouped_sources = {}
    for doc in res["source_documents"]:
        paper_title = doc.metadata["title"]
        if paper_title in grouped_sources:
            entry = grouped_sources[paper_title]
            entry["page_number"].append(doc.metadata["page"])
            # Keep the page list sorted as we accumulate citations.
            entry["page_number"].sort()
        else:
            grouped_sources[paper_title] = {
                "page_number": [doc.metadata["page"]],
                "url": doc.metadata["file_path"],
            }

    # Turn each grouped paper into an inline text element for the message.
    source_elements = []
    for paper_title, entry in grouped_sources.items():
        pages = ", ".join(str(p) for p in entry["page_number"])
        text_for_source = f"Page Number(s): {pages}\nURL: {entry['url']}"
        source_elements.append(
            cl.Text(name=paper_title, content=text_for_source, display="inline")
        )

    await cl.Message(content=answer, elements=source_elements).send()
chainlit.md ADDED
@@ -0,0 +1,11 @@
 
 
 
 
 
 
 
 
 
 
 
 
1
+ # ⚠️ Warning ⚠️
2
+
3
+ You will need an OpenAI API key to use this app due to the large context size!
4
+
5
+ # Welcome to AskArxiv powered by Chainlit!
6
+
7
+ In this app, you'll be able to enter a topic — and then ask questions about a few of the most relevant arXiv papers on that topic!
8
+
9
+ ### Link To Demo
10
+
11
+ [Hugging Face Space]()
requirements.txt ADDED
@@ -0,0 +1,7 @@
 
 
 
 
 
 
 
 
1
+ arxiv==1.4.7
2
+ langchain==0.0.202
3
+ chainlit==0.4.1
4
+ openai
5
+ chromadb
6
+ tiktoken
7
+ pymupdf