xuyingliKepler committed on
Commit
613ac12
•
1 Parent(s): 40f4687

Update app.py

Browse files
Files changed (1) hide show
  1. app.py +102 -102
app.py CHANGED
@@ -42,121 +42,121 @@ def process_pdf(uploaded_file):
42
 
43
 
44
  def smaller_chunks_strategy(docs):
45
- with st.spinner('Processing with smaller_chunks_strategy'):
46
- vectorstore = Chroma(
47
- collection_name="full_documents",
48
- embedding_function=OpenAIEmbeddings()
49
- )
50
- store = InMemoryStore()
51
- id_key = "doc_id"
52
- retriever = MultiVectorRetriever(
53
- vectorstore=vectorstore,
54
- docstore=store,
55
- id_key=id_key,
56
- )
57
- doc_ids = [str(uuid.uuid4()) for _ in docs]
58
- child_text_splitter = RecursiveCharacterTextSplitter(chunk_size=400)
59
- sub_docs = []
60
- for i, doc in enumerate(docs):
61
- _id = doc_ids[i]
62
- _sub_docs = child_text_splitter.split_documents([doc])
63
- for _doc in _sub_docs:
64
- _doc.metadata[id_key] = _id
65
- sub_docs.extend(_sub_docs)
66
-
67
- retriever.vectorstore.add_documents(sub_docs)
68
- retriever.docstore.mset(list(zip(doc_ids, docs)))
69
- memory = ConversationBufferMemory(memory_key="chat_history", return_messages=True)
70
- qa = ConversationalRetrievalChain.from_llm(OpenAI(temperature=0), retriever, memory=memory)
71
  prompt = st.text_input("Enter Your Question:", placeholder="Ask something", key="1")
72
  if prompt:
73
- st.info(prompt, icon="🧐")
74
- result = qa({"question": prompt})
75
- st.success(result['answer'], icon="🤖")
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
76
 
77
 
78
  def summary_strategy(docs):
79
- with st.spinner('Processing with summary_strategy'):
80
- chain = (
81
- {"doc": lambda x: x.page_content}
82
- | ChatPromptTemplate.from_template("Summarize the following document:\n\n{doc}")
83
- | ChatOpenAI(max_retries=0)
84
- | StrOutputParser()
85
- )
86
- summaries = chain.batch(docs, {"max_concurrency": 5})
87
- vectorstore = Chroma(
88
- collection_name="summaries",
89
- embedding_function= OpenAIEmbeddings()
90
- )
91
- store = InMemoryStore()
92
- id_key = "doc_id"
93
- retriever = MultiVectorRetriever(
94
- vectorstore=vectorstore,
95
- docstore=store,
96
- id_key=id_key,
97
- )
98
- doc_ids = [str(uuid.uuid4()) for _ in docs]
99
- summary_docs = [Document(page_content=s, metadata={id_key: doc_ids[i]}) for i, s in enumerate(summaries)]
100
- retriever.vectorstore.add_documents(summary_docs)
101
- retriever.docstore.mset(list(zip(doc_ids, docs)))
102
- qa = ConversationalRetrievalChain.from_llm(OpenAI(temperature=0), retriever, memory=ConversationBufferMemory(memory_key="chat_history", return_messages=True))
103
  prompt = st.text_input("Enter Your Question:", placeholder="Ask something", key="2")
104
  if prompt:
105
- st.info(prompt, icon="🧐")
106
- result = qa({"question": prompt})
107
- st.success(result['answer'], icon="🤖")
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
108
 
109
 
110
  def hypothetical_questions_strategy(docs):
111
- with st.spinner('Processing with hypothetical_questions_strategy'):
112
- functions = [
113
- {
114
- "name": "hypothetical_questions",
115
- "description": "Generate hypothetical questions",
116
- "parameters": {
117
- "type": "object",
118
- "properties": {
119
- "questions": {
120
- "type": "array",
121
- "items": {
122
- "type": "string"
 
 
 
123
  },
124
  },
125
- },
126
- "required": ["questions"]
127
  }
128
- }
129
- ]
130
- chain = (
131
- {"doc": lambda x: x.page_content}
132
- | ChatPromptTemplate.from_template("Generate a list of 3 hypothetical questions that the below document could be used to answer:\n\n{doc}")
133
- | ChatOpenAI(max_retries=0, model="gpt-4").bind(functions=functions, function_call={"name": "hypothetical_questions"})
134
- | JsonKeyOutputFunctionsParser(key_name="questions")
135
- )
136
- hypothetical_questions = chain.batch(docs, {"max_concurrency": 5})
137
- vectorstore = Chroma(
138
- collection_name="hypo-questions",
139
- embedding_function=OpenAIEmbeddings()
140
- )
141
- store = InMemoryStore()
142
- id_key = "doc_id"
143
- retriever = MultiVectorRetriever(
144
- vectorstore=vectorstore,
145
- docstore=store,
146
- id_key=id_key,
147
- )
148
- doc_ids = [str(uuid.uuid4()) for _ in docs]
149
- question_docs = []
150
- for i, question_list in enumerate(hypothetical_questions):
151
- question_docs.extend([Document(page_content=s, metadata={id_key: doc_ids[i]}) for s in question_list])
152
- retriever.vectorstore.add_documents(question_docs)
153
- retriever.docstore.mset(list(zip(doc_ids, docs)))
154
- qa = ConversationalRetrievalChain.from_llm(OpenAI(temperature=0), retriever, memory=ConversationBufferMemory(memory_key="chat_history", return_messages=True))
155
- prompt = st.text_input("Enter Your Question:", placeholder="Ask something", key="3")
156
- if prompt:
157
- st.info(prompt, icon="🧐")
158
- result = qa({"question": prompt})
159
- st.success(result['answer'], icon="🤖")
160
 
161
 
162
 
 
42
 
43
 
44
  def smaller_chunks_strategy(docs):
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
45
  prompt = st.text_input("Enter Your Question:", placeholder="Ask something", key="1")
46
  if prompt:
47
+ with st.spinner('Processing with smaller_chunks_strategy'):
48
+ vectorstore = Chroma(
49
+ collection_name="full_documents",
50
+ embedding_function=OpenAIEmbeddings()
51
+ )
52
+ store = InMemoryStore()
53
+ id_key = "doc_id"
54
+ retriever = MultiVectorRetriever(
55
+ vectorstore=vectorstore,
56
+ docstore=store,
57
+ id_key=id_key,
58
+ )
59
+ doc_ids = [str(uuid.uuid4()) for _ in docs]
60
+ child_text_splitter = RecursiveCharacterTextSplitter(chunk_size=400)
61
+ sub_docs = []
62
+ for i, doc in enumerate(docs):
63
+ _id = doc_ids[i]
64
+ _sub_docs = child_text_splitter.split_documents([doc])
65
+ for _doc in _sub_docs:
66
+ _doc.metadata[id_key] = _id
67
+ sub_docs.extend(_sub_docs)
68
+
69
+ retriever.vectorstore.add_documents(sub_docs)
70
+ retriever.docstore.mset(list(zip(doc_ids, docs)))
71
+ memory = ConversationBufferMemory(memory_key="chat_history", return_messages=True)
72
+ qa = ConversationalRetrievalChain.from_llm(OpenAI(temperature=0), retriever, memory=memory)
73
+ st.info(prompt, icon="🧐")
74
+ result = qa({"question": prompt})
75
+ st.success(result['answer'], icon="🤖")
76
 
77
 
78
  def summary_strategy(docs):
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
79
  prompt = st.text_input("Enter Your Question:", placeholder="Ask something", key="2")
80
  if prompt:
81
+ with st.spinner('Processing with summary_strategy'):
82
+ chain = (
83
+ {"doc": lambda x: x.page_content}
84
+ | ChatPromptTemplate.from_template("Summarize the following document:\n\n{doc}")
85
+ | ChatOpenAI(max_retries=0)
86
+ | StrOutputParser()
87
+ )
88
+ summaries = chain.batch(docs, {"max_concurrency": 5})
89
+ vectorstore = Chroma(
90
+ collection_name="summaries",
91
+ embedding_function= OpenAIEmbeddings()
92
+ )
93
+ store = InMemoryStore()
94
+ id_key = "doc_id"
95
+ retriever = MultiVectorRetriever(
96
+ vectorstore=vectorstore,
97
+ docstore=store,
98
+ id_key=id_key,
99
+ )
100
+ doc_ids = [str(uuid.uuid4()) for _ in docs]
101
+ summary_docs = [Document(page_content=s, metadata={id_key: doc_ids[i]}) for i, s in enumerate(summaries)]
102
+ retriever.vectorstore.add_documents(summary_docs)
103
+ retriever.docstore.mset(list(zip(doc_ids, docs)))
104
+ qa = ConversationalRetrievalChain.from_llm(OpenAI(temperature=0), retriever, memory=ConversationBufferMemory(memory_key="chat_history", return_messages=True))
105
+ st.info(prompt, icon="🧐")
106
+ result = qa({"question": prompt})
107
+ st.success(result['answer'], icon="🤖")
108
 
109
 
110
  def hypothetical_questions_strategy(docs):
111
+ prompt = st.text_input("Enter Your Question:", placeholder="Ask something", key="3")
112
+ if prompt:
113
+ with st.spinner('Processing with hypothetical_questions_strategy'):
114
+ functions = [
115
+ {
116
+ "name": "hypothetical_questions",
117
+ "description": "Generate hypothetical questions",
118
+ "parameters": {
119
+ "type": "object",
120
+ "properties": {
121
+ "questions": {
122
+ "type": "array",
123
+ "items": {
124
+ "type": "string"
125
+ },
126
  },
127
  },
128
+ "required": ["questions"]
129
+ }
130
  }
131
+ ]
132
+ chain = (
133
+ {"doc": lambda x: x.page_content}
134
+ | ChatPromptTemplate.from_template("Generate a list of 3 hypothetical questions that the below document could be used to answer:\n\n{doc}")
135
+ | ChatOpenAI(max_retries=0, model="gpt-4").bind(functions=functions, function_call={"name": "hypothetical_questions"})
136
+ | JsonKeyOutputFunctionsParser(key_name="questions")
137
+ )
138
+ hypothetical_questions = chain.batch(docs, {"max_concurrency": 5})
139
+ vectorstore = Chroma(
140
+ collection_name="hypo-questions",
141
+ embedding_function=OpenAIEmbeddings()
142
+ )
143
+ store = InMemoryStore()
144
+ id_key = "doc_id"
145
+ retriever = MultiVectorRetriever(
146
+ vectorstore=vectorstore,
147
+ docstore=store,
148
+ id_key=id_key,
149
+ )
150
+ doc_ids = [str(uuid.uuid4()) for _ in docs]
151
+ question_docs = []
152
+ for i, question_list in enumerate(hypothetical_questions):
153
+ question_docs.extend([Document(page_content=s, metadata={id_key: doc_ids[i]}) for s in question_list])
154
+ retriever.vectorstore.add_documents(question_docs)
155
+ retriever.docstore.mset(list(zip(doc_ids, docs)))
156
+ qa = ConversationalRetrievalChain.from_llm(OpenAI(temperature=0), retriever, memory=ConversationBufferMemory(memory_key="chat_history", return_messages=True))
157
+ st.info(prompt, icon="🧐")
158
+ result = qa({"question": prompt})
159
+ st.success(result['answer'], icon="🤖")
 
 
 
160
 
161
 
162