hkoppen commited on
Commit
eeee47d
1 Parent(s): d9de169

Updates after sync

Browse files
Files changed (3) hide show
  1. app.py +11 -7
  2. document_qa_engine.py +142 -142
  3. utils.py +1 -1
app.py CHANGED
@@ -38,7 +38,7 @@ def manage_files(modal, document_store):
38
  if modal.is_open():
39
  with modal.container():
40
  uploaded_file = st.file_uploader(
41
- "Upload a CV in PDF format",
42
  type=("pdf",),
43
  on_change=new_file(),
44
  disabled=st.session_state['document_qa_model'] is None,
@@ -57,7 +57,7 @@ def manage_files(modal, document_store):
57
  if uploaded_file:
58
  st.session_state['file_uploaded'] = True
59
  st.session_state['files'] = pd.concat([st.session_state['files'], edited_df])
60
- with st.spinner('Processing the CV content...'):
61
  store_file_in_table(document_store, uploaded_file)
62
  ingest_document(uploaded_file)
63
 
@@ -103,7 +103,7 @@ def init_session_state():
103
 
104
  def set_page_config():
105
  st.set_page_config(
106
- page_title="CV Insights AI Assistant",
107
  page_icon=":shark:",
108
  initial_sidebar_state="expanded",
109
  layout="wide",
@@ -121,7 +121,8 @@ def update_running_model(api_key, model):
121
 
122
 
123
  def init_api_key_dict():
124
- st.session_state['models'] = OPENAI_MODELS + list(OPEN_MODELS) + ['local LLM']
 
125
  for model_name in OPENAI_MODELS:
126
  st.session_state['api_keys'][model_name] = None
127
 
@@ -158,8 +159,11 @@ def setup_model_selection():
158
  if model == 'local LLM':
159
  st.session_state['document_qa_model'] = init_qa(model)
160
 
161
- api_key = st.sidebar.text_input("Enter LLM-authorization Key:", type="password",
162
- disabled=st.session_state['current_selected_model'] == 'local LLM')
 
 
 
163
  if api_key and api_key != st.session_state['current_api_key']:
164
  update_running_model(api_key, model)
165
  st.session_state['current_api_key'] = api_key
@@ -213,7 +217,7 @@ class StreamlitApp:
213
  # Sidebar for Task Selection
214
  st.sidebar.header('Options:')
215
  model = setup_model_selection()
216
- setup_task_selection(model)
217
  st.divider()
218
  self.authenticator.logout()
219
  reset_chat_memory()
 
38
  if modal.is_open():
39
  with modal.container():
40
  uploaded_file = st.file_uploader(
41
+ "Upload a document in PDF format",
42
  type=("pdf",),
43
  on_change=new_file(),
44
  disabled=st.session_state['document_qa_model'] is None,
 
57
  if uploaded_file:
58
  st.session_state['file_uploaded'] = True
59
  st.session_state['files'] = pd.concat([st.session_state['files'], edited_df])
60
+ with st.spinner('Processing the document...'):
61
  store_file_in_table(document_store, uploaded_file)
62
  ingest_document(uploaded_file)
63
 
 
103
 
104
  def set_page_config():
105
  st.set_page_config(
106
+ page_title="Document Insights AI Assistant",
107
  page_icon=":shark:",
108
  initial_sidebar_state="expanded",
109
  layout="wide",
 
121
 
122
 
123
  def init_api_key_dict():
124
+ # st.session_state['models'] = OPENAI_MODELS + list(OPEN_MODELS) + ['local LLM']
125
+ st.session_state['models'] = OPENAI_MODELS
126
  for model_name in OPENAI_MODELS:
127
  st.session_state['api_keys'][model_name] = None
128
 
 
159
  if model == 'local LLM':
160
  st.session_state['document_qa_model'] = init_qa(model)
161
 
162
+ # api_key = st.sidebar.text_input("Enter LLM-authorization Key:", type="password",
163
+ # disabled=st.session_state['current_selected_model'] == 'local LLM')
164
+
165
+ api_key = "sk-proj-***REDACTED***"  # SECURITY: a live API key was hardcoded here — revoke it immediately and load the key from an environment variable or secrets store instead
166
+
167
  if api_key and api_key != st.session_state['current_api_key']:
168
  update_running_model(api_key, model)
169
  st.session_state['current_api_key'] = api_key
 
217
  # Sidebar for Task Selection
218
  st.sidebar.header('Options:')
219
  model = setup_model_selection()
220
+ # setup_task_selection(model)
221
  st.divider()
222
  self.authenticator.logout()
223
  reset_chat_memory()
document_qa_engine.py CHANGED
@@ -1,142 +1,142 @@
1
- from typing import List
2
-
3
- from haystack.dataclasses import ChatMessage
4
- from pypdf import PdfReader
5
- from haystack.utils import Secret
6
- from haystack import Pipeline, Document, component
7
-
8
- from haystack.components.preprocessors import DocumentCleaner, DocumentSplitter
9
- from haystack.components.writers import DocumentWriter
10
- from haystack.components.embedders import SentenceTransformersDocumentEmbedder, SentenceTransformersTextEmbedder
11
- from haystack.document_stores.in_memory import InMemoryDocumentStore
12
- from haystack.components.retrievers.in_memory import InMemoryEmbeddingRetriever
13
- from haystack.components.builders import DynamicChatPromptBuilder
14
- from haystack.components.generators.chat import OpenAIChatGenerator, HuggingFaceTGIChatGenerator
15
- from haystack.document_stores.types import DuplicatePolicy
16
-
17
- SENTENCE_RETREIVER_MODEL = "sentence-transformers/all-MiniLM-L6-v2"
18
-
19
- MAX_TOKENS = 500
20
-
21
- template = """
22
- As a professional HR recruiter given the following information, answer the question shortly and concisely in 1 or 2 sentences.
23
-
24
- Context:
25
- {% for document in documents %}
26
- {{ document.content }}
27
- {% endfor %}
28
-
29
- Question: {{question}}
30
- Answer:
31
- """
32
-
33
-
34
- @component
35
- class UploadedFileConverter:
36
- """
37
- A component to convert uploaded PDF files to Documents
38
- """
39
-
40
- @component.output_types(documents=List[Document])
41
- def run(self, uploaded_file):
42
- pdf = PdfReader(uploaded_file)
43
- documents = []
44
- # uploaded file name without .pdf at the end and with _ and page number at the end
45
- name = uploaded_file.name.rstrip('.PDF') + '_'
46
- for page in pdf.pages:
47
- documents.append(
48
- Document(
49
- content=page.extract_text(),
50
- meta={'name': name + f"_{page.page_number}"}))
51
- return {"documents": documents}
52
-
53
-
54
- def create_ingestion_pipeline(document_store):
55
- doc_embedder = SentenceTransformersDocumentEmbedder(model=SENTENCE_RETREIVER_MODEL)
56
- doc_embedder.warm_up()
57
-
58
- pipeline = Pipeline()
59
- pipeline.add_component("converter", UploadedFileConverter())
60
- pipeline.add_component("cleaner", DocumentCleaner())
61
- pipeline.add_component("splitter",
62
- DocumentSplitter(split_by="passage", split_length=100, split_overlap=10))
63
- pipeline.add_component("embedder", doc_embedder)
64
- pipeline.add_component("writer",
65
- DocumentWriter(document_store=document_store, policy=DuplicatePolicy.OVERWRITE))
66
-
67
- pipeline.connect("converter", "cleaner")
68
- pipeline.connect("cleaner", "splitter")
69
- pipeline.connect("splitter", "embedder")
70
- pipeline.connect("embedder", "writer")
71
- return pipeline
72
-
73
-
74
- def create_inference_pipeline(document_store, model_name, api_key):
75
- if model_name == "local LLM":
76
- generator = OpenAIChatGenerator(api_key=Secret.from_token("<local LLM doesn't need an API key>"),
77
- model=model_name,
78
- api_base_url="http://localhost:1234/v1",
79
- generation_kwargs={"max_tokens": MAX_TOKENS}
80
- )
81
- elif "gpt" in model_name:
82
- generator = OpenAIChatGenerator(api_key=Secret.from_token(api_key), model=model_name,
83
- generation_kwargs={"max_tokens": MAX_TOKENS},
84
- streaming_callback=lambda x: print(x),
85
- )
86
- else:
87
- generator = HuggingFaceTGIChatGenerator(token=Secret.from_token(api_key), model=model_name,
88
- generation_kwargs={"max_new_tokens": MAX_TOKENS}
89
- )
90
- pipeline = Pipeline()
91
- pipeline.add_component("text_embedder",
92
- SentenceTransformersTextEmbedder(model=SENTENCE_RETREIVER_MODEL))
93
- pipeline.add_component("retriever", InMemoryEmbeddingRetriever(document_store, top_k=3))
94
- pipeline.add_component("prompt_builder",
95
- DynamicChatPromptBuilder(runtime_variables=["query", "documents"]))
96
- pipeline.add_component("llm", generator)
97
- pipeline.connect("text_embedder.embedding", "retriever.query_embedding")
98
- pipeline.connect("retriever.documents", "prompt_builder.documents")
99
- pipeline.connect("prompt_builder.prompt", "llm.messages")
100
-
101
- return pipeline
102
-
103
-
104
- class DocumentQAEngine:
105
- def __init__(self,
106
- model_name,
107
- api_key=None
108
- ):
109
- self.api_key = api_key
110
- self.model_name = model_name
111
- document_store = InMemoryDocumentStore()
112
- self.chunks = []
113
- self.inference_pipeline = create_inference_pipeline(document_store, model_name, api_key)
114
- self.pdf_ingestion_pipeline = create_ingestion_pipeline(document_store)
115
-
116
- def ingest_pdf(self, uploaded_file):
117
- self.pdf_ingestion_pipeline.run({"converter": {"uploaded_file": uploaded_file}})
118
-
119
- def inference(self, query, input_messages: List[dict]):
120
- system_message = ChatMessage.from_system(
121
- "You are a consultant answering questions about potential AI use cases based on the uploaded document. Please provide accurate, concise answers in 1-5 sentences, referencing the document content.")
122
- messages = [system_message]
123
- for message in input_messages:
124
- if message["role"] == "user":
125
- messages.append(ChatMessage.from_system(message["content"]))
126
- else:
127
- messages.append(
128
- ChatMessage.from_user(message["content"]))
129
- messages.append(ChatMessage.from_user("""
130
- Relevant information from the uploaded documents:
131
- {% for doc in documents %}
132
- {{ doc.content }}
133
- {% endfor %}
134
-
135
- \nQuestion: {{query}}
136
- \nAnswer:
137
- """))
138
- res = self.inference_pipeline.run(data={"text_embedder": {"text": query},
139
- "prompt_builder": {"prompt_source": messages,
140
- "query": query
141
- }})
142
- return res["llm"]["replies"][0].content
 
1
+ from typing import List
2
+
3
+ from haystack.dataclasses import ChatMessage
4
+ from pypdf import PdfReader
5
+ from haystack.utils import Secret
6
+ from haystack import Pipeline, Document, component
7
+
8
+ from haystack.components.preprocessors import DocumentCleaner, DocumentSplitter
9
+ from haystack.components.writers import DocumentWriter
10
+ from haystack.components.embedders import SentenceTransformersDocumentEmbedder, SentenceTransformersTextEmbedder
11
+ from haystack.document_stores.in_memory import InMemoryDocumentStore
12
+ from haystack.components.retrievers.in_memory import InMemoryEmbeddingRetriever
13
+ from haystack.components.builders import DynamicChatPromptBuilder
14
+ from haystack.components.generators.chat import OpenAIChatGenerator, HuggingFaceTGIChatGenerator
15
+ from haystack.document_stores.types import DuplicatePolicy
16
+
17
+ SENTENCE_RETREIVER_MODEL = "sentence-transformers/all-MiniLM-L6-v2"
18
+
19
+ MAX_TOKENS = 500
20
+
21
+ template = """
22
+ As a professional HR recruiter given the following information, answer the question shortly and concisely in 1 or 2 sentences.
23
+
24
+ Context:
25
+ {% for document in documents %}
26
+ {{ document.content }}
27
+ {% endfor %}
28
+
29
+ Question: {{question}}
30
+ Answer:
31
+ """
32
+
33
+
34
+ @component
35
+ class UploadedFileConverter:
36
+ """
37
+ A component to convert uploaded PDF files to Documents
38
+ """
39
+
40
+ @component.output_types(documents=List[Document])
41
+ def run(self, uploaded_file):
42
+ pdf = PdfReader(uploaded_file)
43
+ documents = []
44
+ # uploaded file name without .pdf at the end and with _ and page number at the end
45
+ name = uploaded_file.name.rstrip('.PDF') + '_'
46
+ for page in pdf.pages:
47
+ documents.append(
48
+ Document(
49
+ content=page.extract_text(),
50
+ meta={'name': name + f"_{page.page_number}"}))
51
+ return {"documents": documents}
52
+
53
+
54
+ def create_ingestion_pipeline(document_store):
55
+ doc_embedder = SentenceTransformersDocumentEmbedder(model=SENTENCE_RETREIVER_MODEL)
56
+ doc_embedder.warm_up()
57
+
58
+ pipeline = Pipeline()
59
+ pipeline.add_component("converter", UploadedFileConverter())
60
+ pipeline.add_component("cleaner", DocumentCleaner())
61
+ pipeline.add_component("splitter",
62
+ DocumentSplitter(split_by="passage", split_length=100, split_overlap=10))
63
+ pipeline.add_component("embedder", doc_embedder)
64
+ pipeline.add_component("writer",
65
+ DocumentWriter(document_store=document_store, policy=DuplicatePolicy.OVERWRITE))
66
+
67
+ pipeline.connect("converter", "cleaner")
68
+ pipeline.connect("cleaner", "splitter")
69
+ pipeline.connect("splitter", "embedder")
70
+ pipeline.connect("embedder", "writer")
71
+ return pipeline
72
+
73
+
74
+ def create_inference_pipeline(document_store, model_name, api_key):
75
+ if model_name == "local LLM":
76
+ generator = OpenAIChatGenerator(api_key=Secret.from_token("<local LLM doesn't need an API key>"),
77
+ model=model_name,
78
+ api_base_url="http://localhost:1234/v1",
79
+ generation_kwargs={"max_tokens": MAX_TOKENS}
80
+ )
81
+ elif "gpt" in model_name:
82
+ generator = OpenAIChatGenerator(api_key=Secret.from_token(api_key), model=model_name,
83
+ generation_kwargs={"max_tokens": MAX_TOKENS},
84
+ streaming_callback=lambda chunk: print(chunk.content, end="", flush=True),
85
+ )
86
+ else:
87
+ generator = HuggingFaceTGIChatGenerator(token=Secret.from_token(api_key), model=model_name,
88
+ generation_kwargs={"max_new_tokens": MAX_TOKENS}
89
+ )
90
+ pipeline = Pipeline()
91
+ pipeline.add_component("text_embedder",
92
+ SentenceTransformersTextEmbedder(model=SENTENCE_RETREIVER_MODEL))
93
+ pipeline.add_component("retriever", InMemoryEmbeddingRetriever(document_store, top_k=3))
94
+ pipeline.add_component("prompt_builder",
95
+ DynamicChatPromptBuilder(runtime_variables=["query", "documents"]))
96
+ pipeline.add_component("llm", generator)
97
+ pipeline.connect("text_embedder.embedding", "retriever.query_embedding")
98
+ pipeline.connect("retriever.documents", "prompt_builder.documents")
99
+ pipeline.connect("prompt_builder.prompt", "llm.messages")
100
+
101
+ return pipeline
102
+
103
+
104
+ class DocumentQAEngine:
105
+ def __init__(self,
106
+ model_name,
107
+ api_key=None
108
+ ):
109
+ self.api_key = api_key
110
+ self.model_name = model_name
111
+ document_store = InMemoryDocumentStore()
112
+ self.chunks = []
113
+ self.inference_pipeline = create_inference_pipeline(document_store, model_name, api_key)
114
+ self.pdf_ingestion_pipeline = create_ingestion_pipeline(document_store)
115
+
116
+ def ingest_pdf(self, uploaded_file):
117
+ self.pdf_ingestion_pipeline.run({"converter": {"uploaded_file": uploaded_file}})
118
+
119
+ def inference(self, query, input_messages: List[dict]):
120
+ system_message = ChatMessage.from_system(
121
+ "You are a consultant answering questions about potential AI use cases based on the uploaded document. Please provide accurate, concise answers in 3-5 sentences, referencing both the document content and additional sources.")
122
+ messages = [system_message]
123
+ for message in input_messages:
124
+ if message["role"] == "user":
125
+ messages.append(ChatMessage.from_system(message["content"]))
126
+ else:
127
+ messages.append(
128
+ ChatMessage.from_user(message["content"]))
129
+ messages.append(ChatMessage.from_user("""
130
+ Relevant information from the uploaded documents:
131
+ {% for doc in documents %}
132
+ {{ doc.content }}
133
+ {% endfor %}
134
+
135
+ \nQuestion: {{query}}
136
+ \nAnswer:
137
+ """))
138
+ res = self.inference_pipeline.run(data={"text_embedder": {"text": query},
139
+ "prompt_builder": {"prompt_source": messages,
140
+ "query": query
141
+ }})
142
+ return res["llm"]["replies"][0].content
utils.py CHANGED
@@ -50,7 +50,7 @@ def append_documentation_to_sidebar():
50
  with st.expander("Documentation"):
51
  st.markdown(
52
  """
53
- Upload a CV as PDF document. Once the spinner stops, you can proceed to ask your questions. The answers will
54
  be displayed in the right column. The system will answer your questions using the content of the document
55
  and mark references over the PDF viewer.
56
  """)
 
50
  with st.expander("Documentation"):
51
  st.markdown(
52
  """
53
+ Upload a document in PDF format. Once the spinner stops, you can proceed to ask your questions. The answers will
54
  be displayed in the right column. The system will answer your questions using the content of the document
55
  and mark references over the PDF viewer.
56
  """)