TomData commited on
Commit
f3d0f1e
·
1 Parent(s): 87da358

coppy 22.04

Browse files
Home.py ADDED
@@ -0,0 +1,25 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ import gradio as gr
2
+ from src.chatbot import chatbot, keyword_search
3
+
4
# Components for the keyword-search tab. They are created up front and handed
# to gr.Interface further down.
# NOTE(review): adjusting the size of each block via `scale` is not working
# yet (open issue carried over from the original comment).
results_table = gr.DataFrame(height=1000, show_label=True, scale=2)
# Deliberately not named `input`, so the builtin input() is not shadowed.
query_box = gr.Textbox(scale=1)

with gr.Blocks() as App:
    with gr.Tab("ChatBot"):
        # Apply RAG using the `chatbot` function from src/chatbot.py.
        gr.ChatInterface(
            chatbot,
            title="PoliticsToYou",
            # Adjacent literals are concatenated by the parser, so no source
            # indentation can leak into the text shown to users (which a
            # backslash continuation inside the string would allow).
            description=(
                "This chatbot uses the infomation of speeches of the german parliament (since 2021) "
                "to get insight on the view points of the german parties and the debate of the parliament."
            ),
            # TODO: change to meaningful examples
            examples=[
                "Wie steht die CDU zur Cannabislegalisierung?",
                "Was waren die wichtigsten Themen in der aktuellen Legislaturperiode?",
            ],
            cache_examples=False,  # True increases the loading time
        )
    with gr.Tab("KeyWordSearch"):
        gr.Interface(fn=keyword_search, inputs=query_box, outputs=results_table)


if __name__ == "__main__":
    App.launch(share=True)
24
+
25
+
README.md CHANGED
@@ -5,7 +5,7 @@ colorFrom: pink
5
  colorTo: green
6
  sdk: gradio
7
  sdk_version: 4.26.0
8
- app_file: app.py
9
  pinned: false
10
  ---
11
 
 
5
  colorTo: green
6
  sdk: gradio
7
  sdk_version: 4.26.0
8
+ app_file: Home.py
9
  pinned: false
10
  ---
11
 
requirements.txt ADDED
@@ -0,0 +1,7 @@
 
 
 
 
 
 
 
 
1
+ pandas==2.1.3
2
+ langchain==0.1.15
3
+ transformers==4.35.2
4
+ gradio==4.26.0
5
+ sentence-transformers==2.6.1
6
+ python-dotenv
7
+ faiss-cpu
src/FAISS.ipynb ADDED
@@ -0,0 +1,46 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ {
2
+ "cells": [
3
+ {
4
+ "cell_type": "code",
5
+ "execution_count": 1,
6
+ "metadata": {},
7
+ "outputs": [],
8
+ "source": [
9
+ "# Create vectorstore\n",
10
+ "import pandas as pd\n",
11
+ "from vectordatabase import load_documents\n",
12
+ "from langchain_community.embeddings import HuggingFaceEmbeddings\n",
13
+ "from langchain_community.vectorstores import FAISS\n",
14
+ "\n",
15
+ "\n",
16
+ "df = pd.read_pickle(\"C:\\\\Users\\Tom\\SynologyDrive\\Programming\\\\NLP\\PoliticsToYou\\ChatBot\\Speeches\\speeches_1949_09_12.pkl\")\n",
17
+ "# Split speeches into documents\n",
18
+ "documents = load_documents(df)\n",
19
+ "embeddings = HuggingFaceEmbeddings(model_name=\"paraphrase-multilingual-MiniLM-L12-v2\")\n",
20
+ "db = FAISS.from_documents(documents, embeddings)\n",
21
+ "db.save_local(folder_path=\"ChatBot\\FAISS\", index_name=\"speeches_1949_09_12\")\n"
22
+ ]
23
+ }
24
+ ],
25
+ "metadata": {
26
+ "kernelspec": {
27
+ "display_name": "Python 3",
28
+ "language": "python",
29
+ "name": "python3"
30
+ },
31
+ "language_info": {
32
+ "codemirror_mode": {
33
+ "name": "ipython",
34
+ "version": 3
35
+ },
36
+ "file_extension": ".py",
37
+ "mimetype": "text/x-python",
38
+ "name": "python",
39
+ "nbconvert_exporter": "python",
40
+ "pygments_lexer": "ipython3",
41
+ "version": "3.11.4"
42
+ }
43
+ },
44
+ "nbformat": 4,
45
+ "nbformat_minor": 2
46
+ }
src/FAISS/legislature20.faiss ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:5e748028f9f5a5424be29c57df6b2387e3c723b6490cc48d33b4ab75fdc813de
3
+ size 82403373
src/FAISS/legislature20.pkl ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:614bbfa822757149ba2aa055d349b3e68828ba4eda24dfe1531ca523c446a3ea
3
+ size 73684827
src/Speeches/querry.ipynb ADDED
@@ -0,0 +1,247 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ {
2
+ "cells": [
3
+ {
4
+ "cell_type": "code",
5
+ "execution_count": 1,
6
+ "metadata": {},
7
+ "outputs": [],
8
+ "source": [
9
+ "import psycopg2\n",
10
+ "import pandas as pd"
11
+ ]
12
+ },
13
+ {
14
+ "cell_type": "markdown",
15
+ "metadata": {},
16
+ "source": [
17
+ "### Pandas\n"
18
+ ]
19
+ },
20
+ {
21
+ "cell_type": "code",
22
+ "execution_count": 2,
23
+ "metadata": {},
24
+ "outputs": [
25
+ {
26
+ "name": "stderr",
27
+ "output_type": "stream",
28
+ "text": [
29
+ "C:\\Users\\Tom\\AppData\\Local\\Temp\\ipykernel_21040\\1041354989.py:12: UserWarning: pandas only supports SQLAlchemy connectable (engine/connection) or database string URI or sqlite3 DBAPI2 connection. Other DBAPI2 objects are not tested. Please consider using SQLAlchemy.\n",
30
+ " df = pd.read_sql_query(\"\"\"SELECT s.id,s.speech_content,s.date,f.abbreviation AS party\n"
31
+ ]
32
+ }
33
+ ],
34
+ "source": [
35
+ "# db_connection -----------------------------------------------------------\n",
36
+ "con_details = {\n",
37
+ " \"host\" : \"localhost\",\n",
38
+ " \"database\" : \"next\",\n",
39
+ " \"user\" : \"postgres\",\n",
40
+ " \"password\" : \"postgres\",\n",
41
+ " \"port\" : \"5432\"\n",
42
+ "}\n",
43
+ "con = psycopg2.connect(**con_details)\n",
44
+ "\n",
45
+ "# get data tables ---------------------------------------------------------\n",
46
+ "df = pd.read_sql_query(\"\"\"SELECT s.id,s.speech_content,s.date,f.abbreviation AS party\n",
47
+ " FROM open_discourse.speeches AS s\n",
48
+ " INNER JOIN open_discourse.factions AS f ON\n",
49
+ " s.faction_id = f.id;\"\"\", con)\n",
50
+ "\n",
51
+ "\n"
52
+ ]
53
+ },
54
+ {
55
+ "cell_type": "markdown",
56
+ "metadata": {},
57
+ "source": [
58
+ "### Data Cleaning"
59
+ ]
60
+ },
61
+ {
62
+ "cell_type": "code",
63
+ "execution_count": 3,
64
+ "metadata": {},
65
+ "outputs": [
66
+ {
67
+ "data": {
68
+ "text/html": [
69
+ "<div>\n",
70
+ "<style scoped>\n",
71
+ " .dataframe tbody tr th:only-of-type {\n",
72
+ " vertical-align: middle;\n",
73
+ " }\n",
74
+ "\n",
75
+ " .dataframe tbody tr th {\n",
76
+ " vertical-align: top;\n",
77
+ " }\n",
78
+ "\n",
79
+ " .dataframe thead th {\n",
80
+ " text-align: right;\n",
81
+ " }\n",
82
+ "</style>\n",
83
+ "<table border=\"1\" class=\"dataframe\">\n",
84
+ " <thead>\n",
85
+ " <tr style=\"text-align: right;\">\n",
86
+ " <th></th>\n",
87
+ " <th>id</th>\n",
88
+ " <th>speech_content</th>\n",
89
+ " <th>date</th>\n",
90
+ " <th>party</th>\n",
91
+ " </tr>\n",
92
+ " </thead>\n",
93
+ " <tbody>\n",
94
+ " <tr>\n",
95
+ " <th>0</th>\n",
96
+ " <td>0</td>\n",
97
+ " <td>Meine Damen und Herren! Ich eröffne die 2. Sit...</td>\n",
98
+ " <td>1949-09-12</td>\n",
99
+ " <td>not found</td>\n",
100
+ " </tr>\n",
101
+ " <tr>\n",
102
+ " <th>1</th>\n",
103
+ " <td>1</td>\n",
104
+ " <td>Der Bundesrat ist versammelt, Herr Präsident.\\n</td>\n",
105
+ " <td>1949-09-12</td>\n",
106
+ " <td>not found</td>\n",
107
+ " </tr>\n",
108
+ " <tr>\n",
109
+ " <th>2</th>\n",
110
+ " <td>2</td>\n",
111
+ " <td>Ich danke für diese Erklärung. Ich stelle dami...</td>\n",
112
+ " <td>1949-09-12</td>\n",
113
+ " <td>not found</td>\n",
114
+ " </tr>\n",
115
+ " <tr>\n",
116
+ " <th>3</th>\n",
117
+ " <td>3</td>\n",
118
+ " <td>Ja, ich habe den Wunsch.\\n</td>\n",
119
+ " <td>1949-09-12</td>\n",
120
+ " <td>not found</td>\n",
121
+ " </tr>\n",
122
+ " <tr>\n",
123
+ " <th>4</th>\n",
124
+ " <td>4</td>\n",
125
+ " <td>Ich erteile dem Herrn Bundespräsidenten das Wo...</td>\n",
126
+ " <td>1949-09-12</td>\n",
127
+ " <td>not found</td>\n",
128
+ " </tr>\n",
129
+ " <tr>\n",
130
+ " <th>...</th>\n",
131
+ " <td>...</td>\n",
132
+ " <td>...</td>\n",
133
+ " <td>...</td>\n",
134
+ " <td>...</td>\n",
135
+ " </tr>\n",
136
+ " <tr>\n",
137
+ " <th>930955</th>\n",
138
+ " <td>1084268</td>\n",
139
+ " <td>\\n\\nWir sind zwar Kollegen.</td>\n",
140
+ " <td>2022-12-16</td>\n",
141
+ " <td>not found</td>\n",
142
+ " </tr>\n",
143
+ " <tr>\n",
144
+ " <th>930956</th>\n",
145
+ " <td>1084269</td>\n",
146
+ " <td>\\n\\nLiebe, sehr geehrte Frau Präsidentin!</td>\n",
147
+ " <td>2022-12-16</td>\n",
148
+ " <td>CDU/CSU</td>\n",
149
+ " </tr>\n",
150
+ " <tr>\n",
151
+ " <th>930957</th>\n",
152
+ " <td>1084270</td>\n",
153
+ " <td>\\n\\nVielen Dank.</td>\n",
154
+ " <td>2022-12-16</td>\n",
155
+ " <td>not found</td>\n",
156
+ " </tr>\n",
157
+ " <tr>\n",
158
+ " <th>930958</th>\n",
159
+ " <td>1084272</td>\n",
160
+ " <td>\\n\\nDen Abschluss dieser Aktuellen Stunde bild...</td>\n",
161
+ " <td>2022-12-16</td>\n",
162
+ " <td>not found</td>\n",
163
+ " </tr>\n",
164
+ " <tr>\n",
165
+ " <th>930959</th>\n",
166
+ " <td>1084273</td>\n",
167
+ " <td>\\n\\nSehr geehrte Frau Präsidentin! Werte Kolle...</td>\n",
168
+ " <td>2022-12-16</td>\n",
169
+ " <td>SPD</td>\n",
170
+ " </tr>\n",
171
+ " </tbody>\n",
172
+ "</table>\n",
173
+ "<p>930960 rows × 4 columns</p>\n",
174
+ "</div>"
175
+ ],
176
+ "text/plain": [
177
+ " id speech_content \\\n",
178
+ "0 0 Meine Damen und Herren! Ich eröffne die 2. Sit... \n",
179
+ "1 1 Der Bundesrat ist versammelt, Herr Präsident.\\n \n",
180
+ "2 2 Ich danke für diese Erklärung. Ich stelle dami... \n",
181
+ "3 3 Ja, ich habe den Wunsch.\\n \n",
182
+ "4 4 Ich erteile dem Herrn Bundespräsidenten das Wo... \n",
183
+ "... ... ... \n",
184
+ "930955 1084268 \\n\\nWir sind zwar Kollegen. \n",
185
+ "930956 1084269 \\n\\nLiebe, sehr geehrte Frau Präsidentin! \n",
186
+ "930957 1084270 \\n\\nVielen Dank. \n",
187
+ "930958 1084272 \\n\\nDen Abschluss dieser Aktuellen Stunde bild... \n",
188
+ "930959 1084273 \\n\\nSehr geehrte Frau Präsidentin! Werte Kolle... \n",
189
+ "\n",
190
+ " date party \n",
191
+ "0 1949-09-12 not found \n",
192
+ "1 1949-09-12 not found \n",
193
+ "2 1949-09-12 not found \n",
194
+ "3 1949-09-12 not found \n",
195
+ "4 1949-09-12 not found \n",
196
+ "... ... ... \n",
197
+ "930955 2022-12-16 not found \n",
198
+ "930956 2022-12-16 CDU/CSU \n",
199
+ "930957 2022-12-16 not found \n",
200
+ "930958 2022-12-16 not found \n",
201
+ "930959 2022-12-16 SPD \n",
202
+ "\n",
203
+ "[930960 rows x 4 columns]"
204
+ ]
205
+ },
206
+ "execution_count": 3,
207
+ "metadata": {},
208
+ "output_type": "execute_result"
209
+ }
210
+ ],
211
+ "source": [
212
+ "df[\"speech_content\"].replace(\"\\({\\d+}\\)\", \"\", inplace=True, regex=True) #removing keys from interruptions\n",
213
+ "df"
214
+ ]
215
+ },
216
+ {
217
+ "cell_type": "code",
218
+ "execution_count": null,
219
+ "metadata": {},
220
+ "outputs": [],
221
+ "source": [
222
+ "df.to_pickle(\"speeches_1949_09_12\")"
223
+ ]
224
+ }
225
+ ],
226
+ "metadata": {
227
+ "kernelspec": {
228
+ "display_name": "Python 3",
229
+ "language": "python",
230
+ "name": "python3"
231
+ },
232
+ "language_info": {
233
+ "codemirror_mode": {
234
+ "name": "ipython",
235
+ "version": 3
236
+ },
237
+ "file_extension": ".py",
238
+ "mimetype": "text/x-python",
239
+ "name": "python",
240
+ "nbconvert_exporter": "python",
241
+ "pygments_lexer": "ipython3",
242
+ "version": "3.11.4"
243
+ }
244
+ },
245
+ "nbformat": 4,
246
+ "nbformat_minor": 2
247
+ }
src/chatbot.py ADDED
@@ -0,0 +1,78 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ from langchain_text_splitters import RecursiveCharacterTextSplitter
2
+ from langchain_core.prompts import ChatPromptTemplate
3
+ from langchain_community.llms.huggingface_hub import HuggingFaceHub
4
+ from langchain_community.embeddings import HuggingFaceEmbeddings
5
+
6
+ from src.vectordatabase import RAG, get_vectorstore
7
+ import pandas as pd
8
+ import os
9
+ from dotenv import load_dotenv, find_dotenv
10
+
11
+ # Load environment variables from the .env file
12
+ load_dotenv(find_dotenv())
13
+
14
+
15
+ embeddings = HuggingFaceEmbeddings(model_name="paraphrase-multilingual-MiniLM-L12-v2")
16
+ llm = HuggingFaceHub(
17
+ # Try different model here
18
+ # repo_id="mistralai/Mixtral-8x7B-Instruct-v0.1",
19
+ repo_id="CohereForAI/c4ai-command-r-v01",
20
+ task="text-generation",
21
+ model_kwargs={
22
+ "max_new_tokens": 512,
23
+ "top_k": 30,
24
+ "temperature": 0.1,
25
+ "repetition_penalty": 1.03,
26
+ }
27
+ )
28
+ # To Do: Experiment with different templates replying in german or english depending on the input language
29
+ prompt1 = ChatPromptTemplate.from_template("""<s>[INST]
30
+ Instruction: Beantworte die folgende Frage auf deutsch und nur auf der Grundlage des angegebenen Kontexts:
31
+
32
+ Context: {context}
33
+
34
+ Question: {input}
35
+ [/INST]"""
36
+ # Returns the answer in English!?
37
+ )
38
+
39
+ prompt2 = ChatPromptTemplate.from_template("""Beantworte die folgende Frage auf deutsch und nur auf der Grundlage des angegebenen Kontexts:
40
+
41
+ <context>
42
+ {context}
43
+ </context>
44
+
45
+ Frage: {input}
46
+ """
47
+ # Returns the answer in German
48
+ )
49
+
50
+
51
+ folder_path = "./src/FAISS"
52
+ #index_name = "speeches_1949_09_12"
53
+ index_name = "legislature20"
54
+ db = get_vectorstore(embeddings=embeddings, folder_path=folder_path, index_name=index_name)
55
+
56
def chatbot(message, history, db=db, llm=llm, prompt=prompt2):
    """Answer a chat message with retrieval-augmented generation.

    Args:
        message: The user's question; forwarded to ``RAG`` as ``question``.
        history: Earlier chat turns supplied by the caller. It is accepted
            for the chat-callback signature but not used here.
        db: Vector store forwarded to ``RAG``.
        llm: Language model forwarded to ``RAG``.
        prompt: Prompt template forwarded to ``RAG``.

    Returns:
        The text that follows the first ``"Antwort: "`` marker in the
        generated answer (up to the next marker, if there is one). If the
        marker does not occur at all, the complete answer is returned.
    """
    raw_response = RAG(llm=llm, prompt=prompt, db=db, question=message)
    answer = raw_response['answer']
    segments = answer.split("Antwort: ")
    # The prompt does not force the model to emit the marker, so indexing
    # segments[1] unconditionally would raise IndexError on such answers.
    if len(segments) < 2:
        return answer
    return segments[1]
60
+
61
+ # Retrieve speech contents based on keywords
62
def keyword_search(query, db=db, embeddings=embeddings):
    """Look up speech passages for a free-text query.

    The query is embedded with ``embeddings`` and passed to the vector
    store's ``max_marginal_relevance_search_with_score_by_vector``.

    Args:
        query: Search text entered by the user.
        db: Vector store to search.
        embeddings: Embedding model used to encode ``query``.

    Returns:
        A DataFrame with the columns ``'Speech Content'`` and
        ``'Relevance'``, sorted ascending by ``'Relevance'``. It is empty
        (but keeps both columns) when the search yields nothing.
    """
    query_embedding = embeddings.embed_query(query)
    results = db.max_marginal_relevance_search_with_score_by_vector(query_embedding)

    # Each result is a (document, score) pair. Collect all rows first and
    # build the frame once, instead of growing it with pd.concat per result.
    # TODO: add Date/Party/Politician from doc.metadata once available.
    rows = [
        {'Speech Content': doc.page_content, 'Relevance': score}
        for doc, score in results
    ]
    df_res = pd.DataFrame(rows, columns=['Speech Content', 'Relevance'])

    # NOTE(review): presumably the score is a distance (smaller = closer
    # match), which is why ascending order is used - confirm.
    df_res.sort_values('Relevance', inplace=True)
    return df_res
src/vectordatabase.py ADDED
@@ -0,0 +1,53 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ from langchain_community.document_loaders import DataFrameLoader
2
+ from langchain_community.embeddings import HuggingFaceEmbeddings
3
+ from langchain_core.prompts import ChatPromptTemplate
4
+ from langchain_community.vectorstores import FAISS
5
+ from langchain_community.llms import HuggingFaceHub
6
+ from langchain.text_splitter import RecursiveCharacterTextSplitter
7
+ from langchain.chains.combine_documents import create_stuff_documents_chain
8
+ from langchain.chains import create_retrieval_chain
9
+
10
+ import os
11
+ #from dotenv import load_dotenv
12
+
13
+ #Load environmental variables from .env-file
14
+ #load_dotenv()
15
+
16
+
17
+ # Load documents to create a vectorstore later
18
def load_documents(df):
    """Convert a DataFrame of speeches into chunked documents.

    Args:
        df: DataFrame whose ``speech_content`` column holds the text that
            becomes the page content of each document.

    Returns:
        The result of ``RecursiveCharacterTextSplitter.split_documents`` on
        the rows loaded from ``df`` (chunk size 1024, overlap 32, both
        measured with ``len``).
    """
    # TODO: Create one initial vector store loading all the documents with
    # this function.
    chunker = RecursiveCharacterTextSplitter(
        chunk_size=1024,
        chunk_overlap=32,
        length_function=len,
        is_separator_regex=False,
    )
    rows = DataFrameLoader(data_frame=df, page_content_column='speech_content').load()
    return chunker.split_documents(documents=rows)
31
+
32
def get_vectorstore(embeddings, folder_path, index_name):
    """Load a previously saved FAISS vector store from disk.

    Args:
        embeddings: Embedding model passed on to ``FAISS.load_local``.
        folder_path: Directory that contains the saved index files.
        index_name: Base name of the index inside ``folder_path``.

    Returns:
        The vector store returned by ``FAISS.load_local``.
    """
    # Print the location that is about to be loaded.
    print("/".join([folder_path, index_name]))

    # TODO: Dynamically update and merge vector stores (e.g. build and save
    # the index from documents when it does not exist yet).

    # NOTE(review): allow_dangerous_deserialization=True opts in to loading
    # pickled data - only point this at index files from a trusted source.
    return FAISS.load_local(
        folder_path=folder_path,
        index_name=index_name,
        embeddings=embeddings,
        allow_dangerous_deserialization=True,
    )
44
+
45
+ # Apply RAG by providing the context and the question to the LLM using the predefined template
46
def RAG(llm, prompt, db, question):
    """Run one retrieval-augmented generation round trip.

    Args:
        llm: Language model used by the stuff-documents chain.
        prompt: Prompt template used by the stuff-documents chain.
        db: Vector store; its ``as_retriever()`` supplies the retriever.
        question: Text passed to the chain under the ``"input"`` key.

    Returns:
        Whatever the retrieval chain's ``invoke`` returns for the question.
    """
    answer_chain = create_stuff_documents_chain(llm=llm, prompt=prompt)
    pipeline = create_retrieval_chain(db.as_retriever(), answer_chain)
    return pipeline.invoke({"input": question})
53
+