AlbertoFH98 committed
Commit 04558b7 · Parent: 6996634

Upload application.py

Files changed (1)
  1. application.py +135 -0
application.py ADDED
@@ -0,0 +1,135 @@
+ # -- Import libraries
+ from langchain.prompts import PromptTemplate
+ from PIL import Image
+ from streamlit.logger import get_logger
+ from streamlit_player import st_player
+ import pandas as pd
+ import streamlit as st
+ import urllib.request
+ import argparse
+ import together
+ import logging
+ import requests
+ import utils
+ import spacy
+ import time
+ import os
+ import re
+
+ def main():
+     st.set_page_config(layout="wide")
+
+     # -- 1. Setup arguments
+     parser = argparse.ArgumentParser()
+     parser.add_argument('--DEFAULT_SYSTEM_PROMPT_LINK', type=str, default="https://raw.githubusercontent.com/AlbertoUAH/Castena/main/prompts/default_system_prompt.txt", help='Valor para DEFAULT_SYSTEM_PROMPT_LINK')
+     parser.add_argument('--PODCAST_URL_VIDEO_PATH', type=str, default="https://raw.githubusercontent.com/AlbertoUAH/Castena/main/data/podcast_youtube_video.csv", help='Valor para PODCAST_URL_VIDEO_PATH')
+     parser.add_argument('--TRANSCRIPTION', type=str, default='worldcast_roberto_vaquero', help='Name of the transcription')
+     parser.add_argument('--MODEL', type=str, default='togethercomputer/llama-2-13b-chat', help='Model name')
+     parser.add_argument('--EMB_MODEL', type=str, default='sentence-transformers/paraphrase-multilingual-mpnet-base-v2', help='Embedding model name')
+     os.system("python -m spacy download es_core_news_lg")
+
+     # -- 2. Setup env and logger
+     os.environ["TOGETHER_API_KEY"] = "6101599d6e33e3bda336b8d007ca22e35a64c72cfd52c2d8197f663389fc50c5"
+     logger = get_logger(__name__)
+
+     # -- 3. Setup constants
+     B_INST, E_INST = "[INST]", "[/INST]"
+     B_SYS, E_SYS = "<<SYS>>\n", "\n<</SYS>>\n\n"
+     args = parser.parse_args()
+     PODCAST_URL_VIDEO_PATH = args.PODCAST_URL_VIDEO_PATH
+     DEFAULT_SYSTEM_PROMPT_LINK = args.DEFAULT_SYSTEM_PROMPT_LINK
+     TRANSCRIPTION = args.TRANSCRIPTION
+     TRANSCRIPTION_PATH = '{}_transcription.txt'.format(TRANSCRIPTION)
+     MODEL = args.MODEL
+     EMB_MODEL = args.EMB_MODEL
+     SOCIAL_ICONS = {
+         "LinkedIn": ["https://www.linkedin.com/in/alberto-fernandez-hernandez-3a3474136/", "https://icon.signature.email/social/linkedin-circle-medium-0077b5-FFFFFF.png"],
+         "GitHub": ["https://github.com/AlbertoUAH", "https://icon.signature.email/social/github-circle-medium-24292e-FFFFFF.png"]
+     }
+     social_icons_html = [f"<a href='{SOCIAL_ICONS[platform][0]}' target='_blank' style='margin-right: 10px;'><img class='social-icon' src='{SOCIAL_ICONS[platform][1]}' alt='{platform}'></a>" for platform in SOCIAL_ICONS]
+
+     together.api_key = os.environ["TOGETHER_API_KEY"]
+     together.Models.start(MODEL)
+     podcast_url_video_df = pd.read_csv(PODCAST_URL_VIDEO_PATH, sep=';')
+
+     r = requests.get("https://raw.githubusercontent.com/AlbertoUAH/Castena/main/media/castena-animated-icon.gif", stream=True)
+     icon = Image.open(r.raw)
+     icon = icon.resize((100, 100))
+     st.sidebar.image(icon)
+     video_option = st.sidebar.selectbox(
+         "Seleccione el podcast",
+         list(podcast_url_video_df['podcast_name_lit'].apply(lambda x: x.replace("'", "")))
+     )
+     video_option_joined = '_'.join(video_option.replace(': Entrevista a ', ' ').lower().split(' ')).replace("\'", "")
+     video_option_joined_path = "{}_transcription.txt".format(video_option_joined)
+     print("Filtering: {}".format(video_option_joined))
+     youtube_video_url = list(podcast_url_video_df[podcast_url_video_df['podcast_name'].str.contains(video_option_joined)]['youtube_video_url'])[0].replace("\'", "")
+
+     # -- 4. Setup request for system prompt
+     f = urllib.request.urlopen(DEFAULT_SYSTEM_PROMPT_LINK)
+     DEFAULT_SYSTEM_PROMPT = str(f.read(), 'UTF-8')
+
+     # -- 5. Setup app
+     translator, nlp, retriever = utils.setup_app(video_option_joined_path, EMB_MODEL, MODEL, logger)
+
+
+     # -- 6. Setup prompt template + llm chain
+     instruction = """CONTEXTO:\n\n {context}\n
+
+     PREGUNTA: {question}
+
+     RESPUESTA: """
+     prompt_template = utils.get_prompt(instruction, DEFAULT_SYSTEM_PROMPT, B_SYS, E_SYS, B_INST, E_INST, logger)
+
+     llama_prompt = PromptTemplate(
+         template=prompt_template, input_variables=["context", "question"]
+     )
+     chain_type_kwargs = {"prompt": llama_prompt}
+
+     qa_chain = utils.create_llm_chain(MODEL, retriever, chain_type_kwargs, logger, video_option_joined_path)
+
+     # ---------------------------------------------------------------------
+     # -- 7. Setup Streamlit app
+     st.title("[Podcast: {}]({})".format(video_option.replace("'", "").title(), youtube_video_url))
+
+     width = 50
+     side = (100 - width) / 2
+     _, container, _ = st.columns([side, width, side])
+     with container:
+         st_player(utils.typewrite(youtube_video_url))
+
+     if "messages" not in st.session_state:
+         st.session_state.messages = []
+     for message in st.session_state.messages:
+         with st.chat_message(message["role"]):
+             st.markdown(message["content"])
+     if prompt := st.chat_input("¡Pregunta lo que quieras!"):
+         st.session_state.messages.append({"role": "user", "content": prompt})
+         with st.chat_message("user"):
+             st.markdown(prompt)
+
+         with st.chat_message("assistant"):
+             llm_response = qa_chain(prompt)
+             llm_response = utils.process_llm_response(llm_response, nlp)
+             st.markdown(llm_response)
+             start_time_str_list = []; start_time_seconds_list = []; end_time_seconds_list = []
+             for response in llm_response.split('\n'):
+                 if re.search(r'(\d{2}:\d{2}:\d{2}(\.\d{6})?)', response) is not None:
+                     start_time_str, start_time_seconds, _, end_time_seconds = utils.add_hyperlink_and_convert_to_seconds(response)
+                     start_time_str_list.append(start_time_str)
+                     start_time_seconds_list.append(start_time_seconds)
+                     end_time_seconds_list.append(end_time_seconds)
+
+             if start_time_str_list:
+                 width = 40
+                 side = (100 - width) / 2
+                 for start_time_seconds, start_time_str, end_time_seconds in zip(start_time_seconds_list, start_time_str_list, end_time_seconds_list):
+                     st.markdown("__Fragmento: " + start_time_str + "__")
+                     _, container, _ = st.columns([side, width, side])
+                     with container:
+                         st_player(youtube_video_url.replace("?enablejsapi=1", "") + f'?start={start_time_seconds}&end={end_time_seconds}')
+
+         st.session_state.messages.append({"role": "assistant", "content": llm_response})
+ # -- Sample: streamlit run app.py -- --DEFAULT_SYSTEM_PROMPT_LINK=https://raw.githubusercontent.com/AlbertoUAH/Castena/main/prompts/default_system_prompt.txt --PODCAST_URL_VIDEO_PATH=https://raw.githubusercontent.com/AlbertoUAH/Castena/main/data/podcast_youtube_video.csv --TRANSCRIPTION=worldcast_roberto_vaquero --MODEL=togethercomputer/llama-2-7b-chat --EMB_MODEL=BAAI/bge-base-en-v1.5
+ if __name__ == '__main__':
+     main()
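
The commit passes the Llama-2 chat delimiters (B_INST, E_INST, B_SYS, E_SYS) together with the downloaded system prompt to utils.get_prompt, which is not part of this diff. Below is a minimal, illustrative sketch of how such a helper typically assembles the final template, assuming the standard Llama-2 chat layout; the get_prompt shown here and the placeholder system prompt are assumptions for illustration, not the repository's implementation.

# Illustrative sketch only: assumes the standard Llama-2 chat layout
# ([INST] <<SYS>> system <</SYS>> instruction [/INST]); the real
# utils.get_prompt in this repository may differ in signature and behaviour.
B_INST, E_INST = "[INST]", "[/INST]"
B_SYS, E_SYS = "<<SYS>>\n", "\n<</SYS>>\n\n"

def get_prompt(instruction: str, system_prompt: str) -> str:
    # Wrap the system prompt in <<SYS>> ... <</SYS>> and the whole turn in
    # [INST] ... [/INST], leaving the LangChain {context} and {question}
    # placeholders untouched so PromptTemplate can fill them later.
    return B_INST + " " + B_SYS + system_prompt + E_SYS + instruction + " " + E_INST

if __name__ == "__main__":
    template = get_prompt(
        "CONTEXTO:\n\n {context}\n\nPREGUNTA: {question}\n\nRESPUESTA: ",
        "(placeholder system prompt; the real one is fetched from DEFAULT_SYSTEM_PROMPT_LINK)",
    )
    print(template)  # this string is what would be handed to langchain.prompts.PromptTemplate

The resulting string keeps {context} and {question} as template variables, matching the input_variables declared for PromptTemplate in the diff above.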