Spaces:
Runtime error
Runtime error
Commit
·
04558b7
1
Parent(s):
6996634
Upload application.py
Browse files- application.py +135 -0
application.py
ADDED
@@ -0,0 +1,135 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
1 |
+
# -- Import libraries
|
2 |
+
from langchain.prompts import PromptTemplate
|
3 |
+
from PIL import Image
|
4 |
+
from streamlit.logger import get_logger
|
5 |
+
from streamlit_player import st_player
|
6 |
+
import pandas as pd
|
7 |
+
import streamlit as st
|
8 |
+
import urllib.request
|
9 |
+
import argparse
|
10 |
+
import together
|
11 |
+
import logging
|
12 |
+
import requests
|
13 |
+
import utils
|
14 |
+
import spacy
|
15 |
+
import time
|
16 |
+
import os
|
17 |
+
import re
|
18 |
+
|
19 |
+
def main():
    """Streamlit app: chat with a podcast transcription through a Llama-2 chain.

    Launched as `streamlit run application.py -- --MODEL=... --TRANSCRIPTION=...`
    (arguments after the lone `--` reach argparse). Boots the Together-hosted
    model, renders the selected podcast video and answers user questions
    grounded in that podcast's transcription, linking each cited timestamp to
    an embedded video fragment.
    """
    st.set_page_config(layout="wide")

    # -- 1. Command-line arguments (defaults point at the public Castena repo)
    parser = argparse.ArgumentParser()
    parser.add_argument('--DEFAULT_SYSTEM_PROMPT_LINK', type=str, default="https://raw.githubusercontent.com/AlbertoUAH/Castena/main/prompts/default_system_prompt.txt", help='Valor para DEFAULT_SYSTEM_PROMPT_LINK')
    parser.add_argument('--PODCAST_URL_VIDEO_PATH', type=str, default="https://raw.githubusercontent.com/AlbertoUAH/Castena/main/data/podcast_youtube_video.csv", help='Valor para PODCAST_URL_VIDEO_PATH')
    parser.add_argument('--TRANSCRIPTION', type=str, default='worldcast_roberto_vaquero', help='Name of the trascription')
    parser.add_argument('--MODEL', type=str, default='togethercomputer/llama-2-13b-chat', help='Model name')
    parser.add_argument('--EMB_MODEL', type=str, default='sentence-transformers/paraphrase-multilingual-mpnet-base-v2', help='Embedding model name')
    # Download the Spanish spaCy model only when missing: shelling out on
    # every Streamlit rerun is slow and re-downloads needlessly.
    if not spacy.util.is_package("es_core_news_lg"):
        os.system("python -m spacy download es_core_news_lg")

    # -- 2. Environment and logger
    # SECURITY FIX: the Together API key was previously hard-coded here and
    # committed to source control (a leaked credential). It must be supplied
    # via the environment (or Streamlit secrets) instead.
    if "TOGETHER_API_KEY" not in os.environ:
        st.error("TOGETHER_API_KEY environment variable is not set.")
        st.stop()
    logger = get_logger(__name__)

    # -- 3. Constants (Llama-2 instruction / system delimiters)
    B_INST, E_INST = "[INST]", "[/INST]"
    B_SYS, E_SYS = "<<SYS>>\n", "\n<</SYS>>\n\n"
    args = parser.parse_args()
    PODCAST_URL_VIDEO_PATH = args.PODCAST_URL_VIDEO_PATH
    DEFAULT_SYSTEM_PROMPT_LINK = args.DEFAULT_SYSTEM_PROMPT_LINK
    TRANSCRIPTION = args.TRANSCRIPTION
    # NOTE(review): TRANSCRIPTION_PATH is currently unused below — the path
    # actually used is derived from the sidebar selection. Kept for CLI compat.
    TRANSCRIPTION_PATH = '{}_transcription.txt'.format(TRANSCRIPTION)
    MODEL = args.MODEL
    EMB_MODEL = args.EMB_MODEL
    SOCIAL_ICONS = {
        "LinkedIn": ["https://www.linkedin.com/in/alberto-fernandez-hernandez-3a3474136/", "https://icon.signature.email/social/linkedin-circle-medium-0077b5-FFFFFF.png", ],
        "GitHub": ["https://github.com/AlbertoUAH", "https://icon.signature.email/social/github-circle-medium-24292e-FFFFFF.png"]
    }
    # FIX: removed stray doubled quotes ('' after src=... and alt=...) that
    # produced invalid HTML attributes. NOTE(review): this list is built but
    # never rendered in this function.
    social_icons_html = [f"<a href='{SOCIAL_ICONS[platform][0]}' target='_blank' style='margin-right: 10px;'><img class='social-icon' src='{SOCIAL_ICONS[platform][1]}' alt='{platform}'></a>" for platform in SOCIAL_ICONS]

    together.api_key = os.environ["TOGETHER_API_KEY"]
    together.Models.start(MODEL)
    podcast_url_video_df = pd.read_csv(PODCAST_URL_VIDEO_PATH, sep=';')

    # -- Sidebar: animated icon + podcast selector
    r = requests.get("https://raw.githubusercontent.com/AlbertoUAH/Castena/main/media/castena-animated-icon.gif", stream=True)
    icon = Image.open(r.raw)
    icon = icon.resize((100, 100))
    st.sidebar.image(icon)
    video_option = st.sidebar.selectbox(
        "Seleccione el podcast",
        list(podcast_url_video_df['podcast_name_lit'].apply(lambda x: x.replace("'", "")))
    )
    # Normalize the display name into the snake_case key used by the CSV and
    # the transcription file name.
    video_option_joined = '_'.join(video_option.replace(': Entrevista a ', ' ').lower().split(' ')).replace("\'", "")
    video_option_joined_path = "{}_transcription.txt".format(video_option_joined)
    logger.info("Filtering: %s", video_option_joined)
    youtube_video_url = list(podcast_url_video_df[podcast_url_video_df['podcast_name'].str.contains(video_option_joined)]['youtube_video_url'])[0].replace("\'", "")

    # -- 4. Default system prompt fetched from the repo.
    # FIX: close the urlopen handle (was leaked) via a context manager.
    with urllib.request.urlopen(DEFAULT_SYSTEM_PROMPT_LINK) as f:
        DEFAULT_SYSTEM_PROMPT = f.read().decode('utf-8')

    # -- 5. App setup (translator, spaCy pipeline, vector-store retriever)
    translator, nlp, retriever = utils.setup_app(video_option_joined_path, EMB_MODEL, MODEL, logger)

    # -- 6. Prompt template + LLM chain
    # FIX: the template previously contained literal "/n" — a typo for "\n".
    instruction = (
        "CONTEXTO:\n\n {context}\n\n"
        "PREGUNTA: {question}\n\n"
        "RESPUESTA: "
    )
    prompt_template = utils.get_prompt(instruction, DEFAULT_SYSTEM_PROMPT, B_SYS, E_SYS, B_INST, E_INST, logger)

    llama_prompt = PromptTemplate(
        template=prompt_template, input_variables=["context", "question"]
    )
    chain_type_kwargs = {"prompt": llama_prompt}

    qa_chain = utils.create_llm_chain(MODEL, retriever, chain_type_kwargs, logger, video_option_joined_path)

    # ---------------------------------------------------------------------
    # -- 7. Streamlit chat UI
    st.title("[Podcast: {}]({})".format(video_option.replace("'", "").title(), youtube_video_url))

    width = 50
    side = (100 - width) / 2
    _, container, _ = st.columns([side, width, side])
    with container:
        st_player(utils.typewrite(youtube_video_url))

    if "messages" not in st.session_state:
        st.session_state.messages = []
    # Replay the conversation so far (Streamlit reruns the script per event).
    for message in st.session_state.messages:
        with st.chat_message(message["role"]):
            st.markdown(message["content"])
    if prompt := st.chat_input("¡Pregunta lo que quieras!"):
        st.session_state.messages.append({"role": "user", "content": prompt})
        with st.chat_message("user"):
            st.markdown(prompt)

        with st.chat_message("assistant"):
            llm_response = qa_chain(prompt)
            llm_response = utils.process_llm_response(llm_response, nlp)
            st.markdown(llm_response)
            # Collect every HH:MM:SS[.ffffff] timestamp the answer cites so
            # the matching video fragments can be embedded below.
            start_time_str_list = []
            start_time_seconds_list = []
            end_time_seconds_list = []
            for response in llm_response.split('\n'):
                # FIX: `is not None` instead of `!= None`; escaped the dot
                # before the fractional-seconds group (was an any-char match).
                if re.search(r'(\d{2}:\d{2}:\d{2}(\.\d{6})?)', response) is not None:
                    start_time_str, start_time_seconds, _, end_time_seconds = utils.add_hyperlink_and_convert_to_seconds(response)
                    start_time_str_list.append(start_time_str)
                    start_time_seconds_list.append(start_time_seconds)
                    end_time_seconds_list.append(end_time_seconds)

            if start_time_str_list:
                width = 40
                side = (100 - width) / 2
                for start_time_seconds, start_time_str, end_time_seconds in zip(start_time_seconds_list, start_time_str_list, end_time_seconds_list):
                    st.markdown("__Fragmento: " + start_time_str + "__")
                    _, container, _ = st.columns([side, width, side])
                    with container:
                        st_player(youtube_video_url.replace("?enablejsapi=1", "") + f'?start={start_time_seconds}&end={end_time_seconds}')

        st.session_state.messages.append({"role": "assistant", "content": llm_response})
133 |
+
# -- Sample: streamlit run app.py -- --DEFAULT_SYSTEM_PROMPT_LINK=https://raw.githubusercontent.com/AlbertoUAH/Castena/main/prompts/default_system_prompt.txt --PODCAST_URL_VIDEO_PATH=https://raw.githubusercontent.com/AlbertoUAH/Castena/main/data/podcast_youtube_video.csv --TRANSCRIPTION=worldcast_roberto_vaquero --MODEL=togethercomputer/llama-2-7b-chat --EMB_MODEL=BAAI/bge-base-en-v1.5
|
# -- Sample: streamlit run app.py -- --DEFAULT_SYSTEM_PROMPT_LINK=https://raw.githubusercontent.com/AlbertoUAH/Castena/main/prompts/default_system_prompt.txt --PODCAST_URL_VIDEO_PATH=https://raw.githubusercontent.com/AlbertoUAH/Castena/main/data/podcast_youtube_video.csv --TRANSCRIPTION=worldcast_roberto_vaquero --MODEL=togethercomputer/llama-2-7b-chat --EMB_MODEL=BAAI/bge-base-en-v1.5
# Script entry point: run the Streamlit app only when executed directly,
# not when this module is imported.
if __name__ == "__main__":
    main()