"""Streamlit entry point: chat with a podcast transcription and replay cited fragments."""
# -- Import libraries
import argparse
import logging
import os
import re
import subprocess
import sys
import time
import urllib.request

from langchain.prompts import PromptTemplate
from PIL import Image
from streamlit.logger import get_logger
from streamlit_player import st_player
import pandas as pd
import requests
import spacy
import streamlit as st
import together

import utils

# spaCy pipeline that is installed on demand before the app is set up.
SPACY_MODEL = "es_core_news_lg"
# Animated icon shown at the top of the sidebar.
ICON_URL = "https://raw.githubusercontent.com/AlbertoUAH/Castena/main/media/castena-animated-icon.gif"
# Upper bound (seconds) for the icon download so a stalled request cannot hang the page.
REQUEST_TIMEOUT_SECONDS = 30
# HH:MM:SS with an optional microsecond suffix; marks response lines that cite a fragment.
TIMESTAMP_PATTERN = re.compile(r'\d{2}:\d{2}:\d{2}(\.\d{6})?')


def _parse_args():
    """Build the command-line parser and return the parsed arguments."""
    parser = argparse.ArgumentParser()
    parser.add_argument('--DEFAULT_SYSTEM_PROMPT_LINK', type=str, default="https://raw.githubusercontent.com/AlbertoUAH/Castena/main/prompts/default_system_prompt.txt", help='Valor para DEFAULT_SYSTEM_PROMPT_LINK')
    parser.add_argument('--PODCAST_URL_VIDEO_PATH', type=str, default="https://raw.githubusercontent.com/AlbertoUAH/Castena/main/data/podcast_youtube_video.csv", help='Valor para PODCAST_URL_VIDEO_PATH')
    # Still accepted so existing launch commands keep working; main() does not read it.
    parser.add_argument('--TRANSCRIPTION', type=str, default='worldcast_roberto_vaquero', help='Name of the trascription')
    parser.add_argument('--MODEL', type=str, default='togethercomputer/llama-2-13b-chat', help='Model name')
    parser.add_argument('--EMB_MODEL', type=str, default='sentence-transformers/paraphrase-multilingual-mpnet-base-v2', help='Embedding model name')
    return parser.parse_args()


def _ensure_spacy_model(model_name, logger):
    """Download the spaCy model only when it is not already installed."""
    if spacy.util.is_package(model_name):
        return
    # sys.executable targets the interpreter running this app; the argument
    # list avoids going through a shell.
    result = subprocess.run(
        [sys.executable, "-m", "spacy", "download", model_name],
        check=False,
    )
    if result.returncode != 0:
        # Best effort, as before: report the failure and let the caller continue.
        logger.warning("spaCy model download failed for %s (exit code %s)", model_name, result.returncode)


def main():
    """Render the podcast question-answering page.

    Reads the Together API key from the ``TOGETHER_API_KEY`` environment
    variable, lets the user pick a podcast in the sidebar, builds the
    retrieval QA chain through ``utils`` and runs the chat loop. For every
    line of an answer that contains a timestamp, an extra video player
    limited to that fragment is shown.
    """
    # Must stay the first Streamlit call of the script.
    st.set_page_config(layout="wide")

    # -- 1. Setup arguments
    args = _parse_args()

    # -- 2. Setup env and logger
    logger = get_logger(__name__)
    _ensure_spacy_model(SPACY_MODEL, logger)

    # The key is taken from the environment instead of being committed to the
    # source. The key that used to be hard-coded here should be rotated.
    api_key = os.environ.get("TOGETHER_API_KEY")
    if not api_key:
        st.error("Falta la variable de entorno TOGETHER_API_KEY.")
        st.stop()

    # -- 3. Setup constants
    B_INST, E_INST = "[INST]", "[/INST]"
    # Llama-2 chat system-prompt delimiters. The previous values ("<>") looked
    # like these tags with their contents stripped.
    B_SYS, E_SYS = "<<SYS>>\n", "\n<</SYS>>\n\n"
    MODEL = args.MODEL
    EMB_MODEL = args.EMB_MODEL

    together.api_key = api_key
    together.Models.start(MODEL)

    podcast_url_video_df = pd.read_csv(args.PODCAST_URL_VIDEO_PATH, sep=';')

    icon_response = requests.get(ICON_URL, stream=True, timeout=REQUEST_TIMEOUT_SECONDS)
    # Fail on an HTTP error instead of handing an error page to PIL.
    icon_response.raise_for_status()
    icon = Image.open(icon_response.raw).resize((100, 100))
    st.sidebar.image(icon)

    video_option = st.sidebar.selectbox(
        "Seleccione el podcast",
        [name.replace("'", "") for name in podcast_url_video_df['podcast_name_lit']],
    )
    # Turn the display name into the identifier used for the transcription
    # file name and for the lookup in the 'podcast_name' column.
    video_option_joined = (
        video_option.replace(': Entrevista a ', ' ').lower().replace(' ', '_').replace("'", "")
    )
    video_option_joined_path = f"{video_option_joined}_transcription.txt"

    # Literal (non-regex) match so special characters in a name cannot change
    # the lookup; na=False keeps rows with a missing name out of the mask.
    name_matches = podcast_url_video_df['podcast_name'].str.contains(
        video_option_joined, regex=False, na=False
    )
    matching_urls = podcast_url_video_df.loc[name_matches, 'youtube_video_url']
    if matching_urls.empty:
        st.error(f"No se ha encontrado el vídeo del podcast '{video_option}'.")
        st.stop()
    youtube_video_url = matching_urls.iloc[0].replace("'", "")

    # -- 4. Setup request for system prompt
    with urllib.request.urlopen(args.DEFAULT_SYSTEM_PROMPT_LINK) as prompt_file:
        DEFAULT_SYSTEM_PROMPT = prompt_file.read().decode('UTF-8')

    # -- 5. Setup app
    translator, nlp, retriever = utils.setup_app(video_option_joined_path, EMB_MODEL, MODEL, logger)

    # -- 6. Setup prompt template + llm chain
    # NOTE(review): "/n" is kept exactly as found; it may have been meant as "\n" -- confirm.
    instruction = """CONTEXTO:/n/n {context}/n PREGUNTA: {question} RESPUESTA: """
    prompt_template = utils.get_prompt(instruction, DEFAULT_SYSTEM_PROMPT, B_SYS, E_SYS, B_INST, E_INST, logger)

    llama_prompt = PromptTemplate(
        template=prompt_template, input_variables=["context", "question"]
    )
    chain_type_kwargs = {"prompt": llama_prompt}

    qa_chain = utils.create_llm_chain(MODEL, retriever, chain_type_kwargs, logger, video_option_joined_path)

    # ---------------------------------------------------------------------
    # -- 7. Setup Streamlit app
    st.title("[Podcast: {}]({})".format(video_option.replace("'", "").title(), youtube_video_url))

    # Centre the main player in a column taking 50% of the page width.
    width = 50
    side = (100 - width) / 2
    _, container, _ = st.columns([side, width, side])
    with container:
        st_player(utils.typewrite(youtube_video_url))

    if "messages" not in st.session_state:
        st.session_state.messages = []

    # Replay the conversation stored so far in the session state.
    for message in st.session_state.messages:
        with st.chat_message(message["role"]):
            st.markdown(message["content"])

    if prompt := st.chat_input("¡Pregunta lo que quieras!"):
        with st.chat_message("user"):
            st.markdown(prompt)

        st.session_state.messages.append({"role": "user", "content": prompt})

        with st.chat_message("assistant"):
            llm_response = qa_chain(prompt)
            llm_response = utils.process_llm_response(llm_response, nlp)
            st.markdown(llm_response)

            # Collect (label, start, end) for every response line carrying a timestamp.
            fragments = []
            for response_line in llm_response.split('\n'):
                if TIMESTAMP_PATTERN.search(response_line) is not None:
                    start_time_str, start_time_seconds, _, end_time_seconds = utils.add_hyperlink_and_convert_to_seconds(response_line)
                    fragments.append((start_time_str, start_time_seconds, end_time_seconds))

            if fragments:
                width = 40
                side = (100 - width) / 2
                # Drop the JS-API query string before appending the start/end parameters.
                base_video_url = youtube_video_url.replace("?enablejsapi=1", "")
                for start_time_str, start_time_seconds, end_time_seconds in fragments:
                    st.markdown("__Fragmento: " + start_time_str + "__")
                    _, container, _ = st.columns([side, width, side])
                    with container:
                        st_player(base_video_url + f'?start={start_time_seconds}&end={end_time_seconds}')

        st.session_state.messages.append({"role": "assistant", "content": llm_response})


# -- Sample (export TOGETHER_API_KEY first): streamlit run app.py -- --DEFAULT_SYSTEM_PROMPT_LINK=https://raw.githubusercontent.com/AlbertoUAH/Castena/main/prompts/default_system_prompt.txt --PODCAST_URL_VIDEO_PATH=https://raw.githubusercontent.com/AlbertoUAH/Castena/main/data/podcast_youtube_video.csv --TRANSCRIPTION=worldcast_roberto_vaquero --MODEL=togethercomputer/llama-2-7b-chat --EMB_MODEL=BAAI/bge-base-en-v1.5
if __name__ == '__main__':
    main()