hienbm committed (verified)
Commit 902c2a5 · 1 Parent(s): c29c7e6

Upload app.py

Files changed (1)
app.py  +1  -1
app.py CHANGED
@@ -1 +1 @@
- {"metadata":{"kernelspec":{"language":"python","display_name":"Python 3","name":"python3"},"language_info":{"name":"python","version":"3.10.13","mimetype":"text/x-python","codemirror_mode":{"name":"ipython","version":3},"pygments_lexer":"ipython3","nbconvert_exporter":"python","file_extension":".py"},"kaggle":{"accelerator":"nvidiaTeslaT4","dataSources":[],"dockerImageVersionId":30733,"isInternetEnabled":true,"language":"python","sourceType":"notebook","isGpuEnabled":true}},"nbformat_minor":4,"nbformat":4,"cells":[{"cell_type":"code","source":"%%time\n%%capture\n!pip install -U langchain langchain_community langchain_huggingface langchain_experimental langchain-google-genai pillow langchain-google-vertexai\n!pip install -U sentence-transformers\n!pip install -U faiss-cpu\n!pip install -U bs4\n!pip install -U replicate\n!pip install -U docarray\n!pip install git+https://github.com/huggingface/accelerate.git\n!pip install git+https://github.com/huggingface/transformers.git\n!pip install -U bitsandbytes\n!pip install -U huggingface_hub\n!pip install -U chromadb\n!pip install -U youtube-transcript-api\n!pip install -U streamlit \n\nimport IPython\nIPython.Application.instance().kernel.do_shutdown(True)","metadata":{"_uuid":"8f2839f25d086af736a60e9eeb907d3b93b6e0e5","_cell_guid":"b1076dfc-b9ad-4769-8c92-a6c4dae69d19","trusted":true},"execution_count":null,"outputs":[]},{"cell_type":"code","source":"import torch\nimport bitsandbytes as bnb\nimport transformers\nimport bs4\nimport pandas as pd\nimport re\nimport streamlit as st\nimport pandas as pd\nimport os\n\nfrom langchain_core.messages import AIMessage, HumanMessage\nfrom langchain_core.output_parsers import StrOutputParser\nfrom IPython.display import clear_output\nfrom langchain.schema.runnable import RunnablePassthrough\nfrom langchain.text_splitter import RecursiveCharacterTextSplitter\nfrom langchain_community.document_loaders import YoutubeLoader\nfrom langchain_community.document_loaders import WebBaseLoader, DataFrameLoader, CSVLoader\nfrom langchain_community.vectorstores.utils import filter_complex_metadata\nfrom langchain_community.embeddings import HuggingFaceEmbeddings\nfrom langchain_community.vectorstores import FAISS\nfrom langchain.chains import RetrievalQA\nfrom langchain.llms import HuggingFacePipeline\nfrom langchain.prompts import PromptTemplate, SystemMessagePromptTemplate, HumanMessagePromptTemplate, ChatPromptTemplate\n\nfrom IPython.display import display, Markdown, clear_output\nfrom transformers import AutoModelForCausalLM, AutoTokenizer, BitsAndBytesConfig\n\n\nfrom huggingface_hub import login\nlogin(token=API_TOKEN)\n\nos.environ[\"PYTORCH_CUDA_ALLOC_CONF\"] = \"max_split_size_mb:15000\"\n\nmodel_id = \"google/gemma-2-9b-it\"\nquantization_config = BitsAndBytesConfig(load_in_4bit=True)\n\ntokenizer = AutoTokenizer.from_pretrained(\n model_id, \n return_tensors=\"pt\",\n padding=True, \n truncation=True,\n trust_remote_code=True,\n)\ntokenizer.pad_token = tokenizer.eos_token\ntokenizer.padding_side = \"right\"\n\nmodel = AutoModelForCausalLM.from_pretrained(\n model_id,\n quantization_config=quantization_config,\n device_map=\"auto\",\n low_cpu_mem_usage=True,\n pad_token_id=0,\n)\nmodel.config.use_cache = False\n\n# Create a text generation pipeline with specific settings\npipe = transformers.pipeline(\n task=\"text-generation\",\n model=model,\n tokenizer=tokenizer,\n torch_dtype=torch.float16,\n device_map=\"auto\",\n temperature=0.0,\n top_p=0.9,\n num_return_sequences=1,\n 
eos_token_id=tokenizer.eos_token_id,\n max_length=4096,\n truncation=True,\n)\n\nchat_model = HuggingFacePipeline(pipeline=pipe)\n\ntemplate = \"\"\"\nYou are a genius trader with extensive knowledge of the financial and stock markets, capable of providing deep and insightful analysis of financial stocks with remarkable accuracy.\n\n**ALWAYS**\nSummarize and provide the main insights.\nBe as detailed as possible, but don't make up any information that’s not from the context.\nIf you don't know an answer, say you don't know.\nLet's think step by step.\n\nPlease ensure responses are informative, accurate, and tailored to the user's queries and preferences.\nUse natural language to engage users and provide readable content throughout your response.\n\nChat history:\n{chat_history}\n\nUser question:\n{user_question}\n\"\"\"\n\nprompt_template = ChatPromptTemplate.from_template(template)\n\ndef find_youtube_links(text):\n # Define the regular expression pattern for YouTube URLs\n youtube_regex = (r'(https?://(?:www\\.)?(?:youtube\\.com/watch\\?v=|youtu\\.be/)[^ \\n]+)')\n # Use re.findall() to find all matches in the text\n matches = re.findall(youtube_regex, text)\n return str(' '.join(matches))\n\n\n# Initialize session state\nif \"chat_history\" not in st.session_state:\n st.session_state.chat_history = [AIMessage(content=\"Hello, how can I help you?\")]\n \n \n# Display chat history\nfor message in st.session_state.chat_history:\n if isinstance(message, AIMessage):\n with st.chat_message(\"AI\"):\n st.write(message.content)\n elif isinstance(message, HumanMessage):\n with st.chat_message(\"Human\"):\n st.write(message.content)\n\n \n# User input\nuser_query = st.chat_input(\"Type your message here...\")\nif user_query is not None and user_query != \"\":\n st.session_state.chat_history.append(HumanMessage(content=user_query))\n\n with st.chat_message(\"Human\"):\n st.markdown(user_query)\n \n loader = YoutubeLoader.from_youtube_url(\n find_youtube_links(user_query),\n add_video_info=False,\n language=[\"en\", \"vi\"],\n translation=\"en\",\n )\n docs = loader.load()\n # Convert the loaded documents to a list of dictionaries\n data_list = [\n {\n \"source\": doc.metadata['source'],\n \"page_content\": doc.page_content\n }\n for doc in docs\n ]\n\n df = pd.DataFrame(data_list)\n loader = DataFrameLoader(df, page_content_column='page_content')\n content = loader.load()\n # reviews = filter_complex_metadata(reviews)\n\n # Split the document into chunks with a specified chunk size\n text_splitter = RecursiveCharacterTextSplitter(chunk_size=1500, chunk_overlap=150)\n all_splits = text_splitter.split_documents(content)\n \n # Initialize the embedding model\n embedding_model = HuggingFaceEmbeddings(model_name=\"sentence-transformers/all-MiniLM-L12-v2\")\n \n # Store the document into a vector store with a specific embedding model\n vectorstore = FAISS.from_documents(all_splits, embedding_model)\n reviews_retriever = vectorstore.as_retriever()\n \n # Function to get a response from the model\n def get_response(user_query, chat_history):\n chain = prompt_template | chat_model | StrOutputParser()\n response = chain.invoke({\n \"user_question\": user_query,\n \"chat_history\": chat_history,\n })\n return response\n \n response = get_response(reviews_retriever, st.session_state.chat_history)\n \n with st.chat_message(\"AI\"):\n st.write(response)\n\n 
st.session_state.chat_history.append(AIMessage(content=response))","metadata":{"execution":{"iopub.status.busy":"2024-07-05T12:18:13.933873Z","iopub.execute_input":"2024-07-05T12:18:13.934280Z","iopub.status.idle":"2024-07-05T12:19:42.733200Z","shell.execute_reply.started":"2024-07-05T12:18:13.934251Z","shell.execute_reply":"2024-07-05T12:19:42.732422Z"},"trusted":true},"execution_count":6,"outputs":[{"name":"stdout","text":"The token has not been saved to the git credentials helper. Pass `add_to_git_credential=True` in this function directly or `--add-to-git-credential` if using via `huggingface-cli` if you want to set the git credential as well.\nToken is valid (permission: write).\nYour token has been saved to /root/.cache/huggingface/token\nLogin successful\n","output_type":"stream"},{"output_type":"display_data","data":{"text/plain":"Loading checkpoint shards: 0%| | 0/4 [00:00<?, ?it/s]","application/vnd.jupyter.widget-view+json":{"version_major":2,"version_minor":0,"model_id":"c4d93f1a24df44f2b5b1d6f898ee86d5"}},"metadata":{}}]}]}
 
+ {"metadata":{"kernelspec":{"language":"python","display_name":"Python 3","name":"python3"},"language_info":{"name":"python","version":"3.10.13","mimetype":"text/x-python","codemirror_mode":{"name":"ipython","version":3},"pygments_lexer":"ipython3","nbconvert_exporter":"python","file_extension":".py"},"kaggle":{"accelerator":"nvidiaTeslaT4","dataSources":[],"dockerImageVersionId":30733,"isInternetEnabled":true,"language":"python","sourceType":"notebook","isGpuEnabled":true}},"nbformat_minor":4,"nbformat":4,"cells":[{"cell_type":"code","source":"import torch\nimport bitsandbytes as bnb\nimport transformers\nimport bs4\nimport pandas as pd\nimport re\nimport streamlit as st\nimport pandas as pd\nimport os\n\nfrom langchain_core.messages import AIMessage, HumanMessage\nfrom langchain_core.output_parsers import StrOutputParser\nfrom IPython.display import clear_output\nfrom langchain.schema.runnable import RunnablePassthrough\nfrom langchain.text_splitter import RecursiveCharacterTextSplitter\nfrom langchain_community.document_loaders import YoutubeLoader\nfrom langchain_community.document_loaders import WebBaseLoader, DataFrameLoader, CSVLoader\nfrom langchain_community.vectorstores.utils import filter_complex_metadata\nfrom langchain_community.embeddings import HuggingFaceEmbeddings\nfrom langchain_community.vectorstores import FAISS\nfrom langchain.chains import RetrievalQA\nfrom langchain.llms import HuggingFacePipeline\nfrom langchain.prompts import PromptTemplate, SystemMessagePromptTemplate, HumanMessagePromptTemplate, ChatPromptTemplate\n\nfrom IPython.display import display, Markdown, clear_output\nfrom transformers import AutoModelForCausalLM, AutoTokenizer, BitsAndBytesConfig\n\nfrom huggingface_hub import login\nlogin(token=API_TOKEN)\n\n# App config\nst.set_page_config(page_title=\"GOAHEAD.AI\",page_icon= \"🌍\")\nst.title(\"GOAHEAD.AI ✈️\")\n\nos.environ[\"PYTORCH_CUDA_ALLOC_CONF\"] = \"max_split_size_mb:15000\"\n\nmodel_id = \"google/gemma-2-9b-it\"\nquantization_config = BitsAndBytesConfig(load_in_4bit=True)\n\ntokenizer = AutoTokenizer.from_pretrained(\n model_id, \n return_tensors=\"pt\",\n padding=True, \n truncation=True,\n trust_remote_code=True,\n)\ntokenizer.pad_token = tokenizer.eos_token\ntokenizer.padding_side = \"right\"\n\nmodel = AutoModelForCausalLM.from_pretrained(\n model_id,\n quantization_config=quantization_config,\n device_map=\"auto\",\n low_cpu_mem_usage=True,\n pad_token_id=0,\n)\nmodel.config.use_cache = False\n\n# Create a text generation pipeline with specific settings\npipe = transformers.pipeline(\n task=\"text-generation\",\n model=model,\n tokenizer=tokenizer,\n torch_dtype=torch.float16,\n device_map=\"auto\",\n temperature=0.0,\n top_p=0.9,\n num_return_sequences=1,\n eos_token_id=tokenizer.eos_token_id,\n max_length=4096,\n truncation=True,\n)\n\nchat_model = HuggingFacePipeline(pipeline=pipe)\n\ntemplate = \"\"\"\nYou are a genius trader with extensive knowledge of the financial and stock markets, capable of providing deep and insightful analysis of financial stocks with remarkable accuracy.\n\n**ALWAYS**\nSummarize and provide the main insights.\nBe as detailed as possible, but don't make up any information that’s not from the context.\nIf you don't know an answer, say you don't know.\nLet's think step by step.\n\nPlease ensure responses are informative, accurate, and tailored to the user's queries and preferences.\nUse natural language to engage users and provide readable content throughout your response.\n\nChat 
history:\n{chat_history}\n\nUser question:\n{user_question}\n\"\"\"\n\nprompt_template = ChatPromptTemplate.from_template(template)\n\ndef find_youtube_links(text):\n # Define the regular expression pattern for YouTube URLs\n youtube_regex = (r'(https?://(?:www\\.)?(?:youtube\\.com/watch\\?v=|youtu\\.be/)[^ \\n]+)')\n # Use re.findall() to find all matches in the text\n matches = re.findall(youtube_regex, text)\n return str(' '.join(matches))\n\n\n# Initialize session state\nif \"chat_history\" not in st.session_state:\n st.session_state.chat_history = [AIMessage(content=\"Hello, how can I help you?\")]\n \n \n# Display chat history\nfor message in st.session_state.chat_history:\n if isinstance(message, AIMessage):\n with st.chat_message(\"AI\"):\n st.write(message.content)\n elif isinstance(message, HumanMessage):\n with st.chat_message(\"Human\"):\n st.write(message.content)\n\n \n# User input\nuser_query = st.chat_input(\"Type your message here...\")\nif user_query is not None and user_query != \"\":\n st.session_state.chat_history.append(HumanMessage(content=user_query))\n\n with st.chat_message(\"Human\"):\n st.markdown(user_query)\n \n loader = YoutubeLoader.from_youtube_url(\n find_youtube_links(user_query),\n add_video_info=False,\n language=[\"en\", \"vi\"],\n translation=\"en\",\n )\n docs = loader.load()\n # Convert the loaded documents to a list of dictionaries\n data_list = [\n {\n \"source\": doc.metadata['source'],\n \"page_content\": doc.page_content\n }\n for doc in docs\n ]\n\n df = pd.DataFrame(data_list)\n loader = DataFrameLoader(df, page_content_column='page_content')\n content = loader.load()\n # reviews = filter_complex_metadata(reviews)\n\n # Split the document into chunks with a specified chunk size\n text_splitter = RecursiveCharacterTextSplitter(chunk_size=1500, chunk_overlap=150)\n all_splits = text_splitter.split_documents(content)\n \n # Initialize the embedding model\n embedding_model = HuggingFaceEmbeddings(model_name=\"sentence-transformers/all-MiniLM-L12-v2\")\n \n # Store the document into a vector store with a specific embedding model\n vectorstore = FAISS.from_documents(all_splits, embedding_model)\n reviews_retriever = vectorstore.as_retriever()\n \n # Function to get a response from the model\n def get_response(user_query, chat_history):\n chain = prompt_template | chat_model | StrOutputParser()\n response = chain.invoke({\n \"user_question\": user_query,\n \"chat_history\": chat_history,\n })\n return response\n \n response = get_response(reviews_retriever, st.session_state.chat_history)\n \n with st.chat_message(\"AI\"):\n st.write(response)\n\n st.session_state.chat_history.append(AIMessage(content=response))","metadata":{"execution":{"iopub.status.busy":"2024-07-05T12:18:13.933873Z","iopub.execute_input":"2024-07-05T12:18:13.934280Z","iopub.status.idle":"2024-07-05T12:19:42.733200Z","shell.execute_reply.started":"2024-07-05T12:18:13.934251Z","shell.execute_reply":"2024-07-05T12:19:42.732422Z"},"trusted":true},"execution_count":6,"outputs":[{"name":"stdout","text":"The token has not been saved to the git credentials helper. 
Pass `add_to_git_credential=True` in this function directly or `--add-to-git-credential` if using via `huggingface-cli` if you want to set the git credential as well.\nToken is valid (permission: write).\nYour token has been saved to /root/.cache/huggingface/token\nLogin successful\n","output_type":"stream"},{"output_type":"display_data","data":{"text/plain":"Loading checkpoint shards: 0%| | 0/4 [00:00<?, ?it/s]","application/vnd.jupyter.widget-view+json":{"version_major":2,"version_minor":0,"model_id":"c4d93f1a24df44f2b5b1d6f898ee86d5"}},"metadata":{}}]}]}
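The net change in this revision: the first notebook cell (the `%%time`/`%%capture` pip-install cell that ended by restarting the kernel via `IPython ... do_shutdown(True)`) is dropped, and the Streamlit page setup is added (`st.set_page_config(page_title="GOAHEAD.AI", page_icon="🌍")` and `st.title("GOAHEAD.AI ✈️")`). One issue survives in both revisions: `login(token=API_TOKEN)` references `API_TOKEN`, which is never defined anywhere in the file, so the script raises a `NameError` before the model even loads (`pandas` is also imported twice, which is harmless). A minimal sketch of one way to supply the token, assuming it lives in an environment variable; the variable name `HF_TOKEN` is an assumption, not something the commit specifies:

```python
import os
from huggingface_hub import login

# Assumption: the Hugging Face token is provided via an environment
# variable. "HF_TOKEN" is a hypothetical name, not from the commit.
API_TOKEN = os.environ["HF_TOKEN"]
login(token=API_TOKEN)
```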
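Both revisions load Gemma the same way: 4-bit weights through bitsandbytes, with layers placed on devices automatically. Pulled out of the notebook JSON for readability, the loading step amounts to the following (the same calls the commit makes, trimmed to the essentials):

```python
import transformers
from transformers import AutoModelForCausalLM, AutoTokenizer, BitsAndBytesConfig

model_id = "google/gemma-2-9b-it"

# Quantize weights to 4-bit at load time via bitsandbytes.
quantization_config = BitsAndBytesConfig(load_in_4bit=True)

tokenizer = AutoTokenizer.from_pretrained(model_id)
tokenizer.pad_token = tokenizer.eos_token  # mirror the commit's pad-token setup

model = AutoModelForCausalLM.from_pretrained(
    model_id,
    quantization_config=quantization_config,
    device_map="auto",        # place layers on available devices automatically
    low_cpu_mem_usage=True,
)

pipe = transformers.pipeline(
    task="text-generation",
    model=model,
    tokenizer=tokenizer,
    max_length=4096,
    truncation=True,
)
```

One detail worth flagging: the commit passes `temperature=0.0` to the pipeline without setting `do_sample=True`, so decoding stays greedy and the temperature is effectively ignored (recent transformers releases warn about exactly this combination).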
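The retrieval index is rebuilt on every user message: the YouTube transcript is fetched, split into 1,500-character chunks with 150 characters of overlap, embedded with all-MiniLM-L12-v2, and stored in FAISS. (The commit routes the loaded documents through a pandas DataFrame and a `DataFrameLoader` on the way, which appears to be a no-op round-trip and is omitted below.) A condensed sketch of that step; the placeholder URL is hypothetical:

```python
from langchain_community.document_loaders import YoutubeLoader
from langchain.text_splitter import RecursiveCharacterTextSplitter
from langchain_community.embeddings import HuggingFaceEmbeddings
from langchain_community.vectorstores import FAISS

url = "https://www.youtube.com/watch?v=..."  # hypothetical placeholder

loader = YoutubeLoader.from_youtube_url(
    url,
    add_video_info=False,
    language=["en", "vi"],  # accept English or Vietnamese transcripts
    translation="en",       # translate to English when needed
)
docs = loader.load()

# 1500-character chunks with 150 characters of overlap, as in the commit.
splitter = RecursiveCharacterTextSplitter(chunk_size=1500, chunk_overlap=150)
all_splits = splitter.split_documents(docs)

embedding_model = HuggingFaceEmbeddings(
    model_name="sentence-transformers/all-MiniLM-L12-v2"
)
vectorstore = FAISS.from_documents(all_splits, embedding_model)
reviews_retriever = vectorstore.as_retriever()
```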
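The one functional bug carried over unchanged: the chain is invoked as `get_response(reviews_retriever, st.session_state.chat_history)`, which passes the retriever object itself as `user_question`, and the prompt template has no `{context}` variable, so the FAISS index that was just built never reaches the model. A sketch of how the retriever could be wired in; the `{context}` slot, the `format_docs` helper, and this revised template are assumptions on top of the commit, and the sketch reuses the app's existing `chat_model`, `reviews_retriever`, and `user_query`:

```python
from langchain_core.output_parsers import StrOutputParser
from langchain.prompts import ChatPromptTemplate

# Hypothetical variant of the commit's template with a {context} slot added.
template = """You are a genius trader with extensive knowledge of the
financial and stock markets. Answer only from the context; if you don't
know an answer, say you don't know.

Context from the video transcript:
{context}

Chat history:
{chat_history}

User question:
{user_question}
"""
prompt_template = ChatPromptTemplate.from_template(template)

def format_docs(docs):
    # Hypothetical helper: join retrieved chunks into a single string.
    return "\n\n".join(doc.page_content for doc in docs)

def get_response(user_query, chat_history):
    # Retrieve chunks relevant to the actual question, then invoke the
    # chain with the question itself rather than the retriever object.
    docs = reviews_retriever.get_relevant_documents(user_query)
    chain = prompt_template | chat_model | StrOutputParser()
    return chain.invoke({
        "context": format_docs(docs),
        "chat_history": chat_history,
        "user_question": user_query,
    })

response = get_response(user_query, st.session_state.chat_history)
```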