Spaces:

Effyis
/

Smart-Retrieval

Sleeping

App Files Files Community

Nechba committed on Jun 11, 2024

Commit

a4200f5

1 Parent(s): 534b4e8

fisrt commit1

Browse files

Files changed (9) hide show

app.py +163 -0
logo.png +0 -0
requirments.txt +13 -0
utlis/__init__.py +0 -0
utlis/__pycache__/__init__.cpython-39.pyc +0 -0
utlis/__pycache__/constant.cpython-39.pyc +0 -0
utlis/__pycache__/helper.cpython-39.pyc +0 -0
utlis/constant.py +25 -0
utlis/helper.py +216 -0

app.py ADDED Viewed

	@@ -0,0 +1,163 @@

+from utlis.helper import *
+initialize_session_state()
+with st.sidebar:
+    st.image("logo.png", width=170)
+    st.title("Smart Retrieval")
+    # Get List of models
+    llms = ['Gemini-Pro','Cohere','Mistral-7B-Instruct-v0.3','gemma-2b','Meta-Llama-3-8B-Instruct','Phi-3-mini-4k-instruct','zephyr-7b-beta']
+    st.session_state.llm = st.selectbox("Choose LLM",llms)
+    genre = st.radio(
+    "Choose option",
+    ["Select document(s)", "Add document(s)","Delete service(s)", "Delete document(s)"])
+    if genre=="Add document(s)":
+        st.title('Add Document(s)')
+        # Check service status
+        # Get all available services
+        add_new_service = st.checkbox("Add new service")
+        if add_new_service:
+            new_service = st.text_input("Enter service name")
+            # Get list of Embedding models
+            res_request= requests.get(EMBEDDING_MODELS_API)
+            embidding_models =json.loads(res_request.text)
+            embdding_model = st.selectbox("Choose Embidding model",embidding_models["Model_Names_paid"])
+            if  new_service and st.button('Add'):
+                add_service(st.session_state.token,new_service, embdding_model)
+        services =  requests.get(SERVICES_API+st.session_state.token)
+        services =json.loads(services.text)
+        if len(services)>0:
+           st.session_state.service = st.selectbox("Choose Service",services)
+        # Get list of Indexing methods
+        # indexing_method_list = ['FLAT','HSNW']
+        # st.session_state.indexing_method = st.selectbox("Choose Indexing method",indexing_method_list)
+        # Send Document to API
+        if st.session_state.service:
+            st.session_state.uploaded_files = st.file_uploader("Upload PDF files",  type=["pdf", "png", "jpg", "jpeg"], accept_multiple_files=True)
+            if st.session_state.uploaded_files:
+                st.session_state.process = st.button('Process')
+                if st.session_state.process:
+                    add_document(st.session_state.token,st.session_state.service)
+    elif genre=="Select document(s)":
+        st.title('Chat with Document(s)')
+        services =  requests.get(SERVICES_API+st.session_state.token)
+        services =json.loads(services.text)
+        if len(services)>0:
+            st.session_state.service_slected_to_chat = st.selectbox("Choose Service",services)
+            st.session_state.top_k = st.number_input("Top k ", min_value=1, value=5)
+            history_document = requests.get(DOCUMENT_API+f'/{st.session_state.token}/{st.session_state.service_slected_to_chat}')
+            history_document =json.loads(history_document.text).get("documents",[])
+            if len(history_document)>=2:
+                history_document.append("ALL")
+            # Get list of documents from histrory
+            if "ALL" in history_document:
+                st.session_state.doument_slected_to_chat = st.multiselect(
+                    "",history_document ,default="ALL"
+                    )
+            elif len(history_document)==1:
+                st.session_state.doument_slected_to_chat = st.multiselect(
+                    "",history_document,default=history_document[0]
+                    )
+            else:
+                st.session_state.doument_slected_to_chat = st.multiselect(
+                    "",history_document
+                    )
+            if "ALL" in st.session_state.doument_slected_to_chat:
+                st.session_state.doument_slected_to_chat = history_document
+                st.session_state.doument_slected_to_chat.remove("ALL")
+            st.write("You selected:", st.session_state.doument_slected_to_chat)
+    elif genre == "Delete service(s)":
+        st.title('Delete Service(s)')
+        services =  requests.get(SERVICES_API+st.session_state.token)
+        services =json.loads(services.text)
+        if len(services)>=2:
+            services.append("ALL")
+            # Get list of documents from histrory
+        if "ALL" in services:
+            service_slected = st.multiselect(
+                    "",services ,default="ALL"
+                    )
+        elif len(services)==1:
+            service_slected = st.multiselect(
+                    "",services,default=services[0]
+                    )
+        else:
+            service_slected = st.multiselect(
+                    "",services
+                    )
+        if "ALL" in service_slected:
+            service_slected = services
+            service_slected.remove("ALL")
+        st.write("You selected:", service_slected)
+        if len(service_slected) > 0:
+            st.session_state.delete = st.button('Delete')
+            if st.session_state.delete:
+                delete_service(st.session_state.token ,service_slected)
+    elif genre == "Delete document(s)":
+        st.title('Delete Document(s)')
+        services =  requests.get(SERVICES_API+st.session_state.token)
+        services =json.loads(services.text)
+        if len(services)>0:
+            service = st.selectbox("Choose Service",services)
+            history_document = requests.get(DOCUMENT_API+f'/{st.session_state.token}/{service}')
+            history_document =json.loads(history_document.text).get("documents",[])
+            if len(history_document)>=2:
+                history_document.append("ALL")
+            # Get list of documents from histrory
+            if "ALL" in history_document:
+                document_slected_to_delete = st.multiselect(
+                    "",history_document ,default="ALL"
+                    )
+            elif len(history_document)==1:
+                document_slected_to_delete = st.multiselect(
+                    "",history_document,default=history_document[0]
+                    )
+            else:
+                document_slected_to_delete = st.multiselect(
+                    "",history_document
+                    )
+            if "ALL" in document_slected_to_delete:
+                document_slected_to_delete = history_document
+                document_slected_to_delete.remove("ALL")
+            st.write("You selected:", document_slected_to_delete)
+            if len(document_slected_to_delete) > 0:
+                st.session_state.delete = st.button('Delete')
+                if st.session_state.delete:
+                    delete_document(st.session_state.token,st.session_state.service ,document_slected_to_delete)
+for msg in st.session_state.messages:
+    if msg["role"] == "user":
+        st.chat_message(msg["role"], avatar="🧑‍💻").write(msg["content"])
+    else:
+        st.chat_message(msg["role"], avatar="🤖").write(msg["content"])
+if prompt := st.chat_input():
+    st.session_state.messages.append({"role": "user", "content": prompt})
+    st.chat_message("user", avatar="🧑‍💻").write(prompt)
+    context = get_context(prompt,st.session_state.token,st.session_state.service_slected_to_chat,st.session_state.top_k)
+    template = " "
+    for i in range(0,len(context)):
+        template += f"Chunk{i}: "+context[i] + "\n"
+    print(template)
+    response=generate_response(st.session_state.llm, prompt, context = template)
+    #response = generate_response(st.session_state.llm,prompt, context)
+    st.session_state.messages.append({"role": "assistant", "content": response})
+    # with st.chat_message("assistant"):
+    #     message_placeholder = st.empty()
+    #     message_placeholder.markdown("Search...")
+    #     message_placeholder.markdown(response)
+    st.chat_message("assistant", avatar="🤖").write(response)

logo.png ADDED Viewed

requirments.txt ADDED Viewed

	@@ -0,0 +1,13 @@

+streamlit
+pdfplumber
+python-dotenv
+haystack-ai
+transformers
+accelerate
+bitsandbytes
+redis
+python-multipart
+sentence-transformers
+langchain
+semantic_text_splitter
+google-generativeai

utlis/__init__.py ADDED Viewed

File without changes

utlis/__pycache__/__init__.cpython-39.pyc ADDED Viewed

Binary file (140 Bytes). View file

utlis/__pycache__/constant.cpython-39.pyc ADDED Viewed

Binary file (1.05 kB). View file

utlis/__pycache__/helper.cpython-39.pyc ADDED Viewed

Binary file (5.37 kB). View file

utlis/constant.py ADDED Viewed

	@@ -0,0 +1,25 @@

+import os
+from dotenv import load_dotenv
+import redis
+REDIS_CONNECTION = redis.Redis(
+    host="redis-13875.c240.us-east-1-3.ec2.redns.redis-cloud.com",
+    port=13875,
+    password="CWkHBok23bakpa9lRif3nGSk6y0baVPu",
+    ssl=False  # Enable SSL for the connection
+)
+# IP_WEB_SERVER = "https://f564-196-65-150-53.ngrok-free.app"
+# IP_MODEL_SERVER = "https://fluffy-mole-81.telebit.io"
+IP_WEB_SERVER = "http://192.168.11.119:8000"
+IP_MODEL_SERVER = "http://192.168.11.119:8001"
+EMBEDDING_MODELS_API = IP_MODEL_SERVER+"/models_&_sizes"
+SERVICES_API = IP_WEB_SERVER+"/services/"
+ADD_SERVICES_API = IP_WEB_SERVER+"/add_services"
+CHUNK_STORE_API = IP_WEB_SERVER+"/chunk_and_store"
+SEARCH_API = IP_WEB_SERVER+"/search"
+DOCUMENT_API = IP_WEB_SERVER+"/documents"
+REMOVE_DOCUMENT_API = IP_WEB_SERVER+"/remove_documents"
+REMOVE_SERVICE_API = IP_WEB_SERVER+"/remove_service"
+CHAT_API = IP_MODEL_SERVER+"/chat"

utlis/helper.py ADDED Viewed

	@@ -0,0 +1,216 @@

+import json
+import logging
+import os
+from typing import List, Dict, Any
+
+import google.generativeai as genai
+import numpy as np
+import pdfplumber
+import redis
+import redis.commands.search
+import requests
+import streamlit as st
+from dotenv import load_dotenv
+from PIL import Image
+from redis.commands.search.field import TagField, VectorField, TextField
+from redis.commands.search.indexDefinition import IndexDefinition, IndexType
+from redis.commands.search.query import Query
+from semantic_text_splitter import TextSplitter
+from sentence_transformers import SentenceTransformer
+from tokenizers import Tokenizer
+
+from utlis.constant import *
+genai.configure(api_key="AIzaSyAhz9UBzkEIYI886zZRm40qqB1Kd_9Y4-0")
+def initialize_session_state():
+    if "token" not in st.session_state:
+        st.session_state["token"] ="abcd"
+    if "service" not in st.session_state:
+        st.session_state["service"] = None
+    if "use_document" not in st.session_state:
+        st.session_state.use_document = False
+    if "flag" not in st.session_state:
+        st.session_state.flag = False
+    if "embdding_model" not in st.session_state:
+        st.session_state["embdding_model"] = None
+    if "indexing_method" not in st.session_state:
+        st.session_state["indexing_method"] = None
+    if "uploaded_files" not in st.session_state:
+        st.session_state["uploaded_files"] = None
+    if "messages" not in st.session_state:
+        st.session_state["messages"] = [{"role": "assistant", "content": "How can I help you?"}]
+def extract_text_from_pdf(pdf_path):
+    text=""
+    with pdfplumber.open(pdf_path) as pdf:
+        for page_number, page in enumerate(pdf.pages, start=1):
+            # Try to extract the text
+            text+= page.extract_text(x_tolerance=2, y_tolerance=4, layout=True, x_density=5, y_density=10)
+    return text
+def delete_service(token,service_slected_to_delete):
+    for srevice_name in service_slected_to_delete:
+        url = REMOVE_SERVICE_API
+        # JSON payload to be sent
+        data = {
+            "token": token,
+            "servicename": srevice_name
+            }
+        json_data = json.dumps(data)
+        # Set the headers to specify that the content type is JSON
+        headers = {'Content-Type': 'application/json'}
+        # Send the POST request
+        response = requests.post(url, data=json_data, headers=headers)
+        if json.loads( response.text).get("success")==True:
+            st.success(f"{srevice_name} deleted successfully")
+        else:
+            st.error(f"{srevice_name} not deleted successfully")
+def delete_document(token, service,document_slected_to_delete):
+    for document_name in document_slected_to_delete:
+        url = REMOVE_DOCUMENT_API
+        # JSON payload to be sent
+        data = {
+    "token": token,
+    "servicename": service,
+    "documentname":document_name}
+        # Convert the dictionary to a JSON formatted string
+        json_data = json.dumps(data)
+        # Set the headers to specify that the content type is JSON
+        headers = {'Content-Type': 'application/json'}
+        # Send the POST request
+        response = requests.post(url, data=json_data, headers=headers)
+        if json.loads( response.text).get("status")=="success":
+            st.success(f"{document_name} deleted successfully")
+        else:
+            st.error(f"{document_name} not deleted successfully")
+def gemini_vision(file):
+    load_image = Image.open(file)
+    prompt= "please extract all text fromt this image"
+    model = genai.GenerativeModel('gemini-pro-vision')
+    response = model.generate_content([prompt, load_image])
+    return response.text
+def add_service(token,servicename,embdding_model):
+    url = ADD_SERVICES_API
+    # JSON payload to be sent
+    data = {
+        "token": token,
+        "services": [
+            {
+                "servicename": servicename,
+                "modelname": embdding_model
+            }
+        ]
+    }
+    # Convert the dictionary to a JSON formatted string
+    json_data = json.dumps(data)
+    # Set the headers to specify that the content type is JSON
+    headers = {'Content-Type': 'application/json'}
+    # Send the POST request
+    response = requests.post(url, data=json_data, headers=headers)
+    if json.loads( response.text).get("added_services"):
+        st.success(f"{servicename} added successfully")
+    else:
+        st.error(response.text)
+def add_document(token,servicename):
+    for file in st.session_state.uploaded_files:
+        if file.type.split('/')[-1]=='pdf':
+            text= extract_text_from_pdf(file)
+        else:
+            text = gemini_vision(file)
+            print(text)
+        if text:
+            url = CHUNK_STORE_API
+            # JSON payload to be sent
+            document_name = file.name.replace(" ","")
+            #document_name = document_name.replace(".pdf","")
+            document_name = document_name.replace("(","_")
+            document_name = document_name.replace(")","_")
+            document_name = document_name.replace("-","_")
+            data = {
+                "text": text,
+                "document_name":document_name,
+                "user_id": token,
+                "service_name": servicename
+            }
+            # Convert the dictionary to a JSON formatted string
+            json_data = json.dumps(data)
+            # Set the headers to specify that the content type is JSON
+            headers = {'Content-Type': 'application/json'}
+            # Send the POST request
+            response = requests.post(url, data=json_data, headers=headers)
+            document_name = file.name.replace(" ","_")
+            if json.loads( response.text).get("success")==True:
+                st.success(f"{document_name} uploaded successfully")
+            else:
+                st.error(f"{document_name} not uploaded successfully")
+        else:
+            st.error("we can't extract text from {}".format(file.name))
+def get_context(prompt,token,service_name,top_k):
+    url = SEARCH_API
+    # JSON payload to be sent
+    data = {
+    "userid": token,
+    "service_name": service_name,
+    "query_str": prompt,
+    "document_names":st.session_state.doument_slected_to_chat ,
+    "top_k": top_k
+    }
+    # Convert the dictionary to a JSON formatted string
+    json_data = json.dumps(data)
+    # Set the headers to specify that the content type is JSON
+    headers = {'Content-Type': 'application/json'}
+    # Send the POST request
+    response = requests.post(url, data=json_data, headers=headers)
+    if json.loads( response.text).get("results"):
+        context = []
+        for chunk in json.loads( response.text).get("results"):
+             context.append(chunk['chunk'])
+        return context
+    else:
+         return []
+def query(payload):
+	response = requests.post(API_URL, headers=HEADERS, json=payload)
+	return response.json()
+def generate_response(llm_name, question, context = None):
+    url = CHAT_API
+    #st.chat_message("assistant", avatar="🤖").write(context)
+    # JSON payload to be sent
+    data = {
+        "context": context,
+        "question": question,
+        "model_name": llm_name,
+    }
+    # Convert the dictionary to a JSON formatted string
+    json_data = json.dumps(data)
+    # Set the headers to specify that the content type is JSON
+    headers = {'Content-Type': 'application/json'}
+    # Send the POST request
+    response = requests.post(url, data=json_data, headers=headers)
+    return json.loads( response.text).get("response", "429 Quota exceeded for quota metric.")