Spaces:

Effyis
/

AGDS-UI

Sleeping

App Files Files Community

Nechba committed on Jun 20, 2024

Commit

c398ab5

1 Parent(s): 217a700

fisrt commit

Browse files

Files changed (9) hide show

app.py +194 -0
logo.png +0 -0
requirments.txt +13 -0
utlis/__init__.py +0 -0
utlis/__pycache__/__init__.cpython-39.pyc +0 -0
utlis/__pycache__/constant.cpython-39.pyc +0 -0
utlis/__pycache__/helper.cpython-39.pyc +0 -0
utlis/constant.py +31 -0
utlis/helper.py +199 -0

app.py ADDED Viewed

	@@ -0,0 +1,194 @@

+from utlis.helper import *
+initialize_session_state()
+with st.sidebar:
+    st.image("logo.png", width=170)
+    st.title("AGDC")
+    # Get List of models
+    llms = ['gpt-3.5-turbo', 'gemini']
+    st.session_state.llm = st.selectbox("Choose LLM",llms)
+    st.session_state.genre = st.radio(
+    "Choose option",
+    ["Select document", "Add document(s)","Delete service(s)", "Delete document(s)"])
+    if st.session_state.genre=="Add document(s)":
+        st.title('Add Document(s)')
+        # Check service status
+        # Get all available services
+        add_new_service = st.checkbox("Add new service")
+        if add_new_service:
+            new_service = st.text_input("Enter service name")
+            # Get list of Embedding models
+            if  new_service and st.button('Add'):
+                add_service(st.session_state.token,new_service)
+        data = {"token": st.session_state.token}
+        json_data = json.dumps(data)
+        headers = {'Content-Type': 'application/json'}
+        services  = requests.get(SERVICES_API,data=json_data, headers=headers)
+        services =json.loads(services.text)
+        if len(services)>0:
+           st.session_state.service = st.selectbox("Choose Service",services)
+        if len(services)>0:
+            st.session_state.uploaded_files = st.file_uploader("Upload PDF file",  type=["pdf"], accept_multiple_files=False)
+            if st.session_state.uploaded_files:
+                st.session_state.process = st.button('Process')
+                if st.session_state.process:
+                    add_document(st.session_state.token,st.session_state.service)
+    elif st.session_state.genre=="Select document":
+        st.title('Scrape Document')
+        data = {"token": st.session_state.token}
+        json_data = json.dumps(data)
+        headers = {'Content-Type': 'application/json'}
+        services  = requests.get(SERVICES_API,data=json_data, headers=headers)
+        services =json.loads(services.text)
+        if len(services)>0:
+            st.session_state.service_slected_to_chat = st.selectbox("Choose Service",services)
+            data = {"token": st.session_state.token, "servicename": st.session_state.service_slected_to_chat}
+            json_data = json.dumps(data)
+            headers = {'Content-Type': 'application/json'}
+            history_document  = requests.get(DOCUMENT_API,data=json_data, headers=headers)
+            history_document =json.loads(history_document.text).get("documents",[])
+            history_document = [doc["documentname"] for doc in history_document]
+            st.session_state.doument_slected_to_chat = st.selectbox("Choose Documnet",history_document)
+            data = {"token": st.session_state.token, "service_name": st.session_state.service_slected_to_chat,"document_name":st.session_state.doument_slected_to_chat}
+            json_data = json.dumps(data)
+            headers = {'Content-Type': 'application/json'}
+            number_pages = requests.get(GET_NUM_PAGES,data=json_data, headers=headers)
+            number_pages =json.loads(number_pages.text).get("num_pages")
+            page_options = list(range(1, int(number_pages) + 1))
+            st.session_state.start_page = st.selectbox("Start Page",page_options)
+            st.session_state.end_page = st.selectbox("End Page", page_options, index=len(page_options) - 1)
+            st.session_state.method = st.selectbox("Chunking Method", ["chunk_per_page", "personalize_chunking"])
+            if st.session_state.method=="personalize_chunking":
+               st.session_state.split_token = st.text_area("Split Token")
+        else:
+            st.session_state.service_slected_to_chat = None
+    elif st.session_state.genre == "Delete service(s)":
+        st.title('Delete Service(s)')
+        data = {"token": st.session_state.token}
+        json_data = json.dumps(data)
+        headers = {'Content-Type': 'application/json'}
+        services  = requests.get(SERVICES_API,data=json_data, headers=headers)
+        services =json.loads(services.text)
+        if len(services)>=2:
+            services.append("ALL")
+            # Get list of documents from histrory
+        if "ALL" in services:
+            service_slected = st.multiselect(
+                    "",services ,default="ALL"
+                    )
+        elif len(services)==1:
+            service_slected = st.multiselect(
+                    "",services,default=services[0]
+                    )
+        else:
+            service_slected = st.multiselect(
+                    "",services
+                    )
+        if "ALL" in service_slected:
+            service_slected = services
+            service_slected.remove("ALL")
+        st.write("You selected:", service_slected)
+        if len(service_slected) > 0:
+            st.session_state.delete = st.button('Delete')
+            if st.session_state.delete:
+                delete_service(st.session_state.token ,service_slected)
+    elif st.session_state.genre == "Delete document(s)":
+        st.title('Delete Document(s)')
+        data = {"token": st.session_state.token}
+        json_data = json.dumps(data)
+        headers = {'Content-Type': 'application/json'}
+        services  = requests.get(SERVICES_API,data=json_data, headers=headers)
+        services =json.loads(services.text)
+        if len(services)>0:
+            service = st.selectbox("Choose Service",services)
+            data = {"token": st.session_state.token, "servicename": service}
+            json_data = json.dumps(data)
+            headers = {'Content-Type': 'application/json'}
+            history_document  = requests.get(DOCUMENT_API,data=json_data, headers=headers)
+            history_document =json.loads(history_document.text).get("documents",[])
+            history_document = [doc["documentname"] for doc in history_document]
+            if len(history_document)>=2:
+                history_document.append("ALL")
+            # Get list of documents from histrory
+            if "ALL" in history_document:
+                document_slected_to_delete = st.multiselect(
+                    "",history_document ,default="ALL"
+                    )
+            elif len(history_document)==1:
+                document_slected_to_delete = st.multiselect(
+                    "",history_document,default=history_document[0]
+                    )
+            else:
+                document_slected_to_delete = st.multiselect(
+                    "",history_document
+                    )
+            if "ALL" in document_slected_to_delete:
+                document_slected_to_delete = history_document
+                document_slected_to_delete.remove("ALL")
+            st.write("You selected:", document_slected_to_delete)
+            if len(document_slected_to_delete) > 0:
+                st.session_state.delete = st.button('Delete')
+                if st.session_state.delete:
+                    delete_document(st.session_state.token,st.session_state.service ,document_slected_to_delete)
+css_style = """
+<style>
+.title {
+    white-space: nowrap;
+}
+</style>
+"""
+st.markdown(css_style, unsafe_allow_html=True)
+with st.container():
+    st.markdown('<h1 class="title">Augmented Generative Document Scraper</h1>', unsafe_allow_html=True)
+    if st.session_state.genre=="Select document" and st.session_state.service_slected_to_chat:
+        schema = display_and_validate_schema()
+        comments = None
+        if schema and st.checkbox("Add comments")  :
+            keys = get_all_keys(schema)
+            comments = handle_comments(keys)
+        if schema and st.button('Process') :
+            data = {"token": st.session_state.token,
+            "service_name": st.session_state.service_slected_to_chat,
+            "document_name": st.session_state.doument_slected_to_chat,
+            "method": st.session_state.method,
+            "model": st.session_state.llm,
+            "schema": schema,
+            "comment": comments,
+            "split_token": st.session_state.split_token if st.session_state.method == "personalize_chunking" else "",
+            "start_page": st.session_state.start_page,
+            "end_page": st.session_state.end_page}
+            json_data = json.dumps(data)
+            headers = {'Content-Type': 'application/json'}
+            response  = requests.get(RESPONSE_API,data=json_data, headers=headers)
+            response_data = json.loads(response.text)
+            if response_data.get('status')=='success':
+                json_str =response_data.get("json")
+                json_str_formatted = json.dumps(json_str)
+                # Encode this JSON string to bytes, which is required for the download
+                json_bytes = json_str_formatted.encode('utf-8')
+                st.download_button(
+                    label="Download JSON",
+                    data=json_bytes,
+                    file_name="results.json",
+                    mime="application/json"
+                )
+            else:
+                st.error("Error in processing document")

logo.png ADDED Viewed

requirments.txt ADDED Viewed

	@@ -0,0 +1,13 @@

+streamlit
+pdfplumber
+python-dotenv
+haystack-ai
+transformers
+accelerate
+bitsandbytes
+redis
+python-multipart
+sentence-transformers
+langchain
+semantic_text_splitter
+google-generativeai

utlis/__init__.py ADDED Viewed

File without changes

utlis/__pycache__/__init__.cpython-39.pyc ADDED Viewed

Binary file (140 Bytes). View file

utlis/__pycache__/constant.cpython-39.pyc ADDED Viewed

Binary file (759 Bytes). View file

utlis/__pycache__/helper.cpython-39.pyc ADDED Viewed

Binary file (5.74 kB). View file

utlis/constant.py ADDED Viewed

	@@ -0,0 +1,31 @@

+# IP_WEB_SERVER = "https://f564-196-65-150-53.ngrok-free.app"
+# IP_MODEL_SERVER = "https://fluffy-mole-81.telebit.io"
+IP_WEB_SERVER = "http://localhost:80"
+SERVICES_API = IP_WEB_SERVER+"/services/"
+ADD_SERVICES_API = IP_WEB_SERVER+"/add_services"
+ADD_STORE_DOCUMENT = IP_WEB_SERVER+"/add_and_store_document"
+DOCUMENT_API = IP_WEB_SERVER+"/documents"
+REMOVE_DOCUMENTS_API = IP_WEB_SERVER+"/remove_documents"
+REMOVE_SERVICE_API = IP_WEB_SERVER+"/remove_service"
+GET_NUM_PAGES = IP_WEB_SERVER+"/get_num_pages"
+RESPONSE_API = IP_WEB_SERVER+"/structure_response"
+DEFAULT_SCHEMA = {
+    "GeographicContext": "<variable>",
+    "SubGeographicContext": "<variable>",
+    "Channel": "<variable>",
+    "RateType": "<variable>",
+    "Notes": ["<variable>"],
+    "Rates": [
+        {
+            "PaymentProduct": "<variable>",
+            "Details": [
+                {
+                    "FeeTier": "<variable>",
+                    "IRD": ["<variable>"],
+                    "Rate": "<variable>"
+                },
+            ]
+        },
+    ]
+}

utlis/helper.py ADDED Viewed

	@@ -0,0 +1,199 @@

+import pdfplumber
+import streamlit as st
+import requests
+import json
+import redis
+import redis.commands.search
+from redis.commands.search.field import TagField, VectorField, TextField
+from redis.commands.search.indexDefinition import IndexDefinition, IndexType
+import logging
+from redis.commands.search.query import Query
+import numpy as np
+from typing import List, Dict, Any
+from semantic_text_splitter import TextSplitter
+from tokenizers import Tokenizer
+from sentence_transformers import SentenceTransformer
+from utlis.constant import *
+from PIL import Image
+import google.generativeai as genai
+genai.configure(api_key="AIzaSyAhz9UBzkEIYI886zZRm40qqB1Kd_9Y4-0")
+import base64
+def initialize_session_state():
+    if "token" not in st.session_state:
+        st.session_state["token"] ="abcd"
+    if "service" not in st.session_state:
+        st.session_state["service"] = None
+    if "use_document" not in st.session_state:
+        st.session_state.use_document = False
+    if "flag" not in st.session_state:
+        st.session_state.flag = False
+    if "embdding_model" not in st.session_state:
+        st.session_state["embdding_model"] = None
+    if "indexing_method" not in st.session_state:
+        st.session_state["indexing_method"] = None
+    if "uploaded_files" not in st.session_state:
+        st.session_state["uploaded_files"] = None
+    if "messages" not in st.session_state:
+        st.session_state["messages"] = [{"role": "assistant", "content": "How can I help you?"}]
+def extract_text_from_pdf(pdf_path):
+    text=""
+    with pdfplumber.open(pdf_path) as pdf:
+        for page_number, page in enumerate(pdf.pages, start=1):
+            # Try to extract the text
+            text+= page.extract_text(x_tolerance=2, y_tolerance=4, layout=True, x_density=5, y_density=10)
+    return text
+def delete_service(token,service_slected_to_delete):
+    for srevice_name in service_slected_to_delete:
+        url = REMOVE_SERVICE_API
+        # JSON payload to be sent
+        data = {
+            "token": token,
+            "servicename": srevice_name
+            }
+        json_data = json.dumps(data)
+        # Set the headers to specify that the content type is JSON
+        headers = {'Content-Type': 'application/json'}
+        # Send the POST request
+        response = requests.delete(url, data=json_data, headers=headers)
+        if json.loads( response.text).get("success")==True:
+            st.success(f"{srevice_name} deleted successfully")
+        else:
+            st.error(f"{srevice_name} not deleted successfully")
+def delete_document(token, service,document_slected_to_delete):
+        print(document_slected_to_delete)
+    # for document_name in document_slected_to_delete:
+        url = REMOVE_DOCUMENTS_API
+        # JSON payload to be sent
+        data = {
+        "token": token,
+        "service_name": service,
+        "document_names":document_slected_to_delete
+        }
+        # Convert the dictionary to a JSON formatted string
+        json_data = json.dumps(data)
+        # Set the headers to specify that the content type is JSON
+        headers = {'Content-Type': 'application/json'}
+        # Send the POST request
+        response = requests.delete(url, data=json_data, headers=headers)
+        print(response)
+        if json.loads( response.text).get("status")=="success":
+            st.success("document(s) deleted successfully")
+        else:
+            st.error("document(s) not deleted successfully")
+def gemini_vision(file):
+    load_image = Image.open(file)
+    prompt= "please extract all text fromt this image"
+    model = genai.GenerativeModel('gemini-pro-vision')
+    response = model.generate_content([prompt, load_image])
+    return response.text
+def add_service(token,servicename):
+    url = ADD_SERVICES_API
+    # JSON payload to be sent
+    data = {
+        "token": token,
+        "services": [
+            {
+                "servicename": servicename
+            }
+        ]
+    }
+    # Convert the dictionary to a JSON formatted string
+    json_data = json.dumps(data)
+    # Set the headers to specify that the content type is JSON
+    headers = {'Content-Type': 'application/json'}
+    # Send the POST request
+    response = requests.post(url, data=json_data, headers=headers)
+    if json.loads( response.text).get("added_services"):
+        st.success(f"{servicename} added successfully")
+    else:
+        st.error(response.text)
+def add_document(token,servicename):
+            file = st.session_state.uploaded_files
+            print(file)
+            url = ADD_STORE_DOCUMENT
+            # JSON payload to be sent
+            document_name = file.name.replace(" ","")
+            #document_name = document_name.replace(".pdf","")
+            document_name = document_name.replace("(","_")
+            document_name = document_name.replace(")","_")
+            document_name = document_name.replace("-","_")
+            document_name = document_name.replace(".","_")
+            encoded_file = base64.b64encode(file.read()).decode('utf-8')
+            data = {
+            "token": token,
+            "service_name": servicename,
+            "document_name": document_name,
+            "file":encoded_file
+            }
+            # Convert the dictionary to a JSON formatted string
+            json_data = json.dumps(data)
+            # Set the headers to specify that the content type is JSON
+            headers = {'Content-Type': 'application/json'}
+            # Send the POST request
+            response = requests.post(url, data=json_data, headers=headers)
+            document_name = file.name.replace(" ","_")
+            if json.loads( response.text).get("status")=="success":
+                st.success(f"{document_name} uploaded successfully")
+            else:
+                st.error(f"{document_name} not uploaded successfully")
+def get_all_keys(d):
+    all_keys = set()
+    def get_keys(d):
+        for k, v in d.items():
+            all_keys.add(k)
+            if isinstance(v, dict):
+                get_keys(v)
+            elif isinstance(v, list):
+                for item in v:
+                    if isinstance(item, dict):
+                        get_keys(item)
+    get_keys(d)
+    return list(all_keys)
+def display_and_validate_schema():
+    schema_str = json.dumps(DEFAULT_SCHEMA, indent=2)
+    schema_input = st.text_area("JSON Schema", schema_str, height=300)
+    try:
+        schema = json.loads(schema_input)
+        st.success("JSON schema is valid.")
+        return schema
+    except json.JSONDecodeError:
+        st.error("The JSON schema is invalid. Please correct it and try again.")
+        return None
+def handle_comments(keys):
+    comments = {}
+    items_per_page = 6  # Adjust this number based on your preference
+    total_pages = (len(keys) + items_per_page - 1) // items_per_page
+    st.write("Please provide comments for each key to assist our system:")
+    page = st.number_input("Page", min_value=1, max_value=total_pages, step=1)
+    start_idx = (page - 1) * items_per_page
+    end_idx = start_idx + items_per_page
+    for key in keys[start_idx:end_idx]:
+        with st.expander(f"{key}"):
+            comments[key] = st.text_input(f"{key}")
+    # if st.button("Submit"):
+    #   st.session_state.flag=False
+    return comments