Nechba committed on
Commit
c398ab5
1 Parent(s): 217a700

first commit

Browse files
app.py ADDED
@@ -0,0 +1,194 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ from utlis.helper import *
2
+
3
initialize_session_state()


def _fetch_json(url, payload):
    """GET *url* with *payload* sent as a JSON request body; return the decoded JSON reply."""
    reply = requests.get(
        url,
        data=json.dumps(payload),
        headers={'Content-Type': 'application/json'},
    )
    return json.loads(reply.text)


def _document_names(token, service_name):
    """Return the "documentname" of every document listed for *service_name*."""
    listing = _fetch_json(DOCUMENT_API, {"token": token, "servicename": service_name})
    return [doc["documentname"] for doc in listing.get("documents", [])]


def _multiselect_with_all(options):
    """Render an unlabeled multiselect over *options* and return the chosen items.

    With two or more options an extra "ALL" entry is offered and pre-selected;
    picking it selects every real option. A single option is pre-selected.
    """
    if len(options) >= 2:
        picked = st.multiselect("", options + ["ALL"], default="ALL")
        return list(options) if "ALL" in picked else picked
    if len(options) == 1:
        return st.multiselect("", options, default=options[0])
    return st.multiselect("", options)


def _render_add_documents(token):
    """Panel: optionally create a service, then upload one PDF into a chosen service."""
    st.title('Add Document(s)')
    if st.checkbox("Add new service"):
        new_service = st.text_input("Enter service name")
        if new_service and st.button('Add'):
            add_service(token, new_service)

    # Listed after the optional add so a service created in this run shows up.
    services = _fetch_json(SERVICES_API, {"token": token})
    if not services:
        return
    st.session_state.service = st.selectbox("Choose Service", services)
    st.session_state.uploaded_files = st.file_uploader(
        "Upload PDF file", type=["pdf"], accept_multiple_files=False)
    if st.session_state.uploaded_files:
        st.session_state.process = st.button('Process')
        if st.session_state.process:
            add_document(token, st.session_state.service)


def _render_select_document(token):
    """Panel: choose service, document, page range and chunking method for scraping.

    ``st.session_state.service_slected_to_chat`` is left as ``None`` unless a
    service, a document and a usable page count are all available, so the main
    page only shows the processing form when every input it needs exists.
    """
    st.title('Scrape Document')
    st.session_state.service_slected_to_chat = None
    services = _fetch_json(SERVICES_API, {"token": token})
    if not services:
        return

    service_name = st.selectbox("Choose Service", services)
    document_name = st.selectbox("Choose Documnet", _document_names(token, service_name))
    st.session_state.doument_slected_to_chat = document_name
    if document_name is None:
        # An empty selectbox yields None; asking for its page count would fail.
        st.warning("This service has no documents yet.")
        return

    page_info = _fetch_json(GET_NUM_PAGES, {
        "token": token,
        "service_name": service_name,
        "document_name": document_name,
    })
    number_pages = page_info.get("num_pages")
    if number_pages is None or int(number_pages) < 1:
        st.error("Could not determine the number of pages of this document.")
        return

    page_options = list(range(1, int(number_pages) + 1))
    st.session_state.start_page = st.selectbox("Start Page", page_options)
    st.session_state.end_page = st.selectbox(
        "End Page", page_options, index=len(page_options) - 1)
    st.session_state.method = st.selectbox(
        "Chunking Method", ["chunk_per_page", "personalize_chunking"])
    if st.session_state.method == "personalize_chunking":
        st.session_state.split_token = st.text_area("Split Token")
    st.session_state.service_slected_to_chat = service_name


def _render_delete_services(token):
    """Panel: pick one or more services and delete them on request."""
    st.title('Delete Service(s)')
    services = _fetch_json(SERVICES_API, {"token": token})
    service_slected = _multiselect_with_all(services)
    st.write("You selected:", service_slected)
    if len(service_slected) > 0:
        st.session_state.delete = st.button('Delete')
        if st.session_state.delete:
            delete_service(token, service_slected)


def _render_delete_documents(token):
    """Panel: pick a service, then one or more of its documents, and delete them."""
    st.title('Delete Document(s)')
    services = _fetch_json(SERVICES_API, {"token": token})
    if not services:
        return
    service = st.selectbox("Choose Service", services)
    document_slected_to_delete = _multiselect_with_all(_document_names(token, service))
    st.write("You selected:", document_slected_to_delete)
    if len(document_slected_to_delete) > 0:
        st.session_state.delete = st.button('Delete')
        if st.session_state.delete:
            # Use the service chosen in this panel. st.session_state.service is
            # only assigned by the "Add document(s)" panel and may be None or stale.
            delete_document(token, service, document_slected_to_delete)


# Radio option -> function that renders the matching sidebar panel.
# The key order is the order in which the radio options are displayed.
_SIDEBAR_PANELS = {
    "Select document": _render_select_document,
    "Add document(s)": _render_add_documents,
    "Delete service(s)": _render_delete_services,
    "Delete document(s)": _render_delete_documents,
}

with st.sidebar:
    st.image("logo.png", width=170)
    st.title("AGDC")
    st.session_state.llm = st.selectbox("Choose LLM", ['gpt-3.5-turbo', 'gemini'])
    st.session_state.genre = st.radio("Choose option", list(_SIDEBAR_PANELS))
    _SIDEBAR_PANELS[st.session_state.genre](st.session_state.token)
146
+
147
# Keeps the page heading on a single line.
css_style = """
<style>
.title {
white-space: nowrap;
}
</style>
"""

st.markdown(css_style, unsafe_allow_html=True)

with st.container():
    st.markdown('<h1 class="title">Augmented Generative Document Scraper</h1>', unsafe_allow_html=True)
    state = st.session_state
    # The processing form is shown only in "Select document" mode with a service chosen.
    ready_to_scrape = state.genre == "Select document" and state.service_slected_to_chat
    if ready_to_scrape:
        schema = display_and_validate_schema()
        comments = None
        if schema and st.checkbox("Add comments"):
            comments = handle_comments(get_all_keys(schema))
        if schema and st.button('Process'):
            uses_split_token = state.method == "personalize_chunking"
            payload = {
                "token": state.token,
                "service_name": state.service_slected_to_chat,
                "document_name": state.doument_slected_to_chat,
                "method": state.method,
                "model": state.llm,
                "schema": schema,
                "comment": comments,
                "split_token": state.split_token if uses_split_token else "",
                "start_page": state.start_page,
                "end_page": state.end_page,
            }
            reply = requests.get(
                RESPONSE_API,
                data=json.dumps(payload),
                headers={'Content-Type': 'application/json'},
            )
            result = json.loads(reply.text)
            if result.get('status') != 'success':
                st.error("Error in processing document")
            else:
                # The download widget needs bytes, so serialize the "json" field and encode it.
                st.download_button(
                    label="Download JSON",
                    data=json.dumps(result.get("json")).encode('utf-8'),
                    file_name="results.json",
                    mime="application/json",
                )
194
+
logo.png ADDED
requirments.txt ADDED
@@ -0,0 +1,13 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ streamlit
2
+ pdfplumber
3
+ python-dotenv
4
+ haystack-ai
5
+ transformers
6
+ accelerate
7
+ bitsandbytes
8
+ redis
9
+ python-multipart
10
+ sentence-transformers
11
+ langchain
12
+ semantic_text_splitter
13
+ google-generativeai
utlis/__init__.py ADDED
File without changes
utlis/__pycache__/__init__.cpython-39.pyc ADDED
Binary file (140 Bytes). View file
 
utlis/__pycache__/constant.cpython-39.pyc ADDED
Binary file (759 Bytes). View file
 
utlis/__pycache__/helper.cpython-39.pyc ADDED
Binary file (5.74 kB). View file
 
utlis/constant.py ADDED
@@ -0,0 +1,31 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+
2
# Alternative server addresses (commented out):
# IP_WEB_SERVER = "https://f564-196-65-150-53.ngrok-free.app"
# IP_MODEL_SERVER = "https://fluffy-mole-81.telebit.io"
IP_WEB_SERVER = "http://localhost:80"

# Backend endpoints, all rooted at IP_WEB_SERVER.
# Only SERVICES_API ends with a trailing slash.
SERVICES_API = f"{IP_WEB_SERVER}/services/"
ADD_SERVICES_API = f"{IP_WEB_SERVER}/add_services"
ADD_STORE_DOCUMENT = f"{IP_WEB_SERVER}/add_and_store_document"
DOCUMENT_API = f"{IP_WEB_SERVER}/documents"
REMOVE_DOCUMENTS_API = f"{IP_WEB_SERVER}/remove_documents"
REMOVE_SERVICE_API = f"{IP_WEB_SERVER}/remove_service"
GET_NUM_PAGES = f"{IP_WEB_SERVER}/get_num_pages"
RESPONSE_API = f"{IP_WEB_SERVER}/structure_response"

# Marker used for every value slot in the default schema template.
_PLACEHOLDER = "<variable>"

# Default schema template: a nested structure whose leaves are placeholders.
DEFAULT_SCHEMA = {
    "GeographicContext": _PLACEHOLDER,
    "SubGeographicContext": _PLACEHOLDER,
    "Channel": _PLACEHOLDER,
    "RateType": _PLACEHOLDER,
    "Notes": [_PLACEHOLDER],
    "Rates": [
        {
            "PaymentProduct": _PLACEHOLDER,
            "Details": [
                {
                    "FeeTier": _PLACEHOLDER,
                    "IRD": [_PLACEHOLDER],
                    "Rate": _PLACEHOLDER,
                },
            ],
        },
    ],
}
utlis/helper.py ADDED
@@ -0,0 +1,199 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ import pdfplumber
2
+ import streamlit as st
3
+ import requests
4
+ import json
5
+ import redis
6
+ import redis.commands.search
7
+ from redis.commands.search.field import TagField, VectorField, TextField
8
+ from redis.commands.search.indexDefinition import IndexDefinition, IndexType
9
+ import logging
10
+ from redis.commands.search.query import Query
11
+ import numpy as np
12
+ from typing import List, Dict, Any
13
+ from semantic_text_splitter import TextSplitter
14
+ from tokenizers import Tokenizer
15
+ from sentence_transformers import SentenceTransformer
16
+ from utlis.constant import *
17
+ from PIL import Image
18
+ import google.generativeai as genai
19
import os

# Take the Gemini API key from the GOOGLE_API_KEY environment variable rather
# than hard-coding it in the repository.
# NOTE(review): the key previously committed on this line is public in the
# commit history and should be revoked.
genai.configure(api_key=os.environ.get("GOOGLE_API_KEY"))
20
+ import base64
21
def initialize_session_state():
    """Seed ``st.session_state`` with defaults for any key that is not set yet.

    Keys that already exist keep their current value.
    """
    defaults = {
        "token": "abcd",
        "service": None,
        "use_document": False,
        "flag": False,
        "embdding_model": None,
        "indexing_method": None,
        "uploaded_files": None,
        "messages": [{"role": "assistant", "content": "How can I help you?"}],
    }
    for name, value in defaults.items():
        if name not in st.session_state:
            st.session_state[name] = value
39
+
40
+
41
def extract_text_from_pdf(pdf_path):
    """Return the text of every page of a PDF, concatenated in page order.

    Args:
        pdf_path: Passed straight to ``pdfplumber.open``.

    Returns:
        One string holding each page's layout-mode text, joined without any
        separator between pages.
    """
    with pdfplumber.open(pdf_path) as pdf:
        return "".join(
            # ``or ""`` is defensive: a page with no extractable text must not
            # break the concatenation if ``extract_text`` yields ``None``.
            page.extract_text(
                x_tolerance=2, y_tolerance=4, layout=True, x_density=5, y_density=10
            ) or ""
            for page in pdf.pages
        )
48
+
49
def delete_service(token,service_slected_to_delete):
    """Delete each named service and report the outcome in the Streamlit UI.

    One HTTP DELETE with a JSON body is sent to ``REMOVE_SERVICE_API`` per
    service name. A service counts as deleted only when the reply is a JSON
    object whose "success" field is ``true``. A network error or unparsable
    reply is reported as a failure for that service, and the remaining
    services are still attempted.

    Args:
        token: Value sent as the "token" field of each request.
        service_slected_to_delete: Iterable of service names to delete.
    """
    headers = {'Content-Type': 'application/json'}
    for service_name in service_slected_to_delete:
        body = json.dumps({"token": token, "servicename": service_name})
        try:
            response = requests.delete(REMOVE_SERVICE_API, data=body, headers=headers)
            reply = json.loads(response.text)
            deleted = isinstance(reply, dict) and reply.get("success") is True
        except (requests.RequestException, ValueError):
            # ValueError covers json.JSONDecodeError for non-JSON replies.
            deleted = False

        if deleted:
            st.success(f"{service_name} deleted successfully")
        else:
            st.error(f"{service_name} not deleted successfully")
68
+
69
def delete_document(token, service,document_slected_to_delete):
    """Delete a batch of documents from one service and report the outcome.

    A single HTTP DELETE with a JSON body listing all document names is sent
    to ``REMOVE_DOCUMENTS_API``. The batch counts as deleted only when the
    reply is a JSON object whose "status" field equals "success". A network
    error or unparsable reply is reported as a failure.

    Args:
        token: Value sent as the "token" field.
        service: Value sent as the "service_name" field.
        document_slected_to_delete: Document names sent as "document_names";
            must be JSON-serializable (e.g. a list of strings).
    """
    body = json.dumps({
        "token": token,
        "service_name": service,
        "document_names": document_slected_to_delete,
    })
    try:
        response = requests.delete(
            REMOVE_DOCUMENTS_API,
            data=body,
            headers={'Content-Type': 'application/json'},
        )
        reply = json.loads(response.text)
        deleted = isinstance(reply, dict) and reply.get("status") == "success"
    except (requests.RequestException, ValueError):
        # ValueError covers json.JSONDecodeError for non-JSON replies.
        deleted = False

    if deleted:
        st.success("document(s) deleted successfully")
    else:
        st.error("document(s) not deleted successfully")
92
def gemini_vision(file):
    """Ask the 'gemini-pro-vision' model to extract the text shown in an image.

    Args:
        file: Passed straight to ``PIL.Image.open``.

    Returns:
        The ``text`` attribute of the model's response.
    """
    prompt= "please extract all text fromt this image"
    model = genai.GenerativeModel('gemini-pro-vision')
    # The context manager releases the image once the request has completed.
    with Image.open(file) as load_image:
        response = model.generate_content([prompt, load_image])
        return response.text
99
def add_service(token,servicename):
    """Create a service through ``ADD_SERVICES_API`` and report the outcome.

    A POST with a JSON body is sent. If the reply's "added_services" field is
    truthy a success message is shown; otherwise the raw reply text is shown
    as an error.

    Args:
        token: Value sent as the "token" field.
        servicename: Name of the service to create.
    """
    payload = {
        "token": token,
        "services": [{"servicename": servicename}],
    }
    response = requests.post(
        ADD_SERVICES_API,
        data=json.dumps(payload),
        headers={'Content-Type': 'application/json'},
    )
    reply = json.loads(response.text)
    if not reply.get("added_services"):
        st.error(response.text)
        return
    st.success(f"{servicename} added successfully")
123
def add_document(token,servicename):
    """Upload the file held in ``st.session_state.uploaded_files`` to a service.

    The file content is base64-encoded and POSTed as JSON to
    ``ADD_STORE_DOCUMENT``. The upload counts as successful only when the reply
    is a JSON object whose "status" field equals "success". A network error
    or unparsable reply is reported as a failure.

    Args:
        token: Value sent as the "token" field.
        servicename: Value sent as the "service_name" field.
    """
    file = st.session_state.uploaded_files
    if file is None:
        st.error("No file selected for upload")
        return

    # Name sent to the backend: spaces removed; parentheses, dashes and dots
    # replaced by underscores (done in one pass).
    stored_name = file.name.translate(
        str.maketrans({" ": None, "(": "_", ")": "_", "-": "_", ".": "_"})
    )
    payload = {
        "token": token,
        "service_name": servicename,
        "document_name": stored_name,
        "file": base64.b64encode(file.read()).decode('utf-8'),
    }
    try:
        response = requests.post(
            ADD_STORE_DOCUMENT,
            data=json.dumps(payload),
            headers={'Content-Type': 'application/json'},
        )
        reply = json.loads(response.text)
        uploaded = isinstance(reply, dict) and reply.get("status") == "success"
    except (requests.RequestException, ValueError):
        # ValueError covers json.JSONDecodeError for non-JSON replies.
        uploaded = False

    # The status message shows the original file name with spaces replaced by
    # underscores, which is not the same string as the name sent to the backend.
    document_name = file.name.replace(" ","_")
    if uploaded:
        st.success(f"{document_name} uploaded successfully")
    else:
        st.error(f"{document_name} not uploaded successfully")
158
+
159
+
160
def get_all_keys(d):
    """Collect every dictionary key found anywhere inside a JSON-like value.

    Dictionaries and lists are walked recursively at any nesting depth; other
    values are ignored.

    Args:
        d: A dict, list or scalar, e.g. the result of ``json.loads``.

    Returns:
        A list of the unique keys in depth-first, first-seen order, so the
        result is the same on every run for the same input.
    """
    seen = {}  # dict used as an insertion-ordered set

    def walk(node):
        if isinstance(node, dict):
            for key, value in node.items():
                seen.setdefault(key, None)
                walk(value)
        elif isinstance(node, list):
            for item in node:
                walk(item)

    walk(d)
    return list(seen)
173
def display_and_validate_schema():
    """Show an editable JSON text area pre-filled with ``DEFAULT_SCHEMA`` and parse it.

    Returns:
        The parsed JSON value when the text is valid JSON (a success message is
        shown); ``None`` when it is not (an error message is shown).
    """
    default_text = json.dumps(DEFAULT_SCHEMA, indent=2)
    entered_text = st.text_area("JSON Schema", default_text, height=300)
    try:
        parsed = json.loads(entered_text)
    except json.JSONDecodeError:
        st.error("The JSON schema is invalid. Please correct it and try again.")
        return None
    st.success("JSON schema is valid.")
    return parsed
183
def handle_comments(keys):
    """Render paginated comment inputs for schema keys and return what was entered.

    Keys are shown six per page, and a number input selects the page. Only the
    keys on the selected page are rendered, so the returned dict contains only
    those keys.

    Args:
        keys: Sequence of key names to collect comments for.

    Returns:
        Dict mapping each key on the selected page to its text-input value;
        an empty dict when ``keys`` is empty.
    """
    if not keys:
        # With no keys the page count would be 0, below the number input's
        # minimum of 1, so there is nothing valid to render.
        return {}

    comments = {}
    items_per_page = 6  # Adjust this number based on your preference
    total_pages = (len(keys) + items_per_page - 1) // items_per_page  # ceiling division

    st.write("Please provide comments for each key to assist our system:")

    page = st.number_input("Page", min_value=1, max_value=total_pages, step=1)
    start_idx = (page - 1) * items_per_page
    end_idx = start_idx + items_per_page

    for key in keys[start_idx:end_idx]:
        with st.expander(f"{key}"):
            comments[key] = st.text_input(f"{key}")
    return comments