Spaces:
Running
Running
Update app.py
Browse files
app.py
CHANGED
@@ -10,6 +10,7 @@ import numpy as np
|
|
10 |
from langchain.text_splitter import RecursiveCharacterTextSplitter
|
11 |
from langchain.schema import Document
|
12 |
from sentence_transformers import SentenceTransformer,util
|
|
|
13 |
from streamlit_image_select import image_select
|
14 |
import os
|
15 |
import fitz
|
@@ -66,34 +67,92 @@ def encoding_model():
|
|
66 |
model = SentenceTransformer(model_name)
|
67 |
return model
|
68 |
|
69 |
-
|
70 |
-
|
71 |
-
|
72 |
-
|
73 |
-
# )
|
74 |
-
|
75 |
-
# completion = client.chat.completions.create(
|
76 |
|
77 |
-
|
78 |
-
|
79 |
-
|
80 |
-
|
81 |
-
|
82 |
-
|
83 |
-
|
84 |
-
|
85 |
-
|
86 |
-
|
87 |
-
|
88 |
-
|
89 |
-
|
90 |
-
|
91 |
-
|
92 |
-
|
93 |
-
|
94 |
-
#
|
95 |
-
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
96 |
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
97 |
|
98 |
def send_prompt():
    """Return the fixed instruction sentence placed between context and question.

    The script concatenates this text after the retrieved context and before
    the user's own prompt when it builds the final request.
    """
    instruction = "please respond according to the prompt asked below from the above context"
    return instruction
|
@@ -271,8 +330,14 @@ with column2:
|
|
271 |
if prompts_[-1]=="@working":
|
272 |
if index==0:
|
273 |
|
274 |
-
|
275 |
-
|
|
|
|
|
|
|
|
|
|
|
|
|
276 |
dictionary['every_prompt_with_val'][-1]=(prompts_[0],str(data_need))
|
277 |
|
278 |
elif isinstance(prompts_[-1],str):
|
@@ -529,68 +594,82 @@ with st.spinner('Wait for it...'):
|
|
529 |
with column1:
|
530 |
# Create a canvas component
|
531 |
changes,implementation,current=st.columns([0.01,0.9,0.01])
|
|
|
532 |
model = encoding_model()
|
533 |
with implementation:
|
|
|
|
|
534 |
st.write("<br>"*3,unsafe_allow_html=True)
|
535 |
if bg_doc:
|
536 |
-
|
537 |
canvas_result=None
|
538 |
-
|
539 |
-
|
540 |
-
f.write(bg_doc.getbuffer())
|
541 |
|
542 |
-
|
543 |
-
data = process_pdf("temp.pdf")
|
544 |
-
if str(data) not in dictionary['text_embeddings']:
|
545 |
-
dictionary['text_embeddings']={}
|
546 |
-
text_splitter = RecursiveCharacterTextSplitter(chunk_size=100, chunk_overlap=100)
|
547 |
-
chunks = text_splitter.split_documents(data)
|
548 |
-
|
549 |
-
dictionary['text_embeddings'][str(data)]={str(chunk.page_content):np.array(model.encode(str(chunk.page_content))) for chunk in chunks}
|
550 |
-
|
551 |
-
embeddings = [dictionary['text_embeddings'][str(data)][i] for i in dictionary['text_embeddings'][str(data)]]
|
552 |
-
vector_store = []
|
553 |
-
for i in dictionary['text_embeddings'][str(data)]:
|
554 |
-
vector_store.append((dictionary['text_embeddings'][str(data)][i],i))
|
555 |
-
else:
|
556 |
-
embeddings = [dictionary['text_embeddings'][str(data)][i] for i in dictionary['text_embeddings'][str(data)]]
|
557 |
-
vector_store = []
|
558 |
-
for i in dictionary['text_embeddings'][str(data)]:
|
559 |
-
vector_store.append((dictionary['text_embeddings'][str(data)][i],i))
|
560 |
-
# text_splitter = RecursiveCharacterTextSplitter(chunk_size=1000, chunk_overlap=100)
|
561 |
-
# chunks = text_splitter.split_documents(data)
|
562 |
-
# # chunk_texts = [str(chunk.page_content) for chunk in chunks]
|
563 |
-
# # print("testing",chunk_texts)
|
564 |
-
|
565 |
-
# model = encoding_model()
|
566 |
-
# embeddings = [model.encode(str(chunk.page_content)) for chunk in chunks]
|
567 |
-
|
568 |
-
# vector_store = []
|
569 |
-
# for chunk, embedding in zip(chunks, embeddings):
|
570 |
-
# vector_store.append((embedding, chunk.page_content) )
|
571 |
|
572 |
-
|
573 |
-
|
574 |
-
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
575 |
else:
|
576 |
-
|
577 |
-
|
578 |
-
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
579 |
|
580 |
canvas_result = st_canvas(
|
581 |
fill_color="rgba(0, 0, 0, 0.3)", # Fixed fill color with some opacity
|
582 |
stroke_width=stroke_width,
|
583 |
stroke_color=stroke_color,
|
584 |
background_color=bg_color,
|
585 |
-
background_image=gen_image if gen_image else Image.open("
|
586 |
update_streamlit=True,
|
587 |
-
height=screen_height,
|
588 |
-
width=screen_width,
|
589 |
drawing_mode=drawing_mode,
|
590 |
point_display_radius=point_display_radius if drawing_mode == 'point' else 0,
|
591 |
key="canvas",
|
592 |
)
|
593 |
-
|
594 |
|
595 |
|
596 |
|
@@ -603,46 +682,50 @@ with column1:
|
|
603 |
|
604 |
# run=st.button("run_experiment")
|
605 |
if bg_doc:
|
606 |
-
|
607 |
-
|
608 |
-
|
609 |
-
|
610 |
-
|
611 |
-
|
612 |
retrieved_chunks = max([(util.cos_sim(match[0],query_embedding),match[-1])for match in vector_store])[-1]
|
613 |
-
|
614 |
-
|
615 |
-
|
616 |
with implementation:
|
617 |
-
with st.spinner('Wait for it...'):
|
618 |
text_lookup=retrieved_chunks
|
619 |
pages=[]
|
620 |
buffer = bg_doc.getbuffer()
|
621 |
byte_data = bytes(buffer)
|
622 |
-
with fitz.open("
|
623 |
-
|
624 |
for page_no in range(doc.page_count):
|
625 |
pages.append(doc.load_page(page_no - 1))
|
626 |
|
627 |
-
# areas = pages[page_number-1].search_for(text_lookup)
|
628 |
with st.container(height=int(screen_height//1.8)):
|
629 |
for pg_no in pages[::-1]:
|
630 |
areas = pg_no.search_for(text_lookup)
|
631 |
for area in areas:
|
632 |
pg_no.add_rect_annot(area)
|
633 |
-
|
634 |
pix = pg_no.get_pixmap(dpi=100).tobytes()
|
635 |
-
st.image(pix,
|
636 |
|
637 |
if bg_doc and prompt:
|
|
|
638 |
query_embedding = model.encode([prompt])
|
639 |
-
|
640 |
-
|
641 |
-
prompt = "Context: "+ retrieved_chunks +"\n"+send_prompt()+ "\n"+prompt
|
642 |
|
643 |
-
|
644 |
-
|
645 |
-
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
646 |
elif not bg_doc and canvas_result.image_data is not None:
|
647 |
if prompt:
|
648 |
|
|
|
10 |
from langchain.text_splitter import RecursiveCharacterTextSplitter
|
11 |
from langchain.schema import Document
|
12 |
from sentence_transformers import SentenceTransformer,util
|
13 |
+
from code_editor import code_editor
|
14 |
from streamlit_image_select import image_select
|
15 |
import os
|
16 |
import fitz
|
|
|
67 |
model = SentenceTransformer(model_name)
|
68 |
return model
|
69 |
|
70 |
+
def executer(query):
    """Execute a string of Python source and echo whatever it printed.

    Standard output is captured while the snippet runs and is printed once
    the snippet has finished successfully.

    Args:
        query: Python source code to execute.

    Returns:
        ``False`` when the snippet ran without raising, otherwise the string
        ``"Error: <exception message>"``.
    """
    captured = io.StringIO()
    previous_stdout = sys.stdout
    sys.stdout = captured
    try:
        # SECURITY: exec() runs arbitrary code with the privileges of this
        # process. Only pass source that is trusted or has been reviewed.
        exec(query)
    except Exception as exc:
        return f"Error: {exc}"
    finally:
        # Always put the previous stream back; otherwise a failing snippet
        # would leave every later print() writing into the discarded buffer.
        sys.stdout = previous_stdout
    print(captured.getvalue())
    return False
|
81 |
+
def dataframe_info(data, rows=5):
    """Return the text form of the leading slice ``data[:rows]``.

    Used to give the language model a small sample of the uploaded data
    instead of the whole table.

    Args:
        data: Any object that supports slicing with ``data[:rows]``
            (the callers in this file pass the uploaded dataframe).
        rows: Upper bound of the slice. Defaults to 5, the size that was
            previously hard-coded.

    Returns:
        ``str(data[:rows])``.
    """
    preview = data[:rows]
    return str(preview)
|
84 |
+
|
85 |
+
def extract_python_code(text):
    """Extract code snippets from free-form (typically LLM-generated) text.

    Three sources are recognised, in this order:

    1. Triple-backtick fenced blocks (an optional language tag is skipped).
    2. Indented blocks: consecutive lines starting with 4 spaces or a tab.
    3. Inline snippets wrapped in single backticks.

    Fenced blocks are removed from the text before steps 2 and 3 run, so
    indented lines *inside* a fence are not reported a second time and the
    prose between two fences is not mistaken for inline code. Indented
    blocks are dedented as a whole so their relative indentation survives.

    Args:
        text: The text to scan.

    Returns:
        A list of non-empty, stripped snippet strings.
    """
    import textwrap

    fence_pattern = re.compile(r"```(?:[\w+\-]*)\n(.*?)```", re.DOTALL)

    # Extract triple-backtick code blocks
    code_snippets = list(fence_pattern.findall(text))

    # Everything below only looks at what is left once the fences are gone.
    remainder = fence_pattern.sub("\n", text)

    # Extract indented code blocks (4 spaces or tab)
    indented_blocks = re.findall(
        r"(?:^|\n)((?: {4}|\t).+(?:\n(?: {4}|\t).+)*)", remainder
    )
    # Dedent the whole block; stripping alone would only fix the first line.
    code_snippets.extend(textwrap.dedent(block) for block in indented_blocks)

    # Extract inline code snippets with single backticks
    code_snippets.extend(re.findall(r"`([^`\n]+)`", remainder))

    return [snippet.strip() for snippet in code_snippets if snippet.strip()]
|
105 |
+
|
106 |
|
107 |
+
# @st.cache_resource
|
108 |
+
def run_code_blocks(code_blocks,df):
    """Prepare generated code for execution (currently a stub).

    As written, the function only creates an in-memory text buffer and
    converts ``code_blocks`` to a string. Nothing is executed, ``df`` is
    never read, and the function implicitly returns ``None``.

    NOTE(review): the otherwise unused ``redirect_stdout`` import and
    ``buffer`` suggest the intent is to run the code with stdout captured
    and ``df`` in scope; confirm and complete.
    """
    import io
    from contextlib import redirect_stdout

    # Created but not used anywhere in the visible body.
    buffer = io.StringIO()
    # coder="".join(code_blocks)
    coder = str(code_blocks)
    # print(coder)
|
116 |
+
@st.cache_resource
def file_handler(file):
    """Load an uploaded tabular file into a pandas DataFrame.

    The decision is made on the text after the last ``.`` in ``file.name``,
    compared case-insensitively so that ``DATA.CSV`` is handled like
    ``data.csv``.

    Args:
        file: An uploaded-file object exposing ``.name`` and readable by
            ``pd.read_csv`` / ``pd.read_excel``.

    Returns:
        A DataFrame for ``csv`` and ``xlsx`` files, or ``None`` for any other
        extension (callers treat ``None`` as "not tabular data").
    """
    extension = file.name.rsplit(".", 1)[-1].lower()
    if extension == "csv":
        return pd.read_csv(file)
    if extension == "xlsx":
        return pd.read_excel(file)
    return None
|
131 |
+
def run_agent(prompt,df):
    """Build the code-generation prompt for a dataframe question.

    No model is called here: the function only assembles and returns the
    instruction text, which embeds the user's request, a short sample of
    ``df`` (via ``dataframe_info``) and one worked example.

    Args:
        prompt: The user's natural-language request.
        df: The dataframe the generated code will operate on.

    Returns:
        The complete prompt text as a single string.
    """
    # The previous version created an LLM client here that was never used;
    # it has been dropped because this function only builds text.
    pieces = [
        prompt + "\n",
        "\nAbove is the user request that has to be completed. \n",
        "Below is the dataframe sample provided . \n",
        "The dataframe is as follows:\n",
        dataframe_info(df) + "\n",
        "You are a senior pandas dataframe developer and you have to write code to complete the user request.\n",
        "Below are the instructions\n",
        "There is a variable name 'df' which is a dataframe and have values.\n",
        "write code using df to manipulate it and give result according to user instruction.\n",
        "No need to load the data 'df' is the required variable.\n",
        "Whole team told you that you no need to use pd.read data is already there in df.\n",
        "Since we are showing code output in streamlit not in terminal so code it properly. \n",
        "This is last warning as a ceo of the company, you have to return only required code as per user request.\n",
        "Example\n",
        "User request: 'show me the rows which has highest electricity_kwh_per_month\n",
        "```\nst.write(df[df['electricity_kwh_per_month'] == df['electricity_kwh_per_month'].max()])\n```\n",
        "You can see that above is the required code(in quotes) for user query(only required) and below is the next request.\n",
        # Must be an f-string: without the prefix the model receives the
        # literal text "{prompt}" instead of the actual request.
        f"User request: {prompt}\n",
        "Generate code for the above request only but write some code to full fill user query.\n",
    ]
    return "".join(pieces)
|
156 |
|
157 |
def send_prompt():
    """Return the fixed instruction sentence placed between context and question.

    The script concatenates this text after the retrieved context and before
    the user's own prompt when it builds the final request.
    """
    instruction = "please respond according to the prompt asked below from the above context"
    return instruction
|
|
|
330 |
if prompts_[-1]=="@working":
|
331 |
if index==0:
|
332 |
|
333 |
+
data_need=""
|
334 |
+
while(len(data_need)==0):
|
335 |
+
if len(prompts_)==3:
|
336 |
+
data_need = st.write_stream(consume_llm_api(prompts_[1]))
|
337 |
+
else:
|
338 |
+
data_need=st.write_stream(consume_llm_api(prompts_[0]))
|
339 |
+
|
340 |
+
|
341 |
dictionary['every_prompt_with_val'][-1]=(prompts_[0],str(data_need))
|
342 |
|
343 |
elif isinstance(prompts_[-1],str):
|
|
|
594 |
with column1:
|
595 |
# Create a canvas component
|
596 |
changes,implementation,current=st.columns([0.01,0.9,0.01])
|
597 |
+
|
598 |
model = encoding_model()
|
599 |
with implementation:
|
600 |
+
with st.spinner('Wait for it...'):
|
601 |
+
# pdf_file = st.file_uploader("Upload PDF file", type=('pdf'))
|
602 |
st.write("<br>"*3,unsafe_allow_html=True)
|
603 |
if bg_doc:
|
|
|
604 |
canvas_result=None
|
605 |
+
# st.write(bg_doc.name)
|
606 |
+
file_type = file_handler(bg_doc)
|
|
|
607 |
|
608 |
+
if isinstance(file_type,type(None)) :
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
609 |
|
610 |
+
with open(bg_doc.name, "wb") as f_work:
|
611 |
+
f_work.write(bg_doc.getbuffer())
|
612 |
+
|
613 |
+
data = process_pdf(bg_doc.name)
|
614 |
+
if str(data) not in dictionary['text_embeddings']:
|
615 |
+
dictionary['text_embeddings']={}
|
616 |
+
text_splitter = RecursiveCharacterTextSplitter(chunk_size=100, chunk_overlap=100)
|
617 |
+
chunks = text_splitter.split_documents(data)
|
618 |
+
|
619 |
+
dictionary['text_embeddings'][str(data)]={str(chunk.page_content):model.encode(str(chunk.page_content)) for chunk in chunks}
|
620 |
+
|
621 |
+
embeddings = [dictionary['text_embeddings'][str(data)][i] for i in dictionary['text_embeddings'][str(data)]]
|
622 |
+
vector_store = []
|
623 |
+
for i in dictionary['text_embeddings'][str(data)]:
|
624 |
+
vector_store.append((dictionary['text_embeddings'][str(data)][i],i))
|
625 |
+
else:
|
626 |
+
embeddings = [dictionary['text_embeddings'][str(data)][i] for i in dictionary['text_embeddings'][str(data)]]
|
627 |
+
vector_store = []
|
628 |
+
for i in dictionary['text_embeddings'][str(data)]:
|
629 |
+
vector_store.append((dictionary['text_embeddings'][str(data)][i],i))
|
630 |
else:
|
631 |
+
|
632 |
+
code_runner,code_check,data_frame = st.tabs(["π code runner", "code","π Chart"])
|
633 |
+
with data_frame:
|
634 |
+
file_type = st.data_editor(file_type,hide_index=True,use_container_width=True,num_rows='dynamic')
|
635 |
+
with code_check:
|
636 |
+
if len(dictionary['every_prompt_with_val'])!=0:
|
637 |
+
with st.form("code_form"):
|
638 |
+
code_new=extract_python_code(dictionary['every_prompt_with_val'][-1][-1])
|
639 |
+
code_new = "".join(code_new)
|
640 |
+
response = code_editor(code_new, lang="python", key="editor1",height=screen_height/4,allow_reset=True,response_mode="blur",focus=True)
|
641 |
+
print(response)
|
642 |
+
submitted = st.form_submit_button("Submit Code")
|
643 |
+
|
644 |
+
with code_runner:
|
645 |
+
if len(dictionary['every_prompt_with_val'])!=0 and submitted:
|
646 |
+
code_new = response.get('text')
|
647 |
+
print(code_new,response)
|
648 |
+
run_code_blocks(code_new,file_type)
|
649 |
+
elif len(dictionary['every_prompt_with_val'])!=0 :
|
650 |
+
code_new=extract_python_code(dictionary['every_prompt_with_val'][-1][-1])
|
651 |
+
code_new = "".join(code_new)
|
652 |
+
run_code_blocks(code_new,file_type)
|
653 |
+
else:
|
654 |
+
st.header("Please ask your query from data")
|
655 |
+
|
656 |
+
|
657 |
+
else:
|
658 |
|
659 |
canvas_result = st_canvas(
|
660 |
fill_color="rgba(0, 0, 0, 0.3)", # Fixed fill color with some opacity
|
661 |
stroke_width=stroke_width,
|
662 |
stroke_color=stroke_color,
|
663 |
background_color=bg_color,
|
664 |
+
background_image=gen_image if gen_image else Image.open("ALL_image_formation\image_gen.png"),
|
665 |
update_streamlit=True,
|
666 |
+
height=int(screen_height//2.16) if screen_height!=1180 else screen_height//2,
|
667 |
+
width=int(screen_width//2.3) if screen_width!=820 else screen_width//2,
|
668 |
drawing_mode=drawing_mode,
|
669 |
point_display_radius=point_display_radius if drawing_mode == 'point' else 0,
|
670 |
key="canvas",
|
671 |
)
|
672 |
+
# st.rerun()
|
673 |
|
674 |
|
675 |
|
|
|
682 |
|
683 |
# run=st.button("run_experiment")
|
684 |
if bg_doc:
|
685 |
+
if len(dictionary['every_prompt_with_val'])==0:
|
686 |
+
query_embedding = model.encode(["something"])
|
687 |
+
else:
|
688 |
+
|
689 |
+
query_embedding = model.encode([dictionary['every_prompt_with_val'][-1][0]])
|
690 |
+
if isinstance(file_type,type(None)) :
|
691 |
retrieved_chunks = max([(util.cos_sim(match[0],query_embedding),match[-1])for match in vector_store])[-1]
|
|
|
|
|
|
|
692 |
with implementation:
|
693 |
+
with st.spinner('Wait for it...'):
|
694 |
text_lookup=retrieved_chunks
|
695 |
pages=[]
|
696 |
buffer = bg_doc.getbuffer()
|
697 |
byte_data = bytes(buffer)
|
698 |
+
with fitz.open(stream=byte_data, filetype="pdf") as doc:
|
699 |
+
|
700 |
for page_no in range(doc.page_count):
|
701 |
pages.append(doc.load_page(page_no - 1))
|
702 |
|
|
|
703 |
with st.container(height=int(screen_height//1.8)):
|
704 |
for pg_no in pages[::-1]:
|
705 |
areas = pg_no.search_for(text_lookup)
|
706 |
for area in areas:
|
707 |
pg_no.add_rect_annot(area)
|
708 |
+
|
709 |
pix = pg_no.get_pixmap(dpi=100).tobytes()
|
710 |
+
st.image(pix,use_column_width=True)
|
711 |
|
712 |
if bg_doc and prompt:
|
713 |
+
|
714 |
query_embedding = model.encode([prompt])
|
715 |
+
if isinstance(file_type,type(None)) :
|
716 |
+
retrieved_chunks = max([(util.cos_sim(match[0],query_embedding),match[-1]) for match in vector_store])[-1]
|
|
|
717 |
|
718 |
+
|
719 |
+
prompt = "Context: "+ retrieved_chunks +"\n"+send_prompt()+ "\n"+prompt
|
720 |
+
|
721 |
+
modifiedValue="@working"
|
722 |
+
dictionary['every_prompt_with_val'].append((prompt,modifiedValue))
|
723 |
+
st.rerun()
|
724 |
+
else:
|
725 |
+
modifiedValue="@working"
|
726 |
+
new_prompt = run_agent(prompt,file_type)
|
727 |
+
dictionary['every_prompt_with_val'].append((prompt,new_prompt,modifiedValue))
|
728 |
+
st.rerun()
|
729 |
elif not bg_doc and canvas_result.image_data is not None:
|
730 |
if prompt:
|
731 |
|