Spaces:
Configuration error
Configuration error
Upload 17 files
Browse files- .gitattributes +1 -0
- .gitignore +11 -0
- README.md +46 -20
- agent/__pycache__/agent.cpython-311.pyc +0 -0
- agent/agent.py +62 -0
- app.py +34 -0
- app_debug.log +3 -0
- generator.py +8 -0
- main.py +89 -0
- my_faiss_index/index.faiss +3 -0
- my_faiss_index/index.pkl +3 -0
- processed_files.txt +18 -0
- requirements.txt +22 -3
- summerizer/imageSummerizer.py +17 -0
- summerizer/textSummerizer.py +26 -0
- utils/helper.py +107 -0
- vectorStore/__pycache__/vectorStore.cpython-311.pyc +0 -0
- vectorStore/vectorStore.py +92 -0
.gitattributes
CHANGED
@@ -33,3 +33,4 @@ saved_model/**/* filter=lfs diff=lfs merge=lfs -text
|
|
33 |
*.zip filter=lfs diff=lfs merge=lfs -text
|
34 |
*.zst filter=lfs diff=lfs merge=lfs -text
|
35 |
*tfevents* filter=lfs diff=lfs merge=lfs -text
|
|
|
|
33 |
*.zip filter=lfs diff=lfs merge=lfs -text
|
34 |
*.zst filter=lfs diff=lfs merge=lfs -text
|
35 |
*tfevents* filter=lfs diff=lfs merge=lfs -text
|
36 |
+
my_faiss_index/index.faiss filter=lfs diff=lfs merge=lfs -text
|
.gitignore
ADDED
@@ -0,0 +1,11 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
1 |
+
# Python-generated files
|
2 |
+
__pycache__/
|
3 |
+
*.py[oc]
|
4 |
+
build/
|
5 |
+
dist/
|
6 |
+
wheels/
|
7 |
+
*.egg-info
|
8 |
+
.env
|
9 |
+
|
10 |
+
# Virtual environments
|
11 |
+
.venv
|
README.md
CHANGED
@@ -1,20 +1,46 @@
|
|
1 |
-
|
2 |
-
|
3 |
-
|
4 |
-
|
5 |
-
|
6 |
-
|
7 |
-
|
8 |
-
|
9 |
-
|
10 |
-
|
11 |
-
|
12 |
-
|
13 |
-
|
14 |
-
|
15 |
-
|
16 |
-
|
17 |
-
|
18 |
-
|
19 |
-
|
20 |
-
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
1 |
+
## Backend
|
2 |
+
|
3 |
+
This folder contains the backend services for the Document Chat App.
|
4 |
+
|
5 |
+
### `app.py`
|
6 |
+
This file is the main entry point for the Streamlit web application. It handles the user interface, chat history management, and interacts with the `agent` to process user queries and generate responses.
|
7 |
+
|
8 |
+
### `main.py`
|
9 |
+
This script is responsible for processing documents. It loads and extracts data (tables, texts, images) from PDF files in the `data` directory, summarizes them using the `summerizer` module, and then chunks and adds the processed documents to the vector store. It keeps track of processed files in `processed_files.txt` to avoid reprocessing.
|
10 |
+
|
11 |
+
### `data/`
|
12 |
+
This directory is intended to store the raw PDF documents that need to be processed by the system.
|
13 |
+
|
14 |
+
### `vectorStore/`
|
15 |
+
This module contains `vectorStore.py`, which embeds the processed document chunks into a FAISS index (saved under `my_faiss_index/`) and provides `GetContext`, which the `agent` uses to retrieve relevant chunks during the chat.
|
16 |
+
|
17 |
+
### `agent/`
|
18 |
+
This module contains the logic for the conversational agent, which uses the vector store to answer questions based on the processed documents.
|
19 |
+
|
20 |
+
### `summerizer/`
|
21 |
+
This module provides functionalities for summarizing different types of content (text, images) extracted from the documents.
|
22 |
+
|
23 |
+
### `utils/`
|
24 |
+
This module contains utility functions, such as `helper.py` for loading and extracting data from documents.
|
25 |
+
|
26 |
+
### `tool/`
|
27 |
+
This module likely contains tools or functions used by the agent to perform specific tasks.
|
28 |
+
|
29 |
+
### `generator.py`
|
30 |
+
This file is a small command-line script: it sends a hard-coded sample query to `RunAgent` and prints the response, which is useful for testing the agent without starting the Streamlit app.
|
31 |
+
|
32 |
+
### How to Run
|
33 |
+
To run the backend application, you will typically run `app.py` using Streamlit after ensuring all dependencies are installed and documents are processed by `main.py`.
|
34 |
+
|
35 |
+
```bash
|
36 |
+
streamlit run app.py
|
37 |
+
```
|
38 |
+
|
39 |
+
### Running `main.py` for Document Embedding
|
40 |
+
To process and embed documents, run the `main.py` script. This script will load PDF files from the `data` directory, extract and summarize their contents, and then add them to the vector store.
|
41 |
+
|
42 |
+
```bash
|
43 |
+
python main.py
|
44 |
+
```
|
45 |
+
|
46 |
+
Make sure that the `data` directory contains the PDF files you want to process. The script will log processed files in `processed_files.txt` to avoid reprocessing them.
|
agent/__pycache__/agent.cpython-311.pyc
ADDED
Binary file (3.75 kB). View file
|
|
agent/agent.py
ADDED
@@ -0,0 +1,62 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
1 |
+
from agno.agent import Agent
|
2 |
+
from agno.agent import Agent, RunResponse
|
3 |
+
from agno.models.groq import Groq
|
4 |
+
from agno.tools.reasoning import ReasoningTools
|
5 |
+
from agno.tools.thinking import ThinkingTools
|
6 |
+
from vectorStore.vectorStore import GetContext
|
7 |
+
from agno.models.openai import OpenAIChat
|
8 |
+
|
9 |
+
|
10 |
+
def RunAgent(query):
    """
    Answer ``query`` with a context-restricted agent and return the reply.

    A new ``Agent`` is built on every call. It is given the ``GetContext``
    retrieval tool plus ``ReasoningTools`` and ``ThinkingTools``, and is backed
    by the ``gpt-4o`` ``OpenAIChat`` model.

    Args:
        query: The user's question.

    Returns:
        ``response.content`` of the agent run on success. If anything raises,
        the exception message is returned as a string instead, so the caller
        always receives something it can display.
    """
    # Function-scope import: the module did not import ``os`` before.
    import os

    try:
        agent = Agent(
            tools=[
                GetContext,
                ReasoningTools(add_instructions=True),
                ThinkingTools(add_instructions=True),
            ],
            description=(
                "This agent strictly processes user queries using ONLY the provided context. It must not use external knowledge or assumptions beyond the context. "
                "If the exact answer is not found, it must reason based on the available information to generate a helpful response. "
                "If reasoning is not possible from the given context, the agent must clearly state that it cannot answer the query and prompt the user to try a related query. "
                "At no point should the agent fabricate information or rely on knowledge not present in the provided context."
            ),
            instructions=[
                """
                Role:
                - You are an assistant representing <BOT_NAME>. Your job is to assist users strictly based on the provided context.

                Core Rules:
                1. Use ONLY the provided context to generate responses.
                2. DO NOT use any external knowledge, assumptions, prior training data, or general world knowledge.
                3. If the context does not provide a clear answer, try to infer a reasonable response *only within* the scope of the context.
                4. If a reasonable answer cannot be formed from the context, respond exactly with:
                "Apologies; I am not sure about that. Please head over to <SUPPORT_URL> for some additional help from our team."
                5. NEVER fabricate, guess, or hallucinate any information not clearly supported by the context.
                6. Do NOT suggest or imply you have access to any knowledge beyond the context.
                7. Include the resource/document name at the end of the response.

                Compliance:
                - This is a ZERO-TOLERANCE instruction set.
                - Any use of information outside the provided context is a strict violation.
                """
            ],
            show_tool_calls=True,
            markdown=True,
            # SECURITY: the API key must never be committed. It is read from
            # the OPENAI_API_KEY environment variable instead. The key that was
            # previously hard-coded here is exposed in version-control history
            # and should be revoked.
            model=OpenAIChat(id="gpt-4o", api_key=os.getenv("OPENAI_API_KEY")),
        )

        # Run Agent
        response: RunResponse = agent.run(query, stream=False, structured_outputs=True)

        return response.content
    except Exception as e:
        # Callers display the return value directly, so failures are reported
        # as text rather than raised.
        return str(e)


if __name__ == "__main__":
    query = "tell me about enery and climate changes?"
    response = RunAgent(query=query)
    print(">> Response: ",response)
|
app.py
ADDED
@@ -0,0 +1,34 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
1 |
+
import streamlit as st
|
2 |
+
from agent.agent import RunAgent
|
3 |
+
|
4 |
+
# Page chrome: wide layout and the app title.
st.set_page_config(page_title="Document Chat App", layout="wide")
st.title("📄 Document Chat App")


def _show_message(speaker, text):
    """Render one chat bubble labelled ``speaker`` with ``text`` as markdown."""
    with st.chat_message(name=speaker):
        st.markdown(text)


# Create the transcript list the first time it is needed.
if 'chat_history' not in st.session_state:
    st.session_state.chat_history = []

# Replay every (speaker, message) pair recorded so far.
for past_speaker, past_text in st.session_state.chat_history:
    _show_message(past_speaker, past_text)

# Prompt for the next question.
question = st.chat_input("Ask something about your document...")

if question:
    # Echo the question, ask the agent, then show its answer.
    _show_message("You", question)
    answer = RunAgent(query=question)
    _show_message("Bot", answer)

    # Record both turns, user first, so the replay above keeps the order.
    st.session_state.chat_history.extend([("You", question), ("Bot", answer)])
|
app_debug.log
ADDED
@@ -0,0 +1,3 @@
|
|
|
|
|
|
|
|
|
1 |
+
2025-06-16 00:17:48,085 - INFO - Session state initialized
|
2 |
+
2025-06-16 00:17:48,085 - INFO - Debug: App started successfully
|
3 |
+
2025-06-16 00:17:48,087 - INFO - Debug: Directories created successfully
|
generator.py
ADDED
@@ -0,0 +1,8 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
1 |
+
from agent.agent import RunAgent
|
2 |
+
|
3 |
+
if __name__ == "__main__":
    # Manual check: send one fixed sample question to the agent and print
    # whatever it returns.
    sample_query = """
    Explain how the crystalline structure of cellulose impacts its enzymatic hydrolysis. What methods are used to overcome this challenge?
    """
    print(">> Response: ", RunAgent(query=sample_query))
|
main.py
ADDED
@@ -0,0 +1,89 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
1 |
+
import os
|
2 |
+
import shutil
|
3 |
+
from utils.helper import LoadAndExtractData # Uncomment if you want to process files
|
4 |
+
from summerizer.imageSummerizer import Image_Summerizer
|
5 |
+
from summerizer.textSummerizer import TextSummerizer
|
6 |
+
from langchain_core.documents import Document
|
7 |
+
from langchain.text_splitter import RecursiveCharacterTextSplitter
|
8 |
+
from vectorStore.vectorStore import add_to_vector_store
|
9 |
+
|
10 |
+
def _load_processed_files(log_path):
    """Return the set of file names already listed in ``log_path``."""
    if not os.path.exists(log_path):
        return set()
    with open(log_path, 'r') as f:
        return set(f.read().splitlines())


def _checked_summaries(kind, items, summaries):
    """Return ``summaries`` after checking there is exactly one per item.

    The summarizers can hand back an error string or ``None`` instead of a
    list. Indexing such a value would store single characters as summaries or
    fail with an unrelated error, so it is rejected here with a clear message.
    """
    if not items:
        return []
    if not isinstance(summaries, list) or len(summaries) != len(items):
        raise ValueError(f"{kind} summarization failed: {summaries!r}")
    return summaries


def _table_content(table):
    """Return the table's HTML representation, or its plain text as fallback."""
    html = getattr(getattr(table, "metadata", None), "text_as_html", None)
    return html or str(table)


def _process_file(file_path, file, document_splitter):
    """Extract, summarize, chunk and index one PDF.

    Returns:
        True when chunks were handed to the vector store, False when nothing
        was extracted from the file.

    Raises:
        RuntimeError: extraction reported an error.
        ValueError: a summarizer did not return one summary per item.
    """
    tables, texts, images = LoadAndExtractData(file_path)
    if isinstance(images, str):
        # LoadAndExtractData reports failure by putting the error text in the
        # images slot; treating it as image data would index garbage.
        raise RuntimeError(f"data extraction failed: {images}")

    print(">> Generating Summaries ")
    text_summary = _checked_summaries("Text", texts, TextSummerizer(data=texts))
    tables_summary = _checked_summaries("Table", tables, TextSummerizer(data=tables))
    images_summary = _checked_summaries("Image", images, Image_Summerizer(data=images))

    print("Text Sumary: ",text_summary)
    print("Table Summary: ",tables_summary)
    print("Image Susmmary: ",images_summary)

    print(">> Summary Generated")

    print(">> Combine Each and every thing into one document")
    # Document objects for text chunks.
    text_docs = [
        Document(
            page_content=str(text),
            metadata={"type": "text", "summary": summary, "source": file_path, "name": file},
        )
        for text, summary in zip(texts, text_summary)
    ]

    # Document objects for tables. page_content must be a string, so the HTML
    # representation (or the plain text) is used rather than the element.
    table_docs = [
        Document(
            page_content=_table_content(table),
            metadata={"type": "table", "summary": summary, "source": file_path, "name": file},
        )
        for table, summary in zip(tables, tables_summary)
    ]

    # Document objects for images.
    # NOTE(review): page_content is the raw base64 payload, so that is what
    # gets chunked and embedded; consider indexing the summary instead.
    image_docs = [
        Document(
            page_content=image,
            metadata={"type": "image", "summary": summary, "source": file_path, "name": file},
        )
        for image, summary in zip(images, images_summary)
    ]

    docs = text_docs + table_docs + image_docs
    if not docs:
        return False

    print(">> Splitting Documents")
    docs_chunks = document_splitter.split_documents(docs)
    print(">> Splitting Done")

    add_to_vector_store(docs_chunks=docs_chunks)
    return True


def main():
    """Index every not-yet-processed PDF found in the ``data`` directory.

    Each PDF is extracted, summarized, split into chunks and added to the
    vector store. Its name is then appended to ``processed_files.txt`` so
    later runs skip it. A failure on one file is reported and the run moves
    on to the next file; failed or empty files are not marked as processed.

    Returns:
        None when every file succeeded, otherwise a string describing the
        error(s) encountered.
    """
    errors = []
    try:
        root_dir = "data"
        processed_log_path = "processed_files.txt"

        processed_files = _load_processed_files(processed_log_path)

        files = os.listdir(root_dir)
        print(">> Files: ",files)
        print(">> Process Files: ",processed_files)
        print(">> Processing Files ")

        # The splitter does not depend on the file, so build it once.
        document_splitter = RecursiveCharacterTextSplitter(
            chunk_size=1000,
            chunk_overlap=200,
            length_function=len,
            is_separator_regex=False,
        )

        for file in files:
            if file in processed_files or not file.lower().endswith('.pdf'):
                print(f"!! Skipping already processed or unsupported file: {file}")
                continue

            file_path = os.path.join(root_dir, file)
            print(f">> Processing: {file}")
            try:
                indexed = _process_file(file_path, file, document_splitter)
            except Exception as e:
                print("Error is:", str(e))
                errors.append(f"{file}: {e}")
                continue

            if not indexed:
                print(f"!! Nothing extracted from {file}; not marking it as processed")
                continue

            # Append to the log only after the chunks reached the vector store.
            with open(processed_log_path, 'a') as f:
                f.write(file + '\n')

            print(f">> Marked {file} as processed")

    except Exception as e:
        print("Error is:", str(e))
        return str(e)

    return "; ".join(errors) if errors else None


if __name__ == "__main__":
    main()
|
my_faiss_index/index.faiss
ADDED
@@ -0,0 +1,3 @@
|
|
|
|
|
|
|
|
|
1 |
+
version https://git-lfs.github.com/spec/v1
|
2 |
+
oid sha256:c04a5dd5bbd2a99da72a7ab0084626a7501a920cc1e05034fc3449ecb206330e
|
3 |
+
size 56463405
|
my_faiss_index/index.pkl
ADDED
@@ -0,0 +1,3 @@
|
|
|
|
|
|
|
|
|
1 |
+
version https://git-lfs.github.com/spec/v1
|
2 |
+
oid sha256:e088d4825bc2685515fe7ef79aff669292ad3652c9ffbb296f6f7d87827cf00c
|
3 |
+
size 10708178
|
processed_files.txt
ADDED
@@ -0,0 +1,18 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
1 |
+
AMMONIA PRODUCTION TECHNOLOGIES CAP 8 STORAGE.pdf
|
2 |
+
Ammonias-Double-Edged-Sword-Clean-Energy-or-Catastrophic-Risk.pdf
|
3 |
+
Waste to Fuel in NY.pdf
|
4 |
+
Charting-a-Greener-Course-Embrace-CCS-on-Maritime-Vessels.pdf
|
5 |
+
wind-assisted ship propulsion.pdf
|
6 |
+
Clean Energy Market Analysis in the US.pdf
|
7 |
+
Clean investment US.pdf
|
8 |
+
emission-factors_2014.pdf
|
9 |
+
Green Hydrogen the Race to Success-Members.pdf
|
10 |
+
GREENHOUSE GAS EMISSIONS FROM BIo ETHANOL AND BIO-DIESEL FUEL SUPPLY.pdf
|
11 |
+
Hydrogen Bunkering at Ports by Eliseo Curcio.pdf
|
12 |
+
Cheat Sheet Hydrogen (1).pdf
|
13 |
+
Comparative Life Cycle Assessment of Bioethanol Production from Different Generations ofBiomass and Waste Feedstocks.pdf
|
14 |
+
Comparison of biofuel life-cycle GHG emissions assessment tools.pdf
|
15 |
+
Green-Hydrogen-The-Race-to-Success.pdf
|
16 |
+
LCFS Vs RFS.pdf
|
17 |
+
Post strategy.pdf
|
18 |
+
Review of Second Generation Bioethanol Production.pdf
|
requirements.txt
CHANGED
@@ -1,3 +1,22 @@
|
|
1 |
-
|
2 |
-
|
3 |
-
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
1 |
+
unstructured[all-docs]
|
2 |
+
pillow
|
3 |
+
lxml
|
4 |
+
pillow
|
5 |
+
tiktoken
|
6 |
+
langchain
|
7 |
+
langchain-community
|
8 |
+
langchain-openai
|
9 |
+
langchain-groq
|
10 |
+
python_dotenv
|
11 |
+
pymilvus[model]
|
12 |
+
pymilvus
|
13 |
+
transformers
|
14 |
+
torch
|
15 |
+
poppler-utils
|
16 |
+
tesseract
|
17 |
+
pytesseract
|
18 |
+
langchain-groq
|
19 |
+
langchain-milvus
|
20 |
+
agno
|
21 |
+
streamlit
|
22 |
+
faiss-cpu
|
summerizer/imageSummerizer.py
ADDED
@@ -0,0 +1,17 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
1 |
+
# Image Summerizer
|
2 |
+
from utils.helper import Summarizer
|
3 |
+
|
4 |
+
# Default instruction sent with every image.
prompt_template = """Describe the image in detail. For context,
    the image is part of a research paper.
    Be specific about graphs, such as bar plots."""


def Image_Summerizer(prompt_template =prompt_template,data=None):
    """
    Summarize a collection of images with ``Summarizer``.

    Args:
        prompt_template: Instruction text sent with each image.
        data: Images to describe, forwarded unchanged to ``Summarizer``
            (assumed to be base64 strings — TODO confirm with the caller).

    Returns:
        ``[]`` when ``data`` is empty or ``None`` (no request is made),
        otherwise whatever ``Summarizer`` returns. If the call raises, the
        error is printed and ``None`` is returned.
    """
    if not data:
        return []
    try:
        return Summarizer(prompt_template=prompt_template,data=data,config=False,set_messages=True)
    except Exception as e:
        # Report the failure instead of swallowing it silently.
        print("Error is: ", str(e))
        return None


if __name__ == "__main__":
    pass
|
summerizer/textSummerizer.py
ADDED
@@ -0,0 +1,26 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
1 |
+
# Text Summerize
|
2 |
+
from utils.helper import Summarizer
|
3 |
+
|
4 |
+
# define the pronpt
|
5 |
+
prompt_text = """
You are an assistant tasked with summarizing tables and text.
Give a concise summary of the table or text.

Respond only with the summary, no additionnal comment.
Do not start your message by saying "Here is a summary" or anything like that.
Just give the summary as it is.

Table or text chunk: {element}

"""


def TextSummerizer(prompt_template =prompt_text,data=None):
    """
    Summarize a collection of text or table chunks with ``Summarizer``.

    Args:
        prompt_template: Prompt containing an ``{element}`` placeholder that
            receives each chunk.
        data: Chunks to summarize, forwarded unchanged to ``Summarizer``.

    Returns:
        ``[]`` when ``data`` is empty or ``None`` (no request is made),
        otherwise whatever ``Summarizer`` returns. If the call raises, the
        error is printed and ``None`` is returned.
    """
    if not data:
        return []
    try:
        return Summarizer(prompt_template=prompt_template,data=data,config=True,set_messages=False)
    except Exception as e:
        # Report the failure instead of swallowing it silently.
        print("Error is: ", str(e))
        return None


if __name__ == "__main__":
    # The data is summarized as a batch, so the sample text is wrapped in a
    # list; a bare string would be treated as a sequence of characters.
    print(TextSummerizer(data=["""
    To download and use Poppler as a Python library (or make it accessible to Python), follow these steps based on your operating system. Poppler is not a Python package—it's a C++ PDF rendering library with command-line tools like pdfinfo, pdftotext, and others, which Python libraries like unstructured or pdf2image call internally.
    """]))
|
utils/helper.py
ADDED
@@ -0,0 +1,107 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
1 |
+
from unstructured.partition.pdf import partition_pdf
|
2 |
+
from langchain_openai import ChatOpenAI
|
3 |
+
from langchain_core.prompts import ChatPromptTemplate
|
4 |
+
from langchain_core.output_parsers import StrOutputParser
|
5 |
+
from dotenv import load_dotenv
|
6 |
+
load_dotenv()
|
7 |
+
|
8 |
+
|
9 |
+
def get_images_base64(chunks):
    """
    Collect the base64 payloads of images nested inside composite chunks.

    Only chunks whose type name contains ``CompositeElement`` are inspected.
    Within each one, every original element whose type name contains
    ``Image`` contributes its ``metadata.image_base64`` value.

    Args:
        chunks: Iterable of partitioned elements.

    Returns:
        List of base64 strings in encounter order. Images without a payload
        and composite chunks without original elements are skipped, so the
        list never contains ``None``.
    """
    images_b64 = []
    for chunk in chunks:
        if "CompositeElement" not in str(type(chunk)):
            continue
        # orig_elements may be missing or None; treat that as "no elements".
        chunk_els = getattr(chunk.metadata, "orig_elements", None) or []
        for el in chunk_els:
            if "Image" not in str(type(el)):
                continue
            payload = getattr(el.metadata, "image_base64", None)
            if payload:
                images_b64.append(payload)
    return images_b64
|
18 |
+
|
19 |
+
|
20 |
+
def LoadAndExtractData(file_path):
    """
    Partition a PDF and split the result into tables, texts and images.

    Args:
        file_path: Path of the PDF handed to ``partition_pdf``.

    Returns:
        A ``(tables, texts, images)`` tuple:
        - ``tables``: chunks whose type name contains ``Table``;
        - ``texts``: chunks whose type name contains ``CompositeElement``;
        - ``images``: the result of ``get_images_base64`` on all chunks.
        If anything raises, the error is printed and three empty lists are
        returned, so every slot always has the same type.
    """
    try:
        print(">> Extracting Data")
        data = partition_pdf(
            filename=file_path,
            infer_table_structure=True,            # extract tables
            # NOTE(review): strategy="hi_res" was left disabled here although
            # the original comment called it mandatory for table inference —
            # confirm which strategy is actually selected.

            extract_image_block_types=["Image"],
            extract_image_block_to_payload=True,   # keep images as base64 in element metadata

            chunking_strategy="by_title",          # or 'basic'
            max_characters=10000,
            combine_text_under_n_chars=2000,
            new_after_n_chars=6000,
        )

        # Separate the tables from the texts.
        print(">> Extracting Text and tables...")
        tables = [chunk for chunk in data if "Table" in str(type(chunk))]
        texts = [chunk for chunk in data if "CompositeElement" in str(type(chunk))]
        print(">> Chunks are: ",data)

        # Extract the images.
        print(">> Extracting Images...")
        images = get_images_base64(data)
        return tables, texts, images
    except Exception as e:
        print("Error is: ",str(e))
        # The images slot used to carry the error text, which callers would
        # then process as if it were image data.
        return [], [], []
|
60 |
+
|
61 |
+
|
62 |
+
|
63 |
+
# Summarizer Function
|
64 |
+
def Summarizer(prompt_template, data, config=True, set_messages=False):
    """
    Summarize a batch of inputs with a prompt and the ``gpt-4o-mini`` chat model.

    Args:
        prompt_template (str): Prompt text. In text mode it must contain an
            ``{element}`` placeholder; in image mode it is sent as-is next to
            the image.
        data: Sequence of inputs, one summary is produced per item. In image
            mode each item is inserted into a ``data:image/jpeg;base64,...``
            URL.
        config (bool): When true, the batch runs with ``max_concurrency`` 3.
        set_messages (bool): When true, use image mode (a user message holding
            the prompt text plus an image); otherwise use text mode.

    Returns:
        List of summaries, or ``[]`` when ``data`` is empty or ``None`` (no
        model is created in that case). On failure the error is printed and
        its message is returned as a ``str`` — callers must check for that
        before indexing the result.
    """
    if not data:
        return []

    try:
        if set_messages:
            messages = [
                (
                    "user",
                    [
                        {"type": "text", "text": prompt_template},
                        {
                            "type": "image_url",
                            "image_url": {"url": "data:image/jpeg;base64,{image}"},
                        },
                    ],
                )
            ]
            prompt = ChatPromptTemplate.from_messages(messages)
            input_key = "image"
        else:
            prompt = ChatPromptTemplate.from_template(prompt_template)
            input_key = "element"

        # Both modes share the same model and chain shape; only the prompt and
        # the name of the input variable differ.
        model = ChatOpenAI(temperature=0.5, model="gpt-4o-mini")
        summarize_chain = {input_key: lambda x: x} | prompt | model | StrOutputParser()

        if config:
            return summarize_chain.batch(data, {"max_concurrency": 3})
        return summarize_chain.batch(data)
    except Exception as e:
        print("Error is: ", str(e))
        return str(e)
|
107 |
+
|
vectorStore/__pycache__/vectorStore.cpython-311.pyc
ADDED
Binary file (4.28 kB). View file
|
|
vectorStore/vectorStore.py
ADDED
@@ -0,0 +1,92 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
1 |
+
from pymilvus import MilvusClient
|
2 |
+
# from langchain_milvus import Milvus
|
3 |
+
from langchain_openai import OpenAIEmbeddings
|
4 |
+
from langchain.docstore.document import Document
|
5 |
+
from tqdm import tqdm
|
6 |
+
from dotenv import load_dotenv
|
7 |
+
from typing import List
|
8 |
+
import os
|
9 |
+
|
10 |
+
load_dotenv()
|
11 |
+
|
12 |
+
# SECURITY: never commit the API key. With no explicit key, OpenAIEmbeddings
# takes it from the OPENAI_API_KEY environment variable, which load_dotenv()
# above can populate from a local .env file. The key that was previously
# hard-coded here is exposed in version-control history and should be revoked.
embeddings = OpenAIEmbeddings()
|
13 |
+
|
14 |
+
from typing import List
|
15 |
+
|
16 |
+
# =============Fais Setup============#
|
17 |
+
from typing import List
|
18 |
+
from langchain_core.documents import Document
|
19 |
+
import faiss
|
20 |
+
from langchain_community.docstore.in_memory import InMemoryDocstore
|
21 |
+
from langchain_community.vectorstores import FAISS
|
22 |
+
from uuid import uuid4
|
23 |
+
from tqdm import tqdm
|
24 |
+
|
25 |
+
|
26 |
+
def add_to_vector_store(docs_chunks: List[Document],batch_size:int = 64,vector_store_path = "my_faiss_index"):
    """
    Embed document chunks and persist them in a FAISS vector store.

    An existing index under ``vector_store_path`` is loaded and extended;
    otherwise a new flat L2 index is created. The store is saved back to
    ``vector_store_path`` after all batches are inserted.

    Args:
        docs_chunks (List[Document]): LangChain documents to embed and insert.
        batch_size (int): Number of documents inserted per ``add_documents``
            call.
        vector_store_path (str): Folder the index is loaded from and saved to.

    Returns:
        dict: ``{"status": "success", "vector_store": <FAISS store>,
        "num_documents": <len(docs_chunks)>}``.
    """
    print(f">> Starting embedding for {len(docs_chunks)} documents...\n")

    # Look for the index file itself: a folder that exists but holds no index
    # (e.g. after an interrupted run) must not be passed to load_local.
    index_file = os.path.join(vector_store_path, "index.faiss")
    if os.path.exists(index_file):
        print(">> Loading the index <<")
        # NOTE(security): this unpickles index.pkl. Only load index folders
        # produced by this application, never files from untrusted sources.
        vector_store = FAISS.load_local(vector_store_path, embeddings,allow_dangerous_deserialization=True)
    else:
        print(">> Creating the index <<")
        # Size the index from the dimensionality of one sample embedding.
        dimension = len(embeddings.embed_query("hello world"))
        index = faiss.IndexFlatL2(dimension)
        vector_store = FAISS(
            embedding_function=embeddings,
            index=index,
            docstore=InMemoryDocstore(),
            index_to_docstore_id={},
        )

    # One unique ID per document.
    uuids = [str(uuid4()) for _ in docs_chunks]

    print(f"\n📦 Preparing to insert {len(docs_chunks)} documents into FAISS...\n")
    # Insert in batches so progress is visible and each request stays bounded.
    for i in tqdm(range(0, len(docs_chunks), batch_size), desc="🔍 Embedding & Inserting", unit="batch"):
        batch_docs = docs_chunks[i:i+batch_size]
        batch_ids = uuids[i:i+batch_size]

        vector_store.add_documents(documents=batch_docs, ids=batch_ids)

    vector_store.save_local(vector_store_path)
    print("✅ Data insertion successful!\n")

    return {
        "status": "success",
        "vector_store": vector_store,
        "num_documents": len(docs_chunks)
    }
|
75 |
+
|
76 |
+
def GetContext(query:str):
    """
    Look up the stored document chunks most similar to ``query``.

    The FAISS index is loaded from the ``my_faiss_index`` folder on every call
    and searched for the two nearest chunks.

    Args:
        query: Text to search for.

    Returns:
        dict: ``{"Context": [<matching documents>]}``.
    """
    store = FAISS.load_local(
        "my_faiss_index",
        embeddings,
        allow_dangerous_deserialization=True,
    )
    matches = store.similarity_search(query, k=2)
    return {"Context": matches}


if __name__ == "__main__":
    pass
|