LalitMahale
commited on
Commit
·
f58e385
1
Parent(s):
1c2bd30
summarization api added
Browse files- .env +1 -0
- .gitignore +2 -1
- all_answers.pkl +0 -0
- all_merged_list.pkl +0 -0
- all_mix_embedding.pkl +0 -0
- app.py +23 -12
- process.py +1 -1
- requirements.txt +3 -0
- utils/convert_embedding.py +21 -21
- utils/rag.py +33 -33
- utils/summary.py +39 -0
.env
CHANGED
@@ -1,3 +1,4 @@
|
|
1 |
GOOGLE_API = 'AIzaSyB3wI2r6ZgQnYQ3V39PX5S0zWSRqy5ldYw'
|
2 |
TOKEN = "AIzaSyB3wI2r6ZgQnYQ3V39PX5S0zWSRqy5ldYw_Lalit"
|
3 |
GROQ_API = "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"
|
|
|
|
1 |
GOOGLE_API = 'AIzaSyB3wI2r6ZgQnYQ3V39PX5S0zWSRqy5ldYw'
|
2 |
TOKEN = "AIzaSyB3wI2r6ZgQnYQ3V39PX5S0zWSRqy5ldYw_Lalit"
|
3 |
GROQ_API = "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"
|
4 |
+
MODEL = "llama-3.3-70b-versatile"
|
.gitignore
CHANGED
@@ -1,2 +1,3 @@
|
|
1 |
__py*
|
2 |
-
utils/__py*
|
|
|
|
1 |
__py*
|
2 |
+
utils/__py*
|
3 |
+
uploads*
|
all_answers.pkl
DELETED
Binary file (53.4 kB)
|
|
all_merged_list.pkl
DELETED
Binary file (71.6 kB)
|
|
all_mix_embedding.pkl
DELETED
Binary file (708 kB)
|
|
app.py
CHANGED
@@ -11,6 +11,7 @@ import base64
|
|
11 |
from pathlib import Path
|
12 |
from process import Response
|
13 |
import uuid
|
|
|
14 |
load_dotenv()
|
15 |
# Create the FastAPI app instance
|
16 |
os.makedirs("/tmp/huggingface_cache", exist_ok=True)
|
@@ -118,34 +119,44 @@ async def audio_chat(audio: UploadFile = File(...), token: str = ""):
|
|
118 |
|
119 |
|
120 |
# Request model
|
|
|
|
|
|
|
121 |
class FileUploadRequest(BaseModel):
|
|
|
122 |
filename: str
|
123 |
content_type: str
|
124 |
base64_file: str
|
125 |
|
126 |
-
UPLOAD_DIR = "/tmp/uploads"
|
127 |
-
Path(UPLOAD_DIR).mkdir(parents=True, exist_ok=True)
|
128 |
-
|
129 |
@app.post("/summarizer")
|
130 |
async def upload_base64(file_data: FileUploadRequest):
|
131 |
try:
|
132 |
-
|
133 |
-
file_path = os.path.join(UPLOAD_DIR, file_data.filename)
|
134 |
|
135 |
with open(file_path, "wb") as f:
|
136 |
f.write(base64.b64decode(file_data.base64_file))
|
137 |
-
|
138 |
-
|
139 |
-
|
140 |
-
|
141 |
-
|
142 |
-
|
143 |
-
|
|
|
|
|
|
|
|
|
|
|
|
|
144 |
|
145 |
except Exception as e:
|
|
|
|
|
146 |
raise HTTPException(status_code=500, detail=str(e))
|
147 |
|
|
|
148 |
|
|
|
149 |
@app.post("/upload")
|
150 |
async def upload_file(req: UploadRequest):
|
151 |
session_id = str(uuid.uuid4())
|
|
|
11 |
from pathlib import Path
|
12 |
from process import Response
|
13 |
import uuid
|
14 |
+
from utils.summary import Summary
|
15 |
load_dotenv()
|
16 |
# Create the FastAPI app instance
|
17 |
os.makedirs("/tmp/huggingface_cache", exist_ok=True)
|
|
|
119 |
|
120 |
|
121 |
# Request model
|
122 |
+
# Directory (relative to the process CWD) where uploaded files are persisted.
UPLOAD_DIR = "uploads"
Path(UPLOAD_DIR).mkdir(parents=True, exist_ok=True)


class FileUploadRequest(BaseModel):
    # Payload for /summarizer: the file travels base64-encoded inside JSON.
    token: str
    filename: str
    content_type: str
    base64_file: str


@app.post("/summarizer")
async def upload_base64(file_data: FileUploadRequest):
    """Decode the uploaded file to disk and summarize it if it is a PDF.

    Returns {"text": <summary>} for PDFs, or {"text": "<type> not supported"}
    for any other content type. Any unexpected failure is logged with a full
    traceback and surfaced as HTTP 500.
    """
    try:
        # basename() strips directory components so a crafted filename
        # (e.g. "../../etc/passwd") cannot escape UPLOAD_DIR.
        safe_name = os.path.basename(file_data.filename)
        file_path = os.path.join(os.getcwd(), UPLOAD_DIR, safe_name)

        with open(file_path, "wb") as f:
            f.write(base64.b64decode(file_data.base64_file))

        # Guard clause: only PDFs are summarizable for now.
        if file_data.content_type != "application/pdf":
            return {"text": f"{file_data.content_type} not supported"}

        # One Summary instance (one ChatGroq client) instead of three.
        summarizer = Summary()
        doc = summarizer.load_doc(doc_path=file_path)
        chunks, size = summarizer.doc_chunk(docs_path=doc)
        summary = summarizer.get_summary(document=chunks, len_document=size)
        print(summary)
        return {"text": summary}

    except Exception as e:
        import traceback
        traceback.print_exc()  # show the full error in the server terminal
        raise HTTPException(status_code=500, detail=str(e))
|
156 |
|
157 |
+
|
158 |
|
159 |
+
## RAG Chatbot
|
160 |
@app.post("/upload")
|
161 |
async def upload_file(req: UploadRequest):
|
162 |
session_id = str(uuid.uuid4())
|
process.py
CHANGED
@@ -13,7 +13,7 @@ class Response:
|
|
13 |
res = self.client.chat.completions.create(
|
14 |
messages=[
|
15 |
{"role":"system",
|
16 |
-
"content":f"You are a Question answer chatbot. You have to understand the given content based on that provide answer. If don't know tell unable to get details. only give answers like your are lalit. do not provide
|
17 |
"role": "user",
|
18 |
"content": f"Content : {mytext}\n\n Question : {query}",
|
19 |
}
|
|
|
13 |
res = self.client.chat.completions.create(
|
14 |
messages=[
|
15 |
{"role":"system",
|
16 |
+
"content":f"You are a Question answer chatbot. You have to understand the given content based on that provide answer. If don't know tell unable to get details. only give answers like your are lalit. Provide response required for user question do not provide additional context.",
|
17 |
"role": "user",
|
18 |
"content": f"Content : {mytext}\n\n Question : {query}",
|
19 |
}
|
requirements.txt
CHANGED
@@ -8,3 +8,6 @@ langchain-google-genai
|
|
8 |
faster_whisper
|
9 |
groq==0.28.0
|
10 |
python-dotenv==1.1.0
|
|
|
|
|
|
|
|
8 |
faster_whisper
|
9 |
groq==0.28.0
|
10 |
python-dotenv==1.1.0
|
11 |
+
langchain-groq==0.3.4
|
12 |
+
pymupdf==1.26.1
|
13 |
+
transformers==4.53.0
|
utils/convert_embedding.py
CHANGED
@@ -1,24 +1,24 @@
|
|
1 |
-
from sentence_transformers import SentenceTransformer
|
2 |
|
3 |
-
class GetEmbedding:
|
4 |
-
|
5 |
-
|
6 |
-
|
7 |
-
|
8 |
-
|
9 |
-
|
10 |
-
|
11 |
-
|
12 |
-
|
13 |
|
14 |
-
|
15 |
-
|
16 |
-
|
17 |
-
|
18 |
-
|
19 |
-
|
20 |
-
|
21 |
|
22 |
-
if __name__ == "__main__":
|
23 |
-
|
24 |
-
|
|
|
1 |
+
# from sentence_transformers import SentenceTransformer
|
2 |
|
3 |
+
# class GetEmbedding:
|
4 |
+
# def __init__(self,data:list):
|
5 |
+
# self.data = data
|
6 |
+
# def user_query_emb(self,model_name:str = 'paraphrase-MiniLM-L6-v2'):
|
7 |
+
# try:
|
8 |
+
# model = SentenceTransformer(model_name_or_path=model_name)
|
9 |
+
# embedding = model.encode(self.data)
|
10 |
+
# return embedding
|
11 |
+
# except Exception as e:
|
12 |
+
# print(e)
|
13 |
|
14 |
+
# def convert_data(self,model_name:str = 'paraphrase-MiniLM-L6-v2'):
|
15 |
+
# try:
|
16 |
+
# model = SentenceTransformer(model_name)
|
17 |
+
# embeddings = model.encode(self.data)
|
18 |
+
# return embeddings
|
19 |
+
# except Exception as e:
|
20 |
+
# print(e)
|
21 |
|
22 |
+
# if __name__ == "__main__":
|
23 |
+
# emb = GetEmbedding("lalit")
|
24 |
+
# print( emb)
|
utils/rag.py
CHANGED
@@ -1,42 +1,42 @@
|
|
1 |
-
from langchain_google_genai import GoogleGenerativeAI
|
2 |
-
import requests
|
3 |
-
from bs4 import BeautifulSoup
|
4 |
-
from dotenv import load_dotenv
|
5 |
-
import os
|
6 |
-
load_dotenv()
|
7 |
|
8 |
|
9 |
-
class RAG:
|
10 |
-
|
11 |
-
|
12 |
-
|
13 |
|
14 |
|
15 |
-
|
16 |
-
|
17 |
-
|
18 |
-
|
19 |
-
|
20 |
-
|
21 |
-
|
22 |
-
|
23 |
-
|
24 |
-
|
25 |
-
|
26 |
-
|
27 |
-
|
28 |
|
29 |
-
|
30 |
|
31 |
|
32 |
-
|
33 |
-
|
34 |
-
|
35 |
-
|
36 |
-
|
37 |
-
|
38 |
|
39 |
|
40 |
-
if __name__ == "__main__":
|
41 |
-
|
42 |
-
|
|
|
1 |
+
# from langchain_google_genai import GoogleGenerativeAI
|
2 |
+
# import requests
|
3 |
+
# from bs4 import BeautifulSoup
|
4 |
+
# from dotenv import load_dotenv
|
5 |
+
# import os
|
6 |
+
# load_dotenv()
|
7 |
|
8 |
|
9 |
+
# class RAG:
|
10 |
+
# def __init__(self):
|
11 |
+
# self.url = 'https://lalitmahale.github.io'
|
12 |
+
# self.llm = GoogleGenerativeAI(google_api_key=os.getenv("GOOGLE_API"),model="gemini-1.5-pro")
|
13 |
|
14 |
|
15 |
+
# def get_data(self):
|
16 |
+
# try:
|
17 |
+
# res = requests.get(self.url)
|
18 |
+
# soup = BeautifulSoup(res.content, "html.parser")
|
19 |
+
# return soup.get_text()
|
20 |
+
# except Exception as e:
|
21 |
+
# print(e)
|
22 |
+
|
23 |
+
# def clean_text(self):
|
24 |
+
# return self.get_data().replace("\n","")
|
25 |
+
|
26 |
+
# def prompt(self):
|
27 |
+
# return """You are a helpfull assistant for me and Your name is lalit mahale. understand the below context and give answer for user question.
|
28 |
|
29 |
+
# context : {context}\n\nQuestion : {question}\n\nGive proper answer for this questions."""
|
30 |
|
31 |
|
32 |
+
# def pipeline(self,query):
|
33 |
+
# try:
|
34 |
+
# prompt = self.prompt().format(context = self.clean_text(),question = query)
|
35 |
+
# return self.llm.invoke(prompt)
|
36 |
+
# except Exception as e:
|
37 |
+
# print(e)
|
38 |
|
39 |
|
40 |
+
# if __name__ == "__main__":
|
41 |
+
# res = RAG().pipeline("who is lalit mahale")
|
42 |
+
# print(res)
|
utils/summary.py
ADDED
@@ -0,0 +1,39 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
1 |
+
import os
from dotenv import load_dotenv
from langchain_groq import ChatGroq
from langchain_community.document_loaders import PyMuPDFLoader
from langchain.text_splitter import RecursiveCharacterTextSplitter
from langchain.chains.summarize import load_summarize_chain

load_dotenv()


class Summary:
    """Summarize PDF documents with a Groq-hosted LLM via LangChain."""

    def __init__(self):
        # GROQ_API / MODEL are read from the environment (.env loaded above).
        # NOTE(review): if either variable is unset, ChatGroq receives None —
        # confirm the deployment always provides both.
        self.model = ChatGroq(api_key=os.getenv("GROQ_API"), model=os.getenv("MODEL"))

    def load_doc(self, doc_path: str):
        """Load the PDF at *doc_path*.

        Returns the list of LangChain Documents, or None if loading fails
        (best-effort: the error is printed, not raised).
        """
        try:
            print("doc-path :", doc_path)
            return PyMuPDFLoader(file_path=doc_path).load()
        except Exception as e:
            print(e)

    def doc_chunk(self, docs_path: list, CHUNK_SIZE: int = 3000, CHUNK_OVERLAP: int = 100):
        """Split loaded documents into overlapping text chunks.

        *docs_path* is the Document list from load_doc (parameter name kept
        for caller compatibility). Returns (chunks, number_of_chunks).
        """
        splitter = RecursiveCharacterTextSplitter(chunk_size=CHUNK_SIZE, chunk_overlap=CHUNK_OVERLAP)
        chunks = splitter.split_documents(docs_path)
        return chunks, len(chunks)

    def get_summary(self, document: list, len_document: int):
        """Summarize *document* chunks with the configured model.

        Uses the 'stuff' chain when everything fits in one chunk, otherwise
        'map_reduce'. Returns the summary text, or None if the chain raises
        (the error is printed).
        """
        try:
            # Only the chain type differs between the two cases.
            chain_type = "stuff" if len_document == 1 else "map_reduce"
            chain = load_summarize_chain(chain_type=chain_type, llm=self.model)
            result = chain.invoke(document)
            return result["output_text"]
        except Exception as e:
            print(e)