HARISH20205 committed
Commit c8c7a9e · 1 Parent(s): 168162d
.dockerignore ADDED
@@ -0,0 +1,2 @@
+ myenv/
+ __pycache__/
.gitignore ADDED
@@ -0,0 +1,8 @@
+ myenv
+ __pycache__
+ services
+ scrap.txt
+ .env
+ code.ipynb
+ try.py
+ files
Dockerfile ADDED
@@ -0,0 +1,39 @@
+ # Base image with Python
+ FROM python:3.11-slim
+
+ # Set environment variables
+ ENV PYTHONDONTWRITEBYTECODE=1
+ ENV PYTHONUNBUFFERED=1
+
+ # Set working directory
+ WORKDIR /app
+
+ # Copy the requirements file
+ COPY requirements.txt /app/
+
+ # Install Python dependencies
+ RUN pip install --no-cache-dir --upgrade pip && \
+     pip install --no-cache-dir -r requirements.txt
+
+ # Install system dependencies individually
+ RUN apt-get update && apt-get install -y libgl1-mesa-glx
+ RUN apt-get install -y libglib2.0-0
+ RUN apt-get install -y libsm6
+ RUN apt-get install -y libxrender1
+ RUN apt-get install -y libxext6
+ RUN apt-get install -y ffmpeg
+
+ # Clean up apt cache
+ RUN apt-get clean && rm -rf /var/lib/apt/lists/*
+
+
+
+ # Copy the application code
+ COPY . /app/
+
+ # Expose the port
+ EXPOSE 7860
+
+ # Start the application
+ CMD ["uvicorn", "main:app", "--host", "0.0.0.0", "--port", "7860"]
+
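Note: each individual apt-get install line above commits its own image layer, so the later cleanup RUN cannot shrink them. If image size matters, one option is to install and clean up in a single layer, e.g. RUN apt-get update && apt-get install -y --no-install-recommends libgl1-mesa-glx libglib2.0-0 libsm6 libxrender1 libxext6 ffmpeg && rm -rf /var/lib/apt/lists/*.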
firebase.py ADDED
@@ -0,0 +1,15 @@
+ # import firebase_admin
+ # from firebase_admin import credentials, firestore
+ # import os
+
+ # credPath = os.path.join("services", "firebase_credentials.json")
+ # # Path to your Firebase credentials JSON file
+ # cred = credentials.Certificate(credPath)
+
+ # # Initialize the Firebase app with storageBucket
+ # firebase_admin.initialize_app(cred, {
+ #     'storageBucket': 'verbisense.appspot.com'  # Replace with your bucket name
+ # })
+
+ # # Initialize Firestore DB
+ # db = firestore.client()
main.py ADDED
@@ -0,0 +1,49 @@
+ from fastapi import FastAPI, HTTPException
+ from fastapi.middleware.cors import CORSMiddleware
+ # from firebase import db
+ # from firebase_admin import auth, storage
+ from pydantic import BaseModel
+ from typing import Dict, List
+ import os
+ from source import main
+
+ app = FastAPI()
+
+ app.add_middleware(
+     CORSMiddleware,
+     allow_origins=['*'],  # Allow all origins
+     allow_credentials=True,
+     allow_methods=['*'],
+     allow_headers=['*']
+ )
+
+ class QueryChat(BaseModel):
+     userId: str
+     files: List[str]
+     query: str
+
+
+ # bucket = storage.bucket("verbisense.appspot.com")
+
+ @app.get("/")
+ def read_root():
+     return {"message": "Welcome to Verbisense!"}
+
+ @app.post("/chat")
+ async def chat(data: QueryChat):
+     try:
+         print("userId : ", data.userId)
+         print("files : ", data.files)
+         print("query : ", data.query)
+
+         response = main(data.files, data.query)
+
+         print("\n" + "=" * 50)
+         print(response)
+         print("=" * 50)
+         if not response:
+             return False
+         return {"query": data.query, "response": response}
+
+     except Exception as e:
+         raise HTTPException(status_code=500, detail=f"An error occurred: {e}")
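For reference, a minimal client call against the new /chat endpoint might look like the sketch below (assuming the server is running locally on the port exposed in the Dockerfile; the userId and payload values are hypothetical placeholders):

import requests

payload = {
    "userId": "demo-user",  # hypothetical user id
    "files": [],            # optionally, a list of file download URLs
    "query": "Introduce yourself, what are you?",
}
resp = requests.post("http://localhost:7860/chat", json=payload)
print(resp.json())  # {"query": ..., "response": {...}} on success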
requirements.txt ADDED
@@ -0,0 +1,219 @@
+ aiofiles==23.2.1
+ aiohappyeyeballs==2.4.0
+ aiohttp==3.10.5
+ aiosignal==1.3.1
+ altair==5.4.1
+ annotated-types==0.7.0
+ anyio==4.4.0
+ asttokens==2.4.1
+ attrs==24.2.0
+ backcall==0.2.0
+ beautifulsoup4==4.12.3
+ bleach==6.1.0
+ blis==0.7.11
+ CacheControl==0.14.0
+ cachetools==5.5.0
+ catalogue==2.0.10
+ certifi==2024.8.30
+ cffi==1.17.1
+ charset-normalizer==3.3.2
+ click==8.1.7
+ cloudpathlib==0.19.0
+ colorama==0.4.6
+ comm==0.2.2
+ confection==0.1.5
+ contourpy==1.3.0
+ cryptography==43.0.1
+ cycler==0.12.1
+ cymem==2.0.8
+ debugpy==1.8.5
+ decorator==5.1.1
+ defusedxml==0.7.1
+ diskcache==5.6.3
+ docopt==0.6.2
+ easyocr==1.7.1
+ executing==2.1.0
+ fastapi==0.114.0
+ fastjsonschema==2.20.0
+ ffmpeg-python==0.2.0
+ ffmpy==0.4.0
+ filelock==3.16.0
+ firebase-admin==6.5.0
+ fonttools==4.53.1
+ frozenlist==1.4.1
+ fsspec==2024.9.0
+ future==1.0.0
+ gemini-api==0.1.6
+ google-ai-generativelanguage==0.6.10
+ google-api-core==2.19.2
+ google-api-python-client==2.144.0
+ google-auth==2.34.0
+ google-auth-httplib2==0.2.0
+ google-cloud-core==2.4.1
+ google-cloud-firestore==2.18.0
+ google-cloud-storage==2.18.2
+ google-crc32c==1.6.0
+ google-generativeai==0.8.2
+ google-resumable-media==2.7.2
+ googleapis-common-protos==1.65.0
+ gradio==3.38.0
+ gradio_client==1.4.2
+ greenlet==3.1.0
+ grpcio==1.66.1
+ grpcio-status==1.66.1
+ h11==0.14.0
+ httpcore==1.0.5
+ httplib2==0.22.0
+ httpx==0.27.2
+ huggingface-hub==0.24.6
+ idna==3.8
+ imageio==2.35.1
+ importlib_resources==6.4.5
+ ipykernel==6.29.5
+ ipython==8.12.3
+ ipywidgets==8.1.5
+ jedi==0.19.1
+ Jinja2==3.1.4
+ joblib==1.4.2
+ jsonpatch==1.33
+ jsonpointer==3.0.0
+ jsonschema==4.23.0
+ jsonschema-specifications==2023.12.1
+ jupyter_client==8.6.2
+ jupyter_core==5.7.2
+ jupyterlab_pygments==0.3.0
+ jupyterlab_widgets==3.0.13
+ kiwisolver==1.4.7
+ langchain==0.3.0
+ langchain-core==0.3.1
+ langchain-text-splitters==0.3.0
+ langcodes==3.4.0
+ langsmith==0.1.121
+ language_data==1.2.0
+ lazy_loader==0.4
+ linkify-it-py==2.0.3
+ llvmlite==0.43.0
+ lxml==5.3.0
+ marisa-trie==1.2.0
+ markdown-it-py==2.2.0
+ MarkupSafe==2.1.5
+ matplotlib==3.9.2
+ matplotlib-inline==0.1.7
+ mdit-py-plugins==0.3.3
+ mdurl==0.1.2
+ mistune==3.0.2
+ more-itertools==10.5.0
+ mpmath==1.3.0
+ msgpack==1.0.8
+ multidict==6.1.0
+ murmurhash==1.0.10
+ narwhals==1.10.0
+ nbclient==0.10.0
+ nbconvert==7.16.4
+ nbformat==5.10.4
+ nest-asyncio==1.6.0
+ networkx==3.3
+ ninja==1.11.1.1
+ numba==0.60.0
+ numpy==1.26.4
+ openai-whisper==20231117
+ opencv-python==4.10.0.84
+ opencv-python-headless==4.10.0.84
+ orjson==3.10.7
+ packaging==24.1
+ pandas==2.2.2
+ pandocfilters==1.5.1
+ parso==0.8.4
+ pickleshare==0.7.5
+ pillow==10.4.0
+ pip-chill==1.0.3
+ platformdirs==4.3.2
+ preshed==3.0.9
+ prompt_toolkit==3.0.47
+ proto-plus==1.24.0
+ protobuf==5.28.0
+ psutil==6.0.0
+ pure_eval==0.2.3
+ py-cpuinfo==9.0.0
+ pyasn1==0.6.0
+ pyasn1_modules==0.4.0
+ pyclipper==1.3.0.post5
+ pycparser==2.22
+ pydantic==2.9.1
+ pydantic_core==2.23.3
+ pydub==0.25.1
+ Pygments==2.18.0
+ PyJWT==2.9.0
+ PyMuPDF==1.24.10
+ PyMuPDFb==1.24.10
+ pyparsing==3.1.4
+ python-bidi==0.6.0
+ python-dateutil==2.9.0.post0
+ python-docx==1.1.2
+ python-dotenv==1.0.1
+ python-multipart==0.0.9
+ pytz==2024.1
+ PyYAML==6.0.2
+ pyzmq==26.2.0
+ referencing==0.35.1
+ regex==2024.7.24
+ requests==2.32.3
+ rich==13.8.0
+ rpds-py==0.20.0
+ rsa==4.9
+ ruff==0.6.5
+ safetensors==0.4.5
+ scikit-image==0.24.0
+ scikit-learn==1.5.1
+ scipy==1.14.1
+ seaborn==0.13.2
+ semantic-version==2.10.0
+ sentence-transformers==3.0.1
+ shapely==2.0.6
+ shellingham==1.5.4
+ six==1.16.0
+ smart-open==7.0.4
+ sniffio==1.3.1
+ soundfile==0.12.1
+ soupsieve==2.6
+ spacy==3.7.6
+ spacy-legacy==3.0.12
+ spacy-loggers==1.0.5
+ SQLAlchemy==2.0.35
+ srsly==2.4.8
+ stack-data==0.6.3
+ starlette==0.38.5
+ sympy==1.13.2
+ tenacity==8.5.0
+ thinc==8.2.4
+ threadpoolctl==3.5.0
+ tifffile==2024.8.30
+ tiktoken==0.7.0
+ tinycss2==1.3.0
+ tokenizers==0.19.1
+ tomlkit==0.12.0
+ torch
+ torchaudio
+ torchvision
+ tornado==6.4.1
+ tqdm==4.66.5
+ traitlets==5.14.3
+ transformers==4.44.2
+ typer==0.12.5
+ typing_extensions==4.12.2
+ tzdata==2024.1
+ uc-micro-py==1.0.3
+ ultralytics==8.3.13
+ ultralytics-thop==2.0.9
+ uritemplate==4.1.1
+ urllib3==2.2.2
+ uvicorn==0.30.6
+ wasabi==1.1.3
+ wcwidth==0.2.13
+ weasel==0.4.1
+ webencodings==0.5.1
+ websockets==11.0.3
+ widgetsnbextension==4.0.13
+ wrapt==1.16.0
+ yarg==0.1.9
+ yarl==1.11.1
response.py ADDED
@@ -0,0 +1,103 @@
+ import os
+ import logging
+ import google.generativeai as genai
+ import json
+ from dotenv import load_dotenv
+ import re
+
+ # Load environment variables
+ load_dotenv()
+
+ # Configure Gemini API
+ genai.configure(api_key=os.getenv("GEMINI_API_KEY"))
+
+ # Set up logging
+ logging.basicConfig(level=logging.INFO)
+
+
+ def format_response(json_string):
+     # Strip the leading "```json" fence, the trailing "```", and any stray asterisks or backticks
+     clean_string = json_string.strip().replace("```json", "").replace("```", "").replace("*", "").replace("`", "").strip()
+     # Convert the cleaned string to a Python dictionary
+     return json.loads(clean_string)
+
+ def generate_response(context: str, query: str) -> dict:
+     """Generates a response from the Gemini model based on the provided context and query."""
+
+     model = genai.GenerativeModel(
+         "models/gemini-1.5-flash",
+         system_instruction="""
+         You are a document query system named Verbisense.
+         Instructions for handling context and query:
+         1. When context is provided: Answer the query by prioritizing the information from the context. If the context is sufficient to address the query, base your response on it.
+         2. When no context is provided: Answer the query directly, ensuring clarity and relevance.
+         3. When the context is incomplete or insufficient: Supplement the context with relevant details from the query to provide a well-rounded and comprehensive answer.
+
+         The response should be generated with the following structure:
+         {
+             "summary": "A clear and concise summary of the answer.",
+             "heading1": "Main Heading",
+             "heading2": [
+                 "Subheading 1",
+                 "Subheading 2"
+             ],
+             "points": {
+                 "Subheading 1": ["point 1", "point 2", ...],
+                 "Subheading 2": ["point 1", "point 2", ...]
+             },
+             "example": [
+                 "Example for Subheading 1",
+                 "Example for Subheading 2"
+             ],
+             "key_takeaways": "Key takeaways or insights from the answer."
+         }
+
+         Guidelines for formatting and content creation:
+         1. Provide the summary only if the context is not sufficient to answer the query. The summary should be a concise overview of the response.
+         2. Use simple, clear, and user-friendly language. Your responses should be easily understandable by a general audience.
+         3. Ensure the JSON structure is properly formatted. Use appropriate nesting and consistent punctuation so the response can be integrated directly into a webpage.
+         4. Provide detailed, insightful, and informative answers. Ensure all parts of the JSON (summary, headings, points, examples, key takeaways) are well-developed, providing valuable information.
+         5. Organize information logically. Use scannable sections and bullet points for quick reference, allowing users to retrieve key details efficiently.
+         6. Provide the key takeaways in the response if it is not a greeting or simple message. This should be a clear and concise statement summarizing the main insights or conclusions from the answer.
+         7. Try to provide 5-10 points for each subheading. This will help to provide a comprehensive and detailed response to the query.
+         8. Don't limit the headings and subheadings to the ones provided in the query. Feel free to add more headings and subheadings as needed to provide a complete response.
+         9. Provide as much information as possible in the response. This will help to ensure that the user gets a comprehensive answer to their query.
+         10. Check multiple times whether the output is in the correct format mentioned above. This will help to ensure that the response can be easily integrated into a webpage.
+
+         Guidelines for greeting handling:
+         1. Use a warm and approachable tone. Keep it friendly, but concise and welcoming.
+         2. Limit greeting responses to the 'summary' key only. For example, respond with a brief statement like: "Hello! How can I assist you today?"
+         3. Avoid unnecessary over-explanation in greetings. Keep the focus on inviting the user to continue the interaction.
+
+         Key considerations for all responses:
+         1. Your identity is Verbisense. Ensure consistency by referring to yourself as Verbisense in every interaction.
+         2. Prioritize information and engagement. Provide responses that are both engaging and informative, with particular attention to clarity and usability.
+         3. Tailor each response to the context and query. Ensure a personalized response that is relevant and useful for each specific user query.
+         """, generation_config={"response_mime_type": "application/json"}
+     )
+
+     # Define a general prompt template for other queries
+     general_prompt_template = f"""
+     Given the following context and query, generate a JSON-formatted answer optimized for direct integration into a webpage.
+
+     Context: {context if context else "None"}
+     Query: {query}
+
+     """
+
+
+     try:
+         # Generate content from the model
+         response = model.generate_content(general_prompt_template)
+         print(response.text)
+         response_json = format_response(response.text)
+
+         logging.info("Response generated successfully.")
+
+         return response_json
+
+     except Exception as e:
+         logging.error(f"Error generating content from Gemini: {e}")
+         return {"error": "Failed to generate content from Gemini."}
+
+
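As a quick sanity check of format_response above, a fenced Gemini reply reduces to a plain dict; the sample string here is hypothetical:

raw = '```json\n{"summary": "Hello! How can I assist you today?"}\n```'
print(format_response(raw))
# -> {'summary': 'Hello! How can I assist you today?'}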
source.py ADDED
@@ -0,0 +1,197 @@
+ import os
+ from typing import List, Dict, Any
+ import pandas as pd
+ import numpy as np
+ import torch
+ from sentence_transformers import SentenceTransformer, util
+ from time import perf_counter as timer
+ from concurrent.futures import ThreadPoolExecutor
+ from dotenv import load_dotenv
+ import logging
+ import google.generativeai as genai
+ import warnings
+ import json
+
+
+ # Suppress specific FutureWarning messages
+ warnings.filterwarnings("ignore", category=FutureWarning)
+
+
+ # Load environment variables
+ load_dotenv()
+
+ # Gemini API key
+ genai.configure(api_key=os.getenv("GEMINI_API_KEY"))
+
+ # Set up logging
+ logging.basicConfig(level=logging.INFO)
+
+ # Import the file-type processors (custom modules under src/)
+ from src.text_processor import process_text_file
+ from src.audio_processor import process_audio_from_url
+ from src.video_processor import process_video_file
+ from src.image_processor import process_image_file
+
+ from response import generate_response
+
+ def process_files(file_paths: List[str]) -> List[Dict[str, Any]]:
+     """Processes a list of files in parallel and returns their processed content."""
+     if not file_paths:
+         logging.info("No files to process")
+         return []
+     def process_single_file(file_path):
+         _, extension = os.path.splitext(file_path)
+         extension = extension.lower()
+         file_name = os.path.basename(file_path)
+
+         # Strip Firebase-style query strings (e.g. ".pdf?alt=media&token=...") from the extension
+         if "?alt=media&token=" in extension:
+             extension = extension.split("?")[0]
+         print("\nprocessing file type : ", extension)
+         try:
+             if extension in ['.txt', '.pdf', '.docx']:
+                 return process_text_file(file_path)
+             elif extension in ['.mp3', '.wav', '.flac']:
+                 return process_audio_from_url(file_path)
+             elif extension in ['.mp4']:
+                 return process_video_file(file_path)
+             elif extension in ['.png', '.jpg', '.jpeg']:
+                 return process_image_file(file_path)
+             else:
+                 logging.warning(f"Unsupported file type: {extension} for file {file_name}")
+                 return []
+         except Exception as e:
+             logging.error(f"Error processing file {file_name}: {e}", exc_info=True)
+             return []
+     try:
+         # Process files in parallel, limiting threads to the number of CPU cores
+         with ThreadPoolExecutor(max_workers=min(len(file_paths), os.cpu_count())) as executor:
+             results = executor.map(process_single_file, file_paths)
+             # Flatten the results
+             processed_data = [item for result in results for item in result]
+
+         if not processed_data:
+             return []
+         return processed_data
+     except ValueError:
+         logging.error("File list contains invalid file paths")
+         return []
+
+
+ def create_embeddings(processed_data: List[Dict[str, Any]], embedding_model: SentenceTransformer) -> pd.DataFrame:
+     """Generates embeddings for processed data."""
+     try:
+         text_chunks = [item["text"] for item in processed_data]
+         embeddings_list = []  # Store embeddings in a list
+         batch_size = 32
+
+         # Process embeddings in batches to optimize memory usage
+         for i in range(0, len(text_chunks), batch_size):
+             batch_embeddings = embedding_model.encode(text_chunks[i:i + batch_size], convert_to_tensor=False)  # Avoid torch tensors
+             embeddings_list.extend(batch_embeddings)  # Accumulate embeddings
+             logging.info(f"Processed batch {i // batch_size + 1}/{(len(text_chunks) + batch_size - 1) // batch_size}")
+
+         # Convert to a float32 numpy array for compatibility with Annoy
+         embeddings_np = np.array(embeddings_list).astype('float32')
+
+         # Create a DataFrame with the embeddings
+         df = pd.DataFrame(processed_data)
+         df["embedding"] = embeddings_np.tolist()
+         return df
+     except Exception as e:
+         logging.error(f"Error creating embeddings: {e}", exc_info=True)
+         return pd.DataFrame()
+
+
+
+
+ def semantic_search(query: str, embeddings_df: pd.DataFrame, embedding_model: SentenceTransformer, num_results: int) -> List[Dict[str, Any]]:
+     """Performs semantic search using embeddings and returns the top results."""
+     try:
+         # Create an embedding for the query
+         query_embedding = embedding_model.encode(query, convert_to_tensor=True)
+
+         # Convert embeddings from the DataFrame to a tensor
+         embeddings = torch.tensor(np.array(embeddings_df["embedding"].tolist()), dtype=torch.float32).to(embedding_model.device)
+
+         # Measure search time
+         start_time = timer()
+         dot_scores = util.dot_score(query_embedding, embeddings)[0]
+         end_time = timer()
+         logging.info(f"Time taken to get scores on {len(embeddings)} embeddings: {end_time - start_time:.5f} seconds.")
+
+         # Get the top results
+         top_results = torch.topk(dot_scores, k=num_results)
+         results = []
+
+         # Format the results
+         for score, idx in zip(top_results.values, top_results.indices):
+             idx = idx.item()  # Convert tensor to integer
+             result = {
+                 "score": score.item(),
+                 "text": embeddings_df.iloc[idx]["text"],
+                 "file_name": embeddings_df.iloc[idx]["file_name"],
+                 **{k: v for k, v in embeddings_df.iloc[idx].items() if k not in ["text", "file_name", "embedding"]}
+             }
+             results.append(result)
+
+         return results
+     except Exception as e:
+         logging.error(f"Error during semantic search: {e}", exc_info=True)
+         return []
+
+
+ def count_tokens(text: str) -> int:
+     """Roughly estimate the number of tokens in a text."""
+     return len(text.split())
+
+ def main(files: list, query: str, min_text_length: int = 1000000, max_gemini_tokens: int = 7300):
+     """Main function to process files, then either send the text directly to Gemini or fall back to semantic search."""
+
+     try:
+         # Process files (existing file-processing logic)
+         processed_data = process_files(files)
+         # Combine all text chunks
+         combined_text = " ".join([item["text"] for item in processed_data])
+
+         logging.info(f"Total text length: {len(combined_text)} characters")
+
+         # Count tokens and check whether they exceed the allowed limit for Gemini
+         token_count = count_tokens(combined_text)
+         print("Token count : ", token_count)
+         # If the token count is within limits, send directly to Gemini for response generation
+         if token_count < min_text_length:
+             logging.info(f"Text is below the threshold ({min_text_length} tokens). Sending directly to Gemini.")
+             response = generate_response(combined_text, query)
+             return response
+         else:
+             logging.info(f"Text exceeds the threshold ({min_text_length} tokens). Performing semantic search.")
+             # Only initialize the embedding model when needed
+             embedding_model = SentenceTransformer("all-mpnet-base-v2", device="cuda" if torch.cuda.is_available() else "cpu")
+
+             # Create embeddings
+             embeddings_df = create_embeddings(processed_data, embedding_model)
+             if embeddings_df.empty:
+                 logging.error("No embeddings created. Exiting.")
+                 return {"error": "Failed to create embeddings from the processed data."}
+
+             # Perform semantic search
+             num_results = min(1, len(embeddings_df))  # Capped at a single top result (zero if no data)
+             results = semantic_search(query, embeddings_df, embedding_model, num_results)
+             print("Semantic search returned the top results with relevance scores and contextual information.\n", results)
+             if not results:
+                 logging.error("No results found. Exiting.")
+                 return {"error": "Semantic search returned no results."}
+             context = " ".join([result['text'] for result in results])  # Build the context from the search results
+             response = generate_response(context, query)
+             return response
+     except Exception as e:
+         logging.error(f"Error: {e}")
+         return {"error": "An error occurred during the main process."}
+
+ if __name__ == "__main__":
+     files = [
+         # Your file paths go here
+     ]
+     query = "Introduce yourself, what are you?"
+     main(files, query)
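One detail worth noting in process_files: os.path.splitext keeps a Firebase-style download URL's query string attached to the extension, which is why the code splits on "?". A minimal sketch with a hypothetical URL:

import os

url = "https://firebasestorage.googleapis.com/v0/b/app/o/report.pdf?alt=media&token=abc123"
_, ext = os.path.splitext(url)
print(ext)                        # ".pdf?alt=media&token=abc123"
print(ext.lower().split("?")[0])  # ".pdf"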
src/audio_processor.py ADDED
@@ -0,0 +1,53 @@
+ import whisper
+ import requests
+ import ffmpeg
+ import numpy as np
+ from typing import List, Dict, Any
+
+ def process_audio_from_url(audio_url: str) -> List[Dict[str, Any]]:
+     # Download the audio file content
+     response = requests.get(audio_url, stream=True)
+     response.raise_for_status()
+
+     # Use ffmpeg to decode the audio stream
+     try:
+         out, _ = (
+             ffmpeg
+             .input('pipe:0')
+             .output('pipe:1', format='f32le', acodec='pcm_f32le', ac=1, ar='16k')
+             .run(input=response.raw.read(), capture_stdout=True, capture_stderr=True)
+         )
+     except ffmpeg.Error as e:
+         raise RuntimeError(f"Failed to load audio: {e.stderr.decode()}") from e
+
+     # Convert the audio to the format Whisper expects
+     audio = np.frombuffer(out, np.float32).flatten()
+
+     # Load the Whisper model
+     model = whisper.load_model("base")
+
+     # Transcribe the audio
+     result = model.transcribe(audio)
+
+     segments = []
+     for segment in result["segments"]:
+         segments.append({
+             "file_name": audio_url.split("/")[-1],  # Extract the filename from the URL
+             "text": segment["text"]
+         })
+     return segments
+
+ def process_audio_data(audio: np.ndarray, file_name: str) -> List[Dict[str, Any]]:
+     # Load the Whisper model
+     model = whisper.load_model("base")
+
+     # Transcribe the audio
+     result = model.transcribe(audio)
+
+     segments = []
+     for segment in result["segments"]:
+         segments.append({
+             "file_name": file_name,  # Ensure file_name is added
+             "text": segment["text"]
+         })
+     return segments
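Minor note: both functions above call whisper.load_model("base") on every invocation. If that turns out to be a bottleneck, the model could be cached at module level; a minimal sketch:

import whisper

_model = None

def get_model():
    # Lazily load and cache the Whisper model across calls
    global _model
    if _model is None:
        _model = whisper.load_model("base")
    return _model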
src/image_processor.py ADDED
@@ -0,0 +1,53 @@
+ import easyocr
+ import requests
+ import io
+ from PIL import Image
+ from typing import List, Dict, Any
+ import os
+ import numpy as np
+ from gradio_client import Client
+
+
+ def process_image_file(image_url: str) -> List[Dict[str, Any]]:
+     # Fetch the image content from the URL
+     response = requests.get(image_url)
+
+     # Check if the request was successful
+     if response.status_code == 200:
+         # Load the image from the response content using PIL
+         image_stream = io.BytesIO(response.content)
+         image = Image.open(image_stream)
+
+         # Convert the image to a NumPy array, which is supported by EasyOCR
+         image_np = np.array(image)
+
+         # Use EasyOCR to extract text from the image
+         reader = easyocr.Reader(['en'])
+         result = reader.readtext(image_np)
+
+         print("*" * 50 + image_url)
+
+         # Combine the extracted text from EasyOCR
+         extracted_text = "\n".join([detection[1] for detection in result])
+
+         if len(extracted_text.split()) < 5:
+             # Fall back to the BLIP model for image captioning when OCR finds little text
+             client = Client("HARISH20205/blip-image-caption")
+             caption_result = client.predict(image_url=image_url, api_name="/predict")
+             content = "\nImage Caption:\n" + str(caption_result)
+             return [{
+                 "file_name": os.path.basename(image_url),
+                 "text": content,
+             }]
+         # Format the content
+         content = "Image Data:\n" + extracted_text
+
+         return [{
+             "file_name": os.path.basename(image_url),
+             "text": content,
+         }]
+     else:
+         return [{
+             "file_name": os.path.basename(image_url),
+             "text": "Failed to retrieve image.",
+         }]
src/text_processor.py ADDED
@@ -0,0 +1,86 @@
+ import os
+ from typing import List, Dict, Any
+ import fitz  # PyMuPDF
+ import docx
+ import requests
+ import io
+
+ def process_text_file(file_url: str) -> List[Dict[str, Any]]:
+     _, extension = os.path.splitext(file_url)
+     extension = extension.lower()
+
+     if "?alt=media&token=" in extension:
+         extension = extension.split("?")[0]
+     if extension == '.txt':
+         return process_txt(file_url)
+     elif extension == '.pdf':
+         return process_pdf(file_url)
+     elif extension == '.docx':
+         return process_docx(file_url)
+     else:
+         raise ValueError(f"Unsupported text file type: {extension}")
+
+ def process_txt(txt_url: str) -> List[Dict[str, Any]]:
+     # Fetch the TXT file content from the URL
+     response = requests.get(txt_url)
+
+     # Check if the request was successful
+     if response.status_code == 200:
+         content = response.text
+         return [{
+             "file_name": os.path.basename(txt_url),
+             "text": content,
+             "page_number": 1
+         }]
+     else:
+         print(f"Failed to fetch the TXT file. Status code: {response.status_code}")
+         return []
+
+ def process_pdf(pdf_url: str) -> List[Dict[str, Any]]:
+     # Fetch the PDF file content from the URL
+     response = requests.get(pdf_url)
+
+     # Check if the request was successful
+     if response.status_code == 200:
+         # Load the PDF file from the response content
+         pdf_stream = io.BytesIO(response.content)
+
+         # Open the PDF file with PyMuPDF
+         pdf_document = fitz.open(stream=pdf_stream, filetype="pdf")
+
+         # Extract text from all pages
+         pdf_text = ""
+         for page_num in range(len(pdf_document)):
+             page = pdf_document.load_page(page_num)  # Load the page
+             pdf_text += page.get_text("text")  # Extract text from the page
+
+         return [{
+             "file_name": os.path.basename(pdf_url),
+             "text": pdf_text
+         }]
+     else:
+         print(f"Failed to fetch the PDF file. Status code: {response.status_code}")
+         return []
+
+ def process_docx(docx_url: str) -> List[Dict[str, Any]]:
+     # Fetch the DOCX file content from the URL
+     response = requests.get(docx_url)
+
+     # Check if the request was successful
+     if response.status_code == 200:
+         # Load the DOCX file from the response content
+         docx_stream = io.BytesIO(response.content)
+
+         # Open the DOCX file with python-docx
+         doc = docx.Document(docx_stream)
+
+         # Extract text from the DOCX file
+         content = "\n".join([para.text for para in doc.paragraphs])
+         return [{
+             "file_name": os.path.basename(docx_url),
+             "text": content,
+             "page_number": 1  # DOCX doesn't have pages, so just use 1
+         }]
+     else:
+         print(f"Failed to fetch the DOCX file. Status code: {response.status_code}")
+         return []
src/video_processor.py ADDED
@@ -0,0 +1,24 @@
+ import ffmpeg
+ import numpy as np
+ from src.audio_processor import process_audio_data
+ import os
+
+ def process_video_file(file_path: str):
+     # Use ffmpeg to extract audio from the video file
+     try:
+         out, _ = (
+             ffmpeg
+             .input(file_path)
+             .output('pipe:1', format='f32le', acodec='pcm_f32le', ac=1, ar='16k')
+             .run(capture_stdout=True, capture_stderr=True)
+         )
+     except ffmpeg.Error as e:
+         raise RuntimeError(f"Failed to extract audio from video: {e.stderr.decode()}") from e
+
+     # Convert the audio to the format Whisper expects
+     audio = np.frombuffer(out, np.float32).flatten()
+
+     # Pass the file name to the audio processor
+     file_name = os.path.basename(file_path)
+     result = process_audio_data(audio, file_name)
+     return result