HARISH20205 committed
Commit c8c7a9e · 1 Parent(s): 168162d
.dockerignore ADDED
@@ -0,0 +1,2 @@
+ myenv/
+ __pycache__/
.gitignore ADDED
@@ -0,0 +1,8 @@
+ myenv
+ __pycache__
+ services
+ scrap.txt
+ .env
+ code.ipynb
+ try.py
+ files
Dockerfile ADDED
@@ -0,0 +1,39 @@
+ # Base image with Python
+ FROM python:3.11-slim
+
+ # Set environment variables
+ ENV PYTHONDONTWRITEBYTECODE=1
+ ENV PYTHONUNBUFFERED=1
+
+ # Set working directory
+ WORKDIR /app
+
+ # Copy the requirements file
+ COPY requirements.txt /app/
+
+ # Install Python dependencies
+ RUN pip install --no-cache-dir --upgrade pip && \
+     pip install --no-cache-dir -r requirements.txt
+
+ # Install system dependencies individually
+ RUN apt-get update && apt-get install -y libgl1-mesa-glx
+ RUN apt-get install -y libglib2.0-0
+ RUN apt-get install -y libsm6
+ RUN apt-get install -y libxrender1
+ RUN apt-get install -y libxext6
+ RUN apt-get install -y ffmpeg
+
+ # Clean up apt cache
+ RUN apt-get clean && rm -rf /var/lib/apt/lists/*
+
+
+
+ # Copy the application code
+ COPY . /app/
+
+ # Expose the port
+ EXPOSE 7860
+
+ # Start the application
+ CMD ["uvicorn", "main:app", "--host", "0.0.0.0", "--port", "7860"]
+
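Note: each individual apt-get install line above commits its own image layer, so the later cleanup RUN cannot shrink them. If image size matters, one option is to install and clean up in a single layer, e.g. RUN apt-get update && apt-get install -y --no-install-recommends libgl1-mesa-glx libglib2.0-0 libsm6 libxrender1 libxext6 ffmpeg && rm -rf /var/lib/apt/lists/*.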
firebase.py ADDED
@@ -0,0 +1,15 @@
+ # import firebase_admin
+ # from firebase_admin import credentials, firestore
+ # import os
+
+ # credPath = os.path.join("services", "firebase_credentials.json")
+ # # Path to your Firebase credentials JSON file
+ # cred = credentials.Certificate(credPath)
+
+ # # Initialize the Firebase app with storageBucket
+ # firebase_admin.initialize_app(cred, {
+ #     'storageBucket': 'verbisense.appspot.com'  # Replace with your bucket name
+ # })
+
+ # # Initialize Firestore DB
+ # db = firestore.client()
main.py ADDED
@@ -0,0 +1,49 @@
+ from fastapi import FastAPI, HTTPException
+ from fastapi.middleware.cors import CORSMiddleware
+ # from firebase import db
+ # from firebase_admin import auth, storage
+ from pydantic import BaseModel
+ from typing import Dict, List
+ import os
+ from source import main
+
+ app = FastAPI()
+
+ app.add_middleware(
+     CORSMiddleware,
+     allow_origins=['*'],  # Allow all origins
+     allow_credentials=True,
+     allow_methods=['*'],
+     allow_headers=['*']
+ )
+
+ class QueryChat(BaseModel):
+     userId: str
+     files: List[str]
+     query: str
+
+
+ # bucket = storage.bucket("verbisense.appspot.com")
+
+ @app.get("/")
+ def read_root():
+     return {"message": "Welcome to Verbisense!"}
+
+ @app.post("/chat")
+ async def chat(data: QueryChat):
+     try:
+         print("userId : ", data.userId)
+         print("files : ", data.files)
+         print("query : ", data.query)
+
+         response = main(data.files, data.query)
+
+         print("\n" + "=" * 50)
+         print(response)
+         print("=" * 50)
+         if not response:
+             return False
+         return {"query": data.query, "response": response}
+
+     except Exception as e:
+         raise HTTPException(status_code=500, detail=f"An error occurred: {e}")
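For reference, a minimal client call against the new /chat endpoint might look like the sketch below (assuming the server is running locally on the port exposed in the Dockerfile; the userId and payload values are hypothetical placeholders):

import requests

payload = {
    "userId": "demo-user",  # hypothetical user id
    "files": [],            # optionally, a list of file download URLs
    "query": "Introduce yourself, what are you?",
}
resp = requests.post("http://localhost:7860/chat", json=payload)
print(resp.json())  # {"query": ..., "response": {...}} on success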
requirements.txt ADDED
@@ -0,0 +1,219 @@
+ aiofiles==23.2.1
+ aiohappyeyeballs==2.4.0
+ aiohttp==3.10.5
+ aiosignal==1.3.1
+ altair==5.4.1
+ annotated-types==0.7.0
+ anyio==4.4.0
+ asttokens==2.4.1
+ attrs==24.2.0
+ backcall==0.2.0
+ beautifulsoup4==4.12.3
+ bleach==6.1.0
+ blis==0.7.11
+ CacheControl==0.14.0
+ cachetools==5.5.0
+ catalogue==2.0.10
+ certifi==2024.8.30
+ cffi==1.17.1
+ charset-normalizer==3.3.2
+ click==8.1.7
+ cloudpathlib==0.19.0
+ colorama==0.4.6
+ comm==0.2.2
+ confection==0.1.5
+ contourpy==1.3.0
+ cryptography==43.0.1
+ cycler==0.12.1
+ cymem==2.0.8
+ debugpy==1.8.5
+ decorator==5.1.1
+ defusedxml==0.7.1
+ diskcache==5.6.3
+ docopt==0.6.2
+ easyocr==1.7.1
+ executing==2.1.0
+ fastapi==0.114.0
+ fastjsonschema==2.20.0
+ ffmpeg-python==0.2.0
+ ffmpy==0.4.0
+ filelock==3.16.0
+ firebase-admin==6.5.0
+ fonttools==4.53.1
+ frozenlist==1.4.1
+ fsspec==2024.9.0
+ future==1.0.0
+ gemini-api==0.1.6
+ google-ai-generativelanguage==0.6.10
+ google-api-core==2.19.2
+ google-api-python-client==2.144.0
+ google-auth==2.34.0
+ google-auth-httplib2==0.2.0
+ google-cloud-core==2.4.1
+ google-cloud-firestore==2.18.0
+ google-cloud-storage==2.18.2
+ google-crc32c==1.6.0
+ google-generativeai==0.8.2
+ google-resumable-media==2.7.2
+ googleapis-common-protos==1.65.0
+ gradio==3.38.0
+ gradio_client==1.4.2
+ greenlet==3.1.0
+ grpcio==1.66.1
+ grpcio-status==1.66.1
+ h11==0.14.0
+ httpcore==1.0.5
+ httplib2==0.22.0
+ httpx==0.27.2
+ huggingface-hub==0.24.6
+ idna==3.8
+ imageio==2.35.1
+ importlib_resources==6.4.5
+ ipykernel==6.29.5
+ ipython==8.12.3
+ ipywidgets==8.1.5
+ jedi==0.19.1
+ Jinja2==3.1.4
+ joblib==1.4.2
+ jsonpatch==1.33
+ jsonpointer==3.0.0
+ jsonschema==4.23.0
+ jsonschema-specifications==2023.12.1
+ jupyter_client==8.6.2
+ jupyter_core==5.7.2
+ jupyterlab_pygments==0.3.0
+ jupyterlab_widgets==3.0.13
+ kiwisolver==1.4.7
+ langchain==0.3.0
+ langchain-core==0.3.1
+ langchain-text-splitters==0.3.0
+ langcodes==3.4.0
+ langsmith==0.1.121
+ language_data==1.2.0
+ lazy_loader==0.4
+ linkify-it-py==2.0.3
+ llvmlite==0.43.0
+ lxml==5.3.0
+ marisa-trie==1.2.0
+ markdown-it-py==2.2.0
+ MarkupSafe==2.1.5
+ matplotlib==3.9.2
+ matplotlib-inline==0.1.7
+ mdit-py-plugins==0.3.3
+ mdurl==0.1.2
+ mistune==3.0.2
+ more-itertools==10.5.0
+ mpmath==1.3.0
+ msgpack==1.0.8
+ multidict==6.1.0
+ murmurhash==1.0.10
+ narwhals==1.10.0
+ nbclient==0.10.0
+ nbconvert==7.16.4
+ nbformat==5.10.4
+ nest-asyncio==1.6.0
+ networkx==3.3
+ ninja==1.11.1.1
+ numba==0.60.0
+ numpy==1.26.4
+ openai-whisper==20231117
+ opencv-python==4.10.0.84
+ opencv-python-headless==4.10.0.84
+ orjson==3.10.7
+ packaging==24.1
+ pandas==2.2.2
+ pandocfilters==1.5.1
+ parso==0.8.4
+ pickleshare==0.7.5
+ pillow==10.4.0
+ pip-chill==1.0.3
+ platformdirs==4.3.2
+ preshed==3.0.9
+ prompt_toolkit==3.0.47
+ proto-plus==1.24.0
+ protobuf==5.28.0
+ psutil==6.0.0
+ pure_eval==0.2.3
+ py-cpuinfo==9.0.0
+ pyasn1==0.6.0
+ pyasn1_modules==0.4.0
+ pyclipper==1.3.0.post5
+ pycparser==2.22
+ pydantic==2.9.1
+ pydantic_core==2.23.3
+ pydub==0.25.1
+ Pygments==2.18.0
+ PyJWT==2.9.0
+ PyMuPDF==1.24.10
+ PyMuPDFb==1.24.10
+ pyparsing==3.1.4
+ python-bidi==0.6.0
+ python-dateutil==2.9.0.post0
+ python-docx==1.1.2
+ python-dotenv==1.0.1
+ python-multipart==0.0.9
+ pytz==2024.1
+ PyYAML==6.0.2
+ pyzmq==26.2.0
+ referencing==0.35.1
+ regex==2024.7.24
+ requests==2.32.3
+ rich==13.8.0
+ rpds-py==0.20.0
+ rsa==4.9
+ ruff==0.6.5
+ safetensors==0.4.5
+ scikit-image==0.24.0
+ scikit-learn==1.5.1
+ scipy==1.14.1
+ seaborn==0.13.2
+ semantic-version==2.10.0
+ sentence-transformers==3.0.1
+ shapely==2.0.6
+ shellingham==1.5.4
+ six==1.16.0
+ smart-open==7.0.4
+ sniffio==1.3.1
+ soundfile==0.12.1
+ soupsieve==2.6
+ spacy==3.7.6
+ spacy-legacy==3.0.12
+ spacy-loggers==1.0.5
+ SQLAlchemy==2.0.35
+ srsly==2.4.8
+ stack-data==0.6.3
+ starlette==0.38.5
+ sympy==1.13.2
+ tenacity==8.5.0
+ thinc==8.2.4
+ threadpoolctl==3.5.0
+ tifffile==2024.8.30
+ tiktoken==0.7.0
+ tinycss2==1.3.0
+ tokenizers==0.19.1
+ tomlkit==0.12.0
+ torch
+ torchaudio
+ torchvision
+ tornado==6.4.1
+ tqdm==4.66.5
+ traitlets==5.14.3
+ transformers==4.44.2
+ typer==0.12.5
+ typing_extensions==4.12.2
+ tzdata==2024.1
+ uc-micro-py==1.0.3
+ ultralytics==8.3.13
+ ultralytics-thop==2.0.9
+ uritemplate==4.1.1
+ urllib3==2.2.2
+ uvicorn==0.30.6
+ wasabi==1.1.3
+ wcwidth==0.2.13
+ weasel==0.4.1
+ webencodings==0.5.1
+ websockets==11.0.3
+ widgetsnbextension==4.0.13
+ wrapt==1.16.0
+ yarg==0.1.9
+ yarl==1.11.1
response.py ADDED
@@ -0,0 +1,103 @@
+ import os
+ import logging
+ import google.generativeai as genai
+ import json
+ from dotenv import load_dotenv
+ import re
+
+ # Load environment variables
+ load_dotenv()
+
+ # Configure Gemini API
+ genai.configure(api_key=os.getenv("GEMINI_API_KEY"))
+
+ # Set up logging
+ logging.basicConfig(level=logging.INFO)
+
+
+ def format_response(json_string):
+     # Strip the leading "```json" fence, the trailing "```", and any stray asterisks or backticks
+     clean_string = json_string.strip().replace("```json", "").replace("```", "").replace("*", "").replace("`", "").strip()
+     # Convert the cleaned string to a Python dictionary
+     return json.loads(clean_string)
+
+ def generate_response(context: str, query: str) -> dict:
+     """Generates a response from the Gemini model based on the provided context and query."""
+
+     model = genai.GenerativeModel(
+         "models/gemini-1.5-flash",
+         system_instruction="""
+         You are a document query system named Verbisense.
+         Instructions for handling context and query:
+         1. When context is provided: Answer the query by prioritizing the information from the context. If the context is sufficient to address the query, base your response on it.
+         2. When no context is provided: Answer the query directly, ensuring clarity and relevance.
+         3. When the context is incomplete or insufficient: Supplement the context with relevant details from the query to provide a well-rounded and comprehensive answer.
+
+         The response should be generated with the following structure:
+         {
+             "summary": "A clear and concise summary of the answer.",
+             "heading1": "Main Heading",
+             "heading2": [
+                 "Subheading 1",
+                 "Subheading 2"
+             ],
+             "points": {
+                 "Subheading 1": ["point 1", "point 2", ...],
+                 "Subheading 2": ["point 1", "point 2", ...]
+             },
+             "example": [
+                 "Example for Subheading 1",
+                 "Example for Subheading 2"
+             ],
+             "key_takeaways": "Key takeaways or insights from the answer."
+         }
+
+         Guidelines for formatting and content creation:
+         1. Provide the summary only if the context is not sufficient to answer the query. The summary should be a concise overview of the response.
+         2. Use simple, clear, and user-friendly language. Your responses should be easily understandable by a general audience.
+         3. Ensure the JSON structure is properly formatted. Use appropriate nesting and consistent punctuation so the response can be integrated directly into a webpage.
+         4. Provide detailed, insightful, and informative answers. Ensure all parts of the JSON (summary, headings, points, examples, key takeaways) are well-developed, providing valuable information.
+         5. Organize information logically. Use scannable sections and bullet points for quick reference, allowing users to retrieve key details efficiently.
+         6. Provide the key takeaways in the response if it is not a greeting or simple message. This should be a clear and concise statement summarizing the main insights or conclusions from the answer.
+         7. Try to provide 5-10 points for each subheading. This will help to provide a comprehensive and detailed response to the query.
+         8. Don't limit the headings and subheadings to the ones provided in the query. Feel free to add more headings and subheadings as needed to provide a complete response.
+         9. Provide as much information as possible in the response. This will help to ensure that the user gets a comprehensive answer to their query.
+         10. Check multiple times whether the output is in the correct format mentioned above. This will help to ensure that the response can be easily integrated into a webpage.
+
+         Guidelines for greeting handling:
+         1. Use a warm and approachable tone. Keep it friendly, but concise and welcoming.
+         2. Limit greeting responses to the 'summary' key only. For example, respond with a brief statement like: "Hello! How can I assist you today?"
+         3. Avoid unnecessary over-explanation in greetings. Keep the focus on inviting the user to continue the interaction.
+
+         Key considerations for all responses:
+         1. Your identity is Verbisense. Ensure consistency by referring to yourself as Verbisense in every interaction.
+         2. Prioritize information and engagement. Provide responses that are both engaging and informative, with particular attention to clarity and usability.
+         3. Tailor each response to the context and query. Ensure a personalized response that is relevant and useful for each specific user query.
+         """, generation_config={"response_mime_type": "application/json"}
+     )
+
+     # Define a general prompt template for other queries
+     general_prompt_template = f"""
+     Given the following context and query, generate a JSON-formatted answer optimized for direct integration into a webpage.
+
+     Context: {context if context else "None"}
+     Query: {query}
+
+     """
+
+
+     try:
+         # Generate content from the model
+         response = model.generate_content(general_prompt_template)
+         print(response.text)
+         response_json = format_response(response.text)
+
+         logging.info("Response generated successfully.")
+
+         return response_json
+
+     except Exception as e:
+         logging.error(f"Error generating content from Gemini: {e}")
+         return {"error": "Failed to generate content from Gemini."}
+
+
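As a quick sanity check of format_response above, a fenced Gemini reply reduces to a plain dict; the sample string here is hypothetical:

raw = '```json\n{"summary": "Hello! How can I assist you today?"}\n```'
print(format_response(raw))
# -> {'summary': 'Hello! How can I assist you today?'}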
source.py ADDED
@@ -0,0 +1,197 @@
+ import os
+ from typing import List, Dict, Any
+ import pandas as pd
+ import numpy as np
+ import torch
+ from sentence_transformers import SentenceTransformer, util
+ from time import perf_counter as timer
+ from concurrent.futures import ThreadPoolExecutor
+ from dotenv import load_dotenv
+ import logging
+ import google.generativeai as genai
+ import warnings
+ import json
+
+
+ # Suppress specific FutureWarning messages
+ warnings.filterwarnings("ignore", category=FutureWarning)
+
+
+ # Load environment variables
+ load_dotenv()
+
+ # Gemini API key
+ genai.configure(api_key=os.getenv("GEMINI_API_KEY"))
+
+ # Set up logging
+ logging.basicConfig(level=logging.INFO)
+
+ # Import the file-type processors (custom modules under src/)
+ from src.text_processor import process_text_file
+ from src.audio_processor import process_audio_from_url
+ from src.video_processor import process_video_file
+ from src.image_processor import process_image_file
+
+ from response import generate_response
+
+ def process_files(file_paths: List[str]) -> List[Dict[str, Any]]:
+     """Processes a list of files in parallel and returns their processed content."""
+     if not file_paths:
+         logging.info("No files to process")
+         return []
+     def process_single_file(file_path):
+         _, extension = os.path.splitext(file_path)
+         extension = extension.lower()
+         file_name = os.path.basename(file_path)
+
+         # Strip Firebase-style query strings (e.g. ".pdf?alt=media&token=...") from the extension
+         if "?alt=media&token=" in extension:
+             extension = extension.split("?")[0]
+         print("\nprocessing file type : ", extension)
+         try:
+             if extension in ['.txt', '.pdf', '.docx']:
+                 return process_text_file(file_path)
+             elif extension in ['.mp3', '.wav', '.flac']:
+                 return process_audio_from_url(file_path)
+             elif extension in ['.mp4']:
+                 return process_video_file(file_path)
+             elif extension in ['.png', '.jpg', '.jpeg']:
+                 return process_image_file(file_path)
+             else:
+                 logging.warning(f"Unsupported file type: {extension} for file {file_name}")
+                 return []
+         except Exception as e:
+             logging.error(f"Error processing file {file_name}: {e}", exc_info=True)
+             return []
+     try:
+         # Process files in parallel, limiting threads to the number of CPU cores
+         with ThreadPoolExecutor(max_workers=min(len(file_paths), os.cpu_count())) as executor:
+             results = executor.map(process_single_file, file_paths)
+             # Flatten the results
+             processed_data = [item for result in results for item in result]
+
+         if not processed_data:
+             return []
+         return processed_data
+     except ValueError:
+         logging.error("File list contains invalid file paths")
+         return []
+
+
+ def create_embeddings(processed_data: List[Dict[str, Any]], embedding_model: SentenceTransformer) -> pd.DataFrame:
+     """Generates embeddings for processed data."""
+     try:
+         text_chunks = [item["text"] for item in processed_data]
+         embeddings_list = []  # Store embeddings in a list
+         batch_size = 32
+
+         # Process embeddings in batches to optimize memory usage
+         for i in range(0, len(text_chunks), batch_size):
+             batch_embeddings = embedding_model.encode(text_chunks[i:i + batch_size], convert_to_tensor=False)  # Avoid torch tensors
+             embeddings_list.extend(batch_embeddings)  # Accumulate embeddings
+             logging.info(f"Processed batch {i // batch_size + 1}/{(len(text_chunks) + batch_size - 1) // batch_size}")
+
+         # Convert to a float32 numpy array for compatibility with Annoy
+         embeddings_np = np.array(embeddings_list).astype('float32')
+
+         # Create a DataFrame with the embeddings
+         df = pd.DataFrame(processed_data)
+         df["embedding"] = embeddings_np.tolist()
+         return df
+     except Exception as e:
+         logging.error(f"Error creating embeddings: {e}", exc_info=True)
+         return pd.DataFrame()
+
+
+
+
+ def semantic_search(query: str, embeddings_df: pd.DataFrame, embedding_model: SentenceTransformer, num_results: int) -> List[Dict[str, Any]]:
+     """Performs semantic search using embeddings and returns the top results."""
+     try:
+         # Create an embedding for the query
+         query_embedding = embedding_model.encode(query, convert_to_tensor=True)
+
+         # Convert embeddings from the DataFrame to a tensor
+         embeddings = torch.tensor(np.array(embeddings_df["embedding"].tolist()), dtype=torch.float32).to(embedding_model.device)
+
+         # Measure search time
+         start_time = timer()
+         dot_scores = util.dot_score(query_embedding, embeddings)[0]
+         end_time = timer()
+         logging.info(f"Time taken to get scores on {len(embeddings)} embeddings: {end_time - start_time:.5f} seconds.")
+
+         # Get the top results
+         top_results = torch.topk(dot_scores, k=num_results)
+         results = []
+
+         # Format the results
+         for score, idx in zip(top_results.values, top_results.indices):
+             idx = idx.item()  # Convert tensor to integer
+             result = {
+                 "score": score.item(),
+                 "text": embeddings_df.iloc[idx]["text"],
+                 "file_name": embeddings_df.iloc[idx]["file_name"],
+                 **{k: v for k, v in embeddings_df.iloc[idx].items() if k not in ["text", "file_name", "embedding"]}
+             }
+             results.append(result)
+
+         return results
+     except Exception as e:
+         logging.error(f"Error during semantic search: {e}", exc_info=True)
+         return []
+
+
+ def count_tokens(text: str) -> int:
+     """Roughly estimate the number of tokens in a text."""
+     return len(text.split())
+
+ def main(files: list, query: str, min_text_length: int = 1000000, max_gemini_tokens: int = 7300):
+     """Main function to process files, then either send the text directly to Gemini or fall back to semantic search."""
+
+     try:
+         # Process files (existing file-processing logic)
+         processed_data = process_files(files)
+         # Combine all text chunks
+         combined_text = " ".join([item["text"] for item in processed_data])
+
+         logging.info(f"Total text length: {len(combined_text)} characters")
+
+         # Count tokens and check whether they exceed the allowed limit for Gemini
+         token_count = count_tokens(combined_text)
+         print("Token count : ", token_count)
+         # If the token count is within limits, send directly to Gemini for response generation
+         if token_count < min_text_length:
+             logging.info(f"Text is below the threshold ({min_text_length} tokens). Sending directly to Gemini.")
+             response = generate_response(combined_text, query)
+             return response
+         else:
+             logging.info(f"Text exceeds the threshold ({min_text_length} tokens). Performing semantic search.")
+             # Only initialize the embedding model when needed
+             embedding_model = SentenceTransformer("all-mpnet-base-v2", device="cuda" if torch.cuda.is_available() else "cpu")
+
+             # Create embeddings
+             embeddings_df = create_embeddings(processed_data, embedding_model)
+             if embeddings_df.empty:
+                 logging.error("No embeddings created. Exiting.")
+                 return {"error": "Failed to create embeddings from the processed data."}
+
+             # Perform semantic search
+             num_results = min(1, len(embeddings_df))  # Capped at a single top result (zero if no data)
+             results = semantic_search(query, embeddings_df, embedding_model, num_results)
+             print("Semantic search returned the top results with relevance scores and contextual information.\n", results)
+             if not results:
+                 logging.error("No results found. Exiting.")
+                 return {"error": "Semantic search returned no results."}
+             context = " ".join([result['text'] for result in results])  # Build the context from the search results
+             response = generate_response(context, query)
+             return response
+     except Exception as e:
+         logging.error(f"Error: {e}")
+         return {"error": "An error occurred during the main process."}
+
+ if __name__ == "__main__":
+     files = [
+         # Your file paths go here
+     ]
+     query = "Introduce yourself, what are you?"
+     main(files, query)
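One detail worth noting in process_files: os.path.splitext keeps a Firebase-style download URL's query string attached to the extension, which is why the code splits on "?". A minimal sketch with a hypothetical URL:

import os

url = "https://firebasestorage.googleapis.com/v0/b/app/o/report.pdf?alt=media&token=abc123"
_, ext = os.path.splitext(url)
print(ext)                        # ".pdf?alt=media&token=abc123"
print(ext.lower().split("?")[0])  # ".pdf"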
src/audio_processor.py ADDED
@@ -0,0 +1,53 @@
+ import whisper
+ import requests
+ import ffmpeg
+ import numpy as np
+ from typing import List, Dict, Any
+
+ def process_audio_from_url(audio_url: str) -> List[Dict[str, Any]]:
+     # Download the audio file content
+     response = requests.get(audio_url, stream=True)
+     response.raise_for_status()
+
+     # Use ffmpeg to decode the audio stream
+     try:
+         out, _ = (
+             ffmpeg
+             .input('pipe:0')
+             .output('pipe:1', format='f32le', acodec='pcm_f32le', ac=1, ar='16k')
+             .run(input=response.raw.read(), capture_stdout=True, capture_stderr=True)
+         )
+     except ffmpeg.Error as e:
+         raise RuntimeError(f"Failed to load audio: {e.stderr.decode()}") from e
+
+     # Convert the audio to the format Whisper expects
+     audio = np.frombuffer(out, np.float32).flatten()
+
+     # Load the Whisper model
+     model = whisper.load_model("base")
+
+     # Transcribe the audio
+     result = model.transcribe(audio)
+
+     segments = []
+     for segment in result["segments"]:
+         segments.append({
+             "file_name": audio_url.split("/")[-1],  # Extract the filename from the URL
+             "text": segment["text"]
+         })
+     return segments
+
+ def process_audio_data(audio: np.ndarray, file_name: str) -> List[Dict[str, Any]]:
+     # Load the Whisper model
+     model = whisper.load_model("base")
+
+     # Transcribe the audio
+     result = model.transcribe(audio)
+
+     segments = []
+     for segment in result["segments"]:
+         segments.append({
+             "file_name": file_name,  # Ensure file_name is added
+             "text": segment["text"]
+         })
+     return segments
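Minor note: both functions above call whisper.load_model("base") on every invocation. If that turns out to be a bottleneck, the model could be cached at module level; a minimal sketch:

import whisper

_model = None

def get_model():
    # Lazily load and cache the Whisper model across calls
    global _model
    if _model is None:
        _model = whisper.load_model("base")
    return _model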
src/image_processor.py ADDED
@@ -0,0 +1,53 @@
+ import easyocr
+ import requests
+ import io
+ from PIL import Image
+ from typing import List, Dict, Any
+ import os
+ import numpy as np
+ from gradio_client import Client
+
+
+ def process_image_file(image_url: str) -> List[Dict[str, Any]]:
+     # Fetch the image content from the URL
+     response = requests.get(image_url)
+
+     # Check if the request was successful
+     if response.status_code == 200:
+         # Load the image from the response content using PIL
+         image_stream = io.BytesIO(response.content)
+         image = Image.open(image_stream)
+
+         # Convert the image to a NumPy array, which is supported by EasyOCR
+         image_np = np.array(image)
+
+         # Use EasyOCR to extract text from the image
+         reader = easyocr.Reader(['en'])
+         result = reader.readtext(image_np)
+
+         print("*" * 50 + image_url)
+
+         # Combine the extracted text from EasyOCR
+         extracted_text = "\n".join([detection[1] for detection in result])
+
+         if len(extracted_text.split()) < 5:
+             # Fall back to the BLIP model for image captioning when OCR finds little text
+             client = Client("HARISH20205/blip-image-caption")
+             caption_result = client.predict(image_url=image_url, api_name="/predict")
+             content = "\nImage Caption:\n" + str(caption_result)
+             return [{
+                 "file_name": os.path.basename(image_url),
+                 "text": content,
+             }]
+         # Format the content
+         content = "Image Data:\n" + extracted_text
+
+         return [{
+             "file_name": os.path.basename(image_url),
+             "text": content,
+         }]
+     else:
+         return [{
+             "file_name": os.path.basename(image_url),
+             "text": "Failed to retrieve image.",
+         }]
src/text_processor.py ADDED
@@ -0,0 +1,86 @@
+ import os
+ from typing import List, Dict, Any
+ import fitz  # PyMuPDF
+ import docx
+ import requests
+ import io
+
+ def process_text_file(file_url: str) -> List[Dict[str, Any]]:
+     _, extension = os.path.splitext(file_url)
+     extension = extension.lower()
+
+     if "?alt=media&token=" in extension:
+         extension = extension.split("?")[0]
+     if extension == '.txt':
+         return process_txt(file_url)
+     elif extension == '.pdf':
+         return process_pdf(file_url)
+     elif extension == '.docx':
+         return process_docx(file_url)
+     else:
+         raise ValueError(f"Unsupported text file type: {extension}")
+
+ def process_txt(txt_url: str) -> List[Dict[str, Any]]:
+     # Fetch the TXT file content from the URL
+     response = requests.get(txt_url)
+
+     # Check if the request was successful
+     if response.status_code == 200:
+         content = response.text
+         return [{
+             "file_name": os.path.basename(txt_url),
+             "text": content,
+             "page_number": 1
+         }]
+     else:
+         print(f"Failed to fetch the TXT file. Status code: {response.status_code}")
+         return []
+
+ def process_pdf(pdf_url: str) -> List[Dict[str, Any]]:
+     # Fetch the PDF file content from the URL
+     response = requests.get(pdf_url)
+
+     # Check if the request was successful
+     if response.status_code == 200:
+         # Load the PDF file from the response content
+         pdf_stream = io.BytesIO(response.content)
+
+         # Open the PDF file with PyMuPDF
+         pdf_document = fitz.open(stream=pdf_stream, filetype="pdf")
+
+         # Extract text from all pages
+         pdf_text = ""
+         for page_num in range(len(pdf_document)):
+             page = pdf_document.load_page(page_num)  # Load the page
+             pdf_text += page.get_text("text")  # Extract text from the page
+
+         return [{
+             "file_name": os.path.basename(pdf_url),
+             "text": pdf_text
+         }]
+     else:
+         print(f"Failed to fetch the PDF file. Status code: {response.status_code}")
+         return []
+
+ def process_docx(docx_url: str) -> List[Dict[str, Any]]:
+     # Fetch the DOCX file content from the URL
+     response = requests.get(docx_url)
+
+     # Check if the request was successful
+     if response.status_code == 200:
+         # Load the DOCX file from the response content
+         docx_stream = io.BytesIO(response.content)
+
+         # Open the DOCX file with python-docx
+         doc = docx.Document(docx_stream)
+
+         # Extract text from the DOCX file
+         content = "\n".join([para.text for para in doc.paragraphs])
+         return [{
+             "file_name": os.path.basename(docx_url),
+             "text": content,
+             "page_number": 1  # DOCX doesn't have pages, so just use 1
+         }]
+     else:
+         print(f"Failed to fetch the DOCX file. Status code: {response.status_code}")
+         return []
src/video_processor.py ADDED
@@ -0,0 +1,24 @@
+ import ffmpeg
+ import numpy as np
+ from src.audio_processor import process_audio_data
+ import os
+
+ def process_video_file(file_path: str):
+     # Use ffmpeg to extract audio from the video file
+     try:
+         out, _ = (
+             ffmpeg
+             .input(file_path)
+             .output('pipe:1', format='f32le', acodec='pcm_f32le', ac=1, ar='16k')
+             .run(capture_stdout=True, capture_stderr=True)
+         )
+     except ffmpeg.Error as e:
+         raise RuntimeError(f"Failed to extract audio from video: {e.stderr.decode()}") from e
+
+     # Convert the audio to the format Whisper expects
+     audio = np.frombuffer(out, np.float32).flatten()
+
+     # Pass the file name to the audio processor
+     file_name = os.path.basename(file_path)
+     result = process_audio_data(audio, file_name)
+     return result