Spaces:

surabhic
/

RAG-powered-Document-analyzer

Sleeping

App Files Files Community

surabhic committed on May 9

Commit

86a19ae

1 Parent(s): 4a32b59

Add code for RAG-powered Document Analyzer

Browse files

Files changed (4) hide show

.gitignore +47 -0
Dockerfile.txt +22 -0
app(gradio).py +191 -0
requirements.txt +9 -0

.gitignore ADDED Viewed

	@@ -0,0 +1,47 @@

+# Python bytecode, native extensions and packaging artifacts
+__pycache__/
+*.py[cod]
+*$py.class
+*.so
+.Python
+build/
+develop-eggs/
+dist/
+downloads/
+eggs/
+.eggs/
+lib/
+lib64/
+parts/
+sdist/
+var/
+*.egg-info/
+.installed.cfg
+*.egg
+# Virtual environments
+env/
+venv/
+ENV/
+# Local environment file
+.env
+# GPT4All models
+*.bin
+*.gguf
+# Application specific
+uploads/
+models/
+# Logs
+*.log

Dockerfile.txt ADDED Viewed

	@@ -0,0 +1,22 @@

+# Use a lightweight Python base image
+FROM python:3.10-slim
+# Set environment variables (key=value form; the space-separated form is legacy syntax)
+ENV PYTHONDONTWRITEBYTECODE=1
+ENV PYTHONUNBUFFERED=1
+# Make Gradio listen on all interfaces so the exposed port is reachable from outside the container
+ENV GRADIO_SERVER_NAME=0.0.0.0
+# Set working directory
+WORKDIR /code
+# Install dependencies first so this layer stays cached until requirements.txt changes
+COPY requirements.txt /code/
+RUN pip install --no-cache-dir --upgrade pip
+RUN pip install --no-cache-dir -r requirements.txt
+# Copy the application code
+COPY . /code/
+# Expose port
+EXPOSE 7860
+# Command to run the app
+# NOTE(review): this commit adds the script as "app(gradio).py" - confirm that an app.py exists in the image or update this path
+CMD ["python", "app.py"]

app(gradio).py ADDED Viewed

	@@ -0,0 +1,191 @@

+import os
+import re
+import logging
+import math
+import time
+from datetime import datetime
+from flask import Flask, render_template, request, jsonify, send_from_directory
+from werkzeug.utils import secure_filename
+import gradio as gr
+from typing import List, Dict, Optional
+from langchain.text_splitter import RecursiveCharacterTextSplitter
+from langchain_community.document_loaders import TextLoader, PyPDFLoader
+from langchain_community.vectorstores import FAISS
+from sentence_transformers import SentenceTransformer
+from gpt4all import GPT4All
+from transformers import pipeline
+import huggingface_hub
+# Ensure correct version of huggingface_hub is installed
+try:
+    if huggingface_hub.__version__ != '0.16.4':
+        raise ImportError("Wrong huggingface-hub version")
+except ImportError:
+    raise ImportError("Please install huggingface-hub==0.16.4")
+# Initialize Flask app (optional, for custom routes)
+app = Flask(__name__)
+# Set environment variables
+app.config['UPLOAD_FOLDER'] = 'uploads'
+app.config['ALLOWED_EXTENSIONS'] = {'pdf', 'txt'}
+app.config['MAX_CONTENT_LENGTH'] = 16 * 1024 * 1024  # 16MB max upload size
+os.makedirs(app.config['UPLOAD_FOLDER'], exist_ok=True)
+# Initialize models
+embedding_model = SentenceTransformer('all-MiniLM-L6-v2')
+llm = pipeline("text-generation", model="tiiuae/falcon-7b-instruct", device=0)
+# Initialize vector store
+vector_store = None
+# Configure logging
+logging.basicConfig(
+    filename='agent.log',
+    level=logging.INFO,
+    format='%(asctime)s - %(levelname)s - %(message)s'
+)
+# DocumentProcessor class
+class DocumentProcessor:
+    @staticmethod
+    def allowed_file(filename: str) -> bool:
+        return '.' in filename and filename.rsplit('.', 1)[1].lower() in app.config['ALLOWED_EXTENSIONS']
+    @staticmethod
+    def load_and_process_documents(file_paths: List[str]) -> List[str]:
+        documents = []
+        for file_path in file_paths:
+            try:
+                if file_path.endswith('.pdf'):
+                    loader = PyPDFLoader(file_path)
+                else:
+                    loader = TextLoader(file_path)
+                documents.extend(loader.load())
+            except Exception as e:
+                logging.error(f"Error loading {file_path}: {str(e)}")
+                continue
+        text_splitter = RecursiveCharacterTextSplitter(
+            chunk_size=1000,
+            chunk_overlap=200,
+            length_function=len,
+            separators=["\n\n", "\n", ".", " ", ""]
+        )
+        chunks = text_splitter.split_documents(documents)
+        return chunks
+    @staticmethod
+    def create_vector_store(chunks: List[str]) -> FAISS:
+        texts = [chunk.page_content for chunk in chunks]
+        embeddings = embedding_model.encode(texts, show_progress_bar=True)
+        global vector_store
+        vector_store = FAISS.from_embeddings(
+            text_embeddings=list(zip(texts, embeddings)),
+            embedding=embedding_model
+        )
+        return vector_store
+# QueryProcessor class
+class QueryProcessor:
+    @staticmethod
+    def retrieve_relevant_chunks(query: str, k: int = 3) -> List[str]:
+        if vector_store is None:
+            return []
+        query_embedding = embedding_model.encode([query])
+        docs = vector_store.similarity_search_by_vector(query_embedding[0], k=k)
+        return [doc.page_content for doc in docs]
+    @staticmethod
+    def generate_answer(query: str, context: str) -> str:
+        prompt = f"""You are a helpful AI assistant. Answer the question based on the context provided.
+        If the answer isn't in the context, say you don't know. Be concise but informative.
+Context:
+{context}
+Question: {query}
+Answer:"""
+        try:
+            response = llm.generate(prompt, max_tokens=1500, temp=0.7)
+            return response.strip()
+        except Exception as e:
+            logging.error(f"LLM generation error: {str(e)}")
+            return "I encountered an error while generating an answer."
+    @staticmethod
+    def calculate_expression(expression: str) -> str:
+        try:
+            safe_expr = re.sub(r'[^0-9+\-*/(). ]', '', expression)
+            result = eval(safe_expr, {'__builtins__': None}, {'math': math})
+            return str(result)
+        except Exception as e:
+            logging.error(f"Calculation error: {str(e)}")
+            return "I couldn't calculate that expression."
+    @staticmethod
+    def define_term(term: str) -> str:
+        definitions = {
+            "algorithm": "A set of rules or steps used to solve a problem or perform a computation.",
+            "api": "Application Programming Interface - a set of protocols for building and integrating software.",
+            "database": "An organized collection of structured information or data.",
+            "rag": "Retrieval-Augmented Generation - combines information retrieval with text generation.",
+            "llm": "Large Language Model - an AI model trained on vast amounts of text data."
+        }
+        return definitions.get(term.lower(), f"I don't have a definition for '{term}'.")
+    @staticmethod
+    def route_query(query: str) -> Dict:
+        logging.info(f"Routing query: {query}")
+        # Check for calculation requests
+        if any(word in query.lower() for word in ['calculate', 'compute', 'math', 'solve', 'what is']):
+            match = re.search(r'([\d+\-*/(). ]+)', query)
+            if match:
+                expression = match.group(1)
+                result = QueryProcessor.calculate_expression(expression)
+                logging.info(f"Used calculator for expression: {expression}")
+                return {
+                    "tool": "calculator",
+                    "result": result,
+                    "context": None
+                }
+        # Check for definition requests
+        if any(word in query.lower() for word in ['define', 'definition', 'what is a']):
+            match = re.search(r'(?:define|what is a?) (.+?)(?:\?|$)', query.lower())
+            if match:
+                term = match.group(1)
+                result = QueryProcessor.define_term(term)
+                logging.info(f"Used dictionary for term: {term}")
+                return {
+                    "tool": "dictionary",
+                    "result": result,
+                    "context": None
+                }
+        # Default to RAG pipeline
+        context_chunks = QueryProcessor.retrieve_relevant_chunks(query)
+        context = "\n\n".join(context_chunks) if context_chunks else "No relevant context found."
+        answer = QueryProcessor.generate_answer(query, context)
+        logging.info(f"Used RAG pipeline with {len(context_chunks)} context chunks")
+        return {
+            "tool": "RAG",
+            "result": answer,
+            "context": context_chunks
+        }
+# Gradio interface
+def query_function(query):
+    response = QueryProcessor.route_query(query)
+    return response['result']
+# Gradio setup
+interface = gr.Interface(fn=query_function, inputs="text", outputs="text", live=True)
+if __name__ == '__main__':
+    interface.launch()

requirements.txt ADDED Viewed

	@@ -0,0 +1,9 @@

+# NOTE(review): app(gradio).py in this commit also imports gradio, gpt4all and
+# langchain_community, none of which are listed below - confirm the runtime
+# provides them or add them here.
+# NOTE(review): the app loads PDFs through langchain's PyPDFLoader; verify
+# whether that loader needs the `pypdf` package and not the PyPDF2 pin below.
+flask==2.3.2
+sentence-transformers==2.2.2
+faiss-cpu==1.7.4
+langchain==0.0.346
+PyPDF2==3.0.1
+transformers
+# Exact pin: app(gradio).py raises ImportError at startup for any other version.
+huggingface-hub==0.16.4
+accelerate
+numpy==1.24.3