surabhic committed on
Commit 86a19ae · 1 Parent(s): 4a32b59

Add code for RAG-powered Document Analyzer

Files changed (4)
  1. .gitignore +47 -0
  2. Dockerfile.txt +22 -0
  3. app(gradio).py +191 -0
  4. requirements.txt +9 -0
.gitignore ADDED
@@ -0,0 +1,35 @@
+ # Python
+ __pycache__/
+ *.py[cod]
+ *$py.class
+ *.so
+ .Python
+ env/
+ build/
+ develop-eggs/
+ dist/
+ downloads/
+ eggs/
+ .eggs/
+ lib/
+ lib64/
+ parts/
+ sdist/
+ var/
+ *.egg-info/
+ .installed.cfg
+ *.egg
+
+ # Virtual Environment
+ venv/
+ ENV/
+
+ # GPT4All models
+ *.bin
+ *.gguf
+
+ # Application specific
+ uploads/
+ models/
+ *.log
+ .env
Dockerfile.txt ADDED
@@ -0,0 +1,22 @@
+ # Use a lightweight Python base image
+ FROM python:3.10-slim
+
+ # Set environment variables
+ ENV PYTHONDONTWRITEBYTECODE=1
+ ENV PYTHONUNBUFFERED=1
+
+ # Set working directory
+ WORKDIR /code
+
+ # Copy files
+ COPY . /code/
+
+ # Install dependencies
+ RUN pip install --upgrade pip
+ RUN pip install -r requirements.txt
+
+ # Expose port
+ EXPOSE 7860
+
+ # Command to run the app (matches the committed filename)
+ CMD ["python", "app(gradio).py"]
app(gradio).py ADDED
@@ -0,0 +1,205 @@
+ import os
+ import re
+ import logging
+ import math
+ from typing import Dict, List
+
+ import gradio as gr
+ from flask import Flask
+ from langchain.text_splitter import RecursiveCharacterTextSplitter
+ from langchain.schema import Document
+ from langchain_community.document_loaders import TextLoader, PyPDFLoader
+ from langchain_community.vectorstores import FAISS
+ from sentence_transformers import SentenceTransformer
+ from transformers import pipeline
+ import huggingface_hub
+
+ # Ensure the pinned version of huggingface_hub is installed
+ if huggingface_hub.__version__ != '0.16.4':
+     raise ImportError("Please install huggingface-hub==0.16.4")
+
+ # Flask app is kept as a config holder (and for optional custom routes)
+ app = Flask(__name__)
+ app.config['UPLOAD_FOLDER'] = 'uploads'
+ app.config['ALLOWED_EXTENSIONS'] = {'pdf', 'txt'}
+ app.config['MAX_CONTENT_LENGTH'] = 16 * 1024 * 1024  # 16 MB max upload size
+ os.makedirs(app.config['UPLOAD_FOLDER'], exist_ok=True)
+
+ # Initialize models: MiniLM for embeddings, Falcon-7B-Instruct for generation
+ # (device=0 assumes a GPU; use device=-1 to run the pipeline on CPU)
+ embedding_model = SentenceTransformer('all-MiniLM-L6-v2')
+ llm = pipeline("text-generation", model="tiiuae/falcon-7b-instruct", device=0)
+
+ # Vector store is populated once documents are uploaded and indexed
+ vector_store = None
+
+ # Configure logging
+ logging.basicConfig(
+     filename='agent.log',
+     level=logging.INFO,
+     format='%(asctime)s - %(levelname)s - %(message)s'
+ )
+
+ class DocumentProcessor:
+     @staticmethod
+     def allowed_file(filename: str) -> bool:
+         return '.' in filename and filename.rsplit('.', 1)[1].lower() in app.config['ALLOWED_EXTENSIONS']
+
+     @staticmethod
+     def load_and_process_documents(file_paths: List[str]) -> List[Document]:
+         documents = []
+         for file_path in file_paths:
+             try:
+                 if file_path.endswith('.pdf'):
+                     loader = PyPDFLoader(file_path)
+                 else:
+                     loader = TextLoader(file_path)
+                 documents.extend(loader.load())
+             except Exception as e:
+                 logging.error(f"Error loading {file_path}: {str(e)}")
+                 continue
+
+         text_splitter = RecursiveCharacterTextSplitter(
+             chunk_size=1000,
+             chunk_overlap=200,
+             length_function=len,
+             separators=["\n\n", "\n", ".", " ", ""]
+         )
+         return text_splitter.split_documents(documents)
+
+     @staticmethod
+     def create_vector_store(chunks: List[Document]) -> FAISS:
+         texts = [chunk.page_content for chunk in chunks]
+         embeddings = embedding_model.encode(texts, show_progress_bar=True)
+
+         global vector_store
+         # Embeddings are precomputed above, so queries are embedded manually
+         # in retrieve_relevant_chunks rather than via a LangChain Embeddings object
+         vector_store = FAISS.from_embeddings(
+             text_embeddings=list(zip(texts, embeddings)),
+             embedding=embedding_model
+         )
+         return vector_store
+
+ class QueryProcessor:
+     @staticmethod
+     def retrieve_relevant_chunks(query: str, k: int = 3) -> List[str]:
+         if vector_store is None:
+             return []
+
+         query_embedding = embedding_model.encode([query])
+         docs = vector_store.similarity_search_by_vector(query_embedding[0].tolist(), k=k)
+         return [doc.page_content for doc in docs]
+
+     @staticmethod
+     def generate_answer(query: str, context: str) -> str:
+         prompt = f"""You are a helpful AI assistant. Answer the question based on the context provided.
+ If the answer isn't in the context, say you don't know. Be concise but informative.
+
+ Context:
+ {context}
+
+ Question: {query}
+ Answer:"""
+
+         try:
+             # transformers pipeline call; return only the completion, not the prompt
+             response = llm(prompt, max_new_tokens=512, temperature=0.7,
+                            do_sample=True, return_full_text=False)
+             return response[0]['generated_text'].strip()
+         except Exception as e:
+             logging.error(f"LLM generation error: {str(e)}")
+             return "I encountered an error while generating an answer."
+
+     @staticmethod
+     def calculate_expression(expression: str) -> str:
+         try:
+             # Strip everything but digits and arithmetic operators before eval
+             safe_expr = re.sub(r'[^0-9+\-*/(). ]', '', expression)
+             result = eval(safe_expr, {'__builtins__': None}, {'math': math})
+             return str(result)
+         except Exception as e:
+             logging.error(f"Calculation error: {str(e)}")
+             return "I couldn't calculate that expression."
+
+     @staticmethod
+     def define_term(term: str) -> str:
+         definitions = {
+             "algorithm": "A set of rules or steps used to solve a problem or perform a computation.",
+             "api": "Application Programming Interface - a set of protocols for building and integrating software.",
+             "database": "An organized collection of structured information or data.",
+             "rag": "Retrieval-Augmented Generation - combines information retrieval with text generation.",
+             "llm": "Large Language Model - an AI model trained on vast amounts of text data."
+         }
+         return definitions.get(term.lower(), f"I don't have a definition for '{term}'.")
+
+     @staticmethod
+     def route_query(query: str) -> Dict:
+         logging.info(f"Routing query: {query}")
+
+         # Check for calculation requests (require a digit so plain "what is ..."
+         # questions fall through to the dictionary or RAG tools)
+         if any(word in query.lower() for word in ['calculate', 'compute', 'math', 'solve', 'what is']):
+             match = re.search(r'([\d+\-*/(). ]+)', query)
+             if match and any(ch.isdigit() for ch in match.group(1)):
+                 expression = match.group(1)
+                 result = QueryProcessor.calculate_expression(expression)
+                 logging.info(f"Used calculator for expression: {expression}")
+                 return {
+                     "tool": "calculator",
+                     "result": result,
+                     "context": None
+                 }
+
+         # Check for definition requests
+         if any(word in query.lower() for word in ['define', 'definition', 'what is a']):
+             match = re.search(r'(?:define|what is a?) (.+?)(?:\?|$)', query.lower())
+             if match:
+                 term = match.group(1)
+                 result = QueryProcessor.define_term(term)
+                 logging.info(f"Used dictionary for term: {term}")
+                 return {
+                     "tool": "dictionary",
+                     "result": result,
+                     "context": None
+                 }
+
+         # Default to RAG pipeline
+         context_chunks = QueryProcessor.retrieve_relevant_chunks(query)
+         context = "\n\n".join(context_chunks) if context_chunks else "No relevant context found."
+         answer = QueryProcessor.generate_answer(query, context)
+
+         logging.info(f"Used RAG pipeline with {len(context_chunks)} context chunks")
+         return {
+             "tool": "RAG",
+             "result": answer,
+             "context": context_chunks
+         }
+
+ # Gradio handlers
+ def query_function(query):
+     response = QueryProcessor.route_query(query)
+     return response['result']
+
+ def upload_function(files):
+     # Index uploaded documents so the RAG pipeline has context to retrieve
+     if not files:
+         return "No files uploaded."
+     paths = [getattr(f, "name", f) for f in files]  # path str or tempfile, depending on Gradio version
+     chunks = DocumentProcessor.load_and_process_documents(paths)
+     DocumentProcessor.create_vector_store(chunks)
+     return f"Indexed {len(chunks)} chunks from {len(paths)} file(s)."
+
+ # Gradio setup: upload documents first, then ask questions about them
+ with gr.Blocks() as interface:
+     gr.Markdown("# RAG-powered Document Analyzer")
+     file_input = gr.File(label="Upload documents (.pdf / .txt)", file_count="multiple")
+     upload_status = gr.Textbox(label="Index status", interactive=False)
+     file_input.upload(upload_function, inputs=file_input, outputs=upload_status)
+     query_box = gr.Textbox(label="Question")
+     answer_box = gr.Textbox(label="Answer", interactive=False)
+     query_box.submit(query_function, inputs=query_box, outputs=answer_box)
+
+ if __name__ == '__main__':
+     # Bind to 0.0.0.0:7860 so the app is reachable inside the Docker container
+     interface.launch(server_name="0.0.0.0", server_port=7860)
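
For reference, a minimal end-to-end sketch of the indexing and routing path above. This is hypothetical: it assumes the module is saved under an importable name such as app.py (the parentheses in app(gradio).py prevent a normal import) and that the models have finished loading; sample.txt is a made-up fixture.

# Hypothetical smoke test: index one small text file, then route sample queries.
# Assumes the application module is importable as `app` (i.e. renamed to app.py).
from app import DocumentProcessor, QueryProcessor

with open("sample.txt", "w") as f:  # sample.txt is a made-up fixture
    f.write("RAG combines retrieval with generation. FAISS stores embeddings.")

chunks = DocumentProcessor.load_and_process_documents(["sample.txt"])
DocumentProcessor.create_vector_store(chunks)

for q in ["calculate 2 + 3 * 4", "define rag", "What does FAISS store?"]:
    out = QueryProcessor.route_query(q)
    print(f"{q!r} -> tool={out['tool']}: {out['result'][:80]}")

The three queries exercise each branch of route_query: the first hits the calculator, the second the dictionary, and the third falls through to the RAG pipeline.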
requirements.txt ADDED
@@ -0,0 +1,11 @@
+ flask==2.3.2
+ gradio
+ sentence-transformers==2.2.2
+ faiss-cpu==1.7.4
+ langchain==0.0.346
+ langchain-community
+ pypdf
+ transformers
+ huggingface-hub==0.16.4
+ accelerate
+ numpy==1.24.3
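
Since the app hard-fails at import time on a huggingface-hub version mismatch, checking the installed pins up front can save a model-download round trip. A small sketch using the standard library's importlib.metadata; the pin list mirrors this requirements file:

# Sketch: confirm the pinned packages resolved to the expected versions.
from importlib.metadata import version

pins = {
    "flask": "2.3.2",
    "sentence-transformers": "2.2.2",
    "faiss-cpu": "1.7.4",
    "langchain": "0.0.346",
    "huggingface-hub": "0.16.4",
    "numpy": "1.24.3",
}
for pkg, expected in pins.items():
    got = version(pkg)
    status = "ok" if got == expected else f"MISMATCH (want {expected})"
    print(f"{pkg}=={got}: {status}")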