import os
from langchain.document_loaders import (
    PyPDFLoader,
    TextLoader,
    Docx2txtLoader
)
from langchain.text_splitter import RecursiveCharacterTextSplitter
from langchain_google_genai import GoogleGenerativeAIEmbeddings, ChatGoogleGenerativeAI
import google.generativeai as genai
from langchain.vectorstores import FAISS
from langchain.chains.question_answering import load_qa_chain
from langchain.prompts import PromptTemplate
from langchain.memory import ConversationBufferMemory
from dotenv import load_dotenv
from src.agent import build_qa_chain
import gradio as gr

# Load environment variables and configure the Gemini client.
load_dotenv()
genai.configure(api_key=os.getenv("GOOGLE_API_KEY"))
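# The key is read from a .env file at the project root; a minimal example
# (the value shown is a placeholder, not a real key):
#   GOOGLE_API_KEY=your-google-api-key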
# Simple container holding the QA chain and the currently active vector store.
class AgentChain:
    def __init__(self):
        self.agent = None
        self.db = None

agent_chain = AgentChain()
agent_chain.agent = build_qa_chain()
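# For reference only: a minimal sketch of what src/agent.py's build_qa_chain
# might look like, inferred from the imports above and from the call in
# answer_query below, which passes "input_documents" and "human_input" to the
# chain. The prompt wording and model name here are assumptions, not the
# original code.
def build_qa_chain_sketch():
    prompt = PromptTemplate(
        template=(
            "Answer the question using the provided context.\n\n"
            "Context:\n{context}\n\n"
            "Chat history:\n{chat_history}\n\n"
            "Question: {human_input}\nAnswer:"
        ),
        input_variables=["context", "chat_history", "human_input"],
    )
    memory = ConversationBufferMemory(memory_key="chat_history", input_key="human_input")
    model = ChatGoogleGenerativeAI(model="gemini-pro", temperature=0.3)
    return load_qa_chain(model, chain_type="stuff", memory=memory, prompt=prompt)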
def extract_text_from_files(docs):
    """Load every PDF, DOCX/DOC, and TXT file found in a directory."""
    documents = []
    files = os.listdir(docs)
    if len(files) == 0:
        # Nothing to load; return an empty list so callers always get documents.
        return documents
    for file in files:
        if file.endswith(".pdf"):
            pdf_path = os.path.join(docs, file)
            loader = PyPDFLoader(pdf_path)
            documents.extend(loader.load())
        elif file.endswith(".docx") or file.endswith(".doc"):
            # Note: docx2txt targets .docx; legacy .doc files may not parse.
            doc_path = os.path.join(docs, file)
            loader = Docx2txtLoader(doc_path)
            documents.extend(loader.load())
        elif file.endswith(".txt"):
            text_path = os.path.join(docs, file)
            loader = TextLoader(text_path)
            documents.extend(loader.load())
    return documents
def extract_text_from_file(file):
    """Load a single uploaded PDF, DOCX/DOC, or TXT file."""
    documents = []
    filename = str(file)
    if filename.endswith(".pdf"):
        loader = PyPDFLoader(file)
        documents.extend(loader.load())
    elif filename.endswith(".docx") or filename.endswith(".doc"):
        loader = Docx2txtLoader(file)
        documents.extend(loader.load())
    elif filename.endswith(".txt"):
        loader = TextLoader(file)
        documents.extend(loader.load())
    print("Text extracted")
    return documents
def get_text_chunks(text):
    """Split loaded documents into overlapping chunks for embedding."""
    text_splitter = RecursiveCharacterTextSplitter(chunk_size=10000, chunk_overlap=1000)
    chunks = text_splitter.split_documents(text)
    print("Text split into chunks")
    return chunks
def save_in_faiss(text_chunks, save=False):
    """Embed the chunks with Gemini embeddings and build a FAISS index."""
    embeddings = GoogleGenerativeAIEmbeddings(model="models/embedding-001")
    vector_store = FAISS.from_documents(text_chunks, embedding=embeddings)
    if save:
        vector_store.save_local("faiss_index")
    print("Document search created")
    return vector_store
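# Complementary sketch: reload a previously saved index. Newer langchain /
# langchain-community releases require allow_dangerous_deserialization=True
# when loading a pickled local index; older releases do not accept the argument.
def load_faiss_index(path="faiss_index"):
    embeddings = GoogleGenerativeAIEmbeddings(model="models/embedding-001")
    return FAISS.load_local(path, embeddings, allow_dangerous_deserialization=True)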
def process_files(file):
    """Gradio callback: index an uploaded file and store its vector store."""
    documents = extract_text_from_file(file)
    text_chunks = get_text_chunks(documents)
    vector_store = save_in_faiss(text_chunks)
    agent_chain.db = vector_store
    gr.Info("Processing completed")
    return file
def answer_query(message, history):
    """Gradio chat callback: retrieve relevant chunks and run the QA chain."""
    if agent_chain.db is not None:
        docs = agent_chain.db.similarity_search(message)
    else:
        # No document has been processed yet; run the chain without context.
        docs = []
    response = agent_chain.agent({"input_documents": docs, "human_input": message}, return_only_outputs=True)
    return response["output_text"]
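# The original UI layout is not shown here; this is a minimal sketch of the
# Gradio wiring these callbacks imply: a file upload that triggers
# process_files, plus a chat interface backed by answer_query.
if __name__ == "__main__":
    with gr.Blocks() as demo:
        file_box = gr.File(label="Upload a PDF, DOCX, or TXT file")
        file_box.upload(process_files, inputs=file_box, outputs=file_box)
        gr.ChatInterface(fn=answer_query)
    demo.launch()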