# JSearch / app.py
# demoPOC's picture
# Update app.py
# 5a35d7e
# raw
# history blame
# 7.42 kB
import openai
import os
openai.api_key=os.getenv("OPENAI_API_KEY")
from dotenv import load_dotenv
load_dotenv()
from flask import Flask, jsonify, render_template, request
import requests, json
# import nltk
# nltk.download("punkt")
import shutil
from werkzeug.utils import secure_filename
from werkzeug.datastructures import FileStorage
import nltk
from datetime import datetime
import openai
from langchain.llms import OpenAI
from langchain.embeddings.openai import OpenAIEmbeddings
#from langchain.embeddings.sentence_transformer import SentenceTransformerEmbeddings
from langchain.document_loaders import SeleniumURLLoader, PyPDFLoader
from langchain.vectorstores import Chroma
from langchain.text_splitter import RecursiveCharacterTextSplitter
from langchain.chains import VectorDBQA
from langchain.document_loaders import UnstructuredFileLoader, TextLoader
from langchain import PromptTemplate
from langchain.chains import RetrievalQA
from langchain.memory import ConversationBufferWindowMemory
import warnings
warnings.filterwarnings("ignore")
# Module-level setup: configure the OpenAI client, create the Flask app and
# make sure the upload directory exists before any request is served.
# os.environ[...] raises KeyError at import time when OPENAI_API_KEY is unset.
openai.api_key = os.environ["OPENAI_API_KEY"]
# The app is created with template_folder="./" instead of the plain
# Flask(__name__) form that was used previously.
app = Flask(__name__, template_folder="./")
# Create a directory in a known location to save files to.
uploads_dir = os.path.join(app.root_path,'static', 'searchUploads')
# exist_ok=True keeps repeated start-ups from failing on an existing folder.
os.makedirs(uploads_dir, exist_ok=True)
def pretty_print_docs(docs):
    """Print a human-readable dump of document chunks to stdout.

    For every entry the 1-based position, the length of its text, its source
    and the text itself are printed; entries are separated by a line of 100
    dashes.

    Args:
        docs: Iterable of objects exposing a ``page_content`` string and a
            ``metadata`` mapping. A missing ``"source"`` entry is shown as
            ``<unknown>``.
    """
    separator = f"\n{'-' * 100}\n"
    sections = []
    for position, doc in enumerate(docs, start=1):
        # This is only a debugging aid: a chunk without a "source" entry (or
        # with a non-string one) must not abort the request that called us.
        source = (doc.metadata or {}).get("source", "<unknown>")
        sections.append(
            f"Document {position}:\n\n"
            f"Document Length>>>{len(doc.page_content)}"
            f"\n\nDocument Source>>> {source}"
            f"\n\nContent>>> {doc.page_content}"
        )
    print(separator.join(sections))
def getEmbeddingModel(embeddingId):
    """Return the embedding model used by the application.

    ``embeddingId`` is accepted but not consulted: a fresh
    ``OpenAIEmbeddings`` instance is returned whatever its value. Selecting
    another backend by id is not implemented.
    """
    embedding_model = OpenAIEmbeddings()
    return embedding_model
def clearKBUploadDirectory(uploads_dir):
    """Remove every entry inside ``uploads_dir``, keeping the directory itself.

    Regular files and symlinks are unlinked, sub-directories are removed
    recursively. Deletion is best effort: a failure on one entry is printed
    and the remaining entries are still processed.
    """
    for entry_name in os.listdir(uploads_dir):
        target = os.path.join(uploads_dir, entry_name)
        print("Clearing Doc Directory. Trying to delete" + target)
        try:
            if os.path.isfile(target) or os.path.islink(target):
                os.unlink(target)
                continue
            if os.path.isdir(target):
                shutil.rmtree(target)
        except Exception as err:
            print('Failed to delete %s. Reason: %s' % (target, err))
@app.route('/', methods=['GET'])
def test():
    """Answer ``GET /`` with a fixed plain-text greeting."""
    greeting = "Docker hello"
    return greeting
@app.route('/KBUploader')
def KBUpload():
    """Render the ``KBTrain.html`` template for ``/KBUploader``."""
    page = render_template("KBTrain.html")
    return page
@app.route('/aiassist')
def aiassist():
    """Render the ``index.html`` template for ``/aiassist``."""
    page = render_template("index.html")
    return page
@app.route('/post_json', methods=['POST'])
def post_json():
    """Run a similarity search for the posted query against the vector store.

    Expects a JSON body containing a ``query`` string. On success responds
    with ``{"botMessage": [...]}`` where each element describes one of the
    top three matches through the keys ``documentSource``, ``pageContent``
    and ``similarityScore`` (the score converted to a string).

    Returns:
        The JSON response described above; the plain text
        ``'Content-Type not supported!'`` when the request is not JSON; a
        400 JSON error when ``query`` is missing or not a string; a 409 JSON
        error when no vector store has been built yet.
    """
    print(f"\n{'*' * 100}\n")
    print("Request Received >>>>>>>>>>>>>>>>>>", datetime.now().strftime("%H:%M:%S"))

    # request.is_json looks at the mimetype only, so headers such as
    # "application/json; charset=utf-8" are accepted as well.
    if not request.is_json:
        return 'Content-Type not supported!'

    # silent=True: a malformed body yields None and is reported below.
    requestQuery = request.get_json(silent=True)
    query = requestQuery.get('query') if isinstance(requestQuery, dict) else None
    if not isinstance(query, str):
        return jsonify(botMessage=[], error="Request body must contain a 'query' string."), 400

    # The module-level ``vectordb`` only exists once file_Upload() has run;
    # look it up defensively instead of failing with a NameError.
    store = globals().get('vectordb')
    if store is None:
        return jsonify(botMessage=[], error="No knowledge base has been uploaded yet."), 409

    print()
    # NOTE(review): distance_metric="cos" is passed through unchanged from the
    # original call - confirm the vector store actually honours this argument.
    relevantDoc = store.similarity_search_with_score(query, distance_metric="cos", k=3)

    searchResultArray = []
    for match in relevantDoc:
        # Each match is a (document, score) pair.
        document, score = match
        searchResult = {
            'documentSource': document.metadata['source'],
            'pageContent': document.page_content,
            'similarityScore': str(score),
        }
        print(f"\n{'-' * 100}\n")
        print(match)
        print("Document Source>>>>>> " + searchResult['documentSource'] + "\n\n")
        print("Page Content>>>>>> " + searchResult['pageContent'] + "\n\n")
        print("Similarity Score>>>> " + searchResult['similarityScore'])
        print(f"\n{'-' * 100}\n")
        searchResultArray.append(searchResult)
    print(f"\n{'*' * 100}\n")
    return jsonify(botMessage=searchResultArray)
@app.route('/file_upload', methods=['POST'])
def file_Upload():
    """Rebuild the global vector store from uploaded PDFs and/or web pages.

    Reads the multipart field ``files[]`` (loaded with ``PyPDFLoader``) and
    the form field ``weburl`` (a ``;``-separated list of URLs loaded with
    ``SeleniumURLLoader``). The loaded documents are split into overlapping
    chunks, embedded with ``OpenAIEmbeddings`` and stored in a new Chroma
    collection that replaces the module-level ``vectordb``.

    When at least one file is supplied the upload directory is emptied first.

    Returns:
        The rendered ``index.html`` page on success, or a plain-text message
        with status 400 when nothing indexable was supplied.
    """
    global vectordb

    # Missing form fields simply count as "not provided" instead of raising
    # IndexError; browsers send an empty-named part for an untouched input.
    uploaded_files = [f for f in request.files.getlist('files[]') if f.filename]
    # A trailing ';' or stray blanks would otherwise hand empty URLs to the loader.
    urlList = [u.strip() for u in request.form.get('weburl', '').split(';') if u.strip()]
    fileprovided = bool(uploaded_files)
    urlProvided = bool(urlList)
    print("*******")
    print("File Provided:" + str(fileprovided))
    print("URL Provided:" + str(urlProvided))
    print("*******")

    documents = []
    if fileprovided:
        # Delete files left over from the previous upload.
        clearKBUploadDirectory(uploads_dir)
        # Read the newly provided files.
        for file in uploaded_files:
            print("File Received>>>" + file.filename)
            safe_name = secure_filename(file.filename)
            if not safe_name:
                # secure_filename() can reduce a name to ""; saving to the
                # bare directory path would fail, so skip the file.
                print("Skipping file with unusable name>>>" + file.filename)
                continue
            saved_path = os.path.join(uploads_dir, safe_name)
            file.save(saved_path)
            loader = PyPDFLoader(saved_path)
            documents.extend(loader.load())

    if urlProvided:
        print(urlList)
        print("Selenium Started", datetime.now().strftime("%H:%M:%S"))
        urlLoader = SeleniumURLLoader(urlList)
        documents.extend(urlLoader.load())
        # Logged after load() so the timestamp brackets the actual fetch.
        print("Selenium Completed", datetime.now().strftime("%H:%M:%S"))

    print(uploads_dir)
    text_splitter = RecursiveCharacterTextSplitter(chunk_size=500, chunk_overlap=150)
    texts = text_splitter.split_documents(documents)
    if not texts:
        # Nothing to index: keep the existing store rather than attempting to
        # build a collection from an empty list.
        return "No file or URL content was provided to index.", 400

    print("All chunk List START ***********************\n\n")
    pretty_print_docs(texts)
    print("All chunk List END ***********************\n\n")

    embeddings = OpenAIEmbeddings()
    # "hnsw:space": "cosine" is passed as collection metadata to select the
    # distance function for this collection.
    vectordb = Chroma.from_documents(
        documents=texts,
        embedding=embeddings,
        collection_metadata={"hnsw:space": "cosine"},
    )
    return render_template("index.html")
if __name__ == '__main__':
app.run(host='0.0.0.0', port=int(os.environ.get('PORT', 7860)))