# NOTE(review): The lines originally here were Hugging Face Spaces viewer
# scrape artifacts (status text, commit hashes, and a gutter of line numbers),
# not Python source. They have been neutralized into this comment so the file
# parses.
import gradio as gr
from langchain.document_loaders import PyPDFLoader
#from langchain.document_loaders import TextLoader
#from langchain.document_loaders import UnstructuredPDFLoader
from langchain.text_splitter import RecursiveCharacterTextSplitter
#from langchain.text_splitter import CharacterTextSplitter
from langchain.vectorstores import Pinecone
import pinecone
import requests
from langchain.chains.question_answering import load_qa_chain
from langchain import PromptTemplate
from langchain import HuggingFaceHub
from PyPDF2 import PdfReader
from pathlib import Path
from time import sleep
import sys
import os
import random
import string
from dotenv import load_dotenv
load_dotenv()
#from sentence_transformers.util import semantic_search
#import pandas as pd
#import torch
def generate_random_string(length):
    """Return a random string of `length` lowercase ASCII letters.

    Used to build a unique Pinecone namespace for this run of the app.
    """
    # random.choices samples with replacement in a single call, avoiding the
    # original's generator loop with an unused index variable.
    return ''.join(random.choices(string.ascii_lowercase, k=length))
# Per-run namespace suffix so repeated launches don't collide in Pinecone.
random_string = generate_random_string(10)

# PDF to index; expected to sit next to this script.
file_path = os.path.join(os.getcwd(), "60LEADERSONAI.pdf")

# Extract the text of every page. Collect chunks in a list and join once —
# the original's repeated `raw_text +=` is quadratic in the number of pages.
data = PdfReader(file_path)
page_texts = []
for page in data.pages:
    text = page.extract_text()
    if text:  # skip pages with no extractable text (extract_text can yield '')
        page_texts.append(text)
raw_text = ''.join(page_texts)

# Split into overlapping ~1000-char chunks for embedding; the overlap strides
# over chunk boundaries so no sentence is lost to a hard cut.
text_splitter = RecursiveCharacterTextSplitter(
    chunk_size=1000,
    chunk_overlap=100,
    length_function=len,
)
db_texts = text_splitter.split_text(raw_text)
class HFEmbeddings:
    """Minimal embeddings client for the Hugging Face Inference API.

    Exposes `embed_documents` (the LangChain-style entry point) and is also
    callable directly; both POST the texts to a feature-extraction endpoint
    and return the parsed JSON vectors.
    """

    def __init__(self, api_url, headers):
        self.api_url = api_url      # feature-extraction pipeline endpoint
        self.headers = headers      # carries the Bearer auth token

    def get_embeddings(self, texts):
        """POST `texts` to the endpoint; wait for the model if it is cold."""
        payload = {"inputs": texts, "options": {"wait_for_model": True}}
        resp = requests.post(self.api_url, headers=self.headers, json=payload)
        return resp.json()

    def embed_documents(self, texts):
        """LangChain-compatible alias; delegates to get_embeddings()."""
        return self.get_embeddings(texts)

    def __call__(self, texts):
        return self.embed_documents(texts)
# --- Hugging Face configuration (all values read from .env via load_dotenv) ---
HUGGINGFACEHUB_API_TOKEN = os.getenv('HUGGINGFACEHUB_API_TOKEN')
model_id = os.getenv('model_id')  # embedding model served by the Inference API
hf_token = os.getenv('hf_token')
repo_id = os.getenv('repo_id')  # generation model used by HuggingFaceHub below
# Feature-extraction endpoint for `model_id`; HFEmbeddings POSTs texts here.
api_url = f"https://api-inference.huggingface.co/pipeline/feature-extraction/{model_id}"
headers = {"Authorization": f"Bearer {hf_token}"}
hf_embeddings = HFEmbeddings(api_url, headers)
# --- Pinecone configuration ---
PINECONE_API_KEY = os.getenv('PINECONE_API_KEY')
PINECONE_ENVIRONMENT = os.getenv('PINECONE_ENVIRONMENT')
PINECONE_INDEX_NAME = os.getenv('PINECONE_INDEX_NAME')
print(PINECONE_INDEX_NAME)
pinecone.init(api_key=PINECONE_API_KEY, environment=PINECONE_ENVIRONMENT)
index_name = PINECONE_INDEX_NAME
#index_name = pinecone.Index(index_name)
print("index_name:"+index_name)
# Random per-run namespace isolates this session's vectors inside the index.
namespace = random_string
print("namespace:"+namespace)
# Embed every chunk and upsert into Pinecone (network call; happens at import).
vector_db = Pinecone.from_texts(db_texts, hf_embeddings, index_name=index_name, namespace=namespace)
#vector_db = Pinecone.from_texts([t.page_content for t in db_texts], hf_embeddings, index_name=index_name, namespace=namespace)
#docsearch = Pinecone.from_texts([t.page_content for t in texts], embeddings, index_name=index_name, namespace=namespace)
print("***********************************")
print("Pinecone Vector/Embedding DB Ready.")
# Diagnostics: confirm the index exists and report its stats before serving QA.
index_name_extracted=pinecone.list_indexes()
print("index_name_extracted:"+str(index_name_extracted))
print(index_name_extracted)
index_current = pinecone.Index(index_name=index_name)
print("index_current:"+str(index_current))
print(index_current)
index_status=index_current.describe_index_stats()
print("index_status:"+str(index_status))
print(index_status)
print("namespace:"+namespace)
print(namespace)
print("**************Ready for QA*********************")
# QA language model; low temperature + top-k/top-p sampling keeps answers
# close to the retrieved context.
llm = HuggingFaceHub(repo_id=repo_id,
                     model_kwargs={"min_length": 100,
                                   "max_new_tokens": 1024, "do_sample": True,
                                   "temperature": 0.1,
                                   "top_k": 50,
                                   "top_p": 0.95, "eos_token_id": 49155})
# Prompt grounding the model strictly in the retrieved context.
# Typos fixed vs. the original template: "givens context" -> "given context",
# "Repsonse" -> "Response".
prompt_template = """You are a very helpful AI assistant. Please ONLY use the given context to answer the user's input question. If you don't know the answer, just say that you don't know.
Context: {context}
Question: {question}
Helpful AI Response:
"""
PROMPT = PromptTemplate(template=prompt_template, input_variables=["context", "question"])
# "stuff" chain: all retrieved documents are concatenated into one prompt.
chain = load_qa_chain(llm=llm, chain_type="stuff", prompt=PROMPT)
def run_chain(user_query):
    """Answer `user_query` against the indexed PDF via similarity search + QA chain.

    Returns the cleaned answer string, or None for blank/whitespace-only input
    (matching the original's implicit-None behavior on invalid input).
    """
    # Re-init the Pinecone client on every call so a recycled process still
    # has a live connection before querying.
    pinecone.init(api_key=PINECONE_API_KEY, environment=PINECONE_ENVIRONMENT)
    index_name_extracted = pinecone.list_indexes()
    index_current = pinecone.Index(index_name=index_name)
    index_status = index_current.describe_index_stats()
    print("****************Start of QA*******************")
    print("index_name_extracted:"+str(index_name_extracted))
    print("index_current:"+str(index_current))
    print("index_status:"+str(index_status))
    print("namespace:"+namespace)
    # The original triple condition was partly tautological
    # (`strip().isspace()` can never be True); a stripped-truthiness test is
    # the equivalent, simpler form.
    if user_query.strip():
        print("Your query:\n"+user_query)
        # Retrieve the 5 chunks most similar to the query from this run's namespace.
        vector_db_from_index = Pinecone.from_existing_index(index_name, hf_embeddings, namespace=namespace)
        ss_results = vector_db_from_index.similarity_search(query=user_query, namespace=namespace, k=5)
        initial_ai_response = chain.run(input_documents=ss_results, question=user_query, return_only_outputs=True)
        # The model terminates answers with '<|end|>'; keep only what precedes
        # it and flatten newlines for the single-line Gradio textbox.
        temp_ai_response = initial_ai_response.partition('<|end|>')[0]
        final_ai_response = temp_ai_response.replace('\n', '')
        print("final_ai_response:"+final_ai_response)
        print("***********************************")
        print("index_name_extracted:"+str(index_name_extracted))
        print("index_current:"+str(index_current))
        print("index_status:"+str(index_status))
        print("namespace:"+namespace)
        print("****************End of QA*******************")
        return final_ai_response
    print("Invalid inputs.")
    return None  # explicit, instead of the original's implicit fall-through
def delete_index_namespace():
    """Delete every vector in this run's namespace from the Pinecone index.

    Bound to the "Clear Data & Exit" button so a session's embeddings don't
    accumulate in the shared index.
    """
    pinecone.init(api_key=PINECONE_API_KEY, environment=PINECONE_ENVIRONMENT)
    index_name_extracted = pinecone.list_indexes()
    index_current = pinecone.Index(index_name=index_name)
    index_status = index_current.describe_index_stats()
    print("****************Start of Namespace Deletion*******************")
    print("index_name_extracted:"+str(index_name_extracted))
    print("index_current:"+str(index_current))
    print("index_status:"+str(index_status))
    print("namespace:"+namespace)
    # Reuse the handle created above; the original constructed a second
    # pinecone.Index object for the very same index.
    index_current.delete(delete_all=True, namespace=namespace)
    print("Pinecone Index Namespace: "+namespace+" has been deleted!")
    print("****************End of Namespace Deletion*******************")
# --- Gradio UI: one query box, one answer box, two action buttons ---
with gr.Blocks() as demo:
    gr.Markdown("Enter your question below & click Get AI Response. Remember to clear data before exiting program.")
    with gr.Row():
        user_query = gr.Textbox(label="User query input box", placeholder="Enter your query here.")
        ai_response = gr.Textbox(label="AI Response display area", placeholder="AI Response to be displayed here.")
    query_btn = gr.Button("Get AI Response")
    ai_res_btn = gr.Button("Clear Data & Exit")
    # Wire the buttons: query runs the QA chain; exit wipes this run's namespace.
    query_btn.click(fn=run_chain, inputs=user_query, outputs=ai_response)
    ai_res_btn.click(fn=delete_index_namespace)
# Stray trailing '|' (scrape artifact) removed — it was a syntax error.
demo.launch()