# Source captured from a Hugging Face Space page; Space status at capture time: "Runtime error".
# File size: 5,814 bytes. Commit hashes shown in the page's blame column: 73b7e8f, 3dfc710, 9ae281c.
#import atexit
import gradio as gr
#from langchain.document_loaders import UnstructuredPDFLoader
from langchain.document_loaders import PyPDFLoader
from langchain.text_splitter import RecursiveCharacterTextSplitter
#from langchain.text_splitter import CharacterTextSplitter
from langchain.vectorstores import Pinecone
import pinecone
import requests
import sys
#from langchain.prompts.chat import (ChatPromptTemplate, HumanMessagePromptTemplate, SystemMessagePromptTemplate)
from langchain.chains.question_answering import load_qa_chain
#from langchain.chains import RetrievalQA
from langchain import PromptTemplate
from langchain import HuggingFaceHub
from PyPDF2 import PdfReader
#from langchain.document_loaders import TextLoader
#from sentence_transformers.util import semantic_search
from pathlib import Path
from time import sleep
#import pandas as pd
#import torch
import os
import random
import string
from dotenv import load_dotenv
load_dotenv()
# Read the bundled PDF from the current working directory and split its text
# into overlapping chunks that will later be embedded and indexed.
file_path = os.path.join(os.getcwd(), "valuation.pdf")
data = PdfReader(file_path)

# Concatenate the text of every page, skipping pages for which extract_text()
# returns nothing. A single join avoids repeated string concatenation.
raw_text = "".join(
    page_text
    for page_text in (page.extract_text() for page in data.pages)
    if page_text
)

text_splitter = RecursiveCharacterTextSplitter(
    chunk_size=1000,     # measured with len(), i.e. in characters
    chunk_overlap=100,   # consecutive chunks share this many characters
    length_function=len,
)
# db_texts is a list of plain strings (split_text, not split_documents).
db_texts = text_splitter.split_text(raw_text)
class HFEmbeddings:
    """Small client for a Hugging Face Inference API feature-extraction endpoint.

    The object offers ``embed_documents`` and ``embed_query`` and is also
    callable, so it can be handed to a vector store either as an
    embeddings-like object or as a plain embedding function.
    NOTE(review): which of these entry points LangChain uses depends on the
    installed version -- confirm against the pinned release.
    """

    def __init__(self, api_url, headers, timeout=120):
        """Store the endpoint settings.

        Args:
            api_url: Full URL of the feature-extraction endpoint.
            headers: HTTP headers sent with every request (carries the
                bearer token).
            timeout: Seconds to wait for a response before giving up, so a
                stalled request cannot hang the app forever.
        """
        self.api_url = api_url
        self.headers = headers
        self.timeout = timeout

    def get_embeddings(self, texts):
        """POST *texts* to the endpoint and return the decoded JSON payload.

        Args:
            texts: The value sent as ``inputs`` (a string or list of strings).

        Returns:
            The JSON-decoded response body, which must be a list.

        Raises:
            requests.HTTPError: If the endpoint answers with an error status.
            ValueError: If the body is not a list (for example an
                ``{"error": ...}`` object), so a failure is reported here
                instead of surfacing later as a malformed vector.
        """
        response = requests.post(
            self.api_url,
            headers=self.headers,
            # wait_for_model asks the API to block until the model is loaded.
            json={"inputs": texts, "options": {"wait_for_model": True}},
            timeout=self.timeout,
        )
        response.raise_for_status()
        embeddings = response.json()
        if not isinstance(embeddings, list):
            raise ValueError(f"Unexpected embeddings payload: {embeddings!r}")
        return embeddings

    def embed_documents(self, texts):
        """Return the embeddings for *texts* (passed through unchanged)."""
        return self.get_embeddings(texts)

    def embed_query(self, text):
        """Return the embedding vector for a single query string.

        The query is sent as a one-element batch and the single result is
        unwrapped, so the return value is one vector rather than a list of
        vectors.
        """
        return self.get_embeddings([text])[0]

    def __call__(self, texts):
        """Allow the instance to be used as a plain embedding function."""
        return self.embed_documents(texts)
# Hugging Face settings, all taken from environment variables.
HUGGINGFACEHUB_API_TOKEN = os.environ.get('HUGGINGFACEHUB_API_TOKEN')
model_id = os.environ.get('model_id')
hf_token = os.environ.get('hf_token')
repo_id = os.environ.get('repo_id')

# Endpoint and auth header for the embedding model named by model_id.
api_url = "https://api-inference.huggingface.co/pipeline/feature-extraction/{}".format(model_id)
headers = {"Authorization": "Bearer {}".format(hf_token)}

# Shared embeddings client used for both indexing and querying.
hf_embeddings = HFEmbeddings(api_url, headers)
# Pinecone account settings.
# The API key is a secret, so it is read from the environment (or a .env file)
# instead of being committed to source control.
# NOTE(review): a key was previously hard-coded at this spot and must be
# treated as leaked -- rotate it in the Pinecone console.
PINECONE_API_KEY = os.getenv('PINECONE_API_KEY')
if not PINECONE_API_KEY:
    raise RuntimeError(
        "PINECONE_API_KEY is not set; define it in the environment or in the .env file."
    )
# Non-secret settings keep their previous values as defaults.
PINECONE_ENVIRONMENT = os.getenv('PINECONE_ENVIRONMENT', "asia-southeast1-gcp-free")
PINECONE_INDEX_NAME = os.getenv('PINECONE_INDEX_NAME', "myindex-allminilm-l6-v2-384")
print(PINECONE_INDEX_NAME)
index_name = PINECONE_INDEX_NAME
print(index_name)
# All vectors written and searched by this app live in this namespace.
namespace = "HF-GRADIO-0914"

pinecone.init(api_key=PINECONE_API_KEY, environment=PINECONE_ENVIRONMENT)
# Keep index_name a plain string: the index handle gets its own variable.
# Earlier revisions rebound index_name to the pinecone.Index object and then
# passed that object as index_name= below. NOTE(review): that is the likely
# reason the namespace was observed not to be created at startup, since
# LangChain's Pinecone helpers take the index *name* -- confirm against the
# installed LangChain version.
pinecone_index = pinecone.Index(index_name)
print(pinecone_index)

# Embed every chunk and upsert it into the namespace when the app starts.
vector_db = Pinecone.from_texts(db_texts, hf_embeddings, index_name=index_name, namespace=namespace)
print("***********************************")
print("Pinecone Vector/Embedding DB Ready.")
# Generation settings forwarded to the hosted model named by repo_id.
generation_kwargs = {
    "min_length": 100,
    "max_new_tokens": 1024,
    "do_sample": True,
    "temperature": 0.1,
    "top_k": 50,
    "top_p": 0.95,
    "eos_token_id": 49155,
}
llm = HuggingFaceHub(repo_id=repo_id, model_kwargs=generation_kwargs)

# Question-answering chain of type "stuff" built on top of that model.
chain = load_qa_chain(llm=llm, chain_type="stuff")
def run_chain(user_query):
    """Answer *user_query* using the chunks stored in the Pinecone namespace.

    Args:
        user_query: The question typed by the user.

    Returns:
        The model's answer with everything from the first ``<|end|>`` marker
        onward removed and newlines stripped, or the string
        ``"Invalid inputs."`` when the query is None, empty, or only
        whitespace (so the UI shows feedback instead of a blank result).
    """
    # Guard clause: a single strip() check covers empty and whitespace-only
    # input; the None check avoids an AttributeError on .strip().
    if user_query is None or not user_query.strip():
        print("Invalid inputs.")
        return "Invalid inputs."

    print("Your query:\n" + user_query)
    # Use the index *name* constant here: the module-level index_name variable
    # may have been rebound to a pinecone.Index object.
    vector_db_from_index = Pinecone.from_existing_index(
        PINECONE_INDEX_NAME, hf_embeddings, namespace=namespace
    )
    # Retrieve the 5 stored chunks most similar to the query.
    ss_results = vector_db_from_index.similarity_search(query=user_query, namespace=namespace, k=5)
    initial_ai_response = chain.run(input_documents=ss_results, question=user_query)
    # Keep only the text before the first '<|end|>' marker, then drop newlines.
    temp_ai_response = initial_ai_response.partition('<|end|>')[0]
    final_ai_response = temp_ai_response.replace('\n', '')
    return final_ai_response
# Expose run_chain through a one-text-box-in, one-text-box-out Gradio form
# and start serving it.
iface = gr.Interface(fn=run_chain, inputs="text", outputs="text", title="AI Response")
iface.launch()