File size: 3,837 Bytes
301614f 9035153 604b59c 6c36800 1ea407d 1288197 2cd4e0a 7ab093b 058d9a5 2cd4e0a 604b59c b0a8958 7ab093b b0a8958 058d9a5 d121146 9035153 7ab093b 9035153 03e01d3 9035153 03e01d3 9035153 d99a408 9035153 725d485 2a5a407 d121146 b0a8958 725d485 7ab093b f80ac06 9035153 dffeb2d 911a8be 5573a68 911a8be 868e2fc b0a8958 986cfd0 b0a8958 4627abd 725d485 911a8be 5573a68 911a8be 2024184 dffeb2d e9d0a51 59b084c dffeb2d 59b084c 23d4171 7f9bf9b 59b084c 7c1d20d fc4d061 e554b8b 7ab093b 71db1e4 fc4d061 |
1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120 121 122 123 124 125 126 127 128 129 130 131 132 133 134 135 136 137 |
import gradio as gr
from langchain.document_loaders import PDFMinerLoader, PyMuPDFLoader
from langchain.text_splitter import CharacterTextSplitter
import chromadb
import chromadb.config
from chromadb.config import Settings
from transformers import T5ForConditionalGeneration, AutoTokenizer
import torch
import uuid
from sentence_transformers import SentenceTransformer
import os
#
# Hugging Face checkpoint used to generate the answers.
model_name = 'google/flan-t5-base'
model = T5ForConditionalGeneration.from_pretrained(model_name)
tokenizer = AutoTokenizer.from_pretrained(model_name)

# Embedding model shared by document chunks and user queries, so both are
# compared in the same vector space.
ST_name = 'sentence-transformers/sentence-t5-base'
st_model = SentenceTransformer(ST_name)

# Chroma client with default settings; the collection stores the embedded
# PDF chunks. get_or_create_collection is idempotent, whereas
# create_collection raises when "my_database" already exists on the client.
client = chromadb.Client()
collection = client.get_or_create_collection("my_database")
def get_context(query_text):
    """Return the stored chunk that best matches ``query_text``.

    The query is embedded with ``st_model`` and looked up in the Chroma
    ``collection``. Only the top-ranked document is used as context.

    Args:
        query_text: The user's question.

    Returns:
        The best-matching chunk with every run of whitespace (newlines
        included) collapsed to a single space, or an empty string when the
        query yields no documents (for example, nothing was uploaded yet).
    """
    query_emb = st_model.encode(query_text)
    query_response = collection.query(
        query_embeddings=query_emb.tolist(),
        n_results=4,
    )
    # 'documents' holds one list of hits per query embedding; one was sent.
    hits = query_response.get('documents') or [[]]
    if not hits[0]:
        # Avoid an IndexError on an empty result set.
        return ''
    # split()/join collapses newlines and repeated spaces in one pass.
    return ' '.join(hits[0][0].split())
def local_query(query, context):
    """Generate an answer to ``query`` from ``context`` with the T5 model.

    Args:
        query: The user's question.
        context: Text inserted into the prompt as supporting context.

    Returns:
        The list of decoded strings produced by ``tokenizer.batch_decode``
        for the generated sequences (special tokens removed).
    """
    prompt = (
        "Using the available context, please answer the question.\n"
        "If you are not sure please say I don't know.\n"
        f"Context: {context}\n"
        f"Question: {query}\n"
    )
    encoded = tokenizer(prompt, return_tensors="pt")
    # Generation is capped at 20 new tokens.
    generated = model.generate(**encoded, max_new_tokens=20)
    return tokenizer.batch_decode(generated, skip_special_tokens=True)
def run_query(btn, history, query):
    """Answer ``query`` and append the exchange to the chat history.

    Args:
        btn: Value of the upload button component; unused here, kept so the
            signature matches the inputs wired to this handler.
        history: List of ``(question, answer)`` pairs shown in the chatbot.
            ``None`` is treated as an empty history.
        query: The question typed by the user.

    Returns:
        A ``(history, "")`` tuple: the updated history and an empty string
        that clears the question textbox.
    """
    if history is None:
        history = []
    # Nothing to answer: skip retrieval/generation for blank submissions.
    if not query or not query.strip():
        return history, ""
    context = get_context(query)
    result = local_query(query, context)
    # local_query returns a list of decoded strings; the first is the answer.
    history.append((query, str(result[0])))
    return history, ""
def upload_pdf(file):
    """Index an uploaded PDF into the Chroma ``collection``.

    The PDF is loaded with ``PDFMinerLoader``, split into chunks of up to
    1000 characters with no overlap, embedded with ``st_model`` and stored
    together with freshly generated ids.

    Args:
        file: The uploaded file, either an object exposing the path as
            ``.name`` or a plain path string; ``None`` when nothing was
            uploaded.

    Returns:
        A human-readable status message. Errors are reported in the message
        rather than raised, so the UI can display them.
    """
    if file is None:
        return "No file uploaded."
    try:
        # Accept both file-like wrappers (.name) and plain path strings.
        file_name = getattr(file, "name", file)
        pages = PDFMinerLoader(file_name).load()
        splitter = CharacterTextSplitter(chunk_size=1000, chunk_overlap=0)
        texts = [chunk.page_content for chunk in splitter.split_documents(pages)]
        if not texts:
            # e.g. a scanned PDF without a text layer: nothing to embed.
            return "No extractable text found in the PDF."
        doc_emb = st_model.encode(texts).tolist()
        # Random ids; uuid4 does not embed host/time information.
        ids = [str(uuid.uuid4()) for _ in texts]
        collection.add(
            embeddings=doc_emb,
            documents=texts,
            ids=ids,
        )
    except Exception as e:  # UI boundary: surface the failure as a message.
        return f"An error occurred: {e}"
    return 'Successfully uploaded!'
# Build the UI: an upload button with a status box, a chat window and a
# question box.
with gr.Blocks() as demo:
    btn = gr.UploadButton("Upload a PDF", file_types=[".pdf"])
    # NOTE: the former style={...} CSS dict was dropped; it is not a
    # supported constructor argument for gr.Textbox.
    output = gr.Textbox(label="Output Box")
    chatbot = gr.Chatbot(height=240)
    with gr.Row():
        with gr.Column(scale=0.70):
            txt = gr.Textbox(
                show_label=False,
                placeholder="Enter a question",
            )

    # Event handlers must be registered inside the Blocks context; without
    # them the upload button and the question box do nothing.
    btn.upload(fn=upload_pdf, inputs=[btn], outputs=[output])
    txt.submit(run_query, [btn, chatbot, txt], [chatbot, txt])

# Close any Gradio apps already running in this process before launching.
gr.close_all()
demo.queue().launch()
|