File size: 2,448 Bytes
301614f 9035153 604b59c 058d9a5 604b59c 058d9a5 9035153 058d9a5 301614f 544ea93 301614f 9035153 42d5877 9035153 544ea93 9035153 058d9a5 f05dba6 058d9a5 f05dba6 058d9a5 |
1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 |
import gradio as gr
from langchain.document_loaders import PDFMinerLoader, PyMuPDFLoader
from langchain.text_splitter import CharacterTextSplitter
import chromadb
import chromadb.config
from chromadb.config import Settings
from transformers import T5ForConditionalGeneration, AutoTokenizer
import torch
import gradio as gr
import uuid
from sentence_transformers import SentenceTransformer
# Generative model: Flan-T5 produces the final answer text from a prompt.
model_name = "google/flan-t5-base"
model = T5ForConditionalGeneration.from_pretrained(
    model_name,
    device_map="auto",
    offload_folder="offload",
)
tokenizer = AutoTokenizer.from_pretrained(model_name)
print("flan read")

# Embedding model: used to encode both the PDF chunks and the user query.
ST_name = "sentence-transformers/sentence-t5-base"
st_model = SentenceTransformer(ST_name)
print("sentence read")
def get_context(query_text):
    """Return the indexed text chunk that best matches ``query_text``.

    The query is embedded with the module-level ``st_model`` and looked up in
    the global Chroma ``collection`` (which this function does not create; it
    must have been populated before the first call). Only the top-ranked
    document of the requested hits is used.

    Args:
        query_text: The user's question as a plain string.

    Returns:
        The best-matching chunk as a single line of text with every run of
        whitespace (including newlines) collapsed to one space, or ``''`` when
        the query returns no documents.
    """
    query_emb = st_model.encode(query_text)
    query_response = collection.query(
        query_embeddings=query_emb.tolist(),
        n_results=4,
    )
    # Only one query embedding is sent, so the first entry of ``documents``
    # carries all hits for it.
    hits = query_response['documents'][0]
    if not hits:
        # Nothing retrieved: return an empty context instead of failing with
        # an IndexError on hits[0].
        return ''
    # The earlier ``.replace(' ', ' ')`` replaced a space with a space and so
    # never collapsed anything; split()/join() flattens newlines and squeezes
    # repeated whitespace in one pass.
    return ' '.join(hits[0].split())
def local_query(query, context):
    """Ask the Flan-T5 model to answer ``query`` given ``context``.

    Args:
        query: The user's question.
        context: Supporting text that is inserted into the prompt ahead of
            the question.

    Returns:
        The decoded generations as returned by ``tokenizer.batch_decode``
        (a list of strings), limited to 20 new tokens each.
    """
    t5query = """Using the available context, please answer the question.
If you aren't sure please say i don't know.
Context: {}
Question: {}
""".format(context, query)
    # The model is loaded with device_map='auto', so it may not live on the
    # CPU; move the encoded prompt to the model's device before generating to
    # avoid a tensor device mismatch.
    inputs = tokenizer(t5query, return_tensors="pt").to(model.device)
    outputs = model.generate(**inputs, max_new_tokens=20)
    return tokenizer.batch_decode(outputs, skip_special_tokens=True)
def run_query(query):
    """Retrieve context for ``query`` and return the model's answer to it."""
    return local_query(query, get_context(query))
def upload_pdf(file):
    """Index an uploaded PDF into the Chroma collection used for retrieval.

    The PDF is parsed with ``PDFMinerLoader``, split into chunks of up to
    1000 characters, embedded with the module-level ``st_model`` and stored
    in the ``"test_db"`` collection. The collection is published as the
    module-level global ``collection`` so that ``get_context`` can query it.

    Args:
        file: The upload handed over by Gradio, either an object exposing the
            temporary file's path as ``.name`` or the path string itself
            (which one arrives presumably depends on the Gradio version;
            both are accepted).

    Returns:
        The placeholder string ``'hello'``.
    """
    global collection

    # Load straight from the uploaded file's full path. The previous code
    # referenced an undefined ``file_path`` and an un-imported ``os``, and a
    # bare basename would not have pointed at the temporary upload anyway.
    pdf_path = file if isinstance(file, str) else file.name

    loader = PDFMinerLoader(pdf_path)
    doc = loader.load()
    text_splitter = CharacterTextSplitter(chunk_size=1000, chunk_overlap=0)
    texts = [chunk.page_content for chunk in text_splitter.split_documents(doc)]

    doc_emb = st_model.encode(texts).tolist()
    ids = [str(uuid.uuid1()) for _ in doc_emb]

    client = chromadb.Client()
    # get_or_create: a second upload must reuse the existing collection
    # rather than fail because "test_db" already exists.
    collection = client.get_or_create_collection("test_db")
    collection.add(
        embeddings=doc_emb,
        documents=texts,
        ids=ids,
    )
    return 'hello'
# Gradio front end: one file input wired to upload_pdf, one text output.
iface = gr.Interface(
    upload_pdf,
    "file",
    "text",
    title="PDF File Uploader",
    description="Upload a PDF file and get its filename.",
)

# Start the web server for the interface.
iface.launch()
|