# PDF_CHATBOT_2 / app.py
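# Question-answering chatbot over a PDF: the PDF text is split into chunks,
# embedded with OpenAI embeddings, stored in an Astra DB (Cassandra) vector
# store via CassIO, and queried from the command line through a LangChain
# VectorStoreIndexWrapper.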
# Install dependencies first (shell command, not Python):
#   pip install -qU cassio datasets langchain openai tiktoken
# LangChain components to use
from langchain.vectorstores.cassandra import Cassandra
from langchain.indexes.vectorstore import VectorStoreIndexWrapper
from langchain.llms import OpenAI
from langchain.embeddings import OpenAIEmbeddings
# With CassIO, the engine powering the Astra DB integration in LangChain,
# you will also initialize the DB connection:
import cassio
# Also requires PyPDF2 (shell command, not Python):
#   pip install PyPDF2
from PyPDF2 import PdfReader
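# Credentials below are placeholders; supply your own values (ideally loaded
# from environment variables rather than hard-coded in the source).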
ASTRA_DB_APPLICATION_TOKEN = "AstraCS:..."  # enter your Astra DB application token
ASTRA_DB_ID = "..."  # enter your Database ID
OPENAI_API_KEY = "sk-..."  # enter your OpenAI key
# Provide the path of the PDF file to index.
pdfreader = PdfReader('Ethics.pdf')
# Read the text from every page of the PDF.
raw_text = ''
for page in pdfreader.pages:
    content = page.extract_text()
    if content:
        raw_text += content
cassio.init(token=ASTRA_DB_APPLICATION_TOKEN, database_id=ASTRA_DB_ID)
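# LLM used to answer questions and the embedding model used to vectorise the PDF chunks.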
llm = OpenAI(openai_api_key=OPENAI_API_KEY)
embedding = OpenAIEmbeddings(openai_api_key=OPENAI_API_KEY)
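# LangChain vector store backed by Astra DB; session=None and keyspace=None
# defer to the connection set up by cassio.init() above.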
astra_vector_store = Cassandra(
    embedding=embedding,
    table_name="qa_mini_demo",
    session=None,
    keyspace=None,
)
from langchain.text_splitter import CharacterTextSplitter
# Split the text with CharacterTextSplitter so each chunk stays within the embedding model's token limit.
text_splitter = CharacterTextSplitter(
    separator="\n",
    chunk_size=800,
    chunk_overlap=200,
    length_function=len,
)
texts = text_splitter.split_text(raw_text)
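# Embed each chunk and insert it into the Astra DB table.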
astra_vector_store.add_texts(texts)
print("Inserted %i chunks." % len(texts))
astra_vector_index = VectorStoreIndexWrapper(vectorstore=astra_vector_store)
first_question = True
while True:
    if first_question:
        query_text = input("\nEnter your question (or type 'quit' to exit): ").strip()
    else:
        query_text = input("\nWhat's your next question (or type 'quit' to exit): ").strip()

    if query_text.lower() == "quit":
        break
    if query_text == "":
        continue

    first_question = False

    print("\nQUESTION: \"%s\"" % query_text)
    answer = astra_vector_index.query(query_text, llm=llm).strip()
    print("ANSWER: \"%s\"\n" % answer)